Diffstat (limited to 'mm')
48 files changed, 3410 insertions, 1710 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index b2176374b98e..d5c8019c6627 100644 --- a/mm/Kconfig +++ b/mm/Kconfig | |||
@@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK | |||
140 | config NO_BOOTMEM | 140 | config NO_BOOTMEM |
141 | boolean | 141 | boolean |
142 | 142 | ||
143 | config MEMORY_ISOLATION | ||
144 | boolean | ||
145 | |||
143 | # eventually, we can have this option just 'select SPARSEMEM' | 146 | # eventually, we can have this option just 'select SPARSEMEM' |
144 | config MEMORY_HOTPLUG | 147 | config MEMORY_HOTPLUG |
145 | bool "Allow for memory hot-add" | 148 | bool "Allow for memory hot-add" |
149 | select MEMORY_ISOLATION | ||
146 | depends on SPARSEMEM || X86_64_ACPI_NUMA | 150 | depends on SPARSEMEM || X86_64_ACPI_NUMA |
147 | depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG | 151 | depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG |
148 | depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) | 152 | depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) |
@@ -272,6 +276,7 @@ config MEMORY_FAILURE | |||
272 | depends on MMU | 276 | depends on MMU |
273 | depends on ARCH_SUPPORTS_MEMORY_FAILURE | 277 | depends on ARCH_SUPPORTS_MEMORY_FAILURE |
274 | bool "Enable recovery from hardware memory errors" | 278 | bool "Enable recovery from hardware memory errors" |
279 | select MEMORY_ISOLATION | ||
275 | help | 280 | help |
276 | Enables code to recover from some memory failures on systems | 281 | Enables code to recover from some memory failures on systems |
277 | with MCA recovery. This allows a system to continue running | 282 | with MCA recovery. This allows a system to continue running |
@@ -389,3 +394,20 @@ config CLEANCACHE | |||
389 | in a negligible performance hit. | 394 | in a negligible performance hit. |
390 | 395 | ||
391 | If unsure, say Y to enable cleancache | 396 | If unsure, say Y to enable cleancache |
397 | |||
398 | config FRONTSWAP | ||
399 | bool "Enable frontswap to cache swap pages if tmem is present" | ||
400 | depends on SWAP | ||
401 | default n | ||
402 | help | ||
403 | Frontswap is so named because it can be thought of as the opposite | ||
404 | of a "backing" store for a swap device. The data is stored into | ||
405 | "transcendent memory", memory that is not directly accessible or | ||
406 | addressable by the kernel and is of unknown and possibly | ||
407 | time-varying size. When space in transcendent memory is available, | ||
408 | a significant swap I/O reduction may be achieved. When none is | ||
409 | available, all frontswap calls are reduced to a single pointer- | ||
410 | compare-against-NULL resulting in a negligible performance hit | ||
411 | and swap data is stored as normal on the matching swap device. | ||
412 | |||
413 | If unsure, say Y to enable frontswap. | ||
diff --git a/mm/Makefile b/mm/Makefile index a156285ce88d..92753e2d82da 100644 --- a/mm/Makefile +++ b/mm/Makefile | |||
@@ -15,8 +15,9 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ | |||
15 | maccess.o page_alloc.o page-writeback.o \ | 15 | maccess.o page_alloc.o page-writeback.o \ |
16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ | 16 | readahead.o swap.o truncate.o vmscan.o shmem.o \ |
17 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 17 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
18 | page_isolation.o mm_init.o mmu_context.o percpu.o \ | 18 | mm_init.o mmu_context.o percpu.o slab_common.o \ |
19 | compaction.o $(mmu-y) | 19 | compaction.o $(mmu-y) |
20 | |||
20 | obj-y += init-mm.o | 21 | obj-y += init-mm.o |
21 | 22 | ||
22 | ifdef CONFIG_NO_BOOTMEM | 23 | ifdef CONFIG_NO_BOOTMEM |
@@ -29,6 +30,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o | |||
29 | 30 | ||
30 | obj-$(CONFIG_BOUNCE) += bounce.o | 31 | obj-$(CONFIG_BOUNCE) += bounce.o |
31 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o | 32 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o |
33 | obj-$(CONFIG_FRONTSWAP) += frontswap.o | ||
32 | obj-$(CONFIG_HAS_DMA) += dmapool.o | 34 | obj-$(CONFIG_HAS_DMA) += dmapool.o |
33 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 35 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
34 | obj-$(CONFIG_NUMA) += mempolicy.o | 36 | obj-$(CONFIG_NUMA) += mempolicy.o |
@@ -47,9 +49,11 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o | |||
47 | obj-$(CONFIG_MIGRATION) += migrate.o | 49 | obj-$(CONFIG_MIGRATION) += migrate.o |
48 | obj-$(CONFIG_QUICKLIST) += quicklist.o | 50 | obj-$(CONFIG_QUICKLIST) += quicklist.o |
49 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o | 51 | obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o |
50 | obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o | 52 | obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o |
53 | obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o | ||
51 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o | 54 | obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o |
52 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o | 55 | obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o |
53 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o | 56 | obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o |
54 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o | 57 | obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o |
55 | obj-$(CONFIG_CLEANCACHE) += cleancache.o | 58 | obj-$(CONFIG_CLEANCACHE) += cleancache.o |
59 | obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o | ||
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index dd8e2aafb07e..6b4718e2ee34 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -677,7 +677,7 @@ int bdi_init(struct backing_dev_info *bdi) | |||
677 | 677 | ||
678 | bdi->min_ratio = 0; | 678 | bdi->min_ratio = 0; |
679 | bdi->max_ratio = 100; | 679 | bdi->max_ratio = 100; |
680 | bdi->max_prop_frac = PROP_FRAC_BASE; | 680 | bdi->max_prop_frac = FPROP_FRAC_BASE; |
681 | spin_lock_init(&bdi->wb_lock); | 681 | spin_lock_init(&bdi->wb_lock); |
682 | INIT_LIST_HEAD(&bdi->bdi_list); | 682 | INIT_LIST_HEAD(&bdi->bdi_list); |
683 | INIT_LIST_HEAD(&bdi->work_list); | 683 | INIT_LIST_HEAD(&bdi->work_list); |
@@ -700,7 +700,7 @@ int bdi_init(struct backing_dev_info *bdi) | |||
700 | bdi->write_bandwidth = INIT_BW; | 700 | bdi->write_bandwidth = INIT_BW; |
701 | bdi->avg_write_bandwidth = INIT_BW; | 701 | bdi->avg_write_bandwidth = INIT_BW; |
702 | 702 | ||
703 | err = prop_local_init_percpu(&bdi->completions); | 703 | err = fprop_local_init_percpu(&bdi->completions); |
704 | 704 | ||
705 | if (err) { | 705 | if (err) { |
706 | err: | 706 | err: |
@@ -744,7 +744,7 @@ void bdi_destroy(struct backing_dev_info *bdi) | |||
744 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) | 744 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) |
745 | percpu_counter_destroy(&bdi->bdi_stat[i]); | 745 | percpu_counter_destroy(&bdi->bdi_stat[i]); |
746 | 746 | ||
747 | prop_local_destroy_percpu(&bdi->completions); | 747 | fprop_local_destroy_percpu(&bdi->completions); |
748 | } | 748 | } |
749 | EXPORT_SYMBOL(bdi_destroy); | 749 | EXPORT_SYMBOL(bdi_destroy); |
750 | 750 | ||
@@ -886,3 +886,23 @@ out: | |||
886 | return ret; | 886 | return ret; |
887 | } | 887 | } |
888 | EXPORT_SYMBOL(wait_iff_congested); | 888 | EXPORT_SYMBOL(wait_iff_congested); |
889 | |||
890 | int pdflush_proc_obsolete(struct ctl_table *table, int write, | ||
891 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
892 | { | ||
893 | char kbuf[] = "0\n"; | ||
894 | |||
895 | if (*ppos) { | ||
896 | *lenp = 0; | ||
897 | return 0; | ||
898 | } | ||
899 | |||
900 | if (copy_to_user(buffer, kbuf, sizeof(kbuf))) | ||
901 | return -EFAULT; | ||
902 | printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n", | ||
903 | table->procname); | ||
904 | |||
905 | *lenp = 2; | ||
906 | *ppos += *lenp; | ||
907 | return 2; | ||
908 | } | ||
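For reference, the sysctl side of this change points the obsolete vm.nr_pdflush_threads knob at the new stub handler; that wiring lives in kernel/sysctl.c and is not part of this hunk. A minimal sketch of such a table entry, assuming the handler is declared in linux/writeback.h:

/* Hypothetical sketch: routing an obsolete sysctl through the new stub.
 * The real wiring lives in kernel/sysctl.c and is not shown in this hunk. */
#include <linux/sysctl.h>
#include <linux/writeback.h>	/* pdflush_proc_obsolete() declaration (assumed) */

static struct ctl_table legacy_vm_table[] = {
	{
		.procname	= "nr_pdflush_threads",
		.mode		= 0444,
		/* always reads back "0\n" and warns once that the knob is going away */
		.proc_handler	= pdflush_proc_obsolete,
	},
	{ }	/* sentinel */
};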
diff --git a/mm/bootmem.c b/mm/bootmem.c index ec4fcb7a56c8..bcb63ac48cc5 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c | |||
@@ -698,7 +698,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, | |||
698 | return ___alloc_bootmem(size, align, goal, limit); | 698 | return ___alloc_bootmem(size, align, goal, limit); |
699 | } | 699 | } |
700 | 700 | ||
701 | static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, | 701 | void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, |
702 | unsigned long size, unsigned long align, | 702 | unsigned long size, unsigned long align, |
703 | unsigned long goal, unsigned long limit) | 703 | unsigned long goal, unsigned long limit) |
704 | { | 704 | { |
@@ -710,6 +710,10 @@ again: | |||
710 | if (ptr) | 710 | if (ptr) |
711 | return ptr; | 711 | return ptr; |
712 | 712 | ||
713 | /* do not panic in alloc_bootmem_bdata() */ | ||
714 | if (limit && goal + size > limit) | ||
715 | limit = 0; | ||
716 | |||
713 | ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); | 717 | ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); |
714 | if (ptr) | 718 | if (ptr) |
715 | return ptr; | 719 | return ptr; |
diff --git a/mm/bounce.c b/mm/bounce.c index d1be02ca1889..042086775561 100644 --- a/mm/bounce.c +++ b/mm/bounce.c | |||
@@ -24,23 +24,25 @@ | |||
24 | 24 | ||
25 | static mempool_t *page_pool, *isa_page_pool; | 25 | static mempool_t *page_pool, *isa_page_pool; |
26 | 26 | ||
27 | #ifdef CONFIG_HIGHMEM | 27 | #if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL) |
28 | static __init int init_emergency_pool(void) | 28 | static __init int init_emergency_pool(void) |
29 | { | 29 | { |
30 | #ifndef CONFIG_MEMORY_HOTPLUG | 30 | #if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) |
31 | if (max_pfn <= max_low_pfn) | 31 | if (max_pfn <= max_low_pfn) |
32 | return 0; | 32 | return 0; |
33 | #endif | 33 | #endif |
34 | 34 | ||
35 | page_pool = mempool_create_page_pool(POOL_SIZE, 0); | 35 | page_pool = mempool_create_page_pool(POOL_SIZE, 0); |
36 | BUG_ON(!page_pool); | 36 | BUG_ON(!page_pool); |
37 | printk("highmem bounce pool size: %d pages\n", POOL_SIZE); | 37 | printk("bounce pool size: %d pages\n", POOL_SIZE); |
38 | 38 | ||
39 | return 0; | 39 | return 0; |
40 | } | 40 | } |
41 | 41 | ||
42 | __initcall(init_emergency_pool); | 42 | __initcall(init_emergency_pool); |
43 | #endif | ||
43 | 44 | ||
45 | #ifdef CONFIG_HIGHMEM | ||
44 | /* | 46 | /* |
45 | * highmem version, map in to vec | 47 | * highmem version, map in to vec |
46 | */ | 48 | */ |
diff --git a/mm/compaction.c b/mm/compaction.c index 7ea259d82a99..e78cb9688421 100644 --- a/mm/compaction.c +++ b/mm/compaction.c | |||
@@ -422,6 +422,17 @@ static void isolate_freepages(struct zone *zone, | |||
422 | pfn -= pageblock_nr_pages) { | 422 | pfn -= pageblock_nr_pages) { |
423 | unsigned long isolated; | 423 | unsigned long isolated; |
424 | 424 | ||
425 | /* | ||
426 | * Skip ahead if another thread is compacting in the area | ||
427 | * simultaneously. If we wrapped around, we can only skip | ||
428 | * ahead if zone->compact_cached_free_pfn also wrapped to | ||
429 | * above our starting point. | ||
430 | */ | ||
431 | if (cc->order > 0 && (!cc->wrapped || | ||
432 | zone->compact_cached_free_pfn > | ||
433 | cc->start_free_pfn)) | ||
434 | pfn = min(pfn, zone->compact_cached_free_pfn); | ||
435 | |||
425 | if (!pfn_valid(pfn)) | 436 | if (!pfn_valid(pfn)) |
426 | continue; | 437 | continue; |
427 | 438 | ||
@@ -461,8 +472,11 @@ static void isolate_freepages(struct zone *zone, | |||
461 | * looking for free pages, the search will restart here as | 472 | * looking for free pages, the search will restart here as |
462 | * page migration may have returned some pages to the allocator | 473 | * page migration may have returned some pages to the allocator |
463 | */ | 474 | */ |
464 | if (isolated) | 475 | if (isolated) { |
465 | high_pfn = max(high_pfn, pfn); | 476 | high_pfn = max(high_pfn, pfn); |
477 | if (cc->order > 0) | ||
478 | zone->compact_cached_free_pfn = high_pfn; | ||
479 | } | ||
466 | } | 480 | } |
467 | 481 | ||
468 | /* split_free_page does not map the pages */ | 482 | /* split_free_page does not map the pages */ |
@@ -556,6 +570,20 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, | |||
556 | return ISOLATE_SUCCESS; | 570 | return ISOLATE_SUCCESS; |
557 | } | 571 | } |
558 | 572 | ||
573 | /* | ||
574 | * Returns the start pfn of the last page block in a zone. This is the starting | ||
575 | * point for full compaction of a zone. Compaction searches for free pages from | ||
576 | * the end of each zone, while isolate_freepages_block scans forward inside each | ||
577 | * page block. | ||
578 | */ | ||
579 | static unsigned long start_free_pfn(struct zone *zone) | ||
580 | { | ||
581 | unsigned long free_pfn; | ||
582 | free_pfn = zone->zone_start_pfn + zone->spanned_pages; | ||
583 | free_pfn &= ~(pageblock_nr_pages-1); | ||
584 | return free_pfn; | ||
585 | } | ||
586 | |||
559 | static int compact_finished(struct zone *zone, | 587 | static int compact_finished(struct zone *zone, |
560 | struct compact_control *cc) | 588 | struct compact_control *cc) |
561 | { | 589 | { |
@@ -565,8 +593,26 @@ static int compact_finished(struct zone *zone, | |||
565 | if (fatal_signal_pending(current)) | 593 | if (fatal_signal_pending(current)) |
566 | return COMPACT_PARTIAL; | 594 | return COMPACT_PARTIAL; |
567 | 595 | ||
568 | /* Compaction run completes if the migrate and free scanner meet */ | 596 | /* |
569 | if (cc->free_pfn <= cc->migrate_pfn) | 597 | * A full (order == -1) compaction run starts at the beginning and |
598 | * end of a zone; it completes when the migrate and free scanner meet. | ||
599 | * A partial (order > 0) compaction can start with the free scanner | ||
600 | * at a random point in the zone, and may have to restart. | ||
601 | */ | ||
602 | if (cc->free_pfn <= cc->migrate_pfn) { | ||
603 | if (cc->order > 0 && !cc->wrapped) { | ||
604 | /* We started partway through; restart at the end. */ | ||
605 | unsigned long free_pfn = start_free_pfn(zone); | ||
606 | zone->compact_cached_free_pfn = free_pfn; | ||
607 | cc->free_pfn = free_pfn; | ||
608 | cc->wrapped = 1; | ||
609 | return COMPACT_CONTINUE; | ||
610 | } | ||
611 | return COMPACT_COMPLETE; | ||
612 | } | ||
613 | |||
614 | /* We wrapped around and ended up where we started. */ | ||
615 | if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn) | ||
570 | return COMPACT_COMPLETE; | 616 | return COMPACT_COMPLETE; |
571 | 617 | ||
572 | /* | 618 | /* |
@@ -664,8 +710,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
664 | 710 | ||
665 | /* Setup to move all movable pages to the end of the zone */ | 711 | /* Setup to move all movable pages to the end of the zone */ |
666 | cc->migrate_pfn = zone->zone_start_pfn; | 712 | cc->migrate_pfn = zone->zone_start_pfn; |
667 | cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; | 713 | |
668 | cc->free_pfn &= ~(pageblock_nr_pages-1); | 714 | if (cc->order > 0) { |
715 | /* Incremental compaction. Start where the last one stopped. */ | ||
716 | cc->free_pfn = zone->compact_cached_free_pfn; | ||
717 | cc->start_free_pfn = cc->free_pfn; | ||
718 | } else { | ||
719 | /* Order == -1 starts at the end of the zone. */ | ||
720 | cc->free_pfn = start_free_pfn(zone); | ||
721 | } | ||
669 | 722 | ||
670 | migrate_prep_local(); | 723 | migrate_prep_local(); |
671 | 724 | ||
@@ -701,8 +754,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) | |||
701 | if (err) { | 754 | if (err) { |
702 | putback_lru_pages(&cc->migratepages); | 755 | putback_lru_pages(&cc->migratepages); |
703 | cc->nr_migratepages = 0; | 756 | cc->nr_migratepages = 0; |
757 | if (err == -ENOMEM) { | ||
758 | ret = COMPACT_PARTIAL; | ||
759 | goto out; | ||
760 | } | ||
704 | } | 761 | } |
705 | |||
706 | } | 762 | } |
707 | 763 | ||
708 | out: | 764 | out: |
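A small user-space illustration of the rounding start_free_pfn() performs: the free scanner begins at the last pageblock-aligned pfn of the zone. Only the arithmetic mirrors the kernel code; the zone numbers below are made up.

/* User-space demo of the rounding done by start_free_pfn(). */
#include <stdio.h>

int main(void)
{
	unsigned long pageblock_nr_pages = 512;	/* order-9 pageblocks, typical on x86 (assumed) */
	unsigned long zone_start_pfn = 0x10000;	/* made-up zone start */
	unsigned long spanned_pages = 0x7f25;	/* made-up zone span */
	unsigned long free_pfn = zone_start_pfn + spanned_pages;

	free_pfn &= ~(pageblock_nr_pages - 1);	/* same mask as the kernel code */
	printf("free scanner starts at pfn 0x%lx\n", free_pfn);	/* prints 0x17e00 */
	return 0;
}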
diff --git a/mm/fadvise.c b/mm/fadvise.c index 469491e0af79..9b75a045dbf4 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c | |||
@@ -93,11 +93,6 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
93 | spin_unlock(&file->f_lock); | 93 | spin_unlock(&file->f_lock); |
94 | break; | 94 | break; |
95 | case POSIX_FADV_WILLNEED: | 95 | case POSIX_FADV_WILLNEED: |
96 | if (!mapping->a_ops->readpage) { | ||
97 | ret = -EINVAL; | ||
98 | break; | ||
99 | } | ||
100 | |||
101 | /* First and last PARTIAL page! */ | 96 | /* First and last PARTIAL page! */ |
102 | start_index = offset >> PAGE_CACHE_SHIFT; | 97 | start_index = offset >> PAGE_CACHE_SHIFT; |
103 | end_index = endbyte >> PAGE_CACHE_SHIFT; | 98 | end_index = endbyte >> PAGE_CACHE_SHIFT; |
@@ -106,12 +101,13 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
106 | nrpages = end_index - start_index + 1; | 101 | nrpages = end_index - start_index + 1; |
107 | if (!nrpages) | 102 | if (!nrpages) |
108 | nrpages = ~0UL; | 103 | nrpages = ~0UL; |
109 | 104 | ||
110 | ret = force_page_cache_readahead(mapping, file, | 105 | /* |
111 | start_index, | 106 | * Ignore return value because fadvise() shall return |
112 | nrpages); | 107 | * success even if filesystem can't retrieve a hint, |
113 | if (ret > 0) | 108 | */ |
114 | ret = 0; | 109 | force_page_cache_readahead(mapping, file, start_index, |
110 | nrpages); | ||
115 | break; | 111 | break; |
116 | case POSIX_FADV_NOREUSE: | 112 | case POSIX_FADV_NOREUSE: |
117 | break; | 113 | break; |
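From user space, the visible effect of the fadvise change is that POSIX_FADV_WILLNEED no longer fails with EINVAL on filesystems that cannot act on the hint. A hedged illustration (the path is arbitrary; any readable file works):

/* Illustrative user-space check: the WILLNEED hint now returns success
 * even when the filesystem cannot honour it. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/etc/hostname", O_RDONLY);
	int err;

	if (fd < 0)
		return 1;
	err = posix_fadvise(fd, 0, 0, POSIX_FADV_WILLNEED);
	if (err)	/* posix_fadvise() returns an errno value directly */
		fprintf(stderr, "posix_fadvise: %d\n", err);
	close(fd);
	return err ? 1 : 0;
}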
diff --git a/mm/frontswap.c b/mm/frontswap.c new file mode 100644 index 000000000000..6b3e71a2cd48 --- /dev/null +++ b/mm/frontswap.c | |||
@@ -0,0 +1,344 @@ | |||
1 | /* | ||
2 | * Frontswap frontend | ||
3 | * | ||
4 | * This code provides the generic "frontend" layer to call a matching | ||
5 | * "backend" driver implementation of frontswap. See | ||
6 | * Documentation/vm/frontswap.txt for more information. | ||
7 | * | ||
8 | * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. | ||
9 | * Author: Dan Magenheimer | ||
10 | * | ||
11 | * This work is licensed under the terms of the GNU GPL, version 2. | ||
12 | */ | ||
13 | |||
14 | #include <linux/mman.h> | ||
15 | #include <linux/swap.h> | ||
16 | #include <linux/swapops.h> | ||
17 | #include <linux/security.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/debugfs.h> | ||
20 | #include <linux/frontswap.h> | ||
21 | #include <linux/swapfile.h> | ||
22 | |||
23 | /* | ||
24 | * frontswap_ops is set by frontswap_register_ops to contain the pointers | ||
25 | * to the frontswap "backend" implementation functions. | ||
26 | */ | ||
27 | static struct frontswap_ops frontswap_ops __read_mostly; | ||
28 | |||
29 | /* | ||
30 | * This global enablement flag reduces overhead on systems where frontswap_ops | ||
31 | * has not been registered, so is preferred to the slower alternative: a | ||
32 | * function call that checks a non-global. | ||
33 | */ | ||
34 | bool frontswap_enabled __read_mostly; | ||
35 | EXPORT_SYMBOL(frontswap_enabled); | ||
36 | |||
37 | /* | ||
38 | * If enabled, frontswap_store will return failure even on success. As | ||
39 | * a result, the swap subsystem will always write the page to swap, in | ||
40 | * effect converting frontswap into a writethrough cache. In this mode, | ||
41 | * there is no direct reduction in swap writes, but a frontswap backend | ||
42 | * can unilaterally "reclaim" any pages in use with no data loss, thus | ||
45 | * providing increased control over maximum memory usage due to frontswap. | ||
44 | */ | ||
45 | static bool frontswap_writethrough_enabled __read_mostly; | ||
46 | |||
47 | #ifdef CONFIG_DEBUG_FS | ||
48 | /* | ||
49 | * Counters available via /sys/kernel/debug/frontswap (if debugfs is | ||
50 | * properly configured). These are for information only so are not protected | ||
51 | * against increment races. | ||
52 | */ | ||
53 | static u64 frontswap_loads; | ||
54 | static u64 frontswap_succ_stores; | ||
55 | static u64 frontswap_failed_stores; | ||
56 | static u64 frontswap_invalidates; | ||
57 | |||
58 | static inline void inc_frontswap_loads(void) { | ||
59 | frontswap_loads++; | ||
60 | } | ||
61 | static inline void inc_frontswap_succ_stores(void) { | ||
62 | frontswap_succ_stores++; | ||
63 | } | ||
64 | static inline void inc_frontswap_failed_stores(void) { | ||
65 | frontswap_failed_stores++; | ||
66 | } | ||
67 | static inline void inc_frontswap_invalidates(void) { | ||
68 | frontswap_invalidates++; | ||
69 | } | ||
70 | #else | ||
71 | static inline void inc_frontswap_loads(void) { } | ||
72 | static inline void inc_frontswap_succ_stores(void) { } | ||
73 | static inline void inc_frontswap_failed_stores(void) { } | ||
74 | static inline void inc_frontswap_invalidates(void) { } | ||
75 | #endif | ||
76 | /* | ||
77 | * Register operations for frontswap, returning previous thus allowing | ||
78 | * detection of multiple backends and possible nesting. | ||
79 | */ | ||
80 | struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops) | ||
81 | { | ||
82 | struct frontswap_ops old = frontswap_ops; | ||
83 | |||
84 | frontswap_ops = *ops; | ||
85 | frontswap_enabled = true; | ||
86 | return old; | ||
87 | } | ||
88 | EXPORT_SYMBOL(frontswap_register_ops); | ||
89 | |||
90 | /* | ||
91 | * Enable/disable frontswap writethrough (see above). | ||
92 | */ | ||
93 | void frontswap_writethrough(bool enable) | ||
94 | { | ||
95 | frontswap_writethrough_enabled = enable; | ||
96 | } | ||
97 | EXPORT_SYMBOL(frontswap_writethrough); | ||
98 | |||
99 | /* | ||
100 | * Called when a swap device is swapon'd. | ||
101 | */ | ||
102 | void __frontswap_init(unsigned type) | ||
103 | { | ||
104 | struct swap_info_struct *sis = swap_info[type]; | ||
105 | |||
106 | BUG_ON(sis == NULL); | ||
107 | if (sis->frontswap_map == NULL) | ||
108 | return; | ||
109 | frontswap_ops.init(type); | ||
110 | } | ||
111 | EXPORT_SYMBOL(__frontswap_init); | ||
112 | |||
113 | static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offset) | ||
114 | { | ||
115 | frontswap_clear(sis, offset); | ||
116 | atomic_dec(&sis->frontswap_pages); | ||
117 | } | ||
118 | |||
119 | /* | ||
120 | * "Store" data from a page to frontswap and associate it with the page's | ||
121 | * swaptype and offset. Page must be locked and in the swap cache. | ||
122 | * If frontswap already contains a page with matching swaptype and | ||
123 | * offset, the frontswap implementation may either overwrite the data and | ||
124 | * return success or invalidate the page from frontswap and return failure. | ||
125 | */ | ||
126 | int __frontswap_store(struct page *page) | ||
127 | { | ||
128 | int ret = -1, dup = 0; | ||
129 | swp_entry_t entry = { .val = page_private(page), }; | ||
130 | int type = swp_type(entry); | ||
131 | struct swap_info_struct *sis = swap_info[type]; | ||
132 | pgoff_t offset = swp_offset(entry); | ||
133 | |||
134 | BUG_ON(!PageLocked(page)); | ||
135 | BUG_ON(sis == NULL); | ||
136 | if (frontswap_test(sis, offset)) | ||
137 | dup = 1; | ||
138 | ret = frontswap_ops.store(type, offset, page); | ||
139 | if (ret == 0) { | ||
140 | frontswap_set(sis, offset); | ||
141 | inc_frontswap_succ_stores(); | ||
142 | if (!dup) | ||
143 | atomic_inc(&sis->frontswap_pages); | ||
144 | } else { | ||
145 | /* | ||
146 | failed dup always results in automatic invalidate of | ||
147 | the (older) page from frontswap | ||
148 | */ | ||
149 | inc_frontswap_failed_stores(); | ||
150 | if (dup) | ||
151 | __frontswap_clear(sis, offset); | ||
152 | } | ||
153 | if (frontswap_writethrough_enabled) | ||
154 | /* report failure so swap also writes to swap device */ | ||
155 | ret = -1; | ||
156 | return ret; | ||
157 | } | ||
158 | EXPORT_SYMBOL(__frontswap_store); | ||
159 | |||
160 | /* | ||
161 | * "Get" data from frontswap associated with swaptype and offset that were | ||
162 | * specified when the data was put to frontswap and use it to fill the | ||
163 | * specified page with data. Page must be locked and in the swap cache. | ||
164 | */ | ||
165 | int __frontswap_load(struct page *page) | ||
166 | { | ||
167 | int ret = -1; | ||
168 | swp_entry_t entry = { .val = page_private(page), }; | ||
169 | int type = swp_type(entry); | ||
170 | struct swap_info_struct *sis = swap_info[type]; | ||
171 | pgoff_t offset = swp_offset(entry); | ||
172 | |||
173 | BUG_ON(!PageLocked(page)); | ||
174 | BUG_ON(sis == NULL); | ||
175 | if (frontswap_test(sis, offset)) | ||
176 | ret = frontswap_ops.load(type, offset, page); | ||
177 | if (ret == 0) | ||
178 | inc_frontswap_loads(); | ||
179 | return ret; | ||
180 | } | ||
181 | EXPORT_SYMBOL(__frontswap_load); | ||
182 | |||
183 | /* | ||
184 | * Invalidate any data from frontswap associated with the specified swaptype | ||
185 | * and offset so that a subsequent "get" will fail. | ||
186 | */ | ||
187 | void __frontswap_invalidate_page(unsigned type, pgoff_t offset) | ||
188 | { | ||
189 | struct swap_info_struct *sis = swap_info[type]; | ||
190 | |||
191 | BUG_ON(sis == NULL); | ||
192 | if (frontswap_test(sis, offset)) { | ||
193 | frontswap_ops.invalidate_page(type, offset); | ||
194 | __frontswap_clear(sis, offset); | ||
195 | inc_frontswap_invalidates(); | ||
196 | } | ||
197 | } | ||
198 | EXPORT_SYMBOL(__frontswap_invalidate_page); | ||
199 | |||
200 | /* | ||
201 | * Invalidate all data from frontswap associated with all offsets for the | ||
202 | * specified swaptype. | ||
203 | */ | ||
204 | void __frontswap_invalidate_area(unsigned type) | ||
205 | { | ||
206 | struct swap_info_struct *sis = swap_info[type]; | ||
207 | |||
208 | BUG_ON(sis == NULL); | ||
209 | if (sis->frontswap_map == NULL) | ||
210 | return; | ||
211 | frontswap_ops.invalidate_area(type); | ||
212 | atomic_set(&sis->frontswap_pages, 0); | ||
213 | memset(sis->frontswap_map, 0, sis->max / sizeof(long)); | ||
214 | } | ||
215 | EXPORT_SYMBOL(__frontswap_invalidate_area); | ||
216 | |||
217 | static unsigned long __frontswap_curr_pages(void) | ||
218 | { | ||
219 | int type; | ||
220 | unsigned long totalpages = 0; | ||
221 | struct swap_info_struct *si = NULL; | ||
222 | |||
223 | assert_spin_locked(&swap_lock); | ||
224 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
225 | si = swap_info[type]; | ||
226 | totalpages += atomic_read(&si->frontswap_pages); | ||
227 | } | ||
228 | return totalpages; | ||
229 | } | ||
230 | |||
231 | static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, | ||
232 | int *swapid) | ||
233 | { | ||
234 | int ret = -EINVAL; | ||
235 | struct swap_info_struct *si = NULL; | ||
236 | int si_frontswap_pages; | ||
237 | unsigned long total_pages_to_unuse = total; | ||
238 | unsigned long pages = 0, pages_to_unuse = 0; | ||
239 | int type; | ||
240 | |||
241 | assert_spin_locked(&swap_lock); | ||
242 | for (type = swap_list.head; type >= 0; type = si->next) { | ||
243 | si = swap_info[type]; | ||
244 | si_frontswap_pages = atomic_read(&si->frontswap_pages); | ||
245 | if (total_pages_to_unuse < si_frontswap_pages) { | ||
246 | pages = pages_to_unuse = total_pages_to_unuse; | ||
247 | } else { | ||
248 | pages = si_frontswap_pages; | ||
249 | pages_to_unuse = 0; /* unuse all */ | ||
250 | } | ||
251 | /* ensure there is enough RAM to fetch pages from frontswap */ | ||
252 | if (security_vm_enough_memory_mm(current->mm, pages)) { | ||
253 | ret = -ENOMEM; | ||
254 | continue; | ||
255 | } | ||
256 | vm_unacct_memory(pages); | ||
257 | *unused = pages_to_unuse; | ||
258 | *swapid = type; | ||
259 | ret = 0; | ||
260 | break; | ||
261 | } | ||
262 | |||
263 | return ret; | ||
264 | } | ||
265 | |||
266 | static int __frontswap_shrink(unsigned long target_pages, | ||
267 | unsigned long *pages_to_unuse, | ||
268 | int *type) | ||
269 | { | ||
270 | unsigned long total_pages = 0, total_pages_to_unuse; | ||
271 | |||
272 | assert_spin_locked(&swap_lock); | ||
273 | |||
274 | total_pages = __frontswap_curr_pages(); | ||
275 | if (total_pages <= target_pages) { | ||
276 | /* Nothing to do */ | ||
277 | *pages_to_unuse = 0; | ||
278 | return 0; | ||
279 | } | ||
280 | total_pages_to_unuse = total_pages - target_pages; | ||
281 | return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type); | ||
282 | } | ||
283 | |||
284 | /* | ||
285 | * Frontswap, like a true swap device, may unnecessarily retain pages | ||
286 | * under certain circumstances; "shrink" frontswap is essentially a | ||
287 | * "partial swapoff" and works by calling try_to_unuse to attempt to | ||
288 | * unuse enough frontswap pages to -- subject to memory | ||
289 | * constraints -- reduce the number of pages in frontswap to the | ||
290 | * number given in the parameter target_pages. | ||
291 | */ | ||
292 | void frontswap_shrink(unsigned long target_pages) | ||
293 | { | ||
294 | unsigned long pages_to_unuse = 0; | ||
295 | int type, ret; | ||
296 | |||
297 | /* | ||
298 | * we don't want to hold swap_lock while doing a very | ||
299 | * lengthy try_to_unuse, but swap_list may change | ||
300 | * so restart scan from swap_list.head each time | ||
301 | */ | ||
302 | spin_lock(&swap_lock); | ||
303 | ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); | ||
304 | spin_unlock(&swap_lock); | ||
305 | if (ret == 0 && pages_to_unuse) | ||
306 | try_to_unuse(type, true, pages_to_unuse); | ||
307 | return; | ||
308 | } | ||
309 | EXPORT_SYMBOL(frontswap_shrink); | ||
310 | |||
311 | /* | ||
312 | * Count and return the number of frontswap pages across all | ||
313 | * swap devices. This is exported so that backend drivers can | ||
314 | * determine current usage without reading debugfs. | ||
315 | */ | ||
316 | unsigned long frontswap_curr_pages(void) | ||
317 | { | ||
318 | unsigned long totalpages = 0; | ||
319 | |||
320 | spin_lock(&swap_lock); | ||
321 | totalpages = __frontswap_curr_pages(); | ||
322 | spin_unlock(&swap_lock); | ||
323 | |||
324 | return totalpages; | ||
325 | } | ||
326 | EXPORT_SYMBOL(frontswap_curr_pages); | ||
327 | |||
328 | static int __init init_frontswap(void) | ||
329 | { | ||
330 | #ifdef CONFIG_DEBUG_FS | ||
331 | struct dentry *root = debugfs_create_dir("frontswap", NULL); | ||
332 | if (root == NULL) | ||
333 | return -ENXIO; | ||
334 | debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads); | ||
335 | debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores); | ||
336 | debugfs_create_u64("failed_stores", S_IRUGO, root, | ||
337 | &frontswap_failed_stores); | ||
338 | debugfs_create_u64("invalidates", S_IRUGO, | ||
339 | root, &frontswap_invalidates); | ||
340 | #endif | ||
341 | return 0; | ||
342 | } | ||
343 | |||
344 | module_init(init_frontswap); | ||
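A minimal sketch of what a backend registration looks like against the API this file exposes. The demo_* names are hypothetical and the callbacks deliberately do nothing, so every store is refused and swap data still goes to the real swap device.

/* Minimal, hypothetical frontswap backend sketch (not a real backend). */
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/frontswap.h>

static void demo_init(unsigned type)
{
	/* per-swap-device setup would go here */
}

static int demo_store(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;	/* refuse: caller writes the page to the swap device */
}

static int demo_load(unsigned type, pgoff_t offset, struct page *page)
{
	return -1;	/* nothing was stored, so nothing to load */
}

static void demo_invalidate_page(unsigned type, pgoff_t offset) { }
static void demo_invalidate_area(unsigned type) { }

static struct frontswap_ops demo_ops = {
	.init			= demo_init,
	.store			= demo_store,
	.load			= demo_load,
	.invalidate_page	= demo_invalidate_page,
	.invalidate_area	= demo_invalidate_area,
};

static int __init demo_frontswap_init(void)
{
	struct frontswap_ops old = frontswap_register_ops(&demo_ops);

	if (old.init)
		printk(KERN_INFO "demo_frontswap: replacing an existing backend\n");
	return 0;
}
module_init(demo_frontswap_init);
MODULE_LICENSE("GPL");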
diff --git a/mm/highmem.c b/mm/highmem.c index 57d82c6250c3..d517cd16a6eb 100644 --- a/mm/highmem.c +++ b/mm/highmem.c | |||
@@ -94,6 +94,18 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); | |||
94 | do { spin_unlock(&kmap_lock); (void)(flags); } while (0) | 94 | do { spin_unlock(&kmap_lock); (void)(flags); } while (0) |
95 | #endif | 95 | #endif |
96 | 96 | ||
97 | struct page *kmap_to_page(void *vaddr) | ||
98 | { | ||
99 | unsigned long addr = (unsigned long)vaddr; | ||
100 | |||
101 | if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) { | ||
102 | int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; | ||
103 | return pte_page(pkmap_page_table[i]); | ||
104 | } | ||
105 | |||
106 | return virt_to_page(addr); | ||
107 | } | ||
108 | |||
97 | static void flush_all_zero_pkmaps(void) | 109 | static void flush_all_zero_pkmaps(void) |
98 | { | 110 | { |
99 | int i; | 111 | int i; |
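A hedged usage sketch of the new kmap_to_page() helper, assuming its declaration lands in linux/highmem.h as part of this series: it checks that a kmap()'d address resolves back to the page that was mapped.

/* Hedged usage sketch for kmap_to_page(); the function name of the demo
 * itself is hypothetical. */
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/bug.h>

static void kmap_to_page_demo(void)
{
	struct page *page = alloc_page(GFP_HIGHUSER);
	void *vaddr;

	if (!page)
		return;
	vaddr = kmap(page);
	WARN_ON(kmap_to_page(vaddr) != page);	/* covers both pkmap and lowmem addresses */
	kunmap(page);
	__free_page(page);
}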
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e198831276a3..bc727122dd44 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -24,17 +24,20 @@ | |||
24 | 24 | ||
25 | #include <asm/page.h> | 25 | #include <asm/page.h> |
26 | #include <asm/pgtable.h> | 26 | #include <asm/pgtable.h> |
27 | #include <linux/io.h> | 27 | #include <asm/tlb.h> |
28 | 28 | ||
29 | #include <linux/io.h> | ||
29 | #include <linux/hugetlb.h> | 30 | #include <linux/hugetlb.h> |
31 | #include <linux/hugetlb_cgroup.h> | ||
30 | #include <linux/node.h> | 32 | #include <linux/node.h> |
33 | #include <linux/hugetlb_cgroup.h> | ||
31 | #include "internal.h" | 34 | #include "internal.h" |
32 | 35 | ||
33 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 36 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
34 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; | 37 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; |
35 | unsigned long hugepages_treat_as_movable; | 38 | unsigned long hugepages_treat_as_movable; |
36 | 39 | ||
37 | static int max_hstate; | 40 | int hugetlb_max_hstate __read_mostly; |
38 | unsigned int default_hstate_idx; | 41 | unsigned int default_hstate_idx; |
39 | struct hstate hstates[HUGE_MAX_HSTATE]; | 42 | struct hstate hstates[HUGE_MAX_HSTATE]; |
40 | 43 | ||
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate; | |||
45 | static unsigned long __initdata default_hstate_max_huge_pages; | 48 | static unsigned long __initdata default_hstate_max_huge_pages; |
46 | static unsigned long __initdata default_hstate_size; | 49 | static unsigned long __initdata default_hstate_size; |
47 | 50 | ||
48 | #define for_each_hstate(h) \ | ||
49 | for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) | ||
50 | |||
51 | /* | 51 | /* |
52 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages | 52 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages |
53 | */ | 53 | */ |
54 | static DEFINE_SPINLOCK(hugetlb_lock); | 54 | DEFINE_SPINLOCK(hugetlb_lock); |
55 | 55 | ||
56 | static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) | 56 | static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) |
57 | { | 57 | { |
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src) | |||
509 | static void enqueue_huge_page(struct hstate *h, struct page *page) | 509 | static void enqueue_huge_page(struct hstate *h, struct page *page) |
510 | { | 510 | { |
511 | int nid = page_to_nid(page); | 511 | int nid = page_to_nid(page); |
512 | list_add(&page->lru, &h->hugepage_freelists[nid]); | 512 | list_move(&page->lru, &h->hugepage_freelists[nid]); |
513 | h->free_huge_pages++; | 513 | h->free_huge_pages++; |
514 | h->free_huge_pages_node[nid]++; | 514 | h->free_huge_pages_node[nid]++; |
515 | } | 515 | } |
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) | |||
521 | if (list_empty(&h->hugepage_freelists[nid])) | 521 | if (list_empty(&h->hugepage_freelists[nid])) |
522 | return NULL; | 522 | return NULL; |
523 | page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); | 523 | page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); |
524 | list_del(&page->lru); | 524 | list_move(&page->lru, &h->hugepage_activelist); |
525 | set_page_refcounted(page); | 525 | set_page_refcounted(page); |
526 | h->free_huge_pages--; | 526 | h->free_huge_pages--; |
527 | h->free_huge_pages_node[nid]--; | 527 | h->free_huge_pages_node[nid]--; |
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) | |||
593 | 1 << PG_active | 1 << PG_reserved | | 593 | 1 << PG_active | 1 << PG_reserved | |
594 | 1 << PG_private | 1 << PG_writeback); | 594 | 1 << PG_private | 1 << PG_writeback); |
595 | } | 595 | } |
596 | VM_BUG_ON(hugetlb_cgroup_from_page(page)); | ||
596 | set_compound_page_dtor(page, NULL); | 597 | set_compound_page_dtor(page, NULL); |
597 | set_page_refcounted(page); | 598 | set_page_refcounted(page); |
598 | arch_release_hugepage(page); | 599 | arch_release_hugepage(page); |
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page) | |||
625 | page->mapping = NULL; | 626 | page->mapping = NULL; |
626 | BUG_ON(page_count(page)); | 627 | BUG_ON(page_count(page)); |
627 | BUG_ON(page_mapcount(page)); | 628 | BUG_ON(page_mapcount(page)); |
628 | INIT_LIST_HEAD(&page->lru); | ||
629 | 629 | ||
630 | spin_lock(&hugetlb_lock); | 630 | spin_lock(&hugetlb_lock); |
631 | hugetlb_cgroup_uncharge_page(hstate_index(h), | ||
632 | pages_per_huge_page(h), page); | ||
631 | if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { | 633 | if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { |
634 | /* remove the page from active list */ | ||
635 | list_del(&page->lru); | ||
632 | update_and_free_page(h, page); | 636 | update_and_free_page(h, page); |
633 | h->surplus_huge_pages--; | 637 | h->surplus_huge_pages--; |
634 | h->surplus_huge_pages_node[nid]--; | 638 | h->surplus_huge_pages_node[nid]--; |
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page) | |||
641 | 645 | ||
642 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) | 646 | static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) |
643 | { | 647 | { |
648 | INIT_LIST_HEAD(&page->lru); | ||
644 | set_compound_page_dtor(page, free_huge_page); | 649 | set_compound_page_dtor(page, free_huge_page); |
645 | spin_lock(&hugetlb_lock); | 650 | spin_lock(&hugetlb_lock); |
651 | set_hugetlb_cgroup(page, NULL); | ||
646 | h->nr_huge_pages++; | 652 | h->nr_huge_pages++; |
647 | h->nr_huge_pages_node[nid]++; | 653 | h->nr_huge_pages_node[nid]++; |
648 | spin_unlock(&hugetlb_lock); | 654 | spin_unlock(&hugetlb_lock); |
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) | |||
889 | 895 | ||
890 | spin_lock(&hugetlb_lock); | 896 | spin_lock(&hugetlb_lock); |
891 | if (page) { | 897 | if (page) { |
898 | INIT_LIST_HEAD(&page->lru); | ||
892 | r_nid = page_to_nid(page); | 899 | r_nid = page_to_nid(page); |
893 | set_compound_page_dtor(page, free_huge_page); | 900 | set_compound_page_dtor(page, free_huge_page); |
901 | set_hugetlb_cgroup(page, NULL); | ||
894 | /* | 902 | /* |
895 | * We incremented the global counters already | 903 | * We incremented the global counters already |
896 | */ | 904 | */ |
@@ -993,7 +1001,6 @@ retry: | |||
993 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 1001 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
994 | if ((--needed) < 0) | 1002 | if ((--needed) < 0) |
995 | break; | 1003 | break; |
996 | list_del(&page->lru); | ||
997 | /* | 1004 | /* |
998 | * This page is now managed by the hugetlb allocator and has | 1005 | * This page is now managed by the hugetlb allocator and has |
999 | * no users -- drop the buddy allocator's reference. | 1006 | * no users -- drop the buddy allocator's reference. |
@@ -1008,7 +1015,6 @@ free: | |||
1008 | /* Free unnecessary surplus pages to the buddy allocator */ | 1015 | /* Free unnecessary surplus pages to the buddy allocator */ |
1009 | if (!list_empty(&surplus_list)) { | 1016 | if (!list_empty(&surplus_list)) { |
1010 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | 1017 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { |
1011 | list_del(&page->lru); | ||
1012 | put_page(page); | 1018 | put_page(page); |
1013 | } | 1019 | } |
1014 | } | 1020 | } |
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1112 | struct hstate *h = hstate_vma(vma); | 1118 | struct hstate *h = hstate_vma(vma); |
1113 | struct page *page; | 1119 | struct page *page; |
1114 | long chg; | 1120 | long chg; |
1121 | int ret, idx; | ||
1122 | struct hugetlb_cgroup *h_cg; | ||
1115 | 1123 | ||
1124 | idx = hstate_index(h); | ||
1116 | /* | 1125 | /* |
1117 | * Processes that did not create the mapping will have no | 1126 | * Processes that did not create the mapping will have no |
1118 | * reserves and will not have accounted against subpool | 1127 | * reserves and will not have accounted against subpool |
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1123 | */ | 1132 | */ |
1124 | chg = vma_needs_reservation(h, vma, addr); | 1133 | chg = vma_needs_reservation(h, vma, addr); |
1125 | if (chg < 0) | 1134 | if (chg < 0) |
1126 | return ERR_PTR(-VM_FAULT_OOM); | 1135 | return ERR_PTR(-ENOMEM); |
1127 | if (chg) | 1136 | if (chg) |
1128 | if (hugepage_subpool_get_pages(spool, chg)) | 1137 | if (hugepage_subpool_get_pages(spool, chg)) |
1129 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1138 | return ERR_PTR(-ENOSPC); |
1130 | 1139 | ||
1140 | ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); | ||
1141 | if (ret) { | ||
1142 | hugepage_subpool_put_pages(spool, chg); | ||
1143 | return ERR_PTR(-ENOSPC); | ||
1144 | } | ||
1131 | spin_lock(&hugetlb_lock); | 1145 | spin_lock(&hugetlb_lock); |
1132 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); | 1146 | page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); |
1133 | spin_unlock(&hugetlb_lock); | 1147 | if (page) { |
1134 | 1148 | /* update page cgroup details */ | |
1135 | if (!page) { | 1149 | hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), |
1150 | h_cg, page); | ||
1151 | spin_unlock(&hugetlb_lock); | ||
1152 | } else { | ||
1153 | spin_unlock(&hugetlb_lock); | ||
1136 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); | 1154 | page = alloc_buddy_huge_page(h, NUMA_NO_NODE); |
1137 | if (!page) { | 1155 | if (!page) { |
1156 | hugetlb_cgroup_uncharge_cgroup(idx, | ||
1157 | pages_per_huge_page(h), | ||
1158 | h_cg); | ||
1138 | hugepage_subpool_put_pages(spool, chg); | 1159 | hugepage_subpool_put_pages(spool, chg); |
1139 | return ERR_PTR(-VM_FAULT_SIGBUS); | 1160 | return ERR_PTR(-ENOSPC); |
1140 | } | 1161 | } |
1162 | spin_lock(&hugetlb_lock); | ||
1163 | hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), | ||
1164 | h_cg, page); | ||
1165 | list_move(&page->lru, &h->hugepage_activelist); | ||
1166 | spin_unlock(&hugetlb_lock); | ||
1141 | } | 1167 | } |
1142 | 1168 | ||
1143 | set_page_private(page, (unsigned long)spool); | 1169 | set_page_private(page, (unsigned long)spool); |
1144 | 1170 | ||
1145 | vma_commit_reservation(h, vma, addr); | 1171 | vma_commit_reservation(h, vma, addr); |
1146 | |||
1147 | return page; | 1172 | return page; |
1148 | } | 1173 | } |
1149 | 1174 | ||
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, | |||
1646 | struct attribute_group *hstate_attr_group) | 1671 | struct attribute_group *hstate_attr_group) |
1647 | { | 1672 | { |
1648 | int retval; | 1673 | int retval; |
1649 | int hi = h - hstates; | 1674 | int hi = hstate_index(h); |
1650 | 1675 | ||
1651 | hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); | 1676 | hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); |
1652 | if (!hstate_kobjs[hi]) | 1677 | if (!hstate_kobjs[hi]) |
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node) | |||
1741 | if (!nhs->hugepages_kobj) | 1766 | if (!nhs->hugepages_kobj) |
1742 | return; /* no hstate attributes */ | 1767 | return; /* no hstate attributes */ |
1743 | 1768 | ||
1744 | for_each_hstate(h) | 1769 | for_each_hstate(h) { |
1745 | if (nhs->hstate_kobjs[h - hstates]) { | 1770 | int idx = hstate_index(h); |
1746 | kobject_put(nhs->hstate_kobjs[h - hstates]); | 1771 | if (nhs->hstate_kobjs[idx]) { |
1747 | nhs->hstate_kobjs[h - hstates] = NULL; | 1772 | kobject_put(nhs->hstate_kobjs[idx]); |
1773 | nhs->hstate_kobjs[idx] = NULL; | ||
1748 | } | 1774 | } |
1775 | } | ||
1749 | 1776 | ||
1750 | kobject_put(nhs->hugepages_kobj); | 1777 | kobject_put(nhs->hugepages_kobj); |
1751 | nhs->hugepages_kobj = NULL; | 1778 | nhs->hugepages_kobj = NULL; |
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void) | |||
1848 | hugetlb_unregister_all_nodes(); | 1875 | hugetlb_unregister_all_nodes(); |
1849 | 1876 | ||
1850 | for_each_hstate(h) { | 1877 | for_each_hstate(h) { |
1851 | kobject_put(hstate_kobjs[h - hstates]); | 1878 | kobject_put(hstate_kobjs[hstate_index(h)]); |
1852 | } | 1879 | } |
1853 | 1880 | ||
1854 | kobject_put(hugepages_kobj); | 1881 | kobject_put(hugepages_kobj); |
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void) | |||
1869 | if (!size_to_hstate(default_hstate_size)) | 1896 | if (!size_to_hstate(default_hstate_size)) |
1870 | hugetlb_add_hstate(HUGETLB_PAGE_ORDER); | 1897 | hugetlb_add_hstate(HUGETLB_PAGE_ORDER); |
1871 | } | 1898 | } |
1872 | default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; | 1899 | default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); |
1873 | if (default_hstate_max_huge_pages) | 1900 | if (default_hstate_max_huge_pages) |
1874 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; | 1901 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; |
1875 | 1902 | ||
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order) | |||
1897 | printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); | 1924 | printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); |
1898 | return; | 1925 | return; |
1899 | } | 1926 | } |
1900 | BUG_ON(max_hstate >= HUGE_MAX_HSTATE); | 1927 | BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); |
1901 | BUG_ON(order == 0); | 1928 | BUG_ON(order == 0); |
1902 | h = &hstates[max_hstate++]; | 1929 | h = &hstates[hugetlb_max_hstate++]; |
1903 | h->order = order; | 1930 | h->order = order; |
1904 | h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); | 1931 | h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); |
1905 | h->nr_huge_pages = 0; | 1932 | h->nr_huge_pages = 0; |
1906 | h->free_huge_pages = 0; | 1933 | h->free_huge_pages = 0; |
1907 | for (i = 0; i < MAX_NUMNODES; ++i) | 1934 | for (i = 0; i < MAX_NUMNODES; ++i) |
1908 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1935 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
1936 | INIT_LIST_HEAD(&h->hugepage_activelist); | ||
1909 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); | 1937 | h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); |
1910 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); | 1938 | h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); |
1911 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1939 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
1912 | huge_page_size(h)/1024); | 1940 | huge_page_size(h)/1024); |
1941 | /* | ||
1942 | * Add cgroup control files only if the huge page consists | ||
1943 | * of more than two normal pages. This is because we use | ||
1944 | * page[2].lru.next for storing cgroup details. | ||
1945 | */ | ||
1946 | if (order >= HUGETLB_CGROUP_MIN_ORDER) | ||
1947 | hugetlb_cgroup_file_init(hugetlb_max_hstate - 1); | ||
1913 | 1948 | ||
1914 | parsed_hstate = h; | 1949 | parsed_hstate = h; |
1915 | } | 1950 | } |
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s) | |||
1920 | static unsigned long *last_mhp; | 1955 | static unsigned long *last_mhp; |
1921 | 1956 | ||
1922 | /* | 1957 | /* |
1923 | * !max_hstate means we haven't parsed a hugepagesz= parameter yet, | 1958 | * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, |
1924 | * so this hugepages= parameter goes to the "default hstate". | 1959 | * so this hugepages= parameter goes to the "default hstate". |
1925 | */ | 1960 | */ |
1926 | if (!max_hstate) | 1961 | if (!hugetlb_max_hstate) |
1927 | mhp = &default_hstate_max_huge_pages; | 1962 | mhp = &default_hstate_max_huge_pages; |
1928 | else | 1963 | else |
1929 | mhp = &parsed_hstate->max_huge_pages; | 1964 | mhp = &parsed_hstate->max_huge_pages; |
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s) | |||
1942 | * But we need to allocate >= MAX_ORDER hstates here early to still | 1977 | * But we need to allocate >= MAX_ORDER hstates here early to still |
1943 | * use the bootmem allocator. | 1978 | * use the bootmem allocator. |
1944 | */ | 1979 | */ |
1945 | if (max_hstate && parsed_hstate->order >= MAX_ORDER) | 1980 | if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) |
1946 | hugetlb_hstate_alloc_pages(parsed_hstate); | 1981 | hugetlb_hstate_alloc_pages(parsed_hstate); |
1947 | 1982 | ||
1948 | last_mhp = mhp; | 1983 | last_mhp = mhp; |
@@ -2308,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte) | |||
2308 | return 0; | 2343 | return 0; |
2309 | } | 2344 | } |
2310 | 2345 | ||
2311 | void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | 2346 | void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, |
2312 | unsigned long end, struct page *ref_page) | 2347 | unsigned long start, unsigned long end, |
2348 | struct page *ref_page) | ||
2313 | { | 2349 | { |
2350 | int force_flush = 0; | ||
2314 | struct mm_struct *mm = vma->vm_mm; | 2351 | struct mm_struct *mm = vma->vm_mm; |
2315 | unsigned long address; | 2352 | unsigned long address; |
2316 | pte_t *ptep; | 2353 | pte_t *ptep; |
2317 | pte_t pte; | 2354 | pte_t pte; |
2318 | struct page *page; | 2355 | struct page *page; |
2319 | struct page *tmp; | ||
2320 | struct hstate *h = hstate_vma(vma); | 2356 | struct hstate *h = hstate_vma(vma); |
2321 | unsigned long sz = huge_page_size(h); | 2357 | unsigned long sz = huge_page_size(h); |
2322 | 2358 | ||
2323 | /* | ||
2324 | * A page gathering list, protected by per file i_mmap_mutex. The | ||
2325 | * lock is used to avoid list corruption from multiple unmapping | ||
2326 | * of the same page since we are using page->lru. | ||
2327 | */ | ||
2328 | LIST_HEAD(page_list); | ||
2329 | |||
2330 | WARN_ON(!is_vm_hugetlb_page(vma)); | 2359 | WARN_ON(!is_vm_hugetlb_page(vma)); |
2331 | BUG_ON(start & ~huge_page_mask(h)); | 2360 | BUG_ON(start & ~huge_page_mask(h)); |
2332 | BUG_ON(end & ~huge_page_mask(h)); | 2361 | BUG_ON(end & ~huge_page_mask(h)); |
2333 | 2362 | ||
2363 | tlb_start_vma(tlb, vma); | ||
2334 | mmu_notifier_invalidate_range_start(mm, start, end); | 2364 | mmu_notifier_invalidate_range_start(mm, start, end); |
2365 | again: | ||
2335 | spin_lock(&mm->page_table_lock); | 2366 | spin_lock(&mm->page_table_lock); |
2336 | for (address = start; address < end; address += sz) { | 2367 | for (address = start; address < end; address += sz) { |
2337 | ptep = huge_pte_offset(mm, address); | 2368 | ptep = huge_pte_offset(mm, address); |
@@ -2370,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | |||
2370 | } | 2401 | } |
2371 | 2402 | ||
2372 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 2403 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
2404 | tlb_remove_tlb_entry(tlb, ptep, address); | ||
2373 | if (pte_dirty(pte)) | 2405 | if (pte_dirty(pte)) |
2374 | set_page_dirty(page); | 2406 | set_page_dirty(page); |
2375 | list_add(&page->lru, &page_list); | ||
2376 | 2407 | ||
2408 | page_remove_rmap(page); | ||
2409 | force_flush = !__tlb_remove_page(tlb, page); | ||
2410 | if (force_flush) | ||
2411 | break; | ||
2377 | /* Bail out after unmapping reference page if supplied */ | 2412 | /* Bail out after unmapping reference page if supplied */ |
2378 | if (ref_page) | 2413 | if (ref_page) |
2379 | break; | 2414 | break; |
2380 | } | 2415 | } |
2381 | flush_tlb_range(vma, start, end); | ||
2382 | spin_unlock(&mm->page_table_lock); | 2416 | spin_unlock(&mm->page_table_lock); |
2383 | mmu_notifier_invalidate_range_end(mm, start, end); | 2417 | /* |
2384 | list_for_each_entry_safe(page, tmp, &page_list, lru) { | 2418 | * mmu_gather ran out of room to batch pages, we break out of |
2385 | page_remove_rmap(page); | 2419 | * the PTE lock to avoid doing the potential expensive TLB invalidate |
2386 | list_del(&page->lru); | 2420 | * and page-free while holding it. |
2387 | put_page(page); | 2421 | */ |
2422 | if (force_flush) { | ||
2423 | force_flush = 0; | ||
2424 | tlb_flush_mmu(tlb); | ||
2425 | if (address < end && !ref_page) | ||
2426 | goto again; | ||
2388 | } | 2427 | } |
2428 | mmu_notifier_invalidate_range_end(mm, start, end); | ||
2429 | tlb_end_vma(tlb, vma); | ||
2430 | } | ||
2431 | |||
2432 | void __unmap_hugepage_range_final(struct mmu_gather *tlb, | ||
2433 | struct vm_area_struct *vma, unsigned long start, | ||
2434 | unsigned long end, struct page *ref_page) | ||
2435 | { | ||
2436 | __unmap_hugepage_range(tlb, vma, start, end, ref_page); | ||
2437 | |||
2438 | /* | ||
2439 | * Clear this flag so that x86's huge_pmd_share page_table_shareable | ||
2440 | * test will fail on a vma being torn down, and not grab a page table | ||
2441 | * on its way out. We're lucky that the flag has such an appropriate | ||
2442 | * name, and can in fact be safely cleared here. We could clear it | ||
2443 | * before the __unmap_hugepage_range above, but all that's necessary | ||
2444 | * is to clear it before releasing the i_mmap_mutex. This works | ||
2445 | * because in the context this is called, the VMA is about to be | ||
2446 | * destroyed and the i_mmap_mutex is held. | ||
2447 | */ | ||
2448 | vma->vm_flags &= ~VM_MAYSHARE; | ||
2389 | } | 2449 | } |
2390 | 2450 | ||
2391 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, | 2451 | void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, |
2392 | unsigned long end, struct page *ref_page) | 2452 | unsigned long end, struct page *ref_page) |
2393 | { | 2453 | { |
2394 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); | 2454 | struct mm_struct *mm; |
2395 | __unmap_hugepage_range(vma, start, end, ref_page); | 2455 | struct mmu_gather tlb; |
2396 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 2456 | |
2457 | mm = vma->vm_mm; | ||
2458 | |||
2459 | tlb_gather_mmu(&tlb, mm, 0); | ||
2460 | __unmap_hugepage_range(&tlb, vma, start, end, ref_page); | ||
2461 | tlb_finish_mmu(&tlb, start, end); | ||
2397 | } | 2462 | } |
2398 | 2463 | ||
2399 | /* | 2464 | /* |
@@ -2438,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2438 | * from the time of fork. This would look like data corruption | 2503 | * from the time of fork. This would look like data corruption |
2439 | */ | 2504 | */ |
2440 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) | 2505 | if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) |
2441 | __unmap_hugepage_range(iter_vma, | 2506 | unmap_hugepage_range(iter_vma, address, |
2442 | address, address + huge_page_size(h), | 2507 | address + huge_page_size(h), page); |
2443 | page); | ||
2444 | } | 2508 | } |
2445 | mutex_unlock(&mapping->i_mmap_mutex); | 2509 | mutex_unlock(&mapping->i_mmap_mutex); |
2446 | 2510 | ||
@@ -2496,6 +2560,7 @@ retry_avoidcopy: | |||
2496 | new_page = alloc_huge_page(vma, address, outside_reserve); | 2560 | new_page = alloc_huge_page(vma, address, outside_reserve); |
2497 | 2561 | ||
2498 | if (IS_ERR(new_page)) { | 2562 | if (IS_ERR(new_page)) { |
2563 | long err = PTR_ERR(new_page); | ||
2499 | page_cache_release(old_page); | 2564 | page_cache_release(old_page); |
2500 | 2565 | ||
2501 | /* | 2566 | /* |
@@ -2524,7 +2589,10 @@ retry_avoidcopy: | |||
2524 | 2589 | ||
2525 | /* Caller expects lock to be held */ | 2590 | /* Caller expects lock to be held */ |
2526 | spin_lock(&mm->page_table_lock); | 2591 | spin_lock(&mm->page_table_lock); |
2527 | return -PTR_ERR(new_page); | 2592 | if (err == -ENOMEM) |
2593 | return VM_FAULT_OOM; | ||
2594 | else | ||
2595 | return VM_FAULT_SIGBUS; | ||
2528 | } | 2596 | } |
2529 | 2597 | ||
2530 | /* | 2598 | /* |
@@ -2642,7 +2710,11 @@ retry: | |||
2642 | goto out; | 2710 | goto out; |
2643 | page = alloc_huge_page(vma, address, 0); | 2711 | page = alloc_huge_page(vma, address, 0); |
2644 | if (IS_ERR(page)) { | 2712 | if (IS_ERR(page)) { |
2645 | ret = -PTR_ERR(page); | 2713 | ret = PTR_ERR(page); |
2714 | if (ret == -ENOMEM) | ||
2715 | ret = VM_FAULT_OOM; | ||
2716 | else | ||
2717 | ret = VM_FAULT_SIGBUS; | ||
2646 | goto out; | 2718 | goto out; |
2647 | } | 2719 | } |
2648 | clear_huge_page(page, address, pages_per_huge_page(h)); | 2720 | clear_huge_page(page, address, pages_per_huge_page(h)); |
@@ -2679,7 +2751,7 @@ retry: | |||
2679 | */ | 2751 | */ |
2680 | if (unlikely(PageHWPoison(page))) { | 2752 | if (unlikely(PageHWPoison(page))) { |
2681 | ret = VM_FAULT_HWPOISON | | 2753 | ret = VM_FAULT_HWPOISON | |
2682 | VM_FAULT_SET_HINDEX(h - hstates); | 2754 | VM_FAULT_SET_HINDEX(hstate_index(h)); |
2683 | goto backout_unlocked; | 2755 | goto backout_unlocked; |
2684 | } | 2756 | } |
2685 | } | 2757 | } |
@@ -2752,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2752 | return 0; | 2824 | return 0; |
2753 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) | 2825 | } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) |
2754 | return VM_FAULT_HWPOISON_LARGE | | 2826 | return VM_FAULT_HWPOISON_LARGE | |
2755 | VM_FAULT_SET_HINDEX(h - hstates); | 2827 | VM_FAULT_SET_HINDEX(hstate_index(h)); |
2756 | } | 2828 | } |
2757 | 2829 | ||
2758 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); | 2830 | ptep = huge_pte_alloc(mm, address, huge_page_size(h)); |
@@ -2959,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
2959 | } | 3031 | } |
2960 | } | 3032 | } |
2961 | spin_unlock(&mm->page_table_lock); | 3033 | spin_unlock(&mm->page_table_lock); |
2962 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | 3034 | /* |
2963 | 3035 | * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare | |
3036 | * may have cleared our pud entry and done put_page on the page table: | ||
3037 | * once we release i_mmap_mutex, another task can do the final put_page | ||
3038 | * and that page table be reused and filled with junk. | ||
3039 | */ | ||
2964 | flush_tlb_range(vma, start, end); | 3040 | flush_tlb_range(vma, start, end); |
3041 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | ||
2965 | } | 3042 | } |
2966 | 3043 | ||
2967 | int hugetlb_reserve_pages(struct inode *inode, | 3044 | int hugetlb_reserve_pages(struct inode *inode, |
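The two hunks above that replace "-PTR_ERR(new_page)" with an explicit VM_FAULT_OOM / VM_FAULT_SIGBUS decision fix a real bug: PTR_ERR() yields a negative errno, not a fault code, so the fault path previously returned garbage. Below is a minimal userspace sketch of the pattern, with simplified stand-ins for ERR_PTR()/PTR_ERR()/IS_ERR() and invented MY_FAULT_* flags; it is illustrative only, not the kernel API.

/*
 * Illustrative sketch: translate an error-pointer allocation failure into
 * a fault code instead of returning -PTR_ERR() directly.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MY_FAULT_OOM    0x0001
#define MY_FAULT_SIGBUS 0x0002

static inline void *err_ptr(long err)     { return (void *)(intptr_t)err; }
static inline long ptr_err(const void *p) { return (long)(intptr_t)p; }
static inline int is_err(const void *p)   { return (uintptr_t)p >= (uintptr_t)-4095; }

/* Pretend allocator: fails with -ENOMEM or -ENOSPC, like alloc_huge_page(). */
static void *fake_alloc(int fail_mode)
{
        if (fail_mode == 1)
                return err_ptr(-ENOMEM);
        if (fail_mode == 2)
                return err_ptr(-ENOSPC);
        return malloc(4096);
}

/* Map the errno onto a fault code, as the patch above does. */
static int fault_code(void *page)
{
        long err = ptr_err(page);

        return (err == -ENOMEM) ? MY_FAULT_OOM : MY_FAULT_SIGBUS;
}

int main(void)
{
        for (int mode = 0; mode <= 2; mode++) {
                void *page = fake_alloc(mode);

                if (is_err(page)) {
                        printf("mode %d -> fault 0x%x\n", mode, fault_code(page));
                } else {
                        printf("mode %d -> allocated\n", mode);
                        free(page);
                }
        }
        return 0;
}

The key point is that the errno is inspected and mapped to a fault code; only -ENOMEM becomes an OOM fault, everything else is treated as SIGBUS.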
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c new file mode 100644 index 000000000000..a3f358fb8a0c --- /dev/null +++ b/mm/hugetlb_cgroup.c | |||
@@ -0,0 +1,418 @@ | |||
1 | /* | ||
2 | * | ||
3 | * Copyright IBM Corporation, 2012 | ||
4 | * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify it | ||
7 | * under the terms of version 2.1 of the GNU Lesser General Public License | ||
8 | * as published by the Free Software Foundation. | ||
9 | * | ||
10 | * This program is distributed in the hope that it would be useful, but | ||
11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. | ||
13 | * | ||
14 | */ | ||
15 | |||
16 | #include <linux/cgroup.h> | ||
17 | #include <linux/slab.h> | ||
18 | #include <linux/hugetlb.h> | ||
19 | #include <linux/hugetlb_cgroup.h> | ||
20 | |||
21 | struct hugetlb_cgroup { | ||
22 | struct cgroup_subsys_state css; | ||
23 | /* | ||
24 | * the counter to account for hugepages from hugetlb. | ||
25 | */ | ||
26 | struct res_counter hugepage[HUGE_MAX_HSTATE]; | ||
27 | }; | ||
28 | |||
29 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | ||
30 | #define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) | ||
31 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | ||
32 | |||
33 | struct cgroup_subsys hugetlb_subsys __read_mostly; | ||
34 | static struct hugetlb_cgroup *root_h_cgroup __read_mostly; | ||
35 | |||
36 | static inline | ||
37 | struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) | ||
38 | { | ||
39 | return container_of(s, struct hugetlb_cgroup, css); | ||
40 | } | ||
41 | |||
42 | static inline | ||
43 | struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup) | ||
44 | { | ||
45 | return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup, | ||
46 | hugetlb_subsys_id)); | ||
47 | } | ||
48 | |||
49 | static inline | ||
50 | struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) | ||
51 | { | ||
52 | return hugetlb_cgroup_from_css(task_subsys_state(task, | ||
53 | hugetlb_subsys_id)); | ||
54 | } | ||
55 | |||
56 | static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) | ||
57 | { | ||
58 | return (h_cg == root_h_cgroup); | ||
59 | } | ||
60 | |||
61 | static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg) | ||
62 | { | ||
63 | if (!cg->parent) | ||
64 | return NULL; | ||
65 | return hugetlb_cgroup_from_cgroup(cg->parent); | ||
66 | } | ||
67 | |||
68 | static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg) | ||
69 | { | ||
70 | int idx; | ||
71 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg); | ||
72 | |||
73 | for (idx = 0; idx < hugetlb_max_hstate; idx++) { | ||
74 | if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) | ||
75 | return true; | ||
76 | } | ||
77 | return false; | ||
78 | } | ||
79 | |||
80 | static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup) | ||
81 | { | ||
82 | int idx; | ||
83 | struct cgroup *parent_cgroup; | ||
84 | struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup; | ||
85 | |||
86 | h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); | ||
87 | if (!h_cgroup) | ||
88 | return ERR_PTR(-ENOMEM); | ||
89 | |||
90 | parent_cgroup = cgroup->parent; | ||
91 | if (parent_cgroup) { | ||
92 | parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup); | ||
93 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) | ||
94 | res_counter_init(&h_cgroup->hugepage[idx], | ||
95 | &parent_h_cgroup->hugepage[idx]); | ||
96 | } else { | ||
97 | root_h_cgroup = h_cgroup; | ||
98 | for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) | ||
99 | res_counter_init(&h_cgroup->hugepage[idx], NULL); | ||
100 | } | ||
101 | return &h_cgroup->css; | ||
102 | } | ||
103 | |||
104 | static void hugetlb_cgroup_destroy(struct cgroup *cgroup) | ||
105 | { | ||
106 | struct hugetlb_cgroup *h_cgroup; | ||
107 | |||
108 | h_cgroup = hugetlb_cgroup_from_cgroup(cgroup); | ||
109 | kfree(h_cgroup); | ||
110 | } | ||
111 | |||
112 | |||
113 | /* | ||
114 | * Should be called with hugetlb_lock held. | ||
115 | * Since we are holding hugetlb_lock, pages cannot get moved off the | ||
116 | * active list or uncharged from the cgroup, so there is no need to | ||
117 | * take a page reference or test whether the page is active here. | ||
118 | * This function cannot fail. | ||
119 | */ | ||
120 | static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup, | ||
121 | struct page *page) | ||
122 | { | ||
123 | int csize; | ||
124 | struct res_counter *counter; | ||
125 | struct res_counter *fail_res; | ||
126 | struct hugetlb_cgroup *page_hcg; | ||
127 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
128 | struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup); | ||
129 | |||
130 | page_hcg = hugetlb_cgroup_from_page(page); | ||
131 | /* | ||
132 | * We can have pages on the active list that belong to no cgroup, | ||
133 | * i.e. hugepages of fewer than 3 pages. We can safely | ||
134 | * ignore those pages. | ||
135 | */ | ||
136 | if (!page_hcg || page_hcg != h_cg) | ||
137 | goto out; | ||
138 | |||
139 | csize = PAGE_SIZE << compound_order(page); | ||
140 | if (!parent) { | ||
141 | parent = root_h_cgroup; | ||
142 | /* root has no limit */ | ||
143 | res_counter_charge_nofail(&parent->hugepage[idx], | ||
144 | csize, &fail_res); | ||
145 | } | ||
146 | counter = &h_cg->hugepage[idx]; | ||
147 | res_counter_uncharge_until(counter, counter->parent, csize); | ||
148 | |||
149 | set_hugetlb_cgroup(page, parent); | ||
150 | out: | ||
151 | return; | ||
152 | } | ||
153 | |||
154 | /* | ||
155 | * Force the hugetlb cgroup to empty the hugetlb resources by moving them to | ||
156 | * the parent cgroup. | ||
157 | */ | ||
158 | static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) | ||
159 | { | ||
160 | struct hstate *h; | ||
161 | struct page *page; | ||
162 | int ret = 0, idx = 0; | ||
163 | |||
164 | do { | ||
165 | if (cgroup_task_count(cgroup) || | ||
166 | !list_empty(&cgroup->children)) { | ||
167 | ret = -EBUSY; | ||
168 | goto out; | ||
169 | } | ||
170 | for_each_hstate(h) { | ||
171 | spin_lock(&hugetlb_lock); | ||
172 | list_for_each_entry(page, &h->hugepage_activelist, lru) | ||
173 | hugetlb_cgroup_move_parent(idx, cgroup, page); | ||
174 | |||
175 | spin_unlock(&hugetlb_lock); | ||
176 | idx++; | ||
177 | } | ||
178 | cond_resched(); | ||
179 | } while (hugetlb_cgroup_have_usage(cgroup)); | ||
180 | out: | ||
181 | return ret; | ||
182 | } | ||
183 | |||
184 | int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, | ||
185 | struct hugetlb_cgroup **ptr) | ||
186 | { | ||
187 | int ret = 0; | ||
188 | struct res_counter *fail_res; | ||
189 | struct hugetlb_cgroup *h_cg = NULL; | ||
190 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
191 | |||
192 | if (hugetlb_cgroup_disabled()) | ||
193 | goto done; | ||
194 | /* | ||
195 | * We don't charge any cgroup if the compound page has fewer | ||
196 | * than 3 pages. | ||
197 | */ | ||
198 | if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) | ||
199 | goto done; | ||
200 | again: | ||
201 | rcu_read_lock(); | ||
202 | h_cg = hugetlb_cgroup_from_task(current); | ||
203 | if (!css_tryget(&h_cg->css)) { | ||
204 | rcu_read_unlock(); | ||
205 | goto again; | ||
206 | } | ||
207 | rcu_read_unlock(); | ||
208 | |||
209 | ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res); | ||
210 | css_put(&h_cg->css); | ||
211 | done: | ||
212 | *ptr = h_cg; | ||
213 | return ret; | ||
214 | } | ||
215 | |||
216 | /* Should be called with hugetlb_lock held */ | ||
217 | void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, | ||
218 | struct hugetlb_cgroup *h_cg, | ||
219 | struct page *page) | ||
220 | { | ||
221 | if (hugetlb_cgroup_disabled() || !h_cg) | ||
222 | return; | ||
223 | |||
224 | set_hugetlb_cgroup(page, h_cg); | ||
225 | return; | ||
226 | } | ||
227 | |||
228 | /* | ||
229 | * Should be called with hugetlb_lock held | ||
230 | */ | ||
231 | void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, | ||
232 | struct page *page) | ||
233 | { | ||
234 | struct hugetlb_cgroup *h_cg; | ||
235 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
236 | |||
237 | if (hugetlb_cgroup_disabled()) | ||
238 | return; | ||
239 | VM_BUG_ON(!spin_is_locked(&hugetlb_lock)); | ||
240 | h_cg = hugetlb_cgroup_from_page(page); | ||
241 | if (unlikely(!h_cg)) | ||
242 | return; | ||
243 | set_hugetlb_cgroup(page, NULL); | ||
244 | res_counter_uncharge(&h_cg->hugepage[idx], csize); | ||
245 | return; | ||
246 | } | ||
247 | |||
248 | void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, | ||
249 | struct hugetlb_cgroup *h_cg) | ||
250 | { | ||
251 | unsigned long csize = nr_pages * PAGE_SIZE; | ||
252 | |||
253 | if (hugetlb_cgroup_disabled() || !h_cg) | ||
254 | return; | ||
255 | |||
256 | if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) | ||
257 | return; | ||
258 | |||
259 | res_counter_uncharge(&h_cg->hugepage[idx], csize); | ||
260 | return; | ||
261 | } | ||
262 | |||
263 | static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft, | ||
264 | struct file *file, char __user *buf, | ||
265 | size_t nbytes, loff_t *ppos) | ||
266 | { | ||
267 | u64 val; | ||
268 | char str[64]; | ||
269 | int idx, name, len; | ||
270 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
271 | |||
272 | idx = MEMFILE_IDX(cft->private); | ||
273 | name = MEMFILE_ATTR(cft->private); | ||
274 | |||
275 | val = res_counter_read_u64(&h_cg->hugepage[idx], name); | ||
276 | len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); | ||
277 | return simple_read_from_buffer(buf, nbytes, ppos, str, len); | ||
278 | } | ||
279 | |||
280 | static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft, | ||
281 | const char *buffer) | ||
282 | { | ||
283 | int idx, name, ret; | ||
284 | unsigned long long val; | ||
285 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
286 | |||
287 | idx = MEMFILE_IDX(cft->private); | ||
288 | name = MEMFILE_ATTR(cft->private); | ||
289 | |||
290 | switch (name) { | ||
291 | case RES_LIMIT: | ||
292 | if (hugetlb_cgroup_is_root(h_cg)) { | ||
293 | /* Can't set limit on root */ | ||
294 | ret = -EINVAL; | ||
295 | break; | ||
296 | } | ||
297 | /* This function does all necessary parse...reuse it */ | ||
298 | ret = res_counter_memparse_write_strategy(buffer, &val); | ||
299 | if (ret) | ||
300 | break; | ||
301 | ret = res_counter_set_limit(&h_cg->hugepage[idx], val); | ||
302 | break; | ||
303 | default: | ||
304 | ret = -EINVAL; | ||
305 | break; | ||
306 | } | ||
307 | return ret; | ||
308 | } | ||
309 | |||
310 | static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event) | ||
311 | { | ||
312 | int idx, name, ret = 0; | ||
313 | struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); | ||
314 | |||
315 | idx = MEMFILE_IDX(event); | ||
316 | name = MEMFILE_ATTR(event); | ||
317 | |||
318 | switch (name) { | ||
319 | case RES_MAX_USAGE: | ||
320 | res_counter_reset_max(&h_cg->hugepage[idx]); | ||
321 | break; | ||
322 | case RES_FAILCNT: | ||
323 | res_counter_reset_failcnt(&h_cg->hugepage[idx]); | ||
324 | break; | ||
325 | default: | ||
326 | ret = -EINVAL; | ||
327 | break; | ||
328 | } | ||
329 | return ret; | ||
330 | } | ||
331 | |||
332 | static char *mem_fmt(char *buf, int size, unsigned long hsize) | ||
333 | { | ||
334 | if (hsize >= (1UL << 30)) | ||
335 | snprintf(buf, size, "%luGB", hsize >> 30); | ||
336 | else if (hsize >= (1UL << 20)) | ||
337 | snprintf(buf, size, "%luMB", hsize >> 20); | ||
338 | else | ||
339 | snprintf(buf, size, "%luKB", hsize >> 10); | ||
340 | return buf; | ||
341 | } | ||
342 | |||
343 | int __init hugetlb_cgroup_file_init(int idx) | ||
344 | { | ||
345 | char buf[32]; | ||
346 | struct cftype *cft; | ||
347 | struct hstate *h = &hstates[idx]; | ||
348 | |||
349 | /* format the size */ | ||
350 | mem_fmt(buf, 32, huge_page_size(h)); | ||
351 | |||
352 | /* Add the limit file */ | ||
353 | cft = &h->cgroup_files[0]; | ||
354 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); | ||
355 | cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); | ||
356 | cft->read = hugetlb_cgroup_read; | ||
357 | cft->write_string = hugetlb_cgroup_write; | ||
358 | |||
359 | /* Add the usage file */ | ||
360 | cft = &h->cgroup_files[1]; | ||
361 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); | ||
362 | cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); | ||
363 | cft->read = hugetlb_cgroup_read; | ||
364 | |||
365 | /* Add the MAX usage file */ | ||
366 | cft = &h->cgroup_files[2]; | ||
367 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); | ||
368 | cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); | ||
369 | cft->trigger = hugetlb_cgroup_reset; | ||
370 | cft->read = hugetlb_cgroup_read; | ||
371 | |||
372 | /* Add the failcnt file */ | ||
373 | cft = &h->cgroup_files[3]; | ||
374 | snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); | ||
375 | cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); | ||
376 | cft->trigger = hugetlb_cgroup_reset; | ||
377 | cft->read = hugetlb_cgroup_read; | ||
378 | |||
379 | /* NULL terminate the last cft */ | ||
380 | cft = &h->cgroup_files[4]; | ||
381 | memset(cft, 0, sizeof(*cft)); | ||
382 | |||
383 | WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); | ||
384 | |||
385 | return 0; | ||
386 | } | ||
387 | |||
388 | /* | ||
389 | * hugetlb_lock will make sure a parallel cgroup rmdir won't happen | ||
390 | * when we migrate hugepages | ||
391 | */ | ||
392 | void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) | ||
393 | { | ||
394 | struct hugetlb_cgroup *h_cg; | ||
395 | struct hstate *h = page_hstate(oldhpage); | ||
396 | |||
397 | if (hugetlb_cgroup_disabled()) | ||
398 | return; | ||
399 | |||
400 | VM_BUG_ON(!PageHuge(oldhpage)); | ||
401 | spin_lock(&hugetlb_lock); | ||
402 | h_cg = hugetlb_cgroup_from_page(oldhpage); | ||
403 | set_hugetlb_cgroup(oldhpage, NULL); | ||
404 | |||
405 | /* move the h_cg details to new cgroup */ | ||
406 | set_hugetlb_cgroup(newhpage, h_cg); | ||
407 | list_move(&newhpage->lru, &h->hugepage_activelist); | ||
408 | spin_unlock(&hugetlb_lock); | ||
409 | return; | ||
410 | } | ||
411 | |||
412 | struct cgroup_subsys hugetlb_subsys = { | ||
413 | .name = "hugetlb", | ||
414 | .create = hugetlb_cgroup_create, | ||
415 | .pre_destroy = hugetlb_cgroup_pre_destroy, | ||
416 | .destroy = hugetlb_cgroup_destroy, | ||
417 | .subsys_id = hugetlb_subsys_id, | ||
418 | }; | ||
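Each per-hstate control file created by hugetlb_cgroup_file_init() packs two values into cft->private: the hstate index in the upper 16 bits and the resource attribute in the lower 16 bits, which hugetlb_cgroup_read(), _write() and _reset() later unpack with MEMFILE_IDX()/MEMFILE_ATTR(). A standalone sketch of that encoding, using placeholder RES_* values rather than the kernel's enum:

/*
 * Illustrative sketch of the MEMFILE_PRIVATE()/MEMFILE_IDX()/MEMFILE_ATTR()
 * packing used by the hugetlb cgroup control files above.
 */
#include <assert.h>
#include <stdio.h>

#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_IDX(val)        (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)       ((val) & 0xffff)

/* placeholder attribute values, not the kernel's RES_* constants */
enum { RES_USAGE, RES_LIMIT, RES_MAX_USAGE, RES_FAILCNT };

int main(void)
{
        /* e.g. hstate index 1, failcnt attribute */
        int priv = MEMFILE_PRIVATE(1, RES_FAILCNT);

        assert(MEMFILE_IDX(priv) == 1);
        assert(MEMFILE_ATTR(priv) == RES_FAILCNT);
        printf("private=%#x idx=%d attr=%d\n", priv,
               MEMFILE_IDX(priv), MEMFILE_ATTR(priv));
        return 0;
}

With mem_fmt() supplying the size string, the resulting control files end up with names along the lines of hugetlb.2MB.limit_in_bytes or hugetlb.1GB.failcnt, one set per supported huge page size.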
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index cc448bb983ba..3a61efc518d5 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c | |||
@@ -123,7 +123,7 @@ static int pfn_inject_init(void) | |||
123 | if (!dentry) | 123 | if (!dentry) |
124 | goto fail; | 124 | goto fail; |
125 | 125 | ||
126 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 126 | #ifdef CONFIG_MEMCG_SWAP |
127 | dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, | 127 | dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, |
128 | hwpoison_dir, &hwpoison_filter_memcg); | 128 | hwpoison_dir, &hwpoison_filter_memcg); |
129 | if (!dentry) | 129 | if (!dentry) |

diff --git a/mm/internal.h b/mm/internal.h index 2ba87fbfb75b..3314f79d775a 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
@@ -118,8 +118,14 @@ struct compact_control { | |||
118 | unsigned long nr_freepages; /* Number of isolated free pages */ | 118 | unsigned long nr_freepages; /* Number of isolated free pages */ |
119 | unsigned long nr_migratepages; /* Number of pages to migrate */ | 119 | unsigned long nr_migratepages; /* Number of pages to migrate */ |
120 | unsigned long free_pfn; /* isolate_freepages search base */ | 120 | unsigned long free_pfn; /* isolate_freepages search base */ |
121 | unsigned long start_free_pfn; /* where we started the search */ | ||
121 | unsigned long migrate_pfn; /* isolate_migratepages search base */ | 122 | unsigned long migrate_pfn; /* isolate_migratepages search base */ |
122 | bool sync; /* Synchronous migration */ | 123 | bool sync; /* Synchronous migration */ |
124 | bool wrapped; /* Order > 0 compactions are | ||
125 | incremental, once free_pfn | ||
126 | and migrate_pfn meet, we restart | ||
127 | from the top of the zone; | ||
128 | remember we wrapped around. */ | ||
123 | 129 | ||
124 | int order; /* order a direct compactor needs */ | 130 | int order; /* order a direct compactor needs */ |
125 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ | 131 | int migratetype; /* MOVABLE, RECLAIMABLE etc */ |
@@ -347,3 +353,5 @@ extern u32 hwpoison_filter_enable; | |||
347 | extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, | 353 | extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, |
348 | unsigned long, unsigned long, | 354 | unsigned long, unsigned long, |
349 | unsigned long, unsigned long); | 355 | unsigned long, unsigned long); |
356 | |||
357 | extern void set_pageblock_order(void); | ||
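The new start_free_pfn and wrapped fields in struct compact_control describe an incremental free-page scan: the free scanner resumes from a cached position, and once it meets the migration scanner it restarts from the top of the zone, remembering that it wrapped so the pass can end when it gets back to where it started. A simplified userspace model of that wrap-around bookkeeping follows; the stride, the fixed migration scanner and the helper name are invented for the example and are not the kernel implementation.

/*
 * Simplified model of an incremental, wrapping free-page scan.
 */
#include <stdbool.h>
#include <stdio.h>

struct scan_state {
        unsigned long zone_start, zone_end;  /* zone boundaries in pfns */
        unsigned long migrate_pfn;           /* migration scanner, moves up */
        unsigned long free_pfn;              /* free scanner, moves down */
        unsigned long start_free_pfn;        /* where this pass began */
        bool wrapped;
};

/* One step of the free scanner; returns true while the pass should continue. */
static bool free_scan_step(struct scan_state *s, unsigned long stride)
{
        if (s->free_pfn <= s->migrate_pfn + stride) {
                if (s->wrapped)
                        return false;            /* scanners met again: done */
                s->free_pfn = s->zone_end;       /* restart from top of zone */
                s->wrapped = true;
                return true;
        }
        s->free_pfn -= stride;
        /* after wrapping, stop once we pass the original starting point */
        if (s->wrapped && s->free_pfn <= s->start_free_pfn)
                return false;
        return true;
}

int main(void)
{
        struct scan_state s = {
                .zone_start = 0, .zone_end = 1024,
                .migrate_pfn = 0, .free_pfn = 512, .start_free_pfn = 512,
        };
        int steps = 0;

        while (free_scan_step(&s, 32))
                steps++;
        printf("scanned %d chunks, wrapped=%d, stopped at pfn %lu\n",
               steps, s.wrapped, s.free_pfn);
        return 0;
}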
diff --git a/mm/madvise.c b/mm/madvise.c index deff1b64a08c..14d260fa0d17 100644 --- a/mm/madvise.c +++ b/mm/madvise.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
16 | #include <linux/ksm.h> | 16 | #include <linux/ksm.h> |
17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
18 | #include <linux/file.h> | ||
18 | 19 | ||
19 | /* | 20 | /* |
20 | * Any behaviour which results in changes to the vma->vm_flags needs to | 21 | * Any behaviour which results in changes to the vma->vm_flags needs to |
@@ -204,14 +205,16 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
204 | { | 205 | { |
205 | loff_t offset; | 206 | loff_t offset; |
206 | int error; | 207 | int error; |
208 | struct file *f; | ||
207 | 209 | ||
208 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ | 210 | *prev = NULL; /* tell sys_madvise we drop mmap_sem */ |
209 | 211 | ||
210 | if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) | 212 | if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) |
211 | return -EINVAL; | 213 | return -EINVAL; |
212 | 214 | ||
213 | if (!vma->vm_file || !vma->vm_file->f_mapping | 215 | f = vma->vm_file; |
214 | || !vma->vm_file->f_mapping->host) { | 216 | |
217 | if (!f || !f->f_mapping || !f->f_mapping->host) { | ||
215 | return -EINVAL; | 218 | return -EINVAL; |
216 | } | 219 | } |
217 | 220 | ||
@@ -221,11 +224,18 @@ static long madvise_remove(struct vm_area_struct *vma, | |||
221 | offset = (loff_t)(start - vma->vm_start) | 224 | offset = (loff_t)(start - vma->vm_start) |
222 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); | 225 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); |
223 | 226 | ||
224 | /* filesystem's fallocate may need to take i_mutex */ | 227 | /* |
228 | * Filesystem's fallocate may need to take i_mutex. We need to | ||
229 | * explicitly grab a reference because the vma (and hence the | ||
230 | * vma's reference to the file) can go away as soon as we drop | ||
231 | * mmap_sem. | ||
232 | */ | ||
233 | get_file(f); | ||
225 | up_read(¤t->mm->mmap_sem); | 234 | up_read(¤t->mm->mmap_sem); |
226 | error = do_fallocate(vma->vm_file, | 235 | error = do_fallocate(f, |
227 | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, | 236 | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, |
228 | offset, end - start); | 237 | offset, end - start); |
238 | fput(f); | ||
229 | down_read(¤t->mm->mmap_sem); | 239 | down_read(¤t->mm->mmap_sem); |
230 | return error; | 240 | return error; |
231 | } | 241 | } |
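The madvise_remove() change takes its own reference on the file with get_file() before up_read(&mm->mmap_sem), because once mmap_sem is dropped the vma, and with it the vma's reference to the file, can go away. The sketch below shows the same pin-then-unlock pattern in plain userspace C; the refcounted object, lock and helpers are invented stand-ins for struct file, mmap_sem and get_file()/fput().

/*
 * Illustrative sketch: take a private reference on an object before
 * dropping the lock that keeps it alive, so it cannot vanish while the
 * lock is released for a slow operation.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct object {
        atomic_int refcount;
        const char *name;
};

static struct object *obj_get(struct object *o)
{
        atomic_fetch_add(&o->refcount, 1);
        return o;
}

static void obj_put(struct object *o)
{
        if (atomic_fetch_sub(&o->refcount, 1) == 1) {
                printf("freeing %s\n", o->name);
                free(o);
        }
}

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;

static void slow_operation(struct object *o)
{
        /* stands in for do_fallocate(): may sleep, must not hold map_lock */
        printf("operating on %s\n", o->name);
}

int main(void)
{
        struct object *o = malloc(sizeof(*o));

        atomic_init(&o->refcount, 1);
        o->name = "file";

        pthread_rwlock_rdlock(&map_lock);
        struct object *ref = obj_get(o);    /* like get_file(vma->vm_file) */
        pthread_rwlock_unlock(&map_lock);   /* like up_read(&mm->mmap_sem) */

        slow_operation(ref);                /* object pinned by our reference */

        obj_put(ref);                       /* like fput(f) */
        pthread_rwlock_rdlock(&map_lock);   /* like down_read(&mm->mmap_sem) */
        pthread_rwlock_unlock(&map_lock);
        obj_put(o);                         /* drop the original reference */
        return 0;
}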
diff --git a/mm/memblock.c b/mm/memblock.c index 952123eba433..4d9393c7edc9 100644 --- a/mm/memblock.c +++ b/mm/memblock.c | |||
@@ -143,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, | |||
143 | MAX_NUMNODES); | 143 | MAX_NUMNODES); |
144 | } | 144 | } |
145 | 145 | ||
146 | /* | ||
147 | * Free memblock.reserved.regions | ||
148 | */ | ||
149 | int __init_memblock memblock_free_reserved_regions(void) | ||
150 | { | ||
151 | if (memblock.reserved.regions == memblock_reserved_init_regions) | ||
152 | return 0; | ||
153 | |||
154 | return memblock_free(__pa(memblock.reserved.regions), | ||
155 | sizeof(struct memblock_region) * memblock.reserved.max); | ||
156 | } | ||
157 | |||
158 | /* | ||
159 | * Reserve memblock.reserved.regions | ||
160 | */ | ||
161 | int __init_memblock memblock_reserve_reserved_regions(void) | ||
162 | { | ||
163 | if (memblock.reserved.regions == memblock_reserved_init_regions) | ||
164 | return 0; | ||
165 | |||
166 | return memblock_reserve(__pa(memblock.reserved.regions), | ||
167 | sizeof(struct memblock_region) * memblock.reserved.max); | ||
168 | } | ||
169 | |||
170 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) | 146 | static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) |
171 | { | 147 | { |
172 | type->total_size -= type->regions[r].size; | 148 | type->total_size -= type->regions[r].size; |
@@ -184,9 +160,39 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u | |||
184 | } | 160 | } |
185 | } | 161 | } |
186 | 162 | ||
187 | static int __init_memblock memblock_double_array(struct memblock_type *type) | 163 | phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( |
164 | phys_addr_t *addr) | ||
165 | { | ||
166 | if (memblock.reserved.regions == memblock_reserved_init_regions) | ||
167 | return 0; | ||
168 | |||
169 | *addr = __pa(memblock.reserved.regions); | ||
170 | |||
171 | return PAGE_ALIGN(sizeof(struct memblock_region) * | ||
172 | memblock.reserved.max); | ||
173 | } | ||
174 | |||
175 | /** | ||
176 | * memblock_double_array - double the size of the memblock regions array | ||
177 | * @type: memblock type of the regions array being doubled | ||
178 | * @new_area_start: starting address of memory range to avoid overlap with | ||
179 | * @new_area_size: size of memory range to avoid overlap with | ||
180 | * | ||
181 | * Double the size of the @type regions array. If memblock is being used to | ||
182 | * allocate memory for a new reserved regions array and there is a previously | ||
183 | * allocated memory range [@new_area_start,@new_area_start+@new_area_size] | ||
184 | * waiting to be reserved, ensure the memory used by the new array does | ||
185 | * not overlap. | ||
186 | * | ||
187 | * RETURNS: | ||
188 | * 0 on success, -1 on failure. | ||
189 | */ | ||
190 | static int __init_memblock memblock_double_array(struct memblock_type *type, | ||
191 | phys_addr_t new_area_start, | ||
192 | phys_addr_t new_area_size) | ||
188 | { | 193 | { |
189 | struct memblock_region *new_array, *old_array; | 194 | struct memblock_region *new_array, *old_array; |
195 | phys_addr_t old_alloc_size, new_alloc_size; | ||
190 | phys_addr_t old_size, new_size, addr; | 196 | phys_addr_t old_size, new_size, addr; |
191 | int use_slab = slab_is_available(); | 197 | int use_slab = slab_is_available(); |
192 | int *in_slab; | 198 | int *in_slab; |
@@ -200,6 +206,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
200 | /* Calculate new doubled size */ | 206 | /* Calculate new doubled size */ |
201 | old_size = type->max * sizeof(struct memblock_region); | 207 | old_size = type->max * sizeof(struct memblock_region); |
202 | new_size = old_size << 1; | 208 | new_size = old_size << 1; |
209 | /* | ||
210 | * We need to allocate the new array aligned to PAGE_SIZE, | ||
211 | * so that we can free it completely later. | ||
212 | */ | ||
213 | old_alloc_size = PAGE_ALIGN(old_size); | ||
214 | new_alloc_size = PAGE_ALIGN(new_size); | ||
203 | 215 | ||
204 | /* Retrieve the slab flag */ | 216 | /* Retrieve the slab flag */ |
205 | if (type == &memblock.memory) | 217 | if (type == &memblock.memory) |
@@ -210,19 +222,30 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
210 | /* Try to find some space for it. | 222 | /* Try to find some space for it. |
211 | * | 223 | * |
212 | * WARNING: We assume that either slab_is_available() and we use it or | 224 | * WARNING: We assume that either slab_is_available() and we use it or |
213 | * we use MEMBLOCK for allocations. That means that this is unsafe to use | 225 | * we use MEMBLOCK for allocations. That means that this is unsafe to |
214 | * when bootmem is currently active (unless bootmem itself is implemented | 226 | * use when bootmem is currently active (unless bootmem itself is |
215 | * on top of MEMBLOCK which isn't the case yet) | 227 | * implemented on top of MEMBLOCK which isn't the case yet) |
216 | * | 228 | * |
217 | * This should however not be an issue for now, as we currently only | 229 | * This should however not be an issue for now, as we currently only |
218 | * call into MEMBLOCK while it's still active, or much later when slab is | 230 | * call into MEMBLOCK while it's still active, or much later when slab |
219 | * active for memory hotplug operations | 231 | * is active for memory hotplug operations |
220 | */ | 232 | */ |
221 | if (use_slab) { | 233 | if (use_slab) { |
222 | new_array = kmalloc(new_size, GFP_KERNEL); | 234 | new_array = kmalloc(new_size, GFP_KERNEL); |
223 | addr = new_array ? __pa(new_array) : 0; | 235 | addr = new_array ? __pa(new_array) : 0; |
224 | } else { | 236 | } else { |
225 | addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); | 237 | /* only exclude range when trying to double reserved.regions */ |
238 | if (type != &memblock.reserved) | ||
239 | new_area_start = new_area_size = 0; | ||
240 | |||
241 | addr = memblock_find_in_range(new_area_start + new_area_size, | ||
242 | memblock.current_limit, | ||
243 | new_alloc_size, PAGE_SIZE); | ||
244 | if (!addr && new_area_size) | ||
245 | addr = memblock_find_in_range(0, | ||
246 | min(new_area_start, memblock.current_limit), | ||
247 | new_alloc_size, PAGE_SIZE); | ||
248 | |||
226 | new_array = addr ? __va(addr) : 0; | 249 | new_array = addr ? __va(addr) : 0; |
227 | } | 250 | } |
228 | if (!addr) { | 251 | if (!addr) { |
@@ -231,12 +254,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
231 | return -1; | 254 | return -1; |
232 | } | 255 | } |
233 | 256 | ||
234 | memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", | 257 | memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]", |
235 | memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); | 258 | memblock_type_name(type), type->max * 2, (u64)addr, |
259 | (u64)addr + new_size - 1); | ||
236 | 260 | ||
237 | /* Found space, we now need to move the array over before | 261 | /* |
238 | * we add the reserved region since it may be our reserved | 262 | * Found space, we now need to move the array over before we add the |
239 | * array itself that is full. | 263 | * reserved region since it may be our reserved array itself that is |
264 | * full. | ||
240 | */ | 265 | */ |
241 | memcpy(new_array, type->regions, old_size); | 266 | memcpy(new_array, type->regions, old_size); |
242 | memset(new_array + type->max, 0, old_size); | 267 | memset(new_array + type->max, 0, old_size); |
@@ -244,20 +269,19 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) | |||
244 | type->regions = new_array; | 269 | type->regions = new_array; |
245 | type->max <<= 1; | 270 | type->max <<= 1; |
246 | 271 | ||
247 | /* Free old array. We needn't free it if the array is the | 272 | /* Free old array. We needn't free it if the array is the static one */ |
248 | * static one | ||
249 | */ | ||
250 | if (*in_slab) | 273 | if (*in_slab) |
251 | kfree(old_array); | 274 | kfree(old_array); |
252 | else if (old_array != memblock_memory_init_regions && | 275 | else if (old_array != memblock_memory_init_regions && |
253 | old_array != memblock_reserved_init_regions) | 276 | old_array != memblock_reserved_init_regions) |
254 | memblock_free(__pa(old_array), old_size); | 277 | memblock_free(__pa(old_array), old_alloc_size); |
255 | 278 | ||
256 | /* Reserve the new array if that comes from the memblock. | 279 | /* |
257 | * Otherwise, we needn't do it | 280 | * Reserve the new array if that comes from the memblock. Otherwise, we |
281 | * needn't do it | ||
258 | */ | 282 | */ |
259 | if (!use_slab) | 283 | if (!use_slab) |
260 | BUG_ON(memblock_reserve(addr, new_size)); | 284 | BUG_ON(memblock_reserve(addr, new_alloc_size)); |
261 | 285 | ||
262 | /* Update slab flag */ | 286 | /* Update slab flag */ |
263 | *in_slab = use_slab; | 287 | *in_slab = use_slab; |
@@ -399,7 +423,7 @@ repeat: | |||
399 | */ | 423 | */ |
400 | if (!insert) { | 424 | if (!insert) { |
401 | while (type->cnt + nr_new > type->max) | 425 | while (type->cnt + nr_new > type->max) |
402 | if (memblock_double_array(type) < 0) | 426 | if (memblock_double_array(type, obase, size) < 0) |
403 | return -ENOMEM; | 427 | return -ENOMEM; |
404 | insert = true; | 428 | insert = true; |
405 | goto repeat; | 429 | goto repeat; |
@@ -450,7 +474,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, | |||
450 | 474 | ||
451 | /* we'll create at most two more regions */ | 475 | /* we'll create at most two more regions */ |
452 | while (type->cnt + 2 > type->max) | 476 | while (type->cnt + 2 > type->max) |
453 | if (memblock_double_array(type) < 0) | 477 | if (memblock_double_array(type, base, size) < 0) |
454 | return -ENOMEM; | 478 | return -ENOMEM; |
455 | 479 | ||
456 | for (i = 0; i < type->cnt; i++) { | 480 | for (i = 0; i < type->cnt; i++) { |
@@ -540,9 +564,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) | |||
540 | * __next_free_mem_range - next function for for_each_free_mem_range() | 564 | * __next_free_mem_range - next function for for_each_free_mem_range() |
541 | * @idx: pointer to u64 loop variable | 565 | * @idx: pointer to u64 loop variable |
542 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes | 566 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes |
543 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL | 567 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
544 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL | 568 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
545 | * @p_nid: ptr to int for nid of the range, can be %NULL | 569 | * @out_nid: ptr to int for nid of the range, can be %NULL |
546 | * | 570 | * |
547 | * Find the first free area from *@idx which matches @nid, fill the out | 571 | * Find the first free area from *@idx which matches @nid, fill the out |
548 | * parameters, and update *@idx for the next iteration. The lower 32bit of | 572 | * parameters, and update *@idx for the next iteration. The lower 32bit of |
@@ -616,9 +640,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid, | |||
616 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() | 640 | * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() |
617 | * @idx: pointer to u64 loop variable | 641 | * @idx: pointer to u64 loop variable |
618 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes | 642 | * @nid: nid: node selector, %MAX_NUMNODES for all nodes |
619 | * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL | 643 | * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL |
620 | * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL | 644 | * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL |
621 | * @p_nid: ptr to int for nid of the range, can be %NULL | 645 | * @out_nid: ptr to int for nid of the range, can be %NULL |
622 | * | 646 | * |
623 | * Reverse of __next_free_mem_range(). | 647 | * Reverse of __next_free_mem_range(). |
624 | */ | 648 | */ |
@@ -867,6 +891,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) | |||
867 | return memblock_search(&memblock.memory, addr) != -1; | 891 | return memblock_search(&memblock.memory, addr) != -1; |
868 | } | 892 | } |
869 | 893 | ||
894 | /** | ||
895 | * memblock_is_region_memory - check if a region is a subset of memory | ||
896 | * @base: base of region to check | ||
897 | * @size: size of region to check | ||
898 | * | ||
899 | * Check if the region [@base, @base+@size) is a subset of a memory block. | ||
900 | * | ||
901 | * RETURNS: | ||
902 | * 0 if false, non-zero if true | ||
903 | */ | ||
870 | int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) | 904 | int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) |
871 | { | 905 | { |
872 | int idx = memblock_search(&memblock.memory, base); | 906 | int idx = memblock_search(&memblock.memory, base); |
@@ -879,6 +913,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size | |||
879 | memblock.memory.regions[idx].size) >= end; | 913 | memblock.memory.regions[idx].size) >= end; |
880 | } | 914 | } |
881 | 915 | ||
916 | /** | ||
917 | * memblock_is_region_reserved - check if a region intersects reserved memory | ||
918 | * @base: base of region to check | ||
919 | * @size: size of region to check | ||
920 | * | ||
921 | * Check if the region [@base, @base+@size) intersects a reserved memory block. | ||
922 | * | ||
923 | * RETURNS: | ||
924 | * 0 if false, non-zero if true | ||
925 | */ | ||
882 | int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) | 926 | int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) |
883 | { | 927 | { |
884 | memblock_cap_size(base, &size); | 928 | memblock_cap_size(base, &size); |
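memblock_double_array() now rounds both the old and the new array sizes up to PAGE_SIZE, and when growing memblock.reserved it searches for the new array first above the range that is about to be reserved and only then below it, so the two allocations cannot overlap. A toy model of that placement decision follows; find_range() is a fake bump search standing in for memblock_find_in_range(), and the addresses are made up.

/*
 * Toy model of placing a doubled regions array while avoiding a pending
 * reservation range.
 */
#include <stdio.h>

#define PAGE_SIZE     4096UL
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* Fake search: lowest page-aligned address in [start, end) that fits. */
static unsigned long find_range(unsigned long start, unsigned long end,
                                unsigned long size)
{
        unsigned long base = PAGE_ALIGN(start);

        return (base + size <= end) ? base : 0;
}

static unsigned long place_new_array(unsigned long limit,
                                     unsigned long new_area_start,
                                     unsigned long new_area_size,
                                     unsigned long new_size)
{
        unsigned long new_alloc_size = PAGE_ALIGN(new_size);
        unsigned long addr;

        /* first try above the range waiting to be reserved ... */
        addr = find_range(new_area_start + new_area_size, limit,
                          new_alloc_size);
        if (!addr && new_area_size)
                /* ... then fall back to anywhere below it */
                addr = find_range(0,
                                  new_area_start < limit ? new_area_start : limit,
                                  new_alloc_size);
        return addr;
}

int main(void)
{
        /* pending reservation at 1MB..1MB+64KB, array needs ~6000 bytes */
        unsigned long addr = place_new_array(1UL << 26, 1UL << 20,
                                             64 * 1024, 6000);

        printf("new array at %#lx (alloc size %lu)\n",
               addr, PAGE_ALIGN(6000UL));
        return 0;
}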
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ac35bccadb7b..795e525afaba 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -61,12 +61,12 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly; | |||
61 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 61 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
62 | static struct mem_cgroup *root_mem_cgroup __read_mostly; | 62 | static struct mem_cgroup *root_mem_cgroup __read_mostly; |
63 | 63 | ||
64 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 64 | #ifdef CONFIG_MEMCG_SWAP |
65 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | 65 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ |
66 | int do_swap_account __read_mostly; | 66 | int do_swap_account __read_mostly; |
67 | 67 | ||
68 | /* for remember boot option*/ | 68 | /* for remember boot option*/ |
69 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED | 69 | #ifdef CONFIG_MEMCG_SWAP_ENABLED |
70 | static int really_do_swap_account __initdata = 1; | 70 | static int really_do_swap_account __initdata = 1; |
71 | #else | 71 | #else |
72 | static int really_do_swap_account __initdata = 0; | 72 | static int really_do_swap_account __initdata = 0; |
@@ -87,7 +87,7 @@ enum mem_cgroup_stat_index { | |||
87 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 87 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
88 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | 88 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
89 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | 89 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
90 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 90 | MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ |
91 | MEM_CGROUP_STAT_NSTATS, | 91 | MEM_CGROUP_STAT_NSTATS, |
92 | }; | 92 | }; |
93 | 93 | ||
@@ -378,9 +378,7 @@ static bool move_file(void) | |||
378 | 378 | ||
379 | enum charge_type { | 379 | enum charge_type { |
380 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 380 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
381 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 381 | MEM_CGROUP_CHARGE_TYPE_ANON, |
382 | MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ | ||
383 | MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ | ||
384 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ | 382 | MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ |
385 | MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ | 383 | MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ |
386 | NR_CHARGE_TYPE, | 384 | NR_CHARGE_TYPE, |
@@ -407,8 +405,14 @@ enum charge_type { | |||
407 | static void mem_cgroup_get(struct mem_cgroup *memcg); | 405 | static void mem_cgroup_get(struct mem_cgroup *memcg); |
408 | static void mem_cgroup_put(struct mem_cgroup *memcg); | 406 | static void mem_cgroup_put(struct mem_cgroup *memcg); |
409 | 407 | ||
408 | static inline | ||
409 | struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) | ||
410 | { | ||
411 | return container_of(s, struct mem_cgroup, css); | ||
412 | } | ||
413 | |||
410 | /* Writing them here to avoid exposing memcg's inner layout */ | 414 | /* Writing them here to avoid exposing memcg's inner layout */ |
411 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | 415 | #ifdef CONFIG_MEMCG_KMEM |
412 | #include <net/sock.h> | 416 | #include <net/sock.h> |
413 | #include <net/ip.h> | 417 | #include <net/ip.h> |
414 | 418 | ||
@@ -467,9 +471,9 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) | |||
467 | } | 471 | } |
468 | EXPORT_SYMBOL(tcp_proto_cgroup); | 472 | EXPORT_SYMBOL(tcp_proto_cgroup); |
469 | #endif /* CONFIG_INET */ | 473 | #endif /* CONFIG_INET */ |
470 | #endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ | 474 | #endif /* CONFIG_MEMCG_KMEM */ |
471 | 475 | ||
472 | #if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) | 476 | #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) |
473 | static void disarm_sock_keys(struct mem_cgroup *memcg) | 477 | static void disarm_sock_keys(struct mem_cgroup *memcg) |
474 | { | 478 | { |
475 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) | 479 | if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) |
@@ -703,7 +707,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, | |||
703 | bool charge) | 707 | bool charge) |
704 | { | 708 | { |
705 | int val = (charge) ? 1 : -1; | 709 | int val = (charge) ? 1 : -1; |
706 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); | 710 | this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); |
707 | } | 711 | } |
708 | 712 | ||
709 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | 713 | static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, |
@@ -864,9 +868,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) | |||
864 | 868 | ||
865 | struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 869 | struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
866 | { | 870 | { |
867 | return container_of(cgroup_subsys_state(cont, | 871 | return mem_cgroup_from_css( |
868 | mem_cgroup_subsys_id), struct mem_cgroup, | 872 | cgroup_subsys_state(cont, mem_cgroup_subsys_id)); |
869 | css); | ||
870 | } | 873 | } |
871 | 874 | ||
872 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | 875 | struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) |
@@ -879,8 +882,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) | |||
879 | if (unlikely(!p)) | 882 | if (unlikely(!p)) |
880 | return NULL; | 883 | return NULL; |
881 | 884 | ||
882 | return container_of(task_subsys_state(p, mem_cgroup_subsys_id), | 885 | return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id)); |
883 | struct mem_cgroup, css); | ||
884 | } | 886 | } |
885 | 887 | ||
886 | struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | 888 | struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) |
@@ -966,8 +968,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, | |||
966 | css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); | 968 | css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); |
967 | if (css) { | 969 | if (css) { |
968 | if (css == &root->css || css_tryget(css)) | 970 | if (css == &root->css || css_tryget(css)) |
969 | memcg = container_of(css, | 971 | memcg = mem_cgroup_from_css(css); |
970 | struct mem_cgroup, css); | ||
971 | } else | 972 | } else |
972 | id = 0; | 973 | id = 0; |
973 | rcu_read_unlock(); | 974 | rcu_read_unlock(); |
@@ -1148,7 +1149,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, | |||
1148 | { | 1149 | { |
1149 | if (root_memcg == memcg) | 1150 | if (root_memcg == memcg) |
1150 | return true; | 1151 | return true; |
1151 | if (!root_memcg->use_hierarchy) | 1152 | if (!root_memcg->use_hierarchy || !memcg) |
1152 | return false; | 1153 | return false; |
1153 | return css_is_ancestor(&memcg->css, &root_memcg->css); | 1154 | return css_is_ancestor(&memcg->css, &root_memcg->css); |
1154 | } | 1155 | } |
@@ -1234,7 +1235,7 @@ int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec) | |||
1234 | 1235 | ||
1235 | /** | 1236 | /** |
1236 | * mem_cgroup_margin - calculate chargeable space of a memory cgroup | 1237 | * mem_cgroup_margin - calculate chargeable space of a memory cgroup |
1237 | * @mem: the memory cgroup | 1238 | * @memcg: the memory cgroup |
1238 | * | 1239 | * |
1239 | * Returns the maximum amount of memory @mem can be charged with, in | 1240 | * Returns the maximum amount of memory @mem can be charged with, in |
1240 | * pages. | 1241 | * pages. |
@@ -1454,7 +1455,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) | |||
1454 | /* | 1455 | /* |
1455 | * Return the memory (and swap, if configured) limit for a memcg. | 1456 | * Return the memory (and swap, if configured) limit for a memcg. |
1456 | */ | 1457 | */ |
1457 | u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | 1458 | static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) |
1458 | { | 1459 | { |
1459 | u64 limit; | 1460 | u64 limit; |
1460 | u64 memsw; | 1461 | u64 memsw; |
@@ -1470,6 +1471,73 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | |||
1470 | return min(limit, memsw); | 1471 | return min(limit, memsw); |
1471 | } | 1472 | } |
1472 | 1473 | ||
1474 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | ||
1475 | int order) | ||
1476 | { | ||
1477 | struct mem_cgroup *iter; | ||
1478 | unsigned long chosen_points = 0; | ||
1479 | unsigned long totalpages; | ||
1480 | unsigned int points = 0; | ||
1481 | struct task_struct *chosen = NULL; | ||
1482 | |||
1483 | /* | ||
1484 | * If current has a pending SIGKILL, then automatically select it. The | ||
1485 | * goal is to allow it to allocate so that it may quickly exit and free | ||
1486 | * its memory. | ||
1487 | */ | ||
1488 | if (fatal_signal_pending(current)) { | ||
1489 | set_thread_flag(TIF_MEMDIE); | ||
1490 | return; | ||
1491 | } | ||
1492 | |||
1493 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); | ||
1494 | totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; | ||
1495 | for_each_mem_cgroup_tree(iter, memcg) { | ||
1496 | struct cgroup *cgroup = iter->css.cgroup; | ||
1497 | struct cgroup_iter it; | ||
1498 | struct task_struct *task; | ||
1499 | |||
1500 | cgroup_iter_start(cgroup, &it); | ||
1501 | while ((task = cgroup_iter_next(cgroup, &it))) { | ||
1502 | switch (oom_scan_process_thread(task, totalpages, NULL, | ||
1503 | false)) { | ||
1504 | case OOM_SCAN_SELECT: | ||
1505 | if (chosen) | ||
1506 | put_task_struct(chosen); | ||
1507 | chosen = task; | ||
1508 | chosen_points = ULONG_MAX; | ||
1509 | get_task_struct(chosen); | ||
1510 | /* fall through */ | ||
1511 | case OOM_SCAN_CONTINUE: | ||
1512 | continue; | ||
1513 | case OOM_SCAN_ABORT: | ||
1514 | cgroup_iter_end(cgroup, &it); | ||
1515 | mem_cgroup_iter_break(memcg, iter); | ||
1516 | if (chosen) | ||
1517 | put_task_struct(chosen); | ||
1518 | return; | ||
1519 | case OOM_SCAN_OK: | ||
1520 | break; | ||
1521 | }; | ||
1522 | points = oom_badness(task, memcg, NULL, totalpages); | ||
1523 | if (points > chosen_points) { | ||
1524 | if (chosen) | ||
1525 | put_task_struct(chosen); | ||
1526 | chosen = task; | ||
1527 | chosen_points = points; | ||
1528 | get_task_struct(chosen); | ||
1529 | } | ||
1530 | } | ||
1531 | cgroup_iter_end(cgroup, &it); | ||
1532 | } | ||
1533 | |||
1534 | if (!chosen) | ||
1535 | return; | ||
1536 | points = chosen_points * 1000 / totalpages; | ||
1537 | oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, | ||
1538 | NULL, "Memory cgroup out of memory"); | ||
1539 | } | ||
1540 | |||
1473 | static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, | 1541 | static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, |
1474 | gfp_t gfp_mask, | 1542 | gfp_t gfp_mask, |
1475 | unsigned long flags) | 1543 | unsigned long flags) |
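The mem_cgroup_out_of_memory() hunk above selects an OOM victim by walking every task in the memcg hierarchy, letting oom_scan_process_thread() force-select a task or abort the scan, and otherwise keeping the task with the highest oom_badness() score. A compact userspace sketch of that selection loop, with made-up scores and decisions in place of the real helpers:

/*
 * Illustrative sketch of a scan-and-keep-maximum victim selection loop.
 */
#include <stdio.h>

enum scan_result { SCAN_OK, SCAN_SELECT, SCAN_CONTINUE, SCAN_ABORT };

struct candidate {
        const char *name;
        unsigned long badness;       /* stands in for oom_badness() */
        enum scan_result decision;   /* stands in for oom_scan_process_thread() */
};

static const struct candidate *pick_victim(const struct candidate *c, int n)
{
        const struct candidate *chosen = NULL;
        unsigned long chosen_points = 0;

        for (int i = 0; i < n; i++) {
                switch (c[i].decision) {
                case SCAN_SELECT:
                        chosen = &c[i];                  /* e.g. already dying */
                        chosen_points = (unsigned long)-1;
                        continue;
                case SCAN_CONTINUE:
                        continue;                        /* unkillable, skip */
                case SCAN_ABORT:
                        return NULL;                     /* someone is exiting */
                case SCAN_OK:
                        break;
                }
                if (c[i].badness > chosen_points) {
                        chosen_points = c[i].badness;
                        chosen = &c[i];
                }
        }
        return chosen;
}

int main(void)
{
        struct candidate tasks[] = {
                { "init-like", 10,  SCAN_CONTINUE },
                { "small",     120, SCAN_OK },
                { "big",       900, SCAN_OK },
        };
        const struct candidate *v = pick_victim(tasks, 3);

        printf("victim: %s\n", v ? v->name : "(none)");
        return 0;
}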
@@ -1508,7 +1576,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, | |||
1508 | 1576 | ||
1509 | /** | 1577 | /** |
1510 | * test_mem_cgroup_node_reclaimable | 1578 | * test_mem_cgroup_node_reclaimable |
1511 | * @mem: the target memcg | 1579 | * @memcg: the target memcg |
1512 | * @nid: the node ID to be checked. | 1580 | * @nid: the node ID to be checked. |
1513 | * @noswap : specify true here if the user wants flle only information. | 1581 | * @noswap : specify true here if the user wants flle only information. |
1514 | * | 1582 | * |
@@ -1899,7 +1967,7 @@ again: | |||
1899 | return; | 1967 | return; |
1900 | /* | 1968 | /* |
1901 | * If this memory cgroup is not under account moving, we don't | 1969 | * If this memory cgroup is not under account moving, we don't |
1902 | * need to take move_lock_page_cgroup(). Because we already hold | 1970 | * need to take move_lock_mem_cgroup(). Because we already hold |
1903 | * rcu_read_lock(), any calls to move_account will be delayed until | 1971 | * rcu_read_lock(), any calls to move_account will be delayed until |
1904 | * rcu_read_unlock() if mem_cgroup_stolen() == true. | 1972 | * rcu_read_unlock() if mem_cgroup_stolen() == true. |
1905 | */ | 1973 | */ |
@@ -1921,7 +1989,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) | |||
1921 | /* | 1989 | /* |
1922 | * It's guaranteed that pc->mem_cgroup never changes while | 1990 | * It's guaranteed that pc->mem_cgroup never changes while |
1923 | * lock is held because a routine modifies pc->mem_cgroup | 1991 | * lock is held because a routine modifies pc->mem_cgroup |
1924 | * should take move_lock_page_cgroup(). | 1992 | * should take move_lock_mem_cgroup(). |
1925 | */ | 1993 | */ |
1926 | move_unlock_mem_cgroup(pc->mem_cgroup, flags); | 1994 | move_unlock_mem_cgroup(pc->mem_cgroup, flags); |
1927 | } | 1995 | } |
@@ -2268,7 +2336,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
2268 | * We always charge the cgroup the mm_struct belongs to. | 2336 | * We always charge the cgroup the mm_struct belongs to. |
2269 | * The mm_struct's mem_cgroup changes on task migration if the | 2337 | * The mm_struct's mem_cgroup changes on task migration if the |
2270 | * thread group leader migrates. It's possible that mm is not | 2338 | * thread group leader migrates. It's possible that mm is not |
2271 | * set, if so charge the init_mm (happens for pagecache usage). | 2339 | * set, if so charge the root memcg (happens for pagecache usage). |
2272 | */ | 2340 | */ |
2273 | if (!*ptr && !mm) | 2341 | if (!*ptr && !mm) |
2274 | *ptr = root_mem_cgroup; | 2342 | *ptr = root_mem_cgroup; |
@@ -2429,7 +2497,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | |||
2429 | css = css_lookup(&mem_cgroup_subsys, id); | 2497 | css = css_lookup(&mem_cgroup_subsys, id); |
2430 | if (!css) | 2498 | if (!css) |
2431 | return NULL; | 2499 | return NULL; |
2432 | return container_of(css, struct mem_cgroup, css); | 2500 | return mem_cgroup_from_css(css); |
2433 | } | 2501 | } |
2434 | 2502 | ||
2435 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) | 2503 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
@@ -2473,11 +2541,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2473 | bool anon; | 2541 | bool anon; |
2474 | 2542 | ||
2475 | lock_page_cgroup(pc); | 2543 | lock_page_cgroup(pc); |
2476 | if (unlikely(PageCgroupUsed(pc))) { | 2544 | VM_BUG_ON(PageCgroupUsed(pc)); |
2477 | unlock_page_cgroup(pc); | ||
2478 | __mem_cgroup_cancel_charge(memcg, nr_pages); | ||
2479 | return; | ||
2480 | } | ||
2481 | /* | 2545 | /* |
2482 | * we don't need page_cgroup_lock about tail pages, because they are not | 2546 |
2483 | * accessed by any other context at this point. | 2547 | * accessed by any other context at this point. |
@@ -2519,7 +2583,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2519 | spin_unlock_irq(&zone->lru_lock); | 2583 | spin_unlock_irq(&zone->lru_lock); |
2520 | } | 2584 | } |
2521 | 2585 | ||
2522 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | 2586 | if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) |
2523 | anon = true; | 2587 | anon = true; |
2524 | else | 2588 | else |
2525 | anon = false; | 2589 | anon = false; |
@@ -2644,8 +2708,7 @@ out: | |||
2644 | 2708 | ||
2645 | static int mem_cgroup_move_parent(struct page *page, | 2709 | static int mem_cgroup_move_parent(struct page *page, |
2646 | struct page_cgroup *pc, | 2710 | struct page_cgroup *pc, |
2647 | struct mem_cgroup *child, | 2711 | struct mem_cgroup *child) |
2648 | gfp_t gfp_mask) | ||
2649 | { | 2712 | { |
2650 | struct mem_cgroup *parent; | 2713 | struct mem_cgroup *parent; |
2651 | unsigned int nr_pages; | 2714 | unsigned int nr_pages; |
@@ -2728,38 +2791,7 @@ int mem_cgroup_newpage_charge(struct page *page, | |||
2728 | VM_BUG_ON(page->mapping && !PageAnon(page)); | 2791 | VM_BUG_ON(page->mapping && !PageAnon(page)); |
2729 | VM_BUG_ON(!mm); | 2792 | VM_BUG_ON(!mm); |
2730 | return mem_cgroup_charge_common(page, mm, gfp_mask, | 2793 | return mem_cgroup_charge_common(page, mm, gfp_mask, |
2731 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2794 | MEM_CGROUP_CHARGE_TYPE_ANON); |
2732 | } | ||
2733 | |||
2734 | static void | ||
2735 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | ||
2736 | enum charge_type ctype); | ||
2737 | |||
2738 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | ||
2739 | gfp_t gfp_mask) | ||
2740 | { | ||
2741 | struct mem_cgroup *memcg = NULL; | ||
2742 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
2743 | int ret; | ||
2744 | |||
2745 | if (mem_cgroup_disabled()) | ||
2746 | return 0; | ||
2747 | if (PageCompound(page)) | ||
2748 | return 0; | ||
2749 | |||
2750 | if (unlikely(!mm)) | ||
2751 | mm = &init_mm; | ||
2752 | if (!page_is_file_cache(page)) | ||
2753 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
2754 | |||
2755 | if (!PageSwapCache(page)) | ||
2756 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); | ||
2757 | else { /* page is swapcache/shmem */ | ||
2758 | ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); | ||
2759 | if (!ret) | ||
2760 | __mem_cgroup_commit_charge_swapin(page, memcg, type); | ||
2761 | } | ||
2762 | return ret; | ||
2763 | } | 2795 | } |
2764 | 2796 | ||
2765 | /* | 2797 | /* |
@@ -2768,27 +2800,26 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
2768 | * struct page_cgroup is acquired. This refcnt will be consumed by | 2800 | * struct page_cgroup is acquired. This refcnt will be consumed by |
2769 | * "commit()" or removed by "cancel()" | 2801 | * "commit()" or removed by "cancel()" |
2770 | */ | 2802 | */ |
2771 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | 2803 | static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, |
2772 | struct page *page, | 2804 | struct page *page, |
2773 | gfp_t mask, struct mem_cgroup **memcgp) | 2805 | gfp_t mask, |
2806 | struct mem_cgroup **memcgp) | ||
2774 | { | 2807 | { |
2775 | struct mem_cgroup *memcg; | 2808 | struct mem_cgroup *memcg; |
2809 | struct page_cgroup *pc; | ||
2776 | int ret; | 2810 | int ret; |
2777 | 2811 | ||
2778 | *memcgp = NULL; | 2812 | pc = lookup_page_cgroup(page); |
2779 | |||
2780 | if (mem_cgroup_disabled()) | ||
2781 | return 0; | ||
2782 | |||
2783 | if (!do_swap_account) | ||
2784 | goto charge_cur_mm; | ||
2785 | /* | 2813 | /* |
2786 | * A racing thread's fault, or swapoff, may have already updated | 2814 | * Every swap fault against a single page tries to charge the |
2787 | * the pte, and even removed page from swap cache: in those cases | 2815 | * page, bail as early as possible. shmem_unuse() encounters |
2788 | * do_swap_page()'s pte_same() test will fail; but there's also a | 2816 | * already charged pages, too. The USED bit is protected by |
2789 | * KSM case which does need to charge the page. | 2817 | * the page lock, which serializes swap cache removal, which |
2818 | * in turn serializes uncharging. | ||
2790 | */ | 2819 | */ |
2791 | if (!PageSwapCache(page)) | 2820 | if (PageCgroupUsed(pc)) |
2821 | return 0; | ||
2822 | if (!do_swap_account) | ||
2792 | goto charge_cur_mm; | 2823 | goto charge_cur_mm; |
2793 | memcg = try_get_mem_cgroup_from_page(page); | 2824 | memcg = try_get_mem_cgroup_from_page(page); |
2794 | if (!memcg) | 2825 | if (!memcg) |
@@ -2800,14 +2831,44 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
2800 | ret = 0; | 2831 | ret = 0; |
2801 | return ret; | 2832 | return ret; |
2802 | charge_cur_mm: | 2833 | charge_cur_mm: |
2803 | if (unlikely(!mm)) | ||
2804 | mm = &init_mm; | ||
2805 | ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); | 2834 | ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); |
2806 | if (ret == -EINTR) | 2835 | if (ret == -EINTR) |
2807 | ret = 0; | 2836 | ret = 0; |
2808 | return ret; | 2837 | return ret; |
2809 | } | 2838 | } |
2810 | 2839 | ||
2840 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, | ||
2841 | gfp_t gfp_mask, struct mem_cgroup **memcgp) | ||
2842 | { | ||
2843 | *memcgp = NULL; | ||
2844 | if (mem_cgroup_disabled()) | ||
2845 | return 0; | ||
2846 | /* | ||
2847 | * A racing thread's fault, or swapoff, may have already | ||
2848 | * updated the pte, and even removed page from swap cache: in | ||
2849 | * those cases unuse_pte()'s pte_same() test will fail; but | ||
2850 | * there's also a KSM case which does need to charge the page. | ||
2851 | */ | ||
2852 | if (!PageSwapCache(page)) { | ||
2853 | int ret; | ||
2854 | |||
2855 | ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true); | ||
2856 | if (ret == -EINTR) | ||
2857 | ret = 0; | ||
2858 | return ret; | ||
2859 | } | ||
2860 | return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); | ||
2861 | } | ||
2862 | |||
2863 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) | ||
2864 | { | ||
2865 | if (mem_cgroup_disabled()) | ||
2866 | return; | ||
2867 | if (!memcg) | ||
2868 | return; | ||
2869 | __mem_cgroup_cancel_charge(memcg, 1); | ||
2870 | } | ||
2871 | |||
2811 | static void | 2872 | static void |
2812 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, | 2873 | __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, |
2813 | enum charge_type ctype) | 2874 | enum charge_type ctype) |
@@ -2842,16 +2903,30 @@ void mem_cgroup_commit_charge_swapin(struct page *page, | |||
2842 | struct mem_cgroup *memcg) | 2903 | struct mem_cgroup *memcg) |
2843 | { | 2904 | { |
2844 | __mem_cgroup_commit_charge_swapin(page, memcg, | 2905 | __mem_cgroup_commit_charge_swapin(page, memcg, |
2845 | MEM_CGROUP_CHARGE_TYPE_MAPPED); | 2906 | MEM_CGROUP_CHARGE_TYPE_ANON); |
2846 | } | 2907 | } |
2847 | 2908 | ||
2848 | void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) | 2909 | int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, |
2910 | gfp_t gfp_mask) | ||
2849 | { | 2911 | { |
2912 | struct mem_cgroup *memcg = NULL; | ||
2913 | enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
2914 | int ret; | ||
2915 | |||
2850 | if (mem_cgroup_disabled()) | 2916 | if (mem_cgroup_disabled()) |
2851 | return; | 2917 | return 0; |
2852 | if (!memcg) | 2918 | if (PageCompound(page)) |
2853 | return; | 2919 | return 0; |
2854 | __mem_cgroup_cancel_charge(memcg, 1); | 2920 | |
2921 | if (!PageSwapCache(page)) | ||
2922 | ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); | ||
2923 | else { /* page is swapcache/shmem */ | ||
2924 | ret = __mem_cgroup_try_charge_swapin(mm, page, | ||
2925 | gfp_mask, &memcg); | ||
2926 | if (!ret) | ||
2927 | __mem_cgroup_commit_charge_swapin(page, memcg, type); | ||
2928 | } | ||
2929 | return ret; | ||
2855 | } | 2930 | } |
2856 | 2931 | ||
2857 | static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, | 2932 | static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, |
@@ -2911,7 +2986,8 @@ direct_uncharge: | |||
2911 | * uncharge if !page_mapped(page) | 2986 | * uncharge if !page_mapped(page) |
2912 | */ | 2987 | */ |
2913 | static struct mem_cgroup * | 2988 | static struct mem_cgroup * |
2914 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | 2989 | __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, |
2990 | bool end_migration) | ||
2915 | { | 2991 | { |
2916 | struct mem_cgroup *memcg = NULL; | 2992 | struct mem_cgroup *memcg = NULL; |
2917 | unsigned int nr_pages = 1; | 2993 | unsigned int nr_pages = 1; |
@@ -2921,8 +2997,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2921 | if (mem_cgroup_disabled()) | 2997 | if (mem_cgroup_disabled()) |
2922 | return NULL; | 2998 | return NULL; |
2923 | 2999 | ||
2924 | if (PageSwapCache(page)) | 3000 | VM_BUG_ON(PageSwapCache(page)); |
2925 | return NULL; | ||
2926 | 3001 | ||
2927 | if (PageTransHuge(page)) { | 3002 | if (PageTransHuge(page)) { |
2928 | nr_pages <<= compound_order(page); | 3003 | nr_pages <<= compound_order(page); |
@@ -2945,7 +3020,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2945 | anon = PageAnon(page); | 3020 | anon = PageAnon(page); |
2946 | 3021 | ||
2947 | switch (ctype) { | 3022 | switch (ctype) { |
2948 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | 3023 | case MEM_CGROUP_CHARGE_TYPE_ANON: |
2949 | /* | 3024 | /* |
2950 | * Generally PageAnon tells if it's the anon statistics to be | 3025 | * Generally PageAnon tells if it's the anon statistics to be |
2951 | * updated; but sometimes e.g. mem_cgroup_uncharge_page() is | 3026 | * updated; but sometimes e.g. mem_cgroup_uncharge_page() is |
@@ -2955,7 +3030,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2955 | /* fallthrough */ | 3030 | /* fallthrough */ |
2956 | case MEM_CGROUP_CHARGE_TYPE_DROP: | 3031 | case MEM_CGROUP_CHARGE_TYPE_DROP: |
2957 | /* See mem_cgroup_prepare_migration() */ | 3032 | /* See mem_cgroup_prepare_migration() */ |
2958 | if (page_mapped(page) || PageCgroupMigration(pc)) | 3033 | if (page_mapped(page)) |
3034 | goto unlock_out; | ||
3035 | /* | ||
3036 | * Pages under migration may not be uncharged. But | ||
3037 | * end_migration() /must/ be the one uncharging the | ||
3038 | * unused post-migration page and so it has to call | ||
3039 | * here with the migration bit still set. See the | ||
3040 | * res_counter handling below. | ||
3041 | */ | ||
3042 | if (!end_migration && PageCgroupMigration(pc)) | ||
2959 | goto unlock_out; | 3043 | goto unlock_out; |
2960 | break; | 3044 | break; |
2961 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: | 3045 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: |
@@ -2989,7 +3073,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2989 | mem_cgroup_swap_statistics(memcg, true); | 3073 | mem_cgroup_swap_statistics(memcg, true); |
2990 | mem_cgroup_get(memcg); | 3074 | mem_cgroup_get(memcg); |
2991 | } | 3075 | } |
2992 | if (!mem_cgroup_is_root(memcg)) | 3076 | /* |
3077 | * Migration does not charge the res_counter for the | ||
3078 | * replacement page, so leave it alone when phasing out the | ||
3079 | * page that is unused after the migration. | ||
3080 | */ | ||
3081 | if (!end_migration && !mem_cgroup_is_root(memcg)) | ||
2993 | mem_cgroup_do_uncharge(memcg, nr_pages, ctype); | 3082 | mem_cgroup_do_uncharge(memcg, nr_pages, ctype); |
2994 | 3083 | ||
2995 | return memcg; | 3084 | return memcg; |
@@ -3005,14 +3094,16 @@ void mem_cgroup_uncharge_page(struct page *page) | |||
3005 | if (page_mapped(page)) | 3094 | if (page_mapped(page)) |
3006 | return; | 3095 | return; |
3007 | VM_BUG_ON(page->mapping && !PageAnon(page)); | 3096 | VM_BUG_ON(page->mapping && !PageAnon(page)); |
3008 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); | 3097 | if (PageSwapCache(page)) |
3098 | return; | ||
3099 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); | ||
3009 | } | 3100 | } |
3010 | 3101 | ||
3011 | void mem_cgroup_uncharge_cache_page(struct page *page) | 3102 | void mem_cgroup_uncharge_cache_page(struct page *page) |
3012 | { | 3103 | { |
3013 | VM_BUG_ON(page_mapped(page)); | 3104 | VM_BUG_ON(page_mapped(page)); |
3014 | VM_BUG_ON(page->mapping); | 3105 | VM_BUG_ON(page->mapping); |
3015 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 3106 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); |
3016 | } | 3107 | } |
3017 | 3108 | ||
3018 | /* | 3109 | /* |
@@ -3076,7 +3167,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
3076 | if (!swapout) /* this was a swap cache but the swap is unused ! */ | 3167 | if (!swapout) /* this was a swap cache but the swap is unused ! */ |
3077 | ctype = MEM_CGROUP_CHARGE_TYPE_DROP; | 3168 | ctype = MEM_CGROUP_CHARGE_TYPE_DROP; |
3078 | 3169 | ||
3079 | memcg = __mem_cgroup_uncharge_common(page, ctype); | 3170 | memcg = __mem_cgroup_uncharge_common(page, ctype, false); |
3080 | 3171 | ||
3081 | /* | 3172 | /* |
3082 | * record memcg information, if swapout && memcg != NULL, | 3173 | * record memcg information, if swapout && memcg != NULL, |
@@ -3087,7 +3178,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
3087 | } | 3178 | } |
3088 | #endif | 3179 | #endif |
3089 | 3180 | ||
3090 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3181 | #ifdef CONFIG_MEMCG_SWAP |
3091 | /* | 3182 | /* |
3092 | * called from swap_entry_free(). remove record in swap_cgroup and | 3183 | * called from swap_entry_free(). remove record in swap_cgroup and |
3093 | * uncharge "memsw" account. | 3184 | * uncharge "memsw" account. |
@@ -3166,19 +3257,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
3166 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old | 3257 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old |
3167 | * page belongs to. | 3258 | * page belongs to. |
3168 | */ | 3259 | */ |
3169 | int mem_cgroup_prepare_migration(struct page *page, | 3260 | void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, |
3170 | struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) | 3261 | struct mem_cgroup **memcgp) |
3171 | { | 3262 | { |
3172 | struct mem_cgroup *memcg = NULL; | 3263 | struct mem_cgroup *memcg = NULL; |
3173 | struct page_cgroup *pc; | 3264 | struct page_cgroup *pc; |
3174 | enum charge_type ctype; | 3265 | enum charge_type ctype; |
3175 | int ret = 0; | ||
3176 | 3266 | ||
3177 | *memcgp = NULL; | 3267 | *memcgp = NULL; |
3178 | 3268 | ||
3179 | VM_BUG_ON(PageTransHuge(page)); | 3269 | VM_BUG_ON(PageTransHuge(page)); |
3180 | if (mem_cgroup_disabled()) | 3270 | if (mem_cgroup_disabled()) |
3181 | return 0; | 3271 | return; |
3182 | 3272 | ||
3183 | pc = lookup_page_cgroup(page); | 3273 | pc = lookup_page_cgroup(page); |
3184 | lock_page_cgroup(pc); | 3274 | lock_page_cgroup(pc); |
@@ -3223,24 +3313,9 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3223 | * we return here. | 3313 | * we return here. |
3224 | */ | 3314 | */ |
3225 | if (!memcg) | 3315 | if (!memcg) |
3226 | return 0; | 3316 | return; |
3227 | 3317 | ||
3228 | *memcgp = memcg; | 3318 | *memcgp = memcg; |
3229 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); | ||
3230 | css_put(&memcg->css);/* drop extra refcnt */ | ||
3231 | if (ret) { | ||
3232 | if (PageAnon(page)) { | ||
3233 | lock_page_cgroup(pc); | ||
3234 | ClearPageCgroupMigration(pc); | ||
3235 | unlock_page_cgroup(pc); | ||
3236 | /* | ||
3237 | * The old page may be fully unmapped while we kept it. | ||
3238 | */ | ||
3239 | mem_cgroup_uncharge_page(page); | ||
3240 | } | ||
3241 | /* we'll need to revisit this error code (we have -EINTR) */ | ||
3242 | return -ENOMEM; | ||
3243 | } | ||
3244 | /* | 3319 | /* |
3245 | * We charge new page before it's used/mapped. So, even if unlock_page() | 3320 | * We charge new page before it's used/mapped. So, even if unlock_page() |
3246 | * is called before end_migration, we can catch all events on this new | 3321 | * is called before end_migration, we can catch all events on this new |
@@ -3248,13 +3323,15 @@ int mem_cgroup_prepare_migration(struct page *page, | |||
3248 | * mapcount will be finally 0 and we call uncharge in end_migration(). | 3323 | * mapcount will be finally 0 and we call uncharge in end_migration(). |
3249 | */ | 3324 | */ |
3250 | if (PageAnon(page)) | 3325 | if (PageAnon(page)) |
3251 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | 3326 | ctype = MEM_CGROUP_CHARGE_TYPE_ANON; |
3252 | else if (page_is_file_cache(page)) | ||
3253 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
3254 | else | 3327 | else |
3255 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | 3328 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; |
3329 | /* | ||
3330 | * The page is committed to the memcg, but it's not actually | ||
3331 | * charged to the res_counter since we plan on replacing the | ||
3332 | * old one and only one page is going to be left afterwards. | ||
3333 | */ | ||
3256 | __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); | 3334 | __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); |
3257 | return ret; | ||
3258 | } | 3335 | } |
3259 | 3336 | ||
3260 | /* remove redundant charge if migration failed*/ | 3337 | /* remove redundant charge if migration failed*/ |
@@ -3276,6 +3353,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3276 | used = newpage; | 3353 | used = newpage; |
3277 | unused = oldpage; | 3354 | unused = oldpage; |
3278 | } | 3355 | } |
3356 | anon = PageAnon(used); | ||
3357 | __mem_cgroup_uncharge_common(unused, | ||
3358 | anon ? MEM_CGROUP_CHARGE_TYPE_ANON | ||
3359 | : MEM_CGROUP_CHARGE_TYPE_CACHE, | ||
3360 | true); | ||
3361 | css_put(&memcg->css); | ||
3279 | /* | 3362 | /* |
3280 | * We disallowed uncharge of pages under migration because mapcount | 3363 | * We disallowed uncharge of pages under migration because mapcount |
3281 | * of the page goes down to zero, temporarily. | 3364 |
@@ -3285,10 +3368,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, | |||
3285 | lock_page_cgroup(pc); | 3368 | lock_page_cgroup(pc); |
3286 | ClearPageCgroupMigration(pc); | 3369 | ClearPageCgroupMigration(pc); |
3287 | unlock_page_cgroup(pc); | 3370 | unlock_page_cgroup(pc); |
3288 | anon = PageAnon(used); | ||
3289 | __mem_cgroup_uncharge_common(unused, | ||
3290 | anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED | ||
3291 | : MEM_CGROUP_CHARGE_TYPE_CACHE); | ||
3292 | 3371 | ||
3293 | /* | 3372 | /* |
3294 | * If a page is a file cache, radix-tree replacement is very atomic | 3373 | * If a page is a file cache, radix-tree replacement is very atomic |
@@ -3340,10 +3419,6 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, | |||
3340 | */ | 3419 | */ |
3341 | if (!memcg) | 3420 | if (!memcg) |
3342 | return; | 3421 | return; |
3343 | |||
3344 | if (PageSwapBacked(oldpage)) | ||
3345 | type = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
3346 | |||
3347 | /* | 3422 | /* |
3348 | * Even if newpage->mapping was NULL before starting replacement, | 3423 | * Even if newpage->mapping was NULL before starting replacement, |
3349 | * the newpage may be on LRU(or pagevec for LRU) already. We lock | 3424 | * the newpage may be on LRU(or pagevec for LRU) already. We lock |
@@ -3418,7 +3493,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
3418 | /* | 3493 | /* |
3419 | * Rather than hide all in some function, I do this in | 3494 | * Rather than hide all in some function, I do this in |
3420 | * open coded manner. You see what this really does. | 3495 | * open coded manner. You see what this really does. |
3421 | * We have to guarantee memcg->res.limit < memcg->memsw.limit. | 3496 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. |
3422 | */ | 3497 | */ |
3423 | mutex_lock(&set_limit_mutex); | 3498 | mutex_lock(&set_limit_mutex); |
3424 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | 3499 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); |
@@ -3479,7 +3554,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
3479 | /* | 3554 | /* |
3480 | * Rather than hide all in some function, I do this in | 3555 | * Rather than hide all in some function, I do this in |
3481 | * open coded manner. You see what this really does. | 3556 | * open coded manner. You see what this really does. |
3482 | * We have to guarantee memcg->res.limit < memcg->memsw.limit. | 3557 | * We have to guarantee memcg->res.limit <= memcg->memsw.limit. |
3483 | */ | 3558 | */ |
3484 | mutex_lock(&set_limit_mutex); | 3559 | mutex_lock(&set_limit_mutex); |
3485 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | 3560 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); |
@@ -3611,10 +3686,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
3611 | } | 3686 | } |
3612 | 3687 | ||
3613 | /* | 3688 | /* |
3614 | * This routine traverse page_cgroup in given list and drop them all. | 3689 | * Traverse a specified page_cgroup list and try to drop them all. This doesn't |
3615 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 3690 | * reclaim the pages page themselves - it just removes the page_cgroups. |
3691 | * Returns true if some page_cgroups were not freed, indicating that the caller | ||
3692 | * must retry this operation. | ||
3616 | */ | 3693 | */ |
3617 | static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | 3694 | static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, |
3618 | int node, int zid, enum lru_list lru) | 3695 | int node, int zid, enum lru_list lru) |
3619 | { | 3696 | { |
3620 | struct mem_cgroup_per_zone *mz; | 3697 | struct mem_cgroup_per_zone *mz; |
@@ -3622,7 +3699,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3622 | struct list_head *list; | 3699 | struct list_head *list; |
3623 | struct page *busy; | 3700 | struct page *busy; |
3624 | struct zone *zone; | 3701 | struct zone *zone; |
3625 | int ret = 0; | ||
3626 | 3702 | ||
3627 | zone = &NODE_DATA(node)->node_zones[zid]; | 3703 | zone = &NODE_DATA(node)->node_zones[zid]; |
3628 | mz = mem_cgroup_zoneinfo(memcg, node, zid); | 3704 | mz = mem_cgroup_zoneinfo(memcg, node, zid); |
@@ -3636,7 +3712,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3636 | struct page_cgroup *pc; | 3712 | struct page_cgroup *pc; |
3637 | struct page *page; | 3713 | struct page *page; |
3638 | 3714 | ||
3639 | ret = 0; | ||
3640 | spin_lock_irqsave(&zone->lru_lock, flags); | 3715 | spin_lock_irqsave(&zone->lru_lock, flags); |
3641 | if (list_empty(list)) { | 3716 | if (list_empty(list)) { |
3642 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 3717 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
@@ -3653,21 +3728,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3653 | 3728 | ||
3654 | pc = lookup_page_cgroup(page); | 3729 | pc = lookup_page_cgroup(page); |
3655 | 3730 | ||
3656 | ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); | 3731 | if (mem_cgroup_move_parent(page, pc, memcg)) { |
3657 | if (ret == -ENOMEM || ret == -EINTR) | ||
3658 | break; | ||
3659 | |||
3660 | if (ret == -EBUSY || ret == -EINVAL) { | ||
3661 | /* found lock contention or "pc" is obsolete. */ | 3732 | /* found lock contention or "pc" is obsolete. */ |
3662 | busy = page; | 3733 | busy = page; |
3663 | cond_resched(); | 3734 | cond_resched(); |
3664 | } else | 3735 | } else |
3665 | busy = NULL; | 3736 | busy = NULL; |
3666 | } | 3737 | } |
3667 | 3738 | return !list_empty(list); | |
3668 | if (!ret && !list_empty(list)) | ||
3669 | return -EBUSY; | ||
3670 | return ret; | ||
3671 | } | 3739 | } |
3672 | 3740 | ||
3673 | /* | 3741 | /* |
@@ -3692,9 +3760,6 @@ move_account: | |||
3692 | ret = -EBUSY; | 3760 | ret = -EBUSY; |
3693 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) | 3761 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) |
3694 | goto out; | 3762 | goto out; |
3695 | ret = -EINTR; | ||
3696 | if (signal_pending(current)) | ||
3697 | goto out; | ||
3698 | /* This is for making all *used* pages to be on LRU. */ | 3763 | /* This is for making all *used* pages to be on LRU. */ |
3699 | lru_add_drain_all(); | 3764 | lru_add_drain_all(); |
3700 | drain_all_stock_sync(memcg); | 3765 | drain_all_stock_sync(memcg); |
@@ -3715,9 +3780,6 @@ move_account: | |||
3715 | } | 3780 | } |
3716 | mem_cgroup_end_move(memcg); | 3781 | mem_cgroup_end_move(memcg); |
3717 | memcg_oom_recover(memcg); | 3782 | memcg_oom_recover(memcg); |
3718 | /* it seems parent cgroup doesn't have enough mem */ | ||
3719 | if (ret == -ENOMEM) | ||
3720 | goto try_to_free; | ||
3721 | cond_resched(); | 3783 | cond_resched(); |
3722 | /* "ret" should also be checked to ensure all lists are empty. */ | 3784 | /* "ret" should also be checked to ensure all lists are empty. */ |
3723 | } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); | 3785 | } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); |
@@ -3779,6 +3841,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3779 | parent_memcg = mem_cgroup_from_cont(parent); | 3841 | parent_memcg = mem_cgroup_from_cont(parent); |
3780 | 3842 | ||
3781 | cgroup_lock(); | 3843 | cgroup_lock(); |
3844 | |||
3845 | if (memcg->use_hierarchy == val) | ||
3846 | goto out; | ||
3847 | |||
3782 | /* | 3848 | /* |
3783 | * If parent's use_hierarchy is set, we can't make any modifications | 3849 | * If parent's use_hierarchy is set, we can't make any modifications |
3784 | * in the child subtrees. If it is unset, then the change can | 3850 | * in the child subtrees. If it is unset, then the change can |
@@ -3795,6 +3861,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
3795 | retval = -EBUSY; | 3861 | retval = -EBUSY; |
3796 | } else | 3862 | } else |
3797 | retval = -EINVAL; | 3863 | retval = -EINVAL; |
3864 | |||
3865 | out: | ||
3798 | cgroup_unlock(); | 3866 | cgroup_unlock(); |
3799 | 3867 | ||
3800 | return retval; | 3868 | return retval; |
@@ -3831,7 +3899,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) | |||
3831 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); | 3899 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); |
3832 | 3900 | ||
3833 | if (swap) | 3901 | if (swap) |
3834 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); | 3902 | val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); |
3835 | 3903 | ||
3836 | return val << PAGE_SHIFT; | 3904 | return val << PAGE_SHIFT; |
3837 | } | 3905 | } |
@@ -4015,7 +4083,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | |||
4015 | #endif | 4083 | #endif |
4016 | 4084 | ||
4017 | #ifdef CONFIG_NUMA | 4085 | #ifdef CONFIG_NUMA |
4018 | static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft, | 4086 | static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, |
4019 | struct seq_file *m) | 4087 | struct seq_file *m) |
4020 | { | 4088 | { |
4021 | int nid; | 4089 | int nid; |
@@ -4074,7 +4142,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void) | |||
4074 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); | 4142 | BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); |
4075 | } | 4143 | } |
4076 | 4144 | ||
4077 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | 4145 | static int memcg_stat_show(struct cgroup *cont, struct cftype *cft, |
4078 | struct seq_file *m) | 4146 | struct seq_file *m) |
4079 | { | 4147 | { |
4080 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 4148 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
@@ -4082,7 +4150,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4082 | unsigned int i; | 4150 | unsigned int i; |
4083 | 4151 | ||
4084 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 4152 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
4085 | if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) | 4153 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) |
4086 | continue; | 4154 | continue; |
4087 | seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], | 4155 | seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], |
4088 | mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); | 4156 | mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); |
@@ -4109,7 +4177,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
4109 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { | 4177 | for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { |
4110 | long long val = 0; | 4178 | long long val = 0; |
4111 | 4179 | ||
4112 | if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) | 4180 | if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) |
4113 | continue; | 4181 | continue; |
4114 | for_each_mem_cgroup_tree(mi, memcg) | 4182 | for_each_mem_cgroup_tree(mi, memcg) |
4115 | val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; | 4183 | val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; |
@@ -4533,7 +4601,7 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | |||
4533 | return 0; | 4601 | return 0; |
4534 | } | 4602 | } |
4535 | 4603 | ||
4536 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM | 4604 | #ifdef CONFIG_MEMCG_KMEM |
4537 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | 4605 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
4538 | { | 4606 | { |
4539 | return mem_cgroup_sockets_init(memcg, ss); | 4607 | return mem_cgroup_sockets_init(memcg, ss); |
@@ -4588,7 +4656,7 @@ static struct cftype mem_cgroup_files[] = { | |||
4588 | }, | 4656 | }, |
4589 | { | 4657 | { |
4590 | .name = "stat", | 4658 | .name = "stat", |
4591 | .read_seq_string = mem_control_stat_show, | 4659 | .read_seq_string = memcg_stat_show, |
4592 | }, | 4660 | }, |
4593 | { | 4661 | { |
4594 | .name = "force_empty", | 4662 | .name = "force_empty", |
@@ -4620,10 +4688,10 @@ static struct cftype mem_cgroup_files[] = { | |||
4620 | #ifdef CONFIG_NUMA | 4688 | #ifdef CONFIG_NUMA |
4621 | { | 4689 | { |
4622 | .name = "numa_stat", | 4690 | .name = "numa_stat", |
4623 | .read_seq_string = mem_control_numa_stat_show, | 4691 | .read_seq_string = memcg_numa_stat_show, |
4624 | }, | 4692 | }, |
4625 | #endif | 4693 | #endif |
4626 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 4694 | #ifdef CONFIG_MEMCG_SWAP |
4627 | { | 4695 | { |
4628 | .name = "memsw.usage_in_bytes", | 4696 | .name = "memsw.usage_in_bytes", |
4629 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 4697 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
@@ -4810,7 +4878,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) | |||
4810 | } | 4878 | } |
4811 | EXPORT_SYMBOL(parent_mem_cgroup); | 4879 | EXPORT_SYMBOL(parent_mem_cgroup); |
4812 | 4880 | ||
4813 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 4881 | #ifdef CONFIG_MEMCG_SWAP |
4814 | static void __init enable_swap_cgroup(void) | 4882 | static void __init enable_swap_cgroup(void) |
4815 | { | 4883 | { |
4816 | if (!mem_cgroup_disabled() && really_do_swap_account) | 4884 | if (!mem_cgroup_disabled() && really_do_swap_account) |
@@ -5541,7 +5609,7 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
5541 | .__DEPRECATED_clear_css_refs = true, | 5609 | .__DEPRECATED_clear_css_refs = true, |
5542 | }; | 5610 | }; |
5543 | 5611 | ||
5544 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 5612 | #ifdef CONFIG_MEMCG_SWAP |
5545 | static int __init enable_swap_account(char *s) | 5613 | static int __init enable_swap_account(char *s) |
5546 | { | 5614 | { |
5547 | /* consider enabled if no parameter or 1 is given */ | 5615 | /* consider enabled if no parameter or 1 is given */ |
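The memcontrol.c changes above restructure swap-in charging around the existing try/commit/cancel steps: __mem_cgroup_try_charge_swapin() bails out early when the page_cgroup is already marked used, and mem_cgroup_cancel_charge_swapin() rolls back a reservation that never gets committed. As a rough userspace illustration of that transactional pattern (hypothetical names and a toy counter, not the kernel's res_counter API), the sketch below reserves first, commits only if the fault still wins the race, and cancels otherwise.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for a res_counter: usage must stay within limit. */
struct counter {
	long usage;
	long limit;
};

/* Try step: reserve the charge up front; nothing is committed yet. */
static bool try_charge(struct counter *c, long nr)
{
	if (c->usage + nr > c->limit)
		return false;		/* over limit, caller must back off */
	c->usage += nr;
	return true;
}

/* Commit step: the reservation becomes owned by the page; counter unchanged. */
static void commit_charge(struct counter *c, long nr)
{
	(void)c;
	(void)nr;
}

/* Cancel step: undo a reservation that was never committed. */
static void cancel_charge(struct counter *c, long nr)
{
	c->usage -= nr;
}

int main(void)
{
	struct counter memcg = { .usage = 0, .limit = 4 };

	if (try_charge(&memcg, 1)) {
		bool fault_won_race = true;	/* e.g. the pte is still the same */

		if (fault_won_race)
			commit_charge(&memcg, 1);
		else
			cancel_charge(&memcg, 1);
	}
	printf("usage=%ld\n", memcg.usage);
	return 0;
}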
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ab1e7145e290..a6e2141a6610 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -128,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p) | |||
128 | * can only guarantee that the page either belongs to the memcg tasks, or is | 128 | * can only guarantee that the page either belongs to the memcg tasks, or is |
129 | * a freed page. | 129 | * a freed page. |
130 | */ | 130 | */ |
131 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 131 | #ifdef CONFIG_MEMCG_SWAP |
132 | u64 hwpoison_filter_memcg; | 132 | u64 hwpoison_filter_memcg; |
133 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); | 133 | EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); |
134 | static int hwpoison_filter_task(struct page *p) | 134 | static int hwpoison_filter_task(struct page *p) |
@@ -345,14 +345,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, | |||
345 | * Also when FAIL is set do a force kill because something went | 345 | * Also when FAIL is set do a force kill because something went |
346 | * wrong earlier. | 346 | * wrong earlier. |
347 | */ | 347 | */ |
348 | static void kill_procs(struct list_head *to_kill, int doit, int trapno, | 348 | static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, |
349 | int fail, struct page *page, unsigned long pfn, | 349 | int fail, struct page *page, unsigned long pfn, |
350 | int flags) | 350 | int flags) |
351 | { | 351 | { |
352 | struct to_kill *tk, *next; | 352 | struct to_kill *tk, *next; |
353 | 353 | ||
354 | list_for_each_entry_safe (tk, next, to_kill, nd) { | 354 | list_for_each_entry_safe (tk, next, to_kill, nd) { |
355 | if (doit) { | 355 | if (forcekill) { |
356 | /* | 356 | /* |
357 | * In case something went wrong with munmapping | 357 | * In case something went wrong with munmapping |
358 | * make sure the process doesn't catch the | 358 | * make sure the process doesn't catch the |
@@ -858,7 +858,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
858 | struct address_space *mapping; | 858 | struct address_space *mapping; |
859 | LIST_HEAD(tokill); | 859 | LIST_HEAD(tokill); |
860 | int ret; | 860 | int ret; |
861 | int kill = 1; | 861 | int kill = 1, forcekill; |
862 | struct page *hpage = compound_head(p); | 862 | struct page *hpage = compound_head(p); |
863 | struct page *ppage; | 863 | struct page *ppage; |
864 | 864 | ||
@@ -888,7 +888,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
888 | * be called inside page lock (it's recommended but not enforced). | 888 | * be called inside page lock (it's recommended but not enforced). |
889 | */ | 889 | */ |
890 | mapping = page_mapping(hpage); | 890 | mapping = page_mapping(hpage); |
891 | if (!PageDirty(hpage) && mapping && | 891 | if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && |
892 | mapping_cap_writeback_dirty(mapping)) { | 892 | mapping_cap_writeback_dirty(mapping)) { |
893 | if (page_mkclean(hpage)) { | 893 | if (page_mkclean(hpage)) { |
894 | SetPageDirty(hpage); | 894 | SetPageDirty(hpage); |
@@ -965,12 +965,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, | |||
965 | * Now that the dirty bit has been propagated to the | 965 | * Now that the dirty bit has been propagated to the |
966 | * struct page and all unmaps done we can decide if | 966 | * struct page and all unmaps done we can decide if |
967 | * killing is needed or not. Only kill when the page | 967 | * killing is needed or not. Only kill when the page |
968 | * was dirty, otherwise the tokill list is merely | 968 | * was dirty or the process is not restartable, |
969 | * otherwise the tokill list is merely | ||
969 | * freed. When there was a problem unmapping earlier | 970 | * freed. When there was a problem unmapping earlier |
970 | * use a more force-full uncatchable kill to prevent | 971 | * use a more force-full uncatchable kill to prevent |
971 | * any accesses to the poisoned memory. | 972 | * any accesses to the poisoned memory. |
972 | */ | 973 | */ |
973 | kill_procs(&tokill, !!PageDirty(ppage), trapno, | 974 | forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); |
975 | kill_procs(&tokill, forcekill, trapno, | ||
974 | ret != SWAP_SUCCESS, p, pfn, flags); | 976 | ret != SWAP_SUCCESS, p, pfn, flags); |
975 | 977 | ||
976 | return ret; | 978 | return ret; |
@@ -1414,7 +1416,6 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1414 | int ret; | 1416 | int ret; |
1415 | unsigned long pfn = page_to_pfn(page); | 1417 | unsigned long pfn = page_to_pfn(page); |
1416 | struct page *hpage = compound_head(page); | 1418 | struct page *hpage = compound_head(page); |
1417 | LIST_HEAD(pagelist); | ||
1418 | 1419 | ||
1419 | ret = get_any_page(page, pfn, flags); | 1420 | ret = get_any_page(page, pfn, flags); |
1420 | if (ret < 0) | 1421 | if (ret < 0) |
@@ -1429,24 +1430,18 @@ static int soft_offline_huge_page(struct page *page, int flags) | |||
1429 | } | 1430 | } |
1430 | 1431 | ||
1431 | /* Keep page count to indicate a given hugepage is isolated. */ | 1432 | /* Keep page count to indicate a given hugepage is isolated. */ |
1432 | 1433 | ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false, | |
1433 | list_add(&hpage->lru, &pagelist); | 1434 | MIGRATE_SYNC); |
1434 | ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, | 1435 | put_page(hpage); |
1435 | true); | ||
1436 | if (ret) { | 1436 | if (ret) { |
1437 | struct page *page1, *page2; | ||
1438 | list_for_each_entry_safe(page1, page2, &pagelist, lru) | ||
1439 | put_page(page1); | ||
1440 | |||
1441 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1437 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
1442 | pfn, ret, page->flags); | 1438 | pfn, ret, page->flags); |
1443 | if (ret > 0) | ||
1444 | ret = -EIO; | ||
1445 | return ret; | 1439 | return ret; |
1446 | } | 1440 | } |
1447 | done: | 1441 | done: |
1448 | if (!PageHWPoison(hpage)) | 1442 | if (!PageHWPoison(hpage)) |
1449 | atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); | 1443 | atomic_long_add(1 << compound_trans_order(hpage), |
1444 | &mce_bad_pages); | ||
1450 | set_page_hwpoison_huge_page(hpage); | 1445 | set_page_hwpoison_huge_page(hpage); |
1451 | dequeue_hwpoisoned_huge_page(hpage); | 1446 | dequeue_hwpoisoned_huge_page(hpage); |
1452 | /* keep elevated page count for bad page */ | 1447 | /* keep elevated page count for bad page */ |
@@ -1561,7 +1556,7 @@ int soft_offline_page(struct page *page, int flags) | |||
1561 | page_is_file_cache(page)); | 1556 | page_is_file_cache(page)); |
1562 | list_add(&page->lru, &pagelist); | 1557 | list_add(&page->lru, &pagelist); |
1563 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, | 1558 | ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, |
1564 | 0, MIGRATE_SYNC); | 1559 | false, MIGRATE_SYNC); |
1565 | if (ret) { | 1560 | if (ret) { |
1566 | putback_lru_pages(&pagelist); | 1561 | putback_lru_pages(&pagelist); |
1567 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", | 1562 | pr_info("soft offline: %#lx: migration failed %d, type %lx\n", |
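In the memory-failure.c hunks the kill decision changes from "only when the page was dirty" to "when the page was dirty or the caller passed MF_MUST_KILL", and the page_mkclean() shortcut is skipped in the must-kill case. A minimal sketch of that predicate follows; the flag value here is illustrative rather than taken from the kernel headers.

#include <stdbool.h>
#include <stdio.h>

#define MF_MUST_KILL	0x4	/* illustrative flag bit, not the kernel's definition */

/*
 * Force an uncatchable kill when the poisoned page was dirty (its data is
 * already lost) or when the caller explicitly demands a kill.
 */
static bool forcekill(bool page_dirty, unsigned int flags)
{
	return page_dirty || (flags & MF_MUST_KILL);
}

int main(void)
{
	printf("clean page, default flags: %d\n", forcekill(false, 0));
	printf("clean page, MF_MUST_KILL : %d\n", forcekill(false, MF_MUST_KILL));
	printf("dirty page, default flags: %d\n", forcekill(true, 0));
	return 0;
}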
diff --git a/mm/memory.c b/mm/memory.c index 1b7dc662bf9f..482f089765ff 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -206,6 +206,8 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) | |||
206 | tlb->mm = mm; | 206 | tlb->mm = mm; |
207 | 207 | ||
208 | tlb->fullmm = fullmm; | 208 | tlb->fullmm = fullmm; |
209 | tlb->start = -1UL; | ||
210 | tlb->end = 0; | ||
209 | tlb->need_flush = 0; | 211 | tlb->need_flush = 0; |
210 | tlb->fast_mode = (num_possible_cpus() == 1); | 212 | tlb->fast_mode = (num_possible_cpus() == 1); |
211 | tlb->local.next = NULL; | 213 | tlb->local.next = NULL; |
@@ -248,6 +250,8 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e | |||
248 | { | 250 | { |
249 | struct mmu_gather_batch *batch, *next; | 251 | struct mmu_gather_batch *batch, *next; |
250 | 252 | ||
253 | tlb->start = start; | ||
254 | tlb->end = end; | ||
251 | tlb_flush_mmu(tlb); | 255 | tlb_flush_mmu(tlb); |
252 | 256 | ||
253 | /* keep the page table cache within bounds */ | 257 | /* keep the page table cache within bounds */ |
@@ -1204,6 +1208,11 @@ again: | |||
1204 | */ | 1208 | */ |
1205 | if (force_flush) { | 1209 | if (force_flush) { |
1206 | force_flush = 0; | 1210 | force_flush = 0; |
1211 | |||
1212 | #ifdef HAVE_GENERIC_MMU_GATHER | ||
1213 | tlb->start = addr; | ||
1214 | tlb->end = end; | ||
1215 | #endif | ||
1207 | tlb_flush_mmu(tlb); | 1216 | tlb_flush_mmu(tlb); |
1208 | if (addr != end) | 1217 | if (addr != end) |
1209 | goto again; | 1218 | goto again; |
@@ -1225,7 +1234,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, | |||
1225 | next = pmd_addr_end(addr, end); | 1234 | next = pmd_addr_end(addr, end); |
1226 | if (pmd_trans_huge(*pmd)) { | 1235 | if (pmd_trans_huge(*pmd)) { |
1227 | if (next - addr != HPAGE_PMD_SIZE) { | 1236 | if (next - addr != HPAGE_PMD_SIZE) { |
1228 | VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); | 1237 | #ifdef CONFIG_DEBUG_VM |
1238 | if (!rwsem_is_locked(&tlb->mm->mmap_sem)) { | ||
1239 | pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n", | ||
1240 | __func__, addr, end, | ||
1241 | vma->vm_start, | ||
1242 | vma->vm_end); | ||
1243 | BUG(); | ||
1244 | } | ||
1245 | #endif | ||
1229 | split_huge_page_pmd(vma->vm_mm, pmd); | 1246 | split_huge_page_pmd(vma->vm_mm, pmd); |
1230 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) | 1247 | } else if (zap_huge_pmd(tlb, vma, pmd, addr)) |
1231 | goto next; | 1248 | goto next; |
@@ -1326,8 +1343,11 @@ static void unmap_single_vma(struct mmu_gather *tlb, | |||
1326 | * Since no pte has actually been setup, it is | 1343 | * Since no pte has actually been setup, it is |
1327 | * safe to do nothing in this case. | 1344 | * safe to do nothing in this case. |
1328 | */ | 1345 | */ |
1329 | if (vma->vm_file) | 1346 | if (vma->vm_file) { |
1330 | unmap_hugepage_range(vma, start, end, NULL); | 1347 | mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); |
1348 | __unmap_hugepage_range_final(tlb, vma, start, end, NULL); | ||
1349 | mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); | ||
1350 | } | ||
1331 | } else | 1351 | } else |
1332 | unmap_page_range(tlb, vma, start, end, details); | 1352 | unmap_page_range(tlb, vma, start, end, details); |
1333 | } | 1353 | } |
@@ -1366,7 +1386,7 @@ void unmap_vmas(struct mmu_gather *tlb, | |||
1366 | /** | 1386 | /** |
1367 | * zap_page_range - remove user pages in a given range | 1387 | * zap_page_range - remove user pages in a given range |
1368 | * @vma: vm_area_struct holding the applicable pages | 1388 | * @vma: vm_area_struct holding the applicable pages |
1369 | * @address: starting address of pages to zap | 1389 | * @start: starting address of pages to zap |
1370 | * @size: number of bytes to zap | 1390 | * @size: number of bytes to zap |
1371 | * @details: details of nonlinear truncation or shared cache invalidation | 1391 | * @details: details of nonlinear truncation or shared cache invalidation |
1372 | * | 1392 | * |
@@ -3921,7 +3941,7 @@ void print_vma_addr(char *prefix, unsigned long ip) | |||
3921 | free_page((unsigned long)buf); | 3941 | free_page((unsigned long)buf); |
3922 | } | 3942 | } |
3923 | } | 3943 | } |
3924 | up_read(¤t->mm->mmap_sem); | 3944 | up_read(&mm->mmap_sem); |
3925 | } | 3945 | } |
3926 | 3946 | ||
3927 | #ifdef CONFIG_PROVE_LOCKING | 3947 | #ifdef CONFIG_PROVE_LOCKING |
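The memory.c changes seed the mmu_gather with an empty range (start = -1UL, end = 0) and record the span actually being torn down before each TLB flush. The standalone sketch below mimics that min/max range bookkeeping with made-up names; it is not the kernel's mmu_gather structure.

#include <stdio.h>

/* Toy range accumulator mirroring the tlb->start/tlb->end bookkeeping. */
struct gather {
	unsigned long start;	/* lowest address seen so far */
	unsigned long end;	/* one past the highest address seen */
};

static void gather_init(struct gather *g)
{
	g->start = -1UL;	/* "empty" range: start > end */
	g->end = 0;
}

static void gather_add(struct gather *g, unsigned long addr, unsigned long end)
{
	if (addr < g->start)
		g->start = addr;
	if (end > g->end)
		g->end = end;
}

static void gather_flush(struct gather *g)
{
	if (g->start < g->end)	/* only flush when something was gathered */
		printf("flush [%#lx, %#lx)\n", g->start, g->end);
	gather_init(g);		/* start a fresh, empty range */
}

int main(void)
{
	struct gather g;

	gather_init(&g);
	gather_add(&g, 0x1000, 0x3000);
	gather_add(&g, 0x8000, 0x9000);
	gather_flush(&g);	/* flushes the union [0x1000, 0x9000) */
	gather_flush(&g);	/* nothing gathered, no flush */
	return 0;
}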
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0d7e3ec8e0f3..3ad25f9d1fc1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -512,19 +512,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) | |||
512 | 512 | ||
513 | zone->present_pages += onlined_pages; | 513 | zone->present_pages += onlined_pages; |
514 | zone->zone_pgdat->node_present_pages += onlined_pages; | 514 | zone->zone_pgdat->node_present_pages += onlined_pages; |
515 | if (need_zonelists_rebuild) | 515 | if (onlined_pages) { |
516 | build_all_zonelists(zone); | 516 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); |
517 | else | 517 | if (need_zonelists_rebuild) |
518 | zone_pcp_update(zone); | 518 | build_all_zonelists(NULL, zone); |
519 | else | ||
520 | zone_pcp_update(zone); | ||
521 | } | ||
519 | 522 | ||
520 | mutex_unlock(&zonelists_mutex); | 523 | mutex_unlock(&zonelists_mutex); |
521 | 524 | ||
522 | init_per_zone_wmark_min(); | 525 | init_per_zone_wmark_min(); |
523 | 526 | ||
524 | if (onlined_pages) { | 527 | if (onlined_pages) |
525 | kswapd_run(zone_to_nid(zone)); | 528 | kswapd_run(zone_to_nid(zone)); |
526 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | ||
527 | } | ||
528 | 529 | ||
529 | vm_total_pages = nr_free_pagecache_pages(); | 530 | vm_total_pages = nr_free_pagecache_pages(); |
530 | 531 | ||
@@ -562,7 +563,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) | |||
562 | * to access not-initialized zonelist, build here. | 563 | * to access not-initialized zonelist, build here. |
563 | */ | 564 | */ |
564 | mutex_lock(&zonelists_mutex); | 565 | mutex_lock(&zonelists_mutex); |
565 | build_all_zonelists(NULL); | 566 | build_all_zonelists(pgdat, NULL); |
566 | mutex_unlock(&zonelists_mutex); | 567 | mutex_unlock(&zonelists_mutex); |
567 | 568 | ||
568 | return pgdat; | 569 | return pgdat; |
@@ -618,7 +619,7 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
618 | pgdat = hotadd_new_pgdat(nid, start); | 619 | pgdat = hotadd_new_pgdat(nid, start); |
619 | ret = -ENOMEM; | 620 | ret = -ENOMEM; |
620 | if (!pgdat) | 621 | if (!pgdat) |
621 | goto out; | 622 | goto error; |
622 | new_pgdat = 1; | 623 | new_pgdat = 1; |
623 | } | 624 | } |
624 | 625 | ||
@@ -965,6 +966,9 @@ repeat: | |||
965 | 966 | ||
966 | init_per_zone_wmark_min(); | 967 | init_per_zone_wmark_min(); |
967 | 968 | ||
969 | if (!populated_zone(zone)) | ||
970 | zone_pcp_reset(zone); | ||
971 | |||
968 | if (!node_present_pages(node)) { | 972 | if (!node_present_pages(node)) { |
969 | node_clear_state(node, N_HIGH_MEMORY); | 973 | node_clear_state(node, N_HIGH_MEMORY); |
970 | kswapd_stop(node); | 974 | kswapd_stop(node); |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f15c1b24ca18..bd92431d4c49 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -1177,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1177 | if (!list_empty(&pagelist)) { | 1177 | if (!list_empty(&pagelist)) { |
1178 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1178 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
1179 | (unsigned long)vma, | 1179 | (unsigned long)vma, |
1180 | false, true); | 1180 | false, MIGRATE_SYNC); |
1181 | if (nr_failed) | 1181 | if (nr_failed) |
1182 | putback_lru_pages(&pagelist); | 1182 | putback_lru_pages(&pagelist); |
1183 | } | 1183 | } |
@@ -1602,8 +1602,14 @@ static unsigned interleave_nodes(struct mempolicy *policy) | |||
1602 | * task can change its policy. The system default policy requires no | 1602 |
1603 | * such protection. | 1603 | * such protection. |
1604 | */ | 1604 | */ |
1605 | unsigned slab_node(struct mempolicy *policy) | 1605 | unsigned slab_node(void) |
1606 | { | 1606 | { |
1607 | struct mempolicy *policy; | ||
1608 | |||
1609 | if (in_interrupt()) | ||
1610 | return numa_node_id(); | ||
1611 | |||
1612 | policy = current->mempolicy; | ||
1607 | if (!policy || policy->flags & MPOL_F_LOCAL) | 1613 | if (!policy || policy->flags & MPOL_F_LOCAL) |
1608 | return numa_node_id(); | 1614 | return numa_node_id(); |
1609 | 1615 | ||
diff --git a/mm/migrate.c b/mm/migrate.c index be26d5cbe56b..77ed2d773705 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/memcontrol.h> | 33 | #include <linux/memcontrol.h> |
34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
35 | #include <linux/hugetlb.h> | 35 | #include <linux/hugetlb.h> |
36 | #include <linux/hugetlb_cgroup.h> | ||
36 | #include <linux/gfp.h> | 37 | #include <linux/gfp.h> |
37 | 38 | ||
38 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
@@ -682,7 +683,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
682 | { | 683 | { |
683 | int rc = -EAGAIN; | 684 | int rc = -EAGAIN; |
684 | int remap_swapcache = 1; | 685 | int remap_swapcache = 1; |
685 | int charge = 0; | ||
686 | struct mem_cgroup *mem; | 686 | struct mem_cgroup *mem; |
687 | struct anon_vma *anon_vma = NULL; | 687 | struct anon_vma *anon_vma = NULL; |
688 | 688 | ||
@@ -724,12 +724,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, | |||
724 | } | 724 | } |
725 | 725 | ||
726 | /* charge against new page */ | 726 | /* charge against new page */ |
727 | charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL); | 727 | mem_cgroup_prepare_migration(page, newpage, &mem); |
728 | if (charge == -ENOMEM) { | ||
729 | rc = -ENOMEM; | ||
730 | goto unlock; | ||
731 | } | ||
732 | BUG_ON(charge); | ||
733 | 728 | ||
734 | if (PageWriteback(page)) { | 729 | if (PageWriteback(page)) { |
735 | /* | 730 | /* |
@@ -819,8 +814,7 @@ skip_unmap: | |||
819 | put_anon_vma(anon_vma); | 814 | put_anon_vma(anon_vma); |
820 | 815 | ||
821 | uncharge: | 816 | uncharge: |
822 | if (!charge) | 817 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); |
823 | mem_cgroup_end_migration(mem, page, newpage, rc == 0); | ||
824 | unlock: | 818 | unlock: |
825 | unlock_page(page); | 819 | unlock_page(page); |
826 | out: | 820 | out: |
@@ -931,16 +925,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, | |||
931 | 925 | ||
932 | if (anon_vma) | 926 | if (anon_vma) |
933 | put_anon_vma(anon_vma); | 927 | put_anon_vma(anon_vma); |
934 | unlock_page(hpage); | ||
935 | 928 | ||
936 | out: | 929 | if (!rc) |
937 | if (rc != -EAGAIN) { | 930 | hugetlb_cgroup_migrate(hpage, new_hpage); |
938 | list_del(&hpage->lru); | ||
939 | put_page(hpage); | ||
940 | } | ||
941 | 931 | ||
932 | unlock_page(hpage); | ||
933 | out: | ||
942 | put_page(new_hpage); | 934 | put_page(new_hpage); |
943 | |||
944 | if (result) { | 935 | if (result) { |
945 | if (rc) | 936 | if (rc) |
946 | *result = rc; | 937 | *result = rc; |
@@ -1016,48 +1007,32 @@ out: | |||
1016 | return nr_failed + retry; | 1007 | return nr_failed + retry; |
1017 | } | 1008 | } |
1018 | 1009 | ||
1019 | int migrate_huge_pages(struct list_head *from, | 1010 | int migrate_huge_page(struct page *hpage, new_page_t get_new_page, |
1020 | new_page_t get_new_page, unsigned long private, bool offlining, | 1011 | unsigned long private, bool offlining, |
1021 | enum migrate_mode mode) | 1012 | enum migrate_mode mode) |
1022 | { | 1013 | { |
1023 | int retry = 1; | 1014 | int pass, rc; |
1024 | int nr_failed = 0; | 1015 | |
1025 | int pass = 0; | 1016 | for (pass = 0; pass < 10; pass++) { |
1026 | struct page *page; | 1017 | rc = unmap_and_move_huge_page(get_new_page, |
1027 | struct page *page2; | 1018 | private, hpage, pass > 2, offlining, |
1028 | int rc; | 1019 | mode); |
1029 | 1020 | switch (rc) { | |
1030 | for (pass = 0; pass < 10 && retry; pass++) { | 1021 | case -ENOMEM: |
1031 | retry = 0; | 1022 | goto out; |
1032 | 1023 | case -EAGAIN: | |
1033 | list_for_each_entry_safe(page, page2, from, lru) { | 1024 | /* try again */ |
1034 | cond_resched(); | 1025 | cond_resched(); |
1035 | 1026 | break; | |
1036 | rc = unmap_and_move_huge_page(get_new_page, | 1027 | case 0: |
1037 | private, page, pass > 2, offlining, | 1028 | goto out; |
1038 | mode); | 1029 | default: |
1039 | 1030 | rc = -EIO; | |
1040 | switch(rc) { | 1031 | goto out; |
1041 | case -ENOMEM: | ||
1042 | goto out; | ||
1043 | case -EAGAIN: | ||
1044 | retry++; | ||
1045 | break; | ||
1046 | case 0: | ||
1047 | break; | ||
1048 | default: | ||
1049 | /* Permanent failure */ | ||
1050 | nr_failed++; | ||
1051 | break; | ||
1052 | } | ||
1053 | } | 1032 | } |
1054 | } | 1033 | } |
1055 | rc = 0; | ||
1056 | out: | 1034 | out: |
1057 | if (rc) | 1035 | return rc; |
1058 | return rc; | ||
1059 | |||
1060 | return nr_failed + retry; | ||
1061 | } | 1036 | } |
1062 | 1037 | ||
1063 | #ifdef CONFIG_NUMA | 1038 | #ifdef CONFIG_NUMA |
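migrate.c replaces the list-based migrate_huge_pages() with a single-page migrate_huge_page() built around a bounded retry loop: -EAGAIN is retried for up to ten passes, success or -ENOMEM ends the loop immediately, and any other failure is reported as -EIO. A generic userspace sketch of that control flow, with a placeholder operation standing in for unmap_and_move_huge_page():

#include <errno.h>
#include <stdio.h>

/* Placeholder for unmap_and_move_huge_page(): fails twice, then succeeds. */
static int try_operation(int pass)
{
	return pass < 2 ? -EAGAIN : 0;
}

/* Retry transient failures a bounded number of times, bail on hard errors. */
static int migrate_one(void)
{
	int pass, rc = -EAGAIN;

	for (pass = 0; pass < 10; pass++) {
		rc = try_operation(pass);
		switch (rc) {
		case -EAGAIN:
			continue;	/* transient: try another pass */
		case 0:
		case -ENOMEM:
			return rc;	/* success, or give up immediately */
		default:
			return -EIO;	/* any other failure is permanent */
		}
	}
	return rc;	/* still -EAGAIN after all passes */
}

int main(void)
{
	printf("migrate_one() = %d\n", migrate_one());
	return 0;
}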
@@ -943,6 +943,8 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, | |||
943 | const unsigned long stack_flags | 943 | const unsigned long stack_flags |
944 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); | 944 | = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); |
945 | 945 | ||
946 | mm->total_vm += pages; | ||
947 | |||
946 | if (file) { | 948 | if (file) { |
947 | mm->shared_vm += pages; | 949 | mm->shared_vm += pages; |
948 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) | 950 | if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) |
@@ -1347,7 +1349,6 @@ munmap_back: | |||
1347 | out: | 1349 | out: |
1348 | perf_event_mmap(vma); | 1350 | perf_event_mmap(vma); |
1349 | 1351 | ||
1350 | mm->total_vm += len >> PAGE_SHIFT; | ||
1351 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1352 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1352 | if (vm_flags & VM_LOCKED) { | 1353 | if (vm_flags & VM_LOCKED) { |
1353 | if (!mlock_vma_pages_range(vma, addr, addr + len)) | 1354 | if (!mlock_vma_pages_range(vma, addr, addr + len)) |
@@ -1707,7 +1708,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
1707 | return -ENOMEM; | 1708 | return -ENOMEM; |
1708 | 1709 | ||
1709 | /* Ok, everything looks good - let it rip */ | 1710 | /* Ok, everything looks good - let it rip */ |
1710 | mm->total_vm += grow; | ||
1711 | if (vma->vm_flags & VM_LOCKED) | 1711 | if (vma->vm_flags & VM_LOCKED) |
1712 | mm->locked_vm += grow; | 1712 | mm->locked_vm += grow; |
1713 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); | 1713 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); |
@@ -1889,7 +1889,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) | |||
1889 | 1889 | ||
1890 | if (vma->vm_flags & VM_ACCOUNT) | 1890 | if (vma->vm_flags & VM_ACCOUNT) |
1891 | nr_accounted += nrpages; | 1891 | nr_accounted += nrpages; |
1892 | mm->total_vm -= nrpages; | ||
1893 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); | 1892 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); |
1894 | vma = remove_vma(vma); | 1893 | vma = remove_vma(vma); |
1895 | } while (vma); | 1894 | } while (vma); |
@@ -2345,9 +2344,6 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) | |||
2345 | security_vm_enough_memory_mm(mm, vma_pages(vma))) | 2344 | security_vm_enough_memory_mm(mm, vma_pages(vma))) |
2346 | return -ENOMEM; | 2345 | return -ENOMEM; |
2347 | 2346 | ||
2348 | if (vma->vm_file && uprobe_mmap(vma)) | ||
2349 | return -EINVAL; | ||
2350 | |||
2351 | vma_link(mm, vma, prev, rb_link, rb_parent); | 2347 | vma_link(mm, vma, prev, rb_link, rb_parent); |
2352 | return 0; | 2348 | return 0; |
2353 | } | 2349 | } |
@@ -2418,9 +2414,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2418 | if (new_vma->vm_file) { | 2414 | if (new_vma->vm_file) { |
2419 | get_file(new_vma->vm_file); | 2415 | get_file(new_vma->vm_file); |
2420 | 2416 | ||
2421 | if (uprobe_mmap(new_vma)) | ||
2422 | goto out_free_mempol; | ||
2423 | |||
2424 | if (vma->vm_flags & VM_EXECUTABLE) | 2417 | if (vma->vm_flags & VM_EXECUTABLE) |
2425 | added_exe_file_vma(mm); | 2418 | added_exe_file_vma(mm); |
2426 | } | 2419 | } |
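The hunks above (and the mremap.c ones further down) stop updating mm->total_vm at each call site and let vm_stat_account() own all of the per-mm counters, with callers passing a positive page count on map and a negative one on unmap. A toy version of that consolidation, using illustrative flag names rather than the kernel's VM_* bits:

#include <stdio.h>

/* Toy per-mm counters; only the accounting helper ever touches them. */
struct mm_stats {
	long total_vm;
	long shared_vm;
	long exec_vm;
};

#define VMA_FILE	0x1	/* illustrative flags, not the kernel's VM_* bits */
#define VMA_EXEC	0x2

/* Single choke point: callers pass +pages on map and -pages on unmap. */
static void vm_stat_account(struct mm_stats *mm, unsigned int flags, long pages)
{
	mm->total_vm += pages;		/* previously duplicated at every call site */
	if (flags & VMA_FILE)
		mm->shared_vm += pages;
	if (flags & VMA_EXEC)
		mm->exec_vm += pages;
}

int main(void)
{
	struct mm_stats mm = { 0, 0, 0 };

	vm_stat_account(&mm, VMA_FILE | VMA_EXEC, 16);	/* map a shared library */
	vm_stat_account(&mm, VMA_FILE | VMA_EXEC, -16);	/* matching unmap */
	printf("total_vm=%ld shared_vm=%ld exec_vm=%ld\n",
	       mm.total_vm, mm.shared_vm, mm.exec_vm);
	return 0;
}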
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 9a611d3a1848..862b60822d9f 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c | |||
@@ -33,6 +33,24 @@ | |||
33 | void __mmu_notifier_release(struct mm_struct *mm) | 33 | void __mmu_notifier_release(struct mm_struct *mm) |
34 | { | 34 | { |
35 | struct mmu_notifier *mn; | 35 | struct mmu_notifier *mn; |
36 | struct hlist_node *n; | ||
37 | |||
38 | /* | ||
39 | * RCU here will block mmu_notifier_unregister until | ||
40 | * ->release returns. | ||
41 | */ | ||
42 | rcu_read_lock(); | ||
43 | hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) | ||
44 | /* | ||
45 | * if ->release runs before mmu_notifier_unregister it | ||
46 | * must be handled as it's the only way for the driver | ||
47 | * to flush all existing sptes and stop the driver | ||
48 | * from establishing any more sptes before all the | ||
49 | * pages in the mm are freed. | ||
50 | */ | ||
51 | if (mn->ops->release) | ||
52 | mn->ops->release(mn, mm); | ||
53 | rcu_read_unlock(); | ||
36 | 54 | ||
37 | spin_lock(&mm->mmu_notifier_mm->lock); | 55 | spin_lock(&mm->mmu_notifier_mm->lock); |
38 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { | 56 | while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { |
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm) | |||
46 | * mmu_notifier_unregister to return. | 64 | * mmu_notifier_unregister to return. |
47 | */ | 65 | */ |
48 | hlist_del_init_rcu(&mn->hlist); | 66 | hlist_del_init_rcu(&mn->hlist); |
49 | /* | ||
50 | * RCU here will block mmu_notifier_unregister until | ||
51 | * ->release returns. | ||
52 | */ | ||
53 | rcu_read_lock(); | ||
54 | spin_unlock(&mm->mmu_notifier_mm->lock); | ||
55 | /* | ||
56 | * if ->release runs before mmu_notifier_unregister it | ||
57 | * must be handled as it's the only way for the driver | ||
58 | * to flush all existing sptes and stop the driver | ||
59 | * from establishing any more sptes before all the | ||
60 | * pages in the mm are freed. | ||
61 | */ | ||
62 | if (mn->ops->release) | ||
63 | mn->ops->release(mn, mm); | ||
64 | rcu_read_unlock(); | ||
65 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
66 | } | 67 | } |
67 | spin_unlock(&mm->mmu_notifier_mm->lock); | 68 | spin_unlock(&mm->mmu_notifier_mm->lock); |
68 | 69 | ||
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
284 | { | 285 | { |
285 | BUG_ON(atomic_read(&mm->mm_count) <= 0); | 286 | BUG_ON(atomic_read(&mm->mm_count) <= 0); |
286 | 287 | ||
287 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
288 | if (!hlist_unhashed(&mn->hlist)) { | 288 | if (!hlist_unhashed(&mn->hlist)) { |
289 | hlist_del_rcu(&mn->hlist); | ||
290 | |||
291 | /* | 289 | /* |
292 | * RCU here will force exit_mmap to wait ->release to finish | 290 | * RCU here will force exit_mmap to wait ->release to finish |
293 | * before freeing the pages. | 291 | * before freeing the pages. |
294 | */ | 292 | */ |
295 | rcu_read_lock(); | 293 | rcu_read_lock(); |
296 | spin_unlock(&mm->mmu_notifier_mm->lock); | 294 | |
297 | /* | 295 | /* |
298 | * exit_mmap will block in mmu_notifier_release to | 296 | * exit_mmap will block in mmu_notifier_release to |
299 | * guarantee ->release is called before freeing the | 297 | * guarantee ->release is called before freeing the |
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) | |||
302 | if (mn->ops->release) | 300 | if (mn->ops->release) |
303 | mn->ops->release(mn, mm); | 301 | mn->ops->release(mn, mm); |
304 | rcu_read_unlock(); | 302 | rcu_read_unlock(); |
305 | } else | 303 | |
304 | spin_lock(&mm->mmu_notifier_mm->lock); | ||
305 | hlist_del_rcu(&mn->hlist); | ||
306 | spin_unlock(&mm->mmu_notifier_mm->lock); | 306 | spin_unlock(&mm->mmu_notifier_mm->lock); |
307 | } | ||
307 | 308 | ||
308 | /* | 309 | /* |
309 | * Wait any running method to finish, of course including | 310 | * Wait any running method to finish, of course including |
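The mmu_notifier.c rework changes the ordering in __mmu_notifier_release() and mmu_notifier_unregister(): the ->release callbacks now run first (under rcu_read_lock() in the kernel), and the entry is only unhashed from the list afterwards, under the spinlock. The sketch below is a deliberately simplified, single-threaded illustration of that call-then-unlink ordering; it leaves out RCU and locking entirely.

#include <stdio.h>

/* Deliberately simplified: a plain singly linked list, no RCU, no locking. */
struct notifier {
	const char *name;
	void (*release)(struct notifier *n);
	struct notifier *next;
};

static void report(struct notifier *n)
{
	printf("release callback for %s\n", n->name);
}

/* Phase 1: let every registered notifier flush its state. */
static void call_releases(struct notifier *head)
{
	struct notifier *n;

	for (n = head; n; n = n->next)
		if (n->release)
			n->release(n);
}

/* Phase 2: only now tear the list down. */
static struct notifier *unlink_all(struct notifier *head)
{
	while (head) {
		struct notifier *next = head->next;

		head->next = NULL;
		head = next;
	}
	return NULL;
}

int main(void)
{
	struct notifier b = { "driver-b", report, NULL };
	struct notifier a = { "driver-a", report, &b };
	struct notifier *list = &a;

	call_releases(list);	/* callbacks run while entries are still linked */
	list = unlink_all(list);
	return list == NULL ? 0 : 1;
}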
diff --git a/mm/mmzone.c b/mm/mmzone.c index 6830eab5bf09..3cef80f6ac79 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -96,7 +96,7 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone) | |||
96 | for_each_lru(lru) | 96 | for_each_lru(lru) |
97 | INIT_LIST_HEAD(&lruvec->lists[lru]); | 97 | INIT_LIST_HEAD(&lruvec->lists[lru]); |
98 | 98 | ||
99 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 99 | #ifdef CONFIG_MEMCG |
100 | lruvec->zone = zone; | 100 | lruvec->zone = zone; |
101 | #endif | 101 | #endif |
102 | } | 102 | } |
diff --git a/mm/mremap.c b/mm/mremap.c index 21fed202ddad..cc06d0e48d05 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -260,7 +260,6 @@ static unsigned long move_vma(struct vm_area_struct *vma, | |||
260 | * If this were a serious issue, we'd add a flag to do_munmap(). | 260 | * If this were a serious issue, we'd add a flag to do_munmap(). |
261 | */ | 261 | */ |
262 | hiwater_vm = mm->hiwater_vm; | 262 | hiwater_vm = mm->hiwater_vm; |
263 | mm->total_vm += new_len >> PAGE_SHIFT; | ||
264 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); | 263 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); |
265 | 264 | ||
266 | if (do_munmap(mm, old_addr, old_len) < 0) { | 265 | if (do_munmap(mm, old_addr, old_len) < 0) { |
@@ -497,7 +496,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, | |||
497 | goto out; | 496 | goto out; |
498 | } | 497 | } |
499 | 498 | ||
500 | mm->total_vm += pages; | ||
501 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); | 499 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); |
502 | if (vma->vm_flags & VM_LOCKED) { | 500 | if (vma->vm_flags & VM_LOCKED) { |
503 | mm->locked_vm += pages; | 501 | mm->locked_vm += pages; |
diff --git a/mm/nobootmem.c b/mm/nobootmem.c index d23415c001bc..405573010f99 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c | |||
@@ -105,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) | |||
105 | __free_pages_bootmem(pfn_to_page(i), 0); | 105 | __free_pages_bootmem(pfn_to_page(i), 0); |
106 | } | 106 | } |
107 | 107 | ||
108 | static unsigned long __init __free_memory_core(phys_addr_t start, | ||
109 | phys_addr_t end) | ||
110 | { | ||
111 | unsigned long start_pfn = PFN_UP(start); | ||
112 | unsigned long end_pfn = min_t(unsigned long, | ||
113 | PFN_DOWN(end), max_low_pfn); | ||
114 | |||
115 | if (start_pfn > end_pfn) | ||
116 | return 0; | ||
117 | |||
118 | __free_pages_memory(start_pfn, end_pfn); | ||
119 | |||
120 | return end_pfn - start_pfn; | ||
121 | } | ||
122 | |||
108 | unsigned long __init free_low_memory_core_early(int nodeid) | 123 | unsigned long __init free_low_memory_core_early(int nodeid) |
109 | { | 124 | { |
110 | unsigned long count = 0; | 125 | unsigned long count = 0; |
111 | phys_addr_t start, end; | 126 | phys_addr_t start, end, size; |
112 | u64 i; | 127 | u64 i; |
113 | 128 | ||
114 | /* free reserved array temporarily so that it's treated as free area */ | 129 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) |
115 | memblock_free_reserved_regions(); | 130 | count += __free_memory_core(start, end); |
116 | 131 | ||
117 | for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { | 132 | /* free range that is used for reserved array if we allocate it */ |
118 | unsigned long start_pfn = PFN_UP(start); | 133 | size = get_allocated_memblock_reserved_regions_info(&start); |
119 | unsigned long end_pfn = min_t(unsigned long, | 134 | if (size) |
120 | PFN_DOWN(end), max_low_pfn); | 135 | count += __free_memory_core(start, start + size); |
121 | if (start_pfn < end_pfn) { | ||
122 | __free_pages_memory(start_pfn, end_pfn); | ||
123 | count += end_pfn - start_pfn; | ||
124 | } | ||
125 | } | ||
126 | 136 | ||
127 | /* put region array back? */ | ||
128 | memblock_reserve_reserved_regions(); | ||
129 | return count; | 137 | return count; |
130 | } | 138 | } |
131 | 139 | ||
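As an illustrative aside, the clamping that the new __free_memory_core() performs — round the start of a physical range up to a page frame, round the end down, cap it at max_low_pfn, and bail out if the range collapses — can be exercised in a stand-alone C sketch. PAGE_SHIFT, pfn_up()/pfn_down() and the max_low_pfn argument below are local stand-ins, not the kernel helpers:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12UL
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Round a physical address up/down to a page frame number. */
static unsigned long pfn_up(uint64_t addr)   { return (addr + PAGE_SIZE - 1) >> PAGE_SHIFT; }
static unsigned long pfn_down(uint64_t addr) { return addr >> PAGE_SHIFT; }

/* Mirrors the clamping in __free_memory_core(): returns how many page
 * frames of the range would actually be handed to the page allocator. */
static unsigned long free_memory_core(uint64_t start, uint64_t end,
                                      unsigned long max_low_pfn)
{
        unsigned long start_pfn = pfn_up(start);
        unsigned long end_pfn = pfn_down(end);

        if (end_pfn > max_low_pfn)      /* never free above lowmem */
                end_pfn = max_low_pfn;
        if (start_pfn > end_pfn)        /* range collapsed after rounding */
                return 0;
        return end_pfn - start_pfn;     /* __free_pages_memory() would run here */
}

int main(void)
{
        /* A range that starts mid-page: the first partial page is skipped. */
        printf("%lu pages\n", free_memory_core(0x100800, 0x200000, 0x100000));
        return 0;
}

The same helper is reused for the second call site above, which frees the range backing the memblock reserved-regions array instead of temporarily dropping and re-adding the reservations.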
@@ -274,7 +282,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, | |||
274 | return ___alloc_bootmem(size, align, goal, limit); | 282 | return ___alloc_bootmem(size, align, goal, limit); |
275 | } | 283 | } |
276 | 284 | ||
277 | static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, | 285 | void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, |
278 | unsigned long size, | 286 | unsigned long size, |
279 | unsigned long align, | 287 | unsigned long align, |
280 | unsigned long goal, | 288 | unsigned long goal, |
diff --git a/mm/nommu.c b/mm/nommu.c index c4acfbc09972..d4b0c10872de 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -1486,7 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | |||
1486 | 1486 | ||
1487 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); | 1487 | flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); |
1488 | 1488 | ||
1489 | ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); | 1489 | retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); |
1490 | 1490 | ||
1491 | if (file) | 1491 | if (file) |
1492 | fput(file); | 1492 | fput(file); |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ed0e19677360..198600861638 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -183,7 +183,8 @@ static bool oom_unkillable_task(struct task_struct *p, | |||
183 | unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | 183 | unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, |
184 | const nodemask_t *nodemask, unsigned long totalpages) | 184 | const nodemask_t *nodemask, unsigned long totalpages) |
185 | { | 185 | { |
186 | unsigned long points; | 186 | long points; |
187 | long adj; | ||
187 | 188 | ||
188 | if (oom_unkillable_task(p, memcg, nodemask)) | 189 | if (oom_unkillable_task(p, memcg, nodemask)) |
189 | return 0; | 190 | return 0; |
@@ -192,7 +193,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
192 | if (!p) | 193 | if (!p) |
193 | return 0; | 194 | return 0; |
194 | 195 | ||
195 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { | 196 | adj = p->signal->oom_score_adj; |
197 | if (adj == OOM_SCORE_ADJ_MIN) { | ||
196 | task_unlock(p); | 198 | task_unlock(p); |
197 | return 0; | 199 | return 0; |
198 | } | 200 | } |
@@ -210,20 +212,17 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
210 | * implementation used by LSMs. | 212 | * implementation used by LSMs. |
211 | */ | 213 | */ |
212 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) | 214 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) |
213 | points -= 30 * totalpages / 1000; | 215 | adj -= 30; |
214 | 216 | ||
215 | /* | 217 | /* Normalize to oom_score_adj units */ |
216 | * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may | 218 | adj *= totalpages / 1000; |
217 | * either completely disable oom killing or always prefer a certain | 219 | points += adj; |
218 | * task. | ||
219 | */ | ||
220 | points += p->signal->oom_score_adj * totalpages / 1000; | ||
221 | 220 | ||
222 | /* | 221 | /* |
223 | * Never return 0 for an eligible task regardless of the root bonus and | 222 | * Never return 0 for an eligible task regardless of the root bonus and |
224 | * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). | 223 | * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). |
225 | */ | 224 | */ |
226 | return points ? points : 1; | 225 | return points > 0 ? points : 1; |
227 | } | 226 | } |
228 | 227 | ||
229 | /* | 228 | /* |
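A minimal sketch of the reworked scoring arithmetic, assuming a single base_pages value standing in for the rss + swap + page-table baseline and a plain flag standing in for CAP_SYS_ADMIN (neither name is from the kernel): the root bonus and oom_score_adj are folded into one signed adjustment, scaled to page units, and the result is clamped so an eligible task never scores zero.

#include <stdio.h>

#define OOM_SCORE_ADJ_MIN (-1000)

/* Illustrative re-creation of the new oom_badness() arithmetic. */
static unsigned long badness(long base_pages, long oom_score_adj,
                             int has_cap_sys_admin, unsigned long totalpages)
{
        long points = base_pages;
        long adj = oom_score_adj;

        if (adj == OOM_SCORE_ADJ_MIN)   /* task is exempt from oom killing */
                return 0;
        if (has_cap_sys_admin)          /* root bonus, now applied in adj units */
                adj -= 30;

        adj *= totalpages / 1000;       /* normalize to oom_score_adj units */
        points += adj;

        /* Never return 0 for an eligible task, even with a large negative adj. */
        return points > 0 ? points : 1;
}

int main(void)
{
        unsigned long totalpages = 1000000;

        printf("plain task:        %lu\n", badness(50000, 0, 0, totalpages));
        printf("root task:         %lu\n", badness(50000, 0, 1, totalpages));
        printf("deprioritized adj: %lu\n", badness(50000, -500, 0, totalpages));
        return 0;
}

The switch from unsigned to signed points is what makes the final "points > 0 ? points : 1" clamp meaningful: with the old unsigned arithmetic a large negative adjustment could wrap around instead of bottoming out at 1.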
@@ -289,76 +288,93 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
289 | } | 288 | } |
290 | #endif | 289 | #endif |
291 | 290 | ||
291 | enum oom_scan_t oom_scan_process_thread(struct task_struct *task, | ||
292 | unsigned long totalpages, const nodemask_t *nodemask, | ||
293 | bool force_kill) | ||
294 | { | ||
295 | if (task->exit_state) | ||
296 | return OOM_SCAN_CONTINUE; | ||
297 | if (oom_unkillable_task(task, NULL, nodemask)) | ||
298 | return OOM_SCAN_CONTINUE; | ||
299 | |||
300 | /* | ||
301 | * This task already has access to memory reserves and is being killed. | ||
302 | * Don't allow any other task to have access to the reserves. | ||
303 | */ | ||
304 | if (test_tsk_thread_flag(task, TIF_MEMDIE)) { | ||
305 | if (unlikely(frozen(task))) | ||
306 | __thaw_task(task); | ||
307 | if (!force_kill) | ||
308 | return OOM_SCAN_ABORT; | ||
309 | } | ||
310 | if (!task->mm) | ||
311 | return OOM_SCAN_CONTINUE; | ||
312 | |||
313 | if (task->flags & PF_EXITING) { | ||
314 | /* | ||
315 | * If task is current and is in the process of releasing memory, | ||
316 | * allow the "kill" to set TIF_MEMDIE, which will allow it to | ||
317 | * access memory reserves. Otherwise, it may stall forever. | ||
318 | * | ||
319 | * The iteration isn't broken here, however, in case other | ||
320 | * threads are found to have already been oom killed. | ||
321 | */ | ||
322 | if (task == current) | ||
323 | return OOM_SCAN_SELECT; | ||
324 | else if (!force_kill) { | ||
325 | /* | ||
326 | * If this task is not being ptraced on exit, then wait | ||
327 | * for it to finish before killing some other task | ||
328 | * unnecessarily. | ||
329 | */ | ||
330 | if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) | ||
331 | return OOM_SCAN_ABORT; | ||
332 | } | ||
333 | } | ||
334 | return OOM_SCAN_OK; | ||
335 | } | ||
336 | |||
292 | /* | 337 | /* |
293 | * Simple selection loop. We chose the process with the highest | 338 | * Simple selection loop. We chose the process with the highest |
294 | * number of 'points'. We expect the caller will lock the tasklist. | 339 | * number of 'points'. |
295 | * | 340 | * |
296 | * (not docbooked, we don't want this one cluttering up the manual) | 341 | * (not docbooked, we don't want this one cluttering up the manual) |
297 | */ | 342 | */ |
298 | static struct task_struct *select_bad_process(unsigned int *ppoints, | 343 | static struct task_struct *select_bad_process(unsigned int *ppoints, |
299 | unsigned long totalpages, struct mem_cgroup *memcg, | 344 | unsigned long totalpages, const nodemask_t *nodemask, |
300 | const nodemask_t *nodemask, bool force_kill) | 345 | bool force_kill) |
301 | { | 346 | { |
302 | struct task_struct *g, *p; | 347 | struct task_struct *g, *p; |
303 | struct task_struct *chosen = NULL; | 348 | struct task_struct *chosen = NULL; |
304 | unsigned long chosen_points = 0; | 349 | unsigned long chosen_points = 0; |
305 | 350 | ||
351 | rcu_read_lock(); | ||
306 | do_each_thread(g, p) { | 352 | do_each_thread(g, p) { |
307 | unsigned int points; | 353 | unsigned int points; |
308 | 354 | ||
309 | if (p->exit_state) | 355 | switch (oom_scan_process_thread(p, totalpages, nodemask, |
310 | continue; | 356 | force_kill)) { |
311 | if (oom_unkillable_task(p, memcg, nodemask)) | 357 | case OOM_SCAN_SELECT: |
312 | continue; | 358 | chosen = p; |
313 | 359 | chosen_points = ULONG_MAX; | |
314 | /* | 360 | /* fall through */ |
315 | * This task already has access to memory reserves and is | 361 | case OOM_SCAN_CONTINUE: |
316 | * being killed. Don't allow any other task access to the | ||
317 | * memory reserve. | ||
318 | * | ||
319 | * Note: this may have a chance of deadlock if it gets | ||
320 | * blocked waiting for another task which itself is waiting | ||
321 | * for memory. Is there a better alternative? | ||
322 | */ | ||
323 | if (test_tsk_thread_flag(p, TIF_MEMDIE)) { | ||
324 | if (unlikely(frozen(p))) | ||
325 | __thaw_task(p); | ||
326 | if (!force_kill) | ||
327 | return ERR_PTR(-1UL); | ||
328 | } | ||
329 | if (!p->mm) | ||
330 | continue; | 362 | continue; |
331 | 363 | case OOM_SCAN_ABORT: | |
332 | if (p->flags & PF_EXITING) { | 364 | rcu_read_unlock(); |
333 | /* | 365 | return ERR_PTR(-1UL); |
334 | * If p is the current task and is in the process of | 366 | case OOM_SCAN_OK: |
335 | * releasing memory, we allow the "kill" to set | 367 | break; |
336 | * TIF_MEMDIE, which will allow it to gain access to | 368 | }; |
337 | * memory reserves. Otherwise, it may stall forever. | 369 | points = oom_badness(p, NULL, nodemask, totalpages); |
338 | * | ||
339 | * The loop isn't broken here, however, in case other | ||
340 | * threads are found to have already been oom killed. | ||
341 | */ | ||
342 | if (p == current) { | ||
343 | chosen = p; | ||
344 | chosen_points = ULONG_MAX; | ||
345 | } else if (!force_kill) { | ||
346 | /* | ||
347 | * If this task is not being ptraced on exit, | ||
348 | * then wait for it to finish before killing | ||
349 | * some other task unnecessarily. | ||
350 | */ | ||
351 | if (!(p->group_leader->ptrace & PT_TRACE_EXIT)) | ||
352 | return ERR_PTR(-1UL); | ||
353 | } | ||
354 | } | ||
355 | |||
356 | points = oom_badness(p, memcg, nodemask, totalpages); | ||
357 | if (points > chosen_points) { | 370 | if (points > chosen_points) { |
358 | chosen = p; | 371 | chosen = p; |
359 | chosen_points = points; | 372 | chosen_points = points; |
360 | } | 373 | } |
361 | } while_each_thread(g, p); | 374 | } while_each_thread(g, p); |
375 | if (chosen) | ||
376 | get_task_struct(chosen); | ||
377 | rcu_read_unlock(); | ||
362 | 378 | ||
363 | *ppoints = chosen_points * 1000 / totalpages; | 379 | *ppoints = chosen_points * 1000 / totalpages; |
364 | return chosen; | 380 | return chosen; |
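The selection loop now delegates the per-thread policy to oom_scan_process_thread() and simply acts on its verdict, running under rcu_read_lock() instead of requiring the caller to hold tasklist_lock and pinning the winner with get_task_struct() before the lock is dropped. A compact stand-alone sketch of that control flow over an array of toy candidates (the verdicts and scores are hard-coded stand-ins for oom_scan_process_thread() and oom_badness(), not kernel data):

#include <stdio.h>
#include <limits.h>

enum oom_scan_t { OOM_SCAN_OK, OOM_SCAN_CONTINUE, OOM_SCAN_ABORT, OOM_SCAN_SELECT };

struct candidate {
        const char *name;
        enum oom_scan_t verdict;        /* what oom_scan_process_thread() would say */
        unsigned long points;           /* what oom_badness() would return */
};

/* Returns the chosen candidate, or NULL when the scan must be aborted. */
static const struct candidate *select_bad_process(const struct candidate *c, int n)
{
        const struct candidate *chosen = NULL;
        unsigned long chosen_points = 0;

        for (int i = 0; i < n; i++) {
                switch (c[i].verdict) {
                case OOM_SCAN_SELECT:           /* e.g. current task already exiting */
                        chosen = &c[i];
                        chosen_points = ULONG_MAX;
                        /* fall through */
                case OOM_SCAN_CONTINUE:         /* unkillable or already dead */
                        continue;
                case OOM_SCAN_ABORT:            /* someone already holds TIF_MEMDIE */
                        return NULL;
                case OOM_SCAN_OK:
                        break;
                }
                if (c[i].points > chosen_points) {
                        chosen = &c[i];
                        chosen_points = c[i].points;
                }
        }
        return chosen;
}

int main(void)
{
        struct candidate tasks[] = {
                { "init",    OOM_SCAN_CONTINUE, 0 },
                { "daemon",  OOM_SCAN_OK,       1200 },
                { "browser", OOM_SCAN_OK,       98000 },
        };
        const struct candidate *victim = select_bad_process(tasks, 3);

        printf("victim: %s\n", victim ? victim->name : "(abort)");
        return 0;
}

The kernel version additionally takes a reference on the chosen task while still inside the RCU read section, which is why the memcg-specific variant and the tasklist_lock requirement could be dropped from the callers.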
@@ -366,23 +382,22 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
366 | 382 | ||
367 | /** | 383 | /** |
368 | * dump_tasks - dump current memory state of all system tasks | 384 | * dump_tasks - dump current memory state of all system tasks |
369 | * @mem: current's memory controller, if constrained | 385 | * @memcg: current's memory controller, if constrained |
370 | * @nodemask: nodemask passed to page allocator for mempolicy ooms | 386 | * @nodemask: nodemask passed to page allocator for mempolicy ooms |
371 | * | 387 | * |
372 | * Dumps the current memory state of all eligible tasks. Tasks not in the same | 388 | * Dumps the current memory state of all eligible tasks. Tasks not in the same |
373 | * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes | 389 | * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes |
374 | * are not shown. | 390 | * are not shown. |
375 | * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj | 391 | * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, |
376 | * value, oom_score_adj value, and name. | 392 | * swapents, oom_score_adj value, and name. |
377 | * | ||
378 | * Call with tasklist_lock read-locked. | ||
379 | */ | 393 | */ |
380 | static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) | 394 | static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) |
381 | { | 395 | { |
382 | struct task_struct *p; | 396 | struct task_struct *p; |
383 | struct task_struct *task; | 397 | struct task_struct *task; |
384 | 398 | ||
385 | pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); | 399 | pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); |
400 | rcu_read_lock(); | ||
386 | for_each_process(p) { | 401 | for_each_process(p) { |
387 | if (oom_unkillable_task(p, memcg, nodemask)) | 402 | if (oom_unkillable_task(p, memcg, nodemask)) |
388 | continue; | 403 | continue; |
@@ -397,13 +412,15 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas | |||
397 | continue; | 412 | continue; |
398 | } | 413 | } |
399 | 414 | ||
400 | pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", | 415 | pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", |
401 | task->pid, from_kuid(&init_user_ns, task_uid(task)), | 416 | task->pid, from_kuid(&init_user_ns, task_uid(task)), |
402 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), | 417 | task->tgid, task->mm->total_vm, get_mm_rss(task->mm), |
403 | task_cpu(task), task->signal->oom_adj, | 418 | task->mm->nr_ptes, |
419 | get_mm_counter(task->mm, MM_SWAPENTS), | ||
404 | task->signal->oom_score_adj, task->comm); | 420 | task->signal->oom_score_adj, task->comm); |
405 | task_unlock(task); | 421 | task_unlock(task); |
406 | } | 422 | } |
423 | rcu_read_unlock(); | ||
407 | } | 424 | } |
408 | 425 | ||
409 | static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | 426 | static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, |
@@ -424,10 +441,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
424 | } | 441 | } |
425 | 442 | ||
426 | #define K(x) ((x) << (PAGE_SHIFT-10)) | 443 | #define K(x) ((x) << (PAGE_SHIFT-10)) |
427 | static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | 444 | /* |
428 | unsigned int points, unsigned long totalpages, | 445 | * Must be called while holding a reference to p, which will be released upon |
429 | struct mem_cgroup *memcg, nodemask_t *nodemask, | 446 | * returning. |
430 | const char *message) | 447 | */ |
448 | void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | ||
449 | unsigned int points, unsigned long totalpages, | ||
450 | struct mem_cgroup *memcg, nodemask_t *nodemask, | ||
451 | const char *message) | ||
431 | { | 452 | { |
432 | struct task_struct *victim = p; | 453 | struct task_struct *victim = p; |
433 | struct task_struct *child; | 454 | struct task_struct *child; |
@@ -443,6 +464,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
443 | */ | 464 | */ |
444 | if (p->flags & PF_EXITING) { | 465 | if (p->flags & PF_EXITING) { |
445 | set_tsk_thread_flag(p, TIF_MEMDIE); | 466 | set_tsk_thread_flag(p, TIF_MEMDIE); |
467 | put_task_struct(p); | ||
446 | return; | 468 | return; |
447 | } | 469 | } |
448 | 470 | ||
@@ -460,6 +482,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
460 | * parent. This attempts to lose the minimal amount of work done while | 482 | * parent. This attempts to lose the minimal amount of work done while |
461 | * still freeing memory. | 483 | * still freeing memory. |
462 | */ | 484 | */ |
485 | read_lock(&tasklist_lock); | ||
463 | do { | 486 | do { |
464 | list_for_each_entry(child, &t->children, sibling) { | 487 | list_for_each_entry(child, &t->children, sibling) { |
465 | unsigned int child_points; | 488 | unsigned int child_points; |
@@ -472,15 +495,26 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
472 | child_points = oom_badness(child, memcg, nodemask, | 495 | child_points = oom_badness(child, memcg, nodemask, |
473 | totalpages); | 496 | totalpages); |
474 | if (child_points > victim_points) { | 497 | if (child_points > victim_points) { |
498 | put_task_struct(victim); | ||
475 | victim = child; | 499 | victim = child; |
476 | victim_points = child_points; | 500 | victim_points = child_points; |
501 | get_task_struct(victim); | ||
477 | } | 502 | } |
478 | } | 503 | } |
479 | } while_each_thread(p, t); | 504 | } while_each_thread(p, t); |
505 | read_unlock(&tasklist_lock); | ||
480 | 506 | ||
481 | victim = find_lock_task_mm(victim); | 507 | rcu_read_lock(); |
482 | if (!victim) | 508 | p = find_lock_task_mm(victim); |
509 | if (!p) { | ||
510 | rcu_read_unlock(); | ||
511 | put_task_struct(victim); | ||
483 | return; | 512 | return; |
513 | } else if (victim != p) { | ||
514 | get_task_struct(p); | ||
515 | put_task_struct(victim); | ||
516 | victim = p; | ||
517 | } | ||
484 | 518 | ||
485 | /* mm cannot safely be dereferenced after task_unlock(victim) */ | 519 | /* mm cannot safely be dereferenced after task_unlock(victim) */ |
486 | mm = victim->mm; | 520 | mm = victim->mm; |
@@ -511,17 +545,19 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
511 | task_unlock(p); | 545 | task_unlock(p); |
512 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); | 546 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); |
513 | } | 547 | } |
548 | rcu_read_unlock(); | ||
514 | 549 | ||
515 | set_tsk_thread_flag(victim, TIF_MEMDIE); | 550 | set_tsk_thread_flag(victim, TIF_MEMDIE); |
516 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); | 551 | do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); |
552 | put_task_struct(victim); | ||
517 | } | 553 | } |
518 | #undef K | 554 | #undef K |
519 | 555 | ||
520 | /* | 556 | /* |
521 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. | 557 | * Determines whether the kernel must panic because of the panic_on_oom sysctl. |
522 | */ | 558 | */ |
523 | static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | 559 | void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, |
524 | int order, const nodemask_t *nodemask) | 560 | int order, const nodemask_t *nodemask) |
525 | { | 561 | { |
526 | if (likely(!sysctl_panic_on_oom)) | 562 | if (likely(!sysctl_panic_on_oom)) |
527 | return; | 563 | return; |
@@ -534,42 +570,11 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | |||
534 | if (constraint != CONSTRAINT_NONE) | 570 | if (constraint != CONSTRAINT_NONE) |
535 | return; | 571 | return; |
536 | } | 572 | } |
537 | read_lock(&tasklist_lock); | ||
538 | dump_header(NULL, gfp_mask, order, NULL, nodemask); | 573 | dump_header(NULL, gfp_mask, order, NULL, nodemask); |
539 | read_unlock(&tasklist_lock); | ||
540 | panic("Out of memory: %s panic_on_oom is enabled\n", | 574 | panic("Out of memory: %s panic_on_oom is enabled\n", |
541 | sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); | 575 | sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); |
542 | } | 576 | } |
543 | 577 | ||
544 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
545 | void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | ||
546 | int order) | ||
547 | { | ||
548 | unsigned long limit; | ||
549 | unsigned int points = 0; | ||
550 | struct task_struct *p; | ||
551 | |||
552 | /* | ||
553 | * If current has a pending SIGKILL, then automatically select it. The | ||
554 | * goal is to allow it to allocate so that it may quickly exit and free | ||
555 | * its memory. | ||
556 | */ | ||
557 | if (fatal_signal_pending(current)) { | ||
558 | set_thread_flag(TIF_MEMDIE); | ||
559 | return; | ||
560 | } | ||
561 | |||
562 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); | ||
563 | limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; | ||
564 | read_lock(&tasklist_lock); | ||
565 | p = select_bad_process(&points, limit, memcg, NULL, false); | ||
566 | if (p && PTR_ERR(p) != -1UL) | ||
567 | oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL, | ||
568 | "Memory cgroup out of memory"); | ||
569 | read_unlock(&tasklist_lock); | ||
570 | } | ||
571 | #endif | ||
572 | |||
573 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); | 578 | static BLOCKING_NOTIFIER_HEAD(oom_notify_list); |
574 | 579 | ||
575 | int register_oom_notifier(struct notifier_block *nb) | 580 | int register_oom_notifier(struct notifier_block *nb) |
@@ -691,7 +696,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
691 | struct task_struct *p; | 696 | struct task_struct *p; |
692 | unsigned long totalpages; | 697 | unsigned long totalpages; |
693 | unsigned long freed = 0; | 698 | unsigned long freed = 0; |
694 | unsigned int points; | 699 | unsigned int uninitialized_var(points); |
695 | enum oom_constraint constraint = CONSTRAINT_NONE; | 700 | enum oom_constraint constraint = CONSTRAINT_NONE; |
696 | int killed = 0; | 701 | int killed = 0; |
697 | 702 | ||
@@ -719,22 +724,20 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
719 | mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; | 724 | mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; |
720 | check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); | 725 | check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); |
721 | 726 | ||
722 | read_lock(&tasklist_lock); | 727 | if (sysctl_oom_kill_allocating_task && current->mm && |
723 | if (sysctl_oom_kill_allocating_task && | ||
724 | !oom_unkillable_task(current, NULL, nodemask) && | 728 | !oom_unkillable_task(current, NULL, nodemask) && |
725 | current->mm) { | 729 | current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { |
730 | get_task_struct(current); | ||
726 | oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, | 731 | oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, |
727 | nodemask, | 732 | nodemask, |
728 | "Out of memory (oom_kill_allocating_task)"); | 733 | "Out of memory (oom_kill_allocating_task)"); |
729 | goto out; | 734 | goto out; |
730 | } | 735 | } |
731 | 736 | ||
732 | p = select_bad_process(&points, totalpages, NULL, mpol_mask, | 737 | p = select_bad_process(&points, totalpages, mpol_mask, force_kill); |
733 | force_kill); | ||
734 | /* Found nothing?!?! Either we hang forever, or we panic. */ | 738 | /* Found nothing?!?! Either we hang forever, or we panic. */ |
735 | if (!p) { | 739 | if (!p) { |
736 | dump_header(NULL, gfp_mask, order, NULL, mpol_mask); | 740 | dump_header(NULL, gfp_mask, order, NULL, mpol_mask); |
737 | read_unlock(&tasklist_lock); | ||
738 | panic("Out of memory and no killable processes...\n"); | 741 | panic("Out of memory and no killable processes...\n"); |
739 | } | 742 | } |
740 | if (PTR_ERR(p) != -1UL) { | 743 | if (PTR_ERR(p) != -1UL) { |
@@ -743,14 +746,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
743 | killed = 1; | 746 | killed = 1; |
744 | } | 747 | } |
745 | out: | 748 | out: |
746 | read_unlock(&tasklist_lock); | ||
747 | |||
748 | /* | 749 | /* |
749 | * Give "p" a good chance of killing itself before we | 750 | * Give the killed threads a good chance of exiting before trying to |
750 | * retry to allocate memory unless "p" is current | 751 | * allocate memory again. |
751 | */ | 752 | */ |
752 | if (killed && !test_thread_flag(TIF_MEMDIE)) | 753 | if (killed) |
753 | schedule_timeout_uninterruptible(1); | 754 | schedule_timeout_killable(1); |
754 | } | 755 | } |
755 | 756 | ||
756 | /* | 757 | /* |
@@ -765,6 +766,5 @@ void pagefault_out_of_memory(void) | |||
765 | out_of_memory(NULL, 0, 0, NULL, false); | 766 | out_of_memory(NULL, 0, 0, NULL, false); |
766 | clear_system_oom(); | 767 | clear_system_oom(); |
767 | } | 768 | } |
768 | if (!test_thread_flag(TIF_MEMDIE)) | 769 | schedule_timeout_killable(1); |
769 | schedule_timeout_uninterruptible(1); | ||
770 | } | 770 | } |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 93d8d2f7108c..e5363f34e025 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/syscalls.h> | 34 | #include <linux/syscalls.h> |
35 | #include <linux/buffer_head.h> /* __set_page_dirty_buffers */ | 35 | #include <linux/buffer_head.h> /* __set_page_dirty_buffers */ |
36 | #include <linux/pagevec.h> | 36 | #include <linux/pagevec.h> |
37 | #include <linux/timer.h> | ||
37 | #include <trace/events/writeback.h> | 38 | #include <trace/events/writeback.h> |
38 | 39 | ||
39 | /* | 40 | /* |
@@ -135,7 +136,20 @@ unsigned long global_dirty_limit; | |||
135 | * measured in page writeback completions. | 136 | * measured in page writeback completions. |
136 | * | 137 | * |
137 | */ | 138 | */ |
138 | static struct prop_descriptor vm_completions; | 139 | static struct fprop_global writeout_completions; |
140 | |||
141 | static void writeout_period(unsigned long t); | ||
142 | /* Timer for aging of writeout_completions */ | ||
143 | static struct timer_list writeout_period_timer = | ||
144 | TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0); | ||
145 | static unsigned long writeout_period_time = 0; | ||
146 | |||
147 | /* | ||
148 | * Length of period for aging writeout fractions of bdis. This is an | ||
149 | * arbitrarily chosen number. The longer the period, the slower fractions will | ||
150 | * reflect changes in current writeout rate. | ||
151 | */ | ||
152 | #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) | ||
139 | 153 | ||
140 | /* | 154 | /* |
141 | * Work out the current dirty-memory clamping and background writeout | 155 | * Work out the current dirty-memory clamping and background writeout |
@@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone) | |||
322 | zone_page_state(zone, NR_WRITEBACK) <= limit; | 336 | zone_page_state(zone, NR_WRITEBACK) <= limit; |
323 | } | 337 | } |
324 | 338 | ||
325 | /* | ||
326 | * couple the period to the dirty_ratio: | ||
327 | * | ||
328 | * period/2 ~ roundup_pow_of_two(dirty limit) | ||
329 | */ | ||
330 | static int calc_period_shift(void) | ||
331 | { | ||
332 | unsigned long dirty_total; | ||
333 | |||
334 | if (vm_dirty_bytes) | ||
335 | dirty_total = vm_dirty_bytes / PAGE_SIZE; | ||
336 | else | ||
337 | dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) / | ||
338 | 100; | ||
339 | return 2 + ilog2(dirty_total - 1); | ||
340 | } | ||
341 | |||
342 | /* | ||
343 | * update the period when the dirty threshold changes. | ||
344 | */ | ||
345 | static void update_completion_period(void) | ||
346 | { | ||
347 | int shift = calc_period_shift(); | ||
348 | prop_change_shift(&vm_completions, shift); | ||
349 | |||
350 | writeback_set_ratelimit(); | ||
351 | } | ||
352 | |||
353 | int dirty_background_ratio_handler(struct ctl_table *table, int write, | 339 | int dirty_background_ratio_handler(struct ctl_table *table, int write, |
354 | void __user *buffer, size_t *lenp, | 340 | void __user *buffer, size_t *lenp, |
355 | loff_t *ppos) | 341 | loff_t *ppos) |
@@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, | |||
383 | 369 | ||
384 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 370 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
385 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { | 371 | if (ret == 0 && write && vm_dirty_ratio != old_ratio) { |
386 | update_completion_period(); | 372 | writeback_set_ratelimit(); |
387 | vm_dirty_bytes = 0; | 373 | vm_dirty_bytes = 0; |
388 | } | 374 | } |
389 | return ret; | 375 | return ret; |
@@ -398,12 +384,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write, | |||
398 | 384 | ||
399 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); | 385 | ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); |
400 | if (ret == 0 && write && vm_dirty_bytes != old_bytes) { | 386 | if (ret == 0 && write && vm_dirty_bytes != old_bytes) { |
401 | update_completion_period(); | 387 | writeback_set_ratelimit(); |
402 | vm_dirty_ratio = 0; | 388 | vm_dirty_ratio = 0; |
403 | } | 389 | } |
404 | return ret; | 390 | return ret; |
405 | } | 391 | } |
406 | 392 | ||
393 | static unsigned long wp_next_time(unsigned long cur_time) | ||
394 | { | ||
395 | cur_time += VM_COMPLETIONS_PERIOD_LEN; | ||
396 | /* 0 has a special meaning... */ | ||
397 | if (!cur_time) | ||
398 | return 1; | ||
399 | return cur_time; | ||
400 | } | ||
401 | |||
407 | /* | 402 | /* |
408 | * Increment the BDI's writeout completion count and the global writeout | 403 | * Increment the BDI's writeout completion count and the global writeout |
409 | * completion count. Called from test_clear_page_writeback(). | 404 | * completion count. Called from test_clear_page_writeback(). |
@@ -411,8 +406,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write, | |||
411 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) | 406 | static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) |
412 | { | 407 | { |
413 | __inc_bdi_stat(bdi, BDI_WRITTEN); | 408 | __inc_bdi_stat(bdi, BDI_WRITTEN); |
414 | __prop_inc_percpu_max(&vm_completions, &bdi->completions, | 409 | __fprop_inc_percpu_max(&writeout_completions, &bdi->completions, |
415 | bdi->max_prop_frac); | 410 | bdi->max_prop_frac); |
411 | /* First event after period switching was turned off? */ | ||
412 | if (!unlikely(writeout_period_time)) { | ||
413 | /* | ||
414 | * We can race with other __bdi_writeout_inc calls here but | ||
415 | * it does not cause any harm since the resulting time when | ||
416 | * timer will fire and what is in writeout_period_time will be | ||
417 | * roughly the same. | ||
418 | */ | ||
419 | writeout_period_time = wp_next_time(jiffies); | ||
420 | mod_timer(&writeout_period_timer, writeout_period_time); | ||
421 | } | ||
416 | } | 422 | } |
417 | 423 | ||
418 | void bdi_writeout_inc(struct backing_dev_info *bdi) | 424 | void bdi_writeout_inc(struct backing_dev_info *bdi) |
@@ -431,11 +437,33 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc); | |||
431 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, | 437 | static void bdi_writeout_fraction(struct backing_dev_info *bdi, |
432 | long *numerator, long *denominator) | 438 | long *numerator, long *denominator) |
433 | { | 439 | { |
434 | prop_fraction_percpu(&vm_completions, &bdi->completions, | 440 | fprop_fraction_percpu(&writeout_completions, &bdi->completions, |
435 | numerator, denominator); | 441 | numerator, denominator); |
436 | } | 442 | } |
437 | 443 | ||
438 | /* | 444 | /* |
445 | * On idle system, we can be called long after we scheduled because we use | ||
446 | * deferred timers so count with missed periods. | ||
447 | */ | ||
448 | static void writeout_period(unsigned long t) | ||
449 | { | ||
450 | int miss_periods = (jiffies - writeout_period_time) / | ||
451 | VM_COMPLETIONS_PERIOD_LEN; | ||
452 | |||
453 | if (fprop_new_period(&writeout_completions, miss_periods + 1)) { | ||
454 | writeout_period_time = wp_next_time(writeout_period_time + | ||
455 | miss_periods * VM_COMPLETIONS_PERIOD_LEN); | ||
456 | mod_timer(&writeout_period_timer, writeout_period_time); | ||
457 | } else { | ||
458 | /* | ||
459 | * Aging has zeroed all fractions. Stop wasting CPU on period | ||
460 | * updates. | ||
461 | */ | ||
462 | writeout_period_time = 0; | ||
463 | } | ||
464 | } | ||
465 | |||
466 | /* | ||
439 | * bdi_min_ratio keeps the sum of the minimum dirty shares of all | 467 | * bdi_min_ratio keeps the sum of the minimum dirty shares of all |
440 | * registered backing devices, which, for obvious reasons, can not | 468 | * registered backing devices, which, for obvious reasons, can not |
441 | * exceed 100%. | 469 | * exceed 100%. |
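Because the new writeout_period_timer is deferred, it can fire long after the period it was scheduled for, so writeout_period() works out how many whole periods were missed and ages the fractions once per missed period plus once for the period that just ended; wp_next_time() reserves the value 0 to mean "timer not armed". A small sketch of just that bookkeeping, with the re-arm/park decision (driven by fprop_new_period() in the kernel) left out and HZ chosen arbitrarily:

#include <stdio.h>

#define HZ 250UL
#define VM_COMPLETIONS_PERIOD_LEN (3 * HZ)

/* 0 is reserved to mean "period timer not armed", so skip it on wrap. */
static unsigned long wp_next_time(unsigned long cur_time)
{
        cur_time += VM_COMPLETIONS_PERIOD_LEN;
        return cur_time ? cur_time : 1;
}

/* Mirrors the missed-period handling in writeout_period(). */
static void writeout_period(unsigned long now, unsigned long *period_time)
{
        unsigned long miss_periods =
                (now - *period_time) / VM_COMPLETIONS_PERIOD_LEN;

        printf("aging fractions %lu time(s)\n", miss_periods + 1);
        *period_time = wp_next_time(*period_time +
                                    miss_periods * VM_COMPLETIONS_PERIOD_LEN);
        printf("next period expires at %lu\n", *period_time);
}

int main(void)
{
        unsigned long period_time = 1000;       /* when the timer was due */
        unsigned long now = 1000 + 5 * VM_COMPLETIONS_PERIOD_LEN + 17;

        writeout_period(now, &period_time);     /* fired five periods late */
        return 0;
}

Parking the timer (writeout_period_time = 0) when aging has zeroed every fraction is what lets an idle system avoid the periodic wakeups the old prop_descriptor code implied.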
@@ -475,7 +503,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) | |||
475 | ret = -EINVAL; | 503 | ret = -EINVAL; |
476 | } else { | 504 | } else { |
477 | bdi->max_ratio = max_ratio; | 505 | bdi->max_ratio = max_ratio; |
478 | bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; | 506 | bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100; |
479 | } | 507 | } |
480 | spin_unlock_bh(&bdi_lock); | 508 | spin_unlock_bh(&bdi_lock); |
481 | 509 | ||
@@ -918,7 +946,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, | |||
918 | * bdi->dirty_ratelimit = balanced_dirty_ratelimit; | 946 | * bdi->dirty_ratelimit = balanced_dirty_ratelimit; |
919 | * | 947 | * |
920 | * However to get a more stable dirty_ratelimit, the below elaborated | 948 | * However to get a more stable dirty_ratelimit, the below elaborated |
921 | * code makes use of task_ratelimit to filter out sigular points and | 949 | * code makes use of task_ratelimit to filter out singular points and |
922 | * limit the step size. | 950 | * limit the step size. |
923 | * | 951 | * |
924 | * The below code essentially only uses the relative value of | 952 | * The below code essentially only uses the relative value of |
@@ -941,7 +969,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, | |||
941 | * feel and care are stable dirty rate and small position error. | 969 | * feel and care are stable dirty rate and small position error. |
942 | * | 970 | * |
943 | * |task_ratelimit - dirty_ratelimit| is used to limit the step size | 971 | * |task_ratelimit - dirty_ratelimit| is used to limit the step size |
944 | * and filter out the sigular points of balanced_dirty_ratelimit. Which | 972 | * and filter out the singular points of balanced_dirty_ratelimit. Which |
945 | * keeps jumping around randomly and can even leap far away at times | 973 | * keeps jumping around randomly and can even leap far away at times |
946 | * due to the small 200ms estimation period of dirty_rate (we want to | 974 | * due to the small 200ms estimation period of dirty_rate (we want to |
947 | * keep that period small to reduce time lags). | 975 | * keep that period small to reduce time lags). |
@@ -1606,13 +1634,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = { | |||
1606 | */ | 1634 | */ |
1607 | void __init page_writeback_init(void) | 1635 | void __init page_writeback_init(void) |
1608 | { | 1636 | { |
1609 | int shift; | ||
1610 | |||
1611 | writeback_set_ratelimit(); | 1637 | writeback_set_ratelimit(); |
1612 | register_cpu_notifier(&ratelimit_nb); | 1638 | register_cpu_notifier(&ratelimit_nb); |
1613 | 1639 | ||
1614 | shift = calc_period_shift(); | 1640 | fprop_global_init(&writeout_completions); |
1615 | prop_descriptor_init(&vm_completions, shift); | ||
1616 | } | 1641 | } |
1617 | 1642 | ||
1618 | /** | 1643 | /** |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 44030096da63..889532b8e6c1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -51,7 +51,6 @@ | |||
51 | #include <linux/page_cgroup.h> | 51 | #include <linux/page_cgroup.h> |
52 | #include <linux/debugobjects.h> | 52 | #include <linux/debugobjects.h> |
53 | #include <linux/kmemleak.h> | 53 | #include <linux/kmemleak.h> |
54 | #include <linux/memory.h> | ||
55 | #include <linux/compaction.h> | 54 | #include <linux/compaction.h> |
56 | #include <trace/events/kmem.h> | 55 | #include <trace/events/kmem.h> |
57 | #include <linux/ftrace_event.h> | 56 | #include <linux/ftrace_event.h> |
@@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes); | |||
219 | 218 | ||
220 | int page_group_by_mobility_disabled __read_mostly; | 219 | int page_group_by_mobility_disabled __read_mostly; |
221 | 220 | ||
222 | static void set_pageblock_migratetype(struct page *page, int migratetype) | 221 | /* |
222 | * NOTE: | ||
223 | * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly. | ||
224 | * Instead, use {un}set_pageblock_isolate. | ||
225 | */ | ||
226 | void set_pageblock_migratetype(struct page *page, int migratetype) | ||
223 | { | 227 | { |
224 | 228 | ||
225 | if (unlikely(page_group_by_mobility_disabled)) | 229 | if (unlikely(page_group_by_mobility_disabled)) |
@@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone, | |||
954 | return pages_moved; | 958 | return pages_moved; |
955 | } | 959 | } |
956 | 960 | ||
957 | static int move_freepages_block(struct zone *zone, struct page *page, | 961 | int move_freepages_block(struct zone *zone, struct page *page, |
958 | int migratetype) | 962 | int migratetype) |
959 | { | 963 | { |
960 | unsigned long start_pfn, end_pfn; | 964 | unsigned long start_pfn, end_pfn; |
@@ -1158,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) | |||
1158 | to_drain = pcp->batch; | 1162 | to_drain = pcp->batch; |
1159 | else | 1163 | else |
1160 | to_drain = pcp->count; | 1164 | to_drain = pcp->count; |
1161 | free_pcppages_bulk(zone, to_drain, pcp); | 1165 | if (to_drain > 0) { |
1162 | pcp->count -= to_drain; | 1166 | free_pcppages_bulk(zone, to_drain, pcp); |
1167 | pcp->count -= to_drain; | ||
1168 | } | ||
1163 | local_irq_restore(flags); | 1169 | local_irq_restore(flags); |
1164 | } | 1170 | } |
1165 | #endif | 1171 | #endif |
@@ -1529,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str) | |||
1529 | } | 1535 | } |
1530 | __setup("fail_page_alloc=", setup_fail_page_alloc); | 1536 | __setup("fail_page_alloc=", setup_fail_page_alloc); |
1531 | 1537 | ||
1532 | static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 1538 | static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
1533 | { | 1539 | { |
1534 | if (order < fail_page_alloc.min_order) | 1540 | if (order < fail_page_alloc.min_order) |
1535 | return 0; | 1541 | return false; |
1536 | if (gfp_mask & __GFP_NOFAIL) | 1542 | if (gfp_mask & __GFP_NOFAIL) |
1537 | return 0; | 1543 | return false; |
1538 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) | 1544 | if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) |
1539 | return 0; | 1545 | return false; |
1540 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) | 1546 | if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) |
1541 | return 0; | 1547 | return false; |
1542 | 1548 | ||
1543 | return should_fail(&fail_page_alloc.attr, 1 << order); | 1549 | return should_fail(&fail_page_alloc.attr, 1 << order); |
1544 | } | 1550 | } |
@@ -1578,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs); | |||
1578 | 1584 | ||
1579 | #else /* CONFIG_FAIL_PAGE_ALLOC */ | 1585 | #else /* CONFIG_FAIL_PAGE_ALLOC */ |
1580 | 1586 | ||
1581 | static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) | 1587 | static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) |
1582 | { | 1588 | { |
1583 | return 0; | 1589 | return false; |
1584 | } | 1590 | } |
1585 | 1591 | ||
1586 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ | 1592 | #endif /* CONFIG_FAIL_PAGE_ALLOC */ |
@@ -1594,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1594 | { | 1600 | { |
1595 | /* free_pages my go negative - that's OK */ | 1601 | /* free_pages my go negative - that's OK */ |
1596 | long min = mark; | 1602 | long min = mark; |
1603 | long lowmem_reserve = z->lowmem_reserve[classzone_idx]; | ||
1597 | int o; | 1604 | int o; |
1598 | 1605 | ||
1599 | free_pages -= (1 << order) - 1; | 1606 | free_pages -= (1 << order) - 1; |
@@ -1602,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1602 | if (alloc_flags & ALLOC_HARDER) | 1609 | if (alloc_flags & ALLOC_HARDER) |
1603 | min -= min / 4; | 1610 | min -= min / 4; |
1604 | 1611 | ||
1605 | if (free_pages <= min + z->lowmem_reserve[classzone_idx]) | 1612 | if (free_pages <= min + lowmem_reserve) |
1606 | return false; | 1613 | return false; |
1607 | for (o = 0; o < order; o++) { | 1614 | for (o = 0; o < order; o++) { |
1608 | /* At the next order, this order's pages become unavailable */ | 1615 | /* At the next order, this order's pages become unavailable */ |
@@ -1617,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1617 | return true; | 1624 | return true; |
1618 | } | 1625 | } |
1619 | 1626 | ||
1627 | #ifdef CONFIG_MEMORY_ISOLATION | ||
1628 | static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) | ||
1629 | { | ||
1630 | if (unlikely(zone->nr_pageblock_isolate)) | ||
1631 | return zone->nr_pageblock_isolate * pageblock_nr_pages; | ||
1632 | return 0; | ||
1633 | } | ||
1634 | #else | ||
1635 | static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) | ||
1636 | { | ||
1637 | return 0; | ||
1638 | } | ||
1639 | #endif | ||
1640 | |||
1620 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1641 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
1621 | int classzone_idx, int alloc_flags) | 1642 | int classzone_idx, int alloc_flags) |
1622 | { | 1643 | { |
@@ -1632,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | |||
1632 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) | 1653 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) |
1633 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); | 1654 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); |
1634 | 1655 | ||
1656 | /* | ||
1657 | * If the zone has MIGRATE_ISOLATE type free pages, we should consider | ||
1658 | * it. nr_zone_isolate_freepages is never accurate so kswapd might not | ||
1659 | * sleep although it could do so. But this is more desirable for memory | ||
1660 | * hotplug than sleeping which can cause a livelock in the direct | ||
1661 | * reclaim path. | ||
1662 | */ | ||
1663 | free_pages -= nr_zone_isolate_freepages(z); | ||
1635 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | 1664 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
1636 | free_pages); | 1665 | free_pages); |
1637 | } | 1666 | } |
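A rough user-space model of the watermark check after these two hunks, with the lowmem reserve, the per-order deduction, and the new isolated-freepage deduction folded into one function for illustration (the kernel splits the last one into zone_watermark_ok_safe(), and the ALLOC_HIGH/ALLOC_HARDER adjustments are omitted here):

#include <stdio.h>
#include <stdbool.h>

/* Simplified __zone_watermark_ok(): the zone passes only if, after
 * deducting pages unusable for this request (isolated pageblocks,
 * the lowmem reserve, and lower-order free pages), enough free pages
 * remain above the watermark. */
static bool zone_watermark_ok(long free_pages, long mark, long lowmem_reserve,
                              long isolated_free, int order,
                              const long *free_at_order)
{
        long min = mark;

        free_pages -= isolated_free;            /* MIGRATE_ISOLATE blocks don't count */
        free_pages -= (1L << order) - 1;

        if (free_pages <= min + lowmem_reserve)
                return false;

        for (int o = 0; o < order; o++) {
                /* Pages at lower orders can't satisfy this request. */
                free_pages -= free_at_order[o] << o;
                min >>= 1;                      /* higher orders get a relaxed min */
                if (free_pages <= min)
                        return false;
        }
        return true;
}

int main(void)
{
        long free_at_order[] = { 300, 100, 40, 10 };

        printf("order-2 ok:                      %d\n",
               zone_watermark_ok(1024, 256, 128, 0, 2, free_at_order));
        printf("order-2 ok with 512 isolated:    %d\n",
               zone_watermark_ok(1024, 256, 128, 512, 2, free_at_order));
        return 0;
}

The second call shows why the deduction matters for hotplug: pages sitting in isolated pageblocks still inflate NR_FREE_PAGES, so without subtracting them kswapd would believe the watermark is met even though none of those pages can be allocated.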
@@ -2087,8 +2116,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2087 | 2116 | ||
2088 | page = get_page_from_freelist(gfp_mask, nodemask, | 2117 | page = get_page_from_freelist(gfp_mask, nodemask, |
2089 | order, zonelist, high_zoneidx, | 2118 | order, zonelist, high_zoneidx, |
2090 | alloc_flags, preferred_zone, | 2119 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2091 | migratetype); | 2120 | preferred_zone, migratetype); |
2092 | if (page) { | 2121 | if (page) { |
2093 | preferred_zone->compact_considered = 0; | 2122 | preferred_zone->compact_considered = 0; |
2094 | preferred_zone->compact_defer_shift = 0; | 2123 | preferred_zone->compact_defer_shift = 0; |
@@ -2180,8 +2209,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | |||
2180 | retry: | 2209 | retry: |
2181 | page = get_page_from_freelist(gfp_mask, nodemask, order, | 2210 | page = get_page_from_freelist(gfp_mask, nodemask, order, |
2182 | zonelist, high_zoneidx, | 2211 | zonelist, high_zoneidx, |
2183 | alloc_flags, preferred_zone, | 2212 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2184 | migratetype); | 2213 | preferred_zone, migratetype); |
2185 | 2214 | ||
2186 | /* | 2215 | /* |
2187 | * If an allocation failed after direct reclaim, it could be because | 2216 | * If an allocation failed after direct reclaim, it could be because |
@@ -2265,15 +2294,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask) | |||
2265 | alloc_flags |= ALLOC_HARDER; | 2294 | alloc_flags |= ALLOC_HARDER; |
2266 | 2295 | ||
2267 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { | 2296 | if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { |
2268 | if (!in_interrupt() && | 2297 | if (gfp_mask & __GFP_MEMALLOC) |
2269 | ((current->flags & PF_MEMALLOC) || | 2298 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2270 | unlikely(test_thread_flag(TIF_MEMDIE)))) | 2299 | else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) |
2300 | alloc_flags |= ALLOC_NO_WATERMARKS; | ||
2301 | else if (!in_interrupt() && | ||
2302 | ((current->flags & PF_MEMALLOC) || | ||
2303 | unlikely(test_thread_flag(TIF_MEMDIE)))) | ||
2271 | alloc_flags |= ALLOC_NO_WATERMARKS; | 2304 | alloc_flags |= ALLOC_NO_WATERMARKS; |
2272 | } | 2305 | } |
2273 | 2306 | ||
2274 | return alloc_flags; | 2307 | return alloc_flags; |
2275 | } | 2308 | } |
2276 | 2309 | ||
2310 | bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) | ||
2311 | { | ||
2312 | return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); | ||
2313 | } | ||
2314 | |||
2277 | static inline struct page * | 2315 | static inline struct page * |
2278 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | 2316 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, |
2279 | struct zonelist *zonelist, enum zone_type high_zoneidx, | 2317 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
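The reworked ladder in gfp_to_alloc_flags(), and the new gfp_pfmemalloc_allowed() helper that merely reports its outcome, can be summarised as: __GFP_MEMALLOC always qualifies, PF_MEMALLOC qualifies from softirq, and otherwise PF_MEMALLOC or TIF_MEMDIE qualify only outside interrupt context. A hedged stand-alone sketch of that decision (the GFP_* values and struct ctx below are local stand-ins, not the kernel's flags):

#include <stdio.h>
#include <stdbool.h>

#define GFP_NOMEMALLOC  0x1
#define GFP_MEMALLOC    0x2

struct ctx {
        bool pf_memalloc;        /* current->flags & PF_MEMALLOC */
        bool tif_memdie;         /* task was picked by the OOM killer */
        bool in_interrupt;
        bool in_serving_softirq;
};

/* Decide whether this allocation may ignore the watermarks, i.e. dip
 * into the pfmemalloc reserves. */
static bool pfmemalloc_allowed(unsigned int gfp_mask, const struct ctx *c)
{
        if (gfp_mask & GFP_NOMEMALLOC)
                return false;                    /* explicitly forbidden */
        if (gfp_mask & GFP_MEMALLOC)
                return true;                     /* caller vouches it is freeing memory */
        if (c->in_serving_softirq && c->pf_memalloc)
                return true;                     /* e.g. memory-freeing receive path */
        if (!c->in_interrupt && (c->pf_memalloc || c->tif_memdie))
                return true;                     /* reclaim context or OOM victim */
        return false;
}

int main(void)
{
        struct ctx softirq = { .pf_memalloc = true, .in_interrupt = true,
                               .in_serving_softirq = true };
        struct ctx normal  = { 0 };

        printf("softirq + PF_MEMALLOC: %d\n", pfmemalloc_allowed(0, &softirq));
        printf("plain task:            %d\n", pfmemalloc_allowed(0, &normal));
        printf("__GFP_MEMALLOC:        %d\n", pfmemalloc_allowed(GFP_MEMALLOC, &normal));
        return 0;
}

Pages obtained this way are flagged via page->pfmemalloc further down in the slow path, signalling to the caller that they were carved out of the reserves and should not be used for ordinary purposes.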
@@ -2340,11 +2378,27 @@ rebalance: | |||
2340 | 2378 | ||
2341 | /* Allocate without watermarks if the context allows */ | 2379 | /* Allocate without watermarks if the context allows */ |
2342 | if (alloc_flags & ALLOC_NO_WATERMARKS) { | 2380 | if (alloc_flags & ALLOC_NO_WATERMARKS) { |
2381 | /* | ||
2382 | * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds | ||
2383 | * the allocation is high priority and these type of | ||
2384 | * allocations are system rather than user orientated | ||
2385 | */ | ||
2386 | zonelist = node_zonelist(numa_node_id(), gfp_mask); | ||
2387 | |||
2343 | page = __alloc_pages_high_priority(gfp_mask, order, | 2388 | page = __alloc_pages_high_priority(gfp_mask, order, |
2344 | zonelist, high_zoneidx, nodemask, | 2389 | zonelist, high_zoneidx, nodemask, |
2345 | preferred_zone, migratetype); | 2390 | preferred_zone, migratetype); |
2346 | if (page) | 2391 | if (page) { |
2392 | /* | ||
2393 | * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was | ||
2394 | * necessary to allocate the page. The expectation is | ||
2395 | * that the caller is taking steps that will free more | ||
2396 | * memory. The caller should avoid the page being used | ||
2397 | * for !PFMEMALLOC purposes. | ||
2398 | */ | ||
2399 | page->pfmemalloc = true; | ||
2347 | goto got_pg; | 2400 | goto got_pg; |
2401 | } | ||
2348 | } | 2402 | } |
2349 | 2403 | ||
2350 | /* Atomic allocations - we can't balance anything */ | 2404 | /* Atomic allocations - we can't balance anything */ |
@@ -2463,8 +2517,8 @@ nopage: | |||
2463 | got_pg: | 2517 | got_pg: |
2464 | if (kmemcheck_enabled) | 2518 | if (kmemcheck_enabled) |
2465 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); | 2519 | kmemcheck_pagealloc_alloc(page, order, gfp_mask); |
2466 | return page; | ||
2467 | 2520 | ||
2521 | return page; | ||
2468 | } | 2522 | } |
2469 | 2523 | ||
2470 | /* | 2524 | /* |
@@ -2515,6 +2569,8 @@ retry_cpuset: | |||
2515 | page = __alloc_pages_slowpath(gfp_mask, order, | 2569 | page = __alloc_pages_slowpath(gfp_mask, order, |
2516 | zonelist, high_zoneidx, nodemask, | 2570 | zonelist, high_zoneidx, nodemask, |
2517 | preferred_zone, migratetype); | 2571 | preferred_zone, migratetype); |
2572 | else | ||
2573 | page->pfmemalloc = false; | ||
2518 | 2574 | ||
2519 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); | 2575 | trace_mm_page_alloc(page, order, gfp_mask, migratetype); |
2520 | 2576 | ||
@@ -3030,7 +3086,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, | |||
3030 | user_zonelist_order = oldval; | 3086 | user_zonelist_order = oldval; |
3031 | } else if (oldval != user_zonelist_order) { | 3087 | } else if (oldval != user_zonelist_order) { |
3032 | mutex_lock(&zonelists_mutex); | 3088 | mutex_lock(&zonelists_mutex); |
3033 | build_all_zonelists(NULL); | 3089 | build_all_zonelists(NULL, NULL); |
3034 | mutex_unlock(&zonelists_mutex); | 3090 | mutex_unlock(&zonelists_mutex); |
3035 | } | 3091 | } |
3036 | } | 3092 | } |
@@ -3409,14 +3465,21 @@ static void setup_zone_pageset(struct zone *zone); | |||
3409 | DEFINE_MUTEX(zonelists_mutex); | 3465 | DEFINE_MUTEX(zonelists_mutex); |
3410 | 3466 | ||
3411 | /* return values int ....just for stop_machine() */ | 3467 | /* return values int ....just for stop_machine() */ |
3412 | static __init_refok int __build_all_zonelists(void *data) | 3468 | static int __build_all_zonelists(void *data) |
3413 | { | 3469 | { |
3414 | int nid; | 3470 | int nid; |
3415 | int cpu; | 3471 | int cpu; |
3472 | pg_data_t *self = data; | ||
3416 | 3473 | ||
3417 | #ifdef CONFIG_NUMA | 3474 | #ifdef CONFIG_NUMA |
3418 | memset(node_load, 0, sizeof(node_load)); | 3475 | memset(node_load, 0, sizeof(node_load)); |
3419 | #endif | 3476 | #endif |
3477 | |||
3478 | if (self && !node_online(self->node_id)) { | ||
3479 | build_zonelists(self); | ||
3480 | build_zonelist_cache(self); | ||
3481 | } | ||
3482 | |||
3420 | for_each_online_node(nid) { | 3483 | for_each_online_node(nid) { |
3421 | pg_data_t *pgdat = NODE_DATA(nid); | 3484 | pg_data_t *pgdat = NODE_DATA(nid); |
3422 | 3485 | ||
@@ -3461,7 +3524,7 @@ static __init_refok int __build_all_zonelists(void *data) | |||
3461 | * Called with zonelists_mutex held always | 3524 | * Called with zonelists_mutex held always |
3462 | * unless system_state == SYSTEM_BOOTING. | 3525 | * unless system_state == SYSTEM_BOOTING. |
3463 | */ | 3526 | */ |
3464 | void __ref build_all_zonelists(void *data) | 3527 | void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) |
3465 | { | 3528 | { |
3466 | set_zonelist_order(); | 3529 | set_zonelist_order(); |
3467 | 3530 | ||
@@ -3473,10 +3536,10 @@ void __ref build_all_zonelists(void *data) | |||
3473 | /* we have to stop all cpus to guarantee there is no user | 3536 | /* we have to stop all cpus to guarantee there is no user |
3474 | of zonelist */ | 3537 | of zonelist */ |
3475 | #ifdef CONFIG_MEMORY_HOTPLUG | 3538 | #ifdef CONFIG_MEMORY_HOTPLUG |
3476 | if (data) | 3539 | if (zone) |
3477 | setup_zone_pageset((struct zone *)data); | 3540 | setup_zone_pageset(zone); |
3478 | #endif | 3541 | #endif |
3479 | stop_machine(__build_all_zonelists, NULL, NULL); | 3542 | stop_machine(__build_all_zonelists, pgdat, NULL); |
3480 | /* cpuset refresh routine should be here */ | 3543 | /* cpuset refresh routine should be here */ |
3481 | } | 3544 | } |
3482 | vm_total_pages = nr_free_pagecache_pages(); | 3545 | vm_total_pages = nr_free_pagecache_pages(); |
@@ -3746,7 +3809,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) | |||
3746 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) | 3809 | memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) |
3747 | #endif | 3810 | #endif |
3748 | 3811 | ||
3749 | static int zone_batchsize(struct zone *zone) | 3812 | static int __meminit zone_batchsize(struct zone *zone) |
3750 | { | 3813 | { |
3751 | #ifdef CONFIG_MMU | 3814 | #ifdef CONFIG_MMU |
3752 | int batch; | 3815 | int batch; |
@@ -3828,7 +3891,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, | |||
3828 | pcp->batch = PAGE_SHIFT * 8; | 3891 | pcp->batch = PAGE_SHIFT * 8; |
3829 | } | 3892 | } |
3830 | 3893 | ||
3831 | static void setup_zone_pageset(struct zone *zone) | 3894 | static void __meminit setup_zone_pageset(struct zone *zone) |
3832 | { | 3895 | { |
3833 | int cpu; | 3896 | int cpu; |
3834 | 3897 | ||
@@ -3901,32 +3964,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
3901 | return 0; | 3964 | return 0; |
3902 | } | 3965 | } |
3903 | 3966 | ||
3904 | static int __zone_pcp_update(void *data) | ||
3905 | { | ||
3906 | struct zone *zone = data; | ||
3907 | int cpu; | ||
3908 | unsigned long batch = zone_batchsize(zone), flags; | ||
3909 | |||
3910 | for_each_possible_cpu(cpu) { | ||
3911 | struct per_cpu_pageset *pset; | ||
3912 | struct per_cpu_pages *pcp; | ||
3913 | |||
3914 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
3915 | pcp = &pset->pcp; | ||
3916 | |||
3917 | local_irq_save(flags); | ||
3918 | free_pcppages_bulk(zone, pcp->count, pcp); | ||
3919 | setup_pageset(pset, batch); | ||
3920 | local_irq_restore(flags); | ||
3921 | } | ||
3922 | return 0; | ||
3923 | } | ||
3924 | |||
3925 | void zone_pcp_update(struct zone *zone) | ||
3926 | { | ||
3927 | stop_machine(__zone_pcp_update, zone, NULL); | ||
3928 | } | ||
3929 | |||
3930 | static __meminit void zone_pcp_init(struct zone *zone) | 3967 | static __meminit void zone_pcp_init(struct zone *zone) |
3931 | { | 3968 | { |
3932 | /* | 3969 | /* |
@@ -3942,7 +3979,7 @@ static __meminit void zone_pcp_init(struct zone *zone) | |||
3942 | zone_batchsize(zone)); | 3979 | zone_batchsize(zone)); |
3943 | } | 3980 | } |
3944 | 3981 | ||
3945 | __meminit int init_currently_empty_zone(struct zone *zone, | 3982 | int __meminit init_currently_empty_zone(struct zone *zone, |
3946 | unsigned long zone_start_pfn, | 3983 | unsigned long zone_start_pfn, |
3947 | unsigned long size, | 3984 | unsigned long size, |
3948 | enum memmap_context context) | 3985 | enum memmap_context context) |
@@ -4301,7 +4338,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, | |||
4301 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 4338 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
4302 | 4339 | ||
4303 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ | 4340 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ |
4304 | static inline void __init set_pageblock_order(void) | 4341 | void __init set_pageblock_order(void) |
4305 | { | 4342 | { |
4306 | unsigned int order; | 4343 | unsigned int order; |
4307 | 4344 | ||
@@ -4329,7 +4366,7 @@ static inline void __init set_pageblock_order(void) | |||
4329 | * include/linux/pageblock-flags.h for the values of pageblock_order based on | 4366 | * include/linux/pageblock-flags.h for the values of pageblock_order based on |
4330 | * the kernel config | 4367 | * the kernel config |
4331 | */ | 4368 | */ |
4332 | static inline void set_pageblock_order(void) | 4369 | void __init set_pageblock_order(void) |
4333 | { | 4370 | { |
4334 | } | 4371 | } |
4335 | 4372 | ||
@@ -4340,6 +4377,8 @@ static inline void set_pageblock_order(void) | |||
4340 | * - mark all pages reserved | 4377 | * - mark all pages reserved |
4341 | * - mark all memory queues empty | 4378 | * - mark all memory queues empty |
4342 | * - clear the memory bitmaps | 4379 | * - clear the memory bitmaps |
4380 | * | ||
4381 | * NOTE: pgdat should get zeroed by caller. | ||
4343 | */ | 4382 | */ |
4344 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, | 4383 | static void __paginginit free_area_init_core(struct pglist_data *pgdat, |
4345 | unsigned long *zones_size, unsigned long *zholes_size) | 4384 | unsigned long *zones_size, unsigned long *zholes_size) |
@@ -4350,9 +4389,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4350 | int ret; | 4389 | int ret; |
4351 | 4390 | ||
4352 | pgdat_resize_init(pgdat); | 4391 | pgdat_resize_init(pgdat); |
4353 | pgdat->nr_zones = 0; | ||
4354 | init_waitqueue_head(&pgdat->kswapd_wait); | 4392 | init_waitqueue_head(&pgdat->kswapd_wait); |
4355 | pgdat->kswapd_max_order = 0; | 4393 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
4356 | pgdat_page_cgroup_init(pgdat); | 4394 | pgdat_page_cgroup_init(pgdat); |
4357 | 4395 | ||
4358 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4396 | for (j = 0; j < MAX_NR_ZONES; j++) { |
@@ -4394,6 +4432,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4394 | 4432 | ||
4395 | zone->spanned_pages = size; | 4433 | zone->spanned_pages = size; |
4396 | zone->present_pages = realsize; | 4434 | zone->present_pages = realsize; |
4435 | #if defined CONFIG_COMPACTION || defined CONFIG_CMA | ||
4436 | zone->compact_cached_free_pfn = zone->zone_start_pfn + | ||
4437 | zone->spanned_pages; | ||
4438 | zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); | ||
4439 | #endif | ||
4397 | #ifdef CONFIG_NUMA | 4440 | #ifdef CONFIG_NUMA |
4398 | zone->node = nid; | 4441 | zone->node = nid; |
4399 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 4442 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) |
@@ -4408,8 +4451,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4408 | 4451 | ||
4409 | zone_pcp_init(zone); | 4452 | zone_pcp_init(zone); |
4410 | lruvec_init(&zone->lruvec, zone); | 4453 | lruvec_init(&zone->lruvec, zone); |
4411 | zap_zone_vm_stats(zone); | ||
4412 | zone->flags = 0; | ||
4413 | if (!size) | 4454 | if (!size) |
4414 | continue; | 4455 | continue; |
4415 | 4456 | ||
@@ -4469,6 +4510,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, | |||
4469 | { | 4510 | { |
4470 | pg_data_t *pgdat = NODE_DATA(nid); | 4511 | pg_data_t *pgdat = NODE_DATA(nid); |
4471 | 4512 | ||
4513 | /* pg_data_t should be reset to zero when it's allocated */ | ||
4514 | WARN_ON(pgdat->nr_zones || pgdat->node_start_pfn || pgdat->classzone_idx); | ||
4515 | |||
4472 | pgdat->node_id = nid; | 4516 | pgdat->node_id = nid; |
4473 | pgdat->node_start_pfn = node_start_pfn; | 4517 | pgdat->node_start_pfn = node_start_pfn; |
4474 | calculate_node_totalpages(pgdat, zones_size, zholes_size); | 4518 | calculate_node_totalpages(pgdat, zones_size, zholes_size); |
@@ -4750,7 +4794,7 @@ out: | |||
4750 | } | 4794 | } |
4751 | 4795 | ||
4752 | /* Any regular memory on that node ? */ | 4796 | /* Any regular memory on that node ? */ |
4753 | static void check_for_regular_memory(pg_data_t *pgdat) | 4797 | static void __init check_for_regular_memory(pg_data_t *pgdat) |
4754 | { | 4798 | { |
4755 | #ifdef CONFIG_HIGHMEM | 4799 | #ifdef CONFIG_HIGHMEM |
4756 | enum zone_type zone_type; | 4800 | enum zone_type zone_type; |
@@ -5468,26 +5512,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, | |||
5468 | } | 5512 | } |
5469 | 5513 | ||
5470 | /* | 5514 | /* |
5471 | * This is designed as sub function...plz see page_isolation.c also. | 5515 | * This function checks whether pageblock includes unmovable pages or not. |
5472 | set/clear page block's type to be ISOLATE. | 5516 | If @count is not zero, it is okay to include fewer than @count unmovable pages |
5473 | * page allocater never alloc memory from ISOLATE block. | 5517 | * |
5518 | PageLRU check without isolation or lru_lock could race so that | ||
5519 | MIGRATE_MOVABLE block might include unmovable pages. It means you can't | ||
5520 | expect this function to be exact. | ||
5474 | */ | 5521 | */ |
5475 | 5522 | bool has_unmovable_pages(struct zone *zone, struct page *page, int count) | |
5476 | static int | ||
5477 | __count_immobile_pages(struct zone *zone, struct page *page, int count) | ||
5478 | { | 5523 | { |
5479 | unsigned long pfn, iter, found; | 5524 | unsigned long pfn, iter, found; |
5480 | int mt; | 5525 | int mt; |
5481 | 5526 | ||
5482 | /* | 5527 | /* |
5483 | * For avoiding noise data, lru_add_drain_all() should be called | 5528 | * For avoiding noise data, lru_add_drain_all() should be called |
5484 | * If ZONE_MOVABLE, the zone never contains immobile pages | 5529 | * If ZONE_MOVABLE, the zone never contains unmovable pages |
5485 | */ | 5530 | */ |
5486 | if (zone_idx(zone) == ZONE_MOVABLE) | 5531 | if (zone_idx(zone) == ZONE_MOVABLE) |
5487 | return true; | 5532 | return false; |
5488 | mt = get_pageblock_migratetype(page); | 5533 | mt = get_pageblock_migratetype(page); |
5489 | if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) | 5534 | if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) |
5490 | return true; | 5535 | return false; |
5491 | 5536 | ||
5492 | pfn = page_to_pfn(page); | 5537 | pfn = page_to_pfn(page); |
5493 | for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { | 5538 | for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { |
@@ -5497,11 +5542,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) | |||
5497 | continue; | 5542 | continue; |
5498 | 5543 | ||
5499 | page = pfn_to_page(check); | 5544 | page = pfn_to_page(check); |
5500 | if (!page_count(page)) { | 5545 | /* |
5546 | * We can't use page_count without pinning the page | ||
5547 | * because another CPU can free the compound page. | ||
5548 | * This check already skips compound tails of THP | ||
5549 | * because their page->_count is zero at all times. | ||
5550 | */ | ||
5551 | if (!atomic_read(&page->_count)) { | ||
5501 | if (PageBuddy(page)) | 5552 | if (PageBuddy(page)) |
5502 | iter += (1 << page_order(page)) - 1; | 5553 | iter += (1 << page_order(page)) - 1; |
5503 | continue; | 5554 | continue; |
5504 | } | 5555 | } |
5556 | |||
5505 | if (!PageLRU(page)) | 5557 | if (!PageLRU(page)) |
5506 | found++; | 5558 | found++; |
5507 | /* | 5559 | /* |
@@ -5518,9 +5570,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) | |||
5518 | * page at boot. | 5570 | * page at boot. |
5519 | */ | 5571 | */ |
5520 | if (found > count) | 5572 | if (found > count) |
5521 | return false; | 5573 | return true; |
5522 | } | 5574 | } |
5523 | return true; | 5575 | return false; |
5524 | } | 5576 | } |
5525 | 5577 | ||
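Note the renamed helper also inverts its return value: __count_immobile_pages() returned true when the block looked safe to isolate, while has_unmovable_pages() returns true when it does not, tolerating up to @count apparent unmovable pages. A standalone sketch of that thresholded scan (the page model below is invented for illustration; it is not struct page):

#include <stdbool.h>
#include <stdio.h>

struct fake_page { bool on_lru; };   /* stand-in for struct page */

/* true if more than 'count' pages in the block look unmovable */
static bool has_unmovable_pages_model(const struct fake_page *blk,
                                      unsigned long nr, int count)
{
    unsigned long i, found = 0;

    for (i = 0; i < nr; i++) {
        if (!blk[i].on_lru && ++found > (unsigned long)count)
            return true;
    }
    return false;
}

int main(void)
{
    struct fake_page blk[4] = { {true}, {false}, {true}, {true} };

    /* one non-LRU page is tolerated when count == 1, but not when count == 0 */
    printf("%d %d\n", has_unmovable_pages_model(blk, 4, 1),
                      has_unmovable_pages_model(blk, 4, 0));
    return 0;
}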
5526 | bool is_pageblock_removable_nolock(struct page *page) | 5578 | bool is_pageblock_removable_nolock(struct page *page) |
@@ -5544,77 +5596,7 @@ bool is_pageblock_removable_nolock(struct page *page) | |||
5544 | zone->zone_start_pfn + zone->spanned_pages <= pfn) | 5596 | zone->zone_start_pfn + zone->spanned_pages <= pfn) |
5545 | return false; | 5597 | return false; |
5546 | 5598 | ||
5547 | return __count_immobile_pages(zone, page, 0); | 5599 | return !has_unmovable_pages(zone, page, 0); |
5548 | } | ||
5549 | |||
5550 | int set_migratetype_isolate(struct page *page) | ||
5551 | { | ||
5552 | struct zone *zone; | ||
5553 | unsigned long flags, pfn; | ||
5554 | struct memory_isolate_notify arg; | ||
5555 | int notifier_ret; | ||
5556 | int ret = -EBUSY; | ||
5557 | |||
5558 | zone = page_zone(page); | ||
5559 | |||
5560 | spin_lock_irqsave(&zone->lock, flags); | ||
5561 | |||
5562 | pfn = page_to_pfn(page); | ||
5563 | arg.start_pfn = pfn; | ||
5564 | arg.nr_pages = pageblock_nr_pages; | ||
5565 | arg.pages_found = 0; | ||
5566 | |||
5567 | /* | ||
5568 | * It may be possible to isolate a pageblock even if the | ||
5569 | * migratetype is not MIGRATE_MOVABLE. The memory isolation | ||
5570 | * notifier chain is used by balloon drivers to return the | ||
5571 | * number of pages in a range that are held by the balloon | ||
5572 | * driver to shrink memory. If all the pages are accounted for | ||
5573 | * by balloons, are free, or on the LRU, isolation can continue. | ||
5574 | * Later, for example, when memory hotplug notifier runs, these | ||
5575 | * pages reported as "can be isolated" should be isolated(freed) | ||
5576 | * by the balloon driver through the memory notifier chain. | ||
5577 | */ | ||
5578 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); | ||
5579 | notifier_ret = notifier_to_errno(notifier_ret); | ||
5580 | if (notifier_ret) | ||
5581 | goto out; | ||
5582 | /* | ||
5583 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. | ||
5584 | * We just check MOVABLE pages. | ||
5585 | */ | ||
5586 | if (__count_immobile_pages(zone, page, arg.pages_found)) | ||
5587 | ret = 0; | ||
5588 | |||
5589 | /* | ||
5590 | * immobile means "not-on-lru" paes. If immobile is larger than | ||
5591 | * removable-by-driver pages reported by notifier, we'll fail. | ||
5592 | */ | ||
5593 | |||
5594 | out: | ||
5595 | if (!ret) { | ||
5596 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
5597 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
5598 | } | ||
5599 | |||
5600 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5601 | if (!ret) | ||
5602 | drain_all_pages(); | ||
5603 | return ret; | ||
5604 | } | ||
5605 | |||
5606 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) | ||
5607 | { | ||
5608 | struct zone *zone; | ||
5609 | unsigned long flags; | ||
5610 | zone = page_zone(page); | ||
5611 | spin_lock_irqsave(&zone->lock, flags); | ||
5612 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | ||
5613 | goto out; | ||
5614 | set_pageblock_migratetype(page, migratetype); | ||
5615 | move_freepages_block(zone, page, migratetype); | ||
5616 | out: | ||
5617 | spin_unlock_irqrestore(&zone->lock, flags); | ||
5618 | } | 5600 | } |
5619 | 5601 | ||
5620 | #ifdef CONFIG_CMA | 5602 | #ifdef CONFIG_CMA |
@@ -5635,7 +5617,12 @@ static struct page * | |||
5635 | __alloc_contig_migrate_alloc(struct page *page, unsigned long private, | 5617 | __alloc_contig_migrate_alloc(struct page *page, unsigned long private, |
5636 | int **resultp) | 5618 | int **resultp) |
5637 | { | 5619 | { |
5638 | return alloc_page(GFP_HIGHUSER_MOVABLE); | 5620 | gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; |
5621 | |||
5622 | if (PageHighMem(page)) | ||
5623 | gfp_mask |= __GFP_HIGHMEM; | ||
5624 | |||
5625 | return alloc_page(gfp_mask); | ||
5639 | } | 5626 | } |
5640 | 5627 | ||
5641 | /* [start, end) must belong to a single zone. */ | 5628 | /* [start, end) must belong to a single zone. */ |
@@ -5864,7 +5851,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) | |||
5864 | } | 5851 | } |
5865 | #endif | 5852 | #endif |
5866 | 5853 | ||
5854 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
5855 | static int __meminit __zone_pcp_update(void *data) | ||
5856 | { | ||
5857 | struct zone *zone = data; | ||
5858 | int cpu; | ||
5859 | unsigned long batch = zone_batchsize(zone), flags; | ||
5860 | |||
5861 | for_each_possible_cpu(cpu) { | ||
5862 | struct per_cpu_pageset *pset; | ||
5863 | struct per_cpu_pages *pcp; | ||
5864 | |||
5865 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
5866 | pcp = &pset->pcp; | ||
5867 | |||
5868 | local_irq_save(flags); | ||
5869 | if (pcp->count > 0) | ||
5870 | free_pcppages_bulk(zone, pcp->count, pcp); | ||
5871 | setup_pageset(pset, batch); | ||
5872 | local_irq_restore(flags); | ||
5873 | } | ||
5874 | return 0; | ||
5875 | } | ||
5876 | |||
5877 | void __meminit zone_pcp_update(struct zone *zone) | ||
5878 | { | ||
5879 | stop_machine(__zone_pcp_update, zone, NULL); | ||
5880 | } | ||
5881 | #endif | ||
5882 | |||
5867 | #ifdef CONFIG_MEMORY_HOTREMOVE | 5883 | #ifdef CONFIG_MEMORY_HOTREMOVE |
5884 | void zone_pcp_reset(struct zone *zone) | ||
5885 | { | ||
5886 | unsigned long flags; | ||
5887 | |||
5888 | /* avoid races with drain_pages() */ | ||
5889 | local_irq_save(flags); | ||
5890 | if (zone->pageset != &boot_pageset) { | ||
5891 | free_percpu(zone->pageset); | ||
5892 | zone->pageset = &boot_pageset; | ||
5893 | } | ||
5894 | local_irq_restore(flags); | ||
5895 | } | ||
5896 | |||
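zone_pcp_reset() frees the zone's dynamic per-cpu pagesets and points the zone back at the static boot_pageset, with interrupts off so drain_pages() cannot race against the teardown. The swing-back-to-a-static-fallback pattern, reduced to plain C (types and names below are invented for illustration):

#include <stdio.h>
#include <stdlib.h>

struct pageset { int count; };

static struct pageset boot_pageset;          /* static bootstrap set, never freed */

struct zone_model {
    struct pageset *pageset;                 /* dynamic set, or &boot_pageset */
};

static void zone_pcp_reset_model(struct zone_model *z)
{
    /* in the kernel this section runs under local_irq_save() */
    if (z->pageset != &boot_pageset) {
        free(z->pageset);
        z->pageset = &boot_pageset;          /* never leave a dangling pointer */
    }
}

int main(void)
{
    struct zone_model z = { .pageset = malloc(sizeof(struct pageset)) };

    zone_pcp_reset_model(&z);
    printf("back to boot set: %d\n", z.pageset == &boot_pageset);
    return 0;
}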
5868 | /* | 5897 | /* |
5869 | * All pages in the range must be isolated before calling this. | 5898 | * All pages in the range must be isolated before calling this. |
5870 | */ | 5899 | */ |
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 1ccbd714059c..5ddad0c6daa6 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -317,7 +317,7 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) | |||
317 | #endif | 317 | #endif |
318 | 318 | ||
319 | 319 | ||
320 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 320 | #ifdef CONFIG_MEMCG_SWAP |
321 | 321 | ||
322 | static DEFINE_MUTEX(swap_cgroup_mutex); | 322 | static DEFINE_MUTEX(swap_cgroup_mutex); |
323 | struct swap_cgroup_ctrl { | 323 | struct swap_cgroup_ctrl { |
@@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, | |||
392 | 392 | ||
393 | /** | 393 | /** |
394 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. | 394 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. |
395 | * @end: swap entry to be cmpxchged | 395 | * @ent: swap entry to be cmpxchged |
396 | * @old: old id | 396 | * @old: old id |
397 | * @new: new id | 397 | * @new: new id |
398 | * | 398 | * |
@@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | |||
422 | /** | 422 | /** |
423 | * swap_cgroup_record - record mem_cgroup for this swp_entry. | 423 | * swap_cgroup_record - record mem_cgroup for this swp_entry. |
424 | * @ent: swap entry to be recorded into | 424 | * @ent: swap entry to be recorded into |
425 | * @mem: mem_cgroup to be recorded | 425 | * @id: mem_cgroup to be recorded |
426 | * | 426 | * |
427 | * Returns old value at success, 0 at failure. | 427 | * Returns old value at success, 0 at failure. |
428 | * (Of course, old value can be 0.) | 428 | * (Of course, old value can be 0.) |
diff --git a/mm/page_io.c b/mm/page_io.c index dc76b4d0611e..78eee32ee486 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -17,7 +17,9 @@ | |||
17 | #include <linux/swap.h> | 17 | #include <linux/swap.h> |
18 | #include <linux/bio.h> | 18 | #include <linux/bio.h> |
19 | #include <linux/swapops.h> | 19 | #include <linux/swapops.h> |
20 | #include <linux/buffer_head.h> | ||
20 | #include <linux/writeback.h> | 21 | #include <linux/writeback.h> |
22 | #include <linux/frontswap.h> | ||
21 | #include <asm/pgtable.h> | 23 | #include <asm/pgtable.h> |
22 | 24 | ||
23 | static struct bio *get_swap_bio(gfp_t gfp_flags, | 25 | static struct bio *get_swap_bio(gfp_t gfp_flags, |
@@ -85,6 +87,98 @@ void end_swap_bio_read(struct bio *bio, int err) | |||
85 | bio_put(bio); | 87 | bio_put(bio); |
86 | } | 88 | } |
87 | 89 | ||
90 | int generic_swapfile_activate(struct swap_info_struct *sis, | ||
91 | struct file *swap_file, | ||
92 | sector_t *span) | ||
93 | { | ||
94 | struct address_space *mapping = swap_file->f_mapping; | ||
95 | struct inode *inode = mapping->host; | ||
96 | unsigned blocks_per_page; | ||
97 | unsigned long page_no; | ||
98 | unsigned blkbits; | ||
99 | sector_t probe_block; | ||
100 | sector_t last_block; | ||
101 | sector_t lowest_block = -1; | ||
102 | sector_t highest_block = 0; | ||
103 | int nr_extents = 0; | ||
104 | int ret; | ||
105 | |||
106 | blkbits = inode->i_blkbits; | ||
107 | blocks_per_page = PAGE_SIZE >> blkbits; | ||
108 | |||
109 | /* | ||
110 | * Map all the blocks into the extent list. This code doesn't try | ||
111 | * to be very smart. | ||
112 | */ | ||
113 | probe_block = 0; | ||
114 | page_no = 0; | ||
115 | last_block = i_size_read(inode) >> blkbits; | ||
116 | while ((probe_block + blocks_per_page) <= last_block && | ||
117 | page_no < sis->max) { | ||
118 | unsigned block_in_page; | ||
119 | sector_t first_block; | ||
120 | |||
121 | first_block = bmap(inode, probe_block); | ||
122 | if (first_block == 0) | ||
123 | goto bad_bmap; | ||
124 | |||
125 | /* | ||
126 | * It must be PAGE_SIZE aligned on-disk | ||
127 | */ | ||
128 | if (first_block & (blocks_per_page - 1)) { | ||
129 | probe_block++; | ||
130 | goto reprobe; | ||
131 | } | ||
132 | |||
133 | for (block_in_page = 1; block_in_page < blocks_per_page; | ||
134 | block_in_page++) { | ||
135 | sector_t block; | ||
136 | |||
137 | block = bmap(inode, probe_block + block_in_page); | ||
138 | if (block == 0) | ||
139 | goto bad_bmap; | ||
140 | if (block != first_block + block_in_page) { | ||
141 | /* Discontiguity */ | ||
142 | probe_block++; | ||
143 | goto reprobe; | ||
144 | } | ||
145 | } | ||
146 | |||
147 | first_block >>= (PAGE_SHIFT - blkbits); | ||
148 | if (page_no) { /* exclude the header page */ | ||
149 | if (first_block < lowest_block) | ||
150 | lowest_block = first_block; | ||
151 | if (first_block > highest_block) | ||
152 | highest_block = first_block; | ||
153 | } | ||
154 | |||
155 | /* | ||
156 | * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks | ||
157 | */ | ||
158 | ret = add_swap_extent(sis, page_no, 1, first_block); | ||
159 | if (ret < 0) | ||
160 | goto out; | ||
161 | nr_extents += ret; | ||
162 | page_no++; | ||
163 | probe_block += blocks_per_page; | ||
164 | reprobe: | ||
165 | continue; | ||
166 | } | ||
167 | ret = nr_extents; | ||
168 | *span = 1 + highest_block - lowest_block; | ||
169 | if (page_no == 0) | ||
170 | page_no = 1; /* force Empty message */ | ||
171 | sis->max = page_no; | ||
172 | sis->pages = page_no - 1; | ||
173 | sis->highest_bit = page_no - 1; | ||
174 | out: | ||
175 | return ret; | ||
176 | bad_bmap: | ||
177 | printk(KERN_ERR "swapon: swapfile has holes\n"); | ||
178 | ret = -EINVAL; | ||
179 | goto out; | ||
180 | } | ||
181 | |||
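generic_swapfile_activate() walks the swap file block by block via bmap() and only accepts a page whose blocks_per_page blocks are physically contiguous and whose first block is PAGE_SIZE-aligned on disk; a hole fails swapon, and a misaligned or discontiguous run just advances probe_block. A compact model of that contiguity and alignment test over an invented block map (the array stands in for calling bmap() on the swap file's inode):

#include <stdbool.h>
#include <stdio.h>

/* bmap[] maps logical block -> physical block; 0 marks a hole */
static bool page_run_ok(const unsigned long *bmap, unsigned long probe,
                        unsigned blocks_per_page)
{
    unsigned long first = bmap[probe];
    unsigned i;

    if (first == 0)                          /* hole in the file */
        return false;
    if (first & (blocks_per_page - 1))       /* not PAGE_SIZE aligned on disk */
        return false;
    for (i = 1; i < blocks_per_page; i++)
        if (bmap[probe + i] != first + i)    /* discontiguous run */
            return false;
    return true;
}

int main(void)
{
    /* 4 blocks per page; the second page's run is discontiguous */
    unsigned long bmap[8] = { 8, 9, 10, 11, 16, 17, 99, 19 };

    printf("%d %d\n", page_run_ok(bmap, 0, 4), page_run_ok(bmap, 4, 4));
    return 0;
}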
88 | /* | 182 | /* |
89 | * We may have stale swap cache pages in memory: notice | 183 | * We may have stale swap cache pages in memory: notice |
90 | * them here and get rid of the unnecessary final write. | 184 | * them here and get rid of the unnecessary final write. |
@@ -93,11 +187,45 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
93 | { | 187 | { |
94 | struct bio *bio; | 188 | struct bio *bio; |
95 | int ret = 0, rw = WRITE; | 189 | int ret = 0, rw = WRITE; |
190 | struct swap_info_struct *sis = page_swap_info(page); | ||
96 | 191 | ||
97 | if (try_to_free_swap(page)) { | 192 | if (try_to_free_swap(page)) { |
98 | unlock_page(page); | 193 | unlock_page(page); |
99 | goto out; | 194 | goto out; |
100 | } | 195 | } |
196 | if (frontswap_store(page) == 0) { | ||
197 | set_page_writeback(page); | ||
198 | unlock_page(page); | ||
199 | end_page_writeback(page); | ||
200 | goto out; | ||
201 | } | ||
202 | |||
203 | if (sis->flags & SWP_FILE) { | ||
204 | struct kiocb kiocb; | ||
205 | struct file *swap_file = sis->swap_file; | ||
206 | struct address_space *mapping = swap_file->f_mapping; | ||
207 | struct iovec iov = { | ||
208 | .iov_base = kmap(page), | ||
209 | .iov_len = PAGE_SIZE, | ||
210 | }; | ||
211 | |||
212 | init_sync_kiocb(&kiocb, swap_file); | ||
213 | kiocb.ki_pos = page_file_offset(page); | ||
214 | kiocb.ki_left = PAGE_SIZE; | ||
215 | kiocb.ki_nbytes = PAGE_SIZE; | ||
216 | |||
217 | unlock_page(page); | ||
218 | ret = mapping->a_ops->direct_IO(KERNEL_WRITE, | ||
219 | &kiocb, &iov, | ||
220 | kiocb.ki_pos, 1); | ||
221 | kunmap(page); | ||
222 | if (ret == PAGE_SIZE) { | ||
223 | count_vm_event(PSWPOUT); | ||
224 | ret = 0; | ||
225 | } | ||
226 | return ret; | ||
227 | } | ||
228 | |||
101 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); | 229 | bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); |
102 | if (bio == NULL) { | 230 | if (bio == NULL) { |
103 | set_page_dirty(page); | 231 | set_page_dirty(page); |
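The new swap_writepage() ordering is: try frontswap_store() first, then the SWP_FILE direct_IO path for file-backed swap, and only then fall back to the usual bio submission against the block device. A sketch of that three-way dispatch with the backends reduced to flags (the enum and helper below are illustrative stand-ins, not kernel code):

#include <stdbool.h>
#include <stdio.h>

enum swap_backend { BACKEND_FRONTSWAP, BACKEND_FILE, BACKEND_BIO };

/* model of the swap_writepage() dispatch order */
static enum swap_backend pick_write_path(bool frontswap_took_it, bool swp_file)
{
    if (frontswap_took_it)
        return BACKEND_FRONTSWAP;
    if (swp_file)
        return BACKEND_FILE;
    return BACKEND_BIO;
}

int main(void)
{
    printf("%d %d %d\n",
           pick_write_path(true,  true),    /* frontswap wins even for SWP_FILE */
           pick_write_path(false, true),    /* file-backed swap -> direct_IO    */
           pick_write_path(false, false));  /* block device -> bio              */
    return 0;
}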
@@ -119,9 +247,26 @@ int swap_readpage(struct page *page) | |||
119 | { | 247 | { |
120 | struct bio *bio; | 248 | struct bio *bio; |
121 | int ret = 0; | 249 | int ret = 0; |
250 | struct swap_info_struct *sis = page_swap_info(page); | ||
122 | 251 | ||
123 | VM_BUG_ON(!PageLocked(page)); | 252 | VM_BUG_ON(!PageLocked(page)); |
124 | VM_BUG_ON(PageUptodate(page)); | 253 | VM_BUG_ON(PageUptodate(page)); |
254 | if (frontswap_load(page) == 0) { | ||
255 | SetPageUptodate(page); | ||
256 | unlock_page(page); | ||
257 | goto out; | ||
258 | } | ||
259 | |||
260 | if (sis->flags & SWP_FILE) { | ||
261 | struct file *swap_file = sis->swap_file; | ||
262 | struct address_space *mapping = swap_file->f_mapping; | ||
263 | |||
264 | ret = mapping->a_ops->readpage(swap_file, page); | ||
265 | if (!ret) | ||
266 | count_vm_event(PSWPIN); | ||
267 | return ret; | ||
268 | } | ||
269 | |||
125 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); | 270 | bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); |
126 | if (bio == NULL) { | 271 | if (bio == NULL) { |
127 | unlock_page(page); | 272 | unlock_page(page); |
@@ -133,3 +278,15 @@ int swap_readpage(struct page *page) | |||
133 | out: | 278 | out: |
134 | return ret; | 279 | return ret; |
135 | } | 280 | } |
281 | |||
282 | int swap_set_page_dirty(struct page *page) | ||
283 | { | ||
284 | struct swap_info_struct *sis = page_swap_info(page); | ||
285 | |||
286 | if (sis->flags & SWP_FILE) { | ||
287 | struct address_space *mapping = sis->swap_file->f_mapping; | ||
288 | return mapping->a_ops->set_page_dirty(page); | ||
289 | } else { | ||
290 | return __set_page_dirty_no_writeback(page); | ||
291 | } | ||
292 | } | ||
diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c9f04774f2b8..247d1f175739 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c | |||
@@ -5,8 +5,101 @@ | |||
5 | #include <linux/mm.h> | 5 | #include <linux/mm.h> |
6 | #include <linux/page-isolation.h> | 6 | #include <linux/page-isolation.h> |
7 | #include <linux/pageblock-flags.h> | 7 | #include <linux/pageblock-flags.h> |
8 | #include <linux/memory.h> | ||
8 | #include "internal.h" | 9 | #include "internal.h" |
9 | 10 | ||
11 | /* called while holding zone->lock */ | ||
12 | static void set_pageblock_isolate(struct page *page) | ||
13 | { | ||
14 | if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE) | ||
15 | return; | ||
16 | |||
17 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
18 | page_zone(page)->nr_pageblock_isolate++; | ||
19 | } | ||
20 | |||
21 | /* called while holding zone->lock */ | ||
22 | static void restore_pageblock_isolate(struct page *page, int migratetype) | ||
23 | { | ||
24 | struct zone *zone = page_zone(page); | ||
25 | if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) | ||
26 | return; | ||
27 | |||
28 | BUG_ON(zone->nr_pageblock_isolate <= 0); | ||
29 | set_pageblock_migratetype(page, migratetype); | ||
30 | zone->nr_pageblock_isolate--; | ||
31 | } | ||
32 | |||
33 | int set_migratetype_isolate(struct page *page) | ||
34 | { | ||
35 | struct zone *zone; | ||
36 | unsigned long flags, pfn; | ||
37 | struct memory_isolate_notify arg; | ||
38 | int notifier_ret; | ||
39 | int ret = -EBUSY; | ||
40 | |||
41 | zone = page_zone(page); | ||
42 | |||
43 | spin_lock_irqsave(&zone->lock, flags); | ||
44 | |||
45 | pfn = page_to_pfn(page); | ||
46 | arg.start_pfn = pfn; | ||
47 | arg.nr_pages = pageblock_nr_pages; | ||
48 | arg.pages_found = 0; | ||
49 | |||
50 | /* | ||
51 | * It may be possible to isolate a pageblock even if the | ||
52 | * migratetype is not MIGRATE_MOVABLE. The memory isolation | ||
53 | * notifier chain is used by balloon drivers to return the | ||
54 | * number of pages in a range that are held by the balloon | ||
55 | * driver to shrink memory. If all the pages are accounted for | ||
56 | * by balloons, are free, or on the LRU, isolation can continue. | ||
57 | * Later, for example, when memory hotplug notifier runs, these | ||
58 | * pages reported as "can be isolated" should be isolated(freed) | ||
59 | * by the balloon driver through the memory notifier chain. | ||
60 | */ | ||
61 | notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); | ||
62 | notifier_ret = notifier_to_errno(notifier_ret); | ||
63 | if (notifier_ret) | ||
64 | goto out; | ||
65 | /* | ||
66 | * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. | ||
67 | * We just check MOVABLE pages. | ||
68 | */ | ||
69 | if (!has_unmovable_pages(zone, page, arg.pages_found)) | ||
70 | ret = 0; | ||
71 | |||
72 | /* | ||
73 | * immobile means "not-on-lru" pages. If immobile is larger than | ||
74 | * removable-by-driver pages reported by notifier, we'll fail. | ||
75 | */ | ||
76 | |||
77 | out: | ||
78 | if (!ret) { | ||
79 | set_pageblock_isolate(page); | ||
80 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
81 | } | ||
82 | |||
83 | spin_unlock_irqrestore(&zone->lock, flags); | ||
84 | if (!ret) | ||
85 | drain_all_pages(); | ||
86 | return ret; | ||
87 | } | ||
88 | |||
89 | void unset_migratetype_isolate(struct page *page, unsigned migratetype) | ||
90 | { | ||
91 | struct zone *zone; | ||
92 | unsigned long flags; | ||
93 | zone = page_zone(page); | ||
94 | spin_lock_irqsave(&zone->lock, flags); | ||
95 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | ||
96 | goto out; | ||
97 | move_freepages_block(zone, page, migratetype); | ||
98 | restore_pageblock_isolate(page, migratetype); | ||
99 | out: | ||
100 | spin_unlock_irqrestore(&zone->lock, flags); | ||
101 | } | ||
102 | |||
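The new set_pageblock_isolate()/restore_pageblock_isolate() pair keeps zone->nr_pageblock_isolate in step with the number of pageblocks whose migratetype is MIGRATE_ISOLATE: the counter only moves on a real state transition, and both helpers run under zone->lock. A reduced model of that guarded transition (locking elided, names illustrative):

#include <assert.h>
#include <stdio.h>

enum mt { MIGRATE_MOVABLE, MIGRATE_ISOLATE };

struct zone_model  { long nr_isolate; };
struct block_model { enum mt type; };

static void set_isolate(struct zone_model *z, struct block_model *b)
{
    if (b->type == MIGRATE_ISOLATE)     /* already isolated: no double count */
        return;
    b->type = MIGRATE_ISOLATE;
    z->nr_isolate++;
}

static void restore_isolate(struct zone_model *z, struct block_model *b, enum mt to)
{
    if (b->type != MIGRATE_ISOLATE)     /* nothing to undo */
        return;
    assert(z->nr_isolate > 0);
    b->type = to;
    z->nr_isolate--;
}

int main(void)
{
    struct zone_model  z = { 0 };
    struct block_model b = { MIGRATE_MOVABLE };

    set_isolate(&z, &b);
    set_isolate(&z, &b);                /* idempotent: still counted once */
    restore_isolate(&z, &b, MIGRATE_MOVABLE);
    printf("nr_isolate = %ld\n", z.nr_isolate);   /* 0 */
    return 0;
}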
10 | static inline struct page * | 103 | static inline struct page * |
11 | __first_valid_page(unsigned long pfn, unsigned long nr_pages) | 104 | __first_valid_page(unsigned long pfn, unsigned long nr_pages) |
12 | { | 105 | { |
diff --git a/mm/pagewalk.c b/mm/pagewalk.c index aa9701e12714..6c118d012bb5 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c | |||
@@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, | |||
162 | 162 | ||
163 | /** | 163 | /** |
164 | * walk_page_range - walk a memory map's page tables with a callback | 164 | * walk_page_range - walk a memory map's page tables with a callback |
165 | * @mm: memory map to walk | ||
166 | * @addr: starting address | 165 | * @addr: starting address |
167 | * @end: ending address | 166 | * @end: ending address |
168 | * @walk: set of callbacks to invoke for each level of the tree | 167 | * @walk: set of callbacks to invoke for each level of the tree |
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 405d331804c3..3707c71ae4cd 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c | |||
@@ -360,7 +360,6 @@ err_free: | |||
360 | * @chunk: chunk to depopulate | 360 | * @chunk: chunk to depopulate |
361 | * @off: offset to the area to depopulate | 361 | * @off: offset to the area to depopulate |
362 | * @size: size of the area to depopulate in bytes | 362 | * @size: size of the area to depopulate in bytes |
363 | * @flush: whether to flush cache and tlb or not | ||
364 | * | 363 | * |
365 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) | 364 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) |
366 | * from @chunk. If @flush is true, vcache is flushed before unmapping | 365 | * from @chunk. If @flush is true, vcache is flushed before unmapping |
diff --git a/mm/shmem.c b/mm/shmem.c index c244e93a70fa..d4e184e2a38e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -264,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping, | |||
264 | } | 264 | } |
265 | 265 | ||
266 | /* | 266 | /* |
267 | * Sometimes, before we decide whether to proceed or to fail, we must check | ||
268 | * that an entry was not already brought back from swap by a racing thread. | ||
269 | * | ||
270 | * Checking page is not enough: by the time a SwapCache page is locked, it | ||
271 | * might be reused, and again be SwapCache, using the same swap as before. | ||
272 | */ | ||
273 | static bool shmem_confirm_swap(struct address_space *mapping, | ||
274 | pgoff_t index, swp_entry_t swap) | ||
275 | { | ||
276 | void *item; | ||
277 | |||
278 | rcu_read_lock(); | ||
279 | item = radix_tree_lookup(&mapping->page_tree, index); | ||
280 | rcu_read_unlock(); | ||
281 | return item == swp_to_radix_entry(swap); | ||
282 | } | ||
283 | |||
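shmem_confirm_swap() guards against the race the comment above spells out: locking the page is not enough, because by then the SwapCache page may have been reused for the same swap slot, so the radix_tree slot itself is re-read and compared with the expected swap entry. A toy model of that recheck (the "mapping" below is just an array of encoded slot values, and the odd-value encoding is an illustration, not the real exceptional-entry format):

#include <stdbool.h>
#include <stdio.h>

/* encode a swap entry as an odd value so it cannot collide with a pointer */
static unsigned long swp_to_slot(unsigned long swp_val)
{
    return (swp_val << 1) | 1;
}

static bool confirm_swap(const unsigned long *mapping, unsigned long index,
                         unsigned long swp_val)
{
    return mapping[index] == swp_to_slot(swp_val);
}

int main(void)
{
    unsigned long mapping[2];

    mapping[0] = swp_to_slot(42);        /* slot still holds our swap entry  */
    mapping[1] = 0x1000;                 /* raced: slot now holds a page     */

    printf("%d %d\n", confirm_swap(mapping, 0, 42), confirm_swap(mapping, 1, 42));
    return 0;
}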
284 | /* | ||
267 | * Like add_to_page_cache_locked, but error if expected item has gone. | 285 | * Like add_to_page_cache_locked, but error if expected item has gone. |
268 | */ | 286 | */ |
269 | static int shmem_add_to_page_cache(struct page *page, | 287 | static int shmem_add_to_page_cache(struct page *page, |
270 | struct address_space *mapping, | 288 | struct address_space *mapping, |
271 | pgoff_t index, gfp_t gfp, void *expected) | 289 | pgoff_t index, gfp_t gfp, void *expected) |
272 | { | 290 | { |
273 | int error = 0; | 291 | int error; |
274 | 292 | ||
275 | VM_BUG_ON(!PageLocked(page)); | 293 | VM_BUG_ON(!PageLocked(page)); |
276 | VM_BUG_ON(!PageSwapBacked(page)); | 294 | VM_BUG_ON(!PageSwapBacked(page)); |
277 | 295 | ||
296 | page_cache_get(page); | ||
297 | page->mapping = mapping; | ||
298 | page->index = index; | ||
299 | |||
300 | spin_lock_irq(&mapping->tree_lock); | ||
278 | if (!expected) | 301 | if (!expected) |
279 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); | 302 | error = radix_tree_insert(&mapping->page_tree, index, page); |
303 | else | ||
304 | error = shmem_radix_tree_replace(mapping, index, expected, | ||
305 | page); | ||
280 | if (!error) { | 306 | if (!error) { |
281 | page_cache_get(page); | 307 | mapping->nrpages++; |
282 | page->mapping = mapping; | 308 | __inc_zone_page_state(page, NR_FILE_PAGES); |
283 | page->index = index; | 309 | __inc_zone_page_state(page, NR_SHMEM); |
284 | 310 | spin_unlock_irq(&mapping->tree_lock); | |
285 | spin_lock_irq(&mapping->tree_lock); | 311 | } else { |
286 | if (!expected) | 312 | page->mapping = NULL; |
287 | error = radix_tree_insert(&mapping->page_tree, | 313 | spin_unlock_irq(&mapping->tree_lock); |
288 | index, page); | 314 | page_cache_release(page); |
289 | else | ||
290 | error = shmem_radix_tree_replace(mapping, index, | ||
291 | expected, page); | ||
292 | if (!error) { | ||
293 | mapping->nrpages++; | ||
294 | __inc_zone_page_state(page, NR_FILE_PAGES); | ||
295 | __inc_zone_page_state(page, NR_SHMEM); | ||
296 | spin_unlock_irq(&mapping->tree_lock); | ||
297 | } else { | ||
298 | page->mapping = NULL; | ||
299 | spin_unlock_irq(&mapping->tree_lock); | ||
300 | page_cache_release(page); | ||
301 | } | ||
302 | if (!expected) | ||
303 | radix_tree_preload_end(); | ||
304 | } | 315 | } |
305 | if (error) | ||
306 | mem_cgroup_uncharge_cache_page(page); | ||
307 | return error; | 316 | return error; |
308 | } | 317 | } |
309 | 318 | ||
@@ -683,10 +692,21 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, | |||
683 | mutex_lock(&shmem_swaplist_mutex); | 692 | mutex_lock(&shmem_swaplist_mutex); |
684 | /* | 693 | /* |
685 | * We needed to drop mutex to make that restrictive page | 694 | * We needed to drop mutex to make that restrictive page |
686 | * allocation; but the inode might already be freed by now, | 695 | * allocation, but the inode might have been freed while we |
687 | * and we cannot refer to inode or mapping or info to check. | 696 | * dropped it: although a racing shmem_evict_inode() cannot |
688 | * However, we do hold page lock on the PageSwapCache page, | 697 | * complete without emptying the radix_tree, our page lock |
689 | * so can check if that still has our reference remaining. | 698 | * on this swapcache page is not enough to prevent that - |
699 | * free_swap_and_cache() of our swap entry will only | ||
700 | * trylock_page(), removing swap from radix_tree whatever. | ||
701 | * | ||
702 | * We must not proceed to shmem_add_to_page_cache() if the | ||
703 | * inode has been freed, but of course we cannot rely on | ||
704 | * inode or mapping or info to check that. However, we can | ||
705 | * safely check if our swap entry is still in use (and here | ||
706 | * it can't have got reused for another page): if it's still | ||
707 | * in use, then the inode cannot have been freed yet, and we | ||
708 | * can safely proceed (if it's no longer in use, that tells | ||
709 | * nothing about the inode, but we don't need to unuse swap). | ||
690 | */ | 710 | */ |
691 | if (!page_swapcount(*pagep)) | 711 | if (!page_swapcount(*pagep)) |
692 | error = -ENOENT; | 712 | error = -ENOENT; |
@@ -730,9 +750,9 @@ int shmem_unuse(swp_entry_t swap, struct page *page) | |||
730 | 750 | ||
731 | /* | 751 | /* |
732 | * There's a faint possibility that swap page was replaced before | 752 | * There's a faint possibility that swap page was replaced before |
733 | * caller locked it: it will come back later with the right page. | 753 | * caller locked it: caller will come back later with the right page. |
734 | */ | 754 | */ |
735 | if (unlikely(!PageSwapCache(page))) | 755 | if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val)) |
736 | goto out; | 756 | goto out; |
737 | 757 | ||
738 | /* | 758 | /* |
@@ -909,7 +929,8 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, | |||
909 | 929 | ||
910 | /* Create a pseudo vma that just contains the policy */ | 930 | /* Create a pseudo vma that just contains the policy */ |
911 | pvma.vm_start = 0; | 931 | pvma.vm_start = 0; |
912 | pvma.vm_pgoff = index; | 932 | /* Bias interleave by inode number to distribute better across nodes */ |
933 | pvma.vm_pgoff = index + info->vfs_inode.i_ino; | ||
913 | pvma.vm_ops = NULL; | 934 | pvma.vm_ops = NULL; |
914 | pvma.vm_policy = spol; | 935 | pvma.vm_policy = spol; |
915 | return swapin_readahead(swap, gfp, &pvma, 0); | 936 | return swapin_readahead(swap, gfp, &pvma, 0); |
@@ -922,7 +943,8 @@ static struct page *shmem_alloc_page(gfp_t gfp, | |||
922 | 943 | ||
923 | /* Create a pseudo vma that just contains the policy */ | 944 | /* Create a pseudo vma that just contains the policy */ |
924 | pvma.vm_start = 0; | 945 | pvma.vm_start = 0; |
925 | pvma.vm_pgoff = index; | 946 | /* Bias interleave by inode number to distribute better across nodes */ |
947 | pvma.vm_pgoff = index + info->vfs_inode.i_ino; | ||
926 | pvma.vm_ops = NULL; | 948 | pvma.vm_ops = NULL; |
927 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); | 949 | pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); |
928 | 950 | ||
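Both pseudo-vma builders above now add the inode number into vm_pgoff, so that under an interleave mempolicy different tmpfs files start their round-robin on different nodes instead of all landing index 0 on node 0. The effect, reduced to the modulo arithmetic (a deliberate simplification of real mempolicy interleaving):

#include <stdio.h>

/* node chosen for a page under a simple interleave policy */
static unsigned interleave_node(unsigned long pgoff, unsigned long ino,
                                unsigned nr_nodes)
{
    return (pgoff + ino) % nr_nodes;     /* bias by inode number */
}

int main(void)
{
    unsigned nr_nodes = 4;
    unsigned long index = 0;

    /* index 0 of four different files now lands on four different nodes */
    for (unsigned long ino = 100; ino < 104; ino++)
        printf("ino %lu -> node %u\n", ino, interleave_node(index, ino, nr_nodes));
    return 0;
}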
@@ -995,21 +1017,15 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, | |||
995 | newpage = shmem_alloc_page(gfp, info, index); | 1017 | newpage = shmem_alloc_page(gfp, info, index); |
996 | if (!newpage) | 1018 | if (!newpage) |
997 | return -ENOMEM; | 1019 | return -ENOMEM; |
998 | VM_BUG_ON(shmem_should_replace_page(newpage, gfp)); | ||
999 | 1020 | ||
1000 | *pagep = newpage; | ||
1001 | page_cache_get(newpage); | 1021 | page_cache_get(newpage); |
1002 | copy_highpage(newpage, oldpage); | 1022 | copy_highpage(newpage, oldpage); |
1023 | flush_dcache_page(newpage); | ||
1003 | 1024 | ||
1004 | VM_BUG_ON(!PageLocked(oldpage)); | ||
1005 | __set_page_locked(newpage); | 1025 | __set_page_locked(newpage); |
1006 | VM_BUG_ON(!PageUptodate(oldpage)); | ||
1007 | SetPageUptodate(newpage); | 1026 | SetPageUptodate(newpage); |
1008 | VM_BUG_ON(!PageSwapBacked(oldpage)); | ||
1009 | SetPageSwapBacked(newpage); | 1027 | SetPageSwapBacked(newpage); |
1010 | VM_BUG_ON(!swap_index); | ||
1011 | set_page_private(newpage, swap_index); | 1028 | set_page_private(newpage, swap_index); |
1012 | VM_BUG_ON(!PageSwapCache(oldpage)); | ||
1013 | SetPageSwapCache(newpage); | 1029 | SetPageSwapCache(newpage); |
1014 | 1030 | ||
1015 | /* | 1031 | /* |
@@ -1019,13 +1035,24 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, | |||
1019 | spin_lock_irq(&swap_mapping->tree_lock); | 1035 | spin_lock_irq(&swap_mapping->tree_lock); |
1020 | error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, | 1036 | error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage, |
1021 | newpage); | 1037 | newpage); |
1022 | __inc_zone_page_state(newpage, NR_FILE_PAGES); | 1038 | if (!error) { |
1023 | __dec_zone_page_state(oldpage, NR_FILE_PAGES); | 1039 | __inc_zone_page_state(newpage, NR_FILE_PAGES); |
1040 | __dec_zone_page_state(oldpage, NR_FILE_PAGES); | ||
1041 | } | ||
1024 | spin_unlock_irq(&swap_mapping->tree_lock); | 1042 | spin_unlock_irq(&swap_mapping->tree_lock); |
1025 | BUG_ON(error); | ||
1026 | 1043 | ||
1027 | mem_cgroup_replace_page_cache(oldpage, newpage); | 1044 | if (unlikely(error)) { |
1028 | lru_cache_add_anon(newpage); | 1045 | /* |
1046 | * Is this possible? I think not, now that our callers check | ||
1047 | * both PageSwapCache and page_private after getting page lock; | ||
1048 | * but be defensive. Reverse old to newpage for clear and free. | ||
1049 | */ | ||
1050 | oldpage = newpage; | ||
1051 | } else { | ||
1052 | mem_cgroup_replace_page_cache(oldpage, newpage); | ||
1053 | lru_cache_add_anon(newpage); | ||
1054 | *pagep = newpage; | ||
1055 | } | ||
1029 | 1056 | ||
1030 | ClearPageSwapCache(oldpage); | 1057 | ClearPageSwapCache(oldpage); |
1031 | set_page_private(oldpage, 0); | 1058 | set_page_private(oldpage, 0); |
@@ -1033,7 +1060,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, | |||
1033 | unlock_page(oldpage); | 1060 | unlock_page(oldpage); |
1034 | page_cache_release(oldpage); | 1061 | page_cache_release(oldpage); |
1035 | page_cache_release(oldpage); | 1062 | page_cache_release(oldpage); |
1036 | return 0; | 1063 | return error; |
1037 | } | 1064 | } |
1038 | 1065 | ||
1039 | /* | 1066 | /* |
@@ -1107,9 +1134,10 @@ repeat: | |||
1107 | 1134 | ||
1108 | /* We have to do this with page locked to prevent races */ | 1135 | /* We have to do this with page locked to prevent races */ |
1109 | lock_page(page); | 1136 | lock_page(page); |
1110 | if (!PageSwapCache(page) || page->mapping) { | 1137 | if (!PageSwapCache(page) || page_private(page) != swap.val || |
1138 | !shmem_confirm_swap(mapping, index, swap)) { | ||
1111 | error = -EEXIST; /* try again */ | 1139 | error = -EEXIST; /* try again */ |
1112 | goto failed; | 1140 | goto unlock; |
1113 | } | 1141 | } |
1114 | if (!PageUptodate(page)) { | 1142 | if (!PageUptodate(page)) { |
1115 | error = -EIO; | 1143 | error = -EIO; |
@@ -1125,9 +1153,12 @@ repeat: | |||
1125 | 1153 | ||
1126 | error = mem_cgroup_cache_charge(page, current->mm, | 1154 | error = mem_cgroup_cache_charge(page, current->mm, |
1127 | gfp & GFP_RECLAIM_MASK); | 1155 | gfp & GFP_RECLAIM_MASK); |
1128 | if (!error) | 1156 | if (!error) { |
1129 | error = shmem_add_to_page_cache(page, mapping, index, | 1157 | error = shmem_add_to_page_cache(page, mapping, index, |
1130 | gfp, swp_to_radix_entry(swap)); | 1158 | gfp, swp_to_radix_entry(swap)); |
1159 | /* We already confirmed swap, and make no allocation */ | ||
1160 | VM_BUG_ON(error); | ||
1161 | } | ||
1131 | if (error) | 1162 | if (error) |
1132 | goto failed; | 1163 | goto failed; |
1133 | 1164 | ||
@@ -1164,11 +1195,18 @@ repeat: | |||
1164 | __set_page_locked(page); | 1195 | __set_page_locked(page); |
1165 | error = mem_cgroup_cache_charge(page, current->mm, | 1196 | error = mem_cgroup_cache_charge(page, current->mm, |
1166 | gfp & GFP_RECLAIM_MASK); | 1197 | gfp & GFP_RECLAIM_MASK); |
1167 | if (!error) | ||
1168 | error = shmem_add_to_page_cache(page, mapping, index, | ||
1169 | gfp, NULL); | ||
1170 | if (error) | 1198 | if (error) |
1171 | goto decused; | 1199 | goto decused; |
1200 | error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); | ||
1201 | if (!error) { | ||
1202 | error = shmem_add_to_page_cache(page, mapping, index, | ||
1203 | gfp, NULL); | ||
1204 | radix_tree_preload_end(); | ||
1205 | } | ||
1206 | if (error) { | ||
1207 | mem_cgroup_uncharge_cache_page(page); | ||
1208 | goto decused; | ||
1209 | } | ||
1172 | lru_cache_add_anon(page); | 1210 | lru_cache_add_anon(page); |
1173 | 1211 | ||
1174 | spin_lock(&info->lock); | 1212 | spin_lock(&info->lock); |
@@ -1228,14 +1266,10 @@ decused: | |||
1228 | unacct: | 1266 | unacct: |
1229 | shmem_unacct_blocks(info->flags, 1); | 1267 | shmem_unacct_blocks(info->flags, 1); |
1230 | failed: | 1268 | failed: |
1231 | if (swap.val && error != -EINVAL) { | 1269 | if (swap.val && error != -EINVAL && |
1232 | struct page *test = find_get_page(mapping, index); | 1270 | !shmem_confirm_swap(mapping, index, swap)) |
1233 | if (test && !radix_tree_exceptional_entry(test)) | 1271 | error = -EEXIST; |
1234 | page_cache_release(test); | 1272 | unlock: |
1235 | /* Have another try if the entry has changed */ | ||
1236 | if (test != swp_to_radix_entry(swap)) | ||
1237 | error = -EEXIST; | ||
1238 | } | ||
1239 | if (page) { | 1273 | if (page) { |
1240 | unlock_page(page); | 1274 | unlock_page(page); |
1241 | page_cache_release(page); | 1275 | page_cache_release(page); |
@@ -1247,7 +1281,7 @@ failed: | |||
1247 | spin_unlock(&info->lock); | 1281 | spin_unlock(&info->lock); |
1248 | goto repeat; | 1282 | goto repeat; |
1249 | } | 1283 | } |
1250 | if (error == -EEXIST) | 1284 | if (error == -EEXIST) /* from above or from radix_tree_insert */ |
1251 | goto repeat; | 1285 | goto repeat; |
1252 | return error; | 1286 | return error; |
1253 | } | 1287 | } |
@@ -1675,98 +1709,6 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, | |||
1675 | return error; | 1709 | return error; |
1676 | } | 1710 | } |
1677 | 1711 | ||
1678 | /* | ||
1679 | * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. | ||
1680 | */ | ||
1681 | static pgoff_t shmem_seek_hole_data(struct address_space *mapping, | ||
1682 | pgoff_t index, pgoff_t end, int origin) | ||
1683 | { | ||
1684 | struct page *page; | ||
1685 | struct pagevec pvec; | ||
1686 | pgoff_t indices[PAGEVEC_SIZE]; | ||
1687 | bool done = false; | ||
1688 | int i; | ||
1689 | |||
1690 | pagevec_init(&pvec, 0); | ||
1691 | pvec.nr = 1; /* start small: we may be there already */ | ||
1692 | while (!done) { | ||
1693 | pvec.nr = shmem_find_get_pages_and_swap(mapping, index, | ||
1694 | pvec.nr, pvec.pages, indices); | ||
1695 | if (!pvec.nr) { | ||
1696 | if (origin == SEEK_DATA) | ||
1697 | index = end; | ||
1698 | break; | ||
1699 | } | ||
1700 | for (i = 0; i < pvec.nr; i++, index++) { | ||
1701 | if (index < indices[i]) { | ||
1702 | if (origin == SEEK_HOLE) { | ||
1703 | done = true; | ||
1704 | break; | ||
1705 | } | ||
1706 | index = indices[i]; | ||
1707 | } | ||
1708 | page = pvec.pages[i]; | ||
1709 | if (page && !radix_tree_exceptional_entry(page)) { | ||
1710 | if (!PageUptodate(page)) | ||
1711 | page = NULL; | ||
1712 | } | ||
1713 | if (index >= end || | ||
1714 | (page && origin == SEEK_DATA) || | ||
1715 | (!page && origin == SEEK_HOLE)) { | ||
1716 | done = true; | ||
1717 | break; | ||
1718 | } | ||
1719 | } | ||
1720 | shmem_deswap_pagevec(&pvec); | ||
1721 | pagevec_release(&pvec); | ||
1722 | pvec.nr = PAGEVEC_SIZE; | ||
1723 | cond_resched(); | ||
1724 | } | ||
1725 | return index; | ||
1726 | } | ||
1727 | |||
1728 | static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) | ||
1729 | { | ||
1730 | struct address_space *mapping; | ||
1731 | struct inode *inode; | ||
1732 | pgoff_t start, end; | ||
1733 | loff_t new_offset; | ||
1734 | |||
1735 | if (origin != SEEK_DATA && origin != SEEK_HOLE) | ||
1736 | return generic_file_llseek_size(file, offset, origin, | ||
1737 | MAX_LFS_FILESIZE); | ||
1738 | mapping = file->f_mapping; | ||
1739 | inode = mapping->host; | ||
1740 | mutex_lock(&inode->i_mutex); | ||
1741 | /* We're holding i_mutex so we can access i_size directly */ | ||
1742 | |||
1743 | if (offset < 0) | ||
1744 | offset = -EINVAL; | ||
1745 | else if (offset >= inode->i_size) | ||
1746 | offset = -ENXIO; | ||
1747 | else { | ||
1748 | start = offset >> PAGE_CACHE_SHIFT; | ||
1749 | end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1750 | new_offset = shmem_seek_hole_data(mapping, start, end, origin); | ||
1751 | new_offset <<= PAGE_CACHE_SHIFT; | ||
1752 | if (new_offset > offset) { | ||
1753 | if (new_offset < inode->i_size) | ||
1754 | offset = new_offset; | ||
1755 | else if (origin == SEEK_DATA) | ||
1756 | offset = -ENXIO; | ||
1757 | else | ||
1758 | offset = inode->i_size; | ||
1759 | } | ||
1760 | } | ||
1761 | |||
1762 | if (offset >= 0 && offset != file->f_pos) { | ||
1763 | file->f_pos = offset; | ||
1764 | file->f_version = 0; | ||
1765 | } | ||
1766 | mutex_unlock(&inode->i_mutex); | ||
1767 | return offset; | ||
1768 | } | ||
1769 | |||
1770 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, | 1712 | static long shmem_fallocate(struct file *file, int mode, loff_t offset, |
1771 | loff_t len) | 1713 | loff_t len) |
1772 | { | 1714 | { |
@@ -1937,7 +1879,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
1937 | } | 1879 | } |
1938 | 1880 | ||
1939 | static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, | 1881 | static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, |
1940 | struct nameidata *nd) | 1882 | bool excl) |
1941 | { | 1883 | { |
1942 | return shmem_mknod(dir, dentry, mode | S_IFREG, 0); | 1884 | return shmem_mknod(dir, dentry, mode | S_IFREG, 0); |
1943 | } | 1885 | } |
@@ -2770,7 +2712,7 @@ static const struct address_space_operations shmem_aops = { | |||
2770 | static const struct file_operations shmem_file_operations = { | 2712 | static const struct file_operations shmem_file_operations = { |
2771 | .mmap = shmem_mmap, | 2713 | .mmap = shmem_mmap, |
2772 | #ifdef CONFIG_TMPFS | 2714 | #ifdef CONFIG_TMPFS |
2773 | .llseek = shmem_file_llseek, | 2715 | .llseek = generic_file_llseek, |
2774 | .read = do_sync_read, | 2716 | .read = do_sync_read, |
2775 | .write = do_sync_write, | 2717 | .write = do_sync_write, |
2776 | .aio_read = shmem_file_aio_read, | 2718 | .aio_read = shmem_file_aio_read, |
@@ -68,7 +68,7 @@ | |||
68 | * Further notes from the original documentation: | 68 | * Further notes from the original documentation: |
69 | * | 69 | * |
70 | * 11 April '97. Started multi-threading - markhe | 70 | * 11 April '97. Started multi-threading - markhe |
71 | * The global cache-chain is protected by the mutex 'cache_chain_mutex'. | 71 | * The global cache-chain is protected by the mutex 'slab_mutex'. |
72 | * The sem is only needed when accessing/extending the cache-chain, which | 72 | * The sem is only needed when accessing/extending the cache-chain, which |
73 | * can never happen inside an interrupt (kmem_cache_create(), | 73 | * can never happen inside an interrupt (kmem_cache_create(), |
74 | * kmem_cache_shrink() and kmem_cache_reap()). | 74 | * kmem_cache_shrink() and kmem_cache_reap()). |
@@ -87,6 +87,7 @@ | |||
87 | */ | 87 | */ |
88 | 88 | ||
89 | #include <linux/slab.h> | 89 | #include <linux/slab.h> |
90 | #include "slab.h" | ||
90 | #include <linux/mm.h> | 91 | #include <linux/mm.h> |
91 | #include <linux/poison.h> | 92 | #include <linux/poison.h> |
92 | #include <linux/swap.h> | 93 | #include <linux/swap.h> |
@@ -117,12 +118,16 @@ | |||
117 | #include <linux/memory.h> | 118 | #include <linux/memory.h> |
118 | #include <linux/prefetch.h> | 119 | #include <linux/prefetch.h> |
119 | 120 | ||
121 | #include <net/sock.h> | ||
122 | |||
120 | #include <asm/cacheflush.h> | 123 | #include <asm/cacheflush.h> |
121 | #include <asm/tlbflush.h> | 124 | #include <asm/tlbflush.h> |
122 | #include <asm/page.h> | 125 | #include <asm/page.h> |
123 | 126 | ||
124 | #include <trace/events/kmem.h> | 127 | #include <trace/events/kmem.h> |
125 | 128 | ||
129 | #include "internal.h" | ||
130 | |||
126 | /* | 131 | /* |
127 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. | 132 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. |
128 | * 0 for faster, smaller code (especially in the critical paths). | 133 | * 0 for faster, smaller code (especially in the critical paths). |
@@ -151,6 +156,12 @@ | |||
151 | #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN | 156 | #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN |
152 | #endif | 157 | #endif |
153 | 158 | ||
159 | /* | ||
160 | * true if a page was allocated from pfmemalloc reserves for network-based | ||
161 | * swap | ||
162 | */ | ||
163 | static bool pfmemalloc_active __read_mostly; | ||
164 | |||
154 | /* Legal flag mask for kmem_cache_create(). */ | 165 | /* Legal flag mask for kmem_cache_create(). */ |
155 | #if DEBUG | 166 | #if DEBUG |
156 | # define CREATE_MASK (SLAB_RED_ZONE | \ | 167 | # define CREATE_MASK (SLAB_RED_ZONE | \ |
@@ -256,9 +267,30 @@ struct array_cache { | |||
256 | * Must have this definition in here for the proper | 267 | * Must have this definition in here for the proper |
257 | * alignment of array_cache. Also simplifies accessing | 268 | * alignment of array_cache. Also simplifies accessing |
258 | * the entries. | 269 | * the entries. |
270 | * | ||
271 | * Entries should not be directly dereferenced as | ||
272 | * entries belonging to slabs marked pfmemalloc will | ||
273 | * have the low bit SLAB_OBJ_PFMEMALLOC set | ||
259 | */ | 274 | */ |
260 | }; | 275 | }; |
261 | 276 | ||
277 | #define SLAB_OBJ_PFMEMALLOC 1 | ||
278 | static inline bool is_obj_pfmemalloc(void *objp) | ||
279 | { | ||
280 | return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; | ||
281 | } | ||
282 | |||
283 | static inline void set_obj_pfmemalloc(void **objp) | ||
284 | { | ||
285 | *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); | ||
286 | return; | ||
287 | } | ||
288 | |||
289 | static inline void clear_obj_pfmemalloc(void **objp) | ||
290 | { | ||
291 | *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); | ||
292 | } | ||
293 | |||
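The helpers above tag an object pointer by setting its lowest bit, which is free to use because slab objects are at least word-aligned; the tag must be stripped before the pointer is dereferenced. A standalone demonstration of the same trick (plain malloc stands in for the slab allocator):

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#define OBJ_TAG 1UL          /* lowest pointer bit, free on aligned objects */

static int  is_tagged(void *p)  { return (unsigned long)p & OBJ_TAG; }
static void set_tag(void **p)   { *p = (void *)((unsigned long)*p | OBJ_TAG); }
static void clear_tag(void **p) { *p = (void *)((unsigned long)*p & ~OBJ_TAG); }

int main(void)
{
    void *obj = malloc(32);   /* malloc returns suitably aligned memory */

    assert(!is_tagged(obj));
    set_tag(&obj);
    assert(is_tagged(obj));
    clear_tag(&obj);          /* must strip the tag before using the pointer */
    printf("tag round-trip ok: %p\n", obj);
    free(obj);
    return 0;
}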
262 | /* | 294 | /* |
263 | * bootstrap: The caches do not work without cpuarrays anymore, but the | 295 | * bootstrap: The caches do not work without cpuarrays anymore, but the |
264 | * cpuarrays are allocated from the generic caches... | 296 | * cpuarrays are allocated from the generic caches... |
@@ -424,8 +456,8 @@ static void kmem_list3_init(struct kmem_list3 *parent) | |||
424 | * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: | 456 | * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: |
425 | * redzone word. | 457 | * redzone word. |
426 | * cachep->obj_offset: The real object. | 458 | * cachep->obj_offset: The real object. |
427 | * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] | 459 | * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] |
428 | * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address | 460 | * cachep->size - 1* BYTES_PER_WORD: last caller address |
429 | * [BYTES_PER_WORD long] | 461 | * [BYTES_PER_WORD long] |
430 | */ | 462 | */ |
431 | static int obj_offset(struct kmem_cache *cachep) | 463 | static int obj_offset(struct kmem_cache *cachep) |
@@ -433,11 +465,6 @@ static int obj_offset(struct kmem_cache *cachep) | |||
433 | return cachep->obj_offset; | 465 | return cachep->obj_offset; |
434 | } | 466 | } |
435 | 467 | ||
436 | static int obj_size(struct kmem_cache *cachep) | ||
437 | { | ||
438 | return cachep->obj_size; | ||
439 | } | ||
440 | |||
441 | static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) | 468 | static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) |
442 | { | 469 | { |
443 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); | 470 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); |
@@ -449,23 +476,22 @@ static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) | |||
449 | { | 476 | { |
450 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); | 477 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); |
451 | if (cachep->flags & SLAB_STORE_USER) | 478 | if (cachep->flags & SLAB_STORE_USER) |
452 | return (unsigned long long *)(objp + cachep->buffer_size - | 479 | return (unsigned long long *)(objp + cachep->size - |
453 | sizeof(unsigned long long) - | 480 | sizeof(unsigned long long) - |
454 | REDZONE_ALIGN); | 481 | REDZONE_ALIGN); |
455 | return (unsigned long long *) (objp + cachep->buffer_size - | 482 | return (unsigned long long *) (objp + cachep->size - |
456 | sizeof(unsigned long long)); | 483 | sizeof(unsigned long long)); |
457 | } | 484 | } |
458 | 485 | ||
459 | static void **dbg_userword(struct kmem_cache *cachep, void *objp) | 486 | static void **dbg_userword(struct kmem_cache *cachep, void *objp) |
460 | { | 487 | { |
461 | BUG_ON(!(cachep->flags & SLAB_STORE_USER)); | 488 | BUG_ON(!(cachep->flags & SLAB_STORE_USER)); |
462 | return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD); | 489 | return (void **)(objp + cachep->size - BYTES_PER_WORD); |
463 | } | 490 | } |
464 | 491 | ||
465 | #else | 492 | #else |
466 | 493 | ||
467 | #define obj_offset(x) 0 | 494 | #define obj_offset(x) 0 |
468 | #define obj_size(cachep) (cachep->buffer_size) | ||
469 | #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) | 495 | #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) |
470 | #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) | 496 | #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) |
471 | #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) | 497 | #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) |
@@ -475,7 +501,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
475 | #ifdef CONFIG_TRACING | 501 | #ifdef CONFIG_TRACING |
476 | size_t slab_buffer_size(struct kmem_cache *cachep) | 502 | size_t slab_buffer_size(struct kmem_cache *cachep) |
477 | { | 503 | { |
478 | return cachep->buffer_size; | 504 | return cachep->size; |
479 | } | 505 | } |
480 | EXPORT_SYMBOL(slab_buffer_size); | 506 | EXPORT_SYMBOL(slab_buffer_size); |
481 | #endif | 507 | #endif |
@@ -489,56 +515,37 @@ EXPORT_SYMBOL(slab_buffer_size); | |||
489 | static int slab_max_order = SLAB_MAX_ORDER_LO; | 515 | static int slab_max_order = SLAB_MAX_ORDER_LO; |
490 | static bool slab_max_order_set __initdata; | 516 | static bool slab_max_order_set __initdata; |
491 | 517 | ||
492 | /* | ||
493 | * Functions for storing/retrieving the cachep and or slab from the page | ||
494 | * allocator. These are used to find the slab an obj belongs to. With kfree(), | ||
495 | * these are used to find the cache which an obj belongs to. | ||
496 | */ | ||
497 | static inline void page_set_cache(struct page *page, struct kmem_cache *cache) | ||
498 | { | ||
499 | page->lru.next = (struct list_head *)cache; | ||
500 | } | ||
501 | |||
502 | static inline struct kmem_cache *page_get_cache(struct page *page) | 518 | static inline struct kmem_cache *page_get_cache(struct page *page) |
503 | { | 519 | { |
504 | page = compound_head(page); | 520 | page = compound_head(page); |
505 | BUG_ON(!PageSlab(page)); | 521 | BUG_ON(!PageSlab(page)); |
506 | return (struct kmem_cache *)page->lru.next; | 522 | return page->slab_cache; |
507 | } | ||
508 | |||
509 | static inline void page_set_slab(struct page *page, struct slab *slab) | ||
510 | { | ||
511 | page->lru.prev = (struct list_head *)slab; | ||
512 | } | ||
513 | |||
514 | static inline struct slab *page_get_slab(struct page *page) | ||
515 | { | ||
516 | BUG_ON(!PageSlab(page)); | ||
517 | return (struct slab *)page->lru.prev; | ||
518 | } | 523 | } |
519 | 524 | ||
520 | static inline struct kmem_cache *virt_to_cache(const void *obj) | 525 | static inline struct kmem_cache *virt_to_cache(const void *obj) |
521 | { | 526 | { |
522 | struct page *page = virt_to_head_page(obj); | 527 | struct page *page = virt_to_head_page(obj); |
523 | return page_get_cache(page); | 528 | return page->slab_cache; |
524 | } | 529 | } |
525 | 530 | ||
526 | static inline struct slab *virt_to_slab(const void *obj) | 531 | static inline struct slab *virt_to_slab(const void *obj) |
527 | { | 532 | { |
528 | struct page *page = virt_to_head_page(obj); | 533 | struct page *page = virt_to_head_page(obj); |
529 | return page_get_slab(page); | 534 | |
535 | VM_BUG_ON(!PageSlab(page)); | ||
536 | return page->slab_page; | ||
530 | } | 537 | } |
531 | 538 | ||
532 | static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, | 539 | static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, |
533 | unsigned int idx) | 540 | unsigned int idx) |
534 | { | 541 | { |
535 | return slab->s_mem + cache->buffer_size * idx; | 542 | return slab->s_mem + cache->size * idx; |
536 | } | 543 | } |
537 | 544 | ||
538 | /* | 545 | /* |
539 | * We want to avoid an expensive divide : (offset / cache->buffer_size) | 546 | * We want to avoid an expensive divide : (offset / cache->size) |
540 | * Using the fact that buffer_size is a constant for a particular cache, | 547 | * Using the fact that size is a constant for a particular cache, |
541 | * we can replace (offset / cache->buffer_size) by | 548 | * we can replace (offset / cache->size) by |
542 | * reciprocal_divide(offset, cache->reciprocal_buffer_size) | 549 | * reciprocal_divide(offset, cache->reciprocal_buffer_size) |
543 | */ | 550 | */ |
544 | static inline unsigned int obj_to_index(const struct kmem_cache *cache, | 551 | static inline unsigned int obj_to_index(const struct kmem_cache *cache, |
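obj_to_index() would otherwise divide an offset by the object size on every lookup, so the cache precomputes a fixed-point reciprocal and replaces the division with a multiply and shift. A self-checking sketch of that technique (this shows the general method; the kernel's reciprocal_divide() helper may differ in detail):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* precompute r ~= 2^32 / d so that (a * r) >> 32 == a / d for the offsets used here */
static uint32_t reciprocal_value(uint32_t d)
{
    return (uint32_t)(((1ULL << 32) + d - 1) / d);
}

static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
{
    return (uint32_t)(((uint64_t)a * r) >> 32);
}

int main(void)
{
    uint32_t size = 192;                       /* e.g. object size in bytes */
    uint32_t r = reciprocal_value(size);

    /* verify the shortcut matches real division for object-aligned offsets */
    for (uint32_t off = 0; off < (1u << 20); off += size)
        assert(reciprocal_divide(off, r) == off / size);
    printf("reciprocal divide ok (r=%u)\n", r);
    return 0;
}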
@@ -584,33 +591,12 @@ static struct kmem_cache cache_cache = { | |||
584 | .batchcount = 1, | 591 | .batchcount = 1, |
585 | .limit = BOOT_CPUCACHE_ENTRIES, | 592 | .limit = BOOT_CPUCACHE_ENTRIES, |
586 | .shared = 1, | 593 | .shared = 1, |
587 | .buffer_size = sizeof(struct kmem_cache), | 594 | .size = sizeof(struct kmem_cache), |
588 | .name = "kmem_cache", | 595 | .name = "kmem_cache", |
589 | }; | 596 | }; |
590 | 597 | ||
591 | #define BAD_ALIEN_MAGIC 0x01020304ul | 598 | #define BAD_ALIEN_MAGIC 0x01020304ul |
592 | 599 | ||
593 | /* | ||
594 | * chicken and egg problem: delay the per-cpu array allocation | ||
595 | * until the general caches are up. | ||
596 | */ | ||
597 | static enum { | ||
598 | NONE, | ||
599 | PARTIAL_AC, | ||
600 | PARTIAL_L3, | ||
601 | EARLY, | ||
602 | LATE, | ||
603 | FULL | ||
604 | } g_cpucache_up; | ||
605 | |||
606 | /* | ||
607 | * used by boot code to determine if it can use slab based allocator | ||
608 | */ | ||
609 | int slab_is_available(void) | ||
610 | { | ||
611 | return g_cpucache_up >= EARLY; | ||
612 | } | ||
613 | |||
614 | #ifdef CONFIG_LOCKDEP | 600 | #ifdef CONFIG_LOCKDEP |
615 | 601 | ||
616 | /* | 602 | /* |
@@ -676,7 +662,7 @@ static void init_node_lock_keys(int q) | |||
676 | { | 662 | { |
677 | struct cache_sizes *s = malloc_sizes; | 663 | struct cache_sizes *s = malloc_sizes; |
678 | 664 | ||
679 | if (g_cpucache_up < LATE) | 665 | if (slab_state < UP) |
680 | return; | 666 | return; |
681 | 667 | ||
682 | for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { | 668 | for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { |
@@ -716,12 +702,6 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) | |||
716 | } | 702 | } |
717 | #endif | 703 | #endif |
718 | 704 | ||
719 | /* | ||
720 | * Guard access to the cache-chain. | ||
721 | */ | ||
722 | static DEFINE_MUTEX(cache_chain_mutex); | ||
723 | static struct list_head cache_chain; | ||
724 | |||
725 | static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); | 705 | static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); |
726 | 706 | ||
727 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | 707 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) |
@@ -951,6 +931,124 @@ static struct array_cache *alloc_arraycache(int node, int entries, | |||
951 | return nc; | 931 | return nc; |
952 | } | 932 | } |
953 | 933 | ||
934 | static inline bool is_slab_pfmemalloc(struct slab *slabp) | ||
935 | { | ||
936 | struct page *page = virt_to_page(slabp->s_mem); | ||
937 | |||
938 | return PageSlabPfmemalloc(page); | ||
939 | } | ||
940 | |||
941 | /* Clears pfmemalloc_active if no slabs have pfmalloc set */ | ||
942 | static void recheck_pfmemalloc_active(struct kmem_cache *cachep, | ||
943 | struct array_cache *ac) | ||
944 | { | ||
945 | struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()]; | ||
946 | struct slab *slabp; | ||
947 | unsigned long flags; | ||
948 | |||
949 | if (!pfmemalloc_active) | ||
950 | return; | ||
951 | |||
952 | spin_lock_irqsave(&l3->list_lock, flags); | ||
953 | list_for_each_entry(slabp, &l3->slabs_full, list) | ||
954 | if (is_slab_pfmemalloc(slabp)) | ||
955 | goto out; | ||
956 | |||
957 | list_for_each_entry(slabp, &l3->slabs_partial, list) | ||
958 | if (is_slab_pfmemalloc(slabp)) | ||
959 | goto out; | ||
960 | |||
961 | list_for_each_entry(slabp, &l3->slabs_free, list) | ||
962 | if (is_slab_pfmemalloc(slabp)) | ||
963 | goto out; | ||
964 | |||
965 | pfmemalloc_active = false; | ||
966 | out: | ||
967 | spin_unlock_irqrestore(&l3->list_lock, flags); | ||
968 | } | ||
969 | |||
970 | static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, | ||
971 | gfp_t flags, bool force_refill) | ||
972 | { | ||
973 | int i; | ||
974 | void *objp = ac->entry[--ac->avail]; | ||
975 | |||
976 | /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ | ||
977 | if (unlikely(is_obj_pfmemalloc(objp))) { | ||
978 | struct kmem_list3 *l3; | ||
979 | |||
980 | if (gfp_pfmemalloc_allowed(flags)) { | ||
981 | clear_obj_pfmemalloc(&objp); | ||
982 | return objp; | ||
983 | } | ||
984 | |||
985 | /* The caller cannot use PFMEMALLOC objects, find another one */ | ||
986 | for (i = 1; i < ac->avail; i++) { | ||
987 | /* If a !PFMEMALLOC object is found, swap them */ | ||
988 | if (!is_obj_pfmemalloc(ac->entry[i])) { | ||
989 | objp = ac->entry[i]; | ||
990 | ac->entry[i] = ac->entry[ac->avail]; | ||
991 | ac->entry[ac->avail] = objp; | ||
992 | return objp; | ||
993 | } | ||
994 | } | ||
995 | |||
996 | /* | ||
997 | * If there are empty slabs on the slabs_free list and we are | ||
998 | * being forced to refill the cache, mark this one !pfmemalloc. | ||
999 | */ | ||
1000 | l3 = cachep->nodelists[numa_mem_id()]; | ||
1001 | if (!list_empty(&l3->slabs_free) && force_refill) { | ||
1002 | struct slab *slabp = virt_to_slab(objp); | ||
1003 | ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem)); | ||
1004 | clear_obj_pfmemalloc(&objp); | ||
1005 | recheck_pfmemalloc_active(cachep, ac); | ||
1006 | return objp; | ||
1007 | } | ||
1008 | |||
1009 | /* No !PFMEMALLOC objects available */ | ||
1010 | ac->avail++; | ||
1011 | objp = NULL; | ||
1012 | } | ||
1013 | |||
1014 | return objp; | ||
1015 | } | ||
1016 | |||
1017 | static inline void *ac_get_obj(struct kmem_cache *cachep, | ||
1018 | struct array_cache *ac, gfp_t flags, bool force_refill) | ||
1019 | { | ||
1020 | void *objp; | ||
1021 | |||
1022 | if (unlikely(sk_memalloc_socks())) | ||
1023 | objp = __ac_get_obj(cachep, ac, flags, force_refill); | ||
1024 | else | ||
1025 | objp = ac->entry[--ac->avail]; | ||
1026 | |||
1027 | return objp; | ||
1028 | } | ||
1029 | |||
1030 | static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, | ||
1031 | void *objp) | ||
1032 | { | ||
1033 | if (unlikely(pfmemalloc_active)) { | ||
1034 | /* Some pfmemalloc slabs exist, check if this is one */ | ||
1035 | struct page *page = virt_to_page(objp); | ||
1036 | if (PageSlabPfmemalloc(page)) | ||
1037 | set_obj_pfmemalloc(&objp); | ||
1038 | } | ||
1039 | |||
1040 | return objp; | ||
1041 | } | ||
1042 | |||
1043 | static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, | ||
1044 | void *objp) | ||
1045 | { | ||
1046 | if (unlikely(sk_memalloc_socks())) | ||
1047 | objp = __ac_put_obj(cachep, ac, objp); | ||
1048 | |||
1049 | ac->entry[ac->avail++] = objp; | ||
1050 | } | ||
1051 | |||
954 | /* | 1052 | /* |
955 | * Transfer objects in one arraycache to another. | 1053 | * Transfer objects in one arraycache to another. |
956 | * Locking must be handled by the caller. | 1054 | * Locking must be handled by the caller. |
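The ac_get_obj()/ac_put_obj() wrappers above depend on tagging the object pointer itself while it sits in the per-CPU array: set_obj_pfmemalloc() folds a flag into a spare low bit of the address (slab objects are at least word-aligned), is_obj_pfmemalloc() tests it, and clear_obj_pfmemalloc() strips it before the pointer is handed out. Those helpers are defined elsewhere in the patch; the standalone sketch below only illustrates the tagging idea, and the OBJ_PFMEMALLOC mask name is invented here.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define OBJ_PFMEMALLOC  0x1UL   /* spare low bit: objects are word-aligned */

    static inline bool is_obj_pfmemalloc(void *objp)
    {
            return (uintptr_t)objp & OBJ_PFMEMALLOC;
    }

    static inline void set_obj_pfmemalloc(void **objp)
    {
            *objp = (void *)((uintptr_t)*objp | OBJ_PFMEMALLOC);
    }

    static inline void clear_obj_pfmemalloc(void **objp)
    {
            *objp = (void *)((uintptr_t)*objp & ~OBJ_PFMEMALLOC);
    }

    int main(void)
    {
            long storage;                   /* stand-in for a slab object */
            void *objp = &storage;

            set_obj_pfmemalloc(&objp);      /* mark while cached in the array */
            assert(is_obj_pfmemalloc(objp));
            clear_obj_pfmemalloc(&objp);    /* strip the tag before handing out */
            assert(objp == (void *)&storage);
            return 0;
    }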
@@ -1127,7 +1225,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
1127 | STATS_INC_ACOVERFLOW(cachep); | 1225 | STATS_INC_ACOVERFLOW(cachep); |
1128 | __drain_alien_cache(cachep, alien, nodeid); | 1226 | __drain_alien_cache(cachep, alien, nodeid); |
1129 | } | 1227 | } |
1130 | alien->entry[alien->avail++] = objp; | 1228 | ac_put_obj(cachep, alien, objp); |
1131 | spin_unlock(&alien->lock); | 1229 | spin_unlock(&alien->lock); |
1132 | } else { | 1230 | } else { |
1133 | spin_lock(&(cachep->nodelists[nodeid])->list_lock); | 1231 | spin_lock(&(cachep->nodelists[nodeid])->list_lock); |
@@ -1145,7 +1243,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
1145 | * When hotplugging memory or a cpu, existing nodelists are not replaced if | 1243 | * When hotplugging memory or a cpu, existing nodelists are not replaced if |
1146 | * already in use. | 1244 | * already in use. |
1147 | * | 1245 | * |
1148 | * Must hold cache_chain_mutex. | 1246 | * Must hold slab_mutex. |
1149 | */ | 1247 | */ |
1150 | static int init_cache_nodelists_node(int node) | 1248 | static int init_cache_nodelists_node(int node) |
1151 | { | 1249 | { |
@@ -1153,7 +1251,7 @@ static int init_cache_nodelists_node(int node) | |||
1153 | struct kmem_list3 *l3; | 1251 | struct kmem_list3 *l3; |
1154 | const int memsize = sizeof(struct kmem_list3); | 1252 | const int memsize = sizeof(struct kmem_list3); |
1155 | 1253 | ||
1156 | list_for_each_entry(cachep, &cache_chain, next) { | 1254 | list_for_each_entry(cachep, &slab_caches, list) { |
1157 | /* | 1255 | /* |
1158 | * Set up the size64 kmemlist for cpu before we can | 1256 | * Set up the size64 kmemlist for cpu before we can |
1159 | * begin anything. Make sure some other cpu on this | 1257 | * begin anything. Make sure some other cpu on this |
@@ -1169,7 +1267,7 @@ static int init_cache_nodelists_node(int node) | |||
1169 | 1267 | ||
1170 | /* | 1268 | /* |
1171 | * The l3s don't come and go as CPUs come and | 1269 | * The l3s don't come and go as CPUs come and |
1172 | * go. cache_chain_mutex is sufficient | 1270 | * go. slab_mutex is sufficient |
1173 | * protection here. | 1271 | * protection here. |
1174 | */ | 1272 | */ |
1175 | cachep->nodelists[node] = l3; | 1273 | cachep->nodelists[node] = l3; |
@@ -1191,7 +1289,7 @@ static void __cpuinit cpuup_canceled(long cpu) | |||
1191 | int node = cpu_to_mem(cpu); | 1289 | int node = cpu_to_mem(cpu); |
1192 | const struct cpumask *mask = cpumask_of_node(node); | 1290 | const struct cpumask *mask = cpumask_of_node(node); |
1193 | 1291 | ||
1194 | list_for_each_entry(cachep, &cache_chain, next) { | 1292 | list_for_each_entry(cachep, &slab_caches, list) { |
1195 | struct array_cache *nc; | 1293 | struct array_cache *nc; |
1196 | struct array_cache *shared; | 1294 | struct array_cache *shared; |
1197 | struct array_cache **alien; | 1295 | struct array_cache **alien; |
@@ -1241,7 +1339,7 @@ free_array_cache: | |||
1241 | * the respective cache's slabs, now we can go ahead and | 1339 | * the respective cache's slabs, now we can go ahead and |
1242 | * shrink each nodelist to its limit. | 1340 | * shrink each nodelist to its limit. |
1243 | */ | 1341 | */ |
1244 | list_for_each_entry(cachep, &cache_chain, next) { | 1342 | list_for_each_entry(cachep, &slab_caches, list) { |
1245 | l3 = cachep->nodelists[node]; | 1343 | l3 = cachep->nodelists[node]; |
1246 | if (!l3) | 1344 | if (!l3) |
1247 | continue; | 1345 | continue; |
@@ -1270,7 +1368,7 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
1270 | * Now we can go ahead with allocating the shared arrays and | 1368 | * Now we can go ahead with allocating the shared arrays and |
1271 | * array caches | 1369 | * array caches |
1272 | */ | 1370 | */ |
1273 | list_for_each_entry(cachep, &cache_chain, next) { | 1371 | list_for_each_entry(cachep, &slab_caches, list) { |
1274 | struct array_cache *nc; | 1372 | struct array_cache *nc; |
1275 | struct array_cache *shared = NULL; | 1373 | struct array_cache *shared = NULL; |
1276 | struct array_cache **alien = NULL; | 1374 | struct array_cache **alien = NULL; |
@@ -1338,9 +1436,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1338 | switch (action) { | 1436 | switch (action) { |
1339 | case CPU_UP_PREPARE: | 1437 | case CPU_UP_PREPARE: |
1340 | case CPU_UP_PREPARE_FROZEN: | 1438 | case CPU_UP_PREPARE_FROZEN: |
1341 | mutex_lock(&cache_chain_mutex); | 1439 | mutex_lock(&slab_mutex); |
1342 | err = cpuup_prepare(cpu); | 1440 | err = cpuup_prepare(cpu); |
1343 | mutex_unlock(&cache_chain_mutex); | 1441 | mutex_unlock(&slab_mutex); |
1344 | break; | 1442 | break; |
1345 | case CPU_ONLINE: | 1443 | case CPU_ONLINE: |
1346 | case CPU_ONLINE_FROZEN: | 1444 | case CPU_ONLINE_FROZEN: |
@@ -1350,7 +1448,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1350 | case CPU_DOWN_PREPARE: | 1448 | case CPU_DOWN_PREPARE: |
1351 | case CPU_DOWN_PREPARE_FROZEN: | 1449 | case CPU_DOWN_PREPARE_FROZEN: |
1352 | /* | 1450 | /* |
1353 | * Shutdown cache reaper. Note that the cache_chain_mutex is | 1451 | * Shutdown cache reaper. Note that the slab_mutex is |
1354 | * held so that if cache_reap() is invoked it cannot do | 1452 | * held so that if cache_reap() is invoked it cannot do |
1355 | * anything expensive but will only modify reap_work | 1453 | * anything expensive but will only modify reap_work |
1356 | * and reschedule the timer. | 1454 | * and reschedule the timer. |
@@ -1377,9 +1475,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1377 | #endif | 1475 | #endif |
1378 | case CPU_UP_CANCELED: | 1476 | case CPU_UP_CANCELED: |
1379 | case CPU_UP_CANCELED_FROZEN: | 1477 | case CPU_UP_CANCELED_FROZEN: |
1380 | mutex_lock(&cache_chain_mutex); | 1478 | mutex_lock(&slab_mutex); |
1381 | cpuup_canceled(cpu); | 1479 | cpuup_canceled(cpu); |
1382 | mutex_unlock(&cache_chain_mutex); | 1480 | mutex_unlock(&slab_mutex); |
1383 | break; | 1481 | break; |
1384 | } | 1482 | } |
1385 | return notifier_from_errno(err); | 1483 | return notifier_from_errno(err); |
@@ -1395,14 +1493,14 @@ static struct notifier_block __cpuinitdata cpucache_notifier = { | |||
1395 | * Returns -EBUSY if all objects cannot be drained so that the node is not | 1493 | * Returns -EBUSY if all objects cannot be drained so that the node is not |
1396 | * removed. | 1494 | * removed. |
1397 | * | 1495 | * |
1398 | * Must hold cache_chain_mutex. | 1496 | * Must hold slab_mutex. |
1399 | */ | 1497 | */ |
1400 | static int __meminit drain_cache_nodelists_node(int node) | 1498 | static int __meminit drain_cache_nodelists_node(int node) |
1401 | { | 1499 | { |
1402 | struct kmem_cache *cachep; | 1500 | struct kmem_cache *cachep; |
1403 | int ret = 0; | 1501 | int ret = 0; |
1404 | 1502 | ||
1405 | list_for_each_entry(cachep, &cache_chain, next) { | 1503 | list_for_each_entry(cachep, &slab_caches, list) { |
1406 | struct kmem_list3 *l3; | 1504 | struct kmem_list3 *l3; |
1407 | 1505 | ||
1408 | l3 = cachep->nodelists[node]; | 1506 | l3 = cachep->nodelists[node]; |
@@ -1433,14 +1531,14 @@ static int __meminit slab_memory_callback(struct notifier_block *self, | |||
1433 | 1531 | ||
1434 | switch (action) { | 1532 | switch (action) { |
1435 | case MEM_GOING_ONLINE: | 1533 | case MEM_GOING_ONLINE: |
1436 | mutex_lock(&cache_chain_mutex); | 1534 | mutex_lock(&slab_mutex); |
1437 | ret = init_cache_nodelists_node(nid); | 1535 | ret = init_cache_nodelists_node(nid); |
1438 | mutex_unlock(&cache_chain_mutex); | 1536 | mutex_unlock(&slab_mutex); |
1439 | break; | 1537 | break; |
1440 | case MEM_GOING_OFFLINE: | 1538 | case MEM_GOING_OFFLINE: |
1441 | mutex_lock(&cache_chain_mutex); | 1539 | mutex_lock(&slab_mutex); |
1442 | ret = drain_cache_nodelists_node(nid); | 1540 | ret = drain_cache_nodelists_node(nid); |
1443 | mutex_unlock(&cache_chain_mutex); | 1541 | mutex_unlock(&slab_mutex); |
1444 | break; | 1542 | break; |
1445 | case MEM_ONLINE: | 1543 | case MEM_ONLINE: |
1446 | case MEM_OFFLINE: | 1544 | case MEM_OFFLINE: |
@@ -1544,8 +1642,8 @@ void __init kmem_cache_init(void) | |||
1544 | node = numa_mem_id(); | 1642 | node = numa_mem_id(); |
1545 | 1643 | ||
1546 | /* 1) create the cache_cache */ | 1644 | /* 1) create the cache_cache */ |
1547 | INIT_LIST_HEAD(&cache_chain); | 1645 | INIT_LIST_HEAD(&slab_caches); |
1548 | list_add(&cache_cache.next, &cache_chain); | 1646 | list_add(&cache_cache.list, &slab_caches); |
1549 | cache_cache.colour_off = cache_line_size(); | 1647 | cache_cache.colour_off = cache_line_size(); |
1550 | cache_cache.array[smp_processor_id()] = &initarray_cache.cache; | 1648 | cache_cache.array[smp_processor_id()] = &initarray_cache.cache; |
1551 | cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; | 1649 | cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; |
@@ -1553,18 +1651,16 @@ void __init kmem_cache_init(void) | |||
1553 | /* | 1651 | /* |
1554 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids | 1652 | * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids |
1555 | */ | 1653 | */ |
1556 | cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + | 1654 | cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + |
1557 | nr_node_ids * sizeof(struct kmem_list3 *); | 1655 | nr_node_ids * sizeof(struct kmem_list3 *); |
1558 | #if DEBUG | 1656 | cache_cache.object_size = cache_cache.size; |
1559 | cache_cache.obj_size = cache_cache.buffer_size; | 1657 | cache_cache.size = ALIGN(cache_cache.size, |
1560 | #endif | ||
1561 | cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, | ||
1562 | cache_line_size()); | 1658 | cache_line_size()); |
1563 | cache_cache.reciprocal_buffer_size = | 1659 | cache_cache.reciprocal_buffer_size = |
1564 | reciprocal_value(cache_cache.buffer_size); | 1660 | reciprocal_value(cache_cache.size); |
1565 | 1661 | ||
1566 | for (order = 0; order < MAX_ORDER; order++) { | 1662 | for (order = 0; order < MAX_ORDER; order++) { |
1567 | cache_estimate(order, cache_cache.buffer_size, | 1663 | cache_estimate(order, cache_cache.size, |
1568 | cache_line_size(), 0, &left_over, &cache_cache.num); | 1664 | cache_line_size(), 0, &left_over, &cache_cache.num); |
1569 | if (cache_cache.num) | 1665 | if (cache_cache.num) |
1570 | break; | 1666 | break; |
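The bootstrap sizing above exploits the layout of struct kmem_cache: the per-CPU array[] is the trailing member, sized for nr_cpu_ids slots, and the nodelists[] pointer block is parked directly behind it (see the &cachep->array[nr_cpu_ids] assignment later in this diff), so the real footprint is the end of array[] plus nr_node_ids node pointers, rounded up to a cache line. A small userspace illustration with stand-in types; the struct body and the 64-byte line size are assumptions.

    #include <stddef.h>
    #include <stdio.h>

    struct node_list;                       /* stand-in for struct kmem_list3 */

    struct fake_cache {                     /* stand-in for struct kmem_cache */
            unsigned int size;
            struct node_list **nodelists;   /* will point just past array[] */
            void *array[1];                 /* really nr_cpu_ids entries */
    };

    int main(void)
    {
            size_t nr_cpu_ids = 8, nr_node_ids = 2, line = 64;

            /* equivalent to offsetof(struct fake_cache, array[nr_cpu_ids]) */
            size_t sz = offsetof(struct fake_cache, array) +
                        nr_cpu_ids * sizeof(void *) +
                        nr_node_ids * sizeof(struct node_list *);

            sz = (sz + line - 1) & ~(line - 1); /* ALIGN(sz, cache_line_size()) */
            printf("bootstrap kmem_cache footprint: %zu bytes\n", sz);
            return 0;
    }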
@@ -1585,7 +1681,7 @@ void __init kmem_cache_init(void) | |||
1585 | * bug. | 1681 | * bug. |
1586 | */ | 1682 | */ |
1587 | 1683 | ||
1588 | sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, | 1684 | sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name, |
1589 | sizes[INDEX_AC].cs_size, | 1685 | sizes[INDEX_AC].cs_size, |
1590 | ARCH_KMALLOC_MINALIGN, | 1686 | ARCH_KMALLOC_MINALIGN, |
1591 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, | 1687 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
@@ -1593,7 +1689,7 @@ void __init kmem_cache_init(void) | |||
1593 | 1689 | ||
1594 | if (INDEX_AC != INDEX_L3) { | 1690 | if (INDEX_AC != INDEX_L3) { |
1595 | sizes[INDEX_L3].cs_cachep = | 1691 | sizes[INDEX_L3].cs_cachep = |
1596 | kmem_cache_create(names[INDEX_L3].name, | 1692 | __kmem_cache_create(names[INDEX_L3].name, |
1597 | sizes[INDEX_L3].cs_size, | 1693 | sizes[INDEX_L3].cs_size, |
1598 | ARCH_KMALLOC_MINALIGN, | 1694 | ARCH_KMALLOC_MINALIGN, |
1599 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, | 1695 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
@@ -1611,14 +1707,14 @@ void __init kmem_cache_init(void) | |||
1611 | * allow tighter packing of the smaller caches. | 1707 | * allow tighter packing of the smaller caches. |
1612 | */ | 1708 | */ |
1613 | if (!sizes->cs_cachep) { | 1709 | if (!sizes->cs_cachep) { |
1614 | sizes->cs_cachep = kmem_cache_create(names->name, | 1710 | sizes->cs_cachep = __kmem_cache_create(names->name, |
1615 | sizes->cs_size, | 1711 | sizes->cs_size, |
1616 | ARCH_KMALLOC_MINALIGN, | 1712 | ARCH_KMALLOC_MINALIGN, |
1617 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, | 1713 | ARCH_KMALLOC_FLAGS|SLAB_PANIC, |
1618 | NULL); | 1714 | NULL); |
1619 | } | 1715 | } |
1620 | #ifdef CONFIG_ZONE_DMA | 1716 | #ifdef CONFIG_ZONE_DMA |
1621 | sizes->cs_dmacachep = kmem_cache_create( | 1717 | sizes->cs_dmacachep = __kmem_cache_create( |
1622 | names->name_dma, | 1718 | names->name_dma, |
1623 | sizes->cs_size, | 1719 | sizes->cs_size, |
1624 | ARCH_KMALLOC_MINALIGN, | 1720 | ARCH_KMALLOC_MINALIGN, |
@@ -1676,27 +1772,27 @@ void __init kmem_cache_init(void) | |||
1676 | } | 1772 | } |
1677 | } | 1773 | } |
1678 | 1774 | ||
1679 | g_cpucache_up = EARLY; | 1775 | slab_state = UP; |
1680 | } | 1776 | } |
1681 | 1777 | ||
1682 | void __init kmem_cache_init_late(void) | 1778 | void __init kmem_cache_init_late(void) |
1683 | { | 1779 | { |
1684 | struct kmem_cache *cachep; | 1780 | struct kmem_cache *cachep; |
1685 | 1781 | ||
1686 | g_cpucache_up = LATE; | 1782 | slab_state = UP; |
1687 | 1783 | ||
1688 | /* Annotate slab for lockdep -- annotate the malloc caches */ | 1784 | /* Annotate slab for lockdep -- annotate the malloc caches */ |
1689 | init_lock_keys(); | 1785 | init_lock_keys(); |
1690 | 1786 | ||
1691 | /* 6) resize the head arrays to their final sizes */ | 1787 | /* 6) resize the head arrays to their final sizes */ |
1692 | mutex_lock(&cache_chain_mutex); | 1788 | mutex_lock(&slab_mutex); |
1693 | list_for_each_entry(cachep, &cache_chain, next) | 1789 | list_for_each_entry(cachep, &slab_caches, list) |
1694 | if (enable_cpucache(cachep, GFP_NOWAIT)) | 1790 | if (enable_cpucache(cachep, GFP_NOWAIT)) |
1695 | BUG(); | 1791 | BUG(); |
1696 | mutex_unlock(&cache_chain_mutex); | 1792 | mutex_unlock(&slab_mutex); |
1697 | 1793 | ||
1698 | /* Done! */ | 1794 | /* Done! */ |
1699 | g_cpucache_up = FULL; | 1795 | slab_state = FULL; |
1700 | 1796 | ||
1701 | /* | 1797 | /* |
1702 | * Register a cpu startup notifier callback that initializes | 1798 | * Register a cpu startup notifier callback that initializes |
@@ -1727,6 +1823,9 @@ static int __init cpucache_init(void) | |||
1727 | */ | 1823 | */ |
1728 | for_each_online_cpu(cpu) | 1824 | for_each_online_cpu(cpu) |
1729 | start_cpu_timer(cpu); | 1825 | start_cpu_timer(cpu); |
1826 | |||
1827 | /* Done! */ | ||
1828 | slab_state = FULL; | ||
1730 | return 0; | 1829 | return 0; |
1731 | } | 1830 | } |
1732 | __initcall(cpucache_init); | 1831 | __initcall(cpucache_init); |
@@ -1743,7 +1842,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) | |||
1743 | "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", | 1842 | "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", |
1744 | nodeid, gfpflags); | 1843 | nodeid, gfpflags); |
1745 | printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", | 1844 | printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", |
1746 | cachep->name, cachep->buffer_size, cachep->gfporder); | 1845 | cachep->name, cachep->size, cachep->gfporder); |
1747 | 1846 | ||
1748 | for_each_online_node(node) { | 1847 | for_each_online_node(node) { |
1749 | unsigned long active_objs = 0, num_objs = 0, free_objects = 0; | 1848 | unsigned long active_objs = 0, num_objs = 0, free_objects = 0; |
@@ -1798,7 +1897,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1798 | flags |= __GFP_COMP; | 1897 | flags |= __GFP_COMP; |
1799 | #endif | 1898 | #endif |
1800 | 1899 | ||
1801 | flags |= cachep->gfpflags; | 1900 | flags |= cachep->allocflags; |
1802 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1901 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1803 | flags |= __GFP_RECLAIMABLE; | 1902 | flags |= __GFP_RECLAIMABLE; |
1804 | 1903 | ||
@@ -1809,6 +1908,10 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1809 | return NULL; | 1908 | return NULL; |
1810 | } | 1909 | } |
1811 | 1910 | ||
1911 | /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ | ||
1912 | if (unlikely(page->pfmemalloc)) | ||
1913 | pfmemalloc_active = true; | ||
1914 | |||
1812 | nr_pages = (1 << cachep->gfporder); | 1915 | nr_pages = (1 << cachep->gfporder); |
1813 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1916 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1814 | add_zone_page_state(page_zone(page), | 1917 | add_zone_page_state(page_zone(page), |
@@ -1816,9 +1919,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1816 | else | 1919 | else |
1817 | add_zone_page_state(page_zone(page), | 1920 | add_zone_page_state(page_zone(page), |
1818 | NR_SLAB_UNRECLAIMABLE, nr_pages); | 1921 | NR_SLAB_UNRECLAIMABLE, nr_pages); |
1819 | for (i = 0; i < nr_pages; i++) | 1922 | for (i = 0; i < nr_pages; i++) { |
1820 | __SetPageSlab(page + i); | 1923 | __SetPageSlab(page + i); |
1821 | 1924 | ||
1925 | if (page->pfmemalloc) | ||
1926 | SetPageSlabPfmemalloc(page + i); | ||
1927 | } | ||
1928 | |||
1822 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | 1929 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { |
1823 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); | 1930 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); |
1824 | 1931 | ||
@@ -1850,6 +1957,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
1850 | NR_SLAB_UNRECLAIMABLE, nr_freed); | 1957 | NR_SLAB_UNRECLAIMABLE, nr_freed); |
1851 | while (i--) { | 1958 | while (i--) { |
1852 | BUG_ON(!PageSlab(page)); | 1959 | BUG_ON(!PageSlab(page)); |
1960 | __ClearPageSlabPfmemalloc(page); | ||
1853 | __ClearPageSlab(page); | 1961 | __ClearPageSlab(page); |
1854 | page++; | 1962 | page++; |
1855 | } | 1963 | } |
@@ -1874,7 +1982,7 @@ static void kmem_rcu_free(struct rcu_head *head) | |||
1874 | static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, | 1982 | static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, |
1875 | unsigned long caller) | 1983 | unsigned long caller) |
1876 | { | 1984 | { |
1877 | int size = obj_size(cachep); | 1985 | int size = cachep->object_size; |
1878 | 1986 | ||
1879 | addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; | 1987 | addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; |
1880 | 1988 | ||
@@ -1906,7 +2014,7 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, | |||
1906 | 2014 | ||
1907 | static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) | 2015 | static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) |
1908 | { | 2016 | { |
1909 | int size = obj_size(cachep); | 2017 | int size = cachep->object_size; |
1910 | addr = &((char *)addr)[obj_offset(cachep)]; | 2018 | addr = &((char *)addr)[obj_offset(cachep)]; |
1911 | 2019 | ||
1912 | memset(addr, val, size); | 2020 | memset(addr, val, size); |
@@ -1966,7 +2074,7 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) | |||
1966 | printk("\n"); | 2074 | printk("\n"); |
1967 | } | 2075 | } |
1968 | realobj = (char *)objp + obj_offset(cachep); | 2076 | realobj = (char *)objp + obj_offset(cachep); |
1969 | size = obj_size(cachep); | 2077 | size = cachep->object_size; |
1970 | for (i = 0; i < size && lines; i += 16, lines--) { | 2078 | for (i = 0; i < size && lines; i += 16, lines--) { |
1971 | int limit; | 2079 | int limit; |
1972 | limit = 16; | 2080 | limit = 16; |
@@ -1983,7 +2091,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) | |||
1983 | int lines = 0; | 2091 | int lines = 0; |
1984 | 2092 | ||
1985 | realobj = (char *)objp + obj_offset(cachep); | 2093 | realobj = (char *)objp + obj_offset(cachep); |
1986 | size = obj_size(cachep); | 2094 | size = cachep->object_size; |
1987 | 2095 | ||
1988 | for (i = 0; i < size; i++) { | 2096 | for (i = 0; i < size; i++) { |
1989 | char exp = POISON_FREE; | 2097 | char exp = POISON_FREE; |
@@ -2047,10 +2155,10 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab | |||
2047 | 2155 | ||
2048 | if (cachep->flags & SLAB_POISON) { | 2156 | if (cachep->flags & SLAB_POISON) { |
2049 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2157 | #ifdef CONFIG_DEBUG_PAGEALLOC |
2050 | if (cachep->buffer_size % PAGE_SIZE == 0 && | 2158 | if (cachep->size % PAGE_SIZE == 0 && |
2051 | OFF_SLAB(cachep)) | 2159 | OFF_SLAB(cachep)) |
2052 | kernel_map_pages(virt_to_page(objp), | 2160 | kernel_map_pages(virt_to_page(objp), |
2053 | cachep->buffer_size / PAGE_SIZE, 1); | 2161 | cachep->size / PAGE_SIZE, 1); |
2054 | else | 2162 | else |
2055 | check_poison_obj(cachep, objp); | 2163 | check_poison_obj(cachep, objp); |
2056 | #else | 2164 | #else |
@@ -2194,10 +2302,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, | |||
2194 | 2302 | ||
2195 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | 2303 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) |
2196 | { | 2304 | { |
2197 | if (g_cpucache_up == FULL) | 2305 | if (slab_state >= FULL) |
2198 | return enable_cpucache(cachep, gfp); | 2306 | return enable_cpucache(cachep, gfp); |
2199 | 2307 | ||
2200 | if (g_cpucache_up == NONE) { | 2308 | if (slab_state == DOWN) { |
2201 | /* | 2309 | /* |
2202 | * Note: the first kmem_cache_create must create the cache | 2310 | * Note: the first kmem_cache_create must create the cache |
2203 | * that's used by kmalloc(24), otherwise the creation of | 2311 | * that's used by kmalloc(24), otherwise the creation of |
@@ -2212,16 +2320,16 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2212 | */ | 2320 | */ |
2213 | set_up_list3s(cachep, SIZE_AC); | 2321 | set_up_list3s(cachep, SIZE_AC); |
2214 | if (INDEX_AC == INDEX_L3) | 2322 | if (INDEX_AC == INDEX_L3) |
2215 | g_cpucache_up = PARTIAL_L3; | 2323 | slab_state = PARTIAL_L3; |
2216 | else | 2324 | else |
2217 | g_cpucache_up = PARTIAL_AC; | 2325 | slab_state = PARTIAL_ARRAYCACHE; |
2218 | } else { | 2326 | } else { |
2219 | cachep->array[smp_processor_id()] = | 2327 | cachep->array[smp_processor_id()] = |
2220 | kmalloc(sizeof(struct arraycache_init), gfp); | 2328 | kmalloc(sizeof(struct arraycache_init), gfp); |
2221 | 2329 | ||
2222 | if (g_cpucache_up == PARTIAL_AC) { | 2330 | if (slab_state == PARTIAL_ARRAYCACHE) { |
2223 | set_up_list3s(cachep, SIZE_L3); | 2331 | set_up_list3s(cachep, SIZE_L3); |
2224 | g_cpucache_up = PARTIAL_L3; | 2332 | slab_state = PARTIAL_L3; |
2225 | } else { | 2333 | } else { |
2226 | int node; | 2334 | int node; |
2227 | for_each_online_node(node) { | 2335 | for_each_online_node(node) { |
@@ -2247,7 +2355,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2247 | } | 2355 | } |
2248 | 2356 | ||
2249 | /** | 2357 | /** |
2250 | * kmem_cache_create - Create a cache. | 2358 | * __kmem_cache_create - Create a cache. |
2251 | * @name: A string which is used in /proc/slabinfo to identify this cache. | 2359 | * @name: A string which is used in /proc/slabinfo to identify this cache. |
2252 | * @size: The size of objects to be created in this cache. | 2360 | * @size: The size of objects to be created in this cache. |
2253 | * @align: The required alignment for the objects. | 2361 | * @align: The required alignment for the objects. |
@@ -2274,59 +2382,14 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2274 | * as davem. | 2382 | * as davem. |
2275 | */ | 2383 | */ |
2276 | struct kmem_cache * | 2384 | struct kmem_cache * |
2277 | kmem_cache_create (const char *name, size_t size, size_t align, | 2385 | __kmem_cache_create (const char *name, size_t size, size_t align, |
2278 | unsigned long flags, void (*ctor)(void *)) | 2386 | unsigned long flags, void (*ctor)(void *)) |
2279 | { | 2387 | { |
2280 | size_t left_over, slab_size, ralign; | 2388 | size_t left_over, slab_size, ralign; |
2281 | struct kmem_cache *cachep = NULL, *pc; | 2389 | struct kmem_cache *cachep = NULL; |
2282 | gfp_t gfp; | 2390 | gfp_t gfp; |
2283 | 2391 | ||
2284 | /* | ||
2285 | * Sanity checks... these are all serious usage bugs. | ||
2286 | */ | ||
2287 | if (!name || in_interrupt() || (size < BYTES_PER_WORD) || | ||
2288 | size > KMALLOC_MAX_SIZE) { | ||
2289 | printk(KERN_ERR "%s: Early error in slab %s\n", __func__, | ||
2290 | name); | ||
2291 | BUG(); | ||
2292 | } | ||
2293 | |||
2294 | /* | ||
2295 | * We use cache_chain_mutex to ensure a consistent view of | ||
2296 | * cpu_online_mask as well. Please see cpuup_callback | ||
2297 | */ | ||
2298 | if (slab_is_available()) { | ||
2299 | get_online_cpus(); | ||
2300 | mutex_lock(&cache_chain_mutex); | ||
2301 | } | ||
2302 | |||
2303 | list_for_each_entry(pc, &cache_chain, next) { | ||
2304 | char tmp; | ||
2305 | int res; | ||
2306 | |||
2307 | /* | ||
2308 | * This happens when the module gets unloaded and doesn't | ||
2309 | * destroy its slab cache and no-one else reuses the vmalloc | ||
2310 | * area of the module. Print a warning. | ||
2311 | */ | ||
2312 | res = probe_kernel_address(pc->name, tmp); | ||
2313 | if (res) { | ||
2314 | printk(KERN_ERR | ||
2315 | "SLAB: cache with size %d has lost its name\n", | ||
2316 | pc->buffer_size); | ||
2317 | continue; | ||
2318 | } | ||
2319 | |||
2320 | if (!strcmp(pc->name, name)) { | ||
2321 | printk(KERN_ERR | ||
2322 | "kmem_cache_create: duplicate cache %s\n", name); | ||
2323 | dump_stack(); | ||
2324 | goto oops; | ||
2325 | } | ||
2326 | } | ||
2327 | |||
2328 | #if DEBUG | 2392 | #if DEBUG |
2329 | WARN_ON(strchr(name, ' ')); /* It confuses parsers */ | ||
2330 | #if FORCED_DEBUG | 2393 | #if FORCED_DEBUG |
2331 | /* | 2394 | /* |
2332 | * Enable redzoning and last user accounting, except for caches with | 2395 | * Enable redzoning and last user accounting, except for caches with |
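The sanity checks, the duplicate-name scan and the get_online_cpus()/mutex handling deleted in this hunk move out of the allocator: the rename to __kmem_cache_create() implies a common kmem_cache_create() wrapper in the shared slab code (presumably mm/slab_common.c) that takes slab_mutex, performs the allocator-independent checks once, and only then calls into the SLAB-specific __kmem_cache_create(). A hedged sketch of such a wrapper, not the verbatim implementation:

    /* Sketch of the assumed common-layer wrapper implied by this rename. */
    struct kmem_cache *kmem_cache_create(const char *name, size_t size,
                                         size_t align, unsigned long flags,
                                         void (*ctor)(void *))
    {
            struct kmem_cache *s = NULL;

            get_online_cpus();
            mutex_lock(&slab_mutex);

            /* allocator-independent sanity checks, done once */
            if (!name || in_interrupt() || size < sizeof(void *) ||
                size > KMALLOC_MAX_SIZE)
                    goto out_locked;

            s = __kmem_cache_create(name, size, align, flags, ctor);

    out_locked:
            mutex_unlock(&slab_mutex);
            put_online_cpus();

            if (!s && (flags & SLAB_PANIC))
                    panic("kmem_cache_create: Failed to create slab '%s'\n", name);

            return s;
    }
    EXPORT_SYMBOL(kmem_cache_create);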
@@ -2415,11 +2478,12 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2415 | /* Get cache's description obj. */ | 2478 | /* Get cache's description obj. */ |
2416 | cachep = kmem_cache_zalloc(&cache_cache, gfp); | 2479 | cachep = kmem_cache_zalloc(&cache_cache, gfp); |
2417 | if (!cachep) | 2480 | if (!cachep) |
2418 | goto oops; | 2481 | return NULL; |
2419 | 2482 | ||
2420 | cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; | 2483 | cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; |
2484 | cachep->object_size = size; | ||
2485 | cachep->align = align; | ||
2421 | #if DEBUG | 2486 | #if DEBUG |
2422 | cachep->obj_size = size; | ||
2423 | 2487 | ||
2424 | /* | 2488 | /* |
2425 | * Both debugging options require word-alignment which is calculated | 2489 | * Both debugging options require word-alignment which is calculated |
@@ -2442,7 +2506,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2442 | } | 2506 | } |
2443 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) | 2507 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) |
2444 | if (size >= malloc_sizes[INDEX_L3 + 1].cs_size | 2508 | if (size >= malloc_sizes[INDEX_L3 + 1].cs_size |
2445 | && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { | 2509 | && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { |
2446 | cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); | 2510 | cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); |
2447 | size = PAGE_SIZE; | 2511 | size = PAGE_SIZE; |
2448 | } | 2512 | } |
@@ -2471,8 +2535,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2471 | printk(KERN_ERR | 2535 | printk(KERN_ERR |
2472 | "kmem_cache_create: couldn't create cache %s.\n", name); | 2536 | "kmem_cache_create: couldn't create cache %s.\n", name); |
2473 | kmem_cache_free(&cache_cache, cachep); | 2537 | kmem_cache_free(&cache_cache, cachep); |
2474 | cachep = NULL; | 2538 | return NULL; |
2475 | goto oops; | ||
2476 | } | 2539 | } |
2477 | slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) | 2540 | slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) |
2478 | + sizeof(struct slab), align); | 2541 | + sizeof(struct slab), align); |
@@ -2508,10 +2571,10 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2508 | cachep->colour = left_over / cachep->colour_off; | 2571 | cachep->colour = left_over / cachep->colour_off; |
2509 | cachep->slab_size = slab_size; | 2572 | cachep->slab_size = slab_size; |
2510 | cachep->flags = flags; | 2573 | cachep->flags = flags; |
2511 | cachep->gfpflags = 0; | 2574 | cachep->allocflags = 0; |
2512 | if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) | 2575 | if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) |
2513 | cachep->gfpflags |= GFP_DMA; | 2576 | cachep->allocflags |= GFP_DMA; |
2514 | cachep->buffer_size = size; | 2577 | cachep->size = size; |
2515 | cachep->reciprocal_buffer_size = reciprocal_value(size); | 2578 | cachep->reciprocal_buffer_size = reciprocal_value(size); |
2516 | 2579 | ||
2517 | if (flags & CFLGS_OFF_SLAB) { | 2580 | if (flags & CFLGS_OFF_SLAB) { |
@@ -2530,8 +2593,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2530 | 2593 | ||
2531 | if (setup_cpu_cache(cachep, gfp)) { | 2594 | if (setup_cpu_cache(cachep, gfp)) { |
2532 | __kmem_cache_destroy(cachep); | 2595 | __kmem_cache_destroy(cachep); |
2533 | cachep = NULL; | 2596 | return NULL; |
2534 | goto oops; | ||
2535 | } | 2597 | } |
2536 | 2598 | ||
2537 | if (flags & SLAB_DEBUG_OBJECTS) { | 2599 | if (flags & SLAB_DEBUG_OBJECTS) { |
@@ -2545,18 +2607,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2545 | } | 2607 | } |
2546 | 2608 | ||
2547 | /* cache setup completed, link it into the list */ | 2609 | /* cache setup completed, link it into the list */ |
2548 | list_add(&cachep->next, &cache_chain); | 2610 | list_add(&cachep->list, &slab_caches); |
2549 | oops: | ||
2550 | if (!cachep && (flags & SLAB_PANIC)) | ||
2551 | panic("kmem_cache_create(): failed to create slab `%s'\n", | ||
2552 | name); | ||
2553 | if (slab_is_available()) { | ||
2554 | mutex_unlock(&cache_chain_mutex); | ||
2555 | put_online_cpus(); | ||
2556 | } | ||
2557 | return cachep; | 2611 | return cachep; |
2558 | } | 2612 | } |
2559 | EXPORT_SYMBOL(kmem_cache_create); | ||
2560 | 2613 | ||
2561 | #if DEBUG | 2614 | #if DEBUG |
2562 | static void check_irq_off(void) | 2615 | static void check_irq_off(void) |
@@ -2671,7 +2724,7 @@ out: | |||
2671 | return nr_freed; | 2724 | return nr_freed; |
2672 | } | 2725 | } |
2673 | 2726 | ||
2674 | /* Called with cache_chain_mutex held to protect against cpu hotplug */ | 2727 | /* Called with slab_mutex held to protect against cpu hotplug */ |
2675 | static int __cache_shrink(struct kmem_cache *cachep) | 2728 | static int __cache_shrink(struct kmem_cache *cachep) |
2676 | { | 2729 | { |
2677 | int ret = 0, i = 0; | 2730 | int ret = 0, i = 0; |
@@ -2706,9 +2759,9 @@ int kmem_cache_shrink(struct kmem_cache *cachep) | |||
2706 | BUG_ON(!cachep || in_interrupt()); | 2759 | BUG_ON(!cachep || in_interrupt()); |
2707 | 2760 | ||
2708 | get_online_cpus(); | 2761 | get_online_cpus(); |
2709 | mutex_lock(&cache_chain_mutex); | 2762 | mutex_lock(&slab_mutex); |
2710 | ret = __cache_shrink(cachep); | 2763 | ret = __cache_shrink(cachep); |
2711 | mutex_unlock(&cache_chain_mutex); | 2764 | mutex_unlock(&slab_mutex); |
2712 | put_online_cpus(); | 2765 | put_online_cpus(); |
2713 | return ret; | 2766 | return ret; |
2714 | } | 2767 | } |
@@ -2736,15 +2789,15 @@ void kmem_cache_destroy(struct kmem_cache *cachep) | |||
2736 | 2789 | ||
2737 | /* Find the cache in the chain of caches. */ | 2790 | /* Find the cache in the chain of caches. */ |
2738 | get_online_cpus(); | 2791 | get_online_cpus(); |
2739 | mutex_lock(&cache_chain_mutex); | 2792 | mutex_lock(&slab_mutex); |
2740 | /* | 2793 | /* |
2741 | * the chain is never empty, cache_cache is never destroyed | 2794 | * the chain is never empty, cache_cache is never destroyed |
2742 | */ | 2795 | */ |
2743 | list_del(&cachep->next); | 2796 | list_del(&cachep->list); |
2744 | if (__cache_shrink(cachep)) { | 2797 | if (__cache_shrink(cachep)) { |
2745 | slab_error(cachep, "Can't free all objects"); | 2798 | slab_error(cachep, "Can't free all objects"); |
2746 | list_add(&cachep->next, &cache_chain); | 2799 | list_add(&cachep->list, &slab_caches); |
2747 | mutex_unlock(&cache_chain_mutex); | 2800 | mutex_unlock(&slab_mutex); |
2748 | put_online_cpus(); | 2801 | put_online_cpus(); |
2749 | return; | 2802 | return; |
2750 | } | 2803 | } |
@@ -2753,7 +2806,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) | |||
2753 | rcu_barrier(); | 2806 | rcu_barrier(); |
2754 | 2807 | ||
2755 | __kmem_cache_destroy(cachep); | 2808 | __kmem_cache_destroy(cachep); |
2756 | mutex_unlock(&cache_chain_mutex); | 2809 | mutex_unlock(&slab_mutex); |
2757 | put_online_cpus(); | 2810 | put_online_cpus(); |
2758 | } | 2811 | } |
2759 | EXPORT_SYMBOL(kmem_cache_destroy); | 2812 | EXPORT_SYMBOL(kmem_cache_destroy); |
@@ -2840,10 +2893,10 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2840 | slab_error(cachep, "constructor overwrote the" | 2893 | slab_error(cachep, "constructor overwrote the" |
2841 | " start of an object"); | 2894 | " start of an object"); |
2842 | } | 2895 | } |
2843 | if ((cachep->buffer_size % PAGE_SIZE) == 0 && | 2896 | if ((cachep->size % PAGE_SIZE) == 0 && |
2844 | OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) | 2897 | OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) |
2845 | kernel_map_pages(virt_to_page(objp), | 2898 | kernel_map_pages(virt_to_page(objp), |
2846 | cachep->buffer_size / PAGE_SIZE, 0); | 2899 | cachep->size / PAGE_SIZE, 0); |
2847 | #else | 2900 | #else |
2848 | if (cachep->ctor) | 2901 | if (cachep->ctor) |
2849 | cachep->ctor(objp); | 2902 | cachep->ctor(objp); |
@@ -2857,9 +2910,9 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) | |||
2857 | { | 2910 | { |
2858 | if (CONFIG_ZONE_DMA_FLAG) { | 2911 | if (CONFIG_ZONE_DMA_FLAG) { |
2859 | if (flags & GFP_DMA) | 2912 | if (flags & GFP_DMA) |
2860 | BUG_ON(!(cachep->gfpflags & GFP_DMA)); | 2913 | BUG_ON(!(cachep->allocflags & GFP_DMA)); |
2861 | else | 2914 | else |
2862 | BUG_ON(cachep->gfpflags & GFP_DMA); | 2915 | BUG_ON(cachep->allocflags & GFP_DMA); |
2863 | } | 2916 | } |
2864 | } | 2917 | } |
2865 | 2918 | ||
@@ -2918,8 +2971,8 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, | |||
2918 | nr_pages <<= cache->gfporder; | 2971 | nr_pages <<= cache->gfporder; |
2919 | 2972 | ||
2920 | do { | 2973 | do { |
2921 | page_set_cache(page, cache); | 2974 | page->slab_cache = cache; |
2922 | page_set_slab(page, slab); | 2975 | page->slab_page = slab; |
2923 | page++; | 2976 | page++; |
2924 | } while (--nr_pages); | 2977 | } while (--nr_pages); |
2925 | } | 2978 | } |
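Storing page->slab_cache and page->slab_page directly (instead of going through page_set_cache()/page_set_slab()) reflects dedicated back-pointer fields in struct page, so mapping any object address back to its cache or slab descriptor becomes virt_to_head_page() plus a load, as the virt_to_head_page(objp)->slab_page users later in this diff show. A two-helper sketch of that reverse lookup; the helper names here are illustrative only.

    /* Illustrative inverse of slab_map_pages(): object address -> metadata */
    static inline struct kmem_cache *obj_to_cache(const void *objp)
    {
            return virt_to_head_page(objp)->slab_cache;
    }

    static inline struct slab *obj_to_slab(const void *objp)
    {
            return virt_to_head_page(objp)->slab_page;
    }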
@@ -3057,7 +3110,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
3057 | kfree_debugcheck(objp); | 3110 | kfree_debugcheck(objp); |
3058 | page = virt_to_head_page(objp); | 3111 | page = virt_to_head_page(objp); |
3059 | 3112 | ||
3060 | slabp = page_get_slab(page); | 3113 | slabp = page->slab_page; |
3061 | 3114 | ||
3062 | if (cachep->flags & SLAB_RED_ZONE) { | 3115 | if (cachep->flags & SLAB_RED_ZONE) { |
3063 | verify_redzone_free(cachep, objp); | 3116 | verify_redzone_free(cachep, objp); |
@@ -3077,10 +3130,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
3077 | #endif | 3130 | #endif |
3078 | if (cachep->flags & SLAB_POISON) { | 3131 | if (cachep->flags & SLAB_POISON) { |
3079 | #ifdef CONFIG_DEBUG_PAGEALLOC | 3132 | #ifdef CONFIG_DEBUG_PAGEALLOC |
3080 | if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { | 3133 | if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { |
3081 | store_stackinfo(cachep, objp, (unsigned long)caller); | 3134 | store_stackinfo(cachep, objp, (unsigned long)caller); |
3082 | kernel_map_pages(virt_to_page(objp), | 3135 | kernel_map_pages(virt_to_page(objp), |
3083 | cachep->buffer_size / PAGE_SIZE, 0); | 3136 | cachep->size / PAGE_SIZE, 0); |
3084 | } else { | 3137 | } else { |
3085 | poison_obj(cachep, objp, POISON_FREE); | 3138 | poison_obj(cachep, objp, POISON_FREE); |
3086 | } | 3139 | } |
@@ -3120,16 +3173,19 @@ bad: | |||
3120 | #define check_slabp(x,y) do { } while(0) | 3173 | #define check_slabp(x,y) do { } while(0) |
3121 | #endif | 3174 | #endif |
3122 | 3175 | ||
3123 | static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) | 3176 | static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, |
3177 | bool force_refill) | ||
3124 | { | 3178 | { |
3125 | int batchcount; | 3179 | int batchcount; |
3126 | struct kmem_list3 *l3; | 3180 | struct kmem_list3 *l3; |
3127 | struct array_cache *ac; | 3181 | struct array_cache *ac; |
3128 | int node; | 3182 | int node; |
3129 | 3183 | ||
3130 | retry: | ||
3131 | check_irq_off(); | 3184 | check_irq_off(); |
3132 | node = numa_mem_id(); | 3185 | node = numa_mem_id(); |
3186 | if (unlikely(force_refill)) | ||
3187 | goto force_grow; | ||
3188 | retry: | ||
3133 | ac = cpu_cache_get(cachep); | 3189 | ac = cpu_cache_get(cachep); |
3134 | batchcount = ac->batchcount; | 3190 | batchcount = ac->batchcount; |
3135 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { | 3191 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { |
@@ -3179,8 +3235,8 @@ retry: | |||
3179 | STATS_INC_ACTIVE(cachep); | 3235 | STATS_INC_ACTIVE(cachep); |
3180 | STATS_SET_HIGH(cachep); | 3236 | STATS_SET_HIGH(cachep); |
3181 | 3237 | ||
3182 | ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, | 3238 | ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp, |
3183 | node); | 3239 | node)); |
3184 | } | 3240 | } |
3185 | check_slabp(cachep, slabp); | 3241 | check_slabp(cachep, slabp); |
3186 | 3242 | ||
@@ -3199,18 +3255,22 @@ alloc_done: | |||
3199 | 3255 | ||
3200 | if (unlikely(!ac->avail)) { | 3256 | if (unlikely(!ac->avail)) { |
3201 | int x; | 3257 | int x; |
3258 | force_grow: | ||
3202 | x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); | 3259 | x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); |
3203 | 3260 | ||
3204 | /* cache_grow can reenable interrupts, then ac could change. */ | 3261 | /* cache_grow can reenable interrupts, then ac could change. */ |
3205 | ac = cpu_cache_get(cachep); | 3262 | ac = cpu_cache_get(cachep); |
3206 | if (!x && ac->avail == 0) /* no objects in sight? abort */ | 3263 | |
3264 | /* no objects in sight? abort */ | ||
3265 | if (!x && (ac->avail == 0 || force_refill)) | ||
3207 | return NULL; | 3266 | return NULL; |
3208 | 3267 | ||
3209 | if (!ac->avail) /* objects refilled by interrupt? */ | 3268 | if (!ac->avail) /* objects refilled by interrupt? */ |
3210 | goto retry; | 3269 | goto retry; |
3211 | } | 3270 | } |
3212 | ac->touched = 1; | 3271 | ac->touched = 1; |
3213 | return ac->entry[--ac->avail]; | 3272 | |
3273 | return ac_get_obj(cachep, ac, flags, force_refill); | ||
3214 | } | 3274 | } |
3215 | 3275 | ||
3216 | static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, | 3276 | static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, |
@@ -3230,9 +3290,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3230 | return objp; | 3290 | return objp; |
3231 | if (cachep->flags & SLAB_POISON) { | 3291 | if (cachep->flags & SLAB_POISON) { |
3232 | #ifdef CONFIG_DEBUG_PAGEALLOC | 3292 | #ifdef CONFIG_DEBUG_PAGEALLOC |
3233 | if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) | 3293 | if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) |
3234 | kernel_map_pages(virt_to_page(objp), | 3294 | kernel_map_pages(virt_to_page(objp), |
3235 | cachep->buffer_size / PAGE_SIZE, 1); | 3295 | cachep->size / PAGE_SIZE, 1); |
3236 | else | 3296 | else |
3237 | check_poison_obj(cachep, objp); | 3297 | check_poison_obj(cachep, objp); |
3238 | #else | 3298 | #else |
@@ -3261,8 +3321,8 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3261 | struct slab *slabp; | 3321 | struct slab *slabp; |
3262 | unsigned objnr; | 3322 | unsigned objnr; |
3263 | 3323 | ||
3264 | slabp = page_get_slab(virt_to_head_page(objp)); | 3324 | slabp = virt_to_head_page(objp)->slab_page; |
3265 | objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; | 3325 | objnr = (unsigned)(objp - slabp->s_mem) / cachep->size; |
3266 | slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; | 3326 | slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; |
3267 | } | 3327 | } |
3268 | #endif | 3328 | #endif |
@@ -3285,30 +3345,42 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) | |||
3285 | if (cachep == &cache_cache) | 3345 | if (cachep == &cache_cache) |
3286 | return false; | 3346 | return false; |
3287 | 3347 | ||
3288 | return should_failslab(obj_size(cachep), flags, cachep->flags); | 3348 | return should_failslab(cachep->object_size, flags, cachep->flags); |
3289 | } | 3349 | } |
3290 | 3350 | ||
3291 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 3351 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
3292 | { | 3352 | { |
3293 | void *objp; | 3353 | void *objp; |
3294 | struct array_cache *ac; | 3354 | struct array_cache *ac; |
3355 | bool force_refill = false; | ||
3295 | 3356 | ||
3296 | check_irq_off(); | 3357 | check_irq_off(); |
3297 | 3358 | ||
3298 | ac = cpu_cache_get(cachep); | 3359 | ac = cpu_cache_get(cachep); |
3299 | if (likely(ac->avail)) { | 3360 | if (likely(ac->avail)) { |
3300 | STATS_INC_ALLOCHIT(cachep); | ||
3301 | ac->touched = 1; | 3361 | ac->touched = 1; |
3302 | objp = ac->entry[--ac->avail]; | 3362 | objp = ac_get_obj(cachep, ac, flags, false); |
3303 | } else { | 3363 | |
3304 | STATS_INC_ALLOCMISS(cachep); | ||
3305 | objp = cache_alloc_refill(cachep, flags); | ||
3306 | /* | 3364 | /* |
3307 | * the 'ac' may be updated by cache_alloc_refill(), | 3365 | * Allow for the possibility all avail objects are not allowed |
3308 | * and kmemleak_erase() requires its correct value. | 3366 | * by the current flags |
3309 | */ | 3367 | */ |
3310 | ac = cpu_cache_get(cachep); | 3368 | if (objp) { |
3369 | STATS_INC_ALLOCHIT(cachep); | ||
3370 | goto out; | ||
3371 | } | ||
3372 | force_refill = true; | ||
3311 | } | 3373 | } |
3374 | |||
3375 | STATS_INC_ALLOCMISS(cachep); | ||
3376 | objp = cache_alloc_refill(cachep, flags, force_refill); | ||
3377 | /* | ||
3378 | * the 'ac' may be updated by cache_alloc_refill(), | ||
3379 | * and kmemleak_erase() requires its correct value. | ||
3380 | */ | ||
3381 | ac = cpu_cache_get(cachep); | ||
3382 | |||
3383 | out: | ||
3312 | /* | 3384 | /* |
3313 | * To avoid a false negative, if an object that is in one of the | 3385 | * To avoid a false negative, if an object that is in one of the |
3314 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't | 3386 | * per-CPU caches is leaked, we need to make sure kmemleak doesn't |
@@ -3336,7 +3408,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3336 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | 3408 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) |
3337 | nid_alloc = cpuset_slab_spread_node(); | 3409 | nid_alloc = cpuset_slab_spread_node(); |
3338 | else if (current->mempolicy) | 3410 | else if (current->mempolicy) |
3339 | nid_alloc = slab_node(current->mempolicy); | 3411 | nid_alloc = slab_node(); |
3340 | if (nid_alloc != nid_here) | 3412 | if (nid_alloc != nid_here) |
3341 | return ____cache_alloc_node(cachep, flags, nid_alloc); | 3413 | return ____cache_alloc_node(cachep, flags, nid_alloc); |
3342 | return NULL; | 3414 | return NULL; |
@@ -3368,7 +3440,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3368 | 3440 | ||
3369 | retry_cpuset: | 3441 | retry_cpuset: |
3370 | cpuset_mems_cookie = get_mems_allowed(); | 3442 | cpuset_mems_cookie = get_mems_allowed(); |
3371 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | 3443 | zonelist = node_zonelist(slab_node(), flags); |
3372 | 3444 | ||
3373 | retry: | 3445 | retry: |
3374 | /* | 3446 | /* |
@@ -3545,14 +3617,14 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3545 | out: | 3617 | out: |
3546 | local_irq_restore(save_flags); | 3618 | local_irq_restore(save_flags); |
3547 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); | 3619 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); |
3548 | kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, | 3620 | kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags, |
3549 | flags); | 3621 | flags); |
3550 | 3622 | ||
3551 | if (likely(ptr)) | 3623 | if (likely(ptr)) |
3552 | kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); | 3624 | kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size); |
3553 | 3625 | ||
3554 | if (unlikely((flags & __GFP_ZERO) && ptr)) | 3626 | if (unlikely((flags & __GFP_ZERO) && ptr)) |
3555 | memset(ptr, 0, obj_size(cachep)); | 3627 | memset(ptr, 0, cachep->object_size); |
3556 | 3628 | ||
3557 | return ptr; | 3629 | return ptr; |
3558 | } | 3630 | } |
@@ -3607,15 +3679,15 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) | |||
3607 | objp = __do_cache_alloc(cachep, flags); | 3679 | objp = __do_cache_alloc(cachep, flags); |
3608 | local_irq_restore(save_flags); | 3680 | local_irq_restore(save_flags); |
3609 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); | 3681 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); |
3610 | kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, | 3682 | kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags, |
3611 | flags); | 3683 | flags); |
3612 | prefetchw(objp); | 3684 | prefetchw(objp); |
3613 | 3685 | ||
3614 | if (likely(objp)) | 3686 | if (likely(objp)) |
3615 | kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); | 3687 | kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size); |
3616 | 3688 | ||
3617 | if (unlikely((flags & __GFP_ZERO) && objp)) | 3689 | if (unlikely((flags & __GFP_ZERO) && objp)) |
3618 | memset(objp, 0, obj_size(cachep)); | 3690 | memset(objp, 0, cachep->object_size); |
3619 | 3691 | ||
3620 | return objp; | 3692 | return objp; |
3621 | } | 3693 | } |
@@ -3630,9 +3702,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, | |||
3630 | struct kmem_list3 *l3; | 3702 | struct kmem_list3 *l3; |
3631 | 3703 | ||
3632 | for (i = 0; i < nr_objects; i++) { | 3704 | for (i = 0; i < nr_objects; i++) { |
3633 | void *objp = objpp[i]; | 3705 | void *objp; |
3634 | struct slab *slabp; | 3706 | struct slab *slabp; |
3635 | 3707 | ||
3708 | clear_obj_pfmemalloc(&objpp[i]); | ||
3709 | objp = objpp[i]; | ||
3710 | |||
3636 | slabp = virt_to_slab(objp); | 3711 | slabp = virt_to_slab(objp); |
3637 | l3 = cachep->nodelists[node]; | 3712 | l3 = cachep->nodelists[node]; |
3638 | list_del(&slabp->list); | 3713 | list_del(&slabp->list); |
@@ -3731,7 +3806,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, | |||
3731 | kmemleak_free_recursive(objp, cachep->flags); | 3806 | kmemleak_free_recursive(objp, cachep->flags); |
3732 | objp = cache_free_debugcheck(cachep, objp, caller); | 3807 | objp = cache_free_debugcheck(cachep, objp, caller); |
3733 | 3808 | ||
3734 | kmemcheck_slab_free(cachep, objp, obj_size(cachep)); | 3809 | kmemcheck_slab_free(cachep, objp, cachep->object_size); |
3735 | 3810 | ||
3736 | /* | 3811 | /* |
3737 | * Skip calling cache_free_alien() when the platform is not numa. | 3812 | * Skip calling cache_free_alien() when the platform is not numa. |
@@ -3750,7 +3825,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, | |||
3750 | cache_flusharray(cachep, ac); | 3825 | cache_flusharray(cachep, ac); |
3751 | } | 3826 | } |
3752 | 3827 | ||
3753 | ac->entry[ac->avail++] = objp; | 3828 | ac_put_obj(cachep, ac, objp); |
3754 | } | 3829 | } |
3755 | 3830 | ||
3756 | /** | 3831 | /** |
@@ -3766,7 +3841,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3766 | void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); | 3841 | void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); |
3767 | 3842 | ||
3768 | trace_kmem_cache_alloc(_RET_IP_, ret, | 3843 | trace_kmem_cache_alloc(_RET_IP_, ret, |
3769 | obj_size(cachep), cachep->buffer_size, flags); | 3844 | cachep->object_size, cachep->size, flags); |
3770 | 3845 | ||
3771 | return ret; | 3846 | return ret; |
3772 | } | 3847 | } |
@@ -3794,7 +3869,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
3794 | __builtin_return_address(0)); | 3869 | __builtin_return_address(0)); |
3795 | 3870 | ||
3796 | trace_kmem_cache_alloc_node(_RET_IP_, ret, | 3871 | trace_kmem_cache_alloc_node(_RET_IP_, ret, |
3797 | obj_size(cachep), cachep->buffer_size, | 3872 | cachep->object_size, cachep->size, |
3798 | flags, nodeid); | 3873 | flags, nodeid); |
3799 | 3874 | ||
3800 | return ret; | 3875 | return ret; |
@@ -3876,7 +3951,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | |||
3876 | ret = __cache_alloc(cachep, flags, caller); | 3951 | ret = __cache_alloc(cachep, flags, caller); |
3877 | 3952 | ||
3878 | trace_kmalloc((unsigned long) caller, ret, | 3953 | trace_kmalloc((unsigned long) caller, ret, |
3879 | size, cachep->buffer_size, flags); | 3954 | size, cachep->size, flags); |
3880 | 3955 | ||
3881 | return ret; | 3956 | return ret; |
3882 | } | 3957 | } |
@@ -3916,9 +3991,9 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) | |||
3916 | unsigned long flags; | 3991 | unsigned long flags; |
3917 | 3992 | ||
3918 | local_irq_save(flags); | 3993 | local_irq_save(flags); |
3919 | debug_check_no_locks_freed(objp, obj_size(cachep)); | 3994 | debug_check_no_locks_freed(objp, cachep->object_size); |
3920 | if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) | 3995 | if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) |
3921 | debug_check_no_obj_freed(objp, obj_size(cachep)); | 3996 | debug_check_no_obj_freed(objp, cachep->object_size); |
3922 | __cache_free(cachep, objp, __builtin_return_address(0)); | 3997 | __cache_free(cachep, objp, __builtin_return_address(0)); |
3923 | local_irq_restore(flags); | 3998 | local_irq_restore(flags); |
3924 | 3999 | ||
@@ -3947,8 +4022,9 @@ void kfree(const void *objp) | |||
3947 | local_irq_save(flags); | 4022 | local_irq_save(flags); |
3948 | kfree_debugcheck(objp); | 4023 | kfree_debugcheck(objp); |
3949 | c = virt_to_cache(objp); | 4024 | c = virt_to_cache(objp); |
3950 | debug_check_no_locks_freed(objp, obj_size(c)); | 4025 | debug_check_no_locks_freed(objp, c->object_size); |
3951 | debug_check_no_obj_freed(objp, obj_size(c)); | 4026 | |
4027 | debug_check_no_obj_freed(objp, c->object_size); | ||
3952 | __cache_free(c, (void *)objp, __builtin_return_address(0)); | 4028 | __cache_free(c, (void *)objp, __builtin_return_address(0)); |
3953 | local_irq_restore(flags); | 4029 | local_irq_restore(flags); |
3954 | } | 4030 | } |
@@ -3956,7 +4032,7 @@ EXPORT_SYMBOL(kfree); | |||
3956 | 4032 | ||
3957 | unsigned int kmem_cache_size(struct kmem_cache *cachep) | 4033 | unsigned int kmem_cache_size(struct kmem_cache *cachep) |
3958 | { | 4034 | { |
3959 | return obj_size(cachep); | 4035 | return cachep->object_size; |
3960 | } | 4036 | } |
3961 | EXPORT_SYMBOL(kmem_cache_size); | 4037 | EXPORT_SYMBOL(kmem_cache_size); |
3962 | 4038 | ||
@@ -4030,7 +4106,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) | |||
4030 | return 0; | 4106 | return 0; |
4031 | 4107 | ||
4032 | fail: | 4108 | fail: |
4033 | if (!cachep->next.next) { | 4109 | if (!cachep->list.next) { |
4034 | /* Cache is not active yet. Roll back what we did */ | 4110 | /* Cache is not active yet. Roll back what we did */ |
4035 | node--; | 4111 | node--; |
4036 | while (node >= 0) { | 4112 | while (node >= 0) { |
@@ -4065,7 +4141,7 @@ static void do_ccupdate_local(void *info) | |||
4065 | new->new[smp_processor_id()] = old; | 4141 | new->new[smp_processor_id()] = old; |
4066 | } | 4142 | } |
4067 | 4143 | ||
4068 | /* Always called with the cache_chain_mutex held */ | 4144 | /* Always called with the slab_mutex held */ |
4069 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | 4145 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, |
4070 | int batchcount, int shared, gfp_t gfp) | 4146 | int batchcount, int shared, gfp_t gfp) |
4071 | { | 4147 | { |
@@ -4109,7 +4185,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
4109 | return alloc_kmemlist(cachep, gfp); | 4185 | return alloc_kmemlist(cachep, gfp); |
4110 | } | 4186 | } |
4111 | 4187 | ||
4112 | /* Called with cache_chain_mutex held always */ | 4188 | /* Called with slab_mutex held always */ |
4113 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) | 4189 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) |
4114 | { | 4190 | { |
4115 | int err; | 4191 | int err; |
@@ -4124,13 +4200,13 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) | |||
4124 | * The numbers are guessed, we should auto-tune as described by | 4200 | * The numbers are guessed, we should auto-tune as described by |
4125 | * Bonwick. | 4201 | * Bonwick. |
4126 | */ | 4202 | */ |
4127 | if (cachep->buffer_size > 131072) | 4203 | if (cachep->size > 131072) |
4128 | limit = 1; | 4204 | limit = 1; |
4129 | else if (cachep->buffer_size > PAGE_SIZE) | 4205 | else if (cachep->size > PAGE_SIZE) |
4130 | limit = 8; | 4206 | limit = 8; |
4131 | else if (cachep->buffer_size > 1024) | 4207 | else if (cachep->size > 1024) |
4132 | limit = 24; | 4208 | limit = 24; |
4133 | else if (cachep->buffer_size > 256) | 4209 | else if (cachep->size > 256) |
4134 | limit = 54; | 4210 | limit = 54; |
4135 | else | 4211 | else |
4136 | limit = 120; | 4212 | limit = 120; |
@@ -4145,7 +4221,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) | |||
4145 | * to a larger limit. Thus disabled by default. | 4221 | * to a larger limit. Thus disabled by default. |
4146 | */ | 4222 | */ |
4147 | shared = 0; | 4223 | shared = 0; |
4148 | if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) | 4224 | if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1) |
4149 | shared = 8; | 4225 | shared = 8; |
4150 | 4226 | ||
4151 | #if DEBUG | 4227 | #if DEBUG |
@@ -4211,11 +4287,11 @@ static void cache_reap(struct work_struct *w) | |||
4211 | int node = numa_mem_id(); | 4287 | int node = numa_mem_id(); |
4212 | struct delayed_work *work = to_delayed_work(w); | 4288 | struct delayed_work *work = to_delayed_work(w); |
4213 | 4289 | ||
4214 | if (!mutex_trylock(&cache_chain_mutex)) | 4290 | if (!mutex_trylock(&slab_mutex)) |
4215 | /* Give up. Setup the next iteration. */ | 4291 | /* Give up. Setup the next iteration. */ |
4216 | goto out; | 4292 | goto out; |
4217 | 4293 | ||
4218 | list_for_each_entry(searchp, &cache_chain, next) { | 4294 | list_for_each_entry(searchp, &slab_caches, list) { |
4219 | check_irq_on(); | 4295 | check_irq_on(); |
4220 | 4296 | ||
4221 | /* | 4297 | /* |
@@ -4253,7 +4329,7 @@ next: | |||
4253 | cond_resched(); | 4329 | cond_resched(); |
4254 | } | 4330 | } |
4255 | check_irq_on(); | 4331 | check_irq_on(); |
4256 | mutex_unlock(&cache_chain_mutex); | 4332 | mutex_unlock(&slab_mutex); |
4257 | next_reap_node(); | 4333 | next_reap_node(); |
4258 | out: | 4334 | out: |
4259 | /* Set up the next iteration */ | 4335 | /* Set up the next iteration */ |
@@ -4289,26 +4365,26 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
4289 | { | 4365 | { |
4290 | loff_t n = *pos; | 4366 | loff_t n = *pos; |
4291 | 4367 | ||
4292 | mutex_lock(&cache_chain_mutex); | 4368 | mutex_lock(&slab_mutex); |
4293 | if (!n) | 4369 | if (!n) |
4294 | print_slabinfo_header(m); | 4370 | print_slabinfo_header(m); |
4295 | 4371 | ||
4296 | return seq_list_start(&cache_chain, *pos); | 4372 | return seq_list_start(&slab_caches, *pos); |
4297 | } | 4373 | } |
4298 | 4374 | ||
4299 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) | 4375 | static void *s_next(struct seq_file *m, void *p, loff_t *pos) |
4300 | { | 4376 | { |
4301 | return seq_list_next(p, &cache_chain, pos); | 4377 | return seq_list_next(p, &slab_caches, pos); |
4302 | } | 4378 | } |
4303 | 4379 | ||
4304 | static void s_stop(struct seq_file *m, void *p) | 4380 | static void s_stop(struct seq_file *m, void *p) |
4305 | { | 4381 | { |
4306 | mutex_unlock(&cache_chain_mutex); | 4382 | mutex_unlock(&slab_mutex); |
4307 | } | 4383 | } |
4308 | 4384 | ||
4309 | static int s_show(struct seq_file *m, void *p) | 4385 | static int s_show(struct seq_file *m, void *p) |
4310 | { | 4386 | { |
4311 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); | 4387 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); |
4312 | struct slab *slabp; | 4388 | struct slab *slabp; |
4313 | unsigned long active_objs; | 4389 | unsigned long active_objs; |
4314 | unsigned long num_objs; | 4390 | unsigned long num_objs; |
@@ -4364,7 +4440,7 @@ static int s_show(struct seq_file *m, void *p) | |||
4364 | printk(KERN_ERR "slab: cache %s error: %s\n", name, error); | 4440 | printk(KERN_ERR "slab: cache %s error: %s\n", name, error); |
4365 | 4441 | ||
4366 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", | 4442 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", |
4367 | name, active_objs, num_objs, cachep->buffer_size, | 4443 | name, active_objs, num_objs, cachep->size, |
4368 | cachep->num, (1 << cachep->gfporder)); | 4444 | cachep->num, (1 << cachep->gfporder)); |
4369 | seq_printf(m, " : tunables %4u %4u %4u", | 4445 | seq_printf(m, " : tunables %4u %4u %4u", |
4370 | cachep->limit, cachep->batchcount, cachep->shared); | 4446 | cachep->limit, cachep->batchcount, cachep->shared); |
@@ -4454,9 +4530,9 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer, | |||
4454 | return -EINVAL; | 4530 | return -EINVAL; |
4455 | 4531 | ||
4456 | /* Find the cache in the chain of caches. */ | 4532 | /* Find the cache in the chain of caches. */ |
4457 | mutex_lock(&cache_chain_mutex); | 4533 | mutex_lock(&slab_mutex); |
4458 | res = -EINVAL; | 4534 | res = -EINVAL; |
4459 | list_for_each_entry(cachep, &cache_chain, next) { | 4535 | list_for_each_entry(cachep, &slab_caches, list) { |
4460 | if (!strcmp(cachep->name, kbuf)) { | 4536 | if (!strcmp(cachep->name, kbuf)) { |
4461 | if (limit < 1 || batchcount < 1 || | 4537 | if (limit < 1 || batchcount < 1 || |
4462 | batchcount > limit || shared < 0) { | 4538 | batchcount > limit || shared < 0) { |
@@ -4469,7 +4545,7 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer, | |||
4469 | break; | 4545 | break; |
4470 | } | 4546 | } |
4471 | } | 4547 | } |
4472 | mutex_unlock(&cache_chain_mutex); | 4548 | mutex_unlock(&slab_mutex); |
4473 | if (res >= 0) | 4549 | if (res >= 0) |
4474 | res = count; | 4550 | res = count; |
4475 | return res; | 4551 | return res; |
@@ -4492,8 +4568,8 @@ static const struct file_operations proc_slabinfo_operations = { | |||
4492 | 4568 | ||
4493 | static void *leaks_start(struct seq_file *m, loff_t *pos) | 4569 | static void *leaks_start(struct seq_file *m, loff_t *pos) |
4494 | { | 4570 | { |
4495 | mutex_lock(&cache_chain_mutex); | 4571 | mutex_lock(&slab_mutex); |
4496 | return seq_list_start(&cache_chain, *pos); | 4572 | return seq_list_start(&slab_caches, *pos); |
4497 | } | 4573 | } |
4498 | 4574 | ||
4499 | static inline int add_caller(unsigned long *n, unsigned long v) | 4575 | static inline int add_caller(unsigned long *n, unsigned long v) |
@@ -4532,7 +4608,7 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) | |||
4532 | int i; | 4608 | int i; |
4533 | if (n[0] == n[1]) | 4609 | if (n[0] == n[1]) |
4534 | return; | 4610 | return; |
4535 | for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { | 4611 | for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) { |
4536 | if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) | 4612 | if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) |
4537 | continue; | 4613 | continue; |
4538 | if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) | 4614 | if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) |
@@ -4558,7 +4634,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) | |||
4558 | 4634 | ||
4559 | static int leaks_show(struct seq_file *m, void *p) | 4635 | static int leaks_show(struct seq_file *m, void *p) |
4560 | { | 4636 | { |
4561 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); | 4637 | struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); |
4562 | struct slab *slabp; | 4638 | struct slab *slabp; |
4563 | struct kmem_list3 *l3; | 4639 | struct kmem_list3 *l3; |
4564 | const char *name; | 4640 | const char *name; |
@@ -4592,17 +4668,17 @@ static int leaks_show(struct seq_file *m, void *p) | |||
4592 | name = cachep->name; | 4668 | name = cachep->name; |
4593 | if (n[0] == n[1]) { | 4669 | if (n[0] == n[1]) { |
4594 | /* Increase the buffer size */ | 4670 | /* Increase the buffer size */ |
4595 | mutex_unlock(&cache_chain_mutex); | 4671 | mutex_unlock(&slab_mutex); |
4596 | m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); | 4672 | m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); |
4597 | if (!m->private) { | 4673 | if (!m->private) { |
4598 | /* Too bad, we are really out */ | 4674 | /* Too bad, we are really out */ |
4599 | m->private = n; | 4675 | m->private = n; |
4600 | mutex_lock(&cache_chain_mutex); | 4676 | mutex_lock(&slab_mutex); |
4601 | return -ENOMEM; | 4677 | return -ENOMEM; |
4602 | } | 4678 | } |
4603 | *(unsigned long *)m->private = n[0] * 2; | 4679 | *(unsigned long *)m->private = n[0] * 2; |
4604 | kfree(n); | 4680 | kfree(n); |
4605 | mutex_lock(&cache_chain_mutex); | 4681 | mutex_lock(&slab_mutex); |
4606 | /* Now make sure this entry will be retried */ | 4682 | /* Now make sure this entry will be retried */ |
4607 | m->count = m->size; | 4683 | m->count = m->size; |
4608 | return 0; | 4684 | return 0; |
@@ -4677,6 +4753,6 @@ size_t ksize(const void *objp) | |||
4677 | if (unlikely(objp == ZERO_SIZE_PTR)) | 4753 | if (unlikely(objp == ZERO_SIZE_PTR)) |
4678 | return 0; | 4754 | return 0; |
4679 | 4755 | ||
4680 | return obj_size(virt_to_cache(objp)); | 4756 | return virt_to_cache(objp)->object_size; |
4681 | } | 4757 | } |
4682 | EXPORT_SYMBOL(ksize); | 4758 | EXPORT_SYMBOL(ksize); |
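The slab.c hunks above switch kmem_cache_size() and ksize() from the old obj_size() accessor to the common object_size field; callers of the reporting helpers are unaffected. As a hedged illustration of what those helpers expose (the demo function below is hypothetical and not part of the patch), a caller sees the usable size of an allocation like this:

	#include <linux/kernel.h>
	#include <linux/slab.h>

	/* Illustrative only: kmalloc() may round the request up, and
	 * ksize() reports the usable size of the resulting object,
	 * which is what the object_size bookkeeping above feeds. */
	static void __maybe_unused ksize_demo(void)
	{
		char *buf = kmalloc(100, GFP_KERNEL);

		if (!buf)
			return;
		/* Often >= 100, e.g. 128 with power-of-two kmalloc caches. */
		pr_info("usable size: %zu\n", ksize(buf));
		kfree(buf);
	}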
diff --git a/mm/slab.h b/mm/slab.h new file mode 100644 index 000000000000..db7848caaa25 --- /dev/null +++ b/mm/slab.h | |||
@@ -0,0 +1,33 @@ | |||
1 | #ifndef MM_SLAB_H | ||
2 | #define MM_SLAB_H | ||
3 | /* | ||
4 | * Internal slab definitions | ||
5 | */ | ||
6 | |||
7 | /* | ||
8 | * State of the slab allocator. | ||
9 | * | ||
10 | * This is used to describe the states of the allocator during bootup. | ||
11 | * Allocators use this to gradually bootstrap themselves. Most allocators | ||
12 | * have the problem that the structures used for managing slab caches are | ||
13 | * allocated from slab caches themselves. | ||
14 | */ | ||
15 | enum slab_state { | ||
16 | DOWN, /* No slab functionality yet */ | ||
17 | PARTIAL, /* SLUB: kmem_cache_node available */ | ||
18 | PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */ | ||
19 | PARTIAL_L3, /* SLAB: kmalloc size for l3 struct available */ | ||
20 | UP, /* Slab caches usable but not all extras yet */ | ||
21 | FULL /* Everything is working */ | ||
22 | }; | ||
23 | |||
24 | extern enum slab_state slab_state; | ||
25 | |||
26 | /* The slab cache mutex protects the management structures during changes */ | ||
27 | extern struct mutex slab_mutex; | ||
28 | extern struct list_head slab_caches; | ||
29 | |||
30 | struct kmem_cache *__kmem_cache_create(const char *name, size_t size, | ||
31 | size_t align, unsigned long flags, void (*ctor)(void *)); | ||
32 | |||
33 | #endif | ||
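The new header replaces the per-allocator bootstrap flags (SLOB's slob_ready, SLUB's private enum removed further down) with one shared slab_state. A minimal userspace sketch of the staged gate follows; it assumes nothing beyond what the enum itself states and is a model, not kernel code:

	#include <stdio.h>

	/* Illustrative model of the staged bootstrap gate; the names
	 * mirror the enum above but this is not kernel code. */
	enum slab_state { DOWN, PARTIAL, PARTIAL_ARRAYCACHE, PARTIAL_L3, UP, FULL };

	static enum slab_state slab_state = DOWN;

	static int slab_is_available(void)
	{
		return slab_state >= UP;
	}

	int main(void)
	{
		/* Early boot: callers must fall back to the boot allocator. */
		printf("available: %d\n", slab_is_available());	/* 0 */
		slab_state = UP;	/* kmem_cache_init() reached */
		printf("available: %d\n", slab_is_available());	/* 1 */
		slab_state = FULL;	/* kmem_cache_init_late() */
		return 0;
	}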
diff --git a/mm/slab_common.c b/mm/slab_common.c new file mode 100644 index 000000000000..aa3ca5bb01b5 --- /dev/null +++ b/mm/slab_common.c | |||
@@ -0,0 +1,120 @@ | |||
1 | /* | ||
2 | * Slab allocator functions that are independent of the allocator strategy | ||
3 | * | ||
4 | * (C) 2012 Christoph Lameter <cl@linux.com> | ||
5 | */ | ||
6 | #include <linux/slab.h> | ||
7 | |||
8 | #include <linux/mm.h> | ||
9 | #include <linux/poison.h> | ||
10 | #include <linux/interrupt.h> | ||
11 | #include <linux/memory.h> | ||
12 | #include <linux/compiler.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/cpu.h> | ||
15 | #include <linux/uaccess.h> | ||
16 | #include <asm/cacheflush.h> | ||
17 | #include <asm/tlbflush.h> | ||
18 | #include <asm/page.h> | ||
19 | |||
20 | #include "slab.h" | ||
21 | |||
22 | enum slab_state slab_state; | ||
23 | LIST_HEAD(slab_caches); | ||
24 | DEFINE_MUTEX(slab_mutex); | ||
25 | |||
26 | /* | ||
27 | * kmem_cache_create - Create a cache. | ||
28 | * @name: A string which is used in /proc/slabinfo to identify this cache. | ||
29 | * @size: The size of objects to be created in this cache. | ||
30 | * @align: The required alignment for the objects. | ||
31 | * @flags: SLAB flags | ||
32 | * @ctor: A constructor for the objects. | ||
33 | * | ||
34 | * Returns a ptr to the cache on success, NULL on failure. | ||
35 | * Cannot be called within an interrupt, but can be interrupted. | ||
36 | * The @ctor is run when new pages are allocated by the cache. | ||
37 | * | ||
38 | * The flags are | ||
39 | * | ||
40 | * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) | ||
41 | * to catch references to uninitialised memory. | ||
42 | * | ||
43 | * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check | ||
44 | * for buffer overruns. | ||
45 | * | ||
46 | * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware | ||
47 | * cacheline. This can be beneficial if you're counting cycles as closely | ||
48 | * as davem. | ||
49 | */ | ||
50 | |||
51 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, | ||
52 | unsigned long flags, void (*ctor)(void *)) | ||
53 | { | ||
54 | struct kmem_cache *s = NULL; | ||
55 | |||
56 | #ifdef CONFIG_DEBUG_VM | ||
57 | if (!name || in_interrupt() || size < sizeof(void *) || | ||
58 | size > KMALLOC_MAX_SIZE) { | ||
59 | printk(KERN_ERR "kmem_cache_create(%s) integrity check" | ||
60 | " failed\n", name); | ||
61 | goto out; | ||
62 | } | ||
63 | #endif | ||
64 | |||
65 | get_online_cpus(); | ||
66 | mutex_lock(&slab_mutex); | ||
67 | |||
68 | #ifdef CONFIG_DEBUG_VM | ||
69 | list_for_each_entry(s, &slab_caches, list) { | ||
70 | char tmp; | ||
71 | int res; | ||
72 | |||
73 | /* | ||
74 | * This happens when the module gets unloaded and doesn't | ||
75 | * destroy its slab cache and no-one else reuses the vmalloc | ||
76 | * area of the module. Print a warning. | ||
77 | */ | ||
78 | res = probe_kernel_address(s->name, tmp); | ||
79 | if (res) { | ||
80 | printk(KERN_ERR | ||
81 | "Slab cache with size %d has lost its name\n", | ||
82 | s->object_size); | ||
83 | continue; | ||
84 | } | ||
85 | |||
86 | if (!strcmp(s->name, name)) { | ||
87 | printk(KERN_ERR "kmem_cache_create(%s): Cache name" | ||
88 | " already exists.\n", | ||
89 | name); | ||
90 | dump_stack(); | ||
91 | s = NULL; | ||
92 | goto oops; | ||
93 | } | ||
94 | } | ||
95 | |||
96 | WARN_ON(strchr(name, ' ')); /* It confuses parsers */ | ||
97 | #endif | ||
98 | |||
99 | s = __kmem_cache_create(name, size, align, flags, ctor); | ||
100 | |||
101 | #ifdef CONFIG_DEBUG_VM | ||
102 | oops: | ||
103 | #endif | ||
104 | mutex_unlock(&slab_mutex); | ||
105 | put_online_cpus(); | ||
106 | |||
107 | #ifdef CONFIG_DEBUG_VM | ||
108 | out: | ||
109 | #endif | ||
110 | if (!s && (flags & SLAB_PANIC)) | ||
111 | panic("kmem_cache_create: Failed to create slab '%s'\n", name); | ||
112 | |||
113 | return s; | ||
114 | } | ||
115 | EXPORT_SYMBOL(kmem_cache_create); | ||
116 | |||
117 | int slab_is_available(void) | ||
118 | { | ||
119 | return slab_state >= UP; | ||
120 | } | ||
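With kmem_cache_create() now living in slab_common.c, the allocator-specific files only need to provide __kmem_cache_create(). The public API seen by callers is unchanged; a hedged sketch of typical use from a hypothetical module (struct foo, "foo_cache", foo_init and foo_exit are invented names for illustration):

	#include <linux/errno.h>
	#include <linux/init.h>
	#include <linux/module.h>
	#include <linux/slab.h>

	/* Hypothetical object type and cache, for illustration only. */
	struct foo { int a, b; };
	static struct kmem_cache *foo_cachep;

	static int __init foo_init(void)
	{
		struct foo *f;

		foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
					       0, SLAB_HWCACHE_ALIGN, NULL);
		if (!foo_cachep)
			return -ENOMEM;

		/* Objects then come from kmem_cache_alloc()/kmem_cache_free(). */
		f = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
		if (f)
			kmem_cache_free(foo_cachep, f);
		return 0;
	}

	static void __exit foo_exit(void)
	{
		kmem_cache_destroy(foo_cachep);
	}

	module_init(foo_init);
	module_exit(foo_exit);
	MODULE_LICENSE("GPL");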
@@ -59,6 +59,8 @@ | |||
59 | 59 | ||
60 | #include <linux/kernel.h> | 60 | #include <linux/kernel.h> |
61 | #include <linux/slab.h> | 61 | #include <linux/slab.h> |
62 | #include "slab.h" | ||
63 | |||
62 | #include <linux/mm.h> | 64 | #include <linux/mm.h> |
63 | #include <linux/swap.h> /* struct reclaim_state */ | 65 | #include <linux/swap.h> /* struct reclaim_state */ |
64 | #include <linux/cache.h> | 66 | #include <linux/cache.h> |
@@ -92,36 +94,6 @@ struct slob_block { | |||
92 | typedef struct slob_block slob_t; | 94 | typedef struct slob_block slob_t; |
93 | 95 | ||
94 | /* | 96 | /* |
95 | * We use struct page fields to manage some slob allocation aspects, | ||
96 | * however to avoid the horrible mess in include/linux/mm_types.h, we'll | ||
97 | * just define our own struct page type variant here. | ||
98 | */ | ||
99 | struct slob_page { | ||
100 | union { | ||
101 | struct { | ||
102 | unsigned long flags; /* mandatory */ | ||
103 | atomic_t _count; /* mandatory */ | ||
104 | slobidx_t units; /* free units left in page */ | ||
105 | unsigned long pad[2]; | ||
106 | slob_t *free; /* first free slob_t in page */ | ||
107 | struct list_head list; /* linked list of free pages */ | ||
108 | }; | ||
109 | struct page page; | ||
110 | }; | ||
111 | }; | ||
112 | static inline void struct_slob_page_wrong_size(void) | ||
113 | { BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); } | ||
114 | |||
115 | /* | ||
116 | * free_slob_page: call before a slob_page is returned to the page allocator. | ||
117 | */ | ||
118 | static inline void free_slob_page(struct slob_page *sp) | ||
119 | { | ||
120 | reset_page_mapcount(&sp->page); | ||
121 | sp->page.mapping = NULL; | ||
122 | } | ||
123 | |||
124 | /* | ||
125 | * All partially free slob pages go on these lists. | 97 | * All partially free slob pages go on these lists. |
126 | */ | 98 | */ |
127 | #define SLOB_BREAK1 256 | 99 | #define SLOB_BREAK1 256 |
@@ -131,46 +103,23 @@ static LIST_HEAD(free_slob_medium); | |||
131 | static LIST_HEAD(free_slob_large); | 103 | static LIST_HEAD(free_slob_large); |
132 | 104 | ||
133 | /* | 105 | /* |
134 | * is_slob_page: True for all slob pages (false for bigblock pages) | ||
135 | */ | ||
136 | static inline int is_slob_page(struct slob_page *sp) | ||
137 | { | ||
138 | return PageSlab((struct page *)sp); | ||
139 | } | ||
140 | |||
141 | static inline void set_slob_page(struct slob_page *sp) | ||
142 | { | ||
143 | __SetPageSlab((struct page *)sp); | ||
144 | } | ||
145 | |||
146 | static inline void clear_slob_page(struct slob_page *sp) | ||
147 | { | ||
148 | __ClearPageSlab((struct page *)sp); | ||
149 | } | ||
150 | |||
151 | static inline struct slob_page *slob_page(const void *addr) | ||
152 | { | ||
153 | return (struct slob_page *)virt_to_page(addr); | ||
154 | } | ||
155 | |||
156 | /* | ||
157 | * slob_page_free: true for pages on free_slob_pages list. | 106 | * slob_page_free: true for pages on free_slob_pages list. |
158 | */ | 107 | */ |
159 | static inline int slob_page_free(struct slob_page *sp) | 108 | static inline int slob_page_free(struct page *sp) |
160 | { | 109 | { |
161 | return PageSlobFree((struct page *)sp); | 110 | return PageSlobFree(sp); |
162 | } | 111 | } |
163 | 112 | ||
164 | static void set_slob_page_free(struct slob_page *sp, struct list_head *list) | 113 | static void set_slob_page_free(struct page *sp, struct list_head *list) |
165 | { | 114 | { |
166 | list_add(&sp->list, list); | 115 | list_add(&sp->list, list); |
167 | __SetPageSlobFree((struct page *)sp); | 116 | __SetPageSlobFree(sp); |
168 | } | 117 | } |
169 | 118 | ||
170 | static inline void clear_slob_page_free(struct slob_page *sp) | 119 | static inline void clear_slob_page_free(struct page *sp) |
171 | { | 120 | { |
172 | list_del(&sp->list); | 121 | list_del(&sp->list); |
173 | __ClearPageSlobFree((struct page *)sp); | 122 | __ClearPageSlobFree(sp); |
174 | } | 123 | } |
175 | 124 | ||
176 | #define SLOB_UNIT sizeof(slob_t) | 125 | #define SLOB_UNIT sizeof(slob_t) |
@@ -267,12 +216,12 @@ static void slob_free_pages(void *b, int order) | |||
267 | /* | 216 | /* |
268 | * Allocate a slob block within a given slob_page sp. | 217 | * Allocate a slob block within a given slob_page sp. |
269 | */ | 218 | */ |
270 | static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) | 219 | static void *slob_page_alloc(struct page *sp, size_t size, int align) |
271 | { | 220 | { |
272 | slob_t *prev, *cur, *aligned = NULL; | 221 | slob_t *prev, *cur, *aligned = NULL; |
273 | int delta = 0, units = SLOB_UNITS(size); | 222 | int delta = 0, units = SLOB_UNITS(size); |
274 | 223 | ||
275 | for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { | 224 | for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) { |
276 | slobidx_t avail = slob_units(cur); | 225 | slobidx_t avail = slob_units(cur); |
277 | 226 | ||
278 | if (align) { | 227 | if (align) { |
@@ -296,12 +245,12 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) | |||
296 | if (prev) | 245 | if (prev) |
297 | set_slob(prev, slob_units(prev), next); | 246 | set_slob(prev, slob_units(prev), next); |
298 | else | 247 | else |
299 | sp->free = next; | 248 | sp->freelist = next; |
300 | } else { /* fragment */ | 249 | } else { /* fragment */ |
301 | if (prev) | 250 | if (prev) |
302 | set_slob(prev, slob_units(prev), cur + units); | 251 | set_slob(prev, slob_units(prev), cur + units); |
303 | else | 252 | else |
304 | sp->free = cur + units; | 253 | sp->freelist = cur + units; |
305 | set_slob(cur + units, avail - units, next); | 254 | set_slob(cur + units, avail - units, next); |
306 | } | 255 | } |
307 | 256 | ||
@@ -320,7 +269,7 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) | |||
320 | */ | 269 | */ |
321 | static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | 270 | static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) |
322 | { | 271 | { |
323 | struct slob_page *sp; | 272 | struct page *sp; |
324 | struct list_head *prev; | 273 | struct list_head *prev; |
325 | struct list_head *slob_list; | 274 | struct list_head *slob_list; |
326 | slob_t *b = NULL; | 275 | slob_t *b = NULL; |
@@ -341,7 +290,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | |||
341 | * If there's a node specification, search for a partial | 290 | * If there's a node specification, search for a partial |
342 | * page with a matching node id in the freelist. | 291 | * page with a matching node id in the freelist. |
343 | */ | 292 | */ |
344 | if (node != -1 && page_to_nid(&sp->page) != node) | 293 | if (node != -1 && page_to_nid(sp) != node) |
345 | continue; | 294 | continue; |
346 | #endif | 295 | #endif |
347 | /* Enough room on this page? */ | 296 | /* Enough room on this page? */ |
@@ -369,12 +318,12 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | |||
369 | b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); | 318 | b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); |
370 | if (!b) | 319 | if (!b) |
371 | return NULL; | 320 | return NULL; |
372 | sp = slob_page(b); | 321 | sp = virt_to_page(b); |
373 | set_slob_page(sp); | 322 | __SetPageSlab(sp); |
374 | 323 | ||
375 | spin_lock_irqsave(&slob_lock, flags); | 324 | spin_lock_irqsave(&slob_lock, flags); |
376 | sp->units = SLOB_UNITS(PAGE_SIZE); | 325 | sp->units = SLOB_UNITS(PAGE_SIZE); |
377 | sp->free = b; | 326 | sp->freelist = b; |
378 | INIT_LIST_HEAD(&sp->list); | 327 | INIT_LIST_HEAD(&sp->list); |
379 | set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); | 328 | set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); |
380 | set_slob_page_free(sp, slob_list); | 329 | set_slob_page_free(sp, slob_list); |
@@ -392,7 +341,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) | |||
392 | */ | 341 | */ |
393 | static void slob_free(void *block, int size) | 342 | static void slob_free(void *block, int size) |
394 | { | 343 | { |
395 | struct slob_page *sp; | 344 | struct page *sp; |
396 | slob_t *prev, *next, *b = (slob_t *)block; | 345 | slob_t *prev, *next, *b = (slob_t *)block; |
397 | slobidx_t units; | 346 | slobidx_t units; |
398 | unsigned long flags; | 347 | unsigned long flags; |
@@ -402,7 +351,7 @@ static void slob_free(void *block, int size) | |||
402 | return; | 351 | return; |
403 | BUG_ON(!size); | 352 | BUG_ON(!size); |
404 | 353 | ||
405 | sp = slob_page(block); | 354 | sp = virt_to_page(block); |
406 | units = SLOB_UNITS(size); | 355 | units = SLOB_UNITS(size); |
407 | 356 | ||
408 | spin_lock_irqsave(&slob_lock, flags); | 357 | spin_lock_irqsave(&slob_lock, flags); |
@@ -412,8 +361,8 @@ static void slob_free(void *block, int size) | |||
412 | if (slob_page_free(sp)) | 361 | if (slob_page_free(sp)) |
413 | clear_slob_page_free(sp); | 362 | clear_slob_page_free(sp); |
414 | spin_unlock_irqrestore(&slob_lock, flags); | 363 | spin_unlock_irqrestore(&slob_lock, flags); |
415 | clear_slob_page(sp); | 364 | __ClearPageSlab(sp); |
416 | free_slob_page(sp); | 365 | reset_page_mapcount(sp); |
417 | slob_free_pages(b, 0); | 366 | slob_free_pages(b, 0); |
418 | return; | 367 | return; |
419 | } | 368 | } |
@@ -421,7 +370,7 @@ static void slob_free(void *block, int size) | |||
421 | if (!slob_page_free(sp)) { | 370 | if (!slob_page_free(sp)) { |
422 | /* This slob page is about to become partially free. Easy! */ | 371 | /* This slob page is about to become partially free. Easy! */ |
423 | sp->units = units; | 372 | sp->units = units; |
424 | sp->free = b; | 373 | sp->freelist = b; |
425 | set_slob(b, units, | 374 | set_slob(b, units, |
426 | (void *)((unsigned long)(b + | 375 | (void *)((unsigned long)(b + |
427 | SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); | 376 | SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); |
@@ -441,15 +390,15 @@ static void slob_free(void *block, int size) | |||
441 | */ | 390 | */ |
442 | sp->units += units; | 391 | sp->units += units; |
443 | 392 | ||
444 | if (b < sp->free) { | 393 | if (b < (slob_t *)sp->freelist) { |
445 | if (b + units == sp->free) { | 394 | if (b + units == sp->freelist) { |
446 | units += slob_units(sp->free); | 395 | units += slob_units(sp->freelist); |
447 | sp->free = slob_next(sp->free); | 396 | sp->freelist = slob_next(sp->freelist); |
448 | } | 397 | } |
449 | set_slob(b, units, sp->free); | 398 | set_slob(b, units, sp->freelist); |
450 | sp->free = b; | 399 | sp->freelist = b; |
451 | } else { | 400 | } else { |
452 | prev = sp->free; | 401 | prev = sp->freelist; |
453 | next = slob_next(prev); | 402 | next = slob_next(prev); |
454 | while (b > next) { | 403 | while (b > next) { |
455 | prev = next; | 404 | prev = next; |
@@ -522,7 +471,7 @@ EXPORT_SYMBOL(__kmalloc_node); | |||
522 | 471 | ||
523 | void kfree(const void *block) | 472 | void kfree(const void *block) |
524 | { | 473 | { |
525 | struct slob_page *sp; | 474 | struct page *sp; |
526 | 475 | ||
527 | trace_kfree(_RET_IP_, block); | 476 | trace_kfree(_RET_IP_, block); |
528 | 477 | ||
@@ -530,43 +479,36 @@ void kfree(const void *block) | |||
530 | return; | 479 | return; |
531 | kmemleak_free(block); | 480 | kmemleak_free(block); |
532 | 481 | ||
533 | sp = slob_page(block); | 482 | sp = virt_to_page(block); |
534 | if (is_slob_page(sp)) { | 483 | if (PageSlab(sp)) { |
535 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | 484 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
536 | unsigned int *m = (unsigned int *)(block - align); | 485 | unsigned int *m = (unsigned int *)(block - align); |
537 | slob_free(m, *m + align); | 486 | slob_free(m, *m + align); |
538 | } else | 487 | } else |
539 | put_page(&sp->page); | 488 | put_page(sp); |
540 | } | 489 | } |
541 | EXPORT_SYMBOL(kfree); | 490 | EXPORT_SYMBOL(kfree); |
542 | 491 | ||
543 | /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ | 492 | /* can't use ksize for kmem_cache_alloc memory, only kmalloc */ |
544 | size_t ksize(const void *block) | 493 | size_t ksize(const void *block) |
545 | { | 494 | { |
546 | struct slob_page *sp; | 495 | struct page *sp; |
547 | 496 | ||
548 | BUG_ON(!block); | 497 | BUG_ON(!block); |
549 | if (unlikely(block == ZERO_SIZE_PTR)) | 498 | if (unlikely(block == ZERO_SIZE_PTR)) |
550 | return 0; | 499 | return 0; |
551 | 500 | ||
552 | sp = slob_page(block); | 501 | sp = virt_to_page(block); |
553 | if (is_slob_page(sp)) { | 502 | if (PageSlab(sp)) { |
554 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); | 503 | int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); |
555 | unsigned int *m = (unsigned int *)(block - align); | 504 | unsigned int *m = (unsigned int *)(block - align); |
556 | return SLOB_UNITS(*m) * SLOB_UNIT; | 505 | return SLOB_UNITS(*m) * SLOB_UNIT; |
557 | } else | 506 | } else |
558 | return sp->page.private; | 507 | return sp->private; |
559 | } | 508 | } |
560 | EXPORT_SYMBOL(ksize); | 509 | EXPORT_SYMBOL(ksize); |
561 | 510 | ||
562 | struct kmem_cache { | 511 | struct kmem_cache *__kmem_cache_create(const char *name, size_t size, |
563 | unsigned int size, align; | ||
564 | unsigned long flags; | ||
565 | const char *name; | ||
566 | void (*ctor)(void *); | ||
567 | }; | ||
568 | |||
569 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | ||
570 | size_t align, unsigned long flags, void (*ctor)(void *)) | 512 | size_t align, unsigned long flags, void (*ctor)(void *)) |
571 | { | 513 | { |
572 | struct kmem_cache *c; | 514 | struct kmem_cache *c; |
@@ -589,13 +531,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
589 | c->align = ARCH_SLAB_MINALIGN; | 531 | c->align = ARCH_SLAB_MINALIGN; |
590 | if (c->align < align) | 532 | if (c->align < align) |
591 | c->align = align; | 533 | c->align = align; |
592 | } else if (flags & SLAB_PANIC) | ||
593 | panic("Cannot create slab cache %s\n", name); | ||
594 | 534 | ||
595 | kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); | 535 | kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); |
536 | c->refcount = 1; | ||
537 | } | ||
596 | return c; | 538 | return c; |
597 | } | 539 | } |
598 | EXPORT_SYMBOL(kmem_cache_create); | ||
599 | 540 | ||
600 | void kmem_cache_destroy(struct kmem_cache *c) | 541 | void kmem_cache_destroy(struct kmem_cache *c) |
601 | { | 542 | { |
@@ -678,19 +619,12 @@ int kmem_cache_shrink(struct kmem_cache *d) | |||
678 | } | 619 | } |
679 | EXPORT_SYMBOL(kmem_cache_shrink); | 620 | EXPORT_SYMBOL(kmem_cache_shrink); |
680 | 621 | ||
681 | static unsigned int slob_ready __read_mostly; | ||
682 | |||
683 | int slab_is_available(void) | ||
684 | { | ||
685 | return slob_ready; | ||
686 | } | ||
687 | |||
688 | void __init kmem_cache_init(void) | 622 | void __init kmem_cache_init(void) |
689 | { | 623 | { |
690 | slob_ready = 1; | 624 | slab_state = UP; |
691 | } | 625 | } |
692 | 626 | ||
693 | void __init kmem_cache_init_late(void) | 627 | void __init kmem_cache_init_late(void) |
694 | { | 628 | { |
695 | /* Nothing to do */ | 629 | slab_state = FULL; |
696 | } | 630 | } |
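slob's kfree() and ksize() above recover the allocation size from a word stored just before the pointer kmalloc() returned (at the minalign offset). A self-contained sketch of that size-prefix trick using plain malloc(); it skips slob's rounding to SLOB_UNIT granularity and the page-flag checks, so it is a model rather than the real path:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define ALIGN_HDR sizeof(size_t)	/* stand-in for the minalign header */

	/* Allocate size bytes, recording the request just before the
	 * returned pointer, the way slob's kmalloc path records *m. */
	static void *prefix_alloc(size_t size)
	{
		unsigned char *m = malloc(ALIGN_HDR + size);

		if (!m)
			return NULL;
		memcpy(m, &size, sizeof(size));
		return m + ALIGN_HDR;
	}

	static size_t prefix_size(const void *p)	/* ksize() analogue */
	{
		size_t size;

		memcpy(&size, (const unsigned char *)p - ALIGN_HDR, sizeof(size));
		return size;
	}

	static void prefix_free(void *p)		/* kfree() analogue */
	{
		free((unsigned char *)p - ALIGN_HDR);
	}

	int main(void)
	{
		void *p = prefix_alloc(100);

		if (p) {
			printf("%zu\n", prefix_size(p));	/* prints 100 */
			prefix_free(p);
		}
		return 0;
	}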
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/interrupt.h> | 16 | #include <linux/interrupt.h> |
17 | #include <linux/bitops.h> | 17 | #include <linux/bitops.h> |
18 | #include <linux/slab.h> | 18 | #include <linux/slab.h> |
19 | #include "slab.h" | ||
19 | #include <linux/proc_fs.h> | 20 | #include <linux/proc_fs.h> |
20 | #include <linux/seq_file.h> | 21 | #include <linux/seq_file.h> |
21 | #include <linux/kmemcheck.h> | 22 | #include <linux/kmemcheck.h> |
@@ -33,15 +34,17 @@ | |||
33 | 34 | ||
34 | #include <trace/events/kmem.h> | 35 | #include <trace/events/kmem.h> |
35 | 36 | ||
37 | #include "internal.h" | ||
38 | |||
36 | /* | 39 | /* |
37 | * Lock order: | 40 | * Lock order: |
38 | * 1. slub_lock (Global Semaphore) | 41 | * 1. slab_mutex (Global Mutex) |
39 | * 2. node->list_lock | 42 | * 2. node->list_lock |
40 | * 3. slab_lock(page) (Only on some arches and for debugging) | 43 | * 3. slab_lock(page) (Only on some arches and for debugging) |
41 | * | 44 | * |
42 | * slub_lock | 45 | * slab_mutex |
43 | * | 46 | * |
44 | * The role of the slub_lock is to protect the list of all the slabs | 47 | * The role of the slab_mutex is to protect the list of all the slabs |
45 | * and to synchronize major metadata changes to slab cache structures. | 48 | * and to synchronize major metadata changes to slab cache structures. |
46 | * | 49 | * |
47 | * The slab_lock is only used for debugging and on arches that do not | 50 | * The slab_lock is only used for debugging and on arches that do not |
@@ -182,17 +185,6 @@ static int kmem_size = sizeof(struct kmem_cache); | |||
182 | static struct notifier_block slab_notifier; | 185 | static struct notifier_block slab_notifier; |
183 | #endif | 186 | #endif |
184 | 187 | ||
185 | static enum { | ||
186 | DOWN, /* No slab functionality available */ | ||
187 | PARTIAL, /* Kmem_cache_node works */ | ||
188 | UP, /* Everything works but does not show up in sysfs */ | ||
189 | SYSFS /* Sysfs up */ | ||
190 | } slab_state = DOWN; | ||
191 | |||
192 | /* A list of all slab caches on the system */ | ||
193 | static DECLARE_RWSEM(slub_lock); | ||
194 | static LIST_HEAD(slab_caches); | ||
195 | |||
196 | /* | 188 | /* |
197 | * Tracking user of a slab. | 189 | * Tracking user of a slab. |
198 | */ | 190 | */ |
@@ -237,11 +229,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) | |||
237 | * Core slab cache functions | 229 | * Core slab cache functions |
238 | *******************************************************************/ | 230 | *******************************************************************/ |
239 | 231 | ||
240 | int slab_is_available(void) | ||
241 | { | ||
242 | return slab_state >= UP; | ||
243 | } | ||
244 | |||
245 | static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | 232 | static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) |
246 | { | 233 | { |
247 | return s->node[node]; | 234 | return s->node[node]; |
@@ -311,7 +298,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) | |||
311 | * and whatever may come after it. | 298 | * and whatever may come after it. |
312 | */ | 299 | */ |
313 | if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) | 300 | if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) |
314 | return s->objsize; | 301 | return s->object_size; |
315 | 302 | ||
316 | #endif | 303 | #endif |
317 | /* | 304 | /* |
@@ -609,11 +596,11 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) | |||
609 | if (p > addr + 16) | 596 | if (p > addr + 16) |
610 | print_section("Bytes b4 ", p - 16, 16); | 597 | print_section("Bytes b4 ", p - 16, 16); |
611 | 598 | ||
612 | print_section("Object ", p, min_t(unsigned long, s->objsize, | 599 | print_section("Object ", p, min_t(unsigned long, s->object_size, |
613 | PAGE_SIZE)); | 600 | PAGE_SIZE)); |
614 | if (s->flags & SLAB_RED_ZONE) | 601 | if (s->flags & SLAB_RED_ZONE) |
615 | print_section("Redzone ", p + s->objsize, | 602 | print_section("Redzone ", p + s->object_size, |
616 | s->inuse - s->objsize); | 603 | s->inuse - s->object_size); |
617 | 604 | ||
618 | if (s->offset) | 605 | if (s->offset) |
619 | off = s->offset + sizeof(void *); | 606 | off = s->offset + sizeof(void *); |
@@ -655,12 +642,12 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) | |||
655 | u8 *p = object; | 642 | u8 *p = object; |
656 | 643 | ||
657 | if (s->flags & __OBJECT_POISON) { | 644 | if (s->flags & __OBJECT_POISON) { |
658 | memset(p, POISON_FREE, s->objsize - 1); | 645 | memset(p, POISON_FREE, s->object_size - 1); |
659 | p[s->objsize - 1] = POISON_END; | 646 | p[s->object_size - 1] = POISON_END; |
660 | } | 647 | } |
661 | 648 | ||
662 | if (s->flags & SLAB_RED_ZONE) | 649 | if (s->flags & SLAB_RED_ZONE) |
663 | memset(p + s->objsize, val, s->inuse - s->objsize); | 650 | memset(p + s->object_size, val, s->inuse - s->object_size); |
664 | } | 651 | } |
665 | 652 | ||
666 | static void restore_bytes(struct kmem_cache *s, char *message, u8 data, | 653 | static void restore_bytes(struct kmem_cache *s, char *message, u8 data, |
@@ -705,10 +692,10 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, | |||
705 | * Poisoning uses 0x6b (POISON_FREE) and the last byte is | 692 | * Poisoning uses 0x6b (POISON_FREE) and the last byte is |
706 | * 0xa5 (POISON_END) | 693 | * 0xa5 (POISON_END) |
707 | * | 694 | * |
708 | * object + s->objsize | 695 | * object + s->object_size |
709 | * Padding to reach word boundary. This is also used for Redzoning. | 696 | * Padding to reach word boundary. This is also used for Redzoning. |
710 | * Padding is extended by another word if Redzoning is enabled and | 697 | * Padding is extended by another word if Redzoning is enabled and |
711 | * objsize == inuse. | 698 | * object_size == inuse. |
712 | * | 699 | * |
713 | * We fill with 0xbb (RED_INACTIVE) for inactive objects and with | 700 | * We fill with 0xbb (RED_INACTIVE) for inactive objects and with |
714 | * 0xcc (RED_ACTIVE) for objects in use. | 701 | * 0xcc (RED_ACTIVE) for objects in use. |
@@ -727,7 +714,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, | |||
727 | * object + s->size | 714 | * object + s->size |
728 | * Nothing is used beyond s->size. | 715 | * Nothing is used beyond s->size. |
729 | * | 716 | * |
730 | * If slabcaches are merged then the objsize and inuse boundaries are mostly | 717 | * If slabcaches are merged then the object_size and inuse boundaries are mostly |
731 | * ignored. And therefore no slab options that rely on these boundaries | 718 | * ignored. And therefore no slab options that rely on these boundaries |
732 | * may be used with merged slabcaches. | 719 | * may be used with merged slabcaches. |
733 | */ | 720 | */ |
@@ -787,25 +774,25 @@ static int check_object(struct kmem_cache *s, struct page *page, | |||
787 | void *object, u8 val) | 774 | void *object, u8 val) |
788 | { | 775 | { |
789 | u8 *p = object; | 776 | u8 *p = object; |
790 | u8 *endobject = object + s->objsize; | 777 | u8 *endobject = object + s->object_size; |
791 | 778 | ||
792 | if (s->flags & SLAB_RED_ZONE) { | 779 | if (s->flags & SLAB_RED_ZONE) { |
793 | if (!check_bytes_and_report(s, page, object, "Redzone", | 780 | if (!check_bytes_and_report(s, page, object, "Redzone", |
794 | endobject, val, s->inuse - s->objsize)) | 781 | endobject, val, s->inuse - s->object_size)) |
795 | return 0; | 782 | return 0; |
796 | } else { | 783 | } else { |
797 | if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { | 784 | if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { |
798 | check_bytes_and_report(s, page, p, "Alignment padding", | 785 | check_bytes_and_report(s, page, p, "Alignment padding", |
799 | endobject, POISON_INUSE, s->inuse - s->objsize); | 786 | endobject, POISON_INUSE, s->inuse - s->object_size); |
800 | } | 787 | } |
801 | } | 788 | } |
802 | 789 | ||
803 | if (s->flags & SLAB_POISON) { | 790 | if (s->flags & SLAB_POISON) { |
804 | if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && | 791 | if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && |
805 | (!check_bytes_and_report(s, page, p, "Poison", p, | 792 | (!check_bytes_and_report(s, page, p, "Poison", p, |
806 | POISON_FREE, s->objsize - 1) || | 793 | POISON_FREE, s->object_size - 1) || |
807 | !check_bytes_and_report(s, page, p, "Poison", | 794 | !check_bytes_and_report(s, page, p, "Poison", |
808 | p + s->objsize - 1, POISON_END, 1))) | 795 | p + s->object_size - 1, POISON_END, 1))) |
809 | return 0; | 796 | return 0; |
810 | /* | 797 | /* |
811 | * check_pad_bytes cleans up on its own. | 798 | * check_pad_bytes cleans up on its own. |
@@ -926,7 +913,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, | |||
926 | page->freelist); | 913 | page->freelist); |
927 | 914 | ||
928 | if (!alloc) | 915 | if (!alloc) |
929 | print_section("Object ", (void *)object, s->objsize); | 916 | print_section("Object ", (void *)object, s->object_size); |
930 | 917 | ||
931 | dump_stack(); | 918 | dump_stack(); |
932 | } | 919 | } |
@@ -942,14 +929,14 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) | |||
942 | lockdep_trace_alloc(flags); | 929 | lockdep_trace_alloc(flags); |
943 | might_sleep_if(flags & __GFP_WAIT); | 930 | might_sleep_if(flags & __GFP_WAIT); |
944 | 931 | ||
945 | return should_failslab(s->objsize, flags, s->flags); | 932 | return should_failslab(s->object_size, flags, s->flags); |
946 | } | 933 | } |
947 | 934 | ||
948 | static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) | 935 | static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) |
949 | { | 936 | { |
950 | flags &= gfp_allowed_mask; | 937 | flags &= gfp_allowed_mask; |
951 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); | 938 | kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); |
952 | kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); | 939 | kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); |
953 | } | 940 | } |
954 | 941 | ||
955 | static inline void slab_free_hook(struct kmem_cache *s, void *x) | 942 | static inline void slab_free_hook(struct kmem_cache *s, void *x) |
@@ -966,13 +953,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
966 | unsigned long flags; | 953 | unsigned long flags; |
967 | 954 | ||
968 | local_irq_save(flags); | 955 | local_irq_save(flags); |
969 | kmemcheck_slab_free(s, x, s->objsize); | 956 | kmemcheck_slab_free(s, x, s->object_size); |
970 | debug_check_no_locks_freed(x, s->objsize); | 957 | debug_check_no_locks_freed(x, s->object_size); |
971 | local_irq_restore(flags); | 958 | local_irq_restore(flags); |
972 | } | 959 | } |
973 | #endif | 960 | #endif |
974 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | 961 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) |
975 | debug_check_no_obj_freed(x, s->objsize); | 962 | debug_check_no_obj_freed(x, s->object_size); |
976 | } | 963 | } |
977 | 964 | ||
978 | /* | 965 | /* |
@@ -1207,7 +1194,7 @@ out: | |||
1207 | 1194 | ||
1208 | __setup("slub_debug", setup_slub_debug); | 1195 | __setup("slub_debug", setup_slub_debug); |
1209 | 1196 | ||
1210 | static unsigned long kmem_cache_flags(unsigned long objsize, | 1197 | static unsigned long kmem_cache_flags(unsigned long object_size, |
1211 | unsigned long flags, const char *name, | 1198 | unsigned long flags, const char *name, |
1212 | void (*ctor)(void *)) | 1199 | void (*ctor)(void *)) |
1213 | { | 1200 | { |
@@ -1237,7 +1224,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page, | |||
1237 | static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, | 1224 | static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, |
1238 | struct page *page) {} | 1225 | struct page *page) {} |
1239 | static inline void remove_full(struct kmem_cache *s, struct page *page) {} | 1226 | static inline void remove_full(struct kmem_cache *s, struct page *page) {} |
1240 | static inline unsigned long kmem_cache_flags(unsigned long objsize, | 1227 | static inline unsigned long kmem_cache_flags(unsigned long object_size, |
1241 | unsigned long flags, const char *name, | 1228 | unsigned long flags, const char *name, |
1242 | void (*ctor)(void *)) | 1229 | void (*ctor)(void *)) |
1243 | { | 1230 | { |
@@ -1314,13 +1301,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1314 | stat(s, ORDER_FALLBACK); | 1301 | stat(s, ORDER_FALLBACK); |
1315 | } | 1302 | } |
1316 | 1303 | ||
1317 | if (flags & __GFP_WAIT) | 1304 | if (kmemcheck_enabled && page |
1318 | local_irq_disable(); | ||
1319 | |||
1320 | if (!page) | ||
1321 | return NULL; | ||
1322 | |||
1323 | if (kmemcheck_enabled | ||
1324 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { | 1305 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { |
1325 | int pages = 1 << oo_order(oo); | 1306 | int pages = 1 << oo_order(oo); |
1326 | 1307 | ||
@@ -1336,6 +1317,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1336 | kmemcheck_mark_unallocated_pages(page, pages); | 1317 | kmemcheck_mark_unallocated_pages(page, pages); |
1337 | } | 1318 | } |
1338 | 1319 | ||
1320 | if (flags & __GFP_WAIT) | ||
1321 | local_irq_disable(); | ||
1322 | if (!page) | ||
1323 | return NULL; | ||
1324 | |||
1339 | page->objects = oo_objects(oo); | 1325 | page->objects = oo_objects(oo); |
1340 | mod_zone_page_state(page_zone(page), | 1326 | mod_zone_page_state(page_zone(page), |
1341 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? | 1327 | (s->flags & SLAB_RECLAIM_ACCOUNT) ? |
@@ -1370,6 +1356,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1370 | inc_slabs_node(s, page_to_nid(page), page->objects); | 1356 | inc_slabs_node(s, page_to_nid(page), page->objects); |
1371 | page->slab = s; | 1357 | page->slab = s; |
1372 | __SetPageSlab(page); | 1358 | __SetPageSlab(page); |
1359 | if (page->pfmemalloc) | ||
1360 | SetPageSlabPfmemalloc(page); | ||
1373 | 1361 | ||
1374 | start = page_address(page); | 1362 | start = page_address(page); |
1375 | 1363 | ||
@@ -1413,6 +1401,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1413 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | 1401 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, |
1414 | -pages); | 1402 | -pages); |
1415 | 1403 | ||
1404 | __ClearPageSlabPfmemalloc(page); | ||
1416 | __ClearPageSlab(page); | 1405 | __ClearPageSlab(page); |
1417 | reset_page_mapcount(page); | 1406 | reset_page_mapcount(page); |
1418 | if (current->reclaim_state) | 1407 | if (current->reclaim_state) |
@@ -1490,12 +1479,12 @@ static inline void remove_partial(struct kmem_cache_node *n, | |||
1490 | } | 1479 | } |
1491 | 1480 | ||
1492 | /* | 1481 | /* |
1493 | * Lock slab, remove from the partial list and put the object into the | 1482 | * Remove slab from the partial list, freeze it and |
1494 | * per cpu freelist. | 1483 | * return the pointer to the freelist. |
1495 | * | 1484 | * |
1496 | * Returns a list of objects or NULL if it fails. | 1485 | * Returns a list of objects or NULL if it fails. |
1497 | * | 1486 | * |
1498 | * Must hold list_lock. | 1487 | * Must hold list_lock since we modify the partial list. |
1499 | */ | 1488 | */ |
1500 | static inline void *acquire_slab(struct kmem_cache *s, | 1489 | static inline void *acquire_slab(struct kmem_cache *s, |
1501 | struct kmem_cache_node *n, struct page *page, | 1490 | struct kmem_cache_node *n, struct page *page, |
@@ -1510,26 +1499,27 @@ static inline void *acquire_slab(struct kmem_cache *s, | |||
1510 | * The old freelist is the list of objects for the | 1499 | * The old freelist is the list of objects for the |
1511 | * per cpu allocation list. | 1500 | * per cpu allocation list. |
1512 | */ | 1501 | */ |
1513 | do { | 1502 | freelist = page->freelist; |
1514 | freelist = page->freelist; | 1503 | counters = page->counters; |
1515 | counters = page->counters; | 1504 | new.counters = counters; |
1516 | new.counters = counters; | 1505 | if (mode) { |
1517 | if (mode) { | 1506 | new.inuse = page->objects; |
1518 | new.inuse = page->objects; | 1507 | new.freelist = NULL; |
1519 | new.freelist = NULL; | 1508 | } else { |
1520 | } else { | 1509 | new.freelist = freelist; |
1521 | new.freelist = freelist; | 1510 | } |
1522 | } | ||
1523 | 1511 | ||
1524 | VM_BUG_ON(new.frozen); | 1512 | VM_BUG_ON(new.frozen); |
1525 | new.frozen = 1; | 1513 | new.frozen = 1; |
1526 | 1514 | ||
1527 | } while (!__cmpxchg_double_slab(s, page, | 1515 | if (!__cmpxchg_double_slab(s, page, |
1528 | freelist, counters, | 1516 | freelist, counters, |
1529 | new.freelist, new.counters, | 1517 | new.freelist, new.counters, |
1530 | "lock and freeze")); | 1518 | "acquire_slab")) |
1519 | return NULL; | ||
1531 | 1520 | ||
1532 | remove_partial(n, page); | 1521 | remove_partial(n, page); |
1522 | WARN_ON(!freelist); | ||
1533 | return freelist; | 1523 | return freelist; |
1534 | } | 1524 | } |
1535 | 1525 | ||
@@ -1563,7 +1553,6 @@ static void *get_partial_node(struct kmem_cache *s, | |||
1563 | 1553 | ||
1564 | if (!object) { | 1554 | if (!object) { |
1565 | c->page = page; | 1555 | c->page = page; |
1566 | c->node = page_to_nid(page); | ||
1567 | stat(s, ALLOC_FROM_PARTIAL); | 1556 | stat(s, ALLOC_FROM_PARTIAL); |
1568 | object = t; | 1557 | object = t; |
1569 | available = page->objects - page->inuse; | 1558 | available = page->objects - page->inuse; |
@@ -1617,7 +1606,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, | |||
1617 | 1606 | ||
1618 | do { | 1607 | do { |
1619 | cpuset_mems_cookie = get_mems_allowed(); | 1608 | cpuset_mems_cookie = get_mems_allowed(); |
1620 | zonelist = node_zonelist(slab_node(current->mempolicy), flags); | 1609 | zonelist = node_zonelist(slab_node(), flags); |
1621 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { | 1610 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1622 | struct kmem_cache_node *n; | 1611 | struct kmem_cache_node *n; |
1623 | 1612 | ||
@@ -1731,14 +1720,12 @@ void init_kmem_cache_cpus(struct kmem_cache *s) | |||
1731 | /* | 1720 | /* |
1732 | * Remove the cpu slab | 1721 | * Remove the cpu slab |
1733 | */ | 1722 | */ |
1734 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1723 | static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freelist) |
1735 | { | 1724 | { |
1736 | enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; | 1725 | enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; |
1737 | struct page *page = c->page; | ||
1738 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | 1726 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
1739 | int lock = 0; | 1727 | int lock = 0; |
1740 | enum slab_modes l = M_NONE, m = M_NONE; | 1728 | enum slab_modes l = M_NONE, m = M_NONE; |
1741 | void *freelist; | ||
1742 | void *nextfree; | 1729 | void *nextfree; |
1743 | int tail = DEACTIVATE_TO_HEAD; | 1730 | int tail = DEACTIVATE_TO_HEAD; |
1744 | struct page new; | 1731 | struct page new; |
@@ -1749,11 +1736,6 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1749 | tail = DEACTIVATE_TO_TAIL; | 1736 | tail = DEACTIVATE_TO_TAIL; |
1750 | } | 1737 | } |
1751 | 1738 | ||
1752 | c->tid = next_tid(c->tid); | ||
1753 | c->page = NULL; | ||
1754 | freelist = c->freelist; | ||
1755 | c->freelist = NULL; | ||
1756 | |||
1757 | /* | 1739 | /* |
1758 | * Stage one: Free all available per cpu objects back | 1740 | * Stage one: Free all available per cpu objects back |
1759 | * to the page freelist while it is still frozen. Leave the | 1741 | * to the page freelist while it is still frozen. Leave the |
@@ -1879,21 +1861,31 @@ redo: | |||
1879 | } | 1861 | } |
1880 | } | 1862 | } |
1881 | 1863 | ||
1882 | /* Unfreeze all the cpu partial slabs */ | 1864 | /* |
1865 | * Unfreeze all the cpu partial slabs. | ||
1866 | * | ||
1867 | * This function must be called with interrupts disabled. | ||
1868 | */ | ||
1883 | static void unfreeze_partials(struct kmem_cache *s) | 1869 | static void unfreeze_partials(struct kmem_cache *s) |
1884 | { | 1870 | { |
1885 | struct kmem_cache_node *n = NULL; | 1871 | struct kmem_cache_node *n = NULL, *n2 = NULL; |
1886 | struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); | 1872 | struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); |
1887 | struct page *page, *discard_page = NULL; | 1873 | struct page *page, *discard_page = NULL; |
1888 | 1874 | ||
1889 | while ((page = c->partial)) { | 1875 | while ((page = c->partial)) { |
1890 | enum slab_modes { M_PARTIAL, M_FREE }; | ||
1891 | enum slab_modes l, m; | ||
1892 | struct page new; | 1876 | struct page new; |
1893 | struct page old; | 1877 | struct page old; |
1894 | 1878 | ||
1895 | c->partial = page->next; | 1879 | c->partial = page->next; |
1896 | l = M_FREE; | 1880 | |
1881 | n2 = get_node(s, page_to_nid(page)); | ||
1882 | if (n != n2) { | ||
1883 | if (n) | ||
1884 | spin_unlock(&n->list_lock); | ||
1885 | |||
1886 | n = n2; | ||
1887 | spin_lock(&n->list_lock); | ||
1888 | } | ||
1897 | 1889 | ||
1898 | do { | 1890 | do { |
1899 | 1891 | ||
@@ -1906,43 +1898,17 @@ static void unfreeze_partials(struct kmem_cache *s) | |||
1906 | 1898 | ||
1907 | new.frozen = 0; | 1899 | new.frozen = 0; |
1908 | 1900 | ||
1909 | if (!new.inuse && (!n || n->nr_partial > s->min_partial)) | 1901 | } while (!__cmpxchg_double_slab(s, page, |
1910 | m = M_FREE; | ||
1911 | else { | ||
1912 | struct kmem_cache_node *n2 = get_node(s, | ||
1913 | page_to_nid(page)); | ||
1914 | |||
1915 | m = M_PARTIAL; | ||
1916 | if (n != n2) { | ||
1917 | if (n) | ||
1918 | spin_unlock(&n->list_lock); | ||
1919 | |||
1920 | n = n2; | ||
1921 | spin_lock(&n->list_lock); | ||
1922 | } | ||
1923 | } | ||
1924 | |||
1925 | if (l != m) { | ||
1926 | if (l == M_PARTIAL) { | ||
1927 | remove_partial(n, page); | ||
1928 | stat(s, FREE_REMOVE_PARTIAL); | ||
1929 | } else { | ||
1930 | add_partial(n, page, | ||
1931 | DEACTIVATE_TO_TAIL); | ||
1932 | stat(s, FREE_ADD_PARTIAL); | ||
1933 | } | ||
1934 | |||
1935 | l = m; | ||
1936 | } | ||
1937 | |||
1938 | } while (!cmpxchg_double_slab(s, page, | ||
1939 | old.freelist, old.counters, | 1902 | old.freelist, old.counters, |
1940 | new.freelist, new.counters, | 1903 | new.freelist, new.counters, |
1941 | "unfreezing slab")); | 1904 | "unfreezing slab")); |
1942 | 1905 | ||
1943 | if (m == M_FREE) { | 1906 | if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) { |
1944 | page->next = discard_page; | 1907 | page->next = discard_page; |
1945 | discard_page = page; | 1908 | discard_page = page; |
1909 | } else { | ||
1910 | add_partial(n, page, DEACTIVATE_TO_TAIL); | ||
1911 | stat(s, FREE_ADD_PARTIAL); | ||
1946 | } | 1912 | } |
1947 | } | 1913 | } |
1948 | 1914 | ||
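The rewritten unfreeze_partials() takes a node's list_lock only when the node of the next page differs from the one already locked, so consecutive pages from the same node are handled under a single lock acquisition. A hedged userspace sketch of that lock-batching pattern (items, node ids and locks are invented; build with -pthread):

	#include <pthread.h>
	#include <stdio.h>

	#define NR_NODES 2

	/* Invented stand-ins for pages and per-node list_locks. */
	struct item { int node; struct item *next; };
	static pthread_mutex_t node_lock[NR_NODES] = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
	};

	/* Walk a list, re-locking only when the node id changes, the
	 * way unfreeze_partials() swaps n/n2 above. */
	static void drain(struct item *head)
	{
		int locked = -1;

		for (struct item *it = head; it; it = it->next) {
			if (it->node != locked) {
				if (locked >= 0)
					pthread_mutex_unlock(&node_lock[locked]);
				locked = it->node;
				pthread_mutex_lock(&node_lock[locked]);
			}
			printf("item on node %d handled under its lock\n", it->node);
		}
		if (locked >= 0)
			pthread_mutex_unlock(&node_lock[locked]);
	}

	int main(void)
	{
		struct item c = { 1, NULL }, b = { 0, &c }, a = { 0, &b };

		drain(&a);	/* node 0 locked once for a and b, then node 1 for c */
		return 0;
	}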
@@ -2011,7 +1977,11 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) | |||
2011 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1977 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
2012 | { | 1978 | { |
2013 | stat(s, CPUSLAB_FLUSH); | 1979 | stat(s, CPUSLAB_FLUSH); |
2014 | deactivate_slab(s, c); | 1980 | deactivate_slab(s, c->page, c->freelist); |
1981 | |||
1982 | c->tid = next_tid(c->tid); | ||
1983 | c->page = NULL; | ||
1984 | c->freelist = NULL; | ||
2015 | } | 1985 | } |
2016 | 1986 | ||
2017 | /* | 1987 | /* |
@@ -2055,10 +2025,10 @@ static void flush_all(struct kmem_cache *s) | |||
2055 | * Check if the objects in a per cpu structure fit numa | 2025 | * Check if the objects in a per cpu structure fit numa |
2056 | * locality expectations. | 2026 | * locality expectations. |
2057 | */ | 2027 | */ |
2058 | static inline int node_match(struct kmem_cache_cpu *c, int node) | 2028 | static inline int node_match(struct page *page, int node) |
2059 | { | 2029 | { |
2060 | #ifdef CONFIG_NUMA | 2030 | #ifdef CONFIG_NUMA |
2061 | if (node != NUMA_NO_NODE && c->node != node) | 2031 | if (node != NUMA_NO_NODE && page_to_nid(page) != node) |
2062 | return 0; | 2032 | return 0; |
2063 | #endif | 2033 | #endif |
2064 | return 1; | 2034 | return 1; |
@@ -2101,10 +2071,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | |||
2101 | "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", | 2071 | "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", |
2102 | nid, gfpflags); | 2072 | nid, gfpflags); |
2103 | printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " | 2073 | printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " |
2104 | "default order: %d, min order: %d\n", s->name, s->objsize, | 2074 | "default order: %d, min order: %d\n", s->name, s->object_size, |
2105 | s->size, oo_order(s->oo), oo_order(s->min)); | 2075 | s->size, oo_order(s->oo), oo_order(s->min)); |
2106 | 2076 | ||
2107 | if (oo_order(s->min) > get_order(s->objsize)) | 2077 | if (oo_order(s->min) > get_order(s->object_size)) |
2108 | printk(KERN_WARNING " %s debugging increased min order, use " | 2078 | printk(KERN_WARNING " %s debugging increased min order, use " |
2109 | "slub_debug=O to disable.\n", s->name); | 2079 | "slub_debug=O to disable.\n", s->name); |
2110 | 2080 | ||
@@ -2130,10 +2100,16 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) | |||
2130 | static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | 2100 | static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, |
2131 | int node, struct kmem_cache_cpu **pc) | 2101 | int node, struct kmem_cache_cpu **pc) |
2132 | { | 2102 | { |
2133 | void *object; | 2103 | void *freelist; |
2134 | struct kmem_cache_cpu *c; | 2104 | struct kmem_cache_cpu *c = *pc; |
2135 | struct page *page = new_slab(s, flags, node); | 2105 | struct page *page; |
2136 | 2106 | ||
2107 | freelist = get_partial(s, flags, node, c); | ||
2108 | |||
2109 | if (freelist) | ||
2110 | return freelist; | ||
2111 | |||
2112 | page = new_slab(s, flags, node); | ||
2137 | if (page) { | 2113 | if (page) { |
2138 | c = __this_cpu_ptr(s->cpu_slab); | 2114 | c = __this_cpu_ptr(s->cpu_slab); |
2139 | if (c->page) | 2115 | if (c->page) |
@@ -2143,17 +2119,24 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | |||
2143 | * No other reference to the page yet so we can | 2119 | * No other reference to the page yet so we can |
2144 | * muck around with it freely without cmpxchg | 2120 | * muck around with it freely without cmpxchg |
2145 | */ | 2121 | */ |
2146 | object = page->freelist; | 2122 | freelist = page->freelist; |
2147 | page->freelist = NULL; | 2123 | page->freelist = NULL; |
2148 | 2124 | ||
2149 | stat(s, ALLOC_SLAB); | 2125 | stat(s, ALLOC_SLAB); |
2150 | c->node = page_to_nid(page); | ||
2151 | c->page = page; | 2126 | c->page = page; |
2152 | *pc = c; | 2127 | *pc = c; |
2153 | } else | 2128 | } else |
2154 | object = NULL; | 2129 | freelist = NULL; |
2155 | 2130 | ||
2156 | return object; | 2131 | return freelist; |
2132 | } | ||
2133 | |||
2134 | static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) | ||
2135 | { | ||
2136 | if (unlikely(PageSlabPfmemalloc(page))) | ||
2137 | return gfp_pfmemalloc_allowed(gfpflags); | ||
2138 | |||
2139 | return true; | ||
2157 | } | 2140 | } |
2158 | 2141 | ||
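The pfmemalloc_match() helper added in the hunk above gates the per-cpu fast path: a slab page that came from the emergency reserves (PageSlabPfmemalloc) may only serve allocations that are themselves entitled to those reserves. A standalone sketch of that decision, with illustrative names that are not part of this patch:

	/*
	 * Illustrative only -- mirrors the intent of pfmemalloc_match().
	 * A slab page taken from the memory reserves may only satisfy
	 * callers that are themselves allowed to dip into the reserves;
	 * any other slab page is always acceptable.
	 */
	static bool may_use_slab_page(bool page_from_reserves,
				      bool caller_may_use_reserves)
	{
		if (page_from_reserves)
			return caller_may_use_reserves;
		return true;
	}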
2159 | /* | 2142 | /* |
@@ -2163,6 +2146,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, | |||
2163 | * The page is still frozen if the return value is not NULL. | 2146 | * The page is still frozen if the return value is not NULL. |
2164 | * | 2147 | * |
2165 | * If this function returns NULL then the page has been unfrozen. | 2148 | * If this function returns NULL then the page has been unfrozen. |
2149 | * | ||
2150 | * This function must be called with interrupt disabled. | ||
2166 | */ | 2151 | */ |
2167 | static inline void *get_freelist(struct kmem_cache *s, struct page *page) | 2152 | static inline void *get_freelist(struct kmem_cache *s, struct page *page) |
2168 | { | 2153 | { |
@@ -2173,13 +2158,14 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) | |||
2173 | do { | 2158 | do { |
2174 | freelist = page->freelist; | 2159 | freelist = page->freelist; |
2175 | counters = page->counters; | 2160 | counters = page->counters; |
2161 | |||
2176 | new.counters = counters; | 2162 | new.counters = counters; |
2177 | VM_BUG_ON(!new.frozen); | 2163 | VM_BUG_ON(!new.frozen); |
2178 | 2164 | ||
2179 | new.inuse = page->objects; | 2165 | new.inuse = page->objects; |
2180 | new.frozen = freelist != NULL; | 2166 | new.frozen = freelist != NULL; |
2181 | 2167 | ||
2182 | } while (!cmpxchg_double_slab(s, page, | 2168 | } while (!__cmpxchg_double_slab(s, page, |
2183 | freelist, counters, | 2169 | freelist, counters, |
2184 | NULL, new.counters, | 2170 | NULL, new.counters, |
2185 | "get_freelist")); | 2171 | "get_freelist")); |
@@ -2206,7 +2192,8 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) | |||
2206 | static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | 2192 | static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, |
2207 | unsigned long addr, struct kmem_cache_cpu *c) | 2193 | unsigned long addr, struct kmem_cache_cpu *c) |
2208 | { | 2194 | { |
2209 | void **object; | 2195 | void *freelist; |
2196 | struct page *page; | ||
2210 | unsigned long flags; | 2197 | unsigned long flags; |
2211 | 2198 | ||
2212 | local_irq_save(flags); | 2199 | local_irq_save(flags); |
@@ -2219,25 +2206,41 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
2219 | c = this_cpu_ptr(s->cpu_slab); | 2206 | c = this_cpu_ptr(s->cpu_slab); |
2220 | #endif | 2207 | #endif |
2221 | 2208 | ||
2222 | if (!c->page) | 2209 | page = c->page; |
2210 | if (!page) | ||
2223 | goto new_slab; | 2211 | goto new_slab; |
2224 | redo: | 2212 | redo: |
2225 | if (unlikely(!node_match(c, node))) { | 2213 | |
2214 | if (unlikely(!node_match(page, node))) { | ||
2226 | stat(s, ALLOC_NODE_MISMATCH); | 2215 | stat(s, ALLOC_NODE_MISMATCH); |
2227 | deactivate_slab(s, c); | 2216 | deactivate_slab(s, page, c->freelist); |
2217 | c->page = NULL; | ||
2218 | c->freelist = NULL; | ||
2219 | goto new_slab; | ||
2220 | } | ||
2221 | |||
2222 | /* | ||
2223 | * By rights, we should be searching for a slab page that was | ||
2224 | * PFMEMALLOC but right now, we are losing the pfmemalloc | ||
2225 | * information when the page leaves the per-cpu allocator | ||
2226 | */ | ||
2227 | if (unlikely(!pfmemalloc_match(page, gfpflags))) { | ||
2228 | deactivate_slab(s, page, c->freelist); | ||
2229 | c->page = NULL; | ||
2230 | c->freelist = NULL; | ||
2228 | goto new_slab; | 2231 | goto new_slab; |
2229 | } | 2232 | } |
2230 | 2233 | ||
2231 | /* must check again c->freelist in case of cpu migration or IRQ */ | 2234 | /* must check again c->freelist in case of cpu migration or IRQ */ |
2232 | object = c->freelist; | 2235 | freelist = c->freelist; |
2233 | if (object) | 2236 | if (freelist) |
2234 | goto load_freelist; | 2237 | goto load_freelist; |
2235 | 2238 | ||
2236 | stat(s, ALLOC_SLOWPATH); | 2239 | stat(s, ALLOC_SLOWPATH); |
2237 | 2240 | ||
2238 | object = get_freelist(s, c->page); | 2241 | freelist = get_freelist(s, page); |
2239 | 2242 | ||
2240 | if (!object) { | 2243 | if (!freelist) { |
2241 | c->page = NULL; | 2244 | c->page = NULL; |
2242 | stat(s, DEACTIVATE_BYPASS); | 2245 | stat(s, DEACTIVATE_BYPASS); |
2243 | goto new_slab; | 2246 | goto new_slab; |
@@ -2246,50 +2249,50 @@ redo: | |||
2246 | stat(s, ALLOC_REFILL); | 2249 | stat(s, ALLOC_REFILL); |
2247 | 2250 | ||
2248 | load_freelist: | 2251 | load_freelist: |
2249 | c->freelist = get_freepointer(s, object); | 2252 | /* |
2253 | * freelist is pointing to the list of objects to be used. | ||
2254 | * page is pointing to the page from which the objects are obtained. | ||
2255 | * That page must be frozen for per cpu allocations to work. | ||
2256 | */ | ||
2257 | VM_BUG_ON(!c->page->frozen); | ||
2258 | c->freelist = get_freepointer(s, freelist); | ||
2250 | c->tid = next_tid(c->tid); | 2259 | c->tid = next_tid(c->tid); |
2251 | local_irq_restore(flags); | 2260 | local_irq_restore(flags); |
2252 | return object; | 2261 | return freelist; |
2253 | 2262 | ||
2254 | new_slab: | 2263 | new_slab: |
2255 | 2264 | ||
2256 | if (c->partial) { | 2265 | if (c->partial) { |
2257 | c->page = c->partial; | 2266 | page = c->page = c->partial; |
2258 | c->partial = c->page->next; | 2267 | c->partial = page->next; |
2259 | c->node = page_to_nid(c->page); | ||
2260 | stat(s, CPU_PARTIAL_ALLOC); | 2268 | stat(s, CPU_PARTIAL_ALLOC); |
2261 | c->freelist = NULL; | 2269 | c->freelist = NULL; |
2262 | goto redo; | 2270 | goto redo; |
2263 | } | 2271 | } |
2264 | 2272 | ||
2265 | /* Then do expensive stuff like retrieving pages from the partial lists */ | 2273 | freelist = new_slab_objects(s, gfpflags, node, &c); |
2266 | object = get_partial(s, gfpflags, node, c); | ||
2267 | |||
2268 | if (unlikely(!object)) { | ||
2269 | 2274 | ||
2270 | object = new_slab_objects(s, gfpflags, node, &c); | 2275 | if (unlikely(!freelist)) { |
2276 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) | ||
2277 | slab_out_of_memory(s, gfpflags, node); | ||
2271 | 2278 | ||
2272 | if (unlikely(!object)) { | 2279 | local_irq_restore(flags); |
2273 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) | 2280 | return NULL; |
2274 | slab_out_of_memory(s, gfpflags, node); | ||
2275 | |||
2276 | local_irq_restore(flags); | ||
2277 | return NULL; | ||
2278 | } | ||
2279 | } | 2281 | } |
2280 | 2282 | ||
2281 | if (likely(!kmem_cache_debug(s))) | 2283 | page = c->page; |
2284 | if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) | ||
2282 | goto load_freelist; | 2285 | goto load_freelist; |
2283 | 2286 | ||
2284 | /* Only entered in the debug case */ | 2287 | /* Only entered in the debug case */ |
2285 | if (!alloc_debug_processing(s, c->page, object, addr)) | 2288 | if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr)) |
2286 | goto new_slab; /* Slab failed checks. Next slab needed */ | 2289 | goto new_slab; /* Slab failed checks. Next slab needed */ |
2287 | 2290 | ||
2288 | c->freelist = get_freepointer(s, object); | 2291 | deactivate_slab(s, page, get_freepointer(s, freelist)); |
2289 | deactivate_slab(s, c); | 2292 | c->page = NULL; |
2290 | c->node = NUMA_NO_NODE; | 2293 | c->freelist = NULL; |
2291 | local_irq_restore(flags); | 2294 | local_irq_restore(flags); |
2292 | return object; | 2295 | return freelist; |
2293 | } | 2296 | } |
2294 | 2297 | ||
2295 | /* | 2298 | /* |
@@ -2307,6 +2310,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
2307 | { | 2310 | { |
2308 | void **object; | 2311 | void **object; |
2309 | struct kmem_cache_cpu *c; | 2312 | struct kmem_cache_cpu *c; |
2313 | struct page *page; | ||
2310 | unsigned long tid; | 2314 | unsigned long tid; |
2311 | 2315 | ||
2312 | if (slab_pre_alloc_hook(s, gfpflags)) | 2316 | if (slab_pre_alloc_hook(s, gfpflags)) |
@@ -2332,8 +2336,8 @@ redo: | |||
2332 | barrier(); | 2336 | barrier(); |
2333 | 2337 | ||
2334 | object = c->freelist; | 2338 | object = c->freelist; |
2335 | if (unlikely(!object || !node_match(c, node))) | 2339 | page = c->page; |
2336 | 2340 | if (unlikely(!object || !node_match(page, node))) | |
2337 | object = __slab_alloc(s, gfpflags, node, addr, c); | 2341 | object = __slab_alloc(s, gfpflags, node, addr, c); |
2338 | 2342 | ||
2339 | else { | 2343 | else { |
@@ -2364,7 +2368,7 @@ redo: | |||
2364 | } | 2368 | } |
2365 | 2369 | ||
2366 | if (unlikely(gfpflags & __GFP_ZERO) && object) | 2370 | if (unlikely(gfpflags & __GFP_ZERO) && object) |
2367 | memset(object, 0, s->objsize); | 2371 | memset(object, 0, s->object_size); |
2368 | 2372 | ||
2369 | slab_post_alloc_hook(s, gfpflags, object); | 2373 | slab_post_alloc_hook(s, gfpflags, object); |
2370 | 2374 | ||
@@ -2375,7 +2379,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) | |||
2375 | { | 2379 | { |
2376 | void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); | 2380 | void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); |
2377 | 2381 | ||
2378 | trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); | 2382 | trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); |
2379 | 2383 | ||
2380 | return ret; | 2384 | return ret; |
2381 | } | 2385 | } |
@@ -2405,7 +2409,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) | |||
2405 | void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); | 2409 | void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); |
2406 | 2410 | ||
2407 | trace_kmem_cache_alloc_node(_RET_IP_, ret, | 2411 | trace_kmem_cache_alloc_node(_RET_IP_, ret, |
2408 | s->objsize, s->size, gfpflags, node); | 2412 | s->object_size, s->size, gfpflags, node); |
2409 | 2413 | ||
2410 | return ret; | 2414 | return ret; |
2411 | } | 2415 | } |
@@ -2900,7 +2904,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min) | |||
2900 | static int calculate_sizes(struct kmem_cache *s, int forced_order) | 2904 | static int calculate_sizes(struct kmem_cache *s, int forced_order) |
2901 | { | 2905 | { |
2902 | unsigned long flags = s->flags; | 2906 | unsigned long flags = s->flags; |
2903 | unsigned long size = s->objsize; | 2907 | unsigned long size = s->object_size; |
2904 | unsigned long align = s->align; | 2908 | unsigned long align = s->align; |
2905 | int order; | 2909 | int order; |
2906 | 2910 | ||
@@ -2929,7 +2933,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
2929 | * end of the object and the free pointer. If not then add an | 2933 | * end of the object and the free pointer. If not then add an |
2930 | * additional word to have some bytes to store Redzone information. | 2934 | * additional word to have some bytes to store Redzone information. |
2931 | */ | 2935 | */ |
2932 | if ((flags & SLAB_RED_ZONE) && size == s->objsize) | 2936 | if ((flags & SLAB_RED_ZONE) && size == s->object_size) |
2933 | size += sizeof(void *); | 2937 | size += sizeof(void *); |
2934 | #endif | 2938 | #endif |
2935 | 2939 | ||
@@ -2977,7 +2981,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) | |||
2977 | * user specified and the dynamic determination of cache line size | 2981 | * user specified and the dynamic determination of cache line size |
2978 | * on bootup. | 2982 | * on bootup. |
2979 | */ | 2983 | */ |
2980 | align = calculate_alignment(flags, align, s->objsize); | 2984 | align = calculate_alignment(flags, align, s->object_size); |
2981 | s->align = align; | 2985 | s->align = align; |
2982 | 2986 | ||
2983 | /* | 2987 | /* |
@@ -3025,7 +3029,7 @@ static int kmem_cache_open(struct kmem_cache *s, | |||
3025 | memset(s, 0, kmem_size); | 3029 | memset(s, 0, kmem_size); |
3026 | s->name = name; | 3030 | s->name = name; |
3027 | s->ctor = ctor; | 3031 | s->ctor = ctor; |
3028 | s->objsize = size; | 3032 | s->object_size = size; |
3029 | s->align = align; | 3033 | s->align = align; |
3030 | s->flags = kmem_cache_flags(size, flags, name, ctor); | 3034 | s->flags = kmem_cache_flags(size, flags, name, ctor); |
3031 | s->reserved = 0; | 3035 | s->reserved = 0; |
@@ -3040,7 +3044,7 @@ static int kmem_cache_open(struct kmem_cache *s, | |||
3040 | * Disable debugging flags that store metadata if the min slab | 3044 | * Disable debugging flags that store metadata if the min slab |
3041 | * order increased. | 3045 | * order increased. |
3042 | */ | 3046 | */ |
3043 | if (get_order(s->size) > get_order(s->objsize)) { | 3047 | if (get_order(s->size) > get_order(s->object_size)) { |
3044 | s->flags &= ~DEBUG_METADATA_FLAGS; | 3048 | s->flags &= ~DEBUG_METADATA_FLAGS; |
3045 | s->offset = 0; | 3049 | s->offset = 0; |
3046 | if (!calculate_sizes(s, -1)) | 3050 | if (!calculate_sizes(s, -1)) |
@@ -3114,7 +3118,7 @@ error: | |||
3114 | */ | 3118 | */ |
3115 | unsigned int kmem_cache_size(struct kmem_cache *s) | 3119 | unsigned int kmem_cache_size(struct kmem_cache *s) |
3116 | { | 3120 | { |
3117 | return s->objsize; | 3121 | return s->object_size; |
3118 | } | 3122 | } |
3119 | EXPORT_SYMBOL(kmem_cache_size); | 3123 | EXPORT_SYMBOL(kmem_cache_size); |
3120 | 3124 | ||
@@ -3192,11 +3196,11 @@ static inline int kmem_cache_close(struct kmem_cache *s) | |||
3192 | */ | 3196 | */ |
3193 | void kmem_cache_destroy(struct kmem_cache *s) | 3197 | void kmem_cache_destroy(struct kmem_cache *s) |
3194 | { | 3198 | { |
3195 | down_write(&slub_lock); | 3199 | mutex_lock(&slab_mutex); |
3196 | s->refcount--; | 3200 | s->refcount--; |
3197 | if (!s->refcount) { | 3201 | if (!s->refcount) { |
3198 | list_del(&s->list); | 3202 | list_del(&s->list); |
3199 | up_write(&slub_lock); | 3203 | mutex_unlock(&slab_mutex); |
3200 | if (kmem_cache_close(s)) { | 3204 | if (kmem_cache_close(s)) { |
3201 | printk(KERN_ERR "SLUB %s: %s called for cache that " | 3205 | printk(KERN_ERR "SLUB %s: %s called for cache that " |
3202 | "still has objects.\n", s->name, __func__); | 3206 | "still has objects.\n", s->name, __func__); |
@@ -3206,7 +3210,7 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
3206 | rcu_barrier(); | 3210 | rcu_barrier(); |
3207 | sysfs_slab_remove(s); | 3211 | sysfs_slab_remove(s); |
3208 | } else | 3212 | } else |
3209 | up_write(&slub_lock); | 3213 | mutex_unlock(&slab_mutex); |
3210 | } | 3214 | } |
3211 | EXPORT_SYMBOL(kmem_cache_destroy); | 3215 | EXPORT_SYMBOL(kmem_cache_destroy); |
3212 | 3216 | ||
@@ -3268,7 +3272,7 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name, | |||
3268 | 3272 | ||
3269 | /* | 3273 | /* |
3270 | * This function is called with IRQs disabled during early-boot on | 3274 | * This function is called with IRQs disabled during early-boot on |
3271 | * single CPU so there's no need to take slub_lock here. | 3275 | * single CPU so there's no need to take slab_mutex here. |
3272 | */ | 3276 | */ |
3273 | if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, | 3277 | if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, |
3274 | flags, NULL)) | 3278 | flags, NULL)) |
@@ -3553,10 +3557,10 @@ static int slab_mem_going_offline_callback(void *arg) | |||
3553 | { | 3557 | { |
3554 | struct kmem_cache *s; | 3558 | struct kmem_cache *s; |
3555 | 3559 | ||
3556 | down_read(&slub_lock); | 3560 | mutex_lock(&slab_mutex); |
3557 | list_for_each_entry(s, &slab_caches, list) | 3561 | list_for_each_entry(s, &slab_caches, list) |
3558 | kmem_cache_shrink(s); | 3562 | kmem_cache_shrink(s); |
3559 | up_read(&slub_lock); | 3563 | mutex_unlock(&slab_mutex); |
3560 | 3564 | ||
3561 | return 0; | 3565 | return 0; |
3562 | } | 3566 | } |
@@ -3577,7 +3581,7 @@ static void slab_mem_offline_callback(void *arg) | |||
3577 | if (offline_node < 0) | 3581 | if (offline_node < 0) |
3578 | return; | 3582 | return; |
3579 | 3583 | ||
3580 | down_read(&slub_lock); | 3584 | mutex_lock(&slab_mutex); |
3581 | list_for_each_entry(s, &slab_caches, list) { | 3585 | list_for_each_entry(s, &slab_caches, list) { |
3582 | n = get_node(s, offline_node); | 3586 | n = get_node(s, offline_node); |
3583 | if (n) { | 3587 | if (n) { |
@@ -3593,7 +3597,7 @@ static void slab_mem_offline_callback(void *arg) | |||
3593 | kmem_cache_free(kmem_cache_node, n); | 3597 | kmem_cache_free(kmem_cache_node, n); |
3594 | } | 3598 | } |
3595 | } | 3599 | } |
3596 | up_read(&slub_lock); | 3600 | mutex_unlock(&slab_mutex); |
3597 | } | 3601 | } |
3598 | 3602 | ||
3599 | static int slab_mem_going_online_callback(void *arg) | 3603 | static int slab_mem_going_online_callback(void *arg) |
@@ -3616,7 +3620,7 @@ static int slab_mem_going_online_callback(void *arg) | |||
3616 | * allocate a kmem_cache_node structure in order to bring the node | 3620 | * allocate a kmem_cache_node structure in order to bring the node |
3617 | * online. | 3621 | * online. |
3618 | */ | 3622 | */ |
3619 | down_read(&slub_lock); | 3623 | mutex_lock(&slab_mutex); |
3620 | list_for_each_entry(s, &slab_caches, list) { | 3624 | list_for_each_entry(s, &slab_caches, list) { |
3621 | /* | 3625 | /* |
3622 | * XXX: kmem_cache_alloc_node will fallback to other nodes | 3626 | * XXX: kmem_cache_alloc_node will fallback to other nodes |
@@ -3632,7 +3636,7 @@ static int slab_mem_going_online_callback(void *arg) | |||
3632 | s->node[nid] = n; | 3636 | s->node[nid] = n; |
3633 | } | 3637 | } |
3634 | out: | 3638 | out: |
3635 | up_read(&slub_lock); | 3639 | mutex_unlock(&slab_mutex); |
3636 | return ret; | 3640 | return ret; |
3637 | } | 3641 | } |
3638 | 3642 | ||
@@ -3843,11 +3847,11 @@ void __init kmem_cache_init(void) | |||
3843 | 3847 | ||
3844 | if (s && s->size) { | 3848 | if (s && s->size) { |
3845 | char *name = kasprintf(GFP_NOWAIT, | 3849 | char *name = kasprintf(GFP_NOWAIT, |
3846 | "dma-kmalloc-%d", s->objsize); | 3850 | "dma-kmalloc-%d", s->object_size); |
3847 | 3851 | ||
3848 | BUG_ON(!name); | 3852 | BUG_ON(!name); |
3849 | kmalloc_dma_caches[i] = create_kmalloc_cache(name, | 3853 | kmalloc_dma_caches[i] = create_kmalloc_cache(name, |
3850 | s->objsize, SLAB_CACHE_DMA); | 3854 | s->object_size, SLAB_CACHE_DMA); |
3851 | } | 3855 | } |
3852 | } | 3856 | } |
3853 | #endif | 3857 | #endif |
@@ -3924,16 +3928,12 @@ static struct kmem_cache *find_mergeable(size_t size, | |||
3924 | return NULL; | 3928 | return NULL; |
3925 | } | 3929 | } |
3926 | 3930 | ||
3927 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | 3931 | struct kmem_cache *__kmem_cache_create(const char *name, size_t size, |
3928 | size_t align, unsigned long flags, void (*ctor)(void *)) | 3932 | size_t align, unsigned long flags, void (*ctor)(void *)) |
3929 | { | 3933 | { |
3930 | struct kmem_cache *s; | 3934 | struct kmem_cache *s; |
3931 | char *n; | 3935 | char *n; |
3932 | 3936 | ||
3933 | if (WARN_ON(!name)) | ||
3934 | return NULL; | ||
3935 | |||
3936 | down_write(&slub_lock); | ||
3937 | s = find_mergeable(size, align, flags, name, ctor); | 3937 | s = find_mergeable(size, align, flags, name, ctor); |
3938 | if (s) { | 3938 | if (s) { |
3939 | s->refcount++; | 3939 | s->refcount++; |
@@ -3941,49 +3941,42 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
3941 | * Adjust the object sizes so that we clear | 3941 | * Adjust the object sizes so that we clear |
3942 | * the complete object on kzalloc. | 3942 | * the complete object on kzalloc. |
3943 | */ | 3943 | */ |
3944 | s->objsize = max(s->objsize, (int)size); | 3944 | s->object_size = max(s->object_size, (int)size); |
3945 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); | 3945 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); |
3946 | 3946 | ||
3947 | if (sysfs_slab_alias(s, name)) { | 3947 | if (sysfs_slab_alias(s, name)) { |
3948 | s->refcount--; | 3948 | s->refcount--; |
3949 | goto err; | 3949 | return NULL; |
3950 | } | 3950 | } |
3951 | up_write(&slub_lock); | ||
3952 | return s; | 3951 | return s; |
3953 | } | 3952 | } |
3954 | 3953 | ||
3955 | n = kstrdup(name, GFP_KERNEL); | 3954 | n = kstrdup(name, GFP_KERNEL); |
3956 | if (!n) | 3955 | if (!n) |
3957 | goto err; | 3956 | return NULL; |
3958 | 3957 | ||
3959 | s = kmalloc(kmem_size, GFP_KERNEL); | 3958 | s = kmalloc(kmem_size, GFP_KERNEL); |
3960 | if (s) { | 3959 | if (s) { |
3961 | if (kmem_cache_open(s, n, | 3960 | if (kmem_cache_open(s, n, |
3962 | size, align, flags, ctor)) { | 3961 | size, align, flags, ctor)) { |
3962 | int r; | ||
3963 | |||
3963 | list_add(&s->list, &slab_caches); | 3964 | list_add(&s->list, &slab_caches); |
3964 | up_write(&slub_lock); | 3965 | mutex_unlock(&slab_mutex); |
3965 | if (sysfs_slab_add(s)) { | 3966 | r = sysfs_slab_add(s); |
3966 | down_write(&slub_lock); | 3967 | mutex_lock(&slab_mutex); |
3967 | list_del(&s->list); | 3968 | |
3968 | kfree(n); | 3969 | if (!r) |
3969 | kfree(s); | 3970 | return s; |
3970 | goto err; | 3971 | |
3971 | } | 3972 | list_del(&s->list); |
3972 | return s; | 3973 | kmem_cache_close(s); |
3973 | } | 3974 | } |
3974 | kfree(s); | 3975 | kfree(s); |
3975 | } | 3976 | } |
3976 | kfree(n); | 3977 | kfree(n); |
3977 | err: | 3978 | return NULL; |
3978 | up_write(&slub_lock); | ||
3979 | |||
3980 | if (flags & SLAB_PANIC) | ||
3981 | panic("Cannot create slabcache %s\n", name); | ||
3982 | else | ||
3983 | s = NULL; | ||
3984 | return s; | ||
3985 | } | 3979 | } |
3986 | EXPORT_SYMBOL(kmem_cache_create); | ||
3987 | 3980 | ||
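The rename to __kmem_cache_create() together with the removal of the slub_lock handling, the WARN_ON(!name) check, the SLAB_PANIC fallback and the EXPORT_SYMBOL suggests those duties now live in a shared front end that calls into the allocator with slab_mutex already held. A rough sketch of such a wrapper (an assumption about the common code, which is not shown in this diff):

	/*
	 * Sketch of the presumed common wrapper: argument checking,
	 * locking and SLAB_PANIC handling move out of the per-allocator
	 * code.  Not taken from this patch.
	 */
	struct kmem_cache *kmem_cache_create(const char *name, size_t size,
			size_t align, unsigned long flags, void (*ctor)(void *))
	{
		struct kmem_cache *s = NULL;

		if (WARN_ON(!name))
			goto out;

		mutex_lock(&slab_mutex);
		s = __kmem_cache_create(name, size, align, flags, ctor);
		mutex_unlock(&slab_mutex);
	out:
		if (!s && (flags & SLAB_PANIC))
			panic("kmem_cache_create: Failed to create slab '%s'\n", name);
		return s;
	}
	EXPORT_SYMBOL(kmem_cache_create);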
3988 | #ifdef CONFIG_SMP | 3981 | #ifdef CONFIG_SMP |
3989 | /* | 3982 | /* |
@@ -4002,13 +3995,13 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | |||
4002 | case CPU_UP_CANCELED_FROZEN: | 3995 | case CPU_UP_CANCELED_FROZEN: |
4003 | case CPU_DEAD: | 3996 | case CPU_DEAD: |
4004 | case CPU_DEAD_FROZEN: | 3997 | case CPU_DEAD_FROZEN: |
4005 | down_read(&slub_lock); | 3998 | mutex_lock(&slab_mutex); |
4006 | list_for_each_entry(s, &slab_caches, list) { | 3999 | list_for_each_entry(s, &slab_caches, list) { |
4007 | local_irq_save(flags); | 4000 | local_irq_save(flags); |
4008 | __flush_cpu_slab(s, cpu); | 4001 | __flush_cpu_slab(s, cpu); |
4009 | local_irq_restore(flags); | 4002 | local_irq_restore(flags); |
4010 | } | 4003 | } |
4011 | up_read(&slub_lock); | 4004 | mutex_unlock(&slab_mutex); |
4012 | break; | 4005 | break; |
4013 | default: | 4006 | default: |
4014 | break; | 4007 | break; |
@@ -4500,30 +4493,31 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
4500 | 4493 | ||
4501 | for_each_possible_cpu(cpu) { | 4494 | for_each_possible_cpu(cpu) { |
4502 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); | 4495 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
4503 | int node = ACCESS_ONCE(c->node); | 4496 | int node; |
4504 | struct page *page; | 4497 | struct page *page; |
4505 | 4498 | ||
4506 | if (node < 0) | ||
4507 | continue; | ||
4508 | page = ACCESS_ONCE(c->page); | 4499 | page = ACCESS_ONCE(c->page); |
4509 | if (page) { | 4500 | if (!page) |
4510 | if (flags & SO_TOTAL) | 4501 | continue; |
4511 | x = page->objects; | ||
4512 | else if (flags & SO_OBJECTS) | ||
4513 | x = page->inuse; | ||
4514 | else | ||
4515 | x = 1; | ||
4516 | 4502 | ||
4517 | total += x; | 4503 | node = page_to_nid(page); |
4518 | nodes[node] += x; | 4504 | if (flags & SO_TOTAL) |
4519 | } | 4505 | x = page->objects; |
4520 | page = c->partial; | 4506 | else if (flags & SO_OBJECTS) |
4507 | x = page->inuse; | ||
4508 | else | ||
4509 | x = 1; | ||
4521 | 4510 | ||
4511 | total += x; | ||
4512 | nodes[node] += x; | ||
4513 | |||
4514 | page = ACCESS_ONCE(c->partial); | ||
4522 | if (page) { | 4515 | if (page) { |
4523 | x = page->pobjects; | 4516 | x = page->pobjects; |
4524 | total += x; | 4517 | total += x; |
4525 | nodes[node] += x; | 4518 | nodes[node] += x; |
4526 | } | 4519 | } |
4520 | |||
4527 | per_cpu[node]++; | 4521 | per_cpu[node]++; |
4528 | } | 4522 | } |
4529 | } | 4523 | } |
@@ -4623,7 +4617,7 @@ SLAB_ATTR_RO(align); | |||
4623 | 4617 | ||
4624 | static ssize_t object_size_show(struct kmem_cache *s, char *buf) | 4618 | static ssize_t object_size_show(struct kmem_cache *s, char *buf) |
4625 | { | 4619 | { |
4626 | return sprintf(buf, "%d\n", s->objsize); | 4620 | return sprintf(buf, "%d\n", s->object_size); |
4627 | } | 4621 | } |
4628 | SLAB_ATTR_RO(object_size); | 4622 | SLAB_ATTR_RO(object_size); |
4629 | 4623 | ||
@@ -5286,7 +5280,7 @@ static int sysfs_slab_add(struct kmem_cache *s) | |||
5286 | const char *name; | 5280 | const char *name; |
5287 | int unmergeable; | 5281 | int unmergeable; |
5288 | 5282 | ||
5289 | if (slab_state < SYSFS) | 5283 | if (slab_state < FULL) |
5290 | /* Defer until later */ | 5284 | /* Defer until later */ |
5291 | return 0; | 5285 | return 0; |
5292 | 5286 | ||
@@ -5331,7 +5325,7 @@ static int sysfs_slab_add(struct kmem_cache *s) | |||
5331 | 5325 | ||
5332 | static void sysfs_slab_remove(struct kmem_cache *s) | 5326 | static void sysfs_slab_remove(struct kmem_cache *s) |
5333 | { | 5327 | { |
5334 | if (slab_state < SYSFS) | 5328 | if (slab_state < FULL) |
5335 | /* | 5329 | /* |
5336 | * Sysfs has not been setup yet so no need to remove the | 5330 | * Sysfs has not been setup yet so no need to remove the |
5337 | * cache from sysfs. | 5331 | * cache from sysfs. |
@@ -5359,7 +5353,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) | |||
5359 | { | 5353 | { |
5360 | struct saved_alias *al; | 5354 | struct saved_alias *al; |
5361 | 5355 | ||
5362 | if (slab_state == SYSFS) { | 5356 | if (slab_state == FULL) { |
5363 | /* | 5357 | /* |
5364 | * If we have a leftover link then remove it. | 5358 | * If we have a leftover link then remove it. |
5365 | */ | 5359 | */ |
@@ -5383,16 +5377,16 @@ static int __init slab_sysfs_init(void) | |||
5383 | struct kmem_cache *s; | 5377 | struct kmem_cache *s; |
5384 | int err; | 5378 | int err; |
5385 | 5379 | ||
5386 | down_write(&slub_lock); | 5380 | mutex_lock(&slab_mutex); |
5387 | 5381 | ||
5388 | slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); | 5382 | slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); |
5389 | if (!slab_kset) { | 5383 | if (!slab_kset) { |
5390 | up_write(&slub_lock); | 5384 | mutex_unlock(&slab_mutex); |
5391 | printk(KERN_ERR "Cannot register slab subsystem.\n"); | 5385 | printk(KERN_ERR "Cannot register slab subsystem.\n"); |
5392 | return -ENOSYS; | 5386 | return -ENOSYS; |
5393 | } | 5387 | } |
5394 | 5388 | ||
5395 | slab_state = SYSFS; | 5389 | slab_state = FULL; |
5396 | 5390 | ||
5397 | list_for_each_entry(s, &slab_caches, list) { | 5391 | list_for_each_entry(s, &slab_caches, list) { |
5398 | err = sysfs_slab_add(s); | 5392 | err = sysfs_slab_add(s); |
@@ -5408,11 +5402,11 @@ static int __init slab_sysfs_init(void) | |||
5408 | err = sysfs_slab_alias(al->s, al->name); | 5402 | err = sysfs_slab_alias(al->s, al->name); |
5409 | if (err) | 5403 | if (err) |
5410 | printk(KERN_ERR "SLUB: Unable to add boot slab alias" | 5404 | printk(KERN_ERR "SLUB: Unable to add boot slab alias" |
5411 | " %s to sysfs\n", s->name); | 5405 | " %s to sysfs\n", al->name); |
5412 | kfree(al); | 5406 | kfree(al); |
5413 | } | 5407 | } |
5414 | 5408 | ||
5415 | up_write(&slub_lock); | 5409 | mutex_unlock(&slab_mutex); |
5416 | resiliency_test(); | 5410 | resiliency_test(); |
5417 | return 0; | 5411 | return 0; |
5418 | } | 5412 | } |
@@ -5427,7 +5421,7 @@ __initcall(slab_sysfs_init); | |||
5427 | static void print_slabinfo_header(struct seq_file *m) | 5421 | static void print_slabinfo_header(struct seq_file *m) |
5428 | { | 5422 | { |
5429 | seq_puts(m, "slabinfo - version: 2.1\n"); | 5423 | seq_puts(m, "slabinfo - version: 2.1\n"); |
5430 | seq_puts(m, "# name <active_objs> <num_objs> <objsize> " | 5424 | seq_puts(m, "# name <active_objs> <num_objs> <object_size> " |
5431 | "<objperslab> <pagesperslab>"); | 5425 | "<objperslab> <pagesperslab>"); |
5432 | seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); | 5426 | seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); |
5433 | seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); | 5427 | seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); |
@@ -5438,7 +5432,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) | |||
5438 | { | 5432 | { |
5439 | loff_t n = *pos; | 5433 | loff_t n = *pos; |
5440 | 5434 | ||
5441 | down_read(&slub_lock); | 5435 | mutex_lock(&slab_mutex); |
5442 | if (!n) | 5436 | if (!n) |
5443 | print_slabinfo_header(m); | 5437 | print_slabinfo_header(m); |
5444 | 5438 | ||
@@ -5452,7 +5446,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) | |||
5452 | 5446 | ||
5453 | static void s_stop(struct seq_file *m, void *p) | 5447 | static void s_stop(struct seq_file *m, void *p) |
5454 | { | 5448 | { |
5455 | up_read(&slub_lock); | 5449 | mutex_unlock(&slab_mutex); |
5456 | } | 5450 | } |
5457 | 5451 | ||
5458 | static int s_show(struct seq_file *m, void *p) | 5452 | static int s_show(struct seq_file *m, void *p) |
diff --git a/mm/sparse.c b/mm/sparse.c index 6a4bf9160e85..fac95f2888f2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -65,21 +65,18 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) | |||
65 | 65 | ||
66 | if (slab_is_available()) { | 66 | if (slab_is_available()) { |
67 | if (node_state(nid, N_HIGH_MEMORY)) | 67 | if (node_state(nid, N_HIGH_MEMORY)) |
68 | section = kmalloc_node(array_size, GFP_KERNEL, nid); | 68 | section = kzalloc_node(array_size, GFP_KERNEL, nid); |
69 | else | 69 | else |
70 | section = kmalloc(array_size, GFP_KERNEL); | 70 | section = kzalloc(array_size, GFP_KERNEL); |
71 | } else | 71 | } else { |
72 | section = alloc_bootmem_node(NODE_DATA(nid), array_size); | 72 | section = alloc_bootmem_node(NODE_DATA(nid), array_size); |
73 | 73 | } | |
74 | if (section) | ||
75 | memset(section, 0, array_size); | ||
76 | 74 | ||
77 | return section; | 75 | return section; |
78 | } | 76 | } |
79 | 77 | ||
80 | static int __meminit sparse_index_init(unsigned long section_nr, int nid) | 78 | static int __meminit sparse_index_init(unsigned long section_nr, int nid) |
81 | { | 79 | { |
82 | static DEFINE_SPINLOCK(index_init_lock); | ||
83 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); | 80 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); |
84 | struct mem_section *section; | 81 | struct mem_section *section; |
85 | int ret = 0; | 82 | int ret = 0; |
@@ -90,20 +87,9 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) | |||
90 | section = sparse_index_alloc(nid); | 87 | section = sparse_index_alloc(nid); |
91 | if (!section) | 88 | if (!section) |
92 | return -ENOMEM; | 89 | return -ENOMEM; |
93 | /* | ||
94 | * This lock keeps two different sections from | ||
95 | * reallocating for the same index | ||
96 | */ | ||
97 | spin_lock(&index_init_lock); | ||
98 | |||
99 | if (mem_section[root]) { | ||
100 | ret = -EEXIST; | ||
101 | goto out; | ||
102 | } | ||
103 | 90 | ||
104 | mem_section[root] = section; | 91 | mem_section[root] = section; |
105 | out: | 92 | |
106 | spin_unlock(&index_init_lock); | ||
107 | return ret; | 93 | return ret; |
108 | } | 94 | } |
109 | #else /* !SPARSEMEM_EXTREME */ | 95 | #else /* !SPARSEMEM_EXTREME */ |
@@ -132,6 +118,8 @@ int __section_nr(struct mem_section* ms) | |||
132 | break; | 118 | break; |
133 | } | 119 | } |
134 | 120 | ||
121 | VM_BUG_ON(root_nr == NR_SECTION_ROOTS); | ||
122 | |||
135 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); | 123 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); |
136 | } | 124 | } |
137 | 125 | ||
@@ -275,8 +263,9 @@ static unsigned long * __init | |||
275 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | 263 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
276 | unsigned long size) | 264 | unsigned long size) |
277 | { | 265 | { |
278 | pg_data_t *host_pgdat; | 266 | unsigned long goal, limit; |
279 | unsigned long goal; | 267 | unsigned long *p; |
268 | int nid; | ||
280 | /* | 269 | /* |
281 | * A page may contain usemaps for other sections preventing the | 270 | * A page may contain usemaps for other sections preventing the |
282 | * page being freed and making a section unremovable while | 271 | * page being freed and making a section unremovable while |
@@ -287,10 +276,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, | |||
287 | * from the same section as the pgdat where possible to avoid | 276 | * from the same section as the pgdat where possible to avoid |
288 | * this problem. | 277 | * this problem. |
289 | */ | 278 | */ |
290 | goal = __pa(pgdat) & PAGE_SECTION_MASK; | 279 | goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); |
291 | host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT)); | 280 | limit = goal + (1UL << PA_SECTION_SHIFT); |
292 | return __alloc_bootmem_node_nopanic(host_pgdat, size, | 281 | nid = early_pfn_to_nid(goal >> PAGE_SHIFT); |
293 | SMP_CACHE_BYTES, goal); | 282 | again: |
283 | p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, | ||
284 | SMP_CACHE_BYTES, goal, limit); | ||
285 | if (!p && limit) { | ||
286 | limit = 0; | ||
287 | goto again; | ||
288 | } | ||
289 | return p; | ||
294 | } | 290 | } |
295 | 291 | ||
296 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 292 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
@@ -485,6 +481,9 @@ void __init sparse_init(void) | |||
485 | struct page **map_map; | 481 | struct page **map_map; |
486 | #endif | 482 | #endif |
487 | 483 | ||
484 | /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ | ||
485 | set_pageblock_order(); | ||
486 | |||
488 | /* | 487 | /* |
489 | * map is using big page (aka 2M in x86 64 bit) | 488 | * map is using big page (aka 2M in x86 64 bit) |
490 | * usemap is less one page (aka 24 bytes) | 489 | * usemap is less one page (aka 24 bytes) |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -236,6 +236,58 @@ void put_pages_list(struct list_head *pages) | |||
236 | } | 236 | } |
237 | EXPORT_SYMBOL(put_pages_list); | 237 | EXPORT_SYMBOL(put_pages_list); |
238 | 238 | ||
239 | /* | ||
240 | * get_kernel_pages() - pin kernel pages in memory | ||
241 | * @kiov: An array of struct kvec structures | ||
242 | * @nr_segs: number of segments to pin | ||
243 | * @write: pinning for read/write, currently ignored | ||
244 | * @pages: array that receives pointers to the pages pinned. | ||
245 | * Should be at least nr_segs long. | ||
246 | * | ||
247 | * Returns number of pages pinned. This may be fewer than the number | ||
248 | * requested. If nr_pages is 0 or negative, returns 0. If no pages | ||
249 | * were pinned, returns -errno. Each page returned must be released | ||
250 | * with a put_page() call when it is finished with. | ||
251 | */ | ||
252 | int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, | ||
253 | struct page **pages) | ||
254 | { | ||
255 | int seg; | ||
256 | |||
257 | for (seg = 0; seg < nr_segs; seg++) { | ||
258 | if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE)) | ||
259 | return seg; | ||
260 | |||
261 | pages[seg] = kmap_to_page(kiov[seg].iov_base); | ||
262 | page_cache_get(pages[seg]); | ||
263 | } | ||
264 | |||
265 | return seg; | ||
266 | } | ||
267 | EXPORT_SYMBOL_GPL(get_kernel_pages); | ||
268 | |||
269 | /* | ||
270 | * get_kernel_page() - pin a kernel page in memory | ||
271 | * @start: starting kernel address | ||
272 | * @write: pinning for read/write, currently ignored | ||
273 | * @pages: array that receives pointer to the page pinned. | ||
274 | * Must be at least nr_segs long. | ||
275 | * | ||
276 | * Returns 1 if page is pinned. If the page was not pinned, returns | ||
277 | * -errno. The page returned must be released with a put_page() call | ||
278 | * when it is finished with. | ||
279 | */ | ||
280 | int get_kernel_page(unsigned long start, int write, struct page **pages) | ||
281 | { | ||
282 | const struct kvec kiov = { | ||
283 | .iov_base = (void *)start, | ||
284 | .iov_len = PAGE_SIZE | ||
285 | }; | ||
286 | |||
287 | return get_kernel_pages(&kiov, 1, write, pages); | ||
288 | } | ||
289 | EXPORT_SYMBOL_GPL(get_kernel_page); | ||
290 | |||
239 | static void pagevec_lru_move_fn(struct pagevec *pvec, | 291 | static void pagevec_lru_move_fn(struct pagevec *pvec, |
240 | void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), | 292 | void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), |
241 | void *arg) | 293 | void *arg) |
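The new get_kernel_pages()/get_kernel_page() helpers above pin the pages backing a kernel virtual buffer so they can be handled as struct page references; each kvec segment must be exactly one page long. A hypothetical caller, not part of this patch:

	/*
	 * Hypothetical caller (not in this patch): pin the single page
	 * backing a page-sized kernel buffer.  The page must later be
	 * released with put_page() by the caller.
	 */
	static int pin_kernel_buffer_page(void *buf, struct page **page)
	{
		struct kvec kiov = {
			.iov_base = buf,
			.iov_len  = PAGE_SIZE,	/* required: exactly one page */
		};

		if (get_kernel_pages(&kiov, 1, 0, page) != 1)
			return -EFAULT;

		return 0;
	}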
diff --git a/mm/swap_state.c b/mm/swap_state.c index 4c5ff7f284d9..0cb36fb1f61c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/pagemap.h> | 15 | #include <linux/pagemap.h> |
16 | #include <linux/backing-dev.h> | 16 | #include <linux/backing-dev.h> |
17 | #include <linux/blkdev.h> | ||
17 | #include <linux/pagevec.h> | 18 | #include <linux/pagevec.h> |
18 | #include <linux/migrate.h> | 19 | #include <linux/migrate.h> |
19 | #include <linux/page_cgroup.h> | 20 | #include <linux/page_cgroup.h> |
@@ -26,7 +27,7 @@ | |||
26 | */ | 27 | */ |
27 | static const struct address_space_operations swap_aops = { | 28 | static const struct address_space_operations swap_aops = { |
28 | .writepage = swap_writepage, | 29 | .writepage = swap_writepage, |
29 | .set_page_dirty = __set_page_dirty_no_writeback, | 30 | .set_page_dirty = swap_set_page_dirty, |
30 | .migratepage = migrate_page, | 31 | .migratepage = migrate_page, |
31 | }; | 32 | }; |
32 | 33 | ||
@@ -376,6 +377,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
376 | unsigned long offset = swp_offset(entry); | 377 | unsigned long offset = swp_offset(entry); |
377 | unsigned long start_offset, end_offset; | 378 | unsigned long start_offset, end_offset; |
378 | unsigned long mask = (1UL << page_cluster) - 1; | 379 | unsigned long mask = (1UL << page_cluster) - 1; |
380 | struct blk_plug plug; | ||
379 | 381 | ||
380 | /* Read a page_cluster sized and aligned cluster around offset. */ | 382 | /* Read a page_cluster sized and aligned cluster around offset. */ |
381 | start_offset = offset & ~mask; | 383 | start_offset = offset & ~mask; |
@@ -383,6 +385,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
383 | if (!start_offset) /* First page is swap header. */ | 385 | if (!start_offset) /* First page is swap header. */ |
384 | start_offset++; | 386 | start_offset++; |
385 | 387 | ||
388 | blk_start_plug(&plug); | ||
386 | for (offset = start_offset; offset <= end_offset ; offset++) { | 389 | for (offset = start_offset; offset <= end_offset ; offset++) { |
387 | /* Ok, do the async read-ahead now */ | 390 | /* Ok, do the async read-ahead now */ |
388 | page = read_swap_cache_async(swp_entry(swp_type(entry), offset), | 391 | page = read_swap_cache_async(swp_entry(swp_type(entry), offset), |
@@ -391,6 +394,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, | |||
391 | continue; | 394 | continue; |
392 | page_cache_release(page); | 395 | page_cache_release(page); |
393 | } | 396 | } |
397 | blk_finish_plug(&plug); | ||
398 | |||
394 | lru_add_drain(); /* Push any new pages onto the LRU now */ | 399 | lru_add_drain(); /* Push any new pages onto the LRU now */ |
395 | return read_swap_cache_async(entry, gfp_mask, vma, addr); | 400 | return read_swap_cache_async(entry, gfp_mask, vma, addr); |
396 | } | 401 | } |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 457b10baef59..14e254c768fc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -31,6 +31,9 @@ | |||
31 | #include <linux/memcontrol.h> | 31 | #include <linux/memcontrol.h> |
32 | #include <linux/poll.h> | 32 | #include <linux/poll.h> |
33 | #include <linux/oom.h> | 33 | #include <linux/oom.h> |
34 | #include <linux/frontswap.h> | ||
35 | #include <linux/swapfile.h> | ||
36 | #include <linux/export.h> | ||
34 | 37 | ||
35 | #include <asm/pgtable.h> | 38 | #include <asm/pgtable.h> |
36 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
@@ -42,7 +45,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, | |||
42 | static void free_swap_count_continuations(struct swap_info_struct *); | 45 | static void free_swap_count_continuations(struct swap_info_struct *); |
43 | static sector_t map_swap_entry(swp_entry_t, struct block_device**); | 46 | static sector_t map_swap_entry(swp_entry_t, struct block_device**); |
44 | 47 | ||
45 | static DEFINE_SPINLOCK(swap_lock); | 48 | DEFINE_SPINLOCK(swap_lock); |
46 | static unsigned int nr_swapfiles; | 49 | static unsigned int nr_swapfiles; |
47 | long nr_swap_pages; | 50 | long nr_swap_pages; |
48 | long total_swap_pages; | 51 | long total_swap_pages; |
@@ -53,9 +56,9 @@ static const char Unused_file[] = "Unused swap file entry "; | |||
53 | static const char Bad_offset[] = "Bad swap offset entry "; | 56 | static const char Bad_offset[] = "Bad swap offset entry "; |
54 | static const char Unused_offset[] = "Unused swap offset entry "; | 57 | static const char Unused_offset[] = "Unused swap offset entry "; |
55 | 58 | ||
56 | static struct swap_list_t swap_list = {-1, -1}; | 59 | struct swap_list_t swap_list = {-1, -1}; |
57 | 60 | ||
58 | static struct swap_info_struct *swap_info[MAX_SWAPFILES]; | 61 | struct swap_info_struct *swap_info[MAX_SWAPFILES]; |
59 | 62 | ||
60 | static DEFINE_MUTEX(swapon_mutex); | 63 | static DEFINE_MUTEX(swapon_mutex); |
61 | 64 | ||
@@ -546,7 +549,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
546 | 549 | ||
547 | /* free if no reference */ | 550 | /* free if no reference */ |
548 | if (!usage) { | 551 | if (!usage) { |
549 | struct gendisk *disk = p->bdev->bd_disk; | ||
550 | if (offset < p->lowest_bit) | 552 | if (offset < p->lowest_bit) |
551 | p->lowest_bit = offset; | 553 | p->lowest_bit = offset; |
552 | if (offset > p->highest_bit) | 554 | if (offset > p->highest_bit) |
@@ -556,9 +558,13 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
556 | swap_list.next = p->type; | 558 | swap_list.next = p->type; |
557 | nr_swap_pages++; | 559 | nr_swap_pages++; |
558 | p->inuse_pages--; | 560 | p->inuse_pages--; |
559 | if ((p->flags & SWP_BLKDEV) && | 561 | frontswap_invalidate_page(p->type, offset); |
560 | disk->fops->swap_slot_free_notify) | 562 | if (p->flags & SWP_BLKDEV) { |
561 | disk->fops->swap_slot_free_notify(p->bdev, offset); | 563 | struct gendisk *disk = p->bdev->bd_disk; |
564 | if (disk->fops->swap_slot_free_notify) | ||
565 | disk->fops->swap_slot_free_notify(p->bdev, | ||
566 | offset); | ||
567 | } | ||
562 | } | 568 | } |
563 | 569 | ||
564 | return usage; | 570 | return usage; |
@@ -829,8 +835,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
829 | 835 | ||
830 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | 836 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); |
831 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { | 837 | if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { |
832 | if (ret > 0) | 838 | mem_cgroup_cancel_charge_swapin(memcg); |
833 | mem_cgroup_cancel_charge_swapin(memcg); | ||
834 | ret = 0; | 839 | ret = 0; |
835 | goto out; | 840 | goto out; |
836 | } | 841 | } |
@@ -985,11 +990,12 @@ static int unuse_mm(struct mm_struct *mm, | |||
985 | } | 990 | } |
986 | 991 | ||
987 | /* | 992 | /* |
988 | * Scan swap_map from current position to next entry still in use. | 993 | * Scan swap_map (or frontswap_map if frontswap parameter is true) |
994 | * from current position to next entry still in use. | ||
989 | * Recycle to start on reaching the end, returning 0 when empty. | 995 | * Recycle to start on reaching the end, returning 0 when empty. |
990 | */ | 996 | */ |
991 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, | 997 | static unsigned int find_next_to_unuse(struct swap_info_struct *si, |
992 | unsigned int prev) | 998 | unsigned int prev, bool frontswap) |
993 | { | 999 | { |
994 | unsigned int max = si->max; | 1000 | unsigned int max = si->max; |
995 | unsigned int i = prev; | 1001 | unsigned int i = prev; |
@@ -1015,6 +1021,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
1015 | prev = 0; | 1021 | prev = 0; |
1016 | i = 1; | 1022 | i = 1; |
1017 | } | 1023 | } |
1024 | if (frontswap) { | ||
1025 | if (frontswap_test(si, i)) | ||
1026 | break; | ||
1027 | else | ||
1028 | continue; | ||
1029 | } | ||
1018 | count = si->swap_map[i]; | 1030 | count = si->swap_map[i]; |
1019 | if (count && swap_count(count) != SWAP_MAP_BAD) | 1031 | if (count && swap_count(count) != SWAP_MAP_BAD) |
1020 | break; | 1032 | break; |
@@ -1026,8 +1038,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, | |||
1026 | * We completely avoid races by reading each swap page in advance, | 1038 | * We completely avoid races by reading each swap page in advance, |
1027 | * and then search for the process using it. All the necessary | 1039 | * and then search for the process using it. All the necessary |
1028 | * page table adjustments can then be made atomically. | 1040 | * page table adjustments can then be made atomically. |
1041 | * | ||
1042 | * if the boolean frontswap is true, only unuse pages_to_unuse pages; | ||
1043 | * pages_to_unuse==0 means all pages; ignored if frontswap is false | ||
1029 | */ | 1044 | */ |
1030 | static int try_to_unuse(unsigned int type) | 1045 | int try_to_unuse(unsigned int type, bool frontswap, |
1046 | unsigned long pages_to_unuse) | ||
1031 | { | 1047 | { |
1032 | struct swap_info_struct *si = swap_info[type]; | 1048 | struct swap_info_struct *si = swap_info[type]; |
1033 | struct mm_struct *start_mm; | 1049 | struct mm_struct *start_mm; |
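With the two new parameters documented in the hunk above, a frontswap-aware caller can request a bounded, frontswap-only unuse pass instead of draining the whole swap device. A minimal illustrative wrapper (the function name is an assumption):

	/*
	 * Illustrative wrapper, not from this patch: force at most 'nr'
	 * frontswap-backed pages of swap device 'type' back into memory.
	 * frontswap == true restricts the scan to pages whose frontswap
	 * bit is set; nr == 0 would mean "all such pages".
	 */
	static int shrink_frontswap_area(unsigned int type, unsigned long nr)
	{
		return try_to_unuse(type, true, nr);
	}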
@@ -1060,7 +1076,7 @@ static int try_to_unuse(unsigned int type) | |||
1060 | * one pass through swap_map is enough, but not necessarily: | 1076 | * one pass through swap_map is enough, but not necessarily: |
1061 | * there are races when an instance of an entry might be missed. | 1077 | * there are races when an instance of an entry might be missed. |
1062 | */ | 1078 | */ |
1063 | while ((i = find_next_to_unuse(si, i)) != 0) { | 1079 | while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { |
1064 | if (signal_pending(current)) { | 1080 | if (signal_pending(current)) { |
1065 | retval = -EINTR; | 1081 | retval = -EINTR; |
1066 | break; | 1082 | break; |
@@ -1227,6 +1243,10 @@ static int try_to_unuse(unsigned int type) | |||
1227 | * interactive performance. | 1243 | * interactive performance. |
1228 | */ | 1244 | */ |
1229 | cond_resched(); | 1245 | cond_resched(); |
1246 | if (frontswap && pages_to_unuse > 0) { | ||
1247 | if (!--pages_to_unuse) | ||
1248 | break; | ||
1249 | } | ||
1230 | } | 1250 | } |
1231 | 1251 | ||
1232 | mmput(start_mm); | 1252 | mmput(start_mm); |
@@ -1310,6 +1330,14 @@ static void destroy_swap_extents(struct swap_info_struct *sis) | |||
1310 | list_del(&se->list); | 1330 | list_del(&se->list); |
1311 | kfree(se); | 1331 | kfree(se); |
1312 | } | 1332 | } |
1333 | |||
1334 | if (sis->flags & SWP_FILE) { | ||
1335 | struct file *swap_file = sis->swap_file; | ||
1336 | struct address_space *mapping = swap_file->f_mapping; | ||
1337 | |||
1338 | sis->flags &= ~SWP_FILE; | ||
1339 | mapping->a_ops->swap_deactivate(swap_file); | ||
1340 | } | ||
1313 | } | 1341 | } |
1314 | 1342 | ||
1315 | /* | 1343 | /* |
@@ -1318,7 +1346,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis) | |||
1318 | * | 1346 | * |
1319 | * This function rather assumes that it is called in ascending page order. | 1347 | * This function rather assumes that it is called in ascending page order. |
1320 | */ | 1348 | */ |
1321 | static int | 1349 | int |
1322 | add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | 1350 | add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, |
1323 | unsigned long nr_pages, sector_t start_block) | 1351 | unsigned long nr_pages, sector_t start_block) |
1324 | { | 1352 | { |
@@ -1391,102 +1419,33 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, | |||
1391 | */ | 1419 | */ |
1392 | static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) | 1420 | static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) |
1393 | { | 1421 | { |
1394 | struct inode *inode; | 1422 | struct file *swap_file = sis->swap_file; |
1395 | unsigned blocks_per_page; | 1423 | struct address_space *mapping = swap_file->f_mapping; |
1396 | unsigned long page_no; | 1424 | struct inode *inode = mapping->host; |
1397 | unsigned blkbits; | ||
1398 | sector_t probe_block; | ||
1399 | sector_t last_block; | ||
1400 | sector_t lowest_block = -1; | ||
1401 | sector_t highest_block = 0; | ||
1402 | int nr_extents = 0; | ||
1403 | int ret; | 1425 | int ret; |
1404 | 1426 | ||
1405 | inode = sis->swap_file->f_mapping->host; | ||
1406 | if (S_ISBLK(inode->i_mode)) { | 1427 | if (S_ISBLK(inode->i_mode)) { |
1407 | ret = add_swap_extent(sis, 0, sis->max, 0); | 1428 | ret = add_swap_extent(sis, 0, sis->max, 0); |
1408 | *span = sis->pages; | 1429 | *span = sis->pages; |
1409 | goto out; | 1430 | return ret; |
1410 | } | 1431 | } |
1411 | 1432 | ||
1412 | blkbits = inode->i_blkbits; | 1433 | if (mapping->a_ops->swap_activate) { |
1413 | blocks_per_page = PAGE_SIZE >> blkbits; | 1434 | ret = mapping->a_ops->swap_activate(sis, swap_file, span); |
1414 | 1435 | if (!ret) { | |
1415 | /* | 1436 | sis->flags |= SWP_FILE; |
1416 | * Map all the blocks into the extent list. This code doesn't try | 1437 | ret = add_swap_extent(sis, 0, sis->max, 0); |
1417 | * to be very smart. | 1438 | *span = sis->pages; |
1418 | */ | ||
1419 | probe_block = 0; | ||
1420 | page_no = 0; | ||
1421 | last_block = i_size_read(inode) >> blkbits; | ||
1422 | while ((probe_block + blocks_per_page) <= last_block && | ||
1423 | page_no < sis->max) { | ||
1424 | unsigned block_in_page; | ||
1425 | sector_t first_block; | ||
1426 | |||
1427 | first_block = bmap(inode, probe_block); | ||
1428 | if (first_block == 0) | ||
1429 | goto bad_bmap; | ||
1430 | |||
1431 | /* | ||
1432 | * It must be PAGE_SIZE aligned on-disk | ||
1433 | */ | ||
1434 | if (first_block & (blocks_per_page - 1)) { | ||
1435 | probe_block++; | ||
1436 | goto reprobe; | ||
1437 | } | ||
1438 | |||
1439 | for (block_in_page = 1; block_in_page < blocks_per_page; | ||
1440 | block_in_page++) { | ||
1441 | sector_t block; | ||
1442 | |||
1443 | block = bmap(inode, probe_block + block_in_page); | ||
1444 | if (block == 0) | ||
1445 | goto bad_bmap; | ||
1446 | if (block != first_block + block_in_page) { | ||
1447 | /* Discontiguity */ | ||
1448 | probe_block++; | ||
1449 | goto reprobe; | ||
1450 | } | ||
1451 | } | ||
1452 | |||
1453 | first_block >>= (PAGE_SHIFT - blkbits); | ||
1454 | if (page_no) { /* exclude the header page */ | ||
1455 | if (first_block < lowest_block) | ||
1456 | lowest_block = first_block; | ||
1457 | if (first_block > highest_block) | ||
1458 | highest_block = first_block; | ||
1459 | } | 1439 | } |
1440 | return ret; | ||
1441 | } | ||
1460 | 1442 | ||
1461 | /* | 1443 | return generic_swapfile_activate(sis, swap_file, span); |
1462 | * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks | ||
1463 | */ | ||
1464 | ret = add_swap_extent(sis, page_no, 1, first_block); | ||
1465 | if (ret < 0) | ||
1466 | goto out; | ||
1467 | nr_extents += ret; | ||
1468 | page_no++; | ||
1469 | probe_block += blocks_per_page; | ||
1470 | reprobe: | ||
1471 | continue; | ||
1472 | } | ||
1473 | ret = nr_extents; | ||
1474 | *span = 1 + highest_block - lowest_block; | ||
1475 | if (page_no == 0) | ||
1476 | page_no = 1; /* force Empty message */ | ||
1477 | sis->max = page_no; | ||
1478 | sis->pages = page_no - 1; | ||
1479 | sis->highest_bit = page_no - 1; | ||
1480 | out: | ||
1481 | return ret; | ||
1482 | bad_bmap: | ||
1483 | printk(KERN_ERR "swapon: swapfile has holes\n"); | ||
1484 | ret = -EINVAL; | ||
1485 | goto out; | ||
1486 | } | 1444 | } |
1487 | 1445 | ||
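setup_swap_extents() now defers to the filesystem's swap_activate() address_space operation when one exists; on success the swap file is flagged SWP_FILE and covered by a single extent. A bare-bones sketch of such a hook following the call signature above (hypothetical filesystem; real implementations such as swap-over-NFS do considerably more setup):

	/*
	 * Hypothetical a_ops->swap_activate() implementation.  Returning 0
	 * tells setup_swap_extents() to set SWP_FILE and map the whole
	 * file as one extent; a real filesystem would also pin whatever it
	 * needs for swap I/O here and undo that in ->swap_deactivate().
	 */
	static int myfs_swap_activate(struct swap_info_struct *sis,
				      struct file *swap_file, sector_t *span)
	{
		struct inode *inode = swap_file->f_mapping->host;

		/* refuse files too small for the advertised number of pages */
		if (i_size_read(inode) < (loff_t)sis->max << PAGE_SHIFT)
			return -EINVAL;

		return 0;
	}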
1488 | static void enable_swap_info(struct swap_info_struct *p, int prio, | 1446 | static void enable_swap_info(struct swap_info_struct *p, int prio, |
1489 | unsigned char *swap_map) | 1447 | unsigned char *swap_map, |
1448 | unsigned long *frontswap_map) | ||
1490 | { | 1449 | { |
1491 | int i, prev; | 1450 | int i, prev; |
1492 | 1451 | ||
@@ -1496,6 +1455,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
1496 | else | 1455 | else |
1497 | p->prio = --least_priority; | 1456 | p->prio = --least_priority; |
1498 | p->swap_map = swap_map; | 1457 | p->swap_map = swap_map; |
1458 | frontswap_map_set(p, frontswap_map); | ||
1499 | p->flags |= SWP_WRITEOK; | 1459 | p->flags |= SWP_WRITEOK; |
1500 | nr_swap_pages += p->pages; | 1460 | nr_swap_pages += p->pages; |
1501 | total_swap_pages += p->pages; | 1461 | total_swap_pages += p->pages; |
@@ -1512,6 +1472,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, | |||
1512 | swap_list.head = swap_list.next = p->type; | 1472 | swap_list.head = swap_list.next = p->type; |
1513 | else | 1473 | else |
1514 | swap_info[prev]->next = p->type; | 1474 | swap_info[prev]->next = p->type; |
1475 | frontswap_init(p->type); | ||
1515 | spin_unlock(&swap_lock); | 1476 | spin_unlock(&swap_lock); |
1516 | } | 1477 | } |
1517 | 1478 | ||
@@ -1585,7 +1546,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1585 | spin_unlock(&swap_lock); | 1546 | spin_unlock(&swap_lock); |
1586 | 1547 | ||
1587 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); | 1548 | oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); |
1588 | err = try_to_unuse(type); | 1549 | err = try_to_unuse(type, false, 0); /* force all pages to be unused */ |
1589 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); | 1550 | compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); |
1590 | 1551 | ||
1591 | if (err) { | 1552 | if (err) { |
@@ -1596,7 +1557,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1596 | * sys_swapoff for this swap_info_struct at this point. | 1557 | * sys_swapoff for this swap_info_struct at this point. |
1597 | */ | 1558 | */ |
1598 | /* re-insert swap space back into swap_list */ | 1559 | /* re-insert swap space back into swap_list */ |
1599 | enable_swap_info(p, p->prio, p->swap_map); | 1560 | enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); |
1600 | goto out_dput; | 1561 | goto out_dput; |
1601 | } | 1562 | } |
1602 | 1563 | ||
@@ -1622,9 +1583,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
1622 | swap_map = p->swap_map; | 1583 | swap_map = p->swap_map; |
1623 | p->swap_map = NULL; | 1584 | p->swap_map = NULL; |
1624 | p->flags = 0; | 1585 | p->flags = 0; |
1586 | frontswap_invalidate_area(type); | ||
1625 | spin_unlock(&swap_lock); | 1587 | spin_unlock(&swap_lock); |
1626 | mutex_unlock(&swapon_mutex); | 1588 | mutex_unlock(&swapon_mutex); |
1627 | vfree(swap_map); | 1589 | vfree(swap_map); |
1590 | vfree(frontswap_map_get(p)); | ||
1628 | /* Destroy swap account informatin */ | 1591 | /* Destroy swap account informatin */ |
1629 | swap_cgroup_swapoff(type); | 1592 | swap_cgroup_swapoff(type); |
1630 | 1593 | ||
@@ -1893,24 +1856,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p, | |||
1893 | 1856 | ||
1894 | /* | 1857 | /* |
1895 | * Find out how many pages are allowed for a single swap | 1858 | * Find out how many pages are allowed for a single swap |
1896 | * device. There are three limiting factors: 1) the number | 1859 | * device. There are two limiting factors: 1) the number |
1897 | * of bits for the swap offset in the swp_entry_t type, and | 1860 | * of bits for the swap offset in the swp_entry_t type, and |
1898 | * 2) the number of bits in the swap pte as defined by the | 1861 | * 2) the number of bits in the swap pte as defined by the |
1899 | * the different architectures, and 3) the number of free bits | 1862 | * different architectures. In order to find the |
1900 | * in an exceptional radix_tree entry. In order to find the | ||
1901 | * largest possible bit mask, a swap entry with swap type 0 | 1863 | * largest possible bit mask, a swap entry with swap type 0 |
1902 | * and swap offset ~0UL is created, encoded to a swap pte, | 1864 | * and swap offset ~0UL is created, encoded to a swap pte, |
1903 | * decoded to a swp_entry_t again, and finally the swap | 1865 | * decoded to a swp_entry_t again, and finally the swap |
1904 | * offset is extracted. This will mask all the bits from | 1866 | * offset is extracted. This will mask all the bits from |
1905 | * the initial ~0UL mask that can't be encoded in either | 1867 | * the initial ~0UL mask that can't be encoded in either |
1906 | * the swp_entry_t or the architecture definition of a | 1868 | * the swp_entry_t or the architecture definition of a |
1907 | * swap pte. Then the same is done for a radix_tree entry. | 1869 | * swap pte. |
1908 | */ | 1870 | */ |
1909 | maxpages = swp_offset(pte_to_swp_entry( | 1871 | maxpages = swp_offset(pte_to_swp_entry( |
1910 | swp_entry_to_pte(swp_entry(0, ~0UL)))); | 1872 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; |
1911 | maxpages = swp_offset(radix_to_swp_entry( | ||
1912 | swp_to_radix_entry(swp_entry(0, maxpages)))) + 1; | ||
1913 | |||
1914 | if (maxpages > swap_header->info.last_page) { | 1873 | if (maxpages > swap_header->info.last_page) { |
1915 | maxpages = swap_header->info.last_page + 1; | 1874 | maxpages = swap_header->info.last_page + 1; |
1916 | /* p->max is an unsigned int: don't overflow it */ | 1875 | /* p->max is an unsigned int: don't overflow it */ |
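With the radix-tree clamp gone, the hunk above sizes a swap device purely by round-tripping an all-ones offset through the architecture's swap pte encoding. The stand-alone sketch below models that trick with an invented 50-bit offset field (the real width is per-architecture, so treat the numbers as assumptions): whatever bits of ~0UL survive the encode/decode are exactly the bits a swap offset may use, and adding one turns the largest offset into a page count.

/* User-space model of the maxpages round trip in read_swap_header().
 * TOY_SWP_OFFSET_BITS is an assumed width; real kernels derive it from
 * the arch's swap pte layout.  Assumes a 64-bit unsigned long. */
#include <stdio.h>

#define TOY_SWP_OFFSET_BITS 50

/* Pack (type, offset) the way an arch swap pte might: only
 * TOY_SWP_OFFSET_BITS of the offset fit, so the round trip masks the
 * excess bits off - the same effect pte_to_swp_entry(swp_entry_to_pte())
 * has in read_swap_header(). */
static unsigned long toy_swp_entry_to_pte(unsigned long type, unsigned long offset)
{
	return (type << TOY_SWP_OFFSET_BITS) |
	       (offset & ((1UL << TOY_SWP_OFFSET_BITS) - 1));
}

static unsigned long toy_pte_to_swp_offset(unsigned long pte)
{
	return pte & ((1UL << TOY_SWP_OFFSET_BITS) - 1);
}

int main(void)
{
	/* push an all-ones offset through the encoding and keep what survives */
	unsigned long maxpages =
		toy_pte_to_swp_offset(toy_swp_entry_to_pte(0, ~0UL)) + 1;

	printf("largest representable offset + 1 = %lu pages\n", maxpages);
	return 0;
}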
@@ -1988,6 +1947,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1988 | sector_t span; | 1947 | sector_t span; |
1989 | unsigned long maxpages; | 1948 | unsigned long maxpages; |
1990 | unsigned char *swap_map = NULL; | 1949 | unsigned char *swap_map = NULL; |
1950 | unsigned long *frontswap_map = NULL; | ||
1991 | struct page *page = NULL; | 1951 | struct page *page = NULL; |
1992 | struct inode *inode = NULL; | 1952 | struct inode *inode = NULL; |
1993 | 1953 | ||
@@ -2071,6 +2031,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2071 | error = nr_extents; | 2031 | error = nr_extents; |
2072 | goto bad_swap; | 2032 | goto bad_swap; |
2073 | } | 2033 | } |
2034 | /* frontswap enabled? set up bit-per-page map for frontswap */ | ||
2035 | if (frontswap_enabled) | ||
2036 | frontswap_map = vzalloc(maxpages / sizeof(long)); | ||
2074 | 2037 | ||
2075 | if (p->bdev) { | 2038 | if (p->bdev) { |
2076 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { | 2039 | if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { |
@@ -2086,14 +2049,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
2086 | if (swap_flags & SWAP_FLAG_PREFER) | 2049 | if (swap_flags & SWAP_FLAG_PREFER) |
2087 | prio = | 2050 | prio = |
2088 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; | 2051 | (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; |
2089 | enable_swap_info(p, prio, swap_map); | 2052 | enable_swap_info(p, prio, swap_map, frontswap_map); |
2090 | 2053 | ||
2091 | printk(KERN_INFO "Adding %uk swap on %s. " | 2054 | printk(KERN_INFO "Adding %uk swap on %s. " |
2092 | "Priority:%d extents:%d across:%lluk %s%s\n", | 2055 | "Priority:%d extents:%d across:%lluk %s%s%s\n", |
2093 | p->pages<<(PAGE_SHIFT-10), name, p->prio, | 2056 | p->pages<<(PAGE_SHIFT-10), name, p->prio, |
2094 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), | 2057 | nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), |
2095 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", | 2058 | (p->flags & SWP_SOLIDSTATE) ? "SS" : "", |
2096 | (p->flags & SWP_DISCARDABLE) ? "D" : ""); | 2059 | (p->flags & SWP_DISCARDABLE) ? "D" : "", |
2060 | (frontswap_map) ? "FS" : ""); | ||
2097 | 2061 | ||
2098 | mutex_unlock(&swapon_mutex); | 2062 | mutex_unlock(&swapon_mutex); |
2099 | atomic_inc(&proc_poll_event); | 2063 | atomic_inc(&proc_poll_event); |
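On the swapon side, the only frontswap-specific work visible here is allocating a zeroed bit-per-page map when frontswap is enabled and handing it to enable_swap_info(), which in turn calls frontswap_init() for the new area. The user-space sketch below illustrates the bookkeeping such a map provides, one bit per swap offset; the map_* helper names are invented for the sketch, and in-kernel code would use the standard set_bit()/test_bit() bitmap operations on the vzalloc'd map instead.

/* User-space sketch of a bit-per-page map: one bit per swap offset,
 * set while that page is held outside the swap device. */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

static unsigned long *map_alloc(unsigned long maxpages)
{
	unsigned long nlongs = (maxpages + BITS_PER_LONG - 1) / BITS_PER_LONG;
	return calloc(nlongs, sizeof(unsigned long));	/* zeroed, like vzalloc() */
}

static void map_set(unsigned long *map, unsigned long offset)
{
	map[offset / BITS_PER_LONG] |= 1UL << (offset % BITS_PER_LONG);
}

static int map_test(const unsigned long *map, unsigned long offset)
{
	return (map[offset / BITS_PER_LONG] >> (offset % BITS_PER_LONG)) & 1;
}

int main(void)
{
	unsigned long maxpages = 1 << 20;	/* pretend 4GB of swap with 4K pages */
	unsigned long *frontswap_map = map_alloc(maxpages);

	if (!frontswap_map)
		return 1;
	map_set(frontswap_map, 42);	/* offset 42 is now cached elsewhere */
	printf("offset 42 cached? %d\n", map_test(frontswap_map, 42));
	printf("offset 43 cached? %d\n", map_test(frontswap_map, 43));
	free(frontswap_map);
	return 0;
}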
@@ -2261,6 +2225,31 @@ int swapcache_prepare(swp_entry_t entry) | |||
2261 | return __swap_duplicate(entry, SWAP_HAS_CACHE); | 2225 | return __swap_duplicate(entry, SWAP_HAS_CACHE); |
2262 | } | 2226 | } |
2263 | 2227 | ||
2228 | struct swap_info_struct *page_swap_info(struct page *page) | ||
2229 | { | ||
2230 | swp_entry_t swap = { .val = page_private(page) }; | ||
2231 | BUG_ON(!PageSwapCache(page)); | ||
2232 | return swap_info[swp_type(swap)]; | ||
2233 | } | ||
2234 | |||
2235 | /* | ||
2236 | * out-of-line __page_file_ methods to avoid include hell. | ||
2237 | */ | ||
2238 | struct address_space *__page_file_mapping(struct page *page) | ||
2239 | { | ||
2240 | VM_BUG_ON(!PageSwapCache(page)); | ||
2241 | return page_swap_info(page)->swap_file->f_mapping; | ||
2242 | } | ||
2243 | EXPORT_SYMBOL_GPL(__page_file_mapping); | ||
2244 | |||
2245 | pgoff_t __page_file_index(struct page *page) | ||
2246 | { | ||
2247 | swp_entry_t swap = { .val = page_private(page) }; | ||
2248 | VM_BUG_ON(!PageSwapCache(page)); | ||
2249 | return swp_offset(swap); | ||
2250 | } | ||
2251 | EXPORT_SYMBOL_GPL(__page_file_index); | ||
2252 | |||
2264 | /* | 2253 | /* |
2265 | * add_swap_count_continuation - called when a swap count is duplicated | 2254 | * add_swap_count_continuation - called when a swap count is duplicated |
2266 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's | 2255 | * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's |
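The new page_swap_info() and __page_file_* helpers give swap I/O paths a way to find the backing swap file and the page's offset in it straight from the swap-cache entry stored in page_private(), without dragging swapfile internals into more headers. The stand-alone sketch below models that lookup; the bit split, the structure layouts, and all toy_* names are assumptions made for illustration only.

/* User-space sketch of what page_swap_info()/__page_file_index() recover:
 * a swap-cache page stores a packed swp_entry in its private field; the
 * type part selects the swap area, the offset part is the page's index
 * within that area. */
#include <stdio.h>

#define TOY_SWP_TYPE_SHIFT 58	/* assumed: top bits hold the swap type */

struct toy_swap_info { const char *swap_file; };
struct toy_page      { unsigned long private; };	/* packed swp_entry */

static struct toy_swap_info toy_swap_info_tbl[] = {
	{ "/dev/sda2" },
	{ "/swapfile" },
};

static unsigned long toy_swp_type(unsigned long val)
{
	return val >> TOY_SWP_TYPE_SHIFT;
}

static unsigned long toy_swp_offset(unsigned long val)
{
	return val & ((1UL << TOY_SWP_TYPE_SHIFT) - 1);
}

static struct toy_swap_info *toy_page_swap_info(struct toy_page *page)
{
	return &toy_swap_info_tbl[toy_swp_type(page->private)];
}

static unsigned long toy_page_file_index(struct toy_page *page)
{
	return toy_swp_offset(page->private);
}

int main(void)
{
	/* a page swapped to entry (type = 1, offset = 123) */
	struct toy_page page = { .private = (1UL << TOY_SWP_TYPE_SHIFT) | 123 };

	printf("backed by %s at offset %lu\n",
	       toy_page_swap_info(&page)->swap_file,
	       toy_page_file_index(&page));
	return 0;
}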
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2aad49981b57..2bb90b1d241c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -413,11 +413,11 @@ nocache: | |||
413 | if (addr + size - 1 < addr) | 413 | if (addr + size - 1 < addr) |
414 | goto overflow; | 414 | goto overflow; |
415 | 415 | ||
416 | n = rb_next(&first->rb_node); | 416 | if (list_is_last(&first->list, &vmap_area_list)) |
417 | if (n) | ||
418 | first = rb_entry(n, struct vmap_area, rb_node); | ||
419 | else | ||
420 | goto found; | 417 | goto found; |
418 | |||
419 | first = list_entry(first->list.next, | ||
420 | struct vmap_area, list); | ||
421 | } | 421 | } |
422 | 422 | ||
423 | found: | 423 | found: |
@@ -904,6 +904,14 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | |||
904 | 904 | ||
905 | BUG_ON(size & ~PAGE_MASK); | 905 | BUG_ON(size & ~PAGE_MASK); |
906 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | 906 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); |
907 | if (WARN_ON(size == 0)) { | ||
908 | /* | ||
909 | * Allocating 0 bytes isn't what caller wants since | ||
910 | * get_order(0) returns funny result. Just warn and terminate | ||
911 | * early. | ||
912 | */ | ||
913 | return NULL; | ||
914 | } | ||
907 | order = get_order(size); | 915 | order = get_order(size); |
908 | 916 | ||
909 | again: | 917 | again: |
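The zero-size check added to vb_alloc() exists because get_order() is only meaningful for a non-zero size. The user-space approximation below (loosely modelled on the generic get_order(); the exact kernel implementation is architecture and version dependent) shows the failure mode: the internal size-- underflows, so the computed order is enormous rather than zero.

/* Approximation of get_order() to show why size == 0 must be rejected. */
#include <stdio.h>

#define TOY_PAGE_SHIFT 12	/* assume 4 KiB pages */

static int toy_get_order(unsigned long size)
{
	int order = 0;

	size = (size - 1) >> TOY_PAGE_SHIFT;	/* underflows when size == 0 */
	while (size) {
		size >>= 1;
		order++;
	}
	return order;
}

int main(void)
{
	printf("get_order(4096)  = %d\n", toy_get_order(4096));		/* 0 */
	printf("get_order(16384) = %d\n", toy_get_order(16384));	/* 2 */
	printf("get_order(0)     = %d\n", toy_get_order(0));		/* huge */
	return 0;
}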
@@ -1280,7 +1288,7 @@ DEFINE_RWLOCK(vmlist_lock); | |||
1280 | struct vm_struct *vmlist; | 1288 | struct vm_struct *vmlist; |
1281 | 1289 | ||
1282 | static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | 1290 | static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, |
1283 | unsigned long flags, void *caller) | 1291 | unsigned long flags, const void *caller) |
1284 | { | 1292 | { |
1285 | vm->flags = flags; | 1293 | vm->flags = flags; |
1286 | vm->addr = (void *)va->va_start; | 1294 | vm->addr = (void *)va->va_start; |
@@ -1306,7 +1314,7 @@ static void insert_vmalloc_vmlist(struct vm_struct *vm) | |||
1306 | } | 1314 | } |
1307 | 1315 | ||
1308 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | 1316 | static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, |
1309 | unsigned long flags, void *caller) | 1317 | unsigned long flags, const void *caller) |
1310 | { | 1318 | { |
1311 | setup_vmalloc_vm(vm, va, flags, caller); | 1319 | setup_vmalloc_vm(vm, va, flags, caller); |
1312 | insert_vmalloc_vmlist(vm); | 1320 | insert_vmalloc_vmlist(vm); |
@@ -1314,7 +1322,7 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, | |||
1314 | 1322 | ||
1315 | static struct vm_struct *__get_vm_area_node(unsigned long size, | 1323 | static struct vm_struct *__get_vm_area_node(unsigned long size, |
1316 | unsigned long align, unsigned long flags, unsigned long start, | 1324 | unsigned long align, unsigned long flags, unsigned long start, |
1317 | unsigned long end, int node, gfp_t gfp_mask, void *caller) | 1325 | unsigned long end, int node, gfp_t gfp_mask, const void *caller) |
1318 | { | 1326 | { |
1319 | struct vmap_area *va; | 1327 | struct vmap_area *va; |
1320 | struct vm_struct *area; | 1328 | struct vm_struct *area; |
@@ -1375,7 +1383,7 @@ EXPORT_SYMBOL_GPL(__get_vm_area); | |||
1375 | 1383 | ||
1376 | struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, | 1384 | struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, |
1377 | unsigned long start, unsigned long end, | 1385 | unsigned long start, unsigned long end, |
1378 | void *caller) | 1386 | const void *caller) |
1379 | { | 1387 | { |
1380 | return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, | 1388 | return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, |
1381 | caller); | 1389 | caller); |
@@ -1397,13 +1405,21 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) | |||
1397 | } | 1405 | } |
1398 | 1406 | ||
1399 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, | 1407 | struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, |
1400 | void *caller) | 1408 | const void *caller) |
1401 | { | 1409 | { |
1402 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, | 1410 | return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, |
1403 | -1, GFP_KERNEL, caller); | 1411 | -1, GFP_KERNEL, caller); |
1404 | } | 1412 | } |
1405 | 1413 | ||
1406 | static struct vm_struct *find_vm_area(const void *addr) | 1414 | /** |
1415 | * find_vm_area - find a continuous kernel virtual area | ||
1416 | * @addr: base address | ||
1417 | * | ||
1418 | * Search for the kernel VM area starting at @addr, and return it. | ||
1419 | * It is up to the caller to do all required locking to keep the returned | ||
1420 | * pointer valid. | ||
1421 | */ | ||
1422 | struct vm_struct *find_vm_area(const void *addr) | ||
1407 | { | 1423 | { |
1408 | struct vmap_area *va; | 1424 | struct vmap_area *va; |
1409 | 1425 | ||
@@ -1568,9 +1584,9 @@ EXPORT_SYMBOL(vmap); | |||
1568 | 1584 | ||
1569 | static void *__vmalloc_node(unsigned long size, unsigned long align, | 1585 | static void *__vmalloc_node(unsigned long size, unsigned long align, |
1570 | gfp_t gfp_mask, pgprot_t prot, | 1586 | gfp_t gfp_mask, pgprot_t prot, |
1571 | int node, void *caller); | 1587 | int node, const void *caller); |
1572 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | 1588 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
1573 | pgprot_t prot, int node, void *caller) | 1589 | pgprot_t prot, int node, const void *caller) |
1574 | { | 1590 | { |
1575 | const int order = 0; | 1591 | const int order = 0; |
1576 | struct page **pages; | 1592 | struct page **pages; |
@@ -1643,7 +1659,7 @@ fail: | |||
1643 | */ | 1659 | */ |
1644 | void *__vmalloc_node_range(unsigned long size, unsigned long align, | 1660 | void *__vmalloc_node_range(unsigned long size, unsigned long align, |
1645 | unsigned long start, unsigned long end, gfp_t gfp_mask, | 1661 | unsigned long start, unsigned long end, gfp_t gfp_mask, |
1646 | pgprot_t prot, int node, void *caller) | 1662 | pgprot_t prot, int node, const void *caller) |
1647 | { | 1663 | { |
1648 | struct vm_struct *area; | 1664 | struct vm_struct *area; |
1649 | void *addr; | 1665 | void *addr; |
@@ -1699,7 +1715,7 @@ fail: | |||
1699 | */ | 1715 | */ |
1700 | static void *__vmalloc_node(unsigned long size, unsigned long align, | 1716 | static void *__vmalloc_node(unsigned long size, unsigned long align, |
1701 | gfp_t gfp_mask, pgprot_t prot, | 1717 | gfp_t gfp_mask, pgprot_t prot, |
1702 | int node, void *caller) | 1718 | int node, const void *caller) |
1703 | { | 1719 | { |
1704 | return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, | 1720 | return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, |
1705 | gfp_mask, prot, node, caller); | 1721 | gfp_mask, prot, node, caller); |
@@ -1975,9 +1991,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count) | |||
1975 | * IOREMAP area is treated as memory hole and no copy is done. | 1991 | * IOREMAP area is treated as memory hole and no copy is done. |
1976 | * | 1992 | * |
1977 | * If [addr...addr+count) doesn't includes any intersects with alive | 1993 | * If [addr...addr+count) doesn't includes any intersects with alive |
1978 | * vm_struct area, returns 0. | 1994 | * vm_struct area, returns 0. @buf should be kernel's buffer. |
1979 | * @buf should be kernel's buffer. Because this function uses KM_USER0, | ||
1980 | * the caller should guarantee KM_USER0 is not used. | ||
1981 | * | 1995 | * |
1982 | * Note: In usual ops, vread() is never necessary because the caller | 1996 | * Note: In usual ops, vread() is never necessary because the caller |
1983 | * should know vmalloc() area is valid and can use memcpy(). | 1997 | * should know vmalloc() area is valid and can use memcpy(). |
@@ -2051,9 +2065,7 @@ finished: | |||
2051 | * IOREMAP area is treated as memory hole and no copy is done. | 2065 | * IOREMAP area is treated as memory hole and no copy is done. |
2052 | * | 2066 | * |
2053 | * If [addr...addr+count) doesn't includes any intersects with alive | 2067 | * If [addr...addr+count) doesn't includes any intersects with alive |
2054 | * vm_struct area, returns 0. | 2068 | * vm_struct area, returns 0. @buf should be kernel's buffer. |
2055 | * @buf should be kernel's buffer. Because this function uses KM_USER0, | ||
2056 | * the caller should guarantee KM_USER0 is not used. | ||
2057 | * | 2069 | * |
2058 | * Note: In usual ops, vwrite() is never necessary because the caller | 2070 | * Note: In usual ops, vwrite() is never necessary because the caller |
2059 | * should know vmalloc() area is valid and can use memcpy(). | 2071 | * should know vmalloc() area is valid and can use memcpy(). |
diff --git a/mm/vmscan.c b/mm/vmscan.c index eeb3bc9d1d36..8d01243d9560 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -133,7 +133,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */ | |||
133 | static LIST_HEAD(shrinker_list); | 133 | static LIST_HEAD(shrinker_list); |
134 | static DECLARE_RWSEM(shrinker_rwsem); | 134 | static DECLARE_RWSEM(shrinker_rwsem); |
135 | 135 | ||
136 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 136 | #ifdef CONFIG_MEMCG |
137 | static bool global_reclaim(struct scan_control *sc) | 137 | static bool global_reclaim(struct scan_control *sc) |
138 | { | 138 | { |
139 | return !sc->target_mem_cgroup; | 139 | return !sc->target_mem_cgroup; |
@@ -687,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
687 | 687 | ||
688 | cond_resched(); | 688 | cond_resched(); |
689 | 689 | ||
690 | mem_cgroup_uncharge_start(); | ||
690 | while (!list_empty(page_list)) { | 691 | while (!list_empty(page_list)) { |
691 | enum page_references references; | 692 | enum page_references references; |
692 | struct address_space *mapping; | 693 | struct address_space *mapping; |
@@ -720,9 +721,41 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
720 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); | 721 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); |
721 | 722 | ||
722 | if (PageWriteback(page)) { | 723 | if (PageWriteback(page)) { |
723 | nr_writeback++; | 724 | /* |
724 | unlock_page(page); | 725 | * memcg doesn't have any dirty pages throttling so we |
725 | goto keep; | 726 | * could easily OOM just because too many pages are in |
727 | * writeback and there is nothing else to reclaim. | ||
728 | * | ||
729 | * Check __GFP_IO, certainly because a loop driver | ||
730 | * thread might enter reclaim, and deadlock if it waits | ||
731 | * on a page for which it is needed to do the write | ||
732 | * (loop masks off __GFP_IO|__GFP_FS for this reason); | ||
733 | * but more thought would probably show more reasons. | ||
734 | * | ||
735 | * Don't require __GFP_FS, since we're not going into | ||
736 | * the FS, just waiting on its writeback completion. | ||
737 | * Worryingly, ext4, gfs2 and xfs allocate pages with | ||
738 | * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so | ||
739 | * testing may_enter_fs here is liable to OOM on them. | ||
740 | */ | ||
741 | if (global_reclaim(sc) || | ||
742 | !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { | ||
743 | /* | ||
744 | * This is slightly racy - end_page_writeback() | ||
745 | * might have just cleared PageReclaim, then | ||
746 | * setting PageReclaim here ends up interpreted | ||
747 | * as PageReadahead - but that does not matter | ||
748 | * enough to care. What we do want is for this | ||
749 | * page to have PageReclaim set next time memcg | ||
750 | * reclaim reaches the tests above, so it will | ||
751 | * then wait_on_page_writeback() to avoid OOM; | ||
752 | * and it's also appropriate in global reclaim. | ||
753 | */ | ||
754 | SetPageReclaim(page); | ||
755 | nr_writeback++; | ||
756 | goto keep_locked; | ||
757 | } | ||
758 | wait_on_page_writeback(page); | ||
726 | } | 759 | } |
727 | 760 | ||
728 | references = page_check_references(page, sc); | 761 | references = page_check_references(page, sc); |
@@ -921,6 +954,7 @@ keep: | |||
921 | 954 | ||
922 | list_splice(&ret_pages, page_list); | 955 | list_splice(&ret_pages, page_list); |
923 | count_vm_events(PGACTIVATE, pgactivate); | 956 | count_vm_events(PGACTIVATE, pgactivate); |
957 | mem_cgroup_uncharge_end(); | ||
924 | *ret_nr_dirty += nr_dirty; | 958 | *ret_nr_dirty += nr_dirty; |
925 | *ret_nr_writeback += nr_writeback; | 959 | *ret_nr_writeback += nr_writeback; |
926 | return nr_reclaimed; | 960 | return nr_reclaimed; |
@@ -1567,7 +1601,8 @@ static int vmscan_swappiness(struct scan_control *sc) | |||
1567 | * by looking at the fraction of the pages scanned we did rotate back | 1601 | * by looking at the fraction of the pages scanned we did rotate back |
1568 | * onto the active list instead of evict. | 1602 | * onto the active list instead of evict. |
1569 | * | 1603 | * |
1570 | * nr[0] = anon pages to scan; nr[1] = file pages to scan | 1604 | * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan |
1605 | * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan | ||
1571 | */ | 1606 | */ |
1572 | static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, | 1607 | static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, |
1573 | unsigned long *nr) | 1608 | unsigned long *nr) |
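The corrected comment reflects that get_scan_count() fills one slot per LRU list rather than a two-entry anon/file split. The short sketch below shows the index-to-list correspondence; the enum is reproduced from memory of that era's include/linux/mmzone.h, so treat it as illustrative rather than authoritative.

/* Sketch: get_scan_count()'s nr[] is indexed by enum lru_list. */
#include <stdio.h>

enum lru_list {
	LRU_INACTIVE_ANON,	/* nr[0] */
	LRU_ACTIVE_ANON,	/* nr[1] */
	LRU_INACTIVE_FILE,	/* nr[2] */
	LRU_ACTIVE_FILE,	/* nr[3] */
	LRU_UNEVICTABLE,	/* never filled in by get_scan_count() */
	NR_LRU_LISTS
};

int main(void)
{
	unsigned long nr[NR_LRU_LISTS] = { 0 };

	nr[LRU_INACTIVE_ANON] = 32;	/* anon inactive pages to scan */
	nr[LRU_ACTIVE_FILE]   = 128;	/* file active pages to scan */
	printf("nr[%d] = %lu, nr[%d] = %lu\n",
	       LRU_INACTIVE_ANON, nr[LRU_INACTIVE_ANON],
	       LRU_ACTIVE_FILE, nr[LRU_ACTIVE_FILE]);
	return 0;
}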
@@ -2111,6 +2146,83 @@ out: | |||
2111 | return 0; | 2146 | return 0; |
2112 | } | 2147 | } |
2113 | 2148 | ||
2149 | static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | ||
2150 | { | ||
2151 | struct zone *zone; | ||
2152 | unsigned long pfmemalloc_reserve = 0; | ||
2153 | unsigned long free_pages = 0; | ||
2154 | int i; | ||
2155 | bool wmark_ok; | ||
2156 | |||
2157 | for (i = 0; i <= ZONE_NORMAL; i++) { | ||
2158 | zone = &pgdat->node_zones[i]; | ||
2159 | pfmemalloc_reserve += min_wmark_pages(zone); | ||
2160 | free_pages += zone_page_state(zone, NR_FREE_PAGES); | ||
2161 | } | ||
2162 | |||
2163 | wmark_ok = free_pages > pfmemalloc_reserve / 2; | ||
2164 | |||
2165 | /* kswapd must be awake if processes are being throttled */ | ||
2166 | if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { | ||
2167 | pgdat->classzone_idx = min(pgdat->classzone_idx, | ||
2168 | (enum zone_type)ZONE_NORMAL); | ||
2169 | wake_up_interruptible(&pgdat->kswapd_wait); | ||
2170 | } | ||
2171 | |||
2172 | return wmark_ok; | ||
2173 | } | ||
2174 | |||
2175 | /* | ||
2176 | * Throttle direct reclaimers if backing storage is backed by the network | ||
2177 | * and the PFMEMALLOC reserve for the preferred node is getting dangerously | ||
2178 | * depleted. kswapd will continue to make progress and wake the processes | ||
2179 | * when the low watermark is reached | ||
2180 | */ | ||
2181 | static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | ||
2182 | nodemask_t *nodemask) | ||
2183 | { | ||
2184 | struct zone *zone; | ||
2185 | int high_zoneidx = gfp_zone(gfp_mask); | ||
2186 | pg_data_t *pgdat; | ||
2187 | |||
2188 | /* | ||
2189 | * Kernel threads should not be throttled as they may be indirectly | ||
2190 | * responsible for cleaning pages necessary for reclaim to make forward | ||
2191 | * progress. kjournald for example may enter direct reclaim while | ||
2192 | * committing a transaction where throttling it could force other | ||
2193 | * processes to block on log_wait_commit(). | ||
2194 | */ | ||
2195 | if (current->flags & PF_KTHREAD) | ||
2196 | return; | ||
2197 | |||
2198 | /* Check if the pfmemalloc reserves are ok */ | ||
2199 | first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); | ||
2200 | pgdat = zone->zone_pgdat; | ||
2201 | if (pfmemalloc_watermark_ok(pgdat)) | ||
2202 | return; | ||
2203 | |||
2204 | /* Account for the throttling */ | ||
2205 | count_vm_event(PGSCAN_DIRECT_THROTTLE); | ||
2206 | |||
2207 | /* | ||
2208 | * If the caller cannot enter the filesystem, it's possible that it | ||
2209 | * is due to the caller holding an FS lock or performing a journal | ||
2210 | * transaction in the case of a filesystem like ext[3|4]. In this case, | ||
2211 | * it is not safe to block on pfmemalloc_wait as kswapd could be | ||
2212 | * blocked waiting on the same lock. Instead, throttle for up to a | ||
2213 | * second before continuing. | ||
2214 | */ | ||
2215 | if (!(gfp_mask & __GFP_FS)) { | ||
2216 | wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, | ||
2217 | pfmemalloc_watermark_ok(pgdat), HZ); | ||
2218 | return; | ||
2219 | } | ||
2220 | |||
2221 | /* Throttle until kswapd wakes the process */ | ||
2222 | wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, | ||
2223 | pfmemalloc_watermark_ok(pgdat)); | ||
2224 | } | ||
2225 | |||
2114 | unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | 2226 | unsigned long try_to_free_pages(struct zonelist *zonelist, int order, |
2115 | gfp_t gfp_mask, nodemask_t *nodemask) | 2227 | gfp_t gfp_mask, nodemask_t *nodemask) |
2116 | { | 2228 | { |
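Two things in the hunk above are worth pulling out: pfmemalloc_watermark_ok() compares the node's free pages against half of the summed min watermarks of the zones up to ZONE_NORMAL, and throttle_direct_reclaim() then chooses between returning immediately, a timed wait, or a killable wait. The user-space model below reproduces only the watermark arithmetic, with made-up zone values, to show how the 50% threshold behaves.

/* User-space model of the pfmemalloc_watermark_ok() arithmetic. */
#include <stdbool.h>
#include <stdio.h>

struct toy_zone {
	unsigned long min_wmark_pages;
	unsigned long nr_free_pages;
};

static bool toy_pfmemalloc_watermark_ok(const struct toy_zone *zones, int nzones)
{
	unsigned long pfmemalloc_reserve = 0, free_pages = 0;
	int i;

	for (i = 0; i < nzones; i++) {	/* i.e. zone indices 0..ZONE_NORMAL */
		pfmemalloc_reserve += zones[i].min_wmark_pages;
		free_pages += zones[i].nr_free_pages;
	}
	return free_pages > pfmemalloc_reserve / 2;
}

int main(void)
{
	struct toy_zone zones[] = {
		{ .min_wmark_pages = 128,  .nr_free_pages = 40 },	/* lowmem-ish */
		{ .min_wmark_pages = 4096, .nr_free_pages = 1500 },	/* normal-ish */
	};

	/* reserve = 4224, half = 2112, free = 1540 -> not ok, prints 0 */
	printf("watermark ok: %d\n", toy_pfmemalloc_watermark_ok(zones, 2));
	return 0;
}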
@@ -2130,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2130 | .gfp_mask = sc.gfp_mask, | 2242 | .gfp_mask = sc.gfp_mask, |
2131 | }; | 2243 | }; |
2132 | 2244 | ||
2245 | throttle_direct_reclaim(gfp_mask, zonelist, nodemask); | ||
2246 | |||
2247 | /* | ||
2248 | * Do not enter reclaim if fatal signal is pending. 1 is returned so | ||
2249 | * that the page allocator does not consider triggering OOM | ||
2250 | */ | ||
2251 | if (fatal_signal_pending(current)) | ||
2252 | return 1; | ||
2253 | |||
2133 | trace_mm_vmscan_direct_reclaim_begin(order, | 2254 | trace_mm_vmscan_direct_reclaim_begin(order, |
2134 | sc.may_writepage, | 2255 | sc.may_writepage, |
2135 | gfp_mask); | 2256 | gfp_mask); |
@@ -2141,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2141 | return nr_reclaimed; | 2262 | return nr_reclaimed; |
2142 | } | 2263 | } |
2143 | 2264 | ||
2144 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 2265 | #ifdef CONFIG_MEMCG |
2145 | 2266 | ||
2146 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, | 2267 | unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, |
2147 | gfp_t gfp_mask, bool noswap, | 2268 | gfp_t gfp_mask, bool noswap, |
@@ -2274,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, | |||
2274 | return balanced_pages >= (present_pages >> 2); | 2395 | return balanced_pages >= (present_pages >> 2); |
2275 | } | 2396 | } |
2276 | 2397 | ||
2277 | /* is kswapd sleeping prematurely? */ | 2398 | /* |
2278 | static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | 2399 | * Prepare kswapd for sleeping. This verifies that there are no processes |
2400 | * waiting in throttle_direct_reclaim() and that watermarks have been met. | ||
2401 | * | ||
2402 | * Returns true if kswapd is ready to sleep | ||
2403 | */ | ||
2404 | static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | ||
2279 | int classzone_idx) | 2405 | int classzone_idx) |
2280 | { | 2406 | { |
2281 | int i; | 2407 | int i; |
@@ -2284,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2284 | 2410 | ||
2285 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | 2411 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ |
2286 | if (remaining) | 2412 | if (remaining) |
2287 | return true; | 2413 | return false; |
2414 | |||
2415 | /* | ||
2416 | * There is a potential race between when kswapd checks its watermarks | ||
2417 | * and a process gets throttled. There is also a potential race if | ||
2418 | * processes get throttled, kswapd wakes, a large process exits, thereby | ||
2419 | * balancing the zones and causing kswapd to miss a wakeup. If kswapd | ||
2420 | * is going to sleep, no process should be sleeping on pfmemalloc_wait | ||
2421 | * so wake them now if necessary. If the reserves are depleted again, | ||
2422 | * processes will wake kswapd and get throttled again. | ||
2423 | */ | ||
2424 | if (waitqueue_active(&pgdat->pfmemalloc_wait)) { | ||
2425 | wake_up(&pgdat->pfmemalloc_wait); | ||
2426 | return false; | ||
2427 | } | ||
2288 | 2428 | ||
2289 | /* Check the watermark levels */ | 2429 | /* Check the watermark levels */ |
2290 | for (i = 0; i <= classzone_idx; i++) { | 2430 | for (i = 0; i <= classzone_idx; i++) { |
@@ -2317,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2317 | * must be balanced | 2457 | * must be balanced |
2318 | */ | 2458 | */ |
2319 | if (order) | 2459 | if (order) |
2320 | return !pgdat_balanced(pgdat, balanced, classzone_idx); | 2460 | return pgdat_balanced(pgdat, balanced, classzone_idx); |
2321 | else | 2461 | else |
2322 | return !all_zones_ok; | 2462 | return all_zones_ok; |
2323 | } | 2463 | } |
2324 | 2464 | ||
2325 | /* | 2465 | /* |
@@ -2537,7 +2677,7 @@ loop_again: | |||
2537 | * consider it to be no longer congested. It's | 2677 | * consider it to be no longer congested. It's |
2538 | * possible there are dirty pages backed by | 2678 | * possible there are dirty pages backed by |
2539 | * congested BDIs but as pressure is relieved, | 2679 | * congested BDIs but as pressure is relieved, |
2540 | * spectulatively avoid congestion waits | 2680 | * speculatively avoid congestion waits |
2541 | */ | 2681 | */ |
2542 | zone_clear_flag(zone, ZONE_CONGESTED); | 2682 | zone_clear_flag(zone, ZONE_CONGESTED); |
2543 | if (i <= *classzone_idx) | 2683 | if (i <= *classzone_idx) |
@@ -2545,6 +2685,16 @@ loop_again: | |||
2545 | } | 2685 | } |
2546 | 2686 | ||
2547 | } | 2687 | } |
2688 | |||
2689 | /* | ||
2690 | * If the low watermark is met there is no need for processes | ||
2691 | * to be throttled on pfmemalloc_wait as they should now be | ||
2692 | * able to safely make forward progress. Wake them. | ||
2693 | */ | ||
2694 | if (waitqueue_active(&pgdat->pfmemalloc_wait) && | ||
2695 | pfmemalloc_watermark_ok(pgdat)) | ||
2696 | wake_up(&pgdat->pfmemalloc_wait); | ||
2697 | |||
2548 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) | 2698 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) |
2549 | break; /* kswapd: all done */ | 2699 | break; /* kswapd: all done */ |
2550 | /* | 2700 | /* |
@@ -2646,7 +2796,7 @@ out: | |||
2646 | } | 2796 | } |
2647 | 2797 | ||
2648 | /* | 2798 | /* |
2649 | * Return the order we were reclaiming at so sleeping_prematurely() | 2799 | * Return the order we were reclaiming at so prepare_kswapd_sleep() |
2650 | * makes a decision on the order we were last reclaiming at. However, | 2800 | * makes a decision on the order we were last reclaiming at. However, |
2651 | * if another caller entered the allocator slow path while kswapd | 2801 | * if another caller entered the allocator slow path while kswapd |
2652 | * was awake, order will remain at the higher level | 2802 | * was awake, order will remain at the higher level |
@@ -2666,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2666 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2816 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
2667 | 2817 | ||
2668 | /* Try to sleep for a short interval */ | 2818 | /* Try to sleep for a short interval */ |
2669 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | 2819 | if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { |
2670 | remaining = schedule_timeout(HZ/10); | 2820 | remaining = schedule_timeout(HZ/10); |
2671 | finish_wait(&pgdat->kswapd_wait, &wait); | 2821 | finish_wait(&pgdat->kswapd_wait, &wait); |
2672 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2822 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
@@ -2676,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2676 | * After a short sleep, check if it was a premature sleep. If not, then | 2826 | * After a short sleep, check if it was a premature sleep. If not, then |
2677 | * go fully to sleep until explicitly woken up. | 2827 | * go fully to sleep until explicitly woken up. |
2678 | */ | 2828 | */ |
2679 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | 2829 | if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { |
2680 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | 2830 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); |
2681 | 2831 | ||
2682 | /* | 2832 | /* |
@@ -2688,7 +2838,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2688 | * them before going back to sleep. | 2838 | * them before going back to sleep. |
2689 | */ | 2839 | */ |
2690 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); | 2840 | set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); |
2691 | schedule(); | 2841 | |
2842 | if (!kthread_should_stop()) | ||
2843 | schedule(); | ||
2844 | |||
2692 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); | 2845 | set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); |
2693 | } else { | 2846 | } else { |
2694 | if (remaining) | 2847 | if (remaining) |
@@ -2955,14 +3108,17 @@ int kswapd_run(int nid) | |||
2955 | } | 3108 | } |
2956 | 3109 | ||
2957 | /* | 3110 | /* |
2958 | * Called by memory hotplug when all memory in a node is offlined. | 3111 | * Called by memory hotplug when all memory in a node is offlined. Caller must |
3112 | * hold lock_memory_hotplug(). | ||
2959 | */ | 3113 | */ |
2960 | void kswapd_stop(int nid) | 3114 | void kswapd_stop(int nid) |
2961 | { | 3115 | { |
2962 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; | 3116 | struct task_struct *kswapd = NODE_DATA(nid)->kswapd; |
2963 | 3117 | ||
2964 | if (kswapd) | 3118 | if (kswapd) { |
2965 | kthread_stop(kswapd); | 3119 | kthread_stop(kswapd); |
3120 | NODE_DATA(nid)->kswapd = NULL; | ||
3121 | } | ||
2966 | } | 3122 | } |
2967 | 3123 | ||
2968 | static int __init kswapd_init(void) | 3124 | static int __init kswapd_init(void) |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 1bbbbd9776ad..df7a6748231d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -745,6 +745,7 @@ const char * const vmstat_text[] = { | |||
745 | TEXTS_FOR_ZONES("pgsteal_direct") | 745 | TEXTS_FOR_ZONES("pgsteal_direct") |
746 | TEXTS_FOR_ZONES("pgscan_kswapd") | 746 | TEXTS_FOR_ZONES("pgscan_kswapd") |
747 | TEXTS_FOR_ZONES("pgscan_direct") | 747 | TEXTS_FOR_ZONES("pgscan_direct") |
748 | "pgscan_direct_throttle", | ||
748 | 749 | ||
749 | #ifdef CONFIG_NUMA | 750 | #ifdef CONFIG_NUMA |
750 | "zone_reclaim_failed", | 751 | "zone_reclaim_failed", |
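With the matching vmstat_text entry in place, the throttling added above becomes observable from user space as a pgscan_direct_throttle line in /proc/vmstat (assuming the running kernel carries these changes). A minimal reader:

/* Print the pgscan_direct_throttle counter, if the kernel exposes it.
 * Purely illustrative; 'grep pgscan_direct_throttle /proc/vmstat' does
 * the same job. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "pgscan_direct_throttle", 22))
			fputs(line, stdout);
	fclose(f);
	return 0;
}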