Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig | 34
-rw-r--r--  mm/Makefile | 20
-rw-r--r--  mm/backing-dev.c | 78
-rw-r--r--  mm/bootmem.c | 138
-rw-r--r--  mm/bounce.c | 8
-rw-r--r--  mm/cleancache.c | 6
-rw-r--r--  mm/compaction.c | 595
-rw-r--r--  mm/fadvise.c | 18
-rw-r--r--  mm/filemap.c | 146
-rw-r--r--  mm/filemap_xip.c | 10
-rw-r--r--  mm/frontswap.c | 344
-rw-r--r--  mm/highmem.c | 12
-rw-r--r--  mm/huge_memory.c | 29
-rw-r--r--  mm/hugetlb.c | 234
-rw-r--r--  mm/hugetlb_cgroup.c | 418
-rw-r--r--  mm/hwpoison-inject.c | 2
-rw-r--r--  mm/internal.h | 51
-rw-r--r--  mm/madvise.c | 29
-rw-r--r--  mm/memblock.c | 181
-rw-r--r--  mm/memcontrol.c | 1167
-rw-r--r--  mm/memory-failure.c | 45
-rw-r--r--  mm/memory.c | 93
-rw-r--r--  mm/memory_hotplug.c | 42
-rw-r--r--  mm/mempolicy.c | 102
-rw-r--r--  mm/mempool.c | 12
-rw-r--r--  mm/migrate.c | 106
-rw-r--r--  mm/mmap.c | 161
-rw-r--r--  mm/mmu_notifier.c | 45
-rw-r--r--  mm/mmzone.c | 14
-rw-r--r--  mm/mremap.c | 28
-rw-r--r--  mm/nobootmem.c | 151
-rw-r--r--  mm/nommu.c | 20
-rw-r--r--  mm/oom_kill.c | 278
-rw-r--r--  mm/page-writeback.c | 111
-rw-r--r--  mm/page_alloc.c | 779
-rw-r--r--  mm/page_cgroup.c | 6
-rw-r--r--  mm/page_io.c | 157
-rw-r--r--  mm/page_isolation.c | 108
-rw-r--r--  mm/pagewalk.c | 1
-rw-r--r--  mm/percpu-vm.c | 1
-rw-r--r--  mm/percpu.c | 22
-rw-r--r--  mm/pgtable-generic.c | 4
-rw-r--r--  mm/process_vm_access.c | 16
-rw-r--r--  mm/readahead.c | 40
-rw-r--r--  mm/rmap.c | 6
-rw-r--r--  mm/shmem.c | 572
-rw-r--r--  mm/slab.c | 622
-rw-r--r--  mm/slab.h | 33
-rw-r--r--  mm/slab_common.c | 120
-rw-r--r--  mm/slob.c | 152
-rw-r--r--  mm/slub.c | 479
-rw-r--r--  mm/sparse.c | 62
-rw-r--r--  mm/swap.c | 181
-rw-r--r--  mm/swap_state.c | 7
-rw-r--r--  mm/swapfile.c | 244
-rw-r--r--  mm/thrash.c | 155
-rw-r--r--  mm/truncate.c | 25
-rw-r--r--  mm/util.c | 30
-rw-r--r--  mm/vmalloc.c | 59
-rw-r--r--  mm/vmscan.c | 936
-rw-r--r--  mm/vmstat.c | 18
61 files changed, 5833 insertions, 3730 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e338407f1225..d5c8019c6627 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK
140config NO_BOOTMEM 140config NO_BOOTMEM
141 boolean 141 boolean
142 142
143config MEMORY_ISOLATION
144 boolean
145
143# eventually, we can have this option just 'select SPARSEMEM' 146# eventually, we can have this option just 'select SPARSEMEM'
144config MEMORY_HOTPLUG 147config MEMORY_HOTPLUG
145 bool "Allow for memory hot-add" 148 bool "Allow for memory hot-add"
149 select MEMORY_ISOLATION
146 depends on SPARSEMEM || X86_64_ACPI_NUMA 150 depends on SPARSEMEM || X86_64_ACPI_NUMA
147 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG 151 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
148 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) 152 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
@@ -198,7 +202,7 @@ config COMPACTION
198config MIGRATION 202config MIGRATION
199 bool "Page migration" 203 bool "Page migration"
200 def_bool y 204 def_bool y
201 depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION 205 depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA
202 help 206 help
203 Allows the migration of the physical location of pages of processes 207 Allows the migration of the physical location of pages of processes
204 while the virtual addresses are not changed. This is useful in 208 while the virtual addresses are not changed. This is useful in
@@ -272,6 +276,7 @@ config MEMORY_FAILURE
272 depends on MMU 276 depends on MMU
273 depends on ARCH_SUPPORTS_MEMORY_FAILURE 277 depends on ARCH_SUPPORTS_MEMORY_FAILURE
274 bool "Enable recovery from hardware memory errors" 278 bool "Enable recovery from hardware memory errors"
279 select MEMORY_ISOLATION
275 help 280 help
276 Enables code to recover from some memory failures on systems 281 Enables code to recover from some memory failures on systems
277 with MCA recovery. This allows a system to continue running 282 with MCA recovery. This allows a system to continue running
@@ -349,6 +354,16 @@ choice
349 benefit. 354 benefit.
350endchoice 355endchoice
351 356
357config CROSS_MEMORY_ATTACH
358 bool "Cross Memory Support"
359 depends on MMU
360 default y
361 help
362 Enabling this option adds the system calls process_vm_readv and
363 process_vm_writev which allow a process with the correct privileges
 364 to directly read from or write to another process's address space.
365 See the man page for more details.
366
352# 367#
353# UP and nommu archs use km based percpu allocator 368# UP and nommu archs use km based percpu allocator
354# 369#
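
[Editor's note] The CROSS_MEMORY_ATTACH help text above describes the process_vm_readv()/process_vm_writev() system calls. For reference, here is a minimal user-space sketch of process_vm_readv(); it assumes the target PID and a valid remote address are already known (for example, published by the peer over a pipe), and error handling is trimmed to the bare minimum.

	#define _GNU_SOURCE
	#include <sys/types.h>
	#include <sys/uio.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	int main(int argc, char **argv)
	{
		if (argc != 3) {
			fprintf(stderr, "usage: %s <pid> <remote-addr>\n", argv[0]);
			return 1;
		}

		pid_t pid = (pid_t)atol(argv[1]);
		void *remote_addr = (void *)(uintptr_t)strtoull(argv[2], NULL, 0);
		char buf[256];

		struct iovec local  = { .iov_base = buf,         .iov_len = sizeof(buf) };
		struct iovec remote = { .iov_base = remote_addr, .iov_len = sizeof(buf) };

		/* One local and one remote segment; the flags argument must be 0. */
		ssize_t n = process_vm_readv(pid, &local, 1, &remote, 1, 0);
		if (n < 0) {
			perror("process_vm_readv");
			return 1;
		}
		printf("read %zd bytes from pid %ld\n", n, (long)pid);
		return 0;
	}

The caller needs the same privileges it would need to ptrace-attach to the target process.
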
@@ -379,3 +394,20 @@ config CLEANCACHE
379 in a negligible performance hit. 394 in a negligible performance hit.
380 395
381 If unsure, say Y to enable cleancache 396 If unsure, say Y to enable cleancache
397
398config FRONTSWAP
399 bool "Enable frontswap to cache swap pages if tmem is present"
400 depends on SWAP
401 default n
402 help
403 Frontswap is so named because it can be thought of as the opposite
404 of a "backing" store for a swap device. The data is stored into
405 "transcendent memory", memory that is not directly accessible or
406 addressable by the kernel and is of unknown and possibly
407 time-varying size. When space in transcendent memory is available,
408 a significant swap I/O reduction may be achieved. When none is
409 available, all frontswap calls are reduced to a single pointer-
410 compare-against-NULL resulting in a negligible performance hit
411 and swap data is stored as normal on the matching swap device.
412
413 If unsure, say Y to enable frontswap.
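
[Editor's note] The FRONTSWAP help text's claim that, with no backend present, "all frontswap calls are reduced to a single pointer-compare-against-NULL" is easiest to see as a pattern. The sketch below illustrates that pattern only; the type and function names are invented for illustration and are not the kernel's actual frontswap symbols.

	/* Illustration of the "pointer compare against NULL" fast path described
	 * in the help text above. Names are invented; this is not kernel code. */
	struct tmem_backend_ops {
		int (*store)(unsigned type, unsigned long offset, void *page);
		int (*load)(unsigned type, unsigned long offset, void *page);
	};

	static struct tmem_backend_ops *backend_ops;	/* NULL until a backend registers */

	static int example_frontswap_store(unsigned type, unsigned long offset, void *page)
	{
		if (!backend_ops)	/* no transcendent memory: fall back immediately */
			return -1;	/* caller writes the page to the real swap device */
		return backend_ops->store(type, offset, page);
	}
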
diff --git a/mm/Makefile b/mm/Makefile
index 50ec00ef2a0e..92753e2d82da 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,15 +5,19 @@
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o pagewalk.o pgtable-generic.o \ 8 vmalloc.o pagewalk.o pgtable-generic.o
9 process_vm_access.o 9
10ifdef CONFIG_CROSS_MEMORY_ATTACH
11mmu-$(CONFIG_MMU) += process_vm_access.o
12endif
10 13
11obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ 14obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
12 maccess.o page_alloc.o page-writeback.o \ 15 maccess.o page_alloc.o page-writeback.o \
13 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
14 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 17 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
15 page_isolation.o mm_init.o mmu_context.o percpu.o \ 18 mm_init.o mmu_context.o percpu.o slab_common.o \
16 $(mmu-y) 19 compaction.o $(mmu-y)
20
17obj-y += init-mm.o 21obj-y += init-mm.o
18 22
19ifdef CONFIG_NO_BOOTMEM 23ifdef CONFIG_NO_BOOTMEM
@@ -25,14 +29,14 @@ endif
25obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 29obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
26 30
27obj-$(CONFIG_BOUNCE) += bounce.o 31obj-$(CONFIG_BOUNCE) += bounce.o
28obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 32obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
33obj-$(CONFIG_FRONTSWAP) += frontswap.o
29obj-$(CONFIG_HAS_DMA) += dmapool.o 34obj-$(CONFIG_HAS_DMA) += dmapool.o
30obj-$(CONFIG_HUGETLBFS) += hugetlb.o 35obj-$(CONFIG_HUGETLBFS) += hugetlb.o
31obj-$(CONFIG_NUMA) += mempolicy.o 36obj-$(CONFIG_NUMA) += mempolicy.o
32obj-$(CONFIG_SPARSEMEM) += sparse.o 37obj-$(CONFIG_SPARSEMEM) += sparse.o
33obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o 38obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
34obj-$(CONFIG_SLOB) += slob.o 39obj-$(CONFIG_SLOB) += slob.o
35obj-$(CONFIG_COMPACTION) += compaction.o
36obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 40obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
37obj-$(CONFIG_KSM) += ksm.o 41obj-$(CONFIG_KSM) += ksm.o
38obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o 42obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
@@ -45,9 +49,11 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
45obj-$(CONFIG_MIGRATION) += migrate.o 49obj-$(CONFIG_MIGRATION) += migrate.o
46obj-$(CONFIG_QUICKLIST) += quicklist.o 50obj-$(CONFIG_QUICKLIST) += quicklist.o
47obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o 51obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
48obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 52obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
53obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
49obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 54obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
50obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 55obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
51obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 56obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
52obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 57obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
53obj-$(CONFIG_CLEANCACHE) += cleancache.o 58obj-$(CONFIG_CLEANCACHE) += cleancache.o
59obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index dd8e2aafb07e..b41823cc05e6 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -39,12 +39,6 @@ DEFINE_SPINLOCK(bdi_lock);
39LIST_HEAD(bdi_list); 39LIST_HEAD(bdi_list);
40LIST_HEAD(bdi_pending_list); 40LIST_HEAD(bdi_pending_list);
41 41
42static struct task_struct *sync_supers_tsk;
43static struct timer_list sync_supers_timer;
44
45static int bdi_sync_supers(void *);
46static void sync_supers_timer_fn(unsigned long);
47
48void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) 42void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
49{ 43{
50 if (wb1 < wb2) { 44 if (wb1 < wb2) {
@@ -250,12 +244,6 @@ static int __init default_bdi_init(void)
250{ 244{
251 int err; 245 int err;
252 246
253 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
254 BUG_ON(IS_ERR(sync_supers_tsk));
255
256 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
257 bdi_arm_supers_timer();
258
259 err = bdi_init(&default_backing_dev_info); 247 err = bdi_init(&default_backing_dev_info);
260 if (!err) 248 if (!err)
261 bdi_register(&default_backing_dev_info, NULL, "default"); 249 bdi_register(&default_backing_dev_info, NULL, "default");
@@ -270,46 +258,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
270 return wb_has_dirty_io(&bdi->wb); 258 return wb_has_dirty_io(&bdi->wb);
271} 259}
272 260
273/*
274 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
275 * or we risk deadlocking on ->s_umount. The longer term solution would be
276 * to implement sync_supers_bdi() or similar and simply do it from the
277 * bdi writeback thread individually.
278 */
279static int bdi_sync_supers(void *unused)
280{
281 set_user_nice(current, 0);
282
283 while (!kthread_should_stop()) {
284 set_current_state(TASK_INTERRUPTIBLE);
285 schedule();
286
287 /*
288 * Do this periodically, like kupdated() did before.
289 */
290 sync_supers();
291 }
292
293 return 0;
294}
295
296void bdi_arm_supers_timer(void)
297{
298 unsigned long next;
299
300 if (!dirty_writeback_interval)
301 return;
302
303 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
304 mod_timer(&sync_supers_timer, round_jiffies_up(next));
305}
306
307static void sync_supers_timer_fn(unsigned long unused)
308{
309 wake_up_process(sync_supers_tsk);
310 bdi_arm_supers_timer();
311}
312
313static void wakeup_timer_fn(unsigned long data) 261static void wakeup_timer_fn(unsigned long data)
314{ 262{
315 struct backing_dev_info *bdi = (struct backing_dev_info *)data; 263 struct backing_dev_info *bdi = (struct backing_dev_info *)data;
@@ -677,7 +625,7 @@ int bdi_init(struct backing_dev_info *bdi)
677 625
678 bdi->min_ratio = 0; 626 bdi->min_ratio = 0;
679 bdi->max_ratio = 100; 627 bdi->max_ratio = 100;
680 bdi->max_prop_frac = PROP_FRAC_BASE; 628 bdi->max_prop_frac = FPROP_FRAC_BASE;
681 spin_lock_init(&bdi->wb_lock); 629 spin_lock_init(&bdi->wb_lock);
682 INIT_LIST_HEAD(&bdi->bdi_list); 630 INIT_LIST_HEAD(&bdi->bdi_list);
683 INIT_LIST_HEAD(&bdi->work_list); 631 INIT_LIST_HEAD(&bdi->work_list);
@@ -700,7 +648,7 @@ int bdi_init(struct backing_dev_info *bdi)
700 bdi->write_bandwidth = INIT_BW; 648 bdi->write_bandwidth = INIT_BW;
701 bdi->avg_write_bandwidth = INIT_BW; 649 bdi->avg_write_bandwidth = INIT_BW;
702 650
703 err = prop_local_init_percpu(&bdi->completions); 651 err = fprop_local_init_percpu(&bdi->completions);
704 652
705 if (err) { 653 if (err) {
706err: 654err:
@@ -744,7 +692,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
744 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 692 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
745 percpu_counter_destroy(&bdi->bdi_stat[i]); 693 percpu_counter_destroy(&bdi->bdi_stat[i]);
746 694
747 prop_local_destroy_percpu(&bdi->completions); 695 fprop_local_destroy_percpu(&bdi->completions);
748} 696}
749EXPORT_SYMBOL(bdi_destroy); 697EXPORT_SYMBOL(bdi_destroy);
750 698
@@ -886,3 +834,23 @@ out:
886 return ret; 834 return ret;
887} 835}
888EXPORT_SYMBOL(wait_iff_congested); 836EXPORT_SYMBOL(wait_iff_congested);
837
838int pdflush_proc_obsolete(struct ctl_table *table, int write,
839 void __user *buffer, size_t *lenp, loff_t *ppos)
840{
841 char kbuf[] = "0\n";
842
843 if (*ppos) {
844 *lenp = 0;
845 return 0;
846 }
847
848 if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
849 return -EFAULT;
850 printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
851 table->procname);
852
853 *lenp = 2;
854 *ppos += *lenp;
855 return 2;
856}
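
[Editor's note] pdflush_proc_obsolete() above is meant to stand in as the handler for sysctl knobs that no longer do anything. A hedged sketch of how such a handler could be wired into a ctl_table entry follows; the entry name and mode are illustrative, and the real table lives in kernel/sysctl.c.

	/* Sketch only: a sysctl entry whose handler is pdflush_proc_obsolete().
	 * Field values are illustrative, not copied from kernel/sysctl.c. */
	static struct ctl_table vm_table_sketch[] = {
		{
			.procname	= "nr_pdflush_threads",
			.mode		= 0444,			/* reads always report "0" */
			.proc_handler	= pdflush_proc_obsolete,
		},
		{ }					/* table terminator */
	};
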
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 0131170c9d54..bcb63ac48cc5 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -77,16 +77,16 @@ unsigned long __init bootmem_bootmap_pages(unsigned long pages)
77 */ 77 */
78static void __init link_bootmem(bootmem_data_t *bdata) 78static void __init link_bootmem(bootmem_data_t *bdata)
79{ 79{
80 struct list_head *iter; 80 bootmem_data_t *ent;
81 81
82 list_for_each(iter, &bdata_list) { 82 list_for_each_entry(ent, &bdata_list, list) {
83 bootmem_data_t *ent; 83 if (bdata->node_min_pfn < ent->node_min_pfn) {
84 84 list_add_tail(&bdata->list, &ent->list);
85 ent = list_entry(iter, bootmem_data_t, list); 85 return;
86 if (bdata->node_min_pfn < ent->node_min_pfn) 86 }
87 break;
88 } 87 }
89 list_add_tail(&bdata->list, iter); 88
89 list_add_tail(&bdata->list, &bdata_list);
90} 90}
91 91
92/* 92/*
@@ -203,7 +203,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
203 } else { 203 } else {
204 unsigned long off = 0; 204 unsigned long off = 0;
205 205
206 while (vec && off < BITS_PER_LONG) { 206 vec >>= start & (BITS_PER_LONG - 1);
207 while (vec) {
207 if (vec & 1) { 208 if (vec & 1) {
208 page = pfn_to_page(start + off); 209 page = pfn_to_page(start + off);
209 __free_pages_bootmem(page, 0); 210 __free_pages_bootmem(page, 0);
@@ -467,7 +468,7 @@ static unsigned long __init align_off(struct bootmem_data *bdata,
467 return ALIGN(base + off, align) - base; 468 return ALIGN(base + off, align) - base;
468} 469}
469 470
470static void * __init alloc_bootmem_core(struct bootmem_data *bdata, 471static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
471 unsigned long size, unsigned long align, 472 unsigned long size, unsigned long align,
472 unsigned long goal, unsigned long limit) 473 unsigned long goal, unsigned long limit)
473{ 474{
@@ -588,14 +589,14 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
588 p_bdata = bootmem_arch_preferred_node(bdata, size, align, 589 p_bdata = bootmem_arch_preferred_node(bdata, size, align,
589 goal, limit); 590 goal, limit);
590 if (p_bdata) 591 if (p_bdata)
591 return alloc_bootmem_core(p_bdata, size, align, 592 return alloc_bootmem_bdata(p_bdata, size, align,
592 goal, limit); 593 goal, limit);
593 } 594 }
594#endif 595#endif
595 return NULL; 596 return NULL;
596} 597}
597 598
598static void * __init ___alloc_bootmem_nopanic(unsigned long size, 599static void * __init alloc_bootmem_core(unsigned long size,
599 unsigned long align, 600 unsigned long align,
600 unsigned long goal, 601 unsigned long goal,
601 unsigned long limit) 602 unsigned long limit)
@@ -603,7 +604,6 @@ static void * __init ___alloc_bootmem_nopanic(unsigned long size,
603 bootmem_data_t *bdata; 604 bootmem_data_t *bdata;
604 void *region; 605 void *region;
605 606
606restart:
607 region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit); 607 region = alloc_arch_preferred_bootmem(NULL, size, align, goal, limit);
608 if (region) 608 if (region)
609 return region; 609 return region;
@@ -614,11 +614,25 @@ restart:
614 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit)) 614 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
615 break; 615 break;
616 616
617 region = alloc_bootmem_core(bdata, size, align, goal, limit); 617 region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
618 if (region) 618 if (region)
619 return region; 619 return region;
620 } 620 }
621 621
622 return NULL;
623}
624
625static void * __init ___alloc_bootmem_nopanic(unsigned long size,
626 unsigned long align,
627 unsigned long goal,
628 unsigned long limit)
629{
630 void *ptr;
631
632restart:
633 ptr = alloc_bootmem_core(size, align, goal, limit);
634 if (ptr)
635 return ptr;
622 if (goal) { 636 if (goal) {
623 goal = 0; 637 goal = 0;
624 goto restart; 638 goto restart;
@@ -684,21 +698,60 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
684 return ___alloc_bootmem(size, align, goal, limit); 698 return ___alloc_bootmem(size, align, goal, limit);
685} 699}
686 700
687static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, 701void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
688 unsigned long size, unsigned long align, 702 unsigned long size, unsigned long align,
689 unsigned long goal, unsigned long limit) 703 unsigned long goal, unsigned long limit)
690{ 704{
691 void *ptr; 705 void *ptr;
692 706
693 ptr = alloc_arch_preferred_bootmem(bdata, size, align, goal, limit); 707again:
708 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size,
709 align, goal, limit);
694 if (ptr) 710 if (ptr)
695 return ptr; 711 return ptr;
696 712
697 ptr = alloc_bootmem_core(bdata, size, align, goal, limit); 713 /* do not panic in alloc_bootmem_bdata() */
714 if (limit && goal + size > limit)
715 limit = 0;
716
717 ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
698 if (ptr) 718 if (ptr)
699 return ptr; 719 return ptr;
700 720
701 return ___alloc_bootmem(size, align, goal, limit); 721 ptr = alloc_bootmem_core(size, align, goal, limit);
722 if (ptr)
723 return ptr;
724
725 if (goal) {
726 goal = 0;
727 goto again;
728 }
729
730 return NULL;
731}
732
733void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
734 unsigned long align, unsigned long goal)
735{
736 if (WARN_ON_ONCE(slab_is_available()))
737 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
738
739 return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
740}
741
742void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
743 unsigned long align, unsigned long goal,
744 unsigned long limit)
745{
746 void *ptr;
747
748 ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
749 if (ptr)
750 return ptr;
751
752 printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
753 panic("Out of memory");
754 return NULL;
702} 755}
703 756
704/** 757/**
@@ -722,7 +775,7 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
722 if (WARN_ON_ONCE(slab_is_available())) 775 if (WARN_ON_ONCE(slab_is_available()))
723 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 776 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
724 777
725 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); 778 return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
726} 779}
727 780
728void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, 781void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -743,7 +796,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
743 unsigned long new_goal; 796 unsigned long new_goal;
744 797
745 new_goal = MAX_DMA32_PFN << PAGE_SHIFT; 798 new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
746 ptr = alloc_bootmem_core(pgdat->bdata, size, align, 799 ptr = alloc_bootmem_bdata(pgdat->bdata, size, align,
747 new_goal, 0); 800 new_goal, 0);
748 if (ptr) 801 if (ptr)
749 return ptr; 802 return ptr;
@@ -754,47 +807,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
754 807
755} 808}
756 809
757#ifdef CONFIG_SPARSEMEM
758/**
759 * alloc_bootmem_section - allocate boot memory from a specific section
760 * @size: size of the request in bytes
761 * @section_nr: sparse map section to allocate from
762 *
763 * Return NULL on failure.
764 */
765void * __init alloc_bootmem_section(unsigned long size,
766 unsigned long section_nr)
767{
768 bootmem_data_t *bdata;
769 unsigned long pfn, goal;
770
771 pfn = section_nr_to_pfn(section_nr);
772 goal = pfn << PAGE_SHIFT;
773 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
774
775 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, 0);
776}
777#endif
778
779void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
780 unsigned long align, unsigned long goal)
781{
782 void *ptr;
783
784 if (WARN_ON_ONCE(slab_is_available()))
785 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
786
787 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
788 if (ptr)
789 return ptr;
790
791 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
792 if (ptr)
793 return ptr;
794
795 return __alloc_bootmem_nopanic(size, align, goal);
796}
797
798#ifndef ARCH_LOW_ADDRESS_LIMIT 810#ifndef ARCH_LOW_ADDRESS_LIMIT
799#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL 811#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
800#endif 812#endif
@@ -839,6 +851,6 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
839 if (WARN_ON_ONCE(slab_is_available())) 851 if (WARN_ON_ONCE(slab_is_available()))
840 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 852 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
841 853
842 return ___alloc_bootmem_node(pgdat->bdata, size, align, 854 return ___alloc_bootmem_node(pgdat, size, align,
843 goal, ARCH_LOW_ADDRESS_LIMIT); 855 goal, ARCH_LOW_ADDRESS_LIMIT);
844} 856}
diff --git a/mm/bounce.c b/mm/bounce.c
index d1be02ca1889..042086775561 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -24,23 +24,25 @@
24 24
25static mempool_t *page_pool, *isa_page_pool; 25static mempool_t *page_pool, *isa_page_pool;
26 26
27#ifdef CONFIG_HIGHMEM 27#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL)
28static __init int init_emergency_pool(void) 28static __init int init_emergency_pool(void)
29{ 29{
30#ifndef CONFIG_MEMORY_HOTPLUG 30#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
31 if (max_pfn <= max_low_pfn) 31 if (max_pfn <= max_low_pfn)
32 return 0; 32 return 0;
33#endif 33#endif
34 34
35 page_pool = mempool_create_page_pool(POOL_SIZE, 0); 35 page_pool = mempool_create_page_pool(POOL_SIZE, 0);
36 BUG_ON(!page_pool); 36 BUG_ON(!page_pool);
37 printk("highmem bounce pool size: %d pages\n", POOL_SIZE); 37 printk("bounce pool size: %d pages\n", POOL_SIZE);
38 38
39 return 0; 39 return 0;
40} 40}
41 41
42__initcall(init_emergency_pool); 42__initcall(init_emergency_pool);
43#endif
43 44
45#ifdef CONFIG_HIGHMEM
44/* 46/*
45 * highmem version, map in to vec 47 * highmem version, map in to vec
46 */ 48 */
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 5646c740f613..32e6f4136fa2 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL(__cleancache_init_shared_fs);
80static int cleancache_get_key(struct inode *inode, 80static int cleancache_get_key(struct inode *inode,
81 struct cleancache_filekey *key) 81 struct cleancache_filekey *key)
82{ 82{
83 int (*fhfn)(struct dentry *, __u32 *fh, int *, int); 83 int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *);
84 int len = 0, maxlen = CLEANCACHE_KEY_MAX; 84 int len = 0, maxlen = CLEANCACHE_KEY_MAX;
85 struct super_block *sb = inode->i_sb; 85 struct super_block *sb = inode->i_sb;
86 86
@@ -88,9 +88,7 @@ static int cleancache_get_key(struct inode *inode,
88 if (sb->s_export_op != NULL) { 88 if (sb->s_export_op != NULL) {
89 fhfn = sb->s_export_op->encode_fh; 89 fhfn = sb->s_export_op->encode_fh;
90 if (fhfn) { 90 if (fhfn) {
91 struct dentry d; 91 len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL);
92 d.d_inode = inode;
93 len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0);
94 if (len <= 0 || len == 255) 92 if (len <= 0 || len == 255)
95 return -1; 93 return -1;
96 if (maxlen > CLEANCACHE_KEY_MAX) 94 if (maxlen > CLEANCACHE_KEY_MAX)
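
[Editor's note] The cleancache change above follows the ->encode_fh() prototype switch from a dentry to an inode (plus an optional parent inode). For context, here is a hedged sketch of a filesystem-side implementation under the new prototype; the "myfs" naming and the two-word ino/generation handle are illustrative, not taken from a specific filesystem.

	/* Sketch of encode_fh() under the new inode-based prototype. */
	static int myfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len,
				  struct inode *parent)
	{
		/* parent is optional and unused in this minimal handle */
		if (*max_len < 2) {
			*max_len = 2;
			return 255;		/* convention: handle buffer too small */
		}
		fh[0] = inode->i_ino;
		fh[1] = inode->i_generation;
		*max_len = 2;
		return 1;			/* fileid type understood by the decoder */
	}
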
diff --git a/mm/compaction.c b/mm/compaction.c
index 74a8c825ff28..7fcd3a52e68d 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,30 +16,11 @@
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include "internal.h" 17#include "internal.h"
18 18
19#if defined CONFIG_COMPACTION || defined CONFIG_CMA
20
19#define CREATE_TRACE_POINTS 21#define CREATE_TRACE_POINTS
20#include <trace/events/compaction.h> 22#include <trace/events/compaction.h>
21 23
22/*
23 * compact_control is used to track pages being migrated and the free pages
24 * they are being migrated to during memory compaction. The free_pfn starts
25 * at the end of a zone and migrate_pfn begins at the start. Movable pages
26 * are moved to the end of a zone during a compaction run and the run
27 * completes when free_pfn <= migrate_pfn
28 */
29struct compact_control {
30 struct list_head freepages; /* List of free pages to migrate to */
31 struct list_head migratepages; /* List of pages being migrated */
32 unsigned long nr_freepages; /* Number of isolated free pages */
33 unsigned long nr_migratepages; /* Number of pages to migrate */
34 unsigned long free_pfn; /* isolate_freepages search base */
35 unsigned long migrate_pfn; /* isolate_migratepages search base */
36 bool sync; /* Synchronous migration */
37
38 int order; /* order a direct compactor needs */
39 int migratetype; /* MOVABLE, RECLAIMABLE etc */
40 struct zone *zone;
41};
42
43static unsigned long release_freepages(struct list_head *freelist) 24static unsigned long release_freepages(struct list_head *freelist)
44{ 25{
45 struct page *page, *next; 26 struct page *page, *next;
@@ -54,24 +35,76 @@ static unsigned long release_freepages(struct list_head *freelist)
54 return count; 35 return count;
55} 36}
56 37
57/* Isolate free pages onto a private freelist. Must hold zone->lock */ 38static void map_pages(struct list_head *list)
58static unsigned long isolate_freepages_block(struct zone *zone,
59 unsigned long blockpfn,
60 struct list_head *freelist)
61{ 39{
62 unsigned long zone_end_pfn, end_pfn; 40 struct page *page;
63 int nr_scanned = 0, total_isolated = 0;
64 struct page *cursor;
65 41
66 /* Get the last PFN we should scan for free pages at */ 42 list_for_each_entry(page, list, lru) {
67 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; 43 arch_alloc_page(page, 0);
68 end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn); 44 kernel_map_pages(page, 1, 1);
45 }
46}
69 47
70 /* Find the first usable PFN in the block to initialse page cursor */ 48static inline bool migrate_async_suitable(int migratetype)
71 for (; blockpfn < end_pfn; blockpfn++) { 49{
72 if (pfn_valid_within(blockpfn)) 50 return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE;
73 break; 51}
52
53/*
54 * Compaction requires the taking of some coarse locks that are potentially
55 * very heavily contended. Check if the process needs to be scheduled or
56 * if the lock is contended. For async compaction, back out in the event
57 * if contention is severe. For sync compaction, schedule.
58 *
59 * Returns true if the lock is held.
60 * Returns false if the lock is released and compaction should abort
61 */
62static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
63 bool locked, struct compact_control *cc)
64{
65 if (need_resched() || spin_is_contended(lock)) {
66 if (locked) {
67 spin_unlock_irqrestore(lock, *flags);
68 locked = false;
69 }
70
71 /* async aborts if taking too long or contended */
72 if (!cc->sync) {
73 if (cc->contended)
74 *cc->contended = true;
75 return false;
76 }
77
78 cond_resched();
79 if (fatal_signal_pending(current))
80 return false;
74 } 81 }
82
83 if (!locked)
84 spin_lock_irqsave(lock, *flags);
85 return true;
86}
87
88static inline bool compact_trylock_irqsave(spinlock_t *lock,
89 unsigned long *flags, struct compact_control *cc)
90{
91 return compact_checklock_irqsave(lock, flags, false, cc);
92}
93
94/*
95 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
96 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
97 * pages inside of the pageblock (even though it may still end up isolating
98 * some pages).
99 */
100static unsigned long isolate_freepages_block(unsigned long blockpfn,
101 unsigned long end_pfn,
102 struct list_head *freelist,
103 bool strict)
104{
105 int nr_scanned = 0, total_isolated = 0;
106 struct page *cursor;
107
75 cursor = pfn_to_page(blockpfn); 108 cursor = pfn_to_page(blockpfn);
76 109
77 /* Isolate free pages. This assumes the block is valid */ 110 /* Isolate free pages. This assumes the block is valid */
@@ -79,15 +112,23 @@ static unsigned long isolate_freepages_block(struct zone *zone,
79 int isolated, i; 112 int isolated, i;
80 struct page *page = cursor; 113 struct page *page = cursor;
81 114
82 if (!pfn_valid_within(blockpfn)) 115 if (!pfn_valid_within(blockpfn)) {
116 if (strict)
117 return 0;
83 continue; 118 continue;
119 }
84 nr_scanned++; 120 nr_scanned++;
85 121
86 if (!PageBuddy(page)) 122 if (!PageBuddy(page)) {
123 if (strict)
124 return 0;
87 continue; 125 continue;
126 }
88 127
89 /* Found a free page, break it into order-0 pages */ 128 /* Found a free page, break it into order-0 pages */
90 isolated = split_free_page(page); 129 isolated = split_free_page(page);
130 if (!isolated && strict)
131 return 0;
91 total_isolated += isolated; 132 total_isolated += isolated;
92 for (i = 0; i < isolated; i++) { 133 for (i = 0; i < isolated; i++) {
93 list_add(&page->lru, freelist); 134 list_add(&page->lru, freelist);
@@ -105,118 +146,75 @@ static unsigned long isolate_freepages_block(struct zone *zone,
105 return total_isolated; 146 return total_isolated;
106} 147}
107 148
108/* Returns true if the page is within a block suitable for migration to */ 149/**
109static bool suitable_migration_target(struct page *page) 150 * isolate_freepages_range() - isolate free pages.
110{ 151 * @start_pfn: The first PFN to start isolating.
111 152 * @end_pfn: The one-past-last PFN.
112 int migratetype = get_pageblock_migratetype(page); 153 *
113 154 * Non-free pages, invalid PFNs, or zone boundaries within the
 114 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ 155 * [start_pfn, end_pfn) range are considered errors, and cause the function to
115 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) 156 * undo its actions and return zero.
116 return false; 157 *
 117 158 * Otherwise, the function returns the one-past-the-last PFN of the isolated page
 118 /* If the page is a large free page, then allow migration */ 159 * (which may be greater than end_pfn if end fell in the middle of
119 if (PageBuddy(page) && page_order(page) >= pageblock_order) 160 * a free page).
120 return true;
121
122 /* If the block is MIGRATE_MOVABLE, allow migration */
123 if (migratetype == MIGRATE_MOVABLE)
124 return true;
125
126 /* Otherwise skip the block */
127 return false;
128}
129
130/*
131 * Based on information in the current compact_control, find blocks
132 * suitable for isolating free pages from and then isolate them.
133 */ 161 */
134static void isolate_freepages(struct zone *zone, 162unsigned long
135 struct compact_control *cc) 163isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
136{ 164{
137 struct page *page; 165 unsigned long isolated, pfn, block_end_pfn, flags;
138 unsigned long high_pfn, low_pfn, pfn; 166 struct zone *zone = NULL;
139 unsigned long flags; 167 LIST_HEAD(freelist);
140 int nr_freepages = cc->nr_freepages;
141 struct list_head *freelist = &cc->freepages;
142 168
143 /* 169 if (pfn_valid(start_pfn))
144 * Initialise the free scanner. The starting point is where we last 170 zone = page_zone(pfn_to_page(start_pfn));
145 * scanned from (or the end of the zone if starting). The low point
146 * is the end of the pageblock the migration scanner is using.
147 */
148 pfn = cc->free_pfn;
149 low_pfn = cc->migrate_pfn + pageblock_nr_pages;
150 171
151 /* 172 for (pfn = start_pfn; pfn < end_pfn; pfn += isolated) {
152 * Take care that if the migration scanner is at the end of the zone 173 if (!pfn_valid(pfn) || zone != page_zone(pfn_to_page(pfn)))
153 * that the free scanner does not accidentally move to the next zone 174 break;
154 * in the next isolation cycle.
155 */
156 high_pfn = min(low_pfn, pfn);
157
158 /*
159 * Isolate free pages until enough are available to migrate the
160 * pages on cc->migratepages. We stop searching if the migrate
161 * and free page scanners meet or enough free pages are isolated.
162 */
163 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
164 pfn -= pageblock_nr_pages) {
165 unsigned long isolated;
166
167 if (!pfn_valid(pfn))
168 continue;
169 175
170 /* 176 /*
171 * Check for overlapping nodes/zones. It's possible on some 177 * On subsequent iterations ALIGN() is actually not needed,
172 * configurations to have a setup like 178 * but we keep it that we not to complicate the code.
173 * node0 node1 node0
174 * i.e. it's possible that all pages within a zones range of
175 * pages do not belong to a single zone.
176 */ 179 */
177 page = pfn_to_page(pfn); 180 block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
178 if (page_zone(page) != zone) 181 block_end_pfn = min(block_end_pfn, end_pfn);
179 continue;
180 182
181 /* Check the block is suitable for migration */ 183 spin_lock_irqsave(&zone->lock, flags);
182 if (!suitable_migration_target(page)) 184 isolated = isolate_freepages_block(pfn, block_end_pfn,
183 continue; 185 &freelist, true);
186 spin_unlock_irqrestore(&zone->lock, flags);
184 187
185 /* 188 /*
186 * Found a block suitable for isolating free pages from. Now 189 * In strict mode, isolate_freepages_block() returns 0 if
187 * we disabled interrupts, double check things are ok and 190 * there are any holes in the block (ie. invalid PFNs or
188 * isolate the pages. This is to minimise the time IRQs 191 * non-free pages).
189 * are disabled
190 */ 192 */
191 isolated = 0; 193 if (!isolated)
192 spin_lock_irqsave(&zone->lock, flags); 194 break;
193 if (suitable_migration_target(page)) {
194 isolated = isolate_freepages_block(zone, pfn, freelist);
195 nr_freepages += isolated;
196 }
197 spin_unlock_irqrestore(&zone->lock, flags);
198 195
199 /* 196 /*
200 * Record the highest PFN we isolated pages from. When next 197 * If we managed to isolate pages, it is always (1 << n) *
201 * looking for free pages, the search will restart here as 198 * pageblock_nr_pages for some non-negative n. (Max order
202 * page migration may have returned some pages to the allocator 199 * page may span two pageblocks).
203 */ 200 */
204 if (isolated)
205 high_pfn = max(high_pfn, pfn);
206 } 201 }
207 202
208 /* split_free_page does not map the pages */ 203 /* split_free_page does not map the pages */
209 list_for_each_entry(page, freelist, lru) { 204 map_pages(&freelist);
210 arch_alloc_page(page, 0); 205
211 kernel_map_pages(page, 1, 1); 206 if (pfn < end_pfn) {
207 /* Loop terminated early, cleanup. */
208 release_freepages(&freelist);
209 return 0;
212 } 210 }
213 211
214 cc->free_pfn = high_pfn; 212 /* We don't use freelists for anything. */
215 cc->nr_freepages = nr_freepages; 213 return pfn;
216} 214}
217 215
218/* Update the number of anon and file isolated pages in the zone */ 216/* Update the number of anon and file isolated pages in the zone */
219static void acct_isolated(struct zone *zone, struct compact_control *cc) 217static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
220{ 218{
221 struct page *page; 219 struct page *page;
222 unsigned int count[2] = { 0, }; 220 unsigned int count[2] = { 0, };
@@ -224,8 +222,14 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
224 list_for_each_entry(page, &cc->migratepages, lru) 222 list_for_each_entry(page, &cc->migratepages, lru)
225 count[!!page_is_file_cache(page)]++; 223 count[!!page_is_file_cache(page)]++;
226 224
227 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); 225 /* If locked we can use the interrupt unsafe versions */
228 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); 226 if (locked) {
227 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
228 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
229 } else {
230 mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
231 mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
232 }
229} 233}
230 234
231/* Similar to reclaim, but different enough that they don't share logic */ 235/* Similar to reclaim, but different enough that they don't share logic */
@@ -243,37 +247,36 @@ static bool too_many_isolated(struct zone *zone)
243 return isolated > (inactive + active) / 2; 247 return isolated > (inactive + active) / 2;
244} 248}
245 249
246/* possible outcome of isolate_migratepages */ 250/**
247typedef enum { 251 * isolate_migratepages_range() - isolate all migrate-able pages in range.
248 ISOLATE_ABORT, /* Abort compaction now */ 252 * @zone: Zone pages are in.
249 ISOLATE_NONE, /* No pages isolated, continue scanning */ 253 * @cc: Compaction control structure.
250 ISOLATE_SUCCESS, /* Pages isolated, migrate */ 254 * @low_pfn: The first PFN of the range.
251} isolate_migrate_t; 255 * @end_pfn: The one-past-the-last PFN of the range.
252 256 *
253/* 257 * Isolate all pages that can be migrated from the range specified by
254 * Isolate all pages that can be migrated from the block pointed to by 258 * [low_pfn, end_pfn). Returns zero if there is a fatal signal
 255 * the migrate scanner within compact_control. 259 * pending, otherwise PFN of the first page that was not scanned
 260 * (which may be less than, equal to or greater than end_pfn).
261 *
262 * Assumes that cc->migratepages is empty and cc->nr_migratepages is
263 * zero.
264 *
265 * Apart from cc->migratepages and cc->nr_migratetypes this function
266 * does not modify any cc's fields, in particular it does not modify
267 * (or read for that matter) cc->migrate_pfn.
256 */ 268 */
257static isolate_migrate_t isolate_migratepages(struct zone *zone, 269unsigned long
258 struct compact_control *cc) 270isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
271 unsigned long low_pfn, unsigned long end_pfn)
259{ 272{
260 unsigned long low_pfn, end_pfn;
261 unsigned long last_pageblock_nr = 0, pageblock_nr; 273 unsigned long last_pageblock_nr = 0, pageblock_nr;
262 unsigned long nr_scanned = 0, nr_isolated = 0; 274 unsigned long nr_scanned = 0, nr_isolated = 0;
263 struct list_head *migratelist = &cc->migratepages; 275 struct list_head *migratelist = &cc->migratepages;
264 isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE; 276 isolate_mode_t mode = 0;
265 277 struct lruvec *lruvec;
266 /* Do not scan outside zone boundaries */ 278 unsigned long flags;
267 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); 279 bool locked;
268
269 /* Only scan within a pageblock boundary */
270 end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
271
272 /* Do not cross the free scanner or scan within a memory hole */
273 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
274 cc->migrate_pfn = end_pfn;
275 return ISOLATE_NONE;
276 }
277 280
278 /* 281 /*
279 * Ensure that there are not too many pages isolated from the LRU 282 * Ensure that there are not too many pages isolated from the LRU
@@ -283,35 +286,32 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
283 while (unlikely(too_many_isolated(zone))) { 286 while (unlikely(too_many_isolated(zone))) {
284 /* async migration should just abort */ 287 /* async migration should just abort */
285 if (!cc->sync) 288 if (!cc->sync)
286 return ISOLATE_ABORT; 289 return 0;
287 290
288 congestion_wait(BLK_RW_ASYNC, HZ/10); 291 congestion_wait(BLK_RW_ASYNC, HZ/10);
289 292
290 if (fatal_signal_pending(current)) 293 if (fatal_signal_pending(current))
291 return ISOLATE_ABORT; 294 return 0;
292 } 295 }
293 296
294 /* Time to isolate some pages for migration */ 297 /* Time to isolate some pages for migration */
295 cond_resched(); 298 cond_resched();
296 spin_lock_irq(&zone->lru_lock); 299 spin_lock_irqsave(&zone->lru_lock, flags);
300 locked = true;
297 for (; low_pfn < end_pfn; low_pfn++) { 301 for (; low_pfn < end_pfn; low_pfn++) {
298 struct page *page; 302 struct page *page;
299 bool locked = true;
300 303
301 /* give a chance to irqs before checking need_resched() */ 304 /* give a chance to irqs before checking need_resched() */
302 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { 305 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
303 spin_unlock_irq(&zone->lru_lock); 306 spin_unlock_irqrestore(&zone->lru_lock, flags);
304 locked = false; 307 locked = false;
305 } 308 }
306 if (need_resched() || spin_is_contended(&zone->lru_lock)) { 309
307 if (locked) 310 /* Check if it is ok to still hold the lock */
308 spin_unlock_irq(&zone->lru_lock); 311 locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
309 cond_resched(); 312 locked, cc);
310 spin_lock_irq(&zone->lru_lock); 313 if (!locked)
311 if (fatal_signal_pending(current)) 314 break;
312 break;
313 } else if (!locked)
314 spin_lock_irq(&zone->lru_lock);
315 315
316 /* 316 /*
317 * migrate_pfn does not necessarily start aligned to a 317 * migrate_pfn does not necessarily start aligned to a
@@ -351,7 +351,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
351 */ 351 */
352 pageblock_nr = low_pfn >> pageblock_order; 352 pageblock_nr = low_pfn >> pageblock_order;
353 if (!cc->sync && last_pageblock_nr != pageblock_nr && 353 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
354 get_pageblock_migratetype(page) != MIGRATE_MOVABLE) { 354 !migrate_async_suitable(get_pageblock_migratetype(page))) {
355 low_pfn += pageblock_nr_pages; 355 low_pfn += pageblock_nr_pages;
356 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; 356 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
357 last_pageblock_nr = pageblock_nr; 357 last_pageblock_nr = pageblock_nr;
@@ -374,14 +374,16 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
374 if (!cc->sync) 374 if (!cc->sync)
375 mode |= ISOLATE_ASYNC_MIGRATE; 375 mode |= ISOLATE_ASYNC_MIGRATE;
376 376
377 lruvec = mem_cgroup_page_lruvec(page, zone);
378
377 /* Try isolate the page */ 379 /* Try isolate the page */
378 if (__isolate_lru_page(page, mode, 0) != 0) 380 if (__isolate_lru_page(page, mode) != 0)
379 continue; 381 continue;
380 382
381 VM_BUG_ON(PageTransCompound(page)); 383 VM_BUG_ON(PageTransCompound(page));
382 384
383 /* Successfully isolated */ 385 /* Successfully isolated */
384 del_page_from_lru_list(zone, page, page_lru(page)); 386 del_page_from_lru_list(page, lruvec, page_lru(page));
385 list_add(&page->lru, migratelist); 387 list_add(&page->lru, migratelist);
386 cc->nr_migratepages++; 388 cc->nr_migratepages++;
387 nr_isolated++; 389 nr_isolated++;
@@ -393,14 +395,167 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
393 } 395 }
394 } 396 }
395 397
396 acct_isolated(zone, cc); 398 acct_isolated(zone, locked, cc);
397 399
398 spin_unlock_irq(&zone->lru_lock); 400 if (locked)
399 cc->migrate_pfn = low_pfn; 401 spin_unlock_irqrestore(&zone->lru_lock, flags);
400 402
401 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 403 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
402 404
403 return ISOLATE_SUCCESS; 405 return low_pfn;
406}
407
408#endif /* CONFIG_COMPACTION || CONFIG_CMA */
409#ifdef CONFIG_COMPACTION
410
411/* Returns true if the page is within a block suitable for migration to */
412static bool suitable_migration_target(struct page *page)
413{
414
415 int migratetype = get_pageblock_migratetype(page);
416
417 /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
418 if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
419 return false;
420
421 /* If the page is a large free page, then allow migration */
422 if (PageBuddy(page) && page_order(page) >= pageblock_order)
423 return true;
424
425 /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
426 if (migrate_async_suitable(migratetype))
427 return true;
428
429 /* Otherwise skip the block */
430 return false;
431}
432
433/*
434 * Returns the start pfn of the last page block in a zone. This is the starting
435 * point for full compaction of a zone. Compaction searches for free pages from
436 * the end of each zone, while isolate_freepages_block scans forward inside each
437 * page block.
438 */
439static unsigned long start_free_pfn(struct zone *zone)
440{
441 unsigned long free_pfn;
442 free_pfn = zone->zone_start_pfn + zone->spanned_pages;
443 free_pfn &= ~(pageblock_nr_pages-1);
444 return free_pfn;
445}
446
447/*
448 * Based on information in the current compact_control, find blocks
449 * suitable for isolating free pages from and then isolate them.
450 */
451static void isolate_freepages(struct zone *zone,
452 struct compact_control *cc)
453{
454 struct page *page;
455 unsigned long high_pfn, low_pfn, pfn, zone_end_pfn, end_pfn;
456 unsigned long flags;
457 int nr_freepages = cc->nr_freepages;
458 struct list_head *freelist = &cc->freepages;
459
460 /*
461 * Initialise the free scanner. The starting point is where we last
462 * scanned from (or the end of the zone if starting). The low point
463 * is the end of the pageblock the migration scanner is using.
464 */
465 pfn = cc->free_pfn;
466 low_pfn = cc->migrate_pfn + pageblock_nr_pages;
467
468 /*
469 * Take care that if the migration scanner is at the end of the zone
470 * that the free scanner does not accidentally move to the next zone
471 * in the next isolation cycle.
472 */
473 high_pfn = min(low_pfn, pfn);
474
475 zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
476
477 /*
478 * Isolate free pages until enough are available to migrate the
479 * pages on cc->migratepages. We stop searching if the migrate
480 * and free page scanners meet or enough free pages are isolated.
481 */
482 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
483 pfn -= pageblock_nr_pages) {
484 unsigned long isolated;
485
486 if (!pfn_valid(pfn))
487 continue;
488
489 /*
490 * Check for overlapping nodes/zones. It's possible on some
491 * configurations to have a setup like
492 * node0 node1 node0
493 * i.e. it's possible that all pages within a zones range of
494 * pages do not belong to a single zone.
495 */
496 page = pfn_to_page(pfn);
497 if (page_zone(page) != zone)
498 continue;
499
500 /* Check the block is suitable for migration */
501 if (!suitable_migration_target(page))
502 continue;
503
504 /*
505 * Found a block suitable for isolating free pages from. Now
506 * we disabled interrupts, double check things are ok and
507 * isolate the pages. This is to minimise the time IRQs
508 * are disabled
509 */
510 isolated = 0;
511
512 /*
 513 * The zone lock must be held to isolate freepages.
 514 * Unfortunately this is a very coarse lock and can be
515 * heavily contended if there are parallel allocations
516 * or parallel compactions. For async compaction do not
517 * spin on the lock
518 */
519 if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
520 break;
521 if (suitable_migration_target(page)) {
522 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
523 isolated = isolate_freepages_block(pfn, end_pfn,
524 freelist, false);
525 nr_freepages += isolated;
526 }
527 spin_unlock_irqrestore(&zone->lock, flags);
528
529 /*
530 * Record the highest PFN we isolated pages from. When next
531 * looking for free pages, the search will restart here as
532 * page migration may have returned some pages to the allocator
533 */
534 if (isolated) {
535 high_pfn = max(high_pfn, pfn);
536
537 /*
538 * If the free scanner has wrapped, update
539 * compact_cached_free_pfn to point to the highest
540 * pageblock with free pages. This reduces excessive
541 * scanning of full pageblocks near the end of the
542 * zone
543 */
544 if (cc->order > 0 && cc->wrapped)
545 zone->compact_cached_free_pfn = high_pfn;
546 }
547 }
548
549 /* split_free_page does not map the pages */
550 map_pages(freelist);
551
552 cc->free_pfn = high_pfn;
553 cc->nr_freepages = nr_freepages;
554
555 /* If compact_cached_free_pfn is reset then set it now */
556 if (cc->order > 0 && !cc->wrapped &&
557 zone->compact_cached_free_pfn == start_free_pfn(zone))
558 zone->compact_cached_free_pfn = high_pfn;
404} 559}
405 560
406/* 561/*
@@ -449,6 +604,44 @@ static void update_nr_listpages(struct compact_control *cc)
449 cc->nr_freepages = nr_freepages; 604 cc->nr_freepages = nr_freepages;
450} 605}
451 606
607/* possible outcome of isolate_migratepages */
608typedef enum {
609 ISOLATE_ABORT, /* Abort compaction now */
610 ISOLATE_NONE, /* No pages isolated, continue scanning */
611 ISOLATE_SUCCESS, /* Pages isolated, migrate */
612} isolate_migrate_t;
613
614/*
615 * Isolate all pages that can be migrated from the block pointed to by
616 * the migrate scanner within compact_control.
617 */
618static isolate_migrate_t isolate_migratepages(struct zone *zone,
619 struct compact_control *cc)
620{
621 unsigned long low_pfn, end_pfn;
622
623 /* Do not scan outside zone boundaries */
624 low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
625
626 /* Only scan within a pageblock boundary */
627 end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
628
629 /* Do not cross the free scanner or scan within a memory hole */
630 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
631 cc->migrate_pfn = end_pfn;
632 return ISOLATE_NONE;
633 }
634
635 /* Perform the isolation */
636 low_pfn = isolate_migratepages_range(zone, cc, low_pfn, end_pfn);
637 if (!low_pfn)
638 return ISOLATE_ABORT;
639
640 cc->migrate_pfn = low_pfn;
641
642 return ISOLATE_SUCCESS;
643}
644
452static int compact_finished(struct zone *zone, 645static int compact_finished(struct zone *zone,
453 struct compact_control *cc) 646 struct compact_control *cc)
454{ 647{
@@ -458,8 +651,26 @@ static int compact_finished(struct zone *zone,
458 if (fatal_signal_pending(current)) 651 if (fatal_signal_pending(current))
459 return COMPACT_PARTIAL; 652 return COMPACT_PARTIAL;
460 653
461 /* Compaction run completes if the migrate and free scanner meet */ 654 /*
462 if (cc->free_pfn <= cc->migrate_pfn) 655 * A full (order == -1) compaction run starts at the beginning and
656 * end of a zone; it completes when the migrate and free scanner meet.
657 * A partial (order > 0) compaction can start with the free scanner
658 * at a random point in the zone, and may have to restart.
659 */
660 if (cc->free_pfn <= cc->migrate_pfn) {
661 if (cc->order > 0 && !cc->wrapped) {
662 /* We started partway through; restart at the end. */
663 unsigned long free_pfn = start_free_pfn(zone);
664 zone->compact_cached_free_pfn = free_pfn;
665 cc->free_pfn = free_pfn;
666 cc->wrapped = 1;
667 return COMPACT_CONTINUE;
668 }
669 return COMPACT_COMPLETE;
670 }
671
672 /* We wrapped around and ended up where we started. */
673 if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
463 return COMPACT_COMPLETE; 674 return COMPACT_COMPLETE;
464 675
465 /* 676 /*
@@ -557,8 +768,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
557 768
558 /* Setup to move all movable pages to the end of the zone */ 769 /* Setup to move all movable pages to the end of the zone */
559 cc->migrate_pfn = zone->zone_start_pfn; 770 cc->migrate_pfn = zone->zone_start_pfn;
560 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; 771
561 cc->free_pfn &= ~(pageblock_nr_pages-1); 772 if (cc->order > 0) {
773 /* Incremental compaction. Start where the last one stopped. */
774 cc->free_pfn = zone->compact_cached_free_pfn;
775 cc->start_free_pfn = cc->free_pfn;
776 } else {
777 /* Order == -1 starts at the end of the zone. */
778 cc->free_pfn = start_free_pfn(zone);
779 }
562 780
563 migrate_prep_local(); 781 migrate_prep_local();
564 782
@@ -594,8 +812,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
594 if (err) { 812 if (err) {
595 putback_lru_pages(&cc->migratepages); 813 putback_lru_pages(&cc->migratepages);
596 cc->nr_migratepages = 0; 814 cc->nr_migratepages = 0;
815 if (err == -ENOMEM) {
816 ret = COMPACT_PARTIAL;
817 goto out;
818 }
597 } 819 }
598
599 } 820 }
600 821
601out: 822out:
@@ -608,7 +829,7 @@ out:
608 829
609static unsigned long compact_zone_order(struct zone *zone, 830static unsigned long compact_zone_order(struct zone *zone,
610 int order, gfp_t gfp_mask, 831 int order, gfp_t gfp_mask,
611 bool sync) 832 bool sync, bool *contended)
612{ 833{
613 struct compact_control cc = { 834 struct compact_control cc = {
614 .nr_freepages = 0, 835 .nr_freepages = 0,
@@ -617,6 +838,7 @@ static unsigned long compact_zone_order(struct zone *zone,
617 .migratetype = allocflags_to_migratetype(gfp_mask), 838 .migratetype = allocflags_to_migratetype(gfp_mask),
618 .zone = zone, 839 .zone = zone,
619 .sync = sync, 840 .sync = sync,
841 .contended = contended,
620 }; 842 };
621 INIT_LIST_HEAD(&cc.freepages); 843 INIT_LIST_HEAD(&cc.freepages);
622 INIT_LIST_HEAD(&cc.migratepages); 844 INIT_LIST_HEAD(&cc.migratepages);
@@ -638,7 +860,7 @@ int sysctl_extfrag_threshold = 500;
638 */ 860 */
639unsigned long try_to_compact_pages(struct zonelist *zonelist, 861unsigned long try_to_compact_pages(struct zonelist *zonelist,
640 int order, gfp_t gfp_mask, nodemask_t *nodemask, 862 int order, gfp_t gfp_mask, nodemask_t *nodemask,
641 bool sync) 863 bool sync, bool *contended)
642{ 864{
643 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 865 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
644 int may_enter_fs = gfp_mask & __GFP_FS; 866 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -662,7 +884,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
662 nodemask) { 884 nodemask) {
663 int status; 885 int status;
664 886
665 status = compact_zone_order(zone, order, gfp_mask, sync); 887 status = compact_zone_order(zone, order, gfp_mask, sync,
888 contended);
666 rc = max(status, rc); 889 rc = max(status, rc);
667 890
668 /* If a normal allocation would succeed, stop compacting */ 891 /* If a normal allocation would succeed, stop compacting */
@@ -698,7 +921,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
698 if (cc->order > 0) { 921 if (cc->order > 0) {
699 int ok = zone_watermark_ok(zone, cc->order, 922 int ok = zone_watermark_ok(zone, cc->order,
700 low_wmark_pages(zone), 0, 0); 923 low_wmark_pages(zone), 0, 0);
701 if (ok && cc->order > zone->compact_order_failed) 924 if (ok && cc->order >= zone->compact_order_failed)
702 zone->compact_order_failed = cc->order + 1; 925 zone->compact_order_failed = cc->order + 1;
703 /* Currently async compaction is never deferred. */ 926 /* Currently async compaction is never deferred. */
704 else if (!ok && cc->sync) 927 else if (!ok && cc->sync)
@@ -795,3 +1018,5 @@ void compaction_unregister_node(struct node *node)
795 return device_remove_file(&node->dev, &dev_attr_compact); 1018 return device_remove_file(&node->dev, &dev_attr_compact);
796} 1019}
797#endif /* CONFIG_SYSFS && CONFIG_NUMA */ 1020#endif /* CONFIG_SYSFS && CONFIG_NUMA */
1021
1022#endif /* CONFIG_COMPACTION */
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 469491e0af79..9b75a045dbf4 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -93,11 +93,6 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
93 spin_unlock(&file->f_lock); 93 spin_unlock(&file->f_lock);
94 break; 94 break;
95 case POSIX_FADV_WILLNEED: 95 case POSIX_FADV_WILLNEED:
96 if (!mapping->a_ops->readpage) {
97 ret = -EINVAL;
98 break;
99 }
100
101 /* First and last PARTIAL page! */ 96 /* First and last PARTIAL page! */
102 start_index = offset >> PAGE_CACHE_SHIFT; 97 start_index = offset >> PAGE_CACHE_SHIFT;
103 end_index = endbyte >> PAGE_CACHE_SHIFT; 98 end_index = endbyte >> PAGE_CACHE_SHIFT;
@@ -106,12 +101,13 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
106 nrpages = end_index - start_index + 1; 101 nrpages = end_index - start_index + 1;
107 if (!nrpages) 102 if (!nrpages)
108 nrpages = ~0UL; 103 nrpages = ~0UL;
109 104
110 ret = force_page_cache_readahead(mapping, file, 105 /*
 111 start_index, 106 * Ignore return value because fadvise() shall return
 112 nrpages); 107 * success even if the filesystem can't retrieve a hint.
113 if (ret > 0) 108 */
114 ret = 0; 109 force_page_cache_readahead(mapping, file, start_index,
110 nrpages);
115 break; 111 break;
116 case POSIX_FADV_NOREUSE: 112 case POSIX_FADV_NOREUSE:
117 break; 113 break;
diff --git a/mm/filemap.c b/mm/filemap.c
index 79c4b2b0b14e..384344575c37 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,7 +29,6 @@
29#include <linux/pagevec.h> 29#include <linux/pagevec.h>
30#include <linux/blkdev.h> 30#include <linux/blkdev.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/syscalls.h>
33#include <linux/cpuset.h> 32#include <linux/cpuset.h>
34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 33#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35#include <linux/memcontrol.h> 34#include <linux/memcontrol.h>
@@ -1413,12 +1412,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1413 retval = filemap_write_and_wait_range(mapping, pos, 1412 retval = filemap_write_and_wait_range(mapping, pos,
1414 pos + iov_length(iov, nr_segs) - 1); 1413 pos + iov_length(iov, nr_segs) - 1);
1415 if (!retval) { 1414 if (!retval) {
1416 struct blk_plug plug;
1417
1418 blk_start_plug(&plug);
1419 retval = mapping->a_ops->direct_IO(READ, iocb, 1415 retval = mapping->a_ops->direct_IO(READ, iocb,
1420 iov, pos, nr_segs); 1416 iov, pos, nr_segs);
1421 blk_finish_plug(&plug);
1422 } 1417 }
1423 if (retval > 0) { 1418 if (retval > 0) {
1424 *ppos = pos + retval; 1419 *ppos = pos + retval;
@@ -1478,44 +1473,6 @@ out:
1478} 1473}
1479EXPORT_SYMBOL(generic_file_aio_read); 1474EXPORT_SYMBOL(generic_file_aio_read);
1480 1475
1481static ssize_t
1482do_readahead(struct address_space *mapping, struct file *filp,
1483 pgoff_t index, unsigned long nr)
1484{
1485 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1486 return -EINVAL;
1487
1488 force_page_cache_readahead(mapping, filp, index, nr);
1489 return 0;
1490}
1491
1492SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
1493{
1494 ssize_t ret;
1495 struct file *file;
1496
1497 ret = -EBADF;
1498 file = fget(fd);
1499 if (file) {
1500 if (file->f_mode & FMODE_READ) {
1501 struct address_space *mapping = file->f_mapping;
1502 pgoff_t start = offset >> PAGE_CACHE_SHIFT;
1503 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1504 unsigned long len = end - start + 1;
1505 ret = do_readahead(mapping, file, start, len);
1506 }
1507 fput(file);
1508 }
1509 return ret;
1510}
1511#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
1512asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
1513{
1514 return SYSC_readahead((int) fd, offset, (size_t) count);
1515}
1516SYSCALL_ALIAS(sys_readahead, SyS_readahead);
1517#endif
1518
1519#ifdef CONFIG_MMU 1476#ifdef CONFIG_MMU
1520/** 1477/**
1521 * page_cache_read - adds requested page to the page cache if not already there 1478 * page_cache_read - adds requested page to the page cache if not already there
@@ -1751,8 +1708,35 @@ page_not_uptodate:
1751} 1708}
1752EXPORT_SYMBOL(filemap_fault); 1709EXPORT_SYMBOL(filemap_fault);
1753 1710
1711int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1712{
1713 struct page *page = vmf->page;
1714 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1715 int ret = VM_FAULT_LOCKED;
1716
1717 sb_start_pagefault(inode->i_sb);
1718 file_update_time(vma->vm_file);
1719 lock_page(page);
1720 if (page->mapping != inode->i_mapping) {
1721 unlock_page(page);
1722 ret = VM_FAULT_NOPAGE;
1723 goto out;
1724 }
1725 /*
1726 * We mark the page dirty already here so that when freeze is in
1727 * progress, we are guaranteed that writeback during freezing will
1728 * see the dirty page and writeprotect it again.
1729 */
1730 set_page_dirty(page);
1731out:
1732 sb_end_pagefault(inode->i_sb);
1733 return ret;
1734}
1735EXPORT_SYMBOL(filemap_page_mkwrite);
1736
1754const struct vm_operations_struct generic_file_vm_ops = { 1737const struct vm_operations_struct generic_file_vm_ops = {
1755 .fault = filemap_fault, 1738 .fault = filemap_fault,
1739 .page_mkwrite = filemap_page_mkwrite,
1756}; 1740};
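
Filesystems that use the generic fault path can pick up the same freeze protection by pointing their vm_operations at the new helper; a minimal sketch for a hypothetical filesystem (foofs is not part of this patch):

	static const struct vm_operations_struct foofs_file_vm_ops = {
		.fault		= filemap_fault,
		.page_mkwrite	= filemap_page_mkwrite,	/* handles sb_start_pagefault() and dirtying */
	};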
1757 1741
1758/* This is used for a general mmap of a disk file */ 1742/* This is used for a general mmap of a disk file */
@@ -1938,71 +1922,6 @@ struct page *read_cache_page(struct address_space *mapping,
1938} 1922}
1939EXPORT_SYMBOL(read_cache_page); 1923EXPORT_SYMBOL(read_cache_page);
1940 1924
1941/*
1942 * The logic we want is
1943 *
1944 * if suid or (sgid and xgrp)
1945 * remove privs
1946 */
1947int should_remove_suid(struct dentry *dentry)
1948{
1949 umode_t mode = dentry->d_inode->i_mode;
1950 int kill = 0;
1951
1952 /* suid always must be killed */
1953 if (unlikely(mode & S_ISUID))
1954 kill = ATTR_KILL_SUID;
1955
1956 /*
1957 * sgid without any exec bits is just a mandatory locking mark; leave
1958 * it alone. If some exec bits are set, it's a real sgid; kill it.
1959 */
1960 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1961 kill |= ATTR_KILL_SGID;
1962
1963 if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
1964 return kill;
1965
1966 return 0;
1967}
1968EXPORT_SYMBOL(should_remove_suid);
1969
1970static int __remove_suid(struct dentry *dentry, int kill)
1971{
1972 struct iattr newattrs;
1973
1974 newattrs.ia_valid = ATTR_FORCE | kill;
1975 return notify_change(dentry, &newattrs);
1976}
1977
1978int file_remove_suid(struct file *file)
1979{
1980 struct dentry *dentry = file->f_path.dentry;
1981 struct inode *inode = dentry->d_inode;
1982 int killsuid;
1983 int killpriv;
1984 int error = 0;
1985
1986 /* Fast path for nothing security related */
1987 if (IS_NOSEC(inode))
1988 return 0;
1989
1990 killsuid = should_remove_suid(dentry);
1991 killpriv = security_inode_need_killpriv(dentry);
1992
1993 if (killpriv < 0)
1994 return killpriv;
1995 if (killpriv)
1996 error = security_inode_killpriv(dentry);
1997 if (!error && killsuid)
1998 error = __remove_suid(dentry, killsuid);
1999 if (!error && (inode->i_sb->s_flags & MS_NOSEC))
2000 inode->i_flags |= S_NOSEC;
2001
2002 return error;
2003}
2004EXPORT_SYMBOL(file_remove_suid);
2005
2006static size_t __iovec_copy_from_user_inatomic(char *vaddr, 1925static size_t __iovec_copy_from_user_inatomic(char *vaddr,
2007 const struct iovec *iov, size_t base, size_t bytes) 1926 const struct iovec *iov, size_t base, size_t bytes)
2008{ 1927{
@@ -2511,8 +2430,6 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2511 count = ocount; 2430 count = ocount;
2512 pos = *ppos; 2431 pos = *ppos;
2513 2432
2514 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2515
2516 /* We can write back this queue in page reclaim */ 2433 /* We can write back this queue in page reclaim */
2517 current->backing_dev_info = mapping->backing_dev_info; 2434 current->backing_dev_info = mapping->backing_dev_info;
2518 written = 0; 2435 written = 0;
@@ -2528,7 +2445,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2528 if (err) 2445 if (err)
2529 goto out; 2446 goto out;
2530 2447
2531 file_update_time(file); 2448 err = file_update_time(file);
2449 if (err)
2450 goto out;
2532 2451
2533 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 2452 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2534 if (unlikely(file->f_flags & O_DIRECT)) { 2453 if (unlikely(file->f_flags & O_DIRECT)) {
@@ -2604,13 +2523,12 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2604{ 2523{
2605 struct file *file = iocb->ki_filp; 2524 struct file *file = iocb->ki_filp;
2606 struct inode *inode = file->f_mapping->host; 2525 struct inode *inode = file->f_mapping->host;
2607 struct blk_plug plug;
2608 ssize_t ret; 2526 ssize_t ret;
2609 2527
2610 BUG_ON(iocb->ki_pos != pos); 2528 BUG_ON(iocb->ki_pos != pos);
2611 2529
2530 sb_start_write(inode->i_sb);
2612 mutex_lock(&inode->i_mutex); 2531 mutex_lock(&inode->i_mutex);
2613 blk_start_plug(&plug);
2614 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 2532 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
2615 mutex_unlock(&inode->i_mutex); 2533 mutex_unlock(&inode->i_mutex);
2616 2534
@@ -2621,7 +2539,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2621 if (err < 0 && ret > 0) 2539 if (err < 0 && ret > 0)
2622 ret = err; 2540 ret = err;
2623 } 2541 }
2624 blk_finish_plug(&plug); 2542 sb_end_write(inode->i_sb);
2625 return ret; 2543 return ret;
2626} 2544}
2627EXPORT_SYMBOL(generic_file_aio_write); 2545EXPORT_SYMBOL(generic_file_aio_write);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index a4eb31132229..13e013b1270c 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -304,6 +304,7 @@ out:
304 304
305static const struct vm_operations_struct xip_file_vm_ops = { 305static const struct vm_operations_struct xip_file_vm_ops = {
306 .fault = xip_file_fault, 306 .fault = xip_file_fault,
307 .page_mkwrite = filemap_page_mkwrite,
307}; 308};
308 309
309int xip_file_mmap(struct file * file, struct vm_area_struct * vma) 310int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -401,6 +402,8 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
401 loff_t pos; 402 loff_t pos;
402 ssize_t ret; 403 ssize_t ret;
403 404
405 sb_start_write(inode->i_sb);
406
404 mutex_lock(&inode->i_mutex); 407 mutex_lock(&inode->i_mutex);
405 408
406 if (!access_ok(VERIFY_READ, buf, len)) { 409 if (!access_ok(VERIFY_READ, buf, len)) {
@@ -411,8 +414,6 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
411 pos = *ppos; 414 pos = *ppos;
412 count = len; 415 count = len;
413 416
414 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
415
416 /* We can write back this queue in page reclaim */ 417 /* We can write back this queue in page reclaim */
417 current->backing_dev_info = mapping->backing_dev_info; 418 current->backing_dev_info = mapping->backing_dev_info;
418 419
@@ -426,7 +427,9 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
426 if (ret) 427 if (ret)
427 goto out_backing; 428 goto out_backing;
428 429
429 file_update_time(filp); 430 ret = file_update_time(filp);
431 if (ret)
432 goto out_backing;
430 433
431 ret = __xip_file_write (filp, buf, count, pos, ppos); 434 ret = __xip_file_write (filp, buf, count, pos, ppos);
432 435
@@ -434,6 +437,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
434 current->backing_dev_info = NULL; 437 current->backing_dev_info = NULL;
435 out_up: 438 out_up:
436 mutex_unlock(&inode->i_mutex); 439 mutex_unlock(&inode->i_mutex);
440 sb_end_write(inode->i_sb);
437 return ret; 441 return ret;
438} 442}
439EXPORT_SYMBOL_GPL(xip_file_write); 443EXPORT_SYMBOL_GPL(xip_file_write);
diff --git a/mm/frontswap.c b/mm/frontswap.c
new file mode 100644
index 000000000000..6b3e71a2cd48
--- /dev/null
+++ b/mm/frontswap.c
@@ -0,0 +1,344 @@
1/*
2 * Frontswap frontend
3 *
4 * This code provides the generic "frontend" layer to call a matching
5 * "backend" driver implementation of frontswap. See
6 * Documentation/vm/frontswap.txt for more information.
7 *
8 * Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
9 * Author: Dan Magenheimer
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2.
12 */
13
14#include <linux/mman.h>
15#include <linux/swap.h>
16#include <linux/swapops.h>
17#include <linux/security.h>
18#include <linux/module.h>
19#include <linux/debugfs.h>
20#include <linux/frontswap.h>
21#include <linux/swapfile.h>
22
23/*
24 * frontswap_ops is set by frontswap_register_ops to contain the pointers
25 * to the frontswap "backend" implementation functions.
26 */
27static struct frontswap_ops frontswap_ops __read_mostly;
28
29/*
30 * This global enablement flag reduces overhead on systems where frontswap_ops
31 * has not been registered, so is preferred to the slower alternative: a
32 * function call that checks a non-global.
33 */
34bool frontswap_enabled __read_mostly;
35EXPORT_SYMBOL(frontswap_enabled);
36
37/*
38 * If enabled, frontswap_store will return failure even on success. As
39 * a result, the swap subsystem will always write the page to swap, in
40 * effect converting frontswap into a writethrough cache. In this mode,
41 * there is no direct reduction in swap writes, but a frontswap backend
42 * can unilaterally "reclaim" any pages in use with no data loss, thus
 43 * providing increased control over maximum memory usage due to frontswap.
44 */
45static bool frontswap_writethrough_enabled __read_mostly;
46
47#ifdef CONFIG_DEBUG_FS
48/*
49 * Counters available via /sys/kernel/debug/frontswap (if debugfs is
50 * properly configured). These are for information only so are not protected
51 * against increment races.
52 */
53static u64 frontswap_loads;
54static u64 frontswap_succ_stores;
55static u64 frontswap_failed_stores;
56static u64 frontswap_invalidates;
57
58static inline void inc_frontswap_loads(void) {
59 frontswap_loads++;
60}
61static inline void inc_frontswap_succ_stores(void) {
62 frontswap_succ_stores++;
63}
64static inline void inc_frontswap_failed_stores(void) {
65 frontswap_failed_stores++;
66}
67static inline void inc_frontswap_invalidates(void) {
68 frontswap_invalidates++;
69}
70#else
71static inline void inc_frontswap_loads(void) { }
72static inline void inc_frontswap_succ_stores(void) { }
73static inline void inc_frontswap_failed_stores(void) { }
74static inline void inc_frontswap_invalidates(void) { }
75#endif
76/*
 77 * Register operations for frontswap, returning the previous ops, thus allowing
78 * detection of multiple backends and possible nesting.
79 */
80struct frontswap_ops frontswap_register_ops(struct frontswap_ops *ops)
81{
82 struct frontswap_ops old = frontswap_ops;
83
84 frontswap_ops = *ops;
85 frontswap_enabled = true;
86 return old;
87}
88EXPORT_SYMBOL(frontswap_register_ops);
89
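
A backend registers by filling in a struct frontswap_ops and handing it to frontswap_register_ops(); a minimal sketch, assuming the hook signatures implied by the calls below (the my_* names are hypothetical, and the trivial store always fails):

	#include <linux/module.h>
	#include <linux/frontswap.h>

	static void my_init(unsigned type) { /* per-swap-device setup */ }
	static int my_store(unsigned type, pgoff_t offset, struct page *page)
	{
		return -1;	/* failure: swap falls back to the real device */
	}
	static int my_load(unsigned type, pgoff_t offset, struct page *page)
	{
		return -1;	/* nothing stored, nothing to load */
	}
	static void my_invalidate_page(unsigned type, pgoff_t offset) { }
	static void my_invalidate_area(unsigned type) { }

	static struct frontswap_ops my_ops = {
		.init		 = my_init,
		.store		 = my_store,
		.load		 = my_load,
		.invalidate_page = my_invalidate_page,
		.invalidate_area = my_invalidate_area,
	};

	static int __init my_backend_init(void)
	{
		struct frontswap_ops old = frontswap_register_ops(&my_ops);

		(void)old;	/* a real backend would chain to any previously registered ops */
		frontswap_writethrough(false);	/* keep the default behaviour (see above) */
		return 0;
	}
	module_init(my_backend_init);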
90/*
91 * Enable/disable frontswap writethrough (see above).
92 */
93void frontswap_writethrough(bool enable)
94{
95 frontswap_writethrough_enabled = enable;
96}
97EXPORT_SYMBOL(frontswap_writethrough);
98
99/*
100 * Called when a swap device is swapon'd.
101 */
102void __frontswap_init(unsigned type)
103{
104 struct swap_info_struct *sis = swap_info[type];
105
106 BUG_ON(sis == NULL);
107 if (sis->frontswap_map == NULL)
108 return;
109 frontswap_ops.init(type);
110}
111EXPORT_SYMBOL(__frontswap_init);
112
113static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
114{
115 frontswap_clear(sis, offset);
116 atomic_dec(&sis->frontswap_pages);
117}
118
119/*
120 * "Store" data from a page to frontswap and associate it with the page's
121 * swaptype and offset. Page must be locked and in the swap cache.
122 * If frontswap already contains a page with matching swaptype and
123 * offset, the frontswap implementation may either overwrite the data and
124 * return success or invalidate the page from frontswap and return failure.
125 */
126int __frontswap_store(struct page *page)
127{
128 int ret = -1, dup = 0;
129 swp_entry_t entry = { .val = page_private(page), };
130 int type = swp_type(entry);
131 struct swap_info_struct *sis = swap_info[type];
132 pgoff_t offset = swp_offset(entry);
133
134 BUG_ON(!PageLocked(page));
135 BUG_ON(sis == NULL);
136 if (frontswap_test(sis, offset))
137 dup = 1;
138 ret = frontswap_ops.store(type, offset, page);
139 if (ret == 0) {
140 frontswap_set(sis, offset);
141 inc_frontswap_succ_stores();
142 if (!dup)
143 atomic_inc(&sis->frontswap_pages);
144 } else {
 145 /*
 146 * A failed dup always results in an automatic invalidate of
 147 * the (older) page from frontswap.
 148 */
149 inc_frontswap_failed_stores();
150 if (dup)
151 __frontswap_clear(sis, offset);
152 }
153 if (frontswap_writethrough_enabled)
154 /* report failure so swap also writes to swap device */
155 ret = -1;
156 return ret;
157}
158EXPORT_SYMBOL(__frontswap_store);
159
160/*
161 * "Get" data from frontswap associated with swaptype and offset that were
162 * specified when the data was put to frontswap and use it to fill the
163 * specified page with data. Page must be locked and in the swap cache.
164 */
165int __frontswap_load(struct page *page)
166{
167 int ret = -1;
168 swp_entry_t entry = { .val = page_private(page), };
169 int type = swp_type(entry);
170 struct swap_info_struct *sis = swap_info[type];
171 pgoff_t offset = swp_offset(entry);
172
173 BUG_ON(!PageLocked(page));
174 BUG_ON(sis == NULL);
175 if (frontswap_test(sis, offset))
176 ret = frontswap_ops.load(type, offset, page);
177 if (ret == 0)
178 inc_frontswap_loads();
179 return ret;
180}
181EXPORT_SYMBOL(__frontswap_load);
182
183/*
184 * Invalidate any data from frontswap associated with the specified swaptype
185 * and offset so that a subsequent "get" will fail.
186 */
187void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
188{
189 struct swap_info_struct *sis = swap_info[type];
190
191 BUG_ON(sis == NULL);
192 if (frontswap_test(sis, offset)) {
193 frontswap_ops.invalidate_page(type, offset);
194 __frontswap_clear(sis, offset);
195 inc_frontswap_invalidates();
196 }
197}
198EXPORT_SYMBOL(__frontswap_invalidate_page);
199
200/*
201 * Invalidate all data from frontswap associated with all offsets for the
202 * specified swaptype.
203 */
204void __frontswap_invalidate_area(unsigned type)
205{
206 struct swap_info_struct *sis = swap_info[type];
207
208 BUG_ON(sis == NULL);
209 if (sis->frontswap_map == NULL)
210 return;
211 frontswap_ops.invalidate_area(type);
212 atomic_set(&sis->frontswap_pages, 0);
213 memset(sis->frontswap_map, 0, sis->max / sizeof(long));
214}
215EXPORT_SYMBOL(__frontswap_invalidate_area);
216
217static unsigned long __frontswap_curr_pages(void)
218{
219 int type;
220 unsigned long totalpages = 0;
221 struct swap_info_struct *si = NULL;
222
223 assert_spin_locked(&swap_lock);
224 for (type = swap_list.head; type >= 0; type = si->next) {
225 si = swap_info[type];
226 totalpages += atomic_read(&si->frontswap_pages);
227 }
228 return totalpages;
229}
230
231static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
232 int *swapid)
233{
234 int ret = -EINVAL;
235 struct swap_info_struct *si = NULL;
236 int si_frontswap_pages;
237 unsigned long total_pages_to_unuse = total;
238 unsigned long pages = 0, pages_to_unuse = 0;
239 int type;
240
241 assert_spin_locked(&swap_lock);
242 for (type = swap_list.head; type >= 0; type = si->next) {
243 si = swap_info[type];
244 si_frontswap_pages = atomic_read(&si->frontswap_pages);
245 if (total_pages_to_unuse < si_frontswap_pages) {
246 pages = pages_to_unuse = total_pages_to_unuse;
247 } else {
248 pages = si_frontswap_pages;
249 pages_to_unuse = 0; /* unuse all */
250 }
251 /* ensure there is enough RAM to fetch pages from frontswap */
252 if (security_vm_enough_memory_mm(current->mm, pages)) {
253 ret = -ENOMEM;
254 continue;
255 }
256 vm_unacct_memory(pages);
257 *unused = pages_to_unuse;
258 *swapid = type;
259 ret = 0;
260 break;
261 }
262
263 return ret;
264}
265
266static int __frontswap_shrink(unsigned long target_pages,
267 unsigned long *pages_to_unuse,
268 int *type)
269{
270 unsigned long total_pages = 0, total_pages_to_unuse;
271
272 assert_spin_locked(&swap_lock);
273
274 total_pages = __frontswap_curr_pages();
275 if (total_pages <= target_pages) {
276 /* Nothing to do */
277 *pages_to_unuse = 0;
278 return 0;
279 }
280 total_pages_to_unuse = total_pages - target_pages;
281 return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
282}
283
284/*
285 * Frontswap, like a true swap device, may unnecessarily retain pages
286 * under certain circumstances; "shrink" frontswap is essentially a
287 * "partial swapoff" and works by calling try_to_unuse to attempt to
 288 * unuse enough frontswap pages to -- subject to memory
289 * constraints -- reduce the number of pages in frontswap to the
290 * number given in the parameter target_pages.
291 */
292void frontswap_shrink(unsigned long target_pages)
293{
294 unsigned long pages_to_unuse = 0;
295 int type, ret;
296
297 /*
298 * we don't want to hold swap_lock while doing a very
299 * lengthy try_to_unuse, but swap_list may change
300 * so restart scan from swap_list.head each time
301 */
302 spin_lock(&swap_lock);
303 ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
304 spin_unlock(&swap_lock);
305 if (ret == 0 && pages_to_unuse)
306 try_to_unuse(type, true, pages_to_unuse);
307 return;
308}
309EXPORT_SYMBOL(frontswap_shrink);
310
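
As a usage example, a backend or management policy could halve its footprint under pressure (a hypothetical policy, not part of this patch):

	static void my_backend_relieve_pressure(void)
	{
		unsigned long cur = frontswap_curr_pages();

		if (cur)
			frontswap_shrink(cur / 2);	/* try to unuse roughly half of the stored pages */
	}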
311/*
312 * Count and return the number of frontswap pages across all
313 * swap devices. This is exported so that backend drivers can
314 * determine current usage without reading debugfs.
315 */
316unsigned long frontswap_curr_pages(void)
317{
318 unsigned long totalpages = 0;
319
320 spin_lock(&swap_lock);
321 totalpages = __frontswap_curr_pages();
322 spin_unlock(&swap_lock);
323
324 return totalpages;
325}
326EXPORT_SYMBOL(frontswap_curr_pages);
327
328static int __init init_frontswap(void)
329{
330#ifdef CONFIG_DEBUG_FS
331 struct dentry *root = debugfs_create_dir("frontswap", NULL);
332 if (root == NULL)
333 return -ENXIO;
334 debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
335 debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
336 debugfs_create_u64("failed_stores", S_IRUGO, root,
337 &frontswap_failed_stores);
338 debugfs_create_u64("invalidates", S_IRUGO,
339 root, &frontswap_invalidates);
340#endif
341 return 0;
342}
343
344module_init(init_frontswap);
diff --git a/mm/highmem.c b/mm/highmem.c
index 57d82c6250c3..d517cd16a6eb 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -94,6 +94,18 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
94 do { spin_unlock(&kmap_lock); (void)(flags); } while (0) 94 do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
95#endif 95#endif
96 96
97struct page *kmap_to_page(void *vaddr)
98{
99 unsigned long addr = (unsigned long)vaddr;
100
101 if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) {
102 int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
103 return pte_page(pkmap_page_table[i]);
104 }
105
106 return virt_to_page(addr);
107}
108
97static void flush_all_zero_pkmaps(void) 109static void flush_all_zero_pkmaps(void)
98{ 110{
99 int i; 111 int i;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f0e5306eeb55..57c4b9309015 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -636,16 +636,12 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
636 unsigned long haddr, pmd_t *pmd, 636 unsigned long haddr, pmd_t *pmd,
637 struct page *page) 637 struct page *page)
638{ 638{
639 int ret = 0;
640 pgtable_t pgtable; 639 pgtable_t pgtable;
641 640
642 VM_BUG_ON(!PageCompound(page)); 641 VM_BUG_ON(!PageCompound(page));
643 pgtable = pte_alloc_one(mm, haddr); 642 pgtable = pte_alloc_one(mm, haddr);
644 if (unlikely(!pgtable)) { 643 if (unlikely(!pgtable))
645 mem_cgroup_uncharge_page(page);
646 put_page(page);
647 return VM_FAULT_OOM; 644 return VM_FAULT_OOM;
648 }
649 645
650 clear_huge_page(page, haddr, HPAGE_PMD_NR); 646 clear_huge_page(page, haddr, HPAGE_PMD_NR);
651 __SetPageUptodate(page); 647 __SetPageUptodate(page);
@@ -675,7 +671,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
675 spin_unlock(&mm->page_table_lock); 671 spin_unlock(&mm->page_table_lock);
676 } 672 }
677 673
678 return ret; 674 return 0;
679} 675}
680 676
681static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp) 677static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
@@ -724,8 +720,14 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
724 put_page(page); 720 put_page(page);
725 goto out; 721 goto out;
726 } 722 }
723 if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd,
724 page))) {
725 mem_cgroup_uncharge_page(page);
726 put_page(page);
727 goto out;
728 }
727 729
728 return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); 730 return 0;
729 } 731 }
730out: 732out:
731 /* 733 /*
@@ -950,6 +952,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
950 count_vm_event(THP_FAULT_FALLBACK); 952 count_vm_event(THP_FAULT_FALLBACK);
951 ret = do_huge_pmd_wp_page_fallback(mm, vma, address, 953 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
952 pmd, orig_pmd, page, haddr); 954 pmd, orig_pmd, page, haddr);
955 if (ret & VM_FAULT_OOM)
956 split_huge_page(page);
953 put_page(page); 957 put_page(page);
954 goto out; 958 goto out;
955 } 959 }
@@ -957,6 +961,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
957 961
958 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { 962 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
959 put_page(new_page); 963 put_page(new_page);
964 split_huge_page(page);
960 put_page(page); 965 put_page(page);
961 ret |= VM_FAULT_OOM; 966 ret |= VM_FAULT_OOM;
962 goto out; 967 goto out;
@@ -968,8 +973,10 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
968 spin_lock(&mm->page_table_lock); 973 spin_lock(&mm->page_table_lock);
969 put_page(page); 974 put_page(page);
970 if (unlikely(!pmd_same(*pmd, orig_pmd))) { 975 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
976 spin_unlock(&mm->page_table_lock);
971 mem_cgroup_uncharge_page(new_page); 977 mem_cgroup_uncharge_page(new_page);
972 put_page(new_page); 978 put_page(new_page);
979 goto out;
973 } else { 980 } else {
974 pmd_t entry; 981 pmd_t entry;
975 VM_BUG_ON(!PageHead(page)); 982 VM_BUG_ON(!PageHead(page));
@@ -1224,10 +1231,13 @@ static void __split_huge_page_refcount(struct page *page)
1224{ 1231{
1225 int i; 1232 int i;
1226 struct zone *zone = page_zone(page); 1233 struct zone *zone = page_zone(page);
1234 struct lruvec *lruvec;
1227 int tail_count = 0; 1235 int tail_count = 0;
1228 1236
1229 /* prevent PageLRU to go away from under us, and freeze lru stats */ 1237 /* prevent PageLRU to go away from under us, and freeze lru stats */
1230 spin_lock_irq(&zone->lru_lock); 1238 spin_lock_irq(&zone->lru_lock);
1239 lruvec = mem_cgroup_page_lruvec(page, zone);
1240
1231 compound_lock(page); 1241 compound_lock(page);
1232 /* complete memcg works before add pages to LRU */ 1242 /* complete memcg works before add pages to LRU */
1233 mem_cgroup_split_huge_fixup(page); 1243 mem_cgroup_split_huge_fixup(page);
@@ -1302,13 +1312,12 @@ static void __split_huge_page_refcount(struct page *page)
1302 BUG_ON(!PageDirty(page_tail)); 1312 BUG_ON(!PageDirty(page_tail));
1303 BUG_ON(!PageSwapBacked(page_tail)); 1313 BUG_ON(!PageSwapBacked(page_tail));
1304 1314
1305 1315 lru_add_page_tail(page, page_tail, lruvec);
1306 lru_add_page_tail(zone, page, page_tail);
1307 } 1316 }
1308 atomic_sub(tail_count, &page->_count); 1317 atomic_sub(tail_count, &page->_count);
1309 BUG_ON(atomic_read(&page->_count) <= 0); 1318 BUG_ON(atomic_read(&page->_count) <= 0);
1310 1319
1311 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); 1320 __mod_zone_page_state(zone, NR_ANON_TRANSPARENT_HUGEPAGES, -1);
1312 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); 1321 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
1313 1322
1314 ClearPageCompound(page); 1323 ClearPageCompound(page);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b8ce6f450956..bc727122dd44 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,17 +24,20 @@
24 24
25#include <asm/page.h> 25#include <asm/page.h>
26#include <asm/pgtable.h> 26#include <asm/pgtable.h>
27#include <linux/io.h> 27#include <asm/tlb.h>
28 28
29#include <linux/io.h>
29#include <linux/hugetlb.h> 30#include <linux/hugetlb.h>
31#include <linux/hugetlb_cgroup.h>
30#include <linux/node.h> 32#include <linux/node.h>
33#include <linux/hugetlb_cgroup.h>
31#include "internal.h" 34#include "internal.h"
32 35
33const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 36const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
34static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 37static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
35unsigned long hugepages_treat_as_movable; 38unsigned long hugepages_treat_as_movable;
36 39
37static int max_hstate; 40int hugetlb_max_hstate __read_mostly;
38unsigned int default_hstate_idx; 41unsigned int default_hstate_idx;
39struct hstate hstates[HUGE_MAX_HSTATE]; 42struct hstate hstates[HUGE_MAX_HSTATE];
40 43
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate;
45static unsigned long __initdata default_hstate_max_huge_pages; 48static unsigned long __initdata default_hstate_max_huge_pages;
46static unsigned long __initdata default_hstate_size; 49static unsigned long __initdata default_hstate_size;
47 50
48#define for_each_hstate(h) \
49 for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
50
51/* 51/*
52 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 52 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
53 */ 53 */
54static DEFINE_SPINLOCK(hugetlb_lock); 54DEFINE_SPINLOCK(hugetlb_lock);
55 55
56static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) 56static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
57{ 57{
@@ -273,8 +273,8 @@ static long region_count(struct list_head *head, long f, long t)
273 273
274 /* Locate each segment we overlap with, and count that overlap. */ 274 /* Locate each segment we overlap with, and count that overlap. */
275 list_for_each_entry(rg, head, link) { 275 list_for_each_entry(rg, head, link) {
276 int seg_from; 276 long seg_from;
277 int seg_to; 277 long seg_to;
278 278
279 if (rg->to <= f) 279 if (rg->to <= f)
280 continue; 280 continue;
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src)
509static void enqueue_huge_page(struct hstate *h, struct page *page) 509static void enqueue_huge_page(struct hstate *h, struct page *page)
510{ 510{
511 int nid = page_to_nid(page); 511 int nid = page_to_nid(page);
512 list_add(&page->lru, &h->hugepage_freelists[nid]); 512 list_move(&page->lru, &h->hugepage_freelists[nid]);
513 h->free_huge_pages++; 513 h->free_huge_pages++;
514 h->free_huge_pages_node[nid]++; 514 h->free_huge_pages_node[nid]++;
515} 515}
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
521 if (list_empty(&h->hugepage_freelists[nid])) 521 if (list_empty(&h->hugepage_freelists[nid]))
522 return NULL; 522 return NULL;
523 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); 523 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
524 list_del(&page->lru); 524 list_move(&page->lru, &h->hugepage_activelist);
525 set_page_refcounted(page); 525 set_page_refcounted(page);
526 h->free_huge_pages--; 526 h->free_huge_pages--;
527 h->free_huge_pages_node[nid]--; 527 h->free_huge_pages_node[nid]--;
@@ -532,7 +532,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
532 struct vm_area_struct *vma, 532 struct vm_area_struct *vma,
533 unsigned long address, int avoid_reserve) 533 unsigned long address, int avoid_reserve)
534{ 534{
535 struct page *page; 535 struct page *page = NULL;
536 struct mempolicy *mpol; 536 struct mempolicy *mpol;
537 nodemask_t *nodemask; 537 nodemask_t *nodemask;
538 struct zonelist *zonelist; 538 struct zonelist *zonelist;
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
593 1 << PG_active | 1 << PG_reserved | 593 1 << PG_active | 1 << PG_reserved |
594 1 << PG_private | 1 << PG_writeback); 594 1 << PG_private | 1 << PG_writeback);
595 } 595 }
596 VM_BUG_ON(hugetlb_cgroup_from_page(page));
596 set_compound_page_dtor(page, NULL); 597 set_compound_page_dtor(page, NULL);
597 set_page_refcounted(page); 598 set_page_refcounted(page);
598 arch_release_hugepage(page); 599 arch_release_hugepage(page);
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page)
625 page->mapping = NULL; 626 page->mapping = NULL;
626 BUG_ON(page_count(page)); 627 BUG_ON(page_count(page));
627 BUG_ON(page_mapcount(page)); 628 BUG_ON(page_mapcount(page));
628 INIT_LIST_HEAD(&page->lru);
629 629
630 spin_lock(&hugetlb_lock); 630 spin_lock(&hugetlb_lock);
631 hugetlb_cgroup_uncharge_page(hstate_index(h),
632 pages_per_huge_page(h), page);
631 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { 633 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
634 /* remove the page from active list */
635 list_del(&page->lru);
632 update_and_free_page(h, page); 636 update_and_free_page(h, page);
633 h->surplus_huge_pages--; 637 h->surplus_huge_pages--;
634 h->surplus_huge_pages_node[nid]--; 638 h->surplus_huge_pages_node[nid]--;
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page)
641 645
642static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 646static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
643{ 647{
648 INIT_LIST_HEAD(&page->lru);
644 set_compound_page_dtor(page, free_huge_page); 649 set_compound_page_dtor(page, free_huge_page);
645 spin_lock(&hugetlb_lock); 650 spin_lock(&hugetlb_lock);
651 set_hugetlb_cgroup(page, NULL);
646 h->nr_huge_pages++; 652 h->nr_huge_pages++;
647 h->nr_huge_pages_node[nid]++; 653 h->nr_huge_pages_node[nid]++;
648 spin_unlock(&hugetlb_lock); 654 spin_unlock(&hugetlb_lock);
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
889 895
890 spin_lock(&hugetlb_lock); 896 spin_lock(&hugetlb_lock);
891 if (page) { 897 if (page) {
898 INIT_LIST_HEAD(&page->lru);
892 r_nid = page_to_nid(page); 899 r_nid = page_to_nid(page);
893 set_compound_page_dtor(page, free_huge_page); 900 set_compound_page_dtor(page, free_huge_page);
901 set_hugetlb_cgroup(page, NULL);
894 /* 902 /*
895 * We incremented the global counters already 903 * We incremented the global counters already
896 */ 904 */
@@ -993,7 +1001,6 @@ retry:
993 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1001 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
994 if ((--needed) < 0) 1002 if ((--needed) < 0)
995 break; 1003 break;
996 list_del(&page->lru);
997 /* 1004 /*
998 * This page is now managed by the hugetlb allocator and has 1005 * This page is now managed by the hugetlb allocator and has
999 * no users -- drop the buddy allocator's reference. 1006 * no users -- drop the buddy allocator's reference.
@@ -1008,7 +1015,6 @@ free:
1008 /* Free unnecessary surplus pages to the buddy allocator */ 1015 /* Free unnecessary surplus pages to the buddy allocator */
1009 if (!list_empty(&surplus_list)) { 1016 if (!list_empty(&surplus_list)) {
1010 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1017 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
1011 list_del(&page->lru);
1012 put_page(page); 1018 put_page(page);
1013 } 1019 }
1014 } 1020 }
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1112 struct hstate *h = hstate_vma(vma); 1118 struct hstate *h = hstate_vma(vma);
1113 struct page *page; 1119 struct page *page;
1114 long chg; 1120 long chg;
1121 int ret, idx;
1122 struct hugetlb_cgroup *h_cg;
1115 1123
1124 idx = hstate_index(h);
1116 /* 1125 /*
1117 * Processes that did not create the mapping will have no 1126 * Processes that did not create the mapping will have no
1118 * reserves and will not have accounted against subpool 1127 * reserves and will not have accounted against subpool
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1123 */ 1132 */
1124 chg = vma_needs_reservation(h, vma, addr); 1133 chg = vma_needs_reservation(h, vma, addr);
1125 if (chg < 0) 1134 if (chg < 0)
1126 return ERR_PTR(-VM_FAULT_OOM); 1135 return ERR_PTR(-ENOMEM);
1127 if (chg) 1136 if (chg)
1128 if (hugepage_subpool_get_pages(spool, chg)) 1137 if (hugepage_subpool_get_pages(spool, chg))
1129 return ERR_PTR(-VM_FAULT_SIGBUS); 1138 return ERR_PTR(-ENOSPC);
1130 1139
1140 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
1141 if (ret) {
1142 hugepage_subpool_put_pages(spool, chg);
1143 return ERR_PTR(-ENOSPC);
1144 }
1131 spin_lock(&hugetlb_lock); 1145 spin_lock(&hugetlb_lock);
1132 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); 1146 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
1133 spin_unlock(&hugetlb_lock); 1147 if (page) {
1134 1148 /* update page cgroup details */
1135 if (!page) { 1149 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
1150 h_cg, page);
1151 spin_unlock(&hugetlb_lock);
1152 } else {
1153 spin_unlock(&hugetlb_lock);
1136 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1154 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1137 if (!page) { 1155 if (!page) {
1156 hugetlb_cgroup_uncharge_cgroup(idx,
1157 pages_per_huge_page(h),
1158 h_cg);
1138 hugepage_subpool_put_pages(spool, chg); 1159 hugepage_subpool_put_pages(spool, chg);
1139 return ERR_PTR(-VM_FAULT_SIGBUS); 1160 return ERR_PTR(-ENOSPC);
1140 } 1161 }
1162 spin_lock(&hugetlb_lock);
1163 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
1164 h_cg, page);
1165 list_move(&page->lru, &h->hugepage_activelist);
1166 spin_unlock(&hugetlb_lock);
1141 } 1167 }
1142 1168
1143 set_page_private(page, (unsigned long)spool); 1169 set_page_private(page, (unsigned long)spool);
1144 1170
1145 vma_commit_reservation(h, vma, addr); 1171 vma_commit_reservation(h, vma, addr);
1146
1147 return page; 1172 return page;
1148} 1173}
1149 1174
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
1646 struct attribute_group *hstate_attr_group) 1671 struct attribute_group *hstate_attr_group)
1647{ 1672{
1648 int retval; 1673 int retval;
1649 int hi = h - hstates; 1674 int hi = hstate_index(h);
1650 1675
1651 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 1676 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
1652 if (!hstate_kobjs[hi]) 1677 if (!hstate_kobjs[hi])
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node)
1741 if (!nhs->hugepages_kobj) 1766 if (!nhs->hugepages_kobj)
1742 return; /* no hstate attributes */ 1767 return; /* no hstate attributes */
1743 1768
1744 for_each_hstate(h) 1769 for_each_hstate(h) {
1745 if (nhs->hstate_kobjs[h - hstates]) { 1770 int idx = hstate_index(h);
1746 kobject_put(nhs->hstate_kobjs[h - hstates]); 1771 if (nhs->hstate_kobjs[idx]) {
1747 nhs->hstate_kobjs[h - hstates] = NULL; 1772 kobject_put(nhs->hstate_kobjs[idx]);
1773 nhs->hstate_kobjs[idx] = NULL;
1748 } 1774 }
1775 }
1749 1776
1750 kobject_put(nhs->hugepages_kobj); 1777 kobject_put(nhs->hugepages_kobj);
1751 nhs->hugepages_kobj = NULL; 1778 nhs->hugepages_kobj = NULL;
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void)
1848 hugetlb_unregister_all_nodes(); 1875 hugetlb_unregister_all_nodes();
1849 1876
1850 for_each_hstate(h) { 1877 for_each_hstate(h) {
1851 kobject_put(hstate_kobjs[h - hstates]); 1878 kobject_put(hstate_kobjs[hstate_index(h)]);
1852 } 1879 }
1853 1880
1854 kobject_put(hugepages_kobj); 1881 kobject_put(hugepages_kobj);
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void)
1869 if (!size_to_hstate(default_hstate_size)) 1896 if (!size_to_hstate(default_hstate_size))
1870 hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 1897 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1871 } 1898 }
1872 default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; 1899 default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
1873 if (default_hstate_max_huge_pages) 1900 if (default_hstate_max_huge_pages)
1874 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 1901 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1875 1902
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order)
1897 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); 1924 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
1898 return; 1925 return;
1899 } 1926 }
1900 BUG_ON(max_hstate >= HUGE_MAX_HSTATE); 1927 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
1901 BUG_ON(order == 0); 1928 BUG_ON(order == 0);
1902 h = &hstates[max_hstate++]; 1929 h = &hstates[hugetlb_max_hstate++];
1903 h->order = order; 1930 h->order = order;
1904 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); 1931 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
1905 h->nr_huge_pages = 0; 1932 h->nr_huge_pages = 0;
1906 h->free_huge_pages = 0; 1933 h->free_huge_pages = 0;
1907 for (i = 0; i < MAX_NUMNODES; ++i) 1934 for (i = 0; i < MAX_NUMNODES; ++i)
1908 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1935 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1936 INIT_LIST_HEAD(&h->hugepage_activelist);
1909 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); 1937 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
1910 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); 1938 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
1911 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1939 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1912 huge_page_size(h)/1024); 1940 huge_page_size(h)/1024);
1941 /*
1942 * Add cgroup control files only if the huge page consists
1943 * of more than two normal pages. This is because we use
 1944 * page[2].lru.next for storing cgroup details.
1945 */
1946 if (order >= HUGETLB_CGROUP_MIN_ORDER)
1947 hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
1913 1948
1914 parsed_hstate = h; 1949 parsed_hstate = h;
1915} 1950}
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s)
1920 static unsigned long *last_mhp; 1955 static unsigned long *last_mhp;
1921 1956
1922 /* 1957 /*
1923 * !max_hstate means we haven't parsed a hugepagesz= parameter yet, 1958 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
1924 * so this hugepages= parameter goes to the "default hstate". 1959 * so this hugepages= parameter goes to the "default hstate".
1925 */ 1960 */
1926 if (!max_hstate) 1961 if (!hugetlb_max_hstate)
1927 mhp = &default_hstate_max_huge_pages; 1962 mhp = &default_hstate_max_huge_pages;
1928 else 1963 else
1929 mhp = &parsed_hstate->max_huge_pages; 1964 mhp = &parsed_hstate->max_huge_pages;
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s)
1942 * But we need to allocate >= MAX_ORDER hstates here early to still 1977 * But we need to allocate >= MAX_ORDER hstates here early to still
1943 * use the bootmem allocator. 1978 * use the bootmem allocator.
1944 */ 1979 */
1945 if (max_hstate && parsed_hstate->order >= MAX_ORDER) 1980 if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
1946 hugetlb_hstate_alloc_pages(parsed_hstate); 1981 hugetlb_hstate_alloc_pages(parsed_hstate);
1947 1982
1948 last_mhp = mhp; 1983 last_mhp = mhp;
@@ -2157,6 +2192,15 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2157 kref_get(&reservations->refs); 2192 kref_get(&reservations->refs);
2158} 2193}
2159 2194
2195static void resv_map_put(struct vm_area_struct *vma)
2196{
2197 struct resv_map *reservations = vma_resv_map(vma);
2198
2199 if (!reservations)
2200 return;
2201 kref_put(&reservations->refs, resv_map_release);
2202}
2203
2160static void hugetlb_vm_op_close(struct vm_area_struct *vma) 2204static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2161{ 2205{
2162 struct hstate *h = hstate_vma(vma); 2206 struct hstate *h = hstate_vma(vma);
@@ -2173,7 +2217,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
2173 reserve = (end - start) - 2217 reserve = (end - start) -
2174 region_count(&reservations->regions, start, end); 2218 region_count(&reservations->regions, start, end);
2175 2219
2176 kref_put(&reservations->refs, resv_map_release); 2220 resv_map_put(vma);
2177 2221
2178 if (reserve) { 2222 if (reserve) {
2179 hugetlb_acct_memory(h, -reserve); 2223 hugetlb_acct_memory(h, -reserve);
@@ -2213,6 +2257,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
2213 } 2257 }
2214 entry = pte_mkyoung(entry); 2258 entry = pte_mkyoung(entry);
2215 entry = pte_mkhuge(entry); 2259 entry = pte_mkhuge(entry);
2260 entry = arch_make_huge_pte(entry, vma, page, writable);
2216 2261
2217 return entry; 2262 return entry;
2218} 2263}
@@ -2298,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2298 return 0; 2343 return 0;
2299} 2344}
2300 2345
2301void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2346void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2302 unsigned long end, struct page *ref_page) 2347 unsigned long start, unsigned long end,
2348 struct page *ref_page)
2303{ 2349{
2350 int force_flush = 0;
2304 struct mm_struct *mm = vma->vm_mm; 2351 struct mm_struct *mm = vma->vm_mm;
2305 unsigned long address; 2352 unsigned long address;
2306 pte_t *ptep; 2353 pte_t *ptep;
2307 pte_t pte; 2354 pte_t pte;
2308 struct page *page; 2355 struct page *page;
2309 struct page *tmp;
2310 struct hstate *h = hstate_vma(vma); 2356 struct hstate *h = hstate_vma(vma);
2311 unsigned long sz = huge_page_size(h); 2357 unsigned long sz = huge_page_size(h);
2312 2358
2313 /*
2314 * A page gathering list, protected by per file i_mmap_mutex. The
2315 * lock is used to avoid list corruption from multiple unmapping
2316 * of the same page since we are using page->lru.
2317 */
2318 LIST_HEAD(page_list);
2319
2320 WARN_ON(!is_vm_hugetlb_page(vma)); 2359 WARN_ON(!is_vm_hugetlb_page(vma));
2321 BUG_ON(start & ~huge_page_mask(h)); 2360 BUG_ON(start & ~huge_page_mask(h));
2322 BUG_ON(end & ~huge_page_mask(h)); 2361 BUG_ON(end & ~huge_page_mask(h));
2323 2362
2363 tlb_start_vma(tlb, vma);
2324 mmu_notifier_invalidate_range_start(mm, start, end); 2364 mmu_notifier_invalidate_range_start(mm, start, end);
2365again:
2325 spin_lock(&mm->page_table_lock); 2366 spin_lock(&mm->page_table_lock);
2326 for (address = start; address < end; address += sz) { 2367 for (address = start; address < end; address += sz) {
2327 ptep = huge_pte_offset(mm, address); 2368 ptep = huge_pte_offset(mm, address);
@@ -2360,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2360 } 2401 }
2361 2402
2362 pte = huge_ptep_get_and_clear(mm, address, ptep); 2403 pte = huge_ptep_get_and_clear(mm, address, ptep);
2404 tlb_remove_tlb_entry(tlb, ptep, address);
2363 if (pte_dirty(pte)) 2405 if (pte_dirty(pte))
2364 set_page_dirty(page); 2406 set_page_dirty(page);
2365 list_add(&page->lru, &page_list);
2366 2407
2408 page_remove_rmap(page);
2409 force_flush = !__tlb_remove_page(tlb, page);
2410 if (force_flush)
2411 break;
2367 /* Bail out after unmapping reference page if supplied */ 2412 /* Bail out after unmapping reference page if supplied */
2368 if (ref_page) 2413 if (ref_page)
2369 break; 2414 break;
2370 } 2415 }
2371 flush_tlb_range(vma, start, end);
2372 spin_unlock(&mm->page_table_lock); 2416 spin_unlock(&mm->page_table_lock);
2373 mmu_notifier_invalidate_range_end(mm, start, end); 2417 /*
 2374 list_for_each_entry_safe(page, tmp, &page_list, lru) { 2418 * mmu_gather ran out of room to batch pages, so we break out of
 2375 page_remove_rmap(page); 2419 * the PTE lock to avoid doing the potentially expensive TLB invalidate
2376 list_del(&page->lru); 2420 * and page-free while holding it.
2377 put_page(page); 2421 */
2422 if (force_flush) {
2423 force_flush = 0;
2424 tlb_flush_mmu(tlb);
2425 if (address < end && !ref_page)
2426 goto again;
2378 } 2427 }
2428 mmu_notifier_invalidate_range_end(mm, start, end);
2429 tlb_end_vma(tlb, vma);
2430}
2431
2432void __unmap_hugepage_range_final(struct mmu_gather *tlb,
2433 struct vm_area_struct *vma, unsigned long start,
2434 unsigned long end, struct page *ref_page)
2435{
2436 __unmap_hugepage_range(tlb, vma, start, end, ref_page);
2437
2438 /*
2439 * Clear this flag so that x86's huge_pmd_share page_table_shareable
2440 * test will fail on a vma being torn down, and not grab a page table
2441 * on its way out. We're lucky that the flag has such an appropriate
2442 * name, and can in fact be safely cleared here. We could clear it
2443 * before the __unmap_hugepage_range above, but all that's necessary
2444 * is to clear it before releasing the i_mmap_mutex. This works
2445 * because in the context this is called, the VMA is about to be
2446 * destroyed and the i_mmap_mutex is held.
2447 */
2448 vma->vm_flags &= ~VM_MAYSHARE;
2379} 2449}
2380 2450
2381void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2451void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2382 unsigned long end, struct page *ref_page) 2452 unsigned long end, struct page *ref_page)
2383{ 2453{
2384 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); 2454 struct mm_struct *mm;
2385 __unmap_hugepage_range(vma, start, end, ref_page); 2455 struct mmu_gather tlb;
2386 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 2456
2457 mm = vma->vm_mm;
2458
2459 tlb_gather_mmu(&tlb, mm, 0);
2460 __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
2461 tlb_finish_mmu(&tlb, start, end);
2387} 2462}
2388 2463
2389/* 2464/*
@@ -2428,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2428 * from the time of fork. This would look like data corruption 2503 * from the time of fork. This would look like data corruption
2429 */ 2504 */
2430 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 2505 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
2431 __unmap_hugepage_range(iter_vma, 2506 unmap_hugepage_range(iter_vma, address,
2432 address, address + huge_page_size(h), 2507 address + huge_page_size(h), page);
2433 page);
2434 } 2508 }
2435 mutex_unlock(&mapping->i_mmap_mutex); 2509 mutex_unlock(&mapping->i_mmap_mutex);
2436 2510
@@ -2486,6 +2560,7 @@ retry_avoidcopy:
2486 new_page = alloc_huge_page(vma, address, outside_reserve); 2560 new_page = alloc_huge_page(vma, address, outside_reserve);
2487 2561
2488 if (IS_ERR(new_page)) { 2562 if (IS_ERR(new_page)) {
2563 long err = PTR_ERR(new_page);
2489 page_cache_release(old_page); 2564 page_cache_release(old_page);
2490 2565
2491 /* 2566 /*
@@ -2498,7 +2573,6 @@ retry_avoidcopy:
2498 if (outside_reserve) { 2573 if (outside_reserve) {
2499 BUG_ON(huge_pte_none(pte)); 2574 BUG_ON(huge_pte_none(pte));
2500 if (unmap_ref_private(mm, vma, old_page, address)) { 2575 if (unmap_ref_private(mm, vma, old_page, address)) {
2501 BUG_ON(page_count(old_page) != 1);
2502 BUG_ON(huge_pte_none(pte)); 2576 BUG_ON(huge_pte_none(pte));
2503 spin_lock(&mm->page_table_lock); 2577 spin_lock(&mm->page_table_lock);
2504 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2578 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
@@ -2515,7 +2589,10 @@ retry_avoidcopy:
2515 2589
2516 /* Caller expects lock to be held */ 2590 /* Caller expects lock to be held */
2517 spin_lock(&mm->page_table_lock); 2591 spin_lock(&mm->page_table_lock);
2518 return -PTR_ERR(new_page); 2592 if (err == -ENOMEM)
2593 return VM_FAULT_OOM;
2594 else
2595 return VM_FAULT_SIGBUS;
2519 } 2596 }
2520 2597
2521 /* 2598 /*
@@ -2633,7 +2710,11 @@ retry:
2633 goto out; 2710 goto out;
2634 page = alloc_huge_page(vma, address, 0); 2711 page = alloc_huge_page(vma, address, 0);
2635 if (IS_ERR(page)) { 2712 if (IS_ERR(page)) {
2636 ret = -PTR_ERR(page); 2713 ret = PTR_ERR(page);
2714 if (ret == -ENOMEM)
2715 ret = VM_FAULT_OOM;
2716 else
2717 ret = VM_FAULT_SIGBUS;
2637 goto out; 2718 goto out;
2638 } 2719 }
2639 clear_huge_page(page, address, pages_per_huge_page(h)); 2720 clear_huge_page(page, address, pages_per_huge_page(h));
@@ -2670,7 +2751,7 @@ retry:
2670 */ 2751 */
2671 if (unlikely(PageHWPoison(page))) { 2752 if (unlikely(PageHWPoison(page))) {
2672 ret = VM_FAULT_HWPOISON | 2753 ret = VM_FAULT_HWPOISON |
2673 VM_FAULT_SET_HINDEX(h - hstates); 2754 VM_FAULT_SET_HINDEX(hstate_index(h));
2674 goto backout_unlocked; 2755 goto backout_unlocked;
2675 } 2756 }
2676 } 2757 }
@@ -2743,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2743 return 0; 2824 return 0;
2744 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2825 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2745 return VM_FAULT_HWPOISON_LARGE | 2826 return VM_FAULT_HWPOISON_LARGE |
2746 VM_FAULT_SET_HINDEX(h - hstates); 2827 VM_FAULT_SET_HINDEX(hstate_index(h));
2747 } 2828 }
2748 2829
2749 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 2830 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2791,6 +2872,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2791 * so no worry about deadlock. 2872 * so no worry about deadlock.
2792 */ 2873 */
2793 page = pte_page(entry); 2874 page = pte_page(entry);
2875 get_page(page);
2794 if (page != pagecache_page) 2876 if (page != pagecache_page)
2795 lock_page(page); 2877 lock_page(page);
2796 2878
@@ -2822,6 +2904,7 @@ out_page_table_lock:
2822 } 2904 }
2823 if (page != pagecache_page) 2905 if (page != pagecache_page)
2824 unlock_page(page); 2906 unlock_page(page);
2907 put_page(page);
2825 2908
2826out_mutex: 2909out_mutex:
2827 mutex_unlock(&hugetlb_instantiation_mutex); 2910 mutex_unlock(&hugetlb_instantiation_mutex);
@@ -2948,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2948 } 3031 }
2949 } 3032 }
2950 spin_unlock(&mm->page_table_lock); 3033 spin_unlock(&mm->page_table_lock);
2951 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 3034 /*
2952 3035 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
3036 * may have cleared our pud entry and done put_page on the page table:
3037 * once we release i_mmap_mutex, another task can do the final put_page
3038 * and that page table be reused and filled with junk.
3039 */
2953 flush_tlb_range(vma, start, end); 3040 flush_tlb_range(vma, start, end);
3041 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2954} 3042}
2955 3043
2956int hugetlb_reserve_pages(struct inode *inode, 3044int hugetlb_reserve_pages(struct inode *inode,
@@ -2989,12 +3077,16 @@ int hugetlb_reserve_pages(struct inode *inode,
2989 set_vma_resv_flags(vma, HPAGE_RESV_OWNER); 3077 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
2990 } 3078 }
2991 3079
2992 if (chg < 0) 3080 if (chg < 0) {
2993 return chg; 3081 ret = chg;
3082 goto out_err;
3083 }
2994 3084
2995 /* There must be enough pages in the subpool for the mapping */ 3085 /* There must be enough pages in the subpool for the mapping */
2996 if (hugepage_subpool_get_pages(spool, chg)) 3086 if (hugepage_subpool_get_pages(spool, chg)) {
2997 return -ENOSPC; 3087 ret = -ENOSPC;
3088 goto out_err;
3089 }
2998 3090
2999 /* 3091 /*
3000 * Check enough hugepages are available for the reservation. 3092 * Check enough hugepages are available for the reservation.
@@ -3003,7 +3095,7 @@ int hugetlb_reserve_pages(struct inode *inode,
3003 ret = hugetlb_acct_memory(h, chg); 3095 ret = hugetlb_acct_memory(h, chg);
3004 if (ret < 0) { 3096 if (ret < 0) {
3005 hugepage_subpool_put_pages(spool, chg); 3097 hugepage_subpool_put_pages(spool, chg);
3006 return ret; 3098 goto out_err;
3007 } 3099 }
3008 3100
3009 /* 3101 /*
@@ -3020,6 +3112,10 @@ int hugetlb_reserve_pages(struct inode *inode,
3020 if (!vma || vma->vm_flags & VM_MAYSHARE) 3112 if (!vma || vma->vm_flags & VM_MAYSHARE)
3021 region_add(&inode->i_mapping->private_list, from, to); 3113 region_add(&inode->i_mapping->private_list, from, to);
3022 return 0; 3114 return 0;
3115out_err:
3116 if (vma)
3117 resv_map_put(vma);
3118 return ret;
3023} 3119}
3024 3120
3025void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 3121void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
new file mode 100644
index 000000000000..a3f358fb8a0c
--- /dev/null
+++ b/mm/hugetlb_cgroup.c
@@ -0,0 +1,418 @@
1/*
2 *
3 * Copyright IBM Corporation, 2012
4 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of version 2.1 of the GNU Lesser General Public License
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 *
14 */
15
16#include <linux/cgroup.h>
17#include <linux/slab.h>
18#include <linux/hugetlb.h>
19#include <linux/hugetlb_cgroup.h>
20
21struct hugetlb_cgroup {
22 struct cgroup_subsys_state css;
23 /*
24 * the counter to account for hugepages from hugetlb.
25 */
26 struct res_counter hugepage[HUGE_MAX_HSTATE];
27};
28
29#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
30#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
31#define MEMFILE_ATTR(val) ((val) & 0xffff)
32
33struct cgroup_subsys hugetlb_subsys __read_mostly;
34static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
35
36static inline
37struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
38{
39 return container_of(s, struct hugetlb_cgroup, css);
40}
41
42static inline
43struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup)
44{
45 return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup,
46 hugetlb_subsys_id));
47}
48
49static inline
50struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
51{
52 return hugetlb_cgroup_from_css(task_subsys_state(task,
53 hugetlb_subsys_id));
54}
55
56static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
57{
58 return (h_cg == root_h_cgroup);
59}
60
61static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg)
62{
63 if (!cg->parent)
64 return NULL;
65 return hugetlb_cgroup_from_cgroup(cg->parent);
66}
67
68static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
69{
70 int idx;
71 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg);
72
73 for (idx = 0; idx < hugetlb_max_hstate; idx++) {
74 if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0)
75 return true;
76 }
77 return false;
78}
79
80static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
81{
82 int idx;
83 struct cgroup *parent_cgroup;
84 struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup;
85
86 h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
87 if (!h_cgroup)
88 return ERR_PTR(-ENOMEM);
89
90 parent_cgroup = cgroup->parent;
91 if (parent_cgroup) {
92 parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup);
93 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
94 res_counter_init(&h_cgroup->hugepage[idx],
95 &parent_h_cgroup->hugepage[idx]);
96 } else {
97 root_h_cgroup = h_cgroup;
98 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
99 res_counter_init(&h_cgroup->hugepage[idx], NULL);
100 }
101 return &h_cgroup->css;
102}
103
104static void hugetlb_cgroup_destroy(struct cgroup *cgroup)
105{
106 struct hugetlb_cgroup *h_cgroup;
107
108 h_cgroup = hugetlb_cgroup_from_cgroup(cgroup);
109 kfree(h_cgroup);
110}
111
112
113/*
114 * Should be called with hugetlb_lock held.
115 * Since we are holding hugetlb_lock, pages cannot be moved off the
116 * active list or uncharged from the cgroup, so there is no need to take
117 * a page reference or test whether the page is active here. This function
118 * cannot fail.
119 */
120static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup,
121 struct page *page)
122{
123 int csize;
124 struct res_counter *counter;
125 struct res_counter *fail_res;
126 struct hugetlb_cgroup *page_hcg;
127 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
128 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup);
129
130 page_hcg = hugetlb_cgroup_from_page(page);
131 /*
132 * We can have pages on the active list that belong to no cgroup,
133 * i.e. hugepages with fewer than 3 pages. We can safely
134 * ignore those pages.
135 */
136 if (!page_hcg || page_hcg != h_cg)
137 goto out;
138
139 csize = PAGE_SIZE << compound_order(page);
140 if (!parent) {
141 parent = root_h_cgroup;
142 /* root has no limit */
143 res_counter_charge_nofail(&parent->hugepage[idx],
144 csize, &fail_res);
145 }
146 counter = &h_cg->hugepage[idx];
147 res_counter_uncharge_until(counter, counter->parent, csize);
148
149 set_hugetlb_cgroup(page, parent);
150out:
151 return;
152}
153
154/*
155 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
156 * the parent cgroup.
157 */
158static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
159{
160 struct hstate *h;
161 struct page *page;
162 int ret = 0, idx = 0;
163
164 do {
165 if (cgroup_task_count(cgroup) ||
166 !list_empty(&cgroup->children)) {
167 ret = -EBUSY;
168 goto out;
169 }
170 for_each_hstate(h) {
171 spin_lock(&hugetlb_lock);
172 list_for_each_entry(page, &h->hugepage_activelist, lru)
173 hugetlb_cgroup_move_parent(idx, cgroup, page);
174
175 spin_unlock(&hugetlb_lock);
176 idx++;
177 }
178 cond_resched();
179 } while (hugetlb_cgroup_have_usage(cgroup));
180out:
181 return ret;
182}
183
184int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
185 struct hugetlb_cgroup **ptr)
186{
187 int ret = 0;
188 struct res_counter *fail_res;
189 struct hugetlb_cgroup *h_cg = NULL;
190 unsigned long csize = nr_pages * PAGE_SIZE;
191
192 if (hugetlb_cgroup_disabled())
193 goto done;
194 /*
195 * We don't charge any cgroup if the compound page has fewer
196 * than 3 pages.
197 */
198 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
199 goto done;
200again:
201 rcu_read_lock();
202 h_cg = hugetlb_cgroup_from_task(current);
203 if (!css_tryget(&h_cg->css)) {
204 rcu_read_unlock();
205 goto again;
206 }
207 rcu_read_unlock();
208
209 ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res);
210 css_put(&h_cg->css);
211done:
212 *ptr = h_cg;
213 return ret;
214}
215
216/* Should be called with hugetlb_lock held */
217void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
218 struct hugetlb_cgroup *h_cg,
219 struct page *page)
220{
221 if (hugetlb_cgroup_disabled() || !h_cg)
222 return;
223
224 set_hugetlb_cgroup(page, h_cg);
225 return;
226}
227
228/*
229 * Should be called with hugetlb_lock held
230 */
231void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
232 struct page *page)
233{
234 struct hugetlb_cgroup *h_cg;
235 unsigned long csize = nr_pages * PAGE_SIZE;
236
237 if (hugetlb_cgroup_disabled())
238 return;
239 VM_BUG_ON(!spin_is_locked(&hugetlb_lock));
240 h_cg = hugetlb_cgroup_from_page(page);
241 if (unlikely(!h_cg))
242 return;
243 set_hugetlb_cgroup(page, NULL);
244 res_counter_uncharge(&h_cg->hugepage[idx], csize);
245 return;
246}
247
248void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
249 struct hugetlb_cgroup *h_cg)
250{
251 unsigned long csize = nr_pages * PAGE_SIZE;
252
253 if (hugetlb_cgroup_disabled() || !h_cg)
254 return;
255
256 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
257 return;
258
259 res_counter_uncharge(&h_cg->hugepage[idx], csize);
260 return;
261}
262
263static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft,
264 struct file *file, char __user *buf,
265 size_t nbytes, loff_t *ppos)
266{
267 u64 val;
268 char str[64];
269 int idx, name, len;
270 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
271
272 idx = MEMFILE_IDX(cft->private);
273 name = MEMFILE_ATTR(cft->private);
274
275 val = res_counter_read_u64(&h_cg->hugepage[idx], name);
276 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
277 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
278}
279
280static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft,
281 const char *buffer)
282{
283 int idx, name, ret;
284 unsigned long long val;
285 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
286
287 idx = MEMFILE_IDX(cft->private);
288 name = MEMFILE_ATTR(cft->private);
289
290 switch (name) {
291 case RES_LIMIT:
292 if (hugetlb_cgroup_is_root(h_cg)) {
293 /* Can't set limit on root */
294 ret = -EINVAL;
295 break;
296 }
297	/* This function does all the necessary parsing; reuse it */
298 ret = res_counter_memparse_write_strategy(buffer, &val);
299 if (ret)
300 break;
301 ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
302 break;
303 default:
304 ret = -EINVAL;
305 break;
306 }
307 return ret;
308}
309
310static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event)
311{
312 int idx, name, ret = 0;
313 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
314
315 idx = MEMFILE_IDX(event);
316 name = MEMFILE_ATTR(event);
317
318 switch (name) {
319 case RES_MAX_USAGE:
320 res_counter_reset_max(&h_cg->hugepage[idx]);
321 break;
322 case RES_FAILCNT:
323 res_counter_reset_failcnt(&h_cg->hugepage[idx]);
324 break;
325 default:
326 ret = -EINVAL;
327 break;
328 }
329 return ret;
330}
331
332static char *mem_fmt(char *buf, int size, unsigned long hsize)
333{
334 if (hsize >= (1UL << 30))
335 snprintf(buf, size, "%luGB", hsize >> 30);
336 else if (hsize >= (1UL << 20))
337 snprintf(buf, size, "%luMB", hsize >> 20);
338 else
339 snprintf(buf, size, "%luKB", hsize >> 10);
340 return buf;
341}
342
343int __init hugetlb_cgroup_file_init(int idx)
344{
345 char buf[32];
346 struct cftype *cft;
347 struct hstate *h = &hstates[idx];
348
349 /* format the size */
350 mem_fmt(buf, 32, huge_page_size(h));
351
352 /* Add the limit file */
353 cft = &h->cgroup_files[0];
354 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
355 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
356 cft->read = hugetlb_cgroup_read;
357 cft->write_string = hugetlb_cgroup_write;
358
359 /* Add the usage file */
360 cft = &h->cgroup_files[1];
361 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
362 cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
363 cft->read = hugetlb_cgroup_read;
364
365 /* Add the MAX usage file */
366 cft = &h->cgroup_files[2];
367 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
368 cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
369 cft->trigger = hugetlb_cgroup_reset;
370 cft->read = hugetlb_cgroup_read;
371
372	/* Add the failcnt file */
373 cft = &h->cgroup_files[3];
374 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
375 cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
376 cft->trigger = hugetlb_cgroup_reset;
377 cft->read = hugetlb_cgroup_read;
378
379 /* NULL terminate the last cft */
380 cft = &h->cgroup_files[4];
381 memset(cft, 0, sizeof(*cft));
382
383 WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
384
385 return 0;
386}
387
388/*
389 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
390 * when we migrate hugepages
391 */
392void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
393{
394 struct hugetlb_cgroup *h_cg;
395 struct hstate *h = page_hstate(oldhpage);
396
397 if (hugetlb_cgroup_disabled())
398 return;
399
400 VM_BUG_ON(!PageHuge(oldhpage));
401 spin_lock(&hugetlb_lock);
402 h_cg = hugetlb_cgroup_from_page(oldhpage);
403 set_hugetlb_cgroup(oldhpage, NULL);
404
405 /* move the h_cg details to new cgroup */
406 set_hugetlb_cgroup(newhpage, h_cg);
407 list_move(&newhpage->lru, &h->hugepage_activelist);
408 spin_unlock(&hugetlb_lock);
409 return;
410}
411
412struct cgroup_subsys hugetlb_subsys = {
413 .name = "hugetlb",
414 .create = hugetlb_cgroup_create,
415 .pre_destroy = hugetlb_cgroup_pre_destroy,
416 .destroy = hugetlb_cgroup_destroy,
417 .subsys_id = hugetlb_subsys_id,
418};
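In the new controller above, each per-hstate control file packs the hstate index and the resource attribute into cft->private with MEMFILE_PRIVATE(), and mem_fmt() turns huge_page_size() into the size prefix, so the files end up with names such as hugetlb.2MB.limit_in_bytes. A small standalone sketch of the packing and unpacking follows; the RES_LIMIT value used here is illustrative, the real constants come from the res_counter code.

#include <assert.h>
#include <stdio.h>

#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_IDX(val)        (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)       ((val) & 0xffff)

#define RES_LIMIT 1     /* illustrative; the real enum lives in res_counter.h */

int main(void)
{
        int private = MEMFILE_PRIVATE(1, RES_LIMIT);    /* hstate 1, limit attribute */

        assert(MEMFILE_IDX(private) == 1);              /* recover the hstate index */
        assert(MEMFILE_ATTR(private) == RES_LIMIT);     /* recover the attribute */

        /* hugetlb_cgroup_file_init() builds names like this for a 2 MB hstate */
        printf("hugetlb.%s.limit_in_bytes -> private=0x%x\n", "2MB", private);
        return 0;
}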
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index cc448bb983ba..3a61efc518d5 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -123,7 +123,7 @@ static int pfn_inject_init(void)
123 if (!dentry) 123 if (!dentry)
124 goto fail; 124 goto fail;
125 125
126#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 126#ifdef CONFIG_MEMCG_SWAP
127 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, 127 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
128 hwpoison_dir, &hwpoison_filter_memcg); 128 hwpoison_dir, &hwpoison_filter_memcg);
129 if (!dentry) 129 if (!dentry)
diff --git a/mm/internal.h b/mm/internal.h
index 2189af491783..b8c91b342e24 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -100,6 +100,46 @@ extern void prep_compound_page(struct page *page, unsigned long order);
100extern bool is_free_buddy_page(struct page *page); 100extern bool is_free_buddy_page(struct page *page);
101#endif 101#endif
102 102
103#if defined CONFIG_COMPACTION || defined CONFIG_CMA
104
105/*
106 * in mm/compaction.c
107 */
108/*
109 * compact_control is used to track pages being migrated and the free pages
110 * they are being migrated to during memory compaction. The free_pfn starts
111 * at the end of a zone and migrate_pfn begins at the start. Movable pages
112 * are moved to the end of a zone during a compaction run and the run
113 * completes when free_pfn <= migrate_pfn
114 */
115struct compact_control {
116 struct list_head freepages; /* List of free pages to migrate to */
117 struct list_head migratepages; /* List of pages being migrated */
118 unsigned long nr_freepages; /* Number of isolated free pages */
119 unsigned long nr_migratepages; /* Number of pages to migrate */
120 unsigned long free_pfn; /* isolate_freepages search base */
121 unsigned long start_free_pfn; /* where we started the search */
122 unsigned long migrate_pfn; /* isolate_migratepages search base */
123 bool sync; /* Synchronous migration */
124 bool wrapped; /* Order > 0 compactions are
125 incremental, once free_pfn
126 and migrate_pfn meet, we restart
127 from the top of the zone;
128 remember we wrapped around. */
129
130 int order; /* order a direct compactor needs */
131 int migratetype; /* MOVABLE, RECLAIMABLE etc */
132 struct zone *zone;
133 bool *contended; /* True if a lock was contended */
134};
135
136unsigned long
137isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn);
138unsigned long
139isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
140 unsigned long low_pfn, unsigned long end_pfn);
141
142#endif
103 143
104/* 144/*
105 * function for dealing with page's order in buddy system. 145 * function for dealing with page's order in buddy system.
@@ -131,7 +171,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
131 * to determine if it's being mapped into a LOCKED vma. 171 * to determine if it's being mapped into a LOCKED vma.
132 * If so, mark page as mlocked. 172 * If so, mark page as mlocked.
133 */ 173 */
134static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) 174static inline int mlocked_vma_newpage(struct vm_area_struct *vma,
175 struct page *page)
135{ 176{
136 VM_BUG_ON(PageLRU(page)); 177 VM_BUG_ON(PageLRU(page));
137 178
@@ -189,7 +230,7 @@ extern unsigned long vma_address(struct page *page,
189 struct vm_area_struct *vma); 230 struct vm_area_struct *vma);
190#endif 231#endif
191#else /* !CONFIG_MMU */ 232#else /* !CONFIG_MMU */
192static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 233static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p)
193{ 234{
194 return 0; 235 return 0;
195} 236}
@@ -309,3 +350,9 @@ extern u64 hwpoison_filter_flags_mask;
309extern u64 hwpoison_filter_flags_value; 350extern u64 hwpoison_filter_flags_value;
310extern u64 hwpoison_filter_memcg; 351extern u64 hwpoison_filter_memcg;
311extern u32 hwpoison_filter_enable; 352extern u32 hwpoison_filter_enable;
353
354extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
355 unsigned long, unsigned long,
356 unsigned long, unsigned long);
357
358extern void set_pageblock_order(void);
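The compact_control comment above describes two scanners that converge: migrate_pfn walks up from the start of the zone collecting movable pages while free_pfn walks down from the end collecting free target pages, and a run completes once they meet. The toy userspace loop below only illustrates that convergence; the pfn bounds and the movable/free predicates are invented and bear no relation to the real isolate_*_range() scanners.

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins: pretend every 5th pfn is movable and every 3rd is free. */
static bool pfn_movable(unsigned long pfn) { return pfn % 5 == 0; }
static bool pfn_free(unsigned long pfn)    { return pfn % 3 == 0; }

int main(void)
{
        unsigned long migrate_pfn = 0;    /* scans upward from the zone start */
        unsigned long free_pfn = 1024;    /* scans downward from the zone end */
        unsigned long migrated = 0;

        while (migrate_pfn < free_pfn) {
                /* isolate_migratepages_range() analogue: next movable page */
                while (migrate_pfn < free_pfn && !pfn_movable(migrate_pfn))
                        migrate_pfn++;
                /* isolate_freepages_range() analogue: next free target page */
                while (free_pfn > migrate_pfn && !pfn_free(free_pfn))
                        free_pfn--;
                if (migrate_pfn >= free_pfn)
                        break;          /* scanners met: compaction run done */
                migrated++;             /* "migrate" one page to the free target */
                migrate_pfn++;
                free_pfn--;
        }
        printf("scanners met at pfn %lu after %lu moves\n", migrate_pfn, migrated);
        return 0;
}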
diff --git a/mm/madvise.c b/mm/madvise.c
index 1ccbba5b6674..14d260fa0d17 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,8 +11,11 @@
11#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
12#include <linux/page-isolation.h> 12#include <linux/page-isolation.h>
13#include <linux/hugetlb.h> 13#include <linux/hugetlb.h>
14#include <linux/falloc.h>
14#include <linux/sched.h> 15#include <linux/sched.h>
15#include <linux/ksm.h> 16#include <linux/ksm.h>
17#include <linux/fs.h>
18#include <linux/file.h>
16 19
17/* 20/*
18 * Any behaviour which results in changes to the vma->vm_flags needs to 21 * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -200,33 +203,39 @@ static long madvise_remove(struct vm_area_struct *vma,
200 struct vm_area_struct **prev, 203 struct vm_area_struct **prev,
201 unsigned long start, unsigned long end) 204 unsigned long start, unsigned long end)
202{ 205{
203 struct address_space *mapping; 206 loff_t offset;
204 loff_t offset, endoff;
205 int error; 207 int error;
208 struct file *f;
206 209
207 *prev = NULL; /* tell sys_madvise we drop mmap_sem */ 210 *prev = NULL; /* tell sys_madvise we drop mmap_sem */
208 211
209 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB)) 212 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
210 return -EINVAL; 213 return -EINVAL;
211 214
212 if (!vma->vm_file || !vma->vm_file->f_mapping 215 f = vma->vm_file;
213 || !vma->vm_file->f_mapping->host) { 216
217 if (!f || !f->f_mapping || !f->f_mapping->host) {
214 return -EINVAL; 218 return -EINVAL;
215 } 219 }
216 220
217 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE)) 221 if ((vma->vm_flags & (VM_SHARED|VM_WRITE)) != (VM_SHARED|VM_WRITE))
218 return -EACCES; 222 return -EACCES;
219 223
220 mapping = vma->vm_file->f_mapping;
221
222 offset = (loff_t)(start - vma->vm_start) 224 offset = (loff_t)(start - vma->vm_start)
223 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); 225 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
224 endoff = (loff_t)(end - vma->vm_start - 1)
225 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
226 226
227 /* vmtruncate_range needs to take i_mutex */ 227 /*
228 * Filesystem's fallocate may need to take i_mutex. We need to
229 * explicitly grab a reference because the vma (and hence the
230 * vma's reference to the file) can go away as soon as we drop
231 * mmap_sem.
232 */
233 get_file(f);
228 up_read(&current->mm->mmap_sem); 234 up_read(&current->mm->mmap_sem);
229 error = vmtruncate_range(mapping->host, offset, endoff); 235 error = do_fallocate(f,
236 FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
237 offset, end - start);
238 fput(f);
230 down_read(&current->mm->mmap_sem); 239 down_read(&current->mm->mmap_sem);
231 return error; 240 return error;
232} 241}
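After this change MADV_REMOVE on a shared file mapping is serviced by the filesystem's fallocate with FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE instead of vmtruncate_range(). A short userspace illustration of the call is sketched below; the path and sizes are arbitrary and most error handling is trimmed.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        const size_t len = 4 * 1024 * 1024;
        int fd = open("/tmp/madv-remove-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

        if (fd < 0 || ftruncate(fd, len) < 0)
                return 1;

        /* MADV_REMOVE only applies to shared, writable file mappings */
        char *map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (map == MAP_FAILED)
                return 1;

        map[0] = 'x';                   /* dirty the first page */

        /* Frees the backing blocks for the range; later reads return zeroes */
        if (madvise(map, len / 2, MADV_REMOVE) != 0)
                perror("madvise(MADV_REMOVE)");

        printf("first byte after hole punch: %d\n", map[0]);
        munmap(map, len);
        close(fd);
        return 0;
}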
diff --git a/mm/memblock.c b/mm/memblock.c
index 99f285599501..4d9393c7edc9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -37,6 +37,8 @@ struct memblock memblock __initdata_memblock = {
37 37
38int memblock_debug __initdata_memblock; 38int memblock_debug __initdata_memblock;
39static int memblock_can_resize __initdata_memblock; 39static int memblock_can_resize __initdata_memblock;
40static int memblock_memory_in_slab __initdata_memblock = 0;
41static int memblock_reserved_in_slab __initdata_memblock = 0;
40 42
41/* inline so we don't get a warning when pr_debug is compiled out */ 43/* inline so we don't get a warning when pr_debug is compiled out */
42static inline const char *memblock_type_name(struct memblock_type *type) 44static inline const char *memblock_type_name(struct memblock_type *type)
@@ -141,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
141 MAX_NUMNODES); 143 MAX_NUMNODES);
142} 144}
143 145
144/*
145 * Free memblock.reserved.regions
146 */
147int __init_memblock memblock_free_reserved_regions(void)
148{
149 if (memblock.reserved.regions == memblock_reserved_init_regions)
150 return 0;
151
152 return memblock_free(__pa(memblock.reserved.regions),
153 sizeof(struct memblock_region) * memblock.reserved.max);
154}
155
156/*
157 * Reserve memblock.reserved.regions
158 */
159int __init_memblock memblock_reserve_reserved_regions(void)
160{
161 if (memblock.reserved.regions == memblock_reserved_init_regions)
162 return 0;
163
164 return memblock_reserve(__pa(memblock.reserved.regions),
165 sizeof(struct memblock_region) * memblock.reserved.max);
166}
167
168static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) 146static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
169{ 147{
170 type->total_size -= type->regions[r].size; 148 type->total_size -= type->regions[r].size;
@@ -182,11 +160,42 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
182 } 160 }
183} 161}
184 162
185static int __init_memblock memblock_double_array(struct memblock_type *type) 163phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
164 phys_addr_t *addr)
165{
166 if (memblock.reserved.regions == memblock_reserved_init_regions)
167 return 0;
168
169 *addr = __pa(memblock.reserved.regions);
170
171 return PAGE_ALIGN(sizeof(struct memblock_region) *
172 memblock.reserved.max);
173}
174
175/**
176 * memblock_double_array - double the size of the memblock regions array
177 * @type: memblock type of the regions array being doubled
178 * @new_area_start: starting address of memory range to avoid overlap with
179 * @new_area_size: size of memory range to avoid overlap with
180 *
181 * Double the size of the @type regions array. If memblock is being used to
182 * allocate memory for a new reserved regions array and there is a previously
183 * allocated memory range [@new_area_start,@new_area_start+@new_area_size]
184 * waiting to be reserved, ensure the memory used by the new array does
185 * not overlap.
186 *
187 * RETURNS:
188 * 0 on success, -1 on failure.
189 */
190static int __init_memblock memblock_double_array(struct memblock_type *type,
191 phys_addr_t new_area_start,
192 phys_addr_t new_area_size)
186{ 193{
187 struct memblock_region *new_array, *old_array; 194 struct memblock_region *new_array, *old_array;
195 phys_addr_t old_alloc_size, new_alloc_size;
188 phys_addr_t old_size, new_size, addr; 196 phys_addr_t old_size, new_size, addr;
189 int use_slab = slab_is_available(); 197 int use_slab = slab_is_available();
198 int *in_slab;
190 199
191 /* We don't allow resizing until we know about the reserved regions 200 /* We don't allow resizing until we know about the reserved regions
192 * of memory that aren't suitable for allocation 201 * of memory that aren't suitable for allocation
@@ -197,36 +206,62 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
197 /* Calculate new doubled size */ 206 /* Calculate new doubled size */
198 old_size = type->max * sizeof(struct memblock_region); 207 old_size = type->max * sizeof(struct memblock_region);
199 new_size = old_size << 1; 208 new_size = old_size << 1;
209 /*
210	 * We need to allocate the new array aligned to PAGE_SIZE,
211	 * so we can free it completely later.
212 */
213 old_alloc_size = PAGE_ALIGN(old_size);
214 new_alloc_size = PAGE_ALIGN(new_size);
215
216 /* Retrieve the slab flag */
217 if (type == &memblock.memory)
218 in_slab = &memblock_memory_in_slab;
219 else
220 in_slab = &memblock_reserved_in_slab;
200 221
201 /* Try to find some space for it. 222 /* Try to find some space for it.
202 * 223 *
203 * WARNING: We assume that either slab_is_available() and we use it or 224 * WARNING: We assume that either slab_is_available() and we use it or
204 * we use MEMBLOCK for allocations. That means that this is unsafe to use 225 * we use MEMBLOCK for allocations. That means that this is unsafe to
205 * when bootmem is currently active (unless bootmem itself is implemented 226 * use when bootmem is currently active (unless bootmem itself is
206 * on top of MEMBLOCK which isn't the case yet) 227 * implemented on top of MEMBLOCK which isn't the case yet)
207 * 228 *
208 * This should however not be an issue for now, as we currently only 229 * This should however not be an issue for now, as we currently only
209 * call into MEMBLOCK while it's still active, or much later when slab is 230 * call into MEMBLOCK while it's still active, or much later when slab
210 * active for memory hotplug operations 231 * is active for memory hotplug operations
211 */ 232 */
212 if (use_slab) { 233 if (use_slab) {
213 new_array = kmalloc(new_size, GFP_KERNEL); 234 new_array = kmalloc(new_size, GFP_KERNEL);
214 addr = new_array ? __pa(new_array) : 0; 235 addr = new_array ? __pa(new_array) : 0;
215 } else 236 } else {
216 addr = memblock_find_in_range(0, MEMBLOCK_ALLOC_ACCESSIBLE, new_size, sizeof(phys_addr_t)); 237 /* only exclude range when trying to double reserved.regions */
238 if (type != &memblock.reserved)
239 new_area_start = new_area_size = 0;
240
241 addr = memblock_find_in_range(new_area_start + new_area_size,
242 memblock.current_limit,
243 new_alloc_size, PAGE_SIZE);
244 if (!addr && new_area_size)
245 addr = memblock_find_in_range(0,
246 min(new_area_start, memblock.current_limit),
247 new_alloc_size, PAGE_SIZE);
248
249 new_array = addr ? __va(addr) : 0;
250 }
217 if (!addr) { 251 if (!addr) {
218 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", 252 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
219 memblock_type_name(type), type->max, type->max * 2); 253 memblock_type_name(type), type->max, type->max * 2);
220 return -1; 254 return -1;
221 } 255 }
222 new_array = __va(addr);
223 256
224 memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", 257 memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]",
225 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); 258 memblock_type_name(type), type->max * 2, (u64)addr,
259 (u64)addr + new_size - 1);
226 260
227 /* Found space, we now need to move the array over before 261 /*
228 * we add the reserved region since it may be our reserved 262 * Found space, we now need to move the array over before we add the
229 * array itself that is full. 263 * reserved region since it may be our reserved array itself that is
264 * full.
230 */ 265 */
231 memcpy(new_array, type->regions, old_size); 266 memcpy(new_array, type->regions, old_size);
232 memset(new_array + type->max, 0, old_size); 267 memset(new_array + type->max, 0, old_size);
@@ -234,21 +269,22 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
234 type->regions = new_array; 269 type->regions = new_array;
235 type->max <<= 1; 270 type->max <<= 1;
236 271
237 /* If we use SLAB that's it, we are done */ 272 /* Free old array. We needn't free it if the array is the static one */
238 if (use_slab) 273 if (*in_slab)
239 return 0; 274 kfree(old_array);
240 275 else if (old_array != memblock_memory_init_regions &&
241 /* Add the new reserved region now. Should not fail ! */ 276 old_array != memblock_reserved_init_regions)
242 BUG_ON(memblock_reserve(addr, new_size)); 277 memblock_free(__pa(old_array), old_alloc_size);
243 278
244 /* If the array wasn't our static init one, then free it. We only do 279 /*
245 * that before SLAB is available as later on, we don't know whether 280 * Reserve the new array if that comes from the memblock. Otherwise, we
246 * to use kfree or free_bootmem_pages(). Shouldn't be a big deal 281 * needn't do it
247 * anyways
248 */ 282 */
249 if (old_array != memblock_memory_init_regions && 283 if (!use_slab)
250 old_array != memblock_reserved_init_regions) 284 BUG_ON(memblock_reserve(addr, new_alloc_size));
251 memblock_free(__pa(old_array), old_size); 285
286 /* Update slab flag */
287 *in_slab = use_slab;
252 288
253 return 0; 289 return 0;
254} 290}
@@ -330,6 +366,9 @@ static int __init_memblock memblock_add_region(struct memblock_type *type,
330 phys_addr_t end = base + memblock_cap_size(base, &size); 366 phys_addr_t end = base + memblock_cap_size(base, &size);
331 int i, nr_new; 367 int i, nr_new;
332 368
369 if (!size)
370 return 0;
371
333 /* special case for empty array */ 372 /* special case for empty array */
334 if (type->regions[0].size == 0) { 373 if (type->regions[0].size == 0) {
335 WARN_ON(type->cnt != 1 || type->total_size); 374 WARN_ON(type->cnt != 1 || type->total_size);
@@ -384,7 +423,7 @@ repeat:
384 */ 423 */
385 if (!insert) { 424 if (!insert) {
386 while (type->cnt + nr_new > type->max) 425 while (type->cnt + nr_new > type->max)
387 if (memblock_double_array(type) < 0) 426 if (memblock_double_array(type, obase, size) < 0)
388 return -ENOMEM; 427 return -ENOMEM;
389 insert = true; 428 insert = true;
390 goto repeat; 429 goto repeat;
@@ -430,9 +469,12 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
430 469
431 *start_rgn = *end_rgn = 0; 470 *start_rgn = *end_rgn = 0;
432 471
472 if (!size)
473 return 0;
474
433 /* we'll create at most two more regions */ 475 /* we'll create at most two more regions */
434 while (type->cnt + 2 > type->max) 476 while (type->cnt + 2 > type->max)
435 if (memblock_double_array(type) < 0) 477 if (memblock_double_array(type, base, size) < 0)
436 return -ENOMEM; 478 return -ENOMEM;
437 479
438 for (i = 0; i < type->cnt; i++) { 480 for (i = 0; i < type->cnt; i++) {
@@ -514,7 +556,6 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
514 (unsigned long long)base, 556 (unsigned long long)base,
515 (unsigned long long)base + size, 557 (unsigned long long)base + size,
516 (void *)_RET_IP_); 558 (void *)_RET_IP_);
517 BUG_ON(0 == size);
518 559
519 return memblock_add_region(_rgn, base, size, MAX_NUMNODES); 560 return memblock_add_region(_rgn, base, size, MAX_NUMNODES);
520} 561}
@@ -523,9 +564,9 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
523 * __next_free_mem_range - next function for for_each_free_mem_range() 564 * __next_free_mem_range - next function for for_each_free_mem_range()
524 * @idx: pointer to u64 loop variable 565 * @idx: pointer to u64 loop variable
525 * @nid: nid: node selector, %MAX_NUMNODES for all nodes 566 * @nid: nid: node selector, %MAX_NUMNODES for all nodes
526 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 567 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
527 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 568 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
528 * @p_nid: ptr to int for nid of the range, can be %NULL 569 * @out_nid: ptr to int for nid of the range, can be %NULL
529 * 570 *
530 * Find the first free area from *@idx which matches @nid, fill the out 571 * Find the first free area from *@idx which matches @nid, fill the out
531 * parameters, and update *@idx for the next iteration. The lower 32bit of 572 * parameters, and update *@idx for the next iteration. The lower 32bit of
@@ -599,9 +640,9 @@ void __init_memblock __next_free_mem_range(u64 *idx, int nid,
599 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse() 640 * __next_free_mem_range_rev - next function for for_each_free_mem_range_reverse()
600 * @idx: pointer to u64 loop variable 641 * @idx: pointer to u64 loop variable
601 * @nid: nid: node selector, %MAX_NUMNODES for all nodes 642 * @nid: nid: node selector, %MAX_NUMNODES for all nodes
602 * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL 643 * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
603 * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL 644 * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
604 * @p_nid: ptr to int for nid of the range, can be %NULL 645 * @out_nid: ptr to int for nid of the range, can be %NULL
605 * 646 *
606 * Reverse of __next_free_mem_range(). 647 * Reverse of __next_free_mem_range().
607 */ 648 */
@@ -850,6 +891,16 @@ int __init_memblock memblock_is_memory(phys_addr_t addr)
850 return memblock_search(&memblock.memory, addr) != -1; 891 return memblock_search(&memblock.memory, addr) != -1;
851} 892}
852 893
894/**
895 * memblock_is_region_memory - check if a region is a subset of memory
896 * @base: base of region to check
897 * @size: size of region to check
898 *
899 * Check if the region [@base, @base+@size) is a subset of a memory block.
900 *
901 * RETURNS:
902 * 0 if false, non-zero if true
903 */
853int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) 904int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
854{ 905{
855 int idx = memblock_search(&memblock.memory, base); 906 int idx = memblock_search(&memblock.memory, base);
@@ -862,6 +913,16 @@ int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size
862 memblock.memory.regions[idx].size) >= end; 913 memblock.memory.regions[idx].size) >= end;
863} 914}
864 915
916/**
917 * memblock_is_region_reserved - check if a region intersects reserved memory
918 * @base: base of region to check
919 * @size: size of region to check
920 *
921 * Check if the region [@base, @base+@size) intersects a reserved memory block.
922 *
923 * RETURNS:
924 * 0 if false, non-zero if true
925 */
865int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) 926int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
866{ 927{
867 memblock_cap_size(base, &size); 928 memblock_cap_size(base, &size);
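memblock_double_array() now page-aligns both array sizes and, when growing memblock.reserved, first searches above new_area_start + new_area_size and only falls back to the range below new_area_start, so the doubled array cannot overlap the region that is still waiting to be reserved. The sketch below walks through that two-step search with plain integers; find_range() is a simplified stand-in for memblock_find_in_range(), not the real allocator.

#include <stdio.h>

/* Stand-in for memblock_find_in_range(): top-down fit, 0 means failure. */
static unsigned long find_range(unsigned long start, unsigned long end,
                                unsigned long size)
{
        if (end > start && end - start >= size)
                return end - size;      /* memblock allocates from the top */
        return 0;
}

static unsigned long place_new_array(unsigned long new_area_start,
                                     unsigned long new_area_size,
                                     unsigned long current_limit,
                                     unsigned long new_alloc_size)
{
        unsigned long addr;

        /* 1) try above the pending reservation, up to the current limit */
        addr = find_range(new_area_start + new_area_size, current_limit,
                          new_alloc_size);
        /* 2) otherwise try below it, capped at min(new_area_start, limit) */
        if (!addr && new_area_size)
                addr = find_range(0,
                                  new_area_start < current_limit ?
                                        new_area_start : current_limit,
                                  new_alloc_size);
        return addr;
}

int main(void)
{
        /* Pending reservation occupies [0x8000, 0x9000); need 0x1000 bytes. */
        unsigned long addr = place_new_array(0x8000, 0x1000, 0x9000, 0x1000);

        printf("new array placed at %#lx\n", addr);     /* lands below 0x8000 */
        return 0;
}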
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7d698df4a067..795e525afaba 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -59,21 +59,21 @@
59 59
60struct cgroup_subsys mem_cgroup_subsys __read_mostly; 60struct cgroup_subsys mem_cgroup_subsys __read_mostly;
61#define MEM_CGROUP_RECLAIM_RETRIES 5 61#define MEM_CGROUP_RECLAIM_RETRIES 5
62struct mem_cgroup *root_mem_cgroup __read_mostly; 62static struct mem_cgroup *root_mem_cgroup __read_mostly;
63 63
64#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 64#ifdef CONFIG_MEMCG_SWAP
65/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 65/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
66int do_swap_account __read_mostly; 66int do_swap_account __read_mostly;
67 67
68/* for remember boot option*/ 68/* for remember boot option*/
69#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED 69#ifdef CONFIG_MEMCG_SWAP_ENABLED
70static int really_do_swap_account __initdata = 1; 70static int really_do_swap_account __initdata = 1;
71#else 71#else
72static int really_do_swap_account __initdata = 0; 72static int really_do_swap_account __initdata = 0;
73#endif 73#endif
74 74
75#else 75#else
76#define do_swap_account (0) 76#define do_swap_account 0
77#endif 77#endif
78 78
79 79
@@ -87,19 +87,32 @@ enum mem_cgroup_stat_index {
87 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 87 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
88 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 88 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 90 MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
91 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
92 MEM_CGROUP_STAT_NSTATS, 91 MEM_CGROUP_STAT_NSTATS,
93}; 92};
94 93
94static const char * const mem_cgroup_stat_names[] = {
95 "cache",
96 "rss",
97 "mapped_file",
98 "swap",
99};
100
95enum mem_cgroup_events_index { 101enum mem_cgroup_events_index {
96 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 102 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
97 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 103 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
98 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
99 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ 104 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
100 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ 105 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
101 MEM_CGROUP_EVENTS_NSTATS, 106 MEM_CGROUP_EVENTS_NSTATS,
102}; 107};
108
109static const char * const mem_cgroup_events_names[] = {
110 "pgpgin",
111 "pgpgout",
112 "pgfault",
113 "pgmajfault",
114};
115
103/* 116/*
104 * Per memcg event counter is incremented at every pagein/pageout. With THP, 117 * Per memcg event counter is incremented at every pagein/pageout. With THP,
105 * it will be incremated by the number of pages. This counter is used for 118 * it will be incremated by the number of pages. This counter is used for
@@ -112,13 +125,14 @@ enum mem_cgroup_events_target {
112 MEM_CGROUP_TARGET_NUMAINFO, 125 MEM_CGROUP_TARGET_NUMAINFO,
113 MEM_CGROUP_NTARGETS, 126 MEM_CGROUP_NTARGETS,
114}; 127};
115#define THRESHOLDS_EVENTS_TARGET (128) 128#define THRESHOLDS_EVENTS_TARGET 128
116#define SOFTLIMIT_EVENTS_TARGET (1024) 129#define SOFTLIMIT_EVENTS_TARGET 1024
117#define NUMAINFO_EVENTS_TARGET (1024) 130#define NUMAINFO_EVENTS_TARGET 1024
118 131
119struct mem_cgroup_stat_cpu { 132struct mem_cgroup_stat_cpu {
120 long count[MEM_CGROUP_STAT_NSTATS]; 133 long count[MEM_CGROUP_STAT_NSTATS];
121 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 134 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
135 unsigned long nr_page_events;
122 unsigned long targets[MEM_CGROUP_NTARGETS]; 136 unsigned long targets[MEM_CGROUP_NTARGETS];
123}; 137};
124 138
@@ -138,7 +152,6 @@ struct mem_cgroup_per_zone {
138 152
139 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 153 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
140 154
141 struct zone_reclaim_stat reclaim_stat;
142 struct rb_node tree_node; /* RB tree node */ 155 struct rb_node tree_node; /* RB tree node */
143 unsigned long long usage_in_excess;/* Set to the value by which */ 156 unsigned long long usage_in_excess;/* Set to the value by which */
144 /* the soft limit is exceeded*/ 157 /* the soft limit is exceeded*/
@@ -182,7 +195,7 @@ struct mem_cgroup_threshold {
182 195
183/* For threshold */ 196/* For threshold */
184struct mem_cgroup_threshold_ary { 197struct mem_cgroup_threshold_ary {
185 /* An array index points to threshold just below usage. */ 198 /* An array index points to threshold just below or equal to usage. */
186 int current_threshold; 199 int current_threshold;
187 /* Size of entries[] */ 200 /* Size of entries[] */
188 unsigned int size; 201 unsigned int size;
@@ -245,8 +258,8 @@ struct mem_cgroup {
245 */ 258 */
246 struct rcu_head rcu_freeing; 259 struct rcu_head rcu_freeing;
247 /* 260 /*
248 * But when using vfree(), that cannot be done at 261 * We also need some space for a worker in deferred freeing.
249 * interrupt time, so we must then queue the work. 262 * By the time we call it, rcu_freeing is no longer in use.
250 */ 263 */
251 struct work_struct work_freeing; 264 struct work_struct work_freeing;
252 }; 265 };
@@ -305,7 +318,7 @@ struct mem_cgroup {
305 /* 318 /*
306 * percpu counter. 319 * percpu counter.
307 */ 320 */
308 struct mem_cgroup_stat_cpu *stat; 321 struct mem_cgroup_stat_cpu __percpu *stat;
309 /* 322 /*
310 * used when a cpu is offlined or other synchronizations 323 * used when a cpu is offlined or other synchronizations
311 * See mem_cgroup_read_stat(). 324 * See mem_cgroup_read_stat().
@@ -360,14 +373,12 @@ static bool move_file(void)
360 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 373 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
361 * limit reclaim to prevent infinite loops, if they ever occur. 374 * limit reclaim to prevent infinite loops, if they ever occur.
362 */ 375 */
363#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) 376#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
364#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) 377#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
365 378
366enum charge_type { 379enum charge_type {
367 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 380 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
368 MEM_CGROUP_CHARGE_TYPE_MAPPED, 381 MEM_CGROUP_CHARGE_TYPE_ANON,
369 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
370 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
371 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 382 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
372 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 383 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
373 NR_CHARGE_TYPE, 384 NR_CHARGE_TYPE,
@@ -377,8 +388,8 @@ enum charge_type {
377#define _MEM (0) 388#define _MEM (0)
378#define _MEMSWAP (1) 389#define _MEMSWAP (1)
379#define _OOM_TYPE (2) 390#define _OOM_TYPE (2)
380#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 391#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
381#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 392#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
382#define MEMFILE_ATTR(val) ((val) & 0xffff) 393#define MEMFILE_ATTR(val) ((val) & 0xffff)
383/* Used for OOM notifier */ 394/* Used for OOM notifier */
384#define OOM_CONTROL (0) 395#define OOM_CONTROL (0)
@@ -394,8 +405,14 @@ enum charge_type {
394static void mem_cgroup_get(struct mem_cgroup *memcg); 405static void mem_cgroup_get(struct mem_cgroup *memcg);
395static void mem_cgroup_put(struct mem_cgroup *memcg); 406static void mem_cgroup_put(struct mem_cgroup *memcg);
396 407
408static inline
409struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
410{
411 return container_of(s, struct mem_cgroup, css);
412}
413
397/* Writing them here to avoid exposing memcg's inner layout */ 414/* Writing them here to avoid exposing memcg's inner layout */
398#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 415#ifdef CONFIG_MEMCG_KMEM
399#include <net/sock.h> 416#include <net/sock.h>
400#include <net/ip.h> 417#include <net/ip.h>
401 418
@@ -404,6 +421,7 @@ void sock_update_memcg(struct sock *sk)
404{ 421{
405 if (mem_cgroup_sockets_enabled) { 422 if (mem_cgroup_sockets_enabled) {
406 struct mem_cgroup *memcg; 423 struct mem_cgroup *memcg;
424 struct cg_proto *cg_proto;
407 425
408 BUG_ON(!sk->sk_prot->proto_cgroup); 426 BUG_ON(!sk->sk_prot->proto_cgroup);
409 427
@@ -423,9 +441,10 @@ void sock_update_memcg(struct sock *sk)
423 441
424 rcu_read_lock(); 442 rcu_read_lock();
425 memcg = mem_cgroup_from_task(current); 443 memcg = mem_cgroup_from_task(current);
426 if (!mem_cgroup_is_root(memcg)) { 444 cg_proto = sk->sk_prot->proto_cgroup(memcg);
445 if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
427 mem_cgroup_get(memcg); 446 mem_cgroup_get(memcg);
428 sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg); 447 sk->sk_cgrp = cg_proto;
429 } 448 }
430 rcu_read_unlock(); 449 rcu_read_unlock();
431 } 450 }
@@ -452,7 +471,20 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
452} 471}
453EXPORT_SYMBOL(tcp_proto_cgroup); 472EXPORT_SYMBOL(tcp_proto_cgroup);
454#endif /* CONFIG_INET */ 473#endif /* CONFIG_INET */
455#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ 474#endif /* CONFIG_MEMCG_KMEM */
475
476#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
477static void disarm_sock_keys(struct mem_cgroup *memcg)
478{
479 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
480 return;
481 static_key_slow_dec(&memcg_socket_limit_enabled);
482}
483#else
484static void disarm_sock_keys(struct mem_cgroup *memcg)
485{
486}
487#endif
456 488
457static void drain_all_stock_async(struct mem_cgroup *memcg); 489static void drain_all_stock_async(struct mem_cgroup *memcg);
458 490
@@ -675,7 +707,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
675 bool charge) 707 bool charge)
676{ 708{
677 int val = (charge) ? 1 : -1; 709 int val = (charge) ? 1 : -1;
678 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 710 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
679} 711}
680 712
681static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 713static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
@@ -718,12 +750,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
718 nr_pages = -nr_pages; /* for event */ 750 nr_pages = -nr_pages; /* for event */
719 } 751 }
720 752
721 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); 753 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
722 754
723 preempt_enable(); 755 preempt_enable();
724} 756}
725 757
726unsigned long 758unsigned long
759mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
760{
761 struct mem_cgroup_per_zone *mz;
762
763 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
764 return mz->lru_size[lru];
765}
766
767static unsigned long
727mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, 768mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
728 unsigned int lru_mask) 769 unsigned int lru_mask)
729{ 770{
@@ -770,7 +811,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
770{ 811{
771 unsigned long val, next; 812 unsigned long val, next;
772 813
773 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); 814 val = __this_cpu_read(memcg->stat->nr_page_events);
774 next = __this_cpu_read(memcg->stat->targets[target]); 815 next = __this_cpu_read(memcg->stat->targets[target]);
775 /* from time_after() in jiffies.h */ 816 /* from time_after() in jiffies.h */
776 if ((long)next - (long)val < 0) { 817 if ((long)next - (long)val < 0) {
@@ -827,9 +868,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
827 868
828struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 869struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
829{ 870{
830 return container_of(cgroup_subsys_state(cont, 871 return mem_cgroup_from_css(
831 mem_cgroup_subsys_id), struct mem_cgroup, 872 cgroup_subsys_state(cont, mem_cgroup_subsys_id));
832 css);
833} 873}
834 874
835struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 875struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
@@ -842,8 +882,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
842 if (unlikely(!p)) 882 if (unlikely(!p))
843 return NULL; 883 return NULL;
844 884
845 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 885 return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
846 struct mem_cgroup, css);
847} 886}
848 887
849struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 888struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
@@ -929,8 +968,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
929 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); 968 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
930 if (css) { 969 if (css) {
931 if (css == &root->css || css_tryget(css)) 970 if (css == &root->css || css_tryget(css))
932 memcg = container_of(css, 971 memcg = mem_cgroup_from_css(css);
933 struct mem_cgroup, css);
934 } else 972 } else
935 id = 0; 973 id = 0;
936 rcu_read_unlock(); 974 rcu_read_unlock();
@@ -1013,7 +1051,7 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event);
1013/** 1051/**
1014 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1052 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
1015 * @zone: zone of the wanted lruvec 1053 * @zone: zone of the wanted lruvec
1016 * @mem: memcg of the wanted lruvec 1054 * @memcg: memcg of the wanted lruvec
1017 * 1055 *
1018 * Returns the lru list vector holding pages for the given @zone and 1056 * Returns the lru list vector holding pages for the given @zone and
1019 * @mem. This can be the global zone lruvec, if the memory controller 1057 * @mem. This can be the global zone lruvec, if the memory controller
@@ -1046,19 +1084,11 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1046 */ 1084 */
1047 1085
1048/** 1086/**
1049 * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec 1087 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
1050 * @zone: zone of the page
1051 * @page: the page 1088 * @page: the page
1052 * @lru: current lru 1089 * @zone: zone of the page
1053 *
1054 * This function accounts for @page being added to @lru, and returns
1055 * the lruvec for the given @zone and the memcg @page is charged to.
1056 *
1057 * The callsite is then responsible for physically linking the page to
1058 * the returned lruvec->lists[@lru].
1059 */ 1090 */
1060struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, 1091struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1061 enum lru_list lru)
1062{ 1092{
1063 struct mem_cgroup_per_zone *mz; 1093 struct mem_cgroup_per_zone *mz;
1064 struct mem_cgroup *memcg; 1094 struct mem_cgroup *memcg;
@@ -1071,7 +1101,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
1071 memcg = pc->mem_cgroup; 1101 memcg = pc->mem_cgroup;
1072 1102
1073 /* 1103 /*
1074 * Surreptitiously switch any uncharged page to root: 1104 * Surreptitiously switch any uncharged offlist page to root:
1075 * an uncharged page off lru does nothing to secure 1105 * an uncharged page off lru does nothing to secure
1076 * its former mem_cgroup from sudden removal. 1106 * its former mem_cgroup from sudden removal.
1077 * 1107 *
@@ -1079,85 +1109,60 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
1079 * under page_cgroup lock: between them, they make all uses 1109 * under page_cgroup lock: between them, they make all uses
1080 * of pc->mem_cgroup safe. 1110 * of pc->mem_cgroup safe.
1081 */ 1111 */
1082 if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup) 1112 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1083 pc->mem_cgroup = memcg = root_mem_cgroup; 1113 pc->mem_cgroup = memcg = root_mem_cgroup;
1084 1114
1085 mz = page_cgroup_zoneinfo(memcg, page); 1115 mz = page_cgroup_zoneinfo(memcg, page);
1086 /* compound_order() is stabilized through lru_lock */
1087 mz->lru_size[lru] += 1 << compound_order(page);
1088 return &mz->lruvec; 1116 return &mz->lruvec;
1089} 1117}
1090 1118
1091/** 1119/**
1092 * mem_cgroup_lru_del_list - account for removing an lru page 1120 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1093 * @page: the page 1121 * @lruvec: mem_cgroup per zone lru vector
1094 * @lru: target lru 1122 * @lru: index of lru list the page is sitting on
1123 * @nr_pages: positive when adding or negative when removing
1095 * 1124 *
1096 * This function accounts for @page being removed from @lru. 1125 * This function must be called when a page is added to or removed from an
1097 * 1126 * lru list.
1098 * The callsite is then responsible for physically unlinking
1099 * @page->lru.
1100 */ 1127 */
1101void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) 1128void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1129 int nr_pages)
1102{ 1130{
1103 struct mem_cgroup_per_zone *mz; 1131 struct mem_cgroup_per_zone *mz;
1104 struct mem_cgroup *memcg; 1132 unsigned long *lru_size;
1105 struct page_cgroup *pc;
1106 1133
1107 if (mem_cgroup_disabled()) 1134 if (mem_cgroup_disabled())
1108 return; 1135 return;
1109 1136
1110 pc = lookup_page_cgroup(page); 1137 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1111 memcg = pc->mem_cgroup; 1138 lru_size = mz->lru_size + lru;
1112 VM_BUG_ON(!memcg); 1139 *lru_size += nr_pages;
1113 mz = page_cgroup_zoneinfo(memcg, page); 1140 VM_BUG_ON((long)(*lru_size) < 0);
1114 /* huge page split is done under lru_lock. so, we have no races. */
1115 VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page)));
1116 mz->lru_size[lru] -= 1 << compound_order(page);
1117}
1118
1119void mem_cgroup_lru_del(struct page *page)
1120{
1121 mem_cgroup_lru_del_list(page, page_lru(page));
1122}
1123
1124/**
1125 * mem_cgroup_lru_move_lists - account for moving a page between lrus
1126 * @zone: zone of the page
1127 * @page: the page
1128 * @from: current lru
1129 * @to: target lru
1130 *
1131 * This function accounts for @page being moved between the lrus @from
1132 * and @to, and returns the lruvec for the given @zone and the memcg
1133 * @page is charged to.
1134 *
1135 * The callsite is then responsible for physically relinking
1136 * @page->lru to the returned lruvec->lists[@to].
1137 */
1138struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
1139 struct page *page,
1140 enum lru_list from,
1141 enum lru_list to)
1142{
1143 /* XXX: Optimize this, especially for @from == @to */
1144 mem_cgroup_lru_del_list(page, from);
1145 return mem_cgroup_lru_add_list(zone, page, to);
1146} 1141}
1147 1142
1148/* 1143/*
1149 * Checks whether given mem is same or in the root_mem_cgroup's 1144 * Checks whether given mem is same or in the root_mem_cgroup's
1150 * hierarchy subtree 1145 * hierarchy subtree
1151 */ 1146 */
1147bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1148 struct mem_cgroup *memcg)
1149{
1150 if (root_memcg == memcg)
1151 return true;
1152 if (!root_memcg->use_hierarchy || !memcg)
1153 return false;
1154 return css_is_ancestor(&memcg->css, &root_memcg->css);
1155}
1156
1152static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1157static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1153 struct mem_cgroup *memcg) 1158 struct mem_cgroup *memcg)
1154{ 1159{
1155 if (root_memcg != memcg) { 1160 bool ret;
1156 return (root_memcg->use_hierarchy &&
1157 css_is_ancestor(&memcg->css, &root_memcg->css));
1158 }
1159 1161
1160 return true; 1162 rcu_read_lock();
1163 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1164 rcu_read_unlock();
1165 return ret;
1161} 1166}
1162 1167
1163int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) 1168int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
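The hunk above folds mem_cgroup_lru_add_list() and mem_cgroup_lru_del_list() into a single mem_cgroup_update_lru_size(lruvec, lru, nr_pages); callers pass a positive count when linking pages to an lru list and a negative count when unlinking them. The standalone sketch below mirrors that signed-delta bookkeeping; the struct and lru indices are simplified placeholders rather than the kernel's types.

#include <assert.h>
#include <stdio.h>

enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON, NR_LRU_LISTS };

struct toy_lruvec {
        long lru_size[NR_LRU_LISTS];
};

/* Same contract as the new helper: one call per add or remove, signed count. */
static void update_lru_size(struct toy_lruvec *lruvec, enum lru_list lru,
                            int nr_pages)
{
        lruvec->lru_size[lru] += nr_pages;
        assert(lruvec->lru_size[lru] >= 0);     /* mirrors the VM_BUG_ON */
}

int main(void)
{
        struct toy_lruvec lruvec = { { 0 } };

        update_lru_size(&lruvec, LRU_INACTIVE_ANON, 512);   /* pages linked */
        update_lru_size(&lruvec, LRU_INACTIVE_ANON, -512);  /* pages unlinked */
        printf("inactive anon size: %ld\n", lruvec.lru_size[LRU_INACTIVE_ANON]);
        return 0;
}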
@@ -1195,19 +1200,15 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1195 return ret; 1200 return ret;
1196} 1201}
1197 1202
1198int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) 1203int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1199{ 1204{
1200 unsigned long inactive_ratio; 1205 unsigned long inactive_ratio;
1201 int nid = zone_to_nid(zone);
1202 int zid = zone_idx(zone);
1203 unsigned long inactive; 1206 unsigned long inactive;
1204 unsigned long active; 1207 unsigned long active;
1205 unsigned long gb; 1208 unsigned long gb;
1206 1209
1207 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1210 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1208 BIT(LRU_INACTIVE_ANON)); 1211 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1209 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1210 BIT(LRU_ACTIVE_ANON));
1211 1212
1212 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1213 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1213 if (gb) 1214 if (gb)
@@ -1218,55 +1219,23 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
1218 return inactive * inactive_ratio < active; 1219 return inactive * inactive_ratio < active;
1219} 1220}
1220 1221
1221int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) 1222int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
1222{ 1223{
1223 unsigned long active; 1224 unsigned long active;
1224 unsigned long inactive; 1225 unsigned long inactive;
1225 int zid = zone_idx(zone);
1226 int nid = zone_to_nid(zone);
1227 1226
1228 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1227 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
1229 BIT(LRU_INACTIVE_FILE)); 1228 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
1230 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1231 BIT(LRU_ACTIVE_FILE));
1232 1229
1233 return (active > inactive); 1230 return (active > inactive);
1234} 1231}
1235 1232
1236struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1237 struct zone *zone)
1238{
1239 int nid = zone_to_nid(zone);
1240 int zid = zone_idx(zone);
1241 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1242
1243 return &mz->reclaim_stat;
1244}
1245
1246struct zone_reclaim_stat *
1247mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1248{
1249 struct page_cgroup *pc;
1250 struct mem_cgroup_per_zone *mz;
1251
1252 if (mem_cgroup_disabled())
1253 return NULL;
1254
1255 pc = lookup_page_cgroup(page);
1256 if (!PageCgroupUsed(pc))
1257 return NULL;
1258 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1259 smp_rmb();
1260 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1261 return &mz->reclaim_stat;
1262}
1263
1264#define mem_cgroup_from_res_counter(counter, member) \ 1233#define mem_cgroup_from_res_counter(counter, member) \
1265 container_of(counter, struct mem_cgroup, member) 1234 container_of(counter, struct mem_cgroup, member)
1266 1235
1267/** 1236/**
1268 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1237 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1269 * @mem: the memory cgroup 1238 * @memcg: the memory cgroup
1270 * 1239 *
1271 * Returns the maximum amount of memory @mem can be charged with, in 1240 * Returns the maximum amount of memory @mem can be charged with, in
1272 * pages. 1241 * pages.
@@ -1486,7 +1455,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1486/* 1455/*
1487 * Return the memory (and swap, if configured) limit for a memcg. 1456 * Return the memory (and swap, if configured) limit for a memcg.
1488 */ 1457 */
1489u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1458static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1490{ 1459{
1491 u64 limit; 1460 u64 limit;
1492 u64 memsw; 1461 u64 memsw;
@@ -1502,6 +1471,73 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1502 return min(limit, memsw); 1471 return min(limit, memsw);
1503} 1472}
1504 1473
1474void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1475 int order)
1476{
1477 struct mem_cgroup *iter;
1478 unsigned long chosen_points = 0;
1479 unsigned long totalpages;
1480 unsigned int points = 0;
1481 struct task_struct *chosen = NULL;
1482
1483 /*
1484 * If current has a pending SIGKILL, then automatically select it. The
1485 * goal is to allow it to allocate so that it may quickly exit and free
1486 * its memory.
1487 */
1488 if (fatal_signal_pending(current)) {
1489 set_thread_flag(TIF_MEMDIE);
1490 return;
1491 }
1492
1493 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1494 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1495 for_each_mem_cgroup_tree(iter, memcg) {
1496 struct cgroup *cgroup = iter->css.cgroup;
1497 struct cgroup_iter it;
1498 struct task_struct *task;
1499
1500 cgroup_iter_start(cgroup, &it);
1501 while ((task = cgroup_iter_next(cgroup, &it))) {
1502 switch (oom_scan_process_thread(task, totalpages, NULL,
1503 false)) {
1504 case OOM_SCAN_SELECT:
1505 if (chosen)
1506 put_task_struct(chosen);
1507 chosen = task;
1508 chosen_points = ULONG_MAX;
1509 get_task_struct(chosen);
1510 /* fall through */
1511 case OOM_SCAN_CONTINUE:
1512 continue;
1513 case OOM_SCAN_ABORT:
1514 cgroup_iter_end(cgroup, &it);
1515 mem_cgroup_iter_break(memcg, iter);
1516 if (chosen)
1517 put_task_struct(chosen);
1518 return;
1519 case OOM_SCAN_OK:
1520 break;
1521 };
1522 points = oom_badness(task, memcg, NULL, totalpages);
1523 if (points > chosen_points) {
1524 if (chosen)
1525 put_task_struct(chosen);
1526 chosen = task;
1527 chosen_points = points;
1528 get_task_struct(chosen);
1529 }
1530 }
1531 cgroup_iter_end(cgroup, &it);
1532 }
1533
1534 if (!chosen)
1535 return;
1536 points = chosen_points * 1000 / totalpages;
1537 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1538 NULL, "Memory cgroup out of memory");
1539}
1540
1505static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, 1541static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1506 gfp_t gfp_mask, 1542 gfp_t gfp_mask,
1507 unsigned long flags) 1543 unsigned long flags)
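The mem_cgroup_out_of_memory() added above is, at heart, a max-scan: walk every task in the memcg hierarchy, score it with oom_badness() against the group's page limit, and remember the highest scorer, with shortcuts for tasks that should be selected or must abort the scan outright. The stand-alone C below models only that selection loop; toy_task, rss_pages and the scaling are illustrative stand-ins, not the kernel's task_struct or oom_badness().

/* Toy model of the memcg OOM victim scan -- not the kernel implementation. */
#include <stdio.h>

struct toy_task {
	const char *name;
	unsigned long rss_pages;   /* stand-in for what oom_badness() weighs */
	int oom_disabled;          /* stand-in for OOM_SCAN_CONTINUE cases   */
};

/* Return the task with the highest badness score, or NULL if none qualifies. */
static struct toy_task *pick_victim(struct toy_task *tasks, int n,
				    unsigned long totalpages)
{
	struct toy_task *chosen = NULL;
	unsigned long chosen_points = 0;
	int i;

	for (i = 0; i < n; i++) {
		unsigned long points;

		if (tasks[i].oom_disabled)
			continue;                 /* skip, like OOM_SCAN_CONTINUE */
		points = tasks[i].rss_pages * 1000 / totalpages;
		if (!chosen || points > chosen_points) {
			chosen = &tasks[i];       /* new best candidate */
			chosen_points = points;
		}
	}
	return chosen;
}

int main(void)
{
	struct toy_task tasks[] = {
		{ "worker",       2000, 0 },
		{ "cache-filler", 30000, 0 },
		{ "protected",    50000, 1 },
	};
	struct toy_task *victim = pick_victim(tasks, 3, 65536);

	printf("would kill: %s\n", victim ? victim->name : "(nobody)");
	return 0;
}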
@@ -1540,7 +1576,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1540 1576
1541/** 1577/**
1542 * test_mem_cgroup_node_reclaimable 1578 * test_mem_cgroup_node_reclaimable
1543 * @mem: the target memcg 1579 * @memcg: the target memcg
1544 * @nid: the node ID to be checked. 1580 * @nid: the node ID to be checked.
1545 * @noswap : specify true here if the user wants file only information. 1581 * @noswap : specify true here if the user wants file only information.
1546 * 1582 *
@@ -1634,7 +1670,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1634 * unused nodes. But scan_nodes is lazily updated and may not contain 1670 * unused nodes. But scan_nodes is lazily updated and may not contain
1635 * enough new information. We need to do double check. 1671 * enough new information. We need to do double check.
1636 */ 1672 */
1637bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1673static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1638{ 1674{
1639 int nid; 1675 int nid;
1640 1676
@@ -1669,7 +1705,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1669 return 0; 1705 return 0;
1670} 1706}
1671 1707
1672bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1708static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1673{ 1709{
1674 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 1710 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1675} 1711}
@@ -1843,7 +1879,8 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
1843/* 1879/*
1844 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 1880 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1845 */ 1881 */
1846bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1882static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
1883 int order)
1847{ 1884{
1848 struct oom_wait_info owait; 1885 struct oom_wait_info owait;
1849 bool locked, need_to_kill; 1886 bool locked, need_to_kill;
@@ -1930,7 +1967,7 @@ again:
1930 return; 1967 return;
1931 /* 1968 /*
1932 * If this memory cgroup is not under account moving, we don't 1969 * If this memory cgroup is not under account moving, we don't
1933 * need to take move_lock_page_cgroup(). Because we already hold 1970 * need to take move_lock_mem_cgroup(). Because we already hold
1934 * rcu_read_lock(), any calls to move_account will be delayed until 1971 * rcu_read_lock(), any calls to move_account will be delayed until
1935 * rcu_read_unlock() if mem_cgroup_stolen() == true. 1972 * rcu_read_unlock() if mem_cgroup_stolen() == true.
1936 */ 1973 */
@@ -1952,7 +1989,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
1952 /* 1989 /*
1953 * It's guaranteed that pc->mem_cgroup never changes while 1990 * It's guaranteed that pc->mem_cgroup never changes while
1954 * lock is held because a routine modifies pc->mem_cgroup 1991 * lock is held because a routine modifies pc->mem_cgroup
1955 * should take move_lock_page_cgroup(). 1992 * should take move_lock_mem_cgroup().
1956 */ 1993 */
1957 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 1994 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
1958} 1995}
@@ -1992,7 +2029,7 @@ struct memcg_stock_pcp {
1992 unsigned int nr_pages; 2029 unsigned int nr_pages;
1993 struct work_struct work; 2030 struct work_struct work;
1994 unsigned long flags; 2031 unsigned long flags;
1995#define FLUSHING_CACHED_CHARGE (0) 2032#define FLUSHING_CACHED_CHARGE 0
1996}; 2033};
1997static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2034static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1998static DEFINE_MUTEX(percpu_charge_mutex); 2035static DEFINE_MUTEX(percpu_charge_mutex);
@@ -2139,7 +2176,7 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2139 int i; 2176 int i;
2140 2177
2141 spin_lock(&memcg->pcp_counter_lock); 2178 spin_lock(&memcg->pcp_counter_lock);
2142 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { 2179 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2143 long x = per_cpu(memcg->stat->count[i], cpu); 2180 long x = per_cpu(memcg->stat->count[i], cpu);
2144 2181
2145 per_cpu(memcg->stat->count[i], cpu) = 0; 2182 per_cpu(memcg->stat->count[i], cpu) = 0;
@@ -2165,7 +2202,7 @@ static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
2165 if (action == CPU_ONLINE) 2202 if (action == CPU_ONLINE)
2166 return NOTIFY_OK; 2203 return NOTIFY_OK;
2167 2204
2168 if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) 2205 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
2169 return NOTIFY_OK; 2206 return NOTIFY_OK;
2170 2207
2171 for_each_mem_cgroup(iter) 2208 for_each_mem_cgroup(iter)
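The notifier hunk above fixes a condition that was always true: (action != CPU_DEAD) || action != CPU_DEAD_FROZEN holds for every possible action, since no value equals both constants, so the callback always returned early and the per-memcg drain below it never ran. The corrected && form is the De Morgan complement of "action is one of the two DEAD events". A throwaway comparison, using made-up constant values rather than the real notifier codes, makes the difference visible:

#include <stdio.h>

enum { CPU_ONLINE = 1, CPU_DEAD = 2, CPU_DEAD_FROZEN = 3 };

/* Old test: true for every action, so the drain code never ran. */
static int skip_buggy(int action)
{
	return (action != CPU_DEAD) || (action != CPU_DEAD_FROZEN);
}

/* New test: skip only actions that are neither DEAD notification. */
static int skip_fixed(int action)
{
	return action != CPU_DEAD && action != CPU_DEAD_FROZEN;
}

int main(void)
{
	int actions[] = { CPU_ONLINE, CPU_DEAD, CPU_DEAD_FROZEN };
	int i;

	for (i = 0; i < 3; i++)
		printf("action=%d  old-skip=%d  new-skip=%d\n",
		       actions[i], skip_buggy(actions[i]), skip_fixed(actions[i]));
	return 0;
}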
@@ -2299,7 +2336,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2299 * We always charge the cgroup the mm_struct belongs to. 2336 * We always charge the cgroup the mm_struct belongs to.
2300 * The mm_struct's mem_cgroup changes on task migration if the 2337 * The mm_struct's mem_cgroup changes on task migration if the
2301 * thread group leader migrates. It's possible that mm is not 2338 * thread group leader migrates. It's possible that mm is not
2302 * set, if so charge the init_mm (happens for pagecache usage). 2339 * set, if so charge the root memcg (happens for pagecache usage).
2303 */ 2340 */
2304 if (!*ptr && !mm) 2341 if (!*ptr && !mm)
2305 *ptr = root_mem_cgroup; 2342 *ptr = root_mem_cgroup;
@@ -2427,6 +2464,24 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2427} 2464}
2428 2465
2429/* 2466/*
2467 * Cancel charges in this cgroup....doesn't propagate to parent cgroup.
2468 * This is useful when moving usage to parent cgroup.
2469 */
2470static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2471 unsigned int nr_pages)
2472{
2473 unsigned long bytes = nr_pages * PAGE_SIZE;
2474
2475 if (mem_cgroup_is_root(memcg))
2476 return;
2477
2478 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2479 if (do_swap_account)
2480 res_counter_uncharge_until(&memcg->memsw,
2481 memcg->memsw.parent, bytes);
2482}
2483
2484/*
2430 * A helper function to get mem_cgroup from ID. must be called under 2485 * A helper function to get mem_cgroup from ID. must be called under
2431 * rcu_read_lock(). The caller must check css_is_removed() or some if 2486 * rcu_read_lock(). The caller must check css_is_removed() or some if
2432 * it's concern. (dropping refcnt from swap can be called against removed 2487 * it's concern. (dropping refcnt from swap can be called against removed
@@ -2442,7 +2497,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2442 css = css_lookup(&mem_cgroup_subsys, id); 2497 css = css_lookup(&mem_cgroup_subsys, id);
2443 if (!css) 2498 if (!css)
2444 return NULL; 2499 return NULL;
2445 return container_of(css, struct mem_cgroup, css); 2500 return mem_cgroup_from_css(css);
2446} 2501}
2447 2502
2448struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2503struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -2476,20 +2531,17 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2476static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, 2531static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2477 struct page *page, 2532 struct page *page,
2478 unsigned int nr_pages, 2533 unsigned int nr_pages,
2479 struct page_cgroup *pc,
2480 enum charge_type ctype, 2534 enum charge_type ctype,
2481 bool lrucare) 2535 bool lrucare)
2482{ 2536{
2537 struct page_cgroup *pc = lookup_page_cgroup(page);
2483 struct zone *uninitialized_var(zone); 2538 struct zone *uninitialized_var(zone);
2539 struct lruvec *lruvec;
2484 bool was_on_lru = false; 2540 bool was_on_lru = false;
2485 bool anon; 2541 bool anon;
2486 2542
2487 lock_page_cgroup(pc); 2543 lock_page_cgroup(pc);
2488 if (unlikely(PageCgroupUsed(pc))) { 2544 VM_BUG_ON(PageCgroupUsed(pc));
2489 unlock_page_cgroup(pc);
2490 __mem_cgroup_cancel_charge(memcg, nr_pages);
2491 return;
2492 }
2493 /* 2545 /*
2494 * we don't need page_cgroup_lock about tail pages, because they are not 2546 * we don't need page_cgroup_lock about tail pages, because they are not
2495 * accessed by any other context at this point. 2547 * accessed by any other context at this point.
@@ -2503,8 +2555,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2503 zone = page_zone(page); 2555 zone = page_zone(page);
2504 spin_lock_irq(&zone->lru_lock); 2556 spin_lock_irq(&zone->lru_lock);
2505 if (PageLRU(page)) { 2557 if (PageLRU(page)) {
2558 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2506 ClearPageLRU(page); 2559 ClearPageLRU(page);
2507 del_page_from_lru_list(zone, page, page_lru(page)); 2560 del_page_from_lru_list(page, lruvec, page_lru(page));
2508 was_on_lru = true; 2561 was_on_lru = true;
2509 } 2562 }
2510 } 2563 }
@@ -2522,14 +2575,15 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2522 2575
2523 if (lrucare) { 2576 if (lrucare) {
2524 if (was_on_lru) { 2577 if (was_on_lru) {
2578 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2525 VM_BUG_ON(PageLRU(page)); 2579 VM_BUG_ON(PageLRU(page));
2526 SetPageLRU(page); 2580 SetPageLRU(page);
2527 add_page_to_lru_list(zone, page, page_lru(page)); 2581 add_page_to_lru_list(page, lruvec, page_lru(page));
2528 } 2582 }
2529 spin_unlock_irq(&zone->lru_lock); 2583 spin_unlock_irq(&zone->lru_lock);
2530 } 2584 }
2531 2585
2532 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2586 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2533 anon = true; 2587 anon = true;
2534 else 2588 else
2535 anon = false; 2589 anon = false;
@@ -2547,7 +2601,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2547 2601
2548#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2602#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2549 2603
2550#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION)) 2604#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
2551/* 2605/*
2552 * Because tail pages are not marked as "used", set it. We're under 2606 * Because tail pages are not marked as "used", set it. We're under
2553 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2607 * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -2578,23 +2632,19 @@ void mem_cgroup_split_huge_fixup(struct page *head)
2578 * @pc: page_cgroup of the page. 2632 * @pc: page_cgroup of the page.
2579 * @from: mem_cgroup which the page is moved from. 2633 * @from: mem_cgroup which the page is moved from.
2580 * @to: mem_cgroup which the page is moved to. @from != @to. 2634 * @to: mem_cgroup which the page is moved to. @from != @to.
2581 * @uncharge: whether we should call uncharge and css_put against @from.
2582 * 2635 *
2583 * The caller must confirm following. 2636 * The caller must confirm following.
2584 * - page is not on LRU (isolate_page() is useful.) 2637 * - page is not on LRU (isolate_page() is useful.)
2585 * - compound_lock is held when nr_pages > 1 2638 * - compound_lock is held when nr_pages > 1
2586 * 2639 *
2587 * This function doesn't do "charge" nor css_get to new cgroup. It should be 2640 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
2588 * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is 2641 * from old cgroup.
2589 * true, this function does "uncharge" from old cgroup, but it doesn't if
2590 * @uncharge is false, so a caller should do "uncharge".
2591 */ 2642 */
2592static int mem_cgroup_move_account(struct page *page, 2643static int mem_cgroup_move_account(struct page *page,
2593 unsigned int nr_pages, 2644 unsigned int nr_pages,
2594 struct page_cgroup *pc, 2645 struct page_cgroup *pc,
2595 struct mem_cgroup *from, 2646 struct mem_cgroup *from,
2596 struct mem_cgroup *to, 2647 struct mem_cgroup *to)
2597 bool uncharge)
2598{ 2648{
2599 unsigned long flags; 2649 unsigned long flags;
2600 int ret; 2650 int ret;
@@ -2628,9 +2678,6 @@ static int mem_cgroup_move_account(struct page *page,
2628 preempt_enable(); 2678 preempt_enable();
2629 } 2679 }
2630 mem_cgroup_charge_statistics(from, anon, -nr_pages); 2680 mem_cgroup_charge_statistics(from, anon, -nr_pages);
2631 if (uncharge)
2632 /* This is not "cancel", but cancel_charge does all we need. */
2633 __mem_cgroup_cancel_charge(from, nr_pages);
2634 2681
2635 /* caller should have done css_get */ 2682 /* caller should have done css_get */
2636 pc->mem_cgroup = to; 2683 pc->mem_cgroup = to;
@@ -2661,18 +2708,15 @@ out:
2661 2708
2662static int mem_cgroup_move_parent(struct page *page, 2709static int mem_cgroup_move_parent(struct page *page,
2663 struct page_cgroup *pc, 2710 struct page_cgroup *pc,
2664 struct mem_cgroup *child, 2711 struct mem_cgroup *child)
2665 gfp_t gfp_mask)
2666{ 2712{
2667 struct cgroup *cg = child->css.cgroup;
2668 struct cgroup *pcg = cg->parent;
2669 struct mem_cgroup *parent; 2713 struct mem_cgroup *parent;
2670 unsigned int nr_pages; 2714 unsigned int nr_pages;
2671 unsigned long uninitialized_var(flags); 2715 unsigned long uninitialized_var(flags);
2672 int ret; 2716 int ret;
2673 2717
2674 /* Is ROOT ? */ 2718 /* Is ROOT ? */
2675 if (!pcg) 2719 if (mem_cgroup_is_root(child))
2676 return -EINVAL; 2720 return -EINVAL;
2677 2721
2678 ret = -EBUSY; 2722 ret = -EBUSY;
@@ -2683,21 +2727,23 @@ static int mem_cgroup_move_parent(struct page *page,
2683 2727
2684 nr_pages = hpage_nr_pages(page); 2728 nr_pages = hpage_nr_pages(page);
2685 2729
2686 parent = mem_cgroup_from_cont(pcg); 2730 parent = parent_mem_cgroup(child);
2687 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); 2731 /*
2688 if (ret) 2732 * If no parent, move charges to root cgroup.
2689 goto put_back; 2733 */
2734 if (!parent)
2735 parent = root_mem_cgroup;
2690 2736
2691 if (nr_pages > 1) 2737 if (nr_pages > 1)
2692 flags = compound_lock_irqsave(page); 2738 flags = compound_lock_irqsave(page);
2693 2739
2694 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); 2740 ret = mem_cgroup_move_account(page, nr_pages,
2695 if (ret) 2741 pc, child, parent);
2696 __mem_cgroup_cancel_charge(parent, nr_pages); 2742 if (!ret)
2743 __mem_cgroup_cancel_local_charge(child, nr_pages);
2697 2744
2698 if (nr_pages > 1) 2745 if (nr_pages > 1)
2699 compound_unlock_irqrestore(page, flags); 2746 compound_unlock_irqrestore(page, flags);
2700put_back:
2701 putback_lru_page(page); 2747 putback_lru_page(page);
2702put: 2748put:
2703 put_page(page); 2749 put_page(page);
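With mem_cgroup_move_parent() reworked as above, the child no longer pre-charges the parent and rolls the charge back on failure; the page's accounting is moved first, and the child's local charge is then dropped with res_counter_uncharge_until(), which walks the counter chain only up to, and not including, the parent, so the parent stays charged. A small user-space model of that idea, with invented toy_* types rather than the kernel's res_counter, is sketched here:

#include <stdio.h>

/* Toy hierarchical counter: charging a node also charges its ancestors. */
struct toy_counter {
	const char *name;
	long usage;
	struct toy_counter *parent;
};

static void toy_charge(struct toy_counter *c, long nr)
{
	for (; c; c = c->parent)
		c->usage += nr;
}

/* Uncharge from @c up the chain, stopping before @until (kept charged). */
static void toy_uncharge_until(struct toy_counter *c,
			       struct toy_counter *until, long nr)
{
	for (; c && c != until; c = c->parent)
		c->usage -= nr;
}

int main(void)
{
	struct toy_counter root  = { "root",  0, NULL };
	struct toy_counter child = { "child", 0, &root };

	toy_charge(&child, 1);                /* page charged to the child      */
	toy_uncharge_until(&child, &root, 1); /* move it to the parent: only    */
					      /* the child's usage drops        */
	printf("%s=%ld %s=%ld\n", child.name, child.usage, root.name, root.usage);
	return 0;
}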
@@ -2716,7 +2762,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2716{ 2762{
2717 struct mem_cgroup *memcg = NULL; 2763 struct mem_cgroup *memcg = NULL;
2718 unsigned int nr_pages = 1; 2764 unsigned int nr_pages = 1;
2719 struct page_cgroup *pc;
2720 bool oom = true; 2765 bool oom = true;
2721 int ret; 2766 int ret;
2722 2767
@@ -2730,11 +2775,10 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2730 oom = false; 2775 oom = false;
2731 } 2776 }
2732 2777
2733 pc = lookup_page_cgroup(page);
2734 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); 2778 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
2735 if (ret == -ENOMEM) 2779 if (ret == -ENOMEM)
2736 return ret; 2780 return ret;
2737 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype, false); 2781 __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
2738 return 0; 2782 return 0;
2739} 2783}
2740 2784
@@ -2747,38 +2791,7 @@ int mem_cgroup_newpage_charge(struct page *page,
2747 VM_BUG_ON(page->mapping && !PageAnon(page)); 2791 VM_BUG_ON(page->mapping && !PageAnon(page));
2748 VM_BUG_ON(!mm); 2792 VM_BUG_ON(!mm);
2749 return mem_cgroup_charge_common(page, mm, gfp_mask, 2793 return mem_cgroup_charge_common(page, mm, gfp_mask,
2750 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2794 MEM_CGROUP_CHARGE_TYPE_ANON);
2751}
2752
2753static void
2754__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2755 enum charge_type ctype);
2756
2757int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2758 gfp_t gfp_mask)
2759{
2760 struct mem_cgroup *memcg = NULL;
2761 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2762 int ret;
2763
2764 if (mem_cgroup_disabled())
2765 return 0;
2766 if (PageCompound(page))
2767 return 0;
2768
2769 if (unlikely(!mm))
2770 mm = &init_mm;
2771 if (!page_is_file_cache(page))
2772 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2773
2774 if (!PageSwapCache(page))
2775 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2776 else { /* page is swapcache/shmem */
2777 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2778 if (!ret)
2779 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2780 }
2781 return ret;
2782} 2795}
2783 2796
2784/* 2797/*
@@ -2787,27 +2800,26 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2787 * struct page_cgroup is acquired. This refcnt will be consumed by 2800 * struct page_cgroup is acquired. This refcnt will be consumed by
2788 * "commit()" or removed by "cancel()" 2801 * "commit()" or removed by "cancel()"
2789 */ 2802 */
2790int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2803static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2791 struct page *page, 2804 struct page *page,
2792 gfp_t mask, struct mem_cgroup **memcgp) 2805 gfp_t mask,
2806 struct mem_cgroup **memcgp)
2793{ 2807{
2794 struct mem_cgroup *memcg; 2808 struct mem_cgroup *memcg;
2809 struct page_cgroup *pc;
2795 int ret; 2810 int ret;
2796 2811
2797 *memcgp = NULL; 2812 pc = lookup_page_cgroup(page);
2798
2799 if (mem_cgroup_disabled())
2800 return 0;
2801
2802 if (!do_swap_account)
2803 goto charge_cur_mm;
2804 /* 2813 /*
2805 * A racing thread's fault, or swapoff, may have already updated 2814 * Every swap fault against a single page tries to charge the
2806 * the pte, and even removed page from swap cache: in those cases 2815 * page, bail as early as possible. shmem_unuse() encounters
2807 * do_swap_page()'s pte_same() test will fail; but there's also a 2816 * already charged pages, too. The USED bit is protected by
2808 * KSM case which does need to charge the page. 2817 * the page lock, which serializes swap cache removal, which
2818 * in turn serializes uncharging.
2809 */ 2819 */
2810 if (!PageSwapCache(page)) 2820 if (PageCgroupUsed(pc))
2821 return 0;
2822 if (!do_swap_account)
2811 goto charge_cur_mm; 2823 goto charge_cur_mm;
2812 memcg = try_get_mem_cgroup_from_page(page); 2824 memcg = try_get_mem_cgroup_from_page(page);
2813 if (!memcg) 2825 if (!memcg)
@@ -2819,28 +2831,55 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2819 ret = 0; 2831 ret = 0;
2820 return ret; 2832 return ret;
2821charge_cur_mm: 2833charge_cur_mm:
2822 if (unlikely(!mm))
2823 mm = &init_mm;
2824 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); 2834 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
2825 if (ret == -EINTR) 2835 if (ret == -EINTR)
2826 ret = 0; 2836 ret = 0;
2827 return ret; 2837 return ret;
2828} 2838}
2829 2839
2840int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
2841 gfp_t gfp_mask, struct mem_cgroup **memcgp)
2842{
2843 *memcgp = NULL;
2844 if (mem_cgroup_disabled())
2845 return 0;
2846 /*
2847 * A racing thread's fault, or swapoff, may have already
2848 * updated the pte, and even removed page from swap cache: in
2849 * those cases unuse_pte()'s pte_same() test will fail; but
2850 * there's also a KSM case which does need to charge the page.
2851 */
2852 if (!PageSwapCache(page)) {
2853 int ret;
2854
2855 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
2856 if (ret == -EINTR)
2857 ret = 0;
2858 return ret;
2859 }
2860 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
2861}
2862
2863void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
2864{
2865 if (mem_cgroup_disabled())
2866 return;
2867 if (!memcg)
2868 return;
2869 __mem_cgroup_cancel_charge(memcg, 1);
2870}
2871
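The split above keeps the three-step protocol the swap-in path already used: reserve the charge (try), then either commit it once the fault is known to have won the race for the pte, or cancel it and return the reservation. A compact user-space model of that reserve/commit/cancel flow against a plain counter follows; the toy_* names are invented and stand in for the memcg API only loosely:

#include <stdbool.h>
#include <stdio.h>

struct toy_counter { long usage, limit; };

/* Step 1: reserve the charge; fail if it would push usage past the limit. */
static bool toy_try_charge(struct toy_counter *c, long nr)
{
	if (c->usage + nr > c->limit)
		return false;
	c->usage += nr;
	return true;
}

/* Step 2a: the reservation simply becomes the real charge. */
static void toy_commit_charge(struct toy_counter *c, long nr)
{
	(void)c;
	(void)nr;
}

/* Step 2b: a racing thread already handled the fault -- give it back. */
static void toy_cancel_charge(struct toy_counter *c, long nr)
{
	c->usage -= nr;
}

int main(void)
{
	struct toy_counter memcg = { .usage = 3, .limit = 4 };
	bool lost_race = true;          /* pretend pte_same() failed */

	if (toy_try_charge(&memcg, 1)) {
		if (lost_race)
			toy_cancel_charge(&memcg, 1);
		else
			toy_commit_charge(&memcg, 1);
	}
	printf("usage=%ld limit=%ld\n", memcg.usage, memcg.limit);
	return 0;
}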
2830static void 2872static void
2831__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 2873__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2832 enum charge_type ctype) 2874 enum charge_type ctype)
2833{ 2875{
2834 struct page_cgroup *pc;
2835
2836 if (mem_cgroup_disabled()) 2876 if (mem_cgroup_disabled())
2837 return; 2877 return;
2838 if (!memcg) 2878 if (!memcg)
2839 return; 2879 return;
2840 cgroup_exclude_rmdir(&memcg->css); 2880 cgroup_exclude_rmdir(&memcg->css);
2841 2881
2842 pc = lookup_page_cgroup(page); 2882 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
2843 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype, true);
2844 /* 2883 /*
2845 * Now swap is on-memory. This means this page may be 2884 * Now swap is on-memory. This means this page may be
2846 * counted both as mem and swap....double count. 2885 * counted both as mem and swap....double count.
@@ -2850,24 +2889,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2850 */ 2889 */
2851 if (do_swap_account && PageSwapCache(page)) { 2890 if (do_swap_account && PageSwapCache(page)) {
2852 swp_entry_t ent = {.val = page_private(page)}; 2891 swp_entry_t ent = {.val = page_private(page)};
2853 struct mem_cgroup *swap_memcg; 2892 mem_cgroup_uncharge_swap(ent);
2854 unsigned short id;
2855
2856 id = swap_cgroup_record(ent, 0);
2857 rcu_read_lock();
2858 swap_memcg = mem_cgroup_lookup(id);
2859 if (swap_memcg) {
2860 /*
2861 * This recorded memcg can be obsolete one. So, avoid
2862 * calling css_tryget
2863 */
2864 if (!mem_cgroup_is_root(swap_memcg))
2865 res_counter_uncharge(&swap_memcg->memsw,
2866 PAGE_SIZE);
2867 mem_cgroup_swap_statistics(swap_memcg, false);
2868 mem_cgroup_put(swap_memcg);
2869 }
2870 rcu_read_unlock();
2871 } 2893 }
2872 /* 2894 /*
2873 * At swapin, we may charge account against cgroup which has no tasks. 2895 * At swapin, we may charge account against cgroup which has no tasks.
@@ -2881,16 +2903,30 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
2881 struct mem_cgroup *memcg) 2903 struct mem_cgroup *memcg)
2882{ 2904{
2883 __mem_cgroup_commit_charge_swapin(page, memcg, 2905 __mem_cgroup_commit_charge_swapin(page, memcg,
2884 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2906 MEM_CGROUP_CHARGE_TYPE_ANON);
2885} 2907}
2886 2908
2887void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 2909int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2910 gfp_t gfp_mask)
2888{ 2911{
2912 struct mem_cgroup *memcg = NULL;
2913 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2914 int ret;
2915
2889 if (mem_cgroup_disabled()) 2916 if (mem_cgroup_disabled())
2890 return; 2917 return 0;
2891 if (!memcg) 2918 if (PageCompound(page))
2892 return; 2919 return 0;
2893 __mem_cgroup_cancel_charge(memcg, 1); 2920
2921 if (!PageSwapCache(page))
2922 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2923 else { /* page is swapcache/shmem */
2924 ret = __mem_cgroup_try_charge_swapin(mm, page,
2925 gfp_mask, &memcg);
2926 if (!ret)
2927 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2928 }
2929 return ret;
2894} 2930}
2895 2931
2896static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, 2932static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
@@ -2950,7 +2986,8 @@ direct_uncharge:
2950 * uncharge if !page_mapped(page) 2986 * uncharge if !page_mapped(page)
2951 */ 2987 */
2952static struct mem_cgroup * 2988static struct mem_cgroup *
2953__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2989__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
2990 bool end_migration)
2954{ 2991{
2955 struct mem_cgroup *memcg = NULL; 2992 struct mem_cgroup *memcg = NULL;
2956 unsigned int nr_pages = 1; 2993 unsigned int nr_pages = 1;
@@ -2960,8 +2997,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2960 if (mem_cgroup_disabled()) 2997 if (mem_cgroup_disabled())
2961 return NULL; 2998 return NULL;
2962 2999
2963 if (PageSwapCache(page)) 3000 VM_BUG_ON(PageSwapCache(page));
2964 return NULL;
2965 3001
2966 if (PageTransHuge(page)) { 3002 if (PageTransHuge(page)) {
2967 nr_pages <<= compound_order(page); 3003 nr_pages <<= compound_order(page);
@@ -2984,7 +3020,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2984 anon = PageAnon(page); 3020 anon = PageAnon(page);
2985 3021
2986 switch (ctype) { 3022 switch (ctype) {
2987 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 3023 case MEM_CGROUP_CHARGE_TYPE_ANON:
2988 /* 3024 /*
2989 * Generally PageAnon tells if it's the anon statistics to be 3025 * Generally PageAnon tells if it's the anon statistics to be
2990 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is 3026 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
@@ -2994,7 +3030,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2994 /* fallthrough */ 3030 /* fallthrough */
2995 case MEM_CGROUP_CHARGE_TYPE_DROP: 3031 case MEM_CGROUP_CHARGE_TYPE_DROP:
2996 /* See mem_cgroup_prepare_migration() */ 3032 /* See mem_cgroup_prepare_migration() */
2997 if (page_mapped(page) || PageCgroupMigration(pc)) 3033 if (page_mapped(page))
3034 goto unlock_out;
3035 /*
3036 * Pages under migration may not be uncharged. But
3037 * end_migration() /must/ be the one uncharging the
3038 * unused post-migration page and so it has to call
3039 * here with the migration bit still set. See the
3040 * res_counter handling below.
3041 */
3042 if (!end_migration && PageCgroupMigration(pc))
2998 goto unlock_out; 3043 goto unlock_out;
2999 break; 3044 break;
3000 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 3045 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
@@ -3028,7 +3073,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3028 mem_cgroup_swap_statistics(memcg, true); 3073 mem_cgroup_swap_statistics(memcg, true);
3029 mem_cgroup_get(memcg); 3074 mem_cgroup_get(memcg);
3030 } 3075 }
3031 if (!mem_cgroup_is_root(memcg)) 3076 /*
3077 * Migration does not charge the res_counter for the
3078 * replacement page, so leave it alone when phasing out the
3079 * page that is unused after the migration.
3080 */
3081 if (!end_migration && !mem_cgroup_is_root(memcg))
3032 mem_cgroup_do_uncharge(memcg, nr_pages, ctype); 3082 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
3033 3083
3034 return memcg; 3084 return memcg;
@@ -3044,14 +3094,16 @@ void mem_cgroup_uncharge_page(struct page *page)
3044 if (page_mapped(page)) 3094 if (page_mapped(page))
3045 return; 3095 return;
3046 VM_BUG_ON(page->mapping && !PageAnon(page)); 3096 VM_BUG_ON(page->mapping && !PageAnon(page));
3047 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 3097 if (PageSwapCache(page))
3098 return;
3099 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
3048} 3100}
3049 3101
3050void mem_cgroup_uncharge_cache_page(struct page *page) 3102void mem_cgroup_uncharge_cache_page(struct page *page)
3051{ 3103{
3052 VM_BUG_ON(page_mapped(page)); 3104 VM_BUG_ON(page_mapped(page));
3053 VM_BUG_ON(page->mapping); 3105 VM_BUG_ON(page->mapping);
3054 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 3106 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
3055} 3107}
3056 3108
3057/* 3109/*
@@ -3115,7 +3167,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3115 if (!swapout) /* this was a swap cache but the swap is unused ! */ 3167 if (!swapout) /* this was a swap cache but the swap is unused ! */
3116 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 3168 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3117 3169
3118 memcg = __mem_cgroup_uncharge_common(page, ctype); 3170 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
3119 3171
3120 /* 3172 /*
3121 * record memcg information, if swapout && memcg != NULL, 3173 * record memcg information, if swapout && memcg != NULL,
@@ -3126,7 +3178,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3126} 3178}
3127#endif 3179#endif
3128 3180
3129#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3181#ifdef CONFIG_MEMCG_SWAP
3130/* 3182/*
3131 * called from swap_entry_free(). remove record in swap_cgroup and 3183 * called from swap_entry_free(). remove record in swap_cgroup and
3132 * uncharge "memsw" account. 3184 * uncharge "memsw" account.
@@ -3160,7 +3212,6 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
3160 * @entry: swap entry to be moved 3212 * @entry: swap entry to be moved
3161 * @from: mem_cgroup which the entry is moved from 3213 * @from: mem_cgroup which the entry is moved from
3162 * @to: mem_cgroup which the entry is moved to 3214 * @to: mem_cgroup which the entry is moved to
3163 * @need_fixup: whether we should fixup res_counters and refcounts.
3164 * 3215 *
3165 * It succeeds only when the swap_cgroup's record for this entry is the same 3216 * It succeeds only when the swap_cgroup's record for this entry is the same
3166 * as the mem_cgroup's id of @from. 3217 * as the mem_cgroup's id of @from.
@@ -3171,7 +3222,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
3171 * both res and memsw, and called css_get(). 3222 * both res and memsw, and called css_get().
3172 */ 3223 */
3173static int mem_cgroup_move_swap_account(swp_entry_t entry, 3224static int mem_cgroup_move_swap_account(swp_entry_t entry,
3174 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3225 struct mem_cgroup *from, struct mem_cgroup *to)
3175{ 3226{
3176 unsigned short old_id, new_id; 3227 unsigned short old_id, new_id;
3177 3228
@@ -3190,24 +3241,13 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
3190 * swap-in, the refcount of @to might be decreased to 0. 3241 * swap-in, the refcount of @to might be decreased to 0.
3191 */ 3242 */
3192 mem_cgroup_get(to); 3243 mem_cgroup_get(to);
3193 if (need_fixup) {
3194 if (!mem_cgroup_is_root(from))
3195 res_counter_uncharge(&from->memsw, PAGE_SIZE);
3196 mem_cgroup_put(from);
3197 /*
3198 * we charged both to->res and to->memsw, so we should
3199 * uncharge to->res.
3200 */
3201 if (!mem_cgroup_is_root(to))
3202 res_counter_uncharge(&to->res, PAGE_SIZE);
3203 }
3204 return 0; 3244 return 0;
3205 } 3245 }
3206 return -EINVAL; 3246 return -EINVAL;
3207} 3247}
3208#else 3248#else
3209static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3249static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3210 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3250 struct mem_cgroup *from, struct mem_cgroup *to)
3211{ 3251{
3212 return -EINVAL; 3252 return -EINVAL;
3213} 3253}
@@ -3217,19 +3257,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3217 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 3257 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
3218 * page belongs to. 3258 * page belongs to.
3219 */ 3259 */
3220int mem_cgroup_prepare_migration(struct page *page, 3260void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3221 struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) 3261 struct mem_cgroup **memcgp)
3222{ 3262{
3223 struct mem_cgroup *memcg = NULL; 3263 struct mem_cgroup *memcg = NULL;
3224 struct page_cgroup *pc; 3264 struct page_cgroup *pc;
3225 enum charge_type ctype; 3265 enum charge_type ctype;
3226 int ret = 0;
3227 3266
3228 *memcgp = NULL; 3267 *memcgp = NULL;
3229 3268
3230 VM_BUG_ON(PageTransHuge(page)); 3269 VM_BUG_ON(PageTransHuge(page));
3231 if (mem_cgroup_disabled()) 3270 if (mem_cgroup_disabled())
3232 return 0; 3271 return;
3233 3272
3234 pc = lookup_page_cgroup(page); 3273 pc = lookup_page_cgroup(page);
3235 lock_page_cgroup(pc); 3274 lock_page_cgroup(pc);
@@ -3274,39 +3313,25 @@ int mem_cgroup_prepare_migration(struct page *page,
3274 * we return here. 3313 * we return here.
3275 */ 3314 */
3276 if (!memcg) 3315 if (!memcg)
3277 return 0; 3316 return;
3278 3317
3279 *memcgp = memcg; 3318 *memcgp = memcg;
3280 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
3281 css_put(&memcg->css);/* drop extra refcnt */
3282 if (ret) {
3283 if (PageAnon(page)) {
3284 lock_page_cgroup(pc);
3285 ClearPageCgroupMigration(pc);
3286 unlock_page_cgroup(pc);
3287 /*
3288 * The old page may be fully unmapped while we kept it.
3289 */
3290 mem_cgroup_uncharge_page(page);
3291 }
3292 /* we'll need to revisit this error code (we have -EINTR) */
3293 return -ENOMEM;
3294 }
3295 /* 3319 /*
3296 * We charge new page before it's used/mapped. So, even if unlock_page() 3320 * We charge new page before it's used/mapped. So, even if unlock_page()
3297 * is called before end_migration, we can catch all events on this new 3321 * is called before end_migration, we can catch all events on this new
3298 * page. In the case new page is migrated but not remapped, new page's 3322 * page. In the case new page is migrated but not remapped, new page's
3299 * mapcount will be finally 0 and we call uncharge in end_migration(). 3323 * mapcount will be finally 0 and we call uncharge in end_migration().
3300 */ 3324 */
3301 pc = lookup_page_cgroup(newpage);
3302 if (PageAnon(page)) 3325 if (PageAnon(page))
3303 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 3326 ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
3304 else if (page_is_file_cache(page))
3305 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3306 else 3327 else
3307 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3328 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3308 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, ctype, false); 3329 /*
3309 return ret; 3330 * The page is committed to the memcg, but it's not actually
3331 * charged to the res_counter since we plan on replacing the
3332 * old one and only one page is going to be left afterwards.
3333 */
3334 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
3310} 3335}
3311 3336
3312/* remove redundant charge if migration failed*/ 3337/* remove redundant charge if migration failed*/
@@ -3328,6 +3353,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3328 used = newpage; 3353 used = newpage;
3329 unused = oldpage; 3354 unused = oldpage;
3330 } 3355 }
3356 anon = PageAnon(used);
3357 __mem_cgroup_uncharge_common(unused,
3358 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
3359 : MEM_CGROUP_CHARGE_TYPE_CACHE,
3360 true);
3361 css_put(&memcg->css);
3331 /* 3362 /*
3332 * We disallowed uncharge of pages under migration because mapcount 3363 * We disallowed uncharge of pages under migration because mapcount
3333 * of the page goes down to zero, temporarily. 3364 * of the page goes down to zero, temporarily.
@@ -3337,10 +3368,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3337 lock_page_cgroup(pc); 3368 lock_page_cgroup(pc);
3338 ClearPageCgroupMigration(pc); 3369 ClearPageCgroupMigration(pc);
3339 unlock_page_cgroup(pc); 3370 unlock_page_cgroup(pc);
3340 anon = PageAnon(used);
3341 __mem_cgroup_uncharge_common(unused,
3342 anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
3343 : MEM_CGROUP_CHARGE_TYPE_CACHE);
3344 3371
3345 /* 3372 /*
3346 * If a page is a file cache, radix-tree replacement is very atomic 3373 * If a page is a file cache, radix-tree replacement is very atomic
@@ -3369,7 +3396,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3369void mem_cgroup_replace_page_cache(struct page *oldpage, 3396void mem_cgroup_replace_page_cache(struct page *oldpage,
3370 struct page *newpage) 3397 struct page *newpage)
3371{ 3398{
3372 struct mem_cgroup *memcg; 3399 struct mem_cgroup *memcg = NULL;
3373 struct page_cgroup *pc; 3400 struct page_cgroup *pc;
3374 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 3401 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3375 3402
@@ -3379,20 +3406,25 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3379 pc = lookup_page_cgroup(oldpage); 3406 pc = lookup_page_cgroup(oldpage);
3380 /* fix accounting on old pages */ 3407 /* fix accounting on old pages */
3381 lock_page_cgroup(pc); 3408 lock_page_cgroup(pc);
3382 memcg = pc->mem_cgroup; 3409 if (PageCgroupUsed(pc)) {
3383 mem_cgroup_charge_statistics(memcg, false, -1); 3410 memcg = pc->mem_cgroup;
3384 ClearPageCgroupUsed(pc); 3411 mem_cgroup_charge_statistics(memcg, false, -1);
3412 ClearPageCgroupUsed(pc);
3413 }
3385 unlock_page_cgroup(pc); 3414 unlock_page_cgroup(pc);
3386 3415
3387 if (PageSwapBacked(oldpage)) 3416 /*
3388 type = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3417 * When called from shmem_replace_page(), in some cases the
3389 3418 * oldpage has already been charged, and in some cases not.
3419 */
3420 if (!memcg)
3421 return;
3390 /* 3422 /*
3391 * Even if newpage->mapping was NULL before starting replacement, 3423 * Even if newpage->mapping was NULL before starting replacement,
3392 * the newpage may be on LRU(or pagevec for LRU) already. We lock 3424 * the newpage may be on LRU(or pagevec for LRU) already. We lock
3393 * LRU while we overwrite pc->mem_cgroup. 3425 * LRU while we overwrite pc->mem_cgroup.
3394 */ 3426 */
3395 __mem_cgroup_commit_charge(memcg, newpage, 1, pc, type, true); 3427 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
3396} 3428}
3397 3429
3398#ifdef CONFIG_DEBUG_VM 3430#ifdef CONFIG_DEBUG_VM
@@ -3461,7 +3493,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3461 /* 3493 /*
3462 * Rather than hide all in some function, I do this in 3494 * Rather than hide all in some function, I do this in
3463 * open coded manner. You see what this really does. 3495 * open coded manner. You see what this really does.
3464 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3496 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3465 */ 3497 */
3466 mutex_lock(&set_limit_mutex); 3498 mutex_lock(&set_limit_mutex);
3467 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3499 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
@@ -3522,7 +3554,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3522 /* 3554 /*
3523 * Rather than hide all in some function, I do this in 3555 * Rather than hide all in some function, I do this in
3524 * open coded manner. You see what this really does. 3556 * open coded manner. You see what this really does.
3525 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3557 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3526 */ 3558 */
3527 mutex_lock(&set_limit_mutex); 3559 mutex_lock(&set_limit_mutex);
3528 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3560 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -3654,10 +3686,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3654} 3686}
3655 3687
3656/* 3688/*
3657 * This routine traverse page_cgroup in given list and drop them all. 3689 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
3658 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 3690 * reclaim the pages themselves - it just removes the page_cgroups.
3691 * Returns true if some page_cgroups were not freed, indicating that the caller
3692 * must retry this operation.
3659 */ 3693 */
3660static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3694static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3661 int node, int zid, enum lru_list lru) 3695 int node, int zid, enum lru_list lru)
3662{ 3696{
3663 struct mem_cgroup_per_zone *mz; 3697 struct mem_cgroup_per_zone *mz;
@@ -3665,7 +3699,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3665 struct list_head *list; 3699 struct list_head *list;
3666 struct page *busy; 3700 struct page *busy;
3667 struct zone *zone; 3701 struct zone *zone;
3668 int ret = 0;
3669 3702
3670 zone = &NODE_DATA(node)->node_zones[zid]; 3703 zone = &NODE_DATA(node)->node_zones[zid];
3671 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3704 mz = mem_cgroup_zoneinfo(memcg, node, zid);
@@ -3679,7 +3712,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3679 struct page_cgroup *pc; 3712 struct page_cgroup *pc;
3680 struct page *page; 3713 struct page *page;
3681 3714
3682 ret = 0;
3683 spin_lock_irqsave(&zone->lru_lock, flags); 3715 spin_lock_irqsave(&zone->lru_lock, flags);
3684 if (list_empty(list)) { 3716 if (list_empty(list)) {
3685 spin_unlock_irqrestore(&zone->lru_lock, flags); 3717 spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -3696,21 +3728,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3696 3728
3697 pc = lookup_page_cgroup(page); 3729 pc = lookup_page_cgroup(page);
3698 3730
3699 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); 3731 if (mem_cgroup_move_parent(page, pc, memcg)) {
3700 if (ret == -ENOMEM || ret == -EINTR)
3701 break;
3702
3703 if (ret == -EBUSY || ret == -EINVAL) {
3704 /* found lock contention or "pc" is obsolete. */ 3732 /* found lock contention or "pc" is obsolete. */
3705 busy = page; 3733 busy = page;
3706 cond_resched(); 3734 cond_resched();
3707 } else 3735 } else
3708 busy = NULL; 3736 busy = NULL;
3709 } 3737 }
3710 3738 return !list_empty(list);
3711 if (!ret && !list_empty(list))
3712 return -EBUSY;
3713 return ret;
3714} 3739}
3715 3740
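Returning bool from mem_cgroup_force_empty_list() reduces its contract to "did anything remain on the list?", leaving the retry policy entirely to the caller. The intended usage is a drain-until-empty loop of roughly this shape; drain_one_list() below is a placeholder, not the real function:

#include <stdbool.h>
#include <stdio.h>

/* Placeholder: pretend each pass frees a few entries and reports leftovers. */
static bool drain_one_list(int *remaining)
{
	if (*remaining > 0)
		*remaining -= 2;
	if (*remaining < 0)
		*remaining = 0;
	return *remaining != 0;     /* true == caller must retry */
}

int main(void)
{
	int remaining = 7;
	bool busy;

	do {
		busy = drain_one_list(&remaining);
		/* the kernel would cond_resched() and re-drain LRUs here */
	} while (busy);

	printf("list empty, %d entries left\n", remaining);
	return 0;
}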
3716/* 3741/*
@@ -3735,9 +3760,6 @@ move_account:
3735 ret = -EBUSY; 3760 ret = -EBUSY;
3736 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3761 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3737 goto out; 3762 goto out;
3738 ret = -EINTR;
3739 if (signal_pending(current))
3740 goto out;
3741 /* This is for making all *used* pages to be on LRU. */ 3763 /* This is for making all *used* pages to be on LRU. */
3742 lru_add_drain_all(); 3764 lru_add_drain_all();
3743 drain_all_stock_sync(memcg); 3765 drain_all_stock_sync(memcg);
@@ -3758,12 +3780,9 @@ move_account:
3758 } 3780 }
3759 mem_cgroup_end_move(memcg); 3781 mem_cgroup_end_move(memcg);
3760 memcg_oom_recover(memcg); 3782 memcg_oom_recover(memcg);
3761 /* it seems parent cgroup doesn't have enough mem */
3762 if (ret == -ENOMEM)
3763 goto try_to_free;
3764 cond_resched(); 3783 cond_resched();
3765 /* "ret" should also be checked to ensure all lists are empty. */ 3784 /* "ret" should also be checked to ensure all lists are empty. */
3766 } while (memcg->res.usage > 0 || ret); 3785 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
3767out: 3786out:
3768 css_put(&memcg->css); 3787 css_put(&memcg->css);
3769 return ret; 3788 return ret;
@@ -3778,7 +3797,7 @@ try_to_free:
3778 lru_add_drain_all(); 3797 lru_add_drain_all();
3779 /* try to free all pages in this cgroup */ 3798 /* try to free all pages in this cgroup */
3780 shrink = 1; 3799 shrink = 1;
3781 while (nr_retries && memcg->res.usage > 0) { 3800 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) {
3782 int progress; 3801 int progress;
3783 3802
3784 if (signal_pending(current)) { 3803 if (signal_pending(current)) {
@@ -3799,7 +3818,7 @@ try_to_free:
3799 goto move_account; 3818 goto move_account;
3800} 3819}
3801 3820
3802int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 3821static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3803{ 3822{
3804 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 3823 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3805} 3824}
@@ -3822,6 +3841,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3822 parent_memcg = mem_cgroup_from_cont(parent); 3841 parent_memcg = mem_cgroup_from_cont(parent);
3823 3842
3824 cgroup_lock(); 3843 cgroup_lock();
3844
3845 if (memcg->use_hierarchy == val)
3846 goto out;
3847
3825 /* 3848 /*
3826 * If parent's use_hierarchy is set, we can't make any modifications 3849 * If parent's use_hierarchy is set, we can't make any modifications
3827 * in the child subtrees. If it is unset, then the change can 3850 * in the child subtrees. If it is unset, then the change can
@@ -3838,6 +3861,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3838 retval = -EBUSY; 3861 retval = -EBUSY;
3839 } else 3862 } else
3840 retval = -EINVAL; 3863 retval = -EINVAL;
3864
3865out:
3841 cgroup_unlock(); 3866 cgroup_unlock();
3842 3867
3843 return retval; 3868 return retval;
@@ -3874,19 +3899,26 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3874 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 3899 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
3875 3900
3876 if (swap) 3901 if (swap)
3877 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); 3902 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
3878 3903
3879 return val << PAGE_SHIFT; 3904 return val << PAGE_SHIFT;
3880} 3905}
3881 3906
3882static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 3907static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3908 struct file *file, char __user *buf,
3909 size_t nbytes, loff_t *ppos)
3883{ 3910{
3884 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3911 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3912 char str[64];
3885 u64 val; 3913 u64 val;
3886 int type, name; 3914 int type, name, len;
3887 3915
3888 type = MEMFILE_TYPE(cft->private); 3916 type = MEMFILE_TYPE(cft->private);
3889 name = MEMFILE_ATTR(cft->private); 3917 name = MEMFILE_ATTR(cft->private);
3918
3919 if (!do_swap_account && type == _MEMSWAP)
3920 return -EOPNOTSUPP;
3921
3890 switch (type) { 3922 switch (type) {
3891 case _MEM: 3923 case _MEM:
3892 if (name == RES_USAGE) 3924 if (name == RES_USAGE)
@@ -3903,7 +3935,9 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3903 default: 3935 default:
3904 BUG(); 3936 BUG();
3905 } 3937 }
3906 return val; 3938
3939 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
3940 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
3907} 3941}
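mem_cgroup_read() now formats the counter value into a small buffer and lets simple_read_from_buffer() handle offset and length bookkeeping, instead of returning a raw u64 for the cgroup core to print. The same pattern is easy to model in user space: format once, then serve the caller's (offset, length) window out of the formatted string. The helper below only mimics that windowing logic and is a sketch, not the kernel function:

#include <stdio.h>
#include <string.h>

/* Copy at most nbytes of str starting at *ppos into buf; advance *ppos. */
static long read_from_buffer(char *buf, size_t nbytes, long *ppos,
			     const char *str, size_t len)
{
	size_t pos = (size_t)*ppos;
	size_t n;

	if (pos >= len)
		return 0;                    /* EOF */
	n = len - pos;
	if (n > nbytes)
		n = nbytes;
	memcpy(buf, str + pos, n);
	*ppos += (long)n;
	return (long)n;
}

int main(void)
{
	unsigned long long val = 123456789ULL;  /* e.g. usage_in_bytes */
	char str[64], out[8];
	int len = snprintf(str, sizeof(str), "%llu\n", val);
	long pos = 0, n;

	/* A reader with a tiny buffer still sees the whole value, in pieces. */
	while ((n = read_from_buffer(out, sizeof(out) - 1, &pos, str, (size_t)len)) > 0) {
		out[n] = '\0';
		printf("chunk: \"%s\"\n", out);
	}
	return 0;
}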
3908/* 3942/*
3909 * The user of this function is... 3943 * The user of this function is...
@@ -3919,6 +3953,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3919 3953
3920 type = MEMFILE_TYPE(cft->private); 3954 type = MEMFILE_TYPE(cft->private);
3921 name = MEMFILE_ATTR(cft->private); 3955 name = MEMFILE_ATTR(cft->private);
3956
3957 if (!do_swap_account && type == _MEMSWAP)
3958 return -EOPNOTSUPP;
3959
3922 switch (name) { 3960 switch (name) {
3923 case RES_LIMIT: 3961 case RES_LIMIT:
3924 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3962 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
@@ -3984,12 +4022,15 @@ out:
3984 4022
3985static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 4023static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
3986{ 4024{
3987 struct mem_cgroup *memcg; 4025 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3988 int type, name; 4026 int type, name;
3989 4027
3990 memcg = mem_cgroup_from_cont(cont);
3991 type = MEMFILE_TYPE(event); 4028 type = MEMFILE_TYPE(event);
3992 name = MEMFILE_ATTR(event); 4029 name = MEMFILE_ATTR(event);
4030
4031 if (!do_swap_account && type == _MEMSWAP)
4032 return -EOPNOTSUPP;
4033
3993 switch (name) { 4034 switch (name) {
3994 case RES_MAX_USAGE: 4035 case RES_MAX_USAGE:
3995 if (type == _MEM) 4036 if (type == _MEM)
@@ -4041,103 +4082,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4041} 4082}
4042#endif 4083#endif
4043 4084
4044
4045/* For read statistics */
4046enum {
4047 MCS_CACHE,
4048 MCS_RSS,
4049 MCS_FILE_MAPPED,
4050 MCS_PGPGIN,
4051 MCS_PGPGOUT,
4052 MCS_SWAP,
4053 MCS_PGFAULT,
4054 MCS_PGMAJFAULT,
4055 MCS_INACTIVE_ANON,
4056 MCS_ACTIVE_ANON,
4057 MCS_INACTIVE_FILE,
4058 MCS_ACTIVE_FILE,
4059 MCS_UNEVICTABLE,
4060 NR_MCS_STAT,
4061};
4062
4063struct mcs_total_stat {
4064 s64 stat[NR_MCS_STAT];
4065};
4066
4067struct {
4068 char *local_name;
4069 char *total_name;
4070} memcg_stat_strings[NR_MCS_STAT] = {
4071 {"cache", "total_cache"},
4072 {"rss", "total_rss"},
4073 {"mapped_file", "total_mapped_file"},
4074 {"pgpgin", "total_pgpgin"},
4075 {"pgpgout", "total_pgpgout"},
4076 {"swap", "total_swap"},
4077 {"pgfault", "total_pgfault"},
4078 {"pgmajfault", "total_pgmajfault"},
4079 {"inactive_anon", "total_inactive_anon"},
4080 {"active_anon", "total_active_anon"},
4081 {"inactive_file", "total_inactive_file"},
4082 {"active_file", "total_active_file"},
4083 {"unevictable", "total_unevictable"}
4084};
4085
4086
4087static void
4088mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4089{
4090 s64 val;
4091
4092 /* per cpu stat */
4093 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
4094 s->stat[MCS_CACHE] += val * PAGE_SIZE;
4095 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
4096 s->stat[MCS_RSS] += val * PAGE_SIZE;
4097 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
4098 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
4099 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);
4100 s->stat[MCS_PGPGIN] += val;
4101 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);
4102 s->stat[MCS_PGPGOUT] += val;
4103 if (do_swap_account) {
4104 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
4105 s->stat[MCS_SWAP] += val * PAGE_SIZE;
4106 }
4107 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT);
4108 s->stat[MCS_PGFAULT] += val;
4109 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
4110 s->stat[MCS_PGMAJFAULT] += val;
4111
4112 /* per zone stat */
4113 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
4114 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
4115 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
4116 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
4117 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
4118 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
4119 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
4120 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
4121 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4122 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
4123}
4124
4125static void
4126mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4127{
4128 struct mem_cgroup *iter;
4129
4130 for_each_mem_cgroup_tree(iter, memcg)
4131 mem_cgroup_get_local_stat(iter, s);
4132}
4133
4134#ifdef CONFIG_NUMA 4085#ifdef CONFIG_NUMA
4135static int mem_control_numa_stat_show(struct seq_file *m, void *arg) 4086static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4087 struct seq_file *m)
4136{ 4088{
4137 int nid; 4089 int nid;
4138 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 4090 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4139 unsigned long node_nr; 4091 unsigned long node_nr;
4140 struct cgroup *cont = m->private;
4141 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4092 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4142 4093
4143 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); 4094 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
@@ -4178,64 +4129,100 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4178} 4129}
4179#endif /* CONFIG_NUMA */ 4130#endif /* CONFIG_NUMA */
4180 4131
4181static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4132static const char * const mem_cgroup_lru_names[] = {
4182 struct cgroup_map_cb *cb) 4133 "inactive_anon",
4183{ 4134 "active_anon",
4184 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4135 "inactive_file",
4185 struct mcs_total_stat mystat; 4136 "active_file",
4186 int i; 4137 "unevictable",
4138};
4187 4139
4188 memset(&mystat, 0, sizeof(mystat)); 4140static inline void mem_cgroup_lru_names_not_uptodate(void)
4189 mem_cgroup_get_local_stat(memcg, &mystat); 4141{
4142 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
4143}
4190 4144
4145static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
4146 struct seq_file *m)
4147{
4148 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4149 struct mem_cgroup *mi;
4150 unsigned int i;
4191 4151
4192 for (i = 0; i < NR_MCS_STAT; i++) { 4152 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4193 if (i == MCS_SWAP && !do_swap_account) 4153 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4194 continue; 4154 continue;
4195 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 4155 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
4156 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
4196 } 4157 }
4197 4158
4159 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
4160 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
4161 mem_cgroup_read_events(memcg, i));
4162
4163 for (i = 0; i < NR_LRU_LISTS; i++)
4164 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
4165 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4166
4198 /* Hierarchical information */ 4167 /* Hierarchical information */
4199 { 4168 {
4200 unsigned long long limit, memsw_limit; 4169 unsigned long long limit, memsw_limit;
4201 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 4170 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
4202 cb->fill(cb, "hierarchical_memory_limit", limit); 4171 seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
4203 if (do_swap_account) 4172 if (do_swap_account)
4204 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 4173 seq_printf(m, "hierarchical_memsw_limit %llu\n",
4174 memsw_limit);
4205 } 4175 }
4206 4176
4207 memset(&mystat, 0, sizeof(mystat)); 4177 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4208 mem_cgroup_get_total_stat(memcg, &mystat); 4178 long long val = 0;
4209 for (i = 0; i < NR_MCS_STAT; i++) { 4179
4210 if (i == MCS_SWAP && !do_swap_account) 4180 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4211 continue; 4181 continue;
4212 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 4182 for_each_mem_cgroup_tree(mi, memcg)
4183 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
4184 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
4185 }
4186
4187 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
4188 unsigned long long val = 0;
4189
4190 for_each_mem_cgroup_tree(mi, memcg)
4191 val += mem_cgroup_read_events(mi, i);
4192 seq_printf(m, "total_%s %llu\n",
4193 mem_cgroup_events_names[i], val);
4194 }
4195
4196 for (i = 0; i < NR_LRU_LISTS; i++) {
4197 unsigned long long val = 0;
4198
4199 for_each_mem_cgroup_tree(mi, memcg)
4200 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
4201 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
4213 } 4202 }
4214 4203
4215#ifdef CONFIG_DEBUG_VM 4204#ifdef CONFIG_DEBUG_VM
4216 { 4205 {
4217 int nid, zid; 4206 int nid, zid;
4218 struct mem_cgroup_per_zone *mz; 4207 struct mem_cgroup_per_zone *mz;
4208 struct zone_reclaim_stat *rstat;
4219 unsigned long recent_rotated[2] = {0, 0}; 4209 unsigned long recent_rotated[2] = {0, 0};
4220 unsigned long recent_scanned[2] = {0, 0}; 4210 unsigned long recent_scanned[2] = {0, 0};
4221 4211
4222 for_each_online_node(nid) 4212 for_each_online_node(nid)
4223 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4213 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4224 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 4214 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
4215 rstat = &mz->lruvec.reclaim_stat;
4225 4216
4226 recent_rotated[0] += 4217 recent_rotated[0] += rstat->recent_rotated[0];
4227 mz->reclaim_stat.recent_rotated[0]; 4218 recent_rotated[1] += rstat->recent_rotated[1];
4228 recent_rotated[1] += 4219 recent_scanned[0] += rstat->recent_scanned[0];
4229 mz->reclaim_stat.recent_rotated[1]; 4220 recent_scanned[1] += rstat->recent_scanned[1];
4230 recent_scanned[0] +=
4231 mz->reclaim_stat.recent_scanned[0];
4232 recent_scanned[1] +=
4233 mz->reclaim_stat.recent_scanned[1];
4234 } 4221 }
4235 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 4222 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
4236 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 4223 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
4237 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 4224 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
4238 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 4225 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
4239 } 4226 }
4240#endif 4227#endif
4241 4228
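The hunk above converts memcg's stat file from the cgroup_map_cb callback interface to a plain seq_file printer: local counters, event counters and per-LRU page counts are emitted first, then hierarchical totals are produced by walking every group in the subtree with for_each_mem_cgroup_tree() and summing before printing a "total_"-prefixed line. Below is a minimal userspace sketch of that aggregation pattern only; the tree type, field names and stat_names table are invented for illustration and are not the kernel's.

    #include <stdio.h>

    #define NSTATS 3

    /* Hypothetical stand-in for a memcg subtree node. */
    struct group {
        long stat[NSTATS];          /* local counters */
        struct group *child, *next; /* first child / next sibling */
    };

    static const char * const stat_names[NSTATS] = {
        "cache", "rss", "swap",
    };

    /* Sum one counter over the whole subtree, the way the
     * for_each_mem_cgroup_tree() loops accumulate totals. */
    static long long subtree_total(const struct group *g, int i)
    {
        long long val = 0;

        for (; g; g = g->next)
            val += g->stat[i] + subtree_total(g->child, i);
        return val;
    }

    static void stat_show(const struct group *root)
    {
        int i;

        for (i = 0; i < NSTATS; i++)        /* local values first */
            printf("%s %ld\n", stat_names[i], root->stat[i]);
        for (i = 0; i < NSTATS; i++)        /* then hierarchical totals */
            printf("total_%s %lld\n", stat_names[i], subtree_total(root, i));
    }

    int main(void)
    {
        struct group leaf = { { 1, 2, 3 }, NULL, NULL };
        struct group root = { { 10, 20, 30 }, &leaf, NULL };

        stat_show(&root);
        return 0;
    }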
@@ -4297,7 +4284,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4297 usage = mem_cgroup_usage(memcg, swap); 4284 usage = mem_cgroup_usage(memcg, swap);
4298 4285
4299 /* 4286 /*
4300 * current_threshold points to threshold just below usage. 4287 * current_threshold points to threshold just below or equal to usage.
4301 * If it's not true, a threshold was crossed after last 4288 * If it's not true, a threshold was crossed after last
4302 * call of __mem_cgroup_threshold(). 4289 * call of __mem_cgroup_threshold().
4303 */ 4290 */
@@ -4423,14 +4410,15 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4423 /* Find current threshold */ 4410 /* Find current threshold */
4424 new->current_threshold = -1; 4411 new->current_threshold = -1;
4425 for (i = 0; i < size; i++) { 4412 for (i = 0; i < size; i++) {
4426 if (new->entries[i].threshold < usage) { 4413 if (new->entries[i].threshold <= usage) {
4427 /* 4414 /*
4428 * new->current_threshold will not be used until 4415 * new->current_threshold will not be used until
4429 * rcu_assign_pointer(), so it's safe to increment 4416 * rcu_assign_pointer(), so it's safe to increment
4430 * it here. 4417 * it here.
4431 */ 4418 */
4432 ++new->current_threshold; 4419 ++new->current_threshold;
4433 } 4420 } else
4421 break;
4434 } 4422 }
4435 4423
4436 /* Free old spare buffer and save old primary buffer as spare */ 4424 /* Free old spare buffer and save old primary buffer as spare */
@@ -4499,7 +4487,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4499 continue; 4487 continue;
4500 4488
4501 new->entries[j] = thresholds->primary->entries[i]; 4489 new->entries[j] = thresholds->primary->entries[i];
4502 if (new->entries[j].threshold < usage) { 4490 if (new->entries[j].threshold <= usage) {
4503 /* 4491 /*
4504 * new->current_threshold will not be used 4492 * new->current_threshold will not be used
4505 * until rcu_assign_pointer(), so it's safe to increment 4493 * until rcu_assign_pointer(), so it's safe to increment
@@ -4513,6 +4501,12 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4513swap_buffers: 4501swap_buffers:
4514 /* Swap primary and spare array */ 4502 /* Swap primary and spare array */
4515 thresholds->spare = thresholds->primary; 4503 thresholds->spare = thresholds->primary;
4504 /* If all events are unregistered, free the spare array */
4505 if (!new) {
4506 kfree(thresholds->spare);
4507 thresholds->spare = NULL;
4508 }
4509
4516 rcu_assign_pointer(thresholds->primary, new); 4510 rcu_assign_pointer(thresholds->primary, new);
4517 4511
4518 /* To be sure that nobody uses thresholds */ 4512 /* To be sure that nobody uses thresholds */
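The two threshold hunks change the comparison used while rebuilding the sorted thresholds array from "<" to "<=" and stop scanning as soon as a larger entry is found, so current_threshold ends up indexing the largest threshold that is less than or equal to the current usage; the unregister path additionally frees the spare array once the last event is gone. A small self-contained sketch of that index search (the array layout and names are illustrative only):

    #include <stdio.h>

    /* Return the index of the largest entry <= usage in a sorted array,
     * or -1 if every entry is above usage (mirrors current_threshold). */
    static int find_current_threshold(const unsigned long long *entries,
                                      int size, unsigned long long usage)
    {
        int i, current_threshold = -1;

        for (i = 0; i < size; i++) {
            if (entries[i] <= usage)
                ++current_threshold;   /* still at or below usage */
            else
                break;                 /* sorted: nothing further can match */
        }
        return current_threshold;
    }

    int main(void)
    {
        unsigned long long thresholds[] = { 4096, 8192, 16384 };

        /* usage exactly at a threshold now selects that entry (index 1) */
        printf("%d\n", find_current_threshold(thresholds, 3, 8192));
        return 0;
    }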
@@ -4607,46 +4601,23 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4607 return 0; 4601 return 0;
4608} 4602}
4609 4603
4610#ifdef CONFIG_NUMA 4604#ifdef CONFIG_MEMCG_KMEM
4611static const struct file_operations mem_control_numa_stat_file_operations = { 4605static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4612 .read = seq_read,
4613 .llseek = seq_lseek,
4614 .release = single_release,
4615};
4616
4617static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4618{ 4606{
4619 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; 4607 return mem_cgroup_sockets_init(memcg, ss);
4620
4621 file->f_op = &mem_control_numa_stat_file_operations;
4622 return single_open(file, mem_control_numa_stat_show, cont);
4623}
4624#endif /* CONFIG_NUMA */
4625
4626#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
4627static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss)
4628{
4629 /*
4630 * Part of this would be better living in a separate allocation
4631 * function, leaving us with just the cgroup tree population work.
4632 * We, however, depend on state such as network's proto_list that
4633 * is only initialized after cgroup creation. I found the less
4634 * cumbersome way to deal with it to defer it all to populate time
4635 */
4636 return mem_cgroup_sockets_init(cont, ss);
4637}; 4608};
4638 4609
4639static void kmem_cgroup_destroy(struct cgroup *cont) 4610static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
4640{ 4611{
4641 mem_cgroup_sockets_destroy(cont); 4612 mem_cgroup_sockets_destroy(memcg);
4642} 4613}
4643#else 4614#else
4644static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) 4615static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4645{ 4616{
4646 return 0; 4617 return 0;
4647} 4618}
4648 4619
4649static void kmem_cgroup_destroy(struct cgroup *cont) 4620static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
4650{ 4621{
4651} 4622}
4652#endif 4623#endif
@@ -4655,7 +4626,7 @@ static struct cftype mem_cgroup_files[] = {
4655 { 4626 {
4656 .name = "usage_in_bytes", 4627 .name = "usage_in_bytes",
4657 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4628 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4658 .read_u64 = mem_cgroup_read, 4629 .read = mem_cgroup_read,
4659 .register_event = mem_cgroup_usage_register_event, 4630 .register_event = mem_cgroup_usage_register_event,
4660 .unregister_event = mem_cgroup_usage_unregister_event, 4631 .unregister_event = mem_cgroup_usage_unregister_event,
4661 }, 4632 },
@@ -4663,29 +4634,29 @@ static struct cftype mem_cgroup_files[] = {
4663 .name = "max_usage_in_bytes", 4634 .name = "max_usage_in_bytes",
4664 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4635 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4665 .trigger = mem_cgroup_reset, 4636 .trigger = mem_cgroup_reset,
4666 .read_u64 = mem_cgroup_read, 4637 .read = mem_cgroup_read,
4667 }, 4638 },
4668 { 4639 {
4669 .name = "limit_in_bytes", 4640 .name = "limit_in_bytes",
4670 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4641 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4671 .write_string = mem_cgroup_write, 4642 .write_string = mem_cgroup_write,
4672 .read_u64 = mem_cgroup_read, 4643 .read = mem_cgroup_read,
4673 }, 4644 },
4674 { 4645 {
4675 .name = "soft_limit_in_bytes", 4646 .name = "soft_limit_in_bytes",
4676 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4647 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4677 .write_string = mem_cgroup_write, 4648 .write_string = mem_cgroup_write,
4678 .read_u64 = mem_cgroup_read, 4649 .read = mem_cgroup_read,
4679 }, 4650 },
4680 { 4651 {
4681 .name = "failcnt", 4652 .name = "failcnt",
4682 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4653 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4683 .trigger = mem_cgroup_reset, 4654 .trigger = mem_cgroup_reset,
4684 .read_u64 = mem_cgroup_read, 4655 .read = mem_cgroup_read,
4685 }, 4656 },
4686 { 4657 {
4687 .name = "stat", 4658 .name = "stat",
4688 .read_map = mem_control_stat_show, 4659 .read_seq_string = memcg_stat_show,
4689 }, 4660 },
4690 { 4661 {
4691 .name = "force_empty", 4662 .name = "force_empty",
@@ -4717,18 +4688,14 @@ static struct cftype mem_cgroup_files[] = {
4717#ifdef CONFIG_NUMA 4688#ifdef CONFIG_NUMA
4718 { 4689 {
4719 .name = "numa_stat", 4690 .name = "numa_stat",
4720 .open = mem_control_numa_stat_open, 4691 .read_seq_string = memcg_numa_stat_show,
4721 .mode = S_IRUGO,
4722 }, 4692 },
4723#endif 4693#endif
4724}; 4694#ifdef CONFIG_MEMCG_SWAP
4725
4726#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4727static struct cftype memsw_cgroup_files[] = {
4728 { 4695 {
4729 .name = "memsw.usage_in_bytes", 4696 .name = "memsw.usage_in_bytes",
4730 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4697 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4731 .read_u64 = mem_cgroup_read, 4698 .read = mem_cgroup_read,
4732 .register_event = mem_cgroup_usage_register_event, 4699 .register_event = mem_cgroup_usage_register_event,
4733 .unregister_event = mem_cgroup_usage_unregister_event, 4700 .unregister_event = mem_cgroup_usage_unregister_event,
4734 }, 4701 },
@@ -4736,41 +4703,28 @@ static struct cftype memsw_cgroup_files[] = {
4736 .name = "memsw.max_usage_in_bytes", 4703 .name = "memsw.max_usage_in_bytes",
4737 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 4704 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4738 .trigger = mem_cgroup_reset, 4705 .trigger = mem_cgroup_reset,
4739 .read_u64 = mem_cgroup_read, 4706 .read = mem_cgroup_read,
4740 }, 4707 },
4741 { 4708 {
4742 .name = "memsw.limit_in_bytes", 4709 .name = "memsw.limit_in_bytes",
4743 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 4710 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4744 .write_string = mem_cgroup_write, 4711 .write_string = mem_cgroup_write,
4745 .read_u64 = mem_cgroup_read, 4712 .read = mem_cgroup_read,
4746 }, 4713 },
4747 { 4714 {
4748 .name = "memsw.failcnt", 4715 .name = "memsw.failcnt",
4749 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 4716 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4750 .trigger = mem_cgroup_reset, 4717 .trigger = mem_cgroup_reset,
4751 .read_u64 = mem_cgroup_read, 4718 .read = mem_cgroup_read,
4752 }, 4719 },
4753};
4754
4755static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4756{
4757 if (!do_swap_account)
4758 return 0;
4759 return cgroup_add_files(cont, ss, memsw_cgroup_files,
4760 ARRAY_SIZE(memsw_cgroup_files));
4761};
4762#else
4763static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4764{
4765 return 0;
4766}
4767#endif 4720#endif
4721 { }, /* terminate */
4722};
4768 4723
4769static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4724static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4770{ 4725{
4771 struct mem_cgroup_per_node *pn; 4726 struct mem_cgroup_per_node *pn;
4772 struct mem_cgroup_per_zone *mz; 4727 struct mem_cgroup_per_zone *mz;
4773 enum lru_list lru;
4774 int zone, tmp = node; 4728 int zone, tmp = node;
4775 /* 4729 /*
4776 * This routine is called against possible nodes. 4730 * This routine is called against possible nodes.
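In the cftype hunks above, the per-controller file tables stop being registered by hand from a populate callback: the memsw entries are folded into mem_cgroup_files, the array gains an empty "{ }" terminator, and further down the subsystem points at it through .base_cftypes so the cgroup core can walk it. A sketch of the sentinel-terminated walk that replaces the ARRAY_SIZE()-based registration, with invented structure and handler names:

    #include <stdio.h>

    /* Hypothetical file descriptor; an empty entry marks the end. */
    struct ftype {
        const char *name;
        void (*show)(void);
    };

    static void show_stat(void)  { printf("stat\n"); }
    static void show_usage(void) { printf("usage_in_bytes\n"); }

    static const struct ftype files[] = {
        { "stat",           show_stat  },
        { "usage_in_bytes", show_usage },
        { NULL, NULL },  /* terminator, written as { } in the kernel table */
    };

    static void register_files(const struct ftype *ft)
    {
        for (; ft->name; ft++)          /* walk until the empty sentinel */
            printf("registering %s\n", ft->name);
    }

    int main(void)
    {
        register_files(files);
        return 0;
    }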
@@ -4788,8 +4742,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4788 4742
4789 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4743 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4790 mz = &pn->zoneinfo[zone]; 4744 mz = &pn->zoneinfo[zone];
4791 for_each_lru(lru) 4745 lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
4792 INIT_LIST_HEAD(&mz->lruvec.lists[lru]);
4793 mz->usage_in_excess = 0; 4746 mz->usage_in_excess = 0;
4794 mz->on_tree = false; 4747 mz->on_tree = false;
4795 mz->memcg = memcg; 4748 mz->memcg = memcg;
@@ -4832,23 +4785,40 @@ out_free:
4832} 4785}
4833 4786
4834/* 4787/*
4835 * Helpers for freeing a vzalloc()ed mem_cgroup by RCU, 4788 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
4836 * but in process context. The work_freeing structure is overlaid 4789 * but in process context. The work_freeing structure is overlaid
4837 * on the rcu_freeing structure, which itself is overlaid on memsw. 4790 * on the rcu_freeing structure, which itself is overlaid on memsw.
4838 */ 4791 */
4839static void vfree_work(struct work_struct *work) 4792static void free_work(struct work_struct *work)
4840{ 4793{
4841 struct mem_cgroup *memcg; 4794 struct mem_cgroup *memcg;
4795 int size = sizeof(struct mem_cgroup);
4842 4796
4843 memcg = container_of(work, struct mem_cgroup, work_freeing); 4797 memcg = container_of(work, struct mem_cgroup, work_freeing);
4844 vfree(memcg); 4798 /*
4799 * We need to make sure that (at least for now), the jump label
4800 * destruction code runs outside of the cgroup lock. This is because
4801 * get_online_cpus(), which is called from the static_branch update,
4802 * can't be called inside the cgroup_lock. cpusets are the ones
4803 * enforcing this dependency, so if they ever change, we might as well.
4804 *
4805 * schedule_work() will guarantee this happens. Be careful if you need
4806 * to move this code around, and make sure it is outside
4807 * the cgroup_lock.
4808 */
4809 disarm_sock_keys(memcg);
4810 if (size < PAGE_SIZE)
4811 kfree(memcg);
4812 else
4813 vfree(memcg);
4845} 4814}
4846static void vfree_rcu(struct rcu_head *rcu_head) 4815
4816static void free_rcu(struct rcu_head *rcu_head)
4847{ 4817{
4848 struct mem_cgroup *memcg; 4818 struct mem_cgroup *memcg;
4849 4819
4850 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); 4820 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4851 INIT_WORK(&memcg->work_freeing, vfree_work); 4821 INIT_WORK(&memcg->work_freeing, free_work);
4852 schedule_work(&memcg->work_freeing); 4822 schedule_work(&memcg->work_freeing);
4853} 4823}
4854 4824
@@ -4874,10 +4844,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4874 free_mem_cgroup_per_zone_info(memcg, node); 4844 free_mem_cgroup_per_zone_info(memcg, node);
4875 4845
4876 free_percpu(memcg->stat); 4846 free_percpu(memcg->stat);
4877 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4847 call_rcu(&memcg->rcu_freeing, free_rcu);
4878 kfree_rcu(memcg, rcu_freeing);
4879 else
4880 call_rcu(&memcg->rcu_freeing, vfree_rcu);
4881} 4848}
4882 4849
4883static void mem_cgroup_get(struct mem_cgroup *memcg) 4850static void mem_cgroup_get(struct mem_cgroup *memcg)
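The freeing hunks unify the two RCU paths into one: __mem_cgroup_free() always goes through call_rcu(), the RCU callback only queues a work item, and the work function (now free_work()) runs in process context where it may sleep (disarm_sock_keys() ends up in get_online_cpus()) and then picks kfree() or vfree() depending on whether the object was small enough to come from kmalloc(). A condensed kernel-style sketch of that defer-to-workqueue pattern follows; the object and its fields are placeholders, not the real struct mem_cgroup.

    #include <linux/mm.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>
    #include <linux/vmalloc.h>
    #include <linux/workqueue.h>

    struct obj {                        /* placeholder for the real object */
        struct rcu_head rcu_freeing;
        struct work_struct work_freeing;
    };

    static void obj_free_work(struct work_struct *work)
    {
        struct obj *o = container_of(work, struct obj, work_freeing);

        /* Process context: safe to sleep, e.g. to disarm static keys
         * that need get_online_cpus(). */
        if (sizeof(*o) < PAGE_SIZE)
            kfree(o);                   /* small objects came from kmalloc() */
        else
            vfree(o);                   /* large ones came from vzalloc() */
    }

    static void obj_free_rcu(struct rcu_head *rcu)
    {
        struct obj *o = container_of(rcu, struct obj, rcu_freeing);

        /* RCU callbacks run in softirq context; defer the real work. */
        INIT_WORK(&o->work_freeing, obj_free_work);
        schedule_work(&o->work_freeing);
    }

    /* After the last reference is dropped:
     *      call_rcu(&o->rcu_freeing, obj_free_rcu);
     */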
@@ -4911,7 +4878,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4911} 4878}
4912EXPORT_SYMBOL(parent_mem_cgroup); 4879EXPORT_SYMBOL(parent_mem_cgroup);
4913 4880
4914#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4881#ifdef CONFIG_MEMCG_SWAP
4915static void __init enable_swap_cgroup(void) 4882static void __init enable_swap_cgroup(void)
4916{ 4883{
4917 if (!mem_cgroup_disabled() && really_do_swap_account) 4884 if (!mem_cgroup_disabled() && really_do_swap_account)
@@ -5016,6 +4983,17 @@ mem_cgroup_create(struct cgroup *cont)
5016 memcg->move_charge_at_immigrate = 0; 4983 memcg->move_charge_at_immigrate = 0;
5017 mutex_init(&memcg->thresholds_lock); 4984 mutex_init(&memcg->thresholds_lock);
5018 spin_lock_init(&memcg->move_lock); 4985 spin_lock_init(&memcg->move_lock);
4986
4987 error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
4988 if (error) {
4989 /*
4990 * We call put now because our (and parent's) refcnts
4991 * are already in place. mem_cgroup_put() will internally
4992 * call __mem_cgroup_free, so return directly
4993 */
4994 mem_cgroup_put(memcg);
4995 return ERR_PTR(error);
4996 }
5019 return &memcg->css; 4997 return &memcg->css;
5020free_out: 4998free_out:
5021 __mem_cgroup_free(memcg); 4999 __mem_cgroup_free(memcg);
@@ -5033,28 +5011,11 @@ static void mem_cgroup_destroy(struct cgroup *cont)
5033{ 5011{
5034 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 5012 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5035 5013
5036 kmem_cgroup_destroy(cont); 5014 kmem_cgroup_destroy(memcg);
5037 5015
5038 mem_cgroup_put(memcg); 5016 mem_cgroup_put(memcg);
5039} 5017}
5040 5018
5041static int mem_cgroup_populate(struct cgroup_subsys *ss,
5042 struct cgroup *cont)
5043{
5044 int ret;
5045
5046 ret = cgroup_add_files(cont, ss, mem_cgroup_files,
5047 ARRAY_SIZE(mem_cgroup_files));
5048
5049 if (!ret)
5050 ret = register_memsw_files(cont, ss);
5051
5052 if (!ret)
5053 ret = register_kmem_files(cont, ss);
5054
5055 return ret;
5056}
5057
5058#ifdef CONFIG_MMU 5019#ifdef CONFIG_MMU
5059/* Handlers for move charge at task migration. */ 5020/* Handlers for move charge at task migration. */
5060#define PRECHARGE_COUNT_AT_ONCE 256 5021#define PRECHARGE_COUNT_AT_ONCE 256
@@ -5147,7 +5108,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5147 return NULL; 5108 return NULL;
5148 if (PageAnon(page)) { 5109 if (PageAnon(page)) {
5149 /* we don't move shared anon */ 5110 /* we don't move shared anon */
5150 if (!move_anon() || page_mapcount(page) > 2) 5111 if (!move_anon())
5151 return NULL; 5112 return NULL;
5152 } else if (!move_file()) 5113 } else if (!move_file())
5153 /* we ignore mapcount for file pages */ 5114 /* we ignore mapcount for file pages */
@@ -5158,32 +5119,37 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5158 return page; 5119 return page;
5159} 5120}
5160 5121
5122#ifdef CONFIG_SWAP
5161static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5123static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5162 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5124 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5163{ 5125{
5164 int usage_count;
5165 struct page *page = NULL; 5126 struct page *page = NULL;
5166 swp_entry_t ent = pte_to_swp_entry(ptent); 5127 swp_entry_t ent = pte_to_swp_entry(ptent);
5167 5128
5168 if (!move_anon() || non_swap_entry(ent)) 5129 if (!move_anon() || non_swap_entry(ent))
5169 return NULL; 5130 return NULL;
5170 usage_count = mem_cgroup_count_swap_user(ent, &page); 5131 /*
5171 if (usage_count > 1) { /* we don't move shared anon */ 5132 * Because lookup_swap_cache() updates some statistics counter,
5172 if (page) 5133 * we call find_get_page() with swapper_space directly.
5173 put_page(page); 5134 */
5174 return NULL; 5135 page = find_get_page(&swapper_space, ent.val);
5175 }
5176 if (do_swap_account) 5136 if (do_swap_account)
5177 entry->val = ent.val; 5137 entry->val = ent.val;
5178 5138
5179 return page; 5139 return page;
5180} 5140}
5141#else
5142static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5143 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5144{
5145 return NULL;
5146}
5147#endif
5181 5148
5182static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5149static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5183 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5150 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5184{ 5151{
5185 struct page *page = NULL; 5152 struct page *page = NULL;
5186 struct inode *inode;
5187 struct address_space *mapping; 5153 struct address_space *mapping;
5188 pgoff_t pgoff; 5154 pgoff_t pgoff;
5189 5155
@@ -5192,7 +5158,6 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5192 if (!move_file()) 5158 if (!move_file())
5193 return NULL; 5159 return NULL;
5194 5160
5195 inode = vma->vm_file->f_path.dentry->d_inode;
5196 mapping = vma->vm_file->f_mapping; 5161 mapping = vma->vm_file->f_mapping;
5197 if (pte_none(ptent)) 5162 if (pte_none(ptent))
5198 pgoff = linear_page_index(vma, addr); 5163 pgoff = linear_page_index(vma, addr);
@@ -5481,7 +5446,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5481 * part of thp split is not executed yet. 5446 * part of thp split is not executed yet.
5482 */ 5447 */
5483 if (pmd_trans_huge_lock(pmd, vma) == 1) { 5448 if (pmd_trans_huge_lock(pmd, vma) == 1) {
5484 if (!mc.precharge) { 5449 if (mc.precharge < HPAGE_PMD_NR) {
5485 spin_unlock(&vma->vm_mm->page_table_lock); 5450 spin_unlock(&vma->vm_mm->page_table_lock);
5486 return 0; 5451 return 0;
5487 } 5452 }
@@ -5491,8 +5456,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5491 if (!isolate_lru_page(page)) { 5456 if (!isolate_lru_page(page)) {
5492 pc = lookup_page_cgroup(page); 5457 pc = lookup_page_cgroup(page);
5493 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 5458 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
5494 pc, mc.from, mc.to, 5459 pc, mc.from, mc.to)) {
5495 false)) {
5496 mc.precharge -= HPAGE_PMD_NR; 5460 mc.precharge -= HPAGE_PMD_NR;
5497 mc.moved_charge += HPAGE_PMD_NR; 5461 mc.moved_charge += HPAGE_PMD_NR;
5498 } 5462 }
@@ -5522,7 +5486,7 @@ retry:
5522 goto put; 5486 goto put;
5523 pc = lookup_page_cgroup(page); 5487 pc = lookup_page_cgroup(page);
5524 if (!mem_cgroup_move_account(page, 1, pc, 5488 if (!mem_cgroup_move_account(page, 1, pc,
5525 mc.from, mc.to, false)) { 5489 mc.from, mc.to)) {
5526 mc.precharge--; 5490 mc.precharge--;
5527 /* we uncharge from mc.from later. */ 5491 /* we uncharge from mc.from later. */
5528 mc.moved_charge++; 5492 mc.moved_charge++;
@@ -5533,8 +5497,7 @@ put: /* get_mctgt_type() gets the page */
5533 break; 5497 break;
5534 case MC_TARGET_SWAP: 5498 case MC_TARGET_SWAP:
5535 ent = target.ent; 5499 ent = target.ent;
5536 if (!mem_cgroup_move_swap_account(ent, 5500 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
5537 mc.from, mc.to, false)) {
5538 mc.precharge--; 5501 mc.precharge--;
5539 /* we fixup refcnts and charges later. */ 5502 /* we fixup refcnts and charges later. */
5540 mc.moved_swap++; 5503 mc.moved_swap++;
@@ -5610,7 +5573,6 @@ static void mem_cgroup_move_task(struct cgroup *cont,
5610 if (mm) { 5573 if (mm) {
5611 if (mc.to) 5574 if (mc.to)
5612 mem_cgroup_move_charge(mm); 5575 mem_cgroup_move_charge(mm);
5613 put_swap_token(mm);
5614 mmput(mm); 5576 mmput(mm);
5615 } 5577 }
5616 if (mc.to) 5578 if (mc.to)
@@ -5638,15 +5600,16 @@ struct cgroup_subsys mem_cgroup_subsys = {
5638 .create = mem_cgroup_create, 5600 .create = mem_cgroup_create,
5639 .pre_destroy = mem_cgroup_pre_destroy, 5601 .pre_destroy = mem_cgroup_pre_destroy,
5640 .destroy = mem_cgroup_destroy, 5602 .destroy = mem_cgroup_destroy,
5641 .populate = mem_cgroup_populate,
5642 .can_attach = mem_cgroup_can_attach, 5603 .can_attach = mem_cgroup_can_attach,
5643 .cancel_attach = mem_cgroup_cancel_attach, 5604 .cancel_attach = mem_cgroup_cancel_attach,
5644 .attach = mem_cgroup_move_task, 5605 .attach = mem_cgroup_move_task,
5606 .base_cftypes = mem_cgroup_files,
5645 .early_init = 0, 5607 .early_init = 0,
5646 .use_id = 1, 5608 .use_id = 1,
5609 .__DEPRECATED_clear_css_refs = true,
5647}; 5610};
5648 5611
5649#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5612#ifdef CONFIG_MEMCG_SWAP
5650static int __init enable_swap_account(char *s) 5613static int __init enable_swap_account(char *s)
5651{ 5614{
5652 /* consider enabled if no parameter or 1 is given */ 5615 /* consider enabled if no parameter or 1 is given */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 97cc2733551a..a6e2141a6610 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -128,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p)
128 * can only guarantee that the page either belongs to the memcg tasks, or is 128 * can only guarantee that the page either belongs to the memcg tasks, or is
129 * a freed page. 129 * a freed page.
130 */ 130 */
131#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 131#ifdef CONFIG_MEMCG_SWAP
132u64 hwpoison_filter_memcg; 132u64 hwpoison_filter_memcg;
133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); 133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
134static int hwpoison_filter_task(struct page *p) 134static int hwpoison_filter_task(struct page *p)
@@ -345,14 +345,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
345 * Also when FAIL is set do a force kill because something went 345 * Also when FAIL is set do a force kill because something went
346 * wrong earlier. 346 * wrong earlier.
347 */ 347 */
348static void kill_procs(struct list_head *to_kill, int doit, int trapno, 348static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
349 int fail, struct page *page, unsigned long pfn, 349 int fail, struct page *page, unsigned long pfn,
350 int flags) 350 int flags)
351{ 351{
352 struct to_kill *tk, *next; 352 struct to_kill *tk, *next;
353 353
354 list_for_each_entry_safe (tk, next, to_kill, nd) { 354 list_for_each_entry_safe (tk, next, to_kill, nd) {
355 if (doit) { 355 if (forcekill) {
356 /* 356 /*
357 * In case something went wrong with munmapping 357 * In case something went wrong with munmapping
358 * make sure the process doesn't catch the 358 * make sure the process doesn't catch the
@@ -858,7 +858,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
858 struct address_space *mapping; 858 struct address_space *mapping;
859 LIST_HEAD(tokill); 859 LIST_HEAD(tokill);
860 int ret; 860 int ret;
861 int kill = 1; 861 int kill = 1, forcekill;
862 struct page *hpage = compound_head(p); 862 struct page *hpage = compound_head(p);
863 struct page *ppage; 863 struct page *ppage;
864 864
@@ -888,7 +888,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
888 * be called inside page lock (it's recommended but not enforced). 888 * be called inside page lock (it's recommended but not enforced).
889 */ 889 */
890 mapping = page_mapping(hpage); 890 mapping = page_mapping(hpage);
891 if (!PageDirty(hpage) && mapping && 891 if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
892 mapping_cap_writeback_dirty(mapping)) { 892 mapping_cap_writeback_dirty(mapping)) {
893 if (page_mkclean(hpage)) { 893 if (page_mkclean(hpage)) {
894 SetPageDirty(hpage); 894 SetPageDirty(hpage);
@@ -965,12 +965,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
965 * Now that the dirty bit has been propagated to the 965 * Now that the dirty bit has been propagated to the
966 * struct page and all unmaps done we can decide if 966 * struct page and all unmaps done we can decide if
967 * killing is needed or not. Only kill when the page 967 * killing is needed or not. Only kill when the page
968 * was dirty, otherwise the tokill list is merely 968 * was dirty or the process is not restartable,
969 * otherwise the tokill list is merely
969 * freed. When there was a problem unmapping earlier 970 * freed. When there was a problem unmapping earlier
970 * use a more force-full uncatchable kill to prevent 971 * use a more force-full uncatchable kill to prevent
971 * any accesses to the poisoned memory. 972 * any accesses to the poisoned memory.
972 */ 973 */
973 kill_procs(&tokill, !!PageDirty(ppage), trapno, 974 forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
975 kill_procs(&tokill, forcekill, trapno,
974 ret != SWAP_SUCCESS, p, pfn, flags); 976 ret != SWAP_SUCCESS, p, pfn, flags);
975 977
976 return ret; 978 return ret;
@@ -1388,23 +1390,23 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1388 */ 1390 */
1389 if (!get_page_unless_zero(compound_head(p))) { 1391 if (!get_page_unless_zero(compound_head(p))) {
1390 if (PageHuge(p)) { 1392 if (PageHuge(p)) {
1391 pr_info("get_any_page: %#lx free huge page\n", pfn); 1393 pr_info("%s: %#lx free huge page\n", __func__, pfn);
1392 ret = dequeue_hwpoisoned_huge_page(compound_head(p)); 1394 ret = dequeue_hwpoisoned_huge_page(compound_head(p));
1393 } else if (is_free_buddy_page(p)) { 1395 } else if (is_free_buddy_page(p)) {
1394 pr_info("get_any_page: %#lx free buddy page\n", pfn); 1396 pr_info("%s: %#lx free buddy page\n", __func__, pfn);
1395 /* Set hwpoison bit while page is still isolated */ 1397 /* Set hwpoison bit while page is still isolated */
1396 SetPageHWPoison(p); 1398 SetPageHWPoison(p);
1397 ret = 0; 1399 ret = 0;
1398 } else { 1400 } else {
1399 pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", 1401 pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
1400 pfn, p->flags); 1402 __func__, pfn, p->flags);
1401 ret = -EIO; 1403 ret = -EIO;
1402 } 1404 }
1403 } else { 1405 } else {
1404 /* Not a free page */ 1406 /* Not a free page */
1405 ret = 1; 1407 ret = 1;
1406 } 1408 }
1407 unset_migratetype_isolate(p); 1409 unset_migratetype_isolate(p, MIGRATE_MOVABLE);
1408 unlock_memory_hotplug(); 1410 unlock_memory_hotplug();
1409 return ret; 1411 return ret;
1410} 1412}
@@ -1414,7 +1416,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
1414 int ret; 1416 int ret;
1415 unsigned long pfn = page_to_pfn(page); 1417 unsigned long pfn = page_to_pfn(page);
1416 struct page *hpage = compound_head(page); 1418 struct page *hpage = compound_head(page);
1417 LIST_HEAD(pagelist);
1418 1419
1419 ret = get_any_page(page, pfn, flags); 1420 ret = get_any_page(page, pfn, flags);
1420 if (ret < 0) 1421 if (ret < 0)
@@ -1429,24 +1430,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
1429 } 1430 }
1430 1431
1431 /* Keep page count to indicate a given hugepage is isolated. */ 1432 /* Keep page count to indicate a given hugepage is isolated. */
1432 1433 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false,
1433 list_add(&hpage->lru, &pagelist); 1434 MIGRATE_SYNC);
1434 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, 1435 put_page(hpage);
1435 true);
1436 if (ret) { 1436 if (ret) {
1437 struct page *page1, *page2;
1438 list_for_each_entry_safe(page1, page2, &pagelist, lru)
1439 put_page(page1);
1440
1441 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1437 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1442 pfn, ret, page->flags); 1438 pfn, ret, page->flags);
1443 if (ret > 0)
1444 ret = -EIO;
1445 return ret; 1439 return ret;
1446 } 1440 }
1447done: 1441done:
1448 if (!PageHWPoison(hpage)) 1442 if (!PageHWPoison(hpage))
1449 atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); 1443 atomic_long_add(1 << compound_trans_order(hpage),
1444 &mce_bad_pages);
1450 set_page_hwpoison_huge_page(hpage); 1445 set_page_hwpoison_huge_page(hpage);
1451 dequeue_hwpoisoned_huge_page(hpage); 1446 dequeue_hwpoisoned_huge_page(hpage);
1452 /* keep elevated page count for bad page */ 1447 /* keep elevated page count for bad page */
@@ -1561,7 +1556,7 @@ int soft_offline_page(struct page *page, int flags)
1561 page_is_file_cache(page)); 1556 page_is_file_cache(page));
1562 list_add(&page->lru, &pagelist); 1557 list_add(&page->lru, &pagelist);
1563 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1558 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1564 0, MIGRATE_SYNC); 1559 false, MIGRATE_SYNC);
1565 if (ret) { 1560 if (ret) {
1566 putback_lru_pages(&pagelist); 1561 putback_lru_pages(&pagelist);
1567 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1562 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
diff --git a/mm/memory.c b/mm/memory.c
index 6105f475fa86..57361708d1a5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -206,6 +206,8 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
206 tlb->mm = mm; 206 tlb->mm = mm;
207 207
208 tlb->fullmm = fullmm; 208 tlb->fullmm = fullmm;
209 tlb->start = -1UL;
210 tlb->end = 0;
209 tlb->need_flush = 0; 211 tlb->need_flush = 0;
210 tlb->fast_mode = (num_possible_cpus() == 1); 212 tlb->fast_mode = (num_possible_cpus() == 1);
211 tlb->local.next = NULL; 213 tlb->local.next = NULL;
@@ -248,6 +250,8 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e
248{ 250{
249 struct mmu_gather_batch *batch, *next; 251 struct mmu_gather_batch *batch, *next;
250 252
253 tlb->start = start;
254 tlb->end = end;
251 tlb_flush_mmu(tlb); 255 tlb_flush_mmu(tlb);
252 256
253 /* keep the page table cache within bounds */ 257 /* keep the page table cache within bounds */
@@ -1204,6 +1208,11 @@ again:
1204 */ 1208 */
1205 if (force_flush) { 1209 if (force_flush) {
1206 force_flush = 0; 1210 force_flush = 0;
1211
1212#ifdef HAVE_GENERIC_MMU_GATHER
1213 tlb->start = addr;
1214 tlb->end = end;
1215#endif
1207 tlb_flush_mmu(tlb); 1216 tlb_flush_mmu(tlb);
1208 if (addr != end) 1217 if (addr != end)
1209 goto again; 1218 goto again;
@@ -1225,7 +1234,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1225 next = pmd_addr_end(addr, end); 1234 next = pmd_addr_end(addr, end);
1226 if (pmd_trans_huge(*pmd)) { 1235 if (pmd_trans_huge(*pmd)) {
1227 if (next - addr != HPAGE_PMD_SIZE) { 1236 if (next - addr != HPAGE_PMD_SIZE) {
1228 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); 1237#ifdef CONFIG_DEBUG_VM
1238 if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
1239 pr_err("%s: mmap_sem is unlocked! addr=0x%lx end=0x%lx vma->vm_start=0x%lx vma->vm_end=0x%lx\n",
1240 __func__, addr, end,
1241 vma->vm_start,
1242 vma->vm_end);
1243 BUG();
1244 }
1245#endif
1229 split_huge_page_pmd(vma->vm_mm, pmd); 1246 split_huge_page_pmd(vma->vm_mm, pmd);
1230 } else if (zap_huge_pmd(tlb, vma, pmd, addr)) 1247 } else if (zap_huge_pmd(tlb, vma, pmd, addr))
1231 goto next; 1248 goto next;
@@ -1295,7 +1312,7 @@ static void unmap_page_range(struct mmu_gather *tlb,
1295 1312
1296static void unmap_single_vma(struct mmu_gather *tlb, 1313static void unmap_single_vma(struct mmu_gather *tlb,
1297 struct vm_area_struct *vma, unsigned long start_addr, 1314 struct vm_area_struct *vma, unsigned long start_addr,
1298 unsigned long end_addr, unsigned long *nr_accounted, 1315 unsigned long end_addr,
1299 struct zap_details *details) 1316 struct zap_details *details)
1300{ 1317{
1301 unsigned long start = max(vma->vm_start, start_addr); 1318 unsigned long start = max(vma->vm_start, start_addr);
@@ -1307,8 +1324,8 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1307 if (end <= vma->vm_start) 1324 if (end <= vma->vm_start)
1308 return; 1325 return;
1309 1326
1310 if (vma->vm_flags & VM_ACCOUNT) 1327 if (vma->vm_file)
1311 *nr_accounted += (end - start) >> PAGE_SHIFT; 1328 uprobe_munmap(vma, start, end);
1312 1329
1313 if (unlikely(is_pfn_mapping(vma))) 1330 if (unlikely(is_pfn_mapping(vma)))
1314 untrack_pfn_vma(vma, 0, 0); 1331 untrack_pfn_vma(vma, 0, 0);
@@ -1326,8 +1343,11 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1326 * Since no pte has actually been setup, it is 1343 * Since no pte has actually been setup, it is
1327 * safe to do nothing in this case. 1344 * safe to do nothing in this case.
1328 */ 1345 */
1329 if (vma->vm_file) 1346 if (vma->vm_file) {
1330 unmap_hugepage_range(vma, start, end, NULL); 1347 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
1348 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1349 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
1350 }
1331 } else 1351 } else
1332 unmap_page_range(tlb, vma, start, end, details); 1352 unmap_page_range(tlb, vma, start, end, details);
1333 } 1353 }
@@ -1339,8 +1359,6 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1339 * @vma: the starting vma 1359 * @vma: the starting vma
1340 * @start_addr: virtual address at which to start unmapping 1360 * @start_addr: virtual address at which to start unmapping
1341 * @end_addr: virtual address at which to end unmapping 1361 * @end_addr: virtual address at which to end unmapping
1342 * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
1343 * @details: details of nonlinear truncation or shared cache invalidation
1344 * 1362 *
1345 * Unmap all pages in the vma list. 1363 * Unmap all pages in the vma list.
1346 * 1364 *
@@ -1355,40 +1373,40 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1355 */ 1373 */
1356void unmap_vmas(struct mmu_gather *tlb, 1374void unmap_vmas(struct mmu_gather *tlb,
1357 struct vm_area_struct *vma, unsigned long start_addr, 1375 struct vm_area_struct *vma, unsigned long start_addr,
1358 unsigned long end_addr, unsigned long *nr_accounted, 1376 unsigned long end_addr)
1359 struct zap_details *details)
1360{ 1377{
1361 struct mm_struct *mm = vma->vm_mm; 1378 struct mm_struct *mm = vma->vm_mm;
1362 1379
1363 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1380 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
1364 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) 1381 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
1365 unmap_single_vma(tlb, vma, start_addr, end_addr, nr_accounted, 1382 unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
1366 details);
1367 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1383 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1368} 1384}
1369 1385
1370/** 1386/**
1371 * zap_page_range - remove user pages in a given range 1387 * zap_page_range - remove user pages in a given range
1372 * @vma: vm_area_struct holding the applicable pages 1388 * @vma: vm_area_struct holding the applicable pages
1373 * @address: starting address of pages to zap 1389 * @start: starting address of pages to zap
1374 * @size: number of bytes to zap 1390 * @size: number of bytes to zap
1375 * @details: details of nonlinear truncation or shared cache invalidation 1391 * @details: details of nonlinear truncation or shared cache invalidation
1376 * 1392 *
1377 * Caller must protect the VMA list 1393 * Caller must protect the VMA list
1378 */ 1394 */
1379void zap_page_range(struct vm_area_struct *vma, unsigned long address, 1395void zap_page_range(struct vm_area_struct *vma, unsigned long start,
1380 unsigned long size, struct zap_details *details) 1396 unsigned long size, struct zap_details *details)
1381{ 1397{
1382 struct mm_struct *mm = vma->vm_mm; 1398 struct mm_struct *mm = vma->vm_mm;
1383 struct mmu_gather tlb; 1399 struct mmu_gather tlb;
1384 unsigned long end = address + size; 1400 unsigned long end = start + size;
1385 unsigned long nr_accounted = 0;
1386 1401
1387 lru_add_drain(); 1402 lru_add_drain();
1388 tlb_gather_mmu(&tlb, mm, 0); 1403 tlb_gather_mmu(&tlb, mm, 0);
1389 update_hiwater_rss(mm); 1404 update_hiwater_rss(mm);
1390 unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); 1405 mmu_notifier_invalidate_range_start(mm, start, end);
1391 tlb_finish_mmu(&tlb, address, end); 1406 for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
1407 unmap_single_vma(&tlb, vma, start, end, details);
1408 mmu_notifier_invalidate_range_end(mm, start, end);
1409 tlb_finish_mmu(&tlb, start, end);
1392} 1410}
1393 1411
1394/** 1412/**
@@ -1406,13 +1424,12 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr
1406 struct mm_struct *mm = vma->vm_mm; 1424 struct mm_struct *mm = vma->vm_mm;
1407 struct mmu_gather tlb; 1425 struct mmu_gather tlb;
1408 unsigned long end = address + size; 1426 unsigned long end = address + size;
1409 unsigned long nr_accounted = 0;
1410 1427
1411 lru_add_drain(); 1428 lru_add_drain();
1412 tlb_gather_mmu(&tlb, mm, 0); 1429 tlb_gather_mmu(&tlb, mm, 0);
1413 update_hiwater_rss(mm); 1430 update_hiwater_rss(mm);
1414 mmu_notifier_invalidate_range_start(mm, address, end); 1431 mmu_notifier_invalidate_range_start(mm, address, end);
1415 unmap_single_vma(&tlb, vma, address, end, &nr_accounted, details); 1432 unmap_single_vma(&tlb, vma, address, end, details);
1416 mmu_notifier_invalidate_range_end(mm, address, end); 1433 mmu_notifier_invalidate_range_end(mm, address, end);
1417 tlb_finish_mmu(&tlb, address, end); 1434 tlb_finish_mmu(&tlb, address, end);
1418} 1435}
@@ -2633,6 +2650,9 @@ reuse:
2633 if (!page_mkwrite) { 2650 if (!page_mkwrite) {
2634 wait_on_page_locked(dirty_page); 2651 wait_on_page_locked(dirty_page);
2635 set_page_dirty_balance(dirty_page, page_mkwrite); 2652 set_page_dirty_balance(dirty_page, page_mkwrite);
2653 /* file_update_time outside page_lock */
2654 if (vma->vm_file)
2655 file_update_time(vma->vm_file);
2636 } 2656 }
2637 put_page(dirty_page); 2657 put_page(dirty_page);
2638 if (page_mkwrite) { 2658 if (page_mkwrite) {
@@ -2650,10 +2670,6 @@ reuse:
2650 } 2670 }
2651 } 2671 }
2652 2672
2653 /* file_update_time outside page_lock */
2654 if (vma->vm_file)
2655 file_update_time(vma->vm_file);
2656
2657 return ret; 2673 return ret;
2658 } 2674 }
2659 2675
@@ -2911,7 +2927,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2911 delayacct_set_flag(DELAYACCT_PF_SWAPIN); 2927 delayacct_set_flag(DELAYACCT_PF_SWAPIN);
2912 page = lookup_swap_cache(entry); 2928 page = lookup_swap_cache(entry);
2913 if (!page) { 2929 if (!page) {
2914 grab_swap_token(mm); /* Contend for token _before_ read-in */
2915 page = swapin_readahead(entry, 2930 page = swapin_readahead(entry,
2916 GFP_HIGHUSER_MOVABLE, vma, address); 2931 GFP_HIGHUSER_MOVABLE, vma, address);
2917 if (!page) { 2932 if (!page) {
@@ -2941,6 +2956,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2941 } 2956 }
2942 2957
2943 locked = lock_page_or_retry(page, mm, flags); 2958 locked = lock_page_or_retry(page, mm, flags);
2959
2944 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2960 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2945 if (!locked) { 2961 if (!locked) {
2946 ret |= VM_FAULT_RETRY; 2962 ret |= VM_FAULT_RETRY;
@@ -3322,12 +3338,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3322 3338
3323 if (dirty_page) { 3339 if (dirty_page) {
3324 struct address_space *mapping = page->mapping; 3340 struct address_space *mapping = page->mapping;
3341 int dirtied = 0;
3325 3342
3326 if (set_page_dirty(dirty_page)) 3343 if (set_page_dirty(dirty_page))
3327 page_mkwrite = 1; 3344 dirtied = 1;
3328 unlock_page(dirty_page); 3345 unlock_page(dirty_page);
3329 put_page(dirty_page); 3346 put_page(dirty_page);
3330 if (page_mkwrite && mapping) { 3347 if ((dirtied || page_mkwrite) && mapping) {
3331 /* 3348 /*
3332 * Some device drivers do not set page.mapping but still 3349 * Some device drivers do not set page.mapping but still
3333 * dirty their pages 3350 * dirty their pages
@@ -3336,7 +3353,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3336 } 3353 }
3337 3354
3338 /* file_update_time outside page_lock */ 3355 /* file_update_time outside page_lock */
3339 if (vma->vm_file) 3356 if (vma->vm_file && !page_mkwrite)
3340 file_update_time(vma->vm_file); 3357 file_update_time(vma->vm_file);
3341 } else { 3358 } else {
3342 unlock_page(vmf.page); 3359 unlock_page(vmf.page);
@@ -3489,6 +3506,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3489 if (unlikely(is_vm_hugetlb_page(vma))) 3506 if (unlikely(is_vm_hugetlb_page(vma)))
3490 return hugetlb_fault(mm, vma, address, flags); 3507 return hugetlb_fault(mm, vma, address, flags);
3491 3508
3509retry:
3492 pgd = pgd_offset(mm, address); 3510 pgd = pgd_offset(mm, address);
3493 pud = pud_alloc(mm, pgd, address); 3511 pud = pud_alloc(mm, pgd, address);
3494 if (!pud) 3512 if (!pud)
@@ -3502,13 +3520,24 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3502 pmd, flags); 3520 pmd, flags);
3503 } else { 3521 } else {
3504 pmd_t orig_pmd = *pmd; 3522 pmd_t orig_pmd = *pmd;
3523 int ret;
3524
3505 barrier(); 3525 barrier();
3506 if (pmd_trans_huge(orig_pmd)) { 3526 if (pmd_trans_huge(orig_pmd)) {
3507 if (flags & FAULT_FLAG_WRITE && 3527 if (flags & FAULT_FLAG_WRITE &&
3508 !pmd_write(orig_pmd) && 3528 !pmd_write(orig_pmd) &&
3509 !pmd_trans_splitting(orig_pmd)) 3529 !pmd_trans_splitting(orig_pmd)) {
3510 return do_huge_pmd_wp_page(mm, vma, address, 3530 ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
3511 pmd, orig_pmd); 3531 orig_pmd);
3532 /*
3533 * If COW results in an oom, the huge pmd will
3534 * have been split, so retry the fault on the
3535 * pte for a smaller charge.
3536 */
3537 if (unlikely(ret & VM_FAULT_OOM))
3538 goto retry;
3539 return ret;
3540 }
3512 return 0; 3541 return 0;
3513 } 3542 }
3514 } 3543 }
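handle_mm_fault() gains a retry: label before the page-table walk: if the huge-pmd copy-on-write path returns VM_FAULT_OOM, the huge pmd has already been split on the failure path, so the fault is retried and falls through to the normal pte path with a smaller charge. A stripped-down sketch of that control flow, with placeholder handlers and return codes:

    #include <stdio.h>

    enum { FAULT_OK, FAULT_OOM };       /* stand-ins for 0 / VM_FAULT_OOM */

    /* Pretend the huge-page COW handler hits OOM and splits the mapping. */
    static int huge_cow(int *is_huge)
    {
        *is_huge = 0;                   /* the huge mapping has been split */
        return FAULT_OOM;
    }

    static int pte_fault(void)
    {
        return FAULT_OK;                /* normal, smaller-charge path */
    }

    static int handle_fault(void)
    {
        int is_huge = 1;
        int ret;

    retry:
        if (is_huge) {
            ret = huge_cow(&is_huge);
            if (ret == FAULT_OOM)
                goto retry;             /* now split: retry at pte granularity */
            return ret;
        }
        return pte_fault();
    }

    int main(void)
    {
        printf("fault result: %d\n", handle_fault());
        return 0;
    }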
@@ -3912,7 +3941,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
3912 free_page((unsigned long)buf); 3941 free_page((unsigned long)buf);
3913 } 3942 }
3914 } 3943 }
3915 up_read(&current->mm->mmap_sem); 3944 up_read(&mm->mmap_sem);
3916} 3945}
3917 3946
3918#ifdef CONFIG_PROVE_LOCKING 3947#ifdef CONFIG_PROVE_LOCKING
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6629fafd6ce4..3ad25f9d1fc1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -74,8 +74,7 @@ static struct resource *register_memory_resource(u64 start, u64 size)
74 res->end = start + size - 1; 74 res->end = start + size - 1;
75 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 75 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
76 if (request_resource(&iomem_resource, res) < 0) { 76 if (request_resource(&iomem_resource, res) < 0) {
77 printk("System RAM resource %llx - %llx cannot be added\n", 77 printk("System RAM resource %pR cannot be added\n", res);
78 (unsigned long long)res->start, (unsigned long long)res->end);
79 kfree(res); 78 kfree(res);
80 res = NULL; 79 res = NULL;
81 } 80 }
@@ -502,8 +501,10 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
502 online_pages_range); 501 online_pages_range);
503 if (ret) { 502 if (ret) {
504 mutex_unlock(&zonelists_mutex); 503 mutex_unlock(&zonelists_mutex);
505 printk(KERN_DEBUG "online_pages %lx at %lx failed\n", 504 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
506 nr_pages, pfn); 505 (unsigned long long) pfn << PAGE_SHIFT,
506 (((unsigned long long) pfn + nr_pages)
507 << PAGE_SHIFT) - 1);
507 memory_notify(MEM_CANCEL_ONLINE, &arg); 508 memory_notify(MEM_CANCEL_ONLINE, &arg);
508 unlock_memory_hotplug(); 509 unlock_memory_hotplug();
509 return ret; 510 return ret;
@@ -511,19 +512,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
511 512
512 zone->present_pages += onlined_pages; 513 zone->present_pages += onlined_pages;
513 zone->zone_pgdat->node_present_pages += onlined_pages; 514 zone->zone_pgdat->node_present_pages += onlined_pages;
514 if (need_zonelists_rebuild) 515 if (onlined_pages) {
515 build_all_zonelists(zone); 516 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
516 else 517 if (need_zonelists_rebuild)
517 zone_pcp_update(zone); 518 build_all_zonelists(NULL, zone);
519 else
520 zone_pcp_update(zone);
521 }
518 522
519 mutex_unlock(&zonelists_mutex); 523 mutex_unlock(&zonelists_mutex);
520 524
521 init_per_zone_wmark_min(); 525 init_per_zone_wmark_min();
522 526
523 if (onlined_pages) { 527 if (onlined_pages)
524 kswapd_run(zone_to_nid(zone)); 528 kswapd_run(zone_to_nid(zone));
525 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
526 }
527 529
528 vm_total_pages = nr_free_pagecache_pages(); 530 vm_total_pages = nr_free_pagecache_pages();
529 531
@@ -561,7 +563,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
561 * to access not-initialized zonelist, build here. 563 * to access not-initialized zonelist, build here.
562 */ 564 */
563 mutex_lock(&zonelists_mutex); 565 mutex_lock(&zonelists_mutex);
564 build_all_zonelists(NULL); 566 build_all_zonelists(pgdat, NULL);
565 mutex_unlock(&zonelists_mutex); 567 mutex_unlock(&zonelists_mutex);
566 568
567 return pgdat; 569 return pgdat;
@@ -617,7 +619,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
617 pgdat = hotadd_new_pgdat(nid, start); 619 pgdat = hotadd_new_pgdat(nid, start);
618 ret = -ENOMEM; 620 ret = -ENOMEM;
619 if (!pgdat) 621 if (!pgdat)
620 goto out; 622 goto error;
621 new_pgdat = 1; 623 new_pgdat = 1;
622 } 624 }
623 625
@@ -891,7 +893,7 @@ static int __ref offline_pages(unsigned long start_pfn,
891 nr_pages = end_pfn - start_pfn; 893 nr_pages = end_pfn - start_pfn;
892 894
893 /* set above range as isolated */ 895 /* set above range as isolated */
894 ret = start_isolate_page_range(start_pfn, end_pfn); 896 ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
895 if (ret) 897 if (ret)
896 goto out; 898 goto out;
897 899
@@ -956,7 +958,7 @@ repeat:
956 We cannot do rollback at this point. */ 958 We cannot do rollback at this point. */
957 offline_isolated_pages(start_pfn, end_pfn); 959 offline_isolated_pages(start_pfn, end_pfn);
958 /* reset pagetype flags and makes migrate type to be MOVABLE */ 960 /* reset pagetype flags and makes migrate type to be MOVABLE */
959 undo_isolate_page_range(start_pfn, end_pfn); 961 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
960 /* removal success */ 962 /* removal success */
961 zone->present_pages -= offlined_pages; 963 zone->present_pages -= offlined_pages;
962 zone->zone_pgdat->node_present_pages -= offlined_pages; 964 zone->zone_pgdat->node_present_pages -= offlined_pages;
@@ -964,6 +966,9 @@ repeat:
964 966
965 init_per_zone_wmark_min(); 967 init_per_zone_wmark_min();
966 968
969 if (!populated_zone(zone))
970 zone_pcp_reset(zone);
971
967 if (!node_present_pages(node)) { 972 if (!node_present_pages(node)) {
968 node_clear_state(node, N_HIGH_MEMORY); 973 node_clear_state(node, N_HIGH_MEMORY);
969 kswapd_stop(node); 974 kswapd_stop(node);
@@ -977,11 +982,12 @@ repeat:
977 return 0; 982 return 0;
978 983
979failed_removal: 984failed_removal:
980 printk(KERN_INFO "memory offlining %lx to %lx failed\n", 985 printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
981 start_pfn, end_pfn); 986 (unsigned long long) start_pfn << PAGE_SHIFT,
987 ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
982 memory_notify(MEM_CANCEL_OFFLINE, &arg); 988 memory_notify(MEM_CANCEL_OFFLINE, &arg);
983 /* pushback to free area */ 989 /* pushback to free area */
984 undo_isolate_page_range(start_pfn, end_pfn); 990 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
985 991
986out: 992out:
987 unlock_memory_hotplug(); 993 unlock_memory_hotplug();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cfb6c8678754..bd92431d4c49 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -390,7 +390,7 @@ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
390{ 390{
391 if (!pol) 391 if (!pol)
392 return; 392 return;
393 if (!mpol_store_user_nodemask(pol) && step == 0 && 393 if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
394 nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) 394 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
395 return; 395 return;
396 396
@@ -607,27 +607,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
607 return first; 607 return first;
608} 608}
609 609
610/* Apply policy to a single VMA */
611static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
612{
613 int err = 0;
614 struct mempolicy *old = vma->vm_policy;
615
616 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
617 vma->vm_start, vma->vm_end, vma->vm_pgoff,
618 vma->vm_ops, vma->vm_file,
619 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
620
621 if (vma->vm_ops && vma->vm_ops->set_policy)
622 err = vma->vm_ops->set_policy(vma, new);
623 if (!err) {
624 mpol_get(new);
625 vma->vm_policy = new;
626 mpol_put(old);
627 }
628 return err;
629}
630
631/* Step 2: apply policy to a range and do splits. */ 610/* Step 2: apply policy to a range and do splits. */
632static int mbind_range(struct mm_struct *mm, unsigned long start, 611static int mbind_range(struct mm_struct *mm, unsigned long start,
633 unsigned long end, struct mempolicy *new_pol) 612 unsigned long end, struct mempolicy *new_pol)
@@ -676,9 +655,23 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
676 if (err) 655 if (err)
677 goto out; 656 goto out;
678 } 657 }
679 err = policy_vma(vma, new_pol); 658
680 if (err) 659 /*
681 goto out; 660 * Apply policy to a single VMA. The reference counting of
661 * policy for vma_policy linkages has already been handled by
662 * vma_merge and split_vma as necessary. If this is a shared
663 * policy then ->set_policy will increment the reference count
664 * for an sp node.
665 */
666 pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
667 vma->vm_start, vma->vm_end, vma->vm_pgoff,
668 vma->vm_ops, vma->vm_file,
669 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
670 if (vma->vm_ops && vma->vm_ops->set_policy) {
671 err = vma->vm_ops->set_policy(vma, new_pol);
672 if (err)
673 goto out;
674 }
682 } 675 }
683 676
684 out: 677 out:
@@ -957,8 +950,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
957 * 950 *
958 * Returns the number of page that could not be moved. 951 * Returns the number of page that could not be moved.
959 */ 952 */
960int do_migrate_pages(struct mm_struct *mm, 953int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
961 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 954 const nodemask_t *to, int flags)
962{ 955{
963 int busy = 0; 956 int busy = 0;
964 int err; 957 int err;
@@ -970,7 +963,7 @@ int do_migrate_pages(struct mm_struct *mm,
970 963
971 down_read(&mm->mmap_sem); 964 down_read(&mm->mmap_sem);
972 965
973 err = migrate_vmas(mm, from_nodes, to_nodes, flags); 966 err = migrate_vmas(mm, from, to, flags);
974 if (err) 967 if (err)
975 goto out; 968 goto out;
976 969
@@ -1005,14 +998,34 @@ int do_migrate_pages(struct mm_struct *mm,
1005 * moved to an empty node, then there is nothing left worth migrating. 998 * moved to an empty node, then there is nothing left worth migrating.
1006 */ 999 */
1007 1000
1008 tmp = *from_nodes; 1001 tmp = *from;
1009 while (!nodes_empty(tmp)) { 1002 while (!nodes_empty(tmp)) {
1010 int s,d; 1003 int s,d;
1011 int source = -1; 1004 int source = -1;
1012 int dest = 0; 1005 int dest = 0;
1013 1006
1014 for_each_node_mask(s, tmp) { 1007 for_each_node_mask(s, tmp) {
1015 d = node_remap(s, *from_nodes, *to_nodes); 1008
1009 /*
1010 * do_migrate_pages() tries to maintain the relative
1011 * node relationship of the pages established between
1012 * threads and memory areas.
1013 *
1014 * However if the number of source nodes is not equal to
1015 * the number of destination nodes we can not preserve
1016 * this node relative relationship. In that case, skip
1017 * copying memory from a node that is in the destination
1018 * mask.
1019 *
1020 * Example: [2,3,4] -> [3,4,5] moves everything.
1021 * [0-7] - > [3,4,5] moves only 0,1,2,6,7.
1022 */
1023
1024 if ((nodes_weight(*from) != nodes_weight(*to)) &&
1025 (node_isset(s, *to)))
1026 continue;
1027
1028 d = node_remap(s, *from, *to);
1016 if (s == d) 1029 if (s == d)
1017 continue; 1030 continue;
1018 1031
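The do_migrate_pages() hunk documents, and then implements, the rule that when the source and destination nodemasks have different weights the relative node relationship cannot be preserved, so any source node that is also in the destination mask is skipped (the comment's own example: [0-7] -> [3,4,5] moves only 0,1,2,6,7). A small standalone sketch of that selection using plain bitmasks instead of nodemask_t:

    #include <stdio.h>

    static int weight(unsigned long mask)
    {
        int w = 0;

        for (; mask; mask >>= 1)
            w += mask & 1;
        return w;
    }

    int main(void)
    {
        unsigned long from = 0xff;                           /* nodes 0-7 */
        unsigned long to   = (1UL << 3) | (1UL << 4) | (1UL << 5);
        int s;

        for (s = 0; s < 8; s++) {
            if (!(from & (1UL << s)))
                continue;
            /* Unequal weights: don't drain a node that is also a target. */
            if (weight(from) != weight(to) && (to & (1UL << s)))
                continue;
            printf("would migrate from node %d\n", s);       /* 0,1,2,6,7 */
        }
        return 0;
    }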
@@ -1072,8 +1085,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
1072{ 1085{
1073} 1086}
1074 1087
1075int do_migrate_pages(struct mm_struct *mm, 1088int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
1076 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 1089 const nodemask_t *to, int flags)
1077{ 1090{
1078 return -ENOSYS; 1091 return -ENOSYS;
1079} 1092}
@@ -1164,7 +1177,7 @@ static long do_mbind(unsigned long start, unsigned long len,
1164 if (!list_empty(&pagelist)) { 1177 if (!list_empty(&pagelist)) {
1165 nr_failed = migrate_pages(&pagelist, new_vma_page, 1178 nr_failed = migrate_pages(&pagelist, new_vma_page,
1166 (unsigned long)vma, 1179 (unsigned long)vma,
1167 false, true); 1180 false, MIGRATE_SYNC);
1168 if (nr_failed) 1181 if (nr_failed)
1169 putback_lru_pages(&pagelist); 1182 putback_lru_pages(&pagelist);
1170 } 1183 }
@@ -1334,8 +1347,8 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1334 * userid as the target process. 1347 * userid as the target process.
1335 */ 1348 */
1336 tcred = __task_cred(task); 1349 tcred = __task_cred(task);
1337 if (cred->euid != tcred->suid && cred->euid != tcred->uid && 1350 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1338 cred->uid != tcred->suid && cred->uid != tcred->uid && 1351 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
1339 !capable(CAP_SYS_NICE)) { 1352 !capable(CAP_SYS_NICE)) {
1340 rcu_read_unlock(); 1353 rcu_read_unlock();
1341 err = -EPERM; 1354 err = -EPERM;
@@ -1361,11 +1374,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1361 1374
1362 mm = get_task_mm(task); 1375 mm = get_task_mm(task);
1363 put_task_struct(task); 1376 put_task_struct(task);
1364 if (mm) 1377
1365 err = do_migrate_pages(mm, old, new, 1378 if (!mm) {
1366 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1367 else
1368 err = -EINVAL; 1379 err = -EINVAL;
1380 goto out;
1381 }
1382
1383 err = do_migrate_pages(mm, old, new,
1384 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1369 1385
1370 mmput(mm); 1386 mmput(mm);
1371out: 1387out:
@@ -1586,8 +1602,14 @@ static unsigned interleave_nodes(struct mempolicy *policy)
1586 * task can change it's policy. The system default policy requires no 1602 * task can change it's policy. The system default policy requires no
1587 * such protection. 1603 * such protection.
1588 */ 1604 */
1589unsigned slab_node(struct mempolicy *policy) 1605unsigned slab_node(void)
1590{ 1606{
1607 struct mempolicy *policy;
1608
1609 if (in_interrupt())
1610 return numa_node_id();
1611
1612 policy = current->mempolicy;
1591 if (!policy || policy->flags & MPOL_F_LOCAL) 1613 if (!policy || policy->flags & MPOL_F_LOCAL)
1592 return numa_node_id(); 1614 return numa_node_id();
1593 1615
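
The do_migrate_pages() hunk above pairs each source node with a destination via node_remap() and, when the two masks differ in weight, skips sources that already sit in the destination mask. Below is a minimal user-space sketch of that pairing rule; plain unsigned bitmasks stand in for nodemask_t, and weight()/remap() are illustrative helpers, not kernel API.

/* sketch only: mimics the skip rule and the node_remap() pairing */
#include <stdio.h>

#define MAX_NODES 16

static int weight(unsigned int mask)
{
        int w = 0;
        for (int n = 0; n < MAX_NODES; n++)
                if (mask & (1u << n))
                        w++;
        return w;
}

/* map the nth set bit of 'from' onto the (n mod weight(to))-th set bit of 'to' */
static int remap(int s, unsigned int from, unsigned int to)
{
        int idx = 0, i;

        for (i = 0; i < s; i++)
                if (from & (1u << i))
                        idx++;
        idx %= weight(to);
        for (i = 0; i < MAX_NODES; i++) {
                if (!(to & (1u << i)))
                        continue;
                if (idx-- == 0)
                        return i;
        }
        return s;
}

int main(void)
{
        unsigned int from = 0xff;                               /* nodes 0-7 */
        unsigned int to   = (1u << 3) | (1u << 4) | (1u << 5);  /* nodes 3-5 */

        for (int s = 0; s < MAX_NODES; s++) {
                if (!(from & (1u << s)))
                        continue;
                /* the new rule: with unequal weights, skip sources already in 'to' */
                if (weight(from) != weight(to) && (to & (1u << s)))
                        continue;
                printf("move node %d -> node %d\n", s, remap(s, from, to));
        }
        return 0;
}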
diff --git a/mm/mempool.c b/mm/mempool.c
index d9049811f352..54990476c049 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -63,19 +63,21 @@ EXPORT_SYMBOL(mempool_destroy);
63mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, 63mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
64 mempool_free_t *free_fn, void *pool_data) 64 mempool_free_t *free_fn, void *pool_data)
65{ 65{
66 return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1); 66 return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,
67 GFP_KERNEL, NUMA_NO_NODE);
67} 68}
68EXPORT_SYMBOL(mempool_create); 69EXPORT_SYMBOL(mempool_create);
69 70
70mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, 71mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
71 mempool_free_t *free_fn, void *pool_data, int node_id) 72 mempool_free_t *free_fn, void *pool_data,
73 gfp_t gfp_mask, int node_id)
72{ 74{
73 mempool_t *pool; 75 mempool_t *pool;
74 pool = kmalloc_node(sizeof(*pool), GFP_KERNEL | __GFP_ZERO, node_id); 76 pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id);
75 if (!pool) 77 if (!pool)
76 return NULL; 78 return NULL;
77 pool->elements = kmalloc_node(min_nr * sizeof(void *), 79 pool->elements = kmalloc_node(min_nr * sizeof(void *),
78 GFP_KERNEL, node_id); 80 gfp_mask, node_id);
79 if (!pool->elements) { 81 if (!pool->elements) {
80 kfree(pool); 82 kfree(pool);
81 return NULL; 83 return NULL;
@@ -93,7 +95,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
93 while (pool->curr_nr < pool->min_nr) { 95 while (pool->curr_nr < pool->min_nr) {
94 void *element; 96 void *element;
95 97
96 element = pool->alloc(GFP_KERNEL, pool->pool_data); 98 element = pool->alloc(gfp_mask, pool->pool_data);
97 if (unlikely(!element)) { 99 if (unlikely(!element)) {
98 mempool_destroy(pool); 100 mempool_destroy(pool);
99 return NULL; 101 return NULL;
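
mempool_create_node() now threads a caller-supplied gfp_mask through every allocation made while the pool is built. The user-space sketch below shows that shape only: an opaque ctx argument stands in for gfp_mask, libc malloc/free back the pool, and all type and function names are illustrative.

#include <stdlib.h>
#include <stdio.h>

typedef void *(*alloc_fn_t)(void *ctx, void *pool_data);
typedef void (*free_fn_t)(void *element, void *pool_data);

struct pool {
        int min_nr, curr_nr;
        void **elements;
        void *pool_data;
        alloc_fn_t alloc;
        free_fn_t free;
};

static void pool_destroy(struct pool *p)
{
        while (p->curr_nr > 0)
                p->free(p->elements[--p->curr_nr], p->pool_data);
        free(p->elements);
        free(p);
}

static struct pool *pool_create_node(int min_nr, alloc_fn_t alloc_fn,
                                     free_fn_t free_fn, void *pool_data,
                                     void *ctx)
{
        struct pool *p = calloc(1, sizeof(*p));

        if (!p)
                return NULL;
        p->elements = calloc(min_nr, sizeof(void *));
        if (!p->elements) {
                free(p);
                return NULL;
        }
        p->min_nr = min_nr;
        p->pool_data = pool_data;
        p->alloc = alloc_fn;
        p->free = free_fn;

        /* pre-fill with min_nr elements, every allocation using the same ctx */
        while (p->curr_nr < p->min_nr) {
                void *element = p->alloc(ctx, p->pool_data);

                if (!element) {
                        pool_destroy(p);
                        return NULL;
                }
                p->elements[p->curr_nr++] = element;
        }
        return p;
}

static void *demo_alloc(void *ctx, void *pool_data) { (void)ctx; (void)pool_data; return malloc(64); }
static void demo_free(void *element, void *pool_data) { (void)pool_data; free(element); }

int main(void)
{
        struct pool *p = pool_create_node(4, demo_alloc, demo_free, NULL, NULL);

        printf("pool %s, %d elements\n", p ? "created" : "failed", p ? p->curr_nr : 0);
        if (p)
                pool_destroy(p);
        return 0;
}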
diff --git a/mm/migrate.c b/mm/migrate.c
index 51c08a0c6f68..77ed2d773705 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -33,6 +33,7 @@
33#include <linux/memcontrol.h> 33#include <linux/memcontrol.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/hugetlb_cgroup.h>
36#include <linux/gfp.h> 37#include <linux/gfp.h>
37 38
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
@@ -436,7 +437,10 @@ void migrate_page_copy(struct page *newpage, struct page *page)
436 * is actually a signal that all of the page has become dirty. 437 * is actually a signal that all of the page has become dirty.
437 * Whereas only part of our page may be dirty. 438 * Whereas only part of our page may be dirty.
438 */ 439 */
439 __set_page_dirty_nobuffers(newpage); 440 if (PageSwapBacked(page))
441 SetPageDirty(newpage);
442 else
443 __set_page_dirty_nobuffers(newpage);
440 } 444 }
441 445
442 mlock_migrate_page(newpage, page); 446 mlock_migrate_page(newpage, page);
@@ -679,7 +683,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
679{ 683{
680 int rc = -EAGAIN; 684 int rc = -EAGAIN;
681 int remap_swapcache = 1; 685 int remap_swapcache = 1;
682 int charge = 0;
683 struct mem_cgroup *mem; 686 struct mem_cgroup *mem;
684 struct anon_vma *anon_vma = NULL; 687 struct anon_vma *anon_vma = NULL;
685 688
@@ -721,12 +724,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
721 } 724 }
722 725
723 /* charge against new page */ 726 /* charge against new page */
724 charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL); 727 mem_cgroup_prepare_migration(page, newpage, &mem);
725 if (charge == -ENOMEM) {
726 rc = -ENOMEM;
727 goto unlock;
728 }
729 BUG_ON(charge);
730 728
731 if (PageWriteback(page)) { 729 if (PageWriteback(page)) {
732 /* 730 /*
@@ -816,8 +814,7 @@ skip_unmap:
816 put_anon_vma(anon_vma); 814 put_anon_vma(anon_vma);
817 815
818uncharge: 816uncharge:
819 if (!charge) 817 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
820 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
821unlock: 818unlock:
822 unlock_page(page); 819 unlock_page(page);
823out: 820out:
@@ -928,16 +925,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
928 925
929 if (anon_vma) 926 if (anon_vma)
930 put_anon_vma(anon_vma); 927 put_anon_vma(anon_vma);
931 unlock_page(hpage);
932 928
933out: 929 if (!rc)
934 if (rc != -EAGAIN) { 930 hugetlb_cgroup_migrate(hpage, new_hpage);
935 list_del(&hpage->lru);
936 put_page(hpage);
937 }
938 931
932 unlock_page(hpage);
933out:
939 put_page(new_hpage); 934 put_page(new_hpage);
940
941 if (result) { 935 if (result) {
942 if (rc) 936 if (rc)
943 *result = rc; 937 *result = rc;
@@ -1013,48 +1007,32 @@ out:
1013 return nr_failed + retry; 1007 return nr_failed + retry;
1014} 1008}
1015 1009
1016int migrate_huge_pages(struct list_head *from, 1010int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1017 new_page_t get_new_page, unsigned long private, bool offlining, 1011 unsigned long private, bool offlining,
1018 enum migrate_mode mode) 1012 enum migrate_mode mode)
1019{ 1013{
1020 int retry = 1; 1014 int pass, rc;
1021 int nr_failed = 0; 1015
1022 int pass = 0; 1016 for (pass = 0; pass < 10; pass++) {
1023 struct page *page; 1017 rc = unmap_and_move_huge_page(get_new_page,
1024 struct page *page2; 1018 private, hpage, pass > 2, offlining,
1025 int rc; 1019 mode);
1026 1020 switch (rc) {
1027 for (pass = 0; pass < 10 && retry; pass++) { 1021 case -ENOMEM:
1028 retry = 0; 1022 goto out;
1029 1023 case -EAGAIN:
1030 list_for_each_entry_safe(page, page2, from, lru) { 1024 /* try again */
1031 cond_resched(); 1025 cond_resched();
1032 1026 break;
1033 rc = unmap_and_move_huge_page(get_new_page, 1027 case 0:
1034 private, page, pass > 2, offlining, 1028 goto out;
1035 mode); 1029 default:
1036 1030 rc = -EIO;
1037 switch(rc) { 1031 goto out;
1038 case -ENOMEM:
1039 goto out;
1040 case -EAGAIN:
1041 retry++;
1042 break;
1043 case 0:
1044 break;
1045 default:
1046 /* Permanent failure */
1047 nr_failed++;
1048 break;
1049 }
1050 } 1032 }
1051 } 1033 }
1052 rc = 0;
1053out: 1034out:
1054 if (rc) 1035 return rc;
1055 return rc;
1056
1057 return nr_failed + retry;
1058} 1036}
1059 1037
1060#ifdef CONFIG_NUMA 1038#ifdef CONFIG_NUMA
@@ -1371,8 +1349,8 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1371 * userid as the target process. 1349 * userid as the target process.
1372 */ 1350 */
1373 tcred = __task_cred(task); 1351 tcred = __task_cred(task);
1374 if (cred->euid != tcred->suid && cred->euid != tcred->uid && 1352 if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) &&
1375 cred->uid != tcred->suid && cred->uid != tcred->uid && 1353 !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) &&
1376 !capable(CAP_SYS_NICE)) { 1354 !capable(CAP_SYS_NICE)) {
1377 rcu_read_unlock(); 1355 rcu_read_unlock();
1378 err = -EPERM; 1356 err = -EPERM;
@@ -1388,14 +1366,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1388 mm = get_task_mm(task); 1366 mm = get_task_mm(task);
1389 put_task_struct(task); 1367 put_task_struct(task);
1390 1368
1391 if (mm) { 1369 if (!mm)
1392 if (nodes) 1370 return -EINVAL;
1393 err = do_pages_move(mm, task_nodes, nr_pages, pages, 1371
1394 nodes, status, flags); 1372 if (nodes)
1395 else 1373 err = do_pages_move(mm, task_nodes, nr_pages, pages,
1396 err = do_pages_stat(mm, nr_pages, pages, status); 1374 nodes, status, flags);
1397 } else 1375 else
1398 err = -EINVAL; 1376 err = do_pages_stat(mm, nr_pages, pages, status);
1399 1377
1400 mmput(mm); 1378 mmput(mm);
1401 return err; 1379 return err;
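
migrate_huge_pages() is replaced above by migrate_huge_page(), which retries a single hugepage for up to ten passes, backing off only on -EAGAIN and folding any other failure into -EIO. A user-space sketch of that retry shape follows; try_once() stands in for unmap_and_move_huge_page() and sched_yield() for cond_resched().

#include <errno.h>
#include <sched.h>
#include <stdio.h>

static int try_once(int pass)
{
        /* pretend the first two passes hit transient contention */
        return pass < 2 ? -EAGAIN : 0;
}

static int migrate_one(void)
{
        int pass, rc = -EAGAIN;

        for (pass = 0; pass < 10; pass++) {
                rc = try_once(pass);
                switch (rc) {
                case -ENOMEM:
                        return rc;              /* hard failure, give up */
                case -EAGAIN:
                        sched_yield();          /* cond_resched() stand-in */
                        break;                  /* try again */
                case 0:
                        return 0;               /* done */
                default:
                        return -EIO;            /* permanent failure */
                }
        }
        return rc;                              /* still -EAGAIN after ten passes */
}

int main(void)
{
        printf("migrate_one() -> %d\n", migrate_one());
        return 0;
}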
diff --git a/mm/mmap.c b/mm/mmap.c
index a7bf6a31c9f6..ae18a48e7e4e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -30,6 +30,7 @@
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/audit.h> 31#include <linux/audit.h>
32#include <linux/khugepaged.h> 32#include <linux/khugepaged.h>
33#include <linux/uprobes.h>
33 34
34#include <asm/uaccess.h> 35#include <asm/uaccess.h>
35#include <asm/cacheflush.h> 36#include <asm/cacheflush.h>
@@ -240,6 +241,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
240 return next; 241 return next;
241} 242}
242 243
244static unsigned long do_brk(unsigned long addr, unsigned long len);
245
243SYSCALL_DEFINE1(brk, unsigned long, brk) 246SYSCALL_DEFINE1(brk, unsigned long, brk)
244{ 247{
245 unsigned long rlim, retval; 248 unsigned long rlim, retval;
@@ -544,8 +547,15 @@ again: remove_next = 1 + (end > next->vm_end);
544 547
545 if (file) { 548 if (file) {
546 mapping = file->f_mapping; 549 mapping = file->f_mapping;
547 if (!(vma->vm_flags & VM_NONLINEAR)) 550 if (!(vma->vm_flags & VM_NONLINEAR)) {
548 root = &mapping->i_mmap; 551 root = &mapping->i_mmap;
552 uprobe_munmap(vma, vma->vm_start, vma->vm_end);
553
554 if (adjust_next)
555 uprobe_munmap(next, next->vm_start,
556 next->vm_end);
557 }
558
549 mutex_lock(&mapping->i_mmap_mutex); 559 mutex_lock(&mapping->i_mmap_mutex);
550 if (insert) { 560 if (insert) {
551 /* 561 /*
@@ -615,8 +625,16 @@ again: remove_next = 1 + (end > next->vm_end);
615 if (mapping) 625 if (mapping)
616 mutex_unlock(&mapping->i_mmap_mutex); 626 mutex_unlock(&mapping->i_mmap_mutex);
617 627
628 if (root) {
629 uprobe_mmap(vma);
630
631 if (adjust_next)
632 uprobe_mmap(next);
633 }
634
618 if (remove_next) { 635 if (remove_next) {
619 if (file) { 636 if (file) {
637 uprobe_munmap(next, next->vm_start, next->vm_end);
620 fput(file); 638 fput(file);
621 if (next->vm_flags & VM_EXECUTABLE) 639 if (next->vm_flags & VM_EXECUTABLE)
622 removed_exe_file_vma(mm); 640 removed_exe_file_vma(mm);
@@ -636,6 +654,8 @@ again: remove_next = 1 + (end > next->vm_end);
636 goto again; 654 goto again;
637 } 655 }
638 } 656 }
657 if (insert && file)
658 uprobe_mmap(insert);
639 659
640 validate_mm(mm); 660 validate_mm(mm);
641 661
@@ -923,6 +943,8 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
923 const unsigned long stack_flags 943 const unsigned long stack_flags
924 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); 944 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
925 945
946 mm->total_vm += pages;
947
926 if (file) { 948 if (file) {
927 mm->shared_vm += pages; 949 mm->shared_vm += pages;
928 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) 950 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
@@ -958,8 +980,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
958 struct mm_struct * mm = current->mm; 980 struct mm_struct * mm = current->mm;
959 struct inode *inode; 981 struct inode *inode;
960 vm_flags_t vm_flags; 982 vm_flags_t vm_flags;
961 int error;
962 unsigned long reqprot = prot;
963 983
964 /* 984 /*
965 * Does the application expect PROT_READ to imply PROT_EXEC? 985 * Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1081,13 +1101,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1081 } 1101 }
1082 } 1102 }
1083 1103
1084 error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
1085 if (error)
1086 return error;
1087
1088 return mmap_region(file, addr, len, flags, vm_flags, pgoff); 1104 return mmap_region(file, addr, len, flags, vm_flags, pgoff);
1089} 1105}
1090EXPORT_SYMBOL(do_mmap_pgoff);
1091 1106
1092SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1107SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1093 unsigned long, prot, unsigned long, flags, 1108 unsigned long, prot, unsigned long, flags,
@@ -1120,10 +1135,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1120 1135
1121 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1136 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1122 1137
1123 down_write(&current->mm->mmap_sem); 1138 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1124 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1125 up_write(&current->mm->mmap_sem);
1126
1127 if (file) 1139 if (file)
1128 fput(file); 1140 fput(file);
1129out: 1141out:
@@ -1337,13 +1349,16 @@ munmap_back:
1337out: 1349out:
1338 perf_event_mmap(vma); 1350 perf_event_mmap(vma);
1339 1351
1340 mm->total_vm += len >> PAGE_SHIFT;
1341 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1352 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1342 if (vm_flags & VM_LOCKED) { 1353 if (vm_flags & VM_LOCKED) {
1343 if (!mlock_vma_pages_range(vma, addr, addr + len)) 1354 if (!mlock_vma_pages_range(vma, addr, addr + len))
1344 mm->locked_vm += (len >> PAGE_SHIFT); 1355 mm->locked_vm += (len >> PAGE_SHIFT);
1345 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) 1356 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
1346 make_pages_present(addr, addr + len); 1357 make_pages_present(addr, addr + len);
1358
1359 if (file)
1360 uprobe_mmap(vma);
1361
1347 return addr; 1362 return addr;
1348 1363
1349unmap_and_free_vma: 1364unmap_and_free_vma:
@@ -1579,7 +1594,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1579 if (addr & ~PAGE_MASK) 1594 if (addr & ~PAGE_MASK)
1580 return -EINVAL; 1595 return -EINVAL;
1581 1596
1582 return arch_rebalance_pgtables(addr, len); 1597 addr = arch_rebalance_pgtables(addr, len);
1598 error = security_mmap_addr(addr);
1599 return error ? error : addr;
1583} 1600}
1584 1601
1585EXPORT_SYMBOL(get_unmapped_area); 1602EXPORT_SYMBOL(get_unmapped_area);
@@ -1589,33 +1606,34 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1589{ 1606{
1590 struct vm_area_struct *vma = NULL; 1607 struct vm_area_struct *vma = NULL;
1591 1608
1592 if (mm) { 1609 if (WARN_ON_ONCE(!mm)) /* Remove this in linux-3.6 */
1593 /* Check the cache first. */ 1610 return NULL;
1594 /* (Cache hit rate is typically around 35%.) */ 1611
1595 vma = mm->mmap_cache; 1612 /* Check the cache first. */
1596 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) { 1613 /* (Cache hit rate is typically around 35%.) */
1597 struct rb_node * rb_node; 1614 vma = mm->mmap_cache;
1598 1615 if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
1599 rb_node = mm->mm_rb.rb_node; 1616 struct rb_node *rb_node;
1600 vma = NULL; 1617
1601 1618 rb_node = mm->mm_rb.rb_node;
1602 while (rb_node) { 1619 vma = NULL;
1603 struct vm_area_struct * vma_tmp; 1620
1604 1621 while (rb_node) {
1605 vma_tmp = rb_entry(rb_node, 1622 struct vm_area_struct *vma_tmp;
1606 struct vm_area_struct, vm_rb); 1623
1607 1624 vma_tmp = rb_entry(rb_node,
1608 if (vma_tmp->vm_end > addr) { 1625 struct vm_area_struct, vm_rb);
1609 vma = vma_tmp; 1626
1610 if (vma_tmp->vm_start <= addr) 1627 if (vma_tmp->vm_end > addr) {
1611 break; 1628 vma = vma_tmp;
1612 rb_node = rb_node->rb_left; 1629 if (vma_tmp->vm_start <= addr)
1613 } else 1630 break;
1614 rb_node = rb_node->rb_right; 1631 rb_node = rb_node->rb_left;
1615 } 1632 } else
1616 if (vma) 1633 rb_node = rb_node->rb_right;
1617 mm->mmap_cache = vma;
1618 } 1634 }
1635 if (vma)
1636 mm->mmap_cache = vma;
1619 } 1637 }
1620 return vma; 1638 return vma;
1621} 1639}
@@ -1689,7 +1707,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
1689 return -ENOMEM; 1707 return -ENOMEM;
1690 1708
1691 /* Ok, everything looks good - let it rip */ 1709 /* Ok, everything looks good - let it rip */
1692 mm->total_vm += grow;
1693 if (vma->vm_flags & VM_LOCKED) 1710 if (vma->vm_flags & VM_LOCKED)
1694 mm->locked_vm += grow; 1711 mm->locked_vm += grow;
1695 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); 1712 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
@@ -1768,7 +1785,7 @@ int expand_downwards(struct vm_area_struct *vma,
1768 return -ENOMEM; 1785 return -ENOMEM;
1769 1786
1770 address &= PAGE_MASK; 1787 address &= PAGE_MASK;
1771 error = security_file_mmap(NULL, 0, 0, 0, address, 1); 1788 error = security_mmap_addr(address);
1772 if (error) 1789 if (error)
1773 return error; 1790 return error;
1774 1791
@@ -1862,15 +1879,19 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
1862 */ 1879 */
1863static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) 1880static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1864{ 1881{
1882 unsigned long nr_accounted = 0;
1883
1865 /* Update high watermark before we lower total_vm */ 1884 /* Update high watermark before we lower total_vm */
1866 update_hiwater_vm(mm); 1885 update_hiwater_vm(mm);
1867 do { 1886 do {
1868 long nrpages = vma_pages(vma); 1887 long nrpages = vma_pages(vma);
1869 1888
1870 mm->total_vm -= nrpages; 1889 if (vma->vm_flags & VM_ACCOUNT)
1890 nr_accounted += nrpages;
1871 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 1891 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
1872 vma = remove_vma(vma); 1892 vma = remove_vma(vma);
1873 } while (vma); 1893 } while (vma);
1894 vm_unacct_memory(nr_accounted);
1874 validate_mm(mm); 1895 validate_mm(mm);
1875} 1896}
1876 1897
@@ -1885,13 +1906,11 @@ static void unmap_region(struct mm_struct *mm,
1885{ 1906{
1886 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; 1907 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
1887 struct mmu_gather tlb; 1908 struct mmu_gather tlb;
1888 unsigned long nr_accounted = 0;
1889 1909
1890 lru_add_drain(); 1910 lru_add_drain();
1891 tlb_gather_mmu(&tlb, mm, 0); 1911 tlb_gather_mmu(&tlb, mm, 0);
1892 update_hiwater_rss(mm); 1912 update_hiwater_rss(mm);
1893 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); 1913 unmap_vmas(&tlb, vma, start, end);
1894 vm_unacct_memory(nr_accounted);
1895 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, 1914 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
1896 next ? next->vm_start : 0); 1915 next ? next->vm_start : 0);
1897 tlb_finish_mmu(&tlb, start, end); 1916 tlb_finish_mmu(&tlb, start, end);
@@ -2106,20 +2125,23 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2106 return 0; 2125 return 0;
2107} 2126}
2108 2127
2109EXPORT_SYMBOL(do_munmap); 2128int vm_munmap(unsigned long start, size_t len)
2110
2111SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
2112{ 2129{
2113 int ret; 2130 int ret;
2114 struct mm_struct *mm = current->mm; 2131 struct mm_struct *mm = current->mm;
2115 2132
2116 profile_munmap(addr);
2117
2118 down_write(&mm->mmap_sem); 2133 down_write(&mm->mmap_sem);
2119 ret = do_munmap(mm, addr, len); 2134 ret = do_munmap(mm, start, len);
2120 up_write(&mm->mmap_sem); 2135 up_write(&mm->mmap_sem);
2121 return ret; 2136 return ret;
2122} 2137}
2138EXPORT_SYMBOL(vm_munmap);
2139
2140SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
2141{
2142 profile_munmap(addr);
2143 return vm_munmap(addr, len);
2144}
2123 2145
2124static inline void verify_mm_writelocked(struct mm_struct *mm) 2146static inline void verify_mm_writelocked(struct mm_struct *mm)
2125{ 2147{
@@ -2136,7 +2158,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
2136 * anonymous maps. eventually we may be able to do some 2158 * anonymous maps. eventually we may be able to do some
2137 * brk-specific accounting here. 2159 * brk-specific accounting here.
2138 */ 2160 */
2139unsigned long do_brk(unsigned long addr, unsigned long len) 2161static unsigned long do_brk(unsigned long addr, unsigned long len)
2140{ 2162{
2141 struct mm_struct * mm = current->mm; 2163 struct mm_struct * mm = current->mm;
2142 struct vm_area_struct * vma, * prev; 2164 struct vm_area_struct * vma, * prev;
@@ -2149,10 +2171,6 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2149 if (!len) 2171 if (!len)
2150 return addr; 2172 return addr;
2151 2173
2152 error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
2153 if (error)
2154 return error;
2155
2156 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; 2174 flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
2157 2175
2158 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); 2176 error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
@@ -2232,7 +2250,17 @@ out:
2232 return addr; 2250 return addr;
2233} 2251}
2234 2252
2235EXPORT_SYMBOL(do_brk); 2253unsigned long vm_brk(unsigned long addr, unsigned long len)
2254{
2255 struct mm_struct *mm = current->mm;
2256 unsigned long ret;
2257
2258 down_write(&mm->mmap_sem);
2259 ret = do_brk(addr, len);
2260 up_write(&mm->mmap_sem);
2261 return ret;
2262}
2263EXPORT_SYMBOL(vm_brk);
2236 2264
2237/* Release all mmaps. */ 2265/* Release all mmaps. */
2238void exit_mmap(struct mm_struct *mm) 2266void exit_mmap(struct mm_struct *mm)
@@ -2264,8 +2292,7 @@ void exit_mmap(struct mm_struct *mm)
2264 tlb_gather_mmu(&tlb, mm, 1); 2292 tlb_gather_mmu(&tlb, mm, 1);
2265 /* update_hiwater_rss(mm) here? but nobody should be looking */ 2293 /* update_hiwater_rss(mm) here? but nobody should be looking */
2266 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2294 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2267 unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2295 unmap_vmas(&tlb, vma, 0, -1);
2268 vm_unacct_memory(nr_accounted);
2269 2296
2270 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 2297 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
2271 tlb_finish_mmu(&tlb, 0, -1); 2298 tlb_finish_mmu(&tlb, 0, -1);
@@ -2274,10 +2301,14 @@ void exit_mmap(struct mm_struct *mm)
2274 * Walk the list again, actually closing and freeing it, 2301 * Walk the list again, actually closing and freeing it,
2275 * with preemption enabled, without holding any MM locks. 2302 * with preemption enabled, without holding any MM locks.
2276 */ 2303 */
2277 while (vma) 2304 while (vma) {
2305 if (vma->vm_flags & VM_ACCOUNT)
2306 nr_accounted += vma_pages(vma);
2278 vma = remove_vma(vma); 2307 vma = remove_vma(vma);
2308 }
2309 vm_unacct_memory(nr_accounted);
2279 2310
2280 BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); 2311 WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
2281} 2312}
2282 2313
2283/* Insert vm structure into process list sorted by address 2314/* Insert vm structure into process list sorted by address
@@ -2311,6 +2342,7 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2311 if ((vma->vm_flags & VM_ACCOUNT) && 2342 if ((vma->vm_flags & VM_ACCOUNT) &&
2312 security_vm_enough_memory_mm(mm, vma_pages(vma))) 2343 security_vm_enough_memory_mm(mm, vma_pages(vma)))
2313 return -ENOMEM; 2344 return -ENOMEM;
2345
2314 vma_link(mm, vma, prev, rb_link, rb_parent); 2346 vma_link(mm, vma, prev, rb_link, rb_parent);
2315 return 0; 2347 return 0;
2316} 2348}
@@ -2380,6 +2412,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2380 new_vma->vm_pgoff = pgoff; 2412 new_vma->vm_pgoff = pgoff;
2381 if (new_vma->vm_file) { 2413 if (new_vma->vm_file) {
2382 get_file(new_vma->vm_file); 2414 get_file(new_vma->vm_file);
2415
2383 if (vma->vm_flags & VM_EXECUTABLE) 2416 if (vma->vm_flags & VM_EXECUTABLE)
2384 added_exe_file_vma(mm); 2417 added_exe_file_vma(mm);
2385 } 2418 }
@@ -2484,10 +2517,6 @@ int install_special_mapping(struct mm_struct *mm,
2484 vma->vm_ops = &special_mapping_vmops; 2517 vma->vm_ops = &special_mapping_vmops;
2485 vma->vm_private_data = pages; 2518 vma->vm_private_data = pages;
2486 2519
2487 ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
2488 if (ret)
2489 goto out;
2490
2491 ret = insert_vm_struct(mm, vma); 2520 ret = insert_vm_struct(mm, vma);
2492 if (ret) 2521 if (ret)
2493 goto out; 2522 goto out;
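
Several mmap.c entry points are reshaped the same way in the diff above: the internal helper (do_munmap(), do_brk()) keeps assuming mmap_sem is held, and a thin exported wrapper (vm_munmap(), vm_brk()) takes and drops the lock around the call. The user-space sketch below shows only that wrapper pattern, with a pthread rwlock standing in for mmap_sem and made-up names.

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t map_lock = PTHREAD_RWLOCK_INITIALIZER;  /* mmap_sem stand-in */
static unsigned long mapped_len;

/* caller must hold map_lock for writing, like do_munmap()/do_brk() */
static int do_op(unsigned long len)
{
        mapped_len += len;
        return 0;
}

/* exported wrapper: takes the lock, calls the helper, drops the lock */
int vm_op(unsigned long len)
{
        int ret;

        pthread_rwlock_wrlock(&map_lock);
        ret = do_op(len);
        pthread_rwlock_unlock(&map_lock);
        return ret;
}

int main(void)
{
        vm_op(4096);
        printf("mapped_len = %lu\n", mapped_len);
        return 0;
}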
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 9a611d3a1848..862b60822d9f 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -33,6 +33,24 @@
33void __mmu_notifier_release(struct mm_struct *mm) 33void __mmu_notifier_release(struct mm_struct *mm)
34{ 34{
35 struct mmu_notifier *mn; 35 struct mmu_notifier *mn;
36 struct hlist_node *n;
37
38 /*
39 * RCU here will block mmu_notifier_unregister until
40 * ->release returns.
41 */
42 rcu_read_lock();
43 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
44 /*
45 * if ->release runs before mmu_notifier_unregister it
46 * must be handled as it's the only way for the driver
47 * to flush all existing sptes and stop the driver
48 * from establishing any more sptes before all the
49 * pages in the mm are freed.
50 */
51 if (mn->ops->release)
52 mn->ops->release(mn, mm);
53 rcu_read_unlock();
36 54
37 spin_lock(&mm->mmu_notifier_mm->lock); 55 spin_lock(&mm->mmu_notifier_mm->lock);
38 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 56 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
46 * mmu_notifier_unregister to return. 64 * mmu_notifier_unregister to return.
47 */ 65 */
48 hlist_del_init_rcu(&mn->hlist); 66 hlist_del_init_rcu(&mn->hlist);
49 /*
50 * RCU here will block mmu_notifier_unregister until
51 * ->release returns.
52 */
53 rcu_read_lock();
54 spin_unlock(&mm->mmu_notifier_mm->lock);
55 /*
56 * if ->release runs before mmu_notifier_unregister it
57 * must be handled as it's the only way for the driver
58 * to flush all existing sptes and stop the driver
59 * from establishing any more sptes before all the
60 * pages in the mm are freed.
61 */
62 if (mn->ops->release)
63 mn->ops->release(mn, mm);
64 rcu_read_unlock();
65 spin_lock(&mm->mmu_notifier_mm->lock);
66 } 67 }
67 spin_unlock(&mm->mmu_notifier_mm->lock); 68 spin_unlock(&mm->mmu_notifier_mm->lock);
68 69
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
284{ 285{
285 BUG_ON(atomic_read(&mm->mm_count) <= 0); 286 BUG_ON(atomic_read(&mm->mm_count) <= 0);
286 287
287 spin_lock(&mm->mmu_notifier_mm->lock);
288 if (!hlist_unhashed(&mn->hlist)) { 288 if (!hlist_unhashed(&mn->hlist)) {
289 hlist_del_rcu(&mn->hlist);
290
291 /* 289 /*
292 * RCU here will force exit_mmap to wait ->release to finish 290 * RCU here will force exit_mmap to wait ->release to finish
293 * before freeing the pages. 291 * before freeing the pages.
294 */ 292 */
295 rcu_read_lock(); 293 rcu_read_lock();
296 spin_unlock(&mm->mmu_notifier_mm->lock); 294
297 /* 295 /*
298 * exit_mmap will block in mmu_notifier_release to 296 * exit_mmap will block in mmu_notifier_release to
299 * guarantee ->release is called before freeing the 297 * guarantee ->release is called before freeing the
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
302 if (mn->ops->release) 300 if (mn->ops->release)
303 mn->ops->release(mn, mm); 301 mn->ops->release(mn, mm);
304 rcu_read_unlock(); 302 rcu_read_unlock();
305 } else 303
304 spin_lock(&mm->mmu_notifier_mm->lock);
305 hlist_del_rcu(&mn->hlist);
306 spin_unlock(&mm->mmu_notifier_mm->lock); 306 spin_unlock(&mm->mmu_notifier_mm->lock);
307 }
307 308
308 /* 309 /*
309 * Wait any running method to finish, of course including 310 * Wait any running method to finish, of course including
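
The mmu_notifier changes move the ->release callbacks so they run under the RCU read lock before any unhashing, and the list manipulation then happens separately under mmu_notifier_mm->lock. The sketch below only models that ordering in user space: a pthread rwlock stands in for RCU and a mutex for the spinlock, so it says nothing about real RCU semantics.

#include <pthread.h>
#include <stdio.h>

struct notifier {
        struct notifier *next;
        void (*release)(struct notifier *n);
};

static pthread_rwlock_t read_side = PTHREAD_RWLOCK_INITIALIZER;  /* RCU stand-in */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;    /* ->lock stand-in */
static struct notifier *head;

static void release_all(void)
{
        struct notifier *n;

        /* 1) run the callbacks first, holding only the read-side lock */
        pthread_rwlock_rdlock(&read_side);
        for (n = head; n; n = n->next)
                if (n->release)
                        n->release(n);
        pthread_rwlock_unlock(&read_side);

        /* 2) then unhash everything under the list lock */
        pthread_mutex_lock(&list_lock);
        head = NULL;
        pthread_mutex_unlock(&list_lock);
}

static void say_bye(struct notifier *n) { (void)n; puts("release called"); }

int main(void)
{
        struct notifier n = { .next = NULL, .release = say_bye };

        head = &n;
        release_all();
        return 0;
}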
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 7cf7b7ddc7c5..3cef80f6ac79 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -86,3 +86,17 @@ int memmap_valid_within(unsigned long pfn,
86 return 1; 86 return 1;
87} 87}
88#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 88#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
89
90void lruvec_init(struct lruvec *lruvec, struct zone *zone)
91{
92 enum lru_list lru;
93
94 memset(lruvec, 0, sizeof(struct lruvec));
95
96 for_each_lru(lru)
97 INIT_LIST_HEAD(&lruvec->lists[lru]);
98
99#ifdef CONFIG_MEMCG
100 lruvec->zone = zone;
101#endif
102}
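
lruvec_init() is new common code: zero the structure, then give every LRU list an empty head. The user-space sketch below mirrors that loop with a tiny circular-list type in place of struct list_head; the enum values are copied for flavor only and the zone pointer is left out.

#include <stdio.h>
#include <string.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *l) { l->next = l->prev = l; }

enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON, LRU_INACTIVE_FILE,
                LRU_ACTIVE_FILE, LRU_UNEVICTABLE, NR_LRU_LISTS };

struct lruvec { struct list_head lists[NR_LRU_LISTS]; };

static void lruvec_init(struct lruvec *lruvec)
{
        int lru;

        memset(lruvec, 0, sizeof(*lruvec));
        for (lru = 0; lru < NR_LRU_LISTS; lru++)
                INIT_LIST_HEAD(&lruvec->lists[lru]);
}

int main(void)
{
        struct lruvec lv;

        lruvec_init(&lv);
        printf("list %d empty: %d\n", LRU_ACTIVE_FILE,
               lv.lists[LRU_ACTIVE_FILE].next == &lv.lists[LRU_ACTIVE_FILE]);
        return 0;
}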
diff --git a/mm/mremap.c b/mm/mremap.c
index db8d983b5a7d..cc06d0e48d05 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -260,7 +260,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
260 * If this were a serious issue, we'd add a flag to do_munmap(). 260 * If this were a serious issue, we'd add a flag to do_munmap().
261 */ 261 */
262 hiwater_vm = mm->hiwater_vm; 262 hiwater_vm = mm->hiwater_vm;
263 mm->total_vm += new_len >> PAGE_SHIFT;
264 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); 263 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
265 264
266 if (do_munmap(mm, old_addr, old_len) < 0) { 265 if (do_munmap(mm, old_addr, old_len) < 0) {
@@ -371,10 +370,6 @@ static unsigned long mremap_to(unsigned long addr,
371 if ((addr <= new_addr) && (addr+old_len) > new_addr) 370 if ((addr <= new_addr) && (addr+old_len) > new_addr)
372 goto out; 371 goto out;
373 372
374 ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
375 if (ret)
376 goto out;
377
378 ret = do_munmap(mm, new_addr, new_len); 373 ret = do_munmap(mm, new_addr, new_len);
379 if (ret) 374 if (ret)
380 goto out; 375 goto out;
@@ -432,15 +427,17 @@ static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
432 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise 427 * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
433 * This option implies MREMAP_MAYMOVE. 428 * This option implies MREMAP_MAYMOVE.
434 */ 429 */
435unsigned long do_mremap(unsigned long addr, 430SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
436 unsigned long old_len, unsigned long new_len, 431 unsigned long, new_len, unsigned long, flags,
437 unsigned long flags, unsigned long new_addr) 432 unsigned long, new_addr)
438{ 433{
439 struct mm_struct *mm = current->mm; 434 struct mm_struct *mm = current->mm;
440 struct vm_area_struct *vma; 435 struct vm_area_struct *vma;
441 unsigned long ret = -EINVAL; 436 unsigned long ret = -EINVAL;
442 unsigned long charged = 0; 437 unsigned long charged = 0;
443 438
439 down_write(&current->mm->mmap_sem);
440
444 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) 441 if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
445 goto out; 442 goto out;
446 443
@@ -499,7 +496,6 @@ unsigned long do_mremap(unsigned long addr,
499 goto out; 496 goto out;
500 } 497 }
501 498
502 mm->total_vm += pages;
503 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 499 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
504 if (vma->vm_flags & VM_LOCKED) { 500 if (vma->vm_flags & VM_LOCKED) {
505 mm->locked_vm += pages; 501 mm->locked_vm += pages;
@@ -530,25 +526,11 @@ unsigned long do_mremap(unsigned long addr,
530 goto out; 526 goto out;
531 } 527 }
532 528
533 ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
534 if (ret)
535 goto out;
536 ret = move_vma(vma, addr, old_len, new_len, new_addr); 529 ret = move_vma(vma, addr, old_len, new_len, new_addr);
537 } 530 }
538out: 531out:
539 if (ret & ~PAGE_MASK) 532 if (ret & ~PAGE_MASK)
540 vm_unacct_memory(charged); 533 vm_unacct_memory(charged);
541 return ret;
542}
543
544SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
545 unsigned long, new_len, unsigned long, flags,
546 unsigned long, new_addr)
547{
548 unsigned long ret;
549
550 down_write(&current->mm->mmap_sem);
551 ret = do_mremap(addr, old_len, new_len, flags, new_addr);
552 up_write(&current->mm->mmap_sem); 534 up_write(&current->mm->mmap_sem);
553 return ret; 535 return ret;
554} 536}
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 24f0fc1a56d6..405573010f99 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -82,8 +82,7 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
82 82
83static void __init __free_pages_memory(unsigned long start, unsigned long end) 83static void __init __free_pages_memory(unsigned long start, unsigned long end)
84{ 84{
85 int i; 85 unsigned long i, start_aligned, end_aligned;
86 unsigned long start_aligned, end_aligned;
87 int order = ilog2(BITS_PER_LONG); 86 int order = ilog2(BITS_PER_LONG);
88 87
89 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); 88 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
@@ -106,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
106 __free_pages_bootmem(pfn_to_page(i), 0); 105 __free_pages_bootmem(pfn_to_page(i), 0);
107} 106}
108 107
108static unsigned long __init __free_memory_core(phys_addr_t start,
109 phys_addr_t end)
110{
111 unsigned long start_pfn = PFN_UP(start);
112 unsigned long end_pfn = min_t(unsigned long,
113 PFN_DOWN(end), max_low_pfn);
114
115 if (start_pfn > end_pfn)
116 return 0;
117
118 __free_pages_memory(start_pfn, end_pfn);
119
120 return end_pfn - start_pfn;
121}
122
109unsigned long __init free_low_memory_core_early(int nodeid) 123unsigned long __init free_low_memory_core_early(int nodeid)
110{ 124{
111 unsigned long count = 0; 125 unsigned long count = 0;
112 phys_addr_t start, end; 126 phys_addr_t start, end, size;
113 u64 i; 127 u64 i;
114 128
115 /* free reserved array temporarily so that it's treated as free area */ 129 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL)
116 memblock_free_reserved_regions(); 130 count += __free_memory_core(start, end);
117 131
118 for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { 132 /* free range that is used for reserved array if we allocate it */
119 unsigned long start_pfn = PFN_UP(start); 133 size = get_allocated_memblock_reserved_regions_info(&start);
120 unsigned long end_pfn = min_t(unsigned long, 134 if (size)
121 PFN_DOWN(end), max_low_pfn); 135 count += __free_memory_core(start, start + size);
122 if (start_pfn < end_pfn) {
123 __free_pages_memory(start_pfn, end_pfn);
124 count += end_pfn - start_pfn;
125 }
126 }
127 136
128 /* put region array back? */
129 memblock_reserve_reserved_regions();
130 return count; 137 return count;
131} 138}
132 139
@@ -275,6 +282,57 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
275 return ___alloc_bootmem(size, align, goal, limit); 282 return ___alloc_bootmem(size, align, goal, limit);
276} 283}
277 284
285void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
286 unsigned long size,
287 unsigned long align,
288 unsigned long goal,
289 unsigned long limit)
290{
291 void *ptr;
292
293again:
294 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
295 goal, limit);
296 if (ptr)
297 return ptr;
298
299 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
300 goal, limit);
301 if (ptr)
302 return ptr;
303
304 if (goal) {
305 goal = 0;
306 goto again;
307 }
308
309 return NULL;
310}
311
312void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
313 unsigned long align, unsigned long goal)
314{
315 if (WARN_ON_ONCE(slab_is_available()))
316 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
317
318 return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
319}
320
321void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
322 unsigned long align, unsigned long goal,
323 unsigned long limit)
324{
325 void *ptr;
326
327 ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
328 if (ptr)
329 return ptr;
330
331 printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
332 panic("Out of memory");
333 return NULL;
334}
335
278/** 336/**
279 * __alloc_bootmem_node - allocate boot memory from a specific node 337 * __alloc_bootmem_node - allocate boot memory from a specific node
280 * @pgdat: node to allocate from 338 * @pgdat: node to allocate from
@@ -293,18 +351,10 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
293void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, 351void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
294 unsigned long align, unsigned long goal) 352 unsigned long align, unsigned long goal)
295{ 353{
296 void *ptr;
297
298 if (WARN_ON_ONCE(slab_is_available())) 354 if (WARN_ON_ONCE(slab_is_available()))
299 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 355 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
300 356
301 ptr = __alloc_memory_core_early(pgdat->node_id, size, align, 357 return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
302 goal, -1ULL);
303 if (ptr)
304 return ptr;
305
306 return __alloc_memory_core_early(MAX_NUMNODES, size, align,
307 goal, -1ULL);
308} 358}
309 359
310void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, 360void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -313,44 +363,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
313 return __alloc_bootmem_node(pgdat, size, align, goal); 363 return __alloc_bootmem_node(pgdat, size, align, goal);
314} 364}
315 365
316#ifdef CONFIG_SPARSEMEM
317/**
318 * alloc_bootmem_section - allocate boot memory from a specific section
319 * @size: size of the request in bytes
320 * @section_nr: sparse map section to allocate from
321 *
322 * Return NULL on failure.
323 */
324void * __init alloc_bootmem_section(unsigned long size,
325 unsigned long section_nr)
326{
327 unsigned long pfn, goal, limit;
328
329 pfn = section_nr_to_pfn(section_nr);
330 goal = pfn << PAGE_SHIFT;
331 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
332
333 return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
334 SMP_CACHE_BYTES, goal, limit);
335}
336#endif
337
338void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
339 unsigned long align, unsigned long goal)
340{
341 void *ptr;
342
343 if (WARN_ON_ONCE(slab_is_available()))
344 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
345
346 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
347 goal, -1ULL);
348 if (ptr)
349 return ptr;
350
351 return __alloc_bootmem_nopanic(size, align, goal);
352}
353
354#ifndef ARCH_LOW_ADDRESS_LIMIT 366#ifndef ARCH_LOW_ADDRESS_LIMIT
355#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL 367#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
356#endif 368#endif
@@ -392,16 +404,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
392void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 404void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
393 unsigned long align, unsigned long goal) 405 unsigned long align, unsigned long goal)
394{ 406{
395 void *ptr;
396
397 if (WARN_ON_ONCE(slab_is_available())) 407 if (WARN_ON_ONCE(slab_is_available()))
398 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 408 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
399 409
400 ptr = __alloc_memory_core_early(pgdat->node_id, size, align, 410 return ___alloc_bootmem_node(pgdat, size, align, goal,
401 goal, ARCH_LOW_ADDRESS_LIMIT); 411 ARCH_LOW_ADDRESS_LIMIT);
402 if (ptr)
403 return ptr;
404
405 return __alloc_memory_core_early(MAX_NUMNODES, size, align,
406 goal, ARCH_LOW_ADDRESS_LIMIT);
407} 412}
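
free_low_memory_core_early() now funnels each memblock range through __free_memory_core(), which rounds the range start up and the end down to page frames and clamps at max_low_pfn. That arithmetic is shown below as a standalone user-space function; PAGE_SHIFT and the PFN macros mirror the kernel definitions, while the numbers in main() are made up.

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)

static unsigned long max_low_pfn = 0x100000;    /* illustrative lowmem limit */

static unsigned long free_memory_core(unsigned long long start,
                                      unsigned long long end)
{
        unsigned long start_pfn = PFN_UP(start);
        unsigned long end_pfn = PFN_DOWN(end);

        if (end_pfn > max_low_pfn)
                end_pfn = max_low_pfn;
        if (start_pfn > end_pfn)
                return 0;

        /* the kernel would hand pages [start_pfn, end_pfn) to the buddy here */
        return end_pfn - start_pfn;
}

int main(void)
{
        /* a range that is not page aligned at either end: frees pages 2..6 */
        printf("%lu pages freed\n", free_memory_core(0x1800, 0x7400));
        return 0;
}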
diff --git a/mm/nommu.c b/mm/nommu.c
index f59e170fceb4..d4b0c10872de 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -889,7 +889,6 @@ static int validate_mmap_request(struct file *file,
889 unsigned long *_capabilities) 889 unsigned long *_capabilities)
890{ 890{
891 unsigned long capabilities, rlen; 891 unsigned long capabilities, rlen;
892 unsigned long reqprot = prot;
893 int ret; 892 int ret;
894 893
895 /* do the simple checks first */ 894 /* do the simple checks first */
@@ -1047,7 +1046,7 @@ static int validate_mmap_request(struct file *file,
1047 } 1046 }
1048 1047
1049 /* allow the security API to have its say */ 1048 /* allow the security API to have its say */
1050 ret = security_file_mmap(file, reqprot, prot, flags, addr, 0); 1049 ret = security_mmap_addr(addr);
1051 if (ret < 0) 1050 if (ret < 0)
1052 return ret; 1051 return ret;
1053 1052
@@ -1470,7 +1469,6 @@ error_getting_region:
1470 show_free_areas(0); 1469 show_free_areas(0);
1471 return -ENOMEM; 1470 return -ENOMEM;
1472} 1471}
1473EXPORT_SYMBOL(do_mmap_pgoff);
1474 1472
1475SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, 1473SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1476 unsigned long, prot, unsigned long, flags, 1474 unsigned long, prot, unsigned long, flags,
@@ -1488,9 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1488 1486
1489 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1487 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1490 1488
1491 down_write(&current->mm->mmap_sem); 1489 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1492 retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
1493 up_write(&current->mm->mmap_sem);
1494 1490
1495 if (file) 1491 if (file)
1496 fput(file); 1492 fput(file);
@@ -1709,16 +1705,22 @@ erase_whole_vma:
1709} 1705}
1710EXPORT_SYMBOL(do_munmap); 1706EXPORT_SYMBOL(do_munmap);
1711 1707
1712SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) 1708int vm_munmap(unsigned long addr, size_t len)
1713{ 1709{
1714 int ret;
1715 struct mm_struct *mm = current->mm; 1710 struct mm_struct *mm = current->mm;
1711 int ret;
1716 1712
1717 down_write(&mm->mmap_sem); 1713 down_write(&mm->mmap_sem);
1718 ret = do_munmap(mm, addr, len); 1714 ret = do_munmap(mm, addr, len);
1719 up_write(&mm->mmap_sem); 1715 up_write(&mm->mmap_sem);
1720 return ret; 1716 return ret;
1721} 1717}
1718EXPORT_SYMBOL(vm_munmap);
1719
1720SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
1721{
1722 return vm_munmap(addr, len);
1723}
1722 1724
1723/* 1725/*
1724 * release all the mappings made in a process's VM space 1726 * release all the mappings made in a process's VM space
@@ -1744,7 +1746,7 @@ void exit_mmap(struct mm_struct *mm)
1744 kleave(""); 1746 kleave("");
1745} 1747}
1746 1748
1747unsigned long do_brk(unsigned long addr, unsigned long len) 1749unsigned long vm_brk(unsigned long addr, unsigned long len)
1748{ 1750{
1749 return -ENOMEM; 1751 return -ENOMEM;
1750} 1752}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 46bf2ed5594c..198600861638 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -180,10 +180,11 @@ static bool oom_unkillable_task(struct task_struct *p,
180 * predictable as possible. The goal is to return the highest value for the 180 * predictable as possible. The goal is to return the highest value for the
181 * task consuming the most memory to avoid subsequent oom failures. 181 * task consuming the most memory to avoid subsequent oom failures.
182 */ 182 */
183unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, 183unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
184 const nodemask_t *nodemask, unsigned long totalpages) 184 const nodemask_t *nodemask, unsigned long totalpages)
185{ 185{
186 long points; 186 long points;
187 long adj;
187 188
188 if (oom_unkillable_task(p, memcg, nodemask)) 189 if (oom_unkillable_task(p, memcg, nodemask))
189 return 0; 190 return 0;
@@ -192,27 +193,18 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
192 if (!p) 193 if (!p)
193 return 0; 194 return 0;
194 195
195 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { 196 adj = p->signal->oom_score_adj;
197 if (adj == OOM_SCORE_ADJ_MIN) {
196 task_unlock(p); 198 task_unlock(p);
197 return 0; 199 return 0;
198 } 200 }
199 201
200 /* 202 /*
201 * The memory controller may have a limit of 0 bytes, so avoid a divide
202 * by zero, if necessary.
203 */
204 if (!totalpages)
205 totalpages = 1;
206
207 /*
208 * The baseline for the badness score is the proportion of RAM that each 203 * The baseline for the badness score is the proportion of RAM that each
209 * task's rss, pagetable and swap space use. 204 * task's rss, pagetable and swap space use.
210 */ 205 */
211 points = get_mm_rss(p->mm) + p->mm->nr_ptes; 206 points = get_mm_rss(p->mm) + p->mm->nr_ptes +
212 points += get_mm_counter(p->mm, MM_SWAPENTS); 207 get_mm_counter(p->mm, MM_SWAPENTS);
213
214 points *= 1000;
215 points /= totalpages;
216 task_unlock(p); 208 task_unlock(p);
217 209
218 /* 210 /*
@@ -220,23 +212,17 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
220 * implementation used by LSMs. 212 * implementation used by LSMs.
221 */ 213 */
222 if (has_capability_noaudit(p, CAP_SYS_ADMIN)) 214 if (has_capability_noaudit(p, CAP_SYS_ADMIN))
223 points -= 30; 215 adj -= 30;
224 216
225 /* 217 /* Normalize to oom_score_adj units */
226 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may 218 adj *= totalpages / 1000;
227 * either completely disable oom killing or always prefer a certain 219 points += adj;
228 * task.
229 */
230 points += p->signal->oom_score_adj;
231 220
232 /* 221 /*
233 * Never return 0 for an eligible task that may be killed since it's 222 * Never return 0 for an eligible task regardless of the root bonus and
234 * possible that no single user task uses more than 0.1% of memory and 223 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
235 * no single admin tasks uses more than 3.0%.
236 */ 224 */
237 if (points <= 0) 225 return points > 0 ? points : 1;
238 return 1;
239 return (points < 1000) ? points : 1000;
240} 226}
241 227
242/* 228/*
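
oom_badness() now returns a raw page count instead of a 0..1000 scale: rss plus page tables plus swap entries, with oom_score_adj normalized into page units via totalpages / 1000 and a floor of 1 for any eligible task. The arithmetic is easy to check in isolation; the sketch below uses plain parameters and made-up numbers rather than task_struct.

#include <stdio.h>

static long badness(unsigned long rss, unsigned long nr_ptes,
                    unsigned long swapents, long oom_score_adj,
                    int has_cap_sys_admin, unsigned long totalpages)
{
        long points = rss + nr_ptes + swapents;
        long adj = oom_score_adj;

        if (has_cap_sys_admin)
                adj -= 30;                      /* root bonus */

        adj *= totalpages / 1000;               /* normalize adj to page units */
        points += adj;

        return points > 0 ? points : 1;         /* never 0 for an eligible task */
}

int main(void)
{
        unsigned long totalpages = 1UL << 20;   /* 4 GiB at 4 KiB pages */

        printf("plain task: %ld\n",
               badness(50000, 200, 1000, 0, 0, totalpages));
        printf("same task, oom_score_adj=-500: %ld\n",
               badness(50000, 200, 1000, -500, 0, totalpages));
        return 0;
}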
@@ -302,99 +288,116 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
302} 288}
303#endif 289#endif
304 290
291enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
292 unsigned long totalpages, const nodemask_t *nodemask,
293 bool force_kill)
294{
295 if (task->exit_state)
296 return OOM_SCAN_CONTINUE;
297 if (oom_unkillable_task(task, NULL, nodemask))
298 return OOM_SCAN_CONTINUE;
299
300 /*
301 * This task already has access to memory reserves and is being killed.
302 * Don't allow any other task to have access to the reserves.
303 */
304 if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
305 if (unlikely(frozen(task)))
306 __thaw_task(task);
307 if (!force_kill)
308 return OOM_SCAN_ABORT;
309 }
310 if (!task->mm)
311 return OOM_SCAN_CONTINUE;
312
313 if (task->flags & PF_EXITING) {
314 /*
315 * If task is current and is in the process of releasing memory,
316 * allow the "kill" to set TIF_MEMDIE, which will allow it to
317 * access memory reserves. Otherwise, it may stall forever.
318 *
319 * The iteration isn't broken here, however, in case other
320 * threads are found to have already been oom killed.
321 */
322 if (task == current)
323 return OOM_SCAN_SELECT;
324 else if (!force_kill) {
325 /*
326 * If this task is not being ptraced on exit, then wait
327 * for it to finish before killing some other task
328 * unnecessarily.
329 */
330 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
331 return OOM_SCAN_ABORT;
332 }
333 }
334 return OOM_SCAN_OK;
335}
336
305/* 337/*
306 * Simple selection loop. We choose the process with the highest 338 * Simple selection loop. We choose the process with the highest
307 * number of 'points'. We expect the caller will lock the tasklist. 339 * number of 'points'.
308 * 340 *
309 * (not docbooked, we don't want this one cluttering up the manual) 341 * (not docbooked, we don't want this one cluttering up the manual)
310 */ 342 */
311static struct task_struct *select_bad_process(unsigned int *ppoints, 343static struct task_struct *select_bad_process(unsigned int *ppoints,
312 unsigned long totalpages, struct mem_cgroup *memcg, 344 unsigned long totalpages, const nodemask_t *nodemask,
313 const nodemask_t *nodemask, bool force_kill) 345 bool force_kill)
314{ 346{
315 struct task_struct *g, *p; 347 struct task_struct *g, *p;
316 struct task_struct *chosen = NULL; 348 struct task_struct *chosen = NULL;
317 *ppoints = 0; 349 unsigned long chosen_points = 0;
318 350
351 rcu_read_lock();
319 do_each_thread(g, p) { 352 do_each_thread(g, p) {
320 unsigned int points; 353 unsigned int points;
321 354
322 if (p->exit_state) 355 switch (oom_scan_process_thread(p, totalpages, nodemask,
323 continue; 356 force_kill)) {
324 if (oom_unkillable_task(p, memcg, nodemask)) 357 case OOM_SCAN_SELECT:
325 continue; 358 chosen = p;
326 359 chosen_points = ULONG_MAX;
327 /* 360 /* fall through */
328 * This task already has access to memory reserves and is 361 case OOM_SCAN_CONTINUE:
329 * being killed. Don't allow any other task access to the
330 * memory reserve.
331 *
332 * Note: this may have a chance of deadlock if it gets
333 * blocked waiting for another task which itself is waiting
334 * for memory. Is there a better alternative?
335 */
336 if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
337 if (unlikely(frozen(p)))
338 __thaw_task(p);
339 if (!force_kill)
340 return ERR_PTR(-1UL);
341 }
342 if (!p->mm)
343 continue; 362 continue;
344 363 case OOM_SCAN_ABORT:
345 if (p->flags & PF_EXITING) { 364 rcu_read_unlock();
346 /* 365 return ERR_PTR(-1UL);
347 * If p is the current task and is in the process of 366 case OOM_SCAN_OK:
348 * releasing memory, we allow the "kill" to set 367 break;
349 * TIF_MEMDIE, which will allow it to gain access to 368 };
350 * memory reserves. Otherwise, it may stall forever. 369 points = oom_badness(p, NULL, nodemask, totalpages);
351 * 370 if (points > chosen_points) {
352 * The loop isn't broken here, however, in case other
353 * threads are found to have already been oom killed.
354 */
355 if (p == current) {
356 chosen = p;
357 *ppoints = 1000;
358 } else if (!force_kill) {
359 /*
360 * If this task is not being ptraced on exit,
361 * then wait for it to finish before killing
362 * some other task unnecessarily.
363 */
364 if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
365 return ERR_PTR(-1UL);
366 }
367 }
368
369 points = oom_badness(p, memcg, nodemask, totalpages);
370 if (points > *ppoints) {
371 chosen = p; 371 chosen = p;
372 *ppoints = points; 372 chosen_points = points;
373 } 373 }
374 } while_each_thread(g, p); 374 } while_each_thread(g, p);
375 if (chosen)
376 get_task_struct(chosen);
377 rcu_read_unlock();
375 378
379 *ppoints = chosen_points * 1000 / totalpages;
376 return chosen; 380 return chosen;
377} 381}
378 382
379/** 383/**
380 * dump_tasks - dump current memory state of all system tasks 384 * dump_tasks - dump current memory state of all system tasks
381 * @mem: current's memory controller, if constrained 385 * @memcg: current's memory controller, if constrained
382 * @nodemask: nodemask passed to page allocator for mempolicy ooms 386 * @nodemask: nodemask passed to page allocator for mempolicy ooms
383 * 387 *
384 * Dumps the current memory state of all eligible tasks. Tasks not in the same 388 * Dumps the current memory state of all eligible tasks. Tasks not in the same
385 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes 389 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
386 * are not shown. 390 * are not shown.
387 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj 391 * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
388 * value, oom_score_adj value, and name. 392 * swapents, oom_score_adj value, and name.
389 *
390 * Call with tasklist_lock read-locked.
391 */ 393 */
392static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) 394static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
393{ 395{
394 struct task_struct *p; 396 struct task_struct *p;
395 struct task_struct *task; 397 struct task_struct *task;
396 398
397 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); 399 pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n");
400 rcu_read_lock();
398 for_each_process(p) { 401 for_each_process(p) {
399 if (oom_unkillable_task(p, memcg, nodemask)) 402 if (oom_unkillable_task(p, memcg, nodemask))
400 continue; 403 continue;
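
select_bad_process() is split above into a per-task policy check, oom_scan_process_thread(), and a loop that keeps the highest oom_badness() score, with OOM_SCAN_SELECT short-circuiting via a ULONG_MAX score. The sketch below reproduces that control flow in user space over made-up task records; the enum and helper names are illustrative, not kernel API.

#include <limits.h>
#include <stdio.h>

enum scan_t { SCAN_OK, SCAN_CONTINUE, SCAN_ABORT, SCAN_SELECT };

struct task { const char *name; int exiting; int unkillable; unsigned long points; };

static enum scan_t scan_one(const struct task *t)
{
        if (t->unkillable)
                return SCAN_CONTINUE;
        if (t->exiting)
                return SCAN_SELECT;     /* prefer a task already on its way out */
        return SCAN_OK;
}

static const struct task *select_bad_process(const struct task *tasks, int n)
{
        const struct task *chosen = NULL;
        unsigned long chosen_points = 0;

        for (int i = 0; i < n; i++) {
                switch (scan_one(&tasks[i])) {
                case SCAN_SELECT:
                        chosen = &tasks[i];
                        chosen_points = ULONG_MAX;
                        /* fall through */
                case SCAN_CONTINUE:
                        continue;
                case SCAN_ABORT:
                        return NULL;
                case SCAN_OK:
                        break;
                }
                if (tasks[i].points > chosen_points) {
                        chosen = &tasks[i];
                        chosen_points = tasks[i].points;
                }
        }
        return chosen;
}

int main(void)
{
        struct task tasks[] = {
                { "init",   0, 1,     10 },
                { "hog",    0, 0, 900000 },
                { "editor", 0, 0,  20000 },
        };
        const struct task *victim = select_bad_process(tasks, 3);

        printf("victim: %s\n", victim ? victim->name : "(none)");
        return 0;
}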
@@ -409,13 +412,15 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
409 continue; 412 continue;
410 } 413 }
411 414
412 pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", 415 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
413 task->pid, task_uid(task), task->tgid, 416 task->pid, from_kuid(&init_user_ns, task_uid(task)),
414 task->mm->total_vm, get_mm_rss(task->mm), 417 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
415 task_cpu(task), task->signal->oom_adj, 418 task->mm->nr_ptes,
419 get_mm_counter(task->mm, MM_SWAPENTS),
416 task->signal->oom_score_adj, task->comm); 420 task->signal->oom_score_adj, task->comm);
417 task_unlock(task); 421 task_unlock(task);
418 } 422 }
423 rcu_read_unlock();
419} 424}
420 425
421static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 426static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
@@ -436,10 +441,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
436} 441}
437 442
438#define K(x) ((x) << (PAGE_SHIFT-10)) 443#define K(x) ((x) << (PAGE_SHIFT-10))
439static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 444/*
440 unsigned int points, unsigned long totalpages, 445 * Must be called while holding a reference to p, which will be released upon
441 struct mem_cgroup *memcg, nodemask_t *nodemask, 446 * returning.
442 const char *message) 447 */
448void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
449 unsigned int points, unsigned long totalpages,
450 struct mem_cgroup *memcg, nodemask_t *nodemask,
451 const char *message)
443{ 452{
444 struct task_struct *victim = p; 453 struct task_struct *victim = p;
445 struct task_struct *child; 454 struct task_struct *child;
@@ -455,6 +464,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
455 */ 464 */
456 if (p->flags & PF_EXITING) { 465 if (p->flags & PF_EXITING) {
457 set_tsk_thread_flag(p, TIF_MEMDIE); 466 set_tsk_thread_flag(p, TIF_MEMDIE);
467 put_task_struct(p);
458 return; 468 return;
459 } 469 }
460 470
@@ -472,6 +482,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
472 * parent. This attempts to lose the minimal amount of work done while 482 * parent. This attempts to lose the minimal amount of work done while
473 * still freeing memory. 483 * still freeing memory.
474 */ 484 */
485 read_lock(&tasklist_lock);
475 do { 486 do {
476 list_for_each_entry(child, &t->children, sibling) { 487 list_for_each_entry(child, &t->children, sibling) {
477 unsigned int child_points; 488 unsigned int child_points;
@@ -484,15 +495,26 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
484 child_points = oom_badness(child, memcg, nodemask, 495 child_points = oom_badness(child, memcg, nodemask,
485 totalpages); 496 totalpages);
486 if (child_points > victim_points) { 497 if (child_points > victim_points) {
498 put_task_struct(victim);
487 victim = child; 499 victim = child;
488 victim_points = child_points; 500 victim_points = child_points;
501 get_task_struct(victim);
489 } 502 }
490 } 503 }
491 } while_each_thread(p, t); 504 } while_each_thread(p, t);
505 read_unlock(&tasklist_lock);
492 506
493 victim = find_lock_task_mm(victim); 507 rcu_read_lock();
494 if (!victim) 508 p = find_lock_task_mm(victim);
509 if (!p) {
510 rcu_read_unlock();
511 put_task_struct(victim);
495 return; 512 return;
513 } else if (victim != p) {
514 get_task_struct(p);
515 put_task_struct(victim);
516 victim = p;
517 }
496 518
497 /* mm cannot safely be dereferenced after task_unlock(victim) */ 519 /* mm cannot safely be dereferenced after task_unlock(victim) */
498 mm = victim->mm; 520 mm = victim->mm;
@@ -523,17 +545,19 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
523 task_unlock(p); 545 task_unlock(p);
524 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); 546 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
525 } 547 }
548 rcu_read_unlock();
526 549
527 set_tsk_thread_flag(victim, TIF_MEMDIE); 550 set_tsk_thread_flag(victim, TIF_MEMDIE);
528 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); 551 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
552 put_task_struct(victim);
529} 553}
530#undef K 554#undef K
531 555
532/* 556/*
533 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 557 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
534 */ 558 */
535static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, 559void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
536 int order, const nodemask_t *nodemask) 560 int order, const nodemask_t *nodemask)
537{ 561{
538 if (likely(!sysctl_panic_on_oom)) 562 if (likely(!sysctl_panic_on_oom))
539 return; 563 return;
@@ -546,42 +570,11 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
546 if (constraint != CONSTRAINT_NONE) 570 if (constraint != CONSTRAINT_NONE)
547 return; 571 return;
548 } 572 }
549 read_lock(&tasklist_lock);
550 dump_header(NULL, gfp_mask, order, NULL, nodemask); 573 dump_header(NULL, gfp_mask, order, NULL, nodemask);
551 read_unlock(&tasklist_lock);
552 panic("Out of memory: %s panic_on_oom is enabled\n", 574 panic("Out of memory: %s panic_on_oom is enabled\n",
553 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); 575 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
554} 576}
555 577
556#ifdef CONFIG_CGROUP_MEM_RES_CTLR
557void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
558 int order)
559{
560 unsigned long limit;
561 unsigned int points = 0;
562 struct task_struct *p;
563
564 /*
565 * If current has a pending SIGKILL, then automatically select it. The
566 * goal is to allow it to allocate so that it may quickly exit and free
567 * its memory.
568 */
569 if (fatal_signal_pending(current)) {
570 set_thread_flag(TIF_MEMDIE);
571 return;
572 }
573
574 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
575 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT;
576 read_lock(&tasklist_lock);
577 p = select_bad_process(&points, limit, memcg, NULL, false);
578 if (p && PTR_ERR(p) != -1UL)
579 oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
580 "Memory cgroup out of memory");
581 read_unlock(&tasklist_lock);
582}
583#endif
584
585static BLOCKING_NOTIFIER_HEAD(oom_notify_list); 578static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
586 579
587int register_oom_notifier(struct notifier_block *nb) 580int register_oom_notifier(struct notifier_block *nb)
@@ -703,7 +696,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
703 struct task_struct *p; 696 struct task_struct *p;
704 unsigned long totalpages; 697 unsigned long totalpages;
705 unsigned long freed = 0; 698 unsigned long freed = 0;
706 unsigned int points; 699 unsigned int uninitialized_var(points);
707 enum oom_constraint constraint = CONSTRAINT_NONE; 700 enum oom_constraint constraint = CONSTRAINT_NONE;
708 int killed = 0; 701 int killed = 0;
709 702
@@ -731,22 +724,20 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
731 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; 724 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
732 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); 725 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
733 726
734 read_lock(&tasklist_lock); 727 if (sysctl_oom_kill_allocating_task && current->mm &&
735 if (sysctl_oom_kill_allocating_task &&
736 !oom_unkillable_task(current, NULL, nodemask) && 728 !oom_unkillable_task(current, NULL, nodemask) &&
737 current->mm) { 729 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
730 get_task_struct(current);
738 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, 731 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
739 nodemask, 732 nodemask,
740 "Out of memory (oom_kill_allocating_task)"); 733 "Out of memory (oom_kill_allocating_task)");
741 goto out; 734 goto out;
742 } 735 }
743 736
744 p = select_bad_process(&points, totalpages, NULL, mpol_mask, 737 p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
745 force_kill);
746 /* Found nothing?!?! Either we hang forever, or we panic. */ 738 /* Found nothing?!?! Either we hang forever, or we panic. */
747 if (!p) { 739 if (!p) {
748 dump_header(NULL, gfp_mask, order, NULL, mpol_mask); 740 dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
749 read_unlock(&tasklist_lock);
750 panic("Out of memory and no killable processes...\n"); 741 panic("Out of memory and no killable processes...\n");
751 } 742 }
752 if (PTR_ERR(p) != -1UL) { 743 if (PTR_ERR(p) != -1UL) {
@@ -755,14 +746,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
755 killed = 1; 746 killed = 1;
756 } 747 }
757out: 748out:
758 read_unlock(&tasklist_lock);
759
760 /* 749 /*
761 * Give "p" a good chance of killing itself before we 750 * Give the killed threads a good chance of exiting before trying to
762 * retry to allocate memory unless "p" is current 751 * allocate memory again.
763 */ 752 */
764 if (killed && !test_thread_flag(TIF_MEMDIE)) 753 if (killed)
765 schedule_timeout_uninterruptible(1); 754 schedule_timeout_killable(1);
766} 755}
767 756
768/* 757/*
@@ -777,6 +766,5 @@ void pagefault_out_of_memory(void)
777 out_of_memory(NULL, 0, 0, NULL, false); 766 out_of_memory(NULL, 0, 0, NULL, false);
778 clear_system_oom(); 767 clear_system_oom();
779 } 768 }
780 if (!test_thread_flag(TIF_MEMDIE)) 769 schedule_timeout_killable(1);
781 schedule_timeout_uninterruptible(1);
782} 770}
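
The oom_kill.c hunks above replace tasklist_lock-protected victim selection with plain task reference counting: the chosen task is pinned with get_task_struct() and the pin moves whenever a better candidate is found, so the lock can be dropped before the victim is signalled. Below is a standalone C sketch of that reference-swapping pattern (userspace C, not kernel code; the struct, names and badness values are made up for illustration). The invariant it demonstrates is that exactly one reference is held on whatever task the scan settles on, which is why oom_kill_process() can safely put_task_struct() the victim at the end.

/*
 * Standalone sketch of the reference-swapping pattern: the caller hands in a
 * pinned candidate, and whenever a better candidate turns up the old pin is
 * dropped and a new one taken, so exactly one reference is held at the end.
 */
#include <stdio.h>

struct task { const char *name; int refcount; int badness; };

static void get_task(struct task *t) { t->refcount++; }
static void put_task(struct task *t) { t->refcount--; }

/* 'victim' arrives with one reference already held by the caller. */
static struct task *pick_worse(struct task *victim, struct task *candidates, int n)
{
	int best = victim->badness;

	for (int i = 0; i < n; i++) {
		if (candidates[i].badness > best) {
			put_task(victim);	/* drop the pin on the old choice */
			victim = &candidates[i];
			best = victim->badness;
			get_task(victim);	/* pin the new choice */
		}
	}
	return victim;			/* still exactly one reference held */
}

int main(void)
{
	struct task others[3] = { {"a", 0, 10}, {"b", 0, 40}, {"c", 0, 25} };
	struct task first = {"first", 1, 5};	/* caller-held reference */
	struct task *v = pick_worse(&first, others, 3);

	printf("victim=%s refcount=%d\n", v->name, v->refcount);
	return 0;
}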
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 26adea8ca2e7..5ad5ce23c1e0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */ 35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <linux/timer.h>
37#include <trace/events/writeback.h> 38#include <trace/events/writeback.h>
38 39
39/* 40/*
@@ -135,7 +136,20 @@ unsigned long global_dirty_limit;
135 * measured in page writeback completions. 136 * measured in page writeback completions.
136 * 137 *
137 */ 138 */
138static struct prop_descriptor vm_completions; 139static struct fprop_global writeout_completions;
140
141static void writeout_period(unsigned long t);
142/* Timer for aging of writeout_completions */
143static struct timer_list writeout_period_timer =
144 TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
145static unsigned long writeout_period_time = 0;
146
147/*
148 * Length of period for aging writeout fractions of bdis. This is an
149 * arbitrarily chosen number. The longer the period, the slower fractions will
150 * reflect changes in current writeout rate.
151 */
152#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
139 153
140/* 154/*
141 * Work out the current dirty-memory clamping and background writeout 155 * Work out the current dirty-memory clamping and background writeout
@@ -204,7 +218,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
204 * Returns the global number of pages potentially available for dirty 218 * Returns the global number of pages potentially available for dirty
205 * page cache. This is the base value for the global dirty limits. 219 * page cache. This is the base value for the global dirty limits.
206 */ 220 */
207unsigned long global_dirtyable_memory(void) 221static unsigned long global_dirtyable_memory(void)
208{ 222{
209 unsigned long x; 223 unsigned long x;
210 224
@@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone)
322 zone_page_state(zone, NR_WRITEBACK) <= limit; 336 zone_page_state(zone, NR_WRITEBACK) <= limit;
323} 337}
324 338
325/*
326 * couple the period to the dirty_ratio:
327 *
328 * period/2 ~ roundup_pow_of_two(dirty limit)
329 */
330static int calc_period_shift(void)
331{
332 unsigned long dirty_total;
333
334 if (vm_dirty_bytes)
335 dirty_total = vm_dirty_bytes / PAGE_SIZE;
336 else
337 dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
338 100;
339 return 2 + ilog2(dirty_total - 1);
340}
341
342/*
343 * update the period when the dirty threshold changes.
344 */
345static void update_completion_period(void)
346{
347 int shift = calc_period_shift();
348 prop_change_shift(&vm_completions, shift);
349
350 writeback_set_ratelimit();
351}
352
353int dirty_background_ratio_handler(struct ctl_table *table, int write, 339int dirty_background_ratio_handler(struct ctl_table *table, int write,
354 void __user *buffer, size_t *lenp, 340 void __user *buffer, size_t *lenp,
355 loff_t *ppos) 341 loff_t *ppos)
@@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
383 369
384 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 370 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
385 if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 371 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
386 update_completion_period(); 372 writeback_set_ratelimit();
387 vm_dirty_bytes = 0; 373 vm_dirty_bytes = 0;
388 } 374 }
389 return ret; 375 return ret;
@@ -398,12 +384,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
398 384
399 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 385 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
400 if (ret == 0 && write && vm_dirty_bytes != old_bytes) { 386 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
401 update_completion_period(); 387 writeback_set_ratelimit();
402 vm_dirty_ratio = 0; 388 vm_dirty_ratio = 0;
403 } 389 }
404 return ret; 390 return ret;
405} 391}
406 392
393static unsigned long wp_next_time(unsigned long cur_time)
394{
395 cur_time += VM_COMPLETIONS_PERIOD_LEN;
396 /* 0 has a special meaning... */
397 if (!cur_time)
398 return 1;
399 return cur_time;
400}
401
407/* 402/*
408 * Increment the BDI's writeout completion count and the global writeout 403 * Increment the BDI's writeout completion count and the global writeout
409 * completion count. Called from test_clear_page_writeback(). 404 * completion count. Called from test_clear_page_writeback().
@@ -411,8 +406,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
411static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) 406static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
412{ 407{
413 __inc_bdi_stat(bdi, BDI_WRITTEN); 408 __inc_bdi_stat(bdi, BDI_WRITTEN);
414 __prop_inc_percpu_max(&vm_completions, &bdi->completions, 409 __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
415 bdi->max_prop_frac); 410 bdi->max_prop_frac);
411 /* First event after period switching was turned off? */
412 if (!unlikely(writeout_period_time)) {
413 /*
414 * We can race with other __bdi_writeout_inc calls here but
415 * it does not cause any harm since the resulting time when
416 * timer will fire and what is in writeout_period_time will be
417 * roughly the same.
418 */
419 writeout_period_time = wp_next_time(jiffies);
420 mod_timer(&writeout_period_timer, writeout_period_time);
421 }
416} 422}
417 423
418void bdi_writeout_inc(struct backing_dev_info *bdi) 424void bdi_writeout_inc(struct backing_dev_info *bdi)
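
The __bdi_writeout_inc() hunk above arms the period timer lazily: it is only started on the first writeout completion after it was shut down, and writeout_period_time == 0 is reserved to mean "timer not running". The standalone sketch below models the wp_next_time() helper and its wraparound guard; HZ is assumed to be 250 purely for the sketch.

#include <stdio.h>

#define SKETCH_HZ 250				/* assumed only for this sketch */
#define VM_COMPLETIONS_PERIOD_LEN (3 * SKETCH_HZ)

/* 0 means "period timer not running", so never return 0, even on wraparound. */
static unsigned long wp_next_time(unsigned long cur_time)
{
	cur_time += VM_COMPLETIONS_PERIOD_LEN;
	if (!cur_time)
		return 1;
	return cur_time;
}

int main(void)
{
	unsigned long jiffies = 0UL - VM_COMPLETIONS_PERIOD_LEN;	/* about to wrap */

	printf("wrap case:   %lu\n", wp_next_time(jiffies));	/* prints 1, not 0 */
	printf("normal case: %lu\n", wp_next_time(1000));	/* prints 1750 */
	return 0;
}

The guard only matters once per jiffies wraparound, but without it a wrapped expiry of 0 would be mistaken for the timer being off.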
@@ -431,11 +437,33 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc);
431static void bdi_writeout_fraction(struct backing_dev_info *bdi, 437static void bdi_writeout_fraction(struct backing_dev_info *bdi,
432 long *numerator, long *denominator) 438 long *numerator, long *denominator)
433{ 439{
434 prop_fraction_percpu(&vm_completions, &bdi->completions, 440 fprop_fraction_percpu(&writeout_completions, &bdi->completions,
435 numerator, denominator); 441 numerator, denominator);
436} 442}
437 443
438/* 444/*
445 * On idle system, we can be called long after we scheduled because we use
446 * deferred timers so count with missed periods.
447 */
448static void writeout_period(unsigned long t)
449{
450 int miss_periods = (jiffies - writeout_period_time) /
451 VM_COMPLETIONS_PERIOD_LEN;
452
453 if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
454 writeout_period_time = wp_next_time(writeout_period_time +
455 miss_periods * VM_COMPLETIONS_PERIOD_LEN);
456 mod_timer(&writeout_period_timer, writeout_period_time);
457 } else {
458 /*
459 * Aging has zeroed all fractions. Stop wasting CPU on period
460 * updates.
461 */
462 writeout_period_time = 0;
463 }
464}
465
466/*
439 * bdi_min_ratio keeps the sum of the minimum dirty shares of all 467 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
440 * registered backing devices, which, for obvious reasons, can not 468 * registered backing devices, which, for obvious reasons, can not
441 * exceed 100%. 469 * exceed 100%.
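
writeout_period_timer is a deferred timer, so on an idle system it can fire long after its nominal expiry. The plain-C sketch below (with a made-up period length standing in for 3*HZ) shows the arithmetic writeout_period() uses to decide how many aging steps fprop_new_period() should be asked to apply in that case.

#include <stdio.h>

#define PERIOD_LEN 750UL	/* stands in for VM_COMPLETIONS_PERIOD_LEN (3*HZ) */

/* How many aging steps to apply when the deferred timer fires late. */
static unsigned long periods_to_age(unsigned long now, unsigned long scheduled)
{
	unsigned long miss_periods = (now - scheduled) / PERIOD_LEN;

	return miss_periods + 1;	/* the period that just ended, plus any missed ones */
}

int main(void)
{
	printf("fired on time:          age %lu period(s)\n",
	       periods_to_age(1750, 1750));
	printf("fired 2.5 periods late: age %lu period(s)\n",
	       periods_to_age(1750 + 2 * PERIOD_LEN + 300, 1750));
	return 0;
}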
@@ -475,7 +503,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
475 ret = -EINVAL; 503 ret = -EINVAL;
476 } else { 504 } else {
477 bdi->max_ratio = max_ratio; 505 bdi->max_ratio = max_ratio;
478 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; 506 bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
479 } 507 }
480 spin_unlock_bh(&bdi_lock); 508 spin_unlock_bh(&bdi_lock);
481 509
@@ -918,7 +946,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
918 * bdi->dirty_ratelimit = balanced_dirty_ratelimit; 946 * bdi->dirty_ratelimit = balanced_dirty_ratelimit;
919 * 947 *
920 * However to get a more stable dirty_ratelimit, the below elaborated 948 * However to get a more stable dirty_ratelimit, the below elaborated
921 * code makes use of task_ratelimit to filter out sigular points and 949 * code makes use of task_ratelimit to filter out singular points and
922 * limit the step size. 950 * limit the step size.
923 * 951 *
924 * The below code essentially only uses the relative value of 952 * The below code essentially only uses the relative value of
@@ -941,7 +969,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
941 * feel and care are stable dirty rate and small position error. 969 * feel and care are stable dirty rate and small position error.
942 * 970 *
943 * |task_ratelimit - dirty_ratelimit| is used to limit the step size 971 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
944 * and filter out the sigular points of balanced_dirty_ratelimit. Which 972 * and filter out the singular points of balanced_dirty_ratelimit. Which
945 * keeps jumping around randomly and can even leap far away at times 973 * keeps jumping around randomly and can even leap far away at times
946 * due to the small 200ms estimation period of dirty_rate (we want to 974 * due to the small 200ms estimation period of dirty_rate (we want to
947 * keep that period small to reduce time lags). 975 * keep that period small to reduce time lags).
@@ -1504,7 +1532,6 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
1504 void __user *buffer, size_t *length, loff_t *ppos) 1532 void __user *buffer, size_t *length, loff_t *ppos)
1505{ 1533{
1506 proc_dointvec(table, write, buffer, length, ppos); 1534 proc_dointvec(table, write, buffer, length, ppos);
1507 bdi_arm_supers_timer();
1508 return 0; 1535 return 0;
1509} 1536}
1510 1537
@@ -1568,6 +1595,7 @@ void writeback_set_ratelimit(void)
1568 unsigned long background_thresh; 1595 unsigned long background_thresh;
1569 unsigned long dirty_thresh; 1596 unsigned long dirty_thresh;
1570 global_dirty_limits(&background_thresh, &dirty_thresh); 1597 global_dirty_limits(&background_thresh, &dirty_thresh);
1598 global_dirty_limit = dirty_thresh;
1571 ratelimit_pages = dirty_thresh / (num_online_cpus() * 32); 1599 ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
1572 if (ratelimit_pages < 16) 1600 if (ratelimit_pages < 16)
1573 ratelimit_pages = 16; 1601 ratelimit_pages = 16;
@@ -1605,13 +1633,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
1605 */ 1633 */
1606void __init page_writeback_init(void) 1634void __init page_writeback_init(void)
1607{ 1635{
1608 int shift;
1609
1610 writeback_set_ratelimit(); 1636 writeback_set_ratelimit();
1611 register_cpu_notifier(&ratelimit_nb); 1637 register_cpu_notifier(&ratelimit_nb);
1612 1638
1613 shift = calc_period_shift(); 1639 fprop_global_init(&writeout_completions);
1614 prop_descriptor_init(&vm_completions, shift);
1615} 1640}
1616 1641
1617/** 1642/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a712fb9e04ce..c66fb875104a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,12 +51,12 @@
51#include <linux/page_cgroup.h> 51#include <linux/page_cgroup.h>
52#include <linux/debugobjects.h> 52#include <linux/debugobjects.h>
53#include <linux/kmemleak.h> 53#include <linux/kmemleak.h>
54#include <linux/memory.h>
55#include <linux/compaction.h> 54#include <linux/compaction.h>
56#include <trace/events/kmem.h> 55#include <trace/events/kmem.h>
57#include <linux/ftrace_event.h> 56#include <linux/ftrace_event.h>
58#include <linux/memcontrol.h> 57#include <linux/memcontrol.h>
59#include <linux/prefetch.h> 58#include <linux/prefetch.h>
59#include <linux/migrate.h>
60#include <linux/page-debug-flags.h> 60#include <linux/page-debug-flags.h>
61 61
62#include <asm/tlbflush.h> 62#include <asm/tlbflush.h>
@@ -218,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes);
218 218
219int page_group_by_mobility_disabled __read_mostly; 219int page_group_by_mobility_disabled __read_mostly;
220 220
221static void set_pageblock_migratetype(struct page *page, int migratetype) 221/*
222 * NOTE:
223 * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
224 * Instead, use {un}set_pageblock_isolate.
225 */
226void set_pageblock_migratetype(struct page *page, int migratetype)
222{ 227{
223 228
224 if (unlikely(page_group_by_mobility_disabled)) 229 if (unlikely(page_group_by_mobility_disabled))
@@ -513,10 +518,10 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
513 * free pages of length of (1 << order) and marked with _mapcount -2. Page's 518 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
514 * order is recorded in page_private(page) field. 519 * order is recorded in page_private(page) field.
515 * So when we are allocating or freeing one, we can derive the state of the 520 * So when we are allocating or freeing one, we can derive the state of the
516 * other. That is, if we allocate a small block, and both were 521 * other. That is, if we allocate a small block, and both were
517 * free, the remainder of the region must be split into blocks. 522 * free, the remainder of the region must be split into blocks.
518 * If a block is freed, and its buddy is also free, then this 523 * If a block is freed, and its buddy is also free, then this
519 * triggers coalescing into a block of larger size. 524 * triggers coalescing into a block of larger size.
520 * 525 *
521 * -- wli 526 * -- wli
522 */ 527 */
@@ -749,6 +754,24 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
749 __free_pages(page, order); 754 __free_pages(page, order);
750} 755}
751 756
757#ifdef CONFIG_CMA
758/* Free whole pageblock and set its migration type to MIGRATE_CMA. */

759void __init init_cma_reserved_pageblock(struct page *page)
760{
761 unsigned i = pageblock_nr_pages;
762 struct page *p = page;
763
764 do {
765 __ClearPageReserved(p);
766 set_page_count(p, 0);
767 } while (++p, --i);
768
769 set_page_refcounted(page);
770 set_pageblock_migratetype(page, MIGRATE_CMA);
771 __free_pages(page, pageblock_order);
772 totalram_pages += pageblock_nr_pages;
773}
774#endif
752 775
753/* 776/*
754 * The order of subdivision here is critical for the IO subsystem. 777 * The order of subdivision here is critical for the IO subsystem.
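
init_cma_reserved_pageblock() above turns a bootmem-reserved block into CMA pages: every page is unreserved and has its count cleared, then only the head page is given a reference before the whole high-order block is freed to the buddy allocator. The following standalone sketch models that loop with a toy page structure and a tiny, made-up block size.

#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 8	/* tiny made-up value; real blocks are much larger */

struct fake_page { int reserved; int count; };

/*
 * Model of the init_cma_reserved_pageblock() loop: every page in the block
 * is unreserved and its count cleared, then only the head page gets a
 * reference before the whole block is handed to the allocator.
 */
static void init_reserved_block(struct fake_page *block)
{
	unsigned int i = PAGEBLOCK_NR_PAGES;
	struct fake_page *p = block;

	do {
		p->reserved = 0;
		p->count = 0;
	} while (++p, --i);

	block[0].count = 1;	/* set_page_refcounted() on the head page */
}

int main(void)
{
	struct fake_page block[PAGEBLOCK_NR_PAGES];

	for (int i = 0; i < PAGEBLOCK_NR_PAGES; i++)
		block[i] = (struct fake_page){ .reserved = 1, .count = -1 };

	init_reserved_block(block);
	printf("head: reserved=%d count=%d  tail: reserved=%d count=%d\n",
	       block[0].reserved, block[0].count,
	       block[1].reserved, block[1].count);
	return 0;
}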
@@ -874,11 +897,17 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
874 * This array describes the order lists are fallen back to when 897 * This array describes the order lists are fallen back to when
875 * the free lists for the desirable migrate type are depleted 898 * the free lists for the desirable migrate type are depleted
876 */ 899 */
877static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { 900static int fallbacks[MIGRATE_TYPES][4] = {
878 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 901 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
879 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, 902 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
880 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, 903#ifdef CONFIG_CMA
881 [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ 904 [MIGRATE_MOVABLE] = { MIGRATE_CMA, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
905 [MIGRATE_CMA] = { MIGRATE_RESERVE }, /* Never used */
906#else
907 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
908#endif
909 [MIGRATE_RESERVE] = { MIGRATE_RESERVE }, /* Never used */
910 [MIGRATE_ISOLATE] = { MIGRATE_RESERVE }, /* Never used */
882}; 911};
883 912
884/* 913/*
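
The fallbacks[] change above switches from fixed-width rows to sentinel-terminated rows: the inner loop in __rmqueue_fallback() now runs until it meets MIGRATE_RESERVE instead of iterating MIGRATE_TYPES-1 slots, and only MIGRATE_MOVABLE is allowed to fall back into CMA. A compilable model of that walk (shortened enum names, values illustrative only):

#include <stdio.h>

enum { MT_UNMOVABLE, MT_RECLAIMABLE, MT_MOVABLE, MT_CMA, MT_RESERVE, MT_ISOLATE, MT_TYPES };

/* Sentinel-terminated fallback lists: MT_RESERVE ends each walk. */
static const int fallbacks[MT_TYPES][4] = {
	[MT_UNMOVABLE]   = { MT_RECLAIMABLE, MT_MOVABLE, MT_RESERVE },
	[MT_RECLAIMABLE] = { MT_UNMOVABLE, MT_MOVABLE, MT_RESERVE },
	[MT_MOVABLE]     = { MT_CMA, MT_RECLAIMABLE, MT_UNMOVABLE, MT_RESERVE },
	[MT_CMA]         = { MT_RESERVE },
	[MT_RESERVE]     = { MT_RESERVE },
	[MT_ISOLATE]     = { MT_RESERVE },
};

static void walk_fallbacks(int start)
{
	for (int i = 0;; i++) {
		int mt = fallbacks[start][i];

		if (mt == MT_RESERVE)	/* MIGRATE_RESERVE is handled separately */
			break;
		printf("  try migratetype %d\n", mt);
	}
}

int main(void)
{
	printf("MOVABLE falls back to:\n");
	walk_fallbacks(MT_MOVABLE);
	printf("UNMOVABLE falls back to:\n");
	walk_fallbacks(MT_UNMOVABLE);
	return 0;
}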
@@ -929,7 +958,7 @@ static int move_freepages(struct zone *zone,
929 return pages_moved; 958 return pages_moved;
930} 959}
931 960
932static int move_freepages_block(struct zone *zone, struct page *page, 961int move_freepages_block(struct zone *zone, struct page *page,
933 int migratetype) 962 int migratetype)
934{ 963{
935 unsigned long start_pfn, end_pfn; 964 unsigned long start_pfn, end_pfn;
@@ -973,12 +1002,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
973 /* Find the largest possible block of pages in the other list */ 1002 /* Find the largest possible block of pages in the other list */
974 for (current_order = MAX_ORDER-1; current_order >= order; 1003 for (current_order = MAX_ORDER-1; current_order >= order;
975 --current_order) { 1004 --current_order) {
976 for (i = 0; i < MIGRATE_TYPES - 1; i++) { 1005 for (i = 0;; i++) {
977 migratetype = fallbacks[start_migratetype][i]; 1006 migratetype = fallbacks[start_migratetype][i];
978 1007
979 /* MIGRATE_RESERVE handled later if necessary */ 1008 /* MIGRATE_RESERVE handled later if necessary */
980 if (migratetype == MIGRATE_RESERVE) 1009 if (migratetype == MIGRATE_RESERVE)
981 continue; 1010 break;
982 1011
983 area = &(zone->free_area[current_order]); 1012 area = &(zone->free_area[current_order]);
984 if (list_empty(&area->free_list[migratetype])) 1013 if (list_empty(&area->free_list[migratetype]))
@@ -993,11 +1022,18 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
993 * pages to the preferred allocation list. If falling 1022 * pages to the preferred allocation list. If falling
994 * back for a reclaimable kernel allocation, be more 1023 * back for a reclaimable kernel allocation, be more
995 * aggressive about taking ownership of free pages 1024 * aggressive about taking ownership of free pages
1025 *
1026 * On the other hand, never change migration
1027 * type of MIGRATE_CMA pageblocks nor move CMA
1028 * pages on different free lists. We don't
1029 * want unmovable pages to be allocated from
1030 * MIGRATE_CMA areas.
996 */ 1031 */
997 if (unlikely(current_order >= (pageblock_order >> 1)) || 1032 if (!is_migrate_cma(migratetype) &&
998 start_migratetype == MIGRATE_RECLAIMABLE || 1033 (unlikely(current_order >= pageblock_order / 2) ||
999 page_group_by_mobility_disabled) { 1034 start_migratetype == MIGRATE_RECLAIMABLE ||
1000 unsigned long pages; 1035 page_group_by_mobility_disabled)) {
1036 int pages;
1001 pages = move_freepages_block(zone, page, 1037 pages = move_freepages_block(zone, page,
1002 start_migratetype); 1038 start_migratetype);
1003 1039
@@ -1015,11 +1051,14 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
1015 rmv_page_order(page); 1051 rmv_page_order(page);
1016 1052
1017 /* Take ownership for orders >= pageblock_order */ 1053 /* Take ownership for orders >= pageblock_order */
1018 if (current_order >= pageblock_order) 1054 if (current_order >= pageblock_order &&
1055 !is_migrate_cma(migratetype))
1019 change_pageblock_range(page, current_order, 1056 change_pageblock_range(page, current_order,
1020 start_migratetype); 1057 start_migratetype);
1021 1058
1022 expand(zone, page, order, current_order, area, migratetype); 1059 expand(zone, page, order, current_order, area,
1060 is_migrate_cma(migratetype)
1061 ? migratetype : start_migratetype);
1023 1062
1024 trace_mm_page_alloc_extfrag(page, order, current_order, 1063 trace_mm_page_alloc_extfrag(page, order, current_order,
1025 start_migratetype, migratetype); 1064 start_migratetype, migratetype);
@@ -1061,17 +1100,17 @@ retry_reserve:
1061 return page; 1100 return page;
1062} 1101}
1063 1102
1064/* 1103/*
1065 * Obtain a specified number of elements from the buddy allocator, all under 1104 * Obtain a specified number of elements from the buddy allocator, all under
1066 * a single hold of the lock, for efficiency. Add them to the supplied list. 1105 * a single hold of the lock, for efficiency. Add them to the supplied list.
1067 * Returns the number of new pages which were placed at *list. 1106 * Returns the number of new pages which were placed at *list.
1068 */ 1107 */
1069static int rmqueue_bulk(struct zone *zone, unsigned int order, 1108static int rmqueue_bulk(struct zone *zone, unsigned int order,
1070 unsigned long count, struct list_head *list, 1109 unsigned long count, struct list_head *list,
1071 int migratetype, int cold) 1110 int migratetype, int cold)
1072{ 1111{
1073 int i; 1112 int mt = migratetype, i;
1074 1113
1075 spin_lock(&zone->lock); 1114 spin_lock(&zone->lock);
1076 for (i = 0; i < count; ++i) { 1115 for (i = 0; i < count; ++i) {
1077 struct page *page = __rmqueue(zone, order, migratetype); 1116 struct page *page = __rmqueue(zone, order, migratetype);
@@ -1091,7 +1130,12 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
1091 list_add(&page->lru, list); 1130 list_add(&page->lru, list);
1092 else 1131 else
1093 list_add_tail(&page->lru, list); 1132 list_add_tail(&page->lru, list);
1094 set_page_private(page, migratetype); 1133 if (IS_ENABLED(CONFIG_CMA)) {
1134 mt = get_pageblock_migratetype(page);
1135 if (!is_migrate_cma(mt) && mt != MIGRATE_ISOLATE)
1136 mt = migratetype;
1137 }
1138 set_page_private(page, mt);
1095 list = &page->lru; 1139 list = &page->lru;
1096 } 1140 }
1097 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); 1141 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
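
The rmqueue_bulk() hunk above records a different migratetype in page_private() when CMA is enabled: pages that live in CMA or isolated pageblocks keep their real pageblock type, so a later free returns them to the correct free list. A standalone model of that decision (CMA_ENABLED stands in for IS_ENABLED(CONFIG_CMA)):

#include <stdbool.h>
#include <stdio.h>

enum { MT_UNMOVABLE, MT_RECLAIMABLE, MT_MOVABLE, MT_CMA, MT_RESERVE, MT_ISOLATE };

#define CMA_ENABLED true	/* stands in for IS_ENABLED(CONFIG_CMA) */

static bool is_migrate_cma(int mt) { return mt == MT_CMA; }

/*
 * What gets recorded in page_private() for a page pulled off the free lists:
 * CMA and ISOLATE pages keep their real pageblock type so a later free goes
 * back to the right list; everything else is tagged with the migratetype the
 * caller asked for.
 */
static int private_migratetype(int requested, int pageblock_mt)
{
	int mt = requested;

	if (CMA_ENABLED) {
		mt = pageblock_mt;
		if (!is_migrate_cma(mt) && mt != MT_ISOLATE)
			mt = requested;
	}
	return mt;
}

int main(void)
{
	printf("movable request, CMA pageblock:     %d (stays MT_CMA)\n",
	       private_migratetype(MT_MOVABLE, MT_CMA));
	printf("movable request, movable pageblock: %d\n",
	       private_migratetype(MT_MOVABLE, MT_MOVABLE));
	return 0;
}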
@@ -1118,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1118 to_drain = pcp->batch; 1162 to_drain = pcp->batch;
1119 else 1163 else
1120 to_drain = pcp->count; 1164 to_drain = pcp->count;
1121 free_pcppages_bulk(zone, to_drain, pcp); 1165 if (to_drain > 0) {
1122 pcp->count -= to_drain; 1166 free_pcppages_bulk(zone, to_drain, pcp);
1167 pcp->count -= to_drain;
1168 }
1123 local_irq_restore(flags); 1169 local_irq_restore(flags);
1124} 1170}
1125#endif 1171#endif
@@ -1371,8 +1417,12 @@ int split_free_page(struct page *page)
1371 1417
1372 if (order >= pageblock_order - 1) { 1418 if (order >= pageblock_order - 1) {
1373 struct page *endpage = page + (1 << order) - 1; 1419 struct page *endpage = page + (1 << order) - 1;
1374 for (; page < endpage; page += pageblock_nr_pages) 1420 for (; page < endpage; page += pageblock_nr_pages) {
1375 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 1421 int mt = get_pageblock_migratetype(page);
1422 if (mt != MIGRATE_ISOLATE && !is_migrate_cma(mt))
1423 set_pageblock_migratetype(page,
1424 MIGRATE_MOVABLE);
1425 }
1376 } 1426 }
1377 1427
1378 return 1 << order; 1428 return 1 << order;
@@ -1485,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str)
1485} 1535}
1486__setup("fail_page_alloc=", setup_fail_page_alloc); 1536__setup("fail_page_alloc=", setup_fail_page_alloc);
1487 1537
1488static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1538static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1489{ 1539{
1490 if (order < fail_page_alloc.min_order) 1540 if (order < fail_page_alloc.min_order)
1491 return 0; 1541 return false;
1492 if (gfp_mask & __GFP_NOFAIL) 1542 if (gfp_mask & __GFP_NOFAIL)
1493 return 0; 1543 return false;
1494 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1544 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1495 return 0; 1545 return false;
1496 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1546 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1497 return 0; 1547 return false;
1498 1548
1499 return should_fail(&fail_page_alloc.attr, 1 << order); 1549 return should_fail(&fail_page_alloc.attr, 1 << order);
1500} 1550}
@@ -1534,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs);
1534 1584
1535#else /* CONFIG_FAIL_PAGE_ALLOC */ 1585#else /* CONFIG_FAIL_PAGE_ALLOC */
1536 1586
1537static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1587static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1538{ 1588{
1539 return 0; 1589 return false;
1540} 1590}
1541 1591
1542#endif /* CONFIG_FAIL_PAGE_ALLOC */ 1592#endif /* CONFIG_FAIL_PAGE_ALLOC */
@@ -1550,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1550{ 1600{
1551 /* free_pages may go negative - that's OK */ 1601
1552 long min = mark; 1602 long min = mark;
1603 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1553 int o; 1604 int o;
1554 1605
1555 free_pages -= (1 << order) - 1; 1606 free_pages -= (1 << order) - 1;
@@ -1558,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1558 if (alloc_flags & ALLOC_HARDER) 1609 if (alloc_flags & ALLOC_HARDER)
1559 min -= min / 4; 1610 min -= min / 4;
1560 1611
1561 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1612 if (free_pages <= min + lowmem_reserve)
1562 return false; 1613 return false;
1563 for (o = 0; o < order; o++) { 1614 for (o = 0; o < order; o++) {
1564 /* At the next order, this order's pages become unavailable */ 1615 /* At the next order, this order's pages become unavailable */
@@ -1573,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1573 return true; 1624 return true;
1574} 1625}
1575 1626
1627#ifdef CONFIG_MEMORY_ISOLATION
1628static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1629{
1630 if (unlikely(zone->nr_pageblock_isolate))
1631 return zone->nr_pageblock_isolate * pageblock_nr_pages;
1632 return 0;
1633}
1634#else
1635static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1636{
1637 return 0;
1638}
1639#endif
1640
1576bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1641bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1577 int classzone_idx, int alloc_flags) 1642 int classzone_idx, int alloc_flags)
1578{ 1643{
@@ -1588,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1588 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1653 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1589 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1654 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1590 1655
1656 /*
1657 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
1658 * it. nr_zone_isolate_freepages is never accurate so kswapd might not
1659 * sleep although it could do so. But this is more desirable for memory
1660 * hotplug than sleeping which can cause a livelock in the direct
1661 * reclaim path.
1662 */
1663 free_pages -= nr_zone_isolate_freepages(z);
1591 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1664 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1592 free_pages); 1665 free_pages);
1593} 1666}
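
zone_watermark_ok_safe() now subtracts the free pages sitting in MIGRATE_ISOLATE pageblocks before running the watermark test, since those pages cannot satisfy allocations while isolation is in progress. The sketch below shows the effect with made-up numbers; the real check also folds in lowmem_reserve and per-order terms that are omitted here.

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL	/* example value only */

/*
 * Discount MIGRATE_ISOLATE pageblocks from the free count before the
 * watermark test, so a zone whose "free" pages are mostly isolated for
 * hotplug/CMA is not reported as healthy.
 */
static bool watermark_ok(unsigned long free_pages, unsigned long mark,
			 unsigned long nr_isolated_blocks)
{
	free_pages -= nr_isolated_blocks * PAGEBLOCK_NR_PAGES;
	return free_pages > mark;
}

int main(void)
{
	/* 4096 free pages look fine against a mark of 1024 ... */
	printf("no isolation:      %d\n", watermark_ok(4096, 1024, 0));
	/* ... but not once 7 pageblocks of them are isolated. */
	printf("7 blocks isolated: %d\n", watermark_ok(4096, 1024, 7));
	return 0;
}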
@@ -1855,6 +1928,17 @@ this_zone_full:
1855 zlc_active = 0; 1928 zlc_active = 0;
1856 goto zonelist_scan; 1929 goto zonelist_scan;
1857 } 1930 }
1931
1932 if (page)
1933 /*
1934 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
1935 * necessary to allocate the page. The expectation is
1936 * that the caller is taking steps that will free more
1937 * memory. The caller should avoid the page being used
1938 * for !PFMEMALLOC purposes.
1939 */
1940 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
1941
1858 return page; 1942 return page;
1859} 1943}
1860 1944
@@ -2018,7 +2102,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2018 struct zonelist *zonelist, enum zone_type high_zoneidx, 2102 struct zonelist *zonelist, enum zone_type high_zoneidx,
2019 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2103 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2020 int migratetype, bool sync_migration, 2104 int migratetype, bool sync_migration,
2021 bool *deferred_compaction, 2105 bool *contended_compaction, bool *deferred_compaction,
2022 unsigned long *did_some_progress) 2106 unsigned long *did_some_progress)
2023{ 2107{
2024 struct page *page; 2108 struct page *page;
@@ -2033,7 +2117,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2033 2117
2034 current->flags |= PF_MEMALLOC; 2118 current->flags |= PF_MEMALLOC;
2035 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2119 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2036 nodemask, sync_migration); 2120 nodemask, sync_migration,
2121 contended_compaction);
2037 current->flags &= ~PF_MEMALLOC; 2122 current->flags &= ~PF_MEMALLOC;
2038 if (*did_some_progress != COMPACT_SKIPPED) { 2123 if (*did_some_progress != COMPACT_SKIPPED) {
2039 2124
@@ -2043,8 +2128,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2043 2128
2044 page = get_page_from_freelist(gfp_mask, nodemask, 2129 page = get_page_from_freelist(gfp_mask, nodemask,
2045 order, zonelist, high_zoneidx, 2130 order, zonelist, high_zoneidx,
2046 alloc_flags, preferred_zone, 2131 alloc_flags & ~ALLOC_NO_WATERMARKS,
2047 migratetype); 2132 preferred_zone, migratetype);
2048 if (page) { 2133 if (page) {
2049 preferred_zone->compact_considered = 0; 2134 preferred_zone->compact_considered = 0;
2050 preferred_zone->compact_defer_shift = 0; 2135 preferred_zone->compact_defer_shift = 0;
@@ -2079,23 +2164,20 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2079 struct zonelist *zonelist, enum zone_type high_zoneidx, 2164 struct zonelist *zonelist, enum zone_type high_zoneidx,
2080 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2165 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2081 int migratetype, bool sync_migration, 2166 int migratetype, bool sync_migration,
2082 bool *deferred_compaction, 2167 bool *contended_compaction, bool *deferred_compaction,
2083 unsigned long *did_some_progress) 2168 unsigned long *did_some_progress)
2084{ 2169{
2085 return NULL; 2170 return NULL;
2086} 2171}
2087#endif /* CONFIG_COMPACTION */ 2172#endif /* CONFIG_COMPACTION */
2088 2173
2089/* The really slow allocator path where we enter direct reclaim */ 2174/* Perform direct synchronous page reclaim */
2090static inline struct page * 2175static int
2091__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, 2176__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
2092 struct zonelist *zonelist, enum zone_type high_zoneidx, 2177 nodemask_t *nodemask)
2093 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2094 int migratetype, unsigned long *did_some_progress)
2095{ 2178{
2096 struct page *page = NULL;
2097 struct reclaim_state reclaim_state; 2179 struct reclaim_state reclaim_state;
2098 bool drained = false; 2180 int progress;
2099 2181
2100 cond_resched(); 2182 cond_resched();
2101 2183
@@ -2106,7 +2188,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2106 reclaim_state.reclaimed_slab = 0; 2188 reclaim_state.reclaimed_slab = 0;
2107 current->reclaim_state = &reclaim_state; 2189 current->reclaim_state = &reclaim_state;
2108 2190
2109 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 2191 progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
2110 2192
2111 current->reclaim_state = NULL; 2193 current->reclaim_state = NULL;
2112 lockdep_clear_current_reclaim_state(); 2194 lockdep_clear_current_reclaim_state();
@@ -2114,6 +2196,21 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2114 2196
2115 cond_resched(); 2197 cond_resched();
2116 2198
2199 return progress;
2200}
2201
2202/* The really slow allocator path where we enter direct reclaim */
2203static inline struct page *
2204__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2205 struct zonelist *zonelist, enum zone_type high_zoneidx,
2206 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2207 int migratetype, unsigned long *did_some_progress)
2208{
2209 struct page *page = NULL;
2210 bool drained = false;
2211
2212 *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
2213 nodemask);
2117 if (unlikely(!(*did_some_progress))) 2214 if (unlikely(!(*did_some_progress)))
2118 return NULL; 2215 return NULL;
2119 2216
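
Direct reclaim is split above into __perform_reclaim(), which only runs reclaim and its bookkeeping, and __alloc_pages_direct_reclaim(), which retries the allocation and drains the per-cpu lists once if the first retry misses. The toy sketch below captures that retry-with-one-drain shape; the helpers are stand-ins, not the kernel functions.

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins: reclaim makes progress, but the first retry still misses
 * because the freed pages sit on per-cpu lists until they are drained. */
static int  perform_reclaim(void)   { return 1; }
static bool try_alloc(bool drained) { return drained; }
static void drain_all_pages(void)   { puts("  draining per-cpu pages"); }

/*
 * Shape of __alloc_pages_direct_reclaim() after the split: reclaim once via
 * the helper, retry the allocation, and if it still fails drain the per-cpu
 * lists and retry exactly once more.
 */
static bool direct_reclaim_alloc(void)
{
	bool drained = false;

	if (!perform_reclaim())
		return false;
retry:
	if (try_alloc(drained))
		return true;
	if (!drained) {
		drain_all_pages();
		drained = true;
		goto retry;
	}
	return false;
}

int main(void)
{
	printf("allocation %s\n", direct_reclaim_alloc() ? "succeeded" : "failed");
	return 0;
}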
@@ -2124,8 +2221,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2124retry: 2221retry:
2125 page = get_page_from_freelist(gfp_mask, nodemask, order, 2222 page = get_page_from_freelist(gfp_mask, nodemask, order,
2126 zonelist, high_zoneidx, 2223 zonelist, high_zoneidx,
2127 alloc_flags, preferred_zone, 2224 alloc_flags & ~ALLOC_NO_WATERMARKS,
2128 migratetype); 2225 preferred_zone, migratetype);
2129 2226
2130 /* 2227 /*
2131 * If an allocation failed after direct reclaim, it could be because 2228 * If an allocation failed after direct reclaim, it could be because
@@ -2209,15 +2306,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2209 alloc_flags |= ALLOC_HARDER; 2306 alloc_flags |= ALLOC_HARDER;
2210 2307
2211 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2308 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2212 if (!in_interrupt() && 2309 if (gfp_mask & __GFP_MEMALLOC)
2213 ((current->flags & PF_MEMALLOC) || 2310 alloc_flags |= ALLOC_NO_WATERMARKS;
2214 unlikely(test_thread_flag(TIF_MEMDIE)))) 2311 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2312 alloc_flags |= ALLOC_NO_WATERMARKS;
2313 else if (!in_interrupt() &&
2314 ((current->flags & PF_MEMALLOC) ||
2315 unlikely(test_thread_flag(TIF_MEMDIE))))
2215 alloc_flags |= ALLOC_NO_WATERMARKS; 2316 alloc_flags |= ALLOC_NO_WATERMARKS;
2216 } 2317 }
2217 2318
2218 return alloc_flags; 2319 return alloc_flags;
2219} 2320}
2220 2321
2322bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2323{
2324 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2325}
2326
2221static inline struct page * 2327static inline struct page *
2222__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2328__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2223 struct zonelist *zonelist, enum zone_type high_zoneidx, 2329 struct zonelist *zonelist, enum zone_type high_zoneidx,
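
gfp_to_alloc_flags() above grows two new ways to earn ALLOC_NO_WATERMARKS (__GFP_MEMALLOC, and PF_MEMALLOC while serving a softirq), and gfp_pfmemalloc_allowed() simply reports whether a gfp mask would get that flag. The sketch below condenses the decision into a standalone predicate; it deliberately omits the in_interrupt() and TIF_MEMDIE details of the real code, and the flag values are made up.

#include <stdbool.h>
#include <stdio.h>

#define GFP_NOMEMALLOC  0x1u	/* illustrative bit values, not the kernel's */
#define GFP_MEMALLOC    0x2u
#define PF_MEMALLOC     0x4u

/*
 * Condensed model of the ALLOC_NO_WATERMARKS decision after this change:
 * __GFP_NOMEMALLOC always refuses, __GFP_MEMALLOC always allows, and a
 * PF_MEMALLOC task (including one running a softirq) also qualifies.
 */
static bool no_watermarks_allowed(unsigned int gfp, unsigned int task_flags,
				  bool in_softirq)
{
	if (gfp & GFP_NOMEMALLOC)
		return false;
	if (gfp & GFP_MEMALLOC)
		return true;
	if (in_softirq && (task_flags & PF_MEMALLOC))
		return true;
	return (task_flags & PF_MEMALLOC) != 0;	/* process-context reclaimer */
}

int main(void)
{
	printf("__GFP_MEMALLOC:         %d\n", no_watermarks_allowed(GFP_MEMALLOC, 0, false));
	printf("softirq w/ PF_MEMALLOC: %d\n", no_watermarks_allowed(0, PF_MEMALLOC, true));
	printf("plain allocation:       %d\n", no_watermarks_allowed(0, 0, false));
	return 0;
}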
@@ -2231,6 +2337,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2231 unsigned long did_some_progress; 2337 unsigned long did_some_progress;
2232 bool sync_migration = false; 2338 bool sync_migration = false;
2233 bool deferred_compaction = false; 2339 bool deferred_compaction = false;
2340 bool contended_compaction = false;
2234 2341
2235 /* 2342 /*
2236 * In the slowpath, we sanity check order to avoid ever trying to 2343 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2284,11 +2391,19 @@ rebalance:
2284 2391
2285 /* Allocate without watermarks if the context allows */ 2392 /* Allocate without watermarks if the context allows */
2286 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2393 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2394 /*
2395 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
2396 * the allocation is high priority and these types of
2397 * allocations are system rather than user oriented
2398 */
2399 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2400
2287 page = __alloc_pages_high_priority(gfp_mask, order, 2401 page = __alloc_pages_high_priority(gfp_mask, order,
2288 zonelist, high_zoneidx, nodemask, 2402 zonelist, high_zoneidx, nodemask,
2289 preferred_zone, migratetype); 2403 preferred_zone, migratetype);
2290 if (page) 2404 if (page) {
2291 goto got_pg; 2405 goto got_pg;
2406 }
2292 } 2407 }
2293 2408
2294 /* Atomic allocations - we can't balance anything */ 2409 /* Atomic allocations - we can't balance anything */
@@ -2312,6 +2427,7 @@ rebalance:
2312 nodemask, 2427 nodemask,
2313 alloc_flags, preferred_zone, 2428 alloc_flags, preferred_zone,
2314 migratetype, sync_migration, 2429 migratetype, sync_migration,
2430 &contended_compaction,
2315 &deferred_compaction, 2431 &deferred_compaction,
2316 &did_some_progress); 2432 &did_some_progress);
2317 if (page) 2433 if (page)
@@ -2321,10 +2437,11 @@ rebalance:
2321 /* 2437 /*
2322 * If compaction is deferred for high-order allocations, it is because 2438 * If compaction is deferred for high-order allocations, it is because
2323 * sync compaction recently failed. If this is the case and the caller 2439
2324 * has requested the system not be heavily disrupted, fail the 2440 * requested a movable allocation that does not heavily disrupt the
2325 * allocation now instead of entering direct reclaim 2441 * system then fail the allocation instead of entering direct reclaim.
2326 */ 2442 */
2327 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) 2443 if ((deferred_compaction || contended_compaction) &&
2444 (gfp_mask & __GFP_NO_KSWAPD))
2328 goto nopage; 2445 goto nopage;
2329 2446
2330 /* Try direct reclaim and then allocating */ 2447 /* Try direct reclaim and then allocating */
@@ -2395,6 +2512,7 @@ rebalance:
2395 nodemask, 2512 nodemask,
2396 alloc_flags, preferred_zone, 2513 alloc_flags, preferred_zone,
2397 migratetype, sync_migration, 2514 migratetype, sync_migration,
2515 &contended_compaction,
2398 &deferred_compaction, 2516 &deferred_compaction,
2399 &did_some_progress); 2517 &did_some_progress);
2400 if (page) 2518 if (page)
@@ -2407,8 +2525,8 @@ nopage:
2407got_pg: 2525got_pg:
2408 if (kmemcheck_enabled) 2526 if (kmemcheck_enabled)
2409 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2527 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2410 return page;
2411 2528
2529 return page;
2412} 2530}
2413 2531
2414/* 2532/*
@@ -2974,7 +3092,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2974 user_zonelist_order = oldval; 3092 user_zonelist_order = oldval;
2975 } else if (oldval != user_zonelist_order) { 3093 } else if (oldval != user_zonelist_order) {
2976 mutex_lock(&zonelists_mutex); 3094 mutex_lock(&zonelists_mutex);
2977 build_all_zonelists(NULL); 3095 build_all_zonelists(NULL, NULL);
2978 mutex_unlock(&zonelists_mutex); 3096 mutex_unlock(&zonelists_mutex);
2979 } 3097 }
2980 } 3098 }
@@ -3353,14 +3471,21 @@ static void setup_zone_pageset(struct zone *zone);
3353DEFINE_MUTEX(zonelists_mutex); 3471DEFINE_MUTEX(zonelists_mutex);
3354 3472
3355/* return values int ....just for stop_machine() */ 3473/* return values int ....just for stop_machine() */
3356static __init_refok int __build_all_zonelists(void *data) 3474static int __build_all_zonelists(void *data)
3357{ 3475{
3358 int nid; 3476 int nid;
3359 int cpu; 3477 int cpu;
3478 pg_data_t *self = data;
3360 3479
3361#ifdef CONFIG_NUMA 3480#ifdef CONFIG_NUMA
3362 memset(node_load, 0, sizeof(node_load)); 3481 memset(node_load, 0, sizeof(node_load));
3363#endif 3482#endif
3483
3484 if (self && !node_online(self->node_id)) {
3485 build_zonelists(self);
3486 build_zonelist_cache(self);
3487 }
3488
3364 for_each_online_node(nid) { 3489 for_each_online_node(nid) {
3365 pg_data_t *pgdat = NODE_DATA(nid); 3490 pg_data_t *pgdat = NODE_DATA(nid);
3366 3491
@@ -3405,7 +3530,7 @@ static __init_refok int __build_all_zonelists(void *data)
3405 * Called with zonelists_mutex held always 3530 * Called with zonelists_mutex held always
3406 * unless system_state == SYSTEM_BOOTING. 3531 * unless system_state == SYSTEM_BOOTING.
3407 */ 3532 */
3408void __ref build_all_zonelists(void *data) 3533void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3409{ 3534{
3410 set_zonelist_order(); 3535 set_zonelist_order();
3411 3536
@@ -3417,10 +3542,10 @@ void __ref build_all_zonelists(void *data)
3417 /* we have to stop all cpus to guarantee there is no user 3542 /* we have to stop all cpus to guarantee there is no user
3418 of zonelist */ 3543 of zonelist */
3419#ifdef CONFIG_MEMORY_HOTPLUG 3544#ifdef CONFIG_MEMORY_HOTPLUG
3420 if (data) 3545 if (zone)
3421 setup_zone_pageset((struct zone *)data); 3546 setup_zone_pageset(zone);
3422#endif 3547#endif
3423 stop_machine(__build_all_zonelists, NULL, NULL); 3548 stop_machine(__build_all_zonelists, pgdat, NULL);
3424 /* cpuset refresh routine should be here */ 3549 /* cpuset refresh routine should be here */
3425 } 3550 }
3426 vm_total_pages = nr_free_pagecache_pages(); 3551 vm_total_pages = nr_free_pagecache_pages();
@@ -3690,7 +3815,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
3690 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 3815 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3691#endif 3816#endif
3692 3817
3693static int zone_batchsize(struct zone *zone) 3818static int __meminit zone_batchsize(struct zone *zone)
3694{ 3819{
3695#ifdef CONFIG_MMU 3820#ifdef CONFIG_MMU
3696 int batch; 3821 int batch;
@@ -3772,7 +3897,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3772 pcp->batch = PAGE_SHIFT * 8; 3897 pcp->batch = PAGE_SHIFT * 8;
3773} 3898}
3774 3899
3775static void setup_zone_pageset(struct zone *zone) 3900static void __meminit setup_zone_pageset(struct zone *zone)
3776{ 3901{
3777 int cpu; 3902 int cpu;
3778 3903
@@ -3845,32 +3970,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3845 return 0; 3970 return 0;
3846} 3971}
3847 3972
3848static int __zone_pcp_update(void *data)
3849{
3850 struct zone *zone = data;
3851 int cpu;
3852 unsigned long batch = zone_batchsize(zone), flags;
3853
3854 for_each_possible_cpu(cpu) {
3855 struct per_cpu_pageset *pset;
3856 struct per_cpu_pages *pcp;
3857
3858 pset = per_cpu_ptr(zone->pageset, cpu);
3859 pcp = &pset->pcp;
3860
3861 local_irq_save(flags);
3862 free_pcppages_bulk(zone, pcp->count, pcp);
3863 setup_pageset(pset, batch);
3864 local_irq_restore(flags);
3865 }
3866 return 0;
3867}
3868
3869void zone_pcp_update(struct zone *zone)
3870{
3871 stop_machine(__zone_pcp_update, zone, NULL);
3872}
3873
3874static __meminit void zone_pcp_init(struct zone *zone) 3973static __meminit void zone_pcp_init(struct zone *zone)
3875{ 3974{
3876 /* 3975 /*
@@ -3886,7 +3985,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
3886 zone_batchsize(zone)); 3985 zone_batchsize(zone));
3887} 3986}
3888 3987
3889__meminit int init_currently_empty_zone(struct zone *zone, 3988int __meminit init_currently_empty_zone(struct zone *zone,
3890 unsigned long zone_start_pfn, 3989 unsigned long zone_start_pfn,
3891 unsigned long size, 3990 unsigned long size,
3892 enum memmap_context context) 3991 enum memmap_context context)
@@ -4244,25 +4343,24 @@ static inline void setup_usemap(struct pglist_data *pgdat,
4244 4343
4245#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4344#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4246 4345
4247/* Return a sensible default order for the pageblock size. */
4248static inline int pageblock_default_order(void)
4249{
4250 if (HPAGE_SHIFT > PAGE_SHIFT)
4251 return HUGETLB_PAGE_ORDER;
4252
4253 return MAX_ORDER-1;
4254}
4255
4256/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4346/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4257static inline void __init set_pageblock_order(unsigned int order) 4347void __init set_pageblock_order(void)
4258{ 4348{
4349 unsigned int order;
4350
4259 /* Check that pageblock_nr_pages has not already been setup */ 4351 /* Check that pageblock_nr_pages has not already been setup */
4260 if (pageblock_order) 4352 if (pageblock_order)
4261 return; 4353 return;
4262 4354
4355 if (HPAGE_SHIFT > PAGE_SHIFT)
4356 order = HUGETLB_PAGE_ORDER;
4357 else
4358 order = MAX_ORDER - 1;
4359
4263 /* 4360 /*
4264 * Assume the largest contiguous order of interest is a huge page. 4361 * Assume the largest contiguous order of interest is a huge page.
4265 * This value may be variable depending on boot parameters on IA64 4362 * This value may be variable depending on boot parameters on IA64 and
4363 * powerpc.
4266 */ 4364 */
4267 pageblock_order = order; 4365 pageblock_order = order;
4268} 4366}
@@ -4270,15 +4368,13 @@ static inline void __init set_pageblock_order(unsigned int order)
4270 4368
4271/* 4369/*
4272 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order() 4370 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
4273 * and pageblock_default_order() are unused as pageblock_order is set 4371 * is unused as pageblock_order is set at compile-time. See
4274 * at compile-time. See include/linux/pageblock-flags.h for the values of 4372 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4275 * pageblock_order based on the kernel config 4373 * the kernel config
4276 */ 4374 */
4277static inline int pageblock_default_order(unsigned int order) 4375void __init set_pageblock_order(void)
4278{ 4376{
4279 return MAX_ORDER-1;
4280} 4377}
4281#define set_pageblock_order(x) do {} while (0)
4282 4378
4283#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ 4379#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
4284 4380
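
With CONFIG_HUGETLB_PAGE_SIZE_VARIABLE, set_pageblock_order() now folds the old pageblock_default_order() logic into itself: pick the huge-page order when huge pages are larger than base pages, otherwise the largest buddy order. A standalone sketch of that boot-time choice, with example shift values (4K base pages, 2MB huge pages):

#include <stdio.h>

#define PAGE_SHIFT          12
#define HPAGE_SHIFT         21			/* example: 2MB huge pages */
#define HUGETLB_PAGE_ORDER  (HPAGE_SHIFT - PAGE_SHIFT)
#define MAX_ORDER           11

/* Boot-time choice made by set_pageblock_order() when the huge page size is
 * only known at runtime: group pages at huge-page granularity if huge pages
 * are bigger than base pages, otherwise use the largest buddy order. */
static unsigned int choose_pageblock_order(void)
{
	if (HPAGE_SHIFT > PAGE_SHIFT)
		return HUGETLB_PAGE_ORDER;
	return MAX_ORDER - 1;
}

int main(void)
{
	printf("pageblock_order = %u\n", choose_pageblock_order());
	return 0;
}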
@@ -4287,6 +4383,8 @@ static inline int pageblock_default_order(unsigned int order)
4287 * - mark all pages reserved 4383 * - mark all pages reserved
4288 * - mark all memory queues empty 4384 * - mark all memory queues empty
4289 * - clear the memory bitmaps 4385 * - clear the memory bitmaps
4386 *
4387 * NOTE: pgdat should get zeroed by caller.
4290 */ 4388 */
4291static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4389static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4292 unsigned long *zones_size, unsigned long *zholes_size) 4390 unsigned long *zones_size, unsigned long *zholes_size)
@@ -4297,15 +4395,13 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4297 int ret; 4395 int ret;
4298 4396
4299 pgdat_resize_init(pgdat); 4397 pgdat_resize_init(pgdat);
4300 pgdat->nr_zones = 0;
4301 init_waitqueue_head(&pgdat->kswapd_wait); 4398 init_waitqueue_head(&pgdat->kswapd_wait);
4302 pgdat->kswapd_max_order = 0; 4399 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4303 pgdat_page_cgroup_init(pgdat); 4400 pgdat_page_cgroup_init(pgdat);
4304 4401
4305 for (j = 0; j < MAX_NR_ZONES; j++) { 4402 for (j = 0; j < MAX_NR_ZONES; j++) {
4306 struct zone *zone = pgdat->node_zones + j; 4403 struct zone *zone = pgdat->node_zones + j;
4307 unsigned long size, realsize, memmap_pages; 4404 unsigned long size, realsize, memmap_pages;
4308 enum lru_list lru;
4309 4405
4310 size = zone_spanned_pages_in_node(nid, j, zones_size); 4406 size = zone_spanned_pages_in_node(nid, j, zones_size);
4311 realsize = size - zone_absent_pages_in_node(nid, j, 4407 realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4342,6 +4438,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4342 4438
4343 zone->spanned_pages = size; 4439 zone->spanned_pages = size;
4344 zone->present_pages = realsize; 4440 zone->present_pages = realsize;
4441#if defined CONFIG_COMPACTION || defined CONFIG_CMA
4442 zone->compact_cached_free_pfn = zone->zone_start_pfn +
4443 zone->spanned_pages;
4444 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
4445#endif
4345#ifdef CONFIG_NUMA 4446#ifdef CONFIG_NUMA
4346 zone->node = nid; 4447 zone->node = nid;
4347 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4448 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4355,18 +4456,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4355 zone->zone_pgdat = pgdat; 4456 zone->zone_pgdat = pgdat;
4356 4457
4357 zone_pcp_init(zone); 4458 zone_pcp_init(zone);
4358 for_each_lru(lru) 4459 lruvec_init(&zone->lruvec, zone);
4359 INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
4360 zone->reclaim_stat.recent_rotated[0] = 0;
4361 zone->reclaim_stat.recent_rotated[1] = 0;
4362 zone->reclaim_stat.recent_scanned[0] = 0;
4363 zone->reclaim_stat.recent_scanned[1] = 0;
4364 zap_zone_vm_stats(zone);
4365 zone->flags = 0;
4366 if (!size) 4460 if (!size)
4367 continue; 4461 continue;
4368 4462
4369 set_pageblock_order(pageblock_default_order()); 4463 set_pageblock_order();
4370 setup_usemap(pgdat, zone, size); 4464 setup_usemap(pgdat, zone, size);
4371 ret = init_currently_empty_zone(zone, zone_start_pfn, 4465 ret = init_currently_empty_zone(zone, zone_start_pfn,
4372 size, MEMMAP_EARLY); 4466 size, MEMMAP_EARLY);
@@ -4422,6 +4516,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4422{ 4516{
4423 pg_data_t *pgdat = NODE_DATA(nid); 4517 pg_data_t *pgdat = NODE_DATA(nid);
4424 4518
4519 /* pg_data_t should be reset to zero when it's allocated */
4520 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4521
4425 pgdat->node_id = nid; 4522 pgdat->node_id = nid;
4426 pgdat->node_start_pfn = node_start_pfn; 4523 pgdat->node_start_pfn = node_start_pfn;
4427 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4524 calculate_node_totalpages(pgdat, zones_size, zholes_size);
@@ -4703,7 +4800,7 @@ out:
4703} 4800}
4704 4801
4705/* Any regular memory on that node ? */ 4802/* Any regular memory on that node ? */
4706static void check_for_regular_memory(pg_data_t *pgdat) 4803static void __init check_for_regular_memory(pg_data_t *pgdat)
4707{ 4804{
4708#ifdef CONFIG_HIGHMEM 4805#ifdef CONFIG_HIGHMEM
4709 enum zone_type zone_type; 4806 enum zone_type zone_type;
@@ -4759,31 +4856,34 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4759 find_zone_movable_pfns_for_nodes(); 4856 find_zone_movable_pfns_for_nodes();
4760 4857
4761 /* Print out the zone ranges */ 4858 /* Print out the zone ranges */
4762 printk("Zone PFN ranges:\n"); 4859 printk("Zone ranges:\n");
4763 for (i = 0; i < MAX_NR_ZONES; i++) { 4860 for (i = 0; i < MAX_NR_ZONES; i++) {
4764 if (i == ZONE_MOVABLE) 4861 if (i == ZONE_MOVABLE)
4765 continue; 4862 continue;
4766 printk(" %-8s ", zone_names[i]); 4863 printk(KERN_CONT " %-8s ", zone_names[i]);
4767 if (arch_zone_lowest_possible_pfn[i] == 4864 if (arch_zone_lowest_possible_pfn[i] ==
4768 arch_zone_highest_possible_pfn[i]) 4865 arch_zone_highest_possible_pfn[i])
4769 printk("empty\n"); 4866 printk(KERN_CONT "empty\n");
4770 else 4867 else
4771 printk("%0#10lx -> %0#10lx\n", 4868 printk(KERN_CONT "[mem %0#10lx-%0#10lx]\n",
4772 arch_zone_lowest_possible_pfn[i], 4869 arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
4773 arch_zone_highest_possible_pfn[i]); 4870 (arch_zone_highest_possible_pfn[i]
4871 << PAGE_SHIFT) - 1);
4774 } 4872 }
4775 4873
4776 /* Print out the PFNs ZONE_MOVABLE begins at in each node */ 4874 /* Print out the PFNs ZONE_MOVABLE begins at in each node */
4777 printk("Movable zone start PFN for each node\n"); 4875 printk("Movable zone start for each node\n");
4778 for (i = 0; i < MAX_NUMNODES; i++) { 4876 for (i = 0; i < MAX_NUMNODES; i++) {
4779 if (zone_movable_pfn[i]) 4877 if (zone_movable_pfn[i])
4780 printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); 4878 printk(" Node %d: %#010lx\n", i,
4879 zone_movable_pfn[i] << PAGE_SHIFT);
4781 } 4880 }
4782 4881
4783 /* Print out the early_node_map[] */ 4882 /* Print out the early_node_map[] */
4784 printk("Early memory PFN ranges\n"); 4883 printk("Early memory node ranges\n");
4785 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) 4884 for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
4786 printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn); 4885 printk(" node %3d: [mem %#010lx-%#010lx]\n", nid,
4886 start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
4787 4887
4788 /* Initialise every node */ 4888 /* Initialise every node */
4789 mminit_verify_pageflags_layout(); 4889 mminit_verify_pageflags_layout();
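
The boot messages above move from raw PFNs to inclusive byte ranges: each PFN is shifted up by PAGE_SHIFT and the exclusive end has one subtracted, matching the new "[mem 0x...-0x...]" style. A small sketch of that formatting with made-up ranges:

#include <stdio.h>

#define PAGE_SHIFT 12

/* Print a zone boundary as an inclusive byte range instead of raw PFNs:
 * shift the PFN up and subtract one from the exclusive end. */
static void print_mem_range(const char *name,
			    unsigned long start_pfn, unsigned long end_pfn)
{
	printf(" %-8s [mem %#010lx-%#010lx]\n", name,
	       start_pfn << PAGE_SHIFT,
	       (end_pfn << PAGE_SHIFT) - 1);
}

int main(void)
{
	print_mem_range("DMA",    0x00001, 0x01000);
	print_mem_range("Normal", 0x01000, 0x40000);
	return 0;
}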
@@ -4976,14 +5076,7 @@ static void setup_per_zone_lowmem_reserve(void)
4976 calculate_totalreserve_pages(); 5076 calculate_totalreserve_pages();
4977} 5077}
4978 5078
4979/** 5079static void __setup_per_zone_wmarks(void)
4980 * setup_per_zone_wmarks - called when min_free_kbytes changes
4981 * or when memory is hot-{added|removed}
4982 *
4983 * Ensures that the watermark[min,low,high] values for each zone are set
4984 * correctly with respect to min_free_kbytes.
4985 */
4986void setup_per_zone_wmarks(void)
4987{ 5080{
4988 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 5081 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
4989 unsigned long lowmem_pages = 0; 5082 unsigned long lowmem_pages = 0;
@@ -5030,6 +5123,11 @@ void setup_per_zone_wmarks(void)
5030 5123
5031 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); 5124 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
5032 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); 5125 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
5126
5127 zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
5128 zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
5129 zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
5130
5033 setup_zone_migrate_reserve(zone); 5131 setup_zone_migrate_reserve(zone);
5034 spin_unlock_irqrestore(&zone->lock, flags); 5132 spin_unlock_irqrestore(&zone->lock, flags);
5035 } 5133 }
@@ -5038,6 +5136,20 @@ void setup_per_zone_wmarks(void)
5038 calculate_totalreserve_pages(); 5136 calculate_totalreserve_pages();
5039} 5137}
5040 5138
5139/**
5140 * setup_per_zone_wmarks - called when min_free_kbytes changes
5141 * or when memory is hot-{added|removed}
5142 *
5143 * Ensures that the watermark[min,low,high] values for each zone are set
5144 * correctly with respect to min_free_kbytes.
5145 */
5146void setup_per_zone_wmarks(void)
5147{
5148 mutex_lock(&zonelists_mutex);
5149 __setup_per_zone_wmarks();
5150 mutex_unlock(&zonelists_mutex);
5151}
5152
5041/* 5153/*
5042 * The inactive anon list should be small enough that the VM never has to 5154 * The inactive anon list should be small enough that the VM never has to
5043 * do too much work, but large enough that each inactive page has a chance 5155 * do too much work, but large enough that each inactive page has a chance
@@ -5203,7 +5315,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
5203 int ret; 5315 int ret;
5204 5316
5205 ret = proc_dointvec_minmax(table, write, buffer, length, ppos); 5317 ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
5206 if (!write || (ret == -EINVAL)) 5318 if (!write || (ret < 0))
5207 return ret; 5319 return ret;
5208 for_each_populated_zone(zone) { 5320 for_each_populated_zone(zone) {
5209 for_each_possible_cpu(cpu) { 5321 for_each_possible_cpu(cpu) {
@@ -5242,9 +5354,10 @@ void *__init alloc_large_system_hash(const char *tablename,
5242 int flags, 5354 int flags,
5243 unsigned int *_hash_shift, 5355 unsigned int *_hash_shift,
5244 unsigned int *_hash_mask, 5356 unsigned int *_hash_mask,
5245 unsigned long limit) 5357 unsigned long low_limit,
5358 unsigned long high_limit)
5246{ 5359{
5247 unsigned long long max = limit; 5360 unsigned long long max = high_limit;
5248 unsigned long log2qty, size; 5361 unsigned long log2qty, size;
5249 void *table = NULL; 5362 void *table = NULL;
5250 5363
@@ -5282,6 +5395,8 @@ void *__init alloc_large_system_hash(const char *tablename,
5282 } 5395 }
5283 max = min(max, 0x80000000ULL); 5396 max = min(max, 0x80000000ULL);
5284 5397
5398 if (numentries < low_limit)
5399 numentries = low_limit;
5285 if (numentries > max) 5400 if (numentries > max)
5286 numentries = max; 5401 numentries = max;
5287 5402
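
The widened alloc_large_system_hash() signature above adds a low_limit alongside the old upper limit, so early-boot callers can enforce a floor as well as a ceiling on the table size. A minimal caller sketch, assuming a kernel context; the table name, scale and bounds are invented for illustration:

	static unsigned int demo_hash_shift __initdata;
	static unsigned int demo_hash_mask __initdata;

	static void __init demo_hash_init(void)
	{
		struct hlist_head *table;

		table = alloc_large_system_hash("demo-cache",
						sizeof(struct hlist_head),
						0,		/* size from total memory */
						14,		/* scale shift */
						0,		/* flags */
						&demo_hash_shift,
						&demo_hash_mask,
						256,		/* new: at least 256 buckets */
						64 * 1024);	/* at most 64K buckets */
		/* alloc_large_system_hash() panics on failure, so table is usable here. */
		(void)table;
	}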
@@ -5403,24 +5518,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5403} 5518}
5404 5519
5405/* 5520/*
5406 * This is designed as sub function...plz see page_isolation.c also. 5521 * This function checks whether pageblock includes unmovable pages or not.
5407 * set/clear page block's type to be ISOLATE. 5522 * If @count is not zero, it is okay to include less @count unmovable pages
5408 * page allocater never alloc memory from ISOLATE block. 5523 *
5524 * PageLRU check wihtout isolation or lru_lock could race so that
5525 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
5526 * expect this function should be exact.
5409 */ 5527 */
5410 5528bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
5411static int
5412__count_immobile_pages(struct zone *zone, struct page *page, int count)
5413{ 5529{
5414 unsigned long pfn, iter, found; 5530 unsigned long pfn, iter, found;
5531 int mt;
5532
5415 /* 5533 /*
5416 * For avoiding noise data, lru_add_drain_all() should be called 5534 * To avoid noisy data, lru_add_drain_all() should be called
5417 * If ZONE_MOVABLE, the zone never contains immobile pages 5535 * If ZONE_MOVABLE, the zone never contains unmovable pages
5418 */ 5536 */
5419 if (zone_idx(zone) == ZONE_MOVABLE) 5537 if (zone_idx(zone) == ZONE_MOVABLE)
5420 return true; 5538 return false;
5421 5539 mt = get_pageblock_migratetype(page);
5422 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) 5540 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
5423 return true; 5541 return false;
5424 5542
5425 pfn = page_to_pfn(page); 5543 pfn = page_to_pfn(page);
5426 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5544 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
@@ -5430,11 +5548,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5430 continue; 5548 continue;
5431 5549
5432 page = pfn_to_page(check); 5550 page = pfn_to_page(check);
5433 if (!page_count(page)) { 5551 /*
5552 * We can't use page_count without pinning the page
5553 * because another CPU can free the compound page.
5554 * This check already skips compound tails of THP
5555 * because their page->_count is zero at all times.
5556 */
5557 if (!atomic_read(&page->_count)) {
5434 if (PageBuddy(page)) 5558 if (PageBuddy(page))
5435 iter += (1 << page_order(page)) - 1; 5559 iter += (1 << page_order(page)) - 1;
5436 continue; 5560 continue;
5437 } 5561 }
5562
5438 if (!PageLRU(page)) 5563 if (!PageLRU(page))
5439 found++; 5564 found++;
5440 /* 5565 /*
@@ -5451,9 +5576,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5451 * page at boot. 5576 * page at boot.
5452 */ 5577 */
5453 if (found > count) 5578 if (found > count)
5454 return false; 5579 return true;
5455 } 5580 }
5456 return true; 5581 return false;
5457} 5582}
5458 5583
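
The helper above inverts the convention of the old __count_immobile_pages(): it now answers "does this pageblock contain unmovable pages?", so true means isolation should fail. A one-line caller sketch, assuming a kernel context where the zone and page are already known:

	/* Sketch: callers must negate the result, as is_pageblock_removable_nolock() now does. */
	static bool pageblock_is_isolatable(struct zone *zone, struct page *page)
	{
		return !has_unmovable_pages(zone, page, 0);
	}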
5459bool is_pageblock_removable_nolock(struct page *page) 5584bool is_pageblock_removable_nolock(struct page *page)
@@ -5477,80 +5602,304 @@ bool is_pageblock_removable_nolock(struct page *page)
5477 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5602 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5478 return false; 5603 return false;
5479 5604
5480 return __count_immobile_pages(zone, page, 0); 5605 return !has_unmovable_pages(zone, page, 0);
5606}
5607
5608#ifdef CONFIG_CMA
5609
5610static unsigned long pfn_max_align_down(unsigned long pfn)
5611{
5612 return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
5613 pageblock_nr_pages) - 1);
5481} 5614}
5482 5615
5483int set_migratetype_isolate(struct page *page) 5616static unsigned long pfn_max_align_up(unsigned long pfn)
5484{ 5617{
5485 struct zone *zone; 5618 return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
5486 unsigned long flags, pfn; 5619 pageblock_nr_pages));
5487 struct memory_isolate_notify arg; 5620}
5488 int notifier_ret;
5489 int ret = -EBUSY;
5490 5621
5491 zone = page_zone(page); 5622static struct page *
5623__alloc_contig_migrate_alloc(struct page *page, unsigned long private,
5624 int **resultp)
5625{
5626 gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE;
5627
5628 if (PageHighMem(page))
5629 gfp_mask |= __GFP_HIGHMEM;
5492 5630
5631 return alloc_page(gfp_mask);
5632}
5633
5634/* [start, end) must belong to a single zone. */
5635static int __alloc_contig_migrate_range(unsigned long start, unsigned long end)
5636{
5637 /* This function is based on compact_zone() from compaction.c. */
5638
5639 unsigned long pfn = start;
5640 unsigned int tries = 0;
5641 int ret = 0;
5642
5643 struct compact_control cc = {
5644 .nr_migratepages = 0,
5645 .order = -1,
5646 .zone = page_zone(pfn_to_page(start)),
5647 .sync = true,
5648 };
5649 INIT_LIST_HEAD(&cc.migratepages);
5650
5651 migrate_prep_local();
5652
5653 while (pfn < end || !list_empty(&cc.migratepages)) {
5654 if (fatal_signal_pending(current)) {
5655 ret = -EINTR;
5656 break;
5657 }
5658
5659 if (list_empty(&cc.migratepages)) {
5660 cc.nr_migratepages = 0;
5661 pfn = isolate_migratepages_range(cc.zone, &cc,
5662 pfn, end);
5663 if (!pfn) {
5664 ret = -EINTR;
5665 break;
5666 }
5667 tries = 0;
5668 } else if (++tries == 5) {
5669 ret = ret < 0 ? ret : -EBUSY;
5670 break;
5671 }
5672
5673 ret = migrate_pages(&cc.migratepages,
5674 __alloc_contig_migrate_alloc,
5675 0, false, MIGRATE_SYNC);
5676 }
5677
5678 putback_lru_pages(&cc.migratepages);
5679 return ret > 0 ? 0 : ret;
5680}
5681
5682/*
5683 * Update zone's cma pages counter used for watermark level calculation.
5684 */
5685static inline void __update_cma_watermarks(struct zone *zone, int count)
5686{
5687 unsigned long flags;
5493 spin_lock_irqsave(&zone->lock, flags); 5688 spin_lock_irqsave(&zone->lock, flags);
5689 zone->min_cma_pages += count;
5690 spin_unlock_irqrestore(&zone->lock, flags);
5691 setup_per_zone_wmarks();
5692}
5494 5693
5495 pfn = page_to_pfn(page); 5694/*
5496 arg.start_pfn = pfn; 5695 * Trigger memory pressure bump to reclaim some pages in order to be able to
5497 arg.nr_pages = pageblock_nr_pages; 5696 * allocate 'count' pages in single page units. Does similar work to
5498 arg.pages_found = 0; 5697 * __alloc_pages_slowpath().
5698 */
5699static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
5700{
5701 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
5702 struct zonelist *zonelist = node_zonelist(0, gfp_mask);
5703 int did_some_progress = 0;
5704 int order = 1;
5499 5705
5500 /* 5706 /*
5501 * It may be possible to isolate a pageblock even if the 5707 * Increase the watermark levels to force kswapd to do its job
5502 * migratetype is not MIGRATE_MOVABLE. The memory isolation 5708 * and stabilise at the new watermark level.
5503 * notifier chain is used by balloon drivers to return the
5504 * number of pages in a range that are held by the balloon
5505 * driver to shrink memory. If all the pages are accounted for
5506 * by balloons, are free, or on the LRU, isolation can continue.
5507 * Later, for example, when memory hotplug notifier runs, these
5508 * pages reported as "can be isolated" should be isolated(freed)
5509 * by the balloon driver through the memory notifier chain.
5510 */ 5709 */
5511 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); 5710 __update_cma_watermarks(zone, count);
5512 notifier_ret = notifier_to_errno(notifier_ret); 5711
5513 if (notifier_ret) 5712 /* Obey watermarks as if the page was being allocated */
5514 goto out; 5713 while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
5714 wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
5715
5716 did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
5717 NULL);
5718 if (!did_some_progress) {
5719 /* Exhausted what can be done so it's blamo time */
5720 out_of_memory(zonelist, gfp_mask, order, NULL, false);
5721 }
5722 }
5723
5724 /* Restore original watermark levels. */
5725 __update_cma_watermarks(zone, -count);
5726
5727 return count;
5728}
5729
5730/**
5731 * alloc_contig_range() -- tries to allocate given range of pages
5732 * @start: start PFN to allocate
5733 * @end: one-past-the-last PFN to allocate
5734 * @migratetype: migratetype of the underlying pageblocks (either
5735 * #MIGRATE_MOVABLE or #MIGRATE_CMA). All pageblocks
5736 * in range must have the same migratetype and it must
5737 * be either of the two.
5738 *
5739 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
5740 * aligned, however it's the caller's responsibility to guarantee that
5741 * we are the only thread that changes migrate type of pageblocks the
5742 * pages fall in.
5743 *
5744 * The PFN range must belong to a single zone.
5745 *
5746 * Returns zero on success or negative error code. On success all
5747 * pages whose PFN is in [start, end) are allocated for the caller and
5748 * need to be freed with free_contig_range().
5749 */
5750int alloc_contig_range(unsigned long start, unsigned long end,
5751 unsigned migratetype)
5752{
5753 struct zone *zone = page_zone(pfn_to_page(start));
5754 unsigned long outer_start, outer_end;
5755 int ret = 0, order;
5756
5515 /* 5757 /*
5516 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. 5758 * What we do here is we mark all pageblocks in range as
5517 * We just check MOVABLE pages. 5759 * MIGRATE_ISOLATE. Because pageblock and max order pages may
5760 * have different sizes, and due to the way the page allocator
5761 * works, we align the range to the bigger of the two so
5762 * that the page allocator won't try to merge buddies from
5763 * different pageblocks and change MIGRATE_ISOLATE to some
5764 * other migration type.
5765 *
5766 * Once the pageblocks are marked as MIGRATE_ISOLATE, we
5767 * migrate the pages from an unaligned range (ie. pages that
5768 * we are interested in). This will put all the pages in
5769 * range back to page allocator as MIGRATE_ISOLATE.
5770 *
5771 * When this is done, we take the pages in range from page
5772 * allocator removing them from the buddy system. This way
5773 * page allocator will never consider using them.
5774 *
5775 * This lets us mark the pageblocks back as
5776 * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
5777 * aligned range but not in the unaligned, original range are
5778 * put back to page allocator so that buddy can use them.
5518 */ 5779 */
5519 if (__count_immobile_pages(zone, page, arg.pages_found)) 5780
5520 ret = 0; 5781 ret = start_isolate_page_range(pfn_max_align_down(start),
5782 pfn_max_align_up(end), migratetype);
5783 if (ret)
5784 goto done;
5785
5786 ret = __alloc_contig_migrate_range(start, end);
5787 if (ret)
5788 goto done;
5521 5789
5522 /* 5790 /*
5523 * immobile means "not-on-lru" paes. If immobile is larger than 5791 * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
5524 * removable-by-driver pages reported by notifier, we'll fail. 5792 * aligned blocks that are marked as MIGRATE_ISOLATE. What's
5793 * more, all pages in [start, end) are free in page allocator.
5794 * What we are going to do is to allocate all pages from
5795 * [start, end) (that is remove them from page allocator).
5796 *
5797 * The only problem is that pages at the beginning and at the
5798 * end of the interesting range may not be aligned with pages that
5799 * the page allocator holds, i.e. they can be part of higher-order
5800 * pages. Because of this, we reserve the bigger range and
5801 * once this is done free the pages we are not interested in.
5802 *
5803 * We don't have to hold zone->lock here because the pages are
5804 * isolated thus they won't get removed from buddy.
5525 */ 5805 */
5526 5806
5527out: 5807 lru_add_drain_all();
5528 if (!ret) { 5808 drain_all_pages();
5529 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5809
5530 move_freepages_block(zone, page, MIGRATE_ISOLATE); 5810 order = 0;
5811 outer_start = start;
5812 while (!PageBuddy(pfn_to_page(outer_start))) {
5813 if (++order >= MAX_ORDER) {
5814 ret = -EBUSY;
5815 goto done;
5816 }
5817 outer_start &= ~0UL << order;
5531 } 5818 }
5532 5819
5533 spin_unlock_irqrestore(&zone->lock, flags); 5820 /* Make sure the range is really isolated. */
5534 if (!ret) 5821 if (test_pages_isolated(outer_start, end)) {
5535 drain_all_pages(); 5822 pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
5823 outer_start, end);
5824 ret = -EBUSY;
5825 goto done;
5826 }
5827
5828 /*
5829 * Reclaim enough pages to make sure that contiguous allocation
5830 * will not starve the system.
5831 */
5832 __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
5833
5834 /* Grab isolated pages from freelists. */
5835 outer_end = isolate_freepages_range(outer_start, end);
5836 if (!outer_end) {
5837 ret = -EBUSY;
5838 goto done;
5839 }
5840
5841 /* Free head and tail (if any) */
5842 if (start != outer_start)
5843 free_contig_range(outer_start, start - outer_start);
5844 if (end != outer_end)
5845 free_contig_range(end, outer_end - end);
5846
5847done:
5848 undo_isolate_page_range(pfn_max_align_down(start),
5849 pfn_max_align_up(end), migratetype);
5536 return ret; 5850 return ret;
5537} 5851}
5538 5852
5539void unset_migratetype_isolate(struct page *page) 5853void free_contig_range(unsigned long pfn, unsigned nr_pages)
5540{ 5854{
5541 struct zone *zone; 5855 for (; nr_pages--; ++pfn)
5542 unsigned long flags; 5856 __free_page(pfn_to_page(pfn));
5543 zone = page_zone(page); 5857}
5544 spin_lock_irqsave(&zone->lock, flags); 5858#endif
5545 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 5859
5546 goto out; 5860#ifdef CONFIG_MEMORY_HOTPLUG
5547 set_pageblock_migratetype(page, MIGRATE_MOVABLE); 5861static int __meminit __zone_pcp_update(void *data)
5548 move_freepages_block(zone, page, MIGRATE_MOVABLE); 5862{
5549out: 5863 struct zone *zone = data;
5550 spin_unlock_irqrestore(&zone->lock, flags); 5864 int cpu;
5865 unsigned long batch = zone_batchsize(zone), flags;
5866
5867 for_each_possible_cpu(cpu) {
5868 struct per_cpu_pageset *pset;
5869 struct per_cpu_pages *pcp;
5870
5871 pset = per_cpu_ptr(zone->pageset, cpu);
5872 pcp = &pset->pcp;
5873
5874 local_irq_save(flags);
5875 if (pcp->count > 0)
5876 free_pcppages_bulk(zone, pcp->count, pcp);
5877 setup_pageset(pset, batch);
5878 local_irq_restore(flags);
5879 }
5880 return 0;
5551} 5881}
5552 5882
5883void __meminit zone_pcp_update(struct zone *zone)
5884{
5885 stop_machine(__zone_pcp_update, zone, NULL);
5886}
5887#endif
5888
5553#ifdef CONFIG_MEMORY_HOTREMOVE 5889#ifdef CONFIG_MEMORY_HOTREMOVE
5890void zone_pcp_reset(struct zone *zone)
5891{
5892 unsigned long flags;
5893
5894 /* avoid races with drain_pages() */
5895 local_irq_save(flags);
5896 if (zone->pageset != &boot_pageset) {
5897 free_percpu(zone->pageset);
5898 zone->pageset = &boot_pageset;
5899 }
5900 local_irq_restore(flags);
5901}
5902
5554/* 5903/*
5555 * All pages in the range must be isolated before calling this. 5904 * All pages in the range must be isolated before calling this.
5556 */ 5905 */
@@ -5618,7 +5967,7 @@ bool is_free_buddy_page(struct page *page)
5618} 5967}
5619#endif 5968#endif
5620 5969
5621static struct trace_print_flags pageflag_names[] = { 5970static const struct trace_print_flags pageflag_names[] = {
5622 {1UL << PG_locked, "locked" }, 5971 {1UL << PG_locked, "locked" },
5623 {1UL << PG_error, "error" }, 5972 {1UL << PG_error, "error" },
5624 {1UL << PG_referenced, "referenced" }, 5973 {1UL << PG_referenced, "referenced" },
@@ -5653,7 +6002,9 @@ static struct trace_print_flags pageflag_names[] = {
5653#ifdef CONFIG_MEMORY_FAILURE 6002#ifdef CONFIG_MEMORY_FAILURE
5654 {1UL << PG_hwpoison, "hwpoison" }, 6003 {1UL << PG_hwpoison, "hwpoison" },
5655#endif 6004#endif
5656 {-1UL, NULL }, 6005#ifdef CONFIG_TRANSPARENT_HUGEPAGE
6006 {1UL << PG_compound_lock, "compound_lock" },
6007#endif
5657}; 6008};
5658 6009
5659static void dump_page_flags(unsigned long flags) 6010static void dump_page_flags(unsigned long flags)
@@ -5662,12 +6013,14 @@ static void dump_page_flags(unsigned long flags)
5662 unsigned long mask; 6013 unsigned long mask;
5663 int i; 6014 int i;
5664 6015
6016 BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS);
6017
5665 printk(KERN_ALERT "page flags: %#lx(", flags); 6018 printk(KERN_ALERT "page flags: %#lx(", flags);
5666 6019
5667 /* remove zone id */ 6020 /* remove zone id */
5668 flags &= (1UL << NR_PAGEFLAGS) - 1; 6021 flags &= (1UL << NR_PAGEFLAGS) - 1;
5669 6022
5670 for (i = 0; pageflag_names[i].name && flags; i++) { 6023 for (i = 0; i < ARRAY_SIZE(pageflag_names) && flags; i++) {
5671 6024
5672 mask = pageflag_names[i].mask; 6025 mask = pageflag_names[i].mask;
5673 if ((flags & mask) != mask) 6026 if ((flags & mask) != mask)
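
The alloc_contig_range()/free_contig_range() pair introduced in this hunk is the low-level interface that CMA is built on. A minimal usage sketch, assuming CONFIG_CMA and a caller that already owns a MIGRATE_CMA-marked PFN range inside a single zone; base_pfn and nr_pages are placeholders:

	/* Sketch: grab nr_pages physically contiguous pages from a CMA region. */
	static struct page *grab_contig_pages(unsigned long base_pfn,
					      unsigned long nr_pages)
	{
		/* Every pageblock in [base_pfn, base_pfn + nr_pages) must be MIGRATE_CMA. */
		if (alloc_contig_range(base_pfn, base_pfn + nr_pages, MIGRATE_CMA))
			return NULL;	/* -EINTR, -EBUSY or a migration failure */

		return pfn_to_page(base_pfn);
	}

	static void release_contig_pages(struct page *page, unsigned long nr_pages)
	{
		/* Pages obtained from alloc_contig_range() must be returned this way. */
		free_contig_range(page_to_pfn(page), nr_pages);
	}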
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 1ccbd714059c..5ddad0c6daa6 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -317,7 +317,7 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
317#endif 317#endif
318 318
319 319
320#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 320#ifdef CONFIG_MEMCG_SWAP
321 321
322static DEFINE_MUTEX(swap_cgroup_mutex); 322static DEFINE_MUTEX(swap_cgroup_mutex);
323struct swap_cgroup_ctrl { 323struct swap_cgroup_ctrl {
@@ -392,7 +392,7 @@ static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent,
392 392
393/** 393/**
394 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. 394 * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry.
395 * @end: swap entry to be cmpxchged 395 * @ent: swap entry to be cmpxchged
396 * @old: old id 396 * @old: old id
397 * @new: new id 397 * @new: new id
398 * 398 *
@@ -422,7 +422,7 @@ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
422/** 422/**
423 * swap_cgroup_record - record mem_cgroup for this swp_entry. 423 * swap_cgroup_record - record mem_cgroup for this swp_entry.
424 * @ent: swap entry to be recorded into 424 * @ent: swap entry to be recorded into
425 * @mem: mem_cgroup to be recorded 425 * @id: mem_cgroup to be recorded
426 * 426 *
427 * Returns old value at success, 0 at failure. 427 * Returns old value at success, 0 at failure.
428 * (Of course, old value can be 0.) 428 * (Of course, old value can be 0.)
diff --git a/mm/page_io.c b/mm/page_io.c
index dc76b4d0611e..78eee32ee486 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -17,7 +17,9 @@
17#include <linux/swap.h> 17#include <linux/swap.h>
18#include <linux/bio.h> 18#include <linux/bio.h>
19#include <linux/swapops.h> 19#include <linux/swapops.h>
20#include <linux/buffer_head.h>
20#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/frontswap.h>
21#include <asm/pgtable.h> 23#include <asm/pgtable.h>
22 24
23static struct bio *get_swap_bio(gfp_t gfp_flags, 25static struct bio *get_swap_bio(gfp_t gfp_flags,
@@ -85,6 +87,98 @@ void end_swap_bio_read(struct bio *bio, int err)
85 bio_put(bio); 87 bio_put(bio);
86} 88}
87 89
90int generic_swapfile_activate(struct swap_info_struct *sis,
91 struct file *swap_file,
92 sector_t *span)
93{
94 struct address_space *mapping = swap_file->f_mapping;
95 struct inode *inode = mapping->host;
96 unsigned blocks_per_page;
97 unsigned long page_no;
98 unsigned blkbits;
99 sector_t probe_block;
100 sector_t last_block;
101 sector_t lowest_block = -1;
102 sector_t highest_block = 0;
103 int nr_extents = 0;
104 int ret;
105
106 blkbits = inode->i_blkbits;
107 blocks_per_page = PAGE_SIZE >> blkbits;
108
109 /*
110 * Map all the blocks into the extent list. This code doesn't try
111 * to be very smart.
112 */
113 probe_block = 0;
114 page_no = 0;
115 last_block = i_size_read(inode) >> blkbits;
116 while ((probe_block + blocks_per_page) <= last_block &&
117 page_no < sis->max) {
118 unsigned block_in_page;
119 sector_t first_block;
120
121 first_block = bmap(inode, probe_block);
122 if (first_block == 0)
123 goto bad_bmap;
124
125 /*
126 * It must be PAGE_SIZE aligned on-disk
127 */
128 if (first_block & (blocks_per_page - 1)) {
129 probe_block++;
130 goto reprobe;
131 }
132
133 for (block_in_page = 1; block_in_page < blocks_per_page;
134 block_in_page++) {
135 sector_t block;
136
137 block = bmap(inode, probe_block + block_in_page);
138 if (block == 0)
139 goto bad_bmap;
140 if (block != first_block + block_in_page) {
141 /* Discontiguity */
142 probe_block++;
143 goto reprobe;
144 }
145 }
146
147 first_block >>= (PAGE_SHIFT - blkbits);
148 if (page_no) { /* exclude the header page */
149 if (first_block < lowest_block)
150 lowest_block = first_block;
151 if (first_block > highest_block)
152 highest_block = first_block;
153 }
154
155 /*
156 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
157 */
158 ret = add_swap_extent(sis, page_no, 1, first_block);
159 if (ret < 0)
160 goto out;
161 nr_extents += ret;
162 page_no++;
163 probe_block += blocks_per_page;
164reprobe:
165 continue;
166 }
167 ret = nr_extents;
168 *span = 1 + highest_block - lowest_block;
169 if (page_no == 0)
170 page_no = 1; /* force Empty message */
171 sis->max = page_no;
172 sis->pages = page_no - 1;
173 sis->highest_bit = page_no - 1;
174out:
175 return ret;
176bad_bmap:
177 printk(KERN_ERR "swapon: swapfile has holes\n");
178 ret = -EINVAL;
179 goto out;
180}
181
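
generic_swapfile_activate() only builds extents over runs of blocks that are both contiguous on disk and PAGE_SIZE-aligned; anything else bumps probe_block and tries again, and a hole aborts the whole scan. The standalone program below mirrors that rule in plain userspace C so the extent-building logic can be tried outside the kernel; the bmap[] array is a made-up stand-in for bmap():

#include <stdio.h>

#define PAGE_SHIFT	12
#define BLKBITS		10			/* assume 1KiB filesystem blocks */
#define BLOCKS_PER_PAGE	(1U << (PAGE_SHIFT - BLKBITS))

/* Hypothetical file-block -> disk-block map (0 would mean a hole). */
static const unsigned long bmap[] = { 64, 65, 66, 67, 72, 73, 74, 75, 90, 91, 92, 93 };

int main(void)
{
	unsigned long probe = 0, nblocks = sizeof(bmap) / sizeof(bmap[0]);

	while (probe + BLOCKS_PER_PAGE <= nblocks) {
		unsigned long first = bmap[probe], i;

		/* A hole or a misaligned start disqualifies this position. */
		if (first == 0 || (first & (BLOCKS_PER_PAGE - 1))) {
			probe++;
			continue;
		}
		/* Every remaining block of the page must follow contiguously. */
		for (i = 1; i < BLOCKS_PER_PAGE; i++)
			if (bmap[probe + i] != first + i)
				break;
		if (i == BLOCKS_PER_PAGE) {
			printf("file page %lu -> swap extent at disk page %lu\n",
			       probe / BLOCKS_PER_PAGE,
			       first >> (PAGE_SHIFT - BLKBITS));
			probe += BLOCKS_PER_PAGE;
		} else {
			probe++;
		}
	}
	return 0;
}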
88/* 182/*
89 * We may have stale swap cache pages in memory: notice 183 * We may have stale swap cache pages in memory: notice
90 * them here and get rid of the unnecessary final write. 184 * them here and get rid of the unnecessary final write.
@@ -93,11 +187,45 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
93{ 187{
94 struct bio *bio; 188 struct bio *bio;
95 int ret = 0, rw = WRITE; 189 int ret = 0, rw = WRITE;
190 struct swap_info_struct *sis = page_swap_info(page);
96 191
97 if (try_to_free_swap(page)) { 192 if (try_to_free_swap(page)) {
98 unlock_page(page); 193 unlock_page(page);
99 goto out; 194 goto out;
100 } 195 }
196 if (frontswap_store(page) == 0) {
197 set_page_writeback(page);
198 unlock_page(page);
199 end_page_writeback(page);
200 goto out;
201 }
202
203 if (sis->flags & SWP_FILE) {
204 struct kiocb kiocb;
205 struct file *swap_file = sis->swap_file;
206 struct address_space *mapping = swap_file->f_mapping;
207 struct iovec iov = {
208 .iov_base = kmap(page),
209 .iov_len = PAGE_SIZE,
210 };
211
212 init_sync_kiocb(&kiocb, swap_file);
213 kiocb.ki_pos = page_file_offset(page);
214 kiocb.ki_left = PAGE_SIZE;
215 kiocb.ki_nbytes = PAGE_SIZE;
216
217 unlock_page(page);
218 ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
219 &kiocb, &iov,
220 kiocb.ki_pos, 1);
221 kunmap(page);
222 if (ret == PAGE_SIZE) {
223 count_vm_event(PSWPOUT);
224 ret = 0;
225 }
226 return ret;
227 }
228
101 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); 229 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
102 if (bio == NULL) { 230 if (bio == NULL) {
103 set_page_dirty(page); 231 set_page_dirty(page);
@@ -119,9 +247,26 @@ int swap_readpage(struct page *page)
119{ 247{
120 struct bio *bio; 248 struct bio *bio;
121 int ret = 0; 249 int ret = 0;
250 struct swap_info_struct *sis = page_swap_info(page);
122 251
123 VM_BUG_ON(!PageLocked(page)); 252 VM_BUG_ON(!PageLocked(page));
124 VM_BUG_ON(PageUptodate(page)); 253 VM_BUG_ON(PageUptodate(page));
254 if (frontswap_load(page) == 0) {
255 SetPageUptodate(page);
256 unlock_page(page);
257 goto out;
258 }
259
260 if (sis->flags & SWP_FILE) {
261 struct file *swap_file = sis->swap_file;
262 struct address_space *mapping = swap_file->f_mapping;
263
264 ret = mapping->a_ops->readpage(swap_file, page);
265 if (!ret)
266 count_vm_event(PSWPIN);
267 return ret;
268 }
269
125 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); 270 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
126 if (bio == NULL) { 271 if (bio == NULL) {
127 unlock_page(page); 272 unlock_page(page);
@@ -133,3 +278,15 @@ int swap_readpage(struct page *page)
133out: 278out:
134 return ret; 279 return ret;
135} 280}
281
282int swap_set_page_dirty(struct page *page)
283{
284 struct swap_info_struct *sis = page_swap_info(page);
285
286 if (sis->flags & SWP_FILE) {
287 struct address_space *mapping = sis->swap_file->f_mapping;
288 return mapping->a_ops->set_page_dirty(page);
289 } else {
290 return __set_page_dirty_no_writeback(page);
291 }
292}
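
The SWP_FILE branches in swap_writepage() and swap_readpage() above hand swap I/O to the owning filesystem's direct_IO and readpage methods instead of raw bios. A filesystem opts in through an address_space operation; the fragment below is only a sketch, and it assumes the swap_activate hook added elsewhere in this patch series (the hook name and its use here are assumptions, and myfs_* is hypothetical). A plain block-backed filesystem could simply reuse generic_swapfile_activate():

	static int myfs_swap_activate(struct swap_info_struct *sis,
				      struct file *file, sector_t *span)
	{
		/* Build the page-aligned extent map exactly as the generic helper does. */
		return generic_swapfile_activate(sis, file, span);
	}

	static const struct address_space_operations myfs_aops = {
		/* ... the usual readpage/direct_IO/etc. operations ... */
		.swap_activate	= myfs_swap_activate,	/* assumed hook from this series */
	};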
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 4ae42bb40892..247d1f175739 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -5,8 +5,101 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/page-isolation.h> 6#include <linux/page-isolation.h>
7#include <linux/pageblock-flags.h> 7#include <linux/pageblock-flags.h>
8#include <linux/memory.h>
8#include "internal.h" 9#include "internal.h"
9 10
11/* called while holding zone->lock */
12static void set_pageblock_isolate(struct page *page)
13{
14 if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
15 return;
16
17 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
18 page_zone(page)->nr_pageblock_isolate++;
19}
20
21/* called while holding zone->lock */
22static void restore_pageblock_isolate(struct page *page, int migratetype)
23{
24 struct zone *zone = page_zone(page);
25 if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
26 return;
27
28 BUG_ON(zone->nr_pageblock_isolate <= 0);
29 set_pageblock_migratetype(page, migratetype);
30 zone->nr_pageblock_isolate--;
31}
32
33int set_migratetype_isolate(struct page *page)
34{
35 struct zone *zone;
36 unsigned long flags, pfn;
37 struct memory_isolate_notify arg;
38 int notifier_ret;
39 int ret = -EBUSY;
40
41 zone = page_zone(page);
42
43 spin_lock_irqsave(&zone->lock, flags);
44
45 pfn = page_to_pfn(page);
46 arg.start_pfn = pfn;
47 arg.nr_pages = pageblock_nr_pages;
48 arg.pages_found = 0;
49
50 /*
51 * It may be possible to isolate a pageblock even if the
52 * migratetype is not MIGRATE_MOVABLE. The memory isolation
53 * notifier chain is used by balloon drivers to return the
54 * number of pages in a range that are held by the balloon
55 * driver to shrink memory. If all the pages are accounted for
56 * by balloons, are free, or on the LRU, isolation can continue.
57 * Later, for example, when memory hotplug notifier runs, these
58 * pages reported as "can be isolated" should be isolated(freed)
59 * by the balloon driver through the memory notifier chain.
60 */
61 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
62 notifier_ret = notifier_to_errno(notifier_ret);
63 if (notifier_ret)
64 goto out;
65 /*
66 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
67 * We just check MOVABLE pages.
68 */
69 if (!has_unmovable_pages(zone, page, arg.pages_found))
70 ret = 0;
71
72 /*
73 * immobile means "not-on-lru" pages. If immobile is larger than
74 * removable-by-driver pages reported by notifier, we'll fail.
75 */
76
77out:
78 if (!ret) {
79 set_pageblock_isolate(page);
80 move_freepages_block(zone, page, MIGRATE_ISOLATE);
81 }
82
83 spin_unlock_irqrestore(&zone->lock, flags);
84 if (!ret)
85 drain_all_pages();
86 return ret;
87}
88
89void unset_migratetype_isolate(struct page *page, unsigned migratetype)
90{
91 struct zone *zone;
92 unsigned long flags;
93 zone = page_zone(page);
94 spin_lock_irqsave(&zone->lock, flags);
95 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
96 goto out;
97 move_freepages_block(zone, page, migratetype);
98 restore_pageblock_isolate(page, migratetype);
99out:
100 spin_unlock_irqrestore(&zone->lock, flags);
101}
102
10static inline struct page * 103static inline struct page *
11__first_valid_page(unsigned long pfn, unsigned long nr_pages) 104__first_valid_page(unsigned long pfn, unsigned long nr_pages)
12{ 105{
@@ -24,6 +117,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
24 * to be MIGRATE_ISOLATE. 117 * to be MIGRATE_ISOLATE.
25 * @start_pfn: The lower PFN of the range to be isolated. 118 * @start_pfn: The lower PFN of the range to be isolated.
26 * @end_pfn: The upper PFN of the range to be isolated. 119 * @end_pfn: The upper PFN of the range to be isolated.
120 * @migratetype: migrate type to set in error recovery.
27 * 121 *
28 * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in 122 * Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
29 * the range will never be allocated. Any free pages and pages freed in the 123 * the range will never be allocated. Any free pages and pages freed in the
@@ -32,8 +126,8 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
32 * start_pfn/end_pfn must be aligned to pageblock_order. 126 * start_pfn/end_pfn must be aligned to pageblock_order.
33 * Returns 0 on success and -EBUSY if any part of range cannot be isolated. 127 * Returns 0 on success and -EBUSY if any part of range cannot be isolated.
34 */ 128 */
35int 129int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
36start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) 130 unsigned migratetype)
37{ 131{
38 unsigned long pfn; 132 unsigned long pfn;
39 unsigned long undo_pfn; 133 unsigned long undo_pfn;
@@ -56,7 +150,7 @@ undo:
56 for (pfn = start_pfn; 150 for (pfn = start_pfn;
57 pfn < undo_pfn; 151 pfn < undo_pfn;
58 pfn += pageblock_nr_pages) 152 pfn += pageblock_nr_pages)
59 unset_migratetype_isolate(pfn_to_page(pfn)); 153 unset_migratetype_isolate(pfn_to_page(pfn), migratetype);
60 154
61 return -EBUSY; 155 return -EBUSY;
62} 156}
@@ -64,8 +158,8 @@ undo:
64/* 158/*
65 * Make isolated pages available again. 159 * Make isolated pages available again.
66 */ 160 */
67int 161int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
68undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) 162 unsigned migratetype)
69{ 163{
70 unsigned long pfn; 164 unsigned long pfn;
71 struct page *page; 165 struct page *page;
@@ -77,7 +171,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
77 page = __first_valid_page(pfn, pageblock_nr_pages); 171 page = __first_valid_page(pfn, pageblock_nr_pages);
78 if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 172 if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
79 continue; 173 continue;
80 unset_migratetype_isolate(page); 174 unset_migratetype_isolate(page, migratetype);
81 } 175 }
82 return 0; 176 return 0;
83} 177}
@@ -86,7 +180,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
86 * all pages in [start_pfn...end_pfn) must be in the same zone. 180 * all pages in [start_pfn...end_pfn) must be in the same zone.
87 * zone->lock must be held before call this. 181 * zone->lock must be held before call this.
88 * 182 *
89 * Returns 1 if all pages in the range is isolated. 183 * Returns 1 if all pages in the range are isolated.
90 */ 184 */
91static int 185static int
92__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) 186__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
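
Both isolation entry points now take the migratetype to restore, so a caller that isolates pageblocks states up front what they should become again when the operation finishes or fails. A condensed sketch of the resulting pattern, assuming a kernel caller with a pageblock-aligned range; the symbols are placeholders:

	/* Sketch: the isolate -> work -> undo pattern used by CMA and memory hot-remove. */
	static int claim_range(unsigned long start_pfn, unsigned long end_pfn,
			       unsigned migratetype)	/* MIGRATE_MOVABLE or MIGRATE_CMA */
	{
		int ret;

		ret = start_isolate_page_range(start_pfn, end_pfn, migratetype);
		if (ret)
			return ret;	/* partial isolation is rolled back internally */

		/* ... migrate pages away, or take them off the free lists ... */

		/* Whatever happened above, hand the pageblocks back as migratetype. */
		undo_isolate_page_range(start_pfn, end_pfn, migratetype);
		return 0;
	}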
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index aa9701e12714..6c118d012bb5 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -162,7 +162,6 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
162 162
163/** 163/**
164 * walk_page_range - walk a memory map's page tables with a callback 164 * walk_page_range - walk a memory map's page tables with a callback
165 * @mm: memory map to walk
166 * @addr: starting address 165 * @addr: starting address
167 * @end: ending address 166 * @end: ending address
168 * @walk: set of callbacks to invoke for each level of the tree 167 * @walk: set of callbacks to invoke for each level of the tree
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 405d331804c3..3707c71ae4cd 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -360,7 +360,6 @@ err_free:
360 * @chunk: chunk to depopulate 360 * @chunk: chunk to depopulate
361 * @off: offset to the area to depopulate 361 * @off: offset to the area to depopulate
362 * @size: size of the area to depopulate in bytes 362 * @size: size of the area to depopulate in bytes
363 * @flush: whether to flush cache and tlb or not
364 * 363 *
365 * For each cpu, depopulate and unmap pages [@page_start,@page_end) 364 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
366 * from @chunk. If @flush is true, vcache is flushed before unmapping 365 * from @chunk. If @flush is true, vcache is flushed before unmapping
diff --git a/mm/percpu.c b/mm/percpu.c
index f47af9123af7..bb4be7435ce3 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1132,20 +1132,20 @@ static void pcpu_dump_alloc_info(const char *lvl,
1132 for (alloc_end += gi->nr_units / upa; 1132 for (alloc_end += gi->nr_units / upa;
1133 alloc < alloc_end; alloc++) { 1133 alloc < alloc_end; alloc++) {
1134 if (!(alloc % apl)) { 1134 if (!(alloc % apl)) {
1135 printk("\n"); 1135 printk(KERN_CONT "\n");
1136 printk("%spcpu-alloc: ", lvl); 1136 printk("%spcpu-alloc: ", lvl);
1137 } 1137 }
1138 printk("[%0*d] ", group_width, group); 1138 printk(KERN_CONT "[%0*d] ", group_width, group);
1139 1139
1140 for (unit_end += upa; unit < unit_end; unit++) 1140 for (unit_end += upa; unit < unit_end; unit++)
1141 if (gi->cpu_map[unit] != NR_CPUS) 1141 if (gi->cpu_map[unit] != NR_CPUS)
1142 printk("%0*d ", cpu_width, 1142 printk(KERN_CONT "%0*d ", cpu_width,
1143 gi->cpu_map[unit]); 1143 gi->cpu_map[unit]);
1144 else 1144 else
1145 printk("%s ", empty_str); 1145 printk(KERN_CONT "%s ", empty_str);
1146 } 1146 }
1147 } 1147 }
1148 printk("\n"); 1148 printk(KERN_CONT "\n");
1149} 1149}
1150 1150
1151/** 1151/**
@@ -1650,6 +1650,16 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1650 areas[group] = ptr; 1650 areas[group] = ptr;
1651 1651
1652 base = min(ptr, base); 1652 base = min(ptr, base);
1653 }
1654
1655 /*
1656 * Copy data and free unused parts. This should happen after all
1657 * allocations are complete; otherwise, we may end up with
1658 * overlapping groups.
1659 */
1660 for (group = 0; group < ai->nr_groups; group++) {
1661 struct pcpu_group_info *gi = &ai->groups[group];
1662 void *ptr = areas[group];
1653 1663
1654 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { 1664 for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
1655 if (gi->cpu_map[i] == NR_CPUS) { 1665 if (gi->cpu_map[i] == NR_CPUS) {
@@ -1885,6 +1895,8 @@ void __init setup_per_cpu_areas(void)
1885 fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); 1895 fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
1886 if (!ai || !fc) 1896 if (!ai || !fc)
1887 panic("Failed to allocate memory for percpu areas."); 1897 panic("Failed to allocate memory for percpu areas.");
1898 /* kmemleak tracks the percpu allocations separately */
1899 kmemleak_free(fc);
1888 1900
1889 ai->dyn_size = unit_size; 1901 ai->dyn_size = unit_size;
1890 ai->unit_size = unit_size; 1902 ai->unit_size = unit_size;
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 5a74fea182f1..74c0ddaa6fa0 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -109,8 +109,8 @@ pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
109 109
110#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH 110#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
111#ifdef CONFIG_TRANSPARENT_HUGEPAGE 111#ifdef CONFIG_TRANSPARENT_HUGEPAGE
112pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, 112void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
113 pmd_t *pmdp) 113 pmd_t *pmdp)
114{ 114{
115 pmd_t pmd = pmd_mksplitting(*pmdp); 115 pmd_t pmd = pmd_mksplitting(*pmdp);
116 VM_BUG_ON(address & ~HPAGE_PMD_MASK); 116 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index c20ff48994c2..926b46649749 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -371,15 +371,15 @@ static ssize_t process_vm_rw(pid_t pid,
371 /* Check iovecs */ 371 /* Check iovecs */
372 if (vm_write) 372 if (vm_write)
373 rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, 373 rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV,
374 iovstack_l, &iov_l, 1); 374 iovstack_l, &iov_l);
375 else 375 else
376 rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, 376 rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV,
377 iovstack_l, &iov_l, 1); 377 iovstack_l, &iov_l);
378 if (rc <= 0) 378 if (rc <= 0)
379 goto free_iovecs; 379 goto free_iovecs;
380 380
381 rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV, 381 rc = rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt, UIO_FASTIOV,
382 iovstack_r, &iov_r, 0); 382 iovstack_r, &iov_r);
383 if (rc <= 0) 383 if (rc <= 0)
384 goto free_iovecs; 384 goto free_iovecs;
385 385
@@ -438,16 +438,16 @@ compat_process_vm_rw(compat_pid_t pid,
438 if (vm_write) 438 if (vm_write)
439 rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, 439 rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
440 UIO_FASTIOV, iovstack_l, 440 UIO_FASTIOV, iovstack_l,
441 &iov_l, 1); 441 &iov_l);
442 else 442 else
443 rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt, 443 rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt,
444 UIO_FASTIOV, iovstack_l, 444 UIO_FASTIOV, iovstack_l,
445 &iov_l, 1); 445 &iov_l);
446 if (rc <= 0) 446 if (rc <= 0)
447 goto free_iovecs; 447 goto free_iovecs;
448 rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt, 448 rc = compat_rw_copy_check_uvector(CHECK_IOVEC_ONLY, rvec, riovcnt,
449 UIO_FASTIOV, iovstack_r, 449 UIO_FASTIOV, iovstack_r,
450 &iov_r, 0); 450 &iov_r);
451 if (rc <= 0) 451 if (rc <= 0)
452 goto free_iovecs; 452 goto free_iovecs;
453 453
diff --git a/mm/readahead.c b/mm/readahead.c
index cbcbb02f3e28..ea8f8fa21649 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -17,6 +17,8 @@
17#include <linux/task_io_accounting_ops.h> 17#include <linux/task_io_accounting_ops.h>
18#include <linux/pagevec.h> 18#include <linux/pagevec.h>
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/syscalls.h>
21#include <linux/file.h>
20 22
21/* 23/*
22 * Initialise a struct file's readahead state. Assumes that the caller has 24 * Initialise a struct file's readahead state. Assumes that the caller has
@@ -562,3 +564,41 @@ page_cache_async_readahead(struct address_space *mapping,
562 ondemand_readahead(mapping, ra, filp, true, offset, req_size); 564 ondemand_readahead(mapping, ra, filp, true, offset, req_size);
563} 565}
564EXPORT_SYMBOL_GPL(page_cache_async_readahead); 566EXPORT_SYMBOL_GPL(page_cache_async_readahead);
567
568static ssize_t
569do_readahead(struct address_space *mapping, struct file *filp,
570 pgoff_t index, unsigned long nr)
571{
572 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
573 return -EINVAL;
574
575 force_page_cache_readahead(mapping, filp, index, nr);
576 return 0;
577}
578
579SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
580{
581 ssize_t ret;
582 struct file *file;
583
584 ret = -EBADF;
585 file = fget(fd);
586 if (file) {
587 if (file->f_mode & FMODE_READ) {
588 struct address_space *mapping = file->f_mapping;
589 pgoff_t start = offset >> PAGE_CACHE_SHIFT;
590 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
591 unsigned long len = end - start + 1;
592 ret = do_readahead(mapping, file, start, len);
593 }
594 fput(file);
595 }
596 return ret;
597}
598#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
599asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
600{
601 return SYSC_readahead((int) fd, offset, (size_t) count);
602}
603SYSCALL_ALIAS(sys_readahead, SyS_readahead);
604#endif
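
The hunk above places the readahead(2) system call implementation alongside the readahead code it drives; the userspace contract is unchanged. A small self-contained example using the glibc wrapper (Linux-specific, _GNU_SOURCE required; the path is a placeholder):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/var/log/syslog", O_RDONLY);	/* placeholder path */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/*
	 * Populate the page cache for the first 1 MiB. As in the kernel code
	 * above, this can fail with EBADF or EINVAL, but otherwise it returns
	 * 0 without waiting for the I/O to complete.
	 */
	if (readahead(fd, 0, 1 << 20) != 0)
		perror("readahead");
	close(fd);
	return 0;
}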
diff --git a/mm/rmap.c b/mm/rmap.c
index 5b5ad584ffb7..0f3b7cda2a24 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -755,12 +755,6 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
755 pte_unmap_unlock(pte, ptl); 755 pte_unmap_unlock(pte, ptl);
756 } 756 }
757 757
758 /* Pretend the page is referenced if the task has the
759 swap token and is in the middle of a page fault. */
760 if (mm != current->mm && has_swap_token(mm) &&
761 rwsem_is_locked(&mm->mmap_sem))
762 referenced++;
763
764 (*mapcount)--; 758 (*mapcount)--;
765 759
766 if (referenced) 760 if (referenced)
diff --git a/mm/shmem.c b/mm/shmem.c
index f99ff3e50bd6..d4e184e2a38e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -53,6 +53,7 @@ static struct vfsmount *shm_mnt;
53#include <linux/blkdev.h> 53#include <linux/blkdev.h>
54#include <linux/pagevec.h> 54#include <linux/pagevec.h>
55#include <linux/percpu_counter.h> 55#include <linux/percpu_counter.h>
56#include <linux/falloc.h>
56#include <linux/splice.h> 57#include <linux/splice.h>
57#include <linux/security.h> 58#include <linux/security.h>
58#include <linux/swapops.h> 59#include <linux/swapops.h>
@@ -83,12 +84,25 @@ struct shmem_xattr {
83 char value[0]; 84 char value[0];
84}; 85};
85 86
87/*
88 * shmem_fallocate and shmem_writepage communicate via inode->i_private
89 * (with i_mutex making sure that it has only one user at a time):
90 * we would prefer not to enlarge the shmem inode just for that.
91 */
92struct shmem_falloc {
93 pgoff_t start; /* start of range currently being fallocated */
94 pgoff_t next; /* the next page offset to be fallocated */
95 pgoff_t nr_falloced; /* how many new pages have been fallocated */
96 pgoff_t nr_unswapped; /* how often writepage refused to swap out */
97};
98
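
The shmem_falloc bookkeeping above supports fallocate() on tmpfs, both preallocation and FALLOC_FL_PUNCH_HOLE, which the shmem changes in this diff introduce. From userspace it behaves like any other fallocate-capable filesystem. A minimal example, assuming a kernel with these changes, a tmpfs mount at /dev/shm and a glibc that exposes the FALLOC_FL_* flags; the path is a placeholder:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/shm/falloc-demo", O_RDWR | O_CREAT, 0600);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Preallocate 4 MiB of tmpfs pages up front. */
	if (fallocate(fd, 0, 0, 4 << 20) != 0)
		perror("fallocate");
	/* Punch a hole over the second megabyte; shmem frees those pages. */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      1 << 20, 1 << 20) != 0)
		perror("fallocate punch");
	close(fd);
	return 0;
}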
86/* Flag allocation requirements to shmem_getpage */ 99/* Flag allocation requirements to shmem_getpage */
87enum sgp_type { 100enum sgp_type {
88 SGP_READ, /* don't exceed i_size, don't allocate page */ 101 SGP_READ, /* don't exceed i_size, don't allocate page */
89 SGP_CACHE, /* don't exceed i_size, may allocate page */ 102 SGP_CACHE, /* don't exceed i_size, may allocate page */
90 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ 103 SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */
91 SGP_WRITE, /* may exceed i_size, may allocate page */ 104 SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */
105 SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
92}; 106};
93 107
94#ifdef CONFIG_TMPFS 108#ifdef CONFIG_TMPFS
@@ -103,6 +117,9 @@ static unsigned long shmem_default_max_inodes(void)
103} 117}
104#endif 118#endif
105 119
120static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
121static int shmem_replace_page(struct page **pagep, gfp_t gfp,
122 struct shmem_inode_info *info, pgoff_t index);
106static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, 123static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
107 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); 124 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
108 125
@@ -247,46 +264,55 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
247} 264}
248 265
249/* 266/*
267 * Sometimes, before we decide whether to proceed or to fail, we must check
268 * that an entry was not already brought back from swap by a racing thread.
269 *
270 * Checking page is not enough: by the time a SwapCache page is locked, it
271 * might be reused, and again be SwapCache, using the same swap as before.
272 */
273static bool shmem_confirm_swap(struct address_space *mapping,
274 pgoff_t index, swp_entry_t swap)
275{
276 void *item;
277
278 rcu_read_lock();
279 item = radix_tree_lookup(&mapping->page_tree, index);
280 rcu_read_unlock();
281 return item == swp_to_radix_entry(swap);
282}
283
284/*
250 * Like add_to_page_cache_locked, but error if expected item has gone. 285 * Like add_to_page_cache_locked, but error if expected item has gone.
251 */ 286 */
252static int shmem_add_to_page_cache(struct page *page, 287static int shmem_add_to_page_cache(struct page *page,
253 struct address_space *mapping, 288 struct address_space *mapping,
254 pgoff_t index, gfp_t gfp, void *expected) 289 pgoff_t index, gfp_t gfp, void *expected)
255{ 290{
256 int error = 0; 291 int error;
257 292
258 VM_BUG_ON(!PageLocked(page)); 293 VM_BUG_ON(!PageLocked(page));
259 VM_BUG_ON(!PageSwapBacked(page)); 294 VM_BUG_ON(!PageSwapBacked(page));
260 295
296 page_cache_get(page);
297 page->mapping = mapping;
298 page->index = index;
299
300 spin_lock_irq(&mapping->tree_lock);
261 if (!expected) 301 if (!expected)
262 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); 302 error = radix_tree_insert(&mapping->page_tree, index, page);
303 else
304 error = shmem_radix_tree_replace(mapping, index, expected,
305 page);
263 if (!error) { 306 if (!error) {
264 page_cache_get(page); 307 mapping->nrpages++;
265 page->mapping = mapping; 308 __inc_zone_page_state(page, NR_FILE_PAGES);
266 page->index = index; 309 __inc_zone_page_state(page, NR_SHMEM);
267 310 spin_unlock_irq(&mapping->tree_lock);
268 spin_lock_irq(&mapping->tree_lock); 311 } else {
269 if (!expected) 312 page->mapping = NULL;
270 error = radix_tree_insert(&mapping->page_tree, 313 spin_unlock_irq(&mapping->tree_lock);
271 index, page); 314 page_cache_release(page);
272 else
273 error = shmem_radix_tree_replace(mapping, index,
274 expected, page);
275 if (!error) {
276 mapping->nrpages++;
277 __inc_zone_page_state(page, NR_FILE_PAGES);
278 __inc_zone_page_state(page, NR_SHMEM);
279 spin_unlock_irq(&mapping->tree_lock);
280 } else {
281 page->mapping = NULL;
282 spin_unlock_irq(&mapping->tree_lock);
283 page_cache_release(page);
284 }
285 if (!expected)
286 radix_tree_preload_end();
287 } 315 }
288 if (error)
289 mem_cgroup_uncharge_cache_page(page);
290 return error; 316 return error;
291} 317}
292 318
@@ -423,27 +449,31 @@ void shmem_unlock_mapping(struct address_space *mapping)
423 449
424/* 450/*
425 * Remove range of pages and swap entries from radix tree, and free them. 451 * Remove range of pages and swap entries from radix tree, and free them.
452 * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
426 */ 453 */
427void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) 454static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
455 bool unfalloc)
428{ 456{
429 struct address_space *mapping = inode->i_mapping; 457 struct address_space *mapping = inode->i_mapping;
430 struct shmem_inode_info *info = SHMEM_I(inode); 458 struct shmem_inode_info *info = SHMEM_I(inode);
431 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 459 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
432 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 460 pgoff_t end = (lend + 1) >> PAGE_CACHE_SHIFT;
433 pgoff_t end = (lend >> PAGE_CACHE_SHIFT); 461 unsigned int partial_start = lstart & (PAGE_CACHE_SIZE - 1);
462 unsigned int partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
434 struct pagevec pvec; 463 struct pagevec pvec;
435 pgoff_t indices[PAGEVEC_SIZE]; 464 pgoff_t indices[PAGEVEC_SIZE];
436 long nr_swaps_freed = 0; 465 long nr_swaps_freed = 0;
437 pgoff_t index; 466 pgoff_t index;
438 int i; 467 int i;
439 468
440 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1)); 469 if (lend == -1)
470 end = -1; /* unsigned, so actually very big */
441 471
442 pagevec_init(&pvec, 0); 472 pagevec_init(&pvec, 0);
443 index = start; 473 index = start;
444 while (index <= end) { 474 while (index < end) {
445 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 475 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
446 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, 476 min(end - index, (pgoff_t)PAGEVEC_SIZE),
447 pvec.pages, indices); 477 pvec.pages, indices);
448 if (!pvec.nr) 478 if (!pvec.nr)
449 break; 479 break;
@@ -452,10 +482,12 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
452 struct page *page = pvec.pages[i]; 482 struct page *page = pvec.pages[i];
453 483
454 index = indices[i]; 484 index = indices[i];
455 if (index > end) 485 if (index >= end)
456 break; 486 break;
457 487
458 if (radix_tree_exceptional_entry(page)) { 488 if (radix_tree_exceptional_entry(page)) {
489 if (unfalloc)
490 continue;
459 nr_swaps_freed += !shmem_free_swap(mapping, 491 nr_swaps_freed += !shmem_free_swap(mapping,
460 index, page); 492 index, page);
461 continue; 493 continue;
@@ -463,9 +495,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
463 495
464 if (!trylock_page(page)) 496 if (!trylock_page(page))
465 continue; 497 continue;
466 if (page->mapping == mapping) { 498 if (!unfalloc || !PageUptodate(page)) {
467 VM_BUG_ON(PageWriteback(page)); 499 if (page->mapping == mapping) {
468 truncate_inode_page(mapping, page); 500 VM_BUG_ON(PageWriteback(page));
501 truncate_inode_page(mapping, page);
502 }
469 } 503 }
470 unlock_page(page); 504 unlock_page(page);
471 } 505 }
@@ -476,30 +510,47 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
476 index++; 510 index++;
477 } 511 }
478 512
479 if (partial) { 513 if (partial_start) {
480 struct page *page = NULL; 514 struct page *page = NULL;
481 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); 515 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
482 if (page) { 516 if (page) {
483 zero_user_segment(page, partial, PAGE_CACHE_SIZE); 517 unsigned int top = PAGE_CACHE_SIZE;
518 if (start > end) {
519 top = partial_end;
520 partial_end = 0;
521 }
522 zero_user_segment(page, partial_start, top);
484 set_page_dirty(page); 523 set_page_dirty(page);
485 unlock_page(page); 524 unlock_page(page);
486 page_cache_release(page); 525 page_cache_release(page);
487 } 526 }
488 } 527 }
528 if (partial_end) {
529 struct page *page = NULL;
530 shmem_getpage(inode, end, &page, SGP_READ, NULL);
531 if (page) {
532 zero_user_segment(page, 0, partial_end);
533 set_page_dirty(page);
534 unlock_page(page);
535 page_cache_release(page);
536 }
537 }
538 if (start >= end)
539 return;
489 540
490 index = start; 541 index = start;
491 for ( ; ; ) { 542 for ( ; ; ) {
492 cond_resched(); 543 cond_resched();
493 pvec.nr = shmem_find_get_pages_and_swap(mapping, index, 544 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
494 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, 545 min(end - index, (pgoff_t)PAGEVEC_SIZE),
495 pvec.pages, indices); 546 pvec.pages, indices);
496 if (!pvec.nr) { 547 if (!pvec.nr) {
497 if (index == start) 548 if (index == start || unfalloc)
498 break; 549 break;
499 index = start; 550 index = start;
500 continue; 551 continue;
501 } 552 }
502 if (index == start && indices[0] > end) { 553 if ((index == start || unfalloc) && indices[0] >= end) {
503 shmem_deswap_pagevec(&pvec); 554 shmem_deswap_pagevec(&pvec);
504 pagevec_release(&pvec); 555 pagevec_release(&pvec);
505 break; 556 break;
@@ -509,19 +560,23 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
509 struct page *page = pvec.pages[i]; 560 struct page *page = pvec.pages[i];
510 561
511 index = indices[i]; 562 index = indices[i];
512 if (index > end) 563 if (index >= end)
513 break; 564 break;
514 565
515 if (radix_tree_exceptional_entry(page)) { 566 if (radix_tree_exceptional_entry(page)) {
567 if (unfalloc)
568 continue;
516 nr_swaps_freed += !shmem_free_swap(mapping, 569 nr_swaps_freed += !shmem_free_swap(mapping,
517 index, page); 570 index, page);
518 continue; 571 continue;
519 } 572 }
520 573
521 lock_page(page); 574 lock_page(page);
522 if (page->mapping == mapping) { 575 if (!unfalloc || !PageUptodate(page)) {
523 VM_BUG_ON(PageWriteback(page)); 576 if (page->mapping == mapping) {
524 truncate_inode_page(mapping, page); 577 VM_BUG_ON(PageWriteback(page));
578 truncate_inode_page(mapping, page);
579 }
525 } 580 }
526 unlock_page(page); 581 unlock_page(page);
527 } 582 }
@@ -535,7 +590,11 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
535 info->swapped -= nr_swaps_freed; 590 info->swapped -= nr_swaps_freed;
536 shmem_recalc_inode(inode); 591 shmem_recalc_inode(inode);
537 spin_unlock(&info->lock); 592 spin_unlock(&info->lock);
593}
538 594
595void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
596{
597 shmem_undo_range(inode, lstart, lend, false);
539 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 598 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
540} 599}
541EXPORT_SYMBOL_GPL(shmem_truncate_range); 600EXPORT_SYMBOL_GPL(shmem_truncate_range);
@@ -597,19 +656,20 @@ static void shmem_evict_inode(struct inode *inode)
597 } 656 }
598 BUG_ON(inode->i_blocks); 657 BUG_ON(inode->i_blocks);
599 shmem_free_inode(inode->i_sb); 658 shmem_free_inode(inode->i_sb);
600 end_writeback(inode); 659 clear_inode(inode);
601} 660}
602 661
603/* 662/*
604 * If swap found in inode, free it and move page from swapcache to filecache. 663 * If swap found in inode, free it and move page from swapcache to filecache.
605 */ 664 */
606static int shmem_unuse_inode(struct shmem_inode_info *info, 665static int shmem_unuse_inode(struct shmem_inode_info *info,
607 swp_entry_t swap, struct page *page) 666 swp_entry_t swap, struct page **pagep)
608{ 667{
609 struct address_space *mapping = info->vfs_inode.i_mapping; 668 struct address_space *mapping = info->vfs_inode.i_mapping;
610 void *radswap; 669 void *radswap;
611 pgoff_t index; 670 pgoff_t index;
612 int error; 671 gfp_t gfp;
672 int error = 0;
613 673
614 radswap = swp_to_radix_entry(swap); 674 radswap = swp_to_radix_entry(swap);
615 index = radix_tree_locate_item(&mapping->page_tree, radswap); 675 index = radix_tree_locate_item(&mapping->page_tree, radswap);
@@ -625,22 +685,48 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
625 if (shmem_swaplist.next != &info->swaplist) 685 if (shmem_swaplist.next != &info->swaplist)
626 list_move_tail(&shmem_swaplist, &info->swaplist); 686 list_move_tail(&shmem_swaplist, &info->swaplist);
627 687
688 gfp = mapping_gfp_mask(mapping);
689 if (shmem_should_replace_page(*pagep, gfp)) {
690 mutex_unlock(&shmem_swaplist_mutex);
691 error = shmem_replace_page(pagep, gfp, info, index);
692 mutex_lock(&shmem_swaplist_mutex);
693 /*
694 * We needed to drop mutex to make that restrictive page
695 * allocation, but the inode might have been freed while we
696 * dropped it: although a racing shmem_evict_inode() cannot
697 * complete without emptying the radix_tree, our page lock
698 * on this swapcache page is not enough to prevent that -
699 * free_swap_and_cache() of our swap entry will only
700 * trylock_page(), removing swap from radix_tree whatever.
701 *
702 * We must not proceed to shmem_add_to_page_cache() if the
703 * inode has been freed, but of course we cannot rely on
704 * inode or mapping or info to check that. However, we can
705 * safely check if our swap entry is still in use (and here
706 * it can't have got reused for another page): if it's still
707 * in use, then the inode cannot have been freed yet, and we
708 * can safely proceed (if it's no longer in use, that tells
709 * nothing about the inode, but we don't need to unuse swap).
710 */
711 if (!page_swapcount(*pagep))
712 error = -ENOENT;
713 }
714
628 /* 715 /*
629 * We rely on shmem_swaplist_mutex, not only to protect the swaplist, 716 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
630 * but also to hold up shmem_evict_inode(): so inode cannot be freed 717 * but also to hold up shmem_evict_inode(): so inode cannot be freed
631 * beneath us (pagelock doesn't help until the page is in pagecache). 718 * beneath us (pagelock doesn't help until the page is in pagecache).
632 */ 719 */
633 error = shmem_add_to_page_cache(page, mapping, index, 720 if (!error)
721 error = shmem_add_to_page_cache(*pagep, mapping, index,
634 GFP_NOWAIT, radswap); 722 GFP_NOWAIT, radswap);
635 /* which does mem_cgroup_uncharge_cache_page on error */
636
637 if (error != -ENOMEM) { 723 if (error != -ENOMEM) {
638 /* 724 /*
639 * Truncation and eviction use free_swap_and_cache(), which 725 * Truncation and eviction use free_swap_and_cache(), which
640 * only does trylock page: if we raced, best clean up here. 726 * only does trylock page: if we raced, best clean up here.
641 */ 727 */
642 delete_from_swap_cache(page); 728 delete_from_swap_cache(*pagep);
643 set_page_dirty(page); 729 set_page_dirty(*pagep);
644 if (!error) { 730 if (!error) {
645 spin_lock(&info->lock); 731 spin_lock(&info->lock);
646 info->swapped--; 732 info->swapped--;
@@ -660,7 +746,14 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
660 struct list_head *this, *next; 746 struct list_head *this, *next;
661 struct shmem_inode_info *info; 747 struct shmem_inode_info *info;
662 int found = 0; 748 int found = 0;
663 int error; 749 int error = 0;
750
751 /*
752 * There's a faint possibility that swap page was replaced before
753 * caller locked it: caller will come back later with the right page.
754 */
755 if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
756 goto out;
664 757
665 /* 758 /*
666 * Charge page using GFP_KERNEL while we can wait, before taking 759 * Charge page using GFP_KERNEL while we can wait, before taking
@@ -676,7 +769,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
676 list_for_each_safe(this, next, &shmem_swaplist) { 769 list_for_each_safe(this, next, &shmem_swaplist) {
677 info = list_entry(this, struct shmem_inode_info, swaplist); 770 info = list_entry(this, struct shmem_inode_info, swaplist);
678 if (info->swapped) 771 if (info->swapped)
679 found = shmem_unuse_inode(info, swap, page); 772 found = shmem_unuse_inode(info, swap, &page);
680 else 773 else
681 list_del_init(&info->swaplist); 774 list_del_init(&info->swaplist);
682 cond_resched(); 775 cond_resched();
@@ -685,8 +778,6 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
685 } 778 }
686 mutex_unlock(&shmem_swaplist_mutex); 779 mutex_unlock(&shmem_swaplist_mutex);
687 780
688 if (!found)
689 mem_cgroup_uncharge_cache_page(page);
690 if (found < 0) 781 if (found < 0)
691 error = found; 782 error = found;
692out: 783out:
@@ -727,6 +818,38 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
727 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ 818 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
728 goto redirty; 819 goto redirty;
729 } 820 }
821
822 /*
823 * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
824 * value into swapfile.c, the only way we can correctly account for a
825 * fallocated page arriving here is now to initialize it and write it.
826 *
827 * That's okay for a page already fallocated earlier, but if we have
828 * not yet completed the fallocation, then (a) we want to keep track
829 * of this page in case we have to undo it, and (b) it may not be a
830 * good idea to continue anyway, once we're pushing into swap. So
831 * reactivate the page, and let shmem_fallocate() quit when too many.
832 */
833 if (!PageUptodate(page)) {
834 if (inode->i_private) {
835 struct shmem_falloc *shmem_falloc;
836 spin_lock(&inode->i_lock);
837 shmem_falloc = inode->i_private;
838 if (shmem_falloc &&
839 index >= shmem_falloc->start &&
840 index < shmem_falloc->next)
841 shmem_falloc->nr_unswapped++;
842 else
843 shmem_falloc = NULL;
844 spin_unlock(&inode->i_lock);
845 if (shmem_falloc)
846 goto redirty;
847 }
848 clear_highpage(page);
849 flush_dcache_page(page);
850 SetPageUptodate(page);
851 }
852
730 swap = get_swap_page(); 853 swap = get_swap_page();
731 if (!swap.val) 854 if (!swap.val)
732 goto redirty; 855 goto redirty;
@@ -806,7 +929,8 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
806 929
807 /* Create a pseudo vma that just contains the policy */ 930 /* Create a pseudo vma that just contains the policy */
808 pvma.vm_start = 0; 931 pvma.vm_start = 0;
809 pvma.vm_pgoff = index; 932 /* Bias interleave by inode number to distribute better across nodes */
933 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
810 pvma.vm_ops = NULL; 934 pvma.vm_ops = NULL;
811 pvma.vm_policy = spol; 935 pvma.vm_policy = spol;
812 return swapin_readahead(swap, gfp, &pvma, 0); 936 return swapin_readahead(swap, gfp, &pvma, 0);
@@ -819,7 +943,8 @@ static struct page *shmem_alloc_page(gfp_t gfp,
819 943
820 /* Create a pseudo vma that just contains the policy */ 944 /* Create a pseudo vma that just contains the policy */
821 pvma.vm_start = 0; 945 pvma.vm_start = 0;
822 pvma.vm_pgoff = index; 946 /* Bias interleave by inode number to distribute better across nodes */
947 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
823 pvma.vm_ops = NULL; 948 pvma.vm_ops = NULL;
824 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 949 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
825 950
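The two hunks above bias NUMA interleave by the inode number so that page 0 of every small tmpfs file does not always land on the same node. A toy userspace model of that idea, assuming a simplified round-robin interleave of (pgoff + bias) modulo the node count (the real mempolicy code is more involved; names here are invented for the demo):

#include <stdio.h>

/* Toy model: interleave picks node = (pgoff + bias) % nr_nodes. */
static unsigned int pick_node(unsigned long pgoff, unsigned long bias,
                              unsigned int nr_nodes)
{
        return (pgoff + bias) % nr_nodes;
}

int main(void)
{
        unsigned int nr_nodes = 4;
        unsigned long ino;

        /* Without the bias, page 0 of every file maps to node 0. */
        for (ino = 100; ino < 104; ino++)
                printf("ino %lu: unbiased node %u, biased node %u\n",
                       ino, pick_node(0, 0, nr_nodes),
                       pick_node(0, ino, nr_nodes));
        return 0;
}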
@@ -856,6 +981,89 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
856#endif 981#endif
857 982
858/* 983/*
984 * When a page is moved from swapcache to shmem filecache (either by the
985 * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
986 * shmem_unuse_inode()), it may have been read in earlier from swap, in
987 * ignorance of the mapping it belongs to. If that mapping has special
988 * constraints (like the gma500 GEM driver, which requires RAM below 4GB),
989 * we may need to copy to a suitable page before moving to filecache.
990 *
991 * In a future release, this may well be extended to respect cpuset and
992 * NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
993 * but for now it is a simple matter of zone.
994 */
995static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
996{
997 return page_zonenum(page) > gfp_zone(gfp);
998}
999
1000static int shmem_replace_page(struct page **pagep, gfp_t gfp,
1001 struct shmem_inode_info *info, pgoff_t index)
1002{
1003 struct page *oldpage, *newpage;
1004 struct address_space *swap_mapping;
1005 pgoff_t swap_index;
1006 int error;
1007
1008 oldpage = *pagep;
1009 swap_index = page_private(oldpage);
1010 swap_mapping = page_mapping(oldpage);
1011
1012 /*
1013 * We have arrived here because our zones are constrained, so don't
1014 * limit chance of success by further cpuset and node constraints.
1015 */
1016 gfp &= ~GFP_CONSTRAINT_MASK;
1017 newpage = shmem_alloc_page(gfp, info, index);
1018 if (!newpage)
1019 return -ENOMEM;
1020
1021 page_cache_get(newpage);
1022 copy_highpage(newpage, oldpage);
1023 flush_dcache_page(newpage);
1024
1025 __set_page_locked(newpage);
1026 SetPageUptodate(newpage);
1027 SetPageSwapBacked(newpage);
1028 set_page_private(newpage, swap_index);
1029 SetPageSwapCache(newpage);
1030
1031 /*
1032 * Our caller will very soon move newpage out of swapcache, but it's
1033 * a nice clean interface for us to replace oldpage by newpage there.
1034 */
1035 spin_lock_irq(&swap_mapping->tree_lock);
1036 error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
1037 newpage);
1038 if (!error) {
1039 __inc_zone_page_state(newpage, NR_FILE_PAGES);
1040 __dec_zone_page_state(oldpage, NR_FILE_PAGES);
1041 }
1042 spin_unlock_irq(&swap_mapping->tree_lock);
1043
1044 if (unlikely(error)) {
1045 /*
1046 * Is this possible? I think not, now that our callers check
1047 * both PageSwapCache and page_private after getting page lock;
1048 * but be defensive. Reverse old to newpage for clear and free.
1049 */
1050 oldpage = newpage;
1051 } else {
1052 mem_cgroup_replace_page_cache(oldpage, newpage);
1053 lru_cache_add_anon(newpage);
1054 *pagep = newpage;
1055 }
1056
1057 ClearPageSwapCache(oldpage);
1058 set_page_private(oldpage, 0);
1059
1060 unlock_page(oldpage);
1061 page_cache_release(oldpage);
1062 page_cache_release(oldpage);
1063 return error;
1064}
1065
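shmem_should_replace_page() reduces to an ordering comparison between zone indices: if the page sits in a higher zone than the allocation mask permits (say a HIGHMEM page for a mapping restricted to RAM below 4GB), a copy into a suitable page is needed. A toy userspace model of that comparison, with made-up zone indices standing in for page_zonenum()/gfp_zone():

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical zone ordering, lowest to highest, echoing the kernel's. */
enum toy_zone { TOY_DMA, TOY_DMA32, TOY_NORMAL, TOY_HIGHMEM };

/* Replacement is needed when the page's zone exceeds the highest allowed. */
static bool should_replace(enum toy_zone page_zone, enum toy_zone highest_allowed)
{
        return page_zone > highest_allowed;
}

int main(void)
{
        printf("HIGHMEM page, DMA32 limit -> replace: %d\n",
               should_replace(TOY_HIGHMEM, TOY_DMA32));
        printf("NORMAL page, NORMAL limit -> replace: %d\n",
               should_replace(TOY_NORMAL, TOY_NORMAL));
        return 0;
}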
1066/*
859 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate 1067 * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
860 * 1068 *
861 * If we allocate a new one we do not mark it dirty. That's up to the 1069 * If we allocate a new one we do not mark it dirty. That's up to the
@@ -872,6 +1080,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
872 swp_entry_t swap; 1080 swp_entry_t swap;
873 int error; 1081 int error;
874 int once = 0; 1082 int once = 0;
1083 int alloced = 0;
875 1084
876 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT)) 1085 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
877 return -EFBIG; 1086 return -EFBIG;
@@ -883,19 +1092,21 @@ repeat:
883 page = NULL; 1092 page = NULL;
884 } 1093 }
885 1094
886 if (sgp != SGP_WRITE && 1095 if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
887 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { 1096 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
888 error = -EINVAL; 1097 error = -EINVAL;
889 goto failed; 1098 goto failed;
890 } 1099 }
891 1100
1101 /* fallocated page? */
1102 if (page && !PageUptodate(page)) {
1103 if (sgp != SGP_READ)
1104 goto clear;
1105 unlock_page(page);
1106 page_cache_release(page);
1107 page = NULL;
1108 }
892 if (page || (sgp == SGP_READ && !swap.val)) { 1109 if (page || (sgp == SGP_READ && !swap.val)) {
893 /*
894 * Once we can get the page lock, it must be uptodate:
895 * if there were an error in reading back from swap,
896 * the page would not be inserted into the filecache.
897 */
898 BUG_ON(page && !PageUptodate(page));
899 *pagep = page; 1110 *pagep = page;
900 return 0; 1111 return 0;
901 } 1112 }
@@ -923,26 +1134,31 @@ repeat:
923 1134
924 /* We have to do this with page locked to prevent races */ 1135 /* We have to do this with page locked to prevent races */
925 lock_page(page); 1136 lock_page(page);
1137 if (!PageSwapCache(page) || page_private(page) != swap.val ||
1138 !shmem_confirm_swap(mapping, index, swap)) {
1139 error = -EEXIST; /* try again */
1140 goto unlock;
1141 }
926 if (!PageUptodate(page)) { 1142 if (!PageUptodate(page)) {
927 error = -EIO; 1143 error = -EIO;
928 goto failed; 1144 goto failed;
929 } 1145 }
930 wait_on_page_writeback(page); 1146 wait_on_page_writeback(page);
931 1147
932 /* Someone may have already done it for us */ 1148 if (shmem_should_replace_page(page, gfp)) {
933 if (page->mapping) { 1149 error = shmem_replace_page(&page, gfp, info, index);
934 if (page->mapping == mapping && 1150 if (error)
935 page->index == index) 1151 goto failed;
936 goto done;
937 error = -EEXIST;
938 goto failed;
939 } 1152 }
940 1153
941 error = mem_cgroup_cache_charge(page, current->mm, 1154 error = mem_cgroup_cache_charge(page, current->mm,
942 gfp & GFP_RECLAIM_MASK); 1155 gfp & GFP_RECLAIM_MASK);
943 if (!error) 1156 if (!error) {
944 error = shmem_add_to_page_cache(page, mapping, index, 1157 error = shmem_add_to_page_cache(page, mapping, index,
945 gfp, swp_to_radix_entry(swap)); 1158 gfp, swp_to_radix_entry(swap));
1159 /* We already confirmed swap, and make no allocation */
1160 VM_BUG_ON(error);
1161 }
946 if (error) 1162 if (error)
947 goto failed; 1163 goto failed;
948 1164
@@ -979,11 +1195,18 @@ repeat:
979 __set_page_locked(page); 1195 __set_page_locked(page);
980 error = mem_cgroup_cache_charge(page, current->mm, 1196 error = mem_cgroup_cache_charge(page, current->mm,
981 gfp & GFP_RECLAIM_MASK); 1197 gfp & GFP_RECLAIM_MASK);
982 if (!error)
983 error = shmem_add_to_page_cache(page, mapping, index,
984 gfp, NULL);
985 if (error) 1198 if (error)
986 goto decused; 1199 goto decused;
1200 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
1201 if (!error) {
1202 error = shmem_add_to_page_cache(page, mapping, index,
1203 gfp, NULL);
1204 radix_tree_preload_end();
1205 }
1206 if (error) {
1207 mem_cgroup_uncharge_cache_page(page);
1208 goto decused;
1209 }
987 lru_cache_add_anon(page); 1210 lru_cache_add_anon(page);
988 1211
989 spin_lock(&info->lock); 1212 spin_lock(&info->lock);
@@ -991,19 +1214,36 @@ repeat:
991 inode->i_blocks += BLOCKS_PER_PAGE; 1214 inode->i_blocks += BLOCKS_PER_PAGE;
992 shmem_recalc_inode(inode); 1215 shmem_recalc_inode(inode);
993 spin_unlock(&info->lock); 1216 spin_unlock(&info->lock);
1217 alloced = true;
994 1218
995 clear_highpage(page); 1219 /*
996 flush_dcache_page(page); 1220 * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
997 SetPageUptodate(page); 1221 */
1222 if (sgp == SGP_FALLOC)
1223 sgp = SGP_WRITE;
1224clear:
1225 /*
1226 * Let SGP_WRITE caller clear ends if write does not fill page;
1227 * but SGP_FALLOC on a page fallocated earlier must initialize
1228 * it now, lest undo on failure cancel our earlier guarantee.
1229 */
1230 if (sgp != SGP_WRITE) {
1231 clear_highpage(page);
1232 flush_dcache_page(page);
1233 SetPageUptodate(page);
1234 }
998 if (sgp == SGP_DIRTY) 1235 if (sgp == SGP_DIRTY)
999 set_page_dirty(page); 1236 set_page_dirty(page);
1000 } 1237 }
1001done: 1238
1002 /* Perhaps the file has been truncated since we checked */ 1239 /* Perhaps the file has been truncated since we checked */
1003 if (sgp != SGP_WRITE && 1240 if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
1004 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) { 1241 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
1005 error = -EINVAL; 1242 error = -EINVAL;
1006 goto trunc; 1243 if (alloced)
1244 goto trunc;
1245 else
1246 goto failed;
1007 } 1247 }
1008 *pagep = page; 1248 *pagep = page;
1009 return 0; 1249 return 0;
@@ -1012,6 +1252,7 @@ done:
1012 * Error recovery. 1252 * Error recovery.
1013 */ 1253 */
1014trunc: 1254trunc:
1255 info = SHMEM_I(inode);
1015 ClearPageDirty(page); 1256 ClearPageDirty(page);
1016 delete_from_page_cache(page); 1257 delete_from_page_cache(page);
1017 spin_lock(&info->lock); 1258 spin_lock(&info->lock);
@@ -1019,19 +1260,16 @@ trunc:
1019 inode->i_blocks -= BLOCKS_PER_PAGE; 1260 inode->i_blocks -= BLOCKS_PER_PAGE;
1020 spin_unlock(&info->lock); 1261 spin_unlock(&info->lock);
1021decused: 1262decused:
1263 sbinfo = SHMEM_SB(inode->i_sb);
1022 if (sbinfo->max_blocks) 1264 if (sbinfo->max_blocks)
1023 percpu_counter_add(&sbinfo->used_blocks, -1); 1265 percpu_counter_add(&sbinfo->used_blocks, -1);
1024unacct: 1266unacct:
1025 shmem_unacct_blocks(info->flags, 1); 1267 shmem_unacct_blocks(info->flags, 1);
1026failed: 1268failed:
1027 if (swap.val && error != -EINVAL) { 1269 if (swap.val && error != -EINVAL &&
1028 struct page *test = find_get_page(mapping, index); 1270 !shmem_confirm_swap(mapping, index, swap))
1029 if (test && !radix_tree_exceptional_entry(test)) 1271 error = -EEXIST;
1030 page_cache_release(test); 1272unlock:
1031 /* Have another try if the entry has changed */
1032 if (test != swp_to_radix_entry(swap))
1033 error = -EEXIST;
1034 }
1035 if (page) { 1273 if (page) {
1036 unlock_page(page); 1274 unlock_page(page);
1037 page_cache_release(page); 1275 page_cache_release(page);
@@ -1043,7 +1281,7 @@ failed:
1043 spin_unlock(&info->lock); 1281 spin_unlock(&info->lock);
1044 goto repeat; 1282 goto repeat;
1045 } 1283 }
1046 if (error == -EEXIST) 1284 if (error == -EEXIST) /* from above or from radix_tree_insert */
1047 goto repeat; 1285 goto repeat;
1048 return error; 1286 return error;
1049} 1287}
@@ -1204,6 +1442,14 @@ shmem_write_end(struct file *file, struct address_space *mapping,
1204 if (pos + copied > inode->i_size) 1442 if (pos + copied > inode->i_size)
1205 i_size_write(inode, pos + copied); 1443 i_size_write(inode, pos + copied);
1206 1444
1445 if (!PageUptodate(page)) {
1446 if (copied < PAGE_CACHE_SIZE) {
1447 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1448 zero_user_segments(page, 0, from,
1449 from + copied, PAGE_CACHE_SIZE);
1450 }
1451 SetPageUptodate(page);
1452 }
1207 set_page_dirty(page); 1453 set_page_dirty(page);
1208 unlock_page(page); 1454 unlock_page(page);
1209 page_cache_release(page); 1455 page_cache_release(page);
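The shmem_write_end() change zeroes whatever the write did not cover before marking a freshly fallocated page uptodate, so stale contents are never exposed. A standalone sketch of the same head-and-tail zeroing that zero_user_segments() performs, done here on an ordinary buffer with the page size hard-coded for illustration:

#include <stdio.h>
#include <string.h>

#define TOY_PAGE_SIZE 4096

/* Zero [0, from) and [from + copied, TOY_PAGE_SIZE) of a page-sized buffer. */
static void zero_outside_copy(unsigned char *page, size_t from, size_t copied)
{
        memset(page, 0, from);
        memset(page + from + copied, 0, TOY_PAGE_SIZE - (from + copied));
}

int main(void)
{
        static unsigned char page[TOY_PAGE_SIZE];

        memset(page, 0xAA, sizeof(page));   /* pretend: uninitialized garbage */
        zero_outside_copy(page, 100, 50);   /* a 50-byte write at offset 100 */
        printf("page[99]=%d page[100]=%d page[149]=%d page[150]=%d\n",
               page[99], page[100], page[149], page[150]);
        return 0;
}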
@@ -1365,6 +1611,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1365 struct splice_pipe_desc spd = { 1611 struct splice_pipe_desc spd = {
1366 .pages = pages, 1612 .pages = pages,
1367 .partial = partial, 1613 .partial = partial,
1614 .nr_pages_max = PIPE_DEF_BUFFERS,
1368 .flags = flags, 1615 .flags = flags,
1369 .ops = &page_cache_pipe_buf_ops, 1616 .ops = &page_cache_pipe_buf_ops,
1370 .spd_release = spd_release_page, 1617 .spd_release = spd_release_page,
@@ -1453,7 +1700,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1453 if (spd.nr_pages) 1700 if (spd.nr_pages)
1454 error = splice_to_pipe(pipe, &spd); 1701 error = splice_to_pipe(pipe, &spd);
1455 1702
1456 splice_shrink_spd(pipe, &spd); 1703 splice_shrink_spd(&spd);
1457 1704
1458 if (error > 0) { 1705 if (error > 0) {
1459 *ppos += error; 1706 *ppos += error;
@@ -1462,6 +1709,107 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
1462 return error; 1709 return error;
1463} 1710}
1464 1711
1712static long shmem_fallocate(struct file *file, int mode, loff_t offset,
1713 loff_t len)
1714{
1715 struct inode *inode = file->f_path.dentry->d_inode;
1716 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
1717 struct shmem_falloc shmem_falloc;
1718 pgoff_t start, index, end;
1719 int error;
1720
1721 mutex_lock(&inode->i_mutex);
1722
1723 if (mode & FALLOC_FL_PUNCH_HOLE) {
1724 struct address_space *mapping = file->f_mapping;
1725 loff_t unmap_start = round_up(offset, PAGE_SIZE);
1726 loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
1727
1728 if ((u64)unmap_end > (u64)unmap_start)
1729 unmap_mapping_range(mapping, unmap_start,
1730 1 + unmap_end - unmap_start, 0);
1731 shmem_truncate_range(inode, offset, offset + len - 1);
1732 /* No need to unmap again: hole-punching leaves COWed pages */
1733 error = 0;
1734 goto out;
1735 }
1736
1737 /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
1738 error = inode_newsize_ok(inode, offset + len);
1739 if (error)
1740 goto out;
1741
1742 start = offset >> PAGE_CACHE_SHIFT;
1743 end = (offset + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1744 /* Try to avoid a swapstorm if len is impossible to satisfy */
1745 if (sbinfo->max_blocks && end - start > sbinfo->max_blocks) {
1746 error = -ENOSPC;
1747 goto out;
1748 }
1749
1750 shmem_falloc.start = start;
1751 shmem_falloc.next = start;
1752 shmem_falloc.nr_falloced = 0;
1753 shmem_falloc.nr_unswapped = 0;
1754 spin_lock(&inode->i_lock);
1755 inode->i_private = &shmem_falloc;
1756 spin_unlock(&inode->i_lock);
1757
1758 for (index = start; index < end; index++) {
1759 struct page *page;
1760
1761 /*
1762 * Good, the fallocate(2) manpage permits EINTR: we may have
1763 * been interrupted because we are using up too much memory.
1764 */
1765 if (signal_pending(current))
1766 error = -EINTR;
1767 else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
1768 error = -ENOMEM;
1769 else
1770 error = shmem_getpage(inode, index, &page, SGP_FALLOC,
1771 NULL);
1772 if (error) {
1773 /* Remove the !PageUptodate pages we added */
1774 shmem_undo_range(inode,
1775 (loff_t)start << PAGE_CACHE_SHIFT,
1776 (loff_t)index << PAGE_CACHE_SHIFT, true);
1777 goto undone;
1778 }
1779
1780 /*
1781 * Inform shmem_writepage() how far we have reached.
1782 * No need for lock or barrier: we have the page lock.
1783 */
1784 shmem_falloc.next++;
1785 if (!PageUptodate(page))
1786 shmem_falloc.nr_falloced++;
1787
1788 /*
1789 * If !PageUptodate, leave it that way so that freeable pages
1790 * can be recognized if we need to rollback on error later.
1791 * But set_page_dirty so that memory pressure will swap rather
1792 * than free the pages we are allocating (and SGP_CACHE pages
1793 * might still be clean: we now need to mark those dirty too).
1794 */
1795 set_page_dirty(page);
1796 unlock_page(page);
1797 page_cache_release(page);
1798 cond_resched();
1799 }
1800
1801 if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
1802 i_size_write(inode, offset + len);
1803 inode->i_ctime = CURRENT_TIME;
1804undone:
1805 spin_lock(&inode->i_lock);
1806 inode->i_private = NULL;
1807 spin_unlock(&inode->i_lock);
1808out:
1809 mutex_unlock(&inode->i_mutex);
1810 return error;
1811}
1812
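With .fallocate wired up in the file_operations later in this patch, tmpfs files accept both preallocation and hole punching. A short userspace example, assuming a tmpfs mount at /dev/shm and a kernel that supports FALLOC_FL_PUNCH_HOLE (which must be combined with FALLOC_FL_KEEP_SIZE):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/dev/shm/falloc-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* Preallocate 1 MiB without changing i_size. */
        if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20))
                perror("fallocate(KEEP_SIZE)");
        /* Punch a hole over the second 64 KiB of the preallocated range. */
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      64 << 10, 64 << 10))
                perror("fallocate(PUNCH_HOLE)");
        close(fd);
        unlink("/dev/shm/falloc-demo");
        return 0;
}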
1465static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1813static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1466{ 1814{
1467 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); 1815 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
@@ -1531,7 +1879,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1531} 1879}
1532 1880
1533static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, 1881static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
1534 struct nameidata *nd) 1882 bool excl)
1535{ 1883{
1536 return shmem_mknod(dir, dentry, mode | S_IFREG, 0); 1884 return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1537} 1885}
@@ -1665,6 +2013,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1665 kaddr = kmap_atomic(page); 2013 kaddr = kmap_atomic(page);
1666 memcpy(kaddr, symname, len); 2014 memcpy(kaddr, symname, len);
1667 kunmap_atomic(kaddr); 2015 kunmap_atomic(kaddr);
2016 SetPageUptodate(page);
1668 set_page_dirty(page); 2017 set_page_dirty(page);
1669 unlock_page(page); 2018 unlock_page(page);
1670 page_cache_release(page); 2019 page_cache_release(page);
@@ -2033,11 +2382,9 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
2033 return dentry; 2382 return dentry;
2034} 2383}
2035 2384
2036static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, 2385static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
2037 int connectable) 2386 struct inode *parent)
2038{ 2387{
2039 struct inode *inode = dentry->d_inode;
2040
2041 if (*len < 3) { 2388 if (*len < 3) {
2042 *len = 3; 2389 *len = 3;
2043 return 255; 2390 return 255;
@@ -2075,6 +2422,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2075 bool remount) 2422 bool remount)
2076{ 2423{
2077 char *this_char, *value, *rest; 2424 char *this_char, *value, *rest;
2425 uid_t uid;
2426 gid_t gid;
2078 2427
2079 while (options != NULL) { 2428 while (options != NULL) {
2080 this_char = options; 2429 this_char = options;
@@ -2134,15 +2483,21 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2134 } else if (!strcmp(this_char,"uid")) { 2483 } else if (!strcmp(this_char,"uid")) {
2135 if (remount) 2484 if (remount)
2136 continue; 2485 continue;
2137 sbinfo->uid = simple_strtoul(value, &rest, 0); 2486 uid = simple_strtoul(value, &rest, 0);
2138 if (*rest) 2487 if (*rest)
2139 goto bad_val; 2488 goto bad_val;
2489 sbinfo->uid = make_kuid(current_user_ns(), uid);
2490 if (!uid_valid(sbinfo->uid))
2491 goto bad_val;
2140 } else if (!strcmp(this_char,"gid")) { 2492 } else if (!strcmp(this_char,"gid")) {
2141 if (remount) 2493 if (remount)
2142 continue; 2494 continue;
2143 sbinfo->gid = simple_strtoul(value, &rest, 0); 2495 gid = simple_strtoul(value, &rest, 0);
2144 if (*rest) 2496 if (*rest)
2145 goto bad_val; 2497 goto bad_val;
2498 sbinfo->gid = make_kgid(current_user_ns(), gid);
2499 if (!gid_valid(sbinfo->gid))
2500 goto bad_val;
2146 } else if (!strcmp(this_char,"mpol")) { 2501 } else if (!strcmp(this_char,"mpol")) {
2147 if (mpol_parse_str(value, &sbinfo->mpol, 1)) 2502 if (mpol_parse_str(value, &sbinfo->mpol, 1))
2148 goto bad_val; 2503 goto bad_val;
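After the switch to kuid_t/kgid_t, the uid= and gid= mount options are validated through make_kuid()/make_kgid() in the mounter's user namespace. A minimal example of passing them via mount(2), assuming root privileges (CAP_SYS_ADMIN) and an existing /mnt/tmp mount point:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* Option string handed to shmem_parse_options() by the mount. */
        const char *opts = "size=64m,mode=700,uid=1000,gid=1000";

        if (mount("tmpfs", "/mnt/tmp", "tmpfs", 0, opts)) {
                perror("mount");
                return 1;
        }
        printf("tmpfs mounted on /mnt/tmp with %s\n", opts);
        return 0;
}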
@@ -2210,10 +2565,12 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
2210 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); 2565 seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
2211 if (sbinfo->mode != (S_IRWXUGO | S_ISVTX)) 2566 if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
2212 seq_printf(seq, ",mode=%03ho", sbinfo->mode); 2567 seq_printf(seq, ",mode=%03ho", sbinfo->mode);
2213 if (sbinfo->uid != 0) 2568 if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
2214 seq_printf(seq, ",uid=%u", sbinfo->uid); 2569 seq_printf(seq, ",uid=%u",
2215 if (sbinfo->gid != 0) 2570 from_kuid_munged(&init_user_ns, sbinfo->uid));
2216 seq_printf(seq, ",gid=%u", sbinfo->gid); 2571 if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
2572 seq_printf(seq, ",gid=%u",
2573 from_kgid_munged(&init_user_ns, sbinfo->gid));
2217 shmem_show_mpol(seq, sbinfo->mpol); 2574 shmem_show_mpol(seq, sbinfo->mpol);
2218 return 0; 2575 return 0;
2219} 2576}
@@ -2260,6 +2617,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2260 } 2617 }
2261 } 2618 }
2262 sb->s_export_op = &shmem_export_ops; 2619 sb->s_export_op = &shmem_export_ops;
2620 sb->s_flags |= MS_NOSEC;
2263#else 2621#else
2264 sb->s_flags |= MS_NOUSER; 2622 sb->s_flags |= MS_NOUSER;
2265#endif 2623#endif
@@ -2362,12 +2720,12 @@ static const struct file_operations shmem_file_operations = {
2362 .fsync = noop_fsync, 2720 .fsync = noop_fsync,
2363 .splice_read = shmem_file_splice_read, 2721 .splice_read = shmem_file_splice_read,
2364 .splice_write = generic_file_splice_write, 2722 .splice_write = generic_file_splice_write,
2723 .fallocate = shmem_fallocate,
2365#endif 2724#endif
2366}; 2725};
2367 2726
2368static const struct inode_operations shmem_inode_operations = { 2727static const struct inode_operations shmem_inode_operations = {
2369 .setattr = shmem_setattr, 2728 .setattr = shmem_setattr,
2370 .truncate_range = shmem_truncate_range,
2371#ifdef CONFIG_TMPFS_XATTR 2729#ifdef CONFIG_TMPFS_XATTR
2372 .setxattr = shmem_setxattr, 2730 .setxattr = shmem_setxattr,
2373 .getxattr = shmem_getxattr, 2731 .getxattr = shmem_getxattr,
diff --git a/mm/slab.c b/mm/slab.c
index e901a36e2520..f8b0d539b482 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -68,7 +68,7 @@
68 * Further notes from the original documentation: 68 * Further notes from the original documentation:
69 * 69 *
70 * 11 April '97. Started multi-threading - markhe 70 * 11 April '97. Started multi-threading - markhe
71 * The global cache-chain is protected by the mutex 'cache_chain_mutex'. 71 * The global cache-chain is protected by the mutex 'slab_mutex'.
72 * The sem is only needed when accessing/extending the cache-chain, which 72 * The sem is only needed when accessing/extending the cache-chain, which
73 * can never happen inside an interrupt (kmem_cache_create(), 73 * can never happen inside an interrupt (kmem_cache_create(),
74 * kmem_cache_shrink() and kmem_cache_reap()). 74 * kmem_cache_shrink() and kmem_cache_reap()).
@@ -87,6 +87,7 @@
87 */ 87 */
88 88
89#include <linux/slab.h> 89#include <linux/slab.h>
90#include "slab.h"
90#include <linux/mm.h> 91#include <linux/mm.h>
91#include <linux/poison.h> 92#include <linux/poison.h>
92#include <linux/swap.h> 93#include <linux/swap.h>
@@ -117,12 +118,16 @@
117#include <linux/memory.h> 118#include <linux/memory.h>
118#include <linux/prefetch.h> 119#include <linux/prefetch.h>
119 120
121#include <net/sock.h>
122
120#include <asm/cacheflush.h> 123#include <asm/cacheflush.h>
121#include <asm/tlbflush.h> 124#include <asm/tlbflush.h>
122#include <asm/page.h> 125#include <asm/page.h>
123 126
124#include <trace/events/kmem.h> 127#include <trace/events/kmem.h>
125 128
129#include "internal.h"
130
126/* 131/*
127 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 132 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
128 * 0 for faster, smaller code (especially in the critical paths). 133 * 0 for faster, smaller code (especially in the critical paths).
@@ -151,6 +156,12 @@
151#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 156#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
152#endif 157#endif
153 158
159/*
160 * true if a page was allocated from pfmemalloc reserves for network-based
161 * swap
162 */
163static bool pfmemalloc_active __read_mostly;
164
154/* Legal flag mask for kmem_cache_create(). */ 165/* Legal flag mask for kmem_cache_create(). */
155#if DEBUG 166#if DEBUG
156# define CREATE_MASK (SLAB_RED_ZONE | \ 167# define CREATE_MASK (SLAB_RED_ZONE | \
@@ -256,9 +267,30 @@ struct array_cache {
256 * Must have this definition in here for the proper 267 * Must have this definition in here for the proper
257 * alignment of array_cache. Also simplifies accessing 268 * alignment of array_cache. Also simplifies accessing
258 * the entries. 269 * the entries.
270 *
271 * Entries should not be directly dereferenced as
272 * entries belonging to slabs marked pfmemalloc will
273 * have the lower bits set SLAB_OBJ_PFMEMALLOC
259 */ 274 */
260}; 275};
261 276
277#define SLAB_OBJ_PFMEMALLOC 1
278static inline bool is_obj_pfmemalloc(void *objp)
279{
280 return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
281}
282
283static inline void set_obj_pfmemalloc(void **objp)
284{
285 *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
286 return;
287}
288
289static inline void clear_obj_pfmemalloc(void **objp)
290{
291 *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
292}
293
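SLAB_OBJ_PFMEMALLOC reuses the least significant pointer bit as a flag, which is safe because slab objects are at least word-aligned, so that bit is always zero in a real object address. A self-contained illustration of the same tagging trick on an ordinary aligned allocation (the names are made up for the demo):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_TAG 1UL    /* stands in for SLAB_OBJ_PFMEMALLOC */

static void *tag_ptr(void *p)   { return (void *)((uintptr_t)p | DEMO_TAG); }
static bool  is_tagged(void *p) { return (uintptr_t)p & DEMO_TAG; }
static void *untag_ptr(void *p) { return (void *)((uintptr_t)p & ~DEMO_TAG); }

int main(void)
{
        int *obj = malloc(sizeof(*obj));  /* malloc returns suitably aligned memory */
        void *entry = tag_ptr(obj);       /* store the flag in the low bit */

        *obj = 42;
        printf("tagged=%d value=%d\n", is_tagged(entry),
               *(int *)untag_ptr(entry));
        free(obj);
        return 0;
}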
262/* 294/*
263 * bootstrap: The caches do not work without cpuarrays anymore, but the 295 * bootstrap: The caches do not work without cpuarrays anymore, but the
264 * cpuarrays are allocated from the generic caches... 296 * cpuarrays are allocated from the generic caches...
@@ -424,8 +456,8 @@ static void kmem_list3_init(struct kmem_list3 *parent)
424 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: 456 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
425 * redzone word. 457 * redzone word.
426 * cachep->obj_offset: The real object. 458 * cachep->obj_offset: The real object.
427 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 459 * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
428 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address 460 * cachep->size - 1* BYTES_PER_WORD: last caller address
429 * [BYTES_PER_WORD long] 461 * [BYTES_PER_WORD long]
430 */ 462 */
431static int obj_offset(struct kmem_cache *cachep) 463static int obj_offset(struct kmem_cache *cachep)
@@ -433,11 +465,6 @@ static int obj_offset(struct kmem_cache *cachep)
433 return cachep->obj_offset; 465 return cachep->obj_offset;
434} 466}
435 467
436static int obj_size(struct kmem_cache *cachep)
437{
438 return cachep->obj_size;
439}
440
441static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) 468static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
442{ 469{
443 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 470 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
@@ -449,23 +476,22 @@ static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
449{ 476{
450 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 477 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
451 if (cachep->flags & SLAB_STORE_USER) 478 if (cachep->flags & SLAB_STORE_USER)
452 return (unsigned long long *)(objp + cachep->buffer_size - 479 return (unsigned long long *)(objp + cachep->size -
453 sizeof(unsigned long long) - 480 sizeof(unsigned long long) -
454 REDZONE_ALIGN); 481 REDZONE_ALIGN);
455 return (unsigned long long *) (objp + cachep->buffer_size - 482 return (unsigned long long *) (objp + cachep->size -
456 sizeof(unsigned long long)); 483 sizeof(unsigned long long));
457} 484}
458 485
459static void **dbg_userword(struct kmem_cache *cachep, void *objp) 486static void **dbg_userword(struct kmem_cache *cachep, void *objp)
460{ 487{
461 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 488 BUG_ON(!(cachep->flags & SLAB_STORE_USER));
462 return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD); 489 return (void **)(objp + cachep->size - BYTES_PER_WORD);
463} 490}
464 491
465#else 492#else
466 493
467#define obj_offset(x) 0 494#define obj_offset(x) 0
468#define obj_size(cachep) (cachep->buffer_size)
469#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 495#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
470#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 496#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
471#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 497#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
@@ -475,7 +501,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
475#ifdef CONFIG_TRACING 501#ifdef CONFIG_TRACING
476size_t slab_buffer_size(struct kmem_cache *cachep) 502size_t slab_buffer_size(struct kmem_cache *cachep)
477{ 503{
478 return cachep->buffer_size; 504 return cachep->size;
479} 505}
480EXPORT_SYMBOL(slab_buffer_size); 506EXPORT_SYMBOL(slab_buffer_size);
481#endif 507#endif
@@ -489,56 +515,37 @@ EXPORT_SYMBOL(slab_buffer_size);
489static int slab_max_order = SLAB_MAX_ORDER_LO; 515static int slab_max_order = SLAB_MAX_ORDER_LO;
490static bool slab_max_order_set __initdata; 516static bool slab_max_order_set __initdata;
491 517
492/*
493 * Functions for storing/retrieving the cachep and or slab from the page
494 * allocator. These are used to find the slab an obj belongs to. With kfree(),
495 * these are used to find the cache which an obj belongs to.
496 */
497static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
498{
499 page->lru.next = (struct list_head *)cache;
500}
501
502static inline struct kmem_cache *page_get_cache(struct page *page) 518static inline struct kmem_cache *page_get_cache(struct page *page)
503{ 519{
504 page = compound_head(page); 520 page = compound_head(page);
505 BUG_ON(!PageSlab(page)); 521 BUG_ON(!PageSlab(page));
506 return (struct kmem_cache *)page->lru.next; 522 return page->slab_cache;
507}
508
509static inline void page_set_slab(struct page *page, struct slab *slab)
510{
511 page->lru.prev = (struct list_head *)slab;
512}
513
514static inline struct slab *page_get_slab(struct page *page)
515{
516 BUG_ON(!PageSlab(page));
517 return (struct slab *)page->lru.prev;
518} 523}
519 524
520static inline struct kmem_cache *virt_to_cache(const void *obj) 525static inline struct kmem_cache *virt_to_cache(const void *obj)
521{ 526{
522 struct page *page = virt_to_head_page(obj); 527 struct page *page = virt_to_head_page(obj);
523 return page_get_cache(page); 528 return page->slab_cache;
524} 529}
525 530
526static inline struct slab *virt_to_slab(const void *obj) 531static inline struct slab *virt_to_slab(const void *obj)
527{ 532{
528 struct page *page = virt_to_head_page(obj); 533 struct page *page = virt_to_head_page(obj);
529 return page_get_slab(page); 534
535 VM_BUG_ON(!PageSlab(page));
536 return page->slab_page;
530} 537}
531 538
532static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, 539static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
533 unsigned int idx) 540 unsigned int idx)
534{ 541{
535 return slab->s_mem + cache->buffer_size * idx; 542 return slab->s_mem + cache->size * idx;
536} 543}
537 544
538/* 545/*
539 * We want to avoid an expensive divide : (offset / cache->buffer_size) 546 * We want to avoid an expensive divide : (offset / cache->size)
540 * Using the fact that buffer_size is a constant for a particular cache, 547 * Using the fact that size is a constant for a particular cache,
541 * we can replace (offset / cache->buffer_size) by 548 * we can replace (offset / cache->size) by
542 * reciprocal_divide(offset, cache->reciprocal_buffer_size) 549 * reciprocal_divide(offset, cache->reciprocal_buffer_size)
543 */ 550 */
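As the comment above explains, obj_to_index() avoids a runtime division by multiplying with a precomputed reciprocal. A userspace sketch modelled on the old reciprocal_value()/reciprocal_divide() helpers, with a brute-force check over a slab-sized offset range to show the result matches plain division there (that bounded range is what makes the approximation safe):

#include <stdint.h>
#include <stdio.h>

/* R = ceil(2^32 / size), as the old reciprocal_value() computed it. */
static uint32_t toy_reciprocal_value(uint32_t size)
{
        return (uint32_t)(((1ULL << 32) + size - 1) / size);
}

/* offset / size approximated as (offset * R) >> 32. */
static uint32_t toy_reciprocal_divide(uint32_t offset, uint32_t R)
{
        return (uint32_t)(((uint64_t)offset * R) >> 32);
}

int main(void)
{
        uint32_t size = 192;                    /* a typical object size */
        uint32_t R = toy_reciprocal_value(size);
        uint32_t off;

        for (off = 0; off < 1 << 16; off++)     /* offsets within a slab */
                if (toy_reciprocal_divide(off, R) != off / size) {
                        printf("mismatch at %u\n", off);
                        return 1;
                }
        printf("reciprocal divide matches /%u for all tested offsets\n", size);
        return 0;
}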
544static inline unsigned int obj_to_index(const struct kmem_cache *cache, 551static inline unsigned int obj_to_index(const struct kmem_cache *cache,
@@ -584,33 +591,12 @@ static struct kmem_cache cache_cache = {
584 .batchcount = 1, 591 .batchcount = 1,
585 .limit = BOOT_CPUCACHE_ENTRIES, 592 .limit = BOOT_CPUCACHE_ENTRIES,
586 .shared = 1, 593 .shared = 1,
587 .buffer_size = sizeof(struct kmem_cache), 594 .size = sizeof(struct kmem_cache),
588 .name = "kmem_cache", 595 .name = "kmem_cache",
589}; 596};
590 597
591#define BAD_ALIEN_MAGIC 0x01020304ul 598#define BAD_ALIEN_MAGIC 0x01020304ul
592 599
593/*
594 * chicken and egg problem: delay the per-cpu array allocation
595 * until the general caches are up.
596 */
597static enum {
598 NONE,
599 PARTIAL_AC,
600 PARTIAL_L3,
601 EARLY,
602 LATE,
603 FULL
604} g_cpucache_up;
605
606/*
607 * used by boot code to determine if it can use slab based allocator
608 */
609int slab_is_available(void)
610{
611 return g_cpucache_up >= EARLY;
612}
613
614#ifdef CONFIG_LOCKDEP 600#ifdef CONFIG_LOCKDEP
615 601
616/* 602/*
@@ -676,7 +662,7 @@ static void init_node_lock_keys(int q)
676{ 662{
677 struct cache_sizes *s = malloc_sizes; 663 struct cache_sizes *s = malloc_sizes;
678 664
679 if (g_cpucache_up < LATE) 665 if (slab_state < UP)
680 return; 666 return;
681 667
682 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { 668 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
@@ -716,12 +702,6 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
716} 702}
717#endif 703#endif
718 704
719/*
720 * Guard access to the cache-chain.
721 */
722static DEFINE_MUTEX(cache_chain_mutex);
723static struct list_head cache_chain;
724
725static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); 705static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
726 706
727static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 707static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
@@ -951,6 +931,124 @@ static struct array_cache *alloc_arraycache(int node, int entries,
951 return nc; 931 return nc;
952} 932}
953 933
934static inline bool is_slab_pfmemalloc(struct slab *slabp)
935{
936 struct page *page = virt_to_page(slabp->s_mem);
937
938 return PageSlabPfmemalloc(page);
939}
940
941/* Clears pfmemalloc_active if no slabs have pfmalloc set */
942static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
943 struct array_cache *ac)
944{
945 struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()];
946 struct slab *slabp;
947 unsigned long flags;
948
949 if (!pfmemalloc_active)
950 return;
951
952 spin_lock_irqsave(&l3->list_lock, flags);
953 list_for_each_entry(slabp, &l3->slabs_full, list)
954 if (is_slab_pfmemalloc(slabp))
955 goto out;
956
957 list_for_each_entry(slabp, &l3->slabs_partial, list)
958 if (is_slab_pfmemalloc(slabp))
959 goto out;
960
961 list_for_each_entry(slabp, &l3->slabs_free, list)
962 if (is_slab_pfmemalloc(slabp))
963 goto out;
964
965 pfmemalloc_active = false;
966out:
967 spin_unlock_irqrestore(&l3->list_lock, flags);
968}
969
970static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
971 gfp_t flags, bool force_refill)
972{
973 int i;
974 void *objp = ac->entry[--ac->avail];
975
976 /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
977 if (unlikely(is_obj_pfmemalloc(objp))) {
978 struct kmem_list3 *l3;
979
980 if (gfp_pfmemalloc_allowed(flags)) {
981 clear_obj_pfmemalloc(&objp);
982 return objp;
983 }
984
985 /* The caller cannot use PFMEMALLOC objects, find another one */
986 for (i = 1; i < ac->avail; i++) {
987 /* If a !PFMEMALLOC object is found, swap them */
988 if (!is_obj_pfmemalloc(ac->entry[i])) {
989 objp = ac->entry[i];
990 ac->entry[i] = ac->entry[ac->avail];
991 ac->entry[ac->avail] = objp;
992 return objp;
993 }
994 }
995
996 /*
997 * If there are empty slabs on the slabs_free list and we are
998 * being forced to refill the cache, mark this one !pfmemalloc.
999 */
1000 l3 = cachep->nodelists[numa_mem_id()];
1001 if (!list_empty(&l3->slabs_free) && force_refill) {
1002 struct slab *slabp = virt_to_slab(objp);
1003 ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem));
1004 clear_obj_pfmemalloc(&objp);
1005 recheck_pfmemalloc_active(cachep, ac);
1006 return objp;
1007 }
1008
1009 /* No !PFMEMALLOC objects available */
1010 ac->avail++;
1011 objp = NULL;
1012 }
1013
1014 return objp;
1015}
1016
1017static inline void *ac_get_obj(struct kmem_cache *cachep,
1018 struct array_cache *ac, gfp_t flags, bool force_refill)
1019{
1020 void *objp;
1021
1022 if (unlikely(sk_memalloc_socks()))
1023 objp = __ac_get_obj(cachep, ac, flags, force_refill);
1024 else
1025 objp = ac->entry[--ac->avail];
1026
1027 return objp;
1028}
1029
1030static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
1031 void *objp)
1032{
1033 if (unlikely(pfmemalloc_active)) {
1034 /* Some pfmemalloc slabs exist, check if this is one */
1035 struct page *page = virt_to_page(objp);
1036 if (PageSlabPfmemalloc(page))
1037 set_obj_pfmemalloc(&objp);
1038 }
1039
1040 return objp;
1041}
1042
1043static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
1044 void *objp)
1045{
1046 if (unlikely(sk_memalloc_socks()))
1047 objp = __ac_put_obj(cachep, ac, objp);
1048
1049 ac->entry[ac->avail++] = objp;
1050}
1051
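The new ac_get_obj()/ac_put_obj() pair screens the per-CPU array so that objects backed by emergency (pfmemalloc) pages only go to callers entitled to the reserves; otherwise a clean object is swapped into the hand-out slot. A simplified userspace model of that selection logic, with the pointer tag reduced to a boolean per entry (purely illustrative, not the kernel data structure):

#include <stdbool.h>
#include <stdio.h>

struct toy_entry { int obj; bool pfmemalloc; };

struct toy_cache {
        int avail;
        struct toy_entry entry[8];
};

/* Pop an object; callers without reserve rights skip pfmemalloc entries. */
static int toy_get_obj(struct toy_cache *ac, bool may_use_reserves, int *objp)
{
        struct toy_entry top = ac->entry[--ac->avail];
        int i;

        if (!top.pfmemalloc || may_use_reserves) {
                *objp = top.obj;
                return 0;
        }
        /* Look for a clean entry and leave the reserved one in the cache. */
        for (i = 0; i < ac->avail; i++) {
                if (!ac->entry[i].pfmemalloc) {
                        *objp = ac->entry[i].obj;
                        ac->entry[i] = top;
                        return 0;
                }
        }
        ac->avail++;            /* put the reserved object back: nothing usable */
        return -1;
}

int main(void)
{
        struct toy_cache ac = { .avail = 2,
                .entry = { { .obj = 1, .pfmemalloc = false },
                           { .obj = 2, .pfmemalloc = true } } };
        int obj;

        if (!toy_get_obj(&ac, false, &obj))
                printf("non-reserve caller got obj %d\n", obj);  /* prints 1 */
        return 0;
}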
954/* 1052/*
955 * Transfer objects in one arraycache to another. 1053 * Transfer objects in one arraycache to another.
956 * Locking must be handled by the caller. 1054 * Locking must be handled by the caller.
@@ -1127,7 +1225,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1127 STATS_INC_ACOVERFLOW(cachep); 1225 STATS_INC_ACOVERFLOW(cachep);
1128 __drain_alien_cache(cachep, alien, nodeid); 1226 __drain_alien_cache(cachep, alien, nodeid);
1129 } 1227 }
1130 alien->entry[alien->avail++] = objp; 1228 ac_put_obj(cachep, alien, objp);
1131 spin_unlock(&alien->lock); 1229 spin_unlock(&alien->lock);
1132 } else { 1230 } else {
1133 spin_lock(&(cachep->nodelists[nodeid])->list_lock); 1231 spin_lock(&(cachep->nodelists[nodeid])->list_lock);
@@ -1145,7 +1243,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1145 * When hotplugging memory or a cpu, existing nodelists are not replaced if 1243 * When hotplugging memory or a cpu, existing nodelists are not replaced if
1146 * already in use. 1244 * already in use.
1147 * 1245 *
1148 * Must hold cache_chain_mutex. 1246 * Must hold slab_mutex.
1149 */ 1247 */
1150static int init_cache_nodelists_node(int node) 1248static int init_cache_nodelists_node(int node)
1151{ 1249{
@@ -1153,7 +1251,7 @@ static int init_cache_nodelists_node(int node)
1153 struct kmem_list3 *l3; 1251 struct kmem_list3 *l3;
1154 const int memsize = sizeof(struct kmem_list3); 1252 const int memsize = sizeof(struct kmem_list3);
1155 1253
1156 list_for_each_entry(cachep, &cache_chain, next) { 1254 list_for_each_entry(cachep, &slab_caches, list) {
1157 /* 1255 /*
1158 * Set up the size64 kmemlist for cpu before we can 1256 * Set up the size64 kmemlist for cpu before we can
1159 * begin anything. Make sure some other cpu on this 1257 * begin anything. Make sure some other cpu on this
@@ -1169,7 +1267,7 @@ static int init_cache_nodelists_node(int node)
1169 1267
1170 /* 1268 /*
1171 * The l3s don't come and go as CPUs come and 1269 * The l3s don't come and go as CPUs come and
1172 * go. cache_chain_mutex is sufficient 1270 * go. slab_mutex is sufficient
1173 * protection here. 1271 * protection here.
1174 */ 1272 */
1175 cachep->nodelists[node] = l3; 1273 cachep->nodelists[node] = l3;
@@ -1191,7 +1289,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1191 int node = cpu_to_mem(cpu); 1289 int node = cpu_to_mem(cpu);
1192 const struct cpumask *mask = cpumask_of_node(node); 1290 const struct cpumask *mask = cpumask_of_node(node);
1193 1291
1194 list_for_each_entry(cachep, &cache_chain, next) { 1292 list_for_each_entry(cachep, &slab_caches, list) {
1195 struct array_cache *nc; 1293 struct array_cache *nc;
1196 struct array_cache *shared; 1294 struct array_cache *shared;
1197 struct array_cache **alien; 1295 struct array_cache **alien;
@@ -1241,7 +1339,7 @@ free_array_cache:
1241 * the respective cache's slabs, now we can go ahead and 1339 * the respective cache's slabs, now we can go ahead and
1242 * shrink each nodelist to its limit. 1340 * shrink each nodelist to its limit.
1243 */ 1341 */
1244 list_for_each_entry(cachep, &cache_chain, next) { 1342 list_for_each_entry(cachep, &slab_caches, list) {
1245 l3 = cachep->nodelists[node]; 1343 l3 = cachep->nodelists[node];
1246 if (!l3) 1344 if (!l3)
1247 continue; 1345 continue;
@@ -1270,7 +1368,7 @@ static int __cpuinit cpuup_prepare(long cpu)
1270 * Now we can go ahead with allocating the shared arrays and 1368 * Now we can go ahead with allocating the shared arrays and
1271 * array caches 1369 * array caches
1272 */ 1370 */
1273 list_for_each_entry(cachep, &cache_chain, next) { 1371 list_for_each_entry(cachep, &slab_caches, list) {
1274 struct array_cache *nc; 1372 struct array_cache *nc;
1275 struct array_cache *shared = NULL; 1373 struct array_cache *shared = NULL;
1276 struct array_cache **alien = NULL; 1374 struct array_cache **alien = NULL;
@@ -1338,9 +1436,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1338 switch (action) { 1436 switch (action) {
1339 case CPU_UP_PREPARE: 1437 case CPU_UP_PREPARE:
1340 case CPU_UP_PREPARE_FROZEN: 1438 case CPU_UP_PREPARE_FROZEN:
1341 mutex_lock(&cache_chain_mutex); 1439 mutex_lock(&slab_mutex);
1342 err = cpuup_prepare(cpu); 1440 err = cpuup_prepare(cpu);
1343 mutex_unlock(&cache_chain_mutex); 1441 mutex_unlock(&slab_mutex);
1344 break; 1442 break;
1345 case CPU_ONLINE: 1443 case CPU_ONLINE:
1346 case CPU_ONLINE_FROZEN: 1444 case CPU_ONLINE_FROZEN:
@@ -1350,7 +1448,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1350 case CPU_DOWN_PREPARE: 1448 case CPU_DOWN_PREPARE:
1351 case CPU_DOWN_PREPARE_FROZEN: 1449 case CPU_DOWN_PREPARE_FROZEN:
1352 /* 1450 /*
1353 * Shutdown cache reaper. Note that the cache_chain_mutex is 1451 * Shutdown cache reaper. Note that the slab_mutex is
1354 * held so that if cache_reap() is invoked it cannot do 1452 * held so that if cache_reap() is invoked it cannot do
1355 * anything expensive but will only modify reap_work 1453 * anything expensive but will only modify reap_work
1356 * and reschedule the timer. 1454 * and reschedule the timer.
@@ -1377,9 +1475,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1377#endif 1475#endif
1378 case CPU_UP_CANCELED: 1476 case CPU_UP_CANCELED:
1379 case CPU_UP_CANCELED_FROZEN: 1477 case CPU_UP_CANCELED_FROZEN:
1380 mutex_lock(&cache_chain_mutex); 1478 mutex_lock(&slab_mutex);
1381 cpuup_canceled(cpu); 1479 cpuup_canceled(cpu);
1382 mutex_unlock(&cache_chain_mutex); 1480 mutex_unlock(&slab_mutex);
1383 break; 1481 break;
1384 } 1482 }
1385 return notifier_from_errno(err); 1483 return notifier_from_errno(err);
@@ -1395,14 +1493,14 @@ static struct notifier_block __cpuinitdata cpucache_notifier = {
1395 * Returns -EBUSY if all objects cannot be drained so that the node is not 1493 * Returns -EBUSY if all objects cannot be drained so that the node is not
1396 * removed. 1494 * removed.
1397 * 1495 *
1398 * Must hold cache_chain_mutex. 1496 * Must hold slab_mutex.
1399 */ 1497 */
1400static int __meminit drain_cache_nodelists_node(int node) 1498static int __meminit drain_cache_nodelists_node(int node)
1401{ 1499{
1402 struct kmem_cache *cachep; 1500 struct kmem_cache *cachep;
1403 int ret = 0; 1501 int ret = 0;
1404 1502
1405 list_for_each_entry(cachep, &cache_chain, next) { 1503 list_for_each_entry(cachep, &slab_caches, list) {
1406 struct kmem_list3 *l3; 1504 struct kmem_list3 *l3;
1407 1505
1408 l3 = cachep->nodelists[node]; 1506 l3 = cachep->nodelists[node];
@@ -1433,14 +1531,14 @@ static int __meminit slab_memory_callback(struct notifier_block *self,
1433 1531
1434 switch (action) { 1532 switch (action) {
1435 case MEM_GOING_ONLINE: 1533 case MEM_GOING_ONLINE:
1436 mutex_lock(&cache_chain_mutex); 1534 mutex_lock(&slab_mutex);
1437 ret = init_cache_nodelists_node(nid); 1535 ret = init_cache_nodelists_node(nid);
1438 mutex_unlock(&cache_chain_mutex); 1536 mutex_unlock(&slab_mutex);
1439 break; 1537 break;
1440 case MEM_GOING_OFFLINE: 1538 case MEM_GOING_OFFLINE:
1441 mutex_lock(&cache_chain_mutex); 1539 mutex_lock(&slab_mutex);
1442 ret = drain_cache_nodelists_node(nid); 1540 ret = drain_cache_nodelists_node(nid);
1443 mutex_unlock(&cache_chain_mutex); 1541 mutex_unlock(&slab_mutex);
1444 break; 1542 break;
1445 case MEM_ONLINE: 1543 case MEM_ONLINE:
1446 case MEM_OFFLINE: 1544 case MEM_OFFLINE:
@@ -1544,8 +1642,8 @@ void __init kmem_cache_init(void)
1544 node = numa_mem_id(); 1642 node = numa_mem_id();
1545 1643
1546 /* 1) create the cache_cache */ 1644 /* 1) create the cache_cache */
1547 INIT_LIST_HEAD(&cache_chain); 1645 INIT_LIST_HEAD(&slab_caches);
1548 list_add(&cache_cache.next, &cache_chain); 1646 list_add(&cache_cache.list, &slab_caches);
1549 cache_cache.colour_off = cache_line_size(); 1647 cache_cache.colour_off = cache_line_size();
1550 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1648 cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1551 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; 1649 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
@@ -1553,18 +1651,16 @@ void __init kmem_cache_init(void)
1553 /* 1651 /*
1554 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1652 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1555 */ 1653 */
1556 cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + 1654 cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1557 nr_node_ids * sizeof(struct kmem_list3 *); 1655 nr_node_ids * sizeof(struct kmem_list3 *);
1558#if DEBUG 1656 cache_cache.object_size = cache_cache.size;
1559 cache_cache.obj_size = cache_cache.buffer_size; 1657 cache_cache.size = ALIGN(cache_cache.size,
1560#endif
1561 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1562 cache_line_size()); 1658 cache_line_size());
1563 cache_cache.reciprocal_buffer_size = 1659 cache_cache.reciprocal_buffer_size =
1564 reciprocal_value(cache_cache.buffer_size); 1660 reciprocal_value(cache_cache.size);
1565 1661
1566 for (order = 0; order < MAX_ORDER; order++) { 1662 for (order = 0; order < MAX_ORDER; order++) {
1567 cache_estimate(order, cache_cache.buffer_size, 1663 cache_estimate(order, cache_cache.size,
1568 cache_line_size(), 0, &left_over, &cache_cache.num); 1664 cache_line_size(), 0, &left_over, &cache_cache.num);
1569 if (cache_cache.num) 1665 if (cache_cache.num)
1570 break; 1666 break;
@@ -1585,7 +1681,7 @@ void __init kmem_cache_init(void)
1585 * bug. 1681 * bug.
1586 */ 1682 */
1587 1683
1588 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1684 sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name,
1589 sizes[INDEX_AC].cs_size, 1685 sizes[INDEX_AC].cs_size,
1590 ARCH_KMALLOC_MINALIGN, 1686 ARCH_KMALLOC_MINALIGN,
1591 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1687 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
@@ -1593,7 +1689,7 @@ void __init kmem_cache_init(void)
1593 1689
1594 if (INDEX_AC != INDEX_L3) { 1690 if (INDEX_AC != INDEX_L3) {
1595 sizes[INDEX_L3].cs_cachep = 1691 sizes[INDEX_L3].cs_cachep =
1596 kmem_cache_create(names[INDEX_L3].name, 1692 __kmem_cache_create(names[INDEX_L3].name,
1597 sizes[INDEX_L3].cs_size, 1693 sizes[INDEX_L3].cs_size,
1598 ARCH_KMALLOC_MINALIGN, 1694 ARCH_KMALLOC_MINALIGN,
1599 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1695 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
@@ -1611,14 +1707,14 @@ void __init kmem_cache_init(void)
1611 * allow tighter packing of the smaller caches. 1707 * allow tighter packing of the smaller caches.
1612 */ 1708 */
1613 if (!sizes->cs_cachep) { 1709 if (!sizes->cs_cachep) {
1614 sizes->cs_cachep = kmem_cache_create(names->name, 1710 sizes->cs_cachep = __kmem_cache_create(names->name,
1615 sizes->cs_size, 1711 sizes->cs_size,
1616 ARCH_KMALLOC_MINALIGN, 1712 ARCH_KMALLOC_MINALIGN,
1617 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1713 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1618 NULL); 1714 NULL);
1619 } 1715 }
1620#ifdef CONFIG_ZONE_DMA 1716#ifdef CONFIG_ZONE_DMA
1621 sizes->cs_dmacachep = kmem_cache_create( 1717 sizes->cs_dmacachep = __kmem_cache_create(
1622 names->name_dma, 1718 names->name_dma,
1623 sizes->cs_size, 1719 sizes->cs_size,
1624 ARCH_KMALLOC_MINALIGN, 1720 ARCH_KMALLOC_MINALIGN,
@@ -1676,27 +1772,27 @@ void __init kmem_cache_init(void)
1676 } 1772 }
1677 } 1773 }
1678 1774
1679 g_cpucache_up = EARLY; 1775 slab_state = UP;
1680} 1776}
1681 1777
1682void __init kmem_cache_init_late(void) 1778void __init kmem_cache_init_late(void)
1683{ 1779{
1684 struct kmem_cache *cachep; 1780 struct kmem_cache *cachep;
1685 1781
1686 g_cpucache_up = LATE; 1782 slab_state = UP;
1687 1783
1688 /* Annotate slab for lockdep -- annotate the malloc caches */ 1784 /* Annotate slab for lockdep -- annotate the malloc caches */
1689 init_lock_keys(); 1785 init_lock_keys();
1690 1786
1691 /* 6) resize the head arrays to their final sizes */ 1787 /* 6) resize the head arrays to their final sizes */
1692 mutex_lock(&cache_chain_mutex); 1788 mutex_lock(&slab_mutex);
1693 list_for_each_entry(cachep, &cache_chain, next) 1789 list_for_each_entry(cachep, &slab_caches, list)
1694 if (enable_cpucache(cachep, GFP_NOWAIT)) 1790 if (enable_cpucache(cachep, GFP_NOWAIT))
1695 BUG(); 1791 BUG();
1696 mutex_unlock(&cache_chain_mutex); 1792 mutex_unlock(&slab_mutex);
1697 1793
1698 /* Done! */ 1794 /* Done! */
1699 g_cpucache_up = FULL; 1795 slab_state = FULL;
1700 1796
1701 /* 1797 /*
1702 * Register a cpu startup notifier callback that initializes 1798 * Register a cpu startup notifier callback that initializes
@@ -1727,6 +1823,9 @@ static int __init cpucache_init(void)
1727 */ 1823 */
1728 for_each_online_cpu(cpu) 1824 for_each_online_cpu(cpu)
1729 start_cpu_timer(cpu); 1825 start_cpu_timer(cpu);
1826
1827 /* Done! */
1828 slab_state = FULL;
1730 return 0; 1829 return 0;
1731} 1830}
1732__initcall(cpucache_init); 1831__initcall(cpucache_init);
@@ -1743,7 +1842,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1743 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", 1842 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1744 nodeid, gfpflags); 1843 nodeid, gfpflags);
1745 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", 1844 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
1746 cachep->name, cachep->buffer_size, cachep->gfporder); 1845 cachep->name, cachep->size, cachep->gfporder);
1747 1846
1748 for_each_online_node(node) { 1847 for_each_online_node(node) {
1749 unsigned long active_objs = 0, num_objs = 0, free_objects = 0; 1848 unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
@@ -1798,7 +1897,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1798 flags |= __GFP_COMP; 1897 flags |= __GFP_COMP;
1799#endif 1898#endif
1800 1899
1801 flags |= cachep->gfpflags; 1900 flags |= cachep->allocflags;
1802 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1901 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1803 flags |= __GFP_RECLAIMABLE; 1902 flags |= __GFP_RECLAIMABLE;
1804 1903
@@ -1809,6 +1908,10 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1809 return NULL; 1908 return NULL;
1810 } 1909 }
1811 1910
1911 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1912 if (unlikely(page->pfmemalloc))
1913 pfmemalloc_active = true;
1914
1812 nr_pages = (1 << cachep->gfporder); 1915 nr_pages = (1 << cachep->gfporder);
1813 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1916 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1814 add_zone_page_state(page_zone(page), 1917 add_zone_page_state(page_zone(page),
@@ -1816,9 +1919,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1816 else 1919 else
1817 add_zone_page_state(page_zone(page), 1920 add_zone_page_state(page_zone(page),
1818 NR_SLAB_UNRECLAIMABLE, nr_pages); 1921 NR_SLAB_UNRECLAIMABLE, nr_pages);
1819 for (i = 0; i < nr_pages; i++) 1922 for (i = 0; i < nr_pages; i++) {
1820 __SetPageSlab(page + i); 1923 __SetPageSlab(page + i);
1821 1924
1925 if (page->pfmemalloc)
1926 SetPageSlabPfmemalloc(page + i);
1927 }
1928
1822 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1929 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1823 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1930 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1824 1931
@@ -1850,6 +1957,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1850 NR_SLAB_UNRECLAIMABLE, nr_freed); 1957 NR_SLAB_UNRECLAIMABLE, nr_freed);
1851 while (i--) { 1958 while (i--) {
1852 BUG_ON(!PageSlab(page)); 1959 BUG_ON(!PageSlab(page));
1960 __ClearPageSlabPfmemalloc(page);
1853 __ClearPageSlab(page); 1961 __ClearPageSlab(page);
1854 page++; 1962 page++;
1855 } 1963 }
@@ -1874,7 +1982,7 @@ static void kmem_rcu_free(struct rcu_head *head)
1874static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, 1982static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1875 unsigned long caller) 1983 unsigned long caller)
1876{ 1984{
1877 int size = obj_size(cachep); 1985 int size = cachep->object_size;
1878 1986
1879 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; 1987 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1880 1988
@@ -1906,7 +2014,7 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1906 2014
1907static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) 2015static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1908{ 2016{
1909 int size = obj_size(cachep); 2017 int size = cachep->object_size;
1910 addr = &((char *)addr)[obj_offset(cachep)]; 2018 addr = &((char *)addr)[obj_offset(cachep)];
1911 2019
1912 memset(addr, val, size); 2020 memset(addr, val, size);
@@ -1966,7 +2074,7 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1966 printk("\n"); 2074 printk("\n");
1967 } 2075 }
1968 realobj = (char *)objp + obj_offset(cachep); 2076 realobj = (char *)objp + obj_offset(cachep);
1969 size = obj_size(cachep); 2077 size = cachep->object_size;
1970 for (i = 0; i < size && lines; i += 16, lines--) { 2078 for (i = 0; i < size && lines; i += 16, lines--) {
1971 int limit; 2079 int limit;
1972 limit = 16; 2080 limit = 16;
@@ -1983,7 +2091,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1983 int lines = 0; 2091 int lines = 0;
1984 2092
1985 realobj = (char *)objp + obj_offset(cachep); 2093 realobj = (char *)objp + obj_offset(cachep);
1986 size = obj_size(cachep); 2094 size = cachep->object_size;
1987 2095
1988 for (i = 0; i < size; i++) { 2096 for (i = 0; i < size; i++) {
1989 char exp = POISON_FREE; 2097 char exp = POISON_FREE;
@@ -2047,10 +2155,10 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab
2047 2155
2048 if (cachep->flags & SLAB_POISON) { 2156 if (cachep->flags & SLAB_POISON) {
2049#ifdef CONFIG_DEBUG_PAGEALLOC 2157#ifdef CONFIG_DEBUG_PAGEALLOC
2050 if (cachep->buffer_size % PAGE_SIZE == 0 && 2158 if (cachep->size % PAGE_SIZE == 0 &&
2051 OFF_SLAB(cachep)) 2159 OFF_SLAB(cachep))
2052 kernel_map_pages(virt_to_page(objp), 2160 kernel_map_pages(virt_to_page(objp),
2053 cachep->buffer_size / PAGE_SIZE, 1); 2161 cachep->size / PAGE_SIZE, 1);
2054 else 2162 else
2055 check_poison_obj(cachep, objp); 2163 check_poison_obj(cachep, objp);
2056#else 2164#else
@@ -2194,10 +2302,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
2194 2302
2195static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) 2303static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2196{ 2304{
2197 if (g_cpucache_up == FULL) 2305 if (slab_state >= FULL)
2198 return enable_cpucache(cachep, gfp); 2306 return enable_cpucache(cachep, gfp);
2199 2307
2200 if (g_cpucache_up == NONE) { 2308 if (slab_state == DOWN) {
2201 /* 2309 /*
2202 * Note: the first kmem_cache_create must create the cache 2310 * Note: the first kmem_cache_create must create the cache
2203 * that's used by kmalloc(24), otherwise the creation of 2311 * that's used by kmalloc(24), otherwise the creation of
@@ -2212,16 +2320,16 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2212 */ 2320 */
2213 set_up_list3s(cachep, SIZE_AC); 2321 set_up_list3s(cachep, SIZE_AC);
2214 if (INDEX_AC == INDEX_L3) 2322 if (INDEX_AC == INDEX_L3)
2215 g_cpucache_up = PARTIAL_L3; 2323 slab_state = PARTIAL_L3;
2216 else 2324 else
2217 g_cpucache_up = PARTIAL_AC; 2325 slab_state = PARTIAL_ARRAYCACHE;
2218 } else { 2326 } else {
2219 cachep->array[smp_processor_id()] = 2327 cachep->array[smp_processor_id()] =
2220 kmalloc(sizeof(struct arraycache_init), gfp); 2328 kmalloc(sizeof(struct arraycache_init), gfp);
2221 2329
2222 if (g_cpucache_up == PARTIAL_AC) { 2330 if (slab_state == PARTIAL_ARRAYCACHE) {
2223 set_up_list3s(cachep, SIZE_L3); 2331 set_up_list3s(cachep, SIZE_L3);
2224 g_cpucache_up = PARTIAL_L3; 2332 slab_state = PARTIAL_L3;
2225 } else { 2333 } else {
2226 int node; 2334 int node;
2227 for_each_online_node(node) { 2335 for_each_online_node(node) {
@@ -2247,7 +2355,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2247} 2355}
2248 2356
2249/** 2357/**
2250 * kmem_cache_create - Create a cache. 2358 * __kmem_cache_create - Create a cache.
2251 * @name: A string which is used in /proc/slabinfo to identify this cache. 2359 * @name: A string which is used in /proc/slabinfo to identify this cache.
2252 * @size: The size of objects to be created in this cache. 2360 * @size: The size of objects to be created in this cache.
2253 * @align: The required alignment for the objects. 2361 * @align: The required alignment for the objects.
@@ -2274,59 +2382,14 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2274 * as davem. 2382 * as davem.
2275 */ 2383 */
2276struct kmem_cache * 2384struct kmem_cache *
2277kmem_cache_create (const char *name, size_t size, size_t align, 2385__kmem_cache_create (const char *name, size_t size, size_t align,
2278 unsigned long flags, void (*ctor)(void *)) 2386 unsigned long flags, void (*ctor)(void *))
2279{ 2387{
2280 size_t left_over, slab_size, ralign; 2388 size_t left_over, slab_size, ralign;
2281 struct kmem_cache *cachep = NULL, *pc; 2389 struct kmem_cache *cachep = NULL;
2282 gfp_t gfp; 2390 gfp_t gfp;
2283 2391
2284 /*
2285 * Sanity checks... these are all serious usage bugs.
2286 */
2287 if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
2288 size > KMALLOC_MAX_SIZE) {
2289 printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
2290 name);
2291 BUG();
2292 }
2293
2294 /*
2295 * We use cache_chain_mutex to ensure a consistent view of
2296 * cpu_online_mask as well. Please see cpuup_callback
2297 */
2298 if (slab_is_available()) {
2299 get_online_cpus();
2300 mutex_lock(&cache_chain_mutex);
2301 }
2302
2303 list_for_each_entry(pc, &cache_chain, next) {
2304 char tmp;
2305 int res;
2306
2307 /*
2308 * This happens when the module gets unloaded and doesn't
2309 * destroy its slab cache and no-one else reuses the vmalloc
2310 * area of the module. Print a warning.
2311 */
2312 res = probe_kernel_address(pc->name, tmp);
2313 if (res) {
2314 printk(KERN_ERR
2315 "SLAB: cache with size %d has lost its name\n",
2316 pc->buffer_size);
2317 continue;
2318 }
2319
2320 if (!strcmp(pc->name, name)) {
2321 printk(KERN_ERR
2322 "kmem_cache_create: duplicate cache %s\n", name);
2323 dump_stack();
2324 goto oops;
2325 }
2326 }
2327
2328#if DEBUG 2392#if DEBUG
2329 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
2330#if FORCED_DEBUG 2393#if FORCED_DEBUG
2331 /* 2394 /*
2332 * Enable redzoning and last user accounting, except for caches with 2395 * Enable redzoning and last user accounting, except for caches with
@@ -2415,11 +2478,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2415 /* Get cache's description obj. */ 2478 /* Get cache's description obj. */
2416 cachep = kmem_cache_zalloc(&cache_cache, gfp); 2479 cachep = kmem_cache_zalloc(&cache_cache, gfp);
2417 if (!cachep) 2480 if (!cachep)
2418 goto oops; 2481 return NULL;
2419 2482
2420 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; 2483 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
2484 cachep->object_size = size;
2485 cachep->align = align;
2421#if DEBUG 2486#if DEBUG
2422 cachep->obj_size = size;
2423 2487
2424 /* 2488 /*
2425 * Both debugging options require word-alignment which is calculated 2489 * Both debugging options require word-alignment which is calculated
@@ -2442,7 +2506,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2442 } 2506 }
2443#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2507#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2444 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 2508 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2445 && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { 2509 && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
2446 cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); 2510 cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
2447 size = PAGE_SIZE; 2511 size = PAGE_SIZE;
2448 } 2512 }
@@ -2471,8 +2535,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2471 printk(KERN_ERR 2535 printk(KERN_ERR
2472 "kmem_cache_create: couldn't create cache %s.\n", name); 2536 "kmem_cache_create: couldn't create cache %s.\n", name);
2473 kmem_cache_free(&cache_cache, cachep); 2537 kmem_cache_free(&cache_cache, cachep);
2474 cachep = NULL; 2538 return NULL;
2475 goto oops;
2476 } 2539 }
2477 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) 2540 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2478 + sizeof(struct slab), align); 2541 + sizeof(struct slab), align);
@@ -2508,10 +2571,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2508 cachep->colour = left_over / cachep->colour_off; 2571 cachep->colour = left_over / cachep->colour_off;
2509 cachep->slab_size = slab_size; 2572 cachep->slab_size = slab_size;
2510 cachep->flags = flags; 2573 cachep->flags = flags;
2511 cachep->gfpflags = 0; 2574 cachep->allocflags = 0;
2512 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) 2575 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
2513 cachep->gfpflags |= GFP_DMA; 2576 cachep->allocflags |= GFP_DMA;
2514 cachep->buffer_size = size; 2577 cachep->size = size;
2515 cachep->reciprocal_buffer_size = reciprocal_value(size); 2578 cachep->reciprocal_buffer_size = reciprocal_value(size);
2516 2579
2517 if (flags & CFLGS_OFF_SLAB) { 2580 if (flags & CFLGS_OFF_SLAB) {
@@ -2530,8 +2593,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2530 2593
2531 if (setup_cpu_cache(cachep, gfp)) { 2594 if (setup_cpu_cache(cachep, gfp)) {
2532 __kmem_cache_destroy(cachep); 2595 __kmem_cache_destroy(cachep);
2533 cachep = NULL; 2596 return NULL;
2534 goto oops;
2535 } 2597 }
2536 2598
2537 if (flags & SLAB_DEBUG_OBJECTS) { 2599 if (flags & SLAB_DEBUG_OBJECTS) {
@@ -2545,18 +2607,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2545 } 2607 }
2546 2608
2547 /* cache setup completed, link it into the list */ 2609 /* cache setup completed, link it into the list */
2548 list_add(&cachep->next, &cache_chain); 2610 list_add(&cachep->list, &slab_caches);
2549oops:
2550 if (!cachep && (flags & SLAB_PANIC))
2551 panic("kmem_cache_create(): failed to create slab `%s'\n",
2552 name);
2553 if (slab_is_available()) {
2554 mutex_unlock(&cache_chain_mutex);
2555 put_online_cpus();
2556 }
2557 return cachep; 2611 return cachep;
2558} 2612}
2559EXPORT_SYMBOL(kmem_cache_create);
2560 2613
2561#if DEBUG 2614#if DEBUG
2562static void check_irq_off(void) 2615static void check_irq_off(void)
@@ -2671,7 +2724,7 @@ out:
2671 return nr_freed; 2724 return nr_freed;
2672} 2725}
2673 2726
2674/* Called with cache_chain_mutex held to protect against cpu hotplug */ 2727/* Called with slab_mutex held to protect against cpu hotplug */
2675static int __cache_shrink(struct kmem_cache *cachep) 2728static int __cache_shrink(struct kmem_cache *cachep)
2676{ 2729{
2677 int ret = 0, i = 0; 2730 int ret = 0, i = 0;
@@ -2706,9 +2759,9 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
2706 BUG_ON(!cachep || in_interrupt()); 2759 BUG_ON(!cachep || in_interrupt());
2707 2760
2708 get_online_cpus(); 2761 get_online_cpus();
2709 mutex_lock(&cache_chain_mutex); 2762 mutex_lock(&slab_mutex);
2710 ret = __cache_shrink(cachep); 2763 ret = __cache_shrink(cachep);
2711 mutex_unlock(&cache_chain_mutex); 2764 mutex_unlock(&slab_mutex);
2712 put_online_cpus(); 2765 put_online_cpus();
2713 return ret; 2766 return ret;
2714} 2767}
@@ -2736,15 +2789,15 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
2736 2789
2737 /* Find the cache in the chain of caches. */ 2790 /* Find the cache in the chain of caches. */
2738 get_online_cpus(); 2791 get_online_cpus();
2739 mutex_lock(&cache_chain_mutex); 2792 mutex_lock(&slab_mutex);
2740 /* 2793 /*
2741 * the chain is never empty, cache_cache is never destroyed 2794 * the chain is never empty, cache_cache is never destroyed
2742 */ 2795 */
2743 list_del(&cachep->next); 2796 list_del(&cachep->list);
2744 if (__cache_shrink(cachep)) { 2797 if (__cache_shrink(cachep)) {
2745 slab_error(cachep, "Can't free all objects"); 2798 slab_error(cachep, "Can't free all objects");
2746 list_add(&cachep->next, &cache_chain); 2799 list_add(&cachep->list, &slab_caches);
2747 mutex_unlock(&cache_chain_mutex); 2800 mutex_unlock(&slab_mutex);
2748 put_online_cpus(); 2801 put_online_cpus();
2749 return; 2802 return;
2750 } 2803 }
@@ -2753,7 +2806,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
2753 rcu_barrier(); 2806 rcu_barrier();
2754 2807
2755 __kmem_cache_destroy(cachep); 2808 __kmem_cache_destroy(cachep);
2756 mutex_unlock(&cache_chain_mutex); 2809 mutex_unlock(&slab_mutex);
2757 put_online_cpus(); 2810 put_online_cpus();
2758} 2811}
2759EXPORT_SYMBOL(kmem_cache_destroy); 2812EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2840,10 +2893,10 @@ static void cache_init_objs(struct kmem_cache *cachep,
2840 slab_error(cachep, "constructor overwrote the" 2893 slab_error(cachep, "constructor overwrote the"
2841 " start of an object"); 2894 " start of an object");
2842 } 2895 }
2843 if ((cachep->buffer_size % PAGE_SIZE) == 0 && 2896 if ((cachep->size % PAGE_SIZE) == 0 &&
2844 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2897 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2845 kernel_map_pages(virt_to_page(objp), 2898 kernel_map_pages(virt_to_page(objp),
2846 cachep->buffer_size / PAGE_SIZE, 0); 2899 cachep->size / PAGE_SIZE, 0);
2847#else 2900#else
2848 if (cachep->ctor) 2901 if (cachep->ctor)
2849 cachep->ctor(objp); 2902 cachep->ctor(objp);
@@ -2857,9 +2910,9 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2857{ 2910{
2858 if (CONFIG_ZONE_DMA_FLAG) { 2911 if (CONFIG_ZONE_DMA_FLAG) {
2859 if (flags & GFP_DMA) 2912 if (flags & GFP_DMA)
2860 BUG_ON(!(cachep->gfpflags & GFP_DMA)); 2913 BUG_ON(!(cachep->allocflags & GFP_DMA));
2861 else 2914 else
2862 BUG_ON(cachep->gfpflags & GFP_DMA); 2915 BUG_ON(cachep->allocflags & GFP_DMA);
2863 } 2916 }
2864} 2917}
2865 2918
@@ -2918,8 +2971,8 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2918 nr_pages <<= cache->gfporder; 2971 nr_pages <<= cache->gfporder;
2919 2972
2920 do { 2973 do {
2921 page_set_cache(page, cache); 2974 page->slab_cache = cache;
2922 page_set_slab(page, slab); 2975 page->slab_page = slab;
2923 page++; 2976 page++;
2924 } while (--nr_pages); 2977 } while (--nr_pages);
2925} 2978}
@@ -3057,7 +3110,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
3057 kfree_debugcheck(objp); 3110 kfree_debugcheck(objp);
3058 page = virt_to_head_page(objp); 3111 page = virt_to_head_page(objp);
3059 3112
3060 slabp = page_get_slab(page); 3113 slabp = page->slab_page;
3061 3114
3062 if (cachep->flags & SLAB_RED_ZONE) { 3115 if (cachep->flags & SLAB_RED_ZONE) {
3063 verify_redzone_free(cachep, objp); 3116 verify_redzone_free(cachep, objp);
@@ -3077,10 +3130,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
3077#endif 3130#endif
3078 if (cachep->flags & SLAB_POISON) { 3131 if (cachep->flags & SLAB_POISON) {
3079#ifdef CONFIG_DEBUG_PAGEALLOC 3132#ifdef CONFIG_DEBUG_PAGEALLOC
3080 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 3133 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
3081 store_stackinfo(cachep, objp, (unsigned long)caller); 3134 store_stackinfo(cachep, objp, (unsigned long)caller);
3082 kernel_map_pages(virt_to_page(objp), 3135 kernel_map_pages(virt_to_page(objp),
3083 cachep->buffer_size / PAGE_SIZE, 0); 3136 cachep->size / PAGE_SIZE, 0);
3084 } else { 3137 } else {
3085 poison_obj(cachep, objp, POISON_FREE); 3138 poison_obj(cachep, objp, POISON_FREE);
3086 } 3139 }
@@ -3120,16 +3173,19 @@ bad:
3120#define check_slabp(x,y) do { } while(0) 3173#define check_slabp(x,y) do { } while(0)
3121#endif 3174#endif
3122 3175
3123static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) 3176static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
3177 bool force_refill)
3124{ 3178{
3125 int batchcount; 3179 int batchcount;
3126 struct kmem_list3 *l3; 3180 struct kmem_list3 *l3;
3127 struct array_cache *ac; 3181 struct array_cache *ac;
3128 int node; 3182 int node;
3129 3183
3130retry:
3131 check_irq_off(); 3184 check_irq_off();
3132 node = numa_mem_id(); 3185 node = numa_mem_id();
3186 if (unlikely(force_refill))
3187 goto force_grow;
3188retry:
3133 ac = cpu_cache_get(cachep); 3189 ac = cpu_cache_get(cachep);
3134 batchcount = ac->batchcount; 3190 batchcount = ac->batchcount;
3135 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 3191 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3179,8 +3235,8 @@ retry:
3179 STATS_INC_ACTIVE(cachep); 3235 STATS_INC_ACTIVE(cachep);
3180 STATS_SET_HIGH(cachep); 3236 STATS_SET_HIGH(cachep);
3181 3237
3182 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, 3238 ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
3183 node); 3239 node));
3184 } 3240 }
3185 check_slabp(cachep, slabp); 3241 check_slabp(cachep, slabp);
3186 3242
@@ -3199,18 +3255,22 @@ alloc_done:
3199 3255
3200 if (unlikely(!ac->avail)) { 3256 if (unlikely(!ac->avail)) {
3201 int x; 3257 int x;
3258force_grow:
3202 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 3259 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3203 3260
3204 /* cache_grow can reenable interrupts, then ac could change. */ 3261 /* cache_grow can reenable interrupts, then ac could change. */
3205 ac = cpu_cache_get(cachep); 3262 ac = cpu_cache_get(cachep);
3206 if (!x && ac->avail == 0) /* no objects in sight? abort */ 3263
3264 /* no objects in sight? abort */
3265 if (!x && (ac->avail == 0 || force_refill))
3207 return NULL; 3266 return NULL;
3208 3267
3209 if (!ac->avail) /* objects refilled by interrupt? */ 3268 if (!ac->avail) /* objects refilled by interrupt? */
3210 goto retry; 3269 goto retry;
3211 } 3270 }
3212 ac->touched = 1; 3271 ac->touched = 1;
3213 return ac->entry[--ac->avail]; 3272
3273 return ac_get_obj(cachep, ac, flags, force_refill);
3214} 3274}
3215 3275
3216static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 3276static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -3230,9 +3290,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3230 return objp; 3290 return objp;
3231 if (cachep->flags & SLAB_POISON) { 3291 if (cachep->flags & SLAB_POISON) {
3232#ifdef CONFIG_DEBUG_PAGEALLOC 3292#ifdef CONFIG_DEBUG_PAGEALLOC
3233 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 3293 if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
3234 kernel_map_pages(virt_to_page(objp), 3294 kernel_map_pages(virt_to_page(objp),
3235 cachep->buffer_size / PAGE_SIZE, 1); 3295 cachep->size / PAGE_SIZE, 1);
3236 else 3296 else
3237 check_poison_obj(cachep, objp); 3297 check_poison_obj(cachep, objp);
3238#else 3298#else
@@ -3261,8 +3321,8 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3261 struct slab *slabp; 3321 struct slab *slabp;
3262 unsigned objnr; 3322 unsigned objnr;
3263 3323
3264 slabp = page_get_slab(virt_to_head_page(objp)); 3324 slabp = virt_to_head_page(objp)->slab_page;
3265 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 3325 objnr = (unsigned)(objp - slabp->s_mem) / cachep->size;
3266 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; 3326 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
3267 } 3327 }
3268#endif 3328#endif
@@ -3285,30 +3345,42 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3285 if (cachep == &cache_cache) 3345 if (cachep == &cache_cache)
3286 return false; 3346 return false;
3287 3347
3288 return should_failslab(obj_size(cachep), flags, cachep->flags); 3348 return should_failslab(cachep->object_size, flags, cachep->flags);
3289} 3349}
3290 3350
3291static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3351static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3292{ 3352{
3293 void *objp; 3353 void *objp;
3294 struct array_cache *ac; 3354 struct array_cache *ac;
3355 bool force_refill = false;
3295 3356
3296 check_irq_off(); 3357 check_irq_off();
3297 3358
3298 ac = cpu_cache_get(cachep); 3359 ac = cpu_cache_get(cachep);
3299 if (likely(ac->avail)) { 3360 if (likely(ac->avail)) {
3300 STATS_INC_ALLOCHIT(cachep);
3301 ac->touched = 1; 3361 ac->touched = 1;
3302 objp = ac->entry[--ac->avail]; 3362 objp = ac_get_obj(cachep, ac, flags, false);
3303 } else { 3363
3304 STATS_INC_ALLOCMISS(cachep);
3305 objp = cache_alloc_refill(cachep, flags);
3306 /* 3364 /*
3307 * the 'ac' may be updated by cache_alloc_refill(), 3365 * Allow for the possibility all avail objects are not allowed
3308 * and kmemleak_erase() requires its correct value. 3366 * by the current flags
3309 */ 3367 */
3310 ac = cpu_cache_get(cachep); 3368 if (objp) {
3369 STATS_INC_ALLOCHIT(cachep);
3370 goto out;
3371 }
3372 force_refill = true;
3311 } 3373 }
3374
3375 STATS_INC_ALLOCMISS(cachep);
3376 objp = cache_alloc_refill(cachep, flags, force_refill);
3377 /*
3378 * the 'ac' may be updated by cache_alloc_refill(),
3379 * and kmemleak_erase() requires its correct value.
3380 */
3381 ac = cpu_cache_get(cachep);
3382
3383out:
3312 /* 3384 /*
3313 * To avoid a false negative, if an object that is in one of the 3385 * To avoid a false negative, if an object that is in one of the
3314 * per-CPU caches is leaked, we need to make sure kmemleak doesn't 3386 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
@@ -3336,7 +3408,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3336 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3408 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3337 nid_alloc = cpuset_slab_spread_node(); 3409 nid_alloc = cpuset_slab_spread_node();
3338 else if (current->mempolicy) 3410 else if (current->mempolicy)
3339 nid_alloc = slab_node(current->mempolicy); 3411 nid_alloc = slab_node();
3340 if (nid_alloc != nid_here) 3412 if (nid_alloc != nid_here)
3341 return ____cache_alloc_node(cachep, flags, nid_alloc); 3413 return ____cache_alloc_node(cachep, flags, nid_alloc);
3342 return NULL; 3414 return NULL;
@@ -3368,7 +3440,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3368 3440
3369retry_cpuset: 3441retry_cpuset:
3370 cpuset_mems_cookie = get_mems_allowed(); 3442 cpuset_mems_cookie = get_mems_allowed();
3371 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 3443 zonelist = node_zonelist(slab_node(), flags);
3372 3444
3373retry: 3445retry:
3374 /* 3446 /*
@@ -3545,14 +3617,14 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3545 out: 3617 out:
3546 local_irq_restore(save_flags); 3618 local_irq_restore(save_flags);
3547 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3619 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3548 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, 3620 kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
3549 flags); 3621 flags);
3550 3622
3551 if (likely(ptr)) 3623 if (likely(ptr))
3552 kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); 3624 kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
3553 3625
3554 if (unlikely((flags & __GFP_ZERO) && ptr)) 3626 if (unlikely((flags & __GFP_ZERO) && ptr))
3555 memset(ptr, 0, obj_size(cachep)); 3627 memset(ptr, 0, cachep->object_size);
3556 3628
3557 return ptr; 3629 return ptr;
3558} 3630}
@@ -3607,15 +3679,15 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3607 objp = __do_cache_alloc(cachep, flags); 3679 objp = __do_cache_alloc(cachep, flags);
3608 local_irq_restore(save_flags); 3680 local_irq_restore(save_flags);
3609 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3681 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3610 kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, 3682 kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
3611 flags); 3683 flags);
3612 prefetchw(objp); 3684 prefetchw(objp);
3613 3685
3614 if (likely(objp)) 3686 if (likely(objp))
3615 kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); 3687 kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
3616 3688
3617 if (unlikely((flags & __GFP_ZERO) && objp)) 3689 if (unlikely((flags & __GFP_ZERO) && objp))
3618 memset(objp, 0, obj_size(cachep)); 3690 memset(objp, 0, cachep->object_size);
3619 3691
3620 return objp; 3692 return objp;
3621} 3693}
@@ -3630,9 +3702,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3630 struct kmem_list3 *l3; 3702 struct kmem_list3 *l3;
3631 3703
3632 for (i = 0; i < nr_objects; i++) { 3704 for (i = 0; i < nr_objects; i++) {
3633 void *objp = objpp[i]; 3705 void *objp;
3634 struct slab *slabp; 3706 struct slab *slabp;
3635 3707
3708 clear_obj_pfmemalloc(&objpp[i]);
3709 objp = objpp[i];
3710
3636 slabp = virt_to_slab(objp); 3711 slabp = virt_to_slab(objp);
3637 l3 = cachep->nodelists[node]; 3712 l3 = cachep->nodelists[node];
3638 list_del(&slabp->list); 3713 list_del(&slabp->list);
@@ -3731,7 +3806,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3731 kmemleak_free_recursive(objp, cachep->flags); 3806 kmemleak_free_recursive(objp, cachep->flags);
3732 objp = cache_free_debugcheck(cachep, objp, caller); 3807 objp = cache_free_debugcheck(cachep, objp, caller);
3733 3808
3734 kmemcheck_slab_free(cachep, objp, obj_size(cachep)); 3809 kmemcheck_slab_free(cachep, objp, cachep->object_size);
3735 3810
3736 /* 3811 /*
3737 * Skip calling cache_free_alien() when the platform is not numa. 3812 * Skip calling cache_free_alien() when the platform is not numa.
@@ -3750,7 +3825,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3750 cache_flusharray(cachep, ac); 3825 cache_flusharray(cachep, ac);
3751 } 3826 }
3752 3827
3753 ac->entry[ac->avail++] = objp; 3828 ac_put_obj(cachep, ac, objp);
3754} 3829}
3755 3830
3756/** 3831/**
@@ -3766,7 +3841,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3766 void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); 3841 void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3767 3842
3768 trace_kmem_cache_alloc(_RET_IP_, ret, 3843 trace_kmem_cache_alloc(_RET_IP_, ret,
3769 obj_size(cachep), cachep->buffer_size, flags); 3844 cachep->object_size, cachep->size, flags);
3770 3845
3771 return ret; 3846 return ret;
3772} 3847}
@@ -3794,7 +3869,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3794 __builtin_return_address(0)); 3869 __builtin_return_address(0));
3795 3870
3796 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3871 trace_kmem_cache_alloc_node(_RET_IP_, ret,
3797 obj_size(cachep), cachep->buffer_size, 3872 cachep->object_size, cachep->size,
3798 flags, nodeid); 3873 flags, nodeid);
3799 3874
3800 return ret; 3875 return ret;
@@ -3876,7 +3951,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3876 ret = __cache_alloc(cachep, flags, caller); 3951 ret = __cache_alloc(cachep, flags, caller);
3877 3952
3878 trace_kmalloc((unsigned long) caller, ret, 3953 trace_kmalloc((unsigned long) caller, ret,
3879 size, cachep->buffer_size, flags); 3954 size, cachep->size, flags);
3880 3955
3881 return ret; 3956 return ret;
3882} 3957}
@@ -3916,9 +3991,9 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3916 unsigned long flags; 3991 unsigned long flags;
3917 3992
3918 local_irq_save(flags); 3993 local_irq_save(flags);
3919 debug_check_no_locks_freed(objp, obj_size(cachep)); 3994 debug_check_no_locks_freed(objp, cachep->object_size);
3920 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3995 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3921 debug_check_no_obj_freed(objp, obj_size(cachep)); 3996 debug_check_no_obj_freed(objp, cachep->object_size);
3922 __cache_free(cachep, objp, __builtin_return_address(0)); 3997 __cache_free(cachep, objp, __builtin_return_address(0));
3923 local_irq_restore(flags); 3998 local_irq_restore(flags);
3924 3999
@@ -3947,8 +4022,9 @@ void kfree(const void *objp)
3947 local_irq_save(flags); 4022 local_irq_save(flags);
3948 kfree_debugcheck(objp); 4023 kfree_debugcheck(objp);
3949 c = virt_to_cache(objp); 4024 c = virt_to_cache(objp);
3950 debug_check_no_locks_freed(objp, obj_size(c)); 4025 debug_check_no_locks_freed(objp, c->object_size);
3951 debug_check_no_obj_freed(objp, obj_size(c)); 4026
4027 debug_check_no_obj_freed(objp, c->object_size);
3952 __cache_free(c, (void *)objp, __builtin_return_address(0)); 4028 __cache_free(c, (void *)objp, __builtin_return_address(0));
3953 local_irq_restore(flags); 4029 local_irq_restore(flags);
3954} 4030}
@@ -3956,7 +4032,7 @@ EXPORT_SYMBOL(kfree);
3956 4032
3957unsigned int kmem_cache_size(struct kmem_cache *cachep) 4033unsigned int kmem_cache_size(struct kmem_cache *cachep)
3958{ 4034{
3959 return obj_size(cachep); 4035 return cachep->object_size;
3960} 4036}
3961EXPORT_SYMBOL(kmem_cache_size); 4037EXPORT_SYMBOL(kmem_cache_size);
3962 4038
@@ -4030,7 +4106,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
4030 return 0; 4106 return 0;
4031 4107
4032fail: 4108fail:
4033 if (!cachep->next.next) { 4109 if (!cachep->list.next) {
4034 /* Cache is not active yet. Roll back what we did */ 4110 /* Cache is not active yet. Roll back what we did */
4035 node--; 4111 node--;
4036 while (node >= 0) { 4112 while (node >= 0) {
@@ -4065,7 +4141,7 @@ static void do_ccupdate_local(void *info)
4065 new->new[smp_processor_id()] = old; 4141 new->new[smp_processor_id()] = old;
4066} 4142}
4067 4143
4068/* Always called with the cache_chain_mutex held */ 4144/* Always called with the slab_mutex held */
4069static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 4145static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4070 int batchcount, int shared, gfp_t gfp) 4146 int batchcount, int shared, gfp_t gfp)
4071{ 4147{
@@ -4109,7 +4185,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4109 return alloc_kmemlist(cachep, gfp); 4185 return alloc_kmemlist(cachep, gfp);
4110} 4186}
4111 4187
4112/* Called with cache_chain_mutex held always */ 4188/* Called with slab_mutex held always */
4113static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) 4189static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4114{ 4190{
4115 int err; 4191 int err;
@@ -4124,13 +4200,13 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4124 * The numbers are guessed, we should auto-tune as described by 4200 * The numbers are guessed, we should auto-tune as described by
4125 * Bonwick. 4201 * Bonwick.
4126 */ 4202 */
4127 if (cachep->buffer_size > 131072) 4203 if (cachep->size > 131072)
4128 limit = 1; 4204 limit = 1;
4129 else if (cachep->buffer_size > PAGE_SIZE) 4205 else if (cachep->size > PAGE_SIZE)
4130 limit = 8; 4206 limit = 8;
4131 else if (cachep->buffer_size > 1024) 4207 else if (cachep->size > 1024)
4132 limit = 24; 4208 limit = 24;
4133 else if (cachep->buffer_size > 256) 4209 else if (cachep->size > 256)
4134 limit = 54; 4210 limit = 54;
4135 else 4211 else
4136 limit = 120; 4212 limit = 120;
@@ -4145,7 +4221,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4145 * to a larger limit. Thus disabled by default. 4221 * to a larger limit. Thus disabled by default.
4146 */ 4222 */
4147 shared = 0; 4223 shared = 0;
4148 if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) 4224 if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
4149 shared = 8; 4225 shared = 8;
4150 4226
4151#if DEBUG 4227#if DEBUG
@@ -4211,11 +4287,11 @@ static void cache_reap(struct work_struct *w)
4211 int node = numa_mem_id(); 4287 int node = numa_mem_id();
4212 struct delayed_work *work = to_delayed_work(w); 4288 struct delayed_work *work = to_delayed_work(w);
4213 4289
4214 if (!mutex_trylock(&cache_chain_mutex)) 4290 if (!mutex_trylock(&slab_mutex))
4215 /* Give up. Setup the next iteration. */ 4291 /* Give up. Setup the next iteration. */
4216 goto out; 4292 goto out;
4217 4293
4218 list_for_each_entry(searchp, &cache_chain, next) { 4294 list_for_each_entry(searchp, &slab_caches, list) {
4219 check_irq_on(); 4295 check_irq_on();
4220 4296
4221 /* 4297 /*
@@ -4253,7 +4329,7 @@ next:
4253 cond_resched(); 4329 cond_resched();
4254 } 4330 }
4255 check_irq_on(); 4331 check_irq_on();
4256 mutex_unlock(&cache_chain_mutex); 4332 mutex_unlock(&slab_mutex);
4257 next_reap_node(); 4333 next_reap_node();
4258out: 4334out:
4259 /* Set up the next iteration */ 4335 /* Set up the next iteration */
@@ -4289,26 +4365,26 @@ static void *s_start(struct seq_file *m, loff_t *pos)
4289{ 4365{
4290 loff_t n = *pos; 4366 loff_t n = *pos;
4291 4367
4292 mutex_lock(&cache_chain_mutex); 4368 mutex_lock(&slab_mutex);
4293 if (!n) 4369 if (!n)
4294 print_slabinfo_header(m); 4370 print_slabinfo_header(m);
4295 4371
4296 return seq_list_start(&cache_chain, *pos); 4372 return seq_list_start(&slab_caches, *pos);
4297} 4373}
4298 4374
4299static void *s_next(struct seq_file *m, void *p, loff_t *pos) 4375static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4300{ 4376{
4301 return seq_list_next(p, &cache_chain, pos); 4377 return seq_list_next(p, &slab_caches, pos);
4302} 4378}
4303 4379
4304static void s_stop(struct seq_file *m, void *p) 4380static void s_stop(struct seq_file *m, void *p)
4305{ 4381{
4306 mutex_unlock(&cache_chain_mutex); 4382 mutex_unlock(&slab_mutex);
4307} 4383}
4308 4384
4309static int s_show(struct seq_file *m, void *p) 4385static int s_show(struct seq_file *m, void *p)
4310{ 4386{
4311 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); 4387 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4312 struct slab *slabp; 4388 struct slab *slabp;
4313 unsigned long active_objs; 4389 unsigned long active_objs;
4314 unsigned long num_objs; 4390 unsigned long num_objs;
@@ -4364,7 +4440,7 @@ static int s_show(struct seq_file *m, void *p)
4364 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 4440 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4365 4441
4366 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 4442 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
4367 name, active_objs, num_objs, cachep->buffer_size, 4443 name, active_objs, num_objs, cachep->size,
4368 cachep->num, (1 << cachep->gfporder)); 4444 cachep->num, (1 << cachep->gfporder));
4369 seq_printf(m, " : tunables %4u %4u %4u", 4445 seq_printf(m, " : tunables %4u %4u %4u",
4370 cachep->limit, cachep->batchcount, cachep->shared); 4446 cachep->limit, cachep->batchcount, cachep->shared);
@@ -4454,9 +4530,9 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4454 return -EINVAL; 4530 return -EINVAL;
4455 4531
4456 /* Find the cache in the chain of caches. */ 4532 /* Find the cache in the chain of caches. */
4457 mutex_lock(&cache_chain_mutex); 4533 mutex_lock(&slab_mutex);
4458 res = -EINVAL; 4534 res = -EINVAL;
4459 list_for_each_entry(cachep, &cache_chain, next) { 4535 list_for_each_entry(cachep, &slab_caches, list) {
4460 if (!strcmp(cachep->name, kbuf)) { 4536 if (!strcmp(cachep->name, kbuf)) {
4461 if (limit < 1 || batchcount < 1 || 4537 if (limit < 1 || batchcount < 1 ||
4462 batchcount > limit || shared < 0) { 4538 batchcount > limit || shared < 0) {
@@ -4469,7 +4545,7 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4469 break; 4545 break;
4470 } 4546 }
4471 } 4547 }
4472 mutex_unlock(&cache_chain_mutex); 4548 mutex_unlock(&slab_mutex);
4473 if (res >= 0) 4549 if (res >= 0)
4474 res = count; 4550 res = count;
4475 return res; 4551 return res;
@@ -4492,8 +4568,8 @@ static const struct file_operations proc_slabinfo_operations = {
4492 4568
4493static void *leaks_start(struct seq_file *m, loff_t *pos) 4569static void *leaks_start(struct seq_file *m, loff_t *pos)
4494{ 4570{
4495 mutex_lock(&cache_chain_mutex); 4571 mutex_lock(&slab_mutex);
4496 return seq_list_start(&cache_chain, *pos); 4572 return seq_list_start(&slab_caches, *pos);
4497} 4573}
4498 4574
4499static inline int add_caller(unsigned long *n, unsigned long v) 4575static inline int add_caller(unsigned long *n, unsigned long v)
@@ -4532,7 +4608,7 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4532 int i; 4608 int i;
4533 if (n[0] == n[1]) 4609 if (n[0] == n[1])
4534 return; 4610 return;
4535 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { 4611 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) {
4536 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) 4612 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4537 continue; 4613 continue;
4538 if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) 4614 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
@@ -4558,7 +4634,7 @@ static void show_symbol(struct seq_file *m, unsigned long address)
4558 4634
4559static int leaks_show(struct seq_file *m, void *p) 4635static int leaks_show(struct seq_file *m, void *p)
4560{ 4636{
4561 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); 4637 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4562 struct slab *slabp; 4638 struct slab *slabp;
4563 struct kmem_list3 *l3; 4639 struct kmem_list3 *l3;
4564 const char *name; 4640 const char *name;
@@ -4592,17 +4668,17 @@ static int leaks_show(struct seq_file *m, void *p)
4592 name = cachep->name; 4668 name = cachep->name;
4593 if (n[0] == n[1]) { 4669 if (n[0] == n[1]) {
4594 /* Increase the buffer size */ 4670 /* Increase the buffer size */
4595 mutex_unlock(&cache_chain_mutex); 4671 mutex_unlock(&slab_mutex);
4596 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); 4672 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4597 if (!m->private) { 4673 if (!m->private) {
4598 /* Too bad, we are really out */ 4674 /* Too bad, we are really out */
4599 m->private = n; 4675 m->private = n;
4600 mutex_lock(&cache_chain_mutex); 4676 mutex_lock(&slab_mutex);
4601 return -ENOMEM; 4677 return -ENOMEM;
4602 } 4678 }
4603 *(unsigned long *)m->private = n[0] * 2; 4679 *(unsigned long *)m->private = n[0] * 2;
4604 kfree(n); 4680 kfree(n);
4605 mutex_lock(&cache_chain_mutex); 4681 mutex_lock(&slab_mutex);
4606 /* Now make sure this entry will be retried */ 4682 /* Now make sure this entry will be retried */
4607 m->count = m->size; 4683 m->count = m->size;
4608 return 0; 4684 return 0;
@@ -4677,6 +4753,6 @@ size_t ksize(const void *objp)
4677 if (unlikely(objp == ZERO_SIZE_PTR)) 4753 if (unlikely(objp == ZERO_SIZE_PTR))
4678 return 0; 4754 return 0;
4679 4755
4680 return obj_size(virt_to_cache(objp)); 4756 return virt_to_cache(objp)->object_size;
4681} 4757}
4682EXPORT_SYMBOL(ksize); 4758EXPORT_SYMBOL(ksize);
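For reference: the slab.c hunks above begin calling ac_get_obj(), ac_put_obj() and clear_obj_pfmemalloc() without their definitions appearing in this excerpt. The intent is to tag array-cache entries that came from pfmemalloc (memory-reserve) pages so they are only handed to callers entitled to those reserves. A minimal sketch of such helpers, assuming the tag is carried in bit 0 of the stored pointer (the macro name and bit choice are assumptions, not taken from this diff):

/* Sketch only: mark objects that came from pfmemalloc pages by setting
 * bit 0 of the pointer kept in the per-CPU array cache. */
#define SLAB_OBJ_PFMEMALLOC	1UL	/* assumed tag value */

static inline bool is_obj_pfmemalloc(void *objp)
{
	return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
}

static inline void set_obj_pfmemalloc(void **objp)
{
	*objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
}

static inline void clear_obj_pfmemalloc(void **objp)
{
	*objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
}

On this reading, ac_put_obj() would apply set_obj_pfmemalloc() when the slab page is marked SlabPfmemalloc, and ac_get_obj() would skip or untag such entries depending on whether the current allocation may dip into the reserves.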
diff --git a/mm/slab.h b/mm/slab.h
new file mode 100644
index 000000000000..db7848caaa25
--- /dev/null
+++ b/mm/slab.h
@@ -0,0 +1,33 @@
1#ifndef MM_SLAB_H
2#define MM_SLAB_H
3/*
4 * Internal slab definitions
5 */
6
7/*
8 * State of the slab allocator.
9 *
10 * This is used to describe the states of the allocator during bootup.
11 * Allocators use this to gradually bootstrap themselves. Most allocators
12 * have the problem that the structures used for managing slab caches are
13 * allocated from slab caches themselves.
14 */
15enum slab_state {
16 DOWN, /* No slab functionality yet */
17 PARTIAL, /* SLUB: kmem_cache_node available */
18 PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */
19 PARTIAL_L3, /* SLAB: kmalloc size for l3 struct available */
20 UP, /* Slab caches usable but not all extras yet */
21 FULL /* Everything is working */
22};
23
24extern enum slab_state slab_state;
25
26/* The slab cache mutex protects the management structures during changes */
27extern struct mutex slab_mutex;
28extern struct list_head slab_caches;
29
30struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
31 size_t align, unsigned long flags, void (*ctor)(void *));
32
33#endif
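The shared slab_state above is what the new slab_is_available() in slab_common.c tests (slab_state >= UP). For orientation, a hedged sketch of how allocator-internal code typically consults it while the allocator is still bootstrapping; the helper name is invented for illustration:

#include <linux/gfp.h>
#include "slab.h"

/* Illustrative helper, not part of the patch: pick a GFP mask that is
 * safe before the slab allocator is fully up (must not sleep). */
static gfp_t bootstrap_gfp_flags(void)
{
	return slab_is_available() ? GFP_KERNEL : GFP_NOWAIT;
}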
diff --git a/mm/slab_common.c b/mm/slab_common.c
new file mode 100644
index 000000000000..aa3ca5bb01b5
--- /dev/null
+++ b/mm/slab_common.c
@@ -0,0 +1,120 @@
1/*
2 * Slab allocator functions that are independent of the allocator strategy
3 *
4 * (C) 2012 Christoph Lameter <cl@linux.com>
5 */
6#include <linux/slab.h>
7
8#include <linux/mm.h>
9#include <linux/poison.h>
10#include <linux/interrupt.h>
11#include <linux/memory.h>
12#include <linux/compiler.h>
13#include <linux/module.h>
14#include <linux/cpu.h>
15#include <linux/uaccess.h>
16#include <asm/cacheflush.h>
17#include <asm/tlbflush.h>
18#include <asm/page.h>
19
20#include "slab.h"
21
22enum slab_state slab_state;
23LIST_HEAD(slab_caches);
24DEFINE_MUTEX(slab_mutex);
25
26/*
27 * kmem_cache_create - Create a cache.
28 * @name: A string which is used in /proc/slabinfo to identify this cache.
29 * @size: The size of objects to be created in this cache.
30 * @align: The required alignment for the objects.
31 * @flags: SLAB flags
32 * @ctor: A constructor for the objects.
33 *
34 * Returns a ptr to the cache on success, NULL on failure.
 35 * Cannot be called within an interrupt, but can be interrupted.
36 * The @ctor is run when new pages are allocated by the cache.
37 *
38 * The flags are
39 *
40 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
41 * to catch references to uninitialised memory.
42 *
43 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
44 * for buffer overruns.
45 *
46 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
47 * cacheline. This can be beneficial if you're counting cycles as closely
48 * as davem.
49 */
50
51struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align,
52 unsigned long flags, void (*ctor)(void *))
53{
54 struct kmem_cache *s = NULL;
55
56#ifdef CONFIG_DEBUG_VM
57 if (!name || in_interrupt() || size < sizeof(void *) ||
58 size > KMALLOC_MAX_SIZE) {
59 printk(KERN_ERR "kmem_cache_create(%s) integrity check"
60 " failed\n", name);
61 goto out;
62 }
63#endif
64
65 get_online_cpus();
66 mutex_lock(&slab_mutex);
67
68#ifdef CONFIG_DEBUG_VM
69 list_for_each_entry(s, &slab_caches, list) {
70 char tmp;
71 int res;
72
73 /*
74 * This happens when the module gets unloaded and doesn't
75 * destroy its slab cache and no-one else reuses the vmalloc
76 * area of the module. Print a warning.
77 */
78 res = probe_kernel_address(s->name, tmp);
79 if (res) {
80 printk(KERN_ERR
81 "Slab cache with size %d has lost its name\n",
82 s->object_size);
83 continue;
84 }
85
86 if (!strcmp(s->name, name)) {
87 printk(KERN_ERR "kmem_cache_create(%s): Cache name"
88 " already exists.\n",
89 name);
90 dump_stack();
91 s = NULL;
92 goto oops;
93 }
94 }
95
96 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
97#endif
98
99 s = __kmem_cache_create(name, size, align, flags, ctor);
100
101#ifdef CONFIG_DEBUG_VM
102oops:
103#endif
104 mutex_unlock(&slab_mutex);
105 put_online_cpus();
106
107#ifdef CONFIG_DEBUG_VM
108out:
109#endif
110 if (!s && (flags & SLAB_PANIC))
111 panic("kmem_cache_create: Failed to create slab '%s'\n", name);
112
113 return s;
114}
115EXPORT_SYMBOL(kmem_cache_create);
116
117int slab_is_available(void)
118{
119 return slab_state >= UP;
120}
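A hypothetical caller of the consolidated kmem_cache_create(), shown only to illustrate the API documented in the comment block above; the 'foo' names are invented:

#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>

/* Hypothetical example cache, not from this patch. */
struct foo {
	int id;
	struct list_head node;
};

static struct kmem_cache *foo_cachep;

static int __init foo_cache_init(void)
{
	/* The duplicate-name and sanity checks, plus slab_mutex handling,
	 * now happen in the common code above before the allocator-specific
	 * __kmem_cache_create() is invoked. */
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
				       SLAB_HWCACHE_ALIGN, NULL);
	return foo_cachep ? 0 : -ENOMEM;
}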
diff --git a/mm/slob.c b/mm/slob.c
index 8105be42cad1..45d4ca79933a 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -59,6 +59,8 @@
59 59
60#include <linux/kernel.h> 60#include <linux/kernel.h>
61#include <linux/slab.h> 61#include <linux/slab.h>
62#include "slab.h"
63
62#include <linux/mm.h> 64#include <linux/mm.h>
63#include <linux/swap.h> /* struct reclaim_state */ 65#include <linux/swap.h> /* struct reclaim_state */
64#include <linux/cache.h> 66#include <linux/cache.h>
@@ -92,36 +94,6 @@ struct slob_block {
92typedef struct slob_block slob_t; 94typedef struct slob_block slob_t;
93 95
94/* 96/*
95 * We use struct page fields to manage some slob allocation aspects,
96 * however to avoid the horrible mess in include/linux/mm_types.h, we'll
97 * just define our own struct page type variant here.
98 */
99struct slob_page {
100 union {
101 struct {
102 unsigned long flags; /* mandatory */
103 atomic_t _count; /* mandatory */
104 slobidx_t units; /* free units left in page */
105 unsigned long pad[2];
106 slob_t *free; /* first free slob_t in page */
107 struct list_head list; /* linked list of free pages */
108 };
109 struct page page;
110 };
111};
112static inline void struct_slob_page_wrong_size(void)
113{ BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); }
114
115/*
116 * free_slob_page: call before a slob_page is returned to the page allocator.
117 */
118static inline void free_slob_page(struct slob_page *sp)
119{
120 reset_page_mapcount(&sp->page);
121 sp->page.mapping = NULL;
122}
123
124/*
125 * All partially free slob pages go on these lists. 97 * All partially free slob pages go on these lists.
126 */ 98 */
127#define SLOB_BREAK1 256 99#define SLOB_BREAK1 256
@@ -131,46 +103,23 @@ static LIST_HEAD(free_slob_medium);
131static LIST_HEAD(free_slob_large); 103static LIST_HEAD(free_slob_large);
132 104
133/* 105/*
134 * is_slob_page: True for all slob pages (false for bigblock pages)
135 */
136static inline int is_slob_page(struct slob_page *sp)
137{
138 return PageSlab((struct page *)sp);
139}
140
141static inline void set_slob_page(struct slob_page *sp)
142{
143 __SetPageSlab((struct page *)sp);
144}
145
146static inline void clear_slob_page(struct slob_page *sp)
147{
148 __ClearPageSlab((struct page *)sp);
149}
150
151static inline struct slob_page *slob_page(const void *addr)
152{
153 return (struct slob_page *)virt_to_page(addr);
154}
155
156/*
157 * slob_page_free: true for pages on free_slob_pages list. 106 * slob_page_free: true for pages on free_slob_pages list.
158 */ 107 */
159static inline int slob_page_free(struct slob_page *sp) 108static inline int slob_page_free(struct page *sp)
160{ 109{
161 return PageSlobFree((struct page *)sp); 110 return PageSlobFree(sp);
162} 111}
163 112
164static void set_slob_page_free(struct slob_page *sp, struct list_head *list) 113static void set_slob_page_free(struct page *sp, struct list_head *list)
165{ 114{
166 list_add(&sp->list, list); 115 list_add(&sp->list, list);
167 __SetPageSlobFree((struct page *)sp); 116 __SetPageSlobFree(sp);
168} 117}
169 118
170static inline void clear_slob_page_free(struct slob_page *sp) 119static inline void clear_slob_page_free(struct page *sp)
171{ 120{
172 list_del(&sp->list); 121 list_del(&sp->list);
173 __ClearPageSlobFree((struct page *)sp); 122 __ClearPageSlobFree(sp);
174} 123}
175 124
176#define SLOB_UNIT sizeof(slob_t) 125#define SLOB_UNIT sizeof(slob_t)
@@ -267,12 +216,12 @@ static void slob_free_pages(void *b, int order)
267/* 216/*
268 * Allocate a slob block within a given slob_page sp. 217 * Allocate a slob block within a given slob_page sp.
269 */ 218 */
270static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) 219static void *slob_page_alloc(struct page *sp, size_t size, int align)
271{ 220{
272 slob_t *prev, *cur, *aligned = NULL; 221 slob_t *prev, *cur, *aligned = NULL;
273 int delta = 0, units = SLOB_UNITS(size); 222 int delta = 0, units = SLOB_UNITS(size);
274 223
275 for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { 224 for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
276 slobidx_t avail = slob_units(cur); 225 slobidx_t avail = slob_units(cur);
277 226
278 if (align) { 227 if (align) {
@@ -296,12 +245,12 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
296 if (prev) 245 if (prev)
297 set_slob(prev, slob_units(prev), next); 246 set_slob(prev, slob_units(prev), next);
298 else 247 else
299 sp->free = next; 248 sp->freelist = next;
300 } else { /* fragment */ 249 } else { /* fragment */
301 if (prev) 250 if (prev)
302 set_slob(prev, slob_units(prev), cur + units); 251 set_slob(prev, slob_units(prev), cur + units);
303 else 252 else
304 sp->free = cur + units; 253 sp->freelist = cur + units;
305 set_slob(cur + units, avail - units, next); 254 set_slob(cur + units, avail - units, next);
306 } 255 }
307 256
@@ -320,7 +269,7 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
320 */ 269 */
321static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) 270static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
322{ 271{
323 struct slob_page *sp; 272 struct page *sp;
324 struct list_head *prev; 273 struct list_head *prev;
325 struct list_head *slob_list; 274 struct list_head *slob_list;
326 slob_t *b = NULL; 275 slob_t *b = NULL;
@@ -341,7 +290,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
341 * If there's a node specification, search for a partial 290 * If there's a node specification, search for a partial
342 * page with a matching node id in the freelist. 291 * page with a matching node id in the freelist.
343 */ 292 */
344 if (node != -1 && page_to_nid(&sp->page) != node) 293 if (node != -1 && page_to_nid(sp) != node)
345 continue; 294 continue;
346#endif 295#endif
347 /* Enough room on this page? */ 296 /* Enough room on this page? */
@@ -369,12 +318,12 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
369 b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); 318 b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
370 if (!b) 319 if (!b)
371 return NULL; 320 return NULL;
372 sp = slob_page(b); 321 sp = virt_to_page(b);
373 set_slob_page(sp); 322 __SetPageSlab(sp);
374 323
375 spin_lock_irqsave(&slob_lock, flags); 324 spin_lock_irqsave(&slob_lock, flags);
376 sp->units = SLOB_UNITS(PAGE_SIZE); 325 sp->units = SLOB_UNITS(PAGE_SIZE);
377 sp->free = b; 326 sp->freelist = b;
378 INIT_LIST_HEAD(&sp->list); 327 INIT_LIST_HEAD(&sp->list);
379 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); 328 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
380 set_slob_page_free(sp, slob_list); 329 set_slob_page_free(sp, slob_list);
@@ -392,7 +341,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
392 */ 341 */
393static void slob_free(void *block, int size) 342static void slob_free(void *block, int size)
394{ 343{
395 struct slob_page *sp; 344 struct page *sp;
396 slob_t *prev, *next, *b = (slob_t *)block; 345 slob_t *prev, *next, *b = (slob_t *)block;
397 slobidx_t units; 346 slobidx_t units;
398 unsigned long flags; 347 unsigned long flags;
@@ -402,7 +351,7 @@ static void slob_free(void *block, int size)
402 return; 351 return;
403 BUG_ON(!size); 352 BUG_ON(!size);
404 353
405 sp = slob_page(block); 354 sp = virt_to_page(block);
406 units = SLOB_UNITS(size); 355 units = SLOB_UNITS(size);
407 356
408 spin_lock_irqsave(&slob_lock, flags); 357 spin_lock_irqsave(&slob_lock, flags);
@@ -412,8 +361,8 @@ static void slob_free(void *block, int size)
412 if (slob_page_free(sp)) 361 if (slob_page_free(sp))
413 clear_slob_page_free(sp); 362 clear_slob_page_free(sp);
414 spin_unlock_irqrestore(&slob_lock, flags); 363 spin_unlock_irqrestore(&slob_lock, flags);
415 clear_slob_page(sp); 364 __ClearPageSlab(sp);
416 free_slob_page(sp); 365 reset_page_mapcount(sp);
417 slob_free_pages(b, 0); 366 slob_free_pages(b, 0);
418 return; 367 return;
419 } 368 }
@@ -421,7 +370,7 @@ static void slob_free(void *block, int size)
421 if (!slob_page_free(sp)) { 370 if (!slob_page_free(sp)) {
422 /* This slob page is about to become partially free. Easy! */ 371 /* This slob page is about to become partially free. Easy! */
423 sp->units = units; 372 sp->units = units;
424 sp->free = b; 373 sp->freelist = b;
425 set_slob(b, units, 374 set_slob(b, units,
426 (void *)((unsigned long)(b + 375 (void *)((unsigned long)(b +
427 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); 376 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
@@ -441,15 +390,15 @@ static void slob_free(void *block, int size)
441 */ 390 */
442 sp->units += units; 391 sp->units += units;
443 392
444 if (b < sp->free) { 393 if (b < (slob_t *)sp->freelist) {
445 if (b + units == sp->free) { 394 if (b + units == sp->freelist) {
446 units += slob_units(sp->free); 395 units += slob_units(sp->freelist);
447 sp->free = slob_next(sp->free); 396 sp->freelist = slob_next(sp->freelist);
448 } 397 }
449 set_slob(b, units, sp->free); 398 set_slob(b, units, sp->freelist);
450 sp->free = b; 399 sp->freelist = b;
451 } else { 400 } else {
452 prev = sp->free; 401 prev = sp->freelist;
453 next = slob_next(prev); 402 next = slob_next(prev);
454 while (b > next) { 403 while (b > next) {
455 prev = next; 404 prev = next;
@@ -522,7 +471,7 @@ EXPORT_SYMBOL(__kmalloc_node);
522 471
523void kfree(const void *block) 472void kfree(const void *block)
524{ 473{
525 struct slob_page *sp; 474 struct page *sp;
526 475
527 trace_kfree(_RET_IP_, block); 476 trace_kfree(_RET_IP_, block);
528 477
@@ -530,43 +479,36 @@ void kfree(const void *block)
530 return; 479 return;
531 kmemleak_free(block); 480 kmemleak_free(block);
532 481
533 sp = slob_page(block); 482 sp = virt_to_page(block);
534 if (is_slob_page(sp)) { 483 if (PageSlab(sp)) {
535 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 484 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
536 unsigned int *m = (unsigned int *)(block - align); 485 unsigned int *m = (unsigned int *)(block - align);
537 slob_free(m, *m + align); 486 slob_free(m, *m + align);
538 } else 487 } else
539 put_page(&sp->page); 488 put_page(sp);
540} 489}
541EXPORT_SYMBOL(kfree); 490EXPORT_SYMBOL(kfree);
542 491
543/* can't use ksize for kmem_cache_alloc memory, only kmalloc */ 492/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
544size_t ksize(const void *block) 493size_t ksize(const void *block)
545{ 494{
546 struct slob_page *sp; 495 struct page *sp;
547 496
548 BUG_ON(!block); 497 BUG_ON(!block);
549 if (unlikely(block == ZERO_SIZE_PTR)) 498 if (unlikely(block == ZERO_SIZE_PTR))
550 return 0; 499 return 0;
551 500
552 sp = slob_page(block); 501 sp = virt_to_page(block);
553 if (is_slob_page(sp)) { 502 if (PageSlab(sp)) {
554 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 503 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
555 unsigned int *m = (unsigned int *)(block - align); 504 unsigned int *m = (unsigned int *)(block - align);
556 return SLOB_UNITS(*m) * SLOB_UNIT; 505 return SLOB_UNITS(*m) * SLOB_UNIT;
557 } else 506 } else
558 return sp->page.private; 507 return sp->private;
559} 508}
560EXPORT_SYMBOL(ksize); 509EXPORT_SYMBOL(ksize);
561 510
562struct kmem_cache { 511struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
563 unsigned int size, align;
564 unsigned long flags;
565 const char *name;
566 void (*ctor)(void *);
567};
568
569struct kmem_cache *kmem_cache_create(const char *name, size_t size,
570 size_t align, unsigned long flags, void (*ctor)(void *)) 512 size_t align, unsigned long flags, void (*ctor)(void *))
571{ 513{
572 struct kmem_cache *c; 514 struct kmem_cache *c;
@@ -589,13 +531,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
589 c->align = ARCH_SLAB_MINALIGN; 531 c->align = ARCH_SLAB_MINALIGN;
590 if (c->align < align) 532 if (c->align < align)
591 c->align = align; 533 c->align = align;
592 } else if (flags & SLAB_PANIC)
593 panic("Cannot create slab cache %s\n", name);
594 534
595 kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); 535 kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL);
536 c->refcount = 1;
537 }
596 return c; 538 return c;
597} 539}
598EXPORT_SYMBOL(kmem_cache_create);
599 540
600void kmem_cache_destroy(struct kmem_cache *c) 541void kmem_cache_destroy(struct kmem_cache *c)
601{ 542{
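The slob hunks above drop the private struct slob_page in favour of the generic struct page: kfree() and ksize() now dispatch on PageSlab(), reading the size prefix stored just before a small block and falling back to the page descriptor for whole-page allocations. As a rough user-space illustration of that dispatch (a sketch only; toy_page, toy_ksize and the other names are made up, not kernel APIs):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MINALIGN sizeof(unsigned long)   /* stand-in for ARCH_KMALLOC_MINALIGN */

/* Toy page descriptor: 'slab' mimics PageSlab(), 'private' mimics page->private. */
struct toy_page {
        int slab;
        size_t private;
};

/* Small blocks keep their size in a prefix word just before the returned pointer. */
static void *toy_small_alloc(struct toy_page *pg, size_t size)
{
        unsigned char *m = malloc(size + MINALIGN);
        if (!m)
                return NULL;
        pg->slab = 1;
        memcpy(m, &size, sizeof(size));
        return m + MINALIGN;
}

/* "Large" blocks are whole-page allocations; their size lives in the descriptor. */
static void *toy_large_alloc(struct toy_page *pg, size_t size)
{
        pg->slab = 0;
        pg->private = size;
        return malloc(size);
}

/* ksize()-style dispatch: prefix word for slab-managed blocks, descriptor otherwise. */
static size_t toy_ksize(const struct toy_page *pg, const void *block)
{
        if (pg->slab) {
                size_t size;
                memcpy(&size, (const unsigned char *)block - MINALIGN, sizeof(size));
                return size;
        }
        return pg->private;
}

int main(void)
{
        struct toy_page p1, p2;
        void *a = toy_small_alloc(&p1, 24);
        void *b = toy_large_alloc(&p2, 8192);

        printf("small: %zu, large: %zu\n", toy_ksize(&p1, a), toy_ksize(&p2, b));
        free((unsigned char *)a - MINALIGN);
        free(b);
        return 0;
}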
@@ -678,19 +619,12 @@ int kmem_cache_shrink(struct kmem_cache *d)
678} 619}
679EXPORT_SYMBOL(kmem_cache_shrink); 620EXPORT_SYMBOL(kmem_cache_shrink);
680 621
681static unsigned int slob_ready __read_mostly;
682
683int slab_is_available(void)
684{
685 return slob_ready;
686}
687
688void __init kmem_cache_init(void) 622void __init kmem_cache_init(void)
689{ 623{
690 slob_ready = 1; 624 slab_state = UP;
691} 625}
692 626
693void __init kmem_cache_init_late(void) 627void __init kmem_cache_init_late(void)
694{ 628{
695 /* Nothing to do */ 629 slab_state = FULL;
696} 630}
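With the per-allocator slob_ready flag gone, SLOB reports readiness through the same slab_state progression the other allocators use: kmem_cache_init() raises it to UP and kmem_cache_init_late() to FULL. A minimal model of that bring-up sequence follows; the state names mirror the patch, but the exact member list lives in the shared mm/slab.h and may contain more states than shown here.

#include <stdio.h>

/* Simplified bring-up states shared by the allocators in this series. */
enum slab_state { DOWN, PARTIAL, UP, FULL };

static enum slab_state slab_state = DOWN;

/* slab_is_available()-style check: callers may use the allocator once we reach UP. */
static int slab_is_available(void)
{
        return slab_state >= UP;
}

static void kmem_cache_init(void)      { slab_state = UP; }
static void kmem_cache_init_late(void) { slab_state = FULL; }

int main(void)
{
        printf("before init: available=%d\n", slab_is_available());
        kmem_cache_init();
        printf("after init:  available=%d\n", slab_is_available());
        kmem_cache_init_late();
        printf("late:        available=%d state=%d\n", slab_is_available(), slab_state);
        return 0;
}

Keeping a single ordered state variable lets early boot code ask one question ("is the allocator usable yet?") without caring which slab implementation was compiled in.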
diff --git a/mm/slub.c b/mm/slub.c
index ffe13fdf8144..8f78e2577031 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include "slab.h"
19#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
20#include <linux/seq_file.h> 21#include <linux/seq_file.h>
21#include <linux/kmemcheck.h> 22#include <linux/kmemcheck.h>
@@ -33,15 +34,17 @@
33 34
34#include <trace/events/kmem.h> 35#include <trace/events/kmem.h>
35 36
37#include "internal.h"
38
36/* 39/*
37 * Lock order: 40 * Lock order:
38 * 1. slub_lock (Global Semaphore) 41 * 1. slab_mutex (Global Mutex)
39 * 2. node->list_lock 42 * 2. node->list_lock
40 * 3. slab_lock(page) (Only on some arches and for debugging) 43 * 3. slab_lock(page) (Only on some arches and for debugging)
41 * 44 *
42 * slub_lock 45 * slab_mutex
43 * 46 *
44 * The role of the slub_lock is to protect the list of all the slabs 47 * The role of the slab_mutex is to protect the list of all the slabs
45 * and to synchronize major metadata changes to slab cache structures. 48 * and to synchronize major metadata changes to slab cache structures.
46 * 49 *
47 * The slab_lock is only used for debugging and on arches that do not 50 * The slab_lock is only used for debugging and on arches that do not
@@ -182,17 +185,6 @@ static int kmem_size = sizeof(struct kmem_cache);
182static struct notifier_block slab_notifier; 185static struct notifier_block slab_notifier;
183#endif 186#endif
184 187
185static enum {
186 DOWN, /* No slab functionality available */
187 PARTIAL, /* Kmem_cache_node works */
188 UP, /* Everything works but does not show up in sysfs */
189 SYSFS /* Sysfs up */
190} slab_state = DOWN;
191
192/* A list of all slab caches on the system */
193static DECLARE_RWSEM(slub_lock);
194static LIST_HEAD(slab_caches);
195
196/* 188/*
197 * Tracking user of a slab. 189 * Tracking user of a slab.
198 */ 190 */
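The definitions removed here (the slab_state enum, the global slub_lock and the slab_caches list) move into the common mm/slab.h so that slab, slob and slub all share one slab_mutex and one registry of caches. Stripped of the kernel details, the pattern is simply a mutex-protected list; a small pthread analogue, with illustrative names only:

#include <pthread.h>
#include <stdio.h>

/* One global mutex guards the registry of caches, like slab_mutex does. */
static pthread_mutex_t slab_mutex = PTHREAD_MUTEX_INITIALIZER;

struct toy_cache {
        char name[32];
        struct toy_cache *next;
};

static struct toy_cache *slab_caches;   /* list head, like the kernel's slab_caches */

static void register_cache(struct toy_cache *c, const char *name)
{
        snprintf(c->name, sizeof(c->name), "%s", name);
        pthread_mutex_lock(&slab_mutex);
        c->next = slab_caches;           /* list mutations happen only under the mutex */
        slab_caches = c;
        pthread_mutex_unlock(&slab_mutex);
}

static void for_each_cache(void (*fn)(struct toy_cache *))
{
        pthread_mutex_lock(&slab_mutex); /* walkers take the same mutex */
        for (struct toy_cache *c = slab_caches; c; c = c->next)
                fn(c);
        pthread_mutex_unlock(&slab_mutex);
}

static void show(struct toy_cache *c) { printf("cache %s\n", c->name); }

int main(void)
{
        struct toy_cache a, b;
        register_cache(&a, "kmalloc-64");
        register_cache(&b, "dentry");
        for_each_cache(show);
        return 0;
}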
@@ -237,11 +229,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
237 * Core slab cache functions 229 * Core slab cache functions
238 *******************************************************************/ 230 *******************************************************************/
239 231
240int slab_is_available(void)
241{
242 return slab_state >= UP;
243}
244
245static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 232static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
246{ 233{
247 return s->node[node]; 234 return s->node[node];
@@ -311,7 +298,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
311 * and whatever may come after it. 298 * and whatever may come after it.
312 */ 299 */
313 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 300 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
314 return s->objsize; 301 return s->object_size;
315 302
316#endif 303#endif
317 /* 304 /*
@@ -609,11 +596,11 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
609 if (p > addr + 16) 596 if (p > addr + 16)
610 print_section("Bytes b4 ", p - 16, 16); 597 print_section("Bytes b4 ", p - 16, 16);
611 598
612 print_section("Object ", p, min_t(unsigned long, s->objsize, 599 print_section("Object ", p, min_t(unsigned long, s->object_size,
613 PAGE_SIZE)); 600 PAGE_SIZE));
614 if (s->flags & SLAB_RED_ZONE) 601 if (s->flags & SLAB_RED_ZONE)
615 print_section("Redzone ", p + s->objsize, 602 print_section("Redzone ", p + s->object_size,
616 s->inuse - s->objsize); 603 s->inuse - s->object_size);
617 604
618 if (s->offset) 605 if (s->offset)
619 off = s->offset + sizeof(void *); 606 off = s->offset + sizeof(void *);
@@ -655,12 +642,12 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
655 u8 *p = object; 642 u8 *p = object;
656 643
657 if (s->flags & __OBJECT_POISON) { 644 if (s->flags & __OBJECT_POISON) {
658 memset(p, POISON_FREE, s->objsize - 1); 645 memset(p, POISON_FREE, s->object_size - 1);
659 p[s->objsize - 1] = POISON_END; 646 p[s->object_size - 1] = POISON_END;
660 } 647 }
661 648
662 if (s->flags & SLAB_RED_ZONE) 649 if (s->flags & SLAB_RED_ZONE)
663 memset(p + s->objsize, val, s->inuse - s->objsize); 650 memset(p + s->object_size, val, s->inuse - s->object_size);
664} 651}
665 652
666static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 653static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
@@ -705,10 +692,10 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
705 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 692 * Poisoning uses 0x6b (POISON_FREE) and the last byte is
706 * 0xa5 (POISON_END) 693 * 0xa5 (POISON_END)
707 * 694 *
708 * object + s->objsize 695 * object + s->object_size
709 * Padding to reach word boundary. This is also used for Redzoning. 696 * Padding to reach word boundary. This is also used for Redzoning.
710 * Padding is extended by another word if Redzoning is enabled and 697 * Padding is extended by another word if Redzoning is enabled and
711 * objsize == inuse. 698 * object_size == inuse.
712 * 699 *
713 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 700 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
714 * 0xcc (RED_ACTIVE) for objects in use. 701 * 0xcc (RED_ACTIVE) for objects in use.
@@ -727,7 +714,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
727 * object + s->size 714 * object + s->size
728 * Nothing is used beyond s->size. 715 * Nothing is used beyond s->size.
729 * 716 *
730 * If slabcaches are merged then the objsize and inuse boundaries are mostly 717 * If slabcaches are merged then the object_size and inuse boundaries are mostly
731 * ignored. And therefore no slab options that rely on these boundaries 718 * ignored. And therefore no slab options that rely on these boundaries
732 * may be used with merged slabcaches. 719 * may be used with merged slabcaches.
733 */ 720 */
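The comment block above describes SLUB's per-object debug layout: the payload (object_size), optional poison bytes, a red zone, the free pointer, tracking records and padding up to s->size. The sketch below only approximates how those offsets relate, very loosely in the spirit of calculate_sizes(); it ignores several flags and the real size of struct track, so treat the arithmetic as illustrative.

#include <stdio.h>

#define WORD  sizeof(void *)
#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))

/* Simplified view of a debug layout; the kernel's calculate_sizes() handles more cases. */
struct layout {
        unsigned long object_size;  /* payload the caller asked for */
        unsigned long inuse;        /* payload plus right red zone, word aligned */
        unsigned long size;         /* full per-object footprint in the slab */
};

static struct layout debug_layout(unsigned long object_size, int red_zone, int store_user)
{
        struct layout l;

        l.object_size = object_size;
        l.inuse = ALIGN_UP(object_size, WORD);
        if (red_zone && l.inuse == object_size)
                l.inuse += WORD;                 /* extra word so the red zone has room */

        l.size = l.inuse + WORD;                 /* free pointer stored after the payload */
        if (store_user)
                l.size += 2 * 2 * WORD;          /* rough stand-in for two track records */
        l.size = ALIGN_UP(l.size, WORD);
        return l;
}

int main(void)
{
        struct layout l = debug_layout(52, 1, 1);
        printf("object_size=%lu inuse=%lu size=%lu\n", l.object_size, l.inuse, l.size);
        return 0;
}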
@@ -787,25 +774,25 @@ static int check_object(struct kmem_cache *s, struct page *page,
787 void *object, u8 val) 774 void *object, u8 val)
788{ 775{
789 u8 *p = object; 776 u8 *p = object;
790 u8 *endobject = object + s->objsize; 777 u8 *endobject = object + s->object_size;
791 778
792 if (s->flags & SLAB_RED_ZONE) { 779 if (s->flags & SLAB_RED_ZONE) {
793 if (!check_bytes_and_report(s, page, object, "Redzone", 780 if (!check_bytes_and_report(s, page, object, "Redzone",
794 endobject, val, s->inuse - s->objsize)) 781 endobject, val, s->inuse - s->object_size))
795 return 0; 782 return 0;
796 } else { 783 } else {
797 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { 784 if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
798 check_bytes_and_report(s, page, p, "Alignment padding", 785 check_bytes_and_report(s, page, p, "Alignment padding",
799 endobject, POISON_INUSE, s->inuse - s->objsize); 786 endobject, POISON_INUSE, s->inuse - s->object_size);
800 } 787 }
801 } 788 }
802 789
803 if (s->flags & SLAB_POISON) { 790 if (s->flags & SLAB_POISON) {
804 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && 791 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
805 (!check_bytes_and_report(s, page, p, "Poison", p, 792 (!check_bytes_and_report(s, page, p, "Poison", p,
806 POISON_FREE, s->objsize - 1) || 793 POISON_FREE, s->object_size - 1) ||
807 !check_bytes_and_report(s, page, p, "Poison", 794 !check_bytes_and_report(s, page, p, "Poison",
808 p + s->objsize - 1, POISON_END, 1))) 795 p + s->object_size - 1, POISON_END, 1)))
809 return 0; 796 return 0;
810 /* 797 /*
811 * check_pad_bytes cleans up on its own. 798 * check_pad_bytes cleans up on its own.
@@ -926,7 +913,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
926 page->freelist); 913 page->freelist);
927 914
928 if (!alloc) 915 if (!alloc)
929 print_section("Object ", (void *)object, s->objsize); 916 print_section("Object ", (void *)object, s->object_size);
930 917
931 dump_stack(); 918 dump_stack();
932 } 919 }
@@ -942,14 +929,14 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
942 lockdep_trace_alloc(flags); 929 lockdep_trace_alloc(flags);
943 might_sleep_if(flags & __GFP_WAIT); 930 might_sleep_if(flags & __GFP_WAIT);
944 931
945 return should_failslab(s->objsize, flags, s->flags); 932 return should_failslab(s->object_size, flags, s->flags);
946} 933}
947 934
948static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) 935static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
949{ 936{
950 flags &= gfp_allowed_mask; 937 flags &= gfp_allowed_mask;
951 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); 938 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
952 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); 939 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
953} 940}
954 941
955static inline void slab_free_hook(struct kmem_cache *s, void *x) 942static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -966,13 +953,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
966 unsigned long flags; 953 unsigned long flags;
967 954
968 local_irq_save(flags); 955 local_irq_save(flags);
969 kmemcheck_slab_free(s, x, s->objsize); 956 kmemcheck_slab_free(s, x, s->object_size);
970 debug_check_no_locks_freed(x, s->objsize); 957 debug_check_no_locks_freed(x, s->object_size);
971 local_irq_restore(flags); 958 local_irq_restore(flags);
972 } 959 }
973#endif 960#endif
974 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 961 if (!(s->flags & SLAB_DEBUG_OBJECTS))
975 debug_check_no_obj_freed(x, s->objsize); 962 debug_check_no_obj_freed(x, s->object_size);
976} 963}
977 964
978/* 965/*
@@ -1207,7 +1194,7 @@ out:
1207 1194
1208__setup("slub_debug", setup_slub_debug); 1195__setup("slub_debug", setup_slub_debug);
1209 1196
1210static unsigned long kmem_cache_flags(unsigned long objsize, 1197static unsigned long kmem_cache_flags(unsigned long object_size,
1211 unsigned long flags, const char *name, 1198 unsigned long flags, const char *name,
1212 void (*ctor)(void *)) 1199 void (*ctor)(void *))
1213{ 1200{
@@ -1237,7 +1224,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1237static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, 1224static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1238 struct page *page) {} 1225 struct page *page) {}
1239static inline void remove_full(struct kmem_cache *s, struct page *page) {} 1226static inline void remove_full(struct kmem_cache *s, struct page *page) {}
1240static inline unsigned long kmem_cache_flags(unsigned long objsize, 1227static inline unsigned long kmem_cache_flags(unsigned long object_size,
1241 unsigned long flags, const char *name, 1228 unsigned long flags, const char *name,
1242 void (*ctor)(void *)) 1229 void (*ctor)(void *))
1243{ 1230{
@@ -1314,13 +1301,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1314 stat(s, ORDER_FALLBACK); 1301 stat(s, ORDER_FALLBACK);
1315 } 1302 }
1316 1303
1317 if (flags & __GFP_WAIT) 1304 if (kmemcheck_enabled && page
1318 local_irq_disable();
1319
1320 if (!page)
1321 return NULL;
1322
1323 if (kmemcheck_enabled
1324 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1305 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1325 int pages = 1 << oo_order(oo); 1306 int pages = 1 << oo_order(oo);
1326 1307
@@ -1336,6 +1317,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1336 kmemcheck_mark_unallocated_pages(page, pages); 1317 kmemcheck_mark_unallocated_pages(page, pages);
1337 } 1318 }
1338 1319
1320 if (flags & __GFP_WAIT)
1321 local_irq_disable();
1322 if (!page)
1323 return NULL;
1324
1339 page->objects = oo_objects(oo); 1325 page->objects = oo_objects(oo);
1340 mod_zone_page_state(page_zone(page), 1326 mod_zone_page_state(page_zone(page),
1341 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1327 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
@@ -1369,7 +1355,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1369 1355
1370 inc_slabs_node(s, page_to_nid(page), page->objects); 1356 inc_slabs_node(s, page_to_nid(page), page->objects);
1371 page->slab = s; 1357 page->slab = s;
1372 page->flags |= 1 << PG_slab; 1358 __SetPageSlab(page);
1359 if (page->pfmemalloc)
1360 SetPageSlabPfmemalloc(page);
1373 1361
1374 start = page_address(page); 1362 start = page_address(page);
1375 1363
@@ -1413,6 +1401,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1413 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1401 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1414 -pages); 1402 -pages);
1415 1403
1404 __ClearPageSlabPfmemalloc(page);
1416 __ClearPageSlab(page); 1405 __ClearPageSlab(page);
1417 reset_page_mapcount(page); 1406 reset_page_mapcount(page);
1418 if (current->reclaim_state) 1407 if (current->reclaim_state)
@@ -1490,12 +1479,12 @@ static inline void remove_partial(struct kmem_cache_node *n,
1490} 1479}
1491 1480
1492/* 1481/*
1493 * Lock slab, remove from the partial list and put the object into the 1482 * Remove slab from the partial list, freeze it and
1494 * per cpu freelist. 1483 * return the pointer to the freelist.
1495 * 1484 *
1496 * Returns a list of objects or NULL if it fails. 1485 * Returns a list of objects or NULL if it fails.
1497 * 1486 *
1498 * Must hold list_lock. 1487 * Must hold list_lock since we modify the partial list.
1499 */ 1488 */
1500static inline void *acquire_slab(struct kmem_cache *s, 1489static inline void *acquire_slab(struct kmem_cache *s,
1501 struct kmem_cache_node *n, struct page *page, 1490 struct kmem_cache_node *n, struct page *page,
@@ -1510,22 +1499,27 @@ static inline void *acquire_slab(struct kmem_cache *s,
1510 * The old freelist is the list of objects for the 1499 * The old freelist is the list of objects for the
1511 * per cpu allocation list. 1500 * per cpu allocation list.
1512 */ 1501 */
1513 do { 1502 freelist = page->freelist;
1514 freelist = page->freelist; 1503 counters = page->counters;
1515 counters = page->counters; 1504 new.counters = counters;
1516 new.counters = counters; 1505 if (mode) {
1517 if (mode) 1506 new.inuse = page->objects;
1518 new.inuse = page->objects; 1507 new.freelist = NULL;
1508 } else {
1509 new.freelist = freelist;
1510 }
1519 1511
1520 VM_BUG_ON(new.frozen); 1512 VM_BUG_ON(new.frozen);
1521 new.frozen = 1; 1513 new.frozen = 1;
1522 1514
1523 } while (!__cmpxchg_double_slab(s, page, 1515 if (!__cmpxchg_double_slab(s, page,
1524 freelist, counters, 1516 freelist, counters,
1525 NULL, new.counters, 1517 new.freelist, new.counters,
1526 "lock and freeze")); 1518 "acquire_slab"))
1519 return NULL;
1527 1520
1528 remove_partial(n, page); 1521 remove_partial(n, page);
1522 WARN_ON(!freelist);
1529 return freelist; 1523 return freelist;
1530} 1524}
1531 1525
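In the hunk above, acquire_slab() no longer loops on cmpxchg_double_slab(): it makes a single attempt to swap the page's freelist and counters, and returns NULL if it loses the race so the caller can simply move on to another partial slab. The sketch below shows that "try once, give up on contention" acquisition with a single atomic word; the real code swaps a paired {freelist, counters} value, which this model does not attempt to reproduce.

#include <stdatomic.h>
#include <stdio.h>

struct object { struct object *next; };

/* One shared freelist head standing in for page->freelist. */
static _Atomic(struct object *) freelist;

/*
 * Try once to take the whole freelist for a local, per-cpu style consumer.
 * Returns the old list on success, NULL if someone else changed it first.
 */
static struct object *acquire_freelist(void)
{
        struct object *old = atomic_load(&freelist);

        if (!old)
                return NULL;
        /* Single attempt: on failure we do not retry, mirroring the patch. */
        if (!atomic_compare_exchange_strong(&freelist, &old, NULL))
                return NULL;
        return old;
}

int main(void)
{
        struct object a = { 0 }, b = { &a };

        atomic_store(&freelist, &b);
        struct object *list = acquire_freelist();
        printf("first try:  %s\n", list ? "got list" : "lost race");
        printf("second try: %s\n", acquire_freelist() ? "got list" : "nothing left");
        return 0;
}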
@@ -1559,12 +1553,10 @@ static void *get_partial_node(struct kmem_cache *s,
1559 1553
1560 if (!object) { 1554 if (!object) {
1561 c->page = page; 1555 c->page = page;
1562 c->node = page_to_nid(page);
1563 stat(s, ALLOC_FROM_PARTIAL); 1556 stat(s, ALLOC_FROM_PARTIAL);
1564 object = t; 1557 object = t;
1565 available = page->objects - page->inuse; 1558 available = page->objects - page->inuse;
1566 } else { 1559 } else {
1567 page->freelist = t;
1568 available = put_cpu_partial(s, page, 0); 1560 available = put_cpu_partial(s, page, 0);
1569 stat(s, CPU_PARTIAL_NODE); 1561 stat(s, CPU_PARTIAL_NODE);
1570 } 1562 }
@@ -1579,7 +1571,7 @@ static void *get_partial_node(struct kmem_cache *s,
1579/* 1571/*
1580 * Get a page from somewhere. Search in increasing NUMA distances. 1572 * Get a page from somewhere. Search in increasing NUMA distances.
1581 */ 1573 */
1582static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, 1574static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1583 struct kmem_cache_cpu *c) 1575 struct kmem_cache_cpu *c)
1584{ 1576{
1585#ifdef CONFIG_NUMA 1577#ifdef CONFIG_NUMA
@@ -1614,7 +1606,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
1614 1606
1615 do { 1607 do {
1616 cpuset_mems_cookie = get_mems_allowed(); 1608 cpuset_mems_cookie = get_mems_allowed();
1617 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1609 zonelist = node_zonelist(slab_node(), flags);
1618 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1610 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1619 struct kmem_cache_node *n; 1611 struct kmem_cache_node *n;
1620 1612
@@ -1728,14 +1720,12 @@ void init_kmem_cache_cpus(struct kmem_cache *s)
1728/* 1720/*
1729 * Remove the cpu slab 1721 * Remove the cpu slab
1730 */ 1722 */
1731static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1723static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freelist)
1732{ 1724{
1733 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; 1725 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
1734 struct page *page = c->page;
1735 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1726 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1736 int lock = 0; 1727 int lock = 0;
1737 enum slab_modes l = M_NONE, m = M_NONE; 1728 enum slab_modes l = M_NONE, m = M_NONE;
1738 void *freelist;
1739 void *nextfree; 1729 void *nextfree;
1740 int tail = DEACTIVATE_TO_HEAD; 1730 int tail = DEACTIVATE_TO_HEAD;
1741 struct page new; 1731 struct page new;
@@ -1746,11 +1736,6 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1746 tail = DEACTIVATE_TO_TAIL; 1736 tail = DEACTIVATE_TO_TAIL;
1747 } 1737 }
1748 1738
1749 c->tid = next_tid(c->tid);
1750 c->page = NULL;
1751 freelist = c->freelist;
1752 c->freelist = NULL;
1753
1754 /* 1739 /*
1755 * Stage one: Free all available per cpu objects back 1740 * Stage one: Free all available per cpu objects back
1756 * to the page freelist while it is still frozen. Leave the 1741 * to the page freelist while it is still frozen. Leave the
@@ -1876,21 +1861,31 @@ redo:
1876 } 1861 }
1877} 1862}
1878 1863
1879/* Unfreeze all the cpu partial slabs */ 1864/*
1865 * Unfreeze all the cpu partial slabs.
1866 *
1867 * This function must be called with interrupt disabled.
1868 */
1880static void unfreeze_partials(struct kmem_cache *s) 1869static void unfreeze_partials(struct kmem_cache *s)
1881{ 1870{
1882 struct kmem_cache_node *n = NULL; 1871 struct kmem_cache_node *n = NULL, *n2 = NULL;
1883 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); 1872 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
1884 struct page *page, *discard_page = NULL; 1873 struct page *page, *discard_page = NULL;
1885 1874
1886 while ((page = c->partial)) { 1875 while ((page = c->partial)) {
1887 enum slab_modes { M_PARTIAL, M_FREE };
1888 enum slab_modes l, m;
1889 struct page new; 1876 struct page new;
1890 struct page old; 1877 struct page old;
1891 1878
1892 c->partial = page->next; 1879 c->partial = page->next;
1893 l = M_FREE; 1880
1881 n2 = get_node(s, page_to_nid(page));
1882 if (n != n2) {
1883 if (n)
1884 spin_unlock(&n->list_lock);
1885
1886 n = n2;
1887 spin_lock(&n->list_lock);
1888 }
1894 1889
1895 do { 1890 do {
1896 1891
@@ -1903,43 +1898,17 @@ static void unfreeze_partials(struct kmem_cache *s)
1903 1898
1904 new.frozen = 0; 1899 new.frozen = 0;
1905 1900
1906 if (!new.inuse && (!n || n->nr_partial > s->min_partial)) 1901 } while (!__cmpxchg_double_slab(s, page,
1907 m = M_FREE;
1908 else {
1909 struct kmem_cache_node *n2 = get_node(s,
1910 page_to_nid(page));
1911
1912 m = M_PARTIAL;
1913 if (n != n2) {
1914 if (n)
1915 spin_unlock(&n->list_lock);
1916
1917 n = n2;
1918 spin_lock(&n->list_lock);
1919 }
1920 }
1921
1922 if (l != m) {
1923 if (l == M_PARTIAL) {
1924 remove_partial(n, page);
1925 stat(s, FREE_REMOVE_PARTIAL);
1926 } else {
1927 add_partial(n, page,
1928 DEACTIVATE_TO_TAIL);
1929 stat(s, FREE_ADD_PARTIAL);
1930 }
1931
1932 l = m;
1933 }
1934
1935 } while (!cmpxchg_double_slab(s, page,
1936 old.freelist, old.counters, 1902 old.freelist, old.counters,
1937 new.freelist, new.counters, 1903 new.freelist, new.counters,
1938 "unfreezing slab")); 1904 "unfreezing slab"));
1939 1905
1940 if (m == M_FREE) { 1906 if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
1941 page->next = discard_page; 1907 page->next = discard_page;
1942 discard_page = page; 1908 discard_page = page;
1909 } else {
1910 add_partial(n, page, DEACTIVATE_TO_TAIL);
1911 stat(s, FREE_ADD_PARTIAL);
1943 } 1912 }
1944 } 1913 }
1945 1914
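The rewritten unfreeze_partials() drops the M_PARTIAL/M_FREE mode juggling: it takes the node's list_lock up front, switches locks only when the next page on the per-cpu partial list belongs to a different node, and decides free-versus-readd after the cmpxchg. The lock-batching idea on its own looks like the pthread sketch below (node_lock, drain and struct item are illustrative names, not kernel code):

#include <pthread.h>
#include <stdio.h>

#define NODES 2

static pthread_mutex_t node_lock[NODES] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

struct item { int node; struct item *next; };

/* Walk a mixed-node list, holding each node's lock only while handling its items. */
static void drain(struct item *head)
{
        int locked = -1;                         /* no lock held yet */

        for (struct item *it = head; it; it = it->next) {
                if (it->node != locked) {
                        if (locked >= 0)
                                pthread_mutex_unlock(&node_lock[locked]);
                        locked = it->node;
                        pthread_mutex_lock(&node_lock[locked]);
                }
                printf("handling item on node %d under its lock\n", it->node);
        }
        if (locked >= 0)
                pthread_mutex_unlock(&node_lock[locked]);
}

int main(void)
{
        struct item c = { 1, NULL }, b = { 1, &c }, a = { 0, &b };
        drain(&a);
        return 0;
}

Batching this way keeps the common case (consecutive pages from the same node) down to one lock acquisition instead of one per page.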
@@ -2008,7 +1977,11 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2008static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1977static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
2009{ 1978{
2010 stat(s, CPUSLAB_FLUSH); 1979 stat(s, CPUSLAB_FLUSH);
2011 deactivate_slab(s, c); 1980 deactivate_slab(s, c->page, c->freelist);
1981
1982 c->tid = next_tid(c->tid);
1983 c->page = NULL;
1984 c->freelist = NULL;
2012} 1985}
2013 1986
2014/* 1987/*
@@ -2040,7 +2013,7 @@ static bool has_cpu_slab(int cpu, void *info)
2040 struct kmem_cache *s = info; 2013 struct kmem_cache *s = info;
2041 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 2014 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2042 2015
2043 return !!(c->page); 2016 return c->page || c->partial;
2044} 2017}
2045 2018
2046static void flush_all(struct kmem_cache *s) 2019static void flush_all(struct kmem_cache *s)
@@ -2052,10 +2025,10 @@ static void flush_all(struct kmem_cache *s)
2052 * Check if the objects in a per cpu structure fit numa 2025 * Check if the objects in a per cpu structure fit numa
2053 * locality expectations. 2026 * locality expectations.
2054 */ 2027 */
2055static inline int node_match(struct kmem_cache_cpu *c, int node) 2028static inline int node_match(struct page *page, int node)
2056{ 2029{
2057#ifdef CONFIG_NUMA 2030#ifdef CONFIG_NUMA
2058 if (node != NUMA_NO_NODE && c->node != node) 2031 if (node != NUMA_NO_NODE && page_to_nid(page) != node)
2059 return 0; 2032 return 0;
2060#endif 2033#endif
2061 return 1; 2034 return 1;
@@ -2098,10 +2071,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2098 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", 2071 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
2099 nid, gfpflags); 2072 nid, gfpflags);
2100 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " 2073 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, "
2101 "default order: %d, min order: %d\n", s->name, s->objsize, 2074 "default order: %d, min order: %d\n", s->name, s->object_size,
2102 s->size, oo_order(s->oo), oo_order(s->min)); 2075 s->size, oo_order(s->oo), oo_order(s->min));
2103 2076
2104 if (oo_order(s->min) > get_order(s->objsize)) 2077 if (oo_order(s->min) > get_order(s->object_size))
2105 printk(KERN_WARNING " %s debugging increased min order, use " 2078 printk(KERN_WARNING " %s debugging increased min order, use "
2106 "slub_debug=O to disable.\n", s->name); 2079 "slub_debug=O to disable.\n", s->name);
2107 2080
@@ -2127,10 +2100,16 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2127static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, 2100static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2128 int node, struct kmem_cache_cpu **pc) 2101 int node, struct kmem_cache_cpu **pc)
2129{ 2102{
2130 void *object; 2103 void *freelist;
2131 struct kmem_cache_cpu *c; 2104 struct kmem_cache_cpu *c = *pc;
2132 struct page *page = new_slab(s, flags, node); 2105 struct page *page;
2106
2107 freelist = get_partial(s, flags, node, c);
2133 2108
2109 if (freelist)
2110 return freelist;
2111
2112 page = new_slab(s, flags, node);
2134 if (page) { 2113 if (page) {
2135 c = __this_cpu_ptr(s->cpu_slab); 2114 c = __this_cpu_ptr(s->cpu_slab);
2136 if (c->page) 2115 if (c->page)
@@ -2140,17 +2119,24 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2140 * No other reference to the page yet so we can 2119 * No other reference to the page yet so we can
2141 * muck around with it freely without cmpxchg 2120 * muck around with it freely without cmpxchg
2142 */ 2121 */
2143 object = page->freelist; 2122 freelist = page->freelist;
2144 page->freelist = NULL; 2123 page->freelist = NULL;
2145 2124
2146 stat(s, ALLOC_SLAB); 2125 stat(s, ALLOC_SLAB);
2147 c->node = page_to_nid(page);
2148 c->page = page; 2126 c->page = page;
2149 *pc = c; 2127 *pc = c;
2150 } else 2128 } else
2151 object = NULL; 2129 freelist = NULL;
2152 2130
2153 return object; 2131 return freelist;
2132}
2133
2134static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
2135{
2136 if (unlikely(PageSlabPfmemalloc(page)))
2137 return gfp_pfmemalloc_allowed(gfpflags);
2138
2139 return true;
2154} 2140}
2155 2141
2156/* 2142/*
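pfmemalloc_match(), added above, is the gatekeeper for swap-over-network support: slabs that were built from the emergency (pfmemalloc) reserves may only feed allocations that are themselves entitled to those reserves. A reduced model of that check follows; the flag bit and helper names are illustrative and do not reproduce exact kernel gfp semantics.

#include <stdbool.h>
#include <stdio.h>

struct toy_page {
        bool pfmemalloc;        /* slab was built from emergency reserves */
};

#define GFP_MEMALLOC  0x1       /* caller may dip into reserves (illustrative bit) */

/* Ordinary slabs serve everyone; reserve-backed slabs serve only entitled callers. */
static bool pfmemalloc_match(const struct toy_page *page, unsigned gfpflags)
{
        if (page->pfmemalloc)
                return gfpflags & GFP_MEMALLOC;
        return true;
}

int main(void)
{
        struct toy_page normal = { false }, reserve = { true };

        printf("normal slab, normal alloc:   %d\n", pfmemalloc_match(&normal, 0));
        printf("reserve slab, normal alloc:  %d\n", pfmemalloc_match(&reserve, 0));
        printf("reserve slab, memalloc flag: %d\n", pfmemalloc_match(&reserve, GFP_MEMALLOC));
        return 0;
}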
@@ -2160,6 +2146,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2160 * The page is still frozen if the return value is not NULL. 2146 * The page is still frozen if the return value is not NULL.
2161 * 2147 *
2162 * If this function returns NULL then the page has been unfrozen. 2148 * If this function returns NULL then the page has been unfrozen.
2149 *
2150 * This function must be called with interrupt disabled.
2163 */ 2151 */
2164static inline void *get_freelist(struct kmem_cache *s, struct page *page) 2152static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2165{ 2153{
@@ -2170,13 +2158,14 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2170 do { 2158 do {
2171 freelist = page->freelist; 2159 freelist = page->freelist;
2172 counters = page->counters; 2160 counters = page->counters;
2161
2173 new.counters = counters; 2162 new.counters = counters;
2174 VM_BUG_ON(!new.frozen); 2163 VM_BUG_ON(!new.frozen);
2175 2164
2176 new.inuse = page->objects; 2165 new.inuse = page->objects;
2177 new.frozen = freelist != NULL; 2166 new.frozen = freelist != NULL;
2178 2167
2179 } while (!cmpxchg_double_slab(s, page, 2168 } while (!__cmpxchg_double_slab(s, page,
2180 freelist, counters, 2169 freelist, counters,
2181 NULL, new.counters, 2170 NULL, new.counters,
2182 "get_freelist")); 2171 "get_freelist"));
@@ -2203,7 +2192,8 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2203static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 2192static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2204 unsigned long addr, struct kmem_cache_cpu *c) 2193 unsigned long addr, struct kmem_cache_cpu *c)
2205{ 2194{
2206 void **object; 2195 void *freelist;
2196 struct page *page;
2207 unsigned long flags; 2197 unsigned long flags;
2208 2198
2209 local_irq_save(flags); 2199 local_irq_save(flags);
@@ -2216,25 +2206,41 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2216 c = this_cpu_ptr(s->cpu_slab); 2206 c = this_cpu_ptr(s->cpu_slab);
2217#endif 2207#endif
2218 2208
2219 if (!c->page) 2209 page = c->page;
2210 if (!page)
2220 goto new_slab; 2211 goto new_slab;
2221redo: 2212redo:
2222 if (unlikely(!node_match(c, node))) { 2213
2214 if (unlikely(!node_match(page, node))) {
2223 stat(s, ALLOC_NODE_MISMATCH); 2215 stat(s, ALLOC_NODE_MISMATCH);
2224 deactivate_slab(s, c); 2216 deactivate_slab(s, page, c->freelist);
2217 c->page = NULL;
2218 c->freelist = NULL;
2219 goto new_slab;
2220 }
2221
2222 /*
2223 * By rights, we should be searching for a slab page that was
2224 * PFMEMALLOC but right now, we are losing the pfmemalloc
2225 * information when the page leaves the per-cpu allocator
2226 */
2227 if (unlikely(!pfmemalloc_match(page, gfpflags))) {
2228 deactivate_slab(s, page, c->freelist);
2229 c->page = NULL;
2230 c->freelist = NULL;
2225 goto new_slab; 2231 goto new_slab;
2226 } 2232 }
2227 2233
2228 /* must check again c->freelist in case of cpu migration or IRQ */ 2234 /* must check again c->freelist in case of cpu migration or IRQ */
2229 object = c->freelist; 2235 freelist = c->freelist;
2230 if (object) 2236 if (freelist)
2231 goto load_freelist; 2237 goto load_freelist;
2232 2238
2233 stat(s, ALLOC_SLOWPATH); 2239 stat(s, ALLOC_SLOWPATH);
2234 2240
2235 object = get_freelist(s, c->page); 2241 freelist = get_freelist(s, page);
2236 2242
2237 if (!object) { 2243 if (!freelist) {
2238 c->page = NULL; 2244 c->page = NULL;
2239 stat(s, DEACTIVATE_BYPASS); 2245 stat(s, DEACTIVATE_BYPASS);
2240 goto new_slab; 2246 goto new_slab;
@@ -2243,50 +2249,50 @@ redo:
2243 stat(s, ALLOC_REFILL); 2249 stat(s, ALLOC_REFILL);
2244 2250
2245load_freelist: 2251load_freelist:
2246 c->freelist = get_freepointer(s, object); 2252 /*
2253 * freelist is pointing to the list of objects to be used.
2254 * page is pointing to the page from which the objects are obtained.
2255 * That page must be frozen for per cpu allocations to work.
2256 */
2257 VM_BUG_ON(!c->page->frozen);
2258 c->freelist = get_freepointer(s, freelist);
2247 c->tid = next_tid(c->tid); 2259 c->tid = next_tid(c->tid);
2248 local_irq_restore(flags); 2260 local_irq_restore(flags);
2249 return object; 2261 return freelist;
2250 2262
2251new_slab: 2263new_slab:
2252 2264
2253 if (c->partial) { 2265 if (c->partial) {
2254 c->page = c->partial; 2266 page = c->page = c->partial;
2255 c->partial = c->page->next; 2267 c->partial = page->next;
2256 c->node = page_to_nid(c->page);
2257 stat(s, CPU_PARTIAL_ALLOC); 2268 stat(s, CPU_PARTIAL_ALLOC);
2258 c->freelist = NULL; 2269 c->freelist = NULL;
2259 goto redo; 2270 goto redo;
2260 } 2271 }
2261 2272
2262 /* Then do expensive stuff like retrieving pages from the partial lists */ 2273 freelist = new_slab_objects(s, gfpflags, node, &c);
2263 object = get_partial(s, gfpflags, node, c);
2264
2265 if (unlikely(!object)) {
2266 2274
2267 object = new_slab_objects(s, gfpflags, node, &c); 2275 if (unlikely(!freelist)) {
2276 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
2277 slab_out_of_memory(s, gfpflags, node);
2268 2278
2269 if (unlikely(!object)) { 2279 local_irq_restore(flags);
2270 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 2280 return NULL;
2271 slab_out_of_memory(s, gfpflags, node);
2272
2273 local_irq_restore(flags);
2274 return NULL;
2275 }
2276 } 2281 }
2277 2282
2278 if (likely(!kmem_cache_debug(s))) 2283 page = c->page;
2284 if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
2279 goto load_freelist; 2285 goto load_freelist;
2280 2286
2281 /* Only entered in the debug case */ 2287 /* Only entered in the debug case */
2282 if (!alloc_debug_processing(s, c->page, object, addr)) 2288 if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr))
2283 goto new_slab; /* Slab failed checks. Next slab needed */ 2289 goto new_slab; /* Slab failed checks. Next slab needed */
2284 2290
2285 c->freelist = get_freepointer(s, object); 2291 deactivate_slab(s, page, get_freepointer(s, freelist));
2286 deactivate_slab(s, c); 2292 c->page = NULL;
2287 c->node = NUMA_NO_NODE; 2293 c->freelist = NULL;
2288 local_irq_restore(flags); 2294 local_irq_restore(flags);
2289 return object; 2295 return freelist;
2290} 2296}
2291 2297
2292/* 2298/*
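After this rework, __slab_alloc() operates on an explicit (page, freelist) pair: it checks the node and pfmemalloc constraints, tries to refill from the current page's freelist, falls back to the per-cpu partial list, and only then calls new_slab_objects(), which itself tries the node partial lists before allocating a fresh slab. The outline below compresses that decision ladder into user-space pseudostructure; every helper is a stub standing in for the kernel function of the corresponding name, so only the control flow is meaningful.

#include <stdio.h>
#include <stddef.h>

struct cpu_slab { void *page; void *freelist; void *partial; };

/* Stubs for node_match(), pfmemalloc_match(), get_freelist() and friends. */
static int   node_ok(void *page)        { return page != NULL; }
static int   reserves_ok(void *page)    { return 1; }
static void *refill_from_page(void *pg) { return NULL; }   /* page exhausted */
static void *take_partial(void *page)   { return page; }
static void *new_slab_objects(void)     { return "fresh-slab-object"; }

static void *slow_alloc(struct cpu_slab *c)
{
        if (c->page && node_ok(c->page) && reserves_ok(c->page)) {
                void *obj = c->freelist ? c->freelist : refill_from_page(c->page);
                if (obj)
                        return obj;             /* load_freelist: refill worked */
        }
        if (c->partial) {                       /* next choice: per-cpu partial slabs */
                c->page = c->partial;
                c->partial = NULL;
                return take_partial(c->page);
        }
        return new_slab_objects();              /* node partials, then a brand new slab */
}

int main(void)
{
        struct cpu_slab c = { NULL, NULL, NULL };
        printf("allocated: %s\n", (char *)slow_alloc(&c));
        return 0;
}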
@@ -2304,6 +2310,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
2304{ 2310{
2305 void **object; 2311 void **object;
2306 struct kmem_cache_cpu *c; 2312 struct kmem_cache_cpu *c;
2313 struct page *page;
2307 unsigned long tid; 2314 unsigned long tid;
2308 2315
2309 if (slab_pre_alloc_hook(s, gfpflags)) 2316 if (slab_pre_alloc_hook(s, gfpflags))
@@ -2329,8 +2336,8 @@ redo:
2329 barrier(); 2336 barrier();
2330 2337
2331 object = c->freelist; 2338 object = c->freelist;
2332 if (unlikely(!object || !node_match(c, node))) 2339 page = c->page;
2333 2340 if (unlikely(!object || !node_match(page, node)))
2334 object = __slab_alloc(s, gfpflags, node, addr, c); 2341 object = __slab_alloc(s, gfpflags, node, addr, c);
2335 2342
2336 else { 2343 else {
@@ -2361,7 +2368,7 @@ redo:
2361 } 2368 }
2362 2369
2363 if (unlikely(gfpflags & __GFP_ZERO) && object) 2370 if (unlikely(gfpflags & __GFP_ZERO) && object)
2364 memset(object, 0, s->objsize); 2371 memset(object, 0, s->object_size);
2365 2372
2366 slab_post_alloc_hook(s, gfpflags, object); 2373 slab_post_alloc_hook(s, gfpflags, object);
2367 2374
@@ -2372,7 +2379,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
2372{ 2379{
2373 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); 2380 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
2374 2381
2375 trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); 2382 trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags);
2376 2383
2377 return ret; 2384 return ret;
2378} 2385}
@@ -2402,7 +2409,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
2402 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); 2409 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
2403 2410
2404 trace_kmem_cache_alloc_node(_RET_IP_, ret, 2411 trace_kmem_cache_alloc_node(_RET_IP_, ret,
2405 s->objsize, s->size, gfpflags, node); 2412 s->object_size, s->size, gfpflags, node);
2406 2413
2407 return ret; 2414 return ret;
2408} 2415}
@@ -2766,7 +2773,7 @@ static unsigned long calculate_alignment(unsigned long flags,
2766} 2773}
2767 2774
2768static void 2775static void
2769init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) 2776init_kmem_cache_node(struct kmem_cache_node *n)
2770{ 2777{
2771 n->nr_partial = 0; 2778 n->nr_partial = 0;
2772 spin_lock_init(&n->list_lock); 2779 spin_lock_init(&n->list_lock);
@@ -2836,7 +2843,7 @@ static void early_kmem_cache_node_alloc(int node)
2836 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 2843 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
2837 init_tracking(kmem_cache_node, n); 2844 init_tracking(kmem_cache_node, n);
2838#endif 2845#endif
2839 init_kmem_cache_node(n, kmem_cache_node); 2846 init_kmem_cache_node(n);
2840 inc_slabs_node(kmem_cache_node, node, page->objects); 2847 inc_slabs_node(kmem_cache_node, node, page->objects);
2841 2848
2842 add_partial(n, page, DEACTIVATE_TO_HEAD); 2849 add_partial(n, page, DEACTIVATE_TO_HEAD);
@@ -2876,7 +2883,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
2876 } 2883 }
2877 2884
2878 s->node[node] = n; 2885 s->node[node] = n;
2879 init_kmem_cache_node(n, s); 2886 init_kmem_cache_node(n);
2880 } 2887 }
2881 return 1; 2888 return 1;
2882} 2889}
@@ -2897,7 +2904,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min)
2897static int calculate_sizes(struct kmem_cache *s, int forced_order) 2904static int calculate_sizes(struct kmem_cache *s, int forced_order)
2898{ 2905{
2899 unsigned long flags = s->flags; 2906 unsigned long flags = s->flags;
2900 unsigned long size = s->objsize; 2907 unsigned long size = s->object_size;
2901 unsigned long align = s->align; 2908 unsigned long align = s->align;
2902 int order; 2909 int order;
2903 2910
@@ -2926,7 +2933,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2926 * end of the object and the free pointer. If not then add an 2933 * end of the object and the free pointer. If not then add an
2927 * additional word to have some bytes to store Redzone information. 2934 * additional word to have some bytes to store Redzone information.
2928 */ 2935 */
2929 if ((flags & SLAB_RED_ZONE) && size == s->objsize) 2936 if ((flags & SLAB_RED_ZONE) && size == s->object_size)
2930 size += sizeof(void *); 2937 size += sizeof(void *);
2931#endif 2938#endif
2932 2939
@@ -2974,7 +2981,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2974 * user specified and the dynamic determination of cache line size 2981 * user specified and the dynamic determination of cache line size
2975 * on bootup. 2982 * on bootup.
2976 */ 2983 */
2977 align = calculate_alignment(flags, align, s->objsize); 2984 align = calculate_alignment(flags, align, s->object_size);
2978 s->align = align; 2985 s->align = align;
2979 2986
2980 /* 2987 /*
@@ -3022,7 +3029,7 @@ static int kmem_cache_open(struct kmem_cache *s,
3022 memset(s, 0, kmem_size); 3029 memset(s, 0, kmem_size);
3023 s->name = name; 3030 s->name = name;
3024 s->ctor = ctor; 3031 s->ctor = ctor;
3025 s->objsize = size; 3032 s->object_size = size;
3026 s->align = align; 3033 s->align = align;
3027 s->flags = kmem_cache_flags(size, flags, name, ctor); 3034 s->flags = kmem_cache_flags(size, flags, name, ctor);
3028 s->reserved = 0; 3035 s->reserved = 0;
@@ -3037,7 +3044,7 @@ static int kmem_cache_open(struct kmem_cache *s,
3037 * Disable debugging flags that store metadata if the min slab 3044 * Disable debugging flags that store metadata if the min slab
3038 * order increased. 3045 * order increased.
3039 */ 3046 */
3040 if (get_order(s->size) > get_order(s->objsize)) { 3047 if (get_order(s->size) > get_order(s->object_size)) {
3041 s->flags &= ~DEBUG_METADATA_FLAGS; 3048 s->flags &= ~DEBUG_METADATA_FLAGS;
3042 s->offset = 0; 3049 s->offset = 0;
3043 if (!calculate_sizes(s, -1)) 3050 if (!calculate_sizes(s, -1))
@@ -3111,7 +3118,7 @@ error:
3111 */ 3118 */
3112unsigned int kmem_cache_size(struct kmem_cache *s) 3119unsigned int kmem_cache_size(struct kmem_cache *s)
3113{ 3120{
3114 return s->objsize; 3121 return s->object_size;
3115} 3122}
3116EXPORT_SYMBOL(kmem_cache_size); 3123EXPORT_SYMBOL(kmem_cache_size);
3117 3124
@@ -3189,11 +3196,11 @@ static inline int kmem_cache_close(struct kmem_cache *s)
3189 */ 3196 */
3190void kmem_cache_destroy(struct kmem_cache *s) 3197void kmem_cache_destroy(struct kmem_cache *s)
3191{ 3198{
3192 down_write(&slub_lock); 3199 mutex_lock(&slab_mutex);
3193 s->refcount--; 3200 s->refcount--;
3194 if (!s->refcount) { 3201 if (!s->refcount) {
3195 list_del(&s->list); 3202 list_del(&s->list);
3196 up_write(&slub_lock); 3203 mutex_unlock(&slab_mutex);
3197 if (kmem_cache_close(s)) { 3204 if (kmem_cache_close(s)) {
3198 printk(KERN_ERR "SLUB %s: %s called for cache that " 3205 printk(KERN_ERR "SLUB %s: %s called for cache that "
3199 "still has objects.\n", s->name, __func__); 3206 "still has objects.\n", s->name, __func__);
@@ -3203,7 +3210,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
3203 rcu_barrier(); 3210 rcu_barrier();
3204 sysfs_slab_remove(s); 3211 sysfs_slab_remove(s);
3205 } else 3212 } else
3206 up_write(&slub_lock); 3213 mutex_unlock(&slab_mutex);
3207} 3214}
3208EXPORT_SYMBOL(kmem_cache_destroy); 3215EXPORT_SYMBOL(kmem_cache_destroy);
3209 3216
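kmem_cache_destroy() keeps the same shape with slab_mutex in place of the semaphore: drop a reference while holding the lock, and only the final reference unlinks the cache and tears it down after the lock is released. The refcount-under-mutex pattern in isolation, as a pthread sketch with illustrative names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t slab_mutex = PTHREAD_MUTEX_INITIALIZER;

struct toy_cache {
        const char *name;
        int refcount;          /* aliases created by cache merging bump this */
};

static void cache_destroy(struct toy_cache *c)
{
        int last;

        pthread_mutex_lock(&slab_mutex);
        last = (--c->refcount == 0);   /* unlinking from the registry would happen here */
        pthread_mutex_unlock(&slab_mutex);

        if (last)
                printf("%s: tearing down outside the mutex\n", c->name);
        else
                printf("%s: still in use\n", c->name);
}

int main(void)
{
        struct toy_cache c = { "toy-cache", 2 };
        cache_destroy(&c);     /* an alias drops its reference */
        cache_destroy(&c);     /* the last user actually destroys it */
        return 0;
}

Doing the expensive teardown outside the mutex keeps the registry lock hold time short, which matters once every allocator shares the same slab_mutex.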
@@ -3265,7 +3272,7 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name,
3265 3272
3266 /* 3273 /*
3267 * This function is called with IRQs disabled during early-boot on 3274 * This function is called with IRQs disabled during early-boot on
3268 * single CPU so there's no need to take slub_lock here. 3275 * single CPU so there's no need to take slab_mutex here.
3269 */ 3276 */
3270 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, 3277 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
3271 flags, NULL)) 3278 flags, NULL))
@@ -3550,10 +3557,10 @@ static int slab_mem_going_offline_callback(void *arg)
3550{ 3557{
3551 struct kmem_cache *s; 3558 struct kmem_cache *s;
3552 3559
3553 down_read(&slub_lock); 3560 mutex_lock(&slab_mutex);
3554 list_for_each_entry(s, &slab_caches, list) 3561 list_for_each_entry(s, &slab_caches, list)
3555 kmem_cache_shrink(s); 3562 kmem_cache_shrink(s);
3556 up_read(&slub_lock); 3563 mutex_unlock(&slab_mutex);
3557 3564
3558 return 0; 3565 return 0;
3559} 3566}
@@ -3574,7 +3581,7 @@ static void slab_mem_offline_callback(void *arg)
3574 if (offline_node < 0) 3581 if (offline_node < 0)
3575 return; 3582 return;
3576 3583
3577 down_read(&slub_lock); 3584 mutex_lock(&slab_mutex);
3578 list_for_each_entry(s, &slab_caches, list) { 3585 list_for_each_entry(s, &slab_caches, list) {
3579 n = get_node(s, offline_node); 3586 n = get_node(s, offline_node);
3580 if (n) { 3587 if (n) {
@@ -3590,7 +3597,7 @@ static void slab_mem_offline_callback(void *arg)
3590 kmem_cache_free(kmem_cache_node, n); 3597 kmem_cache_free(kmem_cache_node, n);
3591 } 3598 }
3592 } 3599 }
3593 up_read(&slub_lock); 3600 mutex_unlock(&slab_mutex);
3594} 3601}
3595 3602
3596static int slab_mem_going_online_callback(void *arg) 3603static int slab_mem_going_online_callback(void *arg)
@@ -3613,7 +3620,7 @@ static int slab_mem_going_online_callback(void *arg)
3613 * allocate a kmem_cache_node structure in order to bring the node 3620 * allocate a kmem_cache_node structure in order to bring the node
3614 * online. 3621 * online.
3615 */ 3622 */
3616 down_read(&slub_lock); 3623 mutex_lock(&slab_mutex);
3617 list_for_each_entry(s, &slab_caches, list) { 3624 list_for_each_entry(s, &slab_caches, list) {
3618 /* 3625 /*
3619 * XXX: kmem_cache_alloc_node will fallback to other nodes 3626 * XXX: kmem_cache_alloc_node will fallback to other nodes
@@ -3625,11 +3632,11 @@ static int slab_mem_going_online_callback(void *arg)
3625 ret = -ENOMEM; 3632 ret = -ENOMEM;
3626 goto out; 3633 goto out;
3627 } 3634 }
3628 init_kmem_cache_node(n, s); 3635 init_kmem_cache_node(n);
3629 s->node[nid] = n; 3636 s->node[nid] = n;
3630 } 3637 }
3631out: 3638out:
3632 up_read(&slub_lock); 3639 mutex_unlock(&slab_mutex);
3633 return ret; 3640 return ret;
3634} 3641}
3635 3642
@@ -3840,11 +3847,11 @@ void __init kmem_cache_init(void)
3840 3847
3841 if (s && s->size) { 3848 if (s && s->size) {
3842 char *name = kasprintf(GFP_NOWAIT, 3849 char *name = kasprintf(GFP_NOWAIT,
3843 "dma-kmalloc-%d", s->objsize); 3850 "dma-kmalloc-%d", s->object_size);
3844 3851
3845 BUG_ON(!name); 3852 BUG_ON(!name);
3846 kmalloc_dma_caches[i] = create_kmalloc_cache(name, 3853 kmalloc_dma_caches[i] = create_kmalloc_cache(name,
3847 s->objsize, SLAB_CACHE_DMA); 3854 s->object_size, SLAB_CACHE_DMA);
3848 } 3855 }
3849 } 3856 }
3850#endif 3857#endif
@@ -3921,16 +3928,12 @@ static struct kmem_cache *find_mergeable(size_t size,
3921 return NULL; 3928 return NULL;
3922} 3929}
3923 3930
3924struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3931struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
3925 size_t align, unsigned long flags, void (*ctor)(void *)) 3932 size_t align, unsigned long flags, void (*ctor)(void *))
3926{ 3933{
3927 struct kmem_cache *s; 3934 struct kmem_cache *s;
3928 char *n; 3935 char *n;
3929 3936
3930 if (WARN_ON(!name))
3931 return NULL;
3932
3933 down_write(&slub_lock);
3934 s = find_mergeable(size, align, flags, name, ctor); 3937 s = find_mergeable(size, align, flags, name, ctor);
3935 if (s) { 3938 if (s) {
3936 s->refcount++; 3939 s->refcount++;
@@ -3938,49 +3941,42 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3938 * Adjust the object sizes so that we clear 3941 * Adjust the object sizes so that we clear
3939 * the complete object on kzalloc. 3942 * the complete object on kzalloc.
3940 */ 3943 */
3941 s->objsize = max(s->objsize, (int)size); 3944 s->object_size = max(s->object_size, (int)size);
3942 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3945 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3943 3946
3944 if (sysfs_slab_alias(s, name)) { 3947 if (sysfs_slab_alias(s, name)) {
3945 s->refcount--; 3948 s->refcount--;
3946 goto err; 3949 return NULL;
3947 } 3950 }
3948 up_write(&slub_lock);
3949 return s; 3951 return s;
3950 } 3952 }
3951 3953
3952 n = kstrdup(name, GFP_KERNEL); 3954 n = kstrdup(name, GFP_KERNEL);
3953 if (!n) 3955 if (!n)
3954 goto err; 3956 return NULL;
3955 3957
3956 s = kmalloc(kmem_size, GFP_KERNEL); 3958 s = kmalloc(kmem_size, GFP_KERNEL);
3957 if (s) { 3959 if (s) {
3958 if (kmem_cache_open(s, n, 3960 if (kmem_cache_open(s, n,
3959 size, align, flags, ctor)) { 3961 size, align, flags, ctor)) {
3962 int r;
3963
3960 list_add(&s->list, &slab_caches); 3964 list_add(&s->list, &slab_caches);
3961 up_write(&slub_lock); 3965 mutex_unlock(&slab_mutex);
3962 if (sysfs_slab_add(s)) { 3966 r = sysfs_slab_add(s);
3963 down_write(&slub_lock); 3967 mutex_lock(&slab_mutex);
3964 list_del(&s->list); 3968
3965 kfree(n); 3969 if (!r)
3966 kfree(s); 3970 return s;
3967 goto err; 3971
3968 } 3972 list_del(&s->list);
3969 return s; 3973 kmem_cache_close(s);
3970 } 3974 }
3971 kfree(n);
3972 kfree(s); 3975 kfree(s);
3973 } 3976 }
3974err: 3977 kfree(n);
3975 up_write(&slub_lock); 3978 return NULL;
3976
3977 if (flags & SLAB_PANIC)
3978 panic("Cannot create slabcache %s\n", name);
3979 else
3980 s = NULL;
3981 return s;
3982} 3979}
3983EXPORT_SYMBOL(kmem_cache_create);
3984 3980
3985#ifdef CONFIG_SMP 3981#ifdef CONFIG_SMP
3986/* 3982/*
@@ -3999,13 +3995,13 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3999 case CPU_UP_CANCELED_FROZEN: 3995 case CPU_UP_CANCELED_FROZEN:
4000 case CPU_DEAD: 3996 case CPU_DEAD:
4001 case CPU_DEAD_FROZEN: 3997 case CPU_DEAD_FROZEN:
4002 down_read(&slub_lock); 3998 mutex_lock(&slab_mutex);
4003 list_for_each_entry(s, &slab_caches, list) { 3999 list_for_each_entry(s, &slab_caches, list) {
4004 local_irq_save(flags); 4000 local_irq_save(flags);
4005 __flush_cpu_slab(s, cpu); 4001 __flush_cpu_slab(s, cpu);
4006 local_irq_restore(flags); 4002 local_irq_restore(flags);
4007 } 4003 }
4008 up_read(&slub_lock); 4004 mutex_unlock(&slab_mutex);
4009 break; 4005 break;
4010 default: 4006 default:
4011 break; 4007 break;
@@ -4497,30 +4493,31 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4497 4493
4498 for_each_possible_cpu(cpu) { 4494 for_each_possible_cpu(cpu) {
4499 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4495 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
4500 int node = ACCESS_ONCE(c->node); 4496 int node;
4501 struct page *page; 4497 struct page *page;
4502 4498
4503 if (node < 0)
4504 continue;
4505 page = ACCESS_ONCE(c->page); 4499 page = ACCESS_ONCE(c->page);
4506 if (page) { 4500 if (!page)
4507 if (flags & SO_TOTAL) 4501 continue;
4508 x = page->objects;
4509 else if (flags & SO_OBJECTS)
4510 x = page->inuse;
4511 else
4512 x = 1;
4513 4502
4514 total += x; 4503 node = page_to_nid(page);
4515 nodes[node] += x; 4504 if (flags & SO_TOTAL)
4516 } 4505 x = page->objects;
4517 page = c->partial; 4506 else if (flags & SO_OBJECTS)
4507 x = page->inuse;
4508 else
4509 x = 1;
4518 4510
4511 total += x;
4512 nodes[node] += x;
4513
4514 page = ACCESS_ONCE(c->partial);
4519 if (page) { 4515 if (page) {
4520 x = page->pobjects; 4516 x = page->pobjects;
4521 total += x; 4517 total += x;
4522 nodes[node] += x; 4518 nodes[node] += x;
4523 } 4519 }
4520
4524 per_cpu[node]++; 4521 per_cpu[node]++;
4525 } 4522 }
4526 } 4523 }
@@ -4620,7 +4617,7 @@ SLAB_ATTR_RO(align);
4620 4617
4621static ssize_t object_size_show(struct kmem_cache *s, char *buf) 4618static ssize_t object_size_show(struct kmem_cache *s, char *buf)
4622{ 4619{
4623 return sprintf(buf, "%d\n", s->objsize); 4620 return sprintf(buf, "%d\n", s->object_size);
4624} 4621}
4625SLAB_ATTR_RO(object_size); 4622SLAB_ATTR_RO(object_size);
4626 4623
@@ -5283,7 +5280,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
5283 const char *name; 5280 const char *name;
5284 int unmergeable; 5281 int unmergeable;
5285 5282
5286 if (slab_state < SYSFS) 5283 if (slab_state < FULL)
5287 /* Defer until later */ 5284 /* Defer until later */
5288 return 0; 5285 return 0;
5289 5286
@@ -5328,7 +5325,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
5328 5325
5329static void sysfs_slab_remove(struct kmem_cache *s) 5326static void sysfs_slab_remove(struct kmem_cache *s)
5330{ 5327{
5331 if (slab_state < SYSFS) 5328 if (slab_state < FULL)
5332 /* 5329 /*
5333 * Sysfs has not been setup yet so no need to remove the 5330 * Sysfs has not been setup yet so no need to remove the
5334 * cache from sysfs. 5331 * cache from sysfs.
@@ -5356,7 +5353,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
5356{ 5353{
5357 struct saved_alias *al; 5354 struct saved_alias *al;
5358 5355
5359 if (slab_state == SYSFS) { 5356 if (slab_state == FULL) {
5360 /* 5357 /*
5361 * If we have a leftover link then remove it. 5358 * If we have a leftover link then remove it.
5362 */ 5359 */
@@ -5380,16 +5377,16 @@ static int __init slab_sysfs_init(void)
5380 struct kmem_cache *s; 5377 struct kmem_cache *s;
5381 int err; 5378 int err;
5382 5379
5383 down_write(&slub_lock); 5380 mutex_lock(&slab_mutex);
5384 5381
5385 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); 5382 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
5386 if (!slab_kset) { 5383 if (!slab_kset) {
5387 up_write(&slub_lock); 5384 mutex_unlock(&slab_mutex);
5388 printk(KERN_ERR "Cannot register slab subsystem.\n"); 5385 printk(KERN_ERR "Cannot register slab subsystem.\n");
5389 return -ENOSYS; 5386 return -ENOSYS;
5390 } 5387 }
5391 5388
5392 slab_state = SYSFS; 5389 slab_state = FULL;
5393 5390
5394 list_for_each_entry(s, &slab_caches, list) { 5391 list_for_each_entry(s, &slab_caches, list) {
5395 err = sysfs_slab_add(s); 5392 err = sysfs_slab_add(s);
@@ -5405,11 +5402,11 @@ static int __init slab_sysfs_init(void)
5405 err = sysfs_slab_alias(al->s, al->name); 5402 err = sysfs_slab_alias(al->s, al->name);
5406 if (err) 5403 if (err)
5407 printk(KERN_ERR "SLUB: Unable to add boot slab alias" 5404 printk(KERN_ERR "SLUB: Unable to add boot slab alias"
5408 " %s to sysfs\n", s->name); 5405 " %s to sysfs\n", al->name);
5409 kfree(al); 5406 kfree(al);
5410 } 5407 }
5411 5408
5412 up_write(&slub_lock); 5409 mutex_unlock(&slab_mutex);
5413 resiliency_test(); 5410 resiliency_test();
5414 return 0; 5411 return 0;
5415} 5412}
@@ -5424,7 +5421,7 @@ __initcall(slab_sysfs_init);
5424static void print_slabinfo_header(struct seq_file *m) 5421static void print_slabinfo_header(struct seq_file *m)
5425{ 5422{
5426 seq_puts(m, "slabinfo - version: 2.1\n"); 5423 seq_puts(m, "slabinfo - version: 2.1\n");
5427 seq_puts(m, "# name <active_objs> <num_objs> <objsize> " 5424 seq_puts(m, "# name <active_objs> <num_objs> <object_size> "
5428 "<objperslab> <pagesperslab>"); 5425 "<objperslab> <pagesperslab>");
5429 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 5426 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
5430 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 5427 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
@@ -5435,7 +5432,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
5435{ 5432{
5436 loff_t n = *pos; 5433 loff_t n = *pos;
5437 5434
5438 down_read(&slub_lock); 5435 mutex_lock(&slab_mutex);
5439 if (!n) 5436 if (!n)
5440 print_slabinfo_header(m); 5437 print_slabinfo_header(m);
5441 5438
@@ -5449,7 +5446,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
5449 5446
5450static void s_stop(struct seq_file *m, void *p) 5447static void s_stop(struct seq_file *m, void *p)
5451{ 5448{
5452 up_read(&slub_lock); 5449 mutex_unlock(&slab_mutex);
5453} 5450}
5454 5451
5455static int s_show(struct seq_file *m, void *p) 5452static int s_show(struct seq_file *m, void *p)
diff --git a/mm/sparse.c b/mm/sparse.c
index a8bc7d364deb..fac95f2888f2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -65,21 +65,18 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
65 65
66 if (slab_is_available()) { 66 if (slab_is_available()) {
67 if (node_state(nid, N_HIGH_MEMORY)) 67 if (node_state(nid, N_HIGH_MEMORY))
68 section = kmalloc_node(array_size, GFP_KERNEL, nid); 68 section = kzalloc_node(array_size, GFP_KERNEL, nid);
69 else 69 else
70 section = kmalloc(array_size, GFP_KERNEL); 70 section = kzalloc(array_size, GFP_KERNEL);
71 } else 71 } else {
72 section = alloc_bootmem_node(NODE_DATA(nid), array_size); 72 section = alloc_bootmem_node(NODE_DATA(nid), array_size);
73 73 }
74 if (section)
75 memset(section, 0, array_size);
76 74
77 return section; 75 return section;
78} 76}
79 77
80static int __meminit sparse_index_init(unsigned long section_nr, int nid) 78static int __meminit sparse_index_init(unsigned long section_nr, int nid)
81{ 79{
82 static DEFINE_SPINLOCK(index_init_lock);
83 unsigned long root = SECTION_NR_TO_ROOT(section_nr); 80 unsigned long root = SECTION_NR_TO_ROOT(section_nr);
84 struct mem_section *section; 81 struct mem_section *section;
85 int ret = 0; 82 int ret = 0;
@@ -90,20 +87,9 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
90 section = sparse_index_alloc(nid); 87 section = sparse_index_alloc(nid);
91 if (!section) 88 if (!section)
92 return -ENOMEM; 89 return -ENOMEM;
93 /*
94 * This lock keeps two different sections from
95 * reallocating for the same index
96 */
97 spin_lock(&index_init_lock);
98
99 if (mem_section[root]) {
100 ret = -EEXIST;
101 goto out;
102 }
103 90
104 mem_section[root] = section; 91 mem_section[root] = section;
105out: 92
106 spin_unlock(&index_init_lock);
107 return ret; 93 return ret;
108} 94}
109#else /* !SPARSEMEM_EXTREME */ 95#else /* !SPARSEMEM_EXTREME */
@@ -132,6 +118,8 @@ int __section_nr(struct mem_section* ms)
132 break; 118 break;
133 } 119 }
134 120
121 VM_BUG_ON(root_nr == NR_SECTION_ROOTS);
122
135 return (root_nr * SECTIONS_PER_ROOT) + (ms - root); 123 return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
136} 124}
137 125
@@ -273,10 +261,11 @@ static unsigned long *__kmalloc_section_usemap(void)
273#ifdef CONFIG_MEMORY_HOTREMOVE 261#ifdef CONFIG_MEMORY_HOTREMOVE
274static unsigned long * __init 262static unsigned long * __init
275sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, 263sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
276 unsigned long count) 264 unsigned long size)
277{ 265{
278 unsigned long section_nr; 266 unsigned long goal, limit;
279 267 unsigned long *p;
268 int nid;
280 /* 269 /*
281 * A page may contain usemaps for other sections preventing the 270 * A page may contain usemaps for other sections preventing the
282 * page being freed and making a section unremovable while 271 * page being freed and making a section unremovable while
@@ -287,8 +276,17 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
287 * from the same section as the pgdat where possible to avoid 276 * from the same section as the pgdat where possible to avoid
288 * this problem. 277 * this problem.
289 */ 278 */
290 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); 279 goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
291 return alloc_bootmem_section(usemap_size() * count, section_nr); 280 limit = goal + (1UL << PA_SECTION_SHIFT);
281 nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
282again:
283 p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size,
284 SMP_CACHE_BYTES, goal, limit);
285 if (!p && limit) {
286 limit = 0;
287 goto again;
288 }
289 return p;
292} 290}
293 291
294static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 292static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
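The usemap allocation above now aims at the memory section containing the pgdat (goal and limit bracket that section's physical range) and, if the boot allocator cannot satisfy the request there, retries with the limit dropped so the allocation may land anywhere. The retry shape on its own is sketched below; bootmem_alloc() is a stand-in for ___alloc_bootmem_node_nopanic(), and the toy version simply pretends the preferred window is full.

#include <stdio.h>
#include <stdlib.h>

/* Stand-in allocator: the constrained window [goal, limit) is "full" when limit != 0. */
static void *bootmem_alloc(size_t size, unsigned long goal, unsigned long limit)
{
        if (limit)              /* constrained attempt fails in this toy run */
                return NULL;
        return malloc(size);    /* unconstrained attempt succeeds */
}

static void *alloc_usemap(size_t size, unsigned long goal, unsigned long limit)
{
        void *p;
again:
        p = bootmem_alloc(size, goal, limit);
        if (!p && limit) {      /* could not keep it in the pgdat's section: relax and retry */
                limit = 0;
                goto again;
        }
        return p;
}

int main(void)
{
        void *p = alloc_usemap(4096, 0x100000, 0x140000);
        printf("usemap %sallocated\n", p ? "" : "not ");
        free(p);
        return 0;
}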
@@ -332,9 +330,9 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
332#else 330#else
333static unsigned long * __init 331static unsigned long * __init
334sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, 332sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
335 unsigned long count) 333 unsigned long size)
336{ 334{
337 return NULL; 335 return alloc_bootmem_node_nopanic(pgdat, size);
338} 336}
339 337
340static void __init check_usemap_section_nr(int nid, unsigned long *usemap) 338static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
@@ -352,13 +350,10 @@ static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map,
352 int size = usemap_size(); 350 int size = usemap_size();
353 351
354 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), 352 usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid),
355 usemap_count); 353 size * usemap_count);
356 if (!usemap) { 354 if (!usemap) {
357 usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); 355 printk(KERN_WARNING "%s: allocation failed\n", __func__);
358 if (!usemap) { 356 return;
359 printk(KERN_WARNING "%s: allocation failed\n", __func__);
360 return;
361 }
362 } 357 }
363 358
364 for (pnum = pnum_begin; pnum < pnum_end; pnum++) { 359 for (pnum = pnum_begin; pnum < pnum_end; pnum++) {
@@ -486,6 +481,9 @@ void __init sparse_init(void)
486 struct page **map_map; 481 struct page **map_map;
487#endif 482#endif
488 483
484 /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
485 set_pageblock_order();
486
489 /* 487 /*
490 * map is using big page (aka 2M in x86 64 bit) 488 * map is using big page (aka 2M in x86 64 bit)
491 * usemap is less one page (aka 24 bytes) 489 * usemap is less one page (aka 24 bytes)
diff --git a/mm/swap.c b/mm/swap.c
index 5c13f1338972..77825883298f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -47,13 +47,15 @@ static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
47static void __page_cache_release(struct page *page) 47static void __page_cache_release(struct page *page)
48{ 48{
49 if (PageLRU(page)) { 49 if (PageLRU(page)) {
50 unsigned long flags;
51 struct zone *zone = page_zone(page); 50 struct zone *zone = page_zone(page);
51 struct lruvec *lruvec;
52 unsigned long flags;
52 53
53 spin_lock_irqsave(&zone->lru_lock, flags); 54 spin_lock_irqsave(&zone->lru_lock, flags);
55 lruvec = mem_cgroup_page_lruvec(page, zone);
54 VM_BUG_ON(!PageLRU(page)); 56 VM_BUG_ON(!PageLRU(page));
55 __ClearPageLRU(page); 57 __ClearPageLRU(page);
56 del_page_from_lru_list(zone, page, page_off_lru(page)); 58 del_page_from_lru_list(page, lruvec, page_off_lru(page));
57 spin_unlock_irqrestore(&zone->lru_lock, flags); 59 spin_unlock_irqrestore(&zone->lru_lock, flags);
58 } 60 }
59} 61}
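
This is the pattern the whole series converts mm/swap.c to: the zone lock is taken as before, but the per-zone, per-memcg lruvec is resolved with mem_cgroup_page_lruvec() and passed to the list helpers instead of the bare zone. A hedged sketch of a caller following the same steps (hypothetical helper, mirroring __page_cache_release() above):

static void lru_remove_page_example(struct page *page)
{
        struct zone *zone = page_zone(page);
        struct lruvec *lruvec;
        unsigned long flags;

        spin_lock_irqsave(&zone->lru_lock, flags);
        /* resolve the lruvec under zone->lru_lock, then hand it to the
         * list helpers, which no longer take a zone argument */
        lruvec = mem_cgroup_page_lruvec(page, zone);
        if (PageLRU(page)) {
                __ClearPageLRU(page);
                del_page_from_lru_list(page, lruvec, page_off_lru(page));
        }
        spin_unlock_irqrestore(&zone->lru_lock, flags);
}
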
@@ -82,6 +84,25 @@ static void put_compound_page(struct page *page)
82 if (likely(page != page_head && 84 if (likely(page != page_head &&
83 get_page_unless_zero(page_head))) { 85 get_page_unless_zero(page_head))) {
84 unsigned long flags; 86 unsigned long flags;
87
88 /*
89 * THP can not break up slab pages so avoid taking
90 * compound_lock(). Slab performs non-atomic bit ops
91 * on page->flags for better performance. In particular
92 * slab_unlock() in slub used to be a hot path. It is
93 * still hot on arches that do not support
94 * this_cpu_cmpxchg_double().
95 */
96 if (PageSlab(page_head)) {
97 if (PageTail(page)) {
98 if (put_page_testzero(page_head))
99 VM_BUG_ON(1);
100
101 atomic_dec(&page->_mapcount);
102 goto skip_lock_tail;
103 } else
104 goto skip_lock;
105 }
85 /* 106 /*
86 * page_head wasn't a dangling pointer but it 107 * page_head wasn't a dangling pointer but it
87 * may not be a head page anymore by the time 108 * may not be a head page anymore by the time
@@ -92,10 +113,10 @@ static void put_compound_page(struct page *page)
92 if (unlikely(!PageTail(page))) { 113 if (unlikely(!PageTail(page))) {
93 /* __split_huge_page_refcount run before us */ 114 /* __split_huge_page_refcount run before us */
94 compound_unlock_irqrestore(page_head, flags); 115 compound_unlock_irqrestore(page_head, flags);
95 VM_BUG_ON(PageHead(page_head)); 116skip_lock:
96 if (put_page_testzero(page_head)) 117 if (put_page_testzero(page_head))
97 __put_single_page(page_head); 118 __put_single_page(page_head);
98 out_put_single: 119out_put_single:
99 if (put_page_testzero(page)) 120 if (put_page_testzero(page))
100 __put_single_page(page); 121 __put_single_page(page);
101 return; 122 return;
@@ -115,6 +136,8 @@ static void put_compound_page(struct page *page)
115 VM_BUG_ON(atomic_read(&page_head->_count) <= 0); 136 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
116 VM_BUG_ON(atomic_read(&page->_count) != 0); 137 VM_BUG_ON(atomic_read(&page->_count) != 0);
117 compound_unlock_irqrestore(page_head, flags); 138 compound_unlock_irqrestore(page_head, flags);
139
140skip_lock_tail:
118 if (put_page_testzero(page_head)) { 141 if (put_page_testzero(page_head)) {
119 if (PageHead(page_head)) 142 if (PageHead(page_head))
120 __put_compound_page(page_head); 143 __put_compound_page(page_head);
@@ -162,6 +185,18 @@ bool __get_page_tail(struct page *page)
162 struct page *page_head = compound_trans_head(page); 185 struct page *page_head = compound_trans_head(page);
163 186
164 if (likely(page != page_head && get_page_unless_zero(page_head))) { 187 if (likely(page != page_head && get_page_unless_zero(page_head))) {
188
 189 /* See the comment in put_compound_page(). */
190 if (PageSlab(page_head)) {
191 if (likely(PageTail(page))) {
192 __get_page_tail_foll(page, false);
193 return true;
194 } else {
195 put_page(page_head);
196 return false;
197 }
198 }
199
165 /* 200 /*
166 * page_head wasn't a dangling pointer but it 201 * page_head wasn't a dangling pointer but it
167 * may not be a head page anymore by the time 202 * may not be a head page anymore by the time
@@ -201,12 +236,65 @@ void put_pages_list(struct list_head *pages)
201} 236}
202EXPORT_SYMBOL(put_pages_list); 237EXPORT_SYMBOL(put_pages_list);
203 238
239/*
240 * get_kernel_pages() - pin kernel pages in memory
241 * @kiov: An array of struct kvec structures
242 * @nr_segs: number of segments to pin
243 * @write: pinning for read/write, currently ignored
244 * @pages: array that receives pointers to the pages pinned.
245 * Should be at least nr_segs long.
246 *
247 * Returns number of pages pinned. This may be fewer than the number
 248 * requested. If nr_segs is 0 or negative, returns 0. If no pages
249 * were pinned, returns -errno. Each page returned must be released
250 * with a put_page() call when it is finished with.
251 */
252int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
253 struct page **pages)
254{
255 int seg;
256
257 for (seg = 0; seg < nr_segs; seg++) {
258 if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
259 return seg;
260
261 pages[seg] = kmap_to_page(kiov[seg].iov_base);
262 page_cache_get(pages[seg]);
263 }
264
265 return seg;
266}
267EXPORT_SYMBOL_GPL(get_kernel_pages);
268
269/*
270 * get_kernel_page() - pin a kernel page in memory
271 * @start: starting kernel address
272 * @write: pinning for read/write, currently ignored
273 * @pages: array that receives pointer to the page pinned.
 274 * Must hold at least one page pointer.
275 *
276 * Returns 1 if page is pinned. If the page was not pinned, returns
277 * -errno. The page returned must be released with a put_page() call
278 * when it is finished with.
279 */
280int get_kernel_page(unsigned long start, int write, struct page **pages)
281{
282 const struct kvec kiov = {
283 .iov_base = (void *)start,
284 .iov_len = PAGE_SIZE
285 };
286
287 return get_kernel_pages(&kiov, 1, write, pages);
288}
289EXPORT_SYMBOL_GPL(get_kernel_page);
290
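
A hedged usage sketch for the two helpers added above (the caller and buffer names are hypothetical): pin a pair of page-sized kernel buffers, use them, then drop the references. get_kernel_pages() insists on iov_len == PAGE_SIZE and returns how many segments it actually pinned.

static int pin_two_kernel_buffers(void *buf_a, void *buf_b,
                                  struct page **pages)
{
        const struct kvec kiov[2] = {
                { .iov_base = buf_a, .iov_len = PAGE_SIZE },
                { .iov_base = buf_b, .iov_len = PAGE_SIZE },
        };
        int i, pinned;

        pinned = get_kernel_pages(kiov, 2, 0, pages);

        /* ... hand pages[0..pinned-1] to the I/O path here ... */

        /* once the I/O is done, drop the extra references */
        for (i = 0; i < pinned; i++)
                put_page(pages[i]);

        return pinned;
}
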
204static void pagevec_lru_move_fn(struct pagevec *pvec, 291static void pagevec_lru_move_fn(struct pagevec *pvec,
205 void (*move_fn)(struct page *page, void *arg), 292 void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
206 void *arg) 293 void *arg)
207{ 294{
208 int i; 295 int i;
209 struct zone *zone = NULL; 296 struct zone *zone = NULL;
297 struct lruvec *lruvec;
210 unsigned long flags = 0; 298 unsigned long flags = 0;
211 299
212 for (i = 0; i < pagevec_count(pvec); i++) { 300 for (i = 0; i < pagevec_count(pvec); i++) {
@@ -220,7 +308,8 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
220 spin_lock_irqsave(&zone->lru_lock, flags); 308 spin_lock_irqsave(&zone->lru_lock, flags);
221 } 309 }
222 310
223 (*move_fn)(page, arg); 311 lruvec = mem_cgroup_page_lruvec(page, zone);
312 (*move_fn)(page, lruvec, arg);
224 } 313 }
225 if (zone) 314 if (zone)
226 spin_unlock_irqrestore(&zone->lru_lock, flags); 315 spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -228,16 +317,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
228 pagevec_reinit(pvec); 317 pagevec_reinit(pvec);
229} 318}
230 319
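
With the new prototype, every move_fn callback receives the lruvec that pagevec_lru_move_fn() resolved under zone->lru_lock, as pagevec_move_tail_fn() below shows. A trivial hypothetical callback and caller, just to illustrate the calling convention:

static void count_lru_fn(struct page *page, struct lruvec *lruvec, void *arg)
{
        int *nr_on_lru = arg;

        /* lruvec->lists[] could be manipulated here; this example only
         * counts pages that are still on an LRU list */
        if (PageLRU(page))
                (*nr_on_lru)++;
}

static int count_lru_pages(struct pagevec *pvec)
{
        int nr_on_lru = 0;

        pagevec_lru_move_fn(pvec, count_lru_fn, &nr_on_lru);
        return nr_on_lru;
}
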
231static void pagevec_move_tail_fn(struct page *page, void *arg) 320static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
321 void *arg)
232{ 322{
233 int *pgmoved = arg; 323 int *pgmoved = arg;
234 324
235 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 325 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
236 enum lru_list lru = page_lru_base_type(page); 326 enum lru_list lru = page_lru_base_type(page);
237 struct lruvec *lruvec;
238
239 lruvec = mem_cgroup_lru_move_lists(page_zone(page),
240 page, lru, lru);
241 list_move_tail(&page->lru, &lruvec->lists[lru]); 327 list_move_tail(&page->lru, &lruvec->lists[lru]);
242 (*pgmoved)++; 328 (*pgmoved)++;
243 } 329 }
@@ -276,41 +362,30 @@ void rotate_reclaimable_page(struct page *page)
276 } 362 }
277} 363}
278 364
279static void update_page_reclaim_stat(struct zone *zone, struct page *page, 365static void update_page_reclaim_stat(struct lruvec *lruvec,
280 int file, int rotated) 366 int file, int rotated)
281{ 367{
282 struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat; 368 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
283 struct zone_reclaim_stat *memcg_reclaim_stat;
284
285 memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page);
286 369
287 reclaim_stat->recent_scanned[file]++; 370 reclaim_stat->recent_scanned[file]++;
288 if (rotated) 371 if (rotated)
289 reclaim_stat->recent_rotated[file]++; 372 reclaim_stat->recent_rotated[file]++;
290
291 if (!memcg_reclaim_stat)
292 return;
293
294 memcg_reclaim_stat->recent_scanned[file]++;
295 if (rotated)
296 memcg_reclaim_stat->recent_rotated[file]++;
297} 373}
298 374
299static void __activate_page(struct page *page, void *arg) 375static void __activate_page(struct page *page, struct lruvec *lruvec,
376 void *arg)
300{ 377{
301 struct zone *zone = page_zone(page);
302
303 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 378 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
304 int file = page_is_file_cache(page); 379 int file = page_is_file_cache(page);
305 int lru = page_lru_base_type(page); 380 int lru = page_lru_base_type(page);
306 del_page_from_lru_list(zone, page, lru);
307 381
382 del_page_from_lru_list(page, lruvec, lru);
308 SetPageActive(page); 383 SetPageActive(page);
309 lru += LRU_ACTIVE; 384 lru += LRU_ACTIVE;
310 add_page_to_lru_list(zone, page, lru); 385 add_page_to_lru_list(page, lruvec, lru);
311 __count_vm_event(PGACTIVATE);
312 386
313 update_page_reclaim_stat(zone, page, file, 1); 387 __count_vm_event(PGACTIVATE);
388 update_page_reclaim_stat(lruvec, file, 1);
314 } 389 }
315} 390}
316 391
@@ -347,7 +422,7 @@ void activate_page(struct page *page)
347 struct zone *zone = page_zone(page); 422 struct zone *zone = page_zone(page);
348 423
349 spin_lock_irq(&zone->lru_lock); 424 spin_lock_irq(&zone->lru_lock);
350 __activate_page(page, NULL); 425 __activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
351 spin_unlock_irq(&zone->lru_lock); 426 spin_unlock_irq(&zone->lru_lock);
352} 427}
353#endif 428#endif
@@ -414,11 +489,13 @@ void lru_cache_add_lru(struct page *page, enum lru_list lru)
414void add_page_to_unevictable_list(struct page *page) 489void add_page_to_unevictable_list(struct page *page)
415{ 490{
416 struct zone *zone = page_zone(page); 491 struct zone *zone = page_zone(page);
492 struct lruvec *lruvec;
417 493
418 spin_lock_irq(&zone->lru_lock); 494 spin_lock_irq(&zone->lru_lock);
495 lruvec = mem_cgroup_page_lruvec(page, zone);
419 SetPageUnevictable(page); 496 SetPageUnevictable(page);
420 SetPageLRU(page); 497 SetPageLRU(page);
421 add_page_to_lru_list(zone, page, LRU_UNEVICTABLE); 498 add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
422 spin_unlock_irq(&zone->lru_lock); 499 spin_unlock_irq(&zone->lru_lock);
423} 500}
424 501
@@ -443,11 +520,11 @@ void add_page_to_unevictable_list(struct page *page)
443 * be write it out by flusher threads as this is much more effective 520 * be write it out by flusher threads as this is much more effective
444 * than the single-page writeout from reclaim. 521 * than the single-page writeout from reclaim.
445 */ 522 */
446static void lru_deactivate_fn(struct page *page, void *arg) 523static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
524 void *arg)
447{ 525{
448 int lru, file; 526 int lru, file;
449 bool active; 527 bool active;
450 struct zone *zone = page_zone(page);
451 528
452 if (!PageLRU(page)) 529 if (!PageLRU(page))
453 return; 530 return;
@@ -460,13 +537,13 @@ static void lru_deactivate_fn(struct page *page, void *arg)
460 return; 537 return;
461 538
462 active = PageActive(page); 539 active = PageActive(page);
463
464 file = page_is_file_cache(page); 540 file = page_is_file_cache(page);
465 lru = page_lru_base_type(page); 541 lru = page_lru_base_type(page);
466 del_page_from_lru_list(zone, page, lru + active); 542
543 del_page_from_lru_list(page, lruvec, lru + active);
467 ClearPageActive(page); 544 ClearPageActive(page);
468 ClearPageReferenced(page); 545 ClearPageReferenced(page);
469 add_page_to_lru_list(zone, page, lru); 546 add_page_to_lru_list(page, lruvec, lru);
470 547
471 if (PageWriteback(page) || PageDirty(page)) { 548 if (PageWriteback(page) || PageDirty(page)) {
472 /* 549 /*
@@ -476,19 +553,17 @@ static void lru_deactivate_fn(struct page *page, void *arg)
476 */ 553 */
477 SetPageReclaim(page); 554 SetPageReclaim(page);
478 } else { 555 } else {
479 struct lruvec *lruvec;
480 /* 556 /*
 481 * The page's writeback has finished while the page sat in the 557 * The page's writeback has finished while the page sat in the
 482 * pagevec, so move it to the tail of the inactive list. 558 * pagevec, so move it to the tail of the inactive list.
483 */ 559 */
484 lruvec = mem_cgroup_lru_move_lists(zone, page, lru, lru);
485 list_move_tail(&page->lru, &lruvec->lists[lru]); 560 list_move_tail(&page->lru, &lruvec->lists[lru]);
486 __count_vm_event(PGROTATED); 561 __count_vm_event(PGROTATED);
487 } 562 }
488 563
489 if (active) 564 if (active)
490 __count_vm_event(PGDEACTIVATE); 565 __count_vm_event(PGDEACTIVATE);
491 update_page_reclaim_stat(zone, page, file, 0); 566 update_page_reclaim_stat(lruvec, file, 0);
492} 567}
493 568
494/* 569/*
@@ -588,6 +663,7 @@ void release_pages(struct page **pages, int nr, int cold)
588 int i; 663 int i;
589 LIST_HEAD(pages_to_free); 664 LIST_HEAD(pages_to_free);
590 struct zone *zone = NULL; 665 struct zone *zone = NULL;
666 struct lruvec *lruvec;
591 unsigned long uninitialized_var(flags); 667 unsigned long uninitialized_var(flags);
592 668
593 for (i = 0; i < nr; i++) { 669 for (i = 0; i < nr; i++) {
@@ -615,9 +691,11 @@ void release_pages(struct page **pages, int nr, int cold)
615 zone = pagezone; 691 zone = pagezone;
616 spin_lock_irqsave(&zone->lru_lock, flags); 692 spin_lock_irqsave(&zone->lru_lock, flags);
617 } 693 }
694
695 lruvec = mem_cgroup_page_lruvec(page, zone);
618 VM_BUG_ON(!PageLRU(page)); 696 VM_BUG_ON(!PageLRU(page));
619 __ClearPageLRU(page); 697 __ClearPageLRU(page);
620 del_page_from_lru_list(zone, page, page_off_lru(page)); 698 del_page_from_lru_list(page, lruvec, page_off_lru(page));
621 } 699 }
622 700
623 list_add(&page->lru, &pages_to_free); 701 list_add(&page->lru, &pages_to_free);
@@ -649,8 +727,8 @@ EXPORT_SYMBOL(__pagevec_release);
649 727
650#ifdef CONFIG_TRANSPARENT_HUGEPAGE 728#ifdef CONFIG_TRANSPARENT_HUGEPAGE
651/* used by __split_huge_page_refcount() */ 729/* used by __split_huge_page_refcount() */
652void lru_add_page_tail(struct zone* zone, 730void lru_add_page_tail(struct page *page, struct page *page_tail,
653 struct page *page, struct page *page_tail) 731 struct lruvec *lruvec)
654{ 732{
655 int uninitialized_var(active); 733 int uninitialized_var(active);
656 enum lru_list lru; 734 enum lru_list lru;
@@ -659,7 +737,8 @@ void lru_add_page_tail(struct zone* zone,
659 VM_BUG_ON(!PageHead(page)); 737 VM_BUG_ON(!PageHead(page));
660 VM_BUG_ON(PageCompound(page_tail)); 738 VM_BUG_ON(PageCompound(page_tail));
661 VM_BUG_ON(PageLRU(page_tail)); 739 VM_BUG_ON(PageLRU(page_tail));
662 VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&zone->lru_lock)); 740 VM_BUG_ON(NR_CPUS != 1 &&
741 !spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
663 742
664 SetPageLRU(page_tail); 743 SetPageLRU(page_tail);
665 744
@@ -688,20 +767,20 @@ void lru_add_page_tail(struct zone* zone,
688 * Use the standard add function to put page_tail on the list, 767 * Use the standard add function to put page_tail on the list,
689 * but then correct its position so they all end up in order. 768 * but then correct its position so they all end up in order.
690 */ 769 */
691 add_page_to_lru_list(zone, page_tail, lru); 770 add_page_to_lru_list(page_tail, lruvec, lru);
692 list_head = page_tail->lru.prev; 771 list_head = page_tail->lru.prev;
693 list_move_tail(&page_tail->lru, list_head); 772 list_move_tail(&page_tail->lru, list_head);
694 } 773 }
695 774
696 if (!PageUnevictable(page)) 775 if (!PageUnevictable(page))
697 update_page_reclaim_stat(zone, page_tail, file, active); 776 update_page_reclaim_stat(lruvec, file, active);
698} 777}
699#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 778#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
700 779
701static void __pagevec_lru_add_fn(struct page *page, void *arg) 780static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
781 void *arg)
702{ 782{
703 enum lru_list lru = (enum lru_list)arg; 783 enum lru_list lru = (enum lru_list)arg;
704 struct zone *zone = page_zone(page);
705 int file = is_file_lru(lru); 784 int file = is_file_lru(lru);
706 int active = is_active_lru(lru); 785 int active = is_active_lru(lru);
707 786
@@ -712,8 +791,8 @@ static void __pagevec_lru_add_fn(struct page *page, void *arg)
712 SetPageLRU(page); 791 SetPageLRU(page);
713 if (active) 792 if (active)
714 SetPageActive(page); 793 SetPageActive(page);
715 add_page_to_lru_list(zone, page, lru); 794 add_page_to_lru_list(page, lruvec, lru);
716 update_page_reclaim_stat(zone, page, file, active); 795 update_page_reclaim_stat(lruvec, file, active);
717} 796}
718 797
719/* 798/*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 9d3dd3763cf7..0cb36fb1f61c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/blkdev.h>
17#include <linux/pagevec.h> 18#include <linux/pagevec.h>
18#include <linux/migrate.h> 19#include <linux/migrate.h>
19#include <linux/page_cgroup.h> 20#include <linux/page_cgroup.h>
@@ -26,7 +27,7 @@
26 */ 27 */
27static const struct address_space_operations swap_aops = { 28static const struct address_space_operations swap_aops = {
28 .writepage = swap_writepage, 29 .writepage = swap_writepage,
29 .set_page_dirty = __set_page_dirty_nobuffers, 30 .set_page_dirty = swap_set_page_dirty,
30 .migratepage = migrate_page, 31 .migratepage = migrate_page,
31}; 32};
32 33
@@ -376,6 +377,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
376 unsigned long offset = swp_offset(entry); 377 unsigned long offset = swp_offset(entry);
377 unsigned long start_offset, end_offset; 378 unsigned long start_offset, end_offset;
378 unsigned long mask = (1UL << page_cluster) - 1; 379 unsigned long mask = (1UL << page_cluster) - 1;
380 struct blk_plug plug;
379 381
380 /* Read a page_cluster sized and aligned cluster around offset. */ 382 /* Read a page_cluster sized and aligned cluster around offset. */
381 start_offset = offset & ~mask; 383 start_offset = offset & ~mask;
@@ -383,6 +385,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
383 if (!start_offset) /* First page is swap header. */ 385 if (!start_offset) /* First page is swap header. */
384 start_offset++; 386 start_offset++;
385 387
388 blk_start_plug(&plug);
386 for (offset = start_offset; offset <= end_offset ; offset++) { 389 for (offset = start_offset; offset <= end_offset ; offset++) {
387 /* Ok, do the async read-ahead now */ 390 /* Ok, do the async read-ahead now */
388 page = read_swap_cache_async(swp_entry(swp_type(entry), offset), 391 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
@@ -391,6 +394,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
391 continue; 394 continue;
392 page_cache_release(page); 395 page_cache_release(page);
393 } 396 }
397 blk_finish_plug(&plug);
398
394 lru_add_drain(); /* Push any new pages onto the LRU now */ 399 lru_add_drain(); /* Push any new pages onto the LRU now */
395 return read_swap_cache_async(entry, gfp_mask, vma, addr); 400 return read_swap_cache_async(entry, gfp_mask, vma, addr);
396} 401}
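
The plug added above lets the per-slot asynchronous reads queue up on the task and be merged before they reach the block layer. The same pattern in isolation (hypothetical helper, reusing the calls visible in the hunk):

static void swap_readahead_batch(swp_entry_t *entries, int nr,
                                 gfp_t gfp_mask,
                                 struct vm_area_struct *vma,
                                 unsigned long addr)
{
        struct blk_plug plug;
        int i;

        blk_start_plug(&plug);
        for (i = 0; i < nr; i++) {
                /* reads started while plugged sit on the task's plug
                 * list, giving the block layer a chance to merge them */
                struct page *page = read_swap_cache_async(entries[i],
                                                gfp_mask, vma, addr);
                if (page)
                        page_cache_release(page);
        }
        blk_finish_plug(&plug);         /* submit the whole batch */
}
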
diff --git a/mm/swapfile.c b/mm/swapfile.c
index fafc26d1b1dc..14e254c768fc 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -31,6 +31,9 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/poll.h> 32#include <linux/poll.h>
33#include <linux/oom.h> 33#include <linux/oom.h>
34#include <linux/frontswap.h>
35#include <linux/swapfile.h>
36#include <linux/export.h>
34 37
35#include <asm/pgtable.h> 38#include <asm/pgtable.h>
36#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
@@ -42,7 +45,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
42static void free_swap_count_continuations(struct swap_info_struct *); 45static void free_swap_count_continuations(struct swap_info_struct *);
43static sector_t map_swap_entry(swp_entry_t, struct block_device**); 46static sector_t map_swap_entry(swp_entry_t, struct block_device**);
44 47
45static DEFINE_SPINLOCK(swap_lock); 48DEFINE_SPINLOCK(swap_lock);
46static unsigned int nr_swapfiles; 49static unsigned int nr_swapfiles;
47long nr_swap_pages; 50long nr_swap_pages;
48long total_swap_pages; 51long total_swap_pages;
@@ -53,9 +56,9 @@ static const char Unused_file[] = "Unused swap file entry ";
53static const char Bad_offset[] = "Bad swap offset entry "; 56static const char Bad_offset[] = "Bad swap offset entry ";
54static const char Unused_offset[] = "Unused swap offset entry "; 57static const char Unused_offset[] = "Unused swap offset entry ";
55 58
56static struct swap_list_t swap_list = {-1, -1}; 59struct swap_list_t swap_list = {-1, -1};
57 60
58static struct swap_info_struct *swap_info[MAX_SWAPFILES]; 61struct swap_info_struct *swap_info[MAX_SWAPFILES];
59 62
60static DEFINE_MUTEX(swapon_mutex); 63static DEFINE_MUTEX(swapon_mutex);
61 64
@@ -546,7 +549,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
546 549
547 /* free if no reference */ 550 /* free if no reference */
548 if (!usage) { 551 if (!usage) {
549 struct gendisk *disk = p->bdev->bd_disk;
550 if (offset < p->lowest_bit) 552 if (offset < p->lowest_bit)
551 p->lowest_bit = offset; 553 p->lowest_bit = offset;
552 if (offset > p->highest_bit) 554 if (offset > p->highest_bit)
@@ -556,9 +558,13 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
556 swap_list.next = p->type; 558 swap_list.next = p->type;
557 nr_swap_pages++; 559 nr_swap_pages++;
558 p->inuse_pages--; 560 p->inuse_pages--;
559 if ((p->flags & SWP_BLKDEV) && 561 frontswap_invalidate_page(p->type, offset);
560 disk->fops->swap_slot_free_notify) 562 if (p->flags & SWP_BLKDEV) {
561 disk->fops->swap_slot_free_notify(p->bdev, offset); 563 struct gendisk *disk = p->bdev->bd_disk;
564 if (disk->fops->swap_slot_free_notify)
565 disk->fops->swap_slot_free_notify(p->bdev,
566 offset);
567 }
562 } 568 }
563 569
564 return usage; 570 return usage;
@@ -601,7 +607,7 @@ void swapcache_free(swp_entry_t entry, struct page *page)
601 * This does not give an exact answer when swap count is continued, 607 * This does not give an exact answer when swap count is continued,
602 * but does include the high COUNT_CONTINUED flag to allow for that. 608 * but does include the high COUNT_CONTINUED flag to allow for that.
603 */ 609 */
604static inline int page_swapcount(struct page *page) 610int page_swapcount(struct page *page)
605{ 611{
606 int count = 0; 612 int count = 0;
607 struct swap_info_struct *p; 613 struct swap_info_struct *p;
@@ -717,37 +723,6 @@ int free_swap_and_cache(swp_entry_t entry)
717 return p != NULL; 723 return p != NULL;
718} 724}
719 725
720#ifdef CONFIG_CGROUP_MEM_RES_CTLR
721/**
722 * mem_cgroup_count_swap_user - count the user of a swap entry
723 * @ent: the swap entry to be checked
724 * @pagep: the pointer for the swap cache page of the entry to be stored
725 *
726 * Returns the number of the user of the swap entry. The number is valid only
727 * for swaps of anonymous pages.
728 * If the entry is found on swap cache, the page is stored to pagep with
729 * refcount of it being incremented.
730 */
731int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
732{
733 struct page *page;
734 struct swap_info_struct *p;
735 int count = 0;
736
737 page = find_get_page(&swapper_space, ent.val);
738 if (page)
739 count += page_mapcount(page);
740 p = swap_info_get(ent);
741 if (p) {
742 count += swap_count(p->swap_map[swp_offset(ent)]);
743 spin_unlock(&swap_lock);
744 }
745
746 *pagep = page;
747 return count;
748}
749#endif
750
751#ifdef CONFIG_HIBERNATION 726#ifdef CONFIG_HIBERNATION
752/* 727/*
753 * Find the swap type that corresponds to given device (if any). 728 * Find the swap type that corresponds to given device (if any).
@@ -860,8 +835,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
860 835
861 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 836 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
862 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 837 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
863 if (ret > 0) 838 mem_cgroup_cancel_charge_swapin(memcg);
864 mem_cgroup_cancel_charge_swapin(memcg);
865 ret = 0; 839 ret = 0;
866 goto out; 840 goto out;
867 } 841 }
@@ -1016,11 +990,12 @@ static int unuse_mm(struct mm_struct *mm,
1016} 990}
1017 991
1018/* 992/*
1019 * Scan swap_map from current position to next entry still in use. 993 * Scan swap_map (or frontswap_map if frontswap parameter is true)
994 * from current position to next entry still in use.
1020 * Recycle to start on reaching the end, returning 0 when empty. 995 * Recycle to start on reaching the end, returning 0 when empty.
1021 */ 996 */
1022static unsigned int find_next_to_unuse(struct swap_info_struct *si, 997static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1023 unsigned int prev) 998 unsigned int prev, bool frontswap)
1024{ 999{
1025 unsigned int max = si->max; 1000 unsigned int max = si->max;
1026 unsigned int i = prev; 1001 unsigned int i = prev;
@@ -1046,6 +1021,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1046 prev = 0; 1021 prev = 0;
1047 i = 1; 1022 i = 1;
1048 } 1023 }
1024 if (frontswap) {
1025 if (frontswap_test(si, i))
1026 break;
1027 else
1028 continue;
1029 }
1049 count = si->swap_map[i]; 1030 count = si->swap_map[i];
1050 if (count && swap_count(count) != SWAP_MAP_BAD) 1031 if (count && swap_count(count) != SWAP_MAP_BAD)
1051 break; 1032 break;
@@ -1057,8 +1038,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
1057 * We completely avoid races by reading each swap page in advance, 1038 * We completely avoid races by reading each swap page in advance,
1058 * and then search for the process using it. All the necessary 1039 * and then search for the process using it. All the necessary
1059 * page table adjustments can then be made atomically. 1040 * page table adjustments can then be made atomically.
1041 *
1042 * if the boolean frontswap is true, only unuse pages_to_unuse pages;
1043 * pages_to_unuse==0 means all pages; ignored if frontswap is false
1060 */ 1044 */
1061static int try_to_unuse(unsigned int type) 1045int try_to_unuse(unsigned int type, bool frontswap,
1046 unsigned long pages_to_unuse)
1062{ 1047{
1063 struct swap_info_struct *si = swap_info[type]; 1048 struct swap_info_struct *si = swap_info[type];
1064 struct mm_struct *start_mm; 1049 struct mm_struct *start_mm;
@@ -1091,7 +1076,7 @@ static int try_to_unuse(unsigned int type)
1091 * one pass through swap_map is enough, but not necessarily: 1076 * one pass through swap_map is enough, but not necessarily:
1092 * there are races when an instance of an entry might be missed. 1077 * there are races when an instance of an entry might be missed.
1093 */ 1078 */
1094 while ((i = find_next_to_unuse(si, i)) != 0) { 1079 while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
1095 if (signal_pending(current)) { 1080 if (signal_pending(current)) {
1096 retval = -EINTR; 1081 retval = -EINTR;
1097 break; 1082 break;
@@ -1258,6 +1243,10 @@ static int try_to_unuse(unsigned int type)
1258 * interactive performance. 1243 * interactive performance.
1259 */ 1244 */
1260 cond_resched(); 1245 cond_resched();
1246 if (frontswap && pages_to_unuse > 0) {
1247 if (!--pages_to_unuse)
1248 break;
1249 }
1261 } 1250 }
1262 1251
1263 mmput(start_mm); 1252 mmput(start_mm);
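
try_to_unuse() now has a frontswap mode: when the flag is true, find_next_to_unuse() only visits slots whose bit is set in the frontswap map, and the loop stops once pages_to_unuse pages have been pulled back in (0 meaning no limit). A hedged sketch of how such a caller might look (not the actual frontswap code); swapoff keeps the old behaviour by passing false/0:

static int unuse_some_frontswap_pages(int type, unsigned long nr)
{
        /* only slots marked in the frontswap map are considered;
         * stop after 'nr' pages, or sweep them all if nr == 0 */
        return try_to_unuse(type, true, nr);
}
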
@@ -1341,6 +1330,14 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
1341 list_del(&se->list); 1330 list_del(&se->list);
1342 kfree(se); 1331 kfree(se);
1343 } 1332 }
1333
1334 if (sis->flags & SWP_FILE) {
1335 struct file *swap_file = sis->swap_file;
1336 struct address_space *mapping = swap_file->f_mapping;
1337
1338 sis->flags &= ~SWP_FILE;
1339 mapping->a_ops->swap_deactivate(swap_file);
1340 }
1344} 1341}
1345 1342
1346/* 1343/*
@@ -1349,7 +1346,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
1349 * 1346 *
1350 * This function rather assumes that it is called in ascending page order. 1347 * This function rather assumes that it is called in ascending page order.
1351 */ 1348 */
1352static int 1349int
1353add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 1350add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1354 unsigned long nr_pages, sector_t start_block) 1351 unsigned long nr_pages, sector_t start_block)
1355{ 1352{
@@ -1422,102 +1419,33 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1422 */ 1419 */
1423static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 1420static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1424{ 1421{
1425 struct inode *inode; 1422 struct file *swap_file = sis->swap_file;
1426 unsigned blocks_per_page; 1423 struct address_space *mapping = swap_file->f_mapping;
1427 unsigned long page_no; 1424 struct inode *inode = mapping->host;
1428 unsigned blkbits;
1429 sector_t probe_block;
1430 sector_t last_block;
1431 sector_t lowest_block = -1;
1432 sector_t highest_block = 0;
1433 int nr_extents = 0;
1434 int ret; 1425 int ret;
1435 1426
1436 inode = sis->swap_file->f_mapping->host;
1437 if (S_ISBLK(inode->i_mode)) { 1427 if (S_ISBLK(inode->i_mode)) {
1438 ret = add_swap_extent(sis, 0, sis->max, 0); 1428 ret = add_swap_extent(sis, 0, sis->max, 0);
1439 *span = sis->pages; 1429 *span = sis->pages;
1440 goto out; 1430 return ret;
1441 } 1431 }
1442 1432
1443 blkbits = inode->i_blkbits; 1433 if (mapping->a_ops->swap_activate) {
1444 blocks_per_page = PAGE_SIZE >> blkbits; 1434 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
1445 1435 if (!ret) {
1446 /* 1436 sis->flags |= SWP_FILE;
1447 * Map all the blocks into the extent list. This code doesn't try 1437 ret = add_swap_extent(sis, 0, sis->max, 0);
1448 * to be very smart. 1438 *span = sis->pages;
1449 */
1450 probe_block = 0;
1451 page_no = 0;
1452 last_block = i_size_read(inode) >> blkbits;
1453 while ((probe_block + blocks_per_page) <= last_block &&
1454 page_no < sis->max) {
1455 unsigned block_in_page;
1456 sector_t first_block;
1457
1458 first_block = bmap(inode, probe_block);
1459 if (first_block == 0)
1460 goto bad_bmap;
1461
1462 /*
1463 * It must be PAGE_SIZE aligned on-disk
1464 */
1465 if (first_block & (blocks_per_page - 1)) {
1466 probe_block++;
1467 goto reprobe;
1468 }
1469
1470 for (block_in_page = 1; block_in_page < blocks_per_page;
1471 block_in_page++) {
1472 sector_t block;
1473
1474 block = bmap(inode, probe_block + block_in_page);
1475 if (block == 0)
1476 goto bad_bmap;
1477 if (block != first_block + block_in_page) {
1478 /* Discontiguity */
1479 probe_block++;
1480 goto reprobe;
1481 }
1482 }
1483
1484 first_block >>= (PAGE_SHIFT - blkbits);
1485 if (page_no) { /* exclude the header page */
1486 if (first_block < lowest_block)
1487 lowest_block = first_block;
1488 if (first_block > highest_block)
1489 highest_block = first_block;
1490 } 1439 }
1440 return ret;
1441 }
1491 1442
1492 /* 1443 return generic_swapfile_activate(sis, swap_file, span);
1493 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
1494 */
1495 ret = add_swap_extent(sis, page_no, 1, first_block);
1496 if (ret < 0)
1497 goto out;
1498 nr_extents += ret;
1499 page_no++;
1500 probe_block += blocks_per_page;
1501reprobe:
1502 continue;
1503 }
1504 ret = nr_extents;
1505 *span = 1 + highest_block - lowest_block;
1506 if (page_no == 0)
1507 page_no = 1; /* force Empty message */
1508 sis->max = page_no;
1509 sis->pages = page_no - 1;
1510 sis->highest_bit = page_no - 1;
1511out:
1512 return ret;
1513bad_bmap:
1514 printk(KERN_ERR "swapon: swapfile has holes\n");
1515 ret = -EINVAL;
1516 goto out;
1517} 1444}
1518 1445
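
setup_swap_extents() now gives the backing filesystem first refusal: if a_ops->swap_activate() succeeds, the swapfile is flagged SWP_FILE and covered by a single extent, and destroy_swap_extents() later calls a_ops->swap_deactivate(). Only when the hook is absent does the old bmap-walking path (now generic_swapfile_activate()) run. A hedged stub of the filesystem side, with hypothetical names and the signatures inferred from the call sites above:

static int examplefs_swap_activate(struct swap_info_struct *sis,
                                   struct file *swap_file, sector_t *span)
{
        /* a real filesystem would pin the file's layout here and
         * remember whatever its swap I/O paths need later */
        *span = sis->pages;     /* extent span reported back to swapon */
        return 0;               /* 0 => core sets SWP_FILE, adds one extent */
}

static void examplefs_swap_deactivate(struct file *swap_file)
{
        /* undo whatever swap_activate() set up */
}
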
1519static void enable_swap_info(struct swap_info_struct *p, int prio, 1446static void enable_swap_info(struct swap_info_struct *p, int prio,
1520 unsigned char *swap_map) 1447 unsigned char *swap_map,
1448 unsigned long *frontswap_map)
1521{ 1449{
1522 int i, prev; 1450 int i, prev;
1523 1451
@@ -1527,6 +1455,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1527 else 1455 else
1528 p->prio = --least_priority; 1456 p->prio = --least_priority;
1529 p->swap_map = swap_map; 1457 p->swap_map = swap_map;
1458 frontswap_map_set(p, frontswap_map);
1530 p->flags |= SWP_WRITEOK; 1459 p->flags |= SWP_WRITEOK;
1531 nr_swap_pages += p->pages; 1460 nr_swap_pages += p->pages;
1532 total_swap_pages += p->pages; 1461 total_swap_pages += p->pages;
@@ -1543,6 +1472,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
1543 swap_list.head = swap_list.next = p->type; 1472 swap_list.head = swap_list.next = p->type;
1544 else 1473 else
1545 swap_info[prev]->next = p->type; 1474 swap_info[prev]->next = p->type;
1475 frontswap_init(p->type);
1546 spin_unlock(&swap_lock); 1476 spin_unlock(&swap_lock);
1547} 1477}
1548 1478
@@ -1616,7 +1546,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1616 spin_unlock(&swap_lock); 1546 spin_unlock(&swap_lock);
1617 1547
1618 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1548 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1619 err = try_to_unuse(type); 1549 err = try_to_unuse(type, false, 0); /* force all pages to be unused */
1620 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); 1550 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
1621 1551
1622 if (err) { 1552 if (err) {
@@ -1627,7 +1557,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1627 * sys_swapoff for this swap_info_struct at this point. 1557 * sys_swapoff for this swap_info_struct at this point.
1628 */ 1558 */
1629 /* re-insert swap space back into swap_list */ 1559 /* re-insert swap space back into swap_list */
1630 enable_swap_info(p, p->prio, p->swap_map); 1560 enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p));
1631 goto out_dput; 1561 goto out_dput;
1632 } 1562 }
1633 1563
@@ -1653,9 +1583,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1653 swap_map = p->swap_map; 1583 swap_map = p->swap_map;
1654 p->swap_map = NULL; 1584 p->swap_map = NULL;
1655 p->flags = 0; 1585 p->flags = 0;
1586 frontswap_invalidate_area(type);
1656 spin_unlock(&swap_lock); 1587 spin_unlock(&swap_lock);
1657 mutex_unlock(&swapon_mutex); 1588 mutex_unlock(&swapon_mutex);
1658 vfree(swap_map); 1589 vfree(swap_map);
1590 vfree(frontswap_map_get(p));
 1659 /* Destroy swap account information */ 1591 /* Destroy swap account information */
1660 swap_cgroup_swapoff(type); 1592 swap_cgroup_swapoff(type);
1661 1593
@@ -1924,24 +1856,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1924 1856
1925 /* 1857 /*
1926 * Find out how many pages are allowed for a single swap 1858 * Find out how many pages are allowed for a single swap
1927 * device. There are three limiting factors: 1) the number 1859 * device. There are two limiting factors: 1) the number
1928 * of bits for the swap offset in the swp_entry_t type, and 1860 * of bits for the swap offset in the swp_entry_t type, and
1929 * 2) the number of bits in the swap pte as defined by the 1861 * 2) the number of bits in the swap pte as defined by the
1930 * the different architectures, and 3) the number of free bits 1862 * different architectures. In order to find the
1931 * in an exceptional radix_tree entry. In order to find the
1932 * largest possible bit mask, a swap entry with swap type 0 1863 * largest possible bit mask, a swap entry with swap type 0
1933 * and swap offset ~0UL is created, encoded to a swap pte, 1864 * and swap offset ~0UL is created, encoded to a swap pte,
1934 * decoded to a swp_entry_t again, and finally the swap 1865 * decoded to a swp_entry_t again, and finally the swap
1935 * offset is extracted. This will mask all the bits from 1866 * offset is extracted. This will mask all the bits from
1936 * the initial ~0UL mask that can't be encoded in either 1867 * the initial ~0UL mask that can't be encoded in either
1937 * the swp_entry_t or the architecture definition of a 1868 * the swp_entry_t or the architecture definition of a
1938 * swap pte. Then the same is done for a radix_tree entry. 1869 * swap pte.
1939 */ 1870 */
1940 maxpages = swp_offset(pte_to_swp_entry( 1871 maxpages = swp_offset(pte_to_swp_entry(
1941 swp_entry_to_pte(swp_entry(0, ~0UL)))); 1872 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
1942 maxpages = swp_offset(radix_to_swp_entry(
1943 swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
1944
1945 if (maxpages > swap_header->info.last_page) { 1873 if (maxpages > swap_header->info.last_page) {
1946 maxpages = swap_header->info.last_page + 1; 1874 maxpages = swap_header->info.last_page + 1;
1947 /* p->max is an unsigned int: don't overflow it */ 1875 /* p->max is an unsigned int: don't overflow it */
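
With the exceptional-radix-tree clamp gone, the offset limit comes solely from the swap-pte round trip. A sketch restating that computation on its own:

static unsigned long max_swap_offset_pages(void)
{
        /* start from type 0 with every offset bit set ... */
        swp_entry_t ent = swp_entry(0, ~0UL);

        /* ... encode to a swap pte and decode again: bits that do not
         * survive the architecture's pte format are masked away */
        unsigned long max_off =
                swp_offset(pte_to_swp_entry(swp_entry_to_pte(ent)));

        return max_off + 1;     /* offsets are zero-based */
}
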
@@ -2019,6 +1947,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2019 sector_t span; 1947 sector_t span;
2020 unsigned long maxpages; 1948 unsigned long maxpages;
2021 unsigned char *swap_map = NULL; 1949 unsigned char *swap_map = NULL;
1950 unsigned long *frontswap_map = NULL;
2022 struct page *page = NULL; 1951 struct page *page = NULL;
2023 struct inode *inode = NULL; 1952 struct inode *inode = NULL;
2024 1953
@@ -2102,6 +2031,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2102 error = nr_extents; 2031 error = nr_extents;
2103 goto bad_swap; 2032 goto bad_swap;
2104 } 2033 }
2034 /* frontswap enabled? set up bit-per-page map for frontswap */
2035 if (frontswap_enabled)
2036 frontswap_map = vzalloc(maxpages / sizeof(long));
2105 2037
2106 if (p->bdev) { 2038 if (p->bdev) {
2107 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2039 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
@@ -2117,14 +2049,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2117 if (swap_flags & SWAP_FLAG_PREFER) 2049 if (swap_flags & SWAP_FLAG_PREFER)
2118 prio = 2050 prio =
2119 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2051 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2120 enable_swap_info(p, prio, swap_map); 2052 enable_swap_info(p, prio, swap_map, frontswap_map);
2121 2053
2122 printk(KERN_INFO "Adding %uk swap on %s. " 2054 printk(KERN_INFO "Adding %uk swap on %s. "
2123 "Priority:%d extents:%d across:%lluk %s%s\n", 2055 "Priority:%d extents:%d across:%lluk %s%s%s\n",
2124 p->pages<<(PAGE_SHIFT-10), name, p->prio, 2056 p->pages<<(PAGE_SHIFT-10), name, p->prio,
2125 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2057 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2126 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2058 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2127 (p->flags & SWP_DISCARDABLE) ? "D" : ""); 2059 (p->flags & SWP_DISCARDABLE) ? "D" : "",
2060 (frontswap_map) ? "FS" : "");
2128 2061
2129 mutex_unlock(&swapon_mutex); 2062 mutex_unlock(&swapon_mutex);
2130 atomic_inc(&proc_poll_event); 2063 atomic_inc(&proc_poll_event);
@@ -2292,6 +2225,31 @@ int swapcache_prepare(swp_entry_t entry)
2292 return __swap_duplicate(entry, SWAP_HAS_CACHE); 2225 return __swap_duplicate(entry, SWAP_HAS_CACHE);
2293} 2226}
2294 2227
2228struct swap_info_struct *page_swap_info(struct page *page)
2229{
2230 swp_entry_t swap = { .val = page_private(page) };
2231 BUG_ON(!PageSwapCache(page));
2232 return swap_info[swp_type(swap)];
2233}
2234
2235/*
2236 * out-of-line __page_file_ methods to avoid include hell.
2237 */
2238struct address_space *__page_file_mapping(struct page *page)
2239{
2240 VM_BUG_ON(!PageSwapCache(page));
2241 return page_swap_info(page)->swap_file->f_mapping;
2242}
2243EXPORT_SYMBOL_GPL(__page_file_mapping);
2244
2245pgoff_t __page_file_index(struct page *page)
2246{
2247 swp_entry_t swap = { .val = page_private(page) };
2248 VM_BUG_ON(!PageSwapCache(page));
2249 return swp_offset(swap);
2250}
2251EXPORT_SYMBOL_GPL(__page_file_index);
2252
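
These out-of-line helpers exist so swap-over-file I/O can reach the backing file's mapping and the page's slot in the swapfile without dragging swap internals into mm.h. A hedged consumer sketch (hypothetical helper), assuming a single-extent SWP_FILE swapfile where the swap offset is also the page's index in the backing file:

static loff_t swap_page_file_pos(struct page *page)
{
        struct address_space *mapping = __page_file_mapping(page);
        pgoff_t index = __page_file_index(page);

        (void)mapping;  /* e.g. hand mapping + offset to a_ops-based I/O */
        return (loff_t)index << PAGE_SHIFT;
}
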
2295/* 2253/*
2296 * add_swap_count_continuation - called when a swap count is duplicated 2254 * add_swap_count_continuation - called when a swap count is duplicated
2297 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 2255 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
diff --git a/mm/thrash.c b/mm/thrash.c
deleted file mode 100644
index 57ad495dbd54..000000000000
--- a/mm/thrash.c
+++ /dev/null
@@ -1,155 +0,0 @@
1/*
2 * mm/thrash.c
3 *
4 * Copyright (C) 2004, Red Hat, Inc.
5 * Copyright (C) 2004, Rik van Riel <riel@redhat.com>
6 * Released under the GPL, see the file COPYING for details.
7 *
8 * Simple token based thrashing protection, using the algorithm
9 * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html
10 *
11 * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com>
12 * Improved algorithm to pass token:
13 * Each task has a priority which is incremented if it contended
14 * for the token in an interval less than its previous attempt.
15 * If the token is acquired, that task's priority is boosted to prevent
16 * the token from bouncing around too often and to let the task make
17 * some progress in its execution.
18 */
19
20#include <linux/jiffies.h>
21#include <linux/mm.h>
22#include <linux/sched.h>
23#include <linux/swap.h>
24#include <linux/memcontrol.h>
25
26#include <trace/events/vmscan.h>
27
28#define TOKEN_AGING_INTERVAL (0xFF)
29
30static DEFINE_SPINLOCK(swap_token_lock);
31struct mm_struct *swap_token_mm;
32static struct mem_cgroup *swap_token_memcg;
33
34#ifdef CONFIG_CGROUP_MEM_RES_CTLR
35static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
36{
37 struct mem_cgroup *memcg;
38
39 memcg = try_get_mem_cgroup_from_mm(mm);
40 if (memcg)
41 css_put(mem_cgroup_css(memcg));
42
43 return memcg;
44}
45#else
46static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
47{
48 return NULL;
49}
50#endif
51
52void grab_swap_token(struct mm_struct *mm)
53{
54 int current_interval;
55 unsigned int old_prio = mm->token_priority;
56 static unsigned int global_faults;
57 static unsigned int last_aging;
58
59 global_faults++;
60
61 current_interval = global_faults - mm->faultstamp;
62
63 if (!spin_trylock(&swap_token_lock))
64 return;
65
66 /* First come first served */
67 if (!swap_token_mm)
68 goto replace_token;
69
70 /*
71 * Usually, we don't need priority aging because long interval faults
72 * makes priority decrease quickly. But there is one exception. If the
73 * token owner task is sleeping, it never make long interval faults.
74 * Thus, we need a priority aging mechanism instead. The requirements
75 * of priority aging are
76 * 1) An aging interval is reasonable enough long. Too short aging
77 * interval makes quick swap token lost and decrease performance.
78 * 2) The swap token owner task have to get priority aging even if
79 * it's under sleep.
80 */
81 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
82 swap_token_mm->token_priority /= 2;
83 last_aging = global_faults;
84 }
85
86 if (mm == swap_token_mm) {
87 mm->token_priority += 2;
88 goto update_priority;
89 }
90
91 if (current_interval < mm->last_interval)
92 mm->token_priority++;
93 else {
94 if (likely(mm->token_priority > 0))
95 mm->token_priority--;
96 }
97
98 /* Check if we deserve the token */
99 if (mm->token_priority > swap_token_mm->token_priority)
100 goto replace_token;
101
102update_priority:
103 trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
104
105out:
106 mm->faultstamp = global_faults;
107 mm->last_interval = current_interval;
108 spin_unlock(&swap_token_lock);
109 return;
110
111replace_token:
112 mm->token_priority += 2;
113 trace_replace_swap_token(swap_token_mm, mm);
114 swap_token_mm = mm;
115 swap_token_memcg = swap_token_memcg_from_mm(mm);
116 last_aging = global_faults;
117 goto out;
118}
119
120/* Called on process exit. */
121void __put_swap_token(struct mm_struct *mm)
122{
123 spin_lock(&swap_token_lock);
124 if (likely(mm == swap_token_mm)) {
125 trace_put_swap_token(swap_token_mm);
126 swap_token_mm = NULL;
127 swap_token_memcg = NULL;
128 }
129 spin_unlock(&swap_token_lock);
130}
131
132static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
133{
134 if (!a)
135 return true;
136 if (!b)
137 return true;
138 if (a == b)
139 return true;
140 return false;
141}
142
143void disable_swap_token(struct mem_cgroup *memcg)
144{
145 /* memcg reclaim don't disable unrelated mm token. */
146 if (match_memcg(memcg, swap_token_memcg)) {
147 spin_lock(&swap_token_lock);
148 if (match_memcg(memcg, swap_token_memcg)) {
149 trace_disable_swap_token(swap_token_mm);
150 swap_token_mm = NULL;
151 swap_token_memcg = NULL;
152 }
153 spin_unlock(&swap_token_lock);
154 }
155}
diff --git a/mm/truncate.c b/mm/truncate.c
index 61a183b89df6..75801acdaac7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -602,31 +602,6 @@ int vmtruncate(struct inode *inode, loff_t newsize)
602} 602}
603EXPORT_SYMBOL(vmtruncate); 603EXPORT_SYMBOL(vmtruncate);
604 604
605int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend)
606{
607 struct address_space *mapping = inode->i_mapping;
608 loff_t holebegin = round_up(lstart, PAGE_SIZE);
609 loff_t holelen = 1 + lend - holebegin;
610
611 /*
612 * If the underlying filesystem is not going to provide
613 * a way to truncate a range of blocks (punch a hole) -
614 * we should return failure right now.
615 */
616 if (!inode->i_op->truncate_range)
617 return -ENOSYS;
618
619 mutex_lock(&inode->i_mutex);
620 inode_dio_wait(inode);
621 unmap_mapping_range(mapping, holebegin, holelen, 1);
622 inode->i_op->truncate_range(inode, lstart, lend);
623 /* unmap again to remove racily COWed private pages */
624 unmap_mapping_range(mapping, holebegin, holelen, 1);
625 mutex_unlock(&inode->i_mutex);
626
627 return 0;
628}
629
630/** 605/**
631 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched 606 * truncate_pagecache_range - unmap and remove pagecache that is hole-punched
632 * @inode: inode 607 * @inode: inode
diff --git a/mm/util.c b/mm/util.c
index ae962b31de88..8c7265afa29f 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,6 +4,7 @@
4#include <linux/export.h> 4#include <linux/export.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/security.h>
7#include <asm/uaccess.h> 8#include <asm/uaccess.h>
8 9
9#include "internal.h" 10#include "internal.h"
@@ -341,6 +342,35 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
341} 342}
342EXPORT_SYMBOL_GPL(get_user_pages_fast); 343EXPORT_SYMBOL_GPL(get_user_pages_fast);
343 344
345unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
346 unsigned long len, unsigned long prot,
347 unsigned long flag, unsigned long pgoff)
348{
349 unsigned long ret;
350 struct mm_struct *mm = current->mm;
351
352 ret = security_mmap_file(file, prot, flag);
353 if (!ret) {
354 down_write(&mm->mmap_sem);
355 ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
356 up_write(&mm->mmap_sem);
357 }
358 return ret;
359}
360
361unsigned long vm_mmap(struct file *file, unsigned long addr,
362 unsigned long len, unsigned long prot,
363 unsigned long flag, unsigned long offset)
364{
365 if (unlikely(offset + PAGE_ALIGN(len) < offset))
366 return -EINVAL;
367 if (unlikely(offset & ~PAGE_MASK))
368 return -EINVAL;
369
370 return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
371}
372EXPORT_SYMBOL(vm_mmap);
373
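
vm_mmap() and vm_mmap_pgoff() bundle the security_mmap_file() check and the mmap_sem handling so callers outside mm/ no longer open-code down_write()/do_mmap()/up_write(). A hedged caller sketch (hypothetical driver code):

static unsigned long map_file_read_only(struct file *filp, unsigned long len)
{
        /* addr hint 0, page offset 0; the security check and mmap_sem
         * locking happen inside vm_mmap(); errors come back as -errno
         * encoded in the returned value (check with IS_ERR_VALUE()) */
        return vm_mmap(filp, 0, len, PROT_READ, MAP_SHARED, 0);
}
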
344/* Tracepoints definitions. */ 374/* Tracepoints definitions. */
345EXPORT_TRACEPOINT_SYMBOL(kmalloc); 375EXPORT_TRACEPOINT_SYMBOL(kmalloc);
346EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); 376EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 94dff883b449..2bb90b1d241c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -413,11 +413,11 @@ nocache:
413 if (addr + size - 1 < addr) 413 if (addr + size - 1 < addr)
414 goto overflow; 414 goto overflow;
415 415
416 n = rb_next(&first->rb_node); 416 if (list_is_last(&first->list, &vmap_area_list))
417 if (n)
418 first = rb_entry(n, struct vmap_area, rb_node);
419 else
420 goto found; 417 goto found;
418
419 first = list_entry(first->list.next,
420 struct vmap_area, list);
421 } 421 }
422 422
423found: 423found:
@@ -904,6 +904,14 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
904 904
905 BUG_ON(size & ~PAGE_MASK); 905 BUG_ON(size & ~PAGE_MASK);
906 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 906 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
907 if (WARN_ON(size == 0)) {
908 /*
909 * Allocating 0 bytes isn't what caller wants since
910 * get_order(0) returns funny result. Just warn and terminate
911 * early.
912 */
913 return NULL;
914 }
907 order = get_order(size); 915 order = get_order(size);
908 916
909again: 917again:
@@ -1185,9 +1193,10 @@ void __init vmalloc_init(void)
1185 /* Import existing vmlist entries. */ 1193 /* Import existing vmlist entries. */
1186 for (tmp = vmlist; tmp; tmp = tmp->next) { 1194 for (tmp = vmlist; tmp; tmp = tmp->next) {
1187 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT); 1195 va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
1188 va->flags = tmp->flags | VM_VM_AREA; 1196 va->flags = VM_VM_AREA;
1189 va->va_start = (unsigned long)tmp->addr; 1197 va->va_start = (unsigned long)tmp->addr;
1190 va->va_end = va->va_start + tmp->size; 1198 va->va_end = va->va_start + tmp->size;
1199 va->vm = tmp;
1191 __insert_vmap_area(va); 1200 __insert_vmap_area(va);
1192 } 1201 }
1193 1202
@@ -1279,7 +1288,7 @@ DEFINE_RWLOCK(vmlist_lock);
1279struct vm_struct *vmlist; 1288struct vm_struct *vmlist;
1280 1289
1281static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1290static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1282 unsigned long flags, void *caller) 1291 unsigned long flags, const void *caller)
1283{ 1292{
1284 vm->flags = flags; 1293 vm->flags = flags;
1285 vm->addr = (void *)va->va_start; 1294 vm->addr = (void *)va->va_start;
@@ -1305,7 +1314,7 @@ static void insert_vmalloc_vmlist(struct vm_struct *vm)
1305} 1314}
1306 1315
1307static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1316static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1308 unsigned long flags, void *caller) 1317 unsigned long flags, const void *caller)
1309{ 1318{
1310 setup_vmalloc_vm(vm, va, flags, caller); 1319 setup_vmalloc_vm(vm, va, flags, caller);
1311 insert_vmalloc_vmlist(vm); 1320 insert_vmalloc_vmlist(vm);
@@ -1313,7 +1322,7 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1313 1322
1314static struct vm_struct *__get_vm_area_node(unsigned long size, 1323static struct vm_struct *__get_vm_area_node(unsigned long size,
1315 unsigned long align, unsigned long flags, unsigned long start, 1324 unsigned long align, unsigned long flags, unsigned long start,
1316 unsigned long end, int node, gfp_t gfp_mask, void *caller) 1325 unsigned long end, int node, gfp_t gfp_mask, const void *caller)
1317{ 1326{
1318 struct vmap_area *va; 1327 struct vmap_area *va;
1319 struct vm_struct *area; 1328 struct vm_struct *area;
@@ -1374,7 +1383,7 @@ EXPORT_SYMBOL_GPL(__get_vm_area);
1374 1383
1375struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, 1384struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1376 unsigned long start, unsigned long end, 1385 unsigned long start, unsigned long end,
1377 void *caller) 1386 const void *caller)
1378{ 1387{
1379 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, 1388 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
1380 caller); 1389 caller);
@@ -1396,13 +1405,21 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1396} 1405}
1397 1406
1398struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 1407struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1399 void *caller) 1408 const void *caller)
1400{ 1409{
1401 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1410 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1402 -1, GFP_KERNEL, caller); 1411 -1, GFP_KERNEL, caller);
1403} 1412}
1404 1413
1405static struct vm_struct *find_vm_area(const void *addr) 1414/**
1415 * find_vm_area - find a continuous kernel virtual area
1416 * @addr: base address
1417 *
1418 * Search for the kernel VM area starting at @addr, and return it.
1419 * It is up to the caller to do all required locking to keep the returned
1420 * pointer valid.
1421 */
1422struct vm_struct *find_vm_area(const void *addr)
1406{ 1423{
1407 struct vmap_area *va; 1424 struct vmap_area *va;
1408 1425
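
find_vm_area() loses its static and gains the kernel-doc above, so other subsystems can map a vmalloc address back to its vm_struct. A hedged usage sketch (hypothetical caller); keeping the area alive is, per the comment, the caller's job:

static unsigned long vm_area_size_of(const void *addr)
{
        struct vm_struct *area = find_vm_area(addr);

        return area ? area->size : 0;   /* size recorded for the area */
}
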
@@ -1567,9 +1584,9 @@ EXPORT_SYMBOL(vmap);
1567 1584
1568static void *__vmalloc_node(unsigned long size, unsigned long align, 1585static void *__vmalloc_node(unsigned long size, unsigned long align,
1569 gfp_t gfp_mask, pgprot_t prot, 1586 gfp_t gfp_mask, pgprot_t prot,
1570 int node, void *caller); 1587 int node, const void *caller);
1571static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1588static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1572 pgprot_t prot, int node, void *caller) 1589 pgprot_t prot, int node, const void *caller)
1573{ 1590{
1574 const int order = 0; 1591 const int order = 0;
1575 struct page **pages; 1592 struct page **pages;
@@ -1642,7 +1659,7 @@ fail:
1642 */ 1659 */
1643void *__vmalloc_node_range(unsigned long size, unsigned long align, 1660void *__vmalloc_node_range(unsigned long size, unsigned long align,
1644 unsigned long start, unsigned long end, gfp_t gfp_mask, 1661 unsigned long start, unsigned long end, gfp_t gfp_mask,
1645 pgprot_t prot, int node, void *caller) 1662 pgprot_t prot, int node, const void *caller)
1646{ 1663{
1647 struct vm_struct *area; 1664 struct vm_struct *area;
1648 void *addr; 1665 void *addr;
@@ -1698,7 +1715,7 @@ fail:
1698 */ 1715 */
1699static void *__vmalloc_node(unsigned long size, unsigned long align, 1716static void *__vmalloc_node(unsigned long size, unsigned long align,
1700 gfp_t gfp_mask, pgprot_t prot, 1717 gfp_t gfp_mask, pgprot_t prot,
1701 int node, void *caller) 1718 int node, const void *caller)
1702{ 1719{
1703 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 1720 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
1704 gfp_mask, prot, node, caller); 1721 gfp_mask, prot, node, caller);
@@ -1974,9 +1991,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
1974 * IOREMAP area is treated as memory hole and no copy is done. 1991 * IOREMAP area is treated as memory hole and no copy is done.
1975 * 1992 *
1976 * If [addr...addr+count) doesn't include any intersection with a live 1993
1977 * vm_struct area, returns 0. 1994 * vm_struct area, returns 0. @buf should be kernel's buffer.
1978 * @buf should be kernel's buffer. Because this function uses KM_USER0,
1979 * the caller should guarantee KM_USER0 is not used.
1980 * 1995 *
1981 * Note: In usual ops, vread() is never necessary because the caller 1996 * Note: In usual ops, vread() is never necessary because the caller
1982 * should know vmalloc() area is valid and can use memcpy(). 1997 * should know vmalloc() area is valid and can use memcpy().
@@ -2050,9 +2065,7 @@ finished:
2050 * IOREMAP area is treated as memory hole and no copy is done. 2065 * IOREMAP area is treated as memory hole and no copy is done.
2051 * 2066 *
2052 * If [addr...addr+count) doesn't include any intersection with a live 2067
2053 * vm_struct area, returns 0. 2068 * vm_struct area, returns 0. @buf should be kernel's buffer.
2054 * @buf should be kernel's buffer. Because this function uses KM_USER0,
2055 * the caller should guarantee KM_USER0 is not used.
2056 * 2069 *
2057 * Note: In usual ops, vwrite() is never necessary because the caller 2070 * Note: In usual ops, vwrite() is never necessary because the caller
2058 * should know vmalloc() area is valid and can use memcpy(). 2071 * should know vmalloc() area is valid and can use memcpy().
@@ -2375,8 +2388,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2375 return NULL; 2388 return NULL;
2376 } 2389 }
2377 2390
2378 vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); 2391 vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
2379 vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); 2392 vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
2380 if (!vas || !vms) 2393 if (!vas || !vms)
2381 goto err_free2; 2394 goto err_free2;
2382 2395
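The last vmalloc.c hunk above swaps two open-coded kzalloc(sizeof(x[0]) * nr_vms) calls for kcalloc(), which zero-fills and refuses the allocation when the count-times-size multiplication would overflow, instead of silently wrapping to an undersized buffer. A rough userspace illustration of the same pattern, with calloc() standing in for kcalloc(); the struct and the count below are invented for the example, not taken from the kernel:

#include <stdio.h>
#include <stdlib.h>

struct vm_area_stub {			/* illustrative stand-in, not vm_struct */
	unsigned long addr;
	unsigned long size;
};

int main(void)
{
	size_t nr_vms = 4;		/* arbitrary example count */

	/*
	 * calloc(nr, size), like kcalloc(), zero-fills and must fail rather
	 * than wrap when nr * size overflows, whereas the old
	 * kzalloc(sizeof(x[0]) * nr) form multiplies before the allocator
	 * ever sees the values.
	 */
	struct vm_area_stub *vms = calloc(nr_vms, sizeof(vms[0]));
	if (!vms)
		return 1;

	for (size_t i = 0; i < nr_vms; i++)	/* zero-filled, so size == 0 */
		printf("vms[%zu].size = %lu\n", i, vms[i].size);

	free(vms);
	return 0;
}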
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 33c332bbab73..8d01243d9560 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -53,24 +53,6 @@
53#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
54#include <trace/events/vmscan.h> 54#include <trace/events/vmscan.h>
55 55
56/*
57 * reclaim_mode determines how the inactive list is shrunk
58 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
59 * RECLAIM_MODE_ASYNC: Do not block
60 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
61 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
62 * page from the LRU and reclaim all pages within a
63 * naturally aligned range
64 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
65 * order-0 pages and then compact the zone
66 */
67typedef unsigned __bitwise__ reclaim_mode_t;
68#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
69#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
70#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
71#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
72#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
73
74struct scan_control { 56struct scan_control {
75 /* Incremented by the number of inactive pages that were scanned */ 57 /* Incremented by the number of inactive pages that were scanned */
76 unsigned long nr_scanned; 58 unsigned long nr_scanned;
@@ -96,11 +78,8 @@ struct scan_control {
96 78
97 int order; 79 int order;
98 80
99 /* 81 /* Scan (total_size >> priority) pages at once */
100 * Intend to reclaim enough continuous memory rather than reclaim 82 int priority;
101 * enough amount of memory. i.e, mode for high order allocation.
102 */
103 reclaim_mode_t reclaim_mode;
104 83
105 /* 84 /*
106 * The memory cgroup that hit its limit and as a result is the 85 * The memory cgroup that hit its limit and as a result is the
@@ -115,11 +94,6 @@ struct scan_control {
115 nodemask_t *nodemask; 94 nodemask_t *nodemask;
116}; 95};
117 96
118struct mem_cgroup_zone {
119 struct mem_cgroup *mem_cgroup;
120 struct zone *zone;
121};
122
123#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 97#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
124 98
125#ifdef ARCH_HAS_PREFETCH 99#ifdef ARCH_HAS_PREFETCH
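The two hunks above fold the old per-call priority argument into scan_control as sc->priority and drop the mem_cgroup_zone wrapper in favour of lruvecs. The comment on the new field is the heart of the reclaim loop: each pass scans roughly (list size >> priority) pages, with priority counting down from DEF_PRIORITY while the zone stays under pressure. A standalone sketch of that batching, assuming DEF_PRIORITY is 12 as in this kernel and using an invented list size:

#include <stdio.h>

#define DEF_PRIORITY 12			/* matches include/linux/mmzone.h */

int main(void)
{
	unsigned long lru_size = 1UL << 20;	/* pretend 1M pages on one LRU */

	/* Higher priority value => gentler scan; 0 => scan the whole list. */
	for (int priority = DEF_PRIORITY; priority >= 0; priority--) {
		unsigned long scan = lru_size >> priority;
		printf("priority %2d: scan %lu pages this pass\n",
		       priority, scan);
	}
	return 0;
}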
@@ -159,49 +133,26 @@ long vm_total_pages; /* The total number of pages which the VM controls */
159static LIST_HEAD(shrinker_list); 133static LIST_HEAD(shrinker_list);
160static DECLARE_RWSEM(shrinker_rwsem); 134static DECLARE_RWSEM(shrinker_rwsem);
161 135
162#ifdef CONFIG_CGROUP_MEM_RES_CTLR 136#ifdef CONFIG_MEMCG
163static bool global_reclaim(struct scan_control *sc) 137static bool global_reclaim(struct scan_control *sc)
164{ 138{
165 return !sc->target_mem_cgroup; 139 return !sc->target_mem_cgroup;
166} 140}
167
168static bool scanning_global_lru(struct mem_cgroup_zone *mz)
169{
170 return !mz->mem_cgroup;
171}
172#else 141#else
173static bool global_reclaim(struct scan_control *sc) 142static bool global_reclaim(struct scan_control *sc)
174{ 143{
175 return true; 144 return true;
176} 145}
177
178static bool scanning_global_lru(struct mem_cgroup_zone *mz)
179{
180 return true;
181}
182#endif 146#endif
183 147
184static struct zone_reclaim_stat *get_reclaim_stat(struct mem_cgroup_zone *mz) 148static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
185{ 149{
186 if (!scanning_global_lru(mz)) 150 if (!mem_cgroup_disabled())
187 return mem_cgroup_get_reclaim_stat(mz->mem_cgroup, mz->zone); 151 return mem_cgroup_get_lru_size(lruvec, lru);
188 152
189 return &mz->zone->reclaim_stat; 153 return zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru);
190} 154}
191 155
192static unsigned long zone_nr_lru_pages(struct mem_cgroup_zone *mz,
193 enum lru_list lru)
194{
195 if (!scanning_global_lru(mz))
196 return mem_cgroup_zone_nr_lru_pages(mz->mem_cgroup,
197 zone_to_nid(mz->zone),
198 zone_idx(mz->zone),
199 BIT(lru));
200
201 return zone_page_state(mz->zone, NR_LRU_BASE + lru);
202}
203
204
205/* 156/*
206 * Add a shrinker callback to be called from the vm 157 * Add a shrinker callback to be called from the vm
207 */ 158 */
@@ -364,39 +315,6 @@ out:
364 return ret; 315 return ret;
365} 316}
366 317
367static void set_reclaim_mode(int priority, struct scan_control *sc,
368 bool sync)
369{
370 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
371
372 /*
373 * Initially assume we are entering either lumpy reclaim or
374 * reclaim/compaction. Depending on the order, we will either set the
375 * sync mode or just reclaim order-0 pages later.
376 */
377 if (COMPACTION_BUILD)
378 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
379 else
380 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
381
382 /*
383 * Avoid using lumpy reclaim or reclaim/compaction if possible by
384 * restricting when its set to either costly allocations or when
385 * under memory pressure
386 */
387 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
388 sc->reclaim_mode |= syncmode;
389 else if (sc->order && priority < DEF_PRIORITY - 2)
390 sc->reclaim_mode |= syncmode;
391 else
392 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
393}
394
395static void reset_reclaim_mode(struct scan_control *sc)
396{
397 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
398}
399
400static inline int is_page_cache_freeable(struct page *page) 318static inline int is_page_cache_freeable(struct page *page)
401{ 319{
402 /* 320 /*
@@ -416,10 +334,6 @@ static int may_write_to_queue(struct backing_dev_info *bdi,
416 return 1; 334 return 1;
417 if (bdi == current->backing_dev_info) 335 if (bdi == current->backing_dev_info)
418 return 1; 336 return 1;
419
420 /* lumpy reclaim for hugepage often need a lot of write */
421 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
422 return 1;
423 return 0; 337 return 0;
424} 338}
425 339
@@ -523,8 +437,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
523 /* synchronous write or broken a_ops? */ 437 /* synchronous write or broken a_ops? */
524 ClearPageReclaim(page); 438 ClearPageReclaim(page);
525 } 439 }
526 trace_mm_vmscan_writepage(page, 440 trace_mm_vmscan_writepage(page, trace_reclaim_flags(page));
527 trace_reclaim_flags(page, sc->reclaim_mode));
528 inc_zone_page_state(page, NR_VMSCAN_WRITE); 441 inc_zone_page_state(page, NR_VMSCAN_WRITE);
529 return PAGE_SUCCESS; 442 return PAGE_SUCCESS;
530 } 443 }
@@ -701,19 +614,15 @@ enum page_references {
701}; 614};
702 615
703static enum page_references page_check_references(struct page *page, 616static enum page_references page_check_references(struct page *page,
704 struct mem_cgroup_zone *mz,
705 struct scan_control *sc) 617 struct scan_control *sc)
706{ 618{
707 int referenced_ptes, referenced_page; 619 int referenced_ptes, referenced_page;
708 unsigned long vm_flags; 620 unsigned long vm_flags;
709 621
710 referenced_ptes = page_referenced(page, 1, mz->mem_cgroup, &vm_flags); 622 referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
623 &vm_flags);
711 referenced_page = TestClearPageReferenced(page); 624 referenced_page = TestClearPageReferenced(page);
712 625
713 /* Lumpy reclaim - ignore references */
714 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
715 return PAGEREF_RECLAIM;
716
717 /* 626 /*
718 * Mlock lost the isolation race with us. Let try_to_unmap() 627 * Mlock lost the isolation race with us. Let try_to_unmap()
719 * move the page to the unevictable list. 628 * move the page to the unevictable list.
@@ -722,7 +631,7 @@ static enum page_references page_check_references(struct page *page,
722 return PAGEREF_RECLAIM; 631 return PAGEREF_RECLAIM;
723 632
724 if (referenced_ptes) { 633 if (referenced_ptes) {
725 if (PageAnon(page)) 634 if (PageSwapBacked(page))
726 return PAGEREF_ACTIVATE; 635 return PAGEREF_ACTIVATE;
727 /* 636 /*
728 * All mapped pages start out with page table 637 * All mapped pages start out with page table
@@ -763,9 +672,8 @@ static enum page_references page_check_references(struct page *page,
763 * shrink_page_list() returns the number of reclaimed pages 672 * shrink_page_list() returns the number of reclaimed pages
764 */ 673 */
765static unsigned long shrink_page_list(struct list_head *page_list, 674static unsigned long shrink_page_list(struct list_head *page_list,
766 struct mem_cgroup_zone *mz, 675 struct zone *zone,
767 struct scan_control *sc, 676 struct scan_control *sc,
768 int priority,
769 unsigned long *ret_nr_dirty, 677 unsigned long *ret_nr_dirty,
770 unsigned long *ret_nr_writeback) 678 unsigned long *ret_nr_writeback)
771{ 679{
@@ -779,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
779 687
780 cond_resched(); 688 cond_resched();
781 689
690 mem_cgroup_uncharge_start();
782 while (!list_empty(page_list)) { 691 while (!list_empty(page_list)) {
783 enum page_references references; 692 enum page_references references;
784 struct address_space *mapping; 693 struct address_space *mapping;
@@ -794,7 +703,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
794 goto keep; 703 goto keep;
795 704
796 VM_BUG_ON(PageActive(page)); 705 VM_BUG_ON(PageActive(page));
797 VM_BUG_ON(page_zone(page) != mz->zone); 706 VM_BUG_ON(page_zone(page) != zone);
798 707
799 sc->nr_scanned++; 708 sc->nr_scanned++;
800 709
@@ -812,23 +721,44 @@ static unsigned long shrink_page_list(struct list_head *page_list,
812 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 721 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
813 722
814 if (PageWriteback(page)) { 723 if (PageWriteback(page)) {
815 nr_writeback++;
816 /* 724 /*
817 * Synchronous reclaim cannot queue pages for 725 * memcg doesn't have any dirty pages throttling so we
818 * writeback due to the possibility of stack overflow 726 * could easily OOM just because too many pages are in
819 * but if it encounters a page under writeback, wait 727 * writeback and there is nothing else to reclaim.
820 * for the IO to complete. 728 *
729 * Check __GFP_IO, certainly because a loop driver
730 * thread might enter reclaim, and deadlock if it waits
731 * on a page for which it is needed to do the write
732 * (loop masks off __GFP_IO|__GFP_FS for this reason);
733 * but more thought would probably show more reasons.
734 *
735 * Don't require __GFP_FS, since we're not going into
736 * the FS, just waiting on its writeback completion.
737 * Worryingly, ext4 gfs2 and xfs allocate pages with
738 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
739 * testing may_enter_fs here is liable to OOM on them.
821 */ 740 */
822 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && 741 if (global_reclaim(sc) ||
823 may_enter_fs) 742 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
824 wait_on_page_writeback(page); 743 /*
825 else { 744 * This is slightly racy - end_page_writeback()
826 unlock_page(page); 745 * might have just cleared PageReclaim, then
827 goto keep_lumpy; 746 * setting PageReclaim here end up interpreted
747 * as PageReadahead - but that does not matter
748 * enough to care. What we do want is for this
749 * page to have PageReclaim set next time memcg
750 * reclaim reaches the tests above, so it will
751 * then wait_on_page_writeback() to avoid OOM;
752 * and it's also appropriate in global reclaim.
753 */
754 SetPageReclaim(page);
755 nr_writeback++;
756 goto keep_locked;
828 } 757 }
758 wait_on_page_writeback(page);
829 } 759 }
830 760
831 references = page_check_references(page, mz, sc); 761 references = page_check_references(page, sc);
832 switch (references) { 762 switch (references) {
833 case PAGEREF_ACTIVATE: 763 case PAGEREF_ACTIVATE:
834 goto activate_locked; 764 goto activate_locked;
@@ -879,7 +809,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
879 * unless under significant pressure. 809 * unless under significant pressure.
880 */ 810 */
881 if (page_is_file_cache(page) && 811 if (page_is_file_cache(page) &&
882 (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { 812 (!current_is_kswapd() ||
813 sc->priority >= DEF_PRIORITY - 2)) {
883 /* 814 /*
884 * Immediately reclaim when written back. 815 * Immediately reclaim when written back.
885 * Similar in principle to deactivate_page() 816
@@ -908,7 +839,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
908 goto activate_locked; 839 goto activate_locked;
909 case PAGE_SUCCESS: 840 case PAGE_SUCCESS:
910 if (PageWriteback(page)) 841 if (PageWriteback(page))
911 goto keep_lumpy; 842 goto keep;
912 if (PageDirty(page)) 843 if (PageDirty(page))
913 goto keep; 844 goto keep;
914 845
@@ -994,7 +925,6 @@ cull_mlocked:
994 try_to_free_swap(page); 925 try_to_free_swap(page);
995 unlock_page(page); 926 unlock_page(page);
996 putback_lru_page(page); 927 putback_lru_page(page);
997 reset_reclaim_mode(sc);
998 continue; 928 continue;
999 929
1000activate_locked: 930activate_locked:
@@ -1007,8 +937,6 @@ activate_locked:
1007keep_locked: 937keep_locked:
1008 unlock_page(page); 938 unlock_page(page);
1009keep: 939keep:
1010 reset_reclaim_mode(sc);
1011keep_lumpy:
1012 list_add(&page->lru, &ret_pages); 940 list_add(&page->lru, &ret_pages);
1013 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 941 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
1014 } 942 }
@@ -1020,12 +948,13 @@ keep_lumpy:
1020 * will encounter the same problem 948 * will encounter the same problem
1021 */ 949 */
1022 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) 950 if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
1023 zone_set_flag(mz->zone, ZONE_CONGESTED); 951 zone_set_flag(zone, ZONE_CONGESTED);
1024 952
1025 free_hot_cold_page_list(&free_pages, 1); 953 free_hot_cold_page_list(&free_pages, 1);
1026 954
1027 list_splice(&ret_pages, page_list); 955 list_splice(&ret_pages, page_list);
1028 count_vm_events(PGACTIVATE, pgactivate); 956 count_vm_events(PGACTIVATE, pgactivate);
957 mem_cgroup_uncharge_end();
1029 *ret_nr_dirty += nr_dirty; 958 *ret_nr_dirty += nr_dirty;
1030 *ret_nr_writeback += nr_writeback; 959 *ret_nr_writeback += nr_writeback;
1031 return nr_reclaimed; 960 return nr_reclaimed;
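The rewritten PageWriteback branch earlier in this hunk chooses between deferring and stalling: global reclaim, and any pass that finds the page without PageReclaim set or without __GFP_IO, tags the page with PageReclaim and skips it, while memcg reclaim that meets the same tagged page again with __GFP_IO allowed sleeps in wait_on_page_writeback(), which is what keeps a writeback-flooded memcg from declaring OOM. A tiny userspace model of just that decision; the type and function names below are mine, only the boolean logic mirrors the patch:

#include <stdbool.h>
#include <stdio.h>

struct fake_page {			/* illustrative, not struct page */
	bool writeback;
	bool reclaim;			/* models PageReclaim */
};

/* Returns true when the caller would sleep in wait_on_page_writeback(). */
static bool should_wait_on_writeback(struct fake_page *page,
				     bool global_reclaim, bool gfp_io)
{
	if (!page->writeback)
		return false;
	if (global_reclaim || !page->reclaim || !gfp_io) {
		page->reclaim = true;	/* remember it for the next pass */
		return false;		/* keep_locked: defer, don't stall */
	}
	return true;			/* memcg revisit: wait for the I/O */
}

int main(void)
{
	struct fake_page page = { .writeback = true, .reclaim = false };

	/* First memcg pass: tag and skip.  Second pass over the same page: wait. */
	printf("first pass waits:  %d\n",
	       should_wait_on_writeback(&page, false, true));
	printf("second pass waits: %d\n",
	       should_wait_on_writeback(&page, false, true));
	return 0;
}

Run as-is it prints 0 then 1: the first encounter only marks the page, the revisit blocks until writeback completes.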
@@ -1041,34 +970,15 @@ keep_lumpy:
1041 * 970 *
1042 * returns 0 on success, -ve errno on failure. 971 * returns 0 on success, -ve errno on failure.
1043 */ 972 */
1044int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) 973int __isolate_lru_page(struct page *page, isolate_mode_t mode)
1045{ 974{
1046 bool all_lru_mode;
1047 int ret = -EINVAL; 975 int ret = -EINVAL;
1048 976
1049 /* Only take pages on the LRU. */ 977 /* Only take pages on the LRU. */
1050 if (!PageLRU(page)) 978 if (!PageLRU(page))
1051 return ret; 979 return ret;
1052 980
1053 all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) == 981 /* Do not give back unevictable pages for compaction */
1054 (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
1055
1056 /*
1057 * When checking the active state, we need to be sure we are
1058 * dealing with comparable boolean values. Take the logical not
1059 * of each.
1060 */
1061 if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
1062 return ret;
1063
1064 if (!all_lru_mode && !!page_is_file_cache(page) != file)
1065 return ret;
1066
1067 /*
1068 * When this function is being called for lumpy reclaim, we
1069 * initially look into all LRU pages, active, inactive and
1070 * unevictable; only give shrink_page_list evictable pages.
1071 */
1072 if (PageUnevictable(page)) 982 if (PageUnevictable(page))
1073 return ret; 983 return ret;
1074 984
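With lumpy reclaim gone, the part of __isolate_lru_page() shown above shrinks to a plain filter: pages that are not on an LRU, or that are unevictable (compaction must not be handed those), are refused before the ISOLATE_* mode-flag checks later in the function. A boiled-down model of that filter; fake_page and its fields are illustrative, not kernel types:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_page {			/* illustrative, not struct page */
	bool on_lru;
	bool unevictable;
};

/* 0 = may be isolated, -EINVAL = leave the page where it is. */
static int isolate_filter(const struct fake_page *page)
{
	if (!page->on_lru)
		return -EINVAL;
	if (page->unevictable)		/* never hand these to reclaim */
		return -EINVAL;
	return 0;
}

int main(void)
{
	struct fake_page evictable = { .on_lru = true, .unevictable = false };
	struct fake_page mlocked   = { .on_lru = true, .unevictable = true };

	printf("%d %d\n", isolate_filter(&evictable), isolate_filter(&mlocked));
	return 0;
}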
@@ -1135,54 +1045,39 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1135 * Appropriate locks must be held before calling this function. 1045 * Appropriate locks must be held before calling this function.
1136 * 1046 *
1137 * @nr_to_scan: The number of pages to look through on the list. 1047 * @nr_to_scan: The number of pages to look through on the list.
1138 * @mz: The mem_cgroup_zone to pull pages from. 1048 * @lruvec: The LRU vector to pull pages from.
1139 * @dst: The temp list to put pages on to. 1049 * @dst: The temp list to put pages on to.
1140 * @nr_scanned: The number of pages that were scanned. 1050 * @nr_scanned: The number of pages that were scanned.
1141 * @sc: The scan_control struct for this reclaim session 1051 * @sc: The scan_control struct for this reclaim session
1142 * @mode: One of the LRU isolation modes 1052 * @mode: One of the LRU isolation modes
1143 * @active: True [1] if isolating active pages 1053 * @lru: LRU list id for isolating
1144 * @file: True [1] if isolating file [!anon] pages
1145 * 1054 *
1146 * returns how many pages were moved onto *@dst. 1055 * returns how many pages were moved onto *@dst.
1147 */ 1056 */
1148static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1057static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1149 struct mem_cgroup_zone *mz, struct list_head *dst, 1058 struct lruvec *lruvec, struct list_head *dst,
1150 unsigned long *nr_scanned, struct scan_control *sc, 1059 unsigned long *nr_scanned, struct scan_control *sc,
1151 isolate_mode_t mode, int active, int file) 1060 isolate_mode_t mode, enum lru_list lru)
1152{ 1061{
1153 struct lruvec *lruvec; 1062 struct list_head *src = &lruvec->lists[lru];
1154 struct list_head *src;
1155 unsigned long nr_taken = 0; 1063 unsigned long nr_taken = 0;
1156 unsigned long nr_lumpy_taken = 0;
1157 unsigned long nr_lumpy_dirty = 0;
1158 unsigned long nr_lumpy_failed = 0;
1159 unsigned long scan; 1064 unsigned long scan;
1160 int lru = LRU_BASE;
1161
1162 lruvec = mem_cgroup_zone_lruvec(mz->zone, mz->mem_cgroup);
1163 if (active)
1164 lru += LRU_ACTIVE;
1165 if (file)
1166 lru += LRU_FILE;
1167 src = &lruvec->lists[lru];
1168 1065
1169 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 1066 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
1170 struct page *page; 1067 struct page *page;
1171 unsigned long pfn; 1068 int nr_pages;
1172 unsigned long end_pfn;
1173 unsigned long page_pfn;
1174 int zone_id;
1175 1069
1176 page = lru_to_page(src); 1070 page = lru_to_page(src);
1177 prefetchw_prev_lru_page(page, src, flags); 1071 prefetchw_prev_lru_page(page, src, flags);
1178 1072
1179 VM_BUG_ON(!PageLRU(page)); 1073 VM_BUG_ON(!PageLRU(page));
1180 1074
1181 switch (__isolate_lru_page(page, mode, file)) { 1075 switch (__isolate_lru_page(page, mode)) {
1182 case 0: 1076 case 0:
1183 mem_cgroup_lru_del(page); 1077 nr_pages = hpage_nr_pages(page);
1078 mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
1184 list_move(&page->lru, dst); 1079 list_move(&page->lru, dst);
1185 nr_taken += hpage_nr_pages(page); 1080 nr_taken += nr_pages;
1186 break; 1081 break;
1187 1082
1188 case -EBUSY: 1083 case -EBUSY:
@@ -1193,93 +1088,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1193 default: 1088 default:
1194 BUG(); 1089 BUG();
1195 } 1090 }
1196
1197 if (!sc->order || !(sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM))
1198 continue;
1199
1200 /*
1201 * Attempt to take all pages in the order aligned region
1202 * surrounding the tag page. Only take those pages of
1203 * the same active state as that tag page. We may safely
1204 * round the target page pfn down to the requested order
1205 * as the mem_map is guaranteed valid out to MAX_ORDER,
1206 * where that page is in a different zone we will detect
1207 * it from its zone id and abort this block scan.
1208 */
1209 zone_id = page_zone_id(page);
1210 page_pfn = page_to_pfn(page);
1211 pfn = page_pfn & ~((1 << sc->order) - 1);
1212 end_pfn = pfn + (1 << sc->order);
1213 for (; pfn < end_pfn; pfn++) {
1214 struct page *cursor_page;
1215
1216 /* The target page is in the block, ignore it. */
1217 if (unlikely(pfn == page_pfn))
1218 continue;
1219
1220 /* Avoid holes within the zone. */
1221 if (unlikely(!pfn_valid_within(pfn)))
1222 break;
1223
1224 cursor_page = pfn_to_page(pfn);
1225
1226 /* Check that we have not crossed a zone boundary. */
1227 if (unlikely(page_zone_id(cursor_page) != zone_id))
1228 break;
1229
1230 /*
1231 * If we don't have enough swap space, reclaiming of
1232 * anon page which don't already have a swap slot is
1233 * pointless.
1234 */
1235 if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) &&
1236 !PageSwapCache(cursor_page))
1237 break;
1238
1239 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1240 unsigned int isolated_pages;
1241
1242 mem_cgroup_lru_del(cursor_page);
1243 list_move(&cursor_page->lru, dst);
1244 isolated_pages = hpage_nr_pages(cursor_page);
1245 nr_taken += isolated_pages;
1246 nr_lumpy_taken += isolated_pages;
1247 if (PageDirty(cursor_page))
1248 nr_lumpy_dirty += isolated_pages;
1249 scan++;
1250 pfn += isolated_pages - 1;
1251 } else {
1252 /*
1253 * Check if the page is freed already.
1254 *
1255 * We can't use page_count() as that
1256 * requires compound_head and we don't
1257 * have a pin on the page here. If a
1258 * page is tail, we may or may not
1259 * have isolated the head, so assume
1260 * it's not free, it'd be tricky to
1261 * track the head status without a
1262 * page pin.
1263 */
1264 if (!PageTail(cursor_page) &&
1265 !atomic_read(&cursor_page->_count))
1266 continue;
1267 break;
1268 }
1269 }
1270
1271 /* If we break out of the loop above, lumpy reclaim failed */
1272 if (pfn < end_pfn)
1273 nr_lumpy_failed++;
1274 } 1091 }
1275 1092
1276 *nr_scanned = scan; 1093 *nr_scanned = scan;
1277 1094 trace_mm_vmscan_lru_isolate(sc->order, nr_to_scan, scan,
1278 trace_mm_vmscan_lru_isolate(sc->order, 1095 nr_taken, mode, is_file_lru(lru));
1279 nr_to_scan, scan,
1280 nr_taken,
1281 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1282 mode, file);
1283 return nr_taken; 1096 return nr_taken;
1284} 1097}
1285 1098
@@ -1316,15 +1129,16 @@ int isolate_lru_page(struct page *page)
1316 1129
1317 if (PageLRU(page)) { 1130 if (PageLRU(page)) {
1318 struct zone *zone = page_zone(page); 1131 struct zone *zone = page_zone(page);
1132 struct lruvec *lruvec;
1319 1133
1320 spin_lock_irq(&zone->lru_lock); 1134 spin_lock_irq(&zone->lru_lock);
1135 lruvec = mem_cgroup_page_lruvec(page, zone);
1321 if (PageLRU(page)) { 1136 if (PageLRU(page)) {
1322 int lru = page_lru(page); 1137 int lru = page_lru(page);
1323 ret = 0;
1324 get_page(page); 1138 get_page(page);
1325 ClearPageLRU(page); 1139 ClearPageLRU(page);
1326 1140 del_page_from_lru_list(page, lruvec, lru);
1327 del_page_from_lru_list(zone, page, lru); 1141 ret = 0;
1328 } 1142 }
1329 spin_unlock_irq(&zone->lru_lock); 1143 spin_unlock_irq(&zone->lru_lock);
1330 } 1144 }
@@ -1357,11 +1171,10 @@ static int too_many_isolated(struct zone *zone, int file,
1357} 1171}
1358 1172
1359static noinline_for_stack void 1173static noinline_for_stack void
1360putback_inactive_pages(struct mem_cgroup_zone *mz, 1174putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
1361 struct list_head *page_list)
1362{ 1175{
1363 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1176 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1364 struct zone *zone = mz->zone; 1177 struct zone *zone = lruvec_zone(lruvec);
1365 LIST_HEAD(pages_to_free); 1178 LIST_HEAD(pages_to_free);
1366 1179
1367 /* 1180 /*
@@ -1379,9 +1192,13 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
1379 spin_lock_irq(&zone->lru_lock); 1192 spin_lock_irq(&zone->lru_lock);
1380 continue; 1193 continue;
1381 } 1194 }
1195
1196 lruvec = mem_cgroup_page_lruvec(page, zone);
1197
1382 SetPageLRU(page); 1198 SetPageLRU(page);
1383 lru = page_lru(page); 1199 lru = page_lru(page);
1384 add_page_to_lru_list(zone, page, lru); 1200 add_page_to_lru_list(page, lruvec, lru);
1201
1385 if (is_active_lru(lru)) { 1202 if (is_active_lru(lru)) {
1386 int file = is_file_lru(lru); 1203 int file = is_file_lru(lru);
1387 int numpages = hpage_nr_pages(page); 1204 int numpages = hpage_nr_pages(page);
@@ -1390,7 +1207,7 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
1390 if (put_page_testzero(page)) { 1207 if (put_page_testzero(page)) {
1391 __ClearPageLRU(page); 1208 __ClearPageLRU(page);
1392 __ClearPageActive(page); 1209 __ClearPageActive(page);
1393 del_page_from_lru_list(zone, page, lru); 1210 del_page_from_lru_list(page, lruvec, lru);
1394 1211
1395 if (unlikely(PageCompound(page))) { 1212 if (unlikely(PageCompound(page))) {
1396 spin_unlock_irq(&zone->lru_lock); 1213 spin_unlock_irq(&zone->lru_lock);
@@ -1407,112 +1224,24 @@ putback_inactive_pages(struct mem_cgroup_zone *mz,
1407 list_splice(&pages_to_free, page_list); 1224 list_splice(&pages_to_free, page_list);
1408} 1225}
1409 1226
1410static noinline_for_stack void
1411update_isolated_counts(struct mem_cgroup_zone *mz,
1412 struct list_head *page_list,
1413 unsigned long *nr_anon,
1414 unsigned long *nr_file)
1415{
1416 struct zone *zone = mz->zone;
1417 unsigned int count[NR_LRU_LISTS] = { 0, };
1418 unsigned long nr_active = 0;
1419 struct page *page;
1420 int lru;
1421
1422 /*
1423 * Count pages and clear active flags
1424 */
1425 list_for_each_entry(page, page_list, lru) {
1426 int numpages = hpage_nr_pages(page);
1427 lru = page_lru_base_type(page);
1428 if (PageActive(page)) {
1429 lru += LRU_ACTIVE;
1430 ClearPageActive(page);
1431 nr_active += numpages;
1432 }
1433 count[lru] += numpages;
1434 }
1435
1436 preempt_disable();
1437 __count_vm_events(PGDEACTIVATE, nr_active);
1438
1439 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
1440 -count[LRU_ACTIVE_FILE]);
1441 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
1442 -count[LRU_INACTIVE_FILE]);
1443 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1444 -count[LRU_ACTIVE_ANON]);
1445 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1446 -count[LRU_INACTIVE_ANON]);
1447
1448 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1449 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1450
1451 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1452 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1453 preempt_enable();
1454}
1455
1456/*
1457 * Returns true if a direct reclaim should wait on pages under writeback.
1458 *
1459 * If we are direct reclaiming for contiguous pages and we do not reclaim
1460 * everything in the list, try again and wait for writeback IO to complete.
1461 * This will stall high-order allocations noticeably. Only do that when really
1462 * need to free the pages under high memory pressure.
1463 */
1464static inline bool should_reclaim_stall(unsigned long nr_taken,
1465 unsigned long nr_freed,
1466 int priority,
1467 struct scan_control *sc)
1468{
1469 int lumpy_stall_priority;
1470
1471 /* kswapd should not stall on sync IO */
1472 if (current_is_kswapd())
1473 return false;
1474
1475 /* Only stall on lumpy reclaim */
1476 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1477 return false;
1478
1479 /* If we have reclaimed everything on the isolated list, no stall */
1480 if (nr_freed == nr_taken)
1481 return false;
1482
1483 /*
1484 * For high-order allocations, there are two stall thresholds.
1485 * High-cost allocations stall immediately where as lower
1486 * order allocations such as stacks require the scanning
1487 * priority to be much higher before stalling.
1488 */
1489 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1490 lumpy_stall_priority = DEF_PRIORITY;
1491 else
1492 lumpy_stall_priority = DEF_PRIORITY / 3;
1493
1494 return priority <= lumpy_stall_priority;
1495}
1496
1497/* 1227/*
1498 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1228 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1499 * of reclaimed pages 1229 * of reclaimed pages
1500 */ 1230 */
1501static noinline_for_stack unsigned long 1231static noinline_for_stack unsigned long
1502shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz, 1232shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1503 struct scan_control *sc, int priority, int file) 1233 struct scan_control *sc, enum lru_list lru)
1504{ 1234{
1505 LIST_HEAD(page_list); 1235 LIST_HEAD(page_list);
1506 unsigned long nr_scanned; 1236 unsigned long nr_scanned;
1507 unsigned long nr_reclaimed = 0; 1237 unsigned long nr_reclaimed = 0;
1508 unsigned long nr_taken; 1238 unsigned long nr_taken;
1509 unsigned long nr_anon;
1510 unsigned long nr_file;
1511 unsigned long nr_dirty = 0; 1239 unsigned long nr_dirty = 0;
1512 unsigned long nr_writeback = 0; 1240 unsigned long nr_writeback = 0;
1513 isolate_mode_t isolate_mode = ISOLATE_INACTIVE; 1241 isolate_mode_t isolate_mode = 0;
1514 struct zone *zone = mz->zone; 1242 int file = is_file_lru(lru);
1515 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1243 struct zone *zone = lruvec_zone(lruvec);
1244 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1516 1245
1517 while (unlikely(too_many_isolated(zone, file, sc))) { 1246 while (unlikely(too_many_isolated(zone, file, sc))) {
1518 congestion_wait(BLK_RW_ASYNC, HZ/10); 1247 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1522,10 +1251,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1522 return SWAP_CLUSTER_MAX; 1251 return SWAP_CLUSTER_MAX;
1523 } 1252 }
1524 1253
1525 set_reclaim_mode(priority, sc, false);
1526 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1527 isolate_mode |= ISOLATE_ACTIVE;
1528
1529 lru_add_drain(); 1254 lru_add_drain();
1530 1255
1531 if (!sc->may_unmap) 1256 if (!sc->may_unmap)
@@ -1535,47 +1260,43 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1535 1260
1536 spin_lock_irq(&zone->lru_lock); 1261 spin_lock_irq(&zone->lru_lock);
1537 1262
1538 nr_taken = isolate_lru_pages(nr_to_scan, mz, &page_list, &nr_scanned, 1263 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1539 sc, isolate_mode, 0, file); 1264 &nr_scanned, sc, isolate_mode, lru);
1265
1266 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1267 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1268
1540 if (global_reclaim(sc)) { 1269 if (global_reclaim(sc)) {
1541 zone->pages_scanned += nr_scanned; 1270 zone->pages_scanned += nr_scanned;
1542 if (current_is_kswapd()) 1271 if (current_is_kswapd())
1543 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1272 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
1544 nr_scanned);
1545 else 1273 else
1546 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1274 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
1547 nr_scanned);
1548 } 1275 }
1549 spin_unlock_irq(&zone->lru_lock); 1276 spin_unlock_irq(&zone->lru_lock);
1550 1277
1551 if (nr_taken == 0) 1278 if (nr_taken == 0)
1552 return 0; 1279 return 0;
1553 1280
1554 update_isolated_counts(mz, &page_list, &nr_anon, &nr_file); 1281 nr_reclaimed = shrink_page_list(&page_list, zone, sc,
1555
1556 nr_reclaimed = shrink_page_list(&page_list, mz, sc, priority,
1557 &nr_dirty, &nr_writeback); 1282 &nr_dirty, &nr_writeback);
1558 1283
1559 /* Check if we should synchronously wait for writeback */
1560 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1561 set_reclaim_mode(priority, sc, true);
1562 nr_reclaimed += shrink_page_list(&page_list, mz, sc,
1563 priority, &nr_dirty, &nr_writeback);
1564 }
1565
1566 spin_lock_irq(&zone->lru_lock); 1284 spin_lock_irq(&zone->lru_lock);
1567 1285
1568 reclaim_stat->recent_scanned[0] += nr_anon; 1286 reclaim_stat->recent_scanned[file] += nr_taken;
1569 reclaim_stat->recent_scanned[1] += nr_file;
1570 1287
1571 if (current_is_kswapd()) 1288 if (global_reclaim(sc)) {
1572 __count_vm_events(KSWAPD_STEAL, nr_reclaimed); 1289 if (current_is_kswapd())
1573 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); 1290 __count_zone_vm_events(PGSTEAL_KSWAPD, zone,
1291 nr_reclaimed);
1292 else
1293 __count_zone_vm_events(PGSTEAL_DIRECT, zone,
1294 nr_reclaimed);
1295 }
1574 1296
1575 putback_inactive_pages(mz, &page_list); 1297 putback_inactive_pages(lruvec, &page_list);
1576 1298
1577 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); 1299 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1578 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1579 1300
1580 spin_unlock_irq(&zone->lru_lock); 1301 spin_unlock_irq(&zone->lru_lock);
1581 1302
@@ -1604,14 +1325,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1604 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any 1325 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
1605 * isolated page is PageWriteback 1326 * isolated page is PageWriteback
1606 */ 1327 */
1607 if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority))) 1328 if (nr_writeback && nr_writeback >=
1329 (nr_taken >> (DEF_PRIORITY - sc->priority)))
1608 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); 1330 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1609 1331
1610 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1332 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1611 zone_idx(zone), 1333 zone_idx(zone),
1612 nr_scanned, nr_reclaimed, 1334 nr_scanned, nr_reclaimed,
1613 priority, 1335 sc->priority,
1614 trace_shrink_flags(file, sc->reclaim_mode)); 1336 trace_shrink_flags(file));
1615 return nr_reclaimed; 1337 return nr_reclaimed;
1616} 1338}
1617 1339
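The end of shrink_inactive_list() above now derives its writeback throttle from sc->priority: once nr_writeback reaches nr_taken >> (DEF_PRIORITY - priority), the zone is treated as flooding and the task backs off in wait_iff_congested(). A worked example of how that threshold tightens as priority drops, assuming DEF_PRIORITY of 12 and a SWAP_CLUSTER_MAX-sized (32 page) isolation batch as in the comment block that precedes the check:

#include <stdio.h>

#define DEF_PRIORITY		12
#define SWAP_CLUSTER_MAX	32UL	/* typical isolation batch size */

int main(void)
{
	unsigned long nr_taken = SWAP_CLUSTER_MAX;

	for (int priority = DEF_PRIORITY; priority >= DEF_PRIORITY - 6; priority--) {
		/* threshold in: nr_writeback >= nr_taken >> (DEF_PRIORITY - priority) */
		unsigned long threshold = nr_taken >> (DEF_PRIORITY - priority);

		/* By DEF_PRIORITY-6 the threshold reaches 0: any page under
		 * writeback throttles, since the real check also requires
		 * nr_writeback to be non-zero. */
		printf("priority %2d: throttle once %lu of %lu isolated pages are under writeback\n",
		       priority, threshold, nr_taken);
	}
	return 0;
}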
@@ -1633,30 +1355,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct mem_cgroup_zone *mz,
1633 * But we had to alter page->flags anyway. 1355 * But we had to alter page->flags anyway.
1634 */ 1356 */
1635 1357
1636static void move_active_pages_to_lru(struct zone *zone, 1358static void move_active_pages_to_lru(struct lruvec *lruvec,
1637 struct list_head *list, 1359 struct list_head *list,
1638 struct list_head *pages_to_free, 1360 struct list_head *pages_to_free,
1639 enum lru_list lru) 1361 enum lru_list lru)
1640{ 1362{
1363 struct zone *zone = lruvec_zone(lruvec);
1641 unsigned long pgmoved = 0; 1364 unsigned long pgmoved = 0;
1642 struct page *page; 1365 struct page *page;
1366 int nr_pages;
1643 1367
1644 while (!list_empty(list)) { 1368 while (!list_empty(list)) {
1645 struct lruvec *lruvec;
1646
1647 page = lru_to_page(list); 1369 page = lru_to_page(list);
1370 lruvec = mem_cgroup_page_lruvec(page, zone);
1648 1371
1649 VM_BUG_ON(PageLRU(page)); 1372 VM_BUG_ON(PageLRU(page));
1650 SetPageLRU(page); 1373 SetPageLRU(page);
1651 1374
1652 lruvec = mem_cgroup_lru_add_list(zone, page, lru); 1375 nr_pages = hpage_nr_pages(page);
1376 mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
1653 list_move(&page->lru, &lruvec->lists[lru]); 1377 list_move(&page->lru, &lruvec->lists[lru]);
1654 pgmoved += hpage_nr_pages(page); 1378 pgmoved += nr_pages;
1655 1379
1656 if (put_page_testzero(page)) { 1380 if (put_page_testzero(page)) {
1657 __ClearPageLRU(page); 1381 __ClearPageLRU(page);
1658 __ClearPageActive(page); 1382 __ClearPageActive(page);
1659 del_page_from_lru_list(zone, page, lru); 1383 del_page_from_lru_list(page, lruvec, lru);
1660 1384
1661 if (unlikely(PageCompound(page))) { 1385 if (unlikely(PageCompound(page))) {
1662 spin_unlock_irq(&zone->lru_lock); 1386 spin_unlock_irq(&zone->lru_lock);
@@ -1672,9 +1396,9 @@ static void move_active_pages_to_lru(struct zone *zone,
1672} 1396}
1673 1397
1674static void shrink_active_list(unsigned long nr_to_scan, 1398static void shrink_active_list(unsigned long nr_to_scan,
1675 struct mem_cgroup_zone *mz, 1399 struct lruvec *lruvec,
1676 struct scan_control *sc, 1400 struct scan_control *sc,
1677 int priority, int file) 1401 enum lru_list lru)
1678{ 1402{
1679 unsigned long nr_taken; 1403 unsigned long nr_taken;
1680 unsigned long nr_scanned; 1404 unsigned long nr_scanned;
@@ -1683,15 +1407,14 @@ static void shrink_active_list(unsigned long nr_to_scan,
1683 LIST_HEAD(l_active); 1407 LIST_HEAD(l_active);
1684 LIST_HEAD(l_inactive); 1408 LIST_HEAD(l_inactive);
1685 struct page *page; 1409 struct page *page;
1686 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1410 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1687 unsigned long nr_rotated = 0; 1411 unsigned long nr_rotated = 0;
1688 isolate_mode_t isolate_mode = ISOLATE_ACTIVE; 1412 isolate_mode_t isolate_mode = 0;
1689 struct zone *zone = mz->zone; 1413 int file = is_file_lru(lru);
1414 struct zone *zone = lruvec_zone(lruvec);
1690 1415
1691 lru_add_drain(); 1416 lru_add_drain();
1692 1417
1693 reset_reclaim_mode(sc);
1694
1695 if (!sc->may_unmap) 1418 if (!sc->may_unmap)
1696 isolate_mode |= ISOLATE_UNMAPPED; 1419 isolate_mode |= ISOLATE_UNMAPPED;
1697 if (!sc->may_writepage) 1420 if (!sc->may_writepage)
@@ -1699,18 +1422,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
1699 1422
1700 spin_lock_irq(&zone->lru_lock); 1423 spin_lock_irq(&zone->lru_lock);
1701 1424
1702 nr_taken = isolate_lru_pages(nr_to_scan, mz, &l_hold, &nr_scanned, sc, 1425 nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
1703 isolate_mode, 1, file); 1426 &nr_scanned, sc, isolate_mode, lru);
1704 if (global_reclaim(sc)) 1427 if (global_reclaim(sc))
1705 zone->pages_scanned += nr_scanned; 1428 zone->pages_scanned += nr_scanned;
1706 1429
1707 reclaim_stat->recent_scanned[file] += nr_taken; 1430 reclaim_stat->recent_scanned[file] += nr_taken;
1708 1431
1709 __count_zone_vm_events(PGREFILL, zone, nr_scanned); 1432 __count_zone_vm_events(PGREFILL, zone, nr_scanned);
1710 if (file) 1433 __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
1711 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken);
1712 else
1713 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken);
1714 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); 1434 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
1715 spin_unlock_irq(&zone->lru_lock); 1435 spin_unlock_irq(&zone->lru_lock);
1716 1436
@@ -1732,7 +1452,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
1732 } 1452 }
1733 } 1453 }
1734 1454
1735 if (page_referenced(page, 0, mz->mem_cgroup, &vm_flags)) { 1455 if (page_referenced(page, 0, sc->target_mem_cgroup,
1456 &vm_flags)) {
1736 nr_rotated += hpage_nr_pages(page); 1457 nr_rotated += hpage_nr_pages(page);
1737 /* 1458 /*
1738 * Identify referenced, file-backed active pages and 1459 * Identify referenced, file-backed active pages and
@@ -1765,10 +1486,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
1765 */ 1486 */
1766 reclaim_stat->recent_rotated[file] += nr_rotated; 1487 reclaim_stat->recent_rotated[file] += nr_rotated;
1767 1488
1768 move_active_pages_to_lru(zone, &l_active, &l_hold, 1489 move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
1769 LRU_ACTIVE + file * LRU_FILE); 1490 move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
1770 move_active_pages_to_lru(zone, &l_inactive, &l_hold,
1771 LRU_BASE + file * LRU_FILE);
1772 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); 1491 __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
1773 spin_unlock_irq(&zone->lru_lock); 1492 spin_unlock_irq(&zone->lru_lock);
1774 1493
@@ -1791,13 +1510,12 @@ static int inactive_anon_is_low_global(struct zone *zone)
1791 1510
1792/** 1511/**
1793 * inactive_anon_is_low - check if anonymous pages need to be deactivated 1512 * inactive_anon_is_low - check if anonymous pages need to be deactivated
1794 * @zone: zone to check 1513 * @lruvec: LRU vector to check
1795 * @sc: scan control of this context
1796 * 1514 *
1797 * Returns true if the zone does not have enough inactive anon pages, 1515 * Returns true if the zone does not have enough inactive anon pages,
1798 * meaning some active anon pages need to be deactivated. 1516 * meaning some active anon pages need to be deactivated.
1799 */ 1517 */
1800static int inactive_anon_is_low(struct mem_cgroup_zone *mz) 1518static int inactive_anon_is_low(struct lruvec *lruvec)
1801{ 1519{
1802 /* 1520 /*
1803 * If we don't have swap space, anonymous page deactivation 1521 * If we don't have swap space, anonymous page deactivation
@@ -1806,14 +1524,13 @@ static int inactive_anon_is_low(struct mem_cgroup_zone *mz)
1806 if (!total_swap_pages) 1524 if (!total_swap_pages)
1807 return 0; 1525 return 0;
1808 1526
1809 if (!scanning_global_lru(mz)) 1527 if (!mem_cgroup_disabled())
1810 return mem_cgroup_inactive_anon_is_low(mz->mem_cgroup, 1528 return mem_cgroup_inactive_anon_is_low(lruvec);
1811 mz->zone);
1812 1529
1813 return inactive_anon_is_low_global(mz->zone); 1530 return inactive_anon_is_low_global(lruvec_zone(lruvec));
1814} 1531}
1815#else 1532#else
1816static inline int inactive_anon_is_low(struct mem_cgroup_zone *mz) 1533static inline int inactive_anon_is_low(struct lruvec *lruvec)
1817{ 1534{
1818 return 0; 1535 return 0;
1819} 1536}
@@ -1831,7 +1548,7 @@ static int inactive_file_is_low_global(struct zone *zone)
1831 1548
1832/** 1549/**
1833 * inactive_file_is_low - check if file pages need to be deactivated 1550 * inactive_file_is_low - check if file pages need to be deactivated
1834 * @mz: memory cgroup and zone to check 1551 * @lruvec: LRU vector to check
1835 * 1552 *
1836 * When the system is doing streaming IO, memory pressure here 1553 * When the system is doing streaming IO, memory pressure here
1837 * ensures that active file pages get deactivated, until more 1554 * ensures that active file pages get deactivated, until more
@@ -1843,44 +1560,39 @@ static int inactive_file_is_low_global(struct zone *zone)
1843 * This uses a different ratio than the anonymous pages, because 1560 * This uses a different ratio than the anonymous pages, because
1844 * the page cache uses a use-once replacement algorithm. 1561 * the page cache uses a use-once replacement algorithm.
1845 */ 1562 */
1846static int inactive_file_is_low(struct mem_cgroup_zone *mz) 1563static int inactive_file_is_low(struct lruvec *lruvec)
1847{ 1564{
1848 if (!scanning_global_lru(mz)) 1565 if (!mem_cgroup_disabled())
1849 return mem_cgroup_inactive_file_is_low(mz->mem_cgroup, 1566 return mem_cgroup_inactive_file_is_low(lruvec);
1850 mz->zone);
1851 1567
1852 return inactive_file_is_low_global(mz->zone); 1568 return inactive_file_is_low_global(lruvec_zone(lruvec));
1853} 1569}
1854 1570
1855static int inactive_list_is_low(struct mem_cgroup_zone *mz, int file) 1571static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
1856{ 1572{
1857 if (file) 1573 if (is_file_lru(lru))
1858 return inactive_file_is_low(mz); 1574 return inactive_file_is_low(lruvec);
1859 else 1575 else
1860 return inactive_anon_is_low(mz); 1576 return inactive_anon_is_low(lruvec);
1861} 1577}
1862 1578
1863static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1579static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1864 struct mem_cgroup_zone *mz, 1580 struct lruvec *lruvec, struct scan_control *sc)
1865 struct scan_control *sc, int priority)
1866{ 1581{
1867 int file = is_file_lru(lru);
1868
1869 if (is_active_lru(lru)) { 1582 if (is_active_lru(lru)) {
1870 if (inactive_list_is_low(mz, file)) 1583 if (inactive_list_is_low(lruvec, lru))
1871 shrink_active_list(nr_to_scan, mz, sc, priority, file); 1584 shrink_active_list(nr_to_scan, lruvec, sc, lru);
1872 return 0; 1585 return 0;
1873 } 1586 }
1874 1587
1875 return shrink_inactive_list(nr_to_scan, mz, sc, priority, file); 1588 return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
1876} 1589}
1877 1590
1878static int vmscan_swappiness(struct mem_cgroup_zone *mz, 1591static int vmscan_swappiness(struct scan_control *sc)
1879 struct scan_control *sc)
1880{ 1592{
1881 if (global_reclaim(sc)) 1593 if (global_reclaim(sc))
1882 return vm_swappiness; 1594 return vm_swappiness;
1883 return mem_cgroup_swappiness(mz->mem_cgroup); 1595 return mem_cgroup_swappiness(sc->target_mem_cgroup);
1884} 1596}
1885 1597
1886/* 1598/*
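shrink_list(), now taking a lruvec, dispatches per list: an active LRU is only aged via shrink_active_list() when the matching inactive list has become too small, and contributes nothing to the reclaim count; inactive LRUs go straight to shrink_inactive_list(). A skeleton of that dispatch with the real shrinkers and the low-inactive test replaced by stubs (everything below is illustrative, not kernel code):

#include <stdbool.h>
#include <stdio.h>

enum lru_list { LRU_INACTIVE_ANON, LRU_ACTIVE_ANON,
		LRU_INACTIVE_FILE, LRU_ACTIVE_FILE };

static bool is_active_lru(enum lru_list lru)
{
	return lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE;
}

/* Stubs standing in for the real shrinkers and the ratio check. */
static bool inactive_list_is_low(enum lru_list lru) { (void)lru; return true; }
static void shrink_active_list(unsigned long nr)
{
	printf("age %lu active pages\n", nr);
}
static unsigned long shrink_inactive_list(unsigned long nr)
{
	printf("reclaim from %lu inactive pages\n", nr);
	return nr / 2;			/* pretend half were freed */
}

static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan)
{
	if (is_active_lru(lru)) {
		if (inactive_list_is_low(lru))
			shrink_active_list(nr_to_scan);
		return 0;		/* aging an active list frees nothing itself */
	}
	return shrink_inactive_list(nr_to_scan);
}

int main(void)
{
	printf("freed %lu\n", shrink_list(LRU_ACTIVE_FILE, 32));
	printf("freed %lu\n", shrink_list(LRU_INACTIVE_FILE, 32));
	return 0;
}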
@@ -1889,19 +1601,21 @@ static int vmscan_swappiness(struct mem_cgroup_zone *mz,
1889 * by looking at the fraction of the pages scanned we did rotate back 1601 * by looking at the fraction of the pages scanned we did rotate back
1890 * onto the active list instead of evict. 1602 * onto the active list instead of evict.
1891 * 1603 *
1892 * nr[0] = anon pages to scan; nr[1] = file pages to scan 1604 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
1605 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
1893 */ 1606 */
1894static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc, 1607static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1895 unsigned long *nr, int priority) 1608 unsigned long *nr)
1896{ 1609{
1897 unsigned long anon, file, free; 1610 unsigned long anon, file, free;
1898 unsigned long anon_prio, file_prio; 1611 unsigned long anon_prio, file_prio;
1899 unsigned long ap, fp; 1612 unsigned long ap, fp;
1900 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(mz); 1613 struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
1901 u64 fraction[2], denominator; 1614 u64 fraction[2], denominator;
1902 enum lru_list lru; 1615 enum lru_list lru;
1903 int noswap = 0; 1616 int noswap = 0;
1904 bool force_scan = false; 1617 bool force_scan = false;
1618 struct zone *zone = lruvec_zone(lruvec);
1905 1619
1906 /* 1620 /*
1907 * If the zone or memcg is small, nr[l] can be 0. This 1621 * If the zone or memcg is small, nr[l] can be 0. This
@@ -1913,7 +1627,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1913 * latencies, so it's better to scan a minimum amount there as 1627 * latencies, so it's better to scan a minimum amount there as
1914 * well. 1628 * well.
1915 */ 1629 */
1916 if (current_is_kswapd() && mz->zone->all_unreclaimable) 1630 if (current_is_kswapd() && zone->all_unreclaimable)
1917 force_scan = true; 1631 force_scan = true;
1918 if (!global_reclaim(sc)) 1632 if (!global_reclaim(sc))
1919 force_scan = true; 1633 force_scan = true;
@@ -1927,16 +1641,16 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1927 goto out; 1641 goto out;
1928 } 1642 }
1929 1643
1930 anon = zone_nr_lru_pages(mz, LRU_ACTIVE_ANON) + 1644 anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) +
1931 zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); 1645 get_lru_size(lruvec, LRU_INACTIVE_ANON);
1932 file = zone_nr_lru_pages(mz, LRU_ACTIVE_FILE) + 1646 file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
1933 zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); 1647 get_lru_size(lruvec, LRU_INACTIVE_FILE);
1934 1648
1935 if (global_reclaim(sc)) { 1649 if (global_reclaim(sc)) {
1936 free = zone_page_state(mz->zone, NR_FREE_PAGES); 1650 free = zone_page_state(zone, NR_FREE_PAGES);
1937 /* If we have very few page cache pages, 1651 /* If we have very few page cache pages,
1938 force-scan anon pages. */ 1652 force-scan anon pages. */
1939 if (unlikely(file + free <= high_wmark_pages(mz->zone))) { 1653 if (unlikely(file + free <= high_wmark_pages(zone))) {
1940 fraction[0] = 1; 1654 fraction[0] = 1;
1941 fraction[1] = 0; 1655 fraction[1] = 0;
1942 denominator = 1; 1656 denominator = 1;
@@ -1948,8 +1662,8 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1948 * With swappiness at 100, anonymous and file have the same priority. 1662 * With swappiness at 100, anonymous and file have the same priority.
1949 * This scanning priority is essentially the inverse of IO cost. 1663 * This scanning priority is essentially the inverse of IO cost.
1950 */ 1664 */
1951 anon_prio = vmscan_swappiness(mz, sc); 1665 anon_prio = vmscan_swappiness(sc);
1952 file_prio = 200 - vmscan_swappiness(mz, sc); 1666 file_prio = 200 - anon_prio;
1953 1667
1954 /* 1668 /*
1955 * OK, so we have swap space and a fair amount of page cache 1669 * OK, so we have swap space and a fair amount of page cache
@@ -1962,7 +1676,7 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1962 * 1676 *
1963 * anon in [0], file in [1] 1677 * anon in [0], file in [1]
1964 */ 1678 */
1965 spin_lock_irq(&mz->zone->lru_lock); 1679 spin_lock_irq(&zone->lru_lock);
1966 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1680 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1967 reclaim_stat->recent_scanned[0] /= 2; 1681 reclaim_stat->recent_scanned[0] /= 2;
1968 reclaim_stat->recent_rotated[0] /= 2; 1682 reclaim_stat->recent_rotated[0] /= 2;
@@ -1978,12 +1692,12 @@ static void get_scan_count(struct mem_cgroup_zone *mz, struct scan_control *sc,
1978 * proportional to the fraction of recently scanned pages on 1692 * proportional to the fraction of recently scanned pages on
1979 * each list that were recently referenced and in active use. 1693 * each list that were recently referenced and in active use.
1980 */ 1694 */
1981 ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); 1695 ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
1982 ap /= reclaim_stat->recent_rotated[0] + 1; 1696 ap /= reclaim_stat->recent_rotated[0] + 1;
1983 1697
1984 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1698 fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
1985 fp /= reclaim_stat->recent_rotated[1] + 1; 1699 fp /= reclaim_stat->recent_rotated[1] + 1;
1986 spin_unlock_irq(&mz->zone->lru_lock); 1700 spin_unlock_irq(&zone->lru_lock);
1987 1701
1988 fraction[0] = ap; 1702 fraction[0] = ap;
1989 fraction[1] = fp; 1703 fraction[1] = fp;
@@ -1993,9 +1707,9 @@ out:
1993 int file = is_file_lru(lru); 1707 int file = is_file_lru(lru);
1994 unsigned long scan; 1708 unsigned long scan;
1995 1709
1996 scan = zone_nr_lru_pages(mz, lru); 1710 scan = get_lru_size(lruvec, lru);
1997 if (priority || noswap) { 1711 if (sc->priority || noswap || !vmscan_swappiness(sc)) {
1998 scan >>= priority; 1712 scan >>= sc->priority;
1999 if (!scan && force_scan) 1713 if (!scan && force_scan)
2000 scan = SWAP_CLUSTER_MAX; 1714 scan = SWAP_CLUSTER_MAX;
2001 scan = div64_u64(scan * fraction[file], denominator); 1715 scan = div64_u64(scan * fraction[file], denominator);
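get_scan_count() above splits each pass between the anon and file lists: anon_prio is the swappiness, file_prio is 200 minus it, both are weighted by each list's recent_scanned/recent_rotated ratio, and the per-list target becomes (lru_size >> sc->priority) * fraction / denominator, where denominator is ap + fp + 1 just past the lines shown. A self-contained sketch of that arithmetic; the LRU sizes and rotation counters are invented, and swappiness is taken at its usual default of 60:

#include <stdio.h>

int main(void)
{
	unsigned long swappiness = 60;			/* vm_swappiness default */
	unsigned long anon_prio = swappiness;
	unsigned long file_prio = 200 - anon_prio;

	/* Pretend reclaim_stat counters: recent_scanned / recent_rotated. */
	unsigned long anon_scanned = 1000, anon_rotated = 400;
	unsigned long file_scanned = 5000, file_rotated = 250;

	/* ap/fp as computed by get_scan_count() after this patch. */
	unsigned long long ap = (unsigned long long)anon_prio * (anon_scanned + 1)
				/ (anon_rotated + 1);
	unsigned long long fp = (unsigned long long)file_prio * (file_scanned + 1)
				/ (file_rotated + 1);
	unsigned long long denominator = ap + fp + 1;

	unsigned long lru_size[2] = { 200000, 800000 };	/* [0]=anon, [1]=file */
	unsigned long long fraction[2] = { ap, fp };
	int priority = 12;				/* DEF_PRIORITY */

	for (int file = 0; file < 2; file++) {
		unsigned long long scan = lru_size[file] >> priority;

		scan = scan * fraction[file] / denominator;
		printf("%s: scan %llu pages this pass\n",
		       file ? "file" : "anon", scan);
	}
	return 0;
}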
@@ -2004,14 +1718,25 @@ out:
2004 } 1718 }
2005} 1719}
2006 1720
1721/* Use reclaim/compaction for costly allocs or under memory pressure */
1722static bool in_reclaim_compaction(struct scan_control *sc)
1723{
1724 if (COMPACTION_BUILD && sc->order &&
1725 (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
1726 sc->priority < DEF_PRIORITY - 2))
1727 return true;
1728
1729 return false;
1730}
1731
2007/* 1732/*
2008 * Reclaim/compaction depends on a number of pages being freed. To avoid 1733 * Reclaim/compaction is used for high-order allocation requests. It reclaims
2009 * disruption to the system, a small number of order-0 pages continue to be 1734 * order-0 pages before compacting the zone. should_continue_reclaim() returns
2010 * rotated and reclaimed in the normal fashion. However, by the time we get 1735 * true if more pages should be reclaimed such that when the page allocator
2011 * back to the allocator and call try_to_compact_zone(), we ensure that 1736 * calls try_to_compact_zone() that it will have enough free pages to succeed.
2012 * there are enough free pages for it to be likely successful 1737 * It will give up earlier than that if there is difficulty reclaiming pages.
2013 */ 1738 */
2014static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz, 1739static inline bool should_continue_reclaim(struct lruvec *lruvec,
2015 unsigned long nr_reclaimed, 1740 unsigned long nr_reclaimed,
2016 unsigned long nr_scanned, 1741 unsigned long nr_scanned,
2017 struct scan_control *sc) 1742 struct scan_control *sc)
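in_reclaim_compaction() above is the direct replacement for the RECLAIM_MODE_COMPACTION flag: reclaim/compaction applies only when compaction is compiled in, the request has a non-zero order, and either that order is costly (above PAGE_ALLOC_COSTLY_ORDER) or priority has dropped below DEF_PRIORITY - 2. The same predicate restated as a standalone program; COMPACTION_BUILD is reduced to a constant standing in for CONFIG_COMPACTION=y:

#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY		12
#define PAGE_ALLOC_COSTLY_ORDER	3
#define COMPACTION_BUILD	1	/* stand-in for CONFIG_COMPACTION=y */

/* Mirrors the predicate introduced by this patch. */
static bool in_reclaim_compaction(int order, int priority)
{
	return COMPACTION_BUILD && order &&
	       (order > PAGE_ALLOC_COSTLY_ORDER ||
		priority < DEF_PRIORITY - 2);
}

int main(void)
{
	/* order-0: never; order-2 at full priority: no; order-2 once priority
	 * falls to 9: yes; order-4 (costly): always. */
	printf("%d %d %d %d\n",
	       in_reclaim_compaction(0, DEF_PRIORITY),
	       in_reclaim_compaction(2, DEF_PRIORITY),
	       in_reclaim_compaction(2, 9),
	       in_reclaim_compaction(4, DEF_PRIORITY));
	return 0;
}

Output is 0 0 1 1, the same order/priority tests that set_reclaim_mode() used to apply, minus the lumpy fallback.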
@@ -2020,7 +1745,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
2020 unsigned long inactive_lru_pages; 1745 unsigned long inactive_lru_pages;
2021 1746
2022 /* If not in reclaim/compaction mode, stop */ 1747 /* If not in reclaim/compaction mode, stop */
2023 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) 1748 if (!in_reclaim_compaction(sc))
2024 return false; 1749 return false;
2025 1750
2026 /* Consider stopping depending on scan and reclaim activity */ 1751 /* Consider stopping depending on scan and reclaim activity */
@@ -2051,15 +1776,15 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
2051 * inactive lists are large enough, continue reclaiming 1776 * inactive lists are large enough, continue reclaiming
2052 */ 1777 */
2053 pages_for_compaction = (2UL << sc->order); 1778 pages_for_compaction = (2UL << sc->order);
2054 inactive_lru_pages = zone_nr_lru_pages(mz, LRU_INACTIVE_FILE); 1779 inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
2055 if (nr_swap_pages > 0) 1780 if (nr_swap_pages > 0)
2056 inactive_lru_pages += zone_nr_lru_pages(mz, LRU_INACTIVE_ANON); 1781 inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
2057 if (sc->nr_reclaimed < pages_for_compaction && 1782 if (sc->nr_reclaimed < pages_for_compaction &&
2058 inactive_lru_pages > pages_for_compaction) 1783 inactive_lru_pages > pages_for_compaction)
2059 return true; 1784 return true;
2060 1785
2061 /* If compaction would go ahead or the allocation would succeed, stop */ 1786 /* If compaction would go ahead or the allocation would succeed, stop */
2062 switch (compaction_suitable(mz->zone, sc->order)) { 1787 switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) {
2063 case COMPACT_PARTIAL: 1788 case COMPACT_PARTIAL:
2064 case COMPACT_CONTINUE: 1789 case COMPACT_CONTINUE:
2065 return false; 1790 return false;
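should_continue_reclaim() above keeps the shrink_lruvec() loop going for a high-order request until either enough pages (2 << order) have been reclaimed or the inactive lists are no longer larger than that target, at which point compaction_suitable() gets the final word. A small model of the page-count half of that test; whether inactive anon counts depends on swap being available, and the sizes used here are invented:

#include <stdbool.h>
#include <stdio.h>

/*
 * Models only the "enough pages for compaction yet?" part of
 * should_continue_reclaim(); the compaction_suitable() call is left out.
 */
static bool keep_reclaiming(int order, unsigned long nr_reclaimed,
			    unsigned long inactive_file,
			    unsigned long inactive_anon, bool have_swap)
{
	unsigned long pages_for_compaction = 2UL << order;
	unsigned long inactive = inactive_file + (have_swap ? inactive_anon : 0);

	return nr_reclaimed < pages_for_compaction &&
	       inactive > pages_for_compaction;
}

int main(void)
{
	/* order-3 request needs 16 pages; only 5 reclaimed, lists still big. */
	printf("continue: %d\n", keep_reclaiming(3, 5, 4096, 1024, true));
	/* 20 >= 16 already reclaimed, so stop and let compaction have a go. */
	printf("continue: %d\n", keep_reclaiming(3, 20, 4096, 1024, true));
	return 0;
}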
@@ -2071,8 +1796,7 @@ static inline bool should_continue_reclaim(struct mem_cgroup_zone *mz,
2071/* 1796/*
2072 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1797 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
2073 */ 1798 */
2074static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz, 1799static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2075 struct scan_control *sc)
2076{ 1800{
2077 unsigned long nr[NR_LRU_LISTS]; 1801 unsigned long nr[NR_LRU_LISTS];
2078 unsigned long nr_to_scan; 1802 unsigned long nr_to_scan;
@@ -2084,7 +1808,7 @@ static void shrink_mem_cgroup_zone(int priority, struct mem_cgroup_zone *mz,
2084restart: 1808restart:
2085 nr_reclaimed = 0; 1809 nr_reclaimed = 0;
2086 nr_scanned = sc->nr_scanned; 1810 nr_scanned = sc->nr_scanned;
2087 get_scan_count(mz, sc, nr, priority); 1811 get_scan_count(lruvec, sc, nr);
2088 1812
2089 blk_start_plug(&plug); 1813 blk_start_plug(&plug);
2090 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1814 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -2096,7 +1820,7 @@ restart:
2096 nr[lru] -= nr_to_scan; 1820 nr[lru] -= nr_to_scan;
2097 1821
2098 nr_reclaimed += shrink_list(lru, nr_to_scan, 1822 nr_reclaimed += shrink_list(lru, nr_to_scan,
2099 mz, sc, priority); 1823 lruvec, sc);
2100 } 1824 }
2101 } 1825 }
2102 /* 1826 /*
@@ -2107,12 +1831,8 @@ restart:
2107 * with multiple processes reclaiming pages, the total 1831 * with multiple processes reclaiming pages, the total
2108 * freeing target can get unreasonably large. 1832 * freeing target can get unreasonably large.
2109 */ 1833 */
2110 if (nr_reclaimed >= nr_to_reclaim) 1834 if (nr_reclaimed >= nr_to_reclaim &&
2111 nr_to_reclaim = 0; 1835 sc->priority < DEF_PRIORITY)
2112 else
2113 nr_to_reclaim -= nr_reclaimed;
2114
2115 if (!nr_to_reclaim && priority < DEF_PRIORITY)
2116 break; 1836 break;
2117 } 1837 }
2118 blk_finish_plug(&plug); 1838 blk_finish_plug(&plug);
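
The rewritten break test in the scan loop above only aborts a pass early when both conditions hold: the reclaim target has been met and the scan is running below DEF_PRIORITY, so a first full-priority pass always finishes its proportional share of scanning even with many reclaimers active. A hedged sketch of just that predicate (DEF_PRIORITY mirrors the kernel value, the rest is illustrative):

#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY 12

/* The pass is only cut short once the target is met AND we are below
 * DEF_PRIORITY; a full-priority pass always completes. */
static bool abort_pass(unsigned long nr_reclaimed,
                       unsigned long nr_to_reclaim, int priority)
{
        return nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY;
}

int main(void)
{
        printf("%d\n", abort_pass(64, 32, DEF_PRIORITY));     /* 0: finish pass */
        printf("%d\n", abort_pass(64, 32, DEF_PRIORITY - 3)); /* 1: stop early */
        return 0;
}
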
@@ -2122,35 +1842,33 @@ restart:
2122 * Even if we did not try to evict anon pages at all, we want to 1842 * Even if we did not try to evict anon pages at all, we want to
2123 * rebalance the anon lru active/inactive ratio. 1843 * rebalance the anon lru active/inactive ratio.
2124 */ 1844 */
2125 if (inactive_anon_is_low(mz)) 1845 if (inactive_anon_is_low(lruvec))
2126 shrink_active_list(SWAP_CLUSTER_MAX, mz, sc, priority, 0); 1846 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
1847 sc, LRU_ACTIVE_ANON);
2127 1848
2128 /* reclaim/compaction might need reclaim to continue */ 1849 /* reclaim/compaction might need reclaim to continue */
2129 if (should_continue_reclaim(mz, nr_reclaimed, 1850 if (should_continue_reclaim(lruvec, nr_reclaimed,
2130 sc->nr_scanned - nr_scanned, sc)) 1851 sc->nr_scanned - nr_scanned, sc))
2131 goto restart; 1852 goto restart;
2132 1853
2133 throttle_vm_writeout(sc->gfp_mask); 1854 throttle_vm_writeout(sc->gfp_mask);
2134} 1855}
2135 1856
2136static void shrink_zone(int priority, struct zone *zone, 1857static void shrink_zone(struct zone *zone, struct scan_control *sc)
2137 struct scan_control *sc)
2138{ 1858{
2139 struct mem_cgroup *root = sc->target_mem_cgroup; 1859 struct mem_cgroup *root = sc->target_mem_cgroup;
2140 struct mem_cgroup_reclaim_cookie reclaim = { 1860 struct mem_cgroup_reclaim_cookie reclaim = {
2141 .zone = zone, 1861 .zone = zone,
2142 .priority = priority, 1862 .priority = sc->priority,
2143 }; 1863 };
2144 struct mem_cgroup *memcg; 1864 struct mem_cgroup *memcg;
2145 1865
2146 memcg = mem_cgroup_iter(root, NULL, &reclaim); 1866 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2147 do { 1867 do {
2148 struct mem_cgroup_zone mz = { 1868 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2149 .mem_cgroup = memcg, 1869
2150 .zone = zone, 1870 shrink_lruvec(lruvec, sc);
2151 };
2152 1871
2153 shrink_mem_cgroup_zone(priority, &mz, sc);
2154 /* 1872 /*
2155 * Limit reclaim has historically picked one memcg and 1873 * Limit reclaim has historically picked one memcg and
2156 * scanned it with decreasing priority levels until 1874 * scanned it with decreasing priority levels until
@@ -2226,8 +1944,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2226 * the caller that it should consider retrying the allocation instead of 1944 * the caller that it should consider retrying the allocation instead of
2227 * further reclaim. 1945 * further reclaim.
2228 */ 1946 */
2229static bool shrink_zones(int priority, struct zonelist *zonelist, 1947static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2230 struct scan_control *sc)
2231{ 1948{
2232 struct zoneref *z; 1949 struct zoneref *z;
2233 struct zone *zone; 1950 struct zone *zone;
@@ -2254,7 +1971,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2254 if (global_reclaim(sc)) { 1971 if (global_reclaim(sc)) {
2255 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1972 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2256 continue; 1973 continue;
2257 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1974 if (zone->all_unreclaimable &&
1975 sc->priority != DEF_PRIORITY)
2258 continue; /* Let kswapd poll it */ 1976 continue; /* Let kswapd poll it */
2259 if (COMPACTION_BUILD) { 1977 if (COMPACTION_BUILD) {
2260 /* 1978 /*
@@ -2286,7 +2004,7 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
2286 /* need some check for avoid more shrink_zone() */ 2004 /* need some check for avoid more shrink_zone() */
2287 } 2005 }
2288 2006
2289 shrink_zone(priority, zone, sc); 2007 shrink_zone(zone, sc);
2290 } 2008 }
2291 2009
2292 return aborted_reclaim; 2010 return aborted_reclaim;
@@ -2337,7 +2055,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2337 struct scan_control *sc, 2055 struct scan_control *sc,
2338 struct shrink_control *shrink) 2056 struct shrink_control *shrink)
2339{ 2057{
2340 int priority;
2341 unsigned long total_scanned = 0; 2058 unsigned long total_scanned = 0;
2342 struct reclaim_state *reclaim_state = current->reclaim_state; 2059 struct reclaim_state *reclaim_state = current->reclaim_state;
2343 struct zoneref *z; 2060 struct zoneref *z;
@@ -2350,11 +2067,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2350 if (global_reclaim(sc)) 2067 if (global_reclaim(sc))
2351 count_vm_event(ALLOCSTALL); 2068 count_vm_event(ALLOCSTALL);
2352 2069
2353 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2070 do {
2354 sc->nr_scanned = 0; 2071 sc->nr_scanned = 0;
2355 if (!priority) 2072 aborted_reclaim = shrink_zones(zonelist, sc);
2356 disable_swap_token(sc->target_mem_cgroup);
2357 aborted_reclaim = shrink_zones(priority, zonelist, sc);
2358 2073
2359 /* 2074 /*
2360 * Don't shrink slabs when reclaiming memory from 2075 * Don't shrink slabs when reclaiming memory from
@@ -2396,7 +2111,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2396 2111
2397 /* Take a nap, wait for some writeback to complete */ 2112 /* Take a nap, wait for some writeback to complete */
2398 if (!sc->hibernation_mode && sc->nr_scanned && 2113 if (!sc->hibernation_mode && sc->nr_scanned &&
2399 priority < DEF_PRIORITY - 2) { 2114 sc->priority < DEF_PRIORITY - 2) {
2400 struct zone *preferred_zone; 2115 struct zone *preferred_zone;
2401 2116
2402 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), 2117 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
@@ -2404,7 +2119,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2404 &preferred_zone); 2119 &preferred_zone);
2405 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); 2120 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2406 } 2121 }
2407 } 2122 } while (--sc->priority >= 0);
2408 2123
2409out: 2124out:
2410 delayacct_freepages_end(); 2125 delayacct_freepages_end();
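
With priority folded into struct scan_control, the driver loop above becomes a plain do/while that counts sc->priority down from DEF_PRIORITY to 0; each pass scans roughly lru_size >> priority pages, so pressure ramps up as the count falls. A standalone sketch of the loop shape (the scan_control here is a stub, not the kernel structure):

#include <stdio.h>

#define DEF_PRIORITY 12

struct scan_control {            /* stub with just the fields used here */
        int priority;
        unsigned long nr_scanned;
};

int main(void)
{
        struct scan_control sc = { .priority = DEF_PRIORITY };

        do {
                sc.nr_scanned = 0;
                /* shrink_zones(zonelist, &sc) would run here */
                printf("priority %2d scans ~1/%lu of each LRU\n",
                       sc.priority, 1UL << sc.priority);
        } while (--sc.priority >= 0);

        return 0;
}
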
@@ -2431,6 +2146,83 @@ out:
2431 return 0; 2146 return 0;
2432} 2147}
2433 2148
2149static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2150{
2151 struct zone *zone;
2152 unsigned long pfmemalloc_reserve = 0;
2153 unsigned long free_pages = 0;
2154 int i;
2155 bool wmark_ok;
2156
2157 for (i = 0; i <= ZONE_NORMAL; i++) {
2158 zone = &pgdat->node_zones[i];
2159 pfmemalloc_reserve += min_wmark_pages(zone);
2160 free_pages += zone_page_state(zone, NR_FREE_PAGES);
2161 }
2162
2163 wmark_ok = free_pages > pfmemalloc_reserve / 2;
2164
2165 /* kswapd must be awake if processes are being throttled */
2166 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
2167 pgdat->classzone_idx = min(pgdat->classzone_idx,
2168 (enum zone_type)ZONE_NORMAL);
2169 wake_up_interruptible(&pgdat->kswapd_wait);
2170 }
2171
2172 return wmark_ok;
2173}
2174
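
pfmemalloc_watermark_ok() sums the min watermarks of the zones up to ZONE_NORMAL and reports trouble once free pages fall below half of that combined reserve (free_pages > pfmemalloc_reserve / 2 is the healthy case). A userspace sketch of the arithmetic with invented zone sizes:

#include <stdbool.h>
#include <stdio.h>

/* Sketch of the reserve check only; the zone values are made up and the
 * kswapd wakeup side effect is omitted. */
static bool watermark_ok(const unsigned long *min_wmark,
                         const unsigned long *free_pages, int nr_zones)
{
        unsigned long reserve = 0, free = 0;
        int i;

        for (i = 0; i < nr_zones; i++) {
                reserve += min_wmark[i];
                free += free_pages[i];
        }

        return free > reserve / 2;
}

int main(void)
{
        unsigned long min_wmark[] = { 128, 4096 };   /* DMA, NORMAL */
        unsigned long healthy[]   = { 500, 9000 };
        unsigned long depleted[]  = {  10, 1500 };

        /* reserve = 4224 pages, so the threshold is 2112 free pages */
        printf("%d %d\n", watermark_ok(min_wmark, healthy, 2),
                          watermark_ok(min_wmark, depleted, 2));
        return 0;
}
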
2175/*
2176 * Throttle direct reclaimers if backing storage is backed by the network
2177 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
2178 * depleted. kswapd will continue to make progress and wake the processes
2179 * when the low watermark is reached
2180 */
2181static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2182 nodemask_t *nodemask)
2183{
2184 struct zone *zone;
2185 int high_zoneidx = gfp_zone(gfp_mask);
2186 pg_data_t *pgdat;
2187
2188 /*
2189 * Kernel threads should not be throttled as they may be indirectly
2190 * responsible for cleaning pages necessary for reclaim to make forward
2191 * progress. kjournald for example may enter direct reclaim while
2192 * committing a transaction where throttling it could forcing other
2192 * committing a transaction where throttling it could force other
2193 * processes to block on log_wait_commit().
2194 */
2195 if (current->flags & PF_KTHREAD)
2196 return;
2197
2198 /* Check if the pfmemalloc reserves are ok */
2199 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
2200 pgdat = zone->zone_pgdat;
2201 if (pfmemalloc_watermark_ok(pgdat))
2202 return;
2203
2204 /* Account for the throttling */
2205 count_vm_event(PGSCAN_DIRECT_THROTTLE);
2206
2207 /*
2208 * If the caller cannot enter the filesystem, it's possible that it
2209 * is due to the caller holding an FS lock or performing a journal
2210 * transaction in the case of a filesystem like ext[3|4]. In this case,
2211 * it is not safe to block on pfmemalloc_wait as kswapd could be
2212 * blocked waiting on the same lock. Instead, throttle for up to a
2213 * second before continuing.
2214 */
2215 if (!(gfp_mask & __GFP_FS)) {
2216 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2217 pfmemalloc_watermark_ok(pgdat), HZ);
2218 return;
2219 }
2220
2221 /* Throttle until kswapd wakes the process */
2222 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2223 pfmemalloc_watermark_ok(pgdat));
2224}
2225
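
throttle_direct_reclaim() picks one of three behaviours: kernel threads (and callers on a node whose reserves are still fine) are never throttled, !__GFP_FS callers get a bounded one-second wait because they may hold filesystem locks kswapd itself needs, and everyone else sleeps killably until kswapd refills the reserves. A compact sketch of just that decision (the flag values are illustrative, not the kernel constants):

#include <stdbool.h>
#include <stdio.h>

#define PF_KTHREAD 0x1u   /* illustrative values, not the kernel constants */
#define __GFP_FS   0x2u

enum throttle { THROTTLE_NONE, THROTTLE_TIMEOUT, THROTTLE_UNTIL_KSWAPD };

/* Mirrors the policy, not the implementation: no waitqueues here. */
static enum throttle throttle_policy(unsigned int task_flags,
                                     unsigned int gfp_mask, bool wmark_ok)
{
        if (task_flags & PF_KTHREAD)
                return THROTTLE_NONE;       /* may be cleaning pages for us */
        if (wmark_ok)
                return THROTTLE_NONE;       /* reserves are still healthy */
        if (!(gfp_mask & __GFP_FS))
                return THROTTLE_TIMEOUT;    /* bounded wait, may hold fs locks */
        return THROTTLE_UNTIL_KSWAPD;       /* sleep until kswapd makes room */
}

int main(void)
{
        printf("%d %d %d\n",
               throttle_policy(PF_KTHREAD, 0, false),   /* 0 */
               throttle_policy(0, 0, false),            /* 1 */
               throttle_policy(0, __GFP_FS, false));    /* 2 */
        return 0;
}
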
2434unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 2226unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2435 gfp_t gfp_mask, nodemask_t *nodemask) 2227 gfp_t gfp_mask, nodemask_t *nodemask)
2436{ 2228{
@@ -2442,6 +2234,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2442 .may_unmap = 1, 2234 .may_unmap = 1,
2443 .may_swap = 1, 2235 .may_swap = 1,
2444 .order = order, 2236 .order = order,
2237 .priority = DEF_PRIORITY,
2445 .target_mem_cgroup = NULL, 2238 .target_mem_cgroup = NULL,
2446 .nodemask = nodemask, 2239 .nodemask = nodemask,
2447 }; 2240 };
@@ -2449,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2449 .gfp_mask = sc.gfp_mask, 2242 .gfp_mask = sc.gfp_mask,
2450 }; 2243 };
2451 2244
2245 throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
2246
2247 /*
2248 * Do not enter reclaim if fatal signal is pending. 1 is returned so
2249 * that the page allocator does not consider triggering OOM
2250 */
2251 if (fatal_signal_pending(current))
2252 return 1;
2253
2452 trace_mm_vmscan_direct_reclaim_begin(order, 2254 trace_mm_vmscan_direct_reclaim_begin(order,
2453 sc.may_writepage, 2255 sc.may_writepage,
2454 gfp_mask); 2256 gfp_mask);
@@ -2460,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2460 return nr_reclaimed; 2262 return nr_reclaimed;
2461} 2263}
2462 2264
2463#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2265#ifdef CONFIG_MEMCG
2464 2266
2465unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, 2267unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2466 gfp_t gfp_mask, bool noswap, 2268 gfp_t gfp_mask, bool noswap,
@@ -2474,17 +2276,15 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2474 .may_unmap = 1, 2276 .may_unmap = 1,
2475 .may_swap = !noswap, 2277 .may_swap = !noswap,
2476 .order = 0, 2278 .order = 0,
2279 .priority = 0,
2477 .target_mem_cgroup = memcg, 2280 .target_mem_cgroup = memcg,
2478 }; 2281 };
2479 struct mem_cgroup_zone mz = { 2282 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2480 .mem_cgroup = memcg,
2481 .zone = zone,
2482 };
2483 2283
2484 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2284 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2485 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2285 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2486 2286
2487 trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, 2287 trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
2488 sc.may_writepage, 2288 sc.may_writepage,
2489 sc.gfp_mask); 2289 sc.gfp_mask);
2490 2290
@@ -2495,7 +2295,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2495 * will pick up pages from other mem cgroup's as well. We hack 2295 * will pick up pages from other mem cgroup's as well. We hack
2496 * the priority and make it zero. 2296 * the priority and make it zero.
2497 */ 2297 */
2498 shrink_mem_cgroup_zone(0, &mz, &sc); 2298 shrink_lruvec(lruvec, &sc);
2499 2299
2500 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2300 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2501 2301
@@ -2516,6 +2316,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2516 .may_swap = !noswap, 2316 .may_swap = !noswap,
2517 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2317 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2518 .order = 0, 2318 .order = 0,
2319 .priority = DEF_PRIORITY,
2519 .target_mem_cgroup = memcg, 2320 .target_mem_cgroup = memcg,
2520 .nodemask = NULL, /* we don't care the placement */ 2321 .nodemask = NULL, /* we don't care the placement */
2521 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2322 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
@@ -2546,8 +2347,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
2546} 2347}
2547#endif 2348#endif
2548 2349
2549static void age_active_anon(struct zone *zone, struct scan_control *sc, 2350static void age_active_anon(struct zone *zone, struct scan_control *sc)
2550 int priority)
2551{ 2351{
2552 struct mem_cgroup *memcg; 2352 struct mem_cgroup *memcg;
2553 2353
@@ -2556,14 +2356,11 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc,
2556 2356
2557 memcg = mem_cgroup_iter(NULL, NULL, NULL); 2357 memcg = mem_cgroup_iter(NULL, NULL, NULL);
2558 do { 2358 do {
2559 struct mem_cgroup_zone mz = { 2359 struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2560 .mem_cgroup = memcg,
2561 .zone = zone,
2562 };
2563 2360
2564 if (inactive_anon_is_low(&mz)) 2361 if (inactive_anon_is_low(lruvec))
2565 shrink_active_list(SWAP_CLUSTER_MAX, &mz, 2362 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2566 sc, priority, 0); 2363 sc, LRU_ACTIVE_ANON);
2567 2364
2568 memcg = mem_cgroup_iter(NULL, memcg, NULL); 2365 memcg = mem_cgroup_iter(NULL, memcg, NULL);
2569 } while (memcg); 2366 } while (memcg);
@@ -2598,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2598 return balanced_pages >= (present_pages >> 2); 2395 return balanced_pages >= (present_pages >> 2);
2599} 2396}
2600 2397
2601/* is kswapd sleeping prematurely? */ 2398/*
2602static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, 2399 * Prepare kswapd for sleeping. This verifies that there are no processes
2400 * waiting in throttle_direct_reclaim() and that watermarks have been met.
2401 *
2402 * Returns true if kswapd is ready to sleep
2403 */
2404static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2603 int classzone_idx) 2405 int classzone_idx)
2604{ 2406{
2605 int i; 2407 int i;
@@ -2608,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2608 2410
2609 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2411 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2610 if (remaining) 2412 if (remaining)
2611 return true; 2413 return false;
2414
2415 /*
2416 * There is a potential race between when kswapd checks its watermarks
2417 * and a process gets throttled. There is also a potential race if
 2418 * processes get throttled, kswapd wakes, a large process exits thereby
2419 * balancing the zones that causes kswapd to miss a wakeup. If kswapd
2420 * is going to sleep, no process should be sleeping on pfmemalloc_wait
2421 * so wake them now if necessary. If necessary, processes will wake
2422 * kswapd and get throttled again
2423 */
2424 if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
2425 wake_up(&pgdat->pfmemalloc_wait);
2426 return false;
2427 }
2612 2428
2613 /* Check the watermark levels */ 2429 /* Check the watermark levels */
2614 for (i = 0; i <= classzone_idx; i++) { 2430 for (i = 0; i <= classzone_idx; i++) {
@@ -2641,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2641 * must be balanced 2457 * must be balanced
2642 */ 2458 */
2643 if (order) 2459 if (order)
2644 return !pgdat_balanced(pgdat, balanced, classzone_idx); 2460 return pgdat_balanced(pgdat, balanced, classzone_idx);
2645 else 2461 else
2646 return !all_zones_ok; 2462 return all_zones_ok;
2647} 2463}
2648 2464
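
The rename from sleeping_prematurely() to prepare_kswapd_sleep() also inverts the return convention: callers used to negate "is this sleep premature?" and now ask "is kswapd ready to sleep?" directly, with the new pfmemalloc waitqueue check forcing an early "no" so throttled reclaimers get woken first. A small sketch of the inverted predicate (plain booleans stand in for the pgdat state):

#include <stdbool.h>
#include <stdio.h>

/* Answers "ready to sleep?" directly instead of "was the sleep premature?". */
static bool prepare_sleep(bool woken_recently, bool reclaimers_throttled,
                          bool zones_balanced)
{
        if (woken_recently)
                return false;           /* a direct reclaimer needs us awake */
        if (reclaimers_throttled)
                return false;           /* wake pfmemalloc waiters first */
        return zones_balanced;          /* sleep only once watermarks are met */
}

int main(void)
{
        printf("%d\n", prepare_sleep(false, false, true));  /* 1: sleep */
        printf("%d\n", prepare_sleep(false, true, true));   /* 0: waiters pending */
        return 0;
}
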
2649/* 2465/*
@@ -2672,7 +2488,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2672{ 2488{
2673 int all_zones_ok; 2489 int all_zones_ok;
2674 unsigned long balanced; 2490 unsigned long balanced;
2675 int priority;
2676 int i; 2491 int i;
2677 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2492 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2678 unsigned long total_scanned; 2493 unsigned long total_scanned;
@@ -2696,18 +2511,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2696 }; 2511 };
2697loop_again: 2512loop_again:
2698 total_scanned = 0; 2513 total_scanned = 0;
2514 sc.priority = DEF_PRIORITY;
2699 sc.nr_reclaimed = 0; 2515 sc.nr_reclaimed = 0;
2700 sc.may_writepage = !laptop_mode; 2516 sc.may_writepage = !laptop_mode;
2701 count_vm_event(PAGEOUTRUN); 2517 count_vm_event(PAGEOUTRUN);
2702 2518
2703 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2519 do {
2704 unsigned long lru_pages = 0; 2520 unsigned long lru_pages = 0;
2705 int has_under_min_watermark_zone = 0; 2521 int has_under_min_watermark_zone = 0;
2706 2522
2707 /* The swap token gets in the way of swapout... */
2708 if (!priority)
2709 disable_swap_token(NULL);
2710
2711 all_zones_ok = 1; 2523 all_zones_ok = 1;
2712 balanced = 0; 2524 balanced = 0;
2713 2525
@@ -2721,14 +2533,15 @@ loop_again:
2721 if (!populated_zone(zone)) 2533 if (!populated_zone(zone))
2722 continue; 2534 continue;
2723 2535
2724 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2536 if (zone->all_unreclaimable &&
2537 sc.priority != DEF_PRIORITY)
2725 continue; 2538 continue;
2726 2539
2727 /* 2540 /*
2728 * Do some background aging of the anon list, to give 2541 * Do some background aging of the anon list, to give
2729 * pages a chance to be referenced before reclaiming. 2542 * pages a chance to be referenced before reclaiming.
2730 */ 2543 */
2731 age_active_anon(zone, &sc, priority); 2544 age_active_anon(zone, &sc);
2732 2545
2733 /* 2546 /*
2734 * If the number of buffer_heads in the machine 2547 * If the number of buffer_heads in the machine
@@ -2776,7 +2589,8 @@ loop_again:
2776 if (!populated_zone(zone)) 2589 if (!populated_zone(zone))
2777 continue; 2590 continue;
2778 2591
2779 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2592 if (zone->all_unreclaimable &&
2593 sc.priority != DEF_PRIORITY)
2780 continue; 2594 continue;
2781 2595
2782 sc.nr_scanned = 0; 2596 sc.nr_scanned = 0;
@@ -2820,7 +2634,7 @@ loop_again:
2820 !zone_watermark_ok_safe(zone, testorder, 2634 !zone_watermark_ok_safe(zone, testorder,
2821 high_wmark_pages(zone) + balance_gap, 2635 high_wmark_pages(zone) + balance_gap,
2822 end_zone, 0)) { 2636 end_zone, 0)) {
2823 shrink_zone(priority, zone, &sc); 2637 shrink_zone(zone, &sc);
2824 2638
2825 reclaim_state->reclaimed_slab = 0; 2639 reclaim_state->reclaimed_slab = 0;
2826 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); 2640 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
@@ -2863,7 +2677,7 @@ loop_again:
2863 * consider it to be no longer congested. It's 2677 * consider it to be no longer congested. It's
2864 * possible there are dirty pages backed by 2678 * possible there are dirty pages backed by
2865 * congested BDIs but as pressure is relieved, 2679 * congested BDIs but as pressure is relieved,
2866 * spectulatively avoid congestion waits 2680 * speculatively avoid congestion waits
2867 */ 2681 */
2868 zone_clear_flag(zone, ZONE_CONGESTED); 2682 zone_clear_flag(zone, ZONE_CONGESTED);
2869 if (i <= *classzone_idx) 2683 if (i <= *classzone_idx)
@@ -2871,13 +2685,23 @@ loop_again:
2871 } 2685 }
2872 2686
2873 } 2687 }
2688
2689 /*
2690 * If the low watermark is met there is no need for processes
2691 * to be throttled on pfmemalloc_wait as they should not be
2692 * able to safely make forward progress. Wake them
2693 */
2694 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
2695 pfmemalloc_watermark_ok(pgdat))
2696 wake_up(&pgdat->pfmemalloc_wait);
2697
2874 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) 2698 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2875 break; /* kswapd: all done */ 2699 break; /* kswapd: all done */
2876 /* 2700 /*
2877 * OK, kswapd is getting into trouble. Take a nap, then take 2701 * OK, kswapd is getting into trouble. Take a nap, then take
2878 * another pass across the zones. 2702 * another pass across the zones.
2879 */ 2703 */
2880 if (total_scanned && (priority < DEF_PRIORITY - 2)) { 2704 if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
2881 if (has_under_min_watermark_zone) 2705 if (has_under_min_watermark_zone)
2882 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); 2706 count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
2883 else 2707 else
@@ -2892,7 +2716,7 @@ loop_again:
2892 */ 2716 */
2893 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) 2717 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
2894 break; 2718 break;
2895 } 2719 } while (--sc.priority >= 0);
2896out: 2720out:
2897 2721
2898 /* 2722 /*
@@ -2942,7 +2766,8 @@ out:
2942 if (!populated_zone(zone)) 2766 if (!populated_zone(zone))
2943 continue; 2767 continue;
2944 2768
2945 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2769 if (zone->all_unreclaimable &&
2770 sc.priority != DEF_PRIORITY)
2946 continue; 2771 continue;
2947 2772
2948 /* Would compaction fail due to lack of free memory? */ 2773 /* Would compaction fail due to lack of free memory? */
@@ -2971,7 +2796,7 @@ out:
2971 } 2796 }
2972 2797
2973 /* 2798 /*
2974 * Return the order we were reclaiming at so sleeping_prematurely() 2799 * Return the order we were reclaiming at so prepare_kswapd_sleep()
2975 * makes a decision on the order we were last reclaiming at. However, 2800 * makes a decision on the order we were last reclaiming at. However,
2976 * if another caller entered the allocator slow path while kswapd 2801 * if another caller entered the allocator slow path while kswapd
2977 * was awake, order will remain at the higher level 2802 * was awake, order will remain at the higher level
@@ -2991,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2991 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2816 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2992 2817
2993 /* Try to sleep for a short interval */ 2818 /* Try to sleep for a short interval */
2994 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2819 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
2995 remaining = schedule_timeout(HZ/10); 2820 remaining = schedule_timeout(HZ/10);
2996 finish_wait(&pgdat->kswapd_wait, &wait); 2821 finish_wait(&pgdat->kswapd_wait, &wait);
2997 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2822 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -3001,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
3001 * After a short sleep, check if it was a premature sleep. If not, then 2826 * After a short sleep, check if it was a premature sleep. If not, then
3002 * go fully to sleep until explicitly woken up. 2827 * go fully to sleep until explicitly woken up.
3003 */ 2828 */
3004 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2829 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
3005 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); 2830 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
3006 2831
3007 /* 2832 /*
@@ -3013,7 +2838,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
3013 * them before going back to sleep. 2838 * them before going back to sleep.
3014 */ 2839 */
3015 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 2840 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
3016 schedule(); 2841
2842 if (!kthread_should_stop())
2843 schedule();
2844
3017 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 2845 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
3018 } else { 2846 } else {
3019 if (remaining) 2847 if (remaining)
@@ -3209,6 +3037,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
3209 .nr_to_reclaim = nr_to_reclaim, 3037 .nr_to_reclaim = nr_to_reclaim,
3210 .hibernation_mode = 1, 3038 .hibernation_mode = 1,
3211 .order = 0, 3039 .order = 0,
3040 .priority = DEF_PRIORITY,
3212 }; 3041 };
3213 struct shrink_control shrink = { 3042 struct shrink_control shrink = {
3214 .gfp_mask = sc.gfp_mask, 3043 .gfp_mask = sc.gfp_mask,
@@ -3279,14 +3108,17 @@ int kswapd_run(int nid)
3279} 3108}
3280 3109
3281/* 3110/*
3282 * Called by memory hotplug when all memory in a node is offlined. 3111 * Called by memory hotplug when all memory in a node is offlined. Caller must
3112 * hold lock_memory_hotplug().
3283 */ 3113 */
3284void kswapd_stop(int nid) 3114void kswapd_stop(int nid)
3285{ 3115{
3286 struct task_struct *kswapd = NODE_DATA(nid)->kswapd; 3116 struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
3287 3117
3288 if (kswapd) 3118 if (kswapd) {
3289 kthread_stop(kswapd); 3119 kthread_stop(kswapd);
3120 NODE_DATA(nid)->kswapd = NULL;
3121 }
3290} 3122}
3291 3123
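
Clearing NODE_DATA(nid)->kswapd after kthread_stop() matters for repeated hotplug cycles: a later offline would otherwise call kthread_stop() on a task that has already exited. The same guard pattern in plain C, with malloc/free standing in for the kthread (purely illustrative):

#include <stdlib.h>

struct node_data { void *kswapd; };     /* stand-in, not the kernel pg_data_t */

static void node_stop(struct node_data *n)
{
        if (n->kswapd) {
                free(n->kswapd);        /* plays the role of kthread_stop() */
                n->kswapd = NULL;       /* never act on the stale handle again */
        }
}

int main(void)
{
        struct node_data n = { .kswapd = malloc(16) };

        node_stop(&n);
        node_stop(&n);                  /* second offline: safely a no-op */
        return 0;
}
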
3292static int __init kswapd_init(void) 3124static int __init kswapd_init(void)
@@ -3386,7 +3218,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3386 const unsigned long nr_pages = 1 << order; 3218 const unsigned long nr_pages = 1 << order;
3387 struct task_struct *p = current; 3219 struct task_struct *p = current;
3388 struct reclaim_state reclaim_state; 3220 struct reclaim_state reclaim_state;
3389 int priority;
3390 struct scan_control sc = { 3221 struct scan_control sc = {
3391 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 3222 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
3392 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), 3223 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -3395,6 +3226,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3395 SWAP_CLUSTER_MAX), 3226 SWAP_CLUSTER_MAX),
3396 .gfp_mask = gfp_mask, 3227 .gfp_mask = gfp_mask,
3397 .order = order, 3228 .order = order,
3229 .priority = ZONE_RECLAIM_PRIORITY,
3398 }; 3230 };
3399 struct shrink_control shrink = { 3231 struct shrink_control shrink = {
3400 .gfp_mask = sc.gfp_mask, 3232 .gfp_mask = sc.gfp_mask,
@@ -3417,11 +3249,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
3417 * Free memory by calling shrink zone with increasing 3249 * Free memory by calling shrink zone with increasing
3418 * priorities until we have enough memory freed. 3250 * priorities until we have enough memory freed.
3419 */ 3251 */
3420 priority = ZONE_RECLAIM_PRIORITY;
3421 do { 3252 do {
3422 shrink_zone(priority, zone, &sc); 3253 shrink_zone(zone, &sc);
3423 priority--; 3254 } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
3424 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
3425 } 3255 }
3426 3256
3427 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 3257 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -3536,7 +3366,7 @@ int page_evictable(struct page *page, struct vm_area_struct *vma)
3536 if (mapping_unevictable(page_mapping(page))) 3366 if (mapping_unevictable(page_mapping(page)))
3537 return 0; 3367 return 0;
3538 3368
3539 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) 3369 if (PageMlocked(page) || (vma && mlocked_vma_newpage(vma, page)))
3540 return 0; 3370 return 0;
3541 3371
3542 return 1; 3372 return 1;
@@ -3572,6 +3402,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
3572 zone = pagezone; 3402 zone = pagezone;
3573 spin_lock_irq(&zone->lru_lock); 3403 spin_lock_irq(&zone->lru_lock);
3574 } 3404 }
3405 lruvec = mem_cgroup_page_lruvec(page, zone);
3575 3406
3576 if (!PageLRU(page) || !PageUnevictable(page)) 3407 if (!PageLRU(page) || !PageUnevictable(page))
3577 continue; 3408 continue;
@@ -3581,11 +3412,8 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
3581 3412
3582 VM_BUG_ON(PageActive(page)); 3413 VM_BUG_ON(PageActive(page));
3583 ClearPageUnevictable(page); 3414 ClearPageUnevictable(page);
3584 __dec_zone_state(zone, NR_UNEVICTABLE); 3415 del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
3585 lruvec = mem_cgroup_lru_move_lists(zone, page, 3416 add_page_to_lru_list(page, lruvec, lru);
3586 LRU_UNEVICTABLE, lru);
3587 list_move(&page->lru, &lruvec->lists[lru]);
3588 __inc_zone_state(zone, NR_INACTIVE_ANON + lru);
3589 pgrescued++; 3417 pgrescued++;
3590 } 3418 }
3591 } 3419 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f600557a7659..df7a6748231d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -613,6 +613,9 @@ static char * const migratetype_names[MIGRATE_TYPES] = {
613 "Reclaimable", 613 "Reclaimable",
614 "Movable", 614 "Movable",
615 "Reserve", 615 "Reserve",
616#ifdef CONFIG_CMA
617 "CMA",
618#endif
616 "Isolate", 619 "Isolate",
617}; 620};
618 621
@@ -738,16 +741,17 @@ const char * const vmstat_text[] = {
738 "pgmajfault", 741 "pgmajfault",
739 742
740 TEXTS_FOR_ZONES("pgrefill") 743 TEXTS_FOR_ZONES("pgrefill")
741 TEXTS_FOR_ZONES("pgsteal") 744 TEXTS_FOR_ZONES("pgsteal_kswapd")
745 TEXTS_FOR_ZONES("pgsteal_direct")
742 TEXTS_FOR_ZONES("pgscan_kswapd") 746 TEXTS_FOR_ZONES("pgscan_kswapd")
743 TEXTS_FOR_ZONES("pgscan_direct") 747 TEXTS_FOR_ZONES("pgscan_direct")
748 "pgscan_direct_throttle",
744 749
745#ifdef CONFIG_NUMA 750#ifdef CONFIG_NUMA
746 "zone_reclaim_failed", 751 "zone_reclaim_failed",
747#endif 752#endif
748 "pginodesteal", 753 "pginodesteal",
749 "slabs_scanned", 754 "slabs_scanned",
750 "kswapd_steal",
751 "kswapd_inodesteal", 755 "kswapd_inodesteal",
752 "kswapd_low_wmark_hit_quickly", 756 "kswapd_low_wmark_hit_quickly",
753 "kswapd_high_wmark_hit_quickly", 757 "kswapd_high_wmark_hit_quickly",
@@ -1220,7 +1224,6 @@ module_init(setup_vmstat)
1220#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) 1224#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
1221#include <linux/debugfs.h> 1225#include <linux/debugfs.h>
1222 1226
1223static struct dentry *extfrag_debug_root;
1224 1227
1225/* 1228/*
1226 * Return an index indicating how much of the available free memory is 1229 * Return an index indicating how much of the available free memory is
@@ -1358,19 +1361,24 @@ static const struct file_operations extfrag_file_ops = {
1358 1361
1359static int __init extfrag_debug_init(void) 1362static int __init extfrag_debug_init(void)
1360{ 1363{
1364 struct dentry *extfrag_debug_root;
1365
1361 extfrag_debug_root = debugfs_create_dir("extfrag", NULL); 1366 extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
1362 if (!extfrag_debug_root) 1367 if (!extfrag_debug_root)
1363 return -ENOMEM; 1368 return -ENOMEM;
1364 1369
1365 if (!debugfs_create_file("unusable_index", 0444, 1370 if (!debugfs_create_file("unusable_index", 0444,
1366 extfrag_debug_root, NULL, &unusable_file_ops)) 1371 extfrag_debug_root, NULL, &unusable_file_ops))
1367 return -ENOMEM; 1372 goto fail;
1368 1373
1369 if (!debugfs_create_file("extfrag_index", 0444, 1374 if (!debugfs_create_file("extfrag_index", 0444,
1370 extfrag_debug_root, NULL, &extfrag_file_ops)) 1375 extfrag_debug_root, NULL, &extfrag_file_ops))
1371 return -ENOMEM; 1376 goto fail;
1372 1377
1373 return 0; 1378 return 0;
1379fail:
1380 debugfs_remove_recursive(extfrag_debug_root);
1381 return -ENOMEM;
1374} 1382}
1375 1383
1376module_init(extfrag_debug_init); 1384module_init(extfrag_debug_init);
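
The error path above stops leaking a half-populated debugfs directory: any failure after debugfs_create_dir() now unwinds through debugfs_remove_recursive(), and the dentry becomes a local because nothing needs it afterwards. The same unwind shape in a standalone userspace program, with mkdir/fopen standing in for the debugfs calls (illustrative only):

#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
        FILE *unusable = NULL, *extfrag = NULL;

        if (mkdir("extfrag", 0755) && errno != EEXIST)
                return 1;                       /* debugfs_create_dir() failed */

        unusable = fopen("extfrag/unusable_index", "w");
        if (!unusable)
                goto fail;

        extfrag = fopen("extfrag/extfrag_index", "w");
        if (!extfrag)
                goto fail;

        fclose(unusable);
        fclose(extfrag);
        return 0;

fail:                                           /* debugfs_remove_recursive() */
        if (unusable)
                fclose(unusable);
        remove("extfrag/unusable_index");
        rmdir("extfrag");
        return 1;
}
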