Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |    5
-rw-r--r--  mm/Makefile           |    7
-rw-r--r--  mm/backing-dev.c      |   78
-rw-r--r--  mm/bootmem.c          |    4
-rw-r--r--  mm/bounce.c           |    8
-rw-r--r--  mm/compaction.c       |  167
-rw-r--r--  mm/fadvise.c          |   18
-rw-r--r--  mm/filemap.c          |   38
-rw-r--r--  mm/filemap_xip.c      |    6
-rw-r--r--  mm/frontswap.c        |  150
-rw-r--r--  mm/highmem.c          |   12
-rw-r--r--  mm/hugetlb.c          |  195
-rw-r--r--  mm/hugetlb_cgroup.c   |  418
-rw-r--r--  mm/hwpoison-inject.c  |    2
-rw-r--r--  mm/internal.h         |    9
-rw-r--r--  mm/memblock.c         |   35
-rw-r--r--  mm/memcontrol.c       |  390
-rw-r--r--  mm/memory-failure.c   |   35
-rw-r--r--  mm/memory.c           |   32
-rw-r--r--  mm/memory_hotplug.c   |   20
-rw-r--r--  mm/mempolicy.c        |   10
-rw-r--r--  mm/mempool.c          |   12
-rw-r--r--  mm/migrate.c          |   81
-rw-r--r--  mm/mmap.c             |   18
-rw-r--r--  mm/mmu_notifier.c     |   45
-rw-r--r--  mm/mmzone.c           |    2
-rw-r--r--  mm/mremap.c           |    2
-rw-r--r--  mm/oom_kill.c         |  223
-rw-r--r--  mm/page-writeback.c   |  108
-rw-r--r--  mm/page_alloc.c       |  336
-rw-r--r--  mm/page_cgroup.c      |    2
-rw-r--r--  mm/page_io.c          |  145
-rw-r--r--  mm/page_isolation.c   |   93
-rw-r--r--  mm/shmem.c            |    8
-rw-r--r--  mm/slab.c             |  623
-rw-r--r--  mm/slab.h             |   33
-rw-r--r--  mm/slab_common.c      |  120
-rw-r--r--  mm/slob.c             |  152
-rw-r--r--  mm/slub.c             |  464
-rw-r--r--  mm/sparse.c           |   29
-rw-r--r--  mm/swap.c             |   52
-rw-r--r--  mm/swap_state.c       |    7
-rw-r--r--  mm/swapfile.c         |  145
-rw-r--r--  mm/vmalloc.c          |   52
-rw-r--r--  mm/vmscan.c           |  185
-rw-r--r--  mm/vmstat.c           |    1
46 files changed, 2961 insertions(+), 1616 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 82fed4eb2b6..d5c8019c662 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK
140config NO_BOOTMEM 140config NO_BOOTMEM
141 boolean 141 boolean
142 142
143config MEMORY_ISOLATION
144 boolean
145
143# eventually, we can have this option just 'select SPARSEMEM' 146# eventually, we can have this option just 'select SPARSEMEM'
144config MEMORY_HOTPLUG 147config MEMORY_HOTPLUG
145 bool "Allow for memory hot-add" 148 bool "Allow for memory hot-add"
149 select MEMORY_ISOLATION
146 depends on SPARSEMEM || X86_64_ACPI_NUMA 150 depends on SPARSEMEM || X86_64_ACPI_NUMA
147 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG 151 depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
148 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) 152 depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
@@ -272,6 +276,7 @@ config MEMORY_FAILURE
272 depends on MMU 276 depends on MMU
273 depends on ARCH_SUPPORTS_MEMORY_FAILURE 277 depends on ARCH_SUPPORTS_MEMORY_FAILURE
274 bool "Enable recovery from hardware memory errors" 278 bool "Enable recovery from hardware memory errors"
279 select MEMORY_ISOLATION
275 help 280 help
276 Enables code to recover from some memory failures on systems 281 Enables code to recover from some memory failures on systems
277 with MCA recovery. This allows a system to continue running 282 with MCA recovery. This allows a system to continue running
diff --git a/mm/Makefile b/mm/Makefile
index 2e2fbbefb99..92753e2d82d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -15,8 +15,9 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
15 maccess.o page_alloc.o page-writeback.o \ 15 maccess.o page_alloc.o page-writeback.o \
16 readahead.o swap.o truncate.o vmscan.o shmem.o \ 16 readahead.o swap.o truncate.o vmscan.o shmem.o \
17 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 17 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
18 page_isolation.o mm_init.o mmu_context.o percpu.o \ 18 mm_init.o mmu_context.o percpu.o slab_common.o \
19 compaction.o $(mmu-y) 19 compaction.o $(mmu-y)
20
20obj-y += init-mm.o 21obj-y += init-mm.o
21 22
22ifdef CONFIG_NO_BOOTMEM 23ifdef CONFIG_NO_BOOTMEM
@@ -48,9 +49,11 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
48obj-$(CONFIG_MIGRATION) += migrate.o 49obj-$(CONFIG_MIGRATION) += migrate.o
49obj-$(CONFIG_QUICKLIST) += quicklist.o 50obj-$(CONFIG_QUICKLIST) += quicklist.o
50obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o 51obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
51obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 52obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o
53obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
52obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 54obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
53obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 55obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
54obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 56obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
55obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 57obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
56obj-$(CONFIG_CLEANCACHE) += cleancache.o 58obj-$(CONFIG_CLEANCACHE) += cleancache.o
59obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index dd8e2aafb07..b41823cc05e 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -39,12 +39,6 @@ DEFINE_SPINLOCK(bdi_lock);
39LIST_HEAD(bdi_list); 39LIST_HEAD(bdi_list);
40LIST_HEAD(bdi_pending_list); 40LIST_HEAD(bdi_pending_list);
41 41
42static struct task_struct *sync_supers_tsk;
43static struct timer_list sync_supers_timer;
44
45static int bdi_sync_supers(void *);
46static void sync_supers_timer_fn(unsigned long);
47
48void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) 42void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
49{ 43{
50 if (wb1 < wb2) { 44 if (wb1 < wb2) {
@@ -250,12 +244,6 @@ static int __init default_bdi_init(void)
250{ 244{
251 int err; 245 int err;
252 246
253 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
254 BUG_ON(IS_ERR(sync_supers_tsk));
255
256 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
257 bdi_arm_supers_timer();
258
259 err = bdi_init(&default_backing_dev_info); 247 err = bdi_init(&default_backing_dev_info);
260 if (!err) 248 if (!err)
261 bdi_register(&default_backing_dev_info, NULL, "default"); 249 bdi_register(&default_backing_dev_info, NULL, "default");
@@ -270,46 +258,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
270 return wb_has_dirty_io(&bdi->wb); 258 return wb_has_dirty_io(&bdi->wb);
271} 259}
272 260
273/*
274 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
275 * or we risk deadlocking on ->s_umount. The longer term solution would be
276 * to implement sync_supers_bdi() or similar and simply do it from the
277 * bdi writeback thread individually.
278 */
279static int bdi_sync_supers(void *unused)
280{
281 set_user_nice(current, 0);
282
283 while (!kthread_should_stop()) {
284 set_current_state(TASK_INTERRUPTIBLE);
285 schedule();
286
287 /*
288 * Do this periodically, like kupdated() did before.
289 */
290 sync_supers();
291 }
292
293 return 0;
294}
295
296void bdi_arm_supers_timer(void)
297{
298 unsigned long next;
299
300 if (!dirty_writeback_interval)
301 return;
302
303 next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
304 mod_timer(&sync_supers_timer, round_jiffies_up(next));
305}
306
307static void sync_supers_timer_fn(unsigned long unused)
308{
309 wake_up_process(sync_supers_tsk);
310 bdi_arm_supers_timer();
311}
312
313static void wakeup_timer_fn(unsigned long data) 261static void wakeup_timer_fn(unsigned long data)
314{ 262{
315 struct backing_dev_info *bdi = (struct backing_dev_info *)data; 263 struct backing_dev_info *bdi = (struct backing_dev_info *)data;
@@ -677,7 +625,7 @@ int bdi_init(struct backing_dev_info *bdi)
677 625
678 bdi->min_ratio = 0; 626 bdi->min_ratio = 0;
679 bdi->max_ratio = 100; 627 bdi->max_ratio = 100;
680 bdi->max_prop_frac = PROP_FRAC_BASE; 628 bdi->max_prop_frac = FPROP_FRAC_BASE;
681 spin_lock_init(&bdi->wb_lock); 629 spin_lock_init(&bdi->wb_lock);
682 INIT_LIST_HEAD(&bdi->bdi_list); 630 INIT_LIST_HEAD(&bdi->bdi_list);
683 INIT_LIST_HEAD(&bdi->work_list); 631 INIT_LIST_HEAD(&bdi->work_list);
@@ -700,7 +648,7 @@ int bdi_init(struct backing_dev_info *bdi)
700 bdi->write_bandwidth = INIT_BW; 648 bdi->write_bandwidth = INIT_BW;
701 bdi->avg_write_bandwidth = INIT_BW; 649 bdi->avg_write_bandwidth = INIT_BW;
702 650
703 err = prop_local_init_percpu(&bdi->completions); 651 err = fprop_local_init_percpu(&bdi->completions);
704 652
705 if (err) { 653 if (err) {
706err: 654err:
@@ -744,7 +692,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
744 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) 692 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
745 percpu_counter_destroy(&bdi->bdi_stat[i]); 693 percpu_counter_destroy(&bdi->bdi_stat[i]);
746 694
747 prop_local_destroy_percpu(&bdi->completions); 695 fprop_local_destroy_percpu(&bdi->completions);
748} 696}
749EXPORT_SYMBOL(bdi_destroy); 697EXPORT_SYMBOL(bdi_destroy);
750 698
@@ -886,3 +834,23 @@ out:
886 return ret; 834 return ret;
887} 835}
888EXPORT_SYMBOL(wait_iff_congested); 836EXPORT_SYMBOL(wait_iff_congested);
837
838int pdflush_proc_obsolete(struct ctl_table *table, int write,
839 void __user *buffer, size_t *lenp, loff_t *ppos)
840{
841 char kbuf[] = "0\n";
842
843 if (*ppos) {
844 *lenp = 0;
845 return 0;
846 }
847
848 if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
849 return -EFAULT;
850 printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n",
851 table->procname);
852
853 *lenp = 2;
854 *ppos += *lenp;
855 return 2;
856}
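
Note: the sync_supers kthread and timer are gone; pdflush_proc_obsolete() above is the stub handler left behind for writeback sysctls that no longer control anything. A minimal sketch of how a sysctl entry might be routed through it; the table and knob name shown here are illustrative, not part of this diff:

    static struct ctl_table obsolete_writeback_table[] = {
    	{
    		/* reads back "0\n" and warns once that the knob is
    		 * scheduled for removal, which is exactly what
    		 * pdflush_proc_obsolete() does */
    		.procname	= "nr_pdflush_threads",
    		.mode		= 0444,
    		.proc_handler	= pdflush_proc_obsolete,
    	},
    	{ }
    };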
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 73096630cb3..bcb63ac48cc 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -710,6 +710,10 @@ again:
710 if (ptr) 710 if (ptr)
711 return ptr; 711 return ptr;
712 712
713 /* do not panic in alloc_bootmem_bdata() */
714 if (limit && goal + size > limit)
715 limit = 0;
716
713 ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit); 717 ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
714 if (ptr) 718 if (ptr)
715 return ptr; 719 return ptr;
diff --git a/mm/bounce.c b/mm/bounce.c
index d1be02ca188..04208677556 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -24,23 +24,25 @@
24 24
25static mempool_t *page_pool, *isa_page_pool; 25static mempool_t *page_pool, *isa_page_pool;
26 26
27#ifdef CONFIG_HIGHMEM 27#if defined(CONFIG_HIGHMEM) || defined(CONFIG_NEED_BOUNCE_POOL)
28static __init int init_emergency_pool(void) 28static __init int init_emergency_pool(void)
29{ 29{
30#ifndef CONFIG_MEMORY_HOTPLUG 30#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
31 if (max_pfn <= max_low_pfn) 31 if (max_pfn <= max_low_pfn)
32 return 0; 32 return 0;
33#endif 33#endif
34 34
35 page_pool = mempool_create_page_pool(POOL_SIZE, 0); 35 page_pool = mempool_create_page_pool(POOL_SIZE, 0);
36 BUG_ON(!page_pool); 36 BUG_ON(!page_pool);
37 printk("highmem bounce pool size: %d pages\n", POOL_SIZE); 37 printk("bounce pool size: %d pages\n", POOL_SIZE);
38 38
39 return 0; 39 return 0;
40} 40}
41 41
42__initcall(init_emergency_pool); 42__initcall(init_emergency_pool);
43#endif
43 44
45#ifdef CONFIG_HIGHMEM
44/* 46/*
45 * highmem version, map in to vec 47 * highmem version, map in to vec
46 */ 48 */
diff --git a/mm/compaction.c b/mm/compaction.c
index 2f42d952853..7fcd3a52e68 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -51,6 +51,47 @@ static inline bool migrate_async_suitable(int migratetype)
51} 51}
52 52
53/* 53/*
54 * Compaction requires the taking of some coarse locks that are potentially
55 * very heavily contended. Check if the process needs to be scheduled or
56 * if the lock is contended. For async compaction, back out in the event
57 * if contention is severe. For sync compaction, schedule.
58 *
59 * Returns true if the lock is held.
60 * Returns false if the lock is released and compaction should abort
61 */
62static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags,
63 bool locked, struct compact_control *cc)
64{
65 if (need_resched() || spin_is_contended(lock)) {
66 if (locked) {
67 spin_unlock_irqrestore(lock, *flags);
68 locked = false;
69 }
70
71 /* async aborts if taking too long or contended */
72 if (!cc->sync) {
73 if (cc->contended)
74 *cc->contended = true;
75 return false;
76 }
77
78 cond_resched();
79 if (fatal_signal_pending(current))
80 return false;
81 }
82
83 if (!locked)
84 spin_lock_irqsave(lock, *flags);
85 return true;
86}
87
88static inline bool compact_trylock_irqsave(spinlock_t *lock,
89 unsigned long *flags, struct compact_control *cc)
90{
91 return compact_checklock_irqsave(lock, flags, false, cc);
92}
93
94/*
54 * Isolate free pages onto a private freelist. Caller must hold zone->lock. 95 * Isolate free pages onto a private freelist. Caller must hold zone->lock.
55 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free 96 * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
56 * pages inside of the pageblock (even though it may still end up isolating 97 * pages inside of the pageblock (even though it may still end up isolating
@@ -173,7 +214,7 @@ isolate_freepages_range(unsigned long start_pfn, unsigned long end_pfn)
173} 214}
174 215
175/* Update the number of anon and file isolated pages in the zone */ 216/* Update the number of anon and file isolated pages in the zone */
176static void acct_isolated(struct zone *zone, struct compact_control *cc) 217static void acct_isolated(struct zone *zone, bool locked, struct compact_control *cc)
177{ 218{
178 struct page *page; 219 struct page *page;
179 unsigned int count[2] = { 0, }; 220 unsigned int count[2] = { 0, };
@@ -181,8 +222,14 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
181 list_for_each_entry(page, &cc->migratepages, lru) 222 list_for_each_entry(page, &cc->migratepages, lru)
182 count[!!page_is_file_cache(page)]++; 223 count[!!page_is_file_cache(page)]++;
183 224
184 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); 225 /* If locked we can use the interrupt unsafe versions */
185 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); 226 if (locked) {
227 __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
228 __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
229 } else {
230 mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
231 mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
232 }
186} 233}
187 234
188/* Similar to reclaim, but different enough that they don't share logic */ 235/* Similar to reclaim, but different enough that they don't share logic */
@@ -228,6 +275,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
228 struct list_head *migratelist = &cc->migratepages; 275 struct list_head *migratelist = &cc->migratepages;
229 isolate_mode_t mode = 0; 276 isolate_mode_t mode = 0;
230 struct lruvec *lruvec; 277 struct lruvec *lruvec;
278 unsigned long flags;
279 bool locked;
231 280
232 /* 281 /*
233 * Ensure that there are not too many pages isolated from the LRU 282 * Ensure that there are not too many pages isolated from the LRU
@@ -247,25 +296,22 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
247 296
248 /* Time to isolate some pages for migration */ 297 /* Time to isolate some pages for migration */
249 cond_resched(); 298 cond_resched();
250 spin_lock_irq(&zone->lru_lock); 299 spin_lock_irqsave(&zone->lru_lock, flags);
300 locked = true;
251 for (; low_pfn < end_pfn; low_pfn++) { 301 for (; low_pfn < end_pfn; low_pfn++) {
252 struct page *page; 302 struct page *page;
253 bool locked = true;
254 303
255 /* give a chance to irqs before checking need_resched() */ 304 /* give a chance to irqs before checking need_resched() */
256 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) { 305 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
257 spin_unlock_irq(&zone->lru_lock); 306 spin_unlock_irqrestore(&zone->lru_lock, flags);
258 locked = false; 307 locked = false;
259 } 308 }
260 if (need_resched() || spin_is_contended(&zone->lru_lock)) { 309
261 if (locked) 310 /* Check if it is ok to still hold the lock */
262 spin_unlock_irq(&zone->lru_lock); 311 locked = compact_checklock_irqsave(&zone->lru_lock, &flags,
263 cond_resched(); 312 locked, cc);
264 spin_lock_irq(&zone->lru_lock); 313 if (!locked)
265 if (fatal_signal_pending(current)) 314 break;
266 break;
267 } else if (!locked)
268 spin_lock_irq(&zone->lru_lock);
269 315
270 /* 316 /*
271 * migrate_pfn does not necessarily start aligned to a 317 * migrate_pfn does not necessarily start aligned to a
@@ -349,9 +395,10 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc,
349 } 395 }
350 } 396 }
351 397
352 acct_isolated(zone, cc); 398 acct_isolated(zone, locked, cc);
353 399
354 spin_unlock_irq(&zone->lru_lock); 400 if (locked)
401 spin_unlock_irqrestore(&zone->lru_lock, flags);
355 402
356 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); 403 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
357 404
@@ -384,6 +431,20 @@ static bool suitable_migration_target(struct page *page)
384} 431}
385 432
386/* 433/*
434 * Returns the start pfn of the last page block in a zone. This is the starting
435 * point for full compaction of a zone. Compaction searches for free pages from
436 * the end of each zone, while isolate_freepages_block scans forward inside each
437 * page block.
438 */
439static unsigned long start_free_pfn(struct zone *zone)
440{
441 unsigned long free_pfn;
442 free_pfn = zone->zone_start_pfn + zone->spanned_pages;
443 free_pfn &= ~(pageblock_nr_pages-1);
444 return free_pfn;
445}
446
447/*
387 * Based on information in the current compact_control, find blocks 448 * Based on information in the current compact_control, find blocks
388 * suitable for isolating free pages from and then isolate them. 449 * suitable for isolating free pages from and then isolate them.
389 */ 450 */
@@ -447,7 +508,16 @@ static void isolate_freepages(struct zone *zone,
447 * are disabled 508 * are disabled
448 */ 509 */
449 isolated = 0; 510 isolated = 0;
450 spin_lock_irqsave(&zone->lock, flags); 511
512 /*
513 * The zone lock must be held to isolate freepages. This
514 * unfortunately this is a very coarse lock and can be
515 * heavily contended if there are parallel allocations
516 * or parallel compactions. For async compaction do not
517 * spin on the lock
518 */
519 if (!compact_trylock_irqsave(&zone->lock, &flags, cc))
520 break;
451 if (suitable_migration_target(page)) { 521 if (suitable_migration_target(page)) {
452 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); 522 end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn);
453 isolated = isolate_freepages_block(pfn, end_pfn, 523 isolated = isolate_freepages_block(pfn, end_pfn,
@@ -461,8 +531,19 @@ static void isolate_freepages(struct zone *zone,
461 * looking for free pages, the search will restart here as 531 * looking for free pages, the search will restart here as
462 * page migration may have returned some pages to the allocator 532 * page migration may have returned some pages to the allocator
463 */ 533 */
464 if (isolated) 534 if (isolated) {
465 high_pfn = max(high_pfn, pfn); 535 high_pfn = max(high_pfn, pfn);
536
537 /*
538 * If the free scanner has wrapped, update
539 * compact_cached_free_pfn to point to the highest
540 * pageblock with free pages. This reduces excessive
541 * scanning of full pageblocks near the end of the
542 * zone
543 */
544 if (cc->order > 0 && cc->wrapped)
545 zone->compact_cached_free_pfn = high_pfn;
546 }
466 } 547 }
467 548
468 /* split_free_page does not map the pages */ 549 /* split_free_page does not map the pages */
@@ -470,6 +551,11 @@ static void isolate_freepages(struct zone *zone,
470 551
471 cc->free_pfn = high_pfn; 552 cc->free_pfn = high_pfn;
472 cc->nr_freepages = nr_freepages; 553 cc->nr_freepages = nr_freepages;
554
555 /* If compact_cached_free_pfn is reset then set it now */
556 if (cc->order > 0 && !cc->wrapped &&
557 zone->compact_cached_free_pfn == start_free_pfn(zone))
558 zone->compact_cached_free_pfn = high_pfn;
473} 559}
474 560
475/* 561/*
@@ -565,8 +651,26 @@ static int compact_finished(struct zone *zone,
565 if (fatal_signal_pending(current)) 651 if (fatal_signal_pending(current))
566 return COMPACT_PARTIAL; 652 return COMPACT_PARTIAL;
567 653
568 /* Compaction run completes if the migrate and free scanner meet */ 654 /*
569 if (cc->free_pfn <= cc->migrate_pfn) 655 * A full (order == -1) compaction run starts at the beginning and
656 * end of a zone; it completes when the migrate and free scanner meet.
657 * A partial (order > 0) compaction can start with the free scanner
658 * at a random point in the zone, and may have to restart.
659 */
660 if (cc->free_pfn <= cc->migrate_pfn) {
661 if (cc->order > 0 && !cc->wrapped) {
662 /* We started partway through; restart at the end. */
663 unsigned long free_pfn = start_free_pfn(zone);
664 zone->compact_cached_free_pfn = free_pfn;
665 cc->free_pfn = free_pfn;
666 cc->wrapped = 1;
667 return COMPACT_CONTINUE;
668 }
669 return COMPACT_COMPLETE;
670 }
671
672 /* We wrapped around and ended up where we started. */
673 if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn)
570 return COMPACT_COMPLETE; 674 return COMPACT_COMPLETE;
571 675
572 /* 676 /*
@@ -664,8 +768,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
664 768
665 /* Setup to move all movable pages to the end of the zone */ 769 /* Setup to move all movable pages to the end of the zone */
666 cc->migrate_pfn = zone->zone_start_pfn; 770 cc->migrate_pfn = zone->zone_start_pfn;
667 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; 771
668 cc->free_pfn &= ~(pageblock_nr_pages-1); 772 if (cc->order > 0) {
773 /* Incremental compaction. Start where the last one stopped. */
774 cc->free_pfn = zone->compact_cached_free_pfn;
775 cc->start_free_pfn = cc->free_pfn;
776 } else {
777 /* Order == -1 starts at the end of the zone. */
778 cc->free_pfn = start_free_pfn(zone);
779 }
669 780
670 migrate_prep_local(); 781 migrate_prep_local();
671 782
@@ -718,7 +829,7 @@ out:
718 829
719static unsigned long compact_zone_order(struct zone *zone, 830static unsigned long compact_zone_order(struct zone *zone,
720 int order, gfp_t gfp_mask, 831 int order, gfp_t gfp_mask,
721 bool sync) 832 bool sync, bool *contended)
722{ 833{
723 struct compact_control cc = { 834 struct compact_control cc = {
724 .nr_freepages = 0, 835 .nr_freepages = 0,
@@ -727,6 +838,7 @@ static unsigned long compact_zone_order(struct zone *zone,
727 .migratetype = allocflags_to_migratetype(gfp_mask), 838 .migratetype = allocflags_to_migratetype(gfp_mask),
728 .zone = zone, 839 .zone = zone,
729 .sync = sync, 840 .sync = sync,
841 .contended = contended,
730 }; 842 };
731 INIT_LIST_HEAD(&cc.freepages); 843 INIT_LIST_HEAD(&cc.freepages);
732 INIT_LIST_HEAD(&cc.migratepages); 844 INIT_LIST_HEAD(&cc.migratepages);
@@ -748,7 +860,7 @@ int sysctl_extfrag_threshold = 500;
748 */ 860 */
749unsigned long try_to_compact_pages(struct zonelist *zonelist, 861unsigned long try_to_compact_pages(struct zonelist *zonelist,
750 int order, gfp_t gfp_mask, nodemask_t *nodemask, 862 int order, gfp_t gfp_mask, nodemask_t *nodemask,
751 bool sync) 863 bool sync, bool *contended)
752{ 864{
753 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 865 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
754 int may_enter_fs = gfp_mask & __GFP_FS; 866 int may_enter_fs = gfp_mask & __GFP_FS;
@@ -772,7 +884,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
772 nodemask) { 884 nodemask) {
773 int status; 885 int status;
774 886
775 status = compact_zone_order(zone, order, gfp_mask, sync); 887 status = compact_zone_order(zone, order, gfp_mask, sync,
888 contended);
776 rc = max(status, rc); 889 rc = max(status, rc);
777 890
778 /* If a normal allocation would succeed, stop compacting */ 891 /* If a normal allocation would succeed, stop compacting */
@@ -808,7 +921,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
808 if (cc->order > 0) { 921 if (cc->order > 0) {
809 int ok = zone_watermark_ok(zone, cc->order, 922 int ok = zone_watermark_ok(zone, cc->order,
810 low_wmark_pages(zone), 0, 0); 923 low_wmark_pages(zone), 0, 0);
811 if (ok && cc->order > zone->compact_order_failed) 924 if (ok && cc->order >= zone->compact_order_failed)
812 zone->compact_order_failed = cc->order + 1; 925 zone->compact_order_failed = cc->order + 1;
813 /* Currently async compaction is never deferred. */ 926 /* Currently async compaction is never deferred. */
814 else if (!ok && cc->sync) 927 else if (!ok && cc->sync)
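
Note: compact_zone_order() and try_to_compact_pages() now take a bool *contended out-parameter so async compaction can tell its caller that it backed out because zone->lru_lock or zone->lock was contended. A minimal caller sketch, assuming an allocator-side retry loop; the back-off policy shown is illustrative only:

    bool contended = false;
    unsigned long rc;

    /* async compaction (sync == false) aborts early on lock contention
     * and reports it through the new out-parameter */
    rc = try_to_compact_pages(zonelist, order, gfp_mask, nodemask,
    			      false, &contended);
    if (rc == COMPACT_SKIPPED || contended) {
    	/* illustrative: skip further async compaction for this
    	 * allocation attempt instead of spinning on the locks */
    }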
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 469491e0af7..9b75a045dbf 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -93,11 +93,6 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
93 spin_unlock(&file->f_lock); 93 spin_unlock(&file->f_lock);
94 break; 94 break;
95 case POSIX_FADV_WILLNEED: 95 case POSIX_FADV_WILLNEED:
96 if (!mapping->a_ops->readpage) {
97 ret = -EINVAL;
98 break;
99 }
100
101 /* First and last PARTIAL page! */ 96 /* First and last PARTIAL page! */
102 start_index = offset >> PAGE_CACHE_SHIFT; 97 start_index = offset >> PAGE_CACHE_SHIFT;
103 end_index = endbyte >> PAGE_CACHE_SHIFT; 98 end_index = endbyte >> PAGE_CACHE_SHIFT;
@@ -106,12 +101,13 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
106 nrpages = end_index - start_index + 1; 101 nrpages = end_index - start_index + 1;
107 if (!nrpages) 102 if (!nrpages)
108 nrpages = ~0UL; 103 nrpages = ~0UL;
109 104
110 ret = force_page_cache_readahead(mapping, file, 105 /*
111 start_index, 106 * Ignore return value because fadvise() shall return
112 nrpages); 107 * success even if filesystem can't retrieve a hint,
113 if (ret > 0) 108 */
114 ret = 0; 109 force_page_cache_readahead(mapping, file, start_index,
110 nrpages);
115 break; 111 break;
116 case POSIX_FADV_NOREUSE: 112 case POSIX_FADV_NOREUSE:
117 break; 113 break;
diff --git a/mm/filemap.c b/mm/filemap.c
index a4a5260b027..384344575c3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1412,12 +1412,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1412 retval = filemap_write_and_wait_range(mapping, pos, 1412 retval = filemap_write_and_wait_range(mapping, pos,
1413 pos + iov_length(iov, nr_segs) - 1); 1413 pos + iov_length(iov, nr_segs) - 1);
1414 if (!retval) { 1414 if (!retval) {
1415 struct blk_plug plug;
1416
1417 blk_start_plug(&plug);
1418 retval = mapping->a_ops->direct_IO(READ, iocb, 1415 retval = mapping->a_ops->direct_IO(READ, iocb,
1419 iov, pos, nr_segs); 1416 iov, pos, nr_segs);
1420 blk_finish_plug(&plug);
1421 } 1417 }
1422 if (retval > 0) { 1418 if (retval > 0) {
1423 *ppos = pos + retval; 1419 *ppos = pos + retval;
@@ -1712,8 +1708,35 @@ page_not_uptodate:
1712} 1708}
1713EXPORT_SYMBOL(filemap_fault); 1709EXPORT_SYMBOL(filemap_fault);
1714 1710
1711int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1712{
1713 struct page *page = vmf->page;
1714 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1715 int ret = VM_FAULT_LOCKED;
1716
1717 sb_start_pagefault(inode->i_sb);
1718 file_update_time(vma->vm_file);
1719 lock_page(page);
1720 if (page->mapping != inode->i_mapping) {
1721 unlock_page(page);
1722 ret = VM_FAULT_NOPAGE;
1723 goto out;
1724 }
1725 /*
1726 * We mark the page dirty already here so that when freeze is in
1727 * progress, we are guaranteed that writeback during freezing will
1728 * see the dirty page and writeprotect it again.
1729 */
1730 set_page_dirty(page);
1731out:
1732 sb_end_pagefault(inode->i_sb);
1733 return ret;
1734}
1735EXPORT_SYMBOL(filemap_page_mkwrite);
1736
1715const struct vm_operations_struct generic_file_vm_ops = { 1737const struct vm_operations_struct generic_file_vm_ops = {
1716 .fault = filemap_fault, 1738 .fault = filemap_fault,
1739 .page_mkwrite = filemap_page_mkwrite,
1717}; 1740};
1718 1741
1719/* This is used for a general mmap of a disk file */ 1742/* This is used for a general mmap of a disk file */
@@ -2407,8 +2430,6 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2407 count = ocount; 2430 count = ocount;
2408 pos = *ppos; 2431 pos = *ppos;
2409 2432
2410 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
2411
2412 /* We can write back this queue in page reclaim */ 2433 /* We can write back this queue in page reclaim */
2413 current->backing_dev_info = mapping->backing_dev_info; 2434 current->backing_dev_info = mapping->backing_dev_info;
2414 written = 0; 2435 written = 0;
@@ -2502,13 +2523,12 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2502{ 2523{
2503 struct file *file = iocb->ki_filp; 2524 struct file *file = iocb->ki_filp;
2504 struct inode *inode = file->f_mapping->host; 2525 struct inode *inode = file->f_mapping->host;
2505 struct blk_plug plug;
2506 ssize_t ret; 2526 ssize_t ret;
2507 2527
2508 BUG_ON(iocb->ki_pos != pos); 2528 BUG_ON(iocb->ki_pos != pos);
2509 2529
2530 sb_start_write(inode->i_sb);
2510 mutex_lock(&inode->i_mutex); 2531 mutex_lock(&inode->i_mutex);
2511 blk_start_plug(&plug);
2512 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 2532 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
2513 mutex_unlock(&inode->i_mutex); 2533 mutex_unlock(&inode->i_mutex);
2514 2534
@@ -2519,7 +2539,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2519 if (err < 0 && ret > 0) 2539 if (err < 0 && ret > 0)
2520 ret = err; 2540 ret = err;
2521 } 2541 }
2522 blk_finish_plug(&plug); 2542 sb_end_write(inode->i_sb);
2523 return ret; 2543 return ret;
2524} 2544}
2525EXPORT_SYMBOL(generic_file_aio_write); 2545EXPORT_SYMBOL(generic_file_aio_write);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 213ca1f5340..13e013b1270 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -304,6 +304,7 @@ out:
304 304
305static const struct vm_operations_struct xip_file_vm_ops = { 305static const struct vm_operations_struct xip_file_vm_ops = {
306 .fault = xip_file_fault, 306 .fault = xip_file_fault,
307 .page_mkwrite = filemap_page_mkwrite,
307}; 308};
308 309
309int xip_file_mmap(struct file * file, struct vm_area_struct * vma) 310int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -401,6 +402,8 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
401 loff_t pos; 402 loff_t pos;
402 ssize_t ret; 403 ssize_t ret;
403 404
405 sb_start_write(inode->i_sb);
406
404 mutex_lock(&inode->i_mutex); 407 mutex_lock(&inode->i_mutex);
405 408
406 if (!access_ok(VERIFY_READ, buf, len)) { 409 if (!access_ok(VERIFY_READ, buf, len)) {
@@ -411,8 +414,6 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
411 pos = *ppos; 414 pos = *ppos;
412 count = len; 415 count = len;
413 416
414 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
415
416 /* We can write back this queue in page reclaim */ 417 /* We can write back this queue in page reclaim */
417 current->backing_dev_info = mapping->backing_dev_info; 418 current->backing_dev_info = mapping->backing_dev_info;
418 419
@@ -436,6 +437,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
436 current->backing_dev_info = NULL; 437 current->backing_dev_info = NULL;
437 out_up: 438 out_up:
438 mutex_unlock(&inode->i_mutex); 439 mutex_unlock(&inode->i_mutex);
440 sb_end_write(inode->i_sb);
439 return ret; 441 return ret;
440} 442}
441EXPORT_SYMBOL_GPL(xip_file_write); 443EXPORT_SYMBOL_GPL(xip_file_write);
diff --git a/mm/frontswap.c b/mm/frontswap.c
index e25025574a0..6b3e71a2cd4 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -11,15 +11,11 @@
11 * This work is licensed under the terms of the GNU GPL, version 2. 11 * This work is licensed under the terms of the GNU GPL, version 2.
12 */ 12 */
13 13
14#include <linux/mm.h>
15#include <linux/mman.h> 14#include <linux/mman.h>
16#include <linux/swap.h> 15#include <linux/swap.h>
17#include <linux/swapops.h> 16#include <linux/swapops.h>
18#include <linux/proc_fs.h>
19#include <linux/security.h> 17#include <linux/security.h>
20#include <linux/capability.h>
21#include <linux/module.h> 18#include <linux/module.h>
22#include <linux/uaccess.h>
23#include <linux/debugfs.h> 19#include <linux/debugfs.h>
24#include <linux/frontswap.h> 20#include <linux/frontswap.h>
25#include <linux/swapfile.h> 21#include <linux/swapfile.h>
@@ -110,16 +106,21 @@ void __frontswap_init(unsigned type)
110 BUG_ON(sis == NULL); 106 BUG_ON(sis == NULL);
111 if (sis->frontswap_map == NULL) 107 if (sis->frontswap_map == NULL)
112 return; 108 return;
113 if (frontswap_enabled) 109 frontswap_ops.init(type);
114 (*frontswap_ops.init)(type);
115} 110}
116EXPORT_SYMBOL(__frontswap_init); 111EXPORT_SYMBOL(__frontswap_init);
117 112
113static inline void __frontswap_clear(struct swap_info_struct *sis, pgoff_t offset)
114{
115 frontswap_clear(sis, offset);
116 atomic_dec(&sis->frontswap_pages);
117}
118
118/* 119/*
119 * "Store" data from a page to frontswap and associate it with the page's 120 * "Store" data from a page to frontswap and associate it with the page's
120 * swaptype and offset. Page must be locked and in the swap cache. 121 * swaptype and offset. Page must be locked and in the swap cache.
121 * If frontswap already contains a page with matching swaptype and 122 * If frontswap already contains a page with matching swaptype and
122 * offset, the frontswap implmentation may either overwrite the data and 123 * offset, the frontswap implementation may either overwrite the data and
123 * return success or invalidate the page from frontswap and return failure. 124 * return success or invalidate the page from frontswap and return failure.
124 */ 125 */
125int __frontswap_store(struct page *page) 126int __frontswap_store(struct page *page)
@@ -134,22 +135,21 @@ int __frontswap_store(struct page *page)
134 BUG_ON(sis == NULL); 135 BUG_ON(sis == NULL);
135 if (frontswap_test(sis, offset)) 136 if (frontswap_test(sis, offset))
136 dup = 1; 137 dup = 1;
137 ret = (*frontswap_ops.store)(type, offset, page); 138 ret = frontswap_ops.store(type, offset, page);
138 if (ret == 0) { 139 if (ret == 0) {
139 frontswap_set(sis, offset); 140 frontswap_set(sis, offset);
140 inc_frontswap_succ_stores(); 141 inc_frontswap_succ_stores();
141 if (!dup) 142 if (!dup)
142 atomic_inc(&sis->frontswap_pages); 143 atomic_inc(&sis->frontswap_pages);
143 } else if (dup) { 144 } else {
144 /* 145 /*
145 failed dup always results in automatic invalidate of 146 failed dup always results in automatic invalidate of
146 the (older) page from frontswap 147 the (older) page from frontswap
147 */ 148 */
148 frontswap_clear(sis, offset);
149 atomic_dec(&sis->frontswap_pages);
150 inc_frontswap_failed_stores();
151 } else
152 inc_frontswap_failed_stores(); 149 inc_frontswap_failed_stores();
150 if (dup)
151 __frontswap_clear(sis, offset);
152 }
153 if (frontswap_writethrough_enabled) 153 if (frontswap_writethrough_enabled)
154 /* report failure so swap also writes to swap device */ 154 /* report failure so swap also writes to swap device */
155 ret = -1; 155 ret = -1;
@@ -173,7 +173,7 @@ int __frontswap_load(struct page *page)
173 BUG_ON(!PageLocked(page)); 173 BUG_ON(!PageLocked(page));
174 BUG_ON(sis == NULL); 174 BUG_ON(sis == NULL);
175 if (frontswap_test(sis, offset)) 175 if (frontswap_test(sis, offset))
176 ret = (*frontswap_ops.load)(type, offset, page); 176 ret = frontswap_ops.load(type, offset, page);
177 if (ret == 0) 177 if (ret == 0)
178 inc_frontswap_loads(); 178 inc_frontswap_loads();
179 return ret; 179 return ret;
@@ -190,9 +190,8 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
190 190
191 BUG_ON(sis == NULL); 191 BUG_ON(sis == NULL);
192 if (frontswap_test(sis, offset)) { 192 if (frontswap_test(sis, offset)) {
193 (*frontswap_ops.invalidate_page)(type, offset); 193 frontswap_ops.invalidate_page(type, offset);
194 atomic_dec(&sis->frontswap_pages); 194 __frontswap_clear(sis, offset);
195 frontswap_clear(sis, offset);
196 inc_frontswap_invalidates(); 195 inc_frontswap_invalidates();
197 } 196 }
198} 197}
@@ -209,67 +208,102 @@ void __frontswap_invalidate_area(unsigned type)
209 BUG_ON(sis == NULL); 208 BUG_ON(sis == NULL);
210 if (sis->frontswap_map == NULL) 209 if (sis->frontswap_map == NULL)
211 return; 210 return;
212 (*frontswap_ops.invalidate_area)(type); 211 frontswap_ops.invalidate_area(type);
213 atomic_set(&sis->frontswap_pages, 0); 212 atomic_set(&sis->frontswap_pages, 0);
214 memset(sis->frontswap_map, 0, sis->max / sizeof(long)); 213 memset(sis->frontswap_map, 0, sis->max / sizeof(long));
215} 214}
216EXPORT_SYMBOL(__frontswap_invalidate_area); 215EXPORT_SYMBOL(__frontswap_invalidate_area);
217 216
218/* 217static unsigned long __frontswap_curr_pages(void)
219 * Frontswap, like a true swap device, may unnecessarily retain pages
220 * under certain circumstances; "shrink" frontswap is essentially a
221 * "partial swapoff" and works by calling try_to_unuse to attempt to
222 * unuse enough frontswap pages to attempt to -- subject to memory
223 * constraints -- reduce the number of pages in frontswap to the
224 * number given in the parameter target_pages.
225 */
226void frontswap_shrink(unsigned long target_pages)
227{ 218{
228 struct swap_info_struct *si = NULL;
229 int si_frontswap_pages;
230 unsigned long total_pages = 0, total_pages_to_unuse;
231 unsigned long pages = 0, pages_to_unuse = 0;
232 int type; 219 int type;
233 bool locked = false; 220 unsigned long totalpages = 0;
221 struct swap_info_struct *si = NULL;
234 222
235 /* 223 assert_spin_locked(&swap_lock);
236 * we don't want to hold swap_lock while doing a very
237 * lengthy try_to_unuse, but swap_list may change
238 * so restart scan from swap_list.head each time
239 */
240 spin_lock(&swap_lock);
241 locked = true;
242 total_pages = 0;
243 for (type = swap_list.head; type >= 0; type = si->next) { 224 for (type = swap_list.head; type >= 0; type = si->next) {
244 si = swap_info[type]; 225 si = swap_info[type];
245 total_pages += atomic_read(&si->frontswap_pages); 226 totalpages += atomic_read(&si->frontswap_pages);
246 } 227 }
247 if (total_pages <= target_pages) 228 return totalpages;
248 goto out; 229}
249 total_pages_to_unuse = total_pages - target_pages; 230
231static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
232 int *swapid)
233{
234 int ret = -EINVAL;
235 struct swap_info_struct *si = NULL;
236 int si_frontswap_pages;
237 unsigned long total_pages_to_unuse = total;
238 unsigned long pages = 0, pages_to_unuse = 0;
239 int type;
240
241 assert_spin_locked(&swap_lock);
250 for (type = swap_list.head; type >= 0; type = si->next) { 242 for (type = swap_list.head; type >= 0; type = si->next) {
251 si = swap_info[type]; 243 si = swap_info[type];
252 si_frontswap_pages = atomic_read(&si->frontswap_pages); 244 si_frontswap_pages = atomic_read(&si->frontswap_pages);
253 if (total_pages_to_unuse < si_frontswap_pages) 245 if (total_pages_to_unuse < si_frontswap_pages) {
254 pages = pages_to_unuse = total_pages_to_unuse; 246 pages = pages_to_unuse = total_pages_to_unuse;
255 else { 247 } else {
256 pages = si_frontswap_pages; 248 pages = si_frontswap_pages;
257 pages_to_unuse = 0; /* unuse all */ 249 pages_to_unuse = 0; /* unuse all */
258 } 250 }
259 /* ensure there is enough RAM to fetch pages from frontswap */ 251 /* ensure there is enough RAM to fetch pages from frontswap */
260 if (security_vm_enough_memory_mm(current->mm, pages)) 252 if (security_vm_enough_memory_mm(current->mm, pages)) {
253 ret = -ENOMEM;
261 continue; 254 continue;
255 }
262 vm_unacct_memory(pages); 256 vm_unacct_memory(pages);
257 *unused = pages_to_unuse;
258 *swapid = type;
259 ret = 0;
263 break; 260 break;
264 } 261 }
265 if (type < 0) 262
266 goto out; 263 return ret;
267 locked = false; 264}
265
266static int __frontswap_shrink(unsigned long target_pages,
267 unsigned long *pages_to_unuse,
268 int *type)
269{
270 unsigned long total_pages = 0, total_pages_to_unuse;
271
272 assert_spin_locked(&swap_lock);
273
274 total_pages = __frontswap_curr_pages();
275 if (total_pages <= target_pages) {
276 /* Nothing to do */
277 *pages_to_unuse = 0;
278 return 0;
279 }
280 total_pages_to_unuse = total_pages - target_pages;
281 return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
282}
283
284/*
285 * Frontswap, like a true swap device, may unnecessarily retain pages
286 * under certain circumstances; "shrink" frontswap is essentially a
287 * "partial swapoff" and works by calling try_to_unuse to attempt to
288 * unuse enough frontswap pages to attempt to -- subject to memory
289 * constraints -- reduce the number of pages in frontswap to the
290 * number given in the parameter target_pages.
291 */
292void frontswap_shrink(unsigned long target_pages)
293{
294 unsigned long pages_to_unuse = 0;
295 int type, ret;
296
297 /*
298 * we don't want to hold swap_lock while doing a very
299 * lengthy try_to_unuse, but swap_list may change
300 * so restart scan from swap_list.head each time
301 */
302 spin_lock(&swap_lock);
303 ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
268 spin_unlock(&swap_lock); 304 spin_unlock(&swap_lock);
269 try_to_unuse(type, true, pages_to_unuse); 305 if (ret == 0 && pages_to_unuse)
270out: 306 try_to_unuse(type, true, pages_to_unuse);
271 if (locked)
272 spin_unlock(&swap_lock);
273 return; 307 return;
274} 308}
275EXPORT_SYMBOL(frontswap_shrink); 309EXPORT_SYMBOL(frontswap_shrink);
@@ -281,16 +315,12 @@ EXPORT_SYMBOL(frontswap_shrink);
281 */ 315 */
282unsigned long frontswap_curr_pages(void) 316unsigned long frontswap_curr_pages(void)
283{ 317{
284 int type;
285 unsigned long totalpages = 0; 318 unsigned long totalpages = 0;
286 struct swap_info_struct *si = NULL;
287 319
288 spin_lock(&swap_lock); 320 spin_lock(&swap_lock);
289 for (type = swap_list.head; type >= 0; type = si->next) { 321 totalpages = __frontswap_curr_pages();
290 si = swap_info[type];
291 totalpages += atomic_read(&si->frontswap_pages);
292 }
293 spin_unlock(&swap_lock); 322 spin_unlock(&swap_lock);
323
294 return totalpages; 324 return totalpages;
295} 325}
296EXPORT_SYMBOL(frontswap_curr_pages); 326EXPORT_SYMBOL(frontswap_curr_pages);
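
Note: frontswap_shrink() keeps its external behaviour but now only holds swap_lock for the bookkeeping in __frontswap_shrink(), dropping it before the long try_to_unuse(). A hedged sketch of how a frontswap backend might drive it together with frontswap_curr_pages(); the watermark policy and variable are invented for illustration:

    unsigned long cur = frontswap_curr_pages();

    /* illustrative policy: under backend memory pressure, ask frontswap
     * to shed half of the pages it currently holds */
    if (cur > backend_high_watermark)
    	frontswap_shrink(cur / 2);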
diff --git a/mm/highmem.c b/mm/highmem.c
index 57d82c6250c..d517cd16a6e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -94,6 +94,18 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
94 do { spin_unlock(&kmap_lock); (void)(flags); } while (0) 94 do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
95#endif 95#endif
96 96
97struct page *kmap_to_page(void *vaddr)
98{
99 unsigned long addr = (unsigned long)vaddr;
100
101 if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) {
102 int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT;
103 return pte_page(pkmap_page_table[i]);
104 }
105
106 return virt_to_page(addr);
107}
108
97static void flush_all_zero_pkmaps(void) 109static void flush_all_zero_pkmaps(void)
98{ 110{
99 int i; 111 int i;
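
Note: kmap_to_page() maps a kmap()'d virtual address back to its struct page, walking pkmap_page_table for PKMAP addresses and falling back to virt_to_page() otherwise. A minimal usage sketch, assuming some highmem struct page *page is already in hand:

    void *vaddr = kmap(page);

    /* the reverse lookup must hand back the page we just mapped */
    BUG_ON(kmap_to_page(vaddr) != page);
    kunmap(page);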
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e198831276a..bc727122dd4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,17 +24,20 @@
24 24
25#include <asm/page.h> 25#include <asm/page.h>
26#include <asm/pgtable.h> 26#include <asm/pgtable.h>
27#include <linux/io.h> 27#include <asm/tlb.h>
28 28
29#include <linux/io.h>
29#include <linux/hugetlb.h> 30#include <linux/hugetlb.h>
31#include <linux/hugetlb_cgroup.h>
30#include <linux/node.h> 32#include <linux/node.h>
33#include <linux/hugetlb_cgroup.h>
31#include "internal.h" 34#include "internal.h"
32 35
33const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 36const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
34static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 37static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
35unsigned long hugepages_treat_as_movable; 38unsigned long hugepages_treat_as_movable;
36 39
37static int max_hstate; 40int hugetlb_max_hstate __read_mostly;
38unsigned int default_hstate_idx; 41unsigned int default_hstate_idx;
39struct hstate hstates[HUGE_MAX_HSTATE]; 42struct hstate hstates[HUGE_MAX_HSTATE];
40 43
@@ -45,13 +48,10 @@ static struct hstate * __initdata parsed_hstate;
45static unsigned long __initdata default_hstate_max_huge_pages; 48static unsigned long __initdata default_hstate_max_huge_pages;
46static unsigned long __initdata default_hstate_size; 49static unsigned long __initdata default_hstate_size;
47 50
48#define for_each_hstate(h) \
49 for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
50
51/* 51/*
52 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 52 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
53 */ 53 */
54static DEFINE_SPINLOCK(hugetlb_lock); 54DEFINE_SPINLOCK(hugetlb_lock);
55 55
56static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) 56static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
57{ 57{
@@ -509,7 +509,7 @@ void copy_huge_page(struct page *dst, struct page *src)
509static void enqueue_huge_page(struct hstate *h, struct page *page) 509static void enqueue_huge_page(struct hstate *h, struct page *page)
510{ 510{
511 int nid = page_to_nid(page); 511 int nid = page_to_nid(page);
512 list_add(&page->lru, &h->hugepage_freelists[nid]); 512 list_move(&page->lru, &h->hugepage_freelists[nid]);
513 h->free_huge_pages++; 513 h->free_huge_pages++;
514 h->free_huge_pages_node[nid]++; 514 h->free_huge_pages_node[nid]++;
515} 515}
@@ -521,7 +521,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
521 if (list_empty(&h->hugepage_freelists[nid])) 521 if (list_empty(&h->hugepage_freelists[nid]))
522 return NULL; 522 return NULL;
523 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); 523 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
524 list_del(&page->lru); 524 list_move(&page->lru, &h->hugepage_activelist);
525 set_page_refcounted(page); 525 set_page_refcounted(page);
526 h->free_huge_pages--; 526 h->free_huge_pages--;
527 h->free_huge_pages_node[nid]--; 527 h->free_huge_pages_node[nid]--;
@@ -593,6 +593,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
593 1 << PG_active | 1 << PG_reserved | 593 1 << PG_active | 1 << PG_reserved |
594 1 << PG_private | 1 << PG_writeback); 594 1 << PG_private | 1 << PG_writeback);
595 } 595 }
596 VM_BUG_ON(hugetlb_cgroup_from_page(page));
596 set_compound_page_dtor(page, NULL); 597 set_compound_page_dtor(page, NULL);
597 set_page_refcounted(page); 598 set_page_refcounted(page);
598 arch_release_hugepage(page); 599 arch_release_hugepage(page);
@@ -625,10 +626,13 @@ static void free_huge_page(struct page *page)
625 page->mapping = NULL; 626 page->mapping = NULL;
626 BUG_ON(page_count(page)); 627 BUG_ON(page_count(page));
627 BUG_ON(page_mapcount(page)); 628 BUG_ON(page_mapcount(page));
628 INIT_LIST_HEAD(&page->lru);
629 629
630 spin_lock(&hugetlb_lock); 630 spin_lock(&hugetlb_lock);
631 hugetlb_cgroup_uncharge_page(hstate_index(h),
632 pages_per_huge_page(h), page);
631 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { 633 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
634 /* remove the page from active list */
635 list_del(&page->lru);
632 update_and_free_page(h, page); 636 update_and_free_page(h, page);
633 h->surplus_huge_pages--; 637 h->surplus_huge_pages--;
634 h->surplus_huge_pages_node[nid]--; 638 h->surplus_huge_pages_node[nid]--;
@@ -641,8 +645,10 @@ static void free_huge_page(struct page *page)
641 645
642static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) 646static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
643{ 647{
648 INIT_LIST_HEAD(&page->lru);
644 set_compound_page_dtor(page, free_huge_page); 649 set_compound_page_dtor(page, free_huge_page);
645 spin_lock(&hugetlb_lock); 650 spin_lock(&hugetlb_lock);
651 set_hugetlb_cgroup(page, NULL);
646 h->nr_huge_pages++; 652 h->nr_huge_pages++;
647 h->nr_huge_pages_node[nid]++; 653 h->nr_huge_pages_node[nid]++;
648 spin_unlock(&hugetlb_lock); 654 spin_unlock(&hugetlb_lock);
@@ -889,8 +895,10 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
889 895
890 spin_lock(&hugetlb_lock); 896 spin_lock(&hugetlb_lock);
891 if (page) { 897 if (page) {
898 INIT_LIST_HEAD(&page->lru);
892 r_nid = page_to_nid(page); 899 r_nid = page_to_nid(page);
893 set_compound_page_dtor(page, free_huge_page); 900 set_compound_page_dtor(page, free_huge_page);
901 set_hugetlb_cgroup(page, NULL);
894 /* 902 /*
895 * We incremented the global counters already 903 * We incremented the global counters already
896 */ 904 */
@@ -993,7 +1001,6 @@ retry:
993 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1001 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
994 if ((--needed) < 0) 1002 if ((--needed) < 0)
995 break; 1003 break;
996 list_del(&page->lru);
997 /* 1004 /*
998 * This page is now managed by the hugetlb allocator and has 1005 * This page is now managed by the hugetlb allocator and has
999 * no users -- drop the buddy allocator's reference. 1006 * no users -- drop the buddy allocator's reference.
@@ -1008,7 +1015,6 @@ free:
1008 /* Free unnecessary surplus pages to the buddy allocator */ 1015 /* Free unnecessary surplus pages to the buddy allocator */
1009 if (!list_empty(&surplus_list)) { 1016 if (!list_empty(&surplus_list)) {
1010 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 1017 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
1011 list_del(&page->lru);
1012 put_page(page); 1018 put_page(page);
1013 } 1019 }
1014 } 1020 }
@@ -1112,7 +1118,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1112 struct hstate *h = hstate_vma(vma); 1118 struct hstate *h = hstate_vma(vma);
1113 struct page *page; 1119 struct page *page;
1114 long chg; 1120 long chg;
1121 int ret, idx;
1122 struct hugetlb_cgroup *h_cg;
1115 1123
1124 idx = hstate_index(h);
1116 /* 1125 /*
1117 * Processes that did not create the mapping will have no 1126 * Processes that did not create the mapping will have no
1118 * reserves and will not have accounted against subpool 1127 * reserves and will not have accounted against subpool
@@ -1123,27 +1132,43 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1123 */ 1132 */
1124 chg = vma_needs_reservation(h, vma, addr); 1133 chg = vma_needs_reservation(h, vma, addr);
1125 if (chg < 0) 1134 if (chg < 0)
1126 return ERR_PTR(-VM_FAULT_OOM); 1135 return ERR_PTR(-ENOMEM);
1127 if (chg) 1136 if (chg)
1128 if (hugepage_subpool_get_pages(spool, chg)) 1137 if (hugepage_subpool_get_pages(spool, chg))
1129 return ERR_PTR(-VM_FAULT_SIGBUS); 1138 return ERR_PTR(-ENOSPC);
1130 1139
1140 ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
1141 if (ret) {
1142 hugepage_subpool_put_pages(spool, chg);
1143 return ERR_PTR(-ENOSPC);
1144 }
1131 spin_lock(&hugetlb_lock); 1145 spin_lock(&hugetlb_lock);
1132 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); 1146 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
1133 spin_unlock(&hugetlb_lock); 1147 if (page) {
1134 1148 /* update page cgroup details */
1135 if (!page) { 1149 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
1150 h_cg, page);
1151 spin_unlock(&hugetlb_lock);
1152 } else {
1153 spin_unlock(&hugetlb_lock);
1136 page = alloc_buddy_huge_page(h, NUMA_NO_NODE); 1154 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1137 if (!page) { 1155 if (!page) {
1156 hugetlb_cgroup_uncharge_cgroup(idx,
1157 pages_per_huge_page(h),
1158 h_cg);
1138 hugepage_subpool_put_pages(spool, chg); 1159 hugepage_subpool_put_pages(spool, chg);
1139 return ERR_PTR(-VM_FAULT_SIGBUS); 1160 return ERR_PTR(-ENOSPC);
1140 } 1161 }
1162 spin_lock(&hugetlb_lock);
1163 hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h),
1164 h_cg, page);
1165 list_move(&page->lru, &h->hugepage_activelist);
1166 spin_unlock(&hugetlb_lock);
1141 } 1167 }
1142 1168
1143 set_page_private(page, (unsigned long)spool); 1169 set_page_private(page, (unsigned long)spool);
1144 1170
1145 vma_commit_reservation(h, vma, addr); 1171 vma_commit_reservation(h, vma, addr);
1146
1147 return page; 1172 return page;
1148} 1173}
1149 1174
@@ -1646,7 +1671,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
1646 struct attribute_group *hstate_attr_group) 1671 struct attribute_group *hstate_attr_group)
1647{ 1672{
1648 int retval; 1673 int retval;
1649 int hi = h - hstates; 1674 int hi = hstate_index(h);
1650 1675
1651 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); 1676 hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
1652 if (!hstate_kobjs[hi]) 1677 if (!hstate_kobjs[hi])
@@ -1741,11 +1766,13 @@ void hugetlb_unregister_node(struct node *node)
1741 if (!nhs->hugepages_kobj) 1766 if (!nhs->hugepages_kobj)
1742 return; /* no hstate attributes */ 1767 return; /* no hstate attributes */
1743 1768
1744 for_each_hstate(h) 1769 for_each_hstate(h) {
1745 if (nhs->hstate_kobjs[h - hstates]) { 1770 int idx = hstate_index(h);
1746 kobject_put(nhs->hstate_kobjs[h - hstates]); 1771 if (nhs->hstate_kobjs[idx]) {
1747 nhs->hstate_kobjs[h - hstates] = NULL; 1772 kobject_put(nhs->hstate_kobjs[idx]);
1773 nhs->hstate_kobjs[idx] = NULL;
1748 } 1774 }
1775 }
1749 1776
1750 kobject_put(nhs->hugepages_kobj); 1777 kobject_put(nhs->hugepages_kobj);
1751 nhs->hugepages_kobj = NULL; 1778 nhs->hugepages_kobj = NULL;
@@ -1848,7 +1875,7 @@ static void __exit hugetlb_exit(void)
1848 hugetlb_unregister_all_nodes(); 1875 hugetlb_unregister_all_nodes();
1849 1876
1850 for_each_hstate(h) { 1877 for_each_hstate(h) {
1851 kobject_put(hstate_kobjs[h - hstates]); 1878 kobject_put(hstate_kobjs[hstate_index(h)]);
1852 } 1879 }
1853 1880
1854 kobject_put(hugepages_kobj); 1881 kobject_put(hugepages_kobj);
@@ -1869,7 +1896,7 @@ static int __init hugetlb_init(void)
1869 if (!size_to_hstate(default_hstate_size)) 1896 if (!size_to_hstate(default_hstate_size))
1870 hugetlb_add_hstate(HUGETLB_PAGE_ORDER); 1897 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1871 } 1898 }
1872 default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; 1899 default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
1873 if (default_hstate_max_huge_pages) 1900 if (default_hstate_max_huge_pages)
1874 default_hstate.max_huge_pages = default_hstate_max_huge_pages; 1901 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1875 1902
@@ -1897,19 +1924,27 @@ void __init hugetlb_add_hstate(unsigned order)
1897 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); 1924 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
1898 return; 1925 return;
1899 } 1926 }
1900 BUG_ON(max_hstate >= HUGE_MAX_HSTATE); 1927 BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
1901 BUG_ON(order == 0); 1928 BUG_ON(order == 0);
1902 h = &hstates[max_hstate++]; 1929 h = &hstates[hugetlb_max_hstate++];
1903 h->order = order; 1930 h->order = order;
1904 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); 1931 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
1905 h->nr_huge_pages = 0; 1932 h->nr_huge_pages = 0;
1906 h->free_huge_pages = 0; 1933 h->free_huge_pages = 0;
1907 for (i = 0; i < MAX_NUMNODES; ++i) 1934 for (i = 0; i < MAX_NUMNODES; ++i)
1908 INIT_LIST_HEAD(&h->hugepage_freelists[i]); 1935 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1936 INIT_LIST_HEAD(&h->hugepage_activelist);
1909 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); 1937 h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
1910 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); 1938 h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
1911 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", 1939 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1912 huge_page_size(h)/1024); 1940 huge_page_size(h)/1024);
1941 /*
1942 * Add cgroup control files only if the huge page consists
1943 * of more than two normal pages. This is because we use
1944 * page[2].lru.next for storing cgoup details.
1945 */
1946 if (order >= HUGETLB_CGROUP_MIN_ORDER)
1947 hugetlb_cgroup_file_init(hugetlb_max_hstate - 1);
1913 1948
1914 parsed_hstate = h; 1949 parsed_hstate = h;
1915} 1950}
@@ -1920,10 +1955,10 @@ static int __init hugetlb_nrpages_setup(char *s)
1920 static unsigned long *last_mhp; 1955 static unsigned long *last_mhp;
1921 1956
1922 /* 1957 /*
1923 * !max_hstate means we haven't parsed a hugepagesz= parameter yet, 1958 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
1924 * so this hugepages= parameter goes to the "default hstate". 1959 * so this hugepages= parameter goes to the "default hstate".
1925 */ 1960 */
1926 if (!max_hstate) 1961 if (!hugetlb_max_hstate)
1927 mhp = &default_hstate_max_huge_pages; 1962 mhp = &default_hstate_max_huge_pages;
1928 else 1963 else
1929 mhp = &parsed_hstate->max_huge_pages; 1964 mhp = &parsed_hstate->max_huge_pages;
@@ -1942,7 +1977,7 @@ static int __init hugetlb_nrpages_setup(char *s)
1942 * But we need to allocate >= MAX_ORDER hstates here early to still 1977 * But we need to allocate >= MAX_ORDER hstates here early to still
1943 * use the bootmem allocator. 1978 * use the bootmem allocator.
1944 */ 1979 */
1945 if (max_hstate && parsed_hstate->order >= MAX_ORDER) 1980 if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
1946 hugetlb_hstate_alloc_pages(parsed_hstate); 1981 hugetlb_hstate_alloc_pages(parsed_hstate);
1947 1982
1948 last_mhp = mhp; 1983 last_mhp = mhp;
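Both parsers now key off hugetlb_max_hstate instead of the old max_hstate name, so the ordering rule on the kernel command line is unchanged: each hugepages= count applies to the hugepagesz= that precedes it, and a hugepages= seen before any hugepagesz= sizes the default hstate. For example (illustrative values only):

    hugepagesz=1G hugepages=2 hugepagesz=2M hugepages=512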
@@ -2308,30 +2343,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2308 return 0; 2343 return 0;
2309} 2344}
2310 2345
2311void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2346void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
2312 unsigned long end, struct page *ref_page) 2347 unsigned long start, unsigned long end,
2348 struct page *ref_page)
2313{ 2349{
2350 int force_flush = 0;
2314 struct mm_struct *mm = vma->vm_mm; 2351 struct mm_struct *mm = vma->vm_mm;
2315 unsigned long address; 2352 unsigned long address;
2316 pte_t *ptep; 2353 pte_t *ptep;
2317 pte_t pte; 2354 pte_t pte;
2318 struct page *page; 2355 struct page *page;
2319 struct page *tmp;
2320 struct hstate *h = hstate_vma(vma); 2356 struct hstate *h = hstate_vma(vma);
2321 unsigned long sz = huge_page_size(h); 2357 unsigned long sz = huge_page_size(h);
2322 2358
2323 /*
2324 * A page gathering list, protected by per file i_mmap_mutex. The
2325 * lock is used to avoid list corruption from multiple unmapping
2326 * of the same page since we are using page->lru.
2327 */
2328 LIST_HEAD(page_list);
2329
2330 WARN_ON(!is_vm_hugetlb_page(vma)); 2359 WARN_ON(!is_vm_hugetlb_page(vma));
2331 BUG_ON(start & ~huge_page_mask(h)); 2360 BUG_ON(start & ~huge_page_mask(h));
2332 BUG_ON(end & ~huge_page_mask(h)); 2361 BUG_ON(end & ~huge_page_mask(h));
2333 2362
2363 tlb_start_vma(tlb, vma);
2334 mmu_notifier_invalidate_range_start(mm, start, end); 2364 mmu_notifier_invalidate_range_start(mm, start, end);
2365again:
2335 spin_lock(&mm->page_table_lock); 2366 spin_lock(&mm->page_table_lock);
2336 for (address = start; address < end; address += sz) { 2367 for (address = start; address < end; address += sz) {
2337 ptep = huge_pte_offset(mm, address); 2368 ptep = huge_pte_offset(mm, address);
@@ -2370,30 +2401,64 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2370 } 2401 }
2371 2402
2372 pte = huge_ptep_get_and_clear(mm, address, ptep); 2403 pte = huge_ptep_get_and_clear(mm, address, ptep);
2404 tlb_remove_tlb_entry(tlb, ptep, address);
2373 if (pte_dirty(pte)) 2405 if (pte_dirty(pte))
2374 set_page_dirty(page); 2406 set_page_dirty(page);
2375 list_add(&page->lru, &page_list);
2376 2407
2408 page_remove_rmap(page);
2409 force_flush = !__tlb_remove_page(tlb, page);
2410 if (force_flush)
2411 break;
2377 /* Bail out after unmapping reference page if supplied */ 2412 /* Bail out after unmapping reference page if supplied */
2378 if (ref_page) 2413 if (ref_page)
2379 break; 2414 break;
2380 } 2415 }
2381 flush_tlb_range(vma, start, end);
2382 spin_unlock(&mm->page_table_lock); 2416 spin_unlock(&mm->page_table_lock);
2383 mmu_notifier_invalidate_range_end(mm, start, end); 2417 /*
2384 list_for_each_entry_safe(page, tmp, &page_list, lru) { 2418 * mmu_gather ran out of room to batch pages, we break out of
2385 page_remove_rmap(page); 2419 * the PTE lock to avoid doing the potentially expensive TLB invalidate
2386 list_del(&page->lru); 2420 * and page-free while holding it.
2387 put_page(page); 2421 */
2422 if (force_flush) {
2423 force_flush = 0;
2424 tlb_flush_mmu(tlb);
2425 if (address < end && !ref_page)
2426 goto again;
2388 } 2427 }
2428 mmu_notifier_invalidate_range_end(mm, start, end);
2429 tlb_end_vma(tlb, vma);
2430}
2431
2432void __unmap_hugepage_range_final(struct mmu_gather *tlb,
2433 struct vm_area_struct *vma, unsigned long start,
2434 unsigned long end, struct page *ref_page)
2435{
2436 __unmap_hugepage_range(tlb, vma, start, end, ref_page);
2437
2438 /*
2439 * Clear this flag so that x86's huge_pmd_share page_table_shareable
2440 * test will fail on a vma being torn down, and not grab a page table
2441 * on its way out. We're lucky that the flag has such an appropriate
2442 * name, and can in fact be safely cleared here. We could clear it
2443 * before the __unmap_hugepage_range above, but all that's necessary
2444 * is to clear it before releasing the i_mmap_mutex. This works
2445 * because in the context this is called, the VMA is about to be
2446 * destroyed and the i_mmap_mutex is held.
2447 */
2448 vma->vm_flags &= ~VM_MAYSHARE;
2389} 2449}
2390 2450
2391void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2451void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2392 unsigned long end, struct page *ref_page) 2452 unsigned long end, struct page *ref_page)
2393{ 2453{
2394 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); 2454 struct mm_struct *mm;
2395 __unmap_hugepage_range(vma, start, end, ref_page); 2455 struct mmu_gather tlb;
2396 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 2456
2457 mm = vma->vm_mm;
2458
2459 tlb_gather_mmu(&tlb, mm, 0);
2460 __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
2461 tlb_finish_mmu(&tlb, start, end);
2397} 2462}
2398 2463
2399/* 2464/*
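The rewritten unmap path above follows the standard mmu_gather batching protocol. A condensed, purely illustrative sketch of that protocol (generic, not the hugetlb code itself; the inner loop is summarized in comments):

#include <linux/mm.h>
#include <asm/tlb.h>

/* Illustrative sketch of the mmu_gather lifecycle used by the hunk above. */
static void zap_range_sketch(struct mm_struct *mm, struct vm_area_struct *vma,
			     unsigned long start, unsigned long end)
{
	struct mmu_gather tlb;

	tlb_gather_mmu(&tlb, mm, 0);		/* 0: not a full-mm teardown */
	tlb_start_vma(&tlb, vma);
	/*
	 * For each present pte in [start, end):
	 *	pte = ptep_get_and_clear(mm, addr, ptep);
	 *	tlb_remove_tlb_entry(&tlb, ptep, addr);   queue the TLB invalidate
	 *	if (!__tlb_remove_page(&tlb, page))       batch full?
	 *		tlb_flush_mmu(&tlb);              flush now, then continue
	 */
	tlb_end_vma(&tlb, vma);
	tlb_finish_mmu(&tlb, start, end);	/* final flush and page freeing */
}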
@@ -2438,9 +2503,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2438 * from the time of fork. This would look like data corruption 2503 * from the time of fork. This would look like data corruption
2439 */ 2504 */
2440 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) 2505 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
2441 __unmap_hugepage_range(iter_vma, 2506 unmap_hugepage_range(iter_vma, address,
2442 address, address + huge_page_size(h), 2507 address + huge_page_size(h), page);
2443 page);
2444 } 2508 }
2445 mutex_unlock(&mapping->i_mmap_mutex); 2509 mutex_unlock(&mapping->i_mmap_mutex);
2446 2510
@@ -2496,6 +2560,7 @@ retry_avoidcopy:
2496 new_page = alloc_huge_page(vma, address, outside_reserve); 2560 new_page = alloc_huge_page(vma, address, outside_reserve);
2497 2561
2498 if (IS_ERR(new_page)) { 2562 if (IS_ERR(new_page)) {
2563 long err = PTR_ERR(new_page);
2499 page_cache_release(old_page); 2564 page_cache_release(old_page);
2500 2565
2501 /* 2566 /*
@@ -2524,7 +2589,10 @@ retry_avoidcopy:
2524 2589
2525 /* Caller expects lock to be held */ 2590 /* Caller expects lock to be held */
2526 spin_lock(&mm->page_table_lock); 2591 spin_lock(&mm->page_table_lock);
2527 return -PTR_ERR(new_page); 2592 if (err == -ENOMEM)
2593 return VM_FAULT_OOM;
2594 else
2595 return VM_FAULT_SIGBUS;
2528 } 2596 }
2529 2597
2530 /* 2598 /*
@@ -2642,7 +2710,11 @@ retry:
2642 goto out; 2710 goto out;
2643 page = alloc_huge_page(vma, address, 0); 2711 page = alloc_huge_page(vma, address, 0);
2644 if (IS_ERR(page)) { 2712 if (IS_ERR(page)) {
2645 ret = -PTR_ERR(page); 2713 ret = PTR_ERR(page);
2714 if (ret == -ENOMEM)
2715 ret = VM_FAULT_OOM;
2716 else
2717 ret = VM_FAULT_SIGBUS;
2646 goto out; 2718 goto out;
2647 } 2719 }
2648 clear_huge_page(page, address, pages_per_huge_page(h)); 2720 clear_huge_page(page, address, pages_per_huge_page(h));
@@ -2679,7 +2751,7 @@ retry:
2679 */ 2751 */
2680 if (unlikely(PageHWPoison(page))) { 2752 if (unlikely(PageHWPoison(page))) {
2681 ret = VM_FAULT_HWPOISON | 2753 ret = VM_FAULT_HWPOISON |
2682 VM_FAULT_SET_HINDEX(h - hstates); 2754 VM_FAULT_SET_HINDEX(hstate_index(h));
2683 goto backout_unlocked; 2755 goto backout_unlocked;
2684 } 2756 }
2685 } 2757 }
@@ -2752,7 +2824,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2752 return 0; 2824 return 0;
2753 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2825 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2754 return VM_FAULT_HWPOISON_LARGE | 2826 return VM_FAULT_HWPOISON_LARGE |
2755 VM_FAULT_SET_HINDEX(h - hstates); 2827 VM_FAULT_SET_HINDEX(hstate_index(h));
2756 } 2828 }
2757 2829
2758 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 2830 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2959,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2959 } 3031 }
2960 } 3032 }
2961 spin_unlock(&mm->page_table_lock); 3033 spin_unlock(&mm->page_table_lock);
2962 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); 3034 /*
2963 3035 * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
3036 * may have cleared our pud entry and done put_page on the page table:
3037 * once we release i_mmap_mutex, another task can do the final put_page
3038 * and that page table be reused and filled with junk.
3039 */
2964 flush_tlb_range(vma, start, end); 3040 flush_tlb_range(vma, start, end);
3041 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2965} 3042}
2966 3043
2967int hugetlb_reserve_pages(struct inode *inode, 3044int hugetlb_reserve_pages(struct inode *inode,
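Several hunks above replace the `h - hstates` pointer arithmetic with hstate_index(). The helper is added to include/linux/hugetlb.h elsewhere in this series; it is presumably nothing more than the same arithmetic behind a name, roughly:

/* Rough sketch of the helper the hugetlb.c hunks now call. */
static inline int hstate_index(struct hstate *h)
{
	return h - hstates;	/* index of this hstate in the hstates[] array */
}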
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
new file mode 100644
index 00000000000..a3f358fb8a0
--- /dev/null
+++ b/mm/hugetlb_cgroup.c
@@ -0,0 +1,418 @@
1/*
2 *
3 * Copyright IBM Corporation, 2012
4 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of version 2.1 of the GNU Lesser General Public License
8 * as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it would be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13 *
14 */
15
16#include <linux/cgroup.h>
17#include <linux/slab.h>
18#include <linux/hugetlb.h>
19#include <linux/hugetlb_cgroup.h>
20
21struct hugetlb_cgroup {
22 struct cgroup_subsys_state css;
23 /*
24 * the counter to account for hugepages from hugetlb.
25 */
26 struct res_counter hugepage[HUGE_MAX_HSTATE];
27};
28
29#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
30#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
31#define MEMFILE_ATTR(val) ((val) & 0xffff)
32
33struct cgroup_subsys hugetlb_subsys __read_mostly;
34static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
35
36static inline
37struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s)
38{
39 return container_of(s, struct hugetlb_cgroup, css);
40}
41
42static inline
43struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup)
44{
45 return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup,
46 hugetlb_subsys_id));
47}
48
49static inline
50struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task)
51{
52 return hugetlb_cgroup_from_css(task_subsys_state(task,
53 hugetlb_subsys_id));
54}
55
56static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg)
57{
58 return (h_cg == root_h_cgroup);
59}
60
61static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg)
62{
63 if (!cg->parent)
64 return NULL;
65 return hugetlb_cgroup_from_cgroup(cg->parent);
66}
67
68static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg)
69{
70 int idx;
71 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg);
72
73 for (idx = 0; idx < hugetlb_max_hstate; idx++) {
74 if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0)
75 return true;
76 }
77 return false;
78}
79
80static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup)
81{
82 int idx;
83 struct cgroup *parent_cgroup;
84 struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup;
85
86 h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
87 if (!h_cgroup)
88 return ERR_PTR(-ENOMEM);
89
90 parent_cgroup = cgroup->parent;
91 if (parent_cgroup) {
92 parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup);
93 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
94 res_counter_init(&h_cgroup->hugepage[idx],
95 &parent_h_cgroup->hugepage[idx]);
96 } else {
97 root_h_cgroup = h_cgroup;
98 for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
99 res_counter_init(&h_cgroup->hugepage[idx], NULL);
100 }
101 return &h_cgroup->css;
102}
103
104static void hugetlb_cgroup_destroy(struct cgroup *cgroup)
105{
106 struct hugetlb_cgroup *h_cgroup;
107
108 h_cgroup = hugetlb_cgroup_from_cgroup(cgroup);
109 kfree(h_cgroup);
110}
111
112
113/*
114 * Should be called with hugetlb_lock held.
115 * Since we are holding hugetlb_lock, pages cannot get moved from
116 * active list or uncharged from the cgroup, so there is no need to take a
117 * page reference or test whether the page is active here. This function
118 * cannot fail.
119 */
120static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup,
121 struct page *page)
122{
123 int csize;
124 struct res_counter *counter;
125 struct res_counter *fail_res;
126 struct hugetlb_cgroup *page_hcg;
127 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
128 struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup);
129
130 page_hcg = hugetlb_cgroup_from_page(page);
131 /*
132 * We can have pages on the active list without any cgroup
133 * attached, i.e. hugepages with fewer than 3 pages. We can safely
134 * ignore those pages.
135 */
136 if (!page_hcg || page_hcg != h_cg)
137 goto out;
138
139 csize = PAGE_SIZE << compound_order(page);
140 if (!parent) {
141 parent = root_h_cgroup;
142 /* root has no limit */
143 res_counter_charge_nofail(&parent->hugepage[idx],
144 csize, &fail_res);
145 }
146 counter = &h_cg->hugepage[idx];
147 res_counter_uncharge_until(counter, counter->parent, csize);
148
149 set_hugetlb_cgroup(page, parent);
150out:
151 return;
152}
153
154/*
155 * Force the hugetlb cgroup to empty the hugetlb resources by moving them to
156 * the parent cgroup.
157 */
158static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup)
159{
160 struct hstate *h;
161 struct page *page;
162 int ret = 0, idx = 0;
163
164 do {
165 if (cgroup_task_count(cgroup) ||
166 !list_empty(&cgroup->children)) {
167 ret = -EBUSY;
168 goto out;
169 }
170 for_each_hstate(h) {
171 spin_lock(&hugetlb_lock);
172 list_for_each_entry(page, &h->hugepage_activelist, lru)
173 hugetlb_cgroup_move_parent(idx, cgroup, page);
174
175 spin_unlock(&hugetlb_lock);
176 idx++;
177 }
178 cond_resched();
179 } while (hugetlb_cgroup_have_usage(cgroup));
180out:
181 return ret;
182}
183
184int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
185 struct hugetlb_cgroup **ptr)
186{
187 int ret = 0;
188 struct res_counter *fail_res;
189 struct hugetlb_cgroup *h_cg = NULL;
190 unsigned long csize = nr_pages * PAGE_SIZE;
191
192 if (hugetlb_cgroup_disabled())
193 goto done;
194 /*
195 * We don't charge any cgroup if the compound page has fewer
196 * than 3 pages.
197 */
198 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
199 goto done;
200again:
201 rcu_read_lock();
202 h_cg = hugetlb_cgroup_from_task(current);
203 if (!css_tryget(&h_cg->css)) {
204 rcu_read_unlock();
205 goto again;
206 }
207 rcu_read_unlock();
208
209 ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res);
210 css_put(&h_cg->css);
211done:
212 *ptr = h_cg;
213 return ret;
214}
215
216/* Should be called with hugetlb_lock held */
217void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
218 struct hugetlb_cgroup *h_cg,
219 struct page *page)
220{
221 if (hugetlb_cgroup_disabled() || !h_cg)
222 return;
223
224 set_hugetlb_cgroup(page, h_cg);
225 return;
226}
227
228/*
229 * Should be called with hugetlb_lock held
230 */
231void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
232 struct page *page)
233{
234 struct hugetlb_cgroup *h_cg;
235 unsigned long csize = nr_pages * PAGE_SIZE;
236
237 if (hugetlb_cgroup_disabled())
238 return;
239 VM_BUG_ON(!spin_is_locked(&hugetlb_lock));
240 h_cg = hugetlb_cgroup_from_page(page);
241 if (unlikely(!h_cg))
242 return;
243 set_hugetlb_cgroup(page, NULL);
244 res_counter_uncharge(&h_cg->hugepage[idx], csize);
245 return;
246}
247
248void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages,
249 struct hugetlb_cgroup *h_cg)
250{
251 unsigned long csize = nr_pages * PAGE_SIZE;
252
253 if (hugetlb_cgroup_disabled() || !h_cg)
254 return;
255
256 if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER)
257 return;
258
259 res_counter_uncharge(&h_cg->hugepage[idx], csize);
260 return;
261}
262
263static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft,
264 struct file *file, char __user *buf,
265 size_t nbytes, loff_t *ppos)
266{
267 u64 val;
268 char str[64];
269 int idx, name, len;
270 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
271
272 idx = MEMFILE_IDX(cft->private);
273 name = MEMFILE_ATTR(cft->private);
274
275 val = res_counter_read_u64(&h_cg->hugepage[idx], name);
276 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
277 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
278}
279
280static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft,
281 const char *buffer)
282{
283 int idx, name, ret;
284 unsigned long long val;
285 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
286
287 idx = MEMFILE_IDX(cft->private);
288 name = MEMFILE_ATTR(cft->private);
289
290 switch (name) {
291 case RES_LIMIT:
292 if (hugetlb_cgroup_is_root(h_cg)) {
293 /* Can't set limit on root */
294 ret = -EINVAL;
295 break;
296 }
297 /* This function does all the necessary parsing, so reuse it */
298 ret = res_counter_memparse_write_strategy(buffer, &val);
299 if (ret)
300 break;
301 ret = res_counter_set_limit(&h_cg->hugepage[idx], val);
302 break;
303 default:
304 ret = -EINVAL;
305 break;
306 }
307 return ret;
308}
309
310static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event)
311{
312 int idx, name, ret = 0;
313 struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup);
314
315 idx = MEMFILE_IDX(event);
316 name = MEMFILE_ATTR(event);
317
318 switch (name) {
319 case RES_MAX_USAGE:
320 res_counter_reset_max(&h_cg->hugepage[idx]);
321 break;
322 case RES_FAILCNT:
323 res_counter_reset_failcnt(&h_cg->hugepage[idx]);
324 break;
325 default:
326 ret = -EINVAL;
327 break;
328 }
329 return ret;
330}
331
332static char *mem_fmt(char *buf, int size, unsigned long hsize)
333{
334 if (hsize >= (1UL << 30))
335 snprintf(buf, size, "%luGB", hsize >> 30);
336 else if (hsize >= (1UL << 20))
337 snprintf(buf, size, "%luMB", hsize >> 20);
338 else
339 snprintf(buf, size, "%luKB", hsize >> 10);
340 return buf;
341}
342
343int __init hugetlb_cgroup_file_init(int idx)
344{
345 char buf[32];
346 struct cftype *cft;
347 struct hstate *h = &hstates[idx];
348
349 /* format the size */
350 mem_fmt(buf, 32, huge_page_size(h));
351
352 /* Add the limit file */
353 cft = &h->cgroup_files[0];
354 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf);
355 cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT);
356 cft->read = hugetlb_cgroup_read;
357 cft->write_string = hugetlb_cgroup_write;
358
359 /* Add the usage file */
360 cft = &h->cgroup_files[1];
361 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf);
362 cft->private = MEMFILE_PRIVATE(idx, RES_USAGE);
363 cft->read = hugetlb_cgroup_read;
364
365 /* Add the MAX usage file */
366 cft = &h->cgroup_files[2];
367 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf);
368 cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE);
369 cft->trigger = hugetlb_cgroup_reset;
370 cft->read = hugetlb_cgroup_read;
371
372 /* Add the failcnt file */
373 cft = &h->cgroup_files[3];
374 snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf);
375 cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT);
376 cft->trigger = hugetlb_cgroup_reset;
377 cft->read = hugetlb_cgroup_read;
378
379 /* NULL terminate the last cft */
380 cft = &h->cgroup_files[4];
381 memset(cft, 0, sizeof(*cft));
382
383 WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files));
384
385 return 0;
386}
387
388/*
389 * hugetlb_lock will make sure a parallel cgroup rmdir won't happen
390 * when we migrate hugepages
391 */
392void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage)
393{
394 struct hugetlb_cgroup *h_cg;
395 struct hstate *h = page_hstate(oldhpage);
396
397 if (hugetlb_cgroup_disabled())
398 return;
399
400 VM_BUG_ON(!PageHuge(oldhpage));
401 spin_lock(&hugetlb_lock);
402 h_cg = hugetlb_cgroup_from_page(oldhpage);
403 set_hugetlb_cgroup(oldhpage, NULL);
404
405 /* move the h_cg details to new cgroup */
406 set_hugetlb_cgroup(newhpage, h_cg);
407 list_move(&newhpage->lru, &h->hugepage_activelist);
408 spin_unlock(&hugetlb_lock);
409 return;
410}
411
412struct cgroup_subsys hugetlb_subsys = {
413 .name = "hugetlb",
414 .create = hugetlb_cgroup_create,
415 .pre_destroy = hugetlb_cgroup_pre_destroy,
416 .destroy = hugetlb_cgroup_destroy,
417 .subsys_id = hugetlb_subsys_id,
418};
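For context, the charge_cgroup/commit_charge/uncharge entry points above are hooked into the hugepage allocator by other patches in this series. A hedged sketch of the caller-side pairing, using symbols from mm/hugetlb.c; dequeue_huge_page_node() is assumed here as the allocator step and error handling is reduced to the essentials:

#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>

/* Illustrative pairing of the hugetlb_cgroup charge API; not the real
 * alloc_huge_page()/free_huge_page() code. */
static struct page *alloc_and_charge(struct hstate *h, int nid)
{
	struct hugetlb_cgroup *h_cg;
	struct page *page;
	int idx = hstate_index(h);
	unsigned long nr = pages_per_huge_page(h);

	/* reserve room in the current task's cgroup before taking a page */
	if (hugetlb_cgroup_charge_cgroup(idx, nr, &h_cg))
		return NULL;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page_node(h, nid);	/* assumed allocator helper */
	if (page)
		hugetlb_cgroup_commit_charge(idx, nr, h_cg, page);
	spin_unlock(&hugetlb_lock);

	if (!page)
		hugetlb_cgroup_uncharge_cgroup(idx, nr, h_cg);
	return page;
}

On the free side, free_huge_page() would call hugetlb_cgroup_uncharge_page() under hugetlb_lock before returning the page to the pool.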
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index cc448bb983b..3a61efc518d 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -123,7 +123,7 @@ static int pfn_inject_init(void)
123 if (!dentry) 123 if (!dentry)
124 goto fail; 124 goto fail;
125 125
126#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 126#ifdef CONFIG_MEMCG_SWAP
127 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, 127 dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
128 hwpoison_dir, &hwpoison_filter_memcg); 128 hwpoison_dir, &hwpoison_filter_memcg);
129 if (!dentry) 129 if (!dentry)
diff --git a/mm/internal.h b/mm/internal.h
index 2ba87fbfb75..b8c91b342e2 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -118,12 +118,19 @@ struct compact_control {
118 unsigned long nr_freepages; /* Number of isolated free pages */ 118 unsigned long nr_freepages; /* Number of isolated free pages */
119 unsigned long nr_migratepages; /* Number of pages to migrate */ 119 unsigned long nr_migratepages; /* Number of pages to migrate */
120 unsigned long free_pfn; /* isolate_freepages search base */ 120 unsigned long free_pfn; /* isolate_freepages search base */
121 unsigned long start_free_pfn; /* where we started the search */
121 unsigned long migrate_pfn; /* isolate_migratepages search base */ 122 unsigned long migrate_pfn; /* isolate_migratepages search base */
122 bool sync; /* Synchronous migration */ 123 bool sync; /* Synchronous migration */
124 bool wrapped; /* Order > 0 compactions are
125 incremental, once free_pfn
126 and migrate_pfn meet, we restart
127 from the top of the zone;
128 remember we wrapped around. */
123 129
124 int order; /* order a direct compactor needs */ 130 int order; /* order a direct compactor needs */
125 int migratetype; /* MOVABLE, RECLAIMABLE etc */ 131 int migratetype; /* MOVABLE, RECLAIMABLE etc */
126 struct zone *zone; 132 struct zone *zone;
133 bool *contended; /* True if a lock was contended */
127}; 134};
128 135
129unsigned long 136unsigned long
@@ -347,3 +354,5 @@ extern u32 hwpoison_filter_enable;
347extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, 354extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
348 unsigned long, unsigned long, 355 unsigned long, unsigned long,
349 unsigned long, unsigned long); 356 unsigned long, unsigned long);
357
358extern void set_pageblock_order(void);
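The new start_free_pfn and wrapped fields let an order > 0 compaction resume where the previous pass stopped while still knowing when it has covered the whole zone. A purely illustrative sketch of that termination idea, not the actual compact_finished() logic:

#include <linux/types.h>

/* Illustrative only: a scan that starts mid-zone is complete once it has
 * wrapped around the zone and caught back up to its original start point. */
static bool resumable_scan_done(unsigned long free_pfn,
				unsigned long start_free_pfn, bool wrapped)
{
	return wrapped && free_pfn >= start_free_pfn;
}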
diff --git a/mm/memblock.c b/mm/memblock.c
index 5cc6731b00c..4d9393c7edc 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -222,13 +222,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
222 /* Try to find some space for it. 222 /* Try to find some space for it.
223 * 223 *
224 * WARNING: We assume that either slab_is_available() and we use it or 224 * WARNING: We assume that either slab_is_available() and we use it or
225 * we use MEMBLOCK for allocations. That means that this is unsafe to use 225 * we use MEMBLOCK for allocations. That means that this is unsafe to
226 * when bootmem is currently active (unless bootmem itself is implemented 226 * use when bootmem is currently active (unless bootmem itself is
227 * on top of MEMBLOCK which isn't the case yet) 227 * implemented on top of MEMBLOCK which isn't the case yet)
228 * 228 *
229 * This should however not be an issue for now, as we currently only 229 * This should however not be an issue for now, as we currently only
230 * call into MEMBLOCK while it's still active, or much later when slab is 230 * call into MEMBLOCK while it's still active, or much later when slab
231 * active for memory hotplug operations 231 * is active for memory hotplug operations
232 */ 232 */
233 if (use_slab) { 233 if (use_slab) {
234 new_array = kmalloc(new_size, GFP_KERNEL); 234 new_array = kmalloc(new_size, GFP_KERNEL);
@@ -243,8 +243,8 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
243 new_alloc_size, PAGE_SIZE); 243 new_alloc_size, PAGE_SIZE);
244 if (!addr && new_area_size) 244 if (!addr && new_area_size)
245 addr = memblock_find_in_range(0, 245 addr = memblock_find_in_range(0,
246 min(new_area_start, memblock.current_limit), 246 min(new_area_start, memblock.current_limit),
247 new_alloc_size, PAGE_SIZE); 247 new_alloc_size, PAGE_SIZE);
248 248
249 new_array = addr ? __va(addr) : 0; 249 new_array = addr ? __va(addr) : 0;
250 } 250 }
@@ -254,12 +254,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
254 return -1; 254 return -1;
255 } 255 }
256 256
257 memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", 257 memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]",
258 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); 258 memblock_type_name(type), type->max * 2, (u64)addr,
259 (u64)addr + new_size - 1);
259 260
260 /* Found space, we now need to move the array over before 261 /*
261 * we add the reserved region since it may be our reserved 262 * Found space, we now need to move the array over before we add the
262 * array itself that is full. 263 * reserved region since it may be our reserved array itself that is
264 * full.
263 */ 265 */
264 memcpy(new_array, type->regions, old_size); 266 memcpy(new_array, type->regions, old_size);
265 memset(new_array + type->max, 0, old_size); 267 memset(new_array + type->max, 0, old_size);
@@ -267,17 +269,16 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
267 type->regions = new_array; 269 type->regions = new_array;
268 type->max <<= 1; 270 type->max <<= 1;
269 271
270 /* Free old array. We needn't free it if the array is the 272 /* Free old array. We needn't free it if the array is the static one */
271 * static one
272 */
273 if (*in_slab) 273 if (*in_slab)
274 kfree(old_array); 274 kfree(old_array);
275 else if (old_array != memblock_memory_init_regions && 275 else if (old_array != memblock_memory_init_regions &&
276 old_array != memblock_reserved_init_regions) 276 old_array != memblock_reserved_init_regions)
277 memblock_free(__pa(old_array), old_alloc_size); 277 memblock_free(__pa(old_array), old_alloc_size);
278 278
279 /* Reserve the new array if that comes from the memblock. 279 /*
280 * Otherwise, we needn't do it 280 * Reserve the new array if that comes from the memblock. Otherwise, we
281 * needn't do it
281 */ 282 */
282 if (!use_slab) 283 if (!use_slab)
283 BUG_ON(memblock_reserve(addr, new_alloc_size)); 284 BUG_ON(memblock_reserve(addr, new_alloc_size));
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f72b5e52451..795e525afab 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -61,12 +61,12 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly;
61#define MEM_CGROUP_RECLAIM_RETRIES 5 61#define MEM_CGROUP_RECLAIM_RETRIES 5
62static struct mem_cgroup *root_mem_cgroup __read_mostly; 62static struct mem_cgroup *root_mem_cgroup __read_mostly;
63 63
64#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 64#ifdef CONFIG_MEMCG_SWAP
65/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 65/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
66int do_swap_account __read_mostly; 66int do_swap_account __read_mostly;
67 67
68/* for remember boot option*/ 68/* for remember boot option*/
69#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED 69#ifdef CONFIG_MEMCG_SWAP_ENABLED
70static int really_do_swap_account __initdata = 1; 70static int really_do_swap_account __initdata = 1;
71#else 71#else
72static int really_do_swap_account __initdata = 0; 72static int really_do_swap_account __initdata = 0;
@@ -87,7 +87,7 @@ enum mem_cgroup_stat_index {
87 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 87 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
88 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 88 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 90 MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */
91 MEM_CGROUP_STAT_NSTATS, 91 MEM_CGROUP_STAT_NSTATS,
92}; 92};
93 93
@@ -378,9 +378,7 @@ static bool move_file(void)
378 378
379enum charge_type { 379enum charge_type {
380 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 380 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
381 MEM_CGROUP_CHARGE_TYPE_MAPPED, 381 MEM_CGROUP_CHARGE_TYPE_ANON,
382 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
383 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
384 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 382 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
385 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 383 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
386 NR_CHARGE_TYPE, 384 NR_CHARGE_TYPE,
@@ -407,8 +405,14 @@ enum charge_type {
407static void mem_cgroup_get(struct mem_cgroup *memcg); 405static void mem_cgroup_get(struct mem_cgroup *memcg);
408static void mem_cgroup_put(struct mem_cgroup *memcg); 406static void mem_cgroup_put(struct mem_cgroup *memcg);
409 407
408static inline
409struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
410{
411 return container_of(s, struct mem_cgroup, css);
412}
413
410/* Writing them here to avoid exposing memcg's inner layout */ 414/* Writing them here to avoid exposing memcg's inner layout */
411#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 415#ifdef CONFIG_MEMCG_KMEM
412#include <net/sock.h> 416#include <net/sock.h>
413#include <net/ip.h> 417#include <net/ip.h>
414 418
@@ -467,9 +471,9 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
467} 471}
468EXPORT_SYMBOL(tcp_proto_cgroup); 472EXPORT_SYMBOL(tcp_proto_cgroup);
469#endif /* CONFIG_INET */ 473#endif /* CONFIG_INET */
470#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ 474#endif /* CONFIG_MEMCG_KMEM */
471 475
472#if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) 476#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
473static void disarm_sock_keys(struct mem_cgroup *memcg) 477static void disarm_sock_keys(struct mem_cgroup *memcg)
474{ 478{
475 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) 479 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
@@ -703,7 +707,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
703 bool charge) 707 bool charge)
704{ 708{
705 int val = (charge) ? 1 : -1; 709 int val = (charge) ? 1 : -1;
706 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 710 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
707} 711}
708 712
709static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, 713static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
@@ -864,9 +868,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
864 868
865struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 869struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
866{ 870{
867 return container_of(cgroup_subsys_state(cont, 871 return mem_cgroup_from_css(
868 mem_cgroup_subsys_id), struct mem_cgroup, 872 cgroup_subsys_state(cont, mem_cgroup_subsys_id));
869 css);
870} 873}
871 874
872struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 875struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
@@ -879,8 +882,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
879 if (unlikely(!p)) 882 if (unlikely(!p))
880 return NULL; 883 return NULL;
881 884
882 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 885 return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
883 struct mem_cgroup, css);
884} 886}
885 887
886struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 888struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
@@ -966,8 +968,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
966 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); 968 css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id);
967 if (css) { 969 if (css) {
968 if (css == &root->css || css_tryget(css)) 970 if (css == &root->css || css_tryget(css))
969 memcg = container_of(css, 971 memcg = mem_cgroup_from_css(css);
970 struct mem_cgroup, css);
971 } else 972 } else
972 id = 0; 973 id = 0;
973 rcu_read_unlock(); 974 rcu_read_unlock();
@@ -1454,7 +1455,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1454/* 1455/*
1455 * Return the memory (and swap, if configured) limit for a memcg. 1456 * Return the memory (and swap, if configured) limit for a memcg.
1456 */ 1457 */
1457u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1458static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1458{ 1459{
1459 u64 limit; 1460 u64 limit;
1460 u64 memsw; 1461 u64 memsw;
@@ -1470,6 +1471,73 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1470 return min(limit, memsw); 1471 return min(limit, memsw);
1471} 1472}
1472 1473
1474void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1475 int order)
1476{
1477 struct mem_cgroup *iter;
1478 unsigned long chosen_points = 0;
1479 unsigned long totalpages;
1480 unsigned int points = 0;
1481 struct task_struct *chosen = NULL;
1482
1483 /*
1484 * If current has a pending SIGKILL, then automatically select it. The
1485 * goal is to allow it to allocate so that it may quickly exit and free
1486 * its memory.
1487 */
1488 if (fatal_signal_pending(current)) {
1489 set_thread_flag(TIF_MEMDIE);
1490 return;
1491 }
1492
1493 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
1494 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
1495 for_each_mem_cgroup_tree(iter, memcg) {
1496 struct cgroup *cgroup = iter->css.cgroup;
1497 struct cgroup_iter it;
1498 struct task_struct *task;
1499
1500 cgroup_iter_start(cgroup, &it);
1501 while ((task = cgroup_iter_next(cgroup, &it))) {
1502 switch (oom_scan_process_thread(task, totalpages, NULL,
1503 false)) {
1504 case OOM_SCAN_SELECT:
1505 if (chosen)
1506 put_task_struct(chosen);
1507 chosen = task;
1508 chosen_points = ULONG_MAX;
1509 get_task_struct(chosen);
1510 /* fall through */
1511 case OOM_SCAN_CONTINUE:
1512 continue;
1513 case OOM_SCAN_ABORT:
1514 cgroup_iter_end(cgroup, &it);
1515 mem_cgroup_iter_break(memcg, iter);
1516 if (chosen)
1517 put_task_struct(chosen);
1518 return;
1519 case OOM_SCAN_OK:
1520 break;
1521 };
1522 points = oom_badness(task, memcg, NULL, totalpages);
1523 if (points > chosen_points) {
1524 if (chosen)
1525 put_task_struct(chosen);
1526 chosen = task;
1527 chosen_points = points;
1528 get_task_struct(chosen);
1529 }
1530 }
1531 cgroup_iter_end(cgroup, &it);
1532 }
1533
1534 if (!chosen)
1535 return;
1536 points = chosen_points * 1000 / totalpages;
1537 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
1538 NULL, "Memory cgroup out of memory");
1539}
1540
1473static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, 1541static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1474 gfp_t gfp_mask, 1542 gfp_t gfp_mask,
1475 unsigned long flags) 1543 unsigned long flags)
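The last step above rescales the winner's score into the 0..1000 range that oom_kill_process() reports: oom_badness() returns points roughly in units of pages, bounded by totalpages, so chosen_points * 1000 / totalpages expresses the victim's usage as a fraction of the memcg limit. As a worked example with assumed numbers, a memcg capped at 1 GiB has totalpages = 262144 (4 KiB pages); a task whose badness comes to 131072 pages is reported as 131072 * 1000 / 262144 = 500, i.e. about half the allowed memory.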
@@ -1899,7 +1967,7 @@ again:
1899 return; 1967 return;
1900 /* 1968 /*
1901 * If this memory cgroup is not under account moving, we don't 1969 * If this memory cgroup is not under account moving, we don't
1902 * need to take move_lock_page_cgroup(). Because we already hold 1970 * need to take move_lock_mem_cgroup(). Because we already hold
1903 * rcu_read_lock(), any calls to move_account will be delayed until 1971 * rcu_read_lock(), any calls to move_account will be delayed until
1904 * rcu_read_unlock() if mem_cgroup_stolen() == true. 1972 * rcu_read_unlock() if mem_cgroup_stolen() == true.
1905 */ 1973 */
@@ -1921,7 +1989,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
1921 /* 1989 /*
1922 * It's guaranteed that pc->mem_cgroup never changes while 1990 * It's guaranteed that pc->mem_cgroup never changes while
1923 * lock is held because a routine modifies pc->mem_cgroup 1991 * lock is held because a routine modifies pc->mem_cgroup
1924 * should take move_lock_page_cgroup(). 1992 * should take move_lock_mem_cgroup().
1925 */ 1993 */
1926 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 1994 move_unlock_mem_cgroup(pc->mem_cgroup, flags);
1927} 1995}
@@ -2268,7 +2336,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2268 * We always charge the cgroup the mm_struct belongs to. 2336 * We always charge the cgroup the mm_struct belongs to.
2269 * The mm_struct's mem_cgroup changes on task migration if the 2337 * The mm_struct's mem_cgroup changes on task migration if the
2270 * thread group leader migrates. It's possible that mm is not 2338 * thread group leader migrates. It's possible that mm is not
2271 * set, if so charge the init_mm (happens for pagecache usage). 2339 * set, if so charge the root memcg (happens for pagecache usage).
2272 */ 2340 */
2273 if (!*ptr && !mm) 2341 if (!*ptr && !mm)
2274 *ptr = root_mem_cgroup; 2342 *ptr = root_mem_cgroup;
@@ -2429,7 +2497,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2429 css = css_lookup(&mem_cgroup_subsys, id); 2497 css = css_lookup(&mem_cgroup_subsys, id);
2430 if (!css) 2498 if (!css)
2431 return NULL; 2499 return NULL;
2432 return container_of(css, struct mem_cgroup, css); 2500 return mem_cgroup_from_css(css);
2433} 2501}
2434 2502
2435struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2503struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -2473,11 +2541,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2473 bool anon; 2541 bool anon;
2474 2542
2475 lock_page_cgroup(pc); 2543 lock_page_cgroup(pc);
2476 if (unlikely(PageCgroupUsed(pc))) { 2544 VM_BUG_ON(PageCgroupUsed(pc));
2477 unlock_page_cgroup(pc);
2478 __mem_cgroup_cancel_charge(memcg, nr_pages);
2479 return;
2480 }
2481 /* 2545 /*
2482 * we don't need page_cgroup_lock about tail pages, becase they are not 2546 * we don't need page_cgroup_lock about tail pages, becase they are not
2483 * accessed by any other context at this point. 2547 * accessed by any other context at this point.
@@ -2519,7 +2583,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2519 spin_unlock_irq(&zone->lru_lock); 2583 spin_unlock_irq(&zone->lru_lock);
2520 } 2584 }
2521 2585
2522 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 2586 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
2523 anon = true; 2587 anon = true;
2524 else 2588 else
2525 anon = false; 2589 anon = false;
@@ -2644,8 +2708,7 @@ out:
2644 2708
2645static int mem_cgroup_move_parent(struct page *page, 2709static int mem_cgroup_move_parent(struct page *page,
2646 struct page_cgroup *pc, 2710 struct page_cgroup *pc,
2647 struct mem_cgroup *child, 2711 struct mem_cgroup *child)
2648 gfp_t gfp_mask)
2649{ 2712{
2650 struct mem_cgroup *parent; 2713 struct mem_cgroup *parent;
2651 unsigned int nr_pages; 2714 unsigned int nr_pages;
@@ -2728,38 +2791,7 @@ int mem_cgroup_newpage_charge(struct page *page,
2728 VM_BUG_ON(page->mapping && !PageAnon(page)); 2791 VM_BUG_ON(page->mapping && !PageAnon(page));
2729 VM_BUG_ON(!mm); 2792 VM_BUG_ON(!mm);
2730 return mem_cgroup_charge_common(page, mm, gfp_mask, 2793 return mem_cgroup_charge_common(page, mm, gfp_mask,
2731 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2794 MEM_CGROUP_CHARGE_TYPE_ANON);
2732}
2733
2734static void
2735__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2736 enum charge_type ctype);
2737
2738int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2739 gfp_t gfp_mask)
2740{
2741 struct mem_cgroup *memcg = NULL;
2742 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2743 int ret;
2744
2745 if (mem_cgroup_disabled())
2746 return 0;
2747 if (PageCompound(page))
2748 return 0;
2749
2750 if (unlikely(!mm))
2751 mm = &init_mm;
2752 if (!page_is_file_cache(page))
2753 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2754
2755 if (!PageSwapCache(page))
2756 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2757 else { /* page is swapcache/shmem */
2758 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2759 if (!ret)
2760 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2761 }
2762 return ret;
2763} 2795}
2764 2796
2765/* 2797/*
@@ -2768,27 +2800,26 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2768 * struct page_cgroup is acquired. This refcnt will be consumed by 2800 * struct page_cgroup is acquired. This refcnt will be consumed by
2769 * "commit()" or removed by "cancel()" 2801 * "commit()" or removed by "cancel()"
2770 */ 2802 */
2771int mem_cgroup_try_charge_swapin(struct mm_struct *mm, 2803static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2772 struct page *page, 2804 struct page *page,
2773 gfp_t mask, struct mem_cgroup **memcgp) 2805 gfp_t mask,
2806 struct mem_cgroup **memcgp)
2774{ 2807{
2775 struct mem_cgroup *memcg; 2808 struct mem_cgroup *memcg;
2809 struct page_cgroup *pc;
2776 int ret; 2810 int ret;
2777 2811
2778 *memcgp = NULL; 2812 pc = lookup_page_cgroup(page);
2779
2780 if (mem_cgroup_disabled())
2781 return 0;
2782
2783 if (!do_swap_account)
2784 goto charge_cur_mm;
2785 /* 2813 /*
2786 * A racing thread's fault, or swapoff, may have already updated 2814 * Every swap fault against a single page tries to charge the
2787 * the pte, and even removed page from swap cache: in those cases 2815 * page, bail as early as possible. shmem_unuse() encounters
2788 * do_swap_page()'s pte_same() test will fail; but there's also a 2816 * already charged pages, too. The USED bit is protected by
2789 * KSM case which does need to charge the page. 2817 * the page lock, which serializes swap cache removal, which
2818 * in turn serializes uncharging.
2790 */ 2819 */
2791 if (!PageSwapCache(page)) 2820 if (PageCgroupUsed(pc))
2821 return 0;
2822 if (!do_swap_account)
2792 goto charge_cur_mm; 2823 goto charge_cur_mm;
2793 memcg = try_get_mem_cgroup_from_page(page); 2824 memcg = try_get_mem_cgroup_from_page(page);
2794 if (!memcg) 2825 if (!memcg)
@@ -2800,14 +2831,44 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2800 ret = 0; 2831 ret = 0;
2801 return ret; 2832 return ret;
2802charge_cur_mm: 2833charge_cur_mm:
2803 if (unlikely(!mm))
2804 mm = &init_mm;
2805 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); 2834 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
2806 if (ret == -EINTR) 2835 if (ret == -EINTR)
2807 ret = 0; 2836 ret = 0;
2808 return ret; 2837 return ret;
2809} 2838}
2810 2839
2840int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
2841 gfp_t gfp_mask, struct mem_cgroup **memcgp)
2842{
2843 *memcgp = NULL;
2844 if (mem_cgroup_disabled())
2845 return 0;
2846 /*
2847 * A racing thread's fault, or swapoff, may have already
2848 * updated the pte, and even removed page from swap cache: in
2849 * those cases unuse_pte()'s pte_same() test will fail; but
2850 * there's also a KSM case which does need to charge the page.
2851 */
2852 if (!PageSwapCache(page)) {
2853 int ret;
2854
2855 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
2856 if (ret == -EINTR)
2857 ret = 0;
2858 return ret;
2859 }
2860 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
2861}
2862
2863void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
2864{
2865 if (mem_cgroup_disabled())
2866 return;
2867 if (!memcg)
2868 return;
2869 __mem_cgroup_cancel_charge(memcg, 1);
2870}
2871
2811static void 2872static void
2812__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 2873__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2813 enum charge_type ctype) 2874 enum charge_type ctype)
@@ -2842,16 +2903,30 @@ void mem_cgroup_commit_charge_swapin(struct page *page,
2842 struct mem_cgroup *memcg) 2903 struct mem_cgroup *memcg)
2843{ 2904{
2844 __mem_cgroup_commit_charge_swapin(page, memcg, 2905 __mem_cgroup_commit_charge_swapin(page, memcg,
2845 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2906 MEM_CGROUP_CHARGE_TYPE_ANON);
2846} 2907}
2847 2908
2848void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 2909int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2910 gfp_t gfp_mask)
2849{ 2911{
2912 struct mem_cgroup *memcg = NULL;
2913 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
2914 int ret;
2915
2850 if (mem_cgroup_disabled()) 2916 if (mem_cgroup_disabled())
2851 return; 2917 return 0;
2852 if (!memcg) 2918 if (PageCompound(page))
2853 return; 2919 return 0;
2854 __mem_cgroup_cancel_charge(memcg, 1); 2920
2921 if (!PageSwapCache(page))
2922 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
2923 else { /* page is swapcache/shmem */
2924 ret = __mem_cgroup_try_charge_swapin(mm, page,
2925 gfp_mask, &memcg);
2926 if (!ret)
2927 __mem_cgroup_commit_charge_swapin(page, memcg, type);
2928 }
2929 return ret;
2855} 2930}
2856 2931
2857static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, 2932static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
@@ -2911,7 +2986,8 @@ direct_uncharge:
2911 * uncharge if !page_mapped(page) 2986 * uncharge if !page_mapped(page)
2912 */ 2987 */
2913static struct mem_cgroup * 2988static struct mem_cgroup *
2914__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2989__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
2990 bool end_migration)
2915{ 2991{
2916 struct mem_cgroup *memcg = NULL; 2992 struct mem_cgroup *memcg = NULL;
2917 unsigned int nr_pages = 1; 2993 unsigned int nr_pages = 1;
@@ -2921,8 +2997,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2921 if (mem_cgroup_disabled()) 2997 if (mem_cgroup_disabled())
2922 return NULL; 2998 return NULL;
2923 2999
2924 if (PageSwapCache(page)) 3000 VM_BUG_ON(PageSwapCache(page));
2925 return NULL;
2926 3001
2927 if (PageTransHuge(page)) { 3002 if (PageTransHuge(page)) {
2928 nr_pages <<= compound_order(page); 3003 nr_pages <<= compound_order(page);
@@ -2945,7 +3020,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2945 anon = PageAnon(page); 3020 anon = PageAnon(page);
2946 3021
2947 switch (ctype) { 3022 switch (ctype) {
2948 case MEM_CGROUP_CHARGE_TYPE_MAPPED: 3023 case MEM_CGROUP_CHARGE_TYPE_ANON:
2949 /* 3024 /*
2950 * Generally PageAnon tells if it's the anon statistics to be 3025 * Generally PageAnon tells if it's the anon statistics to be
2951 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is 3026 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
@@ -2955,7 +3030,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2955 /* fallthrough */ 3030 /* fallthrough */
2956 case MEM_CGROUP_CHARGE_TYPE_DROP: 3031 case MEM_CGROUP_CHARGE_TYPE_DROP:
2957 /* See mem_cgroup_prepare_migration() */ 3032 /* See mem_cgroup_prepare_migration() */
2958 if (page_mapped(page) || PageCgroupMigration(pc)) 3033 if (page_mapped(page))
3034 goto unlock_out;
3035 /*
3036 * Pages under migration may not be uncharged. But
3037 * end_migration() /must/ be the one uncharging the
3038 * unused post-migration page and so it has to call
3039 * here with the migration bit still set. See the
3040 * res_counter handling below.
3041 */
3042 if (!end_migration && PageCgroupMigration(pc))
2959 goto unlock_out; 3043 goto unlock_out;
2960 break; 3044 break;
2961 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 3045 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
@@ -2989,7 +3073,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2989 mem_cgroup_swap_statistics(memcg, true); 3073 mem_cgroup_swap_statistics(memcg, true);
2990 mem_cgroup_get(memcg); 3074 mem_cgroup_get(memcg);
2991 } 3075 }
2992 if (!mem_cgroup_is_root(memcg)) 3076 /*
3077 * Migration does not charge the res_counter for the
3078 * replacement page, so leave it alone when phasing out the
3079 * page that is unused after the migration.
3080 */
3081 if (!end_migration && !mem_cgroup_is_root(memcg))
2993 mem_cgroup_do_uncharge(memcg, nr_pages, ctype); 3082 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
2994 3083
2995 return memcg; 3084 return memcg;
@@ -3005,14 +3094,16 @@ void mem_cgroup_uncharge_page(struct page *page)
3005 if (page_mapped(page)) 3094 if (page_mapped(page))
3006 return; 3095 return;
3007 VM_BUG_ON(page->mapping && !PageAnon(page)); 3096 VM_BUG_ON(page->mapping && !PageAnon(page));
3008 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 3097 if (PageSwapCache(page))
3098 return;
3099 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
3009} 3100}
3010 3101
3011void mem_cgroup_uncharge_cache_page(struct page *page) 3102void mem_cgroup_uncharge_cache_page(struct page *page)
3012{ 3103{
3013 VM_BUG_ON(page_mapped(page)); 3104 VM_BUG_ON(page_mapped(page));
3014 VM_BUG_ON(page->mapping); 3105 VM_BUG_ON(page->mapping);
3015 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 3106 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
3016} 3107}
3017 3108
3018/* 3109/*
@@ -3076,7 +3167,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3076 if (!swapout) /* this was a swap cache but the swap is unused ! */ 3167 if (!swapout) /* this was a swap cache but the swap is unused ! */
3077 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 3168 ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
3078 3169
3079 memcg = __mem_cgroup_uncharge_common(page, ctype); 3170 memcg = __mem_cgroup_uncharge_common(page, ctype, false);
3080 3171
3081 /* 3172 /*
3082 * record memcg information, if swapout && memcg != NULL, 3173 * record memcg information, if swapout && memcg != NULL,
@@ -3087,7 +3178,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
3087} 3178}
3088#endif 3179#endif
3089 3180
3090#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 3181#ifdef CONFIG_MEMCG_SWAP
3091/* 3182/*
3092 * called from swap_entry_free(). remove record in swap_cgroup and 3183 * called from swap_entry_free(). remove record in swap_cgroup and
3093 * uncharge "memsw" account. 3184 * uncharge "memsw" account.
@@ -3166,19 +3257,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3166 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 3257 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
3167 * page belongs to. 3258 * page belongs to.
3168 */ 3259 */
3169int mem_cgroup_prepare_migration(struct page *page, 3260void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
3170 struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) 3261 struct mem_cgroup **memcgp)
3171{ 3262{
3172 struct mem_cgroup *memcg = NULL; 3263 struct mem_cgroup *memcg = NULL;
3173 struct page_cgroup *pc; 3264 struct page_cgroup *pc;
3174 enum charge_type ctype; 3265 enum charge_type ctype;
3175 int ret = 0;
3176 3266
3177 *memcgp = NULL; 3267 *memcgp = NULL;
3178 3268
3179 VM_BUG_ON(PageTransHuge(page)); 3269 VM_BUG_ON(PageTransHuge(page));
3180 if (mem_cgroup_disabled()) 3270 if (mem_cgroup_disabled())
3181 return 0; 3271 return;
3182 3272
3183 pc = lookup_page_cgroup(page); 3273 pc = lookup_page_cgroup(page);
3184 lock_page_cgroup(pc); 3274 lock_page_cgroup(pc);
@@ -3223,24 +3313,9 @@ int mem_cgroup_prepare_migration(struct page *page,
3223 * we return here. 3313 * we return here.
3224 */ 3314 */
3225 if (!memcg) 3315 if (!memcg)
3226 return 0; 3316 return;
3227 3317
3228 *memcgp = memcg; 3318 *memcgp = memcg;
3229 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false);
3230 css_put(&memcg->css);/* drop extra refcnt */
3231 if (ret) {
3232 if (PageAnon(page)) {
3233 lock_page_cgroup(pc);
3234 ClearPageCgroupMigration(pc);
3235 unlock_page_cgroup(pc);
3236 /*
3237 * The old page may be fully unmapped while we kept it.
3238 */
3239 mem_cgroup_uncharge_page(page);
3240 }
3241 /* we'll need to revisit this error code (we have -EINTR) */
3242 return -ENOMEM;
3243 }
3244 /* 3319 /*
3245 * We charge new page before it's used/mapped. So, even if unlock_page() 3320 * We charge new page before it's used/mapped. So, even if unlock_page()
3246 * is called before end_migration, we can catch all events on this new 3321 * is called before end_migration, we can catch all events on this new
@@ -3248,13 +3323,15 @@ int mem_cgroup_prepare_migration(struct page *page,
3248 * mapcount will be finally 0 and we call uncharge in end_migration(). 3323 * mapcount will be finally 0 and we call uncharge in end_migration().
3249 */ 3324 */
3250 if (PageAnon(page)) 3325 if (PageAnon(page))
3251 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; 3326 ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
3252 else if (page_is_file_cache(page))
3253 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3254 else 3327 else
3255 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3328 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3329 /*
3330 * The page is committed to the memcg, but it's not actually
3331 * charged to the res_counter since we plan on replacing the
3332 * old one and only one page is going to be left afterwards.
3333 */
3256 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); 3334 __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false);
3257 return ret;
3258} 3335}
3259 3336
3260/* remove redundant charge if migration failed*/ 3337/* remove redundant charge if migration failed*/
@@ -3276,6 +3353,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3276 used = newpage; 3353 used = newpage;
3277 unused = oldpage; 3354 unused = oldpage;
3278 } 3355 }
3356 anon = PageAnon(used);
3357 __mem_cgroup_uncharge_common(unused,
3358 anon ? MEM_CGROUP_CHARGE_TYPE_ANON
3359 : MEM_CGROUP_CHARGE_TYPE_CACHE,
3360 true);
3361 css_put(&memcg->css);
3279 /* 3362 /*
3280 * We disallowed uncharge of pages under migration because mapcount 3363 * We disallowed uncharge of pages under migration because mapcount
3281 * of the page goes down to zero, temporarly. 3364 * of the page goes down to zero, temporarly.
@@ -3285,10 +3368,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3285 lock_page_cgroup(pc); 3368 lock_page_cgroup(pc);
3286 ClearPageCgroupMigration(pc); 3369 ClearPageCgroupMigration(pc);
3287 unlock_page_cgroup(pc); 3370 unlock_page_cgroup(pc);
3288 anon = PageAnon(used);
3289 __mem_cgroup_uncharge_common(unused,
3290 anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED
3291 : MEM_CGROUP_CHARGE_TYPE_CACHE);
3292 3371
3293 /* 3372 /*
3294 * If a page is a file cache, radix-tree replacement is very atomic 3373 * If a page is a file cache, radix-tree replacement is very atomic
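Since the res_counter charge no longer happens in the prepare step, the two migration hooks now pair up roughly as sketched below. The caller shape is assumed; the real caller is unmap_and_move() in mm/migrate.c, updated elsewhere in this series, and move_page_contents() stands in for the actual copy/migrate step:

#include <linux/memcontrol.h>

/* Hedged sketch of how a migration caller pairs the two hooks above. */
static int migrate_one_page(struct page *page, struct page *newpage)
{
	struct mem_cgroup *memcg;
	int rc;

	/* commit newpage to the old page's memcg; no res_counter charge yet */
	mem_cgroup_prepare_migration(page, newpage, &memcg);

	rc = move_page_contents(page, newpage);		/* assumed helper */

	/* uncharges whichever page is left unused: old on success, new on failure */
	mem_cgroup_end_migration(memcg, page, newpage, rc == 0);
	return rc;
}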
@@ -3340,10 +3419,6 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3340 */ 3419 */
3341 if (!memcg) 3420 if (!memcg)
3342 return; 3421 return;
3343
3344 if (PageSwapBacked(oldpage))
3345 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3346
3347 /* 3422 /*
3348 * Even if newpage->mapping was NULL before starting replacement, 3423 * Even if newpage->mapping was NULL before starting replacement,
3349 * the newpage may be on LRU(or pagevec for LRU) already. We lock 3424 * the newpage may be on LRU(or pagevec for LRU) already. We lock
@@ -3418,7 +3493,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3418 /* 3493 /*
3419 * Rather than hide all in some function, I do this in 3494 * Rather than hide all in some function, I do this in
3420 * open coded manner. You see what this really does. 3495 * open coded manner. You see what this really does.
3421 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3496 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3422 */ 3497 */
3423 mutex_lock(&set_limit_mutex); 3498 mutex_lock(&set_limit_mutex);
3424 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3499 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
@@ -3479,7 +3554,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3479 /* 3554 /*
3480 * Rather than hide all in some function, I do this in 3555 * Rather than hide all in some function, I do this in
3481 * open coded manner. You see what this really does. 3556 * open coded manner. You see what this really does.
3482 * We have to guarantee memcg->res.limit < memcg->memsw.limit. 3557 * We have to guarantee memcg->res.limit <= memcg->memsw.limit.
3483 */ 3558 */
3484 mutex_lock(&set_limit_mutex); 3559 mutex_lock(&set_limit_mutex);
3485 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3560 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -3611,10 +3686,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3611} 3686}
3612 3687
3613/* 3688/*
3614 * This routine traverse page_cgroup in given list and drop them all. 3689 * Traverse a specified page_cgroup list and try to drop them all. This doesn't
3615 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 3690 * reclaim the pages themselves - it just removes the page_cgroups.
3691 * Returns true if some page_cgroups were not freed, indicating that the caller
3692 * must retry this operation.
3616 */ 3693 */
3617static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 3694static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3618 int node, int zid, enum lru_list lru) 3695 int node, int zid, enum lru_list lru)
3619{ 3696{
3620 struct mem_cgroup_per_zone *mz; 3697 struct mem_cgroup_per_zone *mz;
@@ -3622,7 +3699,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3622 struct list_head *list; 3699 struct list_head *list;
3623 struct page *busy; 3700 struct page *busy;
3624 struct zone *zone; 3701 struct zone *zone;
3625 int ret = 0;
3626 3702
3627 zone = &NODE_DATA(node)->node_zones[zid]; 3703 zone = &NODE_DATA(node)->node_zones[zid];
3628 mz = mem_cgroup_zoneinfo(memcg, node, zid); 3704 mz = mem_cgroup_zoneinfo(memcg, node, zid);
@@ -3636,7 +3712,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3636 struct page_cgroup *pc; 3712 struct page_cgroup *pc;
3637 struct page *page; 3713 struct page *page;
3638 3714
3639 ret = 0;
3640 spin_lock_irqsave(&zone->lru_lock, flags); 3715 spin_lock_irqsave(&zone->lru_lock, flags);
3641 if (list_empty(list)) { 3716 if (list_empty(list)) {
3642 spin_unlock_irqrestore(&zone->lru_lock, flags); 3717 spin_unlock_irqrestore(&zone->lru_lock, flags);
@@ -3653,21 +3728,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3653 3728
3654 pc = lookup_page_cgroup(page); 3729 pc = lookup_page_cgroup(page);
3655 3730
3656 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); 3731 if (mem_cgroup_move_parent(page, pc, memcg)) {
3657 if (ret == -ENOMEM || ret == -EINTR)
3658 break;
3659
3660 if (ret == -EBUSY || ret == -EINVAL) {
3661 /* found lock contention or "pc" is obsolete. */ 3732 /* found lock contention or "pc" is obsolete. */
3662 busy = page; 3733 busy = page;
3663 cond_resched(); 3734 cond_resched();
3664 } else 3735 } else
3665 busy = NULL; 3736 busy = NULL;
3666 } 3737 }
3667 3738 return !list_empty(list);
3668 if (!ret && !list_empty(list))
3669 return -EBUSY;
3670 return ret;
3671} 3739}
3672 3740
3673/* 3741/*
@@ -3692,9 +3760,6 @@ move_account:
3692 ret = -EBUSY; 3760 ret = -EBUSY;
3693 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 3761 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
3694 goto out; 3762 goto out;
3695 ret = -EINTR;
3696 if (signal_pending(current))
3697 goto out;
3698 /* This is for making all *used* pages to be on LRU. */ 3763 /* This is for making all *used* pages to be on LRU. */
3699 lru_add_drain_all(); 3764 lru_add_drain_all();
3700 drain_all_stock_sync(memcg); 3765 drain_all_stock_sync(memcg);
@@ -3715,9 +3780,6 @@ move_account:
3715 } 3780 }
3716 mem_cgroup_end_move(memcg); 3781 mem_cgroup_end_move(memcg);
3717 memcg_oom_recover(memcg); 3782 memcg_oom_recover(memcg);
3718 /* it seems parent cgroup doesn't have enough mem */
3719 if (ret == -ENOMEM)
3720 goto try_to_free;
3721 cond_resched(); 3783 cond_resched();
3722 /* "ret" should also be checked to ensure all lists are empty. */ 3784 /* "ret" should also be checked to ensure all lists are empty. */
3723 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); 3785 } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret);
@@ -3779,6 +3841,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3779 parent_memcg = mem_cgroup_from_cont(parent); 3841 parent_memcg = mem_cgroup_from_cont(parent);
3780 3842
3781 cgroup_lock(); 3843 cgroup_lock();
3844
3845 if (memcg->use_hierarchy == val)
3846 goto out;
3847
3782 /* 3848 /*
3783 * If parent's use_hierarchy is set, we can't make any modifications 3849 * If parent's use_hierarchy is set, we can't make any modifications
3784 * in the child subtrees. If it is unset, then the change can 3850 * in the child subtrees. If it is unset, then the change can
@@ -3795,6 +3861,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3795 retval = -EBUSY; 3861 retval = -EBUSY;
3796 } else 3862 } else
3797 retval = -EINVAL; 3863 retval = -EINVAL;
3864
3865out:
3798 cgroup_unlock(); 3866 cgroup_unlock();
3799 3867
3800 return retval; 3868 return retval;
@@ -3831,7 +3899,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3831 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 3899 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
3832 3900
3833 if (swap) 3901 if (swap)
3834 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); 3902 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
3835 3903
3836 return val << PAGE_SHIFT; 3904 return val << PAGE_SHIFT;
3837} 3905}
@@ -4015,7 +4083,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4015#endif 4083#endif
4016 4084
4017#ifdef CONFIG_NUMA 4085#ifdef CONFIG_NUMA
4018static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft, 4086static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4019 struct seq_file *m) 4087 struct seq_file *m)
4020{ 4088{
4021 int nid; 4089 int nid;
@@ -4074,7 +4142,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
4074 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 4142 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
4075} 4143}
4076 4144
4077static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4145static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
4078 struct seq_file *m) 4146 struct seq_file *m)
4079{ 4147{
4080 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4148 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
@@ -4082,7 +4150,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4082 unsigned int i; 4150 unsigned int i;
4083 4151
4084 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4152 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4085 if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) 4153 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4086 continue; 4154 continue;
4087 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 4155 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
4088 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 4156 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
@@ -4109,7 +4177,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4109 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 4177 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4110 long long val = 0; 4178 long long val = 0;
4111 4179
4112 if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) 4180 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
4113 continue; 4181 continue;
4114 for_each_mem_cgroup_tree(mi, memcg) 4182 for_each_mem_cgroup_tree(mi, memcg)
4115 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 4183 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
@@ -4533,7 +4601,7 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4533 return 0; 4601 return 0;
4534} 4602}
4535 4603
4536#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 4604#ifdef CONFIG_MEMCG_KMEM
4537static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4605static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4538{ 4606{
4539 return mem_cgroup_sockets_init(memcg, ss); 4607 return mem_cgroup_sockets_init(memcg, ss);
@@ -4588,7 +4656,7 @@ static struct cftype mem_cgroup_files[] = {
4588 }, 4656 },
4589 { 4657 {
4590 .name = "stat", 4658 .name = "stat",
4591 .read_seq_string = mem_control_stat_show, 4659 .read_seq_string = memcg_stat_show,
4592 }, 4660 },
4593 { 4661 {
4594 .name = "force_empty", 4662 .name = "force_empty",
@@ -4620,10 +4688,10 @@ static struct cftype mem_cgroup_files[] = {
4620#ifdef CONFIG_NUMA 4688#ifdef CONFIG_NUMA
4621 { 4689 {
4622 .name = "numa_stat", 4690 .name = "numa_stat",
4623 .read_seq_string = mem_control_numa_stat_show, 4691 .read_seq_string = memcg_numa_stat_show,
4624 }, 4692 },
4625#endif 4693#endif
4626#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4694#ifdef CONFIG_MEMCG_SWAP
4627 { 4695 {
4628 .name = "memsw.usage_in_bytes", 4696 .name = "memsw.usage_in_bytes",
4629 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4697 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
@@ -4810,7 +4878,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4810} 4878}
4811EXPORT_SYMBOL(parent_mem_cgroup); 4879EXPORT_SYMBOL(parent_mem_cgroup);
4812 4880
4813#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4881#ifdef CONFIG_MEMCG_SWAP
4814static void __init enable_swap_cgroup(void) 4882static void __init enable_swap_cgroup(void)
4815{ 4883{
4816 if (!mem_cgroup_disabled() && really_do_swap_account) 4884 if (!mem_cgroup_disabled() && really_do_swap_account)
@@ -5541,7 +5609,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
5541 .__DEPRECATED_clear_css_refs = true, 5609 .__DEPRECATED_clear_css_refs = true,
5542}; 5610};
5543 5611
5544#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5612#ifdef CONFIG_MEMCG_SWAP
5545static int __init enable_swap_account(char *s) 5613static int __init enable_swap_account(char *s)
5546{ 5614{
5547 /* consider enabled if no parameter or 1 is given */ 5615 /* consider enabled if no parameter or 1 is given */
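The force_empty path above now treats mem_cgroup_force_empty_list() as a plain "is this list still busy?" predicate instead of juggling -EBUSY/-ENOMEM/-EINTR. A minimal user-space model of that retry convention follows; every name in it is invented for illustration, only the bool-means-retry contract comes from the hunk.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for mem_cgroup_force_empty_list(): moves one entry per call
 * and reports whether the caller has to come back for more. */
static bool drain_one(int *entries)
{
        if (*entries > 0)
                (*entries)--;
        return *entries > 0;            /* true => caller must retry */
}

int main(void)
{
        int entries = 3;
        bool busy;

        do {
                busy = drain_one(&entries);     /* mirrors the retry loop in force_empty */
        } while (busy);

        printf("all entries drained\n");
        return 0;
}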
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ab1e7145e29..a6e2141a661 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -128,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p)
128 * can only guarantee that the page either belongs to the memcg tasks, or is 128 * can only guarantee that the page either belongs to the memcg tasks, or is
129 * a freed page. 129 * a freed page.
130 */ 130 */
131#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 131#ifdef CONFIG_MEMCG_SWAP
132u64 hwpoison_filter_memcg; 132u64 hwpoison_filter_memcg;
133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); 133EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
134static int hwpoison_filter_task(struct page *p) 134static int hwpoison_filter_task(struct page *p)
@@ -345,14 +345,14 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
345 * Also when FAIL is set do a force kill because something went 345 * Also when FAIL is set do a force kill because something went
346 * wrong earlier. 346 * wrong earlier.
347 */ 347 */
348static void kill_procs(struct list_head *to_kill, int doit, int trapno, 348static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
349 int fail, struct page *page, unsigned long pfn, 349 int fail, struct page *page, unsigned long pfn,
350 int flags) 350 int flags)
351{ 351{
352 struct to_kill *tk, *next; 352 struct to_kill *tk, *next;
353 353
354 list_for_each_entry_safe (tk, next, to_kill, nd) { 354 list_for_each_entry_safe (tk, next, to_kill, nd) {
355 if (doit) { 355 if (forcekill) {
356 /* 356 /*
357 * In case something went wrong with munmapping 357 * In case something went wrong with munmapping
358 * make sure the process doesn't catch the 358 * make sure the process doesn't catch the
@@ -858,7 +858,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
858 struct address_space *mapping; 858 struct address_space *mapping;
859 LIST_HEAD(tokill); 859 LIST_HEAD(tokill);
860 int ret; 860 int ret;
861 int kill = 1; 861 int kill = 1, forcekill;
862 struct page *hpage = compound_head(p); 862 struct page *hpage = compound_head(p);
863 struct page *ppage; 863 struct page *ppage;
864 864
@@ -888,7 +888,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
888 * be called inside page lock (it's recommended but not enforced). 888 * be called inside page lock (it's recommended but not enforced).
889 */ 889 */
890 mapping = page_mapping(hpage); 890 mapping = page_mapping(hpage);
891 if (!PageDirty(hpage) && mapping && 891 if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping &&
892 mapping_cap_writeback_dirty(mapping)) { 892 mapping_cap_writeback_dirty(mapping)) {
893 if (page_mkclean(hpage)) { 893 if (page_mkclean(hpage)) {
894 SetPageDirty(hpage); 894 SetPageDirty(hpage);
@@ -965,12 +965,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
965 * Now that the dirty bit has been propagated to the 965 * Now that the dirty bit has been propagated to the
966 * struct page and all unmaps done we can decide if 966 * struct page and all unmaps done we can decide if
967 * killing is needed or not. Only kill when the page 967 * killing is needed or not. Only kill when the page
968 * was dirty, otherwise the tokill list is merely 968 * was dirty or the process is not restartable,
969 * otherwise the tokill list is merely
969 * freed. When there was a problem unmapping earlier 970 * freed. When there was a problem unmapping earlier
970 * use a more forceful uncatchable kill to prevent 971 * use a more forceful uncatchable kill to prevent
971 * any accesses to the poisoned memory. 972 * any accesses to the poisoned memory.
972 */ 973 */
973 kill_procs(&tokill, !!PageDirty(ppage), trapno, 974 forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
975 kill_procs(&tokill, forcekill, trapno,
974 ret != SWAP_SUCCESS, p, pfn, flags); 976 ret != SWAP_SUCCESS, p, pfn, flags);
975 977
976 return ret; 978 return ret;
@@ -1414,7 +1416,6 @@ static int soft_offline_huge_page(struct page *page, int flags)
1414 int ret; 1416 int ret;
1415 unsigned long pfn = page_to_pfn(page); 1417 unsigned long pfn = page_to_pfn(page);
1416 struct page *hpage = compound_head(page); 1418 struct page *hpage = compound_head(page);
1417 LIST_HEAD(pagelist);
1418 1419
1419 ret = get_any_page(page, pfn, flags); 1420 ret = get_any_page(page, pfn, flags);
1420 if (ret < 0) 1421 if (ret < 0)
@@ -1429,24 +1430,18 @@ static int soft_offline_huge_page(struct page *page, int flags)
1429 } 1430 }
1430 1431
1431 /* Keep page count to indicate a given hugepage is isolated. */ 1432 /* Keep page count to indicate a given hugepage is isolated. */
1432 1433 ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false,
1433 list_add(&hpage->lru, &pagelist); 1434 MIGRATE_SYNC);
1434 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, 1435 put_page(hpage);
1435 true);
1436 if (ret) { 1436 if (ret) {
1437 struct page *page1, *page2;
1438 list_for_each_entry_safe(page1, page2, &pagelist, lru)
1439 put_page(page1);
1440
1441 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1437 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1442 pfn, ret, page->flags); 1438 pfn, ret, page->flags);
1443 if (ret > 0)
1444 ret = -EIO;
1445 return ret; 1439 return ret;
1446 } 1440 }
1447done: 1441done:
1448 if (!PageHWPoison(hpage)) 1442 if (!PageHWPoison(hpage))
1449 atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); 1443 atomic_long_add(1 << compound_trans_order(hpage),
1444 &mce_bad_pages);
1450 set_page_hwpoison_huge_page(hpage); 1445 set_page_hwpoison_huge_page(hpage);
1451 dequeue_hwpoisoned_huge_page(hpage); 1446 dequeue_hwpoisoned_huge_page(hpage);
1452 /* keep elevated page count for bad page */ 1447 /* keep elevated page count for bad page */
@@ -1561,7 +1556,7 @@ int soft_offline_page(struct page *page, int flags)
1561 page_is_file_cache(page)); 1556 page_is_file_cache(page));
1562 list_add(&page->lru, &pagelist); 1557 list_add(&page->lru, &pagelist);
1563 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 1558 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1564 0, MIGRATE_SYNC); 1559 false, MIGRATE_SYNC);
1565 if (ret) { 1560 if (ret) {
1566 putback_lru_pages(&pagelist); 1561 putback_lru_pages(&pagelist);
1567 pr_info("soft offline: %#lx: migration failed %d, type %lx\n", 1562 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
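The hwpoison change above boils down to one predicate: a forced, uncatchable kill is now sent when the page was dirty or when the caller passed MF_MUST_KILL, instead of looking at PageDirty() alone. A small stand-alone model of that decision; the MF_MUST_KILL value and the helper below are assumptions made only for this sketch.

#include <stdbool.h>
#include <stdio.h>

#define MF_MUST_KILL 0x4        /* bit value assumed for this sketch */

/* Mirrors: forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL); */
static bool want_forcekill(bool page_dirty, int flags)
{
        return page_dirty || (flags & MF_MUST_KILL);
}

int main(void)
{
        printf("clean page, no flag   -> %d\n", want_forcekill(false, 0));
        printf("clean page, MUST_KILL -> %d\n", want_forcekill(false, MF_MUST_KILL));
        printf("dirty page, no flag   -> %d\n", want_forcekill(true, 0));
        return 0;
}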
diff --git a/mm/memory.c b/mm/memory.c
index 2466d125023..57361708d1a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -206,6 +206,8 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
206 tlb->mm = mm; 206 tlb->mm = mm;
207 207
208 tlb->fullmm = fullmm; 208 tlb->fullmm = fullmm;
209 tlb->start = -1UL;
210 tlb->end = 0;
209 tlb->need_flush = 0; 211 tlb->need_flush = 0;
210 tlb->fast_mode = (num_possible_cpus() == 1); 212 tlb->fast_mode = (num_possible_cpus() == 1);
211 tlb->local.next = NULL; 213 tlb->local.next = NULL;
@@ -248,6 +250,8 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e
248{ 250{
249 struct mmu_gather_batch *batch, *next; 251 struct mmu_gather_batch *batch, *next;
250 252
253 tlb->start = start;
254 tlb->end = end;
251 tlb_flush_mmu(tlb); 255 tlb_flush_mmu(tlb);
252 256
253 /* keep the page table cache within bounds */ 257 /* keep the page table cache within bounds */
@@ -1204,6 +1208,11 @@ again:
1204 */ 1208 */
1205 if (force_flush) { 1209 if (force_flush) {
1206 force_flush = 0; 1210 force_flush = 0;
1211
1212#ifdef HAVE_GENERIC_MMU_GATHER
1213 tlb->start = addr;
1214 tlb->end = end;
1215#endif
1207 tlb_flush_mmu(tlb); 1216 tlb_flush_mmu(tlb);
1208 if (addr != end) 1217 if (addr != end)
1209 goto again; 1218 goto again;
@@ -1334,8 +1343,11 @@ static void unmap_single_vma(struct mmu_gather *tlb,
1334 * Since no pte has actually been setup, it is 1343 * Since no pte has actually been setup, it is
1335 * safe to do nothing in this case. 1344 * safe to do nothing in this case.
1336 */ 1345 */
1337 if (vma->vm_file) 1346 if (vma->vm_file) {
1338 unmap_hugepage_range(vma, start, end, NULL); 1347 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
1348 __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
1349 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
1350 }
1339 } else 1351 } else
1340 unmap_page_range(tlb, vma, start, end, details); 1352 unmap_page_range(tlb, vma, start, end, details);
1341 } 1353 }
@@ -2638,6 +2650,9 @@ reuse:
2638 if (!page_mkwrite) { 2650 if (!page_mkwrite) {
2639 wait_on_page_locked(dirty_page); 2651 wait_on_page_locked(dirty_page);
2640 set_page_dirty_balance(dirty_page, page_mkwrite); 2652 set_page_dirty_balance(dirty_page, page_mkwrite);
2653 /* file_update_time outside page_lock */
2654 if (vma->vm_file)
2655 file_update_time(vma->vm_file);
2641 } 2656 }
2642 put_page(dirty_page); 2657 put_page(dirty_page);
2643 if (page_mkwrite) { 2658 if (page_mkwrite) {
@@ -2655,10 +2670,6 @@ reuse:
2655 } 2670 }
2656 } 2671 }
2657 2672
2658 /* file_update_time outside page_lock */
2659 if (vma->vm_file)
2660 file_update_time(vma->vm_file);
2661
2662 return ret; 2673 return ret;
2663 } 2674 }
2664 2675
@@ -3327,12 +3338,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3327 3338
3328 if (dirty_page) { 3339 if (dirty_page) {
3329 struct address_space *mapping = page->mapping; 3340 struct address_space *mapping = page->mapping;
3341 int dirtied = 0;
3330 3342
3331 if (set_page_dirty(dirty_page)) 3343 if (set_page_dirty(dirty_page))
3332 page_mkwrite = 1; 3344 dirtied = 1;
3333 unlock_page(dirty_page); 3345 unlock_page(dirty_page);
3334 put_page(dirty_page); 3346 put_page(dirty_page);
3335 if (page_mkwrite && mapping) { 3347 if ((dirtied || page_mkwrite) && mapping) {
3336 /* 3348 /*
3337 * Some device drivers do not set page.mapping but still 3349 * Some device drivers do not set page.mapping but still
3338 * dirty their pages 3350 * dirty their pages
@@ -3341,7 +3353,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3341 } 3353 }
3342 3354
3343 /* file_update_time outside page_lock */ 3355 /* file_update_time outside page_lock */
3344 if (vma->vm_file) 3356 if (vma->vm_file && !page_mkwrite)
3345 file_update_time(vma->vm_file); 3357 file_update_time(vma->vm_file);
3346 } else { 3358 } else {
3347 unlock_page(vmf.page); 3359 unlock_page(vmf.page);
@@ -3929,7 +3941,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
3929 free_page((unsigned long)buf); 3941 free_page((unsigned long)buf);
3930 } 3942 }
3931 } 3943 }
3932 up_read(&current->mm->mmap_sem); 3944 up_read(&mm->mmap_sem);
3933} 3945}
3934 3946
3935#ifdef CONFIG_PROVE_LOCKING 3947#ifdef CONFIG_PROVE_LOCKING
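The mmu_gather hunks seed tlb->start with -1UL and tlb->end with 0, then record the real range just before each flush so only the span that was actually unmapped needs flushing. A toy user-space model of that bookkeeping; the struct and helpers are invented, only the start/end convention comes from the hunks.

#include <stdio.h>

struct toy_gather {
        unsigned long start;    /* -1UL until a range is recorded */
        unsigned long end;      /* 0 until a range is recorded */
};

static void toy_gather_init(struct toy_gather *tlb)
{
        tlb->start = -1UL;
        tlb->end = 0;
}

/* Record the range about to be flushed, as the zap loop now does
 * right before calling tlb_flush_mmu(). */
static void toy_set_range(struct toy_gather *tlb,
                          unsigned long addr, unsigned long end)
{
        tlb->start = addr;
        tlb->end = end;
}

static void toy_flush(const struct toy_gather *tlb)
{
        if (tlb->end)   /* end == 0 means nothing was unmapped */
                printf("flush [%#lx, %#lx)\n", tlb->start, tlb->end);
}

int main(void)
{
        struct toy_gather tlb;

        toy_gather_init(&tlb);
        toy_set_range(&tlb, 0x400000UL, 0x600000UL);
        toy_flush(&tlb);
        return 0;
}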
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 427bb291dd0..3ad25f9d1fc 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -512,19 +512,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
512 512
513 zone->present_pages += onlined_pages; 513 zone->present_pages += onlined_pages;
514 zone->zone_pgdat->node_present_pages += onlined_pages; 514 zone->zone_pgdat->node_present_pages += onlined_pages;
515 if (need_zonelists_rebuild) 515 if (onlined_pages) {
516 build_all_zonelists(zone); 516 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
517 else 517 if (need_zonelists_rebuild)
518 zone_pcp_update(zone); 518 build_all_zonelists(NULL, zone);
519 else
520 zone_pcp_update(zone);
521 }
519 522
520 mutex_unlock(&zonelists_mutex); 523 mutex_unlock(&zonelists_mutex);
521 524
522 init_per_zone_wmark_min(); 525 init_per_zone_wmark_min();
523 526
524 if (onlined_pages) { 527 if (onlined_pages)
525 kswapd_run(zone_to_nid(zone)); 528 kswapd_run(zone_to_nid(zone));
526 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
527 }
528 529
529 vm_total_pages = nr_free_pagecache_pages(); 530 vm_total_pages = nr_free_pagecache_pages();
530 531
@@ -562,7 +563,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
562 * to access not-initialized zonelist, build here. 563 * to access not-initialized zonelist, build here.
563 */ 564 */
564 mutex_lock(&zonelists_mutex); 565 mutex_lock(&zonelists_mutex);
565 build_all_zonelists(NULL); 566 build_all_zonelists(pgdat, NULL);
566 mutex_unlock(&zonelists_mutex); 567 mutex_unlock(&zonelists_mutex);
567 568
568 return pgdat; 569 return pgdat;
@@ -965,6 +966,9 @@ repeat:
965 966
966 init_per_zone_wmark_min(); 967 init_per_zone_wmark_min();
967 968
969 if (!populated_zone(zone))
970 zone_pcp_reset(zone);
971
968 if (!node_present_pages(node)) { 972 if (!node_present_pages(node)) {
969 node_clear_state(node, N_HIGH_MEMORY); 973 node_clear_state(node, N_HIGH_MEMORY);
970 kswapd_stop(node); 974 kswapd_stop(node);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1d771e4200d..4ada3be6e25 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1602,8 +1602,14 @@ static unsigned interleave_nodes(struct mempolicy *policy)
1602 * task can change it's policy. The system default policy requires no 1602 * task can change it's policy. The system default policy requires no
1603 * such protection. 1603 * such protection.
1604 */ 1604 */
1605unsigned slab_node(struct mempolicy *policy) 1605unsigned slab_node(void)
1606{ 1606{
1607 struct mempolicy *policy;
1608
1609 if (in_interrupt())
1610 return numa_node_id();
1611
1612 policy = current->mempolicy;
1607 if (!policy || policy->flags & MPOL_F_LOCAL) 1613 if (!policy || policy->flags & MPOL_F_LOCAL)
1608 return numa_node_id(); 1614 return numa_node_id();
1609 1615
@@ -2556,7 +2562,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2556 break; 2562 break;
2557 2563
2558 default: 2564 default:
2559 BUG(); 2565 return -EINVAL;
2560 } 2566 }
2561 2567
2562 l = strlen(policy_modes[mode]); 2568 l = strlen(policy_modes[mode]);
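With slab_node() taking no argument, the policy lookup and the interrupt-context fallback live inside the callee instead of in every slab allocator. A user-space model of that shape change; all types and the thread-local stand-ins for current->mempolicy and in_interrupt() are invented for the sketch.

struct toy_policy { int preferred_node; };

static __thread struct toy_policy *toy_current_policy; /* stands in for current->mempolicy */
static __thread int toy_in_interrupt;                  /* stands in for in_interrupt() */

static int toy_local_node(void) { return 0; }          /* stands in for numa_node_id() */

/* Old shape: int toy_slab_node(struct toy_policy *policy);
 * New shape: the callee fetches the policy itself and never touches it
 * from interrupt context. */
static int toy_slab_node(void)
{
        struct toy_policy *policy;

        if (toy_in_interrupt)
                return toy_local_node();

        policy = toy_current_policy;
        if (!policy)
                return toy_local_node();
        return policy->preferred_node;
}

int main(void)
{
        struct toy_policy pol = { .preferred_node = 1 };

        toy_current_policy = &pol;
        return toy_slab_node();         /* 1: policy honoured in process context */
}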
diff --git a/mm/mempool.c b/mm/mempool.c
index d9049811f35..54990476c04 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -63,19 +63,21 @@ EXPORT_SYMBOL(mempool_destroy);
63mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, 63mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
64 mempool_free_t *free_fn, void *pool_data) 64 mempool_free_t *free_fn, void *pool_data)
65{ 65{
66 return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,-1); 66 return mempool_create_node(min_nr,alloc_fn,free_fn, pool_data,
67 GFP_KERNEL, NUMA_NO_NODE);
67} 68}
68EXPORT_SYMBOL(mempool_create); 69EXPORT_SYMBOL(mempool_create);
69 70
70mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, 71mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
71 mempool_free_t *free_fn, void *pool_data, int node_id) 72 mempool_free_t *free_fn, void *pool_data,
73 gfp_t gfp_mask, int node_id)
72{ 74{
73 mempool_t *pool; 75 mempool_t *pool;
74 pool = kmalloc_node(sizeof(*pool), GFP_KERNEL | __GFP_ZERO, node_id); 76 pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id);
75 if (!pool) 77 if (!pool)
76 return NULL; 78 return NULL;
77 pool->elements = kmalloc_node(min_nr * sizeof(void *), 79 pool->elements = kmalloc_node(min_nr * sizeof(void *),
78 GFP_KERNEL, node_id); 80 gfp_mask, node_id);
79 if (!pool->elements) { 81 if (!pool->elements) {
80 kfree(pool); 82 kfree(pool);
81 return NULL; 83 return NULL;
@@ -93,7 +95,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
93 while (pool->curr_nr < pool->min_nr) { 95 while (pool->curr_nr < pool->min_nr) {
94 void *element; 96 void *element;
95 97
96 element = pool->alloc(GFP_KERNEL, pool->pool_data); 98 element = pool->alloc(gfp_mask, pool->pool_data);
97 if (unlikely(!element)) { 99 if (unlikely(!element)) {
98 mempool_destroy(pool); 100 mempool_destroy(pool);
99 return NULL; 101 return NULL;
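For callers, the visible effect of the mempool change is two extra arguments to mempool_create_node(): the allocation mask used while the pool is filled and the NUMA node to fill it from, with mempool_create() itself now just wrapping the call with GFP_KERNEL and NUMA_NO_NODE. A hypothetical caller sketch; the demo_* names and element type are invented, only the mempool signatures come from the hunk.

#include <linux/mempool.h>
#include <linux/slab.h>
#include <linux/gfp.h>

struct demo_elem { char buf[256]; };

static void *demo_alloc(gfp_t gfp_mask, void *pool_data)
{
        return kmalloc(sizeof(struct demo_elem), gfp_mask);
}

static void demo_free(void *element, void *pool_data)
{
        kfree(element);
}

static mempool_t *demo_create_pool(int nid)
{
        /* The old API took (min_nr, alloc_fn, free_fn, pool_data, node_id);
         * the new one also takes the gfp mask used for the initial fill. */
        return mempool_create_node(16, demo_alloc, demo_free, NULL,
                                   GFP_KERNEL, nid);
}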
diff --git a/mm/migrate.c b/mm/migrate.c
index be26d5cbe56..77ed2d77370 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -33,6 +33,7 @@
33#include <linux/memcontrol.h> 33#include <linux/memcontrol.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/hugetlb.h> 35#include <linux/hugetlb.h>
36#include <linux/hugetlb_cgroup.h>
36#include <linux/gfp.h> 37#include <linux/gfp.h>
37 38
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
@@ -682,7 +683,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
682{ 683{
683 int rc = -EAGAIN; 684 int rc = -EAGAIN;
684 int remap_swapcache = 1; 685 int remap_swapcache = 1;
685 int charge = 0;
686 struct mem_cgroup *mem; 686 struct mem_cgroup *mem;
687 struct anon_vma *anon_vma = NULL; 687 struct anon_vma *anon_vma = NULL;
688 688
@@ -724,12 +724,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
724 } 724 }
725 725
726 /* charge against new page */ 726 /* charge against new page */
727 charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL); 727 mem_cgroup_prepare_migration(page, newpage, &mem);
728 if (charge == -ENOMEM) {
729 rc = -ENOMEM;
730 goto unlock;
731 }
732 BUG_ON(charge);
733 728
734 if (PageWriteback(page)) { 729 if (PageWriteback(page)) {
735 /* 730 /*
@@ -819,8 +814,7 @@ skip_unmap:
819 put_anon_vma(anon_vma); 814 put_anon_vma(anon_vma);
820 815
821uncharge: 816uncharge:
822 if (!charge) 817 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
823 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
824unlock: 818unlock:
825 unlock_page(page); 819 unlock_page(page);
826out: 820out:
@@ -931,16 +925,13 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
931 925
932 if (anon_vma) 926 if (anon_vma)
933 put_anon_vma(anon_vma); 927 put_anon_vma(anon_vma);
934 unlock_page(hpage);
935 928
936out: 929 if (!rc)
937 if (rc != -EAGAIN) { 930 hugetlb_cgroup_migrate(hpage, new_hpage);
938 list_del(&hpage->lru);
939 put_page(hpage);
940 }
941 931
932 unlock_page(hpage);
933out:
942 put_page(new_hpage); 934 put_page(new_hpage);
943
944 if (result) { 935 if (result) {
945 if (rc) 936 if (rc)
946 *result = rc; 937 *result = rc;
@@ -1016,48 +1007,32 @@ out:
1016 return nr_failed + retry; 1007 return nr_failed + retry;
1017} 1008}
1018 1009
1019int migrate_huge_pages(struct list_head *from, 1010int migrate_huge_page(struct page *hpage, new_page_t get_new_page,
1020 new_page_t get_new_page, unsigned long private, bool offlining, 1011 unsigned long private, bool offlining,
1021 enum migrate_mode mode) 1012 enum migrate_mode mode)
1022{ 1013{
1023 int retry = 1; 1014 int pass, rc;
1024 int nr_failed = 0; 1015
1025 int pass = 0; 1016 for (pass = 0; pass < 10; pass++) {
1026 struct page *page; 1017 rc = unmap_and_move_huge_page(get_new_page,
1027 struct page *page2; 1018 private, hpage, pass > 2, offlining,
1028 int rc; 1019 mode);
1029 1020 switch (rc) {
1030 for (pass = 0; pass < 10 && retry; pass++) { 1021 case -ENOMEM:
1031 retry = 0; 1022 goto out;
1032 1023 case -EAGAIN:
1033 list_for_each_entry_safe(page, page2, from, lru) { 1024 /* try again */
1034 cond_resched(); 1025 cond_resched();
1035 1026 break;
1036 rc = unmap_and_move_huge_page(get_new_page, 1027 case 0:
1037 private, page, pass > 2, offlining, 1028 goto out;
1038 mode); 1029 default:
1039 1030 rc = -EIO;
1040 switch(rc) { 1031 goto out;
1041 case -ENOMEM:
1042 goto out;
1043 case -EAGAIN:
1044 retry++;
1045 break;
1046 case 0:
1047 break;
1048 default:
1049 /* Permanent failure */
1050 nr_failed++;
1051 break;
1052 }
1053 } 1032 }
1054 } 1033 }
1055 rc = 0;
1056out: 1034out:
1057 if (rc) 1035 return rc;
1058 return rc;
1059
1060 return nr_failed + retry;
1061} 1036}
1062 1037
1063#ifdef CONFIG_NUMA 1038#ifdef CONFIG_NUMA
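Callers of the old migrate_huge_pages() built a one-entry list and cleaned it up on failure; with migrate_huge_page() they hand the page over directly and keep managing their own reference, as the soft_offline_huge_page() hunk in memory-failure.c above now does. A hypothetical caller sketch; the demo_* names are invented, and the migrate_huge_page() arguments mirror that soft-offline call.

#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>

static struct page *demo_new_huge_page(struct page *page,
                                       unsigned long private, int **result)
{
        return alloc_huge_page_node(page_hstate(compound_head(page)),
                                    page_to_nid(page));
}

static int demo_migrate_one_huge_page(struct page *hpage)
{
        int ret;

        /* No temporary list, no putback: a single page goes straight in. */
        ret = migrate_huge_page(hpage, demo_new_huge_page, MPOL_MF_MOVE_ALL,
                                false, MIGRATE_SYNC);
        put_page(hpage);        /* drop the reference the caller took earlier */
        return ret;
}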
diff --git a/mm/mmap.c b/mm/mmap.c
index 3edfcdfa42d..ae18a48e7e4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -943,6 +943,8 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
943 const unsigned long stack_flags 943 const unsigned long stack_flags
944 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); 944 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
945 945
946 mm->total_vm += pages;
947
946 if (file) { 948 if (file) {
947 mm->shared_vm += pages; 949 mm->shared_vm += pages;
948 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) 950 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
@@ -1347,7 +1349,6 @@ munmap_back:
1347out: 1349out:
1348 perf_event_mmap(vma); 1350 perf_event_mmap(vma);
1349 1351
1350 mm->total_vm += len >> PAGE_SHIFT;
1351 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1352 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1352 if (vm_flags & VM_LOCKED) { 1353 if (vm_flags & VM_LOCKED) {
1353 if (!mlock_vma_pages_range(vma, addr, addr + len)) 1354 if (!mlock_vma_pages_range(vma, addr, addr + len))
@@ -1355,9 +1356,8 @@ out:
1355 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) 1356 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
1356 make_pages_present(addr, addr + len); 1357 make_pages_present(addr, addr + len);
1357 1358
1358 if (file && uprobe_mmap(vma)) 1359 if (file)
1359 /* matching probes but cannot insert */ 1360 uprobe_mmap(vma);
1360 goto unmap_and_free_vma;
1361 1361
1362 return addr; 1362 return addr;
1363 1363
@@ -1707,7 +1707,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
1707 return -ENOMEM; 1707 return -ENOMEM;
1708 1708
1709 /* Ok, everything looks good - let it rip */ 1709 /* Ok, everything looks good - let it rip */
1710 mm->total_vm += grow;
1711 if (vma->vm_flags & VM_LOCKED) 1710 if (vma->vm_flags & VM_LOCKED)
1712 mm->locked_vm += grow; 1711 mm->locked_vm += grow;
1713 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); 1712 vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
@@ -1889,7 +1888,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1889 1888
1890 if (vma->vm_flags & VM_ACCOUNT) 1889 if (vma->vm_flags & VM_ACCOUNT)
1891 nr_accounted += nrpages; 1890 nr_accounted += nrpages;
1892 mm->total_vm -= nrpages;
1893 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 1891 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
1894 vma = remove_vma(vma); 1892 vma = remove_vma(vma);
1895 } while (vma); 1893 } while (vma);
@@ -2310,7 +2308,7 @@ void exit_mmap(struct mm_struct *mm)
2310 } 2308 }
2311 vm_unacct_memory(nr_accounted); 2309 vm_unacct_memory(nr_accounted);
2312 2310
2313 BUG_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); 2311 WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
2314} 2312}
2315 2313
2316/* Insert vm structure into process list sorted by address 2314/* Insert vm structure into process list sorted by address
@@ -2345,9 +2343,6 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2345 security_vm_enough_memory_mm(mm, vma_pages(vma))) 2343 security_vm_enough_memory_mm(mm, vma_pages(vma)))
2346 return -ENOMEM; 2344 return -ENOMEM;
2347 2345
2348 if (vma->vm_file && uprobe_mmap(vma))
2349 return -EINVAL;
2350
2351 vma_link(mm, vma, prev, rb_link, rb_parent); 2346 vma_link(mm, vma, prev, rb_link, rb_parent);
2352 return 0; 2347 return 0;
2353} 2348}
@@ -2418,9 +2413,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2418 if (new_vma->vm_file) { 2413 if (new_vma->vm_file) {
2419 get_file(new_vma->vm_file); 2414 get_file(new_vma->vm_file);
2420 2415
2421 if (uprobe_mmap(new_vma))
2422 goto out_free_mempol;
2423
2424 if (vma->vm_flags & VM_EXECUTABLE) 2416 if (vma->vm_flags & VM_EXECUTABLE)
2425 added_exe_file_vma(mm); 2417 added_exe_file_vma(mm);
2426 } 2418 }
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 9a611d3a184..862b60822d9 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -33,6 +33,24 @@
33void __mmu_notifier_release(struct mm_struct *mm) 33void __mmu_notifier_release(struct mm_struct *mm)
34{ 34{
35 struct mmu_notifier *mn; 35 struct mmu_notifier *mn;
36 struct hlist_node *n;
37
38 /*
39 * RCU here will block mmu_notifier_unregister until
40 * ->release returns.
41 */
42 rcu_read_lock();
43 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
44 /*
45 * if ->release runs before mmu_notifier_unregister it
46 * must be handled as it's the only way for the driver
47 * to flush all existing sptes and stop the driver
48 * from establishing any more sptes before all the
49 * pages in the mm are freed.
50 */
51 if (mn->ops->release)
52 mn->ops->release(mn, mm);
53 rcu_read_unlock();
36 54
37 spin_lock(&mm->mmu_notifier_mm->lock); 55 spin_lock(&mm->mmu_notifier_mm->lock);
38 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { 56 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
@@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm)
46 * mmu_notifier_unregister to return. 64 * mmu_notifier_unregister to return.
47 */ 65 */
48 hlist_del_init_rcu(&mn->hlist); 66 hlist_del_init_rcu(&mn->hlist);
49 /*
50 * RCU here will block mmu_notifier_unregister until
51 * ->release returns.
52 */
53 rcu_read_lock();
54 spin_unlock(&mm->mmu_notifier_mm->lock);
55 /*
56 * if ->release runs before mmu_notifier_unregister it
57 * must be handled as it's the only way for the driver
58 * to flush all existing sptes and stop the driver
59 * from establishing any more sptes before all the
60 * pages in the mm are freed.
61 */
62 if (mn->ops->release)
63 mn->ops->release(mn, mm);
64 rcu_read_unlock();
65 spin_lock(&mm->mmu_notifier_mm->lock);
66 } 67 }
67 spin_unlock(&mm->mmu_notifier_mm->lock); 68 spin_unlock(&mm->mmu_notifier_mm->lock);
68 69
@@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
284{ 285{
285 BUG_ON(atomic_read(&mm->mm_count) <= 0); 286 BUG_ON(atomic_read(&mm->mm_count) <= 0);
286 287
287 spin_lock(&mm->mmu_notifier_mm->lock);
288 if (!hlist_unhashed(&mn->hlist)) { 288 if (!hlist_unhashed(&mn->hlist)) {
289 hlist_del_rcu(&mn->hlist);
290
291 /* 289 /*
292 * RCU here will force exit_mmap to wait ->release to finish 290 * RCU here will force exit_mmap to wait ->release to finish
293 * before freeing the pages. 291 * before freeing the pages.
294 */ 292 */
295 rcu_read_lock(); 293 rcu_read_lock();
296 spin_unlock(&mm->mmu_notifier_mm->lock); 294
297 /* 295 /*
298 * exit_mmap will block in mmu_notifier_release to 296 * exit_mmap will block in mmu_notifier_release to
299 * guarantee ->release is called before freeing the 297 * guarantee ->release is called before freeing the
@@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
302 if (mn->ops->release) 300 if (mn->ops->release)
303 mn->ops->release(mn, mm); 301 mn->ops->release(mn, mm);
304 rcu_read_unlock(); 302 rcu_read_unlock();
305 } else 303
304 spin_lock(&mm->mmu_notifier_mm->lock);
305 hlist_del_rcu(&mn->hlist);
306 spin_unlock(&mm->mmu_notifier_mm->lock); 306 spin_unlock(&mm->mmu_notifier_mm->lock);
307 }
307 308
308 /* 309 /*
309 * Wait any running method to finish, of course including 310 * Wait any running method to finish, of course including
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 6830eab5bf0..3cef80f6ac7 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -96,7 +96,7 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone)
96 for_each_lru(lru) 96 for_each_lru(lru)
97 INIT_LIST_HEAD(&lruvec->lists[lru]); 97 INIT_LIST_HEAD(&lruvec->lists[lru]);
98 98
99#ifdef CONFIG_CGROUP_MEM_RES_CTLR 99#ifdef CONFIG_MEMCG
100 lruvec->zone = zone; 100 lruvec->zone = zone;
101#endif 101#endif
102} 102}
diff --git a/mm/mremap.c b/mm/mremap.c
index 21fed202dda..cc06d0e48d0 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -260,7 +260,6 @@ static unsigned long move_vma(struct vm_area_struct *vma,
260 * If this were a serious issue, we'd add a flag to do_munmap(). 260 * If this were a serious issue, we'd add a flag to do_munmap().
261 */ 261 */
262 hiwater_vm = mm->hiwater_vm; 262 hiwater_vm = mm->hiwater_vm;
263 mm->total_vm += new_len >> PAGE_SHIFT;
264 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); 263 vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
265 264
266 if (do_munmap(mm, old_addr, old_len) < 0) { 265 if (do_munmap(mm, old_addr, old_len) < 0) {
@@ -497,7 +496,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
497 goto out; 496 goto out;
498 } 497 }
499 498
500 mm->total_vm += pages;
501 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 499 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
502 if (vma->vm_flags & VM_LOCKED) { 500 if (vma->vm_flags & VM_LOCKED) {
503 mm->locked_vm += pages; 501 mm->locked_vm += pages;
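The mmap.c and mremap.c hunks above drop their open-coded mm->total_vm updates because vm_stat_account() now adjusts total_vm itself. A hypothetical caller fragment showing the resulting pattern; demo_account_growth() is invented, only the vm_stat_account() signature comes from the hunks.

#include <linux/mm.h>

static void demo_account_growth(struct mm_struct *mm,
                                struct vm_area_struct *vma, long pages)
{
        /* Previously: mm->total_vm += pages; followed by vm_stat_account().
         * Now a single call keeps total_vm and the shared/exec/stack
         * counters consistent with one signed delta. */
        vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
}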
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ac300c99baf..19860086163 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -288,76 +288,93 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
288} 288}
289#endif 289#endif
290 290
291enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
292 unsigned long totalpages, const nodemask_t *nodemask,
293 bool force_kill)
294{
295 if (task->exit_state)
296 return OOM_SCAN_CONTINUE;
297 if (oom_unkillable_task(task, NULL, nodemask))
298 return OOM_SCAN_CONTINUE;
299
300 /*
301 * This task already has access to memory reserves and is being killed.
302 * Don't allow any other task to have access to the reserves.
303 */
304 if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
305 if (unlikely(frozen(task)))
306 __thaw_task(task);
307 if (!force_kill)
308 return OOM_SCAN_ABORT;
309 }
310 if (!task->mm)
311 return OOM_SCAN_CONTINUE;
312
313 if (task->flags & PF_EXITING) {
314 /*
315 * If task is current and is in the process of releasing memory,
316 * allow the "kill" to set TIF_MEMDIE, which will allow it to
317 * access memory reserves. Otherwise, it may stall forever.
318 *
319 * The iteration isn't broken here, however, in case other
320 * threads are found to have already been oom killed.
321 */
322 if (task == current)
323 return OOM_SCAN_SELECT;
324 else if (!force_kill) {
325 /*
326 * If this task is not being ptraced on exit, then wait
327 * for it to finish before killing some other task
328 * unnecessarily.
329 */
330 if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
331 return OOM_SCAN_ABORT;
332 }
333 }
334 return OOM_SCAN_OK;
335}
336
291/* 337/*
292 * Simple selection loop. We choose the process with the highest 338 * Simple selection loop. We choose the process with the highest
293 * number of 'points'. We expect the caller will lock the tasklist. 339 * number of 'points'.
294 * 340 *
295 * (not docbooked, we don't want this one cluttering up the manual) 341 * (not docbooked, we don't want this one cluttering up the manual)
296 */ 342 */
297static struct task_struct *select_bad_process(unsigned int *ppoints, 343static struct task_struct *select_bad_process(unsigned int *ppoints,
298 unsigned long totalpages, struct mem_cgroup *memcg, 344 unsigned long totalpages, const nodemask_t *nodemask,
299 const nodemask_t *nodemask, bool force_kill) 345 bool force_kill)
300{ 346{
301 struct task_struct *g, *p; 347 struct task_struct *g, *p;
302 struct task_struct *chosen = NULL; 348 struct task_struct *chosen = NULL;
303 unsigned long chosen_points = 0; 349 unsigned long chosen_points = 0;
304 350
351 rcu_read_lock();
305 do_each_thread(g, p) { 352 do_each_thread(g, p) {
306 unsigned int points; 353 unsigned int points;
307 354
308 if (p->exit_state) 355 switch (oom_scan_process_thread(p, totalpages, nodemask,
309 continue; 356 force_kill)) {
310 if (oom_unkillable_task(p, memcg, nodemask)) 357 case OOM_SCAN_SELECT:
311 continue; 358 chosen = p;
312 359 chosen_points = ULONG_MAX;
313 /* 360 /* fall through */
314 * This task already has access to memory reserves and is 361 case OOM_SCAN_CONTINUE:
315 * being killed. Don't allow any other task access to the
316 * memory reserve.
317 *
318 * Note: this may have a chance of deadlock if it gets
319 * blocked waiting for another task which itself is waiting
320 * for memory. Is there a better alternative?
321 */
322 if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
323 if (unlikely(frozen(p)))
324 __thaw_task(p);
325 if (!force_kill)
326 return ERR_PTR(-1UL);
327 }
328 if (!p->mm)
329 continue; 362 continue;
330 363 case OOM_SCAN_ABORT:
331 if (p->flags & PF_EXITING) { 364 rcu_read_unlock();
332 /* 365 return ERR_PTR(-1UL);
333 * If p is the current task and is in the process of 366 case OOM_SCAN_OK:
334 * releasing memory, we allow the "kill" to set 367 break;
335 * TIF_MEMDIE, which will allow it to gain access to 368 };
336 * memory reserves. Otherwise, it may stall forever. 369 points = oom_badness(p, NULL, nodemask, totalpages);
337 *
338 * The loop isn't broken here, however, in case other
339 * threads are found to have already been oom killed.
340 */
341 if (p == current) {
342 chosen = p;
343 chosen_points = ULONG_MAX;
344 } else if (!force_kill) {
345 /*
346 * If this task is not being ptraced on exit,
347 * then wait for it to finish before killing
348 * some other task unnecessarily.
349 */
350 if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
351 return ERR_PTR(-1UL);
352 }
353 }
354
355 points = oom_badness(p, memcg, nodemask, totalpages);
356 if (points > chosen_points) { 370 if (points > chosen_points) {
357 chosen = p; 371 chosen = p;
358 chosen_points = points; 372 chosen_points = points;
359 } 373 }
360 } while_each_thread(g, p); 374 } while_each_thread(g, p);
375 if (chosen)
376 get_task_struct(chosen);
377 rcu_read_unlock();
361 378
362 *ppoints = chosen_points * 1000 / totalpages; 379 *ppoints = chosen_points * 1000 / totalpages;
363 return chosen; 380 return chosen;
@@ -371,17 +388,16 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
371 * Dumps the current memory state of all eligible tasks. Tasks not in the same 388 * Dumps the current memory state of all eligible tasks. Tasks not in the same
372 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes 389 * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
373 * are not shown. 390 * are not shown.
374 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj 391 * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes,
375 * value, oom_score_adj value, and name. 392 * swapents, oom_score_adj value, and name.
376 *
377 * Call with tasklist_lock read-locked.
378 */ 393 */
379static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) 394static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
380{ 395{
381 struct task_struct *p; 396 struct task_struct *p;
382 struct task_struct *task; 397 struct task_struct *task;
383 398
384 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); 399 pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n");
400 rcu_read_lock();
385 for_each_process(p) { 401 for_each_process(p) {
386 if (oom_unkillable_task(p, memcg, nodemask)) 402 if (oom_unkillable_task(p, memcg, nodemask))
387 continue; 403 continue;
@@ -396,13 +412,15 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
396 continue; 412 continue;
397 } 413 }
398 414
399 pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", 415 pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
400 task->pid, from_kuid(&init_user_ns, task_uid(task)), 416 task->pid, from_kuid(&init_user_ns, task_uid(task)),
401 task->tgid, task->mm->total_vm, get_mm_rss(task->mm), 417 task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
402 task_cpu(task), task->signal->oom_adj, 418 task->mm->nr_ptes,
419 get_mm_counter(task->mm, MM_SWAPENTS),
403 task->signal->oom_score_adj, task->comm); 420 task->signal->oom_score_adj, task->comm);
404 task_unlock(task); 421 task_unlock(task);
405 } 422 }
423 rcu_read_unlock();
406} 424}
407 425
408static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 426static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
@@ -423,10 +441,14 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
423} 441}
424 442
425#define K(x) ((x) << (PAGE_SHIFT-10)) 443#define K(x) ((x) << (PAGE_SHIFT-10))
426static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 444/*
427 unsigned int points, unsigned long totalpages, 445 * Must be called while holding a reference to p, which will be released upon
428 struct mem_cgroup *memcg, nodemask_t *nodemask, 446 * returning.
429 const char *message) 447 */
448void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
449 unsigned int points, unsigned long totalpages,
450 struct mem_cgroup *memcg, nodemask_t *nodemask,
451 const char *message)
430{ 452{
431 struct task_struct *victim = p; 453 struct task_struct *victim = p;
432 struct task_struct *child; 454 struct task_struct *child;
@@ -442,6 +464,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
442 */ 464 */
443 if (p->flags & PF_EXITING) { 465 if (p->flags & PF_EXITING) {
444 set_tsk_thread_flag(p, TIF_MEMDIE); 466 set_tsk_thread_flag(p, TIF_MEMDIE);
467 put_task_struct(p);
445 return; 468 return;
446 } 469 }
447 470
@@ -459,6 +482,7 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
459 * parent. This attempts to lose the minimal amount of work done while 482 * parent. This attempts to lose the minimal amount of work done while
460 * still freeing memory. 483 * still freeing memory.
461 */ 484 */
485 read_lock(&tasklist_lock);
462 do { 486 do {
463 list_for_each_entry(child, &t->children, sibling) { 487 list_for_each_entry(child, &t->children, sibling) {
464 unsigned int child_points; 488 unsigned int child_points;
@@ -471,15 +495,26 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
471 child_points = oom_badness(child, memcg, nodemask, 495 child_points = oom_badness(child, memcg, nodemask,
472 totalpages); 496 totalpages);
473 if (child_points > victim_points) { 497 if (child_points > victim_points) {
498 put_task_struct(victim);
474 victim = child; 499 victim = child;
475 victim_points = child_points; 500 victim_points = child_points;
501 get_task_struct(victim);
476 } 502 }
477 } 503 }
478 } while_each_thread(p, t); 504 } while_each_thread(p, t);
505 read_unlock(&tasklist_lock);
479 506
480 victim = find_lock_task_mm(victim); 507 rcu_read_lock();
481 if (!victim) 508 p = find_lock_task_mm(victim);
509 if (!p) {
510 rcu_read_unlock();
511 put_task_struct(victim);
482 return; 512 return;
513 } else if (victim != p) {
514 get_task_struct(p);
515 put_task_struct(victim);
516 victim = p;
517 }
483 518
484 /* mm cannot safely be dereferenced after task_unlock(victim) */ 519 /* mm cannot safely be dereferenced after task_unlock(victim) */
485 mm = victim->mm; 520 mm = victim->mm;
@@ -510,17 +545,19 @@ static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
510 task_unlock(p); 545 task_unlock(p);
511 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); 546 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
512 } 547 }
548 rcu_read_unlock();
513 549
514 set_tsk_thread_flag(victim, TIF_MEMDIE); 550 set_tsk_thread_flag(victim, TIF_MEMDIE);
515 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); 551 do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
552 put_task_struct(victim);
516} 553}
517#undef K 554#undef K
518 555
519/* 556/*
520 * Determines whether the kernel must panic because of the panic_on_oom sysctl. 557 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
521 */ 558 */
522static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, 559void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
523 int order, const nodemask_t *nodemask) 560 int order, const nodemask_t *nodemask)
524{ 561{
525 if (likely(!sysctl_panic_on_oom)) 562 if (likely(!sysctl_panic_on_oom))
526 return; 563 return;
@@ -533,42 +570,11 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
533 if (constraint != CONSTRAINT_NONE) 570 if (constraint != CONSTRAINT_NONE)
534 return; 571 return;
535 } 572 }
536 read_lock(&tasklist_lock);
537 dump_header(NULL, gfp_mask, order, NULL, nodemask); 573 dump_header(NULL, gfp_mask, order, NULL, nodemask);
538 read_unlock(&tasklist_lock);
539 panic("Out of memory: %s panic_on_oom is enabled\n", 574 panic("Out of memory: %s panic_on_oom is enabled\n",
540 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); 575 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
541} 576}
542 577
543#ifdef CONFIG_CGROUP_MEM_RES_CTLR
544void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
545 int order)
546{
547 unsigned long limit;
548 unsigned int points = 0;
549 struct task_struct *p;
550
551 /*
552 * If current has a pending SIGKILL, then automatically select it. The
553 * goal is to allow it to allocate so that it may quickly exit and free
554 * its memory.
555 */
556 if (fatal_signal_pending(current)) {
557 set_thread_flag(TIF_MEMDIE);
558 return;
559 }
560
561 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
562 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
563 read_lock(&tasklist_lock);
564 p = select_bad_process(&points, limit, memcg, NULL, false);
565 if (p && PTR_ERR(p) != -1UL)
566 oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL,
567 "Memory cgroup out of memory");
568 read_unlock(&tasklist_lock);
569}
570#endif
571
572static BLOCKING_NOTIFIER_HEAD(oom_notify_list); 578static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
573 579
574int register_oom_notifier(struct notifier_block *nb) 580int register_oom_notifier(struct notifier_block *nb)
@@ -690,7 +696,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
690 struct task_struct *p; 696 struct task_struct *p;
691 unsigned long totalpages; 697 unsigned long totalpages;
692 unsigned long freed = 0; 698 unsigned long freed = 0;
693 unsigned int points; 699 unsigned int uninitialized_var(points);
694 enum oom_constraint constraint = CONSTRAINT_NONE; 700 enum oom_constraint constraint = CONSTRAINT_NONE;
695 int killed = 0; 701 int killed = 0;
696 702
@@ -718,22 +724,20 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
718 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; 724 mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
719 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); 725 check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
720 726
721 read_lock(&tasklist_lock); 727 if (sysctl_oom_kill_allocating_task && current->mm &&
722 if (sysctl_oom_kill_allocating_task &&
723 !oom_unkillable_task(current, NULL, nodemask) && 728 !oom_unkillable_task(current, NULL, nodemask) &&
724 current->mm) { 729 current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
730 get_task_struct(current);
725 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, 731 oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
726 nodemask, 732 nodemask,
727 "Out of memory (oom_kill_allocating_task)"); 733 "Out of memory (oom_kill_allocating_task)");
728 goto out; 734 goto out;
729 } 735 }
730 736
731 p = select_bad_process(&points, totalpages, NULL, mpol_mask, 737 p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
732 force_kill);
733 /* Found nothing?!?! Either we hang forever, or we panic. */ 738 /* Found nothing?!?! Either we hang forever, or we panic. */
734 if (!p) { 739 if (!p) {
735 dump_header(NULL, gfp_mask, order, NULL, mpol_mask); 740 dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
736 read_unlock(&tasklist_lock);
737 panic("Out of memory and no killable processes...\n"); 741 panic("Out of memory and no killable processes...\n");
738 } 742 }
739 if (PTR_ERR(p) != -1UL) { 743 if (PTR_ERR(p) != -1UL) {
@@ -742,14 +746,12 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
742 killed = 1; 746 killed = 1;
743 } 747 }
744out: 748out:
745 read_unlock(&tasklist_lock);
746
747 /* 749 /*
748 * Give "p" a good chance of killing itself before we 750 * Give the killed threads a good chance of exiting before trying to
749 * retry to allocate memory unless "p" is current 751 * allocate memory again.
750 */ 752 */
751 if (killed && !test_thread_flag(TIF_MEMDIE)) 753 if (killed)
752 schedule_timeout_uninterruptible(1); 754 schedule_timeout_killable(1);
753} 755}
754 756
755/* 757/*
@@ -764,6 +766,5 @@ void pagefault_out_of_memory(void)
764 out_of_memory(NULL, 0, 0, NULL, false); 766 out_of_memory(NULL, 0, 0, NULL, false);
765 clear_system_oom(); 767 clear_system_oom();
766 } 768 }
767 if (!test_thread_flag(TIF_MEMDIE)) 769 schedule_timeout_killable(1);
768 schedule_timeout_uninterruptible(1);
769} 770}
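After this rewrite the victim-selection path hands out referenced tasks: select_bad_process() takes its reference under rcu_read_lock(), and oom_kill_process() is documented as entered with that reference held, dropping it on every exit path. Any other caller has to follow the same contract, as the oom_kill_allocating_task branch above now does with get_task_struct(current). A hypothetical caller sketch; demo_kill_current() is invented, the oom_kill_process() arguments mirror that branch, and the declaration is assumed to be exported via <linux/oom.h> as the now non-static definition implies.

#include <linux/oom.h>
#include <linux/sched.h>
#include <linux/gfp.h>
#include <linux/nodemask.h>

static void demo_kill_current(gfp_t gfp_mask, int order,
                              unsigned long totalpages, nodemask_t *nodemask)
{
        get_task_struct(current);       /* reference consumed by oom_kill_process() */
        oom_kill_process(current, gfp_mask, order, 0, totalpages,
                         NULL, nodemask, "demo: out of memory");
}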
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 93d8d2f7108..5ad5ce23c1e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */ 35#include <linux/buffer_head.h> /* __set_page_dirty_buffers */
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <linux/timer.h>
37#include <trace/events/writeback.h> 38#include <trace/events/writeback.h>
38 39
39/* 40/*
@@ -135,7 +136,20 @@ unsigned long global_dirty_limit;
135 * measured in page writeback completions. 136 * measured in page writeback completions.
136 * 137 *
137 */ 138 */
138static struct prop_descriptor vm_completions; 139static struct fprop_global writeout_completions;
140
141static void writeout_period(unsigned long t);
142/* Timer for aging of writeout_completions */
143static struct timer_list writeout_period_timer =
144 TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
145static unsigned long writeout_period_time = 0;
146
147/*
148 * Length of period for aging writeout fractions of bdis. This is an
149 * arbitrarily chosen number. The longer the period, the slower fractions will
150 * reflect changes in current writeout rate.
151 */
152#define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
139 153
140/* 154/*
141 * Work out the current dirty-memory clamping and background writeout 155 * Work out the current dirty-memory clamping and background writeout
@@ -322,34 +336,6 @@ bool zone_dirty_ok(struct zone *zone)
322 zone_page_state(zone, NR_WRITEBACK) <= limit; 336 zone_page_state(zone, NR_WRITEBACK) <= limit;
323} 337}
324 338
325/*
326 * couple the period to the dirty_ratio:
327 *
328 * period/2 ~ roundup_pow_of_two(dirty limit)
329 */
330static int calc_period_shift(void)
331{
332 unsigned long dirty_total;
333
334 if (vm_dirty_bytes)
335 dirty_total = vm_dirty_bytes / PAGE_SIZE;
336 else
337 dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) /
338 100;
339 return 2 + ilog2(dirty_total - 1);
340}
341
342/*
343 * update the period when the dirty threshold changes.
344 */
345static void update_completion_period(void)
346{
347 int shift = calc_period_shift();
348 prop_change_shift(&vm_completions, shift);
349
350 writeback_set_ratelimit();
351}
352
353int dirty_background_ratio_handler(struct ctl_table *table, int write, 339int dirty_background_ratio_handler(struct ctl_table *table, int write,
354 void __user *buffer, size_t *lenp, 340 void __user *buffer, size_t *lenp,
355 loff_t *ppos) 341 loff_t *ppos)
@@ -383,7 +369,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
383 369
384 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 370 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
385 if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 371 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
386 update_completion_period(); 372 writeback_set_ratelimit();
387 vm_dirty_bytes = 0; 373 vm_dirty_bytes = 0;
388 } 374 }
389 return ret; 375 return ret;
@@ -398,12 +384,21 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
398 384
399 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); 385 ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
400 if (ret == 0 && write && vm_dirty_bytes != old_bytes) { 386 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
401 update_completion_period(); 387 writeback_set_ratelimit();
402 vm_dirty_ratio = 0; 388 vm_dirty_ratio = 0;
403 } 389 }
404 return ret; 390 return ret;
405} 391}
406 392
393static unsigned long wp_next_time(unsigned long cur_time)
394{
395 cur_time += VM_COMPLETIONS_PERIOD_LEN;
396 /* 0 has a special meaning... */
397 if (!cur_time)
398 return 1;
399 return cur_time;
400}
401
407/* 402/*
408 * Increment the BDI's writeout completion count and the global writeout 403 * Increment the BDI's writeout completion count and the global writeout
409 * completion count. Called from test_clear_page_writeback(). 404 * completion count. Called from test_clear_page_writeback().
@@ -411,8 +406,19 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
411static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) 406static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
412{ 407{
413 __inc_bdi_stat(bdi, BDI_WRITTEN); 408 __inc_bdi_stat(bdi, BDI_WRITTEN);
414 __prop_inc_percpu_max(&vm_completions, &bdi->completions, 409 __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
415 bdi->max_prop_frac); 410 bdi->max_prop_frac);
411 /* First event after period switching was turned off? */
412 if (!unlikely(writeout_period_time)) {
413 /*
414 * We can race with other __bdi_writeout_inc calls here but
415 * it does not cause any harm since the resulting time when
416 * timer will fire and what is in writeout_period_time will be
417 * roughly the same.
418 */
419 writeout_period_time = wp_next_time(jiffies);
420 mod_timer(&writeout_period_timer, writeout_period_time);
421 }
416} 422}
417 423
418void bdi_writeout_inc(struct backing_dev_info *bdi) 424void bdi_writeout_inc(struct backing_dev_info *bdi)
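The aging machinery introduced above arms its deferred timer lazily from __bdi_writeout_inc() and treats writeout_period_time == 0 as the "timer not running" sentinel, which is why wp_next_time() never returns 0 even when the jiffies arithmetic wraps. A minimal user-space C model of that sentinel handling (the function name and the 3*HZ period come from the hunk; HZ and the sample values are assumptions for the demo):

#include <stdio.h>

#define HZ                        1000UL
#define VM_COMPLETIONS_PERIOD_LEN (3 * HZ)

/* 0 means "aging timer not armed", so never return it as a timestamp. */
static unsigned long wp_next_time(unsigned long cur_time)
{
        cur_time += VM_COMPLETIONS_PERIOD_LEN;
        if (!cur_time)
                return 1;
        return cur_time;
}

int main(void)
{
        /* Normal case: the next firing time is simply one period later. */
        printf("%lu\n", wp_next_time(10 * HZ));                         /* 13000 */
        /* Wrap case: the addition lands exactly on 0, so nudge to 1. */
        printf("%lu\n", wp_next_time(0UL - VM_COMPLETIONS_PERIOD_LEN)); /* 1 */
        return 0;
}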
@@ -431,11 +437,33 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc);
431static void bdi_writeout_fraction(struct backing_dev_info *bdi, 437static void bdi_writeout_fraction(struct backing_dev_info *bdi,
432 long *numerator, long *denominator) 438 long *numerator, long *denominator)
433{ 439{
434 prop_fraction_percpu(&vm_completions, &bdi->completions, 440 fprop_fraction_percpu(&writeout_completions, &bdi->completions,
435 numerator, denominator); 441 numerator, denominator);
436} 442}
437 443
438/* 444/*
 445 * On an idle system, we can be called long after we scheduled because we use
 446 * deferred timers, so account for missed periods.
447 */
448static void writeout_period(unsigned long t)
449{
450 int miss_periods = (jiffies - writeout_period_time) /
451 VM_COMPLETIONS_PERIOD_LEN;
452
453 if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
454 writeout_period_time = wp_next_time(writeout_period_time +
455 miss_periods * VM_COMPLETIONS_PERIOD_LEN);
456 mod_timer(&writeout_period_timer, writeout_period_time);
457 } else {
458 /*
459 * Aging has zeroed all fractions. Stop wasting CPU on period
460 * updates.
461 */
462 writeout_period_time = 0;
463 }
464}
465
466/*
439 * bdi_min_ratio keeps the sum of the minimum dirty shares of all 467 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
440 * registered backing devices, which, for obvious reasons, can not 468 * registered backing devices, which, for obvious reasons, can not
441 * exceed 100%. 469 * exceed 100%.
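Back in writeout_period() above: because the timer is deferrable, the handler can run long after its nominal expiry, so it folds every missed period into a single fprop_new_period() call and re-arms relative to when it should have fired, or parks the timer once all fractions have aged to zero. A rough stand-alone C sketch of that catch-up arithmetic (fprop_new_period() is stubbed out and the clock values are invented; only the scheduling logic follows the patch):

#include <stdbool.h>
#include <stdio.h>

#define HZ                        1000UL
#define VM_COMPLETIONS_PERIOD_LEN (3 * HZ)

static unsigned long jiffies;              /* fake clock for the demo */
static unsigned long writeout_period_time; /* 0 == timer parked */

/* Stand-in for fprop_new_period(): pretend events remain after aging. */
static bool fprop_new_period_stub(int periods)
{
        printf("aging %d period(s)\n", periods);
        return true;
}

static void writeout_period(void)
{
        int miss_periods = (jiffies - writeout_period_time) /
                           VM_COMPLETIONS_PERIOD_LEN;

        if (fprop_new_period_stub(miss_periods + 1)) {
                /* Re-arm relative to when we should have run, not to now. */
                writeout_period_time += (miss_periods + 1) *
                                        VM_COMPLETIONS_PERIOD_LEN;
                printf("next firing at %lu\n", writeout_period_time);
        } else {
                writeout_period_time = 0;  /* nothing left to age: park timer */
        }
}

int main(void)
{
        writeout_period_time = 3 * HZ;  /* the timer was due at t = 3s */
        jiffies = 11 * HZ;              /* but it actually runs at t = 11s */
        writeout_period();              /* ages 3 periods, re-arms at 12s */
        return 0;
}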
@@ -475,7 +503,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
475 ret = -EINVAL; 503 ret = -EINVAL;
476 } else { 504 } else {
477 bdi->max_ratio = max_ratio; 505 bdi->max_ratio = max_ratio;
478 bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; 506 bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100;
479 } 507 }
480 spin_unlock_bh(&bdi_lock); 508 spin_unlock_bh(&bdi_lock);
481 509
@@ -918,7 +946,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
918 * bdi->dirty_ratelimit = balanced_dirty_ratelimit; 946 * bdi->dirty_ratelimit = balanced_dirty_ratelimit;
919 * 947 *
920 * However to get a more stable dirty_ratelimit, the below elaborated 948 * However to get a more stable dirty_ratelimit, the below elaborated
921 * code makes use of task_ratelimit to filter out sigular points and 949 * code makes use of task_ratelimit to filter out singular points and
922 * limit the step size. 950 * limit the step size.
923 * 951 *
924 * The below code essentially only uses the relative value of 952 * The below code essentially only uses the relative value of
@@ -941,7 +969,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
941 * feel and care are stable dirty rate and small position error. 969 * feel and care are stable dirty rate and small position error.
942 * 970 *
943 * |task_ratelimit - dirty_ratelimit| is used to limit the step size 971 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
944 * and filter out the sigular points of balanced_dirty_ratelimit. Which 972 * and filter out the singular points of balanced_dirty_ratelimit. Which
945 * keeps jumping around randomly and can even leap far away at times 973 * keeps jumping around randomly and can even leap far away at times
946 * due to the small 200ms estimation period of dirty_rate (we want to 974 * due to the small 200ms estimation period of dirty_rate (we want to
947 * keep that period small to reduce time lags). 975 * keep that period small to reduce time lags).
@@ -1504,7 +1532,6 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
1504 void __user *buffer, size_t *length, loff_t *ppos) 1532 void __user *buffer, size_t *length, loff_t *ppos)
1505{ 1533{
1506 proc_dointvec(table, write, buffer, length, ppos); 1534 proc_dointvec(table, write, buffer, length, ppos);
1507 bdi_arm_supers_timer();
1508 return 0; 1535 return 0;
1509} 1536}
1510 1537
@@ -1606,13 +1633,10 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
1606 */ 1633 */
1607void __init page_writeback_init(void) 1634void __init page_writeback_init(void)
1608{ 1635{
1609 int shift;
1610
1611 writeback_set_ratelimit(); 1636 writeback_set_ratelimit();
1612 register_cpu_notifier(&ratelimit_nb); 1637 register_cpu_notifier(&ratelimit_nb);
1613 1638
1614 shift = calc_period_shift(); 1639 fprop_global_init(&writeout_completions);
1615 prop_descriptor_init(&vm_completions, shift);
1616} 1640}
1617 1641
1618/** 1642/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4a4f9219683..c66fb875104 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,7 +51,6 @@
51#include <linux/page_cgroup.h> 51#include <linux/page_cgroup.h>
52#include <linux/debugobjects.h> 52#include <linux/debugobjects.h>
53#include <linux/kmemleak.h> 53#include <linux/kmemleak.h>
54#include <linux/memory.h>
55#include <linux/compaction.h> 54#include <linux/compaction.h>
56#include <trace/events/kmem.h> 55#include <trace/events/kmem.h>
57#include <linux/ftrace_event.h> 56#include <linux/ftrace_event.h>
@@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes);
219 218
220int page_group_by_mobility_disabled __read_mostly; 219int page_group_by_mobility_disabled __read_mostly;
221 220
222static void set_pageblock_migratetype(struct page *page, int migratetype) 221/*
222 * NOTE:
223 * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
224 * Instead, use {un}set_pageblock_isolate.
225 */
226void set_pageblock_migratetype(struct page *page, int migratetype)
223{ 227{
224 228
225 if (unlikely(page_group_by_mobility_disabled)) 229 if (unlikely(page_group_by_mobility_disabled))
@@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone,
954 return pages_moved; 958 return pages_moved;
955} 959}
956 960
957static int move_freepages_block(struct zone *zone, struct page *page, 961int move_freepages_block(struct zone *zone, struct page *page,
958 int migratetype) 962 int migratetype)
959{ 963{
960 unsigned long start_pfn, end_pfn; 964 unsigned long start_pfn, end_pfn;
@@ -1158,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
1158 to_drain = pcp->batch; 1162 to_drain = pcp->batch;
1159 else 1163 else
1160 to_drain = pcp->count; 1164 to_drain = pcp->count;
1161 free_pcppages_bulk(zone, to_drain, pcp); 1165 if (to_drain > 0) {
1162 pcp->count -= to_drain; 1166 free_pcppages_bulk(zone, to_drain, pcp);
1167 pcp->count -= to_drain;
1168 }
1163 local_irq_restore(flags); 1169 local_irq_restore(flags);
1164} 1170}
1165#endif 1171#endif
@@ -1529,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str)
1529} 1535}
1530__setup("fail_page_alloc=", setup_fail_page_alloc); 1536__setup("fail_page_alloc=", setup_fail_page_alloc);
1531 1537
1532static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1538static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1533{ 1539{
1534 if (order < fail_page_alloc.min_order) 1540 if (order < fail_page_alloc.min_order)
1535 return 0; 1541 return false;
1536 if (gfp_mask & __GFP_NOFAIL) 1542 if (gfp_mask & __GFP_NOFAIL)
1537 return 0; 1543 return false;
1538 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) 1544 if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
1539 return 0; 1545 return false;
1540 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) 1546 if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
1541 return 0; 1547 return false;
1542 1548
1543 return should_fail(&fail_page_alloc.attr, 1 << order); 1549 return should_fail(&fail_page_alloc.attr, 1 << order);
1544} 1550}
@@ -1578,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs);
1578 1584
1579#else /* CONFIG_FAIL_PAGE_ALLOC */ 1585#else /* CONFIG_FAIL_PAGE_ALLOC */
1580 1586
1581static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) 1587static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1582{ 1588{
1583 return 0; 1589 return false;
1584} 1590}
1585 1591
1586#endif /* CONFIG_FAIL_PAGE_ALLOC */ 1592#endif /* CONFIG_FAIL_PAGE_ALLOC */
@@ -1594,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1594{ 1600{
1595 /* free_pages my go negative - that's OK */ 1601 /* free_pages my go negative - that's OK */
1596 long min = mark; 1602 long min = mark;
1603 long lowmem_reserve = z->lowmem_reserve[classzone_idx];
1597 int o; 1604 int o;
1598 1605
1599 free_pages -= (1 << order) - 1; 1606 free_pages -= (1 << order) - 1;
@@ -1602,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1602 if (alloc_flags & ALLOC_HARDER) 1609 if (alloc_flags & ALLOC_HARDER)
1603 min -= min / 4; 1610 min -= min / 4;
1604 1611
1605 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1612 if (free_pages <= min + lowmem_reserve)
1606 return false; 1613 return false;
1607 for (o = 0; o < order; o++) { 1614 for (o = 0; o < order; o++) {
1608 /* At the next order, this order's pages become unavailable */ 1615 /* At the next order, this order's pages become unavailable */
@@ -1617,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1617 return true; 1624 return true;
1618} 1625}
1619 1626
1627#ifdef CONFIG_MEMORY_ISOLATION
1628static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1629{
1630 if (unlikely(zone->nr_pageblock_isolate))
1631 return zone->nr_pageblock_isolate * pageblock_nr_pages;
1632 return 0;
1633}
1634#else
1635static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
1636{
1637 return 0;
1638}
1639#endif
1640
1620bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1641bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1621 int classzone_idx, int alloc_flags) 1642 int classzone_idx, int alloc_flags)
1622{ 1643{
@@ -1632,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1632 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) 1653 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1633 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); 1654 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1634 1655
1656 /*
1657 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
1658 * it. nr_zone_isolate_freepages is never accurate so kswapd might not
1659 * sleep although it could do so. But this is more desirable for memory
1660 * hotplug than sleeping which can cause a livelock in the direct
1661 * reclaim path.
1662 */
1663 free_pages -= nr_zone_isolate_freepages(z);
1635 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, 1664 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1636 free_pages); 1665 free_pages);
1637} 1666}
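The net effect of nr_zone_isolate_freepages() above is that free pages sitting in MIGRATE_ISOLATE pageblocks no longer count toward the watermark. A small illustrative calculation in C (all numbers are made up; only the subtraction before the check mirrors the patch):

#include <stdbool.h>
#include <stdio.h>

#define PAGEBLOCK_NR_PAGES 512UL  /* pageblock_nr_pages on many x86 configs */

/* Simplified order-0 watermark test: enough free pages above min + reserve? */
static bool zone_watermark_ok(long free_pages, long min, long lowmem_reserve)
{
        return free_pages > min + lowmem_reserve;
}

int main(void)
{
        long free_pages = 4096, min = 1024, lowmem_reserve = 256;
        unsigned long nr_pageblock_isolate = 6;  /* blocks under isolation */

        /* Counting isolated free pages, the watermark still looks fine... */
        printf("raw:      %d\n", zone_watermark_ok(free_pages, min,
                                                   lowmem_reserve));
        /* ...but those pages cannot be allocated from, so discount them. */
        free_pages -= nr_pageblock_isolate * PAGEBLOCK_NR_PAGES;
        printf("adjusted: %d\n", zone_watermark_ok(free_pages, min,
                                                   lowmem_reserve));
        return 0;
}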
@@ -1899,6 +1928,17 @@ this_zone_full:
1899 zlc_active = 0; 1928 zlc_active = 0;
1900 goto zonelist_scan; 1929 goto zonelist_scan;
1901 } 1930 }
1931
1932 if (page)
1933 /*
1934 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
1935 * necessary to allocate the page. The expectation is
1936 * that the caller is taking steps that will free more
1937 * memory. The caller should avoid the page being used
1938 * for !PFMEMALLOC purposes.
1939 */
1940 page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
1941
1902 return page; 1942 return page;
1903} 1943}
1904 1944
@@ -2062,7 +2102,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2062 struct zonelist *zonelist, enum zone_type high_zoneidx, 2102 struct zonelist *zonelist, enum zone_type high_zoneidx,
2063 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2103 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2064 int migratetype, bool sync_migration, 2104 int migratetype, bool sync_migration,
2065 bool *deferred_compaction, 2105 bool *contended_compaction, bool *deferred_compaction,
2066 unsigned long *did_some_progress) 2106 unsigned long *did_some_progress)
2067{ 2107{
2068 struct page *page; 2108 struct page *page;
@@ -2077,7 +2117,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2077 2117
2078 current->flags |= PF_MEMALLOC; 2118 current->flags |= PF_MEMALLOC;
2079 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 2119 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
2080 nodemask, sync_migration); 2120 nodemask, sync_migration,
2121 contended_compaction);
2081 current->flags &= ~PF_MEMALLOC; 2122 current->flags &= ~PF_MEMALLOC;
2082 if (*did_some_progress != COMPACT_SKIPPED) { 2123 if (*did_some_progress != COMPACT_SKIPPED) {
2083 2124
@@ -2087,8 +2128,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2087 2128
2088 page = get_page_from_freelist(gfp_mask, nodemask, 2129 page = get_page_from_freelist(gfp_mask, nodemask,
2089 order, zonelist, high_zoneidx, 2130 order, zonelist, high_zoneidx,
2090 alloc_flags, preferred_zone, 2131 alloc_flags & ~ALLOC_NO_WATERMARKS,
2091 migratetype); 2132 preferred_zone, migratetype);
2092 if (page) { 2133 if (page) {
2093 preferred_zone->compact_considered = 0; 2134 preferred_zone->compact_considered = 0;
2094 preferred_zone->compact_defer_shift = 0; 2135 preferred_zone->compact_defer_shift = 0;
@@ -2123,7 +2164,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
2123 struct zonelist *zonelist, enum zone_type high_zoneidx, 2164 struct zonelist *zonelist, enum zone_type high_zoneidx,
2124 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 2165 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
2125 int migratetype, bool sync_migration, 2166 int migratetype, bool sync_migration,
2126 bool *deferred_compaction, 2167 bool *contended_compaction, bool *deferred_compaction,
2127 unsigned long *did_some_progress) 2168 unsigned long *did_some_progress)
2128{ 2169{
2129 return NULL; 2170 return NULL;
@@ -2180,8 +2221,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
2180retry: 2221retry:
2181 page = get_page_from_freelist(gfp_mask, nodemask, order, 2222 page = get_page_from_freelist(gfp_mask, nodemask, order,
2182 zonelist, high_zoneidx, 2223 zonelist, high_zoneidx,
2183 alloc_flags, preferred_zone, 2224 alloc_flags & ~ALLOC_NO_WATERMARKS,
2184 migratetype); 2225 preferred_zone, migratetype);
2185 2226
2186 /* 2227 /*
2187 * If an allocation failed after direct reclaim, it could be because 2228 * If an allocation failed after direct reclaim, it could be because
@@ -2265,15 +2306,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
2265 alloc_flags |= ALLOC_HARDER; 2306 alloc_flags |= ALLOC_HARDER;
2266 2307
2267 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2308 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
2268 if (!in_interrupt() && 2309 if (gfp_mask & __GFP_MEMALLOC)
2269 ((current->flags & PF_MEMALLOC) || 2310 alloc_flags |= ALLOC_NO_WATERMARKS;
2270 unlikely(test_thread_flag(TIF_MEMDIE)))) 2311 else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
2312 alloc_flags |= ALLOC_NO_WATERMARKS;
2313 else if (!in_interrupt() &&
2314 ((current->flags & PF_MEMALLOC) ||
2315 unlikely(test_thread_flag(TIF_MEMDIE))))
2271 alloc_flags |= ALLOC_NO_WATERMARKS; 2316 alloc_flags |= ALLOC_NO_WATERMARKS;
2272 } 2317 }
2273 2318
2274 return alloc_flags; 2319 return alloc_flags;
2275} 2320}
2276 2321
2322bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
2323{
2324 return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
2325}
2326
2277static inline struct page * 2327static inline struct page *
2278__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, 2328__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2279 struct zonelist *zonelist, enum zone_type high_zoneidx, 2329 struct zonelist *zonelist, enum zone_type high_zoneidx,
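The gfp_to_alloc_flags() hunk above now grants ALLOC_NO_WATERMARKS in three situations, and the new gfp_pfmemalloc_allowed() merely reports whether any of them applies. A condensed user-space restatement of that decision chain (the flag bits and the context booleans are stand-ins; the ordering of the tests follows the hunk):

#include <stdbool.h>
#include <stdio.h>

#define __GFP_MEMALLOC   0x1u   /* illustrative bit values, not the kernel's */
#define __GFP_NOMEMALLOC 0x2u

struct ctx {                      /* stands in for current/task context */
        bool in_serving_softirq;
        bool in_interrupt;
        bool pf_memalloc;         /* current->flags & PF_MEMALLOC */
        bool tif_memdie;          /* OOM victim */
};

/* Returns true when the caller may dip below the zone watermarks. */
static bool gfp_pfmemalloc_allowed(unsigned int gfp_mask, const struct ctx *c)
{
        if (gfp_mask & __GFP_NOMEMALLOC)
                return false;                       /* explicitly forbidden */
        if (gfp_mask & __GFP_MEMALLOC)
                return true;                        /* caller asked for it */
        if (c->in_serving_softirq && c->pf_memalloc)
                return true;                        /* e.g. swap-over-network */
        if (!c->in_interrupt && (c->pf_memalloc || c->tif_memdie))
                return true;                        /* reclaimer or OOM victim */
        return false;
}

int main(void)
{
        struct ctx softirq = { .in_serving_softirq = true, .in_interrupt = true,
                               .pf_memalloc = true };

        printf("%d\n", gfp_pfmemalloc_allowed(0, &softirq));                /* 1 */
        printf("%d\n", gfp_pfmemalloc_allowed(__GFP_NOMEMALLOC, &softirq)); /* 0 */
        return 0;
}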
@@ -2287,6 +2337,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
2287 unsigned long did_some_progress; 2337 unsigned long did_some_progress;
2288 bool sync_migration = false; 2338 bool sync_migration = false;
2289 bool deferred_compaction = false; 2339 bool deferred_compaction = false;
2340 bool contended_compaction = false;
2290 2341
2291 /* 2342 /*
2292 * In the slowpath, we sanity check order to avoid ever trying to 2343 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2340,11 +2391,19 @@ rebalance:
2340 2391
2341 /* Allocate without watermarks if the context allows */ 2392 /* Allocate without watermarks if the context allows */
2342 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2393 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2394 /*
2395 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
 2396 * the allocation is high priority and these types of
 2397 * allocations are system rather than user oriented
2398 */
2399 zonelist = node_zonelist(numa_node_id(), gfp_mask);
2400
2343 page = __alloc_pages_high_priority(gfp_mask, order, 2401 page = __alloc_pages_high_priority(gfp_mask, order,
2344 zonelist, high_zoneidx, nodemask, 2402 zonelist, high_zoneidx, nodemask,
2345 preferred_zone, migratetype); 2403 preferred_zone, migratetype);
2346 if (page) 2404 if (page) {
2347 goto got_pg; 2405 goto got_pg;
2406 }
2348 } 2407 }
2349 2408
2350 /* Atomic allocations - we can't balance anything */ 2409 /* Atomic allocations - we can't balance anything */
@@ -2368,6 +2427,7 @@ rebalance:
2368 nodemask, 2427 nodemask,
2369 alloc_flags, preferred_zone, 2428 alloc_flags, preferred_zone,
2370 migratetype, sync_migration, 2429 migratetype, sync_migration,
2430 &contended_compaction,
2371 &deferred_compaction, 2431 &deferred_compaction,
2372 &did_some_progress); 2432 &did_some_progress);
2373 if (page) 2433 if (page)
@@ -2377,10 +2437,11 @@ rebalance:
2377 /* 2437 /*
2378 * If compaction is deferred for high-order allocations, it is because 2438 * If compaction is deferred for high-order allocations, it is because
2379 * sync compaction recently failed. In this is the case and the caller 2439 * sync compaction recently failed. In this is the case and the caller
2380 * has requested the system not be heavily disrupted, fail the 2440 * requested a movable allocation that does not heavily disrupt the
2381 * allocation now instead of entering direct reclaim 2441 * system then fail the allocation instead of entering direct reclaim.
2382 */ 2442 */
2383 if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD)) 2443 if ((deferred_compaction || contended_compaction) &&
2444 (gfp_mask & __GFP_NO_KSWAPD))
2384 goto nopage; 2445 goto nopage;
2385 2446
2386 /* Try direct reclaim and then allocating */ 2447 /* Try direct reclaim and then allocating */
@@ -2451,6 +2512,7 @@ rebalance:
2451 nodemask, 2512 nodemask,
2452 alloc_flags, preferred_zone, 2513 alloc_flags, preferred_zone,
2453 migratetype, sync_migration, 2514 migratetype, sync_migration,
2515 &contended_compaction,
2454 &deferred_compaction, 2516 &deferred_compaction,
2455 &did_some_progress); 2517 &did_some_progress);
2456 if (page) 2518 if (page)
@@ -2463,8 +2525,8 @@ nopage:
2463got_pg: 2525got_pg:
2464 if (kmemcheck_enabled) 2526 if (kmemcheck_enabled)
2465 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 2527 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
2466 return page;
2467 2528
2529 return page;
2468} 2530}
2469 2531
2470/* 2532/*
@@ -3030,7 +3092,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
3030 user_zonelist_order = oldval; 3092 user_zonelist_order = oldval;
3031 } else if (oldval != user_zonelist_order) { 3093 } else if (oldval != user_zonelist_order) {
3032 mutex_lock(&zonelists_mutex); 3094 mutex_lock(&zonelists_mutex);
3033 build_all_zonelists(NULL); 3095 build_all_zonelists(NULL, NULL);
3034 mutex_unlock(&zonelists_mutex); 3096 mutex_unlock(&zonelists_mutex);
3035 } 3097 }
3036 } 3098 }
@@ -3409,14 +3471,21 @@ static void setup_zone_pageset(struct zone *zone);
3409DEFINE_MUTEX(zonelists_mutex); 3471DEFINE_MUTEX(zonelists_mutex);
3410 3472
3411/* return values int ....just for stop_machine() */ 3473/* return values int ....just for stop_machine() */
3412static __init_refok int __build_all_zonelists(void *data) 3474static int __build_all_zonelists(void *data)
3413{ 3475{
3414 int nid; 3476 int nid;
3415 int cpu; 3477 int cpu;
3478 pg_data_t *self = data;
3416 3479
3417#ifdef CONFIG_NUMA 3480#ifdef CONFIG_NUMA
3418 memset(node_load, 0, sizeof(node_load)); 3481 memset(node_load, 0, sizeof(node_load));
3419#endif 3482#endif
3483
3484 if (self && !node_online(self->node_id)) {
3485 build_zonelists(self);
3486 build_zonelist_cache(self);
3487 }
3488
3420 for_each_online_node(nid) { 3489 for_each_online_node(nid) {
3421 pg_data_t *pgdat = NODE_DATA(nid); 3490 pg_data_t *pgdat = NODE_DATA(nid);
3422 3491
@@ -3461,7 +3530,7 @@ static __init_refok int __build_all_zonelists(void *data)
3461 * Called with zonelists_mutex held always 3530 * Called with zonelists_mutex held always
3462 * unless system_state == SYSTEM_BOOTING. 3531 * unless system_state == SYSTEM_BOOTING.
3463 */ 3532 */
3464void __ref build_all_zonelists(void *data) 3533void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3465{ 3534{
3466 set_zonelist_order(); 3535 set_zonelist_order();
3467 3536
@@ -3473,10 +3542,10 @@ void __ref build_all_zonelists(void *data)
3473 /* we have to stop all cpus to guarantee there is no user 3542 /* we have to stop all cpus to guarantee there is no user
3474 of zonelist */ 3543 of zonelist */
3475#ifdef CONFIG_MEMORY_HOTPLUG 3544#ifdef CONFIG_MEMORY_HOTPLUG
3476 if (data) 3545 if (zone)
3477 setup_zone_pageset((struct zone *)data); 3546 setup_zone_pageset(zone);
3478#endif 3547#endif
3479 stop_machine(__build_all_zonelists, NULL, NULL); 3548 stop_machine(__build_all_zonelists, pgdat, NULL);
3480 /* cpuset refresh routine should be here */ 3549 /* cpuset refresh routine should be here */
3481 } 3550 }
3482 vm_total_pages = nr_free_pagecache_pages(); 3551 vm_total_pages = nr_free_pagecache_pages();
@@ -3746,7 +3815,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
3746 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) 3815 memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
3747#endif 3816#endif
3748 3817
3749static int zone_batchsize(struct zone *zone) 3818static int __meminit zone_batchsize(struct zone *zone)
3750{ 3819{
3751#ifdef CONFIG_MMU 3820#ifdef CONFIG_MMU
3752 int batch; 3821 int batch;
@@ -3828,7 +3897,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3828 pcp->batch = PAGE_SHIFT * 8; 3897 pcp->batch = PAGE_SHIFT * 8;
3829} 3898}
3830 3899
3831static void setup_zone_pageset(struct zone *zone) 3900static void __meminit setup_zone_pageset(struct zone *zone)
3832{ 3901{
3833 int cpu; 3902 int cpu;
3834 3903
@@ -3901,32 +3970,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3901 return 0; 3970 return 0;
3902} 3971}
3903 3972
3904static int __zone_pcp_update(void *data)
3905{
3906 struct zone *zone = data;
3907 int cpu;
3908 unsigned long batch = zone_batchsize(zone), flags;
3909
3910 for_each_possible_cpu(cpu) {
3911 struct per_cpu_pageset *pset;
3912 struct per_cpu_pages *pcp;
3913
3914 pset = per_cpu_ptr(zone->pageset, cpu);
3915 pcp = &pset->pcp;
3916
3917 local_irq_save(flags);
3918 free_pcppages_bulk(zone, pcp->count, pcp);
3919 setup_pageset(pset, batch);
3920 local_irq_restore(flags);
3921 }
3922 return 0;
3923}
3924
3925void zone_pcp_update(struct zone *zone)
3926{
3927 stop_machine(__zone_pcp_update, zone, NULL);
3928}
3929
3930static __meminit void zone_pcp_init(struct zone *zone) 3973static __meminit void zone_pcp_init(struct zone *zone)
3931{ 3974{
3932 /* 3975 /*
@@ -3942,7 +3985,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
3942 zone_batchsize(zone)); 3985 zone_batchsize(zone));
3943} 3986}
3944 3987
3945__meminit int init_currently_empty_zone(struct zone *zone, 3988int __meminit init_currently_empty_zone(struct zone *zone,
3946 unsigned long zone_start_pfn, 3989 unsigned long zone_start_pfn,
3947 unsigned long size, 3990 unsigned long size,
3948 enum memmap_context context) 3991 enum memmap_context context)
@@ -4301,7 +4344,7 @@ static inline void setup_usemap(struct pglist_data *pgdat,
4301#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 4344#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
4302 4345
4303/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ 4346/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
4304static inline void __init set_pageblock_order(void) 4347void __init set_pageblock_order(void)
4305{ 4348{
4306 unsigned int order; 4349 unsigned int order;
4307 4350
@@ -4329,7 +4372,7 @@ static inline void __init set_pageblock_order(void)
4329 * include/linux/pageblock-flags.h for the values of pageblock_order based on 4372 * include/linux/pageblock-flags.h for the values of pageblock_order based on
4330 * the kernel config 4373 * the kernel config
4331 */ 4374 */
4332static inline void set_pageblock_order(void) 4375void __init set_pageblock_order(void)
4333{ 4376{
4334} 4377}
4335 4378
@@ -4340,6 +4383,8 @@ static inline void set_pageblock_order(void)
4340 * - mark all pages reserved 4383 * - mark all pages reserved
4341 * - mark all memory queues empty 4384 * - mark all memory queues empty
4342 * - clear the memory bitmaps 4385 * - clear the memory bitmaps
4386 *
4387 * NOTE: pgdat should get zeroed by caller.
4343 */ 4388 */
4344static void __paginginit free_area_init_core(struct pglist_data *pgdat, 4389static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4345 unsigned long *zones_size, unsigned long *zholes_size) 4390 unsigned long *zones_size, unsigned long *zholes_size)
@@ -4350,9 +4395,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4350 int ret; 4395 int ret;
4351 4396
4352 pgdat_resize_init(pgdat); 4397 pgdat_resize_init(pgdat);
4353 pgdat->nr_zones = 0;
4354 init_waitqueue_head(&pgdat->kswapd_wait); 4398 init_waitqueue_head(&pgdat->kswapd_wait);
4355 pgdat->kswapd_max_order = 0; 4399 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4356 pgdat_page_cgroup_init(pgdat); 4400 pgdat_page_cgroup_init(pgdat);
4357 4401
4358 for (j = 0; j < MAX_NR_ZONES; j++) { 4402 for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -4394,6 +4438,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4394 4438
4395 zone->spanned_pages = size; 4439 zone->spanned_pages = size;
4396 zone->present_pages = realsize; 4440 zone->present_pages = realsize;
4441#if defined CONFIG_COMPACTION || defined CONFIG_CMA
4442 zone->compact_cached_free_pfn = zone->zone_start_pfn +
4443 zone->spanned_pages;
4444 zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
4445#endif
4397#ifdef CONFIG_NUMA 4446#ifdef CONFIG_NUMA
4398 zone->node = nid; 4447 zone->node = nid;
4399 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) 4448 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4408,8 +4457,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4408 4457
4409 zone_pcp_init(zone); 4458 zone_pcp_init(zone);
4410 lruvec_init(&zone->lruvec, zone); 4459 lruvec_init(&zone->lruvec, zone);
4411 zap_zone_vm_stats(zone);
4412 zone->flags = 0;
4413 if (!size) 4460 if (!size)
4414 continue; 4461 continue;
4415 4462
@@ -4469,6 +4516,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
4469{ 4516{
4470 pg_data_t *pgdat = NODE_DATA(nid); 4517 pg_data_t *pgdat = NODE_DATA(nid);
4471 4518
4519 /* pg_data_t should be reset to zero when it's allocated */
4520 WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
4521
4472 pgdat->node_id = nid; 4522 pgdat->node_id = nid;
4473 pgdat->node_start_pfn = node_start_pfn; 4523 pgdat->node_start_pfn = node_start_pfn;
4474 calculate_node_totalpages(pgdat, zones_size, zholes_size); 4524 calculate_node_totalpages(pgdat, zones_size, zholes_size);
@@ -4750,7 +4800,7 @@ out:
4750} 4800}
4751 4801
4752/* Any regular memory on that node ? */ 4802/* Any regular memory on that node ? */
4753static void check_for_regular_memory(pg_data_t *pgdat) 4803static void __init check_for_regular_memory(pg_data_t *pgdat)
4754{ 4804{
4755#ifdef CONFIG_HIGHMEM 4805#ifdef CONFIG_HIGHMEM
4756 enum zone_type zone_type; 4806 enum zone_type zone_type;
@@ -5468,26 +5518,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5468} 5518}
5469 5519
5470/* 5520/*
5471 * This is designed as sub function...plz see page_isolation.c also. 5521 * This function checks whether pageblock includes unmovable pages or not.
5472 * set/clear page block's type to be ISOLATE. 5522 * If @count is not zero, it is okay to include fewer than @count unmovable pages
5473 * page allocater never alloc memory from ISOLATE block. 5523 *
 5524 * PageLRU check without isolation or lru_lock could race so that
5525 * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
 5526 * expect this function to be exact.
5474 */ 5527 */
5475 5528bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
5476static int
5477__count_immobile_pages(struct zone *zone, struct page *page, int count)
5478{ 5529{
5479 unsigned long pfn, iter, found; 5530 unsigned long pfn, iter, found;
5480 int mt; 5531 int mt;
5481 5532
5482 /* 5533 /*
5483 * For avoiding noise data, lru_add_drain_all() should be called 5534 * For avoiding noise data, lru_add_drain_all() should be called
5484 * If ZONE_MOVABLE, the zone never contains immobile pages 5535 * If ZONE_MOVABLE, the zone never contains unmovable pages
5485 */ 5536 */
5486 if (zone_idx(zone) == ZONE_MOVABLE) 5537 if (zone_idx(zone) == ZONE_MOVABLE)
5487 return true; 5538 return false;
5488 mt = get_pageblock_migratetype(page); 5539 mt = get_pageblock_migratetype(page);
5489 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) 5540 if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
5490 return true; 5541 return false;
5491 5542
5492 pfn = page_to_pfn(page); 5543 pfn = page_to_pfn(page);
5493 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { 5544 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
@@ -5497,11 +5548,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5497 continue; 5548 continue;
5498 5549
5499 page = pfn_to_page(check); 5550 page = pfn_to_page(check);
5500 if (!page_count(page)) { 5551 /*
 5552 * We can't use page_count without pinning the page
 5553 * because another CPU can free the compound page.
 5554 * This check already skips compound tails of THP
 5555 * because their page->_count is zero at all times.
5556 */
5557 if (!atomic_read(&page->_count)) {
5501 if (PageBuddy(page)) 5558 if (PageBuddy(page))
5502 iter += (1 << page_order(page)) - 1; 5559 iter += (1 << page_order(page)) - 1;
5503 continue; 5560 continue;
5504 } 5561 }
5562
5505 if (!PageLRU(page)) 5563 if (!PageLRU(page))
5506 found++; 5564 found++;
5507 /* 5565 /*
@@ -5518,9 +5576,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
5518 * page at boot. 5576 * page at boot.
5519 */ 5577 */
5520 if (found > count) 5578 if (found > count)
5521 return false; 5579 return true;
5522 } 5580 }
5523 return true; 5581 return false;
5524} 5582}
5525 5583
5526bool is_pageblock_removable_nolock(struct page *page) 5584bool is_pageblock_removable_nolock(struct page *page)
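The rewritten helper above answers "does this pageblock contain unmovable pages?" directly: free buddy chunks are skipped by their order, LRU pages count as movable, and up to @count other pages are tolerated. A toy C model of that scan over an array standing in for a pageblock (the page states and tolerance values are invented for the demo):

#include <stdbool.h>
#include <stdio.h>

enum fake_page_state { FREE_BUDDY, ON_LRU, UNMOVABLE };

struct fake_page {
        enum fake_page_state state;
        int order;              /* only meaningful for FREE_BUDDY */
};

/* true if more than 'count' pages in the block are neither free nor on LRU */
static bool has_unmovable_pages(const struct fake_page *blk, int nr, int count)
{
        int found = 0;

        for (int i = 0; i < nr; i++) {
                if (blk[i].state == FREE_BUDDY) {
                        /* the whole buddy chunk is free: skip it in one step */
                        i += (1 << blk[i].order) - 1;
                        continue;
                }
                if (blk[i].state != ON_LRU)
                        found++;
                if (found > count)
                        return true;
        }
        return false;
}

int main(void)
{
        struct fake_page blk[8] = {
                { FREE_BUDDY, 2 }, {0}, {0}, {0},   /* one order-2 free chunk */
                { ON_LRU, 0 }, { ON_LRU, 0 },
                { UNMOVABLE, 0 }, { ON_LRU, 0 },
        };

        printf("%d\n", has_unmovable_pages(blk, 8, 0));  /* 1: one pinned page */
        printf("%d\n", has_unmovable_pages(blk, 8, 1));  /* 0: within tolerance */
        return 0;
}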
@@ -5544,77 +5602,7 @@ bool is_pageblock_removable_nolock(struct page *page)
5544 zone->zone_start_pfn + zone->spanned_pages <= pfn) 5602 zone->zone_start_pfn + zone->spanned_pages <= pfn)
5545 return false; 5603 return false;
5546 5604
5547 return __count_immobile_pages(zone, page, 0); 5605 return !has_unmovable_pages(zone, page, 0);
5548}
5549
5550int set_migratetype_isolate(struct page *page)
5551{
5552 struct zone *zone;
5553 unsigned long flags, pfn;
5554 struct memory_isolate_notify arg;
5555 int notifier_ret;
5556 int ret = -EBUSY;
5557
5558 zone = page_zone(page);
5559
5560 spin_lock_irqsave(&zone->lock, flags);
5561
5562 pfn = page_to_pfn(page);
5563 arg.start_pfn = pfn;
5564 arg.nr_pages = pageblock_nr_pages;
5565 arg.pages_found = 0;
5566
5567 /*
5568 * It may be possible to isolate a pageblock even if the
5569 * migratetype is not MIGRATE_MOVABLE. The memory isolation
5570 * notifier chain is used by balloon drivers to return the
5571 * number of pages in a range that are held by the balloon
5572 * driver to shrink memory. If all the pages are accounted for
5573 * by balloons, are free, or on the LRU, isolation can continue.
5574 * Later, for example, when memory hotplug notifier runs, these
5575 * pages reported as "can be isolated" should be isolated(freed)
5576 * by the balloon driver through the memory notifier chain.
5577 */
5578 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5579 notifier_ret = notifier_to_errno(notifier_ret);
5580 if (notifier_ret)
5581 goto out;
5582 /*
5583 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
5584 * We just check MOVABLE pages.
5585 */
5586 if (__count_immobile_pages(zone, page, arg.pages_found))
5587 ret = 0;
5588
5589 /*
5590 * immobile means "not-on-lru" paes. If immobile is larger than
5591 * removable-by-driver pages reported by notifier, we'll fail.
5592 */
5593
5594out:
5595 if (!ret) {
5596 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
5597 move_freepages_block(zone, page, MIGRATE_ISOLATE);
5598 }
5599
5600 spin_unlock_irqrestore(&zone->lock, flags);
5601 if (!ret)
5602 drain_all_pages();
5603 return ret;
5604}
5605
5606void unset_migratetype_isolate(struct page *page, unsigned migratetype)
5607{
5608 struct zone *zone;
5609 unsigned long flags;
5610 zone = page_zone(page);
5611 spin_lock_irqsave(&zone->lock, flags);
5612 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
5613 goto out;
5614 set_pageblock_migratetype(page, migratetype);
5615 move_freepages_block(zone, page, migratetype);
5616out:
5617 spin_unlock_irqrestore(&zone->lock, flags);
5618} 5606}
5619 5607
5620#ifdef CONFIG_CMA 5608#ifdef CONFIG_CMA
@@ -5869,7 +5857,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
5869} 5857}
5870#endif 5858#endif
5871 5859
5860#ifdef CONFIG_MEMORY_HOTPLUG
5861static int __meminit __zone_pcp_update(void *data)
5862{
5863 struct zone *zone = data;
5864 int cpu;
5865 unsigned long batch = zone_batchsize(zone), flags;
5866
5867 for_each_possible_cpu(cpu) {
5868 struct per_cpu_pageset *pset;
5869 struct per_cpu_pages *pcp;
5870
5871 pset = per_cpu_ptr(zone->pageset, cpu);
5872 pcp = &pset->pcp;
5873
5874 local_irq_save(flags);
5875 if (pcp->count > 0)
5876 free_pcppages_bulk(zone, pcp->count, pcp);
5877 setup_pageset(pset, batch);
5878 local_irq_restore(flags);
5879 }
5880 return 0;
5881}
5882
5883void __meminit zone_pcp_update(struct zone *zone)
5884{
5885 stop_machine(__zone_pcp_update, zone, NULL);
5886}
5887#endif
5888
5872#ifdef CONFIG_MEMORY_HOTREMOVE 5889#ifdef CONFIG_MEMORY_HOTREMOVE
5890void zone_pcp_reset(struct zone *zone)
5891{
5892 unsigned long flags;
5893
5894 /* avoid races with drain_pages() */
5895 local_irq_save(flags);
5896 if (zone->pageset != &boot_pageset) {
5897 free_percpu(zone->pageset);
5898 zone->pageset = &boot_pageset;
5899 }
5900 local_irq_restore(flags);
5901}
5902
5873/* 5903/*
5874 * All pages in the range must be isolated before calling this. 5904 * All pages in the range must be isolated before calling this.
5875 */ 5905 */
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index eb750f85139..5ddad0c6daa 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -317,7 +317,7 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
317#endif 317#endif
318 318
319 319
320#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 320#ifdef CONFIG_MEMCG_SWAP
321 321
322static DEFINE_MUTEX(swap_cgroup_mutex); 322static DEFINE_MUTEX(swap_cgroup_mutex);
323struct swap_cgroup_ctrl { 323struct swap_cgroup_ctrl {
diff --git a/mm/page_io.c b/mm/page_io.c
index 34f02923744..78eee32ee48 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -17,6 +17,7 @@
17#include <linux/swap.h> 17#include <linux/swap.h>
18#include <linux/bio.h> 18#include <linux/bio.h>
19#include <linux/swapops.h> 19#include <linux/swapops.h>
20#include <linux/buffer_head.h>
20#include <linux/writeback.h> 21#include <linux/writeback.h>
21#include <linux/frontswap.h> 22#include <linux/frontswap.h>
22#include <asm/pgtable.h> 23#include <asm/pgtable.h>
@@ -86,6 +87,98 @@ void end_swap_bio_read(struct bio *bio, int err)
86 bio_put(bio); 87 bio_put(bio);
87} 88}
88 89
90int generic_swapfile_activate(struct swap_info_struct *sis,
91 struct file *swap_file,
92 sector_t *span)
93{
94 struct address_space *mapping = swap_file->f_mapping;
95 struct inode *inode = mapping->host;
96 unsigned blocks_per_page;
97 unsigned long page_no;
98 unsigned blkbits;
99 sector_t probe_block;
100 sector_t last_block;
101 sector_t lowest_block = -1;
102 sector_t highest_block = 0;
103 int nr_extents = 0;
104 int ret;
105
106 blkbits = inode->i_blkbits;
107 blocks_per_page = PAGE_SIZE >> blkbits;
108
109 /*
110 * Map all the blocks into the extent list. This code doesn't try
111 * to be very smart.
112 */
113 probe_block = 0;
114 page_no = 0;
115 last_block = i_size_read(inode) >> blkbits;
116 while ((probe_block + blocks_per_page) <= last_block &&
117 page_no < sis->max) {
118 unsigned block_in_page;
119 sector_t first_block;
120
121 first_block = bmap(inode, probe_block);
122 if (first_block == 0)
123 goto bad_bmap;
124
125 /*
126 * It must be PAGE_SIZE aligned on-disk
127 */
128 if (first_block & (blocks_per_page - 1)) {
129 probe_block++;
130 goto reprobe;
131 }
132
133 for (block_in_page = 1; block_in_page < blocks_per_page;
134 block_in_page++) {
135 sector_t block;
136
137 block = bmap(inode, probe_block + block_in_page);
138 if (block == 0)
139 goto bad_bmap;
140 if (block != first_block + block_in_page) {
141 /* Discontiguity */
142 probe_block++;
143 goto reprobe;
144 }
145 }
146
147 first_block >>= (PAGE_SHIFT - blkbits);
148 if (page_no) { /* exclude the header page */
149 if (first_block < lowest_block)
150 lowest_block = first_block;
151 if (first_block > highest_block)
152 highest_block = first_block;
153 }
154
155 /*
156 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
157 */
158 ret = add_swap_extent(sis, page_no, 1, first_block);
159 if (ret < 0)
160 goto out;
161 nr_extents += ret;
162 page_no++;
163 probe_block += blocks_per_page;
164reprobe:
165 continue;
166 }
167 ret = nr_extents;
168 *span = 1 + highest_block - lowest_block;
169 if (page_no == 0)
170 page_no = 1; /* force Empty message */
171 sis->max = page_no;
172 sis->pages = page_no - 1;
173 sis->highest_bit = page_no - 1;
174out:
175 return ret;
176bad_bmap:
177 printk(KERN_ERR "swapon: swapfile has holes\n");
178 ret = -EINVAL;
179 goto out;
180}
181
89/* 182/*
90 * We may have stale swap cache pages in memory: notice 183 * We may have stale swap cache pages in memory: notice
91 * them here and get rid of the unnecessary final write. 184 * them here and get rid of the unnecessary final write.
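generic_swapfile_activate() above walks the swap file with bmap(), rejecting any page whose backing blocks are not PAGE_SIZE aligned or not contiguous, and records each good run as one swap extent. A compact user-space model of that alignment and contiguity test, driven by a fake block map instead of bmap() (the array contents and block size are invented):

#include <stdio.h>

#define BLOCKS_PER_PAGE 8   /* e.g. 4096-byte page on a 512-byte block device */

/* Fake bmap(): logical block index -> physical block (0 would mean a hole). */
static const unsigned long block_map[] = {
        /* page 0: aligned and contiguous     */ 64, 65, 66, 67, 68, 69, 70, 71,
        /* page 1: start not a multiple of 8  */ 73, 74, 75, 76, 77, 78, 79, 80,
};

int main(void)
{
        unsigned long nr_blocks = sizeof(block_map) / sizeof(block_map[0]);

        for (unsigned long probe = 0; probe + BLOCKS_PER_PAGE <= nr_blocks;
             probe += BLOCKS_PER_PAGE) {
                unsigned long first = block_map[probe];
                int ok = (first % BLOCKS_PER_PAGE) == 0;  /* PAGE_SIZE aligned */

                for (int i = 1; ok && i < BLOCKS_PER_PAGE; i++)
                        if (block_map[probe + i] != first + i)
                                ok = 0;                   /* discontiguity */

                printf("page %lu: %s\n", probe / BLOCKS_PER_PAGE,
                       ok ? "usable as one extent" : "skipped");
        }
        return 0;
}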
@@ -94,6 +187,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
94{ 187{
95 struct bio *bio; 188 struct bio *bio;
96 int ret = 0, rw = WRITE; 189 int ret = 0, rw = WRITE;
190 struct swap_info_struct *sis = page_swap_info(page);
97 191
98 if (try_to_free_swap(page)) { 192 if (try_to_free_swap(page)) {
99 unlock_page(page); 193 unlock_page(page);
@@ -105,6 +199,33 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
105 end_page_writeback(page); 199 end_page_writeback(page);
106 goto out; 200 goto out;
107 } 201 }
202
203 if (sis->flags & SWP_FILE) {
204 struct kiocb kiocb;
205 struct file *swap_file = sis->swap_file;
206 struct address_space *mapping = swap_file->f_mapping;
207 struct iovec iov = {
208 .iov_base = kmap(page),
209 .iov_len = PAGE_SIZE,
210 };
211
212 init_sync_kiocb(&kiocb, swap_file);
213 kiocb.ki_pos = page_file_offset(page);
214 kiocb.ki_left = PAGE_SIZE;
215 kiocb.ki_nbytes = PAGE_SIZE;
216
217 unlock_page(page);
218 ret = mapping->a_ops->direct_IO(KERNEL_WRITE,
219 &kiocb, &iov,
220 kiocb.ki_pos, 1);
221 kunmap(page);
222 if (ret == PAGE_SIZE) {
223 count_vm_event(PSWPOUT);
224 ret = 0;
225 }
226 return ret;
227 }
228
108 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); 229 bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
109 if (bio == NULL) { 230 if (bio == NULL) {
110 set_page_dirty(page); 231 set_page_dirty(page);
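For SWP_FILE areas the write path added above bypasses the bio layer and hands the page to the filesystem's ->direct_IO at the page's file offset, counting anything short of a full PAGE_SIZE transfer as failure. A loose user-space analogue using pwrite() (the file name, slot numbers and PAGE_SZ constant are invented; only the "full write or error" shape mirrors the patch):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define PAGE_SZ 4096

/* Write one "page" of a fake swap file at the slot's byte offset. */
static int swapfile_write_page(int fd, const void *page, off_t slot)
{
        ssize_t ret = pwrite(fd, page, PAGE_SZ, slot * PAGE_SZ);

        /* Mirror the patch: only a full page-sized transfer counts as success. */
        return ret == PAGE_SZ ? 0 : -1;
}

int main(void)
{
        char page[PAGE_SZ];
        int fd = open("fake_swapfile.img", O_RDWR | O_CREAT, 0600);

        if (fd < 0)
                return 1;
        memset(page, 0xaa, sizeof(page));
        printf("slot 3: %s\n",
               swapfile_write_page(fd, page, 3) ? "failed" : "written");
        close(fd);
        return 0;
}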
@@ -126,6 +247,7 @@ int swap_readpage(struct page *page)
126{ 247{
127 struct bio *bio; 248 struct bio *bio;
128 int ret = 0; 249 int ret = 0;
250 struct swap_info_struct *sis = page_swap_info(page);
129 251
130 VM_BUG_ON(!PageLocked(page)); 252 VM_BUG_ON(!PageLocked(page));
131 VM_BUG_ON(PageUptodate(page)); 253 VM_BUG_ON(PageUptodate(page));
@@ -134,6 +256,17 @@ int swap_readpage(struct page *page)
134 unlock_page(page); 256 unlock_page(page);
135 goto out; 257 goto out;
136 } 258 }
259
260 if (sis->flags & SWP_FILE) {
261 struct file *swap_file = sis->swap_file;
262 struct address_space *mapping = swap_file->f_mapping;
263
264 ret = mapping->a_ops->readpage(swap_file, page);
265 if (!ret)
266 count_vm_event(PSWPIN);
267 return ret;
268 }
269
137 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); 270 bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
138 if (bio == NULL) { 271 if (bio == NULL) {
139 unlock_page(page); 272 unlock_page(page);
@@ -145,3 +278,15 @@ int swap_readpage(struct page *page)
145out: 278out:
146 return ret; 279 return ret;
147} 280}
281
282int swap_set_page_dirty(struct page *page)
283{
284 struct swap_info_struct *sis = page_swap_info(page);
285
286 if (sis->flags & SWP_FILE) {
287 struct address_space *mapping = sis->swap_file->f_mapping;
288 return mapping->a_ops->set_page_dirty(page);
289 } else {
290 return __set_page_dirty_no_writeback(page);
291 }
292}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c9f04774f2b..247d1f17573 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -5,8 +5,101 @@
5#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/page-isolation.h> 6#include <linux/page-isolation.h>
7#include <linux/pageblock-flags.h> 7#include <linux/pageblock-flags.h>
8#include <linux/memory.h>
8#include "internal.h" 9#include "internal.h"
9 10
11/* called while holding zone->lock */
12static void set_pageblock_isolate(struct page *page)
13{
14 if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
15 return;
16
17 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
18 page_zone(page)->nr_pageblock_isolate++;
19}
20
21/* called while holding zone->lock */
22static void restore_pageblock_isolate(struct page *page, int migratetype)
23{
24 struct zone *zone = page_zone(page);
25 if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE))
26 return;
27
28 BUG_ON(zone->nr_pageblock_isolate <= 0);
29 set_pageblock_migratetype(page, migratetype);
30 zone->nr_pageblock_isolate--;
31}
32
33int set_migratetype_isolate(struct page *page)
34{
35 struct zone *zone;
36 unsigned long flags, pfn;
37 struct memory_isolate_notify arg;
38 int notifier_ret;
39 int ret = -EBUSY;
40
41 zone = page_zone(page);
42
43 spin_lock_irqsave(&zone->lock, flags);
44
45 pfn = page_to_pfn(page);
46 arg.start_pfn = pfn;
47 arg.nr_pages = pageblock_nr_pages;
48 arg.pages_found = 0;
49
50 /*
51 * It may be possible to isolate a pageblock even if the
52 * migratetype is not MIGRATE_MOVABLE. The memory isolation
53 * notifier chain is used by balloon drivers to return the
54 * number of pages in a range that are held by the balloon
55 * driver to shrink memory. If all the pages are accounted for
56 * by balloons, are free, or on the LRU, isolation can continue.
57 * Later, for example, when memory hotplug notifier runs, these
58 * pages reported as "can be isolated" should be isolated(freed)
59 * by the balloon driver through the memory notifier chain.
60 */
61 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
62 notifier_ret = notifier_to_errno(notifier_ret);
63 if (notifier_ret)
64 goto out;
65 /*
66 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
67 * We just check MOVABLE pages.
68 */
69 if (!has_unmovable_pages(zone, page, arg.pages_found))
70 ret = 0;
71
72 /*
 73 * immobile means "not-on-lru" pages. If immobile is larger than
74 * removable-by-driver pages reported by notifier, we'll fail.
75 */
76
77out:
78 if (!ret) {
79 set_pageblock_isolate(page);
80 move_freepages_block(zone, page, MIGRATE_ISOLATE);
81 }
82
83 spin_unlock_irqrestore(&zone->lock, flags);
84 if (!ret)
85 drain_all_pages();
86 return ret;
87}
88
89void unset_migratetype_isolate(struct page *page, unsigned migratetype)
90{
91 struct zone *zone;
92 unsigned long flags;
93 zone = page_zone(page);
94 spin_lock_irqsave(&zone->lock, flags);
95 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
96 goto out;
97 move_freepages_block(zone, page, migratetype);
98 restore_pageblock_isolate(page, migratetype);
99out:
100 spin_unlock_irqrestore(&zone->lock, flags);
101}
102
10static inline struct page * 103static inline struct page *
11__first_valid_page(unsigned long pfn, unsigned long nr_pages) 104__first_valid_page(unsigned long pfn, unsigned long nr_pages)
12{ 105{
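The two helpers moved into this file keep zone->nr_pageblock_isolate in step with the pageblock's migratetype, always under zone->lock, so the watermark code can discount isolated free pages cheaply. A stripped-down C model of that counter discipline, with a mutex standing in for the zone spinlock (the types and field names are simplified):

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

enum migratetype { MIGRATE_MOVABLE, MIGRATE_ISOLATE };

struct fake_zone {
        pthread_mutex_t lock;          /* stands in for zone->lock */
        long nr_pageblock_isolate;
};

struct fake_block {
        enum migratetype mt;
};

static void set_pageblock_isolate(struct fake_zone *z, struct fake_block *b)
{
        pthread_mutex_lock(&z->lock);
        if (b->mt != MIGRATE_ISOLATE) {     /* count each block only once */
                b->mt = MIGRATE_ISOLATE;
                z->nr_pageblock_isolate++;
        }
        pthread_mutex_unlock(&z->lock);
}

static void restore_pageblock_isolate(struct fake_zone *z, struct fake_block *b,
                                      enum migratetype mt)
{
        pthread_mutex_lock(&z->lock);
        if (b->mt == MIGRATE_ISOLATE) {     /* only undo a real isolation */
                assert(z->nr_pageblock_isolate > 0);
                b->mt = mt;
                z->nr_pageblock_isolate--;
        }
        pthread_mutex_unlock(&z->lock);
}

int main(void)
{
        struct fake_zone z = { PTHREAD_MUTEX_INITIALIZER, 0 };
        struct fake_block b = { MIGRATE_MOVABLE };

        set_pageblock_isolate(&z, &b);
        set_pageblock_isolate(&z, &b);            /* second call is a no-op */
        printf("isolated blocks: %ld\n", z.nr_pageblock_isolate);  /* 1 */
        restore_pageblock_isolate(&z, &b, MIGRATE_MOVABLE);
        printf("isolated blocks: %ld\n", z.nr_pageblock_isolate);  /* 0 */
        return 0;
}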
diff --git a/mm/shmem.c b/mm/shmem.c
index bd106361be4..d4e184e2a38 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -929,7 +929,8 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
929 929
930 /* Create a pseudo vma that just contains the policy */ 930 /* Create a pseudo vma that just contains the policy */
931 pvma.vm_start = 0; 931 pvma.vm_start = 0;
932 pvma.vm_pgoff = index; 932 /* Bias interleave by inode number to distribute better across nodes */
933 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
933 pvma.vm_ops = NULL; 934 pvma.vm_ops = NULL;
934 pvma.vm_policy = spol; 935 pvma.vm_policy = spol;
935 return swapin_readahead(swap, gfp, &pvma, 0); 936 return swapin_readahead(swap, gfp, &pvma, 0);
@@ -942,7 +943,8 @@ static struct page *shmem_alloc_page(gfp_t gfp,
942 943
943 /* Create a pseudo vma that just contains the policy */ 944 /* Create a pseudo vma that just contains the policy */
944 pvma.vm_start = 0; 945 pvma.vm_start = 0;
945 pvma.vm_pgoff = index; 946 /* Bias interleave by inode number to distribute better across nodes */
947 pvma.vm_pgoff = index + info->vfs_inode.i_ino;
946 pvma.vm_ops = NULL; 948 pvma.vm_ops = NULL;
947 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); 949 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
948 950
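Biasing vm_pgoff by the inode number shifts where each file's interleaving starts, so many small tmpfs files no longer all begin allocating on the same node. A back-of-the-envelope C illustration of the effect (the node count, inode numbers and the simple modulo are assumptions; real interleaving goes through the mempolicy code):

#include <stdio.h>

#define NR_NODES 4

/* Crude stand-in for interleave node selection: node = pgoff % nr_nodes. */
static int interleave_node(unsigned long index, unsigned long ino)
{
        return (index + ino) % NR_NODES;    /* with the bias from the patch */
}

int main(void)
{
        /* Page 0 of four different single-page files: */
        for (unsigned long ino = 100; ino < 104; ino++)
                printf("inode %lu -> node %d\n", ino, interleave_node(0, ino));
        /* Without the bias (index alone) all four would land on node 0. */
        return 0;
}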
@@ -1877,7 +1879,7 @@ static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
1877} 1879}
1878 1880
1879static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, 1881static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
1880 struct nameidata *nd) 1882 bool excl)
1881{ 1883{
1882 return shmem_mknod(dir, dentry, mode | S_IFREG, 0); 1884 return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
1883} 1885}
diff --git a/mm/slab.c b/mm/slab.c
index e901a36e252..811af03a14e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -68,7 +68,7 @@
68 * Further notes from the original documentation: 68 * Further notes from the original documentation:
69 * 69 *
70 * 11 April '97. Started multi-threading - markhe 70 * 11 April '97. Started multi-threading - markhe
71 * The global cache-chain is protected by the mutex 'cache_chain_mutex'. 71 * The global cache-chain is protected by the mutex 'slab_mutex'.
72 * The sem is only needed when accessing/extending the cache-chain, which 72 * The sem is only needed when accessing/extending the cache-chain, which
73 * can never happen inside an interrupt (kmem_cache_create(), 73 * can never happen inside an interrupt (kmem_cache_create(),
74 * kmem_cache_shrink() and kmem_cache_reap()). 74 * kmem_cache_shrink() and kmem_cache_reap()).
@@ -87,6 +87,7 @@
87 */ 87 */
88 88
89#include <linux/slab.h> 89#include <linux/slab.h>
90#include "slab.h"
90#include <linux/mm.h> 91#include <linux/mm.h>
91#include <linux/poison.h> 92#include <linux/poison.h>
92#include <linux/swap.h> 93#include <linux/swap.h>
@@ -117,12 +118,16 @@
117#include <linux/memory.h> 118#include <linux/memory.h>
118#include <linux/prefetch.h> 119#include <linux/prefetch.h>
119 120
121#include <net/sock.h>
122
120#include <asm/cacheflush.h> 123#include <asm/cacheflush.h>
121#include <asm/tlbflush.h> 124#include <asm/tlbflush.h>
122#include <asm/page.h> 125#include <asm/page.h>
123 126
124#include <trace/events/kmem.h> 127#include <trace/events/kmem.h>
125 128
129#include "internal.h"
130
126/* 131/*
127 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. 132 * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON.
128 * 0 for faster, smaller code (especially in the critical paths). 133 * 0 for faster, smaller code (especially in the critical paths).
@@ -151,6 +156,12 @@
151#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN 156#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
152#endif 157#endif
153 158
159/*
160 * true if a page was allocated from pfmemalloc reserves for network-based
161 * swap
162 */
163static bool pfmemalloc_active __read_mostly;
164
154/* Legal flag mask for kmem_cache_create(). */ 165/* Legal flag mask for kmem_cache_create(). */
155#if DEBUG 166#if DEBUG
156# define CREATE_MASK (SLAB_RED_ZONE | \ 167# define CREATE_MASK (SLAB_RED_ZONE | \
@@ -256,9 +267,30 @@ struct array_cache {
256 * Must have this definition in here for the proper 267 * Must have this definition in here for the proper
257 * alignment of array_cache. Also simplifies accessing 268 * alignment of array_cache. Also simplifies accessing
258 * the entries. 269 * the entries.
270 *
271 * Entries should not be directly dereferenced as
272 * entries belonging to slabs marked pfmemalloc will
 273 * have their low bit set to SLAB_OBJ_PFMEMALLOC
259 */ 274 */
260}; 275};
261 276
277#define SLAB_OBJ_PFMEMALLOC 1
278static inline bool is_obj_pfmemalloc(void *objp)
279{
280 return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
281}
282
283static inline void set_obj_pfmemalloc(void **objp)
284{
285 *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
286 return;
287}
288
289static inline void clear_obj_pfmemalloc(void **objp)
290{
291 *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
292}
293
262/* 294/*
263 * bootstrap: The caches do not work without cpuarrays anymore, but the 295 * bootstrap: The caches do not work without cpuarrays anymore, but the
264 * cpuarrays are allocated from the generic caches... 296 * cpuarrays are allocated from the generic caches...
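Tagging pfmemalloc objects by setting bit 0 of the pointer stored in the array_cache works because slab objects are at least word aligned, so that bit is otherwise always clear. A self-contained C demo of the same tag, test and untag helpers applied to an ordinary allocation (only the bit trick itself is taken from the hunk):

#include <stdio.h>
#include <stdlib.h>

#define SLAB_OBJ_PFMEMALLOC 1UL   /* stored in the otherwise-unused low bit */

static int is_obj_pfmemalloc(void *objp)
{
        return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC;
}

static void set_obj_pfmemalloc(void **objp)
{
        *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC);
}

static void clear_obj_pfmemalloc(void **objp)
{
        *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC);
}

int main(void)
{
        /* malloc() returns suitably aligned memory, so bit 0 starts out 0. */
        void *obj = malloc(64);
        void *entry = obj;                 /* what the array_cache would hold */

        set_obj_pfmemalloc(&entry);
        printf("tagged:   %d\n", is_obj_pfmemalloc(entry));   /* 1 */
        clear_obj_pfmemalloc(&entry);
        printf("untagged: %d (%s)\n", is_obj_pfmemalloc(entry),
               entry == obj ? "pointer restored" : "corrupted");
        free(obj);
        return 0;
}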
@@ -424,8 +456,8 @@ static void kmem_list3_init(struct kmem_list3 *parent)
424 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1: 456 * cachep->obj_offset - BYTES_PER_WORD .. cachep->obj_offset - 1:
425 * redzone word. 457 * redzone word.
426 * cachep->obj_offset: The real object. 458 * cachep->obj_offset: The real object.
427 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 459 * cachep->size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
428 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address 460 * cachep->size - 1* BYTES_PER_WORD: last caller address
429 * [BYTES_PER_WORD long] 461 * [BYTES_PER_WORD long]
430 */ 462 */
431static int obj_offset(struct kmem_cache *cachep) 463static int obj_offset(struct kmem_cache *cachep)
@@ -433,11 +465,6 @@ static int obj_offset(struct kmem_cache *cachep)
433 return cachep->obj_offset; 465 return cachep->obj_offset;
434} 466}
435 467
436static int obj_size(struct kmem_cache *cachep)
437{
438 return cachep->obj_size;
439}
440
441static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) 468static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp)
442{ 469{
443 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 470 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
@@ -449,23 +476,22 @@ static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp)
449{ 476{
450 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 477 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
451 if (cachep->flags & SLAB_STORE_USER) 478 if (cachep->flags & SLAB_STORE_USER)
452 return (unsigned long long *)(objp + cachep->buffer_size - 479 return (unsigned long long *)(objp + cachep->size -
453 sizeof(unsigned long long) - 480 sizeof(unsigned long long) -
454 REDZONE_ALIGN); 481 REDZONE_ALIGN);
455 return (unsigned long long *) (objp + cachep->buffer_size - 482 return (unsigned long long *) (objp + cachep->size -
456 sizeof(unsigned long long)); 483 sizeof(unsigned long long));
457} 484}
458 485
459static void **dbg_userword(struct kmem_cache *cachep, void *objp) 486static void **dbg_userword(struct kmem_cache *cachep, void *objp)
460{ 487{
461 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 488 BUG_ON(!(cachep->flags & SLAB_STORE_USER));
462 return (void **)(objp + cachep->buffer_size - BYTES_PER_WORD); 489 return (void **)(objp + cachep->size - BYTES_PER_WORD);
463} 490}
464 491
465#else 492#else
466 493
467#define obj_offset(x) 0 494#define obj_offset(x) 0
468#define obj_size(cachep) (cachep->buffer_size)
469#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 495#define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
470#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) 496#define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;})
471#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) 497#define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;})
@@ -475,7 +501,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
475#ifdef CONFIG_TRACING 501#ifdef CONFIG_TRACING
476size_t slab_buffer_size(struct kmem_cache *cachep) 502size_t slab_buffer_size(struct kmem_cache *cachep)
477{ 503{
478 return cachep->buffer_size; 504 return cachep->size;
479} 505}
480EXPORT_SYMBOL(slab_buffer_size); 506EXPORT_SYMBOL(slab_buffer_size);
481#endif 507#endif
@@ -489,56 +515,37 @@ EXPORT_SYMBOL(slab_buffer_size);
489static int slab_max_order = SLAB_MAX_ORDER_LO; 515static int slab_max_order = SLAB_MAX_ORDER_LO;
490static bool slab_max_order_set __initdata; 516static bool slab_max_order_set __initdata;
491 517
492/*
493 * Functions for storing/retrieving the cachep and or slab from the page
494 * allocator. These are used to find the slab an obj belongs to. With kfree(),
495 * these are used to find the cache which an obj belongs to.
496 */
497static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
498{
499 page->lru.next = (struct list_head *)cache;
500}
501
502static inline struct kmem_cache *page_get_cache(struct page *page) 518static inline struct kmem_cache *page_get_cache(struct page *page)
503{ 519{
504 page = compound_head(page); 520 page = compound_head(page);
505 BUG_ON(!PageSlab(page)); 521 BUG_ON(!PageSlab(page));
506 return (struct kmem_cache *)page->lru.next; 522 return page->slab_cache;
507}
508
509static inline void page_set_slab(struct page *page, struct slab *slab)
510{
511 page->lru.prev = (struct list_head *)slab;
512}
513
514static inline struct slab *page_get_slab(struct page *page)
515{
516 BUG_ON(!PageSlab(page));
517 return (struct slab *)page->lru.prev;
518} 523}
519 524
520static inline struct kmem_cache *virt_to_cache(const void *obj) 525static inline struct kmem_cache *virt_to_cache(const void *obj)
521{ 526{
522 struct page *page = virt_to_head_page(obj); 527 struct page *page = virt_to_head_page(obj);
523 return page_get_cache(page); 528 return page->slab_cache;
524} 529}
525 530
526static inline struct slab *virt_to_slab(const void *obj) 531static inline struct slab *virt_to_slab(const void *obj)
527{ 532{
528 struct page *page = virt_to_head_page(obj); 533 struct page *page = virt_to_head_page(obj);
529 return page_get_slab(page); 534
535 VM_BUG_ON(!PageSlab(page));
536 return page->slab_page;
530} 537}
531 538
532static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, 539static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
533 unsigned int idx) 540 unsigned int idx)
534{ 541{
535 return slab->s_mem + cache->buffer_size * idx; 542 return slab->s_mem + cache->size * idx;
536} 543}
537 544
538/* 545/*
539 * We want to avoid an expensive divide : (offset / cache->buffer_size) 546 * We want to avoid an expensive divide : (offset / cache->size)
540 * Using the fact that buffer_size is a constant for a particular cache, 547 * Using the fact that size is a constant for a particular cache,
541 * we can replace (offset / cache->buffer_size) by 548 * we can replace (offset / cache->size) by
542 * reciprocal_divide(offset, cache->reciprocal_buffer_size) 549 * reciprocal_divide(offset, cache->reciprocal_buffer_size)
543 */ 550 */
544static inline unsigned int obj_to_index(const struct kmem_cache *cache, 551static inline unsigned int obj_to_index(const struct kmem_cache *cache,
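The comment above obj_to_index() explains why the reciprocal_buffer_size field exists: a per-object division is replaced by a multiply and shift against a precomputed reciprocal. A minimal standalone sketch of that trick, assuming 32-bit dividends, a constant divisor of at least 2, and the small offsets a slab actually deals with (the kernel's own helpers live in include/linux/reciprocal_div.h and handle more corner cases):

#include <stdint.h>
#include <assert.h>

/* Precompute once per cache: R = ceil(2^32 / divisor); divisor must be >= 2. */
static uint32_t reciprocal_of(uint32_t divisor)
{
	return (uint32_t)((((uint64_t)1 << 32) + divisor - 1) / divisor);
}

/* Replace a / divisor by a multiply and a shift. */
static uint32_t fast_divide(uint32_t a, uint32_t reciprocal)
{
	return (uint32_t)(((uint64_t)a * reciprocal) >> 32);
}

int main(void)
{
	uint32_t obj_size = 192;		/* hypothetical cache->size      */
	uint32_t r = reciprocal_of(obj_size);
	uint32_t offset = 5 * obj_size;		/* offset of the sixth object    */

	assert(fast_divide(offset, r) == offset / obj_size);
	return 0;
}

In the hunk above, reciprocal_buffer_size caches such a precomputed value so obj_to_index() never issues a hardware divide on the allocation path.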
@@ -584,33 +591,12 @@ static struct kmem_cache cache_cache = {
584 .batchcount = 1, 591 .batchcount = 1,
585 .limit = BOOT_CPUCACHE_ENTRIES, 592 .limit = BOOT_CPUCACHE_ENTRIES,
586 .shared = 1, 593 .shared = 1,
587 .buffer_size = sizeof(struct kmem_cache), 594 .size = sizeof(struct kmem_cache),
588 .name = "kmem_cache", 595 .name = "kmem_cache",
589}; 596};
590 597
591#define BAD_ALIEN_MAGIC 0x01020304ul 598#define BAD_ALIEN_MAGIC 0x01020304ul
592 599
593/*
594 * chicken and egg problem: delay the per-cpu array allocation
595 * until the general caches are up.
596 */
597static enum {
598 NONE,
599 PARTIAL_AC,
600 PARTIAL_L3,
601 EARLY,
602 LATE,
603 FULL
604} g_cpucache_up;
605
606/*
607 * used by boot code to determine if it can use slab based allocator
608 */
609int slab_is_available(void)
610{
611 return g_cpucache_up >= EARLY;
612}
613
614#ifdef CONFIG_LOCKDEP 600#ifdef CONFIG_LOCKDEP
615 601
616/* 602/*
@@ -676,7 +662,7 @@ static void init_node_lock_keys(int q)
676{ 662{
677 struct cache_sizes *s = malloc_sizes; 663 struct cache_sizes *s = malloc_sizes;
678 664
679 if (g_cpucache_up < LATE) 665 if (slab_state < UP)
680 return; 666 return;
681 667
682 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { 668 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
@@ -716,12 +702,6 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
716} 702}
717#endif 703#endif
718 704
719/*
720 * Guard access to the cache-chain.
721 */
722static DEFINE_MUTEX(cache_chain_mutex);
723static struct list_head cache_chain;
724
725static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); 705static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
726 706
727static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) 707static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
@@ -951,6 +931,124 @@ static struct array_cache *alloc_arraycache(int node, int entries,
951 return nc; 931 return nc;
952} 932}
953 933
934static inline bool is_slab_pfmemalloc(struct slab *slabp)
935{
936 struct page *page = virt_to_page(slabp->s_mem);
937
938 return PageSlabPfmemalloc(page);
939}
940
 941/* Clears pfmemalloc_active if no slabs have pfmemalloc set */
942static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
943 struct array_cache *ac)
944{
945 struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()];
946 struct slab *slabp;
947 unsigned long flags;
948
949 if (!pfmemalloc_active)
950 return;
951
952 spin_lock_irqsave(&l3->list_lock, flags);
953 list_for_each_entry(slabp, &l3->slabs_full, list)
954 if (is_slab_pfmemalloc(slabp))
955 goto out;
956
957 list_for_each_entry(slabp, &l3->slabs_partial, list)
958 if (is_slab_pfmemalloc(slabp))
959 goto out;
960
961 list_for_each_entry(slabp, &l3->slabs_free, list)
962 if (is_slab_pfmemalloc(slabp))
963 goto out;
964
965 pfmemalloc_active = false;
966out:
967 spin_unlock_irqrestore(&l3->list_lock, flags);
968}
969
970static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
971 gfp_t flags, bool force_refill)
972{
973 int i;
974 void *objp = ac->entry[--ac->avail];
975
976 /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */
977 if (unlikely(is_obj_pfmemalloc(objp))) {
978 struct kmem_list3 *l3;
979
980 if (gfp_pfmemalloc_allowed(flags)) {
981 clear_obj_pfmemalloc(&objp);
982 return objp;
983 }
984
985 /* The caller cannot use PFMEMALLOC objects, find another one */
986 for (i = 1; i < ac->avail; i++) {
987 /* If a !PFMEMALLOC object is found, swap them */
988 if (!is_obj_pfmemalloc(ac->entry[i])) {
989 objp = ac->entry[i];
990 ac->entry[i] = ac->entry[ac->avail];
991 ac->entry[ac->avail] = objp;
992 return objp;
993 }
994 }
995
996 /*
997 * If there are empty slabs on the slabs_free list and we are
998 * being forced to refill the cache, mark this one !pfmemalloc.
999 */
1000 l3 = cachep->nodelists[numa_mem_id()];
1001 if (!list_empty(&l3->slabs_free) && force_refill) {
1002 struct slab *slabp = virt_to_slab(objp);
1003 ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem));
1004 clear_obj_pfmemalloc(&objp);
1005 recheck_pfmemalloc_active(cachep, ac);
1006 return objp;
1007 }
1008
1009 /* No !PFMEMALLOC objects available */
1010 ac->avail++;
1011 objp = NULL;
1012 }
1013
1014 return objp;
1015}
1016
1017static inline void *ac_get_obj(struct kmem_cache *cachep,
1018 struct array_cache *ac, gfp_t flags, bool force_refill)
1019{
1020 void *objp;
1021
1022 if (unlikely(sk_memalloc_socks()))
1023 objp = __ac_get_obj(cachep, ac, flags, force_refill);
1024 else
1025 objp = ac->entry[--ac->avail];
1026
1027 return objp;
1028}
1029
1030static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
1031 void *objp)
1032{
1033 if (unlikely(pfmemalloc_active)) {
1034 /* Some pfmemalloc slabs exist, check if this is one */
1035 struct page *page = virt_to_page(objp);
1036 if (PageSlabPfmemalloc(page))
1037 set_obj_pfmemalloc(&objp);
1038 }
1039
1040 return objp;
1041}
1042
1043static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac,
1044 void *objp)
1045{
1046 if (unlikely(sk_memalloc_socks()))
1047 objp = __ac_put_obj(cachep, ac, objp);
1048
1049 ac->entry[ac->avail++] = objp;
1050}
1051
954/* 1052/*
955 * Transfer objects in one arraycache to another. 1053 * Transfer objects in one arraycache to another.
956 * Locking must be handled by the caller. 1054 * Locking must be handled by the caller.
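The new __ac_get_obj()/__ac_put_obj() helpers above depend on is_obj_pfmemalloc(), set_obj_pfmemalloc() and clear_obj_pfmemalloc(), which this hunk does not show; they remember the PFMEMALLOC property in the object pointer itself. A hedged, standalone sketch of that low-bit pointer-tagging idea (names here are hypothetical):

#include <stdint.h>
#include <stdbool.h>

#define OBJ_PFMEMALLOC 0x1UL

/*
 * Slab objects are at least word aligned, so bit 0 of a valid object
 * address is always zero and can carry a boolean flag.
 */
static inline bool obj_is_pfmemalloc(void *objp)
{
	return (uintptr_t)objp & OBJ_PFMEMALLOC;
}

static inline void obj_set_pfmemalloc(void **objp)
{
	*objp = (void *)((uintptr_t)*objp | OBJ_PFMEMALLOC);
}

static inline void obj_clear_pfmemalloc(void **objp)
{
	*objp = (void *)((uintptr_t)*objp & ~OBJ_PFMEMALLOC);
}

The tag must be stripped (as free_block() does later in this patch with clear_obj_pfmemalloc()) before the pointer is dereferenced or handed back to a caller.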
@@ -1127,7 +1225,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1127 STATS_INC_ACOVERFLOW(cachep); 1225 STATS_INC_ACOVERFLOW(cachep);
1128 __drain_alien_cache(cachep, alien, nodeid); 1226 __drain_alien_cache(cachep, alien, nodeid);
1129 } 1227 }
1130 alien->entry[alien->avail++] = objp; 1228 ac_put_obj(cachep, alien, objp);
1131 spin_unlock(&alien->lock); 1229 spin_unlock(&alien->lock);
1132 } else { 1230 } else {
1133 spin_lock(&(cachep->nodelists[nodeid])->list_lock); 1231 spin_lock(&(cachep->nodelists[nodeid])->list_lock);
@@ -1145,7 +1243,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1145 * When hotplugging memory or a cpu, existing nodelists are not replaced if 1243 * When hotplugging memory or a cpu, existing nodelists are not replaced if
1146 * already in use. 1244 * already in use.
1147 * 1245 *
1148 * Must hold cache_chain_mutex. 1246 * Must hold slab_mutex.
1149 */ 1247 */
1150static int init_cache_nodelists_node(int node) 1248static int init_cache_nodelists_node(int node)
1151{ 1249{
@@ -1153,7 +1251,7 @@ static int init_cache_nodelists_node(int node)
1153 struct kmem_list3 *l3; 1251 struct kmem_list3 *l3;
1154 const int memsize = sizeof(struct kmem_list3); 1252 const int memsize = sizeof(struct kmem_list3);
1155 1253
1156 list_for_each_entry(cachep, &cache_chain, next) { 1254 list_for_each_entry(cachep, &slab_caches, list) {
1157 /* 1255 /*
1158 * Set up the size64 kmemlist for cpu before we can 1256 * Set up the size64 kmemlist for cpu before we can
1159 * begin anything. Make sure some other cpu on this 1257 * begin anything. Make sure some other cpu on this
@@ -1169,7 +1267,7 @@ static int init_cache_nodelists_node(int node)
1169 1267
1170 /* 1268 /*
1171 * The l3s don't come and go as CPUs come and 1269 * The l3s don't come and go as CPUs come and
1172 * go. cache_chain_mutex is sufficient 1270 * go. slab_mutex is sufficient
1173 * protection here. 1271 * protection here.
1174 */ 1272 */
1175 cachep->nodelists[node] = l3; 1273 cachep->nodelists[node] = l3;
@@ -1191,7 +1289,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1191 int node = cpu_to_mem(cpu); 1289 int node = cpu_to_mem(cpu);
1192 const struct cpumask *mask = cpumask_of_node(node); 1290 const struct cpumask *mask = cpumask_of_node(node);
1193 1291
1194 list_for_each_entry(cachep, &cache_chain, next) { 1292 list_for_each_entry(cachep, &slab_caches, list) {
1195 struct array_cache *nc; 1293 struct array_cache *nc;
1196 struct array_cache *shared; 1294 struct array_cache *shared;
1197 struct array_cache **alien; 1295 struct array_cache **alien;
@@ -1241,7 +1339,7 @@ free_array_cache:
1241 * the respective cache's slabs, now we can go ahead and 1339 * the respective cache's slabs, now we can go ahead and
1242 * shrink each nodelist to its limit. 1340 * shrink each nodelist to its limit.
1243 */ 1341 */
1244 list_for_each_entry(cachep, &cache_chain, next) { 1342 list_for_each_entry(cachep, &slab_caches, list) {
1245 l3 = cachep->nodelists[node]; 1343 l3 = cachep->nodelists[node];
1246 if (!l3) 1344 if (!l3)
1247 continue; 1345 continue;
@@ -1270,7 +1368,7 @@ static int __cpuinit cpuup_prepare(long cpu)
1270 * Now we can go ahead with allocating the shared arrays and 1368 * Now we can go ahead with allocating the shared arrays and
1271 * array caches 1369 * array caches
1272 */ 1370 */
1273 list_for_each_entry(cachep, &cache_chain, next) { 1371 list_for_each_entry(cachep, &slab_caches, list) {
1274 struct array_cache *nc; 1372 struct array_cache *nc;
1275 struct array_cache *shared = NULL; 1373 struct array_cache *shared = NULL;
1276 struct array_cache **alien = NULL; 1374 struct array_cache **alien = NULL;
@@ -1338,9 +1436,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1338 switch (action) { 1436 switch (action) {
1339 case CPU_UP_PREPARE: 1437 case CPU_UP_PREPARE:
1340 case CPU_UP_PREPARE_FROZEN: 1438 case CPU_UP_PREPARE_FROZEN:
1341 mutex_lock(&cache_chain_mutex); 1439 mutex_lock(&slab_mutex);
1342 err = cpuup_prepare(cpu); 1440 err = cpuup_prepare(cpu);
1343 mutex_unlock(&cache_chain_mutex); 1441 mutex_unlock(&slab_mutex);
1344 break; 1442 break;
1345 case CPU_ONLINE: 1443 case CPU_ONLINE:
1346 case CPU_ONLINE_FROZEN: 1444 case CPU_ONLINE_FROZEN:
@@ -1350,7 +1448,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1350 case CPU_DOWN_PREPARE: 1448 case CPU_DOWN_PREPARE:
1351 case CPU_DOWN_PREPARE_FROZEN: 1449 case CPU_DOWN_PREPARE_FROZEN:
1352 /* 1450 /*
1353 * Shutdown cache reaper. Note that the cache_chain_mutex is 1451 * Shutdown cache reaper. Note that the slab_mutex is
1354 * held so that if cache_reap() is invoked it cannot do 1452 * held so that if cache_reap() is invoked it cannot do
1355 * anything expensive but will only modify reap_work 1453 * anything expensive but will only modify reap_work
1356 * and reschedule the timer. 1454 * and reschedule the timer.
@@ -1377,9 +1475,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1377#endif 1475#endif
1378 case CPU_UP_CANCELED: 1476 case CPU_UP_CANCELED:
1379 case CPU_UP_CANCELED_FROZEN: 1477 case CPU_UP_CANCELED_FROZEN:
1380 mutex_lock(&cache_chain_mutex); 1478 mutex_lock(&slab_mutex);
1381 cpuup_canceled(cpu); 1479 cpuup_canceled(cpu);
1382 mutex_unlock(&cache_chain_mutex); 1480 mutex_unlock(&slab_mutex);
1383 break; 1481 break;
1384 } 1482 }
1385 return notifier_from_errno(err); 1483 return notifier_from_errno(err);
@@ -1395,14 +1493,14 @@ static struct notifier_block __cpuinitdata cpucache_notifier = {
1395 * Returns -EBUSY if all objects cannot be drained so that the node is not 1493 * Returns -EBUSY if all objects cannot be drained so that the node is not
1396 * removed. 1494 * removed.
1397 * 1495 *
1398 * Must hold cache_chain_mutex. 1496 * Must hold slab_mutex.
1399 */ 1497 */
1400static int __meminit drain_cache_nodelists_node(int node) 1498static int __meminit drain_cache_nodelists_node(int node)
1401{ 1499{
1402 struct kmem_cache *cachep; 1500 struct kmem_cache *cachep;
1403 int ret = 0; 1501 int ret = 0;
1404 1502
1405 list_for_each_entry(cachep, &cache_chain, next) { 1503 list_for_each_entry(cachep, &slab_caches, list) {
1406 struct kmem_list3 *l3; 1504 struct kmem_list3 *l3;
1407 1505
1408 l3 = cachep->nodelists[node]; 1506 l3 = cachep->nodelists[node];
@@ -1433,14 +1531,14 @@ static int __meminit slab_memory_callback(struct notifier_block *self,
1433 1531
1434 switch (action) { 1532 switch (action) {
1435 case MEM_GOING_ONLINE: 1533 case MEM_GOING_ONLINE:
1436 mutex_lock(&cache_chain_mutex); 1534 mutex_lock(&slab_mutex);
1437 ret = init_cache_nodelists_node(nid); 1535 ret = init_cache_nodelists_node(nid);
1438 mutex_unlock(&cache_chain_mutex); 1536 mutex_unlock(&slab_mutex);
1439 break; 1537 break;
1440 case MEM_GOING_OFFLINE: 1538 case MEM_GOING_OFFLINE:
1441 mutex_lock(&cache_chain_mutex); 1539 mutex_lock(&slab_mutex);
1442 ret = drain_cache_nodelists_node(nid); 1540 ret = drain_cache_nodelists_node(nid);
1443 mutex_unlock(&cache_chain_mutex); 1541 mutex_unlock(&slab_mutex);
1444 break; 1542 break;
1445 case MEM_ONLINE: 1543 case MEM_ONLINE:
1446 case MEM_OFFLINE: 1544 case MEM_OFFLINE:
@@ -1544,8 +1642,8 @@ void __init kmem_cache_init(void)
1544 node = numa_mem_id(); 1642 node = numa_mem_id();
1545 1643
1546 /* 1) create the cache_cache */ 1644 /* 1) create the cache_cache */
1547 INIT_LIST_HEAD(&cache_chain); 1645 INIT_LIST_HEAD(&slab_caches);
1548 list_add(&cache_cache.next, &cache_chain); 1646 list_add(&cache_cache.list, &slab_caches);
1549 cache_cache.colour_off = cache_line_size(); 1647 cache_cache.colour_off = cache_line_size();
1550 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1648 cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1551 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node]; 1649 cache_cache.nodelists[node] = &initkmem_list3[CACHE_CACHE + node];
@@ -1553,18 +1651,16 @@ void __init kmem_cache_init(void)
1553 /* 1651 /*
1554 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids 1652 * struct kmem_cache size depends on nr_node_ids & nr_cpu_ids
1555 */ 1653 */
1556 cache_cache.buffer_size = offsetof(struct kmem_cache, array[nr_cpu_ids]) + 1654 cache_cache.size = offsetof(struct kmem_cache, array[nr_cpu_ids]) +
1557 nr_node_ids * sizeof(struct kmem_list3 *); 1655 nr_node_ids * sizeof(struct kmem_list3 *);
1558#if DEBUG 1656 cache_cache.object_size = cache_cache.size;
1559 cache_cache.obj_size = cache_cache.buffer_size; 1657 cache_cache.size = ALIGN(cache_cache.size,
1560#endif
1561 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1562 cache_line_size()); 1658 cache_line_size());
1563 cache_cache.reciprocal_buffer_size = 1659 cache_cache.reciprocal_buffer_size =
1564 reciprocal_value(cache_cache.buffer_size); 1660 reciprocal_value(cache_cache.size);
1565 1661
1566 for (order = 0; order < MAX_ORDER; order++) { 1662 for (order = 0; order < MAX_ORDER; order++) {
1567 cache_estimate(order, cache_cache.buffer_size, 1663 cache_estimate(order, cache_cache.size,
1568 cache_line_size(), 0, &left_over, &cache_cache.num); 1664 cache_line_size(), 0, &left_over, &cache_cache.num);
1569 if (cache_cache.num) 1665 if (cache_cache.num)
1570 break; 1666 break;
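The cache_cache sizing above combines a trailing per-cpu array, a per-node pointer block, and a cache-line round-up. A minimal userspace sketch of that sizing pattern (hypothetical struct and counts; ALIGN is re-implemented here as ALIGN_UP):

#include <stdio.h>
#include <stddef.h>

/* Round x up to the next multiple of a (a must be a power of two). */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))

struct node_list;			/* opaque per-node bookkeeping */

struct fake_cache {
	unsigned int limit;
	unsigned int batchcount;
	void *array[1];			/* per-cpu slots; really nr_cpu_ids long */
};

int main(void)
{
	size_t nr_cpu_ids = 4, nr_node_ids = 2, cache_line = 64;

	/* offsetof(.., array[nr_cpu_ids]) == offsetof(.., array) + n * slot */
	size_t sz = offsetof(struct fake_cache, array) +
		    nr_cpu_ids * sizeof(void *) +
		    nr_node_ids * sizeof(struct node_list *);

	printf("raw %zu bytes, cache-line aligned %zu bytes\n",
	       sz, ALIGN_UP(sz, cache_line));
	return 0;
}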
@@ -1585,7 +1681,7 @@ void __init kmem_cache_init(void)
1585 * bug. 1681 * bug.
1586 */ 1682 */
1587 1683
1588 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1684 sizes[INDEX_AC].cs_cachep = __kmem_cache_create(names[INDEX_AC].name,
1589 sizes[INDEX_AC].cs_size, 1685 sizes[INDEX_AC].cs_size,
1590 ARCH_KMALLOC_MINALIGN, 1686 ARCH_KMALLOC_MINALIGN,
1591 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1687 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
@@ -1593,7 +1689,7 @@ void __init kmem_cache_init(void)
1593 1689
1594 if (INDEX_AC != INDEX_L3) { 1690 if (INDEX_AC != INDEX_L3) {
1595 sizes[INDEX_L3].cs_cachep = 1691 sizes[INDEX_L3].cs_cachep =
1596 kmem_cache_create(names[INDEX_L3].name, 1692 __kmem_cache_create(names[INDEX_L3].name,
1597 sizes[INDEX_L3].cs_size, 1693 sizes[INDEX_L3].cs_size,
1598 ARCH_KMALLOC_MINALIGN, 1694 ARCH_KMALLOC_MINALIGN,
1599 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1695 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
@@ -1611,14 +1707,14 @@ void __init kmem_cache_init(void)
1611 * allow tighter packing of the smaller caches. 1707 * allow tighter packing of the smaller caches.
1612 */ 1708 */
1613 if (!sizes->cs_cachep) { 1709 if (!sizes->cs_cachep) {
1614 sizes->cs_cachep = kmem_cache_create(names->name, 1710 sizes->cs_cachep = __kmem_cache_create(names->name,
1615 sizes->cs_size, 1711 sizes->cs_size,
1616 ARCH_KMALLOC_MINALIGN, 1712 ARCH_KMALLOC_MINALIGN,
1617 ARCH_KMALLOC_FLAGS|SLAB_PANIC, 1713 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1618 NULL); 1714 NULL);
1619 } 1715 }
1620#ifdef CONFIG_ZONE_DMA 1716#ifdef CONFIG_ZONE_DMA
1621 sizes->cs_dmacachep = kmem_cache_create( 1717 sizes->cs_dmacachep = __kmem_cache_create(
1622 names->name_dma, 1718 names->name_dma,
1623 sizes->cs_size, 1719 sizes->cs_size,
1624 ARCH_KMALLOC_MINALIGN, 1720 ARCH_KMALLOC_MINALIGN,
@@ -1676,27 +1772,27 @@ void __init kmem_cache_init(void)
1676 } 1772 }
1677 } 1773 }
1678 1774
1679 g_cpucache_up = EARLY; 1775 slab_state = UP;
1680} 1776}
1681 1777
1682void __init kmem_cache_init_late(void) 1778void __init kmem_cache_init_late(void)
1683{ 1779{
1684 struct kmem_cache *cachep; 1780 struct kmem_cache *cachep;
1685 1781
1686 g_cpucache_up = LATE; 1782 slab_state = UP;
1687 1783
1688 /* Annotate slab for lockdep -- annotate the malloc caches */ 1784 /* Annotate slab for lockdep -- annotate the malloc caches */
1689 init_lock_keys(); 1785 init_lock_keys();
1690 1786
1691 /* 6) resize the head arrays to their final sizes */ 1787 /* 6) resize the head arrays to their final sizes */
1692 mutex_lock(&cache_chain_mutex); 1788 mutex_lock(&slab_mutex);
1693 list_for_each_entry(cachep, &cache_chain, next) 1789 list_for_each_entry(cachep, &slab_caches, list)
1694 if (enable_cpucache(cachep, GFP_NOWAIT)) 1790 if (enable_cpucache(cachep, GFP_NOWAIT))
1695 BUG(); 1791 BUG();
1696 mutex_unlock(&cache_chain_mutex); 1792 mutex_unlock(&slab_mutex);
1697 1793
1698 /* Done! */ 1794 /* Done! */
1699 g_cpucache_up = FULL; 1795 slab_state = FULL;
1700 1796
1701 /* 1797 /*
1702 * Register a cpu startup notifier callback that initializes 1798 * Register a cpu startup notifier callback that initializes
@@ -1727,6 +1823,9 @@ static int __init cpucache_init(void)
1727 */ 1823 */
1728 for_each_online_cpu(cpu) 1824 for_each_online_cpu(cpu)
1729 start_cpu_timer(cpu); 1825 start_cpu_timer(cpu);
1826
1827 /* Done! */
1828 slab_state = FULL;
1730 return 0; 1829 return 0;
1731} 1830}
1732__initcall(cpucache_init); 1831__initcall(cpucache_init);
@@ -1743,7 +1842,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
1743 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", 1842 "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n",
1744 nodeid, gfpflags); 1843 nodeid, gfpflags);
1745 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", 1844 printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n",
1746 cachep->name, cachep->buffer_size, cachep->gfporder); 1845 cachep->name, cachep->size, cachep->gfporder);
1747 1846
1748 for_each_online_node(node) { 1847 for_each_online_node(node) {
1749 unsigned long active_objs = 0, num_objs = 0, free_objects = 0; 1848 unsigned long active_objs = 0, num_objs = 0, free_objects = 0;
@@ -1798,7 +1897,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1798 flags |= __GFP_COMP; 1897 flags |= __GFP_COMP;
1799#endif 1898#endif
1800 1899
1801 flags |= cachep->gfpflags; 1900 flags |= cachep->allocflags;
1802 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1901 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1803 flags |= __GFP_RECLAIMABLE; 1902 flags |= __GFP_RECLAIMABLE;
1804 1903
@@ -1809,6 +1908,10 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1809 return NULL; 1908 return NULL;
1810 } 1909 }
1811 1910
1911 /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
1912 if (unlikely(page->pfmemalloc))
1913 pfmemalloc_active = true;
1914
1812 nr_pages = (1 << cachep->gfporder); 1915 nr_pages = (1 << cachep->gfporder);
1813 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1916 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1814 add_zone_page_state(page_zone(page), 1917 add_zone_page_state(page_zone(page),
@@ -1816,9 +1919,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1816 else 1919 else
1817 add_zone_page_state(page_zone(page), 1920 add_zone_page_state(page_zone(page),
1818 NR_SLAB_UNRECLAIMABLE, nr_pages); 1921 NR_SLAB_UNRECLAIMABLE, nr_pages);
1819 for (i = 0; i < nr_pages; i++) 1922 for (i = 0; i < nr_pages; i++) {
1820 __SetPageSlab(page + i); 1923 __SetPageSlab(page + i);
1821 1924
1925 if (page->pfmemalloc)
1926 SetPageSlabPfmemalloc(page + i);
1927 }
1928
1822 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { 1929 if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
1823 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); 1930 kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
1824 1931
@@ -1850,6 +1957,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1850 NR_SLAB_UNRECLAIMABLE, nr_freed); 1957 NR_SLAB_UNRECLAIMABLE, nr_freed);
1851 while (i--) { 1958 while (i--) {
1852 BUG_ON(!PageSlab(page)); 1959 BUG_ON(!PageSlab(page));
1960 __ClearPageSlabPfmemalloc(page);
1853 __ClearPageSlab(page); 1961 __ClearPageSlab(page);
1854 page++; 1962 page++;
1855 } 1963 }
@@ -1874,7 +1982,7 @@ static void kmem_rcu_free(struct rcu_head *head)
1874static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr, 1982static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1875 unsigned long caller) 1983 unsigned long caller)
1876{ 1984{
1877 int size = obj_size(cachep); 1985 int size = cachep->object_size;
1878 1986
1879 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)]; 1987 addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
1880 1988
@@ -1906,7 +2014,7 @@ static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
1906 2014
1907static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val) 2015static void poison_obj(struct kmem_cache *cachep, void *addr, unsigned char val)
1908{ 2016{
1909 int size = obj_size(cachep); 2017 int size = cachep->object_size;
1910 addr = &((char *)addr)[obj_offset(cachep)]; 2018 addr = &((char *)addr)[obj_offset(cachep)];
1911 2019
1912 memset(addr, val, size); 2020 memset(addr, val, size);
@@ -1966,7 +2074,7 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1966 printk("\n"); 2074 printk("\n");
1967 } 2075 }
1968 realobj = (char *)objp + obj_offset(cachep); 2076 realobj = (char *)objp + obj_offset(cachep);
1969 size = obj_size(cachep); 2077 size = cachep->object_size;
1970 for (i = 0; i < size && lines; i += 16, lines--) { 2078 for (i = 0; i < size && lines; i += 16, lines--) {
1971 int limit; 2079 int limit;
1972 limit = 16; 2080 limit = 16;
@@ -1983,7 +2091,7 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1983 int lines = 0; 2091 int lines = 0;
1984 2092
1985 realobj = (char *)objp + obj_offset(cachep); 2093 realobj = (char *)objp + obj_offset(cachep);
1986 size = obj_size(cachep); 2094 size = cachep->object_size;
1987 2095
1988 for (i = 0; i < size; i++) { 2096 for (i = 0; i < size; i++) {
1989 char exp = POISON_FREE; 2097 char exp = POISON_FREE;
@@ -2047,10 +2155,10 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slab
2047 2155
2048 if (cachep->flags & SLAB_POISON) { 2156 if (cachep->flags & SLAB_POISON) {
2049#ifdef CONFIG_DEBUG_PAGEALLOC 2157#ifdef CONFIG_DEBUG_PAGEALLOC
2050 if (cachep->buffer_size % PAGE_SIZE == 0 && 2158 if (cachep->size % PAGE_SIZE == 0 &&
2051 OFF_SLAB(cachep)) 2159 OFF_SLAB(cachep))
2052 kernel_map_pages(virt_to_page(objp), 2160 kernel_map_pages(virt_to_page(objp),
2053 cachep->buffer_size / PAGE_SIZE, 1); 2161 cachep->size / PAGE_SIZE, 1);
2054 else 2162 else
2055 check_poison_obj(cachep, objp); 2163 check_poison_obj(cachep, objp);
2056#else 2164#else
@@ -2194,10 +2302,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
2194 2302
2195static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) 2303static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2196{ 2304{
2197 if (g_cpucache_up == FULL) 2305 if (slab_state >= FULL)
2198 return enable_cpucache(cachep, gfp); 2306 return enable_cpucache(cachep, gfp);
2199 2307
2200 if (g_cpucache_up == NONE) { 2308 if (slab_state == DOWN) {
2201 /* 2309 /*
2202 * Note: the first kmem_cache_create must create the cache 2310 * Note: the first kmem_cache_create must create the cache
2203 * that's used by kmalloc(24), otherwise the creation of 2311 * that's used by kmalloc(24), otherwise the creation of
@@ -2212,16 +2320,16 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2212 */ 2320 */
2213 set_up_list3s(cachep, SIZE_AC); 2321 set_up_list3s(cachep, SIZE_AC);
2214 if (INDEX_AC == INDEX_L3) 2322 if (INDEX_AC == INDEX_L3)
2215 g_cpucache_up = PARTIAL_L3; 2323 slab_state = PARTIAL_L3;
2216 else 2324 else
2217 g_cpucache_up = PARTIAL_AC; 2325 slab_state = PARTIAL_ARRAYCACHE;
2218 } else { 2326 } else {
2219 cachep->array[smp_processor_id()] = 2327 cachep->array[smp_processor_id()] =
2220 kmalloc(sizeof(struct arraycache_init), gfp); 2328 kmalloc(sizeof(struct arraycache_init), gfp);
2221 2329
2222 if (g_cpucache_up == PARTIAL_AC) { 2330 if (slab_state == PARTIAL_ARRAYCACHE) {
2223 set_up_list3s(cachep, SIZE_L3); 2331 set_up_list3s(cachep, SIZE_L3);
2224 g_cpucache_up = PARTIAL_L3; 2332 slab_state = PARTIAL_L3;
2225 } else { 2333 } else {
2226 int node; 2334 int node;
2227 for_each_online_node(node) { 2335 for_each_online_node(node) {
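setup_cpu_cache() above gates its work on how far slab bootstrap has progressed, stepping the shared slab_state through the partial stages as each prerequisite cache comes up. A toy sketch of that staged-bootstrap pattern (stage names are made up; the real enum lives in mm/slab.h):

#include <stdio.h>

enum boot_stage { STAGE_DOWN, STAGE_ARRAYS, STAGE_NODELISTS, STAGE_FULL };

static enum boot_stage stage = STAGE_DOWN;

/* Each cache-creation step checks how far bootstrap has progressed. */
static void create_cache(const char *name)
{
	switch (stage) {
	case STAGE_DOWN:	/* first cache: use static bootstrap arrays  */
		printf("%s: static arrays\n", name);
		stage = STAGE_ARRAYS;
		break;
	case STAGE_ARRAYS:	/* array cache exists: allocate node lists   */
		printf("%s: kmalloc'd node lists\n", name);
		stage = STAGE_NODELISTS;
		break;
	default:		/* everything needed is already available    */
		printf("%s: normal path\n", name);
		break;
	}
}

int main(void)
{
	create_cache("size-32");
	create_cache("size-64");
	create_cache("size-128");
	return 0;
}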
@@ -2247,7 +2355,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2247} 2355}
2248 2356
2249/** 2357/**
2250 * kmem_cache_create - Create a cache. 2358 * __kmem_cache_create - Create a cache.
2251 * @name: A string which is used in /proc/slabinfo to identify this cache. 2359 * @name: A string which is used in /proc/slabinfo to identify this cache.
2252 * @size: The size of objects to be created in this cache. 2360 * @size: The size of objects to be created in this cache.
2253 * @align: The required alignment for the objects. 2361 * @align: The required alignment for the objects.
@@ -2274,59 +2382,14 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2274 * as davem. 2382 * as davem.
2275 */ 2383 */
2276struct kmem_cache * 2384struct kmem_cache *
2277kmem_cache_create (const char *name, size_t size, size_t align, 2385__kmem_cache_create (const char *name, size_t size, size_t align,
2278 unsigned long flags, void (*ctor)(void *)) 2386 unsigned long flags, void (*ctor)(void *))
2279{ 2387{
2280 size_t left_over, slab_size, ralign; 2388 size_t left_over, slab_size, ralign;
2281 struct kmem_cache *cachep = NULL, *pc; 2389 struct kmem_cache *cachep = NULL;
2282 gfp_t gfp; 2390 gfp_t gfp;
2283 2391
2284 /*
2285 * Sanity checks... these are all serious usage bugs.
2286 */
2287 if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
2288 size > KMALLOC_MAX_SIZE) {
2289 printk(KERN_ERR "%s: Early error in slab %s\n", __func__,
2290 name);
2291 BUG();
2292 }
2293
2294 /*
2295 * We use cache_chain_mutex to ensure a consistent view of
2296 * cpu_online_mask as well. Please see cpuup_callback
2297 */
2298 if (slab_is_available()) {
2299 get_online_cpus();
2300 mutex_lock(&cache_chain_mutex);
2301 }
2302
2303 list_for_each_entry(pc, &cache_chain, next) {
2304 char tmp;
2305 int res;
2306
2307 /*
2308 * This happens when the module gets unloaded and doesn't
2309 * destroy its slab cache and no-one else reuses the vmalloc
2310 * area of the module. Print a warning.
2311 */
2312 res = probe_kernel_address(pc->name, tmp);
2313 if (res) {
2314 printk(KERN_ERR
2315 "SLAB: cache with size %d has lost its name\n",
2316 pc->buffer_size);
2317 continue;
2318 }
2319
2320 if (!strcmp(pc->name, name)) {
2321 printk(KERN_ERR
2322 "kmem_cache_create: duplicate cache %s\n", name);
2323 dump_stack();
2324 goto oops;
2325 }
2326 }
2327
2328#if DEBUG 2392#if DEBUG
2329 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
2330#if FORCED_DEBUG 2393#if FORCED_DEBUG
2331 /* 2394 /*
2332 * Enable redzoning and last user accounting, except for caches with 2395 * Enable redzoning and last user accounting, except for caches with
@@ -2415,11 +2478,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2415 /* Get cache's description obj. */ 2478 /* Get cache's description obj. */
2416 cachep = kmem_cache_zalloc(&cache_cache, gfp); 2479 cachep = kmem_cache_zalloc(&cache_cache, gfp);
2417 if (!cachep) 2480 if (!cachep)
2418 goto oops; 2481 return NULL;
2419 2482
2420 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; 2483 cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];
2484 cachep->object_size = size;
2485 cachep->align = align;
2421#if DEBUG 2486#if DEBUG
2422 cachep->obj_size = size;
2423 2487
2424 /* 2488 /*
2425 * Both debugging options require word-alignment which is calculated 2489 * Both debugging options require word-alignment which is calculated
@@ -2442,7 +2506,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2442 } 2506 }
2443#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 2507#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
2444 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size 2508 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
2445 && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { 2509 && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
2446 cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); 2510 cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
2447 size = PAGE_SIZE; 2511 size = PAGE_SIZE;
2448 } 2512 }
@@ -2471,8 +2535,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2471 printk(KERN_ERR 2535 printk(KERN_ERR
2472 "kmem_cache_create: couldn't create cache %s.\n", name); 2536 "kmem_cache_create: couldn't create cache %s.\n", name);
2473 kmem_cache_free(&cache_cache, cachep); 2537 kmem_cache_free(&cache_cache, cachep);
2474 cachep = NULL; 2538 return NULL;
2475 goto oops;
2476 } 2539 }
2477 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) 2540 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
2478 + sizeof(struct slab), align); 2541 + sizeof(struct slab), align);
@@ -2508,10 +2571,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2508 cachep->colour = left_over / cachep->colour_off; 2571 cachep->colour = left_over / cachep->colour_off;
2509 cachep->slab_size = slab_size; 2572 cachep->slab_size = slab_size;
2510 cachep->flags = flags; 2573 cachep->flags = flags;
2511 cachep->gfpflags = 0; 2574 cachep->allocflags = 0;
2512 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) 2575 if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
2513 cachep->gfpflags |= GFP_DMA; 2576 cachep->allocflags |= GFP_DMA;
2514 cachep->buffer_size = size; 2577 cachep->size = size;
2515 cachep->reciprocal_buffer_size = reciprocal_value(size); 2578 cachep->reciprocal_buffer_size = reciprocal_value(size);
2516 2579
2517 if (flags & CFLGS_OFF_SLAB) { 2580 if (flags & CFLGS_OFF_SLAB) {
@@ -2530,8 +2593,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2530 2593
2531 if (setup_cpu_cache(cachep, gfp)) { 2594 if (setup_cpu_cache(cachep, gfp)) {
2532 __kmem_cache_destroy(cachep); 2595 __kmem_cache_destroy(cachep);
2533 cachep = NULL; 2596 return NULL;
2534 goto oops;
2535 } 2597 }
2536 2598
2537 if (flags & SLAB_DEBUG_OBJECTS) { 2599 if (flags & SLAB_DEBUG_OBJECTS) {
@@ -2545,18 +2607,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2545 } 2607 }
2546 2608
2547 /* cache setup completed, link it into the list */ 2609 /* cache setup completed, link it into the list */
2548 list_add(&cachep->next, &cache_chain); 2610 list_add(&cachep->list, &slab_caches);
2549oops:
2550 if (!cachep && (flags & SLAB_PANIC))
2551 panic("kmem_cache_create(): failed to create slab `%s'\n",
2552 name);
2553 if (slab_is_available()) {
2554 mutex_unlock(&cache_chain_mutex);
2555 put_online_cpus();
2556 }
2557 return cachep; 2611 return cachep;
2558} 2612}
2559EXPORT_SYMBOL(kmem_cache_create);
2560 2613
2561#if DEBUG 2614#if DEBUG
2562static void check_irq_off(void) 2615static void check_irq_off(void)
@@ -2671,7 +2724,7 @@ out:
2671 return nr_freed; 2724 return nr_freed;
2672} 2725}
2673 2726
2674/* Called with cache_chain_mutex held to protect against cpu hotplug */ 2727/* Called with slab_mutex held to protect against cpu hotplug */
2675static int __cache_shrink(struct kmem_cache *cachep) 2728static int __cache_shrink(struct kmem_cache *cachep)
2676{ 2729{
2677 int ret = 0, i = 0; 2730 int ret = 0, i = 0;
@@ -2706,9 +2759,9 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
2706 BUG_ON(!cachep || in_interrupt()); 2759 BUG_ON(!cachep || in_interrupt());
2707 2760
2708 get_online_cpus(); 2761 get_online_cpus();
2709 mutex_lock(&cache_chain_mutex); 2762 mutex_lock(&slab_mutex);
2710 ret = __cache_shrink(cachep); 2763 ret = __cache_shrink(cachep);
2711 mutex_unlock(&cache_chain_mutex); 2764 mutex_unlock(&slab_mutex);
2712 put_online_cpus(); 2765 put_online_cpus();
2713 return ret; 2766 return ret;
2714} 2767}
@@ -2736,15 +2789,15 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
2736 2789
2737 /* Find the cache in the chain of caches. */ 2790 /* Find the cache in the chain of caches. */
2738 get_online_cpus(); 2791 get_online_cpus();
2739 mutex_lock(&cache_chain_mutex); 2792 mutex_lock(&slab_mutex);
2740 /* 2793 /*
2741 * the chain is never empty, cache_cache is never destroyed 2794 * the chain is never empty, cache_cache is never destroyed
2742 */ 2795 */
2743 list_del(&cachep->next); 2796 list_del(&cachep->list);
2744 if (__cache_shrink(cachep)) { 2797 if (__cache_shrink(cachep)) {
2745 slab_error(cachep, "Can't free all objects"); 2798 slab_error(cachep, "Can't free all objects");
2746 list_add(&cachep->next, &cache_chain); 2799 list_add(&cachep->list, &slab_caches);
2747 mutex_unlock(&cache_chain_mutex); 2800 mutex_unlock(&slab_mutex);
2748 put_online_cpus(); 2801 put_online_cpus();
2749 return; 2802 return;
2750 } 2803 }
@@ -2753,7 +2806,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
2753 rcu_barrier(); 2806 rcu_barrier();
2754 2807
2755 __kmem_cache_destroy(cachep); 2808 __kmem_cache_destroy(cachep);
2756 mutex_unlock(&cache_chain_mutex); 2809 mutex_unlock(&slab_mutex);
2757 put_online_cpus(); 2810 put_online_cpus();
2758} 2811}
2759EXPORT_SYMBOL(kmem_cache_destroy); 2812EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2840,10 +2893,10 @@ static void cache_init_objs(struct kmem_cache *cachep,
2840 slab_error(cachep, "constructor overwrote the" 2893 slab_error(cachep, "constructor overwrote the"
2841 " start of an object"); 2894 " start of an object");
2842 } 2895 }
2843 if ((cachep->buffer_size % PAGE_SIZE) == 0 && 2896 if ((cachep->size % PAGE_SIZE) == 0 &&
2844 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2897 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2845 kernel_map_pages(virt_to_page(objp), 2898 kernel_map_pages(virt_to_page(objp),
2846 cachep->buffer_size / PAGE_SIZE, 0); 2899 cachep->size / PAGE_SIZE, 0);
2847#else 2900#else
2848 if (cachep->ctor) 2901 if (cachep->ctor)
2849 cachep->ctor(objp); 2902 cachep->ctor(objp);
@@ -2857,9 +2910,9 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2857{ 2910{
2858 if (CONFIG_ZONE_DMA_FLAG) { 2911 if (CONFIG_ZONE_DMA_FLAG) {
2859 if (flags & GFP_DMA) 2912 if (flags & GFP_DMA)
2860 BUG_ON(!(cachep->gfpflags & GFP_DMA)); 2913 BUG_ON(!(cachep->allocflags & GFP_DMA));
2861 else 2914 else
2862 BUG_ON(cachep->gfpflags & GFP_DMA); 2915 BUG_ON(cachep->allocflags & GFP_DMA);
2863 } 2916 }
2864} 2917}
2865 2918
@@ -2918,8 +2971,8 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2918 nr_pages <<= cache->gfporder; 2971 nr_pages <<= cache->gfporder;
2919 2972
2920 do { 2973 do {
2921 page_set_cache(page, cache); 2974 page->slab_cache = cache;
2922 page_set_slab(page, slab); 2975 page->slab_page = slab;
2923 page++; 2976 page++;
2924 } while (--nr_pages); 2977 } while (--nr_pages);
2925} 2978}
@@ -3057,7 +3110,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
3057 kfree_debugcheck(objp); 3110 kfree_debugcheck(objp);
3058 page = virt_to_head_page(objp); 3111 page = virt_to_head_page(objp);
3059 3112
3060 slabp = page_get_slab(page); 3113 slabp = page->slab_page;
3061 3114
3062 if (cachep->flags & SLAB_RED_ZONE) { 3115 if (cachep->flags & SLAB_RED_ZONE) {
3063 verify_redzone_free(cachep, objp); 3116 verify_redzone_free(cachep, objp);
@@ -3077,10 +3130,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
3077#endif 3130#endif
3078 if (cachep->flags & SLAB_POISON) { 3131 if (cachep->flags & SLAB_POISON) {
3079#ifdef CONFIG_DEBUG_PAGEALLOC 3132#ifdef CONFIG_DEBUG_PAGEALLOC
3080 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 3133 if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
3081 store_stackinfo(cachep, objp, (unsigned long)caller); 3134 store_stackinfo(cachep, objp, (unsigned long)caller);
3082 kernel_map_pages(virt_to_page(objp), 3135 kernel_map_pages(virt_to_page(objp),
3083 cachep->buffer_size / PAGE_SIZE, 0); 3136 cachep->size / PAGE_SIZE, 0);
3084 } else { 3137 } else {
3085 poison_obj(cachep, objp, POISON_FREE); 3138 poison_obj(cachep, objp, POISON_FREE);
3086 } 3139 }
@@ -3120,16 +3173,19 @@ bad:
3120#define check_slabp(x,y) do { } while(0) 3173#define check_slabp(x,y) do { } while(0)
3121#endif 3174#endif
3122 3175
3123static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) 3176static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
3177 bool force_refill)
3124{ 3178{
3125 int batchcount; 3179 int batchcount;
3126 struct kmem_list3 *l3; 3180 struct kmem_list3 *l3;
3127 struct array_cache *ac; 3181 struct array_cache *ac;
3128 int node; 3182 int node;
3129 3183
3130retry:
3131 check_irq_off(); 3184 check_irq_off();
3132 node = numa_mem_id(); 3185 node = numa_mem_id();
3186 if (unlikely(force_refill))
3187 goto force_grow;
3188retry:
3133 ac = cpu_cache_get(cachep); 3189 ac = cpu_cache_get(cachep);
3134 batchcount = ac->batchcount; 3190 batchcount = ac->batchcount;
3135 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 3191 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3179,8 +3235,8 @@ retry:
3179 STATS_INC_ACTIVE(cachep); 3235 STATS_INC_ACTIVE(cachep);
3180 STATS_SET_HIGH(cachep); 3236 STATS_SET_HIGH(cachep);
3181 3237
3182 ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, 3238 ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
3183 node); 3239 node));
3184 } 3240 }
3185 check_slabp(cachep, slabp); 3241 check_slabp(cachep, slabp);
3186 3242
@@ -3199,18 +3255,23 @@ alloc_done:
3199 3255
3200 if (unlikely(!ac->avail)) { 3256 if (unlikely(!ac->avail)) {
3201 int x; 3257 int x;
3258force_grow:
3202 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); 3259 x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
3203 3260
3204 /* cache_grow can reenable interrupts, then ac could change. */ 3261 /* cache_grow can reenable interrupts, then ac could change. */
3205 ac = cpu_cache_get(cachep); 3262 ac = cpu_cache_get(cachep);
3206 if (!x && ac->avail == 0) /* no objects in sight? abort */ 3263 node = numa_mem_id();
3264
3265 /* no objects in sight? abort */
3266 if (!x && (ac->avail == 0 || force_refill))
3207 return NULL; 3267 return NULL;
3208 3268
3209 if (!ac->avail) /* objects refilled by interrupt? */ 3269 if (!ac->avail) /* objects refilled by interrupt? */
3210 goto retry; 3270 goto retry;
3211 } 3271 }
3212 ac->touched = 1; 3272 ac->touched = 1;
3213 return ac->entry[--ac->avail]; 3273
3274 return ac_get_obj(cachep, ac, flags, force_refill);
3214} 3275}
3215 3276
3216static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, 3277static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
@@ -3230,9 +3291,9 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3230 return objp; 3291 return objp;
3231 if (cachep->flags & SLAB_POISON) { 3292 if (cachep->flags & SLAB_POISON) {
3232#ifdef CONFIG_DEBUG_PAGEALLOC 3293#ifdef CONFIG_DEBUG_PAGEALLOC
3233 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 3294 if ((cachep->size % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
3234 kernel_map_pages(virt_to_page(objp), 3295 kernel_map_pages(virt_to_page(objp),
3235 cachep->buffer_size / PAGE_SIZE, 1); 3296 cachep->size / PAGE_SIZE, 1);
3236 else 3297 else
3237 check_poison_obj(cachep, objp); 3298 check_poison_obj(cachep, objp);
3238#else 3299#else
@@ -3261,8 +3322,8 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3261 struct slab *slabp; 3322 struct slab *slabp;
3262 unsigned objnr; 3323 unsigned objnr;
3263 3324
3264 slabp = page_get_slab(virt_to_head_page(objp)); 3325 slabp = virt_to_head_page(objp)->slab_page;
3265 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 3326 objnr = (unsigned)(objp - slabp->s_mem) / cachep->size;
3266 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; 3327 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
3267 } 3328 }
3268#endif 3329#endif
@@ -3285,30 +3346,42 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags)
3285 if (cachep == &cache_cache) 3346 if (cachep == &cache_cache)
3286 return false; 3347 return false;
3287 3348
3288 return should_failslab(obj_size(cachep), flags, cachep->flags); 3349 return should_failslab(cachep->object_size, flags, cachep->flags);
3289} 3350}
3290 3351
3291static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3352static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3292{ 3353{
3293 void *objp; 3354 void *objp;
3294 struct array_cache *ac; 3355 struct array_cache *ac;
3356 bool force_refill = false;
3295 3357
3296 check_irq_off(); 3358 check_irq_off();
3297 3359
3298 ac = cpu_cache_get(cachep); 3360 ac = cpu_cache_get(cachep);
3299 if (likely(ac->avail)) { 3361 if (likely(ac->avail)) {
3300 STATS_INC_ALLOCHIT(cachep);
3301 ac->touched = 1; 3362 ac->touched = 1;
3302 objp = ac->entry[--ac->avail]; 3363 objp = ac_get_obj(cachep, ac, flags, false);
3303 } else { 3364
3304 STATS_INC_ALLOCMISS(cachep);
3305 objp = cache_alloc_refill(cachep, flags);
3306 /* 3365 /*
3307 * the 'ac' may be updated by cache_alloc_refill(), 3366 * Allow for the possibility all avail objects are not allowed
3308 * and kmemleak_erase() requires its correct value. 3367 * by the current flags
3309 */ 3368 */
3310 ac = cpu_cache_get(cachep); 3369 if (objp) {
3370 STATS_INC_ALLOCHIT(cachep);
3371 goto out;
3372 }
3373 force_refill = true;
3311 } 3374 }
3375
3376 STATS_INC_ALLOCMISS(cachep);
3377 objp = cache_alloc_refill(cachep, flags, force_refill);
3378 /*
3379 * the 'ac' may be updated by cache_alloc_refill(),
3380 * and kmemleak_erase() requires its correct value.
3381 */
3382 ac = cpu_cache_get(cachep);
3383
3384out:
3312 /* 3385 /*
3313 * To avoid a false negative, if an object that is in one of the 3386 * To avoid a false negative, if an object that is in one of the
3314 * per-CPU caches is leaked, we need to make sure kmemleak doesn't 3387 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
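____cache_alloc() now tries the per-CPU array first and only counts a hit if ac_get_obj() actually returns an object the caller may use; otherwise it falls through to cache_alloc_refill() with force_refill set. A simplified sketch of that fast-path/slow-path split (hypothetical types; no locking, NUMA handling, or pfmemalloc filtering):

struct obj_cache {
	unsigned int avail;		/* objects currently cached      */
	unsigned int limit;		/* capacity of entry[]           */
	void *entry[64];		/* per-CPU stack of free objects */
};

/* Slow path stub: the real code grows a slab, refills entry[], retries. */
static void *cache_refill(struct obj_cache *ac)
{
	(void)ac;
	return (void *)0;
}

/* Fast path: pop the most recently freed object if one is cached. */
static void *cache_alloc_fast(struct obj_cache *ac)
{
	if (ac->avail)
		return ac->entry[--ac->avail];
	return cache_refill(ac);
}

/* Fast path for free: push the object back unless the array is full. */
static void cache_free_fast(struct obj_cache *ac, void *objp)
{
	if (ac->avail < ac->limit)
		ac->entry[ac->avail++] = objp;
	/* a real implementation would flush a batch to shared lists first */
}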
@@ -3336,7 +3409,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3336 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3409 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
3337 nid_alloc = cpuset_slab_spread_node(); 3410 nid_alloc = cpuset_slab_spread_node();
3338 else if (current->mempolicy) 3411 else if (current->mempolicy)
3339 nid_alloc = slab_node(current->mempolicy); 3412 nid_alloc = slab_node();
3340 if (nid_alloc != nid_here) 3413 if (nid_alloc != nid_here)
3341 return ____cache_alloc_node(cachep, flags, nid_alloc); 3414 return ____cache_alloc_node(cachep, flags, nid_alloc);
3342 return NULL; 3415 return NULL;
@@ -3368,7 +3441,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3368 3441
3369retry_cpuset: 3442retry_cpuset:
3370 cpuset_mems_cookie = get_mems_allowed(); 3443 cpuset_mems_cookie = get_mems_allowed();
3371 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 3444 zonelist = node_zonelist(slab_node(), flags);
3372 3445
3373retry: 3446retry:
3374 /* 3447 /*
@@ -3545,14 +3618,14 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3545 out: 3618 out:
3546 local_irq_restore(save_flags); 3619 local_irq_restore(save_flags);
3547 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller); 3620 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
3548 kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags, 3621 kmemleak_alloc_recursive(ptr, cachep->object_size, 1, cachep->flags,
3549 flags); 3622 flags);
3550 3623
3551 if (likely(ptr)) 3624 if (likely(ptr))
3552 kmemcheck_slab_alloc(cachep, flags, ptr, obj_size(cachep)); 3625 kmemcheck_slab_alloc(cachep, flags, ptr, cachep->object_size);
3553 3626
3554 if (unlikely((flags & __GFP_ZERO) && ptr)) 3627 if (unlikely((flags & __GFP_ZERO) && ptr))
3555 memset(ptr, 0, obj_size(cachep)); 3628 memset(ptr, 0, cachep->object_size);
3556 3629
3557 return ptr; 3630 return ptr;
3558} 3631}
@@ -3607,15 +3680,15 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
3607 objp = __do_cache_alloc(cachep, flags); 3680 objp = __do_cache_alloc(cachep, flags);
3608 local_irq_restore(save_flags); 3681 local_irq_restore(save_flags);
3609 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller); 3682 objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
3610 kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags, 3683 kmemleak_alloc_recursive(objp, cachep->object_size, 1, cachep->flags,
3611 flags); 3684 flags);
3612 prefetchw(objp); 3685 prefetchw(objp);
3613 3686
3614 if (likely(objp)) 3687 if (likely(objp))
3615 kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep)); 3688 kmemcheck_slab_alloc(cachep, flags, objp, cachep->object_size);
3616 3689
3617 if (unlikely((flags & __GFP_ZERO) && objp)) 3690 if (unlikely((flags & __GFP_ZERO) && objp))
3618 memset(objp, 0, obj_size(cachep)); 3691 memset(objp, 0, cachep->object_size);
3619 3692
3620 return objp; 3693 return objp;
3621} 3694}
@@ -3630,9 +3703,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
3630 struct kmem_list3 *l3; 3703 struct kmem_list3 *l3;
3631 3704
3632 for (i = 0; i < nr_objects; i++) { 3705 for (i = 0; i < nr_objects; i++) {
3633 void *objp = objpp[i]; 3706 void *objp;
3634 struct slab *slabp; 3707 struct slab *slabp;
3635 3708
3709 clear_obj_pfmemalloc(&objpp[i]);
3710 objp = objpp[i];
3711
3636 slabp = virt_to_slab(objp); 3712 slabp = virt_to_slab(objp);
3637 l3 = cachep->nodelists[node]; 3713 l3 = cachep->nodelists[node];
3638 list_del(&slabp->list); 3714 list_del(&slabp->list);
@@ -3731,7 +3807,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3731 kmemleak_free_recursive(objp, cachep->flags); 3807 kmemleak_free_recursive(objp, cachep->flags);
3732 objp = cache_free_debugcheck(cachep, objp, caller); 3808 objp = cache_free_debugcheck(cachep, objp, caller);
3733 3809
3734 kmemcheck_slab_free(cachep, objp, obj_size(cachep)); 3810 kmemcheck_slab_free(cachep, objp, cachep->object_size);
3735 3811
3736 /* 3812 /*
3737 * Skip calling cache_free_alien() when the platform is not numa. 3813 * Skip calling cache_free_alien() when the platform is not numa.
@@ -3750,7 +3826,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3750 cache_flusharray(cachep, ac); 3826 cache_flusharray(cachep, ac);
3751 } 3827 }
3752 3828
3753 ac->entry[ac->avail++] = objp; 3829 ac_put_obj(cachep, ac, objp);
3754} 3830}
3755 3831
3756/** 3832/**
@@ -3766,7 +3842,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3766 void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); 3842 void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3767 3843
3768 trace_kmem_cache_alloc(_RET_IP_, ret, 3844 trace_kmem_cache_alloc(_RET_IP_, ret,
3769 obj_size(cachep), cachep->buffer_size, flags); 3845 cachep->object_size, cachep->size, flags);
3770 3846
3771 return ret; 3847 return ret;
3772} 3848}
@@ -3794,7 +3870,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3794 __builtin_return_address(0)); 3870 __builtin_return_address(0));
3795 3871
3796 trace_kmem_cache_alloc_node(_RET_IP_, ret, 3872 trace_kmem_cache_alloc_node(_RET_IP_, ret,
3797 obj_size(cachep), cachep->buffer_size, 3873 cachep->object_size, cachep->size,
3798 flags, nodeid); 3874 flags, nodeid);
3799 3875
3800 return ret; 3876 return ret;
@@ -3876,7 +3952,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3876 ret = __cache_alloc(cachep, flags, caller); 3952 ret = __cache_alloc(cachep, flags, caller);
3877 3953
3878 trace_kmalloc((unsigned long) caller, ret, 3954 trace_kmalloc((unsigned long) caller, ret,
3879 size, cachep->buffer_size, flags); 3955 size, cachep->size, flags);
3880 3956
3881 return ret; 3957 return ret;
3882} 3958}
@@ -3916,9 +3992,9 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3916 unsigned long flags; 3992 unsigned long flags;
3917 3993
3918 local_irq_save(flags); 3994 local_irq_save(flags);
3919 debug_check_no_locks_freed(objp, obj_size(cachep)); 3995 debug_check_no_locks_freed(objp, cachep->object_size);
3920 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3996 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3921 debug_check_no_obj_freed(objp, obj_size(cachep)); 3997 debug_check_no_obj_freed(objp, cachep->object_size);
3922 __cache_free(cachep, objp, __builtin_return_address(0)); 3998 __cache_free(cachep, objp, __builtin_return_address(0));
3923 local_irq_restore(flags); 3999 local_irq_restore(flags);
3924 4000
@@ -3947,8 +4023,9 @@ void kfree(const void *objp)
3947 local_irq_save(flags); 4023 local_irq_save(flags);
3948 kfree_debugcheck(objp); 4024 kfree_debugcheck(objp);
3949 c = virt_to_cache(objp); 4025 c = virt_to_cache(objp);
3950 debug_check_no_locks_freed(objp, obj_size(c)); 4026 debug_check_no_locks_freed(objp, c->object_size);
3951 debug_check_no_obj_freed(objp, obj_size(c)); 4027
4028 debug_check_no_obj_freed(objp, c->object_size);
3952 __cache_free(c, (void *)objp, __builtin_return_address(0)); 4029 __cache_free(c, (void *)objp, __builtin_return_address(0));
3953 local_irq_restore(flags); 4030 local_irq_restore(flags);
3954} 4031}
@@ -3956,7 +4033,7 @@ EXPORT_SYMBOL(kfree);
3956 4033
3957unsigned int kmem_cache_size(struct kmem_cache *cachep) 4034unsigned int kmem_cache_size(struct kmem_cache *cachep)
3958{ 4035{
3959 return obj_size(cachep); 4036 return cachep->object_size;
3960} 4037}
3961EXPORT_SYMBOL(kmem_cache_size); 4038EXPORT_SYMBOL(kmem_cache_size);
3962 4039
@@ -4030,7 +4107,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
4030 return 0; 4107 return 0;
4031 4108
4032fail: 4109fail:
4033 if (!cachep->next.next) { 4110 if (!cachep->list.next) {
4034 /* Cache is not active yet. Roll back what we did */ 4111 /* Cache is not active yet. Roll back what we did */
4035 node--; 4112 node--;
4036 while (node >= 0) { 4113 while (node >= 0) {
@@ -4065,7 +4142,7 @@ static void do_ccupdate_local(void *info)
4065 new->new[smp_processor_id()] = old; 4142 new->new[smp_processor_id()] = old;
4066} 4143}
4067 4144
4068/* Always called with the cache_chain_mutex held */ 4145/* Always called with the slab_mutex held */
4069static int do_tune_cpucache(struct kmem_cache *cachep, int limit, 4146static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4070 int batchcount, int shared, gfp_t gfp) 4147 int batchcount, int shared, gfp_t gfp)
4071{ 4148{
@@ -4109,7 +4186,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
4109 return alloc_kmemlist(cachep, gfp); 4186 return alloc_kmemlist(cachep, gfp);
4110} 4187}
4111 4188
4112/* Called with cache_chain_mutex held always */ 4189/* Called with slab_mutex held always */
4113static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) 4190static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4114{ 4191{
4115 int err; 4192 int err;
@@ -4124,13 +4201,13 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4124 * The numbers are guessed, we should auto-tune as described by 4201 * The numbers are guessed, we should auto-tune as described by
4125 * Bonwick. 4202 * Bonwick.
4126 */ 4203 */
4127 if (cachep->buffer_size > 131072) 4204 if (cachep->size > 131072)
4128 limit = 1; 4205 limit = 1;
4129 else if (cachep->buffer_size > PAGE_SIZE) 4206 else if (cachep->size > PAGE_SIZE)
4130 limit = 8; 4207 limit = 8;
4131 else if (cachep->buffer_size > 1024) 4208 else if (cachep->size > 1024)
4132 limit = 24; 4209 limit = 24;
4133 else if (cachep->buffer_size > 256) 4210 else if (cachep->size > 256)
4134 limit = 54; 4211 limit = 54;
4135 else 4212 else
4136 limit = 120; 4213 limit = 120;
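The staircase above maps object size to the per-CPU array depth. The same heuristic as a standalone helper (thresholds copied from the hunk; PAGE_SIZE assumed to be 4096):

#include <stddef.h>

/* Per-CPU cache depth as a function of object size (values from above). */
static unsigned int pick_limit(size_t size)
{
	if (size > 131072)
		return 1;
	if (size > 4096)	/* PAGE_SIZE on most configurations */
		return 8;
	if (size > 1024)
		return 24;
	if (size > 256)
		return 54;
	return 120;
}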
@@ -4145,7 +4222,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4145 * to a larger limit. Thus disabled by default. 4222 * to a larger limit. Thus disabled by default.
4146 */ 4223 */
4147 shared = 0; 4224 shared = 0;
4148 if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1) 4225 if (cachep->size <= PAGE_SIZE && num_possible_cpus() > 1)
4149 shared = 8; 4226 shared = 8;
4150 4227
4151#if DEBUG 4228#if DEBUG
@@ -4211,11 +4288,11 @@ static void cache_reap(struct work_struct *w)
4211 int node = numa_mem_id(); 4288 int node = numa_mem_id();
4212 struct delayed_work *work = to_delayed_work(w); 4289 struct delayed_work *work = to_delayed_work(w);
4213 4290
4214 if (!mutex_trylock(&cache_chain_mutex)) 4291 if (!mutex_trylock(&slab_mutex))
4215 /* Give up. Setup the next iteration. */ 4292 /* Give up. Setup the next iteration. */
4216 goto out; 4293 goto out;
4217 4294
4218 list_for_each_entry(searchp, &cache_chain, next) { 4295 list_for_each_entry(searchp, &slab_caches, list) {
4219 check_irq_on(); 4296 check_irq_on();
4220 4297
4221 /* 4298 /*
@@ -4253,7 +4330,7 @@ next:
4253 cond_resched(); 4330 cond_resched();
4254 } 4331 }
4255 check_irq_on(); 4332 check_irq_on();
4256 mutex_unlock(&cache_chain_mutex); 4333 mutex_unlock(&slab_mutex);
4257 next_reap_node(); 4334 next_reap_node();
4258out: 4335out:
4259 /* Set up the next iteration */ 4336 /* Set up the next iteration */
@@ -4289,26 +4366,26 @@ static void *s_start(struct seq_file *m, loff_t *pos)
4289{ 4366{
4290 loff_t n = *pos; 4367 loff_t n = *pos;
4291 4368
4292 mutex_lock(&cache_chain_mutex); 4369 mutex_lock(&slab_mutex);
4293 if (!n) 4370 if (!n)
4294 print_slabinfo_header(m); 4371 print_slabinfo_header(m);
4295 4372
4296 return seq_list_start(&cache_chain, *pos); 4373 return seq_list_start(&slab_caches, *pos);
4297} 4374}
4298 4375
4299static void *s_next(struct seq_file *m, void *p, loff_t *pos) 4376static void *s_next(struct seq_file *m, void *p, loff_t *pos)
4300{ 4377{
4301 return seq_list_next(p, &cache_chain, pos); 4378 return seq_list_next(p, &slab_caches, pos);
4302} 4379}
4303 4380
4304static void s_stop(struct seq_file *m, void *p) 4381static void s_stop(struct seq_file *m, void *p)
4305{ 4382{
4306 mutex_unlock(&cache_chain_mutex); 4383 mutex_unlock(&slab_mutex);
4307} 4384}
4308 4385
4309static int s_show(struct seq_file *m, void *p) 4386static int s_show(struct seq_file *m, void *p)
4310{ 4387{
4311 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); 4388 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4312 struct slab *slabp; 4389 struct slab *slabp;
4313 unsigned long active_objs; 4390 unsigned long active_objs;
4314 unsigned long num_objs; 4391 unsigned long num_objs;
@@ -4364,7 +4441,7 @@ static int s_show(struct seq_file *m, void *p)
4364 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 4441 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
4365 4442
4366 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 4443 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
4367 name, active_objs, num_objs, cachep->buffer_size, 4444 name, active_objs, num_objs, cachep->size,
4368 cachep->num, (1 << cachep->gfporder)); 4445 cachep->num, (1 << cachep->gfporder));
4369 seq_printf(m, " : tunables %4u %4u %4u", 4446 seq_printf(m, " : tunables %4u %4u %4u",
4370 cachep->limit, cachep->batchcount, cachep->shared); 4447 cachep->limit, cachep->batchcount, cachep->shared);
@@ -4454,9 +4531,9 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4454 return -EINVAL; 4531 return -EINVAL;
4455 4532
4456 /* Find the cache in the chain of caches. */ 4533 /* Find the cache in the chain of caches. */
4457 mutex_lock(&cache_chain_mutex); 4534 mutex_lock(&slab_mutex);
4458 res = -EINVAL; 4535 res = -EINVAL;
4459 list_for_each_entry(cachep, &cache_chain, next) { 4536 list_for_each_entry(cachep, &slab_caches, list) {
4460 if (!strcmp(cachep->name, kbuf)) { 4537 if (!strcmp(cachep->name, kbuf)) {
4461 if (limit < 1 || batchcount < 1 || 4538 if (limit < 1 || batchcount < 1 ||
4462 batchcount > limit || shared < 0) { 4539 batchcount > limit || shared < 0) {
@@ -4469,7 +4546,7 @@ static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4469 break; 4546 break;
4470 } 4547 }
4471 } 4548 }
4472 mutex_unlock(&cache_chain_mutex); 4549 mutex_unlock(&slab_mutex);
4473 if (res >= 0) 4550 if (res >= 0)
4474 res = count; 4551 res = count;
4475 return res; 4552 return res;
@@ -4492,8 +4569,8 @@ static const struct file_operations proc_slabinfo_operations = {
4492 4569
4493static void *leaks_start(struct seq_file *m, loff_t *pos) 4570static void *leaks_start(struct seq_file *m, loff_t *pos)
4494{ 4571{
4495 mutex_lock(&cache_chain_mutex); 4572 mutex_lock(&slab_mutex);
4496 return seq_list_start(&cache_chain, *pos); 4573 return seq_list_start(&slab_caches, *pos);
4497} 4574}
4498 4575
4499static inline int add_caller(unsigned long *n, unsigned long v) 4576static inline int add_caller(unsigned long *n, unsigned long v)
@@ -4532,7 +4609,7 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4532 int i; 4609 int i;
4533 if (n[0] == n[1]) 4610 if (n[0] == n[1])
4534 return; 4611 return;
4535 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) { 4612 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) {
4536 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) 4613 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4537 continue; 4614 continue;
4538 if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) 4615 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
@@ -4558,7 +4635,7 @@ static void show_symbol(struct seq_file *m, unsigned long address)
4558 4635
4559static int leaks_show(struct seq_file *m, void *p) 4636static int leaks_show(struct seq_file *m, void *p)
4560{ 4637{
4561 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, next); 4638 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
4562 struct slab *slabp; 4639 struct slab *slabp;
4563 struct kmem_list3 *l3; 4640 struct kmem_list3 *l3;
4564 const char *name; 4641 const char *name;
@@ -4592,17 +4669,17 @@ static int leaks_show(struct seq_file *m, void *p)
4592 name = cachep->name; 4669 name = cachep->name;
4593 if (n[0] == n[1]) { 4670 if (n[0] == n[1]) {
4594 /* Increase the buffer size */ 4671 /* Increase the buffer size */
4595 mutex_unlock(&cache_chain_mutex); 4672 mutex_unlock(&slab_mutex);
4596 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL); 4673 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4597 if (!m->private) { 4674 if (!m->private) {
4598 /* Too bad, we are really out */ 4675 /* Too bad, we are really out */
4599 m->private = n; 4676 m->private = n;
4600 mutex_lock(&cache_chain_mutex); 4677 mutex_lock(&slab_mutex);
4601 return -ENOMEM; 4678 return -ENOMEM;
4602 } 4679 }
4603 *(unsigned long *)m->private = n[0] * 2; 4680 *(unsigned long *)m->private = n[0] * 2;
4604 kfree(n); 4681 kfree(n);
4605 mutex_lock(&cache_chain_mutex); 4682 mutex_lock(&slab_mutex);
4606 /* Now make sure this entry will be retried */ 4683 /* Now make sure this entry will be retried */
4607 m->count = m->size; 4684 m->count = m->size;
4608 return 0; 4685 return 0;
@@ -4677,6 +4754,6 @@ size_t ksize(const void *objp)
4677 if (unlikely(objp == ZERO_SIZE_PTR)) 4754 if (unlikely(objp == ZERO_SIZE_PTR))
4678 return 0; 4755 return 0;
4679 4756
4680 return obj_size(virt_to_cache(objp)); 4757 return virt_to_cache(objp)->object_size;
4681} 4758}
4682EXPORT_SYMBOL(ksize); 4759EXPORT_SYMBOL(ksize);
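Throughout the mm/slab.c hunks above, obj_size()/buffer_size give way to the common field names: object_size is the caller-visible payload that kmem_cache_size() and ksize() now report, while size is the full per-object stride printed by /proc/slabinfo. As a small illustration of the slabinfo line that s_show() emits with these fields, the sketch below reuses the exact format strings from the hunk; every value in it is made up:

#include <stdio.h>

/* Format one /proc/slabinfo line the way s_show() does above:
 * name, active objects, total objects, object size (cachep->size),
 * objects per slab, pages per slab, then the tunables. */
int main(void)
{
	const char *name = "dentry";            /* example cache name */
	unsigned long active_objs = 10234, num_objs = 11088;
	unsigned int size = 192, num = 21;      /* cachep->size, cachep->num */
	int gfporder = 0;
	unsigned int limit = 120, batchcount = 60, shared = 8;

	printf("%-17s %6lu %6lu %6u %4u %4d", name, active_objs, num_objs,
	       size, num, 1 << gfporder);
	printf(" : tunables %4u %4u %4u\n", limit, batchcount, shared);
	return 0;
}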
diff --git a/mm/slab.h b/mm/slab.h
new file mode 100644
index 00000000000..db7848caaa2
--- /dev/null
+++ b/mm/slab.h
@@ -0,0 +1,33 @@
1#ifndef MM_SLAB_H
2#define MM_SLAB_H
3/*
4 * Internal slab definitions
5 */
6
7/*
8 * State of the slab allocator.
9 *
10 * This is used to describe the states of the allocator during bootup.
11 * Allocators use this to gradually bootstrap themselves. Most allocators
12 * have the problem that the structures used for managing slab caches are
13 * allocated from slab caches themselves.
14 */
15enum slab_state {
16 DOWN, /* No slab functionality yet */
17 PARTIAL, /* SLUB: kmem_cache_node available */
18 PARTIAL_ARRAYCACHE, /* SLAB: kmalloc size for arraycache available */
19 PARTIAL_L3, /* SLAB: kmalloc size for l3 struct available */
20 UP, /* Slab caches usable but not all extras yet */
21 FULL /* Everything is working */
22};
23
24extern enum slab_state slab_state;
25
26/* The slab cache mutex protects the management structures during changes */
27extern struct mutex slab_mutex;
28extern struct list_head slab_caches;
29
30struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
31 size_t align, unsigned long flags, void (*ctor)(void *));
32
33#endif
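The new mm/slab.h shares one bootstrap state machine between the three allocators, and slab_common.c below defines slab_is_available() simply as slab_state >= UP. A minimal userspace model of how early code gates itself on that state (the static fallback pool is purely illustrative; the kernel uses bootmem/memblock for that job, and malloc() stands in for kmalloc()):

#include <stdio.h>
#include <stdlib.h>

/* Mirror of the enum from mm/slab.h and the test from mm/slab_common.c. */
enum slab_state { DOWN, PARTIAL, PARTIAL_ARRAYCACHE, PARTIAL_L3, UP, FULL };

static enum slab_state slab_state = DOWN;

static int slab_is_available(void)
{
	return slab_state >= UP;
}

static char early_pool[256];   /* illustrative early-boot fallback */
static size_t early_used;

static void *early_or_slab_alloc(size_t size)
{
	if (slab_is_available())
		return malloc(size);        /* stands in for kmalloc() */
	if (early_used + size > sizeof(early_pool))
		return NULL;
	early_used += size;
	return early_pool + early_used - size;
}

int main(void)
{
	void *a = early_or_slab_alloc(64);  /* served from the static pool */
	slab_state = UP;                    /* kmem_cache_init() reached UP */
	void *b = early_or_slab_alloc(64);  /* now served by the allocator */
	printf("early=%p late=%p\n", a, b);
	free(b);
	return 0;
}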
diff --git a/mm/slab_common.c b/mm/slab_common.c
new file mode 100644
index 00000000000..aa3ca5bb01b
--- /dev/null
+++ b/mm/slab_common.c
@@ -0,0 +1,120 @@
1/*
2 * Slab allocator functions that are independent of the allocator strategy
3 *
4 * (C) 2012 Christoph Lameter <cl@linux.com>
5 */
6#include <linux/slab.h>
7
8#include <linux/mm.h>
9#include <linux/poison.h>
10#include <linux/interrupt.h>
11#include <linux/memory.h>
12#include <linux/compiler.h>
13#include <linux/module.h>
14#include <linux/cpu.h>
15#include <linux/uaccess.h>
16#include <asm/cacheflush.h>
17#include <asm/tlbflush.h>
18#include <asm/page.h>
19
20#include "slab.h"
21
22enum slab_state slab_state;
23LIST_HEAD(slab_caches);
24DEFINE_MUTEX(slab_mutex);
25
26/*
27 * kmem_cache_create - Create a cache.
28 * @name: A string which is used in /proc/slabinfo to identify this cache.
29 * @size: The size of objects to be created in this cache.
30 * @align: The required alignment for the objects.
31 * @flags: SLAB flags
32 * @ctor: A constructor for the objects.
33 *
34 * Returns a ptr to the cache on success, NULL on failure.
 35 * Cannot be called within an interrupt, but can be interrupted.
36 * The @ctor is run when new pages are allocated by the cache.
37 *
38 * The flags are
39 *
40 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
41 * to catch references to uninitialised memory.
42 *
43 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
44 * for buffer overruns.
45 *
46 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
47 * cacheline. This can be beneficial if you're counting cycles as closely
48 * as davem.
49 */
50
51struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align,
52 unsigned long flags, void (*ctor)(void *))
53{
54 struct kmem_cache *s = NULL;
55
56#ifdef CONFIG_DEBUG_VM
57 if (!name || in_interrupt() || size < sizeof(void *) ||
58 size > KMALLOC_MAX_SIZE) {
59 printk(KERN_ERR "kmem_cache_create(%s) integrity check"
60 " failed\n", name);
61 goto out;
62 }
63#endif
64
65 get_online_cpus();
66 mutex_lock(&slab_mutex);
67
68#ifdef CONFIG_DEBUG_VM
69 list_for_each_entry(s, &slab_caches, list) {
70 char tmp;
71 int res;
72
73 /*
74 * This happens when the module gets unloaded and doesn't
75 * destroy its slab cache and no-one else reuses the vmalloc
76 * area of the module. Print a warning.
77 */
78 res = probe_kernel_address(s->name, tmp);
79 if (res) {
80 printk(KERN_ERR
81 "Slab cache with size %d has lost its name\n",
82 s->object_size);
83 continue;
84 }
85
86 if (!strcmp(s->name, name)) {
87 printk(KERN_ERR "kmem_cache_create(%s): Cache name"
88 " already exists.\n",
89 name);
90 dump_stack();
91 s = NULL;
92 goto oops;
93 }
94 }
95
96 WARN_ON(strchr(name, ' ')); /* It confuses parsers */
97#endif
98
99 s = __kmem_cache_create(name, size, align, flags, ctor);
100
101#ifdef CONFIG_DEBUG_VM
102oops:
103#endif
104 mutex_unlock(&slab_mutex);
105 put_online_cpus();
106
107#ifdef CONFIG_DEBUG_VM
108out:
109#endif
110 if (!s && (flags & SLAB_PANIC))
111 panic("kmem_cache_create: Failed to create slab '%s'\n", name);
112
113 return s;
114}
115EXPORT_SYMBOL(kmem_cache_create);
116
117int slab_is_available(void)
118{
119 return slab_state >= UP;
120}
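The kerneldoc above describes the one entry point every allocator now shares. As a reminder of how a client uses it, here is a minimal module-style sketch; struct foo, the cache name and the error handling are made up, and only the calls documented in the header (kmem_cache_create/alloc/free/destroy) are assumed:

#include <linux/module.h>
#include <linux/slab.h>

struct foo {
	int a, b;
};

static struct kmem_cache *foo_cache;

static int __init foo_init(void)
{
	struct foo *f;

	foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
				      0, SLAB_HWCACHE_ALIGN, NULL);
	if (!foo_cache)
		return -ENOMEM;

	/* Objects come back sized and aligned as requested at create time. */
	f = kmem_cache_alloc(foo_cache, GFP_KERNEL);
	if (f)
		kmem_cache_free(foo_cache, f);
	return 0;
}

static void __exit foo_exit(void)
{
	kmem_cache_destroy(foo_cache);  /* must not be called with live objects */
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");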
diff --git a/mm/slob.c b/mm/slob.c
index 8105be42cad..45d4ca79933 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -59,6 +59,8 @@
59 59
60#include <linux/kernel.h> 60#include <linux/kernel.h>
61#include <linux/slab.h> 61#include <linux/slab.h>
62#include "slab.h"
63
62#include <linux/mm.h> 64#include <linux/mm.h>
63#include <linux/swap.h> /* struct reclaim_state */ 65#include <linux/swap.h> /* struct reclaim_state */
64#include <linux/cache.h> 66#include <linux/cache.h>
@@ -92,36 +94,6 @@ struct slob_block {
92typedef struct slob_block slob_t; 94typedef struct slob_block slob_t;
93 95
94/* 96/*
95 * We use struct page fields to manage some slob allocation aspects,
96 * however to avoid the horrible mess in include/linux/mm_types.h, we'll
97 * just define our own struct page type variant here.
98 */
99struct slob_page {
100 union {
101 struct {
102 unsigned long flags; /* mandatory */
103 atomic_t _count; /* mandatory */
104 slobidx_t units; /* free units left in page */
105 unsigned long pad[2];
106 slob_t *free; /* first free slob_t in page */
107 struct list_head list; /* linked list of free pages */
108 };
109 struct page page;
110 };
111};
112static inline void struct_slob_page_wrong_size(void)
113{ BUILD_BUG_ON(sizeof(struct slob_page) != sizeof(struct page)); }
114
115/*
116 * free_slob_page: call before a slob_page is returned to the page allocator.
117 */
118static inline void free_slob_page(struct slob_page *sp)
119{
120 reset_page_mapcount(&sp->page);
121 sp->page.mapping = NULL;
122}
123
124/*
125 * All partially free slob pages go on these lists. 97 * All partially free slob pages go on these lists.
126 */ 98 */
127#define SLOB_BREAK1 256 99#define SLOB_BREAK1 256
@@ -131,46 +103,23 @@ static LIST_HEAD(free_slob_medium);
131static LIST_HEAD(free_slob_large); 103static LIST_HEAD(free_slob_large);
132 104
133/* 105/*
134 * is_slob_page: True for all slob pages (false for bigblock pages)
135 */
136static inline int is_slob_page(struct slob_page *sp)
137{
138 return PageSlab((struct page *)sp);
139}
140
141static inline void set_slob_page(struct slob_page *sp)
142{
143 __SetPageSlab((struct page *)sp);
144}
145
146static inline void clear_slob_page(struct slob_page *sp)
147{
148 __ClearPageSlab((struct page *)sp);
149}
150
151static inline struct slob_page *slob_page(const void *addr)
152{
153 return (struct slob_page *)virt_to_page(addr);
154}
155
156/*
157 * slob_page_free: true for pages on free_slob_pages list. 106 * slob_page_free: true for pages on free_slob_pages list.
158 */ 107 */
159static inline int slob_page_free(struct slob_page *sp) 108static inline int slob_page_free(struct page *sp)
160{ 109{
161 return PageSlobFree((struct page *)sp); 110 return PageSlobFree(sp);
162} 111}
163 112
164static void set_slob_page_free(struct slob_page *sp, struct list_head *list) 113static void set_slob_page_free(struct page *sp, struct list_head *list)
165{ 114{
166 list_add(&sp->list, list); 115 list_add(&sp->list, list);
167 __SetPageSlobFree((struct page *)sp); 116 __SetPageSlobFree(sp);
168} 117}
169 118
170static inline void clear_slob_page_free(struct slob_page *sp) 119static inline void clear_slob_page_free(struct page *sp)
171{ 120{
172 list_del(&sp->list); 121 list_del(&sp->list);
173 __ClearPageSlobFree((struct page *)sp); 122 __ClearPageSlobFree(sp);
174} 123}
175 124
176#define SLOB_UNIT sizeof(slob_t) 125#define SLOB_UNIT sizeof(slob_t)
@@ -267,12 +216,12 @@ static void slob_free_pages(void *b, int order)
267/* 216/*
268 * Allocate a slob block within a given slob_page sp. 217 * Allocate a slob block within a given slob_page sp.
269 */ 218 */
270static void *slob_page_alloc(struct slob_page *sp, size_t size, int align) 219static void *slob_page_alloc(struct page *sp, size_t size, int align)
271{ 220{
272 slob_t *prev, *cur, *aligned = NULL; 221 slob_t *prev, *cur, *aligned = NULL;
273 int delta = 0, units = SLOB_UNITS(size); 222 int delta = 0, units = SLOB_UNITS(size);
274 223
275 for (prev = NULL, cur = sp->free; ; prev = cur, cur = slob_next(cur)) { 224 for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
276 slobidx_t avail = slob_units(cur); 225 slobidx_t avail = slob_units(cur);
277 226
278 if (align) { 227 if (align) {
@@ -296,12 +245,12 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
296 if (prev) 245 if (prev)
297 set_slob(prev, slob_units(prev), next); 246 set_slob(prev, slob_units(prev), next);
298 else 247 else
299 sp->free = next; 248 sp->freelist = next;
300 } else { /* fragment */ 249 } else { /* fragment */
301 if (prev) 250 if (prev)
302 set_slob(prev, slob_units(prev), cur + units); 251 set_slob(prev, slob_units(prev), cur + units);
303 else 252 else
304 sp->free = cur + units; 253 sp->freelist = cur + units;
305 set_slob(cur + units, avail - units, next); 254 set_slob(cur + units, avail - units, next);
306 } 255 }
307 256
@@ -320,7 +269,7 @@ static void *slob_page_alloc(struct slob_page *sp, size_t size, int align)
320 */ 269 */
321static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) 270static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
322{ 271{
323 struct slob_page *sp; 272 struct page *sp;
324 struct list_head *prev; 273 struct list_head *prev;
325 struct list_head *slob_list; 274 struct list_head *slob_list;
326 slob_t *b = NULL; 275 slob_t *b = NULL;
@@ -341,7 +290,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
341 * If there's a node specification, search for a partial 290 * If there's a node specification, search for a partial
342 * page with a matching node id in the freelist. 291 * page with a matching node id in the freelist.
343 */ 292 */
344 if (node != -1 && page_to_nid(&sp->page) != node) 293 if (node != -1 && page_to_nid(sp) != node)
345 continue; 294 continue;
346#endif 295#endif
347 /* Enough room on this page? */ 296 /* Enough room on this page? */
@@ -369,12 +318,12 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
369 b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node); 318 b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
370 if (!b) 319 if (!b)
371 return NULL; 320 return NULL;
372 sp = slob_page(b); 321 sp = virt_to_page(b);
373 set_slob_page(sp); 322 __SetPageSlab(sp);
374 323
375 spin_lock_irqsave(&slob_lock, flags); 324 spin_lock_irqsave(&slob_lock, flags);
376 sp->units = SLOB_UNITS(PAGE_SIZE); 325 sp->units = SLOB_UNITS(PAGE_SIZE);
377 sp->free = b; 326 sp->freelist = b;
378 INIT_LIST_HEAD(&sp->list); 327 INIT_LIST_HEAD(&sp->list);
379 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE)); 328 set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
380 set_slob_page_free(sp, slob_list); 329 set_slob_page_free(sp, slob_list);
@@ -392,7 +341,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
392 */ 341 */
393static void slob_free(void *block, int size) 342static void slob_free(void *block, int size)
394{ 343{
395 struct slob_page *sp; 344 struct page *sp;
396 slob_t *prev, *next, *b = (slob_t *)block; 345 slob_t *prev, *next, *b = (slob_t *)block;
397 slobidx_t units; 346 slobidx_t units;
398 unsigned long flags; 347 unsigned long flags;
@@ -402,7 +351,7 @@ static void slob_free(void *block, int size)
402 return; 351 return;
403 BUG_ON(!size); 352 BUG_ON(!size);
404 353
405 sp = slob_page(block); 354 sp = virt_to_page(block);
406 units = SLOB_UNITS(size); 355 units = SLOB_UNITS(size);
407 356
408 spin_lock_irqsave(&slob_lock, flags); 357 spin_lock_irqsave(&slob_lock, flags);
@@ -412,8 +361,8 @@ static void slob_free(void *block, int size)
412 if (slob_page_free(sp)) 361 if (slob_page_free(sp))
413 clear_slob_page_free(sp); 362 clear_slob_page_free(sp);
414 spin_unlock_irqrestore(&slob_lock, flags); 363 spin_unlock_irqrestore(&slob_lock, flags);
415 clear_slob_page(sp); 364 __ClearPageSlab(sp);
416 free_slob_page(sp); 365 reset_page_mapcount(sp);
417 slob_free_pages(b, 0); 366 slob_free_pages(b, 0);
418 return; 367 return;
419 } 368 }
@@ -421,7 +370,7 @@ static void slob_free(void *block, int size)
421 if (!slob_page_free(sp)) { 370 if (!slob_page_free(sp)) {
422 /* This slob page is about to become partially free. Easy! */ 371 /* This slob page is about to become partially free. Easy! */
423 sp->units = units; 372 sp->units = units;
424 sp->free = b; 373 sp->freelist = b;
425 set_slob(b, units, 374 set_slob(b, units,
426 (void *)((unsigned long)(b + 375 (void *)((unsigned long)(b +
427 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); 376 SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK));
@@ -441,15 +390,15 @@ static void slob_free(void *block, int size)
441 */ 390 */
442 sp->units += units; 391 sp->units += units;
443 392
444 if (b < sp->free) { 393 if (b < (slob_t *)sp->freelist) {
445 if (b + units == sp->free) { 394 if (b + units == sp->freelist) {
446 units += slob_units(sp->free); 395 units += slob_units(sp->freelist);
447 sp->free = slob_next(sp->free); 396 sp->freelist = slob_next(sp->freelist);
448 } 397 }
449 set_slob(b, units, sp->free); 398 set_slob(b, units, sp->freelist);
450 sp->free = b; 399 sp->freelist = b;
451 } else { 400 } else {
452 prev = sp->free; 401 prev = sp->freelist;
453 next = slob_next(prev); 402 next = slob_next(prev);
454 while (b > next) { 403 while (b > next) {
455 prev = next; 404 prev = next;
@@ -522,7 +471,7 @@ EXPORT_SYMBOL(__kmalloc_node);
522 471
523void kfree(const void *block) 472void kfree(const void *block)
524{ 473{
525 struct slob_page *sp; 474 struct page *sp;
526 475
527 trace_kfree(_RET_IP_, block); 476 trace_kfree(_RET_IP_, block);
528 477
@@ -530,43 +479,36 @@ void kfree(const void *block)
530 return; 479 return;
531 kmemleak_free(block); 480 kmemleak_free(block);
532 481
533 sp = slob_page(block); 482 sp = virt_to_page(block);
534 if (is_slob_page(sp)) { 483 if (PageSlab(sp)) {
535 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 484 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
536 unsigned int *m = (unsigned int *)(block - align); 485 unsigned int *m = (unsigned int *)(block - align);
537 slob_free(m, *m + align); 486 slob_free(m, *m + align);
538 } else 487 } else
539 put_page(&sp->page); 488 put_page(sp);
540} 489}
541EXPORT_SYMBOL(kfree); 490EXPORT_SYMBOL(kfree);
542 491
543/* can't use ksize for kmem_cache_alloc memory, only kmalloc */ 492/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
544size_t ksize(const void *block) 493size_t ksize(const void *block)
545{ 494{
546 struct slob_page *sp; 495 struct page *sp;
547 496
548 BUG_ON(!block); 497 BUG_ON(!block);
549 if (unlikely(block == ZERO_SIZE_PTR)) 498 if (unlikely(block == ZERO_SIZE_PTR))
550 return 0; 499 return 0;
551 500
552 sp = slob_page(block); 501 sp = virt_to_page(block);
553 if (is_slob_page(sp)) { 502 if (PageSlab(sp)) {
554 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 503 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
555 unsigned int *m = (unsigned int *)(block - align); 504 unsigned int *m = (unsigned int *)(block - align);
556 return SLOB_UNITS(*m) * SLOB_UNIT; 505 return SLOB_UNITS(*m) * SLOB_UNIT;
557 } else 506 } else
558 return sp->page.private; 507 return sp->private;
559} 508}
560EXPORT_SYMBOL(ksize); 509EXPORT_SYMBOL(ksize);
561 510
562struct kmem_cache { 511struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
563 unsigned int size, align;
564 unsigned long flags;
565 const char *name;
566 void (*ctor)(void *);
567};
568
569struct kmem_cache *kmem_cache_create(const char *name, size_t size,
570 size_t align, unsigned long flags, void (*ctor)(void *)) 512 size_t align, unsigned long flags, void (*ctor)(void *))
571{ 513{
572 struct kmem_cache *c; 514 struct kmem_cache *c;
@@ -589,13 +531,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
589 c->align = ARCH_SLAB_MINALIGN; 531 c->align = ARCH_SLAB_MINALIGN;
590 if (c->align < align) 532 if (c->align < align)
591 c->align = align; 533 c->align = align;
592 } else if (flags & SLAB_PANIC)
593 panic("Cannot create slab cache %s\n", name);
594 534
595 kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL); 535 kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL);
536 c->refcount = 1;
537 }
596 return c; 538 return c;
597} 539}
598EXPORT_SYMBOL(kmem_cache_create);
599 540
600void kmem_cache_destroy(struct kmem_cache *c) 541void kmem_cache_destroy(struct kmem_cache *c)
601{ 542{
@@ -678,19 +619,12 @@ int kmem_cache_shrink(struct kmem_cache *d)
678} 619}
679EXPORT_SYMBOL(kmem_cache_shrink); 620EXPORT_SYMBOL(kmem_cache_shrink);
680 621
681static unsigned int slob_ready __read_mostly;
682
683int slab_is_available(void)
684{
685 return slob_ready;
686}
687
688void __init kmem_cache_init(void) 622void __init kmem_cache_init(void)
689{ 623{
690 slob_ready = 1; 624 slab_state = UP;
691} 625}
692 626
693void __init kmem_cache_init_late(void) 627void __init kmem_cache_init_late(void)
694{ 628{
695 /* Nothing to do */ 629 slab_state = FULL;
696} 630}
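The SLOB hunks above drop the private struct slob_page and work on struct page directly, but the kfree()/ksize() paths still rely on the same trick: kmalloc stores the request size in a word placed 'align' bytes before the pointer it hands out, so the free path can recover the size with no other metadata. A userspace sketch of that header scheme (toy_* names and the use of malloc() are illustrative only):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MINALIGN sizeof(unsigned long)

static void *toy_kmalloc(size_t size)
{
	unsigned char *base = malloc(size + MINALIGN);
	if (!base)
		return NULL;
	*(size_t *)base = size;          /* header: remember the request */
	return base + MINALIGN;
}

static size_t toy_ksize(const void *block)
{
	return *(const size_t *)((const unsigned char *)block - MINALIGN);
}

static void toy_kfree(void *block)
{
	free((unsigned char *)block - MINALIGN);
}

int main(void)
{
	char *p = toy_kmalloc(100);
	memcpy(p, "hello", 6);
	printf("ksize(p) = %zu\n", toy_ksize(p));   /* prints 100 */
	toy_kfree(p);
	return 0;
}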
diff --git a/mm/slub.c b/mm/slub.c
index 8c691fa1cf3..8f78e257703 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include "slab.h"
19#include <linux/proc_fs.h> 20#include <linux/proc_fs.h>
20#include <linux/seq_file.h> 21#include <linux/seq_file.h>
21#include <linux/kmemcheck.h> 22#include <linux/kmemcheck.h>
@@ -33,15 +34,17 @@
33 34
34#include <trace/events/kmem.h> 35#include <trace/events/kmem.h>
35 36
37#include "internal.h"
38
36/* 39/*
37 * Lock order: 40 * Lock order:
38 * 1. slub_lock (Global Semaphore) 41 * 1. slab_mutex (Global Mutex)
39 * 2. node->list_lock 42 * 2. node->list_lock
40 * 3. slab_lock(page) (Only on some arches and for debugging) 43 * 3. slab_lock(page) (Only on some arches and for debugging)
41 * 44 *
42 * slub_lock 45 * slab_mutex
43 * 46 *
44 * The role of the slub_lock is to protect the list of all the slabs 47 * The role of the slab_mutex is to protect the list of all the slabs
45 * and to synchronize major metadata changes to slab cache structures. 48 * and to synchronize major metadata changes to slab cache structures.
46 * 49 *
47 * The slab_lock is only used for debugging and on arches that do not 50 * The slab_lock is only used for debugging and on arches that do not
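The rewritten lock-order comment documents the only safe nesting: slab_mutex, then node->list_lock, then slab_lock(page). A trivial sketch of honoring such an order with two locks, where pthread mutexes merely stand in for the kernel primitives:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t slab_mutex_model = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t node_list_lock_model = PTHREAD_MUTEX_INITIALIZER;

/* Always take the coarse "all caches" lock before any per-node list lock,
 * never the reverse, so the two can't deadlock against each other. */
static void walk_all_slabs(void)
{
	pthread_mutex_lock(&slab_mutex_model);      /* level 1 */
	pthread_mutex_lock(&node_list_lock_model);  /* level 2 */
	/* ... iterate one node's partial list ... */
	pthread_mutex_unlock(&node_list_lock_model);
	pthread_mutex_unlock(&slab_mutex_model);
}

int main(void)
{
	walk_all_slabs();
	puts("lock order respected");
	return 0;
}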
@@ -182,17 +185,6 @@ static int kmem_size = sizeof(struct kmem_cache);
182static struct notifier_block slab_notifier; 185static struct notifier_block slab_notifier;
183#endif 186#endif
184 187
185static enum {
186 DOWN, /* No slab functionality available */
187 PARTIAL, /* Kmem_cache_node works */
188 UP, /* Everything works but does not show up in sysfs */
189 SYSFS /* Sysfs up */
190} slab_state = DOWN;
191
192/* A list of all slab caches on the system */
193static DECLARE_RWSEM(slub_lock);
194static LIST_HEAD(slab_caches);
195
196/* 188/*
197 * Tracking user of a slab. 189 * Tracking user of a slab.
198 */ 190 */
@@ -237,11 +229,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
237 * Core slab cache functions 229 * Core slab cache functions
238 *******************************************************************/ 230 *******************************************************************/
239 231
240int slab_is_available(void)
241{
242 return slab_state >= UP;
243}
244
245static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 232static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
246{ 233{
247 return s->node[node]; 234 return s->node[node];
@@ -311,7 +298,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
311 * and whatever may come after it. 298 * and whatever may come after it.
312 */ 299 */
313 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 300 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
314 return s->objsize; 301 return s->object_size;
315 302
316#endif 303#endif
317 /* 304 /*
@@ -609,11 +596,11 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
609 if (p > addr + 16) 596 if (p > addr + 16)
610 print_section("Bytes b4 ", p - 16, 16); 597 print_section("Bytes b4 ", p - 16, 16);
611 598
612 print_section("Object ", p, min_t(unsigned long, s->objsize, 599 print_section("Object ", p, min_t(unsigned long, s->object_size,
613 PAGE_SIZE)); 600 PAGE_SIZE));
614 if (s->flags & SLAB_RED_ZONE) 601 if (s->flags & SLAB_RED_ZONE)
615 print_section("Redzone ", p + s->objsize, 602 print_section("Redzone ", p + s->object_size,
616 s->inuse - s->objsize); 603 s->inuse - s->object_size);
617 604
618 if (s->offset) 605 if (s->offset)
619 off = s->offset + sizeof(void *); 606 off = s->offset + sizeof(void *);
@@ -655,12 +642,12 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
655 u8 *p = object; 642 u8 *p = object;
656 643
657 if (s->flags & __OBJECT_POISON) { 644 if (s->flags & __OBJECT_POISON) {
658 memset(p, POISON_FREE, s->objsize - 1); 645 memset(p, POISON_FREE, s->object_size - 1);
659 p[s->objsize - 1] = POISON_END; 646 p[s->object_size - 1] = POISON_END;
660 } 647 }
661 648
662 if (s->flags & SLAB_RED_ZONE) 649 if (s->flags & SLAB_RED_ZONE)
663 memset(p + s->objsize, val, s->inuse - s->objsize); 650 memset(p + s->object_size, val, s->inuse - s->object_size);
664} 651}
665 652
666static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 653static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
@@ -705,10 +692,10 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
705 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 692 * Poisoning uses 0x6b (POISON_FREE) and the last byte is
706 * 0xa5 (POISON_END) 693 * 0xa5 (POISON_END)
707 * 694 *
708 * object + s->objsize 695 * object + s->object_size
709 * Padding to reach word boundary. This is also used for Redzoning. 696 * Padding to reach word boundary. This is also used for Redzoning.
710 * Padding is extended by another word if Redzoning is enabled and 697 * Padding is extended by another word if Redzoning is enabled and
711 * objsize == inuse. 698 * object_size == inuse.
712 * 699 *
713 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 700 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
714 * 0xcc (RED_ACTIVE) for objects in use. 701 * 0xcc (RED_ACTIVE) for objects in use.
@@ -727,7 +714,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
727 * object + s->size 714 * object + s->size
728 * Nothing is used beyond s->size. 715 * Nothing is used beyond s->size.
729 * 716 *
730 * If slabcaches are merged then the objsize and inuse boundaries are mostly 717 * If slabcaches are merged then the object_size and inuse boundaries are mostly
731 * ignored. And therefore no slab options that rely on these boundaries 718 * ignored. And therefore no slab options that rely on these boundaries
732 * may be used with merged slabcaches. 719 * may be used with merged slabcaches.
733 */ 720 */
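The layout comment above (and the calculate_sizes() hunk further down) spells out one concrete rule: the bytes between object_size and inuse serve as the red zone, and if word-rounding added no padding an extra word is appended so a red zone still exists. A small sketch of just that rule, assuming 8-byte words; everything beyond the documented rule (free pointer, tracking data) is deliberately left out:

#include <stdio.h>

#define WORD sizeof(void *)
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

static void layout(unsigned long object_size, int red_zone)
{
	unsigned long size = ALIGN_UP(object_size, WORD);

	if (red_zone && size == object_size)
		size += WORD;    /* make room for a red zone word */

	printf("object_size=%lu inuse=%lu redzone bytes=%lu\n",
	       object_size, size, size - object_size);
}

int main(void)
{
	layout(100, 1);   /* alignment padding doubles as the red zone */
	layout(96, 1);    /* already word aligned: extra word appended */
	return 0;
}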
@@ -787,25 +774,25 @@ static int check_object(struct kmem_cache *s, struct page *page,
787 void *object, u8 val) 774 void *object, u8 val)
788{ 775{
789 u8 *p = object; 776 u8 *p = object;
790 u8 *endobject = object + s->objsize; 777 u8 *endobject = object + s->object_size;
791 778
792 if (s->flags & SLAB_RED_ZONE) { 779 if (s->flags & SLAB_RED_ZONE) {
793 if (!check_bytes_and_report(s, page, object, "Redzone", 780 if (!check_bytes_and_report(s, page, object, "Redzone",
794 endobject, val, s->inuse - s->objsize)) 781 endobject, val, s->inuse - s->object_size))
795 return 0; 782 return 0;
796 } else { 783 } else {
797 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { 784 if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) {
798 check_bytes_and_report(s, page, p, "Alignment padding", 785 check_bytes_and_report(s, page, p, "Alignment padding",
799 endobject, POISON_INUSE, s->inuse - s->objsize); 786 endobject, POISON_INUSE, s->inuse - s->object_size);
800 } 787 }
801 } 788 }
802 789
803 if (s->flags & SLAB_POISON) { 790 if (s->flags & SLAB_POISON) {
804 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && 791 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
805 (!check_bytes_and_report(s, page, p, "Poison", p, 792 (!check_bytes_and_report(s, page, p, "Poison", p,
806 POISON_FREE, s->objsize - 1) || 793 POISON_FREE, s->object_size - 1) ||
807 !check_bytes_and_report(s, page, p, "Poison", 794 !check_bytes_and_report(s, page, p, "Poison",
808 p + s->objsize - 1, POISON_END, 1))) 795 p + s->object_size - 1, POISON_END, 1)))
809 return 0; 796 return 0;
810 /* 797 /*
811 * check_pad_bytes cleans up on its own. 798 * check_pad_bytes cleans up on its own.
@@ -926,7 +913,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
926 page->freelist); 913 page->freelist);
927 914
928 if (!alloc) 915 if (!alloc)
929 print_section("Object ", (void *)object, s->objsize); 916 print_section("Object ", (void *)object, s->object_size);
930 917
931 dump_stack(); 918 dump_stack();
932 } 919 }
@@ -942,14 +929,14 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
942 lockdep_trace_alloc(flags); 929 lockdep_trace_alloc(flags);
943 might_sleep_if(flags & __GFP_WAIT); 930 might_sleep_if(flags & __GFP_WAIT);
944 931
945 return should_failslab(s->objsize, flags, s->flags); 932 return should_failslab(s->object_size, flags, s->flags);
946} 933}
947 934
948static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) 935static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
949{ 936{
950 flags &= gfp_allowed_mask; 937 flags &= gfp_allowed_mask;
951 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); 938 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
952 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); 939 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
953} 940}
954 941
955static inline void slab_free_hook(struct kmem_cache *s, void *x) 942static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -966,13 +953,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
966 unsigned long flags; 953 unsigned long flags;
967 954
968 local_irq_save(flags); 955 local_irq_save(flags);
969 kmemcheck_slab_free(s, x, s->objsize); 956 kmemcheck_slab_free(s, x, s->object_size);
970 debug_check_no_locks_freed(x, s->objsize); 957 debug_check_no_locks_freed(x, s->object_size);
971 local_irq_restore(flags); 958 local_irq_restore(flags);
972 } 959 }
973#endif 960#endif
974 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 961 if (!(s->flags & SLAB_DEBUG_OBJECTS))
975 debug_check_no_obj_freed(x, s->objsize); 962 debug_check_no_obj_freed(x, s->object_size);
976} 963}
977 964
978/* 965/*
@@ -1207,7 +1194,7 @@ out:
1207 1194
1208__setup("slub_debug", setup_slub_debug); 1195__setup("slub_debug", setup_slub_debug);
1209 1196
1210static unsigned long kmem_cache_flags(unsigned long objsize, 1197static unsigned long kmem_cache_flags(unsigned long object_size,
1211 unsigned long flags, const char *name, 1198 unsigned long flags, const char *name,
1212 void (*ctor)(void *)) 1199 void (*ctor)(void *))
1213{ 1200{
@@ -1237,7 +1224,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1237static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, 1224static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1238 struct page *page) {} 1225 struct page *page) {}
1239static inline void remove_full(struct kmem_cache *s, struct page *page) {} 1226static inline void remove_full(struct kmem_cache *s, struct page *page) {}
1240static inline unsigned long kmem_cache_flags(unsigned long objsize, 1227static inline unsigned long kmem_cache_flags(unsigned long object_size,
1241 unsigned long flags, const char *name, 1228 unsigned long flags, const char *name,
1242 void (*ctor)(void *)) 1229 void (*ctor)(void *))
1243{ 1230{
@@ -1314,13 +1301,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1314 stat(s, ORDER_FALLBACK); 1301 stat(s, ORDER_FALLBACK);
1315 } 1302 }
1316 1303
1317 if (flags & __GFP_WAIT) 1304 if (kmemcheck_enabled && page
1318 local_irq_disable();
1319
1320 if (!page)
1321 return NULL;
1322
1323 if (kmemcheck_enabled
1324 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1305 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1325 int pages = 1 << oo_order(oo); 1306 int pages = 1 << oo_order(oo);
1326 1307
@@ -1336,6 +1317,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1336 kmemcheck_mark_unallocated_pages(page, pages); 1317 kmemcheck_mark_unallocated_pages(page, pages);
1337 } 1318 }
1338 1319
1320 if (flags & __GFP_WAIT)
1321 local_irq_disable();
1322 if (!page)
1323 return NULL;
1324
1339 page->objects = oo_objects(oo); 1325 page->objects = oo_objects(oo);
1340 mod_zone_page_state(page_zone(page), 1326 mod_zone_page_state(page_zone(page),
1341 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1327 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
@@ -1370,6 +1356,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1370 inc_slabs_node(s, page_to_nid(page), page->objects); 1356 inc_slabs_node(s, page_to_nid(page), page->objects);
1371 page->slab = s; 1357 page->slab = s;
1372 __SetPageSlab(page); 1358 __SetPageSlab(page);
1359 if (page->pfmemalloc)
1360 SetPageSlabPfmemalloc(page);
1373 1361
1374 start = page_address(page); 1362 start = page_address(page);
1375 1363
@@ -1413,6 +1401,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1413 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1401 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1414 -pages); 1402 -pages);
1415 1403
1404 __ClearPageSlabPfmemalloc(page);
1416 __ClearPageSlab(page); 1405 __ClearPageSlab(page);
1417 reset_page_mapcount(page); 1406 reset_page_mapcount(page);
1418 if (current->reclaim_state) 1407 if (current->reclaim_state)
@@ -1490,12 +1479,12 @@ static inline void remove_partial(struct kmem_cache_node *n,
1490} 1479}
1491 1480
1492/* 1481/*
1493 * Lock slab, remove from the partial list and put the object into the 1482 * Remove slab from the partial list, freeze it and
1494 * per cpu freelist. 1483 * return the pointer to the freelist.
1495 * 1484 *
1496 * Returns a list of objects or NULL if it fails. 1485 * Returns a list of objects or NULL if it fails.
1497 * 1486 *
1498 * Must hold list_lock. 1487 * Must hold list_lock since we modify the partial list.
1499 */ 1488 */
1500static inline void *acquire_slab(struct kmem_cache *s, 1489static inline void *acquire_slab(struct kmem_cache *s,
1501 struct kmem_cache_node *n, struct page *page, 1490 struct kmem_cache_node *n, struct page *page,
@@ -1510,26 +1499,27 @@ static inline void *acquire_slab(struct kmem_cache *s,
1510 * The old freelist is the list of objects for the 1499 * The old freelist is the list of objects for the
1511 * per cpu allocation list. 1500 * per cpu allocation list.
1512 */ 1501 */
1513 do { 1502 freelist = page->freelist;
1514 freelist = page->freelist; 1503 counters = page->counters;
1515 counters = page->counters; 1504 new.counters = counters;
1516 new.counters = counters; 1505 if (mode) {
1517 if (mode) { 1506 new.inuse = page->objects;
1518 new.inuse = page->objects; 1507 new.freelist = NULL;
1519 new.freelist = NULL; 1508 } else {
1520 } else { 1509 new.freelist = freelist;
1521 new.freelist = freelist; 1510 }
1522 }
1523 1511
1524 VM_BUG_ON(new.frozen); 1512 VM_BUG_ON(new.frozen);
1525 new.frozen = 1; 1513 new.frozen = 1;
1526 1514
1527 } while (!__cmpxchg_double_slab(s, page, 1515 if (!__cmpxchg_double_slab(s, page,
1528 freelist, counters, 1516 freelist, counters,
1529 new.freelist, new.counters, 1517 new.freelist, new.counters,
1530 "lock and freeze")); 1518 "acquire_slab"))
1519 return NULL;
1531 1520
1532 remove_partial(n, page); 1521 remove_partial(n, page);
1522 WARN_ON(!freelist);
1533 return freelist; 1523 return freelist;
1534} 1524}
1535 1525
@@ -1563,7 +1553,6 @@ static void *get_partial_node(struct kmem_cache *s,
1563 1553
1564 if (!object) { 1554 if (!object) {
1565 c->page = page; 1555 c->page = page;
1566 c->node = page_to_nid(page);
1567 stat(s, ALLOC_FROM_PARTIAL); 1556 stat(s, ALLOC_FROM_PARTIAL);
1568 object = t; 1557 object = t;
1569 available = page->objects - page->inuse; 1558 available = page->objects - page->inuse;
@@ -1617,7 +1606,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1617 1606
1618 do { 1607 do {
1619 cpuset_mems_cookie = get_mems_allowed(); 1608 cpuset_mems_cookie = get_mems_allowed();
1620 zonelist = node_zonelist(slab_node(current->mempolicy), flags); 1609 zonelist = node_zonelist(slab_node(), flags);
1621 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1610 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1622 struct kmem_cache_node *n; 1611 struct kmem_cache_node *n;
1623 1612
@@ -1731,14 +1720,12 @@ void init_kmem_cache_cpus(struct kmem_cache *s)
1731/* 1720/*
1732 * Remove the cpu slab 1721 * Remove the cpu slab
1733 */ 1722 */
1734static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1723static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freelist)
1735{ 1724{
1736 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; 1725 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
1737 struct page *page = c->page;
1738 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1726 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1739 int lock = 0; 1727 int lock = 0;
1740 enum slab_modes l = M_NONE, m = M_NONE; 1728 enum slab_modes l = M_NONE, m = M_NONE;
1741 void *freelist;
1742 void *nextfree; 1729 void *nextfree;
1743 int tail = DEACTIVATE_TO_HEAD; 1730 int tail = DEACTIVATE_TO_HEAD;
1744 struct page new; 1731 struct page new;
@@ -1749,11 +1736,6 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1749 tail = DEACTIVATE_TO_TAIL; 1736 tail = DEACTIVATE_TO_TAIL;
1750 } 1737 }
1751 1738
1752 c->tid = next_tid(c->tid);
1753 c->page = NULL;
1754 freelist = c->freelist;
1755 c->freelist = NULL;
1756
1757 /* 1739 /*
1758 * Stage one: Free all available per cpu objects back 1740 * Stage one: Free all available per cpu objects back
1759 * to the page freelist while it is still frozen. Leave the 1741 * to the page freelist while it is still frozen. Leave the
@@ -1879,21 +1861,31 @@ redo:
1879 } 1861 }
1880} 1862}
1881 1863
1882/* Unfreeze all the cpu partial slabs */ 1864/*
1865 * Unfreeze all the cpu partial slabs.
1866 *
1867 * This function must be called with interrupt disabled.
1868 */
1883static void unfreeze_partials(struct kmem_cache *s) 1869static void unfreeze_partials(struct kmem_cache *s)
1884{ 1870{
1885 struct kmem_cache_node *n = NULL; 1871 struct kmem_cache_node *n = NULL, *n2 = NULL;
1886 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); 1872 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
1887 struct page *page, *discard_page = NULL; 1873 struct page *page, *discard_page = NULL;
1888 1874
1889 while ((page = c->partial)) { 1875 while ((page = c->partial)) {
1890 enum slab_modes { M_PARTIAL, M_FREE };
1891 enum slab_modes l, m;
1892 struct page new; 1876 struct page new;
1893 struct page old; 1877 struct page old;
1894 1878
1895 c->partial = page->next; 1879 c->partial = page->next;
1896 l = M_FREE; 1880
1881 n2 = get_node(s, page_to_nid(page));
1882 if (n != n2) {
1883 if (n)
1884 spin_unlock(&n->list_lock);
1885
1886 n = n2;
1887 spin_lock(&n->list_lock);
1888 }
1897 1889
1898 do { 1890 do {
1899 1891
@@ -1906,43 +1898,17 @@ static void unfreeze_partials(struct kmem_cache *s)
1906 1898
1907 new.frozen = 0; 1899 new.frozen = 0;
1908 1900
1909 if (!new.inuse && (!n || n->nr_partial > s->min_partial)) 1901 } while (!__cmpxchg_double_slab(s, page,
1910 m = M_FREE;
1911 else {
1912 struct kmem_cache_node *n2 = get_node(s,
1913 page_to_nid(page));
1914
1915 m = M_PARTIAL;
1916 if (n != n2) {
1917 if (n)
1918 spin_unlock(&n->list_lock);
1919
1920 n = n2;
1921 spin_lock(&n->list_lock);
1922 }
1923 }
1924
1925 if (l != m) {
1926 if (l == M_PARTIAL) {
1927 remove_partial(n, page);
1928 stat(s, FREE_REMOVE_PARTIAL);
1929 } else {
1930 add_partial(n, page,
1931 DEACTIVATE_TO_TAIL);
1932 stat(s, FREE_ADD_PARTIAL);
1933 }
1934
1935 l = m;
1936 }
1937
1938 } while (!cmpxchg_double_slab(s, page,
1939 old.freelist, old.counters, 1902 old.freelist, old.counters,
1940 new.freelist, new.counters, 1903 new.freelist, new.counters,
1941 "unfreezing slab")); 1904 "unfreezing slab"));
1942 1905
1943 if (m == M_FREE) { 1906 if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
1944 page->next = discard_page; 1907 page->next = discard_page;
1945 discard_page = page; 1908 discard_page = page;
1909 } else {
1910 add_partial(n, page, DEACTIVATE_TO_TAIL);
1911 stat(s, FREE_ADD_PARTIAL);
1946 } 1912 }
1947 } 1913 }
1948 1914
@@ -2011,7 +1977,11 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2011static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1977static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
2012{ 1978{
2013 stat(s, CPUSLAB_FLUSH); 1979 stat(s, CPUSLAB_FLUSH);
2014 deactivate_slab(s, c); 1980 deactivate_slab(s, c->page, c->freelist);
1981
1982 c->tid = next_tid(c->tid);
1983 c->page = NULL;
1984 c->freelist = NULL;
2015} 1985}
2016 1986
2017/* 1987/*
@@ -2055,10 +2025,10 @@ static void flush_all(struct kmem_cache *s)
2055 * Check if the objects in a per cpu structure fit numa 2025 * Check if the objects in a per cpu structure fit numa
2056 * locality expectations. 2026 * locality expectations.
2057 */ 2027 */
2058static inline int node_match(struct kmem_cache_cpu *c, int node) 2028static inline int node_match(struct page *page, int node)
2059{ 2029{
2060#ifdef CONFIG_NUMA 2030#ifdef CONFIG_NUMA
2061 if (node != NUMA_NO_NODE && c->node != node) 2031 if (node != NUMA_NO_NODE && page_to_nid(page) != node)
2062 return 0; 2032 return 0;
2063#endif 2033#endif
2064 return 1; 2034 return 1;
@@ -2101,10 +2071,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2101 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", 2071 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
2102 nid, gfpflags); 2072 nid, gfpflags);
2103 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " 2073 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, "
2104 "default order: %d, min order: %d\n", s->name, s->objsize, 2074 "default order: %d, min order: %d\n", s->name, s->object_size,
2105 s->size, oo_order(s->oo), oo_order(s->min)); 2075 s->size, oo_order(s->oo), oo_order(s->min));
2106 2076
2107 if (oo_order(s->min) > get_order(s->objsize)) 2077 if (oo_order(s->min) > get_order(s->object_size))
2108 printk(KERN_WARNING " %s debugging increased min order, use " 2078 printk(KERN_WARNING " %s debugging increased min order, use "
2109 "slub_debug=O to disable.\n", s->name); 2079 "slub_debug=O to disable.\n", s->name);
2110 2080
@@ -2130,10 +2100,16 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2130static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, 2100static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2131 int node, struct kmem_cache_cpu **pc) 2101 int node, struct kmem_cache_cpu **pc)
2132{ 2102{
2133 void *object; 2103 void *freelist;
2134 struct kmem_cache_cpu *c; 2104 struct kmem_cache_cpu *c = *pc;
2135 struct page *page = new_slab(s, flags, node); 2105 struct page *page;
2136 2106
2107 freelist = get_partial(s, flags, node, c);
2108
2109 if (freelist)
2110 return freelist;
2111
2112 page = new_slab(s, flags, node);
2137 if (page) { 2113 if (page) {
2138 c = __this_cpu_ptr(s->cpu_slab); 2114 c = __this_cpu_ptr(s->cpu_slab);
2139 if (c->page) 2115 if (c->page)
@@ -2143,17 +2119,24 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2143 * No other reference to the page yet so we can 2119 * No other reference to the page yet so we can
2144 * muck around with it freely without cmpxchg 2120 * muck around with it freely without cmpxchg
2145 */ 2121 */
2146 object = page->freelist; 2122 freelist = page->freelist;
2147 page->freelist = NULL; 2123 page->freelist = NULL;
2148 2124
2149 stat(s, ALLOC_SLAB); 2125 stat(s, ALLOC_SLAB);
2150 c->node = page_to_nid(page);
2151 c->page = page; 2126 c->page = page;
2152 *pc = c; 2127 *pc = c;
2153 } else 2128 } else
2154 object = NULL; 2129 freelist = NULL;
2155 2130
2156 return object; 2131 return freelist;
2132}
2133
2134static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
2135{
2136 if (unlikely(PageSlabPfmemalloc(page)))
2137 return gfp_pfmemalloc_allowed(gfpflags);
2138
2139 return true;
2157} 2140}
2158 2141
2159/* 2142/*
@@ -2163,6 +2146,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2163 * The page is still frozen if the return value is not NULL. 2146 * The page is still frozen if the return value is not NULL.
2164 * 2147 *
2165 * If this function returns NULL then the page has been unfrozen. 2148 * If this function returns NULL then the page has been unfrozen.
2149 *
2150 * This function must be called with interrupt disabled.
2166 */ 2151 */
2167static inline void *get_freelist(struct kmem_cache *s, struct page *page) 2152static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2168{ 2153{
@@ -2173,13 +2158,14 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2173 do { 2158 do {
2174 freelist = page->freelist; 2159 freelist = page->freelist;
2175 counters = page->counters; 2160 counters = page->counters;
2161
2176 new.counters = counters; 2162 new.counters = counters;
2177 VM_BUG_ON(!new.frozen); 2163 VM_BUG_ON(!new.frozen);
2178 2164
2179 new.inuse = page->objects; 2165 new.inuse = page->objects;
2180 new.frozen = freelist != NULL; 2166 new.frozen = freelist != NULL;
2181 2167
2182 } while (!cmpxchg_double_slab(s, page, 2168 } while (!__cmpxchg_double_slab(s, page,
2183 freelist, counters, 2169 freelist, counters,
2184 NULL, new.counters, 2170 NULL, new.counters,
2185 "get_freelist")); 2171 "get_freelist"));
@@ -2206,7 +2192,8 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2206static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 2192static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2207 unsigned long addr, struct kmem_cache_cpu *c) 2193 unsigned long addr, struct kmem_cache_cpu *c)
2208{ 2194{
2209 void **object; 2195 void *freelist;
2196 struct page *page;
2210 unsigned long flags; 2197 unsigned long flags;
2211 2198
2212 local_irq_save(flags); 2199 local_irq_save(flags);
@@ -2219,25 +2206,41 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2219 c = this_cpu_ptr(s->cpu_slab); 2206 c = this_cpu_ptr(s->cpu_slab);
2220#endif 2207#endif
2221 2208
2222 if (!c->page) 2209 page = c->page;
2210 if (!page)
2223 goto new_slab; 2211 goto new_slab;
2224redo: 2212redo:
2225 if (unlikely(!node_match(c, node))) { 2213
2214 if (unlikely(!node_match(page, node))) {
2226 stat(s, ALLOC_NODE_MISMATCH); 2215 stat(s, ALLOC_NODE_MISMATCH);
2227 deactivate_slab(s, c); 2216 deactivate_slab(s, page, c->freelist);
2217 c->page = NULL;
2218 c->freelist = NULL;
2219 goto new_slab;
2220 }
2221
2222 /*
2223 * By rights, we should be searching for a slab page that was
2224 * PFMEMALLOC but right now, we are losing the pfmemalloc
2225 * information when the page leaves the per-cpu allocator
2226 */
2227 if (unlikely(!pfmemalloc_match(page, gfpflags))) {
2228 deactivate_slab(s, page, c->freelist);
2229 c->page = NULL;
2230 c->freelist = NULL;
2228 goto new_slab; 2231 goto new_slab;
2229 } 2232 }
2230 2233
2231 /* must check again c->freelist in case of cpu migration or IRQ */ 2234 /* must check again c->freelist in case of cpu migration or IRQ */
2232 object = c->freelist; 2235 freelist = c->freelist;
2233 if (object) 2236 if (freelist)
2234 goto load_freelist; 2237 goto load_freelist;
2235 2238
2236 stat(s, ALLOC_SLOWPATH); 2239 stat(s, ALLOC_SLOWPATH);
2237 2240
2238 object = get_freelist(s, c->page); 2241 freelist = get_freelist(s, page);
2239 2242
2240 if (!object) { 2243 if (!freelist) {
2241 c->page = NULL; 2244 c->page = NULL;
2242 stat(s, DEACTIVATE_BYPASS); 2245 stat(s, DEACTIVATE_BYPASS);
2243 goto new_slab; 2246 goto new_slab;
@@ -2246,50 +2249,50 @@ redo:
2246 stat(s, ALLOC_REFILL); 2249 stat(s, ALLOC_REFILL);
2247 2250
2248load_freelist: 2251load_freelist:
2249 c->freelist = get_freepointer(s, object); 2252 /*
2253 * freelist is pointing to the list of objects to be used.
2254 * page is pointing to the page from which the objects are obtained.
2255 * That page must be frozen for per cpu allocations to work.
2256 */
2257 VM_BUG_ON(!c->page->frozen);
2258 c->freelist = get_freepointer(s, freelist);
2250 c->tid = next_tid(c->tid); 2259 c->tid = next_tid(c->tid);
2251 local_irq_restore(flags); 2260 local_irq_restore(flags);
2252 return object; 2261 return freelist;
2253 2262
2254new_slab: 2263new_slab:
2255 2264
2256 if (c->partial) { 2265 if (c->partial) {
2257 c->page = c->partial; 2266 page = c->page = c->partial;
2258 c->partial = c->page->next; 2267 c->partial = page->next;
2259 c->node = page_to_nid(c->page);
2260 stat(s, CPU_PARTIAL_ALLOC); 2268 stat(s, CPU_PARTIAL_ALLOC);
2261 c->freelist = NULL; 2269 c->freelist = NULL;
2262 goto redo; 2270 goto redo;
2263 } 2271 }
2264 2272
2265 /* Then do expensive stuff like retrieving pages from the partial lists */ 2273 freelist = new_slab_objects(s, gfpflags, node, &c);
2266 object = get_partial(s, gfpflags, node, c);
2267
2268 if (unlikely(!object)) {
2269 2274
2270 object = new_slab_objects(s, gfpflags, node, &c); 2275 if (unlikely(!freelist)) {
2276 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
2277 slab_out_of_memory(s, gfpflags, node);
2271 2278
2272 if (unlikely(!object)) { 2279 local_irq_restore(flags);
2273 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 2280 return NULL;
2274 slab_out_of_memory(s, gfpflags, node);
2275
2276 local_irq_restore(flags);
2277 return NULL;
2278 }
2279 } 2281 }
2280 2282
2281 if (likely(!kmem_cache_debug(s))) 2283 page = c->page;
2284 if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags)))
2282 goto load_freelist; 2285 goto load_freelist;
2283 2286
2284 /* Only entered in the debug case */ 2287 /* Only entered in the debug case */
2285 if (!alloc_debug_processing(s, c->page, object, addr)) 2288 if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr))
2286 goto new_slab; /* Slab failed checks. Next slab needed */ 2289 goto new_slab; /* Slab failed checks. Next slab needed */
2287 2290
2288 c->freelist = get_freepointer(s, object); 2291 deactivate_slab(s, page, get_freepointer(s, freelist));
2289 deactivate_slab(s, c); 2292 c->page = NULL;
2290 c->node = NUMA_NO_NODE; 2293 c->freelist = NULL;
2291 local_irq_restore(flags); 2294 local_irq_restore(flags);
2292 return object; 2295 return freelist;
2293} 2296}
2294 2297
2295/* 2298/*
@@ -2307,6 +2310,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
2307{ 2310{
2308 void **object; 2311 void **object;
2309 struct kmem_cache_cpu *c; 2312 struct kmem_cache_cpu *c;
2313 struct page *page;
2310 unsigned long tid; 2314 unsigned long tid;
2311 2315
2312 if (slab_pre_alloc_hook(s, gfpflags)) 2316 if (slab_pre_alloc_hook(s, gfpflags))
@@ -2332,8 +2336,8 @@ redo:
2332 barrier(); 2336 barrier();
2333 2337
2334 object = c->freelist; 2338 object = c->freelist;
2335 if (unlikely(!object || !node_match(c, node))) 2339 page = c->page;
2336 2340 if (unlikely(!object || !node_match(page, node)))
2337 object = __slab_alloc(s, gfpflags, node, addr, c); 2341 object = __slab_alloc(s, gfpflags, node, addr, c);
2338 2342
2339 else { 2343 else {
@@ -2364,7 +2368,7 @@ redo:
2364 } 2368 }
2365 2369
2366 if (unlikely(gfpflags & __GFP_ZERO) && object) 2370 if (unlikely(gfpflags & __GFP_ZERO) && object)
2367 memset(object, 0, s->objsize); 2371 memset(object, 0, s->object_size);
2368 2372
2369 slab_post_alloc_hook(s, gfpflags, object); 2373 slab_post_alloc_hook(s, gfpflags, object);
2370 2374
@@ -2375,7 +2379,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
2375{ 2379{
2376 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); 2380 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
2377 2381
2378 trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); 2382 trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags);
2379 2383
2380 return ret; 2384 return ret;
2381} 2385}
@@ -2405,7 +2409,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
2405 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); 2409 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
2406 2410
2407 trace_kmem_cache_alloc_node(_RET_IP_, ret, 2411 trace_kmem_cache_alloc_node(_RET_IP_, ret,
2408 s->objsize, s->size, gfpflags, node); 2412 s->object_size, s->size, gfpflags, node);
2409 2413
2410 return ret; 2414 return ret;
2411} 2415}
@@ -2900,7 +2904,7 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min)
2900static int calculate_sizes(struct kmem_cache *s, int forced_order) 2904static int calculate_sizes(struct kmem_cache *s, int forced_order)
2901{ 2905{
2902 unsigned long flags = s->flags; 2906 unsigned long flags = s->flags;
2903 unsigned long size = s->objsize; 2907 unsigned long size = s->object_size;
2904 unsigned long align = s->align; 2908 unsigned long align = s->align;
2905 int order; 2909 int order;
2906 2910
@@ -2929,7 +2933,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2929 * end of the object and the free pointer. If not then add an 2933 * end of the object and the free pointer. If not then add an
2930 * additional word to have some bytes to store Redzone information. 2934 * additional word to have some bytes to store Redzone information.
2931 */ 2935 */
2932 if ((flags & SLAB_RED_ZONE) && size == s->objsize) 2936 if ((flags & SLAB_RED_ZONE) && size == s->object_size)
2933 size += sizeof(void *); 2937 size += sizeof(void *);
2934#endif 2938#endif
2935 2939
@@ -2977,7 +2981,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2977 * user specified and the dynamic determination of cache line size 2981 * user specified and the dynamic determination of cache line size
2978 * on bootup. 2982 * on bootup.
2979 */ 2983 */
2980 align = calculate_alignment(flags, align, s->objsize); 2984 align = calculate_alignment(flags, align, s->object_size);
2981 s->align = align; 2985 s->align = align;
2982 2986
2983 /* 2987 /*
@@ -3025,7 +3029,7 @@ static int kmem_cache_open(struct kmem_cache *s,
3025 memset(s, 0, kmem_size); 3029 memset(s, 0, kmem_size);
3026 s->name = name; 3030 s->name = name;
3027 s->ctor = ctor; 3031 s->ctor = ctor;
3028 s->objsize = size; 3032 s->object_size = size;
3029 s->align = align; 3033 s->align = align;
3030 s->flags = kmem_cache_flags(size, flags, name, ctor); 3034 s->flags = kmem_cache_flags(size, flags, name, ctor);
3031 s->reserved = 0; 3035 s->reserved = 0;
@@ -3040,7 +3044,7 @@ static int kmem_cache_open(struct kmem_cache *s,
3040 * Disable debugging flags that store metadata if the min slab 3044 * Disable debugging flags that store metadata if the min slab
3041 * order increased. 3045 * order increased.
3042 */ 3046 */
3043 if (get_order(s->size) > get_order(s->objsize)) { 3047 if (get_order(s->size) > get_order(s->object_size)) {
3044 s->flags &= ~DEBUG_METADATA_FLAGS; 3048 s->flags &= ~DEBUG_METADATA_FLAGS;
3045 s->offset = 0; 3049 s->offset = 0;
3046 if (!calculate_sizes(s, -1)) 3050 if (!calculate_sizes(s, -1))
@@ -3114,7 +3118,7 @@ error:
3114 */ 3118 */
3115unsigned int kmem_cache_size(struct kmem_cache *s) 3119unsigned int kmem_cache_size(struct kmem_cache *s)
3116{ 3120{
3117 return s->objsize; 3121 return s->object_size;
3118} 3122}
3119EXPORT_SYMBOL(kmem_cache_size); 3123EXPORT_SYMBOL(kmem_cache_size);
3120 3124
@@ -3192,11 +3196,11 @@ static inline int kmem_cache_close(struct kmem_cache *s)
3192 */ 3196 */
3193void kmem_cache_destroy(struct kmem_cache *s) 3197void kmem_cache_destroy(struct kmem_cache *s)
3194{ 3198{
3195 down_write(&slub_lock); 3199 mutex_lock(&slab_mutex);
3196 s->refcount--; 3200 s->refcount--;
3197 if (!s->refcount) { 3201 if (!s->refcount) {
3198 list_del(&s->list); 3202 list_del(&s->list);
3199 up_write(&slub_lock); 3203 mutex_unlock(&slab_mutex);
3200 if (kmem_cache_close(s)) { 3204 if (kmem_cache_close(s)) {
3201 printk(KERN_ERR "SLUB %s: %s called for cache that " 3205 printk(KERN_ERR "SLUB %s: %s called for cache that "
3202 "still has objects.\n", s->name, __func__); 3206 "still has objects.\n", s->name, __func__);
@@ -3206,7 +3210,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
3206 rcu_barrier(); 3210 rcu_barrier();
3207 sysfs_slab_remove(s); 3211 sysfs_slab_remove(s);
3208 } else 3212 } else
3209 up_write(&slub_lock); 3213 mutex_unlock(&slab_mutex);
3210} 3214}
3211EXPORT_SYMBOL(kmem_cache_destroy); 3215EXPORT_SYMBOL(kmem_cache_destroy);
3212 3216
@@ -3268,7 +3272,7 @@ static struct kmem_cache *__init create_kmalloc_cache(const char *name,
3268 3272
3269 /* 3273 /*
3270 * This function is called with IRQs disabled during early-boot on 3274 * This function is called with IRQs disabled during early-boot on
3271 * single CPU so there's no need to take slub_lock here. 3275 * single CPU so there's no need to take slab_mutex here.
3272 */ 3276 */
3273 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, 3277 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
3274 flags, NULL)) 3278 flags, NULL))
@@ -3553,10 +3557,10 @@ static int slab_mem_going_offline_callback(void *arg)
3553{ 3557{
3554 struct kmem_cache *s; 3558 struct kmem_cache *s;
3555 3559
3556 down_read(&slub_lock); 3560 mutex_lock(&slab_mutex);
3557 list_for_each_entry(s, &slab_caches, list) 3561 list_for_each_entry(s, &slab_caches, list)
3558 kmem_cache_shrink(s); 3562 kmem_cache_shrink(s);
3559 up_read(&slub_lock); 3563 mutex_unlock(&slab_mutex);
3560 3564
3561 return 0; 3565 return 0;
3562} 3566}
@@ -3577,7 +3581,7 @@ static void slab_mem_offline_callback(void *arg)
3577 if (offline_node < 0) 3581 if (offline_node < 0)
3578 return; 3582 return;
3579 3583
3580 down_read(&slub_lock); 3584 mutex_lock(&slab_mutex);
3581 list_for_each_entry(s, &slab_caches, list) { 3585 list_for_each_entry(s, &slab_caches, list) {
3582 n = get_node(s, offline_node); 3586 n = get_node(s, offline_node);
3583 if (n) { 3587 if (n) {
@@ -3593,7 +3597,7 @@ static void slab_mem_offline_callback(void *arg)
3593 kmem_cache_free(kmem_cache_node, n); 3597 kmem_cache_free(kmem_cache_node, n);
3594 } 3598 }
3595 } 3599 }
3596 up_read(&slub_lock); 3600 mutex_unlock(&slab_mutex);
3597} 3601}
3598 3602
3599static int slab_mem_going_online_callback(void *arg) 3603static int slab_mem_going_online_callback(void *arg)
@@ -3616,7 +3620,7 @@ static int slab_mem_going_online_callback(void *arg)
3616 * allocate a kmem_cache_node structure in order to bring the node 3620 * allocate a kmem_cache_node structure in order to bring the node
3617 * online. 3621 * online.
3618 */ 3622 */
3619 down_read(&slub_lock); 3623 mutex_lock(&slab_mutex);
3620 list_for_each_entry(s, &slab_caches, list) { 3624 list_for_each_entry(s, &slab_caches, list) {
3621 /* 3625 /*
3622 * XXX: kmem_cache_alloc_node will fallback to other nodes 3626 * XXX: kmem_cache_alloc_node will fallback to other nodes
@@ -3632,7 +3636,7 @@ static int slab_mem_going_online_callback(void *arg)
3632 s->node[nid] = n; 3636 s->node[nid] = n;
3633 } 3637 }
3634out: 3638out:
3635 up_read(&slub_lock); 3639 mutex_unlock(&slab_mutex);
3636 return ret; 3640 return ret;
3637} 3641}
3638 3642
@@ -3843,11 +3847,11 @@ void __init kmem_cache_init(void)
3843 3847
3844 if (s && s->size) { 3848 if (s && s->size) {
3845 char *name = kasprintf(GFP_NOWAIT, 3849 char *name = kasprintf(GFP_NOWAIT,
3846 "dma-kmalloc-%d", s->objsize); 3850 "dma-kmalloc-%d", s->object_size);
3847 3851
3848 BUG_ON(!name); 3852 BUG_ON(!name);
3849 kmalloc_dma_caches[i] = create_kmalloc_cache(name, 3853 kmalloc_dma_caches[i] = create_kmalloc_cache(name,
3850 s->objsize, SLAB_CACHE_DMA); 3854 s->object_size, SLAB_CACHE_DMA);
3851 } 3855 }
3852 } 3856 }
3853#endif 3857#endif
@@ -3924,16 +3928,12 @@ static struct kmem_cache *find_mergeable(size_t size,
3924 return NULL; 3928 return NULL;
3925} 3929}
3926 3930
3927struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3931struct kmem_cache *__kmem_cache_create(const char *name, size_t size,
3928 size_t align, unsigned long flags, void (*ctor)(void *)) 3932 size_t align, unsigned long flags, void (*ctor)(void *))
3929{ 3933{
3930 struct kmem_cache *s; 3934 struct kmem_cache *s;
3931 char *n; 3935 char *n;
3932 3936
3933 if (WARN_ON(!name))
3934 return NULL;
3935
3936 down_write(&slub_lock);
3937 s = find_mergeable(size, align, flags, name, ctor); 3937 s = find_mergeable(size, align, flags, name, ctor);
3938 if (s) { 3938 if (s) {
3939 s->refcount++; 3939 s->refcount++;
@@ -3941,49 +3941,42 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3941 * Adjust the object sizes so that we clear 3941 * Adjust the object sizes so that we clear
3942 * the complete object on kzalloc. 3942 * the complete object on kzalloc.
3943 */ 3943 */
3944 s->objsize = max(s->objsize, (int)size); 3944 s->object_size = max(s->object_size, (int)size);
3945 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3945 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3946 3946
3947 if (sysfs_slab_alias(s, name)) { 3947 if (sysfs_slab_alias(s, name)) {
3948 s->refcount--; 3948 s->refcount--;
3949 goto err; 3949 return NULL;
3950 } 3950 }
3951 up_write(&slub_lock);
3952 return s; 3951 return s;
3953 } 3952 }
3954 3953
3955 n = kstrdup(name, GFP_KERNEL); 3954 n = kstrdup(name, GFP_KERNEL);
3956 if (!n) 3955 if (!n)
3957 goto err; 3956 return NULL;
3958 3957
3959 s = kmalloc(kmem_size, GFP_KERNEL); 3958 s = kmalloc(kmem_size, GFP_KERNEL);
3960 if (s) { 3959 if (s) {
3961 if (kmem_cache_open(s, n, 3960 if (kmem_cache_open(s, n,
3962 size, align, flags, ctor)) { 3961 size, align, flags, ctor)) {
3962 int r;
3963
3963 list_add(&s->list, &slab_caches); 3964 list_add(&s->list, &slab_caches);
3964 up_write(&slub_lock); 3965 mutex_unlock(&slab_mutex);
3965 if (sysfs_slab_add(s)) { 3966 r = sysfs_slab_add(s);
3966 down_write(&slub_lock); 3967 mutex_lock(&slab_mutex);
3967 list_del(&s->list); 3968
3968 kfree(n); 3969 if (!r)
3969 kfree(s); 3970 return s;
3970 goto err; 3971
3971 } 3972 list_del(&s->list);
3972 return s; 3973 kmem_cache_close(s);
3973 } 3974 }
3974 kfree(s); 3975 kfree(s);
3975 } 3976 }
3976 kfree(n); 3977 kfree(n);
3977err: 3978 return NULL;
3978 up_write(&slub_lock);
3979
3980 if (flags & SLAB_PANIC)
3981 panic("Cannot create slabcache %s\n", name);
3982 else
3983 s = NULL;
3984 return s;
3985} 3979}
3986EXPORT_SYMBOL(kmem_cache_create);
3987 3980
3988#ifdef CONFIG_SMP 3981#ifdef CONFIG_SMP
3989/* 3982/*
@@ -4002,13 +3995,13 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
4002 case CPU_UP_CANCELED_FROZEN: 3995 case CPU_UP_CANCELED_FROZEN:
4003 case CPU_DEAD: 3996 case CPU_DEAD:
4004 case CPU_DEAD_FROZEN: 3997 case CPU_DEAD_FROZEN:
4005 down_read(&slub_lock); 3998 mutex_lock(&slab_mutex);
4006 list_for_each_entry(s, &slab_caches, list) { 3999 list_for_each_entry(s, &slab_caches, list) {
4007 local_irq_save(flags); 4000 local_irq_save(flags);
4008 __flush_cpu_slab(s, cpu); 4001 __flush_cpu_slab(s, cpu);
4009 local_irq_restore(flags); 4002 local_irq_restore(flags);
4010 } 4003 }
4011 up_read(&slub_lock); 4004 mutex_unlock(&slab_mutex);
4012 break; 4005 break;
4013 default: 4006 default:
4014 break; 4007 break;
@@ -4500,30 +4493,31 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4500 4493
4501 for_each_possible_cpu(cpu) { 4494 for_each_possible_cpu(cpu) {
4502 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4495 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
4503 int node = ACCESS_ONCE(c->node); 4496 int node;
4504 struct page *page; 4497 struct page *page;
4505 4498
4506 if (node < 0)
4507 continue;
4508 page = ACCESS_ONCE(c->page); 4499 page = ACCESS_ONCE(c->page);
4509 if (page) { 4500 if (!page)
4510 if (flags & SO_TOTAL) 4501 continue;
4511 x = page->objects;
4512 else if (flags & SO_OBJECTS)
4513 x = page->inuse;
4514 else
4515 x = 1;
4516 4502
4517 total += x; 4503 node = page_to_nid(page);
4518 nodes[node] += x; 4504 if (flags & SO_TOTAL)
4519 } 4505 x = page->objects;
4520 page = c->partial; 4506 else if (flags & SO_OBJECTS)
4507 x = page->inuse;
4508 else
4509 x = 1;
4521 4510
4511 total += x;
4512 nodes[node] += x;
4513
4514 page = ACCESS_ONCE(c->partial);
4522 if (page) { 4515 if (page) {
4523 x = page->pobjects; 4516 x = page->pobjects;
4524 total += x; 4517 total += x;
4525 nodes[node] += x; 4518 nodes[node] += x;
4526 } 4519 }
4520
4527 per_cpu[node]++; 4521 per_cpu[node]++;
4528 } 4522 }
4529 } 4523 }
@@ -4623,7 +4617,7 @@ SLAB_ATTR_RO(align);
4623 4617
4624static ssize_t object_size_show(struct kmem_cache *s, char *buf) 4618static ssize_t object_size_show(struct kmem_cache *s, char *buf)
4625{ 4619{
4626 return sprintf(buf, "%d\n", s->objsize); 4620 return sprintf(buf, "%d\n", s->object_size);
4627} 4621}
4628SLAB_ATTR_RO(object_size); 4622SLAB_ATTR_RO(object_size);
4629 4623
@@ -5286,7 +5280,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
5286 const char *name; 5280 const char *name;
5287 int unmergeable; 5281 int unmergeable;
5288 5282
5289 if (slab_state < SYSFS) 5283 if (slab_state < FULL)
5290 /* Defer until later */ 5284 /* Defer until later */
5291 return 0; 5285 return 0;
5292 5286
@@ -5331,7 +5325,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
5331 5325
5332static void sysfs_slab_remove(struct kmem_cache *s) 5326static void sysfs_slab_remove(struct kmem_cache *s)
5333{ 5327{
5334 if (slab_state < SYSFS) 5328 if (slab_state < FULL)
5335 /* 5329 /*
5336 * Sysfs has not been setup yet so no need to remove the 5330 * Sysfs has not been setup yet so no need to remove the
5337 * cache from sysfs. 5331 * cache from sysfs.
@@ -5359,7 +5353,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
5359{ 5353{
5360 struct saved_alias *al; 5354 struct saved_alias *al;
5361 5355
5362 if (slab_state == SYSFS) { 5356 if (slab_state == FULL) {
5363 /* 5357 /*
5364 * If we have a leftover link then remove it. 5358 * If we have a leftover link then remove it.
5365 */ 5359 */
@@ -5383,16 +5377,16 @@ static int __init slab_sysfs_init(void)
5383 struct kmem_cache *s; 5377 struct kmem_cache *s;
5384 int err; 5378 int err;
5385 5379
5386 down_write(&slub_lock); 5380 mutex_lock(&slab_mutex);
5387 5381
5388 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); 5382 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
5389 if (!slab_kset) { 5383 if (!slab_kset) {
5390 up_write(&slub_lock); 5384 mutex_unlock(&slab_mutex);
5391 printk(KERN_ERR "Cannot register slab subsystem.\n"); 5385 printk(KERN_ERR "Cannot register slab subsystem.\n");
5392 return -ENOSYS; 5386 return -ENOSYS;
5393 } 5387 }
5394 5388
5395 slab_state = SYSFS; 5389 slab_state = FULL;
5396 5390
5397 list_for_each_entry(s, &slab_caches, list) { 5391 list_for_each_entry(s, &slab_caches, list) {
5398 err = sysfs_slab_add(s); 5392 err = sysfs_slab_add(s);
@@ -5408,11 +5402,11 @@ static int __init slab_sysfs_init(void)
5408 err = sysfs_slab_alias(al->s, al->name); 5402 err = sysfs_slab_alias(al->s, al->name);
5409 if (err) 5403 if (err)
5410 printk(KERN_ERR "SLUB: Unable to add boot slab alias" 5404 printk(KERN_ERR "SLUB: Unable to add boot slab alias"
5411 " %s to sysfs\n", s->name); 5405 " %s to sysfs\n", al->name);
5412 kfree(al); 5406 kfree(al);
5413 } 5407 }
5414 5408
5415 up_write(&slub_lock); 5409 mutex_unlock(&slab_mutex);
5416 resiliency_test(); 5410 resiliency_test();
5417 return 0; 5411 return 0;
5418} 5412}
@@ -5427,7 +5421,7 @@ __initcall(slab_sysfs_init);
5427static void print_slabinfo_header(struct seq_file *m) 5421static void print_slabinfo_header(struct seq_file *m)
5428{ 5422{
5429 seq_puts(m, "slabinfo - version: 2.1\n"); 5423 seq_puts(m, "slabinfo - version: 2.1\n");
5430 seq_puts(m, "# name <active_objs> <num_objs> <objsize> " 5424 seq_puts(m, "# name <active_objs> <num_objs> <object_size> "
5431 "<objperslab> <pagesperslab>"); 5425 "<objperslab> <pagesperslab>");
5432 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 5426 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
5433 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 5427 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
@@ -5438,7 +5432,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
5438{ 5432{
5439 loff_t n = *pos; 5433 loff_t n = *pos;
5440 5434
5441 down_read(&slub_lock); 5435 mutex_lock(&slab_mutex);
5442 if (!n) 5436 if (!n)
5443 print_slabinfo_header(m); 5437 print_slabinfo_header(m);
5444 5438
@@ -5452,7 +5446,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
5452 5446
5453static void s_stop(struct seq_file *m, void *p) 5447static void s_stop(struct seq_file *m, void *p)
5454{ 5448{
5455 up_read(&slub_lock); 5449 mutex_unlock(&slab_mutex);
5456} 5450}
5457 5451
5458static int s_show(struct seq_file *m, void *p) 5452static int s_show(struct seq_file *m, void *p)
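The print_slabinfo_header() hunk above documents the /proc/slabinfo 2.1 column layout that SLUB now emits (with <object_size> replacing <objsize>). As an illustration only, here is a small self-contained userspace C sketch that parses one such line; the sample line and its values are invented, not taken from the diff.

/* Parse one data line of the "slabinfo - version: 2.1" format whose header
 * the SLUB /proc interface above prints.  Field names follow that header;
 * the sample line is made up. */
#include <stdio.h>

int main(void)
{
        const char *line =
                "kmalloc-64  1280  1344  64  64  1 : tunables 0 0 0 "
                ": slabdata 21 21 0";
        char name[64];
        unsigned long active_objs, num_objs, object_size, objperslab;
        unsigned long pagesperslab, active_slabs, num_slabs, sharedavail;

        if (sscanf(line,
                   "%63s %lu %lu %lu %lu %lu : tunables %*u %*u %*u "
                   ": slabdata %lu %lu %lu",
                   name, &active_objs, &num_objs, &object_size,
                   &objperslab, &pagesperslab,
                   &active_slabs, &num_slabs, &sharedavail) != 9) {
                fprintf(stderr, "unexpected slabinfo line format\n");
                return 1;
        }

        printf("%s: %lu/%lu objects in use, %lu bytes each, %lu objs/slab\n",
               name, active_objs, num_objs, object_size, objperslab);
        return 0;
}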
diff --git a/mm/sparse.c b/mm/sparse.c
index c7bb952400c..fac95f2888f 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -65,21 +65,18 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid)
65 65
66 if (slab_is_available()) { 66 if (slab_is_available()) {
67 if (node_state(nid, N_HIGH_MEMORY)) 67 if (node_state(nid, N_HIGH_MEMORY))
68 section = kmalloc_node(array_size, GFP_KERNEL, nid); 68 section = kzalloc_node(array_size, GFP_KERNEL, nid);
69 else 69 else
70 section = kmalloc(array_size, GFP_KERNEL); 70 section = kzalloc(array_size, GFP_KERNEL);
71 } else 71 } else {
72 section = alloc_bootmem_node(NODE_DATA(nid), array_size); 72 section = alloc_bootmem_node(NODE_DATA(nid), array_size);
73 73 }
74 if (section)
75 memset(section, 0, array_size);
76 74
77 return section; 75 return section;
78} 76}
79 77
80static int __meminit sparse_index_init(unsigned long section_nr, int nid) 78static int __meminit sparse_index_init(unsigned long section_nr, int nid)
81{ 79{
82 static DEFINE_SPINLOCK(index_init_lock);
83 unsigned long root = SECTION_NR_TO_ROOT(section_nr); 80 unsigned long root = SECTION_NR_TO_ROOT(section_nr);
84 struct mem_section *section; 81 struct mem_section *section;
85 int ret = 0; 82 int ret = 0;
@@ -90,20 +87,9 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
90 section = sparse_index_alloc(nid); 87 section = sparse_index_alloc(nid);
91 if (!section) 88 if (!section)
92 return -ENOMEM; 89 return -ENOMEM;
93 /*
94 * This lock keeps two different sections from
95 * reallocating for the same index
96 */
97 spin_lock(&index_init_lock);
98
99 if (mem_section[root]) {
100 ret = -EEXIST;
101 goto out;
102 }
103 90
104 mem_section[root] = section; 91 mem_section[root] = section;
105out: 92
106 spin_unlock(&index_init_lock);
107 return ret; 93 return ret;
108} 94}
109#else /* !SPARSEMEM_EXTREME */ 95#else /* !SPARSEMEM_EXTREME */
@@ -132,6 +118,8 @@ int __section_nr(struct mem_section* ms)
132 break; 118 break;
133 } 119 }
134 120
121 VM_BUG_ON(root_nr == NR_SECTION_ROOTS);
122
135 return (root_nr * SECTIONS_PER_ROOT) + (ms - root); 123 return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
136} 124}
137 125
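__section_nr() above recovers a section number as root_nr * SECTIONS_PER_ROOT plus the offset of the mem_section within its root (and now BUGs if no root matched). A minimal userspace sketch of that arithmetic, assuming an example SECTIONS_PER_ROOT of 256; the real value is config dependent.

#include <assert.h>
#include <stdio.h>

#define SECTIONS_PER_ROOT       256UL   /* assumed example value */

/* SECTION_NR_TO_ROOT(): which root a section number belongs to. */
static unsigned long section_nr_to_root(unsigned long section_nr)
{
        return section_nr / SECTIONS_PER_ROOT;
}

int main(void)
{
        unsigned long section_nr = 1234;
        unsigned long root_nr = section_nr_to_root(section_nr);
        unsigned long offset = section_nr % SECTIONS_PER_ROOT;

        /* __section_nr() recombines the two pieces the same way. */
        assert(root_nr * SECTIONS_PER_ROOT + offset == section_nr);
        printf("section %lu -> root %lu, offset %lu\n",
               section_nr, root_nr, offset);
        return 0;
}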
@@ -493,6 +481,9 @@ void __init sparse_init(void)
493 struct page **map_map; 481 struct page **map_map;
494#endif 482#endif
495 483
484 /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
485 set_pageblock_order();
486
496 /* 487 /*
497 * map is using big page (aka 2M in x86 64 bit) 488 * map is using big page (aka 2M in x86 64 bit)
498 * usemap is less one page (aka 24 bytes) 489 * usemap is less one page (aka 24 bytes)
diff --git a/mm/swap.c b/mm/swap.c
index 4e7e2ec6707..77825883298 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -236,6 +236,58 @@ void put_pages_list(struct list_head *pages)
236} 236}
237EXPORT_SYMBOL(put_pages_list); 237EXPORT_SYMBOL(put_pages_list);
238 238
239/*
240 * get_kernel_pages() - pin kernel pages in memory
241 * @kiov: An array of struct kvec structures
242 * @nr_segs: number of segments to pin
243 * @write: pinning for read/write, currently ignored
244 * @pages: array that receives pointers to the pages pinned.
245 * Should be at least nr_segs long.
246 *
247 * Returns number of pages pinned. This may be fewer than the number
 248 * requested. If nr_segs is 0 or negative, returns 0. If no pages 250 * requested. If nr_segs is 0 or negative, returns 0. If no pages
249 * were pinned, returns -errno. Each page returned must be released
250 * with a put_page() call when it is finished with.
251 */
252int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
253 struct page **pages)
254{
255 int seg;
256
257 for (seg = 0; seg < nr_segs; seg++) {
258 if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
259 return seg;
260
261 pages[seg] = kmap_to_page(kiov[seg].iov_base);
262 page_cache_get(pages[seg]);
263 }
264
265 return seg;
266}
267EXPORT_SYMBOL_GPL(get_kernel_pages);
268
269/*
270 * get_kernel_page() - pin a kernel page in memory
271 * @start: starting kernel address
272 * @write: pinning for read/write, currently ignored
273 * @pages: array that receives pointer to the page pinned.
274 * Must be at least nr_segs long.
275 *
276 * Returns 1 if page is pinned. If the page was not pinned, returns
277 * -errno. The page returned must be released with a put_page() call
278 * when it is finished with.
279 */
280int get_kernel_page(unsigned long start, int write, struct page **pages)
281{
282 const struct kvec kiov = {
283 .iov_base = (void *)start,
284 .iov_len = PAGE_SIZE
285 };
286
287 return get_kernel_pages(&kiov, 1, write, pages);
288}
289EXPORT_SYMBOL_GPL(get_kernel_page);
290
239static void pagevec_lru_move_fn(struct pagevec *pvec, 291static void pagevec_lru_move_fn(struct pagevec *pvec,
240 void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), 292 void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
241 void *arg) 293 void *arg)
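The get_kernel_pages()/get_kernel_page() helpers added above pin kernel pages described by an array of struct kvec, insisting that each segment is exactly one page and reporting how many leading segments were handled. A hedged userspace sketch of just that contract follows; struct kvec and PAGE_SIZE are modelled locally and no real pinning happens here.

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096UL        /* assumed page size for the sketch */

struct kvec {
        void *iov_base;
        size_t iov_len;
};

/* Count how many leading segments are exactly page sized, mirroring the
 * early return on the first WARN_ON(kiov[seg].iov_len != PAGE_SIZE). */
static int count_pinnable_segments(const struct kvec *kiov, int nr_segs)
{
        int seg;

        for (seg = 0; seg < nr_segs; seg++)
                if (kiov[seg].iov_len != PAGE_SIZE)
                        return seg;
        return seg;
}

int main(void)
{
        static char buf[3 * 4096];
        struct kvec kiov[3] = {
                { buf,                 PAGE_SIZE },
                { buf + PAGE_SIZE,     PAGE_SIZE },
                { buf + 2 * PAGE_SIZE, PAGE_SIZE / 2 },  /* not page sized */
        };

        printf("pinnable segments: %d of 3\n",
               count_pinnable_segments(kiov, 3));        /* prints 2 */
        return 0;
}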
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 4c5ff7f284d..0cb36fb1f61 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/blkdev.h>
17#include <linux/pagevec.h> 18#include <linux/pagevec.h>
18#include <linux/migrate.h> 19#include <linux/migrate.h>
19#include <linux/page_cgroup.h> 20#include <linux/page_cgroup.h>
@@ -26,7 +27,7 @@
26 */ 27 */
27static const struct address_space_operations swap_aops = { 28static const struct address_space_operations swap_aops = {
28 .writepage = swap_writepage, 29 .writepage = swap_writepage,
29 .set_page_dirty = __set_page_dirty_no_writeback, 30 .set_page_dirty = swap_set_page_dirty,
30 .migratepage = migrate_page, 31 .migratepage = migrate_page,
31}; 32};
32 33
@@ -376,6 +377,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
376 unsigned long offset = swp_offset(entry); 377 unsigned long offset = swp_offset(entry);
377 unsigned long start_offset, end_offset; 378 unsigned long start_offset, end_offset;
378 unsigned long mask = (1UL << page_cluster) - 1; 379 unsigned long mask = (1UL << page_cluster) - 1;
380 struct blk_plug plug;
379 381
380 /* Read a page_cluster sized and aligned cluster around offset. */ 382 /* Read a page_cluster sized and aligned cluster around offset. */
381 start_offset = offset & ~mask; 383 start_offset = offset & ~mask;
@@ -383,6 +385,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
383 if (!start_offset) /* First page is swap header. */ 385 if (!start_offset) /* First page is swap header. */
384 start_offset++; 386 start_offset++;
385 387
388 blk_start_plug(&plug);
386 for (offset = start_offset; offset <= end_offset ; offset++) { 389 for (offset = start_offset; offset <= end_offset ; offset++) {
387 /* Ok, do the async read-ahead now */ 390 /* Ok, do the async read-ahead now */
388 page = read_swap_cache_async(swp_entry(swp_type(entry), offset), 391 page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
@@ -391,6 +394,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
391 continue; 394 continue;
392 page_cache_release(page); 395 page_cache_release(page);
393 } 396 }
397 blk_finish_plug(&plug);
398
394 lru_add_drain(); /* Push any new pages onto the LRU now */ 399 lru_add_drain(); /* Push any new pages onto the LRU now */
395 return read_swap_cache_async(entry, gfp_mask, vma, addr); 400 return read_swap_cache_async(entry, gfp_mask, vma, addr);
396} 401}
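The swapin_readahead() context above computes a page_cluster-sized, aligned window around the faulting offset, and the new code wraps those reads in a block plug. A small userspace sketch of the window arithmetic, with page_cluster = 3 chosen only as an example:

#include <stdio.h>

int main(void)
{
        unsigned int page_cluster = 3;          /* 2^3 = 8-page cluster */
        unsigned long offset = 21;              /* faulting swap offset */
        unsigned long mask = (1UL << page_cluster) - 1;
        unsigned long start_offset = offset & ~mask;
        unsigned long end_offset = offset | mask;

        if (!start_offset)      /* first page is the swap header */
                start_offset++;

        /* prints: read offsets 16..23 around 21 */
        printf("read offsets %lu..%lu around %lu\n",
               start_offset, end_offset, offset);
        return 0;
}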
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 71373d03fce..14e254c768f 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,6 +33,7 @@
33#include <linux/oom.h> 33#include <linux/oom.h>
34#include <linux/frontswap.h> 34#include <linux/frontswap.h>
35#include <linux/swapfile.h> 35#include <linux/swapfile.h>
36#include <linux/export.h>
36 37
37#include <asm/pgtable.h> 38#include <asm/pgtable.h>
38#include <asm/tlbflush.h> 39#include <asm/tlbflush.h>
@@ -548,7 +549,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
548 549
549 /* free if no reference */ 550 /* free if no reference */
550 if (!usage) { 551 if (!usage) {
551 struct gendisk *disk = p->bdev->bd_disk;
552 if (offset < p->lowest_bit) 552 if (offset < p->lowest_bit)
553 p->lowest_bit = offset; 553 p->lowest_bit = offset;
554 if (offset > p->highest_bit) 554 if (offset > p->highest_bit)
@@ -559,9 +559,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
559 nr_swap_pages++; 559 nr_swap_pages++;
560 p->inuse_pages--; 560 p->inuse_pages--;
561 frontswap_invalidate_page(p->type, offset); 561 frontswap_invalidate_page(p->type, offset);
562 if ((p->flags & SWP_BLKDEV) && 562 if (p->flags & SWP_BLKDEV) {
563 disk->fops->swap_slot_free_notify) 563 struct gendisk *disk = p->bdev->bd_disk;
564 disk->fops->swap_slot_free_notify(p->bdev, offset); 564 if (disk->fops->swap_slot_free_notify)
565 disk->fops->swap_slot_free_notify(p->bdev,
566 offset);
567 }
565 } 568 }
566 569
567 return usage; 570 return usage;
@@ -832,8 +835,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
832 835
833 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 836 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
834 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 837 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
835 if (ret > 0) 838 mem_cgroup_cancel_charge_swapin(memcg);
836 mem_cgroup_cancel_charge_swapin(memcg);
837 ret = 0; 839 ret = 0;
838 goto out; 840 goto out;
839 } 841 }
@@ -1328,6 +1330,14 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
1328 list_del(&se->list); 1330 list_del(&se->list);
1329 kfree(se); 1331 kfree(se);
1330 } 1332 }
1333
1334 if (sis->flags & SWP_FILE) {
1335 struct file *swap_file = sis->swap_file;
1336 struct address_space *mapping = swap_file->f_mapping;
1337
1338 sis->flags &= ~SWP_FILE;
1339 mapping->a_ops->swap_deactivate(swap_file);
1340 }
1331} 1341}
1332 1342
1333/* 1343/*
@@ -1336,7 +1346,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis)
1336 * 1346 *
1337 * This function rather assumes that it is called in ascending page order. 1347 * This function rather assumes that it is called in ascending page order.
1338 */ 1348 */
1339static int 1349int
1340add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, 1350add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1341 unsigned long nr_pages, sector_t start_block) 1351 unsigned long nr_pages, sector_t start_block)
1342{ 1352{
@@ -1409,98 +1419,28 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
1409 */ 1419 */
1410static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) 1420static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
1411{ 1421{
1412 struct inode *inode; 1422 struct file *swap_file = sis->swap_file;
1413 unsigned blocks_per_page; 1423 struct address_space *mapping = swap_file->f_mapping;
1414 unsigned long page_no; 1424 struct inode *inode = mapping->host;
1415 unsigned blkbits;
1416 sector_t probe_block;
1417 sector_t last_block;
1418 sector_t lowest_block = -1;
1419 sector_t highest_block = 0;
1420 int nr_extents = 0;
1421 int ret; 1425 int ret;
1422 1426
1423 inode = sis->swap_file->f_mapping->host;
1424 if (S_ISBLK(inode->i_mode)) { 1427 if (S_ISBLK(inode->i_mode)) {
1425 ret = add_swap_extent(sis, 0, sis->max, 0); 1428 ret = add_swap_extent(sis, 0, sis->max, 0);
1426 *span = sis->pages; 1429 *span = sis->pages;
1427 goto out; 1430 return ret;
1428 } 1431 }
1429 1432
1430 blkbits = inode->i_blkbits; 1433 if (mapping->a_ops->swap_activate) {
1431 blocks_per_page = PAGE_SIZE >> blkbits; 1434 ret = mapping->a_ops->swap_activate(sis, swap_file, span);
1432 1435 if (!ret) {
1433 /* 1436 sis->flags |= SWP_FILE;
1434 * Map all the blocks into the extent list. This code doesn't try 1437 ret = add_swap_extent(sis, 0, sis->max, 0);
1435 * to be very smart. 1438 *span = sis->pages;
1436 */
1437 probe_block = 0;
1438 page_no = 0;
1439 last_block = i_size_read(inode) >> blkbits;
1440 while ((probe_block + blocks_per_page) <= last_block &&
1441 page_no < sis->max) {
1442 unsigned block_in_page;
1443 sector_t first_block;
1444
1445 first_block = bmap(inode, probe_block);
1446 if (first_block == 0)
1447 goto bad_bmap;
1448
1449 /*
1450 * It must be PAGE_SIZE aligned on-disk
1451 */
1452 if (first_block & (blocks_per_page - 1)) {
1453 probe_block++;
1454 goto reprobe;
1455 }
1456
1457 for (block_in_page = 1; block_in_page < blocks_per_page;
1458 block_in_page++) {
1459 sector_t block;
1460
1461 block = bmap(inode, probe_block + block_in_page);
1462 if (block == 0)
1463 goto bad_bmap;
1464 if (block != first_block + block_in_page) {
1465 /* Discontiguity */
1466 probe_block++;
1467 goto reprobe;
1468 }
1469 }
1470
1471 first_block >>= (PAGE_SHIFT - blkbits);
1472 if (page_no) { /* exclude the header page */
1473 if (first_block < lowest_block)
1474 lowest_block = first_block;
1475 if (first_block > highest_block)
1476 highest_block = first_block;
1477 } 1439 }
1440 return ret;
1441 }
1478 1442
1479 /* 1443 return generic_swapfile_activate(sis, swap_file, span);
1480 * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
1481 */
1482 ret = add_swap_extent(sis, page_no, 1, first_block);
1483 if (ret < 0)
1484 goto out;
1485 nr_extents += ret;
1486 page_no++;
1487 probe_block += blocks_per_page;
1488reprobe:
1489 continue;
1490 }
1491 ret = nr_extents;
1492 *span = 1 + highest_block - lowest_block;
1493 if (page_no == 0)
1494 page_no = 1; /* force Empty message */
1495 sis->max = page_no;
1496 sis->pages = page_no - 1;
1497 sis->highest_bit = page_no - 1;
1498out:
1499 return ret;
1500bad_bmap:
1501 printk(KERN_ERR "swapon: swapfile has holes\n");
1502 ret = -EINVAL;
1503 goto out;
1504} 1444}
1505 1445
1506static void enable_swap_info(struct swap_info_struct *p, int prio, 1446static void enable_swap_info(struct swap_info_struct *p, int prio,
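The removed setup_swap_extents() body above walked the swap file with bmap(), accepting a page only if its blocks were physically contiguous and the run started PAGE_SIZE-aligned on disk; that work now sits behind generic_swapfile_activate() or the filesystem's swap_activate hook. A userspace sketch of the per-page check, using an invented block map and 4 blocks per page:

#include <stdio.h>

#define BLOCKS_PER_PAGE 4       /* e.g. 4096-byte page, 1024-byte blocks */

/* Fake bmap(): logical block -> physical block (0 = hole).  Invented data. */
static unsigned long fake_bmap(unsigned long logical)
{
        static const unsigned long map[] = {
                100, 101, 102, 103,     /* contiguous, aligned: good page */
                204, 205, 206, 208,     /* aligned start, discontiguous: rejected */
        };
        return logical < sizeof(map) / sizeof(map[0]) ? map[logical] : 0;
}

static int page_is_mappable(unsigned long probe_block,
                            unsigned long *first_block)
{
        unsigned long b0 = fake_bmap(probe_block);
        unsigned long i;

        if (b0 == 0)                            /* hole */
                return 0;
        if (b0 & (BLOCKS_PER_PAGE - 1))         /* not PAGE_SIZE aligned on disk */
                return 0;
        for (i = 1; i < BLOCKS_PER_PAGE; i++)
                if (fake_bmap(probe_block + i) != b0 + i)
                        return 0;               /* discontiguity */
        *first_block = b0;
        return 1;
}

int main(void)
{
        unsigned long first;

        printf("probe 0: %s\n", page_is_mappable(0, &first) ? "extent" : "skip");
        printf("probe 4: %s\n", page_is_mappable(4, &first) ? "extent" : "skip");
        return 0;
}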
@@ -2285,6 +2225,31 @@ int swapcache_prepare(swp_entry_t entry)
2285 return __swap_duplicate(entry, SWAP_HAS_CACHE); 2225 return __swap_duplicate(entry, SWAP_HAS_CACHE);
2286} 2226}
2287 2227
2228struct swap_info_struct *page_swap_info(struct page *page)
2229{
2230 swp_entry_t swap = { .val = page_private(page) };
2231 BUG_ON(!PageSwapCache(page));
2232 return swap_info[swp_type(swap)];
2233}
2234
2235/*
2236 * out-of-line __page_file_ methods to avoid include hell.
2237 */
2238struct address_space *__page_file_mapping(struct page *page)
2239{
2240 VM_BUG_ON(!PageSwapCache(page));
2241 return page_swap_info(page)->swap_file->f_mapping;
2242}
2243EXPORT_SYMBOL_GPL(__page_file_mapping);
2244
2245pgoff_t __page_file_index(struct page *page)
2246{
2247 swp_entry_t swap = { .val = page_private(page) };
2248 VM_BUG_ON(!PageSwapCache(page));
2249 return swp_offset(swap);
2250}
2251EXPORT_SYMBOL_GPL(__page_file_index);
2252
2288/* 2253/*
2289 * add_swap_count_continuation - called when a swap count is duplicated 2254 * add_swap_count_continuation - called when a swap count is duplicated
2290 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's 2255 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
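page_swap_info() and __page_file_index() above treat page_private() as a packed swap entry and extract its type and offset. The sketch below illustrates the idea with an assumed 5-bit-type packing; the kernel's real encoding is arch-specific and not taken from this diff.

#include <stdio.h>

#define SWP_TYPE_BITS 5         /* assumed split, for illustration only */

typedef struct { unsigned long val; } swp_entry_t;

static swp_entry_t swp_entry(unsigned long type, unsigned long offset)
{
        swp_entry_t e = { (type << (sizeof(long) * 8 - SWP_TYPE_BITS)) | offset };
        return e;
}

static unsigned long swp_type(swp_entry_t e)
{
        return e.val >> (sizeof(long) * 8 - SWP_TYPE_BITS);
}

static unsigned long swp_offset(swp_entry_t e)
{
        return e.val & ((1UL << (sizeof(long) * 8 - SWP_TYPE_BITS)) - 1);
}

int main(void)
{
        swp_entry_t e = swp_entry(2, 12345);    /* swap area 2, offset 12345 */

        /* __page_file_index() returns the offset part, the page's index
         * within the swap file. */
        printf("type %lu, page index in swap file %lu\n",
               swp_type(e), swp_offset(e));
        return 0;
}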
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2aad49981b5..2bb90b1d241 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -413,11 +413,11 @@ nocache:
413 if (addr + size - 1 < addr) 413 if (addr + size - 1 < addr)
414 goto overflow; 414 goto overflow;
415 415
416 n = rb_next(&first->rb_node); 416 if (list_is_last(&first->list, &vmap_area_list))
417 if (n)
418 first = rb_entry(n, struct vmap_area, rb_node);
419 else
420 goto found; 417 goto found;
418
419 first = list_entry(first->list.next,
420 struct vmap_area, list);
421 } 421 }
422 422
423found: 423found:
@@ -904,6 +904,14 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
904 904
905 BUG_ON(size & ~PAGE_MASK); 905 BUG_ON(size & ~PAGE_MASK);
906 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); 906 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
907 if (WARN_ON(size == 0)) {
908 /*
909 * Allocating 0 bytes isn't what caller wants since
910 * get_order(0) returns funny result. Just warn and terminate
911 * early.
912 */
913 return NULL;
914 }
907 order = get_order(size); 915 order = get_order(size);
908 916
909again: 917again:
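The new WARN_ON(size == 0) in vb_alloc() exists because a get_order()-style computation misbehaves for size 0: the size - 1 step underflows and yields an absurdly large order. A userspace sketch that mimics the classic generic helper (PAGE_SHIFT 12 assumed; this is not the kernel source) shows the effect:

#include <stdio.h>

#define PAGE_SHIFT 12

static int order_of(unsigned long size)
{
        int order = -1;

        size = (size - 1) >> (PAGE_SHIFT - 1);
        do {
                size >>= 1;
                order++;
        } while (size);
        return order;
}

int main(void)
{
        printf("order_of(4096) = %d\n", order_of(4096));  /* 0 */
        printf("order_of(8192) = %d\n", order_of(8192));  /* 1 */
        printf("order_of(0)    = %d\n", order_of(0));     /* huge: underflow */
        return 0;
}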
@@ -1280,7 +1288,7 @@ DEFINE_RWLOCK(vmlist_lock);
1280struct vm_struct *vmlist; 1288struct vm_struct *vmlist;
1281 1289
1282static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1290static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1283 unsigned long flags, void *caller) 1291 unsigned long flags, const void *caller)
1284{ 1292{
1285 vm->flags = flags; 1293 vm->flags = flags;
1286 vm->addr = (void *)va->va_start; 1294 vm->addr = (void *)va->va_start;
@@ -1306,7 +1314,7 @@ static void insert_vmalloc_vmlist(struct vm_struct *vm)
1306} 1314}
1307 1315
1308static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1316static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1309 unsigned long flags, void *caller) 1317 unsigned long flags, const void *caller)
1310{ 1318{
1311 setup_vmalloc_vm(vm, va, flags, caller); 1319 setup_vmalloc_vm(vm, va, flags, caller);
1312 insert_vmalloc_vmlist(vm); 1320 insert_vmalloc_vmlist(vm);
@@ -1314,7 +1322,7 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1314 1322
1315static struct vm_struct *__get_vm_area_node(unsigned long size, 1323static struct vm_struct *__get_vm_area_node(unsigned long size,
1316 unsigned long align, unsigned long flags, unsigned long start, 1324 unsigned long align, unsigned long flags, unsigned long start,
1317 unsigned long end, int node, gfp_t gfp_mask, void *caller) 1325 unsigned long end, int node, gfp_t gfp_mask, const void *caller)
1318{ 1326{
1319 struct vmap_area *va; 1327 struct vmap_area *va;
1320 struct vm_struct *area; 1328 struct vm_struct *area;
@@ -1375,7 +1383,7 @@ EXPORT_SYMBOL_GPL(__get_vm_area);
1375 1383
1376struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, 1384struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1377 unsigned long start, unsigned long end, 1385 unsigned long start, unsigned long end,
1378 void *caller) 1386 const void *caller)
1379{ 1387{
1380 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, 1388 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
1381 caller); 1389 caller);
@@ -1397,13 +1405,21 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1397} 1405}
1398 1406
1399struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 1407struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1400 void *caller) 1408 const void *caller)
1401{ 1409{
1402 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, 1410 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1403 -1, GFP_KERNEL, caller); 1411 -1, GFP_KERNEL, caller);
1404} 1412}
1405 1413
1406static struct vm_struct *find_vm_area(const void *addr) 1414/**
1415 * find_vm_area - find a continuous kernel virtual area
1416 * @addr: base address
1417 *
1418 * Search for the kernel VM area starting at @addr, and return it.
1419 * It is up to the caller to do all required locking to keep the returned
1420 * pointer valid.
1421 */
1422struct vm_struct *find_vm_area(const void *addr)
1407{ 1423{
1408 struct vmap_area *va; 1424 struct vmap_area *va;
1409 1425
@@ -1568,9 +1584,9 @@ EXPORT_SYMBOL(vmap);
1568 1584
1569static void *__vmalloc_node(unsigned long size, unsigned long align, 1585static void *__vmalloc_node(unsigned long size, unsigned long align,
1570 gfp_t gfp_mask, pgprot_t prot, 1586 gfp_t gfp_mask, pgprot_t prot,
1571 int node, void *caller); 1587 int node, const void *caller);
1572static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1588static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1573 pgprot_t prot, int node, void *caller) 1589 pgprot_t prot, int node, const void *caller)
1574{ 1590{
1575 const int order = 0; 1591 const int order = 0;
1576 struct page **pages; 1592 struct page **pages;
@@ -1643,7 +1659,7 @@ fail:
1643 */ 1659 */
1644void *__vmalloc_node_range(unsigned long size, unsigned long align, 1660void *__vmalloc_node_range(unsigned long size, unsigned long align,
1645 unsigned long start, unsigned long end, gfp_t gfp_mask, 1661 unsigned long start, unsigned long end, gfp_t gfp_mask,
1646 pgprot_t prot, int node, void *caller) 1662 pgprot_t prot, int node, const void *caller)
1647{ 1663{
1648 struct vm_struct *area; 1664 struct vm_struct *area;
1649 void *addr; 1665 void *addr;
@@ -1699,7 +1715,7 @@ fail:
1699 */ 1715 */
1700static void *__vmalloc_node(unsigned long size, unsigned long align, 1716static void *__vmalloc_node(unsigned long size, unsigned long align,
1701 gfp_t gfp_mask, pgprot_t prot, 1717 gfp_t gfp_mask, pgprot_t prot,
1702 int node, void *caller) 1718 int node, const void *caller)
1703{ 1719{
1704 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, 1720 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
1705 gfp_mask, prot, node, caller); 1721 gfp_mask, prot, node, caller);
@@ -1975,9 +1991,7 @@ static int aligned_vwrite(char *buf, char *addr, unsigned long count)
1975 * IOREMAP area is treated as memory hole and no copy is done. 1991 * IOREMAP area is treated as memory hole and no copy is done.
1976 * 1992 *
1977 * If [addr...addr+count) doesn't includes any intersects with alive 1993 * If [addr...addr+count) doesn't includes any intersects with alive
1978 * vm_struct area, returns 0. 1994 * vm_struct area, returns 0. @buf should be kernel's buffer.
1979 * @buf should be kernel's buffer. Because this function uses KM_USER0,
1980 * the caller should guarantee KM_USER0 is not used.
1981 * 1995 *
1982 * Note: In usual ops, vread() is never necessary because the caller 1996 * Note: In usual ops, vread() is never necessary because the caller
1983 * should know vmalloc() area is valid and can use memcpy(). 1997 * should know vmalloc() area is valid and can use memcpy().
@@ -2051,9 +2065,7 @@ finished:
2051 * IOREMAP area is treated as memory hole and no copy is done. 2065 * IOREMAP area is treated as memory hole and no copy is done.
2052 * 2066 *
2053 * If [addr...addr+count) doesn't includes any intersects with alive 2067 * If [addr...addr+count) doesn't includes any intersects with alive
2054 * vm_struct area, returns 0. 2068 * vm_struct area, returns 0. @buf should be kernel's buffer.
2055 * @buf should be kernel's buffer. Because this function uses KM_USER0,
2056 * the caller should guarantee KM_USER0 is not used.
2057 * 2069 *
2058 * Note: In usual ops, vwrite() is never necessary because the caller 2070 * Note: In usual ops, vwrite() is never necessary because the caller
2059 * should know vmalloc() area is valid and can use memcpy(). 2071 * should know vmalloc() area is valid and can use memcpy().
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 661576324c7..8d01243d956 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -133,7 +133,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */
133static LIST_HEAD(shrinker_list); 133static LIST_HEAD(shrinker_list);
134static DECLARE_RWSEM(shrinker_rwsem); 134static DECLARE_RWSEM(shrinker_rwsem);
135 135
136#ifdef CONFIG_CGROUP_MEM_RES_CTLR 136#ifdef CONFIG_MEMCG
137static bool global_reclaim(struct scan_control *sc) 137static bool global_reclaim(struct scan_control *sc)
138{ 138{
139 return !sc->target_mem_cgroup; 139 return !sc->target_mem_cgroup;
@@ -687,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
687 687
688 cond_resched(); 688 cond_resched();
689 689
690 mem_cgroup_uncharge_start();
690 while (!list_empty(page_list)) { 691 while (!list_empty(page_list)) {
691 enum page_references references; 692 enum page_references references;
692 struct address_space *mapping; 693 struct address_space *mapping;
@@ -720,9 +721,41 @@ static unsigned long shrink_page_list(struct list_head *page_list,
720 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 721 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
721 722
722 if (PageWriteback(page)) { 723 if (PageWriteback(page)) {
723 nr_writeback++; 724 /*
724 unlock_page(page); 725 * memcg doesn't have any dirty pages throttling so we
725 goto keep; 726 * could easily OOM just because too many pages are in
727 * writeback and there is nothing else to reclaim.
728 *
729 * Check __GFP_IO, certainly because a loop driver
730 * thread might enter reclaim, and deadlock if it waits
731 * on a page for which it is needed to do the write
732 * (loop masks off __GFP_IO|__GFP_FS for this reason);
733 * but more thought would probably show more reasons.
734 *
735 * Don't require __GFP_FS, since we're not going into
736 * the FS, just waiting on its writeback completion.
 737 * Worryingly, ext4, gfs2 and xfs allocate pages with
738 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
739 * testing may_enter_fs here is liable to OOM on them.
740 */
741 if (global_reclaim(sc) ||
742 !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
743 /*
744 * This is slightly racy - end_page_writeback()
745 * might have just cleared PageReclaim, then
 746 * setting PageReclaim here ends up interpreted
747 * as PageReadahead - but that does not matter
748 * enough to care. What we do want is for this
749 * page to have PageReclaim set next time memcg
750 * reclaim reaches the tests above, so it will
751 * then wait_on_page_writeback() to avoid OOM;
752 * and it's also appropriate in global reclaim.
753 */
754 SetPageReclaim(page);
755 nr_writeback++;
756 goto keep_locked;
757 }
758 wait_on_page_writeback(page);
726 } 759 }
727 760
728 references = page_check_references(page, sc); 761 references = page_check_references(page, sc);
@@ -921,6 +954,7 @@ keep:
921 954
922 list_splice(&ret_pages, page_list); 955 list_splice(&ret_pages, page_list);
923 count_vm_events(PGACTIVATE, pgactivate); 956 count_vm_events(PGACTIVATE, pgactivate);
957 mem_cgroup_uncharge_end();
924 *ret_nr_dirty += nr_dirty; 958 *ret_nr_dirty += nr_dirty;
925 *ret_nr_writeback += nr_writeback; 959 *ret_nr_writeback += nr_writeback;
926 return nr_reclaimed; 960 return nr_reclaimed;
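The PageWriteback handling added above waits for writeback only in memcg reclaim, and only when the page already carries PageReclaim and the caller may do IO; every other case just sets PG_reclaim, counts the page and keeps it locked on the list. A tiny userspace model of that decision, with the page flags reduced to booleans:

#include <stdbool.h>
#include <stdio.h>

enum wb_action { WB_KEEP_AND_MARK_RECLAIM, WB_WAIT_FOR_WRITEBACK };

/* Same condition as the hunk above: global reclaim, a first encounter of
 * the page, or a caller without __GFP_IO never waits. */
static enum wb_action writeback_action(bool global_reclaim, bool page_reclaim,
                                       bool gfp_allows_io)
{
        if (global_reclaim || !page_reclaim || !gfp_allows_io)
                return WB_KEEP_AND_MARK_RECLAIM;
        return WB_WAIT_FOR_WRITEBACK;
}

int main(void)
{
        /* memcg reclaim, second encounter of the page, __GFP_IO allowed. */
        printf("memcg, PageReclaim, __GFP_IO -> %s\n",
               writeback_action(false, true, true) == WB_WAIT_FOR_WRITEBACK ?
               "wait" : "keep");

        /* global reclaim never waits here, it only sets PG_reclaim. */
        printf("global reclaim              -> %s\n",
               writeback_action(true, true, true) == WB_WAIT_FOR_WRITEBACK ?
               "wait" : "keep");
        return 0;
}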
@@ -1567,7 +1601,8 @@ static int vmscan_swappiness(struct scan_control *sc)
1567 * by looking at the fraction of the pages scanned we did rotate back 1601 * by looking at the fraction of the pages scanned we did rotate back
1568 * onto the active list instead of evict. 1602 * onto the active list instead of evict.
1569 * 1603 *
1570 * nr[0] = anon pages to scan; nr[1] = file pages to scan 1604 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
1605 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
1571 */ 1606 */
1572static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, 1607static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
1573 unsigned long *nr) 1608 unsigned long *nr)
@@ -2111,6 +2146,83 @@ out:
2111 return 0; 2146 return 0;
2112} 2147}
2113 2148
2149static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2150{
2151 struct zone *zone;
2152 unsigned long pfmemalloc_reserve = 0;
2153 unsigned long free_pages = 0;
2154 int i;
2155 bool wmark_ok;
2156
2157 for (i = 0; i <= ZONE_NORMAL; i++) {
2158 zone = &pgdat->node_zones[i];
2159 pfmemalloc_reserve += min_wmark_pages(zone);
2160 free_pages += zone_page_state(zone, NR_FREE_PAGES);
2161 }
2162
2163 wmark_ok = free_pages > pfmemalloc_reserve / 2;
2164
2165 /* kswapd must be awake if processes are being throttled */
2166 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
2167 pgdat->classzone_idx = min(pgdat->classzone_idx,
2168 (enum zone_type)ZONE_NORMAL);
2169 wake_up_interruptible(&pgdat->kswapd_wait);
2170 }
2171
2172 return wmark_ok;
2173}
2174
2175/*
2176 * Throttle direct reclaimers if backing storage is backed by the network
2177 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
2178 * depleted. kswapd will continue to make progress and wake the processes
2179 * when the low watermark is reached
2180 */
2181static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2182 nodemask_t *nodemask)
2183{
2184 struct zone *zone;
2185 int high_zoneidx = gfp_zone(gfp_mask);
2186 pg_data_t *pgdat;
2187
2188 /*
2189 * Kernel threads should not be throttled as they may be indirectly
2190 * responsible for cleaning pages necessary for reclaim to make forward
2191 * progress. kjournald for example may enter direct reclaim while
 2192 * committing a transaction where throttling it could force other
2193 * processes to block on log_wait_commit().
2194 */
2195 if (current->flags & PF_KTHREAD)
2196 return;
2197
2198 /* Check if the pfmemalloc reserves are ok */
2199 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
2200 pgdat = zone->zone_pgdat;
2201 if (pfmemalloc_watermark_ok(pgdat))
2202 return;
2203
2204 /* Account for the throttling */
2205 count_vm_event(PGSCAN_DIRECT_THROTTLE);
2206
2207 /*
2208 * If the caller cannot enter the filesystem, it's possible that it
2209 * is due to the caller holding an FS lock or performing a journal
2210 * transaction in the case of a filesystem like ext[3|4]. In this case,
2211 * it is not safe to block on pfmemalloc_wait as kswapd could be
2212 * blocked waiting on the same lock. Instead, throttle for up to a
2213 * second before continuing.
2214 */
2215 if (!(gfp_mask & __GFP_FS)) {
2216 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2217 pfmemalloc_watermark_ok(pgdat), HZ);
2218 return;
2219 }
2220
2221 /* Throttle until kswapd wakes the process */
2222 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2223 pfmemalloc_watermark_ok(pgdat));
2224}
2225
2114unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 2226unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2115 gfp_t gfp_mask, nodemask_t *nodemask) 2227 gfp_t gfp_mask, nodemask_t *nodemask)
2116{ 2228{
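pfmemalloc_watermark_ok() above sums the min watermarks and free pages of the zones up to ZONE_NORMAL and throttles direct reclaim once free pages fall to half of that reserve or below. A self-contained userspace sketch of the test, with made-up zone sizes:

#include <stdbool.h>
#include <stdio.h>

struct zone_sample {
        const char *name;
        unsigned long min_wmark_pages;
        unsigned long nr_free_pages;
};

static bool pfmemalloc_watermark_ok(const struct zone_sample *zones, int nr)
{
        unsigned long pfmemalloc_reserve = 0, free_pages = 0;
        int i;

        for (i = 0; i < nr; i++) {
                pfmemalloc_reserve += zones[i].min_wmark_pages;
                free_pages += zones[i].nr_free_pages;
        }
        return free_pages > pfmemalloc_reserve / 2;
}

int main(void)
{
        struct zone_sample node[] = {   /* zones up to ZONE_NORMAL, invented */
                { "DMA",    128,  20 },
                { "Normal", 4096, 1800 },
        };

        printf("watermark ok: %s\n",
               pfmemalloc_watermark_ok(node, 2) ?
               "yes (no throttling)" : "no (throttle direct reclaim)");
        return 0;
}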
@@ -2130,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2130 .gfp_mask = sc.gfp_mask, 2242 .gfp_mask = sc.gfp_mask,
2131 }; 2243 };
2132 2244
2245 throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
2246
2247 /*
2248 * Do not enter reclaim if fatal signal is pending. 1 is returned so
2249 * that the page allocator does not consider triggering OOM
2250 */
2251 if (fatal_signal_pending(current))
2252 return 1;
2253
2133 trace_mm_vmscan_direct_reclaim_begin(order, 2254 trace_mm_vmscan_direct_reclaim_begin(order,
2134 sc.may_writepage, 2255 sc.may_writepage,
2135 gfp_mask); 2256 gfp_mask);
@@ -2141,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2141 return nr_reclaimed; 2262 return nr_reclaimed;
2142} 2263}
2143 2264
2144#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2265#ifdef CONFIG_MEMCG
2145 2266
2146unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, 2267unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
2147 gfp_t gfp_mask, bool noswap, 2268 gfp_t gfp_mask, bool noswap,
@@ -2274,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2274 return balanced_pages >= (present_pages >> 2); 2395 return balanced_pages >= (present_pages >> 2);
2275} 2396}
2276 2397
2277/* is kswapd sleeping prematurely? */ 2398/*
2278static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, 2399 * Prepare kswapd for sleeping. This verifies that there are no processes
2400 * waiting in throttle_direct_reclaim() and that watermarks have been met.
2401 *
2402 * Returns true if kswapd is ready to sleep
2403 */
2404static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2279 int classzone_idx) 2405 int classzone_idx)
2280{ 2406{
2281 int i; 2407 int i;
@@ -2284,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2284 2410
2285 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2411 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2286 if (remaining) 2412 if (remaining)
2287 return true; 2413 return false;
2414
2415 /*
2416 * There is a potential race between when kswapd checks its watermarks
2417 * and a process gets throttled. There is also a potential race if
 2418 * processes get throttled, kswapd wakes, a large process exits, thereby
 2419 * balancing the zones, which causes kswapd to miss a wakeup. If kswapd
2420 * is going to sleep, no process should be sleeping on pfmemalloc_wait
2421 * so wake them now if necessary. If necessary, processes will wake
2422 * kswapd and get throttled again
2423 */
2424 if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
2425 wake_up(&pgdat->pfmemalloc_wait);
2426 return false;
2427 }
2288 2428
2289 /* Check the watermark levels */ 2429 /* Check the watermark levels */
2290 for (i = 0; i <= classzone_idx; i++) { 2430 for (i = 0; i <= classzone_idx; i++) {
@@ -2317,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2317 * must be balanced 2457 * must be balanced
2318 */ 2458 */
2319 if (order) 2459 if (order)
2320 return !pgdat_balanced(pgdat, balanced, classzone_idx); 2460 return pgdat_balanced(pgdat, balanced, classzone_idx);
2321 else 2461 else
2322 return !all_zones_ok; 2462 return all_zones_ok;
2323} 2463}
2324 2464
2325/* 2465/*
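With the return sense fixed above, prepare_kswapd_sleep() lets kswapd sleep for order-0 work only when every zone met its watermark, and for higher orders once pgdat_balanced() sees at least a quarter of the node's pages in balanced zones (balanced_pages >= present_pages >> 2). A userspace sketch of that final decision, with illustrative numbers:

#include <stdbool.h>
#include <stdio.h>

static bool pgdat_balanced(unsigned long balanced_pages,
                           unsigned long present_pages)
{
        return balanced_pages >= (present_pages >> 2);
}

static bool kswapd_may_sleep(int order, bool all_zones_ok,
                             unsigned long balanced_pages,
                             unsigned long present_pages)
{
        if (order)
                return pgdat_balanced(balanced_pages, present_pages);
        return all_zones_ok;
}

int main(void)
{
        /* order-2 reclaim, 300k of 1M pages in balanced zones: 30% >= 25%. */
        printf("order 2: %s\n",
               kswapd_may_sleep(2, false, 300000, 1000000) ?
               "sleep" : "keep reclaiming");

        /* order-0 reclaim with one unbalanced zone left. */
        printf("order 0: %s\n",
               kswapd_may_sleep(0, false, 300000, 1000000) ?
               "sleep" : "keep reclaiming");
        return 0;
}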
@@ -2537,7 +2677,7 @@ loop_again:
2537 * consider it to be no longer congested. It's 2677 * consider it to be no longer congested. It's
2538 * possible there are dirty pages backed by 2678 * possible there are dirty pages backed by
2539 * congested BDIs but as pressure is relieved, 2679 * congested BDIs but as pressure is relieved,
2540 * spectulatively avoid congestion waits 2680 * speculatively avoid congestion waits
2541 */ 2681 */
2542 zone_clear_flag(zone, ZONE_CONGESTED); 2682 zone_clear_flag(zone, ZONE_CONGESTED);
2543 if (i <= *classzone_idx) 2683 if (i <= *classzone_idx)
@@ -2545,6 +2685,16 @@ loop_again:
2545 } 2685 }
2546 2686
2547 } 2687 }
2688
2689 /*
2690 * If the low watermark is met there is no need for processes
 2691 * to be throttled on pfmemalloc_wait as they should now be
 2692 * able to safely make forward progress. Wake them
2693 */
2694 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
2695 pfmemalloc_watermark_ok(pgdat))
2696 wake_up(&pgdat->pfmemalloc_wait);
2697
2548 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) 2698 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2549 break; /* kswapd: all done */ 2699 break; /* kswapd: all done */
2550 /* 2700 /*
@@ -2646,7 +2796,7 @@ out:
2646 } 2796 }
2647 2797
2648 /* 2798 /*
2649 * Return the order we were reclaiming at so sleeping_prematurely() 2799 * Return the order we were reclaiming at so prepare_kswapd_sleep()
2650 * makes a decision on the order we were last reclaiming at. However, 2800 * makes a decision on the order we were last reclaiming at. However,
2651 * if another caller entered the allocator slow path while kswapd 2801 * if another caller entered the allocator slow path while kswapd
2652 * was awake, order will remain at the higher level 2802 * was awake, order will remain at the higher level
@@ -2666,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2666 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2816 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2667 2817
2668 /* Try to sleep for a short interval */ 2818 /* Try to sleep for a short interval */
2669 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2819 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
2670 remaining = schedule_timeout(HZ/10); 2820 remaining = schedule_timeout(HZ/10);
2671 finish_wait(&pgdat->kswapd_wait, &wait); 2821 finish_wait(&pgdat->kswapd_wait, &wait);
2672 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2822 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2676,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2676 * After a short sleep, check if it was a premature sleep. If not, then 2826 * After a short sleep, check if it was a premature sleep. If not, then
2677 * go fully to sleep until explicitly woken up. 2827 * go fully to sleep until explicitly woken up.
2678 */ 2828 */
2679 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2829 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
2680 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); 2830 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2681 2831
2682 /* 2832 /*
@@ -2688,7 +2838,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2688 * them before going back to sleep. 2838 * them before going back to sleep.
2689 */ 2839 */
2690 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); 2840 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2691 schedule(); 2841
2842 if (!kthread_should_stop())
2843 schedule();
2844
2692 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); 2845 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2693 } else { 2846 } else {
2694 if (remaining) 2847 if (remaining)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1bbbbd9776a..df7a6748231 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -745,6 +745,7 @@ const char * const vmstat_text[] = {
745 TEXTS_FOR_ZONES("pgsteal_direct") 745 TEXTS_FOR_ZONES("pgsteal_direct")
746 TEXTS_FOR_ZONES("pgscan_kswapd") 746 TEXTS_FOR_ZONES("pgscan_kswapd")
747 TEXTS_FOR_ZONES("pgscan_direct") 747 TEXTS_FOR_ZONES("pgscan_direct")
748 "pgscan_direct_throttle",
748 749
749#ifdef CONFIG_NUMA 750#ifdef CONFIG_NUMA
750 "zone_reclaim_failed", 751 "zone_reclaim_failed",