author     Luciano Coelho <coelho@ti.com>	2011-12-01 05:14:48 -0500
committer  Luciano Coelho <coelho@ti.com>	2011-12-01 05:14:48 -0500
commit     e4da3fbfbd1de56d2367653e3823e6445e49f8a9 (patch)
tree       f69f424f731b89a75f881967903ff2f38f4b6a92 /mm
parent     b693289406f0b8ca70ab77e745be6196d5740eb0 (diff)
parent     ba5736a5e9ac20c378ae4179e8a0ed3cc4b44351 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wireless-next into wl12xx-next
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig             |    3
-rw-r--r--  mm/Makefile            |    3
-rw-r--r--  mm/backing-dev.c       |   36
-rw-r--r--  mm/bootmem.c           |    2
-rw-r--r--  mm/bounce.c            |   11
-rw-r--r--  mm/compaction.c        |   26
-rw-r--r--  mm/debug-pagealloc.c   |   56
-rw-r--r--  mm/dmapool.c           |    3
-rw-r--r--  mm/failslab.c          |   14
-rw-r--r--  mm/filemap.c           |  117
-rw-r--r--  mm/filemap_xip.c       |    2
-rw-r--r--  mm/fremap.c            |    1
-rw-r--r--  mm/highmem.c           |    8
-rw-r--r--  mm/huge_memory.c       |   91
-rw-r--r--  mm/internal.h          |   46
-rw-r--r--  mm/kmemleak.c          |    2
-rw-r--r--  mm/ksm.c               |    3
-rw-r--r--  mm/maccess.c           |    2
-rw-r--r--  mm/memblock.c          |   11
-rw-r--r--  mm/memcontrol.c        | 1286
-rw-r--r--  mm/memory-failure.c    |  105
-rw-r--r--  mm/memory.c            |    4
-rw-r--r--  mm/memory_hotplug.c    |    2
-rw-r--r--  mm/mempolicy.c         |   13
-rw-r--r--  mm/mempool.c           |    2
-rw-r--r--  mm/migrate.c           |   85
-rw-r--r--  mm/mincore.c           |   11
-rw-r--r--  mm/mlock.c             |   15
-rw-r--r--  mm/mm_init.c           |    2
-rw-r--r--  mm/mmap.c              |   11
-rw-r--r--  mm/mmu_context.c       |    2
-rw-r--r--  mm/mmu_notifier.c      |    2
-rw-r--r--  mm/mmzone.c            |    1
-rw-r--r--  mm/mremap.c            |   42
-rw-r--r--  mm/nobootmem.c         |    2
-rw-r--r--  mm/nommu.c             |    2
-rw-r--r--  mm/oom_kill.c          |   59
-rw-r--r--  mm/page-writeback.c    |  723
-rw-r--r--  mm/page_alloc.c        |   30
-rw-r--r--  mm/page_cgroup.c       |   12
-rw-r--r--  mm/process_vm_access.c |  496
-rw-r--r--  mm/quicklist.c         |    1
-rw-r--r--  mm/readahead.c         |    2
-rw-r--r--  mm/rmap.c              |    4
-rw-r--r--  mm/shmem.c             | 1507
-rw-r--r--  mm/slab.c              |  118
-rw-r--r--  mm/slob.c              |    2
-rw-r--r--  mm/slub.c              | 1239
-rw-r--r--  mm/sparse-vmemmap.c    |    1
-rw-r--r--  mm/sparse.c            |    2
-rw-r--r--  mm/swap.c              |   85
-rw-r--r--  mm/swap_state.c        |    1
-rw-r--r--  mm/swapfile.c          |   23
-rw-r--r--  mm/thrash.c            |    2
-rw-r--r--  mm/truncate.c          |   10
-rw-r--r--  mm/util.c              |    2
-rw-r--r--  mm/vmalloc.c           |   95
-rw-r--r--  mm/vmscan.c            |  399
-rw-r--r--  mm/vmstat.c            |    7
59 files changed, 3934 insertions, 2910 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index f2f1ca19ed53..011b110365c8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -131,6 +131,9 @@ config SPARSEMEM_VMEMMAP
 config HAVE_MEMBLOCK
 	boolean
 
+config NO_BOOTMEM
+	boolean
+
 # eventually, we can have this option just 'select SPARSEMEM'
 config MEMORY_HOTPLUG
 	bool "Allow for memory hot-add"
diff --git a/mm/Makefile b/mm/Makefile
index 836e4163c1bf..50ec00ef2a0e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,8 @@
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
-			   vmalloc.o pagewalk.o pgtable-generic.o
+			   vmalloc.o pagewalk.o pgtable-generic.o \
+			   process_vm_access.o
 
 obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o \
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index d6edf8d14f9c..a0860640378d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -97,6 +97,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   "BdiDirtyThresh:     %10lu kB\n"
 		   "DirtyThresh:        %10lu kB\n"
 		   "BackgroundThresh:   %10lu kB\n"
+		   "BdiDirtied:         %10lu kB\n"
 		   "BdiWritten:         %10lu kB\n"
 		   "BdiWriteBandwidth:  %10lu kBps\n"
 		   "b_dirty:            %10lu\n"
@@ -109,6 +110,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   K(bdi_thresh),
 		   K(dirty_thresh),
 		   K(background_thresh),
+		   (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
 		   (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
 		   (unsigned long) K(bdi->write_bandwidth),
 		   nr_dirty,
@@ -359,6 +361,17 @@ static unsigned long bdi_longest_inactive(void)
 	return max(5UL * 60 * HZ, interval);
 }
 
+/*
+ * Clear pending bit and wakeup anybody waiting for flusher thread creation or
+ * shutdown
+ */
+static void bdi_clear_pending(struct backing_dev_info *bdi)
+{
+	clear_bit(BDI_pending, &bdi->state);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&bdi->state, BDI_pending);
+}
+
 static int bdi_forker_thread(void *ptr)
 {
 	struct bdi_writeback *me = ptr;
@@ -390,6 +403,12 @@ static int bdi_forker_thread(void *ptr)
 		}
 
 		spin_lock_bh(&bdi_lock);
+		/*
+		 * In the following loop we are going to check whether we have
+		 * some work to do without any synchronization with tasks
+		 * waking us up to do work for them. Set the task state here
+		 * so that we don't miss wakeups after verifying conditions.
+		 */
 		set_current_state(TASK_INTERRUPTIBLE);
 
 		list_for_each_entry(bdi, &bdi_list, bdi_list) {
@@ -456,7 +475,8 @@ static int bdi_forker_thread(void *ptr)
 				 * the bdi from the thread. Hopefully 1024 is
 				 * large enough for efficient IO.
 				 */
-				writeback_inodes_wb(&bdi->wb, 1024);
+				writeback_inodes_wb(&bdi->wb, 1024,
+						    WB_REASON_FORKER_THREAD);
 			} else {
 				/*
 				 * The spinlock makes sure we do not lose
@@ -469,11 +489,13 @@ static int bdi_forker_thread(void *ptr)
 				spin_unlock_bh(&bdi->wb_lock);
 				wake_up_process(task);
 			}
+			bdi_clear_pending(bdi);
 			break;
 
 		case KILL_THREAD:
 			__set_current_state(TASK_RUNNING);
 			kthread_stop(task);
+			bdi_clear_pending(bdi);
 			break;
 
 		case NO_ACTION:
@@ -489,16 +511,8 @@ static int bdi_forker_thread(void *ptr)
 			else
 				schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
 			try_to_freeze();
-			/* Back to the main loop */
-			continue;
+			break;
 		}
-
-		/*
-		 * Clear pending bit and wakeup anybody waiting to tear us down.
-		 */
-		clear_bit(BDI_pending, &bdi->state);
-		smp_mb__after_clear_bit();
-		wake_up_bit(&bdi->state, BDI_pending);
 	}
 
 	return 0;
@@ -672,6 +686,8 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->bw_time_stamp = jiffies;
 	bdi->written_stamp = 0;
 
+	bdi->balanced_dirty_ratelimit = INIT_BW;
+	bdi->dirty_ratelimit = INIT_BW;
 	bdi->write_bandwidth = INIT_BW;
 	bdi->avg_write_bandwidth = INIT_BW;
 
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 01d5a4b3dd0c..1a77012ecdb3 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -12,7 +12,7 @@
 #include <linux/pfn.h>
 #include <linux/slab.h>
 #include <linux/bootmem.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/kmemleak.h>
 #include <linux/range.h>
 #include <linux/memblock.h>
diff --git a/mm/bounce.c b/mm/bounce.c
index 1481de68184b..4e9ae722af83 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -4,7 +4,7 @@
  */
 
 #include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/swap.h>
 #include <linux/gfp.h>
 #include <linux/bio.h>
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
+#include <linux/bootmem.h>
 #include <asm/tlbflush.h>
 
 #include <trace/events/block.h>
@@ -26,12 +27,10 @@ static mempool_t *page_pool, *isa_page_pool;
 #ifdef CONFIG_HIGHMEM
 static __init int init_emergency_pool(void)
 {
-	struct sysinfo i;
-	si_meminfo(&i);
-	si_swapinfo(&i);
-
-	if (!i.totalhigh)
+#ifndef CONFIG_MEMORY_HOTPLUG
+	if (max_pfn <= max_low_pfn)
 		return 0;
+#endif
 
 	page_pool = mempool_create_page_pool(POOL_SIZE, 0);
 	BUG_ON(!page_pool);
diff --git a/mm/compaction.c b/mm/compaction.c
index 6cc604bd5649..899d95638586 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,10 +35,6 @@ struct compact_control {
 	unsigned long migrate_pfn;	/* isolate_migratepages search base */
 	bool sync;			/* Synchronous migration */
 
-	/* Account for isolated anon and file pages */
-	unsigned long nr_anon;
-	unsigned long nr_file;
-
 	unsigned int order;		/* order a direct compactor needs */
 	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
 	struct zone *zone;
@@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone,
 static void acct_isolated(struct zone *zone, struct compact_control *cc)
 {
 	struct page *page;
-	unsigned int count[NR_LRU_LISTS] = { 0, };
+	unsigned int count[2] = { 0, };
 
-	list_for_each_entry(page, &cc->migratepages, lru) {
-		int lru = page_lru_base_type(page);
-		count[lru]++;
-	}
+	list_for_each_entry(page, &cc->migratepages, lru)
+		count[!!page_is_file_cache(page)]++;
 
-	cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
-	cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
-	__mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
-	__mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+	__mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
 }
 
 /* Similar to reclaim, but different enough that they don't share logic */
@@ -269,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	unsigned long last_pageblock_nr = 0, pageblock_nr;
 	unsigned long nr_scanned = 0, nr_isolated = 0;
 	struct list_head *migratelist = &cc->migratepages;
+	isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
 
 	/* Do not scan outside zone boundaries */
 	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
@@ -356,8 +349,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 			continue;
 		}
 
+		if (!cc->sync)
+			mode |= ISOLATE_CLEAN;
+
 		/* Try isolate the page */
-		if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
+		if (__isolate_lru_page(page, mode, 0) != 0)
 			continue;
 
 		VM_BUG_ON(PageTransCompound(page));
@@ -586,7 +582,7 @@ out:
 	return ret;
 }
 
-unsigned long compact_zone_order(struct zone *zone,
+static unsigned long compact_zone_order(struct zone *zone,
 				 int order, gfp_t gfp_mask,
 				 bool sync)
 {
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
index a1e3324de2b5..7cea557407f4 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/debug-pagealloc.c
@@ -1,7 +1,10 @@
 #include <linux/kernel.h>
+#include <linux/string.h>
 #include <linux/mm.h>
+#include <linux/highmem.h>
 #include <linux/page-debug-flags.h>
 #include <linux/poison.h>
+#include <linux/ratelimit.h>
 
 static inline void set_page_poison(struct page *page)
 {
@@ -18,28 +21,13 @@ static inline bool page_poison(struct page *page)
 	return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
 }
 
-static void poison_highpage(struct page *page)
-{
-	/*
-	 * Page poisoning for highmem pages is not implemented.
-	 *
-	 * This can be called from interrupt contexts.
-	 * So we need to create a new kmap_atomic slot for this
-	 * application and it will need interrupt protection.
-	 */
-}
-
 static void poison_page(struct page *page)
 {
-	void *addr;
+	void *addr = kmap_atomic(page);
 
-	if (PageHighMem(page)) {
-		poison_highpage(page);
-		return;
-	}
 	set_page_poison(page);
-	addr = page_address(page);
 	memset(addr, PAGE_POISON, PAGE_SIZE);
+	kunmap_atomic(addr);
 }
 
 static void poison_pages(struct page *page, int n)
@@ -59,14 +47,12 @@ static bool single_bit_flip(unsigned char a, unsigned char b)
 
 static void check_poison_mem(unsigned char *mem, size_t bytes)
 {
+	static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
 	unsigned char *start;
 	unsigned char *end;
 
-	for (start = mem; start < mem + bytes; start++) {
-		if (*start != PAGE_POISON)
-			break;
-	}
-	if (start == mem + bytes)
+	start = memchr_inv(mem, PAGE_POISON, bytes);
+	if (!start)
 		return;
 
 	for (end = mem + bytes - 1; end > start; end--) {
@@ -74,7 +60,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
 			break;
 	}
 
-	if (!printk_ratelimit())
+	if (!__ratelimit(&ratelimit))
 		return;
 	else if (start == end && single_bit_flip(*start, PAGE_POISON))
 		printk(KERN_ERR "pagealloc: single bit error\n");
@@ -86,27 +72,17 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
 	dump_stack();
 }
 
-static void unpoison_highpage(struct page *page)
-{
-	/*
-	 * See comment in poison_highpage().
-	 * Highmem pages should not be poisoned for now
-	 */
-	BUG_ON(page_poison(page));
-}
-
 static void unpoison_page(struct page *page)
 {
-	if (PageHighMem(page)) {
-		unpoison_highpage(page);
+	void *addr;
+
+	if (!page_poison(page))
 		return;
-	}
-	if (page_poison(page)) {
-		void *addr = page_address(page);
 
-		check_poison_mem(addr, PAGE_SIZE);
-		clear_page_poison(page);
-	}
+	addr = kmap_atomic(page);
+	check_poison_mem(addr, PAGE_SIZE);
+	clear_page_poison(page);
+	kunmap_atomic(addr);
 }
 
 static void unpoison_pages(struct page *page, int n)
diff --git a/mm/dmapool.c b/mm/dmapool.c
index fbb58e346888..c5ab33bca0a8 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -27,11 +27,12 @@
 #include <linux/dmapool.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/mutex.h>
 #include <linux/poison.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/stat.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/types.h>
diff --git a/mm/failslab.c b/mm/failslab.c
index 1ce58c201dca..0dd7b8fec71c 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -34,23 +34,23 @@ __setup("failslab=", setup_failslab);
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
 static int __init failslab_debugfs_init(void)
 {
+	struct dentry *dir;
 	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
-	int err;
 
-	err = init_fault_attr_dentries(&failslab.attr, "failslab");
-	if (err)
-		return err;
+	dir = fault_create_debugfs_attr("failslab", NULL, &failslab.attr);
+	if (IS_ERR(dir))
+		return PTR_ERR(dir);
 
-	if (!debugfs_create_bool("ignore-gfp-wait", mode, failslab.attr.dir,
+	if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
 				&failslab.ignore_gfp_wait))
 		goto fail;
-	if (!debugfs_create_bool("cache-filter", mode, failslab.attr.dir,
+	if (!debugfs_create_bool("cache-filter", mode, dir,
 				&failslab.cache_filter))
 		goto fail;
 
 	return 0;
 fail:
-	cleanup_fault_attr_dentries(&failslab.attr);
+	debugfs_remove_recursive(dir);
 
 	return -ENOMEM;
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 867d40222ec7..c0018f2d50e0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -9,7 +9,7 @@
  * most "normal" filesystems (but you don't /have/ to use this:
  * the NFS filesystem used to do this differently, for example)
  */
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/compiler.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
@@ -33,7 +33,6 @@
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
-#include <linux/mm_inline.h> /* for page_is_file_cache() */
 #include <linux/cleancache.h>
 #include "internal.h"
 
@@ -462,6 +461,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 	int error;
 
 	VM_BUG_ON(!PageLocked(page));
+	VM_BUG_ON(PageSwapBacked(page));
 
 	error = mem_cgroup_cache_charge(page, current->mm,
 					gfp_mask & GFP_RECLAIM_MASK);
@@ -479,8 +479,6 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 	if (likely(!error)) {
 		mapping->nrpages++;
 		__inc_zone_page_state(page, NR_FILE_PAGES);
-		if (PageSwapBacked(page))
-			__inc_zone_page_state(page, NR_SHMEM);
 		spin_unlock_irq(&mapping->tree_lock);
 	} else {
 		page->mapping = NULL;
@@ -502,22 +500,9 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 {
 	int ret;
 
-	/*
-	 * Splice_read and readahead add shmem/tmpfs pages into the page cache
-	 * before shmem_readpage has a chance to mark them as SwapBacked: they
-	 * need to go on the anon lru below, and mem_cgroup_cache_charge
-	 * (called in add_to_page_cache) needs to know where they're going too.
-	 */
-	if (mapping_cap_swap_backed(mapping))
-		SetPageSwapBacked(page);
-
 	ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-	if (ret == 0) {
-		if (page_is_file_cache(page))
-			lru_cache_add_file(page);
-		else
-			lru_cache_add_anon(page);
-	}
+	if (ret == 0)
+		lru_cache_add_file(page);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
@@ -714,9 +699,16 @@ repeat:
 		page = radix_tree_deref_slot(pagep);
 		if (unlikely(!page))
 			goto out;
-		if (radix_tree_deref_retry(page))
-			goto repeat;
-
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page))
+				goto repeat;
+			/*
+			 * Otherwise, shmem/tmpfs must be storing a swap entry
+			 * here as an exceptional entry: so return it without
+			 * attempting to raise page count.
+			 */
+			goto out;
+		}
 		if (!page_cache_get_speculative(page))
 			goto repeat;
 
@@ -753,7 +745,7 @@ struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
 
 repeat:
 	page = find_get_page(mapping, offset);
-	if (page) {
+	if (page && !radix_tree_exception(page)) {
 		lock_page(page);
 		/* Has the page been truncated? */
 		if (unlikely(page->mapping != mapping)) {
@@ -835,13 +827,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 {
 	unsigned int i;
 	unsigned int ret;
-	unsigned int nr_found;
+	unsigned int nr_found, nr_skip;
 
 	rcu_read_lock();
 restart:
 	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-				(void ***)pages, start, nr_pages);
+				(void ***)pages, NULL, start, nr_pages);
 	ret = 0;
+	nr_skip = 0;
 	for (i = 0; i < nr_found; i++) {
 		struct page *page;
 repeat:
@@ -849,13 +842,23 @@ repeat:
 		if (unlikely(!page))
 			continue;
 
-		/*
-		 * This can only trigger when the entry at index 0 moves out
-		 * of or back to the root: none yet gotten, safe to restart.
-		 */
-		if (radix_tree_deref_retry(page)) {
-			WARN_ON(start | i);
-			goto restart;
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page)) {
+				/*
+				 * Transient condition which can only trigger
+				 * when entry at index 0 moves out of or back
+				 * to root: none yet gotten, safe to restart.
+				 */
+				WARN_ON(start | i);
+				goto restart;
+			}
+			/*
+			 * Otherwise, shmem/tmpfs must be storing a swap entry
+			 * here as an exceptional entry: so skip over it -
+			 * we only reach this from invalidate_mapping_pages().
+			 */
+			nr_skip++;
+			continue;
 		}
 
 		if (!page_cache_get_speculative(page))
@@ -875,7 +878,7 @@ repeat:
 	 * If all entries were removed before we could secure them,
 	 * try again, because callers stop trying once 0 is returned.
 	 */
-	if (unlikely(!ret && nr_found))
+	if (unlikely(!ret && nr_found > nr_skip))
 		goto restart;
 	rcu_read_unlock();
 	return ret;
@@ -903,7 +906,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 	rcu_read_lock();
 restart:
 	nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
-				(void ***)pages, index, nr_pages);
+				(void ***)pages, NULL, index, nr_pages);
 	ret = 0;
 	for (i = 0; i < nr_found; i++) {
 		struct page *page;
@@ -912,12 +915,22 @@ repeat:
 		if (unlikely(!page))
 			continue;
 
-		/*
-		 * This can only trigger when the entry at index 0 moves out
-		 * of or back to the root: none yet gotten, safe to restart.
-		 */
-		if (radix_tree_deref_retry(page))
-			goto restart;
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page)) {
+				/*
+				 * Transient condition which can only trigger
+				 * when entry at index 0 moves out of or back
+				 * to root: none yet gotten, safe to restart.
+				 */
+				goto restart;
+			}
+			/*
+			 * Otherwise, shmem/tmpfs must be storing a swap entry
+			 * here as an exceptional entry: so stop looking for
+			 * contiguous pages.
+			 */
+			break;
+		}
 
 		if (!page_cache_get_speculative(page))
 			goto repeat;
@@ -977,12 +990,21 @@ repeat:
 		if (unlikely(!page))
 			continue;
 
-		/*
-		 * This can only trigger when the entry at index 0 moves out
-		 * of or back to the root: none yet gotten, safe to restart.
-		 */
-		if (radix_tree_deref_retry(page))
-			goto restart;
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page)) {
+				/*
+				 * Transient condition which can only trigger
+				 * when entry at index 0 moves out of or back
+				 * to root: none yet gotten, safe to restart.
+				 */
+				goto restart;
+			}
+			/*
+			 * This function is never used on a shmem/tmpfs
+			 * mapping, so a swap entry won't be found here.
+			 */
+			BUG();
+		}
 
 		if (!page_cache_get_speculative(page))
 			goto repeat;
@@ -2093,6 +2115,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
 	} else {
 		const struct iovec *iov = i->iov;
 		size_t base = i->iov_offset;
+		unsigned long nr_segs = i->nr_segs;
 
 		/*
 		 * The !iov->iov_len check ensures we skip over unlikely
@@ -2108,11 +2131,13 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
 			base += copy;
 			if (iov->iov_len == base) {
 				iov++;
+				nr_segs--;
 				base = 0;
 			}
 		}
 		i->iov = iov;
 		i->iov_offset = base;
+		i->nr_segs = nr_segs;
 	}
 }
 EXPORT_SYMBOL(iov_iter_advance);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 93356cd12828..f91b2f687343 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -10,7 +10,7 @@
 
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/uio.h>
 #include <linux/rmap.h>
 #include <linux/mmu_notifier.h>
diff --git a/mm/fremap.c b/mm/fremap.c
index b8e0e2d468af..9ed4fd432467 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -13,7 +13,6 @@
 #include <linux/pagemap.h>
 #include <linux/swapops.h>
 #include <linux/rmap.h>
-#include <linux/module.h>
 #include <linux/syscalls.h>
 #include <linux/mmu_notifier.h>
 
diff --git a/mm/highmem.c b/mm/highmem.c
index 693394daa2ed..57d82c6250c3 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -17,7 +17,7 @@
  */
 
 #include <linux/mm.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/swap.h>
 #include <linux/bio.h>
 #include <linux/pagemap.h>
@@ -250,7 +250,7 @@ void *kmap_high_get(struct page *page)
 #endif
 
 /**
- * kunmap_high - map a highmem page into memory
+ * kunmap_high - unmap a highmem page into memory
 * @page: &struct page to unmap
 *
 * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called
@@ -326,7 +326,7 @@ static struct page_address_slot {
 	spinlock_t lock;		/* Protect this bucket's list */
 } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
 
-static struct page_address_slot *page_slot(struct page *page)
+static struct page_address_slot *page_slot(const struct page *page)
 {
 	return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
 }
@@ -337,7 +337,7 @@ static struct page_address_slot *page_slot(struct page *page)
 *
 * Returns the page's virtual address.
 */
-void *page_address(struct page *page)
+void *page_address(const struct page *page)
 {
 	unsigned long flags;
 	void *ret;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e2d1587be269..4298abaae153 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -89,7 +89,8 @@ struct khugepaged_scan {
 	struct list_head mm_head;
 	struct mm_slot *mm_slot;
 	unsigned long address;
-} khugepaged_scan = {
+};
+static struct khugepaged_scan khugepaged_scan = {
 	.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
 };
 
@@ -829,7 +830,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
 		copy_user_highpage(pages[i], page + i,
-				   haddr + PAGE_SHIFT*i, vma);
+				   haddr + PAGE_SIZE * i, vma);
 		__SetPageUptodate(pages[i]);
 		cond_resched();
 	}
@@ -989,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
 	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
 	VM_BUG_ON(!PageCompound(page));
 	if (flags & FOLL_GET)
-		get_page(page);
+		get_page_foll(page);
 
 out:
 	return page;
@@ -1052,6 +1053,51 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 	return ret;
 }
 
+int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
+		  unsigned long old_addr,
+		  unsigned long new_addr, unsigned long old_end,
+		  pmd_t *old_pmd, pmd_t *new_pmd)
+{
+	int ret = 0;
+	pmd_t pmd;
+
+	struct mm_struct *mm = vma->vm_mm;
+
+	if ((old_addr & ~HPAGE_PMD_MASK) ||
+	    (new_addr & ~HPAGE_PMD_MASK) ||
+	    old_end - old_addr < HPAGE_PMD_SIZE ||
+	    (new_vma->vm_flags & VM_NOHUGEPAGE))
+		goto out;
+
+	/*
+	 * The destination pmd shouldn't be established, free_pgtables()
+	 * should have release it.
+	 */
+	if (WARN_ON(!pmd_none(*new_pmd))) {
+		VM_BUG_ON(pmd_trans_huge(*new_pmd));
+		goto out;
+	}
+
+	spin_lock(&mm->page_table_lock);
+	if (likely(pmd_trans_huge(*old_pmd))) {
+		if (pmd_trans_splitting(*old_pmd)) {
+			spin_unlock(&mm->page_table_lock);
+			wait_split_huge_page(vma->anon_vma, old_pmd);
+			ret = -1;
+		} else {
+			pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+			VM_BUG_ON(!pmd_none(*new_pmd));
+			set_pmd_at(mm, new_addr, new_pmd, pmd);
+			spin_unlock(&mm->page_table_lock);
+			ret = 1;
+		}
+	} else {
+		spin_unlock(&mm->page_table_lock);
+	}
+out:
+	return ret;
+}
+
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, pgprot_t newprot)
 {
@@ -1156,6 +1202,7 @@ static void __split_huge_page_refcount(struct page *page)
 	unsigned long head_index = page->index;
 	struct zone *zone = page_zone(page);
 	int zonestat;
+	int tail_count = 0;
 
 	/* prevent PageLRU to go away from under us, and freeze lru stats */
 	spin_lock_irq(&zone->lru_lock);
@@ -1164,11 +1211,27 @@ static void __split_huge_page_refcount(struct page *page)
 	for (i = 1; i < HPAGE_PMD_NR; i++) {
 		struct page *page_tail = page + i;
 
-		/* tail_page->_count cannot change */
-		atomic_sub(atomic_read(&page_tail->_count), &page->_count);
-		BUG_ON(page_count(page) <= 0);
-		atomic_add(page_mapcount(page) + 1, &page_tail->_count);
-		BUG_ON(atomic_read(&page_tail->_count) <= 0);
+		/* tail_page->_mapcount cannot change */
+		BUG_ON(page_mapcount(page_tail) < 0);
+		tail_count += page_mapcount(page_tail);
+		/* check for overflow */
+		BUG_ON(tail_count < 0);
+		BUG_ON(atomic_read(&page_tail->_count) != 0);
+		/*
+		 * tail_page->_count is zero and not changing from
+		 * under us. But get_page_unless_zero() may be running
+		 * from under us on the tail_page. If we used
+		 * atomic_set() below instead of atomic_add(), we
+		 * would then run atomic_set() concurrently with
+		 * get_page_unless_zero(), and atomic_set() is
+		 * implemented in C not using locked ops. spin_unlock
+		 * on x86 sometime uses locked ops because of PPro
+		 * errata 66, 92, so unless somebody can guarantee
+		 * atomic_set() here would be safe on all archs (and
+		 * not only on x86), it's safer to use atomic_add().
+		 */
+		atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1,
+			   &page_tail->_count);
 
 		/* after clearing PageTail the gup refcount can be released */
 		smp_mb();
@@ -1186,10 +1249,7 @@ static void __split_huge_page_refcount(struct page *page)
 				      (1L << PG_uptodate)));
 		page_tail->flags |= (1L << PG_dirty);
 
-		/*
-		 * 1) clear PageTail before overwriting first_page
-		 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
-		 */
+		/* clear PageTail before overwriting first_page */
 		smp_wmb();
 
 		/*
@@ -1206,7 +1266,6 @@ static void __split_huge_page_refcount(struct page *page)
 		 * status is achieved setting a reserved bit in the
 		 * pmd, not by clearing the present bit.
 		 */
-		BUG_ON(page_mapcount(page_tail));
 		page_tail->_mapcount = page->_mapcount;
 
 		BUG_ON(page_tail->mapping);
@@ -1223,6 +1282,8 @@ static void __split_huge_page_refcount(struct page *page)
 
 		lru_add_page_tail(zone, page, page_tail);
 	}
+	atomic_sub(tail_count, &page->_count);
+	BUG_ON(atomic_read(&page->_count) <= 0);
 
 	__dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
 	__mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
@@ -1906,7 +1967,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	BUG_ON(!pmd_none(*pmd));
 	page_add_new_anon_rmap(new_page, vma, address);
 	set_pmd_at(mm, address, pmd, _pmd);
-	update_mmu_cache(vma, address, entry);
+	update_mmu_cache(vma, address, _pmd);
 	prepare_pmd_huge_pte(pgtable, mm);
 	mm->nr_ptes--;
 	spin_unlock(&mm->page_table_lock);
@@ -2024,6 +2085,8 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
 
 static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
 					    struct page **hpage)
+	__releases(&khugepaged_mm_lock)
+	__acquires(&khugepaged_mm_lock)
 {
 	struct mm_slot *mm_slot;
 	struct mm_struct *mm;
diff --git a/mm/internal.h b/mm/internal.h
index d071d380fb49..2189af491783 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,6 +37,52 @@ static inline void __put_page(struct page *page)
 	atomic_dec(&page->_count);
 }
 
+static inline void __get_page_tail_foll(struct page *page,
+					bool get_page_head)
+{
+	/*
+	 * If we're getting a tail page, the elevated page->_count is
+	 * required only in the head page and we will elevate the head
+	 * page->_count and tail page->_mapcount.
+	 *
+	 * We elevate page_tail->_mapcount for tail pages to force
+	 * page_tail->_count to be zero at all times to avoid getting
+	 * false positives from get_page_unless_zero() with
+	 * speculative page access (like in
+	 * page_cache_get_speculative()) on tail pages.
+	 */
+	VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0);
+	VM_BUG_ON(atomic_read(&page->_count) != 0);
+	VM_BUG_ON(page_mapcount(page) < 0);
+	if (get_page_head)
+		atomic_inc(&page->first_page->_count);
+	atomic_inc(&page->_mapcount);
+}
+
+/*
+ * This is meant to be called as the FOLL_GET operation of
+ * follow_page() and it must be called while holding the proper PT
+ * lock while the pte (or pmd_trans_huge) is still mapping the page.
+ */
+static inline void get_page_foll(struct page *page)
+{
+	if (unlikely(PageTail(page)))
+		/*
+		 * This is safe only because
+		 * __split_huge_page_refcount() can't run under
+		 * get_page_foll() because we hold the proper PT lock.
+		 */
+		__get_page_tail_foll(page, true);
+	else {
+		/*
+		 * Getting a normal page or the head of a compound page
+		 * requires to already have an elevated page->_count.
+		 */
+		VM_BUG_ON(atomic_read(&page->_count) <= 0);
+		atomic_inc(&page->_count);
+	}
+}
+
 extern unsigned long highest_memmap_pfn;
 
 /*
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index d6880f542f95..f3b2a00fe9c1 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -69,7 +69,7 @@
 #include <linux/sched.h>
 #include <linux/jiffies.h>
 #include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/kthread.h>
 #include <linux/prio_tree.h>
 #include <linux/fs.h>
diff --git a/mm/ksm.c b/mm/ksm.c
index 9a68b0cf0a1c..310544a379ae 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1905,7 +1905,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
 
 		oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
 		err = unmerge_and_remove_all_rmap_items();
-		test_set_oom_score_adj(oom_score_adj);
+		compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX,
+							oom_score_adj);
 		if (err) {
 			ksm_run = KSM_RUN_STOP;
 			count = err;
diff --git a/mm/maccess.c b/mm/maccess.c
index 4cee182ab5f3..d53adf9ba84b 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,7 +1,7 @@
 /*
  * Access kernel memory without faulting.
  */
-#include <linux/module.h>
+#include <linux/export.h>
 #include <linux/mm.h>
 #include <linux/uaccess.h>
 
diff --git a/mm/memblock.c b/mm/memblock.c
index ccbf97339592..84bec4969ed5 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -58,7 +58,8 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
 	return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
 }
 
-long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
+static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
+					phys_addr_t base, phys_addr_t size)
 {
 	unsigned long i;
 
@@ -267,7 +268,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
 	return 0;
 }
 
-extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
+int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
 					  phys_addr_t addr2, phys_addr_t size2)
 {
 	return 1;
@@ -626,6 +627,12 @@ phys_addr_t __init memblock_phys_mem_size(void)
 	return memblock.memory_size;
 }
 
+/* lowest address */
+phys_addr_t __init_memblock memblock_start_of_DRAM(void)
+{
+	return memblock.memory.regions[0].base;
+}
+
 phys_addr_t __init_memblock memblock_end_of_DRAM(void)
 {
 	int idx = memblock.memory.cnt - 1;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5f84d2351ddb..6aff93c98aca 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -33,9 +33,9 @@
33#include <linux/bit_spinlock.h> 33#include <linux/bit_spinlock.h>
34#include <linux/rcupdate.h> 34#include <linux/rcupdate.h>
35#include <linux/limits.h> 35#include <linux/limits.h>
36#include <linux/export.h>
36#include <linux/mutex.h> 37#include <linux/mutex.h>
37#include <linux/rbtree.h> 38#include <linux/rbtree.h>
38#include <linux/shmem_fs.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/swap.h> 40#include <linux/swap.h>
41#include <linux/swapops.h> 41#include <linux/swapops.h>
@@ -202,52 +202,8 @@ struct mem_cgroup_eventfd_list {
202 struct eventfd_ctx *eventfd; 202 struct eventfd_ctx *eventfd;
203}; 203};
204 204
205static void mem_cgroup_threshold(struct mem_cgroup *mem); 205static void mem_cgroup_threshold(struct mem_cgroup *memcg);
206static void mem_cgroup_oom_notify(struct mem_cgroup *mem); 206static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
207
208enum {
209 SCAN_BY_LIMIT,
210 SCAN_BY_SYSTEM,
211 NR_SCAN_CONTEXT,
212 SCAN_BY_SHRINK, /* not recorded now */
213};
214
215enum {
216 SCAN,
217 SCAN_ANON,
218 SCAN_FILE,
219 ROTATE,
220 ROTATE_ANON,
221 ROTATE_FILE,
222 FREED,
223 FREED_ANON,
224 FREED_FILE,
225 ELAPSED,
226 NR_SCANSTATS,
227};
228
229struct scanstat {
230 spinlock_t lock;
231 unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS];
232 unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS];
233};
234
235const char *scanstat_string[NR_SCANSTATS] = {
236 "scanned_pages",
237 "scanned_anon_pages",
238 "scanned_file_pages",
239 "rotated_pages",
240 "rotated_anon_pages",
241 "rotated_file_pages",
242 "freed_pages",
243 "freed_anon_pages",
244 "freed_file_pages",
245 "elapsed_ns",
246};
247#define SCANSTAT_WORD_LIMIT "_by_limit"
248#define SCANSTAT_WORD_SYSTEM "_by_system"
249#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy"
250
251 207
252/* 208/*
253 * The memory controller data structure. The memory controller controls both 209 * The memory controller data structure. The memory controller controls both
@@ -314,8 +270,7 @@ struct mem_cgroup {
314 270
315 /* For oom notifier event fd */ 271 /* For oom notifier event fd */
316 struct list_head oom_notify; 272 struct list_head oom_notify;
317 /* For recording LRU-scan statistics */ 273
318 struct scanstat scanstat;
319 /* 274 /*
320 * Should we move charges of a task when a task is moved into this 275 * Should we move charges of a task when a task is moved into this
321 * mem_cgroup ? And what type of charges should we move ? 276 * mem_cgroup ? And what type of charges should we move ?
@@ -408,29 +363,29 @@ enum charge_type {
408#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 363#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
409#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) 364#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
410 365
411static void mem_cgroup_get(struct mem_cgroup *mem); 366static void mem_cgroup_get(struct mem_cgroup *memcg);
412static void mem_cgroup_put(struct mem_cgroup *mem); 367static void mem_cgroup_put(struct mem_cgroup *memcg);
413static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 368static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
414static void drain_all_stock_async(struct mem_cgroup *mem); 369static void drain_all_stock_async(struct mem_cgroup *memcg);
415 370
416static struct mem_cgroup_per_zone * 371static struct mem_cgroup_per_zone *
417mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 372mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
418{ 373{
419 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 374 return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
420} 375}
421 376
422struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) 377struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
423{ 378{
424 return &mem->css; 379 return &memcg->css;
425} 380}
426 381
427static struct mem_cgroup_per_zone * 382static struct mem_cgroup_per_zone *
428page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) 383page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
429{ 384{
430 int nid = page_to_nid(page); 385 int nid = page_to_nid(page);
431 int zid = page_zonenum(page); 386 int zid = page_zonenum(page);
432 387
433 return mem_cgroup_zoneinfo(mem, nid, zid); 388 return mem_cgroup_zoneinfo(memcg, nid, zid);
434} 389}
435 390
436static struct mem_cgroup_tree_per_zone * 391static struct mem_cgroup_tree_per_zone *
@@ -449,7 +404,7 @@ soft_limit_tree_from_page(struct page *page)
449} 404}
450 405
451static void 406static void
452__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, 407__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
453 struct mem_cgroup_per_zone *mz, 408 struct mem_cgroup_per_zone *mz,
454 struct mem_cgroup_tree_per_zone *mctz, 409 struct mem_cgroup_tree_per_zone *mctz,
455 unsigned long long new_usage_in_excess) 410 unsigned long long new_usage_in_excess)
@@ -483,7 +438,7 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
483} 438}
484 439
485static void 440static void
486__mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 441__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
487 struct mem_cgroup_per_zone *mz, 442 struct mem_cgroup_per_zone *mz,
488 struct mem_cgroup_tree_per_zone *mctz) 443 struct mem_cgroup_tree_per_zone *mctz)
489{ 444{
@@ -494,17 +449,17 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
494} 449}
495 450
496static void 451static void
497mem_cgroup_remove_exceeded(struct mem_cgroup *mem, 452mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
498 struct mem_cgroup_per_zone *mz, 453 struct mem_cgroup_per_zone *mz,
499 struct mem_cgroup_tree_per_zone *mctz) 454 struct mem_cgroup_tree_per_zone *mctz)
500{ 455{
501 spin_lock(&mctz->lock); 456 spin_lock(&mctz->lock);
502 __mem_cgroup_remove_exceeded(mem, mz, mctz); 457 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
503 spin_unlock(&mctz->lock); 458 spin_unlock(&mctz->lock);
504} 459}
505 460
506 461
507static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) 462static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
508{ 463{
509 unsigned long long excess; 464 unsigned long long excess;
510 struct mem_cgroup_per_zone *mz; 465 struct mem_cgroup_per_zone *mz;
@@ -517,9 +472,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
517 * Necessary to update all ancestors when hierarchy is used. 472 * Necessary to update all ancestors when hierarchy is used.
518 * because their event counter is not touched. 473 * because their event counter is not touched.
519 */ 474 */
520 for (; mem; mem = parent_mem_cgroup(mem)) { 475 for (; memcg; memcg = parent_mem_cgroup(memcg)) {
521 mz = mem_cgroup_zoneinfo(mem, nid, zid); 476 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
522 excess = res_counter_soft_limit_excess(&mem->res); 477 excess = res_counter_soft_limit_excess(&memcg->res);
523 /* 478 /*
524 * We have to update the tree if mz is on RB-tree or 479 * We have to update the tree if mz is on RB-tree or
525 * mem is over its softlimit. 480 * mem is over its softlimit.
@@ -528,18 +483,18 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
528 spin_lock(&mctz->lock); 483 spin_lock(&mctz->lock);
529 /* if on-tree, remove it */ 484 /* if on-tree, remove it */
530 if (mz->on_tree) 485 if (mz->on_tree)
531 __mem_cgroup_remove_exceeded(mem, mz, mctz); 486 __mem_cgroup_remove_exceeded(memcg, mz, mctz);
532 /* 487 /*
533 * Insert again. mz->usage_in_excess will be updated. 488 * Insert again. mz->usage_in_excess will be updated.
534 * If excess is 0, no tree ops. 489 * If excess is 0, no tree ops.
535 */ 490 */
536 __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); 491 __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
537 spin_unlock(&mctz->lock); 492 spin_unlock(&mctz->lock);
538 } 493 }
539 } 494 }
540} 495}
541 496
542static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) 497static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
543{ 498{
544 int node, zone; 499 int node, zone;
545 struct mem_cgroup_per_zone *mz; 500 struct mem_cgroup_per_zone *mz;
@@ -547,9 +502,9 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
547 502
548 for_each_node_state(node, N_POSSIBLE) { 503 for_each_node_state(node, N_POSSIBLE) {
549 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 504 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
550 mz = mem_cgroup_zoneinfo(mem, node, zone); 505 mz = mem_cgroup_zoneinfo(memcg, node, zone);
551 mctz = soft_limit_tree_node_zone(node, zone); 506 mctz = soft_limit_tree_node_zone(node, zone);
552 mem_cgroup_remove_exceeded(mem, mz, mctz); 507 mem_cgroup_remove_exceeded(memcg, mz, mctz);
553 } 508 }
554 } 509 }
555} 510}
@@ -610,7 +565,7 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
610 * common workload, threashold and synchonization as vmstat[] should be 565 * common workload, threashold and synchonization as vmstat[] should be
611 * implemented. 566 * implemented.
612 */ 567 */
613static long mem_cgroup_read_stat(struct mem_cgroup *mem, 568static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
614 enum mem_cgroup_stat_index idx) 569 enum mem_cgroup_stat_index idx)
615{ 570{
616 long val = 0; 571 long val = 0;
@@ -618,81 +573,83 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem,
618 573
619 get_online_cpus(); 574 get_online_cpus();
620 for_each_online_cpu(cpu) 575 for_each_online_cpu(cpu)
621 val += per_cpu(mem->stat->count[idx], cpu); 576 val += per_cpu(memcg->stat->count[idx], cpu);
622#ifdef CONFIG_HOTPLUG_CPU 577#ifdef CONFIG_HOTPLUG_CPU
623 spin_lock(&mem->pcp_counter_lock); 578 spin_lock(&memcg->pcp_counter_lock);
624 val += mem->nocpu_base.count[idx]; 579 val += memcg->nocpu_base.count[idx];
625 spin_unlock(&mem->pcp_counter_lock); 580 spin_unlock(&memcg->pcp_counter_lock);
626#endif 581#endif
627 put_online_cpus(); 582 put_online_cpus();
628 return val; 583 return val;
629} 584}
630 585
631static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 586static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
632 bool charge) 587 bool charge)
633{ 588{
634 int val = (charge) ? 1 : -1; 589 int val = (charge) ? 1 : -1;
635 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 590 this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
636} 591}
637 592
638void mem_cgroup_pgfault(struct mem_cgroup *mem, int val) 593void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val)
639{ 594{
640 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); 595 this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
641} 596}
642 597
643void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val) 598void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val)
644{ 599{
645 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); 600 this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
646} 601}
647 602
648static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, 603static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
649 enum mem_cgroup_events_index idx) 604 enum mem_cgroup_events_index idx)
650{ 605{
651 unsigned long val = 0; 606 unsigned long val = 0;
652 int cpu; 607 int cpu;
653 608
654 for_each_online_cpu(cpu) 609 for_each_online_cpu(cpu)
655 val += per_cpu(mem->stat->events[idx], cpu); 610 val += per_cpu(memcg->stat->events[idx], cpu);
656#ifdef CONFIG_HOTPLUG_CPU 611#ifdef CONFIG_HOTPLUG_CPU
657 spin_lock(&mem->pcp_counter_lock); 612 spin_lock(&memcg->pcp_counter_lock);
658 val += mem->nocpu_base.events[idx]; 613 val += memcg->nocpu_base.events[idx];
659 spin_unlock(&mem->pcp_counter_lock); 614 spin_unlock(&memcg->pcp_counter_lock);
660#endif 615#endif
661 return val; 616 return val;
662} 617}
663 618
664static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 619static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
665 bool file, int nr_pages) 620 bool file, int nr_pages)
666{ 621{
667 preempt_disable(); 622 preempt_disable();
668 623
669 if (file) 624 if (file)
670 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); 625 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
626 nr_pages);
671 else 627 else
672 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); 628 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
629 nr_pages);
673 630
674 /* pagein of a big page is an event. So, ignore page size */ 631 /* pagein of a big page is an event. So, ignore page size */
675 if (nr_pages > 0) 632 if (nr_pages > 0)
676 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); 633 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
677 else { 634 else {
678 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); 635 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
679 nr_pages = -nr_pages; /* for event */ 636 nr_pages = -nr_pages; /* for event */
680 } 637 }
681 638
682 __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); 639 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
683 640
684 preempt_enable(); 641 preempt_enable();
685} 642}
686 643
687unsigned long 644unsigned long
688mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid, 645mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
689 unsigned int lru_mask) 646 unsigned int lru_mask)
690{ 647{
691 struct mem_cgroup_per_zone *mz; 648 struct mem_cgroup_per_zone *mz;
692 enum lru_list l; 649 enum lru_list l;
693 unsigned long ret = 0; 650 unsigned long ret = 0;
694 651
695 mz = mem_cgroup_zoneinfo(mem, nid, zid); 652 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
696 653
697 for_each_lru(l) { 654 for_each_lru(l) {
698 if (BIT(l) & lru_mask) 655 if (BIT(l) & lru_mask)
@@ -702,44 +659,45 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid,
702} 659}
703 660
704static unsigned long 661static unsigned long
705mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem, 662mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
706 int nid, unsigned int lru_mask) 663 int nid, unsigned int lru_mask)
707{ 664{
708 u64 total = 0; 665 u64 total = 0;
709 int zid; 666 int zid;
710 667
711 for (zid = 0; zid < MAX_NR_ZONES; zid++) 668 for (zid = 0; zid < MAX_NR_ZONES; zid++)
712 total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask); 669 total += mem_cgroup_zone_nr_lru_pages(memcg,
670 nid, zid, lru_mask);
713 671
714 return total; 672 return total;
715} 673}
716 674
717static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem, 675static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
718 unsigned int lru_mask) 676 unsigned int lru_mask)
719{ 677{
720 int nid; 678 int nid;
721 u64 total = 0; 679 u64 total = 0;
722 680
723 for_each_node_state(nid, N_HIGH_MEMORY) 681 for_each_node_state(nid, N_HIGH_MEMORY)
724 total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask); 682 total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
725 return total; 683 return total;
726} 684}
727 685
728static bool __memcg_event_check(struct mem_cgroup *mem, int target) 686static bool __memcg_event_check(struct mem_cgroup *memcg, int target)
729{ 687{
730 unsigned long val, next; 688 unsigned long val, next;
731 689
732 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); 690 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
733 next = this_cpu_read(mem->stat->targets[target]); 691 next = __this_cpu_read(memcg->stat->targets[target]);
734 /* from time_after() in jiffies.h */ 692 /* from time_after() in jiffies.h */
735 return ((long)next - (long)val < 0); 693 return ((long)next - (long)val < 0);
736} 694}
737 695
738static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) 696static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target)
739{ 697{
740 unsigned long val, next; 698 unsigned long val, next;
741 699
742 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); 700 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]);
743 701
744 switch (target) { 702 switch (target) {
745 case MEM_CGROUP_TARGET_THRESH: 703 case MEM_CGROUP_TARGET_THRESH:
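
The event check above reuses the time_after() idiom named in its comment: rather than comparing two free-running counters directly, it looks at the sign of their signed difference, which stays correct even when the counters wrap around the top of the unsigned range. A small standalone demonstration of that comparison (the counter values are made up):

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

/* True when "val" has passed "next", even across unsigned wraparound. */
static bool event_target_passed(unsigned long val, unsigned long next)
{
	return (long)next - (long)val < 0;
}

int main(void)
{
	unsigned long val, next;

	/* Ordinary case: the target has not been reached yet. */
	val = 100; next = 228;
	printf("%d\n", event_target_passed(val, next));   /* 0 */

	/* Counter wrapped past ULONG_MAX; a naive "val >= next" would fail. */
	next = ULONG_MAX - 10;
	val = 5;                    /* i.e. ULONG_MAX - 10 + 16, wrapped */
	printf("%d\n", event_target_passed(val, next));   /* 1 */

	return 0;
}
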
@@ -755,34 +713,36 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
755 return; 713 return;
756 } 714 }
757 715
758 this_cpu_write(mem->stat->targets[target], next); 716 __this_cpu_write(memcg->stat->targets[target], next);
759} 717}
760 718
761/* 719/*
762 * Check events in order. 720 * Check events in order.
763 * 721 *
764 */ 722 */
765static void memcg_check_events(struct mem_cgroup *mem, struct page *page) 723static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
766{ 724{
725 preempt_disable();
767 /* threshold event is triggered in finer grain than soft limit */ 726 /* threshold event is triggered in finer grain than soft limit */
768 if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { 727 if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) {
769 mem_cgroup_threshold(mem); 728 mem_cgroup_threshold(memcg);
770 __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); 729 __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH);
771 if (unlikely(__memcg_event_check(mem, 730 if (unlikely(__memcg_event_check(memcg,
772 MEM_CGROUP_TARGET_SOFTLIMIT))) { 731 MEM_CGROUP_TARGET_SOFTLIMIT))) {
773 mem_cgroup_update_tree(mem, page); 732 mem_cgroup_update_tree(memcg, page);
774 __mem_cgroup_target_update(mem, 733 __mem_cgroup_target_update(memcg,
775 MEM_CGROUP_TARGET_SOFTLIMIT); 734 MEM_CGROUP_TARGET_SOFTLIMIT);
776 } 735 }
777#if MAX_NUMNODES > 1 736#if MAX_NUMNODES > 1
778 if (unlikely(__memcg_event_check(mem, 737 if (unlikely(__memcg_event_check(memcg,
779 MEM_CGROUP_TARGET_NUMAINFO))) { 738 MEM_CGROUP_TARGET_NUMAINFO))) {
780 atomic_inc(&mem->numainfo_events); 739 atomic_inc(&memcg->numainfo_events);
781 __mem_cgroup_target_update(mem, 740 __mem_cgroup_target_update(memcg,
782 MEM_CGROUP_TARGET_NUMAINFO); 741 MEM_CGROUP_TARGET_NUMAINFO);
783 } 742 }
784#endif 743#endif
785 } 744 }
745 preempt_enable();
786} 746}
787 747
788static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) 748static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
@@ -808,7 +768,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
808 768
809struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 769struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
810{ 770{
811 struct mem_cgroup *mem = NULL; 771 struct mem_cgroup *memcg = NULL;
812 772
813 if (!mm) 773 if (!mm)
814 return NULL; 774 return NULL;
@@ -819,25 +779,25 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
819 */ 779 */
820 rcu_read_lock(); 780 rcu_read_lock();
821 do { 781 do {
822 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 782 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
823 if (unlikely(!mem)) 783 if (unlikely(!memcg))
824 break; 784 break;
825 } while (!css_tryget(&mem->css)); 785 } while (!css_tryget(&memcg->css));
826 rcu_read_unlock(); 786 rcu_read_unlock();
827 return mem; 787 return memcg;
828} 788}
829 789
830/* The caller has to guarantee "mem" exists before calling this */ 790/* The caller has to guarantee "mem" exists before calling this */
831static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) 791static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg)
832{ 792{
833 struct cgroup_subsys_state *css; 793 struct cgroup_subsys_state *css;
834 int found; 794 int found;
835 795
836 if (!mem) /* ROOT cgroup has the smallest ID */ 796 if (!memcg) /* ROOT cgroup has the smallest ID */
837 return root_mem_cgroup; /*css_put/get against root is ignored*/ 797 return root_mem_cgroup; /*css_put/get against root is ignored*/
838 if (!mem->use_hierarchy) { 798 if (!memcg->use_hierarchy) {
839 if (css_tryget(&mem->css)) 799 if (css_tryget(&memcg->css))
840 return mem; 800 return memcg;
841 return NULL; 801 return NULL;
842 } 802 }
843 rcu_read_lock(); 803 rcu_read_lock();
@@ -845,13 +805,13 @@ static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
845 * searching a memory cgroup which has the smallest ID under given 805 * searching a memory cgroup which has the smallest ID under given
846 * ROOT cgroup. (ID >= 1) 806 * ROOT cgroup. (ID >= 1)
847 */ 807 */
848 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); 808 css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found);
849 if (css && css_tryget(css)) 809 if (css && css_tryget(css))
850 mem = container_of(css, struct mem_cgroup, css); 810 memcg = container_of(css, struct mem_cgroup, css);
851 else 811 else
852 mem = NULL; 812 memcg = NULL;
853 rcu_read_unlock(); 813 rcu_read_unlock();
854 return mem; 814 return memcg;
855} 815}
856 816
857static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, 817static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
@@ -905,29 +865,29 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
905 for_each_mem_cgroup_tree_cond(iter, NULL, true) 865 for_each_mem_cgroup_tree_cond(iter, NULL, true)
906 866
907 867
908static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 868static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
909{ 869{
910 return (mem == root_mem_cgroup); 870 return (memcg == root_mem_cgroup);
911} 871}
912 872
913void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) 873void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
914{ 874{
915 struct mem_cgroup *mem; 875 struct mem_cgroup *memcg;
916 876
917 if (!mm) 877 if (!mm)
918 return; 878 return;
919 879
920 rcu_read_lock(); 880 rcu_read_lock();
921 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 881 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
922 if (unlikely(!mem)) 882 if (unlikely(!memcg))
923 goto out; 883 goto out;
924 884
925 switch (idx) { 885 switch (idx) {
926 case PGMAJFAULT: 886 case PGMAJFAULT:
927 mem_cgroup_pgmajfault(mem, 1); 887 mem_cgroup_pgmajfault(memcg, 1);
928 break; 888 break;
929 case PGFAULT: 889 case PGFAULT:
930 mem_cgroup_pgfault(mem, 1); 890 mem_cgroup_pgfault(memcg, 1);
931 break; 891 break;
932 default: 892 default:
933 BUG(); 893 BUG();
@@ -1036,6 +996,16 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
1036 return; 996 return;
1037 pc = lookup_page_cgroup(page); 997 pc = lookup_page_cgroup(page);
1038 VM_BUG_ON(PageCgroupAcctLRU(pc)); 998 VM_BUG_ON(PageCgroupAcctLRU(pc));
999 /*
1000 * putback: charge:
1001 * SetPageLRU SetPageCgroupUsed
1002 * smp_mb smp_mb
1003 * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU
1004 *
1005 * Ensure that one of the two sides adds the page to the memcg
1006 * LRU during a race.
1007 */
1008 smp_mb();
1039 if (!PageCgroupUsed(pc)) 1009 if (!PageCgroupUsed(pc))
1040 return; 1010 return;
1041 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ 1011 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
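
The new comment and smp_mb() describe a classic store/fence/load pairing: each side publishes its own flag (PageLRU on putback, PageCgroupUsed on charge), issues a full barrier, then tests the other side's flag, so at least one of the racing paths is guaranteed to see the other and link the page to the memcg LRU. A hedged C11 sketch of the same pairing, using atomics and seq_cst fences in place of the kernel's page flags and smp_mb() (everything here is illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Each side sets its own flag, fences, then checks the peer's flag.
 * With a full fence between the store and the load, at least one of
 * the two threads must observe the other's flag as set. */
static atomic_int page_lru;        /* stands in for PageLRU        */
static atomic_int page_used;       /* stands in for PageCgroupUsed */
static atomic_int added_to_lru;    /* how many sides linked the page */

static void *putback_side(void *arg)
{
	atomic_store_explicit(&page_lru, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);          /* smp_mb() */
	if (atomic_load_explicit(&page_used, memory_order_relaxed))
		atomic_fetch_add(&added_to_lru, 1);
	return NULL;
}

static void *charge_side(void *arg)
{
	atomic_store_explicit(&page_used, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);          /* smp_mb() */
	if (atomic_load_explicit(&page_lru, memory_order_relaxed))
		atomic_fetch_add(&added_to_lru, 1);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, putback_side, NULL);
	pthread_create(&b, NULL, charge_side, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* Never 0: the fences forbid both loads missing both stores. */
	printf("sides that added the page: %d\n", atomic_load(&added_to_lru));
	return 0;
}
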
@@ -1087,7 +1057,16 @@ static void mem_cgroup_lru_add_after_commit(struct page *page)
1087 unsigned long flags; 1057 unsigned long flags;
1088 struct zone *zone = page_zone(page); 1058 struct zone *zone = page_zone(page);
1089 struct page_cgroup *pc = lookup_page_cgroup(page); 1059 struct page_cgroup *pc = lookup_page_cgroup(page);
1090 1060 /*
1061 * putback: charge:
1062 * SetPageLRU SetPageCgroupUsed
1063 * smp_mb smp_mb
1064 * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU
1065 *
1066 * Ensure that one of the two sides adds the page to the memcg
1067 * LRU during a race.
1068 */
1069 smp_mb();
1091 /* take care that the page is added to LRU while we commit it */ 1070
1092 if (likely(!PageLRU(page))) 1071 if (likely(!PageLRU(page)))
1093 return; 1072 return;
@@ -1109,21 +1088,21 @@ void mem_cgroup_move_lists(struct page *page,
1109} 1088}
1110 1089
1111/* 1090/*
1112 * Checks whether given mem is same or in the root_mem's 1091 * Checks whether given mem is same or in the root_mem_cgroup's
1113 * hierarchy subtree 1092 * hierarchy subtree
1114 */ 1093 */
1115static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem, 1094static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1116 struct mem_cgroup *mem) 1095 struct mem_cgroup *memcg)
1117{ 1096{
1118 if (root_mem != mem) { 1097 if (root_memcg != memcg) {
1119 return (root_mem->use_hierarchy && 1098 return (root_memcg->use_hierarchy &&
1120 css_is_ancestor(&mem->css, &root_mem->css)); 1099 css_is_ancestor(&memcg->css, &root_memcg->css));
1121 } 1100 }
1122 1101
1123 return true; 1102 return true;
1124} 1103}
1125 1104
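
The renamed helper above answers one question: is memcg the same group as root_memcg, or a descendant of it when root_memcg has hierarchical accounting enabled? A toy sketch of that test with explicit parent pointers standing in for css_is_ancestor() (the struct and field names are invented for illustration):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Toy cgroup: just a parent pointer and a hierarchy flag. */
struct cg {
	struct cg *parent;
	bool use_hierarchy;
};

/* True if cg is root itself, or sits below root while root has
 * hierarchical accounting enabled. */
static bool same_or_subtree(const struct cg *root, const struct cg *cg)
{
	if (root == cg)
		return true;
	if (!root->use_hierarchy)
		return false;
	for (cg = cg->parent; cg; cg = cg->parent)
		if (cg == root)
			return true;
	return false;
}

int main(void)
{
	struct cg root  = { .parent = NULL,  .use_hierarchy = true };
	struct cg child = { .parent = &root, .use_hierarchy = true };
	struct cg other = { .parent = NULL,  .use_hierarchy = true };

	printf("%d %d %d\n",
	       same_or_subtree(&root, &child),   /* 1 */
	       same_or_subtree(&root, &root),    /* 1 */
	       same_or_subtree(&root, &other));  /* 0 */
	return 0;
}
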
1126int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 1105int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1127{ 1106{
1128 int ret; 1107 int ret;
1129 struct mem_cgroup *curr = NULL; 1108 struct mem_cgroup *curr = NULL;
@@ -1137,25 +1116,29 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
1137 if (!curr) 1116 if (!curr)
1138 return 0; 1117 return 0;
1139 /* 1118 /*
1140 * We should check use_hierarchy of "mem" not "curr". Because checking 1119 * We should check use_hierarchy of "memcg" not "curr". Because checking
1141 * use_hierarchy of "curr" here makes this function true if hierarchy is 1120
1142 * enabled in "curr" and "curr" is a child of "mem" in *cgroup* 1121 * enabled in "curr" and "curr" is a child of "memcg" in *cgroup*
1143 * hierarchy(even if use_hierarchy is disabled in "mem"). 1122 * hierarchy(even if use_hierarchy is disabled in "memcg").
1144 */ 1123 */
1145 ret = mem_cgroup_same_or_subtree(mem, curr); 1124 ret = mem_cgroup_same_or_subtree(memcg, curr);
1146 css_put(&curr->css); 1125 css_put(&curr->css);
1147 return ret; 1126 return ret;
1148} 1127}
1149 1128
1150static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) 1129int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
1151{ 1130{
1152 unsigned long active; 1131 unsigned long inactive_ratio;
1132 int nid = zone_to_nid(zone);
1133 int zid = zone_idx(zone);
1153 unsigned long inactive; 1134 unsigned long inactive;
1135 unsigned long active;
1154 unsigned long gb; 1136 unsigned long gb;
1155 unsigned long inactive_ratio;
1156 1137
1157 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); 1138 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1158 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); 1139 BIT(LRU_INACTIVE_ANON));
1140 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1141 BIT(LRU_ACTIVE_ANON));
1159 1142
1160 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1143 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1161 if (gb) 1144 if (gb)
@@ -1163,39 +1146,20 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
1163 else 1146 else
1164 inactive_ratio = 1; 1147 inactive_ratio = 1;
1165 1148
1166 if (present_pages) { 1149 return inactive * inactive_ratio < active;
1167 present_pages[0] = inactive;
1168 present_pages[1] = active;
1169 }
1170
1171 return inactive_ratio;
1172}
1173
1174int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
1175{
1176 unsigned long active;
1177 unsigned long inactive;
1178 unsigned long present_pages[2];
1179 unsigned long inactive_ratio;
1180
1181 inactive_ratio = calc_inactive_ratio(memcg, present_pages);
1182
1183 inactive = present_pages[0];
1184 active = present_pages[1];
1185
1186 if (inactive * inactive_ratio < active)
1187 return 1;
1188
1189 return 0;
1190} 1150}
1191 1151
1192int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) 1152int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone)
1193{ 1153{
1194 unsigned long active; 1154 unsigned long active;
1195 unsigned long inactive; 1155 unsigned long inactive;
1156 int zid = zone_idx(zone);
1157 int nid = zone_to_nid(zone);
1196 1158
1197 inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); 1159 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1198 active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); 1160 BIT(LRU_INACTIVE_FILE));
1161 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1162 BIT(LRU_ACTIVE_FILE));
1199 1163
1200 return (active > inactive); 1164 return (active > inactive);
1201} 1165}
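
The reworked mem_cgroup_inactive_anon_is_low() now counts pages per zone rather than per cgroup and ends with the comparison "inactive * inactive_ratio < active". The derivation of inactive_ratio from the working-set size sits in lines elided from this hunk, so the sketch below simply assumes a ratio of 3 to show how the final test behaves (all numbers are made up):

#include <stdbool.h>
#include <stdio.h>

/* Final test from mem_cgroup_inactive_anon_is_low(): anon pages are
 * "inactive-low" when the inactive list, scaled by the size-derived
 * ratio, is still smaller than the active list. */
static bool inactive_anon_is_low(unsigned long inactive,
				 unsigned long active,
				 unsigned long inactive_ratio)
{
	return inactive * inactive_ratio < active;
}

int main(void)
{
	/* Assumed ratio of 3; the real ratio computation is outside
	 * this hunk. */
	printf("%d\n", inactive_anon_is_low(40000, 200000, 3)); /* 1: deactivate more */
	printf("%d\n", inactive_anon_is_low(90000, 200000, 3)); /* 0: balance is fine */
	return 0;
}
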
@@ -1231,7 +1195,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1231unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 1195unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1232 struct list_head *dst, 1196 struct list_head *dst,
1233 unsigned long *scanned, int order, 1197 unsigned long *scanned, int order,
1234 int mode, struct zone *z, 1198 isolate_mode_t mode,
1199 struct zone *z,
1235 struct mem_cgroup *mem_cont, 1200 struct mem_cgroup *mem_cont,
1236 int active, int file) 1201 int active, int file)
1237{ 1202{
@@ -1299,13 +1264,13 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1299 * Returns the maximum amount of memory @mem can be charged with, in 1264 * Returns the maximum amount of memory @mem can be charged with, in
1300 * pages. 1265 * pages.
1301 */ 1266 */
1302static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) 1267static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1303{ 1268{
1304 unsigned long long margin; 1269 unsigned long long margin;
1305 1270
1306 margin = res_counter_margin(&mem->res); 1271 margin = res_counter_margin(&memcg->res);
1307 if (do_swap_account) 1272 if (do_swap_account)
1308 margin = min(margin, res_counter_margin(&mem->memsw)); 1273 margin = min(margin, res_counter_margin(&memcg->memsw));
1309 return margin >> PAGE_SHIFT; 1274 return margin >> PAGE_SHIFT;
1310} 1275}
1311 1276
@@ -1320,33 +1285,33 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
1320 return memcg->swappiness; 1285 return memcg->swappiness;
1321} 1286}
1322 1287
1323static void mem_cgroup_start_move(struct mem_cgroup *mem) 1288static void mem_cgroup_start_move(struct mem_cgroup *memcg)
1324{ 1289{
1325 int cpu; 1290 int cpu;
1326 1291
1327 get_online_cpus(); 1292 get_online_cpus();
1328 spin_lock(&mem->pcp_counter_lock); 1293 spin_lock(&memcg->pcp_counter_lock);
1329 for_each_online_cpu(cpu) 1294 for_each_online_cpu(cpu)
1330 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; 1295 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1331 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; 1296 memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1332 spin_unlock(&mem->pcp_counter_lock); 1297 spin_unlock(&memcg->pcp_counter_lock);
1333 put_online_cpus(); 1298 put_online_cpus();
1334 1299
1335 synchronize_rcu(); 1300 synchronize_rcu();
1336} 1301}
1337 1302
1338static void mem_cgroup_end_move(struct mem_cgroup *mem) 1303static void mem_cgroup_end_move(struct mem_cgroup *memcg)
1339{ 1304{
1340 int cpu; 1305 int cpu;
1341 1306
1342 if (!mem) 1307 if (!memcg)
1343 return; 1308 return;
1344 get_online_cpus(); 1309 get_online_cpus();
1345 spin_lock(&mem->pcp_counter_lock); 1310 spin_lock(&memcg->pcp_counter_lock);
1346 for_each_online_cpu(cpu) 1311 for_each_online_cpu(cpu)
1347 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; 1312 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1348 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; 1313 memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1349 spin_unlock(&mem->pcp_counter_lock); 1314 spin_unlock(&memcg->pcp_counter_lock);
1350 put_online_cpus(); 1315 put_online_cpus();
1351} 1316}
1352/* 1317/*
@@ -1361,13 +1326,13 @@ static void mem_cgroup_end_move(struct mem_cgroup *mem)
1361 * waiting at high-memory pressure caused by "move". 1326
1362 */ 1327 */
1363 1328
1364static bool mem_cgroup_stealed(struct mem_cgroup *mem) 1329static bool mem_cgroup_stealed(struct mem_cgroup *memcg)
1365{ 1330{
1366 VM_BUG_ON(!rcu_read_lock_held()); 1331 VM_BUG_ON(!rcu_read_lock_held());
1367 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; 1332 return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1368} 1333}
1369 1334
1370static bool mem_cgroup_under_move(struct mem_cgroup *mem) 1335static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1371{ 1336{
1372 struct mem_cgroup *from; 1337 struct mem_cgroup *from;
1373 struct mem_cgroup *to; 1338 struct mem_cgroup *to;
@@ -1382,17 +1347,17 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1382 if (!from) 1347 if (!from)
1383 goto unlock; 1348 goto unlock;
1384 1349
1385 ret = mem_cgroup_same_or_subtree(mem, from) 1350 ret = mem_cgroup_same_or_subtree(memcg, from)
1386 || mem_cgroup_same_or_subtree(mem, to); 1351 || mem_cgroup_same_or_subtree(memcg, to);
1387unlock: 1352unlock:
1388 spin_unlock(&mc.lock); 1353 spin_unlock(&mc.lock);
1389 return ret; 1354 return ret;
1390} 1355}
1391 1356
1392static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) 1357static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1393{ 1358{
1394 if (mc.moving_task && current != mc.moving_task) { 1359 if (mc.moving_task && current != mc.moving_task) {
1395 if (mem_cgroup_under_move(mem)) { 1360 if (mem_cgroup_under_move(memcg)) {
1396 DEFINE_WAIT(wait); 1361 DEFINE_WAIT(wait);
1397 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1362 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1398 /* moving charge context might have finished. */ 1363 /* moving charge context might have finished. */
@@ -1476,12 +1441,12 @@ done:
1476 * This function returns the number of memcg under hierarchy tree. Returns 1441 * This function returns the number of memcg under hierarchy tree. Returns
1477 * 1(self count) if no children. 1442 * 1(self count) if no children.
1478 */ 1443 */
1479static int mem_cgroup_count_children(struct mem_cgroup *mem) 1444static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1480{ 1445{
1481 int num = 0; 1446 int num = 0;
1482 struct mem_cgroup *iter; 1447 struct mem_cgroup *iter;
1483 1448
1484 for_each_mem_cgroup_tree(iter, mem) 1449 for_each_mem_cgroup_tree(iter, memcg)
1485 num++; 1450 num++;
1486 return num; 1451 return num;
1487} 1452}
@@ -1511,21 +1476,21 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1511 * that to reclaim free pages from. 1476 * that to reclaim free pages from.
1512 */ 1477 */
1513static struct mem_cgroup * 1478static struct mem_cgroup *
1514mem_cgroup_select_victim(struct mem_cgroup *root_mem) 1479mem_cgroup_select_victim(struct mem_cgroup *root_memcg)
1515{ 1480{
1516 struct mem_cgroup *ret = NULL; 1481 struct mem_cgroup *ret = NULL;
1517 struct cgroup_subsys_state *css; 1482 struct cgroup_subsys_state *css;
1518 int nextid, found; 1483 int nextid, found;
1519 1484
1520 if (!root_mem->use_hierarchy) { 1485 if (!root_memcg->use_hierarchy) {
1521 css_get(&root_mem->css); 1486 css_get(&root_memcg->css);
1522 ret = root_mem; 1487 ret = root_memcg;
1523 } 1488 }
1524 1489
1525 while (!ret) { 1490 while (!ret) {
1526 rcu_read_lock(); 1491 rcu_read_lock();
1527 nextid = root_mem->last_scanned_child + 1; 1492 nextid = root_memcg->last_scanned_child + 1;
1528 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, 1493 css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css,
1529 &found); 1494 &found);
1530 if (css && css_tryget(css)) 1495 if (css && css_tryget(css))
1531 ret = container_of(css, struct mem_cgroup, css); 1496 ret = container_of(css, struct mem_cgroup, css);
@@ -1534,9 +1499,9 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1534 /* Updates scanning parameter */ 1499 /* Updates scanning parameter */
1535 if (!css) { 1500 if (!css) {
1536 /* this means start scan from ID:1 */ 1501 /* this means start scan from ID:1 */
1537 root_mem->last_scanned_child = 0; 1502 root_memcg->last_scanned_child = 0;
1538 } else 1503 } else
1539 root_mem->last_scanned_child = found; 1504 root_memcg->last_scanned_child = found;
1540 } 1505 }
1541 1506
1542 return ret; 1507 return ret;
@@ -1552,14 +1517,14 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1552 * reclaimable pages on a node. Returns true if there are any reclaimable 1517 * reclaimable pages on a node. Returns true if there are any reclaimable
1553 * pages in the node. 1518 * pages in the node.
1554 */ 1519 */
1555static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, 1520static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1556 int nid, bool noswap) 1521 int nid, bool noswap)
1557{ 1522{
1558 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE)) 1523 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1559 return true; 1524 return true;
1560 if (noswap || !total_swap_pages) 1525 if (noswap || !total_swap_pages)
1561 return false; 1526 return false;
1562 if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON)) 1527 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1563 return true; 1528 return true;
1564 return false; 1529 return false;
1565 1530
@@ -1572,29 +1537,29 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
1572 * nodes based on the zonelist. So update the list loosely once per 10 secs. 1537 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1573 * 1538 *
1574 */ 1539 */
1575static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) 1540static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1576{ 1541{
1577 int nid; 1542 int nid;
1578 /* 1543 /*
1579 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET 1544 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1580 * pagein/pageout changes since the last update. 1545 * pagein/pageout changes since the last update.
1581 */ 1546 */
1582 if (!atomic_read(&mem->numainfo_events)) 1547 if (!atomic_read(&memcg->numainfo_events))
1583 return; 1548 return;
1584 if (atomic_inc_return(&mem->numainfo_updating) > 1) 1549 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1585 return; 1550 return;
1586 1551
1587 /* make a nodemask where this memcg uses memory from */ 1552 /* make a nodemask where this memcg uses memory from */
1588 mem->scan_nodes = node_states[N_HIGH_MEMORY]; 1553 memcg->scan_nodes = node_states[N_HIGH_MEMORY];
1589 1554
1590 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { 1555 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1591 1556
1592 if (!test_mem_cgroup_node_reclaimable(mem, nid, false)) 1557 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1593 node_clear(nid, mem->scan_nodes); 1558 node_clear(nid, memcg->scan_nodes);
1594 } 1559 }
1595 1560
1596 atomic_set(&mem->numainfo_events, 0); 1561 atomic_set(&memcg->numainfo_events, 0);
1597 atomic_set(&mem->numainfo_updating, 0); 1562 atomic_set(&memcg->numainfo_updating, 0);
1598} 1563}
1599 1564
1600/* 1565/*
@@ -1609,16 +1574,16 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
1609 * 1574 *
1610 * Now, we use round-robin. Better algorithm is welcomed. 1575 * Now, we use round-robin. Better algorithm is welcomed.
1611 */ 1576 */
1612int mem_cgroup_select_victim_node(struct mem_cgroup *mem) 1577int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1613{ 1578{
1614 int node; 1579 int node;
1615 1580
1616 mem_cgroup_may_update_nodemask(mem); 1581 mem_cgroup_may_update_nodemask(memcg);
1617 node = mem->last_scanned_node; 1582 node = memcg->last_scanned_node;
1618 1583
1619 node = next_node(node, mem->scan_nodes); 1584 node = next_node(node, memcg->scan_nodes);
1620 if (node == MAX_NUMNODES) 1585 if (node == MAX_NUMNODES)
1621 node = first_node(mem->scan_nodes); 1586 node = first_node(memcg->scan_nodes);
1622 /* 1587 /*
1623 * We call this when we hit limit, not when pages are added to LRU. 1588 * We call this when we hit limit, not when pages are added to LRU.
1624 * No LRU may hold pages because all pages are UNEVICTABLE or 1589 * No LRU may hold pages because all pages are UNEVICTABLE or
@@ -1628,7 +1593,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1628 if (unlikely(node == MAX_NUMNODES)) 1593 if (unlikely(node == MAX_NUMNODES))
1629 node = numa_node_id(); 1594 node = numa_node_id();
1630 1595
1631 mem->last_scanned_node = node; 1596 memcg->last_scanned_node = node;
1632 return node; 1597 return node;
1633} 1598}
1634 1599
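
mem_cgroup_select_victim_node() above remembers the node it scanned last, advances to the next node that still has reclaimable pages, wraps to the first set node at the end of the mask, and falls back to the local node when the mask turns out to be empty. A sketch of that round-robin selection using a plain bitmask instead of nodemask_t (names and sizes are placeholders):

#include <stdio.h>

#define MAX_NODES 8

/* Advance round-robin through the set bits of "mask", starting just
 * after "last"; wrap around; fall back to "local_node" when the mask
 * is empty. */
static int select_victim_node(unsigned int mask, int last, int local_node)
{
	int i, node = -1;

	for (i = 1; i <= MAX_NODES; i++) {          /* next_node(), wrapping */
		int cand = (last + i) % MAX_NODES;
		if (mask & (1u << cand)) {
			node = cand;
			break;
		}
	}
	if (node < 0)                               /* nothing reclaimable */
		node = local_node;
	return node;
}

int main(void)
{
	unsigned int scan_nodes = (1u << 1) | (1u << 3);   /* nodes 1 and 3 */
	int last = 1;

	last = select_victim_node(scan_nodes, last, 0);
	printf("%d\n", last);                              /* 3 */
	last = select_victim_node(scan_nodes, last, 0);
	printf("%d\n", last);                              /* wraps back to 1 */
	printf("%d\n", select_victim_node(0, last, 0));    /* empty mask -> 0 */
	return 0;
}
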
@@ -1638,7 +1603,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1638 * unused nodes. But scan_nodes is lazily updated and may not contain 1603
1639 * enough new information. We need to do a double check. 1604
1640 */ 1605 */
1641bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) 1606bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1642{ 1607{
1643 int nid; 1608 int nid;
1644 1609
@@ -1646,12 +1611,12 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1646 * quick check...making use of scan_node. 1611 * quick check...making use of scan_node.
1647 * We can skip unused nodes. 1612 * We can skip unused nodes.
1648 */ 1613 */
1649 if (!nodes_empty(mem->scan_nodes)) { 1614 if (!nodes_empty(memcg->scan_nodes)) {
1650 for (nid = first_node(mem->scan_nodes); 1615 for (nid = first_node(memcg->scan_nodes);
1651 nid < MAX_NUMNODES; 1616 nid < MAX_NUMNODES;
1652 nid = next_node(nid, mem->scan_nodes)) { 1617 nid = next_node(nid, memcg->scan_nodes)) {
1653 1618
1654 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) 1619 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1655 return true; 1620 return true;
1656 } 1621 }
1657 } 1622 }
@@ -1659,77 +1624,39 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1659 * Check rest of nodes. 1624 * Check rest of nodes.
1660 */ 1625 */
1661 for_each_node_state(nid, N_HIGH_MEMORY) { 1626 for_each_node_state(nid, N_HIGH_MEMORY) {
1662 if (node_isset(nid, mem->scan_nodes)) 1627 if (node_isset(nid, memcg->scan_nodes))
1663 continue; 1628 continue;
1664 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) 1629 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
1665 return true; 1630 return true;
1666 } 1631 }
1667 return false; 1632 return false;
1668} 1633}
1669 1634
1670#else 1635#else
1671int mem_cgroup_select_victim_node(struct mem_cgroup *mem) 1636int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1672{ 1637{
1673 return 0; 1638 return 0;
1674} 1639}
1675 1640
1676bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) 1641bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1677{ 1642{
1678 return test_mem_cgroup_node_reclaimable(mem, 0, noswap); 1643 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1679} 1644}
1680#endif 1645#endif
1681 1646
1682static void __mem_cgroup_record_scanstat(unsigned long *stats,
1683 struct memcg_scanrecord *rec)
1684{
1685
1686 stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1];
1687 stats[SCAN_ANON] += rec->nr_scanned[0];
1688 stats[SCAN_FILE] += rec->nr_scanned[1];
1689
1690 stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1];
1691 stats[ROTATE_ANON] += rec->nr_rotated[0];
1692 stats[ROTATE_FILE] += rec->nr_rotated[1];
1693
1694 stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1];
1695 stats[FREED_ANON] += rec->nr_freed[0];
1696 stats[FREED_FILE] += rec->nr_freed[1];
1697
1698 stats[ELAPSED] += rec->elapsed;
1699}
1700
1701static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec)
1702{
1703 struct mem_cgroup *mem;
1704 int context = rec->context;
1705
1706 if (context >= NR_SCAN_CONTEXT)
1707 return;
1708
1709 mem = rec->mem;
1710 spin_lock(&mem->scanstat.lock);
1711 __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec);
1712 spin_unlock(&mem->scanstat.lock);
1713
1714 mem = rec->root;
1715 spin_lock(&mem->scanstat.lock);
1716 __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec);
1717 spin_unlock(&mem->scanstat.lock);
1718}
1719
1720/* 1647/*
1721 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1648 * Scan the hierarchy if needed to reclaim memory. We remember the last child
1722 * we reclaimed from, so that we don't end up penalizing one child extensively 1649 * we reclaimed from, so that we don't end up penalizing one child extensively
1723 * based on its position in the children list. 1650 * based on its position in the children list.
1724 * 1651 *
1725 * root_mem is the original ancestor that we've been reclaiming from. 1652 * root_memcg is the original ancestor that we've been reclaiming from.
1726 * 1653 *
1727 * We give up and return to the caller when we visit root_mem twice. 1654 * We give up and return to the caller when we visit root_memcg twice.
1728 * (other groups can be removed while we're walking....) 1655 * (other groups can be removed while we're walking....)
1729 * 1656 *
1730 * If shrink==true, to avoid freeing too much, this returns immediately. 1657
1731 */ 1658 */
1732static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1659static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg,
1733 struct zone *zone, 1660 struct zone *zone,
1734 gfp_t gfp_mask, 1661 gfp_t gfp_mask,
1735 unsigned long reclaim_options, 1662 unsigned long reclaim_options,
@@ -1741,28 +1668,18 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1741 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; 1668 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1742 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; 1669 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1743 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; 1670 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1744 struct memcg_scanrecord rec;
1745 unsigned long excess; 1671 unsigned long excess;
1746 unsigned long scanned; 1672 unsigned long nr_scanned;
1747 1673
1748 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; 1674 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;
1749 1675
1750 /* If memsw_is_minimum==1, swap-out is of no use. */ 1676
1751 if (!check_soft && !shrink && root_mem->memsw_is_minimum) 1677 if (!check_soft && !shrink && root_memcg->memsw_is_minimum)
1752 noswap = true; 1678 noswap = true;
1753 1679
1754 if (shrink)
1755 rec.context = SCAN_BY_SHRINK;
1756 else if (check_soft)
1757 rec.context = SCAN_BY_SYSTEM;
1758 else
1759 rec.context = SCAN_BY_LIMIT;
1760
1761 rec.root = root_mem;
1762
1763 while (1) { 1680 while (1) {
1764 victim = mem_cgroup_select_victim(root_mem); 1681 victim = mem_cgroup_select_victim(root_memcg);
1765 if (victim == root_mem) { 1682 if (victim == root_memcg) {
1766 loop++; 1683 loop++;
1767 /* 1684 /*
1768 * We are not draining per cpu cached charges during 1685 * We are not draining per cpu cached charges during
@@ -1771,7 +1688,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1771 * charges will not give any. 1688 * charges will not give any.
1772 */ 1689 */
1773 if (!check_soft && loop >= 1) 1690 if (!check_soft && loop >= 1)
1774 drain_all_stock_async(root_mem); 1691 drain_all_stock_async(root_memcg);
1775 if (loop >= 2) { 1692 if (loop >= 2) {
1776 /* 1693 /*
1777 * If we have not been able to reclaim 1694 * If we have not been able to reclaim
@@ -1800,23 +1717,14 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1800 css_put(&victim->css); 1717 css_put(&victim->css);
1801 continue; 1718 continue;
1802 } 1719 }
1803 rec.mem = victim;
1804 rec.nr_scanned[0] = 0;
1805 rec.nr_scanned[1] = 0;
1806 rec.nr_rotated[0] = 0;
1807 rec.nr_rotated[1] = 0;
1808 rec.nr_freed[0] = 0;
1809 rec.nr_freed[1] = 0;
1810 rec.elapsed = 0;
1811 /* we use swappiness of local cgroup */ 1720 /* we use swappiness of local cgroup */
1812 if (check_soft) { 1721 if (check_soft) {
1813 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1722 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1814 noswap, zone, &rec, &scanned); 1723 noswap, zone, &nr_scanned);
1815 *total_scanned += scanned; 1724 *total_scanned += nr_scanned;
1816 } else 1725 } else
1817 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1726 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1818 noswap, &rec); 1727 noswap);
1819 mem_cgroup_record_scanstat(&rec);
1820 css_put(&victim->css); 1728 css_put(&victim->css);
1821 /* 1729 /*
1822 * At shrinking usage, we can't check we should stop here or 1730 * At shrinking usage, we can't check we should stop here or
@@ -1827,9 +1735,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1827 return ret; 1735 return ret;
1828 total += ret; 1736 total += ret;
1829 if (check_soft) { 1737 if (check_soft) {
1830 if (!res_counter_soft_limit_excess(&root_mem->res)) 1738 if (!res_counter_soft_limit_excess(&root_memcg->res))
1831 return total; 1739 return total;
1832 } else if (mem_cgroup_margin(root_mem)) 1740 } else if (mem_cgroup_margin(root_memcg))
1833 return total; 1741 return total;
1834 } 1742 }
1835 return total; 1743 return total;
@@ -1840,69 +1748,62 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1840 * If someone is running, return false. 1748 * If someone is running, return false.
1841 * Has to be called with memcg_oom_lock 1749 * Has to be called with memcg_oom_lock
1842 */ 1750 */
1843static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1751static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg)
1844{ 1752{
1845 int lock_count = -1;
1846 struct mem_cgroup *iter, *failed = NULL; 1753 struct mem_cgroup *iter, *failed = NULL;
1847 bool cond = true; 1754 bool cond = true;
1848 1755
1849 for_each_mem_cgroup_tree_cond(iter, mem, cond) { 1756 for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
1850 bool locked = iter->oom_lock; 1757 if (iter->oom_lock) {
1851
1852 iter->oom_lock = true;
1853 if (lock_count == -1)
1854 lock_count = iter->oom_lock;
1855 else if (lock_count != locked) {
1856 /* 1758 /*
1857 * this subtree of our hierarchy is already locked 1759 * this subtree of our hierarchy is already locked
1858 * so we cannot give a lock. 1760 * so we cannot give a lock.
1859 */ 1761 */
1860 lock_count = 0;
1861 failed = iter; 1762 failed = iter;
1862 cond = false; 1763 cond = false;
1863 } 1764 } else
1765 iter->oom_lock = true;
1864 } 1766 }
1865 1767
1866 if (!failed) 1768 if (!failed)
1867 goto done; 1769 return true;
1868 1770
1869 /* 1771 /*
1870 * OK, we failed to lock the whole subtree so we have to clean up 1772 * OK, we failed to lock the whole subtree so we have to clean up
1871 * what we set up to the failing subtree 1773 * what we set up to the failing subtree
1872 */ 1774 */
1873 cond = true; 1775 cond = true;
1874 for_each_mem_cgroup_tree_cond(iter, mem, cond) { 1776 for_each_mem_cgroup_tree_cond(iter, memcg, cond) {
1875 if (iter == failed) { 1777 if (iter == failed) {
1876 cond = false; 1778 cond = false;
1877 continue; 1779 continue;
1878 } 1780 }
1879 iter->oom_lock = false; 1781 iter->oom_lock = false;
1880 } 1782 }
1881done: 1783 return false;
1882 return lock_count;
1883} 1784}
1884 1785
1885/* 1786/*
1886 * Has to be called with memcg_oom_lock 1787 * Has to be called with memcg_oom_lock
1887 */ 1788 */
1888static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) 1789static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1889{ 1790{
1890 struct mem_cgroup *iter; 1791 struct mem_cgroup *iter;
1891 1792
1892 for_each_mem_cgroup_tree(iter, mem) 1793 for_each_mem_cgroup_tree(iter, memcg)
1893 iter->oom_lock = false; 1794 iter->oom_lock = false;
1894 return 0; 1795 return 0;
1895} 1796}
1896 1797
1897static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem) 1798static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1898{ 1799{
1899 struct mem_cgroup *iter; 1800 struct mem_cgroup *iter;
1900 1801
1901 for_each_mem_cgroup_tree(iter, mem) 1802 for_each_mem_cgroup_tree(iter, memcg)
1902 atomic_inc(&iter->under_oom); 1803 atomic_inc(&iter->under_oom);
1903} 1804}
1904 1805
1905static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) 1806static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1906{ 1807{
1907 struct mem_cgroup *iter; 1808 struct mem_cgroup *iter;
1908 1809
@@ -1911,7 +1812,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem)
1911 * mem_cgroup_oom_lock() may not be called. We have to use 1812 * mem_cgroup_oom_lock() may not be called. We have to use
1912 * atomic_add_unless() here. 1813 * atomic_add_unless() here.
1913 */ 1814 */
1914 for_each_mem_cgroup_tree(iter, mem) 1815 for_each_mem_cgroup_tree(iter, memcg)
1915 atomic_add_unless(&iter->under_oom, -1, 0); 1816 atomic_add_unless(&iter->under_oom, -1, 0);
1916} 1817}
1917 1818
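
The rewritten mem_cgroup_oom_lock() above walks the subtree, takes the per-group oom_lock on each member, and, on meeting a group that is already locked, stops, remembers which group failed, and walks the subtree again to clear only the locks it had just taken. A flat-array sketch of that take-all-or-roll-back pattern, with the hierarchy walk reduced to a simple loop (assumed and simplified, not the kernel iterator):

#include <stdbool.h>
#include <stdio.h>

struct group {
	bool oom_lock;
};

/* Try to lock every group in the subtree (here: a flat array).
 * On the first group that is already locked, undo the locks taken
 * so far and report failure. */
static bool oom_lock_all(struct group *g, int n)
{
	int i, failed = -1;

	for (i = 0; i < n; i++) {
		if (g[i].oom_lock) {
			failed = i;          /* someone else holds this part */
			break;
		}
		g[i].oom_lock = true;
	}

	if (failed < 0)
		return true;

	for (i = 0; i < failed; i++)         /* roll back what we set */
		g[i].oom_lock = false;
	return false;
}

int main(void)
{
	struct group tree[3] = { { false }, { true }, { false } };

	printf("%d\n", oom_lock_all(tree, 3));   /* 0: group 1 already locked */
	printf("%d\n", tree[0].oom_lock);        /* 0: rolled back            */

	tree[1].oom_lock = false;
	printf("%d\n", oom_lock_all(tree, 3));   /* 1: whole subtree locked   */
	return 0;
}
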
@@ -1926,85 +1827,85 @@ struct oom_wait_info {
1926static int memcg_oom_wake_function(wait_queue_t *wait, 1827static int memcg_oom_wake_function(wait_queue_t *wait,
1927 unsigned mode, int sync, void *arg) 1828 unsigned mode, int sync, void *arg)
1928{ 1829{
1929 struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg, 1830 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg,
1930 *oom_wait_mem; 1831 *oom_wait_memcg;
1931 struct oom_wait_info *oom_wait_info; 1832 struct oom_wait_info *oom_wait_info;
1932 1833
1933 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 1834 oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1934 oom_wait_mem = oom_wait_info->mem; 1835 oom_wait_memcg = oom_wait_info->mem;
1935 1836
1936 /* 1837 /*
1937 * Both of oom_wait_info->mem and wake_mem are stable under us. 1838 * Both of oom_wait_info->mem and wake_mem are stable under us.
1938 * Then we can use css_is_ancestor without taking care of RCU. 1839 * Then we can use css_is_ancestor without taking care of RCU.
1939 */ 1840 */
1940 if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem) 1841 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg)
1941 && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem)) 1842 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg))
1942 return 0; 1843 return 0;
1943 return autoremove_wake_function(wait, mode, sync, arg); 1844 return autoremove_wake_function(wait, mode, sync, arg);
1944} 1845}
1945 1846
1946static void memcg_wakeup_oom(struct mem_cgroup *mem) 1847static void memcg_wakeup_oom(struct mem_cgroup *memcg)
1947{ 1848{
1948 /* for filtering, pass "mem" as argument. */ 1849 /* for filtering, pass "memcg" as argument. */
1949 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); 1850 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1950} 1851}
1951 1852
1952static void memcg_oom_recover(struct mem_cgroup *mem) 1853static void memcg_oom_recover(struct mem_cgroup *memcg)
1953{ 1854{
1954 if (mem && atomic_read(&mem->under_oom)) 1855 if (memcg && atomic_read(&memcg->under_oom))
1955 memcg_wakeup_oom(mem); 1856 memcg_wakeup_oom(memcg);
1956} 1857}
1957 1858
1958/* 1859/*
1959 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 1860 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1960 */ 1861 */
1961bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) 1862bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask)
1962{ 1863{
1963 struct oom_wait_info owait; 1864 struct oom_wait_info owait;
1964 bool locked, need_to_kill; 1865 bool locked, need_to_kill;
1965 1866
1966 owait.mem = mem; 1867 owait.mem = memcg;
1967 owait.wait.flags = 0; 1868 owait.wait.flags = 0;
1968 owait.wait.func = memcg_oom_wake_function; 1869 owait.wait.func = memcg_oom_wake_function;
1969 owait.wait.private = current; 1870 owait.wait.private = current;
1970 INIT_LIST_HEAD(&owait.wait.task_list); 1871 INIT_LIST_HEAD(&owait.wait.task_list);
1971 need_to_kill = true; 1872 need_to_kill = true;
1972 mem_cgroup_mark_under_oom(mem); 1873 mem_cgroup_mark_under_oom(memcg);
1973 1874
1974 /* At first, try to OOM lock hierarchy under mem.*/ 1875 /* At first, try to OOM lock hierarchy under memcg.*/
1975 spin_lock(&memcg_oom_lock); 1876 spin_lock(&memcg_oom_lock);
1976 locked = mem_cgroup_oom_lock(mem); 1877 locked = mem_cgroup_oom_lock(memcg);
1977 /* 1878 /*
1978 * Even if signal_pending(), we can't quit charge() loop without 1879 * Even if signal_pending(), we can't quit charge() loop without
1979 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL 1880 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
1980 * under OOM is always welcomed, use TASK_KILLABLE here. 1881 * under OOM is always welcomed, use TASK_KILLABLE here.
1981 */ 1882 */
1982 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 1883 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1983 if (!locked || mem->oom_kill_disable) 1884 if (!locked || memcg->oom_kill_disable)
1984 need_to_kill = false; 1885 need_to_kill = false;
1985 if (locked) 1886 if (locked)
1986 mem_cgroup_oom_notify(mem); 1887 mem_cgroup_oom_notify(memcg);
1987 spin_unlock(&memcg_oom_lock); 1888 spin_unlock(&memcg_oom_lock);
1988 1889
1989 if (need_to_kill) { 1890 if (need_to_kill) {
1990 finish_wait(&memcg_oom_waitq, &owait.wait); 1891 finish_wait(&memcg_oom_waitq, &owait.wait);
1991 mem_cgroup_out_of_memory(mem, mask); 1892 mem_cgroup_out_of_memory(memcg, mask);
1992 } else { 1893 } else {
1993 schedule(); 1894 schedule();
1994 finish_wait(&memcg_oom_waitq, &owait.wait); 1895 finish_wait(&memcg_oom_waitq, &owait.wait);
1995 } 1896 }
1996 spin_lock(&memcg_oom_lock); 1897 spin_lock(&memcg_oom_lock);
1997 if (locked) 1898 if (locked)
1998 mem_cgroup_oom_unlock(mem); 1899 mem_cgroup_oom_unlock(memcg);
1999 memcg_wakeup_oom(mem); 1900 memcg_wakeup_oom(memcg);
2000 spin_unlock(&memcg_oom_lock); 1901 spin_unlock(&memcg_oom_lock);
2001 1902
2002 mem_cgroup_unmark_under_oom(mem); 1903 mem_cgroup_unmark_under_oom(memcg);
2003 1904
2004 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) 1905 if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
2005 return false; 1906 return false;
2006 /* Give chance to dying process */ 1907 /* Give chance to dying process */
2007 schedule_timeout(1); 1908 schedule_timeout_uninterruptible(1);
2008 return true; 1909 return true;
2009} 1910}
2010 1911
@@ -2035,7 +1936,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
2035void mem_cgroup_update_page_stat(struct page *page, 1936void mem_cgroup_update_page_stat(struct page *page,
2036 enum mem_cgroup_page_stat_item idx, int val) 1937 enum mem_cgroup_page_stat_item idx, int val)
2037{ 1938{
2038 struct mem_cgroup *mem; 1939 struct mem_cgroup *memcg;
2039 struct page_cgroup *pc = lookup_page_cgroup(page); 1940 struct page_cgroup *pc = lookup_page_cgroup(page);
2040 bool need_unlock = false; 1941 bool need_unlock = false;
2041 unsigned long uninitialized_var(flags); 1942 unsigned long uninitialized_var(flags);
@@ -2044,16 +1945,16 @@ void mem_cgroup_update_page_stat(struct page *page,
2044 return; 1945 return;
2045 1946
2046 rcu_read_lock(); 1947 rcu_read_lock();
2047 mem = pc->mem_cgroup; 1948 memcg = pc->mem_cgroup;
2048 if (unlikely(!mem || !PageCgroupUsed(pc))) 1949 if (unlikely(!memcg || !PageCgroupUsed(pc)))
2049 goto out; 1950 goto out;
2050 /* pc->mem_cgroup is unstable ? */ 1951 /* pc->mem_cgroup is unstable ? */
2051 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { 1952 if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) {
2052 /* take a lock against to access pc->mem_cgroup */ 1953 /* take a lock against to access pc->mem_cgroup */
2053 move_lock_page_cgroup(pc, &flags); 1954 move_lock_page_cgroup(pc, &flags);
2054 need_unlock = true; 1955 need_unlock = true;
2055 mem = pc->mem_cgroup; 1956 memcg = pc->mem_cgroup;
2056 if (!mem || !PageCgroupUsed(pc)) 1957 if (!memcg || !PageCgroupUsed(pc))
2057 goto out; 1958 goto out;
2058 } 1959 }
2059 1960
@@ -2069,7 +1970,7 @@ void mem_cgroup_update_page_stat(struct page *page,
2069 BUG(); 1970 BUG();
2070 } 1971 }
2071 1972
2072 this_cpu_add(mem->stat->count[idx], val); 1973 this_cpu_add(memcg->stat->count[idx], val);
2073 1974
2074out: 1975out:
2075 if (unlikely(need_unlock)) 1976 if (unlikely(need_unlock))
@@ -2092,6 +1993,7 @@ struct memcg_stock_pcp {
2092#define FLUSHING_CACHED_CHARGE (0) 1993#define FLUSHING_CACHED_CHARGE (0)
2093}; 1994};
2094static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1995static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1996static DEFINE_MUTEX(percpu_charge_mutex);
2095 1997
2096/* 1998/*
2097 * Try to consume stocked charge on this cpu. On success, one page is consumed 1999
@@ -2099,13 +2001,13 @@ static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
2099 * cgroup which is not current target, returns false. This stock will be 2001 * cgroup which is not current target, returns false. This stock will be
2100 * refilled. 2002 * refilled.
2101 */ 2003 */
2102static bool consume_stock(struct mem_cgroup *mem) 2004static bool consume_stock(struct mem_cgroup *memcg)
2103{ 2005{
2104 struct memcg_stock_pcp *stock; 2006 struct memcg_stock_pcp *stock;
2105 bool ret = true; 2007 bool ret = true;
2106 2008
2107 stock = &get_cpu_var(memcg_stock); 2009 stock = &get_cpu_var(memcg_stock);
2108 if (mem == stock->cached && stock->nr_pages) 2010 if (memcg == stock->cached && stock->nr_pages)
2109 stock->nr_pages--; 2011 stock->nr_pages--;
2110 else /* need to call res_counter_charge */ 2012 else /* need to call res_counter_charge */
2111 ret = false; 2013 ret = false;
@@ -2146,44 +2048,38 @@ static void drain_local_stock(struct work_struct *dummy)
2146 * Cache charges(val) which is from res_counter, to local per_cpu area. 2048 * Cache charges(val) which is from res_counter, to local per_cpu area.
2147 * This will be consumed by consume_stock() function, later. 2049 * This will be consumed by consume_stock() function, later.
2148 */ 2050 */
2149static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) 2051static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2150{ 2052{
2151 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2053 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
2152 2054
2153 if (stock->cached != mem) { /* reset if necessary */ 2055 if (stock->cached != memcg) { /* reset if necessary */
2154 drain_stock(stock); 2056 drain_stock(stock);
2155 stock->cached = mem; 2057 stock->cached = memcg;
2156 } 2058 }
2157 stock->nr_pages += nr_pages; 2059 stock->nr_pages += nr_pages;
2158 put_cpu_var(memcg_stock); 2060 put_cpu_var(memcg_stock);
2159} 2061}
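
consume_stock() and refill_stock() keep a small per-CPU cache of pages that were already charged to one memcg, so the hot charging path can often skip the res_counter entirely. A single-"CPU" sketch of that owner-plus-count cache (the struct and identifiers are made up for illustration):

#include <stdbool.h>
#include <stdio.h>

struct stock {
	int cached_id;           /* which cgroup the cached pages belong to */
	unsigned int nr_pages;   /* pre-charged pages waiting to be used    */
};

/* Fast path: spend one cached page, but only if the cache belongs
 * to the cgroup being charged right now. */
static bool consume_stock(struct stock *s, int id)
{
	if (s->cached_id == id && s->nr_pages) {
		s->nr_pages--;
		return true;
	}
	return false;            /* caller must charge the real counter */
}

/* Slow path already charged a batch; park the surplus locally,
 * dropping any leftover cache that belongs to a different cgroup. */
static void refill_stock(struct stock *s, int id, unsigned int nr)
{
	if (s->cached_id != id) {
		/* a real drain_stock() would return s->nr_pages to its owner */
		s->nr_pages = 0;
		s->cached_id = id;
	}
	s->nr_pages += nr;
}

int main(void)
{
	struct stock s = { .cached_id = -1, .nr_pages = 0 };

	printf("%d\n", consume_stock(&s, 1));   /* 0: cache empty             */
	refill_stock(&s, 1, 32);                /* batch charge left 32 over  */
	printf("%d\n", consume_stock(&s, 1));   /* 1: served from the cache   */
	printf("%d\n", consume_stock(&s, 2));   /* 0: cache owned by cgroup 1 */
	return 0;
}
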
2160 2062
2161/* 2063/*
2162 * Drains all per-CPU charge caches for given root_mem resp. subtree 2064 * Drains all per-CPU charge caches for given root_memcg resp. subtree
2163 * of the hierarchy under it. sync flag says whether we should block 2065 * of the hierarchy under it. sync flag says whether we should block
2164 * until the work is done. 2066 * until the work is done.
2165 */ 2067 */
2166static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) 2068static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
2167{ 2069{
2168 int cpu, curcpu; 2070 int cpu, curcpu;
2169 2071
2170 /* Notify other cpus that system-wide "drain" is running */ 2072 /* Notify other cpus that system-wide "drain" is running */
2171 get_online_cpus(); 2073 get_online_cpus();
2172 /* 2074 curcpu = get_cpu();
2173 * Get a hint for avoiding draining charges on the current cpu,
2174 * which must be exhausted by our charging. It is not required that
2175 * this be a precise check, so we use raw_smp_processor_id() instead of
2176 * getcpu()/putcpu().
2177 */
2178 curcpu = raw_smp_processor_id();
2179 for_each_online_cpu(cpu) { 2075 for_each_online_cpu(cpu) {
2180 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2076 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2181 struct mem_cgroup *mem; 2077 struct mem_cgroup *memcg;
2182 2078
2183 mem = stock->cached; 2079 memcg = stock->cached;
2184 if (!mem || !stock->nr_pages) 2080 if (!memcg || !stock->nr_pages)
2185 continue; 2081 continue;
2186 if (!mem_cgroup_same_or_subtree(root_mem, mem)) 2082 if (!mem_cgroup_same_or_subtree(root_memcg, memcg))
2187 continue; 2083 continue;
2188 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2084 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2189 if (cpu == curcpu) 2085 if (cpu == curcpu)
@@ -2192,14 +2088,14 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync)
2192 schedule_work_on(cpu, &stock->work); 2088 schedule_work_on(cpu, &stock->work);
2193 } 2089 }
2194 } 2090 }
2091 put_cpu();
2195 2092
2196 if (!sync) 2093 if (!sync)
2197 goto out; 2094 goto out;
2198 2095
2199 for_each_online_cpu(cpu) { 2096 for_each_online_cpu(cpu) {
2200 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2097 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2201 if (mem_cgroup_same_or_subtree(root_mem, stock->cached) && 2098 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2202 test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2203 flush_work(&stock->work); 2099 flush_work(&stock->work);
2204 } 2100 }
2205out: 2101out:
@@ -2212,51 +2108,59 @@ out:
2212 * expects some charges will be back to res_counter later but cannot wait for 2108 * expects some charges will be back to res_counter later but cannot wait for
2213 * it. 2109 * it.
2214 */ 2110 */
2215static void drain_all_stock_async(struct mem_cgroup *root_mem) 2111static void drain_all_stock_async(struct mem_cgroup *root_memcg)
2216{ 2112{
2217 drain_all_stock(root_mem, false); 2113 /*
2114 * If someone calls draining, avoid adding more kworker runs.
2115 */
2116 if (!mutex_trylock(&percpu_charge_mutex))
2117 return;
2118 drain_all_stock(root_memcg, false);
2119 mutex_unlock(&percpu_charge_mutex);
2218} 2120}
2219 2121
2220/* This is a synchronous drain interface. */ 2122/* This is a synchronous drain interface. */
2221static void drain_all_stock_sync(struct mem_cgroup *root_mem) 2123static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
2222{ 2124{
2223 /* called when force_empty is called */ 2125 /* called when force_empty is called */
2224 drain_all_stock(root_mem, true); 2126 mutex_lock(&percpu_charge_mutex);
2127 drain_all_stock(root_memcg, true);
2128 mutex_unlock(&percpu_charge_mutex);
2225} 2129}
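
The new percpu_charge_mutex serializes drains: the synchronous force_empty path waits for the mutex, while drain_all_stock_async() simply gives up when a drain is already in flight, so reclaim never queues a pile of redundant kworker jobs. A pthread sketch of that "skip if busy" pattern (function names mirror the diff but the bodies are placeholders):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t charge_mutex = PTHREAD_MUTEX_INITIALIZER;

static void drain_all_stock(bool sync)
{
	printf("draining (%s)\n", sync ? "sync" : "async");
}

/* Async path: if a drain is already running, another one would only
 * queue more work for the same result, so just return. */
static void drain_all_stock_async(void)
{
	if (pthread_mutex_trylock(&charge_mutex) != 0)
		return;
	drain_all_stock(false);
	pthread_mutex_unlock(&charge_mutex);
}

/* Sync path (force_empty): correctness needs the drain, so wait. */
static void drain_all_stock_sync(void)
{
	pthread_mutex_lock(&charge_mutex);
	drain_all_stock(true);
	pthread_mutex_unlock(&charge_mutex);
}

int main(void)
{
	drain_all_stock_async();   /* runs: nobody holds the mutex */
	drain_all_stock_sync();    /* always runs */
	return 0;
}
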
2226 2130
2227/* 2131/*
2228 * This function drains percpu counter value from DEAD cpu and 2132 * This function drains percpu counter value from DEAD cpu and
2229 * move it to local cpu. Note that this function can be preempted. 2133 * move it to local cpu. Note that this function can be preempted.
2230 */ 2134 */
2231static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) 2135static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2232{ 2136{
2233 int i; 2137 int i;
2234 2138
2235 spin_lock(&mem->pcp_counter_lock); 2139 spin_lock(&memcg->pcp_counter_lock);
2236 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { 2140 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
2237 long x = per_cpu(mem->stat->count[i], cpu); 2141 long x = per_cpu(memcg->stat->count[i], cpu);
2238 2142
2239 per_cpu(mem->stat->count[i], cpu) = 0; 2143 per_cpu(memcg->stat->count[i], cpu) = 0;
2240 mem->nocpu_base.count[i] += x; 2144 memcg->nocpu_base.count[i] += x;
2241 } 2145 }
2242 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2146 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2243 unsigned long x = per_cpu(mem->stat->events[i], cpu); 2147 unsigned long x = per_cpu(memcg->stat->events[i], cpu);
2244 2148
2245 per_cpu(mem->stat->events[i], cpu) = 0; 2149 per_cpu(memcg->stat->events[i], cpu) = 0;
2246 mem->nocpu_base.events[i] += x; 2150 memcg->nocpu_base.events[i] += x;
2247 } 2151 }
2248 /* need to clear ON_MOVE value, works as a kind of lock. */ 2152 /* need to clear ON_MOVE value, works as a kind of lock. */
2249 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; 2153 per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
2250 spin_unlock(&mem->pcp_counter_lock); 2154 spin_unlock(&memcg->pcp_counter_lock);
2251} 2155}
2252 2156
2253static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) 2157static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu)
2254{ 2158{
2255 int idx = MEM_CGROUP_ON_MOVE; 2159 int idx = MEM_CGROUP_ON_MOVE;
2256 2160
2257 spin_lock(&mem->pcp_counter_lock); 2161 spin_lock(&memcg->pcp_counter_lock);
2258 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; 2162 per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx];
2259 spin_unlock(&mem->pcp_counter_lock); 2163 spin_unlock(&memcg->pcp_counter_lock);
2260} 2164}
2261 2165
2262static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, 2166static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
@@ -2294,7 +2198,7 @@ enum {
2294 CHARGE_OOM_DIE, /* the current is killed because of OOM */ 2198 CHARGE_OOM_DIE, /* the current is killed because of OOM */
2295}; 2199};
2296 2200
2297static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, 2201static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2298 unsigned int nr_pages, bool oom_check) 2202 unsigned int nr_pages, bool oom_check)
2299{ 2203{
2300 unsigned long csize = nr_pages * PAGE_SIZE; 2204 unsigned long csize = nr_pages * PAGE_SIZE;
@@ -2303,16 +2207,16 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
2303 unsigned long flags = 0; 2207 unsigned long flags = 0;
2304 int ret; 2208 int ret;
2305 2209
2306 ret = res_counter_charge(&mem->res, csize, &fail_res); 2210 ret = res_counter_charge(&memcg->res, csize, &fail_res);
2307 2211
2308 if (likely(!ret)) { 2212 if (likely(!ret)) {
2309 if (!do_swap_account) 2213 if (!do_swap_account)
2310 return CHARGE_OK; 2214 return CHARGE_OK;
2311 ret = res_counter_charge(&mem->memsw, csize, &fail_res); 2215 ret = res_counter_charge(&memcg->memsw, csize, &fail_res);
2312 if (likely(!ret)) 2216 if (likely(!ret))
2313 return CHARGE_OK; 2217 return CHARGE_OK;
2314 2218
2315 res_counter_uncharge(&mem->res, csize); 2219 res_counter_uncharge(&memcg->res, csize);
2316 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2220 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
2317 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2221 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
2318 } else 2222 } else
@@ -2370,12 +2274,12 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
2370static int __mem_cgroup_try_charge(struct mm_struct *mm, 2274static int __mem_cgroup_try_charge(struct mm_struct *mm,
2371 gfp_t gfp_mask, 2275 gfp_t gfp_mask,
2372 unsigned int nr_pages, 2276 unsigned int nr_pages,
2373 struct mem_cgroup **memcg, 2277 struct mem_cgroup **ptr,
2374 bool oom) 2278 bool oom)
2375{ 2279{
2376 unsigned int batch = max(CHARGE_BATCH, nr_pages); 2280 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2377 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2281 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2378 struct mem_cgroup *mem = NULL; 2282 struct mem_cgroup *memcg = NULL;
2379 int ret; 2283 int ret;
2380 2284
2381 /* 2285 /*
@@ -2393,17 +2297,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
2393 * thread group leader migrates. It's possible that mm is not 2297 * thread group leader migrates. It's possible that mm is not
2394 * set, if so charge the init_mm (happens for pagecache usage). 2298 * set, if so charge the init_mm (happens for pagecache usage).
2395 */ 2299 */
2396 if (!*memcg && !mm) 2300 if (!*ptr && !mm)
2397 goto bypass; 2301 goto bypass;
2398again: 2302again:
2399 if (*memcg) { /* css should be a valid one */ 2303 if (*ptr) { /* css should be a valid one */
2400 mem = *memcg; 2304 memcg = *ptr;
2401 VM_BUG_ON(css_is_removed(&mem->css)); 2305 VM_BUG_ON(css_is_removed(&memcg->css));
2402 if (mem_cgroup_is_root(mem)) 2306 if (mem_cgroup_is_root(memcg))
2403 goto done; 2307 goto done;
2404 if (nr_pages == 1 && consume_stock(mem)) 2308 if (nr_pages == 1 && consume_stock(memcg))
2405 goto done; 2309 goto done;
2406 css_get(&mem->css); 2310 css_get(&memcg->css);
2407 } else { 2311 } else {
2408 struct task_struct *p; 2312 struct task_struct *p;
2409 2313
@@ -2411,7 +2315,7 @@ again:
2411 p = rcu_dereference(mm->owner); 2315 p = rcu_dereference(mm->owner);
2412 /* 2316 /*
2413 * Because we don't have task_lock(), "p" can exit. 2317 * Because we don't have task_lock(), "p" can exit.
2414 * In that case, "mem" can point to root or p can be NULL with 2318 * In that case, "memcg" can point to root or p can be NULL with
2415 * race with swapoff. Then, we have a small risk of mis-accounting. 2319 * race with swapoff. Then, we have a small risk of mis-accounting.
2416 * But this kind of mis-accounting due to races always happens because 2320 * But this kind of mis-accounting due to races always happens because
2417 * we don't have cgroup_mutex(). It's overkill and we allow that 2321 * we don't have cgroup_mutex(). It's overkill and we allow that
@@ -2419,12 +2323,12 @@ again:
2419 * (*) swapoff et al. will charge against mm-struct not against 2323 * (*) swapoff et al. will charge against mm-struct not against
2420 * task-struct. So, mm->owner can be NULL. 2324 * task-struct. So, mm->owner can be NULL.
2421 */ 2325 */
2422 mem = mem_cgroup_from_task(p); 2326 memcg = mem_cgroup_from_task(p);
2423 if (!mem || mem_cgroup_is_root(mem)) { 2327 if (!memcg || mem_cgroup_is_root(memcg)) {
2424 rcu_read_unlock(); 2328 rcu_read_unlock();
2425 goto done; 2329 goto done;
2426 } 2330 }
2427 if (nr_pages == 1 && consume_stock(mem)) { 2331 if (nr_pages == 1 && consume_stock(memcg)) {
2428 /* 2332 /*
2429 * It seems dangerous to access memcg without css_get(). 2333 * It seems dangerous to access memcg without css_get().
2430 * But considering how consume_stock works, it's not 2334 * But considering how consume_stock works, it's not
@@ -2437,7 +2341,7 @@ again:
2437 goto done; 2341 goto done;
2438 } 2342 }
2439 /* after here, we may be blocked. we need to get refcnt */ 2343 /* after here, we may be blocked. we need to get refcnt */
2440 if (!css_tryget(&mem->css)) { 2344 if (!css_tryget(&memcg->css)) {
2441 rcu_read_unlock(); 2345 rcu_read_unlock();
2442 goto again; 2346 goto again;
2443 } 2347 }
@@ -2449,7 +2353,7 @@ again:
2449 2353
2450 /* If killed, bypass charge */ 2354 /* If killed, bypass charge */
2451 if (fatal_signal_pending(current)) { 2355 if (fatal_signal_pending(current)) {
2452 css_put(&mem->css); 2356 css_put(&memcg->css);
2453 goto bypass; 2357 goto bypass;
2454 } 2358 }
2455 2359
@@ -2459,43 +2363,43 @@ again:
2459 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2363 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2460 } 2364 }
2461 2365
2462 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); 2366 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check);
2463 switch (ret) { 2367 switch (ret) {
2464 case CHARGE_OK: 2368 case CHARGE_OK:
2465 break; 2369 break;
2466 case CHARGE_RETRY: /* not in OOM situation but retry */ 2370 case CHARGE_RETRY: /* not in OOM situation but retry */
2467 batch = nr_pages; 2371 batch = nr_pages;
2468 css_put(&mem->css); 2372 css_put(&memcg->css);
2469 mem = NULL; 2373 memcg = NULL;
2470 goto again; 2374 goto again;
2471 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ 2375 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2472 css_put(&mem->css); 2376 css_put(&memcg->css);
2473 goto nomem; 2377 goto nomem;
2474 case CHARGE_NOMEM: /* OOM routine works */ 2378 case CHARGE_NOMEM: /* OOM routine works */
2475 if (!oom) { 2379 if (!oom) {
2476 css_put(&mem->css); 2380 css_put(&memcg->css);
2477 goto nomem; 2381 goto nomem;
2478 } 2382 }
2479 /* If oom, we never return -ENOMEM */ 2383 /* If oom, we never return -ENOMEM */
2480 nr_oom_retries--; 2384 nr_oom_retries--;
2481 break; 2385 break;
2482 case CHARGE_OOM_DIE: /* Killed by OOM Killer */ 2386 case CHARGE_OOM_DIE: /* Killed by OOM Killer */
2483 css_put(&mem->css); 2387 css_put(&memcg->css);
2484 goto bypass; 2388 goto bypass;
2485 } 2389 }
2486 } while (ret != CHARGE_OK); 2390 } while (ret != CHARGE_OK);
2487 2391
2488 if (batch > nr_pages) 2392 if (batch > nr_pages)
2489 refill_stock(mem, batch - nr_pages); 2393 refill_stock(memcg, batch - nr_pages);
2490 css_put(&mem->css); 2394 css_put(&memcg->css);
2491done: 2395done:
2492 *memcg = mem; 2396 *ptr = memcg;
2493 return 0; 2397 return 0;
2494nomem: 2398nomem:
2495 *memcg = NULL; 2399 *ptr = NULL;
2496 return -ENOMEM; 2400 return -ENOMEM;
2497bypass: 2401bypass:
2498 *memcg = NULL; 2402 *ptr = NULL;
2499 return 0; 2403 return 0;
2500} 2404}
2501 2405
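The loop above is driven by the result codes of mem_cgroup_do_charge(): success, drop back from the batched amount to the exact request and retry, fail fast when the caller cannot block, give the OOM path a bounded number of retries, or bypass the charge entirely when the task has been killed. A simplified, self-contained sketch of that control flow; the helper, the batch size and the retry count are invented for the example:

#include <stdbool.h>

enum charge_result {
	CHARGE_OK,
	CHARGE_RETRY,		/* not an OOM situation, just try again */
	CHARGE_WOULDBLOCK,	/* caller may not sleep */
	CHARGE_NOMEM,		/* let the OOM path retry a few times */
	CHARGE_OOM_DIE,		/* current task was killed by the OOM killer */
};

/* stand-in for mem_cgroup_do_charge(); always succeeds in this sketch */
static enum charge_result do_charge(unsigned int nr_pages)
{
	(void)nr_pages;
	return CHARGE_OK;
}

int try_charge(unsigned int nr_pages, bool oom)
{
	unsigned int batch = nr_pages > 32 ? nr_pages : 32;	/* charge in batches */
	int nr_oom_retries = 5;
	enum charge_result ret;

	do {
		ret = do_charge(batch);
		switch (ret) {
		case CHARGE_OK:
			break;
		case CHARGE_RETRY:
			batch = nr_pages;	/* retry with the exact request */
			break;
		case CHARGE_WOULDBLOCK:
			return -1;
		case CHARGE_NOMEM:
			if (!oom || !nr_oom_retries--)
				return -1;
			break;
		case CHARGE_OOM_DIE:
			return 0;		/* killed: bypass the charge */
		}
	} while (ret != CHARGE_OK);

	return 0;
}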
@@ -2504,15 +2408,15 @@ bypass:
2504 * This function is for that and does the uncharge, putting the css refcnt 2408 * This function is for that and does the uncharge, putting the css refcnt
2505 * gotten by try_charge(). 2409 * gotten by try_charge().
2506 */ 2410 */
2507static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, 2411static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2508 unsigned int nr_pages) 2412 unsigned int nr_pages)
2509{ 2413{
2510 if (!mem_cgroup_is_root(mem)) { 2414 if (!mem_cgroup_is_root(memcg)) {
2511 unsigned long bytes = nr_pages * PAGE_SIZE; 2415 unsigned long bytes = nr_pages * PAGE_SIZE;
2512 2416
2513 res_counter_uncharge(&mem->res, bytes); 2417 res_counter_uncharge(&memcg->res, bytes);
2514 if (do_swap_account) 2418 if (do_swap_account)
2515 res_counter_uncharge(&mem->memsw, bytes); 2419 res_counter_uncharge(&memcg->memsw, bytes);
2516 } 2420 }
2517} 2421}
2518 2422
@@ -2537,7 +2441,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2537 2441
2538struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2442struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2539{ 2443{
2540 struct mem_cgroup *mem = NULL; 2444 struct mem_cgroup *memcg = NULL;
2541 struct page_cgroup *pc; 2445 struct page_cgroup *pc;
2542 unsigned short id; 2446 unsigned short id;
2543 swp_entry_t ent; 2447 swp_entry_t ent;
@@ -2547,23 +2451,23 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2547 pc = lookup_page_cgroup(page); 2451 pc = lookup_page_cgroup(page);
2548 lock_page_cgroup(pc); 2452 lock_page_cgroup(pc);
2549 if (PageCgroupUsed(pc)) { 2453 if (PageCgroupUsed(pc)) {
2550 mem = pc->mem_cgroup; 2454 memcg = pc->mem_cgroup;
2551 if (mem && !css_tryget(&mem->css)) 2455 if (memcg && !css_tryget(&memcg->css))
2552 mem = NULL; 2456 memcg = NULL;
2553 } else if (PageSwapCache(page)) { 2457 } else if (PageSwapCache(page)) {
2554 ent.val = page_private(page); 2458 ent.val = page_private(page);
2555 id = lookup_swap_cgroup(ent); 2459 id = lookup_swap_cgroup(ent);
2556 rcu_read_lock(); 2460 rcu_read_lock();
2557 mem = mem_cgroup_lookup(id); 2461 memcg = mem_cgroup_lookup(id);
2558 if (mem && !css_tryget(&mem->css)) 2462 if (memcg && !css_tryget(&memcg->css))
2559 mem = NULL; 2463 memcg = NULL;
2560 rcu_read_unlock(); 2464 rcu_read_unlock();
2561 } 2465 }
2562 unlock_page_cgroup(pc); 2466 unlock_page_cgroup(pc);
2563 return mem; 2467 return memcg;
2564} 2468}
2565 2469
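try_get_mem_cgroup_from_page() above finds a candidate memcg (from the page_cgroup or, for swap cache, from the swap cgroup record) and only keeps it if css_tryget() still succeeds, i.e. if the group is not already being torn down. A small sketch of that lookup-then-tryget pattern using C11 atomics; the types and function names are invented for illustration:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct ref {
	atomic_int count;	/* 0 means the object is on its way out */
};

/* take a reference only if the object is still live */
static bool tryget(struct ref *r)
{
	int old = atomic_load(&r->count);

	while (old > 0) {
		if (atomic_compare_exchange_weak(&r->count, &old, old + 1))
			return true;
	}
	return false;
}

struct ref *lookup_and_get(struct ref *candidate)
{
	if (candidate && !tryget(candidate))
		candidate = NULL;	/* found it, but it is already dying */
	return candidate;
}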
2566static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2470static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2567 struct page *page, 2471 struct page *page,
2568 unsigned int nr_pages, 2472 unsigned int nr_pages,
2569 struct page_cgroup *pc, 2473 struct page_cgroup *pc,
@@ -2572,14 +2476,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2572 lock_page_cgroup(pc); 2476 lock_page_cgroup(pc);
2573 if (unlikely(PageCgroupUsed(pc))) { 2477 if (unlikely(PageCgroupUsed(pc))) {
2574 unlock_page_cgroup(pc); 2478 unlock_page_cgroup(pc);
2575 __mem_cgroup_cancel_charge(mem, nr_pages); 2479 __mem_cgroup_cancel_charge(memcg, nr_pages);
2576 return; 2480 return;
2577 } 2481 }
2578 /* 2482 /*
2579 * we don't need page_cgroup_lock for tail pages, because they are not 2483 * we don't need page_cgroup_lock for tail pages, because they are not
2580 * accessed by any other context at this point. 2484 * accessed by any other context at this point.
2581 */ 2485 */
2582 pc->mem_cgroup = mem; 2486 pc->mem_cgroup = memcg;
2583 /* 2487 /*
2584 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2488 * We access a page_cgroup asynchronously without lock_page_cgroup().
2585 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2489 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
@@ -2602,14 +2506,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
2602 break; 2506 break;
2603 } 2507 }
2604 2508
2605 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); 2509 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages);
2606 unlock_page_cgroup(pc); 2510 unlock_page_cgroup(pc);
2607 /* 2511 /*
2608 * "charge_statistics" updated event counter. Then, check it. 2512 * "charge_statistics" updated event counter. Then, check it.
2609 * Insert the ancestor (and the ancestor's ancestors) into the softlimit RB-tree 2513 * Insert the ancestor (and the ancestor's ancestors) into the softlimit RB-tree
2610 * if they exceed the softlimit. 2514 * if they exceed the softlimit.
2611 */ 2515 */
2612 memcg_check_events(mem, page); 2516 memcg_check_events(memcg, page);
2613} 2517}
2614 2518
2615#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2519#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2796,7 +2700,7 @@ out:
2796static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 2700static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2797 gfp_t gfp_mask, enum charge_type ctype) 2701 gfp_t gfp_mask, enum charge_type ctype)
2798{ 2702{
2799 struct mem_cgroup *mem = NULL; 2703 struct mem_cgroup *memcg = NULL;
2800 unsigned int nr_pages = 1; 2704 unsigned int nr_pages = 1;
2801 struct page_cgroup *pc; 2705 struct page_cgroup *pc;
2802 bool oom = true; 2706 bool oom = true;
@@ -2815,11 +2719,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2815 pc = lookup_page_cgroup(page); 2719 pc = lookup_page_cgroup(page);
2816 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ 2720 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
2817 2721
2818 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); 2722 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
2819 if (ret || !mem) 2723 if (ret || !memcg)
2820 return ret; 2724 return ret;
2821 2725
2822 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); 2726 __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype);
2823 return 0; 2727 return 0;
2824} 2728}
2825 2729
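mem_cgroup_charge_common() above shows the two-phase structure used throughout this file: __mem_cgroup_try_charge() reserves the charge against the counters, and __mem_cgroup_commit_charge() binds it to the page_cgroup. A minimal sketch of that try/commit split with invented, user-space stand-ins for the kernel structures:

#include <stdbool.h>

struct group {
	unsigned long usage;	/* pages currently charged */
	unsigned long limit;	/* hard limit in pages */
};

struct page_info {
	struct group *owner;	/* set only once the charge is committed */
};

/* phase 1: reserve the charge against the group's limit */
static bool try_charge_group(struct group *g, unsigned long nr_pages)
{
	if (g->usage + nr_pages > g->limit)
		return false;
	g->usage += nr_pages;
	return true;
}

/* phase 2: bind the reserved charge to the page */
static void commit_charge(struct group *g, struct page_info *pi)
{
	pi->owner = g;
}

int charge_common(struct group *g, struct page_info *pi, unsigned long nr_pages)
{
	if (!try_charge_group(g, nr_pages))
		return -1;
	commit_charge(g, pi);
	return 0;
}

Splitting the reservation from the commit lets callers such as the swap-in path cancel a reservation that is never committed.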
@@ -2848,7 +2752,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2848 enum charge_type ctype); 2752 enum charge_type ctype);
2849 2753
2850static void 2754static void
2851__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, 2755__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg,
2852 enum charge_type ctype) 2756 enum charge_type ctype)
2853{ 2757{
2854 struct page_cgroup *pc = lookup_page_cgroup(page); 2758 struct page_cgroup *pc = lookup_page_cgroup(page);
@@ -2858,7 +2762,7 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
2858 * LRU. Take care of it. 2762 * LRU. Take care of it.
2859 */ 2763 */
2860 mem_cgroup_lru_del_before_commit(page); 2764 mem_cgroup_lru_del_before_commit(page);
2861 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); 2765 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
2862 mem_cgroup_lru_add_after_commit(page); 2766 mem_cgroup_lru_add_after_commit(page);
2863 return; 2767 return;
2864} 2768}
@@ -2866,44 +2770,20 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
2866int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2770int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2867 gfp_t gfp_mask) 2771 gfp_t gfp_mask)
2868{ 2772{
2869 struct mem_cgroup *mem = NULL; 2773 struct mem_cgroup *memcg = NULL;
2870 int ret; 2774 int ret;
2871 2775
2872 if (mem_cgroup_disabled()) 2776 if (mem_cgroup_disabled())
2873 return 0; 2777 return 0;
2874 if (PageCompound(page)) 2778 if (PageCompound(page))
2875 return 0; 2779 return 0;
2876 /*
2877 * Corner case handling. This is called from add_to_page_cache()
2878 * in usual. But some FS (shmem) precharges this page before calling it
2879 * and call add_to_page_cache() with GFP_NOWAIT.
2880 *
2881 * For GFP_NOWAIT case, the page may be pre-charged before calling
2882 * add_to_page_cache(). (See shmem.c) check it here and avoid to call
2883 * charge twice. (It works but has to pay a bit larger cost.)
2884 * And when the page is SwapCache, it should take swap information
2885 * into account. This is under lock_page() now.
2886 */
2887 if (!(gfp_mask & __GFP_WAIT)) {
2888 struct page_cgroup *pc;
2889
2890 pc = lookup_page_cgroup(page);
2891 if (!pc)
2892 return 0;
2893 lock_page_cgroup(pc);
2894 if (PageCgroupUsed(pc)) {
2895 unlock_page_cgroup(pc);
2896 return 0;
2897 }
2898 unlock_page_cgroup(pc);
2899 }
2900 2780
2901 if (unlikely(!mm)) 2781 if (unlikely(!mm))
2902 mm = &init_mm; 2782 mm = &init_mm;
2903 2783
2904 if (page_is_file_cache(page)) { 2784 if (page_is_file_cache(page)) {
2905 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); 2785 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true);
2906 if (ret || !mem) 2786 if (ret || !memcg)
2907 return ret; 2787 return ret;
2908 2788
2909 /* 2789 /*
@@ -2911,15 +2791,15 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2911 * put that would remove them from the LRU list, make 2791 * put that would remove them from the LRU list, make
2912 * sure that they get relinked properly. 2792 * sure that they get relinked properly.
2913 */ 2793 */
2914 __mem_cgroup_commit_charge_lrucare(page, mem, 2794 __mem_cgroup_commit_charge_lrucare(page, memcg,
2915 MEM_CGROUP_CHARGE_TYPE_CACHE); 2795 MEM_CGROUP_CHARGE_TYPE_CACHE);
2916 return ret; 2796 return ret;
2917 } 2797 }
2918 /* shmem */ 2798 /* shmem */
2919 if (PageSwapCache(page)) { 2799 if (PageSwapCache(page)) {
2920 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2800 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg);
2921 if (!ret) 2801 if (!ret)
2922 __mem_cgroup_commit_charge_swapin(page, mem, 2802 __mem_cgroup_commit_charge_swapin(page, memcg,
2923 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2803 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2924 } else 2804 } else
2925 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 2805 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
@@ -2938,7 +2818,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2938 struct page *page, 2818 struct page *page,
2939 gfp_t mask, struct mem_cgroup **ptr) 2819 gfp_t mask, struct mem_cgroup **ptr)
2940{ 2820{
2941 struct mem_cgroup *mem; 2821 struct mem_cgroup *memcg;
2942 int ret; 2822 int ret;
2943 2823
2944 *ptr = NULL; 2824 *ptr = NULL;
@@ -2956,12 +2836,12 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2956 */ 2836 */
2957 if (!PageSwapCache(page)) 2837 if (!PageSwapCache(page))
2958 goto charge_cur_mm; 2838 goto charge_cur_mm;
2959 mem = try_get_mem_cgroup_from_page(page); 2839 memcg = try_get_mem_cgroup_from_page(page);
2960 if (!mem) 2840 if (!memcg)
2961 goto charge_cur_mm; 2841 goto charge_cur_mm;
2962 *ptr = mem; 2842 *ptr = memcg;
2963 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); 2843 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
2964 css_put(&mem->css); 2844 css_put(&memcg->css);
2965 return ret; 2845 return ret;
2966charge_cur_mm: 2846charge_cur_mm:
2967 if (unlikely(!mm)) 2847 if (unlikely(!mm))
@@ -3021,16 +2901,16 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
3021 MEM_CGROUP_CHARGE_TYPE_MAPPED); 2901 MEM_CGROUP_CHARGE_TYPE_MAPPED);
3022} 2902}
3023 2903
3024void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 2904void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
3025{ 2905{
3026 if (mem_cgroup_disabled()) 2906 if (mem_cgroup_disabled())
3027 return; 2907 return;
3028 if (!mem) 2908 if (!memcg)
3029 return; 2909 return;
3030 __mem_cgroup_cancel_charge(mem, 1); 2910 __mem_cgroup_cancel_charge(memcg, 1);
3031} 2911}
3032 2912
3033static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, 2913static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
3034 unsigned int nr_pages, 2914 unsigned int nr_pages,
3035 const enum charge_type ctype) 2915 const enum charge_type ctype)
3036{ 2916{
@@ -3048,7 +2928,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
3048 * uncharges. Then, it's ok to ignore memcg's refcnt. 2928 * uncharges. Then, it's ok to ignore memcg's refcnt.
3049 */ 2929 */
3050 if (!batch->memcg) 2930 if (!batch->memcg)
3051 batch->memcg = mem; 2931 batch->memcg = memcg;
3052 /* 2932 /*
3053 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 2933 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
3054 * In those cases, all pages freed continuously can be expected to be in 2934 * In those cases, all pages freed continuously can be expected to be in
@@ -3068,7 +2948,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
3068 * merge a series of uncharges into a single uncharge of the res_counter. 2948 * merge a series of uncharges into a single uncharge of the res_counter.
3069 * If not, we uncharge the res_counter one by one. 2949 * If not, we uncharge the res_counter one by one.
3070 */ 2950 */
3071 if (batch->memcg != mem) 2951 if (batch->memcg != memcg)
3072 goto direct_uncharge; 2952 goto direct_uncharge;
3073 /* remember freed charge and uncharge it later */ 2953 /* remember freed charge and uncharge it later */
3074 batch->nr_pages++; 2954 batch->nr_pages++;
@@ -3076,11 +2956,11 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
3076 batch->memsw_nr_pages++; 2956 batch->memsw_nr_pages++;
3077 return; 2957 return;
3078direct_uncharge: 2958direct_uncharge:
3079 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); 2959 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE);
3080 if (uncharge_memsw) 2960 if (uncharge_memsw)
3081 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); 2961 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE);
3082 if (unlikely(batch->memcg != mem)) 2962 if (unlikely(batch->memcg != memcg))
3083 memcg_oom_recover(mem); 2963 memcg_oom_recover(memcg);
3084 return; 2964 return;
3085} 2965}
3086 2966
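mem_cgroup_do_uncharge() above only batches when the page belongs to the same memcg the current batch was opened for; any other group is uncharged directly, so a whole truncate or munmap collapses into a single res_counter update at the end. A short sketch of that accumulate-or-go-direct idea; the types are invented stand-ins, not the kernel structures:

#include <stddef.h>

struct group {
	unsigned long usage;		/* pages currently charged */
};

struct uncharge_batch {
	struct group *target;		/* group this batch accumulates for */
	unsigned long nr_pages;
};

void uncharge(struct group *g, unsigned long nr_pages,
	      struct uncharge_batch *batch)
{
	if (batch) {
		if (!batch->target)
			batch->target = g;	/* first uncharge opens the batch */
		if (batch->target == g) {
			batch->nr_pages += nr_pages;
			return;			/* flushed later in one go */
		}
	}
	g->usage -= nr_pages;			/* direct uncharge */
}

void uncharge_batch_flush(struct uncharge_batch *batch)
{
	if (batch->target && batch->nr_pages)
		batch->target->usage -= batch->nr_pages;
	batch->target = NULL;
	batch->nr_pages = 0;
}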
@@ -3090,7 +2970,7 @@ direct_uncharge:
3090static struct mem_cgroup * 2970static struct mem_cgroup *
3091__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 2971__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3092{ 2972{
3093 struct mem_cgroup *mem = NULL; 2973 struct mem_cgroup *memcg = NULL;
3094 unsigned int nr_pages = 1; 2974 unsigned int nr_pages = 1;
3095 struct page_cgroup *pc; 2975 struct page_cgroup *pc;
3096 2976
@@ -3113,7 +2993,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3113 2993
3114 lock_page_cgroup(pc); 2994 lock_page_cgroup(pc);
3115 2995
3116 mem = pc->mem_cgroup; 2996 memcg = pc->mem_cgroup;
3117 2997
3118 if (!PageCgroupUsed(pc)) 2998 if (!PageCgroupUsed(pc))
3119 goto unlock_out; 2999 goto unlock_out;
@@ -3136,7 +3016,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3136 break; 3016 break;
3137 } 3017 }
3138 3018
3139 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); 3019 mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages);
3140 3020
3141 ClearPageCgroupUsed(pc); 3021 ClearPageCgroupUsed(pc);
3142 /* 3022 /*
@@ -3148,18 +3028,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
3148 3028
3149 unlock_page_cgroup(pc); 3029 unlock_page_cgroup(pc);
3150 /* 3030 /*
3151 * even after unlock, we have mem->res.usage here and this memcg 3031 * even after unlock, we have memcg->res.usage here and this memcg
3152 * will never be freed. 3032 * will never be freed.
3153 */ 3033 */
3154 memcg_check_events(mem, page); 3034 memcg_check_events(memcg, page);
3155 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 3035 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
3156 mem_cgroup_swap_statistics(mem, true); 3036 mem_cgroup_swap_statistics(memcg, true);
3157 mem_cgroup_get(mem); 3037 mem_cgroup_get(memcg);
3158 } 3038 }
3159 if (!mem_cgroup_is_root(mem)) 3039 if (!mem_cgroup_is_root(memcg))
3160 mem_cgroup_do_uncharge(mem, nr_pages, ctype); 3040 mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
3161 3041
3162 return mem; 3042 return memcg;
3163 3043
3164unlock_out: 3044unlock_out:
3165 unlock_page_cgroup(pc); 3045 unlock_page_cgroup(pc);
@@ -3349,7 +3229,7 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3349int mem_cgroup_prepare_migration(struct page *page, 3229int mem_cgroup_prepare_migration(struct page *page,
3350 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) 3230 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
3351{ 3231{
3352 struct mem_cgroup *mem = NULL; 3232 struct mem_cgroup *memcg = NULL;
3353 struct page_cgroup *pc; 3233 struct page_cgroup *pc;
3354 enum charge_type ctype; 3234 enum charge_type ctype;
3355 int ret = 0; 3235 int ret = 0;
@@ -3363,8 +3243,8 @@ int mem_cgroup_prepare_migration(struct page *page,
3363 pc = lookup_page_cgroup(page); 3243 pc = lookup_page_cgroup(page);
3364 lock_page_cgroup(pc); 3244 lock_page_cgroup(pc);
3365 if (PageCgroupUsed(pc)) { 3245 if (PageCgroupUsed(pc)) {
3366 mem = pc->mem_cgroup; 3246 memcg = pc->mem_cgroup;
3367 css_get(&mem->css); 3247 css_get(&memcg->css);
3368 /* 3248 /*
3369 * At migrating an anonymous page, its mapcount goes down 3249 * At migrating an anonymous page, its mapcount goes down
3370 * to 0 and uncharge() will be called. But, even if it's fully 3250 * to 0 and uncharge() will be called. But, even if it's fully
@@ -3402,12 +3282,12 @@ int mem_cgroup_prepare_migration(struct page *page,
3402 * If the page is not charged at this point, 3282 * If the page is not charged at this point,
3403 * we return here. 3283 * we return here.
3404 */ 3284 */
3405 if (!mem) 3285 if (!memcg)
3406 return 0; 3286 return 0;
3407 3287
3408 *ptr = mem; 3288 *ptr = memcg;
3409 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); 3289 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
3410 css_put(&mem->css);/* drop extra refcnt */ 3290 css_put(&memcg->css);/* drop extra refcnt */
3411 if (ret || *ptr == NULL) { 3291 if (ret || *ptr == NULL) {
3412 if (PageAnon(page)) { 3292 if (PageAnon(page)) {
3413 lock_page_cgroup(pc); 3293 lock_page_cgroup(pc);
@@ -3433,21 +3313,21 @@ int mem_cgroup_prepare_migration(struct page *page,
3433 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3313 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
3434 else 3314 else
3435 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3315 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3436 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); 3316 __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype);
3437 return ret; 3317 return ret;
3438} 3318}
3439 3319
3440/* remove redundant charge if migration failed*/ 3320/* remove redundant charge if migration failed*/
3441void mem_cgroup_end_migration(struct mem_cgroup *mem, 3321void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3442 struct page *oldpage, struct page *newpage, bool migration_ok) 3322 struct page *oldpage, struct page *newpage, bool migration_ok)
3443{ 3323{
3444 struct page *used, *unused; 3324 struct page *used, *unused;
3445 struct page_cgroup *pc; 3325 struct page_cgroup *pc;
3446 3326
3447 if (!mem) 3327 if (!memcg)
3448 return; 3328 return;
3449 /* blocks rmdir() */ 3329 /* blocks rmdir() */
3450 cgroup_exclude_rmdir(&mem->css); 3330 cgroup_exclude_rmdir(&memcg->css);
3451 if (!migration_ok) { 3331 if (!migration_ok) {
3452 used = oldpage; 3332 used = oldpage;
3453 unused = newpage; 3333 unused = newpage;
@@ -3483,32 +3363,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
3483 * So, rmdir()->pre_destroy() can be called while we do this charge. 3363 * So, rmdir()->pre_destroy() can be called while we do this charge.
3484 * In that case, we need to call pre_destroy() again. check it here. 3364 * In that case, we need to call pre_destroy() again. check it here.
3485 */ 3365 */
3486 cgroup_release_and_wakeup_rmdir(&mem->css); 3366 cgroup_release_and_wakeup_rmdir(&memcg->css);
3487}
3488
3489/*
3490 * A call to try to shrink memory usage on charge failure at shmem's swapin.
3491 * Calling hierarchical_reclaim is not enough because we should update
3492 * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
3493 * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
3494 * not from the memcg which this page would be charged to.
3495 * try_charge_swapin does all of these works properly.
3496 */
3497int mem_cgroup_shmem_charge_fallback(struct page *page,
3498 struct mm_struct *mm,
3499 gfp_t gfp_mask)
3500{
3501 struct mem_cgroup *mem;
3502 int ret;
3503
3504 if (mem_cgroup_disabled())
3505 return 0;
3506
3507 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
3508 if (!ret)
3509 mem_cgroup_cancel_charge_swapin(mem); /* it does !mem check */
3510
3511 return ret;
3512} 3367}
3513 3368
3514#ifdef CONFIG_DEBUG_VM 3369#ifdef CONFIG_DEBUG_VM
@@ -3587,7 +3442,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
3587 /* 3442 /*
3588 * Rather than hide all of this in some function, I do it in an 3443 * Rather than hide all of this in some function, I do it in an
3589 * open-coded manner so you can see what it really does. 3444 * open-coded manner so you can see what it really does.
3590 * We have to guarantee mem->res.limit < mem->memsw.limit. 3445 * We have to guarantee memcg->res.limit < memcg->memsw.limit.
3591 */ 3446 */
3592 mutex_lock(&set_limit_mutex); 3447 mutex_lock(&set_limit_mutex);
3593 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 3448 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
@@ -3649,7 +3504,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
3649 /* 3504 /*
3650 * Rather than hide all of this in some function, I do it in an 3505 * Rather than hide all of this in some function, I do it in an
3651 * open-coded manner so you can see what it really does. 3506 * open-coded manner so you can see what it really does.
3652 * We have to guarantee mem->res.limit < mem->memsw.limit. 3507 * We have to guarantee memcg->res.limit < memcg->memsw.limit.
3653 */ 3508 */
3654 mutex_lock(&set_limit_mutex); 3509 mutex_lock(&set_limit_mutex);
3655 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 3510 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
@@ -3787,7 +3642,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
3787 * This routine traverses the page_cgroups in the given list and drops them all. 3642 * This routine traverses the page_cgroups in the given list and drops them all.
3788 * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup. 3643 * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup.
3789 */ 3644 */
3790static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, 3645static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
3791 int node, int zid, enum lru_list lru) 3646 int node, int zid, enum lru_list lru)
3792{ 3647{
3793 struct zone *zone; 3648 struct zone *zone;
@@ -3798,7 +3653,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3798 int ret = 0; 3653 int ret = 0;
3799 3654
3800 zone = &NODE_DATA(node)->node_zones[zid]; 3655 zone = &NODE_DATA(node)->node_zones[zid];
3801 mz = mem_cgroup_zoneinfo(mem, node, zid); 3656 mz = mem_cgroup_zoneinfo(memcg, node, zid);
3802 list = &mz->lists[lru]; 3657 list = &mz->lists[lru];
3803 3658
3804 loop = MEM_CGROUP_ZSTAT(mz, lru); 3659 loop = MEM_CGROUP_ZSTAT(mz, lru);
@@ -3825,7 +3680,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3825 3680
3826 page = lookup_cgroup_page(pc); 3681 page = lookup_cgroup_page(pc);
3827 3682
3828 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); 3683 ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL);
3829 if (ret == -ENOMEM) 3684 if (ret == -ENOMEM)
3830 break; 3685 break;
3831 3686
@@ -3846,14 +3701,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
3846 * make the mem_cgroup's charge 0 if there is no task. 3701 * make the mem_cgroup's charge 0 if there is no task.
3847 * This enables deleting this mem_cgroup. 3702 * This enables deleting this mem_cgroup.
3848 */ 3703 */
3849static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) 3704static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all)
3850{ 3705{
3851 int ret; 3706 int ret;
3852 int node, zid, shrink; 3707 int node, zid, shrink;
3853 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 3708 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
3854 struct cgroup *cgrp = mem->css.cgroup; 3709 struct cgroup *cgrp = memcg->css.cgroup;
3855 3710
3856 css_get(&mem->css); 3711 css_get(&memcg->css);
3857 3712
3858 shrink = 0; 3713 shrink = 0;
3859 /* should free all ? */ 3714 /* should free all ? */
@@ -3869,14 +3724,14 @@ move_account:
3869 goto out; 3724 goto out;
3870 /* This is for making all *used* pages to be on LRU. */ 3725 /* This is for making all *used* pages to be on LRU. */
3871 lru_add_drain_all(); 3726 lru_add_drain_all();
3872 drain_all_stock_sync(mem); 3727 drain_all_stock_sync(memcg);
3873 ret = 0; 3728 ret = 0;
3874 mem_cgroup_start_move(mem); 3729 mem_cgroup_start_move(memcg);
3875 for_each_node_state(node, N_HIGH_MEMORY) { 3730 for_each_node_state(node, N_HIGH_MEMORY) {
3876 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3731 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3877 enum lru_list l; 3732 enum lru_list l;
3878 for_each_lru(l) { 3733 for_each_lru(l) {
3879 ret = mem_cgroup_force_empty_list(mem, 3734 ret = mem_cgroup_force_empty_list(memcg,
3880 node, zid, l); 3735 node, zid, l);
3881 if (ret) 3736 if (ret)
3882 break; 3737 break;
@@ -3885,16 +3740,16 @@ move_account:
3885 if (ret) 3740 if (ret)
3886 break; 3741 break;
3887 } 3742 }
3888 mem_cgroup_end_move(mem); 3743 mem_cgroup_end_move(memcg);
3889 memcg_oom_recover(mem); 3744 memcg_oom_recover(memcg);
3890 /* it seems parent cgroup doesn't have enough mem */ 3745 /* it seems parent cgroup doesn't have enough mem */
3891 if (ret == -ENOMEM) 3746 if (ret == -ENOMEM)
3892 goto try_to_free; 3747 goto try_to_free;
3893 cond_resched(); 3748 cond_resched();
3894 /* "ret" should also be checked to ensure all lists are empty. */ 3749 /* "ret" should also be checked to ensure all lists are empty. */
3895 } while (mem->res.usage > 0 || ret); 3750 } while (memcg->res.usage > 0 || ret);
3896out: 3751out:
3897 css_put(&mem->css); 3752 css_put(&memcg->css);
3898 return ret; 3753 return ret;
3899 3754
3900try_to_free: 3755try_to_free:
@@ -3907,19 +3762,15 @@ try_to_free:
3907 lru_add_drain_all(); 3762 lru_add_drain_all();
3908 /* try to free all pages in this cgroup */ 3763 /* try to free all pages in this cgroup */
3909 shrink = 1; 3764 shrink = 1;
3910 while (nr_retries && mem->res.usage > 0) { 3765 while (nr_retries && memcg->res.usage > 0) {
3911 struct memcg_scanrecord rec;
3912 int progress; 3766 int progress;
3913 3767
3914 if (signal_pending(current)) { 3768 if (signal_pending(current)) {
3915 ret = -EINTR; 3769 ret = -EINTR;
3916 goto out; 3770 goto out;
3917 } 3771 }
3918 rec.context = SCAN_BY_SHRINK; 3772 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
3919 rec.mem = mem; 3773 false);
3920 rec.root = mem;
3921 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
3922 false, &rec);
3923 if (!progress) { 3774 if (!progress) {
3924 nr_retries--; 3775 nr_retries--;
3925 /* maybe some writeback is necessary */ 3776 /* maybe some writeback is necessary */
@@ -3947,12 +3798,12 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3947 u64 val) 3798 u64 val)
3948{ 3799{
3949 int retval = 0; 3800 int retval = 0;
3950 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3801 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3951 struct cgroup *parent = cont->parent; 3802 struct cgroup *parent = cont->parent;
3952 struct mem_cgroup *parent_mem = NULL; 3803 struct mem_cgroup *parent_memcg = NULL;
3953 3804
3954 if (parent) 3805 if (parent)
3955 parent_mem = mem_cgroup_from_cont(parent); 3806 parent_memcg = mem_cgroup_from_cont(parent);
3956 3807
3957 cgroup_lock(); 3808 cgroup_lock();
3958 /* 3809 /*
@@ -3963,10 +3814,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3963 * For the root cgroup, parent_mem is NULL; we allow the value to be 3814 * For the root cgroup, parent_mem is NULL; we allow the value to be
3964 * set if there are no children. 3815 * set if there are no children.
3965 */ 3816 */
3966 if ((!parent_mem || !parent_mem->use_hierarchy) && 3817 if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
3967 (val == 1 || val == 0)) { 3818 (val == 1 || val == 0)) {
3968 if (list_empty(&cont->children)) 3819 if (list_empty(&cont->children))
3969 mem->use_hierarchy = val; 3820 memcg->use_hierarchy = val;
3970 else 3821 else
3971 retval = -EBUSY; 3822 retval = -EBUSY;
3972 } else 3823 } else
@@ -3977,14 +3828,14 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3977} 3828}
3978 3829
3979 3830
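mem_cgroup_hierarchy_write() above only lets use_hierarchy change at the top of a hierarchy (no parent, or a parent that does not use hierarchy itself) and only while the cgroup has no children. A compact sketch of that validation rule; the struct and field names are invented:

#include <stdbool.h>

struct cg {
	bool use_hierarchy;
	bool has_children;
	struct cg *parent;
};

int set_use_hierarchy(struct cg *cg, bool val)
{
	/* a parent that already uses hierarchy fixes the value for us */
	if (cg->parent && cg->parent->use_hierarchy)
		return -1;
	/* flipping the layout under existing children would be inconsistent */
	if (cg->has_children)
		return -1;
	cg->use_hierarchy = val;
	return 0;
}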
3980static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, 3831static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
3981 enum mem_cgroup_stat_index idx) 3832 enum mem_cgroup_stat_index idx)
3982{ 3833{
3983 struct mem_cgroup *iter; 3834 struct mem_cgroup *iter;
3984 long val = 0; 3835 long val = 0;
3985 3836
3986 /* Per-cpu values can be negative, use a signed accumulator */ 3837 /* Per-cpu values can be negative, use a signed accumulator */
3987 for_each_mem_cgroup_tree(iter, mem) 3838 for_each_mem_cgroup_tree(iter, memcg)
3988 val += mem_cgroup_read_stat(iter, idx); 3839 val += mem_cgroup_read_stat(iter, idx);
3989 3840
3990 if (val < 0) /* race ? */ 3841 if (val < 0) /* race ? */
@@ -3992,29 +3843,29 @@ static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
3992 return val; 3843 return val;
3993} 3844}
3994 3845
3995static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3846static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3996{ 3847{
3997 u64 val; 3848 u64 val;
3998 3849
3999 if (!mem_cgroup_is_root(mem)) { 3850 if (!mem_cgroup_is_root(memcg)) {
4000 if (!swap) 3851 if (!swap)
4001 return res_counter_read_u64(&mem->res, RES_USAGE); 3852 return res_counter_read_u64(&memcg->res, RES_USAGE);
4002 else 3853 else
4003 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3854 return res_counter_read_u64(&memcg->memsw, RES_USAGE);
4004 } 3855 }
4005 3856
4006 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); 3857 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
4007 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); 3858 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
4008 3859
4009 if (swap) 3860 if (swap)
4010 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 3861 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
4011 3862
4012 return val << PAGE_SHIFT; 3863 return val << PAGE_SHIFT;
4013} 3864}
4014 3865
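For the root cgroup the usage above cannot be read from a res_counter, so it is rebuilt from the recursive cache/RSS (and optionally swap) statistics and converted from pages to bytes with the PAGE_SHIFT shift. A tiny worked example of that conversion, assuming 4 KiB pages purely for illustration:

#define EXAMPLE_PAGE_SHIFT 12	/* 4 KiB pages, assumed for the example only */

unsigned long long usage_in_bytes(unsigned long long cache_pages,
				  unsigned long long rss_pages,
				  unsigned long long swap_pages,
				  int include_swap)
{
	unsigned long long pages = cache_pages + rss_pages;

	if (include_swap)
		pages += swap_pages;

	return pages << EXAMPLE_PAGE_SHIFT;	/* stats are in pages, report bytes */
}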
4015static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 3866static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
4016{ 3867{
4017 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 3868 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4018 u64 val; 3869 u64 val;
4019 int type, name; 3870 int type, name;
4020 3871
@@ -4023,15 +3874,15 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
4023 switch (type) { 3874 switch (type) {
4024 case _MEM: 3875 case _MEM:
4025 if (name == RES_USAGE) 3876 if (name == RES_USAGE)
4026 val = mem_cgroup_usage(mem, false); 3877 val = mem_cgroup_usage(memcg, false);
4027 else 3878 else
4028 val = res_counter_read_u64(&mem->res, name); 3879 val = res_counter_read_u64(&memcg->res, name);
4029 break; 3880 break;
4030 case _MEMSWAP: 3881 case _MEMSWAP:
4031 if (name == RES_USAGE) 3882 if (name == RES_USAGE)
4032 val = mem_cgroup_usage(mem, true); 3883 val = mem_cgroup_usage(memcg, true);
4033 else 3884 else
4034 val = res_counter_read_u64(&mem->memsw, name); 3885 val = res_counter_read_u64(&memcg->memsw, name);
4035 break; 3886 break;
4036 default: 3887 default:
4037 BUG(); 3888 BUG();
@@ -4119,24 +3970,24 @@ out:
4119 3970
4120static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3971static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
4121{ 3972{
4122 struct mem_cgroup *mem; 3973 struct mem_cgroup *memcg;
4123 int type, name; 3974 int type, name;
4124 3975
4125 mem = mem_cgroup_from_cont(cont); 3976 memcg = mem_cgroup_from_cont(cont);
4126 type = MEMFILE_TYPE(event); 3977 type = MEMFILE_TYPE(event);
4127 name = MEMFILE_ATTR(event); 3978 name = MEMFILE_ATTR(event);
4128 switch (name) { 3979 switch (name) {
4129 case RES_MAX_USAGE: 3980 case RES_MAX_USAGE:
4130 if (type == _MEM) 3981 if (type == _MEM)
4131 res_counter_reset_max(&mem->res); 3982 res_counter_reset_max(&memcg->res);
4132 else 3983 else
4133 res_counter_reset_max(&mem->memsw); 3984 res_counter_reset_max(&memcg->memsw);
4134 break; 3985 break;
4135 case RES_FAILCNT: 3986 case RES_FAILCNT:
4136 if (type == _MEM) 3987 if (type == _MEM)
4137 res_counter_reset_failcnt(&mem->res); 3988 res_counter_reset_failcnt(&memcg->res);
4138 else 3989 else
4139 res_counter_reset_failcnt(&mem->memsw); 3990 res_counter_reset_failcnt(&memcg->memsw);
4140 break; 3991 break;
4141 } 3992 }
4142 3993
@@ -4153,7 +4004,7 @@ static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp,
4153static int mem_cgroup_move_charge_write(struct cgroup *cgrp, 4004static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4154 struct cftype *cft, u64 val) 4005 struct cftype *cft, u64 val)
4155{ 4006{
4156 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4007 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4157 4008
4158 if (val >= (1 << NR_MOVE_TYPE)) 4009 if (val >= (1 << NR_MOVE_TYPE))
4159 return -EINVAL; 4010 return -EINVAL;
@@ -4163,7 +4014,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4163 * inconsistent. 4014 * inconsistent.
4164 */ 4015 */
4165 cgroup_lock(); 4016 cgroup_lock();
4166 mem->move_charge_at_immigrate = val; 4017 memcg->move_charge_at_immigrate = val;
4167 cgroup_unlock(); 4018 cgroup_unlock();
4168 4019
4169 return 0; 4020 return 0;
@@ -4220,49 +4071,49 @@ struct {
4220 4071
4221 4072
4222static void 4073static void
4223mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 4074mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4224{ 4075{
4225 s64 val; 4076 s64 val;
4226 4077
4227 /* per cpu stat */ 4078 /* per cpu stat */
4228 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); 4079 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
4229 s->stat[MCS_CACHE] += val * PAGE_SIZE; 4080 s->stat[MCS_CACHE] += val * PAGE_SIZE;
4230 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); 4081 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
4231 s->stat[MCS_RSS] += val * PAGE_SIZE; 4082 s->stat[MCS_RSS] += val * PAGE_SIZE;
4232 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 4083 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
4233 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 4084 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
4234 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); 4085 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);
4235 s->stat[MCS_PGPGIN] += val; 4086 s->stat[MCS_PGPGIN] += val;
4236 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); 4087 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);
4237 s->stat[MCS_PGPGOUT] += val; 4088 s->stat[MCS_PGPGOUT] += val;
4238 if (do_swap_account) { 4089 if (do_swap_account) {
4239 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 4090 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
4240 s->stat[MCS_SWAP] += val * PAGE_SIZE; 4091 s->stat[MCS_SWAP] += val * PAGE_SIZE;
4241 } 4092 }
4242 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); 4093 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT);
4243 s->stat[MCS_PGFAULT] += val; 4094 s->stat[MCS_PGFAULT] += val;
4244 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); 4095 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
4245 s->stat[MCS_PGMAJFAULT] += val; 4096 s->stat[MCS_PGMAJFAULT] += val;
4246 4097
4247 /* per zone stat */ 4098 /* per zone stat */
4248 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON)); 4099 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
4249 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; 4100 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
4250 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON)); 4101 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
4251 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; 4102 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
4252 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE)); 4103 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
4253 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; 4104 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
4254 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE)); 4105 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
4255 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 4106 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
4256 val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE)); 4107 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4257 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 4108 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
4258} 4109}
4259 4110
4260static void 4111static void
4261mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 4112mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4262{ 4113{
4263 struct mem_cgroup *iter; 4114 struct mem_cgroup *iter;
4264 4115
4265 for_each_mem_cgroup_tree(iter, mem) 4116 for_each_mem_cgroup_tree(iter, memcg)
4266 mem_cgroup_get_local_stat(iter, s); 4117 mem_cgroup_get_local_stat(iter, s);
4267} 4118}
4268 4119
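Both mem_cgroup_recursive_stat() and mem_cgroup_get_total_stat() walk the whole memcg subtree with for_each_mem_cgroup_tree() and sum the per-group values, clamping a negative total (possible because per-cpu deltas race) to zero. A self-contained sketch of that traversal over an invented tree type:

struct node {
	long stat;			/* per-group value, may go negative */
	struct node *first_child;
	struct node *next_sibling;
};

/* sum one statistic over the subtree rooted at n */
static long subtree_stat(const struct node *n)
{
	long val = n->stat;
	const struct node *c;

	for (c = n->first_child; c; c = c->next_sibling)
		val += subtree_stat(c);

	return val;
}

long recursive_stat(const struct node *root)
{
	long val = subtree_stat(root);

	return val < 0 ? 0 : val;	/* racy per-cpu sums can dip below zero */
}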
@@ -4348,8 +4199,6 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4348 } 4199 }
4349 4200
4350#ifdef CONFIG_DEBUG_VM 4201#ifdef CONFIG_DEBUG_VM
4351 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
4352
4353 { 4202 {
4354 int nid, zid; 4203 int nid, zid;
4355 struct mem_cgroup_per_zone *mz; 4204 struct mem_cgroup_per_zone *mz;
@@ -4486,20 +4335,20 @@ static int compare_thresholds(const void *a, const void *b)
4486 return _a->threshold - _b->threshold; 4335 return _a->threshold - _b->threshold;
4487} 4336}
4488 4337
4489static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) 4338static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4490{ 4339{
4491 struct mem_cgroup_eventfd_list *ev; 4340 struct mem_cgroup_eventfd_list *ev;
4492 4341
4493 list_for_each_entry(ev, &mem->oom_notify, list) 4342 list_for_each_entry(ev, &memcg->oom_notify, list)
4494 eventfd_signal(ev->eventfd, 1); 4343 eventfd_signal(ev->eventfd, 1);
4495 return 0; 4344 return 0;
4496} 4345}
4497 4346
4498static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 4347static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4499{ 4348{
4500 struct mem_cgroup *iter; 4349 struct mem_cgroup *iter;
4501 4350
4502 for_each_mem_cgroup_tree(iter, mem) 4351 for_each_mem_cgroup_tree(iter, memcg)
4503 mem_cgroup_oom_notify_cb(iter); 4352 mem_cgroup_oom_notify_cb(iter);
4504} 4353}
4505 4354
@@ -4689,7 +4538,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
4689static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, 4538static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4690 struct cftype *cft, struct eventfd_ctx *eventfd) 4539 struct cftype *cft, struct eventfd_ctx *eventfd)
4691{ 4540{
4692 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4541 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4693 struct mem_cgroup_eventfd_list *ev, *tmp; 4542 struct mem_cgroup_eventfd_list *ev, *tmp;
4694 int type = MEMFILE_TYPE(cft->private); 4543 int type = MEMFILE_TYPE(cft->private);
4695 4544
@@ -4697,7 +4546,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4697 4546
4698 spin_lock(&memcg_oom_lock); 4547 spin_lock(&memcg_oom_lock);
4699 4548
4700 list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { 4549 list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4701 if (ev->eventfd == eventfd) { 4550 if (ev->eventfd == eventfd) {
4702 list_del(&ev->list); 4551 list_del(&ev->list);
4703 kfree(ev); 4552 kfree(ev);
@@ -4710,11 +4559,11 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
4710static int mem_cgroup_oom_control_read(struct cgroup *cgrp, 4559static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4711 struct cftype *cft, struct cgroup_map_cb *cb) 4560 struct cftype *cft, struct cgroup_map_cb *cb)
4712{ 4561{
4713 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4562 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4714 4563
4715 cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); 4564 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
4716 4565
4717 if (atomic_read(&mem->under_oom)) 4566 if (atomic_read(&memcg->under_oom))
4718 cb->fill(cb, "under_oom", 1); 4567 cb->fill(cb, "under_oom", 1);
4719 else 4568 else
4720 cb->fill(cb, "under_oom", 0); 4569 cb->fill(cb, "under_oom", 0);
@@ -4724,7 +4573,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
4724static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 4573static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4725 struct cftype *cft, u64 val) 4574 struct cftype *cft, u64 val)
4726{ 4575{
4727 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); 4576 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
4728 struct mem_cgroup *parent; 4577 struct mem_cgroup *parent;
4729 4578
4730 /* cannot set to root cgroup and only 0 and 1 are allowed */ 4579 /* cannot set to root cgroup and only 0 and 1 are allowed */
@@ -4736,13 +4585,13 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4736 cgroup_lock(); 4585 cgroup_lock();
4737 /* oom-kill-disable is a flag for subhierarchy. */ 4586 /* oom-kill-disable is a flag for subhierarchy. */
4738 if ((parent->use_hierarchy) || 4587 if ((parent->use_hierarchy) ||
4739 (mem->use_hierarchy && !list_empty(&cgrp->children))) { 4588 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
4740 cgroup_unlock(); 4589 cgroup_unlock();
4741 return -EINVAL; 4590 return -EINVAL;
4742 } 4591 }
4743 mem->oom_kill_disable = val; 4592 memcg->oom_kill_disable = val;
4744 if (!val) 4593 if (!val)
4745 memcg_oom_recover(mem); 4594 memcg_oom_recover(memcg);
4746 cgroup_unlock(); 4595 cgroup_unlock();
4747 return 0; 4596 return 0;
4748} 4597}
@@ -4763,54 +4612,6 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4763} 4612}
4764#endif /* CONFIG_NUMA */ 4613#endif /* CONFIG_NUMA */
4765 4614
4766static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp,
4767 struct cftype *cft,
4768 struct cgroup_map_cb *cb)
4769{
4770 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4771 char string[64];
4772 int i;
4773
4774 for (i = 0; i < NR_SCANSTATS; i++) {
4775 strcpy(string, scanstat_string[i]);
4776 strcat(string, SCANSTAT_WORD_LIMIT);
4777 cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]);
4778 }
4779
4780 for (i = 0; i < NR_SCANSTATS; i++) {
4781 strcpy(string, scanstat_string[i]);
4782 strcat(string, SCANSTAT_WORD_SYSTEM);
4783 cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]);
4784 }
4785
4786 for (i = 0; i < NR_SCANSTATS; i++) {
4787 strcpy(string, scanstat_string[i]);
4788 strcat(string, SCANSTAT_WORD_LIMIT);
4789 strcat(string, SCANSTAT_WORD_HIERARCHY);
4790 cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]);
4791 }
4792 for (i = 0; i < NR_SCANSTATS; i++) {
4793 strcpy(string, scanstat_string[i]);
4794 strcat(string, SCANSTAT_WORD_SYSTEM);
4795 strcat(string, SCANSTAT_WORD_HIERARCHY);
4796 cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]);
4797 }
4798 return 0;
4799}
4800
4801static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp,
4802 unsigned int event)
4803{
4804 struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp);
4805
4806 spin_lock(&mem->scanstat.lock);
4807 memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats));
4808 memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats));
4809 spin_unlock(&mem->scanstat.lock);
4810 return 0;
4811}
4812
4813
4814static struct cftype mem_cgroup_files[] = { 4615static struct cftype mem_cgroup_files[] = {
4815 { 4616 {
4816 .name = "usage_in_bytes", 4617 .name = "usage_in_bytes",
@@ -4881,11 +4682,6 @@ static struct cftype mem_cgroup_files[] = {
4881 .mode = S_IRUGO, 4682 .mode = S_IRUGO,
4882 }, 4683 },
4883#endif 4684#endif
4884 {
4885 .name = "vmscan_stat",
4886 .read_map = mem_cgroup_vmscan_stat_read,
4887 .trigger = mem_cgroup_reset_vmscan_stat,
4888 },
4889}; 4685};
4890 4686
4891#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4687#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -4931,7 +4727,7 @@ static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4931} 4727}
4932#endif 4728#endif
4933 4729
4934static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4730static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4935{ 4731{
4936 struct mem_cgroup_per_node *pn; 4732 struct mem_cgroup_per_node *pn;
4937 struct mem_cgroup_per_zone *mz; 4733 struct mem_cgroup_per_zone *mz;
@@ -4951,21 +4747,21 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
4951 if (!pn) 4747 if (!pn)
4952 return 1; 4748 return 1;
4953 4749
4954 mem->info.nodeinfo[node] = pn;
4955 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4750 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4956 mz = &pn->zoneinfo[zone]; 4751 mz = &pn->zoneinfo[zone];
4957 for_each_lru(l) 4752 for_each_lru(l)
4958 INIT_LIST_HEAD(&mz->lists[l]); 4753 INIT_LIST_HEAD(&mz->lists[l]);
4959 mz->usage_in_excess = 0; 4754 mz->usage_in_excess = 0;
4960 mz->on_tree = false; 4755 mz->on_tree = false;
4961 mz->mem = mem; 4756 mz->mem = memcg;
4962 } 4757 }
4758 memcg->info.nodeinfo[node] = pn;
4963 return 0; 4759 return 0;
4964} 4760}
4965 4761
4966static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 4762static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4967{ 4763{
4968 kfree(mem->info.nodeinfo[node]); 4764 kfree(memcg->info.nodeinfo[node]);
4969} 4765}
4970 4766
4971static struct mem_cgroup *mem_cgroup_alloc(void) 4767static struct mem_cgroup *mem_cgroup_alloc(void)
@@ -5007,51 +4803,51 @@ out_free:
5007 * Removal of cgroup itself succeeds regardless of refs from swap. 4803 * Removal of cgroup itself succeeds regardless of refs from swap.
5008 */ 4804 */
5009 4805
5010static void __mem_cgroup_free(struct mem_cgroup *mem) 4806static void __mem_cgroup_free(struct mem_cgroup *memcg)
5011{ 4807{
5012 int node; 4808 int node;
5013 4809
5014 mem_cgroup_remove_from_trees(mem); 4810 mem_cgroup_remove_from_trees(memcg);
5015 free_css_id(&mem_cgroup_subsys, &mem->css); 4811 free_css_id(&mem_cgroup_subsys, &memcg->css);
5016 4812
5017 for_each_node_state(node, N_POSSIBLE) 4813 for_each_node_state(node, N_POSSIBLE)
5018 free_mem_cgroup_per_zone_info(mem, node); 4814 free_mem_cgroup_per_zone_info(memcg, node);
5019 4815
5020 free_percpu(mem->stat); 4816 free_percpu(memcg->stat);
5021 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4817 if (sizeof(struct mem_cgroup) < PAGE_SIZE)
5022 kfree(mem); 4818 kfree(memcg);
5023 else 4819 else
5024 vfree(mem); 4820 vfree(memcg);
5025} 4821}
5026 4822
5027static void mem_cgroup_get(struct mem_cgroup *mem) 4823static void mem_cgroup_get(struct mem_cgroup *memcg)
5028{ 4824{
5029 atomic_inc(&mem->refcnt); 4825 atomic_inc(&memcg->refcnt);
5030} 4826}
5031 4827
5032static void __mem_cgroup_put(struct mem_cgroup *mem, int count) 4828static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
5033{ 4829{
5034 if (atomic_sub_and_test(count, &mem->refcnt)) { 4830 if (atomic_sub_and_test(count, &memcg->refcnt)) {
5035 struct mem_cgroup *parent = parent_mem_cgroup(mem); 4831 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5036 __mem_cgroup_free(mem); 4832 __mem_cgroup_free(memcg);
5037 if (parent) 4833 if (parent)
5038 mem_cgroup_put(parent); 4834 mem_cgroup_put(parent);
5039 } 4835 }
5040} 4836}
5041 4837
5042static void mem_cgroup_put(struct mem_cgroup *mem) 4838static void mem_cgroup_put(struct mem_cgroup *memcg)
5043{ 4839{
5044 __mem_cgroup_put(mem, 1); 4840 __mem_cgroup_put(memcg, 1);
5045} 4841}
5046 4842
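mem_cgroup_put() above frees the group when the last reference is dropped and then releases the reference the group held on its parent, so an idle chain of ancestors can unwind. A sketch of the same pattern with C11 atomics in place of the kernel's atomic_t helpers, written iteratively instead of via the recursive mem_cgroup_put(parent) call; everything here is an illustrative stand-in:

#include <stdatomic.h>
#include <stdlib.h>

struct obj {
	atomic_int refcnt;
	struct obj *parent;	/* each object holds one reference on its parent */
};

void obj_get(struct obj *o)
{
	atomic_fetch_add(&o->refcnt, 1);
}

void obj_put(struct obj *o)
{
	while (o) {
		/* only the caller that drops the last reference frees */
		if (atomic_fetch_sub(&o->refcnt, 1) != 1)
			break;

		struct obj *parent = o->parent;

		free(o);	/* freeing also drops our reference on the parent */
		o = parent;
	}
}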
5047/* 4843/*
5048 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. 4844 * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
5049 */ 4845 */
5050static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) 4846static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
5051{ 4847{
5052 if (!mem->res.parent) 4848 if (!memcg->res.parent)
5053 return NULL; 4849 return NULL;
5054 return mem_cgroup_from_res_counter(mem->res.parent, res); 4850 return mem_cgroup_from_res_counter(memcg->res.parent, res);
5055} 4851}
5056 4852
5057#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4853#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -5094,16 +4890,16 @@ static int mem_cgroup_soft_limit_tree_init(void)
5094static struct cgroup_subsys_state * __ref 4890static struct cgroup_subsys_state * __ref
5095mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 4891mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
5096{ 4892{
5097 struct mem_cgroup *mem, *parent; 4893 struct mem_cgroup *memcg, *parent;
5098 long error = -ENOMEM; 4894 long error = -ENOMEM;
5099 int node; 4895 int node;
5100 4896
5101 mem = mem_cgroup_alloc(); 4897 memcg = mem_cgroup_alloc();
5102 if (!mem) 4898 if (!memcg)
5103 return ERR_PTR(error); 4899 return ERR_PTR(error);
5104 4900
5105 for_each_node_state(node, N_POSSIBLE) 4901 for_each_node_state(node, N_POSSIBLE)
5106 if (alloc_mem_cgroup_per_zone_info(mem, node)) 4902 if (alloc_mem_cgroup_per_zone_info(memcg, node))
5107 goto free_out; 4903 goto free_out;
5108 4904
5109 /* root ? */ 4905 /* root ? */
@@ -5111,7 +4907,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
5111 int cpu; 4907 int cpu;
5112 enable_swap_cgroup(); 4908 enable_swap_cgroup();
5113 parent = NULL; 4909 parent = NULL;
5114 root_mem_cgroup = mem; 4910 root_mem_cgroup = memcg;
5115 if (mem_cgroup_soft_limit_tree_init()) 4911 if (mem_cgroup_soft_limit_tree_init())
5116 goto free_out; 4912 goto free_out;
5117 for_each_possible_cpu(cpu) { 4913 for_each_possible_cpu(cpu) {
@@ -5122,13 +4918,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
5122 hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 4918 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
5123 } else { 4919 } else {
5124 parent = mem_cgroup_from_cont(cont->parent); 4920 parent = mem_cgroup_from_cont(cont->parent);
5125 mem->use_hierarchy = parent->use_hierarchy; 4921 memcg->use_hierarchy = parent->use_hierarchy;
5126 mem->oom_kill_disable = parent->oom_kill_disable; 4922 memcg->oom_kill_disable = parent->oom_kill_disable;
5127 } 4923 }
5128 4924
5129 if (parent && parent->use_hierarchy) { 4925 if (parent && parent->use_hierarchy) {
5130 res_counter_init(&mem->res, &parent->res); 4926 res_counter_init(&memcg->res, &parent->res);
5131 res_counter_init(&mem->memsw, &parent->memsw); 4927 res_counter_init(&memcg->memsw, &parent->memsw);
5132 /* 4928 /*
5133 * We increment refcnt of the parent to ensure that we can 4929 * We increment refcnt of the parent to ensure that we can
5134 * safely access it on res_counter_charge/uncharge. 4930 * safely access it on res_counter_charge/uncharge.
@@ -5137,22 +4933,21 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
5137 */ 4933 */
5138 mem_cgroup_get(parent); 4934 mem_cgroup_get(parent);
5139 } else { 4935 } else {
5140 res_counter_init(&mem->res, NULL); 4936 res_counter_init(&memcg->res, NULL);
5141 res_counter_init(&mem->memsw, NULL); 4937 res_counter_init(&memcg->memsw, NULL);
5142 } 4938 }
5143 mem->last_scanned_child = 0; 4939 memcg->last_scanned_child = 0;
5144 mem->last_scanned_node = MAX_NUMNODES; 4940 memcg->last_scanned_node = MAX_NUMNODES;
5145 INIT_LIST_HEAD(&mem->oom_notify); 4941 INIT_LIST_HEAD(&memcg->oom_notify);
5146 4942
5147 if (parent) 4943 if (parent)
5148 mem->swappiness = mem_cgroup_swappiness(parent); 4944 memcg->swappiness = mem_cgroup_swappiness(parent);
5149 atomic_set(&mem->refcnt, 1); 4945 atomic_set(&memcg->refcnt, 1);
5150 mem->move_charge_at_immigrate = 0; 4946 memcg->move_charge_at_immigrate = 0;
5151 mutex_init(&mem->thresholds_lock); 4947 mutex_init(&memcg->thresholds_lock);
5152 spin_lock_init(&mem->scanstat.lock); 4948 return &memcg->css;
5153 return &mem->css;
5154free_out: 4949free_out:
5155 __mem_cgroup_free(mem); 4950 __mem_cgroup_free(memcg);
5156 root_mem_cgroup = NULL; 4951 root_mem_cgroup = NULL;
5157 return ERR_PTR(error); 4952 return ERR_PTR(error);
5158} 4953}
@@ -5160,17 +4955,17 @@ free_out:
5160static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 4955static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
5161 struct cgroup *cont) 4956 struct cgroup *cont)
5162{ 4957{
5163 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4958 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5164 4959
5165 return mem_cgroup_force_empty(mem, false); 4960 return mem_cgroup_force_empty(memcg, false);
5166} 4961}
5167 4962
5168static void mem_cgroup_destroy(struct cgroup_subsys *ss, 4963static void mem_cgroup_destroy(struct cgroup_subsys *ss,
5169 struct cgroup *cont) 4964 struct cgroup *cont)
5170{ 4965{
5171 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 4966 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5172 4967
5173 mem_cgroup_put(mem); 4968 mem_cgroup_put(memcg);
5174} 4969}
5175 4970
5176static int mem_cgroup_populate(struct cgroup_subsys *ss, 4971static int mem_cgroup_populate(struct cgroup_subsys *ss,
@@ -5193,9 +4988,9 @@ static int mem_cgroup_do_precharge(unsigned long count)
5193{ 4988{
5194 int ret = 0; 4989 int ret = 0;
5195 int batch_count = PRECHARGE_COUNT_AT_ONCE; 4990 int batch_count = PRECHARGE_COUNT_AT_ONCE;
5196 struct mem_cgroup *mem = mc.to; 4991 struct mem_cgroup *memcg = mc.to;
5197 4992
5198 if (mem_cgroup_is_root(mem)) { 4993 if (mem_cgroup_is_root(memcg)) {
5199 mc.precharge += count; 4994 mc.precharge += count;
5200 /* we don't need css_get for root */ 4995 /* we don't need css_get for root */
5201 return ret; 4996 return ret;
@@ -5204,16 +4999,16 @@ static int mem_cgroup_do_precharge(unsigned long count)
5204 if (count > 1) { 4999 if (count > 1) {
5205 struct res_counter *dummy; 5000 struct res_counter *dummy;
5206 /* 5001 /*
5207 * "mem" cannot be under rmdir() because we've already checked 5002 * "memcg" cannot be under rmdir() because we've already checked
5208 * by cgroup_lock_live_cgroup() that it is not removed and we 5003 * by cgroup_lock_live_cgroup() that it is not removed and we
5209 * are still under the same cgroup_mutex. So we can postpone 5004 * are still under the same cgroup_mutex. So we can postpone
5210 * css_get(). 5005 * css_get().
5211 */ 5006 */
5212 if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) 5007 if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
5213 goto one_by_one; 5008 goto one_by_one;
5214 if (do_swap_account && res_counter_charge(&mem->memsw, 5009 if (do_swap_account && res_counter_charge(&memcg->memsw,
5215 PAGE_SIZE * count, &dummy)) { 5010 PAGE_SIZE * count, &dummy)) {
5216 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 5011 res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
5217 goto one_by_one; 5012 goto one_by_one;
5218 } 5013 }
5219 mc.precharge += count; 5014 mc.precharge += count;
@@ -5230,8 +5025,9 @@ one_by_one:
5230 batch_count = PRECHARGE_COUNT_AT_ONCE; 5025 batch_count = PRECHARGE_COUNT_AT_ONCE;
5231 cond_resched(); 5026 cond_resched();
5232 } 5027 }
5233 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); 5028 ret = __mem_cgroup_try_charge(NULL,
5234 if (ret || !mem) 5029 GFP_KERNEL, 1, &memcg, false);
5030 if (ret || !memcg)
5235 /* mem_cgroup_clear_mc() will do uncharge later */ 5031 /* mem_cgroup_clear_mc() will do uncharge later */
5236 return -ENOMEM; 5032 return -ENOMEM;
5237 mc.precharge++; 5033 mc.precharge++;
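mem_cgroup_do_precharge() above first tries to charge PAGE_SIZE * count against the res_counter in one batched call and only falls back to charging page by page (with periodic cond_resched()) when the batch is refused. A rough userspace sketch of that try-batch-then-fall-back shape, with an invented reserve()/budget pair standing in for the res_counter calls:

#include <stdbool.h>
#include <stdio.h>

static long budget = 100;		/* stands in for the res_counter limit */

/* reserve n units, all or nothing */
static bool reserve(long n)
{
	if (budget < n)
		return false;
	budget -= n;
	return true;
}

static long precharge(long count)
{
	long done = 0;

	if (reserve(count))		/* fast path: one batched charge */
		return count;

	while (done < count) {		/* slow path: one unit at a time */
		if (!reserve(1))
			break;		/* caller later unwinds what was charged */
		done++;
		/* the kernel also cond_resched()es every few iterations here */
	}
	return done;
}

int main(void)
{
	printf("charged %ld of 60\n", precharge(60));	/* batch fits */
	printf("charged %ld of 60\n", precharge(60));	/* falls back, gets 40 */
	return 0;
}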
@@ -5330,15 +5126,17 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5330 pgoff = pte_to_pgoff(ptent); 5126 pgoff = pte_to_pgoff(ptent);
5331 5127
5332 /* page is moved even if it's not RSS of this task(page-faulted). */ 5128 /* page is moved even if it's not RSS of this task(page-faulted). */
5333 if (!mapping_cap_swap_backed(mapping)) { /* normal file */ 5129 page = find_get_page(mapping, pgoff);
5334 page = find_get_page(mapping, pgoff); 5130
5335 } else { /* shmem/tmpfs file. we should take account of swap too. */ 5131#ifdef CONFIG_SWAP
5336 swp_entry_t ent; 5132 /* shmem/tmpfs may report page out on swap: account for that too. */
5337 mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); 5133 if (radix_tree_exceptional_entry(page)) {
5134 swp_entry_t swap = radix_to_swp_entry(page);
5338 if (do_swap_account) 5135 if (do_swap_account)
5339 entry->val = ent.val; 5136 *entry = swap;
5137 page = find_get_page(&swapper_space, swap.val);
5340 } 5138 }
5341 5139#endif
5342 return page; 5140 return page;
5343} 5141}
5344 5142
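find_get_page() on shmem can now hand back an "exceptional" radix-tree entry instead of a page pointer when the object has gone out to swap; radix_tree_exceptional_entry() detects that case and radix_to_swp_entry() recovers the swap entry, which mc_handle_file_pte() then looks up in swapper_space. The underlying trick is a tagged value: a bit that can never be set in a properly aligned pointer marks the slot as an encoded swap entry. A self-contained illustration of that encoding (the tag bit and shift below are illustrative, not the kernel's exact layout):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EXCEPTIONAL_BIT		1UL	/* real pointers are at least 4-byte aligned */
#define EXCEPTIONAL_SHIFT	2

/* encode a swap slot number as a value that cannot be a valid pointer */
static void *swp_to_entry(unsigned long swap_slot)
{
	return (void *)((swap_slot << EXCEPTIONAL_SHIFT) | EXCEPTIONAL_BIT);
}

static int entry_is_exceptional(const void *entry)
{
	return ((uintptr_t)entry & EXCEPTIONAL_BIT) != 0;
}

static unsigned long entry_to_swp(const void *entry)
{
	return (uintptr_t)entry >> EXCEPTIONAL_SHIFT;
}

int main(void)
{
	int page = 42;			/* stands in for a struct page */
	void *slots[2] = { &page, swp_to_entry(12345) };

	for (int i = 0; i < 2; i++) {
		if (entry_is_exceptional(slots[i]))
			printf("slot %d: swapped out, swap slot %lu\n",
			       i, entry_to_swp(slots[i]));
		else
			printf("slot %d: in core, page value %d\n",
			       i, *(int *)slots[i]);
	}
	assert(!entry_is_exceptional(&page));
	return 0;
}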
@@ -5503,13 +5301,13 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5503 struct task_struct *p) 5301 struct task_struct *p)
5504{ 5302{
5505 int ret = 0; 5303 int ret = 0;
5506 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); 5304 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup);
5507 5305
5508 if (mem->move_charge_at_immigrate) { 5306 if (memcg->move_charge_at_immigrate) {
5509 struct mm_struct *mm; 5307 struct mm_struct *mm;
5510 struct mem_cgroup *from = mem_cgroup_from_task(p); 5308 struct mem_cgroup *from = mem_cgroup_from_task(p);
5511 5309
5512 VM_BUG_ON(from == mem); 5310 VM_BUG_ON(from == memcg);
5513 5311
5514 mm = get_task_mm(p); 5312 mm = get_task_mm(p);
5515 if (!mm) 5313 if (!mm)
@@ -5524,7 +5322,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
5524 mem_cgroup_start_move(from); 5322 mem_cgroup_start_move(from);
5525 spin_lock(&mc.lock); 5323 spin_lock(&mc.lock);
5526 mc.from = from; 5324 mc.from = from;
5527 mc.to = mem; 5325 mc.to = memcg;
5528 spin_unlock(&mc.lock); 5326 spin_unlock(&mc.lock);
5529 /* We set mc.moving_task later */ 5327 /* We set mc.moving_task later */
5530 5328
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 740c4f52059c..06d3479513aa 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -42,6 +42,7 @@
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/ksm.h> 43#include <linux/ksm.h>
44#include <linux/rmap.h> 44#include <linux/rmap.h>
45#include <linux/export.h>
45#include <linux/pagemap.h> 46#include <linux/pagemap.h>
46#include <linux/swap.h> 47#include <linux/swap.h>
47#include <linux/backing-dev.h> 48#include <linux/backing-dev.h>
@@ -53,6 +54,7 @@
53#include <linux/hugetlb.h> 54#include <linux/hugetlb.h>
54#include <linux/memory_hotplug.h> 55#include <linux/memory_hotplug.h>
55#include <linux/mm_inline.h> 56#include <linux/mm_inline.h>
57#include <linux/kfifo.h>
56#include "internal.h" 58#include "internal.h"
57 59
58int sysctl_memory_failure_early_kill __read_mostly = 0; 60int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1178,6 +1180,97 @@ void memory_failure(unsigned long pfn, int trapno)
1178 __memory_failure(pfn, trapno, 0); 1180 __memory_failure(pfn, trapno, 0);
1179} 1181}
1180 1182
1183#define MEMORY_FAILURE_FIFO_ORDER 4
1184#define MEMORY_FAILURE_FIFO_SIZE (1 << MEMORY_FAILURE_FIFO_ORDER)
1185
1186struct memory_failure_entry {
1187 unsigned long pfn;
1188 int trapno;
1189 int flags;
1190};
1191
1192struct memory_failure_cpu {
1193 DECLARE_KFIFO(fifo, struct memory_failure_entry,
1194 MEMORY_FAILURE_FIFO_SIZE);
1195 spinlock_t lock;
1196 struct work_struct work;
1197};
1198
1199static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
1200
1201/**
1202 * memory_failure_queue - Schedule handling memory failure of a page.
1203 * @pfn: Page Number of the corrupted page
1204 * @trapno: Trap number reported in the signal to user space.
1205 * @flags: Flags for memory failure handling
1206 *
1207 * This function is called by the low level hardware error handler
1208 * when it detects hardware memory corruption of a page. It schedules
1209 * the recovering of error page, including dropping pages, killing
1210 * processes etc.
1211 *
1212 * The function is primarily of use for corruptions that
1213 * happen outside the current execution context (e.g. when
1214 * detected by a background scrubber)
1215 *
1216 * Can run in IRQ context.
1217 */
1218void memory_failure_queue(unsigned long pfn, int trapno, int flags)
1219{
1220 struct memory_failure_cpu *mf_cpu;
1221 unsigned long proc_flags;
1222 struct memory_failure_entry entry = {
1223 .pfn = pfn,
1224 .trapno = trapno,
1225 .flags = flags,
1226 };
1227
1228 mf_cpu = &get_cpu_var(memory_failure_cpu);
1229 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1230 if (kfifo_put(&mf_cpu->fifo, &entry))
1231 schedule_work_on(smp_processor_id(), &mf_cpu->work);
1232 else
 1233		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",

1234 pfn);
1235 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1236 put_cpu_var(memory_failure_cpu);
1237}
1238EXPORT_SYMBOL_GPL(memory_failure_queue);
1239
1240static void memory_failure_work_func(struct work_struct *work)
1241{
1242 struct memory_failure_cpu *mf_cpu;
1243 struct memory_failure_entry entry = { 0, };
1244 unsigned long proc_flags;
1245 int gotten;
1246
1247 mf_cpu = &__get_cpu_var(memory_failure_cpu);
1248 for (;;) {
1249 spin_lock_irqsave(&mf_cpu->lock, proc_flags);
1250 gotten = kfifo_get(&mf_cpu->fifo, &entry);
1251 spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
1252 if (!gotten)
1253 break;
1254 __memory_failure(entry.pfn, entry.trapno, entry.flags);
1255 }
1256}
1257
1258static int __init memory_failure_init(void)
1259{
1260 struct memory_failure_cpu *mf_cpu;
1261 int cpu;
1262
1263 for_each_possible_cpu(cpu) {
1264 mf_cpu = &per_cpu(memory_failure_cpu, cpu);
1265 spin_lock_init(&mf_cpu->lock);
1266 INIT_KFIFO(mf_cpu->fifo);
1267 INIT_WORK(&mf_cpu->work, memory_failure_work_func);
1268 }
1269
1270 return 0;
1271}
1272core_initcall(memory_failure_init);
1273
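memory_failure_queue() above may be called from IRQ context, so it only records {pfn, trapno, flags} in a small per-CPU kfifo under a spinlock and schedules a work item; memory_failure_work_func() later drains the fifo and runs __memory_failure() for each entry outside the critical context. A minimal single-threaded userspace analogue of that produce-now/handle-later shape, using a plain power-of-two ring instead of a kfifo and an explicit drain call instead of a workqueue (all names below are invented for the sketch):

#include <stdbool.h>
#include <stdio.h>

#define FIFO_SIZE 16			/* must be a power of two */

struct failure_entry {
	unsigned long pfn;
	int trapno;
	int flags;
};

static struct failure_entry fifo[FIFO_SIZE];
static unsigned int head, tail;		/* head == tail means empty */

/* fast path: just record the event; fails when the ring is full */
static bool failure_queue(unsigned long pfn, int trapno, int flags)
{
	if (head - tail == FIFO_SIZE)
		return false;
	fifo[head++ & (FIFO_SIZE - 1)] =
		(struct failure_entry){ pfn, trapno, flags };
	return true;
}

/* slow path: what the work item would do, at leisure */
static void handle_failure(const struct failure_entry *e)
{
	printf("handling pfn %#lx (trap %d, flags %#x)\n",
	       e->pfn, e->trapno, e->flags);
}

static void failure_drain(void)
{
	while (tail != head)
		handle_failure(&fifo[tail++ & (FIFO_SIZE - 1)]);
}

int main(void)
{
	failure_queue(0x1234, 28, 0);
	failure_queue(0x5678, 28, 0);
	failure_drain();		/* the kernel does this from the work item */
	return 0;
}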
1181/** 1274/**
1182 * unpoison_memory - Unpoison a previously poisoned page 1275 * unpoison_memory - Unpoison a previously poisoned page
1183 * @pfn: Page number of the to be unpoisoned page 1276 * @pfn: Page number of the to be unpoisoned page
@@ -1218,7 +1311,7 @@ int unpoison_memory(unsigned long pfn)
1218 * to the end. 1311 * to the end.
1219 */ 1312 */
1220 if (PageHuge(page)) { 1313 if (PageHuge(page)) {
1221 pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); 1314 pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
1222 return 0; 1315 return 0;
1223 } 1316 }
1224 if (TestClearPageHWPoison(p)) 1317 if (TestClearPageHWPoison(p))
@@ -1327,7 +1420,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
1327 1420
1328 if (PageHWPoison(hpage)) { 1421 if (PageHWPoison(hpage)) {
1329 put_page(hpage); 1422 put_page(hpage);
1330 pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); 1423 pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
1331 return -EBUSY; 1424 return -EBUSY;
1332 } 1425 }
1333 1426
@@ -1341,8 +1434,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
1341 list_for_each_entry_safe(page1, page2, &pagelist, lru) 1434 list_for_each_entry_safe(page1, page2, &pagelist, lru)
1342 put_page(page1); 1435 put_page(page1);
1343 1436
1344 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1437 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1345 pfn, ret, page->flags); 1438 pfn, ret, page->flags);
1346 if (ret > 0) 1439 if (ret > 0)
1347 ret = -EIO; 1440 ret = -EIO;
1348 return ret; 1441 return ret;
@@ -1413,7 +1506,7 @@ int soft_offline_page(struct page *page, int flags)
1413 } 1506 }
1414 if (!PageLRU(page)) { 1507 if (!PageLRU(page)) {
1415 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", 1508 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1416 pfn, page->flags); 1509 pfn, page->flags);
1417 return -EIO; 1510 return -EIO;
1418 } 1511 }
1419 1512
@@ -1474,7 +1567,7 @@ int soft_offline_page(struct page *page, int flags)
1474 } 1567 }
1475 } else { 1568 } else {
1476 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1569 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1477 pfn, ret, page_count(page), page->flags); 1570 pfn, ret, page_count(page), page->flags);
1478 } 1571 }
1479 if (ret) 1572 if (ret)
1480 return ret; 1573 return ret;
diff --git a/mm/memory.c b/mm/memory.c
index a56e3ba816b2..829d43735402 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -47,7 +47,7 @@
47#include <linux/pagemap.h> 47#include <linux/pagemap.h>
48#include <linux/ksm.h> 48#include <linux/ksm.h>
49#include <linux/rmap.h> 49#include <linux/rmap.h>
50#include <linux/module.h> 50#include <linux/export.h>
51#include <linux/delayacct.h> 51#include <linux/delayacct.h>
52#include <linux/init.h> 52#include <linux/init.h>
53#include <linux/writeback.h> 53#include <linux/writeback.h>
@@ -1503,7 +1503,7 @@ split_fallthrough:
1503 } 1503 }
1504 1504
1505 if (flags & FOLL_GET) 1505 if (flags & FOLL_GET)
1506 get_page(page); 1506 get_page_foll(page);
1507 if (flags & FOLL_TOUCH) { 1507 if (flags & FOLL_TOUCH) {
1508 if ((flags & FOLL_WRITE) && 1508 if ((flags & FOLL_WRITE) &&
1509 !pte_dirty(pte) && !PageDirty(page)) 1509 !pte_dirty(pte) && !PageDirty(page))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6e7d8b21dbfa..2168489c0bc9 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -11,7 +11,7 @@
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/bootmem.h> 12#include <linux/bootmem.h>
13#include <linux/compiler.h> 13#include <linux/compiler.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/pagevec.h> 15#include <linux/pagevec.h>
16#include <linux/writeback.h> 16#include <linux/writeback.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8b57173c1dd5..adc395481813 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -75,7 +75,7 @@
75#include <linux/cpuset.h> 75#include <linux/cpuset.h>
76#include <linux/slab.h> 76#include <linux/slab.h>
77#include <linux/string.h> 77#include <linux/string.h>
78#include <linux/module.h> 78#include <linux/export.h>
79#include <linux/nsproxy.h> 79#include <linux/nsproxy.h>
80#include <linux/interrupt.h> 80#include <linux/interrupt.h>
81#include <linux/init.h> 81#include <linux/init.h>
@@ -111,7 +111,7 @@ enum zone_type policy_zone = 0;
111/* 111/*
112 * run-time system-wide default policy => local allocation 112 * run-time system-wide default policy => local allocation
113 */ 113 */
114struct mempolicy default_policy = { 114static struct mempolicy default_policy = {
115 .refcnt = ATOMIC_INIT(1), /* never free it */ 115 .refcnt = ATOMIC_INIT(1), /* never free it */
116 .mode = MPOL_PREFERRED, 116 .mode = MPOL_PREFERRED,
117 .flags = MPOL_F_LOCAL, 117 .flags = MPOL_F_LOCAL,
@@ -636,7 +636,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
636 struct vm_area_struct *prev; 636 struct vm_area_struct *prev;
637 struct vm_area_struct *vma; 637 struct vm_area_struct *vma;
638 int err = 0; 638 int err = 0;
639 pgoff_t pgoff;
640 unsigned long vmstart; 639 unsigned long vmstart;
641 unsigned long vmend; 640 unsigned long vmend;
642 641
@@ -649,9 +648,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
649 vmstart = max(start, vma->vm_start); 648 vmstart = max(start, vma->vm_start);
650 vmend = min(end, vma->vm_end); 649 vmend = min(end, vma->vm_end);
651 650
652 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
653 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, 651 prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
654 vma->anon_vma, vma->vm_file, pgoff, new_pol); 652 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
653 new_pol);
655 if (prev) { 654 if (prev) {
656 vma = prev; 655 vma = prev;
657 next = vma->vm_next; 656 next = vma->vm_next;
@@ -1412,7 +1411,9 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1412 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); 1411 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1413 1412
1414 if (!err && nmask) { 1413 if (!err && nmask) {
1415 err = copy_from_user(bm, nm, alloc_size); 1414 unsigned long copy_size;
1415 copy_size = min_t(unsigned long, sizeof(bm), alloc_size);
1416 err = copy_from_user(bm, nm, copy_size);
1416 /* ensure entire bitmap is zeroed */ 1417 /* ensure entire bitmap is zeroed */
1417 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); 1418 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1418 err |= compat_put_bitmap(nmask, bm, nr_bits); 1419 err |= compat_put_bitmap(nmask, bm, nr_bits);
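The change above clamps the copy length to the size of the on-stack bm[] bitmap via min_t(unsigned long, sizeof(bm), alloc_size), so an oversized alloc_size derived from user-supplied maxnode can no longer overrun the buffer. The same clamp-before-copy idiom in a standalone sketch, with an invented copy_bitmap() helper standing in for copy_from_user():

#include <stdio.h>
#include <string.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* dst must never receive more than dst_size bytes, whatever was requested */
static void copy_bitmap(unsigned long *dst, size_t dst_size,
			const unsigned long *src, size_t requested)
{
	size_t copy_size = MIN(dst_size, requested);	/* the clamp the patch adds */

	memcpy(dst, src, copy_size);
}

int main(void)
{
	unsigned long src[32] = { 0xff };
	unsigned long bm[4];		/* small fixed-size destination */

	/* the requested size exceeds bm[], but only sizeof(bm) bytes move */
	copy_bitmap(bm, sizeof(bm), src, sizeof(src));
	printf("bm[0] = %#lx\n", bm[0]);
	return 0;
}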
diff --git a/mm/mempool.c b/mm/mempool.c
index 1a3bc3d4d554..e73641b79bb5 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -10,7 +10,7 @@
10 10
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/module.h> 13#include <linux/export.h>
14#include <linux/mempool.h> 14#include <linux/mempool.h>
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/writeback.h> 16#include <linux/writeback.h>
diff --git a/mm/migrate.c b/mm/migrate.c
index 666e4e677414..578e29174fa6 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -13,7 +13,7 @@
13 */ 13 */
14 14
15#include <linux/migrate.h> 15#include <linux/migrate.h>
16#include <linux/module.h> 16#include <linux/export.h>
17#include <linux/swap.h> 17#include <linux/swap.h>
18#include <linux/swapops.h> 18#include <linux/swapops.h>
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
@@ -120,10 +120,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
120 120
121 ptep = pte_offset_map(pmd, addr); 121 ptep = pte_offset_map(pmd, addr);
122 122
123 if (!is_swap_pte(*ptep)) { 123 /*
124 pte_unmap(ptep); 124 * Peek to check is_swap_pte() before taking ptlock? No, we
125 goto out; 125 * can race mremap's move_ptes(), which skips anon_vma lock.
126 } 126 */
127 127
128 ptl = pte_lockptr(mm, pmd); 128 ptl = pte_lockptr(mm, pmd);
129 } 129 }
@@ -621,38 +621,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
621 return rc; 621 return rc;
622} 622}
623 623
624/* 624static int __unmap_and_move(struct page *page, struct page *newpage,
625 * Obtain the lock on page, remove all ptes and migrate the page 625 int force, bool offlining, bool sync)
626 * to the newly allocated page in newpage.
627 */
628static int unmap_and_move(new_page_t get_new_page, unsigned long private,
629 struct page *page, int force, bool offlining, bool sync)
630{ 626{
631 int rc = 0; 627 int rc = -EAGAIN;
632 int *result = NULL;
633 struct page *newpage = get_new_page(page, private, &result);
634 int remap_swapcache = 1; 628 int remap_swapcache = 1;
635 int charge = 0; 629 int charge = 0;
636 struct mem_cgroup *mem; 630 struct mem_cgroup *mem;
637 struct anon_vma *anon_vma = NULL; 631 struct anon_vma *anon_vma = NULL;
638 632
639 if (!newpage)
640 return -ENOMEM;
641
642 if (page_count(page) == 1) {
643 /* page was freed from under us. So we are done. */
644 goto move_newpage;
645 }
646 if (unlikely(PageTransHuge(page)))
647 if (unlikely(split_huge_page(page)))
648 goto move_newpage;
649
650 /* prepare cgroup just returns 0 or -ENOMEM */
651 rc = -EAGAIN;
652
653 if (!trylock_page(page)) { 633 if (!trylock_page(page)) {
654 if (!force || !sync) 634 if (!force || !sync)
655 goto move_newpage; 635 goto out;
656 636
657 /* 637 /*
658 * It's not safe for direct compaction to call lock_page. 638 * It's not safe for direct compaction to call lock_page.
@@ -668,7 +648,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
668 * altogether. 648 * altogether.
669 */ 649 */
670 if (current->flags & PF_MEMALLOC) 650 if (current->flags & PF_MEMALLOC)
671 goto move_newpage; 651 goto out;
672 652
673 lock_page(page); 653 lock_page(page);
674 } 654 }
@@ -785,27 +765,52 @@ uncharge:
785 mem_cgroup_end_migration(mem, page, newpage, rc == 0); 765 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
786unlock: 766unlock:
787 unlock_page(page); 767 unlock_page(page);
768out:
769 return rc;
770}
788 771
789move_newpage: 772/*
773 * Obtain the lock on page, remove all ptes and migrate the page
774 * to the newly allocated page in newpage.
775 */
776static int unmap_and_move(new_page_t get_new_page, unsigned long private,
777 struct page *page, int force, bool offlining, bool sync)
778{
779 int rc = 0;
780 int *result = NULL;
781 struct page *newpage = get_new_page(page, private, &result);
782
783 if (!newpage)
784 return -ENOMEM;
785
786 if (page_count(page) == 1) {
787 /* page was freed from under us. So we are done. */
788 goto out;
789 }
790
791 if (unlikely(PageTransHuge(page)))
792 if (unlikely(split_huge_page(page)))
793 goto out;
794
795 rc = __unmap_and_move(page, newpage, force, offlining, sync);
796out:
790 if (rc != -EAGAIN) { 797 if (rc != -EAGAIN) {
791 /* 798 /*
792 * A page that has been migrated has all references 799 * A page that has been migrated has all references
793 * removed and will be freed. A page that has not been 800 * removed and will be freed. A page that has not been
794 * migrated will have kepts its references and be 801 * migrated will have kepts its references and be
795 * restored. 802 * restored.
796 */ 803 */
797 list_del(&page->lru); 804 list_del(&page->lru);
798 dec_zone_page_state(page, NR_ISOLATED_ANON + 805 dec_zone_page_state(page, NR_ISOLATED_ANON +
799 page_is_file_cache(page)); 806 page_is_file_cache(page));
800 putback_lru_page(page); 807 putback_lru_page(page);
801 } 808 }
802
803 /* 809 /*
804 * Move the new page to the LRU. If migration was not successful 810 * Move the new page to the LRU. If migration was not successful
805 * then this will free the page. 811 * then this will free the page.
806 */ 812 */
807 putback_lru_page(newpage); 813 putback_lru_page(newpage);
808
809 if (result) { 814 if (result) {
810 if (rc) 815 if (rc)
811 *result = rc; 816 *result = rc;
diff --git a/mm/mincore.c b/mm/mincore.c
index a4e6b9d75c76..636a86876ff2 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -69,12 +69,15 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
69 * file will not get a swp_entry_t in its pte, but rather it is like 69 * file will not get a swp_entry_t in its pte, but rather it is like
70 * any other file mapping (ie. marked !present and faulted in with 70 * any other file mapping (ie. marked !present and faulted in with
71 * tmpfs's .fault). So swapped out tmpfs mappings are tested here. 71 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
72 *
73 * However when tmpfs moves the page from pagecache and into swapcache,
74 * it is still in core, but the find_get_page below won't find it.
75 * No big deal, but make a note of it.
76 */ 72 */
77 page = find_get_page(mapping, pgoff); 73 page = find_get_page(mapping, pgoff);
74#ifdef CONFIG_SWAP
75 /* shmem/tmpfs may return swap: account for swapcache page too. */
76 if (radix_tree_exceptional_entry(page)) {
77 swp_entry_t swap = radix_to_swp_entry(page);
78 page = find_get_page(&swapper_space, swap.val);
79 }
80#endif
78 if (page) { 81 if (page) {
79 present = PageUptodate(page); 82 present = PageUptodate(page);
80 page_cache_release(page); 83 page_cache_release(page);
diff --git a/mm/mlock.c b/mm/mlock.c
index 048260c4e02e..4f4f53bdc65d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -14,7 +14,7 @@
14#include <linux/mempolicy.h> 14#include <linux/mempolicy.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/rmap.h> 18#include <linux/rmap.h>
19#include <linux/mmzone.h> 19#include <linux/mmzone.h>
20#include <linux/hugetlb.h> 20#include <linux/hugetlb.h>
@@ -110,7 +110,15 @@ void munlock_vma_page(struct page *page)
110 if (TestClearPageMlocked(page)) { 110 if (TestClearPageMlocked(page)) {
111 dec_zone_page_state(page, NR_MLOCK); 111 dec_zone_page_state(page, NR_MLOCK);
112 if (!isolate_lru_page(page)) { 112 if (!isolate_lru_page(page)) {
113 int ret = try_to_munlock(page); 113 int ret = SWAP_AGAIN;
114
115 /*
116 * Optimization: if the page was mapped just once,
117 * that's our mapping and we don't need to check all the
118 * other vmas.
119 */
120 if (page_mapcount(page) > 1)
121 ret = try_to_munlock(page);
114 /* 122 /*
115 * did try_to_unlock() succeed or punt? 123 * did try_to_unlock() succeed or punt?
116 */ 124 */
@@ -549,7 +557,8 @@ SYSCALL_DEFINE1(mlockall, int, flags)
549 if (!can_do_mlock()) 557 if (!can_do_mlock())
550 goto out; 558 goto out;
551 559
552 lru_add_drain_all(); /* flush pagevec */ 560 if (flags & MCL_CURRENT)
561 lru_add_drain_all(); /* flush pagevec */
553 562
554 down_write(&current->mm->mmap_sem); 563 down_write(&current->mm->mmap_sem);
555 564
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 4e0e26591dfa..1ffd97ae26d7 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -8,7 +8,7 @@
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/kobject.h> 10#include <linux/kobject.h>
11#include <linux/module.h> 11#include <linux/export.h>
12#include "internal.h" 12#include "internal.h"
13 13
14#ifdef CONFIG_DEBUG_MEMORY_INIT 14#ifdef CONFIG_DEBUG_MEMORY_INIT
diff --git a/mm/mmap.c b/mm/mmap.c
index a65efd4db3e1..eae90af60ea6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -22,7 +22,7 @@
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/hugetlb.h> 23#include <linux/hugetlb.h>
24#include <linux/profile.h> 24#include <linux/profile.h>
25#include <linux/module.h> 25#include <linux/export.h>
26#include <linux/mount.h> 26#include <linux/mount.h>
27#include <linux/mempolicy.h> 27#include <linux/mempolicy.h>
28#include <linux/rmap.h> 28#include <linux/rmap.h>
@@ -2558,7 +2558,6 @@ int mm_take_all_locks(struct mm_struct *mm)
2558{ 2558{
2559 struct vm_area_struct *vma; 2559 struct vm_area_struct *vma;
2560 struct anon_vma_chain *avc; 2560 struct anon_vma_chain *avc;
2561 int ret = -EINTR;
2562 2561
2563 BUG_ON(down_read_trylock(&mm->mmap_sem)); 2562 BUG_ON(down_read_trylock(&mm->mmap_sem));
2564 2563
@@ -2579,13 +2578,11 @@ int mm_take_all_locks(struct mm_struct *mm)
2579 vm_lock_anon_vma(mm, avc->anon_vma); 2578 vm_lock_anon_vma(mm, avc->anon_vma);
2580 } 2579 }
2581 2580
2582 ret = 0; 2581 return 0;
2583 2582
2584out_unlock: 2583out_unlock:
2585 if (ret) 2584 mm_drop_all_locks(mm);
2586 mm_drop_all_locks(mm); 2585 return -EINTR;
2587
2588 return ret;
2589} 2586}
2590 2587
2591static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 2588static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 9e82e937000e..cf332bc0080a 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -5,7 +5,7 @@
5 5
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/mmu_context.h> 7#include <linux/mmu_context.h>
8#include <linux/module.h> 8#include <linux/export.h>
9#include <linux/sched.h> 9#include <linux/sched.h>
10 10
11#include <asm/mmu_context.h> 11#include <asm/mmu_context.h>
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8d032de4088e..9a611d3a1848 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -11,7 +11,7 @@
11 11
12#include <linux/rculist.h> 12#include <linux/rculist.h>
13#include <linux/mmu_notifier.h> 13#include <linux/mmu_notifier.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/mm.h> 15#include <linux/mm.h>
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/rcupdate.h> 17#include <linux/rcupdate.h>
diff --git a/mm/mmzone.c b/mm/mmzone.c
index f5b7d1760213..7cf7b7ddc7c5 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -8,7 +8,6 @@
8#include <linux/stddef.h> 8#include <linux/stddef.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/mmzone.h> 10#include <linux/mmzone.h>
11#include <linux/module.h>
12 11
13struct pglist_data *first_online_pgdat(void) 12struct pglist_data *first_online_pgdat(void)
14{ 13{
diff --git a/mm/mremap.c b/mm/mremap.c
index 506fa44403df..d6959cb4df58 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -41,8 +41,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
41 return NULL; 41 return NULL;
42 42
43 pmd = pmd_offset(pud, addr); 43 pmd = pmd_offset(pud, addr);
44 split_huge_page_pmd(mm, pmd); 44 if (pmd_none(*pmd))
45 if (pmd_none_or_clear_bad(pmd))
46 return NULL; 45 return NULL;
47 46
48 return pmd; 47 return pmd;
@@ -65,8 +64,6 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
65 return NULL; 64 return NULL;
66 65
67 VM_BUG_ON(pmd_trans_huge(*pmd)); 66 VM_BUG_ON(pmd_trans_huge(*pmd));
68 if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
69 return NULL;
70 67
71 return pmd; 68 return pmd;
72} 69}
@@ -80,11 +77,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
80 struct mm_struct *mm = vma->vm_mm; 77 struct mm_struct *mm = vma->vm_mm;
81 pte_t *old_pte, *new_pte, pte; 78 pte_t *old_pte, *new_pte, pte;
82 spinlock_t *old_ptl, *new_ptl; 79 spinlock_t *old_ptl, *new_ptl;
83 unsigned long old_start;
84 80
85 old_start = old_addr;
86 mmu_notifier_invalidate_range_start(vma->vm_mm,
87 old_start, old_end);
88 if (vma->vm_file) { 81 if (vma->vm_file) {
89 /* 82 /*
90 * Subtle point from Rajesh Venkatasubramanian: before 83 * Subtle point from Rajesh Venkatasubramanian: before
@@ -111,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
111 new_pte++, new_addr += PAGE_SIZE) { 104 new_pte++, new_addr += PAGE_SIZE) {
112 if (pte_none(*old_pte)) 105 if (pte_none(*old_pte))
113 continue; 106 continue;
114 pte = ptep_clear_flush(vma, old_addr, old_pte); 107 pte = ptep_get_and_clear(mm, old_addr, old_pte);
115 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); 108 pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
116 set_pte_at(mm, new_addr, new_pte, pte); 109 set_pte_at(mm, new_addr, new_pte, pte);
117 } 110 }
@@ -123,7 +116,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
123 pte_unmap_unlock(old_pte - 1, old_ptl); 116 pte_unmap_unlock(old_pte - 1, old_ptl);
124 if (mapping) 117 if (mapping)
125 mutex_unlock(&mapping->i_mmap_mutex); 118 mutex_unlock(&mapping->i_mmap_mutex);
126 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
127} 119}
128 120
129#define LATENCY_LIMIT (64 * PAGE_SIZE) 121#define LATENCY_LIMIT (64 * PAGE_SIZE)
@@ -134,22 +126,43 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
134{ 126{
135 unsigned long extent, next, old_end; 127 unsigned long extent, next, old_end;
136 pmd_t *old_pmd, *new_pmd; 128 pmd_t *old_pmd, *new_pmd;
129 bool need_flush = false;
137 130
138 old_end = old_addr + len; 131 old_end = old_addr + len;
139 flush_cache_range(vma, old_addr, old_end); 132 flush_cache_range(vma, old_addr, old_end);
140 133
134 mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end);
135
141 for (; old_addr < old_end; old_addr += extent, new_addr += extent) { 136 for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
142 cond_resched(); 137 cond_resched();
143 next = (old_addr + PMD_SIZE) & PMD_MASK; 138 next = (old_addr + PMD_SIZE) & PMD_MASK;
144 if (next - 1 > old_end) 139 /* even if next overflowed, extent below will be ok */
145 next = old_end;
146 extent = next - old_addr; 140 extent = next - old_addr;
141 if (extent > old_end - old_addr)
142 extent = old_end - old_addr;
147 old_pmd = get_old_pmd(vma->vm_mm, old_addr); 143 old_pmd = get_old_pmd(vma->vm_mm, old_addr);
148 if (!old_pmd) 144 if (!old_pmd)
149 continue; 145 continue;
150 new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); 146 new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
151 if (!new_pmd) 147 if (!new_pmd)
152 break; 148 break;
149 if (pmd_trans_huge(*old_pmd)) {
150 int err = 0;
151 if (extent == HPAGE_PMD_SIZE)
152 err = move_huge_pmd(vma, new_vma, old_addr,
153 new_addr, old_end,
154 old_pmd, new_pmd);
155 if (err > 0) {
156 need_flush = true;
157 continue;
158 } else if (!err) {
159 split_huge_page_pmd(vma->vm_mm, old_pmd);
160 }
161 VM_BUG_ON(pmd_trans_huge(*old_pmd));
162 }
163 if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
164 new_pmd, new_addr))
165 break;
153 next = (new_addr + PMD_SIZE) & PMD_MASK; 166 next = (new_addr + PMD_SIZE) & PMD_MASK;
154 if (extent > next - new_addr) 167 if (extent > next - new_addr)
155 extent = next - new_addr; 168 extent = next - new_addr;
@@ -157,7 +170,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
157 extent = LATENCY_LIMIT; 170 extent = LATENCY_LIMIT;
158 move_ptes(vma, old_pmd, old_addr, old_addr + extent, 171 move_ptes(vma, old_pmd, old_addr, old_addr + extent,
159 new_vma, new_pmd, new_addr); 172 new_vma, new_pmd, new_addr);
173 need_flush = true;
160 } 174 }
175 if (likely(need_flush))
176 flush_tlb_range(vma, old_end-len, old_addr);
177
178 mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end);
161 179
162 return len + old_addr - old_end; /* how much done */ 180 return len + old_addr - old_end; /* how much done */
163} 181}
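The reworked loop above advances one PMD at a time: each iteration's extent is the distance from old_addr to its next PMD boundary, clamped to what remains of the range and then to the distance from new_addr to its next PMD boundary, so neither side's PTE walk ever crosses a PMD. A small userspace sketch of just that extent arithmetic, assuming a 2 MiB PMD_SIZE and ignoring the huge-pmd path, allocation failures and the LATENCY_LIMIT cap:

#include <stdio.h>

#define PMD_SIZE	(2UL << 20)
#define PMD_MASK	(~(PMD_SIZE - 1))

int main(void)
{
	unsigned long old_addr = 0x00705000;	/* deliberately unaligned */
	unsigned long new_addr = 0x12340000;
	unsigned long old_end  = old_addr + (3UL << 20);
	unsigned long extent, next;

	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
		/* distance to old_addr's next PMD boundary ... */
		next = (old_addr + PMD_SIZE) & PMD_MASK;
		extent = next - old_addr;
		/* ... clamped to the remaining length ... */
		if (extent > old_end - old_addr)
			extent = old_end - old_addr;
		/* ... and to new_addr's next PMD boundary */
		next = (new_addr + PMD_SIZE) & PMD_MASK;
		if (extent > next - new_addr)
			extent = next - new_addr;
		printf("move %#9lx -> %#9lx, %7lu bytes\n",
		       old_addr, new_addr, extent);
	}
	return 0;
}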
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 6e93dc7f2586..7fa41b4a07bf 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -12,7 +12,7 @@
12#include <linux/pfn.h> 12#include <linux/pfn.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/module.h> 15#include <linux/export.h>
16#include <linux/kmemleak.h> 16#include <linux/kmemleak.h>
17#include <linux/range.h> 17#include <linux/range.h>
18#include <linux/memblock.h> 18#include <linux/memblock.h>
diff --git a/mm/nommu.c b/mm/nommu.c
index 4358032566e9..73419c55eda6 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -13,7 +13,7 @@
13 * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org> 13 * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/export.h>
17#include <linux/mm.h> 17#include <linux/mm.h>
18#include <linux/mman.h> 18#include <linux/mman.h>
19#include <linux/swap.h> 19#include <linux/swap.h>
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index eafff89b3dd6..471dedb463ab 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,18 +26,38 @@
26#include <linux/timex.h> 26#include <linux/timex.h>
27#include <linux/jiffies.h> 27#include <linux/jiffies.h>
28#include <linux/cpuset.h> 28#include <linux/cpuset.h>
29#include <linux/module.h> 29#include <linux/export.h>
30#include <linux/notifier.h> 30#include <linux/notifier.h>
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/mempolicy.h> 32#include <linux/mempolicy.h>
33#include <linux/security.h> 33#include <linux/security.h>
34#include <linux/ptrace.h> 34#include <linux/ptrace.h>
35#include <linux/freezer.h>
35 36
36int sysctl_panic_on_oom; 37int sysctl_panic_on_oom;
37int sysctl_oom_kill_allocating_task; 38int sysctl_oom_kill_allocating_task;
38int sysctl_oom_dump_tasks = 1; 39int sysctl_oom_dump_tasks = 1;
39static DEFINE_SPINLOCK(zone_scan_lock); 40static DEFINE_SPINLOCK(zone_scan_lock);
40 41
42/*
43 * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
44 * @old_val: old oom_score_adj for compare
45 * @new_val: new oom_score_adj for swap
46 *
47 * Sets the oom_score_adj value for current to @new_val iff its present value is
48 * @old_val. Usually used to reinstate a previous value to prevent racing with
 49 * userspace tuning the value in the interim.
50 */
51void compare_swap_oom_score_adj(int old_val, int new_val)
52{
53 struct sighand_struct *sighand = current->sighand;
54
55 spin_lock_irq(&sighand->siglock);
56 if (current->signal->oom_score_adj == old_val)
57 current->signal->oom_score_adj = new_val;
58 spin_unlock_irq(&sighand->siglock);
59}
60
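compare_swap_oom_score_adj() only writes @new_val if the value still equals @old_val, all under siglock, so restoring a saved oom_score_adj cannot silently clobber a change userspace made in the meantime. The same compare-and-swap-under-a-lock idiom in a standalone sketch, with a pthread mutex standing in for siglock and invented names throughout:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int oom_score_adj;		/* the value being protected */

/* restore new_val only if nobody changed the value behind our back */
static void compare_swap_score(int old_val, int new_val)
{
	pthread_mutex_lock(&lock);
	if (oom_score_adj == old_val)
		oom_score_adj = new_val;
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	oom_score_adj = 1000;		/* temporary value set by some in-kernel user */

	/* userspace tunes the value in the interim ... */
	oom_score_adj = 300;

	/* ... so restoring the saved value 0 must back off */
	compare_swap_score(1000, 0);
	printf("oom_score_adj = %d\n", oom_score_adj);	/* prints 300 */
	return 0;
}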
41/** 61/**
42 * test_set_oom_score_adj() - set current's oom_score_adj and return old value 62 * test_set_oom_score_adj() - set current's oom_score_adj and return old value
43 * @new_val: new oom_score_adj value 63 * @new_val: new oom_score_adj value
@@ -53,13 +73,7 @@ int test_set_oom_score_adj(int new_val)
53 73
54 spin_lock_irq(&sighand->siglock); 74 spin_lock_irq(&sighand->siglock);
55 old_val = current->signal->oom_score_adj; 75 old_val = current->signal->oom_score_adj;
56 if (new_val != old_val) { 76 current->signal->oom_score_adj = new_val;
57 if (new_val == OOM_SCORE_ADJ_MIN)
58 atomic_inc(&current->mm->oom_disable_count);
59 else if (old_val == OOM_SCORE_ADJ_MIN)
60 atomic_dec(&current->mm->oom_disable_count);
61 current->signal->oom_score_adj = new_val;
62 }
63 spin_unlock_irq(&sighand->siglock); 77 spin_unlock_irq(&sighand->siglock);
64 78
65 return old_val; 79 return old_val;
@@ -172,16 +186,6 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
172 return 0; 186 return 0;
173 187
174 /* 188 /*
175 * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
176 * so the entire heuristic doesn't need to be executed for something
177 * that cannot be killed.
178 */
179 if (atomic_read(&p->mm->oom_disable_count)) {
180 task_unlock(p);
181 return 0;
182 }
183
184 /*
185 * The memory controller may have a limit of 0 bytes, so avoid a divide 189 * The memory controller may have a limit of 0 bytes, so avoid a divide
186 * by zero, if necessary. 190 * by zero, if necessary.
187 */ 191 */
@@ -303,7 +307,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
303 do_each_thread(g, p) { 307 do_each_thread(g, p) {
304 unsigned int points; 308 unsigned int points;
305 309
306 if (!p->mm) 310 if (p->exit_state)
307 continue; 311 continue;
308 if (oom_unkillable_task(p, mem, nodemask)) 312 if (oom_unkillable_task(p, mem, nodemask))
309 continue; 313 continue;
@@ -317,8 +321,13 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
317 * blocked waiting for another task which itself is waiting 321 * blocked waiting for another task which itself is waiting
318 * for memory. Is there a better alternative? 322 * for memory. Is there a better alternative?
319 */ 323 */
320 if (test_tsk_thread_flag(p, TIF_MEMDIE)) 324 if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
325 if (unlikely(frozen(p)))
326 thaw_process(p);
321 return ERR_PTR(-1UL); 327 return ERR_PTR(-1UL);
328 }
329 if (!p->mm)
330 continue;
322 331
323 if (p->flags & PF_EXITING) { 332 if (p->flags & PF_EXITING) {
324 /* 333 /*
@@ -433,7 +442,7 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
433 task_unlock(p); 442 task_unlock(p);
434 443
435 /* 444 /*
436 * Kill all processes sharing p->mm in other thread groups, if any. 445 * Kill all user processes sharing p->mm in other thread groups, if any.
437 * They don't get access to memory reserves or a higher scheduler 446 * They don't get access to memory reserves or a higher scheduler
438 * priority, though, to avoid depletion of all memory or task 447 * priority, though, to avoid depletion of all memory or task
439 * starvation. This prevents mm->mmap_sem livelock when an oom killed 448 * starvation. This prevents mm->mmap_sem livelock when an oom killed
@@ -443,7 +452,11 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
443 * signal. 452 * signal.
444 */ 453 */
445 for_each_process(q) 454 for_each_process(q)
446 if (q->mm == mm && !same_thread_group(q, p)) { 455 if (q->mm == mm && !same_thread_group(q, p) &&
456 !(q->flags & PF_KTHREAD)) {
457 if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
458 continue;
459
447 task_lock(q); /* Protect ->comm from prctl() */ 460 task_lock(q); /* Protect ->comm from prctl() */
448 pr_err("Kill process %d (%s) sharing same memory\n", 461 pr_err("Kill process %d (%s) sharing same memory\n",
449 task_pid_nr(q), q->comm); 462 task_pid_nr(q), q->comm);
@@ -720,7 +733,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
720 read_lock(&tasklist_lock); 733 read_lock(&tasklist_lock);
721 if (sysctl_oom_kill_allocating_task && 734 if (sysctl_oom_kill_allocating_task &&
722 !oom_unkillable_task(current, NULL, nodemask) && 735 !oom_unkillable_task(current, NULL, nodemask) &&
723 current->mm && !atomic_read(&current->mm->oom_disable_count)) { 736 current->mm) {
724 /* 737 /*
725 * oom_kill_process() needs tasklist_lock held. If it returns 738 * oom_kill_process() needs tasklist_lock held. If it returns
726 * non-zero, current could not be killed so we must fallback to 739 * non-zero, current could not be killed so we must fallback to
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d1960744f881..a3278f005230 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -12,7 +12,7 @@
12 */ 12 */
13 13
14#include <linux/kernel.h> 14#include <linux/kernel.h>
15#include <linux/module.h> 15#include <linux/export.h>
16#include <linux/spinlock.h> 16#include <linux/spinlock.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/mm.h> 18#include <linux/mm.h>
@@ -46,26 +46,14 @@
46 */ 46 */
47#define BANDWIDTH_INTERVAL max(HZ/5, 1) 47#define BANDWIDTH_INTERVAL max(HZ/5, 1)
48 48
49#define RATELIMIT_CALC_SHIFT 10
50
49/* 51/*
50 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 52 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
51 * will look to see if it needs to force writeback or throttling. 53 * will look to see if it needs to force writeback or throttling.
52 */ 54 */
53static long ratelimit_pages = 32; 55static long ratelimit_pages = 32;
54 56
55/*
56 * When balance_dirty_pages decides that the caller needs to perform some
57 * non-background writeback, this is how many pages it will attempt to write.
58 * It should be somewhat larger than dirtied pages to ensure that reasonably
59 * large amounts of I/O are submitted.
60 */
61static inline long sync_writeback_pages(unsigned long dirtied)
62{
63 if (dirtied < ratelimit_pages)
64 dirtied = ratelimit_pages;
65
66 return dirtied + dirtied / 2;
67}
68
69/* The following parameters are exported via /proc/sys/vm */ 57/* The following parameters are exported via /proc/sys/vm */
70 58
71/* 59/*
@@ -167,6 +155,8 @@ static void update_completion_period(void)
167 int shift = calc_period_shift(); 155 int shift = calc_period_shift();
168 prop_change_shift(&vm_completions, shift); 156 prop_change_shift(&vm_completions, shift);
169 prop_change_shift(&vm_dirties, shift); 157 prop_change_shift(&vm_dirties, shift);
158
159 writeback_set_ratelimit();
170} 160}
171 161
172int dirty_background_ratio_handler(struct ctl_table *table, int write, 162int dirty_background_ratio_handler(struct ctl_table *table, int write,
@@ -260,52 +250,10 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
260 numerator, denominator); 250 numerator, denominator);
261} 251}
262 252
263static inline void task_dirties_fraction(struct task_struct *tsk,
264 long *numerator, long *denominator)
265{
266 prop_fraction_single(&vm_dirties, &tsk->dirties,
267 numerator, denominator);
268}
269
270/* 253/*
271 * task_dirty_limit - scale down dirty throttling threshold for one task 254 * bdi_min_ratio keeps the sum of the minimum dirty shares of all
272 * 255 * registered backing devices, which, for obvious reasons, can not
273 * task specific dirty limit: 256 * exceed 100%.
274 *
275 * dirty -= (dirty/8) * p_{t}
276 *
277 * To protect light/slow dirtying tasks from heavier/fast ones, we start
278 * throttling individual tasks before reaching the bdi dirty limit.
279 * Relatively low thresholds will be allocated to heavy dirtiers. So when
280 * dirty pages grow large, heavy dirtiers will be throttled first, which will
281 * effectively curb the growth of dirty pages. Light dirtiers with high enough
282 * dirty threshold may never get throttled.
283 */
284#define TASK_LIMIT_FRACTION 8
285static unsigned long task_dirty_limit(struct task_struct *tsk,
286 unsigned long bdi_dirty)
287{
288 long numerator, denominator;
289 unsigned long dirty = bdi_dirty;
290 u64 inv = dirty / TASK_LIMIT_FRACTION;
291
292 task_dirties_fraction(tsk, &numerator, &denominator);
293 inv *= numerator;
294 do_div(inv, denominator);
295
296 dirty -= inv;
297
298 return max(dirty, bdi_dirty/2);
299}
300
301/* Minimum limit for any task */
302static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
303{
304 return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
305}
306
307/*
308 *
309 */ 257 */
310static unsigned int bdi_min_ratio; 258static unsigned int bdi_min_ratio;
311 259
@@ -411,6 +359,12 @@ unsigned long determine_dirtyable_memory(void)
411 return x + 1; /* Ensure that we never return 0 */ 359 return x + 1; /* Ensure that we never return 0 */
412} 360}
413 361
362static unsigned long dirty_freerun_ceiling(unsigned long thresh,
363 unsigned long bg_thresh)
364{
365 return (thresh + bg_thresh) / 2;
366}
367
414static unsigned long hard_dirty_limit(unsigned long thresh) 368static unsigned long hard_dirty_limit(unsigned long thresh)
415{ 369{
416 return max(thresh, global_dirty_limit); 370 return max(thresh, global_dirty_limit);
@@ -495,6 +449,198 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
495 return bdi_dirty; 449 return bdi_dirty;
496} 450}
497 451
452/*
453 * Dirty position control.
454 *
455 * (o) global/bdi setpoints
456 *
457 * We want the dirty pages be balanced around the global/bdi setpoints.
458 * When the number of dirty pages is higher/lower than the setpoint, the
459 * dirty position control ratio (and hence task dirty ratelimit) will be
460 * decreased/increased to bring the dirty pages back to the setpoint.
461 *
462 * pos_ratio = 1 << RATELIMIT_CALC_SHIFT
463 *
464 * if (dirty < setpoint) scale up pos_ratio
465 * if (dirty > setpoint) scale down pos_ratio
466 *
467 * if (bdi_dirty < bdi_setpoint) scale up pos_ratio
468 * if (bdi_dirty > bdi_setpoint) scale down pos_ratio
469 *
470 * task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
471 *
472 * (o) global control line
473 *
474 * ^ pos_ratio
475 * |
476 * | |<===== global dirty control scope ======>|
477 * 2.0 .............*
478 * | .*
479 * | . *
480 * | . *
481 * | . *
482 * | . *
483 * | . *
484 * 1.0 ................................*
485 * | . . *
486 * | . . *
487 * | . . *
488 * | . . *
489 * | . . *
490 * 0 +------------.------------------.----------------------*------------->
491 * freerun^ setpoint^ limit^ dirty pages
492 *
493 * (o) bdi control line
494 *
495 * ^ pos_ratio
496 * |
497 * | *
498 * | *
499 * | *
500 * | *
501 * | * |<=========== span ============>|
502 * 1.0 .......................*
503 * | . *
504 * | . *
505 * | . *
506 * | . *
507 * | . *
508 * | . *
509 * | . *
510 * | . *
511 * | . *
512 * | . *
513 * | . *
514 * 1/4 ...............................................* * * * * * * * * * * *
515 * | . .
516 * | . .
517 * | . .
518 * 0 +----------------------.-------------------------------.------------->
519 * bdi_setpoint^ x_intercept^
520 *
521 * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can
522 * be smoothly throttled down to normal if it starts high in situations like
523 * - start writing to a slow SD card and a fast disk at the same time. The SD
524 * card's bdi_dirty may rush to many times higher than bdi_setpoint.
525 * - the bdi dirty thresh drops quickly due to change of JBOD workload
526 */
527static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
528 unsigned long thresh,
529 unsigned long bg_thresh,
530 unsigned long dirty,
531 unsigned long bdi_thresh,
532 unsigned long bdi_dirty)
533{
534 unsigned long write_bw = bdi->avg_write_bandwidth;
535 unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
536 unsigned long limit = hard_dirty_limit(thresh);
537 unsigned long x_intercept;
538 unsigned long setpoint; /* dirty pages' target balance point */
539 unsigned long bdi_setpoint;
540 unsigned long span;
541 long long pos_ratio; /* for scaling up/down the rate limit */
542 long x;
543
544 if (unlikely(dirty >= limit))
545 return 0;
546
547 /*
548 * global setpoint
549 *
550 * setpoint - dirty 3
551 * f(dirty) := 1.0 + (----------------)
552 * limit - setpoint
553 *
554 * it's a 3rd order polynomial that subjects to
555 *
 556 * (1) f(freerun) = 2.0 => ramp up dirty_ratelimit reasonably fast
557 * (2) f(setpoint) = 1.0 => the balance point
558 * (3) f(limit) = 0 => the hard limit
559 * (4) df/dx <= 0 => negative feedback control
560 * (5) the closer to setpoint, the smaller |df/dx| (and the reverse)
561 * => fast response on large errors; small oscillation near setpoint
562 */
563 setpoint = (freerun + limit) / 2;
564 x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT,
565 limit - setpoint + 1);
566 pos_ratio = x;
567 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
568 pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
569 pos_ratio += 1 << RATELIMIT_CALC_SHIFT;
570
571 /*
572 * We have computed basic pos_ratio above based on global situation. If
573 * the bdi is over/under its share of dirty pages, we want to scale
574 * pos_ratio further down/up. That is done by the following mechanism.
575 */
576
577 /*
578 * bdi setpoint
579 *
580 * f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
581 *
582 * x_intercept - bdi_dirty
583 * := --------------------------
584 * x_intercept - bdi_setpoint
585 *
586 * The main bdi control line is a linear function that subjects to
587 *
588 * (1) f(bdi_setpoint) = 1.0
589 * (2) k = - 1 / (8 * write_bw) (in single bdi case)
590 * or equally: x_intercept = bdi_setpoint + 8 * write_bw
591 *
592 * For single bdi case, the dirty pages are observed to fluctuate
593 * regularly within range
594 * [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
 595 * for various filesystems, where (2) can yield a reasonable 12.5%
596 * fluctuation range for pos_ratio.
597 *
598 * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its
599 * own size, so move the slope over accordingly and choose a slope that
600 * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh.
601 */
602 if (unlikely(bdi_thresh > thresh))
603 bdi_thresh = thresh;
604 bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
605 /*
606 * scale global setpoint to bdi's:
607 * bdi_setpoint = setpoint * bdi_thresh / thresh
608 */
609 x = div_u64((u64)bdi_thresh << 16, thresh + 1);
610 bdi_setpoint = setpoint * (u64)x >> 16;
611 /*
612 * Use span=(8*write_bw) in single bdi case as indicated by
613 * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
614 *
615 * bdi_thresh thresh - bdi_thresh
616 * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
617 * thresh thresh
618 */
619 span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
620 x_intercept = bdi_setpoint + span;
621
622 if (bdi_dirty < x_intercept - span / 4) {
623 pos_ratio = div_u64(pos_ratio * (x_intercept - bdi_dirty),
624 x_intercept - bdi_setpoint + 1);
625 } else
626 pos_ratio /= 4;
627
628 /*
629 * bdi reserve area, safeguard against dirty pool underrun and disk idle
630 * It may push the desired control point of global dirty pages higher
631 * than setpoint.
632 */
633 x_intercept = bdi_thresh / 2;
634 if (bdi_dirty < x_intercept) {
635 if (bdi_dirty > x_intercept / 8)
636 pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty);
637 else
638 pos_ratio *= 8;
639 }
640
641 return pos_ratio;
642}
643
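The global control line above is the cubic pos_ratio = 1.0 + ((setpoint - dirty) / (limit - setpoint))^3, evaluated in fixed point with RATELIMIT_CALC_SHIFT = 10: roughly 2.0 at the freerun ceiling, exactly 1.0 at the setpoint, 0 at the hard limit, and flattest near the setpoint so small errors produce small corrections. A fixed-point sketch of just that global curve, with made-up page counts and the per-bdi scaling and reserve area left out:

#include <stdio.h>

#define RATELIMIT_CALC_SHIFT 10

/* pos_ratio = 1.0 + ((setpoint - dirty) / (limit - setpoint))^3, fixed point */
static long long pos_ratio(unsigned long dirty, unsigned long freerun,
			   unsigned long limit)
{
	unsigned long setpoint = (freerun + limit) / 2;
	long long x, ratio;

	if (dirty >= limit)
		return 0;

	x = ((long long)setpoint - (long long)dirty) << RATELIMIT_CALC_SHIFT;
	x /= (long long)(limit - setpoint + 1);
	ratio = x;
	ratio = ratio * x >> RATELIMIT_CALC_SHIFT;	/* x^2 */
	ratio = ratio * x >> RATELIMIT_CALC_SHIFT;	/* x^3 */
	ratio += 1 << RATELIMIT_CALC_SHIFT;		/* + 1.0 */
	return ratio;
}

int main(void)
{
	unsigned long freerun = 1000, limit = 2000;	/* pages, made up */

	for (unsigned long dirty = 1000; dirty <= 2000; dirty += 250)
		printf("dirty %4lu -> pos_ratio %.3f\n", dirty,
		       (double)pos_ratio(dirty, freerun, limit) /
		       (1 << RATELIMIT_CALC_SHIFT));
	return 0;
}

With these numbers the output runs from about 1.99 at 1000 dirty pages down through 1.00 at the 1500-page setpoint to 0 at the 2000-page limit, mirroring properties (1)-(3) in the comment block.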
498static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, 644static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
499 unsigned long elapsed, 645 unsigned long elapsed,
500 unsigned long written) 646 unsigned long written)
@@ -591,8 +737,153 @@ static void global_update_bandwidth(unsigned long thresh,
591 spin_unlock(&dirty_lock); 737 spin_unlock(&dirty_lock);
592} 738}
593 739
740/*
741 * Maintain bdi->dirty_ratelimit, the base dirty throttle rate.
742 *
743 * Normal bdi tasks will be curbed at or below it in long term.
744 * Obviously it should be around (write_bw / N) when there are N dd tasks.
745 */
746static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
747 unsigned long thresh,
748 unsigned long bg_thresh,
749 unsigned long dirty,
750 unsigned long bdi_thresh,
751 unsigned long bdi_dirty,
752 unsigned long dirtied,
753 unsigned long elapsed)
754{
755 unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
756 unsigned long limit = hard_dirty_limit(thresh);
757 unsigned long setpoint = (freerun + limit) / 2;
758 unsigned long write_bw = bdi->avg_write_bandwidth;
759 unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
760 unsigned long dirty_rate;
761 unsigned long task_ratelimit;
762 unsigned long balanced_dirty_ratelimit;
763 unsigned long pos_ratio;
764 unsigned long step;
765 unsigned long x;
766
767 /*
768 * The dirty rate will match the writeout rate in long term, except
769 * when dirty pages are truncated by userspace or re-dirtied by FS.
770 */
771 dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
772
773 pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
774 bdi_thresh, bdi_dirty);
775 /*
776 * task_ratelimit reflects each dd's dirty rate for the past 200ms.
777 */
778 task_ratelimit = (u64)dirty_ratelimit *
779 pos_ratio >> RATELIMIT_CALC_SHIFT;
 780	task_ratelimit++; /* it helps ramp up dirty_ratelimit from tiny values */
781
782 /*
783 * A linear estimation of the "balanced" throttle rate. The theory is,
784 * if there are N dd tasks, each throttled at task_ratelimit, the bdi's
785 * dirty_rate will be measured to be (N * task_ratelimit). So the below
786 * formula will yield the balanced rate limit (write_bw / N).
787 *
788 * Note that the expanded form is not a pure rate feedback:
789 * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) (1)
790 * but also takes pos_ratio into account:
791 * rate_(i+1) = rate_(i) * (write_bw / dirty_rate) * pos_ratio (2)
792 *
793 * (1) is not realistic because pos_ratio also takes part in balancing
794 * the dirty rate. Consider the state
795 * pos_ratio = 0.5 (3)
796 * rate = 2 * (write_bw / N) (4)
 797 * If (1) is used, it will get stuck in that state, because each dd will
798 * be throttled at
799 * task_ratelimit = pos_ratio * rate = (write_bw / N) (5)
800 * yielding
801 * dirty_rate = N * task_ratelimit = write_bw (6)
802 * put (6) into (1) we get
803 * rate_(i+1) = rate_(i) (7)
804 *
805 * So we end up using (2) to always keep
806 * rate_(i+1) ~= (write_bw / N) (8)
807 * regardless of the value of pos_ratio. As long as (8) is satisfied,
808 * pos_ratio is able to drive itself to 1.0, which is not only where
 809 * the dirty count meets the setpoint, but also where the slope of
 810 * pos_ratio is flattest and hence task_ratelimit fluctuates the least.
811 */
812 balanced_dirty_ratelimit = div_u64((u64)task_ratelimit * write_bw,
813 dirty_rate | 1);
814
815 /*
816 * We could safely do this and return immediately:
817 *
818 * bdi->dirty_ratelimit = balanced_dirty_ratelimit;
819 *
820 * However to get a more stable dirty_ratelimit, the below elaborated
 821 * code makes use of task_ratelimit to filter out singular points and
822 * limit the step size.
823 *
824 * The below code essentially only uses the relative value of
825 *
826 * task_ratelimit - dirty_ratelimit
827 * = (pos_ratio - 1) * dirty_ratelimit
828 *
829 * which reflects the direction and size of dirty position error.
830 */
831
832 /*
833 * dirty_ratelimit will follow balanced_dirty_ratelimit iff
834 * task_ratelimit is on the same side of dirty_ratelimit, too.
835 * For example, when
836 * - dirty_ratelimit > balanced_dirty_ratelimit
837 * - dirty_ratelimit > task_ratelimit (dirty pages are above setpoint)
838 * lowering dirty_ratelimit will help meet both the position and rate
839 * control targets. Otherwise, don't update dirty_ratelimit if it will
840 * only help meet the rate target. After all, what the users ultimately
841 * feel and care are stable dirty rate and small position error.
842 *
843 * |task_ratelimit - dirty_ratelimit| is used to limit the step size
 844 * and filter out the singular points of balanced_dirty_ratelimit, which
 845 * keeps jumping around randomly and can even leap far away at times
846 * due to the small 200ms estimation period of dirty_rate (we want to
847 * keep that period small to reduce time lags).
848 */
849 step = 0;
850 if (dirty < setpoint) {
851 x = min(bdi->balanced_dirty_ratelimit,
852 min(balanced_dirty_ratelimit, task_ratelimit));
853 if (dirty_ratelimit < x)
854 step = x - dirty_ratelimit;
855 } else {
856 x = max(bdi->balanced_dirty_ratelimit,
857 max(balanced_dirty_ratelimit, task_ratelimit));
858 if (dirty_ratelimit > x)
859 step = dirty_ratelimit - x;
860 }
861
862 /*
863 * Don't pursue 100% rate matching. It's impossible since the balanced
 864 * rate itself is constantly fluctuating. So decrease the tracking speed
865 * when it gets close to the target. Helps eliminate pointless tremors.
866 */
867 step >>= dirty_ratelimit / (2 * step + 1);
868 /*
869 * Limit the tracking speed to avoid overshooting.
870 */
871 step = (step + 7) / 8;
872
873 if (dirty_ratelimit < balanced_dirty_ratelimit)
874 dirty_ratelimit += step;
875 else
876 dirty_ratelimit -= step;
877
878 bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
879 bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
880
881 trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
882}
883
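/*
 * Illustration only, not part of the patch: a toy userspace run of the two
 * feedback formulas discussed in the comment block above.  With N dd tasks
 * each throttled at task_ratelimit = rate * pos_ratio, the measured
 * dirty_rate is N * rate * pos_ratio, so the pure rate feedback (1) gets
 * stuck at write_bw / (N * pos_ratio), while formula (2) lands on the
 * balanced write_bw / N.  All numbers are made up.
 */
#include <stdio.h>

int main(void)
{
	double write_bw = 100.0;	/* MB/s, illustrative */
	double pos_ratio = 0.5;		/* held away from 1.0 on purpose */
	int N = 4;			/* concurrent dirtier tasks */
	double rate1 = 10.0, rate2 = 10.0;

	for (int i = 0; i < 5; i++) {
		double dirty_rate1 = N * rate1 * pos_ratio;
		double dirty_rate2 = N * rate2 * pos_ratio;

		rate1 = rate1 * write_bw / dirty_rate1;			/* (1) */
		rate2 = rate2 * write_bw / dirty_rate2 * pos_ratio;	/* (2) */
		printf("iter %d: formula(1)=%.1f formula(2)=%.1f target=%.1f\n",
		       i, rate1, rate2, write_bw / N);
	}
	return 0;
}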
594void __bdi_update_bandwidth(struct backing_dev_info *bdi, 884void __bdi_update_bandwidth(struct backing_dev_info *bdi,
595 unsigned long thresh, 885 unsigned long thresh,
886 unsigned long bg_thresh,
596 unsigned long dirty, 887 unsigned long dirty,
597 unsigned long bdi_thresh, 888 unsigned long bdi_thresh,
598 unsigned long bdi_dirty, 889 unsigned long bdi_dirty,
@@ -600,6 +891,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
600{ 891{
601 unsigned long now = jiffies; 892 unsigned long now = jiffies;
602 unsigned long elapsed = now - bdi->bw_time_stamp; 893 unsigned long elapsed = now - bdi->bw_time_stamp;
894 unsigned long dirtied;
603 unsigned long written; 895 unsigned long written;
604 896
605 /* 897 /*
@@ -608,6 +900,7 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
608 if (elapsed < BANDWIDTH_INTERVAL) 900 if (elapsed < BANDWIDTH_INTERVAL)
609 return; 901 return;
610 902
903 dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
611 written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); 904 written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
612 905
613 /* 906 /*
@@ -617,18 +910,23 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
617 if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) 910 if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
618 goto snapshot; 911 goto snapshot;
619 912
620 if (thresh) 913 if (thresh) {
621 global_update_bandwidth(thresh, dirty, now); 914 global_update_bandwidth(thresh, dirty, now);
622 915 bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
916 bdi_thresh, bdi_dirty,
917 dirtied, elapsed);
918 }
623 bdi_update_write_bandwidth(bdi, elapsed, written); 919 bdi_update_write_bandwidth(bdi, elapsed, written);
624 920
625snapshot: 921snapshot:
922 bdi->dirtied_stamp = dirtied;
626 bdi->written_stamp = written; 923 bdi->written_stamp = written;
627 bdi->bw_time_stamp = now; 924 bdi->bw_time_stamp = now;
628} 925}
629 926
630static void bdi_update_bandwidth(struct backing_dev_info *bdi, 927static void bdi_update_bandwidth(struct backing_dev_info *bdi,
631 unsigned long thresh, 928 unsigned long thresh,
929 unsigned long bg_thresh,
632 unsigned long dirty, 930 unsigned long dirty,
633 unsigned long bdi_thresh, 931 unsigned long bdi_thresh,
634 unsigned long bdi_dirty, 932 unsigned long bdi_dirty,
@@ -637,37 +935,99 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
637 if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) 935 if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
638 return; 936 return;
639 spin_lock(&bdi->wb.list_lock); 937 spin_lock(&bdi->wb.list_lock);
640 __bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty, 938 __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
641 start_time); 939 bdi_thresh, bdi_dirty, start_time);
642 spin_unlock(&bdi->wb.list_lock); 940 spin_unlock(&bdi->wb.list_lock);
643} 941}
644 942
645/* 943/*
 944 * After a task has dirtied this many pages, balance_dirty_pages_ratelimited_nr()
945 * will look to see if it needs to start dirty throttling.
946 *
947 * If dirty_poll_interval is too low, big NUMA machines will call the expensive
948 * global_page_state() too often. So scale it near-sqrt to the safety margin
949 * (the number of pages we may dirty without exceeding the dirty limits).
950 */
951static unsigned long dirty_poll_interval(unsigned long dirty,
952 unsigned long thresh)
953{
954 if (thresh > dirty)
955 return 1UL << (ilog2(thresh - dirty) >> 1);
956
957 return 1;
958}
959
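/*
 * Illustration only, not part of the patch: the near-sqrt poll interval in
 * plain userspace C, with the kernel's ilog2() modeled by __builtin_clzl().
 * For a safety margin of m pages the task may dirty roughly sqrt(m) pages
 * before polling the global counters again.  Sample numbers are made up.
 */
#include <stdio.h>

static unsigned long model_ilog2(unsigned long x)
{
	return 8 * sizeof(long) - 1 - __builtin_clzl(x);
}

static unsigned long model_poll_interval(unsigned long dirty,
					 unsigned long thresh)
{
	if (thresh > dirty)
		return 1UL << (model_ilog2(thresh - dirty) >> 1);
	return 1;
}

int main(void)
{
	unsigned long thresh = 1UL << 20;	/* 1M pages, illustrative */

	for (unsigned long dirty = 0; dirty < thresh; dirty += thresh / 4)
		printf("margin=%7lu pages -> poll every %lu dirtied pages\n",
		       thresh - dirty, model_poll_interval(dirty, thresh));
	return 0;
}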
960static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
961 unsigned long bdi_dirty)
962{
963 unsigned long bw = bdi->avg_write_bandwidth;
964 unsigned long hi = ilog2(bw);
965 unsigned long lo = ilog2(bdi->dirty_ratelimit);
966 unsigned long t;
967
968 /* target for 20ms max pause on 1-dd case */
969 t = HZ / 50;
970
971 /*
972 * Scale up pause time for concurrent dirtiers in order to reduce CPU
973 * overheads.
974 *
975 * (N * 20ms) on 2^N concurrent tasks.
976 */
977 if (hi > lo)
978 t += (hi - lo) * (20 * HZ) / 1024;
979
980 /*
 981 * Limit the pause time on small-memory systems. If we sleep for too
 982 * long, a small pool of dirty/writeback pages may go empty and the disk
 983 * may go idle.
984 *
985 * 8 serves as the safety ratio.
986 */
987 if (bdi_dirty)
988 t = min(t, bdi_dirty * HZ / (8 * bw + 1));
989
990 /*
 991 * The pause time will settle within the range (max_pause/4, max_pause).
 992 * Apply a minimum value of 4 so that max_pause/4 is non-zero.
993 */
994 return clamp_val(t, 4, MAX_PAUSE);
995}
996
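/*
 * Illustration only, not part of the patch: rough userspace arithmetic for
 * the max pause above, in jiffies.  hi - lo approximates log2(number of
 * concurrent dirtiers), since dirty_ratelimit ~= write_bw / N.  HZ and the
 * sample bandwidth/ratelimit values are invented for the example.
 */
#include <stdio.h>

#define HZ 1000

static unsigned long model_ilog2(unsigned long x)
{
	return 8 * sizeof(long) - 1 - __builtin_clzl(x);
}

int main(void)
{
	unsigned long write_bw = 25600;		/* pages/s, ~100MB/s */

	for (int n = 1; n <= 64; n *= 4) {	/* n concurrent dirtiers */
		unsigned long ratelimit = write_bw / n;
		unsigned long t = HZ / 50;	/* 20ms base pause */

		if (model_ilog2(write_bw) > model_ilog2(ratelimit))
			t += (model_ilog2(write_bw) - model_ilog2(ratelimit)) *
			     (20 * HZ) / 1024;
		printf("%2d dirtiers -> max pause ~%lu ms\n", n, t * 1000 / HZ);
	}
	return 0;
}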
997/*
646 * balance_dirty_pages() must be called by processes which are generating dirty 998 * balance_dirty_pages() must be called by processes which are generating dirty
647 * data. It looks at the number of dirty pages in the machine and will force 999 * data. It looks at the number of dirty pages in the machine and will force
648 * the caller to perform writeback if the system is over `vm_dirty_ratio'. 1000 * the caller to wait once crossing the (background_thresh + dirty_thresh) / 2.
649 * If we're over `background_thresh' then the writeback threads are woken to 1001 * If we're over `background_thresh' then the writeback threads are woken to
650 * perform some writeout. 1002 * perform some writeout.
651 */ 1003 */
652static void balance_dirty_pages(struct address_space *mapping, 1004static void balance_dirty_pages(struct address_space *mapping,
653 unsigned long write_chunk) 1005 unsigned long pages_dirtied)
654{ 1006{
655 unsigned long nr_reclaimable, bdi_nr_reclaimable; 1007 unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
1008 unsigned long bdi_reclaimable;
656 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ 1009 unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */
657 unsigned long bdi_dirty; 1010 unsigned long bdi_dirty;
1011 unsigned long freerun;
658 unsigned long background_thresh; 1012 unsigned long background_thresh;
659 unsigned long dirty_thresh; 1013 unsigned long dirty_thresh;
660 unsigned long bdi_thresh; 1014 unsigned long bdi_thresh;
661 unsigned long task_bdi_thresh; 1015 long pause = 0;
662 unsigned long min_task_bdi_thresh; 1016 long uninitialized_var(max_pause);
663 unsigned long pages_written = 0;
664 unsigned long pause = 1;
665 bool dirty_exceeded = false; 1017 bool dirty_exceeded = false;
666 bool clear_dirty_exceeded = true; 1018 unsigned long task_ratelimit;
1019 unsigned long uninitialized_var(dirty_ratelimit);
1020 unsigned long pos_ratio;
667 struct backing_dev_info *bdi = mapping->backing_dev_info; 1021 struct backing_dev_info *bdi = mapping->backing_dev_info;
668 unsigned long start_time = jiffies; 1022 unsigned long start_time = jiffies;
669 1023
670 for (;;) { 1024 for (;;) {
1025 /*
1026 * Unstable writes are a feature of certain networked
 1027 * filesystems (e.g. NFS) in which data may have been
1028 * written to the server's write cache, but has not yet
1029 * been flushed to permanent storage.
1030 */
671 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 1031 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
672 global_page_state(NR_UNSTABLE_NFS); 1032 global_page_state(NR_UNSTABLE_NFS);
673 nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK); 1033 nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
@@ -679,12 +1039,28 @@ static void balance_dirty_pages(struct address_space *mapping,
679 * catch-up. This avoids (excessively) small writeouts 1039 * catch-up. This avoids (excessively) small writeouts
680 * when the bdi limits are ramping up. 1040 * when the bdi limits are ramping up.
681 */ 1041 */
682 if (nr_dirty <= (background_thresh + dirty_thresh) / 2) 1042 freerun = dirty_freerun_ceiling(dirty_thresh,
1043 background_thresh);
1044 if (nr_dirty <= freerun)
683 break; 1045 break;
684 1046
1047 if (unlikely(!writeback_in_progress(bdi)))
1048 bdi_start_background_writeback(bdi);
1049
1050 /*
 1051 * bdi_thresh is not treated as a hard limiting factor the way
 1052 * dirty_thresh is, for two reasons:
1053 * - in JBOD setup, bdi_thresh can fluctuate a lot
1054 * - in a system with HDD and USB key, the USB key may somehow
1055 * go into state (bdi_dirty >> bdi_thresh) either because
1056 * bdi_dirty starts high, or because bdi_thresh drops low.
1057 * In this case we don't want to hard throttle the USB key
1058 * dirtiers for 100 seconds until bdi_dirty drops under
1059 * bdi_thresh. Instead the auxiliary bdi control line in
1060 * bdi_position_ratio() will let the dirtier task progress
1061 * at some rate <= (write_bw / 2) for bringing down bdi_dirty.
1062 */
685 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 1063 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
686 min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
687 task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
688 1064
689 /* 1065 /*
690 * In order to avoid the stacked BDI deadlock we need 1066 * In order to avoid the stacked BDI deadlock we need
@@ -696,56 +1072,69 @@ static void balance_dirty_pages(struct address_space *mapping,
696 * actually dirty; with m+n sitting in the percpu 1072 * actually dirty; with m+n sitting in the percpu
697 * deltas. 1073 * deltas.
698 */ 1074 */
699 if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) { 1075 if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
700 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); 1076 bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
701 bdi_dirty = bdi_nr_reclaimable + 1077 bdi_dirty = bdi_reclaimable +
702 bdi_stat_sum(bdi, BDI_WRITEBACK); 1078 bdi_stat_sum(bdi, BDI_WRITEBACK);
703 } else { 1079 } else {
704 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 1080 bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
705 bdi_dirty = bdi_nr_reclaimable + 1081 bdi_dirty = bdi_reclaimable +
706 bdi_stat(bdi, BDI_WRITEBACK); 1082 bdi_stat(bdi, BDI_WRITEBACK);
707 } 1083 }
708 1084
709 /* 1085 dirty_exceeded = (bdi_dirty > bdi_thresh) ||
710 * The bdi thresh is somehow "soft" limit derived from the
711 * global "hard" limit. The former helps to prevent heavy IO
712 * bdi or process from holding back light ones; The latter is
713 * the last resort safeguard.
714 */
715 dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
716 (nr_dirty > dirty_thresh); 1086 (nr_dirty > dirty_thresh);
717 clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) && 1087 if (dirty_exceeded && !bdi->dirty_exceeded)
718 (nr_dirty <= dirty_thresh);
719
720 if (!dirty_exceeded)
721 break;
722
723 if (!bdi->dirty_exceeded)
724 bdi->dirty_exceeded = 1; 1088 bdi->dirty_exceeded = 1;
725 1089
726 bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty, 1090 bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
727 bdi_thresh, bdi_dirty, start_time); 1091 nr_dirty, bdi_thresh, bdi_dirty,
728 1092 start_time);
729 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 1093
730 * Unstable writes are a feature of certain networked 1094 max_pause = bdi_max_pause(bdi, bdi_dirty);
731 * filesystems (i.e. NFS) in which data may have been 1095
732 * written to the server's write cache, but has not yet 1096 dirty_ratelimit = bdi->dirty_ratelimit;
733 * been flushed to permanent storage. 1097 pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
734 * Only move pages to writeback if this bdi is over its 1098 background_thresh, nr_dirty,
735 * threshold otherwise wait until the disk writes catch 1099 bdi_thresh, bdi_dirty);
736 * up. 1100 task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
737 */ 1101 RATELIMIT_CALC_SHIFT;
738 trace_balance_dirty_start(bdi); 1102 if (unlikely(task_ratelimit == 0)) {
739 if (bdi_nr_reclaimable > task_bdi_thresh) { 1103 pause = max_pause;
740 pages_written += writeback_inodes_wb(&bdi->wb, 1104 goto pause;
741 write_chunk); 1105 }
742 trace_balance_dirty_written(bdi, pages_written); 1106 pause = HZ * pages_dirtied / task_ratelimit;
743 if (pages_written >= write_chunk) 1107 if (unlikely(pause <= 0)) {
744 break; /* We've done our duty */ 1108 trace_balance_dirty_pages(bdi,
1109 dirty_thresh,
1110 background_thresh,
1111 nr_dirty,
1112 bdi_thresh,
1113 bdi_dirty,
1114 dirty_ratelimit,
1115 task_ratelimit,
1116 pages_dirtied,
1117 pause,
1118 start_time);
1119 pause = 1; /* avoid resetting nr_dirtied_pause below */
1120 break;
745 } 1121 }
1122 pause = min(pause, max_pause);
1123
1124pause:
1125 trace_balance_dirty_pages(bdi,
1126 dirty_thresh,
1127 background_thresh,
1128 nr_dirty,
1129 bdi_thresh,
1130 bdi_dirty,
1131 dirty_ratelimit,
1132 task_ratelimit,
1133 pages_dirtied,
1134 pause,
1135 start_time);
746 __set_current_state(TASK_UNINTERRUPTIBLE); 1136 __set_current_state(TASK_UNINTERRUPTIBLE);
747 io_schedule_timeout(pause); 1137 io_schedule_timeout(pause);
748 trace_balance_dirty_wait(bdi);
749 1138
750 dirty_thresh = hard_dirty_limit(dirty_thresh); 1139 dirty_thresh = hard_dirty_limit(dirty_thresh);
751 /* 1140 /*
@@ -754,35 +1143,30 @@ static void balance_dirty_pages(struct address_space *mapping,
754 * 200ms is typically more than enough to curb heavy dirtiers; 1143 * 200ms is typically more than enough to curb heavy dirtiers;
755 * (b) the pause time limit makes the dirtiers more responsive. 1144 * (b) the pause time limit makes the dirtiers more responsive.
756 */ 1145 */
757 if (nr_dirty < dirty_thresh + 1146 if (nr_dirty < dirty_thresh)
758 dirty_thresh / DIRTY_MAXPAUSE_AREA &&
759 time_after(jiffies, start_time + MAX_PAUSE))
760 break;
761 /*
762 * pass-good area. When some bdi gets blocked (eg. NFS server
763 * not responding), or write bandwidth dropped dramatically due
764 * to concurrent reads, or dirty threshold suddenly dropped and
765 * the dirty pages cannot be brought down anytime soon (eg. on
766 * slow USB stick), at least let go of the good bdi's.
767 */
768 if (nr_dirty < dirty_thresh +
769 dirty_thresh / DIRTY_PASSGOOD_AREA &&
770 bdi_dirty < bdi_thresh)
771 break; 1147 break;
772
773 /*
774 * Increase the delay for each loop, up to our previous
775 * default of taking a 100ms nap.
776 */
777 pause <<= 1;
778 if (pause > HZ / 10)
779 pause = HZ / 10;
780 } 1148 }
781 1149
782 /* Clear dirty_exceeded flag only when no task can exceed the limit */ 1150 if (!dirty_exceeded && bdi->dirty_exceeded)
783 if (clear_dirty_exceeded && bdi->dirty_exceeded)
784 bdi->dirty_exceeded = 0; 1151 bdi->dirty_exceeded = 0;
785 1152
1153 current->nr_dirtied = 0;
1154 if (pause == 0) { /* in freerun area */
1155 current->nr_dirtied_pause =
1156 dirty_poll_interval(nr_dirty, dirty_thresh);
1157 } else if (pause <= max_pause / 4 &&
1158 pages_dirtied >= current->nr_dirtied_pause) {
1159 current->nr_dirtied_pause = clamp_val(
1160 dirty_ratelimit * (max_pause / 2) / HZ,
1161 pages_dirtied + pages_dirtied / 8,
1162 pages_dirtied * 4);
1163 } else if (pause >= max_pause) {
1164 current->nr_dirtied_pause = 1 | clamp_val(
1165 dirty_ratelimit * (max_pause / 2) / HZ,
1166 pages_dirtied / 4,
1167 pages_dirtied - pages_dirtied / 8);
1168 }
1169
786 if (writeback_in_progress(bdi)) 1170 if (writeback_in_progress(bdi))
787 return; 1171 return;
788 1172
@@ -794,8 +1178,10 @@ static void balance_dirty_pages(struct address_space *mapping,
794 * In normal mode, we start background writeout at the lower 1178 * In normal mode, we start background writeout at the lower
795 * background_thresh, to keep the amount of dirty memory low. 1179 * background_thresh, to keep the amount of dirty memory low.
796 */ 1180 */
797 if ((laptop_mode && pages_written) || 1181 if (laptop_mode)
798 (!laptop_mode && (nr_reclaimable > background_thresh))) 1182 return;
1183
1184 if (nr_reclaimable > background_thresh)
799 bdi_start_background_writeback(bdi); 1185 bdi_start_background_writeback(bdi);
800} 1186}
801 1187
@@ -809,7 +1195,7 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite)
809 } 1195 }
810} 1196}
811 1197
812static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; 1198static DEFINE_PER_CPU(int, bdp_ratelimits);
813 1199
814/** 1200/**
815 * balance_dirty_pages_ratelimited_nr - balance dirty memory state 1201 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
@@ -829,31 +1215,39 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
829 unsigned long nr_pages_dirtied) 1215 unsigned long nr_pages_dirtied)
830{ 1216{
831 struct backing_dev_info *bdi = mapping->backing_dev_info; 1217 struct backing_dev_info *bdi = mapping->backing_dev_info;
832 unsigned long ratelimit; 1218 int ratelimit;
833 unsigned long *p; 1219 int *p;
834 1220
835 if (!bdi_cap_account_dirty(bdi)) 1221 if (!bdi_cap_account_dirty(bdi))
836 return; 1222 return;
837 1223
838 ratelimit = ratelimit_pages; 1224 ratelimit = current->nr_dirtied_pause;
839 if (mapping->backing_dev_info->dirty_exceeded) 1225 if (bdi->dirty_exceeded)
840 ratelimit = 8; 1226 ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
1227
1228 current->nr_dirtied += nr_pages_dirtied;
841 1229
1230 preempt_disable();
842 /* 1231 /*
 843 * Check the rate limiting. Also, we do not want to throttle real-time 1232 * This prevents one CPU from accumulating too many dirtied pages without
844 * tasks in balance_dirty_pages(). Period. 1233 * calling into balance_dirty_pages(), which can happen when there are
 1234 * 1000+ tasks that all start dirtying pages at exactly the same
 1235 * time, and hence all honour a too-large initial task->nr_dirtied_pause.
845 */ 1236 */
846 preempt_disable();
847 p = &__get_cpu_var(bdp_ratelimits); 1237 p = &__get_cpu_var(bdp_ratelimits);
848 *p += nr_pages_dirtied; 1238 if (unlikely(current->nr_dirtied >= ratelimit))
849 if (unlikely(*p >= ratelimit)) {
850 ratelimit = sync_writeback_pages(*p);
851 *p = 0; 1239 *p = 0;
852 preempt_enable(); 1240 else {
853 balance_dirty_pages(mapping, ratelimit); 1241 *p += nr_pages_dirtied;
854 return; 1242 if (unlikely(*p >= ratelimit_pages)) {
1243 *p = 0;
1244 ratelimit = 0;
1245 }
855 } 1246 }
856 preempt_enable(); 1247 preempt_enable();
1248
1249 if (unlikely(current->nr_dirtied >= ratelimit))
1250 balance_dirty_pages(mapping, current->nr_dirtied);
857} 1251}
858EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); 1252EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
859 1253
@@ -909,7 +1303,8 @@ void laptop_mode_timer_fn(unsigned long data)
909 * threshold 1303 * threshold
910 */ 1304 */
911 if (bdi_has_dirty_io(&q->backing_dev_info)) 1305 if (bdi_has_dirty_io(&q->backing_dev_info))
912 bdi_start_writeback(&q->backing_dev_info, nr_pages); 1306 bdi_start_writeback(&q->backing_dev_info, nr_pages,
1307 WB_REASON_LAPTOP_TIMER);
913} 1308}
914 1309
915/* 1310/*
@@ -948,22 +1343,17 @@ void laptop_sync_completion(void)
948 * 1343 *
949 * Here we set ratelimit_pages to a level which ensures that when all CPUs are 1344 * Here we set ratelimit_pages to a level which ensures that when all CPUs are
950 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory 1345 * dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
951 * thresholds before writeback cuts in. 1346 * thresholds.
952 *
953 * But the limit should not be set too high. Because it also controls the
954 * amount of memory which the balance_dirty_pages() caller has to write back.
955 * If this is too large then the caller will block on the IO queue all the
956 * time. So limit it to four megabytes - the balance_dirty_pages() caller
957 * will write six megabyte chunks, max.
958 */ 1347 */
959 1348
960void writeback_set_ratelimit(void) 1349void writeback_set_ratelimit(void)
961{ 1350{
962 ratelimit_pages = vm_total_pages / (num_online_cpus() * 32); 1351 unsigned long background_thresh;
1352 unsigned long dirty_thresh;
1353 global_dirty_limits(&background_thresh, &dirty_thresh);
1354 ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
963 if (ratelimit_pages < 16) 1355 if (ratelimit_pages < 16)
964 ratelimit_pages = 16; 1356 ratelimit_pages = 16;
965 if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
966 ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
967} 1357}
968 1358
969static int __cpuinit 1359static int __cpuinit
@@ -1333,6 +1723,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
1333 __inc_zone_page_state(page, NR_FILE_DIRTY); 1723 __inc_zone_page_state(page, NR_FILE_DIRTY);
1334 __inc_zone_page_state(page, NR_DIRTIED); 1724 __inc_zone_page_state(page, NR_DIRTIED);
1335 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 1725 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1726 __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
1336 task_dirty_inc(current); 1727 task_dirty_inc(current);
1337 task_io_account_write(PAGE_CACHE_SIZE); 1728 task_io_account_write(PAGE_CACHE_SIZE);
1338 } 1729 }
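For reference, the per-task sleep introduced above boils down to pause = HZ * pages_dirtied / task_ratelimit, clamped to bdi_max_pause(). The fragment below is an illustration only, not part of the patch; HZ and all sample values are invented, and pos_ratio is modeled in 1/1024 units as a stand-in for RATELIMIT_CALC_SHIFT.

#include <stdio.h>

#define HZ 1000

int main(void)
{
	unsigned long dirty_ratelimit = 2560;	/* pages/s granted to this task */
	unsigned long max_pause = HZ / 5;	/* 200ms cap, illustrative */
	unsigned long pages_dirtied = 64;

	for (unsigned long pos_ratio = 256; pos_ratio <= 1024; pos_ratio += 256) {
		unsigned long task_ratelimit =
			dirty_ratelimit * pos_ratio / 1024 + 1;
		unsigned long pause = HZ * pages_dirtied / task_ratelimit;

		if (pause > max_pause)
			pause = max_pause;
		printf("pos_ratio=%4lu/1024 -> sleep %3lu ms per %lu pages\n",
		       pos_ratio, pause * 1000 / HZ, pages_dirtied);
	}
	return 0;
}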
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1dbcf8888f14..9dd443d89d8b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -318,6 +318,7 @@ static void bad_page(struct page *page)
318 current->comm, page_to_pfn(page)); 318 current->comm, page_to_pfn(page));
319 dump_page(page); 319 dump_page(page);
320 320
321 print_modules();
321 dump_stack(); 322 dump_stack();
322out: 323out:
323 /* Leave bad fields for debug, except PageBuddy could make trouble */ 324 /* Leave bad fields for debug, except PageBuddy could make trouble */
@@ -1409,14 +1410,11 @@ static int __init fail_page_alloc_debugfs(void)
1409{ 1410{
1410 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; 1411 mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
1411 struct dentry *dir; 1412 struct dentry *dir;
1412 int err;
1413 1413
1414 err = init_fault_attr_dentries(&fail_page_alloc.attr, 1414 dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
1415 "fail_page_alloc"); 1415 &fail_page_alloc.attr);
1416 if (err) 1416 if (IS_ERR(dir))
1417 return err; 1417 return PTR_ERR(dir);
1418
1419 dir = fail_page_alloc.attr.dir;
1420 1418
1421 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir, 1419 if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
1422 &fail_page_alloc.ignore_gfp_wait)) 1420 &fail_page_alloc.ignore_gfp_wait))
@@ -1430,7 +1428,7 @@ static int __init fail_page_alloc_debugfs(void)
1430 1428
1431 return 0; 1429 return 0;
1432fail: 1430fail:
1433 cleanup_fault_attr_dentries(&fail_page_alloc.attr); 1431 debugfs_remove_recursive(dir);
1434 1432
1435 return -ENOMEM; 1433 return -ENOMEM;
1436} 1434}
@@ -1756,7 +1754,6 @@ static DEFINE_RATELIMIT_STATE(nopage_rs,
1756 1754
1757void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) 1755void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1758{ 1756{
1759 va_list args;
1760 unsigned int filter = SHOW_MEM_FILTER_NODES; 1757 unsigned int filter = SHOW_MEM_FILTER_NODES;
1761 1758
1762 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) 1759 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
@@ -1775,14 +1772,21 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1775 filter &= ~SHOW_MEM_FILTER_NODES; 1772 filter &= ~SHOW_MEM_FILTER_NODES;
1776 1773
1777 if (fmt) { 1774 if (fmt) {
1778 printk(KERN_WARNING); 1775 struct va_format vaf;
1776 va_list args;
1777
1779 va_start(args, fmt); 1778 va_start(args, fmt);
1780 vprintk(fmt, args); 1779
1780 vaf.fmt = fmt;
1781 vaf.va = &args;
1782
1783 pr_warn("%pV", &vaf);
1784
1781 va_end(args); 1785 va_end(args);
1782 } 1786 }
1783 1787
1784 pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n", 1788 pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
1785 current->comm, order, gfp_mask); 1789 current->comm, order, gfp_mask);
1786 1790
1787 dump_stack(); 1791 dump_stack();
1788 if (!should_suppress_show_mem()) 1792 if (!should_suppress_show_mem())
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 39d216d535ea..2d123f94a8df 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -133,10 +133,13 @@ struct page *lookup_cgroup_page(struct page_cgroup *pc)
133static void *__meminit alloc_page_cgroup(size_t size, int nid) 133static void *__meminit alloc_page_cgroup(size_t size, int nid)
134{ 134{
135 void *addr = NULL; 135 void *addr = NULL;
136 gfp_t flags = GFP_KERNEL | __GFP_NOWARN;
136 137
137 addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN); 138 addr = alloc_pages_exact_nid(nid, size, flags);
138 if (addr) 139 if (addr) {
140 kmemleak_alloc(addr, size, 1, flags);
139 return addr; 141 return addr;
142 }
140 143
141 if (node_state(nid, N_HIGH_MEMORY)) 144 if (node_state(nid, N_HIGH_MEMORY))
142 addr = vmalloc_node(size, nid); 145 addr = vmalloc_node(size, nid);
@@ -357,7 +360,7 @@ struct swap_cgroup_ctrl {
357 spinlock_t lock; 360 spinlock_t lock;
358}; 361};
359 362
360struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; 363static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
361 364
362struct swap_cgroup { 365struct swap_cgroup {
363 unsigned short id; 366 unsigned short id;
@@ -513,11 +516,10 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
513 length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); 516 length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
514 array_size = length * sizeof(void *); 517 array_size = length * sizeof(void *);
515 518
516 array = vmalloc(array_size); 519 array = vzalloc(array_size);
517 if (!array) 520 if (!array)
518 goto nomem; 521 goto nomem;
519 522
520 memset(array, 0, array_size);
521 ctrl = &swap_cgroup_ctrl[type]; 523 ctrl = &swap_cgroup_ctrl[type];
522 mutex_lock(&swap_cgroup_mutex); 524 mutex_lock(&swap_cgroup_mutex);
523 ctrl->length = length; 525 ctrl->length = length;
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
new file mode 100644
index 000000000000..e920aa3ce104
--- /dev/null
+++ b/mm/process_vm_access.c
@@ -0,0 +1,496 @@
1/*
2 * linux/mm/process_vm_access.c
3 *
4 * Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/mm.h>
13#include <linux/uio.h>
14#include <linux/sched.h>
15#include <linux/highmem.h>
16#include <linux/ptrace.h>
17#include <linux/slab.h>
18#include <linux/syscalls.h>
19
20#ifdef CONFIG_COMPAT
21#include <linux/compat.h>
22#endif
23
24/**
25 * process_vm_rw_pages - read/write pages from task specified
26 * @task: task to read/write from
27 * @mm: mm for task
28 * @process_pages: struct pages area that can store at least
29 * nr_pages_to_copy struct page pointers
30 * @pa: address of page in task to start copying from/to
31 * @start_offset: offset in page to start copying from/to
32 * @len: number of bytes to copy
33 * @lvec: iovec array specifying where to copy to/from
34 * @lvec_cnt: number of elements in iovec array
35 * @lvec_current: index in iovec array we are up to
36 * @lvec_offset: offset in bytes from current iovec iov_base we are up to
37 * @vm_write: 0 means copy from, 1 means copy to
38 * @nr_pages_to_copy: number of pages to copy
39 * @bytes_copied: returns number of bytes successfully copied
40 * Returns 0 on success, error code otherwise
41 */
42static int process_vm_rw_pages(struct task_struct *task,
43 struct mm_struct *mm,
44 struct page **process_pages,
45 unsigned long pa,
46 unsigned long start_offset,
47 unsigned long len,
48 const struct iovec *lvec,
49 unsigned long lvec_cnt,
50 unsigned long *lvec_current,
51 size_t *lvec_offset,
52 int vm_write,
53 unsigned int nr_pages_to_copy,
54 ssize_t *bytes_copied)
55{
56 int pages_pinned;
57 void *target_kaddr;
58 int pgs_copied = 0;
59 int j;
60 int ret;
61 ssize_t bytes_to_copy;
62 ssize_t rc = 0;
63
64 *bytes_copied = 0;
65
66 /* Get the pages we're interested in */
67 down_read(&mm->mmap_sem);
68 pages_pinned = get_user_pages(task, mm, pa,
69 nr_pages_to_copy,
70 vm_write, 0, process_pages, NULL);
71 up_read(&mm->mmap_sem);
72
73 if (pages_pinned != nr_pages_to_copy) {
74 rc = -EFAULT;
75 goto end;
76 }
77
78 /* Do the copy for each page */
79 for (pgs_copied = 0;
80 (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt);
81 pgs_copied++) {
82 /* Make sure we have a non zero length iovec */
83 while (*lvec_current < lvec_cnt
84 && lvec[*lvec_current].iov_len == 0)
85 (*lvec_current)++;
86 if (*lvec_current == lvec_cnt)
87 break;
88
89 /*
90 * Will copy smallest of:
91 * - bytes remaining in page
92 * - bytes remaining in destination iovec
93 */
94 bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset,
95 len - *bytes_copied);
96 bytes_to_copy = min_t(ssize_t, bytes_to_copy,
97 lvec[*lvec_current].iov_len
98 - *lvec_offset);
99
100 target_kaddr = kmap(process_pages[pgs_copied]) + start_offset;
101
102 if (vm_write)
103 ret = copy_from_user(target_kaddr,
104 lvec[*lvec_current].iov_base
105 + *lvec_offset,
106 bytes_to_copy);
107 else
108 ret = copy_to_user(lvec[*lvec_current].iov_base
109 + *lvec_offset,
110 target_kaddr, bytes_to_copy);
111 kunmap(process_pages[pgs_copied]);
112 if (ret) {
113 *bytes_copied += bytes_to_copy - ret;
114 pgs_copied++;
115 rc = -EFAULT;
116 goto end;
117 }
118 *bytes_copied += bytes_to_copy;
119 *lvec_offset += bytes_to_copy;
120 if (*lvec_offset == lvec[*lvec_current].iov_len) {
121 /*
122 * Need to copy remaining part of page into the
123 * next iovec if there are any bytes left in page
124 */
125 (*lvec_current)++;
126 *lvec_offset = 0;
127 start_offset = (start_offset + bytes_to_copy)
128 % PAGE_SIZE;
129 if (start_offset)
130 pgs_copied--;
131 } else {
132 start_offset = 0;
133 }
134 }
135
136end:
137 if (vm_write) {
138 for (j = 0; j < pages_pinned; j++) {
139 if (j < pgs_copied)
140 set_page_dirty_lock(process_pages[j]);
141 put_page(process_pages[j]);
142 }
143 } else {
144 for (j = 0; j < pages_pinned; j++)
145 put_page(process_pages[j]);
146 }
147
148 return rc;
149}
150
 151/* Maximum size (in bytes, two pages) of the kmalloc'd area holding struct page pointers during copy */
152#define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2)
153
154/**
155 * process_vm_rw_single_vec - read/write pages from task specified
156 * @addr: start memory address of target process
157 * @len: size of area to copy to/from
158 * @lvec: iovec array specifying where to copy to/from locally
159 * @lvec_cnt: number of elements in iovec array
160 * @lvec_current: index in iovec array we are up to
161 * @lvec_offset: offset in bytes from current iovec iov_base we are up to
162 * @process_pages: struct pages area that can store at least
163 * nr_pages_to_copy struct page pointers
164 * @mm: mm for task
165 * @task: task to read/write from
166 * @vm_write: 0 means copy from, 1 means copy to
167 * @bytes_copied: returns number of bytes successfully copied
168 * Returns 0 on success or on failure error code
169 */
170static int process_vm_rw_single_vec(unsigned long addr,
171 unsigned long len,
172 const struct iovec *lvec,
173 unsigned long lvec_cnt,
174 unsigned long *lvec_current,
175 size_t *lvec_offset,
176 struct page **process_pages,
177 struct mm_struct *mm,
178 struct task_struct *task,
179 int vm_write,
180 ssize_t *bytes_copied)
181{
182 unsigned long pa = addr & PAGE_MASK;
183 unsigned long start_offset = addr - pa;
184 unsigned long nr_pages;
185 ssize_t bytes_copied_loop;
186 ssize_t rc = 0;
187 unsigned long nr_pages_copied = 0;
188 unsigned long nr_pages_to_copy;
189 unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
190 / sizeof(struct pages *);
191
192 *bytes_copied = 0;
193
194 /* Work out address and page range required */
195 if (len == 0)
196 return 0;
197 nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
198
199 while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) {
200 nr_pages_to_copy = min(nr_pages - nr_pages_copied,
201 max_pages_per_loop);
202
203 rc = process_vm_rw_pages(task, mm, process_pages, pa,
204 start_offset, len,
205 lvec, lvec_cnt,
206 lvec_current, lvec_offset,
207 vm_write, nr_pages_to_copy,
208 &bytes_copied_loop);
209 start_offset = 0;
210 *bytes_copied += bytes_copied_loop;
211
212 if (rc < 0) {
213 return rc;
214 } else {
215 len -= bytes_copied_loop;
216 nr_pages_copied += nr_pages_to_copy;
217 pa += nr_pages_to_copy * PAGE_SIZE;
218 }
219 }
220
221 return rc;
222}
223
224/* Maximum number of entries for process pages array
225 which lives on stack */
226#define PVM_MAX_PP_ARRAY_COUNT 16
227
228/**
229 * process_vm_rw_core - core of reading/writing pages from task specified
230 * @pid: PID of process to read/write from/to
231 * @lvec: iovec array specifying where to copy to/from locally
232 * @liovcnt: size of lvec array
233 * @rvec: iovec array specifying where to copy to/from in the other process
234 * @riovcnt: size of rvec array
235 * @flags: currently unused
236 * @vm_write: 0 if reading from other process, 1 if writing to other process
237 * Returns the number of bytes read/written or error code. May
238 * return less bytes than expected if an error occurs during the copying
239 * process.
240 */
241static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
242 unsigned long liovcnt,
243 const struct iovec *rvec,
244 unsigned long riovcnt,
245 unsigned long flags, int vm_write)
246{
247 struct task_struct *task;
248 struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT];
249 struct page **process_pages = pp_stack;
250 struct mm_struct *mm;
251 unsigned long i;
252 ssize_t rc = 0;
253 ssize_t bytes_copied_loop;
254 ssize_t bytes_copied = 0;
255 unsigned long nr_pages = 0;
256 unsigned long nr_pages_iov;
257 unsigned long iov_l_curr_idx = 0;
258 size_t iov_l_curr_offset = 0;
259 ssize_t iov_len;
260
261 /*
262 * Work out how many pages of struct pages we're going to need
263 * when eventually calling get_user_pages
264 */
265 for (i = 0; i < riovcnt; i++) {
266 iov_len = rvec[i].iov_len;
267 if (iov_len > 0) {
268 nr_pages_iov = ((unsigned long)rvec[i].iov_base
269 + iov_len)
270 / PAGE_SIZE - (unsigned long)rvec[i].iov_base
271 / PAGE_SIZE + 1;
272 nr_pages = max(nr_pages, nr_pages_iov);
273 }
274 }
275
276 if (nr_pages == 0)
277 return 0;
278
279 if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) {
280 /* For reliability don't try to kmalloc more than
281 2 pages worth */
282 process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES,
283 sizeof(struct pages *)*nr_pages),
284 GFP_KERNEL);
285
286 if (!process_pages)
287 return -ENOMEM;
288 }
289
290 /* Get process information */
291 rcu_read_lock();
292 task = find_task_by_vpid(pid);
293 if (task)
294 get_task_struct(task);
295 rcu_read_unlock();
296 if (!task) {
297 rc = -ESRCH;
298 goto free_proc_pages;
299 }
300
301 task_lock(task);
302 if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
303 task_unlock(task);
304 rc = -EPERM;
305 goto put_task_struct;
306 }
307 mm = task->mm;
308
309 if (!mm || (task->flags & PF_KTHREAD)) {
310 task_unlock(task);
311 rc = -EINVAL;
312 goto put_task_struct;
313 }
314
315 atomic_inc(&mm->mm_users);
316 task_unlock(task);
317
318 for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) {
319 rc = process_vm_rw_single_vec(
320 (unsigned long)rvec[i].iov_base, rvec[i].iov_len,
321 lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset,
322 process_pages, mm, task, vm_write, &bytes_copied_loop);
323 bytes_copied += bytes_copied_loop;
324 if (rc != 0) {
325 /* If we have managed to copy any data at all then
326 we return the number of bytes copied. Otherwise
327 we return the error code */
328 if (bytes_copied)
329 rc = bytes_copied;
330 goto put_mm;
331 }
332 }
333
334 rc = bytes_copied;
335put_mm:
336 mmput(mm);
337
338put_task_struct:
339 put_task_struct(task);
340
341free_proc_pages:
342 if (process_pages != pp_stack)
343 kfree(process_pages);
344 return rc;
345}
346
347/**
348 * process_vm_rw - check iovecs before calling core routine
349 * @pid: PID of process to read/write from/to
350 * @lvec: iovec array specifying where to copy to/from locally
351 * @liovcnt: size of lvec array
352 * @rvec: iovec array specifying where to copy to/from in the other process
353 * @riovcnt: size of rvec array
354 * @flags: currently unused
355 * @vm_write: 0 if reading from other process, 1 if writing to other process
356 * Returns the number of bytes read/written or error code. May
357 * return less bytes than expected if an error occurs during the copying
358 * process.
359 */
360static ssize_t process_vm_rw(pid_t pid,
361 const struct iovec __user *lvec,
362 unsigned long liovcnt,
363 const struct iovec __user *rvec,
364 unsigned long riovcnt,
365 unsigned long flags, int vm_write)
366{
367 struct iovec iovstack_l[UIO_FASTIOV];
368 struct iovec iovstack_r[UIO_FASTIOV];
369 struct iovec *iov_l = iovstack_l;
370 struct iovec *iov_r = iovstack_r;
371 ssize_t rc;
372
373 if (flags != 0)
374 return -EINVAL;
375
376 /* Check iovecs */
377 if (vm_write)
378 rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV,
379 iovstack_l, &iov_l, 1);
380 else
381 rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV,
382 iovstack_l, &iov_l, 1);
383 if (rc <= 0)
384 goto free_iovecs;
385
386 rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV,
387 iovstack_r, &iov_r, 0);
388 if (rc <= 0)
389 goto free_iovecs;
390
391 rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
392 vm_write);
393
394free_iovecs:
395 if (iov_r != iovstack_r)
396 kfree(iov_r);
397 if (iov_l != iovstack_l)
398 kfree(iov_l);
399
400 return rc;
401}
402
403SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec,
404 unsigned long, liovcnt, const struct iovec __user *, rvec,
405 unsigned long, riovcnt, unsigned long, flags)
406{
407 return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0);
408}
409
410SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
411 const struct iovec __user *, lvec,
412 unsigned long, liovcnt, const struct iovec __user *, rvec,
413 unsigned long, riovcnt, unsigned long, flags)
414{
415 return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1);
416}
417
418#ifdef CONFIG_COMPAT
419
420asmlinkage ssize_t
421compat_process_vm_rw(compat_pid_t pid,
422 const struct compat_iovec __user *lvec,
423 unsigned long liovcnt,
424 const struct compat_iovec __user *rvec,
425 unsigned long riovcnt,
426 unsigned long flags, int vm_write)
427{
428 struct iovec iovstack_l[UIO_FASTIOV];
429 struct iovec iovstack_r[UIO_FASTIOV];
430 struct iovec *iov_l = iovstack_l;
431 struct iovec *iov_r = iovstack_r;
432 ssize_t rc = -EFAULT;
433
434 if (flags != 0)
435 return -EINVAL;
436
437 if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec)))
438 goto out;
439
440 if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec)))
441 goto out;
442
443 if (vm_write)
444 rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
445 UIO_FASTIOV, iovstack_l,
446 &iov_l, 1);
447 else
448 rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt,
449 UIO_FASTIOV, iovstack_l,
450 &iov_l, 1);
451 if (rc <= 0)
452 goto free_iovecs;
453 rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt,
454 UIO_FASTIOV, iovstack_r,
455 &iov_r, 0);
456 if (rc <= 0)
457 goto free_iovecs;
458
459 rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
460 vm_write);
461
462free_iovecs:
463 if (iov_r != iovstack_r)
464 kfree(iov_r);
465 if (iov_l != iovstack_l)
466 kfree(iov_l);
467
468out:
469 return rc;
470}
471
472asmlinkage ssize_t
473compat_sys_process_vm_readv(compat_pid_t pid,
474 const struct compat_iovec __user *lvec,
475 unsigned long liovcnt,
476 const struct compat_iovec __user *rvec,
477 unsigned long riovcnt,
478 unsigned long flags)
479{
480 return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
481 riovcnt, flags, 0);
482}
483
484asmlinkage ssize_t
485compat_sys_process_vm_writev(compat_pid_t pid,
486 const struct compat_iovec __user *lvec,
487 unsigned long liovcnt,
488 const struct compat_iovec __user *rvec,
489 unsigned long riovcnt,
490 unsigned long flags)
491{
492 return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
493 riovcnt, flags, 1);
494}
495
496#endif
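The two syscalls added above can be exercised from userspace as follows. This is an illustration only, not part of the patch; it assumes the glibc wrappers for process_vm_readv()/process_vm_writev() (available in glibc >= 2.15), and reads from the calling process itself so no extra ptrace permission is needed. Without the wrappers, syscall(__NR_process_vm_readv, ...) would be used instead.

/* build: cc -o pvm_demo pvm_demo.c */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char src[32] = "hello from the remote iovec";
	char dst[32] = { 0 };
	struct iovec local = { .iov_base = dst, .iov_len = sizeof(dst) };
	struct iovec remote = { .iov_base = src, .iov_len = sizeof(src) };

	/* copy sizeof(src) bytes out of our own address space */
	ssize_t n = process_vm_readv(getpid(), &local, 1, &remote, 1, 0);
	if (n < 0) {
		perror("process_vm_readv");
		return 1;
	}
	printf("copied %zd bytes: %s\n", n, dst);
	return 0;
}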
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 2876349339a7..942212970529 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -17,7 +17,6 @@
17#include <linux/gfp.h> 17#include <linux/gfp.h>
18#include <linux/mm.h> 18#include <linux/mm.h>
19#include <linux/mmzone.h> 19#include <linux/mmzone.h>
20#include <linux/module.h>
21#include <linux/quicklist.h> 20#include <linux/quicklist.h>
22 21
23DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); 22DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
diff --git a/mm/readahead.c b/mm/readahead.c
index 867f9dd82dcd..cbcbb02f3e28 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -11,7 +11,7 @@
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/blkdev.h> 15#include <linux/blkdev.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/task_io_accounting_ops.h> 17#include <linux/task_io_accounting_ops.h>
diff --git a/mm/rmap.c b/mm/rmap.c
index 8005080fb9e3..a4fd3680038b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -51,7 +51,7 @@
51#include <linux/ksm.h> 51#include <linux/ksm.h>
52#include <linux/rmap.h> 52#include <linux/rmap.h>
53#include <linux/rcupdate.h> 53#include <linux/rcupdate.h>
54#include <linux/module.h> 54#include <linux/export.h>
55#include <linux/memcontrol.h> 55#include <linux/memcontrol.h>
56#include <linux/mmu_notifier.h> 56#include <linux/mmu_notifier.h>
57#include <linux/migrate.h> 57#include <linux/migrate.h>
@@ -1164,7 +1164,7 @@ void page_remove_rmap(struct page *page)
1164 1164
1165/* 1165/*
1166 * Subfunctions of try_to_unmap: try_to_unmap_one called 1166 * Subfunctions of try_to_unmap: try_to_unmap_one called
1167 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 1167 * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file.
1168 */ 1168 */
1169int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 1169int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
1170 unsigned long address, enum ttu_flags flags) 1170 unsigned long address, enum ttu_flags flags)
diff --git a/mm/shmem.c b/mm/shmem.c
index 5cc21f8b4cd3..d6722506d2da 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -6,7 +6,8 @@
6 * 2000-2001 Christoph Rohland 6 * 2000-2001 Christoph Rohland
7 * 2000-2001 SAP AG 7 * 2000-2001 SAP AG
8 * 2002 Red Hat Inc. 8 * 2002 Red Hat Inc.
9 * Copyright (C) 2002-2005 Hugh Dickins. 9 * Copyright (C) 2002-2011 Hugh Dickins.
10 * Copyright (C) 2011 Google Inc.
10 * Copyright (C) 2002-2005 VERITAS Software Corporation. 11 * Copyright (C) 2002-2005 VERITAS Software Corporation.
11 * Copyright (C) 2004 Andi Kleen, SuSE Labs 12 * Copyright (C) 2004 Andi Kleen, SuSE Labs
12 * 13 *
@@ -27,8 +28,7 @@
27#include <linux/pagemap.h> 28#include <linux/pagemap.h>
28#include <linux/file.h> 29#include <linux/file.h>
29#include <linux/mm.h> 30#include <linux/mm.h>
30#include <linux/module.h> 31#include <linux/export.h>
31#include <linux/percpu_counter.h>
32#include <linux/swap.h> 32#include <linux/swap.h>
33 33
34static struct vfsmount *shm_mnt; 34static struct vfsmount *shm_mnt;
@@ -51,6 +51,8 @@ static struct vfsmount *shm_mnt;
51#include <linux/shmem_fs.h> 51#include <linux/shmem_fs.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/blkdev.h> 53#include <linux/blkdev.h>
54#include <linux/pagevec.h>
55#include <linux/percpu_counter.h>
54#include <linux/splice.h> 56#include <linux/splice.h>
55#include <linux/security.h> 57#include <linux/security.h>
56#include <linux/swapops.h> 58#include <linux/swapops.h>
@@ -63,43 +65,17 @@ static struct vfsmount *shm_mnt;
63#include <linux/magic.h> 65#include <linux/magic.h>
64 66
65#include <asm/uaccess.h> 67#include <asm/uaccess.h>
66#include <asm/div64.h>
67#include <asm/pgtable.h> 68#include <asm/pgtable.h>
68 69
69/*
70 * The maximum size of a shmem/tmpfs file is limited by the maximum size of
71 * its triple-indirect swap vector - see illustration at shmem_swp_entry().
72 *
73 * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
74 * but one eighth of that on a 64-bit kernel. With 8kB page size, maximum
75 * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
76 * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
77 *
78 * We use / and * instead of shifts in the definitions below, so that the swap
79 * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
80 */
81#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
82#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
83
84#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
85#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)
86
87#define SHMEM_MAX_BYTES min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
88#define SHMEM_MAX_INDEX ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))
89
90#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) 70#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
91#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT) 71#define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
92 72
93/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
94#define SHMEM_PAGEIN VM_READ
95#define SHMEM_TRUNCATE VM_WRITE
96
97/* Definition to limit shmem_truncate's steps between cond_rescheds */
98#define LATENCY_LIMIT 64
99
100/* Pretend that each entry is of this size in directory's i_size */ 73/* Pretend that each entry is of this size in directory's i_size */
101#define BOGO_DIRENT_SIZE 20 74#define BOGO_DIRENT_SIZE 20
102 75
76/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
77#define SHORT_SYMLINK_LEN 128
78
103struct shmem_xattr { 79struct shmem_xattr {
104 struct list_head list; /* anchored by shmem_inode_info->xattr_list */ 80 struct list_head list; /* anchored by shmem_inode_info->xattr_list */
105 char *name; /* xattr name */ 81 char *name; /* xattr name */
@@ -107,7 +83,7 @@ struct shmem_xattr {
107 char value[0]; 83 char value[0];
108}; 84};
109 85
110/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ 86/* Flag allocation requirements to shmem_getpage */
111enum sgp_type { 87enum sgp_type {
112 SGP_READ, /* don't exceed i_size, don't allocate page */ 88 SGP_READ, /* don't exceed i_size, don't allocate page */
113 SGP_CACHE, /* don't exceed i_size, may allocate page */ 89 SGP_CACHE, /* don't exceed i_size, may allocate page */
@@ -137,56 +113,6 @@ static inline int shmem_getpage(struct inode *inode, pgoff_t index,
137 mapping_gfp_mask(inode->i_mapping), fault_type); 113 mapping_gfp_mask(inode->i_mapping), fault_type);
138} 114}
139 115
140static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
141{
142 /*
143 * The above definition of ENTRIES_PER_PAGE, and the use of
144 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
145 * might be reconsidered if it ever diverges from PAGE_SIZE.
146 *
147 * Mobility flags are masked out as swap vectors cannot move
148 */
149 return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
150 PAGE_CACHE_SHIFT-PAGE_SHIFT);
151}
152
153static inline void shmem_dir_free(struct page *page)
154{
155 __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
156}
157
158static struct page **shmem_dir_map(struct page *page)
159{
160 return (struct page **)kmap_atomic(page, KM_USER0);
161}
162
163static inline void shmem_dir_unmap(struct page **dir)
164{
165 kunmap_atomic(dir, KM_USER0);
166}
167
168static swp_entry_t *shmem_swp_map(struct page *page)
169{
170 return (swp_entry_t *)kmap_atomic(page, KM_USER1);
171}
172
173static inline void shmem_swp_balance_unmap(void)
174{
175 /*
176 * When passing a pointer to an i_direct entry, to code which
177 * also handles indirect entries and so will shmem_swp_unmap,
178 * we must arrange for the preempt count to remain in balance.
179 * What kmap_atomic of a lowmem page does depends on config
180 * and architecture, so pretend to kmap_atomic some lowmem page.
181 */
182 (void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
183}
184
185static inline void shmem_swp_unmap(swp_entry_t *entry)
186{
187 kunmap_atomic(entry, KM_USER1);
188}
189
190static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) 116static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
191{ 117{
192 return sb->s_fs_info; 118 return sb->s_fs_info;
@@ -244,15 +170,6 @@ static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
244static LIST_HEAD(shmem_swaplist); 170static LIST_HEAD(shmem_swaplist);
245static DEFINE_MUTEX(shmem_swaplist_mutex); 171static DEFINE_MUTEX(shmem_swaplist_mutex);
246 172
247static void shmem_free_blocks(struct inode *inode, long pages)
248{
249 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
250 if (sbinfo->max_blocks) {
251 percpu_counter_add(&sbinfo->used_blocks, -pages);
252 inode->i_blocks -= pages*BLOCKS_PER_PAGE;
253 }
254}
255
256static int shmem_reserve_inode(struct super_block *sb) 173static int shmem_reserve_inode(struct super_block *sb)
257{ 174{
258 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 175 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
@@ -279,7 +196,7 @@ static void shmem_free_inode(struct super_block *sb)
279} 196}
280 197
281/** 198/**
282 * shmem_recalc_inode - recalculate the size of an inode 199 * shmem_recalc_inode - recalculate the block usage of an inode
283 * @inode: inode to recalc 200 * @inode: inode to recalc
284 * 201 *
285 * We have to calculate the free blocks since the mm can drop 202 * We have to calculate the free blocks since the mm can drop
@@ -297,474 +214,297 @@ static void shmem_recalc_inode(struct inode *inode)
297 214
298 freed = info->alloced - info->swapped - inode->i_mapping->nrpages; 215 freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
299 if (freed > 0) { 216 if (freed > 0) {
217 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
218 if (sbinfo->max_blocks)
219 percpu_counter_add(&sbinfo->used_blocks, -freed);
300 info->alloced -= freed; 220 info->alloced -= freed;
221 inode->i_blocks -= freed * BLOCKS_PER_PAGE;
301 shmem_unacct_blocks(info->flags, freed); 222 shmem_unacct_blocks(info->flags, freed);
302 shmem_free_blocks(inode, freed);
303 } 223 }
304} 224}
305 225
306/** 226/*
307 * shmem_swp_entry - find the swap vector position in the info structure 227 * Replace item expected in radix tree by a new item, while holding tree lock.
308 * @info: info structure for the inode
309 * @index: index of the page to find
310 * @page: optional page to add to the structure. Has to be preset to
311 * all zeros
312 *
313 * If there is no space allocated yet it will return NULL when
314 * page is NULL, else it will use the page for the needed block,
315 * setting it to NULL on return to indicate that it has been used.
316 *
317 * The swap vector is organized the following way:
318 *
319 * There are SHMEM_NR_DIRECT entries directly stored in the
 320 * shmem_inode_info structure. So small files do not need an additional
321 * allocation.
322 *
323 * For pages with index > SHMEM_NR_DIRECT there is the pointer
324 * i_indirect which points to a page which holds in the first half
325 * doubly indirect blocks, in the second half triple indirect blocks:
326 *
327 * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
328 * following layout (for SHMEM_NR_DIRECT == 16):
329 *
330 * i_indirect -> dir --> 16-19
331 * | +-> 20-23
332 * |
333 * +-->dir2 --> 24-27
334 * | +-> 28-31
335 * | +-> 32-35
336 * | +-> 36-39
337 * |
338 * +-->dir3 --> 40-43
339 * +-> 44-47
340 * +-> 48-51
341 * +-> 52-55
342 */ 228 */
343static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page) 229static int shmem_radix_tree_replace(struct address_space *mapping,
344{ 230 pgoff_t index, void *expected, void *replacement)
345 unsigned long offset; 231{
346 struct page **dir; 232 void **pslot;
347 struct page *subdir; 233 void *item = NULL;
348 234
349 if (index < SHMEM_NR_DIRECT) { 235 VM_BUG_ON(!expected);
350 shmem_swp_balance_unmap(); 236 pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
351 return info->i_direct+index; 237 if (pslot)
352 } 238 item = radix_tree_deref_slot_protected(pslot,
353 if (!info->i_indirect) { 239 &mapping->tree_lock);
354 if (page) { 240 if (item != expected)
355 info->i_indirect = *page; 241 return -ENOENT;
356 *page = NULL; 242 if (replacement)
357 } 243 radix_tree_replace_slot(pslot, replacement);
358 return NULL; /* need another page */ 244 else
359 } 245 radix_tree_delete(&mapping->page_tree, index);
360 246 return 0;
361 index -= SHMEM_NR_DIRECT; 247}
362 offset = index % ENTRIES_PER_PAGE;
363 index /= ENTRIES_PER_PAGE;
364 dir = shmem_dir_map(info->i_indirect);
365
366 if (index >= ENTRIES_PER_PAGE/2) {
367 index -= ENTRIES_PER_PAGE/2;
368 dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
369 index %= ENTRIES_PER_PAGE;
370 subdir = *dir;
371 if (!subdir) {
372 if (page) {
373 *dir = *page;
374 *page = NULL;
375 }
376 shmem_dir_unmap(dir);
377 return NULL; /* need another page */
378 }
379 shmem_dir_unmap(dir);
380 dir = shmem_dir_map(subdir);
381 }
382 248
383 dir += index; 249/*
384 subdir = *dir; 250 * Like add_to_page_cache_locked, but error if expected item has gone.
385 if (!subdir) { 251 */
386 if (!page || !(subdir = *page)) { 252static int shmem_add_to_page_cache(struct page *page,
387 shmem_dir_unmap(dir); 253 struct address_space *mapping,
388 return NULL; /* need a page */ 254 pgoff_t index, gfp_t gfp, void *expected)
255{
256 int error = 0;
257
258 VM_BUG_ON(!PageLocked(page));
259 VM_BUG_ON(!PageSwapBacked(page));
260
261 if (!expected)
262 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
263 if (!error) {
264 page_cache_get(page);
265 page->mapping = mapping;
266 page->index = index;
267
268 spin_lock_irq(&mapping->tree_lock);
269 if (!expected)
270 error = radix_tree_insert(&mapping->page_tree,
271 index, page);
272 else
273 error = shmem_radix_tree_replace(mapping, index,
274 expected, page);
275 if (!error) {
276 mapping->nrpages++;
277 __inc_zone_page_state(page, NR_FILE_PAGES);
278 __inc_zone_page_state(page, NR_SHMEM);
279 spin_unlock_irq(&mapping->tree_lock);
280 } else {
281 page->mapping = NULL;
282 spin_unlock_irq(&mapping->tree_lock);
283 page_cache_release(page);
389 } 284 }
390 *dir = subdir; 285 if (!expected)
391 *page = NULL; 286 radix_tree_preload_end();
392 } 287 }
393 shmem_dir_unmap(dir); 288 if (error)
394 return shmem_swp_map(subdir) + offset; 289 mem_cgroup_uncharge_cache_page(page);
290 return error;
395} 291}
396 292
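The two helpers above carry the central idea of this rewrite: a swapped-out page no longer lives in a private swap-vector tree. Its swap entry is stored directly in the page-cache radix tree as an exceptional entry, and shmem_radix_tree_replace() exchanges page pointers and swap entries in place under mapping->tree_lock, with shmem_add_to_page_cache() using the same check to insert a page only where the expected entry still sits. An exceptional entry is a small integer shifted up and tagged in the low pointer bits so it can never collide with a real page pointer. The standalone sketch below models that encoding; the shift and tag values only approximate the kernel's RADIX_TREE_EXCEPTIONAL_* constants and the swp_to_radix_entry()/radix_to_swp_entry() helpers, so treat them as assumptions.

/* Userspace model of radix-tree "exceptional" entries: a swap value is
 * encoded as a tagged pointer so it can share a slot with real page
 * pointers.  EXC_SHIFT and EXC_TAG only approximate the kernel's
 * RADIX_TREE_EXCEPTIONAL_* constants and are assumptions of this sketch.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define EXC_TAG   0x2UL	/* low bit pattern marking a non-page entry */
#define EXC_SHIFT 2

static void *swp_to_entry(unsigned long swap_val)
{
	return (void *)((swap_val << EXC_SHIFT) | EXC_TAG);
}

static int entry_is_exceptional(const void *entry)
{
	return ((uintptr_t)entry & EXC_TAG) != 0;
}

static unsigned long entry_to_swp(const void *entry)
{
	return (uintptr_t)entry >> EXC_SHIFT;
}

int main(void)
{
	int page;			/* stands in for struct page */
	void *slot = &page;		/* a normal page pointer ... */

	assert(!entry_is_exceptional(slot));

	slot = swp_to_entry(12345);	/* ... replaced by a swap entry */
	assert(entry_is_exceptional(slot));
	printf("swap value recovered: %lu\n", entry_to_swp(slot));
	return 0;
}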
397static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value) 293/*
294 * Like delete_from_page_cache, but substitutes swap for page.
295 */
296static void shmem_delete_from_page_cache(struct page *page, void *radswap)
398{ 297{
399 long incdec = value? 1: -1; 298 struct address_space *mapping = page->mapping;
299 int error;
400 300
401 entry->val = value; 301 spin_lock_irq(&mapping->tree_lock);
402 info->swapped += incdec; 302 error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
403 if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) { 303 page->mapping = NULL;
404 struct page *page = kmap_atomic_to_page(entry); 304 mapping->nrpages--;
405 set_page_private(page, page_private(page) + incdec); 305 __dec_zone_page_state(page, NR_FILE_PAGES);
406 } 306 __dec_zone_page_state(page, NR_SHMEM);
307 spin_unlock_irq(&mapping->tree_lock);
308 page_cache_release(page);
309 BUG_ON(error);
407} 310}
408 311
409/** 312/*
410 * shmem_swp_alloc - get the position of the swap entry for the page. 313 * Like find_get_pages, but collecting swap entries as well as pages.
411 * @info: info structure for the inode
412 * @index: index of the page to find
413 * @sgp: check and recheck i_size? skip allocation?
414 * @gfp: gfp mask to use for any page allocation
415 *
416 * If the entry does not exist, allocate it.
417 */ 314 */
418static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, 315static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
419 unsigned long index, enum sgp_type sgp, gfp_t gfp) 316 pgoff_t start, unsigned int nr_pages,
420{ 317 struct page **pages, pgoff_t *indices)
421 struct inode *inode = &info->vfs_inode; 318{
422 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 319 unsigned int i;
423 struct page *page = NULL; 320 unsigned int ret;
424 swp_entry_t *entry; 321 unsigned int nr_found;
425 322
426 if (sgp != SGP_WRITE && 323 rcu_read_lock();
427 ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) 324restart:
428 return ERR_PTR(-EINVAL); 325 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
429 326 (void ***)pages, indices, start, nr_pages);
430 while (!(entry = shmem_swp_entry(info, index, &page))) { 327 ret = 0;
431 if (sgp == SGP_READ) 328 for (i = 0; i < nr_found; i++) {
432 return shmem_swp_map(ZERO_PAGE(0)); 329 struct page *page;
433 /* 330repeat:
434 * Test used_blocks against 1 less max_blocks, since we have 1 data 331 page = radix_tree_deref_slot((void **)pages[i]);
435 * page (and perhaps indirect index pages) yet to allocate: 332 if (unlikely(!page))
436 * a waste to allocate index if we cannot allocate data. 333 continue;
437 */ 334 if (radix_tree_exception(page)) {
438 if (sbinfo->max_blocks) { 335 if (radix_tree_deref_retry(page))
439 if (percpu_counter_compare(&sbinfo->used_blocks, 336 goto restart;
440 sbinfo->max_blocks - 1) >= 0) 337 /*
441 return ERR_PTR(-ENOSPC); 338 * Otherwise, we must be storing a swap entry
442 percpu_counter_inc(&sbinfo->used_blocks); 339 * here as an exceptional entry: so return it
443 inode->i_blocks += BLOCKS_PER_PAGE; 340 * without attempting to raise page count.
341 */
342 goto export;
444 } 343 }
344 if (!page_cache_get_speculative(page))
345 goto repeat;
445 346
446 spin_unlock(&info->lock); 347 /* Has the page moved? */
447 page = shmem_dir_alloc(gfp); 348 if (unlikely(page != *((void **)pages[i]))) {
448 spin_lock(&info->lock); 349 page_cache_release(page);
449 350 goto repeat;
450 if (!page) {
451 shmem_free_blocks(inode, 1);
452 return ERR_PTR(-ENOMEM);
453 }
454 if (sgp != SGP_WRITE &&
455 ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
456 entry = ERR_PTR(-EINVAL);
457 break;
458 } 351 }
459 if (info->next_index <= index) 352export:
460 info->next_index = index + 1; 353 indices[ret] = indices[i];
461 } 354 pages[ret] = page;
462 if (page) { 355 ret++;
463 /* another task gave its page, or truncated the file */ 356 }
464 shmem_free_blocks(inode, 1); 357 if (unlikely(!ret && nr_found))
465 shmem_dir_free(page); 358 goto restart;
466 } 359 rcu_read_unlock();
467 if (info->next_index <= index && !IS_ERR(entry)) 360 return ret;
468 info->next_index = index + 1;
469 return entry;
470} 361}
471 362
472/** 363/*
473 * shmem_free_swp - free some swap entries in a directory 364 * Remove swap entry from radix tree, free the swap and its page cache.
474 * @dir: pointer to the directory
475 * @edir: pointer after last entry of the directory
476 * @punch_lock: pointer to spinlock when needed for the holepunch case
477 */ 365 */
478static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir, 366static int shmem_free_swap(struct address_space *mapping,
479 spinlock_t *punch_lock) 367 pgoff_t index, void *radswap)
480{ 368{
481 spinlock_t *punch_unlock = NULL; 369 int error;
482 swp_entry_t *ptr; 370
483 int freed = 0; 371 spin_lock_irq(&mapping->tree_lock);
484 372 error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
485 for (ptr = dir; ptr < edir; ptr++) { 373 spin_unlock_irq(&mapping->tree_lock);
486 if (ptr->val) { 374 if (!error)
487 if (unlikely(punch_lock)) { 375 free_swap_and_cache(radix_to_swp_entry(radswap));
488 punch_unlock = punch_lock; 376 return error;
489 punch_lock = NULL;
490 spin_lock(punch_unlock);
491 if (!ptr->val)
492 continue;
493 }
494 free_swap_and_cache(*ptr);
495 *ptr = (swp_entry_t){0};
496 freed++;
497 }
498 }
499 if (punch_unlock)
500 spin_unlock(punch_unlock);
501 return freed;
502}
503
504static int shmem_map_and_free_swp(struct page *subdir, int offset,
505 int limit, struct page ***dir, spinlock_t *punch_lock)
506{
507 swp_entry_t *ptr;
508 int freed = 0;
509
510 ptr = shmem_swp_map(subdir);
511 for (; offset < limit; offset += LATENCY_LIMIT) {
512 int size = limit - offset;
513 if (size > LATENCY_LIMIT)
514 size = LATENCY_LIMIT;
515 freed += shmem_free_swp(ptr+offset, ptr+offset+size,
516 punch_lock);
517 if (need_resched()) {
518 shmem_swp_unmap(ptr);
519 if (*dir) {
520 shmem_dir_unmap(*dir);
521 *dir = NULL;
522 }
523 cond_resched();
524 ptr = shmem_swp_map(subdir);
525 }
526 }
527 shmem_swp_unmap(ptr);
528 return freed;
529} 377}
530 378
531static void shmem_free_pages(struct list_head *next) 379/*
380 * Pagevec may contain swap entries, so shuffle up pages before releasing.
381 */
382static void shmem_pagevec_release(struct pagevec *pvec)
532{ 383{
533 struct page *page; 384 int i, j;
534 int freed = 0; 385
535 386 for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
536 do { 387 struct page *page = pvec->pages[i];
537 page = container_of(next, struct page, lru); 388 if (!radix_tree_exceptional_entry(page))
538 next = next->next; 389 pvec->pages[j++] = page;
539 shmem_dir_free(page); 390 }
540 freed++; 391 pvec->nr = j;
541 if (freed >= LATENCY_LIMIT) { 392 pagevec_release(pvec);
542 cond_resched();
543 freed = 0;
544 }
545 } while (next);
546} 393}
547 394
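Since shmem_find_get_pages_and_swap() above hands back swap entries interleaved with page pointers, shmem_pagevec_release() compacts the pagevec in place so that only real pages reach pagevec_release(). The same two-index, in-place filter as a runnable sketch; the low-bit tag test stands in for radix_tree_exceptional_entry() and reuses the assumed encoding from the earlier sketch.

/* In-place compaction of an array, keeping only "real" entries, which is the
 * same i/j walk shmem_pagevec_release() does over pvec->pages[].  The low-bit
 * test stands in for radix_tree_exceptional_entry().
 */
#include <stdint.h>
#include <stdio.h>

static int is_exceptional(void *p)
{
	return ((uintptr_t)p & 0x2) != 0;	/* assumed tag, as in the sketch above */
}

static unsigned compact(void **vec, unsigned nr)
{
	unsigned i, j;

	for (i = 0, j = 0; i < nr; i++) {
		if (!is_exceptional(vec[i]))
			vec[j++] = vec[i];	/* keep pages, drop swap entries */
	}
	return j;				/* new element count */
}

int main(void)
{
	int a, b;
	void *vec[4] = { &a, (void *)0x32UL, &b, (void *)0x1aUL };
	unsigned nr = compact(vec, 4);

	printf("%u real pages kept\n", nr);	/* prints 2 */
	return 0;
}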
548void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 395/*
396 * Remove range of pages and swap entries from radix tree, and free them.
397 */
398void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
549{ 399{
400 struct address_space *mapping = inode->i_mapping;
550 struct shmem_inode_info *info = SHMEM_I(inode); 401 struct shmem_inode_info *info = SHMEM_I(inode);
551 unsigned long idx; 402 pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
552 unsigned long size; 403 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
553 unsigned long limit; 404 pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
554 unsigned long stage; 405 struct pagevec pvec;
555 unsigned long diroff; 406 pgoff_t indices[PAGEVEC_SIZE];
556 struct page **dir;
557 struct page *topdir;
558 struct page *middir;
559 struct page *subdir;
560 swp_entry_t *ptr;
561 LIST_HEAD(pages_to_free);
562 long nr_pages_to_free = 0;
563 long nr_swaps_freed = 0; 407 long nr_swaps_freed = 0;
564 int offset; 408 pgoff_t index;
565 int freed; 409 int i;
566 int punch_hole;
567 spinlock_t *needs_lock;
568 spinlock_t *punch_lock;
569 unsigned long upper_limit;
570 410
571 truncate_inode_pages_range(inode->i_mapping, start, end); 411 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
572 412
573 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 413 pagevec_init(&pvec, 0);
574 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 414 index = start;
575 if (idx >= info->next_index) 415 while (index <= end) {
576 return; 416 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
417 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
418 pvec.pages, indices);
419 if (!pvec.nr)
420 break;
421 mem_cgroup_uncharge_start();
422 for (i = 0; i < pagevec_count(&pvec); i++) {
423 struct page *page = pvec.pages[i];
577 424
578 spin_lock(&info->lock); 425 index = indices[i];
579 info->flags |= SHMEM_TRUNCATE; 426 if (index > end)
580 if (likely(end == (loff_t) -1)) { 427 break;
581 limit = info->next_index; 428
582 upper_limit = SHMEM_MAX_INDEX; 429 if (radix_tree_exceptional_entry(page)) {
583 info->next_index = idx; 430 nr_swaps_freed += !shmem_free_swap(mapping,
584 needs_lock = NULL; 431 index, page);
585 punch_hole = 0; 432 continue;
586 } else { 433 }
587 if (end + 1 >= inode->i_size) { /* we may free a little more */
588 limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
589 PAGE_CACHE_SHIFT;
590 upper_limit = SHMEM_MAX_INDEX;
591 } else {
592 limit = (end + 1) >> PAGE_CACHE_SHIFT;
593 upper_limit = limit;
594 }
595 needs_lock = &info->lock;
596 punch_hole = 1;
597 }
598 434
599 topdir = info->i_indirect; 435 if (!trylock_page(page))
600 if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) { 436 continue;
601 info->i_indirect = NULL; 437 if (page->mapping == mapping) {
602 nr_pages_to_free++; 438 VM_BUG_ON(PageWriteback(page));
603 list_add(&topdir->lru, &pages_to_free); 439 truncate_inode_page(mapping, page);
440 }
441 unlock_page(page);
442 }
443 shmem_pagevec_release(&pvec);
444 mem_cgroup_uncharge_end();
445 cond_resched();
446 index++;
604 } 447 }
605 spin_unlock(&info->lock);
606 448
607 if (info->swapped && idx < SHMEM_NR_DIRECT) { 449 if (partial) {
608 ptr = info->i_direct; 450 struct page *page = NULL;
609 size = limit; 451 shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
610 if (size > SHMEM_NR_DIRECT) 452 if (page) {
611 size = SHMEM_NR_DIRECT; 453 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
612 nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock); 454 set_page_dirty(page);
455 unlock_page(page);
456 page_cache_release(page);
457 }
613 } 458 }
614 459
615 /* 460 index = start;
616 * If there are no indirect blocks or we are punching a hole 461 for ( ; ; ) {
617 * below indirect blocks, nothing to be done. 462 cond_resched();
618 */ 463 pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
619 if (!topdir || limit <= SHMEM_NR_DIRECT) 464 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
620 goto done2; 465 pvec.pages, indices);
466 if (!pvec.nr) {
467 if (index == start)
468 break;
469 index = start;
470 continue;
471 }
472 if (index == start && indices[0] > end) {
473 shmem_pagevec_release(&pvec);
474 break;
475 }
476 mem_cgroup_uncharge_start();
477 for (i = 0; i < pagevec_count(&pvec); i++) {
478 struct page *page = pvec.pages[i];
621 479
622 /* 480 index = indices[i];
623 * The truncation case has already dropped info->lock, and we're safe 481 if (index > end)
624 * because i_size and next_index have already been lowered, preventing 482 break;
625 * access beyond. But in the punch_hole case, we still need to take
626 * the lock when updating the swap directory, because there might be
627 * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
628 * shmem_writepage. However, whenever we find we can remove a whole
629 * directory page (not at the misaligned start or end of the range),
630 * we first NULLify its pointer in the level above, and then have no
631 * need to take the lock when updating its contents: needs_lock and
632 * punch_lock (either pointing to info->lock or NULL) manage this.
633 */
634 483
635 upper_limit -= SHMEM_NR_DIRECT; 484 if (radix_tree_exceptional_entry(page)) {
636 limit -= SHMEM_NR_DIRECT; 485 nr_swaps_freed += !shmem_free_swap(mapping,
637 idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0; 486 index, page);
638 offset = idx % ENTRIES_PER_PAGE; 487 continue;
639 idx -= offset;
640
641 dir = shmem_dir_map(topdir);
642 stage = ENTRIES_PER_PAGEPAGE/2;
643 if (idx < ENTRIES_PER_PAGEPAGE/2) {
644 middir = topdir;
645 diroff = idx/ENTRIES_PER_PAGE;
646 } else {
647 dir += ENTRIES_PER_PAGE/2;
648 dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
649 while (stage <= idx)
650 stage += ENTRIES_PER_PAGEPAGE;
651 middir = *dir;
652 if (*dir) {
653 diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
654 ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
655 if (!diroff && !offset && upper_limit >= stage) {
656 if (needs_lock) {
657 spin_lock(needs_lock);
658 *dir = NULL;
659 spin_unlock(needs_lock);
660 needs_lock = NULL;
661 } else
662 *dir = NULL;
663 nr_pages_to_free++;
664 list_add(&middir->lru, &pages_to_free);
665 } 488 }
666 shmem_dir_unmap(dir);
667 dir = shmem_dir_map(middir);
668 } else {
669 diroff = 0;
670 offset = 0;
671 idx = stage;
672 }
673 }
674 489
675 for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) { 490 lock_page(page);
676 if (unlikely(idx == stage)) { 491 if (page->mapping == mapping) {
677 shmem_dir_unmap(dir); 492 VM_BUG_ON(PageWriteback(page));
678 dir = shmem_dir_map(topdir) + 493 truncate_inode_page(mapping, page);
679 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
680 while (!*dir) {
681 dir++;
682 idx += ENTRIES_PER_PAGEPAGE;
683 if (idx >= limit)
684 goto done1;
685 } 494 }
686 stage = idx + ENTRIES_PER_PAGEPAGE; 495 unlock_page(page);
687 middir = *dir;
688 if (punch_hole)
689 needs_lock = &info->lock;
690 if (upper_limit >= stage) {
691 if (needs_lock) {
692 spin_lock(needs_lock);
693 *dir = NULL;
694 spin_unlock(needs_lock);
695 needs_lock = NULL;
696 } else
697 *dir = NULL;
698 nr_pages_to_free++;
699 list_add(&middir->lru, &pages_to_free);
700 }
701 shmem_dir_unmap(dir);
702 cond_resched();
703 dir = shmem_dir_map(middir);
704 diroff = 0;
705 }
706 punch_lock = needs_lock;
707 subdir = dir[diroff];
708 if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
709 if (needs_lock) {
710 spin_lock(needs_lock);
711 dir[diroff] = NULL;
712 spin_unlock(needs_lock);
713 punch_lock = NULL;
714 } else
715 dir[diroff] = NULL;
716 nr_pages_to_free++;
717 list_add(&subdir->lru, &pages_to_free);
718 }
719 if (subdir && page_private(subdir) /* has swap entries */) {
720 size = limit - idx;
721 if (size > ENTRIES_PER_PAGE)
722 size = ENTRIES_PER_PAGE;
723 freed = shmem_map_and_free_swp(subdir,
724 offset, size, &dir, punch_lock);
725 if (!dir)
726 dir = shmem_dir_map(middir);
727 nr_swaps_freed += freed;
728 if (offset || punch_lock) {
729 spin_lock(&info->lock);
730 set_page_private(subdir,
731 page_private(subdir) - freed);
732 spin_unlock(&info->lock);
733 } else
734 BUG_ON(page_private(subdir) != freed);
735 } 496 }
736 offset = 0; 497 shmem_pagevec_release(&pvec);
737 } 498 mem_cgroup_uncharge_end();
738done1: 499 index++;
739 shmem_dir_unmap(dir);
740done2:
741 if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
742 /*
743 * Call truncate_inode_pages again: racing shmem_unuse_inode
744 * may have swizzled a page in from swap since
745 * truncate_pagecache or generic_delete_inode did it, before we
746 * lowered next_index. Also, though shmem_getpage checks
747 * i_size before adding to cache, no recheck after: so fix the
748 * narrow window there too.
749 */
750 truncate_inode_pages_range(inode->i_mapping, start, end);
751 } 500 }
752 501
753 spin_lock(&info->lock); 502 spin_lock(&info->lock);
754 info->flags &= ~SHMEM_TRUNCATE;
755 info->swapped -= nr_swaps_freed; 503 info->swapped -= nr_swaps_freed;
756 if (nr_pages_to_free)
757 shmem_free_blocks(inode, nr_pages_to_free);
758 shmem_recalc_inode(inode); 504 shmem_recalc_inode(inode);
759 spin_unlock(&info->lock); 505 spin_unlock(&info->lock);
760 506
761 /* 507 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
762 * Empty swap vector directory pages to be freed?
763 */
764 if (!list_empty(&pages_to_free)) {
765 pages_to_free.prev->next = NULL;
766 shmem_free_pages(pages_to_free.next);
767 }
768} 508}
769EXPORT_SYMBOL_GPL(shmem_truncate_range); 509EXPORT_SYMBOL_GPL(shmem_truncate_range);
770 510
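The new shmem_truncate_range() above works purely against the mapping: a first pass walks the range with trylock_page() and frees swap entries as it finds them, the page straddling lstart is zeroed from the partial offset via shmem_getpage(SGP_READ), and a second pass retries with lock_page(), restarting from the beginning until a scan at start comes back empty. The index arithmetic that splits a byte range into a partial head page plus whole pages is easy to check in isolation; the sketch below uses a 4K page size as an assumption in place of PAGE_CACHE_SIZE.

/* The page-index arithmetic used at the top of the new shmem_truncate_range():
 * lstart/lend are byte offsets, PAGE_SIZE stands in for PAGE_CACHE_SIZE.
 */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1ULL << PAGE_SHIFT)

int main(void)
{
	unsigned long long lstart = 5000;	/* e.g. truncate from byte 5000 ... */
	unsigned long long lend = ~0ULL;	/* ... to EOF (lend ends on a page boundary - 1) */

	/* first whole page to remove */
	unsigned long long start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
	/* offset within the straddling page from which it must be zeroed, if any */
	unsigned long partial = lstart & (PAGE_SIZE - 1);
	/* last page index covered by the range */
	unsigned long long end = lend >> PAGE_SHIFT;

	printf("start=%llu partial=%lu end=%llu\n", start, partial, end);
	/* start=2, partial=904: page 1 keeps its first 904 bytes, pages >= 2 go */
	return 0;
}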
@@ -780,37 +520,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
780 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 520 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
781 loff_t oldsize = inode->i_size; 521 loff_t oldsize = inode->i_size;
782 loff_t newsize = attr->ia_size; 522 loff_t newsize = attr->ia_size;
783 struct page *page = NULL;
784 523
785 if (newsize < oldsize) {
786 /*
787 * If truncating down to a partial page, then
788 * if that page is already allocated, hold it
789 * in memory until the truncation is over, so
790 * truncate_partial_page cannot miss it were
791 * it assigned to swap.
792 */
793 if (newsize & (PAGE_CACHE_SIZE-1)) {
794 (void) shmem_getpage(inode,
795 newsize >> PAGE_CACHE_SHIFT,
796 &page, SGP_READ, NULL);
797 if (page)
798 unlock_page(page);
799 }
800 /*
801 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
802 * detect if any pages might have been added to cache
803 * after truncate_inode_pages. But we needn't bother
804 * if it's being fully truncated to zero-length: the
805 * nrpages check is efficient enough in that case.
806 */
807 if (newsize) {
808 struct shmem_inode_info *info = SHMEM_I(inode);
809 spin_lock(&info->lock);
810 info->flags &= ~SHMEM_PAGEIN;
811 spin_unlock(&info->lock);
812 }
813 }
814 if (newsize != oldsize) { 524 if (newsize != oldsize) {
815 i_size_write(inode, newsize); 525 i_size_write(inode, newsize);
816 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 526 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -822,8 +532,6 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
822 /* unmap again to remove racily COWed private pages */ 532 /* unmap again to remove racily COWed private pages */
823 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); 533 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
824 } 534 }
825 if (page)
826 page_cache_release(page);
827 } 535 }
828 536
829 setattr_copy(inode, attr); 537 setattr_copy(inode, attr);
@@ -848,7 +556,8 @@ static void shmem_evict_inode(struct inode *inode)
848 list_del_init(&info->swaplist); 556 list_del_init(&info->swaplist);
849 mutex_unlock(&shmem_swaplist_mutex); 557 mutex_unlock(&shmem_swaplist_mutex);
850 } 558 }
851 } 559 } else
560 kfree(info->symlink);
852 561
853 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) { 562 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
854 kfree(xattr->name); 563 kfree(xattr->name);
@@ -859,106 +568,27 @@ static void shmem_evict_inode(struct inode *inode)
859 end_writeback(inode); 568 end_writeback(inode);
860} 569}
861 570
862static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) 571/*
863{ 572 * If swap found in inode, free it and move page from swapcache to filecache.
864 swp_entry_t *ptr; 573 */
865 574static int shmem_unuse_inode(struct shmem_inode_info *info,
866 for (ptr = dir; ptr < edir; ptr++) { 575 swp_entry_t swap, struct page *page)
867 if (ptr->val == entry.val)
868 return ptr - dir;
869 }
870 return -1;
871}
872
873static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
874{ 576{
875 struct address_space *mapping; 577 struct address_space *mapping = info->vfs_inode.i_mapping;
876 unsigned long idx; 578 void *radswap;
877 unsigned long size; 579 pgoff_t index;
878 unsigned long limit;
879 unsigned long stage;
880 struct page **dir;
881 struct page *subdir;
882 swp_entry_t *ptr;
883 int offset;
884 int error; 580 int error;
885 581
886 idx = 0; 582 radswap = swp_to_radix_entry(swap);
887 ptr = info->i_direct; 583 index = radix_tree_locate_item(&mapping->page_tree, radswap);
888 spin_lock(&info->lock); 584 if (index == -1)
889 if (!info->swapped) { 585 return 0;
890 list_del_init(&info->swaplist);
891 goto lost2;
892 }
893 limit = info->next_index;
894 size = limit;
895 if (size > SHMEM_NR_DIRECT)
896 size = SHMEM_NR_DIRECT;
897 offset = shmem_find_swp(entry, ptr, ptr+size);
898 if (offset >= 0) {
899 shmem_swp_balance_unmap();
900 goto found;
901 }
902 if (!info->i_indirect)
903 goto lost2;
904
905 dir = shmem_dir_map(info->i_indirect);
906 stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
907
908 for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
909 if (unlikely(idx == stage)) {
910 shmem_dir_unmap(dir-1);
911 if (cond_resched_lock(&info->lock)) {
912 /* check it has not been truncated */
913 if (limit > info->next_index) {
914 limit = info->next_index;
915 if (idx >= limit)
916 goto lost2;
917 }
918 }
919 dir = shmem_dir_map(info->i_indirect) +
920 ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
921 while (!*dir) {
922 dir++;
923 idx += ENTRIES_PER_PAGEPAGE;
924 if (idx >= limit)
925 goto lost1;
926 }
927 stage = idx + ENTRIES_PER_PAGEPAGE;
928 subdir = *dir;
929 shmem_dir_unmap(dir);
930 dir = shmem_dir_map(subdir);
931 }
932 subdir = *dir;
933 if (subdir && page_private(subdir)) {
934 ptr = shmem_swp_map(subdir);
935 size = limit - idx;
936 if (size > ENTRIES_PER_PAGE)
937 size = ENTRIES_PER_PAGE;
938 offset = shmem_find_swp(entry, ptr, ptr+size);
939 shmem_swp_unmap(ptr);
940 if (offset >= 0) {
941 shmem_dir_unmap(dir);
942 ptr = shmem_swp_map(subdir);
943 goto found;
944 }
945 }
946 }
947lost1:
948 shmem_dir_unmap(dir-1);
949lost2:
950 spin_unlock(&info->lock);
951 return 0;
952found:
953 idx += offset;
954 ptr += offset;
955 586
956 /* 587 /*
957 * Move _head_ to start search for next from here. 588 * Move _head_ to start search for next from here.
958 * But be careful: shmem_evict_inode checks list_empty without taking 589 * But be careful: shmem_evict_inode checks list_empty without taking
959 * mutex, and there's an instant in list_move_tail when info->swaplist 590 * mutex, and there's an instant in list_move_tail when info->swaplist
960 * would appear empty, if it were the only one on shmem_swaplist. We 591 * would appear empty, if it were the only one on shmem_swaplist.
961 * could avoid doing it if inode NULL; or use this minor optimization.
962 */ 592 */
963 if (shmem_swaplist.next != &info->swaplist) 593 if (shmem_swaplist.next != &info->swaplist)
964 list_move_tail(&shmem_swaplist, &info->swaplist); 594 list_move_tail(&shmem_swaplist, &info->swaplist);
@@ -968,29 +598,34 @@ found:
968 * but also to hold up shmem_evict_inode(): so inode cannot be freed 598 * but also to hold up shmem_evict_inode(): so inode cannot be freed
969 * beneath us (pagelock doesn't help until the page is in pagecache). 599 * beneath us (pagelock doesn't help until the page is in pagecache).
970 */ 600 */
971 mapping = info->vfs_inode.i_mapping; 601 error = shmem_add_to_page_cache(page, mapping, index,
972 error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); 602 GFP_NOWAIT, radswap);
973 /* which does mem_cgroup_uncharge_cache_page on error */ 603 /* which does mem_cgroup_uncharge_cache_page on error */
974 604
975 if (error != -ENOMEM) { 605 if (error != -ENOMEM) {
606 /*
607 * Truncation and eviction use free_swap_and_cache(), which
608 * only does trylock page: if we raced, best clean up here.
609 */
976 delete_from_swap_cache(page); 610 delete_from_swap_cache(page);
977 set_page_dirty(page); 611 set_page_dirty(page);
978 info->flags |= SHMEM_PAGEIN; 612 if (!error) {
979 shmem_swp_set(info, ptr, 0); 613 spin_lock(&info->lock);
980 swap_free(entry); 614 info->swapped--;
615 spin_unlock(&info->lock);
616 swap_free(swap);
617 }
981 error = 1; /* not an error, but entry was found */ 618 error = 1; /* not an error, but entry was found */
982 } 619 }
983 shmem_swp_unmap(ptr);
984 spin_unlock(&info->lock);
985 return error; 620 return error;
986} 621}
987 622
988/* 623/*
989 * shmem_unuse() search for an eventually swapped out shmem page. 624 * Search through swapped inodes to find and replace swap by page.
990 */ 625 */
991int shmem_unuse(swp_entry_t entry, struct page *page) 626int shmem_unuse(swp_entry_t swap, struct page *page)
992{ 627{
993 struct list_head *p, *next; 628 struct list_head *this, *next;
994 struct shmem_inode_info *info; 629 struct shmem_inode_info *info;
995 int found = 0; 630 int found = 0;
996 int error; 631 int error;
@@ -999,32 +634,25 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
999 * Charge page using GFP_KERNEL while we can wait, before taking 634 * Charge page using GFP_KERNEL while we can wait, before taking
1000 * the shmem_swaplist_mutex which might hold up shmem_writepage(). 635 * the shmem_swaplist_mutex which might hold up shmem_writepage().
1001 * Charged back to the user (not to caller) when swap account is used. 636 * Charged back to the user (not to caller) when swap account is used.
1002 * add_to_page_cache() will be called with GFP_NOWAIT.
1003 */ 637 */
1004 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 638 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
1005 if (error) 639 if (error)
1006 goto out; 640 goto out;
1007 /* 641 /* No radix_tree_preload: swap entry keeps a place for page in tree */
1008 * Try to preload while we can wait, to not make a habit of
1009 * draining atomic reserves; but don't latch on to this cpu,
1010 * it's okay if sometimes we get rescheduled after this.
1011 */
1012 error = radix_tree_preload(GFP_KERNEL);
1013 if (error)
1014 goto uncharge;
1015 radix_tree_preload_end();
1016 642
1017 mutex_lock(&shmem_swaplist_mutex); 643 mutex_lock(&shmem_swaplist_mutex);
1018 list_for_each_safe(p, next, &shmem_swaplist) { 644 list_for_each_safe(this, next, &shmem_swaplist) {
1019 info = list_entry(p, struct shmem_inode_info, swaplist); 645 info = list_entry(this, struct shmem_inode_info, swaplist);
1020 found = shmem_unuse_inode(info, entry, page); 646 if (info->swapped)
647 found = shmem_unuse_inode(info, swap, page);
648 else
649 list_del_init(&info->swaplist);
1021 cond_resched(); 650 cond_resched();
1022 if (found) 651 if (found)
1023 break; 652 break;
1024 } 653 }
1025 mutex_unlock(&shmem_swaplist_mutex); 654 mutex_unlock(&shmem_swaplist_mutex);
1026 655
1027uncharge:
1028 if (!found) 656 if (!found)
1029 mem_cgroup_uncharge_cache_page(page); 657 mem_cgroup_uncharge_cache_page(page);
1030 if (found < 0) 658 if (found < 0)
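In the swapoff path above, shmem_unuse_inode() no longer walks a per-inode swap vector: radix_tree_locate_item() reports which index of the mapping holds the radswap entry, and shmem_add_to_page_cache() installs the page only if that entry is still present, the expected check done under tree_lock. The sketch below models that locate-then-conditionally-replace step with a plain array standing in for the radix tree; all names in it are illustrative.

/* Userspace model of the swapoff path: find which index of a mapping holds a
 * given swap entry, then replace it with the page only if it is still there
 * (the "expected" check shmem_radix_tree_replace() performs under tree_lock).
 * A plain array stands in for the radix tree.
 */
#include <stdio.h>
#include <string.h>

#define NSLOTS 8

static long locate_item(void **slots, void *item)
{
	for (long i = 0; i < NSLOTS; i++)
		if (slots[i] == item)
			return i;
	return -1;				/* not found: nothing to do */
}

static int replace_expected(void **slots, long index, void *expected, void *new)
{
	if (slots[index] != expected)
		return -1;			/* raced: entry changed under us */
	slots[index] = new;
	return 0;
}

int main(void)
{
	void *slots[NSLOTS];
	void *radswap = (void *)0x4a2UL;	/* stands in for swp_to_radix_entry(swap) */
	int page;				/* the page read back from swap */

	memset(slots, 0, sizeof(slots));
	slots[5] = radswap;

	long index = locate_item(slots, radswap);
	if (index >= 0 && !replace_expected(slots, index, radswap, &page))
		printf("swap entry at index %ld replaced by page\n", index);
	return 0;
}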
@@ -1041,10 +669,10 @@ out:
1041static int shmem_writepage(struct page *page, struct writeback_control *wbc) 669static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1042{ 670{
1043 struct shmem_inode_info *info; 671 struct shmem_inode_info *info;
1044 swp_entry_t *entry, swap;
1045 struct address_space *mapping; 672 struct address_space *mapping;
1046 unsigned long index;
1047 struct inode *inode; 673 struct inode *inode;
674 swp_entry_t swap;
675 pgoff_t index;
1048 676
1049 BUG_ON(!PageLocked(page)); 677 BUG_ON(!PageLocked(page));
1050 mapping = page->mapping; 678 mapping = page->mapping;
@@ -1073,50 +701,32 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1073 701
1074 /* 702 /*
1075 * Add inode to shmem_unuse()'s list of swapped-out inodes, 703 * Add inode to shmem_unuse()'s list of swapped-out inodes,
1076 * if it's not already there. Do it now because we cannot take 704 * if it's not already there. Do it now before the page is
1077 * mutex while holding spinlock, and must do so before the page 705 * moved to swap cache, when its pagelock no longer protects
1078 * is moved to swap cache, when its pagelock no longer protects
1079 * the inode from eviction. But don't unlock the mutex until 706 * the inode from eviction. But don't unlock the mutex until
1080 * we've taken the spinlock, because shmem_unuse_inode() will 707 * we've incremented swapped, because shmem_unuse_inode() will
1081 * prune a !swapped inode from the swaplist under both locks. 708 * prune a !swapped inode from the swaplist under this mutex.
1082 */ 709 */
1083 mutex_lock(&shmem_swaplist_mutex); 710 mutex_lock(&shmem_swaplist_mutex);
1084 if (list_empty(&info->swaplist)) 711 if (list_empty(&info->swaplist))
1085 list_add_tail(&info->swaplist, &shmem_swaplist); 712 list_add_tail(&info->swaplist, &shmem_swaplist);
1086 713
1087 spin_lock(&info->lock);
1088 mutex_unlock(&shmem_swaplist_mutex);
1089
1090 if (index >= info->next_index) {
1091 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
1092 goto unlock;
1093 }
1094 entry = shmem_swp_entry(info, index, NULL);
1095 if (entry->val) {
1096 WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
1097 free_swap_and_cache(*entry);
1098 shmem_swp_set(info, entry, 0);
1099 }
1100 shmem_recalc_inode(inode);
1101
1102 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 714 if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1103 delete_from_page_cache(page);
1104 shmem_swp_set(info, entry, swap.val);
1105 shmem_swp_unmap(entry);
1106 swap_shmem_alloc(swap); 715 swap_shmem_alloc(swap);
716 shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
717
718 spin_lock(&info->lock);
719 info->swapped++;
720 shmem_recalc_inode(inode);
1107 spin_unlock(&info->lock); 721 spin_unlock(&info->lock);
722
723 mutex_unlock(&shmem_swaplist_mutex);
1108 BUG_ON(page_mapped(page)); 724 BUG_ON(page_mapped(page));
1109 swap_writepage(page, wbc); 725 swap_writepage(page, wbc);
1110 return 0; 726 return 0;
1111 } 727 }
1112 728
1113 shmem_swp_unmap(entry); 729 mutex_unlock(&shmem_swaplist_mutex);
1114unlock:
1115 spin_unlock(&info->lock);
1116 /*
1117 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
1118 * clear SWAP_HAS_CACHE flag.
1119 */
1120 swapcache_free(swap, NULL); 730 swapcache_free(swap, NULL);
1121redirty: 731redirty:
1122 set_page_dirty(page); 732 set_page_dirty(page);
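The writeout path above installs the swap entry into the very slot the page is leaving, via shmem_delete_from_page_cache(), and bumps info->swapped before the swaplist mutex is released, so shmem_unuse_inode() cannot see the inode as unswapped while the entry is in flight. A small model of that slot substitution, with an array and a counter standing in for the radix tree and mapping->nrpages; the tagged value is an arbitrary stand-in for swp_to_radix_entry().

/* Model of shmem_delete_from_page_cache(): when a page is written to swap,
 * its page-cache slot is not emptied but overwritten with the tagged swap
 * entry, so a later lookup still finds something at that index.  The array
 * and counter stand in for the radix tree and mapping->nrpages.
 */
#include <assert.h>
#include <stdio.h>

struct mapping_model {
	void *slots[4];
	unsigned long nrpages;
};

static void delete_from_cache(struct mapping_model *m, unsigned index,
			      void *page, void *radswap)
{
	assert(m->slots[index] == page);	/* shmem_radix_tree_replace() check */
	m->slots[index] = radswap;		/* substitute swap for page */
	m->nrpages--;				/* only real pages are counted */
}

int main(void)
{
	struct mapping_model m = { .nrpages = 1 };
	int page;

	m.slots[2] = &page;
	delete_from_cache(&m, 2, &page, (void *)0xe6UL);
	printf("nrpages=%lu slot2=%p\n", m.nrpages, m.slots[2]);
	return 0;
}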
@@ -1153,35 +763,33 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1153} 763}
1154#endif /* CONFIG_TMPFS */ 764#endif /* CONFIG_TMPFS */
1155 765
1156static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, 766static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1157 struct shmem_inode_info *info, unsigned long idx) 767 struct shmem_inode_info *info, pgoff_t index)
1158{ 768{
1159 struct mempolicy mpol, *spol; 769 struct mempolicy mpol, *spol;
1160 struct vm_area_struct pvma; 770 struct vm_area_struct pvma;
1161 struct page *page;
1162 771
1163 spol = mpol_cond_copy(&mpol, 772 spol = mpol_cond_copy(&mpol,
1164 mpol_shared_policy_lookup(&info->policy, idx)); 773 mpol_shared_policy_lookup(&info->policy, index));
1165 774
1166 /* Create a pseudo vma that just contains the policy */ 775 /* Create a pseudo vma that just contains the policy */
1167 pvma.vm_start = 0; 776 pvma.vm_start = 0;
1168 pvma.vm_pgoff = idx; 777 pvma.vm_pgoff = index;
1169 pvma.vm_ops = NULL; 778 pvma.vm_ops = NULL;
1170 pvma.vm_policy = spol; 779 pvma.vm_policy = spol;
1171 page = swapin_readahead(entry, gfp, &pvma, 0); 780 return swapin_readahead(swap, gfp, &pvma, 0);
1172 return page;
1173} 781}
1174 782
1175static struct page *shmem_alloc_page(gfp_t gfp, 783static struct page *shmem_alloc_page(gfp_t gfp,
1176 struct shmem_inode_info *info, unsigned long idx) 784 struct shmem_inode_info *info, pgoff_t index)
1177{ 785{
1178 struct vm_area_struct pvma; 786 struct vm_area_struct pvma;
1179 787
1180 /* Create a pseudo vma that just contains the policy */ 788 /* Create a pseudo vma that just contains the policy */
1181 pvma.vm_start = 0; 789 pvma.vm_start = 0;
1182 pvma.vm_pgoff = idx; 790 pvma.vm_pgoff = index;
1183 pvma.vm_ops = NULL; 791 pvma.vm_ops = NULL;
1184 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); 792 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
1185 793
1186 /* 794 /*
1187 * alloc_page_vma() will drop the shared policy reference 795 * alloc_page_vma() will drop the shared policy reference
@@ -1190,19 +798,19 @@ static struct page *shmem_alloc_page(gfp_t gfp,
1190} 798}
1191#else /* !CONFIG_NUMA */ 799#else /* !CONFIG_NUMA */
1192#ifdef CONFIG_TMPFS 800#ifdef CONFIG_TMPFS
1193static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p) 801static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1194{ 802{
1195} 803}
1196#endif /* CONFIG_TMPFS */ 804#endif /* CONFIG_TMPFS */
1197 805
1198static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, 806static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
1199 struct shmem_inode_info *info, unsigned long idx) 807 struct shmem_inode_info *info, pgoff_t index)
1200{ 808{
1201 return swapin_readahead(entry, gfp, NULL, 0); 809 return swapin_readahead(swap, gfp, NULL, 0);
1202} 810}
1203 811
1204static inline struct page *shmem_alloc_page(gfp_t gfp, 812static inline struct page *shmem_alloc_page(gfp_t gfp,
1205 struct shmem_inode_info *info, unsigned long idx) 813 struct shmem_inode_info *info, pgoff_t index)
1206{ 814{
1207 return alloc_page(gfp); 815 return alloc_page(gfp);
1208} 816}
@@ -1222,243 +830,190 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1222 * vm. If we swap it in we mark it dirty since we also free the swap 830 * vm. If we swap it in we mark it dirty since we also free the swap
1223 * entry since a page cannot live in both the swap and page cache 831 * entry since a page cannot live in both the swap and page cache
1224 */ 832 */
1225static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, 833static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
1226 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) 834 struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
1227{ 835{
1228 struct address_space *mapping = inode->i_mapping; 836 struct address_space *mapping = inode->i_mapping;
1229 struct shmem_inode_info *info = SHMEM_I(inode); 837 struct shmem_inode_info *info;
1230 struct shmem_sb_info *sbinfo; 838 struct shmem_sb_info *sbinfo;
1231 struct page *page; 839 struct page *page;
1232 struct page *prealloc_page = NULL;
1233 swp_entry_t *entry;
1234 swp_entry_t swap; 840 swp_entry_t swap;
1235 int error; 841 int error;
1236 int ret; 842 int once = 0;
1237 843
1238 if (idx >= SHMEM_MAX_INDEX) 844 if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
1239 return -EFBIG; 845 return -EFBIG;
1240repeat: 846repeat:
1241 page = find_lock_page(mapping, idx); 847 swap.val = 0;
1242 if (page) { 848 page = find_lock_page(mapping, index);
849 if (radix_tree_exceptional_entry(page)) {
850 swap = radix_to_swp_entry(page);
851 page = NULL;
852 }
853
854 if (sgp != SGP_WRITE &&
855 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
856 error = -EINVAL;
857 goto failed;
858 }
859
860 if (page || (sgp == SGP_READ && !swap.val)) {
1243 /* 861 /*
1244 * Once we can get the page lock, it must be uptodate: 862 * Once we can get the page lock, it must be uptodate:
1245 * if there were an error in reading back from swap, 863 * if there were an error in reading back from swap,
1246 * the page would not be inserted into the filecache. 864 * the page would not be inserted into the filecache.
1247 */ 865 */
1248 BUG_ON(!PageUptodate(page)); 866 BUG_ON(page && !PageUptodate(page));
1249 goto done; 867 *pagep = page;
868 return 0;
1250 } 869 }
1251 870
1252 /* 871 /*
1253 * Try to preload while we can wait, to not make a habit of 872 * Fast cache lookup did not find it:
1254 * draining atomic reserves; but don't latch on to this cpu. 873 * bring it back from swap or allocate.
1255 */ 874 */
1256 error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); 875 info = SHMEM_I(inode);
1257 if (error) 876 sbinfo = SHMEM_SB(inode->i_sb);
1258 goto out;
1259 radix_tree_preload_end();
1260
1261 if (sgp != SGP_READ && !prealloc_page) {
1262 prealloc_page = shmem_alloc_page(gfp, info, idx);
1263 if (prealloc_page) {
1264 SetPageSwapBacked(prealloc_page);
1265 if (mem_cgroup_cache_charge(prealloc_page,
1266 current->mm, GFP_KERNEL)) {
1267 page_cache_release(prealloc_page);
1268 prealloc_page = NULL;
1269 }
1270 }
1271 }
1272
1273 spin_lock(&info->lock);
1274 shmem_recalc_inode(inode);
1275 entry = shmem_swp_alloc(info, idx, sgp, gfp);
1276 if (IS_ERR(entry)) {
1277 spin_unlock(&info->lock);
1278 error = PTR_ERR(entry);
1279 goto out;
1280 }
1281 swap = *entry;
1282 877
1283 if (swap.val) { 878 if (swap.val) {
1284 /* Look it up and read it in.. */ 879 /* Look it up and read it in.. */
1285 page = lookup_swap_cache(swap); 880 page = lookup_swap_cache(swap);
1286 if (!page) { 881 if (!page) {
1287 shmem_swp_unmap(entry);
1288 spin_unlock(&info->lock);
1289 /* here we actually do the io */ 882 /* here we actually do the io */
1290 if (fault_type) 883 if (fault_type)
1291 *fault_type |= VM_FAULT_MAJOR; 884 *fault_type |= VM_FAULT_MAJOR;
1292 page = shmem_swapin(swap, gfp, info, idx); 885 page = shmem_swapin(swap, gfp, info, index);
1293 if (!page) { 886 if (!page) {
1294 spin_lock(&info->lock); 887 error = -ENOMEM;
1295 entry = shmem_swp_alloc(info, idx, sgp, gfp); 888 goto failed;
1296 if (IS_ERR(entry))
1297 error = PTR_ERR(entry);
1298 else {
1299 if (entry->val == swap.val)
1300 error = -ENOMEM;
1301 shmem_swp_unmap(entry);
1302 }
1303 spin_unlock(&info->lock);
1304 if (error)
1305 goto out;
1306 goto repeat;
1307 } 889 }
1308 wait_on_page_locked(page);
1309 page_cache_release(page);
1310 goto repeat;
1311 } 890 }
1312 891
1313 /* We have to do this with page locked to prevent races */ 892 /* We have to do this with page locked to prevent races */
1314 if (!trylock_page(page)) { 893 lock_page(page);
1315 shmem_swp_unmap(entry);
1316 spin_unlock(&info->lock);
1317 wait_on_page_locked(page);
1318 page_cache_release(page);
1319 goto repeat;
1320 }
1321 if (PageWriteback(page)) {
1322 shmem_swp_unmap(entry);
1323 spin_unlock(&info->lock);
1324 wait_on_page_writeback(page);
1325 unlock_page(page);
1326 page_cache_release(page);
1327 goto repeat;
1328 }
1329 if (!PageUptodate(page)) { 894 if (!PageUptodate(page)) {
1330 shmem_swp_unmap(entry);
1331 spin_unlock(&info->lock);
1332 unlock_page(page);
1333 page_cache_release(page);
1334 error = -EIO; 895 error = -EIO;
1335 goto out; 896 goto failed;
1336 } 897 }
1337 898 wait_on_page_writeback(page);
1338 error = add_to_page_cache_locked(page, mapping, 899
1339 idx, GFP_NOWAIT); 900 /* Someone may have already done it for us */
1340 if (error) { 901 if (page->mapping) {
1341 shmem_swp_unmap(entry); 902 if (page->mapping == mapping &&
1342 spin_unlock(&info->lock); 903 page->index == index)
1343 if (error == -ENOMEM) { 904 goto done;
1344 /* 905 error = -EEXIST;
1345 * reclaim from proper memory cgroup and 906 goto failed;
1346 * call memcg's OOM if needed.
1347 */
1348 error = mem_cgroup_shmem_charge_fallback(
1349 page, current->mm, gfp);
1350 if (error) {
1351 unlock_page(page);
1352 page_cache_release(page);
1353 goto out;
1354 }
1355 }
1356 unlock_page(page);
1357 page_cache_release(page);
1358 goto repeat;
1359 } 907 }
1360 908
1361 info->flags |= SHMEM_PAGEIN; 909 error = mem_cgroup_cache_charge(page, current->mm,
1362 shmem_swp_set(info, entry, 0); 910 gfp & GFP_RECLAIM_MASK);
1363 shmem_swp_unmap(entry); 911 if (!error)
1364 delete_from_swap_cache(page); 912 error = shmem_add_to_page_cache(page, mapping, index,
913 gfp, swp_to_radix_entry(swap));
914 if (error)
915 goto failed;
916
917 spin_lock(&info->lock);
918 info->swapped--;
919 shmem_recalc_inode(inode);
1365 spin_unlock(&info->lock); 920 spin_unlock(&info->lock);
921
922 delete_from_swap_cache(page);
1366 set_page_dirty(page); 923 set_page_dirty(page);
1367 swap_free(swap); 924 swap_free(swap);
1368 925
1369 } else if (sgp == SGP_READ) { 926 } else {
1370 shmem_swp_unmap(entry); 927 if (shmem_acct_block(info->flags)) {
1371 page = find_get_page(mapping, idx); 928 error = -ENOSPC;
1372 if (page && !trylock_page(page)) { 929 goto failed;
1373 spin_unlock(&info->lock);
1374 wait_on_page_locked(page);
1375 page_cache_release(page);
1376 goto repeat;
1377 } 930 }
1378 spin_unlock(&info->lock);
1379
1380 } else if (prealloc_page) {
1381 shmem_swp_unmap(entry);
1382 sbinfo = SHMEM_SB(inode->i_sb);
1383 if (sbinfo->max_blocks) { 931 if (sbinfo->max_blocks) {
1384 if (percpu_counter_compare(&sbinfo->used_blocks, 932 if (percpu_counter_compare(&sbinfo->used_blocks,
1385 sbinfo->max_blocks) >= 0 || 933 sbinfo->max_blocks) >= 0) {
1386 shmem_acct_block(info->flags)) 934 error = -ENOSPC;
1387 goto nospace; 935 goto unacct;
936 }
1388 percpu_counter_inc(&sbinfo->used_blocks); 937 percpu_counter_inc(&sbinfo->used_blocks);
1389 inode->i_blocks += BLOCKS_PER_PAGE;
1390 } else if (shmem_acct_block(info->flags))
1391 goto nospace;
1392
1393 page = prealloc_page;
1394 prealloc_page = NULL;
1395
1396 entry = shmem_swp_alloc(info, idx, sgp, gfp);
1397 if (IS_ERR(entry))
1398 error = PTR_ERR(entry);
1399 else {
1400 swap = *entry;
1401 shmem_swp_unmap(entry);
1402 } 938 }
1403 ret = error || swap.val; 939
1404 if (ret) 940 page = shmem_alloc_page(gfp, info, index);
1405 mem_cgroup_uncharge_cache_page(page); 941 if (!page) {
1406 else 942 error = -ENOMEM;
1407 ret = add_to_page_cache_lru(page, mapping, 943 goto decused;
1408 idx, GFP_NOWAIT);
1409 /*
1410 * At add_to_page_cache_lru() failure,
1411 * uncharge will be done automatically.
1412 */
1413 if (ret) {
1414 shmem_unacct_blocks(info->flags, 1);
1415 shmem_free_blocks(inode, 1);
1416 spin_unlock(&info->lock);
1417 page_cache_release(page);
1418 if (error)
1419 goto out;
1420 goto repeat;
1421 } 944 }
1422 945
1423 info->flags |= SHMEM_PAGEIN; 946 SetPageSwapBacked(page);
947 __set_page_locked(page);
948 error = mem_cgroup_cache_charge(page, current->mm,
949 gfp & GFP_RECLAIM_MASK);
950 if (!error)
951 error = shmem_add_to_page_cache(page, mapping, index,
952 gfp, NULL);
953 if (error)
954 goto decused;
955 lru_cache_add_anon(page);
956
957 spin_lock(&info->lock);
1424 info->alloced++; 958 info->alloced++;
959 inode->i_blocks += BLOCKS_PER_PAGE;
960 shmem_recalc_inode(inode);
1425 spin_unlock(&info->lock); 961 spin_unlock(&info->lock);
962
1426 clear_highpage(page); 963 clear_highpage(page);
1427 flush_dcache_page(page); 964 flush_dcache_page(page);
1428 SetPageUptodate(page); 965 SetPageUptodate(page);
1429 if (sgp == SGP_DIRTY) 966 if (sgp == SGP_DIRTY)
1430 set_page_dirty(page); 967 set_page_dirty(page);
1431
1432 } else {
1433 spin_unlock(&info->lock);
1434 error = -ENOMEM;
1435 goto out;
1436 } 968 }
1437done: 969done:
1438 *pagep = page; 970 /* Perhaps the file has been truncated since we checked */
1439 error = 0; 971 if (sgp != SGP_WRITE &&
1440out: 972 ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
1441 if (prealloc_page) { 973 error = -EINVAL;
1442 mem_cgroup_uncharge_cache_page(prealloc_page); 974 goto trunc;
1443 page_cache_release(prealloc_page);
1444 } 975 }
1445 return error; 976 *pagep = page;
977 return 0;
1446 978
1447nospace:
1448 /* 979 /*
1449 * Perhaps the page was brought in from swap between find_lock_page 980 * Error recovery.
1450 * and taking info->lock? We allow for that at add_to_page_cache_lru,
1451 * but must also avoid reporting a spurious ENOSPC while working on a
1452 * full tmpfs.
1453 */ 981 */
1454 page = find_get_page(mapping, idx); 982trunc:
983 ClearPageDirty(page);
984 delete_from_page_cache(page);
985 spin_lock(&info->lock);
986 info->alloced--;
987 inode->i_blocks -= BLOCKS_PER_PAGE;
1455 spin_unlock(&info->lock); 988 spin_unlock(&info->lock);
989decused:
990 if (sbinfo->max_blocks)
991 percpu_counter_add(&sbinfo->used_blocks, -1);
992unacct:
993 shmem_unacct_blocks(info->flags, 1);
994failed:
995 if (swap.val && error != -EINVAL) {
996 struct page *test = find_get_page(mapping, index);
997 if (test && !radix_tree_exceptional_entry(test))
998 page_cache_release(test);
999 /* Have another try if the entry has changed */
1000 if (test != swp_to_radix_entry(swap))
1001 error = -EEXIST;
1002 }
1456 if (page) { 1003 if (page) {
1004 unlock_page(page);
1457 page_cache_release(page); 1005 page_cache_release(page);
1006 }
1007 if (error == -ENOSPC && !once++) {
1008 info = SHMEM_I(inode);
1009 spin_lock(&info->lock);
1010 shmem_recalc_inode(inode);
1011 spin_unlock(&info->lock);
1458 goto repeat; 1012 goto repeat;
1459 } 1013 }
1460 error = -ENOSPC; 1014 if (error == -EEXIST)
1461 goto out; 1015 goto repeat;
1016 return error;
1462} 1017}
1463 1018
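shmem_getpage_gfp() above now starts from a single find_lock_page(): a real page is returned as-is, an exceptional entry means the page must be brought back from swap and re-inserted over that entry, and an empty slot means allocate, charge and insert, with the trunc/decused/unacct/failed labels unwinding each step on error, retries on -EEXIST, and a single retry on the first -ENOSPC. The old SHMEM_MAX_INDEX cap is gone as well; only MAX_LFS_FILESIZE limits the index. The sketch below models just the three-way dispatch on what the lookup returned, reusing the assumed low-bit tag from the earlier sketch.

/* Model of the new shmem_getpage_gfp() fast path: one lookup, then dispatch
 * on what the slot held: a page, a swap entry, or nothing.  Tag handling
 * follows the earlier exceptional-entry sketch.
 */
#include <stdint.h>
#include <stdio.h>

static int is_swap_entry(void *slot)
{
	return slot && (((uintptr_t)slot & 0x2) != 0);
}

static const char *getpage_path(void *slot)
{
	if (is_swap_entry(slot))
		return "swap-in: read page back, replace entry in cache";
	if (slot)
		return "hit: page already in cache, just lock and return it";
	return "miss: allocate page, charge it, insert into cache";
}

int main(void)
{
	int page;

	printf("%s\n", getpage_path(&page));
	printf("%s\n", getpage_path((void *)0x1c82UL));	/* tagged swap entry */
	printf("%s\n", getpage_path(NULL));
	return 0;
}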
1464static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 1019static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -1467,9 +1022,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1467 int error; 1022 int error;
1468 int ret = VM_FAULT_LOCKED; 1023 int ret = VM_FAULT_LOCKED;
1469 1024
1470 if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
1471 return VM_FAULT_SIGBUS;
1472
1473 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1025 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1474 if (error) 1026 if (error)
1475 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1027 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -1482,20 +1034,20 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1482} 1034}
1483 1035
1484#ifdef CONFIG_NUMA 1036#ifdef CONFIG_NUMA
1485static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) 1037static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
1486{ 1038{
1487 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1039 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1488 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); 1040 return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
1489} 1041}
1490 1042
1491static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, 1043static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1492 unsigned long addr) 1044 unsigned long addr)
1493{ 1045{
1494 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1046 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
1495 unsigned long idx; 1047 pgoff_t index;
1496 1048
1497 idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; 1049 index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
1498 return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx); 1050 return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
1499} 1051}
1500#endif 1052#endif
1501 1053
@@ -1516,6 +1068,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
1516 user_shm_unlock(inode->i_size, user); 1068 user_shm_unlock(inode->i_size, user);
1517 info->flags &= ~VM_LOCKED; 1069 info->flags &= ~VM_LOCKED;
1518 mapping_clear_unevictable(file->f_mapping); 1070 mapping_clear_unevictable(file->f_mapping);
1071 /*
1072 * Ensure that a racing putback_lru_page() can see
1073 * the pages of this mapping are evictable when we
1074 * skip them due to !PageLRU during the scan.
1075 */
1076 smp_mb__after_clear_bit();
1519 scan_mapping_unevictable_pages(file->f_mapping); 1077 scan_mapping_unevictable_pages(file->f_mapping);
1520 } 1078 }
1521 retval = 0; 1079 retval = 0;
@@ -1593,7 +1151,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1593 1151
1594#ifdef CONFIG_TMPFS 1152#ifdef CONFIG_TMPFS
1595static const struct inode_operations shmem_symlink_inode_operations; 1153static const struct inode_operations shmem_symlink_inode_operations;
1596static const struct inode_operations shmem_symlink_inline_operations; 1154static const struct inode_operations shmem_short_symlink_operations;
1597 1155
1598static int 1156static int
1599shmem_write_begin(struct file *file, struct address_space *mapping, 1157shmem_write_begin(struct file *file, struct address_space *mapping,
@@ -1626,7 +1184,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1626{ 1184{
1627 struct inode *inode = filp->f_path.dentry->d_inode; 1185 struct inode *inode = filp->f_path.dentry->d_inode;
1628 struct address_space *mapping = inode->i_mapping; 1186 struct address_space *mapping = inode->i_mapping;
1629 unsigned long index, offset; 1187 pgoff_t index;
1188 unsigned long offset;
1630 enum sgp_type sgp = SGP_READ; 1189 enum sgp_type sgp = SGP_READ;
1631 1190
1632 /* 1191 /*
@@ -1642,7 +1201,8 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1642 1201
1643 for (;;) { 1202 for (;;) {
1644 struct page *page = NULL; 1203 struct page *page = NULL;
1645 unsigned long end_index, nr, ret; 1204 pgoff_t end_index;
1205 unsigned long nr, ret;
1646 loff_t i_size = i_size_read(inode); 1206 loff_t i_size = i_size_read(inode);
1647 1207
1648 end_index = i_size >> PAGE_CACHE_SHIFT; 1208 end_index = i_size >> PAGE_CACHE_SHIFT;
@@ -1880,8 +1440,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1880 buf->f_namelen = NAME_MAX; 1440 buf->f_namelen = NAME_MAX;
1881 if (sbinfo->max_blocks) { 1441 if (sbinfo->max_blocks) {
1882 buf->f_blocks = sbinfo->max_blocks; 1442 buf->f_blocks = sbinfo->max_blocks;
1883 buf->f_bavail = buf->f_bfree = 1443 buf->f_bavail =
1884 sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); 1444 buf->f_bfree = sbinfo->max_blocks -
1445 percpu_counter_sum(&sbinfo->used_blocks);
1885 } 1446 }
1886 if (sbinfo->max_inodes) { 1447 if (sbinfo->max_inodes) {
1887 buf->f_files = sbinfo->max_inodes; 1448 buf->f_files = sbinfo->max_inodes;
@@ -1903,7 +1464,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1903 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 1464 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1904 if (inode) { 1465 if (inode) {
1905 error = security_inode_init_security(inode, dir, 1466 error = security_inode_init_security(inode, dir,
1906 &dentry->d_name, NULL, 1467 &dentry->d_name,
1907 NULL, NULL); 1468 NULL, NULL);
1908 if (error) { 1469 if (error) {
1909 if (error != -EOPNOTSUPP) { 1470 if (error != -EOPNOTSUPP) {
@@ -2043,7 +1604,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2043 if (!inode) 1604 if (!inode)
2044 return -ENOSPC; 1605 return -ENOSPC;
2045 1606
2046 error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, 1607 error = security_inode_init_security(inode, dir, &dentry->d_name,
2047 NULL, NULL); 1608 NULL, NULL);
2048 if (error) { 1609 if (error) {
2049 if (error != -EOPNOTSUPP) { 1610 if (error != -EOPNOTSUPP) {
@@ -2055,10 +1616,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2055 1616
2056 info = SHMEM_I(inode); 1617 info = SHMEM_I(inode);
2057 inode->i_size = len-1; 1618 inode->i_size = len-1;
2058 if (len <= SHMEM_SYMLINK_INLINE_LEN) { 1619 if (len <= SHORT_SYMLINK_LEN) {
2059 /* do it inline */ 1620 info->symlink = kmemdup(symname, len, GFP_KERNEL);
2060 memcpy(info->inline_symlink, symname, len); 1621 if (!info->symlink) {
2061 inode->i_op = &shmem_symlink_inline_operations; 1622 iput(inode);
1623 return -ENOMEM;
1624 }
1625 inode->i_op = &shmem_short_symlink_operations;
2062 } else { 1626 } else {
2063 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); 1627 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
2064 if (error) { 1628 if (error) {
@@ -2081,17 +1645,17 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2081 return 0; 1645 return 0;
2082} 1646}
2083 1647
2084static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) 1648static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
2085{ 1649{
2086 nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink); 1650 nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
2087 return NULL; 1651 return NULL;
2088} 1652}
2089 1653
2090static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd) 1654static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
2091{ 1655{
2092 struct page *page = NULL; 1656 struct page *page = NULL;
2093 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL); 1657 int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
2094 nd_set_link(nd, res ? ERR_PTR(res) : kmap(page)); 1658 nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
2095 if (page) 1659 if (page)
2096 unlock_page(page); 1660 unlock_page(page);
2097 return page; 1661 return page;
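Short symlink targets are now kmemdup()'d into info->symlink and freed in shmem_evict_inode(), replacing the old inline buffer inside the inode, while longer targets still go through the first page of the file as before. A userspace model of the length split follows; SHORT_SYMLINK_LEN is not defined in the hunks shown here, so the 128-byte cutoff is an assumption of the sketch.

/* Model of the short-vs-long symlink split: targets up to a cutoff are
 * duplicated into a small heap buffer (kmemdup in the kernel), longer ones
 * would go through page-backed storage.  SHORT_SYMLINK_LEN's real value is
 * not shown in these hunks, so the cutoff below is an assumption.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define SHORT_SYMLINK_LEN 128	/* assumed cutoff for this sketch */

struct inode_model {
	char *symlink;		/* set only for short symlinks */
};

static int set_symlink(struct inode_model *inode, const char *symname)
{
	size_t len = strlen(symname) + 1;

	if (len <= SHORT_SYMLINK_LEN) {
		inode->symlink = malloc(len);		/* kmemdup() */
		if (!inode->symlink)
			return -1;
		memcpy(inode->symlink, symname, len);
		return 0;
	}
	/* long target: would be written into the first page of the file */
	return 1;
}

int main(void)
{
	struct inode_model inode = { 0 };

	if (set_symlink(&inode, "/tmp/target") == 0)
		printf("short symlink stored: %s\n", inode.symlink);
	free(inode.symlink);				/* kfree() in evict_inode */
	return 0;
}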
@@ -2202,7 +1766,6 @@ out:
2202 return err; 1766 return err;
2203} 1767}
2204 1768
2205
2206static const struct xattr_handler *shmem_xattr_handlers[] = { 1769static const struct xattr_handler *shmem_xattr_handlers[] = {
2207#ifdef CONFIG_TMPFS_POSIX_ACL 1770#ifdef CONFIG_TMPFS_POSIX_ACL
2208 &generic_acl_access_handler, 1771 &generic_acl_access_handler,
@@ -2332,9 +1895,9 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2332} 1895}
2333#endif /* CONFIG_TMPFS_XATTR */ 1896#endif /* CONFIG_TMPFS_XATTR */
2334 1897
2335static const struct inode_operations shmem_symlink_inline_operations = { 1898static const struct inode_operations shmem_short_symlink_operations = {
2336 .readlink = generic_readlink, 1899 .readlink = generic_readlink,
2337 .follow_link = shmem_follow_link_inline, 1900 .follow_link = shmem_follow_short_symlink,
2338#ifdef CONFIG_TMPFS_XATTR 1901#ifdef CONFIG_TMPFS_XATTR
2339 .setxattr = shmem_setxattr, 1902 .setxattr = shmem_setxattr,
2340 .getxattr = shmem_getxattr, 1903 .getxattr = shmem_getxattr,
@@ -2534,8 +2097,7 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2534 if (config.max_inodes < inodes) 2097 if (config.max_inodes < inodes)
2535 goto out; 2098 goto out;
2536 /* 2099 /*
2537 * Those tests also disallow limited->unlimited while any are in 2100 * Those tests disallow limited->unlimited while any are in use;
2538 * use, so i_blocks will always be zero when max_blocks is zero;
2539 * but we must separately disallow unlimited->limited, because 2101 * but we must separately disallow unlimited->limited, because
2540 * in that case we have no record of how much is already in use. 2102 * in that case we have no record of how much is already in use.
2541 */ 2103 */
@@ -2627,7 +2189,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2627 goto failed; 2189 goto failed;
2628 sbinfo->free_inodes = sbinfo->max_inodes; 2190 sbinfo->free_inodes = sbinfo->max_inodes;
2629 2191
2630 sb->s_maxbytes = SHMEM_MAX_BYTES; 2192 sb->s_maxbytes = MAX_LFS_FILESIZE;
2631 sb->s_blocksize = PAGE_CACHE_SIZE; 2193 sb->s_blocksize = PAGE_CACHE_SIZE;
2632 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 2194 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
2633 sb->s_magic = TMPFS_MAGIC; 2195 sb->s_magic = TMPFS_MAGIC;
@@ -2662,14 +2224,14 @@ static struct kmem_cache *shmem_inode_cachep;
2662 2224
2663static struct inode *shmem_alloc_inode(struct super_block *sb) 2225static struct inode *shmem_alloc_inode(struct super_block *sb)
2664{ 2226{
2665 struct shmem_inode_info *p; 2227 struct shmem_inode_info *info;
2666 p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); 2228 info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
2667 if (!p) 2229 if (!info)
2668 return NULL; 2230 return NULL;
2669 return &p->vfs_inode; 2231 return &info->vfs_inode;
2670} 2232}
2671 2233
2672static void shmem_i_callback(struct rcu_head *head) 2234static void shmem_destroy_callback(struct rcu_head *head)
2673{ 2235{
2674 struct inode *inode = container_of(head, struct inode, i_rcu); 2236 struct inode *inode = container_of(head, struct inode, i_rcu);
2675 INIT_LIST_HEAD(&inode->i_dentry); 2237 INIT_LIST_HEAD(&inode->i_dentry);
@@ -2678,29 +2240,26 @@ static void shmem_i_callback(struct rcu_head *head)
2678 2240
2679static void shmem_destroy_inode(struct inode *inode) 2241static void shmem_destroy_inode(struct inode *inode)
2680{ 2242{
2681 if ((inode->i_mode & S_IFMT) == S_IFREG) { 2243 if ((inode->i_mode & S_IFMT) == S_IFREG)
2682 /* only struct inode is valid if it's an inline symlink */
2683 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2244 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2684 } 2245 call_rcu(&inode->i_rcu, shmem_destroy_callback);
2685 call_rcu(&inode->i_rcu, shmem_i_callback);
2686} 2246}
2687 2247
2688static void init_once(void *foo) 2248static void shmem_init_inode(void *foo)
2689{ 2249{
2690 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2250 struct shmem_inode_info *info = foo;
2691 2251 inode_init_once(&info->vfs_inode);
2692 inode_init_once(&p->vfs_inode);
2693} 2252}
2694 2253
2695static int init_inodecache(void) 2254static int shmem_init_inodecache(void)
2696{ 2255{
2697 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 2256 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2698 sizeof(struct shmem_inode_info), 2257 sizeof(struct shmem_inode_info),
2699 0, SLAB_PANIC, init_once); 2258 0, SLAB_PANIC, shmem_init_inode);
2700 return 0; 2259 return 0;
2701} 2260}
2702 2261
2703static void destroy_inodecache(void) 2262static void shmem_destroy_inodecache(void)
2704{ 2263{
2705 kmem_cache_destroy(shmem_inode_cachep); 2264 kmem_cache_destroy(shmem_inode_cachep);
2706} 2265}
@@ -2797,21 +2356,20 @@ static const struct vm_operations_struct shmem_vm_ops = {
2797#endif 2356#endif
2798}; 2357};
2799 2358
2800
2801static struct dentry *shmem_mount(struct file_system_type *fs_type, 2359static struct dentry *shmem_mount(struct file_system_type *fs_type,
2802 int flags, const char *dev_name, void *data) 2360 int flags, const char *dev_name, void *data)
2803{ 2361{
2804 return mount_nodev(fs_type, flags, data, shmem_fill_super); 2362 return mount_nodev(fs_type, flags, data, shmem_fill_super);
2805} 2363}
2806 2364
2807static struct file_system_type tmpfs_fs_type = { 2365static struct file_system_type shmem_fs_type = {
2808 .owner = THIS_MODULE, 2366 .owner = THIS_MODULE,
2809 .name = "tmpfs", 2367 .name = "tmpfs",
2810 .mount = shmem_mount, 2368 .mount = shmem_mount,
2811 .kill_sb = kill_litter_super, 2369 .kill_sb = kill_litter_super,
2812}; 2370};
2813 2371
2814int __init init_tmpfs(void) 2372int __init shmem_init(void)
2815{ 2373{
2816 int error; 2374 int error;
2817 2375
@@ -2819,18 +2377,18 @@ int __init init_tmpfs(void)
2819 if (error) 2377 if (error)
2820 goto out4; 2378 goto out4;
2821 2379
2822 error = init_inodecache(); 2380 error = shmem_init_inodecache();
2823 if (error) 2381 if (error)
2824 goto out3; 2382 goto out3;
2825 2383
2826 error = register_filesystem(&tmpfs_fs_type); 2384 error = register_filesystem(&shmem_fs_type);
2827 if (error) { 2385 if (error) {
2828 printk(KERN_ERR "Could not register tmpfs\n"); 2386 printk(KERN_ERR "Could not register tmpfs\n");
2829 goto out2; 2387 goto out2;
2830 } 2388 }
2831 2389
2832 shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, 2390 shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER,
2833 tmpfs_fs_type.name, NULL); 2391 shmem_fs_type.name, NULL);
2834 if (IS_ERR(shm_mnt)) { 2392 if (IS_ERR(shm_mnt)) {
2835 error = PTR_ERR(shm_mnt); 2393 error = PTR_ERR(shm_mnt);
2836 printk(KERN_ERR "Could not kern_mount tmpfs\n"); 2394 printk(KERN_ERR "Could not kern_mount tmpfs\n");
@@ -2839,9 +2397,9 @@ int __init init_tmpfs(void)
2839 return 0; 2397 return 0;
2840 2398
2841out1: 2399out1:
2842 unregister_filesystem(&tmpfs_fs_type); 2400 unregister_filesystem(&shmem_fs_type);
2843out2: 2401out2:
2844 destroy_inodecache(); 2402 shmem_destroy_inodecache();
2845out3: 2403out3:
2846 bdi_destroy(&shmem_backing_dev_info); 2404 bdi_destroy(&shmem_backing_dev_info);
2847out4: 2405out4:
@@ -2849,45 +2407,6 @@ out4:
2849 return error; 2407 return error;
2850} 2408}
2851 2409
2852#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2853/**
2854 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2855 * @inode: the inode to be searched
2856 * @pgoff: the offset to be searched
2857 * @pagep: the pointer for the found page to be stored
2858 * @ent: the pointer for the found swap entry to be stored
2859 *
2860 * If a page is found, refcount of it is incremented. Callers should handle
2861 * these refcount.
2862 */
2863void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2864 struct page **pagep, swp_entry_t *ent)
2865{
2866 swp_entry_t entry = { .val = 0 }, *ptr;
2867 struct page *page = NULL;
2868 struct shmem_inode_info *info = SHMEM_I(inode);
2869
2870 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2871 goto out;
2872
2873 spin_lock(&info->lock);
2874 ptr = shmem_swp_entry(info, pgoff, NULL);
2875#ifdef CONFIG_SWAP
2876 if (ptr && ptr->val) {
2877 entry.val = ptr->val;
2878 page = find_get_page(&swapper_space, entry.val);
2879 } else
2880#endif
2881 page = find_get_page(inode->i_mapping, pgoff);
2882 if (ptr)
2883 shmem_swp_unmap(ptr);
2884 spin_unlock(&info->lock);
2885out:
2886 *pagep = page;
2887 *ent = entry;
2888}
2889#endif
2890
2891#else /* !CONFIG_SHMEM */ 2410#else /* !CONFIG_SHMEM */
2892 2411
2893/* 2412/*
@@ -2901,23 +2420,23 @@ out:
2901 2420
2902#include <linux/ramfs.h> 2421#include <linux/ramfs.h>
2903 2422
2904static struct file_system_type tmpfs_fs_type = { 2423static struct file_system_type shmem_fs_type = {
2905 .name = "tmpfs", 2424 .name = "tmpfs",
2906 .mount = ramfs_mount, 2425 .mount = ramfs_mount,
2907 .kill_sb = kill_litter_super, 2426 .kill_sb = kill_litter_super,
2908}; 2427};
2909 2428
2910int __init init_tmpfs(void) 2429int __init shmem_init(void)
2911{ 2430{
2912 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); 2431 BUG_ON(register_filesystem(&shmem_fs_type) != 0);
2913 2432
2914 shm_mnt = kern_mount(&tmpfs_fs_type); 2433 shm_mnt = kern_mount(&shmem_fs_type);
2915 BUG_ON(IS_ERR(shm_mnt)); 2434 BUG_ON(IS_ERR(shm_mnt));
2916 2435
2917 return 0; 2436 return 0;
2918} 2437}
2919 2438
2920int shmem_unuse(swp_entry_t entry, struct page *page) 2439int shmem_unuse(swp_entry_t swap, struct page *page)
2921{ 2440{
2922 return 0; 2441 return 0;
2923} 2442}
@@ -2927,43 +2446,17 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
2927 return 0; 2446 return 0;
2928} 2447}
2929 2448
2930void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 2449void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
2931{ 2450{
2932 truncate_inode_pages_range(inode->i_mapping, start, end); 2451 truncate_inode_pages_range(inode->i_mapping, lstart, lend);
2933} 2452}
2934EXPORT_SYMBOL_GPL(shmem_truncate_range); 2453EXPORT_SYMBOL_GPL(shmem_truncate_range);
2935 2454
2936#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2937/**
2938 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
2939 * @inode: the inode to be searched
2940 * @pgoff: the offset to be searched
2941 * @pagep: the pointer for the found page to be stored
2942 * @ent: the pointer for the found swap entry to be stored
2943 *
2944 * If a page is found, refcount of it is incremented. Callers should handle
2945 * these refcount.
2946 */
2947void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
2948 struct page **pagep, swp_entry_t *ent)
2949{
2950 struct page *page = NULL;
2951
2952 if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
2953 goto out;
2954 page = find_get_page(inode->i_mapping, pgoff);
2955out:
2956 *pagep = page;
2957 *ent = (swp_entry_t){ .val = 0 };
2958}
2959#endif
2960
2961#define shmem_vm_ops generic_file_vm_ops 2455#define shmem_vm_ops generic_file_vm_ops
2962#define shmem_file_operations ramfs_file_operations 2456#define shmem_file_operations ramfs_file_operations
2963#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) 2457#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev)
2964#define shmem_acct_size(flags, size) 0 2458#define shmem_acct_size(flags, size) 0
2965#define shmem_unacct_size(flags, size) do {} while (0) 2459#define shmem_unacct_size(flags, size) do {} while (0)
2966#define SHMEM_MAX_BYTES MAX_LFS_FILESIZE
2967 2460
2968#endif /* CONFIG_SHMEM */ 2461#endif /* CONFIG_SHMEM */
2969 2462
@@ -2987,7 +2480,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
2987 if (IS_ERR(shm_mnt)) 2480 if (IS_ERR(shm_mnt))
2988 return (void *)shm_mnt; 2481 return (void *)shm_mnt;
2989 2482
2990 if (size < 0 || size > SHMEM_MAX_BYTES) 2483 if (size < 0 || size > MAX_LFS_FILESIZE)
2991 return ERR_PTR(-EINVAL); 2484 return ERR_PTR(-EINVAL);
2992 2485
2993 if (shmem_acct_size(flags, size)) 2486 if (shmem_acct_size(flags, size))
@@ -3010,7 +2503,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
3010 2503
3011 d_instantiate(path.dentry, inode); 2504 d_instantiate(path.dentry, inode);
3012 inode->i_size = size; 2505 inode->i_size = size;
3013 inode->i_nlink = 0; /* It is unlinked */ 2506 clear_nlink(inode); /* It is unlinked */
3014#ifndef CONFIG_MMU 2507#ifndef CONFIG_MMU
3015 error = ramfs_nommu_expand_for_mapping(inode, size); 2508 error = ramfs_nommu_expand_for_mapping(inode, size);
3016 if (error) 2509 if (error)
diff --git a/mm/slab.c b/mm/slab.c
index 1e523ed47c61..708efe886154 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -622,6 +622,51 @@ int slab_is_available(void)
622static struct lock_class_key on_slab_l3_key; 622static struct lock_class_key on_slab_l3_key;
623static struct lock_class_key on_slab_alc_key; 623static struct lock_class_key on_slab_alc_key;
624 624
625static struct lock_class_key debugobj_l3_key;
626static struct lock_class_key debugobj_alc_key;
627
628static void slab_set_lock_classes(struct kmem_cache *cachep,
629 struct lock_class_key *l3_key, struct lock_class_key *alc_key,
630 int q)
631{
632 struct array_cache **alc;
633 struct kmem_list3 *l3;
634 int r;
635
636 l3 = cachep->nodelists[q];
637 if (!l3)
638 return;
639
640 lockdep_set_class(&l3->list_lock, l3_key);
641 alc = l3->alien;
642 /*
643 * FIXME: This check for BAD_ALIEN_MAGIC
644 * should go away when common slab code is taught to
645 * work even without alien caches.
646 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
647 * for alloc_alien_cache.
648 */
649 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
650 return;
651 for_each_node(r) {
652 if (alc[r])
653 lockdep_set_class(&alc[r]->lock, alc_key);
654 }
655}
656
657static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
658{
659 slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node);
660}
661
662static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
663{
664 int node;
665
666 for_each_online_node(node)
667 slab_set_debugobj_lock_classes_node(cachep, node);
668}
669
625static void init_node_lock_keys(int q) 670static void init_node_lock_keys(int q)
626{ 671{
627 struct cache_sizes *s = malloc_sizes; 672 struct cache_sizes *s = malloc_sizes;
@@ -630,29 +675,14 @@ static void init_node_lock_keys(int q)
630 return; 675 return;
631 676
632 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { 677 for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
633 struct array_cache **alc;
634 struct kmem_list3 *l3; 678 struct kmem_list3 *l3;
635 int r;
636 679
637 l3 = s->cs_cachep->nodelists[q]; 680 l3 = s->cs_cachep->nodelists[q];
638 if (!l3 || OFF_SLAB(s->cs_cachep)) 681 if (!l3 || OFF_SLAB(s->cs_cachep))
639 continue; 682 continue;
640 lockdep_set_class(&l3->list_lock, &on_slab_l3_key); 683
641 alc = l3->alien; 684 slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key,
642 /* 685 &on_slab_alc_key, q);
643 * FIXME: This check for BAD_ALIEN_MAGIC
644 * should go away when common slab code is taught to
645 * work even without alien caches.
646 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
647 * for alloc_alien_cache,
648 */
649 if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
650 continue;
651 for_each_node(r) {
652 if (alc[r])
653 lockdep_set_class(&alc[r]->lock,
654 &on_slab_alc_key);
655 }
656 } 686 }
657} 687}
658 688
@@ -671,6 +701,14 @@ static void init_node_lock_keys(int q)
671static inline void init_lock_keys(void) 701static inline void init_lock_keys(void)
672{ 702{
673} 703}
704
705static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node)
706{
707}
708
709static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep)
710{
711}
674#endif 712#endif
675 713
676/* 714/*
@@ -1264,6 +1302,8 @@ static int __cpuinit cpuup_prepare(long cpu)
1264 spin_unlock_irq(&l3->list_lock); 1302 spin_unlock_irq(&l3->list_lock);
1265 kfree(shared); 1303 kfree(shared);
1266 free_alien_cache(alien); 1304 free_alien_cache(alien);
1305 if (cachep->flags & SLAB_DEBUG_OBJECTS)
1306 slab_set_debugobj_lock_classes_node(cachep, node);
1267 } 1307 }
1268 init_node_lock_keys(node); 1308 init_node_lock_keys(node);
1269 1309
@@ -1626,6 +1666,9 @@ void __init kmem_cache_init_late(void)
1626{ 1666{
1627 struct kmem_cache *cachep; 1667 struct kmem_cache *cachep;
1628 1668
1669 /* Annotate slab for lockdep -- annotate the malloc caches */
1670 init_lock_keys();
1671
1629 /* 6) resize the head arrays to their final sizes */ 1672 /* 6) resize the head arrays to their final sizes */
1630 mutex_lock(&cache_chain_mutex); 1673 mutex_lock(&cache_chain_mutex);
1631 list_for_each_entry(cachep, &cache_chain, next) 1674 list_for_each_entry(cachep, &cache_chain, next)
@@ -1636,9 +1679,6 @@ void __init kmem_cache_init_late(void)
1636 /* Done! */ 1679 /* Done! */
1637 g_cpucache_up = FULL; 1680 g_cpucache_up = FULL;
1638 1681
1639 /* Annotate slab for lockdep -- annotate the malloc caches */
1640 init_lock_keys();
1641
1642 /* 1682 /*
1643 * Register a cpu startup notifier callback that initializes 1683 * Register a cpu startup notifier callback that initializes
1644 * cpu_cache_get for all new cpus 1684 * cpu_cache_get for all new cpus
@@ -1811,15 +1851,15 @@ static void dump_line(char *data, int offset, int limit)
1811 unsigned char error = 0; 1851 unsigned char error = 0;
1812 int bad_count = 0; 1852 int bad_count = 0;
1813 1853
1814 printk(KERN_ERR "%03x:", offset); 1854 printk(KERN_ERR "%03x: ", offset);
1815 for (i = 0; i < limit; i++) { 1855 for (i = 0; i < limit; i++) {
1816 if (data[offset + i] != POISON_FREE) { 1856 if (data[offset + i] != POISON_FREE) {
1817 error = data[offset + i]; 1857 error = data[offset + i];
1818 bad_count++; 1858 bad_count++;
1819 } 1859 }
1820 printk(" %02x", (unsigned char)data[offset + i]);
1821 } 1860 }
1822 printk("\n"); 1861 print_hex_dump(KERN_CONT, "", 0, 16, 1,
1862 &data[offset], limit, 1);
1823 1863
1824 if (bad_count == 1) { 1864 if (bad_count == 1) {
1825 error ^= POISON_FREE; 1865 error ^= POISON_FREE;
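
dump_line() above now hands the bytes to print_hex_dump() instead of printing them one at a time. As a rough userspace stand-in for the output shape only (the kernel helper additionally takes a log level, prefix string, group size and an optional ASCII column), a 16-bytes-per-row dump can be sketched as follows; hex_dump() and the sample buffer are made up for the example.

#include <stdio.h>
#include <stddef.h>

/* Print 16 bytes per row as "offset: xx xx ...". */
static void hex_dump(const void *buf, size_t len)
{
	const unsigned char *p = buf;

	for (size_t off = 0; off < len; off += 16) {
		printf("%03zx:", off);
		for (size_t i = off; i < off + 16 && i < len; i++)
			printf(" %02x", p[i]);
		printf("\n");
	}
}

int main(void)
{
	unsigned char obj[32];

	for (size_t i = 0; i < sizeof(obj); i++)
		obj[i] = 0x6b;		/* POISON_FREE-style fill */
	obj[5] = 0x42;			/* one corrupted byte */
	hex_dump(obj, sizeof(obj));
	return 0;
}
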
@@ -2426,6 +2466,16 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2426 goto oops; 2466 goto oops;
2427 } 2467 }
2428 2468
2469 if (flags & SLAB_DEBUG_OBJECTS) {
2470 /*
2471 * Would deadlock through slab_destroy()->call_rcu()->
2472 * debug_object_activate()->kmem_cache_alloc().
2473 */
2474 WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU);
2475
2476 slab_set_debugobj_lock_classes(cachep);
2477 }
2478
2429 /* cache setup completed, link it into the list */ 2479 /* cache setup completed, link it into the list */
2430 list_add(&cachep->next, &cache_chain); 2480 list_add(&cachep->next, &cache_chain);
2431oops: 2481oops:
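
The WARN_ON_ONCE() added above documents why SLAB_DESTROY_BY_RCU plus SLAB_DEBUG_OBJECTS is refused: freeing would re-enter the allocator through call_rcu() -> debug_object_activate() -> kmem_cache_alloc(). A loose userspace caricature of that kind of re-entrancy follows, with invented names and a trylock standing in for what would be a hard self-deadlock in the kernel.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

static void *cache_alloc(void);

/* A debug hook that itself wants memory for a tracking record. */
static void debug_hook(void)
{
	void *record = cache_alloc();
	free(record);
}

static void *cache_alloc(void)
{
	if (pthread_mutex_trylock(&cache_lock) != 0) {
		/* Re-entered while the lock is held: a deadlock in disguise. */
		fprintf(stderr, "would deadlock: allocator re-entered\n");
		return NULL;
	}
	void *obj = malloc(32);
	pthread_mutex_unlock(&cache_lock);
	return obj;
}

static void cache_free(void *obj)
{
	pthread_mutex_lock(&cache_lock);
	debug_hook();		/* fires while cache_lock is still held */
	free(obj);
	pthread_mutex_unlock(&cache_lock);
}

int main(void)
{
	cache_free(cache_alloc());
	return 0;
}
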
@@ -2989,14 +3039,9 @@ bad:
2989 printk(KERN_ERR "slab: Internal list corruption detected in " 3039 printk(KERN_ERR "slab: Internal list corruption detected in "
2990 "cache '%s'(%d), slabp %p(%d). Hexdump:\n", 3040 "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2991 cachep->name, cachep->num, slabp, slabp->inuse); 3041 cachep->name, cachep->num, slabp, slabp->inuse);
2992 for (i = 0; 3042 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
2993 i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); 3043 sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
2994 i++) { 3044 1);
2995 if (i % 16 == 0)
2996 printk("\n%03x:", i);
2997 printk(" %02x", ((unsigned char *)slabp)[i]);
2998 }
2999 printk("\n");
3000 BUG(); 3045 BUG();
3001 } 3046 }
3002} 3047}
@@ -3403,7 +3448,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
3403 cache_alloc_debugcheck_before(cachep, flags); 3448 cache_alloc_debugcheck_before(cachep, flags);
3404 local_irq_save(save_flags); 3449 local_irq_save(save_flags);
3405 3450
3406 if (nodeid == -1) 3451 if (nodeid == NUMA_NO_NODE)
3407 nodeid = slab_node; 3452 nodeid = slab_node;
3408 3453
3409 if (unlikely(!cachep->nodelists[nodeid])) { 3454 if (unlikely(!cachep->nodelists[nodeid])) {
@@ -3934,7 +3979,7 @@ fail:
3934 3979
3935struct ccupdate_struct { 3980struct ccupdate_struct {
3936 struct kmem_cache *cachep; 3981 struct kmem_cache *cachep;
3937 struct array_cache *new[NR_CPUS]; 3982 struct array_cache *new[0];
3938}; 3983};
3939 3984
3940static void do_ccupdate_local(void *info) 3985static void do_ccupdate_local(void *info)
@@ -3956,7 +4001,8 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3956 struct ccupdate_struct *new; 4001 struct ccupdate_struct *new;
3957 int i; 4002 int i;
3958 4003
3959 new = kzalloc(sizeof(*new), gfp); 4004 new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
4005 gfp);
3960 if (!new) 4006 if (!new)
3961 return -ENOMEM; 4007 return -ENOMEM;
3962 4008
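
The ccupdate_struct hunks above shrink the fixed new[NR_CPUS] array to a trailing array whose size is chosen from nr_cpu_ids at allocation time. A minimal userspace sketch of the same pattern, using a C99 flexible array member where the kernel code of this era spells it [0]; the names here are hypothetical.

#include <stdio.h>
#include <stdlib.h>

struct percpu_update {
	const char *name;
	void *slot[];		/* flexible array member, sized at alloc time */
};

static struct percpu_update *percpu_update_alloc(const char *name, int nr_cpus)
{
	struct percpu_update *u =
		calloc(1, sizeof(*u) + nr_cpus * sizeof(u->slot[0]));

	if (u)
		u->name = name;
	return u;
}

int main(void)
{
	int nr_cpus = 8;	/* stand-in for nr_cpu_ids */
	struct percpu_update *u = percpu_update_alloc("demo", nr_cpus);

	if (!u)
		return 1;
	printf("%s: %d slots, allocation is %zu bytes\n", u->name, nr_cpus,
	       sizeof(*u) + nr_cpus * sizeof(u->slot[0]));
	free(u);
	return 0;
}

Only the memory needed for the running configuration is allocated, which is the point of the kernel change when NR_CPUS is large but nr_cpu_ids is small.
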
@@ -4533,7 +4579,7 @@ static const struct file_operations proc_slabstats_operations = {
4533 4579
4534static int __init slab_proc_init(void) 4580static int __init slab_proc_init(void)
4535{ 4581{
4536 proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); 4582 proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations);
4537#ifdef CONFIG_DEBUG_SLAB_LEAK 4583#ifdef CONFIG_DEBUG_SLAB_LEAK
4538 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); 4584 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
4539#endif 4585#endif
diff --git a/mm/slob.c b/mm/slob.c
index bf3918187165..8105be42cad1 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -63,7 +63,7 @@
63#include <linux/swap.h> /* struct reclaim_state */ 63#include <linux/swap.h> /* struct reclaim_state */
64#include <linux/cache.h> 64#include <linux/cache.h>
65#include <linux/init.h> 65#include <linux/init.h>
66#include <linux/module.h> 66#include <linux/export.h>
67#include <linux/rcupdate.h> 67#include <linux/rcupdate.h>
68#include <linux/list.h> 68#include <linux/list.h>
69#include <linux/kmemleak.h> 69#include <linux/kmemleak.h>
diff --git a/mm/slub.c b/mm/slub.c
index f8f5e8efeb88..7d2a996c307e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2,10 +2,11 @@
2 * SLUB: A slab allocator that limits cache line use instead of queuing 2 * SLUB: A slab allocator that limits cache line use instead of queuing
3 * objects in per cpu and per node lists. 3 * objects in per cpu and per node lists.
4 * 4 *
5 * The allocator synchronizes using per slab locks and only 5 * The allocator synchronizes using per slab locks or atomic operations
6 * uses a centralized lock to manage a pool of partial slabs. 6 * and only uses a centralized lock to manage a pool of partial slabs.
7 * 7 *
8 * (C) 2007 SGI, Christoph Lameter 8 * (C) 2007 SGI, Christoph Lameter
9 * (C) 2011 Linux Foundation, Christoph Lameter
9 */ 10 */
10 11
11#include <linux/mm.h> 12#include <linux/mm.h>
@@ -33,15 +34,27 @@
33 34
34/* 35/*
35 * Lock order: 36 * Lock order:
36 * 1. slab_lock(page) 37 * 1. slub_lock (Global Semaphore)
37 * 2. slab->list_lock 38 * 2. node->list_lock
39 * 3. slab_lock(page) (Only on some arches and for debugging)
38 * 40 *
39 * The slab_lock protects operations on the object of a particular 41 * slub_lock
40 * slab and its metadata in the page struct. If the slab lock 42 *
41 * has been taken then no allocations nor frees can be performed 43 * The role of the slub_lock is to protect the list of all the slabs
42 * on the objects in the slab nor can the slab be added or removed 44 * and to synchronize major metadata changes to slab cache structures.
43 * from the partial or full lists since this would mean modifying 45 *
44 * the page_struct of the slab. 46 * The slab_lock is only used for debugging and on arches that do not
47 * have the ability to do a cmpxchg_double. It only protects the second
48 * double word in the page struct. Meaning
49 * A. page->freelist -> List of free objects in a page
50 * B. page->counters -> Counters of objects
51 * C. page->frozen -> frozen state
52 *
53 * If a slab is frozen then it is exempt from list management. It is not
54 * on any list. The processor that froze the slab is the one who can
55 * perform list operations on the page. Other processors may put objects
56 * onto the freelist but the processor that froze the slab is the only
57 * one that can retrieve the objects from the page's freelist.
45 * 58 *
46 * The list_lock protects the partial and full list on each node and 59 * The list_lock protects the partial and full list on each node and
47 * the partial slab counter. If taken then no new slabs may be added or 60 * the partial slab counter. If taken then no new slabs may be added or
@@ -54,20 +67,6 @@
54 * slabs, operations can continue without any centralized lock. F.e. 67 * slabs, operations can continue without any centralized lock. F.e.
55 * allocating a long series of objects that fill up slabs does not require 68 * allocating a long series of objects that fill up slabs does not require
56 * the list lock. 69 * the list lock.
57 *
58 * The lock order is sometimes inverted when we are trying to get a slab
59 * off a list. We take the list_lock and then look for a page on the list
60 * to use. While we do that objects in the slabs may be freed. We can
61 * only operate on the slab if we have also taken the slab_lock. So we use
62 * a slab_trylock() on the slab. If trylock was successful then no frees
63 * can occur anymore and we can use the slab for allocations etc. If the
64 * slab_trylock() does not succeed then frees are in progress in the slab and
65 * we must stay away from it for a while since we may cause a bouncing
66 * cacheline if we try to acquire the lock. So go onto the next slab.
67 * If all pages are busy then we may allocate a new slab instead of reusing
68 * a partial slab. A new slab has no one operating on it and thus there is
69 * no danger of cacheline contention.
70 *
71 * Interrupts are disabled during allocation and deallocation in order to 70 * Interrupts are disabled during allocation and deallocation in order to
72 * make the slab allocator safe to use in the context of an irq. In addition 71 * make the slab allocator safe to use in the context of an irq. In addition
73 * interrupts are disabled to ensure that the processor does not change 72 * interrupts are disabled to ensure that the processor does not change
@@ -132,6 +131,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
132/* Enable to test recovery from slab corruption on boot */ 131/* Enable to test recovery from slab corruption on boot */
133#undef SLUB_RESILIENCY_TEST 132#undef SLUB_RESILIENCY_TEST
134 133
134/* Enable to log cmpxchg failures */
135#undef SLUB_DEBUG_CMPXCHG
136
135/* 137/*
136 * Mininum number of partial slabs. These will be left on the partial 138 * Mininum number of partial slabs. These will be left on the partial
137 * lists even if they are empty. kmem_cache_shrink may reclaim them. 139 * lists even if they are empty. kmem_cache_shrink may reclaim them.
@@ -167,10 +169,11 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
167 169
168#define OO_SHIFT 16 170#define OO_SHIFT 16
169#define OO_MASK ((1 << OO_SHIFT) - 1) 171#define OO_MASK ((1 << OO_SHIFT) - 1)
170#define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ 172#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
171 173
172/* Internal SLUB flags */ 174/* Internal SLUB flags */
173#define __OBJECT_POISON 0x80000000UL /* Poison object */ 175#define __OBJECT_POISON 0x80000000UL /* Poison object */
176#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
174 177
175static int kmem_size = sizeof(struct kmem_cache); 178static int kmem_size = sizeof(struct kmem_cache);
176 179
@@ -343,11 +346,99 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
343 return x.x & OO_MASK; 346 return x.x & OO_MASK;
344} 347}
345 348
349/*
350 * Per slab locking using the pagelock
351 */
352static __always_inline void slab_lock(struct page *page)
353{
354 bit_spin_lock(PG_locked, &page->flags);
355}
356
357static __always_inline void slab_unlock(struct page *page)
358{
359 __bit_spin_unlock(PG_locked, &page->flags);
360}
361
362/* Interrupts must be disabled (for the fallback code to work right) */
363static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
364 void *freelist_old, unsigned long counters_old,
365 void *freelist_new, unsigned long counters_new,
366 const char *n)
367{
368 VM_BUG_ON(!irqs_disabled());
369#ifdef CONFIG_CMPXCHG_DOUBLE
370 if (s->flags & __CMPXCHG_DOUBLE) {
371 if (cmpxchg_double(&page->freelist,
372 freelist_old, counters_old,
373 freelist_new, counters_new))
374 return 1;
375 } else
376#endif
377 {
378 slab_lock(page);
379 if (page->freelist == freelist_old && page->counters == counters_old) {
380 page->freelist = freelist_new;
381 page->counters = counters_new;
382 slab_unlock(page);
383 return 1;
384 }
385 slab_unlock(page);
386 }
387
388 cpu_relax();
389 stat(s, CMPXCHG_DOUBLE_FAIL);
390
391#ifdef SLUB_DEBUG_CMPXCHG
392 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
393#endif
394
395 return 0;
396}
397
398static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
399 void *freelist_old, unsigned long counters_old,
400 void *freelist_new, unsigned long counters_new,
401 const char *n)
402{
403#ifdef CONFIG_CMPXCHG_DOUBLE
404 if (s->flags & __CMPXCHG_DOUBLE) {
405 if (cmpxchg_double(&page->freelist,
406 freelist_old, counters_old,
407 freelist_new, counters_new))
408 return 1;
409 } else
410#endif
411 {
412 unsigned long flags;
413
414 local_irq_save(flags);
415 slab_lock(page);
416 if (page->freelist == freelist_old && page->counters == counters_old) {
417 page->freelist = freelist_new;
418 page->counters = counters_new;
419 slab_unlock(page);
420 local_irq_restore(flags);
421 return 1;
422 }
423 slab_unlock(page);
424 local_irq_restore(flags);
425 }
426
427 cpu_relax();
428 stat(s, CMPXCHG_DOUBLE_FAIL);
429
430#ifdef SLUB_DEBUG_CMPXCHG
431 printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
432#endif
433
434 return 0;
435}
436
346#ifdef CONFIG_SLUB_DEBUG 437#ifdef CONFIG_SLUB_DEBUG
347/* 438/*
348 * Determine a map of object in use on a page. 439 * Determine a map of object in use on a page.
349 * 440 *
350 * Slab lock or node listlock must be held to guarantee that the page does 441 * Node listlock must be held to guarantee that the page does
351 * not vanish from under us. 442 * not vanish from under us.
352 */ 443 */
353static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) 444static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
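
The __cmpxchg_double_slab()/cmpxchg_double_slab() helpers added above treat page->freelist and page->counters as one unit: a cmpxchg_double fast path when the cache carries __CMPXCHG_DOUBLE, otherwise compare both words and swap them under the page bit lock. Below is a userspace model of just that locked fallback, with a pthread mutex standing in for slab_lock() and invented type names; the real fast path and the stat/debug reporting have no counterpart here.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct slab_state {
	pthread_mutex_t lock;
	void *freelist;
	unsigned long counters;
};

static bool cmpxchg_double_locked(struct slab_state *s,
				  void *freelist_old, unsigned long counters_old,
				  void *freelist_new, unsigned long counters_new)
{
	bool ok = false;

	pthread_mutex_lock(&s->lock);
	if (s->freelist == freelist_old && s->counters == counters_old) {
		s->freelist = freelist_new;
		s->counters = counters_new;
		ok = true;
	}
	pthread_mutex_unlock(&s->lock);
	return ok;
}

int main(void)
{
	struct slab_state s = {
		.lock = PTHREAD_MUTEX_INITIALIZER, .freelist = NULL, .counters = 3
	};
	int obj;

	/* Succeeds: both fields still hold the expected values. */
	printf("%d\n", cmpxchg_double_locked(&s, NULL, 3, &obj, 2));
	/* Fails: counters no longer match, so nothing is written. */
	printf("%d\n", cmpxchg_double_locked(&s, &obj, 3, NULL, 1));
	return 0;
}

Either both fields move together or neither does, which is what lets the rest of the patch treat freelist, counters and the frozen bit as one consistent snapshot.
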
@@ -376,34 +467,8 @@ static int disable_higher_order_debug;
376 */ 467 */
377static void print_section(char *text, u8 *addr, unsigned int length) 468static void print_section(char *text, u8 *addr, unsigned int length)
378{ 469{
379 int i, offset; 470 print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
380 int newline = 1; 471 length, 1);
381 char ascii[17];
382
383 ascii[16] = 0;
384
385 for (i = 0; i < length; i++) {
386 if (newline) {
387 printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
388 newline = 0;
389 }
390 printk(KERN_CONT " %02x", addr[i]);
391 offset = i % 16;
392 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
393 if (offset == 15) {
394 printk(KERN_CONT " %s\n", ascii);
395 newline = 1;
396 }
397 }
398 if (!newline) {
399 i %= 16;
400 while (i < 16) {
401 printk(KERN_CONT " ");
402 ascii[i] = ' ';
403 i++;
404 }
405 printk(KERN_CONT " %s\n", ascii);
406 }
407} 472}
408 473
409static struct track *get_track(struct kmem_cache *s, void *object, 474static struct track *get_track(struct kmem_cache *s, void *object,
@@ -534,12 +599,12 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
534 p, p - addr, get_freepointer(s, p)); 599 p, p - addr, get_freepointer(s, p));
535 600
536 if (p > addr + 16) 601 if (p > addr + 16)
537 print_section("Bytes b4", p - 16, 16); 602 print_section("Bytes b4 ", p - 16, 16);
538
539 print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE));
540 603
604 print_section("Object ", p, min_t(unsigned long, s->objsize,
605 PAGE_SIZE));
541 if (s->flags & SLAB_RED_ZONE) 606 if (s->flags & SLAB_RED_ZONE)
542 print_section("Redzone", p + s->objsize, 607 print_section("Redzone ", p + s->objsize,
543 s->inuse - s->objsize); 608 s->inuse - s->objsize);
544 609
545 if (s->offset) 610 if (s->offset)
@@ -552,7 +617,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
552 617
553 if (off != s->size) 618 if (off != s->size)
554 /* Beginning of the filler is the free pointer */ 619 /* Beginning of the filler is the free pointer */
555 print_section("Padding", p + off, s->size - off); 620 print_section("Padding ", p + off, s->size - off);
556 621
557 dump_stack(); 622 dump_stack();
558} 623}
@@ -590,49 +655,6 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
590 memset(p + s->objsize, val, s->inuse - s->objsize); 655 memset(p + s->objsize, val, s->inuse - s->objsize);
591} 656}
592 657
593static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes)
594{
595 while (bytes) {
596 if (*start != value)
597 return start;
598 start++;
599 bytes--;
600 }
601 return NULL;
602}
603
604static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes)
605{
606 u64 value64;
607 unsigned int words, prefix;
608
609 if (bytes <= 16)
610 return check_bytes8(start, value, bytes);
611
612 value64 = value | value << 8 | value << 16 | value << 24;
613 value64 = value64 | value64 << 32;
614 prefix = 8 - ((unsigned long)start) % 8;
615
616 if (prefix) {
617 u8 *r = check_bytes8(start, value, prefix);
618 if (r)
619 return r;
620 start += prefix;
621 bytes -= prefix;
622 }
623
624 words = bytes / 8;
625
626 while (words) {
627 if (*(u64 *)start != value64)
628 return check_bytes8(start, value, 8);
629 start += 8;
630 words--;
631 }
632
633 return check_bytes8(start, value, bytes % 8);
634}
635
636static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 658static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
637 void *from, void *to) 659 void *from, void *to)
638{ 660{
@@ -647,7 +669,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
647 u8 *fault; 669 u8 *fault;
648 u8 *end; 670 u8 *end;
649 671
650 fault = check_bytes(start, value, bytes); 672 fault = memchr_inv(start, value, bytes);
651 if (!fault) 673 if (!fault)
652 return 1; 674 return 1;
653 675
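
check_bytes_and_report() above now relies on memchr_inv(), which returns a pointer to the first byte in a range that does not hold the expected value, or NULL when the whole range matches; the open-coded check_bytes()/check_bytes8() pair is gone. A byte-at-a-time userspace equivalent for illustration (the in-kernel helper does the same scan word-at-a-time; memchr_inv_simple and the sample redzone are made up here).

#include <stdio.h>
#include <stddef.h>
#include <string.h>

/* Return the first byte in [start, start + bytes) that differs from c. */
static void *memchr_inv_simple(const void *start, int c, size_t bytes)
{
	const unsigned char *p = start;

	for (size_t i = 0; i < bytes; i++)
		if (p[i] != (unsigned char)c)
			return (void *)(p + i);
	return NULL;
}

int main(void)
{
	unsigned char redzone[16];
	unsigned char *fault;

	memset(redzone, 0xbb, sizeof(redzone));	/* redzone-style fill */
	redzone[9] = 0x00;			/* simulated overwrite */

	fault = memchr_inv_simple(redzone, 0xbb, sizeof(redzone));
	if (fault)
		printf("first bad byte at offset %td\n", fault - redzone);
	return 0;
}
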
@@ -740,14 +762,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
740 if (!remainder) 762 if (!remainder)
741 return 1; 763 return 1;
742 764
743 fault = check_bytes(end - remainder, POISON_INUSE, remainder); 765 fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
744 if (!fault) 766 if (!fault)
745 return 1; 767 return 1;
746 while (end > fault && end[-1] == POISON_INUSE) 768 while (end > fault && end[-1] == POISON_INUSE)
747 end--; 769 end--;
748 770
749 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); 771 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
750 print_section("Padding", end - remainder, remainder); 772 print_section("Padding ", end - remainder, remainder);
751 773
752 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); 774 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
753 return 0; 775 return 0;
@@ -838,10 +860,11 @@ static int check_slab(struct kmem_cache *s, struct page *page)
838static int on_freelist(struct kmem_cache *s, struct page *page, void *search) 860static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
839{ 861{
840 int nr = 0; 862 int nr = 0;
841 void *fp = page->freelist; 863 void *fp;
842 void *object = NULL; 864 void *object = NULL;
843 unsigned long max_objects; 865 unsigned long max_objects;
844 866
867 fp = page->freelist;
845 while (fp && nr <= page->objects) { 868 while (fp && nr <= page->objects) {
846 if (fp == search) 869 if (fp == search)
847 return 1; 870 return 1;
@@ -895,7 +918,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
895 page->freelist); 918 page->freelist);
896 919
897 if (!alloc) 920 if (!alloc)
898 print_section("Object", (void *)object, s->objsize); 921 print_section("Object ", (void *)object, s->objsize);
899 922
900 dump_stack(); 923 dump_stack();
901 } 924 }
@@ -946,26 +969,27 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
946 969
947/* 970/*
948 * Tracking of fully allocated slabs for debugging purposes. 971 * Tracking of fully allocated slabs for debugging purposes.
972 *
973 * list_lock must be held.
949 */ 974 */
950static void add_full(struct kmem_cache_node *n, struct page *page) 975static void add_full(struct kmem_cache *s,
976 struct kmem_cache_node *n, struct page *page)
951{ 977{
952 spin_lock(&n->list_lock); 978 if (!(s->flags & SLAB_STORE_USER))
979 return;
980
953 list_add(&page->lru, &n->full); 981 list_add(&page->lru, &n->full);
954 spin_unlock(&n->list_lock);
955} 982}
956 983
984/*
985 * list_lock must be held.
986 */
957static void remove_full(struct kmem_cache *s, struct page *page) 987static void remove_full(struct kmem_cache *s, struct page *page)
958{ 988{
959 struct kmem_cache_node *n;
960
961 if (!(s->flags & SLAB_STORE_USER)) 989 if (!(s->flags & SLAB_STORE_USER))
962 return; 990 return;
963 991
964 n = get_node(s, page_to_nid(page));
965
966 spin_lock(&n->list_lock);
967 list_del(&page->lru); 992 list_del(&page->lru);
968 spin_unlock(&n->list_lock);
969} 993}
970 994
971/* Tracking of the number of slabs for debugging purposes */ 995/* Tracking of the number of slabs for debugging purposes */
@@ -1021,11 +1045,6 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa
1021 if (!check_slab(s, page)) 1045 if (!check_slab(s, page))
1022 goto bad; 1046 goto bad;
1023 1047
1024 if (!on_freelist(s, page, object)) {
1025 object_err(s, page, object, "Object already allocated");
1026 goto bad;
1027 }
1028
1029 if (!check_valid_pointer(s, page, object)) { 1048 if (!check_valid_pointer(s, page, object)) {
1030 object_err(s, page, object, "Freelist Pointer check fails"); 1049 object_err(s, page, object, "Freelist Pointer check fails");
1031 goto bad; 1050 goto bad;
@@ -1058,6 +1077,12 @@ bad:
1058static noinline int free_debug_processing(struct kmem_cache *s, 1077static noinline int free_debug_processing(struct kmem_cache *s,
1059 struct page *page, void *object, unsigned long addr) 1078 struct page *page, void *object, unsigned long addr)
1060{ 1079{
1080 unsigned long flags;
1081 int rc = 0;
1082
1083 local_irq_save(flags);
1084 slab_lock(page);
1085
1061 if (!check_slab(s, page)) 1086 if (!check_slab(s, page))
1062 goto fail; 1087 goto fail;
1063 1088
@@ -1072,7 +1097,7 @@ static noinline int free_debug_processing(struct kmem_cache *s,
1072 } 1097 }
1073 1098
1074 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1099 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1075 return 0; 1100 goto out;
1076 1101
1077 if (unlikely(s != page->slab)) { 1102 if (unlikely(s != page->slab)) {
1078 if (!PageSlab(page)) { 1103 if (!PageSlab(page)) {
@@ -1089,18 +1114,19 @@ static noinline int free_debug_processing(struct kmem_cache *s,
1089 goto fail; 1114 goto fail;
1090 } 1115 }
1091 1116
1092 /* Special debug activities for freeing objects */
1093 if (!PageSlubFrozen(page) && !page->freelist)
1094 remove_full(s, page);
1095 if (s->flags & SLAB_STORE_USER) 1117 if (s->flags & SLAB_STORE_USER)
1096 set_track(s, object, TRACK_FREE, addr); 1118 set_track(s, object, TRACK_FREE, addr);
1097 trace(s, page, object, 0); 1119 trace(s, page, object, 0);
1098 init_object(s, object, SLUB_RED_INACTIVE); 1120 init_object(s, object, SLUB_RED_INACTIVE);
1099 return 1; 1121 rc = 1;
1122out:
1123 slab_unlock(page);
1124 local_irq_restore(flags);
1125 return rc;
1100 1126
1101fail: 1127fail:
1102 slab_fix(s, "Object at 0x%p not freed", object); 1128 slab_fix(s, "Object at 0x%p not freed", object);
1103 return 0; 1129 goto out;
1104} 1130}
1105 1131
1106static int __init setup_slub_debug(char *str) 1132static int __init setup_slub_debug(char *str)
@@ -1200,7 +1226,9 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1200 { return 1; } 1226 { return 1; }
1201static inline int check_object(struct kmem_cache *s, struct page *page, 1227static inline int check_object(struct kmem_cache *s, struct page *page,
1202 void *object, u8 val) { return 1; } 1228 void *object, u8 val) { return 1; }
1203static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1229static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1230 struct page *page) {}
1231static inline void remove_full(struct kmem_cache *s, struct page *page) {}
1204static inline unsigned long kmem_cache_flags(unsigned long objsize, 1232static inline unsigned long kmem_cache_flags(unsigned long objsize,
1205 unsigned long flags, const char *name, 1233 unsigned long flags, const char *name,
1206 void (*ctor)(void *)) 1234 void (*ctor)(void *))
@@ -1252,6 +1280,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1252 struct kmem_cache_order_objects oo = s->oo; 1280 struct kmem_cache_order_objects oo = s->oo;
1253 gfp_t alloc_gfp; 1281 gfp_t alloc_gfp;
1254 1282
1283 flags &= gfp_allowed_mask;
1284
1285 if (flags & __GFP_WAIT)
1286 local_irq_enable();
1287
1255 flags |= s->allocflags; 1288 flags |= s->allocflags;
1256 1289
1257 /* 1290 /*
@@ -1268,12 +1301,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1268 * Try a lower order alloc if possible 1301 * Try a lower order alloc if possible
1269 */ 1302 */
1270 page = alloc_slab_page(flags, node, oo); 1303 page = alloc_slab_page(flags, node, oo);
1271 if (!page)
1272 return NULL;
1273 1304
1274 stat(s, ORDER_FALLBACK); 1305 if (page)
1306 stat(s, ORDER_FALLBACK);
1275 } 1307 }
1276 1308
1309 if (flags & __GFP_WAIT)
1310 local_irq_disable();
1311
1312 if (!page)
1313 return NULL;
1314
1277 if (kmemcheck_enabled 1315 if (kmemcheck_enabled
1278 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1316 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1279 int pages = 1 << oo_order(oo); 1317 int pages = 1 << oo_order(oo);
@@ -1340,7 +1378,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1340 set_freepointer(s, last, NULL); 1378 set_freepointer(s, last, NULL);
1341 1379
1342 page->freelist = start; 1380 page->freelist = start;
1343 page->inuse = 0; 1381 page->inuse = page->objects;
1382 page->frozen = 1;
1344out: 1383out:
1345 return page; 1384 return page;
1346} 1385}
@@ -1418,79 +1457,80 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
1418} 1457}
1419 1458
1420/* 1459/*
1421 * Per slab locking using the pagelock 1460 * Management of partially allocated slabs.
1422 */ 1461 *
1423static __always_inline void slab_lock(struct page *page) 1462 * list_lock must be held.
1424{
1425 bit_spin_lock(PG_locked, &page->flags);
1426}
1427
1428static __always_inline void slab_unlock(struct page *page)
1429{
1430 __bit_spin_unlock(PG_locked, &page->flags);
1431}
1432
1433static __always_inline int slab_trylock(struct page *page)
1434{
1435 int rc = 1;
1436
1437 rc = bit_spin_trylock(PG_locked, &page->flags);
1438 return rc;
1439}
1440
1441/*
1442 * Management of partially allocated slabs
1443 */ 1463 */
1444static void add_partial(struct kmem_cache_node *n, 1464static inline void add_partial(struct kmem_cache_node *n,
1445 struct page *page, int tail) 1465 struct page *page, int tail)
1446{ 1466{
1447 spin_lock(&n->list_lock);
1448 n->nr_partial++; 1467 n->nr_partial++;
1449 if (tail) 1468 if (tail == DEACTIVATE_TO_TAIL)
1450 list_add_tail(&page->lru, &n->partial); 1469 list_add_tail(&page->lru, &n->partial);
1451 else 1470 else
1452 list_add(&page->lru, &n->partial); 1471 list_add(&page->lru, &n->partial);
1453 spin_unlock(&n->list_lock);
1454} 1472}
1455 1473
1456static inline void __remove_partial(struct kmem_cache_node *n, 1474/*
1475 * list_lock must be held.
1476 */
1477static inline void remove_partial(struct kmem_cache_node *n,
1457 struct page *page) 1478 struct page *page)
1458{ 1479{
1459 list_del(&page->lru); 1480 list_del(&page->lru);
1460 n->nr_partial--; 1481 n->nr_partial--;
1461} 1482}
1462 1483
1463static void remove_partial(struct kmem_cache *s, struct page *page)
1464{
1465 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1466
1467 spin_lock(&n->list_lock);
1468 __remove_partial(n, page);
1469 spin_unlock(&n->list_lock);
1470}
1471
1472/* 1484/*
1473 * Lock slab and remove from the partial list. 1485 * Lock slab, remove from the partial list and put the object into the
1486 * per cpu freelist.
1487 *
1488 * Returns a list of objects or NULL if it fails.
1474 * 1489 *
1475 * Must hold list_lock. 1490 * Must hold list_lock.
1476 */ 1491 */
1477static inline int lock_and_freeze_slab(struct kmem_cache_node *n, 1492static inline void *acquire_slab(struct kmem_cache *s,
1478 struct page *page) 1493 struct kmem_cache_node *n, struct page *page,
1494 int mode)
1479{ 1495{
1480 if (slab_trylock(page)) { 1496 void *freelist;
1481 __remove_partial(n, page); 1497 unsigned long counters;
1482 __SetPageSlubFrozen(page); 1498 struct page new;
1483 return 1; 1499
1484 } 1500 /*
1485 return 0; 1501 * Zap the freelist and set the frozen bit.
1502 * The old freelist is the list of objects for the
1503 * per cpu allocation list.
1504 */
1505 do {
1506 freelist = page->freelist;
1507 counters = page->counters;
1508 new.counters = counters;
1509 if (mode)
1510 new.inuse = page->objects;
1511
1512 VM_BUG_ON(new.frozen);
1513 new.frozen = 1;
1514
1515 } while (!__cmpxchg_double_slab(s, page,
1516 freelist, counters,
1517 NULL, new.counters,
1518 "lock and freeze"));
1519
1520 remove_partial(n, page);
1521 return freelist;
1486} 1522}
1487 1523
1524static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
1525
1488/* 1526/*
1489 * Try to allocate a partial slab from a specific node. 1527 * Try to allocate a partial slab from a specific node.
1490 */ 1528 */
1491static struct page *get_partial_node(struct kmem_cache_node *n) 1529static void *get_partial_node(struct kmem_cache *s,
1530 struct kmem_cache_node *n, struct kmem_cache_cpu *c)
1492{ 1531{
1493 struct page *page; 1532 struct page *page, *page2;
1533 void *object = NULL;
1494 1534
1495 /* 1535 /*
1496 * Racy check. If we mistakenly see no partial slabs then we 1536 * Racy check. If we mistakenly see no partial slabs then we
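
acquire_slab() above freezes a partial slab by re-reading freelist and counters, building new counters with the frozen bit set, and retrying __cmpxchg_double_slab() until the update lands. The retry shape, reduced to a single pointer with C11 atomics, is sketched below; only the loop is modelled, the frozen bit and object counters have no counterpart, and the type and function names are invented.

#include <stdatomic.h>
#include <stdio.h>

struct object {
	struct object *next;
	int id;
};

/* Atomically detach the whole list: keep retrying the swap to NULL
 * until it hits the head value the decision was based on. */
static struct object *detach_freelist(_Atomic(struct object *) *head)
{
	struct object *old = atomic_load(head);

	while (!atomic_compare_exchange_weak(head, &old, (struct object *)NULL))
		;	/* a failed CAS refreshes 'old' with the current head */
	return old;
}

int main(void)
{
	struct object b = { NULL, 2 };
	struct object a = { &b, 1 };
	_Atomic(struct object *) freelist = &a;

	for (struct object *p = detach_freelist(&freelist); p; p = p->next)
		printf("took object %d\n", p->id);
	printf("freelist is now %p\n", (void *)atomic_load(&freelist));
	return 0;
}
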
@@ -1502,26 +1542,43 @@ static struct page *get_partial_node(struct kmem_cache_node *n)
1502 return NULL; 1542 return NULL;
1503 1543
1504 spin_lock(&n->list_lock); 1544 spin_lock(&n->list_lock);
1505 list_for_each_entry(page, &n->partial, lru) 1545 list_for_each_entry_safe(page, page2, &n->partial, lru) {
1506 if (lock_and_freeze_slab(n, page)) 1546 void *t = acquire_slab(s, n, page, object == NULL);
1507 goto out; 1547 int available;
1508 page = NULL; 1548
1509out: 1549 if (!t)
1550 break;
1551
1552 if (!object) {
1553 c->page = page;
1554 c->node = page_to_nid(page);
1555 stat(s, ALLOC_FROM_PARTIAL);
1556 object = t;
1557 available = page->objects - page->inuse;
1558 } else {
1559 page->freelist = t;
1560 available = put_cpu_partial(s, page, 0);
1561 }
1562 if (kmem_cache_debug(s) || available > s->cpu_partial / 2)
1563 break;
1564
1565 }
1510 spin_unlock(&n->list_lock); 1566 spin_unlock(&n->list_lock);
1511 return page; 1567 return object;
1512} 1568}
1513 1569
1514/* 1570/*
1515 * Get a page from somewhere. Search in increasing NUMA distances. 1571 * Get a page from somewhere. Search in increasing NUMA distances.
1516 */ 1572 */
1517static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) 1573static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
1574 struct kmem_cache_cpu *c)
1518{ 1575{
1519#ifdef CONFIG_NUMA 1576#ifdef CONFIG_NUMA
1520 struct zonelist *zonelist; 1577 struct zonelist *zonelist;
1521 struct zoneref *z; 1578 struct zoneref *z;
1522 struct zone *zone; 1579 struct zone *zone;
1523 enum zone_type high_zoneidx = gfp_zone(flags); 1580 enum zone_type high_zoneidx = gfp_zone(flags);
1524 struct page *page; 1581 void *object;
1525 1582
1526 /* 1583 /*
1527 * The defrag ratio allows a configuration of the tradeoffs between 1584 * The defrag ratio allows a configuration of the tradeoffs between
@@ -1554,10 +1611,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1554 1611
1555 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1612 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1556 n->nr_partial > s->min_partial) { 1613 n->nr_partial > s->min_partial) {
1557 page = get_partial_node(n); 1614 object = get_partial_node(s, n, c);
1558 if (page) { 1615 if (object) {
1559 put_mems_allowed(); 1616 put_mems_allowed();
1560 return page; 1617 return object;
1561 } 1618 }
1562 } 1619 }
1563 } 1620 }
@@ -1569,63 +1626,17 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1569/* 1626/*
1570 * Get a partial page, lock it and return it. 1627 * Get a partial page, lock it and return it.
1571 */ 1628 */
1572static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) 1629static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
1630 struct kmem_cache_cpu *c)
1573{ 1631{
1574 struct page *page; 1632 void *object;
1575 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; 1633 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
1576 1634
1577 page = get_partial_node(get_node(s, searchnode)); 1635 object = get_partial_node(s, get_node(s, searchnode), c);
1578 if (page || node != NUMA_NO_NODE) 1636 if (object || node != NUMA_NO_NODE)
1579 return page; 1637 return object;
1580 1638
1581 return get_any_partial(s, flags); 1639 return get_any_partial(s, flags, c);
1582}
1583
1584/*
1585 * Move a page back to the lists.
1586 *
1587 * Must be called with the slab lock held.
1588 *
1589 * On exit the slab lock will have been dropped.
1590 */
1591static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1592 __releases(bitlock)
1593{
1594 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1595
1596 __ClearPageSlubFrozen(page);
1597 if (page->inuse) {
1598
1599 if (page->freelist) {
1600 add_partial(n, page, tail);
1601 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1602 } else {
1603 stat(s, DEACTIVATE_FULL);
1604 if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER))
1605 add_full(n, page);
1606 }
1607 slab_unlock(page);
1608 } else {
1609 stat(s, DEACTIVATE_EMPTY);
1610 if (n->nr_partial < s->min_partial) {
1611 /*
1612 * Adding an empty slab to the partial slabs in order
1613 * to avoid page allocator overhead. This slab needs
1614 * to come after the other slabs with objects in
1615 * so that the others get filled first. That way the
1616 * size of the partial list stays small.
1617 *
1618 * kmem_cache_shrink can reclaim any empty slabs from
1619 * the partial list.
1620 */
1621 add_partial(n, page, 1);
1622 slab_unlock(page);
1623 } else {
1624 slab_unlock(page);
1625 stat(s, FREE_SLAB);
1626 discard_slab(s, page);
1627 }
1628 }
1629} 1640}
1630 1641
1631#ifdef CONFIG_PREEMPT 1642#ifdef CONFIG_PREEMPT
@@ -1694,45 +1705,278 @@ void init_kmem_cache_cpus(struct kmem_cache *s)
1694 for_each_possible_cpu(cpu) 1705 for_each_possible_cpu(cpu)
1695 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); 1706 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
1696} 1707}
1708
1697/* 1709/*
1698 * Remove the cpu slab 1710 * Remove the cpu slab
1699 */ 1711 */
1700static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1712static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1701 __releases(bitlock)
1702{ 1713{
1714 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
1703 struct page *page = c->page; 1715 struct page *page = c->page;
1704 int tail = 1; 1716 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1705 1717 int lock = 0;
1706 if (page->freelist) 1718 enum slab_modes l = M_NONE, m = M_NONE;
1719 void *freelist;
1720 void *nextfree;
1721 int tail = DEACTIVATE_TO_HEAD;
1722 struct page new;
1723 struct page old;
1724
1725 if (page->freelist) {
1707 stat(s, DEACTIVATE_REMOTE_FREES); 1726 stat(s, DEACTIVATE_REMOTE_FREES);
1727 tail = DEACTIVATE_TO_TAIL;
1728 }
1729
1730 c->tid = next_tid(c->tid);
1731 c->page = NULL;
1732 freelist = c->freelist;
1733 c->freelist = NULL;
1734
1735 /*
1736 * Stage one: Free all available per cpu objects back
1737 * to the page freelist while it is still frozen. Leave the
1738 * last one.
1739 *
1740 * There is no need to take the list->lock because the page
1741 * is still frozen.
1742 */
1743 while (freelist && (nextfree = get_freepointer(s, freelist))) {
1744 void *prior;
1745 unsigned long counters;
1746
1747 do {
1748 prior = page->freelist;
1749 counters = page->counters;
1750 set_freepointer(s, freelist, prior);
1751 new.counters = counters;
1752 new.inuse--;
1753 VM_BUG_ON(!new.frozen);
1754
1755 } while (!__cmpxchg_double_slab(s, page,
1756 prior, counters,
1757 freelist, new.counters,
1758 "drain percpu freelist"));
1759
1760 freelist = nextfree;
1761 }
1762
1708 /* 1763 /*
1709 * Merge cpu freelist into slab freelist. Typically we get here 1764 * Stage two: Ensure that the page is unfrozen while the
1710 * because both freelists are empty. So this is unlikely 1765 * list presence reflects the actual number of objects
1711 * to occur. 1766 * during unfreeze.
1767 *
1768 * We setup the list membership and then perform a cmpxchg
1769 * with the count. If there is a mismatch then the page
1770 * is not unfrozen but the page is on the wrong list.
1771 *
1772 * Then we restart the process which may have to remove
1773 * the page from the list that we just put it on again
1774 * because the number of objects in the slab may have
1775 * changed.
1712 */ 1776 */
1713 while (unlikely(c->freelist)) { 1777redo:
1714 void **object; 1778
1779 old.freelist = page->freelist;
1780 old.counters = page->counters;
1781 VM_BUG_ON(!old.frozen);
1715 1782
1716 tail = 0; /* Hot objects. Put the slab first */ 1783 /* Determine target state of the slab */
1784 new.counters = old.counters;
1785 if (freelist) {
1786 new.inuse--;
1787 set_freepointer(s, freelist, old.freelist);
1788 new.freelist = freelist;
1789 } else
1790 new.freelist = old.freelist;
1717 1791
1718 /* Retrieve object from cpu_freelist */ 1792 new.frozen = 0;
1719 object = c->freelist;
1720 c->freelist = get_freepointer(s, c->freelist);
1721 1793
1722 /* And put onto the regular freelist */ 1794 if (!new.inuse && n->nr_partial > s->min_partial)
1723 set_freepointer(s, object, page->freelist); 1795 m = M_FREE;
1724 page->freelist = object; 1796 else if (new.freelist) {
1725 page->inuse--; 1797 m = M_PARTIAL;
1798 if (!lock) {
1799 lock = 1;
1800 /*
1801 * Taking the spinlock removes the possibility
1802 * that acquire_slab() will see a slab page that
1803 * is frozen
1804 */
1805 spin_lock(&n->list_lock);
1806 }
1807 } else {
1808 m = M_FULL;
1809 if (kmem_cache_debug(s) && !lock) {
1810 lock = 1;
1811 /*
1812 * This also ensures that the scanning of full
1813 * slabs from diagnostic functions will not see
1814 * any frozen slabs.
1815 */
1816 spin_lock(&n->list_lock);
1817 }
1818 }
1819
1820 if (l != m) {
1821
1822 if (l == M_PARTIAL)
1823
1824 remove_partial(n, page);
1825
1826 else if (l == M_FULL)
1827
1828 remove_full(s, page);
1829
1830 if (m == M_PARTIAL) {
1831
1832 add_partial(n, page, tail);
1833 stat(s, tail);
1834
1835 } else if (m == M_FULL) {
1836
1837 stat(s, DEACTIVATE_FULL);
1838 add_full(s, n, page);
1839
1840 }
1841 }
1842
1843 l = m;
1844 if (!__cmpxchg_double_slab(s, page,
1845 old.freelist, old.counters,
1846 new.freelist, new.counters,
1847 "unfreezing slab"))
1848 goto redo;
1849
1850 if (lock)
1851 spin_unlock(&n->list_lock);
1852
1853 if (m == M_FREE) {
1854 stat(s, DEACTIVATE_EMPTY);
1855 discard_slab(s, page);
1856 stat(s, FREE_SLAB);
1726 } 1857 }
1727 c->page = NULL; 1858}
1728 c->tid = next_tid(c->tid); 1859
1729 unfreeze_slab(s, page, tail); 1860/* Unfreeze all the cpu partial slabs */
1861static void unfreeze_partials(struct kmem_cache *s)
1862{
1863 struct kmem_cache_node *n = NULL;
1864 struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
1865 struct page *page;
1866
1867 while ((page = c->partial)) {
1868 enum slab_modes { M_PARTIAL, M_FREE };
1869 enum slab_modes l, m;
1870 struct page new;
1871 struct page old;
1872
1873 c->partial = page->next;
1874 l = M_FREE;
1875
1876 do {
1877
1878 old.freelist = page->freelist;
1879 old.counters = page->counters;
1880 VM_BUG_ON(!old.frozen);
1881
1882 new.counters = old.counters;
1883 new.freelist = old.freelist;
1884
1885 new.frozen = 0;
1886
1887 if (!new.inuse && (!n || n->nr_partial > s->min_partial))
1888 m = M_FREE;
1889 else {
1890 struct kmem_cache_node *n2 = get_node(s,
1891 page_to_nid(page));
1892
1893 m = M_PARTIAL;
1894 if (n != n2) {
1895 if (n)
1896 spin_unlock(&n->list_lock);
1897
1898 n = n2;
1899 spin_lock(&n->list_lock);
1900 }
1901 }
1902
1903 if (l != m) {
1904 if (l == M_PARTIAL)
1905 remove_partial(n, page);
1906 else
1907 add_partial(n, page, 1);
1908
1909 l = m;
1910 }
1911
1912 } while (!cmpxchg_double_slab(s, page,
1913 old.freelist, old.counters,
1914 new.freelist, new.counters,
1915 "unfreezing slab"));
1916
1917 if (m == M_FREE) {
1918 stat(s, DEACTIVATE_EMPTY);
1919 discard_slab(s, page);
1920 stat(s, FREE_SLAB);
1921 }
1922 }
1923
1924 if (n)
1925 spin_unlock(&n->list_lock);
1926}
1927
1928/*
1929 * Put a page that was just frozen (in __slab_free) into a partial page
1930 * slot if available. This is done without interrupts disabled and without
1931 * preemption disabled. The cmpxchg is racy and may put the partial page
1932 * onto a random cpu's partial slot.
1933 *
1934 * If we did not find a slot then simply move all the partials to the
1935 * per node partial list.
1936 */
1937int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1938{
1939 struct page *oldpage;
1940 int pages;
1941 int pobjects;
1942
1943 do {
1944 pages = 0;
1945 pobjects = 0;
1946 oldpage = this_cpu_read(s->cpu_slab->partial);
1947
1948 if (oldpage) {
1949 pobjects = oldpage->pobjects;
1950 pages = oldpage->pages;
1951 if (drain && pobjects > s->cpu_partial) {
1952 unsigned long flags;
1953 /*
1954 * partial array is full. Move the existing
1955 * set to the per node partial list.
1956 */
1957 local_irq_save(flags);
1958 unfreeze_partials(s);
1959 local_irq_restore(flags);
1960 pobjects = 0;
1961 pages = 0;
1962 }
1963 }
1964
1965 pages++;
1966 pobjects += page->objects - page->inuse;
1967
1968 page->pages = pages;
1969 page->pobjects = pobjects;
1970 page->next = oldpage;
1971
1972 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
1973 stat(s, CPU_PARTIAL_FREE);
1974 return pobjects;
1730} 1975}
1731 1976
1732static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1977static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1733{ 1978{
1734 stat(s, CPUSLAB_FLUSH); 1979 stat(s, CPUSLAB_FLUSH);
1735 slab_lock(c->page);
1736 deactivate_slab(s, c); 1980 deactivate_slab(s, c);
1737} 1981}
1738 1982
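
put_cpu_partial() above parks a just-frozen page on the per-cpu partial list with a this_cpu_cmpxchg() loop rather than a lock, and its comment accepts that preemption may land the page on another CPU's slot. The push itself is the usual compare-and-swap list insert; here is a userspace sketch with C11 atomics, where one shared head stands in for the per-cpu slot and the struct is hypothetical rather than the kernel's struct page fields.

#include <stdatomic.h>
#include <stdio.h>

struct partial_page {
	struct partial_page *next;
	int pobjects;		/* running total of free objects in the list */
};

/* Lock-free push: link to the current head, then swing the head pointer,
 * retrying if another thread moved it in the meantime. */
static void partial_push(_Atomic(struct partial_page *) *head,
			 struct partial_page *page, int free_objects)
{
	struct partial_page *old = atomic_load(head);

	do {
		page->next = old;
		page->pobjects = (old ? old->pobjects : 0) + free_objects;
	} while (!atomic_compare_exchange_weak(head, &old, page));
}

int main(void)
{
	_Atomic(struct partial_page *) partial = NULL;
	struct partial_page a = {0}, b = {0};

	partial_push(&partial, &a, 3);
	partial_push(&partial, &b, 5);

	for (struct partial_page *p = atomic_load(&partial); p; p = p->next)
		printf("page, running total %d\n", p->pobjects);
	return 0;
}
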
@@ -1745,8 +1989,12 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1745{ 1989{
1746 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 1990 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
1747 1991
1748 if (likely(c && c->page)) 1992 if (likely(c)) {
1749 flush_slab(s, c); 1993 if (c->page)
1994 flush_slab(s, c);
1995
1996 unfreeze_partials(s);
1997 }
1750} 1998}
1751 1999
1752static void flush_cpu_slab(void *d) 2000static void flush_cpu_slab(void *d)
@@ -1837,12 +2085,39 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
1837 } 2085 }
1838} 2086}
1839 2087
2088static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2089 int node, struct kmem_cache_cpu **pc)
2090{
2091 void *object;
2092 struct kmem_cache_cpu *c;
2093 struct page *page = new_slab(s, flags, node);
2094
2095 if (page) {
2096 c = __this_cpu_ptr(s->cpu_slab);
2097 if (c->page)
2098 flush_slab(s, c);
2099
2100 /*
2101 * No other reference to the page yet so we can
2102 * muck around with it freely without cmpxchg
2103 */
2104 object = page->freelist;
2105 page->freelist = NULL;
2106
2107 stat(s, ALLOC_SLAB);
2108 c->node = page_to_nid(page);
2109 c->page = page;
2110 *pc = c;
2111 } else
2112 object = NULL;
2113
2114 return object;
2115}
2116
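
new_slab_objects() above can strip the freelist of the page it just allocated with plain assignments because, as its comment says, nobody else holds a reference to the page yet. The same idea in isolation (toy types, not the kernel structures): while a slab is exclusively owned, "take over the whole freelist" is just two stores.

#include <stdio.h>

struct object { struct object *next; };
struct slab   { struct object *freelist; };

/* Freshly built slab: no other cpu can see it, so no cmpxchg is needed. */
static struct object *take_freelist(struct slab *s)
{
    struct object *list = s->freelist;

    s->freelist = NULL;         /* the per-cpu allocator now owns every object */
    return list;
}

int main(void)
{
    struct object o2 = { NULL }, o1 = { &o2 };
    struct slab s = { .freelist = &o1 };

    printf("first object %p, slab freelist now %p\n",
           (void *)take_freelist(&s), (void *)s.freelist);
    return 0;
}
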
1840/* 2117/*
1841 * Slow path. The lockless freelist is empty or we need to perform 2118 * Slow path. The lockless freelist is empty or we need to perform
1842 * debugging duties. 2119 * debugging duties.
1843 * 2120 *
1844 * Interrupts are disabled.
1845 *
1846 * Processing is still very fast if new objects have been freed to the 2121 * Processing is still very fast if new objects have been freed to the
1847 * regular freelist. In that case we simply take over the regular freelist 2122 * regular freelist. In that case we simply take over the regular freelist
1848 * as the lockless freelist and zap the regular freelist. 2123 * as the lockless freelist and zap the regular freelist.
@@ -1859,8 +2134,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1859 unsigned long addr, struct kmem_cache_cpu *c) 2134 unsigned long addr, struct kmem_cache_cpu *c)
1860{ 2135{
1861 void **object; 2136 void **object;
1862 struct page *page;
1863 unsigned long flags; 2137 unsigned long flags;
2138 struct page new;
2139 unsigned long counters;
1864 2140
1865 local_irq_save(flags); 2141 local_irq_save(flags);
1866#ifdef CONFIG_PREEMPT 2142#ifdef CONFIG_PREEMPT
@@ -1872,81 +2148,91 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1872 c = this_cpu_ptr(s->cpu_slab); 2148 c = this_cpu_ptr(s->cpu_slab);
1873#endif 2149#endif
1874 2150
1875 /* We handle __GFP_ZERO in the caller */ 2151 if (!c->page)
1876 gfpflags &= ~__GFP_ZERO;
1877
1878 page = c->page;
1879 if (!page)
1880 goto new_slab; 2152 goto new_slab;
2153redo:
2154 if (unlikely(!node_match(c, node))) {
2155 stat(s, ALLOC_NODE_MISMATCH);
2156 deactivate_slab(s, c);
2157 goto new_slab;
2158 }
1881 2159
1882 slab_lock(page); 2160 stat(s, ALLOC_SLOWPATH);
1883 if (unlikely(!node_match(c, node))) 2161
1884 goto another_slab; 2162 do {
2163 object = c->page->freelist;
2164 counters = c->page->counters;
2165 new.counters = counters;
2166 VM_BUG_ON(!new.frozen);
2167
2168 /*
2169 * If there is no object left then we use this loop to
2170 * deactivate the slab which is simple since no objects
2171 * are left in the slab and therefore we do not need to
2172 * put the page back onto the partial list.
2173 *
2174 * If there are objects left then we retrieve them
2175 * and use them to refill the per cpu queue.
2176 */
2177
2178 new.inuse = c->page->objects;
2179 new.frozen = object != NULL;
2180
2181 } while (!__cmpxchg_double_slab(s, c->page,
2182 object, counters,
2183 NULL, new.counters,
2184 "__slab_alloc"));
2185
2186 if (!object) {
2187 c->page = NULL;
2188 stat(s, DEACTIVATE_BYPASS);
2189 goto new_slab;
2190 }
1885 2191
1886 stat(s, ALLOC_REFILL); 2192 stat(s, ALLOC_REFILL);
1887 2193
1888load_freelist: 2194load_freelist:
1889 object = page->freelist;
1890 if (unlikely(!object))
1891 goto another_slab;
1892 if (kmem_cache_debug(s))
1893 goto debug;
1894
1895 c->freelist = get_freepointer(s, object); 2195 c->freelist = get_freepointer(s, object);
1896 page->inuse = page->objects;
1897 page->freelist = NULL;
1898
1899 slab_unlock(page);
1900 c->tid = next_tid(c->tid); 2196 c->tid = next_tid(c->tid);
1901 local_irq_restore(flags); 2197 local_irq_restore(flags);
1902 stat(s, ALLOC_SLOWPATH);
1903 return object; 2198 return object;
1904 2199
1905another_slab:
1906 deactivate_slab(s, c);
1907
1908new_slab: 2200new_slab:
1909 page = get_partial(s, gfpflags, node); 2201
1910 if (page) { 2202 if (c->partial) {
1911 stat(s, ALLOC_FROM_PARTIAL); 2203 c->page = c->partial;
1912 c->node = page_to_nid(page); 2204 c->partial = c->page->next;
1913 c->page = page; 2205 c->node = page_to_nid(c->page);
1914 goto load_freelist; 2206 stat(s, CPU_PARTIAL_ALLOC);
2207 c->freelist = NULL;
2208 goto redo;
1915 } 2209 }
1916 2210
1917 gfpflags &= gfp_allowed_mask; 2211 /* Then do expensive stuff like retrieving pages from the partial lists */
1918 if (gfpflags & __GFP_WAIT) 2212 object = get_partial(s, gfpflags, node, c);
1919 local_irq_enable();
1920 2213
1921 page = new_slab(s, gfpflags, node); 2214 if (unlikely(!object)) {
1922 2215
1923 if (gfpflags & __GFP_WAIT) 2216 object = new_slab_objects(s, gfpflags, node, &c);
1924 local_irq_disable();
1925 2217
1926 if (page) { 2218 if (unlikely(!object)) {
1927 c = __this_cpu_ptr(s->cpu_slab); 2219 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1928 stat(s, ALLOC_SLAB); 2220 slab_out_of_memory(s, gfpflags, node);
1929 if (c->page)
1930 flush_slab(s, c);
1931 2221
1932 slab_lock(page); 2222 local_irq_restore(flags);
1933 __SetPageSlubFrozen(page); 2223 return NULL;
1934 c->node = page_to_nid(page); 2224 }
1935 c->page = page;
1936 goto load_freelist;
1937 } 2225 }
1938 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1939 slab_out_of_memory(s, gfpflags, node);
1940 local_irq_restore(flags);
1941 return NULL;
1942debug:
1943 if (!alloc_debug_processing(s, page, object, addr))
1944 goto another_slab;
1945 2226
1946 page->inuse++; 2227 if (likely(!kmem_cache_debug(s)))
1947 page->freelist = get_freepointer(s, object); 2228 goto load_freelist;
2229
2230 /* Only entered in the debug case */
2231 if (!alloc_debug_processing(s, c->page, object, addr))
2232 goto new_slab; /* Slab failed checks. Next slab needed */
2233
2234 c->freelist = get_freepointer(s, object);
1948 deactivate_slab(s, c); 2235 deactivate_slab(s, c);
1949 c->page = NULL;
1950 c->node = NUMA_NO_NODE; 2236 c->node = NUMA_NO_NODE;
1951 local_irq_restore(flags); 2237 local_irq_restore(flags);
1952 return object; 2238 return object;
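
The rewritten __slab_alloc() refills the per-cpu freelist by swapping the page's freelist to NULL and updating the counters in a single cmpxchg_double; if the freelist was already empty the same loop doubles as deactivation (the new DEACTIVATE_BYPASS statistic), and the cpu partial list is tried before the more expensive get_partial()/new_slab_objects() paths. A simplified single-word model of the "grab the whole freelist atomically, retry on contention" step; real SLUB pairs the pointer with page->counters in one double-width cmpxchg, which this sketch does not attempt:

#include <stdio.h>

struct object { struct object *next; };

static struct object *page_freelist;    /* models page->freelist */

/* Take every free object in one shot; retry if another cpu freed concurrently. */
static struct object *acquire_freelist(void)
{
    struct object *list;

    do {
        list = __atomic_load_n(&page_freelist, __ATOMIC_ACQUIRE);
    } while (list &&
             !__atomic_compare_exchange_n(&page_freelist, &list, NULL, 0,
                                          __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE));
    return list;    /* NULL means the slab had nothing left: deactivate/bypass it */
}

int main(void)
{
    struct object b = { NULL }, a = { &b };

    page_freelist = &a;
    printf("refilled with %p, page freelist is now %p\n",
           (void *)acquire_freelist(), (void *)page_freelist);
    return 0;
}
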
@@ -2096,52 +2382,110 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2096{ 2382{
2097 void *prior; 2383 void *prior;
2098 void **object = (void *)x; 2384 void **object = (void *)x;
2099 unsigned long flags; 2385 int was_frozen;
2386 int inuse;
2387 struct page new;
2388 unsigned long counters;
2389 struct kmem_cache_node *n = NULL;
2390 unsigned long uninitialized_var(flags);
2100 2391
2101 local_irq_save(flags);
2102 slab_lock(page);
2103 stat(s, FREE_SLOWPATH); 2392 stat(s, FREE_SLOWPATH);
2104 2393
2105 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) 2394 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
2106 goto out_unlock; 2395 return;
2107 2396
2108 prior = page->freelist; 2397 do {
2109 set_freepointer(s, object, prior); 2398 prior = page->freelist;
2110 page->freelist = object; 2399 counters = page->counters;
2111 page->inuse--; 2400 set_freepointer(s, object, prior);
2401 new.counters = counters;
2402 was_frozen = new.frozen;
2403 new.inuse--;
2404 if ((!new.inuse || !prior) && !was_frozen && !n) {
2112 2405
2113 if (unlikely(PageSlubFrozen(page))) { 2406 if (!kmem_cache_debug(s) && !prior)
2114 stat(s, FREE_FROZEN); 2407
2115 goto out_unlock; 2408 /*
2116 } 2409 * Slab was on no list before and will be partially empty
2410 * We can defer the list move and instead freeze it.
2411 */
2412 new.frozen = 1;
2413
2414 else { /* Needs to be taken off a list */
2415
2416 n = get_node(s, page_to_nid(page));
2417 /*
2418 * Speculatively acquire the list_lock.
2419 * If the cmpxchg does not succeed then we may
2420 * drop the list_lock without any processing.
2421 *
2422 * Otherwise the list_lock will synchronize with
2423 * other processors updating the list of slabs.
2424 */
2425 spin_lock_irqsave(&n->list_lock, flags);
2426
2427 }
2428 }
2429 inuse = new.inuse;
2117 2430
2118 if (unlikely(!page->inuse)) 2431 } while (!cmpxchg_double_slab(s, page,
2119 goto slab_empty; 2432 prior, counters,
2433 object, new.counters,
2434 "__slab_free"));
2435
2436 if (likely(!n)) {
2437
2438 /*
2439 * If we just froze the page then put it onto the
2440 * per cpu partial list.
2441 */
2442 if (new.frozen && !was_frozen)
2443 put_cpu_partial(s, page, 1);
2444
2445 /*
2446 * The list lock was not taken therefore no list
2447 * activity can be necessary.
2448 */
2449 if (was_frozen)
2450 stat(s, FREE_FROZEN);
2451 return;
2452 }
2120 2453
2121 /* 2454 /*
2122 * Objects left in the slab. If it was not on the partial list before 2455 * was_frozen may have been set after we acquired the list_lock in
2123 * then add it. 2456 * an earlier loop. So we need to check it here again.
2124 */ 2457 */
2125 if (unlikely(!prior)) { 2458 if (was_frozen)
2126 add_partial(get_node(s, page_to_nid(page)), page, 1); 2459 stat(s, FREE_FROZEN);
2127 stat(s, FREE_ADD_PARTIAL); 2460 else {
2128 } 2461 if (unlikely(!inuse && n->nr_partial > s->min_partial))
2462 goto slab_empty;
2129 2463
2130out_unlock: 2464 /*
2131 slab_unlock(page); 2465 * Objects left in the slab. If it was not on the partial list before
2132 local_irq_restore(flags); 2466 * then add it.
2467 */
2468 if (unlikely(!prior)) {
2469 remove_full(s, page);
2470 add_partial(n, page, DEACTIVATE_TO_TAIL);
2471 stat(s, FREE_ADD_PARTIAL);
2472 }
2473 }
2474 spin_unlock_irqrestore(&n->list_lock, flags);
2133 return; 2475 return;
2134 2476
2135slab_empty: 2477slab_empty:
2136 if (prior) { 2478 if (prior) {
2137 /* 2479 /*
2138 * Slab still on the partial list. 2480 * Slab on the partial list.
2139 */ 2481 */
2140 remove_partial(s, page); 2482 remove_partial(n, page);
2141 stat(s, FREE_REMOVE_PARTIAL); 2483 stat(s, FREE_REMOVE_PARTIAL);
2142 } 2484 } else
2143 slab_unlock(page); 2485 /* Slab must be on the full list */
2144 local_irq_restore(flags); 2486 remove_full(s, page);
2487
2488 spin_unlock_irqrestore(&n->list_lock, flags);
2145 stat(s, FREE_SLAB); 2489 stat(s, FREE_SLAB);
2146 discard_slab(s, page); 2490 discard_slab(s, page);
2147} 2491}
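
The new __slab_free() decides, before issuing the cmpxchg, whether the free can stay lock-free: an already frozen slab needs nothing further, a previously full slab is frozen and parked on a per-cpu partial list (unless debugging is on), and only a slab that has to move between node lists takes list_lock, and then only speculatively. That decision flattened into a small pure function; it deliberately ignores the retry loop and the empty-slab discard path, and the enum names are made up:

#include <stdio.h>
#include <stdbool.h>

enum free_action { LOCK_FREE_DONE, FREEZE_TO_CPU_PARTIAL, TAKE_LIST_LOCK };

static enum free_action classify_free(bool was_frozen, bool was_full,
                                      bool now_empty, bool debug_cache)
{
    if (was_frozen)
        return LOCK_FREE_DONE;          /* the owning cpu will deal with it */
    if (was_full && !debug_cache)
        return FREEZE_TO_CPU_PARTIAL;   /* defer the list move, stay lock-free */
    if (was_full || now_empty)
        return TAKE_LIST_LOCK;          /* slab must change node lists */
    return LOCK_FREE_DONE;              /* already on a partial list, stays put */
}

int main(void)
{
    printf("frozen:%d full:%d empty:%d\n",
           classify_free(true,  false, false, false),   /* 0: lock-free */
           classify_free(false, true,  false, false),   /* 1: freeze */
           classify_free(false, false, true,  false));  /* 2: take list_lock */
    return 0;
}
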
@@ -2167,7 +2511,6 @@ static __always_inline void slab_free(struct kmem_cache *s,
2167 slab_free_hook(s, x); 2511 slab_free_hook(s, x);
2168 2512
2169redo: 2513redo:
2170
2171 /* 2514 /*
2172 * Determine the current cpu's per cpu slab. 2515
2173 * The cpu may change afterward. However that does not matter since 2516 * The cpu may change afterward. However that does not matter since
@@ -2415,7 +2758,6 @@ static void early_kmem_cache_node_alloc(int node)
2415{ 2758{
2416 struct page *page; 2759 struct page *page;
2417 struct kmem_cache_node *n; 2760 struct kmem_cache_node *n;
2418 unsigned long flags;
2419 2761
2420 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); 2762 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
2421 2763
@@ -2432,7 +2774,8 @@ static void early_kmem_cache_node_alloc(int node)
2432 n = page->freelist; 2774 n = page->freelist;
2433 BUG_ON(!n); 2775 BUG_ON(!n);
2434 page->freelist = get_freepointer(kmem_cache_node, n); 2776 page->freelist = get_freepointer(kmem_cache_node, n);
2435 page->inuse++; 2777 page->inuse = 1;
2778 page->frozen = 0;
2436 kmem_cache_node->node[node] = n; 2779 kmem_cache_node->node[node] = n;
2437#ifdef CONFIG_SLUB_DEBUG 2780#ifdef CONFIG_SLUB_DEBUG
2438 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 2781 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
@@ -2441,14 +2784,7 @@ static void early_kmem_cache_node_alloc(int node)
2441 init_kmem_cache_node(n, kmem_cache_node); 2784 init_kmem_cache_node(n, kmem_cache_node);
2442 inc_slabs_node(kmem_cache_node, node, page->objects); 2785 inc_slabs_node(kmem_cache_node, node, page->objects);
2443 2786
2444 /* 2787 add_partial(n, page, DEACTIVATE_TO_HEAD);
2445 * lockdep requires consistent irq usage for each lock
2446 * so even though there cannot be a race this early in
2447 * the boot sequence, we still disable irqs.
2448 */
2449 local_irq_save(flags);
2450 add_partial(n, page, 0);
2451 local_irq_restore(flags);
2452} 2788}
2453 2789
2454static void free_kmem_cache_nodes(struct kmem_cache *s) 2790static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -2654,11 +2990,44 @@ static int kmem_cache_open(struct kmem_cache *s,
2654 } 2990 }
2655 } 2991 }
2656 2992
2993#ifdef CONFIG_CMPXCHG_DOUBLE
2994 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
2995 /* Enable fast mode */
2996 s->flags |= __CMPXCHG_DOUBLE;
2997#endif
2998
2657 /* 2999 /*
2658 * The larger the object size is, the more pages we want on the partial 3000 * The larger the object size is, the more pages we want on the partial
2659 * list to avoid pounding the page allocator excessively. 3001 * list to avoid pounding the page allocator excessively.
2660 */ 3002 */
2661 set_min_partial(s, ilog2(s->size)); 3003 set_min_partial(s, ilog2(s->size) / 2);
3004
3005 /*
3006 * cpu_partial determines the maximum number of objects kept in the
3007 * per cpu partial lists of a processor.
3008 *
3009 * Per cpu partial lists mainly contain slabs that just have one
3010 * object freed. If they are used for allocation then they can be
3011 * filled up again with minimal effort. The slab will never hit the
3012 * per node partial lists and therefore no locking will be required.
3013 *
3014 * This setting also determines
3015 *
3016 * A) The number of objects from per cpu partial slabs dumped to the
3017 * per node list when we reach the limit.
3018 * B) The number of objects in cpu partial slabs to extract from the
3019 * per node list when we run out of per cpu objects. We only fetch 50%
3020 * to keep some capacity around for frees.
3021 */
3022 if (s->size >= PAGE_SIZE)
3023 s->cpu_partial = 2;
3024 else if (s->size >= 1024)
3025 s->cpu_partial = 6;
3026 else if (s->size >= 256)
3027 s->cpu_partial = 13;
3028 else
3029 s->cpu_partial = 30;
3030
2662 s->refcount = 1; 3031 s->refcount = 1;
2663#ifdef CONFIG_NUMA 3032#ifdef CONFIG_NUMA
2664 s->remote_node_defrag_ratio = 1000; 3033 s->remote_node_defrag_ratio = 1000;
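
kmem_cache_open() now derives both knobs from the object size: min_partial becomes ilog2(size)/2 and cpu_partial steps down from 30 objects for small caches to 2 for page-sized ones, so large objects do not pin much memory per cpu. The same thresholds restated as a standalone helper (userspace sketch; PAGE_SIZE assumed to be 4 KiB here):

#include <stdio.h>

#define PAGE_SIZE 4096UL   /* assumption: 4 KiB pages */

static int cpu_partial_for_size(unsigned long size)
{
    if (size >= PAGE_SIZE)
        return 2;
    else if (size >= 1024)
        return 6;
    else if (size >= 256)
        return 13;
    else
        return 30;
}

int main(void)
{
    unsigned long sizes[] = { 64, 256, 1024, 8192 };

    for (int i = 0; i < 4; i++)
        printf("object size %5lu -> cpu_partial %d\n",
               sizes[i], cpu_partial_for_size(sizes[i]));
    return 0;
}

The value is also tunable at runtime through the cpu_partial sysfs attribute added further down in this patch.
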
@@ -2717,23 +3086,22 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
2717 3086
2718/* 3087/*
2719 * Attempt to free all partial slabs on a node. 3088 * Attempt to free all partial slabs on a node.
3089 * This is called from kmem_cache_close(). We must be the last thread
3090 * using the cache and therefore we do not need to lock anymore.
2720 */ 3091 */
2721static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) 3092static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
2722{ 3093{
2723 unsigned long flags;
2724 struct page *page, *h; 3094 struct page *page, *h;
2725 3095
2726 spin_lock_irqsave(&n->list_lock, flags);
2727 list_for_each_entry_safe(page, h, &n->partial, lru) { 3096 list_for_each_entry_safe(page, h, &n->partial, lru) {
2728 if (!page->inuse) { 3097 if (!page->inuse) {
2729 __remove_partial(n, page); 3098 remove_partial(n, page);
2730 discard_slab(s, page); 3099 discard_slab(s, page);
2731 } else { 3100 } else {
2732 list_slab_objects(s, page, 3101 list_slab_objects(s, page,
2733 "Objects remaining on kmem_cache_close()"); 3102 "Objects remaining on kmem_cache_close()");
2734 } 3103 }
2735 } 3104 }
2736 spin_unlock_irqrestore(&n->list_lock, flags);
2737} 3105}
2738 3106
2739/* 3107/*
@@ -2767,6 +3135,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
2767 s->refcount--; 3135 s->refcount--;
2768 if (!s->refcount) { 3136 if (!s->refcount) {
2769 list_del(&s->list); 3137 list_del(&s->list);
3138 up_write(&slub_lock);
2770 if (kmem_cache_close(s)) { 3139 if (kmem_cache_close(s)) {
2771 printk(KERN_ERR "SLUB %s: %s called for cache that " 3140 printk(KERN_ERR "SLUB %s: %s called for cache that "
2772 "still has objects.\n", s->name, __func__); 3141 "still has objects.\n", s->name, __func__);
@@ -2775,8 +3144,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
2775 if (s->flags & SLAB_DESTROY_BY_RCU) 3144 if (s->flags & SLAB_DESTROY_BY_RCU)
2776 rcu_barrier(); 3145 rcu_barrier();
2777 sysfs_slab_remove(s); 3146 sysfs_slab_remove(s);
2778 } 3147 } else
2779 up_write(&slub_lock); 3148 up_write(&slub_lock);
2780} 3149}
2781EXPORT_SYMBOL(kmem_cache_destroy); 3150EXPORT_SYMBOL(kmem_cache_destroy);
2782 3151
@@ -3094,29 +3463,23 @@ int kmem_cache_shrink(struct kmem_cache *s)
3094 * list_lock. page->inuse here is the upper limit. 3463 * list_lock. page->inuse here is the upper limit.
3095 */ 3464 */
3096 list_for_each_entry_safe(page, t, &n->partial, lru) { 3465 list_for_each_entry_safe(page, t, &n->partial, lru) {
3097 if (!page->inuse && slab_trylock(page)) { 3466 list_move(&page->lru, slabs_by_inuse + page->inuse);
3098 /* 3467 if (!page->inuse)
3099 * Must hold slab lock here because slab_free 3468 n->nr_partial--;
3100 * may have freed the last object and be
3101 * waiting to release the slab.
3102 */
3103 __remove_partial(n, page);
3104 slab_unlock(page);
3105 discard_slab(s, page);
3106 } else {
3107 list_move(&page->lru,
3108 slabs_by_inuse + page->inuse);
3109 }
3110 } 3469 }
3111 3470
3112 /* 3471 /*
3113 * Rebuild the partial list with the slabs filled up most 3472 * Rebuild the partial list with the slabs filled up most
3114 * first and the least used slabs at the end. 3473 * first and the least used slabs at the end.
3115 */ 3474 */
3116 for (i = objects - 1; i >= 0; i--) 3475 for (i = objects - 1; i > 0; i--)
3117 list_splice(slabs_by_inuse + i, n->partial.prev); 3476 list_splice(slabs_by_inuse + i, n->partial.prev);
3118 3477
3119 spin_unlock_irqrestore(&n->list_lock, flags); 3478 spin_unlock_irqrestore(&n->list_lock, flags);
3479
3480 /* Release empty slabs */
3481 list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
3482 discard_slab(s, page);
3120 } 3483 }
3121 3484
3122 kfree(slabs_by_inuse); 3485 kfree(slabs_by_inuse);
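
kmem_cache_shrink() now buckets every partial slab by page->inuse while holding list_lock, splices buckets objects-1 down to 1 back so the fullest slabs lead the list, and frees whatever landed in bucket 0 (the empty slabs) only after the lock is dropped. The bucket-sort idea in plain C, with a toy slab type and malloc'd nodes standing in for the list_head machinery:

#include <stdio.h>
#include <stdlib.h>

struct slab { int inuse; struct slab *next; };   /* toy slab */

#define OBJECTS 4   /* objects per slab in this toy cache */

int main(void)
{
    int inuse_vals[] = { 0, 3, 1, 0, 2, 1 };
    struct slab *buckets[OBJECTS] = { NULL };

    /* Phase 1 (under list_lock in the kernel): bucket by inuse. */
    for (int i = 0; i < 6; i++) {
        struct slab *s = malloc(sizeof(*s));
        s->inuse = inuse_vals[i];
        s->next = buckets[s->inuse];
        buckets[s->inuse] = s;
    }

    /* Phase 2: rebuild the partial list, fullest slabs first (i = OBJECTS-1..1). */
    printf("new partial list:");
    for (int i = OBJECTS - 1; i > 0; i--)
        for (struct slab *s = buckets[i]; s; s = s->next)
            printf(" inuse=%d", s->inuse);
    printf("\n");

    /* Phase 3 (after dropping the lock): bucket 0 holds only empty slabs. */
    int freed = 0;
    for (struct slab *s = buckets[0], *next; s; s = next) {
        next = s->next;
        free(s);
        freed++;
    }
    printf("discarded %d empty slabs\n", freed);
    return 0;
}
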
@@ -3689,12 +4052,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
3689static void validate_slab_slab(struct kmem_cache *s, struct page *page, 4052static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3690 unsigned long *map) 4053 unsigned long *map)
3691{ 4054{
3692 if (slab_trylock(page)) { 4055 slab_lock(page);
3693 validate_slab(s, page, map); 4056 validate_slab(s, page, map);
3694 slab_unlock(page); 4057 slab_unlock(page);
3695 } else
3696 printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n",
3697 s->name, page);
3698} 4058}
3699 4059
3700static int validate_slab_node(struct kmem_cache *s, 4060static int validate_slab_node(struct kmem_cache *s,
@@ -4075,6 +4435,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4075 4435
4076 for_each_possible_cpu(cpu) { 4436 for_each_possible_cpu(cpu) {
4077 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4437 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
4438 struct page *page;
4078 4439
4079 if (!c || c->node < 0) 4440 if (!c || c->node < 0)
4080 continue; 4441 continue;
@@ -4090,6 +4451,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4090 total += x; 4451 total += x;
4091 nodes[c->node] += x; 4452 nodes[c->node] += x;
4092 } 4453 }
4454 page = c->partial;
4455
4456 if (page) {
4457 x = page->pobjects;
4458 total += x;
4459 nodes[c->node] += x;
4460 }
4093 per_cpu[c->node]++; 4461 per_cpu[c->node]++;
4094 } 4462 }
4095 } 4463 }
@@ -4168,11 +4536,12 @@ struct slab_attribute {
4168}; 4536};
4169 4537
4170#define SLAB_ATTR_RO(_name) \ 4538#define SLAB_ATTR_RO(_name) \
4171 static struct slab_attribute _name##_attr = __ATTR_RO(_name) 4539 static struct slab_attribute _name##_attr = \
4540 __ATTR(_name, 0400, _name##_show, NULL)
4172 4541
4173#define SLAB_ATTR(_name) \ 4542#define SLAB_ATTR(_name) \
4174 static struct slab_attribute _name##_attr = \ 4543 static struct slab_attribute _name##_attr = \
4175 __ATTR(_name, 0644, _name##_show, _name##_store) 4544 __ATTR(_name, 0600, _name##_show, _name##_store)
4176 4545
4177static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 4546static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
4178{ 4547{
@@ -4241,6 +4610,27 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
4241} 4610}
4242SLAB_ATTR(min_partial); 4611SLAB_ATTR(min_partial);
4243 4612
4613static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
4614{
4615 return sprintf(buf, "%u\n", s->cpu_partial);
4616}
4617
4618static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
4619 size_t length)
4620{
4621 unsigned long objects;
4622 int err;
4623
4624 err = strict_strtoul(buf, 10, &objects);
4625 if (err)
4626 return err;
4627
4628 s->cpu_partial = objects;
4629 flush_all(s);
4630 return length;
4631}
4632SLAB_ATTR(cpu_partial);
4633
4244static ssize_t ctor_show(struct kmem_cache *s, char *buf) 4634static ssize_t ctor_show(struct kmem_cache *s, char *buf)
4245{ 4635{
4246 if (!s->ctor) 4636 if (!s->ctor)
@@ -4279,6 +4669,37 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
4279} 4669}
4280SLAB_ATTR_RO(objects_partial); 4670SLAB_ATTR_RO(objects_partial);
4281 4671
4672static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
4673{
4674 int objects = 0;
4675 int pages = 0;
4676 int cpu;
4677 int len;
4678
4679 for_each_online_cpu(cpu) {
4680 struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
4681
4682 if (page) {
4683 pages += page->pages;
4684 objects += page->pobjects;
4685 }
4686 }
4687
4688 len = sprintf(buf, "%d(%d)", objects, pages);
4689
4690#ifdef CONFIG_SMP
4691 for_each_online_cpu(cpu) {
4692 struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
4693
4694 if (page && len < PAGE_SIZE - 20)
4695 len += sprintf(buf + len, " C%d=%d(%d)", cpu,
4696 page->pobjects, page->pages);
4697 }
4698#endif
4699 return len + sprintf(buf + len, "\n");
4700}
4701SLAB_ATTR_RO(slabs_cpu_partial);
4702
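
slabs_cpu_partial_show() reports an aggregate "objects(pages)" pair and, on SMP, appends one " C<cpu>=objects(pages)" entry for every online cpu that currently holds partial slabs; the attribute appears as slabs_cpu_partial under the cache's /sys/kernel/slab/ directory. A few lines showing how such a line is assembled from per-cpu samples (the sample values are made up):

#include <stdio.h>

int main(void)
{
    /* made-up per-cpu (objects, pages) samples for four cpus */
    int objects[] = { 12, 0, 7, 3 };
    int pages[]   = {  2, 0, 1, 1 };
    char buf[256];
    int len, tot_objects = 0, tot_pages = 0;

    for (int cpu = 0; cpu < 4; cpu++) {
        tot_objects += objects[cpu];
        tot_pages += pages[cpu];
    }
    len = sprintf(buf, "%d(%d)", tot_objects, tot_pages);
    for (int cpu = 0; cpu < 4; cpu++)
        if (pages[cpu])
            len += sprintf(buf + len, " C%d=%d(%d)",
                           cpu, objects[cpu], pages[cpu]);
    printf("%s\n", buf);   /* prints: 22(4) C0=12(2) C2=7(1) C3=3(1) */
    return 0;
}
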
4282static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4703static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
4283{ 4704{
4284 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4705 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
@@ -4342,8 +4763,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s,
4342 const char *buf, size_t length) 4763 const char *buf, size_t length)
4343{ 4764{
4344 s->flags &= ~SLAB_DEBUG_FREE; 4765 s->flags &= ~SLAB_DEBUG_FREE;
4345 if (buf[0] == '1') 4766 if (buf[0] == '1') {
4767 s->flags &= ~__CMPXCHG_DOUBLE;
4346 s->flags |= SLAB_DEBUG_FREE; 4768 s->flags |= SLAB_DEBUG_FREE;
4769 }
4347 return length; 4770 return length;
4348} 4771}
4349SLAB_ATTR(sanity_checks); 4772SLAB_ATTR(sanity_checks);
@@ -4357,8 +4780,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4357 size_t length) 4780 size_t length)
4358{ 4781{
4359 s->flags &= ~SLAB_TRACE; 4782 s->flags &= ~SLAB_TRACE;
4360 if (buf[0] == '1') 4783 if (buf[0] == '1') {
4784 s->flags &= ~__CMPXCHG_DOUBLE;
4361 s->flags |= SLAB_TRACE; 4785 s->flags |= SLAB_TRACE;
4786 }
4362 return length; 4787 return length;
4363} 4788}
4364SLAB_ATTR(trace); 4789SLAB_ATTR(trace);
@@ -4375,8 +4800,10 @@ static ssize_t red_zone_store(struct kmem_cache *s,
4375 return -EBUSY; 4800 return -EBUSY;
4376 4801
4377 s->flags &= ~SLAB_RED_ZONE; 4802 s->flags &= ~SLAB_RED_ZONE;
4378 if (buf[0] == '1') 4803 if (buf[0] == '1') {
4804 s->flags &= ~__CMPXCHG_DOUBLE;
4379 s->flags |= SLAB_RED_ZONE; 4805 s->flags |= SLAB_RED_ZONE;
4806 }
4380 calculate_sizes(s, -1); 4807 calculate_sizes(s, -1);
4381 return length; 4808 return length;
4382} 4809}
@@ -4394,8 +4821,10 @@ static ssize_t poison_store(struct kmem_cache *s,
4394 return -EBUSY; 4821 return -EBUSY;
4395 4822
4396 s->flags &= ~SLAB_POISON; 4823 s->flags &= ~SLAB_POISON;
4397 if (buf[0] == '1') 4824 if (buf[0] == '1') {
4825 s->flags &= ~__CMPXCHG_DOUBLE;
4398 s->flags |= SLAB_POISON; 4826 s->flags |= SLAB_POISON;
4827 }
4399 calculate_sizes(s, -1); 4828 calculate_sizes(s, -1);
4400 return length; 4829 return length;
4401} 4830}
@@ -4413,8 +4842,10 @@ static ssize_t store_user_store(struct kmem_cache *s,
4413 return -EBUSY; 4842 return -EBUSY;
4414 4843
4415 s->flags &= ~SLAB_STORE_USER; 4844 s->flags &= ~SLAB_STORE_USER;
4416 if (buf[0] == '1') 4845 if (buf[0] == '1') {
4846 s->flags &= ~__CMPXCHG_DOUBLE;
4417 s->flags |= SLAB_STORE_USER; 4847 s->flags |= SLAB_STORE_USER;
4848 }
4418 calculate_sizes(s, -1); 4849 calculate_sizes(s, -1);
4419 return length; 4850 return length;
4420} 4851}
@@ -4579,6 +5010,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial);
4579STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); 5010STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial);
4580STAT_ATTR(ALLOC_SLAB, alloc_slab); 5011STAT_ATTR(ALLOC_SLAB, alloc_slab);
4581STAT_ATTR(ALLOC_REFILL, alloc_refill); 5012STAT_ATTR(ALLOC_REFILL, alloc_refill);
5013STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch);
4582STAT_ATTR(FREE_SLAB, free_slab); 5014STAT_ATTR(FREE_SLAB, free_slab);
4583STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); 5015STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush);
4584STAT_ATTR(DEACTIVATE_FULL, deactivate_full); 5016STAT_ATTR(DEACTIVATE_FULL, deactivate_full);
@@ -4586,7 +5018,12 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty);
4586STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); 5018STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
4587STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); 5019STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
4588STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); 5020STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
5021STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
4589STAT_ATTR(ORDER_FALLBACK, order_fallback); 5022STAT_ATTR(ORDER_FALLBACK, order_fallback);
5023STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
5024STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
5025STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
5026STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
4590#endif 5027#endif
4591 5028
4592static struct attribute *slab_attrs[] = { 5029static struct attribute *slab_attrs[] = {
@@ -4595,6 +5032,7 @@ static struct attribute *slab_attrs[] = {
4595 &objs_per_slab_attr.attr, 5032 &objs_per_slab_attr.attr,
4596 &order_attr.attr, 5033 &order_attr.attr,
4597 &min_partial_attr.attr, 5034 &min_partial_attr.attr,
5035 &cpu_partial_attr.attr,
4598 &objects_attr.attr, 5036 &objects_attr.attr,
4599 &objects_partial_attr.attr, 5037 &objects_partial_attr.attr,
4600 &partial_attr.attr, 5038 &partial_attr.attr,
@@ -4607,6 +5045,7 @@ static struct attribute *slab_attrs[] = {
4607 &destroy_by_rcu_attr.attr, 5045 &destroy_by_rcu_attr.attr,
4608 &shrink_attr.attr, 5046 &shrink_attr.attr,
4609 &reserved_attr.attr, 5047 &reserved_attr.attr,
5048 &slabs_cpu_partial_attr.attr,
4610#ifdef CONFIG_SLUB_DEBUG 5049#ifdef CONFIG_SLUB_DEBUG
4611 &total_objects_attr.attr, 5050 &total_objects_attr.attr,
4612 &slabs_attr.attr, 5051 &slabs_attr.attr,
@@ -4636,6 +5075,7 @@ static struct attribute *slab_attrs[] = {
4636 &alloc_from_partial_attr.attr, 5075 &alloc_from_partial_attr.attr,
4637 &alloc_slab_attr.attr, 5076 &alloc_slab_attr.attr,
4638 &alloc_refill_attr.attr, 5077 &alloc_refill_attr.attr,
5078 &alloc_node_mismatch_attr.attr,
4639 &free_slab_attr.attr, 5079 &free_slab_attr.attr,
4640 &cpuslab_flush_attr.attr, 5080 &cpuslab_flush_attr.attr,
4641 &deactivate_full_attr.attr, 5081 &deactivate_full_attr.attr,
@@ -4643,7 +5083,12 @@ static struct attribute *slab_attrs[] = {
4643 &deactivate_to_head_attr.attr, 5083 &deactivate_to_head_attr.attr,
4644 &deactivate_to_tail_attr.attr, 5084 &deactivate_to_tail_attr.attr,
4645 &deactivate_remote_frees_attr.attr, 5085 &deactivate_remote_frees_attr.attr,
5086 &deactivate_bypass_attr.attr,
4646 &order_fallback_attr.attr, 5087 &order_fallback_attr.attr,
5088 &cmpxchg_double_fail_attr.attr,
5089 &cmpxchg_double_cpu_fail_attr.attr,
5090 &cpu_partial_alloc_attr.attr,
5091 &cpu_partial_free_attr.attr,
4647#endif 5092#endif
4648#ifdef CONFIG_FAILSLAB 5093#ifdef CONFIG_FAILSLAB
4649 &failslab_attr.attr, 5094 &failslab_attr.attr,
@@ -4995,7 +5440,7 @@ static const struct file_operations proc_slabinfo_operations = {
4995 5440
4996static int __init slab_proc_init(void) 5441static int __init slab_proc_init(void)
4997{ 5442{
4998 proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); 5443 proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
4999 return 0; 5444 return 0;
5000} 5445}
5001module_init(slab_proc_init); 5446module_init(slab_proc_init);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 64b984091edb..1b7e22ab9b09 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -21,7 +21,6 @@
21#include <linux/mmzone.h> 21#include <linux/mmzone.h>
22#include <linux/bootmem.h> 22#include <linux/bootmem.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/module.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
26#include <linux/spinlock.h> 25#include <linux/spinlock.h>
27#include <linux/vmalloc.h> 26#include <linux/vmalloc.h>
diff --git a/mm/sparse.c b/mm/sparse.c
index 858e1dff9b2a..61d7cde23111 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -6,7 +6,7 @@
6#include <linux/mmzone.h> 6#include <linux/mmzone.h>
7#include <linux/bootmem.h> 7#include <linux/bootmem.h>
8#include <linux/highmem.h> 8#include <linux/highmem.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <linux/vmalloc.h> 11#include <linux/vmalloc.h>
12#include "internal.h" 12#include "internal.h"
diff --git a/mm/swap.c b/mm/swap.c
index 3a442f18b0b3..a91caf754d9b 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -21,7 +21,7 @@
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/pagevec.h> 22#include <linux/pagevec.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/module.h> 24#include <linux/export.h>
25#include <linux/mm_inline.h> 25#include <linux/mm_inline.h>
26#include <linux/buffer_head.h> /* for try_to_release_page() */ 26#include <linux/buffer_head.h> /* for try_to_release_page() */
27#include <linux/percpu_counter.h> 27#include <linux/percpu_counter.h>
@@ -78,39 +78,22 @@ static void put_compound_page(struct page *page)
78{ 78{
79 if (unlikely(PageTail(page))) { 79 if (unlikely(PageTail(page))) {
80 /* __split_huge_page_refcount can run under us */ 80 /* __split_huge_page_refcount can run under us */
81 struct page *page_head = page->first_page; 81 struct page *page_head = compound_trans_head(page);
82 smp_rmb(); 82
83 /* 83 if (likely(page != page_head &&
84 * If PageTail is still set after smp_rmb() we can be sure 84 get_page_unless_zero(page_head))) {
85 * that the page->first_page we read wasn't a dangling pointer.
86 * See __split_huge_page_refcount() smp_wmb().
87 */
88 if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
89 unsigned long flags; 85 unsigned long flags;
90 /* 86 /*
91 * Verify that our page_head wasn't converted 87 * page_head wasn't a dangling pointer but it
92 * to a a regular page before we got a 88 * may not be a head page anymore by the time
93 * reference on it. 89 * we obtain the lock. That is ok as long as it
90 * can't be freed from under us.
94 */ 91 */
95 if (unlikely(!PageHead(page_head))) {
96 /* PageHead is cleared after PageTail */
97 smp_rmb();
98 VM_BUG_ON(PageTail(page));
99 goto out_put_head;
100 }
101 /*
102 * Only run compound_lock on a valid PageHead,
103 * after having it pinned with
104 * get_page_unless_zero() above.
105 */
106 smp_mb();
107 /* page_head wasn't a dangling pointer */
108 flags = compound_lock_irqsave(page_head); 92 flags = compound_lock_irqsave(page_head);
109 if (unlikely(!PageTail(page))) { 93 if (unlikely(!PageTail(page))) {
110 /* __split_huge_page_refcount run before us */ 94 /* __split_huge_page_refcount run before us */
111 compound_unlock_irqrestore(page_head, flags); 95 compound_unlock_irqrestore(page_head, flags);
112 VM_BUG_ON(PageHead(page_head)); 96 VM_BUG_ON(PageHead(page_head));
113 out_put_head:
114 if (put_page_testzero(page_head)) 97 if (put_page_testzero(page_head))
115 __put_single_page(page_head); 98 __put_single_page(page_head);
116 out_put_single: 99 out_put_single:
@@ -121,16 +104,17 @@ static void put_compound_page(struct page *page)
121 VM_BUG_ON(page_head != page->first_page); 104 VM_BUG_ON(page_head != page->first_page);
122 /* 105 /*
123 * We can release the refcount taken by 106 * We can release the refcount taken by
124 * get_page_unless_zero now that 107 * get_page_unless_zero() now that
125 * split_huge_page_refcount is blocked on the 108 * __split_huge_page_refcount() is blocked on
126 * compound_lock. 109 * the compound_lock.
127 */ 110 */
128 if (put_page_testzero(page_head)) 111 if (put_page_testzero(page_head))
129 VM_BUG_ON(1); 112 VM_BUG_ON(1);
130 /* __split_huge_page_refcount will wait now */ 113 /* __split_huge_page_refcount will wait now */
131 VM_BUG_ON(atomic_read(&page->_count) <= 0); 114 VM_BUG_ON(page_mapcount(page) <= 0);
132 atomic_dec(&page->_count); 115 atomic_dec(&page->_mapcount);
133 VM_BUG_ON(atomic_read(&page_head->_count) <= 0); 116 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
117 VM_BUG_ON(atomic_read(&page->_count) != 0);
134 compound_unlock_irqrestore(page_head, flags); 118 compound_unlock_irqrestore(page_head, flags);
135 if (put_page_testzero(page_head)) { 119 if (put_page_testzero(page_head)) {
136 if (PageHead(page_head)) 120 if (PageHead(page_head))
@@ -160,6 +144,45 @@ void put_page(struct page *page)
160} 144}
161EXPORT_SYMBOL(put_page); 145EXPORT_SYMBOL(put_page);
162 146
147/*
148 * This function is exported but must not be called by anything other
149 * than get_page(). It implements the slow path of get_page().
150 */
151bool __get_page_tail(struct page *page)
152{
153 /*
154 * This takes care of get_page() if run on a tail page
155 * returned by one of the get_user_pages/follow_page variants.
156 * get_user_pages/follow_page itself doesn't need the compound
157 * lock because it runs __get_page_tail_foll() under the
158 * proper PT lock that already serializes against
159 * split_huge_page().
160 */
161 unsigned long flags;
162 bool got = false;
163 struct page *page_head = compound_trans_head(page);
164
165 if (likely(page != page_head && get_page_unless_zero(page_head))) {
166 /*
167 * page_head wasn't a dangling pointer but it
168 * may not be a head page anymore by the time
169 * we obtain the lock. That is ok as long as it
170 * can't be freed from under us.
171 */
172 flags = compound_lock_irqsave(page_head);
173 /* here __split_huge_page_refcount won't run anymore */
174 if (likely(PageTail(page))) {
175 __get_page_tail_foll(page, false);
176 got = true;
177 }
178 compound_unlock_irqrestore(page_head, flags);
179 if (unlikely(!got))
180 put_page(page_head);
181 }
182 return got;
183}
184EXPORT_SYMBOL(__get_page_tail);
185
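
__get_page_tail() follows the same shape as the reworked put_compound_page(): speculatively pin the head page, take the compound lock so __split_huge_page_refcount() can no longer run, and only then re-check that the page is still a tail, dropping the speculative pin if the huge page was split in the meantime. The check, lock, re-check pattern reduced to a generic single-threaded pthread sketch; the mutex and flags are stand-ins, not the kernel's compound_lock or page flags:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t compound_lock = PTHREAD_MUTEX_INITIALIZER;
static bool page_is_tail = true;  /* a concurrent "split" may clear this */
static int head_refcount = 1;

static bool get_page_tail(void)
{
    bool got = false;

    head_refcount++;                      /* speculative pin of the head page */
    pthread_mutex_lock(&compound_lock);   /* a split cannot get past this point */
    if (page_is_tail)                     /* re-check under the lock */
        got = true;                       /* ...and take the real references here */
    pthread_mutex_unlock(&compound_lock);
    if (!got)
        head_refcount--;                  /* undo the speculative pin */
    return got;
}

int main(void)
{
    printf("got=%d head_refcount=%d\n", get_page_tail(), head_refcount);
    return 0;
}
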
163/** 186/**
164 * put_pages_list() - release a list of pages 187 * put_pages_list() - release a list of pages
165 * @pages: list of pages threaded on page->lru 188 * @pages: list of pages threaded on page->lru
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 46680461785b..78cc4d1f6cce 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -6,7 +6,6 @@
6 * 6 *
7 * Rewritten to use page cache, (C) 1998 Stephen Tweedie 7 * Rewritten to use page cache, (C) 1998 Stephen Tweedie
8 */ 8 */
9#include <linux/module.h>
10#include <linux/mm.h> 9#include <linux/mm.h>
11#include <linux/gfp.h> 10#include <linux/gfp.h>
12#include <linux/kernel_stat.h> 11#include <linux/kernel_stat.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1b8c33907242..b1cd12060723 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -21,7 +21,6 @@
21#include <linux/proc_fs.h> 21#include <linux/proc_fs.h>
22#include <linux/seq_file.h> 22#include <linux/seq_file.h>
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/module.h>
25#include <linux/ksm.h> 24#include <linux/ksm.h>
26#include <linux/rmap.h> 25#include <linux/rmap.h>
27#include <linux/security.h> 26#include <linux/security.h>
@@ -1617,7 +1616,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1617 1616
1618 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); 1617 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1619 err = try_to_unuse(type); 1618 err = try_to_unuse(type);
1620 test_set_oom_score_adj(oom_score_adj); 1619 compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
1621 1620
1622 if (err) { 1621 if (err) {
1623 /* 1622 /*
@@ -1924,20 +1923,24 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
1924 1923
1925 /* 1924 /*
1926 * Find out how many pages are allowed for a single swap 1925 * Find out how many pages are allowed for a single swap
1927 * device. There are two limiting factors: 1) the number of 1926 * device. There are three limiting factors: 1) the number
1928 * bits for the swap offset in the swp_entry_t type and 1927 * of bits for the swap offset in the swp_entry_t type, and
1929 * 2) the number of bits in the a swap pte as defined by 1928 * 2) the number of bits in the swap pte as defined by
1930 * the different architectures. In order to find the 1929 * the different architectures, and 3) the number of free bits
1931 * largest possible bit mask a swap entry with swap type 0 1930 * in an exceptional radix_tree entry. In order to find the
1931 * largest possible bit mask, a swap entry with swap type 0
1932 * and swap offset ~0UL is created, encoded to a swap pte, 1932 * and swap offset ~0UL is created, encoded to a swap pte,
1933 * decoded to a swp_entry_t again and finally the swap 1933 * decoded to a swp_entry_t again, and finally the swap
1934 * offset is extracted. This will mask all the bits from 1934 * offset is extracted. This will mask all the bits from
1935 * the initial ~0UL mask that can't be encoded in either 1935 * the initial ~0UL mask that can't be encoded in either
1936 * the swp_entry_t or the architecture definition of a 1936 * the swp_entry_t or the architecture definition of a
1937 * swap pte. 1937 * swap pte. Then the same is done for a radix_tree entry.
1938 */ 1938 */
1939 maxpages = swp_offset(pte_to_swp_entry( 1939 maxpages = swp_offset(pte_to_swp_entry(
1940 swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; 1940 swp_entry_to_pte(swp_entry(0, ~0UL))));
1941 maxpages = swp_offset(radix_to_swp_entry(
1942 swp_to_radix_entry(swp_entry(0, maxpages)))) + 1;
1943
1941 if (maxpages > swap_header->info.last_page) { 1944 if (maxpages > swap_header->info.last_page) {
1942 maxpages = swap_header->info.last_page + 1; 1945 maxpages = swap_header->info.last_page + 1;
1943 /* p->max is an unsigned int: don't overflow it */ 1946 /* p->max is an unsigned int: don't overflow it */
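
The updated read_swap_header() derives maxpages by round-tripping an all-ones offset through every encoding a swap entry has to survive: first the architecture's swap pte, then (new in this patch) the exceptional radix_tree entry used by shmem, so whichever encoding keeps the fewest offset bits ends up limiting the device size. The masking idea with two hypothetical encoders of made-up widths:

#include <stdio.h>

/* Hypothetical encodings that can only keep a limited number of offset bits. */
static unsigned long pte_roundtrip(unsigned long off)   { return off & ((1UL << 27) - 1); }
static unsigned long radix_roundtrip(unsigned long off) { return off & ((1UL << 25) - 1); }

int main(void)
{
    unsigned long maxpages;

    maxpages = pte_roundtrip(~0UL);             /* survive the swap pte encoding */
    maxpages = radix_roundtrip(maxpages) + 1;   /* then the radix_tree entry */
    printf("maxpages = %lu (the narrower encoding wins)\n", maxpages);
    return 0;
}
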
diff --git a/mm/thrash.c b/mm/thrash.c
index e53f7d02c17c..57ad495dbd54 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -29,7 +29,7 @@
29 29
30static DEFINE_SPINLOCK(swap_token_lock); 30static DEFINE_SPINLOCK(swap_token_lock);
31struct mm_struct *swap_token_mm; 31struct mm_struct *swap_token_mm;
32struct mem_cgroup *swap_token_memcg; 32static struct mem_cgroup *swap_token_memcg;
33 33
34#ifdef CONFIG_CGROUP_MEM_RES_CTLR 34#ifdef CONFIG_CGROUP_MEM_RES_CTLR
35static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) 35static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
diff --git a/mm/truncate.c b/mm/truncate.c
index 232eb2736a79..632b15e29f74 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -12,7 +12,7 @@
12#include <linux/gfp.h> 12#include <linux/gfp.h>
13#include <linux/mm.h> 13#include <linux/mm.h>
14#include <linux/swap.h> 14#include <linux/swap.h>
15#include <linux/module.h> 15#include <linux/export.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/highmem.h> 17#include <linux/highmem.h>
18#include <linux/pagevec.h> 18#include <linux/pagevec.h>
@@ -336,6 +336,14 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
336 unsigned long count = 0; 336 unsigned long count = 0;
337 int i; 337 int i;
338 338
339 /*
340 * Note: this function may get called on a shmem/tmpfs mapping:
341 * pagevec_lookup() might then return 0 prematurely (because it
342 * got a gangful of swap entries); but it's hardly worth worrying
343 * about - it can rarely have anything to free from such a mapping
344 * (most pages are dirty), and already skips over any difficulties.
345 */
346
339 pagevec_init(&pvec, 0); 347 pagevec_init(&pvec, 0);
340 while (index <= end && pagevec_lookup(&pvec, mapping, index, 348 while (index <= end && pagevec_lookup(&pvec, mapping, index,
341 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { 349 min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
diff --git a/mm/util.c b/mm/util.c
index 88ea1bd661c0..136ac4f322b8 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,7 +1,7 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/slab.h> 2#include <linux/slab.h>
3#include <linux/string.h> 3#include <linux/string.h>
4#include <linux/module.h> 4#include <linux/export.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <asm/uaccess.h> 7#include <asm/uaccess.h>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 464621d18eb2..b669aa6f6caf 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -725,9 +725,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr)
725#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) 725#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
726#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ 726#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
727#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ 727#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
728#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ 728#define VMAP_BBMAP_BITS \
729 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ 729 VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
730 VMALLOC_PAGES / NR_CPUS / 16)) 730 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
731 VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
731 732
732#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) 733#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
733 734
@@ -1252,18 +1253,22 @@ EXPORT_SYMBOL_GPL(map_vm_area);
1252DEFINE_RWLOCK(vmlist_lock); 1253DEFINE_RWLOCK(vmlist_lock);
1253struct vm_struct *vmlist; 1254struct vm_struct *vmlist;
1254 1255
1255static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, 1256static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1256 unsigned long flags, void *caller) 1257 unsigned long flags, void *caller)
1257{ 1258{
1258 struct vm_struct *tmp, **p;
1259
1260 vm->flags = flags; 1259 vm->flags = flags;
1261 vm->addr = (void *)va->va_start; 1260 vm->addr = (void *)va->va_start;
1262 vm->size = va->va_end - va->va_start; 1261 vm->size = va->va_end - va->va_start;
1263 vm->caller = caller; 1262 vm->caller = caller;
1264 va->private = vm; 1263 va->private = vm;
1265 va->flags |= VM_VM_AREA; 1264 va->flags |= VM_VM_AREA;
1265}
1266 1266
1267static void insert_vmalloc_vmlist(struct vm_struct *vm)
1268{
1269 struct vm_struct *tmp, **p;
1270
1271 vm->flags &= ~VM_UNLIST;
1267 write_lock(&vmlist_lock); 1272 write_lock(&vmlist_lock);
1268 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { 1273 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1269 if (tmp->addr >= vm->addr) 1274 if (tmp->addr >= vm->addr)
@@ -1274,6 +1279,13 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1274 write_unlock(&vmlist_lock); 1279 write_unlock(&vmlist_lock);
1275} 1280}
1276 1281
1282static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1283 unsigned long flags, void *caller)
1284{
1285 setup_vmalloc_vm(vm, va, flags, caller);
1286 insert_vmalloc_vmlist(vm);
1287}
1288
1277static struct vm_struct *__get_vm_area_node(unsigned long size, 1289static struct vm_struct *__get_vm_area_node(unsigned long size,
1278 unsigned long align, unsigned long flags, unsigned long start, 1290 unsigned long align, unsigned long flags, unsigned long start,
1279 unsigned long end, int node, gfp_t gfp_mask, void *caller) 1291 unsigned long end, int node, gfp_t gfp_mask, void *caller)
@@ -1312,7 +1324,18 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1312 return NULL; 1324 return NULL;
1313 } 1325 }
1314 1326
1315 insert_vmalloc_vm(area, va, flags, caller); 1327 /*
1328 * When this function is called from __vmalloc_node_range,
1329 * we do not add vm_struct to vmlist here to avoid
1330 * accessing uninitialized members of vm_struct such as
1331 * pages and nr_pages fields. They will be set later.
1332 * To distinguish it from others, we use a VM_UNLIST flag.
1333 */
1334 if (flags & VM_UNLIST)
1335 setup_vmalloc_vm(area, va, flags, caller);
1336 else
1337 insert_vmalloc_vm(area, va, flags, caller);
1338
1316 return area; 1339 return area;
1317} 1340}
1318 1341
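
The VM_UNLIST change splits insert_vmalloc_vm() so that __vmalloc_node_range() can finish initializing the vm_struct (its pages and nr_pages fields) before the area ever appears on vmlist; anything walking the list can then never observe a half-built entry. The initialize-then-publish pattern in miniature (illustrative names; the kernel takes vmlist_lock around the publish step):

#include <stdio.h>
#include <stddef.h>

struct area {
    void *addr;
    int nr_pages;            /* would be garbage if published too early */
    struct area *next;
};

static struct area *arealist;    /* models vmlist; readers traverse it */

static void setup_area(struct area *a, void *addr, int nr_pages)
{
    a->addr = addr;          /* fully initialize first... */
    a->nr_pages = nr_pages;
}

static void publish_area(struct area *a)
{
    a->next = arealist;      /* ...then link it where others can see it */
    arealist = a;
}

int main(void)
{
    static char backing[4096];
    struct area a;

    setup_area(&a, backing, 1);   /* VM_UNLIST phase: visible to nobody yet */
    publish_area(&a);             /* insert_vmalloc_vmlist() equivalent */
    printf("published %p with %d page(s)\n", arealist->addr, arealist->nr_pages);
    return 0;
}
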
@@ -1380,17 +1403,20 @@ struct vm_struct *remove_vm_area(const void *addr)
1380 va = find_vmap_area((unsigned long)addr); 1403 va = find_vmap_area((unsigned long)addr);
1381 if (va && va->flags & VM_VM_AREA) { 1404 if (va && va->flags & VM_VM_AREA) {
1382 struct vm_struct *vm = va->private; 1405 struct vm_struct *vm = va->private;
1383 struct vm_struct *tmp, **p; 1406
1384 /* 1407 if (!(vm->flags & VM_UNLIST)) {
1385 * remove from list and disallow access to this vm_struct 1408 struct vm_struct *tmp, **p;
1386 * before unmap. (address range confliction is maintained by 1409 /*
1387 * vmap.) 1410 * remove from list and disallow access to
1388 */ 1411 * this vm_struct before unmap. (address range
1389 write_lock(&vmlist_lock); 1412 * confliction is maintained by vmap.)
1390 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) 1413 */
1391 ; 1414 write_lock(&vmlist_lock);
1392 *p = tmp->next; 1415 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1393 write_unlock(&vmlist_lock); 1416 ;
1417 *p = tmp->next;
1418 write_unlock(&vmlist_lock);
1419 }
1394 1420
1395 vmap_debug_free_range(va->va_start, va->va_end); 1421 vmap_debug_free_range(va->va_start, va->va_end);
1396 free_unmap_vmap_area(va); 1422 free_unmap_vmap_area(va);
@@ -1567,8 +1593,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1567 return area->addr; 1593 return area->addr;
1568 1594
1569fail: 1595fail:
1570 warn_alloc_failed(gfp_mask, order, "vmalloc: allocation failure, " 1596 warn_alloc_failed(gfp_mask, order,
1571 "allocated %ld of %ld bytes\n", 1597 "vmalloc: allocation failure, allocated %ld of %ld bytes\n",
1572 (area->nr_pages*PAGE_SIZE), area->size); 1598 (area->nr_pages*PAGE_SIZE), area->size);
1573 vfree(area->addr); 1599 vfree(area->addr);
1574 return NULL; 1600 return NULL;
@@ -1599,17 +1625,22 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1599 1625
1600 size = PAGE_ALIGN(size); 1626 size = PAGE_ALIGN(size);
1601 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1627 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1602 return NULL; 1628 goto fail;
1603
1604 area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node,
1605 gfp_mask, caller);
1606 1629
1630 area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
1631 start, end, node, gfp_mask, caller);
1607 if (!area) 1632 if (!area)
1608 return NULL; 1633 goto fail;
1609 1634
1610 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); 1635 addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
1611 1636
1612 /* 1637 /*
1638 * In this function, newly allocated vm_struct is not added
1639 * to vmlist at __get_vm_area_node(). so, it is added here.
1640 */
1641 insert_vmalloc_vmlist(area);
1642
1643 /*
1613 * A ref_count = 3 is needed because the vm_struct and vmap_area 1644 * A ref_count = 3 is needed because the vm_struct and vmap_area
1614 * structures allocated in the __get_vm_area_node() function contain 1645 * structures allocated in the __get_vm_area_node() function contain
1615 * references to the virtual address of the vmalloc'ed block. 1646 * references to the virtual address of the vmalloc'ed block.
@@ -1617,6 +1648,12 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
1617 kmemleak_alloc(addr, real_size, 3, gfp_mask); 1648 kmemleak_alloc(addr, real_size, 3, gfp_mask);
1618 1649
1619 return addr; 1650 return addr;
1651
1652fail:
1653 warn_alloc_failed(gfp_mask, 0,
1654 "vmalloc: allocation failure: %lu bytes\n",
1655 real_size);
1656 return NULL;
1620} 1657}
1621 1658
1622/** 1659/**
@@ -2139,6 +2176,14 @@ struct vm_struct *alloc_vm_area(size_t size)
2139 return NULL; 2176 return NULL;
2140 } 2177 }
2141 2178
2179 /*
2180 * If the allocated address space is passed to a hypercall
2181 * before being used then we cannot rely on a page fault to
2182 * trigger an update of the page tables. So sync all the page
2183 * tables here.
2184 */
2185 vmalloc_sync_all();
2186
2142 return area; 2187 return area;
2143} 2188}
2144EXPORT_SYMBOL_GPL(alloc_vm_area); 2189EXPORT_SYMBOL_GPL(alloc_vm_area);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7ef69124fa3e..a1893c050795 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -105,7 +105,6 @@ struct scan_control {
105 105
106 /* Which cgroup do we reclaim from */ 106 /* Which cgroup do we reclaim from */
107 struct mem_cgroup *mem_cgroup; 107 struct mem_cgroup *mem_cgroup;
108 struct memcg_scanrecord *memcg_record;
109 108
110 /* 109 /*
111 * Nodemask of nodes allowed by the caller. If NULL, all nodes 110 * Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -496,15 +495,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
496 return PAGE_ACTIVATE; 495 return PAGE_ACTIVATE;
497 } 496 }
498 497
499 /*
500 * Wait on writeback if requested to. This happens when
501 * direct reclaiming a large contiguous area and the
502 * first attempt to free a range of pages fails.
503 */
504 if (PageWriteback(page) &&
505 (sc->reclaim_mode & RECLAIM_MODE_SYNC))
506 wait_on_page_writeback(page);
507
508 if (!PageWriteback(page)) { 498 if (!PageWriteback(page)) {
509 /* synchronous write or broken a_ops? */ 499 /* synchronous write or broken a_ops? */
510 ClearPageReclaim(page); 500 ClearPageReclaim(page);
@@ -643,13 +633,14 @@ redo:
643 lru = LRU_UNEVICTABLE; 633 lru = LRU_UNEVICTABLE;
644 add_page_to_unevictable_list(page); 634 add_page_to_unevictable_list(page);
645 /* 635 /*
646 * When racing with an mlock clearing (page is 636 * When racing with an mlock or AS_UNEVICTABLE clearing
647 * unlocked), make sure that if the other thread does 637 * (page is unlocked) make sure that if the other thread
648 * not observe our setting of PG_lru and fails 638 * does not observe our setting of PG_lru and fails
649 * isolation, we see PG_mlocked cleared below and move 639 * isolation/check_move_unevictable_page,
640 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
650 * the page back to the evictable list. 641 * the page back to the evictable list.
651 * 642 *
652 * The other side is TestClearPageMlocked(). 643 * The other side is TestClearPageMlocked() or shmem_lock().
653 */ 644 */
654 smp_mb(); 645 smp_mb();
655 } 646 }
@@ -760,7 +751,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
760 */ 751 */
761static unsigned long shrink_page_list(struct list_head *page_list, 752static unsigned long shrink_page_list(struct list_head *page_list,
762 struct zone *zone, 753 struct zone *zone,
763 struct scan_control *sc) 754 struct scan_control *sc,
755 int priority,
756 unsigned long *ret_nr_dirty,
757 unsigned long *ret_nr_writeback)
764{ 758{
765 LIST_HEAD(ret_pages); 759 LIST_HEAD(ret_pages);
766 LIST_HEAD(free_pages); 760 LIST_HEAD(free_pages);
@@ -768,6 +762,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
768 unsigned long nr_dirty = 0; 762 unsigned long nr_dirty = 0;
769 unsigned long nr_congested = 0; 763 unsigned long nr_congested = 0;
770 unsigned long nr_reclaimed = 0; 764 unsigned long nr_reclaimed = 0;
765 unsigned long nr_writeback = 0;
771 766
772 cond_resched(); 767 cond_resched();
773 768
@@ -804,13 +799,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
804 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 799 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
805 800
806 if (PageWriteback(page)) { 801 if (PageWriteback(page)) {
802 nr_writeback++;
807 /* 803 /*
808 * Synchronous reclaim is performed in two passes, 804 * Synchronous reclaim cannot queue pages for
809 * first an asynchronous pass over the list to 805 * writeback due to the possibility of stack overflow
810 * start parallel writeback, and a second synchronous 806 * but if it encounters a page under writeback, wait
811 * pass to wait for the IO to complete. Wait here 807 * for the IO to complete.
812 * for any page for which writeback has already
813 * started.
814 */ 808 */
815 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && 809 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
816 may_enter_fs) 810 may_enter_fs)
@@ -866,6 +860,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
866 if (PageDirty(page)) { 860 if (PageDirty(page)) {
867 nr_dirty++; 861 nr_dirty++;
868 862
863 /*
864 * Only kswapd can writeback filesystem pages to
865 * avoid risk of stack overflow but do not writeback
866 * unless under significant pressure.
867 */
868 if (page_is_file_cache(page) &&
869 (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
870 /*
871 * Immediately reclaim when written back.
872 * Similar in principle to deactivate_page()
873 * except we already have the page isolated
874 * and know it's dirty
875 */
876 inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
877 SetPageReclaim(page);
878
879 goto keep_locked;
880 }
881
869 if (references == PAGEREF_RECLAIM_CLEAN) 882 if (references == PAGEREF_RECLAIM_CLEAN)
870 goto keep_locked; 883 goto keep_locked;
871 if (!may_enter_fs) 884 if (!may_enter_fs)
@@ -1000,6 +1013,8 @@ keep_lumpy:
1000 1013
1001 list_splice(&ret_pages, page_list); 1014 list_splice(&ret_pages, page_list);
1002 count_vm_events(PGACTIVATE, pgactivate); 1015 count_vm_events(PGACTIVATE, pgactivate);
1016 *ret_nr_dirty += nr_dirty;
1017 *ret_nr_writeback += nr_writeback;
1003 return nr_reclaimed; 1018 return nr_reclaimed;
1004} 1019}
1005 1020
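
The dirty-page handling added above encodes a policy: only kswapd may start filesystem writeback from reclaim (direct reclaimers risk overflowing the stack), and even kswapd holds off until the scanning priority has dropped a couple of levels; everybody else tags the page with SetPageReclaim(), bumps NR_VMSCAN_IMMEDIATE and skips it. That policy as one boolean helper; DEF_PRIORITY is assumed to be 12, as in kernels of this vintage:

#include <stdio.h>
#include <stdbool.h>

#define DEF_PRIORITY 12   /* assumption: matches include/linux/mmzone.h here */

/* Inverse of the skip test: !current_is_kswapd() || priority >= DEF_PRIORITY - 2 */
static bool may_writeback_file_page(bool is_kswapd, int priority)
{
    return is_kswapd && priority < DEF_PRIORITY - 2;
}

int main(void)
{
    printf("direct reclaim, priority 12: %d\n", may_writeback_file_page(false, 12));
    printf("kswapd,         priority 12: %d\n", may_writeback_file_page(true, 12));
    printf("kswapd,         priority 6:  %d\n", may_writeback_file_page(true, 6));
    return 0;
}
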
@@ -1013,23 +1028,27 @@ keep_lumpy:
1013 * 1028 *
1014 * returns 0 on success, -ve errno on failure. 1029 * returns 0 on success, -ve errno on failure.
1015 */ 1030 */
1016int __isolate_lru_page(struct page *page, int mode, int file) 1031int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1017{ 1032{
1033 bool all_lru_mode;
1018 int ret = -EINVAL; 1034 int ret = -EINVAL;
1019 1035
1020 /* Only take pages on the LRU. */ 1036 /* Only take pages on the LRU. */
1021 if (!PageLRU(page)) 1037 if (!PageLRU(page))
1022 return ret; 1038 return ret;
1023 1039
1040 all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
1041 (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
1042
1024 /* 1043 /*
1025 * When checking the active state, we need to be sure we are 1044 * When checking the active state, we need to be sure we are
1026 * dealing with comparible boolean values. Take the logical not 1045 * dealing with comparible boolean values. Take the logical not
1027 * of each. 1046 * of each.
1028 */ 1047 */
1029 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 1048 if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
1030 return ret; 1049 return ret;
1031 1050
1032 if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) 1051 if (!all_lru_mode && !!page_is_file_cache(page) != file)
1033 return ret; 1052 return ret;
1034 1053
1035 /* 1054 /*
@@ -1042,6 +1061,12 @@ int __isolate_lru_page(struct page *page, int mode, int file)
1042 1061
1043 ret = -EBUSY; 1062 ret = -EBUSY;
1044 1063
1064 if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page)))
1065 return ret;
1066
1067 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1068 return ret;
1069
1045 if (likely(get_page_unless_zero(page))) { 1070 if (likely(get_page_unless_zero(page))) {
1046 /* 1071 /*
1047 * Be careful not to clear PageLRU until after we're 1072 * Be careful not to clear PageLRU until after we're
@@ -1077,7 +1102,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
1077 */ 1102 */
1078static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1103static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1079 struct list_head *src, struct list_head *dst, 1104 struct list_head *src, struct list_head *dst,
1080 unsigned long *scanned, int order, int mode, int file) 1105 unsigned long *scanned, int order, isolate_mode_t mode,
1106 int file)
1081{ 1107{
1082 unsigned long nr_taken = 0; 1108 unsigned long nr_taken = 0;
1083 unsigned long nr_lumpy_taken = 0; 1109 unsigned long nr_lumpy_taken = 0;
@@ -1202,8 +1228,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1202static unsigned long isolate_pages_global(unsigned long nr, 1228static unsigned long isolate_pages_global(unsigned long nr,
1203 struct list_head *dst, 1229 struct list_head *dst,
1204 unsigned long *scanned, int order, 1230 unsigned long *scanned, int order,
1205 int mode, struct zone *z, 1231 isolate_mode_t mode,
1206 int active, int file) 1232 struct zone *z, int active, int file)
1207{ 1233{
1208 int lru = LRU_BASE; 1234 int lru = LRU_BASE;
1209 if (active) 1235 if (active)
@@ -1349,8 +1375,6 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
1349 int file = is_file_lru(lru); 1375 int file = is_file_lru(lru);
1350 int numpages = hpage_nr_pages(page); 1376 int numpages = hpage_nr_pages(page);
1351 reclaim_stat->recent_rotated[file] += numpages; 1377 reclaim_stat->recent_rotated[file] += numpages;
1352 if (!scanning_global_lru(sc))
1353 sc->memcg_record->nr_rotated[file] += numpages;
1354 } 1378 }
1355 if (!pagevec_add(&pvec, page)) { 1379 if (!pagevec_add(&pvec, page)) {
1356 spin_unlock_irq(&zone->lru_lock); 1380 spin_unlock_irq(&zone->lru_lock);
@@ -1394,14 +1418,10 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
1394 1418
1395 reclaim_stat->recent_scanned[0] += *nr_anon; 1419 reclaim_stat->recent_scanned[0] += *nr_anon;
1396 reclaim_stat->recent_scanned[1] += *nr_file; 1420 reclaim_stat->recent_scanned[1] += *nr_file;
1397 if (!scanning_global_lru(sc)) {
1398 sc->memcg_record->nr_scanned[0] += *nr_anon;
1399 sc->memcg_record->nr_scanned[1] += *nr_file;
1400 }
1401} 1421}
1402 1422
1403/* 1423/*
1404 * Returns true if the caller should wait to clean dirty/writeback pages. 1424 * Returns true if a direct reclaim should wait on pages under writeback.
1405 * 1425 *
1406 * If we are direct reclaiming for contiguous pages and we do not reclaim 1426 * If we are direct reclaiming for contiguous pages and we do not reclaim
1407 * everything in the list, try again and wait for writeback IO to complete. 1427 * everything in the list, try again and wait for writeback IO to complete.
@@ -1423,7 +1443,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1423 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) 1443 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1424 return false; 1444 return false;
1425 1445
1426 /* If we have relaimed everything on the isolated list, no stall */ 1446 /* If we have reclaimed everything on the isolated list, no stall */
1427 if (nr_freed == nr_taken) 1447 if (nr_freed == nr_taken)
1428 return false; 1448 return false;
1429 1449
@@ -1455,6 +1475,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1455 unsigned long nr_taken; 1475 unsigned long nr_taken;
1456 unsigned long nr_anon; 1476 unsigned long nr_anon;
1457 unsigned long nr_file; 1477 unsigned long nr_file;
1478 unsigned long nr_dirty = 0;
1479 unsigned long nr_writeback = 0;
1480 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
1458 1481
1459 while (unlikely(too_many_isolated(zone, file, sc))) { 1482 while (unlikely(too_many_isolated(zone, file, sc))) {
1460 congestion_wait(BLK_RW_ASYNC, HZ/10); 1483 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1465,15 +1488,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1465 } 1488 }
1466 1489
1467 set_reclaim_mode(priority, sc, false); 1490 set_reclaim_mode(priority, sc, false);
1491 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1492 reclaim_mode |= ISOLATE_ACTIVE;
1493
1468 lru_add_drain(); 1494 lru_add_drain();
1495
1496 if (!sc->may_unmap)
1497 reclaim_mode |= ISOLATE_UNMAPPED;
1498 if (!sc->may_writepage)
1499 reclaim_mode |= ISOLATE_CLEAN;
1500
1469 spin_lock_irq(&zone->lru_lock); 1501 spin_lock_irq(&zone->lru_lock);
1470 1502
1471 if (scanning_global_lru(sc)) { 1503 if (scanning_global_lru(sc)) {
1472 nr_taken = isolate_pages_global(nr_to_scan, 1504 nr_taken = isolate_pages_global(nr_to_scan, &page_list,
1473 &page_list, &nr_scanned, sc->order, 1505 &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
1474 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1475 ISOLATE_BOTH : ISOLATE_INACTIVE,
1476 zone, 0, file);
1477 zone->pages_scanned += nr_scanned; 1506 zone->pages_scanned += nr_scanned;
1478 if (current_is_kswapd()) 1507 if (current_is_kswapd())
1479 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1508 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1482,12 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1482 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1511 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1483 nr_scanned); 1512 nr_scanned);
1484 } else { 1513 } else {
1485 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1514 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
1486 &page_list, &nr_scanned, sc->order, 1515 &nr_scanned, sc->order, reclaim_mode, zone,
1487 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? 1516 sc->mem_cgroup, 0, file);
1488 ISOLATE_BOTH : ISOLATE_INACTIVE,
1489 zone, sc->mem_cgroup,
1490 0, file);
1491 /* 1517 /*
1492 * mem_cgroup_isolate_pages() keeps track of 1518 * mem_cgroup_isolate_pages() keeps track of
1493 * scanned pages on its own. 1519 * scanned pages on its own.
@@ -1503,17 +1529,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1503 1529
1504 spin_unlock_irq(&zone->lru_lock); 1530 spin_unlock_irq(&zone->lru_lock);
1505 1531
1506 nr_reclaimed = shrink_page_list(&page_list, zone, sc); 1532 nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
1533 &nr_dirty, &nr_writeback);
1507 1534
1508	/* Check if we should synchronously wait for writeback */ 1535	/* Check if we should synchronously wait for writeback */
1509 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1536 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1510 set_reclaim_mode(priority, sc, true); 1537 set_reclaim_mode(priority, sc, true);
1511 nr_reclaimed += shrink_page_list(&page_list, zone, sc); 1538 nr_reclaimed += shrink_page_list(&page_list, zone, sc,
1539 priority, &nr_dirty, &nr_writeback);
1512 } 1540 }
1513 1541
1514 if (!scanning_global_lru(sc))
1515 sc->memcg_record->nr_freed[file] += nr_reclaimed;
1516
1517 local_irq_disable(); 1542 local_irq_disable();
1518 if (current_is_kswapd()) 1543 if (current_is_kswapd())
1519 __count_vm_events(KSWAPD_STEAL, nr_reclaimed); 1544 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
@@ -1521,6 +1546,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1521 1546
1522 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); 1547 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1523 1548
1549 /*
1550 * If reclaim is isolating dirty pages under writeback, it implies
1551 * that the long-lived page allocation rate is exceeding the page
1552 * laundering rate. Either the global limits are not being effective
1553 * at throttling processes due to the page distribution throughout
1554 * zones or there is heavy usage of a slow backing device. The
1555 * only option is to throttle from reclaim context which is not ideal
1556 * as there is no guarantee the dirtying process is throttled in the
1557 * same way balance_dirty_pages() manages.
1558 *
1559 * This scales the number of dirty pages that must be under writeback
1560 * before throttling depending on priority. It is a simple backoff
1561 * function that has the most effect in the range DEF_PRIORITY to
1562	 * DEF_PRIORITY-2, which is the range in which reclaim is considered
1563	 * to be in trouble.
1564 *
1565 * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle
1566 * DEF_PRIORITY-1 50% must be PageWriteback
1567 * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble
1568 * ...
1569 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
1570 * isolated page is PageWriteback
1571 */
1572 if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
1573 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1574
1524 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1575 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1525 zone_idx(zone), 1576 zone_idx(zone),
1526 nr_scanned, nr_reclaimed, 1577 nr_scanned, nr_reclaimed,
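
The comment and check added at the end of this hunk implement a simple backoff: throttle once nr_writeback >= nr_taken >> (DEF_PRIORITY - priority). The short worked example below just evaluates that expression for a SWAP_CLUSTER_MAX-sized batch, using the kernel's values of 32 and DEF_PRIORITY 12; remember the kernel additionally requires nr_writeback to be non-zero, so a computed threshold of 0 means "any page under writeback".

#include <stdio.h>

#define DEF_PRIORITY     12
#define SWAP_CLUSTER_MAX 32UL

int main(void)
{
        unsigned long nr_taken = SWAP_CLUSTER_MAX;
        int priority;

        for (priority = DEF_PRIORITY; priority >= DEF_PRIORITY - 6; priority--) {
                /* throttle when nr_writeback >= threshold (and != 0) */
                unsigned long threshold = nr_taken >> (DEF_PRIORITY - priority);

                printf("priority %2d: throttle once %2lu of %lu isolated "
                       "pages are PageWriteback\n",
                       priority, threshold, nr_taken);
        }
        return 0;
}
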
@@ -1592,19 +1643,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1592 struct page *page; 1643 struct page *page;
1593 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1644 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1594 unsigned long nr_rotated = 0; 1645 unsigned long nr_rotated = 0;
1646 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
1595 1647
1596 lru_add_drain(); 1648 lru_add_drain();
1649
1650 if (!sc->may_unmap)
1651 reclaim_mode |= ISOLATE_UNMAPPED;
1652 if (!sc->may_writepage)
1653 reclaim_mode |= ISOLATE_CLEAN;
1654
1597 spin_lock_irq(&zone->lru_lock); 1655 spin_lock_irq(&zone->lru_lock);
1598 if (scanning_global_lru(sc)) { 1656 if (scanning_global_lru(sc)) {
1599 nr_taken = isolate_pages_global(nr_pages, &l_hold, 1657 nr_taken = isolate_pages_global(nr_pages, &l_hold,
1600 &pgscanned, sc->order, 1658 &pgscanned, sc->order,
1601 ISOLATE_ACTIVE, zone, 1659 reclaim_mode, zone,
1602 1, file); 1660 1, file);
1603 zone->pages_scanned += pgscanned; 1661 zone->pages_scanned += pgscanned;
1604 } else { 1662 } else {
1605 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, 1663 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
1606 &pgscanned, sc->order, 1664 &pgscanned, sc->order,
1607 ISOLATE_ACTIVE, zone, 1665 reclaim_mode, zone,
1608 sc->mem_cgroup, 1, file); 1666 sc->mem_cgroup, 1, file);
1609 /* 1667 /*
1610 * mem_cgroup_isolate_pages() keeps track of 1668 * mem_cgroup_isolate_pages() keeps track of
@@ -1613,8 +1671,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1613 } 1671 }
1614 1672
1615 reclaim_stat->recent_scanned[file] += nr_taken; 1673 reclaim_stat->recent_scanned[file] += nr_taken;
1616 if (!scanning_global_lru(sc))
1617 sc->memcg_record->nr_scanned[file] += nr_taken;
1618 1674
1619 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1675 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1620 if (file) 1676 if (file)
@@ -1666,8 +1722,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1666 * get_scan_ratio. 1722 * get_scan_ratio.
1667 */ 1723 */
1668 reclaim_stat->recent_rotated[file] += nr_rotated; 1724 reclaim_stat->recent_rotated[file] += nr_rotated;
1669 if (!scanning_global_lru(sc))
1670 sc->memcg_record->nr_rotated[file] += nr_rotated;
1671 1725
1672 move_active_pages_to_lru(zone, &l_active, 1726 move_active_pages_to_lru(zone, &l_active,
1673 LRU_ACTIVE + file * LRU_FILE); 1727 LRU_ACTIVE + file * LRU_FILE);
@@ -1713,7 +1767,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1713 if (scanning_global_lru(sc)) 1767 if (scanning_global_lru(sc))
1714 low = inactive_anon_is_low_global(zone); 1768 low = inactive_anon_is_low_global(zone);
1715 else 1769 else
1716 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); 1770 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
1717 return low; 1771 return low;
1718} 1772}
1719#else 1773#else
@@ -1756,7 +1810,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
1756 if (scanning_global_lru(sc)) 1810 if (scanning_global_lru(sc))
1757 low = inactive_file_is_low_global(zone); 1811 low = inactive_file_is_low_global(zone);
1758 else 1812 else
1759 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); 1813 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
1760 return low; 1814 return low;
1761} 1815}
1762 1816
@@ -1808,23 +1862,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1808 u64 fraction[2], denominator; 1862 u64 fraction[2], denominator;
1809 enum lru_list l; 1863 enum lru_list l;
1810 int noswap = 0; 1864 int noswap = 0;
1811 int force_scan = 0; 1865 bool force_scan = false;
1812 unsigned long nr_force_scan[2];
1813
1814 1866
1815 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + 1867 /*
1816 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); 1868 * If the zone or memcg is small, nr[l] can be 0. This
1817 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + 1869 * results in no scanning on this priority and a potential
1818 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); 1870 * priority drop. Global direct reclaim can go to the next
1819 1871 * zone and tends to have no problems. Global kswapd is for
1820 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { 1872 * zone balancing and it needs to scan a minimum amount. When
1821 /* kswapd does zone balancing and need to scan this zone */ 1873 * reclaiming for a memcg, a priority drop can cause high
1822 if (scanning_global_lru(sc) && current_is_kswapd()) 1874 * latencies, so it's better to scan a minimum amount there as
1823 force_scan = 1; 1875 * well.
1824 /* memcg may have small limit and need to avoid priority drop */ 1876 */
1825 if (!scanning_global_lru(sc)) 1877 if (scanning_global_lru(sc) && current_is_kswapd())
1826 force_scan = 1; 1878 force_scan = true;
1827 } 1879 if (!scanning_global_lru(sc))
1880 force_scan = true;
1828 1881
1829 /* If we have no swap space, do not bother scanning anon pages. */ 1882 /* If we have no swap space, do not bother scanning anon pages. */
1830 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1883 if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1832,11 +1885,14 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1832 fraction[0] = 0; 1885 fraction[0] = 0;
1833 fraction[1] = 1; 1886 fraction[1] = 1;
1834 denominator = 1; 1887 denominator = 1;
1835 nr_force_scan[0] = 0;
1836 nr_force_scan[1] = SWAP_CLUSTER_MAX;
1837 goto out; 1888 goto out;
1838 } 1889 }
1839 1890
1891 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1892 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1893 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1894 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1895
1840 if (scanning_global_lru(sc)) { 1896 if (scanning_global_lru(sc)) {
1841 free = zone_page_state(zone, NR_FREE_PAGES); 1897 free = zone_page_state(zone, NR_FREE_PAGES);
1842 /* If we have very few page cache pages, 1898 /* If we have very few page cache pages,
@@ -1845,8 +1901,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1845 fraction[0] = 1; 1901 fraction[0] = 1;
1846 fraction[1] = 0; 1902 fraction[1] = 0;
1847 denominator = 1; 1903 denominator = 1;
1848 nr_force_scan[0] = SWAP_CLUSTER_MAX;
1849 nr_force_scan[1] = 0;
1850 goto out; 1904 goto out;
1851 } 1905 }
1852 } 1906 }
@@ -1895,11 +1949,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1895 fraction[0] = ap; 1949 fraction[0] = ap;
1896 fraction[1] = fp; 1950 fraction[1] = fp;
1897 denominator = ap + fp + 1; 1951 denominator = ap + fp + 1;
1898 if (force_scan) {
1899 unsigned long scan = SWAP_CLUSTER_MAX;
1900 nr_force_scan[0] = div64_u64(scan * ap, denominator);
1901 nr_force_scan[1] = div64_u64(scan * fp, denominator);
1902 }
1903out: 1952out:
1904 for_each_evictable_lru(l) { 1953 for_each_evictable_lru(l) {
1905 int file = is_file_lru(l); 1954 int file = is_file_lru(l);
@@ -1908,20 +1957,10 @@ out:
1908 scan = zone_nr_lru_pages(zone, sc, l); 1957 scan = zone_nr_lru_pages(zone, sc, l);
1909 if (priority || noswap) { 1958 if (priority || noswap) {
1910 scan >>= priority; 1959 scan >>= priority;
1960 if (!scan && force_scan)
1961 scan = SWAP_CLUSTER_MAX;
1911 scan = div64_u64(scan * fraction[file], denominator); 1962 scan = div64_u64(scan * fraction[file], denominator);
1912 } 1963 }
1913
1914 /*
1915 * If zone is small or memcg is small, nr[l] can be 0.
1916 * This results no-scan on this priority and priority drop down.
1917 * For global direct reclaim, it can visit next zone and tend
1918 * not to have problems. For global kswapd, it's for zone
1919 * balancing and it need to scan a small amounts. When using
1920 * memcg, priority drop can cause big latency. So, it's better
1921 * to scan small amount. See may_noscan above.
1922 */
1923 if (!scan && force_scan)
1924 scan = nr_force_scan[file];
1925 nr[l] = scan; 1964 nr[l] = scan;
1926 } 1965 }
1927} 1966}
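
The reworked get_scan_count() above drops the nr_force_scan[] bookkeeping: the force_scan case is now handled by bumping a zero scan target up to SWAP_CLUSTER_MAX before the anon/file proportional split. The sketch below reproduces only that arithmetic in userspace; the LRU size, fraction and denominator are made-up inputs, div64_u64() is replaced by plain 64-bit division, and the noswap special case is omitted.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32ULL

static uint64_t scan_target(uint64_t lru_pages, int priority, bool force_scan,
                            uint64_t fraction, uint64_t denominator)
{
        uint64_t scan = lru_pages;

        if (priority) {
                scan >>= priority;
                if (!scan && force_scan)
                        scan = SWAP_CLUSTER_MAX;
                scan = scan * fraction / denominator;  /* div64_u64() in the kernel */
        }
        return scan;
}

int main(void)
{
        /* A small memcg LRU of 1000 pages at priority 12: 1000 >> 12 == 0. */
        printf("without force_scan: %llu pages\n",
               (unsigned long long)scan_target(1000, 12, false, 3, 4));
        printf("with    force_scan: %llu pages\n",
               (unsigned long long)scan_target(1000, 12, true, 3, 4));
        return 0;
}
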
@@ -2000,12 +2039,14 @@ static void shrink_zone(int priority, struct zone *zone,
2000 enum lru_list l; 2039 enum lru_list l;
2001 unsigned long nr_reclaimed, nr_scanned; 2040 unsigned long nr_reclaimed, nr_scanned;
2002 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 2041 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2042 struct blk_plug plug;
2003 2043
2004restart: 2044restart:
2005 nr_reclaimed = 0; 2045 nr_reclaimed = 0;
2006 nr_scanned = sc->nr_scanned; 2046 nr_scanned = sc->nr_scanned;
2007 get_scan_count(zone, sc, nr, priority); 2047 get_scan_count(zone, sc, nr, priority);
2008 2048
2049 blk_start_plug(&plug);
2009 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 2050 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2010 nr[LRU_INACTIVE_FILE]) { 2051 nr[LRU_INACTIVE_FILE]) {
2011 for_each_evictable_lru(l) { 2052 for_each_evictable_lru(l) {
@@ -2029,6 +2070,7 @@ restart:
2029 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 2070 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
2030 break; 2071 break;
2031 } 2072 }
2073 blk_finish_plug(&plug);
2032 sc->nr_reclaimed += nr_reclaimed; 2074 sc->nr_reclaimed += nr_reclaimed;
2033 2075
2034 /* 2076 /*
@@ -2061,14 +2103,19 @@ restart:
2061 * 2103 *
2062 * If a zone is deemed to be full of pinned pages then just give it a light 2104 * If a zone is deemed to be full of pinned pages then just give it a light
2063 * scan then give up on it. 2105 * scan then give up on it.
2106 *
2107 * This function returns true if a zone is being reclaimed for a costly
2108 * high-order allocation and compaction is either ready to begin or deferred.
2109 * This indicates to the caller that it should retry the allocation or fail.
2064 */ 2110 */
2065static void shrink_zones(int priority, struct zonelist *zonelist, 2111static bool shrink_zones(int priority, struct zonelist *zonelist,
2066 struct scan_control *sc) 2112 struct scan_control *sc)
2067{ 2113{
2068 struct zoneref *z; 2114 struct zoneref *z;
2069 struct zone *zone; 2115 struct zone *zone;
2070 unsigned long nr_soft_reclaimed; 2116 unsigned long nr_soft_reclaimed;
2071 unsigned long nr_soft_scanned; 2117 unsigned long nr_soft_scanned;
2118 bool should_abort_reclaim = false;
2072 2119
2073 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2120 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2074 gfp_zone(sc->gfp_mask), sc->nodemask) { 2121 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2083,6 +2130,23 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
2083 continue; 2130 continue;
2084 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2131 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2085 continue; /* Let kswapd poll it */ 2132 continue; /* Let kswapd poll it */
2133 if (COMPACTION_BUILD) {
2134 /*
2135 * If we already have plenty of memory free for
2136 * compaction in this zone, don't free any more.
2137 * Even though compaction is invoked for any
2138 * non-zero order, only frequent costly order
2139 * reclamation is disruptive enough to become a
 2140		 * noticeable problem, like transparent huge page
2141 * allocations.
2142 */
2143 if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
2144 (compaction_suitable(zone, sc->order) ||
2145 compaction_deferred(zone))) {
2146 should_abort_reclaim = true;
2147 continue;
2148 }
2149 }
2086 /* 2150 /*
2087 * This steals pages from memory cgroups over softlimit 2151 * This steals pages from memory cgroups over softlimit
2088 * and returns the number of reclaimed pages and 2152 * and returns the number of reclaimed pages and
@@ -2100,6 +2164,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
2100 2164
2101 shrink_zone(priority, zone, sc); 2165 shrink_zone(priority, zone, sc);
2102 } 2166 }
2167
2168 return should_abort_reclaim;
2103} 2169}
2104 2170
2105static bool zone_reclaimable(struct zone *zone) 2171static bool zone_reclaimable(struct zone *zone)
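
The COMPACTION_BUILD block added to shrink_zones() above, together with the new boolean return value, lets direct reclaim bail out early when a costly high-order allocation could be satisfied by compaction instead of more reclaim. The sketch below reduces compaction_suitable()/compaction_deferred() to booleans supplied by the caller, which is an invention for illustration; PAGE_ALLOC_COSTLY_ORDER is 3 as in the kernel.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER 3

/* Mirrors the early-abort test: skip reclaiming this zone and tell the
 * caller to abort when compaction can (or will shortly) take over. */
static bool skip_zone_for_compaction(int order, bool compaction_built,
                                     bool suitable, bool deferred)
{
        return compaction_built &&
               order > PAGE_ALLOC_COSTLY_ORDER &&
               (suitable || deferred);
}

int main(void)
{
        /* e.g. a transparent-huge-page-sized request with compaction ready */
        printf("order 9, compaction ready: skip=%d\n",
               skip_zone_for_compaction(9, true, true, false));
        /* low orders never trigger the early abort */
        printf("order 1, compaction ready: skip=%d\n",
               skip_zone_for_compaction(1, true, true, false));
        return 0;
}
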
@@ -2164,7 +2230,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2164 sc->nr_scanned = 0; 2230 sc->nr_scanned = 0;
2165 if (!priority) 2231 if (!priority)
2166 disable_swap_token(sc->mem_cgroup); 2232 disable_swap_token(sc->mem_cgroup);
2167 shrink_zones(priority, zonelist, sc); 2233 if (shrink_zones(priority, zonelist, sc))
2234 break;
2235
2168 /* 2236 /*
2169 * Don't shrink slabs when reclaiming memory from 2237 * Don't shrink slabs when reclaiming memory from
2170 * over limit cgroups 2238 * over limit cgroups
@@ -2198,7 +2266,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2198 */ 2266 */
2199 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; 2267 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
2200 if (total_scanned > writeback_threshold) { 2268 if (total_scanned > writeback_threshold) {
2201 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); 2269 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
2270 WB_REASON_TRY_TO_FREE_PAGES);
2202 sc->may_writepage = 1; 2271 sc->may_writepage = 1;
2203 } 2272 }
2204 2273
@@ -2268,10 +2337,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2268#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2337#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2269 2338
2270unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2339unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2271 gfp_t gfp_mask, bool noswap, 2340 gfp_t gfp_mask, bool noswap,
2272 struct zone *zone, 2341 struct zone *zone,
2273 struct memcg_scanrecord *rec, 2342 unsigned long *nr_scanned)
2274 unsigned long *scanned)
2275{ 2343{
2276 struct scan_control sc = { 2344 struct scan_control sc = {
2277 .nr_scanned = 0, 2345 .nr_scanned = 0,
@@ -2281,9 +2349,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2281 .may_swap = !noswap, 2349 .may_swap = !noswap,
2282 .order = 0, 2350 .order = 0,
2283 .mem_cgroup = mem, 2351 .mem_cgroup = mem,
2284 .memcg_record = rec,
2285 }; 2352 };
2286 unsigned long start, end;
2287 2353
2288 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2354 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2289 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2355 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2292,7 +2358,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2292 sc.may_writepage, 2358 sc.may_writepage,
2293 sc.gfp_mask); 2359 sc.gfp_mask);
2294 2360
2295 start = sched_clock();
2296 /* 2361 /*
2297 * NOTE: Although we can get the priority field, using it 2362 * NOTE: Although we can get the priority field, using it
2298 * here is not a good idea, since it limits the pages we can scan. 2363 * here is not a good idea, since it limits the pages we can scan.
@@ -2301,25 +2366,19 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2301 * the priority and make it zero. 2366 * the priority and make it zero.
2302 */ 2367 */
2303 shrink_zone(0, zone, &sc); 2368 shrink_zone(0, zone, &sc);
2304 end = sched_clock();
2305
2306 if (rec)
2307 rec->elapsed += end - start;
2308 *scanned = sc.nr_scanned;
2309 2369
2310 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2370 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2311 2371
2372 *nr_scanned = sc.nr_scanned;
2312 return sc.nr_reclaimed; 2373 return sc.nr_reclaimed;
2313} 2374}
2314 2375
2315unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 2376unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2316 gfp_t gfp_mask, 2377 gfp_t gfp_mask,
2317 bool noswap, 2378 bool noswap)
2318 struct memcg_scanrecord *rec)
2319{ 2379{
2320 struct zonelist *zonelist; 2380 struct zonelist *zonelist;
2321 unsigned long nr_reclaimed; 2381 unsigned long nr_reclaimed;
2322 unsigned long start, end;
2323 int nid; 2382 int nid;
2324 struct scan_control sc = { 2383 struct scan_control sc = {
2325 .may_writepage = !laptop_mode, 2384 .may_writepage = !laptop_mode,
@@ -2328,7 +2387,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2328 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2387 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2329 .order = 0, 2388 .order = 0,
2330 .mem_cgroup = mem_cont, 2389 .mem_cgroup = mem_cont,
2331 .memcg_record = rec,
2332 .nodemask = NULL, /* we don't care the placement */ 2390 .nodemask = NULL, /* we don't care the placement */
2333 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2391 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2334 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 2392 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
@@ -2337,7 +2395,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2337 .gfp_mask = sc.gfp_mask, 2395 .gfp_mask = sc.gfp_mask,
2338 }; 2396 };
2339 2397
2340 start = sched_clock();
2341 /* 2398 /*
2342 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't 2399 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
2343 * take care of from where we get pages. So the node where we start the 2400 * take care of from where we get pages. So the node where we start the
@@ -2352,9 +2409,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2352 sc.gfp_mask); 2409 sc.gfp_mask);
2353 2410
2354 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); 2411 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2355 end = sched_clock();
2356 if (rec)
2357 rec->elapsed += end - start;
2358 2412
2359 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 2413 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2360 2414
@@ -2529,6 +2583,9 @@ loop_again:
2529 high_wmark_pages(zone), 0, 0)) { 2583 high_wmark_pages(zone), 0, 0)) {
2530 end_zone = i; 2584 end_zone = i;
2531 break; 2585 break;
2586 } else {
2587 /* If balanced, clear the congested flag */
2588 zone_clear_flag(zone, ZONE_CONGESTED);
2532 } 2589 }
2533 } 2590 }
2534 if (i < 0) 2591 if (i < 0)
@@ -2719,6 +2776,8 @@ out:
2719 2776
2720 /* If balanced, clear the congested flag */ 2777 /* If balanced, clear the congested flag */
2721 zone_clear_flag(zone, ZONE_CONGESTED); 2778 zone_clear_flag(zone, ZONE_CONGESTED);
2779 if (i <= *classzone_idx)
2780 balanced += zone->present_pages;
2722 } 2781 }
2723 } 2782 }
2724 2783
@@ -2792,7 +2851,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2792static int kswapd(void *p) 2851static int kswapd(void *p)
2793{ 2852{
2794 unsigned long order, new_order; 2853 unsigned long order, new_order;
2854 unsigned balanced_order;
2795 int classzone_idx, new_classzone_idx; 2855 int classzone_idx, new_classzone_idx;
2856 int balanced_classzone_idx;
2796 pg_data_t *pgdat = (pg_data_t*)p; 2857 pg_data_t *pgdat = (pg_data_t*)p;
2797 struct task_struct *tsk = current; 2858 struct task_struct *tsk = current;
2798 2859
@@ -2823,7 +2884,9 @@ static int kswapd(void *p)
2823 set_freezable(); 2884 set_freezable();
2824 2885
2825 order = new_order = 0; 2886 order = new_order = 0;
2887 balanced_order = 0;
2826 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 2888 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2889 balanced_classzone_idx = classzone_idx;
2827 for ( ; ; ) { 2890 for ( ; ; ) {
2828 int ret; 2891 int ret;
2829 2892
@@ -2832,7 +2895,8 @@ static int kswapd(void *p)
2832 * new request of a similar or harder type will succeed soon 2895 * new request of a similar or harder type will succeed soon
2833 * so consider going to sleep on the basis we reclaimed at 2896 * so consider going to sleep on the basis we reclaimed at
2834 */ 2897 */
2835 if (classzone_idx >= new_classzone_idx && order == new_order) { 2898 if (balanced_classzone_idx >= new_classzone_idx &&
2899 balanced_order == new_order) {
2836 new_order = pgdat->kswapd_max_order; 2900 new_order = pgdat->kswapd_max_order;
2837 new_classzone_idx = pgdat->classzone_idx; 2901 new_classzone_idx = pgdat->classzone_idx;
2838 pgdat->kswapd_max_order = 0; 2902 pgdat->kswapd_max_order = 0;
@@ -2847,9 +2911,12 @@ static int kswapd(void *p)
2847 order = new_order; 2911 order = new_order;
2848 classzone_idx = new_classzone_idx; 2912 classzone_idx = new_classzone_idx;
2849 } else { 2913 } else {
2850 kswapd_try_to_sleep(pgdat, order, classzone_idx); 2914 kswapd_try_to_sleep(pgdat, balanced_order,
2915 balanced_classzone_idx);
2851 order = pgdat->kswapd_max_order; 2916 order = pgdat->kswapd_max_order;
2852 classzone_idx = pgdat->classzone_idx; 2917 classzone_idx = pgdat->classzone_idx;
2918 new_order = order;
2919 new_classzone_idx = classzone_idx;
2853 pgdat->kswapd_max_order = 0; 2920 pgdat->kswapd_max_order = 0;
2854 pgdat->classzone_idx = pgdat->nr_zones - 1; 2921 pgdat->classzone_idx = pgdat->nr_zones - 1;
2855 } 2922 }
@@ -2864,7 +2931,9 @@ static int kswapd(void *p)
2864 */ 2931 */
2865 if (!ret) { 2932 if (!ret) {
2866 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 2933 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2867 order = balance_pgdat(pgdat, order, &classzone_idx); 2934 balanced_classzone_idx = classzone_idx;
2935 balanced_order = balance_pgdat(pgdat, order,
2936 &balanced_classzone_idx);
2868 } 2937 }
2869 } 2938 }
2870 return 0; 2939 return 0;
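
The kswapd changes above make the sleep/wakeup logic compare any pending request against the order and classzone index that balance_pgdat() actually managed to balance, rather than the ones that were merely requested. The snippet below only evaluates that comparison for two cases; the zone index 2 is an arbitrary illustrative value.

#include <stdbool.h>
#include <stdio.h>

/* The test at the top of kswapd's loop: did the completed balancing cover
 * the pending request, so a fresh one may be picked up from the pgdat? */
static bool balanced_covers_request(int balanced_order, int balanced_classzone_idx,
                                    int new_order, int new_classzone_idx)
{
        return balanced_classzone_idx >= new_classzone_idx &&
               balanced_order == new_order;
}

int main(void)
{
        /* asked for order 3 up to zone index 2, but only order 0 was balanced */
        printf("covered=%d\n", balanced_covers_request(0, 2, 3, 2));
        /* the request was balanced exactly as asked */
        printf("covered=%d\n", balanced_covers_request(3, 2, 3, 2));
        return 0;
}
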
@@ -3376,66 +3445,12 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
3376 3445
3377} 3446}
3378 3447
3379/** 3448static void warn_scan_unevictable_pages(void)
3380 * scan_zone_unevictable_pages - check unevictable list for evictable pages
3381 * @zone - zone of which to scan the unevictable list
3382 *
3383 * Scan @zone's unevictable LRU lists to check for pages that have become
3384 * evictable. Move those that have to @zone's inactive list where they
3385 * become candidates for reclaim, unless shrink_inactive_zone() decides
3386 * to reactivate them. Pages that are still unevictable are rotated
3387 * back onto @zone's unevictable list.
3388 */
3389#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
3390static void scan_zone_unevictable_pages(struct zone *zone)
3391{ 3449{
3392 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; 3450 printk_once(KERN_WARNING
3393 unsigned long scan; 3451 "The scan_unevictable_pages sysctl/node-interface has been "
3394 unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); 3452 "disabled for lack of a legitimate use case. If you have "
3395 3453 "one, please send an email to linux-mm@kvack.org.\n");
3396 while (nr_to_scan > 0) {
3397 unsigned long batch_size = min(nr_to_scan,
3398 SCAN_UNEVICTABLE_BATCH_SIZE);
3399
3400 spin_lock_irq(&zone->lru_lock);
3401 for (scan = 0; scan < batch_size; scan++) {
3402 struct page *page = lru_to_page(l_unevictable);
3403
3404 if (!trylock_page(page))
3405 continue;
3406
3407 prefetchw_prev_lru_page(page, l_unevictable, flags);
3408
3409 if (likely(PageLRU(page) && PageUnevictable(page)))
3410 check_move_unevictable_page(page, zone);
3411
3412 unlock_page(page);
3413 }
3414 spin_unlock_irq(&zone->lru_lock);
3415
3416 nr_to_scan -= batch_size;
3417 }
3418}
3419
3420
3421/**
3422 * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
3423 *
3424 * A really big hammer: scan all zones' unevictable LRU lists to check for
3425 * pages that have become evictable. Move those back to the zones'
3426 * inactive list where they become candidates for reclaim.
3427 * This occurs when, e.g., we have unswappable pages on the unevictable lists,
3428 * and we add swap to the system. As such, it runs in the context of a task
3429 * that has possibly/probably made some previously unevictable pages
3430 * evictable.
3431 */
3432static void scan_all_zones_unevictable_pages(void)
3433{
3434 struct zone *zone;
3435
3436 for_each_zone(zone) {
3437 scan_zone_unevictable_pages(zone);
3438 }
3439} 3454}
3440 3455
3441/* 3456/*
@@ -3448,11 +3463,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
3448 void __user *buffer, 3463 void __user *buffer,
3449 size_t *length, loff_t *ppos) 3464 size_t *length, loff_t *ppos)
3450{ 3465{
3466 warn_scan_unevictable_pages();
3451 proc_doulongvec_minmax(table, write, buffer, length, ppos); 3467 proc_doulongvec_minmax(table, write, buffer, length, ppos);
3452
3453 if (write && *(unsigned long *)table->data)
3454 scan_all_zones_unevictable_pages();
3455
3456 scan_unevictable_pages = 0; 3468 scan_unevictable_pages = 0;
3457 return 0; 3469 return 0;
3458} 3470}
@@ -3467,6 +3479,7 @@ static ssize_t read_scan_unevictable_node(struct sys_device *dev,
3467 struct sysdev_attribute *attr, 3479 struct sysdev_attribute *attr,
3468 char *buf) 3480 char *buf)
3469{ 3481{
3482 warn_scan_unevictable_pages();
3470 return sprintf(buf, "0\n"); /* always zero; should fit... */ 3483 return sprintf(buf, "0\n"); /* always zero; should fit... */
3471} 3484}
3472 3485
@@ -3474,19 +3487,7 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev,
3474 struct sysdev_attribute *attr, 3487 struct sysdev_attribute *attr,
3475 const char *buf, size_t count) 3488 const char *buf, size_t count)
3476{ 3489{
3477 struct zone *node_zones = NODE_DATA(dev->id)->node_zones; 3490 warn_scan_unevictable_pages();
3478 struct zone *zone;
3479 unsigned long res;
3480 unsigned long req = strict_strtoul(buf, 10, &res);
3481
3482 if (!req)
3483 return 1; /* zero is no-op */
3484
3485 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
3486 if (!populated_zone(zone))
3487 continue;
3488 scan_zone_unevictable_pages(zone);
3489 }
3490 return 1; 3491 return 1;
3491} 3492}
3492 3493
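
The hunks above gut the scan_unevictable_pages sysctl and sysfs knobs, leaving only a one-time deprecation warning issued through printk_once(). The userspace sketch below shows the same warn-once pattern with a static flag; it is an analogue of printk_once(), not the kernel implementation.

#include <stdbool.h>
#include <stdio.h>

static void warn_scan_unevictable_pages(void)
{
        static bool warned;   /* printk_once() keeps similar one-shot state */

        if (!warned) {
                warned = true;
                fprintf(stderr,
                        "scan_unevictable_pages is deprecated and is now a no-op\n");
        }
}

int main(void)
{
        warn_scan_unevictable_pages();
        warn_scan_unevictable_pages();   /* second call is silent */
        return 0;
}
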
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 20c18b7694b2..8fd603b1665e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu)
78 * 78 *
79 * vm_stat contains the global counters 79 * vm_stat contains the global counters
80 */ 80 */
81atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; 81atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
82EXPORT_SYMBOL(vm_stat); 82EXPORT_SYMBOL(vm_stat);
83 83
84#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP
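
The first vmstat.c hunk aligns the global vm_stat[] counters to a cache line with __cacheline_aligned_in_smp so that frequent cross-CPU updates do not false-share a line with neighbouring data. A userspace analogue using C11 alignas is sketched below; the 64-byte line size and the array length are assumptions for illustration, while the kernel macro expands to a compiler alignment attribute sized for the architecture.

#include <stdalign.h>
#include <stdio.h>

#define CACHE_LINE 64   /* assumed cache line size */

struct stats {
        alignas(CACHE_LINE) long vm_stat[40];   /* hot, written from many CPUs */
};

int main(void)
{
        printf("vm_stat block: %zu bytes, aligned to %zu\n",
               sizeof(struct stats), alignof(struct stats));
        return 0;
}
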
@@ -659,7 +659,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
659} 659}
660#endif 660#endif
661 661
662#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) 662#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
663#ifdef CONFIG_ZONE_DMA 663#ifdef CONFIG_ZONE_DMA
664#define TEXT_FOR_DMA(xx) xx "_dma", 664#define TEXT_FOR_DMA(xx) xx "_dma",
665#else 665#else
@@ -702,6 +702,7 @@ const char * const vmstat_text[] = {
702 "nr_unstable", 702 "nr_unstable",
703 "nr_bounce", 703 "nr_bounce",
704 "nr_vmscan_write", 704 "nr_vmscan_write",
705 "nr_vmscan_immediate_reclaim",
705 "nr_writeback_temp", 706 "nr_writeback_temp",
706 "nr_isolated_anon", 707 "nr_isolated_anon",
707 "nr_isolated_file", 708 "nr_isolated_file",
@@ -788,7 +789,7 @@ const char * const vmstat_text[] = {
788 789
789#endif /* CONFIG_VM_EVENTS_COUNTERS */ 790#endif /* CONFIG_VM_EVENTS_COUNTERS */
790}; 791};
791#endif /* CONFIG_PROC_FS || CONFIG_SYSFS */ 792#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
792 793
793 794
794#ifdef CONFIG_PROC_FS 795#ifdef CONFIG_PROC_FS