49 files changed, 1158 insertions, 813 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 7a68d2ab556..6c2a73a54a4 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,7 +33,11 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
-obj-$(CONFIG_SMP) += percpu.o
+ifdef CONFIG_SMP
+obj-y += percpu.o
+else
+obj-y += percpu_up.o
+endif
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0e8ca034770..660a87a2251 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -11,6 +11,8 @@
 #include <linux/writeback.h>
 #include <linux/device.h>
+static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
 void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 {
 }
@@ -25,6 +27,11 @@ struct backing_dev_info default_backing_dev_info = {
 };
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
+struct backing_dev_info noop_backing_dev_info = {
+        .name           = "noop",
+};
+EXPORT_SYMBOL_GPL(noop_backing_dev_info);
 static struct class *bdi_class;
 /*
@@ -41,7 +48,6 @@ static struct timer_list sync_supers_timer;
 static int bdi_sync_supers(void *);
 static void sync_supers_timer_fn(unsigned long);
-static void arm_supers_timer(void);
 static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
@@ -227,6 +233,9 @@ static struct device_attribute bdi_dev_attrs[] = {
 static __init int bdi_class_init(void)
 {
        bdi_class = class_create(THIS_MODULE, "bdi");
+        if (IS_ERR(bdi_class))
+                return PTR_ERR(bdi_class);
        bdi_class->dev_attrs = bdi_dev_attrs;
        bdi_debug_init();
        return 0;
@@ -242,7 +251,7 @@ static int __init default_bdi_init(void)
        init_timer(&sync_supers_timer);
        setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
-        arm_supers_timer();
+        bdi_arm_supers_timer();
        err = bdi_init(&default_backing_dev_info);
        if (!err)
@@ -364,10 +373,13 @@ static int bdi_sync_supers(void *unused)
        return 0;
 }
-static void arm_supers_timer(void)
+void bdi_arm_supers_timer(void)
 {
        unsigned long next;
+        if (!dirty_writeback_interval)
+                return;
        next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies;
        mod_timer(&sync_supers_timer, round_jiffies_up(next));
 }
@@ -375,7 +387,7 @@ static void arm_supers_timer(void)
 static void sync_supers_timer_fn(unsigned long unused)
 {
        wake_up_process(sync_supers_tsk);
-        arm_supers_timer();
+        bdi_arm_supers_timer();
 }
 static int bdi_forker_task(void *ptr)
@@ -418,7 +430,10 @@ static int bdi_forker_task(void *ptr)
                        spin_unlock_bh(&bdi_lock);
                        wait = msecs_to_jiffies(dirty_writeback_interval * 10);
-                        schedule_timeout(wait);
+                        if (wait)
+                                schedule_timeout(wait);
+                        else
+                                schedule();
                        try_to_freeze();
                        continue;
                }
@@ -712,6 +727,33 @@ void bdi_destroy(struct backing_dev_info *bdi)
 }
 EXPORT_SYMBOL(bdi_destroy);
+/*
+ * For use from filesystems to quickly init and register a bdi associated
+ * with dirty writeback
+ */
+int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
+                           unsigned int cap)
+{
+        char tmp[32];
+        int err;
+        bdi->name = name;
+        bdi->capabilities = cap;
+        err = bdi_init(bdi);
+        if (err)
+                return err;
+        sprintf(tmp, "%.28s%s", name, "-%d");
+        err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq));
+        if (err) {
+                bdi_destroy(bdi);
+                return err;
+        }
+        return 0;
+}
+EXPORT_SYMBOL(bdi_setup_and_register);
 static wait_queue_head_t congestion_wqh[2] = {
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
diff --git a/mm/bootmem.c b/mm/bootmem.c
index d7c791ef003..58c66cc5056 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -10,6 +10,7 @@
 */
 #include <linux/init.h>
 #include <linux/pfn.h>
+#include <linux/slab.h>
 #include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/kmemleak.h>
@@ -180,19 +181,12 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
        end_aligned = end & ~(BITS_PER_LONG - 1);
        if (end_aligned <= start_aligned) {
-#if 1
-                printk(KERN_DEBUG " %lx - %lx\n", start, end);
-#endif
                for (i = start; i < end; i++)
                        __free_pages_bootmem(pfn_to_page(i), 0);
                return;
        }
-#if 1
-        printk(KERN_DEBUG " %lx %lx - %lx %lx\n",
-                 start, start_aligned, end_aligned, end);
-#endif
        for (i = start; i < start_aligned; i++)
                __free_pages_bootmem(pfn_to_page(i), 0);
@@ -310,9 +304,22 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
 unsigned long __init free_all_bootmem(void)
 {
 #ifdef CONFIG_NO_BOOTMEM
-        return free_all_memory_core_early(NODE_DATA(0)->node_id);
+        /*
+         * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
+         *  because in some cases, e.g. when Node0 doesn't have RAM installed,
+         *  low RAM will be on Node1.
+         * Using MAX_NUMNODES makes sure all ranges in early_node_map[]
+         *  are used instead of only the Node0-related ones.
+         */
+        return free_all_memory_core_early(MAX_NUMNODES);
 #else
-        return free_all_bootmem_core(NODE_DATA(0)->bdata);
+        unsigned long total_pages = 0;
+        bootmem_data_t *bdata;
+        list_for_each_entry(bdata, &bdata_list, list)
+                total_pages += free_all_bootmem_core(bdata);
+        return total_pages;
 #endif
 }
@@ -428,9 +435,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
 {
 #ifdef CONFIG_NO_BOOTMEM
        free_early(physaddr, physaddr + size);
-#if 0
-        printk(KERN_DEBUG "free %lx %lx\n", physaddr, size);
-#endif
 #else
        unsigned long start, end;
@@ -456,9 +460,6 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
 {
 #ifdef CONFIG_NO_BOOTMEM
        free_early(addr, addr + size);
-#if 0
-        printk(KERN_DEBUG "free %lx %lx\n", addr, size);
-#endif
 #else
        unsigned long start, end;
diff --git a/mm/bounce.c b/mm/bounce.c
index a2b76a588e3..13b6dad1eed 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -6,6 +6,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/swap.h>
+#include <linux/gfp.h>
 #include <linux/bio.h>
 #include <linux/pagemap.h>
 #include <linux/mempool.h>
diff --git a/mm/failslab.c b/mm/failslab.c
index bb41f98dd8b..c5f88f240dd 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,5 +1,4 @@
 #include <linux/fault-inject.h>
-#include <linux/gfp.h>
 #include <linux/slab.h>
 static struct {
diff --git a/mm/filemap.c b/mm/filemap.c
index 045b31c3765..140ebda9640 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -10,13 +10,13 @@
 * the NFS filesystem used to do this differently, for example)
 */
 #include <linux/module.h>
-#include <linux/slab.h>
 #include <linux/compiler.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
 #include <linux/aio.h>
 #include <linux/capability.h>
 #include <linux/kernel_stat.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/mman.h>
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 78b94f0b6d5..83364df74a3 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -17,6 +17,7 @@
 #include <linux/sched.h>
 #include <linux/seqlock.h>
 #include <linux/mutex.h>
+#include <linux/gfp.h>
 #include <asm/tlbflush.h>
 #include <asm/io.h>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3a5aeb37c11..4c9e6bbf377 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2,7 +2,6 @@
 * Generic hugetlb support.
 * (C) William Irwin, April 2004
 */
-#include <linux/gfp.h>
 #include <linux/list.h>
 #include <linux/init.h>
 #include <linux/module.h>
@@ -18,6 +17,7 @@
 #include <linux/mutex.h>
 #include <linux/bootmem.h>
 #include <linux/sysfs.h>
+#include <linux/slab.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -546,6 +546,7 @@ static void free_huge_page(struct page *page)
        mapping = (struct address_space *) page_private(page);
        set_page_private(page, 0);
+        page->mapping = NULL;
        BUG_ON(page_count(page));
        INIT_LIST_HEAD(&page->lru);
@@ -1038,7 +1039,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
                page = alloc_buddy_huge_page(h, vma, addr);
                if (!page) {
                        hugetlb_put_quota(inode->i_mapping, chg);
-                        return ERR_PTR(-VM_FAULT_OOM);
+                        return ERR_PTR(-VM_FAULT_SIGBUS);
                }
        }
@@ -2447,8 +2448,10 @@ retry:
                        spin_lock(&inode->i_lock);
                        inode->i_blocks += blocks_per_huge_page(h);
                        spin_unlock(&inode->i_lock);
-                } else
+                } else {
                        lock_page(page);
+                        page->mapping = HUGETLB_POISON;
+                }
        }
        /*
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 5b069e4f5e4..2c0d032ac89 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -72,7 +72,6 @@
 #include <linux/module.h>
 #include <linux/kthread.h>
 #include <linux/prio_tree.h>
-#include <linux/gfp.h>
 #include <linux/fs.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
diff --git a/mm/ksm.c b/mm/ksm.c
index a93f1b7f508..956880f2ff4 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -365,7 +365,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
        do {
                cond_resched();
                page = follow_page(vma, addr, FOLL_GET);
-                if (!page)
+                if (IS_ERR_OR_NULL(page))
                        break;
                if (PageKsm(page))
                        ret = handle_mm_fault(vma->vm_mm, vma, addr,
@@ -447,7 +447,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
                goto out;
        page = follow_page(vma, addr, FOLL_GET);
-        if (!page)
+        if (IS_ERR_OR_NULL(page))
                goto out;
        if (PageAnon(page)) {
                flush_anon_page(vma, page, addr);
@@ -751,7 +751,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
                 * page
                 */
                if (page_mapcount(page) + 1 + swapped != page_count(page)) {
-                        set_pte_at_notify(mm, addr, ptep, entry);
+                        set_pte_at(mm, addr, ptep, entry);
                        goto out_unlock;
                }
                entry = pte_wrprotect(entry);
@@ -1086,7 +1086,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
                cond_resched();
                tree_rmap_item = rb_entry(*new, struct rmap_item, node);
                tree_page = get_mergeable_page(tree_rmap_item);
-                if (!tree_page)
+                if (IS_ERR_OR_NULL(tree_page))
                        return NULL;
                /*
@@ -1294,7 +1294,7 @@ next_mm:
                        if (ksm_test_exit(mm))
                                break;
                        *page = follow_page(vma, ksm_scan.address, FOLL_GET);
-                        if (*page && PageAnon(*page)) {
+                        if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) {
                                flush_anon_page(vma, *page, ksm_scan.address);
                                flush_dcache_page(*page);
                                rmap_item = get_next_rmap_item(slot,
@@ -1308,7 +1308,7 @@ next_mm:
                                up_read(&mm->mmap_sem);
                                return rmap_item;
                        }
-                        if (*page)
+                        if (!IS_ERR_OR_NULL(*page))
                                put_page(*page);
                        ksm_scan.address += PAGE_SIZE;
                        cond_resched();
@@ -1367,7 +1367,7 @@ next_mm:
 static void ksm_do_scan(unsigned int scan_npages)
 {
        struct rmap_item *rmap_item;
-        struct page *page;
+        struct page *uninitialized_var(page);
        while (scan_npages--) {
                cond_resched();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7973b5221fb..c8569bc298f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1359,16 +1359,19 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
        lock_page_cgroup(pc);
        mem = pc->mem_cgroup;
-        if (!mem)
+        if (!mem || !PageCgroupUsed(pc))
-                goto done;
-        if (!PageCgroupUsed(pc))
                goto done;
        /*
         * Preemption is already disabled. We can use __this_cpu_xxx
         */
-        __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val);
+        if (val > 0) {
+                __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+                SetPageCgroupFileMapped(pc);
+        } else {
+                __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+                ClearPageCgroupFileMapped(pc);
+        }
 done:
        unlock_page_cgroup(pc);
@@ -1435,7 +1438,7 @@ static void drain_local_stock(struct work_struct *dummy)
 /*
 * Cache charges(val) which is from res_counter, to local per_cpu area.
- * This will be consumed by consumt_stock() function, later.
+ * This will be consumed by consume_stock() function, later.
 */
 static void refill_stock(struct mem_cgroup *mem, int val)
 {
@@ -1598,7 +1601,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                         * There is a small race that "from" or "to" can be
                         * freed by rmdir, so we use css_tryget().
                         */
-                        rcu_read_lock();
                        from = mc.from;
                        to = mc.to;
                        if (from && css_tryget(&from->css)) {
@@ -1619,7 +1621,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                                        do_continue = (to == mem_over_limit);
                                css_put(&to->css);
                        }
-                        rcu_read_unlock();
                        if (do_continue) {
                                DEFINE_WAIT(wait);
                                prepare_to_wait(&mc.waitq, &wait,
@@ -1801,16 +1802,13 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 static void __mem_cgroup_move_account(struct page_cgroup *pc,
        struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
 {
-        struct page *page;
        VM_BUG_ON(from == to);
        VM_BUG_ON(PageLRU(pc->page));
        VM_BUG_ON(!PageCgroupLocked(pc));
        VM_BUG_ON(!PageCgroupUsed(pc));
        VM_BUG_ON(pc->mem_cgroup != from);
-        page = pc->page;
+        if (PageCgroupFileMapped(pc)) {
-        if (page_mapped(page) && !PageAnon(page)) {
                /* Update mapped_file data for mem_cgroup */
                preempt_disable();
                __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
@@ -2429,11 +2427,11 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
        }
        unlock_page_cgroup(pc);
+        *ptr = mem;
        if (mem) {
-                ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+                ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
                css_put(&mem->css);
        }
-        *ptr = mem;
        return ret;
 }
@@ -3691,8 +3689,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
        else
                mem = vmalloc(size);
-        if (mem)
+        if (!mem)
-                memset(mem, 0, size);
+                return NULL;
+        memset(mem, 0, size);
        mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
        if (!mem->stat) {
                if (size < PAGE_SIZE)
@@ -3946,28 +3946,6 @@ one_by_one:
        }
        return ret;
 }
-#else   /* !CONFIG_MMU */
-static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
-                                struct cgroup *cgroup,
-                                struct task_struct *p,
-                                bool threadgroup)
-{
-        return 0;
-}
-static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
-                                struct cgroup *cgroup,
-                                struct task_struct *p,
-                                bool threadgroup)
-{
-}
-static void mem_cgroup_move_task(struct cgroup_subsys *ss,
-                                struct cgroup *cont,
-                                struct cgroup *old_cont,
-                                struct task_struct *p,
-                                bool threadgroup)
-{
-}
-#endif
 /**
 * is_target_pte_for_mc - check a pte whether it is valid for move charge
@@ -4330,6 +4308,28 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
        }
        mem_cgroup_clear_mc();
 }
+#else   /* !CONFIG_MMU */
+static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
+                                struct cgroup *cgroup,
+                                struct task_struct *p,
+                                bool threadgroup)
+{
+        return 0;
+}
+static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
+                                struct cgroup *cgroup,
+                                struct task_struct *p,
+                                bool threadgroup)
+{
+}
+static void mem_cgroup_move_task(struct cgroup_subsys *ss,
+                                struct cgroup *cont,
+                                struct cgroup *old_cont,
+                                struct task_struct *p,
+                                bool threadgroup)
+{
+}
+#endif
 struct cgroup_subsys mem_cgroup_subsys = {
        .name = "memory",
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d1f33516297..620b0b46159 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -44,6 +44,7 @@
 #include <linux/migrate.h>
 #include <linux/page-isolation.h>
 #include <linux/suspend.h>
+#include <linux/slab.h>
 #include "internal.h"
 int sysctl_memory_failure_early_kill __read_mostly = 0;
diff --git a/mm/memory.c b/mm/memory.c
index 5b7f2002e54..833952d8b74 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -56,6 +56,7 @@
 #include <linux/kallsyms.h>
 #include <linux/swapops.h>
 #include <linux/elf.h>
+#include <linux/gfp.h>
 #include <asm/io.h>
 #include <asm/pgalloc.h>
@@ -124,7 +125,7 @@ core_initcall(init_zero_pfn);
 #if defined(SPLIT_RSS_COUNTING)
-void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
+static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
 {
        int i;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 643f66e1018..08f40a2f3fe 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -73,7 +73,6 @@
 #include <linux/sched.h>
 #include <linux/nodemask.h>
 #include <linux/cpuset.h>
-#include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/module.h>
@@ -806,9 +805,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
        err = 0;
        if (nmask) {
-                task_lock(current);
+                if (mpol_store_user_nodemask(pol)) {
-                get_policy_nodemask(pol, nmask);
+                        *nmask = pol->w.user_nodemask;
-                task_unlock(current);
+                } else {
+                        task_lock(current);
+                        get_policy_nodemask(pol, nmask);
+                        task_unlock(current);
+                }
        }
 out:
@@ -2195,8 +2198,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
                        char *rest = nodelist;
                        while (isdigit(*rest))
                                rest++;
-                        if (!*rest)
+                        if (*rest)
-                                err = 0;
+                                goto out;
                }
                break;
        case MPOL_INTERLEAVE:
@@ -2205,7 +2208,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
                 */
                if (!nodelist)
                        nodes = node_states[N_HIGH_MEMORY];
-                err = 0;
                break;
        case MPOL_LOCAL:
                /*
@@ -2215,11 +2217,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
                        goto out;
                mode = MPOL_PREFERRED;
                break;
+        case MPOL_DEFAULT:
-        /*
+                /*
-         * case MPOL_BIND:    mpol_new() enforces non-empty nodemask.
+                 * Insist on an empty nodelist
-         * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
+                 */
-         */
+                if (!nodelist)
+                        err = 0;
+                goto out;
+        case MPOL_BIND:
+                /*
+                 * Insist on a nodelist
+                 */
+                if (!nodelist)
+                        goto out;
        }
        mode_flags = 0;
@@ -2233,13 +2243,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
                else if (!strcmp(flags, "relative"))
                        mode_flags |= MPOL_F_RELATIVE_NODES;
                else
-                        err = 1;
+                        goto out;
        }
        new = mpol_new(mode, mode_flags, &nodes);
        if (IS_ERR(new))
-                err = 1;
+                goto out;
-        else {
+        {
                int ret;
                NODEMASK_SCRATCH(scratch);
                if (scratch) {
@@ -2250,13 +2261,15 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
                        ret = -ENOMEM;
                NODEMASK_SCRATCH_FREE(scratch);
                if (ret) {
-                        err = 1;
                        mpol_put(new);
-                } else if (no_context) {
+                        goto out;
-                        /* save for contextualization */
-                        new->w.user_nodemask = nodes;
                }
        }
+        err = 0;
+        if (no_context) {
+                /* save for contextualization */
+                new->w.user_nodemask = nodes;
+        }
 out:
        /* Restore string for error message */
diff --git a/mm/migrate.c b/mm/migrate.c
index 88000b89fc9..d3f3f7f8107 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,6 +32,7 @@
 #include <linux/security.h>
 #include <linux/memcontrol.h>
 #include <linux/syscalls.h>
+#include <linux/gfp.h>
 #include "internal.h"
diff --git a/mm/mincore.c b/mm/mincore.c
index 7a3436ef39e..f77433c2027 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -7,8 +7,8 @@
 /*
 * The mincore() system call.
 */
-#include <linux/slab.h>
 #include <linux/pagemap.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/syscalls.h>
diff --git a/mm/mlock.c b/mm/mlock.c
index 8f4e2dfceec..3f82720e051 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -607,44 +607,3 @@ void user_shm_unlock(size_t size, struct user_struct *user)
        spin_unlock(&shmlock_user_lock);
        free_uid(user);
 }
-int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
-                          size_t size)
-{
-        unsigned long lim, vm, pgsz;
-        int error = -ENOMEM;
-        pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
-        down_write(&mm->mmap_sem);
-        lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT;
-        vm   = mm->total_vm + pgsz;
-        if (lim < vm)
-                goto out;
-        lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT;
-        vm   = mm->locked_vm + pgsz;
-        if (lim < vm)
-                goto out;
-        mm->total_vm  += pgsz;
-        mm->locked_vm += pgsz;
-        error = 0;
- out:
-        up_write(&mm->mmap_sem);
-        return error;
-}
-void refund_locked_memory(struct mm_struct *mm, size_t size)
-{
-        unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
-        down_write(&mm->mmap_sem);
-        mm->total_vm  -= pgsz;
-        mm->locked_vm -= pgsz;
-        up_write(&mm->mmap_sem);
-}
diff --git a/mm/mmap.c b/mm/mmap.c
index 75557c639ad..456ec6f2788 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -507,11 +507,12 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
        struct address_space *mapping = NULL;
        struct prio_tree_root *root = NULL;
        struct file *file = vma->vm_file;
-        struct anon_vma *anon_vma = NULL;
        long adjust_next = 0;
        int remove_next = 0;
        if (next && !insert) {
+                struct vm_area_struct *exporter = NULL;
                if (end >= next->vm_end) {
                        /*
                         * vma expands, overlapping all the next, and
@@ -519,7 +520,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
                         */
 again:                  remove_next = 1 + (end > next->vm_end);
                        end = next->vm_end;
-                        anon_vma = next->anon_vma;
+                        exporter = next;
                        importer = vma;
                } else if (end > next->vm_start) {
                        /*
@@ -527,7 +528,7 @@ again:			remove_next = 1 + (end > next->vm_end);
                         * mprotect case 5 shifting the boundary up.
                         */
                        adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
-                        anon_vma = next->anon_vma;
+                        exporter = next;
                        importer = vma;
                } else if (end < vma->vm_end) {
                        /*
@@ -536,28 +537,19 @@ again:			remove_next = 1 + (end > next->vm_end);
                         * mprotect case 4 shifting the boundary down.
                         */
                        adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT);
-                        anon_vma = next->anon_vma;
+                        exporter = vma;
                        importer = next;
                }
-        }
-        /*
-         * When changing only vma->vm_end, we don't really need anon_vma lock.
-         */
-        if (vma->anon_vma && (insert || importer || start != vma->vm_start))
-                anon_vma = vma->anon_vma;
-        if (anon_vma) {
                /*
                 * Easily overlooked: when mprotect shifts the boundary,
                 * make sure the expanding vma has anon_vma set if the
                 * shrinking vma had, to cover any anon pages imported.
                 */
-                if (importer && !importer->anon_vma) {
+                if (exporter && exporter->anon_vma && !importer->anon_vma) {
-                        /* Block reverse map lookups until things are set up. */
+                        if (anon_vma_clone(importer, exporter))
-                        if (anon_vma_clone(importer, vma)) {
                                return -ENOMEM;
-                        }
+                        importer->anon_vma = exporter->anon_vma;
-                        importer->anon_vma = anon_vma;
                }
        }
@@ -825,6 +817,61 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 }
 /*
+ * Rough compatibility check to quickly see if it's even worth looking
+ * at sharing an anon_vma.
+ *
+ * They need to have the same vm_file, and the flags can only differ
+ * in things that mprotect may change.
+ *
+ * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
+ * we can merge the two vma's. For example, we refuse to merge a vma if
+ * there is a vm_ops->close() function, because that indicates that the
+ * driver is doing some kind of reference counting. But that doesn't
+ * really matter for the anon_vma sharing case.
+ */
+static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
+{
+        return a->vm_end == b->vm_start &&
+                mpol_equal(vma_policy(a), vma_policy(b)) &&
+                a->vm_file == b->vm_file &&
+                !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) &&
+                b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
+}
+/*
+ * Do some basic sanity checking to see if we can re-use the anon_vma
+ * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
+ * the same as 'old', the other will be the new one that is trying
+ * to share the anon_vma.
+ *
+ * NOTE! This runs with mmap_sem held for reading, so it is possible that
+ * the anon_vma of 'old' is concurrently in the process of being set up
+ * by another page fault trying to merge _that_. But that's ok: if it
+ * is being set up, that automatically means that it will be a singleton
+ * acceptable for merging, so we can do all of this optimistically. But
+ * we do that ACCESS_ONCE() to make sure that we never re-load the pointer.
+ *
+ * IOW: that the "list_is_singular()" test on the anon_vma_chain only
+ * matters for the 'stable anon_vma' case (ie the thing we want to avoid
+ * is to return an anon_vma that is "complex" due to having gone through
+ * a fork).
+ *
+ * We also make sure that the two vma's are compatible (adjacent,
+ * and with the same memory policies). That's all stable, even with just
+ * a read lock on the mmap_sem.
+ */
+static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
+{
+        if (anon_vma_compatible(a, b)) {
+                struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma);
+                if (anon_vma && list_is_singular(&old->anon_vma_chain))
+                        return anon_vma;
+        }
+        return NULL;
+}
+/*
 * find_mergeable_anon_vma is used by anon_vma_prepare, to check
 * neighbouring vmas for a suitable anon_vma, before it goes off
 * to allocate a new anon_vma.  It checks because a repetitive
@@ -834,28 +881,16 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 */
 struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
 {
+        struct anon_vma *anon_vma;
        struct vm_area_struct *near;
-        unsigned long vm_flags;
        near = vma->vm_next;
        if (!near)
                goto try_prev;
-        /*
+        anon_vma = reusable_anon_vma(near, vma, near);
-         * Since only mprotect tries to remerge vmas, match flags
+        if (anon_vma)
-         * which might be mprotected into each other later on.
+                return anon_vma;
-         * Neither mlock nor madvise tries to remerge at present,
-         * so leave their flags as obstructing a merge.
-         */
-        vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
-        vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
-        if (near->anon_vma && vma->vm_end == near->vm_start &&
-                        mpol_equal(vma_policy(vma), vma_policy(near)) &&
-                        can_vma_merge_before(near, vm_flags,
-                                NULL, vma->vm_file, vma->vm_pgoff +
-                                ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT)))
-                return near->anon_vma;
 try_prev:
        /*
         * It is potentially slow to have to call find_vma_prev here.
@@ -868,14 +903,9 @@ try_prev:
        if (!near)
                goto none;
-        vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC);
+        anon_vma = reusable_anon_vma(near, near, vma);
-        vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC);
+        if (anon_vma)
+                return anon_vma;
-        if (near->anon_vma && near->vm_end == vma->vm_start &&
-                        mpol_equal(vma_policy(near), vma_policy(vma)) &&
-                        can_vma_merge_after(near, vm_flags,
-                                NULL, vma->vm_file, vma->vm_pgoff))
-                return near->anon_vma;
 none:
        /*
         * There's no absolute need to look only at touching neighbours:
@@ -1947,7 +1977,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
                return 0;
        /* Clean everything up if vma_adjust failed. */
-        new->vm_ops->close(new);
+        if (new->vm_ops && new->vm_ops->close)
+                new->vm_ops->close(new);
        if (new->vm_file) {
                if (vma->vm_flags & VM_EXECUTABLE)
                        removed_exe_file_vma(mm);
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 0777654147c..9e82e937000 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -53,6 +53,7 @@ void unuse_mm(struct mm_struct *mm)
        struct task_struct *tsk = current;
        task_lock(tsk);
+        sync_mm_rss(tsk, mm);
        tsk->mm = NULL;
        /* active_mm is still 'mm' */
        enter_lazy_tlb(mm, tsk);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 7e33f2cb3c7..438951d366f 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -16,6 +16,7 @@
 #include <linux/err.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 /*
 * This function can't run concurrently against mmu_notifier_register
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8bc969d8112..2d1bf7cf885 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -10,7 +10,6 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
-#include <linux/slab.h>
 #include <linux/shm.h>
 #include <linux/mman.h>
 #include <linux/fs.h>
diff --git a/mm/mremap.c b/mm/mremap.c
index e9c75efce60..cde56ee51ef 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -9,7 +9,6 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
-#include <linux/slab.h>
 #include <linux/shm.h>
 #include <linux/ksm.h>
 #include <linux/mman.h>
diff --git a/mm/msync.c b/mm/msync.c
index 4083209b7f0..632df4527c0 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -82,7 +82,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
                                (vma->vm_flags & VM_SHARED)) {
                        get_file(file);
                        up_read(&mm->mmap_sem);
-                        error = vfs_fsync(file, file->f_path.dentry, 0);
+                        error = vfs_fsync(file, 0);
                        fput(file);
                        if (error || start >= end)
                                goto out;
diff --git a/mm/nommu.c b/mm/nommu.c
index 605ace8982a..63fa17d121f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -146,7 +146,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                        (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
        for (i = 0; i < nr_pages; i++) {
-                vma = find_extend_vma(mm, start);
+                vma = find_vma(mm, start);
                if (!vma)
                        goto finish_or_fault;
@@ -162,7 +162,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                }
                if (vmas)
                        vmas[i] = vma;
-                start += PAGE_SIZE;
+                start = (start + PAGE_SIZE) & PAGE_MASK;
        }
        return i;
@@ -764,7 +764,7 @@ EXPORT_SYMBOL(find_vma);
 */
 struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
 {
-        return find_vma(mm, addr & PAGE_MASK);
+        return find_vma(mm, addr);
 }
 /*
@@ -1040,10 +1040,9 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
        if (ret != -ENOSYS)
                return ret;
-        /* getting an ENOSYS error indicates that direct mmap isn't
+        /* getting -ENOSYS indicates that direct mmap isn't possible (as
-         * possible (as opposed to tried but failed) so we'll fall
+         * opposed to tried but failed) so we can only give a suitable error as
-         * through to making a private copy of the data and mapping
+         * it's not possible to make a private copy if MAP_SHARED was given */
-         * that if we can */
        return -ENODEV;
 }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9b223af6a14..b68e802a7a7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -18,6 +18,7 @@
 #include <linux/oom.h>
 #include <linux/mm.h>
 #include <linux/err.h>
+#include <linux/gfp.h>
 #include <linux/sched.h>
 #include <linux/swap.h>
 #include <linux/timex.h>
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943ecf8..b289310e2c8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -597,7 +597,7 @@ static void balance_dirty_pages(struct address_space *mapping,
            (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
                               + global_page_state(NR_UNSTABLE_NFS))
                                          > background_thresh)))
-                bdi_start_writeback(bdi, NULL, 0);
+                bdi_start_writeback(bdi, NULL, 0, 0);
 }
 void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -683,10 +683,6 @@ void throttle_vm_writeout(gfp_t gfp_mask)
        }
 }
-static void laptop_timer_fn(unsigned long unused);
-static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
 /*
 * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
 */
@@ -694,24 +690,24 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,
        void __user *buffer, size_t *length, loff_t *ppos)
 {
        proc_dointvec(table, write, buffer, length, ppos);
+        bdi_arm_supers_timer();
        return 0;
 }
-static void do_laptop_sync(struct work_struct *work)
+#ifdef CONFIG_BLOCK
+void laptop_mode_timer_fn(unsigned long data)
 {
-        wakeup_flusher_threads(0);
+        struct request_queue *q = (struct request_queue *)data;
-        kfree(work);
+        int nr_pages = global_page_state(NR_FILE_DIRTY) +
-}
+                global_page_state(NR_UNSTABLE_NFS);
-static void laptop_timer_fn(unsigned long unused)
+        /*
-{
+         * We want to write everything out, not just down to the dirty
-        struct work_struct *work;
+         * threshold
+         */
-        work = kmalloc(sizeof(*work), GFP_ATOMIC);
+        if (bdi_has_dirty_io(&q->backing_dev_info))
-        if (work) {
+                bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages, 0);
-                INIT_WORK(work, do_laptop_sync);
-                schedule_work(work);
-        }
 }
 /*
@@ -719,9 +715,9 @@ static void laptop_timer_fn(unsigned long unused)
 * of all dirty data a few seconds from now.  If the flush is already scheduled
 * then push it back - the user is still using the disk.
 */
-void laptop_io_completion(void)
+void laptop_io_completion(struct backing_dev_info *info)
 {
-        mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
+        mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode);
 }
 /*
@@ -731,8 +727,16 @@ void laptop_io_completion(void)
 */
 void laptop_sync_completion(void)
 {
-        del_timer(&laptop_mode_wb_timer);
+        struct backing_dev_info *bdi;
+        rcu_read_lock();
+        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list)
+                del_timer(&bdi->laptop_mode_wb_timer);
+        rcu_read_unlock();
 }
+#endif
 /*
 * If ratelimit_pages is too high then we can get into dirty-data overload
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d03c946d556..a6326c71b66 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2579,7 +2579,7 @@ static int default_zonelist_order(void)
        struct zone *z;
        int average_size;
        /*
-         * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem.
+         * ZONE_DMA and ZONE_DMA32 can be very small area in the system.
         * If they are really small and used heavily, the system can fall
         * into OOM very easily.
         * This function detect ZONE_DMA/DMA32 size and confgigures zone order.
diff --git a/mm/page_io.c b/mm/page_io.c
index a19af956ee1..31a3b962230 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -12,6 +12,7 @@
 #include <linux/mm.h>
 #include <linux/kernel_stat.h>
+#include <linux/gfp.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/bio.h>
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 7b47a57b664..8b1a2ce21ee 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -80,6 +80,37 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
        return err;
 }
+#ifdef CONFIG_HUGETLB_PAGE
+static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
+                                       unsigned long end)
+{
+        unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h); /* addr masked with huge_page_mask(h), advanced by one huge_page_size(h) */
+        return boundary < end ? boundary : end; /* never step beyond end */
+}
+static int walk_hugetlb_range(struct vm_area_struct *vma,
+                              unsigned long addr, unsigned long end,
+                              struct mm_walk *walk)
+{
+        struct hstate *h = hstate_vma(vma);
+        unsigned long next;
+        unsigned long hmask = huge_page_mask(h);
+        pte_t *pte;
+        int err = 0;
+        do {
+                next = hugetlb_entry_end(h, addr, end); /* end of this step: next huge page boundary, clamped to end */
+                pte = huge_pte_offset(walk->mm, addr & hmask);
+                if (pte && walk->hugetlb_entry) /* callback is optional and is skipped when no pte was found */
+                        err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
+                if (err) /* stop at the first non-zero callback result */
+                        return err;
+        } while (addr = next, addr != end); /* advance to next; loop until end is reached */
+        return 0;
+}
+#endif
 /**
 * walk_page_range - walk a memory map's page tables with a callback
 * @mm: memory map to walk
@@ -128,20 +159,16 @@ int walk_page_range(unsigned long addr, unsigned long end,
                vma = find_vma(walk->mm, addr);
 #ifdef CONFIG_HUGETLB_PAGE
                if (vma && is_vm_hugetlb_page(vma)) {
-                        pte_t *pte;
-                        struct hstate *hs;
                        if (vma->vm_end < next)
                                next = vma->vm_end;
-                        hs = hstate_vma(vma);
+                        /*
-                        pte = huge_pte_offset(walk->mm,
+                         * Hugepage is very tightly coupled with vma, so
-                                              addr & huge_page_mask(hs));
+                         * walk through hugetlb entries within a given vma.
-                        if (pte && !huge_pte_none(huge_ptep_get(pte))
+                         */
-                            && walk->hugetlb_entry)
+                        err = walk_hugetlb_range(vma, addr, next, walk);
-                                err = walk->hugetlb_entry(pte, addr,
-                                                          next, walk);
                        if (err)
                                break;
+                        pgd = pgd_offset(walk->mm, next);
                        continue;
                }
 #endif
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
new file mode 100644
index 00000000000..df680855540
--- /dev/null
+++ b/mm/percpu-km.c
@@ -0,0 +1,104 @@
+/*
+ * mm/percpu-km.c - kernel memory based chunk allocation
+ *
+ * Copyright (C) 2010           SUSE Linux Products GmbH
+ * Copyright (C) 2010           Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Chunks are allocated as a contiguous kernel memory using gfp
+ * allocation.  This is to be used on nommu architectures.
+ *
+ * To use percpu-km,
+ *
+ * - define CONFIG_NEED_PER_CPU_KM from the arch Kconfig.
+ *
+ * - CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK must not be defined.  It's
+ *   not compatible with PER_CPU_KM.  EMBED_FIRST_CHUNK should work
+ *   fine.
+ *
+ * - NUMA is not supported.  When setting up the first chunk,
+ *   @cpu_distance_fn should be NULL or report all CPUs to be nearer
+ *   than or at LOCAL_DISTANCE.
+ *
+ * - It's best if the chunk size is power of two multiple of
+ *   PAGE_SIZE.  Because each chunk is allocated as a contiguous
+ *   kernel memory block using alloc_pages(), memory will be wasted if
+ *   chunk size is not aligned.  percpu-km code will whine about it.
+ */
+#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+#error "contiguous percpu allocation is incompatible with paged first chunk"
+#endif
+#include <linux/log2.h>
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+        /* noop: pcpu_create_chunk() in this file allocates the pages for the whole chunk up front */
+        return 0; /* always reports success */
+}
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+        /* nada: in this file pages are only released by pcpu_destroy_chunk() */
+}
+static struct pcpu_chunk *pcpu_create_chunk(void)
+{
+        const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; /* size of group 0 expressed in pages */
+        struct pcpu_chunk *chunk;
+        struct page *pages;
+        int i;
+        chunk = pcpu_alloc_chunk();
+        if (!chunk)
+                return NULL;
+        pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages)); /* one contiguous block; order_base_2() rounds the order up */
+        if (!pages) {
+                pcpu_free_chunk(chunk); /* undo pcpu_alloc_chunk() */
+                return NULL;
+        }
+        for (i = 0; i < nr_pages; i++) /* only the first nr_pages pages of the block are handed to pcpu_set_page_chunk() */
+                pcpu_set_page_chunk(nth_page(pages, i), chunk);
+        chunk->data = pages; /* kept so pcpu_destroy_chunk() can pass it to __free_pages() */
+        chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; /* block address with group 0's offset subtracted */
+        return chunk;
+}
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
+{
+        const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; /* same computation as in pcpu_create_chunk() */
+        if (chunk && chunk->data) /* chunk->data is the struct page stored by pcpu_create_chunk() */
+                __free_pages(chunk->data, order_base_2(nr_pages)); /* same order as the alloc_pages() call */
+        pcpu_free_chunk(chunk); /* reached even when chunk is NULL - NOTE(review): confirm pcpu_free_chunk() accepts NULL */
+}
+static struct page *pcpu_addr_to_page(void *addr)
+{
+        return virt_to_page(addr); /* chunks in this file come from alloc_pages(), so the lookup uses virt_to_page() */
+}
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
+{
+        size_t nr_pages, alloc_pages;
+        /* all units must be in a single group */
+        if (ai->nr_groups != 1) {
+                printk(KERN_CRIT "percpu: can't handle more than one groups\n");
+                return -EINVAL;
+        }
+        nr_pages = (ai->groups[0].nr_units * ai->unit_size) >> PAGE_SHIFT; /* pages actually needed by group 0 */
+        alloc_pages = roundup_pow_of_two(nr_pages); /* pages a power-of-two sized allocation would occupy */
+        if (alloc_pages > nr_pages) /* warn only; the configuration is still accepted */
+                printk(KERN_WARNING "percpu: wasting %zu pages per chunk\n",
+                       alloc_pages - nr_pages);
+        return 0;
+}
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
new file mode 100644
index 00000000000..7d9c1d0ebd3
--- /dev/null
+++ b/mm/percpu-vm.c
@@ -0,0 +1,451 @@
+/*
+ * mm/percpu-vm.c - vmalloc area based chunk allocation
+ *
+ * Copyright (C) 2010           SUSE Linux Products GmbH
+ * Copyright (C) 2010           Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Chunks are mapped into vmalloc areas and populated page by page.
+ * This is the default chunk allocator.
+ */
+static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
+                                    unsigned int cpu, int page_idx)
+{
+        /* must not be used on pre-mapped chunk */
+        WARN_ON(chunk->immutable);
+        return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); /* page behind the unit address of (cpu, page_idx) */
+}
+/**
+ * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
+ * @chunk: chunk of interest
+ * @bitmapp: output parameter for bitmap
+ * @may_alloc: may allocate the array
+ *
+ * Returns pointer to array of pointers to struct page and bitmap,
+ * both of which can be indexed with pcpu_page_idx().  The returned
+ * array is cleared to zero and *@bitmapp is copied from
+ * @chunk->populated.  Note that there is only one array and bitmap
+ * and access exclusion is the caller's responsibility.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
+ * Otherwise, don't care.
+ *
+ * RETURNS:
+ * Pointer to temp pages array on success, NULL on failure.
+ */
+static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
+                                               unsigned long **bitmapp,
+                                               bool may_alloc)
+{
+        static struct page **pages; /* function-static: allocated once and reused by later calls */
+        static unsigned long *bitmap; /* function-static, same lifetime as pages */
+        size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
+        size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
+                             sizeof(unsigned long);
+        if (!pages || !bitmap) { /* first use, or an earlier allocation did not succeed */
+                if (may_alloc && !pages)
+                        pages = pcpu_mem_alloc(pages_size);
+                if (may_alloc && !bitmap)
+                        bitmap = pcpu_mem_alloc(bitmap_size);
+                if (!pages || !bitmap)
+                        return NULL; /* allocation failed, or !may_alloc with nothing allocated yet */
+        }
+        memset(pages, 0, pages_size); /* the array is handed back zeroed on every call */
+        bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); /* working copy of the chunk's populated bitmap */
+        *bitmapp = bitmap;
+        return pages;
+}
+/**
+ * pcpu_free_pages - free pages which were allocated for @chunk
+ * @chunk: chunk pages were allocated for
+ * @pages: array of pages to be freed, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be freed
+ * @page_end: page index of the last page to be freed + 1
+ *
+ * Free pages [@page_start,@page_end) in @pages for all units.
+ * The pages were allocated for @chunk.
+ */
+static void pcpu_free_pages(struct pcpu_chunk *chunk,
+                            struct page **pages, unsigned long *populated,
+                            int page_start, int page_end)
+{ /* @chunk and @populated are not used by this implementation */
+        unsigned int cpu;
+        int i;
+        for_each_possible_cpu(cpu) {
+                for (i = page_start; i < page_end; i++) {
+                        struct page *page = pages[pcpu_page_idx(cpu, i)];
+                        if (page) /* NULL slots are skipped */
+                                __free_page(page);
+                }
+        }
+}
+/**
+ * pcpu_alloc_pages - allocates pages for @chunk
+ * @chunk: target chunk
+ * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to be allocated
+ * @page_end: page index of the last page to be allocated + 1
+ *
+ * Allocate pages [@page_start,@page_end) into @pages for all units.
+ * The allocation is for @chunk.  Percpu core doesn't care about the
+ * content of @pages and will pass it verbatim to pcpu_map_pages().
+ */
+static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
+                            struct page **pages, unsigned long *populated,
+                            int page_start, int page_end)
+{
+        const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
+        unsigned int cpu;
+        int i;
+        for_each_possible_cpu(cpu) {
+                for (i = page_start; i < page_end; i++) {
+                        struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
+                        *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); /* order-0 page from the node cpu_to_node(cpu) reports */
+                        if (!*pagep) {
+                                pcpu_free_pages(chunk, pages, populated, /* roll back: frees every non-NULL slot in the range, for all cpus */
+                                                page_start, page_end);
+                                return -ENOMEM;
+                        }
+                }
+        }
+        return 0;
+}
+/**
+ * pcpu_pre_unmap_flush - flush cache prior to unmapping
+ * @chunk: chunk the regions to be flushed belongs to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages in [@page_start,@page_end) of @chunk are about to be
+ * unmapped.  Flush cache.  As each flushing trial can be very
+ * expensive, issue flush on the whole region at once rather than
+ * doing it for each cpu.  This could be an overkill but is more
+ * scalable.
+ */
+static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
+                                 int page_start, int page_end)
+{
+        flush_cache_vunmap(
+                pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), /* range start: page_start within pcpu_first_unit_cpu's unit */
+                pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); /* range end: page_end within pcpu_last_unit_cpu's unit */
+}
+static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
+{
+        unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); /* page count converted to bytes; no flush is issued here */
+}
+/**
+ * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @pages: pages array which can be used to pass information to free
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to unmap
+ * @page_end: page index of the last page to unmap + 1
+ *
+ * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
+ * Corresponding elements in @pages were cleared by the caller and can
+ * be used to carry information to pcpu_free_pages() which will be
+ * called after all unmaps are finished.  The caller should call
+ * proper pre/post flush functions.
+ */
+static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
+                             struct page **pages, unsigned long *populated,
+                             int page_start, int page_end)
+{
+        unsigned int cpu;
+        int i;
+        for_each_possible_cpu(cpu) {
+                for (i = page_start; i < page_end; i++) {
+                        struct page *page;
+                        page = pcpu_chunk_page(chunk, cpu, i);
+                        WARN_ON(!page);
+                        pages[pcpu_page_idx(cpu, i)] = page; /* remembered so the page can be freed after unmapping */
+                }
+                __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), /* unmap this cpu's whole [page_start, page_end) run at once */
+                                   page_end - page_start);
+        }
+        for (i = page_start; i < page_end; i++)
+                __clear_bit(i, populated); /* updates the caller-supplied bitmap, not chunk->populated */
+}
+/**
+ * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been unmapped.  Flush
+ * TLB for the regions.  This can be skipped if the area is to be
+ * returned to vmalloc as vmalloc will handle TLB flushing lazily.
+ *
+ * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * for the whole region.
+ */
+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+                                      int page_start, int page_end)
+{
+        flush_tlb_kernel_range(
+                pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), /* range start: page_start within pcpu_first_unit_cpu's unit */
+                pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); /* range end: page_end within pcpu_last_unit_cpu's unit */
+}
+static int __pcpu_map_pages(unsigned long addr, struct page **pages,
+                            int nr_pages)
+{
+        return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, /* page count converted to bytes; result returned unchanged */
+                                        PAGE_KERNEL, pages);
+}
+/**
+ * pcpu_map_pages - map pages into a pcpu_chunk
+ * @chunk: chunk of interest
+ * @pages: pages array containing pages to be mapped
+ * @populated: populated bitmap
+ * @page_start: page index of the first page to map
+ * @page_end: page index of the last page to map + 1
+ *
+ * For each cpu, map pages [@page_start,@page_end) into @chunk.  The
+ * caller is responsible for calling pcpu_post_map_flush() after all
+ * mappings are complete.
+ *
+ * This function is responsible for setting corresponding bits in
+ * the @populated bitmap and whatever is necessary for reverse
+ * lookup (addr -> chunk).
+ */
+static int pcpu_map_pages(struct pcpu_chunk *chunk,
+                          struct page **pages, unsigned long *populated,
+                          int page_start, int page_end)
+{
+        unsigned int cpu, tcpu;
+        int i, err;
+        for_each_possible_cpu(cpu) {
+                err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+                                       &pages[pcpu_page_idx(cpu, page_start)],
+                                       page_end - page_start);
+                if (err < 0)
+                        goto err; /* cpu now identifies the unit whose mapping failed */
+        }
+        /* mapping successful, link chunk and mark populated */
+        for (i = page_start; i < page_end; i++) {
+                for_each_possible_cpu(cpu)
+                        pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
+                                            chunk);
+                __set_bit(i, populated); /* updates the caller-supplied bitmap */
+        }
+        return 0;
+err:
+        for_each_possible_cpu(tcpu) {
+                if (tcpu == cpu) /* stop at the unit whose mapping failed */
+                        break;
+                __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), /* undo the units mapped before the failure */
+                                   page_end - page_start);
+        }
+        return err;
+}
+/**
+ * pcpu_post_map_flush - flush cache after mapping
+ * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @page_start: page index of the first page to be flushed
+ * @page_end: page index of the last page to be flushed + 1
+ *
+ * Pages [@page_start,@page_end) of @chunk have been mapped.  Flush
+ * cache.
+ *
+ * As with pcpu_pre_unmap_flush(), cache flushing is also done at once
+ * for the whole region.
+ */
+static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
+                                int page_start, int page_end)
+{
+        flush_cache_vmap(
+                pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), /* range start: page_start within pcpu_first_unit_cpu's unit */
+                pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); /* range end: page_end within pcpu_last_unit_cpu's unit */
+}
+/**
+ * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @off: offset to the area to populate
+ * @size: size of the area to populate in bytes
+ *
+ * For each cpu, populate and map pages [@page_start,@page_end) into
+ * @chunk.  The area is cleared on return.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, does GFP_KERNEL allocation.
+ */
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+        int page_start = PFN_DOWN(off); /* first page index touched by [off, off + size) */
+        int page_end = PFN_UP(off + size); /* one past the last page index touched */
+        int free_end = page_start, unmap_end = page_start; /* upper bounds used by the err_free / err_unmap rollbacks */
+        struct page **pages;
+        unsigned long *populated;
+        unsigned int cpu;
+        int rs, re, rc;
+        /* quick path, check whether all pages are already there */
+        rs = page_start;
+        pcpu_next_pop(chunk, &rs, &re, page_end);
+        if (rs == page_start && re == page_end)
+                goto clear;
+        /* need to allocate and map pages, this chunk can't be immutable */
+        WARN_ON(chunk->immutable);
+        pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); /* may_alloc == true: temp array and bitmap may be allocated here */
+        if (!pages)
+                return -ENOMEM;
+        /* alloc and map */
+        pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+                rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
+                if (rc)
+                        goto err_free;
+                free_end = re; /* regions below re now have pages allocated */
+        }
+        pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
+                rc = pcpu_map_pages(chunk, pages, populated, rs, re);
+                if (rc)
+                        goto err_unmap;
+                unmap_end = re; /* regions below re are now mapped */
+        }
+        pcpu_post_map_flush(chunk, page_start, page_end);
+        /* commit new bitmap */
+        bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
+clear:
+        for_each_possible_cpu(cpu)
+                memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); /* zero [off, off + size) in every cpu's unit */
+        return 0;
+err_unmap:
+        pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
+        pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
+                pcpu_unmap_pages(chunk, pages, populated, rs, re);
+        pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
+err_free: /* also reached by falling through from err_unmap */
+        pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
+                pcpu_free_pages(chunk, pages, populated, rs, re);
+        return rc;
+}
+/**
+ * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
+ * @chunk: chunk to depopulate
+ * @off: offset to the area to depopulate
+ * @size: size of the area to depopulate in bytes
+ *
+ * For each cpu, depopulate and unmap the pages covering the area
+ * @off .. @off + @size from @chunk.  The cache is flushed before
+ * unmapping; the TLB is not flushed here as vmalloc will handle
+ * that lazily.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
+ */
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+        int page_start = PFN_DOWN(off); /* first page index touched by [off, off + size) */
+        int page_end = PFN_UP(off + size); /* one past the last page index touched */
+        struct page **pages;
+        unsigned long *populated;
+        int rs, re;
+        /* quick path, check whether it's empty already */
+        rs = page_start;
+        pcpu_next_unpop(chunk, &rs, &re, page_end);
+        if (rs == page_start && re == page_end)
+                return;
+        /* immutable chunks can't be depopulated */
+        WARN_ON(chunk->immutable);
+        /*
+         * If control reaches here, there must have been at least one
+         * successful population attempt so the temp pages array must
+         * be available now.
+         */
+        pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); /* may_alloc == false: nothing is allocated on this path */
+        BUG_ON(!pages);
+        /* unmap and free */
+        pcpu_pre_unmap_flush(chunk, page_start, page_end);
+        pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+                pcpu_unmap_pages(chunk, pages, populated, rs, re); /* also records each unmapped page in pages[] */
+        /* no need to flush tlb, vmalloc will handle it lazily */
+        pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
+                pcpu_free_pages(chunk, pages, populated, rs, re); /* frees the pages recorded by pcpu_unmap_pages() */
+        /* commit new bitmap */
+        bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
+}
+static struct pcpu_chunk *pcpu_create_chunk(void)
+{
+        struct pcpu_chunk *chunk;
+        struct vm_struct **vms;
+        chunk = pcpu_alloc_chunk();
+        if (!chunk)
+                return NULL;
+        vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, /* vm areas for all pcpu_nr_groups groups */
+                                pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL);
+        if (!vms) {
+                pcpu_free_chunk(chunk); /* undo pcpu_alloc_chunk() */
+                return NULL;
+        }
+        chunk->data = vms; /* kept so pcpu_destroy_chunk() can pass it to pcpu_free_vm_areas() */
+        chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; /* first area's address with group 0's offset subtracted */
+        return chunk;
+}
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
+{
+        if (chunk && chunk->data) /* chunk->data is the vm_struct array stored by pcpu_create_chunk() */
+                pcpu_free_vm_areas(chunk->data, pcpu_nr_groups);
+        pcpu_free_chunk(chunk); /* reached even when chunk is NULL - NOTE(review): confirm pcpu_free_chunk() accepts NULL */
+}
+static struct page *pcpu_addr_to_page(void *addr)
+{
+        return vmalloc_to_page(addr); /* chunks in this file live in vm areas, so the lookup uses vmalloc_to_page() */
+}
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
+{
+        /* no extra restriction */
+        return 0; /* @ai is not inspected; every configuration is accepted */
+}
diff --git a/mm/percpu.c b/mm/percpu.c
index 768419d44ad..39f7dfd5958 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1,5 +1,5 @@
 /*
- * linux/mm/percpu.c - percpu memory allocator
+ * mm/percpu.c - percpu memory allocator
 *
 * Copyright (C) 2009           SUSE Linux Products GmbH
 * Copyright (C) 2009           Tejun Heo <tj@kernel.org>
@@ -7,14 +7,13 @@
 * This file is released under the GPLv2.
 *
 * This is percpu allocator which can handle both static and dynamic
- * areas.  Percpu areas are allocated in chunks in vmalloc area.  Each
+ * areas.  Percpu areas are allocated in chunks.  Each chunk is
- * chunk is consisted of boot-time determined number of units and the
+ * consisted of boot-time determined number of units and the first
- * first chunk is used for static percpu variables in the kernel image
+ * chunk is used for static percpu variables in the kernel image
 * (special boot time alloc/init handling necessary as these areas
 * need to be brought up before allocation services are running).
 * Unit grows as necessary and all units grow or shrink in unison.
- * When a chunk is filled up, another chunk is allocated.  ie. in
+ * When a chunk is filled up, another chunk is allocated.
- * vmalloc area
 *
 *  c0                           c1                         c2
 *  -------------------          -------------------        ------------
@@ -99,7 +98,7 @@ struct pcpu_chunk {
        int                     map_used;       /* # of map entries used */
        int                     map_alloc;      /* # of map entries allocated */
        int                     *map;           /* allocation map */
-        struct vm_struct        **vms;          /* mapped vmalloc regions */
+        void                    *data;          /* chunk data */
        bool                    immutable;      /* no [de]population allowed */
        unsigned long           populated[];    /* populated bitmap */
 };
@@ -177,6 +176,21 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 static void pcpu_reclaim(struct work_struct *work);
 static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
+/*
+ * Return true if @addr lies in the half-open range
+ * [pcpu_first_chunk->base_addr, pcpu_first_chunk->base_addr + pcpu_unit_size).
+ */
+static bool pcpu_addr_in_first_chunk(void *addr)
+{
+        void *first_start = pcpu_first_chunk->base_addr;
+        return addr >= first_start && addr < first_start + pcpu_unit_size;
+}
+/*
+ * Return true if @addr lies in the half-open range
+ * [pcpu_first_chunk->base_addr,
+ *  pcpu_first_chunk->base_addr + pcpu_reserved_chunk_limit).
+ * Note that the range is measured from the first chunk's base address.
+ */
+static bool pcpu_addr_in_reserved_chunk(void *addr)
+{
+        void *first_start = pcpu_first_chunk->base_addr;
+        return addr >= first_start &&
+                addr < first_start + pcpu_reserved_chunk_limit;
+}
 static int __pcpu_size_to_slot(int size)
 {
        int highbit = fls(size);        /* size is in bytes */
@@ -198,27 +212,6 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
        return pcpu_size_to_slot(chunk->free_size);
 }
-static int pcpu_page_idx(unsigned int cpu, int page_idx)
-{
-        return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
-}
-static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
-                                     unsigned int cpu, int page_idx)
-{
-        return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
-                (page_idx << PAGE_SHIFT);
-}
-static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
-                                    unsigned int cpu, int page_idx)
-{
-        /* must not be used on pre-mapped chunk */
-        WARN_ON(chunk->immutable);
-        return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
-}
 /* set the pointer to a chunk in a page struct */
 static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
 {
@@ -231,13 +224,27 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
        return (struct pcpu_chunk *)page->index;
 }
-static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
+/*
+ * Compute a flat page index for (@cpu, @page_idx): the cpu's entry in
+ * pcpu_unit_map times pcpu_unit_pages, plus @page_idx.
+ * NOTE(review): __maybe_unused presumably because not every chunk
+ * implementation calls this helper - confirm against the included files.
+ */
+static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx)
+{
+        return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx;
+}
+/*
+ * Compute the address of page @page_idx of @cpu's unit in @chunk:
+ * chunk->base_addr + pcpu_unit_offsets[@cpu] + (@page_idx << PAGE_SHIFT),
+ * returned as an unsigned long.
+ */
+static unsigned long __maybe_unused pcpu_chunk_addr(struct pcpu_chunk *chunk,
+                                                unsigned int cpu, int page_idx)
+{
+        return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] +
+                (page_idx << PAGE_SHIFT);
+}
+static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk,
+                                           int *rs, int *re, int end)
 {
        *rs = find_next_zero_bit(chunk->populated, end, *rs);
        *re = find_next_bit(chunk->populated, end, *rs + 1);
 }
-static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
+static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
+                                         int *rs, int *re, int end)
 {
        *rs = find_next_bit(chunk->populated, end, *rs);
        *re = find_next_zero_bit(chunk->populated, end, *rs + 1);
@@ -326,36 +333,6 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 }
 /**
- * pcpu_chunk_addr_search - determine chunk containing specified address
- * @addr: address for which the chunk needs to be determined.
- *
- * RETURNS:
- * The address of the found chunk.
- */
-static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
-{
-        void *first_start = pcpu_first_chunk->base_addr;
-        /* is it in the first chunk? */
-        if (addr >= first_start && addr < first_start + pcpu_unit_size) {
-                /* is it in the reserved area? */
-                if (addr < first_start + pcpu_reserved_chunk_limit)
-                        return pcpu_reserved_chunk;
-                return pcpu_first_chunk;
-        }
-        /*
-         * The address is relative to unit0 which might be unused and
-         * thus unmapped.  Offset the address to the unit space of the
-         * current processor before looking it up in the vmalloc
-         * space.  Note that any possible cpu id can be used here, so
-         * there's no need to worry about preemption or cpu hotplug.
-         */
-        addr += pcpu_unit_offsets[raw_smp_processor_id()];
-        return pcpu_get_page_chunk(vmalloc_to_page(addr));
-}
-/**
 * pcpu_need_to_extend - determine whether chunk area map needs to be extended
 * @chunk: chunk of interest
 *
@@ -623,434 +600,92 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
        pcpu_chunk_relocate(chunk, oslot);
 }
-/**
+static struct pcpu_chunk *pcpu_alloc_chunk(void)
- * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
- * @chunk: chunk of interest
- * @bitmapp: output parameter for bitmap
- * @may_alloc: may allocate the array
- *
- * Returns pointer to array of pointers to struct page and bitmap,
- * both of which can be indexed with pcpu_page_idx().  The returned
- * array is cleared to zero and *@bitmapp is copied from
- * @chunk->populated.  Note that there is only one array and bitmap
- * and access exclusion is the caller's responsibility.
- *
- * CONTEXT:
- * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
- * Otherwise, don't care.
- *
- * RETURNS:
- * Pointer to temp pages array on success, NULL on failure.
- */
-static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
-                                               unsigned long **bitmapp,
-                                               bool may_alloc)
-{
-        static struct page **pages;
-        static unsigned long *bitmap;
-        size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
-        size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
-                             sizeof(unsigned long);
-        if (!pages || !bitmap) {
-                if (may_alloc && !pages)
-                        pages = pcpu_mem_alloc(pages_size);
-                if (may_alloc && !bitmap)
-                        bitmap = pcpu_mem_alloc(bitmap_size);
-                if (!pages || !bitmap)
-                        return NULL;
-        }
-        memset(pages, 0, pages_size);
-        bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);
-        *bitmapp = bitmap;
-        return pages;
-}
-/**
- * pcpu_free_pages - free pages which were allocated for @chunk
- * @chunk: chunk pages were allocated for
- * @pages: array of pages to be freed, indexed by pcpu_page_idx()
- * @populated: populated bitmap
- * @page_start: page index of the first page to be freed
- * @page_end: page index of the last page to be freed + 1
- *
- * Free pages [@page_start and @page_end) in @pages for all units.
- * The pages were allocated for @chunk.
- */
-static void pcpu_free_pages(struct pcpu_chunk *chunk,
-                            struct page **pages, unsigned long *populated,
-                            int page_start, int page_end)
 {
-        unsigned int cpu;
+        struct pcpu_chunk *chunk;
-        int i;
-        for_each_possible_cpu(cpu) {
+        chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
-                for (i = page_start; i < page_end; i++) {
+        if (!chunk)
-                        struct page *page = pages[pcpu_page_idx(cpu, i)];
+                return NULL;
-                        if (page)
+        chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
-                                __free_page(page);
+        if (!chunk->map) {
-                }
+                kfree(chunk);
+                return NULL;
        }
-}
-/**
+        chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
- * pcpu_alloc_pages - allocates pages for @chunk
+        chunk->map[chunk->map_used++] = pcpu_unit_size;
- * @chunk: target chunk
- * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
- * @populated: populated bitmap
- * @page_start: page index of the first page to be allocated
- * @page_end: page index of the last page to be allocated + 1
- *
- * Allocate pages [@page_start,@page_end) into @pages for all units.
- * The allocation is for @chunk.  Percpu core doesn't care about the
- * content of @pages and will pass it verbatim to pcpu_map_pages().
- */
-static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
-                            struct page **pages, unsigned long *populated,
-                            int page_start, int page_end)
-{
-        const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
-        unsigned int cpu;
-        int i;
-        for_each_possible_cpu(cpu) {
+        INIT_LIST_HEAD(&chunk->list);
-                for (i = page_start; i < page_end; i++) {
+        chunk->free_size = pcpu_unit_size;
-                        struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
+        chunk->contig_hint = pcpu_unit_size;
-                        *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
-                        if (!*pagep) {
-                                pcpu_free_pages(chunk, pages, populated,
-                                                page_start, page_end);
-                                return -ENOMEM;
-                        }
-                }
-        }
-        return 0;
-}
-/**
+        return chunk;
- * pcpu_pre_unmap_flush - flush cache prior to unmapping
- * @chunk: chunk the regions to be flushed belongs to
- * @page_start: page index of the first page to be flushed
- * @page_end: page index of the last page to be flushed + 1
- *
- * Pages in [@page_start,@page_end) of @chunk are about to be
- * unmapped.  Flush cache.  As each flushing trial can be very
- * expensive, issue flush on the whole region at once rather than
- * doing it for each cpu.  This could be an overkill but is more
- * scalable.
- */
-static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
-                                 int page_start, int page_end)
-{
-        flush_cache_vunmap(
-                pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
-                pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
 }
-static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
+static void pcpu_free_chunk(struct pcpu_chunk *chunk)
 {
-        unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT);
+        if (!chunk)
+                return;
+        pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
+        kfree(chunk);
 }
-/**
+/*
- * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
+ * Chunk management implementation.
- * @chunk: chunk of interest
+ *
- * @pages: pages array which can be used to pass information to free
+ * To allow different implementations, chunk alloc/free and
- * @populated: populated bitmap
+ * [de]population are implemented in a separate file which is pulled
- * @page_start: page index of the first page to unmap
+ * into this file and compiled together.  The following functions
- * @page_end: page index of the last page to unmap + 1
+ * should be implemented.
- *
+ *
- * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
+ * pcpu_populate_chunk          - populate the specified range of a chunk
- * Corresponding elements in @pages were cleared by the caller and can
+ * pcpu_depopulate_chunk        - depopulate the specified range of a chunk
- * be used to carry information to pcpu_free_pages() which will be
+ * pcpu_create_chunk            - create a new chunk
- * called after all unmaps are finished.  The caller should call
+ * pcpu_destroy_chunk           - destroy a chunk, always preceded by full depop
- * proper pre/post flush functions.
+ * pcpu_addr_to_page            - translate address to struct page
+ * pcpu_verify_alloc_info       - check alloc_info is acceptable during init
 */
-static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
-                             struct page **pages, unsigned long *populated,
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
-                             int page_start, int page_end)
+static struct pcpu_chunk *pcpu_create_chunk(void);
-{
+static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
-        unsigned int cpu;
+static struct page *pcpu_addr_to_page(void *addr);
-        int i;
+static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
-        for_each_possible_cpu(cpu) {
+#ifdef CONFIG_NEED_PER_CPU_KM
-                for (i = page_start; i < page_end; i++) {
+#include "percpu-km.c"
-                        struct page *page;
+#else
+#include "percpu-vm.c"
-                        page = pcpu_chunk_page(chunk, cpu, i);
+#endif
-                        WARN_ON(!page);
-                        pages[pcpu_page_idx(cpu, i)] = page;
-                }
-                __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
-                                   page_end - page_start);
-        }
-        for (i = page_start; i < page_end; i++)
-                __clear_bit(i, populated);
-}
 /**
- * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
+ * pcpu_chunk_addr_search - determine chunk containing specified address
- * @chunk: pcpu_chunk the regions to be flushed belong to
+ * @addr: address for which the chunk needs to be determined.
- * @page_start: page index of the first page to be flushed
- * @page_end: page index of the last page to be flushed + 1
- *
- * Pages [@page_start,@page_end) of @chunk have been unmapped.  Flush
- * TLB for the regions.  This can be skipped if the area is to be
- * returned to vmalloc as vmalloc will handle TLB flushing lazily.
 *
- * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
+ * RETURNS:
- * for the whole region.
+ * The address of the found chunk.
- */
-static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
-                                      int page_start, int page_end)
-{
-        flush_tlb_kernel_range(
-                pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
-                pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
-}
-static int __pcpu_map_pages(unsigned long addr, struct page **pages,
-                            int nr_pages)
-{
-        return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT,
-                                        PAGE_KERNEL, pages);
-}
-/**
- * pcpu_map_pages - map pages into a pcpu_chunk
- * @chunk: chunk of interest
- * @pages: pages array containing pages to be mapped
- * @populated: populated bitmap
- * @page_start: page index of the first page to map
- * @page_end: page index of the last page to map + 1
- *
- * For each cpu, map pages [@page_start,@page_end) into @chunk.  The
- * caller is responsible for calling pcpu_post_map_flush() after all
- * mappings are complete.
- *
- * This function is responsible for setting corresponding bits in
- * @chunk->populated bitmap and whatever is necessary for reverse
- * lookup (addr -> chunk).
 */
-static int pcpu_map_pages(struct pcpu_chunk *chunk,
+static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
-                          struct page **pages, unsigned long *populated,
-                          int page_start, int page_end)
 {
-        unsigned int cpu, tcpu;
+        /* is it in the first chunk? */
-        int i, err;
+        if (pcpu_addr_in_first_chunk(addr)) {
+                /* is it in the reserved area? */
-        for_each_possible_cpu(cpu) {
+                if (pcpu_addr_in_reserved_chunk(addr))
-                err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
+                        return pcpu_reserved_chunk;
-                                       &pages[pcpu_page_idx(cpu, page_start)],
+                return pcpu_first_chunk;
-                                       page_end - page_start);
-                if (err < 0)
-                        goto err;
-        }
-        /* mapping successful, link chunk and mark populated */
-        for (i = page_start; i < page_end; i++) {
-                for_each_possible_cpu(cpu)
-                        pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
-                                            chunk);
-                __set_bit(i, populated);
-        }
-        return 0;
-err:
-        for_each_possible_cpu(tcpu) {
-                if (tcpu == cpu)
-                        break;
-                __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
-                                   page_end - page_start);
        }
-        return err;
-}
-/**
- * pcpu_post_map_flush - flush cache after mapping
- * @chunk: pcpu_chunk the regions to be flushed belong to
- * @page_start: page index of the first page to be flushed
- * @page_end: page index of the last page to be flushed + 1
- *
- * Pages [@page_start,@page_end) of @chunk have been mapped.  Flush
- * cache.
- *
- * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
- * for the whole region.
- */
-static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
-                                int page_start, int page_end)
-{
-        flush_cache_vmap(
-                pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
-                pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
-}
-/**
- * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
- * @chunk: chunk to depopulate
- * @off: offset to the area to depopulate
- * @size: size of the area to depopulate in bytes
- * @flush: whether to flush cache and tlb or not
- *
- * For each cpu, depopulate and unmap pages [@page_start,@page_end)
- * from @chunk.  If @flush is true, vcache is flushed before unmapping
- * and tlb after.
- *
- * CONTEXT:
- * pcpu_alloc_mutex.
- */
-static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
-{
-        int page_start = PFN_DOWN(off);
-        int page_end = PFN_UP(off + size);
-        struct page **pages;
-        unsigned long *populated;
-        int rs, re;
-        /* quick path, check whether it's empty already */
-        rs = page_start;
-        pcpu_next_unpop(chunk, &rs, &re, page_end);
-        if (rs == page_start && re == page_end)
-                return;
-        /* immutable chunks can't be depopulated */
-        WARN_ON(chunk->immutable);
        /*
-         * If control reaches here, there must have been at least one
+         * The address is relative to unit0 which might be unused and
-         * successful population attempt so the temp pages array must
+         * thus unmapped.  Offset the address to the unit space of the
-         * be available now.
+         * current processor before looking it up in the vmalloc
+         * space.  Note that any possible cpu id can be used here, so
+         * there's no need to worry about preemption or cpu hotplug.
         */
-        pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
+        addr += pcpu_unit_offsets[raw_smp_processor_id()];
-        BUG_ON(!pages);
+        return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
-        /* unmap and free */
-        pcpu_pre_unmap_flush(chunk, page_start, page_end);
-        pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
-                pcpu_unmap_pages(chunk, pages, populated, rs, re);
-        /* no need to flush tlb, vmalloc will handle it lazily */
-        pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
-                pcpu_free_pages(chunk, pages, populated, rs, re);
-        /* commit new bitmap */
-        bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
-}
-/**
- * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
- * @chunk: chunk of interest
- * @off: offset to the area to populate
- * @size: size of the area to populate in bytes
- *
- * For each cpu, populate and map pages [@page_start,@page_end) into
- * @chunk.  The area is cleared on return.
- *
- * CONTEXT:
- * pcpu_alloc_mutex, does GFP_KERNEL allocation.
- */
-static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
-{
-        int page_start = PFN_DOWN(off);
-        int page_end = PFN_UP(off + size);
-        int free_end = page_start, unmap_end = page_start;
-        struct page **pages;
-        unsigned long *populated;
-        unsigned int cpu;
-        int rs, re, rc;
-        /* quick path, check whether all pages are already there */
-        rs = page_start;
-        pcpu_next_pop(chunk, &rs, &re, page_end);
-        if (rs == page_start && re == page_end)
-                goto clear;
-        /* need to allocate and map pages, this chunk can't be immutable */
-        WARN_ON(chunk->immutable);
-        pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
-        if (!pages)
-                return -ENOMEM;
-        /* alloc and map */
-        pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
-                rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
-                if (rc)
-                        goto err_free;
-                free_end = re;
-        }
-        pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
-                rc = pcpu_map_pages(chunk, pages, populated, rs, re);
-                if (rc)
-                        goto err_unmap;
-                unmap_end = re;
-        }
-        pcpu_post_map_flush(chunk, page_start, page_end);
-        /* commit new bitmap */
-        bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
-clear:
-        for_each_possible_cpu(cpu)
-                memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
-        return 0;
-err_unmap:
-        pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
-        pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
-                pcpu_unmap_pages(chunk, pages, populated, rs, re);
-        pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
-err_free:
-        pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
-                pcpu_free_pages(chunk, pages, populated, rs, re);
-        return rc;
-}
-static void free_pcpu_chunk(struct pcpu_chunk *chunk)
-{
-        if (!chunk)
-                return;
-        if (chunk->vms)
-                pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups);
-        pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));
-        kfree(chunk);
-}
-static struct pcpu_chunk *alloc_pcpu_chunk(void)
-{
-        struct pcpu_chunk *chunk;
-        chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
-        if (!chunk)
-                return NULL;
-        chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
-        chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
-        chunk->map[chunk->map_used++] = pcpu_unit_size;
-        chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
-                                       pcpu_nr_groups, pcpu_atom_size,
-                                       GFP_KERNEL);
-        if (!chunk->vms) {
-                free_pcpu_chunk(chunk);
-                return NULL;
-        }
-        INIT_LIST_HEAD(&chunk->list);
-        chunk->free_size = pcpu_unit_size;
-        chunk->contig_hint = pcpu_unit_size;
-        chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0];
-        return chunk;
 }
 /**
@@ -1142,7 +777,7 @@ restart:
        /* hmmm... no space left, create a new chunk */
        spin_unlock_irqrestore(&pcpu_lock, flags);
-        chunk = alloc_pcpu_chunk();
+        chunk = pcpu_create_chunk();
        if (!chunk) {
                err = "failed to allocate new chunk";
                goto fail_unlock_mutex;
@@ -1254,7 +889,7 @@ static void pcpu_reclaim(struct work_struct *work)
        list_for_each_entry_safe(chunk, next, &todo, list) {
                pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
-                free_pcpu_chunk(chunk);
+                pcpu_destroy_chunk(chunk);
        }
        mutex_unlock(&pcpu_alloc_mutex);
@@ -1304,6 +939,32 @@ void free_percpu(void __percpu *ptr)
 EXPORT_SYMBOL_GPL(free_percpu);
 /**
+ * is_kernel_percpu_address - test whether address is from static percpu area
+ * @addr: address to test
+ *
+ * Test whether @addr belongs to in-kernel static percpu area.  Module
+ * static percpu areas are not considered.  For those, use
+ * is_module_percpu_address().
+ *
+ * RETURNS:
+ * %true if @addr is from in-kernel static percpu area, %false otherwise.
+ */
+bool is_kernel_percpu_address(unsigned long addr)
+{
+        /* byte distance between the __per_cpu_start and __per_cpu_end symbols */
+        const size_t static_size = __per_cpu_end - __per_cpu_start;
+        /* pcpu_base_addr in percpu-pointer form, so per_cpu_ptr() can be applied */
+        void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
+        unsigned int cpu;
+        /* test @addr against [start, start + static_size) of every possible cpu */
+        for_each_possible_cpu(cpu) {
+                void *start = per_cpu_ptr(base, cpu);
+                if ((void *)addr >= start && (void *)addr < start + static_size)
+                        return true;
+        }
+        return false;
+}
+/**
 * per_cpu_ptr_to_phys - convert translated percpu address to physical address
 * @addr: the address to be converted to physical address
 *
@@ -1317,11 +978,14 @@ EXPORT_SYMBOL_GPL(free_percpu);
 */
 phys_addr_t per_cpu_ptr_to_phys(void *addr)
 {
-        if ((unsigned long)addr < VMALLOC_START ||
+        if (pcpu_addr_in_first_chunk(addr)) {
-                        (unsigned long)addr >= VMALLOC_END)
+                if ((unsigned long)addr < VMALLOC_START ||
-                return __pa(addr);
+                    (unsigned long)addr >= VMALLOC_END)
-        else
+                        return __pa(addr);
-                return page_to_phys(vmalloc_to_page(addr));
+                else
+                        return page_to_phys(vmalloc_to_page(addr));
+        } else
+                return page_to_phys(pcpu_addr_to_page(addr));
 }
 static inline size_t pcpu_calc_fc_sizes(size_t static_size,
@@ -1693,6 +1357,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
        PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
        PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
        PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
+        PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
        /* process group information and build config tables accordingly */
        group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0]));
diff --git a/mm/percpu_up.c b/mm/percpu_up.c
new file mode 100644
index 00000000000..c4351c7f57d
--- /dev/null
+++ b/mm/percpu_up.c
@@ -0,0 +1,30 @@
+/*
+ * mm/percpu_up.c - dummy percpu memory allocator implementation for UP
+ */
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+/*
+ * UP version of __alloc_percpu(): returns the result of
+ * kzalloc(@size, GFP_KERNEL).  @align is not passed to the allocator;
+ * it is only checked, with a one-time warning when it exceeds
+ * SMP_CACHE_BYTES.
+ */
+void __percpu *__alloc_percpu(size_t size, size_t align)
+{
+        /*
+         * Can't easily make larger alignment work with kmalloc.  WARN
+         * on it.  Larger alignment should only be used for module
+         * percpu sections on SMP for which this path isn't used.
+         */
+        WARN_ON_ONCE(align > SMP_CACHE_BYTES);
+        return kzalloc(size, GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(__alloc_percpu);
+/*
+ * UP version of free_percpu(): releases @p with kfree(), matching the
+ * kzalloc() used by __alloc_percpu() in this file.
+ */
+void free_percpu(void __percpu *p)
+{
+        kfree(p);
+}
+EXPORT_SYMBOL_GPL(free_percpu);
+/*
+ * UP version of per_cpu_ptr_to_phys(): converts @addr with __pa() and
+ * returns the result; no chunk or vmalloc lookup is performed here.
+ */
+phys_addr_t per_cpu_ptr_to_phys(void *addr)
+{
+        return __pa(addr);
+}
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 6633965bb27..2876349339a 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -14,6 +14,7 @@
 */
 #include <linux/kernel.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
 #include <linux/module.h>
diff --git a/mm/readahead.c b/mm/readahead.c
index 337b20e946f..dfa9a1a03a1 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/fs.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/blkdev.h>
@@ -502,7 +503,7 @@ void page_cache_sync_readahead(struct address_space *mapping,
                return;
        /* be dumb */
-        if (filp->f_mode & FMODE_RANDOM) {
+        if (filp && (filp->f_mode & FMODE_RANDOM)) {
                force_page_cache_readahead(mapping, filp, offset, req_size);
                return;
        }
diff --git a/mm/rmap.c b/mm/rmap.c
index fcd593c9c99..0feeef860a8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -133,8 +133,8 @@ int anon_vma_prepare(struct vm_area_struct *vma)
                                goto out_enomem_free_avc;
                        allocated = anon_vma;
                }
-                spin_lock(&anon_vma->lock);
+                spin_lock(&anon_vma->lock);
                /* page_table_lock to protect against threads */
                spin_lock(&mm->page_table_lock);
                if (likely(!vma->anon_vma)) {
@@ -144,14 +144,15 @@ int anon_vma_prepare(struct vm_area_struct *vma)
                        list_add(&avc->same_vma, &vma->anon_vma_chain);
                        list_add(&avc->same_anon_vma, &anon_vma->head);
                        allocated = NULL;
+                        avc = NULL;
                }
                spin_unlock(&mm->page_table_lock);
                spin_unlock(&anon_vma->lock);
-                if (unlikely(allocated)) {
+                if (unlikely(allocated))
                        anon_vma_free(allocated);
+                if (unlikely(avc))
                        anon_vma_chain_free(avc);
-                }
        }
        return 0;
@@ -182,7 +183,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 {
        struct anon_vma_chain *avc, *pavc;
-        list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) {
+        list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
                avc = anon_vma_chain_alloc();
                if (!avc)
                        goto enomem_failure;
@@ -232,6 +233,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
 out_error_free_anon_vma:
        anon_vma_free(anon_vma);
 out_error:
+        unlink_anon_vmas(vma);
        return -ENOMEM;
 }
@@ -334,14 +336,13 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 /*
 * At what user virtual address is page expected in vma?
- * checking that the page matches the vma.
+ * Caller should check the page is actually part of the vma.
 */
 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 {
-        if (PageAnon(page)) {
+        if (PageAnon(page))
-                if (vma->anon_vma != page_anon_vma(page))
+                ;
-                        return -EFAULT;
+        else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
-        } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
                if (!vma->vm_file ||
                    vma->vm_file->f_mapping != page->mapping)
                        return -EFAULT;
@@ -729,13 +730,29 @@ void page_move_anon_rmap(struct page *page,
 * @page:       the page to add the mapping to
 * @vma:        the vm area in which the mapping is added
 * @address:    the user virtual address mapped
+ * @exclusive:  the page is exclusively owned by the current process
 */
 static void __page_set_anon_rmap(struct page *page,
-        struct vm_area_struct *vma, unsigned long address)
+        struct vm_area_struct *vma, unsigned long address, int exclusive)
 {
        struct anon_vma *anon_vma = vma->anon_vma;
        BUG_ON(!anon_vma);
+        /*
+         * If the page isn't exclusively mapped into this vma,
+         * we must use the _oldest_ possible anon_vma for the
+         * page mapping!
+         *
+         * So take the last AVC chain entry in the vma, which is
+         * the deepest ancestor, and use the anon_vma from that.
+         */
+        if (!exclusive) {
+                struct anon_vma_chain *avc;
+                avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma);
+                anon_vma = avc->anon_vma;
+        }
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
        page->mapping = (struct address_space *) anon_vma;
        page->index = linear_page_index(vma, address);
@@ -790,7 +807,7 @@ void page_add_anon_rmap(struct page *page,
        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
        if (first)
-                __page_set_anon_rmap(page, vma, address);
+                __page_set_anon_rmap(page, vma, address, 0);
        else
                __page_check_anon_rmap(page, vma, address);
 }
@@ -812,7 +829,7 @@ void page_add_new_anon_rmap(struct page *page,
        SetPageSwapBacked(page);
        atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
        __inc_zone_page_state(page, NR_ANON_PAGES);
-        __page_set_anon_rmap(page, vma, address);
+        __page_set_anon_rmap(page, vma, address, 1);
        if (page_evictable(page, vma))
                lru_cache_add_lru(page, LRU_ACTIVE_ANON);
        else
diff --git a/mm/shmem.c b/mm/shmem.c
index eef4ebea515..0cd7f66f1c6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1545,8 +1545,8 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
        return 0;
 }
-static struct inode *shmem_get_inode(struct super_block *sb, int mode,
+static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
-                                        dev_t dev, unsigned long flags)
+                                     int mode, dev_t dev, unsigned long flags)
 {
        struct inode *inode;
        struct shmem_inode_info *info;
@@ -1557,9 +1557,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode,
        inode = new_inode(sb);
        if (inode) {
-                inode->i_mode = mode;
+                inode_init_owner(inode, dir, mode);
-                inode->i_uid = current_fsuid();
-                inode->i_gid = current_fsgid();
                inode->i_blocks = 0;
                inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -1814,7 +1812,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
        struct inode *inode;
        int error = -ENOSPC;
-        inode = shmem_get_inode(dir->i_sb, mode, dev, VM_NORESERVE);
+        inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
        if (inode) {
                error = security_inode_init_security(inode, dir, NULL, NULL,
                                                     NULL);
@@ -1833,11 +1831,6 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 #else
                error = 0;
 #endif
-                if (dir->i_mode & S_ISGID) {
-                        inode->i_gid = dir->i_gid;
-                        if (S_ISDIR(mode))
-                                inode->i_mode |= S_ISGID;
-                }
                dir->i_size += BOGO_DIRENT_SIZE;
                dir->i_ctime = dir->i_mtime = CURRENT_TIME;
                d_instantiate(dentry, inode);
@@ -1957,7 +1950,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
        if (len > PAGE_CACHE_SIZE)
                return -ENAMETOOLONG;
-        inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
+        inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE);
        if (!inode)
                return -ENOSPC;
@@ -1992,8 +1985,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
                unlock_page(page);
                page_cache_release(page);
        }
-        if (dir->i_mode & S_ISGID)
-                inode->i_gid = dir->i_gid;
        dir->i_size += BOGO_DIRENT_SIZE;
        dir->i_ctime = dir->i_mtime = CURRENT_TIME;
        d_instantiate(dentry, inode);
@@ -2071,14 +2062,14 @@ static int shmem_xattr_security_set(struct dentry *dentry, const char *name,
                                          size, flags);
 }
-static struct xattr_handler shmem_xattr_security_handler = {
+static const struct xattr_handler shmem_xattr_security_handler = {
        .prefix = XATTR_SECURITY_PREFIX,
        .list   = shmem_xattr_security_list,
        .get    = shmem_xattr_security_get,
        .set    = shmem_xattr_security_set,
 };
-static struct xattr_handler *shmem_xattr_handlers[] = {
+static const struct xattr_handler *shmem_xattr_handlers[] = {
        &generic_acl_access_handler,
        &generic_acl_default_handler,
        &shmem_xattr_security_handler,
@@ -2366,7 +2357,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_flags |= MS_POSIXACL;
 #endif
-        inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
+        inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
        if (!inode)
                goto failed;
        inode->i_uid = sbinfo->uid;
@@ -2611,7 +2602,7 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
 #define shmem_vm_ops                            generic_file_vm_ops
 #define shmem_file_operations                   ramfs_file_operations
-#define shmem_get_inode(sb, mode, dev, flags)   ramfs_get_inode(sb, mode, dev)
+#define shmem_get_inode(sb, dir, mode, dev, flags)      ramfs_get_inode(sb, dir, mode, dev)
 #define shmem_acct_size(flags, size)            0
 #define shmem_unacct_size(flags, size)          do {} while (0)
 #define SHMEM_MAX_BYTES                         MAX_LFS_FILESIZE
@@ -2655,7 +2646,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
        path.mnt = mntget(shm_mnt);
        error = -ENOSPC;
-        inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags);
+        inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
        if (!inode)
                goto put_dentry;
diff --git a/mm/slab.c b/mm/slab.c
index 3230cd2c6b3..50a73fca19c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -145,30 +145,6 @@
 #define BYTES_PER_WORD          sizeof(void *)
 #define REDZONE_ALIGN           max(BYTES_PER_WORD, __alignof__(unsigned long long))
-#ifndef ARCH_KMALLOC_MINALIGN
-/*
- * Enforce a minimum alignment for the kmalloc caches.
- * Usually, the kmalloc caches are cache_line_size() aligned, except when
- * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned.
- * Some archs want to perform DMA into kmalloc caches and need a guaranteed
- * alignment larger than the alignment of a 64-bit integer.
- * ARCH_KMALLOC_MINALIGN allows that.
- * Note that increasing this value may disable some debug features.
- */
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
-#endif
-#ifndef ARCH_SLAB_MINALIGN
-/*
- * Enforce a minimum alignment for all caches.
- * Intended for archs that get misalignment faults even for BYTES_PER_WORD
- * aligned buffers. Includes ARCH_KMALLOC_MINALIGN.
- * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables
- * some debug features.
- */
-#define ARCH_SLAB_MINALIGN 0
-#endif
 #ifndef ARCH_KMALLOC_FLAGS
 #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
 #endif
@@ -2313,8 +2289,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
        if (ralign < align) {
                ralign = align;
        }
-        /* disable debug if necessary */
+        /* disable debug if not aligning with REDZONE_ALIGN */
-        if (ralign > __alignof__(unsigned long long))
+        if (ralign & (__alignof__(unsigned long long) - 1))
                flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
        /*
         * 4) Store it.
@@ -2340,8 +2316,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
         */
        if (flags & SLAB_RED_ZONE) {
                /* add space for red zone words */
-                cachep->obj_offset += sizeof(unsigned long long);
+                cachep->obj_offset += align;
-                size += 2 * sizeof(unsigned long long);
+                size += align + sizeof(unsigned long long);
        }
        if (flags & SLAB_STORE_USER) {
                /* user store requires one word storage behind the end of
@@ -3695,21 +3671,10 @@ EXPORT_SYMBOL(kmem_cache_alloc_notrace);
 */
 int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
 {
-        unsigned long addr = (unsigned long)ptr;
-        unsigned long min_addr = PAGE_OFFSET;
-        unsigned long align_mask = BYTES_PER_WORD - 1;
        unsigned long size = cachep->buffer_size;
        struct page *page;
-        if (unlikely(addr < min_addr))
+        if (unlikely(!kern_ptr_validate(ptr, size)))
-                goto out;
-        if (unlikely(addr > (unsigned long)high_memory - size))
-                goto out;
-        if (unlikely(addr & align_mask))
-                goto out;
-        if (unlikely(!kern_addr_valid(addr)))
-                goto out;
-        if (unlikely(!kern_addr_valid(addr + size - 1)))
                goto out;
        page = virt_to_page(ptr);
        if (unlikely(!PageSlab(page)))
@@ -4320,10 +4285,11 @@ static int s_show(struct seq_file *m, void *p)
                unsigned long node_frees = cachep->node_frees;
                unsigned long overflows = cachep->node_overflow;
-                seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
+                seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu "
-                                %4lu %4lu %4lu %4lu %4lu", allocs, high, grown,
+                           "%4lu %4lu %4lu %4lu %4lu",
-                                reaped, errors, max_freeable, node_allocs,
+                           allocs, high, grown,
-                                node_frees, overflows);
+                           reaped, errors, max_freeable, node_allocs,
+                           node_frees, overflows);
        }
        /* cpu stats */
        {
diff --git a/mm/slob.c b/mm/slob.c
index 837ebd64cc3..23631e2bb57 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -467,14 +467,6 @@ out:
 * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend.
 */
-#ifndef ARCH_KMALLOC_MINALIGN
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long)
-#endif
-#ifndef ARCH_SLAB_MINALIGN
-#define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
-#endif
 void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 {
        unsigned int *m;
diff --git a/mm/slub.c b/mm/slub.c
index b364844a106..e46e3129697 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -157,14 +157,6 @@
 #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
                SLAB_CACHE_DMA | SLAB_NOTRACK)
-#ifndef ARCH_KMALLOC_MINALIGN
-#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
-#endif
-#ifndef ARCH_SLAB_MINALIGN
-#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
-#endif
 #define OO_SHIFT        16
 #define OO_MASK         ((1 << OO_SHIFT) - 1)
 #define MAX_OBJS_PER_PAGE       65535 /* since page.objects is u16 */
@@ -1084,7 +1076,7 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node,
        if (node == -1)
                return alloc_pages(flags, order);
        else
-                return alloc_pages_node(node, flags, order);
+                return alloc_pages_exact_node(node, flags, order);
 }
 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -2153,7 +2145,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
        int local_node;
        if (slab_state >= UP && (s < kmalloc_caches ||
-                        s > kmalloc_caches + KMALLOC_CACHES))
+                        s >= kmalloc_caches + KMALLOC_CACHES))
                local_node = page_to_nid(virt_to_page(s));
        else
                local_node = 0;
@@ -2386,6 +2378,9 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object)
 {
        struct page *page;
+        if (!kern_ptr_validate(object, s->size))
+                return 0;
        page = get_object_page(object);
        if (!page || s != page->slab)
@@ -2426,9 +2421,11 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
 #ifdef CONFIG_SLUB_DEBUG
        void *addr = page_address(page);
        void *p;
-        DECLARE_BITMAP(map, page->objects);
+        long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long),
+                            GFP_ATOMIC);
-        bitmap_zero(map, page->objects);
+        if (!map)
+                return;
        slab_err(s, page, "%s", text);
        slab_lock(page);
        for_each_free_object(p, s, page->freelist)
@@ -2443,6 +2440,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
                }
        }
        slab_unlock(page);
+        kfree(map);
 #endif
 }
@@ -3335,8 +3333,15 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
        struct kmem_cache *s;
        void *ret;
-        if (unlikely(size > SLUB_MAX_SIZE))
+        if (unlikely(size > SLUB_MAX_SIZE)) {
-                return kmalloc_large_node(size, gfpflags, node);
+                ret = kmalloc_large_node(size, gfpflags, node);
+                trace_kmalloc_node(caller, ret,
+                                   size, PAGE_SIZE << get_order(size),
+                                   gfpflags, node);
+                return ret;
+        }
        s = get_slab(size, gfpflags);
@@ -3648,10 +3653,10 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
 }
 static void process_slab(struct loc_track *t, struct kmem_cache *s,
-                struct page *page, enum track_item alloc)
+                struct page *page, enum track_item alloc,
+                long *map)
 {
        void *addr = page_address(page);
-        DECLARE_BITMAP(map, page->objects);
        void *p;
        bitmap_zero(map, page->objects);
@@ -3670,11 +3675,14 @@ static int list_locations(struct kmem_cache *s, char *buf,
        unsigned long i;
        struct loc_track t = { 0, 0, NULL };
        int node;
+        unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
+                                     sizeof(unsigned long), GFP_KERNEL);
-        if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
+        if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
-                        GFP_TEMPORARY))
+                                     GFP_TEMPORARY)) {
+                kfree(map);
                return sprintf(buf, "Out of memory\n");
+        }
        /* Push back cpu slabs */
        flush_all(s);
@@ -3688,9 +3696,9 @@ static int list_locations(struct kmem_cache *s, char *buf,
                spin_lock_irqsave(&n->list_lock, flags);
                list_for_each_entry(page, &n->partial, lru)
-                        process_slab(&t, s, page, alloc);
+                        process_slab(&t, s, page, alloc, map);
                list_for_each_entry(page, &n->full, lru)
-                        process_slab(&t, s, page, alloc);
+                        process_slab(&t, s, page, alloc, map);
                spin_unlock_irqrestore(&n->list_lock, flags);
        }
@@ -3741,6 +3749,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
        }
        free_loc_track(&t);
+        kfree(map);
        if (!t.count)
                len += sprintf(buf, "No data\n");
        return len;
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 392b9bb5bc0..aa33fd67fa4 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -22,6 +22,7 @@
 #include <linux/bootmem.h>
 #include <linux/highmem.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
diff --git a/mm/sparse.c b/mm/sparse.c
index 22896d58913..dc0cc4d43ff 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -2,6 +2,7 @@
 * sparse memory mappings.
 */
 #include <linux/mm.h>
+#include <linux/slab.h>
 #include <linux/mmzone.h>
 #include <linux/bootmem.h>
 #include <linux/highmem.h>
diff --git a/mm/swap.c b/mm/swap.c
index 9036b89813a..7cd60bf0a97 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -30,6 +30,7 @@
 #include <linux/notifier.h>
 #include <linux/backing-dev.h>
 #include <linux/memcontrol.h>
+#include <linux/gfp.h>
 #include "internal.h"
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 6d1daeb1cb4..e10f5833167 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -8,6 +8,7 @@
 */
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <linux/gfp.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6cd0a8f90dc..03aa2d55f1a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -139,7 +139,8 @@ static int discard_swap(struct swap_info_struct *si)
        nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
        if (nr_blocks) {
                err = blkdev_issue_discard(si->bdev, start_block,
-                                nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+                                nr_blocks, GFP_KERNEL,
+                                BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
                if (err)
                        return err;
                cond_resched();
@@ -150,7 +151,8 @@ static int discard_swap(struct swap_info_struct *si)
                nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
                err = blkdev_issue_discard(si->bdev, start_block,
-                                nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+                                nr_blocks, GFP_KERNEL,
+                                BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
                if (err)
                        break;
@@ -189,7 +191,8 @@ static void discard_swap_cluster(struct swap_info_struct *si,
                        start_block <<= PAGE_SHIFT - 9;
                        nr_blocks <<= PAGE_SHIFT - 9;
                        if (blkdev_issue_discard(si->bdev, start_block,
-                                    nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER))
+                                    nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT |
+                                                        BLKDEV_IFL_BARRIER))
                                break;
                }
@@ -574,6 +577,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
        /* free if no reference */
        if (!usage) {
+                struct gendisk *disk = p->bdev->bd_disk;
                if (offset < p->lowest_bit)
                        p->lowest_bit = offset;
                if (offset > p->highest_bit)
@@ -583,6 +587,9 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
                        swap_list.next = p->type;
                nr_swap_pages++;
                p->inuse_pages--;
+                if ((p->flags & SWP_BLKDEV) &&
+                                disk->fops->swap_slot_free_notify)
+                        disk->fops->swap_slot_free_notify(p->bdev, offset);
        }
        return usage;
@@ -1884,6 +1891,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
                if (error < 0)
                        goto bad_swap;
                p->bdev = bdev;
+                p->flags |= SWP_BLKDEV;
        } else if (S_ISREG(inode->i_mode)) {
                p->bdev = inode->i_sb->s_bdev;
                mutex_lock(&inode->i_mutex);
diff --git a/mm/truncate.c b/mm/truncate.c
index e87e3724482..f42675a3615 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/backing-dev.h>
+#include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/module.h>
diff --git a/mm/util.c b/mm/util.c
index 834db7be240..f5712e8964b 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -186,6 +186,27 @@ void kzfree(const void *p)
 }
 EXPORT_SYMBOL(kzfree);
+int kern_ptr_validate(const void *ptr, unsigned long size)
+{
+        unsigned long addr = (unsigned long)ptr;
+        unsigned long min_addr = PAGE_OFFSET;
+        unsigned long align_mask = sizeof(void *) - 1;
+        if (unlikely(addr < min_addr))
+                goto out;
+        if (unlikely(addr > (unsigned long)high_memory - size))
+                goto out;
+        if (unlikely(addr & align_mask))
+                goto out;
+        if (unlikely(!kern_addr_valid(addr)))
+                goto out;
+        if (unlikely(!kern_addr_valid(addr + size - 1)))
+                goto out;
+        return 1;
+out:
+        return 0;
+}
 /*
 * strndup_user - duplicate an existing string from user space
 * @s: The string to duplicate
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 79c809895fb..3ff3311447f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -13,7 +13,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
-#include <linux/slab.h>
+#include <linux/gfp.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
 #include <linux/pagemap.h>
@@ -1535,13 +1535,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
        unsigned long ap, fp;
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
-        /* If we have no swap space, do not bother scanning anon pages. */
-        if (!sc->may_swap || (nr_swap_pages <= 0)) {
-                percent[0] = 0;
-                percent[1] = 100;
-                return;
-        }
        anon  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
                zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
        file  = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
@@ -1639,20 +1632,22 @@ static void shrink_zone(int priority, struct zone *zone,
        unsigned long nr_reclaimed = sc->nr_reclaimed;
        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+        int noswap = 0;
-        get_scan_ratio(zone, sc, percent);
+        /* If we have no swap space, do not bother scanning anon pages. */
+        if (!sc->may_swap || (nr_swap_pages <= 0)) {
+                noswap = 1;
+                percent[0] = 0;
+                percent[1] = 100;
+        } else
+                get_scan_ratio(zone, sc, percent);
        for_each_evictable_lru(l) {
                int file = is_file_lru(l);
                unsigned long scan;
-                if (percent[file] == 0) {
-                        nr[l] = 0;
-                        continue;
-                }
                scan = zone_nr_lru_pages(zone, sc, l);
-                if (priority) {
+                if (priority || noswap) {
                        scan >>= priority;
                        scan = (scan * percent[file]) / 100;
                }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7f760cbc73f..fa12ea3051f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -12,6 +12,7 @@
 #include <linux/mm.h>
 #include <linux/err.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/vmstat.h>
 #include <linux/sched.h>