Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c     |  49
-rw-r--r-- | mm/hugetlb.c         |   7
-rw-r--r-- | mm/ksm.c             |  12
-rw-r--r-- | mm/memcontrol.c      |   8
-rw-r--r-- | mm/mlock.c           |  41
-rw-r--r-- | mm/mmap.c            |   3
-rw-r--r-- | mm/msync.c           |   2
-rw-r--r-- | mm/page-writeback.c  |  44
-rw-r--r-- | mm/page_alloc.c      |   2
-rw-r--r-- | mm/percpu-km.c       | 104
-rw-r--r-- | mm/percpu-vm.c       | 451
-rw-r--r-- | mm/percpu.c          | 585
-rw-r--r-- | mm/rmap.c            |  45
-rw-r--r-- | mm/shmem.c           |  29
-rw-r--r-- | mm/slab.c            | 190
-rw-r--r-- | mm/slob.c            |   8
-rw-r--r-- | mm/slub.c            |  48
-rw-r--r-- | mm/swapfile.c        |  14
18 files changed, 956 insertions, 686 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f13e067e146..660a87a2251 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -11,6 +11,8 @@ | |||
11 | #include <linux/writeback.h> | 11 | #include <linux/writeback.h> |
12 | #include <linux/device.h> | 12 | #include <linux/device.h> |
13 | 13 | ||
14 | static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); | ||
15 | |||
14 | void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | 16 | void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) |
15 | { | 17 | { |
16 | } | 18 | } |
@@ -25,6 +27,11 @@ struct backing_dev_info default_backing_dev_info = { | |||
25 | }; | 27 | }; |
26 | EXPORT_SYMBOL_GPL(default_backing_dev_info); | 28 | EXPORT_SYMBOL_GPL(default_backing_dev_info); |
27 | 29 | ||
30 | struct backing_dev_info noop_backing_dev_info = { | ||
31 | .name = "noop", | ||
32 | }; | ||
33 | EXPORT_SYMBOL_GPL(noop_backing_dev_info); | ||
34 | |||
28 | static struct class *bdi_class; | 35 | static struct class *bdi_class; |
29 | 36 | ||
30 | /* | 37 | /* |
@@ -41,7 +48,6 @@ static struct timer_list sync_supers_timer; | |||
41 | 48 | ||
42 | static int bdi_sync_supers(void *); | 49 | static int bdi_sync_supers(void *); |
43 | static void sync_supers_timer_fn(unsigned long); | 50 | static void sync_supers_timer_fn(unsigned long); |
44 | static void arm_supers_timer(void); | ||
45 | 51 | ||
46 | static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); | 52 | static void bdi_add_default_flusher_task(struct backing_dev_info *bdi); |
47 | 53 | ||
@@ -245,7 +251,7 @@ static int __init default_bdi_init(void) | |||
245 | 251 | ||
246 | init_timer(&sync_supers_timer); | 252 | init_timer(&sync_supers_timer); |
247 | setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); | 253 | setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); |
248 | arm_supers_timer(); | 254 | bdi_arm_supers_timer(); |
249 | 255 | ||
250 | err = bdi_init(&default_backing_dev_info); | 256 | err = bdi_init(&default_backing_dev_info); |
251 | if (!err) | 257 | if (!err) |
@@ -367,10 +373,13 @@ static int bdi_sync_supers(void *unused) | |||
367 | return 0; | 373 | return 0; |
368 | } | 374 | } |
369 | 375 | ||
370 | static void arm_supers_timer(void) | 376 | void bdi_arm_supers_timer(void) |
371 | { | 377 | { |
372 | unsigned long next; | 378 | unsigned long next; |
373 | 379 | ||
380 | if (!dirty_writeback_interval) | ||
381 | return; | ||
382 | |||
374 | next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; | 383 | next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; |
375 | mod_timer(&sync_supers_timer, round_jiffies_up(next)); | 384 | mod_timer(&sync_supers_timer, round_jiffies_up(next)); |
376 | } | 385 | } |
@@ -378,7 +387,7 @@ static void arm_supers_timer(void) | |||
378 | static void sync_supers_timer_fn(unsigned long unused) | 387 | static void sync_supers_timer_fn(unsigned long unused) |
379 | { | 388 | { |
380 | wake_up_process(sync_supers_tsk); | 389 | wake_up_process(sync_supers_tsk); |
381 | arm_supers_timer(); | 390 | bdi_arm_supers_timer(); |
382 | } | 391 | } |
383 | 392 | ||
384 | static int bdi_forker_task(void *ptr) | 393 | static int bdi_forker_task(void *ptr) |
@@ -421,7 +430,10 @@ static int bdi_forker_task(void *ptr) | |||
421 | 430 | ||
422 | spin_unlock_bh(&bdi_lock); | 431 | spin_unlock_bh(&bdi_lock); |
423 | wait = msecs_to_jiffies(dirty_writeback_interval * 10); | 432 | wait = msecs_to_jiffies(dirty_writeback_interval * 10); |
424 | schedule_timeout(wait); | 433 | if (wait) |
434 | schedule_timeout(wait); | ||
435 | else | ||
436 | schedule(); | ||
425 | try_to_freeze(); | 437 | try_to_freeze(); |
426 | continue; | 438 | continue; |
427 | } | 439 | } |
@@ -715,6 +727,33 @@ void bdi_destroy(struct backing_dev_info *bdi) | |||
715 | } | 727 | } |
716 | EXPORT_SYMBOL(bdi_destroy); | 728 | EXPORT_SYMBOL(bdi_destroy); |
717 | 729 | ||
730 | /* | ||
731 | * For use from filesystems to quickly init and register a bdi associated | ||
732 | * with dirty writeback | ||
733 | */ | ||
734 | int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, | ||
735 | unsigned int cap) | ||
736 | { | ||
737 | char tmp[32]; | ||
738 | int err; | ||
739 | |||
740 | bdi->name = name; | ||
741 | bdi->capabilities = cap; | ||
742 | err = bdi_init(bdi); | ||
743 | if (err) | ||
744 | return err; | ||
745 | |||
746 | sprintf(tmp, "%.28s%s", name, "-%d"); | ||
747 | err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq)); | ||
748 | if (err) { | ||
749 | bdi_destroy(bdi); | ||
750 | return err; | ||
751 | } | ||
752 | |||
753 | return 0; | ||
754 | } | ||
755 | EXPORT_SYMBOL(bdi_setup_and_register); | ||
756 | |||
718 | static wait_queue_head_t congestion_wqh[2] = { | 757 | static wait_queue_head_t congestion_wqh[2] = { |
719 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), | 758 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), |
720 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) | 759 | __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) |
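The bdi_setup_and_register() helper added above is meant to be called from a filesystem's mount path, so the filesystem gets a private, uniquely named bdi without open-coding bdi_init()/bdi_register(). Below is a minimal sketch of such a caller; the filesystem name, the per-superblock structure and the BDI_CAP_MAP_COPY capability choice are illustrative assumptions, not part of this patch.

	#include <linux/backing-dev.h>
	#include <linux/fs.h>
	#include <linux/slab.h>

	/* hypothetical per-superblock info embedding the private bdi */
	struct examplefs_sb_info {
		struct backing_dev_info bdi;
	};

	static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
	{
		struct examplefs_sb_info *sbi;
		int err;

		sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
		if (!sbi)
			return -ENOMEM;

		/* init + register an "examplefs-%d" bdi in one call */
		err = bdi_setup_and_register(&sbi->bdi, "examplefs", BDI_CAP_MAP_COPY);
		if (err) {
			kfree(sbi);
			return err;
		}

		sb->s_fs_info = sbi;
		sb->s_bdi = &sbi->bdi;	/* route writeback to the private bdi */
		return 0;
	}

The matching teardown would call bdi_destroy(&sbi->bdi) from the put_super/kill_sb path, mirroring the error handling done inside the helper itself.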
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6034dc9e979..4c9e6bbf377 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -546,6 +546,7 @@ static void free_huge_page(struct page *page) | |||
546 | 546 | ||
547 | mapping = (struct address_space *) page_private(page); | 547 | mapping = (struct address_space *) page_private(page); |
548 | set_page_private(page, 0); | 548 | set_page_private(page, 0); |
549 | page->mapping = NULL; | ||
549 | BUG_ON(page_count(page)); | 550 | BUG_ON(page_count(page)); |
550 | INIT_LIST_HEAD(&page->lru); | 551 | INIT_LIST_HEAD(&page->lru); |
551 | 552 | ||
@@ -1038,7 +1039,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
1038 | page = alloc_buddy_huge_page(h, vma, addr); | 1039 | page = alloc_buddy_huge_page(h, vma, addr); |
1039 | if (!page) { | 1040 | if (!page) { |
1040 | hugetlb_put_quota(inode->i_mapping, chg); | 1041 | hugetlb_put_quota(inode->i_mapping, chg); |
1041 | return ERR_PTR(-VM_FAULT_OOM); | 1042 | return ERR_PTR(-VM_FAULT_SIGBUS); |
1042 | } | 1043 | } |
1043 | } | 1044 | } |
1044 | 1045 | ||
@@ -2447,8 +2448,10 @@ retry: | |||
2447 | spin_lock(&inode->i_lock); | 2448 | spin_lock(&inode->i_lock); |
2448 | inode->i_blocks += blocks_per_huge_page(h); | 2449 | inode->i_blocks += blocks_per_huge_page(h); |
2449 | spin_unlock(&inode->i_lock); | 2450 | spin_unlock(&inode->i_lock); |
2450 | } else | 2451 | } else { |
2451 | lock_page(page); | 2452 | lock_page(page); |
2453 | page->mapping = HUGETLB_POISON; | ||
2454 | } | ||
2452 | } | 2455 | } |
2453 | 2456 | ||
2454 | /* | 2457 | /* |
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -365,7 +365,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) | |||
365 | do { | 365 | do { |
366 | cond_resched(); | 366 | cond_resched(); |
367 | page = follow_page(vma, addr, FOLL_GET); | 367 | page = follow_page(vma, addr, FOLL_GET); |
368 | if (!page) | 368 | if (IS_ERR_OR_NULL(page)) |
369 | break; | 369 | break; |
370 | if (PageKsm(page)) | 370 | if (PageKsm(page)) |
371 | ret = handle_mm_fault(vma->vm_mm, vma, addr, | 371 | ret = handle_mm_fault(vma->vm_mm, vma, addr, |
@@ -447,7 +447,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) | |||
447 | goto out; | 447 | goto out; |
448 | 448 | ||
449 | page = follow_page(vma, addr, FOLL_GET); | 449 | page = follow_page(vma, addr, FOLL_GET); |
450 | if (!page) | 450 | if (IS_ERR_OR_NULL(page)) |
451 | goto out; | 451 | goto out; |
452 | if (PageAnon(page)) { | 452 | if (PageAnon(page)) { |
453 | flush_anon_page(vma, page, addr); | 453 | flush_anon_page(vma, page, addr); |
@@ -1086,7 +1086,7 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, | |||
1086 | cond_resched(); | 1086 | cond_resched(); |
1087 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); | 1087 | tree_rmap_item = rb_entry(*new, struct rmap_item, node); |
1088 | tree_page = get_mergeable_page(tree_rmap_item); | 1088 | tree_page = get_mergeable_page(tree_rmap_item); |
1089 | if (!tree_page) | 1089 | if (IS_ERR_OR_NULL(tree_page)) |
1090 | return NULL; | 1090 | return NULL; |
1091 | 1091 | ||
1092 | /* | 1092 | /* |
@@ -1294,7 +1294,7 @@ next_mm: | |||
1294 | if (ksm_test_exit(mm)) | 1294 | if (ksm_test_exit(mm)) |
1295 | break; | 1295 | break; |
1296 | *page = follow_page(vma, ksm_scan.address, FOLL_GET); | 1296 | *page = follow_page(vma, ksm_scan.address, FOLL_GET); |
1297 | if (*page && PageAnon(*page)) { | 1297 | if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { |
1298 | flush_anon_page(vma, *page, ksm_scan.address); | 1298 | flush_anon_page(vma, *page, ksm_scan.address); |
1299 | flush_dcache_page(*page); | 1299 | flush_dcache_page(*page); |
1300 | rmap_item = get_next_rmap_item(slot, | 1300 | rmap_item = get_next_rmap_item(slot, |
@@ -1308,7 +1308,7 @@ next_mm: | |||
1308 | up_read(&mm->mmap_sem); | 1308 | up_read(&mm->mmap_sem); |
1309 | return rmap_item; | 1309 | return rmap_item; |
1310 | } | 1310 | } |
1311 | if (*page) | 1311 | if (!IS_ERR_OR_NULL(*page)) |
1312 | put_page(*page); | 1312 | put_page(*page); |
1313 | ksm_scan.address += PAGE_SIZE; | 1313 | ksm_scan.address += PAGE_SIZE; |
1314 | cond_resched(); | 1314 | cond_resched(); |
@@ -1367,7 +1367,7 @@ next_mm: | |||
1367 | static void ksm_do_scan(unsigned int scan_npages) | 1367 | static void ksm_do_scan(unsigned int scan_npages) |
1368 | { | 1368 | { |
1369 | struct rmap_item *rmap_item; | 1369 | struct rmap_item *rmap_item; |
1370 | struct page *page; | 1370 | struct page *uninitialized_var(page); |
1371 | 1371 | ||
1372 | while (scan_npages--) { | 1372 | while (scan_npages--) { |
1373 | cond_resched(); | 1373 | cond_resched(); |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f4ede99c8b9..c8569bc298f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1438,7 +1438,7 @@ static void drain_local_stock(struct work_struct *dummy) | |||
1438 | 1438 | ||
1439 | /* | 1439 | /* |
1440 | * Cache charges(val) which is from res_counter, to local per_cpu area. | 1440 | * Cache charges(val) which is from res_counter, to local per_cpu area. |
1441 | * This will be consumed by consumt_stock() function, later. | 1441 | * This will be consumed by consume_stock() function, later. |
1442 | */ | 1442 | */ |
1443 | static void refill_stock(struct mem_cgroup *mem, int val) | 1443 | static void refill_stock(struct mem_cgroup *mem, int val) |
1444 | { | 1444 | { |
@@ -1601,7 +1601,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1601 | * There is a small race that "from" or "to" can be | 1601 | * There is a small race that "from" or "to" can be |
1602 | * freed by rmdir, so we use css_tryget(). | 1602 | * freed by rmdir, so we use css_tryget(). |
1603 | */ | 1603 | */ |
1604 | rcu_read_lock(); | ||
1605 | from = mc.from; | 1604 | from = mc.from; |
1606 | to = mc.to; | 1605 | to = mc.to; |
1607 | if (from && css_tryget(&from->css)) { | 1606 | if (from && css_tryget(&from->css)) { |
@@ -1622,7 +1621,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1622 | do_continue = (to == mem_over_limit); | 1621 | do_continue = (to == mem_over_limit); |
1623 | css_put(&to->css); | 1622 | css_put(&to->css); |
1624 | } | 1623 | } |
1625 | rcu_read_unlock(); | ||
1626 | if (do_continue) { | 1624 | if (do_continue) { |
1627 | DEFINE_WAIT(wait); | 1625 | DEFINE_WAIT(wait); |
1628 | prepare_to_wait(&mc.waitq, &wait, | 1626 | prepare_to_wait(&mc.waitq, &wait, |
@@ -2429,11 +2427,11 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | |||
2429 | } | 2427 | } |
2430 | unlock_page_cgroup(pc); | 2428 | unlock_page_cgroup(pc); |
2431 | 2429 | ||
2430 | *ptr = mem; | ||
2432 | if (mem) { | 2431 | if (mem) { |
2433 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); | 2432 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); |
2434 | css_put(&mem->css); | 2433 | css_put(&mem->css); |
2435 | } | 2434 | } |
2436 | *ptr = mem; | ||
2437 | return ret; | 2435 | return ret; |
2438 | } | 2436 | } |
2439 | 2437 | ||
diff --git a/mm/mlock.c b/mm/mlock.c
index 8f4e2dfceec..3f82720e051 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -607,44 +607,3 @@ void user_shm_unlock(size_t size, struct user_struct *user) | |||
607 | spin_unlock(&shmlock_user_lock); | 607 | spin_unlock(&shmlock_user_lock); |
608 | free_uid(user); | 608 | free_uid(user); |
609 | } | 609 | } |
610 | |||
611 | int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, | ||
612 | size_t size) | ||
613 | { | ||
614 | unsigned long lim, vm, pgsz; | ||
615 | int error = -ENOMEM; | ||
616 | |||
617 | pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; | ||
618 | |||
619 | down_write(&mm->mmap_sem); | ||
620 | |||
621 | lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT; | ||
622 | vm = mm->total_vm + pgsz; | ||
623 | if (lim < vm) | ||
624 | goto out; | ||
625 | |||
626 | lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT; | ||
627 | vm = mm->locked_vm + pgsz; | ||
628 | if (lim < vm) | ||
629 | goto out; | ||
630 | |||
631 | mm->total_vm += pgsz; | ||
632 | mm->locked_vm += pgsz; | ||
633 | |||
634 | error = 0; | ||
635 | out: | ||
636 | up_write(&mm->mmap_sem); | ||
637 | return error; | ||
638 | } | ||
639 | |||
640 | void refund_locked_memory(struct mm_struct *mm, size_t size) | ||
641 | { | ||
642 | unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; | ||
643 | |||
644 | down_write(&mm->mmap_sem); | ||
645 | |||
646 | mm->total_vm -= pgsz; | ||
647 | mm->locked_vm -= pgsz; | ||
648 | |||
649 | up_write(&mm->mmap_sem); | ||
650 | } | ||
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1977,7 +1977,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1977 | return 0; | 1977 | return 0; |
1978 | 1978 | ||
1979 | /* Clean everything up if vma_adjust failed. */ | 1979 | /* Clean everything up if vma_adjust failed. */ |
1980 | new->vm_ops->close(new); | 1980 | if (new->vm_ops && new->vm_ops->close) |
1981 | new->vm_ops->close(new); | ||
1981 | if (new->vm_file) { | 1982 | if (new->vm_file) { |
1982 | if (vma->vm_flags & VM_EXECUTABLE) | 1983 | if (vma->vm_flags & VM_EXECUTABLE) |
1983 | removed_exe_file_vma(mm); | 1984 | removed_exe_file_vma(mm); |
diff --git a/mm/msync.c b/mm/msync.c
index 4083209b7f0..632df4527c0 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -82,7 +82,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) | |||
82 | (vma->vm_flags & VM_SHARED)) { | 82 | (vma->vm_flags & VM_SHARED)) { |
83 | get_file(file); | 83 | get_file(file); |
84 | up_read(&mm->mmap_sem); | 84 | up_read(&mm->mmap_sem); |
85 | error = vfs_fsync(file, file->f_path.dentry, 0); | 85 | error = vfs_fsync(file, 0); |
86 | fput(file); | 86 | fput(file); |
87 | if (error || start >= end) | 87 | if (error || start >= end) |
88 | goto out; | 88 | goto out; |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0b19943ecf8..b289310e2c8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -597,7 +597,7 @@ static void balance_dirty_pages(struct address_space *mapping, | |||
597 | (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) | 597 | (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) |
598 | + global_page_state(NR_UNSTABLE_NFS)) | 598 | + global_page_state(NR_UNSTABLE_NFS)) |
599 | > background_thresh))) | 599 | > background_thresh))) |
600 | bdi_start_writeback(bdi, NULL, 0); | 600 | bdi_start_writeback(bdi, NULL, 0, 0); |
601 | } | 601 | } |
602 | 602 | ||
603 | void set_page_dirty_balance(struct page *page, int page_mkwrite) | 603 | void set_page_dirty_balance(struct page *page, int page_mkwrite) |
@@ -683,10 +683,6 @@ void throttle_vm_writeout(gfp_t gfp_mask) | |||
683 | } | 683 | } |
684 | } | 684 | } |
685 | 685 | ||
686 | static void laptop_timer_fn(unsigned long unused); | ||
687 | |||
688 | static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); | ||
689 | |||
690 | /* | 686 | /* |
691 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs | 687 | * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs |
692 | */ | 688 | */ |
@@ -694,24 +690,24 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write, | |||
694 | void __user *buffer, size_t *length, loff_t *ppos) | 690 | void __user *buffer, size_t *length, loff_t *ppos) |
695 | { | 691 | { |
696 | proc_dointvec(table, write, buffer, length, ppos); | 692 | proc_dointvec(table, write, buffer, length, ppos); |
693 | bdi_arm_supers_timer(); | ||
697 | return 0; | 694 | return 0; |
698 | } | 695 | } |
699 | 696 | ||
700 | static void do_laptop_sync(struct work_struct *work) | 697 | #ifdef CONFIG_BLOCK |
698 | void laptop_mode_timer_fn(unsigned long data) | ||
701 | { | 699 | { |
702 | wakeup_flusher_threads(0); | 700 | struct request_queue *q = (struct request_queue *)data; |
703 | kfree(work); | 701 | int nr_pages = global_page_state(NR_FILE_DIRTY) + |
704 | } | 702 | global_page_state(NR_UNSTABLE_NFS); |
705 | 703 | ||
706 | static void laptop_timer_fn(unsigned long unused) | 704 | /* |
707 | { | 705 | * We want to write everything out, not just down to the dirty |
708 | struct work_struct *work; | 706 | * threshold |
707 | */ | ||
709 | 708 | ||
710 | work = kmalloc(sizeof(*work), GFP_ATOMIC); | 709 | if (bdi_has_dirty_io(&q->backing_dev_info)) |
711 | if (work) { | 710 | bdi_start_writeback(&q->backing_dev_info, NULL, nr_pages, 0); |
712 | INIT_WORK(work, do_laptop_sync); | ||
713 | schedule_work(work); | ||
714 | } | ||
715 | } | 711 | } |
716 | 712 | ||
717 | /* | 713 | /* |
@@ -719,9 +715,9 @@ static void laptop_timer_fn(unsigned long unused) | |||
719 | * of all dirty data a few seconds from now. If the flush is already scheduled | 715 | * of all dirty data a few seconds from now. If the flush is already scheduled |
720 | * then push it back - the user is still using the disk. | 716 | * then push it back - the user is still using the disk. |
721 | */ | 717 | */ |
722 | void laptop_io_completion(void) | 718 | void laptop_io_completion(struct backing_dev_info *info) |
723 | { | 719 | { |
724 | mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); | 720 | mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); |
725 | } | 721 | } |
726 | 722 | ||
727 | /* | 723 | /* |
@@ -731,8 +727,16 @@ void laptop_io_completion(void) | |||
731 | */ | 727 | */ |
732 | void laptop_sync_completion(void) | 728 | void laptop_sync_completion(void) |
733 | { | 729 | { |
734 | del_timer(&laptop_mode_wb_timer); | 730 | struct backing_dev_info *bdi; |
731 | |||
732 | rcu_read_lock(); | ||
733 | |||
734 | list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) | ||
735 | del_timer(&bdi->laptop_mode_wb_timer); | ||
736 | |||
737 | rcu_read_unlock(); | ||
735 | } | 738 | } |
739 | #endif | ||
736 | 740 | ||
737 | /* | 741 | /* |
738 | * If ratelimit_pages is too high then we can get into dirty-data overload | 742 | * If ratelimit_pages is too high then we can get into dirty-data overload |
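With laptop_mode_wb_timer moved into struct backing_dev_info and laptop_mode_timer_fn() taking a request_queue as its data argument, the timer now has to be armed by the owner of each queue rather than by one global timer. A rough sketch of the expected pairing on the block-layer side is shown below; the call sites (queue allocation and request completion) are assumptions based on the new interfaces in this hunk, not code contained in this patch.

	#include <linux/blkdev.h>
	#include <linux/writeback.h>

	/* at queue allocation time: point the per-bdi timer at this queue */
	static void example_init_laptop_timer(struct request_queue *q)
	{
		setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
			    laptop_mode_timer_fn, (unsigned long) q);
	}

	/* at I/O completion time: keep pushing the flush back while the
	 * disk is still in use, so laptop mode only writes out on idle */
	static void example_request_done(struct request *req)
	{
		if (laptop_mode)
			laptop_io_completion(&req->q->backing_dev_info);
	}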
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d03c946d556..a6326c71b66 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2579,7 +2579,7 @@ static int default_zonelist_order(void) | |||
2579 | struct zone *z; | 2579 | struct zone *z; |
2580 | int average_size; | 2580 | int average_size; |
2581 | /* | 2581 | /* |
2582 | * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem. | 2582 | * ZONE_DMA and ZONE_DMA32 can be very small area in the system. |
2583 | * If they are really small and used heavily, the system can fall | 2583 | * If they are really small and used heavily, the system can fall |
2584 | * into OOM very easily. | 2584 | * into OOM very easily. |
2585 | * This function detect ZONE_DMA/DMA32 size and confgigures zone order. | 2585 | * This function detect ZONE_DMA/DMA32 size and confgigures zone order. |
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
new file mode 100644
index 00000000000..df680855540
--- /dev/null
+++ b/mm/percpu-km.c
@@ -0,0 +1,104 @@ | |||
1 | /* | ||
2 | * mm/percpu-km.c - kernel memory based chunk allocation | ||
3 | * | ||
4 | * Copyright (C) 2010 SUSE Linux Products GmbH | ||
5 | * Copyright (C) 2010 Tejun Heo <tj@kernel.org> | ||
6 | * | ||
7 | * This file is released under the GPLv2. | ||
8 | * | ||
9 | * Chunks are allocated as a contiguous kernel memory using gfp | ||
10 | * allocation. This is to be used on nommu architectures. | ||
11 | * | ||
12 | * To use percpu-km, | ||
13 | * | ||
14 | * - define CONFIG_NEED_PER_CPU_KM from the arch Kconfig. | ||
15 | * | ||
16 | * - CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK must not be defined. It's | ||
17 | * not compatible with PER_CPU_KM. EMBED_FIRST_CHUNK should work | ||
18 | * fine. | ||
19 | * | ||
20 | * - NUMA is not supported. When setting up the first chunk, | ||
21 | * @cpu_distance_fn should be NULL or report all CPUs to be nearer | ||
22 | * than or at LOCAL_DISTANCE. | ||
23 | * | ||
24 | * - It's best if the chunk size is power of two multiple of | ||
25 | * PAGE_SIZE. Because each chunk is allocated as a contiguous | ||
26 | * kernel memory block using alloc_pages(), memory will be wasted if | ||
27 | * chunk size is not aligned. percpu-km code will whine about it. | ||
28 | */ | ||
29 | |||
30 | #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK | ||
31 | #error "contiguous percpu allocation is incompatible with paged first chunk" | ||
32 | #endif | ||
33 | |||
34 | #include <linux/log2.h> | ||
35 | |||
36 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) | ||
37 | { | ||
38 | /* noop */ | ||
39 | return 0; | ||
40 | } | ||
41 | |||
42 | static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) | ||
43 | { | ||
44 | /* nada */ | ||
45 | } | ||
46 | |||
47 | static struct pcpu_chunk *pcpu_create_chunk(void) | ||
48 | { | ||
49 | const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; | ||
50 | struct pcpu_chunk *chunk; | ||
51 | struct page *pages; | ||
52 | int i; | ||
53 | |||
54 | chunk = pcpu_alloc_chunk(); | ||
55 | if (!chunk) | ||
56 | return NULL; | ||
57 | |||
58 | pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages)); | ||
59 | if (!pages) { | ||
60 | pcpu_free_chunk(chunk); | ||
61 | return NULL; | ||
62 | } | ||
63 | |||
64 | for (i = 0; i < nr_pages; i++) | ||
65 | pcpu_set_page_chunk(nth_page(pages, i), chunk); | ||
66 | |||
67 | chunk->data = pages; | ||
68 | chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; | ||
69 | return chunk; | ||
70 | } | ||
71 | |||
72 | static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) | ||
73 | { | ||
74 | const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; | ||
75 | |||
76 | if (chunk && chunk->data) | ||
77 | __free_pages(chunk->data, order_base_2(nr_pages)); | ||
78 | pcpu_free_chunk(chunk); | ||
79 | } | ||
80 | |||
81 | static struct page *pcpu_addr_to_page(void *addr) | ||
82 | { | ||
83 | return virt_to_page(addr); | ||
84 | } | ||
85 | |||
86 | static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) | ||
87 | { | ||
88 | size_t nr_pages, alloc_pages; | ||
89 | |||
90 | /* all units must be in a single group */ | ||
91 | if (ai->nr_groups != 1) { | ||
92 | printk(KERN_CRIT "percpu: can't handle more than one groups\n"); | ||
93 | return -EINVAL; | ||
94 | } | ||
95 | |||
96 | nr_pages = (ai->groups[0].nr_units * ai->unit_size) >> PAGE_SHIFT; | ||
97 | alloc_pages = roundup_pow_of_two(nr_pages); | ||
98 | |||
99 | if (alloc_pages > nr_pages) | ||
100 | printk(KERN_WARNING "percpu: wasting %zu pages per chunk\n", | ||
101 | alloc_pages - nr_pages); | ||
102 | |||
103 | return 0; | ||
104 | } | ||
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
new file mode 100644
index 00000000000..7d9c1d0ebd3
--- /dev/null
+++ b/mm/percpu-vm.c
@@ -0,0 +1,451 @@ | |||
1 | /* | ||
2 | * mm/percpu-vm.c - vmalloc area based chunk allocation | ||
3 | * | ||
4 | * Copyright (C) 2010 SUSE Linux Products GmbH | ||
5 | * Copyright (C) 2010 Tejun Heo <tj@kernel.org> | ||
6 | * | ||
7 | * This file is released under the GPLv2. | ||
8 | * | ||
9 | * Chunks are mapped into vmalloc areas and populated page by page. | ||
10 | * This is the default chunk allocator. | ||
11 | */ | ||
12 | |||
13 | static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, | ||
14 | unsigned int cpu, int page_idx) | ||
15 | { | ||
16 | /* must not be used on pre-mapped chunk */ | ||
17 | WARN_ON(chunk->immutable); | ||
18 | |||
19 | return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); | ||
20 | } | ||
21 | |||
22 | /** | ||
23 | * pcpu_get_pages_and_bitmap - get temp pages array and bitmap | ||
24 | * @chunk: chunk of interest | ||
25 | * @bitmapp: output parameter for bitmap | ||
26 | * @may_alloc: may allocate the array | ||
27 | * | ||
28 | * Returns pointer to array of pointers to struct page and bitmap, | ||
29 | * both of which can be indexed with pcpu_page_idx(). The returned | ||
30 | * array is cleared to zero and *@bitmapp is copied from | ||
31 | * @chunk->populated. Note that there is only one array and bitmap | ||
32 | * and access exclusion is the caller's responsibility. | ||
33 | * | ||
34 | * CONTEXT: | ||
35 | * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. | ||
36 | * Otherwise, don't care. | ||
37 | * | ||
38 | * RETURNS: | ||
39 | * Pointer to temp pages array on success, NULL on failure. | ||
40 | */ | ||
41 | static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, | ||
42 | unsigned long **bitmapp, | ||
43 | bool may_alloc) | ||
44 | { | ||
45 | static struct page **pages; | ||
46 | static unsigned long *bitmap; | ||
47 | size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); | ||
48 | size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * | ||
49 | sizeof(unsigned long); | ||
50 | |||
51 | if (!pages || !bitmap) { | ||
52 | if (may_alloc && !pages) | ||
53 | pages = pcpu_mem_alloc(pages_size); | ||
54 | if (may_alloc && !bitmap) | ||
55 | bitmap = pcpu_mem_alloc(bitmap_size); | ||
56 | if (!pages || !bitmap) | ||
57 | return NULL; | ||
58 | } | ||
59 | |||
60 | memset(pages, 0, pages_size); | ||
61 | bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); | ||
62 | |||
63 | *bitmapp = bitmap; | ||
64 | return pages; | ||
65 | } | ||
66 | |||
67 | /** | ||
68 | * pcpu_free_pages - free pages which were allocated for @chunk | ||
69 | * @chunk: chunk pages were allocated for | ||
70 | * @pages: array of pages to be freed, indexed by pcpu_page_idx() | ||
71 | * @populated: populated bitmap | ||
72 | * @page_start: page index of the first page to be freed | ||
73 | * @page_end: page index of the last page to be freed + 1 | ||
74 | * | ||
75 | * Free pages [@page_start and @page_end) in @pages for all units. | ||
76 | * The pages were allocated for @chunk. | ||
77 | */ | ||
78 | static void pcpu_free_pages(struct pcpu_chunk *chunk, | ||
79 | struct page **pages, unsigned long *populated, | ||
80 | int page_start, int page_end) | ||
81 | { | ||
82 | unsigned int cpu; | ||
83 | int i; | ||
84 | |||
85 | for_each_possible_cpu(cpu) { | ||
86 | for (i = page_start; i < page_end; i++) { | ||
87 | struct page *page = pages[pcpu_page_idx(cpu, i)]; | ||
88 | |||
89 | if (page) | ||
90 | __free_page(page); | ||
91 | } | ||
92 | } | ||
93 | } | ||
94 | |||
95 | /** | ||
96 | * pcpu_alloc_pages - allocates pages for @chunk | ||
97 | * @chunk: target chunk | ||
98 | * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() | ||
99 | * @populated: populated bitmap | ||
100 | * @page_start: page index of the first page to be allocated | ||
101 | * @page_end: page index of the last page to be allocated + 1 | ||
102 | * | ||
103 | * Allocate pages [@page_start,@page_end) into @pages for all units. | ||
104 | * The allocation is for @chunk. Percpu core doesn't care about the | ||
105 | * content of @pages and will pass it verbatim to pcpu_map_pages(). | ||
106 | */ | ||
107 | static int pcpu_alloc_pages(struct pcpu_chunk *chunk, | ||
108 | struct page **pages, unsigned long *populated, | ||
109 | int page_start, int page_end) | ||
110 | { | ||
111 | const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; | ||
112 | unsigned int cpu; | ||
113 | int i; | ||
114 | |||
115 | for_each_possible_cpu(cpu) { | ||
116 | for (i = page_start; i < page_end; i++) { | ||
117 | struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; | ||
118 | |||
119 | *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); | ||
120 | if (!*pagep) { | ||
121 | pcpu_free_pages(chunk, pages, populated, | ||
122 | page_start, page_end); | ||
123 | return -ENOMEM; | ||
124 | } | ||
125 | } | ||
126 | } | ||
127 | return 0; | ||
128 | } | ||
129 | |||
130 | /** | ||
131 | * pcpu_pre_unmap_flush - flush cache prior to unmapping | ||
132 | * @chunk: chunk the regions to be flushed belongs to | ||
133 | * @page_start: page index of the first page to be flushed | ||
134 | * @page_end: page index of the last page to be flushed + 1 | ||
135 | * | ||
136 | * Pages in [@page_start,@page_end) of @chunk are about to be | ||
137 | * unmapped. Flush cache. As each flushing trial can be very | ||
138 | * expensive, issue flush on the whole region at once rather than | ||
139 | * doing it for each cpu. This could be an overkill but is more | ||
140 | * scalable. | ||
141 | */ | ||
142 | static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, | ||
143 | int page_start, int page_end) | ||
144 | { | ||
145 | flush_cache_vunmap( | ||
146 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | ||
147 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); | ||
148 | } | ||
149 | |||
150 | static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) | ||
151 | { | ||
152 | unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); | ||
153 | } | ||
154 | |||
155 | /** | ||
156 | * pcpu_unmap_pages - unmap pages out of a pcpu_chunk | ||
157 | * @chunk: chunk of interest | ||
158 | * @pages: pages array which can be used to pass information to free | ||
159 | * @populated: populated bitmap | ||
160 | * @page_start: page index of the first page to unmap | ||
161 | * @page_end: page index of the last page to unmap + 1 | ||
162 | * | ||
163 | * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. | ||
164 | * Corresponding elements in @pages were cleared by the caller and can | ||
165 | * be used to carry information to pcpu_free_pages() which will be | ||
166 | * called after all unmaps are finished. The caller should call | ||
167 | * proper pre/post flush functions. | ||
168 | */ | ||
169 | static void pcpu_unmap_pages(struct pcpu_chunk *chunk, | ||
170 | struct page **pages, unsigned long *populated, | ||
171 | int page_start, int page_end) | ||
172 | { | ||
173 | unsigned int cpu; | ||
174 | int i; | ||
175 | |||
176 | for_each_possible_cpu(cpu) { | ||
177 | for (i = page_start; i < page_end; i++) { | ||
178 | struct page *page; | ||
179 | |||
180 | page = pcpu_chunk_page(chunk, cpu, i); | ||
181 | WARN_ON(!page); | ||
182 | pages[pcpu_page_idx(cpu, i)] = page; | ||
183 | } | ||
184 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), | ||
185 | page_end - page_start); | ||
186 | } | ||
187 | |||
188 | for (i = page_start; i < page_end; i++) | ||
189 | __clear_bit(i, populated); | ||
190 | } | ||
191 | |||
192 | /** | ||
193 | * pcpu_post_unmap_tlb_flush - flush TLB after unmapping | ||
194 | * @chunk: pcpu_chunk the regions to be flushed belong to | ||
195 | * @page_start: page index of the first page to be flushed | ||
196 | * @page_end: page index of the last page to be flushed + 1 | ||
197 | * | ||
198 | * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush | ||
199 | * TLB for the regions. This can be skipped if the area is to be | ||
200 | * returned to vmalloc as vmalloc will handle TLB flushing lazily. | ||
201 | * | ||
202 | * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once | ||
203 | * for the whole region. | ||
204 | */ | ||
205 | static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, | ||
206 | int page_start, int page_end) | ||
207 | { | ||
208 | flush_tlb_kernel_range( | ||
209 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | ||
210 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); | ||
211 | } | ||
212 | |||
213 | static int __pcpu_map_pages(unsigned long addr, struct page **pages, | ||
214 | int nr_pages) | ||
215 | { | ||
216 | return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, | ||
217 | PAGE_KERNEL, pages); | ||
218 | } | ||
219 | |||
220 | /** | ||
221 | * pcpu_map_pages - map pages into a pcpu_chunk | ||
222 | * @chunk: chunk of interest | ||
223 | * @pages: pages array containing pages to be mapped | ||
224 | * @populated: populated bitmap | ||
225 | * @page_start: page index of the first page to map | ||
226 | * @page_end: page index of the last page to map + 1 | ||
227 | * | ||
228 | * For each cpu, map pages [@page_start,@page_end) into @chunk. The | ||
229 | * caller is responsible for calling pcpu_post_map_flush() after all | ||
230 | * mappings are complete. | ||
231 | * | ||
232 | * This function is responsible for setting corresponding bits in | ||
233 | * @chunk->populated bitmap and whatever is necessary for reverse | ||
234 | * lookup (addr -> chunk). | ||
235 | */ | ||
236 | static int pcpu_map_pages(struct pcpu_chunk *chunk, | ||
237 | struct page **pages, unsigned long *populated, | ||
238 | int page_start, int page_end) | ||
239 | { | ||
240 | unsigned int cpu, tcpu; | ||
241 | int i, err; | ||
242 | |||
243 | for_each_possible_cpu(cpu) { | ||
244 | err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), | ||
245 | &pages[pcpu_page_idx(cpu, page_start)], | ||
246 | page_end - page_start); | ||
247 | if (err < 0) | ||
248 | goto err; | ||
249 | } | ||
250 | |||
251 | /* mapping successful, link chunk and mark populated */ | ||
252 | for (i = page_start; i < page_end; i++) { | ||
253 | for_each_possible_cpu(cpu) | ||
254 | pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], | ||
255 | chunk); | ||
256 | __set_bit(i, populated); | ||
257 | } | ||
258 | |||
259 | return 0; | ||
260 | |||
261 | err: | ||
262 | for_each_possible_cpu(tcpu) { | ||
263 | if (tcpu == cpu) | ||
264 | break; | ||
265 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), | ||
266 | page_end - page_start); | ||
267 | } | ||
268 | return err; | ||
269 | } | ||
270 | |||
271 | /** | ||
272 | * pcpu_post_map_flush - flush cache after mapping | ||
273 | * @chunk: pcpu_chunk the regions to be flushed belong to | ||
274 | * @page_start: page index of the first page to be flushed | ||
275 | * @page_end: page index of the last page to be flushed + 1 | ||
276 | * | ||
277 | * Pages [@page_start,@page_end) of @chunk have been mapped. Flush | ||
278 | * cache. | ||
279 | * | ||
280 | * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once | ||
281 | * for the whole region. | ||
282 | */ | ||
283 | static void pcpu_post_map_flush(struct pcpu_chunk *chunk, | ||
284 | int page_start, int page_end) | ||
285 | { | ||
286 | flush_cache_vmap( | ||
287 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | ||
288 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); | ||
289 | } | ||
290 | |||
291 | /** | ||
292 | * pcpu_populate_chunk - populate and map an area of a pcpu_chunk | ||
293 | * @chunk: chunk of interest | ||
294 | * @off: offset to the area to populate | ||
295 | * @size: size of the area to populate in bytes | ||
296 | * | ||
297 | * For each cpu, populate and map pages [@page_start,@page_end) into | ||
298 | * @chunk. The area is cleared on return. | ||
299 | * | ||
300 | * CONTEXT: | ||
301 | * pcpu_alloc_mutex, does GFP_KERNEL allocation. | ||
302 | */ | ||
303 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) | ||
304 | { | ||
305 | int page_start = PFN_DOWN(off); | ||
306 | int page_end = PFN_UP(off + size); | ||
307 | int free_end = page_start, unmap_end = page_start; | ||
308 | struct page **pages; | ||
309 | unsigned long *populated; | ||
310 | unsigned int cpu; | ||
311 | int rs, re, rc; | ||
312 | |||
313 | /* quick path, check whether all pages are already there */ | ||
314 | rs = page_start; | ||
315 | pcpu_next_pop(chunk, &rs, &re, page_end); | ||
316 | if (rs == page_start && re == page_end) | ||
317 | goto clear; | ||
318 | |||
319 | /* need to allocate and map pages, this chunk can't be immutable */ | ||
320 | WARN_ON(chunk->immutable); | ||
321 | |||
322 | pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); | ||
323 | if (!pages) | ||
324 | return -ENOMEM; | ||
325 | |||
326 | /* alloc and map */ | ||
327 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { | ||
328 | rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); | ||
329 | if (rc) | ||
330 | goto err_free; | ||
331 | free_end = re; | ||
332 | } | ||
333 | |||
334 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { | ||
335 | rc = pcpu_map_pages(chunk, pages, populated, rs, re); | ||
336 | if (rc) | ||
337 | goto err_unmap; | ||
338 | unmap_end = re; | ||
339 | } | ||
340 | pcpu_post_map_flush(chunk, page_start, page_end); | ||
341 | |||
342 | /* commit new bitmap */ | ||
343 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); | ||
344 | clear: | ||
345 | for_each_possible_cpu(cpu) | ||
346 | memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); | ||
347 | return 0; | ||
348 | |||
349 | err_unmap: | ||
350 | pcpu_pre_unmap_flush(chunk, page_start, unmap_end); | ||
351 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) | ||
352 | pcpu_unmap_pages(chunk, pages, populated, rs, re); | ||
353 | pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); | ||
354 | err_free: | ||
355 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) | ||
356 | pcpu_free_pages(chunk, pages, populated, rs, re); | ||
357 | return rc; | ||
358 | } | ||
359 | |||
360 | /** | ||
361 | * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk | ||
362 | * @chunk: chunk to depopulate | ||
363 | * @off: offset to the area to depopulate | ||
364 | * @size: size of the area to depopulate in bytes | ||
365 | * @flush: whether to flush cache and tlb or not | ||
366 | * | ||
367 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) | ||
368 | * from @chunk. If @flush is true, vcache is flushed before unmapping | ||
369 | * and tlb after. | ||
370 | * | ||
371 | * CONTEXT: | ||
372 | * pcpu_alloc_mutex. | ||
373 | */ | ||
374 | static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) | ||
375 | { | ||
376 | int page_start = PFN_DOWN(off); | ||
377 | int page_end = PFN_UP(off + size); | ||
378 | struct page **pages; | ||
379 | unsigned long *populated; | ||
380 | int rs, re; | ||
381 | |||
382 | /* quick path, check whether it's empty already */ | ||
383 | rs = page_start; | ||
384 | pcpu_next_unpop(chunk, &rs, &re, page_end); | ||
385 | if (rs == page_start && re == page_end) | ||
386 | return; | ||
387 | |||
388 | /* immutable chunks can't be depopulated */ | ||
389 | WARN_ON(chunk->immutable); | ||
390 | |||
391 | /* | ||
392 | * If control reaches here, there must have been at least one | ||
393 | * successful population attempt so the temp pages array must | ||
394 | * be available now. | ||
395 | */ | ||
396 | pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); | ||
397 | BUG_ON(!pages); | ||
398 | |||
399 | /* unmap and free */ | ||
400 | pcpu_pre_unmap_flush(chunk, page_start, page_end); | ||
401 | |||
402 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) | ||
403 | pcpu_unmap_pages(chunk, pages, populated, rs, re); | ||
404 | |||
405 | /* no need to flush tlb, vmalloc will handle it lazily */ | ||
406 | |||
407 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) | ||
408 | pcpu_free_pages(chunk, pages, populated, rs, re); | ||
409 | |||
410 | /* commit new bitmap */ | ||
411 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); | ||
412 | } | ||
413 | |||
414 | static struct pcpu_chunk *pcpu_create_chunk(void) | ||
415 | { | ||
416 | struct pcpu_chunk *chunk; | ||
417 | struct vm_struct **vms; | ||
418 | |||
419 | chunk = pcpu_alloc_chunk(); | ||
420 | if (!chunk) | ||
421 | return NULL; | ||
422 | |||
423 | vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, | ||
424 | pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); | ||
425 | if (!vms) { | ||
426 | pcpu_free_chunk(chunk); | ||
427 | return NULL; | ||
428 | } | ||
429 | |||
430 | chunk->data = vms; | ||
431 | chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; | ||
432 | return chunk; | ||
433 | } | ||
434 | |||
435 | static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) | ||
436 | { | ||
437 | if (chunk && chunk->data) | ||
438 | pcpu_free_vm_areas(chunk->data, pcpu_nr_groups); | ||
439 | pcpu_free_chunk(chunk); | ||
440 | } | ||
441 | |||
442 | static struct page *pcpu_addr_to_page(void *addr) | ||
443 | { | ||
444 | return vmalloc_to_page(addr); | ||
445 | } | ||
446 | |||
447 | static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) | ||
448 | { | ||
449 | /* no extra restriction */ | ||
450 | return 0; | ||
451 | } | ||
diff --git a/mm/percpu.c b/mm/percpu.c
index 6e09741ddc6..39f7dfd5958 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * linux/mm/percpu.c - percpu memory allocator | 2 | * mm/percpu.c - percpu memory allocator |
3 | * | 3 | * |
4 | * Copyright (C) 2009 SUSE Linux Products GmbH | 4 | * Copyright (C) 2009 SUSE Linux Products GmbH |
5 | * Copyright (C) 2009 Tejun Heo <tj@kernel.org> | 5 | * Copyright (C) 2009 Tejun Heo <tj@kernel.org> |
@@ -7,14 +7,13 @@ | |||
7 | * This file is released under the GPLv2. | 7 | * This file is released under the GPLv2. |
8 | * | 8 | * |
9 | * This is percpu allocator which can handle both static and dynamic | 9 | * This is percpu allocator which can handle both static and dynamic |
10 | * areas. Percpu areas are allocated in chunks in vmalloc area. Each | 10 | * areas. Percpu areas are allocated in chunks. Each chunk is |
11 | * chunk is consisted of boot-time determined number of units and the | 11 | * consisted of boot-time determined number of units and the first |
12 | * first chunk is used for static percpu variables in the kernel image | 12 | * chunk is used for static percpu variables in the kernel image |
13 | * (special boot time alloc/init handling necessary as these areas | 13 | * (special boot time alloc/init handling necessary as these areas |
14 | * need to be brought up before allocation services are running). | 14 | * need to be brought up before allocation services are running). |
15 | * Unit grows as necessary and all units grow or shrink in unison. | 15 | * Unit grows as necessary and all units grow or shrink in unison. |
16 | * When a chunk is filled up, another chunk is allocated. ie. in | 16 | * When a chunk is filled up, another chunk is allocated. |
17 | * vmalloc area | ||
18 | * | 17 | * |
19 | * c0 c1 c2 | 18 | * c0 c1 c2 |
20 | * ------------------- ------------------- ------------ | 19 | * ------------------- ------------------- ------------ |
@@ -99,7 +98,7 @@ struct pcpu_chunk { | |||
99 | int map_used; /* # of map entries used */ | 98 | int map_used; /* # of map entries used */ |
100 | int map_alloc; /* # of map entries allocated */ | 99 | int map_alloc; /* # of map entries allocated */ |
101 | int *map; /* allocation map */ | 100 | int *map; /* allocation map */ |
102 | struct vm_struct **vms; /* mapped vmalloc regions */ | 101 | void *data; /* chunk data */ |
103 | bool immutable; /* no [de]population allowed */ | 102 | bool immutable; /* no [de]population allowed */ |
104 | unsigned long populated[]; /* populated bitmap */ | 103 | unsigned long populated[]; /* populated bitmap */ |
105 | }; | 104 | }; |
@@ -177,6 +176,21 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ | |||
177 | static void pcpu_reclaim(struct work_struct *work); | 176 | static void pcpu_reclaim(struct work_struct *work); |
178 | static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); | 177 | static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); |
179 | 178 | ||
179 | static bool pcpu_addr_in_first_chunk(void *addr) | ||
180 | { | ||
181 | void *first_start = pcpu_first_chunk->base_addr; | ||
182 | |||
183 | return addr >= first_start && addr < first_start + pcpu_unit_size; | ||
184 | } | ||
185 | |||
186 | static bool pcpu_addr_in_reserved_chunk(void *addr) | ||
187 | { | ||
188 | void *first_start = pcpu_first_chunk->base_addr; | ||
189 | |||
190 | return addr >= first_start && | ||
191 | addr < first_start + pcpu_reserved_chunk_limit; | ||
192 | } | ||
193 | |||
180 | static int __pcpu_size_to_slot(int size) | 194 | static int __pcpu_size_to_slot(int size) |
181 | { | 195 | { |
182 | int highbit = fls(size); /* size is in bytes */ | 196 | int highbit = fls(size); /* size is in bytes */ |
@@ -198,27 +212,6 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) | |||
198 | return pcpu_size_to_slot(chunk->free_size); | 212 | return pcpu_size_to_slot(chunk->free_size); |
199 | } | 213 | } |
200 | 214 | ||
201 | static int pcpu_page_idx(unsigned int cpu, int page_idx) | ||
202 | { | ||
203 | return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; | ||
204 | } | ||
205 | |||
206 | static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, | ||
207 | unsigned int cpu, int page_idx) | ||
208 | { | ||
209 | return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] + | ||
210 | (page_idx << PAGE_SHIFT); | ||
211 | } | ||
212 | |||
213 | static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, | ||
214 | unsigned int cpu, int page_idx) | ||
215 | { | ||
216 | /* must not be used on pre-mapped chunk */ | ||
217 | WARN_ON(chunk->immutable); | ||
218 | |||
219 | return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); | ||
220 | } | ||
221 | |||
222 | /* set the pointer to a chunk in a page struct */ | 215 | /* set the pointer to a chunk in a page struct */ |
223 | static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) | 216 | static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) |
224 | { | 217 | { |
@@ -231,13 +224,27 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) | |||
231 | return (struct pcpu_chunk *)page->index; | 224 | return (struct pcpu_chunk *)page->index; |
232 | } | 225 | } |
233 | 226 | ||
234 | static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end) | 227 | static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx) |
228 | { | ||
229 | return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; | ||
230 | } | ||
231 | |||
232 | static unsigned long __maybe_unused pcpu_chunk_addr(struct pcpu_chunk *chunk, | ||
233 | unsigned int cpu, int page_idx) | ||
234 | { | ||
235 | return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] + | ||
236 | (page_idx << PAGE_SHIFT); | ||
237 | } | ||
238 | |||
239 | static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk, | ||
240 | int *rs, int *re, int end) | ||
235 | { | 241 | { |
236 | *rs = find_next_zero_bit(chunk->populated, end, *rs); | 242 | *rs = find_next_zero_bit(chunk->populated, end, *rs); |
237 | *re = find_next_bit(chunk->populated, end, *rs + 1); | 243 | *re = find_next_bit(chunk->populated, end, *rs + 1); |
238 | } | 244 | } |
239 | 245 | ||
240 | static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end) | 246 | static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, |
247 | int *rs, int *re, int end) | ||
241 | { | 248 | { |
242 | *rs = find_next_bit(chunk->populated, end, *rs); | 249 | *rs = find_next_bit(chunk->populated, end, *rs); |
243 | *re = find_next_zero_bit(chunk->populated, end, *rs + 1); | 250 | *re = find_next_zero_bit(chunk->populated, end, *rs + 1); |
@@ -326,36 +333,6 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) | |||
326 | } | 333 | } |
327 | 334 | ||
328 | /** | 335 | /** |
329 | * pcpu_chunk_addr_search - determine chunk containing specified address | ||
330 | * @addr: address for which the chunk needs to be determined. | ||
331 | * | ||
332 | * RETURNS: | ||
333 | * The address of the found chunk. | ||
334 | */ | ||
335 | static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) | ||
336 | { | ||
337 | void *first_start = pcpu_first_chunk->base_addr; | ||
338 | |||
339 | /* is it in the first chunk? */ | ||
340 | if (addr >= first_start && addr < first_start + pcpu_unit_size) { | ||
341 | /* is it in the reserved area? */ | ||
342 | if (addr < first_start + pcpu_reserved_chunk_limit) | ||
343 | return pcpu_reserved_chunk; | ||
344 | return pcpu_first_chunk; | ||
345 | } | ||
346 | |||
347 | /* | ||
348 | * The address is relative to unit0 which might be unused and | ||
349 | * thus unmapped. Offset the address to the unit space of the | ||
350 | * current processor before looking it up in the vmalloc | ||
351 | * space. Note that any possible cpu id can be used here, so | ||
352 | * there's no need to worry about preemption or cpu hotplug. | ||
353 | */ | ||
354 | addr += pcpu_unit_offsets[raw_smp_processor_id()]; | ||
355 | return pcpu_get_page_chunk(vmalloc_to_page(addr)); | ||
356 | } | ||
357 | |||
358 | /** | ||
359 | * pcpu_need_to_extend - determine whether chunk area map needs to be extended | 336 | * pcpu_need_to_extend - determine whether chunk area map needs to be extended |
360 | * @chunk: chunk of interest | 337 | * @chunk: chunk of interest |
361 | * | 338 | * |
@@ -623,434 +600,92 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) | |||
623 | pcpu_chunk_relocate(chunk, oslot); | 600 | pcpu_chunk_relocate(chunk, oslot); |
624 | } | 601 | } |
625 | 602 | ||
626 | /** | 603 | static struct pcpu_chunk *pcpu_alloc_chunk(void) |
627 | * pcpu_get_pages_and_bitmap - get temp pages array and bitmap | ||
628 | * @chunk: chunk of interest | ||
629 | * @bitmapp: output parameter for bitmap | ||
630 | * @may_alloc: may allocate the array | ||
631 | * | ||
632 | * Returns pointer to array of pointers to struct page and bitmap, | ||
633 | * both of which can be indexed with pcpu_page_idx(). The returned | ||
634 | * array is cleared to zero and *@bitmapp is copied from | ||
635 | * @chunk->populated. Note that there is only one array and bitmap | ||
636 | * and access exclusion is the caller's responsibility. | ||
637 | * | ||
638 | * CONTEXT: | ||
639 | * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. | ||
640 | * Otherwise, don't care. | ||
641 | * | ||
642 | * RETURNS: | ||
643 | * Pointer to temp pages array on success, NULL on failure. | ||
644 | */ | ||
645 | static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, | ||
646 | unsigned long **bitmapp, | ||
647 | bool may_alloc) | ||
648 | { | ||
649 | static struct page **pages; | ||
650 | static unsigned long *bitmap; | ||
651 | size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); | ||
652 | size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * | ||
653 | sizeof(unsigned long); | ||
654 | |||
655 | if (!pages || !bitmap) { | ||
656 | if (may_alloc && !pages) | ||
657 | pages = pcpu_mem_alloc(pages_size); | ||
658 | if (may_alloc && !bitmap) | ||
659 | bitmap = pcpu_mem_alloc(bitmap_size); | ||
660 | if (!pages || !bitmap) | ||
661 | return NULL; | ||
662 | } | ||
663 | |||
664 | memset(pages, 0, pages_size); | ||
665 | bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); | ||
666 | |||
667 | *bitmapp = bitmap; | ||
668 | return pages; | ||
669 | } | ||
670 | |||
671 | /** | ||
672 | * pcpu_free_pages - free pages which were allocated for @chunk | ||
673 | * @chunk: chunk pages were allocated for | ||
674 | * @pages: array of pages to be freed, indexed by pcpu_page_idx() | ||
675 | * @populated: populated bitmap | ||
676 | * @page_start: page index of the first page to be freed | ||
677 | * @page_end: page index of the last page to be freed + 1 | ||
678 | * | ||
679 | * Free pages [@page_start and @page_end) in @pages for all units. | ||
680 | * The pages were allocated for @chunk. | ||
681 | */ | ||
682 | static void pcpu_free_pages(struct pcpu_chunk *chunk, | ||
683 | struct page **pages, unsigned long *populated, | ||
684 | int page_start, int page_end) | ||
685 | { | 604 | { |
686 | unsigned int cpu; | 605 | struct pcpu_chunk *chunk; |
687 | int i; | ||
688 | 606 | ||
689 | for_each_possible_cpu(cpu) { | 607 | chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL); |
690 | for (i = page_start; i < page_end; i++) { | 608 | if (!chunk) |
691 | struct page *page = pages[pcpu_page_idx(cpu, i)]; | 609 | return NULL; |
692 | 610 | ||
693 | if (page) | 611 | chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); |
694 | __free_page(page); | 612 | if (!chunk->map) { |
695 | } | 613 | kfree(chunk); |
614 | return NULL; | ||
696 | } | 615 | } |
697 | } | ||
698 | 616 | ||
699 | /** | 617 | chunk->map_alloc = PCPU_DFL_MAP_ALLOC; |
700 | * pcpu_alloc_pages - allocates pages for @chunk | 618 | chunk->map[chunk->map_used++] = pcpu_unit_size; |
701 | * @chunk: target chunk | ||
702 | * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() | ||
703 | * @populated: populated bitmap | ||
704 | * @page_start: page index of the first page to be allocated | ||
705 | * @page_end: page index of the last page to be allocated + 1 | ||
706 | * | ||
707 | * Allocate pages [@page_start,@page_end) into @pages for all units. | ||
708 | * The allocation is for @chunk. Percpu core doesn't care about the | ||
709 | * content of @pages and will pass it verbatim to pcpu_map_pages(). | ||
710 | */ | ||
711 | static int pcpu_alloc_pages(struct pcpu_chunk *chunk, | ||
712 | struct page **pages, unsigned long *populated, | ||
713 | int page_start, int page_end) | ||
714 | { | ||
715 | const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; | ||
716 | unsigned int cpu; | ||
717 | int i; | ||
718 | 619 | ||
719 | for_each_possible_cpu(cpu) { | 620 | INIT_LIST_HEAD(&chunk->list); |
720 | for (i = page_start; i < page_end; i++) { | 621 | chunk->free_size = pcpu_unit_size; |
721 | struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; | 622 | chunk->contig_hint = pcpu_unit_size; |
722 | |||
723 | *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); | ||
724 | if (!*pagep) { | ||
725 | pcpu_free_pages(chunk, pages, populated, | ||
726 | page_start, page_end); | ||
727 | return -ENOMEM; | ||
728 | } | ||
729 | } | ||
730 | } | ||
731 | return 0; | ||
732 | } | ||
733 | 623 | ||
734 | /** | 624 | return chunk; |
735 | * pcpu_pre_unmap_flush - flush cache prior to unmapping | ||
736 | * @chunk: chunk the regions to be flushed belongs to | ||
737 | * @page_start: page index of the first page to be flushed | ||
738 | * @page_end: page index of the last page to be flushed + 1 | ||
739 | * | ||
740 | * Pages in [@page_start,@page_end) of @chunk are about to be | ||
741 | * unmapped. Flush cache. As each flushing trial can be very | ||
742 | * expensive, issue flush on the whole region at once rather than | ||
743 | * doing it for each cpu. This could be an overkill but is more | ||
744 | * scalable. | ||
745 | */ | ||
746 | static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, | ||
747 | int page_start, int page_end) | ||
748 | { | ||
749 | flush_cache_vunmap( | ||
750 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | ||
751 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); | ||
752 | } | 625 | } |
753 | 626 | ||
754 | static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) | 627 | static void pcpu_free_chunk(struct pcpu_chunk *chunk) |
755 | { | 628 | { |
756 | unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); | 629 | if (!chunk) |
630 | return; | ||
631 | pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); | ||
632 | kfree(chunk); | ||
757 | } | 633 | } |
758 | 634 | ||
759 | /** | 635 | /* |
760 | * pcpu_unmap_pages - unmap pages out of a pcpu_chunk | 636 | * Chunk management implementation. |
761 | * @chunk: chunk of interest | 637 | * |
762 | * @pages: pages array which can be used to pass information to free | 638 | * To allow different implementations, chunk alloc/free and |
763 | * @populated: populated bitmap | 639 | * [de]population are implemented in a separate file which is pulled |
764 | * @page_start: page index of the first page to unmap | 640 | * into this file and compiled together. The following functions |
765 | * @page_end: page index of the last page to unmap + 1 | 641 | * should be implemented. |
766 | * | 642 | * |
767 | * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. | 643 | * pcpu_populate_chunk - populate the specified range of a chunk |
768 | * Corresponding elements in @pages were cleared by the caller and can | 644 | * pcpu_depopulate_chunk - depopulate the specified range of a chunk |
769 | * be used to carry information to pcpu_free_pages() which will be | 645 | * pcpu_create_chunk - create a new chunk |
770 | * called after all unmaps are finished. The caller should call | 646 | * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop |
771 | * proper pre/post flush functions. | 647 | * pcpu_addr_to_page - translate address to physical address |
648 | * pcpu_verify_alloc_info - check alloc_info is acceptable during init | ||
772 | */ | 649 | */ |
773 | static void pcpu_unmap_pages(struct pcpu_chunk *chunk, | 650 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size); |
774 | struct page **pages, unsigned long *populated, | 651 | static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size); |
775 | int page_start, int page_end) | 652 | static struct pcpu_chunk *pcpu_create_chunk(void); |
776 | { | 653 | static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); |
777 | unsigned int cpu; | 654 | static struct page *pcpu_addr_to_page(void *addr); |
778 | int i; | 655 | static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); |
779 | 656 | ||
780 | for_each_possible_cpu(cpu) { | 657 | #ifdef CONFIG_NEED_PER_CPU_KM |
781 | for (i = page_start; i < page_end; i++) { | 658 | #include "percpu-km.c" |
782 | struct page *page; | 659 | #else |
783 | 660 | #include "percpu-vm.c" | |
784 | page = pcpu_chunk_page(chunk, cpu, i); | 661 | #endif |
785 | WARN_ON(!page); | ||
786 | pages[pcpu_page_idx(cpu, i)] = page; | ||
787 | } | ||
788 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), | ||
789 | page_end - page_start); | ||
790 | } | ||
791 | |||
792 | for (i = page_start; i < page_end; i++) | ||
793 | __clear_bit(i, populated); | ||
794 | } | ||
795 | 662 | ||
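The right-hand column above declares a small set of static hooks in percpu.c and then #includes exactly one backend, percpu-km.c or percpu-vm.c, so core and backend compile as a single translation unit. A toy sketch of that compile-time backend selection follows; the file and function names (core.c, backend_a.c, backend_b.c, backend_setup/teardown) are invented for the example, not taken from the patch.

/* core.c: declare the hooks, then pull in exactly one backend so
 * everything builds as one translation unit. */
#include <stdio.h>

static int  backend_setup(void);
static void backend_teardown(void);

#ifdef USE_BACKEND_A
#include "backend_a.c"        /* would define the same two functions */
#else
#include "backend_b.c"
#endif

int main(void)
{
    if (backend_setup())
        return 1;
    backend_teardown();
    return 0;
}

/* backend_b.c: one implementation of the static hook API */
static int backend_setup(void)
{
    puts("vm-style backend up");
    return 0;
}

static void backend_teardown(void)
{
    puts("vm-style backend down");
}

Because the backend is textually included, its functions can stay static and still be called by the core, which is exactly what lets pcpu_create_chunk() and friends differ between the km and vm flavours without an indirection table.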
796 | /** | 663 | /** |
797 | * pcpu_post_unmap_tlb_flush - flush TLB after unmapping | 664 | * pcpu_chunk_addr_search - determine chunk containing specified address |
798 | * @chunk: pcpu_chunk the regions to be flushed belong to | 665 | * @addr: address for which the chunk needs to be determined. |
799 | * @page_start: page index of the first page to be flushed | ||
800 | * @page_end: page index of the last page to be flushed + 1 | ||
801 | * | ||
802 | * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush | ||
803 | * TLB for the regions. This can be skipped if the area is to be | ||
804 | * returned to vmalloc as vmalloc will handle TLB flushing lazily. | ||
805 | * | 666 | * |
806 | * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once | 667 | * RETURNS: |
807 | * for the whole region. | 668 | * The address of the found chunk. |
808 | */ | ||
809 | static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, | ||
810 | int page_start, int page_end) | ||
811 | { | ||
812 | flush_tlb_kernel_range( | ||
813 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | ||
814 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); | ||
815 | } | ||
816 | |||
817 | static int __pcpu_map_pages(unsigned long addr, struct page **pages, | ||
818 | int nr_pages) | ||
819 | { | ||
820 | return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, | ||
821 | PAGE_KERNEL, pages); | ||
822 | } | ||
823 | |||
824 | /** | ||
825 | * pcpu_map_pages - map pages into a pcpu_chunk | ||
826 | * @chunk: chunk of interest | ||
827 | * @pages: pages array containing pages to be mapped | ||
828 | * @populated: populated bitmap | ||
829 | * @page_start: page index of the first page to map | ||
830 | * @page_end: page index of the last page to map + 1 | ||
831 | * | ||
832 | * For each cpu, map pages [@page_start,@page_end) into @chunk. The | ||
833 | * caller is responsible for calling pcpu_post_map_flush() after all | ||
834 | * mappings are complete. | ||
835 | * | ||
836 | * This function is responsible for setting corresponding bits in | ||
837 | * @chunk->populated bitmap and whatever is necessary for reverse | ||
838 | * lookup (addr -> chunk). | ||
839 | */ | 669 | */ |
840 | static int pcpu_map_pages(struct pcpu_chunk *chunk, | 670 | static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) |
841 | struct page **pages, unsigned long *populated, | ||
842 | int page_start, int page_end) | ||
843 | { | 671 | { |
844 | unsigned int cpu, tcpu; | 672 | /* is it in the first chunk? */ |
845 | int i, err; | 673 | if (pcpu_addr_in_first_chunk(addr)) { |
846 | 674 | /* is it in the reserved area? */ | |
847 | for_each_possible_cpu(cpu) { | 675 | if (pcpu_addr_in_reserved_chunk(addr)) |
848 | err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), | 676 | return pcpu_reserved_chunk; |
849 | &pages[pcpu_page_idx(cpu, page_start)], | 677 | return pcpu_first_chunk; |
850 | page_end - page_start); | ||
851 | if (err < 0) | ||
852 | goto err; | ||
853 | } | ||
854 | |||
855 | /* mapping successful, link chunk and mark populated */ | ||
856 | for (i = page_start; i < page_end; i++) { | ||
857 | for_each_possible_cpu(cpu) | ||
858 | pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], | ||
859 | chunk); | ||
860 | __set_bit(i, populated); | ||
861 | } | ||
862 | |||
863 | return 0; | ||
864 | |||
865 | err: | ||
866 | for_each_possible_cpu(tcpu) { | ||
867 | if (tcpu == cpu) | ||
868 | break; | ||
869 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), | ||
870 | page_end - page_start); | ||
871 | } | 678 | } |
872 | return err; | ||
873 | } | ||
874 | |||
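pcpu_map_pages() records every successfully mapped page with __set_bit() in the caller-supplied bitmap and pcpu_unmap_pages() clears the same bits, so the chunk's populated state always matches what is actually mapped. The standalone sketch below shows the same set/clear/test bookkeeping on a plain unsigned long bitmap; the helpers are local toys, not the kernel's bitops.

#include <limits.h>
#include <stdio.h>

#define UNIT_PAGES    64
#define BITS_PER_LONG (CHAR_BIT * sizeof(unsigned long))
#define BITMAP_LONGS  ((UNIT_PAGES + BITS_PER_LONG - 1) / BITS_PER_LONG)

static void set_bit(int nr, unsigned long *map)
{
    map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static void clear_bit(int nr, unsigned long *map)
{
    map[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG));
}

static int test_bit(int nr, const unsigned long *map)
{
    return (map[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

int main(void)
{
    unsigned long populated[BITMAP_LONGS] = { 0 };
    int i;

    for (i = 3; i < 10; i++)   /* "map" pages [3,10) */
        set_bit(i, populated);
    for (i = 5; i < 8; i++)    /* "unmap" pages [5,8) */
        clear_bit(i, populated);

    for (i = 0; i < 16; i++)
        printf("%d", test_bit(i, populated));
    putchar('\n');             /* prints 0001100011000000 */
    return 0;
}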
875 | /** | ||
876 | * pcpu_post_map_flush - flush cache after mapping | ||
877 | * @chunk: pcpu_chunk the regions to be flushed belong to | ||
878 | * @page_start: page index of the first page to be flushed | ||
879 | * @page_end: page index of the last page to be flushed + 1 | ||
880 | * | ||
881 | * Pages [@page_start,@page_end) of @chunk have been mapped. Flush | ||
882 | * cache. | ||
883 | * | ||
884 | * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once | ||
885 | * for the whole region. | ||
886 | */ | ||
887 | static void pcpu_post_map_flush(struct pcpu_chunk *chunk, | ||
888 | int page_start, int page_end) | ||
889 | { | ||
890 | flush_cache_vmap( | ||
891 | pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), | ||
892 | pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); | ||
893 | } | ||
894 | |||
895 | /** | ||
896 | * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk | ||
897 | * @chunk: chunk to depopulate | ||
898 | * @off: offset to the area to depopulate | ||
899 | * @size: size of the area to depopulate in bytes | ||
901 | * | ||
902 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) | ||
903 | * from @chunk. The cache is flushed before unmapping; TLB flushing | ||
904 | * is left to vmalloc, which purges the mappings lazily. | ||
905 | * | ||
906 | * CONTEXT: | ||
907 | * pcpu_alloc_mutex. | ||
908 | */ | ||
909 | static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) | ||
910 | { | ||
911 | int page_start = PFN_DOWN(off); | ||
912 | int page_end = PFN_UP(off + size); | ||
913 | struct page **pages; | ||
914 | unsigned long *populated; | ||
915 | int rs, re; | ||
916 | |||
917 | /* quick path, check whether it's empty already */ | ||
918 | rs = page_start; | ||
919 | pcpu_next_unpop(chunk, &rs, &re, page_end); | ||
920 | if (rs == page_start && re == page_end) | ||
921 | return; | ||
922 | |||
923 | /* immutable chunks can't be depopulated */ | ||
924 | WARN_ON(chunk->immutable); | ||
925 | 679 | ||
926 | /* | 680 | /* |
927 | * If control reaches here, there must have been at least one | 681 | * The address is relative to unit0 which might be unused and |
928 | * successful population attempt so the temp pages array must | 682 | * thus unmapped. Offset the address to the unit space of the |
929 | * be available now. | 683 | * current processor before looking it up in the vmalloc |
684 | * space. Note that any possible cpu id can be used here, so | ||
685 | * there's no need to worry about preemption or cpu hotplug. | ||
930 | */ | 686 | */ |
931 | pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); | 687 | addr += pcpu_unit_offsets[raw_smp_processor_id()]; |
932 | BUG_ON(!pages); | 688 | return pcpu_get_page_chunk(pcpu_addr_to_page(addr)); |
933 | |||
934 | /* unmap and free */ | ||
935 | pcpu_pre_unmap_flush(chunk, page_start, page_end); | ||
936 | |||
937 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) | ||
938 | pcpu_unmap_pages(chunk, pages, populated, rs, re); | ||
939 | |||
940 | /* no need to flush tlb, vmalloc will handle it lazily */ | ||
941 | |||
942 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) | ||
943 | pcpu_free_pages(chunk, pages, populated, rs, re); | ||
944 | |||
945 | /* commit new bitmap */ | ||
946 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); | ||
947 | } | ||
948 | |||
949 | /** | ||
950 | * pcpu_populate_chunk - populate and map an area of a pcpu_chunk | ||
951 | * @chunk: chunk of interest | ||
952 | * @off: offset to the area to populate | ||
953 | * @size: size of the area to populate in bytes | ||
954 | * | ||
955 | * For each cpu, populate and map pages [@page_start,@page_end) into | ||
956 | * @chunk. The area is cleared on return. | ||
957 | * | ||
958 | * CONTEXT: | ||
959 | * pcpu_alloc_mutex, does GFP_KERNEL allocation. | ||
960 | */ | ||
961 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) | ||
962 | { | ||
963 | int page_start = PFN_DOWN(off); | ||
964 | int page_end = PFN_UP(off + size); | ||
965 | int free_end = page_start, unmap_end = page_start; | ||
966 | struct page **pages; | ||
967 | unsigned long *populated; | ||
968 | unsigned int cpu; | ||
969 | int rs, re, rc; | ||
970 | |||
971 | /* quick path, check whether all pages are already there */ | ||
972 | rs = page_start; | ||
973 | pcpu_next_pop(chunk, &rs, &re, page_end); | ||
974 | if (rs == page_start && re == page_end) | ||
975 | goto clear; | ||
976 | |||
977 | /* need to allocate and map pages, this chunk can't be immutable */ | ||
978 | WARN_ON(chunk->immutable); | ||
979 | |||
980 | pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); | ||
981 | if (!pages) | ||
982 | return -ENOMEM; | ||
983 | |||
984 | /* alloc and map */ | ||
985 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { | ||
986 | rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); | ||
987 | if (rc) | ||
988 | goto err_free; | ||
989 | free_end = re; | ||
990 | } | ||
991 | |||
992 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { | ||
993 | rc = pcpu_map_pages(chunk, pages, populated, rs, re); | ||
994 | if (rc) | ||
995 | goto err_unmap; | ||
996 | unmap_end = re; | ||
997 | } | ||
998 | pcpu_post_map_flush(chunk, page_start, page_end); | ||
999 | |||
1000 | /* commit new bitmap */ | ||
1001 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); | ||
1002 | clear: | ||
1003 | for_each_possible_cpu(cpu) | ||
1004 | memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); | ||
1005 | return 0; | ||
1006 | |||
1007 | err_unmap: | ||
1008 | pcpu_pre_unmap_flush(chunk, page_start, unmap_end); | ||
1009 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) | ||
1010 | pcpu_unmap_pages(chunk, pages, populated, rs, re); | ||
1011 | pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); | ||
1012 | err_free: | ||
1013 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) | ||
1014 | pcpu_free_pages(chunk, pages, populated, rs, re); | ||
1015 | return rc; | ||
1016 | } | ||
1017 | |||
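pcpu_populate_chunk() takes a quick path when the requested range is already populated and otherwise walks only the unpopulated runs. Below is a toy version of that region walk using a char-per-page array instead of a real bitmap; next_unpop() only mirrors the contract of pcpu_next_unpop() as used above, it is not the kernel helper.

#include <stdio.h>

#define NR_PAGES 16

/* toy populated map: one char per page, 1 = populated */
static char populated[NR_PAGES] = {
    1,1,0,0, 1,1,1,0, 0,0,1,1, 1,1,1,1,
};

/*
 * Advance *rs to the first unpopulated page at or after *rs and set
 * *re to the end of that unpopulated run, both clamped to 'end'.
 */
static void next_unpop(int *rs, int *re, int end)
{
    while (*rs < end && populated[*rs])
        (*rs)++;
    *re = *rs;
    while (*re < end && !populated[*re])
        (*re)++;
}

int main(void)
{
    int start = 0, end = NR_PAGES;
    int rs, re;

    /* quick path: nothing unpopulated in the range means nothing to do */
    rs = start;
    next_unpop(&rs, &re, end);
    if (rs == end) {
        puts("already fully populated");
        return 0;
    }

    /* otherwise visit every unpopulated region in [start, end) */
    for (rs = start; rs < end; rs = re) {
        next_unpop(&rs, &re, end);
        if (rs >= end)
            break;
        printf("would populate [%d,%d)\n", rs, re);
    }
    return 0;
}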
1018 | static void free_pcpu_chunk(struct pcpu_chunk *chunk) | ||
1019 | { | ||
1020 | if (!chunk) | ||
1021 | return; | ||
1022 | if (chunk->vms) | ||
1023 | pcpu_free_vm_areas(chunk->vms, pcpu_nr_groups); | ||
1024 | pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); | ||
1025 | kfree(chunk); | ||
1026 | } | ||
1027 | |||
1028 | static struct pcpu_chunk *alloc_pcpu_chunk(void) | ||
1029 | { | ||
1030 | struct pcpu_chunk *chunk; | ||
1031 | |||
1032 | chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL); | ||
1033 | if (!chunk) | ||
1034 | return NULL; | ||
1035 | |||
1036 | chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); | ||
1037 | chunk->map_alloc = PCPU_DFL_MAP_ALLOC; | ||
1038 | chunk->map[chunk->map_used++] = pcpu_unit_size; | ||
1039 | |||
1040 | chunk->vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, | ||
1041 | pcpu_nr_groups, pcpu_atom_size, | ||
1042 | GFP_KERNEL); | ||
1043 | if (!chunk->vms) { | ||
1044 | free_pcpu_chunk(chunk); | ||
1045 | return NULL; | ||
1046 | } | ||
1047 | |||
1048 | INIT_LIST_HEAD(&chunk->list); | ||
1049 | chunk->free_size = pcpu_unit_size; | ||
1050 | chunk->contig_hint = pcpu_unit_size; | ||
1051 | chunk->base_addr = chunk->vms[0]->addr - pcpu_group_offsets[0]; | ||
1052 | |||
1053 | return chunk; | ||
1054 | } | 689 | } |
1055 | 690 | ||
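The comment in pcpu_chunk_addr_search() notes that the incoming address is relative to unit 0, which may be unused, so it is rebased with pcpu_unit_offsets[] onto a unit that is certainly mapped before the vmalloc lookup. The toy below only illustrates that unit-offset addressing model (one contiguous area, one offset per cpu); the sizes and names are invented for the example.

#include <stdio.h>

#define NR_CPUS   4
#define UNIT_INTS 1024            /* size of one cpu's unit, in ints */

static int area[NR_CPUS * UNIT_INTS];

/* offset of each cpu's unit from the start of the area */
static const int unit_offsets[NR_CPUS] = {
    0 * UNIT_INTS, 1 * UNIT_INTS, 2 * UNIT_INTS, 3 * UNIT_INTS,
};

/* a "percpu pointer" is really an offset that is valid in every unit */
static int *per_cpu_slot(int pcpu_off, int cpu)
{
    return &area[unit_offsets[cpu] + pcpu_off];
}

int main(void)
{
    int counter_off = 128;        /* same slot in every unit */
    int cpu;

    for (cpu = 0; cpu < NR_CPUS; cpu++)
        *per_cpu_slot(counter_off, cpu) = cpu * 10;
    for (cpu = 0; cpu < NR_CPUS; cpu++)
        printf("cpu%d counter = %d\n", cpu, *per_cpu_slot(counter_off, cpu));
    return 0;
}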
1056 | /** | 691 | /** |
@@ -1142,7 +777,7 @@ restart: | |||
1142 | /* hmmm... no space left, create a new chunk */ | 777 | /* hmmm... no space left, create a new chunk */ |
1143 | spin_unlock_irqrestore(&pcpu_lock, flags); | 778 | spin_unlock_irqrestore(&pcpu_lock, flags); |
1144 | 779 | ||
1145 | chunk = alloc_pcpu_chunk(); | 780 | chunk = pcpu_create_chunk(); |
1146 | if (!chunk) { | 781 | if (!chunk) { |
1147 | err = "failed to allocate new chunk"; | 782 | err = "failed to allocate new chunk"; |
1148 | goto fail_unlock_mutex; | 783 | goto fail_unlock_mutex; |
@@ -1254,7 +889,7 @@ static void pcpu_reclaim(struct work_struct *work) | |||
1254 | 889 | ||
1255 | list_for_each_entry_safe(chunk, next, &todo, list) { | 890 | list_for_each_entry_safe(chunk, next, &todo, list) { |
1256 | pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); | 891 | pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); |
1257 | free_pcpu_chunk(chunk); | 892 | pcpu_destroy_chunk(chunk); |
1258 | } | 893 | } |
1259 | 894 | ||
1260 | mutex_unlock(&pcpu_alloc_mutex); | 895 | mutex_unlock(&pcpu_alloc_mutex); |
@@ -1343,11 +978,14 @@ bool is_kernel_percpu_address(unsigned long addr) | |||
1343 | */ | 978 | */ |
1344 | phys_addr_t per_cpu_ptr_to_phys(void *addr) | 979 | phys_addr_t per_cpu_ptr_to_phys(void *addr) |
1345 | { | 980 | { |
1346 | if ((unsigned long)addr < VMALLOC_START || | 981 | if (pcpu_addr_in_first_chunk(addr)) { |
1347 | (unsigned long)addr >= VMALLOC_END) | 982 | if ((unsigned long)addr < VMALLOC_START || |
1348 | return __pa(addr); | 983 | (unsigned long)addr >= VMALLOC_END) |
1349 | else | 984 | return __pa(addr); |
1350 | return page_to_phys(vmalloc_to_page(addr)); | 985 | else |
986 | return page_to_phys(vmalloc_to_page(addr)); | ||
987 | } else | ||
988 | return page_to_phys(pcpu_addr_to_page(addr)); | ||
1351 | } | 989 | } |
1352 | 990 | ||
1353 | static inline size_t pcpu_calc_fc_sizes(size_t static_size, | 991 | static inline size_t pcpu_calc_fc_sizes(size_t static_size, |
@@ -1719,6 +1357,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1719 | PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); | 1357 | PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); |
1720 | PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); | 1358 | PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); |
1721 | PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); | 1359 | PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); |
1360 | PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); | ||
1722 | 1361 | ||
1723 | /* process group information and build config tables accordingly */ | 1362 | /* process group information and build config tables accordingly */ |
1724 | group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); | 1363 | group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); |
@@ -133,8 +133,8 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
133 | goto out_enomem_free_avc; | 133 | goto out_enomem_free_avc; |
134 | allocated = anon_vma; | 134 | allocated = anon_vma; |
135 | } | 135 | } |
136 | spin_lock(&anon_vma->lock); | ||
137 | 136 | ||
137 | spin_lock(&anon_vma->lock); | ||
138 | /* page_table_lock to protect against threads */ | 138 | /* page_table_lock to protect against threads */ |
139 | spin_lock(&mm->page_table_lock); | 139 | spin_lock(&mm->page_table_lock); |
140 | if (likely(!vma->anon_vma)) { | 140 | if (likely(!vma->anon_vma)) { |
@@ -144,14 +144,15 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
144 | list_add(&avc->same_vma, &vma->anon_vma_chain); | 144 | list_add(&avc->same_vma, &vma->anon_vma_chain); |
145 | list_add(&avc->same_anon_vma, &anon_vma->head); | 145 | list_add(&avc->same_anon_vma, &anon_vma->head); |
146 | allocated = NULL; | 146 | allocated = NULL; |
147 | avc = NULL; | ||
147 | } | 148 | } |
148 | spin_unlock(&mm->page_table_lock); | 149 | spin_unlock(&mm->page_table_lock); |
149 | |||
150 | spin_unlock(&anon_vma->lock); | 150 | spin_unlock(&anon_vma->lock); |
151 | if (unlikely(allocated)) { | 151 | |
152 | if (unlikely(allocated)) | ||
152 | anon_vma_free(allocated); | 153 | anon_vma_free(allocated); |
154 | if (unlikely(avc)) | ||
153 | anon_vma_chain_free(avc); | 155 | anon_vma_chain_free(avc); |
154 | } | ||
155 | } | 156 | } |
156 | return 0; | 157 | return 0; |
157 | 158 | ||
@@ -335,14 +336,13 @@ vma_address(struct page *page, struct vm_area_struct *vma) | |||
335 | 336 | ||
336 | /* | 337 | /* |
337 | * At what user virtual address is page expected in vma? | 338 | * At what user virtual address is page expected in vma? |
338 | * checking that the page matches the vma. | 339 | * Caller should check the page is actually part of the vma. |
339 | */ | 340 | */ |
340 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) | 341 | unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) |
341 | { | 342 | { |
342 | if (PageAnon(page)) { | 343 | if (PageAnon(page)) |
343 | if (vma->anon_vma != page_anon_vma(page)) | 344 | ; |
344 | return -EFAULT; | 345 | else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { |
345 | } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { | ||
346 | if (!vma->vm_file || | 346 | if (!vma->vm_file || |
347 | vma->vm_file->f_mapping != page->mapping) | 347 | vma->vm_file->f_mapping != page->mapping) |
348 | return -EFAULT; | 348 | return -EFAULT; |
@@ -730,23 +730,28 @@ void page_move_anon_rmap(struct page *page, | |||
730 | * @page: the page to add the mapping to | 730 | * @page: the page to add the mapping to |
731 | * @vma: the vm area in which the mapping is added | 731 | * @vma: the vm area in which the mapping is added |
732 | * @address: the user virtual address mapped | 732 | * @address: the user virtual address mapped |
733 | * @exclusive: the page is exclusively owned by the current process | ||
733 | */ | 734 | */ |
734 | static void __page_set_anon_rmap(struct page *page, | 735 | static void __page_set_anon_rmap(struct page *page, |
735 | struct vm_area_struct *vma, unsigned long address) | 736 | struct vm_area_struct *vma, unsigned long address, int exclusive) |
736 | { | 737 | { |
737 | struct anon_vma_chain *avc; | 738 | struct anon_vma *anon_vma = vma->anon_vma; |
738 | struct anon_vma *anon_vma; | ||
739 | 739 | ||
740 | BUG_ON(!vma->anon_vma); | 740 | BUG_ON(!anon_vma); |
741 | 741 | ||
742 | /* | 742 | /* |
743 | * We must use the _oldest_ possible anon_vma for the page mapping! | 743 | * If the page isn't exclusively mapped into this vma, |
744 | * we must use the _oldest_ possible anon_vma for the | ||
745 | * page mapping! | ||
744 | * | 746 | * |
745 | * So take the last AVC chain entry in the vma, which is the deepest | 747 | * So take the last AVC chain entry in the vma, which is |
746 | * ancestor, and use the anon_vma from that. | 748 | * the deepest ancestor, and use the anon_vma from that. |
747 | */ | 749 | */ |
748 | avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma); | 750 | if (!exclusive) { |
749 | anon_vma = avc->anon_vma; | 751 | struct anon_vma_chain *avc; |
752 | avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma); | ||
753 | anon_vma = avc->anon_vma; | ||
754 | } | ||
750 | 755 | ||
751 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | 756 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; |
752 | page->mapping = (struct address_space *) anon_vma; | 757 | page->mapping = (struct address_space *) anon_vma; |
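The "anon_vma + PAGE_MAPPING_ANON" line stores a type tag in the low bit of page->mapping, relying on the pointer being at least word aligned. The snippet below is a generic tagged-pointer sketch of that trick, not the kernel's mapping handling.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define ANON_FLAG 0x1UL   /* low bit is free because objects are word aligned */

struct anon { int dummy; };

static void *tag_anon(struct anon *a)
{
    return (void *)((uintptr_t)a | ANON_FLAG);
}

static int is_anon(void *mapping)
{
    return ((uintptr_t)mapping & ANON_FLAG) != 0;
}

static struct anon *untag_anon(void *mapping)
{
    return (struct anon *)((uintptr_t)mapping & ~ANON_FLAG);
}

int main(void)
{
    static struct anon a;         /* static, so suitably aligned */
    void *mapping = tag_anon(&a);

    assert(is_anon(mapping));
    assert(untag_anon(mapping) == &a);
    printf("tagged %p -> %p and back\n", (void *)&a, mapping);
    return 0;
}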
@@ -802,7 +807,7 @@ void page_add_anon_rmap(struct page *page, | |||
802 | VM_BUG_ON(!PageLocked(page)); | 807 | VM_BUG_ON(!PageLocked(page)); |
803 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | 808 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); |
804 | if (first) | 809 | if (first) |
805 | __page_set_anon_rmap(page, vma, address); | 810 | __page_set_anon_rmap(page, vma, address, 0); |
806 | else | 811 | else |
807 | __page_check_anon_rmap(page, vma, address); | 812 | __page_check_anon_rmap(page, vma, address); |
808 | } | 813 | } |
@@ -824,7 +829,7 @@ void page_add_new_anon_rmap(struct page *page, | |||
824 | SetPageSwapBacked(page); | 829 | SetPageSwapBacked(page); |
825 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ | 830 | atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ |
826 | __inc_zone_page_state(page, NR_ANON_PAGES); | 831 | __inc_zone_page_state(page, NR_ANON_PAGES); |
827 | __page_set_anon_rmap(page, vma, address); | 832 | __page_set_anon_rmap(page, vma, address, 1); |
828 | if (page_evictable(page, vma)) | 833 | if (page_evictable(page, vma)) |
829 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); | 834 | lru_cache_add_lru(page, LRU_ACTIVE_ANON); |
830 | else | 835 | else |
diff --git a/mm/shmem.c b/mm/shmem.c index eef4ebea515..0cd7f66f1c6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -1545,8 +1545,8 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) | |||
1545 | return 0; | 1545 | return 0; |
1546 | } | 1546 | } |
1547 | 1547 | ||
1548 | static struct inode *shmem_get_inode(struct super_block *sb, int mode, | 1548 | static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, |
1549 | dev_t dev, unsigned long flags) | 1549 | int mode, dev_t dev, unsigned long flags) |
1550 | { | 1550 | { |
1551 | struct inode *inode; | 1551 | struct inode *inode; |
1552 | struct shmem_inode_info *info; | 1552 | struct shmem_inode_info *info; |
@@ -1557,9 +1557,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode, | |||
1557 | 1557 | ||
1558 | inode = new_inode(sb); | 1558 | inode = new_inode(sb); |
1559 | if (inode) { | 1559 | if (inode) { |
1560 | inode->i_mode = mode; | 1560 | inode_init_owner(inode, dir, mode); |
1561 | inode->i_uid = current_fsuid(); | ||
1562 | inode->i_gid = current_fsgid(); | ||
1563 | inode->i_blocks = 0; | 1561 | inode->i_blocks = 0; |
1564 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; | 1562 | inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; |
1565 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; | 1563 | inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; |
@@ -1814,7 +1812,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) | |||
1814 | struct inode *inode; | 1812 | struct inode *inode; |
1815 | int error = -ENOSPC; | 1813 | int error = -ENOSPC; |
1816 | 1814 | ||
1817 | inode = shmem_get_inode(dir->i_sb, mode, dev, VM_NORESERVE); | 1815 | inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); |
1818 | if (inode) { | 1816 | if (inode) { |
1819 | error = security_inode_init_security(inode, dir, NULL, NULL, | 1817 | error = security_inode_init_security(inode, dir, NULL, NULL, |
1820 | NULL); | 1818 | NULL); |
@@ -1833,11 +1831,6 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) | |||
1833 | #else | 1831 | #else |
1834 | error = 0; | 1832 | error = 0; |
1835 | #endif | 1833 | #endif |
1836 | if (dir->i_mode & S_ISGID) { | ||
1837 | inode->i_gid = dir->i_gid; | ||
1838 | if (S_ISDIR(mode)) | ||
1839 | inode->i_mode |= S_ISGID; | ||
1840 | } | ||
1841 | dir->i_size += BOGO_DIRENT_SIZE; | 1834 | dir->i_size += BOGO_DIRENT_SIZE; |
1842 | dir->i_ctime = dir->i_mtime = CURRENT_TIME; | 1835 | dir->i_ctime = dir->i_mtime = CURRENT_TIME; |
1843 | d_instantiate(dentry, inode); | 1836 | d_instantiate(dentry, inode); |
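The shmem hunks replace the open-coded owner setup, including the setgid-directory handling removed a few lines up, with inode_init_owner(). Roughly, per the removed lines, the rule being centralized is: the owner is the creating user; if the parent directory is setgid the group is inherited and new directories keep S_ISGID; otherwise the creator's group is used. A simplified userspace sketch of that rule follows, with toy structs and getuid()/getgid() in place of credentials, and no ACL or namespace handling.

#include <stdio.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

struct toy_inode {
    uid_t  uid;
    gid_t  gid;
    mode_t mode;
};

/* simplified stand-in for the ownership rules described above */
static void toy_init_owner(struct toy_inode *inode,
                           const struct toy_inode *dir, mode_t mode)
{
    inode->uid = getuid();
    if (dir && (dir->mode & S_ISGID)) {
        inode->gid = dir->gid;
        if (S_ISDIR(mode))
            mode |= S_ISGID;      /* new directories inherit setgid */
    } else {
        inode->gid = getgid();
    }
    inode->mode = mode;
}

int main(void)
{
    struct toy_inode parent = { .uid = 0, .gid = 1000,
                                .mode = S_IFDIR | S_ISGID | 0775 };
    struct toy_inode child;

    toy_init_owner(&child, &parent, S_IFDIR | 0755);
    printf("gid=%u setgid=%s\n", (unsigned)child.gid,
           (child.mode & S_ISGID) ? "yes" : "no");
    return 0;
}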
@@ -1957,7 +1950,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
1957 | if (len > PAGE_CACHE_SIZE) | 1950 | if (len > PAGE_CACHE_SIZE) |
1958 | return -ENAMETOOLONG; | 1951 | return -ENAMETOOLONG; |
1959 | 1952 | ||
1960 | inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); | 1953 | inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); |
1961 | if (!inode) | 1954 | if (!inode) |
1962 | return -ENOSPC; | 1955 | return -ENOSPC; |
1963 | 1956 | ||
@@ -1992,8 +1985,6 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s | |||
1992 | unlock_page(page); | 1985 | unlock_page(page); |
1993 | page_cache_release(page); | 1986 | page_cache_release(page); |
1994 | } | 1987 | } |
1995 | if (dir->i_mode & S_ISGID) | ||
1996 | inode->i_gid = dir->i_gid; | ||
1997 | dir->i_size += BOGO_DIRENT_SIZE; | 1988 | dir->i_size += BOGO_DIRENT_SIZE; |
1998 | dir->i_ctime = dir->i_mtime = CURRENT_TIME; | 1989 | dir->i_ctime = dir->i_mtime = CURRENT_TIME; |
1999 | d_instantiate(dentry, inode); | 1990 | d_instantiate(dentry, inode); |
@@ -2071,14 +2062,14 @@ static int shmem_xattr_security_set(struct dentry *dentry, const char *name, | |||
2071 | size, flags); | 2062 | size, flags); |
2072 | } | 2063 | } |
2073 | 2064 | ||
2074 | static struct xattr_handler shmem_xattr_security_handler = { | 2065 | static const struct xattr_handler shmem_xattr_security_handler = { |
2075 | .prefix = XATTR_SECURITY_PREFIX, | 2066 | .prefix = XATTR_SECURITY_PREFIX, |
2076 | .list = shmem_xattr_security_list, | 2067 | .list = shmem_xattr_security_list, |
2077 | .get = shmem_xattr_security_get, | 2068 | .get = shmem_xattr_security_get, |
2078 | .set = shmem_xattr_security_set, | 2069 | .set = shmem_xattr_security_set, |
2079 | }; | 2070 | }; |
2080 | 2071 | ||
2081 | static struct xattr_handler *shmem_xattr_handlers[] = { | 2072 | static const struct xattr_handler *shmem_xattr_handlers[] = { |
2082 | &generic_acl_access_handler, | 2073 | &generic_acl_access_handler, |
2083 | &generic_acl_default_handler, | 2074 | &generic_acl_default_handler, |
2084 | &shmem_xattr_security_handler, | 2075 | &shmem_xattr_security_handler, |
@@ -2366,7 +2357,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) | |||
2366 | sb->s_flags |= MS_POSIXACL; | 2357 | sb->s_flags |= MS_POSIXACL; |
2367 | #endif | 2358 | #endif |
2368 | 2359 | ||
2369 | inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); | 2360 | inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); |
2370 | if (!inode) | 2361 | if (!inode) |
2371 | goto failed; | 2362 | goto failed; |
2372 | inode->i_uid = sbinfo->uid; | 2363 | inode->i_uid = sbinfo->uid; |
@@ -2611,7 +2602,7 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) | |||
2611 | 2602 | ||
2612 | #define shmem_vm_ops generic_file_vm_ops | 2603 | #define shmem_vm_ops generic_file_vm_ops |
2613 | #define shmem_file_operations ramfs_file_operations | 2604 | #define shmem_file_operations ramfs_file_operations |
2614 | #define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) | 2605 | #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) |
2615 | #define shmem_acct_size(flags, size) 0 | 2606 | #define shmem_acct_size(flags, size) 0 |
2616 | #define shmem_unacct_size(flags, size) do {} while (0) | 2607 | #define shmem_unacct_size(flags, size) do {} while (0) |
2617 | #define SHMEM_MAX_BYTES MAX_LFS_FILESIZE | 2608 | #define SHMEM_MAX_BYTES MAX_LFS_FILESIZE |
@@ -2655,7 +2646,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2655 | path.mnt = mntget(shm_mnt); | 2646 | path.mnt = mntget(shm_mnt); |
2656 | 2647 | ||
2657 | error = -ENOSPC; | 2648 | error = -ENOSPC; |
2658 | inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); | 2649 | inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); |
2659 | if (!inode) | 2650 | if (!inode) |
2660 | goto put_dentry; | 2651 | goto put_dentry; |
2661 | 2652 | ||
@@ -115,6 +115,7 @@ | |||
115 | #include <linux/reciprocal_div.h> | 115 | #include <linux/reciprocal_div.h> |
116 | #include <linux/debugobjects.h> | 116 | #include <linux/debugobjects.h> |
117 | #include <linux/kmemcheck.h> | 117 | #include <linux/kmemcheck.h> |
118 | #include <linux/memory.h> | ||
118 | 119 | ||
119 | #include <asm/cacheflush.h> | 120 | #include <asm/cacheflush.h> |
120 | #include <asm/tlbflush.h> | 121 | #include <asm/tlbflush.h> |
@@ -144,30 +145,6 @@ | |||
144 | #define BYTES_PER_WORD sizeof(void *) | 145 | #define BYTES_PER_WORD sizeof(void *) |
145 | #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) | 146 | #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) |
146 | 147 | ||
147 | #ifndef ARCH_KMALLOC_MINALIGN | ||
148 | /* | ||
149 | * Enforce a minimum alignment for the kmalloc caches. | ||
150 | * Usually, the kmalloc caches are cache_line_size() aligned, except when | ||
151 | * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. | ||
152 | * Some archs want to perform DMA into kmalloc caches and need a guaranteed | ||
153 | * alignment larger than the alignment of a 64-bit integer. | ||
154 | * ARCH_KMALLOC_MINALIGN allows that. | ||
155 | * Note that increasing this value may disable some debug features. | ||
156 | */ | ||
157 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) | ||
158 | #endif | ||
159 | |||
160 | #ifndef ARCH_SLAB_MINALIGN | ||
161 | /* | ||
162 | * Enforce a minimum alignment for all caches. | ||
163 | * Intended for archs that get misalignment faults even for BYTES_PER_WORD | ||
164 | * aligned buffers. Includes ARCH_KMALLOC_MINALIGN. | ||
165 | * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables | ||
166 | * some debug features. | ||
167 | */ | ||
168 | #define ARCH_SLAB_MINALIGN 0 | ||
169 | #endif | ||
170 | |||
171 | #ifndef ARCH_KMALLOC_FLAGS | 148 | #ifndef ARCH_KMALLOC_FLAGS |
172 | #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN | 149 | #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN |
173 | #endif | 150 | #endif |
@@ -1102,6 +1079,52 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
1102 | } | 1079 | } |
1103 | #endif | 1080 | #endif |
1104 | 1081 | ||
1082 | /* | ||
1083 | * Allocates and initializes nodelists for a node on each slab cache, used for | ||
1084 | * either memory or cpu hotplug. If memory is being hot-added, the kmem_list3 | ||
1085 | * will be allocated off-node since memory is not yet online for the new node. | ||
1086 | * When hotplugging memory or a cpu, existing nodelists are not replaced if | ||
1087 | * already in use. | ||
1088 | * | ||
1089 | * Must hold cache_chain_mutex. | ||
1090 | */ | ||
1091 | static int init_cache_nodelists_node(int node) | ||
1092 | { | ||
1093 | struct kmem_cache *cachep; | ||
1094 | struct kmem_list3 *l3; | ||
1095 | const int memsize = sizeof(struct kmem_list3); | ||
1096 | |||
1097 | list_for_each_entry(cachep, &cache_chain, next) { | ||
1098 | /* | ||
1099 | * Set up the size64 kmemlist for cpu before we can | ||
1100 | * begin anything. Make sure some other cpu on this | ||
1101 | * node has not already allocated this | ||
1102 | */ | ||
1103 | if (!cachep->nodelists[node]) { | ||
1104 | l3 = kmalloc_node(memsize, GFP_KERNEL, node); | ||
1105 | if (!l3) | ||
1106 | return -ENOMEM; | ||
1107 | kmem_list3_init(l3); | ||
1108 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + | ||
1109 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; | ||
1110 | |||
1111 | /* | ||
1112 | * The l3s don't come and go as CPUs come and | ||
1113 | * go. cache_chain_mutex is sufficient | ||
1114 | * protection here. | ||
1115 | */ | ||
1116 | cachep->nodelists[node] = l3; | ||
1117 | } | ||
1118 | |||
1119 | spin_lock_irq(&cachep->nodelists[node]->list_lock); | ||
1120 | cachep->nodelists[node]->free_limit = | ||
1121 | (1 + nr_cpus_node(node)) * | ||
1122 | cachep->batchcount + cachep->num; | ||
1123 | spin_unlock_irq(&cachep->nodelists[node]->list_lock); | ||
1124 | } | ||
1125 | return 0; | ||
1126 | } | ||
1127 | |||
1105 | static void __cpuinit cpuup_canceled(long cpu) | 1128 | static void __cpuinit cpuup_canceled(long cpu) |
1106 | { | 1129 | { |
1107 | struct kmem_cache *cachep; | 1130 | struct kmem_cache *cachep; |
@@ -1172,7 +1195,7 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
1172 | struct kmem_cache *cachep; | 1195 | struct kmem_cache *cachep; |
1173 | struct kmem_list3 *l3 = NULL; | 1196 | struct kmem_list3 *l3 = NULL; |
1174 | int node = cpu_to_node(cpu); | 1197 | int node = cpu_to_node(cpu); |
1175 | const int memsize = sizeof(struct kmem_list3); | 1198 | int err; |
1176 | 1199 | ||
1177 | /* | 1200 | /* |
1178 | * We need to do this right in the beginning since | 1201 | * We need to do this right in the beginning since |
@@ -1180,35 +1203,9 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
1180 | * kmalloc_node allows us to add the slab to the right | 1203 | * kmalloc_node allows us to add the slab to the right |
1181 | * kmem_list3 and not this cpu's kmem_list3 | 1204 | * kmem_list3 and not this cpu's kmem_list3 |
1182 | */ | 1205 | */ |
1183 | 1206 | err = init_cache_nodelists_node(node); | |
1184 | list_for_each_entry(cachep, &cache_chain, next) { | 1207 | if (err < 0) |
1185 | /* | 1208 | goto bad; |
1186 | * Set up the size64 kmemlist for cpu before we can | ||
1187 | * begin anything. Make sure some other cpu on this | ||
1188 | * node has not already allocated this | ||
1189 | */ | ||
1190 | if (!cachep->nodelists[node]) { | ||
1191 | l3 = kmalloc_node(memsize, GFP_KERNEL, node); | ||
1192 | if (!l3) | ||
1193 | goto bad; | ||
1194 | kmem_list3_init(l3); | ||
1195 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + | ||
1196 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; | ||
1197 | |||
1198 | /* | ||
1199 | * The l3s don't come and go as CPUs come and | ||
1200 | * go. cache_chain_mutex is sufficient | ||
1201 | * protection here. | ||
1202 | */ | ||
1203 | cachep->nodelists[node] = l3; | ||
1204 | } | ||
1205 | |||
1206 | spin_lock_irq(&cachep->nodelists[node]->list_lock); | ||
1207 | cachep->nodelists[node]->free_limit = | ||
1208 | (1 + nr_cpus_node(node)) * | ||
1209 | cachep->batchcount + cachep->num; | ||
1210 | spin_unlock_irq(&cachep->nodelists[node]->list_lock); | ||
1211 | } | ||
1212 | 1209 | ||
1213 | /* | 1210 | /* |
1214 | * Now we can go ahead with allocating the shared arrays and | 1211 | * Now we can go ahead with allocating the shared arrays and |
@@ -1331,11 +1328,75 @@ static struct notifier_block __cpuinitdata cpucache_notifier = { | |||
1331 | &cpuup_callback, NULL, 0 | 1328 | &cpuup_callback, NULL, 0 |
1332 | }; | 1329 | }; |
1333 | 1330 | ||
1331 | #if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) | ||
1332 | /* | ||
1333 | * Drains freelist for a node on each slab cache, used for memory hot-remove. | ||
1334 | * Returns -EBUSY if all objects cannot be drained so that the node is not | ||
1335 | * removed. | ||
1336 | * | ||
1337 | * Must hold cache_chain_mutex. | ||
1338 | */ | ||
1339 | static int __meminit drain_cache_nodelists_node(int node) | ||
1340 | { | ||
1341 | struct kmem_cache *cachep; | ||
1342 | int ret = 0; | ||
1343 | |||
1344 | list_for_each_entry(cachep, &cache_chain, next) { | ||
1345 | struct kmem_list3 *l3; | ||
1346 | |||
1347 | l3 = cachep->nodelists[node]; | ||
1348 | if (!l3) | ||
1349 | continue; | ||
1350 | |||
1351 | drain_freelist(cachep, l3, l3->free_objects); | ||
1352 | |||
1353 | if (!list_empty(&l3->slabs_full) || | ||
1354 | !list_empty(&l3->slabs_partial)) { | ||
1355 | ret = -EBUSY; | ||
1356 | break; | ||
1357 | } | ||
1358 | } | ||
1359 | return ret; | ||
1360 | } | ||
1361 | |||
1362 | static int __meminit slab_memory_callback(struct notifier_block *self, | ||
1363 | unsigned long action, void *arg) | ||
1364 | { | ||
1365 | struct memory_notify *mnb = arg; | ||
1366 | int ret = 0; | ||
1367 | int nid; | ||
1368 | |||
1369 | nid = mnb->status_change_nid; | ||
1370 | if (nid < 0) | ||
1371 | goto out; | ||
1372 | |||
1373 | switch (action) { | ||
1374 | case MEM_GOING_ONLINE: | ||
1375 | mutex_lock(&cache_chain_mutex); | ||
1376 | ret = init_cache_nodelists_node(nid); | ||
1377 | mutex_unlock(&cache_chain_mutex); | ||
1378 | break; | ||
1379 | case MEM_GOING_OFFLINE: | ||
1380 | mutex_lock(&cache_chain_mutex); | ||
1381 | ret = drain_cache_nodelists_node(nid); | ||
1382 | mutex_unlock(&cache_chain_mutex); | ||
1383 | break; | ||
1384 | case MEM_ONLINE: | ||
1385 | case MEM_OFFLINE: | ||
1386 | case MEM_CANCEL_ONLINE: | ||
1387 | case MEM_CANCEL_OFFLINE: | ||
1388 | break; | ||
1389 | } | ||
1390 | out: | ||
1391 | return ret ? notifier_from_errno(ret) : NOTIFY_OK; | ||
1392 | } | ||
1393 | #endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ | ||
1394 | |||
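slab_memory_callback() follows the usual hotplug shape: only the GOING_* events do work or can veto, the committed ONLINE/OFFLINE and CANCEL_* events are ignored, and an errno is translated into the notifier's failure return. The sketch below mirrors that shape with stand-in constants; NOTIFY_OK/NOTIFY_BAD and the event names here are illustrative, not the kernel's notifier API.

#include <errno.h>
#include <stdio.h>

enum mem_event { GOING_ONLINE, ONLINE, GOING_OFFLINE, OFFLINE };

#define NOTIFY_OK  1
#define NOTIFY_BAD 0x8000         /* stand-in for notifier_from_errno() */

static int node_busy;             /* pretend some objects are still live */

static int prepare_node(int nid)
{
    printf("allocating per-node lists for node %d\n", nid);
    return 0;
}

static int drain_node(int nid)
{
    printf("draining node %d\n", nid);
    return node_busy ? -EBUSY : 0;
}

/* only the GOING_* events can fail; failure vetoes the transition */
static int memory_callback(enum mem_event action, int nid)
{
    int ret = 0;

    switch (action) {
    case GOING_ONLINE:
        ret = prepare_node(nid);
        break;
    case GOING_OFFLINE:
        ret = drain_node(nid);
        break;
    case ONLINE:
    case OFFLINE:
        break;
    }
    return ret ? NOTIFY_BAD : NOTIFY_OK;
}

int main(void)
{
    node_busy = 1;
    printf("offline veto: %s\n",
           memory_callback(GOING_OFFLINE, 1) == NOTIFY_OK ? "no" : "yes");
    return 0;
}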
1334 | /* | 1395 | /* |
1335 | * swap the static kmem_list3 with kmalloced memory | 1396 | * swap the static kmem_list3 with kmalloced memory |
1336 | */ | 1397 | */ |
1337 | static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, | 1398 | static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list, |
1338 | int nodeid) | 1399 | int nodeid) |
1339 | { | 1400 | { |
1340 | struct kmem_list3 *ptr; | 1401 | struct kmem_list3 *ptr; |
1341 | 1402 | ||
@@ -1580,6 +1641,14 @@ void __init kmem_cache_init_late(void) | |||
1580 | */ | 1641 | */ |
1581 | register_cpu_notifier(&cpucache_notifier); | 1642 | register_cpu_notifier(&cpucache_notifier); |
1582 | 1643 | ||
1644 | #ifdef CONFIG_NUMA | ||
1645 | /* | ||
1646 | * Register a memory hotplug callback that initializes and frees | ||
1647 | * nodelists. | ||
1648 | */ | ||
1649 | hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); | ||
1650 | #endif | ||
1651 | |||
1583 | /* | 1652 | /* |
1584 | * The reap timers are started later, with a module init call: That part | 1653 | * The reap timers are started later, with a module init call: That part |
1585 | * of the kernel is not yet operational. | 1654 | * of the kernel is not yet operational. |
@@ -4216,10 +4285,11 @@ static int s_show(struct seq_file *m, void *p) | |||
4216 | unsigned long node_frees = cachep->node_frees; | 4285 | unsigned long node_frees = cachep->node_frees; |
4217 | unsigned long overflows = cachep->node_overflow; | 4286 | unsigned long overflows = cachep->node_overflow; |
4218 | 4287 | ||
4219 | seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ | 4288 | seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu " |
4220 | %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, | 4289 | "%4lu %4lu %4lu %4lu %4lu", |
4221 | reaped, errors, max_freeable, node_allocs, | 4290 | allocs, high, grown, |
4222 | node_frees, overflows); | 4291 | reaped, errors, max_freeable, node_allocs, |
4292 | node_frees, overflows); | ||
4223 | } | 4293 | } |
4224 | /* cpu stats */ | 4294 | /* cpu stats */ |
4225 | { | 4295 | { |
@@ -467,14 +467,6 @@ out: | |||
467 | * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. | 467 | * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. |
468 | */ | 468 | */ |
469 | 469 | ||
470 | #ifndef ARCH_KMALLOC_MINALIGN | ||
471 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long) | ||
472 | #endif | ||
473 | |||
474 | #ifndef ARCH_SLAB_MINALIGN | ||
475 | #define ARCH_SLAB_MINALIGN __alignof__(unsigned long) | ||
476 | #endif | ||
477 | |||
478 | void *__kmalloc_node(size_t size, gfp_t gfp, int node) | 470 | void *__kmalloc_node(size_t size, gfp_t gfp, int node) |
479 | { | 471 | { |
480 | unsigned int *m; | 472 | unsigned int *m; |
@@ -157,14 +157,6 @@ | |||
157 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ | 157 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ |
158 | SLAB_CACHE_DMA | SLAB_NOTRACK) | 158 | SLAB_CACHE_DMA | SLAB_NOTRACK) |
159 | 159 | ||
160 | #ifndef ARCH_KMALLOC_MINALIGN | ||
161 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) | ||
162 | #endif | ||
163 | |||
164 | #ifndef ARCH_SLAB_MINALIGN | ||
165 | #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) | ||
166 | #endif | ||
167 | |||
168 | #define OO_SHIFT 16 | 160 | #define OO_SHIFT 16 |
169 | #define OO_MASK ((1 << OO_SHIFT) - 1) | 161 | #define OO_MASK ((1 << OO_SHIFT) - 1) |
170 | #define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ | 162 | #define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ |
@@ -1084,7 +1076,7 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node, | |||
1084 | if (node == -1) | 1076 | if (node == -1) |
1085 | return alloc_pages(flags, order); | 1077 | return alloc_pages(flags, order); |
1086 | else | 1078 | else |
1087 | return alloc_pages_node(node, flags, order); | 1079 | return alloc_pages_exact_node(node, flags, order); |
1088 | } | 1080 | } |
1089 | 1081 | ||
1090 | static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | 1082 | static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) |
@@ -2153,7 +2145,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) | |||
2153 | int local_node; | 2145 | int local_node; |
2154 | 2146 | ||
2155 | if (slab_state >= UP && (s < kmalloc_caches || | 2147 | if (slab_state >= UP && (s < kmalloc_caches || |
2156 | s > kmalloc_caches + KMALLOC_CACHES)) | 2148 | s >= kmalloc_caches + KMALLOC_CACHES)) |
2157 | local_node = page_to_nid(virt_to_page(s)); | 2149 | local_node = page_to_nid(virt_to_page(s)); |
2158 | else | 2150 | else |
2159 | local_node = 0; | 2151 | local_node = 0; |
@@ -2429,9 +2421,11 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, | |||
2429 | #ifdef CONFIG_SLUB_DEBUG | 2421 | #ifdef CONFIG_SLUB_DEBUG |
2430 | void *addr = page_address(page); | 2422 | void *addr = page_address(page); |
2431 | void *p; | 2423 | void *p; |
2432 | DECLARE_BITMAP(map, page->objects); | 2424 | long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long), |
2425 | GFP_ATOMIC); | ||
2433 | 2426 | ||
2434 | bitmap_zero(map, page->objects); | 2427 | if (!map) |
2428 | return; | ||
2435 | slab_err(s, page, "%s", text); | 2429 | slab_err(s, page, "%s", text); |
2436 | slab_lock(page); | 2430 | slab_lock(page); |
2437 | for_each_free_object(p, s, page->freelist) | 2431 | for_each_free_object(p, s, page->freelist) |
@@ -2446,6 +2440,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, | |||
2446 | } | 2440 | } |
2447 | } | 2441 | } |
2448 | slab_unlock(page); | 2442 | slab_unlock(page); |
2443 | kfree(map); | ||
2449 | #endif | 2444 | #endif |
2450 | } | 2445 | } |
2451 | 2446 | ||
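list_slab_objects() used to build a bitmap sized by page->objects on the stack; the hunk above moves it to a kzalloc'd array of BITS_TO_LONGS(page->objects) longs so the stack footprint no longer depends on a runtime value. A standalone illustration of that sizing, with calloc() standing in for kzalloc():

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#define BITS_PER_LONG    (CHAR_BIT * sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

int main(void)
{
    unsigned int objects = 300;   /* runtime value, like page->objects */
    unsigned long *map;

    /* heap allocation instead of a variable-length array on the stack */
    map = calloc(BITS_TO_LONGS(objects), sizeof(*map));
    if (!map)
        return 1;

    map[5 / BITS_PER_LONG] |= 1UL << (5 % BITS_PER_LONG); /* mark object 5 */
    printf("%zu longs cover %u objects\n",
           (size_t)BITS_TO_LONGS(objects), objects);

    free(map);
    return 0;
}

The later process_slab()/list_locations() hunks apply the same idea but allocate the scratch bitmap once, before the loop over slabs, and pass it down rather than allocating per slab.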
@@ -3338,8 +3333,15 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, | |||
3338 | struct kmem_cache *s; | 3333 | struct kmem_cache *s; |
3339 | void *ret; | 3334 | void *ret; |
3340 | 3335 | ||
3341 | if (unlikely(size > SLUB_MAX_SIZE)) | 3336 | if (unlikely(size > SLUB_MAX_SIZE)) { |
3342 | return kmalloc_large_node(size, gfpflags, node); | 3337 | ret = kmalloc_large_node(size, gfpflags, node); |
3338 | |||
3339 | trace_kmalloc_node(caller, ret, | ||
3340 | size, PAGE_SIZE << get_order(size), | ||
3341 | gfpflags, node); | ||
3342 | |||
3343 | return ret; | ||
3344 | } | ||
3343 | 3345 | ||
3344 | s = get_slab(size, gfpflags); | 3346 | s = get_slab(size, gfpflags); |
3345 | 3347 | ||
@@ -3651,10 +3653,10 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, | |||
3651 | } | 3653 | } |
3652 | 3654 | ||
3653 | static void process_slab(struct loc_track *t, struct kmem_cache *s, | 3655 | static void process_slab(struct loc_track *t, struct kmem_cache *s, |
3654 | struct page *page, enum track_item alloc) | 3656 | struct page *page, enum track_item alloc, |
3657 | long *map) | ||
3655 | { | 3658 | { |
3656 | void *addr = page_address(page); | 3659 | void *addr = page_address(page); |
3657 | DECLARE_BITMAP(map, page->objects); | ||
3658 | void *p; | 3660 | void *p; |
3659 | 3661 | ||
3660 | bitmap_zero(map, page->objects); | 3662 | bitmap_zero(map, page->objects); |
@@ -3673,11 +3675,14 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
3673 | unsigned long i; | 3675 | unsigned long i; |
3674 | struct loc_track t = { 0, 0, NULL }; | 3676 | struct loc_track t = { 0, 0, NULL }; |
3675 | int node; | 3677 | int node; |
3678 | unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * | ||
3679 | sizeof(unsigned long), GFP_KERNEL); | ||
3676 | 3680 | ||
3677 | if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), | 3681 | if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), |
3678 | GFP_TEMPORARY)) | 3682 | GFP_TEMPORARY)) { |
3683 | kfree(map); | ||
3679 | return sprintf(buf, "Out of memory\n"); | 3684 | return sprintf(buf, "Out of memory\n"); |
3680 | 3685 | } | |
3681 | /* Push back cpu slabs */ | 3686 | /* Push back cpu slabs */ |
3682 | flush_all(s); | 3687 | flush_all(s); |
3683 | 3688 | ||
@@ -3691,9 +3696,9 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
3691 | 3696 | ||
3692 | spin_lock_irqsave(&n->list_lock, flags); | 3697 | spin_lock_irqsave(&n->list_lock, flags); |
3693 | list_for_each_entry(page, &n->partial, lru) | 3698 | list_for_each_entry(page, &n->partial, lru) |
3694 | process_slab(&t, s, page, alloc); | 3699 | process_slab(&t, s, page, alloc, map); |
3695 | list_for_each_entry(page, &n->full, lru) | 3700 | list_for_each_entry(page, &n->full, lru) |
3696 | process_slab(&t, s, page, alloc); | 3701 | process_slab(&t, s, page, alloc, map); |
3697 | spin_unlock_irqrestore(&n->list_lock, flags); | 3702 | spin_unlock_irqrestore(&n->list_lock, flags); |
3698 | } | 3703 | } |
3699 | 3704 | ||
@@ -3744,6 +3749,7 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
3744 | } | 3749 | } |
3745 | 3750 | ||
3746 | free_loc_track(&t); | 3751 | free_loc_track(&t); |
3752 | kfree(map); | ||
3747 | if (!t.count) | 3753 | if (!t.count) |
3748 | len += sprintf(buf, "No data\n"); | 3754 | len += sprintf(buf, "No data\n"); |
3749 | return len; | 3755 | return len; |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 6cd0a8f90dc..03aa2d55f1a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -139,7 +139,8 @@ static int discard_swap(struct swap_info_struct *si) | |||
139 | nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); | 139 | nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); |
140 | if (nr_blocks) { | 140 | if (nr_blocks) { |
141 | err = blkdev_issue_discard(si->bdev, start_block, | 141 | err = blkdev_issue_discard(si->bdev, start_block, |
142 | nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); | 142 | nr_blocks, GFP_KERNEL, |
143 | BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); | ||
143 | if (err) | 144 | if (err) |
144 | return err; | 145 | return err; |
145 | cond_resched(); | 146 | cond_resched(); |
@@ -150,7 +151,8 @@ static int discard_swap(struct swap_info_struct *si) | |||
150 | nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); | 151 | nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); |
151 | 152 | ||
152 | err = blkdev_issue_discard(si->bdev, start_block, | 153 | err = blkdev_issue_discard(si->bdev, start_block, |
153 | nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER); | 154 | nr_blocks, GFP_KERNEL, |
155 | BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); | ||
154 | if (err) | 156 | if (err) |
155 | break; | 157 | break; |
156 | 158 | ||
@@ -189,7 +191,8 @@ static void discard_swap_cluster(struct swap_info_struct *si, | |||
189 | start_block <<= PAGE_SHIFT - 9; | 191 | start_block <<= PAGE_SHIFT - 9; |
190 | nr_blocks <<= PAGE_SHIFT - 9; | 192 | nr_blocks <<= PAGE_SHIFT - 9; |
191 | if (blkdev_issue_discard(si->bdev, start_block, | 193 | if (blkdev_issue_discard(si->bdev, start_block, |
192 | nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER)) | 194 | nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT | |
195 | BLKDEV_IFL_BARRIER)) | ||
193 | break; | 196 | break; |
194 | } | 197 | } |
195 | 198 | ||
@@ -574,6 +577,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
574 | 577 | ||
575 | /* free if no reference */ | 578 | /* free if no reference */ |
576 | if (!usage) { | 579 | if (!usage) { |
580 | struct gendisk *disk = p->bdev->bd_disk; | ||
577 | if (offset < p->lowest_bit) | 581 | if (offset < p->lowest_bit) |
578 | p->lowest_bit = offset; | 582 | p->lowest_bit = offset; |
579 | if (offset > p->highest_bit) | 583 | if (offset > p->highest_bit) |
@@ -583,6 +587,9 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, | |||
583 | swap_list.next = p->type; | 587 | swap_list.next = p->type; |
584 | nr_swap_pages++; | 588 | nr_swap_pages++; |
585 | p->inuse_pages--; | 589 | p->inuse_pages--; |
590 | if ((p->flags & SWP_BLKDEV) && | ||
591 | disk->fops->swap_slot_free_notify) | ||
592 | disk->fops->swap_slot_free_notify(p->bdev, offset); | ||
586 | } | 593 | } |
587 | 594 | ||
588 | return usage; | 595 | return usage; |
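swap_entry_free() now tells the driver about a freed slot, but only when the swap area sits on a block device (the new SWP_BLKDEV flag) and the driver actually provides swap_slot_free_notify. The pattern is an optional hook in an ops table, checked for NULL before the call; a generic sketch with invented names follows.

#include <stdio.h>

struct dev_ops {
    /* optional hook: may be NULL if the driver doesn't care */
    void (*slot_free_notify)(int slot);
};

static void my_notify(int slot)
{
    printf("driver told slot %d is free\n", slot);
}

static const struct dev_ops with_hook    = { .slot_free_notify = my_notify };
static const struct dev_ops without_hook = { 0 };

static void free_slot(const struct dev_ops *ops, int slot)
{
    /* ... bookkeeping for the freed slot would go here ... */
    if (ops->slot_free_notify)
        ops->slot_free_notify(slot);
}

int main(void)
{
    free_slot(&with_hook, 42);
    free_slot(&without_hook, 43);
    return 0;
}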
@@ -1884,6 +1891,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1884 | if (error < 0) | 1891 | if (error < 0) |
1885 | goto bad_swap; | 1892 | goto bad_swap; |
1886 | p->bdev = bdev; | 1893 | p->bdev = bdev; |
1894 | p->flags |= SWP_BLKDEV; | ||
1887 | } else if (S_ISREG(inode->i_mode)) { | 1895 | } else if (S_ISREG(inode->i_mode)) { |
1888 | p->bdev = inode->i_sb->s_bdev; | 1896 | p->bdev = inode->i_sb->s_bdev; |
1889 | mutex_lock(&inode->i_mutex); | 1897 | mutex_lock(&inode->i_mutex); |