author    Takashi Iwai <tiwai@suse.de>    2010-11-03 10:51:26 -0400
committer Takashi Iwai <tiwai@suse.de>    2010-11-03 10:51:26 -0400
commit    69dbdd819599e2f3b77c172e83af512845bca5ad (patch)
tree      49939d8b80ec2115a801eae2aebc21f23867c876 /mm
parent    87232dd49aeb6b7d1af291edca8bd129a82ef4b5 (diff)
parent    75e3f3137cb570661c2ad3035a139dda671fbb63 (diff)
Merge branch 'fix/asoc' into for-linus
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |    8
-rw-r--r--  mm/Makefile           |    7
-rw-r--r--  mm/backing-dev.c      |   74
-rw-r--r--  mm/bootmem.c          |   13
-rw-r--r--  mm/dmapool.c          |    2
-rw-r--r--  mm/filemap.c          |   42
-rw-r--r--  mm/highmem.c          |   66
-rw-r--r--  mm/hugetlb.c          |  238
-rw-r--r--  mm/internal.h         |    2
-rw-r--r--  mm/maccess.c          |    2
-rw-r--r--  mm/memblock.c         |  837
-rw-r--r--  mm/memcontrol.c       |  406
-rw-r--r--  mm/memory-failure.c   |  176
-rw-r--r--  mm/memory.c           |   37
-rw-r--r--  mm/memory_hotplug.c   |   50
-rw-r--r--  mm/mempolicy.c        |   17
-rw-r--r--  mm/migrate.c          |  249
-rw-r--r--  mm/mmap.c             |    2
-rw-r--r--  mm/mremap.c           |    4
-rw-r--r--  mm/nommu.c            |   51
-rw-r--r--  mm/oom_kill.c         |   33
-rw-r--r--  mm/page-writeback.c   |   31
-rw-r--r--  mm/page_alloc.c       |  185
-rw-r--r--  mm/page_isolation.c   |    3
-rw-r--r--  mm/percpu-km.c        |    8
-rw-r--r--  mm/percpu.c           |  403
-rw-r--r--  mm/percpu_up.c        |   30
-rw-r--r--  mm/rmap.c             |   37
-rw-r--r--  mm/shmem.c            |   17
-rw-r--r--  mm/slab.c             |    2
-rw-r--r--  mm/slob.c             |    4
-rw-r--r--  mm/slub.c             |  788
-rw-r--r--  mm/sparse-vmemmap.c   |   11
-rw-r--r--  mm/swap.c             |    1
-rw-r--r--  mm/swapfile.c         |   55
-rw-r--r--  mm/util.c             |   13
-rw-r--r--  mm/vmalloc.c          |   67
-rw-r--r--  mm/vmscan.c           |  222
-rw-r--r--  mm/vmstat.c           |   44
39 files changed, 2779 insertions, 1458 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index f0fb9124e410..c2c8a4a11898 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -301,3 +301,11 @@ config NOMMU_INITIAL_TRIM_EXCESS
301 of 1 says that all excess pages should be trimmed. 301 of 1 says that all excess pages should be trimmed.
302 302
303 See Documentation/nommu-mmap.txt for more information. 303 See Documentation/nommu-mmap.txt for more information.
304
305#
306# UP and nommu archs use km based percpu allocator
307#
308config NEED_PER_CPU_KM
309 depends on !SMP
310 bool
311 default y
diff --git a/mm/Makefile b/mm/Makefile
index 34b2546a9e37..f73f75a29f82 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o \ 11 maccess.o page_alloc.o page-writeback.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o mm_init.o mmu_context.o \ 14 page_isolation.o mm_init.o mmu_context.o percpu.o \
15 $(mmu-y) 15 $(mmu-y)
16obj-y += init-mm.o 16obj-y += init-mm.o
17 17
@@ -36,11 +36,6 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
36obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 36obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
37obj-$(CONFIG_FS_XIP) += filemap_xip.o 37obj-$(CONFIG_FS_XIP) += filemap_xip.o
38obj-$(CONFIG_MIGRATION) += migrate.o 38obj-$(CONFIG_MIGRATION) += migrate.o
39ifdef CONFIG_SMP
40obj-y += percpu.o
41else
42obj-y += percpu_up.o
43endif
44obj-$(CONFIG_QUICKLIST) += quicklist.o 39obj-$(CONFIG_QUICKLIST) += quicklist.o
45obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 40obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
46obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 41obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 65d420499a61..027100d30227 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -74,11 +74,11 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
74 74
75 nr_wb = nr_dirty = nr_io = nr_more_io = 0; 75 nr_wb = nr_dirty = nr_io = nr_more_io = 0;
76 spin_lock(&inode_lock); 76 spin_lock(&inode_lock);
77 list_for_each_entry(inode, &wb->b_dirty, i_list) 77 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
78 nr_dirty++; 78 nr_dirty++;
79 list_for_each_entry(inode, &wb->b_io, i_list) 79 list_for_each_entry(inode, &wb->b_io, i_wb_list)
80 nr_io++; 80 nr_io++;
81 list_for_each_entry(inode, &wb->b_more_io, i_list) 81 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
82 nr_more_io++; 82 nr_more_io++;
83 spin_unlock(&inode_lock); 83 spin_unlock(&inode_lock);
84 84
@@ -362,7 +362,7 @@ static int bdi_forker_thread(void *ptr)
362{ 362{
363 struct bdi_writeback *me = ptr; 363 struct bdi_writeback *me = ptr;
364 364
365 current->flags |= PF_FLUSHER | PF_SWAPWRITE; 365 current->flags |= PF_SWAPWRITE;
366 set_freezable(); 366 set_freezable();
367 367
368 /* 368 /*
@@ -729,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = {
729 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), 729 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
730 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 730 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
731 }; 731 };
732static atomic_t nr_bdi_congested[2];
732 733
733void clear_bdi_congested(struct backing_dev_info *bdi, int sync) 734void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
734{ 735{
@@ -736,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
736 wait_queue_head_t *wqh = &congestion_wqh[sync]; 737 wait_queue_head_t *wqh = &congestion_wqh[sync];
737 738
738 bit = sync ? BDI_sync_congested : BDI_async_congested; 739 bit = sync ? BDI_sync_congested : BDI_async_congested;
739 clear_bit(bit, &bdi->state); 740 if (test_and_clear_bit(bit, &bdi->state))
741 atomic_dec(&nr_bdi_congested[sync]);
740 smp_mb__after_clear_bit(); 742 smp_mb__after_clear_bit();
741 if (waitqueue_active(wqh)) 743 if (waitqueue_active(wqh))
742 wake_up(wqh); 744 wake_up(wqh);
@@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
748 enum bdi_state bit; 750 enum bdi_state bit;
749 751
750 bit = sync ? BDI_sync_congested : BDI_async_congested; 752 bit = sync ? BDI_sync_congested : BDI_async_congested;
751 set_bit(bit, &bdi->state); 753 if (!test_and_set_bit(bit, &bdi->state))
754 atomic_inc(&nr_bdi_congested[sync]);
752} 755}
753EXPORT_SYMBOL(set_bdi_congested); 756EXPORT_SYMBOL(set_bdi_congested);
754 757
@@ -764,13 +767,72 @@ EXPORT_SYMBOL(set_bdi_congested);
764long congestion_wait(int sync, long timeout) 767long congestion_wait(int sync, long timeout)
765{ 768{
766 long ret; 769 long ret;
770 unsigned long start = jiffies;
767 DEFINE_WAIT(wait); 771 DEFINE_WAIT(wait);
768 wait_queue_head_t *wqh = &congestion_wqh[sync]; 772 wait_queue_head_t *wqh = &congestion_wqh[sync];
769 773
770 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 774 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
771 ret = io_schedule_timeout(timeout); 775 ret = io_schedule_timeout(timeout);
772 finish_wait(wqh, &wait); 776 finish_wait(wqh, &wait);
777
778 trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
779 jiffies_to_usecs(jiffies - start));
780
773 return ret; 781 return ret;
774} 782}
775EXPORT_SYMBOL(congestion_wait); 783EXPORT_SYMBOL(congestion_wait);
776 784
785/**
786 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
787 * @zone: A zone to check if it is heavily congested
788 * @sync: SYNC or ASYNC IO
789 * @timeout: timeout in jiffies
790 *
791 * In the event of a congested backing_dev (any backing_dev) and the given
792 * @zone has experienced recent congestion, this waits for up to @timeout
793 * jiffies for either a BDI to exit congestion of the given @sync queue
794 * or a write to complete.
795 *
796 * In the absense of zone congestion, cond_resched() is called to yield
797 * the processor if necessary but otherwise does not sleep.
798 *
799 * The return value is 0 if the sleep is for the full timeout. Otherwise,
800 * it is the number of jiffies that were still remaining when the function
801 * returned. return_value == timeout implies the function did not sleep.
802 */
803long wait_iff_congested(struct zone *zone, int sync, long timeout)
804{
805 long ret;
806 unsigned long start = jiffies;
807 DEFINE_WAIT(wait);
808 wait_queue_head_t *wqh = &congestion_wqh[sync];
809
810 /*
811 * If there is no congestion, or heavy congestion is not being
812 * encountered in the current zone, yield if necessary instead
813 * of sleeping on the congestion queue
814 */
815 if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
816 !zone_is_reclaim_congested(zone)) {
817 cond_resched();
818
819 /* In case we scheduled, work out time remaining */
820 ret = timeout - (jiffies - start);
821 if (ret < 0)
822 ret = 0;
823
824 goto out;
825 }
826
827 /* Sleep until uncongested or a write happens */
828 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
829 ret = io_schedule_timeout(timeout);
830 finish_wait(wqh, &wait);
831
832out:
833 trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
834 jiffies_to_usecs(jiffies - start));
835
836 return ret;
837}
838EXPORT_SYMBOL(wait_iff_congested);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 142c84a54993..13b0caa9793c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -15,6 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/kmemleak.h> 16#include <linux/kmemleak.h>
17#include <linux/range.h> 17#include <linux/range.h>
18#include <linux/memblock.h>
18 19
19#include <asm/bug.h> 20#include <asm/bug.h>
20#include <asm/io.h> 21#include <asm/io.h>
@@ -434,7 +435,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
434 unsigned long size) 435 unsigned long size)
435{ 436{
436#ifdef CONFIG_NO_BOOTMEM 437#ifdef CONFIG_NO_BOOTMEM
437 free_early(physaddr, physaddr + size); 438 kmemleak_free_part(__va(physaddr), size);
439 memblock_x86_free_range(physaddr, physaddr + size);
438#else 440#else
439 unsigned long start, end; 441 unsigned long start, end;
440 442
@@ -459,7 +461,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
459void __init free_bootmem(unsigned long addr, unsigned long size) 461void __init free_bootmem(unsigned long addr, unsigned long size)
460{ 462{
461#ifdef CONFIG_NO_BOOTMEM 463#ifdef CONFIG_NO_BOOTMEM
462 free_early(addr, addr + size); 464 kmemleak_free_part(__va(addr), size);
465 memblock_x86_free_range(addr, addr + size);
463#else 466#else
464 unsigned long start, end; 467 unsigned long start, end;
465 468
@@ -526,6 +529,12 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
526} 529}
527 530
528#ifndef CONFIG_NO_BOOTMEM 531#ifndef CONFIG_NO_BOOTMEM
532int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
533 int flags)
534{
535 return reserve_bootmem(phys, len, flags);
536}
537
529static unsigned long __init align_idx(struct bootmem_data *bdata, 538static unsigned long __init align_idx(struct bootmem_data *bdata,
530 unsigned long idx, unsigned long step) 539 unsigned long idx, unsigned long step)
531{ 540{
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 3df063706f53..4df2de77e069 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -311,6 +311,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
311 size_t offset; 311 size_t offset;
312 void *retval; 312 void *retval;
313 313
314 might_sleep_if(mem_flags & __GFP_WAIT);
315
314 spin_lock_irqsave(&pool->lock, flags); 316 spin_lock_irqsave(&pool->lock, flags);
315 restart: 317 restart:
316 list_for_each_entry(page, &pool->page_list, page_list) { 318 list_for_each_entry(page, &pool->page_list, page_list) {
diff --git a/mm/filemap.c b/mm/filemap.c
index 3d4df44e4221..75572b5f2374 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -612,6 +612,19 @@ void __lock_page_nosync(struct page *page)
612 TASK_UNINTERRUPTIBLE); 612 TASK_UNINTERRUPTIBLE);
613} 613}
614 614
615int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
616 unsigned int flags)
617{
618 if (!(flags & FAULT_FLAG_ALLOW_RETRY)) {
619 __lock_page(page);
620 return 1;
621 } else {
622 up_read(&mm->mmap_sem);
623 wait_on_page_locked(page);
624 return 0;
625 }
626}
627
615/** 628/**
616 * find_get_page - find and get a page reference 629 * find_get_page - find and get a page reference
617 * @mapping: the address_space to search 630 * @mapping: the address_space to search
@@ -1539,25 +1552,28 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1539 * waiting for the lock. 1552 * waiting for the lock.
1540 */ 1553 */
1541 do_async_mmap_readahead(vma, ra, file, page, offset); 1554 do_async_mmap_readahead(vma, ra, file, page, offset);
1542 lock_page(page);
1543
1544 /* Did it get truncated? */
1545 if (unlikely(page->mapping != mapping)) {
1546 unlock_page(page);
1547 put_page(page);
1548 goto no_cached_page;
1549 }
1550 } else { 1555 } else {
1551 /* No page in the page cache at all */ 1556 /* No page in the page cache at all */
1552 do_sync_mmap_readahead(vma, ra, file, offset); 1557 do_sync_mmap_readahead(vma, ra, file, offset);
1553 count_vm_event(PGMAJFAULT); 1558 count_vm_event(PGMAJFAULT);
1554 ret = VM_FAULT_MAJOR; 1559 ret = VM_FAULT_MAJOR;
1555retry_find: 1560retry_find:
1556 page = find_lock_page(mapping, offset); 1561 page = find_get_page(mapping, offset);
1557 if (!page) 1562 if (!page)
1558 goto no_cached_page; 1563 goto no_cached_page;
1559 } 1564 }
1560 1565
1566 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
1567 return ret | VM_FAULT_RETRY;
1568
1569 /* Did it get truncated? */
1570 if (unlikely(page->mapping != mapping)) {
1571 unlock_page(page);
1572 put_page(page);
1573 goto retry_find;
1574 }
1575 VM_BUG_ON(page->index != offset);
1576
1561 /* 1577 /*
1562 * We have a locked page in the page cache, now we need to check 1578 * We have a locked page in the page cache, now we need to check
1563 * that it's up-to-date. If not, it is going to be due to an error. 1579 * that it's up-to-date. If not, it is going to be due to an error.
@@ -2177,12 +2193,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2177 } 2193 }
2178 2194
2179 if (written > 0) { 2195 if (written > 0) {
2180 loff_t end = pos + written; 2196 pos += written;
2181 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2197 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
2182 i_size_write(inode, end); 2198 i_size_write(inode, pos);
2183 mark_inode_dirty(inode); 2199 mark_inode_dirty(inode);
2184 } 2200 }
2185 *ppos = end; 2201 *ppos = pos;
2186 } 2202 }
2187out: 2203out:
2188 return written; 2204 return written;
diff --git a/mm/highmem.c b/mm/highmem.c
index 7a0aa1be4993..693394daa2ed 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -29,6 +29,11 @@
29#include <linux/kgdb.h> 29#include <linux/kgdb.h>
30#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
31 31
32
33#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
34DEFINE_PER_CPU(int, __kmap_atomic_idx);
35#endif
36
32/* 37/*
33 * Virtual_count is not a pure "count". 38 * Virtual_count is not a pure "count".
34 * 0 means that it is not mapped, and has not been mapped 39 * 0 means that it is not mapped, and has not been mapped
@@ -42,6 +47,9 @@
42unsigned long totalhigh_pages __read_mostly; 47unsigned long totalhigh_pages __read_mostly;
43EXPORT_SYMBOL(totalhigh_pages); 48EXPORT_SYMBOL(totalhigh_pages);
44 49
50
51EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
52
45unsigned int nr_free_highpages (void) 53unsigned int nr_free_highpages (void)
46{ 54{
47 pg_data_t *pgdat; 55 pg_data_t *pgdat;
@@ -422,61 +430,3 @@ void __init page_address_init(void)
422} 430}
423 431
424#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ 432#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
425
426#ifdef CONFIG_DEBUG_HIGHMEM
427
428void debug_kmap_atomic(enum km_type type)
429{
430 static int warn_count = 10;
431
432 if (unlikely(warn_count < 0))
433 return;
434
435 if (unlikely(in_interrupt())) {
436 if (in_nmi()) {
437 if (type != KM_NMI && type != KM_NMI_PTE) {
438 WARN_ON(1);
439 warn_count--;
440 }
441 } else if (in_irq()) {
442 if (type != KM_IRQ0 && type != KM_IRQ1 &&
443 type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
444 type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
445 WARN_ON(1);
446 warn_count--;
447 }
448 } else if (!irqs_disabled()) { /* softirq */
449 if (type != KM_IRQ0 && type != KM_IRQ1 &&
450 type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
451 type != KM_SKB_SUNRPC_DATA &&
452 type != KM_SKB_DATA_SOFTIRQ &&
453 type != KM_BOUNCE_READ) {
454 WARN_ON(1);
455 warn_count--;
456 }
457 }
458 }
459
460 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
461 type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
462 type == KM_IRQ_PTE || type == KM_NMI ||
463 type == KM_NMI_PTE ) {
464 if (!irqs_disabled()) {
465 WARN_ON(1);
466 warn_count--;
467 }
468 } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
469 if (irq_count() == 0 && !irqs_disabled()) {
470 WARN_ON(1);
471 warn_count--;
472 }
473 }
474#ifdef CONFIG_KGDB_KDB
475 if (unlikely(type == KM_KDB && atomic_read(&kgdb_active) == -1)) {
476 WARN_ON(1);
477 warn_count--;
478 }
479#endif /* CONFIG_KGDB_KDB */
480}
481
482#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c03273807182..c4a3558589ab 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page,
423 } 423 }
424} 424}
425 425
426static void copy_gigantic_page(struct page *dst, struct page *src, 426static void copy_user_gigantic_page(struct page *dst, struct page *src,
427 unsigned long addr, struct vm_area_struct *vma) 427 unsigned long addr, struct vm_area_struct *vma)
428{ 428{
429 int i; 429 int i;
430 struct hstate *h = hstate_vma(vma); 430 struct hstate *h = hstate_vma(vma);
431 struct page *dst_base = dst; 431 struct page *dst_base = dst;
432 struct page *src_base = src; 432 struct page *src_base = src;
433 might_sleep(); 433
434 for (i = 0; i < pages_per_huge_page(h); ) { 434 for (i = 0; i < pages_per_huge_page(h); ) {
435 cond_resched(); 435 cond_resched();
436 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); 436 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
@@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src,
440 src = mem_map_next(src, src_base, i); 440 src = mem_map_next(src, src_base, i);
441 } 441 }
442} 442}
443static void copy_huge_page(struct page *dst, struct page *src, 443
444static void copy_user_huge_page(struct page *dst, struct page *src,
444 unsigned long addr, struct vm_area_struct *vma) 445 unsigned long addr, struct vm_area_struct *vma)
445{ 446{
446 int i; 447 int i;
447 struct hstate *h = hstate_vma(vma); 448 struct hstate *h = hstate_vma(vma);
448 449
449 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { 450 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
450 copy_gigantic_page(dst, src, addr, vma); 451 copy_user_gigantic_page(dst, src, addr, vma);
451 return; 452 return;
452 } 453 }
453 454
@@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src,
458 } 459 }
459} 460}
460 461
462static void copy_gigantic_page(struct page *dst, struct page *src)
463{
464 int i;
465 struct hstate *h = page_hstate(src);
466 struct page *dst_base = dst;
467 struct page *src_base = src;
468
469 for (i = 0; i < pages_per_huge_page(h); ) {
470 cond_resched();
471 copy_highpage(dst, src);
472
473 i++;
474 dst = mem_map_next(dst, dst_base, i);
475 src = mem_map_next(src, src_base, i);
476 }
477}
478
479void copy_huge_page(struct page *dst, struct page *src)
480{
481 int i;
482 struct hstate *h = page_hstate(src);
483
484 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
485 copy_gigantic_page(dst, src);
486 return;
487 }
488
489 might_sleep();
490 for (i = 0; i < pages_per_huge_page(h); i++) {
491 cond_resched();
492 copy_highpage(dst + i, src + i);
493 }
494}
495
461static void enqueue_huge_page(struct hstate *h, struct page *page) 496static void enqueue_huge_page(struct hstate *h, struct page *page)
462{ 497{
463 int nid = page_to_nid(page); 498 int nid = page_to_nid(page);
@@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
466 h->free_huge_pages_node[nid]++; 501 h->free_huge_pages_node[nid]++;
467} 502}
468 503
504static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
505{
506 struct page *page;
507
508 if (list_empty(&h->hugepage_freelists[nid]))
509 return NULL;
510 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
511 list_del(&page->lru);
512 set_page_refcounted(page);
513 h->free_huge_pages--;
514 h->free_huge_pages_node[nid]--;
515 return page;
516}
517
469static struct page *dequeue_huge_page_vma(struct hstate *h, 518static struct page *dequeue_huge_page_vma(struct hstate *h,
470 struct vm_area_struct *vma, 519 struct vm_area_struct *vma,
471 unsigned long address, int avoid_reserve) 520 unsigned long address, int avoid_reserve)
472{ 521{
473 int nid;
474 struct page *page = NULL; 522 struct page *page = NULL;
475 struct mempolicy *mpol; 523 struct mempolicy *mpol;
476 nodemask_t *nodemask; 524 nodemask_t *nodemask;
@@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
496 544
497 for_each_zone_zonelist_nodemask(zone, z, zonelist, 545 for_each_zone_zonelist_nodemask(zone, z, zonelist,
498 MAX_NR_ZONES - 1, nodemask) { 546 MAX_NR_ZONES - 1, nodemask) {
499 nid = zone_to_nid(zone); 547 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
500 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && 548 page = dequeue_huge_page_node(h, zone_to_nid(zone));
501 !list_empty(&h->hugepage_freelists[nid])) { 549 if (page) {
502 page = list_entry(h->hugepage_freelists[nid].next, 550 if (!avoid_reserve)
503 struct page, lru); 551 decrement_hugepage_resv_vma(h, vma);
504 list_del(&page->lru); 552 break;
505 h->free_huge_pages--; 553 }
506 h->free_huge_pages_node[nid]--;
507
508 if (!avoid_reserve)
509 decrement_hugepage_resv_vma(h, vma);
510
511 break;
512 } 554 }
513 } 555 }
514err: 556err:
@@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
770 return ret; 812 return ret;
771} 813}
772 814
773static struct page *alloc_buddy_huge_page(struct hstate *h, 815static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
774 struct vm_area_struct *vma, unsigned long address)
775{ 816{
776 struct page *page; 817 struct page *page;
777 unsigned int nid; 818 unsigned int r_nid;
778 819
779 if (h->order >= MAX_ORDER) 820 if (h->order >= MAX_ORDER)
780 return NULL; 821 return NULL;
@@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
812 } 853 }
813 spin_unlock(&hugetlb_lock); 854 spin_unlock(&hugetlb_lock);
814 855
815 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 856 if (nid == NUMA_NO_NODE)
816 __GFP_REPEAT|__GFP_NOWARN, 857 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
817 huge_page_order(h)); 858 __GFP_REPEAT|__GFP_NOWARN,
859 huge_page_order(h));
860 else
861 page = alloc_pages_exact_node(nid,
862 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
863 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
818 864
819 if (page && arch_prepare_hugepage(page)) { 865 if (page && arch_prepare_hugepage(page)) {
820 __free_pages(page, huge_page_order(h)); 866 __free_pages(page, huge_page_order(h));
@@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
823 869
824 spin_lock(&hugetlb_lock); 870 spin_lock(&hugetlb_lock);
825 if (page) { 871 if (page) {
826 /* 872 r_nid = page_to_nid(page);
827 * This page is now managed by the hugetlb allocator and has
828 * no users -- drop the buddy allocator's reference.
829 */
830 put_page_testzero(page);
831 VM_BUG_ON(page_count(page));
832 nid = page_to_nid(page);
833 set_compound_page_dtor(page, free_huge_page); 873 set_compound_page_dtor(page, free_huge_page);
834 /* 874 /*
835 * We incremented the global counters already 875 * We incremented the global counters already
836 */ 876 */
837 h->nr_huge_pages_node[nid]++; 877 h->nr_huge_pages_node[r_nid]++;
838 h->surplus_huge_pages_node[nid]++; 878 h->surplus_huge_pages_node[r_nid]++;
839 __count_vm_event(HTLB_BUDDY_PGALLOC); 879 __count_vm_event(HTLB_BUDDY_PGALLOC);
840 } else { 880 } else {
841 h->nr_huge_pages--; 881 h->nr_huge_pages--;
@@ -848,6 +888,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
848} 888}
849 889
850/* 890/*
891 * This allocation function is useful in the context where vma is irrelevant.
892 * E.g. soft-offlining uses this function because it only cares physical
893 * address of error page.
894 */
895struct page *alloc_huge_page_node(struct hstate *h, int nid)
896{
897 struct page *page;
898
899 spin_lock(&hugetlb_lock);
900 page = dequeue_huge_page_node(h, nid);
901 spin_unlock(&hugetlb_lock);
902
903 if (!page)
904 page = alloc_buddy_huge_page(h, nid);
905
906 return page;
907}
908
909/*
851 * Increase the hugetlb pool such that it can accomodate a reservation 910 * Increase the hugetlb pool such that it can accomodate a reservation
852 * of size 'delta'. 911 * of size 'delta'.
853 */ 912 */
@@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
871retry: 930retry:
872 spin_unlock(&hugetlb_lock); 931 spin_unlock(&hugetlb_lock);
873 for (i = 0; i < needed; i++) { 932 for (i = 0; i < needed; i++) {
874 page = alloc_buddy_huge_page(h, NULL, 0); 933 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
875 if (!page) { 934 if (!page)
876 /* 935 /*
877 * We were not able to allocate enough pages to 936 * We were not able to allocate enough pages to
878 * satisfy the entire reservation so we free what 937 * satisfy the entire reservation so we free what
879 * we've allocated so far. 938 * we've allocated so far.
880 */ 939 */
881 spin_lock(&hugetlb_lock);
882 needed = 0;
883 goto free; 940 goto free;
884 }
885 941
886 list_add(&page->lru, &surplus_list); 942 list_add(&page->lru, &surplus_list);
887 } 943 }
@@ -908,31 +964,31 @@ retry:
908 needed += allocated; 964 needed += allocated;
909 h->resv_huge_pages += delta; 965 h->resv_huge_pages += delta;
910 ret = 0; 966 ret = 0;
911free: 967
968 spin_unlock(&hugetlb_lock);
912 /* Free the needed pages to the hugetlb pool */ 969 /* Free the needed pages to the hugetlb pool */
913 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 970 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
914 if ((--needed) < 0) 971 if ((--needed) < 0)
915 break; 972 break;
916 list_del(&page->lru); 973 list_del(&page->lru);
974 /*
975 * This page is now managed by the hugetlb allocator and has
976 * no users -- drop the buddy allocator's reference.
977 */
978 put_page_testzero(page);
979 VM_BUG_ON(page_count(page));
917 enqueue_huge_page(h, page); 980 enqueue_huge_page(h, page);
918 } 981 }
919 982
920 /* Free unnecessary surplus pages to the buddy allocator */ 983 /* Free unnecessary surplus pages to the buddy allocator */
984free:
921 if (!list_empty(&surplus_list)) { 985 if (!list_empty(&surplus_list)) {
922 spin_unlock(&hugetlb_lock);
923 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 986 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
924 list_del(&page->lru); 987 list_del(&page->lru);
925 /* 988 put_page(page);
926 * The page has a reference count of zero already, so
927 * call free_huge_page directly instead of using
928 * put_page. This must be done with hugetlb_lock
929 * unlocked which is safe because free_huge_page takes
930 * hugetlb_lock before deciding how to free the page.
931 */
932 free_huge_page(page);
933 } 989 }
934 spin_lock(&hugetlb_lock);
935 } 990 }
991 spin_lock(&hugetlb_lock);
936 992
937 return ret; 993 return ret;
938} 994}
@@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1052 spin_unlock(&hugetlb_lock); 1108 spin_unlock(&hugetlb_lock);
1053 1109
1054 if (!page) { 1110 if (!page) {
1055 page = alloc_buddy_huge_page(h, vma, addr); 1111 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1056 if (!page) { 1112 if (!page) {
1057 hugetlb_put_quota(inode->i_mapping, chg); 1113 hugetlb_put_quota(inode->i_mapping, chg);
1058 return ERR_PTR(-VM_FAULT_SIGBUS); 1114 return ERR_PTR(-VM_FAULT_SIGBUS);
1059 } 1115 }
1060 } 1116 }
1061 1117
1062 set_page_refcounted(page);
1063 set_page_private(page, (unsigned long) mapping); 1118 set_page_private(page, (unsigned long) mapping);
1064 1119
1065 vma_commit_reservation(h, vma, addr); 1120 vma_commit_reservation(h, vma, addr);
@@ -2153,6 +2208,19 @@ nomem:
2153 return -ENOMEM; 2208 return -ENOMEM;
2154} 2209}
2155 2210
2211static int is_hugetlb_entry_migration(pte_t pte)
2212{
2213 swp_entry_t swp;
2214
2215 if (huge_pte_none(pte) || pte_present(pte))
2216 return 0;
2217 swp = pte_to_swp_entry(pte);
2218 if (non_swap_entry(swp) && is_migration_entry(swp)) {
2219 return 1;
2220 } else
2221 return 0;
2222}
2223
2156static int is_hugetlb_entry_hwpoisoned(pte_t pte) 2224static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2157{ 2225{
2158 swp_entry_t swp; 2226 swp_entry_t swp;
@@ -2380,10 +2448,13 @@ retry_avoidcopy:
2380 * When the original hugepage is shared one, it does not have 2448 * When the original hugepage is shared one, it does not have
2381 * anon_vma prepared. 2449 * anon_vma prepared.
2382 */ 2450 */
2383 if (unlikely(anon_vma_prepare(vma))) 2451 if (unlikely(anon_vma_prepare(vma))) {
2452 /* Caller expects lock to be held */
2453 spin_lock(&mm->page_table_lock);
2384 return VM_FAULT_OOM; 2454 return VM_FAULT_OOM;
2455 }
2385 2456
2386 copy_huge_page(new_page, old_page, address, vma); 2457 copy_user_huge_page(new_page, old_page, address, vma);
2387 __SetPageUptodate(new_page); 2458 __SetPageUptodate(new_page);
2388 2459
2389 /* 2460 /*
@@ -2515,22 +2586,20 @@ retry:
2515 hugepage_add_new_anon_rmap(page, vma, address); 2586 hugepage_add_new_anon_rmap(page, vma, address);
2516 } 2587 }
2517 } else { 2588 } else {
2589 /*
2590 * If memory error occurs between mmap() and fault, some process
2591 * don't have hwpoisoned swap entry for errored virtual address.
2592 * So we need to block hugepage fault by PG_hwpoison bit check.
2593 */
2594 if (unlikely(PageHWPoison(page))) {
2595 ret = VM_FAULT_HWPOISON |
2596 VM_FAULT_SET_HINDEX(h - hstates);
2597 goto backout_unlocked;
2598 }
2518 page_dup_rmap(page); 2599 page_dup_rmap(page);
2519 } 2600 }
2520 2601
2521 /* 2602 /*
2522 * Since memory error handler replaces pte into hwpoison swap entry
2523 * at the time of error handling, a process which reserved but not have
2524 * the mapping to the error hugepage does not have hwpoison swap entry.
2525 * So we need to block accesses from such a process by checking
2526 * PG_hwpoison bit here.
2527 */
2528 if (unlikely(PageHWPoison(page))) {
2529 ret = VM_FAULT_HWPOISON;
2530 goto backout_unlocked;
2531 }
2532
2533 /*
2534 * If we are going to COW a private mapping later, we examine the 2603 * If we are going to COW a private mapping later, we examine the
2535 * pending reservations for this page now. This will ensure that 2604 * pending reservations for this page now. This will ensure that
2536 * any allocations necessary to record that reservation occur outside 2605 * any allocations necessary to record that reservation occur outside
@@ -2587,8 +2656,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2587 ptep = huge_pte_offset(mm, address); 2656 ptep = huge_pte_offset(mm, address);
2588 if (ptep) { 2657 if (ptep) {
2589 entry = huge_ptep_get(ptep); 2658 entry = huge_ptep_get(ptep);
2590 if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2659 if (unlikely(is_hugetlb_entry_migration(entry))) {
2591 return VM_FAULT_HWPOISON; 2660 migration_entry_wait(mm, (pmd_t *)ptep, address);
2661 return 0;
2662 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2663 return VM_FAULT_HWPOISON_LARGE |
2664 VM_FAULT_SET_HINDEX(h - hstates);
2592 } 2665 }
2593 2666
2594 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 2667 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2878,18 +2951,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2878 hugetlb_acct_memory(h, -(chg - freed)); 2951 hugetlb_acct_memory(h, -(chg - freed));
2879} 2952}
2880 2953
2954#ifdef CONFIG_MEMORY_FAILURE
2955
2956/* Should be called in hugetlb_lock */
2957static int is_hugepage_on_freelist(struct page *hpage)
2958{
2959 struct page *page;
2960 struct page *tmp;
2961 struct hstate *h = page_hstate(hpage);
2962 int nid = page_to_nid(hpage);
2963
2964 list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
2965 if (page == hpage)
2966 return 1;
2967 return 0;
2968}
2969
2881/* 2970/*
2882 * This function is called from memory failure code. 2971 * This function is called from memory failure code.
2883 * Assume the caller holds page lock of the head page. 2972 * Assume the caller holds page lock of the head page.
2884 */ 2973 */
2885void __isolate_hwpoisoned_huge_page(struct page *hpage) 2974int dequeue_hwpoisoned_huge_page(struct page *hpage)
2886{ 2975{
2887 struct hstate *h = page_hstate(hpage); 2976 struct hstate *h = page_hstate(hpage);
2888 int nid = page_to_nid(hpage); 2977 int nid = page_to_nid(hpage);
2978 int ret = -EBUSY;
2889 2979
2890 spin_lock(&hugetlb_lock); 2980 spin_lock(&hugetlb_lock);
2891 list_del(&hpage->lru); 2981 if (is_hugepage_on_freelist(hpage)) {
2892 h->free_huge_pages--; 2982 list_del(&hpage->lru);
2893 h->free_huge_pages_node[nid]--; 2983 set_page_refcounted(hpage);
2984 h->free_huge_pages--;
2985 h->free_huge_pages_node[nid]--;
2986 ret = 0;
2987 }
2894 spin_unlock(&hugetlb_lock); 2988 spin_unlock(&hugetlb_lock);
2989 return ret;
2895} 2990}
2991#endif
diff --git a/mm/internal.h b/mm/internal.h
index 6a697bb97fc5..dedb0aff673f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -62,7 +62,7 @@ extern bool is_free_buddy_page(struct page *page);
62 */ 62 */
63static inline unsigned long page_order(struct page *page) 63static inline unsigned long page_order(struct page *page)
64{ 64{
65 VM_BUG_ON(!PageBuddy(page)); 65 /* PageBuddy() must be checked by the caller */
66 return page_private(page); 66 return page_private(page);
67} 67}
68 68
diff --git a/mm/maccess.c b/mm/maccess.c
index 4e348dbaecd7..e2b6f5634e0d 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,9 +1,9 @@
1/* 1/*
2 * Access kernel memory without faulting. 2 * Access kernel memory without faulting.
3 */ 3 */
4#include <linux/uaccess.h>
5#include <linux/module.h> 4#include <linux/module.h>
6#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/uaccess.h>
7 7
8/** 8/**
9 * probe_kernel_read(): safely attempt to read from a location 9 * probe_kernel_read(): safely attempt to read from a location
diff --git a/mm/memblock.c b/mm/memblock.c
index 43840b305ecb..400dc62697d7 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -11,237 +11,423 @@
11 */ 11 */
12 12
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/slab.h>
14#include <linux/init.h> 15#include <linux/init.h>
15#include <linux/bitops.h> 16#include <linux/bitops.h>
17#include <linux/poison.h>
18#include <linux/pfn.h>
19#include <linux/debugfs.h>
20#include <linux/seq_file.h>
16#include <linux/memblock.h> 21#include <linux/memblock.h>
17 22
18#define MEMBLOCK_ALLOC_ANYWHERE 0 23struct memblock memblock __initdata_memblock;
19 24
20struct memblock memblock; 25int memblock_debug __initdata_memblock;
26int memblock_can_resize __initdata_memblock;
27static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
28static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
21 29
22static int memblock_debug; 30/* inline so we don't get a warning when pr_debug is compiled out */
31static inline const char *memblock_type_name(struct memblock_type *type)
32{
33 if (type == &memblock.memory)
34 return "memory";
35 else if (type == &memblock.reserved)
36 return "reserved";
37 else
38 return "unknown";
39}
23 40
24static int __init early_memblock(char *p) 41/*
42 * Address comparison utilities
43 */
44
45static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size)
25{ 46{
26 if (p && strstr(p, "debug")) 47 return addr & ~(size - 1);
27 memblock_debug = 1; 48}
49
50static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size)
51{
52 return (addr + (size - 1)) & ~(size - 1);
53}
54
55static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
56 phys_addr_t base2, phys_addr_t size2)
57{
58 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
59}
60
61static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1,
62 phys_addr_t base2, phys_addr_t size2)
63{
64 if (base2 == base1 + size1)
65 return 1;
66 else if (base1 == base2 + size2)
67 return -1;
68
28 return 0; 69 return 0;
29} 70}
30early_param("memblock", early_memblock);
31 71
32static void memblock_dump(struct memblock_region *region, char *name) 72static long __init_memblock memblock_regions_adjacent(struct memblock_type *type,
73 unsigned long r1, unsigned long r2)
33{ 74{
34 unsigned long long base, size; 75 phys_addr_t base1 = type->regions[r1].base;
35 int i; 76 phys_addr_t size1 = type->regions[r1].size;
77 phys_addr_t base2 = type->regions[r2].base;
78 phys_addr_t size2 = type->regions[r2].size;
36 79
37 pr_info(" %s.cnt = 0x%lx\n", name, region->cnt); 80 return memblock_addrs_adjacent(base1, size1, base2, size2);
81}
38 82
39 for (i = 0; i < region->cnt; i++) { 83long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
40 base = region->region[i].base; 84{
41 size = region->region[i].size; 85 unsigned long i;
42 86
43 pr_info(" %s[0x%x]\t0x%016llx - 0x%016llx, 0x%llx bytes\n", 87 for (i = 0; i < type->cnt; i++) {
44 name, i, base, base + size - 1, size); 88 phys_addr_t rgnbase = type->regions[i].base;
89 phys_addr_t rgnsize = type->regions[i].size;
90 if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
91 break;
45 } 92 }
93
94 return (i < type->cnt) ? i : -1;
46} 95}
47 96
48void memblock_dump_all(void) 97/*
98 * Find, allocate, deallocate or reserve unreserved regions. All allocations
99 * are top-down.
100 */
101
102static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end,
103 phys_addr_t size, phys_addr_t align)
49{ 104{
50 if (!memblock_debug) 105 phys_addr_t base, res_base;
51 return; 106 long j;
52 107
53 pr_info("MEMBLOCK configuration:\n"); 108 /* In case, huge size is requested */
54 pr_info(" rmo_size = 0x%llx\n", (unsigned long long)memblock.rmo_size); 109 if (end < size)
55 pr_info(" memory.size = 0x%llx\n", (unsigned long long)memblock.memory.size); 110 return MEMBLOCK_ERROR;
56 111
57 memblock_dump(&memblock.memory, "memory"); 112 base = memblock_align_down((end - size), align);
58 memblock_dump(&memblock.reserved, "reserved"); 113
114 /* Prevent allocations returning 0 as it's also used to
115 * indicate an allocation failure
116 */
117 if (start == 0)
118 start = PAGE_SIZE;
119
120 while (start <= base) {
121 j = memblock_overlaps_region(&memblock.reserved, base, size);
122 if (j < 0)
123 return base;
124 res_base = memblock.reserved.regions[j].base;
125 if (res_base < size)
126 break;
127 base = memblock_align_down(res_base - size, align);
128 }
129
130 return MEMBLOCK_ERROR;
59} 131}
60 132
61static unsigned long memblock_addrs_overlap(u64 base1, u64 size1, u64 base2, 133static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
62 u64 size2) 134 phys_addr_t align, phys_addr_t start, phys_addr_t end)
63{ 135{
64 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); 136 long i;
137
138 BUG_ON(0 == size);
139
140 size = memblock_align_up(size, align);
141
142 /* Pump up max_addr */
143 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
144 end = memblock.current_limit;
145
146 /* We do a top-down search, this tends to limit memory
147 * fragmentation by keeping early boot allocs near the
148 * top of memory
149 */
150 for (i = memblock.memory.cnt - 1; i >= 0; i--) {
151 phys_addr_t memblockbase = memblock.memory.regions[i].base;
152 phys_addr_t memblocksize = memblock.memory.regions[i].size;
153 phys_addr_t bottom, top, found;
154
155 if (memblocksize < size)
156 continue;
157 if ((memblockbase + memblocksize) <= start)
158 break;
159 bottom = max(memblockbase, start);
160 top = min(memblockbase + memblocksize, end);
161 if (bottom >= top)
162 continue;
163 found = memblock_find_region(bottom, top, size, align);
164 if (found != MEMBLOCK_ERROR)
165 return found;
166 }
167 return MEMBLOCK_ERROR;
65} 168}
66 169
67static long memblock_addrs_adjacent(u64 base1, u64 size1, u64 base2, u64 size2) 170/*
171 * Find a free area with specified alignment in a specific range.
172 */
173u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align)
68{ 174{
69 if (base2 == base1 + size1) 175 return memblock_find_base(size, align, start, end);
70 return 1; 176}
71 else if (base1 == base2 + size2)
72 return -1;
73 177
74 return 0; 178/*
179 * Free memblock.reserved.regions
180 */
181int __init_memblock memblock_free_reserved_regions(void)
182{
183 if (memblock.reserved.regions == memblock_reserved_init_regions)
184 return 0;
185
186 return memblock_free(__pa(memblock.reserved.regions),
187 sizeof(struct memblock_region) * memblock.reserved.max);
75} 188}
76 189
77static long memblock_regions_adjacent(struct memblock_region *rgn, 190/*
78 unsigned long r1, unsigned long r2) 191 * Reserve memblock.reserved.regions
192 */
193int __init_memblock memblock_reserve_reserved_regions(void)
79{ 194{
80 u64 base1 = rgn->region[r1].base; 195 if (memblock.reserved.regions == memblock_reserved_init_regions)
81 u64 size1 = rgn->region[r1].size; 196 return 0;
82 u64 base2 = rgn->region[r2].base;
83 u64 size2 = rgn->region[r2].size;
84 197
85 return memblock_addrs_adjacent(base1, size1, base2, size2); 198 return memblock_reserve(__pa(memblock.reserved.regions),
199 sizeof(struct memblock_region) * memblock.reserved.max);
86} 200}
87 201
88static void memblock_remove_region(struct memblock_region *rgn, unsigned long r) 202static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
89{ 203{
90 unsigned long i; 204 unsigned long i;
91 205
92 for (i = r; i < rgn->cnt - 1; i++) { 206 for (i = r; i < type->cnt - 1; i++) {
93 rgn->region[i].base = rgn->region[i + 1].base; 207 type->regions[i].base = type->regions[i + 1].base;
94 rgn->region[i].size = rgn->region[i + 1].size; 208 type->regions[i].size = type->regions[i + 1].size;
95 } 209 }
96 rgn->cnt--; 210 type->cnt--;
97} 211}
98 212
99/* Assumption: base addr of region 1 < base addr of region 2 */ 213/* Assumption: base addr of region 1 < base addr of region 2 */
100static void memblock_coalesce_regions(struct memblock_region *rgn, 214static void __init_memblock memblock_coalesce_regions(struct memblock_type *type,
101 unsigned long r1, unsigned long r2) 215 unsigned long r1, unsigned long r2)
102{ 216{
103 rgn->region[r1].size += rgn->region[r2].size; 217 type->regions[r1].size += type->regions[r2].size;
104 memblock_remove_region(rgn, r2); 218 memblock_remove_region(type, r2);
105} 219}
106 220
107void __init memblock_init(void) 221/* Defined below but needed now */
222static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size);
223
224static int __init_memblock memblock_double_array(struct memblock_type *type)
108{ 225{
109 /* Create a dummy zero size MEMBLOCK which will get coalesced away later. 226 struct memblock_region *new_array, *old_array;
110 * This simplifies the memblock_add() code below... 227 phys_addr_t old_size, new_size, addr;
228 int use_slab = slab_is_available();
229
230 /* We don't allow resizing until we know about the reserved regions
231 * of memory that aren't suitable for allocation
111 */ 232 */
112 memblock.memory.region[0].base = 0; 233 if (!memblock_can_resize)
113 memblock.memory.region[0].size = 0; 234 return -1;
114 memblock.memory.cnt = 1;
115 235
116 /* Ditto. */ 236 /* Calculate new doubled size */
117 memblock.reserved.region[0].base = 0; 237 old_size = type->max * sizeof(struct memblock_region);
118 memblock.reserved.region[0].size = 0; 238 new_size = old_size << 1;
119 memblock.reserved.cnt = 1; 239
120} 240 /* Try to find some space for it.
241 *
242 * WARNING: We assume that either slab_is_available() and we use it or
243 * we use MEMBLOCK for allocations. That means that this is unsafe to use
244 * when bootmem is currently active (unless bootmem itself is implemented
245 * on top of MEMBLOCK which isn't the case yet)
246 *
247 * This should however not be an issue for now, as we currently only
248 * call into MEMBLOCK while it's still active, or much later when slab is
249 * active for memory hotplug operations
250 */
251 if (use_slab) {
252 new_array = kmalloc(new_size, GFP_KERNEL);
253 addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array);
254 } else
255 addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE);
256 if (addr == MEMBLOCK_ERROR) {
257 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
258 memblock_type_name(type), type->max, type->max * 2);
259 return -1;
260 }
261 new_array = __va(addr);
121 262
122void __init memblock_analyze(void) 263 memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
123{ 264 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
124 int i;
125 265
126 memblock.memory.size = 0; 266 /* Found space, we now need to move the array over before
267 * we add the reserved region since it may be our reserved
268 * array itself that is full.
269 */
270 memcpy(new_array, type->regions, old_size);
271 memset(new_array + type->max, 0, old_size);
272 old_array = type->regions;
273 type->regions = new_array;
274 type->max <<= 1;
275
276 /* If we use SLAB that's it, we are done */
277 if (use_slab)
278 return 0;
127 279
128 for (i = 0; i < memblock.memory.cnt; i++) 280 /* Add the new reserved region now. Should not fail ! */
129 memblock.memory.size += memblock.memory.region[i].size; 281 BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0);
282
283 /* If the array wasn't our static init one, then free it. We only do
284 * that before SLAB is available as later on, we don't know whether
285 * to use kfree or free_bootmem_pages(). Shouldn't be a big deal
286 * anyways
287 */
288 if (old_array != memblock_memory_init_regions &&
289 old_array != memblock_reserved_init_regions)
290 memblock_free(__pa(old_array), old_size);
291
292 return 0;
130} 293}
131 294
132static long memblock_add_region(struct memblock_region *rgn, u64 base, u64 size) 295extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
296 phys_addr_t addr2, phys_addr_t size2)
297{
298 return 1;
299}
300
301static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
133{ 302{
134 unsigned long coalesced = 0; 303 unsigned long coalesced = 0;
135 long adjacent, i; 304 long adjacent, i;
136 305
137 if ((rgn->cnt == 1) && (rgn->region[0].size == 0)) { 306 if ((type->cnt == 1) && (type->regions[0].size == 0)) {
138 rgn->region[0].base = base; 307 type->regions[0].base = base;
139 rgn->region[0].size = size; 308 type->regions[0].size = size;
140 return 0; 309 return 0;
141 } 310 }
142 311
143 /* First try and coalesce this MEMBLOCK with another. */ 312 /* First try and coalesce this MEMBLOCK with another. */
144 for (i = 0; i < rgn->cnt; i++) { 313 for (i = 0; i < type->cnt; i++) {
145 u64 rgnbase = rgn->region[i].base; 314 phys_addr_t rgnbase = type->regions[i].base;
146 u64 rgnsize = rgn->region[i].size; 315 phys_addr_t rgnsize = type->regions[i].size;
147 316
148 if ((rgnbase == base) && (rgnsize == size)) 317 if ((rgnbase == base) && (rgnsize == size))
149 /* Already have this region, so we're done */ 318 /* Already have this region, so we're done */
150 return 0; 319 return 0;
151 320
152 adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize); 321 adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize);
322 /* Check if arch allows coalescing */
323 if (adjacent != 0 && type == &memblock.memory &&
324 !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize))
325 break;
153 if (adjacent > 0) { 326 if (adjacent > 0) {
154 rgn->region[i].base -= size; 327 type->regions[i].base -= size;
155 rgn->region[i].size += size; 328 type->regions[i].size += size;
156 coalesced++; 329 coalesced++;
157 break; 330 break;
158 } else if (adjacent < 0) { 331 } else if (adjacent < 0) {
159 rgn->region[i].size += size; 332 type->regions[i].size += size;
160 coalesced++; 333 coalesced++;
161 break; 334 break;
162 } 335 }
163 } 336 }
164 337
165 if ((i < rgn->cnt - 1) && memblock_regions_adjacent(rgn, i, i+1)) { 338 /* If we plugged a hole, we may want to also coalesce with the
166 memblock_coalesce_regions(rgn, i, i+1); 339 * next region
340 */
341 if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) &&
342 ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base,
343 type->regions[i].size,
344 type->regions[i+1].base,
345 type->regions[i+1].size)))) {
346 memblock_coalesce_regions(type, i, i+1);
167 coalesced++; 347 coalesced++;
168 } 348 }
169 349
170 if (coalesced) 350 if (coalesced)
171 return coalesced; 351 return coalesced;
172 if (rgn->cnt >= MAX_MEMBLOCK_REGIONS) 352
353 /* If we are out of space, we fail. It's too late to resize the array
354 * but then this shouldn't have happened in the first place.
355 */
356 if (WARN_ON(type->cnt >= type->max))
173 return -1; 357 return -1;
174 358
175 /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */ 359 /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */
176 for (i = rgn->cnt - 1; i >= 0; i--) { 360 for (i = type->cnt - 1; i >= 0; i--) {
177 if (base < rgn->region[i].base) { 361 if (base < type->regions[i].base) {
178 rgn->region[i+1].base = rgn->region[i].base; 362 type->regions[i+1].base = type->regions[i].base;
179 rgn->region[i+1].size = rgn->region[i].size; 363 type->regions[i+1].size = type->regions[i].size;
180 } else { 364 } else {
181 rgn->region[i+1].base = base; 365 type->regions[i+1].base = base;
182 rgn->region[i+1].size = size; 366 type->regions[i+1].size = size;
183 break; 367 break;
184 } 368 }
185 } 369 }
186 370
187 if (base < rgn->region[0].base) { 371 if (base < type->regions[0].base) {
188 rgn->region[0].base = base; 372 type->regions[0].base = base;
189 rgn->region[0].size = size; 373 type->regions[0].size = size;
374 }
375 type->cnt++;
376
377 /* The array is full ? Try to resize it. If that fails, we undo
378 * our allocation and return an error
379 */
380 if (type->cnt == type->max && memblock_double_array(type)) {
381 type->cnt--;
382 return -1;
190 } 383 }
191 rgn->cnt++;
192 384
193 return 0; 385 return 0;
194} 386}
195 387
196long memblock_add(u64 base, u64 size) 388long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
197{ 389{
198 struct memblock_region *_rgn = &memblock.memory; 390 return memblock_add_region(&memblock.memory, base, size);
199
200 /* On pSeries LPAR systems, the first MEMBLOCK is our RMO region. */
201 if (base == 0)
202 memblock.rmo_size = size;
203
204 return memblock_add_region(_rgn, base, size);
205 391
206} 392}
207 393
208static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size) 394static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
209{ 395{
210 u64 rgnbegin, rgnend; 396 phys_addr_t rgnbegin, rgnend;
211 u64 end = base + size; 397 phys_addr_t end = base + size;
212 int i; 398 int i;
213 399
214 rgnbegin = rgnend = 0; /* supress gcc warnings */ 400 rgnbegin = rgnend = 0; /* supress gcc warnings */
215 401
216 /* Find the region where (base, size) belongs to */ 402 /* Find the region where (base, size) belongs to */
217 for (i=0; i < rgn->cnt; i++) { 403 for (i=0; i < type->cnt; i++) {
218 rgnbegin = rgn->region[i].base; 404 rgnbegin = type->regions[i].base;
219 rgnend = rgnbegin + rgn->region[i].size; 405 rgnend = rgnbegin + type->regions[i].size;
220 406
221 if ((rgnbegin <= base) && (end <= rgnend)) 407 if ((rgnbegin <= base) && (end <= rgnend))
222 break; 408 break;
223 } 409 }
224 410
225 /* Didn't find the region */ 411 /* Didn't find the region */
226 if (i == rgn->cnt) 412 if (i == type->cnt)
227 return -1; 413 return -1;
228 414
229 /* Check to see if we are removing entire region */ 415 /* Check to see if we are removing entire region */
230 if ((rgnbegin == base) && (rgnend == end)) { 416 if ((rgnbegin == base) && (rgnend == end)) {
231 memblock_remove_region(rgn, i); 417 memblock_remove_region(type, i);
232 return 0; 418 return 0;
233 } 419 }
234 420
235 /* Check to see if region is matching at the front */ 421 /* Check to see if region is matching at the front */
236 if (rgnbegin == base) { 422 if (rgnbegin == base) {
237 rgn->region[i].base = end; 423 type->regions[i].base = end;
238 rgn->region[i].size -= size; 424 type->regions[i].size -= size;
239 return 0; 425 return 0;
240 } 426 }
241 427
242 /* Check to see if the region is matching at the end */ 428 /* Check to see if the region is matching at the end */
243 if (rgnend == end) { 429 if (rgnend == end) {
244 rgn->region[i].size -= size; 430 type->regions[i].size -= size;
245 return 0; 431 return 0;
246 } 432 }
247 433
@@ -249,208 +435,189 @@ static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size)
249 * We need to split the entry - adjust the current one to the 435 * We need to split the entry - adjust the current one to the
250 * beginging of the hole and add the region after hole. 436 * beginging of the hole and add the region after hole.
251 */ 437 */
252 rgn->region[i].size = base - rgn->region[i].base; 438 type->regions[i].size = base - type->regions[i].base;
253 return memblock_add_region(rgn, end, rgnend - end); 439 return memblock_add_region(type, end, rgnend - end);
254} 440}
255 441
256long memblock_remove(u64 base, u64 size) 442long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
257{ 443{
258 return __memblock_remove(&memblock.memory, base, size); 444 return __memblock_remove(&memblock.memory, base, size);
259} 445}
260 446
261long __init memblock_free(u64 base, u64 size) 447long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
262{ 448{
263 return __memblock_remove(&memblock.reserved, base, size); 449 return __memblock_remove(&memblock.reserved, base, size);
264} 450}
265 451
266long __init memblock_reserve(u64 base, u64 size) 452long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
267{ 453{
268 struct memblock_region *_rgn = &memblock.reserved; 454 struct memblock_type *_rgn = &memblock.reserved;
269 455
270 BUG_ON(0 == size); 456 BUG_ON(0 == size);
271 457
272 return memblock_add_region(_rgn, base, size); 458 return memblock_add_region(_rgn, base, size);
273} 459}
274 460
275long memblock_overlaps_region(struct memblock_region *rgn, u64 base, u64 size) 461phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
276{ 462{
277 unsigned long i; 463 phys_addr_t found;
278 464
279 for (i = 0; i < rgn->cnt; i++) { 465 /* We align the size to limit fragmentation. Without this, a lot of
280 u64 rgnbase = rgn->region[i].base; 466 * small allocs quickly eat up the whole reserve array on sparc
281 u64 rgnsize = rgn->region[i].size; 467 */
282 if (memblock_addrs_overlap(base, size, rgnbase, rgnsize)) 468 size = memblock_align_up(size, align);
283 break;
284 }
285 469
286 return (i < rgn->cnt) ? i : -1; 470 found = memblock_find_base(size, align, 0, max_addr);
471 if (found != MEMBLOCK_ERROR &&
472 memblock_add_region(&memblock.reserved, found, size) >= 0)
473 return found;
474
475 return 0;
287} 476}
288 477
289static u64 memblock_align_down(u64 addr, u64 size) 478phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
290{ 479{
291 return addr & ~(size - 1); 480 phys_addr_t alloc;
481
482 alloc = __memblock_alloc_base(size, align, max_addr);
483
484 if (alloc == 0)
485 panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
486 (unsigned long long) size, (unsigned long long) max_addr);
487
488 return alloc;
292} 489}
293 490
294static u64 memblock_align_up(u64 addr, u64 size) 491phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
295{ 492{
296 return (addr + (size - 1)) & ~(size - 1); 493 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
297} 494}
298 495
299static u64 __init memblock_alloc_nid_unreserved(u64 start, u64 end, 496
300 u64 size, u64 align) 497/*
498 * Additional node-local allocators. Search for node memory is bottom up
499 * and walks memblock regions within that node bottom-up as well, but allocation
500 * within a memblock region is top-down. XXX I plan to fix that at some stage
501 *
502 * WARNING: Only available after early_node_map[] has been populated,
503 * on some architectures, that is after all the calls to add_active_range()
504 * have been done to populate it.
505 */
506
507phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid)
301{ 508{
302 u64 base, res_base; 509#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
303 long j; 510 /*
511 * This code originates from sparc which really wants us to walk by addresses
512 * and returns the nid. This is not very convenient for early_pfn_map[] users
513 * as the map isn't sorted yet, and it really wants to be walked by nid.
514 *
515 * For now, I implement the inefficient method below which walks the early
516 * map multiple times. Eventually we may want to use an ARCH config option
517 * to implement a completely different method for both cases.
518 */
519 unsigned long start_pfn, end_pfn;
520 int i;
304 521
305 base = memblock_align_down((end - size), align); 522 for (i = 0; i < MAX_NUMNODES; i++) {
306 while (start <= base) { 523 get_pfn_range_for_nid(i, &start_pfn, &end_pfn);
307 j = memblock_overlaps_region(&memblock.reserved, base, size); 524 if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn))
308 if (j < 0) { 525 continue;
309 /* this area isn't reserved, take it */ 526 *nid = i;
310 if (memblock_add_region(&memblock.reserved, base, size) < 0) 527 return min(end, PFN_PHYS(end_pfn));
311 base = ~(u64)0;
312 return base;
313 }
314 res_base = memblock.reserved.region[j].base;
315 if (res_base < size)
316 break;
317 base = memblock_align_down(res_base - size, align);
318 } 528 }
529#endif
530 *nid = 0;
319 531
320 return ~(u64)0; 532 return end;
321} 533}
322 534
323static u64 __init memblock_alloc_nid_region(struct memblock_property *mp, 535static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp,
324 u64 (*nid_range)(u64, u64, int *), 536 phys_addr_t size,
325 u64 size, u64 align, int nid) 537 phys_addr_t align, int nid)
326{ 538{
327 u64 start, end; 539 phys_addr_t start, end;
328 540
329 start = mp->base; 541 start = mp->base;
330 end = start + mp->size; 542 end = start + mp->size;
331 543
332 start = memblock_align_up(start, align); 544 start = memblock_align_up(start, align);
333 while (start < end) { 545 while (start < end) {
334 u64 this_end; 546 phys_addr_t this_end;
335 int this_nid; 547 int this_nid;
336 548
337 this_end = nid_range(start, end, &this_nid); 549 this_end = memblock_nid_range(start, end, &this_nid);
338 if (this_nid == nid) { 550 if (this_nid == nid) {
339 u64 ret = memblock_alloc_nid_unreserved(start, this_end, 551 phys_addr_t ret = memblock_find_region(start, this_end, size, align);
340 size, align); 552 if (ret != MEMBLOCK_ERROR &&
341 if (ret != ~(u64)0) 553 memblock_add_region(&memblock.reserved, ret, size) >= 0)
342 return ret; 554 return ret;
343 } 555 }
344 start = this_end; 556 start = this_end;
345 } 557 }
346 558
347 return ~(u64)0; 559 return MEMBLOCK_ERROR;
348} 560}
349 561
350u64 __init memblock_alloc_nid(u64 size, u64 align, int nid, 562phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
351 u64 (*nid_range)(u64 start, u64 end, int *nid))
352{ 563{
353 struct memblock_region *mem = &memblock.memory; 564 struct memblock_type *mem = &memblock.memory;
354 int i; 565 int i;
355 566
356 BUG_ON(0 == size); 567 BUG_ON(0 == size);
357 568
569 /* We align the size to limit fragmentation. Without this, a lot of
570 * small allocs quickly eat up the whole reserve array on sparc
571 */
358 size = memblock_align_up(size, align); 572 size = memblock_align_up(size, align);
359 573
574 /* We do a bottom-up search for a region with the right
575 * nid since that's easier considering how memblock_nid_range()
576 * works
577 */
360 for (i = 0; i < mem->cnt; i++) { 578 for (i = 0; i < mem->cnt; i++) {
361 u64 ret = memblock_alloc_nid_region(&mem->region[i], 579 phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i],
362 nid_range,
363 size, align, nid); 580 size, align, nid);
364 if (ret != ~(u64)0) 581 if (ret != MEMBLOCK_ERROR)
365 return ret; 582 return ret;
366 } 583 }
367 584
368 return memblock_alloc(size, align); 585 return 0;
369}
370
371u64 __init memblock_alloc(u64 size, u64 align)
372{
373 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
374} 586}
375 587
376u64 __init memblock_alloc_base(u64 size, u64 align, u64 max_addr) 588phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
377{ 589{
378 u64 alloc; 590 phys_addr_t res = memblock_alloc_nid(size, align, nid);
379
380 alloc = __memblock_alloc_base(size, align, max_addr);
381 591
382 if (alloc == 0) 592 if (res)
383 panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", 593 return res;
384 (unsigned long long) size, (unsigned long long) max_addr); 594 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
385
386 return alloc;
387} 595}
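
The node-aware entry points above differ on failure: memblock_alloc_nid() returns 0 when the requested node has nothing suitable, while memblock_alloc_try_nid() falls back to MEMBLOCK_ALLOC_ANYWHERE. A hedged sketch of a per-node early allocation follows; the scratch array and helper are hypothetical, and it assumes early_node_map[] has already been populated, per the warning earlier in this hunk.

/* Sketch only: one page of per-node scratch space, preferring node-local
 * memory but falling back to any node via memblock_alloc_try_nid(). */
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/nodemask.h>

static phys_addr_t node_scratch[MAX_NUMNODES] __initdata;

static void __init alloc_node_scratch(void)
{
	int nid;

	for_each_online_node(nid)
		node_scratch[nid] = memblock_alloc_try_nid(PAGE_SIZE,
							   PAGE_SIZE, nid);
}
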
388 596
389u64 __init __memblock_alloc_base(u64 size, u64 align, u64 max_addr)
390{
391 long i, j;
392 u64 base = 0;
393 u64 res_base;
394
395 BUG_ON(0 == size);
396 597
397 size = memblock_align_up(size, align); 598/*
398 599 * Remaining API functions
399 /* On some platforms, make sure we allocate lowmem */ 600 */
400 /* Note that MEMBLOCK_REAL_LIMIT may be MEMBLOCK_ALLOC_ANYWHERE */
401 if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
402 max_addr = MEMBLOCK_REAL_LIMIT;
403
404 for (i = memblock.memory.cnt - 1; i >= 0; i--) {
405 u64 memblockbase = memblock.memory.region[i].base;
406 u64 memblocksize = memblock.memory.region[i].size;
407
408 if (memblocksize < size)
409 continue;
410 if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
411 base = memblock_align_down(memblockbase + memblocksize - size, align);
412 else if (memblockbase < max_addr) {
413 base = min(memblockbase + memblocksize, max_addr);
414 base = memblock_align_down(base - size, align);
415 } else
416 continue;
417
418 while (base && memblockbase <= base) {
419 j = memblock_overlaps_region(&memblock.reserved, base, size);
420 if (j < 0) {
421 /* this area isn't reserved, take it */
422 if (memblock_add_region(&memblock.reserved, base, size) < 0)
423 return 0;
424 return base;
425 }
426 res_base = memblock.reserved.region[j].base;
427 if (res_base < size)
428 break;
429 base = memblock_align_down(res_base - size, align);
430 }
431 }
432 return 0;
433}
434 601
435/* You must call memblock_analyze() before this. */ 602/* You must call memblock_analyze() before this. */
436u64 __init memblock_phys_mem_size(void) 603phys_addr_t __init memblock_phys_mem_size(void)
437{ 604{
438 return memblock.memory.size; 605 return memblock.memory_size;
439} 606}
440 607
441u64 memblock_end_of_DRAM(void) 608phys_addr_t __init_memblock memblock_end_of_DRAM(void)
442{ 609{
443 int idx = memblock.memory.cnt - 1; 610 int idx = memblock.memory.cnt - 1;
444 611
445 return (memblock.memory.region[idx].base + memblock.memory.region[idx].size); 612 return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size);
446} 613}
447 614
448/* You must call memblock_analyze() after this. */ 615/* You must call memblock_analyze() after this. */
449void __init memblock_enforce_memory_limit(u64 memory_limit) 616void __init memblock_enforce_memory_limit(phys_addr_t memory_limit)
450{ 617{
451 unsigned long i; 618 unsigned long i;
452 u64 limit; 619 phys_addr_t limit;
453 struct memblock_property *p; 620 struct memblock_region *p;
454 621
455 if (!memory_limit) 622 if (!memory_limit)
456 return; 623 return;
@@ -458,24 +625,21 @@ void __init memblock_enforce_memory_limit(u64 memory_limit)
458 /* Truncate the memblock regions to satisfy the memory limit. */ 625 /* Truncate the memblock regions to satisfy the memory limit. */
459 limit = memory_limit; 626 limit = memory_limit;
460 for (i = 0; i < memblock.memory.cnt; i++) { 627 for (i = 0; i < memblock.memory.cnt; i++) {
461 if (limit > memblock.memory.region[i].size) { 628 if (limit > memblock.memory.regions[i].size) {
462 limit -= memblock.memory.region[i].size; 629 limit -= memblock.memory.regions[i].size;
463 continue; 630 continue;
464 } 631 }
465 632
466 memblock.memory.region[i].size = limit; 633 memblock.memory.regions[i].size = limit;
467 memblock.memory.cnt = i + 1; 634 memblock.memory.cnt = i + 1;
468 break; 635 break;
469 } 636 }
470 637
471 if (memblock.memory.region[0].size < memblock.rmo_size)
472 memblock.rmo_size = memblock.memory.region[0].size;
473
474 memory_limit = memblock_end_of_DRAM(); 638 memory_limit = memblock_end_of_DRAM();
475 639
476 /* And truncate any reserves above the limit also. */ 640 /* And truncate any reserves above the limit also. */
477 for (i = 0; i < memblock.reserved.cnt; i++) { 641 for (i = 0; i < memblock.reserved.cnt; i++) {
478 p = &memblock.reserved.region[i]; 642 p = &memblock.reserved.regions[i];
479 643
480 if (p->base > memory_limit) 644 if (p->base > memory_limit)
481 p->size = 0; 645 p->size = 0;
@@ -489,53 +653,190 @@ void __init memblock_enforce_memory_limit(u64 memory_limit)
489 } 653 }
490} 654}
491 655
492int __init memblock_is_reserved(u64 addr) 656static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
657{
658 unsigned int left = 0, right = type->cnt;
659
660 do {
661 unsigned int mid = (right + left) / 2;
662
663 if (addr < type->regions[mid].base)
664 right = mid;
665 else if (addr >= (type->regions[mid].base +
666 type->regions[mid].size))
667 left = mid + 1;
668 else
669 return mid;
670 } while (left < right);
671 return -1;
672}
673
674int __init memblock_is_reserved(phys_addr_t addr)
675{
676 return memblock_search(&memblock.reserved, addr) != -1;
677}
678
679int __init_memblock memblock_is_memory(phys_addr_t addr)
680{
681 return memblock_search(&memblock.memory, addr) != -1;
682}
683
684int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
685{
686 int idx = memblock_search(&memblock.memory, base);
687
688 if (idx == -1)
689 return 0;
690 return memblock.memory.regions[idx].base <= base &&
691 (memblock.memory.regions[idx].base +
692 memblock.memory.regions[idx].size) >= (base + size);
693}
694
695int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
696{
697 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
698}
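
Together with memblock_reserve(), the two region predicates support a simple claim-if-free pattern during early setup. A hedged sketch follows; the helper name and error codes are illustrative, only the memblock calls come from this patch.

/* Sketch only: reserve a firmware-described range when it is covered by
 * RAM and does not overlap an existing reservation. */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/memblock.h>

static int __init claim_fw_region(phys_addr_t base, phys_addr_t size)
{
	if (!memblock_is_region_memory(base, size))
		return -EINVAL;		/* not backed by known memory */
	if (memblock_is_region_reserved(base, size))
		return -EBUSY;		/* already claimed by someone else */

	return memblock_reserve(base, size) < 0 ? -ENOMEM : 0;
}
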
699
700
701void __init_memblock memblock_set_current_limit(phys_addr_t limit)
493{ 702{
703 memblock.current_limit = limit;
704}
705
706static void __init_memblock memblock_dump(struct memblock_type *region, char *name)
707{
708 unsigned long long base, size;
494 int i; 709 int i;
495 710
496 for (i = 0; i < memblock.reserved.cnt; i++) { 711 pr_info(" %s.cnt = 0x%lx\n", name, region->cnt);
497 u64 upper = memblock.reserved.region[i].base + 712
498 memblock.reserved.region[i].size - 1; 713 for (i = 0; i < region->cnt; i++) {
499 if ((addr >= memblock.reserved.region[i].base) && (addr <= upper)) 714 base = region->regions[i].base;
500 return 1; 715 size = region->regions[i].size;
716
717 pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n",
718 name, i, base, base + size - 1, size);
501 } 719 }
502 return 0;
503} 720}
504 721
505int memblock_is_region_reserved(u64 base, u64 size) 722void __init_memblock memblock_dump_all(void)
506{ 723{
507 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; 724 if (!memblock_debug)
725 return;
726
727 pr_info("MEMBLOCK configuration:\n");
728 pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size);
729
730 memblock_dump(&memblock.memory, "memory");
731 memblock_dump(&memblock.reserved, "reserved");
508} 732}
509 733
510/* 734void __init memblock_analyze(void)
511 * Given a <base, len>, find which memory regions belong to this range.
512 * Adjust the request and return a contiguous chunk.
513 */
514int memblock_find(struct memblock_property *res)
515{ 735{
516 int i; 736 int i;
517 u64 rstart, rend;
518 737
519 rstart = res->base; 738 /* Check marker in the unused last array entry */
520 rend = rstart + res->size - 1; 739 WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
740 != (phys_addr_t)RED_INACTIVE);
741 WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
742 != (phys_addr_t)RED_INACTIVE);
743
744 memblock.memory_size = 0;
745
746 for (i = 0; i < memblock.memory.cnt; i++)
747 memblock.memory_size += memblock.memory.regions[i].size;
748
749 /* We allow resizing from there */
750 memblock_can_resize = 1;
751}
752
753void __init memblock_init(void)
754{
755 static int init_done __initdata = 0;
756
757 if (init_done)
758 return;
759 init_done = 1;
760
761 /* Hookup the initial arrays */
762 memblock.memory.regions = memblock_memory_init_regions;
763 memblock.memory.max = INIT_MEMBLOCK_REGIONS;
764 memblock.reserved.regions = memblock_reserved_init_regions;
765 memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
766
767 /* Write a marker in the unused last array entry */
768 memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
769 memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
770
771 /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
772 * This simplifies the memblock_add() code below...
773 */
774 memblock.memory.regions[0].base = 0;
775 memblock.memory.regions[0].size = 0;
776 memblock.memory.cnt = 1;
777
778 /* Ditto. */
779 memblock.reserved.regions[0].base = 0;
780 memblock.reserved.regions[0].size = 0;
781 memblock.reserved.cnt = 1;
782
783 memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE;
784}
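
For orientation, the expected call order on an architecture using this allocator is memblock_init(), a series of memblock_add()/memblock_reserve() calls describing RAM and early reservations, then memblock_analyze() before the first allocation. A hedged sketch follows; the function name and all base/size values are invented, memblock_add() is defined earlier in this file.

/* Sketch only: hypothetical arch setup sequence with made-up addresses. */
void __init hypothetical_arch_bootmem_init(void)
{
	memblock_init();

	memblock_add(0, 0x40000000);			/* 1 GiB of RAM */
	memblock_reserve(0, 0x00100000);		/* kernel + low firmware */

	memblock_analyze();				/* sums memory_size, allows resizing */
	memblock_set_current_limit(0x40000000);		/* default allocation ceiling */
}
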
785
786static int __init early_memblock(char *p)
787{
788 if (p && strstr(p, "debug"))
789 memblock_debug = 1;
790 return 0;
791}
792early_param("memblock", early_memblock);
793
794#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK)
795
796static int memblock_debug_show(struct seq_file *m, void *private)
797{
798 struct memblock_type *type = m->private;
799 struct memblock_region *reg;
800 int i;
801
802 for (i = 0; i < type->cnt; i++) {
803 reg = &type->regions[i];
804 seq_printf(m, "%4d: ", i);
805 if (sizeof(phys_addr_t) == 4)
806 seq_printf(m, "0x%08lx..0x%08lx\n",
807 (unsigned long)reg->base,
808 (unsigned long)(reg->base + reg->size - 1));
809 else
810 seq_printf(m, "0x%016llx..0x%016llx\n",
811 (unsigned long long)reg->base,
812 (unsigned long long)(reg->base + reg->size - 1));
521 813
522 for (i = 0; i < memblock.memory.cnt; i++) {
523 u64 start = memblock.memory.region[i].base;
524 u64 end = start + memblock.memory.region[i].size - 1;
525
526 if (start > rend)
527 return -1;
528
529 if ((end >= rstart) && (start < rend)) {
530 /* adjust the request */
531 if (rstart < start)
532 rstart = start;
533 if (rend > end)
534 rend = end;
535 res->base = rstart;
536 res->size = rend - rstart + 1;
537 return 0;
538 }
539 } 814 }
540 return -1; 815 return 0;
816}
817
818static int memblock_debug_open(struct inode *inode, struct file *file)
819{
820 return single_open(file, memblock_debug_show, inode->i_private);
541} 821}
822
823static const struct file_operations memblock_debug_fops = {
824 .open = memblock_debug_open,
825 .read = seq_read,
826 .llseek = seq_lseek,
827 .release = single_release,
828};
829
830static int __init memblock_init_debugfs(void)
831{
832 struct dentry *root = debugfs_create_dir("memblock", NULL);
833 if (!root)
834 return -ENXIO;
835 debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops);
836 debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops);
837
838 return 0;
839}
840__initcall(memblock_init_debugfs);
841
842#endif /* CONFIG_DEBUG_FS */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9be3cf8a5da4..9a99cfaf0a19 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,7 +89,10 @@ enum mem_cgroup_stat_index {
89 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 89 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
90 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 90 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
91 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 91 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
92 MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ 92 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
93 /* incremented at every pagein/pageout */
94 MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
95 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
93 96
94 MEM_CGROUP_STAT_NSTATS, 97 MEM_CGROUP_STAT_NSTATS,
95}; 98};
@@ -254,6 +257,12 @@ struct mem_cgroup {
254 * percpu counter. 257 * percpu counter.
255 */ 258 */
256 struct mem_cgroup_stat_cpu *stat; 259 struct mem_cgroup_stat_cpu *stat;
260 /*
261 * used when a cpu is offlined or other synchronizations
262 * See mem_cgroup_read_stat().
263 */
264 struct mem_cgroup_stat_cpu nocpu_base;
265 spinlock_t pcp_counter_lock;
257}; 266};
258 267
259/* Stuffs for move charges at task migration. */ 268/* Stuffs for move charges at task migration. */
@@ -530,14 +539,40 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
530 return mz; 539 return mz;
531} 540}
532 541
542/*
543 * Implementation Note: reading percpu statistics for memcg.
544 *
545 * Both vmstat[] and percpu_counter use thresholds and periodic
546 * synchronization to implement a "quick" read. There is a trade-off between
547 * reading cost and precision of the value, so we may eventually implement
548 * a similar periodic synchronization for memcg's counters.
549 *
550 * But this _read() function is used for the user interface right now. Users
551 * account memory usage per memory cgroup and _always_ require an exact value
552 * because they are accounting memory. Even with a quick-and-fuzzy read we
553 * would still have to visit all online cpus and sum the values. So, for now,
554 * the extra synchronization is not implemented (it exists only for cpu hotplug).
555 *
556 * If kernel-internal users appear that can live with a not-exact value, and
557 * reading all cpu values turns out to be a performance bottleneck in some
558 * common workload, thresholds and synchronization as in vmstat[] should be
559 * implemented.
560 */
533static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, 561static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
534 enum mem_cgroup_stat_index idx) 562 enum mem_cgroup_stat_index idx)
535{ 563{
536 int cpu; 564 int cpu;
537 s64 val = 0; 565 s64 val = 0;
538 566
539 for_each_possible_cpu(cpu) 567 get_online_cpus();
568 for_each_online_cpu(cpu)
540 val += per_cpu(mem->stat->count[idx], cpu); 569 val += per_cpu(mem->stat->count[idx], cpu);
570#ifdef CONFIG_HOTPLUG_CPU
571 spin_lock(&mem->pcp_counter_lock);
572 val += mem->nocpu_base.count[idx];
573 spin_unlock(&mem->pcp_counter_lock);
574#endif
575 put_online_cpus();
541 return val; 576 return val;
542} 577}
543 578
@@ -659,40 +694,83 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
659 return mem; 694 return mem;
660} 695}
661 696
662/* 697/* The caller has to guarantee "mem" exists before calling this */
663 * Call callback function against all cgroup under hierarchy tree. 698static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
664 */
665static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
666 int (*func)(struct mem_cgroup *, void *))
667{ 699{
668 int found, ret, nextid;
669 struct cgroup_subsys_state *css; 700 struct cgroup_subsys_state *css;
670 struct mem_cgroup *mem; 701 int found;
671
672 if (!root->use_hierarchy)
673 return (*func)(root, data);
674 702
675 nextid = 1; 703 if (!mem) /* ROOT cgroup has the smallest ID */
676 do { 704 return root_mem_cgroup; /*css_put/get against root is ignored*/
677 ret = 0; 705 if (!mem->use_hierarchy) {
706 if (css_tryget(&mem->css))
707 return mem;
708 return NULL;
709 }
710 rcu_read_lock();
711 /*
712 * searching a memory cgroup which has the smallest ID under given
713 * ROOT cgroup. (ID >= 1)
714 */
715 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
716 if (css && css_tryget(css))
717 mem = container_of(css, struct mem_cgroup, css);
718 else
678 mem = NULL; 719 mem = NULL;
720 rcu_read_unlock();
721 return mem;
722}
723
724static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
725 struct mem_cgroup *root,
726 bool cond)
727{
728 int nextid = css_id(&iter->css) + 1;
729 int found;
730 int hierarchy_used;
731 struct cgroup_subsys_state *css;
732
733 hierarchy_used = iter->use_hierarchy;
679 734
735 css_put(&iter->css);
736 /* If no ROOT, walk all, ignore hierarchy */
737 if (!cond || (root && !hierarchy_used))
738 return NULL;
739
740 if (!root)
741 root = root_mem_cgroup;
742
743 do {
744 iter = NULL;
680 rcu_read_lock(); 745 rcu_read_lock();
681 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, 746
682 &found); 747 css = css_get_next(&mem_cgroup_subsys, nextid,
748 &root->css, &found);
683 if (css && css_tryget(css)) 749 if (css && css_tryget(css))
684 mem = container_of(css, struct mem_cgroup, css); 750 iter = container_of(css, struct mem_cgroup, css);
685 rcu_read_unlock(); 751 rcu_read_unlock();
686 752 /* If css is NULL, no more cgroups will be found */
687 if (mem) {
688 ret = (*func)(mem, data);
689 css_put(&mem->css);
690 }
691 nextid = found + 1; 753 nextid = found + 1;
692 } while (!ret && css); 754 } while (css && !iter);
693 755
694 return ret; 756 return iter;
695} 757}
758/*
759 * for_each_mem_cgroup_tree() visits every cgroup under a tree. Be careful:
760 * breaking out of the loop is not allowed because we hold a reference count.
761 * Instead, set "cond" to false and "continue" to exit the loop.
762 */
763#define for_each_mem_cgroup_tree_cond(iter, root, cond) \
764 for (iter = mem_cgroup_start_loop(root);\
765 iter != NULL;\
766 iter = mem_cgroup_get_next(iter, root, cond))
767
768#define for_each_mem_cgroup_tree(iter, root) \
769 for_each_mem_cgroup_tree_cond(iter, root, true)
770
771#define for_each_mem_cgroup_all(iter) \
772 for_each_mem_cgroup_tree_cond(iter, NULL, true)
773
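
The new iterators replace the callback-based mem_cgroup_walk_tree() removed below, so converted call sites become plain loops. A hedged sketch of the pattern follows; the helper name is illustrative, and the shape matches mem_cgroup_get_recursive_idx_stat() later in this diff.

/* Sketch only: sum one statistic over an entire hierarchy with the new
 * iterator; reference counting is handled inside the macro. */
static s64 sum_stat_in_tree(struct mem_cgroup *root,
			    enum mem_cgroup_stat_index idx)
{
	struct mem_cgroup *iter;
	s64 total = 0;

	for_each_mem_cgroup_tree(iter, root)
		total += mem_cgroup_read_stat(iter, idx);

	return total;
}
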
696 774
697static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 775static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
698{ 776{
@@ -1051,7 +1129,52 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
1051 return swappiness; 1129 return swappiness;
1052} 1130}
1053 1131
1054/* A routine for testing mem is not under move_account */ 1132static void mem_cgroup_start_move(struct mem_cgroup *mem)
1133{
1134 int cpu;
1135
1136 get_online_cpus();
1137 spin_lock(&mem->pcp_counter_lock);
1138 for_each_online_cpu(cpu)
1139 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1140 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1141 spin_unlock(&mem->pcp_counter_lock);
1142 put_online_cpus();
1143
1144 synchronize_rcu();
1145}
1146
1147static void mem_cgroup_end_move(struct mem_cgroup *mem)
1148{
1149 int cpu;
1150
1151 if (!mem)
1152 return;
1153 get_online_cpus();
1154 spin_lock(&mem->pcp_counter_lock);
1155 for_each_online_cpu(cpu)
1156 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1157 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1158 spin_unlock(&mem->pcp_counter_lock);
1159 put_online_cpus();
1160}
1161/*
1162 * Two routines for checking whether "mem" is under move_account() or not.
1163 *
1164 * mem_cgroup_stealed() - checks whether a cgroup is mc.from. This is used
1165 * to avoid races in accounting. If true,
1166 * pc->mem_cgroup may be overwritten.
1167 *
1168 * mem_cgroup_under_move() - checks whether a cgroup is mc.from or mc.to or
1169 * under the hierarchy of moving cgroups. This is
1170 * used when waiting at high memory pressure caused by "move".
1171 */
1172
1173static bool mem_cgroup_stealed(struct mem_cgroup *mem)
1174{
1175 VM_BUG_ON(!rcu_read_lock_held());
1176 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1177}
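
In practice the start/end pair brackets any account-moving section, which is what lets mem_cgroup_stealed() act as a cheap race detector; the later hunks in mem_cgroup_force_empty and the can_attach/clear_mc paths show exactly this. A hedged sketch of the bracketing follows; the wrapper function is hypothetical, only the two calls come from this patch.

/* Sketch only: wrap account-moving work so readers of the per-cpu
 * MEM_CGROUP_ON_MOVE counter can detect that a move is in flight. */
static void hypothetical_move_section(struct mem_cgroup *from)
{
	mem_cgroup_start_move(from);

	/* ... move charges/pages away from "from" ... */

	mem_cgroup_end_move(from);
}
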
1055 1178
1056static bool mem_cgroup_under_move(struct mem_cgroup *mem) 1179static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1057{ 1180{
@@ -1092,13 +1215,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
1092 return false; 1215 return false;
1093} 1216}
1094 1217
1095static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
1096{
1097 int *val = data;
1098 (*val)++;
1099 return 0;
1100}
1101
1102/** 1218/**
1103 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1219 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1104 * @memcg: The memory cgroup that went over limit 1220 * @memcg: The memory cgroup that went over limit
@@ -1173,7 +1289,10 @@ done:
1173static int mem_cgroup_count_children(struct mem_cgroup *mem) 1289static int mem_cgroup_count_children(struct mem_cgroup *mem)
1174{ 1290{
1175 int num = 0; 1291 int num = 0;
1176 mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); 1292 struct mem_cgroup *iter;
1293
1294 for_each_mem_cgroup_tree(iter, mem)
1295 num++;
1177 return num; 1296 return num;
1178} 1297}
1179 1298
@@ -1322,49 +1441,39 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1322 return total; 1441 return total;
1323} 1442}
1324 1443
1325static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
1326{
1327 int *val = (int *)data;
1328 int x;
1329 /*
1330 * Logically, we can stop scanning immediately when we find
1331 * a memcg is already locked. But condidering unlock ops and
1332 * creation/removal of memcg, scan-all is simple operation.
1333 */
1334 x = atomic_inc_return(&mem->oom_lock);
1335 *val = max(x, *val);
1336 return 0;
1337}
1338/* 1444/*
1339 * Check OOM-Killer is already running under our hierarchy. 1445 * Check OOM-Killer is already running under our hierarchy.
1340 * If someone is running, return false. 1446 * If someone is running, return false.
1341 */ 1447 */
1342static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1448static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1343{ 1449{
1344 int lock_count = 0; 1450 int x, lock_count = 0;
1451 struct mem_cgroup *iter;
1345 1452
1346 mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); 1453 for_each_mem_cgroup_tree(iter, mem) {
1454 x = atomic_inc_return(&iter->oom_lock);
1455 lock_count = max(x, lock_count);
1456 }
1347 1457
1348 if (lock_count == 1) 1458 if (lock_count == 1)
1349 return true; 1459 return true;
1350 return false; 1460 return false;
1351} 1461}
1352 1462
1353static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) 1463static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1354{ 1464{
1465 struct mem_cgroup *iter;
1466
1355 /* 1467 /*
1356 * When a new child is created while the hierarchy is under oom, 1468 * When a new child is created while the hierarchy is under oom,
1357 * mem_cgroup_oom_lock() may not be called. We have to use 1469 * mem_cgroup_oom_lock() may not be called. We have to use
1358 * atomic_add_unless() here. 1470 * atomic_add_unless() here.
1359 */ 1471 */
1360 atomic_add_unless(&mem->oom_lock, -1, 0); 1472 for_each_mem_cgroup_tree(iter, mem)
1473 atomic_add_unless(&iter->oom_lock, -1, 0);
1361 return 0; 1474 return 0;
1362} 1475}
1363 1476
1364static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1365{
1366 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
1367}
1368 1477
1369static DEFINE_MUTEX(memcg_oom_mutex); 1478static DEFINE_MUTEX(memcg_oom_mutex);
1370static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1479static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@ -1462,34 +1571,73 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1462/* 1571/*
1463 * Currently used to update mapped file statistics, but the routine can be 1572 * Currently used to update mapped file statistics, but the routine can be
1464 * generalized to update other statistics as well. 1573 * generalized to update other statistics as well.
1574 *
1575 * Notes: Race condition
1576 *
1577 * We usually use page_cgroup_lock() for accessing page_cgroup members but
1578 * it tends to be costly. Under some conditions, however, we don't need
1579 * to do so _always_.
1580 *
1581 * Considering "charge", lock_page_cgroup() is not required because all
1582 * file-stat operations happen after a page is attached to the radix-tree.
1583 * There is no race with "charge".
1584 *
1585 * Considering "uncharge", we know that memcg intentionally doesn't clear
1586 * pc->mem_cgroup at "uncharge". So we always see a valid pc->mem_cgroup even
1587 * if we race with "uncharge". The statistics themselves are properly handled
1588 * by flags.
1589 *
1590 * Considering "move", this is the only case where we see a race. To make the
1591 * race window small, we check the MEM_CGROUP_ON_MOVE percpu value to detect
1592 * whether a race is possible. If it is, we take the lock.
1465 */ 1593 */
1466void mem_cgroup_update_file_mapped(struct page *page, int val) 1594
1595static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
1467{ 1596{
1468 struct mem_cgroup *mem; 1597 struct mem_cgroup *mem;
1469 struct page_cgroup *pc; 1598 struct page_cgroup *pc = lookup_page_cgroup(page);
1599 bool need_unlock = false;
1470 1600
1471 pc = lookup_page_cgroup(page);
1472 if (unlikely(!pc)) 1601 if (unlikely(!pc))
1473 return; 1602 return;
1474 1603
1475 lock_page_cgroup(pc); 1604 rcu_read_lock();
1476 mem = pc->mem_cgroup; 1605 mem = pc->mem_cgroup;
1477 if (!mem || !PageCgroupUsed(pc)) 1606 if (unlikely(!mem || !PageCgroupUsed(pc)))
1478 goto done; 1607 goto out;
1608 /* pc->mem_cgroup is unstable ? */
1609 if (unlikely(mem_cgroup_stealed(mem))) {
1610 /* take a lock against to access pc->mem_cgroup */
1611 lock_page_cgroup(pc);
1612 need_unlock = true;
1613 mem = pc->mem_cgroup;
1614 if (!mem || !PageCgroupUsed(pc))
1615 goto out;
1616 }
1479 1617
1480 /* 1618 this_cpu_add(mem->stat->count[idx], val);
1481 * Preemption is already disabled. We can use __this_cpu_xxx 1619
1482 */ 1620 switch (idx) {
1483 if (val > 0) { 1621 case MEM_CGROUP_STAT_FILE_MAPPED:
1484 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1622 if (val > 0)
1485 SetPageCgroupFileMapped(pc); 1623 SetPageCgroupFileMapped(pc);
1486 } else { 1624 else if (!page_mapped(page))
1487 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1625 ClearPageCgroupFileMapped(pc);
1488 ClearPageCgroupFileMapped(pc); 1626 break;
1627 default:
1628 BUG();
1489 } 1629 }
1490 1630
1491done: 1631out:
1492 unlock_page_cgroup(pc); 1632 if (unlikely(need_unlock))
1633 unlock_page_cgroup(pc);
1634 rcu_read_unlock();
1635 return;
1636}
1637
1638void mem_cgroup_update_file_mapped(struct page *page, int val)
1639{
1640 mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
1493} 1641}
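
Existing callers are unaffected by this refactoring; the rmap code keeps feeding +1/-1 deltas into mem_cgroup_update_file_mapped(). A hedged, abridged reminder of the caller side follows, written roughly as in mm/rmap.c of this kernel; treat the exact body as illustrative.

/* Sketch (abridged): the file-mapped statistic follows the 0<->1 mapcount
 * transition of a file page. */
void page_add_file_rmap(struct page *page)
{
	if (atomic_inc_and_test(&page->_mapcount)) {
		__inc_zone_page_state(page, NR_FILE_MAPPED);
		mem_cgroup_update_file_mapped(page, 1);
	}
}
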
1494 1642
1495/* 1643/*
@@ -1605,15 +1753,55 @@ static void drain_all_stock_sync(void)
1605 atomic_dec(&memcg_drain_count); 1753 atomic_dec(&memcg_drain_count);
1606} 1754}
1607 1755
1608static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, 1756/*
1757 * This function drains percpu counter value from DEAD cpu and
1758 * move it to local cpu. Note that this function can be preempted.
1759 */
1760static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
1761{
1762 int i;
1763
1764 spin_lock(&mem->pcp_counter_lock);
1765 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
1766 s64 x = per_cpu(mem->stat->count[i], cpu);
1767
1768 per_cpu(mem->stat->count[i], cpu) = 0;
1769 mem->nocpu_base.count[i] += x;
1770 }
1771 /* need to clear ON_MOVE value, works as a kind of lock. */
1772 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
1773 spin_unlock(&mem->pcp_counter_lock);
1774}
1775
1776static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
1777{
1778 int idx = MEM_CGROUP_ON_MOVE;
1779
1780 spin_lock(&mem->pcp_counter_lock);
1781 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
1782 spin_unlock(&mem->pcp_counter_lock);
1783}
1784
1785static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
1609 unsigned long action, 1786 unsigned long action,
1610 void *hcpu) 1787 void *hcpu)
1611{ 1788{
1612 int cpu = (unsigned long)hcpu; 1789 int cpu = (unsigned long)hcpu;
1613 struct memcg_stock_pcp *stock; 1790 struct memcg_stock_pcp *stock;
1791 struct mem_cgroup *iter;
1792
1793 if ((action == CPU_ONLINE)) {
1794 for_each_mem_cgroup_all(iter)
1795 synchronize_mem_cgroup_on_move(iter, cpu);
1796 return NOTIFY_OK;
1797 }
1614 1798
1615 if (action != CPU_DEAD) 1799 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1616 return NOTIFY_OK; 1800 return NOTIFY_OK;
1801
1802 for_each_mem_cgroup_all(iter)
1803 mem_cgroup_drain_pcp_counter(iter, cpu);
1804
1617 stock = &per_cpu(memcg_stock, cpu); 1805 stock = &per_cpu(memcg_stock, cpu);
1618 drain_stock(stock); 1806 drain_stock(stock);
1619 return NOTIFY_OK; 1807 return NOTIFY_OK;
@@ -3038,6 +3226,7 @@ move_account:
3038 lru_add_drain_all(); 3226 lru_add_drain_all();
3039 drain_all_stock_sync(); 3227 drain_all_stock_sync();
3040 ret = 0; 3228 ret = 0;
3229 mem_cgroup_start_move(mem);
3041 for_each_node_state(node, N_HIGH_MEMORY) { 3230 for_each_node_state(node, N_HIGH_MEMORY) {
3042 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3231 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3043 enum lru_list l; 3232 enum lru_list l;
@@ -3051,6 +3240,7 @@ move_account:
3051 if (ret) 3240 if (ret)
3052 break; 3241 break;
3053 } 3242 }
3243 mem_cgroup_end_move(mem);
3054 memcg_oom_recover(mem); 3244 memcg_oom_recover(mem);
3055 /* it seems parent cgroup doesn't have enough mem */ 3245 /* it seems parent cgroup doesn't have enough mem */
3056 if (ret == -ENOMEM) 3246 if (ret == -ENOMEM)
@@ -3137,33 +3327,25 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3137 return retval; 3327 return retval;
3138} 3328}
3139 3329
3140struct mem_cgroup_idx_data {
3141 s64 val;
3142 enum mem_cgroup_stat_index idx;
3143};
3144 3330
3145static int 3331static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
3146mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 3332 enum mem_cgroup_stat_index idx)
3147{ 3333{
3148 struct mem_cgroup_idx_data *d = data; 3334 struct mem_cgroup *iter;
3149 d->val += mem_cgroup_read_stat(mem, d->idx); 3335 s64 val = 0;
3150 return 0;
3151}
3152 3336
3153static void 3337 /* each per-cpu value can be negative, so use s64 */
3154mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, 3338 for_each_mem_cgroup_tree(iter, mem)
3155 enum mem_cgroup_stat_index idx, s64 *val) 3339 val += mem_cgroup_read_stat(iter, idx);
3156{ 3340
3157 struct mem_cgroup_idx_data d; 3341 if (val < 0) /* race ? */
3158 d.idx = idx; 3342 val = 0;
3159 d.val = 0; 3343 return val;
3160 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
3161 *val = d.val;
3162} 3344}
3163 3345
3164static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3346static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3165{ 3347{
3166 u64 idx_val, val; 3348 u64 val;
3167 3349
3168 if (!mem_cgroup_is_root(mem)) { 3350 if (!mem_cgroup_is_root(mem)) {
3169 if (!swap) 3351 if (!swap)
@@ -3172,16 +3354,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3172 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3354 return res_counter_read_u64(&mem->memsw, RES_USAGE);
3173 } 3355 }
3174 3356
3175 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); 3357 val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE);
3176 val = idx_val; 3358 val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS);
3177 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
3178 val += idx_val;
3179 3359
3180 if (swap) { 3360 if (swap)
3181 mem_cgroup_get_recursive_idx_stat(mem, 3361 val += mem_cgroup_get_recursive_idx_stat(mem,
3182 MEM_CGROUP_STAT_SWAPOUT, &idx_val); 3362 MEM_CGROUP_STAT_SWAPOUT);
3183 val += idx_val;
3184 }
3185 3363
3186 return val << PAGE_SHIFT; 3364 return val << PAGE_SHIFT;
3187} 3365}
@@ -3389,9 +3567,9 @@ struct {
3389}; 3567};
3390 3568
3391 3569
3392static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) 3570static void
3571mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3393{ 3572{
3394 struct mcs_total_stat *s = data;
3395 s64 val; 3573 s64 val;
3396 3574
3397 /* per cpu stat */ 3575 /* per cpu stat */
@@ -3421,13 +3599,15 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
3421 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 3599 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
3422 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 3600 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
3423 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 3601 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
3424 return 0;
3425} 3602}
3426 3603
3427static void 3604static void
3428mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 3605mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3429{ 3606{
3430 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); 3607 struct mem_cgroup *iter;
3608
3609 for_each_mem_cgroup_tree(iter, mem)
3610 mem_cgroup_get_local_stat(iter, s);
3431} 3611}
3432 3612
3433static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 3613static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
@@ -3604,7 +3784,7 @@ static int compare_thresholds(const void *a, const void *b)
3604 return _a->threshold - _b->threshold; 3784 return _a->threshold - _b->threshold;
3605} 3785}
3606 3786
3607static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) 3787static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
3608{ 3788{
3609 struct mem_cgroup_eventfd_list *ev; 3789 struct mem_cgroup_eventfd_list *ev;
3610 3790
@@ -3615,7 +3795,10 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
3615 3795
3616static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 3796static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
3617{ 3797{
3618 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); 3798 struct mem_cgroup *iter;
3799
3800 for_each_mem_cgroup_tree(iter, mem)
3801 mem_cgroup_oom_notify_cb(iter);
3619} 3802}
3620 3803
3621static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 3804static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
@@ -4032,6 +4215,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
4032 vfree(mem); 4215 vfree(mem);
4033 mem = NULL; 4216 mem = NULL;
4034 } 4217 }
4218 spin_lock_init(&mem->pcp_counter_lock);
4035 return mem; 4219 return mem;
4036} 4220}
4037 4221
@@ -4158,7 +4342,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4158 &per_cpu(memcg_stock, cpu); 4342 &per_cpu(memcg_stock, cpu);
4159 INIT_WORK(&stock->work, drain_local_stock); 4343 INIT_WORK(&stock->work, drain_local_stock);
4160 } 4344 }
4161 hotcpu_notifier(memcg_stock_cpu_callback, 0); 4345 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4162 } else { 4346 } else {
4163 parent = mem_cgroup_from_cont(cont->parent); 4347 parent = mem_cgroup_from_cont(cont->parent);
4164 mem->use_hierarchy = parent->use_hierarchy; 4348 mem->use_hierarchy = parent->use_hierarchy;
@@ -4513,6 +4697,7 @@ static void mem_cgroup_clear_mc(void)
4513 mc.to = NULL; 4697 mc.to = NULL;
4514 mc.moving_task = NULL; 4698 mc.moving_task = NULL;
4515 spin_unlock(&mc.lock); 4699 spin_unlock(&mc.lock);
4700 mem_cgroup_end_move(from);
4516 memcg_oom_recover(from); 4701 memcg_oom_recover(from);
4517 memcg_oom_recover(to); 4702 memcg_oom_recover(to);
4518 wake_up_all(&mc.waitq); 4703 wake_up_all(&mc.waitq);
@@ -4543,6 +4728,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4543 VM_BUG_ON(mc.moved_charge); 4728 VM_BUG_ON(mc.moved_charge);
4544 VM_BUG_ON(mc.moved_swap); 4729 VM_BUG_ON(mc.moved_swap);
4545 VM_BUG_ON(mc.moving_task); 4730 VM_BUG_ON(mc.moving_task);
4731 mem_cgroup_start_move(from);
4546 spin_lock(&mc.lock); 4732 spin_lock(&mc.lock);
4547 mc.from = from; 4733 mc.from = from;
4548 mc.to = mem; 4734 mc.to = mem;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 757f6b0accfe..124324134ff6 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -7,21 +7,26 @@
7 * Free Software Foundation. 7 * Free Software Foundation.
8 * 8 *
9 * High level machine check handler. Handles pages reported by the 9 * High level machine check handler. Handles pages reported by the
10 * hardware as being corrupted usually due to a 2bit ECC memory or cache 10 * hardware as being corrupted usually due to a multi-bit ECC memory or cache
11 * failure. 11 * failure.
12 *
13 * In addition there is a "soft offline" entry point that allows stopping the
14 * use of pages that are suspicious but not yet corrupted, without killing anything.
12 * 15 *
13 * Handles page cache pages in various states. The tricky part 16 * Handles page cache pages in various states. The tricky part
14 * here is that we can access any page asynchronous to other VM 17 * here is that we can access any page asynchronously with respect to
15 * users, because memory failures could happen anytime and anywhere, 18 * other VM users, because memory failures could happen anytime and
16 * possibly violating some of their assumptions. This is why this code 19 * anywhere. This could violate some of their assumptions. This is why
17 * has to be extremely careful. Generally it tries to use normal locking 20 * this code has to be extremely careful. Generally it tries to use
18 * rules, as in get the standard locks, even if that means the 21 * normal locking rules, as in get the standard locks, even if that means
19 * error handling takes potentially a long time. 22 * the error handling takes potentially a long time.
20 * 23 *
21 * The operation to map back from RMAP chains to processes has to walk 24 * There are several operations here with exponential complexity because
22 * the complete process list and has non linear complexity with the number 25 * of unsuitable VM data structures. For example the operation to map back
23 * mappings. In short it can be quite slow. But since memory corruptions 26 * from RMAP chains to processes has to walk the complete process list and
24 * are rare we hope to get away with this. 27 * has non-linear complexity in the number of mappings. But since memory corruptions
28 * are rare we hope to get away with this. This avoids impacting the core
29 * VM.
25 */ 30 */
26 31
27/* 32/*
@@ -30,7 +35,6 @@
30 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages 35 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
31 * - pass bad pages to kdump next kernel 36 * - pass bad pages to kdump next kernel
32 */ 37 */
33#define DEBUG 1 /* remove me in 2.6.34 */
34#include <linux/kernel.h> 38#include <linux/kernel.h>
35#include <linux/mm.h> 39#include <linux/mm.h>
36#include <linux/page-flags.h> 40#include <linux/page-flags.h>
@@ -78,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p)
78 return 0; 82 return 0;
79 83
80 /* 84 /*
81 * page_mapping() does not accept slab page 85 * page_mapping() does not accept slab pages.
82 */ 86 */
83 if (PageSlab(p)) 87 if (PageSlab(p))
84 return -EINVAL; 88 return -EINVAL;
@@ -268,7 +272,7 @@ struct to_kill {
268 struct list_head nd; 272 struct list_head nd;
269 struct task_struct *tsk; 273 struct task_struct *tsk;
270 unsigned long addr; 274 unsigned long addr;
271 unsigned addr_valid:1; 275 char addr_valid;
272}; 276};
273 277
274/* 278/*
@@ -309,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
309 * a SIGKILL because the error is not contained anymore. 313 * a SIGKILL because the error is not contained anymore.
310 */ 314 */
311 if (tk->addr == -EFAULT) { 315 if (tk->addr == -EFAULT) {
312 pr_debug("MCE: Unable to find user space address %lx in %s\n", 316 pr_info("MCE: Unable to find user space address %lx in %s\n",
313 page_to_pfn(p), tsk->comm); 317 page_to_pfn(p), tsk->comm);
314 tk->addr_valid = 0; 318 tk->addr_valid = 0;
315 } 319 }
@@ -577,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
577 pfn, err); 581 pfn, err);
578 } else if (page_has_private(p) && 582 } else if (page_has_private(p) &&
579 !try_to_release_page(p, GFP_NOIO)) { 583 !try_to_release_page(p, GFP_NOIO)) {
580 pr_debug("MCE %#lx: failed to release buffers\n", pfn); 584 pr_info("MCE %#lx: failed to release buffers\n", pfn);
581 } else { 585 } else {
582 ret = RECOVERED; 586 ret = RECOVERED;
583 } 587 }
@@ -693,11 +697,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
693 * Issues: 697 * Issues:
694 * - Error on hugepage is contained in hugepage unit (not in raw page unit.) 698 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
695 * To narrow down kill region to one page, we need to break up pmd. 699 * To narrow down kill region to one page, we need to break up pmd.
696 * - To support soft-offlining for hugepage, we need to support hugepage
697 * migration.
698 */ 700 */
699static int me_huge_page(struct page *p, unsigned long pfn) 701static int me_huge_page(struct page *p, unsigned long pfn)
700{ 702{
703 int res = 0;
701 struct page *hpage = compound_head(p); 704 struct page *hpage = compound_head(p);
702 /* 705 /*
703 * We can safely recover from error on free or reserved (i.e. 706 * We can safely recover from error on free or reserved (i.e.
@@ -710,8 +713,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
710 * so there is no race between isolation and mapping/unmapping. 713 * so there is no race between isolation and mapping/unmapping.
711 */ 714 */
712 if (!(page_mapping(hpage) || PageAnon(hpage))) { 715 if (!(page_mapping(hpage) || PageAnon(hpage))) {
713 __isolate_hwpoisoned_huge_page(hpage); 716 res = dequeue_hwpoisoned_huge_page(hpage);
714 return RECOVERED; 717 if (!res)
718 return RECOVERED;
715 } 719 }
716 return DELAYED; 720 return DELAYED;
717} 721}
@@ -836,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p,
836 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; 840 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
837} 841}
838 842
839#define N_UNMAP_TRIES 5
840
841/* 843/*
842 * Do all that is necessary to remove user space mappings. Unmap 844 * Do all that is necessary to remove user space mappings. Unmap
843 * the pages and send SIGBUS to the processes if the data was dirty. 845 * the pages and send SIGBUS to the processes if the data was dirty.
@@ -849,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
849 struct address_space *mapping; 851 struct address_space *mapping;
850 LIST_HEAD(tokill); 852 LIST_HEAD(tokill);
851 int ret; 853 int ret;
852 int i;
853 int kill = 1; 854 int kill = 1;
854 struct page *hpage = compound_head(p); 855 struct page *hpage = compound_head(p);
855 856
@@ -903,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
903 if (kill) 904 if (kill)
904 collect_procs(hpage, &tokill); 905 collect_procs(hpage, &tokill);
905 906
906 /* 907 ret = try_to_unmap(hpage, ttu);
907 * try_to_unmap can fail temporarily due to races.
908 * Try a few times (RED-PEN better strategy?)
909 */
910 for (i = 0; i < N_UNMAP_TRIES; i++) {
911 ret = try_to_unmap(hpage, ttu);
912 if (ret == SWAP_SUCCESS)
913 break;
914 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
915 }
916
917 if (ret != SWAP_SUCCESS) 908 if (ret != SWAP_SUCCESS)
918 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 909 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
919 pfn, page_mapcount(hpage)); 910 pfn, page_mapcount(hpage));
@@ -981,7 +972,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
981 * We need/can do nothing about count=0 pages. 972 * We need/can do nothing about count=0 pages.
982 * 1) it's a free page, and therefore in safe hand: 973 * 1) it's a free page, and therefore in safe hand:
983 * prep_new_page() will be the gate keeper. 974 * prep_new_page() will be the gate keeper.
984 * 2) it's part of a non-compound high order page. 975 * 2) it's a free hugepage, which is also safe:
976 * an affected hugepage will be dequeued from hugepage freelist,
977 * so there's no concern about reusing it ever after.
978 * 3) it's part of a non-compound high order page.
985 * Implies some kernel user: cannot stop them from 979 * Implies some kernel user: cannot stop them from
986 * R/W the page; let's pray that the page has been 980 * R/W the page; let's pray that the page has been
987 * used and will be freed some time later. 981 * used and will be freed some time later.
@@ -993,6 +987,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
993 if (is_free_buddy_page(p)) { 987 if (is_free_buddy_page(p)) {
994 action_result(pfn, "free buddy", DELAYED); 988 action_result(pfn, "free buddy", DELAYED);
995 return 0; 989 return 0;
990 } else if (PageHuge(hpage)) {
991 /*
992 * Check "just unpoisoned", "filter hit", and
993 * "race with other subpage."
994 */
995 lock_page_nosync(hpage);
996 if (!PageHWPoison(hpage)
997 || (hwpoison_filter(p) && TestClearPageHWPoison(p))
998 || (p != hpage && TestSetPageHWPoison(hpage))) {
999 atomic_long_sub(nr_pages, &mce_bad_pages);
1000 return 0;
1001 }
1002 set_page_hwpoison_huge_page(hpage);
1003 res = dequeue_hwpoisoned_huge_page(hpage);
1004 action_result(pfn, "free huge",
1005 res ? IGNORED : DELAYED);
1006 unlock_page(hpage);
1007 return res;
996 } else { 1008 } else {
997 action_result(pfn, "high order kernel", IGNORED); 1009 action_result(pfn, "high order kernel", IGNORED);
998 return -EBUSY; 1010 return -EBUSY;
@@ -1147,16 +1159,26 @@ int unpoison_memory(unsigned long pfn)
1147 page = compound_head(p); 1159 page = compound_head(p);
1148 1160
1149 if (!PageHWPoison(p)) { 1161 if (!PageHWPoison(p)) {
1150 pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); 1162 pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
1151 return 0; 1163 return 0;
1152 } 1164 }
1153 1165
1154 nr_pages = 1 << compound_order(page); 1166 nr_pages = 1 << compound_order(page);
1155 1167
1156 if (!get_page_unless_zero(page)) { 1168 if (!get_page_unless_zero(page)) {
1169 /*
1170 * Since a HWPoisoned hugepage should have a non-zero refcount,
1171 * getting here means a race between memory failure and unpoison
1172 * has happened. In that case unpoison fails and memory failure
1173 * runs to the end.
1174 */
1175 if (PageHuge(page)) {
1176 pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
1177 return 0;
1178 }
1157 if (TestClearPageHWPoison(p)) 1179 if (TestClearPageHWPoison(p))
1158 atomic_long_sub(nr_pages, &mce_bad_pages); 1180 atomic_long_sub(nr_pages, &mce_bad_pages);
1159 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); 1181 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1160 return 0; 1182 return 0;
1161 } 1183 }
1162 1184
@@ -1168,12 +1190,12 @@ int unpoison_memory(unsigned long pfn)
1168 * the free buddy page pool. 1190 * the free buddy page pool.
1169 */ 1191 */
1170 if (TestClearPageHWPoison(page)) { 1192 if (TestClearPageHWPoison(page)) {
1171 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); 1193 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1172 atomic_long_sub(nr_pages, &mce_bad_pages); 1194 atomic_long_sub(nr_pages, &mce_bad_pages);
1173 freeit = 1; 1195 freeit = 1;
1196 if (PageHuge(page))
1197 clear_page_hwpoison_huge_page(page);
1174 } 1198 }
1175 if (PageHuge(p))
1176 clear_page_hwpoison_huge_page(page);
1177 unlock_page(page); 1199 unlock_page(page);
1178 1200
1179 put_page(page); 1201 put_page(page);
@@ -1187,7 +1209,11 @@ EXPORT_SYMBOL(unpoison_memory);
1187static struct page *new_page(struct page *p, unsigned long private, int **x) 1209static struct page *new_page(struct page *p, unsigned long private, int **x)
1188{ 1210{
1189 int nid = page_to_nid(p); 1211 int nid = page_to_nid(p);
1190 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); 1212 if (PageHuge(p))
1213 return alloc_huge_page_node(page_hstate(compound_head(p)),
1214 nid);
1215 else
1216 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1191} 1217}
1192 1218
1193/* 1219/*
@@ -1215,14 +1241,21 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1215 * was free. 1241 * was free.
1216 */ 1242 */
1217 set_migratetype_isolate(p); 1243 set_migratetype_isolate(p);
1244 /*
1245 * When the target page is a free hugepage, just remove it
1246 * from free hugepage list.
1247 */
1218 if (!get_page_unless_zero(compound_head(p))) { 1248 if (!get_page_unless_zero(compound_head(p))) {
1219 if (is_free_buddy_page(p)) { 1249 if (PageHuge(p)) {
1220 pr_debug("get_any_page: %#lx free buddy page\n", pfn); 1250 pr_info("get_any_page: %#lx free huge page\n", pfn);
1251 ret = dequeue_hwpoisoned_huge_page(compound_head(p));
1252 } else if (is_free_buddy_page(p)) {
1253 pr_info("get_any_page: %#lx free buddy page\n", pfn);
1221 /* Set hwpoison bit while page is still isolated */ 1254 /* Set hwpoison bit while page is still isolated */
1222 SetPageHWPoison(p); 1255 SetPageHWPoison(p);
1223 ret = 0; 1256 ret = 0;
1224 } else { 1257 } else {
1225 pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", 1258 pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1226 pfn, p->flags); 1259 pfn, p->flags);
1227 ret = -EIO; 1260 ret = -EIO;
1228 } 1261 }
@@ -1235,6 +1268,46 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1235 return ret; 1268 return ret;
1236} 1269}
1237 1270
1271static int soft_offline_huge_page(struct page *page, int flags)
1272{
1273 int ret;
1274 unsigned long pfn = page_to_pfn(page);
1275 struct page *hpage = compound_head(page);
1276 LIST_HEAD(pagelist);
1277
1278 ret = get_any_page(page, pfn, flags);
1279 if (ret < 0)
1280 return ret;
1281 if (ret == 0)
1282 goto done;
1283
1284 if (PageHWPoison(hpage)) {
1285 put_page(hpage);
1286 pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
1287 return -EBUSY;
1288 }
1289
1290 /* Keep page count to indicate a given hugepage is isolated. */
1291
1292 list_add(&hpage->lru, &pagelist);
1293 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1294 if (ret) {
1295 putback_lru_pages(&pagelist);
1296 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1297 pfn, ret, page->flags);
1298 if (ret > 0)
1299 ret = -EIO;
1300 return ret;
1301 }
1302done:
1303 if (!PageHWPoison(hpage))
1304 atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
1305 set_page_hwpoison_huge_page(hpage);
1306 dequeue_hwpoisoned_huge_page(hpage);
1307 /* keep elevated page count for bad page */
1308 return ret;
1309}
1310
1238/** 1311/**
1239 * soft_offline_page - Soft offline a page. 1312 * soft_offline_page - Soft offline a page.
1240 * @page: page to offline 1313 * @page: page to offline
@@ -1262,6 +1335,9 @@ int soft_offline_page(struct page *page, int flags)
1262 int ret; 1335 int ret;
1263 unsigned long pfn = page_to_pfn(page); 1336 unsigned long pfn = page_to_pfn(page);
1264 1337
1338 if (PageHuge(page))
1339 return soft_offline_huge_page(page, flags);
1340
1265 ret = get_any_page(page, pfn, flags); 1341 ret = get_any_page(page, pfn, flags);
1266 if (ret < 0) 1342 if (ret < 0)
1267 return ret; 1343 return ret;
@@ -1288,7 +1364,7 @@ int soft_offline_page(struct page *page, int flags)
1288 goto done; 1364 goto done;
1289 } 1365 }
1290 if (!PageLRU(page)) { 1366 if (!PageLRU(page)) {
1291 pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", 1367 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1292 pfn, page->flags); 1368 pfn, page->flags);
1293 return -EIO; 1369 return -EIO;
1294 } 1370 }
@@ -1302,7 +1378,7 @@ int soft_offline_page(struct page *page, int flags)
1302 if (PageHWPoison(page)) { 1378 if (PageHWPoison(page)) {
1303 unlock_page(page); 1379 unlock_page(page);
1304 put_page(page); 1380 put_page(page);
1305 pr_debug("soft offline: %#lx page already poisoned\n", pfn); 1381 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1306 return -EBUSY; 1382 return -EBUSY;
1307 } 1383 }
1308 1384
@@ -1323,7 +1399,7 @@ int soft_offline_page(struct page *page, int flags)
1323 put_page(page); 1399 put_page(page);
1324 if (ret == 1) { 1400 if (ret == 1) {
1325 ret = 0; 1401 ret = 0;
1326 pr_debug("soft_offline: %#lx: invalidated\n", pfn); 1402 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1327 goto done; 1403 goto done;
1328 } 1404 }
1329 1405
@@ -1339,13 +1415,13 @@ int soft_offline_page(struct page *page, int flags)
1339 list_add(&page->lru, &pagelist); 1415 list_add(&page->lru, &pagelist);
1340 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1416 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
1341 if (ret) { 1417 if (ret) {
1342 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1418 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1343 pfn, ret, page->flags); 1419 pfn, ret, page->flags);
1344 if (ret > 0) 1420 if (ret > 0)
1345 ret = -EIO; 1421 ret = -EIO;
1346 } 1422 }
1347 } else { 1423 } else {
1348 pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1424 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1349 pfn, ret, page_count(page), page->flags); 1425 pfn, ret, page_count(page), page->flags);
1350 } 1426 }
1351 if (ret) 1427 if (ret)
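
The memory-failure.c hunks above wire hugetlb pages into the soft-offline path: soft_offline_page() now dispatches PageHuge() pages to soft_offline_huge_page(), which pins the compound head, migrates the whole hugepage via migrate_huge_pages(), and only then marks it hwpoisoned. The user-space sketch below is illustrative only and is not part of the patch; it assumes CONFIG_MEMORY_FAILURE, a populated 2MB hugepage pool (vm.nr_hugepages > 0), CAP_SYS_ADMIN, and x86 constants for MAP_HUGETLB.

    /* illustrative sketch, not from the patch: exercise the new hugepage
     * soft-offline path through madvise(MADV_SOFT_OFFLINE) */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>

    #ifndef MAP_HUGETLB
    #define MAP_HUGETLB 0x40000          /* x86 value, arch-specific */
    #endif
    #ifndef MADV_SOFT_OFFLINE
    #define MADV_SOFT_OFFLINE 101        /* asm-generic/mman-common.h */
    #endif

    int main(void)
    {
            size_t len = 2UL << 20;      /* assume a 2MB default hugepage */
            char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
            if (p == MAP_FAILED)
                    return perror("mmap"), 1;
            memset(p, 0xaa, len);        /* fault the hugepage in */

            /* kernel side: soft_offline_page() sees PageHuge() and calls
             * soft_offline_huge_page(), migrating the whole hugepage */
            if (madvise(p, len, MADV_SOFT_OFFLINE))
                    perror("madvise(MADV_SOFT_OFFLINE)");
            else
                    printf("hugepage at %p soft-offlined, contents migrated\n",
                           (void *)p);
            munmap(p, len);
            return 0;
    }
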
diff --git a/mm/memory.c b/mm/memory.c
index 0e18b4d649ec..02e48aa0ed13 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -736,7 +736,7 @@ again:
736 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); 736 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
737 if (!dst_pte) 737 if (!dst_pte)
738 return -ENOMEM; 738 return -ENOMEM;
739 src_pte = pte_offset_map_nested(src_pmd, addr); 739 src_pte = pte_offset_map(src_pmd, addr);
740 src_ptl = pte_lockptr(src_mm, src_pmd); 740 src_ptl = pte_lockptr(src_mm, src_pmd);
741 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 741 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
742 orig_src_pte = src_pte; 742 orig_src_pte = src_pte;
@@ -767,7 +767,7 @@ again:
767 767
768 arch_leave_lazy_mmu_mode(); 768 arch_leave_lazy_mmu_mode();
769 spin_unlock(src_ptl); 769 spin_unlock(src_ptl);
770 pte_unmap_nested(orig_src_pte); 770 pte_unmap(orig_src_pte);
771 add_mm_rss_vec(dst_mm, rss); 771 add_mm_rss_vec(dst_mm, rss);
772 pte_unmap_unlock(orig_dst_pte, dst_ptl); 772 pte_unmap_unlock(orig_dst_pte, dst_ptl);
773 cond_resched(); 773 cond_resched();
@@ -1450,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1450 if (ret & VM_FAULT_OOM) 1450 if (ret & VM_FAULT_OOM)
1451 return i ? i : -ENOMEM; 1451 return i ? i : -ENOMEM;
1452 if (ret & 1452 if (ret &
1453 (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) 1453 (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
1454 VM_FAULT_SIGBUS))
1454 return i ? i : -EFAULT; 1455 return i ? i : -EFAULT;
1455 BUG(); 1456 BUG();
1456 } 1457 }
@@ -1590,7 +1591,7 @@ struct page *get_dump_page(unsigned long addr)
1590} 1591}
1591#endif /* CONFIG_ELF_CORE */ 1592#endif /* CONFIG_ELF_CORE */
1592 1593
1593pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1594pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1594 spinlock_t **ptl) 1595 spinlock_t **ptl)
1595{ 1596{
1596 pgd_t * pgd = pgd_offset(mm, addr); 1597 pgd_t * pgd = pgd_offset(mm, addr);
@@ -2079,7 +2080,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
2079 * zeroes. 2080 * zeroes.
2080 */ 2081 */
2081 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 2082 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2082 memset(kaddr, 0, PAGE_SIZE); 2083 clear_page(kaddr);
2083 kunmap_atomic(kaddr, KM_USER0); 2084 kunmap_atomic(kaddr, KM_USER0);
2084 flush_dcache_page(dst); 2085 flush_dcache_page(dst);
2085 } else 2086 } else
@@ -2107,6 +2108,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
2107static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 2108static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2108 unsigned long address, pte_t *page_table, pmd_t *pmd, 2109 unsigned long address, pte_t *page_table, pmd_t *pmd,
2109 spinlock_t *ptl, pte_t orig_pte) 2110 spinlock_t *ptl, pte_t orig_pte)
2111 __releases(ptl)
2110{ 2112{
2111 struct page *old_page, *new_page; 2113 struct page *old_page, *new_page;
2112 pte_t entry; 2114 pte_t entry;
@@ -2626,6 +2628,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2626 struct page *page, *swapcache = NULL; 2628 struct page *page, *swapcache = NULL;
2627 swp_entry_t entry; 2629 swp_entry_t entry;
2628 pte_t pte; 2630 pte_t pte;
2631 int locked;
2629 struct mem_cgroup *ptr = NULL; 2632 struct mem_cgroup *ptr = NULL;
2630 int exclusive = 0; 2633 int exclusive = 0;
2631 int ret = 0; 2634 int ret = 0;
@@ -2676,8 +2679,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2676 goto out_release; 2679 goto out_release;
2677 } 2680 }
2678 2681
2679 lock_page(page); 2682 locked = lock_page_or_retry(page, mm, flags);
2680 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2683 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2684 if (!locked) {
2685 ret |= VM_FAULT_RETRY;
2686 goto out_release;
2687 }
2681 2688
2682 /* 2689 /*
2683 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not 2690 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
@@ -2926,7 +2933,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2926 vmf.page = NULL; 2933 vmf.page = NULL;
2927 2934
2928 ret = vma->vm_ops->fault(vma, &vmf); 2935 ret = vma->vm_ops->fault(vma, &vmf);
2929 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2936 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
2937 VM_FAULT_RETRY)))
2930 return ret; 2938 return ret;
2931 2939
2932 if (unlikely(PageHWPoison(vmf.page))) { 2940 if (unlikely(PageHWPoison(vmf.page))) {
@@ -3185,7 +3193,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
3185 * with threads. 3193 * with threads.
3186 */ 3194 */
3187 if (flags & FAULT_FLAG_WRITE) 3195 if (flags & FAULT_FLAG_WRITE)
3188 flush_tlb_page(vma, address); 3196 flush_tlb_fix_spurious_fault(vma, address);
3189 } 3197 }
3190unlock: 3198unlock:
3191 pte_unmap_unlock(pte, ptl); 3199 pte_unmap_unlock(pte, ptl);
@@ -3343,7 +3351,7 @@ int in_gate_area_no_task(unsigned long addr)
3343 3351
3344#endif /* __HAVE_ARCH_GATE_AREA */ 3352#endif /* __HAVE_ARCH_GATE_AREA */
3345 3353
3346static int follow_pte(struct mm_struct *mm, unsigned long address, 3354static int __follow_pte(struct mm_struct *mm, unsigned long address,
3347 pte_t **ptepp, spinlock_t **ptlp) 3355 pte_t **ptepp, spinlock_t **ptlp)
3348{ 3356{
3349 pgd_t *pgd; 3357 pgd_t *pgd;
@@ -3380,6 +3388,17 @@ out:
3380 return -EINVAL; 3388 return -EINVAL;
3381} 3389}
3382 3390
3391static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3392 pte_t **ptepp, spinlock_t **ptlp)
3393{
3394 int res;
3395
3396 /* (void) is needed to make gcc happy */
3397 (void) __cond_lock(*ptlp,
3398 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3399 return res;
3400}
3401
3383/** 3402/**
3384 * follow_pfn - look up PFN at a user virtual address 3403 * follow_pfn - look up PFN at a user virtual address
3385 * @vma: memory mapping 3404 * @vma: memory mapping
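
The __follow_pte()/follow_pte() split above is purely a sparse annotation trick: the worker does the lookup, and the inline wrapper uses __cond_lock() so the checker knows *ptlp is held exactly when 0 is returned. Below is a minimal sketch of the same idiom with made-up names (struct bucket, bucket_get_locked); it is not part of the patch.

    #include <linux/compiler.h>
    #include <linux/errno.h>
    #include <linux/spinlock.h>

    struct bucket {
            spinlock_t lock;
            int value;
    };

    /* worker: returns 0 with b->lock held, -ENOENT with it released */
    static int __bucket_get_locked(struct bucket *b, spinlock_t **ptlp)
    {
            spin_lock(&b->lock);
            if (b->value) {
                    *ptlp = &b->lock;
                    return 0;
            }
            spin_unlock(&b->lock);
            return -ENOENT;
    }

    /* wrapper: the (void) __cond_lock() tells sparse that the lock is
     * acquired exactly when the worker returned 0, mirroring follow_pte() */
    static inline int bucket_get_locked(struct bucket *b, spinlock_t **ptlp)
    {
            int res;

            (void) __cond_lock(*ptlp, !(res = __bucket_get_locked(b, ptlp)));
            return res;
    }
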
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index dd186c1a5d53..9260314a221e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -602,27 +602,14 @@ static struct page *next_active_pageblock(struct page *page)
602/* Checks if this range of memory is likely to be hot-removable. */ 602/* Checks if this range of memory is likely to be hot-removable. */
603int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) 603int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
604{ 604{
605 int type;
606 struct page *page = pfn_to_page(start_pfn); 605 struct page *page = pfn_to_page(start_pfn);
607 struct page *end_page = page + nr_pages; 606 struct page *end_page = page + nr_pages;
608 607
609 /* Check the starting page of each pageblock within the range */ 608 /* Check the starting page of each pageblock within the range */
610 for (; page < end_page; page = next_active_pageblock(page)) { 609 for (; page < end_page; page = next_active_pageblock(page)) {
611 type = get_pageblock_migratetype(page); 610 if (!is_pageblock_removable_nolock(page))
612
613 /*
614 * A pageblock containing MOVABLE or free pages is considered
615 * removable
616 */
617 if (type != MIGRATE_MOVABLE && !pageblock_free(page))
618 return 0;
619
620 /*
621 * A pageblock starting with a PageReserved page is not
622 * considered removable.
623 */
624 if (PageReserved(page))
625 return 0; 611 return 0;
612 cond_resched();
626 } 613 }
627 614
628 /* All pageblocks in the memory block are likely to be hot-removable */ 615 /* All pageblocks in the memory block are likely to be hot-removable */
@@ -659,7 +646,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
659 * Scanning pfn is much easier than scanning lru list. 646 * Scanning pfn is much easier than scanning lru list.
660 * Scan pfn from start to end and Find LRU page. 647 * Scan pfn from start to end and Find LRU page.
661 */ 648 */
662int scan_lru_pages(unsigned long start, unsigned long end) 649static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
663{ 650{
664 unsigned long pfn; 651 unsigned long pfn;
665 struct page *page; 652 struct page *page;
@@ -709,29 +696,30 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
709 page_is_file_cache(page)); 696 page_is_file_cache(page));
710 697
711 } else { 698 } else {
712 /* Becasue we don't have big zone->lock. we should
713 check this again here. */
714 if (page_count(page))
715 not_managed++;
716#ifdef CONFIG_DEBUG_VM 699#ifdef CONFIG_DEBUG_VM
717 printk(KERN_ALERT "removing pfn %lx from LRU failed\n", 700 printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
718 pfn); 701 pfn);
719 dump_page(page); 702 dump_page(page);
720#endif 703#endif
 704 /* Because we don't have the big zone->lock, we should
 705 check this again here. */
706 if (page_count(page)) {
707 not_managed++;
708 ret = -EBUSY;
709 break;
710 }
721 } 711 }
722 } 712 }
723 ret = -EBUSY; 713 if (!list_empty(&source)) {
724 if (not_managed) { 714 if (not_managed) {
725 if (!list_empty(&source)) 715 putback_lru_pages(&source);
716 goto out;
717 }
718 /* this function returns # of failed pages */
719 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
720 if (ret)
726 putback_lru_pages(&source); 721 putback_lru_pages(&source);
727 goto out;
728 } 722 }
729 ret = 0;
730 if (list_empty(&source))
731 goto out;
732 /* this function returns # of failed pages */
733 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
734
735out: 723out:
736 return ret; 724 return ret;
737} 725}
@@ -840,7 +828,6 @@ repeat:
840 ret = 0; 828 ret = 0;
841 if (drain) { 829 if (drain) {
842 lru_add_drain_all(); 830 lru_add_drain_all();
843 flush_scheduled_work();
844 cond_resched(); 831 cond_resched();
845 drain_all_pages(); 832 drain_all_pages();
846 } 833 }
@@ -862,7 +849,6 @@ repeat:
862 } 849 }
 863 /* drain all zone's lru pagevec, this is asynchronous... */ 850 /* drain all zone's lru pagevec, this is asynchronous... */
864 lru_add_drain_all(); 851 lru_add_drain_all();
865 flush_scheduled_work();
866 yield(); 852 yield();
 867 /* drain pcp pages, this is synchronous. */ 853 /* drain pcp pages, this is synchronous. */
868 drain_all_pages(); 854 drain_all_pages();
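
is_mem_section_removable() now delegates the per-pageblock decision to is_pageblock_removable_nolock() (added to page_alloc.c later in this diff) and adds a cond_resched() per pageblock, while do_migrate_range() reports -EBUSY for pinned pages and puts migration failures back itself. The answer surfaces in the memory block's sysfs "removable" attribute; the reader below is only an illustration, and the memory block number in the path is an arbitrary assumption.

    /* illustrative user-space check, not from the patch */
    #include <stdio.h>

    int main(void)
    {
            const char *path = "/sys/devices/system/memory/memory32/removable";
            FILE *f = fopen(path, "r");
            int removable = 0;

            if (!f)
                    return perror(path), 1;
            if (fscanf(f, "%d", &removable) != 1)
                    removable = 0;           /* treat parse failure as "no" */
            fclose(f);

            printf("%s -> %s\n", path,
                   removable ? "likely hot-removable" : "not removable");
            return 0;
    }
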
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f969da5dd8a2..4a57f135b76e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -924,15 +924,21 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
924 nodemask_t nmask; 924 nodemask_t nmask;
925 LIST_HEAD(pagelist); 925 LIST_HEAD(pagelist);
926 int err = 0; 926 int err = 0;
927 struct vm_area_struct *vma;
927 928
928 nodes_clear(nmask); 929 nodes_clear(nmask);
929 node_set(source, nmask); 930 node_set(source, nmask);
930 931
931 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, 932 vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
932 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 933 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
934 if (IS_ERR(vma))
935 return PTR_ERR(vma);
933 936
934 if (!list_empty(&pagelist)) 937 if (!list_empty(&pagelist)) {
935 err = migrate_pages(&pagelist, new_node_page, dest, 0); 938 err = migrate_pages(&pagelist, new_node_page, dest, 0);
939 if (err)
940 putback_lru_pages(&pagelist);
941 }
936 942
937 return err; 943 return err;
938} 944}
@@ -1147,9 +1153,12 @@ static long do_mbind(unsigned long start, unsigned long len,
1147 1153
1148 err = mbind_range(mm, start, end, new); 1154 err = mbind_range(mm, start, end, new);
1149 1155
1150 if (!list_empty(&pagelist)) 1156 if (!list_empty(&pagelist)) {
1151 nr_failed = migrate_pages(&pagelist, new_vma_page, 1157 nr_failed = migrate_pages(&pagelist, new_vma_page,
1152 (unsigned long)vma, 0); 1158 (unsigned long)vma, 0);
1159 if (nr_failed)
1160 putback_lru_pages(&pagelist);
1161 }
1153 1162
1154 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 1163 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1155 err = -EIO; 1164 err = -EIO;
@@ -1588,7 +1597,7 @@ unsigned slab_node(struct mempolicy *policy)
1588 (void)first_zones_zonelist(zonelist, highest_zoneidx, 1597 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1589 &policy->v.nodes, 1598 &policy->v.nodes,
1590 &zone); 1599 &zone);
1591 return zone->node; 1600 return zone ? zone->node : numa_node_id();
1592 } 1601 }
1593 1602
1594 default: 1603 default:
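
Both mempolicy.c call sites above adopt the new contract introduced by the migrate.c change later in this diff: migrate_pages() no longer drains its list on return, so pages that could not be migrated must be handed back with putback_lru_pages() by the caller. A condensed sketch of that caller-side pattern, using a hypothetical wrapper name:

    #include <linux/errno.h>
    #include <linux/list.h>
    #include <linux/migrate.h>

    /* hypothetical helper showing the post-series calling convention */
    static int migrate_isolated_list(struct list_head *pages,
                                     new_page_t get_new_page, unsigned long priv)
    {
            int nr_failed;

            if (list_empty(pages))
                    return 0;

            /* returns the number of pages not migrated, or -errno */
            nr_failed = migrate_pages(pages, get_new_page, priv, 0);
            if (nr_failed)
                    putback_lru_pages(pages);    /* now the caller's job */

            return nr_failed > 0 ? -EBUSY : nr_failed;
    }

migrate_to_node() and do_mbind() above, and do_move_page_to_node_array() later in this diff, all follow this shape.
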
diff --git a/mm/migrate.c b/mm/migrate.c
index 38e7cad782f4..fe5a3c6a5426 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,6 +32,7 @@
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/memcontrol.h> 33#include <linux/memcontrol.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/hugetlb.h>
35#include <linux/gfp.h> 36#include <linux/gfp.h>
36 37
37#include "internal.h" 38#include "internal.h"
@@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
95 pte_t *ptep, pte; 96 pte_t *ptep, pte;
96 spinlock_t *ptl; 97 spinlock_t *ptl;
97 98
98 pgd = pgd_offset(mm, addr); 99 if (unlikely(PageHuge(new))) {
99 if (!pgd_present(*pgd)) 100 ptep = huge_pte_offset(mm, addr);
100 goto out; 101 if (!ptep)
102 goto out;
103 ptl = &mm->page_table_lock;
104 } else {
105 pgd = pgd_offset(mm, addr);
106 if (!pgd_present(*pgd))
107 goto out;
101 108
102 pud = pud_offset(pgd, addr); 109 pud = pud_offset(pgd, addr);
103 if (!pud_present(*pud)) 110 if (!pud_present(*pud))
104 goto out; 111 goto out;
105 112
106 pmd = pmd_offset(pud, addr); 113 pmd = pmd_offset(pud, addr);
107 if (!pmd_present(*pmd)) 114 if (!pmd_present(*pmd))
108 goto out; 115 goto out;
109 116
110 ptep = pte_offset_map(pmd, addr); 117 ptep = pte_offset_map(pmd, addr);
111 118
112 if (!is_swap_pte(*ptep)) { 119 if (!is_swap_pte(*ptep)) {
113 pte_unmap(ptep); 120 pte_unmap(ptep);
114 goto out; 121 goto out;
115 } 122 }
123
124 ptl = pte_lockptr(mm, pmd);
125 }
116 126
117 ptl = pte_lockptr(mm, pmd);
118 spin_lock(ptl); 127 spin_lock(ptl);
119 pte = *ptep; 128 pte = *ptep;
120 if (!is_swap_pte(pte)) 129 if (!is_swap_pte(pte))
@@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
130 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 139 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
131 if (is_write_migration_entry(entry)) 140 if (is_write_migration_entry(entry))
132 pte = pte_mkwrite(pte); 141 pte = pte_mkwrite(pte);
142#ifdef CONFIG_HUGETLB_PAGE
143 if (PageHuge(new))
144 pte = pte_mkhuge(pte);
145#endif
133 flush_cache_page(vma, addr, pte_pfn(pte)); 146 flush_cache_page(vma, addr, pte_pfn(pte));
134 set_pte_at(mm, addr, ptep, pte); 147 set_pte_at(mm, addr, ptep, pte);
135 148
136 if (PageAnon(new)) 149 if (PageHuge(new)) {
150 if (PageAnon(new))
151 hugepage_add_anon_rmap(new, vma, addr);
152 else
153 page_dup_rmap(new);
154 } else if (PageAnon(new))
137 page_add_anon_rmap(new, vma, addr); 155 page_add_anon_rmap(new, vma, addr);
138 else 156 else
139 page_add_file_rmap(new); 157 page_add_file_rmap(new);
@@ -276,11 +294,59 @@ static int migrate_page_move_mapping(struct address_space *mapping,
276} 294}
277 295
278/* 296/*
297 * The expected number of remaining references is the same as that
298 * of migrate_page_move_mapping().
299 */
300int migrate_huge_page_move_mapping(struct address_space *mapping,
301 struct page *newpage, struct page *page)
302{
303 int expected_count;
304 void **pslot;
305
306 if (!mapping) {
307 if (page_count(page) != 1)
308 return -EAGAIN;
309 return 0;
310 }
311
312 spin_lock_irq(&mapping->tree_lock);
313
314 pslot = radix_tree_lookup_slot(&mapping->page_tree,
315 page_index(page));
316
317 expected_count = 2 + page_has_private(page);
318 if (page_count(page) != expected_count ||
319 (struct page *)radix_tree_deref_slot(pslot) != page) {
320 spin_unlock_irq(&mapping->tree_lock);
321 return -EAGAIN;
322 }
323
324 if (!page_freeze_refs(page, expected_count)) {
325 spin_unlock_irq(&mapping->tree_lock);
326 return -EAGAIN;
327 }
328
329 get_page(newpage);
330
331 radix_tree_replace_slot(pslot, newpage);
332
333 page_unfreeze_refs(page, expected_count);
334
335 __put_page(page);
336
337 spin_unlock_irq(&mapping->tree_lock);
338 return 0;
339}
340
341/*
279 * Copy the page to its new location 342 * Copy the page to its new location
280 */ 343 */
281static void migrate_page_copy(struct page *newpage, struct page *page) 344void migrate_page_copy(struct page *newpage, struct page *page)
282{ 345{
283 copy_highpage(newpage, page); 346 if (PageHuge(page))
347 copy_huge_page(newpage, page);
348 else
349 copy_highpage(newpage, page);
284 350
285 if (PageError(page)) 351 if (PageError(page))
286 SetPageError(newpage); 352 SetPageError(newpage);
@@ -431,7 +497,6 @@ static int writeout(struct address_space *mapping, struct page *page)
431 .nr_to_write = 1, 497 .nr_to_write = 1,
432 .range_start = 0, 498 .range_start = 0,
433 .range_end = LLONG_MAX, 499 .range_end = LLONG_MAX,
434 .nonblocking = 1,
435 .for_reclaim = 1 500 .for_reclaim = 1
436 }; 501 };
437 int rc; 502 int rc;
@@ -724,6 +789,92 @@ move_newpage:
724} 789}
725 790
726/* 791/*
792 * Counterpart of unmap_and_move_page() for hugepage migration.
793 *
 794 * This function doesn't wait for the completion of hugepage I/O
795 * because there is no race between I/O and migration for hugepage.
796 * Note that currently hugepage I/O occurs only in direct I/O
797 * where no lock is held and PG_writeback is irrelevant,
 798 * and the writeback status of all subpages is counted in the reference
799 * count of the head page (i.e. if all subpages of a 2MB hugepage are
800 * under direct I/O, the reference of the head page is 512 and a bit more.)
 801 * This means that when we try to migrate a hugepage whose subpages are
802 * doing direct I/O, some references remain after try_to_unmap() and
803 * hugepage migration fails without data corruption.
804 *
805 * There is also no race when direct I/O is issued on the page under migration,
806 * because then pte is replaced with migration swap entry and direct I/O code
807 * will wait in the page fault for migration to complete.
808 */
809static int unmap_and_move_huge_page(new_page_t get_new_page,
810 unsigned long private, struct page *hpage,
811 int force, int offlining)
812{
813 int rc = 0;
814 int *result = NULL;
815 struct page *new_hpage = get_new_page(hpage, private, &result);
816 int rcu_locked = 0;
817 struct anon_vma *anon_vma = NULL;
818
819 if (!new_hpage)
820 return -ENOMEM;
821
822 rc = -EAGAIN;
823
824 if (!trylock_page(hpage)) {
825 if (!force)
826 goto out;
827 lock_page(hpage);
828 }
829
830 if (PageAnon(hpage)) {
831 rcu_read_lock();
832 rcu_locked = 1;
833
834 if (page_mapped(hpage)) {
835 anon_vma = page_anon_vma(hpage);
836 atomic_inc(&anon_vma->external_refcount);
837 }
838 }
839
840 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
841
842 if (!page_mapped(hpage))
843 rc = move_to_new_page(new_hpage, hpage, 1);
844
845 if (rc)
846 remove_migration_ptes(hpage, hpage);
847
848 if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
849 &anon_vma->lock)) {
850 int empty = list_empty(&anon_vma->head);
851 spin_unlock(&anon_vma->lock);
852 if (empty)
853 anon_vma_free(anon_vma);
854 }
855
856 if (rcu_locked)
857 rcu_read_unlock();
858out:
859 unlock_page(hpage);
860
861 if (rc != -EAGAIN) {
862 list_del(&hpage->lru);
863 put_page(hpage);
864 }
865
866 put_page(new_hpage);
867
868 if (result) {
869 if (rc)
870 *result = rc;
871 else
872 *result = page_to_nid(new_hpage);
873 }
874 return rc;
875}
876
877/*
727 * migrate_pages 878 * migrate_pages
728 * 879 *
729 * The function takes one list of pages to migrate and a function 880 * The function takes one list of pages to migrate and a function
@@ -732,8 +883,9 @@ move_newpage:
732 * 883 *
733 * The function returns after 10 attempts or if no pages 884 * The function returns after 10 attempts or if no pages
734 * are movable anymore because to has become empty 885 * are movable anymore because to has become empty
735 * or no retryable pages exist anymore. All pages will be 886 * or no retryable pages exist anymore.
736 * returned to the LRU or freed. 887 * Caller should call putback_lru_pages to return pages to the LRU
888 * or free list.
737 * 889 *
738 * Return: Number of pages not migrated or error code. 890 * Return: Number of pages not migrated or error code.
739 */ 891 */
@@ -780,7 +932,51 @@ out:
780 if (!swapwrite) 932 if (!swapwrite)
781 current->flags &= ~PF_SWAPWRITE; 933 current->flags &= ~PF_SWAPWRITE;
782 934
783 putback_lru_pages(from); 935 if (rc)
936 return rc;
937
938 return nr_failed + retry;
939}
940
941int migrate_huge_pages(struct list_head *from,
942 new_page_t get_new_page, unsigned long private, int offlining)
943{
944 int retry = 1;
945 int nr_failed = 0;
946 int pass = 0;
947 struct page *page;
948 struct page *page2;
949 int rc;
950
951 for (pass = 0; pass < 10 && retry; pass++) {
952 retry = 0;
953
954 list_for_each_entry_safe(page, page2, from, lru) {
955 cond_resched();
956
957 rc = unmap_and_move_huge_page(get_new_page,
958 private, page, pass > 2, offlining);
959
960 switch(rc) {
961 case -ENOMEM:
962 goto out;
963 case -EAGAIN:
964 retry++;
965 break;
966 case 0:
967 break;
968 default:
969 /* Permanent failure */
970 nr_failed++;
971 break;
972 }
973 }
974 }
975 rc = 0;
976out:
977
978 list_for_each_entry_safe(page, page2, from, lru)
979 put_page(page);
784 980
785 if (rc) 981 if (rc)
786 return rc; 982 return rc;
@@ -841,7 +1037,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
841 1037
842 err = -EFAULT; 1038 err = -EFAULT;
843 vma = find_vma(mm, pp->addr); 1039 vma = find_vma(mm, pp->addr);
844 if (!vma || !vma_migratable(vma)) 1040 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
845 goto set_status; 1041 goto set_status;
846 1042
847 page = follow_page(vma, pp->addr, FOLL_GET); 1043 page = follow_page(vma, pp->addr, FOLL_GET);
@@ -890,9 +1086,12 @@ set_status:
890 } 1086 }
891 1087
892 err = 0; 1088 err = 0;
893 if (!list_empty(&pagelist)) 1089 if (!list_empty(&pagelist)) {
894 err = migrate_pages(&pagelist, new_page_node, 1090 err = migrate_pages(&pagelist, new_page_node,
895 (unsigned long)pm, 0); 1091 (unsigned long)pm, 0);
1092 if (err)
1093 putback_lru_pages(&pagelist);
1094 }
896 1095
897 up_read(&mm->mmap_sem); 1096 up_read(&mm->mmap_sem);
898 return err; 1097 return err;
@@ -1005,7 +1204,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1005 int err = -EFAULT; 1204 int err = -EFAULT;
1006 1205
1007 vma = find_vma(mm, addr); 1206 vma = find_vma(mm, addr);
1008 if (!vma) 1207 if (!vma || addr < vma->vm_start)
1009 goto set_status; 1208 goto set_status;
1010 1209
1011 page = follow_page(vma, addr, 0); 1210 page = follow_page(vma, addr, 0);
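
migrate_huge_pages() mirrors migrate_pages() for a list of hugepages but never touches the LRU; its only in-tree caller at this point is soft_offline_huge_page() earlier in this diff. The sketch below shows the calling convention with hypothetical helper names and assumes the alloc_huge_page_node() and page_hstate() helpers this series relies on; it is not a copy of the in-tree code.

    #include <linux/hugetlb.h>
    #include <linux/list.h>
    #include <linux/migrate.h>
    #include <linux/mm.h>

    /* new_page_t callback: allocate a replacement hugepage on a fixed node */
    static struct page *new_hugepage_on_node(struct page *page,
                                             unsigned long private, int **result)
    {
            return alloc_huge_page_node(page_hstate(compound_head(page)),
                                        (int)private);
    }

    /* hypothetical wrapper: migrate one isolated hugepage to node @nid */
    static int migrate_one_hugepage_to(struct page *hpage, int nid)
    {
            LIST_HEAD(pagelist);
            int ret;

            list_add(&hpage->lru, &pagelist);
            ret = migrate_huge_pages(&pagelist, new_hugepage_on_node,
                                     (unsigned long)nid, 0);
            if (ret)
                    putback_lru_pages(&pagelist);  /* caller cleans up on failure */
            return ret;
    }
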
diff --git a/mm/mmap.c b/mm/mmap.c
index 00161a48a451..b179abb1474a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,7 @@
28#include <linux/rmap.h> 28#include <linux/rmap.h>
29#include <linux/mmu_notifier.h> 29#include <linux/mmu_notifier.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/audit.h>
31 32
32#include <asm/uaccess.h> 33#include <asm/uaccess.h>
33#include <asm/cacheflush.h> 34#include <asm/cacheflush.h>
@@ -1108,6 +1109,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1108 unsigned long retval = -EBADF; 1109 unsigned long retval = -EBADF;
1109 1110
1110 if (!(flags & MAP_ANONYMOUS)) { 1111 if (!(flags & MAP_ANONYMOUS)) {
1112 audit_mmap_fd(fd, flags);
1111 if (unlikely(flags & MAP_HUGETLB)) 1113 if (unlikely(flags & MAP_HUGETLB))
1112 return -EINVAL; 1114 return -EINVAL;
1113 file = fget(fd); 1115 file = fget(fd);
diff --git a/mm/mremap.c b/mm/mremap.c
index cde56ee51ef7..563fbdd6293a 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -101,7 +101,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
101 * pte locks because exclusive mmap_sem prevents deadlock. 101 * pte locks because exclusive mmap_sem prevents deadlock.
102 */ 102 */
103 old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); 103 old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
104 new_pte = pte_offset_map_nested(new_pmd, new_addr); 104 new_pte = pte_offset_map(new_pmd, new_addr);
105 new_ptl = pte_lockptr(mm, new_pmd); 105 new_ptl = pte_lockptr(mm, new_pmd);
106 if (new_ptl != old_ptl) 106 if (new_ptl != old_ptl)
107 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 107 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
119 arch_leave_lazy_mmu_mode(); 119 arch_leave_lazy_mmu_mode();
120 if (new_ptl != old_ptl) 120 if (new_ptl != old_ptl)
121 spin_unlock(new_ptl); 121 spin_unlock(new_ptl);
122 pte_unmap_nested(new_pte - 1); 122 pte_unmap(new_pte - 1);
123 pte_unmap_unlock(old_pte - 1, old_ptl); 123 pte_unmap_unlock(old_pte - 1, old_ptl);
124 if (mapping) 124 if (mapping)
125 spin_unlock(&mapping->i_mmap_lock); 125 spin_unlock(&mapping->i_mmap_lock);
diff --git a/mm/nommu.c b/mm/nommu.c
index 88ff091eb07a..3613517c7592 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -29,6 +29,7 @@
29#include <linux/personality.h> 29#include <linux/personality.h>
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/audit.h>
32 33
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <asm/tlb.h> 35#include <asm/tlb.h>
@@ -293,11 +294,58 @@ void *vmalloc(unsigned long size)
293} 294}
294EXPORT_SYMBOL(vmalloc); 295EXPORT_SYMBOL(vmalloc);
295 296
297/*
 298 * vzalloc - allocate virtually contiguous memory with zero fill
299 *
300 * @size: allocation size
301 *
302 * Allocate enough pages to cover @size from the page level
 303 * allocator and map them into contiguous kernel virtual space.
304 * The memory allocated is set to zero.
305 *
306 * For tight control over page level allocator and protection flags
307 * use __vmalloc() instead.
308 */
309void *vzalloc(unsigned long size)
310{
311 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
312 PAGE_KERNEL);
313}
314EXPORT_SYMBOL(vzalloc);
315
316/**
317 * vmalloc_node - allocate memory on a specific node
318 * @size: allocation size
319 * @node: numa node
320 *
321 * Allocate enough pages to cover @size from the page level
322 * allocator and map them into contiguous kernel virtual space.
323 *
324 * For tight control over page level allocator and protection flags
325 * use __vmalloc() instead.
326 */
296void *vmalloc_node(unsigned long size, int node) 327void *vmalloc_node(unsigned long size, int node)
297{ 328{
298 return vmalloc(size); 329 return vmalloc(size);
299} 330}
300EXPORT_SYMBOL(vmalloc_node); 331
332/**
333 * vzalloc_node - allocate memory on a specific node with zero fill
334 * @size: allocation size
335 * @node: numa node
336 *
337 * Allocate enough pages to cover @size from the page level
338 * allocator and map them into contiguous kernel virtual space.
339 * The memory allocated is set to zero.
340 *
341 * For tight control over page level allocator and protection flags
342 * use __vmalloc() instead.
343 */
344void *vzalloc_node(unsigned long size, int node)
345{
346 return vzalloc(size);
347}
348EXPORT_SYMBOL(vzalloc_node);
301 349
302#ifndef PAGE_KERNEL_EXEC 350#ifndef PAGE_KERNEL_EXEC
303# define PAGE_KERNEL_EXEC PAGE_KERNEL 351# define PAGE_KERNEL_EXEC PAGE_KERNEL
@@ -1411,6 +1459,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1411 struct file *file = NULL; 1459 struct file *file = NULL;
1412 unsigned long retval = -EBADF; 1460 unsigned long retval = -EBADF;
1413 1461
1462 audit_mmap_fd(fd, flags);
1414 if (!(flags & MAP_ANONYMOUS)) { 1463 if (!(flags & MAP_ANONYMOUS)) {
1415 file = fget(fd); 1464 file = fget(fd);
1416 if (!file) 1465 if (!file)
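
vzalloc() and vzalloc_node() are thin convenience wrappers that replace the common vmalloc()-plus-memset() pair; the nommu variants above simply ignore the node hint. A minimal, hypothetical usage sketch:

    #include <linux/vmalloc.h>

    struct stats_table {
            unsigned long counters[4096];
    };

    static struct stats_table *alloc_stats(int nid)
    {
            /* zero-filled, virtually contiguous, may sleep */
            return vzalloc_node(sizeof(struct stats_table), nid);
    }

    static void free_stats(struct stats_table *t)
    {
            vfree(t);                    /* vfree(NULL) is a no-op */
    }
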
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4029583a1024..7dcca55ede7c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -162,10 +162,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
162 return 0; 162 return 0;
163 163
164 /* 164 /*
165 * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't 165 * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
166 * need to be executed for something that cannot be killed. 166 * so the entire heuristic doesn't need to be executed for something
167 * that cannot be killed.
167 */ 168 */
168 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { 169 if (atomic_read(&p->mm->oom_disable_count)) {
169 task_unlock(p); 170 task_unlock(p);
170 return 0; 171 return 0;
171 } 172 }
@@ -403,16 +404,40 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
403#define K(x) ((x) << (PAGE_SHIFT-10)) 404#define K(x) ((x) << (PAGE_SHIFT-10))
404static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) 405static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
405{ 406{
407 struct task_struct *q;
408 struct mm_struct *mm;
409
406 p = find_lock_task_mm(p); 410 p = find_lock_task_mm(p);
407 if (!p) 411 if (!p)
408 return 1; 412 return 1;
409 413
414 /* mm cannot be safely dereferenced after task_unlock(p) */
415 mm = p->mm;
416
410 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 417 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
411 task_pid_nr(p), p->comm, K(p->mm->total_vm), 418 task_pid_nr(p), p->comm, K(p->mm->total_vm),
412 K(get_mm_counter(p->mm, MM_ANONPAGES)), 419 K(get_mm_counter(p->mm, MM_ANONPAGES)),
413 K(get_mm_counter(p->mm, MM_FILEPAGES))); 420 K(get_mm_counter(p->mm, MM_FILEPAGES)));
414 task_unlock(p); 421 task_unlock(p);
415 422
423 /*
424 * Kill all processes sharing p->mm in other thread groups, if any.
425 * They don't get access to memory reserves or a higher scheduler
426 * priority, though, to avoid depletion of all memory or task
427 * starvation. This prevents mm->mmap_sem livelock when an oom killed
 428 * task cannot exit because it requires the semaphore and it's contended
429 * by another thread trying to allocate memory itself. That thread will
430 * now get access to memory reserves since it has a pending fatal
431 * signal.
432 */
433 for_each_process(q)
434 if (q->mm == mm && !same_thread_group(q, p)) {
435 task_lock(q); /* Protect ->comm from prctl() */
436 pr_err("Kill process %d (%s) sharing same memory\n",
437 task_pid_nr(q), q->comm);
438 task_unlock(q);
439 force_sig(SIGKILL, q);
440 }
416 441
417 set_tsk_thread_flag(p, TIF_MEMDIE); 442 set_tsk_thread_flag(p, TIF_MEMDIE);
418 force_sig(SIGKILL, p); 443 force_sig(SIGKILL, p);
@@ -680,7 +705,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
680 read_lock(&tasklist_lock); 705 read_lock(&tasklist_lock);
681 if (sysctl_oom_kill_allocating_task && 706 if (sysctl_oom_kill_allocating_task &&
682 !oom_unkillable_task(current, NULL, nodemask) && 707 !oom_unkillable_task(current, NULL, nodemask) &&
683 (current->signal->oom_adj != OOM_DISABLE)) { 708 current->mm && !atomic_read(&current->mm->oom_disable_count)) {
684 /* 709 /*
685 * oom_kill_process() needs tasklist_lock held. If it returns 710 * oom_kill_process() needs tasklist_lock held. If it returns
686 * non-zero, current could not be killed so we must fallback to 711 * non-zero, current could not be killed so we must fallback to
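
oom_badness() and out_of_memory() now key off mm->oom_disable_count, a per-mm counter (maintained outside mm/ by the companion patch) of threads whose oom_score_adj is OOM_SCORE_ADJ_MIN, so an mm shared with an unkillable thread is skipped even if the candidate task itself looks killable. From user space the counter is driven through /proc/<pid>/oom_score_adj; the sketch below is illustrative only, and lowering the value needs CAP_SYS_RESOURCE.

    /* illustrative sketch, not from the patch */
    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/self/oom_score_adj", "w");

            if (!f)
                    return perror("oom_score_adj"), 1;

            /* -1000 == OOM_SCORE_ADJ_MIN: this mm is never chosen by the
             * OOM killer, and every thread sharing it is protected too */
            fprintf(f, "%d\n", -1000);
            fclose(f);

            /* ... run the work that must survive memory pressure ... */
            return 0;
    }
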
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e3bccac1f025..b840afa89761 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -415,14 +415,8 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
415 415
416 if (vm_dirty_bytes) 416 if (vm_dirty_bytes)
417 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); 417 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
418 else { 418 else
419 int dirty_ratio; 419 dirty = (vm_dirty_ratio * available_memory) / 100;
420
421 dirty_ratio = vm_dirty_ratio;
422 if (dirty_ratio < 5)
423 dirty_ratio = 5;
424 dirty = (dirty_ratio * available_memory) / 100;
425 }
426 420
427 if (dirty_background_bytes) 421 if (dirty_background_bytes)
428 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); 422 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
@@ -510,7 +504,7 @@ static void balance_dirty_pages(struct address_space *mapping,
510 * catch-up. This avoids (excessively) small writeouts 504 * catch-up. This avoids (excessively) small writeouts
511 * when the bdi limits are ramping up. 505 * when the bdi limits are ramping up.
512 */ 506 */
513 if (nr_reclaimable + nr_writeback < 507 if (nr_reclaimable + nr_writeback <=
514 (background_thresh + dirty_thresh) / 2) 508 (background_thresh + dirty_thresh) / 2)
515 break; 509 break;
516 510
@@ -542,8 +536,8 @@ static void balance_dirty_pages(struct address_space *mapping,
542 * the last resort safeguard. 536 * the last resort safeguard.
543 */ 537 */
544 dirty_exceeded = 538 dirty_exceeded =
545 (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh) 539 (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
546 || (nr_reclaimable + nr_writeback >= dirty_thresh); 540 || (nr_reclaimable + nr_writeback > dirty_thresh);
547 541
548 if (!dirty_exceeded) 542 if (!dirty_exceeded)
549 break; 543 break;
@@ -1121,6 +1115,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
1121{ 1115{
1122 if (mapping_cap_account_dirty(mapping)) { 1116 if (mapping_cap_account_dirty(mapping)) {
1123 __inc_zone_page_state(page, NR_FILE_DIRTY); 1117 __inc_zone_page_state(page, NR_FILE_DIRTY);
1118 __inc_zone_page_state(page, NR_DIRTIED);
1124 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 1119 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1125 task_dirty_inc(current); 1120 task_dirty_inc(current);
1126 task_io_account_write(PAGE_CACHE_SIZE); 1121 task_io_account_write(PAGE_CACHE_SIZE);
@@ -1129,6 +1124,18 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
1129EXPORT_SYMBOL(account_page_dirtied); 1124EXPORT_SYMBOL(account_page_dirtied);
1130 1125
1131/* 1126/*
1127 * Helper function for set_page_writeback family.
1128 * NOTE: Unlike account_page_dirtied this does not rely on being atomic
1129 * wrt interrupts.
1130 */
1131void account_page_writeback(struct page *page)
1132{
1133 inc_zone_page_state(page, NR_WRITEBACK);
1134 inc_zone_page_state(page, NR_WRITTEN);
1135}
1136EXPORT_SYMBOL(account_page_writeback);
1137
1138/*
1132 * For address_spaces which do not use buffers. Just tag the page as dirty in 1139 * For address_spaces which do not use buffers. Just tag the page as dirty in
1133 * its radix tree. 1140 * its radix tree.
1134 * 1141 *
@@ -1366,7 +1373,7 @@ int test_set_page_writeback(struct page *page)
1366 ret = TestSetPageWriteback(page); 1373 ret = TestSetPageWriteback(page);
1367 } 1374 }
1368 if (!ret) 1375 if (!ret)
1369 inc_zone_page_state(page, NR_WRITEBACK); 1376 account_page_writeback(page);
1370 return ret; 1377 return ret;
1371 1378
1372} 1379}
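
global_dirty_limits() above drops the silent 5% floor on vm.dirty_ratio, so small ratios finally take effect, and the comparison changes (<= for the early break, strict > for dirty_exceeded) let a zero threshold still throttle writers. The tiny user-space calculation below just illustrates the before/after arithmetic for an assumed 4GB of dirtyable memory; it is not kernel code.

    #include <stdio.h>

    int main(void)
    {
            unsigned long available_pages = 1UL << 20;  /* ~4GB of 4KB pages */
            unsigned int dirty_ratio = 1;               /* vm.dirty_ratio = 1 */

            /* old behaviour: ratio silently clamped to at least 5% */
            unsigned long old_limit =
                    ((dirty_ratio < 5 ? 5 : dirty_ratio) * available_pages) / 100;
            /* new behaviour: the configured ratio is used as-is */
            unsigned long new_limit = (dirty_ratio * available_pages) / 100;

            printf("old dirty limit: %lu pages, new dirty limit: %lu pages\n",
                   old_limit, new_limit);
            return 0;
    }
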
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f12ad1836abe..07a654486f75 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -21,6 +21,7 @@
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/jiffies.h> 22#include <linux/jiffies.h>
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/memblock.h>
24#include <linux/compiler.h> 25#include <linux/compiler.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
26#include <linux/kmemcheck.h> 27#include <linux/kmemcheck.h>
@@ -530,7 +531,7 @@ static inline void __free_one_page(struct page *page,
530 * so it's less likely to be used soon and more likely to be merged 531 * so it's less likely to be used soon and more likely to be merged
531 * as a higher order page 532 * as a higher order page
532 */ 533 */
533 if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { 534 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
534 struct page *higher_page, *higher_buddy; 535 struct page *higher_page, *higher_buddy;
535 combined_idx = __find_combined_index(page_idx, order); 536 combined_idx = __find_combined_index(page_idx, order);
536 higher_page = page + combined_idx - page_idx; 537 higher_page = page + combined_idx - page_idx;
@@ -1906,7 +1907,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1906 preferred_zone, migratetype); 1907 preferred_zone, migratetype);
1907 1908
1908 if (!page && gfp_mask & __GFP_NOFAIL) 1909 if (!page && gfp_mask & __GFP_NOFAIL)
1909 congestion_wait(BLK_RW_ASYNC, HZ/50); 1910 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
1910 } while (!page && (gfp_mask & __GFP_NOFAIL)); 1911 } while (!page && (gfp_mask & __GFP_NOFAIL));
1911 1912
1912 return page; 1913 return page;
@@ -1931,7 +1932,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1931 const gfp_t wait = gfp_mask & __GFP_WAIT; 1932 const gfp_t wait = gfp_mask & __GFP_WAIT;
1932 1933
1933 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 1934 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
1934 BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); 1935 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
1935 1936
1936 /* 1937 /*
1937 * The caller may dip into page reserves a bit more if the caller 1938 * The caller may dip into page reserves a bit more if the caller
@@ -1939,7 +1940,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1939 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 1940 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1940 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 1941 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1941 */ 1942 */
1942 alloc_flags |= (gfp_mask & __GFP_HIGH); 1943 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
1943 1944
1944 if (!wait) { 1945 if (!wait) {
1945 alloc_flags |= ALLOC_HARDER; 1946 alloc_flags |= ALLOC_HARDER;
@@ -2094,7 +2095,7 @@ rebalance:
2094 pages_reclaimed += did_some_progress; 2095 pages_reclaimed += did_some_progress;
2095 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { 2096 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
2096 /* Wait for some write requests to complete then retry */ 2097 /* Wait for some write requests to complete then retry */
2097 congestion_wait(BLK_RW_ASYNC, HZ/50); 2098 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2098 goto rebalance; 2099 goto rebalance;
2099 } 2100 }
2100 2101
@@ -3636,6 +3637,41 @@ void __init free_bootmem_with_active_regions(int nid,
3636 } 3637 }
3637} 3638}
3638 3639
3640#ifdef CONFIG_HAVE_MEMBLOCK
3641u64 __init find_memory_core_early(int nid, u64 size, u64 align,
3642 u64 goal, u64 limit)
3643{
3644 int i;
3645
3646 /* Need to go over early_node_map to find out good range for node */
3647 for_each_active_range_index_in_nid(i, nid) {
3648 u64 addr;
3649 u64 ei_start, ei_last;
3650 u64 final_start, final_end;
3651
3652 ei_last = early_node_map[i].end_pfn;
3653 ei_last <<= PAGE_SHIFT;
3654 ei_start = early_node_map[i].start_pfn;
3655 ei_start <<= PAGE_SHIFT;
3656
3657 final_start = max(ei_start, goal);
3658 final_end = min(ei_last, limit);
3659
3660 if (final_start >= final_end)
3661 continue;
3662
3663 addr = memblock_find_in_range(final_start, final_end, size, align);
3664
3665 if (addr == MEMBLOCK_ERROR)
3666 continue;
3667
3668 return addr;
3669 }
3670
3671 return MEMBLOCK_ERROR;
3672}
3673#endif
3674
3639int __init add_from_early_node_map(struct range *range, int az, 3675int __init add_from_early_node_map(struct range *range, int az,
3640 int nr_range, int nid) 3676 int nr_range, int nid)
3641{ 3677{
@@ -3655,46 +3691,26 @@ int __init add_from_early_node_map(struct range *range, int az,
3655void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, 3691void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
3656 u64 goal, u64 limit) 3692 u64 goal, u64 limit)
3657{ 3693{
3658 int i;
3659 void *ptr; 3694 void *ptr;
3695 u64 addr;
3660 3696
3661 if (limit > get_max_mapped()) 3697 if (limit > memblock.current_limit)
3662 limit = get_max_mapped(); 3698 limit = memblock.current_limit;
3663 3699
3664 /* need to go over early_node_map to find out good range for node */ 3700 addr = find_memory_core_early(nid, size, align, goal, limit);
3665 for_each_active_range_index_in_nid(i, nid) {
3666 u64 addr;
3667 u64 ei_start, ei_last;
3668 3701
3669 ei_last = early_node_map[i].end_pfn; 3702 if (addr == MEMBLOCK_ERROR)
3670 ei_last <<= PAGE_SHIFT; 3703 return NULL;
3671 ei_start = early_node_map[i].start_pfn;
3672 ei_start <<= PAGE_SHIFT;
3673 addr = find_early_area(ei_start, ei_last,
3674 goal, limit, size, align);
3675
3676 if (addr == -1ULL)
3677 continue;
3678
3679#if 0
3680 printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
3681 nid,
3682 ei_start, ei_last, goal, limit, size,
3683 align, addr);
3684#endif
3685
3686 ptr = phys_to_virt(addr);
3687 memset(ptr, 0, size);
3688 reserve_early_without_check(addr, addr + size, "BOOTMEM");
3689 /*
3690 * The min_count is set to 0 so that bootmem allocated blocks
3691 * are never reported as leaks.
3692 */
3693 kmemleak_alloc(ptr, size, 0, 0);
3694 return ptr;
3695 }
3696 3704
3697 return NULL; 3705 ptr = phys_to_virt(addr);
3706 memset(ptr, 0, size);
3707 memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
3708 /*
3709 * The min_count is set to 0 so that bootmem allocated blocks
3710 * are never reported as leaks.
3711 */
3712 kmemleak_alloc(ptr, size, 0, 0);
3713 return ptr;
3698} 3714}
3699#endif 3715#endif
3700 3716
@@ -5281,12 +5297,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
 5281 * page allocator never allocates memory from an ISOLATE block. 5297 * page allocator never allocates memory from an ISOLATE block.
5282 */ 5298 */
5283 5299
5300static int
5301__count_immobile_pages(struct zone *zone, struct page *page, int count)
5302{
5303 unsigned long pfn, iter, found;
5304 /*
5305 * For avoiding noise data, lru_add_drain_all() should be called
5306 * If ZONE_MOVABLE, the zone never contains immobile pages
5307 */
5308 if (zone_idx(zone) == ZONE_MOVABLE)
5309 return true;
5310
5311 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
5312 return true;
5313
5314 pfn = page_to_pfn(page);
5315 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5316 unsigned long check = pfn + iter;
5317
5318 if (!pfn_valid_within(check)) {
5319 iter++;
5320 continue;
5321 }
5322 page = pfn_to_page(check);
5323 if (!page_count(page)) {
5324 if (PageBuddy(page))
5325 iter += (1 << page_order(page)) - 1;
5326 continue;
5327 }
5328 if (!PageLRU(page))
5329 found++;
5330 /*
 5331 * If there are RECLAIMABLE pages, we need to check them.
 5332 * But now, memory offline itself doesn't call shrink_slab()
 5333 * and that still needs to be fixed.
5334 */
5335 /*
 5336 * If the page is not RAM, page_count() should be 0.
 5337 * We don't need any further check. This is a _used_, not-movable page.
5338 *
5339 * The problematic thing here is PG_reserved pages. PG_reserved
5340 * is set to both of a memory hole page and a _used_ kernel
5341 * page at boot.
5342 */
5343 if (found > count)
5344 return false;
5345 }
5346 return true;
5347}
5348
5349bool is_pageblock_removable_nolock(struct page *page)
5350{
5351 struct zone *zone = page_zone(page);
5352 return __count_immobile_pages(zone, page, 0);
5353}
5354
5284int set_migratetype_isolate(struct page *page) 5355int set_migratetype_isolate(struct page *page)
5285{ 5356{
5286 struct zone *zone; 5357 struct zone *zone;
5287 struct page *curr_page; 5358 unsigned long flags, pfn;
5288 unsigned long flags, pfn, iter;
5289 unsigned long immobile = 0;
5290 struct memory_isolate_notify arg; 5359 struct memory_isolate_notify arg;
5291 int notifier_ret; 5360 int notifier_ret;
5292 int ret = -EBUSY; 5361 int ret = -EBUSY;
@@ -5296,11 +5365,6 @@ int set_migratetype_isolate(struct page *page)
5296 zone_idx = zone_idx(zone); 5365 zone_idx = zone_idx(zone);
5297 5366
5298 spin_lock_irqsave(&zone->lock, flags); 5367 spin_lock_irqsave(&zone->lock, flags);
5299 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
5300 zone_idx == ZONE_MOVABLE) {
5301 ret = 0;
5302 goto out;
5303 }
5304 5368
5305 pfn = page_to_pfn(page); 5369 pfn = page_to_pfn(page);
5306 arg.start_pfn = pfn; 5370 arg.start_pfn = pfn;
@@ -5320,23 +5384,20 @@ int set_migratetype_isolate(struct page *page)
5320 */ 5384 */
5321 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); 5385 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5322 notifier_ret = notifier_to_errno(notifier_ret); 5386 notifier_ret = notifier_to_errno(notifier_ret);
5323 if (notifier_ret || !arg.pages_found) 5387 if (notifier_ret)
5324 goto out; 5388 goto out;
5325 5389 /*
5326 for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { 5390 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
5327 if (!pfn_valid_within(pfn)) 5391 * We just check MOVABLE pages.
5328 continue; 5392 */
5329 5393 if (__count_immobile_pages(zone, page, arg.pages_found))
5330 curr_page = pfn_to_page(iter);
5331 if (!page_count(curr_page) || PageLRU(curr_page))
5332 continue;
5333
5334 immobile++;
5335 }
5336
5337 if (arg.pages_found == immobile)
5338 ret = 0; 5394 ret = 0;
5339 5395
5396 /*
5397 * immobile means "not-on-lru" paes. If immobile is larger than
5398 * removable-by-driver pages reported by notifier, we'll fail.
5399 */
5400
5340out: 5401out:
5341 if (!ret) { 5402 if (!ret) {
5342 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5403 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
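
Beyond the wait_iff_congested() and buddy-merge changes, the two __force casts in gfp_to_alloc_flags() are sparse hygiene: gfp_t is a __bitwise type, so comparing or OR-ing it with plain int flags needs an explicit cast once the numeric identity has been asserted. A standalone sketch of the idiom with a hypothetical flag name (not from the patch):

    #include <linux/gfp.h>
    #include <linux/kernel.h>

    #define MY_ALLOC_HIGH 0x20           /* deliberately equal to __GFP_HIGH */

    static int my_gfp_to_flags(gfp_t gfp_mask)
    {
            int flags = 0;

            /* compile-time proof that the bit values really line up */
            BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) MY_ALLOC_HIGH);

            /* __force: the representation is known to match, keep sparse quiet */
            flags |= (__force int) (gfp_mask & __GFP_HIGH);
            return flags;
    }
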
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 5e0ffd967452..4ae42bb40892 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -86,7 +86,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
86 * all pages in [start_pfn...end_pfn) must be in the same zone. 86 * all pages in [start_pfn...end_pfn) must be in the same zone.
87 * zone->lock must be held before call this. 87 * zone->lock must be held before call this.
88 * 88 *
89 * Returns 0 if all pages in the range is isolated. 89 * Returns 1 if all pages in the range is isolated.
90 */ 90 */
91static int 91static int
92__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) 92__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
@@ -119,7 +119,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
119 struct zone *zone; 119 struct zone *zone;
120 int ret; 120 int ret;
121 121
122 pfn = start_pfn;
123 /* 122 /*
124 * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page 123 * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page
125 * is not aligned to pageblock_nr_pages. 124 * is not aligned to pageblock_nr_pages.
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index df680855540a..89633fefc6a2 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -27,7 +27,7 @@
27 * chunk size is not aligned. percpu-km code will whine about it. 27 * chunk size is not aligned. percpu-km code will whine about it.
28 */ 28 */
29 29
30#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 30#if defined(CONFIG_SMP) && defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
31#error "contiguous percpu allocation is incompatible with paged first chunk" 31#error "contiguous percpu allocation is incompatible with paged first chunk"
32#endif 32#endif
33 33
@@ -35,7 +35,11 @@
35 35
36static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 36static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
37{ 37{
38 /* noop */ 38 unsigned int cpu;
39
40 for_each_possible_cpu(cpu)
41 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
42
39 return 0; 43 return 0;
40} 44}
41 45
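
With pcpu_populate_chunk() clearing the area for every possible CPU, the km backend now matches the vmalloc-based one, and the updated __alloc_percpu() kerneldoc later in this diff can promise zero-filled allocations even when a chunk is reused. A hypothetical usage sketch relying on that guarantee:

    #include <linux/errno.h>
    #include <linux/percpu.h>

    struct hit_stats {
            unsigned long hits;
            unsigned long misses;
    };

    static struct hit_stats __percpu *stats;

    static int stats_init(void)
    {
            stats = alloc_percpu(struct hit_stats);  /* zero-filled per this series */
            return stats ? 0 : -ENOMEM;
    }

    static void stats_note_hit(void)
    {
            this_cpu_inc(stats->hits);               /* no explicit memset needed */
    }
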
diff --git a/mm/percpu.c b/mm/percpu.c
index c76ef3891e0d..efe816856a9d 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -31,7 +31,7 @@
31 * as small as 4 bytes. The allocator organizes chunks into lists 31 * as small as 4 bytes. The allocator organizes chunks into lists
32 * according to free size and tries to allocate from the fullest one. 32 * according to free size and tries to allocate from the fullest one.
33 * Each chunk keeps the maximum contiguous area size hint which is 33 * Each chunk keeps the maximum contiguous area size hint which is
34 * guaranteed to be eqaul to or larger than the maximum contiguous 34 * guaranteed to be equal to or larger than the maximum contiguous
35 * area in the chunk. This helps the allocator not to iterate the 35 * area in the chunk. This helps the allocator not to iterate the
36 * chunk maps unnecessarily. 36 * chunk maps unnecessarily.
37 * 37 *
@@ -76,6 +76,7 @@
76#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 76#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
77#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 77#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
78 78
79#ifdef CONFIG_SMP
79/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ 80/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
80#ifndef __addr_to_pcpu_ptr 81#ifndef __addr_to_pcpu_ptr
81#define __addr_to_pcpu_ptr(addr) \ 82#define __addr_to_pcpu_ptr(addr) \
@@ -89,6 +90,11 @@
89 (unsigned long)pcpu_base_addr - \ 90 (unsigned long)pcpu_base_addr - \
90 (unsigned long)__per_cpu_start) 91 (unsigned long)__per_cpu_start)
91#endif 92#endif
93#else /* CONFIG_SMP */
94/* on UP, it's always identity mapped */
95#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr)
96#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr)
97#endif /* CONFIG_SMP */
92 98
93struct pcpu_chunk { 99struct pcpu_chunk {
94 struct list_head list; /* linked to pcpu_slot lists */ 100 struct list_head list; /* linked to pcpu_slot lists */
@@ -820,8 +826,8 @@ fail_unlock_mutex:
820 * @size: size of area to allocate in bytes 826 * @size: size of area to allocate in bytes
821 * @align: alignment of area (max PAGE_SIZE) 827 * @align: alignment of area (max PAGE_SIZE)
822 * 828 *
823 * Allocate percpu area of @size bytes aligned at @align. Might 829 * Allocate zero-filled percpu area of @size bytes aligned at @align.
824 * sleep. Might trigger writeouts. 830 * Might sleep. Might trigger writeouts.
825 * 831 *
826 * CONTEXT: 832 * CONTEXT:
827 * Does GFP_KERNEL allocation. 833 * Does GFP_KERNEL allocation.
@@ -840,9 +846,10 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
840 * @size: size of area to allocate in bytes 846 * @size: size of area to allocate in bytes
841 * @align: alignment of area (max PAGE_SIZE) 847 * @align: alignment of area (max PAGE_SIZE)
842 * 848 *
843 * Allocate percpu area of @size bytes aligned at @align from reserved 849 * Allocate zero-filled percpu area of @size bytes aligned at @align
844 * percpu area if arch has set it up; otherwise, allocation is served 850 * from reserved percpu area if arch has set it up; otherwise,
845 * from the same dynamic area. Might sleep. Might trigger writeouts. 851 * allocation is served from the same dynamic area. Might sleep.
852 * Might trigger writeouts.
846 * 853 *
847 * CONTEXT: 854 * CONTEXT:
848 * Does GFP_KERNEL allocation. 855 * Does GFP_KERNEL allocation.
@@ -949,6 +956,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
949 */ 956 */
950bool is_kernel_percpu_address(unsigned long addr) 957bool is_kernel_percpu_address(unsigned long addr)
951{ 958{
959#ifdef CONFIG_SMP
952 const size_t static_size = __per_cpu_end - __per_cpu_start; 960 const size_t static_size = __per_cpu_end - __per_cpu_start;
953 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); 961 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
954 unsigned int cpu; 962 unsigned int cpu;
@@ -959,6 +967,8 @@ bool is_kernel_percpu_address(unsigned long addr)
959 if ((void *)addr >= start && (void *)addr < start + static_size) 967 if ((void *)addr >= start && (void *)addr < start + static_size)
960 return true; 968 return true;
961 } 969 }
970#endif
971 /* on UP, can't distinguish from other static vars, always false */
962 return false; 972 return false;
963} 973}
964 974
@@ -1067,161 +1077,6 @@ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1067} 1077}
1068 1078
1069/** 1079/**
1070 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
1071 * @reserved_size: the size of reserved percpu area in bytes
1072 * @dyn_size: minimum free size for dynamic allocation in bytes
1073 * @atom_size: allocation atom size
1074 * @cpu_distance_fn: callback to determine distance between cpus, optional
1075 *
1076 * This function determines grouping of units, their mappings to cpus
1077 * and other parameters considering needed percpu size, allocation
1078 * atom size and distances between CPUs.
1079 *
1080 * Groups are always mutliples of atom size and CPUs which are of
1081 * LOCAL_DISTANCE both ways are grouped together and share space for
1082 * units in the same group. The returned configuration is guaranteed
1083 * to have CPUs on different nodes on different groups and >=75% usage
1084 * of allocated virtual address space.
1085 *
1086 * RETURNS:
1087 * On success, pointer to the new allocation_info is returned. On
1088 * failure, ERR_PTR value is returned.
1089 */
1090static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1091 size_t reserved_size, size_t dyn_size,
1092 size_t atom_size,
1093 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1094{
1095 static int group_map[NR_CPUS] __initdata;
1096 static int group_cnt[NR_CPUS] __initdata;
1097 const size_t static_size = __per_cpu_end - __per_cpu_start;
1098 int nr_groups = 1, nr_units = 0;
1099 size_t size_sum, min_unit_size, alloc_size;
1100 int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
1101 int last_allocs, group, unit;
1102 unsigned int cpu, tcpu;
1103 struct pcpu_alloc_info *ai;
1104 unsigned int *cpu_map;
1105
1106 /* this function may be called multiple times */
1107 memset(group_map, 0, sizeof(group_map));
1108 memset(group_cnt, 0, sizeof(group_cnt));
1109
1110 /* calculate size_sum and ensure dyn_size is enough for early alloc */
1111 size_sum = PFN_ALIGN(static_size + reserved_size +
1112 max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
1113 dyn_size = size_sum - static_size - reserved_size;
1114
1115 /*
1116 * Determine min_unit_size, alloc_size and max_upa such that
1117 * alloc_size is multiple of atom_size and is the smallest
1118 * which can accomodate 4k aligned segments which are equal to
1119 * or larger than min_unit_size.
1120 */
1121 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1122
1123 alloc_size = roundup(min_unit_size, atom_size);
1124 upa = alloc_size / min_unit_size;
1125 while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1126 upa--;
1127 max_upa = upa;
1128
1129 /* group cpus according to their proximity */
1130 for_each_possible_cpu(cpu) {
1131 group = 0;
1132 next_group:
1133 for_each_possible_cpu(tcpu) {
1134 if (cpu == tcpu)
1135 break;
1136 if (group_map[tcpu] == group && cpu_distance_fn &&
1137 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1138 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1139 group++;
1140 nr_groups = max(nr_groups, group + 1);
1141 goto next_group;
1142 }
1143 }
1144 group_map[cpu] = group;
1145 group_cnt[group]++;
1146 }
1147
1148 /*
1149 * Expand unit size until address space usage goes over 75%
1150 * and then as much as possible without using more address
1151 * space.
1152 */
1153 last_allocs = INT_MAX;
1154 for (upa = max_upa; upa; upa--) {
1155 int allocs = 0, wasted = 0;
1156
1157 if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1158 continue;
1159
1160 for (group = 0; group < nr_groups; group++) {
1161 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1162 allocs += this_allocs;
1163 wasted += this_allocs * upa - group_cnt[group];
1164 }
1165
1166 /*
1167 * Don't accept if wastage is over 1/3. The
1168 * greater-than comparison ensures upa==1 always
1169 * passes the following check.
1170 */
1171 if (wasted > num_possible_cpus() / 3)
1172 continue;
1173
1174 /* and then don't consume more memory */
1175 if (allocs > last_allocs)
1176 break;
1177 last_allocs = allocs;
1178 best_upa = upa;
1179 }
1180 upa = best_upa;
1181
1182 /* allocate and fill alloc_info */
1183 for (group = 0; group < nr_groups; group++)
1184 nr_units += roundup(group_cnt[group], upa);
1185
1186 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1187 if (!ai)
1188 return ERR_PTR(-ENOMEM);
1189 cpu_map = ai->groups[0].cpu_map;
1190
1191 for (group = 0; group < nr_groups; group++) {
1192 ai->groups[group].cpu_map = cpu_map;
1193 cpu_map += roundup(group_cnt[group], upa);
1194 }
1195
1196 ai->static_size = static_size;
1197 ai->reserved_size = reserved_size;
1198 ai->dyn_size = dyn_size;
1199 ai->unit_size = alloc_size / upa;
1200 ai->atom_size = atom_size;
1201 ai->alloc_size = alloc_size;
1202
1203 for (group = 0, unit = 0; group_cnt[group]; group++) {
1204 struct pcpu_group_info *gi = &ai->groups[group];
1205
1206 /*
1207 * Initialize base_offset as if all groups are located
1208 * back-to-back. The caller should update this to
1209 * reflect actual allocation.
1210 */
1211 gi->base_offset = unit * ai->unit_size;
1212
1213 for_each_possible_cpu(cpu)
1214 if (group_map[cpu] == group)
1215 gi->cpu_map[gi->nr_units++] = cpu;
1216 gi->nr_units = roundup(gi->nr_units, upa);
1217 unit += gi->nr_units;
1218 }
1219 BUG_ON(unit != nr_units);
1220
1221 return ai;
1222}
1223
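The grouping pass above is the whole NUMA story in this function: two CPUs share a group only if they are LOCAL_DISTANCE from each other in both directions. A minimal userspace sketch of that loop follows, with an invented 4-CPU, two-node distance table standing in for the arch's pcpu_fc_cpu_distance_fn_t callback; everything in it is illustrative, not kernel code.

/*
 * Toy model of the CPU grouping loop in pcpu_build_alloc_info():
 * CPUs 0-1 sit on one node, CPUs 2-3 on another, and the distance
 * table below stands in for the arch's cpu_distance_fn callback.
 * 10 plays the role of LOCAL_DISTANCE, 20 is "remote".
 */
#include <stdio.h>

#define NR_CPUS		4
#define LOCAL_DISTANCE	10

static const int distance[NR_CPUS][NR_CPUS] = {
	{ 10, 10, 20, 20 },
	{ 10, 10, 20, 20 },
	{ 20, 20, 10, 10 },
	{ 20, 20, 10, 10 },
};

int main(void)
{
	int group_map[NR_CPUS] = { 0 };
	int group_cnt[NR_CPUS] = { 0 };
	int nr_groups = 1;

	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		int group = 0;
next_group:
		for (int tcpu = 0; tcpu < cpu; tcpu++) {
			/* CPUs must be mutually local to share a group */
			if (group_map[tcpu] == group &&
			    (distance[cpu][tcpu] > LOCAL_DISTANCE ||
			     distance[tcpu][cpu] > LOCAL_DISTANCE)) {
				group++;
				if (group + 1 > nr_groups)
					nr_groups = group + 1;
				goto next_group;
			}
		}
		group_map[cpu] = group;
		group_cnt[group]++;
	}

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d -> group %d\n", cpu, group_map[cpu]);
	for (int g = 0; g < nr_groups; g++)
		printf("group %d holds %d cpu(s)\n", g, group_cnt[g]);
	return 0;
}

Running it prints two groups, {0,1} and {2,3}, which is what the kernel loop would compute for a two-node machine whose nodes sit farther than LOCAL_DISTANCE apart.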
1224/**
1225 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info 1080 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
1226 * @lvl: loglevel 1081 * @lvl: loglevel
1227 * @ai: allocation info to dump 1082 * @ai: allocation info to dump
@@ -1363,7 +1218,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1363 1218
1364 /* sanity checks */ 1219 /* sanity checks */
1365 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); 1220 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
1221#ifdef CONFIG_SMP
1366 PCPU_SETUP_BUG_ON(!ai->static_size); 1222 PCPU_SETUP_BUG_ON(!ai->static_size);
1223#endif
1367 PCPU_SETUP_BUG_ON(!base_addr); 1224 PCPU_SETUP_BUG_ON(!base_addr);
1368 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); 1225 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1369 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); 1226 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
@@ -1488,6 +1345,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1488 return 0; 1345 return 0;
1489} 1346}
1490 1347
1348#ifdef CONFIG_SMP
1349
1491const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { 1350const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
1492 [PCPU_FC_AUTO] = "auto", 1351 [PCPU_FC_AUTO] = "auto",
1493 [PCPU_FC_EMBED] = "embed", 1352 [PCPU_FC_EMBED] = "embed",
@@ -1515,8 +1374,180 @@ static int __init percpu_alloc_setup(char *str)
1515} 1374}
1516early_param("percpu_alloc", percpu_alloc_setup); 1375early_param("percpu_alloc", percpu_alloc_setup);
1517 1376
1377/*
1378 * pcpu_embed_first_chunk() is used by the generic percpu setup.
1379 * Build it if needed by the arch config or the generic setup is going
1380 * to be used.
1381 */
1518#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ 1382#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
1519 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) 1383 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
1384#define BUILD_EMBED_FIRST_CHUNK
1385#endif
1386
1387/* build pcpu_page_first_chunk() iff needed by the arch config */
1388#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
1389#define BUILD_PAGE_FIRST_CHUNK
1390#endif
1391
1392/* pcpu_build_alloc_info() is used by both embed and page first chunk */
1393#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
1394/**
1395 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
1396 * @reserved_size: the size of reserved percpu area in bytes
1397 * @dyn_size: minimum free size for dynamic allocation in bytes
1398 * @atom_size: allocation atom size
1399 * @cpu_distance_fn: callback to determine distance between cpus, optional
1400 *
1401 * This function determines grouping of units, their mappings to cpus
1402 * and other parameters considering needed percpu size, allocation
1403 * atom size and distances between CPUs.
1404 *
1405 * Groups are always multiples of atom size and CPUs which are of
1406 * LOCAL_DISTANCE both ways are grouped together and share space for
1407 * units in the same group. The returned configuration is guaranteed
1408 * to have CPUs on different nodes on different groups and >=75% usage
1409 * of allocated virtual address space.
1410 *
1411 * RETURNS:
1412 * On success, pointer to the new allocation_info is returned. On
1413 * failure, ERR_PTR value is returned.
1414 */
1415static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1416 size_t reserved_size, size_t dyn_size,
1417 size_t atom_size,
1418 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1419{
1420 static int group_map[NR_CPUS] __initdata;
1421 static int group_cnt[NR_CPUS] __initdata;
1422 const size_t static_size = __per_cpu_end - __per_cpu_start;
1423 int nr_groups = 1, nr_units = 0;
1424 size_t size_sum, min_unit_size, alloc_size;
1425 int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
1426 int last_allocs, group, unit;
1427 unsigned int cpu, tcpu;
1428 struct pcpu_alloc_info *ai;
1429 unsigned int *cpu_map;
1430
1431 /* this function may be called multiple times */
1432 memset(group_map, 0, sizeof(group_map));
1433 memset(group_cnt, 0, sizeof(group_cnt));
1434
1435 /* calculate size_sum and ensure dyn_size is enough for early alloc */
1436 size_sum = PFN_ALIGN(static_size + reserved_size +
1437 max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
1438 dyn_size = size_sum - static_size - reserved_size;
1439
1440 /*
1441 * Determine min_unit_size, alloc_size and max_upa such that
1442 * alloc_size is multiple of atom_size and is the smallest
1443 * which can accommodate 4k aligned segments which are equal to
1444 * or larger than min_unit_size.
1445 */
1446 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1447
1448 alloc_size = roundup(min_unit_size, atom_size);
1449 upa = alloc_size / min_unit_size;
1450 while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1451 upa--;
1452 max_upa = upa;
1453
1454 /* group cpus according to their proximity */
1455 for_each_possible_cpu(cpu) {
1456 group = 0;
1457 next_group:
1458 for_each_possible_cpu(tcpu) {
1459 if (cpu == tcpu)
1460 break;
1461 if (group_map[tcpu] == group && cpu_distance_fn &&
1462 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1463 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1464 group++;
1465 nr_groups = max(nr_groups, group + 1);
1466 goto next_group;
1467 }
1468 }
1469 group_map[cpu] = group;
1470 group_cnt[group]++;
1471 }
1472
1473 /*
1474 * Expand unit size until address space usage goes over 75%
1475 * and then as much as possible without using more address
1476 * space.
1477 */
1478 last_allocs = INT_MAX;
1479 for (upa = max_upa; upa; upa--) {
1480 int allocs = 0, wasted = 0;
1481
1482 if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1483 continue;
1484
1485 for (group = 0; group < nr_groups; group++) {
1486 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1487 allocs += this_allocs;
1488 wasted += this_allocs * upa - group_cnt[group];
1489 }
1490
1491 /*
1492 * Don't accept if wastage is over 1/3. The
1493 * greater-than comparison ensures upa==1 always
1494 * passes the following check.
1495 */
1496 if (wasted > num_possible_cpus() / 3)
1497 continue;
1498
1499 /* and then don't consume more memory */
1500 if (allocs > last_allocs)
1501 break;
1502 last_allocs = allocs;
1503 best_upa = upa;
1504 }
1505 upa = best_upa;
1506
1507 /* allocate and fill alloc_info */
1508 for (group = 0; group < nr_groups; group++)
1509 nr_units += roundup(group_cnt[group], upa);
1510
1511 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1512 if (!ai)
1513 return ERR_PTR(-ENOMEM);
1514 cpu_map = ai->groups[0].cpu_map;
1515
1516 for (group = 0; group < nr_groups; group++) {
1517 ai->groups[group].cpu_map = cpu_map;
1518 cpu_map += roundup(group_cnt[group], upa);
1519 }
1520
1521 ai->static_size = static_size;
1522 ai->reserved_size = reserved_size;
1523 ai->dyn_size = dyn_size;
1524 ai->unit_size = alloc_size / upa;
1525 ai->atom_size = atom_size;
1526 ai->alloc_size = alloc_size;
1527
1528 for (group = 0, unit = 0; group_cnt[group]; group++) {
1529 struct pcpu_group_info *gi = &ai->groups[group];
1530
1531 /*
1532 * Initialize base_offset as if all groups are located
1533 * back-to-back. The caller should update this to
1534 * reflect actual allocation.
1535 */
1536 gi->base_offset = unit * ai->unit_size;
1537
1538 for_each_possible_cpu(cpu)
1539 if (group_map[cpu] == group)
1540 gi->cpu_map[gi->nr_units++] = cpu;
1541 gi->nr_units = roundup(gi->nr_units, upa);
1542 unit += gi->nr_units;
1543 }
1544 BUG_ON(unit != nr_units);
1545
1546 return ai;
1547}
1548#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
1549
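To make the address-space heuristic concrete, here is a userspace walk-through of the same upa (units-per-alloc) search for one hypothetical layout: 2 MB allocation atoms, a 64 KB per-CPU size_sum, and two NUMA groups holding 6 and 2 CPUs. All numbers are invented; only the selection rules are copied from the function above, and for this input they settle on best_upa = 2, i.e. 1 MB units.

/*
 * Toy walk-through of the units_per_alloc (upa) search in
 * pcpu_build_alloc_info(): start from the largest page-aligned upa,
 * skip values that waste more than a third of the possible CPUs in
 * idle units, and stop once a smaller upa would need more atom-sized
 * allocations.  All sizes below are invented.
 */
#include <stdio.h>
#include <limits.h>
#include <stddef.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

static size_t round_up(size_t x, size_t y)
{
	return (x + y - 1) / y * y;
}

int main(void)
{
	const size_t atom_size = 2UL << 20;		/* hypothetical: 2 MB large page */
	const size_t min_unit_size = 64UL << 10;	/* hypothetical size_sum: 64 KB */
	const int group_cnt[] = { 6, 2 };		/* two NUMA groups */
	const int nr_groups = 2, nr_cpus = 8;
	size_t alloc_size = round_up(min_unit_size, atom_size);	/* 2 MB */
	int upa, max_upa, best_upa = 1, last_allocs = INT_MAX;

	/* largest upa that divides alloc_size into page-aligned units */
	upa = alloc_size / min_unit_size;				/* 32 */
	while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
		upa--;
	max_upa = upa;

	for (upa = max_upa; upa; upa--) {
		int allocs = 0, wasted = 0;

		if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
			continue;
		for (int g = 0; g < nr_groups; g++) {
			int this_allocs = (group_cnt[g] + upa - 1) / upa;

			allocs += this_allocs;
			wasted += this_allocs * upa - group_cnt[g];
		}
		if (wasted > nr_cpus / 3)	/* more than 2 idle units: too wasteful */
			continue;
		if (allocs > last_allocs)	/* would need extra atoms: stop */
			break;
		last_allocs = allocs;
		best_upa = upa;
	}

	/* for this layout: max_upa=32, best_upa=2, unit_size=1048576 */
	printf("max_upa=%d best_upa=%d unit_size=%zu\n",
	       max_upa, best_upa, alloc_size / best_upa);
	return 0;
}

Landing on best_upa = 2 means each 2 MB atom backs two 1 MB units, so the 6-CPU group needs three atoms and the 2-CPU group one, with no unit left idle.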
1550#if defined(BUILD_EMBED_FIRST_CHUNK)
1520/** 1551/**
1521 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem 1552 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
1522 * @reserved_size: the size of reserved percpu area in bytes 1553 * @reserved_size: the size of reserved percpu area in bytes
@@ -1645,10 +1676,9 @@ out_free:
1645 free_bootmem(__pa(areas), areas_size); 1676 free_bootmem(__pa(areas), areas_size);
1646 return rc; 1677 return rc;
1647} 1678}
1648#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || 1679#endif /* BUILD_EMBED_FIRST_CHUNK */
1649 !CONFIG_HAVE_SETUP_PER_CPU_AREA */
1650 1680
1651#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 1681#ifdef BUILD_PAGE_FIRST_CHUNK
1652/** 1682/**
1653 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages 1683 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
1654 * @reserved_size: the size of reserved percpu area in bytes 1684 * @reserved_size: the size of reserved percpu area in bytes
@@ -1756,10 +1786,11 @@ out_free_ar:
1756 pcpu_free_alloc_info(ai); 1786 pcpu_free_alloc_info(ai);
1757 return rc; 1787 return rc;
1758} 1788}
1759#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ 1789#endif /* BUILD_PAGE_FIRST_CHUNK */
1760 1790
1791#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
1761/* 1792/*
1762 * Generic percpu area setup. 1793 * Generic SMP percpu area setup.
1763 * 1794 *
1764 * The embedding helper is used because its behavior closely resembles 1795 * The embedding helper is used because its behavior closely resembles
1765 * the original non-dynamic generic percpu area setup. This is 1796 * the original non-dynamic generic percpu area setup. This is
@@ -1770,7 +1801,6 @@ out_free_ar:
1770 * on the physical linear memory mapping which uses large page 1801 * on the physical linear memory mapping which uses large page
1771 * mappings on applicable archs. 1802 * mappings on applicable archs.
1772 */ 1803 */
1773#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
1774unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 1804unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
1775EXPORT_SYMBOL(__per_cpu_offset); 1805EXPORT_SYMBOL(__per_cpu_offset);
1776 1806
@@ -1799,13 +1829,48 @@ void __init setup_per_cpu_areas(void)
1799 PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, 1829 PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
1800 pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); 1830 pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
1801 if (rc < 0) 1831 if (rc < 0)
1802 panic("Failed to initialized percpu areas."); 1832 panic("Failed to initialize percpu areas.");
1803 1833
1804 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; 1834 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
1805 for_each_possible_cpu(cpu) 1835 for_each_possible_cpu(cpu)
1806 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; 1836 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
1807} 1837}
1808#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ 1838#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
1839
1840#else /* CONFIG_SMP */
1841
1842/*
1843 * UP percpu area setup.
1844 *
1845 * UP always uses km-based percpu allocator with identity mapping.
1846 * Static percpu variables are indistinguishable from the usual static
1847 * variables and don't require any special preparation.
1848 */
1849void __init setup_per_cpu_areas(void)
1850{
1851 const size_t unit_size =
1852 roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
1853 PERCPU_DYNAMIC_RESERVE));
1854 struct pcpu_alloc_info *ai;
1855 void *fc;
1856
1857 ai = pcpu_alloc_alloc_info(1, 1);
1858 fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
1859 if (!ai || !fc)
1860 panic("Failed to allocate memory for percpu areas.");
1861
1862 ai->dyn_size = unit_size;
1863 ai->unit_size = unit_size;
1864 ai->atom_size = unit_size;
1865 ai->alloc_size = unit_size;
1866 ai->groups[0].nr_units = 1;
1867 ai->groups[0].cpu_map[0] = 0;
1868
1869 if (pcpu_setup_first_chunk(ai, fc) < 0)
1870 panic("Failed to initialize percpu areas.");
1871}
1872
1873#endif /* CONFIG_SMP */
1809 1874
1810/* 1875/*
1811 * First and reserved chunks are initialized with temporary allocation 1876 * First and reserved chunks are initialized with temporary allocation
diff --git a/mm/percpu_up.c b/mm/percpu_up.c
deleted file mode 100644
index db884fae5721..000000000000
--- a/mm/percpu_up.c
+++ /dev/null
@@ -1,30 +0,0 @@
1/*
2 * mm/percpu_up.c - dummy percpu memory allocator implementation for UP
3 */
4
5#include <linux/module.h>
6#include <linux/percpu.h>
7#include <linux/slab.h>
8
9void __percpu *__alloc_percpu(size_t size, size_t align)
10{
11 /*
12 * Can't easily make larger alignment work with kmalloc. WARN
13 * on it. Larger alignment should only be used for module
14 * percpu sections on SMP for which this path isn't used.
15 */
16 WARN_ON_ONCE(align > SMP_CACHE_BYTES);
17 return (void __percpu __force *)kzalloc(size, GFP_KERNEL);
18}
19EXPORT_SYMBOL_GPL(__alloc_percpu);
20
21void free_percpu(void __percpu *p)
22{
23 kfree(this_cpu_ptr(p));
24}
25EXPORT_SYMBOL_GPL(free_percpu);
26
27phys_addr_t per_cpu_ptr_to_phys(void *addr)
28{
29 return __pa(addr);
30}
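With this UP stub removed, UP kernels are served by the same dynamic percpu allocator as SMP (backed by the km-based allocator mentioned in the UP setup_per_cpu_areas() comment above), so callers keep using one per-CPU API on both configurations. A minimal kernel-style sketch of that usage follows; struct foo_stats and the foo_* functions are hypothetical, while alloc_percpu(), per_cpu_ptr(), this_cpu_inc() and free_percpu() are the real interfaces.

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

/* hypothetical per-CPU counter block, purely for illustration */
struct foo_stats {
	unsigned long	hits;
};

static struct foo_stats __percpu *foo_stats;

static int foo_stats_init(void)
{
	/* identical call on SMP and UP; UP is now served by percpu-km */
	foo_stats = alloc_percpu(struct foo_stats);
	return foo_stats ? 0 : -ENOMEM;
}

static void foo_record_hit(void)
{
	/* updates only this CPU's copy, no cross-CPU locking needed */
	this_cpu_inc(foo_stats->hits);
}

static unsigned long foo_total_hits(void)
{
	unsigned long sum = 0;
	int cpu;

	for_each_possible_cpu(cpu)
		sum += per_cpu_ptr(foo_stats, cpu)->hits;
	return sum;
}

static void foo_stats_exit(void)
{
	free_percpu(foo_stats);
}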
diff --git a/mm/rmap.c b/mm/rmap.c
index 92e6757f196e..1a8bf76bfd03 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -80,7 +80,7 @@ static inline struct anon_vma_chain *anon_vma_chain_alloc(void)
80 return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); 80 return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL);
81} 81}
82 82
83void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) 83static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
84{ 84{
85 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); 85 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
86} 86}
@@ -314,7 +314,7 @@ void __init anon_vma_init(void)
314 * Getting a lock on a stable anon_vma from a page off the LRU is 314 * Getting a lock on a stable anon_vma from a page off the LRU is
315 * tricky: page_lock_anon_vma rely on RCU to guard against the races. 315 * tricky: page_lock_anon_vma rely on RCU to guard against the races.
316 */ 316 */
317struct anon_vma *page_lock_anon_vma(struct page *page) 317struct anon_vma *__page_lock_anon_vma(struct page *page)
318{ 318{
319 struct anon_vma *anon_vma, *root_anon_vma; 319 struct anon_vma *anon_vma, *root_anon_vma;
320 unsigned long anon_mapping; 320 unsigned long anon_mapping;
@@ -348,6 +348,8 @@ out:
348} 348}
349 349
350void page_unlock_anon_vma(struct anon_vma *anon_vma) 350void page_unlock_anon_vma(struct anon_vma *anon_vma)
351 __releases(&anon_vma->root->lock)
352 __releases(RCU)
351{ 353{
352 anon_vma_unlock(anon_vma); 354 anon_vma_unlock(anon_vma);
353 rcu_read_unlock(); 355 rcu_read_unlock();
@@ -407,7 +409,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
407 * 409 *
408 * On success returns with pte mapped and locked. 410 * On success returns with pte mapped and locked.
409 */ 411 */
410pte_t *page_check_address(struct page *page, struct mm_struct *mm, 412pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
411 unsigned long address, spinlock_t **ptlp, int sync) 413 unsigned long address, spinlock_t **ptlp, int sync)
412{ 414{
413 pgd_t *pgd; 415 pgd_t *pgd;
@@ -745,7 +747,7 @@ int page_mkclean(struct page *page)
745 if (mapping) { 747 if (mapping) {
746 ret = page_mkclean_file(mapping, page); 748 ret = page_mkclean_file(mapping, page);
747 if (page_test_dirty(page)) { 749 if (page_test_dirty(page)) {
748 page_clear_dirty(page); 750 page_clear_dirty(page, 1);
749 ret = 1; 751 ret = 1;
750 } 752 }
751 } 753 }
@@ -780,10 +782,10 @@ void page_move_anon_rmap(struct page *page,
780} 782}
781 783
782/** 784/**
783 * __page_set_anon_rmap - setup new anonymous rmap 785 * __page_set_anon_rmap - set up new anonymous rmap
784 * @page: the page to add the mapping to 786 * @page: Page to add to rmap
785 * @vma: the vm area in which the mapping is added 787 * @vma: VM area to add page to.
786 * @address: the user virtual address mapped 788 * @address: User virtual address of the mapping
787 * @exclusive: the page is exclusively owned by the current process 789 * @exclusive: the page is exclusively owned by the current process
788 */ 790 */
789static void __page_set_anon_rmap(struct page *page, 791static void __page_set_anon_rmap(struct page *page,
@@ -793,25 +795,16 @@ static void __page_set_anon_rmap(struct page *page,
793 795
794 BUG_ON(!anon_vma); 796 BUG_ON(!anon_vma);
795 797
798 if (PageAnon(page))
799 return;
800
796 /* 801 /*
797 * If the page isn't exclusively mapped into this vma, 802 * If the page isn't exclusively mapped into this vma,
798 * we must use the _oldest_ possible anon_vma for the 803 * we must use the _oldest_ possible anon_vma for the
799 * page mapping! 804 * page mapping!
800 */ 805 */
801 if (!exclusive) { 806 if (!exclusive)
802 if (PageAnon(page))
803 return;
804 anon_vma = anon_vma->root; 807 anon_vma = anon_vma->root;
805 } else {
806 /*
807 * In this case, swapped-out-but-not-discarded swap-cache
808 * is remapped. So, no need to update page->mapping here.
 809 * We are convinced the anon_vma pointed to by page->mapping is not obsolete
 810 * because vma->anon_vma is necessarily part of the same anon_vma family.
811 */
812 if (PageAnon(page))
813 return;
814 }
815 808
816 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 809 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
817 page->mapping = (struct address_space *) anon_vma; 810 page->mapping = (struct address_space *) anon_vma;
@@ -942,7 +935,7 @@ void page_remove_rmap(struct page *page)
942 * containing the swap entry, but page not yet written to swap. 935 * containing the swap entry, but page not yet written to swap.
943 */ 936 */
944 if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { 937 if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) {
945 page_clear_dirty(page); 938 page_clear_dirty(page, 1);
946 set_page_dirty(page); 939 set_page_dirty(page);
947 } 940 }
948 /* 941 /*
diff --git a/mm/shmem.c b/mm/shmem.c
index 080b09a57a8f..47fdeeb9d636 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1586,6 +1586,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1586 1586
1587 inode = new_inode(sb); 1587 inode = new_inode(sb);
1588 if (inode) { 1588 if (inode) {
1589 inode->i_ino = get_next_ino();
1589 inode_init_owner(inode, dir, mode); 1590 inode_init_owner(inode, dir, mode);
1590 inode->i_blocks = 0; 1591 inode->i_blocks = 0;
1591 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1592 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
@@ -1903,7 +1904,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
1903 dir->i_size += BOGO_DIRENT_SIZE; 1904 dir->i_size += BOGO_DIRENT_SIZE;
1904 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1905 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1905 inc_nlink(inode); 1906 inc_nlink(inode);
1906 atomic_inc(&inode->i_count); /* New dentry reference */ 1907 ihold(inode); /* New dentry reference */
1907 dget(dentry); /* Extra pinning count for the created dentry */ 1908 dget(dentry); /* Extra pinning count for the created dentry */
1908 d_instantiate(dentry, inode); 1909 d_instantiate(dentry, inode);
1909out: 1910out:
@@ -2146,7 +2147,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2146 if (*len < 3) 2147 if (*len < 3)
2147 return 255; 2148 return 255;
2148 2149
2149 if (hlist_unhashed(&inode->i_hash)) { 2150 if (inode_unhashed(inode)) {
2150 /* Unfortunately insert_inode_hash is not idempotent, 2151 /* Unfortunately insert_inode_hash is not idempotent,
2151 * so as we hash inodes here rather than at creation 2152 * so as we hash inodes here rather than at creation
2152 * time, we need a lock to ensure we only try 2153 * time, we need a lock to ensure we only try
@@ -2154,7 +2155,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2154 */ 2155 */
2155 static DEFINE_SPINLOCK(lock); 2156 static DEFINE_SPINLOCK(lock);
2156 spin_lock(&lock); 2157 spin_lock(&lock);
2157 if (hlist_unhashed(&inode->i_hash)) 2158 if (inode_unhashed(inode))
2158 __insert_inode_hash(inode, 2159 __insert_inode_hash(inode,
2159 inode->i_ino + inode->i_generation); 2160 inode->i_ino + inode->i_generation);
2160 spin_unlock(&lock); 2161 spin_unlock(&lock);
@@ -2537,16 +2538,16 @@ static const struct vm_operations_struct shmem_vm_ops = {
2537}; 2538};
2538 2539
2539 2540
2540static int shmem_get_sb(struct file_system_type *fs_type, 2541static struct dentry *shmem_mount(struct file_system_type *fs_type,
2541 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 2542 int flags, const char *dev_name, void *data)
2542{ 2543{
2543 return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); 2544 return mount_nodev(fs_type, flags, data, shmem_fill_super);
2544} 2545}
2545 2546
2546static struct file_system_type tmpfs_fs_type = { 2547static struct file_system_type tmpfs_fs_type = {
2547 .owner = THIS_MODULE, 2548 .owner = THIS_MODULE,
2548 .name = "tmpfs", 2549 .name = "tmpfs",
2549 .get_sb = shmem_get_sb, 2550 .mount = shmem_mount,
2550 .kill_sb = kill_litter_super, 2551 .kill_sb = kill_litter_super,
2551}; 2552};
2552 2553
@@ -2642,7 +2643,7 @@ out:
2642 2643
2643static struct file_system_type tmpfs_fs_type = { 2644static struct file_system_type tmpfs_fs_type = {
2644 .name = "tmpfs", 2645 .name = "tmpfs",
2645 .get_sb = ramfs_get_sb, 2646 .mount = ramfs_mount,
2646 .kill_sb = kill_litter_super, 2647 .kill_sb = kill_litter_super,
2647}; 2648};
2648 2649
diff --git a/mm/slab.c b/mm/slab.c
index fcae9815d3b3..b1e40dafbab3 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -901,7 +901,7 @@ static int transfer_objects(struct array_cache *to,
901 struct array_cache *from, unsigned int max) 901 struct array_cache *from, unsigned int max)
902{ 902{
903 /* Figure out how many entries to transfer */ 903 /* Figure out how many entries to transfer */
904 int nr = min(min(from->avail, max), to->limit - to->avail); 904 int nr = min3(from->avail, max, to->limit - to->avail);
905 905
906 if (!nr) 906 if (!nr)
907 return 0; 907 return 0;
diff --git a/mm/slob.c b/mm/slob.c
index d582171c8101..617b6d6c42c7 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -500,7 +500,9 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
500 } else { 500 } else {
501 unsigned int order = get_order(size); 501 unsigned int order = get_order(size);
502 502
503 ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); 503 if (likely(order))
504 gfp |= __GFP_COMP;
505 ret = slob_new_pages(gfp, order, node);
504 if (ret) { 506 if (ret) {
505 struct page *page; 507 struct page *page;
506 page = virt_to_page(ret); 508 page = virt_to_page(ret);
diff --git a/mm/slub.c b/mm/slub.c
index 13fffe1f0f3d..8fd5401bb071 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -168,7 +168,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
168 168
169/* Internal SLUB flags */ 169/* Internal SLUB flags */
170#define __OBJECT_POISON 0x80000000UL /* Poison object */ 170#define __OBJECT_POISON 0x80000000UL /* Poison object */
171#define __SYSFS_ADD_DEFERRED 0x40000000UL /* Not yet visible via sysfs */
172 171
173static int kmem_size = sizeof(struct kmem_cache); 172static int kmem_size = sizeof(struct kmem_cache);
174 173
@@ -178,7 +177,7 @@ static struct notifier_block slab_notifier;
178 177
179static enum { 178static enum {
180 DOWN, /* No slab functionality available */ 179 DOWN, /* No slab functionality available */
181 PARTIAL, /* kmem_cache_open() works but kmalloc does not */ 180 PARTIAL, /* Kmem_cache_node works */
182 UP, /* Everything works but does not show up in sysfs */ 181 UP, /* Everything works but does not show up in sysfs */
183 SYSFS /* Sysfs up */ 182 SYSFS /* Sysfs up */
184} slab_state = DOWN; 183} slab_state = DOWN;
@@ -199,7 +198,7 @@ struct track {
199 198
200enum track_item { TRACK_ALLOC, TRACK_FREE }; 199enum track_item { TRACK_ALLOC, TRACK_FREE };
201 200
202#ifdef CONFIG_SLUB_DEBUG 201#ifdef CONFIG_SYSFS
203static int sysfs_slab_add(struct kmem_cache *); 202static int sysfs_slab_add(struct kmem_cache *);
204static int sysfs_slab_alias(struct kmem_cache *, const char *); 203static int sysfs_slab_alias(struct kmem_cache *, const char *);
205static void sysfs_slab_remove(struct kmem_cache *); 204static void sysfs_slab_remove(struct kmem_cache *);
@@ -210,6 +209,7 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
210 { return 0; } 209 { return 0; }
211static inline void sysfs_slab_remove(struct kmem_cache *s) 210static inline void sysfs_slab_remove(struct kmem_cache *s)
212{ 211{
212 kfree(s->name);
213 kfree(s); 213 kfree(s);
214} 214}
215 215
@@ -233,11 +233,7 @@ int slab_is_available(void)
233 233
234static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 234static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
235{ 235{
236#ifdef CONFIG_NUMA
237 return s->node[node]; 236 return s->node[node];
238#else
239 return &s->local_node;
240#endif
241} 237}
242 238
243/* Verify that a pointer has an address that is valid within a slab page */ 239/* Verify that a pointer has an address that is valid within a slab page */
@@ -494,7 +490,7 @@ static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
494 dump_stack(); 490 dump_stack();
495} 491}
496 492
497static void init_object(struct kmem_cache *s, void *object, int active) 493static void init_object(struct kmem_cache *s, void *object, u8 val)
498{ 494{
499 u8 *p = object; 495 u8 *p = object;
500 496
@@ -504,9 +500,7 @@ static void init_object(struct kmem_cache *s, void *object, int active)
504 } 500 }
505 501
506 if (s->flags & SLAB_RED_ZONE) 502 if (s->flags & SLAB_RED_ZONE)
507 memset(p + s->objsize, 503 memset(p + s->objsize, val, s->inuse - s->objsize);
508 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
509 s->inuse - s->objsize);
510} 504}
511 505
512static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) 506static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
@@ -641,17 +635,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
641} 635}
642 636
643static int check_object(struct kmem_cache *s, struct page *page, 637static int check_object(struct kmem_cache *s, struct page *page,
644 void *object, int active) 638 void *object, u8 val)
645{ 639{
646 u8 *p = object; 640 u8 *p = object;
647 u8 *endobject = object + s->objsize; 641 u8 *endobject = object + s->objsize;
648 642
649 if (s->flags & SLAB_RED_ZONE) { 643 if (s->flags & SLAB_RED_ZONE) {
650 unsigned int red =
651 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
652
653 if (!check_bytes_and_report(s, page, object, "Redzone", 644 if (!check_bytes_and_report(s, page, object, "Redzone",
654 endobject, red, s->inuse - s->objsize)) 645 endobject, val, s->inuse - s->objsize))
655 return 0; 646 return 0;
656 } else { 647 } else {
657 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { 648 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
@@ -661,7 +652,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
661 } 652 }
662 653
663 if (s->flags & SLAB_POISON) { 654 if (s->flags & SLAB_POISON) {
664 if (!active && (s->flags & __OBJECT_POISON) && 655 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
665 (!check_bytes_and_report(s, page, p, "Poison", p, 656 (!check_bytes_and_report(s, page, p, "Poison", p,
666 POISON_FREE, s->objsize - 1) || 657 POISON_FREE, s->objsize - 1) ||
667 !check_bytes_and_report(s, page, p, "Poison", 658 !check_bytes_and_report(s, page, p, "Poison",
@@ -673,7 +664,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
673 check_pad_bytes(s, page, p); 664 check_pad_bytes(s, page, p);
674 } 665 }
675 666
676 if (!s->offset && active) 667 if (!s->offset && val == SLUB_RED_ACTIVE)
677 /* 668 /*
678 * Object and freepointer overlap. Cannot check 669 * Object and freepointer overlap. Cannot check
679 * freepointer while object is allocated. 670 * freepointer while object is allocated.
@@ -792,6 +783,39 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
792} 783}
793 784
794/* 785/*
786 * Hooks for other subsystems that check memory allocations. In a typical
787 * production configuration these hooks all should produce no code at all.
788 */
789static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
790{
791 flags &= gfp_allowed_mask;
792 lockdep_trace_alloc(flags);
793 might_sleep_if(flags & __GFP_WAIT);
794
795 return should_failslab(s->objsize, flags, s->flags);
796}
797
798static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
799{
800 flags &= gfp_allowed_mask;
801 kmemcheck_slab_alloc(s, flags, object, s->objsize);
802 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
803}
804
805static inline void slab_free_hook(struct kmem_cache *s, void *x)
806{
807 kmemleak_free_recursive(x, s->flags);
808}
809
810static inline void slab_free_hook_irq(struct kmem_cache *s, void *object)
811{
812 kmemcheck_slab_free(s, object, s->objsize);
813 debug_check_no_locks_freed(object, s->objsize);
814 if (!(s->flags & SLAB_DEBUG_OBJECTS))
815 debug_check_no_obj_freed(object, s->objsize);
816}
817
818/*
795 * Tracking of fully allocated slabs for debugging purposes. 819 * Tracking of fully allocated slabs for debugging purposes.
796 */ 820 */
797static void add_full(struct kmem_cache_node *n, struct page *page) 821static void add_full(struct kmem_cache_node *n, struct page *page)
@@ -838,7 +862,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
838 * dilemma by deferring the increment of the count during 862 * dilemma by deferring the increment of the count during
839 * bootstrap (see early_kmem_cache_node_alloc). 863 * bootstrap (see early_kmem_cache_node_alloc).
840 */ 864 */
841 if (!NUMA_BUILD || n) { 865 if (n) {
842 atomic_long_inc(&n->nr_slabs); 866 atomic_long_inc(&n->nr_slabs);
843 atomic_long_add(objects, &n->total_objects); 867 atomic_long_add(objects, &n->total_objects);
844 } 868 }
@@ -858,11 +882,11 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
858 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) 882 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
859 return; 883 return;
860 884
861 init_object(s, object, 0); 885 init_object(s, object, SLUB_RED_INACTIVE);
862 init_tracking(s, object); 886 init_tracking(s, object);
863} 887}
864 888
865static int alloc_debug_processing(struct kmem_cache *s, struct page *page, 889static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page,
866 void *object, unsigned long addr) 890 void *object, unsigned long addr)
867{ 891{
868 if (!check_slab(s, page)) 892 if (!check_slab(s, page))
@@ -878,14 +902,14 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
878 goto bad; 902 goto bad;
879 } 903 }
880 904
881 if (!check_object(s, page, object, 0)) 905 if (!check_object(s, page, object, SLUB_RED_INACTIVE))
882 goto bad; 906 goto bad;
883 907
884 /* Success perform special debug activities for allocs */ 908 /* Success perform special debug activities for allocs */
885 if (s->flags & SLAB_STORE_USER) 909 if (s->flags & SLAB_STORE_USER)
886 set_track(s, object, TRACK_ALLOC, addr); 910 set_track(s, object, TRACK_ALLOC, addr);
887 trace(s, page, object, 1); 911 trace(s, page, object, 1);
888 init_object(s, object, 1); 912 init_object(s, object, SLUB_RED_ACTIVE);
889 return 1; 913 return 1;
890 914
891bad: 915bad:
@@ -902,8 +926,8 @@ bad:
902 return 0; 926 return 0;
903} 927}
904 928
905static int free_debug_processing(struct kmem_cache *s, struct page *page, 929static noinline int free_debug_processing(struct kmem_cache *s,
906 void *object, unsigned long addr) 930 struct page *page, void *object, unsigned long addr)
907{ 931{
908 if (!check_slab(s, page)) 932 if (!check_slab(s, page))
909 goto fail; 933 goto fail;
@@ -918,7 +942,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
918 goto fail; 942 goto fail;
919 } 943 }
920 944
921 if (!check_object(s, page, object, 1)) 945 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
922 return 0; 946 return 0;
923 947
924 if (unlikely(s != page->slab)) { 948 if (unlikely(s != page->slab)) {
@@ -942,7 +966,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
942 if (s->flags & SLAB_STORE_USER) 966 if (s->flags & SLAB_STORE_USER)
943 set_track(s, object, TRACK_FREE, addr); 967 set_track(s, object, TRACK_FREE, addr);
944 trace(s, page, object, 0); 968 trace(s, page, object, 0);
945 init_object(s, object, 0); 969 init_object(s, object, SLUB_RED_INACTIVE);
946 return 1; 970 return 1;
947 971
948fail: 972fail:
@@ -1046,7 +1070,7 @@ static inline int free_debug_processing(struct kmem_cache *s,
1046static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1070static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1047 { return 1; } 1071 { return 1; }
1048static inline int check_object(struct kmem_cache *s, struct page *page, 1072static inline int check_object(struct kmem_cache *s, struct page *page,
1049 void *object, int active) { return 1; } 1073 void *object, u8 val) { return 1; }
1050static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1074static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1051static inline unsigned long kmem_cache_flags(unsigned long objsize, 1075static inline unsigned long kmem_cache_flags(unsigned long objsize,
1052 unsigned long flags, const char *name, 1076 unsigned long flags, const char *name,
@@ -1066,7 +1090,19 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
1066 int objects) {} 1090 int objects) {}
1067static inline void dec_slabs_node(struct kmem_cache *s, int node, 1091static inline void dec_slabs_node(struct kmem_cache *s, int node,
1068 int objects) {} 1092 int objects) {}
1069#endif 1093
1094static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
1095 { return 0; }
1096
1097static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
1098 void *object) {}
1099
1100static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
1101
1102static inline void slab_free_hook_irq(struct kmem_cache *s,
1103 void *object) {}
1104
1105#endif /* CONFIG_SLUB_DEBUG */
1070 1106
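The !CONFIG_SLUB_DEBUG stubs above are what the earlier "should produce no code at all" remark relies on: with debugging off, every hook collapses to an empty inline that the compiler removes from the fast paths, and the pre-alloc hook is the only one allowed to veto an allocation. A userspace toy of that compile-away pattern; the TOY_DEBUG switch and all names are invented, this is not the kernel code.

/*
 * Toy of the "hooks compile to nothing" pattern: with TOY_DEBUG unset
 * the hooks are empty inlines and the allocation fast path pays no
 * cost; with it set they become checking/tracing points and the pre
 * hook can abort the allocation.  Everything here is illustrative.
 */
#include <stdio.h>
#include <stdlib.h>

#ifdef TOY_DEBUG
static inline int pre_alloc_hook(size_t size)
{
	printf("about to allocate %zu bytes\n", size);
	return 0;			/* non-zero would abort the allocation */
}
static inline void post_alloc_hook(void *obj)	{ printf("allocated %p\n", obj); }
static inline void free_hook(void *obj)		{ printf("freeing %p\n", obj); }
#else
static inline int pre_alloc_hook(size_t size)	{ (void)size; return 0; }
static inline void post_alloc_hook(void *obj)	{ (void)obj; }
static inline void free_hook(void *obj)		{ (void)obj; }
#endif

static void *toy_alloc(size_t size)
{
	void *obj;

	if (pre_alloc_hook(size))	/* e.g. failslab-style failure injection */
		return NULL;
	obj = malloc(size);		/* stands in for the per-CPU freelist pop */
	post_alloc_hook(obj);
	return obj;
}

static void toy_free(void *obj)
{
	free_hook(obj);
	free(obj);
}

int main(void)
{
	void *p = toy_alloc(64);

	toy_free(p);
	return 0;
}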
1071/* 1107/*
1072 * Slab allocation and freeing 1108 * Slab allocation and freeing
@@ -1194,7 +1230,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1194 slab_pad_check(s, page); 1230 slab_pad_check(s, page);
1195 for_each_object(p, s, page_address(page), 1231 for_each_object(p, s, page_address(page),
1196 page->objects) 1232 page->objects)
1197 check_object(s, page, p, 0); 1233 check_object(s, page, p, SLUB_RED_INACTIVE);
1198 } 1234 }
1199 1235
1200 kmemcheck_free_shadow(page, compound_order(page)); 1236 kmemcheck_free_shadow(page, compound_order(page));
@@ -1274,13 +1310,19 @@ static void add_partial(struct kmem_cache_node *n,
1274 spin_unlock(&n->list_lock); 1310 spin_unlock(&n->list_lock);
1275} 1311}
1276 1312
1313static inline void __remove_partial(struct kmem_cache_node *n,
1314 struct page *page)
1315{
1316 list_del(&page->lru);
1317 n->nr_partial--;
1318}
1319
1277static void remove_partial(struct kmem_cache *s, struct page *page) 1320static void remove_partial(struct kmem_cache *s, struct page *page)
1278{ 1321{
1279 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1322 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1280 1323
1281 spin_lock(&n->list_lock); 1324 spin_lock(&n->list_lock);
1282 list_del(&page->lru); 1325 __remove_partial(n, page);
1283 n->nr_partial--;
1284 spin_unlock(&n->list_lock); 1326 spin_unlock(&n->list_lock);
1285} 1327}
1286 1328
@@ -1293,8 +1335,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
1293 struct page *page) 1335 struct page *page)
1294{ 1336{
1295 if (slab_trylock(page)) { 1337 if (slab_trylock(page)) {
1296 list_del(&page->lru); 1338 __remove_partial(n, page);
1297 n->nr_partial--;
1298 __SetPageSlubFrozen(page); 1339 __SetPageSlubFrozen(page);
1299 return 1; 1340 return 1;
1300 } 1341 }
@@ -1405,6 +1446,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1405 * On exit the slab lock will have been dropped. 1446 * On exit the slab lock will have been dropped.
1406 */ 1447 */
1407static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) 1448static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1449 __releases(bitlock)
1408{ 1450{
1409 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1451 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1410 1452
@@ -1447,6 +1489,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1447 * Remove the cpu slab 1489 * Remove the cpu slab
1448 */ 1490 */
1449static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1491static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1492 __releases(bitlock)
1450{ 1493{
1451 struct page *page = c->page; 1494 struct page *page = c->page;
1452 int tail = 1; 1495 int tail = 1;
@@ -1647,6 +1690,7 @@ new_slab:
1647 goto load_freelist; 1690 goto load_freelist;
1648 } 1691 }
1649 1692
1693 gfpflags &= gfp_allowed_mask;
1650 if (gfpflags & __GFP_WAIT) 1694 if (gfpflags & __GFP_WAIT)
1651 local_irq_enable(); 1695 local_irq_enable();
1652 1696
@@ -1674,7 +1718,7 @@ debug:
1674 1718
1675 c->page->inuse++; 1719 c->page->inuse++;
1676 c->page->freelist = get_freepointer(s, object); 1720 c->page->freelist = get_freepointer(s, object);
1677 c->node = -1; 1721 c->node = NUMA_NO_NODE;
1678 goto unlock_out; 1722 goto unlock_out;
1679} 1723}
1680 1724
@@ -1695,12 +1739,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1695 struct kmem_cache_cpu *c; 1739 struct kmem_cache_cpu *c;
1696 unsigned long flags; 1740 unsigned long flags;
1697 1741
1698 gfpflags &= gfp_allowed_mask; 1742 if (slab_pre_alloc_hook(s, gfpflags))
1699
1700 lockdep_trace_alloc(gfpflags);
1701 might_sleep_if(gfpflags & __GFP_WAIT);
1702
1703 if (should_failslab(s->objsize, gfpflags, s->flags))
1704 return NULL; 1743 return NULL;
1705 1744
1706 local_irq_save(flags); 1745 local_irq_save(flags);
@@ -1719,8 +1758,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1719 if (unlikely(gfpflags & __GFP_ZERO) && object) 1758 if (unlikely(gfpflags & __GFP_ZERO) && object)
1720 memset(object, 0, s->objsize); 1759 memset(object, 0, s->objsize);
1721 1760
1722 kmemcheck_slab_alloc(s, gfpflags, object, s->objsize); 1761 slab_post_alloc_hook(s, gfpflags, object);
1723 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags);
1724 1762
1725 return object; 1763 return object;
1726} 1764}
@@ -1754,7 +1792,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1754 return ret; 1792 return ret;
1755} 1793}
1756EXPORT_SYMBOL(kmem_cache_alloc_node); 1794EXPORT_SYMBOL(kmem_cache_alloc_node);
1757#endif
1758 1795
1759#ifdef CONFIG_TRACING 1796#ifdef CONFIG_TRACING
1760void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, 1797void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
@@ -1765,6 +1802,7 @@ void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
1765} 1802}
1766EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); 1803EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
1767#endif 1804#endif
1805#endif
1768 1806
1769/* 1807/*
1770 * Slow path handling. This may still be called frequently since objects 1808
@@ -1850,14 +1888,14 @@ static __always_inline void slab_free(struct kmem_cache *s,
1850 struct kmem_cache_cpu *c; 1888 struct kmem_cache_cpu *c;
1851 unsigned long flags; 1889 unsigned long flags;
1852 1890
1853 kmemleak_free_recursive(x, s->flags); 1891 slab_free_hook(s, x);
1892
1854 local_irq_save(flags); 1893 local_irq_save(flags);
1855 c = __this_cpu_ptr(s->cpu_slab); 1894 c = __this_cpu_ptr(s->cpu_slab);
1856 kmemcheck_slab_free(s, object, s->objsize); 1895
1857 debug_check_no_locks_freed(object, s->objsize); 1896 slab_free_hook_irq(s, x);
1858 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1897
1859 debug_check_no_obj_freed(object, s->objsize); 1898 if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
1860 if (likely(page == c->page && c->node >= 0)) {
1861 set_freepointer(s, object, c->freelist); 1899 set_freepointer(s, object, c->freelist);
1862 c->freelist = object; 1900 c->freelist = object;
1863 stat(s, FREE_FASTPATH); 1901 stat(s, FREE_FASTPATH);
@@ -2062,26 +2100,18 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2062#endif 2100#endif
2063} 2101}
2064 2102
2065static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]); 2103static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2066
2067static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2068{ 2104{
2069 if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) 2105 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
2070 /* 2106 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
2071 * Boot time creation of the kmalloc array. Use static per cpu data
2072 * since the per cpu allocator is not available yet.
2073 */
2074 s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches);
2075 else
2076 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2077 2107
2078 if (!s->cpu_slab) 2108 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
2079 return 0;
2080 2109
2081 return 1; 2110 return s->cpu_slab != NULL;
2082} 2111}
2083 2112
2084#ifdef CONFIG_NUMA 2113static struct kmem_cache *kmem_cache_node;
2114
2085/* 2115/*
2086 * No kmalloc_node yet so do it by hand. We know that this is the first 2116 * No kmalloc_node yet so do it by hand. We know that this is the first
2087 * slab on the node for this slabcache. There are no concurrent accesses 2117 * slab on the node for this slabcache. There are no concurrent accesses
@@ -2091,15 +2121,15 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2091 * when allocating for the kmalloc_node_cache. This is used for bootstrapping 2121 * when allocating for the kmalloc_node_cache. This is used for bootstrapping
2092 * memory on a fresh node that has no slab structures yet. 2122 * memory on a fresh node that has no slab structures yet.
2093 */ 2123 */
2094static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node) 2124static void early_kmem_cache_node_alloc(int node)
2095{ 2125{
2096 struct page *page; 2126 struct page *page;
2097 struct kmem_cache_node *n; 2127 struct kmem_cache_node *n;
2098 unsigned long flags; 2128 unsigned long flags;
2099 2129
2100 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); 2130 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
2101 2131
2102 page = new_slab(kmalloc_caches, gfpflags, node); 2132 page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
2103 2133
2104 BUG_ON(!page); 2134 BUG_ON(!page);
2105 if (page_to_nid(page) != node) { 2135 if (page_to_nid(page) != node) {
@@ -2111,15 +2141,15 @@ static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node)
2111 2141
2112 n = page->freelist; 2142 n = page->freelist;
2113 BUG_ON(!n); 2143 BUG_ON(!n);
2114 page->freelist = get_freepointer(kmalloc_caches, n); 2144 page->freelist = get_freepointer(kmem_cache_node, n);
2115 page->inuse++; 2145 page->inuse++;
2116 kmalloc_caches->node[node] = n; 2146 kmem_cache_node->node[node] = n;
2117#ifdef CONFIG_SLUB_DEBUG 2147#ifdef CONFIG_SLUB_DEBUG
2118 init_object(kmalloc_caches, n, 1); 2148 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
2119 init_tracking(kmalloc_caches, n); 2149 init_tracking(kmem_cache_node, n);
2120#endif 2150#endif
2121 init_kmem_cache_node(n, kmalloc_caches); 2151 init_kmem_cache_node(n, kmem_cache_node);
2122 inc_slabs_node(kmalloc_caches, node, page->objects); 2152 inc_slabs_node(kmem_cache_node, node, page->objects);
2123 2153
2124 /* 2154 /*
2125 * lockdep requires consistent irq usage for each lock 2155 * lockdep requires consistent irq usage for each lock
@@ -2137,13 +2167,15 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
2137 2167
2138 for_each_node_state(node, N_NORMAL_MEMORY) { 2168 for_each_node_state(node, N_NORMAL_MEMORY) {
2139 struct kmem_cache_node *n = s->node[node]; 2169 struct kmem_cache_node *n = s->node[node];
2170
2140 if (n) 2171 if (n)
2141 kmem_cache_free(kmalloc_caches, n); 2172 kmem_cache_free(kmem_cache_node, n);
2173
2142 s->node[node] = NULL; 2174 s->node[node] = NULL;
2143 } 2175 }
2144} 2176}
2145 2177
2146static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 2178static int init_kmem_cache_nodes(struct kmem_cache *s)
2147{ 2179{
2148 int node; 2180 int node;
2149 2181
@@ -2151,11 +2183,11 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2151 struct kmem_cache_node *n; 2183 struct kmem_cache_node *n;
2152 2184
2153 if (slab_state == DOWN) { 2185 if (slab_state == DOWN) {
2154 early_kmem_cache_node_alloc(gfpflags, node); 2186 early_kmem_cache_node_alloc(node);
2155 continue; 2187 continue;
2156 } 2188 }
2157 n = kmem_cache_alloc_node(kmalloc_caches, 2189 n = kmem_cache_alloc_node(kmem_cache_node,
2158 gfpflags, node); 2190 GFP_KERNEL, node);
2159 2191
2160 if (!n) { 2192 if (!n) {
2161 free_kmem_cache_nodes(s); 2193 free_kmem_cache_nodes(s);
@@ -2167,17 +2199,6 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2167 } 2199 }
2168 return 1; 2200 return 1;
2169} 2201}
2170#else
2171static void free_kmem_cache_nodes(struct kmem_cache *s)
2172{
2173}
2174
2175static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2176{
2177 init_kmem_cache_node(&s->local_node, s);
2178 return 1;
2179}
2180#endif
2181 2202
2182static void set_min_partial(struct kmem_cache *s, unsigned long min) 2203static void set_min_partial(struct kmem_cache *s, unsigned long min)
2183{ 2204{
@@ -2312,7 +2333,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2312 2333
2313} 2334}
2314 2335
2315static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2336static int kmem_cache_open(struct kmem_cache *s,
2316 const char *name, size_t size, 2337 const char *name, size_t size,
2317 size_t align, unsigned long flags, 2338 size_t align, unsigned long flags,
2318 void (*ctor)(void *)) 2339 void (*ctor)(void *))
@@ -2348,10 +2369,10 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2348#ifdef CONFIG_NUMA 2369#ifdef CONFIG_NUMA
2349 s->remote_node_defrag_ratio = 1000; 2370 s->remote_node_defrag_ratio = 1000;
2350#endif 2371#endif
2351 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 2372 if (!init_kmem_cache_nodes(s))
2352 goto error; 2373 goto error;
2353 2374
2354 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) 2375 if (alloc_kmem_cache_cpus(s))
2355 return 1; 2376 return 1;
2356 2377
2357 free_kmem_cache_nodes(s); 2378 free_kmem_cache_nodes(s);
@@ -2414,9 +2435,8 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
2414#ifdef CONFIG_SLUB_DEBUG 2435#ifdef CONFIG_SLUB_DEBUG
2415 void *addr = page_address(page); 2436 void *addr = page_address(page);
2416 void *p; 2437 void *p;
2417 long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long), 2438 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
2418 GFP_ATOMIC); 2439 sizeof(long), GFP_ATOMIC);
2419
2420 if (!map) 2440 if (!map)
2421 return; 2441 return;
2422 slab_err(s, page, "%s", text); 2442 slab_err(s, page, "%s", text);
@@ -2448,9 +2468,8 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
2448 spin_lock_irqsave(&n->list_lock, flags); 2468 spin_lock_irqsave(&n->list_lock, flags);
2449 list_for_each_entry_safe(page, h, &n->partial, lru) { 2469 list_for_each_entry_safe(page, h, &n->partial, lru) {
2450 if (!page->inuse) { 2470 if (!page->inuse) {
2451 list_del(&page->lru); 2471 __remove_partial(n, page);
2452 discard_slab(s, page); 2472 discard_slab(s, page);
2453 n->nr_partial--;
2454 } else { 2473 } else {
2455 list_slab_objects(s, page, 2474 list_slab_objects(s, page,
2456 "Objects remaining on kmem_cache_close()"); 2475 "Objects remaining on kmem_cache_close()");
@@ -2507,9 +2526,15 @@ EXPORT_SYMBOL(kmem_cache_destroy);
2507 * Kmalloc subsystem 2526 * Kmalloc subsystem
2508 *******************************************************************/ 2527 *******************************************************************/
2509 2528
2510struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned; 2529struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
2511EXPORT_SYMBOL(kmalloc_caches); 2530EXPORT_SYMBOL(kmalloc_caches);
2512 2531
2532static struct kmem_cache *kmem_cache;
2533
2534#ifdef CONFIG_ZONE_DMA
2535static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
2536#endif
2537
2513static int __init setup_slub_min_order(char *str) 2538static int __init setup_slub_min_order(char *str)
2514{ 2539{
2515 get_option(&str, &slub_min_order); 2540 get_option(&str, &slub_min_order);
@@ -2546,116 +2571,29 @@ static int __init setup_slub_nomerge(char *str)
2546 2571
2547__setup("slub_nomerge", setup_slub_nomerge); 2572__setup("slub_nomerge", setup_slub_nomerge);
2548 2573
2549static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, 2574static struct kmem_cache *__init create_kmalloc_cache(const char *name,
2550 const char *name, int size, gfp_t gfp_flags) 2575 int size, unsigned int flags)
2551{ 2576{
2552 unsigned int flags = 0; 2577 struct kmem_cache *s;
2553 2578
2554 if (gfp_flags & SLUB_DMA) 2579 s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
2555 flags = SLAB_CACHE_DMA;
2556 2580
2557 /* 2581 /*
2558 * This function is called with IRQs disabled during early-boot on 2582 * This function is called with IRQs disabled during early-boot on
2559 * single CPU so there's no need to take slub_lock here. 2583 * single CPU so there's no need to take slub_lock here.
2560 */ 2584 */
2561 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, 2585 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
2562 flags, NULL)) 2586 flags, NULL))
2563 goto panic; 2587 goto panic;
2564 2588
2565 list_add(&s->list, &slab_caches); 2589 list_add(&s->list, &slab_caches);
2566
2567 if (sysfs_slab_add(s))
2568 goto panic;
2569 return s; 2590 return s;
2570 2591
2571panic: 2592panic:
2572 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); 2593 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
2594 return NULL;
2573} 2595}
2574 2596
2575#ifdef CONFIG_ZONE_DMA
2576static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT];
2577
2578static void sysfs_add_func(struct work_struct *w)
2579{
2580 struct kmem_cache *s;
2581
2582 down_write(&slub_lock);
2583 list_for_each_entry(s, &slab_caches, list) {
2584 if (s->flags & __SYSFS_ADD_DEFERRED) {
2585 s->flags &= ~__SYSFS_ADD_DEFERRED;
2586 sysfs_slab_add(s);
2587 }
2588 }
2589 up_write(&slub_lock);
2590}
2591
2592static DECLARE_WORK(sysfs_add_work, sysfs_add_func);
2593
2594static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2595{
2596 struct kmem_cache *s;
2597 char *text;
2598 size_t realsize;
2599 unsigned long slabflags;
2600 int i;
2601
2602 s = kmalloc_caches_dma[index];
2603 if (s)
2604 return s;
2605
2606 /* Dynamically create dma cache */
2607 if (flags & __GFP_WAIT)
2608 down_write(&slub_lock);
2609 else {
2610 if (!down_write_trylock(&slub_lock))
2611 goto out;
2612 }
2613
2614 if (kmalloc_caches_dma[index])
2615 goto unlock_out;
2616
2617 realsize = kmalloc_caches[index].objsize;
2618 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
2619 (unsigned int)realsize);
2620
2621 s = NULL;
2622 for (i = 0; i < KMALLOC_CACHES; i++)
2623 if (!kmalloc_caches[i].size)
2624 break;
2625
2626 BUG_ON(i >= KMALLOC_CACHES);
2627 s = kmalloc_caches + i;
2628
2629 /*
2630 * Must defer sysfs creation to a workqueue because we don't know
2631 * what context we are called from. Before sysfs comes up, we don't
2632 * need to do anything because our sysfs initcall will start by
2633 * adding all existing slabs to sysfs.
2634 */
2635 slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK;
2636 if (slab_state >= SYSFS)
2637 slabflags |= __SYSFS_ADD_DEFERRED;
2638
2639 if (!text || !kmem_cache_open(s, flags, text,
2640 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
2641 s->size = 0;
2642 kfree(text);
2643 goto unlock_out;
2644 }
2645
2646 list_add(&s->list, &slab_caches);
2647 kmalloc_caches_dma[index] = s;
2648
2649 if (slab_state >= SYSFS)
2650 schedule_work(&sysfs_add_work);
2651
2652unlock_out:
2653 up_write(&slub_lock);
2654out:
2655 return kmalloc_caches_dma[index];
2656}
2657#endif
2658
2659/* 2597/*
2660 * Conversion table for small slabs sizes / 8 to the index in the 2598 * Conversion table for small slabs sizes / 8 to the index in the
2661 * kmalloc array. This is necessary for slabs < 192 since we have non power 2599 * kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -2708,10 +2646,10 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2708 2646
2709#ifdef CONFIG_ZONE_DMA 2647#ifdef CONFIG_ZONE_DMA
2710 if (unlikely((flags & SLUB_DMA))) 2648 if (unlikely((flags & SLUB_DMA)))
2711 return dma_kmalloc_cache(index, flags); 2649 return kmalloc_dma_caches[index];
2712 2650
2713#endif 2651#endif
2714 return &kmalloc_caches[index]; 2652 return kmalloc_caches[index];
2715} 2653}
2716 2654
2717void *__kmalloc(size_t size, gfp_t flags) 2655void *__kmalloc(size_t size, gfp_t flags)
@@ -2735,6 +2673,7 @@ void *__kmalloc(size_t size, gfp_t flags)
2735} 2673}
2736EXPORT_SYMBOL(__kmalloc); 2674EXPORT_SYMBOL(__kmalloc);
2737 2675
2676#ifdef CONFIG_NUMA
2738static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 2677static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2739{ 2678{
2740 struct page *page; 2679 struct page *page;
@@ -2749,7 +2688,6 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2749 return ptr; 2688 return ptr;
2750} 2689}
2751 2690
2752#ifdef CONFIG_NUMA
2753void *__kmalloc_node(size_t size, gfp_t flags, int node) 2691void *__kmalloc_node(size_t size, gfp_t flags, int node)
2754{ 2692{
2755 struct kmem_cache *s; 2693 struct kmem_cache *s;
@@ -2889,8 +2827,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
2889 * may have freed the last object and be 2827 * may have freed the last object and be
2890 * waiting to release the slab. 2828 * waiting to release the slab.
2891 */ 2829 */
2892 list_del(&page->lru); 2830 __remove_partial(n, page);
2893 n->nr_partial--;
2894 slab_unlock(page); 2831 slab_unlock(page);
2895 discard_slab(s, page); 2832 discard_slab(s, page);
2896 } else { 2833 } else {
@@ -2914,7 +2851,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
2914} 2851}
2915EXPORT_SYMBOL(kmem_cache_shrink); 2852EXPORT_SYMBOL(kmem_cache_shrink);
2916 2853
2917#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) 2854#if defined(CONFIG_MEMORY_HOTPLUG)
2918static int slab_mem_going_offline_callback(void *arg) 2855static int slab_mem_going_offline_callback(void *arg)
2919{ 2856{
2920 struct kmem_cache *s; 2857 struct kmem_cache *s;
@@ -2956,7 +2893,7 @@ static void slab_mem_offline_callback(void *arg)
2956 BUG_ON(slabs_node(s, offline_node)); 2893 BUG_ON(slabs_node(s, offline_node));
2957 2894
2958 s->node[offline_node] = NULL; 2895 s->node[offline_node] = NULL;
2959 kmem_cache_free(kmalloc_caches, n); 2896 kmem_cache_free(kmem_cache_node, n);
2960 } 2897 }
2961 } 2898 }
2962 up_read(&slub_lock); 2899 up_read(&slub_lock);
@@ -2989,7 +2926,7 @@ static int slab_mem_going_online_callback(void *arg)
2989 * since memory is not yet available from the node that 2926 * since memory is not yet available from the node that
2990 * is brought up. 2927 * is brought up.
2991 */ 2928 */
2992 n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL); 2929 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
2993 if (!n) { 2930 if (!n) {
2994 ret = -ENOMEM; 2931 ret = -ENOMEM;
2995 goto out; 2932 goto out;
@@ -3035,46 +2972,92 @@ static int slab_memory_callback(struct notifier_block *self,
3035 * Basic setup of slabs 2972 * Basic setup of slabs
3036 *******************************************************************/ 2973 *******************************************************************/
3037 2974
2975/*
2976 * Used for early kmem_cache structures that were allocated using
2977 * the page allocator
2978 */
2979
2980static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
2981{
2982 int node;
2983
2984 list_add(&s->list, &slab_caches);
2985 s->refcount = -1;
2986
2987 for_each_node_state(node, N_NORMAL_MEMORY) {
2988 struct kmem_cache_node *n = get_node(s, node);
2989 struct page *p;
2990
2991 if (n) {
2992 list_for_each_entry(p, &n->partial, lru)
2993 p->slab = s;
2994
2995#ifdef CONFIG_SLUB_DEBUG
2996 list_for_each_entry(p, &n->full, lru)
2997 p->slab = s;
2998#endif
2999 }
3000 }
3001}
3002
3038void __init kmem_cache_init(void) 3003void __init kmem_cache_init(void)
3039{ 3004{
3040 int i; 3005 int i;
3041 int caches = 0; 3006 int caches = 0;
3007 struct kmem_cache *temp_kmem_cache;
3008 int order;
3009 struct kmem_cache *temp_kmem_cache_node;
3010 unsigned long kmalloc_size;
3011
3012 kmem_size = offsetof(struct kmem_cache, node) +
3013 nr_node_ids * sizeof(struct kmem_cache_node *);
3014
3015 /* Allocate two kmem_caches from the page allocator */
3016 kmalloc_size = ALIGN(kmem_size, cache_line_size());
3017 order = get_order(2 * kmalloc_size);
3018 kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order);
3042 3019
3043#ifdef CONFIG_NUMA
3044 /* 3020 /*
3045 * Must first have the slab cache available for the allocations of the 3021 * Must first have the slab cache available for the allocations of the
3046 * struct kmem_cache_node's. There is special bootstrap code in 3022 * struct kmem_cache_node's. There is special bootstrap code in
3047 * kmem_cache_open for slab_state == DOWN. 3023 * kmem_cache_open for slab_state == DOWN.
3048 */ 3024 */
3049 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 3025 kmem_cache_node = (void *)kmem_cache + kmalloc_size;
3050 sizeof(struct kmem_cache_node), GFP_NOWAIT); 3026
3051 kmalloc_caches[0].refcount = -1; 3027 kmem_cache_open(kmem_cache_node, "kmem_cache_node",
3052 caches++; 3028 sizeof(struct kmem_cache_node),
3029 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3053 3030
3054 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3031 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
3055#endif
3056 3032
3057 /* Able to allocate the per node structures */ 3033 /* Able to allocate the per node structures */
3058 slab_state = PARTIAL; 3034 slab_state = PARTIAL;
3059 3035
3060 /* Caches that are not of the two-to-the-power-of size */ 3036 temp_kmem_cache = kmem_cache;
3061 if (KMALLOC_MIN_SIZE <= 32) { 3037 kmem_cache_open(kmem_cache, "kmem_cache", kmem_size,
3062 create_kmalloc_cache(&kmalloc_caches[1], 3038 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3063 "kmalloc-96", 96, GFP_NOWAIT); 3039 kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3064 caches++; 3040 memcpy(kmem_cache, temp_kmem_cache, kmem_size);
3065 }
3066 if (KMALLOC_MIN_SIZE <= 64) {
3067 create_kmalloc_cache(&kmalloc_caches[2],
3068 "kmalloc-192", 192, GFP_NOWAIT);
3069 caches++;
3070 }
3071 3041
3072 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3042 /*
3073 create_kmalloc_cache(&kmalloc_caches[i], 3043 * Allocate kmem_cache_node properly from the kmem_cache slab.
3074 "kmalloc", 1 << i, GFP_NOWAIT); 3044 * kmem_cache_node is separately allocated so no need to
3075 caches++; 3045 * update any list pointers.
3076 } 3046 */
3047 temp_kmem_cache_node = kmem_cache_node;
3048
3049 kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3050 memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
3051
3052 kmem_cache_bootstrap_fixup(kmem_cache_node);
3077 3053
3054 caches++;
3055 kmem_cache_bootstrap_fixup(kmem_cache);
3056 caches++;
3057 /* Free temporary boot structure */
3058 free_pages((unsigned long)temp_kmem_cache, order);
3059
3060 /* Now we can use the kmem_cache to allocate kmalloc slabs */
3078 3061
3079 /* 3062 /*
3080 * Patch up the size_index table if we have strange large alignment 3063 * Patch up the size_index table if we have strange large alignment
@@ -3114,26 +3097,60 @@ void __init kmem_cache_init(void)
3114 size_index[size_index_elem(i)] = 8; 3097 size_index[size_index_elem(i)] = 8;
3115 } 3098 }
3116 3099
3100 /* Caches that are not of the two-to-the-power-of size */
3101 if (KMALLOC_MIN_SIZE <= 32) {
3102 kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0);
3103 caches++;
3104 }
3105
3106 if (KMALLOC_MIN_SIZE <= 64) {
3107 kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0);
3108 caches++;
3109 }
3110
3111 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
3112 kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0);
3113 caches++;
3114 }
3115
3117 slab_state = UP; 3116 slab_state = UP;
3118 3117
3119 /* Provide the correct kmalloc names now that the caches are up */ 3118 /* Provide the correct kmalloc names now that the caches are up */
3119 if (KMALLOC_MIN_SIZE <= 32) {
3120 kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT);
3121 BUG_ON(!kmalloc_caches[1]->name);
3122 }
3123
3124 if (KMALLOC_MIN_SIZE <= 64) {
3125 kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT);
3126 BUG_ON(!kmalloc_caches[2]->name);
3127 }
3128
3120 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3129 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
3121 char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); 3130 char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
3122 3131
3123 BUG_ON(!s); 3132 BUG_ON(!s);
3124 kmalloc_caches[i].name = s; 3133 kmalloc_caches[i]->name = s;
3125 } 3134 }
3126 3135
3127#ifdef CONFIG_SMP 3136#ifdef CONFIG_SMP
3128 register_cpu_notifier(&slab_notifier); 3137 register_cpu_notifier(&slab_notifier);
3129#endif 3138#endif
3130#ifdef CONFIG_NUMA
3131 kmem_size = offsetof(struct kmem_cache, node) +
3132 nr_node_ids * sizeof(struct kmem_cache_node *);
3133#else
3134 kmem_size = sizeof(struct kmem_cache);
3135#endif
3136 3139
3140#ifdef CONFIG_ZONE_DMA
3141 for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
3142 struct kmem_cache *s = kmalloc_caches[i];
3143
3144 if (s && s->size) {
3145 char *name = kasprintf(GFP_NOWAIT,
3146 "dma-kmalloc-%d", s->objsize);
3147
3148 BUG_ON(!name);
3149 kmalloc_dma_caches[i] = create_kmalloc_cache(name,
3150 s->objsize, SLAB_CACHE_DMA);
3151 }
3152 }
3153#endif
3137 printk(KERN_INFO 3154 printk(KERN_INFO
3138 "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 3155 "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
3139 " CPUs=%d, Nodes=%d\n", 3156 " CPUs=%d, Nodes=%d\n",
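
The reworked kmem_cache_init() above bootstraps SLUB in two passes: kmem_cache and kmem_cache_node are first carved out of raw pages from __get_free_pages(), the allocator is brought up with those temporary descriptors, and then each descriptor is re-allocated from the now-working kmem_cache slab, copied over, and its page->slab back-pointers are patched by kmem_cache_bootstrap_fixup() before the boot pages are freed. The userspace sketch below shows only the general "allocate the allocator from itself" pattern with an invented toy cache; it is not SLUB code.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy stand-in for struct kmem_cache (names invented for this sketch). */
struct toy_cache {
	const char *name;
	size_t object_size;
};

/* Toy allocation; the real code hands out objects carved from slab pages. */
static void *toy_cache_alloc(struct toy_cache *c)
{
	return malloc(c->object_size);
}

/* The cache that all struct toy_cache objects come from. */
static struct toy_cache *cache_of_caches;

static void toy_cache_init(void)
{
	/* Step 1: bring the cache up using a temporary descriptor. */
	struct toy_cache boot = {
		.name = "toy_cache",
		.object_size = sizeof(struct toy_cache),
	};

	/* Step 2: the allocator works now, so allocate the descriptor
	 * from itself and copy the bootstrap contents over.  SLUB also
	 * walks the partial/full lists here to repoint page->slab at
	 * the relocated descriptor; the toy cache has nothing to fix. */
	cache_of_caches = toy_cache_alloc(&boot);
	memcpy(cache_of_caches, &boot, sizeof(boot));
}

int main(void)
{
	toy_cache_init();
	printf("%s: %zu-byte objects\n",
	       cache_of_caches->name, cache_of_caches->object_size);
	free(cache_of_caches);
	return 0;
}
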
@@ -3211,6 +3228,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3211 size_t align, unsigned long flags, void (*ctor)(void *)) 3228 size_t align, unsigned long flags, void (*ctor)(void *))
3212{ 3229{
3213 struct kmem_cache *s; 3230 struct kmem_cache *s;
3231 char *n;
3214 3232
3215 if (WARN_ON(!name)) 3233 if (WARN_ON(!name))
3216 return NULL; 3234 return NULL;
@@ -3234,19 +3252,25 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3234 return s; 3252 return s;
3235 } 3253 }
3236 3254
3255 n = kstrdup(name, GFP_KERNEL);
3256 if (!n)
3257 goto err;
3258
3237 s = kmalloc(kmem_size, GFP_KERNEL); 3259 s = kmalloc(kmem_size, GFP_KERNEL);
3238 if (s) { 3260 if (s) {
3239 if (kmem_cache_open(s, GFP_KERNEL, name, 3261 if (kmem_cache_open(s, n,
3240 size, align, flags, ctor)) { 3262 size, align, flags, ctor)) {
3241 list_add(&s->list, &slab_caches); 3263 list_add(&s->list, &slab_caches);
3242 if (sysfs_slab_add(s)) { 3264 if (sysfs_slab_add(s)) {
3243 list_del(&s->list); 3265 list_del(&s->list);
3266 kfree(n);
3244 kfree(s); 3267 kfree(s);
3245 goto err; 3268 goto err;
3246 } 3269 }
3247 up_write(&slub_lock); 3270 up_write(&slub_lock);
3248 return s; 3271 return s;
3249 } 3272 }
3273 kfree(n);
3250 kfree(s); 3274 kfree(s);
3251 } 3275 }
3252 up_write(&slub_lock); 3276 up_write(&slub_lock);
@@ -3318,6 +3342,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3318 return ret; 3342 return ret;
3319} 3343}
3320 3344
3345#ifdef CONFIG_NUMA
3321void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 3346void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3322 int node, unsigned long caller) 3347 int node, unsigned long caller)
3323{ 3348{
@@ -3346,8 +3371,9 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3346 3371
3347 return ret; 3372 return ret;
3348} 3373}
3374#endif
3349 3375
3350#ifdef CONFIG_SLUB_DEBUG 3376#ifdef CONFIG_SYSFS
3351static int count_inuse(struct page *page) 3377static int count_inuse(struct page *page)
3352{ 3378{
3353 return page->inuse; 3379 return page->inuse;
@@ -3357,7 +3383,9 @@ static int count_total(struct page *page)
3357{ 3383{
3358 return page->objects; 3384 return page->objects;
3359} 3385}
3386#endif
3360 3387
3388#ifdef CONFIG_SLUB_DEBUG
3361static int validate_slab(struct kmem_cache *s, struct page *page, 3389static int validate_slab(struct kmem_cache *s, struct page *page,
3362 unsigned long *map) 3390 unsigned long *map)
3363{ 3391{
@@ -3448,65 +3476,6 @@ static long validate_slab_cache(struct kmem_cache *s)
3448 kfree(map); 3476 kfree(map);
3449 return count; 3477 return count;
3450} 3478}
3451
3452#ifdef SLUB_RESILIENCY_TEST
3453static void resiliency_test(void)
3454{
3455 u8 *p;
3456
3457 printk(KERN_ERR "SLUB resiliency testing\n");
3458 printk(KERN_ERR "-----------------------\n");
3459 printk(KERN_ERR "A. Corruption after allocation\n");
3460
3461 p = kzalloc(16, GFP_KERNEL);
3462 p[16] = 0x12;
3463 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
3464 " 0x12->0x%p\n\n", p + 16);
3465
3466 validate_slab_cache(kmalloc_caches + 4);
3467
3468 /* Hmmm... The next two are dangerous */
3469 p = kzalloc(32, GFP_KERNEL);
3470 p[32 + sizeof(void *)] = 0x34;
3471 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
3472 " 0x34 -> -0x%p\n", p);
3473 printk(KERN_ERR
3474 "If allocated object is overwritten then not detectable\n\n");
3475
3476 validate_slab_cache(kmalloc_caches + 5);
3477 p = kzalloc(64, GFP_KERNEL);
3478 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
3479 *p = 0x56;
3480 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
3481 p);
3482 printk(KERN_ERR
3483 "If allocated object is overwritten then not detectable\n\n");
3484 validate_slab_cache(kmalloc_caches + 6);
3485
3486 printk(KERN_ERR "\nB. Corruption after free\n");
3487 p = kzalloc(128, GFP_KERNEL);
3488 kfree(p);
3489 *p = 0x78;
3490 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
3491 validate_slab_cache(kmalloc_caches + 7);
3492
3493 p = kzalloc(256, GFP_KERNEL);
3494 kfree(p);
3495 p[50] = 0x9a;
3496 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
3497 p);
3498 validate_slab_cache(kmalloc_caches + 8);
3499
3500 p = kzalloc(512, GFP_KERNEL);
3501 kfree(p);
3502 p[512] = 0xab;
3503 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
3504 validate_slab_cache(kmalloc_caches + 9);
3505}
3506#else
3507static void resiliency_test(void) {};
3508#endif
3509
3510/* 3479/*
3511 * Generate lists of code addresses where slabcache objects are allocated 3480 * Generate lists of code addresses where slabcache objects are allocated
3512 * and freed. 3481 * and freed.
@@ -3635,7 +3604,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
3635 3604
3636static void process_slab(struct loc_track *t, struct kmem_cache *s, 3605static void process_slab(struct loc_track *t, struct kmem_cache *s,
3637 struct page *page, enum track_item alloc, 3606 struct page *page, enum track_item alloc,
3638 long *map) 3607 unsigned long *map)
3639{ 3608{
3640 void *addr = page_address(page); 3609 void *addr = page_address(page);
3641 void *p; 3610 void *p;
@@ -3735,7 +3704,71 @@ static int list_locations(struct kmem_cache *s, char *buf,
3735 len += sprintf(buf, "No data\n"); 3704 len += sprintf(buf, "No data\n");
3736 return len; 3705 return len;
3737} 3706}
3707#endif
3708
3709#ifdef SLUB_RESILIENCY_TEST
3710static void resiliency_test(void)
3711{
3712 u8 *p;
3738 3713
3714 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10);
3715
3716 printk(KERN_ERR "SLUB resiliency testing\n");
3717 printk(KERN_ERR "-----------------------\n");
3718 printk(KERN_ERR "A. Corruption after allocation\n");
3719
3720 p = kzalloc(16, GFP_KERNEL);
3721 p[16] = 0x12;
3722 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
3723 " 0x12->0x%p\n\n", p + 16);
3724
3725 validate_slab_cache(kmalloc_caches[4]);
3726
3727 /* Hmmm... The next two are dangerous */
3728 p = kzalloc(32, GFP_KERNEL);
3729 p[32 + sizeof(void *)] = 0x34;
3730 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
3731 " 0x34 -> -0x%p\n", p);
3732 printk(KERN_ERR
3733 "If allocated object is overwritten then not detectable\n\n");
3734
3735 validate_slab_cache(kmalloc_caches[5]);
3736 p = kzalloc(64, GFP_KERNEL);
3737 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
3738 *p = 0x56;
3739 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
3740 p);
3741 printk(KERN_ERR
3742 "If allocated object is overwritten then not detectable\n\n");
3743 validate_slab_cache(kmalloc_caches[6]);
3744
3745 printk(KERN_ERR "\nB. Corruption after free\n");
3746 p = kzalloc(128, GFP_KERNEL);
3747 kfree(p);
3748 *p = 0x78;
3749 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
3750 validate_slab_cache(kmalloc_caches[7]);
3751
3752 p = kzalloc(256, GFP_KERNEL);
3753 kfree(p);
3754 p[50] = 0x9a;
3755 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
3756 p);
3757 validate_slab_cache(kmalloc_caches[8]);
3758
3759 p = kzalloc(512, GFP_KERNEL);
3760 kfree(p);
3761 p[512] = 0xab;
3762 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
3763 validate_slab_cache(kmalloc_caches[9]);
3764}
3765#else
3766#ifdef CONFIG_SYSFS
3767static void resiliency_test(void) {};
3768#endif
3769#endif
3770
3771#ifdef CONFIG_SYSFS
3739enum slab_stat_type { 3772enum slab_stat_type {
3740 SL_ALL, /* All slabs */ 3773 SL_ALL, /* All slabs */
3741 SL_PARTIAL, /* Only partially allocated slabs */ 3774 SL_PARTIAL, /* Only partially allocated slabs */
@@ -3788,6 +3821,8 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3788 } 3821 }
3789 } 3822 }
3790 3823
3824 down_read(&slub_lock);
3825#ifdef CONFIG_SLUB_DEBUG
3791 if (flags & SO_ALL) { 3826 if (flags & SO_ALL) {
3792 for_each_node_state(node, N_NORMAL_MEMORY) { 3827 for_each_node_state(node, N_NORMAL_MEMORY) {
3793 struct kmem_cache_node *n = get_node(s, node); 3828 struct kmem_cache_node *n = get_node(s, node);
@@ -3804,7 +3839,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3804 nodes[node] += x; 3839 nodes[node] += x;
3805 } 3840 }
3806 3841
3807 } else if (flags & SO_PARTIAL) { 3842 } else
3843#endif
3844 if (flags & SO_PARTIAL) {
3808 for_each_node_state(node, N_NORMAL_MEMORY) { 3845 for_each_node_state(node, N_NORMAL_MEMORY) {
3809 struct kmem_cache_node *n = get_node(s, node); 3846 struct kmem_cache_node *n = get_node(s, node);
3810 3847
@@ -3829,6 +3866,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3829 return x + sprintf(buf + x, "\n"); 3866 return x + sprintf(buf + x, "\n");
3830} 3867}
3831 3868
3869#ifdef CONFIG_SLUB_DEBUG
3832static int any_slab_objects(struct kmem_cache *s) 3870static int any_slab_objects(struct kmem_cache *s)
3833{ 3871{
3834 int node; 3872 int node;
@@ -3844,6 +3882,7 @@ static int any_slab_objects(struct kmem_cache *s)
3844 } 3882 }
3845 return 0; 3883 return 0;
3846} 3884}
3885#endif
3847 3886
3848#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 3887#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
3849#define to_slab(n) container_of(n, struct kmem_cache, kobj); 3888#define to_slab(n) container_of(n, struct kmem_cache, kobj);
@@ -3945,12 +3984,6 @@ static ssize_t aliases_show(struct kmem_cache *s, char *buf)
3945} 3984}
3946SLAB_ATTR_RO(aliases); 3985SLAB_ATTR_RO(aliases);
3947 3986
3948static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3949{
3950 return show_slab_objects(s, buf, SO_ALL);
3951}
3952SLAB_ATTR_RO(slabs);
3953
3954static ssize_t partial_show(struct kmem_cache *s, char *buf) 3987static ssize_t partial_show(struct kmem_cache *s, char *buf)
3955{ 3988{
3956 return show_slab_objects(s, buf, SO_PARTIAL); 3989 return show_slab_objects(s, buf, SO_PARTIAL);
@@ -3975,93 +4008,83 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
3975} 4008}
3976SLAB_ATTR_RO(objects_partial); 4009SLAB_ATTR_RO(objects_partial);
3977 4010
3978static ssize_t total_objects_show(struct kmem_cache *s, char *buf) 4011static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3979{
3980 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
3981}
3982SLAB_ATTR_RO(total_objects);
3983
3984static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
3985{ 4012{
3986 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 4013 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3987} 4014}
3988 4015
3989static ssize_t sanity_checks_store(struct kmem_cache *s, 4016static ssize_t reclaim_account_store(struct kmem_cache *s,
3990 const char *buf, size_t length) 4017 const char *buf, size_t length)
3991{ 4018{
3992 s->flags &= ~SLAB_DEBUG_FREE; 4019 s->flags &= ~SLAB_RECLAIM_ACCOUNT;
3993 if (buf[0] == '1') 4020 if (buf[0] == '1')
3994 s->flags |= SLAB_DEBUG_FREE; 4021 s->flags |= SLAB_RECLAIM_ACCOUNT;
3995 return length; 4022 return length;
3996} 4023}
3997SLAB_ATTR(sanity_checks); 4024SLAB_ATTR(reclaim_account);
3998 4025
3999static ssize_t trace_show(struct kmem_cache *s, char *buf) 4026static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
4000{ 4027{
4001 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 4028 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
4002} 4029}
4030SLAB_ATTR_RO(hwcache_align);
4003 4031
4004static ssize_t trace_store(struct kmem_cache *s, const char *buf, 4032#ifdef CONFIG_ZONE_DMA
4005 size_t length) 4033static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
4006{ 4034{
4007 s->flags &= ~SLAB_TRACE; 4035 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
4008 if (buf[0] == '1')
4009 s->flags |= SLAB_TRACE;
4010 return length;
4011} 4036}
4012SLAB_ATTR(trace); 4037SLAB_ATTR_RO(cache_dma);
4038#endif
4013 4039
4014#ifdef CONFIG_FAILSLAB 4040static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
4015static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4016{ 4041{
4017 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); 4042 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
4018} 4043}
4044SLAB_ATTR_RO(destroy_by_rcu);
4019 4045
4020static ssize_t failslab_store(struct kmem_cache *s, const char *buf, 4046#ifdef CONFIG_SLUB_DEBUG
4021 size_t length) 4047static ssize_t slabs_show(struct kmem_cache *s, char *buf)
4022{ 4048{
4023 s->flags &= ~SLAB_FAILSLAB; 4049 return show_slab_objects(s, buf, SO_ALL);
4024 if (buf[0] == '1')
4025 s->flags |= SLAB_FAILSLAB;
4026 return length;
4027} 4050}
4028SLAB_ATTR(failslab); 4051SLAB_ATTR_RO(slabs);
4029#endif
4030 4052
4031static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4053static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
4032{ 4054{
4033 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4055 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
4034} 4056}
4057SLAB_ATTR_RO(total_objects);
4035 4058
4036static ssize_t reclaim_account_store(struct kmem_cache *s, 4059static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
4037 const char *buf, size_t length)
4038{ 4060{
4039 s->flags &= ~SLAB_RECLAIM_ACCOUNT; 4061 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
4040 if (buf[0] == '1')
4041 s->flags |= SLAB_RECLAIM_ACCOUNT;
4042 return length;
4043} 4062}
4044SLAB_ATTR(reclaim_account);
4045 4063
4046static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 4064static ssize_t sanity_checks_store(struct kmem_cache *s,
4065 const char *buf, size_t length)
4047{ 4066{
4048 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 4067 s->flags &= ~SLAB_DEBUG_FREE;
4068 if (buf[0] == '1')
4069 s->flags |= SLAB_DEBUG_FREE;
4070 return length;
4049} 4071}
4050SLAB_ATTR_RO(hwcache_align); 4072SLAB_ATTR(sanity_checks);
4051 4073
4052#ifdef CONFIG_ZONE_DMA 4074static ssize_t trace_show(struct kmem_cache *s, char *buf)
4053static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
4054{ 4075{
4055 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 4076 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
4056} 4077}
4057SLAB_ATTR_RO(cache_dma);
4058#endif
4059 4078
4060static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 4079static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4080 size_t length)
4061{ 4081{
4062 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); 4082 s->flags &= ~SLAB_TRACE;
4083 if (buf[0] == '1')
4084 s->flags |= SLAB_TRACE;
4085 return length;
4063} 4086}
4064SLAB_ATTR_RO(destroy_by_rcu); 4087SLAB_ATTR(trace);
4065 4088
4066static ssize_t red_zone_show(struct kmem_cache *s, char *buf) 4089static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
4067{ 4090{
@@ -4139,6 +4162,40 @@ static ssize_t validate_store(struct kmem_cache *s,
4139} 4162}
4140SLAB_ATTR(validate); 4163SLAB_ATTR(validate);
4141 4164
4165static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
4166{
4167 if (!(s->flags & SLAB_STORE_USER))
4168 return -ENOSYS;
4169 return list_locations(s, buf, TRACK_ALLOC);
4170}
4171SLAB_ATTR_RO(alloc_calls);
4172
4173static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
4174{
4175 if (!(s->flags & SLAB_STORE_USER))
4176 return -ENOSYS;
4177 return list_locations(s, buf, TRACK_FREE);
4178}
4179SLAB_ATTR_RO(free_calls);
4180#endif /* CONFIG_SLUB_DEBUG */
4181
4182#ifdef CONFIG_FAILSLAB
4183static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4184{
4185 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
4186}
4187
4188static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4189 size_t length)
4190{
4191 s->flags &= ~SLAB_FAILSLAB;
4192 if (buf[0] == '1')
4193 s->flags |= SLAB_FAILSLAB;
4194 return length;
4195}
4196SLAB_ATTR(failslab);
4197#endif
4198
4142static ssize_t shrink_show(struct kmem_cache *s, char *buf) 4199static ssize_t shrink_show(struct kmem_cache *s, char *buf)
4143{ 4200{
4144 return 0; 4201 return 0;
@@ -4158,22 +4215,6 @@ static ssize_t shrink_store(struct kmem_cache *s,
4158} 4215}
4159SLAB_ATTR(shrink); 4216SLAB_ATTR(shrink);
4160 4217
4161static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
4162{
4163 if (!(s->flags & SLAB_STORE_USER))
4164 return -ENOSYS;
4165 return list_locations(s, buf, TRACK_ALLOC);
4166}
4167SLAB_ATTR_RO(alloc_calls);
4168
4169static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
4170{
4171 if (!(s->flags & SLAB_STORE_USER))
4172 return -ENOSYS;
4173 return list_locations(s, buf, TRACK_FREE);
4174}
4175SLAB_ATTR_RO(free_calls);
4176
4177#ifdef CONFIG_NUMA 4218#ifdef CONFIG_NUMA
4178static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) 4219static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
4179{ 4220{
@@ -4279,25 +4320,27 @@ static struct attribute *slab_attrs[] = {
4279 &min_partial_attr.attr, 4320 &min_partial_attr.attr,
4280 &objects_attr.attr, 4321 &objects_attr.attr,
4281 &objects_partial_attr.attr, 4322 &objects_partial_attr.attr,
4282 &total_objects_attr.attr,
4283 &slabs_attr.attr,
4284 &partial_attr.attr, 4323 &partial_attr.attr,
4285 &cpu_slabs_attr.attr, 4324 &cpu_slabs_attr.attr,
4286 &ctor_attr.attr, 4325 &ctor_attr.attr,
4287 &aliases_attr.attr, 4326 &aliases_attr.attr,
4288 &align_attr.attr, 4327 &align_attr.attr,
4289 &sanity_checks_attr.attr,
4290 &trace_attr.attr,
4291 &hwcache_align_attr.attr, 4328 &hwcache_align_attr.attr,
4292 &reclaim_account_attr.attr, 4329 &reclaim_account_attr.attr,
4293 &destroy_by_rcu_attr.attr, 4330 &destroy_by_rcu_attr.attr,
4331 &shrink_attr.attr,
4332#ifdef CONFIG_SLUB_DEBUG
4333 &total_objects_attr.attr,
4334 &slabs_attr.attr,
4335 &sanity_checks_attr.attr,
4336 &trace_attr.attr,
4294 &red_zone_attr.attr, 4337 &red_zone_attr.attr,
4295 &poison_attr.attr, 4338 &poison_attr.attr,
4296 &store_user_attr.attr, 4339 &store_user_attr.attr,
4297 &validate_attr.attr, 4340 &validate_attr.attr,
4298 &shrink_attr.attr,
4299 &alloc_calls_attr.attr, 4341 &alloc_calls_attr.attr,
4300 &free_calls_attr.attr, 4342 &free_calls_attr.attr,
4343#endif
4301#ifdef CONFIG_ZONE_DMA 4344#ifdef CONFIG_ZONE_DMA
4302 &cache_dma_attr.attr, 4345 &cache_dma_attr.attr,
4303#endif 4346#endif
@@ -4377,6 +4420,7 @@ static void kmem_cache_release(struct kobject *kobj)
4377{ 4420{
4378 struct kmem_cache *s = to_slab(kobj); 4421 struct kmem_cache *s = to_slab(kobj);
4379 4422
4423 kfree(s->name);
4380 kfree(s); 4424 kfree(s);
4381} 4425}
4382 4426
@@ -4579,7 +4623,7 @@ static int __init slab_sysfs_init(void)
4579} 4623}
4580 4624
4581__initcall(slab_sysfs_init); 4625__initcall(slab_sysfs_init);
4582#endif 4626#endif /* CONFIG_SYSFS */
4583 4627
4584/* 4628/*
4585 * The /proc/slabinfo ABI 4629 * The /proc/slabinfo ABI
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index aa33fd67fa41..29d6cbffb283 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -220,18 +220,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
220 220
221 if (vmemmap_buf_start) { 221 if (vmemmap_buf_start) {
222 /* need to free left buf */ 222 /* need to free left buf */
223#ifdef CONFIG_NO_BOOTMEM
224 free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end));
225 if (vmemmap_buf_start < vmemmap_buf) {
226 char name[15];
227
228 snprintf(name, sizeof(name), "MEMMAP %d", nodeid);
229 reserve_early_without_check(__pa(vmemmap_buf_start),
230 __pa(vmemmap_buf), name);
231 }
232#else
233 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); 223 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
234#endif
235 vmemmap_buf = NULL; 224 vmemmap_buf = NULL;
236 vmemmap_buf_end = NULL; 225 vmemmap_buf_end = NULL;
237 } 226 }
diff --git a/mm/swap.c b/mm/swap.c
index 3ce7bc373a52..3f4854205b16 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -378,6 +378,7 @@ void release_pages(struct page **pages, int nr, int cold)
378 378
379 pagevec_free(&pages_to_free); 379 pagevec_free(&pages_to_free);
380} 380}
381EXPORT_SYMBOL(release_pages);
381 382
382/* 383/*
383 * The pages which we're about to release may be in the deferred lru-addition 384 * The pages which we're about to release may be in the deferred lru-addition
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 7c703ff2f36f..67ddaaf98c74 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -30,6 +30,7 @@
30#include <linux/capability.h> 30#include <linux/capability.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/poll.h>
33 34
34#include <asm/pgtable.h> 35#include <asm/pgtable.h>
35#include <asm/tlbflush.h> 36#include <asm/tlbflush.h>
@@ -58,6 +59,10 @@ static struct swap_info_struct *swap_info[MAX_SWAPFILES];
58 59
59static DEFINE_MUTEX(swapon_mutex); 60static DEFINE_MUTEX(swapon_mutex);
60 61
62static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
63/* Activity counter to indicate that a swapon or swapoff has occurred */
64static atomic_t proc_poll_event = ATOMIC_INIT(0);
65
61static inline unsigned char swap_count(unsigned char ent) 66static inline unsigned char swap_count(unsigned char ent)
62{ 67{
63 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ 68 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
@@ -139,7 +144,7 @@ static int discard_swap(struct swap_info_struct *si)
139 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); 144 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
140 if (nr_blocks) { 145 if (nr_blocks) {
141 err = blkdev_issue_discard(si->bdev, start_block, 146 err = blkdev_issue_discard(si->bdev, start_block,
142 nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); 147 nr_blocks, GFP_KERNEL, 0);
143 if (err) 148 if (err)
144 return err; 149 return err;
145 cond_resched(); 150 cond_resched();
@@ -150,7 +155,7 @@ static int discard_swap(struct swap_info_struct *si)
150 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 155 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
151 156
152 err = blkdev_issue_discard(si->bdev, start_block, 157 err = blkdev_issue_discard(si->bdev, start_block,
153 nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); 158 nr_blocks, GFP_KERNEL, 0);
154 if (err) 159 if (err)
155 break; 160 break;
156 161
@@ -189,7 +194,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
189 start_block <<= PAGE_SHIFT - 9; 194 start_block <<= PAGE_SHIFT - 9;
190 nr_blocks <<= PAGE_SHIFT - 9; 195 nr_blocks <<= PAGE_SHIFT - 9;
191 if (blkdev_issue_discard(si->bdev, start_block, 196 if (blkdev_issue_discard(si->bdev, start_block,
192 nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT)) 197 nr_blocks, GFP_NOIO, 0))
193 break; 198 break;
194 } 199 }
195 200
@@ -1680,6 +1685,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1680 } 1685 }
1681 filp_close(swap_file, NULL); 1686 filp_close(swap_file, NULL);
1682 err = 0; 1687 err = 0;
1688 atomic_inc(&proc_poll_event);
1689 wake_up_interruptible(&proc_poll_wait);
1683 1690
1684out_dput: 1691out_dput:
1685 filp_close(victim, NULL); 1692 filp_close(victim, NULL);
@@ -1688,6 +1695,25 @@ out:
1688} 1695}
1689 1696
1690#ifdef CONFIG_PROC_FS 1697#ifdef CONFIG_PROC_FS
1698struct proc_swaps {
1699 struct seq_file seq;
1700 int event;
1701};
1702
1703static unsigned swaps_poll(struct file *file, poll_table *wait)
1704{
1705 struct proc_swaps *s = file->private_data;
1706
1707 poll_wait(file, &proc_poll_wait, wait);
1708
1709 if (s->event != atomic_read(&proc_poll_event)) {
1710 s->event = atomic_read(&proc_poll_event);
1711 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1712 }
1713
1714 return POLLIN | POLLRDNORM;
1715}
1716
1691/* iterator */ 1717/* iterator */
1692static void *swap_start(struct seq_file *swap, loff_t *pos) 1718static void *swap_start(struct seq_file *swap, loff_t *pos)
1693{ 1719{
@@ -1771,7 +1797,24 @@ static const struct seq_operations swaps_op = {
1771 1797
1772static int swaps_open(struct inode *inode, struct file *file) 1798static int swaps_open(struct inode *inode, struct file *file)
1773{ 1799{
1774 return seq_open(file, &swaps_op); 1800 struct proc_swaps *s;
1801 int ret;
1802
1803 s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
1804 if (!s)
1805 return -ENOMEM;
1806
1807 file->private_data = s;
1808
1809 ret = seq_open(file, &swaps_op);
1810 if (ret) {
1811 kfree(s);
1812 return ret;
1813 }
1814
1815 s->seq.private = s;
1816 s->event = atomic_read(&proc_poll_event);
1817 return ret;
1775} 1818}
1776 1819
1777static const struct file_operations proc_swaps_operations = { 1820static const struct file_operations proc_swaps_operations = {
@@ -1779,6 +1822,7 @@ static const struct file_operations proc_swaps_operations = {
1779 .read = seq_read, 1822 .read = seq_read,
1780 .llseek = seq_lseek, 1823 .llseek = seq_lseek,
1781 .release = seq_release, 1824 .release = seq_release,
1825 .poll = swaps_poll,
1782}; 1826};
1783 1827
1784static int __init procswaps_init(void) 1828static int __init procswaps_init(void)
@@ -2084,6 +2128,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2084 swap_info[prev]->next = type; 2128 swap_info[prev]->next = type;
2085 spin_unlock(&swap_lock); 2129 spin_unlock(&swap_lock);
2086 mutex_unlock(&swapon_mutex); 2130 mutex_unlock(&swapon_mutex);
2131 atomic_inc(&proc_poll_event);
2132 wake_up_interruptible(&proc_poll_wait);
2133
2087 error = 0; 2134 error = 0;
2088 goto out; 2135 goto out;
2089bad_swap: 2136bad_swap:
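
The swapfile.c changes above add an event counter and a wait queue so that /proc/swaps becomes pollable: every swapon()/swapoff() bumps proc_poll_event and wakes the queue, and swaps_poll() reports POLLERR | POLLPRI to any opener whose recorded event count is stale. A minimal userspace watcher along those lines (a sketch with error handling trimmed; it relies only on standard open/poll/lseek/read semantics):

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fd = open("/proc/swaps", O_RDONLY);

	if (fd < 0)
		return 1;

	for (;;) {
		/* Dump the current swap table (truncated at 4 KiB here). */
		lseek(fd, 0, SEEK_SET);
		ssize_t n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			fputs(buf, stdout);
		}

		/* Block until swaps_poll() flags a swapon/swapoff event. */
		struct pollfd pfd = { .fd = fd, .events = POLLPRI };
		if (poll(&pfd, 1, -1) < 0)
			break;
	}
	close(fd);
	return 0;
}
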
diff --git a/mm/util.c b/mm/util.c
index 4735ea481816..73dac81e9f78 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -245,6 +245,19 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
245} 245}
246#endif 246#endif
247 247
248/*
249 * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
250 * back to the regular GUP.
251 * If the architecture not support this fucntion, simply return with no
252 * page pinned
253 */
254int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
255 int nr_pages, int write, struct page **pages)
256{
257 return 0;
258}
259EXPORT_SYMBOL_GPL(__get_user_pages_fast);
260
248/** 261/**
249 * get_user_pages_fast() - pin user pages in memory 262 * get_user_pages_fast() - pin user pages in memory
250 * @start: starting user address 263 * @start: starting user address
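
The mm/util.c hunk above adds __get_user_pages_fast() as a weak symbol: architectures without a lockless fast-GUP path inherit a stub that pins nothing, so callers fall back to the regular get_user_pages() route, while architectures that do implement it override the stub at link time. The weak/strong override mechanism itself is easy to see in a userspace sketch (function name invented; GCC/Clang __attribute__((weak)) assumed):

#include <stdio.h>

/* Weak default: reports that zero pages were pinned. */
__attribute__((weak)) int fast_pin_pages(int nr_pages)
{
	return 0;
}

/*
 * A build that provides the fast path would define a strong symbol with the
 * same signature in another object file, and the linker would prefer it:
 *
 *	int fast_pin_pages(int nr_pages) { ... return nr_pages; }
 */

int main(void)
{
	if (fast_pin_pages(8) == 0)
		printf("fast path unavailable, falling back to the slow path\n");
	return 0;
}
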
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6b8889da69a6..a3d66b3dc5cb 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -293,13 +293,13 @@ static void __insert_vmap_area(struct vmap_area *va)
293 struct rb_node *tmp; 293 struct rb_node *tmp;
294 294
295 while (*p) { 295 while (*p) {
296 struct vmap_area *tmp; 296 struct vmap_area *tmp_va;
297 297
298 parent = *p; 298 parent = *p;
299 tmp = rb_entry(parent, struct vmap_area, rb_node); 299 tmp_va = rb_entry(parent, struct vmap_area, rb_node);
300 if (va->va_start < tmp->va_end) 300 if (va->va_start < tmp_va->va_end)
301 p = &(*p)->rb_left; 301 p = &(*p)->rb_left;
302 else if (va->va_end > tmp->va_start) 302 else if (va->va_end > tmp_va->va_start)
303 p = &(*p)->rb_right; 303 p = &(*p)->rb_right;
304 else 304 else
305 BUG(); 305 BUG();
@@ -517,6 +517,15 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
517static void purge_fragmented_blocks_allcpus(void); 517static void purge_fragmented_blocks_allcpus(void);
518 518
519/* 519/*
520 * called before a call to iounmap() if the caller wants vm_area_struct's
521 * immediately freed.
522 */
523void set_iounmap_nonlazy(void)
524{
525 atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
526}
527
528/*
520 * Purges all lazily-freed vmap areas. 529 * Purges all lazily-freed vmap areas.
521 * 530 *
522 * If sync is 0 then don't purge if there is already a purge in progress. 531 * If sync is 0 then don't purge if there is already a purge in progress.
@@ -1587,6 +1596,13 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1587} 1596}
1588EXPORT_SYMBOL(__vmalloc); 1597EXPORT_SYMBOL(__vmalloc);
1589 1598
1599static inline void *__vmalloc_node_flags(unsigned long size,
1600 int node, gfp_t flags)
1601{
1602 return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
1603 node, __builtin_return_address(0));
1604}
1605
1590/** 1606/**
1591 * vmalloc - allocate virtually contiguous memory 1607 * vmalloc - allocate virtually contiguous memory
1592 * @size: allocation size 1608 * @size: allocation size
@@ -1598,12 +1614,28 @@ EXPORT_SYMBOL(__vmalloc);
1598 */ 1614 */
1599void *vmalloc(unsigned long size) 1615void *vmalloc(unsigned long size)
1600{ 1616{
1601 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1617 return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM);
1602 -1, __builtin_return_address(0));
1603} 1618}
1604EXPORT_SYMBOL(vmalloc); 1619EXPORT_SYMBOL(vmalloc);
1605 1620
1606/** 1621/**
1622 * vzalloc - allocate virtually contiguous memory with zero fill
1623 * @size: allocation size
1624 * Allocate enough pages to cover @size from the page level
1625 * allocator and map them into contiguous kernel virtual space.
1626 * The memory allocated is set to zero.
1627 *
1628 * For tight control over page level allocator and protection flags
1629 * use __vmalloc() instead.
1630 */
1631void *vzalloc(unsigned long size)
1632{
1633 return __vmalloc_node_flags(size, -1,
1634 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
1635}
1636EXPORT_SYMBOL(vzalloc);
1637
1638/**
1607 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace 1639 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
1608 * @size: allocation size 1640 * @size: allocation size
1609 * 1641 *
@@ -1644,6 +1676,25 @@ void *vmalloc_node(unsigned long size, int node)
1644} 1676}
1645EXPORT_SYMBOL(vmalloc_node); 1677EXPORT_SYMBOL(vmalloc_node);
1646 1678
1679/**
1680 * vzalloc_node - allocate memory on a specific node with zero fill
1681 * @size: allocation size
1682 * @node: numa node
1683 *
1684 * Allocate enough pages to cover @size from the page level
1685 * allocator and map them into contiguous kernel virtual space.
1686 * The memory allocated is set to zero.
1687 *
1688 * For tight control over page level allocator and protection flags
1689 * use __vmalloc_node() instead.
1690 */
1691void *vzalloc_node(unsigned long size, int node)
1692{
1693 return __vmalloc_node_flags(size, node,
1694 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
1695}
1696EXPORT_SYMBOL(vzalloc_node);
1697
1647#ifndef PAGE_KERNEL_EXEC 1698#ifndef PAGE_KERNEL_EXEC
1648# define PAGE_KERNEL_EXEC PAGE_KERNEL 1699# define PAGE_KERNEL_EXEC PAGE_KERNEL
1649#endif 1700#endif
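
vzalloc() and vzalloc_node() above are thin wrappers that route through __vmalloc_node_flags() with __GFP_ZERO set, replacing the common vmalloc()-followed-by-memset() pattern in callers. An uncompiled kernel-module-style sketch of how a caller might use the new helper (module name and allocation size invented for illustration):

#include <linux/init.h>
#include <linux/module.h>
#include <linux/vmalloc.h>

static void *table;

static int __init vzalloc_demo_init(void)
{
	/* One virtually contiguous, pre-zeroed megabyte; previously this
	 * would have been vmalloc(1UL << 20) plus an explicit memset(). */
	table = vzalloc(1UL << 20);
	if (!table)
		return -ENOMEM;
	return 0;
}

static void __exit vzalloc_demo_exit(void)
{
	vfree(table);
}

module_init(vzalloc_demo_init);
module_exit(vzalloc_demo_exit);
MODULE_LICENSE("GPL");
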
@@ -2056,6 +2107,7 @@ void free_vm_area(struct vm_struct *area)
2056} 2107}
2057EXPORT_SYMBOL_GPL(free_vm_area); 2108EXPORT_SYMBOL_GPL(free_vm_area);
2058 2109
2110#ifdef CONFIG_SMP
2059static struct vmap_area *node_to_va(struct rb_node *n) 2111static struct vmap_area *node_to_va(struct rb_node *n)
2060{ 2112{
2061 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; 2113 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
@@ -2336,9 +2388,11 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2336 free_vm_area(vms[i]); 2388 free_vm_area(vms[i]);
2337 kfree(vms); 2389 kfree(vms);
2338} 2390}
2391#endif /* CONFIG_SMP */
2339 2392
2340#ifdef CONFIG_PROC_FS 2393#ifdef CONFIG_PROC_FS
2341static void *s_start(struct seq_file *m, loff_t *pos) 2394static void *s_start(struct seq_file *m, loff_t *pos)
2395 __acquires(&vmlist_lock)
2342{ 2396{
2343 loff_t n = *pos; 2397 loff_t n = *pos;
2344 struct vm_struct *v; 2398 struct vm_struct *v;
@@ -2365,6 +2419,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
2365} 2419}
2366 2420
2367static void s_stop(struct seq_file *m, void *p) 2421static void s_stop(struct seq_file *m, void *p)
2422 __releases(&vmlist_lock)
2368{ 2423{
2369 read_unlock(&vmlist_lock); 2424 read_unlock(&vmlist_lock);
2370} 2425}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c5dfabf25f11..b8a6fdc21312 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -51,6 +51,12 @@
51#define CREATE_TRACE_POINTS 51#define CREATE_TRACE_POINTS
52#include <trace/events/vmscan.h> 52#include <trace/events/vmscan.h>
53 53
54enum lumpy_mode {
55 LUMPY_MODE_NONE,
56 LUMPY_MODE_ASYNC,
57 LUMPY_MODE_SYNC,
58};
59
54struct scan_control { 60struct scan_control {
55 /* Incremented by the number of inactive pages that were scanned */ 61 /* Incremented by the number of inactive pages that were scanned */
56 unsigned long nr_scanned; 62 unsigned long nr_scanned;
@@ -79,10 +85,10 @@ struct scan_control {
79 int order; 85 int order;
80 86
81 /* 87 /*
82 * Intend to reclaim enough contenious memory rather than to reclaim 88 * Intend to reclaim enough continuous memory rather than reclaim
83 * enough amount memory. I.e, it's the mode for high order allocation. 89 * enough amount of memory. i.e, mode for high order allocation.
84 */ 90 */
85 bool lumpy_reclaim_mode; 91 enum lumpy_mode lumpy_reclaim_mode;
86 92
87 /* Which cgroup do we reclaim from */ 93 /* Which cgroup do we reclaim from */
88 struct mem_cgroup *mem_cgroup; 94 struct mem_cgroup *mem_cgroup;
@@ -265,6 +271,36 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
265 return ret; 271 return ret;
266} 272}
267 273
274static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc,
275 bool sync)
276{
277 enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;
278
279 /*
280	 * Some reclaim attempts have already failed. It is not worth trying synchronous
281 * lumpy reclaim.
282 */
283 if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
284 return;
285
286 /*
287 * If we need a large contiguous chunk of memory, or have
288 * trouble getting a small set of contiguous pages, we
289 * will reclaim both active and inactive pages.
290 */
291 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
292 sc->lumpy_reclaim_mode = mode;
293 else if (sc->order && priority < DEF_PRIORITY - 2)
294 sc->lumpy_reclaim_mode = mode;
295 else
296 sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
297}
298
299static void disable_lumpy_reclaim_mode(struct scan_control *sc)
300{
301 sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
302}
303
268static inline int is_page_cache_freeable(struct page *page) 304static inline int is_page_cache_freeable(struct page *page)
269{ 305{
270 /* 306 /*
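
The lumpy_mode enum introduced above replaces the old boolean flag: async lumpy reclaim is engaged when the allocation being serviced is a costly high-order one, or when a smaller high-order request is still failing at low priority, and the sync variant is only ever an escalation of an already-active async mode. A standalone sketch of that decision, with PAGE_ALLOC_COSTLY_ORDER = 3 and DEF_PRIORITY = 12 assumed as the usual kernel values:

#include <stdio.h>

enum lumpy_mode { LUMPY_MODE_NONE, LUMPY_MODE_ASYNC, LUMPY_MODE_SYNC };

#define PAGE_ALLOC_COSTLY_ORDER	3	/* assumed kernel value */
#define DEF_PRIORITY		12	/* assumed kernel value */

static enum lumpy_mode pick_mode(int order, int priority, int sync,
				 enum lumpy_mode current_mode)
{
	enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;

	/* Sync is only an escalation: if async lumpy reclaim was never
	 * engaged, a synchronous retry is not worth it. */
	if (sync && current_mode == LUMPY_MODE_NONE)
		return current_mode;

	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return mode;		/* large contiguous chunk needed */
	if (order && priority < DEF_PRIORITY - 2)
		return mode;		/* small order, but reclaim is struggling */
	return LUMPY_MODE_NONE;
}

int main(void)
{
	printf("order 0, prio 12: mode %d\n", pick_mode(0, 12, 0, LUMPY_MODE_NONE));
	printf("order 4, prio 12: mode %d\n", pick_mode(4, 12, 0, LUMPY_MODE_NONE));
	printf("order 2, prio  8: mode %d\n", pick_mode(2, 8, 0, LUMPY_MODE_NONE));
	printf("order 2, prio  8, sync retry: mode %d\n",
	       pick_mode(2, 8, 1, LUMPY_MODE_ASYNC));
	return 0;
}
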
@@ -275,7 +311,8 @@ static inline int is_page_cache_freeable(struct page *page)
275 return page_count(page) - page_has_private(page) == 2; 311 return page_count(page) - page_has_private(page) == 2;
276} 312}
277 313
278static int may_write_to_queue(struct backing_dev_info *bdi) 314static int may_write_to_queue(struct backing_dev_info *bdi,
315 struct scan_control *sc)
279{ 316{
280 if (current->flags & PF_SWAPWRITE) 317 if (current->flags & PF_SWAPWRITE)
281 return 1; 318 return 1;
@@ -283,6 +320,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
283 return 1; 320 return 1;
284 if (bdi == current->backing_dev_info) 321 if (bdi == current->backing_dev_info)
285 return 1; 322 return 1;
323
324 /* lumpy reclaim for hugepage often need a lot of write */
325 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
326 return 1;
286 return 0; 327 return 0;
287} 328}
288 329
@@ -307,12 +348,6 @@ static void handle_write_error(struct address_space *mapping,
307 unlock_page(page); 348 unlock_page(page);
308} 349}
309 350
310/* Request for sync pageout. */
311enum pageout_io {
312 PAGEOUT_IO_ASYNC,
313 PAGEOUT_IO_SYNC,
314};
315
316/* possible outcome of pageout() */ 351/* possible outcome of pageout() */
317typedef enum { 352typedef enum {
318 /* failed to write page out, page is locked */ 353 /* failed to write page out, page is locked */
@@ -330,7 +365,7 @@ typedef enum {
330 * Calls ->writepage(). 365 * Calls ->writepage().
331 */ 366 */
332static pageout_t pageout(struct page *page, struct address_space *mapping, 367static pageout_t pageout(struct page *page, struct address_space *mapping,
333 enum pageout_io sync_writeback) 368 struct scan_control *sc)
334{ 369{
335 /* 370 /*
336 * If the page is dirty, only perform writeback if that write 371 * If the page is dirty, only perform writeback if that write
@@ -366,7 +401,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
366 } 401 }
367 if (mapping->a_ops->writepage == NULL) 402 if (mapping->a_ops->writepage == NULL)
368 return PAGE_ACTIVATE; 403 return PAGE_ACTIVATE;
369 if (!may_write_to_queue(mapping->backing_dev_info)) 404 if (!may_write_to_queue(mapping->backing_dev_info, sc))
370 return PAGE_KEEP; 405 return PAGE_KEEP;
371 406
372 if (clear_page_dirty_for_io(page)) { 407 if (clear_page_dirty_for_io(page)) {
@@ -376,7 +411,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
376 .nr_to_write = SWAP_CLUSTER_MAX, 411 .nr_to_write = SWAP_CLUSTER_MAX,
377 .range_start = 0, 412 .range_start = 0,
378 .range_end = LLONG_MAX, 413 .range_end = LLONG_MAX,
379 .nonblocking = 1,
380 .for_reclaim = 1, 414 .for_reclaim = 1,
381 }; 415 };
382 416
@@ -394,7 +428,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
394 * direct reclaiming a large contiguous area and the 428 * direct reclaiming a large contiguous area and the
395 * first attempt to free a range of pages fails. 429 * first attempt to free a range of pages fails.
396 */ 430 */
397 if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) 431 if (PageWriteback(page) &&
432 sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC)
398 wait_on_page_writeback(page); 433 wait_on_page_writeback(page);
399 434
400 if (!PageWriteback(page)) { 435 if (!PageWriteback(page)) {
@@ -402,7 +437,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
402 ClearPageReclaim(page); 437 ClearPageReclaim(page);
403 } 438 }
404 trace_mm_vmscan_writepage(page, 439 trace_mm_vmscan_writepage(page,
405 trace_reclaim_flags(page, sync_writeback)); 440 trace_reclaim_flags(page, sc->lumpy_reclaim_mode));
406 inc_zone_page_state(page, NR_VMSCAN_WRITE); 441 inc_zone_page_state(page, NR_VMSCAN_WRITE);
407 return PAGE_SUCCESS; 442 return PAGE_SUCCESS;
408 } 443 }
@@ -580,7 +615,7 @@ static enum page_references page_check_references(struct page *page,
580 referenced_page = TestClearPageReferenced(page); 615 referenced_page = TestClearPageReferenced(page);
581 616
582 /* Lumpy reclaim - ignore references */ 617 /* Lumpy reclaim - ignore references */
583 if (sc->lumpy_reclaim_mode) 618 if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE)
584 return PAGEREF_RECLAIM; 619 return PAGEREF_RECLAIM;
585 620
586 /* 621 /*
@@ -616,7 +651,7 @@ static enum page_references page_check_references(struct page *page,
616 } 651 }
617 652
618 /* Reclaim if clean, defer dirty pages to writeback */ 653 /* Reclaim if clean, defer dirty pages to writeback */
619 if (referenced_page) 654 if (referenced_page && !PageSwapBacked(page))
620 return PAGEREF_RECLAIM_CLEAN; 655 return PAGEREF_RECLAIM_CLEAN;
621 656
622 return PAGEREF_RECLAIM; 657 return PAGEREF_RECLAIM;
@@ -644,12 +679,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
644 * shrink_page_list() returns the number of reclaimed pages 679 * shrink_page_list() returns the number of reclaimed pages
645 */ 680 */
646static unsigned long shrink_page_list(struct list_head *page_list, 681static unsigned long shrink_page_list(struct list_head *page_list,
647 struct scan_control *sc, 682 struct zone *zone,
648 enum pageout_io sync_writeback) 683 struct scan_control *sc)
649{ 684{
650 LIST_HEAD(ret_pages); 685 LIST_HEAD(ret_pages);
651 LIST_HEAD(free_pages); 686 LIST_HEAD(free_pages);
652 int pgactivate = 0; 687 int pgactivate = 0;
688 unsigned long nr_dirty = 0;
689 unsigned long nr_congested = 0;
653 unsigned long nr_reclaimed = 0; 690 unsigned long nr_reclaimed = 0;
654 691
655 cond_resched(); 692 cond_resched();
@@ -669,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
669 goto keep; 706 goto keep;
670 707
671 VM_BUG_ON(PageActive(page)); 708 VM_BUG_ON(PageActive(page));
709 VM_BUG_ON(page_zone(page) != zone);
672 710
673 sc->nr_scanned++; 711 sc->nr_scanned++;
674 712
@@ -694,10 +732,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
694 * for any page for which writeback has already 732 * for any page for which writeback has already
695 * started. 733 * started.
696 */ 734 */
697 if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) 735 if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC &&
736 may_enter_fs)
698 wait_on_page_writeback(page); 737 wait_on_page_writeback(page);
699 else 738 else {
700 goto keep_locked; 739 unlock_page(page);
740 goto keep_lumpy;
741 }
701 } 742 }
702 743
703 references = page_check_references(page, sc); 744 references = page_check_references(page, sc);
@@ -743,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
743 } 784 }
744 785
745 if (PageDirty(page)) { 786 if (PageDirty(page)) {
787 nr_dirty++;
788
746 if (references == PAGEREF_RECLAIM_CLEAN) 789 if (references == PAGEREF_RECLAIM_CLEAN)
747 goto keep_locked; 790 goto keep_locked;
748 if (!may_enter_fs) 791 if (!may_enter_fs)
@@ -751,14 +794,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
751 goto keep_locked; 794 goto keep_locked;
752 795
753 /* Page is dirty, try to write it out here */ 796 /* Page is dirty, try to write it out here */
754 switch (pageout(page, mapping, sync_writeback)) { 797 switch (pageout(page, mapping, sc)) {
755 case PAGE_KEEP: 798 case PAGE_KEEP:
799 nr_congested++;
756 goto keep_locked; 800 goto keep_locked;
757 case PAGE_ACTIVATE: 801 case PAGE_ACTIVATE:
758 goto activate_locked; 802 goto activate_locked;
759 case PAGE_SUCCESS: 803 case PAGE_SUCCESS:
760 if (PageWriteback(page) || PageDirty(page)) 804 if (PageWriteback(page))
805 goto keep_lumpy;
806 if (PageDirty(page))
761 goto keep; 807 goto keep;
808
762 /* 809 /*
763 * A synchronous write - probably a ramdisk. Go 810 * A synchronous write - probably a ramdisk. Go
764 * ahead and try to reclaim the page. 811 * ahead and try to reclaim the page.
@@ -841,6 +888,7 @@ cull_mlocked:
841 try_to_free_swap(page); 888 try_to_free_swap(page);
842 unlock_page(page); 889 unlock_page(page);
843 putback_lru_page(page); 890 putback_lru_page(page);
891 disable_lumpy_reclaim_mode(sc);
844 continue; 892 continue;
845 893
846activate_locked: 894activate_locked:
@@ -853,10 +901,21 @@ activate_locked:
853keep_locked: 901keep_locked:
854 unlock_page(page); 902 unlock_page(page);
855keep: 903keep:
904 disable_lumpy_reclaim_mode(sc);
905keep_lumpy:
856 list_add(&page->lru, &ret_pages); 906 list_add(&page->lru, &ret_pages);
857 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 907 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
858 } 908 }
859 909
910 /*
911 * Tag a zone as congested if all the dirty pages encountered were
912 * backed by a congested BDI. In this case, reclaimers should just
913 * back off and wait for congestion to clear because further reclaim
914 * will encounter the same problem
915 */
916 if (nr_dirty == nr_congested)
917 zone_set_flag(zone, ZONE_CONGESTED);
918
860 free_page_list(&free_pages); 919 free_page_list(&free_pages);
861 920
862 list_splice(&ret_pages, page_list); 921 list_splice(&ret_pages, page_list);
@@ -1006,7 +1065,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1006 1065
1007 /* Check that we have not crossed a zone boundary. */ 1066 /* Check that we have not crossed a zone boundary. */
1008 if (unlikely(page_zone_id(cursor_page) != zone_id)) 1067 if (unlikely(page_zone_id(cursor_page) != zone_id))
1009 continue; 1068 break;
1010 1069
1011 /* 1070 /*
1012 * If we don't have enough swap space, reclaiming of 1071 * If we don't have enough swap space, reclaiming of
@@ -1014,8 +1073,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1014 * pointless. 1073 * pointless.
1015 */ 1074 */
1016 if (nr_swap_pages <= 0 && PageAnon(cursor_page) && 1075 if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
1017 !PageSwapCache(cursor_page)) 1076 !PageSwapCache(cursor_page))
1018 continue; 1077 break;
1019 1078
1020 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 1079 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1021 list_move(&cursor_page->lru, dst); 1080 list_move(&cursor_page->lru, dst);
@@ -1026,11 +1085,16 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1026 nr_lumpy_dirty++; 1085 nr_lumpy_dirty++;
1027 scan++; 1086 scan++;
1028 } else { 1087 } else {
1029 if (mode == ISOLATE_BOTH && 1088 /* the page is freed already. */
1030 page_count(cursor_page)) 1089 if (!page_count(cursor_page))
1031 nr_lumpy_failed++; 1090 continue;
1091 break;
1032 } 1092 }
1033 } 1093 }
1094
1095 /* If we break out of the loop above, lumpy reclaim failed */
1096 if (pfn < end_pfn)
1097 nr_lumpy_failed++;
1034 } 1098 }
1035 1099
1036 *scanned = scan; 1100 *scanned = scan;
@@ -1253,7 +1317,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1253 return false; 1317 return false;
1254 1318
1255 /* Only stall on lumpy reclaim */ 1319 /* Only stall on lumpy reclaim */
1256 if (!sc->lumpy_reclaim_mode) 1320 if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
1257 return false; 1321 return false;
1258 1322
1259	/* If we have reclaimed everything on the isolated list, no stall */ 1323
@@ -1286,7 +1350,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1286 unsigned long nr_scanned; 1350 unsigned long nr_scanned;
1287 unsigned long nr_reclaimed = 0; 1351 unsigned long nr_reclaimed = 0;
1288 unsigned long nr_taken; 1352 unsigned long nr_taken;
1289 unsigned long nr_active;
1290 unsigned long nr_anon; 1353 unsigned long nr_anon;
1291 unsigned long nr_file; 1354 unsigned long nr_file;
1292 1355
@@ -1298,15 +1361,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1298 return SWAP_CLUSTER_MAX; 1361 return SWAP_CLUSTER_MAX;
1299 } 1362 }
1300 1363
1301 1364 set_lumpy_reclaim_mode(priority, sc, false);
1302 lru_add_drain(); 1365 lru_add_drain();
1303 spin_lock_irq(&zone->lru_lock); 1366 spin_lock_irq(&zone->lru_lock);
1304 1367
1305 if (scanning_global_lru(sc)) { 1368 if (scanning_global_lru(sc)) {
1306 nr_taken = isolate_pages_global(nr_to_scan, 1369 nr_taken = isolate_pages_global(nr_to_scan,
1307 &page_list, &nr_scanned, sc->order, 1370 &page_list, &nr_scanned, sc->order,
1308 sc->lumpy_reclaim_mode ? 1371 sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
1309 ISOLATE_BOTH : ISOLATE_INACTIVE, 1372 ISOLATE_INACTIVE : ISOLATE_BOTH,
1310 zone, 0, file); 1373 zone, 0, file);
1311 zone->pages_scanned += nr_scanned; 1374 zone->pages_scanned += nr_scanned;
1312 if (current_is_kswapd()) 1375 if (current_is_kswapd())
@@ -1318,8 +1381,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1318 } else { 1381 } else {
1319 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1382 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1320 &page_list, &nr_scanned, sc->order, 1383 &page_list, &nr_scanned, sc->order,
1321 sc->lumpy_reclaim_mode ? 1384 sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ?
1322 ISOLATE_BOTH : ISOLATE_INACTIVE, 1385 ISOLATE_INACTIVE : ISOLATE_BOTH,
1323 zone, sc->mem_cgroup, 1386 zone, sc->mem_cgroup,
1324 0, file); 1387 0, file);
1325 /* 1388 /*
@@ -1337,20 +1400,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1337 1400
1338 spin_unlock_irq(&zone->lru_lock); 1401 spin_unlock_irq(&zone->lru_lock);
1339 1402
1340 nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); 1403 nr_reclaimed = shrink_page_list(&page_list, zone, sc);
1341 1404
1342	/* Check if we should synchronously wait for writeback */ 1405
1343 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1406 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1344 congestion_wait(BLK_RW_ASYNC, HZ/10); 1407 set_lumpy_reclaim_mode(priority, sc, true);
1345 1408 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
1346 /*
1347 * The attempt at page out may have made some
1348 * of the pages active, mark them inactive again.
1349 */
1350 nr_active = clear_active_flags(&page_list, NULL);
1351 count_vm_events(PGDEACTIVATE, nr_active);
1352
1353 nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
1354 } 1409 }
1355 1410
1356 local_irq_disable(); 1411 local_irq_disable();
@@ -1359,6 +1414,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1359 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); 1414 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1360 1415
1361 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); 1416 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1417
1418 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1419 zone_idx(zone),
1420 nr_scanned, nr_reclaimed,
1421 priority,
1422 trace_shrink_flags(file, sc->lumpy_reclaim_mode));
1362 return nr_reclaimed; 1423 return nr_reclaimed;
1363} 1424}
1364 1425
@@ -1506,6 +1567,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1506 spin_unlock_irq(&zone->lru_lock); 1567 spin_unlock_irq(&zone->lru_lock);
1507} 1568}
1508 1569
1570#ifdef CONFIG_SWAP
1509static int inactive_anon_is_low_global(struct zone *zone) 1571static int inactive_anon_is_low_global(struct zone *zone)
1510{ 1572{
1511 unsigned long active, inactive; 1573 unsigned long active, inactive;
@@ -1531,12 +1593,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1531{ 1593{
1532 int low; 1594 int low;
1533 1595
1596 /*
1597 * If we don't have swap space, anonymous page deactivation
1598 * is pointless.
1599 */
1600 if (!total_swap_pages)
1601 return 0;
1602
1534 if (scanning_global_lru(sc)) 1603 if (scanning_global_lru(sc))
1535 low = inactive_anon_is_low_global(zone); 1604 low = inactive_anon_is_low_global(zone);
1536 else 1605 else
1537 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); 1606 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1538 return low; 1607 return low;
1539} 1608}
1609#else
1610static inline int inactive_anon_is_low(struct zone *zone,
1611 struct scan_control *sc)
1612{
1613 return 0;
1614}
1615#endif
1540 1616
1541static int inactive_file_is_low_global(struct zone *zone) 1617static int inactive_file_is_low_global(struct zone *zone)
1542{ 1618{
@@ -1721,21 +1797,6 @@ out:
1721 } 1797 }
1722} 1798}
1723 1799
1724static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc)
1725{
1726 /*
1727 * If we need a large contiguous chunk of memory, or have
1728 * trouble getting a small set of contiguous pages, we
1729 * will reclaim both active and inactive pages.
1730 */
1731 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1732 sc->lumpy_reclaim_mode = 1;
1733 else if (sc->order && priority < DEF_PRIORITY - 2)
1734 sc->lumpy_reclaim_mode = 1;
1735 else
1736 sc->lumpy_reclaim_mode = 0;
1737}
1738
1739/* 1800/*
1740 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1801 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1741 */ 1802 */
@@ -1750,8 +1811,6 @@ static void shrink_zone(int priority, struct zone *zone,
1750 1811
1751 get_scan_count(zone, sc, nr, priority); 1812 get_scan_count(zone, sc, nr, priority);
1752 1813
1753 set_lumpy_reclaim_mode(priority, sc);
1754
1755 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1814 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1756 nr[LRU_INACTIVE_FILE]) { 1815 nr[LRU_INACTIVE_FILE]) {
1757 for_each_evictable_lru(l) { 1816 for_each_evictable_lru(l) {
@@ -1782,7 +1841,7 @@ static void shrink_zone(int priority, struct zone *zone,
1782 * Even if we did not try to evict anon pages at all, we want to 1841 * Even if we did not try to evict anon pages at all, we want to
1783 * rebalance the anon lru active/inactive ratio. 1842 * rebalance the anon lru active/inactive ratio.
1784 */ 1843 */
1785 if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) 1844 if (inactive_anon_is_low(zone, sc))
1786 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1845 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1787 1846
1788 throttle_vm_writeout(sc->gfp_mask); 1847 throttle_vm_writeout(sc->gfp_mask);
@@ -1937,21 +1996,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1937 1996
1938 /* Take a nap, wait for some writeback to complete */ 1997 /* Take a nap, wait for some writeback to complete */
1939 if (!sc->hibernation_mode && sc->nr_scanned && 1998 if (!sc->hibernation_mode && sc->nr_scanned &&
1940 priority < DEF_PRIORITY - 2) 1999 priority < DEF_PRIORITY - 2) {
1941 congestion_wait(BLK_RW_ASYNC, HZ/10); 2000 struct zone *preferred_zone;
2001
2002 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2003 NULL, &preferred_zone);
2004 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2005 }
1942 } 2006 }
1943 2007
1944out: 2008out:
1945 /*
1946 * Now that we've scanned all the zones at this priority level, note
1947 * that level within the zone so that the next thread which performs
1948 * scanning of this zone will immediately start out at this priority
1949 * level. This affects only the decision whether or not to bring
1950 * mapped pages onto the inactive list.
1951 */
1952 if (priority < 0)
1953 priority = 0;
1954
1955 delayacct_freepages_end(); 2009 delayacct_freepages_end();
1956 put_mems_allowed(); 2010 put_mems_allowed();
1957 2011
@@ -2247,6 +2301,15 @@ loop_again:
2247 if (!zone_watermark_ok(zone, order, 2301 if (!zone_watermark_ok(zone, order,
2248 min_wmark_pages(zone), end_zone, 0)) 2302 min_wmark_pages(zone), end_zone, 0))
2249 has_under_min_watermark_zone = 1; 2303 has_under_min_watermark_zone = 1;
2304 } else {
2305 /*
2306 * If a zone reaches its high watermark,
2307 * consider it to be no longer congested. It's
2308 * possible there are dirty pages backed by
2309 * congested BDIs but as pressure is relieved,
2310 * speculatively avoid congestion waits
2311 */
2312 zone_clear_flag(zone, ZONE_CONGESTED);
2250 } 2313 }
2251 2314
2252 } 2315 }
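The do_try_to_free_pages() and kswapd hunks above work as a pair: the reclaim nap becomes wait_iff_congested() against the preferred zone rather than an unconditional congestion_wait(), and kswapd clears ZONE_CONGESTED once a zone reaches its high watermark so the wait is skipped when pressure has already eased. Read from the new column, the nap path now looks like this (a sketch assuming the enclosing priority loop is unchanged):

	/* Take a nap, wait for some writeback to complete */
	if (!sc->hibernation_mode && sc->nr_scanned &&
	    priority < DEF_PRIORITY - 2) {
		struct zone *preferred_zone;

		first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
					NULL, &preferred_zone);
		/* Sleeps only while preferred_zone is still marked congested */
		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
	}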
@@ -2987,6 +3050,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
2987 return 0; 3050 return 0;
2988} 3051}
2989 3052
3053#ifdef CONFIG_NUMA
2990/* 3054/*
2991 * per node 'scan_unevictable_pages' attribute. On demand re-scan of 3055 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
2992 * a specified node's per zone unevictable lists for evictable pages. 3056 * a specified node's per zone unevictable lists for evictable pages.
@@ -3033,4 +3097,4 @@ void scan_unevictable_unregister_node(struct node *node)
3033{ 3097{
3034 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); 3098 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
3035} 3099}
3036 3100#endif
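The boolean set_lumpy_reclaim_mode() removed above gives way to a tristate mode: the isolation hunks now test sc->lumpy_reclaim_mode against LUMPY_MODE_NONE, and the stall path calls set_lumpy_reclaim_mode(priority, sc, true) to escalate to a synchronous pass. The new helper's definition sits earlier in this file's diff and is not part of this excerpt, so the sketch below is only a plausible shape for it; the LUMPY_MODE_ASYNC and LUMPY_MODE_SYNC names are assumptions, not taken from the hunks above.

static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc,
				   bool sync)
{
	/* Assumed members; only LUMPY_MODE_NONE appears in this excerpt */
	enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;

	/* Do not escalate to sync if lumpy reclaim is not in use */
	if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE)
		return;

	/* Same policy as the removed helper, storing a mode rather than 0/1 */
	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
		sc->lumpy_reclaim_mode = mode;
	else if (sc->order && priority < DEF_PRIORITY - 2)
		sc->lumpy_reclaim_mode = mode;
	else
		sc->lumpy_reclaim_mode = LUMPY_MODE_NONE;
}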
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 355a9e669aaa..cd2e42be7b68 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -17,6 +17,8 @@
17#include <linux/vmstat.h> 17#include <linux/vmstat.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/math64.h> 19#include <linux/math64.h>
20#include <linux/writeback.h>
21#include <linux/compaction.h>
20 22
21#ifdef CONFIG_VM_EVENT_COUNTERS 23#ifdef CONFIG_VM_EVENT_COUNTERS
22DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 24DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -394,6 +396,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
394#endif 396#endif
395 397
396#ifdef CONFIG_COMPACTION 398#ifdef CONFIG_COMPACTION
399
397struct contig_page_info { 400struct contig_page_info {
398 unsigned long free_pages; 401 unsigned long free_pages;
399 unsigned long free_blocks_total; 402 unsigned long free_blocks_total;
@@ -745,6 +748,11 @@ static const char * const vmstat_text[] = {
745 "nr_isolated_anon", 748 "nr_isolated_anon",
746 "nr_isolated_file", 749 "nr_isolated_file",
747 "nr_shmem", 750 "nr_shmem",
751 "nr_dirtied",
752 "nr_written",
753 "nr_dirty_threshold",
754 "nr_dirty_background_threshold",
755
748#ifdef CONFIG_NUMA 756#ifdef CONFIG_NUMA
749 "numa_hit", 757 "numa_hit",
750 "numa_miss", 758 "numa_miss",
@@ -904,36 +912,44 @@ static const struct file_operations proc_zoneinfo_file_operations = {
904 .release = seq_release, 912 .release = seq_release,
905}; 913};
906 914
915enum writeback_stat_item {
916 NR_DIRTY_THRESHOLD,
917 NR_DIRTY_BG_THRESHOLD,
918 NR_VM_WRITEBACK_STAT_ITEMS,
919};
920
907static void *vmstat_start(struct seq_file *m, loff_t *pos) 921static void *vmstat_start(struct seq_file *m, loff_t *pos)
908{ 922{
909 unsigned long *v; 923 unsigned long *v;
910#ifdef CONFIG_VM_EVENT_COUNTERS 924 int i, stat_items_size;
911 unsigned long *e;
912#endif
913 int i;
914 925
915 if (*pos >= ARRAY_SIZE(vmstat_text)) 926 if (*pos >= ARRAY_SIZE(vmstat_text))
916 return NULL; 927 return NULL;
928 stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
929 NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
917 930
918#ifdef CONFIG_VM_EVENT_COUNTERS 931#ifdef CONFIG_VM_EVENT_COUNTERS
919 v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) 932 stat_items_size += sizeof(struct vm_event_state);
920 + sizeof(struct vm_event_state), GFP_KERNEL);
921#else
922 v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
923 GFP_KERNEL);
924#endif 933#endif
934
935 v = kmalloc(stat_items_size, GFP_KERNEL);
925 m->private = v; 936 m->private = v;
926 if (!v) 937 if (!v)
927 return ERR_PTR(-ENOMEM); 938 return ERR_PTR(-ENOMEM);
928 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 939 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
929 v[i] = global_page_state(i); 940 v[i] = global_page_state(i);
941 v += NR_VM_ZONE_STAT_ITEMS;
942
943 global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
944 v + NR_DIRTY_THRESHOLD);
945 v += NR_VM_WRITEBACK_STAT_ITEMS;
946
930#ifdef CONFIG_VM_EVENT_COUNTERS 947#ifdef CONFIG_VM_EVENT_COUNTERS
931 e = v + NR_VM_ZONE_STAT_ITEMS; 948 all_vm_events(v);
932 all_vm_events(e); 949 v[PGPGIN] /= 2; /* sectors -> kbytes */
933 e[PGPGIN] /= 2; /* sectors -> kbytes */ 950 v[PGPGOUT] /= 2;
934 e[PGPGOUT] /= 2;
935#endif 951#endif
936 return v + *pos; 952 return m->private + *pos;
937} 953}
938 954
939static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) 955static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)