Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |    4
-rw-r--r--  mm/bootmem.c        |  195
-rw-r--r--  mm/fadvise.c        |   10
-rw-r--r--  mm/failslab.c       |   18
-rw-r--r--  mm/filemap.c        |    4
-rw-r--r--  mm/filemap_xip.c    |    2
-rw-r--r--  mm/fremap.c         |    2
-rw-r--r--  mm/highmem.c        |    2
-rw-r--r--  mm/hugetlb.c        |    4
-rw-r--r--  mm/ksm.c            |   12
-rw-r--r--  mm/memcontrol.c     | 1388
-rw-r--r--  mm/memory-failure.c |    5
-rw-r--r--  mm/memory.c         |  180
-rw-r--r--  mm/memory_hotplug.c |   10
-rw-r--r--  mm/mempolicy.c      |  112
-rw-r--r--  mm/migrate.c        |    6
-rw-r--r--  mm/mlock.c          |   12
-rw-r--r--  mm/mmap.c           |  175
-rw-r--r--  mm/mmu_context.c    |    3
-rw-r--r--  mm/mremap.c         |    9
-rw-r--r--  mm/nommu.c          |   30
-rw-r--r--  mm/oom_kill.c       |   14
-rw-r--r--  mm/page_alloc.c     |  401
-rw-r--r--  mm/page_cgroup.c    |   34
-rw-r--r--  mm/percpu.c         |   36
-rw-r--r--  mm/readahead.c      |    6
-rw-r--r--  mm/rmap.c           |  185
-rw-r--r--  mm/slab.c           |   13
-rw-r--r--  mm/slub.c           |  343
-rw-r--r--  mm/sparse-vmemmap.c |   76
-rw-r--r--  mm/sparse.c         |  196
-rw-r--r--  mm/swap.c           |    2
-rw-r--r--  mm/swapfile.c       |   71
-rw-r--r--  mm/vmscan.c         |  177
-rw-r--r--  mm/vmstat.c         |   17
35 files changed, 2777 insertions, 977 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d34c2b971032..9c61158308dc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,6 +115,10 @@ config SPARSEMEM_EXTREME | |||
115 | config SPARSEMEM_VMEMMAP_ENABLE | 115 | config SPARSEMEM_VMEMMAP_ENABLE |
116 | bool | 116 | bool |
117 | 117 | ||
118 | config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
119 | def_bool y | ||
120 | depends on SPARSEMEM && X86_64 | ||
121 | |||
118 | config SPARSEMEM_VMEMMAP | 122 | config SPARSEMEM_VMEMMAP |
119 | bool "Sparse Memory virtual memmap" | 123 | bool "Sparse Memory virtual memmap" |
120 | depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE | 124 | depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE |
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 7d1486875e1c..d7c791ef0036 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/bootmem.h> | 13 | #include <linux/bootmem.h> |
14 | #include <linux/module.h> | 14 | #include <linux/module.h> |
15 | #include <linux/kmemleak.h> | 15 | #include <linux/kmemleak.h> |
16 | #include <linux/range.h> | ||
16 | 17 | ||
17 | #include <asm/bug.h> | 18 | #include <asm/bug.h> |
18 | #include <asm/io.h> | 19 | #include <asm/io.h> |
@@ -32,6 +33,7 @@ unsigned long max_pfn; | |||
32 | unsigned long saved_max_pfn; | 33 | unsigned long saved_max_pfn; |
33 | #endif | 34 | #endif |
34 | 35 | ||
36 | #ifndef CONFIG_NO_BOOTMEM | ||
35 | bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; | 37 | bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; |
36 | 38 | ||
37 | static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); | 39 | static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); |
@@ -142,7 +144,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) | |||
142 | min_low_pfn = start; | 144 | min_low_pfn = start; |
143 | return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); | 145 | return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); |
144 | } | 146 | } |
145 | 147 | #endif | |
146 | /* | 148 | /* |
147 | * free_bootmem_late - free bootmem pages directly to page allocator | 149 | * free_bootmem_late - free bootmem pages directly to page allocator |
148 | * @addr: starting address of the range | 150 | * @addr: starting address of the range |
@@ -167,6 +169,60 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) | |||
167 | } | 169 | } |
168 | } | 170 | } |
169 | 171 | ||
172 | #ifdef CONFIG_NO_BOOTMEM | ||
173 | static void __init __free_pages_memory(unsigned long start, unsigned long end) | ||
174 | { | ||
175 | int i; | ||
176 | unsigned long start_aligned, end_aligned; | ||
177 | int order = ilog2(BITS_PER_LONG); | ||
178 | |||
179 | start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); | ||
180 | end_aligned = end & ~(BITS_PER_LONG - 1); | ||
181 | |||
182 | if (end_aligned <= start_aligned) { | ||
183 | #if 1 | ||
184 | printk(KERN_DEBUG " %lx - %lx\n", start, end); | ||
185 | #endif | ||
186 | for (i = start; i < end; i++) | ||
187 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
188 | |||
189 | return; | ||
190 | } | ||
191 | |||
192 | #if 1 | ||
193 | printk(KERN_DEBUG " %lx %lx - %lx %lx\n", | ||
194 | start, start_aligned, end_aligned, end); | ||
195 | #endif | ||
196 | for (i = start; i < start_aligned; i++) | ||
197 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
198 | |||
199 | for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) | ||
200 | __free_pages_bootmem(pfn_to_page(i), order); | ||
201 | |||
202 | for (i = end_aligned; i < end; i++) | ||
203 | __free_pages_bootmem(pfn_to_page(i), 0); | ||
204 | } | ||
205 | |||
206 | unsigned long __init free_all_memory_core_early(int nodeid) | ||
207 | { | ||
208 | int i; | ||
209 | u64 start, end; | ||
210 | unsigned long count = 0; | ||
211 | struct range *range = NULL; | ||
212 | int nr_range; | ||
213 | |||
214 | nr_range = get_free_all_memory_range(&range, nodeid); | ||
215 | |||
216 | for (i = 0; i < nr_range; i++) { | ||
217 | start = range[i].start; | ||
218 | end = range[i].end; | ||
219 | count += end - start; | ||
220 | __free_pages_memory(start, end); | ||
221 | } | ||
222 | |||
223 | return count; | ||
224 | } | ||
225 | #else | ||
170 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | 226 | static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) |
171 | { | 227 | { |
172 | int aligned; | 228 | int aligned; |
@@ -227,6 +283,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
227 | 283 | ||
228 | return count; | 284 | return count; |
229 | } | 285 | } |
286 | #endif | ||
230 | 287 | ||
231 | /** | 288 | /** |
232 | * free_all_bootmem_node - release a node's free pages to the buddy allocator | 289 | * free_all_bootmem_node - release a node's free pages to the buddy allocator |
@@ -237,7 +294,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) | |||
237 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | 294 | unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) |
238 | { | 295 | { |
239 | register_page_bootmem_info_node(pgdat); | 296 | register_page_bootmem_info_node(pgdat); |
297 | #ifdef CONFIG_NO_BOOTMEM | ||
298 | /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ | ||
299 | return 0; | ||
300 | #else | ||
240 | return free_all_bootmem_core(pgdat->bdata); | 301 | return free_all_bootmem_core(pgdat->bdata); |
302 | #endif | ||
241 | } | 303 | } |
242 | 304 | ||
243 | /** | 305 | /** |
@@ -247,9 +309,14 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) | |||
247 | */ | 309 | */ |
248 | unsigned long __init free_all_bootmem(void) | 310 | unsigned long __init free_all_bootmem(void) |
249 | { | 311 | { |
312 | #ifdef CONFIG_NO_BOOTMEM | ||
313 | return free_all_memory_core_early(NODE_DATA(0)->node_id); | ||
314 | #else | ||
250 | return free_all_bootmem_core(NODE_DATA(0)->bdata); | 315 | return free_all_bootmem_core(NODE_DATA(0)->bdata); |
316 | #endif | ||
251 | } | 317 | } |
252 | 318 | ||
319 | #ifndef CONFIG_NO_BOOTMEM | ||
253 | static void __init __free(bootmem_data_t *bdata, | 320 | static void __init __free(bootmem_data_t *bdata, |
254 | unsigned long sidx, unsigned long eidx) | 321 | unsigned long sidx, unsigned long eidx) |
255 | { | 322 | { |
@@ -344,6 +411,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, | |||
344 | } | 411 | } |
345 | BUG(); | 412 | BUG(); |
346 | } | 413 | } |
414 | #endif | ||
347 | 415 | ||
348 | /** | 416 | /** |
349 | * free_bootmem_node - mark a page range as usable | 417 | * free_bootmem_node - mark a page range as usable |
@@ -358,6 +426,12 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, | |||
358 | void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | 426 | void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, |
359 | unsigned long size) | 427 | unsigned long size) |
360 | { | 428 | { |
429 | #ifdef CONFIG_NO_BOOTMEM | ||
430 | free_early(physaddr, physaddr + size); | ||
431 | #if 0 | ||
432 | printk(KERN_DEBUG "free %lx %lx\n", physaddr, size); | ||
433 | #endif | ||
434 | #else | ||
361 | unsigned long start, end; | 435 | unsigned long start, end; |
362 | 436 | ||
363 | kmemleak_free_part(__va(physaddr), size); | 437 | kmemleak_free_part(__va(physaddr), size); |
@@ -366,6 +440,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
366 | end = PFN_DOWN(physaddr + size); | 440 | end = PFN_DOWN(physaddr + size); |
367 | 441 | ||
368 | mark_bootmem_node(pgdat->bdata, start, end, 0, 0); | 442 | mark_bootmem_node(pgdat->bdata, start, end, 0, 0); |
443 | #endif | ||
369 | } | 444 | } |
370 | 445 | ||
371 | /** | 446 | /** |
@@ -379,6 +454,12 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
379 | */ | 454 | */ |
380 | void __init free_bootmem(unsigned long addr, unsigned long size) | 455 | void __init free_bootmem(unsigned long addr, unsigned long size) |
381 | { | 456 | { |
457 | #ifdef CONFIG_NO_BOOTMEM | ||
458 | free_early(addr, addr + size); | ||
459 | #if 0 | ||
460 | printk(KERN_DEBUG "free %lx %lx\n", addr, size); | ||
461 | #endif | ||
462 | #else | ||
382 | unsigned long start, end; | 463 | unsigned long start, end; |
383 | 464 | ||
384 | kmemleak_free_part(__va(addr), size); | 465 | kmemleak_free_part(__va(addr), size); |
@@ -387,6 +468,7 @@ void __init free_bootmem(unsigned long addr, unsigned long size) | |||
387 | end = PFN_DOWN(addr + size); | 468 | end = PFN_DOWN(addr + size); |
388 | 469 | ||
389 | mark_bootmem(start, end, 0, 0); | 470 | mark_bootmem(start, end, 0, 0); |
471 | #endif | ||
390 | } | 472 | } |
391 | 473 | ||
392 | /** | 474 | /** |
@@ -403,12 +485,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size) | |||
403 | int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | 485 | int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, |
404 | unsigned long size, int flags) | 486 | unsigned long size, int flags) |
405 | { | 487 | { |
488 | #ifdef CONFIG_NO_BOOTMEM | ||
489 | panic("no bootmem"); | ||
490 | return 0; | ||
491 | #else | ||
406 | unsigned long start, end; | 492 | unsigned long start, end; |
407 | 493 | ||
408 | start = PFN_DOWN(physaddr); | 494 | start = PFN_DOWN(physaddr); |
409 | end = PFN_UP(physaddr + size); | 495 | end = PFN_UP(physaddr + size); |
410 | 496 | ||
411 | return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); | 497 | return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); |
498 | #endif | ||
412 | } | 499 | } |
413 | 500 | ||
414 | /** | 501 | /** |
@@ -424,14 +511,20 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, | |||
424 | int __init reserve_bootmem(unsigned long addr, unsigned long size, | 511 | int __init reserve_bootmem(unsigned long addr, unsigned long size, |
425 | int flags) | 512 | int flags) |
426 | { | 513 | { |
514 | #ifdef CONFIG_NO_BOOTMEM | ||
515 | panic("no bootmem"); | ||
516 | return 0; | ||
517 | #else | ||
427 | unsigned long start, end; | 518 | unsigned long start, end; |
428 | 519 | ||
429 | start = PFN_DOWN(addr); | 520 | start = PFN_DOWN(addr); |
430 | end = PFN_UP(addr + size); | 521 | end = PFN_UP(addr + size); |
431 | 522 | ||
432 | return mark_bootmem(start, end, 1, flags); | 523 | return mark_bootmem(start, end, 1, flags); |
524 | #endif | ||
433 | } | 525 | } |
434 | 526 | ||
527 | #ifndef CONFIG_NO_BOOTMEM | ||
435 | static unsigned long __init align_idx(struct bootmem_data *bdata, | 528 | static unsigned long __init align_idx(struct bootmem_data *bdata, |
436 | unsigned long idx, unsigned long step) | 529 | unsigned long idx, unsigned long step) |
437 | { | 530 | { |
@@ -582,12 +675,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, | |||
582 | #endif | 675 | #endif |
583 | return NULL; | 676 | return NULL; |
584 | } | 677 | } |
678 | #endif | ||
585 | 679 | ||
586 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, | 680 | static void * __init ___alloc_bootmem_nopanic(unsigned long size, |
587 | unsigned long align, | 681 | unsigned long align, |
588 | unsigned long goal, | 682 | unsigned long goal, |
589 | unsigned long limit) | 683 | unsigned long limit) |
590 | { | 684 | { |
685 | #ifdef CONFIG_NO_BOOTMEM | ||
686 | void *ptr; | ||
687 | |||
688 | if (WARN_ON_ONCE(slab_is_available())) | ||
689 | return kzalloc(size, GFP_NOWAIT); | ||
690 | |||
691 | restart: | ||
692 | |||
693 | ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); | ||
694 | |||
695 | if (ptr) | ||
696 | return ptr; | ||
697 | |||
698 | if (goal != 0) { | ||
699 | goal = 0; | ||
700 | goto restart; | ||
701 | } | ||
702 | |||
703 | return NULL; | ||
704 | #else | ||
591 | bootmem_data_t *bdata; | 705 | bootmem_data_t *bdata; |
592 | void *region; | 706 | void *region; |
593 | 707 | ||
@@ -613,6 +727,7 @@ restart: | |||
613 | } | 727 | } |
614 | 728 | ||
615 | return NULL; | 729 | return NULL; |
730 | #endif | ||
616 | } | 731 | } |
617 | 732 | ||
618 | /** | 733 | /** |
@@ -631,7 +746,13 @@ restart: | |||
631 | void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, | 746 | void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, |
632 | unsigned long goal) | 747 | unsigned long goal) |
633 | { | 748 | { |
634 | return ___alloc_bootmem_nopanic(size, align, goal, 0); | 749 | unsigned long limit = 0; |
750 | |||
751 | #ifdef CONFIG_NO_BOOTMEM | ||
752 | limit = -1UL; | ||
753 | #endif | ||
754 | |||
755 | return ___alloc_bootmem_nopanic(size, align, goal, limit); | ||
635 | } | 756 | } |
636 | 757 | ||
637 | static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, | 758 | static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, |
@@ -665,9 +786,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, | |||
665 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, | 786 | void * __init __alloc_bootmem(unsigned long size, unsigned long align, |
666 | unsigned long goal) | 787 | unsigned long goal) |
667 | { | 788 | { |
668 | return ___alloc_bootmem(size, align, goal, 0); | 789 | unsigned long limit = 0; |
790 | |||
791 | #ifdef CONFIG_NO_BOOTMEM | ||
792 | limit = -1UL; | ||
793 | #endif | ||
794 | |||
795 | return ___alloc_bootmem(size, align, goal, limit); | ||
669 | } | 796 | } |
670 | 797 | ||
798 | #ifndef CONFIG_NO_BOOTMEM | ||
671 | static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, | 799 | static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, |
672 | unsigned long size, unsigned long align, | 800 | unsigned long size, unsigned long align, |
673 | unsigned long goal, unsigned long limit) | 801 | unsigned long goal, unsigned long limit) |
@@ -684,6 +812,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, | |||
684 | 812 | ||
685 | return ___alloc_bootmem(size, align, goal, limit); | 813 | return ___alloc_bootmem(size, align, goal, limit); |
686 | } | 814 | } |
815 | #endif | ||
687 | 816 | ||
688 | /** | 817 | /** |
689 | * __alloc_bootmem_node - allocate boot memory from a specific node | 818 | * __alloc_bootmem_node - allocate boot memory from a specific node |
@@ -706,7 +835,46 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | |||
706 | if (WARN_ON_ONCE(slab_is_available())) | 835 | if (WARN_ON_ONCE(slab_is_available())) |
707 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 836 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
708 | 837 | ||
838 | #ifdef CONFIG_NO_BOOTMEM | ||
839 | return __alloc_memory_core_early(pgdat->node_id, size, align, | ||
840 | goal, -1ULL); | ||
841 | #else | ||
709 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); | 842 | return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); |
843 | #endif | ||
844 | } | ||
845 | |||
846 | void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, | ||
847 | unsigned long align, unsigned long goal) | ||
848 | { | ||
849 | #ifdef MAX_DMA32_PFN | ||
850 | unsigned long end_pfn; | ||
851 | |||
852 | if (WARN_ON_ONCE(slab_is_available())) | ||
853 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | ||
854 | |||
855 | /* update goal according ...MAX_DMA32_PFN */ | ||
856 | end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; | ||
857 | |||
858 | if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && | ||
859 | (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { | ||
860 | void *ptr; | ||
861 | unsigned long new_goal; | ||
862 | |||
863 | new_goal = MAX_DMA32_PFN << PAGE_SHIFT; | ||
864 | #ifdef CONFIG_NO_BOOTMEM | ||
865 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
866 | new_goal, -1ULL); | ||
867 | #else | ||
868 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, | ||
869 | new_goal, 0); | ||
870 | #endif | ||
871 | if (ptr) | ||
872 | return ptr; | ||
873 | } | ||
874 | #endif | ||
875 | |||
876 | return __alloc_bootmem_node(pgdat, size, align, goal); | ||
877 | |||
710 | } | 878 | } |
711 | 879 | ||
712 | #ifdef CONFIG_SPARSEMEM | 880 | #ifdef CONFIG_SPARSEMEM |
@@ -720,6 +888,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, | |||
720 | void * __init alloc_bootmem_section(unsigned long size, | 888 | void * __init alloc_bootmem_section(unsigned long size, |
721 | unsigned long section_nr) | 889 | unsigned long section_nr) |
722 | { | 890 | { |
891 | #ifdef CONFIG_NO_BOOTMEM | ||
892 | unsigned long pfn, goal, limit; | ||
893 | |||
894 | pfn = section_nr_to_pfn(section_nr); | ||
895 | goal = pfn << PAGE_SHIFT; | ||
896 | limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; | ||
897 | |||
898 | return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, | ||
899 | SMP_CACHE_BYTES, goal, limit); | ||
900 | #else | ||
723 | bootmem_data_t *bdata; | 901 | bootmem_data_t *bdata; |
724 | unsigned long pfn, goal, limit; | 902 | unsigned long pfn, goal, limit; |
725 | 903 | ||
@@ -729,6 +907,7 @@ void * __init alloc_bootmem_section(unsigned long size, | |||
729 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; | 907 | bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; |
730 | 908 | ||
731 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); | 909 | return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); |
910 | #endif | ||
732 | } | 911 | } |
733 | #endif | 912 | #endif |
734 | 913 | ||
@@ -740,11 +919,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, | |||
740 | if (WARN_ON_ONCE(slab_is_available())) | 919 | if (WARN_ON_ONCE(slab_is_available())) |
741 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 920 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
742 | 921 | ||
922 | #ifdef CONFIG_NO_BOOTMEM | ||
923 | ptr = __alloc_memory_core_early(pgdat->node_id, size, align, | ||
924 | goal, -1ULL); | ||
925 | #else | ||
743 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); | 926 | ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); |
744 | if (ptr) | 927 | if (ptr) |
745 | return ptr; | 928 | return ptr; |
746 | 929 | ||
747 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); | 930 | ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); |
931 | #endif | ||
748 | if (ptr) | 932 | if (ptr) |
749 | return ptr; | 933 | return ptr; |
750 | 934 | ||
@@ -795,6 +979,11 @@ void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, | |||
795 | if (WARN_ON_ONCE(slab_is_available())) | 979 | if (WARN_ON_ONCE(slab_is_available())) |
796 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); | 980 | return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); |
797 | 981 | ||
982 | #ifdef CONFIG_NO_BOOTMEM | ||
983 | return __alloc_memory_core_early(pgdat->node_id, size, align, | ||
984 | goal, ARCH_LOW_ADDRESS_LIMIT); | ||
985 | #else | ||
798 | return ___alloc_bootmem_node(pgdat->bdata, size, align, | 986 | return ___alloc_bootmem_node(pgdat->bdata, size, align, |
799 | goal, ARCH_LOW_ADDRESS_LIMIT); | 987 | goal, ARCH_LOW_ADDRESS_LIMIT); |
988 | #endif | ||
800 | } | 989 | } |
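The __free_pages_memory() helper added above splits every free range into an unaligned head, a BITS_PER_LONG-aligned middle released in order-ilog2(BITS_PER_LONG) blocks, and an unaligned tail. The stand-alone sketch below only illustrates that rounding arithmetic; the pfn range and the 64-bit BITS_PER_LONG value are assumptions chosen for the example, not something taken from the patch.

#include <stdio.h>

#define BITS_PER_LONG 64UL		/* assumed 64-bit build */

int main(void)
{
	/* hypothetical pfn range, picked only to show the three-way split */
	unsigned long start = 100, end = 300;
	unsigned long start_aligned, end_aligned;

	/* same rounding as __free_pages_memory(): head rounded up, tail down */
	start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
	end_aligned = end & ~(BITS_PER_LONG - 1);

	printf("head  : pfns %lu-%lu freed as single (order-0) pages\n",
	       start, start_aligned - 1);
	printf("middle: pfns %lu-%lu freed as order-6 (64-page) blocks\n",
	       start_aligned, end_aligned - 1);
	printf("tail  : pfns %lu-%lu freed as single (order-0) pages\n",
	       end_aligned, end - 1);
	return 0;
}

For the range 100-300 this prints a head of 100-127, a middle of 128-255 (two 64-page blocks), and a tail of 256-299, matching the three loops in the hunk above.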
diff --git a/mm/fadvise.c b/mm/fadvise.c
index e43359214f6f..8d723c9e8b75 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -77,12 +77,20 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) | |||
77 | switch (advice) { | 77 | switch (advice) { |
78 | case POSIX_FADV_NORMAL: | 78 | case POSIX_FADV_NORMAL: |
79 | file->f_ra.ra_pages = bdi->ra_pages; | 79 | file->f_ra.ra_pages = bdi->ra_pages; |
80 | spin_lock(&file->f_lock); | ||
81 | file->f_mode &= ~FMODE_RANDOM; | ||
82 | spin_unlock(&file->f_lock); | ||
80 | break; | 83 | break; |
81 | case POSIX_FADV_RANDOM: | 84 | case POSIX_FADV_RANDOM: |
82 | file->f_ra.ra_pages = 0; | 85 | spin_lock(&file->f_lock); |
86 | file->f_mode |= FMODE_RANDOM; | ||
87 | spin_unlock(&file->f_lock); | ||
83 | break; | 88 | break; |
84 | case POSIX_FADV_SEQUENTIAL: | 89 | case POSIX_FADV_SEQUENTIAL: |
85 | file->f_ra.ra_pages = bdi->ra_pages * 2; | 90 | file->f_ra.ra_pages = bdi->ra_pages * 2; |
91 | spin_lock(&file->f_lock); | ||
92 | file->f_mode &= ~FMODE_RANDOM; | ||
93 | spin_unlock(&file->f_lock); | ||
86 | break; | 94 | break; |
87 | case POSIX_FADV_WILLNEED: | 95 | case POSIX_FADV_WILLNEED: |
88 | if (!mapping->a_ops->readpage) { | 96 | if (!mapping->a_ops->readpage) { |
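The behaviour change above is reachable from ordinary userspace: POSIX_FADV_RANDOM now sets FMODE_RANDOM on the struct file (under f_lock) instead of zeroing f_ra.ra_pages, and POSIX_FADV_NORMAL/SEQUENTIAL clear the flag again. A minimal caller is sketched below; the file path is a placeholder, not part of the patch.

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	int err;
	int fd = open("/var/tmp/data.bin", O_RDONLY);	/* hypothetical file */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* With the hunk above, this marks the open file FMODE_RANDOM rather
	 * than forcing its readahead window to zero pages. */
	err = posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM);
	if (err)	/* posix_fadvise() returns the error number directly */
		fprintf(stderr, "posix_fadvise: %s\n", strerror(err));
	return 0;
}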
diff --git a/mm/failslab.c b/mm/failslab.c
index 9339de5f0a91..bb41f98dd8b7 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,18 +1,22 @@ | |||
1 | #include <linux/fault-inject.h> | 1 | #include <linux/fault-inject.h> |
2 | #include <linux/gfp.h> | 2 | #include <linux/gfp.h> |
3 | #include <linux/slab.h> | ||
3 | 4 | ||
4 | static struct { | 5 | static struct { |
5 | struct fault_attr attr; | 6 | struct fault_attr attr; |
6 | u32 ignore_gfp_wait; | 7 | u32 ignore_gfp_wait; |
8 | int cache_filter; | ||
7 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 9 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
8 | struct dentry *ignore_gfp_wait_file; | 10 | struct dentry *ignore_gfp_wait_file; |
11 | struct dentry *cache_filter_file; | ||
9 | #endif | 12 | #endif |
10 | } failslab = { | 13 | } failslab = { |
11 | .attr = FAULT_ATTR_INITIALIZER, | 14 | .attr = FAULT_ATTR_INITIALIZER, |
12 | .ignore_gfp_wait = 1, | 15 | .ignore_gfp_wait = 1, |
16 | .cache_filter = 0, | ||
13 | }; | 17 | }; |
14 | 18 | ||
15 | bool should_failslab(size_t size, gfp_t gfpflags) | 19 | bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) |
16 | { | 20 | { |
17 | if (gfpflags & __GFP_NOFAIL) | 21 | if (gfpflags & __GFP_NOFAIL) |
18 | return false; | 22 | return false; |
@@ -20,6 +24,9 @@ bool should_failslab(size_t size, gfp_t gfpflags) | |||
20 | if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) | 24 | if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) |
21 | return false; | 25 | return false; |
22 | 26 | ||
27 | if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) | ||
28 | return false; | ||
29 | |||
23 | return should_fail(&failslab.attr, size); | 30 | return should_fail(&failslab.attr, size); |
24 | } | 31 | } |
25 | 32 | ||
@@ -30,7 +37,6 @@ static int __init setup_failslab(char *str) | |||
30 | __setup("failslab=", setup_failslab); | 37 | __setup("failslab=", setup_failslab); |
31 | 38 | ||
32 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS | 39 | #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS |
33 | |||
34 | static int __init failslab_debugfs_init(void) | 40 | static int __init failslab_debugfs_init(void) |
35 | { | 41 | { |
36 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; | 42 | mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; |
@@ -46,8 +52,14 @@ static int __init failslab_debugfs_init(void) | |||
46 | debugfs_create_bool("ignore-gfp-wait", mode, dir, | 52 | debugfs_create_bool("ignore-gfp-wait", mode, dir, |
47 | &failslab.ignore_gfp_wait); | 53 | &failslab.ignore_gfp_wait); |
48 | 54 | ||
49 | if (!failslab.ignore_gfp_wait_file) { | 55 | failslab.cache_filter_file = |
56 | debugfs_create_bool("cache-filter", mode, dir, | ||
57 | &failslab.cache_filter); | ||
58 | |||
59 | if (!failslab.ignore_gfp_wait_file || | ||
60 | !failslab.cache_filter_file) { | ||
50 | err = -ENOMEM; | 61 | err = -ENOMEM; |
62 | debugfs_remove(failslab.cache_filter_file); | ||
51 | debugfs_remove(failslab.ignore_gfp_wait_file); | 63 | debugfs_remove(failslab.ignore_gfp_wait_file); |
52 | cleanup_fault_attr_dentries(&failslab.attr); | 64 | cleanup_fault_attr_dentries(&failslab.attr); |
53 | } | 65 | } |
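The new cache-filter switch only has an effect together with a slab cache that opts in via SLAB_FAILSLAB, the flag tested in should_failslab() above. Below is a hedged sketch of such an opt-in written as a trivial module; the cache name, object size, and the /sys/kernel/debug/failslab/cache-filter path are illustrative assumptions, not details shown in this diff.

#include <linux/module.h>
#include <linux/slab.h>

static struct kmem_cache *demo_cache;	/* hypothetical cache */

static int __init failslab_demo_init(void)
{
	/*
	 * SLAB_FAILSLAB opts this cache in to slab fault injection.  When
	 * the cache-filter bool added above is enabled, should_failslab()
	 * skips every cache that does not carry this flag.
	 */
	demo_cache = kmem_cache_create("failslab_demo", 256, 0,
				       SLAB_FAILSLAB, NULL);
	return demo_cache ? 0 : -ENOMEM;
}

static void __exit failslab_demo_exit(void)
{
	kmem_cache_destroy(demo_cache);
}

module_init(failslab_demo_init);
module_exit(failslab_demo_exit);
MODULE_LICENSE("GPL");

With failslab enabled on the kernel command line and the assumed debugfs cache-filter bool set to 1, injected allocation failures are then restricted to caches flagged this way.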
diff --git a/mm/filemap.c b/mm/filemap.c
index 698ea80f2102..045b31c37653 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1117,7 +1117,7 @@ readpage: | |||
1117 | if (!PageUptodate(page)) { | 1117 | if (!PageUptodate(page)) { |
1118 | if (page->mapping == NULL) { | 1118 | if (page->mapping == NULL) { |
1119 | /* | 1119 | /* |
1120 | * invalidate_inode_pages got it | 1120 | * invalidate_mapping_pages got it |
1121 | */ | 1121 | */ |
1122 | unlock_page(page); | 1122 | unlock_page(page); |
1123 | page_cache_release(page); | 1123 | page_cache_release(page); |
@@ -1986,7 +1986,7 @@ EXPORT_SYMBOL(iov_iter_single_seg_count); | |||
1986 | inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) | 1986 | inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) |
1987 | { | 1987 | { |
1988 | struct inode *inode = file->f_mapping->host; | 1988 | struct inode *inode = file->f_mapping->host; |
1989 | unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; | 1989 | unsigned long limit = rlimit(RLIMIT_FSIZE); |
1990 | 1990 | ||
1991 | if (unlikely(*pos < 0)) | 1991 | if (unlikely(*pos < 0)) |
1992 | return -EINVAL; | 1992 | return -EINVAL; |
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 1888b2d71bb8..78b94f0b6d5d 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -194,7 +194,7 @@ retry: | |||
194 | flush_cache_page(vma, address, pte_pfn(*pte)); | 194 | flush_cache_page(vma, address, pte_pfn(*pte)); |
195 | pteval = ptep_clear_flush_notify(vma, address, pte); | 195 | pteval = ptep_clear_flush_notify(vma, address, pte); |
196 | page_remove_rmap(page); | 196 | page_remove_rmap(page); |
197 | dec_mm_counter(mm, file_rss); | 197 | dec_mm_counter(mm, MM_FILEPAGES); |
198 | BUG_ON(pte_dirty(pteval)); | 198 | BUG_ON(pte_dirty(pteval)); |
199 | pte_unmap_unlock(pte, ptl); | 199 | pte_unmap_unlock(pte, ptl); |
200 | page_cache_release(page); | 200 | page_cache_release(page); |
diff --git a/mm/fremap.c b/mm/fremap.c
index b6ec85abbb39..46f5dacf90a2 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
40 | page_remove_rmap(page); | 40 | page_remove_rmap(page); |
41 | page_cache_release(page); | 41 | page_cache_release(page); |
42 | update_hiwater_rss(mm); | 42 | update_hiwater_rss(mm); |
43 | dec_mm_counter(mm, file_rss); | 43 | dec_mm_counter(mm, MM_FILEPAGES); |
44 | } | 44 | } |
45 | } else { | 45 | } else { |
46 | if (!pte_file(pte)) | 46 | if (!pte_file(pte)) |
diff --git a/mm/highmem.c b/mm/highmem.c
index 9c1e627f282e..bed8a8bfd01f 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -220,7 +220,7 @@ EXPORT_SYMBOL(kmap_high); | |||
220 | * @page: &struct page to pin | 220 | * @page: &struct page to pin |
221 | * | 221 | * |
222 | * Returns the page's current virtual memory address, or NULL if no mapping | 222 | * Returns the page's current virtual memory address, or NULL if no mapping |
223 | * exists. When and only when a non null address is returned then a | 223 | * exists. If and only if a non null address is returned then a |
224 | * matching call to kunmap_high() is necessary. | 224 | * matching call to kunmap_high() is necessary. |
225 | * | 225 | * |
226 | * This can be called from any context. | 226 | * This can be called from any context. |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2d16fa6b8c2d..3a5aeb37c110 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2087,7 +2087,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, | |||
2087 | 2087 | ||
2088 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); | 2088 | entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); |
2089 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { | 2089 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { |
2090 | update_mmu_cache(vma, address, entry); | 2090 | update_mmu_cache(vma, address, ptep); |
2091 | } | 2091 | } |
2092 | } | 2092 | } |
2093 | 2093 | ||
@@ -2558,7 +2558,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2558 | entry = pte_mkyoung(entry); | 2558 | entry = pte_mkyoung(entry); |
2559 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, | 2559 | if (huge_ptep_set_access_flags(vma, address, ptep, entry, |
2560 | flags & FAULT_FLAG_WRITE)) | 2560 | flags & FAULT_FLAG_WRITE)) |
2561 | update_mmu_cache(vma, address, entry); | 2561 | update_mmu_cache(vma, address, ptep); |
2562 | 2562 | ||
2563 | out_page_table_lock: | 2563 | out_page_table_lock: |
2564 | spin_unlock(&mm->page_table_lock); | 2564 | spin_unlock(&mm->page_table_lock); |
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1563,10 +1563,12 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, | |||
1563 | again: | 1563 | again: |
1564 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | 1564 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { |
1565 | struct anon_vma *anon_vma = rmap_item->anon_vma; | 1565 | struct anon_vma *anon_vma = rmap_item->anon_vma; |
1566 | struct anon_vma_chain *vmac; | ||
1566 | struct vm_area_struct *vma; | 1567 | struct vm_area_struct *vma; |
1567 | 1568 | ||
1568 | spin_lock(&anon_vma->lock); | 1569 | spin_lock(&anon_vma->lock); |
1569 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1570 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { |
1571 | vma = vmac->vma; | ||
1570 | if (rmap_item->address < vma->vm_start || | 1572 | if (rmap_item->address < vma->vm_start || |
1571 | rmap_item->address >= vma->vm_end) | 1573 | rmap_item->address >= vma->vm_end) |
1572 | continue; | 1574 | continue; |
@@ -1614,10 +1616,12 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) | |||
1614 | again: | 1616 | again: |
1615 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | 1617 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { |
1616 | struct anon_vma *anon_vma = rmap_item->anon_vma; | 1618 | struct anon_vma *anon_vma = rmap_item->anon_vma; |
1619 | struct anon_vma_chain *vmac; | ||
1617 | struct vm_area_struct *vma; | 1620 | struct vm_area_struct *vma; |
1618 | 1621 | ||
1619 | spin_lock(&anon_vma->lock); | 1622 | spin_lock(&anon_vma->lock); |
1620 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1623 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { |
1624 | vma = vmac->vma; | ||
1621 | if (rmap_item->address < vma->vm_start || | 1625 | if (rmap_item->address < vma->vm_start || |
1622 | rmap_item->address >= vma->vm_end) | 1626 | rmap_item->address >= vma->vm_end) |
1623 | continue; | 1627 | continue; |
@@ -1664,10 +1668,12 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, | |||
1664 | again: | 1668 | again: |
1665 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { | 1669 | hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { |
1666 | struct anon_vma *anon_vma = rmap_item->anon_vma; | 1670 | struct anon_vma *anon_vma = rmap_item->anon_vma; |
1671 | struct anon_vma_chain *vmac; | ||
1667 | struct vm_area_struct *vma; | 1672 | struct vm_area_struct *vma; |
1668 | 1673 | ||
1669 | spin_lock(&anon_vma->lock); | 1674 | spin_lock(&anon_vma->lock); |
1670 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1675 | list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { |
1676 | vma = vmac->vma; | ||
1671 | if (rmap_item->address < vma->vm_start || | 1677 | if (rmap_item->address < vma->vm_start || |
1672 | rmap_item->address >= vma->vm_end) | 1678 | rmap_item->address >= vma->vm_end) |
1673 | continue; | 1679 | continue; |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 954032b80bed..7973b5221fb8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6,6 +6,10 @@ | |||
6 | * Copyright 2007 OpenVZ SWsoft Inc | 6 | * Copyright 2007 OpenVZ SWsoft Inc |
7 | * Author: Pavel Emelianov <xemul@openvz.org> | 7 | * Author: Pavel Emelianov <xemul@openvz.org> |
8 | * | 8 | * |
9 | * Memory thresholds | ||
10 | * Copyright (C) 2009 Nokia Corporation | ||
11 | * Author: Kirill A. Shutemov | ||
12 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | 13 | * This program is free software; you can redistribute it and/or modify |
10 | * it under the terms of the GNU General Public License as published by | 14 | * it under the terms of the GNU General Public License as published by |
11 | * the Free Software Foundation; either version 2 of the License, or | 15 | * the Free Software Foundation; either version 2 of the License, or |
@@ -21,6 +25,7 @@ | |||
21 | #include <linux/memcontrol.h> | 25 | #include <linux/memcontrol.h> |
22 | #include <linux/cgroup.h> | 26 | #include <linux/cgroup.h> |
23 | #include <linux/mm.h> | 27 | #include <linux/mm.h> |
28 | #include <linux/hugetlb.h> | ||
24 | #include <linux/pagemap.h> | 29 | #include <linux/pagemap.h> |
25 | #include <linux/smp.h> | 30 | #include <linux/smp.h> |
26 | #include <linux/page-flags.h> | 31 | #include <linux/page-flags.h> |
@@ -32,7 +37,10 @@ | |||
32 | #include <linux/rbtree.h> | 37 | #include <linux/rbtree.h> |
33 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
34 | #include <linux/swap.h> | 39 | #include <linux/swap.h> |
40 | #include <linux/swapops.h> | ||
35 | #include <linux/spinlock.h> | 41 | #include <linux/spinlock.h> |
42 | #include <linux/eventfd.h> | ||
43 | #include <linux/sort.h> | ||
36 | #include <linux/fs.h> | 44 | #include <linux/fs.h> |
37 | #include <linux/seq_file.h> | 45 | #include <linux/seq_file.h> |
38 | #include <linux/vmalloc.h> | 46 | #include <linux/vmalloc.h> |
@@ -55,7 +63,15 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | |||
55 | #define do_swap_account (0) | 63 | #define do_swap_account (0) |
56 | #endif | 64 | #endif |
57 | 65 | ||
58 | #define SOFTLIMIT_EVENTS_THRESH (1000) | 66 | /* |
67 | * Per memcg event counter is incremented at every pagein/pageout. This counter | ||
68 | * is used for trigger some periodic events. This is straightforward and better | ||
69 | * than using jiffies etc. to handle periodic memcg event. | ||
70 | * | ||
71 | * These values will be used as !((event) & ((1 <<(thresh)) - 1)) | ||
72 | */ | ||
73 | #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ | ||
74 | #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ | ||
59 | 75 | ||
60 | /* | 76 | /* |
61 | * Statistics for memory cgroup. | 77 | * Statistics for memory cgroup. |
@@ -69,62 +85,16 @@ enum mem_cgroup_stat_index { | |||
69 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | 85 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 86 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 87 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ | ||
73 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 88 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ |
89 | MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ | ||
74 | 90 | ||
75 | MEM_CGROUP_STAT_NSTATS, | 91 | MEM_CGROUP_STAT_NSTATS, |
76 | }; | 92 | }; |
77 | 93 | ||
78 | struct mem_cgroup_stat_cpu { | 94 | struct mem_cgroup_stat_cpu { |
79 | s64 count[MEM_CGROUP_STAT_NSTATS]; | 95 | s64 count[MEM_CGROUP_STAT_NSTATS]; |
80 | } ____cacheline_aligned_in_smp; | ||
81 | |||
82 | struct mem_cgroup_stat { | ||
83 | struct mem_cgroup_stat_cpu cpustat[0]; | ||
84 | }; | 96 | }; |
85 | 97 | ||
86 | static inline void | ||
87 | __mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, | ||
88 | enum mem_cgroup_stat_index idx) | ||
89 | { | ||
90 | stat->count[idx] = 0; | ||
91 | } | ||
92 | |||
93 | static inline s64 | ||
94 | __mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, | ||
95 | enum mem_cgroup_stat_index idx) | ||
96 | { | ||
97 | return stat->count[idx]; | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * For accounting under irq disable, no need for increment preempt count. | ||
102 | */ | ||
103 | static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, | ||
104 | enum mem_cgroup_stat_index idx, int val) | ||
105 | { | ||
106 | stat->count[idx] += val; | ||
107 | } | ||
108 | |||
109 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, | ||
110 | enum mem_cgroup_stat_index idx) | ||
111 | { | ||
112 | int cpu; | ||
113 | s64 ret = 0; | ||
114 | for_each_possible_cpu(cpu) | ||
115 | ret += stat->cpustat[cpu].count[idx]; | ||
116 | return ret; | ||
117 | } | ||
118 | |||
119 | static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) | ||
120 | { | ||
121 | s64 ret; | ||
122 | |||
123 | ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); | ||
124 | ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); | ||
125 | return ret; | ||
126 | } | ||
127 | |||
128 | /* | 98 | /* |
129 | * per-zone information in memory controller. | 99 | * per-zone information in memory controller. |
130 | */ | 100 | */ |
@@ -174,6 +144,22 @@ struct mem_cgroup_tree { | |||
174 | 144 | ||
175 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | 145 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; |
176 | 146 | ||
147 | struct mem_cgroup_threshold { | ||
148 | struct eventfd_ctx *eventfd; | ||
149 | u64 threshold; | ||
150 | }; | ||
151 | |||
152 | struct mem_cgroup_threshold_ary { | ||
153 | /* An array index points to threshold just below usage. */ | ||
154 | atomic_t current_threshold; | ||
155 | /* Size of entries[] */ | ||
156 | unsigned int size; | ||
157 | /* Array of thresholds */ | ||
158 | struct mem_cgroup_threshold entries[0]; | ||
159 | }; | ||
160 | |||
161 | static void mem_cgroup_threshold(struct mem_cgroup *mem); | ||
162 | |||
177 | /* | 163 | /* |
178 | * The memory controller data structure. The memory controller controls both | 164 | * The memory controller data structure. The memory controller controls both |
179 | * page cache and RSS per cgroup. We would eventually like to provide | 165 | * page cache and RSS per cgroup. We would eventually like to provide |
@@ -217,7 +203,7 @@ struct mem_cgroup { | |||
217 | * Should the accounting and control be hierarchical, per subtree? | 203 | * Should the accounting and control be hierarchical, per subtree? |
218 | */ | 204 | */ |
219 | bool use_hierarchy; | 205 | bool use_hierarchy; |
220 | unsigned long last_oom_jiffies; | 206 | atomic_t oom_lock; |
221 | atomic_t refcnt; | 207 | atomic_t refcnt; |
222 | 208 | ||
223 | unsigned int swappiness; | 209 | unsigned int swappiness; |
@@ -225,10 +211,48 @@ struct mem_cgroup { | |||
225 | /* set when res.limit == memsw.limit */ | 211 | /* set when res.limit == memsw.limit */ |
226 | bool memsw_is_minimum; | 212 | bool memsw_is_minimum; |
227 | 213 | ||
214 | /* protect arrays of thresholds */ | ||
215 | struct mutex thresholds_lock; | ||
216 | |||
217 | /* thresholds for memory usage. RCU-protected */ | ||
218 | struct mem_cgroup_threshold_ary *thresholds; | ||
219 | |||
220 | /* thresholds for mem+swap usage. RCU-protected */ | ||
221 | struct mem_cgroup_threshold_ary *memsw_thresholds; | ||
222 | |||
228 | /* | 223 | /* |
229 | * statistics. This must be placed at the end of memcg. | 224 | * Should we move charges of a task when a task is moved into this |
225 | * mem_cgroup ? And what type of charges should we move ? | ||
230 | */ | 226 | */ |
231 | struct mem_cgroup_stat stat; | 227 | unsigned long move_charge_at_immigrate; |
228 | |||
229 | /* | ||
230 | * percpu counter. | ||
231 | */ | ||
232 | struct mem_cgroup_stat_cpu *stat; | ||
233 | }; | ||
234 | |||
235 | /* Stuffs for move charges at task migration. */ | ||
236 | /* | ||
237 | * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a | ||
238 | * left-shifted bitmap of these types. | ||
239 | */ | ||
240 | enum move_type { | ||
241 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | ||
242 | NR_MOVE_TYPE, | ||
243 | }; | ||
244 | |||
245 | /* "mc" and its members are protected by cgroup_mutex */ | ||
246 | static struct move_charge_struct { | ||
247 | struct mem_cgroup *from; | ||
248 | struct mem_cgroup *to; | ||
249 | unsigned long precharge; | ||
250 | unsigned long moved_charge; | ||
251 | unsigned long moved_swap; | ||
252 | struct task_struct *moving_task; /* a task moving charges */ | ||
253 | wait_queue_head_t waitq; /* a waitq for other context */ | ||
254 | } mc = { | ||
255 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | ||
232 | }; | 256 | }; |
233 | 257 | ||
234 | /* | 258 | /* |
@@ -371,23 +395,6 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | |||
371 | spin_unlock(&mctz->lock); | 395 | spin_unlock(&mctz->lock); |
372 | } | 396 | } |
373 | 397 | ||
374 | static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) | ||
375 | { | ||
376 | bool ret = false; | ||
377 | int cpu; | ||
378 | s64 val; | ||
379 | struct mem_cgroup_stat_cpu *cpustat; | ||
380 | |||
381 | cpu = get_cpu(); | ||
382 | cpustat = &mem->stat.cpustat[cpu]; | ||
383 | val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
384 | if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { | ||
385 | __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
386 | ret = true; | ||
387 | } | ||
388 | put_cpu(); | ||
389 | return ret; | ||
390 | } | ||
391 | 398 | ||
392 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | 399 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) |
393 | { | 400 | { |
@@ -481,17 +488,31 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | |||
481 | return mz; | 488 | return mz; |
482 | } | 489 | } |
483 | 490 | ||
491 | static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, | ||
492 | enum mem_cgroup_stat_index idx) | ||
493 | { | ||
494 | int cpu; | ||
495 | s64 val = 0; | ||
496 | |||
497 | for_each_possible_cpu(cpu) | ||
498 | val += per_cpu(mem->stat->count[idx], cpu); | ||
499 | return val; | ||
500 | } | ||
501 | |||
502 | static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) | ||
503 | { | ||
504 | s64 ret; | ||
505 | |||
506 | ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); | ||
507 | ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); | ||
508 | return ret; | ||
509 | } | ||
510 | |||
484 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | 511 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, |
485 | bool charge) | 512 | bool charge) |
486 | { | 513 | { |
487 | int val = (charge) ? 1 : -1; | 514 | int val = (charge) ? 1 : -1; |
488 | struct mem_cgroup_stat *stat = &mem->stat; | 515 | this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); |
489 | struct mem_cgroup_stat_cpu *cpustat; | ||
490 | int cpu = get_cpu(); | ||
491 | |||
492 | cpustat = &stat->cpustat[cpu]; | ||
493 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); | ||
494 | put_cpu(); | ||
495 | } | 516 | } |
496 | 517 | ||
497 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 518 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
@@ -499,24 +520,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
499 | bool charge) | 520 | bool charge) |
500 | { | 521 | { |
501 | int val = (charge) ? 1 : -1; | 522 | int val = (charge) ? 1 : -1; |
502 | struct mem_cgroup_stat *stat = &mem->stat; | ||
503 | struct mem_cgroup_stat_cpu *cpustat; | ||
504 | int cpu = get_cpu(); | ||
505 | 523 | ||
506 | cpustat = &stat->cpustat[cpu]; | 524 | preempt_disable(); |
525 | |||
507 | if (PageCgroupCache(pc)) | 526 | if (PageCgroupCache(pc)) |
508 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); | 527 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); |
509 | else | 528 | else |
510 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); | 529 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); |
511 | 530 | ||
512 | if (charge) | 531 | if (charge) |
513 | __mem_cgroup_stat_add_safe(cpustat, | 532 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); |
514 | MEM_CGROUP_STAT_PGPGIN_COUNT, 1); | ||
515 | else | 533 | else |
516 | __mem_cgroup_stat_add_safe(cpustat, | 534 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); |
517 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 535 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); |
518 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); | 536 | |
519 | put_cpu(); | 537 | preempt_enable(); |
520 | } | 538 | } |
521 | 539 | ||
522 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | 540 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, |
@@ -534,6 +552,29 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | |||
534 | return total; | 552 | return total; |
535 | } | 553 | } |
536 | 554 | ||
555 | static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) | ||
556 | { | ||
557 | s64 val; | ||
558 | |||
559 | val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); | ||
560 | |||
561 | return !(val & ((1 << event_mask_shift) - 1)); | ||
562 | } | ||
563 | |||
564 | /* | ||
565 | * Check events in order. | ||
566 | * | ||
567 | */ | ||
568 | static void memcg_check_events(struct mem_cgroup *mem, struct page *page) | ||
569 | { | ||
570 | /* threshold event is triggered in finer grain than soft limit */ | ||
571 | if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { | ||
572 | mem_cgroup_threshold(mem); | ||
573 | if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) | ||
574 | mem_cgroup_update_tree(mem, page); | ||
575 | } | ||
576 | } | ||
577 | |||
537 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 578 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
538 | { | 579 | { |
539 | return container_of(cgroup_subsys_state(cont, | 580 | return container_of(cgroup_subsys_state(cont, |
@@ -1000,7 +1041,7 @@ static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) | |||
1000 | } | 1041 | } |
1001 | 1042 | ||
1002 | /** | 1043 | /** |
1003 | * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. | 1044 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. |
1004 | * @memcg: The memory cgroup that went over limit | 1045 | * @memcg: The memory cgroup that went over limit |
1005 | * @p: Task that is going to be killed | 1046 | * @p: Task that is going to be killed |
1006 | * | 1047 | * |
@@ -1174,7 +1215,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1174 | } | 1215 | } |
1175 | } | 1216 | } |
1176 | } | 1217 | } |
1177 | if (!mem_cgroup_local_usage(&victim->stat)) { | 1218 | if (!mem_cgroup_local_usage(victim)) { |
1178 | /* this cgroup's local usage == 0 */ | 1219 | /* this cgroup's local usage == 0 */ |
1179 | css_put(&victim->css); | 1220 | css_put(&victim->css); |
1180 | continue; | 1221 | continue; |
@@ -1205,32 +1246,102 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1205 | return total; | 1246 | return total; |
1206 | } | 1247 | } |
1207 | 1248 | ||
1208 | bool mem_cgroup_oom_called(struct task_struct *task) | 1249 | static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data) |
1209 | { | 1250 | { |
1210 | bool ret = false; | 1251 | int *val = (int *)data; |
1211 | struct mem_cgroup *mem; | 1252 | int x; |
1212 | struct mm_struct *mm; | 1253 | /* |
1254 | * Logically, we can stop scanning immediately when we find | ||
1255 | * a memcg is already locked. But condidering unlock ops and | ||
1256 | * creation/removal of memcg, scan-all is simple operation. | ||
1257 | */ | ||
1258 | x = atomic_inc_return(&mem->oom_lock); | ||
1259 | *val = max(x, *val); | ||
1260 | return 0; | ||
1261 | } | ||
1262 | /* | ||
1263 | * Check OOM-Killer is already running under our hierarchy. | ||
1264 | * If someone is running, return false. | ||
1265 | */ | ||
1266 | static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) | ||
1267 | { | ||
1268 | int lock_count = 0; | ||
1213 | 1269 | ||
1214 | rcu_read_lock(); | 1270 | mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); |
1215 | mm = task->mm; | 1271 | |
1216 | if (!mm) | 1272 | if (lock_count == 1) |
1217 | mm = &init_mm; | 1273 | return true; |
1218 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1274 | return false; |
1219 | if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) | ||
1220 | ret = true; | ||
1221 | rcu_read_unlock(); | ||
1222 | return ret; | ||
1223 | } | 1275 | } |
1224 | 1276 | ||
1225 | static int record_last_oom_cb(struct mem_cgroup *mem, void *data) | 1277 | static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) |
1226 | { | 1278 | { |
1227 | mem->last_oom_jiffies = jiffies; | 1279 | /* |
1280 | * When a new child is created while the hierarchy is under oom, | ||
1281 | * mem_cgroup_oom_lock() may not be called. We have to use | ||
1282 | * atomic_add_unless() here. | ||
1283 | */ | ||
1284 | atomic_add_unless(&mem->oom_lock, -1, 0); | ||
1228 | return 0; | 1285 | return 0; |
1229 | } | 1286 | } |
1230 | 1287 | ||
1231 | static void record_last_oom(struct mem_cgroup *mem) | 1288 | static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) |
1232 | { | 1289 | { |
1233 | mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); | 1290 | mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb); |
1291 | } | ||
1292 | |||
1293 | static DEFINE_MUTEX(memcg_oom_mutex); | ||
1294 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | ||
1295 | |||
1296 | /* | ||
1297 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | ||
1298 | */ | ||
1299 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | ||
1300 | { | ||
1301 | DEFINE_WAIT(wait); | ||
1302 | bool locked; | ||
1303 | |||
1304 | /* At first, try to OOM lock hierarchy under mem.*/ | ||
1305 | mutex_lock(&memcg_oom_mutex); | ||
1306 | locked = mem_cgroup_oom_lock(mem); | ||
1307 | /* | ||
1308 | * Even if signal_pending(), we can't quit charge() loop without | ||
1309 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | ||
1310 | * under OOM is always welcomed, use TASK_KILLABLE here. | ||
1311 | */ | ||
1312 | if (!locked) | ||
1313 | prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE); | ||
1314 | mutex_unlock(&memcg_oom_mutex); | ||
1315 | |||
1316 | if (locked) | ||
1317 | mem_cgroup_out_of_memory(mem, mask); | ||
1318 | else { | ||
1319 | schedule(); | ||
1320 | finish_wait(&memcg_oom_waitq, &wait); | ||
1321 | } | ||
1322 | mutex_lock(&memcg_oom_mutex); | ||
1323 | mem_cgroup_oom_unlock(mem); | ||
1324 | /* | ||
1325 | * Here, we use global waitq .....more fine grained waitq ? | ||
1326 | * Assume following hierarchy. | ||
1327 | * A/ | ||
1328 | * 01 | ||
1329 | * 02 | ||
1330 | * assume OOM happens both in A and 01 at the same time. Tthey are | ||
1331 | * mutually exclusive by lock. (kill in 01 helps A.) | ||
1332 | * When we use per memcg waitq, we have to wake up waiters on A and 02 | ||
1333 | * in addtion to waiters on 01. We use global waitq for avoiding mess. | ||
1334 | * It will not be a big problem. | ||
1335 | * (And a task may be moved to other groups while it's waiting for OOM.) | ||
1336 | */ | ||
1337 | wake_up_all(&memcg_oom_waitq); | ||
1338 | mutex_unlock(&memcg_oom_mutex); | ||
1339 | |||
1340 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | ||
1341 | return false; | ||
1342 | /* Give chance to dying process */ | ||
1343 | schedule_timeout(1); | ||
1344 | return true; | ||
1234 | } | 1345 | } |
1235 | 1346 | ||
1236 | /* | 1347 | /* |
@@ -1240,9 +1351,6 @@ static void record_last_oom(struct mem_cgroup *mem) | |||
1240 | void mem_cgroup_update_file_mapped(struct page *page, int val) | 1351 | void mem_cgroup_update_file_mapped(struct page *page, int val) |
1241 | { | 1352 | { |
1242 | struct mem_cgroup *mem; | 1353 | struct mem_cgroup *mem; |
1243 | struct mem_cgroup_stat *stat; | ||
1244 | struct mem_cgroup_stat_cpu *cpustat; | ||
1245 | int cpu; | ||
1246 | struct page_cgroup *pc; | 1354 | struct page_cgroup *pc; |
1247 | 1355 | ||
1248 | pc = lookup_page_cgroup(page); | 1356 | pc = lookup_page_cgroup(page); |
@@ -1258,13 +1366,10 @@ void mem_cgroup_update_file_mapped(struct page *page, int val) | |||
1258 | goto done; | 1366 | goto done; |
1259 | 1367 | ||
1260 | /* | 1368 | /* |
1261 | * Preemption is already disabled, we don't need get_cpu() | 1369 | * Preemption is already disabled. We can use __this_cpu_xxx |
1262 | */ | 1370 | */ |
1263 | cpu = smp_processor_id(); | 1371 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val); |
1264 | stat = &mem->stat; | ||
1265 | cpustat = &stat->cpustat[cpu]; | ||
1266 | 1372 | ||
1267 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val); | ||
1268 | done: | 1373 | done: |
1269 | unlock_page_cgroup(pc); | 1374 | unlock_page_cgroup(pc); |
1270 | } | 1375 | } |
@@ -1401,19 +1506,21 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, | |||
1401 | * oom-killer can be invoked. | 1506 | * oom-killer can be invoked. |
1402 | */ | 1507 | */ |
1403 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1508 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
1404 | gfp_t gfp_mask, struct mem_cgroup **memcg, | 1509 | gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) |
1405 | bool oom, struct page *page) | ||
1406 | { | 1510 | { |
1407 | struct mem_cgroup *mem, *mem_over_limit; | 1511 | struct mem_cgroup *mem, *mem_over_limit; |
1408 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1512 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1409 | struct res_counter *fail_res; | 1513 | struct res_counter *fail_res; |
1410 | int csize = CHARGE_SIZE; | 1514 | int csize = CHARGE_SIZE; |
1411 | 1515 | ||
1412 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { | 1516 | /* |
1413 | /* Don't account this! */ | 1517 | * Unlike gloval-vm's OOM-kill, we're not in memory shortage |
1414 | *memcg = NULL; | 1518 | * in system level. So, allow to go ahead dying process in addition to |
1415 | return 0; | 1519 | * MEMDIE process. |
1416 | } | 1520 | */ |
1521 | if (unlikely(test_thread_flag(TIF_MEMDIE) | ||
1522 | || fatal_signal_pending(current))) | ||
1523 | goto bypass; | ||
1417 | 1524 | ||
1418 | /* | 1525 | /* |
1419 | * We always charge the cgroup the mm_struct belongs to. | 1526 | * We always charge the cgroup the mm_struct belongs to. |
@@ -1440,7 +1547,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1440 | unsigned long flags = 0; | 1547 | unsigned long flags = 0; |
1441 | 1548 | ||
1442 | if (consume_stock(mem)) | 1549 | if (consume_stock(mem)) |
1443 | goto charged; | 1550 | goto done; |
1444 | 1551 | ||
1445 | ret = res_counter_charge(&mem->res, csize, &fail_res); | 1552 | ret = res_counter_charge(&mem->res, csize, &fail_res); |
1446 | if (likely(!ret)) { | 1553 | if (likely(!ret)) { |
@@ -1483,28 +1590,70 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1483 | if (mem_cgroup_check_under_limit(mem_over_limit)) | 1590 | if (mem_cgroup_check_under_limit(mem_over_limit)) |
1484 | continue; | 1591 | continue; |
1485 | 1592 | ||
1593 | /* try to avoid oom while someone is moving charge */ | ||
1594 | if (mc.moving_task && current != mc.moving_task) { | ||
1595 | struct mem_cgroup *from, *to; | ||
1596 | bool do_continue = false; | ||
1597 | /* | ||
1598 | * There is a small race that "from" or "to" can be | ||
1599 | * freed by rmdir, so we use css_tryget(). | ||
1600 | */ | ||
1601 | rcu_read_lock(); | ||
1602 | from = mc.from; | ||
1603 | to = mc.to; | ||
1604 | if (from && css_tryget(&from->css)) { | ||
1605 | if (mem_over_limit->use_hierarchy) | ||
1606 | do_continue = css_is_ancestor( | ||
1607 | &from->css, | ||
1608 | &mem_over_limit->css); | ||
1609 | else | ||
1610 | do_continue = (from == mem_over_limit); | ||
1611 | css_put(&from->css); | ||
1612 | } | ||
1613 | if (!do_continue && to && css_tryget(&to->css)) { | ||
1614 | if (mem_over_limit->use_hierarchy) | ||
1615 | do_continue = css_is_ancestor( | ||
1616 | &to->css, | ||
1617 | &mem_over_limit->css); | ||
1618 | else | ||
1619 | do_continue = (to == mem_over_limit); | ||
1620 | css_put(&to->css); | ||
1621 | } | ||
1622 | rcu_read_unlock(); | ||
1623 | if (do_continue) { | ||
1624 | DEFINE_WAIT(wait); | ||
1625 | prepare_to_wait(&mc.waitq, &wait, | ||
1626 | TASK_INTERRUPTIBLE); | ||
1627 | /* moving charge context might have finished. */ | ||
1628 | if (mc.moving_task) | ||
1629 | schedule(); | ||
1630 | finish_wait(&mc.waitq, &wait); | ||
1631 | continue; | ||
1632 | } | ||
1633 | } | ||
1634 | |||
1486 | if (!nr_retries--) { | 1635 | if (!nr_retries--) { |
1487 | if (oom) { | 1636 | if (!oom) |
1488 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); | 1637 | goto nomem; |
1489 | record_last_oom(mem_over_limit); | 1638 | if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) { |
1639 | nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
1640 | continue; | ||
1490 | } | 1641 | } |
1491 | goto nomem; | 1642 | /* When we reach here, the current task is dying. */ |
1643 | css_put(&mem->css); | ||
1644 | goto bypass; | ||
1492 | } | 1645 | } |
1493 | } | 1646 | } |
1494 | if (csize > PAGE_SIZE) | 1647 | if (csize > PAGE_SIZE) |
1495 | refill_stock(mem, csize - PAGE_SIZE); | 1648 | refill_stock(mem, csize - PAGE_SIZE); |
1496 | charged: | ||
1497 | /* | ||
1498 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | ||
1499 | * if they exceeds softlimit. | ||
1500 | */ | ||
1501 | if (mem_cgroup_soft_limit_check(mem)) | ||
1502 | mem_cgroup_update_tree(mem, page); | ||
1503 | done: | 1649 | done: |
1504 | return 0; | 1650 | return 0; |
1505 | nomem: | 1651 | nomem: |
1506 | css_put(&mem->css); | 1652 | css_put(&mem->css); |
1507 | return -ENOMEM; | 1653 | return -ENOMEM; |
1654 | bypass: | ||
1655 | *memcg = NULL; | ||
1656 | return 0; | ||
1508 | } | 1657 | } |
1509 | 1658 | ||
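
The reworked __mem_cgroup_try_charge() above now has three ways out: "done" when the charge succeeds, "nomem" when reclaim and the OOM handler cannot help, and the new "bypass" label that waves dying tasks through without accounting. Below is a condensed user-space sketch of that control flow; the counter struct, the reclaim stub and the retry count are illustrative stand-ins, not the res_counter API.

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE    4096
    #define RETRIES 5

    struct counter { long usage, limit; };

    /* stand-in for res_counter_charge(): fail if the limit would be exceeded */
    static bool charge(struct counter *c, long bytes)
    {
        if (c->usage + bytes > c->limit)
            return false;
        c->usage += bytes;
        return true;
    }

    /* stand-in for memory reclaim; pretend nothing could be freed */
    static long reclaim(struct counter *c) { (void)c; return 0; }

    /* 0: charged, 0 with *bypassed set: dying task let through, -1: -ENOMEM */
    static int try_charge(struct counter *c, bool dying, bool *bypassed)
    {
        *bypassed = false;
        if (dying) {                    /* TIF_MEMDIE or fatal signal pending */
            *bypassed = true;           /* the "bypass" label: don't account */
            return 0;
        }
        for (int retry = RETRIES; ; retry--) {
            if (charge(c, PAGE))
                return 0;               /* "done" */
            if (reclaim(c) > 0)
                continue;               /* made progress, try again */
            if (!retry)
                return -1;              /* "nomem" */
        }
    }

    int main(void)
    {
        struct counter memcg = { .usage = 0, .limit = 2 * PAGE };
        bool bypassed;

        printf("%d\n", try_charge(&memcg, false, &bypassed));  /* 0 */
        printf("%d\n", try_charge(&memcg, false, &bypassed));  /* 0 */
        printf("%d\n", try_charge(&memcg, false, &bypassed));  /* -1, over limit */
        int r = try_charge(&memcg, true, &bypassed);            /* dying task */
        printf("%d %d\n", r, bypassed);                          /* 0 1 */
        return 0;
    }
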
1510 | /* | 1659 | /* |
@@ -1512,14 +1661,23 @@ nomem: | |||
1512 | * This function is for that and do uncharge, put css's refcnt. | 1661 | * This function is for that and do uncharge, put css's refcnt. |
1513 | * gotten by try_charge(). | 1662 | * gotten by try_charge(). |
1514 | */ | 1663 | */ |
1515 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | 1664 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, |
1665 | unsigned long count) | ||
1516 | { | 1666 | { |
1517 | if (!mem_cgroup_is_root(mem)) { | 1667 | if (!mem_cgroup_is_root(mem)) { |
1518 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1668 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); |
1519 | if (do_swap_account) | 1669 | if (do_swap_account) |
1520 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1670 | res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); |
1671 | VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); | ||
1672 | WARN_ON_ONCE(count > INT_MAX); | ||
1673 | __css_put(&mem->css, (int)count); | ||
1521 | } | 1674 | } |
1522 | css_put(&mem->css); | 1675 | /* we don't need css_put for root */ |
1676 | } | ||
1677 | |||
1678 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | ||
1679 | { | ||
1680 | __mem_cgroup_cancel_charge(mem, 1); | ||
1523 | } | 1681 | } |
1524 | 1682 | ||
1525 | /* | 1683 | /* |
@@ -1615,6 +1773,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1615 | mem_cgroup_charge_statistics(mem, pc, true); | 1773 | mem_cgroup_charge_statistics(mem, pc, true); |
1616 | 1774 | ||
1617 | unlock_page_cgroup(pc); | 1775 | unlock_page_cgroup(pc); |
1776 | /* | ||
1777 | * "charge_statistics" updated event counter. Then, check it. | ||
1778 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | ||
1779 | * if they exceeds softlimit. | ||
1780 | */ | ||
1781 | memcg_check_events(mem, pc->page); | ||
1618 | } | 1782 | } |
1619 | 1783 | ||
1620 | /** | 1784 | /** |
@@ -1622,22 +1786,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1622 | * @pc: page_cgroup of the page. | 1786 | * @pc: page_cgroup of the page. |
1623 | * @from: mem_cgroup which the page is moved from. | 1787 | * @from: mem_cgroup which the page is moved from. |
1624 | * @to: mem_cgroup which the page is moved to. @from != @to. | 1788 | * @to: mem_cgroup which the page is moved to. @from != @to. |
1789 | * @uncharge: whether we should call uncharge and css_put against @from. | ||
1625 | * | 1790 | * |
1626 | * The caller must confirm following. | 1791 | * The caller must confirm following. |
1627 | * - page is not on LRU (isolate_page() is useful.) | 1792 | * - page is not on LRU (isolate_page() is useful.) |
1628 | * - the pc is locked, used, and ->mem_cgroup points to @from. | 1793 | * - the pc is locked, used, and ->mem_cgroup points to @from. |
1629 | * | 1794 | * |
1630 | * This function does "uncharge" from old cgroup but doesn't do "charge" to | 1795 | * This function doesn't do "charge" or css_get to the new cgroup. That |
1631 | * new cgroup. It should be done by a caller. | 1796 | * should be done by the caller (__mem_cgroup_try_charge would be useful). |
1797 | * If @uncharge is true, this function does "uncharge" from the old cgroup; | ||
1798 | * if @uncharge is false, the caller should do the "uncharge". | ||
1632 | */ | 1799 | */ |
1633 | 1800 | ||
1634 | static void __mem_cgroup_move_account(struct page_cgroup *pc, | 1801 | static void __mem_cgroup_move_account(struct page_cgroup *pc, |
1635 | struct mem_cgroup *from, struct mem_cgroup *to) | 1802 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) |
1636 | { | 1803 | { |
1637 | struct page *page; | 1804 | struct page *page; |
1638 | int cpu; | ||
1639 | struct mem_cgroup_stat *stat; | ||
1640 | struct mem_cgroup_stat_cpu *cpustat; | ||
1641 | 1805 | ||
1642 | VM_BUG_ON(from == to); | 1806 | VM_BUG_ON(from == to); |
1643 | VM_BUG_ON(PageLRU(pc->page)); | 1807 | VM_BUG_ON(PageLRU(pc->page)); |
@@ -1645,38 +1809,28 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, | |||
1645 | VM_BUG_ON(!PageCgroupUsed(pc)); | 1809 | VM_BUG_ON(!PageCgroupUsed(pc)); |
1646 | VM_BUG_ON(pc->mem_cgroup != from); | 1810 | VM_BUG_ON(pc->mem_cgroup != from); |
1647 | 1811 | ||
1648 | if (!mem_cgroup_is_root(from)) | ||
1649 | res_counter_uncharge(&from->res, PAGE_SIZE); | ||
1650 | mem_cgroup_charge_statistics(from, pc, false); | ||
1651 | |||
1652 | page = pc->page; | 1812 | page = pc->page; |
1653 | if (page_mapped(page) && !PageAnon(page)) { | 1813 | if (page_mapped(page) && !PageAnon(page)) { |
1654 | cpu = smp_processor_id(); | 1814 | /* Update mapped_file data for mem_cgroup */ |
1655 | /* Update mapped_file data for mem_cgroup "from" */ | 1815 | preempt_disable(); |
1656 | stat = &from->stat; | 1816 | __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
1657 | cpustat = &stat->cpustat[cpu]; | 1817 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
1658 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, | 1818 | preempt_enable(); |
1659 | -1); | ||
1660 | |||
1661 | /* Update mapped_file data for mem_cgroup "to" */ | ||
1662 | stat = &to->stat; | ||
1663 | cpustat = &stat->cpustat[cpu]; | ||
1664 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, | ||
1665 | 1); | ||
1666 | } | 1819 | } |
1820 | mem_cgroup_charge_statistics(from, pc, false); | ||
1821 | if (uncharge) | ||
1822 | /* This is not "cancel", but cancel_charge does all we need. */ | ||
1823 | mem_cgroup_cancel_charge(from); | ||
1667 | 1824 | ||
1668 | if (do_swap_account && !mem_cgroup_is_root(from)) | 1825 | /* caller should have done css_get */ |
1669 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
1670 | css_put(&from->css); | ||
1671 | |||
1672 | css_get(&to->css); | ||
1673 | pc->mem_cgroup = to; | 1826 | pc->mem_cgroup = to; |
1674 | mem_cgroup_charge_statistics(to, pc, true); | 1827 | mem_cgroup_charge_statistics(to, pc, true); |
1675 | /* | 1828 | /* |
1676 | * We charge against "to" which may not have any tasks. Then, "to" | 1829 | * We charge against "to" which may not have any tasks. Then, "to" |
1677 | * can be under rmdir(). But in current implementation, caller of | 1830 | * can be under rmdir(). But in current implementation, caller of |
1678 | * this function is just force_empty() and it's garanteed that | 1831 | * this function is just force_empty() and move charge, so it's |
1679 | * "to" is never removed. So, we don't check rmdir status here. | 1832 | * garanteed that "to" is never removed. So, we don't check rmdir |
1833 | * status here. | ||
1680 | */ | 1834 | */ |
1681 | } | 1835 | } |
1682 | 1836 | ||
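
__mem_cgroup_move_account() now transfers the FILE_MAPPED statistic with a __this_cpu_dec()/__this_cpu_inc() pair and only gives the charge back to the source group when @uncharge is true; the destination is assumed to have been charged by the caller. A toy version of "move the accounting of one page from one group to another" under those rules follows (plain C, illustrative field names, no locking).

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE 4096

    struct group {
        const char *name;
        long usage;             /* bytes charged, like res->usage */
        long file_mapped;       /* per-group statistic */
    };

    /* move accounting of one mapped file page from @from to @to */
    static void move_account(struct group *from, struct group *to,
                             bool page_is_mapped_file, bool uncharge)
    {
        if (page_is_mapped_file) {
            from->file_mapped--;        /* __this_cpu_dec() in the kernel */
            to->file_mapped++;          /* __this_cpu_inc() in the kernel */
        }
        if (uncharge)
            from->usage -= PAGE;        /* mem_cgroup_cancel_charge(from) */
        /* the caller is expected to have charged @to already */
    }

    int main(void)
    {
        struct group a = { "A", PAGE, 1 };
        struct group b = { "B", PAGE, 0 };      /* caller precharged B */

        move_account(&a, &b, true, true);
        printf("%s: usage=%ld mapped=%ld\n", a.name, a.usage, a.file_mapped);
        printf("%s: usage=%ld mapped=%ld\n", b.name, b.usage, b.file_mapped);
        return 0;
    }
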
@@ -1685,15 +1839,20 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, | |||
1685 | * __mem_cgroup_move_account() | 1839 | * __mem_cgroup_move_account() |
1686 | */ | 1840 | */ |
1687 | static int mem_cgroup_move_account(struct page_cgroup *pc, | 1841 | static int mem_cgroup_move_account(struct page_cgroup *pc, |
1688 | struct mem_cgroup *from, struct mem_cgroup *to) | 1842 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) |
1689 | { | 1843 | { |
1690 | int ret = -EINVAL; | 1844 | int ret = -EINVAL; |
1691 | lock_page_cgroup(pc); | 1845 | lock_page_cgroup(pc); |
1692 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { | 1846 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { |
1693 | __mem_cgroup_move_account(pc, from, to); | 1847 | __mem_cgroup_move_account(pc, from, to, uncharge); |
1694 | ret = 0; | 1848 | ret = 0; |
1695 | } | 1849 | } |
1696 | unlock_page_cgroup(pc); | 1850 | unlock_page_cgroup(pc); |
1851 | /* | ||
1852 | * check events | ||
1853 | */ | ||
1854 | memcg_check_events(to, pc->page); | ||
1855 | memcg_check_events(from, pc->page); | ||
1697 | return ret; | 1856 | return ret; |
1698 | } | 1857 | } |
1699 | 1858 | ||
@@ -1722,15 +1881,13 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
1722 | goto put; | 1881 | goto put; |
1723 | 1882 | ||
1724 | parent = mem_cgroup_from_cont(pcg); | 1883 | parent = mem_cgroup_from_cont(pcg); |
1725 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); | 1884 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); |
1726 | if (ret || !parent) | 1885 | if (ret || !parent) |
1727 | goto put_back; | 1886 | goto put_back; |
1728 | 1887 | ||
1729 | ret = mem_cgroup_move_account(pc, child, parent); | 1888 | ret = mem_cgroup_move_account(pc, child, parent, true); |
1730 | if (!ret) | 1889 | if (ret) |
1731 | css_put(&parent->css); /* drop extra refcnt by try_charge() */ | 1890 | mem_cgroup_cancel_charge(parent); |
1732 | else | ||
1733 | mem_cgroup_cancel_charge(parent); /* does css_put */ | ||
1734 | put_back: | 1891 | put_back: |
1735 | putback_lru_page(page); | 1892 | putback_lru_page(page); |
1736 | put: | 1893 | put: |
@@ -1760,7 +1917,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
1760 | prefetchw(pc); | 1917 | prefetchw(pc); |
1761 | 1918 | ||
1762 | mem = memcg; | 1919 | mem = memcg; |
1763 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); | 1920 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); |
1764 | if (ret || !mem) | 1921 | if (ret || !mem) |
1765 | return ret; | 1922 | return ret; |
1766 | 1923 | ||
@@ -1880,14 +2037,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
1880 | if (!mem) | 2037 | if (!mem) |
1881 | goto charge_cur_mm; | 2038 | goto charge_cur_mm; |
1882 | *ptr = mem; | 2039 | *ptr = mem; |
1883 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); | 2040 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); |
1884 | /* drop extra refcnt from tryget */ | 2041 | /* drop extra refcnt from tryget */ |
1885 | css_put(&mem->css); | 2042 | css_put(&mem->css); |
1886 | return ret; | 2043 | return ret; |
1887 | charge_cur_mm: | 2044 | charge_cur_mm: |
1888 | if (unlikely(!mm)) | 2045 | if (unlikely(!mm)) |
1889 | mm = &init_mm; | 2046 | mm = &init_mm; |
1890 | return __mem_cgroup_try_charge(mm, mask, ptr, true, page); | 2047 | return __mem_cgroup_try_charge(mm, mask, ptr, true); |
1891 | } | 2048 | } |
1892 | 2049 | ||
1893 | static void | 2050 | static void |
@@ -2064,8 +2221,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2064 | mz = page_cgroup_zoneinfo(pc); | 2221 | mz = page_cgroup_zoneinfo(pc); |
2065 | unlock_page_cgroup(pc); | 2222 | unlock_page_cgroup(pc); |
2066 | 2223 | ||
2067 | if (mem_cgroup_soft_limit_check(mem)) | 2224 | memcg_check_events(mem, page); |
2068 | mem_cgroup_update_tree(mem, page); | ||
2069 | /* at swapout, this memcg will be accessed to record to swap */ | 2225 | /* at swapout, this memcg will be accessed to record to swap */ |
2070 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2226 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
2071 | css_put(&mem->css); | 2227 | css_put(&mem->css); |
@@ -2192,6 +2348,64 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
2192 | } | 2348 | } |
2193 | rcu_read_unlock(); | 2349 | rcu_read_unlock(); |
2194 | } | 2350 | } |
2351 | |||
2352 | /** | ||
2353 | * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. | ||
2354 | * @entry: swap entry to be moved | ||
2355 | * @from: mem_cgroup which the entry is moved from | ||
2356 | * @to: mem_cgroup which the entry is moved to | ||
2357 | * @need_fixup: whether we should fixup res_counters and refcounts. | ||
2358 | * | ||
2359 | * It succeeds only when the swap_cgroup's record for this entry is the same | ||
2360 | * as the mem_cgroup's id of @from. | ||
2361 | * | ||
2362 | * Returns 0 on success, -EINVAL on failure. | ||
2363 | * | ||
2364 | * The caller must have charged to @to, IOW, called res_counter_charge() about | ||
2365 | * both res and memsw, and called css_get(). | ||
2366 | */ | ||
2367 | static int mem_cgroup_move_swap_account(swp_entry_t entry, | ||
2368 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | ||
2369 | { | ||
2370 | unsigned short old_id, new_id; | ||
2371 | |||
2372 | old_id = css_id(&from->css); | ||
2373 | new_id = css_id(&to->css); | ||
2374 | |||
2375 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { | ||
2376 | mem_cgroup_swap_statistics(from, false); | ||
2377 | mem_cgroup_swap_statistics(to, true); | ||
2378 | /* | ||
2379 | * This function is only called from task migration context now. | ||
2380 | * It postpones res_counter and refcount handling till the end | ||
2381 | * of task migration (mem_cgroup_clear_mc()) for performance | ||
2382 | * improvement. But we cannot postpone mem_cgroup_get(to) | ||
2383 | * because if the process that has been moved to @to does | ||
2384 | * swap-in, the refcount of @to might be decreased to 0. | ||
2385 | */ | ||
2386 | mem_cgroup_get(to); | ||
2387 | if (need_fixup) { | ||
2388 | if (!mem_cgroup_is_root(from)) | ||
2389 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
2390 | mem_cgroup_put(from); | ||
2391 | /* | ||
2392 | * we charged both to->res and to->memsw, so we should | ||
2393 | * uncharge to->res. | ||
2394 | */ | ||
2395 | if (!mem_cgroup_is_root(to)) | ||
2396 | res_counter_uncharge(&to->res, PAGE_SIZE); | ||
2397 | css_put(&to->css); | ||
2398 | } | ||
2399 | return 0; | ||
2400 | } | ||
2401 | return -EINVAL; | ||
2402 | } | ||
2403 | #else | ||
2404 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | ||
2405 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | ||
2406 | { | ||
2407 | return -EINVAL; | ||
2408 | } | ||
2195 | #endif | 2409 | #endif |
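
mem_cgroup_move_swap_account() only relabels a swap entry if swap_cgroup_cmpxchg() confirms that the record still belongs to @from, so the hand-over happens only when nobody changed the record in the meantime. The same "claim it only if it is still mine" step is shown below in stand-alone C11, with the swap_cgroup record array reduced to one atomic id per slot; this is an illustration, not the kernel helper.

    #include <stdatomic.h>
    #include <stdio.h>

    /* one owner id per swap slot, like the swap_cgroup records */
    static _Atomic unsigned short owner[8];

    /* return the old id; the record changes only if it was still old_id */
    static unsigned short swap_cmpxchg(int slot, unsigned short old_id,
                                       unsigned short new_id)
    {
        unsigned short expected = old_id;
        atomic_compare_exchange_strong(&owner[slot], &expected, new_id);
        return expected;    /* == old_id on success, current owner otherwise */
    }

    static int move_swap_account(int slot, unsigned short from, unsigned short to)
    {
        if (swap_cmpxchg(slot, from, to) == from)
            return 0;       /* record now points at "to" */
        return -1;          /* somebody else owned it: -EINVAL in the kernel */
    }

    int main(void)
    {
        atomic_store(&owner[3], 5);
        printf("%d\n", move_swap_account(3, 5, 7));   /* 0: moved */
        printf("%d\n", move_swap_account(3, 5, 7));   /* -1: no longer owned by 5 */
        printf("owner=%u\n", (unsigned)atomic_load(&owner[3]));   /* 7 */
        return 0;
    }
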
2196 | 2410 | ||
2197 | /* | 2411 | /* |
@@ -2216,8 +2430,7 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | |||
2216 | unlock_page_cgroup(pc); | 2430 | unlock_page_cgroup(pc); |
2217 | 2431 | ||
2218 | if (mem) { | 2432 | if (mem) { |
2219 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, | 2433 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); |
2220 | page); | ||
2221 | css_put(&mem->css); | 2434 | css_put(&mem->css); |
2222 | } | 2435 | } |
2223 | *ptr = mem; | 2436 | *ptr = mem; |
@@ -2545,7 +2758,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
2545 | pc = list_entry(list->prev, struct page_cgroup, lru); | 2758 | pc = list_entry(list->prev, struct page_cgroup, lru); |
2546 | if (busy == pc) { | 2759 | if (busy == pc) { |
2547 | list_move(&pc->lru, list); | 2760 | list_move(&pc->lru, list); |
2548 | busy = 0; | 2761 | busy = NULL; |
2549 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 2762 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
2550 | continue; | 2763 | continue; |
2551 | } | 2764 | } |
@@ -2704,7 +2917,7 @@ static int | |||
2704 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) | 2917 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) |
2705 | { | 2918 | { |
2706 | struct mem_cgroup_idx_data *d = data; | 2919 | struct mem_cgroup_idx_data *d = data; |
2707 | d->val += mem_cgroup_read_stat(&mem->stat, d->idx); | 2920 | d->val += mem_cgroup_read_stat(mem, d->idx); |
2708 | return 0; | 2921 | return 0; |
2709 | } | 2922 | } |
2710 | 2923 | ||
@@ -2719,40 +2932,50 @@ mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, | |||
2719 | *val = d.val; | 2932 | *val = d.val; |
2720 | } | 2933 | } |
2721 | 2934 | ||
2935 | static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) | ||
2936 | { | ||
2937 | u64 idx_val, val; | ||
2938 | |||
2939 | if (!mem_cgroup_is_root(mem)) { | ||
2940 | if (!swap) | ||
2941 | return res_counter_read_u64(&mem->res, RES_USAGE); | ||
2942 | else | ||
2943 | return res_counter_read_u64(&mem->memsw, RES_USAGE); | ||
2944 | } | ||
2945 | |||
2946 | mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); | ||
2947 | val = idx_val; | ||
2948 | mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val); | ||
2949 | val += idx_val; | ||
2950 | |||
2951 | if (swap) { | ||
2952 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2953 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
2954 | val += idx_val; | ||
2955 | } | ||
2956 | |||
2957 | return val << PAGE_SHIFT; | ||
2958 | } | ||
2959 | |||
2722 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 2960 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
2723 | { | 2961 | { |
2724 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 2962 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
2725 | u64 idx_val, val; | 2963 | u64 val; |
2726 | int type, name; | 2964 | int type, name; |
2727 | 2965 | ||
2728 | type = MEMFILE_TYPE(cft->private); | 2966 | type = MEMFILE_TYPE(cft->private); |
2729 | name = MEMFILE_ATTR(cft->private); | 2967 | name = MEMFILE_ATTR(cft->private); |
2730 | switch (type) { | 2968 | switch (type) { |
2731 | case _MEM: | 2969 | case _MEM: |
2732 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { | 2970 | if (name == RES_USAGE) |
2733 | mem_cgroup_get_recursive_idx_stat(mem, | 2971 | val = mem_cgroup_usage(mem, false); |
2734 | MEM_CGROUP_STAT_CACHE, &idx_val); | 2972 | else |
2735 | val = idx_val; | ||
2736 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2737 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2738 | val += idx_val; | ||
2739 | val <<= PAGE_SHIFT; | ||
2740 | } else | ||
2741 | val = res_counter_read_u64(&mem->res, name); | 2973 | val = res_counter_read_u64(&mem->res, name); |
2742 | break; | 2974 | break; |
2743 | case _MEMSWAP: | 2975 | case _MEMSWAP: |
2744 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { | 2976 | if (name == RES_USAGE) |
2745 | mem_cgroup_get_recursive_idx_stat(mem, | 2977 | val = mem_cgroup_usage(mem, true); |
2746 | MEM_CGROUP_STAT_CACHE, &idx_val); | 2978 | else |
2747 | val = idx_val; | ||
2748 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2749 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2750 | val += idx_val; | ||
2751 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2752 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
2753 | val += idx_val; | ||
2754 | val <<= PAGE_SHIFT; | ||
2755 | } else | ||
2756 | val = res_counter_read_u64(&mem->memsw, name); | 2979 | val = res_counter_read_u64(&mem->memsw, name); |
2757 | break; | 2980 | break; |
2758 | default: | 2981 | default: |
@@ -2865,6 +3088,39 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
2865 | return 0; | 3088 | return 0; |
2866 | } | 3089 | } |
2867 | 3090 | ||
3091 | static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, | ||
3092 | struct cftype *cft) | ||
3093 | { | ||
3094 | return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; | ||
3095 | } | ||
3096 | |||
3097 | #ifdef CONFIG_MMU | ||
3098 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | ||
3099 | struct cftype *cft, u64 val) | ||
3100 | { | ||
3101 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3102 | |||
3103 | if (val >= (1 << NR_MOVE_TYPE)) | ||
3104 | return -EINVAL; | ||
3105 | /* | ||
3106 | * We check this value several times, both in can_attach() and | ||
3107 | * attach(), so we need cgroup lock to prevent this value from being | ||
3108 | * inconsistent. | ||
3109 | */ | ||
3110 | cgroup_lock(); | ||
3111 | mem->move_charge_at_immigrate = val; | ||
3112 | cgroup_unlock(); | ||
3113 | |||
3114 | return 0; | ||
3115 | } | ||
3116 | #else | ||
3117 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | ||
3118 | struct cftype *cft, u64 val) | ||
3119 | { | ||
3120 | return -ENOSYS; | ||
3121 | } | ||
3122 | #endif | ||
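
The move_charge_at_immigrate handlers above accept a small bitmask (it must stay below 1 << NR_MOVE_TYPE; this series only defines the bit for anonymous pages) and take cgroup_lock() so that can_attach()/attach() see a stable value. Assuming a v1 memory controller mounted at /sys/fs/cgroup/memory, which is an assumption about the running system rather than anything this patch dictates, user space could switch the feature on like this:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        /* hypothetical mount point and group name */
        const char *path =
            "/sys/fs/cgroup/memory/mygroup/memory.move_charge_at_immigrate";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        /* bit 0: move charges of anonymous pages when a task migrates in */
        if (write(fd, "1", 1) != 1)
            perror("write");
        close(fd);
        return 0;
    }

Once the bit is set, moving a task into "mygroup" also migrates the charges of its anonymous pages, which is what the can_attach()/attach() callbacks added later in this patch implement.
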
3123 | |||
2868 | 3124 | ||
2869 | /* For read statistics */ | 3125 | /* For read statistics */ |
2870 | enum { | 3126 | enum { |
@@ -2910,18 +3166,18 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | |||
2910 | s64 val; | 3166 | s64 val; |
2911 | 3167 | ||
2912 | /* per cpu stat */ | 3168 | /* per cpu stat */ |
2913 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); | 3169 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); |
2914 | s->stat[MCS_CACHE] += val * PAGE_SIZE; | 3170 | s->stat[MCS_CACHE] += val * PAGE_SIZE; |
2915 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | 3171 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); |
2916 | s->stat[MCS_RSS] += val * PAGE_SIZE; | 3172 | s->stat[MCS_RSS] += val * PAGE_SIZE; |
2917 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED); | 3173 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); |
2918 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; | 3174 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; |
2919 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); | 3175 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); |
2920 | s->stat[MCS_PGPGIN] += val; | 3176 | s->stat[MCS_PGPGIN] += val; |
2921 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 3177 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); |
2922 | s->stat[MCS_PGPGOUT] += val; | 3178 | s->stat[MCS_PGPGOUT] += val; |
2923 | if (do_swap_account) { | 3179 | if (do_swap_account) { |
2924 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); | 3180 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); |
2925 | s->stat[MCS_SWAP] += val * PAGE_SIZE; | 3181 | s->stat[MCS_SWAP] += val * PAGE_SIZE; |
2926 | } | 3182 | } |
2927 | 3183 | ||
@@ -3049,12 +3305,249 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | |||
3049 | return 0; | 3305 | return 0; |
3050 | } | 3306 | } |
3051 | 3307 | ||
3308 | static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | ||
3309 | { | ||
3310 | struct mem_cgroup_threshold_ary *t; | ||
3311 | u64 usage; | ||
3312 | int i; | ||
3313 | |||
3314 | rcu_read_lock(); | ||
3315 | if (!swap) | ||
3316 | t = rcu_dereference(memcg->thresholds); | ||
3317 | else | ||
3318 | t = rcu_dereference(memcg->memsw_thresholds); | ||
3319 | |||
3320 | if (!t) | ||
3321 | goto unlock; | ||
3322 | |||
3323 | usage = mem_cgroup_usage(memcg, swap); | ||
3324 | |||
3325 | /* | ||
3326 | * current_threshold points to threshold just below usage. | ||
3327 | * If it's not true, a threshold was crossed after last | ||
3328 | * call of __mem_cgroup_threshold(). | ||
3329 | */ | ||
3330 | i = atomic_read(&t->current_threshold); | ||
3331 | |||
3332 | /* | ||
3333 | * Iterate backward over array of thresholds starting from | ||
3334 | * current_threshold and check if a threshold is crossed. | ||
3335 | * If none of thresholds below usage is crossed, we read | ||
3336 | * only one element of the array here. | ||
3337 | */ | ||
3338 | for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) | ||
3339 | eventfd_signal(t->entries[i].eventfd, 1); | ||
3340 | |||
3341 | /* i = current_threshold + 1 */ | ||
3342 | i++; | ||
3343 | |||
3344 | /* | ||
3345 | * Iterate forward over array of thresholds starting from | ||
3346 | * current_threshold+1 and check if a threshold is crossed. | ||
3347 | * If none of thresholds above usage is crossed, we read | ||
3348 | * only one element of the array here. | ||
3349 | */ | ||
3350 | for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) | ||
3351 | eventfd_signal(t->entries[i].eventfd, 1); | ||
3352 | |||
3353 | /* Update current_threshold */ | ||
3354 | atomic_set(&t->current_threshold, i - 1); | ||
3355 | unlock: | ||
3356 | rcu_read_unlock(); | ||
3357 | } | ||
3358 | |||
3359 | static void mem_cgroup_threshold(struct mem_cgroup *memcg) | ||
3360 | { | ||
3361 | __mem_cgroup_threshold(memcg, false); | ||
3362 | if (do_swap_account) | ||
3363 | __mem_cgroup_threshold(memcg, true); | ||
3364 | } | ||
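
__mem_cgroup_threshold() keeps the registered thresholds sorted and caches, in current_threshold, roughly the index of the last threshold that was not above the usage seen at the previous check. On each call it walks downward over thresholds the usage has fallen under and upward over thresholds the usage has risen past, signalling every one it crosses. A runnable sketch of just that scan, with eventfd_signal() replaced by a printf():

    #include <stdio.h>

    /* sorted thresholds plus the index of the largest one not above usage */
    struct thresholds {
        int current;            /* like t->current_threshold */
        int size;
        unsigned long entries[8];
    };

    /* signal every threshold crossed (in either direction) since the last call */
    static void check(struct thresholds *t, unsigned long usage)
    {
        int i = t->current;

        /* usage dropped: walk down over thresholds now above usage */
        for (; i >= 0 && t->entries[i] > usage; i--)
            printf("signal (fell below %lu)\n", t->entries[i]);

        /* usage grew: walk up over thresholds now at or below usage */
        for (i++; i < t->size && t->entries[i] <= usage; i++)
            printf("signal (rose past %lu)\n", t->entries[i]);

        t->current = i - 1;     /* largest threshold <= usage, or -1 */
    }

    int main(void)
    {
        struct thresholds t = { .current = -1, .size = 3,
                                .entries = { 100, 200, 300 } };
        check(&t, 250);   /* signals 100 and 200 */
        check(&t, 250);   /* nothing new crossed */
        check(&t, 50);    /* signals 200 and 100 on the way down */
        return 0;
    }
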
3365 | |||
3366 | static int compare_thresholds(const void *a, const void *b) | ||
3367 | { | ||
3368 | const struct mem_cgroup_threshold *_a = a; | ||
3369 | const struct mem_cgroup_threshold *_b = b; | ||
3370 | |||
3371 | return _a->threshold - _b->threshold; | ||
3372 | } | ||
3373 | |||
3374 | static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, | ||
3375 | struct eventfd_ctx *eventfd, const char *args) | ||
3376 | { | ||
3377 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
3378 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | ||
3379 | int type = MEMFILE_TYPE(cft->private); | ||
3380 | u64 threshold, usage; | ||
3381 | int size; | ||
3382 | int i, ret; | ||
3383 | |||
3384 | ret = res_counter_memparse_write_strategy(args, &threshold); | ||
3385 | if (ret) | ||
3386 | return ret; | ||
3387 | |||
3388 | mutex_lock(&memcg->thresholds_lock); | ||
3389 | if (type == _MEM) | ||
3390 | thresholds = memcg->thresholds; | ||
3391 | else if (type == _MEMSWAP) | ||
3392 | thresholds = memcg->memsw_thresholds; | ||
3393 | else | ||
3394 | BUG(); | ||
3395 | |||
3396 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | ||
3397 | |||
3398 | /* Check if a threshold crossed before adding a new one */ | ||
3399 | if (thresholds) | ||
3400 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | ||
3401 | |||
3402 | if (thresholds) | ||
3403 | size = thresholds->size + 1; | ||
3404 | else | ||
3405 | size = 1; | ||
3406 | |||
3407 | /* Allocate memory for new array of thresholds */ | ||
3408 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | ||
3409 | size * sizeof(struct mem_cgroup_threshold), | ||
3410 | GFP_KERNEL); | ||
3411 | if (!thresholds_new) { | ||
3412 | ret = -ENOMEM; | ||
3413 | goto unlock; | ||
3414 | } | ||
3415 | thresholds_new->size = size; | ||
3416 | |||
3417 | /* Copy thresholds (if any) to new array */ | ||
3418 | if (thresholds) | ||
3419 | memcpy(thresholds_new->entries, thresholds->entries, | ||
3420 | thresholds->size * | ||
3421 | sizeof(struct mem_cgroup_threshold)); | ||
3422 | /* Add new threshold */ | ||
3423 | thresholds_new->entries[size - 1].eventfd = eventfd; | ||
3424 | thresholds_new->entries[size - 1].threshold = threshold; | ||
3425 | |||
3426 | /* Sort thresholds. Registering of new threshold isn't time-critical */ | ||
3427 | sort(thresholds_new->entries, size, | ||
3428 | sizeof(struct mem_cgroup_threshold), | ||
3429 | compare_thresholds, NULL); | ||
3430 | |||
3431 | /* Find current threshold */ | ||
3432 | atomic_set(&thresholds_new->current_threshold, -1); | ||
3433 | for (i = 0; i < size; i++) { | ||
3434 | if (thresholds_new->entries[i].threshold < usage) { | ||
3435 | /* | ||
3436 | * thresholds_new->current_threshold will not be used | ||
3437 | * until rcu_assign_pointer(), so it's safe to increment | ||
3438 | * it here. | ||
3439 | */ | ||
3440 | atomic_inc(&thresholds_new->current_threshold); | ||
3441 | } | ||
3442 | } | ||
3443 | |||
3444 | if (type == _MEM) | ||
3445 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | ||
3446 | else | ||
3447 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | ||
3448 | |||
3449 | /* To be sure that nobody uses thresholds before freeing it */ | ||
3450 | synchronize_rcu(); | ||
3451 | |||
3452 | kfree(thresholds); | ||
3453 | unlock: | ||
3454 | mutex_unlock(&memcg->thresholds_lock); | ||
3455 | |||
3456 | return ret; | ||
3457 | } | ||
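
mem_cgroup_register_event() is the memcg side of the eventfd-based cgroup notification API added in the same series: user space creates an eventfd, opens memory.usage_in_bytes, writes "<event_fd> <usage_fd> <threshold>" into cgroup.event_control, and then blocks on the eventfd. The sketch below walks through that sequence; the mount point and group name are assumptions, and it presumes the companion cgroup.event_control support is present.

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/eventfd.h>
    #include <unistd.h>

    int main(void)
    {
        /* hypothetical v1 memcg mount point */
        const char *dir = "/sys/fs/cgroup/memory/mygroup";
        char ctrl[256], usage[256], cmd[64];
        int efd, ufd, cfd;
        uint64_t ticks;

        snprintf(usage, sizeof(usage), "%s/memory.usage_in_bytes", dir);
        snprintf(ctrl, sizeof(ctrl), "%s/cgroup.event_control", dir);

        efd = eventfd(0, 0);            /* notification channel */
        ufd = open(usage, O_RDONLY);    /* file being watched */
        cfd = open(ctrl, O_WRONLY);
        if (efd < 0 || ufd < 0 || cfd < 0) {
            perror("setup");
            return 1;
        }

        /* "<event_fd> <usage_fd> <threshold in bytes>" */
        snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, ufd, 64ULL << 20);
        if (write(cfd, cmd, strlen(cmd)) < 0) {
            perror("register");
            return 1;
        }

        /* blocks until usage crosses the 64 MiB threshold in either direction */
        if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
            printf("threshold crossed %llu time(s)\n",
                   (unsigned long long)ticks);
        return 0;
    }
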
3458 | |||
3459 | static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, | ||
3460 | struct eventfd_ctx *eventfd) | ||
3461 | { | ||
3462 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
3463 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | ||
3464 | int type = MEMFILE_TYPE(cft->private); | ||
3465 | u64 usage; | ||
3466 | int size = 0; | ||
3467 | int i, j, ret; | ||
3468 | |||
3469 | mutex_lock(&memcg->thresholds_lock); | ||
3470 | if (type == _MEM) | ||
3471 | thresholds = memcg->thresholds; | ||
3472 | else if (type == _MEMSWAP) | ||
3473 | thresholds = memcg->memsw_thresholds; | ||
3474 | else | ||
3475 | BUG(); | ||
3476 | |||
3477 | /* | ||
3478 | * Something went wrong if we're trying to unregister a threshold | ||
3479 | * when we don't have any thresholds. | ||
3480 | */ | ||
3481 | BUG_ON(!thresholds); | ||
3482 | |||
3483 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | ||
3484 | |||
3485 | /* Check if a threshold crossed before removing */ | ||
3486 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | ||
3487 | |||
3488 | /* Calculate new number of threshold */ | ||
3489 | for (i = 0; i < thresholds->size; i++) { | ||
3490 | if (thresholds->entries[i].eventfd != eventfd) | ||
3491 | size++; | ||
3492 | } | ||
3493 | |||
3494 | /* Set thresholds array to NULL if we don't have thresholds */ | ||
3495 | if (!size) { | ||
3496 | thresholds_new = NULL; | ||
3497 | goto assign; | ||
3498 | } | ||
3499 | |||
3500 | /* Allocate memory for new array of thresholds */ | ||
3501 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | ||
3502 | size * sizeof(struct mem_cgroup_threshold), | ||
3503 | GFP_KERNEL); | ||
3504 | if (!thresholds_new) { | ||
3505 | ret = -ENOMEM; | ||
3506 | goto unlock; | ||
3507 | } | ||
3508 | thresholds_new->size = size; | ||
3509 | |||
3510 | /* Copy thresholds and find current threshold */ | ||
3511 | atomic_set(&thresholds_new->current_threshold, -1); | ||
3512 | for (i = 0, j = 0; i < thresholds->size; i++) { | ||
3513 | if (thresholds->entries[i].eventfd == eventfd) | ||
3514 | continue; | ||
3515 | |||
3516 | thresholds_new->entries[j] = thresholds->entries[i]; | ||
3517 | if (thresholds_new->entries[j].threshold < usage) { | ||
3518 | /* | ||
3519 | * thresholds_new->current_threshold will not be used | ||
3520 | * until rcu_assign_pointer(), so it's safe to increment | ||
3521 | * it here. | ||
3522 | */ | ||
3523 | atomic_inc(&thresholds_new->current_threshold); | ||
3524 | } | ||
3525 | j++; | ||
3526 | } | ||
3527 | |||
3528 | assign: | ||
3529 | if (type == _MEM) | ||
3530 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | ||
3531 | else | ||
3532 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | ||
3533 | |||
3534 | /* To be sure that nobody uses thresholds before freeing it */ | ||
3535 | synchronize_rcu(); | ||
3536 | |||
3537 | kfree(thresholds); | ||
3538 | unlock: | ||
3539 | mutex_unlock(&memcg->thresholds_lock); | ||
3540 | |||
3541 | return ret; | ||
3542 | } | ||
3052 | 3543 | ||
3053 | static struct cftype mem_cgroup_files[] = { | 3544 | static struct cftype mem_cgroup_files[] = { |
3054 | { | 3545 | { |
3055 | .name = "usage_in_bytes", | 3546 | .name = "usage_in_bytes", |
3056 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 3547 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
3057 | .read_u64 = mem_cgroup_read, | 3548 | .read_u64 = mem_cgroup_read, |
3549 | .register_event = mem_cgroup_register_event, | ||
3550 | .unregister_event = mem_cgroup_unregister_event, | ||
3058 | }, | 3551 | }, |
3059 | { | 3552 | { |
3060 | .name = "max_usage_in_bytes", | 3553 | .name = "max_usage_in_bytes", |
@@ -3098,6 +3591,11 @@ static struct cftype mem_cgroup_files[] = { | |||
3098 | .read_u64 = mem_cgroup_swappiness_read, | 3591 | .read_u64 = mem_cgroup_swappiness_read, |
3099 | .write_u64 = mem_cgroup_swappiness_write, | 3592 | .write_u64 = mem_cgroup_swappiness_write, |
3100 | }, | 3593 | }, |
3594 | { | ||
3595 | .name = "move_charge_at_immigrate", | ||
3596 | .read_u64 = mem_cgroup_move_charge_read, | ||
3597 | .write_u64 = mem_cgroup_move_charge_write, | ||
3598 | }, | ||
3101 | }; | 3599 | }; |
3102 | 3600 | ||
3103 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3601 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
@@ -3106,6 +3604,8 @@ static struct cftype memsw_cgroup_files[] = { | |||
3106 | .name = "memsw.usage_in_bytes", | 3604 | .name = "memsw.usage_in_bytes", |
3107 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 3605 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
3108 | .read_u64 = mem_cgroup_read, | 3606 | .read_u64 = mem_cgroup_read, |
3607 | .register_event = mem_cgroup_register_event, | ||
3608 | .unregister_event = mem_cgroup_unregister_event, | ||
3109 | }, | 3609 | }, |
3110 | { | 3610 | { |
3111 | .name = "memsw.max_usage_in_bytes", | 3611 | .name = "memsw.max_usage_in_bytes", |
@@ -3180,17 +3680,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
3180 | kfree(mem->info.nodeinfo[node]); | 3680 | kfree(mem->info.nodeinfo[node]); |
3181 | } | 3681 | } |
3182 | 3682 | ||
3183 | static int mem_cgroup_size(void) | ||
3184 | { | ||
3185 | int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); | ||
3186 | return sizeof(struct mem_cgroup) + cpustat_size; | ||
3187 | } | ||
3188 | |||
3189 | static struct mem_cgroup *mem_cgroup_alloc(void) | 3683 | static struct mem_cgroup *mem_cgroup_alloc(void) |
3190 | { | 3684 | { |
3191 | struct mem_cgroup *mem; | 3685 | struct mem_cgroup *mem; |
3192 | int size = mem_cgroup_size(); | 3686 | int size = sizeof(struct mem_cgroup); |
3193 | 3687 | ||
3688 | /* Can be very big if MAX_NUMNODES is very big */ | ||
3194 | if (size < PAGE_SIZE) | 3689 | if (size < PAGE_SIZE) |
3195 | mem = kmalloc(size, GFP_KERNEL); | 3690 | mem = kmalloc(size, GFP_KERNEL); |
3196 | else | 3691 | else |
@@ -3198,6 +3693,14 @@ static struct mem_cgroup *mem_cgroup_alloc(void) | |||
3198 | 3693 | ||
3199 | if (mem) | 3694 | if (mem) |
3200 | memset(mem, 0, size); | 3695 | memset(mem, 0, size); |
3696 | mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); | ||
3697 | if (!mem->stat) { | ||
3698 | if (size < PAGE_SIZE) | ||
3699 | kfree(mem); | ||
3700 | else | ||
3701 | vfree(mem); | ||
3702 | mem = NULL; | ||
3703 | } | ||
3201 | return mem; | 3704 | return mem; |
3202 | } | 3705 | } |
3203 | 3706 | ||
@@ -3222,7 +3725,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) | |||
3222 | for_each_node_state(node, N_POSSIBLE) | 3725 | for_each_node_state(node, N_POSSIBLE) |
3223 | free_mem_cgroup_per_zone_info(mem, node); | 3726 | free_mem_cgroup_per_zone_info(mem, node); |
3224 | 3727 | ||
3225 | if (mem_cgroup_size() < PAGE_SIZE) | 3728 | free_percpu(mem->stat); |
3729 | if (sizeof(struct mem_cgroup) < PAGE_SIZE) | ||
3226 | kfree(mem); | 3730 | kfree(mem); |
3227 | else | 3731 | else |
3228 | vfree(mem); | 3732 | vfree(mem); |
@@ -3233,9 +3737,9 @@ static void mem_cgroup_get(struct mem_cgroup *mem) | |||
3233 | atomic_inc(&mem->refcnt); | 3737 | atomic_inc(&mem->refcnt); |
3234 | } | 3738 | } |
3235 | 3739 | ||
3236 | static void mem_cgroup_put(struct mem_cgroup *mem) | 3740 | static void __mem_cgroup_put(struct mem_cgroup *mem, int count) |
3237 | { | 3741 | { |
3238 | if (atomic_dec_and_test(&mem->refcnt)) { | 3742 | if (atomic_sub_and_test(count, &mem->refcnt)) { |
3239 | struct mem_cgroup *parent = parent_mem_cgroup(mem); | 3743 | struct mem_cgroup *parent = parent_mem_cgroup(mem); |
3240 | __mem_cgroup_free(mem); | 3744 | __mem_cgroup_free(mem); |
3241 | if (parent) | 3745 | if (parent) |
@@ -3243,6 +3747,11 @@ static void mem_cgroup_put(struct mem_cgroup *mem) | |||
3243 | } | 3747 | } |
3244 | } | 3748 | } |
3245 | 3749 | ||
3750 | static void mem_cgroup_put(struct mem_cgroup *mem) | ||
3751 | { | ||
3752 | __mem_cgroup_put(mem, 1); | ||
3753 | } | ||
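
__mem_cgroup_put() generalizes "drop one reference" into "drop @count references at once" by replacing atomic_dec_and_test() with atomic_sub_and_test(), which lets the move-charge teardown return all the references taken for moved swap entries in a single call. A minimal C11 analogue of that batched put, with illustrative names only:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int refcnt;

    /* drop @count references; "free" when the last one goes away */
    static void put_many(int count)
    {
        /* like atomic_sub_and_test(): true when the counter reaches zero */
        if (atomic_fetch_sub(&refcnt, count) == count)
            printf("last reference dropped, freeing\n");
    }

    static void put_one(void) { put_many(1); }

    int main(void)
    {
        atomic_store(&refcnt, 5);   /* e.g. 1 initial ref + 4 swap records */
        put_many(4);                /* one call instead of four put_one()s */
        put_one();                  /* frees here */
        return 0;
    }
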
3754 | |||
3246 | /* | 3755 | /* |
3247 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. | 3756 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. |
3248 | */ | 3757 | */ |
@@ -3319,7 +3828,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3319 | INIT_WORK(&stock->work, drain_local_stock); | 3828 | INIT_WORK(&stock->work, drain_local_stock); |
3320 | } | 3829 | } |
3321 | hotcpu_notifier(memcg_stock_cpu_callback, 0); | 3830 | hotcpu_notifier(memcg_stock_cpu_callback, 0); |
3322 | |||
3323 | } else { | 3831 | } else { |
3324 | parent = mem_cgroup_from_cont(cont->parent); | 3832 | parent = mem_cgroup_from_cont(cont->parent); |
3325 | mem->use_hierarchy = parent->use_hierarchy; | 3833 | mem->use_hierarchy = parent->use_hierarchy; |
@@ -3345,6 +3853,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3345 | if (parent) | 3853 | if (parent) |
3346 | mem->swappiness = get_swappiness(parent); | 3854 | mem->swappiness = get_swappiness(parent); |
3347 | atomic_set(&mem->refcnt, 1); | 3855 | atomic_set(&mem->refcnt, 1); |
3856 | mem->move_charge_at_immigrate = 0; | ||
3857 | mutex_init(&mem->thresholds_lock); | ||
3348 | return &mem->css; | 3858 | return &mem->css; |
3349 | free_out: | 3859 | free_out: |
3350 | __mem_cgroup_free(mem); | 3860 | __mem_cgroup_free(mem); |
@@ -3381,16 +3891,444 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, | |||
3381 | return ret; | 3891 | return ret; |
3382 | } | 3892 | } |
3383 | 3893 | ||
3894 | #ifdef CONFIG_MMU | ||
3895 | /* Handlers for move charge at task migration. */ | ||
3896 | #define PRECHARGE_COUNT_AT_ONCE 256 | ||
3897 | static int mem_cgroup_do_precharge(unsigned long count) | ||
3898 | { | ||
3899 | int ret = 0; | ||
3900 | int batch_count = PRECHARGE_COUNT_AT_ONCE; | ||
3901 | struct mem_cgroup *mem = mc.to; | ||
3902 | |||
3903 | if (mem_cgroup_is_root(mem)) { | ||
3904 | mc.precharge += count; | ||
3905 | /* we don't need css_get for root */ | ||
3906 | return ret; | ||
3907 | } | ||
3908 | /* try to charge at once */ | ||
3909 | if (count > 1) { | ||
3910 | struct res_counter *dummy; | ||
3911 | /* | ||
3912 | * "mem" cannot be under rmdir() because we've already checked | ||
3913 | * by cgroup_lock_live_cgroup() that it is not removed and we | ||
3914 | * are still under the same cgroup_mutex. So we can postpone | ||
3915 | * css_get(). | ||
3916 | */ | ||
3917 | if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) | ||
3918 | goto one_by_one; | ||
3919 | if (do_swap_account && res_counter_charge(&mem->memsw, | ||
3920 | PAGE_SIZE * count, &dummy)) { | ||
3921 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); | ||
3922 | goto one_by_one; | ||
3923 | } | ||
3924 | mc.precharge += count; | ||
3925 | VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); | ||
3926 | WARN_ON_ONCE(count > INT_MAX); | ||
3927 | __css_get(&mem->css, (int)count); | ||
3928 | return ret; | ||
3929 | } | ||
3930 | one_by_one: | ||
3931 | /* fall back to one by one charge */ | ||
3932 | while (count--) { | ||
3933 | if (signal_pending(current)) { | ||
3934 | ret = -EINTR; | ||
3935 | break; | ||
3936 | } | ||
3937 | if (!batch_count--) { | ||
3938 | batch_count = PRECHARGE_COUNT_AT_ONCE; | ||
3939 | cond_resched(); | ||
3940 | } | ||
3941 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); | ||
3942 | if (ret || !mem) | ||
3943 | /* mem_cgroup_clear_mc() will do uncharge later */ | ||
3944 | return -ENOMEM; | ||
3945 | mc.precharge++; | ||
3946 | } | ||
3947 | return ret; | ||
3948 | } | ||
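
mem_cgroup_do_precharge() first tries to reserve every page it will need with a single res_counter_charge() call and only falls back to page-by-page charging, with periodic rescheduling and signal checks, when the bulk reservation fails. The toy reservation function below follows the same shape; the counter type and the batch constant are stand-ins, not kernel interfaces.

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE    4096
    #define BATCH   256     /* like PRECHARGE_COUNT_AT_ONCE */

    struct counter { long usage, limit; };

    static bool charge(struct counter *c, long bytes)
    {
        if (c->usage + bytes > c->limit)
            return false;
        c->usage += bytes;
        return true;
    }

    /* reserve @count pages: try all at once, else fall back to one by one */
    static long precharge(struct counter *c, unsigned long count)
    {
        long reserved = 0;
        int batch = BATCH;

        if (count > 1 && charge(c, (long)count * PAGE))
            return count;           /* bulk reservation succeeded */

        while (count--) {           /* slow path, one page at a time */
            if (!charge(c, PAGE))
                break;              /* caller unwinds what was reserved */
            if (!--batch)
                batch = BATCH;      /* in the kernel: cond_resched() here */
            reserved++;
        }
        return reserved;
    }

    int main(void)
    {
        struct counter c = { .usage = 0, .limit = 10 * PAGE };
        printf("%ld\n", precharge(&c, 8));   /* 8: fits in one go */
        printf("%ld\n", precharge(&c, 8));   /* 2: only two pages left */
        return 0;
    }
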
3949 | #else /* !CONFIG_MMU */ | ||
3950 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | ||
3951 | struct cgroup *cgroup, | ||
3952 | struct task_struct *p, | ||
3953 | bool threadgroup) | ||
3954 | { | ||
3955 | return 0; | ||
3956 | } | ||
3957 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | ||
3958 | struct cgroup *cgroup, | ||
3959 | struct task_struct *p, | ||
3960 | bool threadgroup) | ||
3961 | { | ||
3962 | } | ||
3384 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 3963 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
3385 | struct cgroup *cont, | 3964 | struct cgroup *cont, |
3386 | struct cgroup *old_cont, | 3965 | struct cgroup *old_cont, |
3387 | struct task_struct *p, | 3966 | struct task_struct *p, |
3388 | bool threadgroup) | 3967 | bool threadgroup) |
3389 | { | 3968 | { |
3969 | } | ||
3970 | #endif | ||
3971 | |||
3972 | /** | ||
3973 | * is_target_pte_for_mc - check a pte whether it is valid for move charge | ||
3974 | * @vma: the vma the pte to be checked belongs | ||
3975 | * @addr: the address corresponding to the pte to be checked | ||
3976 | * @ptent: the pte to be checked | ||
3977 | * @target: the pointer where the target page or swap entry will be stored (can be NULL) | ||
3978 | * | ||
3979 | * Returns | ||
3980 | * 0(MC_TARGET_NONE): if the pte is not a target for move charge. | ||
3981 | * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for | ||
3982 | * move charge. If @target is not NULL, the page is stored in target->page | ||
3983 | * with an extra refcount taken (callers should handle it). | ||
3984 | * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a | ||
3985 | * target for charge migration. If @target is not NULL, the entry is stored | ||
3986 | * in target->ent. | ||
3987 | * | ||
3988 | * Called with pte lock held. | ||
3989 | */ | ||
3990 | union mc_target { | ||
3991 | struct page *page; | ||
3992 | swp_entry_t ent; | ||
3993 | }; | ||
3994 | |||
3995 | enum mc_target_type { | ||
3996 | MC_TARGET_NONE, /* not used */ | ||
3997 | MC_TARGET_PAGE, | ||
3998 | MC_TARGET_SWAP, | ||
3999 | }; | ||
4000 | |||
4001 | static int is_target_pte_for_mc(struct vm_area_struct *vma, | ||
4002 | unsigned long addr, pte_t ptent, union mc_target *target) | ||
4003 | { | ||
4004 | struct page *page = NULL; | ||
4005 | struct page_cgroup *pc; | ||
4006 | int ret = 0; | ||
4007 | swp_entry_t ent = { .val = 0 }; | ||
4008 | int usage_count = 0; | ||
4009 | bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, | ||
4010 | &mc.to->move_charge_at_immigrate); | ||
4011 | |||
4012 | if (!pte_present(ptent)) { | ||
4013 | /* TODO: handle swap of shmem/tmpfs */ | ||
4014 | if (pte_none(ptent) || pte_file(ptent)) | ||
4015 | return 0; | ||
4016 | else if (is_swap_pte(ptent)) { | ||
4017 | ent = pte_to_swp_entry(ptent); | ||
4018 | if (!move_anon || non_swap_entry(ent)) | ||
4019 | return 0; | ||
4020 | usage_count = mem_cgroup_count_swap_user(ent, &page); | ||
4021 | } | ||
4022 | } else { | ||
4023 | page = vm_normal_page(vma, addr, ptent); | ||
4024 | if (!page || !page_mapped(page)) | ||
4025 | return 0; | ||
4026 | /* | ||
4027 | * TODO: We don't move charges of file(including shmem/tmpfs) | ||
4028 | * pages for now. | ||
4029 | */ | ||
4030 | if (!move_anon || !PageAnon(page)) | ||
4031 | return 0; | ||
4032 | if (!get_page_unless_zero(page)) | ||
4033 | return 0; | ||
4034 | usage_count = page_mapcount(page); | ||
4035 | } | ||
4036 | if (usage_count > 1) { | ||
4037 | /* | ||
4038 | * TODO: We don't move charges of shared (used by multiple | ||
4039 | * processes) pages for now. | ||
4040 | */ | ||
4041 | if (page) | ||
4042 | put_page(page); | ||
4043 | return 0; | ||
4044 | } | ||
4045 | if (page) { | ||
4046 | pc = lookup_page_cgroup(page); | ||
4047 | /* | ||
4048 | * Do only loose check w/o page_cgroup lock. | ||
4049 | * mem_cgroup_move_account() checks the pc is valid or not under | ||
4050 | * the lock. | ||
4051 | */ | ||
4052 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | ||
4053 | ret = MC_TARGET_PAGE; | ||
4054 | if (target) | ||
4055 | target->page = page; | ||
4056 | } | ||
4057 | if (!ret || !target) | ||
4058 | put_page(page); | ||
4059 | } | ||
4060 | /* fall through */ | ||
4061 | if (ent.val && do_swap_account && !ret && | ||
4062 | css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { | ||
4063 | ret = MC_TARGET_SWAP; | ||
4064 | if (target) | ||
4065 | target->ent = ent; | ||
4066 | } | ||
4067 | return ret; | ||
4068 | } | ||
4069 | |||
4070 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | ||
4071 | unsigned long addr, unsigned long end, | ||
4072 | struct mm_walk *walk) | ||
4073 | { | ||
4074 | struct vm_area_struct *vma = walk->private; | ||
4075 | pte_t *pte; | ||
4076 | spinlock_t *ptl; | ||
4077 | |||
4078 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
4079 | for (; addr != end; pte++, addr += PAGE_SIZE) | ||
4080 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) | ||
4081 | mc.precharge++; /* increment precharge temporarily */ | ||
4082 | pte_unmap_unlock(pte - 1, ptl); | ||
4083 | cond_resched(); | ||
4084 | |||
4085 | return 0; | ||
4086 | } | ||
4087 | |||
4088 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | ||
4089 | { | ||
4090 | unsigned long precharge; | ||
4091 | struct vm_area_struct *vma; | ||
4092 | |||
4093 | down_read(&mm->mmap_sem); | ||
4094 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
4095 | struct mm_walk mem_cgroup_count_precharge_walk = { | ||
4096 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | ||
4097 | .mm = mm, | ||
4098 | .private = vma, | ||
4099 | }; | ||
4100 | if (is_vm_hugetlb_page(vma)) | ||
4101 | continue; | ||
4102 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
4103 | if (vma->vm_flags & VM_SHARED) | ||
4104 | continue; | ||
4105 | walk_page_range(vma->vm_start, vma->vm_end, | ||
4106 | &mem_cgroup_count_precharge_walk); | ||
4107 | } | ||
4108 | up_read(&mm->mmap_sem); | ||
4109 | |||
4110 | precharge = mc.precharge; | ||
4111 | mc.precharge = 0; | ||
4112 | |||
4113 | return precharge; | ||
4114 | } | ||
4115 | |||
4116 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) | ||
4117 | { | ||
4118 | return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); | ||
4119 | } | ||
4120 | |||
4121 | static void mem_cgroup_clear_mc(void) | ||
4122 | { | ||
4123 | /* we must uncharge all the leftover precharges from mc.to */ | ||
4124 | if (mc.precharge) { | ||
4125 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); | ||
4126 | mc.precharge = 0; | ||
4127 | } | ||
3390 | /* | 4128 | /* |
3391 | * FIXME: It's better to move charges of this process from old | 4129 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so |
3392 | * memcg to new memcg. But it's just on TODO-List now. | 4130 | * we must uncharge here. |
3393 | */ | 4131 | */ |
4132 | if (mc.moved_charge) { | ||
4133 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); | ||
4134 | mc.moved_charge = 0; | ||
4135 | } | ||
4136 | /* we must fixup refcnts and charges */ | ||
4137 | if (mc.moved_swap) { | ||
4138 | WARN_ON_ONCE(mc.moved_swap > INT_MAX); | ||
4139 | /* uncharge swap account from the old cgroup */ | ||
4140 | if (!mem_cgroup_is_root(mc.from)) | ||
4141 | res_counter_uncharge(&mc.from->memsw, | ||
4142 | PAGE_SIZE * mc.moved_swap); | ||
4143 | __mem_cgroup_put(mc.from, mc.moved_swap); | ||
4144 | |||
4145 | if (!mem_cgroup_is_root(mc.to)) { | ||
4146 | /* | ||
4147 | * we charged both to->res and to->memsw, so we should | ||
4148 | * uncharge to->res. | ||
4149 | */ | ||
4150 | res_counter_uncharge(&mc.to->res, | ||
4151 | PAGE_SIZE * mc.moved_swap); | ||
4152 | VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags)); | ||
4153 | __css_put(&mc.to->css, mc.moved_swap); | ||
4154 | } | ||
4155 | /* we've already done mem_cgroup_get(mc.to) */ | ||
4156 | |||
4157 | mc.moved_swap = 0; | ||
4158 | } | ||
4159 | mc.from = NULL; | ||
4160 | mc.to = NULL; | ||
4161 | mc.moving_task = NULL; | ||
4162 | wake_up_all(&mc.waitq); | ||
4163 | } | ||
4164 | |||
4165 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | ||
4166 | struct cgroup *cgroup, | ||
4167 | struct task_struct *p, | ||
4168 | bool threadgroup) | ||
4169 | { | ||
4170 | int ret = 0; | ||
4171 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); | ||
4172 | |||
4173 | if (mem->move_charge_at_immigrate) { | ||
4174 | struct mm_struct *mm; | ||
4175 | struct mem_cgroup *from = mem_cgroup_from_task(p); | ||
4176 | |||
4177 | VM_BUG_ON(from == mem); | ||
4178 | |||
4179 | mm = get_task_mm(p); | ||
4180 | if (!mm) | ||
4181 | return 0; | ||
4182 | /* We move charges only when we move the owner of the mm */ | ||
4183 | if (mm->owner == p) { | ||
4184 | VM_BUG_ON(mc.from); | ||
4185 | VM_BUG_ON(mc.to); | ||
4186 | VM_BUG_ON(mc.precharge); | ||
4187 | VM_BUG_ON(mc.moved_charge); | ||
4188 | VM_BUG_ON(mc.moved_swap); | ||
4189 | VM_BUG_ON(mc.moving_task); | ||
4190 | mc.from = from; | ||
4191 | mc.to = mem; | ||
4192 | mc.precharge = 0; | ||
4193 | mc.moved_charge = 0; | ||
4194 | mc.moved_swap = 0; | ||
4195 | mc.moving_task = current; | ||
4196 | |||
4197 | ret = mem_cgroup_precharge_mc(mm); | ||
4198 | if (ret) | ||
4199 | mem_cgroup_clear_mc(); | ||
4200 | } | ||
4201 | mmput(mm); | ||
4202 | } | ||
4203 | return ret; | ||
4204 | } | ||
4205 | |||
4206 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | ||
4207 | struct cgroup *cgroup, | ||
4208 | struct task_struct *p, | ||
4209 | bool threadgroup) | ||
4210 | { | ||
4211 | mem_cgroup_clear_mc(); | ||
4212 | } | ||
4213 | |||
4214 | static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | ||
4215 | unsigned long addr, unsigned long end, | ||
4216 | struct mm_walk *walk) | ||
4217 | { | ||
4218 | int ret = 0; | ||
4219 | struct vm_area_struct *vma = walk->private; | ||
4220 | pte_t *pte; | ||
4221 | spinlock_t *ptl; | ||
4222 | |||
4223 | retry: | ||
4224 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
4225 | for (; addr != end; addr += PAGE_SIZE) { | ||
4226 | pte_t ptent = *(pte++); | ||
4227 | union mc_target target; | ||
4228 | int type; | ||
4229 | struct page *page; | ||
4230 | struct page_cgroup *pc; | ||
4231 | swp_entry_t ent; | ||
4232 | |||
4233 | if (!mc.precharge) | ||
4234 | break; | ||
4235 | |||
4236 | type = is_target_pte_for_mc(vma, addr, ptent, &target); | ||
4237 | switch (type) { | ||
4238 | case MC_TARGET_PAGE: | ||
4239 | page = target.page; | ||
4240 | if (isolate_lru_page(page)) | ||
4241 | goto put; | ||
4242 | pc = lookup_page_cgroup(page); | ||
4243 | if (!mem_cgroup_move_account(pc, | ||
4244 | mc.from, mc.to, false)) { | ||
4245 | mc.precharge--; | ||
4246 | /* we uncharge from mc.from later. */ | ||
4247 | mc.moved_charge++; | ||
4248 | } | ||
4249 | putback_lru_page(page); | ||
4250 | put: /* is_target_pte_for_mc() gets the page */ | ||
4251 | put_page(page); | ||
4252 | break; | ||
4253 | case MC_TARGET_SWAP: | ||
4254 | ent = target.ent; | ||
4255 | if (!mem_cgroup_move_swap_account(ent, | ||
4256 | mc.from, mc.to, false)) { | ||
4257 | mc.precharge--; | ||
4258 | /* we fixup refcnts and charges later. */ | ||
4259 | mc.moved_swap++; | ||
4260 | } | ||
4261 | break; | ||
4262 | default: | ||
4263 | break; | ||
4264 | } | ||
4265 | } | ||
4266 | pte_unmap_unlock(pte - 1, ptl); | ||
4267 | cond_resched(); | ||
4268 | |||
4269 | if (addr != end) { | ||
4270 | /* | ||
4271 | * We have consumed all precharges we got in can_attach(). | ||
4272 | * We try charge one by one, but don't do any additional | ||
4273 | * charges to mc.to if we have failed in charge once in attach() | ||
4274 | * phase. | ||
4275 | */ | ||
4276 | ret = mem_cgroup_do_precharge(1); | ||
4277 | if (!ret) | ||
4278 | goto retry; | ||
4279 | } | ||
4280 | |||
4281 | return ret; | ||
4282 | } | ||
4283 | |||
4284 | static void mem_cgroup_move_charge(struct mm_struct *mm) | ||
4285 | { | ||
4286 | struct vm_area_struct *vma; | ||
4287 | |||
4288 | lru_add_drain_all(); | ||
4289 | down_read(&mm->mmap_sem); | ||
4290 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
4291 | int ret; | ||
4292 | struct mm_walk mem_cgroup_move_charge_walk = { | ||
4293 | .pmd_entry = mem_cgroup_move_charge_pte_range, | ||
4294 | .mm = mm, | ||
4295 | .private = vma, | ||
4296 | }; | ||
4297 | if (is_vm_hugetlb_page(vma)) | ||
4298 | continue; | ||
4299 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
4300 | if (vma->vm_flags & VM_SHARED) | ||
4301 | continue; | ||
4302 | ret = walk_page_range(vma->vm_start, vma->vm_end, | ||
4303 | &mem_cgroup_move_charge_walk); | ||
4304 | if (ret) | ||
4305 | /* | ||
4306 | * This means we have consumed all precharges and failed to | ||
4307 | * charge any more. Just abandon here. | ||
4308 | */ | ||
4309 | break; | ||
4310 | } | ||
4311 | up_read(&mm->mmap_sem); | ||
4312 | } | ||
4313 | |||
4314 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | ||
4315 | struct cgroup *cont, | ||
4316 | struct cgroup *old_cont, | ||
4317 | struct task_struct *p, | ||
4318 | bool threadgroup) | ||
4319 | { | ||
4320 | struct mm_struct *mm; | ||
4321 | |||
4322 | if (!mc.to) | ||
4323 | /* no need to move charge */ | ||
4324 | return; | ||
4325 | |||
4326 | mm = get_task_mm(p); | ||
4327 | if (mm) { | ||
4328 | mem_cgroup_move_charge(mm); | ||
4329 | mmput(mm); | ||
4330 | } | ||
4331 | mem_cgroup_clear_mc(); | ||
3394 | } | 4332 | } |
3395 | 4333 | ||
3396 | struct cgroup_subsys mem_cgroup_subsys = { | 4334 | struct cgroup_subsys mem_cgroup_subsys = { |
@@ -3400,6 +4338,8 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
3400 | .pre_destroy = mem_cgroup_pre_destroy, | 4338 | .pre_destroy = mem_cgroup_pre_destroy, |
3401 | .destroy = mem_cgroup_destroy, | 4339 | .destroy = mem_cgroup_destroy, |
3402 | .populate = mem_cgroup_populate, | 4340 | .populate = mem_cgroup_populate, |
4341 | .can_attach = mem_cgroup_can_attach, | ||
4342 | .cancel_attach = mem_cgroup_cancel_attach, | ||
3403 | .attach = mem_cgroup_move_task, | 4343 | .attach = mem_cgroup_move_task, |
3404 | .early_init = 0, | 4344 | .early_init = 0, |
3405 | .use_id = 1, | 4345 | .use_id = 1, |
diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 17299fd4577c..d1f335162976 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c | |||
@@ -383,9 +383,12 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, | |||
383 | if (av == NULL) /* Not actually mapped anymore */ | 383 | if (av == NULL) /* Not actually mapped anymore */ |
384 | goto out; | 384 | goto out; |
385 | for_each_process (tsk) { | 385 | for_each_process (tsk) { |
386 | struct anon_vma_chain *vmac; | ||
387 | |||
386 | if (!task_early_kill(tsk)) | 388 | if (!task_early_kill(tsk)) |
387 | continue; | 389 | continue; |
388 | list_for_each_entry (vma, &av->head, anon_vma_node) { | 390 | list_for_each_entry(vmac, &av->head, same_anon_vma) { |
391 | vma = vmac->vma; | ||
389 | if (!page_mapped_in_vma(page, vma)) | 392 | if (!page_mapped_in_vma(page, vma)) |
390 | continue; | 393 | continue; |
391 | if (vma->vm_mm == tsk->mm) | 394 | if (vma->vm_mm == tsk->mm) |
diff --git a/mm/memory.c b/mm/memory.c index 09e4b1be7b67..5b7f2002e54b 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -121,6 +121,77 @@ static int __init init_zero_pfn(void) | |||
121 | } | 121 | } |
122 | core_initcall(init_zero_pfn); | 122 | core_initcall(init_zero_pfn); |
123 | 123 | ||
124 | |||
125 | #if defined(SPLIT_RSS_COUNTING) | ||
126 | |||
127 | void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) | ||
128 | { | ||
129 | int i; | ||
130 | |||
131 | for (i = 0; i < NR_MM_COUNTERS; i++) { | ||
132 | if (task->rss_stat.count[i]) { | ||
133 | add_mm_counter(mm, i, task->rss_stat.count[i]); | ||
134 | task->rss_stat.count[i] = 0; | ||
135 | } | ||
136 | } | ||
137 | task->rss_stat.events = 0; | ||
138 | } | ||
139 | |||
140 | static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) | ||
141 | { | ||
142 | struct task_struct *task = current; | ||
143 | |||
144 | if (likely(task->mm == mm)) | ||
145 | task->rss_stat.count[member] += val; | ||
146 | else | ||
147 | add_mm_counter(mm, member, val); | ||
148 | } | ||
149 | #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) | ||
150 | #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) | ||
151 | |||
152 | /* sync counter once per 64 page faults */ | ||
153 | #define TASK_RSS_EVENTS_THRESH (64) | ||
154 | static void check_sync_rss_stat(struct task_struct *task) | ||
155 | { | ||
156 | if (unlikely(task != current)) | ||
157 | return; | ||
158 | if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) | ||
159 | __sync_task_rss_stat(task, task->mm); | ||
160 | } | ||
161 | |||
162 | unsigned long get_mm_counter(struct mm_struct *mm, int member) | ||
163 | { | ||
164 | long val = 0; | ||
165 | |||
166 | /* | ||
167 | * Don't use task->mm here, to avoid having to take a reference with get_task_mm(). | ||
168 | * The caller must guarantee that task->mm is still valid. | ||
169 | */ | ||
170 | val = atomic_long_read(&mm->rss_stat.count[member]); | ||
171 | /* | ||
172 | * The counter is updated asynchronously and may temporarily go negative, | ||
173 | * but a negative value is never what callers expect, so clamp it to zero. | ||
174 | */ | ||
175 | if (val < 0) | ||
176 | return 0; | ||
177 | return (unsigned long)val; | ||
178 | } | ||
179 | |||
180 | void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) | ||
181 | { | ||
182 | __sync_task_rss_stat(task, mm); | ||
183 | } | ||
184 | #else | ||
185 | |||
186 | #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) | ||
187 | #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) | ||
188 | |||
189 | static void check_sync_rss_stat(struct task_struct *task) | ||
190 | { | ||
191 | } | ||
192 | |||
193 | #endif | ||
194 | |||
124 | /* | 195 | /* |
125 | * If a p?d_bad entry is found while walking page tables, report | 196 | * If a p?d_bad entry is found while walking page tables, report |
126 | * the error, before resetting entry to p?d_none. Usually (but | 197 | * the error, before resetting entry to p?d_none. Usually (but |
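
The SPLIT_RSS_COUNTING code above caches RSS deltas in the faulting task and only folds them into the shared per-mm counters once more than 64 events have accumulated, trading a slightly stale (and possibly briefly negative) reading for far fewer atomic updates on the fault path. A rough userspace analogue of that scheme, assuming C11 atomics and thread-local storage (the names here are made up, not the kernel's):

    #include <stdatomic.h>
    #include <stdio.h>

    enum { MM_FILEPAGES, MM_ANONPAGES, MM_SWAPENTS, NR_MM_COUNTERS };

    #define RSS_EVENTS_THRESH 64    /* sync roughly once per 64 "faults" */

    /* Shared counters, updated with atomic adds (stands in for mm->rss_stat). */
    static atomic_long shared_rss[NR_MM_COUNTERS];

    /* Per-thread cache of not-yet-published deltas (stands in for task->rss_stat). */
    static _Thread_local long cached_rss[NR_MM_COUNTERS];
    static _Thread_local int rss_events;

    static void sync_rss(void)
    {
        for (int i = 0; i < NR_MM_COUNTERS; i++) {
            if (cached_rss[i]) {
                atomic_fetch_add(&shared_rss[i], cached_rss[i]);
                cached_rss[i] = 0;
            }
        }
        rss_events = 0;
    }

    /* Fast path: no atomics, just bump the local cache. */
    static void add_rss_fast(int member, long val)
    {
        cached_rss[member] += val;
        if (++rss_events > RSS_EVENTS_THRESH)
            sync_rss();
    }

    /* Readers clamp at zero, since unpublished deltas can make the sum lag. */
    static unsigned long get_rss(int member)
    {
        long val = atomic_load(&shared_rss[member]);
        return val < 0 ? 0 : (unsigned long)val;
    }

    int main(void)
    {
        for (int i = 0; i < 1000; i++)
            add_rss_fast(MM_ANONPAGES, 1);      /* simulate anonymous faults */
        printf("published: %lu (cache still holds %ld)\n",
               get_rss(MM_ANONPAGES), cached_rss[MM_ANONPAGES]);
        sync_rss();
        printf("after final sync: %lu\n", get_rss(MM_ANONPAGES));
        return 0;
    }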
@@ -300,7 +371,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
300 | * Hide vma from rmap and truncate_pagecache before freeing | 371 | * Hide vma from rmap and truncate_pagecache before freeing |
301 | * pgtables | 372 | * pgtables |
302 | */ | 373 | */ |
303 | anon_vma_unlink(vma); | 374 | unlink_anon_vmas(vma); |
304 | unlink_file_vma(vma); | 375 | unlink_file_vma(vma); |
305 | 376 | ||
306 | if (is_vm_hugetlb_page(vma)) { | 377 | if (is_vm_hugetlb_page(vma)) { |
@@ -314,7 +385,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, | |||
314 | && !is_vm_hugetlb_page(next)) { | 385 | && !is_vm_hugetlb_page(next)) { |
315 | vma = next; | 386 | vma = next; |
316 | next = vma->vm_next; | 387 | next = vma->vm_next; |
317 | anon_vma_unlink(vma); | 388 | unlink_anon_vmas(vma); |
318 | unlink_file_vma(vma); | 389 | unlink_file_vma(vma); |
319 | } | 390 | } |
320 | free_pgd_range(tlb, addr, vma->vm_end, | 391 | free_pgd_range(tlb, addr, vma->vm_end, |
@@ -376,12 +447,20 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) | |||
376 | return 0; | 447 | return 0; |
377 | } | 448 | } |
378 | 449 | ||
379 | static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) | 450 | static inline void init_rss_vec(int *rss) |
380 | { | 451 | { |
381 | if (file_rss) | 452 | memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); |
382 | add_mm_counter(mm, file_rss, file_rss); | 453 | } |
383 | if (anon_rss) | 454 | |
384 | add_mm_counter(mm, anon_rss, anon_rss); | 455 | static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) |
456 | { | ||
457 | int i; | ||
458 | |||
459 | if (current->mm == mm) | ||
460 | sync_mm_rss(current, mm); | ||
461 | for (i = 0; i < NR_MM_COUNTERS; i++) | ||
462 | if (rss[i]) | ||
463 | add_mm_counter(mm, i, rss[i]); | ||
385 | } | 464 | } |
386 | 465 | ||
387 | /* | 466 | /* |
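
init_rss_vec()/add_mm_rss_vec() replace the old file_rss/anon_rss pair with a small array indexed by counter type, so the pte loops can tally any number of counter kinds locally and publish the totals once after the scan. A bare-bones sketch of that batching pattern (the page list and the non-atomic shared counters are stand-ins, not kernel structures):

    #include <stdio.h>
    #include <string.h>

    enum { MM_FILEPAGES, MM_ANONPAGES, MM_SWAPENTS, NR_MM_COUNTERS };

    static long mm_counters[NR_MM_COUNTERS];    /* "expensive" shared state */

    static void add_mm_counter(int member, int val)
    {
        mm_counters[member] += val;             /* imagine this being atomic */
    }

    struct fake_page { int anon; };             /* 1 = anonymous, 0 = file-backed */

    static void zap_range(struct fake_page *pages, int n)
    {
        int rss[NR_MM_COUNTERS];

        memset(rss, 0, sizeof(rss));            /* init_rss_vec() */
        for (int i = 0; i < n; i++)             /* tally inside the "locked" loop */
            rss[pages[i].anon ? MM_ANONPAGES : MM_FILEPAGES]--;
        for (int i = 0; i < NR_MM_COUNTERS; i++)    /* add_mm_rss_vec() */
            if (rss[i])
                add_mm_counter(i, rss[i]);
    }

    int main(void)
    {
        struct fake_page pages[] = { {1}, {1}, {0}, {1} };

        zap_range(pages, 4);
        printf("anon %ld, file %ld\n",
               mm_counters[MM_ANONPAGES], mm_counters[MM_FILEPAGES]);
        return 0;
    }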
@@ -430,12 +509,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, | |||
430 | "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", | 509 | "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", |
431 | current->comm, | 510 | current->comm, |
432 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); | 511 | (long long)pte_val(pte), (long long)pmd_val(*pmd)); |
433 | if (page) { | 512 | if (page) |
434 | printk(KERN_ALERT | 513 | dump_page(page); |
435 | "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", | ||
436 | page, (void *)page->flags, page_count(page), | ||
437 | page_mapcount(page), page->mapping, page->index); | ||
438 | } | ||
439 | printk(KERN_ALERT | 514 | printk(KERN_ALERT |
440 | "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", | 515 | "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", |
441 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); | 516 | (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); |
@@ -597,7 +672,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
597 | &src_mm->mmlist); | 672 | &src_mm->mmlist); |
598 | spin_unlock(&mmlist_lock); | 673 | spin_unlock(&mmlist_lock); |
599 | } | 674 | } |
600 | if (is_write_migration_entry(entry) && | 675 | if (likely(!non_swap_entry(entry))) |
676 | rss[MM_SWAPENTS]++; | ||
677 | else if (is_write_migration_entry(entry) && | ||
601 | is_cow_mapping(vm_flags)) { | 678 | is_cow_mapping(vm_flags)) { |
602 | /* | 679 | /* |
603 | * COW mappings require pages in both parent | 680 | * COW mappings require pages in both parent |
@@ -632,7 +709,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
632 | if (page) { | 709 | if (page) { |
633 | get_page(page); | 710 | get_page(page); |
634 | page_dup_rmap(page); | 711 | page_dup_rmap(page); |
635 | rss[PageAnon(page)]++; | 712 | if (PageAnon(page)) |
713 | rss[MM_ANONPAGES]++; | ||
714 | else | ||
715 | rss[MM_FILEPAGES]++; | ||
636 | } | 716 | } |
637 | 717 | ||
638 | out_set_pte: | 718 | out_set_pte: |
@@ -648,11 +728,12 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
648 | pte_t *src_pte, *dst_pte; | 728 | pte_t *src_pte, *dst_pte; |
649 | spinlock_t *src_ptl, *dst_ptl; | 729 | spinlock_t *src_ptl, *dst_ptl; |
650 | int progress = 0; | 730 | int progress = 0; |
651 | int rss[2]; | 731 | int rss[NR_MM_COUNTERS]; |
652 | swp_entry_t entry = (swp_entry_t){0}; | 732 | swp_entry_t entry = (swp_entry_t){0}; |
653 | 733 | ||
654 | again: | 734 | again: |
655 | rss[1] = rss[0] = 0; | 735 | init_rss_vec(rss); |
736 | |||
656 | dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); | 737 | dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); |
657 | if (!dst_pte) | 738 | if (!dst_pte) |
658 | return -ENOMEM; | 739 | return -ENOMEM; |
@@ -688,7 +769,7 @@ again: | |||
688 | arch_leave_lazy_mmu_mode(); | 769 | arch_leave_lazy_mmu_mode(); |
689 | spin_unlock(src_ptl); | 770 | spin_unlock(src_ptl); |
690 | pte_unmap_nested(orig_src_pte); | 771 | pte_unmap_nested(orig_src_pte); |
691 | add_mm_rss(dst_mm, rss[0], rss[1]); | 772 | add_mm_rss_vec(dst_mm, rss); |
692 | pte_unmap_unlock(orig_dst_pte, dst_ptl); | 773 | pte_unmap_unlock(orig_dst_pte, dst_ptl); |
693 | cond_resched(); | 774 | cond_resched(); |
694 | 775 | ||
@@ -816,8 +897,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
816 | struct mm_struct *mm = tlb->mm; | 897 | struct mm_struct *mm = tlb->mm; |
817 | pte_t *pte; | 898 | pte_t *pte; |
818 | spinlock_t *ptl; | 899 | spinlock_t *ptl; |
819 | int file_rss = 0; | 900 | int rss[NR_MM_COUNTERS]; |
820 | int anon_rss = 0; | 901 | |
902 | init_rss_vec(rss); | ||
821 | 903 | ||
822 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 904 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
823 | arch_enter_lazy_mmu_mode(); | 905 | arch_enter_lazy_mmu_mode(); |
@@ -863,14 +945,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
863 | set_pte_at(mm, addr, pte, | 945 | set_pte_at(mm, addr, pte, |
864 | pgoff_to_pte(page->index)); | 946 | pgoff_to_pte(page->index)); |
865 | if (PageAnon(page)) | 947 | if (PageAnon(page)) |
866 | anon_rss--; | 948 | rss[MM_ANONPAGES]--; |
867 | else { | 949 | else { |
868 | if (pte_dirty(ptent)) | 950 | if (pte_dirty(ptent)) |
869 | set_page_dirty(page); | 951 | set_page_dirty(page); |
870 | if (pte_young(ptent) && | 952 | if (pte_young(ptent) && |
871 | likely(!VM_SequentialReadHint(vma))) | 953 | likely(!VM_SequentialReadHint(vma))) |
872 | mark_page_accessed(page); | 954 | mark_page_accessed(page); |
873 | file_rss--; | 955 | rss[MM_FILEPAGES]--; |
874 | } | 956 | } |
875 | page_remove_rmap(page); | 957 | page_remove_rmap(page); |
876 | if (unlikely(page_mapcount(page) < 0)) | 958 | if (unlikely(page_mapcount(page) < 0)) |
@@ -887,13 +969,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, | |||
887 | if (pte_file(ptent)) { | 969 | if (pte_file(ptent)) { |
888 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) | 970 | if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) |
889 | print_bad_pte(vma, addr, ptent, NULL); | 971 | print_bad_pte(vma, addr, ptent, NULL); |
890 | } else if | 972 | } else { |
891 | (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) | 973 | swp_entry_t entry = pte_to_swp_entry(ptent); |
892 | print_bad_pte(vma, addr, ptent, NULL); | 974 | |
975 | if (!non_swap_entry(entry)) | ||
976 | rss[MM_SWAPENTS]--; | ||
977 | if (unlikely(!free_swap_and_cache(entry))) | ||
978 | print_bad_pte(vma, addr, ptent, NULL); | ||
979 | } | ||
893 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); | 980 | pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); |
894 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); | 981 | } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); |
895 | 982 | ||
896 | add_mm_rss(mm, file_rss, anon_rss); | 983 | add_mm_rss_vec(mm, rss); |
897 | arch_leave_lazy_mmu_mode(); | 984 | arch_leave_lazy_mmu_mode(); |
898 | pte_unmap_unlock(pte - 1, ptl); | 985 | pte_unmap_unlock(pte - 1, ptl); |
899 | 986 | ||
@@ -1527,7 +1614,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, | |||
1527 | 1614 | ||
1528 | /* Ok, finally just insert the thing.. */ | 1615 | /* Ok, finally just insert the thing.. */ |
1529 | get_page(page); | 1616 | get_page(page); |
1530 | inc_mm_counter(mm, file_rss); | 1617 | inc_mm_counter_fast(mm, MM_FILEPAGES); |
1531 | page_add_file_rmap(page); | 1618 | page_add_file_rmap(page); |
1532 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); | 1619 | set_pte_at(mm, addr, pte, mk_pte(page, prot)); |
1533 | 1620 | ||
@@ -1593,7 +1680,7 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, | |||
1593 | /* Ok, finally just insert the thing.. */ | 1680 | /* Ok, finally just insert the thing.. */ |
1594 | entry = pte_mkspecial(pfn_pte(pfn, prot)); | 1681 | entry = pte_mkspecial(pfn_pte(pfn, prot)); |
1595 | set_pte_at(mm, addr, pte, entry); | 1682 | set_pte_at(mm, addr, pte, entry); |
1596 | update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ | 1683 | update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ |
1597 | 1684 | ||
1598 | retval = 0; | 1685 | retval = 0; |
1599 | out_unlock: | 1686 | out_unlock: |
@@ -2044,6 +2131,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2044 | page_cache_release(old_page); | 2131 | page_cache_release(old_page); |
2045 | } | 2132 | } |
2046 | reuse = reuse_swap_page(old_page); | 2133 | reuse = reuse_swap_page(old_page); |
2134 | if (reuse) | ||
2135 | /* | ||
2136 | * The page is all ours. Move it to our anon_vma so | ||
2137 | * the rmap code will not search our parent or siblings. | ||
2138 | * Protected against the rmap code by the page lock. | ||
2139 | */ | ||
2140 | page_move_anon_rmap(old_page, vma, address); | ||
2047 | unlock_page(old_page); | 2141 | unlock_page(old_page); |
2048 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == | 2142 | } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == |
2049 | (VM_WRITE|VM_SHARED))) { | 2143 | (VM_WRITE|VM_SHARED))) { |
@@ -2116,7 +2210,7 @@ reuse: | |||
2116 | entry = pte_mkyoung(orig_pte); | 2210 | entry = pte_mkyoung(orig_pte); |
2117 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2211 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2118 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) | 2212 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) |
2119 | update_mmu_cache(vma, address, entry); | 2213 | update_mmu_cache(vma, address, page_table); |
2120 | ret |= VM_FAULT_WRITE; | 2214 | ret |= VM_FAULT_WRITE; |
2121 | goto unlock; | 2215 | goto unlock; |
2122 | } | 2216 | } |
@@ -2163,11 +2257,11 @@ gotten: | |||
2163 | if (likely(pte_same(*page_table, orig_pte))) { | 2257 | if (likely(pte_same(*page_table, orig_pte))) { |
2164 | if (old_page) { | 2258 | if (old_page) { |
2165 | if (!PageAnon(old_page)) { | 2259 | if (!PageAnon(old_page)) { |
2166 | dec_mm_counter(mm, file_rss); | 2260 | dec_mm_counter_fast(mm, MM_FILEPAGES); |
2167 | inc_mm_counter(mm, anon_rss); | 2261 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2168 | } | 2262 | } |
2169 | } else | 2263 | } else |
2170 | inc_mm_counter(mm, anon_rss); | 2264 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2171 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 2265 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
2172 | entry = mk_pte(new_page, vma->vm_page_prot); | 2266 | entry = mk_pte(new_page, vma->vm_page_prot); |
2173 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2267 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
@@ -2185,7 +2279,7 @@ gotten: | |||
2185 | * new page to be mapped directly into the secondary page table. | 2279 | * new page to be mapped directly into the secondary page table. |
2186 | */ | 2280 | */ |
2187 | set_pte_at_notify(mm, address, page_table, entry); | 2281 | set_pte_at_notify(mm, address, page_table, entry); |
2188 | update_mmu_cache(vma, address, entry); | 2282 | update_mmu_cache(vma, address, page_table); |
2189 | if (old_page) { | 2283 | if (old_page) { |
2190 | /* | 2284 | /* |
2191 | * Only after switching the pte to the new page may | 2285 | * Only after switching the pte to the new page may |
@@ -2604,7 +2698,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2604 | * discarded at swap_free(). | 2698 | * discarded at swap_free(). |
2605 | */ | 2699 | */ |
2606 | 2700 | ||
2607 | inc_mm_counter(mm, anon_rss); | 2701 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2702 | dec_mm_counter_fast(mm, MM_SWAPENTS); | ||
2608 | pte = mk_pte(page, vma->vm_page_prot); | 2703 | pte = mk_pte(page, vma->vm_page_prot); |
2609 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { | 2704 | if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { |
2610 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); | 2705 | pte = maybe_mkwrite(pte_mkdirty(pte), vma); |
@@ -2629,7 +2724,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2629 | } | 2724 | } |
2630 | 2725 | ||
2631 | /* No need to invalidate - it was non-present before */ | 2726 | /* No need to invalidate - it was non-present before */ |
2632 | update_mmu_cache(vma, address, pte); | 2727 | update_mmu_cache(vma, address, page_table); |
2633 | unlock: | 2728 | unlock: |
2634 | pte_unmap_unlock(page_table, ptl); | 2729 | pte_unmap_unlock(page_table, ptl); |
2635 | out: | 2730 | out: |
@@ -2688,13 +2783,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2688 | if (!pte_none(*page_table)) | 2783 | if (!pte_none(*page_table)) |
2689 | goto release; | 2784 | goto release; |
2690 | 2785 | ||
2691 | inc_mm_counter(mm, anon_rss); | 2786 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2692 | page_add_new_anon_rmap(page, vma, address); | 2787 | page_add_new_anon_rmap(page, vma, address); |
2693 | setpte: | 2788 | setpte: |
2694 | set_pte_at(mm, address, page_table, entry); | 2789 | set_pte_at(mm, address, page_table, entry); |
2695 | 2790 | ||
2696 | /* No need to invalidate - it was non-present before */ | 2791 | /* No need to invalidate - it was non-present before */ |
2697 | update_mmu_cache(vma, address, entry); | 2792 | update_mmu_cache(vma, address, page_table); |
2698 | unlock: | 2793 | unlock: |
2699 | pte_unmap_unlock(page_table, ptl); | 2794 | pte_unmap_unlock(page_table, ptl); |
2700 | return 0; | 2795 | return 0; |
@@ -2842,10 +2937,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2842 | if (flags & FAULT_FLAG_WRITE) | 2937 | if (flags & FAULT_FLAG_WRITE) |
2843 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 2938 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
2844 | if (anon) { | 2939 | if (anon) { |
2845 | inc_mm_counter(mm, anon_rss); | 2940 | inc_mm_counter_fast(mm, MM_ANONPAGES); |
2846 | page_add_new_anon_rmap(page, vma, address); | 2941 | page_add_new_anon_rmap(page, vma, address); |
2847 | } else { | 2942 | } else { |
2848 | inc_mm_counter(mm, file_rss); | 2943 | inc_mm_counter_fast(mm, MM_FILEPAGES); |
2849 | page_add_file_rmap(page); | 2944 | page_add_file_rmap(page); |
2850 | if (flags & FAULT_FLAG_WRITE) { | 2945 | if (flags & FAULT_FLAG_WRITE) { |
2851 | dirty_page = page; | 2946 | dirty_page = page; |
@@ -2855,7 +2950,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2855 | set_pte_at(mm, address, page_table, entry); | 2950 | set_pte_at(mm, address, page_table, entry); |
2856 | 2951 | ||
2857 | /* no need to invalidate: a not-present page won't be cached */ | 2952 | /* no need to invalidate: a not-present page won't be cached */ |
2858 | update_mmu_cache(vma, address, entry); | 2953 | update_mmu_cache(vma, address, page_table); |
2859 | } else { | 2954 | } else { |
2860 | if (charged) | 2955 | if (charged) |
2861 | mem_cgroup_uncharge_page(page); | 2956 | mem_cgroup_uncharge_page(page); |
@@ -2992,7 +3087,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
2992 | } | 3087 | } |
2993 | entry = pte_mkyoung(entry); | 3088 | entry = pte_mkyoung(entry); |
2994 | if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { | 3089 | if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { |
2995 | update_mmu_cache(vma, address, entry); | 3090 | update_mmu_cache(vma, address, pte); |
2996 | } else { | 3091 | } else { |
2997 | /* | 3092 | /* |
2998 | * This is needed only for protection faults but the arch code | 3093 | * This is needed only for protection faults but the arch code |
@@ -3023,6 +3118,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
3023 | 3118 | ||
3024 | count_vm_event(PGFAULT); | 3119 | count_vm_event(PGFAULT); |
3025 | 3120 | ||
3121 | /* do counter updates before entering the really critical section. */ | ||
3122 | check_sync_rss_stat(current); | ||
3123 | |||
3026 | if (unlikely(is_vm_hugetlb_page(vma))) | 3124 | if (unlikely(is_vm_hugetlb_page(vma))) |
3027 | return hugetlb_fault(mm, vma, address, flags); | 3125 | return hugetlb_fault(mm, vma, address, flags); |
3028 | 3126 | ||
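
A change repeated throughout this file is that update_mmu_cache() now receives the pte pointer rather than a copied pte value, presumably so the architecture hook can inspect or revalidate the live entry in place. A tiny illustration of the difference between the two calling conventions (plain C, not the real arch hook):

    #include <stdio.h>

    typedef unsigned long pte_t;

    /* Old style: the hook only ever sees the snapshot the caller took. */
    static void update_cache_by_value(pte_t entry)
    {
        printf("cached snapshot value %#lx\n", entry);
    }

    /* New style: the hook can re-read the live entry through the pointer. */
    static void update_cache_by_ptr(pte_t *ptep)
    {
        printf("cached current value %#lx\n", *ptep);
    }

    int main(void)
    {
        pte_t pte = 0x1000;
        pte_t snapshot = pte;

        pte |= 0x2;                     /* entry changes after the snapshot */
        update_cache_by_value(snapshot);
        update_cache_by_ptr(&pte);
        return 0;
    }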
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 030ce8a5bb0e..be211a582930 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/pfn.h> | 28 | #include <linux/pfn.h> |
29 | #include <linux/suspend.h> | 29 | #include <linux/suspend.h> |
30 | #include <linux/mm_inline.h> | 30 | #include <linux/mm_inline.h> |
31 | #include <linux/firmware-map.h> | ||
31 | 32 | ||
32 | #include <asm/tlbflush.h> | 33 | #include <asm/tlbflush.h> |
33 | 34 | ||
@@ -523,6 +524,9 @@ int __ref add_memory(int nid, u64 start, u64 size) | |||
523 | BUG_ON(ret); | 524 | BUG_ON(ret); |
524 | } | 525 | } |
525 | 526 | ||
527 | /* create new memmap entry */ | ||
528 | firmware_map_add_hotplug(start, start + size, "System RAM"); | ||
529 | |||
526 | goto out; | 530 | goto out; |
527 | 531 | ||
528 | error: | 532 | error: |
@@ -684,9 +688,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | |||
684 | if (page_count(page)) | 688 | if (page_count(page)) |
685 | not_managed++; | 689 | not_managed++; |
686 | #ifdef CONFIG_DEBUG_VM | 690 | #ifdef CONFIG_DEBUG_VM |
687 | printk(KERN_INFO "removing from LRU failed" | 691 | printk(KERN_ALERT "removing pfn %lx from LRU failed\n", |
688 | " %lx/%d/%lx\n", | 692 | pfn); |
689 | pfn, page_count(page), page->flags); | 693 | dump_page(page); |
690 | #endif | 694 | #endif |
691 | } | 695 | } |
692 | } | 696 | } |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3cec080faa23..643f66e10187 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -563,24 +563,50 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) | |||
563 | } | 563 | } |
564 | 564 | ||
565 | /* Step 2: apply policy to a range and do splits. */ | 565 | /* Step 2: apply policy to a range and do splits. */ |
566 | static int mbind_range(struct vm_area_struct *vma, unsigned long start, | 566 | static int mbind_range(struct mm_struct *mm, unsigned long start, |
567 | unsigned long end, struct mempolicy *new) | 567 | unsigned long end, struct mempolicy *new_pol) |
568 | { | 568 | { |
569 | struct vm_area_struct *next; | 569 | struct vm_area_struct *next; |
570 | int err; | 570 | struct vm_area_struct *prev; |
571 | struct vm_area_struct *vma; | ||
572 | int err = 0; | ||
573 | pgoff_t pgoff; | ||
574 | unsigned long vmstart; | ||
575 | unsigned long vmend; | ||
571 | 576 | ||
572 | err = 0; | 577 | vma = find_vma_prev(mm, start, &prev); |
573 | for (; vma && vma->vm_start < end; vma = next) { | 578 | if (!vma || vma->vm_start > start) |
579 | return -EFAULT; | ||
580 | |||
581 | for (; vma && vma->vm_start < end; prev = vma, vma = next) { | ||
574 | next = vma->vm_next; | 582 | next = vma->vm_next; |
575 | if (vma->vm_start < start) | 583 | vmstart = max(start, vma->vm_start); |
576 | err = split_vma(vma->vm_mm, vma, start, 1); | 584 | vmend = min(end, vma->vm_end); |
577 | if (!err && vma->vm_end > end) | 585 | |
578 | err = split_vma(vma->vm_mm, vma, end, 0); | 586 | pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); |
579 | if (!err) | 587 | prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, |
580 | err = policy_vma(vma, new); | 588 | vma->anon_vma, vma->vm_file, pgoff, new_pol); |
589 | if (prev) { | ||
590 | vma = prev; | ||
591 | next = vma->vm_next; | ||
592 | continue; | ||
593 | } | ||
594 | if (vma->vm_start != vmstart) { | ||
595 | err = split_vma(vma->vm_mm, vma, vmstart, 1); | ||
596 | if (err) | ||
597 | goto out; | ||
598 | } | ||
599 | if (vma->vm_end != vmend) { | ||
600 | err = split_vma(vma->vm_mm, vma, vmend, 0); | ||
601 | if (err) | ||
602 | goto out; | ||
603 | } | ||
604 | err = policy_vma(vma, new_pol); | ||
581 | if (err) | 605 | if (err) |
582 | break; | 606 | goto out; |
583 | } | 607 | } |
608 | |||
609 | out: | ||
584 | return err; | 610 | return err; |
585 | } | 611 | } |
586 | 612 | ||
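
The reworked mbind_range() no longer blindly splits: for each VMA it clamps the request to [vmstart, vmend), first tries to merge the new policy into a neighbouring VMA, and only calls split_vma() when a boundary genuinely cuts through the VMA. A small sketch of the clamp-and-decide arithmetic follows; it assumes fixed 4K pages and an invented range struct, and does not model vma_merge() itself.

    #include <stdio.h>

    #define PAGE_SHIFT 12

    struct range { unsigned long start, end, pgoff; };

    static void apply_policy(struct range *vmas, int n,
                             unsigned long start, unsigned long end)
    {
        for (int i = 0; i < n; i++) {
            struct range *vma = &vmas[i];

            if (vma->end <= start || vma->start >= end)
                continue;                               /* no overlap */

            unsigned long vmstart = start > vma->start ? start : vma->start;
            unsigned long vmend   = end   < vma->end   ? end   : vma->end;
            /* page offset of the clamped start within the mapping */
            unsigned long pgoff = vma->pgoff +
                    ((vmstart - vma->start) >> PAGE_SHIFT);

            if (vma->start != vmstart)
                printf("would split at %#lx\n", vmstart);
            if (vma->end != vmend)
                printf("would split at %#lx\n", vmend);
            printf("policy on %#lx-%#lx (pgoff %lu)\n", vmstart, vmend, pgoff);
        }
    }

    int main(void)
    {
        struct range vmas[] = {
            { 0x10000, 0x20000, 0 },
            { 0x20000, 0x40000, 16 },
        };

        apply_policy(vmas, 2, 0x18000, 0x30000);
        return 0;
    }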
@@ -862,36 +888,36 @@ int do_migrate_pages(struct mm_struct *mm, | |||
862 | if (err) | 888 | if (err) |
863 | goto out; | 889 | goto out; |
864 | 890 | ||
865 | /* | 891 | /* |
866 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' | 892 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' |
867 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' | 893 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' |
868 | * bit in 'tmp', and return that <source, dest> pair for migration. | 894 | * bit in 'tmp', and return that <source, dest> pair for migration. |
869 | * The pair of nodemasks 'to' and 'from' define the map. | 895 | * The pair of nodemasks 'to' and 'from' define the map. |
870 | * | 896 | * |
871 | * If no pair of bits is found that way, fallback to picking some | 897 | * If no pair of bits is found that way, fallback to picking some |
872 | * pair of 'source' and 'dest' bits that are not the same. If the | 898 | * pair of 'source' and 'dest' bits that are not the same. If the |
873 | * 'source' and 'dest' bits are the same, this represents a node | 899 | * 'source' and 'dest' bits are the same, this represents a node |
874 | * that will be migrating to itself, so no pages need move. | 900 | * that will be migrating to itself, so no pages need move. |
875 | * | 901 | * |
876 | * If no bits are left in 'tmp', or if all remaining bits left | 902 | * If no bits are left in 'tmp', or if all remaining bits left |
877 | * in 'tmp' correspond to the same bit in 'to', return false | 903 | * in 'tmp' correspond to the same bit in 'to', return false |
878 | * (nothing left to migrate). | 904 | * (nothing left to migrate). |
879 | * | 905 | * |
880 | * This lets us pick a pair of nodes to migrate between, such that | 906 | * This lets us pick a pair of nodes to migrate between, such that |
881 | * if possible the dest node is not already occupied by some other | 907 | * if possible the dest node is not already occupied by some other |
882 | * source node, minimizing the risk of overloading the memory on a | 908 | * source node, minimizing the risk of overloading the memory on a |
883 | * node that would happen if we migrated incoming memory to a node | 909 | * node that would happen if we migrated incoming memory to a node |
884 | * before migrating outgoing memory source that same node. | 910 | * before migrating outgoing memory source that same node. |
885 | * | 911 | * |
886 | * A single scan of tmp is sufficient. As we go, we remember the | 912 | * A single scan of tmp is sufficient. As we go, we remember the |
887 | * most recent <s, d> pair that moved (s != d). If we find a pair | 913 | * most recent <s, d> pair that moved (s != d). If we find a pair |
888 | * that not only moved, but what's better, moved to an empty slot | 914 | * that not only moved, but what's better, moved to an empty slot |
889 | * (d is not set in tmp), then we break out then, with that pair. | 915 | * (d is not set in tmp), then we break out then, with that pair. |
890 | * Otherwise when we finish scanning from_tmp, we at least have the | 916 | * Otherwise when we finish scanning from_tmp, we at least have the |
891 | * most recent <s, d> pair that moved. If we get all the way through | 917 | * most recent <s, d> pair that moved. If we get all the way through |
892 | * the scan of tmp without finding any node that moved, much less | 918 | * the scan of tmp without finding any node that moved, much less |
893 | * moved to an empty node, then there is nothing left worth migrating. | 919 | * moved to an empty node, then there is nothing left worth migrating. |
894 | */ | 920 | */ |
895 | 921 | ||
896 | tmp = *from_nodes; | 922 | tmp = *from_nodes; |
897 | while (!nodes_empty(tmp)) { | 923 | while (!nodes_empty(tmp)) { |
@@ -1047,7 +1073,7 @@ static long do_mbind(unsigned long start, unsigned long len, | |||
1047 | if (!IS_ERR(vma)) { | 1073 | if (!IS_ERR(vma)) { |
1048 | int nr_failed = 0; | 1074 | int nr_failed = 0; |
1049 | 1075 | ||
1050 | err = mbind_range(vma, start, end, new); | 1076 | err = mbind_range(mm, start, end, new); |
1051 | 1077 | ||
1052 | if (!list_empty(&pagelist)) | 1078 | if (!list_empty(&pagelist)) |
1053 | nr_failed = migrate_pages(&pagelist, new_vma_page, | 1079 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
diff --git a/mm/migrate.c b/mm/migrate.c index 880bd592d38e..88000b89fc9a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -134,7 +134,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
134 | page_add_file_rmap(new); | 134 | page_add_file_rmap(new); |
135 | 135 | ||
136 | /* No need to invalidate - it was non-present before */ | 136 | /* No need to invalidate - it was non-present before */ |
137 | update_mmu_cache(vma, addr, pte); | 137 | update_mmu_cache(vma, addr, ptep); |
138 | unlock: | 138 | unlock: |
139 | pte_unmap_unlock(ptep, ptl); | 139 | pte_unmap_unlock(ptep, ptl); |
140 | out: | 140 | out: |
@@ -275,8 +275,6 @@ static int migrate_page_move_mapping(struct address_space *mapping, | |||
275 | */ | 275 | */ |
276 | static void migrate_page_copy(struct page *newpage, struct page *page) | 276 | static void migrate_page_copy(struct page *newpage, struct page *page) |
277 | { | 277 | { |
278 | int anon; | ||
279 | |||
280 | copy_highpage(newpage, page); | 278 | copy_highpage(newpage, page); |
281 | 279 | ||
282 | if (PageError(page)) | 280 | if (PageError(page)) |
@@ -313,8 +311,6 @@ static void migrate_page_copy(struct page *newpage, struct page *page) | |||
313 | ClearPageSwapCache(page); | 311 | ClearPageSwapCache(page); |
314 | ClearPagePrivate(page); | 312 | ClearPagePrivate(page); |
315 | set_page_private(page, 0); | 313 | set_page_private(page, 0); |
316 | /* page->mapping contains a flag for PageAnon() */ | ||
317 | anon = PageAnon(page); | ||
318 | page->mapping = NULL; | 314 | page->mapping = NULL; |
319 | 315 | ||
320 | /* | 316 | /* |
diff --git a/mm/mlock.c b/mm/mlock.c index 2b8335a89400..8f4e2dfceec1 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -25,7 +25,7 @@ int can_do_mlock(void) | |||
25 | { | 25 | { |
26 | if (capable(CAP_IPC_LOCK)) | 26 | if (capable(CAP_IPC_LOCK)) |
27 | return 1; | 27 | return 1; |
28 | if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) | 28 | if (rlimit(RLIMIT_MEMLOCK) != 0) |
29 | return 1; | 29 | return 1; |
30 | return 0; | 30 | return 0; |
31 | } | 31 | } |
@@ -487,7 +487,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
487 | locked = len >> PAGE_SHIFT; | 487 | locked = len >> PAGE_SHIFT; |
488 | locked += current->mm->locked_vm; | 488 | locked += current->mm->locked_vm; |
489 | 489 | ||
490 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 490 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
491 | lock_limit >>= PAGE_SHIFT; | 491 | lock_limit >>= PAGE_SHIFT; |
492 | 492 | ||
493 | /* check against resource limits */ | 493 | /* check against resource limits */ |
@@ -550,7 +550,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
550 | 550 | ||
551 | down_write(¤t->mm->mmap_sem); | 551 | down_write(¤t->mm->mmap_sem); |
552 | 552 | ||
553 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 553 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
554 | lock_limit >>= PAGE_SHIFT; | 554 | lock_limit >>= PAGE_SHIFT; |
555 | 555 | ||
556 | ret = -ENOMEM; | 556 | ret = -ENOMEM; |
@@ -584,7 +584,7 @@ int user_shm_lock(size_t size, struct user_struct *user) | |||
584 | int allowed = 0; | 584 | int allowed = 0; |
585 | 585 | ||
586 | locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; | 586 | locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; |
587 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 587 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
588 | if (lock_limit == RLIM_INFINITY) | 588 | if (lock_limit == RLIM_INFINITY) |
589 | allowed = 1; | 589 | allowed = 1; |
590 | lock_limit >>= PAGE_SHIFT; | 590 | lock_limit >>= PAGE_SHIFT; |
@@ -618,12 +618,12 @@ int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, | |||
618 | 618 | ||
619 | down_write(&mm->mmap_sem); | 619 | down_write(&mm->mmap_sem); |
620 | 620 | ||
621 | lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; | 621 | lim = ACCESS_ONCE(rlim[RLIMIT_AS].rlim_cur) >> PAGE_SHIFT; |
622 | vm = mm->total_vm + pgsz; | 622 | vm = mm->total_vm + pgsz; |
623 | if (lim < vm) | 623 | if (lim < vm) |
624 | goto out; | 624 | goto out; |
625 | 625 | ||
626 | lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; | 626 | lim = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur) >> PAGE_SHIFT; |
627 | vm = mm->locked_vm + pgsz; | 627 | vm = mm->locked_vm + pgsz; |
628 | if (lim < vm) | 628 | if (lim < vm) |
629 | goto out; | 629 | goto out; |
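
The hunks above swap the open-coded current->signal->rlim[...].rlim_cur reads for the rlimit() accessor (with ACCESS_ONCE where the slot is still read directly, since the limit can change underneath the caller); the shape of the memlock check itself is unchanged. A userspace rendition of that check using getrlimit(), not the kernel helper, and without the CAP_IPC_LOCK override:

    #include <stdio.h>
    #include <sys/resource.h>
    #include <unistd.h>

    /* Would mlock()ing "len" more bytes stay within RLIMIT_MEMLOCK?
     * already_locked is in bytes; the CAP_IPC_LOCK bypass is not modeled. */
    static int memlock_ok(size_t already_locked, size_t len)
    {
        struct rlimit rl;
        long page = sysconf(_SC_PAGESIZE);
        size_t locked, lock_limit;

        if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0)
            return 0;
        if (rl.rlim_cur == RLIM_INFINITY)
            return 1;

        /* same shape as the kernel check: compare in pages */
        locked = (len + page - 1) / page + already_locked / page;
        lock_limit = rl.rlim_cur / page;
        return locked <= lock_limit;
    }

    int main(void)
    {
        printf("64 KiB more: %s\n",
               memlock_ok(0, 64 * 1024) ? "ok" : "over limit");
        return 0;
    }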
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -265,7 +265,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) | |||
265 | * segment grow beyond its set limit in the case where the limit is | 265 | * segment grow beyond its set limit in the case where the limit is |
266 | * not page aligned -Ram Gupta | 266 | * not page aligned -Ram Gupta |
267 | */ | 267 | */ |
268 | rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; | 268 | rlim = rlimit(RLIMIT_DATA); |
269 | if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + | 269 | if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + |
270 | (mm->end_data - mm->start_data) > rlim) | 270 | (mm->end_data - mm->start_data) > rlim) |
271 | goto out; | 271 | goto out; |
@@ -437,7 +437,6 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | |||
437 | { | 437 | { |
438 | __vma_link_list(mm, vma, prev, rb_parent); | 438 | __vma_link_list(mm, vma, prev, rb_parent); |
439 | __vma_link_rb(mm, vma, rb_link, rb_parent); | 439 | __vma_link_rb(mm, vma, rb_link, rb_parent); |
440 | __anon_vma_link(vma); | ||
441 | } | 440 | } |
442 | 441 | ||
443 | static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, | 442 | static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, |
@@ -499,7 +498,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, | |||
499 | * are necessary. The "insert" vma (if any) is to be inserted | 498 | * are necessary. The "insert" vma (if any) is to be inserted |
500 | * before we drop the necessary locks. | 499 | * before we drop the necessary locks. |
501 | */ | 500 | */ |
502 | void vma_adjust(struct vm_area_struct *vma, unsigned long start, | 501 | int vma_adjust(struct vm_area_struct *vma, unsigned long start, |
503 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) | 502 | unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) |
504 | { | 503 | { |
505 | struct mm_struct *mm = vma->vm_mm; | 504 | struct mm_struct *mm = vma->vm_mm; |
@@ -542,6 +541,26 @@ again: remove_next = 1 + (end > next->vm_end); | |||
542 | } | 541 | } |
543 | } | 542 | } |
544 | 543 | ||
544 | /* | ||
545 | * When changing only vma->vm_end, we don't really need anon_vma lock. | ||
546 | */ | ||
547 | if (vma->anon_vma && (insert || importer || start != vma->vm_start)) | ||
548 | anon_vma = vma->anon_vma; | ||
549 | if (anon_vma) { | ||
550 | /* | ||
551 | * Easily overlooked: when mprotect shifts the boundary, | ||
552 | * make sure the expanding vma has anon_vma set if the | ||
553 | * shrinking vma had, to cover any anon pages imported. | ||
554 | */ | ||
555 | if (importer && !importer->anon_vma) { | ||
556 | /* Block reverse map lookups until things are set up. */ | ||
557 | if (anon_vma_clone(importer, vma)) { | ||
558 | return -ENOMEM; | ||
559 | } | ||
560 | importer->anon_vma = anon_vma; | ||
561 | } | ||
562 | } | ||
563 | |||
545 | if (file) { | 564 | if (file) { |
546 | mapping = file->f_mapping; | 565 | mapping = file->f_mapping; |
547 | if (!(vma->vm_flags & VM_NONLINEAR)) | 566 | if (!(vma->vm_flags & VM_NONLINEAR)) |
@@ -567,25 +586,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
567 | } | 586 | } |
568 | } | 587 | } |
569 | 588 | ||
570 | /* | ||
571 | * When changing only vma->vm_end, we don't really need | ||
572 | * anon_vma lock. | ||
573 | */ | ||
574 | if (vma->anon_vma && (insert || importer || start != vma->vm_start)) | ||
575 | anon_vma = vma->anon_vma; | ||
576 | if (anon_vma) { | ||
577 | spin_lock(&anon_vma->lock); | ||
578 | /* | ||
579 | * Easily overlooked: when mprotect shifts the boundary, | ||
580 | * make sure the expanding vma has anon_vma set if the | ||
581 | * shrinking vma had, to cover any anon pages imported. | ||
582 | */ | ||
583 | if (importer && !importer->anon_vma) { | ||
584 | importer->anon_vma = anon_vma; | ||
585 | __anon_vma_link(importer); | ||
586 | } | ||
587 | } | ||
588 | |||
589 | if (root) { | 589 | if (root) { |
590 | flush_dcache_mmap_lock(mapping); | 590 | flush_dcache_mmap_lock(mapping); |
591 | vma_prio_tree_remove(vma, root); | 591 | vma_prio_tree_remove(vma, root); |
@@ -616,8 +616,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
616 | __vma_unlink(mm, next, vma); | 616 | __vma_unlink(mm, next, vma); |
617 | if (file) | 617 | if (file) |
618 | __remove_shared_vm_struct(next, file, mapping); | 618 | __remove_shared_vm_struct(next, file, mapping); |
619 | if (next->anon_vma) | ||
620 | __anon_vma_merge(vma, next); | ||
621 | } else if (insert) { | 619 | } else if (insert) { |
622 | /* | 620 | /* |
623 | * split_vma has split insert from vma, and needs | 621 | * split_vma has split insert from vma, and needs |
@@ -627,8 +625,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
627 | __insert_vm_struct(mm, insert); | 625 | __insert_vm_struct(mm, insert); |
628 | } | 626 | } |
629 | 627 | ||
630 | if (anon_vma) | ||
631 | spin_unlock(&anon_vma->lock); | ||
632 | if (mapping) | 628 | if (mapping) |
633 | spin_unlock(&mapping->i_mmap_lock); | 629 | spin_unlock(&mapping->i_mmap_lock); |
634 | 630 | ||
@@ -638,6 +634,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
638 | if (next->vm_flags & VM_EXECUTABLE) | 634 | if (next->vm_flags & VM_EXECUTABLE) |
639 | removed_exe_file_vma(mm); | 635 | removed_exe_file_vma(mm); |
640 | } | 636 | } |
637 | if (next->anon_vma) | ||
638 | anon_vma_merge(vma, next); | ||
641 | mm->map_count--; | 639 | mm->map_count--; |
642 | mpol_put(vma_policy(next)); | 640 | mpol_put(vma_policy(next)); |
643 | kmem_cache_free(vm_area_cachep, next); | 641 | kmem_cache_free(vm_area_cachep, next); |
@@ -653,6 +651,8 @@ again: remove_next = 1 + (end > next->vm_end); | |||
653 | } | 651 | } |
654 | 652 | ||
655 | validate_mm(mm); | 653 | validate_mm(mm); |
654 | |||
655 | return 0; | ||
656 | } | 656 | } |
657 | 657 | ||
658 | /* | 658 | /* |
@@ -759,6 +759,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
759 | { | 759 | { |
760 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; | 760 | pgoff_t pglen = (end - addr) >> PAGE_SHIFT; |
761 | struct vm_area_struct *area, *next; | 761 | struct vm_area_struct *area, *next; |
762 | int err; | ||
762 | 763 | ||
763 | /* | 764 | /* |
764 | * We later require that vma->vm_flags == vm_flags, | 765 | * We later require that vma->vm_flags == vm_flags, |
@@ -792,11 +793,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
792 | is_mergeable_anon_vma(prev->anon_vma, | 793 | is_mergeable_anon_vma(prev->anon_vma, |
793 | next->anon_vma)) { | 794 | next->anon_vma)) { |
794 | /* cases 1, 6 */ | 795 | /* cases 1, 6 */ |
795 | vma_adjust(prev, prev->vm_start, | 796 | err = vma_adjust(prev, prev->vm_start, |
796 | next->vm_end, prev->vm_pgoff, NULL); | 797 | next->vm_end, prev->vm_pgoff, NULL); |
797 | } else /* cases 2, 5, 7 */ | 798 | } else /* cases 2, 5, 7 */ |
798 | vma_adjust(prev, prev->vm_start, | 799 | err = vma_adjust(prev, prev->vm_start, |
799 | end, prev->vm_pgoff, NULL); | 800 | end, prev->vm_pgoff, NULL); |
801 | if (err) | ||
802 | return NULL; | ||
800 | return prev; | 803 | return prev; |
801 | } | 804 | } |
802 | 805 | ||
@@ -808,11 +811,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, | |||
808 | can_vma_merge_before(next, vm_flags, | 811 | can_vma_merge_before(next, vm_flags, |
809 | anon_vma, file, pgoff+pglen)) { | 812 | anon_vma, file, pgoff+pglen)) { |
810 | if (prev && addr < prev->vm_end) /* case 4 */ | 813 | if (prev && addr < prev->vm_end) /* case 4 */ |
811 | vma_adjust(prev, prev->vm_start, | 814 | err = vma_adjust(prev, prev->vm_start, |
812 | addr, prev->vm_pgoff, NULL); | 815 | addr, prev->vm_pgoff, NULL); |
813 | else /* cases 3, 8 */ | 816 | else /* cases 3, 8 */ |
814 | vma_adjust(area, addr, next->vm_end, | 817 | err = vma_adjust(area, addr, next->vm_end, |
815 | next->vm_pgoff - pglen, NULL); | 818 | next->vm_pgoff - pglen, NULL); |
819 | if (err) | ||
820 | return NULL; | ||
816 | return area; | 821 | return area; |
817 | } | 822 | } |
818 | 823 | ||
@@ -967,7 +972,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
967 | unsigned long locked, lock_limit; | 972 | unsigned long locked, lock_limit; |
968 | locked = len >> PAGE_SHIFT; | 973 | locked = len >> PAGE_SHIFT; |
969 | locked += mm->locked_vm; | 974 | locked += mm->locked_vm; |
970 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 975 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
971 | lock_limit >>= PAGE_SHIFT; | 976 | lock_limit >>= PAGE_SHIFT; |
972 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 977 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
973 | return -EAGAIN; | 978 | return -EAGAIN; |
@@ -1083,6 +1088,30 @@ out: | |||
1083 | return retval; | 1088 | return retval; |
1084 | } | 1089 | } |
1085 | 1090 | ||
1091 | #ifdef __ARCH_WANT_SYS_OLD_MMAP | ||
1092 | struct mmap_arg_struct { | ||
1093 | unsigned long addr; | ||
1094 | unsigned long len; | ||
1095 | unsigned long prot; | ||
1096 | unsigned long flags; | ||
1097 | unsigned long fd; | ||
1098 | unsigned long offset; | ||
1099 | }; | ||
1100 | |||
1101 | SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) | ||
1102 | { | ||
1103 | struct mmap_arg_struct a; | ||
1104 | |||
1105 | if (copy_from_user(&a, arg, sizeof(a))) | ||
1106 | return -EFAULT; | ||
1107 | if (a.offset & ~PAGE_MASK) | ||
1108 | return -EINVAL; | ||
1109 | |||
1110 | return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, | ||
1111 | a.offset >> PAGE_SHIFT); | ||
1112 | } | ||
1113 | #endif /* __ARCH_WANT_SYS_OLD_MMAP */ | ||
1114 | |||
1086 | /* | 1115 | /* |
1087 | * Some shared mappings will want the pages marked read-only | 1116 | * Some shared mappings will want the pages marked read-only |
1088 | * to track write events. If so, we'll downgrade vm_page_prot | 1117 | * to track write events. If so, we'll downgrade vm_page_prot |
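
The old_mmap entry point added in this hunk serves architectures whose legacy mmap() took a single pointer to a six-field argument block instead of six separate arguments: the struct is copied in, a non-page-aligned offset is rejected, and the call is forwarded as a page offset. A userspace mock of that unpacking (the struct layout mirrors the hunk; the 4K page size and the stub are assumptions):

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    #define DEMO_PAGE_SIZE 4096UL   /* assume 4K pages for the demo */

    struct mmap_arg_struct {
        unsigned long addr, len, prot, flags, fd, offset;
    };

    /* Stand-in for sys_mmap_pgoff(): just report what it would be asked to do. */
    static long mmap_pgoff_stub(unsigned long addr, unsigned long len,
                                unsigned long prot, unsigned long flags,
                                unsigned long fd, unsigned long pgoff)
    {
        printf("mmap_pgoff(addr=%#lx, len=%lu, fd=%lu, pgoff=%lu)\n",
               addr, len, fd, pgoff);
        return 0;
    }

    static long old_mmap(const struct mmap_arg_struct *user_arg)
    {
        struct mmap_arg_struct a;

        memcpy(&a, user_arg, sizeof(a));        /* copy_from_user() stand-in */
        if (a.offset & (DEMO_PAGE_SIZE - 1))
            return -EINVAL;                     /* offset must be page aligned */
        return mmap_pgoff_stub(a.addr, a.len, a.prot, a.flags, a.fd,
                               a.offset / DEMO_PAGE_SIZE);
    }

    int main(void)
    {
        struct mmap_arg_struct ok  = { 0, 8192, 3, 1, 4, 4096 };
        struct mmap_arg_struct bad = { 0, 8192, 3, 1, 4, 123 };

        printf("aligned: %ld\n", old_mmap(&ok));
        printf("unaligned: %ld\n", old_mmap(&bad));
        return 0;
    }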
@@ -1205,6 +1234,7 @@ munmap_back: | |||
1205 | vma->vm_flags = vm_flags; | 1234 | vma->vm_flags = vm_flags; |
1206 | vma->vm_page_prot = vm_get_page_prot(vm_flags); | 1235 | vma->vm_page_prot = vm_get_page_prot(vm_flags); |
1207 | vma->vm_pgoff = pgoff; | 1236 | vma->vm_pgoff = pgoff; |
1237 | INIT_LIST_HEAD(&vma->anon_vma_chain); | ||
1208 | 1238 | ||
1209 | if (file) { | 1239 | if (file) { |
1210 | error = -EINVAL; | 1240 | error = -EINVAL; |
@@ -1265,13 +1295,8 @@ out: | |||
1265 | mm->total_vm += len >> PAGE_SHIFT; | 1295 | mm->total_vm += len >> PAGE_SHIFT; |
1266 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1296 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1267 | if (vm_flags & VM_LOCKED) { | 1297 | if (vm_flags & VM_LOCKED) { |
1268 | /* | 1298 | if (!mlock_vma_pages_range(vma, addr, addr + len)) |
1269 | * makes pages present; downgrades, drops, reacquires mmap_sem | 1299 | mm->locked_vm += (len >> PAGE_SHIFT); |
1270 | */ | ||
1271 | long nr_pages = mlock_vma_pages_range(vma, addr, addr + len); | ||
1272 | if (nr_pages < 0) | ||
1273 | return nr_pages; /* vma gone! */ | ||
1274 | mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages; | ||
1275 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) | 1300 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) |
1276 | make_pages_present(addr, addr + len); | 1301 | make_pages_present(addr, addr + len); |
1277 | return addr; | 1302 | return addr; |
@@ -1599,7 +1624,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
1599 | return -ENOMEM; | 1624 | return -ENOMEM; |
1600 | 1625 | ||
1601 | /* Stack limit test */ | 1626 | /* Stack limit test */ |
1602 | if (size > rlim[RLIMIT_STACK].rlim_cur) | 1627 | if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) |
1603 | return -ENOMEM; | 1628 | return -ENOMEM; |
1604 | 1629 | ||
1605 | /* mlock limit tests */ | 1630 | /* mlock limit tests */ |
@@ -1607,7 +1632,8 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns | |||
1607 | unsigned long locked; | 1632 | unsigned long locked; |
1608 | unsigned long limit; | 1633 | unsigned long limit; |
1609 | locked = mm->locked_vm + grow; | 1634 | locked = mm->locked_vm + grow; |
1610 | limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; | 1635 | limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); |
1636 | limit >>= PAGE_SHIFT; | ||
1611 | if (locked > limit && !capable(CAP_IPC_LOCK)) | 1637 | if (locked > limit && !capable(CAP_IPC_LOCK)) |
1612 | return -ENOMEM; | 1638 | return -ENOMEM; |
1613 | } | 1639 | } |
@@ -1754,8 +1780,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) | |||
1754 | if (!prev || expand_stack(prev, addr)) | 1780 | if (!prev || expand_stack(prev, addr)) |
1755 | return NULL; | 1781 | return NULL; |
1756 | if (prev->vm_flags & VM_LOCKED) { | 1782 | if (prev->vm_flags & VM_LOCKED) { |
1757 | if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) | 1783 | mlock_vma_pages_range(prev, addr, prev->vm_end); |
1758 | return NULL; /* vma gone! */ | ||
1759 | } | 1784 | } |
1760 | return prev; | 1785 | return prev; |
1761 | } | 1786 | } |
@@ -1783,8 +1808,7 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) | |||
1783 | if (expand_stack(vma, addr)) | 1808 | if (expand_stack(vma, addr)) |
1784 | return NULL; | 1809 | return NULL; |
1785 | if (vma->vm_flags & VM_LOCKED) { | 1810 | if (vma->vm_flags & VM_LOCKED) { |
1786 | if (mlock_vma_pages_range(vma, addr, start) < 0) | 1811 | mlock_vma_pages_range(vma, addr, start); |
1787 | return NULL; /* vma gone! */ | ||
1788 | } | 1812 | } |
1789 | return vma; | 1813 | return vma; |
1790 | } | 1814 | } |
@@ -1871,6 +1895,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1871 | { | 1895 | { |
1872 | struct mempolicy *pol; | 1896 | struct mempolicy *pol; |
1873 | struct vm_area_struct *new; | 1897 | struct vm_area_struct *new; |
1898 | int err = -ENOMEM; | ||
1874 | 1899 | ||
1875 | if (is_vm_hugetlb_page(vma) && (addr & | 1900 | if (is_vm_hugetlb_page(vma) && (addr & |
1876 | ~(huge_page_mask(hstate_vma(vma))))) | 1901 | ~(huge_page_mask(hstate_vma(vma))))) |
@@ -1878,11 +1903,13 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1878 | 1903 | ||
1879 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); | 1904 | new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); |
1880 | if (!new) | 1905 | if (!new) |
1881 | return -ENOMEM; | 1906 | goto out_err; |
1882 | 1907 | ||
1883 | /* most fields are the same, copy all, and then fixup */ | 1908 | /* most fields are the same, copy all, and then fixup */ |
1884 | *new = *vma; | 1909 | *new = *vma; |
1885 | 1910 | ||
1911 | INIT_LIST_HEAD(&new->anon_vma_chain); | ||
1912 | |||
1886 | if (new_below) | 1913 | if (new_below) |
1887 | new->vm_end = addr; | 1914 | new->vm_end = addr; |
1888 | else { | 1915 | else { |
@@ -1892,11 +1919,14 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1892 | 1919 | ||
1893 | pol = mpol_dup(vma_policy(vma)); | 1920 | pol = mpol_dup(vma_policy(vma)); |
1894 | if (IS_ERR(pol)) { | 1921 | if (IS_ERR(pol)) { |
1895 | kmem_cache_free(vm_area_cachep, new); | 1922 | err = PTR_ERR(pol); |
1896 | return PTR_ERR(pol); | 1923 | goto out_free_vma; |
1897 | } | 1924 | } |
1898 | vma_set_policy(new, pol); | 1925 | vma_set_policy(new, pol); |
1899 | 1926 | ||
1927 | if (anon_vma_clone(new, vma)) | ||
1928 | goto out_free_mpol; | ||
1929 | |||
1900 | if (new->vm_file) { | 1930 | if (new->vm_file) { |
1901 | get_file(new->vm_file); | 1931 | get_file(new->vm_file); |
1902 | if (vma->vm_flags & VM_EXECUTABLE) | 1932 | if (vma->vm_flags & VM_EXECUTABLE) |
@@ -1907,12 +1937,28 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1907 | new->vm_ops->open(new); | 1937 | new->vm_ops->open(new); |
1908 | 1938 | ||
1909 | if (new_below) | 1939 | if (new_below) |
1910 | vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + | 1940 | err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + |
1911 | ((addr - new->vm_start) >> PAGE_SHIFT), new); | 1941 | ((addr - new->vm_start) >> PAGE_SHIFT), new); |
1912 | else | 1942 | else |
1913 | vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); | 1943 | err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); |
1914 | 1944 | ||
1915 | return 0; | 1945 | /* Success. */ |
1946 | if (!err) | ||
1947 | return 0; | ||
1948 | |||
1949 | /* Clean everything up if vma_adjust failed. */ | ||
1950 | new->vm_ops->close(new); | ||
1951 | if (new->vm_file) { | ||
1952 | if (vma->vm_flags & VM_EXECUTABLE) | ||
1953 | removed_exe_file_vma(mm); | ||
1954 | fput(new->vm_file); | ||
1955 | } | ||
1956 | out_free_mpol: | ||
1957 | mpol_put(pol); | ||
1958 | out_free_vma: | ||
1959 | kmem_cache_free(vm_area_cachep, new); | ||
1960 | out_err: | ||
1961 | return err; | ||
1916 | } | 1962 | } |
1917 | 1963 | ||
1918 | /* | 1964 | /* |
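
Because vma_adjust() can now fail, __split_vma() gained the usual kernel-style goto ladder: each failure point jumps to the label that releases exactly what has been set up so far, unwinding in reverse order of acquisition. A generic illustration of that error-handling pattern, using plain malloc()s rather than VMAs:

    #include <stdio.h>
    #include <stdlib.h>

    static int build_thing(void)
    {
        int err = -1;
        char *vma_copy, *policy;

        vma_copy = malloc(64);
        if (!vma_copy)
            goto out_err;               /* nothing to undo yet */

        policy = malloc(64);
        if (!policy)
            goto out_free_vma;          /* undo the first allocation only */

        if (0 /* imagine the final adjust step failing here */)
            goto out_free_policy;       /* unwind everything, newest first */

        free(policy);                   /* success path for the demo */
        free(vma_copy);
        return 0;

    out_free_policy:
        free(policy);
    out_free_vma:
        free(vma_copy);
    out_err:
        return err;
    }

    int main(void)
    {
        printf("build_thing() = %d\n", build_thing());
        return 0;
    }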
@@ -2074,7 +2120,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2074 | unsigned long locked, lock_limit; | 2120 | unsigned long locked, lock_limit; |
2075 | locked = len >> PAGE_SHIFT; | 2121 | locked = len >> PAGE_SHIFT; |
2076 | locked += mm->locked_vm; | 2122 | locked += mm->locked_vm; |
2077 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 2123 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
2078 | lock_limit >>= PAGE_SHIFT; | 2124 | lock_limit >>= PAGE_SHIFT; |
2079 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 2125 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
2080 | return -EAGAIN; | 2126 | return -EAGAIN; |
@@ -2122,6 +2168,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
2122 | return -ENOMEM; | 2168 | return -ENOMEM; |
2123 | } | 2169 | } |
2124 | 2170 | ||
2171 | INIT_LIST_HEAD(&vma->anon_vma_chain); | ||
2125 | vma->vm_mm = mm; | 2172 | vma->vm_mm = mm; |
2126 | vma->vm_start = addr; | 2173 | vma->vm_start = addr; |
2127 | vma->vm_end = addr + len; | 2174 | vma->vm_end = addr + len; |
@@ -2258,10 +2305,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2258 | if (new_vma) { | 2305 | if (new_vma) { |
2259 | *new_vma = *vma; | 2306 | *new_vma = *vma; |
2260 | pol = mpol_dup(vma_policy(vma)); | 2307 | pol = mpol_dup(vma_policy(vma)); |
2261 | if (IS_ERR(pol)) { | 2308 | if (IS_ERR(pol)) |
2262 | kmem_cache_free(vm_area_cachep, new_vma); | 2309 | goto out_free_vma; |
2263 | return NULL; | 2310 | INIT_LIST_HEAD(&new_vma->anon_vma_chain); |
2264 | } | 2311 | if (anon_vma_clone(new_vma, vma)) |
2312 | goto out_free_mempol; | ||
2265 | vma_set_policy(new_vma, pol); | 2313 | vma_set_policy(new_vma, pol); |
2266 | new_vma->vm_start = addr; | 2314 | new_vma->vm_start = addr; |
2267 | new_vma->vm_end = addr + len; | 2315 | new_vma->vm_end = addr + len; |
@@ -2277,6 +2325,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2277 | } | 2325 | } |
2278 | } | 2326 | } |
2279 | return new_vma; | 2327 | return new_vma; |
2328 | |||
2329 | out_free_mempol: | ||
2330 | mpol_put(pol); | ||
2331 | out_free_vma: | ||
2332 | kmem_cache_free(vm_area_cachep, new_vma); | ||
2333 | return NULL; | ||
2280 | } | 2334 | } |
2281 | 2335 | ||
2282 | /* | 2336 | /* |
@@ -2288,7 +2342,7 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages) | |||
2288 | unsigned long cur = mm->total_vm; /* pages */ | 2342 | unsigned long cur = mm->total_vm; /* pages */ |
2289 | unsigned long lim; | 2343 | unsigned long lim; |
2290 | 2344 | ||
2291 | lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; | 2345 | lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; |
2292 | 2346 | ||
2293 | if (cur + npages > lim) | 2347 | if (cur + npages > lim) |
2294 | return 0; | 2348 | return 0; |
@@ -2354,6 +2408,7 @@ int install_special_mapping(struct mm_struct *mm, | |||
2354 | if (unlikely(vma == NULL)) | 2408 | if (unlikely(vma == NULL)) |
2355 | return -ENOMEM; | 2409 | return -ENOMEM; |
2356 | 2410 | ||
2411 | INIT_LIST_HEAD(&vma->anon_vma_chain); | ||
2357 | vma->vm_mm = mm; | 2412 | vma->vm_mm = mm; |
2358 | vma->vm_start = addr; | 2413 | vma->vm_start = addr; |
2359 | vma->vm_end = addr + len; | 2414 | vma->vm_end = addr + len; |
@@ -2454,6 +2509,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) | |||
2454 | int mm_take_all_locks(struct mm_struct *mm) | 2509 | int mm_take_all_locks(struct mm_struct *mm) |
2455 | { | 2510 | { |
2456 | struct vm_area_struct *vma; | 2511 | struct vm_area_struct *vma; |
2512 | struct anon_vma_chain *avc; | ||
2457 | int ret = -EINTR; | 2513 | int ret = -EINTR; |
2458 | 2514 | ||
2459 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | 2515 | BUG_ON(down_read_trylock(&mm->mmap_sem)); |
@@ -2471,7 +2527,8 @@ int mm_take_all_locks(struct mm_struct *mm) | |||
2471 | if (signal_pending(current)) | 2527 | if (signal_pending(current)) |
2472 | goto out_unlock; | 2528 | goto out_unlock; |
2473 | if (vma->anon_vma) | 2529 | if (vma->anon_vma) |
2474 | vm_lock_anon_vma(mm, vma->anon_vma); | 2530 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
2531 | vm_lock_anon_vma(mm, avc->anon_vma); | ||
2475 | } | 2532 | } |
2476 | 2533 | ||
2477 | ret = 0; | 2534 | ret = 0; |
@@ -2526,13 +2583,15 @@ static void vm_unlock_mapping(struct address_space *mapping) | |||
2526 | void mm_drop_all_locks(struct mm_struct *mm) | 2583 | void mm_drop_all_locks(struct mm_struct *mm) |
2527 | { | 2584 | { |
2528 | struct vm_area_struct *vma; | 2585 | struct vm_area_struct *vma; |
2586 | struct anon_vma_chain *avc; | ||
2529 | 2587 | ||
2530 | BUG_ON(down_read_trylock(&mm->mmap_sem)); | 2588 | BUG_ON(down_read_trylock(&mm->mmap_sem)); |
2531 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); | 2589 | BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); |
2532 | 2590 | ||
2533 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | 2591 | for (vma = mm->mmap; vma; vma = vma->vm_next) { |
2534 | if (vma->anon_vma) | 2592 | if (vma->anon_vma) |
2535 | vm_unlock_anon_vma(vma->anon_vma); | 2593 | list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) |
2594 | vm_unlock_anon_vma(avc->anon_vma); | ||
2536 | if (vma->vm_file && vma->vm_file->f_mapping) | 2595 | if (vma->vm_file && vma->vm_file->f_mapping) |
2537 | vm_unlock_mapping(vma->vm_file->f_mapping); | 2596 | vm_unlock_mapping(vma->vm_file->f_mapping); |
2538 | } | 2597 | } |
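
The locking loops above no longer take a single vma->anon_vma: each VMA now carries a list of anon_vma_chain entries, and every entry sits on both the VMA's own list (same_vma, walked here) and the anon_vma's list of mappers (same_anon_vma, walked by rmap code such as the memory-failure hunk earlier). A stripped-down model of that double linkage, using plain next pointers instead of the kernel's embedded list_heads:

    #include <stdio.h>
    #include <stdlib.h>

    struct anon_vma;
    struct vma;

    /* One entry ties a single vma to a single anon_vma, like anon_vma_chain. */
    struct chain {
        struct vma *vma;
        struct anon_vma *anon_vma;
        struct chain *next_same_vma;        /* the vma's own list */
        struct chain *next_same_anon_vma;   /* the anon_vma's list of mappers */
    };

    struct vma      { const char *name; struct chain *chain; };
    struct anon_vma { const char *name; struct chain *head;  };

    static void add_chain(struct vma *vma, struct anon_vma *av)
    {
        struct chain *c = calloc(1, sizeof(*c));

        c->vma = vma;
        c->anon_vma = av;
        c->next_same_vma = vma->chain;      vma->chain = c;
        c->next_same_anon_vma = av->head;   av->head = c;
    }

    int main(void)
    {
        struct anon_vma parent = { "parent_av", NULL };
        struct anon_vma child  = { "child_av",  NULL };
        struct vma v = { "child vma", NULL };

        /* a forked child's vma stays linked to the parent's anon_vma too */
        add_chain(&v, &parent);
        add_chain(&v, &child);

        for (struct chain *c = v.chain; c; c = c->next_same_vma)
            printf("%s would lock %s\n", v.name, c->anon_vma->name);
        return 0;
    }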
diff --git a/mm/mmu_context.c b/mm/mmu_context.c index ded9081f4021..0777654147c9 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c | |||
@@ -5,6 +5,7 @@ | |||
5 | 5 | ||
6 | #include <linux/mm.h> | 6 | #include <linux/mm.h> |
7 | #include <linux/mmu_context.h> | 7 | #include <linux/mmu_context.h> |
8 | #include <linux/module.h> | ||
8 | #include <linux/sched.h> | 9 | #include <linux/sched.h> |
9 | 10 | ||
10 | #include <asm/mmu_context.h> | 11 | #include <asm/mmu_context.h> |
@@ -37,6 +38,7 @@ void use_mm(struct mm_struct *mm) | |||
37 | if (active_mm != mm) | 38 | if (active_mm != mm) |
38 | mmdrop(active_mm); | 39 | mmdrop(active_mm); |
39 | } | 40 | } |
41 | EXPORT_SYMBOL_GPL(use_mm); | ||
40 | 42 | ||
41 | /* | 43 | /* |
42 | * unuse_mm | 44 | * unuse_mm |
@@ -56,3 +58,4 @@ void unuse_mm(struct mm_struct *mm) | |||
56 | enter_lazy_tlb(mm, tsk); | 58 | enter_lazy_tlb(mm, tsk); |
57 | task_unlock(tsk); | 59 | task_unlock(tsk); |
58 | } | 60 | } |
61 | EXPORT_SYMBOL_GPL(unuse_mm); | ||
diff --git a/mm/mremap.c b/mm/mremap.c index 845190898d59..e9c75efce609 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
@@ -285,7 +285,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, | |||
285 | if (vma->vm_flags & VM_LOCKED) { | 285 | if (vma->vm_flags & VM_LOCKED) { |
286 | unsigned long locked, lock_limit; | 286 | unsigned long locked, lock_limit; |
287 | locked = mm->locked_vm << PAGE_SHIFT; | 287 | locked = mm->locked_vm << PAGE_SHIFT; |
288 | lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; | 288 | lock_limit = rlimit(RLIMIT_MEMLOCK); |
289 | locked += new_len - old_len; | 289 | locked += new_len - old_len; |
290 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) | 290 | if (locked > lock_limit && !capable(CAP_IPC_LOCK)) |
291 | goto Eagain; | 291 | goto Eagain; |
@@ -460,8 +460,11 @@ unsigned long do_mremap(unsigned long addr, | |||
460 | if (vma_expandable(vma, new_len - old_len)) { | 460 | if (vma_expandable(vma, new_len - old_len)) { |
461 | int pages = (new_len - old_len) >> PAGE_SHIFT; | 461 | int pages = (new_len - old_len) >> PAGE_SHIFT; |
462 | 462 | ||
463 | vma_adjust(vma, vma->vm_start, | 463 | if (vma_adjust(vma, vma->vm_start, addr + new_len, |
464 | addr + new_len, vma->vm_pgoff, NULL); | 464 | vma->vm_pgoff, NULL)) { |
465 | ret = -ENOMEM; | ||
466 | goto out; | ||
467 | } | ||
465 | 468 | ||
466 | mm->total_vm += pages; | 469 | mm->total_vm += pages; |
467 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); | 470 | vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); |
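The mremap hunks above replace the open-coded rlim[RLIMIT_MEMLOCK].rlim_cur read with the rlimit() accessor and propagate -ENOMEM when vma_adjust() fails. A small sketch of the same RLIMIT_MEMLOCK check written as a standalone helper (the function name is illustrative):

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/capability.h>

/* Illustrative helper mirroring the check above: may the current task
 * grow its mlocked footprint by 'extra' bytes? */
static int may_grow_locked_vm(struct mm_struct *mm, unsigned long extra)
{
        unsigned long locked = (mm->locked_vm << PAGE_SHIFT) + extra;

        if (locked > rlimit(RLIMIT_MEMLOCK) && !capable(CAP_IPC_LOCK))
                return -EAGAIN;

        return 0;
}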
diff --git a/mm/nommu.c b/mm/nommu.c index 48a2ecfaf059..605ace8982a8 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -146,7 +146,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
146 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); | 146 | (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); |
147 | 147 | ||
148 | for (i = 0; i < nr_pages; i++) { | 148 | for (i = 0; i < nr_pages; i++) { |
149 | vma = find_vma(mm, start); | 149 | vma = find_extend_vma(mm, start); |
150 | if (!vma) | 150 | if (!vma) |
151 | goto finish_or_fault; | 151 | goto finish_or_fault; |
152 | 152 | ||
@@ -764,7 +764,7 @@ EXPORT_SYMBOL(find_vma); | |||
764 | */ | 764 | */ |
765 | struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) | 765 | struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) |
766 | { | 766 | { |
767 | return find_vma(mm, addr); | 767 | return find_vma(mm, addr & PAGE_MASK); |
768 | } | 768 | } |
769 | 769 | ||
770 | /* | 770 | /* |
@@ -1209,7 +1209,7 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1209 | region->vm_flags = vm_flags; | 1209 | region->vm_flags = vm_flags; |
1210 | region->vm_pgoff = pgoff; | 1210 | region->vm_pgoff = pgoff; |
1211 | 1211 | ||
1212 | INIT_LIST_HEAD(&vma->anon_vma_node); | 1212 | INIT_LIST_HEAD(&vma->anon_vma_chain); |
1213 | vma->vm_flags = vm_flags; | 1213 | vma->vm_flags = vm_flags; |
1214 | vma->vm_pgoff = pgoff; | 1214 | vma->vm_pgoff = pgoff; |
1215 | 1215 | ||
@@ -1428,6 +1428,30 @@ out: | |||
1428 | return retval; | 1428 | return retval; |
1429 | } | 1429 | } |
1430 | 1430 | ||
1431 | #ifdef __ARCH_WANT_SYS_OLD_MMAP | ||
1432 | struct mmap_arg_struct { | ||
1433 | unsigned long addr; | ||
1434 | unsigned long len; | ||
1435 | unsigned long prot; | ||
1436 | unsigned long flags; | ||
1437 | unsigned long fd; | ||
1438 | unsigned long offset; | ||
1439 | }; | ||
1440 | |||
1441 | SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) | ||
1442 | { | ||
1443 | struct mmap_arg_struct a; | ||
1444 | |||
1445 | if (copy_from_user(&a, arg, sizeof(a))) | ||
1446 | return -EFAULT; | ||
1447 | if (a.offset & ~PAGE_MASK) | ||
1448 | return -EINVAL; | ||
1449 | |||
1450 | return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, | ||
1451 | a.offset >> PAGE_SHIFT); | ||
1452 | } | ||
1453 | #endif /* __ARCH_WANT_SYS_OLD_MMAP */ | ||
1454 | |||
1431 | /* | 1455 | /* |
1432 | * split a vma into two pieces at address 'addr', a new vma is allocated either | 1456 | * split a vma into two pieces at address 'addr', a new vma is allocated either |
1433 | * for the first part or the tail. | 1457 | * for the first part or the tail. |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 237050478f28..9b223af6a147 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -401,8 +401,8 @@ static void __oom_kill_task(struct task_struct *p, int verbose) | |||
401 | "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", | 401 | "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", |
402 | task_pid_nr(p), p->comm, | 402 | task_pid_nr(p), p->comm, |
403 | K(p->mm->total_vm), | 403 | K(p->mm->total_vm), |
404 | K(get_mm_counter(p->mm, anon_rss)), | 404 | K(get_mm_counter(p->mm, MM_ANONPAGES)), |
405 | K(get_mm_counter(p->mm, file_rss))); | 405 | K(get_mm_counter(p->mm, MM_FILEPAGES))); |
406 | task_unlock(p); | 406 | task_unlock(p); |
407 | 407 | ||
408 | /* | 408 | /* |
@@ -473,6 +473,8 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) | |||
473 | unsigned long points = 0; | 473 | unsigned long points = 0; |
474 | struct task_struct *p; | 474 | struct task_struct *p; |
475 | 475 | ||
476 | if (sysctl_panic_on_oom == 2) | ||
477 | panic("out of memory(memcg). panic_on_oom is selected.\n"); | ||
476 | read_lock(&tasklist_lock); | 478 | read_lock(&tasklist_lock); |
477 | retry: | 479 | retry: |
478 | p = select_bad_process(&points, mem); | 480 | p = select_bad_process(&points, mem); |
@@ -601,13 +603,6 @@ void pagefault_out_of_memory(void) | |||
601 | /* Got some memory back in the last second. */ | 603 | /* Got some memory back in the last second. */ |
602 | return; | 604 | return; |
603 | 605 | ||
604 | /* | ||
605 | * If this is from memcg, oom-killer is already invoked. | ||
606 | * and not worth to go system-wide-oom. | ||
607 | */ | ||
608 | if (mem_cgroup_oom_called(current)) | ||
609 | goto rest_and_return; | ||
610 | |||
611 | if (sysctl_panic_on_oom) | 606 | if (sysctl_panic_on_oom) |
612 | panic("out of memory from page fault. panic_on_oom is selected.\n"); | 607 | panic("out of memory from page fault. panic_on_oom is selected.\n"); |
613 | 608 | ||
@@ -619,7 +614,6 @@ void pagefault_out_of_memory(void) | |||
619 | * Give "p" a good chance of killing itself before we | 614 | * Give "p" a good chance of killing itself before we |
620 | * retry to allocate memory. | 615 | * retry to allocate memory. |
621 | */ | 616 | */ |
622 | rest_and_return: | ||
623 | if (!test_thread_flag(TIF_MEMDIE)) | 617 | if (!test_thread_flag(TIF_MEMDIE)) |
624 | schedule_timeout_uninterruptible(1); | 618 | schedule_timeout_uninterruptible(1); |
625 | } | 619 | } |
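The oom-killer hunks reflect the switch from the old anon_rss/file_rss fields to indexed mm counters (MM_ANONPAGES and MM_FILEPAGES here; the rmap hunks later also use the new MM_SWAPENTS). A minimal sketch of reading them through get_mm_counter(); the helper is hypothetical and mirrors what get_mm_rss() provides:

#include <linux/mm.h>
#include <linux/mm_types.h>

/* Hypothetical helper: resident set size in pages, built from the same
 * indexed counters the oom report above now prints. */
static unsigned long mm_rss_pages(struct mm_struct *mm)
{
        return get_mm_counter(mm, MM_ANONPAGES) +
               get_mm_counter(mm, MM_FILEPAGES);
}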
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8deb9d0fd5b1..d03c946d5566 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -50,6 +50,7 @@ | |||
50 | #include <linux/kmemleak.h> | 50 | #include <linux/kmemleak.h> |
51 | #include <linux/memory.h> | 51 | #include <linux/memory.h> |
52 | #include <trace/events/kmem.h> | 52 | #include <trace/events/kmem.h> |
53 | #include <linux/ftrace_event.h> | ||
53 | 54 | ||
54 | #include <asm/tlbflush.h> | 55 | #include <asm/tlbflush.h> |
55 | #include <asm/div64.h> | 56 | #include <asm/div64.h> |
@@ -76,6 +77,31 @@ unsigned long totalreserve_pages __read_mostly; | |||
76 | int percpu_pagelist_fraction; | 77 | int percpu_pagelist_fraction; |
77 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; | 78 | gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; |
78 | 79 | ||
80 | #ifdef CONFIG_PM_SLEEP | ||
81 | /* | ||
82 | * The following functions are used by the suspend/hibernate code to temporarily | ||
83 | * change gfp_allowed_mask in order to avoid using I/O during memory allocations | ||
84 | * while devices are suspended. To avoid races with the suspend/hibernate code, | ||
85 | * they should always be called with pm_mutex held (gfp_allowed_mask also should | ||
86 | * only be modified with pm_mutex held, unless the suspend/hibernate code is | ||
87 | * guaranteed not to run in parallel with that modification). | ||
88 | */ | ||
89 | void set_gfp_allowed_mask(gfp_t mask) | ||
90 | { | ||
91 | WARN_ON(!mutex_is_locked(&pm_mutex)); | ||
92 | gfp_allowed_mask = mask; | ||
93 | } | ||
94 | |||
95 | gfp_t clear_gfp_allowed_mask(gfp_t mask) | ||
96 | { | ||
97 | gfp_t ret = gfp_allowed_mask; | ||
98 | |||
99 | WARN_ON(!mutex_is_locked(&pm_mutex)); | ||
100 | gfp_allowed_mask &= ~mask; | ||
101 | return ret; | ||
102 | } | ||
103 | #endif /* CONFIG_PM_SLEEP */ | ||
104 | |||
79 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | 105 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE |
80 | int pageblock_order __read_mostly; | 106 | int pageblock_order __read_mostly; |
81 | #endif | 107 | #endif |
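The comment in the new CONFIG_PM_SLEEP block spells out the calling contract for set_gfp_allowed_mask()/clear_gfp_allowed_mask(). A sketch of how a suspend path might bracket the window in which devices are quiesced, masking off I/O and filesystem allocations and restoring the saved mask on resume; the function names are illustrative and both are assumed to run with pm_mutex held, as the comment requires:

#include <linux/gfp.h>

static gfp_t saved_gfp_mask;

/* Called with pm_mutex held, before devices are suspended. */
static void example_pm_restrict_gfp_mask(void)
{
        saved_gfp_mask = clear_gfp_allowed_mask(__GFP_IO | __GFP_FS);
}

/* Called with pm_mutex held, after devices are resumed. */
static void example_pm_restore_gfp_mask(void)
{
        set_gfp_allowed_mask(saved_gfp_mask);
}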
@@ -263,10 +289,7 @@ static void bad_page(struct page *page) | |||
263 | 289 | ||
264 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", | 290 | printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", |
265 | current->comm, page_to_pfn(page)); | 291 | current->comm, page_to_pfn(page)); |
266 | printk(KERN_ALERT | 292 | dump_page(page); |
267 | "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", | ||
268 | page, (void *)page->flags, page_count(page), | ||
269 | page_mapcount(page), page->mapping, page->index); | ||
270 | 293 | ||
271 | dump_stack(); | 294 | dump_stack(); |
272 | out: | 295 | out: |
@@ -530,7 +553,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, | |||
530 | int batch_free = 0; | 553 | int batch_free = 0; |
531 | 554 | ||
532 | spin_lock(&zone->lock); | 555 | spin_lock(&zone->lock); |
533 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 556 | zone->all_unreclaimable = 0; |
534 | zone->pages_scanned = 0; | 557 | zone->pages_scanned = 0; |
535 | 558 | ||
536 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); | 559 | __mod_zone_page_state(zone, NR_FREE_PAGES, count); |
@@ -568,7 +591,7 @@ static void free_one_page(struct zone *zone, struct page *page, int order, | |||
568 | int migratetype) | 591 | int migratetype) |
569 | { | 592 | { |
570 | spin_lock(&zone->lock); | 593 | spin_lock(&zone->lock); |
571 | zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); | 594 | zone->all_unreclaimable = 0; |
572 | zone->pages_scanned = 0; | 595 | zone->pages_scanned = 0; |
573 | 596 | ||
574 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); | 597 | __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); |
@@ -583,6 +606,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
583 | int bad = 0; | 606 | int bad = 0; |
584 | int wasMlocked = __TestClearPageMlocked(page); | 607 | int wasMlocked = __TestClearPageMlocked(page); |
585 | 608 | ||
609 | trace_mm_page_free_direct(page, order); | ||
586 | kmemcheck_free_shadow(page, order); | 610 | kmemcheck_free_shadow(page, order); |
587 | 611 | ||
588 | for (i = 0 ; i < (1 << order) ; ++i) | 612 | for (i = 0 ; i < (1 << order) ; ++i) |
@@ -1009,10 +1033,10 @@ static void drain_pages(unsigned int cpu) | |||
1009 | struct per_cpu_pageset *pset; | 1033 | struct per_cpu_pageset *pset; |
1010 | struct per_cpu_pages *pcp; | 1034 | struct per_cpu_pages *pcp; |
1011 | 1035 | ||
1012 | pset = zone_pcp(zone, cpu); | 1036 | local_irq_save(flags); |
1037 | pset = per_cpu_ptr(zone->pageset, cpu); | ||
1013 | 1038 | ||
1014 | pcp = &pset->pcp; | 1039 | pcp = &pset->pcp; |
1015 | local_irq_save(flags); | ||
1016 | free_pcppages_bulk(zone, pcp->count, pcp); | 1040 | free_pcppages_bulk(zone, pcp->count, pcp); |
1017 | pcp->count = 0; | 1041 | pcp->count = 0; |
1018 | local_irq_restore(flags); | 1042 | local_irq_restore(flags); |
@@ -1073,8 +1097,9 @@ void mark_free_pages(struct zone *zone) | |||
1073 | 1097 | ||
1074 | /* | 1098 | /* |
1075 | * Free a 0-order page | 1099 | * Free a 0-order page |
1100 | * cold == 1 ? free a cold page : free a hot page | ||
1076 | */ | 1101 | */ |
1077 | static void free_hot_cold_page(struct page *page, int cold) | 1102 | void free_hot_cold_page(struct page *page, int cold) |
1078 | { | 1103 | { |
1079 | struct zone *zone = page_zone(page); | 1104 | struct zone *zone = page_zone(page); |
1080 | struct per_cpu_pages *pcp; | 1105 | struct per_cpu_pages *pcp; |
@@ -1082,6 +1107,7 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
1082 | int migratetype; | 1107 | int migratetype; |
1083 | int wasMlocked = __TestClearPageMlocked(page); | 1108 | int wasMlocked = __TestClearPageMlocked(page); |
1084 | 1109 | ||
1110 | trace_mm_page_free_direct(page, 0); | ||
1085 | kmemcheck_free_shadow(page, 0); | 1111 | kmemcheck_free_shadow(page, 0); |
1086 | 1112 | ||
1087 | if (PageAnon(page)) | 1113 | if (PageAnon(page)) |
@@ -1096,7 +1122,6 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
1096 | arch_free_page(page, 0); | 1122 | arch_free_page(page, 0); |
1097 | kernel_map_pages(page, 1, 0); | 1123 | kernel_map_pages(page, 1, 0); |
1098 | 1124 | ||
1099 | pcp = &zone_pcp(zone, get_cpu())->pcp; | ||
1100 | migratetype = get_pageblock_migratetype(page); | 1125 | migratetype = get_pageblock_migratetype(page); |
1101 | set_page_private(page, migratetype); | 1126 | set_page_private(page, migratetype); |
1102 | local_irq_save(flags); | 1127 | local_irq_save(flags); |
@@ -1119,6 +1144,7 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
1119 | migratetype = MIGRATE_MOVABLE; | 1144 | migratetype = MIGRATE_MOVABLE; |
1120 | } | 1145 | } |
1121 | 1146 | ||
1147 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | ||
1122 | if (cold) | 1148 | if (cold) |
1123 | list_add_tail(&page->lru, &pcp->lists[migratetype]); | 1149 | list_add_tail(&page->lru, &pcp->lists[migratetype]); |
1124 | else | 1150 | else |
@@ -1131,15 +1157,8 @@ static void free_hot_cold_page(struct page *page, int cold) | |||
1131 | 1157 | ||
1132 | out: | 1158 | out: |
1133 | local_irq_restore(flags); | 1159 | local_irq_restore(flags); |
1134 | put_cpu(); | ||
1135 | } | 1160 | } |
1136 | 1161 | ||
1137 | void free_hot_page(struct page *page) | ||
1138 | { | ||
1139 | trace_mm_page_free_direct(page, 0); | ||
1140 | free_hot_cold_page(page, 0); | ||
1141 | } | ||
1142 | |||
1143 | /* | 1162 | /* |
1144 | * split_page takes a non-compound higher-order page, and splits it into | 1163 | * split_page takes a non-compound higher-order page, and splits it into |
1145 | * n (1<<order) sub-pages: page[0..n] | 1164 | * n (1<<order) sub-pages: page[0..n] |
@@ -1181,17 +1200,15 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, | |||
1181 | unsigned long flags; | 1200 | unsigned long flags; |
1182 | struct page *page; | 1201 | struct page *page; |
1183 | int cold = !!(gfp_flags & __GFP_COLD); | 1202 | int cold = !!(gfp_flags & __GFP_COLD); |
1184 | int cpu; | ||
1185 | 1203 | ||
1186 | again: | 1204 | again: |
1187 | cpu = get_cpu(); | ||
1188 | if (likely(order == 0)) { | 1205 | if (likely(order == 0)) { |
1189 | struct per_cpu_pages *pcp; | 1206 | struct per_cpu_pages *pcp; |
1190 | struct list_head *list; | 1207 | struct list_head *list; |
1191 | 1208 | ||
1192 | pcp = &zone_pcp(zone, cpu)->pcp; | ||
1193 | list = &pcp->lists[migratetype]; | ||
1194 | local_irq_save(flags); | 1209 | local_irq_save(flags); |
1210 | pcp = &this_cpu_ptr(zone->pageset)->pcp; | ||
1211 | list = &pcp->lists[migratetype]; | ||
1195 | if (list_empty(list)) { | 1212 | if (list_empty(list)) { |
1196 | pcp->count += rmqueue_bulk(zone, 0, | 1213 | pcp->count += rmqueue_bulk(zone, 0, |
1197 | pcp->batch, list, | 1214 | pcp->batch, list, |
@@ -1232,7 +1249,6 @@ again: | |||
1232 | __count_zone_vm_events(PGALLOC, zone, 1 << order); | 1249 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
1233 | zone_statistics(preferred_zone, zone); | 1250 | zone_statistics(preferred_zone, zone); |
1234 | local_irq_restore(flags); | 1251 | local_irq_restore(flags); |
1235 | put_cpu(); | ||
1236 | 1252 | ||
1237 | VM_BUG_ON(bad_range(zone, page)); | 1253 | VM_BUG_ON(bad_range(zone, page)); |
1238 | if (prep_new_page(page, order, gfp_flags)) | 1254 | if (prep_new_page(page, order, gfp_flags)) |
@@ -1241,7 +1257,6 @@ again: | |||
1241 | 1257 | ||
1242 | failed: | 1258 | failed: |
1243 | local_irq_restore(flags); | 1259 | local_irq_restore(flags); |
1244 | put_cpu(); | ||
1245 | return NULL; | 1260 | return NULL; |
1246 | } | 1261 | } |
1247 | 1262 | ||
@@ -2013,9 +2028,8 @@ void __pagevec_free(struct pagevec *pvec) | |||
2013 | void __free_pages(struct page *page, unsigned int order) | 2028 | void __free_pages(struct page *page, unsigned int order) |
2014 | { | 2029 | { |
2015 | if (put_page_testzero(page)) { | 2030 | if (put_page_testzero(page)) { |
2016 | trace_mm_page_free_direct(page, order); | ||
2017 | if (order == 0) | 2031 | if (order == 0) |
2018 | free_hot_page(page); | 2032 | free_hot_cold_page(page, 0); |
2019 | else | 2033 | else |
2020 | __free_pages_ok(page, order); | 2034 | __free_pages_ok(page, order); |
2021 | } | 2035 | } |
@@ -2180,7 +2194,7 @@ void show_free_areas(void) | |||
2180 | for_each_online_cpu(cpu) { | 2194 | for_each_online_cpu(cpu) { |
2181 | struct per_cpu_pageset *pageset; | 2195 | struct per_cpu_pageset *pageset; |
2182 | 2196 | ||
2183 | pageset = zone_pcp(zone, cpu); | 2197 | pageset = per_cpu_ptr(zone->pageset, cpu); |
2184 | 2198 | ||
2185 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", | 2199 | printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", |
2186 | cpu, pageset->pcp.high, | 2200 | cpu, pageset->pcp.high, |
@@ -2271,7 +2285,7 @@ void show_free_areas(void) | |||
2271 | K(zone_page_state(zone, NR_BOUNCE)), | 2285 | K(zone_page_state(zone, NR_BOUNCE)), |
2272 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), | 2286 | K(zone_page_state(zone, NR_WRITEBACK_TEMP)), |
2273 | zone->pages_scanned, | 2287 | zone->pages_scanned, |
2274 | (zone_is_all_unreclaimable(zone) ? "yes" : "no") | 2288 | (zone->all_unreclaimable ? "yes" : "no") |
2275 | ); | 2289 | ); |
2276 | printk("lowmem_reserve[]:"); | 2290 | printk("lowmem_reserve[]:"); |
2277 | for (i = 0; i < MAX_NR_ZONES; i++) | 2291 | for (i = 0; i < MAX_NR_ZONES; i++) |
@@ -2745,10 +2759,29 @@ static void build_zonelist_cache(pg_data_t *pgdat) | |||
2745 | 2759 | ||
2746 | #endif /* CONFIG_NUMA */ | 2760 | #endif /* CONFIG_NUMA */ |
2747 | 2761 | ||
2762 | /* | ||
2763 | * Boot pageset table. One per cpu which is going to be used for all | ||
2764 | * zones and all nodes. The parameters will be set in such a way | ||
2765 | * that an item put on a list will immediately be handed over to | ||
2766 | * the buddy list. This is safe since pageset manipulation is done | ||
2767 | * with interrupts disabled. | ||
2768 | * | ||
2769 | * The boot_pagesets must be kept even after bootup is complete for | ||
2770 | * unused processors and/or zones. They do play a role for bootstrapping | ||
2771 | * hotplugged processors. | ||
2772 | * | ||
2773 | * zoneinfo_show() and maybe other functions do | ||
2774 | * not check if the processor is online before following the pageset pointer. | ||
2775 | * Other parts of the kernel may not check if the zone is available. | ||
2776 | */ | ||
2777 | static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); | ||
2778 | static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); | ||
2779 | |||
2748 | /* return values int ....just for stop_machine() */ | 2780 | /* return values int ....just for stop_machine() */ |
2749 | static int __build_all_zonelists(void *dummy) | 2781 | static int __build_all_zonelists(void *dummy) |
2750 | { | 2782 | { |
2751 | int nid; | 2783 | int nid; |
2784 | int cpu; | ||
2752 | 2785 | ||
2753 | #ifdef CONFIG_NUMA | 2786 | #ifdef CONFIG_NUMA |
2754 | memset(node_load, 0, sizeof(node_load)); | 2787 | memset(node_load, 0, sizeof(node_load)); |
@@ -2759,6 +2792,23 @@ static int __build_all_zonelists(void *dummy) | |||
2759 | build_zonelists(pgdat); | 2792 | build_zonelists(pgdat); |
2760 | build_zonelist_cache(pgdat); | 2793 | build_zonelist_cache(pgdat); |
2761 | } | 2794 | } |
2795 | |||
2796 | /* | ||
2797 | * Initialize the boot_pagesets that are going to be used | ||
2798 | * for bootstrapping processors. The real pagesets for | ||
2799 | * each zone will be allocated later when the per cpu | ||
2800 | * allocator is available. | ||
2801 | * | ||
2802 | * boot_pagesets are used also for bootstrapping offline | ||
2803 | * cpus if the system is already booted because the pagesets | ||
2804 | * are needed to initialize allocators on a specific cpu too. | ||
2805 | * F.e. the percpu allocator needs the page allocator which | ||
2806 | * needs the percpu allocator in order to allocate its pagesets | ||
2807 | * (a chicken-egg dilemma). | ||
2808 | */ | ||
2809 | for_each_possible_cpu(cpu) | ||
2810 | setup_pageset(&per_cpu(boot_pageset, cpu), 0); | ||
2811 | |||
2762 | return 0; | 2812 | return 0; |
2763 | } | 2813 | } |
2764 | 2814 | ||
@@ -3096,121 +3146,33 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, | |||
3096 | pcp->batch = PAGE_SHIFT * 8; | 3146 | pcp->batch = PAGE_SHIFT * 8; |
3097 | } | 3147 | } |
3098 | 3148 | ||
3099 | |||
3100 | #ifdef CONFIG_NUMA | ||
3101 | /* | ||
3102 | * Boot pageset table. One per cpu which is going to be used for all | ||
3103 | * zones and all nodes. The parameters will be set in such a way | ||
3104 | * that an item put on a list will immediately be handed over to | ||
3105 | * the buddy list. This is safe since pageset manipulation is done | ||
3106 | * with interrupts disabled. | ||
3107 | * | ||
3108 | * Some NUMA counter updates may also be caught by the boot pagesets. | ||
3109 | * | ||
3110 | * The boot_pagesets must be kept even after bootup is complete for | ||
3111 | * unused processors and/or zones. They do play a role for bootstrapping | ||
3112 | * hotplugged processors. | ||
3113 | * | ||
3114 | * zoneinfo_show() and maybe other functions do | ||
3115 | * not check if the processor is online before following the pageset pointer. | ||
3116 | * Other parts of the kernel may not check if the zone is available. | ||
3117 | */ | ||
3118 | static struct per_cpu_pageset boot_pageset[NR_CPUS]; | ||
3119 | |||
3120 | /* | 3149 | /* |
3121 | * Dynamically allocate memory for the | 3150 | * Allocate per cpu pagesets and initialize them. |
3122 | * per cpu pageset array in struct zone. | 3151 | * Before this call only boot pagesets were available. |
3152 | * Boot pagesets will no longer be used by this processor | ||
3153 | * after setup_per_cpu_pageset(). | ||
3123 | */ | 3154 | */ |
3124 | static int __cpuinit process_zones(int cpu) | 3155 | void __init setup_per_cpu_pageset(void) |
3125 | { | 3156 | { |
3126 | struct zone *zone, *dzone; | 3157 | struct zone *zone; |
3127 | int node = cpu_to_node(cpu); | 3158 | int cpu; |
3128 | |||
3129 | node_set_state(node, N_CPU); /* this node has a cpu */ | ||
3130 | 3159 | ||
3131 | for_each_populated_zone(zone) { | 3160 | for_each_populated_zone(zone) { |
3132 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 3161 | zone->pageset = alloc_percpu(struct per_cpu_pageset); |
3133 | GFP_KERNEL, node); | ||
3134 | if (!zone_pcp(zone, cpu)) | ||
3135 | goto bad; | ||
3136 | 3162 | ||
3137 | setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); | 3163 | for_each_possible_cpu(cpu) { |
3164 | struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); | ||
3138 | 3165 | ||
3139 | if (percpu_pagelist_fraction) | 3166 | setup_pageset(pcp, zone_batchsize(zone)); |
3140 | setup_pagelist_highmark(zone_pcp(zone, cpu), | ||
3141 | (zone->present_pages / percpu_pagelist_fraction)); | ||
3142 | } | ||
3143 | 3167 | ||
3144 | return 0; | 3168 | if (percpu_pagelist_fraction) |
3145 | bad: | 3169 | setup_pagelist_highmark(pcp, |
3146 | for_each_zone(dzone) { | 3170 | (zone->present_pages / |
3147 | if (!populated_zone(dzone)) | 3171 | percpu_pagelist_fraction)); |
3148 | continue; | 3172 | } |
3149 | if (dzone == zone) | ||
3150 | break; | ||
3151 | kfree(zone_pcp(dzone, cpu)); | ||
3152 | zone_pcp(dzone, cpu) = &boot_pageset[cpu]; | ||
3153 | } | ||
3154 | return -ENOMEM; | ||
3155 | } | ||
3156 | |||
3157 | static inline void free_zone_pagesets(int cpu) | ||
3158 | { | ||
3159 | struct zone *zone; | ||
3160 | |||
3161 | for_each_zone(zone) { | ||
3162 | struct per_cpu_pageset *pset = zone_pcp(zone, cpu); | ||
3163 | |||
3164 | /* Free per_cpu_pageset if it is slab allocated */ | ||
3165 | if (pset != &boot_pageset[cpu]) | ||
3166 | kfree(pset); | ||
3167 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; | ||
3168 | } | ||
3169 | } | ||
3170 | |||
3171 | static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, | ||
3172 | unsigned long action, | ||
3173 | void *hcpu) | ||
3174 | { | ||
3175 | int cpu = (long)hcpu; | ||
3176 | int ret = NOTIFY_OK; | ||
3177 | |||
3178 | switch (action) { | ||
3179 | case CPU_UP_PREPARE: | ||
3180 | case CPU_UP_PREPARE_FROZEN: | ||
3181 | if (process_zones(cpu)) | ||
3182 | ret = NOTIFY_BAD; | ||
3183 | break; | ||
3184 | case CPU_UP_CANCELED: | ||
3185 | case CPU_UP_CANCELED_FROZEN: | ||
3186 | case CPU_DEAD: | ||
3187 | case CPU_DEAD_FROZEN: | ||
3188 | free_zone_pagesets(cpu); | ||
3189 | break; | ||
3190 | default: | ||
3191 | break; | ||
3192 | } | 3173 | } |
3193 | return ret; | ||
3194 | } | 3174 | } |
3195 | 3175 | ||
3196 | static struct notifier_block __cpuinitdata pageset_notifier = | ||
3197 | { &pageset_cpuup_callback, NULL, 0 }; | ||
3198 | |||
3199 | void __init setup_per_cpu_pageset(void) | ||
3200 | { | ||
3201 | int err; | ||
3202 | |||
3203 | /* Initialize per_cpu_pageset for cpu 0. | ||
3204 | * A cpuup callback will do this for every cpu | ||
3205 | * as it comes online | ||
3206 | */ | ||
3207 | err = process_zones(smp_processor_id()); | ||
3208 | BUG_ON(err); | ||
3209 | register_cpu_notifier(&pageset_notifier); | ||
3210 | } | ||
3211 | |||
3212 | #endif | ||
3213 | |||
3214 | static noinline __init_refok | 3176 | static noinline __init_refok |
3215 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 3177 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
3216 | { | 3178 | { |
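With setup_per_cpu_pageset() above, each zone owns a single alloc_percpu() object instead of an NR_CPUS-sized array, so readers reach the per-cpu lists through per_cpu_ptr()/this_cpu_ptr(), as the surrounding hunks show. A small illustrative walk over the converted pagesets (the helper name is made up):

#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/percpu.h>

/* Illustrative: total pages currently sitting on per-cpu free lists,
 * read through the new zone->pageset percpu pointer. */
static unsigned long pcp_pages_total(void)
{
        struct zone *zone;
        unsigned long total = 0;
        int cpu;

        for_each_populated_zone(zone)
                for_each_online_cpu(cpu)
                        total += per_cpu_ptr(zone->pageset, cpu)->pcp.count;

        return total;
}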
@@ -3260,11 +3222,11 @@ static int __zone_pcp_update(void *data) | |||
3260 | int cpu; | 3222 | int cpu; |
3261 | unsigned long batch = zone_batchsize(zone), flags; | 3223 | unsigned long batch = zone_batchsize(zone), flags; |
3262 | 3224 | ||
3263 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 3225 | for_each_possible_cpu(cpu) { |
3264 | struct per_cpu_pageset *pset; | 3226 | struct per_cpu_pageset *pset; |
3265 | struct per_cpu_pages *pcp; | 3227 | struct per_cpu_pages *pcp; |
3266 | 3228 | ||
3267 | pset = zone_pcp(zone, cpu); | 3229 | pset = per_cpu_ptr(zone->pageset, cpu); |
3268 | pcp = &pset->pcp; | 3230 | pcp = &pset->pcp; |
3269 | 3231 | ||
3270 | local_irq_save(flags); | 3232 | local_irq_save(flags); |
@@ -3282,21 +3244,17 @@ void zone_pcp_update(struct zone *zone) | |||
3282 | 3244 | ||
3283 | static __meminit void zone_pcp_init(struct zone *zone) | 3245 | static __meminit void zone_pcp_init(struct zone *zone) |
3284 | { | 3246 | { |
3285 | int cpu; | 3247 | /* |
3286 | unsigned long batch = zone_batchsize(zone); | 3248 | * per cpu subsystem is not up at this point. The following code |
3249 | * relies on the ability of the linker to provide the | ||
3250 | * offset of a (static) per cpu variable into the per cpu area. | ||
3251 | */ | ||
3252 | zone->pageset = &boot_pageset; | ||
3287 | 3253 | ||
3288 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | ||
3289 | #ifdef CONFIG_NUMA | ||
3290 | /* Early boot. Slab allocator not functional yet */ | ||
3291 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; | ||
3292 | setup_pageset(&boot_pageset[cpu],0); | ||
3293 | #else | ||
3294 | setup_pageset(zone_pcp(zone,cpu), batch); | ||
3295 | #endif | ||
3296 | } | ||
3297 | if (zone->present_pages) | 3254 | if (zone->present_pages) |
3298 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", | 3255 | printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", |
3299 | zone->name, zone->present_pages, batch); | 3256 | zone->name, zone->present_pages, |
3257 | zone_batchsize(zone)); | ||
3300 | } | 3258 | } |
3301 | 3259 | ||
3302 | __meminit int init_currently_empty_zone(struct zone *zone, | 3260 | __meminit int init_currently_empty_zone(struct zone *zone, |
@@ -3435,6 +3393,61 @@ void __init free_bootmem_with_active_regions(int nid, | |||
3435 | } | 3393 | } |
3436 | } | 3394 | } |
3437 | 3395 | ||
3396 | int __init add_from_early_node_map(struct range *range, int az, | ||
3397 | int nr_range, int nid) | ||
3398 | { | ||
3399 | int i; | ||
3400 | u64 start, end; | ||
3401 | |||
3402 | /* need to go over early_node_map to find out good range for node */ | ||
3403 | for_each_active_range_index_in_nid(i, nid) { | ||
3404 | start = early_node_map[i].start_pfn; | ||
3405 | end = early_node_map[i].end_pfn; | ||
3406 | nr_range = add_range(range, az, nr_range, start, end); | ||
3407 | } | ||
3408 | return nr_range; | ||
3409 | } | ||
3410 | |||
3411 | #ifdef CONFIG_NO_BOOTMEM | ||
3412 | void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, | ||
3413 | u64 goal, u64 limit) | ||
3414 | { | ||
3415 | int i; | ||
3416 | void *ptr; | ||
3417 | |||
3418 | /* need to go over early_node_map to find out good range for node */ | ||
3419 | for_each_active_range_index_in_nid(i, nid) { | ||
3420 | u64 addr; | ||
3421 | u64 ei_start, ei_last; | ||
3422 | |||
3423 | ei_last = early_node_map[i].end_pfn; | ||
3424 | ei_last <<= PAGE_SHIFT; | ||
3425 | ei_start = early_node_map[i].start_pfn; | ||
3426 | ei_start <<= PAGE_SHIFT; | ||
3427 | addr = find_early_area(ei_start, ei_last, | ||
3428 | goal, limit, size, align); | ||
3429 | |||
3430 | if (addr == -1ULL) | ||
3431 | continue; | ||
3432 | |||
3433 | #if 0 | ||
3434 | printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n", | ||
3435 | nid, | ||
3436 | ei_start, ei_last, goal, limit, size, | ||
3437 | align, addr); | ||
3438 | #endif | ||
3439 | |||
3440 | ptr = phys_to_virt(addr); | ||
3441 | memset(ptr, 0, size); | ||
3442 | reserve_early_without_check(addr, addr + size, "BOOTMEM"); | ||
3443 | return ptr; | ||
3444 | } | ||
3445 | |||
3446 | return NULL; | ||
3447 | } | ||
3448 | #endif | ||
3449 | |||
3450 | |||
3438 | void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) | 3451 | void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) |
3439 | { | 3452 | { |
3440 | int i; | 3453 | int i; |
@@ -4377,8 +4390,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4377 | for (i = 0; i < MAX_NR_ZONES; i++) { | 4390 | for (i = 0; i < MAX_NR_ZONES; i++) { |
4378 | if (i == ZONE_MOVABLE) | 4391 | if (i == ZONE_MOVABLE) |
4379 | continue; | 4392 | continue; |
4380 | printk(" %-8s %0#10lx -> %0#10lx\n", | 4393 | printk(" %-8s ", zone_names[i]); |
4381 | zone_names[i], | 4394 | if (arch_zone_lowest_possible_pfn[i] == |
4395 | arch_zone_highest_possible_pfn[i]) | ||
4396 | printk("empty\n"); | ||
4397 | else | ||
4398 | printk("%0#10lx -> %0#10lx\n", | ||
4382 | arch_zone_lowest_possible_pfn[i], | 4399 | arch_zone_lowest_possible_pfn[i], |
4383 | arch_zone_highest_possible_pfn[i]); | 4400 | arch_zone_highest_possible_pfn[i]); |
4384 | } | 4401 | } |
@@ -4467,7 +4484,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) | |||
4467 | } | 4484 | } |
4468 | 4485 | ||
4469 | #ifndef CONFIG_NEED_MULTIPLE_NODES | 4486 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
4470 | struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; | 4487 | struct pglist_data __refdata contig_page_data = { |
4488 | #ifndef CONFIG_NO_BOOTMEM | ||
4489 | .bdata = &bootmem_node_data[0] | ||
4490 | #endif | ||
4491 | }; | ||
4471 | EXPORT_SYMBOL(contig_page_data); | 4492 | EXPORT_SYMBOL(contig_page_data); |
4472 | #endif | 4493 | #endif |
4473 | 4494 | ||
@@ -4810,10 +4831,11 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | |||
4810 | if (!write || (ret == -EINVAL)) | 4831 | if (!write || (ret == -EINVAL)) |
4811 | return ret; | 4832 | return ret; |
4812 | for_each_populated_zone(zone) { | 4833 | for_each_populated_zone(zone) { |
4813 | for_each_online_cpu(cpu) { | 4834 | for_each_possible_cpu(cpu) { |
4814 | unsigned long high; | 4835 | unsigned long high; |
4815 | high = zone->present_pages / percpu_pagelist_fraction; | 4836 | high = zone->present_pages / percpu_pagelist_fraction; |
4816 | setup_pagelist_highmark(zone_pcp(zone, cpu), high); | 4837 | setup_pagelist_highmark( |
4838 | per_cpu_ptr(zone->pageset, cpu), high); | ||
4817 | } | 4839 | } |
4818 | } | 4840 | } |
4819 | return 0; | 4841 | return 0; |
@@ -5159,3 +5181,80 @@ bool is_free_buddy_page(struct page *page) | |||
5159 | return order < MAX_ORDER; | 5181 | return order < MAX_ORDER; |
5160 | } | 5182 | } |
5161 | #endif | 5183 | #endif |
5184 | |||
5185 | static struct trace_print_flags pageflag_names[] = { | ||
5186 | {1UL << PG_locked, "locked" }, | ||
5187 | {1UL << PG_error, "error" }, | ||
5188 | {1UL << PG_referenced, "referenced" }, | ||
5189 | {1UL << PG_uptodate, "uptodate" }, | ||
5190 | {1UL << PG_dirty, "dirty" }, | ||
5191 | {1UL << PG_lru, "lru" }, | ||
5192 | {1UL << PG_active, "active" }, | ||
5193 | {1UL << PG_slab, "slab" }, | ||
5194 | {1UL << PG_owner_priv_1, "owner_priv_1" }, | ||
5195 | {1UL << PG_arch_1, "arch_1" }, | ||
5196 | {1UL << PG_reserved, "reserved" }, | ||
5197 | {1UL << PG_private, "private" }, | ||
5198 | {1UL << PG_private_2, "private_2" }, | ||
5199 | {1UL << PG_writeback, "writeback" }, | ||
5200 | #ifdef CONFIG_PAGEFLAGS_EXTENDED | ||
5201 | {1UL << PG_head, "head" }, | ||
5202 | {1UL << PG_tail, "tail" }, | ||
5203 | #else | ||
5204 | {1UL << PG_compound, "compound" }, | ||
5205 | #endif | ||
5206 | {1UL << PG_swapcache, "swapcache" }, | ||
5207 | {1UL << PG_mappedtodisk, "mappedtodisk" }, | ||
5208 | {1UL << PG_reclaim, "reclaim" }, | ||
5209 | {1UL << PG_buddy, "buddy" }, | ||
5210 | {1UL << PG_swapbacked, "swapbacked" }, | ||
5211 | {1UL << PG_unevictable, "unevictable" }, | ||
5212 | #ifdef CONFIG_MMU | ||
5213 | {1UL << PG_mlocked, "mlocked" }, | ||
5214 | #endif | ||
5215 | #ifdef CONFIG_ARCH_USES_PG_UNCACHED | ||
5216 | {1UL << PG_uncached, "uncached" }, | ||
5217 | #endif | ||
5218 | #ifdef CONFIG_MEMORY_FAILURE | ||
5219 | {1UL << PG_hwpoison, "hwpoison" }, | ||
5220 | #endif | ||
5221 | {-1UL, NULL }, | ||
5222 | }; | ||
5223 | |||
5224 | static void dump_page_flags(unsigned long flags) | ||
5225 | { | ||
5226 | const char *delim = ""; | ||
5227 | unsigned long mask; | ||
5228 | int i; | ||
5229 | |||
5230 | printk(KERN_ALERT "page flags: %#lx(", flags); | ||
5231 | |||
5232 | /* remove zone id */ | ||
5233 | flags &= (1UL << NR_PAGEFLAGS) - 1; | ||
5234 | |||
5235 | for (i = 0; pageflag_names[i].name && flags; i++) { | ||
5236 | |||
5237 | mask = pageflag_names[i].mask; | ||
5238 | if ((flags & mask) != mask) | ||
5239 | continue; | ||
5240 | |||
5241 | flags &= ~mask; | ||
5242 | printk("%s%s", delim, pageflag_names[i].name); | ||
5243 | delim = "|"; | ||
5244 | } | ||
5245 | |||
5246 | /* check for left over flags */ | ||
5247 | if (flags) | ||
5248 | printk("%s%#lx", delim, flags); | ||
5249 | |||
5250 | printk(")\n"); | ||
5251 | } | ||
5252 | |||
5253 | void dump_page(struct page *page) | ||
5254 | { | ||
5255 | printk(KERN_ALERT | ||
5256 | "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", | ||
5257 | page, page_count(page), page_mapcount(page), | ||
5258 | page->mapping, page->index); | ||
5259 | dump_page_flags(page->flags); | ||
5260 | } | ||
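bad_page() above now delegates its diagnostics to the new dump_page()/dump_page_flags() helpers, which other debugging sites can reuse. A minimal sketch of such a caller; the check itself is hypothetical:

#include <linux/mm.h>
#include <linux/page-flags.h>

/* Hypothetical sanity check reusing dump_page() to report an
 * unexpected page state before warning. */
static void warn_if_reserved(struct page *page)
{
        if (unlikely(PageReserved(page))) {
                dump_page(page);
                WARN_ON(1);
        }
}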
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 3d535d594826..3dd88539a0e6 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c | |||
@@ -335,6 +335,37 @@ not_enough_page: | |||
335 | } | 335 | } |
336 | 336 | ||
337 | /** | 337 | /** |
338 | * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. | ||
339 | * @ent: swap entry to be cmpxchged | ||
340 | * @old: old id | ||
341 | * @new: new id | ||
342 | * | ||
343 | * Returns old id on success, 0 on failure. | ||
344 | * (There is no mem_cgroup using 0 as its id) | ||
345 | */ | ||
346 | unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, | ||
347 | unsigned short old, unsigned short new) | ||
348 | { | ||
349 | int type = swp_type(ent); | ||
350 | unsigned long offset = swp_offset(ent); | ||
351 | unsigned long idx = offset / SC_PER_PAGE; | ||
352 | unsigned long pos = offset & SC_POS_MASK; | ||
353 | struct swap_cgroup_ctrl *ctrl; | ||
354 | struct page *mappage; | ||
355 | struct swap_cgroup *sc; | ||
356 | |||
357 | ctrl = &swap_cgroup_ctrl[type]; | ||
358 | |||
359 | mappage = ctrl->map[idx]; | ||
360 | sc = page_address(mappage); | ||
361 | sc += pos; | ||
362 | if (cmpxchg(&sc->id, old, new) == old) | ||
363 | return old; | ||
364 | else | ||
365 | return 0; | ||
366 | } | ||
367 | |||
368 | /** | ||
338 | * swap_cgroup_record - record mem_cgroup for this swp_entry. | 369 | * swap_cgroup_record - record mem_cgroup for this swp_entry. |
339 | * @ent: swap entry to be recorded into | 370 | * @ent: swap entry to be recorded into |
340 | * @mem: mem_cgroup to be recorded | 371 | * @mem: mem_cgroup to be recorded |
@@ -358,8 +389,7 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) | |||
358 | mappage = ctrl->map[idx]; | 389 | mappage = ctrl->map[idx]; |
359 | sc = page_address(mappage); | 390 | sc = page_address(mappage); |
360 | sc += pos; | 391 | sc += pos; |
361 | old = sc->id; | 392 | old = xchg(&sc->id, id); |
362 | sc->id = id; | ||
363 | 393 | ||
364 | return old; | 394 | return old; |
365 | } | 395 | } |
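The page_cgroup hunks convert the swap-entry-to-memcg-id map to atomic xchg()/cmpxchg() updates and add swap_cgroup_cmpxchg(), so a caller can hand a swap charge from one cgroup to another only if the expected owner still holds it (the helper returns the old id on success and 0 on failure). A sketch of how a caller might wrap it; the wrapper below is hypothetical:

#include <linux/types.h>
#include <linux/swap.h>
#include <linux/page_cgroup.h>

/* Hypothetical wrapper: move the memcg id recorded for a swap entry
 * from 'from' to 'to', but only if 'from' still owns the entry. */
static bool try_move_swap_charge(swp_entry_t ent,
                                 unsigned short from, unsigned short to)
{
        return swap_cgroup_cmpxchg(ent, from, to) == from;
}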
diff --git a/mm/percpu.c b/mm/percpu.c index 083e7c91e5f6..768419d44ad7 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -80,13 +80,15 @@ | |||
80 | /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ | 80 | /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ |
81 | #ifndef __addr_to_pcpu_ptr | 81 | #ifndef __addr_to_pcpu_ptr |
82 | #define __addr_to_pcpu_ptr(addr) \ | 82 | #define __addr_to_pcpu_ptr(addr) \ |
83 | (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ | 83 | (void __percpu *)((unsigned long)(addr) - \ |
84 | + (unsigned long)__per_cpu_start) | 84 | (unsigned long)pcpu_base_addr + \ |
85 | (unsigned long)__per_cpu_start) | ||
85 | #endif | 86 | #endif |
86 | #ifndef __pcpu_ptr_to_addr | 87 | #ifndef __pcpu_ptr_to_addr |
87 | #define __pcpu_ptr_to_addr(ptr) \ | 88 | #define __pcpu_ptr_to_addr(ptr) \ |
88 | (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ | 89 | (void __force *)((unsigned long)(ptr) + \ |
89 | - (unsigned long)__per_cpu_start) | 90 | (unsigned long)pcpu_base_addr - \ |
91 | (unsigned long)__per_cpu_start) | ||
90 | #endif | 92 | #endif |
91 | 93 | ||
92 | struct pcpu_chunk { | 94 | struct pcpu_chunk { |
@@ -913,11 +915,10 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) | |||
913 | int rs, re; | 915 | int rs, re; |
914 | 916 | ||
915 | /* quick path, check whether it's empty already */ | 917 | /* quick path, check whether it's empty already */ |
916 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { | 918 | rs = page_start; |
917 | if (rs == page_start && re == page_end) | 919 | pcpu_next_unpop(chunk, &rs, &re, page_end); |
918 | return; | 920 | if (rs == page_start && re == page_end) |
919 | break; | 921 | return; |
920 | } | ||
921 | 922 | ||
922 | /* immutable chunks can't be depopulated */ | 923 | /* immutable chunks can't be depopulated */ |
923 | WARN_ON(chunk->immutable); | 924 | WARN_ON(chunk->immutable); |
@@ -968,11 +969,10 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) | |||
968 | int rs, re, rc; | 969 | int rs, re, rc; |
969 | 970 | ||
970 | /* quick path, check whether all pages are already there */ | 971 | /* quick path, check whether all pages are already there */ |
971 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) { | 972 | rs = page_start; |
972 | if (rs == page_start && re == page_end) | 973 | pcpu_next_pop(chunk, &rs, &re, page_end); |
973 | goto clear; | 974 | if (rs == page_start && re == page_end) |
974 | break; | 975 | goto clear; |
975 | } | ||
976 | 976 | ||
977 | /* need to allocate and map pages, this chunk can't be immutable */ | 977 | /* need to allocate and map pages, this chunk can't be immutable */ |
978 | WARN_ON(chunk->immutable); | 978 | WARN_ON(chunk->immutable); |
@@ -1067,7 +1067,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) | |||
1067 | * RETURNS: | 1067 | * RETURNS: |
1068 | * Percpu pointer to the allocated area on success, NULL on failure. | 1068 | * Percpu pointer to the allocated area on success, NULL on failure. |
1069 | */ | 1069 | */ |
1070 | static void *pcpu_alloc(size_t size, size_t align, bool reserved) | 1070 | static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) |
1071 | { | 1071 | { |
1072 | static int warn_limit = 10; | 1072 | static int warn_limit = 10; |
1073 | struct pcpu_chunk *chunk; | 1073 | struct pcpu_chunk *chunk; |
@@ -1196,7 +1196,7 @@ fail_unlock_mutex: | |||
1196 | * RETURNS: | 1196 | * RETURNS: |
1197 | * Percpu pointer to the allocated area on success, NULL on failure. | 1197 | * Percpu pointer to the allocated area on success, NULL on failure. |
1198 | */ | 1198 | */ |
1199 | void *__alloc_percpu(size_t size, size_t align) | 1199 | void __percpu *__alloc_percpu(size_t size, size_t align) |
1200 | { | 1200 | { |
1201 | return pcpu_alloc(size, align, false); | 1201 | return pcpu_alloc(size, align, false); |
1202 | } | 1202 | } |
@@ -1217,7 +1217,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); | |||
1217 | * RETURNS: | 1217 | * RETURNS: |
1218 | * Percpu pointer to the allocated area on success, NULL on failure. | 1218 | * Percpu pointer to the allocated area on success, NULL on failure. |
1219 | */ | 1219 | */ |
1220 | void *__alloc_reserved_percpu(size_t size, size_t align) | 1220 | void __percpu *__alloc_reserved_percpu(size_t size, size_t align) |
1221 | { | 1221 | { |
1222 | return pcpu_alloc(size, align, true); | 1222 | return pcpu_alloc(size, align, true); |
1223 | } | 1223 | } |
@@ -1269,7 +1269,7 @@ static void pcpu_reclaim(struct work_struct *work) | |||
1269 | * CONTEXT: | 1269 | * CONTEXT: |
1270 | * Can be called from atomic context. | 1270 | * Can be called from atomic context. |
1271 | */ | 1271 | */ |
1272 | void free_percpu(void *ptr) | 1272 | void free_percpu(void __percpu *ptr) |
1273 | { | 1273 | { |
1274 | void *addr; | 1274 | void *addr; |
1275 | struct pcpu_chunk *chunk; | 1275 | struct pcpu_chunk *chunk; |
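The percpu hunks add __percpu sparse annotations to the allocator's entry points, so dynamically allocated per-cpu pointers are address-space-checked and must be dereferenced through the accessor macros rather than directly. A minimal usage sketch; the counter is illustrative:

#include <linux/errno.h>
#include <linux/percpu.h>

static int __percpu *hit_count;         /* illustrative per-cpu counter */

static int hits_init(void)
{
        hit_count = alloc_percpu(int);
        return hit_count ? 0 : -ENOMEM;
}

static void hits_record(void)
{
        int *c = get_cpu_ptr(hit_count);        /* disables preemption */

        (*c)++;
        put_cpu_ptr(hit_count);
}

static void hits_exit(void)
{
        free_percpu(hit_count);
}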
diff --git a/mm/readahead.c b/mm/readahead.c index 033bc135a41f..337b20e946f6 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -501,6 +501,12 @@ void page_cache_sync_readahead(struct address_space *mapping, | |||
501 | if (!ra->ra_pages) | 501 | if (!ra->ra_pages) |
502 | return; | 502 | return; |
503 | 503 | ||
504 | /* be dumb */ | ||
505 | if (filp->f_mode & FMODE_RANDOM) { | ||
506 | force_page_cache_readahead(mapping, filp, offset, req_size); | ||
507 | return; | ||
508 | } | ||
509 | |||
504 | /* do read-ahead */ | 510 | /* do read-ahead */ |
505 | ondemand_readahead(mapping, ra, filp, false, offset, req_size); | 511 | ondemand_readahead(mapping, ra, filp, false, offset, req_size); |
506 | } | 512 | } |
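The readahead hunk short-circuits the ondemand heuristics for files flagged FMODE_RANDOM and issues a plain force_page_cache_readahead() instead. Assuming FMODE_RANDOM is set from posix_fadvise(POSIX_FADV_RANDOM), as the companion fadvise change in this series does, the userspace-visible knob looks like this sketch:

#include <fcntl.h>
#include <stdio.h>

/* Userspace sketch: mark a descriptor as random-access so the kernel
 * stops applying its readahead heuristics to it. */
int open_for_random_access(const char *path)
{
        int fd = open(path, O_RDONLY);

        if (fd >= 0 && posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM) != 0)
                perror("posix_fadvise");

        return fd;
}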
@@ -62,6 +62,7 @@ | |||
62 | #include "internal.h" | 62 | #include "internal.h" |
63 | 63 | ||
64 | static struct kmem_cache *anon_vma_cachep; | 64 | static struct kmem_cache *anon_vma_cachep; |
65 | static struct kmem_cache *anon_vma_chain_cachep; | ||
65 | 66 | ||
66 | static inline struct anon_vma *anon_vma_alloc(void) | 67 | static inline struct anon_vma *anon_vma_alloc(void) |
67 | { | 68 | { |
@@ -73,6 +74,16 @@ void anon_vma_free(struct anon_vma *anon_vma) | |||
73 | kmem_cache_free(anon_vma_cachep, anon_vma); | 74 | kmem_cache_free(anon_vma_cachep, anon_vma); |
74 | } | 75 | } |
75 | 76 | ||
77 | static inline struct anon_vma_chain *anon_vma_chain_alloc(void) | ||
78 | { | ||
79 | return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); | ||
80 | } | ||
81 | |||
82 | void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) | ||
83 | { | ||
84 | kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); | ||
85 | } | ||
86 | |||
76 | /** | 87 | /** |
77 | * anon_vma_prepare - attach an anon_vma to a memory region | 88 | * anon_vma_prepare - attach an anon_vma to a memory region |
78 | * @vma: the memory region in question | 89 | * @vma: the memory region in question |
@@ -103,18 +114,23 @@ void anon_vma_free(struct anon_vma *anon_vma) | |||
103 | int anon_vma_prepare(struct vm_area_struct *vma) | 114 | int anon_vma_prepare(struct vm_area_struct *vma) |
104 | { | 115 | { |
105 | struct anon_vma *anon_vma = vma->anon_vma; | 116 | struct anon_vma *anon_vma = vma->anon_vma; |
117 | struct anon_vma_chain *avc; | ||
106 | 118 | ||
107 | might_sleep(); | 119 | might_sleep(); |
108 | if (unlikely(!anon_vma)) { | 120 | if (unlikely(!anon_vma)) { |
109 | struct mm_struct *mm = vma->vm_mm; | 121 | struct mm_struct *mm = vma->vm_mm; |
110 | struct anon_vma *allocated; | 122 | struct anon_vma *allocated; |
111 | 123 | ||
124 | avc = anon_vma_chain_alloc(); | ||
125 | if (!avc) | ||
126 | goto out_enomem; | ||
127 | |||
112 | anon_vma = find_mergeable_anon_vma(vma); | 128 | anon_vma = find_mergeable_anon_vma(vma); |
113 | allocated = NULL; | 129 | allocated = NULL; |
114 | if (!anon_vma) { | 130 | if (!anon_vma) { |
115 | anon_vma = anon_vma_alloc(); | 131 | anon_vma = anon_vma_alloc(); |
116 | if (unlikely(!anon_vma)) | 132 | if (unlikely(!anon_vma)) |
117 | return -ENOMEM; | 133 | goto out_enomem_free_avc; |
118 | allocated = anon_vma; | 134 | allocated = anon_vma; |
119 | } | 135 | } |
120 | spin_lock(&anon_vma->lock); | 136 | spin_lock(&anon_vma->lock); |
@@ -123,53 +139,113 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
123 | spin_lock(&mm->page_table_lock); | 139 | spin_lock(&mm->page_table_lock); |
124 | if (likely(!vma->anon_vma)) { | 140 | if (likely(!vma->anon_vma)) { |
125 | vma->anon_vma = anon_vma; | 141 | vma->anon_vma = anon_vma; |
126 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 142 | avc->anon_vma = anon_vma; |
143 | avc->vma = vma; | ||
144 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
145 | list_add(&avc->same_anon_vma, &anon_vma->head); | ||
127 | allocated = NULL; | 146 | allocated = NULL; |
128 | } | 147 | } |
129 | spin_unlock(&mm->page_table_lock); | 148 | spin_unlock(&mm->page_table_lock); |
130 | 149 | ||
131 | spin_unlock(&anon_vma->lock); | 150 | spin_unlock(&anon_vma->lock); |
132 | if (unlikely(allocated)) | 151 | if (unlikely(allocated)) { |
133 | anon_vma_free(allocated); | 152 | anon_vma_free(allocated); |
153 | anon_vma_chain_free(avc); | ||
154 | } | ||
134 | } | 155 | } |
135 | return 0; | 156 | return 0; |
157 | |||
158 | out_enomem_free_avc: | ||
159 | anon_vma_chain_free(avc); | ||
160 | out_enomem: | ||
161 | return -ENOMEM; | ||
136 | } | 162 | } |
137 | 163 | ||
138 | void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) | 164 | static void anon_vma_chain_link(struct vm_area_struct *vma, |
165 | struct anon_vma_chain *avc, | ||
166 | struct anon_vma *anon_vma) | ||
139 | { | 167 | { |
140 | BUG_ON(vma->anon_vma != next->anon_vma); | 168 | avc->vma = vma; |
141 | list_del(&next->anon_vma_node); | 169 | avc->anon_vma = anon_vma; |
170 | list_add(&avc->same_vma, &vma->anon_vma_chain); | ||
171 | |||
172 | spin_lock(&anon_vma->lock); | ||
173 | list_add_tail(&avc->same_anon_vma, &anon_vma->head); | ||
174 | spin_unlock(&anon_vma->lock); | ||
142 | } | 175 | } |
143 | 176 | ||
144 | void __anon_vma_link(struct vm_area_struct *vma) | 177 | /* |
178 | * Attach the anon_vmas from src to dst. | ||
179 | * Returns 0 on success, -ENOMEM on failure. | ||
180 | */ | ||
181 | int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) | ||
145 | { | 182 | { |
146 | struct anon_vma *anon_vma = vma->anon_vma; | 183 | struct anon_vma_chain *avc, *pavc; |
147 | 184 | ||
148 | if (anon_vma) | 185 | list_for_each_entry(pavc, &src->anon_vma_chain, same_vma) { |
149 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 186 | avc = anon_vma_chain_alloc(); |
187 | if (!avc) | ||
188 | goto enomem_failure; | ||
189 | anon_vma_chain_link(dst, avc, pavc->anon_vma); | ||
190 | } | ||
191 | return 0; | ||
192 | |||
193 | enomem_failure: | ||
194 | unlink_anon_vmas(dst); | ||
195 | return -ENOMEM; | ||
150 | } | 196 | } |
151 | 197 | ||
152 | void anon_vma_link(struct vm_area_struct *vma) | 198 | /* |
199 | * Attach vma to its own anon_vma, as well as to the anon_vmas that | ||
200 | * the corresponding VMA in the parent process is attached to. | ||
201 | * Returns 0 on success, non-zero on failure. | ||
202 | */ | ||
203 | int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) | ||
153 | { | 204 | { |
154 | struct anon_vma *anon_vma = vma->anon_vma; | 205 | struct anon_vma_chain *avc; |
206 | struct anon_vma *anon_vma; | ||
155 | 207 | ||
156 | if (anon_vma) { | 208 | /* Don't bother if the parent process has no anon_vma here. */ |
157 | spin_lock(&anon_vma->lock); | 209 | if (!pvma->anon_vma) |
158 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); | 210 | return 0; |
159 | spin_unlock(&anon_vma->lock); | 211 | |
160 | } | 212 | /* |
213 | * First, attach the new VMA to the parent VMA's anon_vmas, | ||
214 | * so rmap can find non-COWed pages in child processes. | ||
215 | */ | ||
216 | if (anon_vma_clone(vma, pvma)) | ||
217 | return -ENOMEM; | ||
218 | |||
219 | /* Then add our own anon_vma. */ | ||
220 | anon_vma = anon_vma_alloc(); | ||
221 | if (!anon_vma) | ||
222 | goto out_error; | ||
223 | avc = anon_vma_chain_alloc(); | ||
224 | if (!avc) | ||
225 | goto out_error_free_anon_vma; | ||
226 | anon_vma_chain_link(vma, avc, anon_vma); | ||
227 | /* Mark this anon_vma as the one where our new (COWed) pages go. */ | ||
228 | vma->anon_vma = anon_vma; | ||
229 | |||
230 | return 0; | ||
231 | |||
232 | out_error_free_anon_vma: | ||
233 | anon_vma_free(anon_vma); | ||
234 | out_error: | ||
235 | return -ENOMEM; | ||
161 | } | 236 | } |
162 | 237 | ||
163 | void anon_vma_unlink(struct vm_area_struct *vma) | 238 | static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) |
164 | { | 239 | { |
165 | struct anon_vma *anon_vma = vma->anon_vma; | 240 | struct anon_vma *anon_vma = anon_vma_chain->anon_vma; |
166 | int empty; | 241 | int empty; |
167 | 242 | ||
243 | /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */ | ||
168 | if (!anon_vma) | 244 | if (!anon_vma) |
169 | return; | 245 | return; |
170 | 246 | ||
171 | spin_lock(&anon_vma->lock); | 247 | spin_lock(&anon_vma->lock); |
172 | list_del(&vma->anon_vma_node); | 248 | list_del(&anon_vma_chain->same_anon_vma); |
173 | 249 | ||
174 | /* We must garbage collect the anon_vma if it's empty */ | 250 | /* We must garbage collect the anon_vma if it's empty */ |
175 | empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); | 251 | empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma); |
@@ -179,6 +255,18 @@ void anon_vma_unlink(struct vm_area_struct *vma) | |||
179 | anon_vma_free(anon_vma); | 255 | anon_vma_free(anon_vma); |
180 | } | 256 | } |
181 | 257 | ||
258 | void unlink_anon_vmas(struct vm_area_struct *vma) | ||
259 | { | ||
260 | struct anon_vma_chain *avc, *next; | ||
261 | |||
262 | /* Unlink each anon_vma chained to the VMA. */ | ||
263 | list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { | ||
264 | anon_vma_unlink(avc); | ||
265 | list_del(&avc->same_vma); | ||
266 | anon_vma_chain_free(avc); | ||
267 | } | ||
268 | } | ||
269 | |||
182 | static void anon_vma_ctor(void *data) | 270 | static void anon_vma_ctor(void *data) |
183 | { | 271 | { |
184 | struct anon_vma *anon_vma = data; | 272 | struct anon_vma *anon_vma = data; |
@@ -192,6 +280,7 @@ void __init anon_vma_init(void) | |||
192 | { | 280 | { |
193 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), | 281 | anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), |
194 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); | 282 | 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); |
283 | anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); | ||
195 | } | 284 | } |
196 | 285 | ||
197 | /* | 286 | /* |
@@ -396,7 +485,7 @@ static int page_referenced_anon(struct page *page, | |||
396 | { | 485 | { |
397 | unsigned int mapcount; | 486 | unsigned int mapcount; |
398 | struct anon_vma *anon_vma; | 487 | struct anon_vma *anon_vma; |
399 | struct vm_area_struct *vma; | 488 | struct anon_vma_chain *avc; |
400 | int referenced = 0; | 489 | int referenced = 0; |
401 | 490 | ||
402 | anon_vma = page_lock_anon_vma(page); | 491 | anon_vma = page_lock_anon_vma(page); |
@@ -404,7 +493,8 @@ static int page_referenced_anon(struct page *page, | |||
404 | return referenced; | 493 | return referenced; |
405 | 494 | ||
406 | mapcount = page_mapcount(page); | 495 | mapcount = page_mapcount(page); |
407 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 496 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
497 | struct vm_area_struct *vma = avc->vma; | ||
408 | unsigned long address = vma_address(page, vma); | 498 | unsigned long address = vma_address(page, vma); |
409 | if (address == -EFAULT) | 499 | if (address == -EFAULT) |
410 | continue; | 500 | continue; |
@@ -511,9 +601,6 @@ int page_referenced(struct page *page, | |||
511 | int referenced = 0; | 601 | int referenced = 0; |
512 | int we_locked = 0; | 602 | int we_locked = 0; |
513 | 603 | ||
514 | if (TestClearPageReferenced(page)) | ||
515 | referenced++; | ||
516 | |||
517 | *vm_flags = 0; | 604 | *vm_flags = 0; |
518 | if (page_mapped(page) && page_rmapping(page)) { | 605 | if (page_mapped(page) && page_rmapping(page)) { |
519 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { | 606 | if (!is_locked && (!PageAnon(page) || PageKsm(page))) { |
@@ -614,6 +701,30 @@ int page_mkclean(struct page *page) | |||
614 | EXPORT_SYMBOL_GPL(page_mkclean); | 701 | EXPORT_SYMBOL_GPL(page_mkclean); |
615 | 702 | ||
616 | /** | 703 | /** |
704 | * page_move_anon_rmap - move a page to our anon_vma | ||
705 | * @page: the page to move to our anon_vma | ||
706 | * @vma: the vma the page belongs to | ||
707 | * @address: the user virtual address mapped | ||
708 | * | ||
709 | * When a page belongs exclusively to one process after a COW event, | ||
710 | * that page can be moved into the anon_vma that belongs to just that | ||
711 | * process, so the rmap code will not search the parent or sibling | ||
712 | * processes. | ||
713 | */ | ||
714 | void page_move_anon_rmap(struct page *page, | ||
715 | struct vm_area_struct *vma, unsigned long address) | ||
716 | { | ||
717 | struct anon_vma *anon_vma = vma->anon_vma; | ||
718 | |||
719 | VM_BUG_ON(!PageLocked(page)); | ||
720 | VM_BUG_ON(!anon_vma); | ||
721 | VM_BUG_ON(page->index != linear_page_index(vma, address)); | ||
722 | |||
723 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
724 | page->mapping = (struct address_space *) anon_vma; | ||
725 | } | ||
726 | |||
727 | /** | ||
617 | * __page_set_anon_rmap - setup new anonymous rmap | 728 | * __page_set_anon_rmap - setup new anonymous rmap |
618 | * @page: the page to add the mapping to | 729 | * @page: the page to add the mapping to |
619 | * @vma: the vm area in which the mapping is added | 730 | * @vma: the vm area in which the mapping is added |
@@ -652,9 +763,6 @@ static void __page_check_anon_rmap(struct page *page, | |||
652 | * are initially only visible via the pagetables, and the pte is locked | 763 | * are initially only visible via the pagetables, and the pte is locked |
653 | * over the call to page_add_new_anon_rmap. | 764 | * over the call to page_add_new_anon_rmap. |
654 | */ | 765 | */ |
655 | struct anon_vma *anon_vma = vma->anon_vma; | ||
656 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
657 | BUG_ON(page->mapping != (struct address_space *)anon_vma); | ||
658 | BUG_ON(page->index != linear_page_index(vma, address)); | 766 | BUG_ON(page->index != linear_page_index(vma, address)); |
659 | #endif | 767 | #endif |
660 | } | 768 | } |
@@ -815,9 +923,9 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
815 | 923 | ||
816 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { | 924 | if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { |
817 | if (PageAnon(page)) | 925 | if (PageAnon(page)) |
818 | dec_mm_counter(mm, anon_rss); | 926 | dec_mm_counter(mm, MM_ANONPAGES); |
819 | else | 927 | else |
820 | dec_mm_counter(mm, file_rss); | 928 | dec_mm_counter(mm, MM_FILEPAGES); |
821 | set_pte_at(mm, address, pte, | 929 | set_pte_at(mm, address, pte, |
822 | swp_entry_to_pte(make_hwpoison_entry(page))); | 930 | swp_entry_to_pte(make_hwpoison_entry(page))); |
823 | } else if (PageAnon(page)) { | 931 | } else if (PageAnon(page)) { |
@@ -839,7 +947,8 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
839 | list_add(&mm->mmlist, &init_mm.mmlist); | 947 | list_add(&mm->mmlist, &init_mm.mmlist); |
840 | spin_unlock(&mmlist_lock); | 948 | spin_unlock(&mmlist_lock); |
841 | } | 949 | } |
842 | dec_mm_counter(mm, anon_rss); | 950 | dec_mm_counter(mm, MM_ANONPAGES); |
951 | inc_mm_counter(mm, MM_SWAPENTS); | ||
843 | } else if (PAGE_MIGRATION) { | 952 | } else if (PAGE_MIGRATION) { |
844 | /* | 953 | /* |
845 | * Store the pfn of the page in a special migration | 954 | * Store the pfn of the page in a special migration |
@@ -857,7 +966,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
857 | entry = make_migration_entry(page, pte_write(pteval)); | 966 | entry = make_migration_entry(page, pte_write(pteval)); |
858 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 967 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
859 | } else | 968 | } else |
860 | dec_mm_counter(mm, file_rss); | 969 | dec_mm_counter(mm, MM_FILEPAGES); |
861 | 970 | ||
862 | page_remove_rmap(page); | 971 | page_remove_rmap(page); |
863 | page_cache_release(page); | 972 | page_cache_release(page); |
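try_to_unmap_one() now updates array-indexed per-mm counters (MM_ANONPAGES, MM_FILEPAGES) instead of the old anon_rss/file_rss fields, and charges a new MM_SWAPENTS counter when an anonymous pte is converted into a swap entry. A rough standalone sketch of that accounting, using a simplified counter array rather than the real mm_struct layout:

/* illustrative only: indices mirror the names used in the hunk above */
enum mm_counter_sketch { MMS_FILEPAGES, MMS_ANONPAGES, MMS_SWAPENTS, NR_MMS };

struct mm_sketch { long rss[NR_MMS]; };

static void swapout_account(struct mm_sketch *mm)
{
        mm->rss[MMS_ANONPAGES]--;   /* one less resident anonymous page     */
        mm->rss[MMS_SWAPENTS]++;    /* one more swap entry owned by this mm */
}

static void unmap_file_account(struct mm_sketch *mm)
{
        mm->rss[MMS_FILEPAGES]--;   /* file-backed pte dropped */
}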
@@ -996,7 +1105,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
996 | 1105 | ||
997 | page_remove_rmap(page); | 1106 | page_remove_rmap(page); |
998 | page_cache_release(page); | 1107 | page_cache_release(page); |
999 | dec_mm_counter(mm, file_rss); | 1108 | dec_mm_counter(mm, MM_FILEPAGES); |
1000 | (*mapcount)--; | 1109 | (*mapcount)--; |
1001 | } | 1110 | } |
1002 | pte_unmap_unlock(pte - 1, ptl); | 1111 | pte_unmap_unlock(pte - 1, ptl); |
@@ -1024,14 +1133,15 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, | |||
1024 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) | 1133 | static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) |
1025 | { | 1134 | { |
1026 | struct anon_vma *anon_vma; | 1135 | struct anon_vma *anon_vma; |
1027 | struct vm_area_struct *vma; | 1136 | struct anon_vma_chain *avc; |
1028 | int ret = SWAP_AGAIN; | 1137 | int ret = SWAP_AGAIN; |
1029 | 1138 | ||
1030 | anon_vma = page_lock_anon_vma(page); | 1139 | anon_vma = page_lock_anon_vma(page); |
1031 | if (!anon_vma) | 1140 | if (!anon_vma) |
1032 | return ret; | 1141 | return ret; |
1033 | 1142 | ||
1034 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1143 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
1144 | struct vm_area_struct *vma = avc->vma; | ||
1035 | unsigned long address = vma_address(page, vma); | 1145 | unsigned long address = vma_address(page, vma); |
1036 | if (address == -EFAULT) | 1146 | if (address == -EFAULT) |
1037 | continue; | 1147 | continue; |
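try_to_unmap_anon() (and rmap_walk_anon() in the next hunk) no longer walk vmas linked directly into the anon_vma; they walk anon_vma_chain links and take one extra dereference to reach the vma. A hedged sketch of that pattern, using only the field names the hunks themselves show:

#include <linux/list.h>

struct vm_area_struct;

struct anon_vma_chain_sketch {
        struct vm_area_struct *vma;      /* the vma this link represents */
        struct list_head same_anon_vma;  /* linked into anon_vma->head   */
};

static int for_each_vma_of_anon_vma(struct list_head *head,
                                    int (*fn)(struct vm_area_struct *, void *),
                                    void *arg)
{
        struct anon_vma_chain_sketch *avc;
        int ret = 0;

        list_for_each_entry(avc, head, same_anon_vma) {
                ret = fn(avc->vma, arg);      /* extra hop: avc -> vma */
                if (ret)
                        break;
        }
        return ret;
}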
@@ -1222,7 +1332,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1222 | struct vm_area_struct *, unsigned long, void *), void *arg) | 1332 | struct vm_area_struct *, unsigned long, void *), void *arg) |
1223 | { | 1333 | { |
1224 | struct anon_vma *anon_vma; | 1334 | struct anon_vma *anon_vma; |
1225 | struct vm_area_struct *vma; | 1335 | struct anon_vma_chain *avc; |
1226 | int ret = SWAP_AGAIN; | 1336 | int ret = SWAP_AGAIN; |
1227 | 1337 | ||
1228 | /* | 1338 | /* |
@@ -1237,7 +1347,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, | |||
1237 | if (!anon_vma) | 1347 | if (!anon_vma) |
1238 | return ret; | 1348 | return ret; |
1239 | spin_lock(&anon_vma->lock); | 1349 | spin_lock(&anon_vma->lock); |
1240 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 1350 | list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { |
1351 | struct vm_area_struct *vma = avc->vma; | ||
1241 | unsigned long address = vma_address(page, vma); | 1352 | unsigned long address = vma_address(page, vma); |
1242 | if (address == -EFAULT) | 1353 | if (address == -EFAULT) |
1243 | continue; | 1354 | continue; |
@@ -935,7 +935,6 @@ static int transfer_objects(struct array_cache *to, | |||
935 | 935 | ||
936 | from->avail -= nr; | 936 | from->avail -= nr; |
937 | to->avail += nr; | 937 | to->avail += nr; |
938 | to->touched = 1; | ||
939 | return nr; | 938 | return nr; |
940 | } | 939 | } |
941 | 940 | ||
@@ -983,13 +982,11 @@ static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) | |||
983 | 982 | ||
984 | if (limit > 1) | 983 | if (limit > 1) |
985 | limit = 12; | 984 | limit = 12; |
986 | ac_ptr = kmalloc_node(memsize, gfp, node); | 985 | ac_ptr = kzalloc_node(memsize, gfp, node); |
987 | if (ac_ptr) { | 986 | if (ac_ptr) { |
988 | for_each_node(i) { | 987 | for_each_node(i) { |
989 | if (i == node || !node_online(i)) { | 988 | if (i == node || !node_online(i)) |
990 | ac_ptr[i] = NULL; | ||
991 | continue; | 989 | continue; |
992 | } | ||
993 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); | 990 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); |
994 | if (!ac_ptr[i]) { | 991 | if (!ac_ptr[i]) { |
995 | for (i--; i >= 0; i--) | 992 | for (i--; i >= 0; i--) |
@@ -2963,8 +2960,10 @@ retry: | |||
2963 | spin_lock(&l3->list_lock); | 2960 | spin_lock(&l3->list_lock); |
2964 | 2961 | ||
2965 | /* See if we can refill from the shared array */ | 2962 | /* See if we can refill from the shared array */ |
2966 | if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) | 2963 | if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) { |
2964 | l3->shared->touched = 1; | ||
2967 | goto alloc_done; | 2965 | goto alloc_done; |
2966 | } | ||
2968 | 2967 | ||
2969 | while (batchcount > 0) { | 2968 | while (batchcount > 0) { |
2970 | struct list_head *entry; | 2969 | struct list_head *entry; |
@@ -3101,7 +3100,7 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) | |||
3101 | if (cachep == &cache_cache) | 3100 | if (cachep == &cache_cache) |
3102 | return false; | 3101 | return false; |
3103 | 3102 | ||
3104 | return should_failslab(obj_size(cachep), flags); | 3103 | return should_failslab(obj_size(cachep), flags, cachep->flags); |
3105 | } | 3104 | } |
3106 | 3105 | ||
3107 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) | 3106 | static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) |
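slab_should_failslab() (and the SLUB path further below) now passes the cache flags to should_failslab(), which allows fault injection to be limited to caches that opted in via SLAB_FAILSLAB, e.g. through slub_debug=a or the failslab sysfs attribute added later in this diff. The failslab internals are not part of this diff; the gate below, including its cache_filter knob, is an assumed illustration only:

#include <linux/gfp.h>
#include <linux/slab.h>

/* hypothetical sketch of the per-cache gate, not the real failslab code */
static bool should_failslab_sketch(size_t size, gfp_t gfpflags,
                                   unsigned long cache_flags,
                                   bool cache_filter)
{
        if (gfpflags & __GFP_NOFAIL)
                return false;            /* never inject into NOFAIL allocs */

        if (cache_filter && !(cache_flags & SLAB_FAILSLAB))
                return false;            /* this cache did not opt in       */

        return true;                     /* otherwise defer to the usual
                                          * fault-injection probability     */
}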
@@ -151,7 +151,8 @@ | |||
151 | * Set of flags that will prevent slab merging | 151 | * Set of flags that will prevent slab merging |
152 | */ | 152 | */ |
153 | #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ | 153 | #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ |
154 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) | 154 | SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ |
155 | SLAB_FAILSLAB) | ||
155 | 156 | ||
156 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ | 157 | #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ |
157 | SLAB_CACHE_DMA | SLAB_NOTRACK) | 158 | SLAB_CACHE_DMA | SLAB_NOTRACK) |
@@ -217,10 +218,10 @@ static inline void sysfs_slab_remove(struct kmem_cache *s) | |||
217 | 218 | ||
218 | #endif | 219 | #endif |
219 | 220 | ||
220 | static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) | 221 | static inline void stat(struct kmem_cache *s, enum stat_item si) |
221 | { | 222 | { |
222 | #ifdef CONFIG_SLUB_STATS | 223 | #ifdef CONFIG_SLUB_STATS |
223 | c->stat[si]++; | 224 | __this_cpu_inc(s->cpu_slab->stat[si]); |
224 | #endif | 225 | #endif |
225 | } | 226 | } |
226 | 227 | ||
@@ -242,15 +243,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | |||
242 | #endif | 243 | #endif |
243 | } | 244 | } |
244 | 245 | ||
245 | static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) | ||
246 | { | ||
247 | #ifdef CONFIG_SMP | ||
248 | return s->cpu_slab[cpu]; | ||
249 | #else | ||
250 | return &s->cpu_slab; | ||
251 | #endif | ||
252 | } | ||
253 | |||
254 | /* Verify that a pointer has an address that is valid within a slab page */ | 246 | /* Verify that a pointer has an address that is valid within a slab page */ |
255 | static inline int check_valid_pointer(struct kmem_cache *s, | 247 | static inline int check_valid_pointer(struct kmem_cache *s, |
256 | struct page *page, const void *object) | 248 | struct page *page, const void *object) |
@@ -269,13 +261,6 @@ static inline int check_valid_pointer(struct kmem_cache *s, | |||
269 | return 1; | 261 | return 1; |
270 | } | 262 | } |
271 | 263 | ||
272 | /* | ||
273 | * Slow version of get and set free pointer. | ||
274 | * | ||
275 | * This version requires touching the cache lines of kmem_cache which | ||
276 | * we avoid to do in the fast alloc free paths. There we obtain the offset | ||
277 | * from the page struct. | ||
278 | */ | ||
279 | static inline void *get_freepointer(struct kmem_cache *s, void *object) | 264 | static inline void *get_freepointer(struct kmem_cache *s, void *object) |
280 | { | 265 | { |
281 | return *(void **)(object + s->offset); | 266 | return *(void **)(object + s->offset); |
@@ -1020,6 +1005,9 @@ static int __init setup_slub_debug(char *str) | |||
1020 | case 't': | 1005 | case 't': |
1021 | slub_debug |= SLAB_TRACE; | 1006 | slub_debug |= SLAB_TRACE; |
1022 | break; | 1007 | break; |
1008 | case 'a': | ||
1009 | slub_debug |= SLAB_FAILSLAB; | ||
1010 | break; | ||
1023 | default: | 1011 | default: |
1024 | printk(KERN_ERR "slub_debug option '%c' " | 1012 | printk(KERN_ERR "slub_debug option '%c' " |
1025 | "unknown. skipped\n", *str); | 1013 | "unknown. skipped\n", *str); |
@@ -1124,7 +1112,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1124 | if (!page) | 1112 | if (!page) |
1125 | return NULL; | 1113 | return NULL; |
1126 | 1114 | ||
1127 | stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); | 1115 | stat(s, ORDER_FALLBACK); |
1128 | } | 1116 | } |
1129 | 1117 | ||
1130 | if (kmemcheck_enabled | 1118 | if (kmemcheck_enabled |
@@ -1422,23 +1410,22 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) | |||
1422 | static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | 1410 | static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) |
1423 | { | 1411 | { |
1424 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | 1412 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
1425 | struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); | ||
1426 | 1413 | ||
1427 | __ClearPageSlubFrozen(page); | 1414 | __ClearPageSlubFrozen(page); |
1428 | if (page->inuse) { | 1415 | if (page->inuse) { |
1429 | 1416 | ||
1430 | if (page->freelist) { | 1417 | if (page->freelist) { |
1431 | add_partial(n, page, tail); | 1418 | add_partial(n, page, tail); |
1432 | stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); | 1419 | stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); |
1433 | } else { | 1420 | } else { |
1434 | stat(c, DEACTIVATE_FULL); | 1421 | stat(s, DEACTIVATE_FULL); |
1435 | if (SLABDEBUG && PageSlubDebug(page) && | 1422 | if (SLABDEBUG && PageSlubDebug(page) && |
1436 | (s->flags & SLAB_STORE_USER)) | 1423 | (s->flags & SLAB_STORE_USER)) |
1437 | add_full(n, page); | 1424 | add_full(n, page); |
1438 | } | 1425 | } |
1439 | slab_unlock(page); | 1426 | slab_unlock(page); |
1440 | } else { | 1427 | } else { |
1441 | stat(c, DEACTIVATE_EMPTY); | 1428 | stat(s, DEACTIVATE_EMPTY); |
1442 | if (n->nr_partial < s->min_partial) { | 1429 | if (n->nr_partial < s->min_partial) { |
1443 | /* | 1430 | /* |
1444 | * Adding an empty slab to the partial slabs in order | 1431 | * Adding an empty slab to the partial slabs in order |
@@ -1454,7 +1441,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | |||
1454 | slab_unlock(page); | 1441 | slab_unlock(page); |
1455 | } else { | 1442 | } else { |
1456 | slab_unlock(page); | 1443 | slab_unlock(page); |
1457 | stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); | 1444 | stat(s, FREE_SLAB); |
1458 | discard_slab(s, page); | 1445 | discard_slab(s, page); |
1459 | } | 1446 | } |
1460 | } | 1447 | } |
@@ -1469,7 +1456,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1469 | int tail = 1; | 1456 | int tail = 1; |
1470 | 1457 | ||
1471 | if (page->freelist) | 1458 | if (page->freelist) |
1472 | stat(c, DEACTIVATE_REMOTE_FREES); | 1459 | stat(s, DEACTIVATE_REMOTE_FREES); |
1473 | /* | 1460 | /* |
1474 | * Merge cpu freelist into slab freelist. Typically we get here | 1461 | * Merge cpu freelist into slab freelist. Typically we get here |
1475 | * because both freelists are empty. So this is unlikely | 1462 | * because both freelists are empty. So this is unlikely |
@@ -1482,10 +1469,10 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1482 | 1469 | ||
1483 | /* Retrieve object from cpu_freelist */ | 1470 | /* Retrieve object from cpu_freelist */ |
1484 | object = c->freelist; | 1471 | object = c->freelist; |
1485 | c->freelist = c->freelist[c->offset]; | 1472 | c->freelist = get_freepointer(s, c->freelist); |
1486 | 1473 | ||
1487 | /* And put onto the regular freelist */ | 1474 | /* And put onto the regular freelist */ |
1488 | object[c->offset] = page->freelist; | 1475 | set_freepointer(s, object, page->freelist); |
1489 | page->freelist = object; | 1476 | page->freelist = object; |
1490 | page->inuse--; | 1477 | page->inuse--; |
1491 | } | 1478 | } |
@@ -1495,7 +1482,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1495 | 1482 | ||
1496 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1483 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1497 | { | 1484 | { |
1498 | stat(c, CPUSLAB_FLUSH); | 1485 | stat(s, CPUSLAB_FLUSH); |
1499 | slab_lock(c->page); | 1486 | slab_lock(c->page); |
1500 | deactivate_slab(s, c); | 1487 | deactivate_slab(s, c); |
1501 | } | 1488 | } |
@@ -1507,7 +1494,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | |||
1507 | */ | 1494 | */ |
1508 | static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) | 1495 | static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) |
1509 | { | 1496 | { |
1510 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | 1497 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
1511 | 1498 | ||
1512 | if (likely(c && c->page)) | 1499 | if (likely(c && c->page)) |
1513 | flush_slab(s, c); | 1500 | flush_slab(s, c); |
@@ -1635,7 +1622,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
1635 | if (unlikely(!node_match(c, node))) | 1622 | if (unlikely(!node_match(c, node))) |
1636 | goto another_slab; | 1623 | goto another_slab; |
1637 | 1624 | ||
1638 | stat(c, ALLOC_REFILL); | 1625 | stat(s, ALLOC_REFILL); |
1639 | 1626 | ||
1640 | load_freelist: | 1627 | load_freelist: |
1641 | object = c->page->freelist; | 1628 | object = c->page->freelist; |
@@ -1644,13 +1631,13 @@ load_freelist: | |||
1644 | if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) | 1631 | if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) |
1645 | goto debug; | 1632 | goto debug; |
1646 | 1633 | ||
1647 | c->freelist = object[c->offset]; | 1634 | c->freelist = get_freepointer(s, object); |
1648 | c->page->inuse = c->page->objects; | 1635 | c->page->inuse = c->page->objects; |
1649 | c->page->freelist = NULL; | 1636 | c->page->freelist = NULL; |
1650 | c->node = page_to_nid(c->page); | 1637 | c->node = page_to_nid(c->page); |
1651 | unlock_out: | 1638 | unlock_out: |
1652 | slab_unlock(c->page); | 1639 | slab_unlock(c->page); |
1653 | stat(c, ALLOC_SLOWPATH); | 1640 | stat(s, ALLOC_SLOWPATH); |
1654 | return object; | 1641 | return object; |
1655 | 1642 | ||
1656 | another_slab: | 1643 | another_slab: |
@@ -1660,7 +1647,7 @@ new_slab: | |||
1660 | new = get_partial(s, gfpflags, node); | 1647 | new = get_partial(s, gfpflags, node); |
1661 | if (new) { | 1648 | if (new) { |
1662 | c->page = new; | 1649 | c->page = new; |
1663 | stat(c, ALLOC_FROM_PARTIAL); | 1650 | stat(s, ALLOC_FROM_PARTIAL); |
1664 | goto load_freelist; | 1651 | goto load_freelist; |
1665 | } | 1652 | } |
1666 | 1653 | ||
@@ -1673,8 +1660,8 @@ new_slab: | |||
1673 | local_irq_disable(); | 1660 | local_irq_disable(); |
1674 | 1661 | ||
1675 | if (new) { | 1662 | if (new) { |
1676 | c = get_cpu_slab(s, smp_processor_id()); | 1663 | c = __this_cpu_ptr(s->cpu_slab); |
1677 | stat(c, ALLOC_SLAB); | 1664 | stat(s, ALLOC_SLAB); |
1678 | if (c->page) | 1665 | if (c->page) |
1679 | flush_slab(s, c); | 1666 | flush_slab(s, c); |
1680 | slab_lock(new); | 1667 | slab_lock(new); |
@@ -1690,7 +1677,7 @@ debug: | |||
1690 | goto another_slab; | 1677 | goto another_slab; |
1691 | 1678 | ||
1692 | c->page->inuse++; | 1679 | c->page->inuse++; |
1693 | c->page->freelist = object[c->offset]; | 1680 | c->page->freelist = get_freepointer(s, object); |
1694 | c->node = -1; | 1681 | c->node = -1; |
1695 | goto unlock_out; | 1682 | goto unlock_out; |
1696 | } | 1683 | } |
@@ -1711,35 +1698,33 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, | |||
1711 | void **object; | 1698 | void **object; |
1712 | struct kmem_cache_cpu *c; | 1699 | struct kmem_cache_cpu *c; |
1713 | unsigned long flags; | 1700 | unsigned long flags; |
1714 | unsigned int objsize; | ||
1715 | 1701 | ||
1716 | gfpflags &= gfp_allowed_mask; | 1702 | gfpflags &= gfp_allowed_mask; |
1717 | 1703 | ||
1718 | lockdep_trace_alloc(gfpflags); | 1704 | lockdep_trace_alloc(gfpflags); |
1719 | might_sleep_if(gfpflags & __GFP_WAIT); | 1705 | might_sleep_if(gfpflags & __GFP_WAIT); |
1720 | 1706 | ||
1721 | if (should_failslab(s->objsize, gfpflags)) | 1707 | if (should_failslab(s->objsize, gfpflags, s->flags)) |
1722 | return NULL; | 1708 | return NULL; |
1723 | 1709 | ||
1724 | local_irq_save(flags); | 1710 | local_irq_save(flags); |
1725 | c = get_cpu_slab(s, smp_processor_id()); | 1711 | c = __this_cpu_ptr(s->cpu_slab); |
1726 | objsize = c->objsize; | 1712 | object = c->freelist; |
1727 | if (unlikely(!c->freelist || !node_match(c, node))) | 1713 | if (unlikely(!object || !node_match(c, node))) |
1728 | 1714 | ||
1729 | object = __slab_alloc(s, gfpflags, node, addr, c); | 1715 | object = __slab_alloc(s, gfpflags, node, addr, c); |
1730 | 1716 | ||
1731 | else { | 1717 | else { |
1732 | object = c->freelist; | 1718 | c->freelist = get_freepointer(s, object); |
1733 | c->freelist = object[c->offset]; | 1719 | stat(s, ALLOC_FASTPATH); |
1734 | stat(c, ALLOC_FASTPATH); | ||
1735 | } | 1720 | } |
1736 | local_irq_restore(flags); | 1721 | local_irq_restore(flags); |
1737 | 1722 | ||
1738 | if (unlikely(gfpflags & __GFP_ZERO) && object) | 1723 | if (unlikely(gfpflags & __GFP_ZERO) && object) |
1739 | memset(object, 0, objsize); | 1724 | memset(object, 0, s->objsize); |
1740 | 1725 | ||
1741 | kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); | 1726 | kmemcheck_slab_alloc(s, gfpflags, object, s->objsize); |
1742 | kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); | 1727 | kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags); |
1743 | 1728 | ||
1744 | return object; | 1729 | return object; |
1745 | } | 1730 | } |
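With objsize and offset now taken straight from the kmem_cache, the SLUB allocation fastpath reduces to popping the head of the per-cpu freelist via get_freepointer(). A minimal standalone sketch of that pop, with plain pointers and none of the per-cpu or irq handling:

#include <stddef.h>

/* illustrative freelist pop: each free object stores the pointer to the
 * next free object at byte offset 'offset' inside itself */
static void *freelist_pop(void **freelist, size_t offset)
{
        void *object = *freelist;

        if (object)
                *freelist = *(void **)((char *)object + offset);
        return object;
}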
@@ -1794,26 +1779,25 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); | |||
1794 | * handling required then we can return immediately. | 1779 | * handling required then we can return immediately. |
1795 | */ | 1780 | */ |
1796 | static void __slab_free(struct kmem_cache *s, struct page *page, | 1781 | static void __slab_free(struct kmem_cache *s, struct page *page, |
1797 | void *x, unsigned long addr, unsigned int offset) | 1782 | void *x, unsigned long addr) |
1798 | { | 1783 | { |
1799 | void *prior; | 1784 | void *prior; |
1800 | void **object = (void *)x; | 1785 | void **object = (void *)x; |
1801 | struct kmem_cache_cpu *c; | ||
1802 | 1786 | ||
1803 | c = get_cpu_slab(s, raw_smp_processor_id()); | 1787 | stat(s, FREE_SLOWPATH); |
1804 | stat(c, FREE_SLOWPATH); | ||
1805 | slab_lock(page); | 1788 | slab_lock(page); |
1806 | 1789 | ||
1807 | if (unlikely(SLABDEBUG && PageSlubDebug(page))) | 1790 | if (unlikely(SLABDEBUG && PageSlubDebug(page))) |
1808 | goto debug; | 1791 | goto debug; |
1809 | 1792 | ||
1810 | checks_ok: | 1793 | checks_ok: |
1811 | prior = object[offset] = page->freelist; | 1794 | prior = page->freelist; |
1795 | set_freepointer(s, object, prior); | ||
1812 | page->freelist = object; | 1796 | page->freelist = object; |
1813 | page->inuse--; | 1797 | page->inuse--; |
1814 | 1798 | ||
1815 | if (unlikely(PageSlubFrozen(page))) { | 1799 | if (unlikely(PageSlubFrozen(page))) { |
1816 | stat(c, FREE_FROZEN); | 1800 | stat(s, FREE_FROZEN); |
1817 | goto out_unlock; | 1801 | goto out_unlock; |
1818 | } | 1802 | } |
1819 | 1803 | ||
@@ -1826,7 +1810,7 @@ checks_ok: | |||
1826 | */ | 1810 | */ |
1827 | if (unlikely(!prior)) { | 1811 | if (unlikely(!prior)) { |
1828 | add_partial(get_node(s, page_to_nid(page)), page, 1); | 1812 | add_partial(get_node(s, page_to_nid(page)), page, 1); |
1829 | stat(c, FREE_ADD_PARTIAL); | 1813 | stat(s, FREE_ADD_PARTIAL); |
1830 | } | 1814 | } |
1831 | 1815 | ||
1832 | out_unlock: | 1816 | out_unlock: |
@@ -1839,10 +1823,10 @@ slab_empty: | |||
1839 | * Slab still on the partial list. | 1823 | * Slab still on the partial list. |
1840 | */ | 1824 | */ |
1841 | remove_partial(s, page); | 1825 | remove_partial(s, page); |
1842 | stat(c, FREE_REMOVE_PARTIAL); | 1826 | stat(s, FREE_REMOVE_PARTIAL); |
1843 | } | 1827 | } |
1844 | slab_unlock(page); | 1828 | slab_unlock(page); |
1845 | stat(c, FREE_SLAB); | 1829 | stat(s, FREE_SLAB); |
1846 | discard_slab(s, page); | 1830 | discard_slab(s, page); |
1847 | return; | 1831 | return; |
1848 | 1832 | ||
@@ -1872,17 +1856,17 @@ static __always_inline void slab_free(struct kmem_cache *s, | |||
1872 | 1856 | ||
1873 | kmemleak_free_recursive(x, s->flags); | 1857 | kmemleak_free_recursive(x, s->flags); |
1874 | local_irq_save(flags); | 1858 | local_irq_save(flags); |
1875 | c = get_cpu_slab(s, smp_processor_id()); | 1859 | c = __this_cpu_ptr(s->cpu_slab); |
1876 | kmemcheck_slab_free(s, object, c->objsize); | 1860 | kmemcheck_slab_free(s, object, s->objsize); |
1877 | debug_check_no_locks_freed(object, c->objsize); | 1861 | debug_check_no_locks_freed(object, s->objsize); |
1878 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) | 1862 | if (!(s->flags & SLAB_DEBUG_OBJECTS)) |
1879 | debug_check_no_obj_freed(object, c->objsize); | 1863 | debug_check_no_obj_freed(object, s->objsize); |
1880 | if (likely(page == c->page && c->node >= 0)) { | 1864 | if (likely(page == c->page && c->node >= 0)) { |
1881 | object[c->offset] = c->freelist; | 1865 | set_freepointer(s, object, c->freelist); |
1882 | c->freelist = object; | 1866 | c->freelist = object; |
1883 | stat(c, FREE_FASTPATH); | 1867 | stat(s, FREE_FASTPATH); |
1884 | } else | 1868 | } else |
1885 | __slab_free(s, page, x, addr, c->offset); | 1869 | __slab_free(s, page, x, addr); |
1886 | 1870 | ||
1887 | local_irq_restore(flags); | 1871 | local_irq_restore(flags); |
1888 | } | 1872 | } |
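slab_free() is the mirror image: set_freepointer() links the object back in front of the per-cpu freelist. The matching push, again as a bare sketch:

#include <stddef.h>

/* illustrative freelist push: store the old head inside the freed object,
 * then make the object the new head */
static void freelist_push(void **freelist, void *object, size_t offset)
{
        *(void **)((char *)object + offset) = *freelist;
        *freelist = object;
}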
@@ -2069,19 +2053,6 @@ static unsigned long calculate_alignment(unsigned long flags, | |||
2069 | return ALIGN(align, sizeof(void *)); | 2053 | return ALIGN(align, sizeof(void *)); |
2070 | } | 2054 | } |
2071 | 2055 | ||
2072 | static void init_kmem_cache_cpu(struct kmem_cache *s, | ||
2073 | struct kmem_cache_cpu *c) | ||
2074 | { | ||
2075 | c->page = NULL; | ||
2076 | c->freelist = NULL; | ||
2077 | c->node = 0; | ||
2078 | c->offset = s->offset / sizeof(void *); | ||
2079 | c->objsize = s->objsize; | ||
2080 | #ifdef CONFIG_SLUB_STATS | ||
2081 | memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned)); | ||
2082 | #endif | ||
2083 | } | ||
2084 | |||
2085 | static void | 2056 | static void |
2086 | init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) | 2057 | init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) |
2087 | { | 2058 | { |
@@ -2095,130 +2066,24 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) | |||
2095 | #endif | 2066 | #endif |
2096 | } | 2067 | } |
2097 | 2068 | ||
2098 | #ifdef CONFIG_SMP | 2069 | static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]); |
2099 | /* | ||
2100 | * Per cpu array for per cpu structures. | ||
2101 | * | ||
2102 | * The per cpu array places all kmem_cache_cpu structures from one processor | ||
2103 | * close together meaning that it becomes possible that multiple per cpu | ||
2104 | * structures are contained in one cacheline. This may be particularly | ||
2105 | * beneficial for the kmalloc caches. | ||
2106 | * | ||
2107 | * A desktop system typically has around 60-80 slabs. With 100 here we are | ||
2108 | * likely able to get per cpu structures for all caches from the array defined | ||
2109 | * here. We must be able to cover all kmalloc caches during bootstrap. | ||
2110 | * | ||
2111 | * If the per cpu array is exhausted then fall back to kmalloc | ||
2112 | * of individual cachelines. No sharing is possible then. | ||
2113 | */ | ||
2114 | #define NR_KMEM_CACHE_CPU 100 | ||
2115 | |||
2116 | static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU], | ||
2117 | kmem_cache_cpu); | ||
2118 | |||
2119 | static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); | ||
2120 | static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); | ||
2121 | |||
2122 | static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, | ||
2123 | int cpu, gfp_t flags) | ||
2124 | { | ||
2125 | struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); | ||
2126 | |||
2127 | if (c) | ||
2128 | per_cpu(kmem_cache_cpu_free, cpu) = | ||
2129 | (void *)c->freelist; | ||
2130 | else { | ||
2131 | /* Table overflow: So allocate ourselves */ | ||
2132 | c = kmalloc_node( | ||
2133 | ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), | ||
2134 | flags, cpu_to_node(cpu)); | ||
2135 | if (!c) | ||
2136 | return NULL; | ||
2137 | } | ||
2138 | |||
2139 | init_kmem_cache_cpu(s, c); | ||
2140 | return c; | ||
2141 | } | ||
2142 | |||
2143 | static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) | ||
2144 | { | ||
2145 | if (c < per_cpu(kmem_cache_cpu, cpu) || | ||
2146 | c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { | ||
2147 | kfree(c); | ||
2148 | return; | ||
2149 | } | ||
2150 | c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); | ||
2151 | per_cpu(kmem_cache_cpu_free, cpu) = c; | ||
2152 | } | ||
2153 | |||
2154 | static void free_kmem_cache_cpus(struct kmem_cache *s) | ||
2155 | { | ||
2156 | int cpu; | ||
2157 | |||
2158 | for_each_online_cpu(cpu) { | ||
2159 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
2160 | |||
2161 | if (c) { | ||
2162 | s->cpu_slab[cpu] = NULL; | ||
2163 | free_kmem_cache_cpu(c, cpu); | ||
2164 | } | ||
2165 | } | ||
2166 | } | ||
2167 | |||
2168 | static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) | ||
2169 | { | ||
2170 | int cpu; | ||
2171 | |||
2172 | for_each_online_cpu(cpu) { | ||
2173 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
2174 | |||
2175 | if (c) | ||
2176 | continue; | ||
2177 | |||
2178 | c = alloc_kmem_cache_cpu(s, cpu, flags); | ||
2179 | if (!c) { | ||
2180 | free_kmem_cache_cpus(s); | ||
2181 | return 0; | ||
2182 | } | ||
2183 | s->cpu_slab[cpu] = c; | ||
2184 | } | ||
2185 | return 1; | ||
2186 | } | ||
2187 | |||
2188 | /* | ||
2189 | * Initialize the per cpu array. | ||
2190 | */ | ||
2191 | static void init_alloc_cpu_cpu(int cpu) | ||
2192 | { | ||
2193 | int i; | ||
2194 | 2070 | ||
2195 | if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once))) | 2071 | static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) |
2196 | return; | ||
2197 | |||
2198 | for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) | ||
2199 | free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); | ||
2200 | |||
2201 | cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)); | ||
2202 | } | ||
2203 | |||
2204 | static void __init init_alloc_cpu(void) | ||
2205 | { | 2072 | { |
2206 | int cpu; | 2073 | if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) |
2207 | 2074 | /* | |
2208 | for_each_online_cpu(cpu) | 2075 | * Boot time creation of the kmalloc array. Use static per cpu data |
2209 | init_alloc_cpu_cpu(cpu); | 2076 | * since the per cpu allocator is not available yet. |
2210 | } | 2077 | */ |
2078 | s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches); | ||
2079 | else | ||
2080 | s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); | ||
2211 | 2081 | ||
2212 | #else | 2082 | if (!s->cpu_slab) |
2213 | static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} | 2083 | return 0; |
2214 | static inline void init_alloc_cpu(void) {} | ||
2215 | 2084 | ||
2216 | static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) | ||
2217 | { | ||
2218 | init_kmem_cache_cpu(s, &s->cpu_slab); | ||
2219 | return 1; | 2085 | return 1; |
2220 | } | 2086 | } |
2221 | #endif | ||
2222 | 2087 | ||
2223 | #ifdef CONFIG_NUMA | 2088 | #ifdef CONFIG_NUMA |
2224 | /* | 2089 | /* |
@@ -2287,7 +2152,8 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) | |||
2287 | int node; | 2152 | int node; |
2288 | int local_node; | 2153 | int local_node; |
2289 | 2154 | ||
2290 | if (slab_state >= UP) | 2155 | if (slab_state >= UP && (s < kmalloc_caches || |
2156 | s > kmalloc_caches + KMALLOC_CACHES)) | ||
2291 | local_node = page_to_nid(virt_to_page(s)); | 2157 | local_node = page_to_nid(virt_to_page(s)); |
2292 | else | 2158 | else |
2293 | local_node = 0; | 2159 | local_node = 0; |
@@ -2502,6 +2368,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | |||
2502 | 2368 | ||
2503 | if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) | 2369 | if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) |
2504 | return 1; | 2370 | return 1; |
2371 | |||
2505 | free_kmem_cache_nodes(s); | 2372 | free_kmem_cache_nodes(s); |
2506 | error: | 2373 | error: |
2507 | if (flags & SLAB_PANIC) | 2374 | if (flags & SLAB_PANIC) |
@@ -2609,9 +2476,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) | |||
2609 | int node; | 2476 | int node; |
2610 | 2477 | ||
2611 | flush_all(s); | 2478 | flush_all(s); |
2612 | 2479 | free_percpu(s->cpu_slab); | |
2613 | /* Attempt to free all objects */ | 2480 | /* Attempt to free all objects */ |
2614 | free_kmem_cache_cpus(s); | ||
2615 | for_each_node_state(node, N_NORMAL_MEMORY) { | 2481 | for_each_node_state(node, N_NORMAL_MEMORY) { |
2616 | struct kmem_cache_node *n = get_node(s, node); | 2482 | struct kmem_cache_node *n = get_node(s, node); |
2617 | 2483 | ||
@@ -2651,7 +2517,7 @@ EXPORT_SYMBOL(kmem_cache_destroy); | |||
2651 | * Kmalloc subsystem | 2517 | * Kmalloc subsystem |
2652 | *******************************************************************/ | 2518 | *******************************************************************/ |
2653 | 2519 | ||
2654 | struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; | 2520 | struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned; |
2655 | EXPORT_SYMBOL(kmalloc_caches); | 2521 | EXPORT_SYMBOL(kmalloc_caches); |
2656 | 2522 | ||
2657 | static int __init setup_slub_min_order(char *str) | 2523 | static int __init setup_slub_min_order(char *str) |
@@ -2741,6 +2607,7 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
2741 | char *text; | 2607 | char *text; |
2742 | size_t realsize; | 2608 | size_t realsize; |
2743 | unsigned long slabflags; | 2609 | unsigned long slabflags; |
2610 | int i; | ||
2744 | 2611 | ||
2745 | s = kmalloc_caches_dma[index]; | 2612 | s = kmalloc_caches_dma[index]; |
2746 | if (s) | 2613 | if (s) |
@@ -2760,7 +2627,14 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
2760 | realsize = kmalloc_caches[index].objsize; | 2627 | realsize = kmalloc_caches[index].objsize; |
2761 | text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", | 2628 | text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", |
2762 | (unsigned int)realsize); | 2629 | (unsigned int)realsize); |
2763 | s = kmalloc(kmem_size, flags & ~SLUB_DMA); | 2630 | |
2631 | s = NULL; | ||
2632 | for (i = 0; i < KMALLOC_CACHES; i++) | ||
2633 | if (!kmalloc_caches[i].size) | ||
2634 | break; | ||
2635 | |||
2636 | BUG_ON(i >= KMALLOC_CACHES); | ||
2637 | s = kmalloc_caches + i; | ||
2764 | 2638 | ||
2765 | /* | 2639 | /* |
2766 | * Must defer sysfs creation to a workqueue because we don't know | 2640 | * Must defer sysfs creation to a workqueue because we don't know |
@@ -2772,9 +2646,9 @@ static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) | |||
2772 | if (slab_state >= SYSFS) | 2646 | if (slab_state >= SYSFS) |
2773 | slabflags |= __SYSFS_ADD_DEFERRED; | 2647 | slabflags |= __SYSFS_ADD_DEFERRED; |
2774 | 2648 | ||
2775 | if (!s || !text || !kmem_cache_open(s, flags, text, | 2649 | if (!text || !kmem_cache_open(s, flags, text, |
2776 | realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { | 2650 | realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { |
2777 | kfree(s); | 2651 | s->size = 0; |
2778 | kfree(text); | 2652 | kfree(text); |
2779 | goto unlock_out; | 2653 | goto unlock_out; |
2780 | } | 2654 | } |
@@ -3086,7 +2960,7 @@ static void slab_mem_offline_callback(void *arg) | |||
3086 | /* | 2960 | /* |
3087 | * if n->nr_slabs > 0, slabs still exist on the node | 2961 | * if n->nr_slabs > 0, slabs still exist on the node |
3088 | * that is going down. We were unable to free them, | 2962 | * that is going down. We were unable to free them, |
3089 | * and offline_pages() function shoudn't call this | 2963 | * and offline_pages() function shouldn't call this |
3090 | * callback. So, we must fail. | 2964 | * callback. So, we must fail. |
3091 | */ | 2965 | */ |
3092 | BUG_ON(slabs_node(s, offline_node)); | 2966 | BUG_ON(slabs_node(s, offline_node)); |
@@ -3176,8 +3050,6 @@ void __init kmem_cache_init(void) | |||
3176 | int i; | 3050 | int i; |
3177 | int caches = 0; | 3051 | int caches = 0; |
3178 | 3052 | ||
3179 | init_alloc_cpu(); | ||
3180 | |||
3181 | #ifdef CONFIG_NUMA | 3053 | #ifdef CONFIG_NUMA |
3182 | /* | 3054 | /* |
3183 | * Must first have the slab cache available for the allocations of the | 3055 | * Must first have the slab cache available for the allocations of the |
@@ -3261,8 +3133,10 @@ void __init kmem_cache_init(void) | |||
3261 | 3133 | ||
3262 | #ifdef CONFIG_SMP | 3134 | #ifdef CONFIG_SMP |
3263 | register_cpu_notifier(&slab_notifier); | 3135 | register_cpu_notifier(&slab_notifier); |
3264 | kmem_size = offsetof(struct kmem_cache, cpu_slab) + | 3136 | #endif |
3265 | nr_cpu_ids * sizeof(struct kmem_cache_cpu *); | 3137 | #ifdef CONFIG_NUMA |
3138 | kmem_size = offsetof(struct kmem_cache, node) + | ||
3139 | nr_node_ids * sizeof(struct kmem_cache_node *); | ||
3266 | #else | 3140 | #else |
3267 | kmem_size = sizeof(struct kmem_cache); | 3141 | kmem_size = sizeof(struct kmem_cache); |
3268 | #endif | 3142 | #endif |
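kmem_size is now derived from the node[] array at the end of struct kmem_cache instead of the removed per-cpu pointer array: allocate up to offsetof(..., node) plus only as many node pointers as the machine actually has. A small standalone illustration of that trailing-array sizing idiom (the names and the userspace allocator are for the example only):

#include <stddef.h>
#include <stdlib.h>

#define MAX_NODES 64                      /* compile-time maximum */

struct cache_like {
        int flags;
        void *node[MAX_NODES];            /* only nr_node_ids entries used */
};

static struct cache_like *alloc_cache_like(int nr_node_ids)
{
        /* size the allocation by the live node count, not MAX_NODES;
         * callers must never touch node[i] for i >= nr_node_ids */
        size_t size = offsetof(struct cache_like, node) +
                      (size_t)nr_node_ids * sizeof(void *);

        return calloc(1, size);
}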
@@ -3351,22 +3225,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
3351 | down_write(&slub_lock); | 3225 | down_write(&slub_lock); |
3352 | s = find_mergeable(size, align, flags, name, ctor); | 3226 | s = find_mergeable(size, align, flags, name, ctor); |
3353 | if (s) { | 3227 | if (s) { |
3354 | int cpu; | ||
3355 | |||
3356 | s->refcount++; | 3228 | s->refcount++; |
3357 | /* | 3229 | /* |
3358 | * Adjust the object sizes so that we clear | 3230 | * Adjust the object sizes so that we clear |
3359 | * the complete object on kzalloc. | 3231 | * the complete object on kzalloc. |
3360 | */ | 3232 | */ |
3361 | s->objsize = max(s->objsize, (int)size); | 3233 | s->objsize = max(s->objsize, (int)size); |
3362 | |||
3363 | /* | ||
3364 | * And then we need to update the object size in the | ||
3365 | * per cpu structures | ||
3366 | */ | ||
3367 | for_each_online_cpu(cpu) | ||
3368 | get_cpu_slab(s, cpu)->objsize = s->objsize; | ||
3369 | |||
3370 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); | 3234 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); |
3371 | up_write(&slub_lock); | 3235 | up_write(&slub_lock); |
3372 | 3236 | ||
@@ -3420,29 +3284,15 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | |||
3420 | unsigned long flags; | 3284 | unsigned long flags; |
3421 | 3285 | ||
3422 | switch (action) { | 3286 | switch (action) { |
3423 | case CPU_UP_PREPARE: | ||
3424 | case CPU_UP_PREPARE_FROZEN: | ||
3425 | init_alloc_cpu_cpu(cpu); | ||
3426 | down_read(&slub_lock); | ||
3427 | list_for_each_entry(s, &slab_caches, list) | ||
3428 | s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, | ||
3429 | GFP_KERNEL); | ||
3430 | up_read(&slub_lock); | ||
3431 | break; | ||
3432 | |||
3433 | case CPU_UP_CANCELED: | 3287 | case CPU_UP_CANCELED: |
3434 | case CPU_UP_CANCELED_FROZEN: | 3288 | case CPU_UP_CANCELED_FROZEN: |
3435 | case CPU_DEAD: | 3289 | case CPU_DEAD: |
3436 | case CPU_DEAD_FROZEN: | 3290 | case CPU_DEAD_FROZEN: |
3437 | down_read(&slub_lock); | 3291 | down_read(&slub_lock); |
3438 | list_for_each_entry(s, &slab_caches, list) { | 3292 | list_for_each_entry(s, &slab_caches, list) { |
3439 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
3440 | |||
3441 | local_irq_save(flags); | 3293 | local_irq_save(flags); |
3442 | __flush_cpu_slab(s, cpu); | 3294 | __flush_cpu_slab(s, cpu); |
3443 | local_irq_restore(flags); | 3295 | local_irq_restore(flags); |
3444 | free_kmem_cache_cpu(c, cpu); | ||
3445 | s->cpu_slab[cpu] = NULL; | ||
3446 | } | 3296 | } |
3447 | up_read(&slub_lock); | 3297 | up_read(&slub_lock); |
3448 | break; | 3298 | break; |
@@ -3928,7 +3778,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, | |||
3928 | int cpu; | 3778 | int cpu; |
3929 | 3779 | ||
3930 | for_each_possible_cpu(cpu) { | 3780 | for_each_possible_cpu(cpu) { |
3931 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | 3781 | struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); |
3932 | 3782 | ||
3933 | if (!c || c->node < 0) | 3783 | if (!c || c->node < 0) |
3934 | continue; | 3784 | continue; |
@@ -4171,6 +4021,23 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, | |||
4171 | } | 4021 | } |
4172 | SLAB_ATTR(trace); | 4022 | SLAB_ATTR(trace); |
4173 | 4023 | ||
4024 | #ifdef CONFIG_FAILSLAB | ||
4025 | static ssize_t failslab_show(struct kmem_cache *s, char *buf) | ||
4026 | { | ||
4027 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); | ||
4028 | } | ||
4029 | |||
4030 | static ssize_t failslab_store(struct kmem_cache *s, const char *buf, | ||
4031 | size_t length) | ||
4032 | { | ||
4033 | s->flags &= ~SLAB_FAILSLAB; | ||
4034 | if (buf[0] == '1') | ||
4035 | s->flags |= SLAB_FAILSLAB; | ||
4036 | return length; | ||
4037 | } | ||
4038 | SLAB_ATTR(failslab); | ||
4039 | #endif | ||
4040 | |||
4174 | static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) | 4041 | static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) |
4175 | { | 4042 | { |
4176 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); | 4043 | return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); |
@@ -4353,7 +4220,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) | |||
4353 | return -ENOMEM; | 4220 | return -ENOMEM; |
4354 | 4221 | ||
4355 | for_each_online_cpu(cpu) { | 4222 | for_each_online_cpu(cpu) { |
4356 | unsigned x = get_cpu_slab(s, cpu)->stat[si]; | 4223 | unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; |
4357 | 4224 | ||
4358 | data[cpu] = x; | 4225 | data[cpu] = x; |
4359 | sum += x; | 4226 | sum += x; |
@@ -4376,7 +4243,7 @@ static void clear_stat(struct kmem_cache *s, enum stat_item si) | |||
4376 | int cpu; | 4243 | int cpu; |
4377 | 4244 | ||
4378 | for_each_online_cpu(cpu) | 4245 | for_each_online_cpu(cpu) |
4379 | get_cpu_slab(s, cpu)->stat[si] = 0; | 4246 | per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; |
4380 | } | 4247 | } |
4381 | 4248 | ||
4382 | #define STAT_ATTR(si, text) \ | 4249 | #define STAT_ATTR(si, text) \ |
@@ -4467,6 +4334,10 @@ static struct attribute *slab_attrs[] = { | |||
4467 | &deactivate_remote_frees_attr.attr, | 4334 | &deactivate_remote_frees_attr.attr, |
4468 | &order_fallback_attr.attr, | 4335 | &order_fallback_attr.attr, |
4469 | #endif | 4336 | #endif |
4337 | #ifdef CONFIG_FAILSLAB | ||
4338 | &failslab_attr.attr, | ||
4339 | #endif | ||
4340 | |||
4470 | NULL | 4341 | NULL |
4471 | }; | 4342 | }; |
4472 | 4343 | ||
@@ -4519,7 +4390,7 @@ static void kmem_cache_release(struct kobject *kobj) | |||
4519 | kfree(s); | 4390 | kfree(s); |
4520 | } | 4391 | } |
4521 | 4392 | ||
4522 | static struct sysfs_ops slab_sysfs_ops = { | 4393 | static const struct sysfs_ops slab_sysfs_ops = { |
4523 | .show = slab_attr_show, | 4394 | .show = slab_attr_show, |
4524 | .store = slab_attr_store, | 4395 | .store = slab_attr_store, |
4525 | }; | 4396 | }; |
@@ -4538,7 +4409,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj) | |||
4538 | return 0; | 4409 | return 0; |
4539 | } | 4410 | } |
4540 | 4411 | ||
4541 | static struct kset_uevent_ops slab_uevent_ops = { | 4412 | static const struct kset_uevent_ops slab_uevent_ops = { |
4542 | .filter = uevent_filter, | 4413 | .filter = uevent_filter, |
4543 | }; | 4414 | }; |
4544 | 4415 | ||
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index d9714bdcb4a3..392b9bb5bc01 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c | |||
@@ -40,9 +40,11 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, | |||
40 | unsigned long align, | 40 | unsigned long align, |
41 | unsigned long goal) | 41 | unsigned long goal) |
42 | { | 42 | { |
43 | return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); | 43 | return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); |
44 | } | 44 | } |
45 | 45 | ||
46 | static void *vmemmap_buf; | ||
47 | static void *vmemmap_buf_end; | ||
46 | 48 | ||
47 | void * __meminit vmemmap_alloc_block(unsigned long size, int node) | 49 | void * __meminit vmemmap_alloc_block(unsigned long size, int node) |
48 | { | 50 | { |
@@ -64,6 +66,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) | |||
64 | __pa(MAX_DMA_ADDRESS)); | 66 | __pa(MAX_DMA_ADDRESS)); |
65 | } | 67 | } |
66 | 68 | ||
69 | /* all callers need to pass the same size during the early stage */ | ||
70 | void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) | ||
71 | { | ||
72 | void *ptr; | ||
73 | |||
74 | if (!vmemmap_buf) | ||
75 | return vmemmap_alloc_block(size, node); | ||
76 | |||
77 | /* take it from the buf */ | ||
78 | ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size); | ||
79 | if (ptr + size > vmemmap_buf_end) | ||
80 | return vmemmap_alloc_block(size, node); | ||
81 | |||
82 | vmemmap_buf = ptr + size; | ||
83 | |||
84 | return ptr; | ||
85 | } | ||
86 | |||
67 | void __meminit vmemmap_verify(pte_t *pte, int node, | 87 | void __meminit vmemmap_verify(pte_t *pte, int node, |
68 | unsigned long start, unsigned long end) | 88 | unsigned long start, unsigned long end) |
69 | { | 89 | { |
@@ -80,7 +100,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) | |||
80 | pte_t *pte = pte_offset_kernel(pmd, addr); | 100 | pte_t *pte = pte_offset_kernel(pmd, addr); |
81 | if (pte_none(*pte)) { | 101 | if (pte_none(*pte)) { |
82 | pte_t entry; | 102 | pte_t entry; |
83 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | 103 | void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node); |
84 | if (!p) | 104 | if (!p) |
85 | return NULL; | 105 | return NULL; |
86 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); | 106 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); |
@@ -163,3 +183,55 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) | |||
163 | 183 | ||
164 | return map; | 184 | return map; |
165 | } | 185 | } |
186 | |||
187 | void __init sparse_mem_maps_populate_node(struct page **map_map, | ||
188 | unsigned long pnum_begin, | ||
189 | unsigned long pnum_end, | ||
190 | unsigned long map_count, int nodeid) | ||
191 | { | ||
192 | unsigned long pnum; | ||
193 | unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; | ||
194 | void *vmemmap_buf_start; | ||
195 | |||
196 | size = ALIGN(size, PMD_SIZE); | ||
197 | vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count, | ||
198 | PMD_SIZE, __pa(MAX_DMA_ADDRESS)); | ||
199 | |||
200 | if (vmemmap_buf_start) { | ||
201 | vmemmap_buf = vmemmap_buf_start; | ||
202 | vmemmap_buf_end = vmemmap_buf_start + size * map_count; | ||
203 | } | ||
204 | |||
205 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
206 | struct mem_section *ms; | ||
207 | |||
208 | if (!present_section_nr(pnum)) | ||
209 | continue; | ||
210 | |||
211 | map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); | ||
212 | if (map_map[pnum]) | ||
213 | continue; | ||
214 | ms = __nr_to_section(pnum); | ||
215 | printk(KERN_ERR "%s: sparsemem memory map backing failed, " | ||
216 | "some memory will not be available.\n", __func__); | ||
217 | ms->section_mem_map = 0; | ||
218 | } | ||
219 | |||
220 | if (vmemmap_buf_start) { | ||
221 | /* need to free the leftover buf */ | ||
222 | #ifdef CONFIG_NO_BOOTMEM | ||
223 | free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end)); | ||
224 | if (vmemmap_buf_start < vmemmap_buf) { | ||
225 | char name[15]; | ||
226 | |||
227 | snprintf(name, sizeof(name), "MEMMAP %d", nodeid); | ||
228 | reserve_early_without_check(__pa(vmemmap_buf_start), | ||
229 | __pa(vmemmap_buf), name); | ||
230 | } | ||
231 | #else | ||
232 | free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); | ||
233 | #endif | ||
234 | vmemmap_buf = NULL; | ||
235 | vmemmap_buf_end = NULL; | ||
236 | } | ||
237 | } | ||
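sparse_mem_maps_populate_node() together with vmemmap_alloc_block_buf() forms a bootstrap bump allocator: reserve one large PMD-aligned buffer per node, carve aligned chunks out of it for each section's memmap, and fall back to the regular allocation path once the buffer is exhausted (the unused tail is freed afterwards). A compact standalone sketch of the carve step, with malloc() standing in for the fallback allocator:

#include <stdint.h>
#include <stdlib.h>

/* size must be a power of two, as PAGE_SIZE is */
#define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((uintptr_t)(a) - 1))

static char *buf_cur;   /* current carve position, NULL if no buffer */
static char *buf_end;   /* one past the end of the buffer            */

static void *carve(size_t size)
{
        char *ptr;

        if (!buf_cur)
                return malloc(size);          /* stands in for the regular
                                                 allocator               */

        ptr = (char *)ALIGN_UP((uintptr_t)buf_cur, size);
        if (ptr + size > buf_end)
                return malloc(size);          /* buffer exhausted        */

        buf_cur = ptr + size;                 /* bump past the chunk     */
        return ptr;
}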
diff --git a/mm/sparse.c b/mm/sparse.c index 6ce4aab69e99..22896d589133 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -271,7 +271,8 @@ static unsigned long *__kmalloc_section_usemap(void) | |||
271 | 271 | ||
272 | #ifdef CONFIG_MEMORY_HOTREMOVE | 272 | #ifdef CONFIG_MEMORY_HOTREMOVE |
273 | static unsigned long * __init | 273 | static unsigned long * __init |
274 | sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | 274 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
275 | unsigned long count) | ||
275 | { | 276 | { |
276 | unsigned long section_nr; | 277 | unsigned long section_nr; |
277 | 278 | ||
@@ -286,7 +287,7 @@ sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | |||
286 | * this problem. | 287 | * this problem. |
287 | */ | 288 | */ |
288 | section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); | 289 | section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); |
289 | return alloc_bootmem_section(usemap_size(), section_nr); | 290 | return alloc_bootmem_section(usemap_size() * count, section_nr); |
290 | } | 291 | } |
291 | 292 | ||
292 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | 293 | static void __init check_usemap_section_nr(int nid, unsigned long *usemap) |
@@ -329,7 +330,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | |||
329 | } | 330 | } |
330 | #else | 331 | #else |
331 | static unsigned long * __init | 332 | static unsigned long * __init |
332 | sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) | 333 | sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, |
334 | unsigned long count) | ||
333 | { | 335 | { |
334 | return NULL; | 336 | return NULL; |
335 | } | 337 | } |
@@ -339,27 +341,40 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) | |||
339 | } | 341 | } |
340 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | 342 | #endif /* CONFIG_MEMORY_HOTREMOVE */ |
341 | 343 | ||
342 | static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) | 344 | static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, |
345 | unsigned long pnum_begin, | ||
346 | unsigned long pnum_end, | ||
347 | unsigned long usemap_count, int nodeid) | ||
343 | { | 348 | { |
344 | unsigned long *usemap; | 349 | void *usemap; |
345 | struct mem_section *ms = __nr_to_section(pnum); | 350 | unsigned long pnum; |
346 | int nid = sparse_early_nid(ms); | 351 | int size = usemap_size(); |
347 | |||
348 | usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid)); | ||
349 | if (usemap) | ||
350 | return usemap; | ||
351 | 352 | ||
352 | usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); | 353 | usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), |
354 | usemap_count); | ||
353 | if (usemap) { | 355 | if (usemap) { |
354 | check_usemap_section_nr(nid, usemap); | 356 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { |
355 | return usemap; | 357 | if (!present_section_nr(pnum)) |
358 | continue; | ||
359 | usemap_map[pnum] = usemap; | ||
360 | usemap += size; | ||
361 | } | ||
362 | return; | ||
356 | } | 363 | } |
357 | 364 | ||
358 | /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ | 365 | usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); |
359 | nid = 0; | 366 | if (usemap) { |
367 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
368 | if (!present_section_nr(pnum)) | ||
369 | continue; | ||
370 | usemap_map[pnum] = usemap; | ||
371 | usemap += size; | ||
372 | check_usemap_section_nr(nodeid, usemap_map[pnum]); | ||
373 | } | ||
374 | return; | ||
375 | } | ||
360 | 376 | ||
361 | printk(KERN_WARNING "%s: allocation failed\n", __func__); | 377 | printk(KERN_WARNING "%s: allocation failed\n", __func__); |
362 | return NULL; | ||
363 | } | 378 | } |
364 | 379 | ||
365 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | 380 | #ifndef CONFIG_SPARSEMEM_VMEMMAP |
@@ -375,8 +390,65 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | |||
375 | PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); | 390 | PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); |
376 | return map; | 391 | return map; |
377 | } | 392 | } |
393 | void __init sparse_mem_maps_populate_node(struct page **map_map, | ||
394 | unsigned long pnum_begin, | ||
395 | unsigned long pnum_end, | ||
396 | unsigned long map_count, int nodeid) | ||
397 | { | ||
398 | void *map; | ||
399 | unsigned long pnum; | ||
400 | unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; | ||
401 | |||
402 | map = alloc_remap(nodeid, size * map_count); | ||
403 | if (map) { | ||
404 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
405 | if (!present_section_nr(pnum)) | ||
406 | continue; | ||
407 | map_map[pnum] = map; | ||
408 | map += size; | ||
409 | } | ||
410 | return; | ||
411 | } | ||
412 | |||
413 | size = PAGE_ALIGN(size); | ||
414 | map = alloc_bootmem_pages_node(NODE_DATA(nodeid), size * map_count); | ||
415 | if (map) { | ||
416 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
417 | if (!present_section_nr(pnum)) | ||
418 | continue; | ||
419 | map_map[pnum] = map; | ||
420 | map += size; | ||
421 | } | ||
422 | return; | ||
423 | } | ||
424 | |||
425 | /* fallback */ | ||
426 | for (pnum = pnum_begin; pnum < pnum_end; pnum++) { | ||
427 | struct mem_section *ms; | ||
428 | |||
429 | if (!present_section_nr(pnum)) | ||
430 | continue; | ||
431 | map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); | ||
432 | if (map_map[pnum]) | ||
433 | continue; | ||
434 | ms = __nr_to_section(pnum); | ||
435 | printk(KERN_ERR "%s: sparsemem memory map backing failed, " | ||
436 | "some memory will not be available.\n", __func__); | ||
437 | ms->section_mem_map = 0; | ||
438 | } | ||
439 | } | ||
378 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | 440 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ |
379 | 441 | ||
442 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
443 | static void __init sparse_early_mem_maps_alloc_node(struct page **map_map, | ||
444 | unsigned long pnum_begin, | ||
445 | unsigned long pnum_end, | ||
446 | unsigned long map_count, int nodeid) | ||
447 | { | ||
448 | sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, | ||
449 | map_count, nodeid); | ||
450 | } | ||
451 | #else | ||
380 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | 452 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) |
381 | { | 453 | { |
382 | struct page *map; | 454 | struct page *map; |
@@ -392,10 +464,12 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | |||
392 | ms->section_mem_map = 0; | 464 | ms->section_mem_map = 0; |
393 | return NULL; | 465 | return NULL; |
394 | } | 466 | } |
467 | #endif | ||
395 | 468 | ||
396 | void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) | 469 | void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) |
397 | { | 470 | { |
398 | } | 471 | } |
472 | |||
399 | /* | 473 | /* |
400 | * Allocate the accumulated non-linear sections, allocate a mem_map | 474 | * Allocate the accumulated non-linear sections, allocate a mem_map |
401 | * for each and record the physical to section mapping. | 475 | * for each and record the physical to section mapping. |
@@ -407,6 +481,14 @@ void __init sparse_init(void) | |||
407 | unsigned long *usemap; | 481 | unsigned long *usemap; |
408 | unsigned long **usemap_map; | 482 | unsigned long **usemap_map; |
409 | int size; | 483 | int size; |
484 | int nodeid_begin = 0; | ||
485 | unsigned long pnum_begin = 0; | ||
486 | unsigned long usemap_count; | ||
487 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
488 | unsigned long map_count; | ||
489 | int size2; | ||
490 | struct page **map_map; | ||
491 | #endif | ||
410 | 492 | ||
411 | /* | 493 | /* |
412 | * map is using big page (aka 2M in x86 64 bit) | 494 | * map is using big page (aka 2M in x86 64 bit) |
@@ -425,10 +507,81 @@ void __init sparse_init(void) | |||
425 | panic("can not allocate usemap_map\n"); | 507 | panic("can not allocate usemap_map\n"); |
426 | 508 | ||
427 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | 509 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { |
510 | struct mem_section *ms; | ||
511 | |||
428 | if (!present_section_nr(pnum)) | 512 | if (!present_section_nr(pnum)) |
429 | continue; | 513 | continue; |
430 | usemap_map[pnum] = sparse_early_usemap_alloc(pnum); | 514 | ms = __nr_to_section(pnum); |
515 | nodeid_begin = sparse_early_nid(ms); | ||
516 | pnum_begin = pnum; | ||
517 | break; | ||
431 | } | 518 | } |
519 | usemap_count = 1; | ||
520 | for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { | ||
521 | struct mem_section *ms; | ||
522 | int nodeid; | ||
523 | |||
524 | if (!present_section_nr(pnum)) | ||
525 | continue; | ||
526 | ms = __nr_to_section(pnum); | ||
527 | nodeid = sparse_early_nid(ms); | ||
528 | if (nodeid == nodeid_begin) { | ||
529 | usemap_count++; | ||
530 | continue; | ||
531 | } | ||
532 | /* ok, we need to take care of the range from pnum_begin to pnum - 1 */ | ||
533 | sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum, | ||
534 | usemap_count, nodeid_begin); | ||
535 | /* new start, update count etc. */ | ||
536 | nodeid_begin = nodeid; | ||
537 | pnum_begin = pnum; | ||
538 | usemap_count = 1; | ||
539 | } | ||
540 | /* ok, last chunk */ | ||
541 | sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS, | ||
542 | usemap_count, nodeid_begin); | ||
543 | |||
544 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
545 | size2 = sizeof(struct page *) * NR_MEM_SECTIONS; | ||
546 | map_map = alloc_bootmem(size2); | ||
547 | if (!map_map) | ||
548 | panic("can not allocate map_map\n"); | ||
549 | |||
550 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | ||
551 | struct mem_section *ms; | ||
552 | |||
553 | if (!present_section_nr(pnum)) | ||
554 | continue; | ||
555 | ms = __nr_to_section(pnum); | ||
556 | nodeid_begin = sparse_early_nid(ms); | ||
557 | pnum_begin = pnum; | ||
558 | break; | ||
559 | } | ||
560 | map_count = 1; | ||
561 | for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { | ||
562 | struct mem_section *ms; | ||
563 | int nodeid; | ||
564 | |||
565 | if (!present_section_nr(pnum)) | ||
566 | continue; | ||
567 | ms = __nr_to_section(pnum); | ||
568 | nodeid = sparse_early_nid(ms); | ||
569 | if (nodeid == nodeid_begin) { | ||
570 | map_count++; | ||
571 | continue; | ||
572 | } | ||
573 | /* ok, we need to take care of the range from pnum_begin to pnum - 1 */ | ||
574 | sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum, | ||
575 | map_count, nodeid_begin); | ||
576 | /* new start, update count etc. */ | ||
577 | nodeid_begin = nodeid; | ||
578 | pnum_begin = pnum; | ||
579 | map_count = 1; | ||
580 | } | ||
581 | /* ok, last chunk */ | ||
582 | sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS, | ||
583 | map_count, nodeid_begin); | ||
584 | #endif | ||
432 | 585 | ||
433 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | 586 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { |
434 | if (!present_section_nr(pnum)) | 587 | if (!present_section_nr(pnum)) |
@@ -438,7 +591,11 @@ void __init sparse_init(void) | |||
438 | if (!usemap) | 591 | if (!usemap) |
439 | continue; | 592 | continue; |
440 | 593 | ||
594 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
595 | map = map_map[pnum]; | ||
596 | #else | ||
441 | map = sparse_early_mem_map_alloc(pnum); | 597 | map = sparse_early_mem_map_alloc(pnum); |
598 | #endif | ||
442 | if (!map) | 599 | if (!map) |
443 | continue; | 600 | continue; |
444 | 601 | ||
@@ -448,6 +605,9 @@ void __init sparse_init(void) | |||
448 | 605 | ||
449 | vmemmap_populate_print_last(); | 606 | vmemmap_populate_print_last(); |
450 | 607 | ||
608 | #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER | ||
609 | free_bootmem(__pa(map_map), size2); | ||
610 | #endif | ||
451 | free_bootmem(__pa(usemap_map), size); | 611 | free_bootmem(__pa(usemap_map), size); |
452 | } | 612 | } |
453 | 613 | ||
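The two walks above share one pattern: scan the present sections in order, grow a [pnum_begin, pnum) window while the node id stays the same, and hand the whole run to a per-node allocator when the node changes, plus once more for the final run. The standalone sketch below only models that walk; NR_SECTIONS, section_nid[] and alloc_batch() are toy stand-ins invented here, not kernel interfaces.

/* Illustrative model of the per-node batching used for usemap_map[] and
 * map_map[] above; the section-to-node data is made up. */
#include <stdio.h>

#define NR_SECTIONS 12
/* -1 marks a hole (section not present); other values are node ids. */
static const int section_nid[NR_SECTIONS] = {
	0, 0, -1, 0, 1, 1, -1, 1, 1, 2, -1, 2
};

static void alloc_batch(int nid, int pnum_begin, int pnum_end, int count)
{
	/* stand-in for sparse_early_usemaps_alloc_node() and friends */
	printf("node %d: sections [%d, %d), %d present\n",
	       nid, pnum_begin, pnum_end, count);
}

int main(void)
{
	int pnum, pnum_begin = 0, nid_begin = -1, count = 0;

	for (pnum = 0; pnum < NR_SECTIONS; pnum++) {
		if (section_nid[pnum] < 0)
			continue;		/* skip holes */
		if (count == 0) {		/* first present section */
			nid_begin = section_nid[pnum];
			pnum_begin = pnum;
			count = 1;
			continue;
		}
		if (section_nid[pnum] == nid_begin) {
			count++;
			continue;
		}
		/* node changed: flush [pnum_begin, pnum), then start over */
		alloc_batch(nid_begin, pnum_begin, pnum, count);
		nid_begin = section_nid[pnum];
		pnum_begin = pnum;
		count = 1;
	}
	if (count)				/* last chunk */
		alloc_batch(nid_begin, pnum_begin, NR_SECTIONS, count);
	return 0;
}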
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -55,7 +55,7 @@ static void __page_cache_release(struct page *page) | |||
55 | del_page_from_lru(zone, page); | 55 | del_page_from_lru(zone, page); |
56 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 56 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
57 | } | 57 | } |
58 | free_hot_page(page); | 58 | free_hot_cold_page(page, 0); |
59 | } | 59 | } |
60 | 60 | ||
61 | static void put_compound_page(struct page *page) | 61 | static void put_compound_page(struct page *page) |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 6c0585b16418..6cd0a8f90dc7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -723,6 +723,37 @@ int free_swap_and_cache(swp_entry_t entry) | |||
723 | return p != NULL; | 723 | return p != NULL; |
724 | } | 724 | } |
725 | 725 | ||
726 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
727 | /** | ||
728 | * mem_cgroup_count_swap_user - count the users of a swap entry | ||
729 | * @ent: the swap entry to be checked | ||
730 | * @pagep: where to return the entry's swap cache page, if any | ||
731 | * | ||
732 | * Returns the number of users of the swap entry. The count is valid only | ||
733 | * for swaps of anonymous pages. | ||
734 | * If the entry is found in the swap cache, the page is stored to @pagep | ||
735 | * with its refcount incremented. | ||
736 | */ | ||
737 | int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) | ||
738 | { | ||
739 | struct page *page; | ||
740 | struct swap_info_struct *p; | ||
741 | int count = 0; | ||
742 | |||
743 | page = find_get_page(&swapper_space, ent.val); | ||
744 | if (page) | ||
745 | count += page_mapcount(page); | ||
746 | p = swap_info_get(ent); | ||
747 | if (p) { | ||
748 | count += swap_count(p->swap_map[swp_offset(ent)]); | ||
749 | spin_unlock(&swap_lock); | ||
750 | } | ||
751 | |||
752 | *pagep = page; | ||
753 | return count; | ||
754 | } | ||
755 | #endif | ||
756 | |||
726 | #ifdef CONFIG_HIBERNATION | 757 | #ifdef CONFIG_HIBERNATION |
727 | /* | 758 | /* |
728 | * Find the swap type that corresponds to given device (if any). | 759 | * Find the swap type that corresponds to given device (if any). |
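The helper above hands back both a user count and, via @pagep, a referenced swap cache page. The sketch below is a hypothetical caller, written here only to illustrate the reference contract; swap_entry_users() is not part of this patch, and it is kernel-style pseudocode rather than a standalone program.

/* Hypothetical caller: whatever page comes back through @pagep carries the
 * extra reference taken by find_get_page(), so it must be dropped. */
static int swap_entry_users(swp_entry_t ent)
{
	struct page *page = NULL;
	int count;

	count = mem_cgroup_count_swap_user(ent, &page);
	if (page)
		put_page(page);		/* only the count was wanted here */
	return count;
}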
@@ -840,7 +871,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, | |||
840 | goto out; | 871 | goto out; |
841 | } | 872 | } |
842 | 873 | ||
843 | inc_mm_counter(vma->vm_mm, anon_rss); | 874 | dec_mm_counter(vma->vm_mm, MM_SWAPENTS); |
875 | inc_mm_counter(vma->vm_mm, MM_ANONPAGES); | ||
844 | get_page(page); | 876 | get_page(page); |
845 | set_pte_at(vma->vm_mm, addr, pte, | 877 | set_pte_at(vma->vm_mm, addr, pte, |
846 | pte_mkold(mk_pte(page, vma->vm_page_prot))); | 878 | pte_mkold(mk_pte(page, vma->vm_page_prot))); |
@@ -1759,11 +1791,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1759 | unsigned int type; | 1791 | unsigned int type; |
1760 | int i, prev; | 1792 | int i, prev; |
1761 | int error; | 1793 | int error; |
1762 | union swap_header *swap_header = NULL; | 1794 | union swap_header *swap_header; |
1763 | unsigned int nr_good_pages = 0; | 1795 | unsigned int nr_good_pages; |
1764 | int nr_extents = 0; | 1796 | int nr_extents = 0; |
1765 | sector_t span; | 1797 | sector_t span; |
1766 | unsigned long maxpages = 1; | 1798 | unsigned long maxpages; |
1767 | unsigned long swapfilepages; | 1799 | unsigned long swapfilepages; |
1768 | unsigned char *swap_map = NULL; | 1800 | unsigned char *swap_map = NULL; |
1769 | struct page *page = NULL; | 1801 | struct page *page = NULL; |
@@ -1922,9 +1954,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1922 | * swap pte. | 1954 | * swap pte. |
1923 | */ | 1955 | */ |
1924 | maxpages = swp_offset(pte_to_swp_entry( | 1956 | maxpages = swp_offset(pte_to_swp_entry( |
1925 | swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; | 1957 | swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; |
1926 | if (maxpages > swap_header->info.last_page) | 1958 | if (maxpages > swap_header->info.last_page) { |
1927 | maxpages = swap_header->info.last_page; | 1959 | maxpages = swap_header->info.last_page + 1; |
1960 | /* p->max is an unsigned int: don't overflow it */ | ||
1961 | if ((unsigned int)maxpages == 0) | ||
1962 | maxpages = UINT_MAX; | ||
1963 | } | ||
1928 | p->highest_bit = maxpages - 1; | 1964 | p->highest_bit = maxpages - 1; |
1929 | 1965 | ||
1930 | error = -EINVAL; | 1966 | error = -EINVAL; |
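The reworked calculation reads: the number of addressable swap slots is the largest offset an arch swap pte can encode plus one, clamped to last_page + 1 from the on-disk header, with a final guard because p->max is only an unsigned int. The standalone model below reproduces that arithmetic; max_encodable_offset is a made-up parameter standing in for the swp_entry/pte round trip, and the figures in main() are invented.

/* Model of the swapon() maxpages arithmetic above (assumes 64-bit long). */
#include <stdio.h>
#include <limits.h>

static unsigned long compute_maxpages(unsigned long max_encodable_offset,
				      unsigned long last_page)
{
	/* offsets run 0..max_encodable_offset, so that many + 1 slots fit */
	unsigned long maxpages = max_encodable_offset + 1;

	if (maxpages > last_page) {
		/* pages 0..last_page inclusive: last_page + 1 slots */
		maxpages = last_page + 1;
		/* p->max is an unsigned int: don't let it wrap to 0 */
		if ((unsigned int)maxpages == 0)
			maxpages = UINT_MAX;
	}
	return maxpages;
}

int main(void)
{
	/* small device: the header's last_page is the limiting factor */
	printf("%lu\n", compute_maxpages(0xffffffUL, 1000UL));
	/* huge header value: clamping would overflow an unsigned int */
	printf("%lu\n", compute_maxpages(ULONG_MAX - 1, 0xffffffffUL));
	return 0;
}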
@@ -1948,23 +1984,24 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) | |||
1948 | } | 1984 | } |
1949 | 1985 | ||
1950 | memset(swap_map, 0, maxpages); | 1986 | memset(swap_map, 0, maxpages); |
1987 | nr_good_pages = maxpages - 1; /* omit header page */ | ||
1988 | |||
1951 | for (i = 0; i < swap_header->info.nr_badpages; i++) { | 1989 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
1952 | int page_nr = swap_header->info.badpages[i]; | 1990 | unsigned int page_nr = swap_header->info.badpages[i]; |
1953 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { | 1991 | if (page_nr == 0 || page_nr > swap_header->info.last_page) { |
1954 | error = -EINVAL; | 1992 | error = -EINVAL; |
1955 | goto bad_swap; | 1993 | goto bad_swap; |
1956 | } | 1994 | } |
1957 | swap_map[page_nr] = SWAP_MAP_BAD; | 1995 | if (page_nr < maxpages) { |
1996 | swap_map[page_nr] = SWAP_MAP_BAD; | ||
1997 | nr_good_pages--; | ||
1998 | } | ||
1958 | } | 1999 | } |
1959 | 2000 | ||
1960 | error = swap_cgroup_swapon(type, maxpages); | 2001 | error = swap_cgroup_swapon(type, maxpages); |
1961 | if (error) | 2002 | if (error) |
1962 | goto bad_swap; | 2003 | goto bad_swap; |
1963 | 2004 | ||
1964 | nr_good_pages = swap_header->info.last_page - | ||
1965 | swap_header->info.nr_badpages - | ||
1966 | 1 /* header page */; | ||
1967 | |||
1968 | if (nr_good_pages) { | 2005 | if (nr_good_pages) { |
1969 | swap_map[0] = SWAP_MAP_BAD; | 2006 | swap_map[0] = SWAP_MAP_BAD; |
1970 | p->max = maxpages; | 2007 | p->max = maxpages; |
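With nr_good_pages seeded from maxpages - 1 (the header page is never usable), the bad-page loop only subtracts entries that actually fall inside the mapped range, while a bad page number of 0 or beyond last_page still marks the header as corrupt. A small standalone model of that accounting, with invented numbers:

/* Model of the bad-page bookkeeping above; -1 plays the role of -EINVAL. */
#include <stdio.h>

static long count_good_pages(unsigned long maxpages, unsigned long last_page,
			     const unsigned int *badpages, int nr_badpages)
{
	unsigned long nr_good_pages = maxpages - 1;	/* omit header page */
	int i;

	for (i = 0; i < nr_badpages; i++) {
		unsigned int page_nr = badpages[i];

		if (page_nr == 0 || page_nr > last_page)
			return -1;		/* corrupt swap header */
		if (page_nr < maxpages)
			nr_good_pages--;	/* inside the map: unusable */
	}
	return (long)nr_good_pages;
}

int main(void)
{
	unsigned int bad[] = { 3, 7, 2000 };

	/* last_page 1999: bad page 2000 lies outside it -> corrupt header */
	printf("%ld\n", count_good_pages(1024, 1999, bad, 3));
	/* last_page 2047: 2000 is legal but beyond maxpages, so ignored */
	printf("%ld\n", count_good_pages(1024, 2047, bad, 3));
	return 0;
}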
@@ -2155,7 +2192,11 @@ void swap_shmem_alloc(swp_entry_t entry) | |||
2155 | } | 2192 | } |
2156 | 2193 | ||
2157 | /* | 2194 | /* |
2158 | * increase reference count of swap entry by 1. | 2195 | * Increase reference count of swap entry by 1. |
2196 | * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required | ||
2197 | * but could not be atomically allocated. Returns 0, just as if it succeeded, | ||
2198 | * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which | ||
2199 | * might occur if a page table entry has got corrupted. | ||
2159 | */ | 2200 | */ |
2160 | int swap_duplicate(swp_entry_t entry) | 2201 | int swap_duplicate(swp_entry_t entry) |
2161 | { | 2202 | { |
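Per the new comment, -ENOMEM from swap_duplicate() only means there was no room to grow the count atomically. The kernel-style sketch below shows one way a caller can react; dup_swap_entry() is a name made up for this note, and the in-tree callers perform this retry at an outer level, but add_swap_count_continuation() is the real companion interface.

/* Illustrative retry pattern, not code from this patch. */
static int dup_swap_entry(swp_entry_t entry)
{
	while (swap_duplicate(entry) == -ENOMEM) {
		/* make room for a larger count, then try again */
		if (add_swap_count_continuation(entry, GFP_KERNEL))
			return -ENOMEM;
	}
	return 0;
}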
diff --git a/mm/vmscan.c b/mm/vmscan.c index c26986c85ce0..79c809895fba 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -262,27 +262,6 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | |||
262 | return ret; | 262 | return ret; |
263 | } | 263 | } |
264 | 264 | ||
265 | /* Called without lock on whether page is mapped, so answer is unstable */ | ||
266 | static inline int page_mapping_inuse(struct page *page) | ||
267 | { | ||
268 | struct address_space *mapping; | ||
269 | |||
270 | /* Page is in somebody's page tables. */ | ||
271 | if (page_mapped(page)) | ||
272 | return 1; | ||
273 | |||
274 | /* Be more reluctant to reclaim swapcache than pagecache */ | ||
275 | if (PageSwapCache(page)) | ||
276 | return 1; | ||
277 | |||
278 | mapping = page_mapping(page); | ||
279 | if (!mapping) | ||
280 | return 0; | ||
281 | |||
282 | /* File is mmap'd by somebody? */ | ||
283 | return mapping_mapped(mapping); | ||
284 | } | ||
285 | |||
286 | static inline int is_page_cache_freeable(struct page *page) | 265 | static inline int is_page_cache_freeable(struct page *page) |
287 | { | 266 | { |
288 | /* | 267 | /* |
@@ -579,6 +558,65 @@ redo: | |||
579 | put_page(page); /* drop ref from isolate */ | 558 | put_page(page); /* drop ref from isolate */ |
580 | } | 559 | } |
581 | 560 | ||
561 | enum page_references { | ||
562 | PAGEREF_RECLAIM, | ||
563 | PAGEREF_RECLAIM_CLEAN, | ||
564 | PAGEREF_KEEP, | ||
565 | PAGEREF_ACTIVATE, | ||
566 | }; | ||
567 | |||
568 | static enum page_references page_check_references(struct page *page, | ||
569 | struct scan_control *sc) | ||
570 | { | ||
571 | int referenced_ptes, referenced_page; | ||
572 | unsigned long vm_flags; | ||
573 | |||
574 | referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); | ||
575 | referenced_page = TestClearPageReferenced(page); | ||
576 | |||
577 | /* Lumpy reclaim - ignore references */ | ||
578 | if (sc->order > PAGE_ALLOC_COSTLY_ORDER) | ||
579 | return PAGEREF_RECLAIM; | ||
580 | |||
581 | /* | ||
582 | * Mlock lost the isolation race with us. Let try_to_unmap() | ||
583 | * move the page to the unevictable list. | ||
584 | */ | ||
585 | if (vm_flags & VM_LOCKED) | ||
586 | return PAGEREF_RECLAIM; | ||
587 | |||
588 | if (referenced_ptes) { | ||
589 | if (PageAnon(page)) | ||
590 | return PAGEREF_ACTIVATE; | ||
591 | /* | ||
592 | * All mapped pages start out with page table | ||
593 | * references from the instantiating fault, so we need | ||
594 | * to look twice if a mapped file page is used more | ||
595 | * than once. | ||
596 | * | ||
597 | * Mark it and spare it for another trip around the | ||
598 | * inactive list. Another page table reference will | ||
599 | * lead to its activation. | ||
600 | * | ||
601 | * Note: the mark is set for activated pages as well | ||
602 | * so that recently deactivated but used pages are | ||
603 | * quickly recovered. | ||
604 | */ | ||
605 | SetPageReferenced(page); | ||
606 | |||
607 | if (referenced_page) | ||
608 | return PAGEREF_ACTIVATE; | ||
609 | |||
610 | return PAGEREF_KEEP; | ||
611 | } | ||
612 | |||
613 | /* Reclaim if clean, defer dirty pages to writeback */ | ||
614 | if (referenced_page) | ||
615 | return PAGEREF_RECLAIM_CLEAN; | ||
616 | |||
617 | return PAGEREF_RECLAIM; | ||
618 | } | ||
619 | |||
582 | /* | 620 | /* |
583 | * shrink_page_list() returns the number of reclaimed pages | 621 | * shrink_page_list() returns the number of reclaimed pages |
584 | */ | 622 | */ |
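page_check_references() folds the old referenced/page_mapping_inuse/VM_LOCKED tests into one ordered decision. The standalone model below mirrors that order so the four outcomes can be exercised in isolation; it deliberately omits the SetPageReferenced() side effect on mapped file pages, and all the flag plumbing is reduced to plain ints.

/* Model of the reference decision above; inputs are simple flags. */
#include <stdio.h>

enum page_references {
	PAGEREF_RECLAIM,
	PAGEREF_RECLAIM_CLEAN,
	PAGEREF_KEEP,
	PAGEREF_ACTIVATE,
};

static enum page_references check_refs(int lumpy, int vm_locked, int anon,
				       int referenced_ptes, int referenced_page)
{
	if (lumpy)			/* lumpy reclaim ignores references */
		return PAGEREF_RECLAIM;
	if (vm_locked)			/* let try_to_unmap() sort it out */
		return PAGEREF_RECLAIM;
	if (referenced_ptes) {
		if (anon)
			return PAGEREF_ACTIVATE;
		/* mapped file page: first use only earns it another trip */
		return referenced_page ? PAGEREF_ACTIVATE : PAGEREF_KEEP;
	}
	return referenced_page ? PAGEREF_RECLAIM_CLEAN : PAGEREF_RECLAIM;
}

int main(void)
{
	/* referenced anon page -> 3 (PAGEREF_ACTIVATE) */
	printf("%d\n", check_refs(0, 0, 1, 1, 0));
	/* mapped file page referenced once -> 2 (PAGEREF_KEEP) */
	printf("%d\n", check_refs(0, 0, 0, 1, 0));
	/* unmapped but recently used cache page -> 1 (PAGEREF_RECLAIM_CLEAN) */
	printf("%d\n", check_refs(0, 0, 0, 0, 1));
	return 0;
}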
@@ -590,16 +628,15 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
590 | struct pagevec freed_pvec; | 628 | struct pagevec freed_pvec; |
591 | int pgactivate = 0; | 629 | int pgactivate = 0; |
592 | unsigned long nr_reclaimed = 0; | 630 | unsigned long nr_reclaimed = 0; |
593 | unsigned long vm_flags; | ||
594 | 631 | ||
595 | cond_resched(); | 632 | cond_resched(); |
596 | 633 | ||
597 | pagevec_init(&freed_pvec, 1); | 634 | pagevec_init(&freed_pvec, 1); |
598 | while (!list_empty(page_list)) { | 635 | while (!list_empty(page_list)) { |
636 | enum page_references references; | ||
599 | struct address_space *mapping; | 637 | struct address_space *mapping; |
600 | struct page *page; | 638 | struct page *page; |
601 | int may_enter_fs; | 639 | int may_enter_fs; |
602 | int referenced; | ||
603 | 640 | ||
604 | cond_resched(); | 641 | cond_resched(); |
605 | 642 | ||
@@ -641,17 +678,16 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
641 | goto keep_locked; | 678 | goto keep_locked; |
642 | } | 679 | } |
643 | 680 | ||
644 | referenced = page_referenced(page, 1, | 681 | references = page_check_references(page, sc); |
645 | sc->mem_cgroup, &vm_flags); | 682 | switch (references) { |
646 | /* | 683 | case PAGEREF_ACTIVATE: |
647 | * In active use or really unfreeable? Activate it. | ||
648 | * If page which have PG_mlocked lost isoltation race, | ||
649 | * try_to_unmap moves it to unevictable list | ||
650 | */ | ||
651 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && | ||
652 | referenced && page_mapping_inuse(page) | ||
653 | && !(vm_flags & VM_LOCKED)) | ||
654 | goto activate_locked; | 684 | goto activate_locked; |
685 | case PAGEREF_KEEP: | ||
686 | goto keep_locked; | ||
687 | case PAGEREF_RECLAIM: | ||
688 | case PAGEREF_RECLAIM_CLEAN: | ||
689 | ; /* try to reclaim the page below */ | ||
690 | } | ||
655 | 691 | ||
656 | /* | 692 | /* |
657 | * Anonymous process memory has backing store? | 693 | * Anonymous process memory has backing store? |
@@ -685,7 +721,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
685 | } | 721 | } |
686 | 722 | ||
687 | if (PageDirty(page)) { | 723 | if (PageDirty(page)) { |
688 | if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) | 724 | if (references == PAGEREF_RECLAIM_CLEAN) |
689 | goto keep_locked; | 725 | goto keep_locked; |
690 | if (!may_enter_fs) | 726 | if (!may_enter_fs) |
691 | goto keep_locked; | 727 | goto keep_locked; |
@@ -1350,9 +1386,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
1350 | continue; | 1386 | continue; |
1351 | } | 1387 | } |
1352 | 1388 | ||
1353 | /* page_referenced clears PageReferenced */ | 1389 | if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { |
1354 | if (page_mapping_inuse(page) && | ||
1355 | page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { | ||
1356 | nr_rotated++; | 1390 | nr_rotated++; |
1357 | /* | 1391 | /* |
1358 | * Identify referenced, file-backed active pages and | 1392 | * Identify referenced, file-backed active pages and |
@@ -1501,6 +1535,13 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, | |||
1501 | unsigned long ap, fp; | 1535 | unsigned long ap, fp; |
1502 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1536 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1503 | 1537 | ||
1538 | /* If we have no swap space, do not bother scanning anon pages. */ | ||
1539 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | ||
1540 | percent[0] = 0; | ||
1541 | percent[1] = 100; | ||
1542 | return; | ||
1543 | } | ||
1544 | |||
1504 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + | 1545 | anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + |
1505 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); | 1546 | zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); |
1506 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + | 1547 | file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + |
@@ -1598,22 +1639,20 @@ static void shrink_zone(int priority, struct zone *zone, | |||
1598 | unsigned long nr_reclaimed = sc->nr_reclaimed; | 1639 | unsigned long nr_reclaimed = sc->nr_reclaimed; |
1599 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; | 1640 | unsigned long nr_to_reclaim = sc->nr_to_reclaim; |
1600 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); | 1641 | struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); |
1601 | int noswap = 0; | ||
1602 | 1642 | ||
1603 | /* If we have no swap space, do not bother scanning anon pages. */ | 1643 | get_scan_ratio(zone, sc, percent); |
1604 | if (!sc->may_swap || (nr_swap_pages <= 0)) { | ||
1605 | noswap = 1; | ||
1606 | percent[0] = 0; | ||
1607 | percent[1] = 100; | ||
1608 | } else | ||
1609 | get_scan_ratio(zone, sc, percent); | ||
1610 | 1644 | ||
1611 | for_each_evictable_lru(l) { | 1645 | for_each_evictable_lru(l) { |
1612 | int file = is_file_lru(l); | 1646 | int file = is_file_lru(l); |
1613 | unsigned long scan; | 1647 | unsigned long scan; |
1614 | 1648 | ||
1649 | if (percent[file] == 0) { | ||
1650 | nr[l] = 0; | ||
1651 | continue; | ||
1652 | } | ||
1653 | |||
1615 | scan = zone_nr_lru_pages(zone, sc, l); | 1654 | scan = zone_nr_lru_pages(zone, sc, l); |
1616 | if (priority || noswap) { | 1655 | if (priority) { |
1617 | scan >>= priority; | 1656 | scan >>= priority; |
1618 | scan = (scan * percent[file]) / 100; | 1657 | scan = (scan * percent[file]) / 100; |
1619 | } | 1658 | } |
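With the no-swap case folded into get_scan_ratio(), shrink_zone() only needs to skip an LRU whose percentage is zero and otherwise scale the LRU size by priority and percentage. A standalone model of that per-LRU arithmetic, with invented page counts:

/* Model of the scan-target calculation above. */
#include <stdio.h>

static unsigned long scan_target(unsigned long lru_pages, int priority,
				 unsigned long percent)
{
	unsigned long scan;

	if (percent == 0)
		return 0;		/* e.g. anon LRUs with no swap */
	scan = lru_pages;
	if (priority) {
		scan >>= priority;	/* smaller slice when pressure is low */
		scan = (scan * percent) / 100;
	}
	return scan;
}

int main(void)
{
	/* 1,000,000 file pages at relaxed priority 12, 70% file pressure */
	printf("%lu\n", scan_target(1000000, 12, 70));	/* 244 -> 170 */
	/* same LRU at priority 0: scan it all, percentage not applied */
	printf("%lu\n", scan_target(1000000, 0, 70));	/* 1000000 */
	/* anon LRU with no swap space: skipped entirely */
	printf("%lu\n", scan_target(500000, 12, 0));	/* 0 */
	return 0;
}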
@@ -1694,8 +1733,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist, | |||
1694 | continue; | 1733 | continue; |
1695 | note_zone_scanning_priority(zone, priority); | 1734 | note_zone_scanning_priority(zone, priority); |
1696 | 1735 | ||
1697 | if (zone_is_all_unreclaimable(zone) && | 1736 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
1698 | priority != DEF_PRIORITY) | ||
1699 | continue; /* Let kswapd poll it */ | 1737 | continue; /* Let kswapd poll it */ |
1700 | sc->all_unreclaimable = 0; | 1738 | sc->all_unreclaimable = 0; |
1701 | } else { | 1739 | } else { |
@@ -1922,7 +1960,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | |||
1922 | if (!populated_zone(zone)) | 1960 | if (!populated_zone(zone)) |
1923 | continue; | 1961 | continue; |
1924 | 1962 | ||
1925 | if (zone_is_all_unreclaimable(zone)) | 1963 | if (zone->all_unreclaimable) |
1926 | continue; | 1964 | continue; |
1927 | 1965 | ||
1928 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), | 1966 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), |
@@ -2012,8 +2050,7 @@ loop_again: | |||
2012 | if (!populated_zone(zone)) | 2050 | if (!populated_zone(zone)) |
2013 | continue; | 2051 | continue; |
2014 | 2052 | ||
2015 | if (zone_is_all_unreclaimable(zone) && | 2053 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
2016 | priority != DEF_PRIORITY) | ||
2017 | continue; | 2054 | continue; |
2018 | 2055 | ||
2019 | /* | 2056 | /* |
@@ -2056,13 +2093,9 @@ loop_again: | |||
2056 | if (!populated_zone(zone)) | 2093 | if (!populated_zone(zone)) |
2057 | continue; | 2094 | continue; |
2058 | 2095 | ||
2059 | if (zone_is_all_unreclaimable(zone) && | 2096 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
2060 | priority != DEF_PRIORITY) | ||
2061 | continue; | 2097 | continue; |
2062 | 2098 | ||
2063 | if (!zone_watermark_ok(zone, order, | ||
2064 | high_wmark_pages(zone), end_zone, 0)) | ||
2065 | all_zones_ok = 0; | ||
2066 | temp_priority[i] = priority; | 2099 | temp_priority[i] = priority; |
2067 | sc.nr_scanned = 0; | 2100 | sc.nr_scanned = 0; |
2068 | note_zone_scanning_priority(zone, priority); | 2101 | note_zone_scanning_priority(zone, priority); |
@@ -2087,12 +2120,11 @@ loop_again: | |||
2087 | lru_pages); | 2120 | lru_pages); |
2088 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; | 2121 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
2089 | total_scanned += sc.nr_scanned; | 2122 | total_scanned += sc.nr_scanned; |
2090 | if (zone_is_all_unreclaimable(zone)) | 2123 | if (zone->all_unreclaimable) |
2091 | continue; | 2124 | continue; |
2092 | if (nr_slab == 0 && zone->pages_scanned >= | 2125 | if (nr_slab == 0 && |
2093 | (zone_reclaimable_pages(zone) * 6)) | 2126 | zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6)) |
2094 | zone_set_flag(zone, | 2127 | zone->all_unreclaimable = 1; |
2095 | ZONE_ALL_UNRECLAIMABLE); | ||
2096 | /* | 2128 | /* |
2097 | * If we've done a decent amount of scanning and | 2129 | * If we've done a decent amount of scanning and |
2098 | * the reclaim ratio is low, start doing writepage | 2130 | * the reclaim ratio is low, start doing writepage |
@@ -2102,13 +2134,18 @@ loop_again: | |||
2102 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2134 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
2103 | sc.may_writepage = 1; | 2135 | sc.may_writepage = 1; |
2104 | 2136 | ||
2105 | /* | 2137 | if (!zone_watermark_ok(zone, order, |
2106 | * We are still under min water mark. it mean we have | 2138 | high_wmark_pages(zone), end_zone, 0)) { |
2107 | * GFP_ATOMIC allocation failure risk. Hurry up! | 2139 | all_zones_ok = 0; |
2108 | */ | 2140 | /* |
2109 | if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), | 2141 | * We are still under min water mark. This |
2110 | end_zone, 0)) | 2142 | * means that we have a GFP_ATOMIC allocation |
2111 | has_under_min_watermark_zone = 1; | 2143 | * failure risk. Hurry up! |
2144 | */ | ||
2145 | if (!zone_watermark_ok(zone, order, | ||
2146 | min_wmark_pages(zone), end_zone, 0)) | ||
2147 | has_under_min_watermark_zone = 1; | ||
2148 | } | ||
2112 | 2149 | ||
2113 | } | 2150 | } |
2114 | if (all_zones_ok) | 2151 | if (all_zones_ok) |
@@ -2550,6 +2587,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2550 | * and RECLAIM_SWAP. | 2587 | * and RECLAIM_SWAP. |
2551 | */ | 2588 | */ |
2552 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; | 2589 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; |
2590 | lockdep_set_current_reclaim_state(gfp_mask); | ||
2553 | reclaim_state.reclaimed_slab = 0; | 2591 | reclaim_state.reclaimed_slab = 0; |
2554 | p->reclaim_state = &reclaim_state; | 2592 | p->reclaim_state = &reclaim_state; |
2555 | 2593 | ||
@@ -2593,6 +2631,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2593 | 2631 | ||
2594 | p->reclaim_state = NULL; | 2632 | p->reclaim_state = NULL; |
2595 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 2633 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
2634 | lockdep_clear_current_reclaim_state(); | ||
2596 | return sc.nr_reclaimed >= nr_pages; | 2635 | return sc.nr_reclaimed >= nr_pages; |
2597 | } | 2636 | } |
2598 | 2637 | ||
@@ -2615,7 +2654,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
2615 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) | 2654 | zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) |
2616 | return ZONE_RECLAIM_FULL; | 2655 | return ZONE_RECLAIM_FULL; |
2617 | 2656 | ||
2618 | if (zone_is_all_unreclaimable(zone)) | 2657 | if (zone->all_unreclaimable) |
2619 | return ZONE_RECLAIM_FULL; | 2658 | return ZONE_RECLAIM_FULL; |
2620 | 2659 | ||
2621 | /* | 2660 | /* |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 6051fbab67ba..7f760cbc73f3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -139,7 +139,8 @@ static void refresh_zone_stat_thresholds(void) | |||
139 | threshold = calculate_threshold(zone); | 139 | threshold = calculate_threshold(zone); |
140 | 140 | ||
141 | for_each_online_cpu(cpu) | 141 | for_each_online_cpu(cpu) |
142 | zone_pcp(zone, cpu)->stat_threshold = threshold; | 142 | per_cpu_ptr(zone->pageset, cpu)->stat_threshold |
143 | = threshold; | ||
143 | } | 144 | } |
144 | } | 145 | } |
145 | 146 | ||
@@ -149,7 +150,8 @@ static void refresh_zone_stat_thresholds(void) | |||
149 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | 150 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, |
150 | int delta) | 151 | int delta) |
151 | { | 152 | { |
152 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); | 153 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); |
154 | |||
153 | s8 *p = pcp->vm_stat_diff + item; | 155 | s8 *p = pcp->vm_stat_diff + item; |
154 | long x; | 156 | long x; |
155 | 157 | ||
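For context on the vm_stat_diff/stat_threshold pair touched here: each cpu batches small deltas in its own pageset and folds them into the zone-wide counter only once the local drift exceeds the threshold. The folding itself is outside this hunk, so the toy model below reconstructs it from the function's known behavior; it is not a line-for-line copy, and the plain cpu indexing stands in for the real percpu accessors.

/* Toy model of threshold-based per-cpu counter folding. */
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4

struct pageset_model {
	signed char stat_diff;		/* like vm_stat_diff[item] (an s8) */
	int stat_threshold;
};

static long global_count;
static struct pageset_model pcp[NR_CPUS];

static void mod_state(int cpu, int delta)
{
	struct pageset_model *p = &pcp[cpu];	/* per_cpu_ptr() stand-in */
	long x = p->stat_diff + delta;

	if (labs(x) > p->stat_threshold) {	/* drift too big: fold it */
		global_count += x;
		x = 0;
	}
	p->stat_diff = (signed char)x;
}

int main(void)
{
	int cpu, i;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		pcp[cpu].stat_threshold = 32;	/* refresh_zone_stat_thresholds() */

	for (i = 0; i < 100; i++)
		mod_state(i % NR_CPUS, 2);	/* 200 pages added in total */

	/* the global counter lags by at most NR_CPUS * stat_threshold */
	printf("global=%ld (exact total is 200)\n", global_count);
	return 0;
}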
@@ -202,7 +204,7 @@ EXPORT_SYMBOL(mod_zone_page_state); | |||
202 | */ | 204 | */ |
203 | void __inc_zone_state(struct zone *zone, enum zone_stat_item item) | 205 | void __inc_zone_state(struct zone *zone, enum zone_stat_item item) |
204 | { | 206 | { |
205 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); | 207 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); |
206 | s8 *p = pcp->vm_stat_diff + item; | 208 | s8 *p = pcp->vm_stat_diff + item; |
207 | 209 | ||
208 | (*p)++; | 210 | (*p)++; |
@@ -223,7 +225,7 @@ EXPORT_SYMBOL(__inc_zone_page_state); | |||
223 | 225 | ||
224 | void __dec_zone_state(struct zone *zone, enum zone_stat_item item) | 226 | void __dec_zone_state(struct zone *zone, enum zone_stat_item item) |
225 | { | 227 | { |
226 | struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); | 228 | struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); |
227 | s8 *p = pcp->vm_stat_diff + item; | 229 | s8 *p = pcp->vm_stat_diff + item; |
228 | 230 | ||
229 | (*p)--; | 231 | (*p)--; |
@@ -300,7 +302,7 @@ void refresh_cpu_vm_stats(int cpu) | |||
300 | for_each_populated_zone(zone) { | 302 | for_each_populated_zone(zone) { |
301 | struct per_cpu_pageset *p; | 303 | struct per_cpu_pageset *p; |
302 | 304 | ||
303 | p = zone_pcp(zone, cpu); | 305 | p = per_cpu_ptr(zone->pageset, cpu); |
304 | 306 | ||
305 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 307 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
306 | if (p->vm_stat_diff[i]) { | 308 | if (p->vm_stat_diff[i]) { |
@@ -741,7 +743,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
741 | for_each_online_cpu(i) { | 743 | for_each_online_cpu(i) { |
742 | struct per_cpu_pageset *pageset; | 744 | struct per_cpu_pageset *pageset; |
743 | 745 | ||
744 | pageset = zone_pcp(zone, i); | 746 | pageset = per_cpu_ptr(zone->pageset, i); |
745 | seq_printf(m, | 747 | seq_printf(m, |
746 | "\n cpu: %i" | 748 | "\n cpu: %i" |
747 | "\n count: %i" | 749 | "\n count: %i" |
@@ -761,7 +763,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, | |||
761 | "\n prev_priority: %i" | 763 | "\n prev_priority: %i" |
762 | "\n start_pfn: %lu" | 764 | "\n start_pfn: %lu" |
763 | "\n inactive_ratio: %u", | 765 | "\n inactive_ratio: %u", |
764 | zone_is_all_unreclaimable(zone), | 766 | zone->all_unreclaimable, |
765 | zone->prev_priority, | 767 | zone->prev_priority, |
766 | zone->zone_start_pfn, | 768 | zone->zone_start_pfn, |
767 | zone->inactive_ratio); | 769 | zone->inactive_ratio); |
@@ -906,6 +908,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
906 | case CPU_ONLINE: | 908 | case CPU_ONLINE: |
907 | case CPU_ONLINE_FROZEN: | 909 | case CPU_ONLINE_FROZEN: |
908 | start_cpu_timer(cpu); | 910 | start_cpu_timer(cpu); |
911 | node_set_state(cpu_to_node(cpu), N_CPU); | ||
909 | break; | 912 | break; |
910 | case CPU_DOWN_PREPARE: | 913 | case CPU_DOWN_PREPARE: |
911 | case CPU_DOWN_PREPARE_FROZEN: | 914 | case CPU_DOWN_PREPARE_FROZEN: |