author    Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
committer Glenn Elliott <gelliott@cs.unc.edu>	2012-03-04 19:47:13 -0500
commit    c71c03bda1e86c9d5198c5d83f712e695c4f2a1e
tree      ecb166cb3e2b7e2adb3b5e292245fefd23381ac8 /mm
parent    ea53c912f8a86a8567697115b6a0d8152beee5c8
parent    6a00f206debf8a5c8899055726ad127dbeeed098

Merge branch 'mpi-master' into wip-k-fmlp

Conflicts:
	litmus/sched_cedf.c
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |   71
-rw-r--r--  mm/Kconfig.debug     |   25
-rw-r--r--  mm/Makefile          |   19
-rw-r--r--  mm/backing-dev.c     |   94
-rw-r--r--  mm/bootmem.c         |  193
-rw-r--r--  mm/cleancache.c      |  244
-rw-r--r--  mm/compaction.c      |  259
-rw-r--r--  mm/dmapool.c         |   18
-rw-r--r--  mm/filemap.c         |  370
-rw-r--r--  mm/filemap_xip.c     |    4
-rw-r--r--  mm/fremap.c          |    6
-rw-r--r--  mm/highmem.c         |   66
-rw-r--r--  mm/huge_memory.c     | 2391
-rw-r--r--  mm/hugetlb.c         |  338
-rw-r--r--  mm/hwpoison-inject.c |    2
-rw-r--r--  mm/init-mm.c         |    1
-rw-r--r--  mm/internal.h        |   16
-rw-r--r--  mm/kmemleak-test.c   |    6
-rw-r--r--  mm/kmemleak.c        |   26
-rw-r--r--  mm/ksm.c             |  126
-rw-r--r--  mm/maccess.c         |   10
-rw-r--r--  mm/madvise.c         |   10
-rw-r--r--  mm/memblock.c        |  944
-rw-r--r--  mm/memcontrol.c      | 1809
-rw-r--r--  mm/memory-failure.c  |  370
-rw-r--r--  mm/memory.c          | 1030
-rw-r--r--  mm/memory_hotplug.c  |  131
-rw-r--r--  mm/mempolicy.c       |  226
-rw-r--r--  mm/migrate.c         |  378
-rw-r--r--  mm/mincore.c         |    7
-rw-r--r--  mm/mlock.c           |  188
-rw-r--r--  mm/mmap.c            |  200
-rw-r--r--  mm/mmu_notifier.c    |   20
-rw-r--r--  mm/mmzone.c          |   21
-rw-r--r--  mm/mprotect.c        |   22
-rw-r--r--  mm/mremap.c          |   31
-rw-r--r--  mm/nobootmem.c       |  404
-rw-r--r--  mm/nommu.c           |  259
-rw-r--r--  mm/oom_kill.c        |  167
-rw-r--r--  mm/page-writeback.c  |   67
-rw-r--r--  mm/page_alloc.c      |  629
-rw-r--r--  mm/page_cgroup.c     |  235
-rw-r--r--  mm/page_io.c         |    2
-rw-r--r--  mm/page_isolation.c  |    3
-rw-r--r--  mm/pagewalk.c        |   28
-rw-r--r--  mm/percpu-km.c       |    8
-rw-r--r--  mm/percpu-vm.c       |    2
-rw-r--r--  mm/percpu.c          |  432
-rw-r--r--  mm/percpu_up.c       |   30
-rw-r--r--  mm/pgtable-generic.c |  121
-rw-r--r--  mm/prio_tree.c       |    1
-rw-r--r--  mm/readahead.c       |   20
-rw-r--r--  mm/rmap.c            |  523
-rw-r--r--  mm/shmem.c           |  607
-rw-r--r--  mm/slab.c            |  155
-rw-r--r--  mm/slob.c            |   15
-rw-r--r--  mm/slub.c            | 1283
-rw-r--r--  mm/sparse-vmemmap.c  |   13
-rw-r--r--  mm/sparse.c          |    6
-rw-r--r--  mm/swap.c            |  376
-rw-r--r--  mm/swap_state.c      |   11
-rw-r--r--  mm/swapfile.c        |  479
-rw-r--r--  mm/thrash.c          |  105
-rw-r--r--  mm/truncate.c        |   74
-rw-r--r--  mm/util.c            |   58
-rw-r--r--  mm/vmalloc.c         |  356
-rw-r--r--  mm/vmscan.c          |  853
-rw-r--r--  mm/vmstat.c          |  501
68 files changed, 12466 insertions(+), 5029 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index f0fb9124e410..8ca47a5ee9c8 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS
179config COMPACTION 179config COMPACTION
180 bool "Allow for memory compaction" 180 bool "Allow for memory compaction"
181 select MIGRATION 181 select MIGRATION
182 depends on EXPERIMENTAL && HUGETLB_PAGE && MMU 182 depends on MMU
183 help 183 help
184 Allows the compaction of memory for the allocation of huge pages. 184 Allows the compaction of memory for the allocation of huge pages.
185 185
@@ -301,3 +301,72 @@ config NOMMU_INITIAL_TRIM_EXCESS
301 of 1 says that all excess pages should be trimmed. 301 of 1 says that all excess pages should be trimmed.
302 302
303 See Documentation/nommu-mmap.txt for more information. 303 See Documentation/nommu-mmap.txt for more information.
304
305config TRANSPARENT_HUGEPAGE
306 bool "Transparent Hugepage Support"
307 depends on X86 && MMU
308 select COMPACTION
309 help
310 Transparent Hugepages allows the kernel to use huge pages and
311 huge tlb transparently to the applications whenever possible.
312 This feature can improve computing performance to certain
313 applications by speeding up page faults during memory
314 allocation, by reducing the number of tlb misses and by speeding
315 up the pagetable walking.
316
317 If memory constrained on embedded, you may want to say N.
318
319choice
320 prompt "Transparent Hugepage Support sysfs defaults"
321 depends on TRANSPARENT_HUGEPAGE
322 default TRANSPARENT_HUGEPAGE_ALWAYS
323 help
324 Selects the sysfs defaults for Transparent Hugepage Support.
325
326 config TRANSPARENT_HUGEPAGE_ALWAYS
327 bool "always"
328 help
329 Enabling Transparent Hugepage always, can increase the
330 memory footprint of applications without a guaranteed
331 benefit but it will work automatically for all applications.
332
333 config TRANSPARENT_HUGEPAGE_MADVISE
334 bool "madvise"
335 help
336 Enabling Transparent Hugepage madvise, will only provide a
337 performance improvement benefit to the applications using
338 madvise(MADV_HUGEPAGE) but it won't risk to increase the
339 memory footprint of applications without a guaranteed
340 benefit.
341endchoice
342
343#
344# UP and nommu archs use km based percpu allocator
345#
346config NEED_PER_CPU_KM
347 depends on !SMP
348 bool
349 default y
350
351config CLEANCACHE
352 bool "Enable cleancache driver to cache clean pages if tmem is present"
353 default n
354 help
355 Cleancache can be thought of as a page-granularity victim cache
356 for clean pages that the kernel's pageframe replacement algorithm
357 (PFRA) would like to keep around, but can't since there isn't enough
358 memory. So when the PFRA "evicts" a page, it first attempts to use
359 cleancache code to put the data contained in that page into
360 "transcendent memory", memory that is not directly accessible or
361 addressable by the kernel and is of unknown and possibly
362 time-varying size. And when a cleancache-enabled
363 filesystem wishes to access a page in a file on disk, it first
364 checks cleancache to see if it already contains it; if it does,
365 the page is copied into the kernel and a disk access is avoided.
366 When a transcendent memory driver is available (such as zcache or
367 Xen transcendent memory), a significant I/O reduction
368 may be achieved. When none is available, all cleancache calls
369 are reduced to a single pointer-compare-against-NULL resulting
370 in a negligible performance hit.
371
372 If unsure, say Y to enable cleancache
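
The TRANSPARENT_HUGEPAGE_MADVISE default described above only applies to ranges an application has hinted with madvise(MADV_HUGEPAGE). A minimal user-space sketch of that hint, assuming an x86 2 MiB huge page size (illustrative only, not part of this commit):

	#include <stdlib.h>
	#include <sys/mman.h>

	#ifndef MADV_HUGEPAGE
	#define MADV_HUGEPAGE 14	/* older libc headers may lack this */
	#endif

	int main(void)
	{
		size_t len = 64 * 1024 * 1024;	/* hugepage-aligned working buffer */
		void *buf = NULL;

		if (posix_memalign(&buf, 2 * 1024 * 1024, len))
			return 1;
		/* Mark the range as a huge page candidate; with the "madvise"
		 * sysfs default, only hinted ranges get transparent hugepages. */
		madvise(buf, len, MADV_HUGEPAGE);
		/* ... touch and use buf ... */
		free(buf);
		return 0;
	}
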
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index af7cfb43d2f0..8b1a477162dc 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,27 +1,24 @@
1config DEBUG_PAGEALLOC 1config DEBUG_PAGEALLOC
2 bool "Debug page memory allocations" 2 bool "Debug page memory allocations"
3 depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC 3 depends on DEBUG_KERNEL
4 depends on !HIBERNATION || !PPC && !SPARC 4 depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
5 depends on !KMEMCHECK 5 depends on !KMEMCHECK
6 select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
6 ---help--- 7 ---help---
7 Unmap pages from the kernel linear mapping after free_pages(). 8 Unmap pages from the kernel linear mapping after free_pages().
8 This results in a large slowdown, but helps to find certain types 9 This results in a large slowdown, but helps to find certain types
9 of memory corruption. 10 of memory corruption.
10 11
12 For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
13 fill the pages with poison patterns after free_pages() and verify
14 the patterns before alloc_pages(). Additionally,
15 this option cannot be enabled in combination with hibernation as
16 that would result in incorrect warnings of memory corruption after
17 a resume because free pages are not saved to the suspend image.
18
11config WANT_PAGE_DEBUG_FLAGS 19config WANT_PAGE_DEBUG_FLAGS
12 bool 20 bool
13 21
14config PAGE_POISONING 22config PAGE_POISONING
15 bool "Debug page memory allocations" 23 bool
16 depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC
17 depends on !HIBERNATION
18 select DEBUG_PAGEALLOC
19 select WANT_PAGE_DEBUG_FLAGS 24 select WANT_PAGE_DEBUG_FLAGS
20 ---help---
21 Fill the pages with poison patterns after free_pages() and verify
22 the patterns before alloc_pages(). This results in a large slowdown,
23 but helps to find certain types of memory corruption.
24
25 This option cannot be enabled in combination with hibernation as
26 that would result in incorrect warnings of memory corruption after
27 a resume because free pages are not saved to the suspend image.
diff --git a/mm/Makefile b/mm/Makefile
index 34b2546a9e37..836e4163c1bf 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,16 +5,22 @@
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o pagewalk.o 8 vmalloc.o pagewalk.o pgtable-generic.o
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o \ 11 maccess.o page_alloc.o page-writeback.o \
12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 12 readahead.o swap.o truncate.o vmscan.o shmem.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o mm_init.o mmu_context.o \ 14 page_isolation.o mm_init.o mmu_context.o percpu.o \
15 $(mmu-y) 15 $(mmu-y)
16obj-y += init-mm.o 16obj-y += init-mm.o
17 17
18ifdef CONFIG_NO_BOOTMEM
19 obj-y += nobootmem.o
20else
21 obj-y += bootmem.o
22endif
23
18obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 24obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
19 25
20obj-$(CONFIG_BOUNCE) += bounce.o 26obj-$(CONFIG_BOUNCE) += bounce.o
@@ -36,14 +42,11 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
36obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 42obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
37obj-$(CONFIG_FS_XIP) += filemap_xip.o 43obj-$(CONFIG_FS_XIP) += filemap_xip.o
38obj-$(CONFIG_MIGRATION) += migrate.o 44obj-$(CONFIG_MIGRATION) += migrate.o
39ifdef CONFIG_SMP
40obj-y += percpu.o
41else
42obj-y += percpu_up.o
43endif
44obj-$(CONFIG_QUICKLIST) += quicklist.o 45obj-$(CONFIG_QUICKLIST) += quicklist.o
46obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
45obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o 47obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
46obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o 48obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
47obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o 49obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
48obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o 50obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
49obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o 51obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
52obj-$(CONFIG_CLEANCACHE) += cleancache.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 65d420499a61..f032e6e1e09a 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,17 +14,11 @@
14 14
15static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); 15static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
16 16
17void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
18{
19}
20EXPORT_SYMBOL(default_unplug_io_fn);
21
22struct backing_dev_info default_backing_dev_info = { 17struct backing_dev_info default_backing_dev_info = {
23 .name = "default", 18 .name = "default",
24 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, 19 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
25 .state = 0, 20 .state = 0,
26 .capabilities = BDI_CAP_MAP_COPY, 21 .capabilities = BDI_CAP_MAP_COPY,
27 .unplug_io_fn = default_unplug_io_fn,
28}; 22};
29EXPORT_SYMBOL_GPL(default_backing_dev_info); 23EXPORT_SYMBOL_GPL(default_backing_dev_info);
30 24
@@ -69,18 +63,18 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
69 unsigned long background_thresh; 63 unsigned long background_thresh;
70 unsigned long dirty_thresh; 64 unsigned long dirty_thresh;
71 unsigned long bdi_thresh; 65 unsigned long bdi_thresh;
72 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; 66 unsigned long nr_dirty, nr_io, nr_more_io;
73 struct inode *inode; 67 struct inode *inode;
74 68
75 nr_wb = nr_dirty = nr_io = nr_more_io = 0; 69 nr_dirty = nr_io = nr_more_io = 0;
76 spin_lock(&inode_lock); 70 spin_lock(&inode_wb_list_lock);
77 list_for_each_entry(inode, &wb->b_dirty, i_list) 71 list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
78 nr_dirty++; 72 nr_dirty++;
79 list_for_each_entry(inode, &wb->b_io, i_list) 73 list_for_each_entry(inode, &wb->b_io, i_wb_list)
80 nr_io++; 74 nr_io++;
81 list_for_each_entry(inode, &wb->b_more_io, i_list) 75 list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
82 nr_more_io++; 76 nr_more_io++;
83 spin_unlock(&inode_lock); 77 spin_unlock(&inode_wb_list_lock);
84 78
85 global_dirty_limits(&background_thresh, &dirty_thresh); 79 global_dirty_limits(&background_thresh, &dirty_thresh);
86 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); 80 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
@@ -362,7 +356,7 @@ static int bdi_forker_thread(void *ptr)
362{ 356{
363 struct bdi_writeback *me = ptr; 357 struct bdi_writeback *me = ptr;
364 358
365 current->flags |= PF_FLUSHER | PF_SWAPWRITE; 359 current->flags |= PF_SWAPWRITE;
366 set_freezable(); 360 set_freezable();
367 361
368 /* 362 /*
@@ -604,7 +598,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
604 spin_lock(&sb_lock); 598 spin_lock(&sb_lock);
605 list_for_each_entry(sb, &super_blocks, s_list) { 599 list_for_each_entry(sb, &super_blocks, s_list) {
606 if (sb->s_bdi == bdi) 600 if (sb->s_bdi == bdi)
607 sb->s_bdi = NULL; 601 sb->s_bdi = &default_backing_dev_info;
608 } 602 }
609 spin_unlock(&sb_lock); 603 spin_unlock(&sb_lock);
610} 604}
@@ -682,11 +676,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
682 if (bdi_has_dirty_io(bdi)) { 676 if (bdi_has_dirty_io(bdi)) {
683 struct bdi_writeback *dst = &default_backing_dev_info.wb; 677 struct bdi_writeback *dst = &default_backing_dev_info.wb;
684 678
685 spin_lock(&inode_lock); 679 spin_lock(&inode_wb_list_lock);
686 list_splice(&bdi->wb.b_dirty, &dst->b_dirty); 680 list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
687 list_splice(&bdi->wb.b_io, &dst->b_io); 681 list_splice(&bdi->wb.b_io, &dst->b_io);
688 list_splice(&bdi->wb.b_more_io, &dst->b_more_io); 682 list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
689 spin_unlock(&inode_lock); 683 spin_unlock(&inode_wb_list_lock);
690 } 684 }
691 685
692 bdi_unregister(bdi); 686 bdi_unregister(bdi);
@@ -729,6 +723,7 @@ static wait_queue_head_t congestion_wqh[2] = {
729 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), 723 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
730 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 724 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
731 }; 725 };
726static atomic_t nr_bdi_congested[2];
732 727
733void clear_bdi_congested(struct backing_dev_info *bdi, int sync) 728void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
734{ 729{
@@ -736,7 +731,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
736 wait_queue_head_t *wqh = &congestion_wqh[sync]; 731 wait_queue_head_t *wqh = &congestion_wqh[sync];
737 732
738 bit = sync ? BDI_sync_congested : BDI_async_congested; 733 bit = sync ? BDI_sync_congested : BDI_async_congested;
739 clear_bit(bit, &bdi->state); 734 if (test_and_clear_bit(bit, &bdi->state))
735 atomic_dec(&nr_bdi_congested[sync]);
740 smp_mb__after_clear_bit(); 736 smp_mb__after_clear_bit();
741 if (waitqueue_active(wqh)) 737 if (waitqueue_active(wqh))
742 wake_up(wqh); 738 wake_up(wqh);
@@ -748,7 +744,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
748 enum bdi_state bit; 744 enum bdi_state bit;
749 745
750 bit = sync ? BDI_sync_congested : BDI_async_congested; 746 bit = sync ? BDI_sync_congested : BDI_async_congested;
751 set_bit(bit, &bdi->state); 747 if (!test_and_set_bit(bit, &bdi->state))
748 atomic_inc(&nr_bdi_congested[sync]);
752} 749}
753EXPORT_SYMBOL(set_bdi_congested); 750EXPORT_SYMBOL(set_bdi_congested);
754 751
@@ -764,13 +761,72 @@ EXPORT_SYMBOL(set_bdi_congested);
764long congestion_wait(int sync, long timeout) 761long congestion_wait(int sync, long timeout)
765{ 762{
766 long ret; 763 long ret;
764 unsigned long start = jiffies;
767 DEFINE_WAIT(wait); 765 DEFINE_WAIT(wait);
768 wait_queue_head_t *wqh = &congestion_wqh[sync]; 766 wait_queue_head_t *wqh = &congestion_wqh[sync];
769 767
770 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); 768 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
771 ret = io_schedule_timeout(timeout); 769 ret = io_schedule_timeout(timeout);
772 finish_wait(wqh, &wait); 770 finish_wait(wqh, &wait);
771
772 trace_writeback_congestion_wait(jiffies_to_usecs(timeout),
773 jiffies_to_usecs(jiffies - start));
774
773 return ret; 775 return ret;
774} 776}
775EXPORT_SYMBOL(congestion_wait); 777EXPORT_SYMBOL(congestion_wait);
776 778
779/**
780 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
781 * @zone: A zone to check if it is heavily congested
782 * @sync: SYNC or ASYNC IO
783 * @timeout: timeout in jiffies
784 *
785 * In the event of a congested backing_dev (any backing_dev) and the given
786 * @zone has experienced recent congestion, this waits for up to @timeout
787 * jiffies for either a BDI to exit congestion of the given @sync queue
788 * or a write to complete.
789 *
790 * In the absence of zone congestion, cond_resched() is called to yield
791 * the processor if necessary but otherwise does not sleep.
792 *
793 * The return value is 0 if the sleep is for the full timeout. Otherwise,
794 * it is the number of jiffies that were still remaining when the function
795 * returned. return_value == timeout implies the function did not sleep.
796 */
797long wait_iff_congested(struct zone *zone, int sync, long timeout)
798{
799 long ret;
800 unsigned long start = jiffies;
801 DEFINE_WAIT(wait);
802 wait_queue_head_t *wqh = &congestion_wqh[sync];
803
804 /*
805 * If there is no congestion, or heavy congestion is not being
806 * encountered in the current zone, yield if necessary instead
807 * of sleeping on the congestion queue
808 */
809 if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
810 !zone_is_reclaim_congested(zone)) {
811 cond_resched();
812
813 /* In case we scheduled, work out time remaining */
814 ret = timeout - (jiffies - start);
815 if (ret < 0)
816 ret = 0;
817
818 goto out;
819 }
820
821 /* Sleep until uncongested or a write happens */
822 prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
823 ret = io_schedule_timeout(timeout);
824 finish_wait(wqh, &wait);
825
826out:
827 trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
828 jiffies_to_usecs(jiffies - start));
829
830 return ret;
831}
832EXPORT_SYMBOL(wait_iff_congested);
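
As the comment above describes, wait_iff_congested() is a refinement of congestion_wait() for reclaim-style throttling: it sleeps only when some BDI is congested and the given zone recently saw congested writeback, and otherwise just yields. A hedged sketch of a caller (the helper name and call site are hypothetical; only the wait_iff_congested() signature comes from this hunk):

	#include <linux/backing-dev.h>
	#include <linux/mmzone.h>
	#include <linux/jiffies.h>

	/* Back off for up to HZ/10 jiffies while writeback to this zone is
	 * congested; otherwise return immediately after a cond_resched().
	 * A return value equal to the timeout means no sleep occurred. */
	static void throttle_zone_reclaim(struct zone *zone)
	{
		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
	}
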
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 142c84a54993..01d5a4b3dd0c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -15,6 +15,7 @@
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/kmemleak.h> 16#include <linux/kmemleak.h>
17#include <linux/range.h> 17#include <linux/range.h>
18#include <linux/memblock.h>
18 19
19#include <asm/bug.h> 20#include <asm/bug.h>
20#include <asm/io.h> 21#include <asm/io.h>
@@ -22,19 +23,17 @@
22 23
23#include "internal.h" 24#include "internal.h"
24 25
26#ifndef CONFIG_NEED_MULTIPLE_NODES
27struct pglist_data __refdata contig_page_data = {
28 .bdata = &bootmem_node_data[0]
29};
30EXPORT_SYMBOL(contig_page_data);
31#endif
32
25unsigned long max_low_pfn; 33unsigned long max_low_pfn;
26unsigned long min_low_pfn; 34unsigned long min_low_pfn;
27unsigned long max_pfn; 35unsigned long max_pfn;
28 36
29#ifdef CONFIG_CRASH_DUMP
30/*
31 * If we have booted due to a crash, max_pfn will be a very low value. We need
32 * to know the amount of memory that the previous kernel used.
33 */
34unsigned long saved_max_pfn;
35#endif
36
37#ifndef CONFIG_NO_BOOTMEM
38bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; 37bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
39 38
40static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); 39static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
@@ -145,7 +144,7 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
145 min_low_pfn = start; 144 min_low_pfn = start;
146 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); 145 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
147} 146}
148#endif 147
149/* 148/*
150 * free_bootmem_late - free bootmem pages directly to page allocator 149 * free_bootmem_late - free bootmem pages directly to page allocator
151 * @addr: starting address of the range 150 * @addr: starting address of the range
@@ -170,53 +169,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
170 } 169 }
171} 170}
172 171
173#ifdef CONFIG_NO_BOOTMEM
174static void __init __free_pages_memory(unsigned long start, unsigned long end)
175{
176 int i;
177 unsigned long start_aligned, end_aligned;
178 int order = ilog2(BITS_PER_LONG);
179
180 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
181 end_aligned = end & ~(BITS_PER_LONG - 1);
182
183 if (end_aligned <= start_aligned) {
184 for (i = start; i < end; i++)
185 __free_pages_bootmem(pfn_to_page(i), 0);
186
187 return;
188 }
189
190 for (i = start; i < start_aligned; i++)
191 __free_pages_bootmem(pfn_to_page(i), 0);
192
193 for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
194 __free_pages_bootmem(pfn_to_page(i), order);
195
196 for (i = end_aligned; i < end; i++)
197 __free_pages_bootmem(pfn_to_page(i), 0);
198}
199
200unsigned long __init free_all_memory_core_early(int nodeid)
201{
202 int i;
203 u64 start, end;
204 unsigned long count = 0;
205 struct range *range = NULL;
206 int nr_range;
207
208 nr_range = get_free_all_memory_range(&range, nodeid);
209
210 for (i = 0; i < nr_range; i++) {
211 start = range[i].start;
212 end = range[i].end;
213 count += end - start;
214 __free_pages_memory(start, end);
215 }
216
217 return count;
218}
219#else
220static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) 172static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
221{ 173{
222 int aligned; 174 int aligned;
@@ -277,7 +229,6 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
277 229
278 return count; 230 return count;
279} 231}
280#endif
281 232
282/** 233/**
283 * free_all_bootmem_node - release a node's free pages to the buddy allocator 234 * free_all_bootmem_node - release a node's free pages to the buddy allocator
@@ -288,12 +239,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
288unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 239unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
289{ 240{
290 register_page_bootmem_info_node(pgdat); 241 register_page_bootmem_info_node(pgdat);
291#ifdef CONFIG_NO_BOOTMEM
292 /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
293 return 0;
294#else
295 return free_all_bootmem_core(pgdat->bdata); 242 return free_all_bootmem_core(pgdat->bdata);
296#endif
297} 243}
298 244
299/** 245/**
@@ -303,16 +249,6 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
303 */ 249 */
304unsigned long __init free_all_bootmem(void) 250unsigned long __init free_all_bootmem(void)
305{ 251{
306#ifdef CONFIG_NO_BOOTMEM
307 /*
308 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
309 * because in some case like Node0 doesnt have RAM installed
310 * low ram will be on Node1
311 * Use MAX_NUMNODES will make sure all ranges in early_node_map[]
312 * will be used instead of only Node0 related
313 */
314 return free_all_memory_core_early(MAX_NUMNODES);
315#else
316 unsigned long total_pages = 0; 252 unsigned long total_pages = 0;
317 bootmem_data_t *bdata; 253 bootmem_data_t *bdata;
318 254
@@ -320,10 +256,8 @@ unsigned long __init free_all_bootmem(void)
320 total_pages += free_all_bootmem_core(bdata); 256 total_pages += free_all_bootmem_core(bdata);
321 257
322 return total_pages; 258 return total_pages;
323#endif
324} 259}
325 260
326#ifndef CONFIG_NO_BOOTMEM
327static void __init __free(bootmem_data_t *bdata, 261static void __init __free(bootmem_data_t *bdata,
328 unsigned long sidx, unsigned long eidx) 262 unsigned long sidx, unsigned long eidx)
329{ 263{
@@ -418,7 +352,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
418 } 352 }
419 BUG(); 353 BUG();
420} 354}
421#endif
422 355
423/** 356/**
424 * free_bootmem_node - mark a page range as usable 357 * free_bootmem_node - mark a page range as usable
@@ -433,9 +366,6 @@ static int __init mark_bootmem(unsigned long start, unsigned long end,
433void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 366void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
434 unsigned long size) 367 unsigned long size)
435{ 368{
436#ifdef CONFIG_NO_BOOTMEM
437 free_early(physaddr, physaddr + size);
438#else
439 unsigned long start, end; 369 unsigned long start, end;
440 370
441 kmemleak_free_part(__va(physaddr), size); 371 kmemleak_free_part(__va(physaddr), size);
@@ -444,7 +374,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
444 end = PFN_DOWN(physaddr + size); 374 end = PFN_DOWN(physaddr + size);
445 375
446 mark_bootmem_node(pgdat->bdata, start, end, 0, 0); 376 mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
447#endif
448} 377}
449 378
450/** 379/**
@@ -458,9 +387,6 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
458 */ 387 */
459void __init free_bootmem(unsigned long addr, unsigned long size) 388void __init free_bootmem(unsigned long addr, unsigned long size)
460{ 389{
461#ifdef CONFIG_NO_BOOTMEM
462 free_early(addr, addr + size);
463#else
464 unsigned long start, end; 390 unsigned long start, end;
465 391
466 kmemleak_free_part(__va(addr), size); 392 kmemleak_free_part(__va(addr), size);
@@ -469,7 +395,6 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
469 end = PFN_DOWN(addr + size); 395 end = PFN_DOWN(addr + size);
470 396
471 mark_bootmem(start, end, 0, 0); 397 mark_bootmem(start, end, 0, 0);
472#endif
473} 398}
474 399
475/** 400/**
@@ -486,17 +411,12 @@ void __init free_bootmem(unsigned long addr, unsigned long size)
486int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 411int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
487 unsigned long size, int flags) 412 unsigned long size, int flags)
488{ 413{
489#ifdef CONFIG_NO_BOOTMEM
490 panic("no bootmem");
491 return 0;
492#else
493 unsigned long start, end; 414 unsigned long start, end;
494 415
495 start = PFN_DOWN(physaddr); 416 start = PFN_DOWN(physaddr);
496 end = PFN_UP(physaddr + size); 417 end = PFN_UP(physaddr + size);
497 418
498 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); 419 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
499#endif
500} 420}
501 421
502/** 422/**
@@ -512,20 +432,20 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
512int __init reserve_bootmem(unsigned long addr, unsigned long size, 432int __init reserve_bootmem(unsigned long addr, unsigned long size,
513 int flags) 433 int flags)
514{ 434{
515#ifdef CONFIG_NO_BOOTMEM
516 panic("no bootmem");
517 return 0;
518#else
519 unsigned long start, end; 435 unsigned long start, end;
520 436
521 start = PFN_DOWN(addr); 437 start = PFN_DOWN(addr);
522 end = PFN_UP(addr + size); 438 end = PFN_UP(addr + size);
523 439
524 return mark_bootmem(start, end, 1, flags); 440 return mark_bootmem(start, end, 1, flags);
525#endif
526} 441}
527 442
528#ifndef CONFIG_NO_BOOTMEM 443int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
444 int flags)
445{
446 return reserve_bootmem(phys, len, flags);
447}
448
529static unsigned long __init align_idx(struct bootmem_data *bdata, 449static unsigned long __init align_idx(struct bootmem_data *bdata,
530 unsigned long idx, unsigned long step) 450 unsigned long idx, unsigned long step)
531{ 451{
@@ -676,33 +596,12 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
676#endif 596#endif
677 return NULL; 597 return NULL;
678} 598}
679#endif
680 599
681static void * __init ___alloc_bootmem_nopanic(unsigned long size, 600static void * __init ___alloc_bootmem_nopanic(unsigned long size,
682 unsigned long align, 601 unsigned long align,
683 unsigned long goal, 602 unsigned long goal,
684 unsigned long limit) 603 unsigned long limit)
685{ 604{
686#ifdef CONFIG_NO_BOOTMEM
687 void *ptr;
688
689 if (WARN_ON_ONCE(slab_is_available()))
690 return kzalloc(size, GFP_NOWAIT);
691
692restart:
693
694 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
695
696 if (ptr)
697 return ptr;
698
699 if (goal != 0) {
700 goal = 0;
701 goto restart;
702 }
703
704 return NULL;
705#else
706 bootmem_data_t *bdata; 605 bootmem_data_t *bdata;
707 void *region; 606 void *region;
708 607
@@ -728,7 +627,6 @@ restart:
728 } 627 }
729 628
730 return NULL; 629 return NULL;
731#endif
732} 630}
733 631
734/** 632/**
@@ -749,10 +647,6 @@ void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
749{ 647{
750 unsigned long limit = 0; 648 unsigned long limit = 0;
751 649
752#ifdef CONFIG_NO_BOOTMEM
753 limit = -1UL;
754#endif
755
756 return ___alloc_bootmem_nopanic(size, align, goal, limit); 650 return ___alloc_bootmem_nopanic(size, align, goal, limit);
757} 651}
758 652
@@ -789,14 +683,9 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
789{ 683{
790 unsigned long limit = 0; 684 unsigned long limit = 0;
791 685
792#ifdef CONFIG_NO_BOOTMEM
793 limit = -1UL;
794#endif
795
796 return ___alloc_bootmem(size, align, goal, limit); 686 return ___alloc_bootmem(size, align, goal, limit);
797} 687}
798 688
799#ifndef CONFIG_NO_BOOTMEM
800static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, 689static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
801 unsigned long size, unsigned long align, 690 unsigned long size, unsigned long align,
802 unsigned long goal, unsigned long limit) 691 unsigned long goal, unsigned long limit)
@@ -813,7 +702,6 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
813 702
814 return ___alloc_bootmem(size, align, goal, limit); 703 return ___alloc_bootmem(size, align, goal, limit);
815} 704}
816#endif
817 705
818/** 706/**
819 * __alloc_bootmem_node - allocate boot memory from a specific node 707 * __alloc_bootmem_node - allocate boot memory from a specific node
@@ -833,24 +721,10 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
833void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, 721void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
834 unsigned long align, unsigned long goal) 722 unsigned long align, unsigned long goal)
835{ 723{
836 void *ptr;
837
838 if (WARN_ON_ONCE(slab_is_available())) 724 if (WARN_ON_ONCE(slab_is_available()))
839 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 725 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
840 726
841#ifdef CONFIG_NO_BOOTMEM 727 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
842 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
843 goal, -1ULL);
844 if (ptr)
845 return ptr;
846
847 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
848 goal, -1ULL);
849#else
850 ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
851#endif
852
853 return ptr;
854} 728}
855 729
856void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, 730void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
@@ -871,13 +745,8 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
871 unsigned long new_goal; 745 unsigned long new_goal;
872 746
873 new_goal = MAX_DMA32_PFN << PAGE_SHIFT; 747 new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
874#ifdef CONFIG_NO_BOOTMEM
875 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
876 new_goal, -1ULL);
877#else
878 ptr = alloc_bootmem_core(pgdat->bdata, size, align, 748 ptr = alloc_bootmem_core(pgdat->bdata, size, align,
879 new_goal, 0); 749 new_goal, 0);
880#endif
881 if (ptr) 750 if (ptr)
882 return ptr; 751 return ptr;
883 } 752 }
@@ -898,16 +767,6 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
898void * __init alloc_bootmem_section(unsigned long size, 767void * __init alloc_bootmem_section(unsigned long size,
899 unsigned long section_nr) 768 unsigned long section_nr)
900{ 769{
901#ifdef CONFIG_NO_BOOTMEM
902 unsigned long pfn, goal, limit;
903
904 pfn = section_nr_to_pfn(section_nr);
905 goal = pfn << PAGE_SHIFT;
906 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
907
908 return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
909 SMP_CACHE_BYTES, goal, limit);
910#else
911 bootmem_data_t *bdata; 770 bootmem_data_t *bdata;
912 unsigned long pfn, goal, limit; 771 unsigned long pfn, goal, limit;
913 772
@@ -917,7 +776,6 @@ void * __init alloc_bootmem_section(unsigned long size,
917 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; 776 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
918 777
919 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); 778 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
920#endif
921} 779}
922#endif 780#endif
923 781
@@ -929,16 +787,11 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
929 if (WARN_ON_ONCE(slab_is_available())) 787 if (WARN_ON_ONCE(slab_is_available()))
930 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 788 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
931 789
932#ifdef CONFIG_NO_BOOTMEM
933 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
934 goal, -1ULL);
935#else
936 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); 790 ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
937 if (ptr) 791 if (ptr)
938 return ptr; 792 return ptr;
939 793
940 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 794 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
941#endif
942 if (ptr) 795 if (ptr)
943 return ptr; 796 return ptr;
944 797
@@ -986,21 +839,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
986void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 839void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
987 unsigned long align, unsigned long goal) 840 unsigned long align, unsigned long goal)
988{ 841{
989 void *ptr;
990
991 if (WARN_ON_ONCE(slab_is_available())) 842 if (WARN_ON_ONCE(slab_is_available()))
992 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); 843 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
993 844
994#ifdef CONFIG_NO_BOOTMEM 845 return ___alloc_bootmem_node(pgdat->bdata, size, align,
995 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
996 goal, ARCH_LOW_ADDRESS_LIMIT); 846 goal, ARCH_LOW_ADDRESS_LIMIT);
997 if (ptr)
998 return ptr;
999 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align,
1000 goal, ARCH_LOW_ADDRESS_LIMIT);
1001#else
1002 ptr = ___alloc_bootmem_node(pgdat->bdata, size, align,
1003 goal, ARCH_LOW_ADDRESS_LIMIT);
1004#endif
1005 return ptr;
1006} 847}
diff --git a/mm/cleancache.c b/mm/cleancache.c
new file mode 100644
index 000000000000..bcaae4c2a770
--- /dev/null
+++ b/mm/cleancache.c
@@ -0,0 +1,244 @@
1/*
2 * Cleancache frontend
3 *
4 * This code provides the generic "frontend" layer to call a matching
5 * "backend" driver implementation of cleancache. See
6 * Documentation/vm/cleancache.txt for more information.
7 *
8 * Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
9 * Author: Dan Magenheimer
10 *
11 * This work is licensed under the terms of the GNU GPL, version 2.
12 */
13
14#include <linux/module.h>
15#include <linux/fs.h>
16#include <linux/exportfs.h>
17#include <linux/mm.h>
18#include <linux/cleancache.h>
19
20/*
21 * This global enablement flag may be read thousands of times per second
22 * by cleancache_get/put/flush even on systems where cleancache_ops
23 * is not claimed (e.g. cleancache is config'ed on but remains
24 * disabled), so is preferred to the slower alternative: a function
25 * call that checks a non-global.
26 */
27int cleancache_enabled;
28EXPORT_SYMBOL(cleancache_enabled);
29
30/*
31 * cleancache_ops is set by cleancache_ops_register to contain the pointers
32 * to the cleancache "backend" implementation functions.
33 */
34static struct cleancache_ops cleancache_ops;
35
36/* useful stats available in /sys/kernel/mm/cleancache */
37static unsigned long cleancache_succ_gets;
38static unsigned long cleancache_failed_gets;
39static unsigned long cleancache_puts;
40static unsigned long cleancache_flushes;
41
42/*
43 * register operations for cleancache, returning previous thus allowing
44 * detection of multiple backends and possible nesting
45 */
46struct cleancache_ops cleancache_register_ops(struct cleancache_ops *ops)
47{
48 struct cleancache_ops old = cleancache_ops;
49
50 cleancache_ops = *ops;
51 cleancache_enabled = 1;
52 return old;
53}
54EXPORT_SYMBOL(cleancache_register_ops);
55
56/* Called by a cleancache-enabled filesystem at time of mount */
57void __cleancache_init_fs(struct super_block *sb)
58{
59 sb->cleancache_poolid = (*cleancache_ops.init_fs)(PAGE_SIZE);
60}
61EXPORT_SYMBOL(__cleancache_init_fs);
62
63/* Called by a cleancache-enabled clustered filesystem at time of mount */
64void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
65{
66 sb->cleancache_poolid =
67 (*cleancache_ops.init_shared_fs)(uuid, PAGE_SIZE);
68}
69EXPORT_SYMBOL(__cleancache_init_shared_fs);
70
71/*
72 * If the filesystem uses exportable filehandles, use the filehandle as
73 * the key, else use the inode number.
74 */
75static int cleancache_get_key(struct inode *inode,
76 struct cleancache_filekey *key)
77{
78 int (*fhfn)(struct dentry *, __u32 *fh, int *, int);
79 int len = 0, maxlen = CLEANCACHE_KEY_MAX;
80 struct super_block *sb = inode->i_sb;
81
82 key->u.ino = inode->i_ino;
83 if (sb->s_export_op != NULL) {
84 fhfn = sb->s_export_op->encode_fh;
85 if (fhfn) {
86 struct dentry d;
87 d.d_inode = inode;
88 len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0);
89 if (len <= 0 || len == 255)
90 return -1;
91 if (maxlen > CLEANCACHE_KEY_MAX)
92 return -1;
93 }
94 }
95 return 0;
96}
97
98/*
99 * "Get" data from cleancache associated with the poolid/inode/index
100 * that were specified when the data was put to cleanache and, if
101 * successful, use it to fill the specified page with data and return 0.
102 * The pageframe is unchanged and returns -1 if the get fails.
103 * Page must be locked by caller.
104 */
105int __cleancache_get_page(struct page *page)
106{
107 int ret = -1;
108 int pool_id;
109 struct cleancache_filekey key = { .u.key = { 0 } };
110
111 VM_BUG_ON(!PageLocked(page));
112 pool_id = page->mapping->host->i_sb->cleancache_poolid;
113 if (pool_id < 0)
114 goto out;
115
116 if (cleancache_get_key(page->mapping->host, &key) < 0)
117 goto out;
118
119 ret = (*cleancache_ops.get_page)(pool_id, key, page->index, page);
120 if (ret == 0)
121 cleancache_succ_gets++;
122 else
123 cleancache_failed_gets++;
124out:
125 return ret;
126}
127EXPORT_SYMBOL(__cleancache_get_page);
128
129/*
130 * "Put" data from a page to cleancache and associate it with the
131 * (previously-obtained per-filesystem) poolid and the page's,
132 * inode and page index. Page must be locked. Note that a put_page
133 * always "succeeds", though a subsequent get_page may succeed or fail.
134 */
135void __cleancache_put_page(struct page *page)
136{
137 int pool_id;
138 struct cleancache_filekey key = { .u.key = { 0 } };
139
140 VM_BUG_ON(!PageLocked(page));
141 pool_id = page->mapping->host->i_sb->cleancache_poolid;
142 if (pool_id >= 0 &&
143 cleancache_get_key(page->mapping->host, &key) >= 0) {
144 (*cleancache_ops.put_page)(pool_id, key, page->index, page);
145 cleancache_puts++;
146 }
147}
148EXPORT_SYMBOL(__cleancache_put_page);
149
150/*
151 * Flush any data from cleancache associated with the poolid and the
152 * page's inode and page index so that a subsequent "get" will fail.
153 */
154void __cleancache_flush_page(struct address_space *mapping, struct page *page)
155{
156 /* careful... page->mapping is NULL sometimes when this is called */
157 int pool_id = mapping->host->i_sb->cleancache_poolid;
158 struct cleancache_filekey key = { .u.key = { 0 } };
159
160 if (pool_id >= 0) {
161 VM_BUG_ON(!PageLocked(page));
162 if (cleancache_get_key(mapping->host, &key) >= 0) {
163 (*cleancache_ops.flush_page)(pool_id, key, page->index);
164 cleancache_flushes++;
165 }
166 }
167}
168EXPORT_SYMBOL(__cleancache_flush_page);
169
170/*
171 * Flush all data from cleancache associated with the poolid and the
172 * mappings's inode so that all subsequent gets to this poolid/inode
173 * will fail.
174 */
175void __cleancache_flush_inode(struct address_space *mapping)
176{
177 int pool_id = mapping->host->i_sb->cleancache_poolid;
178 struct cleancache_filekey key = { .u.key = { 0 } };
179
180 if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
181 (*cleancache_ops.flush_inode)(pool_id, key);
182}
183EXPORT_SYMBOL(__cleancache_flush_inode);
184
185/*
186 * Called by any cleancache-enabled filesystem at time of unmount;
187 * note that pool_id is surrendered and may be returned by a subsequent
188 * cleancache_init_fs or cleancache_init_shared_fs
189 */
190void __cleancache_flush_fs(struct super_block *sb)
191{
192 if (sb->cleancache_poolid >= 0) {
193 int old_poolid = sb->cleancache_poolid;
194 sb->cleancache_poolid = -1;
195 (*cleancache_ops.flush_fs)(old_poolid);
196 }
197}
198EXPORT_SYMBOL(__cleancache_flush_fs);
199
200#ifdef CONFIG_SYSFS
201
202/* see Documentation/ABI/xxx/sysfs-kernel-mm-cleancache */
203
204#define CLEANCACHE_SYSFS_RO(_name) \
205 static ssize_t cleancache_##_name##_show(struct kobject *kobj, \
206 struct kobj_attribute *attr, char *buf) \
207 { \
208 return sprintf(buf, "%lu\n", cleancache_##_name); \
209 } \
210 static struct kobj_attribute cleancache_##_name##_attr = { \
211 .attr = { .name = __stringify(_name), .mode = 0444 }, \
212 .show = cleancache_##_name##_show, \
213 }
214
215CLEANCACHE_SYSFS_RO(succ_gets);
216CLEANCACHE_SYSFS_RO(failed_gets);
217CLEANCACHE_SYSFS_RO(puts);
218CLEANCACHE_SYSFS_RO(flushes);
219
220static struct attribute *cleancache_attrs[] = {
221 &cleancache_succ_gets_attr.attr,
222 &cleancache_failed_gets_attr.attr,
223 &cleancache_puts_attr.attr,
224 &cleancache_flushes_attr.attr,
225 NULL,
226};
227
228static struct attribute_group cleancache_attr_group = {
229 .attrs = cleancache_attrs,
230 .name = "cleancache",
231};
232
233#endif /* CONFIG_SYSFS */
234
235static int __init init_cleancache(void)
236{
237#ifdef CONFIG_SYSFS
238 int err;
239
240 err = sysfs_create_group(mm_kobj, &cleancache_attr_group);
241#endif /* CONFIG_SYSFS */
242 return 0;
243}
244module_init(init_cleancache)
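
The indirect calls above define the backend contract that cleancache_register_ops() expects: init_fs, init_shared_fs, get_page, put_page, flush_page, flush_inode and flush_fs. A minimal sketch of a no-op backend, with prototypes assumed to match include/linux/cleancache.h and the call sites in this file (a real backend such as zcache or Xen tmem would store and look up page contents here):

	#include <linux/init.h>
	#include <linux/module.h>
	#include <linux/cleancache.h>

	static int noop_init_fs(size_t pagesize)
	{
		return -1;		/* no pool: the frontend skips later calls */
	}

	static int noop_init_shared_fs(char *uuid, size_t pagesize)
	{
		return -1;
	}

	static int noop_get_page(int pool, struct cleancache_filekey key,
				 pgoff_t index, struct page *page)
	{
		return -1;		/* always a miss */
	}

	static void noop_put_page(int pool, struct cleancache_filekey key,
				  pgoff_t index, struct page *page)
	{
	}

	static void noop_flush_page(int pool, struct cleancache_filekey key,
				    pgoff_t index)
	{
	}

	static void noop_flush_inode(int pool, struct cleancache_filekey key)
	{
	}

	static void noop_flush_fs(int pool)
	{
	}

	static struct cleancache_ops noop_cleancache_ops = {
		.init_fs	= noop_init_fs,
		.init_shared_fs	= noop_init_shared_fs,
		.get_page	= noop_get_page,
		.put_page	= noop_put_page,
		.flush_page	= noop_flush_page,
		.flush_inode	= noop_flush_inode,
		.flush_fs	= noop_flush_fs,
	};

	static int __init noop_cleancache_init(void)
	{
		/* returns the previously registered ops, which a nesting
		 * backend could chain to; discarded here */
		cleancache_register_ops(&noop_cleancache_ops);
		return 0;
	}
	module_init(noop_cleancache_init);
	MODULE_LICENSE("GPL");
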
diff --git a/mm/compaction.c b/mm/compaction.c
index 4d709ee59013..6cc604bd5649 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,9 @@
16#include <linux/sysfs.h> 16#include <linux/sysfs.h>
17#include "internal.h" 17#include "internal.h"
18 18
19#define CREATE_TRACE_POINTS
20#include <trace/events/compaction.h>
21
19/* 22/*
20 * compact_control is used to track pages being migrated and the free pages 23 * compact_control is used to track pages being migrated and the free pages
21 * they are being migrated to during memory compaction. The free_pfn starts 24 * they are being migrated to during memory compaction. The free_pfn starts
@@ -30,6 +33,7 @@ struct compact_control {
30 unsigned long nr_migratepages; /* Number of pages to migrate */ 33 unsigned long nr_migratepages; /* Number of pages to migrate */
31 unsigned long free_pfn; /* isolate_freepages search base */ 34 unsigned long free_pfn; /* isolate_freepages search base */
32 unsigned long migrate_pfn; /* isolate_migratepages search base */ 35 unsigned long migrate_pfn; /* isolate_migratepages search base */
36 bool sync; /* Synchronous migration */
33 37
34 /* Account for isolated anon and file pages */ 38 /* Account for isolated anon and file pages */
35 unsigned long nr_anon; 39 unsigned long nr_anon;
@@ -60,7 +64,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
60 struct list_head *freelist) 64 struct list_head *freelist)
61{ 65{
62 unsigned long zone_end_pfn, end_pfn; 66 unsigned long zone_end_pfn, end_pfn;
63 int total_isolated = 0; 67 int nr_scanned = 0, total_isolated = 0;
64 struct page *cursor; 68 struct page *cursor;
65 69
66 /* Get the last PFN we should scan for free pages at */ 70 /* Get the last PFN we should scan for free pages at */
@@ -81,6 +85,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
81 85
82 if (!pfn_valid_within(blockpfn)) 86 if (!pfn_valid_within(blockpfn))
83 continue; 87 continue;
88 nr_scanned++;
84 89
85 if (!PageBuddy(page)) 90 if (!PageBuddy(page))
86 continue; 91 continue;
@@ -100,6 +105,7 @@ static unsigned long isolate_freepages_block(struct zone *zone,
100 } 105 }
101 } 106 }
102 107
108 trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
103 return total_isolated; 109 return total_isolated;
104} 110}
105 111
@@ -138,16 +144,26 @@ static void isolate_freepages(struct zone *zone,
138 int nr_freepages = cc->nr_freepages; 144 int nr_freepages = cc->nr_freepages;
139 struct list_head *freelist = &cc->freepages; 145 struct list_head *freelist = &cc->freepages;
140 146
147 /*
148 * Initialise the free scanner. The starting point is where we last
149 * scanned from (or the end of the zone if starting). The low point
150 * is the end of the pageblock the migration scanner is using.
151 */
141 pfn = cc->free_pfn; 152 pfn = cc->free_pfn;
142 low_pfn = cc->migrate_pfn + pageblock_nr_pages; 153 low_pfn = cc->migrate_pfn + pageblock_nr_pages;
143 high_pfn = low_pfn; 154
155 /*
156 * Take care that if the migration scanner is at the end of the zone
157 * that the free scanner does not accidentally move to the next zone
158 * in the next isolation cycle.
159 */
160 high_pfn = min(low_pfn, pfn);
144 161
145 /* 162 /*
146 * Isolate free pages until enough are available to migrate the 163 * Isolate free pages until enough are available to migrate the
147 * pages on cc->migratepages. We stop searching if the migrate 164 * pages on cc->migratepages. We stop searching if the migrate
148 * and free page scanners meet or enough free pages are isolated. 165 * and free page scanners meet or enough free pages are isolated.
149 */ 166 */
150 spin_lock_irqsave(&zone->lock, flags);
151 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; 167 for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
152 pfn -= pageblock_nr_pages) { 168 pfn -= pageblock_nr_pages) {
153 unsigned long isolated; 169 unsigned long isolated;
@@ -170,9 +186,19 @@ static void isolate_freepages(struct zone *zone,
170 if (!suitable_migration_target(page)) 186 if (!suitable_migration_target(page))
171 continue; 187 continue;
172 188
173 /* Found a block suitable for isolating free pages from */ 189 /*
174 isolated = isolate_freepages_block(zone, pfn, freelist); 190 * Found a block suitable for isolating free pages from. Now
175 nr_freepages += isolated; 191 * we disabled interrupts, double check things are ok and
192 * isolate the pages. This is to minimise the time IRQs
193 * are disabled
194 */
195 isolated = 0;
196 spin_lock_irqsave(&zone->lock, flags);
197 if (suitable_migration_target(page)) {
198 isolated = isolate_freepages_block(zone, pfn, freelist);
199 nr_freepages += isolated;
200 }
201 spin_unlock_irqrestore(&zone->lock, flags);
176 202
177 /* 203 /*
178 * Record the highest PFN we isolated pages from. When next 204 * Record the highest PFN we isolated pages from. When next
@@ -182,7 +208,6 @@ static void isolate_freepages(struct zone *zone,
182 if (isolated) 208 if (isolated)
183 high_pfn = max(high_pfn, pfn); 209 high_pfn = max(high_pfn, pfn);
184 } 210 }
185 spin_unlock_irqrestore(&zone->lock, flags);
186 211
187 /* split_free_page does not map the pages */ 212 /* split_free_page does not map the pages */
188 list_for_each_entry(page, freelist, lru) { 213 list_for_each_entry(page, freelist, lru) {
@@ -226,14 +251,23 @@ static bool too_many_isolated(struct zone *zone)
226 return isolated > (inactive + active) / 2; 251 return isolated > (inactive + active) / 2;
227} 252}
228 253
254/* possible outcome of isolate_migratepages */
255typedef enum {
256 ISOLATE_ABORT, /* Abort compaction now */
257 ISOLATE_NONE, /* No pages isolated, continue scanning */
258 ISOLATE_SUCCESS, /* Pages isolated, migrate */
259} isolate_migrate_t;
260
229/* 261/*
230 * Isolate all pages that can be migrated from the block pointed to by 262 * Isolate all pages that can be migrated from the block pointed to by
231 * the migrate scanner within compact_control. 263 * the migrate scanner within compact_control.
232 */ 264 */
233static unsigned long isolate_migratepages(struct zone *zone, 265static isolate_migrate_t isolate_migratepages(struct zone *zone,
234 struct compact_control *cc) 266 struct compact_control *cc)
235{ 267{
236 unsigned long low_pfn, end_pfn; 268 unsigned long low_pfn, end_pfn;
269 unsigned long last_pageblock_nr = 0, pageblock_nr;
270 unsigned long nr_scanned = 0, nr_isolated = 0;
237 struct list_head *migratelist = &cc->migratepages; 271 struct list_head *migratelist = &cc->migratepages;
238 272
239 /* Do not scan outside zone boundaries */ 273 /* Do not scan outside zone boundaries */
@@ -245,7 +279,7 @@ static unsigned long isolate_migratepages(struct zone *zone,
245 /* Do not cross the free scanner or scan within a memory hole */ 279 /* Do not cross the free scanner or scan within a memory hole */
246 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { 280 if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
247 cc->migrate_pfn = end_pfn; 281 cc->migrate_pfn = end_pfn;
248 return 0; 282 return ISOLATE_NONE;
249 } 283 }
250 284
251 /* 285 /*
@@ -254,33 +288,85 @@ static unsigned long isolate_migratepages(struct zone *zone,
254 * delay for some time until fewer pages are isolated 288 * delay for some time until fewer pages are isolated
255 */ 289 */
256 while (unlikely(too_many_isolated(zone))) { 290 while (unlikely(too_many_isolated(zone))) {
291 /* async migration should just abort */
292 if (!cc->sync)
293 return ISOLATE_ABORT;
294
257 congestion_wait(BLK_RW_ASYNC, HZ/10); 295 congestion_wait(BLK_RW_ASYNC, HZ/10);
258 296
259 if (fatal_signal_pending(current)) 297 if (fatal_signal_pending(current))
260 return 0; 298 return ISOLATE_ABORT;
261 } 299 }
262 300
263 /* Time to isolate some pages for migration */ 301 /* Time to isolate some pages for migration */
302 cond_resched();
264 spin_lock_irq(&zone->lru_lock); 303 spin_lock_irq(&zone->lru_lock);
265 for (; low_pfn < end_pfn; low_pfn++) { 304 for (; low_pfn < end_pfn; low_pfn++) {
266 struct page *page; 305 struct page *page;
306 bool locked = true;
307
308 /* give a chance to irqs before checking need_resched() */
309 if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
310 spin_unlock_irq(&zone->lru_lock);
311 locked = false;
312 }
313 if (need_resched() || spin_is_contended(&zone->lru_lock)) {
314 if (locked)
315 spin_unlock_irq(&zone->lru_lock);
316 cond_resched();
317 spin_lock_irq(&zone->lru_lock);
318 if (fatal_signal_pending(current))
319 break;
320 } else if (!locked)
321 spin_lock_irq(&zone->lru_lock);
322
267 if (!pfn_valid_within(low_pfn)) 323 if (!pfn_valid_within(low_pfn))
268 continue; 324 continue;
325 nr_scanned++;
269 326
270 /* Get the page and skip if free */ 327 /* Get the page and skip if free */
271 page = pfn_to_page(low_pfn); 328 page = pfn_to_page(low_pfn);
272 if (PageBuddy(page)) 329 if (PageBuddy(page))
273 continue; 330 continue;
274 331
332 /*
333 * For async migration, also only scan in MOVABLE blocks. Async
334 * migration is optimistic to see if the minimum amount of work
335 * satisfies the allocation
336 */
337 pageblock_nr = low_pfn >> pageblock_order;
338 if (!cc->sync && last_pageblock_nr != pageblock_nr &&
339 get_pageblock_migratetype(page) != MIGRATE_MOVABLE) {
340 low_pfn += pageblock_nr_pages;
341 low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
342 last_pageblock_nr = pageblock_nr;
343 continue;
344 }
345
346 if (!PageLRU(page))
347 continue;
348
349 /*
350 * PageLRU is set, and lru_lock excludes isolation,
351 * splitting and collapsing (collapsing has already
352 * happened if PageLRU is set).
353 */
354 if (PageTransHuge(page)) {
355 low_pfn += (1 << compound_order(page)) - 1;
356 continue;
357 }
358
275 /* Try isolate the page */ 359 /* Try isolate the page */
276 if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) 360 if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
277 continue; 361 continue;
278 362
363 VM_BUG_ON(PageTransCompound(page));
364
279 /* Successfully isolated */ 365 /* Successfully isolated */
280 del_page_from_lru_list(zone, page, page_lru(page)); 366 del_page_from_lru_list(zone, page, page_lru(page));
281 list_add(&page->lru, migratelist); 367 list_add(&page->lru, migratelist);
282 mem_cgroup_del_lru(page);
283 cc->nr_migratepages++; 368 cc->nr_migratepages++;
369 nr_isolated++;
284 370
285 /* Avoid isolating too much */ 371 /* Avoid isolating too much */
286 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) 372 if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
@@ -292,7 +378,9 @@ static unsigned long isolate_migratepages(struct zone *zone,
292 spin_unlock_irq(&zone->lru_lock); 378 spin_unlock_irq(&zone->lru_lock);
293 cc->migrate_pfn = low_pfn; 379 cc->migrate_pfn = low_pfn;
294 380
295 return cc->nr_migratepages; 381 trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
382
383 return ISOLATE_SUCCESS;
296} 384}
297 385
298/* 386/*
@@ -342,10 +430,10 @@ static void update_nr_listpages(struct compact_control *cc)
342} 430}
343 431
344static int compact_finished(struct zone *zone, 432static int compact_finished(struct zone *zone,
345 struct compact_control *cc) 433 struct compact_control *cc)
346{ 434{
347 unsigned int order; 435 unsigned int order;
348 unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); 436 unsigned long watermark;
349 437
350 if (fatal_signal_pending(current)) 438 if (fatal_signal_pending(current))
351 return COMPACT_PARTIAL; 439 return COMPACT_PARTIAL;
@@ -354,11 +442,18 @@ static int compact_finished(struct zone *zone,
354 if (cc->free_pfn <= cc->migrate_pfn) 442 if (cc->free_pfn <= cc->migrate_pfn)
355 return COMPACT_COMPLETE; 443 return COMPACT_COMPLETE;
356 444
357 /* Compaction run is not finished if the watermark is not met */ 445 /*
358 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) 446 * order == -1 is expected when compacting via
447 * /proc/sys/vm/compact_memory
448 */
449 if (cc->order == -1)
359 return COMPACT_CONTINUE; 450 return COMPACT_CONTINUE;
360 451
361 if (cc->order == -1) 452 /* Compaction run is not finished if the watermark is not met */
453 watermark = low_wmark_pages(zone);
454 watermark += (1 << cc->order);
455
456 if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
362 return COMPACT_CONTINUE; 457 return COMPACT_CONTINUE;
363 458
364 /* Direct compactor: Is a suitable page free? */ 459 /* Direct compactor: Is a suitable page free? */
@@ -375,10 +470,71 @@ static int compact_finished(struct zone *zone,
375 return COMPACT_CONTINUE; 470 return COMPACT_CONTINUE;
376} 471}
377 472
473/*
474 * compaction_suitable: Is this suitable to run compaction on this zone now?
475 * Returns
476 * COMPACT_SKIPPED - If there are too few free pages for compaction
477 * COMPACT_PARTIAL - If the allocation would succeed without compaction
478 * COMPACT_CONTINUE - If compaction should run now
479 */
480unsigned long compaction_suitable(struct zone *zone, int order)
481{
482 int fragindex;
483 unsigned long watermark;
484
485 /*
486 * order == -1 is expected when compacting via
487 * /proc/sys/vm/compact_memory
488 */
489 if (order == -1)
490 return COMPACT_CONTINUE;
491
492 /*
493 * Watermarks for order-0 must be met for compaction. Note the 2UL.
494 * This is because during migration, copies of pages need to be
495 * allocated and for a short time, the footprint is higher
496 */
497 watermark = low_wmark_pages(zone) + (2UL << order);
498 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
499 return COMPACT_SKIPPED;
500
501 /*
502 * fragmentation index determines if allocation failures are due to
503 * low memory or external fragmentation
504 *
505 * index of -1000 implies allocations might succeed depending on
506 * watermarks
507 * index towards 0 implies failure is due to lack of memory
508 * index towards 1000 implies failure is due to fragmentation
509 *
510 * Only compact if a failure would be due to fragmentation.
511 */
512 fragindex = fragmentation_index(zone, order);
513 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
514 return COMPACT_SKIPPED;
515
516 if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
517 0, 0))
518 return COMPACT_PARTIAL;
519
520 return COMPACT_CONTINUE;
521}
522
378static int compact_zone(struct zone *zone, struct compact_control *cc) 523static int compact_zone(struct zone *zone, struct compact_control *cc)
379{ 524{
380 int ret; 525 int ret;
381 526
527 ret = compaction_suitable(zone, cc->order);
528 switch (ret) {
529 case COMPACT_PARTIAL:
530 case COMPACT_SKIPPED:
531 /* Compaction is likely to fail */
532 return ret;
533 case COMPACT_CONTINUE:
534 /* Fall through to compaction */
535 ;
536 }
537
382 /* Setup to move all movable pages to the end of the zone */ 538 /* Setup to move all movable pages to the end of the zone */
383 cc->migrate_pfn = zone->zone_start_pfn; 539 cc->migrate_pfn = zone->zone_start_pfn;
384 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; 540 cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
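compaction_suitable() is the newly exported gate that compact_zone() consults above before doing any work; the 2UL << order headroom accounts for the extra pages temporarily needed while migration copies data (for an order-9 request with 4 KiB pages, 1024 pages, roughly 4 MiB). A minimal caller sketch, illustrative only and not part of the patch, assuming <linux/compaction.h>:

	/* Illustrative: decide whether a compaction attempt is worthwhile. */
	static bool should_compact(struct zone *zone, int order)
	{
		switch (compaction_suitable(zone, order)) {
		case COMPACT_SKIPPED:	/* too few free pages to migrate into */
		case COMPACT_PARTIAL:	/* the allocation would already succeed */
			return false;
		case COMPACT_CONTINUE:	/* failure would be due to fragmentation */
		default:
			return true;
		}
	}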
@@ -388,13 +544,22 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
388 544
389 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { 545 while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
390 unsigned long nr_migrate, nr_remaining; 546 unsigned long nr_migrate, nr_remaining;
547 int err;
391 548
392 if (!isolate_migratepages(zone, cc)) 549 switch (isolate_migratepages(zone, cc)) {
550 case ISOLATE_ABORT:
551 ret = COMPACT_PARTIAL;
552 goto out;
553 case ISOLATE_NONE:
393 continue; 554 continue;
555 case ISOLATE_SUCCESS:
556 ;
557 }
394 558
395 nr_migrate = cc->nr_migratepages; 559 nr_migrate = cc->nr_migratepages;
396 migrate_pages(&cc->migratepages, compaction_alloc, 560 err = migrate_pages(&cc->migratepages, compaction_alloc,
397 (unsigned long)cc, 0); 561 (unsigned long)cc, false,
562 cc->sync);
398 update_nr_listpages(cc); 563 update_nr_listpages(cc);
399 nr_remaining = cc->nr_migratepages; 564 nr_remaining = cc->nr_migratepages;
400 565
@@ -402,15 +567,18 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
402 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); 567 count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
403 if (nr_remaining) 568 if (nr_remaining)
404 count_vm_events(COMPACTPAGEFAILED, nr_remaining); 569 count_vm_events(COMPACTPAGEFAILED, nr_remaining);
570 trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
571 nr_remaining);
405 572
406 /* Release LRU pages not migrated */ 573 /* Release LRU pages not migrated */
407 if (!list_empty(&cc->migratepages)) { 574 if (err) {
408 putback_lru_pages(&cc->migratepages); 575 putback_lru_pages(&cc->migratepages);
409 cc->nr_migratepages = 0; 576 cc->nr_migratepages = 0;
410 } 577 }
411 578
412 } 579 }
413 580
581out:
414 /* Release free pages and check accounting */ 582 /* Release free pages and check accounting */
415 cc->nr_freepages -= release_freepages(&cc->freepages); 583 cc->nr_freepages -= release_freepages(&cc->freepages);
416 VM_BUG_ON(cc->nr_freepages != 0); 584 VM_BUG_ON(cc->nr_freepages != 0);
@@ -418,8 +586,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
418 return ret; 586 return ret;
419} 587}
420 588
421static unsigned long compact_zone_order(struct zone *zone, 589unsigned long compact_zone_order(struct zone *zone,
422 int order, gfp_t gfp_mask) 590 int order, gfp_t gfp_mask,
591 bool sync)
423{ 592{
424 struct compact_control cc = { 593 struct compact_control cc = {
425 .nr_freepages = 0, 594 .nr_freepages = 0,
@@ -427,6 +596,7 @@ static unsigned long compact_zone_order(struct zone *zone,
427 .order = order, 596 .order = order,
428 .migratetype = allocflags_to_migratetype(gfp_mask), 597 .migratetype = allocflags_to_migratetype(gfp_mask),
429 .zone = zone, 598 .zone = zone,
599 .sync = sync,
430 }; 600 };
431 INIT_LIST_HEAD(&cc.freepages); 601 INIT_LIST_HEAD(&cc.freepages);
432 INIT_LIST_HEAD(&cc.migratepages); 602 INIT_LIST_HEAD(&cc.migratepages);
@@ -442,16 +612,17 @@ int sysctl_extfrag_threshold = 500;
442 * @order: The order of the current allocation 612 * @order: The order of the current allocation
443 * @gfp_mask: The GFP mask of the current allocation 613 * @gfp_mask: The GFP mask of the current allocation
444 * @nodemask: The allowed nodes to allocate from 614 * @nodemask: The allowed nodes to allocate from
615 * @sync: Whether migration is synchronous or not
445 * 616 *
446 * This is the main entry point for direct page compaction. 617 * This is the main entry point for direct page compaction.
447 */ 618 */
448unsigned long try_to_compact_pages(struct zonelist *zonelist, 619unsigned long try_to_compact_pages(struct zonelist *zonelist,
449 int order, gfp_t gfp_mask, nodemask_t *nodemask) 620 int order, gfp_t gfp_mask, nodemask_t *nodemask,
621 bool sync)
450{ 622{
451 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 623 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
452 int may_enter_fs = gfp_mask & __GFP_FS; 624 int may_enter_fs = gfp_mask & __GFP_FS;
453 int may_perform_io = gfp_mask & __GFP_IO; 625 int may_perform_io = gfp_mask & __GFP_IO;
454 unsigned long watermark;
455 struct zoneref *z; 626 struct zoneref *z;
456 struct zone *zone; 627 struct zone *zone;
457 int rc = COMPACT_SKIPPED; 628 int rc = COMPACT_SKIPPED;
@@ -461,7 +632,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
461 * made because an assumption is made that the page allocator can satisfy 632 * made because an assumption is made that the page allocator can satisfy
462 * the "cheaper" orders without taking special steps 633 * the "cheaper" orders without taking special steps
463 */ 634 */
464 if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) 635 if (!order || !may_enter_fs || !may_perform_io)
465 return rc; 636 return rc;
466 637
467 count_vm_event(COMPACTSTALL); 638 count_vm_event(COMPACTSTALL);
@@ -469,43 +640,13 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
469 /* Compact each zone in the list */ 640 /* Compact each zone in the list */
470 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 641 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
471 nodemask) { 642 nodemask) {
472 int fragindex;
473 int status; 643 int status;
474 644
475 /* 645 status = compact_zone_order(zone, order, gfp_mask, sync);
476 * Watermarks for order-0 must be met for compaction. Note
477 * the 2UL. This is because during migration, copies of
478 * pages need to be allocated and for a short time, the
479 * footprint is higher
480 */
481 watermark = low_wmark_pages(zone) + (2UL << order);
482 if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
483 continue;
484
485 /*
486 * fragmentation index determines if allocation failures are
487 * due to low memory or external fragmentation
488 *
489 * index of -1 implies allocations might succeed depending
490 * on watermarks
491 * index towards 0 implies failure is due to lack of memory
492 * index towards 1000 implies failure is due to fragmentation
493 *
494 * Only compact if a failure would be due to fragmentation.
495 */
496 fragindex = fragmentation_index(zone, order);
497 if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
498 continue;
499
500 if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
501 rc = COMPACT_PARTIAL;
502 break;
503 }
504
505 status = compact_zone_order(zone, order, gfp_mask);
506 rc = max(status, rc); 646 rc = max(status, rc);
507 647
508 if (zone_watermark_ok(zone, order, watermark, 0, 0)) 648 /* If a normal allocation would succeed, stop compacting */
649 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
509 break; 650 break;
510 } 651 }
511 652
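With the watermark and fragmentation-index logic moved out of try_to_compact_pages() and into compaction_suitable(), the per-zone decision now reads, following the comments above: fragmentation_index(zone, order) returns -1000 when the request could already be satisfied from free pages, otherwise a value between 0 and 1000 where larger values mean the failure is mostly due to fragmentation. With the default sysctl_extfrag_threshold of 500 (visible in the hunk header above), an index of, say, 300 skips compaction because the zone is simply short of memory, an index of 650 lets compaction run, and an index of -1000 with the watermark check passing returns COMPACT_PARTIAL so the allocator just retries.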
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 3df063706f53..03bf3bb4519a 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -311,6 +311,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
311 size_t offset; 311 size_t offset;
312 void *retval; 312 void *retval;
313 313
314 might_sleep_if(mem_flags & __GFP_WAIT);
315
314 spin_lock_irqsave(&pool->lock, flags); 316 spin_lock_irqsave(&pool->lock, flags);
315 restart: 317 restart:
316 list_for_each_entry(page, &pool->page_list, page_list) { 318 list_for_each_entry(page, &pool->page_list, page_list) {
@@ -322,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
322 if (mem_flags & __GFP_WAIT) { 324 if (mem_flags & __GFP_WAIT) {
323 DECLARE_WAITQUEUE(wait, current); 325 DECLARE_WAITQUEUE(wait, current);
324 326
325 __set_current_state(TASK_INTERRUPTIBLE); 327 __set_current_state(TASK_UNINTERRUPTIBLE);
326 __add_wait_queue(&pool->waitq, &wait); 328 __add_wait_queue(&pool->waitq, &wait);
327 spin_unlock_irqrestore(&pool->lock, flags); 329 spin_unlock_irqrestore(&pool->lock, flags);
328 330
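The new might_sleep_if(mem_flags & __GFP_WAIT) annotation documents that dma_pool_alloc() may block whenever the caller permits it, and turns a mistaken sleeping allocation from atomic context into an immediate warning rather than an occasional hang in the wait path below. A hedged usage sketch, not taken from this patch, assuming an already-created struct dma_pool:

	static void *grab_descriptor(struct dma_pool *pool, dma_addr_t *dma, bool atomic_ctx)
	{
		/*
		 * GFP_KERNEL includes __GFP_WAIT, so the annotation warns if
		 * this is reached from atomic context; such callers must pass
		 * GFP_ATOMIC instead.
		 */
		return dma_pool_alloc(pool, atomic_ctx ? GFP_ATOMIC : GFP_KERNEL, dma);
	}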
@@ -353,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc);
353 355
354static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) 356static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma)
355{ 357{
356 unsigned long flags;
357 struct dma_page *page; 358 struct dma_page *page;
358 359
359 spin_lock_irqsave(&pool->lock, flags);
360 list_for_each_entry(page, &pool->page_list, page_list) { 360 list_for_each_entry(page, &pool->page_list, page_list) {
361 if (dma < page->dma) 361 if (dma < page->dma)
362 continue; 362 continue;
363 if (dma < (page->dma + pool->allocation)) 363 if (dma < (page->dma + pool->allocation))
364 goto done; 364 return page;
365 } 365 }
366 page = NULL; 366 return NULL;
367 done:
368 spin_unlock_irqrestore(&pool->lock, flags);
369 return page;
370} 367}
371 368
372/** 369/**
@@ -384,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
384 unsigned long flags; 381 unsigned long flags;
385 unsigned int offset; 382 unsigned int offset;
386 383
384 spin_lock_irqsave(&pool->lock, flags);
387 page = pool_find_page(pool, dma); 385 page = pool_find_page(pool, dma);
388 if (!page) { 386 if (!page) {
387 spin_unlock_irqrestore(&pool->lock, flags);
389 if (pool->dev) 388 if (pool->dev)
390 dev_err(pool->dev, 389 dev_err(pool->dev,
391 "dma_pool_free %s, %p/%lx (bad dma)\n", 390 "dma_pool_free %s, %p/%lx (bad dma)\n",
@@ -399,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
399 offset = vaddr - page->vaddr; 398 offset = vaddr - page->vaddr;
400#ifdef DMAPOOL_DEBUG 399#ifdef DMAPOOL_DEBUG
401 if ((dma - page->dma) != offset) { 400 if ((dma - page->dma) != offset) {
401 spin_unlock_irqrestore(&pool->lock, flags);
402 if (pool->dev) 402 if (pool->dev)
403 dev_err(pool->dev, 403 dev_err(pool->dev,
404 "dma_pool_free %s, %p (bad vaddr)/%Lx\n", 404 "dma_pool_free %s, %p (bad vaddr)/%Lx\n",
@@ -416,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
416 chain = *(int *)(page->vaddr + chain); 416 chain = *(int *)(page->vaddr + chain);
417 continue; 417 continue;
418 } 418 }
419 spin_unlock_irqrestore(&pool->lock, flags);
419 if (pool->dev) 420 if (pool->dev)
420 dev_err(pool->dev, "dma_pool_free %s, dma %Lx " 421 dev_err(pool->dev, "dma_pool_free %s, dma %Lx "
421 "already free\n", pool->name, 422 "already free\n", pool->name,
@@ -430,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
430 memset(vaddr, POOL_POISON_FREED, pool->size); 431 memset(vaddr, POOL_POISON_FREED, pool->size);
431#endif 432#endif
432 433
433 spin_lock_irqsave(&pool->lock, flags);
434 page->in_use--; 434 page->in_use--;
435 *(int *)vaddr = page->offset; 435 *(int *)vaddr = page->offset;
436 page->offset = offset; 436 page->offset = offset;
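The dmapool hunks also move the pool->lock acquisition out of pool_find_page() and into dma_pool_free(), so the lookup and the subsequent bookkeeping happen under one critical section instead of two. The resulting convention, distilled for illustration (the error paths, which each unlock before printing, are omitted):

	spin_lock_irqsave(&pool->lock, flags);
	page = pool_find_page(pool, dma);	/* lookup under pool->lock ...        */
	if (page)
		page->in_use--;			/* ... and update under the same lock */
	spin_unlock_irqrestore(&pool->lock, flags);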
diff --git a/mm/filemap.c b/mm/filemap.c
index 3d4df44e4221..a8251a8d3457 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,7 @@
34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35#include <linux/memcontrol.h> 35#include <linux/memcontrol.h>
36#include <linux/mm_inline.h> /* for page_is_file_cache() */ 36#include <linux/mm_inline.h> /* for page_is_file_cache() */
37#include <linux/cleancache.h>
37#include "internal.h" 38#include "internal.h"
38 39
39/* 40/*
@@ -58,16 +59,16 @@
58/* 59/*
59 * Lock ordering: 60 * Lock ordering:
60 * 61 *
61 * ->i_mmap_lock (truncate_pagecache) 62 * ->i_mmap_mutex (truncate_pagecache)
62 * ->private_lock (__free_pte->__set_page_dirty_buffers) 63 * ->private_lock (__free_pte->__set_page_dirty_buffers)
63 * ->swap_lock (exclusive_swap_page, others) 64 * ->swap_lock (exclusive_swap_page, others)
64 * ->mapping->tree_lock 65 * ->mapping->tree_lock
65 * 66 *
66 * ->i_mutex 67 * ->i_mutex
67 * ->i_mmap_lock (truncate->unmap_mapping_range) 68 * ->i_mmap_mutex (truncate->unmap_mapping_range)
68 * 69 *
69 * ->mmap_sem 70 * ->mmap_sem
70 * ->i_mmap_lock 71 * ->i_mmap_mutex
71 * ->page_table_lock or pte_lock (various, mainly in memory.c) 72 * ->page_table_lock or pte_lock (various, mainly in memory.c)
72 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock) 73 * ->mapping->tree_lock (arch-dependent flush_dcache_mmap_lock)
73 * 74 *
@@ -80,11 +81,11 @@
80 * ->i_mutex 81 * ->i_mutex
81 * ->i_alloc_sem (various) 82 * ->i_alloc_sem (various)
82 * 83 *
83 * ->inode_lock 84 * inode_wb_list_lock
84 * ->sb_lock (fs/fs-writeback.c) 85 * sb_lock (fs/fs-writeback.c)
85 * ->mapping->tree_lock (__sync_single_inode) 86 * ->mapping->tree_lock (__sync_single_inode)
86 * 87 *
87 * ->i_mmap_lock 88 * ->i_mmap_mutex
88 * ->anon_vma.lock (vma_adjust) 89 * ->anon_vma.lock (vma_adjust)
89 * 90 *
90 * ->anon_vma.lock 91 * ->anon_vma.lock
@@ -98,27 +99,36 @@
98 * ->zone.lru_lock (check_pte_range->isolate_lru_page) 99 * ->zone.lru_lock (check_pte_range->isolate_lru_page)
99 * ->private_lock (page_remove_rmap->set_page_dirty) 100 * ->private_lock (page_remove_rmap->set_page_dirty)
100 * ->tree_lock (page_remove_rmap->set_page_dirty) 101 * ->tree_lock (page_remove_rmap->set_page_dirty)
101 * ->inode_lock (page_remove_rmap->set_page_dirty) 102 * inode_wb_list_lock (page_remove_rmap->set_page_dirty)
102 * ->inode_lock (zap_pte_range->set_page_dirty) 103 * ->inode->i_lock (page_remove_rmap->set_page_dirty)
104 * inode_wb_list_lock (zap_pte_range->set_page_dirty)
105 * ->inode->i_lock (zap_pte_range->set_page_dirty)
103 * ->private_lock (zap_pte_range->__set_page_dirty_buffers) 106 * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
104 * 107 *
105 * ->task->proc_lock
106 * ->dcache_lock (proc_pid_lookup)
107 *
108 * (code doesn't rely on that order, so you could switch it around) 108 * (code doesn't rely on that order, so you could switch it around)
109 * ->tasklist_lock (memory_failure, collect_procs_ao) 109 * ->tasklist_lock (memory_failure, collect_procs_ao)
110 * ->i_mmap_lock 110 * ->i_mmap_mutex
111 */ 111 */
112 112
113/* 113/*
114 * Remove a page from the page cache and free it. Caller has to make 114 * Delete a page from the page cache and free it. Caller has to make
115 * sure the page is locked and that nobody else uses it - or that usage 115 * sure the page is locked and that nobody else uses it - or that usage
116 * is safe. The caller must hold the mapping's tree_lock. 116 * is safe. The caller must hold the mapping's tree_lock.
117 */ 117 */
118void __remove_from_page_cache(struct page *page) 118void __delete_from_page_cache(struct page *page)
119{ 119{
120 struct address_space *mapping = page->mapping; 120 struct address_space *mapping = page->mapping;
121 121
122 /*
123 * if we're uptodate, flush out into the cleancache, otherwise
124 * invalidate any existing cleancache entries. We can't leave
125 * stale data around in the cleancache once our page is gone
126 */
127 if (PageUptodate(page) && PageMappedToDisk(page))
128 cleancache_put_page(page);
129 else
130 cleancache_flush_page(mapping, page);
131
122 radix_tree_delete(&mapping->page_tree, page->index); 132 radix_tree_delete(&mapping->page_tree, page->index);
123 page->mapping = NULL; 133 page->mapping = NULL;
124 mapping->nrpages--; 134 mapping->nrpages--;
@@ -140,58 +150,42 @@ void __remove_from_page_cache(struct page *page)
140 } 150 }
141} 151}
142 152
143void remove_from_page_cache(struct page *page) 153/**
154 * delete_from_page_cache - delete page from page cache
155 * @page: the page which the kernel is trying to remove from page cache
156 *
157 * This must be called only on pages that have been verified to be in the page
158 * cache and locked. It will never put the page into the free list, the caller
159 * has a reference on the page.
160 */
161void delete_from_page_cache(struct page *page)
144{ 162{
145 struct address_space *mapping = page->mapping; 163 struct address_space *mapping = page->mapping;
164 void (*freepage)(struct page *);
146 165
147 BUG_ON(!PageLocked(page)); 166 BUG_ON(!PageLocked(page));
148 167
168 freepage = mapping->a_ops->freepage;
149 spin_lock_irq(&mapping->tree_lock); 169 spin_lock_irq(&mapping->tree_lock);
150 __remove_from_page_cache(page); 170 __delete_from_page_cache(page);
151 spin_unlock_irq(&mapping->tree_lock); 171 spin_unlock_irq(&mapping->tree_lock);
152 mem_cgroup_uncharge_cache_page(page); 172 mem_cgroup_uncharge_cache_page(page);
173
174 if (freepage)
175 freepage(page);
176 page_cache_release(page);
153} 177}
154EXPORT_SYMBOL(remove_from_page_cache); 178EXPORT_SYMBOL(delete_from_page_cache);
155 179
156static int sync_page(void *word) 180static int sleep_on_page(void *word)
157{ 181{
158 struct address_space *mapping;
159 struct page *page;
160
161 page = container_of((unsigned long *)word, struct page, flags);
162
163 /*
164 * page_mapping() is being called without PG_locked held.
165 * Some knowledge of the state and use of the page is used to
166 * reduce the requirements down to a memory barrier.
167 * The danger here is of a stale page_mapping() return value
168 * indicating a struct address_space different from the one it's
169 * associated with when it is associated with one.
170 * After smp_mb(), it's either the correct page_mapping() for
171 * the page, or an old page_mapping() and the page's own
172 * page_mapping() has gone NULL.
173 * The ->sync_page() address_space operation must tolerate
174 * page_mapping() going NULL. By an amazing coincidence,
175 * this comes about because none of the users of the page
176 * in the ->sync_page() methods make essential use of the
177 * page_mapping(), merely passing the page down to the backing
178 * device's unplug functions when it's non-NULL, which in turn
179 * ignore it for all cases but swap, where only page_private(page) is
180 * of interest. When page_mapping() does go NULL, the entire
181 * call stack gracefully ignores the page and returns.
182 * -- wli
183 */
184 smp_mb();
185 mapping = page_mapping(page);
186 if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
187 mapping->a_ops->sync_page(page);
188 io_schedule(); 182 io_schedule();
189 return 0; 183 return 0;
190} 184}
191 185
192static int sync_page_killable(void *word) 186static int sleep_on_page_killable(void *word)
193{ 187{
194 sync_page(word); 188 sleep_on_page(word);
195 return fatal_signal_pending(current) ? -EINTR : 0; 189 return fatal_signal_pending(current) ? -EINTR : 0;
196} 190}
197 191
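delete_from_page_cache(), the renamed remove_from_page_cache(), now also calls the mapping's ->freepage() hook and drops the page cache's own reference. A minimal caller sketch following the kernel-doc above (page locked, verified to still belong to the mapping, caller holding its own reference); illustrative only:

	lock_page(page);
	if (page->mapping == mapping)		/* not truncated meanwhile? */
		delete_from_page_cache(page);	/* drops the cache's reference */
	unlock_page(page);
	page_cache_release(page);		/* drop the caller's own reference */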
@@ -296,7 +290,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
296 continue; 290 continue;
297 291
298 wait_on_page_writeback(page); 292 wait_on_page_writeback(page);
299 if (PageError(page)) 293 if (TestClearPageError(page))
300 ret = -EIO; 294 ret = -EIO;
301 } 295 }
302 pagevec_release(&pvec); 296 pagevec_release(&pvec);
@@ -385,6 +379,76 @@ int filemap_write_and_wait_range(struct address_space *mapping,
385EXPORT_SYMBOL(filemap_write_and_wait_range); 379EXPORT_SYMBOL(filemap_write_and_wait_range);
386 380
387/** 381/**
382 * replace_page_cache_page - replace a pagecache page with a new one
383 * @old: page to be replaced
384 * @new: page to replace with
385 * @gfp_mask: allocation mode
386 *
387 * This function replaces a page in the pagecache with a new one. On
388 * success it acquires the pagecache reference for the new page and
389 * drops it for the old page. Both the old and new pages must be
390 * locked. This function does not add the new page to the LRU, the
391 * caller must do that.
392 *
393 * The remove + add is atomic. The only way this function can fail is
394 * memory allocation failure.
395 */
396int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
397{
398 int error;
399 struct mem_cgroup *memcg = NULL;
400
401 VM_BUG_ON(!PageLocked(old));
402 VM_BUG_ON(!PageLocked(new));
403 VM_BUG_ON(new->mapping);
404
405 /*
406 * This is not page migration, but prepare_migration and
407 * end_migration does enough work for charge replacement.
408 *
409 * In the longer term we probably want a specialized function
410 * for moving the charge from old to new in a more efficient
411 * manner.
412 */
413 error = mem_cgroup_prepare_migration(old, new, &memcg, gfp_mask);
414 if (error)
415 return error;
416
417 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
418 if (!error) {
419 struct address_space *mapping = old->mapping;
420 void (*freepage)(struct page *);
421
422 pgoff_t offset = old->index;
423 freepage = mapping->a_ops->freepage;
424
425 page_cache_get(new);
426 new->mapping = mapping;
427 new->index = offset;
428
429 spin_lock_irq(&mapping->tree_lock);
430 __delete_from_page_cache(old);
431 error = radix_tree_insert(&mapping->page_tree, offset, new);
432 BUG_ON(error);
433 mapping->nrpages++;
434 __inc_zone_page_state(new, NR_FILE_PAGES);
435 if (PageSwapBacked(new))
436 __inc_zone_page_state(new, NR_SHMEM);
437 spin_unlock_irq(&mapping->tree_lock);
438 radix_tree_preload_end();
439 if (freepage)
440 freepage(old);
441 page_cache_release(old);
442 mem_cgroup_end_migration(memcg, old, new, true);
443 } else {
444 mem_cgroup_end_migration(memcg, old, new, false);
445 }
446
447 return error;
448}
449EXPORT_SYMBOL_GPL(replace_page_cache_page);
450
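replace_page_cache_page() swaps one pagecache page for another atomically but deliberately leaves the LRU to the caller; a page-stealing filesystem path (fuse's splice support is the kind of user this was written for) would use it roughly as follows, sketched here for illustration:

	/* Both pages are locked; newpage is not yet visible anywhere. */
	err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
	if (!err) {
		lru_cache_add_file(newpage);	/* LRU insertion is the caller's job */
		unlock_page(oldpage);		/* oldpage has left the mapping */
	}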
451/**
388 * add_to_page_cache_locked - add a locked page to the pagecache 452 * add_to_page_cache_locked - add a locked page to the pagecache
389 * @page: page to add 453 * @page: page to add
390 * @mapping: the page's address_space 454 * @mapping: the page's address_space
@@ -477,12 +541,6 @@ struct page *__page_cache_alloc(gfp_t gfp)
477EXPORT_SYMBOL(__page_cache_alloc); 541EXPORT_SYMBOL(__page_cache_alloc);
478#endif 542#endif
479 543
480static int __sleep_on_page_lock(void *word)
481{
482 io_schedule();
483 return 0;
484}
485
486/* 544/*
487 * In order to wait for pages to become available there must be 545 * In order to wait for pages to become available there must be
488 * waitqueues associated with pages. By using a hash table of 546 * waitqueues associated with pages. By using a hash table of
@@ -510,11 +568,22 @@ void wait_on_page_bit(struct page *page, int bit_nr)
510 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); 568 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
511 569
512 if (test_bit(bit_nr, &page->flags)) 570 if (test_bit(bit_nr, &page->flags))
513 __wait_on_bit(page_waitqueue(page), &wait, sync_page, 571 __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
514 TASK_UNINTERRUPTIBLE); 572 TASK_UNINTERRUPTIBLE);
515} 573}
516EXPORT_SYMBOL(wait_on_page_bit); 574EXPORT_SYMBOL(wait_on_page_bit);
517 575
576int wait_on_page_bit_killable(struct page *page, int bit_nr)
577{
578 DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
579
580 if (!test_bit(bit_nr, &page->flags))
581 return 0;
582
583 return __wait_on_bit(page_waitqueue(page), &wait,
584 sleep_on_page_killable, TASK_KILLABLE);
585}
586
518/** 587/**
519 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue 588 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
520 * @page: Page defining the wait queue of interest 589 * @page: Page defining the wait queue of interest
@@ -574,17 +643,12 @@ EXPORT_SYMBOL(end_page_writeback);
574/** 643/**
575 * __lock_page - get a lock on the page, assuming we need to sleep to get it 644 * __lock_page - get a lock on the page, assuming we need to sleep to get it
576 * @page: the page to lock 645 * @page: the page to lock
577 *
578 * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
579 * random driver's requestfn sets TASK_RUNNING, we could busywait. However
580 * chances are that on the second loop, the block layer's plug list is empty,
581 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
582 */ 646 */
583void __lock_page(struct page *page) 647void __lock_page(struct page *page)
584{ 648{
585 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 649 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
586 650
587 __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page, 651 __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
588 TASK_UNINTERRUPTIBLE); 652 TASK_UNINTERRUPTIBLE);
589} 653}
590EXPORT_SYMBOL(__lock_page); 654EXPORT_SYMBOL(__lock_page);
@@ -594,22 +658,40 @@ int __lock_page_killable(struct page *page)
594 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 658 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
595 659
596 return __wait_on_bit_lock(page_waitqueue(page), &wait, 660 return __wait_on_bit_lock(page_waitqueue(page), &wait,
597 sync_page_killable, TASK_KILLABLE); 661 sleep_on_page_killable, TASK_KILLABLE);
598} 662}
599EXPORT_SYMBOL_GPL(__lock_page_killable); 663EXPORT_SYMBOL_GPL(__lock_page_killable);
600 664
601/** 665int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
602 * __lock_page_nosync - get a lock on the page, without calling sync_page() 666 unsigned int flags)
603 * @page: the page to lock
604 *
605 * Variant of lock_page that does not require the caller to hold a reference
606 * on the page's mapping.
607 */
608void __lock_page_nosync(struct page *page)
609{ 667{
610 DEFINE_WAIT_BIT(wait, &page->flags, PG_locked); 668 if (flags & FAULT_FLAG_ALLOW_RETRY) {
611 __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock, 669 /*
612 TASK_UNINTERRUPTIBLE); 670 * CAUTION! In this case, mmap_sem is not released
 671 * even though 0 is returned.
672 */
673 if (flags & FAULT_FLAG_RETRY_NOWAIT)
674 return 0;
675
676 up_read(&mm->mmap_sem);
677 if (flags & FAULT_FLAG_KILLABLE)
678 wait_on_page_locked_killable(page);
679 else
680 wait_on_page_locked(page);
681 return 0;
682 } else {
683 if (flags & FAULT_FLAG_KILLABLE) {
684 int ret;
685
686 ret = __lock_page_killable(page);
687 if (ret) {
688 up_read(&mm->mmap_sem);
689 return 0;
690 }
691 } else
692 __lock_page(page);
693 return 1;
694 }
613} 695}
614 696
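__lock_page_or_retry() lets a page-fault handler wait for a page lock without holding mmap_sem across the sleep; the filemap_fault() conversion further down in this file is the user. Its contract, restated as a sketch:

	if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
		/* 0: the lock was not taken; mmap_sem has been dropped
		 * (unless FAULT_FLAG_RETRY_NOWAIT was set). */
		page_cache_release(page);
		return ret | VM_FAULT_RETRY;	/* the fault will be retried later */
	}
	/* 1: page is locked and mmap_sem is still held. */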
615/** 697/**
@@ -631,7 +713,9 @@ repeat:
631 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); 713 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
632 if (pagep) { 714 if (pagep) {
633 page = radix_tree_deref_slot(pagep); 715 page = radix_tree_deref_slot(pagep);
634 if (unlikely(!page || page == RADIX_TREE_RETRY)) 716 if (unlikely(!page))
717 goto out;
718 if (radix_tree_deref_retry(page))
635 goto repeat; 719 goto repeat;
636 720
637 if (!page_cache_get_speculative(page)) 721 if (!page_cache_get_speculative(page))
@@ -647,6 +731,7 @@ repeat:
647 goto repeat; 731 goto repeat;
648 } 732 }
649 } 733 }
734out:
650 rcu_read_unlock(); 735 rcu_read_unlock();
651 736
652 return page; 737 return page;
@@ -764,12 +849,15 @@ repeat:
764 page = radix_tree_deref_slot((void **)pages[i]); 849 page = radix_tree_deref_slot((void **)pages[i]);
765 if (unlikely(!page)) 850 if (unlikely(!page))
766 continue; 851 continue;
852
767 /* 853 /*
768 * this can only trigger if nr_found == 1, making livelock 854 * This can only trigger when the entry at index 0 moves out
769 * a non issue. 855 * of or back to the root: none yet gotten, safe to restart.
770 */ 856 */
771 if (unlikely(page == RADIX_TREE_RETRY)) 857 if (radix_tree_deref_retry(page)) {
858 WARN_ON(start | i);
772 goto restart; 859 goto restart;
860 }
773 861
774 if (!page_cache_get_speculative(page)) 862 if (!page_cache_get_speculative(page))
775 goto repeat; 863 goto repeat;
@@ -783,6 +871,13 @@ repeat:
783 pages[ret] = page; 871 pages[ret] = page;
784 ret++; 872 ret++;
785 } 873 }
874
875 /*
876 * If all entries were removed before we could secure them,
877 * try again, because callers stop trying once 0 is returned.
878 */
879 if (unlikely(!ret && nr_found))
880 goto restart;
786 rcu_read_unlock(); 881 rcu_read_unlock();
787 return ret; 882 return ret;
788} 883}
@@ -817,16 +912,14 @@ repeat:
817 page = radix_tree_deref_slot((void **)pages[i]); 912 page = radix_tree_deref_slot((void **)pages[i]);
818 if (unlikely(!page)) 913 if (unlikely(!page))
819 continue; 914 continue;
915
820 /* 916 /*
821 * this can only trigger if nr_found == 1, making livelock 917 * This can only trigger when the entry at index 0 moves out
822 * a non issue. 918 * of or back to the root: none yet gotten, safe to restart.
823 */ 919 */
824 if (unlikely(page == RADIX_TREE_RETRY)) 920 if (radix_tree_deref_retry(page))
825 goto restart; 921 goto restart;
826 922
827 if (page->mapping == NULL || page->index != index)
828 break;
829
830 if (!page_cache_get_speculative(page)) 923 if (!page_cache_get_speculative(page))
831 goto repeat; 924 goto repeat;
832 925
@@ -836,6 +929,16 @@ repeat:
836 goto repeat; 929 goto repeat;
837 } 930 }
838 931
932 /*
933 * must check mapping and index after taking the ref.
934 * otherwise we can get both false positives and false
935 * negatives, which is just confusing to the caller.
936 */
937 if (page->mapping == NULL || page->index != index) {
938 page_cache_release(page);
939 break;
940 }
941
839 pages[ret] = page; 942 pages[ret] = page;
840 ret++; 943 ret++;
841 index++; 944 index++;
@@ -874,11 +977,12 @@ repeat:
874 page = radix_tree_deref_slot((void **)pages[i]); 977 page = radix_tree_deref_slot((void **)pages[i]);
875 if (unlikely(!page)) 978 if (unlikely(!page))
876 continue; 979 continue;
980
877 /* 981 /*
878 * this can only trigger if nr_found == 1, making livelock 982 * This can only trigger when the entry at index 0 moves out
879 * a non issue. 983 * of or back to the root: none yet gotten, safe to restart.
880 */ 984 */
881 if (unlikely(page == RADIX_TREE_RETRY)) 985 if (radix_tree_deref_retry(page))
882 goto restart; 986 goto restart;
883 987
884 if (!page_cache_get_speculative(page)) 988 if (!page_cache_get_speculative(page))
@@ -893,6 +997,13 @@ repeat:
893 pages[ret] = page; 997 pages[ret] = page;
894 ret++; 998 ret++;
895 } 999 }
1000
1001 /*
1002 * If all entries were removed before we could secure them,
1003 * try again, because callers stop trying once 0 is returned.
1004 */
1005 if (unlikely(!ret && nr_found))
1006 goto restart;
896 rcu_read_unlock(); 1007 rcu_read_unlock();
897 1008
898 if (ret) 1009 if (ret)
@@ -1016,6 +1127,9 @@ find_page:
1016 goto page_not_up_to_date; 1127 goto page_not_up_to_date;
1017 if (!trylock_page(page)) 1128 if (!trylock_page(page))
1018 goto page_not_up_to_date; 1129 goto page_not_up_to_date;
1130 /* Did it get truncated before we got the lock? */
1131 if (!page->mapping)
1132 goto page_not_up_to_date_locked;
1019 if (!mapping->a_ops->is_partially_uptodate(page, 1133 if (!mapping->a_ops->is_partially_uptodate(page,
1020 desc, offset)) 1134 desc, offset))
1021 goto page_not_up_to_date_locked; 1135 goto page_not_up_to_date_locked;
@@ -1279,12 +1393,15 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1279 unsigned long seg = 0; 1393 unsigned long seg = 0;
1280 size_t count; 1394 size_t count;
1281 loff_t *ppos = &iocb->ki_pos; 1395 loff_t *ppos = &iocb->ki_pos;
1396 struct blk_plug plug;
1282 1397
1283 count = 0; 1398 count = 0;
1284 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); 1399 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1285 if (retval) 1400 if (retval)
1286 return retval; 1401 return retval;
1287 1402
1403 blk_start_plug(&plug);
1404
1288 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 1405 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
1289 if (filp->f_flags & O_DIRECT) { 1406 if (filp->f_flags & O_DIRECT) {
1290 loff_t size; 1407 loff_t size;
@@ -1357,6 +1474,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1357 break; 1474 break;
1358 } 1475 }
1359out: 1476out:
1477 blk_finish_plug(&plug);
1360 return retval; 1478 return retval;
1361} 1479}
1362EXPORT_SYMBOL(generic_file_aio_read); 1480EXPORT_SYMBOL(generic_file_aio_read);
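With ->sync_page() gone (see the sleep_on_page() changes earlier in this file), readers batch their I/O with the on-stack block plug added above rather than relying on per-queue unplugging. The general pattern, for illustration:

	struct blk_plug plug;

	blk_start_plug(&plug);		/* queue submitted bios on this task */
	/* submit a batch of page reads, e.g. readahead or O_DIRECT segments */
	blk_finish_plug(&plug);		/* flush the whole batch to the device */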
@@ -1449,15 +1567,17 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1449 /* If we don't want any read-ahead, don't bother */ 1567 /* If we don't want any read-ahead, don't bother */
1450 if (VM_RandomReadHint(vma)) 1568 if (VM_RandomReadHint(vma))
1451 return; 1569 return;
1570 if (!ra->ra_pages)
1571 return;
1452 1572
1453 if (VM_SequentialReadHint(vma) || 1573 if (VM_SequentialReadHint(vma)) {
1454 offset - 1 == (ra->prev_pos >> PAGE_CACHE_SHIFT)) {
1455 page_cache_sync_readahead(mapping, ra, file, offset, 1574 page_cache_sync_readahead(mapping, ra, file, offset,
1456 ra->ra_pages); 1575 ra->ra_pages);
1457 return; 1576 return;
1458 } 1577 }
1459 1578
1460 if (ra->mmap_miss < INT_MAX) 1579 /* Avoid banging the cache line if not needed */
1580 if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
1461 ra->mmap_miss++; 1581 ra->mmap_miss++;
1462 1582
1463 /* 1583 /*
@@ -1471,12 +1591,10 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
1471 * mmap read-around 1591 * mmap read-around
1472 */ 1592 */
1473 ra_pages = max_sane_readahead(ra->ra_pages); 1593 ra_pages = max_sane_readahead(ra->ra_pages);
1474 if (ra_pages) { 1594 ra->start = max_t(long, 0, offset - ra_pages / 2);
1475 ra->start = max_t(long, 0, offset - ra_pages/2); 1595 ra->size = ra_pages;
1476 ra->size = ra_pages; 1596 ra->async_size = ra_pages / 4;
1477 ra->async_size = 0; 1597 ra_submit(ra, mapping, file);
1478 ra_submit(ra, mapping, file);
1479 }
1480} 1598}
1481 1599
1482/* 1600/*
@@ -1539,25 +1657,31 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1539 * waiting for the lock. 1657 * waiting for the lock.
1540 */ 1658 */
1541 do_async_mmap_readahead(vma, ra, file, page, offset); 1659 do_async_mmap_readahead(vma, ra, file, page, offset);
1542 lock_page(page);
1543
1544 /* Did it get truncated? */
1545 if (unlikely(page->mapping != mapping)) {
1546 unlock_page(page);
1547 put_page(page);
1548 goto no_cached_page;
1549 }
1550 } else { 1660 } else {
1551 /* No page in the page cache at all */ 1661 /* No page in the page cache at all */
1552 do_sync_mmap_readahead(vma, ra, file, offset); 1662 do_sync_mmap_readahead(vma, ra, file, offset);
1553 count_vm_event(PGMAJFAULT); 1663 count_vm_event(PGMAJFAULT);
1664 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1554 ret = VM_FAULT_MAJOR; 1665 ret = VM_FAULT_MAJOR;
1555retry_find: 1666retry_find:
1556 page = find_lock_page(mapping, offset); 1667 page = find_get_page(mapping, offset);
1557 if (!page) 1668 if (!page)
1558 goto no_cached_page; 1669 goto no_cached_page;
1559 } 1670 }
1560 1671
1672 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
1673 page_cache_release(page);
1674 return ret | VM_FAULT_RETRY;
1675 }
1676
1677 /* Did it get truncated? */
1678 if (unlikely(page->mapping != mapping)) {
1679 unlock_page(page);
1680 put_page(page);
1681 goto retry_find;
1682 }
1683 VM_BUG_ON(page->index != offset);
1684
1561 /* 1685 /*
1562 * We have a locked page in the page cache, now we need to check 1686 * We have a locked page in the page cache, now we need to check
1563 * that it's up-to-date. If not, it is going to be due to an error. 1687 * that it's up-to-date. If not, it is going to be due to an error.
@@ -1576,7 +1700,6 @@ retry_find:
1576 return VM_FAULT_SIGBUS; 1700 return VM_FAULT_SIGBUS;
1577 } 1701 }
1578 1702
1579 ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
1580 vmf->page = page; 1703 vmf->page = page;
1581 return ret | VM_FAULT_LOCKED; 1704 return ret | VM_FAULT_LOCKED;
1582 1705
@@ -1859,16 +1982,26 @@ static int __remove_suid(struct dentry *dentry, int kill)
1859int file_remove_suid(struct file *file) 1982int file_remove_suid(struct file *file)
1860{ 1983{
1861 struct dentry *dentry = file->f_path.dentry; 1984 struct dentry *dentry = file->f_path.dentry;
1862 int killsuid = should_remove_suid(dentry); 1985 struct inode *inode = dentry->d_inode;
1863 int killpriv = security_inode_need_killpriv(dentry); 1986 int killsuid;
1987 int killpriv;
1864 int error = 0; 1988 int error = 0;
1865 1989
1990 /* Fast path for nothing security related */
1991 if (IS_NOSEC(inode))
1992 return 0;
1993
1994 killsuid = should_remove_suid(dentry);
1995 killpriv = security_inode_need_killpriv(dentry);
1996
1866 if (killpriv < 0) 1997 if (killpriv < 0)
1867 return killpriv; 1998 return killpriv;
1868 if (killpriv) 1999 if (killpriv)
1869 error = security_inode_killpriv(dentry); 2000 error = security_inode_killpriv(dentry);
1870 if (!error && killsuid) 2001 if (!error && killsuid)
1871 error = __remove_suid(dentry, killsuid); 2002 error = __remove_suid(dentry, killsuid);
2003 if (!error && (inode->i_sb->s_flags & MS_NOSEC))
2004 inode->i_flags |= S_NOSEC;
1872 2005
1873 return error; 2006 return error;
1874} 2007}
@@ -2177,12 +2310,12 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2177 } 2310 }
2178 2311
2179 if (written > 0) { 2312 if (written > 0) {
2180 loff_t end = pos + written; 2313 pos += written;
2181 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2314 if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
2182 i_size_write(inode, end); 2315 i_size_write(inode, pos);
2183 mark_inode_dirty(inode); 2316 mark_inode_dirty(inode);
2184 } 2317 }
2185 *ppos = end; 2318 *ppos = pos;
2186 } 2319 }
2187out: 2320out:
2188 return written; 2321 return written;
@@ -2203,8 +2336,8 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
2203 gfp_notmask = __GFP_FS; 2336 gfp_notmask = __GFP_FS;
2204repeat: 2337repeat:
2205 page = find_lock_page(mapping, index); 2338 page = find_lock_page(mapping, index);
2206 if (likely(page)) 2339 if (page)
2207 return page; 2340 goto found;
2208 2341
2209 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); 2342 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
2210 if (!page) 2343 if (!page)
@@ -2217,6 +2350,8 @@ repeat:
2217 goto repeat; 2350 goto repeat;
2218 return NULL; 2351 return NULL;
2219 } 2352 }
2353found:
2354 wait_on_page_writeback(page);
2220 return page; 2355 return page;
2221} 2356}
2222EXPORT_SYMBOL(grab_cache_page_write_begin); 2357EXPORT_SYMBOL(grab_cache_page_write_begin);
@@ -2463,11 +2598,13 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2463{ 2598{
2464 struct file *file = iocb->ki_filp; 2599 struct file *file = iocb->ki_filp;
2465 struct inode *inode = file->f_mapping->host; 2600 struct inode *inode = file->f_mapping->host;
2601 struct blk_plug plug;
2466 ssize_t ret; 2602 ssize_t ret;
2467 2603
2468 BUG_ON(iocb->ki_pos != pos); 2604 BUG_ON(iocb->ki_pos != pos);
2469 2605
2470 mutex_lock(&inode->i_mutex); 2606 mutex_lock(&inode->i_mutex);
2607 blk_start_plug(&plug);
2471 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); 2608 ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
2472 mutex_unlock(&inode->i_mutex); 2609 mutex_unlock(&inode->i_mutex);
2473 2610
@@ -2478,6 +2615,7 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2478 if (err < 0 && ret > 0) 2615 if (err < 0 && ret > 0)
2479 ret = err; 2616 ret = err;
2480 } 2617 }
2618 blk_finish_plug(&plug);
2481 return ret; 2619 return ret;
2482} 2620}
2483EXPORT_SYMBOL(generic_file_aio_write); 2621EXPORT_SYMBOL(generic_file_aio_write);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 83364df74a33..93356cd12828 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -183,7 +183,7 @@ __xip_unmap (struct address_space * mapping,
183 return; 183 return;
184 184
185retry: 185retry:
186 spin_lock(&mapping->i_mmap_lock); 186 mutex_lock(&mapping->i_mmap_mutex);
187 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 187 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
188 mm = vma->vm_mm; 188 mm = vma->vm_mm;
189 address = vma->vm_start + 189 address = vma->vm_start +
@@ -201,7 +201,7 @@ retry:
201 page_cache_release(page); 201 page_cache_release(page);
202 } 202 }
203 } 203 }
204 spin_unlock(&mapping->i_mmap_lock); 204 mutex_unlock(&mapping->i_mmap_mutex);
205 205
206 if (locked) { 206 if (locked) {
207 mutex_unlock(&xip_sparse_mutex); 207 mutex_unlock(&xip_sparse_mutex);
diff --git a/mm/fremap.c b/mm/fremap.c
index ec520c7b28df..b8e0e2d468af 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -211,20 +211,20 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
211 } 211 }
212 goto out; 212 goto out;
213 } 213 }
214 spin_lock(&mapping->i_mmap_lock); 214 mutex_lock(&mapping->i_mmap_mutex);
215 flush_dcache_mmap_lock(mapping); 215 flush_dcache_mmap_lock(mapping);
216 vma->vm_flags |= VM_NONLINEAR; 216 vma->vm_flags |= VM_NONLINEAR;
217 vma_prio_tree_remove(vma, &mapping->i_mmap); 217 vma_prio_tree_remove(vma, &mapping->i_mmap);
218 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear); 218 vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
219 flush_dcache_mmap_unlock(mapping); 219 flush_dcache_mmap_unlock(mapping);
220 spin_unlock(&mapping->i_mmap_lock); 220 mutex_unlock(&mapping->i_mmap_mutex);
221 } 221 }
222 222
223 if (vma->vm_flags & VM_LOCKED) { 223 if (vma->vm_flags & VM_LOCKED) {
224 /* 224 /*
225 * drop PG_Mlocked flag for over-mapped range 225 * drop PG_Mlocked flag for over-mapped range
226 */ 226 */
227 unsigned int saved_flags = vma->vm_flags; 227 vm_flags_t saved_flags = vma->vm_flags;
228 munlock_vma_pages_range(vma, start, start + size); 228 munlock_vma_pages_range(vma, start, start + size);
229 vma->vm_flags = saved_flags; 229 vma->vm_flags = saved_flags;
230 } 230 }
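filemap_xip.c and fremap.c above show the same mechanical conversion applied throughout mm/ in this merge: the address_space's i_mmap_lock spinlock becomes the sleeping i_mmap_mutex, so code walking the i_mmap prio tree may now block. The shape of the conversion, for illustration:

	mutex_lock(&mapping->i_mmap_mutex);	/* was: spin_lock(&mapping->i_mmap_lock) */
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		/* the body may now sleep, which the spinlock forbade */
	}
	mutex_unlock(&mapping->i_mmap_mutex);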
diff --git a/mm/highmem.c b/mm/highmem.c
index 7a0aa1be4993..693394daa2ed 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -29,6 +29,11 @@
29#include <linux/kgdb.h> 29#include <linux/kgdb.h>
30#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
31 31
32
33#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
34DEFINE_PER_CPU(int, __kmap_atomic_idx);
35#endif
36
32/* 37/*
33 * Virtual_count is not a pure "count". 38 * Virtual_count is not a pure "count".
34 * 0 means that it is not mapped, and has not been mapped 39 * 0 means that it is not mapped, and has not been mapped
@@ -42,6 +47,9 @@
42unsigned long totalhigh_pages __read_mostly; 47unsigned long totalhigh_pages __read_mostly;
43EXPORT_SYMBOL(totalhigh_pages); 48EXPORT_SYMBOL(totalhigh_pages);
44 49
50
51EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
52
45unsigned int nr_free_highpages (void) 53unsigned int nr_free_highpages (void)
46{ 54{
47 pg_data_t *pgdat; 55 pg_data_t *pgdat;
@@ -422,61 +430,3 @@ void __init page_address_init(void)
422} 430}
423 431
424#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ 432#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
425
426#ifdef CONFIG_DEBUG_HIGHMEM
427
428void debug_kmap_atomic(enum km_type type)
429{
430 static int warn_count = 10;
431
432 if (unlikely(warn_count < 0))
433 return;
434
435 if (unlikely(in_interrupt())) {
436 if (in_nmi()) {
437 if (type != KM_NMI && type != KM_NMI_PTE) {
438 WARN_ON(1);
439 warn_count--;
440 }
441 } else if (in_irq()) {
442 if (type != KM_IRQ0 && type != KM_IRQ1 &&
443 type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
444 type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
445 WARN_ON(1);
446 warn_count--;
447 }
448 } else if (!irqs_disabled()) { /* softirq */
449 if (type != KM_IRQ0 && type != KM_IRQ1 &&
450 type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
451 type != KM_SKB_SUNRPC_DATA &&
452 type != KM_SKB_DATA_SOFTIRQ &&
453 type != KM_BOUNCE_READ) {
454 WARN_ON(1);
455 warn_count--;
456 }
457 }
458 }
459
460 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
461 type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
462 type == KM_IRQ_PTE || type == KM_NMI ||
463 type == KM_NMI_PTE ) {
464 if (!irqs_disabled()) {
465 WARN_ON(1);
466 warn_count--;
467 }
468 } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
469 if (irq_count() == 0 && !irqs_disabled()) {
470 WARN_ON(1);
471 warn_count--;
472 }
473 }
474#ifdef CONFIG_KGDB_KDB
475 if (unlikely(type == KM_KDB && atomic_read(&kgdb_active) == -1)) {
476 WARN_ON(1);
477 warn_count--;
478 }
479#endif /* CONFIG_KGDB_KDB */
480}
481
482#endif
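The highmem.c hunks export the per-CPU __kmap_atomic_idx that backs the stack-based atomic kmap implementation and delete debug_kmap_atomic(), whose per-KM_* slot checking no longer applies. Under the stack discipline, nested atomic mappings simply have to be released in reverse order; a hedged sketch (in this era the KM_* argument is still accepted but no longer selects a slot):

	char *src = kmap_atomic(src_page, KM_USER0);
	char *dst = kmap_atomic(dst_page, KM_USER1);

	memcpy(dst, src, PAGE_SIZE);

	kunmap_atomic(dst, KM_USER1);	/* last mapped, first unmapped */
	kunmap_atomic(src, KM_USER0);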
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
new file mode 100644
index 000000000000..81532f297fd2
--- /dev/null
+++ b/mm/huge_memory.c
@@ -0,0 +1,2391 @@
1/*
2 * Copyright (C) 2009 Red Hat, Inc.
3 *
4 * This work is licensed under the terms of the GNU GPL, version 2. See
5 * the COPYING file in the top-level directory.
6 */
7
8#include <linux/mm.h>
9#include <linux/sched.h>
10#include <linux/highmem.h>
11#include <linux/hugetlb.h>
12#include <linux/mmu_notifier.h>
13#include <linux/rmap.h>
14#include <linux/swap.h>
15#include <linux/mm_inline.h>
16#include <linux/kthread.h>
17#include <linux/khugepaged.h>
18#include <linux/freezer.h>
19#include <linux/mman.h>
20#include <asm/tlb.h>
21#include <asm/pgalloc.h>
22#include "internal.h"
23
24/*
25 * By default transparent hugepage support is enabled for all mappings
26 * and khugepaged scans all mappings. Defrag is only invoked by
27 * khugepaged hugepage allocations and by page faults inside
28 * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
29 * allocations.
30 */
31unsigned long transparent_hugepage_flags __read_mostly =
32#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
33 (1<<TRANSPARENT_HUGEPAGE_FLAG)|
34#endif
35#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
36 (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
37#endif
38 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
39 (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
40
41/* default scan 8*512 pte (or vmas) every 30 second */
42static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
43static unsigned int khugepaged_pages_collapsed;
44static unsigned int khugepaged_full_scans;
45static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
46/* during fragmentation poll the hugepage allocator once every minute */
47static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
48static struct task_struct *khugepaged_thread __read_mostly;
49static DEFINE_MUTEX(khugepaged_mutex);
50static DEFINE_SPINLOCK(khugepaged_mm_lock);
51static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
52/*
53 * default collapse hugepages if there is at least one pte mapped like
54 * it would have happened if the vma was large enough during page
55 * fault.
56 */
57static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
58
59static int khugepaged(void *none);
60static int mm_slots_hash_init(void);
61static int khugepaged_slab_init(void);
62static void khugepaged_slab_free(void);
63
64#define MM_SLOTS_HASH_HEADS 1024
65static struct hlist_head *mm_slots_hash __read_mostly;
66static struct kmem_cache *mm_slot_cache __read_mostly;
67
68/**
69 * struct mm_slot - hash lookup from mm to mm_slot
70 * @hash: hash collision list
71 * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
72 * @mm: the mm that this information is valid for
73 */
74struct mm_slot {
75 struct hlist_node hash;
76 struct list_head mm_node;
77 struct mm_struct *mm;
78};
79
80/**
81 * struct khugepaged_scan - cursor for scanning
82 * @mm_head: the head of the mm list to scan
83 * @mm_slot: the current mm_slot we are scanning
84 * @address: the next address inside that to be scanned
85 *
86 * There is only the one khugepaged_scan instance of this cursor structure.
87 */
88struct khugepaged_scan {
89 struct list_head mm_head;
90 struct mm_slot *mm_slot;
91 unsigned long address;
92} khugepaged_scan = {
93 .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
94};
95
96
97static int set_recommended_min_free_kbytes(void)
98{
99 struct zone *zone;
100 int nr_zones = 0;
101 unsigned long recommended_min;
102 extern int min_free_kbytes;
103
104 if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG,
105 &transparent_hugepage_flags) &&
106 !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
107 &transparent_hugepage_flags))
108 return 0;
109
110 for_each_populated_zone(zone)
111 nr_zones++;
112
113 /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */
114 recommended_min = pageblock_nr_pages * nr_zones * 2;
115
116 /*
117 * Make sure that on average at least two pageblocks are almost free
118 * of another type, one for a migratetype to fall back to and a
 119 * second to avoid subsequent fallbacks of other types. There are 3
120 * MIGRATE_TYPES we care about.
121 */
122 recommended_min += pageblock_nr_pages * nr_zones *
123 MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
124
125 /* don't ever allow to reserve more than 5% of the lowmem */
126 recommended_min = min(recommended_min,
127 (unsigned long) nr_free_buffer_pages() / 20);
128 recommended_min <<= (PAGE_SHIFT-10);
129
130 if (recommended_min > min_free_kbytes)
131 min_free_kbytes = recommended_min;
132 setup_per_zone_wmarks();
133 return 0;
134}
135late_initcall(set_recommended_min_free_kbytes);
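As a rough worked example of set_recommended_min_free_kbytes(), assuming 4 KiB pages, 2 MiB pageblocks (pageblock_nr_pages = 512), three populated zones and MIGRATE_PCPTYPES = 3: the MIGRATE_RESERVE term contributes 512 * 3 * 2 = 3072 pages and the anti-fallback term 512 * 3 * 3 * 3 = 13824 pages, for 16896 pages in total; that total is capped at 5% of lowmem, converted to kilobytes by the << (PAGE_SHIFT-10) shift (67584 kB, about 66 MiB here), and raises min_free_kbytes only if it exceeds the current value.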
136
137static int start_khugepaged(void)
138{
139 int err = 0;
140 if (khugepaged_enabled()) {
141 int wakeup;
142 if (unlikely(!mm_slot_cache || !mm_slots_hash)) {
143 err = -ENOMEM;
144 goto out;
145 }
146 mutex_lock(&khugepaged_mutex);
147 if (!khugepaged_thread)
148 khugepaged_thread = kthread_run(khugepaged, NULL,
149 "khugepaged");
150 if (unlikely(IS_ERR(khugepaged_thread))) {
151 printk(KERN_ERR
152 "khugepaged: kthread_run(khugepaged) failed\n");
153 err = PTR_ERR(khugepaged_thread);
154 khugepaged_thread = NULL;
155 }
156 wakeup = !list_empty(&khugepaged_scan.mm_head);
157 mutex_unlock(&khugepaged_mutex);
158 if (wakeup)
159 wake_up_interruptible(&khugepaged_wait);
160
161 set_recommended_min_free_kbytes();
162 } else
163 /* wakeup to exit */
164 wake_up_interruptible(&khugepaged_wait);
165out:
166 return err;
167}
168
169#ifdef CONFIG_SYSFS
170
171static ssize_t double_flag_show(struct kobject *kobj,
172 struct kobj_attribute *attr, char *buf,
173 enum transparent_hugepage_flag enabled,
174 enum transparent_hugepage_flag req_madv)
175{
176 if (test_bit(enabled, &transparent_hugepage_flags)) {
177 VM_BUG_ON(test_bit(req_madv, &transparent_hugepage_flags));
178 return sprintf(buf, "[always] madvise never\n");
179 } else if (test_bit(req_madv, &transparent_hugepage_flags))
180 return sprintf(buf, "always [madvise] never\n");
181 else
182 return sprintf(buf, "always madvise [never]\n");
183}
184static ssize_t double_flag_store(struct kobject *kobj,
185 struct kobj_attribute *attr,
186 const char *buf, size_t count,
187 enum transparent_hugepage_flag enabled,
188 enum transparent_hugepage_flag req_madv)
189{
190 if (!memcmp("always", buf,
191 min(sizeof("always")-1, count))) {
192 set_bit(enabled, &transparent_hugepage_flags);
193 clear_bit(req_madv, &transparent_hugepage_flags);
194 } else if (!memcmp("madvise", buf,
195 min(sizeof("madvise")-1, count))) {
196 clear_bit(enabled, &transparent_hugepage_flags);
197 set_bit(req_madv, &transparent_hugepage_flags);
198 } else if (!memcmp("never", buf,
199 min(sizeof("never")-1, count))) {
200 clear_bit(enabled, &transparent_hugepage_flags);
201 clear_bit(req_madv, &transparent_hugepage_flags);
202 } else
203 return -EINVAL;
204
205 return count;
206}
207
208static ssize_t enabled_show(struct kobject *kobj,
209 struct kobj_attribute *attr, char *buf)
210{
211 return double_flag_show(kobj, attr, buf,
212 TRANSPARENT_HUGEPAGE_FLAG,
213 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
214}
215static ssize_t enabled_store(struct kobject *kobj,
216 struct kobj_attribute *attr,
217 const char *buf, size_t count)
218{
219 ssize_t ret;
220
221 ret = double_flag_store(kobj, attr, buf, count,
222 TRANSPARENT_HUGEPAGE_FLAG,
223 TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG);
224
225 if (ret > 0) {
226 int err = start_khugepaged();
227 if (err)
228 ret = err;
229 }
230
231 if (ret > 0 &&
232 (test_bit(TRANSPARENT_HUGEPAGE_FLAG,
233 &transparent_hugepage_flags) ||
234 test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
235 &transparent_hugepage_flags)))
236 set_recommended_min_free_kbytes();
237
238 return ret;
239}
240static struct kobj_attribute enabled_attr =
241 __ATTR(enabled, 0644, enabled_show, enabled_store);
242
243static ssize_t single_flag_show(struct kobject *kobj,
244 struct kobj_attribute *attr, char *buf,
245 enum transparent_hugepage_flag flag)
246{
247 return sprintf(buf, "%d\n",
248 !!test_bit(flag, &transparent_hugepage_flags));
249}
250
251static ssize_t single_flag_store(struct kobject *kobj,
252 struct kobj_attribute *attr,
253 const char *buf, size_t count,
254 enum transparent_hugepage_flag flag)
255{
256 unsigned long value;
257 int ret;
258
259 ret = kstrtoul(buf, 10, &value);
260 if (ret < 0)
261 return ret;
262 if (value > 1)
263 return -EINVAL;
264
265 if (value)
266 set_bit(flag, &transparent_hugepage_flags);
267 else
268 clear_bit(flag, &transparent_hugepage_flags);
269
270 return count;
271}
272
273/*
274 * Currently defrag only disables __GFP_NOWAIT for allocation. A blind
275 * __GFP_REPEAT is too aggressive, it's never worth swapping tons of
276 * memory just to allocate one more hugepage.
277 */
278static ssize_t defrag_show(struct kobject *kobj,
279 struct kobj_attribute *attr, char *buf)
280{
281 return double_flag_show(kobj, attr, buf,
282 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
283 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
284}
285static ssize_t defrag_store(struct kobject *kobj,
286 struct kobj_attribute *attr,
287 const char *buf, size_t count)
288{
289 return double_flag_store(kobj, attr, buf, count,
290 TRANSPARENT_HUGEPAGE_DEFRAG_FLAG,
291 TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG);
292}
293static struct kobj_attribute defrag_attr =
294 __ATTR(defrag, 0644, defrag_show, defrag_store);
295
296#ifdef CONFIG_DEBUG_VM
297static ssize_t debug_cow_show(struct kobject *kobj,
298 struct kobj_attribute *attr, char *buf)
299{
300 return single_flag_show(kobj, attr, buf,
301 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
302}
303static ssize_t debug_cow_store(struct kobject *kobj,
304 struct kobj_attribute *attr,
305 const char *buf, size_t count)
306{
307 return single_flag_store(kobj, attr, buf, count,
308 TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
309}
310static struct kobj_attribute debug_cow_attr =
311 __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
312#endif /* CONFIG_DEBUG_VM */
313
314static struct attribute *hugepage_attr[] = {
315 &enabled_attr.attr,
316 &defrag_attr.attr,
317#ifdef CONFIG_DEBUG_VM
318 &debug_cow_attr.attr,
319#endif
320 NULL,
321};
322
323static struct attribute_group hugepage_attr_group = {
324 .attrs = hugepage_attr,
325};
326
327static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
328 struct kobj_attribute *attr,
329 char *buf)
330{
331 return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
332}
333
334static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
335 struct kobj_attribute *attr,
336 const char *buf, size_t count)
337{
338 unsigned long msecs;
339 int err;
340
341 err = strict_strtoul(buf, 10, &msecs);
342 if (err || msecs > UINT_MAX)
343 return -EINVAL;
344
345 khugepaged_scan_sleep_millisecs = msecs;
346 wake_up_interruptible(&khugepaged_wait);
347
348 return count;
349}
350static struct kobj_attribute scan_sleep_millisecs_attr =
351 __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show,
352 scan_sleep_millisecs_store);
353
354static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
355 struct kobj_attribute *attr,
356 char *buf)
357{
358 return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
359}
360
361static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
362 struct kobj_attribute *attr,
363 const char *buf, size_t count)
364{
365 unsigned long msecs;
366 int err;
367
368 err = strict_strtoul(buf, 10, &msecs);
369 if (err || msecs > UINT_MAX)
370 return -EINVAL;
371
372 khugepaged_alloc_sleep_millisecs = msecs;
373 wake_up_interruptible(&khugepaged_wait);
374
375 return count;
376}
377static struct kobj_attribute alloc_sleep_millisecs_attr =
378 __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show,
379 alloc_sleep_millisecs_store);
380
381static ssize_t pages_to_scan_show(struct kobject *kobj,
382 struct kobj_attribute *attr,
383 char *buf)
384{
385 return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
386}
387static ssize_t pages_to_scan_store(struct kobject *kobj,
388 struct kobj_attribute *attr,
389 const char *buf, size_t count)
390{
391 int err;
392 unsigned long pages;
393
394 err = strict_strtoul(buf, 10, &pages);
395 if (err || !pages || pages > UINT_MAX)
396 return -EINVAL;
397
398 khugepaged_pages_to_scan = pages;
399
400 return count;
401}
402static struct kobj_attribute pages_to_scan_attr =
403 __ATTR(pages_to_scan, 0644, pages_to_scan_show,
404 pages_to_scan_store);
405
406static ssize_t pages_collapsed_show(struct kobject *kobj,
407 struct kobj_attribute *attr,
408 char *buf)
409{
410 return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
411}
412static struct kobj_attribute pages_collapsed_attr =
413 __ATTR_RO(pages_collapsed);
414
415static ssize_t full_scans_show(struct kobject *kobj,
416 struct kobj_attribute *attr,
417 char *buf)
418{
419 return sprintf(buf, "%u\n", khugepaged_full_scans);
420}
421static struct kobj_attribute full_scans_attr =
422 __ATTR_RO(full_scans);
423
424static ssize_t khugepaged_defrag_show(struct kobject *kobj,
425 struct kobj_attribute *attr, char *buf)
426{
427 return single_flag_show(kobj, attr, buf,
428 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
429}
430static ssize_t khugepaged_defrag_store(struct kobject *kobj,
431 struct kobj_attribute *attr,
432 const char *buf, size_t count)
433{
434 return single_flag_store(kobj, attr, buf, count,
435 TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
436}
437static struct kobj_attribute khugepaged_defrag_attr =
438 __ATTR(defrag, 0644, khugepaged_defrag_show,
439 khugepaged_defrag_store);
440
441/*
442 * max_ptes_none controls whether khugepaged should collapse hugepages
443 * over unmapped ptes, which in turn potentially increases the memory
444 * footprint of the vmas. When max_ptes_none is 0, khugepaged will not
445 * reduce the available free memory in the system as it
446 * runs. Increasing max_ptes_none will instead potentially reduce the
447 * free memory in the system during the khugepaged scan.
448 */
449static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
450 struct kobj_attribute *attr,
451 char *buf)
452{
453 return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
454}
455static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
456 struct kobj_attribute *attr,
457 const char *buf, size_t count)
458{
459 int err;
460 unsigned long max_ptes_none;
461
462 err = strict_strtoul(buf, 10, &max_ptes_none);
463 if (err || max_ptes_none > HPAGE_PMD_NR-1)
464 return -EINVAL;
465
466 khugepaged_max_ptes_none = max_ptes_none;
467
468 return count;
469}
470static struct kobj_attribute khugepaged_max_ptes_none_attr =
471 __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show,
472 khugepaged_max_ptes_none_store);
473
474static struct attribute *khugepaged_attr[] = {
475 &khugepaged_defrag_attr.attr,
476 &khugepaged_max_ptes_none_attr.attr,
477 &pages_to_scan_attr.attr,
478 &pages_collapsed_attr.attr,
479 &full_scans_attr.attr,
480 &scan_sleep_millisecs_attr.attr,
481 &alloc_sleep_millisecs_attr.attr,
482 NULL,
483};
484
485static struct attribute_group khugepaged_attr_group = {
486 .attrs = khugepaged_attr,
487 .name = "khugepaged",
488};
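/*
 * Tuning sketch, same sysfs assumption: because this group is named
 * "khugepaged", its attributes land one directory deeper, e.g.:
 *
 *   echo 4096  > /sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
 *   echo 10000 > /sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
 *   cat /sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
 *
 * Per the store helpers above, pages_to_scan must be a non-zero
 * integer and the sleep values are plain milliseconds.
 */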
489#endif /* CONFIG_SYSFS */
490
491static int __init hugepage_init(void)
492{
493 int err;
494#ifdef CONFIG_SYSFS
495 static struct kobject *hugepage_kobj;
496#endif
497
498 err = -EINVAL;
499 if (!has_transparent_hugepage()) {
500 transparent_hugepage_flags = 0;
501 goto out;
502 }
503
504#ifdef CONFIG_SYSFS
505 err = -ENOMEM;
506 hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
507 if (unlikely(!hugepage_kobj)) {
508 printk(KERN_ERR "hugepage: failed to create kobject\n");
509 goto out;
510 }
511
512 err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group);
513 if (err) {
514 printk(KERN_ERR "hugepage: failed to register hugepage group\n");
515 goto out;
516 }
517
518 err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group);
519 if (err) {
520 printk(KERN_ERR "hugepage: failed to register khugepaged group\n");
521 goto out;
522 }
523#endif
524
525 err = khugepaged_slab_init();
526 if (err)
527 goto out;
528
529 err = mm_slots_hash_init();
530 if (err) {
531 khugepaged_slab_free();
532 goto out;
533 }
534
535 /*
536 * By default disable transparent hugepages on smaller systems,
537 * where the extra memory used could hurt more than TLB overhead
538 * is likely to save. The admin can still enable it through /sys.
539 */
540 if (totalram_pages < (512 << (20 - PAGE_SHIFT)))
541 transparent_hugepage_flags = 0;
542
543 start_khugepaged();
544
545 set_recommended_min_free_kbytes();
546
547out:
548 return err;
549}
550module_init(hugepage_init)
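/*
 * Worked example for the threshold in hugepage_init(): the expression
 * (512 << (20 - PAGE_SHIFT)) is 512MB expressed in pages. With 4KB
 * pages (PAGE_SHIFT == 12) that is 512 << 8 == 131072 pages, so THP
 * starts out disabled on machines with less than 512MB of RAM and has
 * to be re-enabled explicitly through /sys.
 */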
551
552static int __init setup_transparent_hugepage(char *str)
553{
554 int ret = 0;
555 if (!str)
556 goto out;
557 if (!strcmp(str, "always")) {
558 set_bit(TRANSPARENT_HUGEPAGE_FLAG,
559 &transparent_hugepage_flags);
560 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
561 &transparent_hugepage_flags);
562 ret = 1;
563 } else if (!strcmp(str, "madvise")) {
564 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
565 &transparent_hugepage_flags);
566 set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
567 &transparent_hugepage_flags);
568 ret = 1;
569 } else if (!strcmp(str, "never")) {
570 clear_bit(TRANSPARENT_HUGEPAGE_FLAG,
571 &transparent_hugepage_flags);
572 clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
573 &transparent_hugepage_flags);
574 ret = 1;
575 }
576out:
577 if (!ret)
578 printk(KERN_WARNING
579 "transparent_hugepage= cannot parse, ignored\n");
580 return ret;
581}
582__setup("transparent_hugepage=", setup_transparent_hugepage);
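/*
 * Boot-time sketch: the same policy can be chosen on the kernel
 * command line before userspace is up, e.g.:
 *
 *   transparent_hugepage=madvise
 *
 * Only "always", "madvise" and "never" parse; anything else trips the
 * warning above and leaves the compiled-in default untouched.
 */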
583
584static void prepare_pmd_huge_pte(pgtable_t pgtable,
585 struct mm_struct *mm)
586{
587 assert_spin_locked(&mm->page_table_lock);
588
589 /* FIFO */
590 if (!mm->pmd_huge_pte)
591 INIT_LIST_HEAD(&pgtable->lru);
592 else
593 list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
594 mm->pmd_huge_pte = pgtable;
595}
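/*
 * The pte page queued above is a deposit: it sits unused on the FIFO
 * list at mm->pmd_huge_pte until the huge pmd is split or zapped, at
 * which point get_pmd_huge_pte() below withdraws it. The split path
 * therefore never has to allocate memory.
 */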
596
597static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
598{
599 if (likely(vma->vm_flags & VM_WRITE))
600 pmd = pmd_mkwrite(pmd);
601 return pmd;
602}
603
604static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
605 struct vm_area_struct *vma,
606 unsigned long haddr, pmd_t *pmd,
607 struct page *page)
608{
609 int ret = 0;
610 pgtable_t pgtable;
611
612 VM_BUG_ON(!PageCompound(page));
613 pgtable = pte_alloc_one(mm, haddr);
614 if (unlikely(!pgtable)) {
615 mem_cgroup_uncharge_page(page);
616 put_page(page);
617 return VM_FAULT_OOM;
618 }
619
620 clear_huge_page(page, haddr, HPAGE_PMD_NR);
621 __SetPageUptodate(page);
622
623 spin_lock(&mm->page_table_lock);
624 if (unlikely(!pmd_none(*pmd))) {
625 spin_unlock(&mm->page_table_lock);
626 mem_cgroup_uncharge_page(page);
627 put_page(page);
628 pte_free(mm, pgtable);
629 } else {
630 pmd_t entry;
631 entry = mk_pmd(page, vma->vm_page_prot);
632 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
633 entry = pmd_mkhuge(entry);
634 /*
635 * The spinlocking to take the lru_lock inside
636 * page_add_new_anon_rmap() acts as a full memory
637 * barrier to be sure the clear_huge_page writes become
638 * visible before the set_pmd_at() write.
639 */
640 page_add_new_anon_rmap(page, vma, haddr);
641 set_pmd_at(mm, haddr, pmd, entry);
642 prepare_pmd_huge_pte(pgtable, mm);
643 add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
644 spin_unlock(&mm->page_table_lock);
645 }
646
647 return ret;
648}
649
650static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
651{
652 return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
653}
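/*
 * Reading the expression above: with defrag set nothing is masked off,
 * so the full GFP_TRANSHUGE mask is used and the allocation may block
 * in reclaim/compaction; with defrag clear __GFP_WAIT is removed and
 * the huge page is only taken if it is readily available. extra_gfp is
 * simply OR-ed on top in either case.
 */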
654
655static inline struct page *alloc_hugepage_vma(int defrag,
656 struct vm_area_struct *vma,
657 unsigned long haddr, int nd,
658 gfp_t extra_gfp)
659{
660 return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
661 HPAGE_PMD_ORDER, vma, haddr, nd);
662}
663
664#ifndef CONFIG_NUMA
665static inline struct page *alloc_hugepage(int defrag)
666{
667 return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
668 HPAGE_PMD_ORDER);
669}
670#endif
671
672int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
673 unsigned long address, pmd_t *pmd,
674 unsigned int flags)
675{
676 struct page *page;
677 unsigned long haddr = address & HPAGE_PMD_MASK;
678 pte_t *pte;
679
680 if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) {
681 if (unlikely(anon_vma_prepare(vma)))
682 return VM_FAULT_OOM;
683 if (unlikely(khugepaged_enter(vma)))
684 return VM_FAULT_OOM;
685 page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
686 vma, haddr, numa_node_id(), 0);
687 if (unlikely(!page)) {
688 count_vm_event(THP_FAULT_FALLBACK);
689 goto out;
690 }
691 count_vm_event(THP_FAULT_ALLOC);
692 if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
693 put_page(page);
694 goto out;
695 }
696
697 return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page);
698 }
699out:
700 /*
701 * Use __pte_alloc instead of pte_alloc_map, because we can't
702 * run pte_offset_map on the pmd, if a huge pmd could
703 * materialize from under us in a different thread.
704 */
705 if (unlikely(__pte_alloc(mm, vma, pmd, address)))
706 return VM_FAULT_OOM;
707 /* if a huge pmd materialized from under us, just retry later */
708 if (unlikely(pmd_trans_huge(*pmd)))
709 return 0;
710 /*
711 * A regular pmd is established and it can't morph into a huge pmd
712 * from under us anymore at this point because we hold the mmap_sem
713 * read mode and khugepaged takes it in write mode. So now it's
714 * safe to run pte_offset_map().
715 */
716 pte = pte_offset_map(pmd, address);
717 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
718}
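/*
 * In short: the huge fault above is only attempted when the aligned
 * haddr .. haddr + HPAGE_PMD_SIZE range lies entirely inside the vma
 * and a huge page can actually be allocated and charged; every other
 * case falls back to the regular pte fault path via handle_pte_fault().
 */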
719
720int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
721 pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
722 struct vm_area_struct *vma)
723{
724 struct page *src_page;
725 pmd_t pmd;
726 pgtable_t pgtable;
727 int ret;
728
729 ret = -ENOMEM;
730 pgtable = pte_alloc_one(dst_mm, addr);
731 if (unlikely(!pgtable))
732 goto out;
733
734 spin_lock(&dst_mm->page_table_lock);
735 spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
736
737 ret = -EAGAIN;
738 pmd = *src_pmd;
739 if (unlikely(!pmd_trans_huge(pmd))) {
740 pte_free(dst_mm, pgtable);
741 goto out_unlock;
742 }
743 if (unlikely(pmd_trans_splitting(pmd))) {
744 /* split huge page running from under us */
745 spin_unlock(&src_mm->page_table_lock);
746 spin_unlock(&dst_mm->page_table_lock);
747 pte_free(dst_mm, pgtable);
748
749 wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
750 goto out;
751 }
752 src_page = pmd_page(pmd);
753 VM_BUG_ON(!PageHead(src_page));
754 get_page(src_page);
755 page_dup_rmap(src_page);
756 add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
757
758 pmdp_set_wrprotect(src_mm, addr, src_pmd);
759 pmd = pmd_mkold(pmd_wrprotect(pmd));
760 set_pmd_at(dst_mm, addr, dst_pmd, pmd);
761 prepare_pmd_huge_pte(pgtable, dst_mm);
762
763 ret = 0;
764out_unlock:
765 spin_unlock(&src_mm->page_table_lock);
766 spin_unlock(&dst_mm->page_table_lock);
767out:
768 return ret;
769}
770
771/* no "address" argument, so this destroys page coloring on some archs */
772pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
773{
774 pgtable_t pgtable;
775
776 assert_spin_locked(&mm->page_table_lock);
777
778 /* FIFO */
779 pgtable = mm->pmd_huge_pte;
780 if (list_empty(&pgtable->lru))
781 mm->pmd_huge_pte = NULL;
782 else {
783 mm->pmd_huge_pte = list_entry(pgtable->lru.next,
784 struct page, lru);
785 list_del(&pgtable->lru);
786 }
787 return pgtable;
788}
789
790static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
791 struct vm_area_struct *vma,
792 unsigned long address,
793 pmd_t *pmd, pmd_t orig_pmd,
794 struct page *page,
795 unsigned long haddr)
796{
797 pgtable_t pgtable;
798 pmd_t _pmd;
799 int ret = 0, i;
800 struct page **pages;
801
802 pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR,
803 GFP_KERNEL);
804 if (unlikely(!pages)) {
805 ret |= VM_FAULT_OOM;
806 goto out;
807 }
808
809 for (i = 0; i < HPAGE_PMD_NR; i++) {
810 pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE |
811 __GFP_OTHER_NODE,
812 vma, address, page_to_nid(page));
813 if (unlikely(!pages[i] ||
814 mem_cgroup_newpage_charge(pages[i], mm,
815 GFP_KERNEL))) {
816 if (pages[i])
817 put_page(pages[i]);
818 mem_cgroup_uncharge_start();
819 while (--i >= 0) {
820 mem_cgroup_uncharge_page(pages[i]);
821 put_page(pages[i]);
822 }
823 mem_cgroup_uncharge_end();
824 kfree(pages);
825 ret |= VM_FAULT_OOM;
826 goto out;
827 }
828 }
829
830 for (i = 0; i < HPAGE_PMD_NR; i++) {
831 copy_user_highpage(pages[i], page + i,
832 haddr + PAGE_SHIFT*i, vma);
833 __SetPageUptodate(pages[i]);
834 cond_resched();
835 }
836
837 spin_lock(&mm->page_table_lock);
838 if (unlikely(!pmd_same(*pmd, orig_pmd)))
839 goto out_free_pages;
840 VM_BUG_ON(!PageHead(page));
841
842 pmdp_clear_flush_notify(vma, haddr, pmd);
843 /* leave pmd empty until pte is filled */
844
845 pgtable = get_pmd_huge_pte(mm);
846 pmd_populate(mm, &_pmd, pgtable);
847
848 for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
849 pte_t *pte, entry;
850 entry = mk_pte(pages[i], vma->vm_page_prot);
851 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
852 page_add_new_anon_rmap(pages[i], vma, haddr);
853 pte = pte_offset_map(&_pmd, haddr);
854 VM_BUG_ON(!pte_none(*pte));
855 set_pte_at(mm, haddr, pte, entry);
856 pte_unmap(pte);
857 }
858 kfree(pages);
859
860 mm->nr_ptes++;
861 smp_wmb(); /* make pte visible before pmd */
862 pmd_populate(mm, pmd, pgtable);
863 page_remove_rmap(page);
864 spin_unlock(&mm->page_table_lock);
865
866 ret |= VM_FAULT_WRITE;
867 put_page(page);
868
869out:
870 return ret;
871
872out_free_pages:
873 spin_unlock(&mm->page_table_lock);
874 mem_cgroup_uncharge_start();
875 for (i = 0; i < HPAGE_PMD_NR; i++) {
876 mem_cgroup_uncharge_page(pages[i]);
877 put_page(pages[i]);
878 }
879 mem_cgroup_uncharge_end();
880 kfree(pages);
881 goto out;
882}
883
884int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
885 unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
886{
887 int ret = 0;
888 struct page *page, *new_page;
889 unsigned long haddr;
890
891 VM_BUG_ON(!vma->anon_vma);
892 spin_lock(&mm->page_table_lock);
893 if (unlikely(!pmd_same(*pmd, orig_pmd)))
894 goto out_unlock;
895
896 page = pmd_page(orig_pmd);
897 VM_BUG_ON(!PageCompound(page) || !PageHead(page));
898 haddr = address & HPAGE_PMD_MASK;
899 if (page_mapcount(page) == 1) {
900 pmd_t entry;
901 entry = pmd_mkyoung(orig_pmd);
902 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
903 if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1))
904 update_mmu_cache(vma, address, entry);
905 ret |= VM_FAULT_WRITE;
906 goto out_unlock;
907 }
908 get_page(page);
909 spin_unlock(&mm->page_table_lock);
910
911 if (transparent_hugepage_enabled(vma) &&
912 !transparent_hugepage_debug_cow())
913 new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
914 vma, haddr, numa_node_id(), 0);
915 else
916 new_page = NULL;
917
918 if (unlikely(!new_page)) {
919 count_vm_event(THP_FAULT_FALLBACK);
920 ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
921 pmd, orig_pmd, page, haddr);
922 put_page(page);
923 goto out;
924 }
925 count_vm_event(THP_FAULT_ALLOC);
926
927 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
928 put_page(new_page);
929 put_page(page);
930 ret |= VM_FAULT_OOM;
931 goto out;
932 }
933
934 copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR);
935 __SetPageUptodate(new_page);
936
937 spin_lock(&mm->page_table_lock);
938 put_page(page);
939 if (unlikely(!pmd_same(*pmd, orig_pmd))) {
940 mem_cgroup_uncharge_page(new_page);
941 put_page(new_page);
942 } else {
943 pmd_t entry;
944 VM_BUG_ON(!PageHead(page));
945 entry = mk_pmd(new_page, vma->vm_page_prot);
946 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
947 entry = pmd_mkhuge(entry);
948 pmdp_clear_flush_notify(vma, haddr, pmd);
949 page_add_new_anon_rmap(new_page, vma, haddr);
950 set_pmd_at(mm, haddr, pmd, entry);
951 update_mmu_cache(vma, address, entry);
952 page_remove_rmap(page);
953 put_page(page);
954 ret |= VM_FAULT_WRITE;
955 }
956out_unlock:
957 spin_unlock(&mm->page_table_lock);
958out:
959 return ret;
960}
961
962struct page *follow_trans_huge_pmd(struct mm_struct *mm,
963 unsigned long addr,
964 pmd_t *pmd,
965 unsigned int flags)
966{
967 struct page *page = NULL;
968
969 assert_spin_locked(&mm->page_table_lock);
970
971 if (flags & FOLL_WRITE && !pmd_write(*pmd))
972 goto out;
973
974 page = pmd_page(*pmd);
975 VM_BUG_ON(!PageHead(page));
976 if (flags & FOLL_TOUCH) {
977 pmd_t _pmd;
978 /*
979 * We should set the dirty bit only for FOLL_WRITE but
980 * for now the dirty bit in the pmd is meaningless.
981 * If the dirty bit ever becomes meaningful and we
982 * only set it with FOLL_WRITE, an atomic set_bit
983 * will be required on the pmd to set the young bit,
984 * instead of the current set_pmd_at.
985 */
986 _pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
987 set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
988 }
989 page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
990 VM_BUG_ON(!PageCompound(page));
991 if (flags & FOLL_GET)
992 get_page(page);
993
994out:
995 return page;
996}
997
998int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
999 pmd_t *pmd)
1000{
1001 int ret = 0;
1002
1003 spin_lock(&tlb->mm->page_table_lock);
1004 if (likely(pmd_trans_huge(*pmd))) {
1005 if (unlikely(pmd_trans_splitting(*pmd))) {
1006 spin_unlock(&tlb->mm->page_table_lock);
1007 wait_split_huge_page(vma->anon_vma,
1008 pmd);
1009 } else {
1010 struct page *page;
1011 pgtable_t pgtable;
1012 pgtable = get_pmd_huge_pte(tlb->mm);
1013 page = pmd_page(*pmd);
1014 pmd_clear(pmd);
1015 page_remove_rmap(page);
1016 VM_BUG_ON(page_mapcount(page) < 0);
1017 add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
1018 VM_BUG_ON(!PageHead(page));
1019 spin_unlock(&tlb->mm->page_table_lock);
1020 tlb_remove_page(tlb, page);
1021 pte_free(tlb->mm, pgtable);
1022 ret = 1;
1023 }
1024 } else
1025 spin_unlock(&tlb->mm->page_table_lock);
1026
1027 return ret;
1028}
1029
1030int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1031 unsigned long addr, unsigned long end,
1032 unsigned char *vec)
1033{
1034 int ret = 0;
1035
1036 spin_lock(&vma->vm_mm->page_table_lock);
1037 if (likely(pmd_trans_huge(*pmd))) {
1038 ret = !pmd_trans_splitting(*pmd);
1039 spin_unlock(&vma->vm_mm->page_table_lock);
1040 if (unlikely(!ret))
1041 wait_split_huge_page(vma->anon_vma, pmd);
1042 else {
1043 /*
1044 * All logical pages in the range are present
1045 * if backed by a huge page.
1046 */
1047 memset(vec, 1, (end - addr) >> PAGE_SHIFT);
1048 }
1049 } else
1050 spin_unlock(&vma->vm_mm->page_table_lock);
1051
1052 return ret;
1053}
1054
1055int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1056 unsigned long addr, pgprot_t newprot)
1057{
1058 struct mm_struct *mm = vma->vm_mm;
1059 int ret = 0;
1060
1061 spin_lock(&mm->page_table_lock);
1062 if (likely(pmd_trans_huge(*pmd))) {
1063 if (unlikely(pmd_trans_splitting(*pmd))) {
1064 spin_unlock(&mm->page_table_lock);
1065 wait_split_huge_page(vma->anon_vma, pmd);
1066 } else {
1067 pmd_t entry;
1068
1069 entry = pmdp_get_and_clear(mm, addr, pmd);
1070 entry = pmd_modify(entry, newprot);
1071 set_pmd_at(mm, addr, pmd, entry);
1072 spin_unlock(&vma->vm_mm->page_table_lock);
1073 flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE);
1074 ret = 1;
1075 }
1076 } else
1077 spin_unlock(&vma->vm_mm->page_table_lock);
1078
1079 return ret;
1080}
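/*
 * zap_huge_pmd(), mincore_huge_pmd() and change_huge_pmd() above all
 * follow the same pattern: take page_table_lock and, if the pmd is
 * still huge, either wait out a concurrent split (pmd_trans_splitting)
 * and return 0 so the caller falls back to the pte code, or handle the
 * whole HPAGE_PMD_SIZE worth of mapping in one shot and return 1.
 */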
1081
1082pmd_t *page_check_address_pmd(struct page *page,
1083 struct mm_struct *mm,
1084 unsigned long address,
1085 enum page_check_address_pmd_flag flag)
1086{
1087 pgd_t *pgd;
1088 pud_t *pud;
1089 pmd_t *pmd, *ret = NULL;
1090
1091 if (address & ~HPAGE_PMD_MASK)
1092 goto out;
1093
1094 pgd = pgd_offset(mm, address);
1095 if (!pgd_present(*pgd))
1096 goto out;
1097
1098 pud = pud_offset(pgd, address);
1099 if (!pud_present(*pud))
1100 goto out;
1101
1102 pmd = pmd_offset(pud, address);
1103 if (pmd_none(*pmd))
1104 goto out;
1105 if (pmd_page(*pmd) != page)
1106 goto out;
1107 /*
1108 * split_vma() may create temporary aliased mappings. There is
1109 * no risk as long as all huge pmds are found and have their
1110 * splitting bit set before __split_huge_page_refcount
1111 * runs. Finding the same huge pmd more than once during the
1112 * same rmap walk is not a problem.
1113 */
1114 if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
1115 pmd_trans_splitting(*pmd))
1116 goto out;
1117 if (pmd_trans_huge(*pmd)) {
1118 VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
1119 !pmd_trans_splitting(*pmd));
1120 ret = pmd;
1121 }
1122out:
1123 return ret;
1124}
1125
1126static int __split_huge_page_splitting(struct page *page,
1127 struct vm_area_struct *vma,
1128 unsigned long address)
1129{
1130 struct mm_struct *mm = vma->vm_mm;
1131 pmd_t *pmd;
1132 int ret = 0;
1133
1134 spin_lock(&mm->page_table_lock);
1135 pmd = page_check_address_pmd(page, mm, address,
1136 PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
1137 if (pmd) {
1138 /*
1139 * We can't temporarily set the pmd to null in order
1140 * to split it; the pmd must remain marked huge at all
1141 * times or the VM won't take the pmd_trans_huge paths
1142 * and it won't wait on the anon_vma->root->mutex to
1143 * serialize against split_huge_page*.
1144 */
1145 pmdp_splitting_flush_notify(vma, address, pmd);
1146 ret = 1;
1147 }
1148 spin_unlock(&mm->page_table_lock);
1149
1150 return ret;
1151}
1152
1153static void __split_huge_page_refcount(struct page *page)
1154{
1155 int i;
1156 unsigned long head_index = page->index;
1157 struct zone *zone = page_zone(page);
1158 int zonestat;
1159
1160 /* prevent PageLRU to go away from under us, and freeze lru stats */
1161 spin_lock_irq(&zone->lru_lock);
1162 compound_lock(page);
1163
1164 for (i = 1; i < HPAGE_PMD_NR; i++) {
1165 struct page *page_tail = page + i;
1166
1167 /* tail_page->_count cannot change */
1168 atomic_sub(atomic_read(&page_tail->_count), &page->_count);
1169 BUG_ON(page_count(page) <= 0);
1170 atomic_add(page_mapcount(page) + 1, &page_tail->_count);
1171 BUG_ON(atomic_read(&page_tail->_count) <= 0);
1172
1173 /* after clearing PageTail the gup refcount can be released */
1174 smp_mb();
1175
1176 /*
1177 * retain the hwpoison flag of the poisoned tail page:
1178 * this prevents memory-failure from killing the wrong
1179 * process on a guest machine (KVM).
1180 */
1181 page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
1182 page_tail->flags |= (page->flags &
1183 ((1L << PG_referenced) |
1184 (1L << PG_swapbacked) |
1185 (1L << PG_mlocked) |
1186 (1L << PG_uptodate)));
1187 page_tail->flags |= (1L << PG_dirty);
1188
1189 /*
1190 * 1) clear PageTail before overwriting first_page
1191 * 2) clear PageTail before clearing PageHead for VM_BUG_ON
1192 */
1193 smp_wmb();
1194
1195 /*
1196 * __split_huge_page_splitting() already set the
1197 * splitting bit in all pmd that could map this
1198 * hugepage, that will ensure no CPU can alter the
1199 * mapcount on the head page. The mapcount is only
1200 * accounted in the head page and it has to be
1201 * transferred to all tail pages in the code below. So
1202 * for this code to be safe, the mapcount can't change
1203 * during the split. But that doesn't mean userland can't
1204 * keep changing and reading the page contents while
1205 * we transfer the mapcount, so the pmd splitting
1206 * status is achieved setting a reserved bit in the
1207 * pmd, not by clearing the present bit.
1208 */
1209 BUG_ON(page_mapcount(page_tail));
1210 page_tail->_mapcount = page->_mapcount;
1211
1212 BUG_ON(page_tail->mapping);
1213 page_tail->mapping = page->mapping;
1214
1215 page_tail->index = ++head_index;
1216
1217 BUG_ON(!PageAnon(page_tail));
1218 BUG_ON(!PageUptodate(page_tail));
1219 BUG_ON(!PageDirty(page_tail));
1220 BUG_ON(!PageSwapBacked(page_tail));
1221
1222 mem_cgroup_split_huge_fixup(page, page_tail);
1223
1224 lru_add_page_tail(zone, page, page_tail);
1225 }
1226
1227 __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
1228 __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR);
1229
1230 /*
1231 * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics,
1232 * so adjust those appropriately if this page is on the LRU.
1233 */
1234 if (PageLRU(page)) {
1235 zonestat = NR_LRU_BASE + page_lru(page);
1236 __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1));
1237 }
1238
1239 ClearPageCompound(page);
1240 compound_unlock(page);
1241 spin_unlock_irq(&zone->lru_lock);
1242
1243 for (i = 1; i < HPAGE_PMD_NR; i++) {
1244 struct page *page_tail = page + i;
1245 BUG_ON(page_count(page_tail) <= 0);
1246 /*
1247 * Tail pages may be freed if there wasn't any mapping,
1248 * for example if add_to_swap() is running on an lru page
1249 * that had its mapping zapped. And freeing these pages
1250 * requires taking the lru_lock, so we do the put_page
1251 * of the tail pages after the split is complete.
1252 */
1253 put_page(page_tail);
1254 }
1255
1256 /*
1257 * Only the head page (now become a regular page) is required
1258 * to be pinned by the caller.
1259 */
1260 BUG_ON(page_count(page) <= 0);
1261}
1262
1263static int __split_huge_page_map(struct page *page,
1264 struct vm_area_struct *vma,
1265 unsigned long address)
1266{
1267 struct mm_struct *mm = vma->vm_mm;
1268 pmd_t *pmd, _pmd;
1269 int ret = 0, i;
1270 pgtable_t pgtable;
1271 unsigned long haddr;
1272
1273 spin_lock(&mm->page_table_lock);
1274 pmd = page_check_address_pmd(page, mm, address,
1275 PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
1276 if (pmd) {
1277 pgtable = get_pmd_huge_pte(mm);
1278 pmd_populate(mm, &_pmd, pgtable);
1279
1280 for (i = 0, haddr = address; i < HPAGE_PMD_NR;
1281 i++, haddr += PAGE_SIZE) {
1282 pte_t *pte, entry;
1283 BUG_ON(PageCompound(page+i));
1284 entry = mk_pte(page + i, vma->vm_page_prot);
1285 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1286 if (!pmd_write(*pmd))
1287 entry = pte_wrprotect(entry);
1288 else
1289 BUG_ON(page_mapcount(page) != 1);
1290 if (!pmd_young(*pmd))
1291 entry = pte_mkold(entry);
1292 pte = pte_offset_map(&_pmd, haddr);
1293 BUG_ON(!pte_none(*pte));
1294 set_pte_at(mm, haddr, pte, entry);
1295 pte_unmap(pte);
1296 }
1297
1298 mm->nr_ptes++;
1299 smp_wmb(); /* make pte visible before pmd */
1300 /*
1301 * Up to this point the pmd is present and huge and
1302 * userland has the whole access to the hugepage
1303 * during the split (which happens in place). If we
1304 * overwrite the pmd with the not-huge version
1305 * pointing to the pte here (which of course we could
1306 * if all CPUs were bug free), userland could trigger
1307 * a small page size TLB miss on the small sized TLB
1308 * while the hugepage TLB entry is still established
1309 * in the huge TLB. Some CPUs don't like that. See
1310 * http://support.amd.com/us/Processor_TechDocs/41322.pdf,
1311 * Erratum 383 on page 93. Intel should be safe but it
1312 * also warns that it's only safe if the permission
1313 * and cache attributes of the two entries loaded in
1314 * the two TLBs are identical (which should be the case
1315 * here). But it is generally safer to never allow
1316 * small and huge TLB entries for the same virtual
1317 * address to be loaded simultaneously. So instead of
1318 * doing "pmd_populate(); flush_tlb_range();" we first
1319 * mark the current pmd notpresent (atomically because
1320 * here the pmd_trans_huge and pmd_trans_splitting
1321 * must remain set at all times on the pmd until the
1322 * split is complete for this pmd), then we flush the
1323 * SMP TLB and finally we write the non-huge version
1324 * of the pmd entry with pmd_populate.
1325 */
1326 set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd));
1327 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
1328 pmd_populate(mm, pmd, pgtable);
1329 ret = 1;
1330 }
1331 spin_unlock(&mm->page_table_lock);
1332
1333 return ret;
1334}
1335
1336/* must be called with anon_vma->root->mutex held */
1337static void __split_huge_page(struct page *page,
1338 struct anon_vma *anon_vma)
1339{
1340 int mapcount, mapcount2;
1341 struct anon_vma_chain *avc;
1342
1343 BUG_ON(!PageHead(page));
1344 BUG_ON(PageTail(page));
1345
1346 mapcount = 0;
1347 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1348 struct vm_area_struct *vma = avc->vma;
1349 unsigned long addr = vma_address(page, vma);
1350 BUG_ON(is_vma_temporary_stack(vma));
1351 if (addr == -EFAULT)
1352 continue;
1353 mapcount += __split_huge_page_splitting(page, vma, addr);
1354 }
1355 /*
1356 * It is critical that new vmas are added to the tail of the
1357 * anon_vma list. This guarantees that if copy_huge_pmd() runs
1358 * and establishes a child pmd before
1359 * __split_huge_page_splitting() freezes the parent pmd (so if
1360 * we fail to prevent copy_huge_pmd() from running until the
1361 * whole __split_huge_page() is complete), we will still see
1362 * the newly established pmd of the child later during the
1363 * walk, to be able to set it as pmd_trans_splitting too.
1364 */
1365 if (mapcount != page_mapcount(page))
1366 printk(KERN_ERR "mapcount %d page_mapcount %d\n",
1367 mapcount, page_mapcount(page));
1368 BUG_ON(mapcount != page_mapcount(page));
1369
1370 __split_huge_page_refcount(page);
1371
1372 mapcount2 = 0;
1373 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1374 struct vm_area_struct *vma = avc->vma;
1375 unsigned long addr = vma_address(page, vma);
1376 BUG_ON(is_vma_temporary_stack(vma));
1377 if (addr == -EFAULT)
1378 continue;
1379 mapcount2 += __split_huge_page_map(page, vma, addr);
1380 }
1381 if (mapcount != mapcount2)
1382 printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n",
1383 mapcount, mapcount2, page_mapcount(page));
1384 BUG_ON(mapcount != mapcount2);
1385}
1386
1387int split_huge_page(struct page *page)
1388{
1389 struct anon_vma *anon_vma;
1390 int ret = 1;
1391
1392 BUG_ON(!PageAnon(page));
1393 anon_vma = page_lock_anon_vma(page);
1394 if (!anon_vma)
1395 goto out;
1396 ret = 0;
1397 if (!PageCompound(page))
1398 goto out_unlock;
1399
1400 BUG_ON(!PageSwapBacked(page));
1401 __split_huge_page(page, anon_vma);
1402 count_vm_event(THP_SPLIT);
1403
1404 BUG_ON(PageCompound(page));
1405out_unlock:
1406 page_unlock_anon_vma(anon_vma);
1407out:
1408 return ret;
1409}
1410
1411#define VM_NO_THP (VM_SPECIAL|VM_INSERTPAGE|VM_MIXEDMAP|VM_SAO| \
1412 VM_HUGETLB|VM_SHARED|VM_MAYSHARE)
1413
1414int hugepage_madvise(struct vm_area_struct *vma,
1415 unsigned long *vm_flags, int advice)
1416{
1417 switch (advice) {
1418 case MADV_HUGEPAGE:
1419 /*
1420 * Be somewhat over-protective like KSM for now!
1421 */
1422 if (*vm_flags & (VM_HUGEPAGE | VM_NO_THP))
1423 return -EINVAL;
1424 *vm_flags &= ~VM_NOHUGEPAGE;
1425 *vm_flags |= VM_HUGEPAGE;
1426 /*
1427 * If the vma become good for khugepaged to scan,
1428 * register it here without waiting a page fault that
1429 * may not happen any time soon.
1430 */
1431 if (unlikely(khugepaged_enter_vma_merge(vma)))
1432 return -ENOMEM;
1433 break;
1434 case MADV_NOHUGEPAGE:
1435 /*
1436 * Be somewhat over-protective like KSM for now!
1437 */
1438 if (*vm_flags & (VM_NOHUGEPAGE | VM_NO_THP))
1439 return -EINVAL;
1440 *vm_flags &= ~VM_HUGEPAGE;
1441 *vm_flags |= VM_NOHUGEPAGE;
1442 /*
1443 * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
1444 * this vma even if we leave the mm registered in khugepaged if
1445 * it got registered before VM_NOHUGEPAGE was set.
1446 */
1447 break;
1448 }
1449
1450 return 0;
1451}
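/*
 * Userspace view, as an illustrative sketch: an application opts a
 * mapping in or out of THP with madvise(2), e.g.
 *
 *   madvise(addr, length, MADV_HUGEPAGE);    allow/encourage THP here
 *   madvise(addr, length, MADV_NOHUGEPAGE);  forbid THP for this range
 *
 * Both advices end up in hugepage_madvise() above and fail with
 * -EINVAL on the special mappings excluded by VM_NO_THP.
 */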
1452
1453static int __init khugepaged_slab_init(void)
1454{
1455 mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
1456 sizeof(struct mm_slot),
1457 __alignof__(struct mm_slot), 0, NULL);
1458 if (!mm_slot_cache)
1459 return -ENOMEM;
1460
1461 return 0;
1462}
1463
1464static void __init khugepaged_slab_free(void)
1465{
1466 kmem_cache_destroy(mm_slot_cache);
1467 mm_slot_cache = NULL;
1468}
1469
1470static inline struct mm_slot *alloc_mm_slot(void)
1471{
1472 if (!mm_slot_cache) /* initialization failed */
1473 return NULL;
1474 return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL);
1475}
1476
1477static inline void free_mm_slot(struct mm_slot *mm_slot)
1478{
1479 kmem_cache_free(mm_slot_cache, mm_slot);
1480}
1481
1482static int __init mm_slots_hash_init(void)
1483{
1484 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
1485 GFP_KERNEL);
1486 if (!mm_slots_hash)
1487 return -ENOMEM;
1488 return 0;
1489}
1490
1491#if 0
1492static void __init mm_slots_hash_free(void)
1493{
1494 kfree(mm_slots_hash);
1495 mm_slots_hash = NULL;
1496}
1497#endif
1498
1499static struct mm_slot *get_mm_slot(struct mm_struct *mm)
1500{
1501 struct mm_slot *mm_slot;
1502 struct hlist_head *bucket;
1503 struct hlist_node *node;
1504
1505 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1506 % MM_SLOTS_HASH_HEADS];
1507 hlist_for_each_entry(mm_slot, node, bucket, hash) {
1508 if (mm == mm_slot->mm)
1509 return mm_slot;
1510 }
1511 return NULL;
1512}
1513
1514static void insert_to_mm_slots_hash(struct mm_struct *mm,
1515 struct mm_slot *mm_slot)
1516{
1517 struct hlist_head *bucket;
1518
1519 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
1520 % MM_SLOTS_HASH_HEADS];
1521 mm_slot->mm = mm;
1522 hlist_add_head(&mm_slot->hash, bucket);
1523}
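/*
 * A note on the hashing used by get_mm_slot()/insert_to_mm_slots_hash():
 * mm_structs come from the slab, so distinct mms are at least
 * sizeof(struct mm_struct) apart. Dividing the pointer by that size
 * before the modulo therefore spreads them roughly evenly across the
 * MM_SLOTS_HASH_HEADS buckets instead of clustering on alignment.
 */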
1524
1525static inline int khugepaged_test_exit(struct mm_struct *mm)
1526{
1527 return atomic_read(&mm->mm_users) == 0;
1528}
1529
1530int __khugepaged_enter(struct mm_struct *mm)
1531{
1532 struct mm_slot *mm_slot;
1533 int wakeup;
1534
1535 mm_slot = alloc_mm_slot();
1536 if (!mm_slot)
1537 return -ENOMEM;
1538
1539 /* __khugepaged_exit() must not run from under us */
1540 VM_BUG_ON(khugepaged_test_exit(mm));
1541 if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
1542 free_mm_slot(mm_slot);
1543 return 0;
1544 }
1545
1546 spin_lock(&khugepaged_mm_lock);
1547 insert_to_mm_slots_hash(mm, mm_slot);
1548 /*
1549 * Insert just behind the scanning cursor, to let the area settle
1550 * down a little.
1551 */
1552 wakeup = list_empty(&khugepaged_scan.mm_head);
1553 list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head);
1554 spin_unlock(&khugepaged_mm_lock);
1555
1556 atomic_inc(&mm->mm_count);
1557 if (wakeup)
1558 wake_up_interruptible(&khugepaged_wait);
1559
1560 return 0;
1561}
1562
1563int khugepaged_enter_vma_merge(struct vm_area_struct *vma)
1564{
1565 unsigned long hstart, hend;
1566 if (!vma->anon_vma)
1567 /*
1568 * Not yet faulted in so we will register later in the
1569 * page fault if needed.
1570 */
1571 return 0;
1572 if (vma->vm_ops)
1573 /* khugepaged not yet working on file or special mappings */
1574 return 0;
1575 /*
1576 * If is_pfn_mapping() is true, is_linear_pfn_mapping() must be
1577 * true too; verify it here.
1578 */
1579 VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1580 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1581 hend = vma->vm_end & HPAGE_PMD_MASK;
1582 if (hstart < hend)
1583 return khugepaged_enter(vma);
1584 return 0;
1585}
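/*
 * Worked example for the rounding above, assuming HPAGE_PMD_SIZE is
 * 2MB (x86 with 4KB pages): a vma spanning 0x00201000 - 0x00a00000
 * gives hstart = 0x00400000 (vm_start rounded up) and hend =
 * 0x00a00000 (vm_end rounded down), i.e. three aligned 2MB ranges
 * khugepaged could collapse. If the rounding leaves hstart >= hend
 * the vma simply cannot hold a hugepage and is not registered here.
 */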
1586
1587void __khugepaged_exit(struct mm_struct *mm)
1588{
1589 struct mm_slot *mm_slot;
1590 int free = 0;
1591
1592 spin_lock(&khugepaged_mm_lock);
1593 mm_slot = get_mm_slot(mm);
1594 if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
1595 hlist_del(&mm_slot->hash);
1596 list_del(&mm_slot->mm_node);
1597 free = 1;
1598 }
1599
1600 if (free) {
1601 spin_unlock(&khugepaged_mm_lock);
1602 clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
1603 free_mm_slot(mm_slot);
1604 mmdrop(mm);
1605 } else if (mm_slot) {
1606 spin_unlock(&khugepaged_mm_lock);
1607 /*
1608 * This is required to serialize against
1609 * khugepaged_test_exit() (which is guaranteed to run
1610 * under mmap_sem read mode). Stop here (after we
1611 * return, all pagetables will be destroyed) until
1612 * khugepaged has finished working on the pagetables
1613 * under the mmap_sem.
1614 */
1615 down_write(&mm->mmap_sem);
1616 up_write(&mm->mmap_sem);
1617 } else
1618 spin_unlock(&khugepaged_mm_lock);
1619}
1620
1621static void release_pte_page(struct page *page)
1622{
1623 /* 0 stands for page_is_file_cache(page) == false */
1624 dec_zone_page_state(page, NR_ISOLATED_ANON + 0);
1625 unlock_page(page);
1626 putback_lru_page(page);
1627}
1628
1629static void release_pte_pages(pte_t *pte, pte_t *_pte)
1630{
1631 while (--_pte >= pte) {
1632 pte_t pteval = *_pte;
1633 if (!pte_none(pteval))
1634 release_pte_page(pte_page(pteval));
1635 }
1636}
1637
1638static void release_all_pte_pages(pte_t *pte)
1639{
1640 release_pte_pages(pte, pte + HPAGE_PMD_NR);
1641}
1642
1643static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
1644 unsigned long address,
1645 pte_t *pte)
1646{
1647 struct page *page;
1648 pte_t *_pte;
1649 int referenced = 0, isolated = 0, none = 0;
1650 for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
1651 _pte++, address += PAGE_SIZE) {
1652 pte_t pteval = *_pte;
1653 if (pte_none(pteval)) {
1654 if (++none <= khugepaged_max_ptes_none)
1655 continue;
1656 else {
1657 release_pte_pages(pte, _pte);
1658 goto out;
1659 }
1660 }
1661 if (!pte_present(pteval) || !pte_write(pteval)) {
1662 release_pte_pages(pte, _pte);
1663 goto out;
1664 }
1665 page = vm_normal_page(vma, address, pteval);
1666 if (unlikely(!page)) {
1667 release_pte_pages(pte, _pte);
1668 goto out;
1669 }
1670 VM_BUG_ON(PageCompound(page));
1671 BUG_ON(!PageAnon(page));
1672 VM_BUG_ON(!PageSwapBacked(page));
1673
1674 /* cannot use mapcount: can't collapse if there's a gup pin */
1675 if (page_count(page) != 1) {
1676 release_pte_pages(pte, _pte);
1677 goto out;
1678 }
1679 /*
1680 * We can do it before isolate_lru_page because the
1681 * page can't be freed from under us. NOTE: PG_lock
1682 * is needed to serialize against split_huge_page
1683 * when invoked from the VM.
1684 */
1685 if (!trylock_page(page)) {
1686 release_pte_pages(pte, _pte);
1687 goto out;
1688 }
1689 /*
1690 * Isolate the page to avoid collapsing a hugepage
1691 * currently in use by the VM.
1692 */
1693 if (isolate_lru_page(page)) {
1694 unlock_page(page);
1695 release_pte_pages(pte, _pte);
1696 goto out;
1697 }
1698 /* 0 stands for page_is_file_cache(page) == false */
1699 inc_zone_page_state(page, NR_ISOLATED_ANON + 0);
1700 VM_BUG_ON(!PageLocked(page));
1701 VM_BUG_ON(PageLRU(page));
1702
1703 /* If there is no mapped pte young don't collapse the page */
1704 if (pte_young(pteval) || PageReferenced(page) ||
1705 mmu_notifier_test_young(vma->vm_mm, address))
1706 referenced = 1;
1707 }
1708 if (unlikely(!referenced))
1709 release_all_pte_pages(pte);
1710 else
1711 isolated = 1;
1712out:
1713 return isolated;
1714}
1715
1716static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
1717 struct vm_area_struct *vma,
1718 unsigned long address,
1719 spinlock_t *ptl)
1720{
1721 pte_t *_pte;
1722 for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
1723 pte_t pteval = *_pte;
1724 struct page *src_page;
1725
1726 if (pte_none(pteval)) {
1727 clear_user_highpage(page, address);
1728 add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
1729 } else {
1730 src_page = pte_page(pteval);
1731 copy_user_highpage(page, src_page, address, vma);
1732 VM_BUG_ON(page_mapcount(src_page) != 1);
1733 VM_BUG_ON(page_count(src_page) != 2);
1734 release_pte_page(src_page);
1735 /*
1736 * ptl mostly unnecessary, but preempt has to
1737 * be disabled to update the per-cpu stats
1738 * inside page_remove_rmap().
1739 */
1740 spin_lock(ptl);
1741 /*
1742 * paravirt calls inside pte_clear here are
1743 * superfluous.
1744 */
1745 pte_clear(vma->vm_mm, address, _pte);
1746 page_remove_rmap(src_page);
1747 spin_unlock(ptl);
1748 free_page_and_swap_cache(src_page);
1749 }
1750
1751 address += PAGE_SIZE;
1752 page++;
1753 }
1754}
1755
1756static void collapse_huge_page(struct mm_struct *mm,
1757 unsigned long address,
1758 struct page **hpage,
1759 struct vm_area_struct *vma,
1760 int node)
1761{
1762 pgd_t *pgd;
1763 pud_t *pud;
1764 pmd_t *pmd, _pmd;
1765 pte_t *pte;
1766 pgtable_t pgtable;
1767 struct page *new_page;
1768 spinlock_t *ptl;
1769 int isolated;
1770 unsigned long hstart, hend;
1771
1772 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1773#ifndef CONFIG_NUMA
1774 up_read(&mm->mmap_sem);
1775 VM_BUG_ON(!*hpage);
1776 new_page = *hpage;
1777#else
1778 VM_BUG_ON(*hpage);
1779 /*
1780 * Allocate the page while the vma is still valid and under
1781 * the mmap_sem read mode so there is no memory allocation
1782 * later when we take the mmap_sem in write mode. This is more
1783 * friendly behavior (OTOH it may actually hide bugs) towards
1784 * userland filesystems whose daemons allocate memory in
1785 * the userland I/O paths. Allocating memory with the
1786 * mmap_sem held only in read mode is also a good idea to allow greater
1787 * scalability.
1788 */
1789 new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
1790 node, __GFP_OTHER_NODE);
1791
1792 /*
1793 * After allocating the hugepage, release the mmap_sem read lock in
1794 * preparation for taking it in write mode.
1795 */
1796 up_read(&mm->mmap_sem);
1797 if (unlikely(!new_page)) {
1798 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
1799 *hpage = ERR_PTR(-ENOMEM);
1800 return;
1801 }
1802#endif
1803
1804 count_vm_event(THP_COLLAPSE_ALLOC);
1805 if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
1806#ifdef CONFIG_NUMA
1807 put_page(new_page);
1808#endif
1809 return;
1810 }
1811
1812 /*
1813 * Prevent all access to pagetables with the exception of
1814 * gup_fast, later handled by the ptep_clear_flush, and the VM,
1815 * handled by the anon_vma lock + PG_lock.
1816 */
1817 down_write(&mm->mmap_sem);
1818 if (unlikely(khugepaged_test_exit(mm)))
1819 goto out;
1820
1821 vma = find_vma(mm, address);
1822 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
1823 hend = vma->vm_end & HPAGE_PMD_MASK;
1824 if (address < hstart || address + HPAGE_PMD_SIZE > hend)
1825 goto out;
1826
1827 if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
1828 (vma->vm_flags & VM_NOHUGEPAGE))
1829 goto out;
1830
1831 if (!vma->anon_vma || vma->vm_ops)
1832 goto out;
1833 if (is_vma_temporary_stack(vma))
1834 goto out;
1835 /*
1836 * If is_pfn_mapping() is true, is_linear_pfn_mapping() must be
1837 * true too; verify it here.
1838 */
1839 VM_BUG_ON(is_linear_pfn_mapping(vma) || vma->vm_flags & VM_NO_THP);
1840
1841 pgd = pgd_offset(mm, address);
1842 if (!pgd_present(*pgd))
1843 goto out;
1844
1845 pud = pud_offset(pgd, address);
1846 if (!pud_present(*pud))
1847 goto out;
1848
1849 pmd = pmd_offset(pud, address);
1850 /* pmd can't go away or become huge under us */
1851 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1852 goto out;
1853
1854 anon_vma_lock(vma->anon_vma);
1855
1856 pte = pte_offset_map(pmd, address);
1857 ptl = pte_lockptr(mm, pmd);
1858
1859 spin_lock(&mm->page_table_lock); /* probably unnecessary */
1860 /*
1861 * After this gup_fast can't run anymore. This also removes
1862 * any huge TLB entry from the CPU so we won't allow
1863 * huge and small TLB entries for the same virtual address
1864 * to avoid the risk of CPU bugs in that area.
1865 */
1866 _pmd = pmdp_clear_flush_notify(vma, address, pmd);
1867 spin_unlock(&mm->page_table_lock);
1868
1869 spin_lock(ptl);
1870 isolated = __collapse_huge_page_isolate(vma, address, pte);
1871 spin_unlock(ptl);
1872
1873 if (unlikely(!isolated)) {
1874 pte_unmap(pte);
1875 spin_lock(&mm->page_table_lock);
1876 BUG_ON(!pmd_none(*pmd));
1877 set_pmd_at(mm, address, pmd, _pmd);
1878 spin_unlock(&mm->page_table_lock);
1879 anon_vma_unlock(vma->anon_vma);
1880 goto out;
1881 }
1882
1883 /*
1884 * All pages are isolated and locked so anon_vma rmap
1885 * can't run anymore.
1886 */
1887 anon_vma_unlock(vma->anon_vma);
1888
1889 __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
1890 pte_unmap(pte);
1891 __SetPageUptodate(new_page);
1892 pgtable = pmd_pgtable(_pmd);
1893 VM_BUG_ON(page_count(pgtable) != 1);
1894 VM_BUG_ON(page_mapcount(pgtable) != 0);
1895
1896 _pmd = mk_pmd(new_page, vma->vm_page_prot);
1897 _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
1898 _pmd = pmd_mkhuge(_pmd);
1899
1900 /*
1901 * spin_lock() below is not the equivalent of smp_wmb(), so
1902 * this is needed to avoid the copy_huge_page writes becoming
1903 * visible after the set_pmd_at() write.
1904 */
1905 smp_wmb();
1906
1907 spin_lock(&mm->page_table_lock);
1908 BUG_ON(!pmd_none(*pmd));
1909 page_add_new_anon_rmap(new_page, vma, address);
1910 set_pmd_at(mm, address, pmd, _pmd);
1911 update_mmu_cache(vma, address, entry);
1912 prepare_pmd_huge_pte(pgtable, mm);
1913 mm->nr_ptes--;
1914 spin_unlock(&mm->page_table_lock);
1915
1916#ifndef CONFIG_NUMA
1917 *hpage = NULL;
1918#endif
1919 khugepaged_pages_collapsed++;
1920out_up_write:
1921 up_write(&mm->mmap_sem);
1922 return;
1923
1924out:
1925 mem_cgroup_uncharge_page(new_page);
1926#ifdef CONFIG_NUMA
1927 put_page(new_page);
1928#endif
1929 goto out_up_write;
1930}
1931
1932static int khugepaged_scan_pmd(struct mm_struct *mm,
1933 struct vm_area_struct *vma,
1934 unsigned long address,
1935 struct page **hpage)
1936{
1937 pgd_t *pgd;
1938 pud_t *pud;
1939 pmd_t *pmd;
1940 pte_t *pte, *_pte;
1941 int ret = 0, referenced = 0, none = 0;
1942 struct page *page;
1943 unsigned long _address;
1944 spinlock_t *ptl;
1945 int node = -1;
1946
1947 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1948
1949 pgd = pgd_offset(mm, address);
1950 if (!pgd_present(*pgd))
1951 goto out;
1952
1953 pud = pud_offset(pgd, address);
1954 if (!pud_present(*pud))
1955 goto out;
1956
1957 pmd = pmd_offset(pud, address);
1958 if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
1959 goto out;
1960
1961 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
1962 for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
1963 _pte++, _address += PAGE_SIZE) {
1964 pte_t pteval = *_pte;
1965 if (pte_none(pteval)) {
1966 if (++none <= khugepaged_max_ptes_none)
1967 continue;
1968 else
1969 goto out_unmap;
1970 }
1971 if (!pte_present(pteval) || !pte_write(pteval))
1972 goto out_unmap;
1973 page = vm_normal_page(vma, _address, pteval);
1974 if (unlikely(!page))
1975 goto out_unmap;
1976 /*
1977 * Choose the node of the first page. This could
1978 * be more sophisticated and look at more pages,
1979 * but isn't for now.
1980 */
1981 if (node == -1)
1982 node = page_to_nid(page);
1983 VM_BUG_ON(PageCompound(page));
1984 if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
1985 goto out_unmap;
1986 /* cannot use mapcount: can't collapse if there's a gup pin */
1987 if (page_count(page) != 1)
1988 goto out_unmap;
1989 if (pte_young(pteval) || PageReferenced(page) ||
1990 mmu_notifier_test_young(vma->vm_mm, address))
1991 referenced = 1;
1992 }
1993 if (referenced)
1994 ret = 1;
1995out_unmap:
1996 pte_unmap_unlock(pte, ptl);
1997 if (ret)
1998 /* collapse_huge_page will return with the mmap_sem released */
1999 collapse_huge_page(mm, address, hpage, vma, node);
2000out:
2001 return ret;
2002}
2003
2004static void collect_mm_slot(struct mm_slot *mm_slot)
2005{
2006 struct mm_struct *mm = mm_slot->mm;
2007
2008 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
2009
2010 if (khugepaged_test_exit(mm)) {
2011 /* free mm_slot */
2012 hlist_del(&mm_slot->hash);
2013 list_del(&mm_slot->mm_node);
2014
2015 /*
2016 * Not strictly needed because the mm exited already.
2017 *
2018 * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
2019 */
2020
2021 /* khugepaged_mm_lock actually not necessary for the below */
2022 free_mm_slot(mm_slot);
2023 mmdrop(mm);
2024 }
2025}
2026
2027static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
2028 struct page **hpage)
2029{
2030 struct mm_slot *mm_slot;
2031 struct mm_struct *mm;
2032 struct vm_area_struct *vma;
2033 int progress = 0;
2034
2035 VM_BUG_ON(!pages);
2036 VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock));
2037
2038 if (khugepaged_scan.mm_slot)
2039 mm_slot = khugepaged_scan.mm_slot;
2040 else {
2041 mm_slot = list_entry(khugepaged_scan.mm_head.next,
2042 struct mm_slot, mm_node);
2043 khugepaged_scan.address = 0;
2044 khugepaged_scan.mm_slot = mm_slot;
2045 }
2046 spin_unlock(&khugepaged_mm_lock);
2047
2048 mm = mm_slot->mm;
2049 down_read(&mm->mmap_sem);
2050 if (unlikely(khugepaged_test_exit(mm)))
2051 vma = NULL;
2052 else
2053 vma = find_vma(mm, khugepaged_scan.address);
2054
2055 progress++;
2056 for (; vma; vma = vma->vm_next) {
2057 unsigned long hstart, hend;
2058
2059 cond_resched();
2060 if (unlikely(khugepaged_test_exit(mm))) {
2061 progress++;
2062 break;
2063 }
2064
2065 if ((!(vma->vm_flags & VM_HUGEPAGE) &&
2066 !khugepaged_always()) ||
2067 (vma->vm_flags & VM_NOHUGEPAGE)) {
2068 skip:
2069 progress++;
2070 continue;
2071 }
2072 if (!vma->anon_vma || vma->vm_ops)
2073 goto skip;
2074 if (is_vma_temporary_stack(vma))
2075 goto skip;
2076 /*
2077 * If is_pfn_mapping() is true, is_linear_pfn_mapping()
2078 * must be true too; verify it here.
2079 */
2080 VM_BUG_ON(is_linear_pfn_mapping(vma) ||
2081 vma->vm_flags & VM_NO_THP);
2082
2083 hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
2084 hend = vma->vm_end & HPAGE_PMD_MASK;
2085 if (hstart >= hend)
2086 goto skip;
2087 if (khugepaged_scan.address > hend)
2088 goto skip;
2089 if (khugepaged_scan.address < hstart)
2090 khugepaged_scan.address = hstart;
2091 VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
2092
2093 while (khugepaged_scan.address < hend) {
2094 int ret;
2095 cond_resched();
2096 if (unlikely(khugepaged_test_exit(mm)))
2097 goto breakouterloop;
2098
2099 VM_BUG_ON(khugepaged_scan.address < hstart ||
2100 khugepaged_scan.address + HPAGE_PMD_SIZE >
2101 hend);
2102 ret = khugepaged_scan_pmd(mm, vma,
2103 khugepaged_scan.address,
2104 hpage);
2105 /* move to next address */
2106 khugepaged_scan.address += HPAGE_PMD_SIZE;
2107 progress += HPAGE_PMD_NR;
2108 if (ret)
2109 /* we released mmap_sem so break loop */
2110 goto breakouterloop_mmap_sem;
2111 if (progress >= pages)
2112 goto breakouterloop;
2113 }
2114 }
2115breakouterloop:
2116 up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
2117breakouterloop_mmap_sem:
2118
2119 spin_lock(&khugepaged_mm_lock);
2120 VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
2121 /*
2122 * Release the current mm_slot if this mm is about to die, or
2123 * if we scanned all vmas of this mm.
2124 */
2125 if (khugepaged_test_exit(mm) || !vma) {
2126 /*
2127 * Make sure that if mm_users is reaching zero while
2128 * khugepaged runs here, khugepaged_exit will find
2129 * mm_slot not pointing to the exiting mm.
2130 */
2131 if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) {
2132 khugepaged_scan.mm_slot = list_entry(
2133 mm_slot->mm_node.next,
2134 struct mm_slot, mm_node);
2135 khugepaged_scan.address = 0;
2136 } else {
2137 khugepaged_scan.mm_slot = NULL;
2138 khugepaged_full_scans++;
2139 }
2140
2141 collect_mm_slot(mm_slot);
2142 }
2143
2144 return progress;
2145}
2146
2147static int khugepaged_has_work(void)
2148{
2149 return !list_empty(&khugepaged_scan.mm_head) &&
2150 khugepaged_enabled();
2151}
2152
2153static int khugepaged_wait_event(void)
2154{
2155 return !list_empty(&khugepaged_scan.mm_head) ||
2156 !khugepaged_enabled();
2157}
2158
2159static void khugepaged_do_scan(struct page **hpage)
2160{
2161 unsigned int progress = 0, pass_through_head = 0;
2162 unsigned int pages = khugepaged_pages_to_scan;
2163
2164 barrier(); /* write khugepaged_pages_to_scan to local stack */
2165
2166 while (progress < pages) {
2167 cond_resched();
2168
2169#ifndef CONFIG_NUMA
2170 if (!*hpage) {
2171 *hpage = alloc_hugepage(khugepaged_defrag());
2172 if (unlikely(!*hpage)) {
2173 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2174 break;
2175 }
2176 count_vm_event(THP_COLLAPSE_ALLOC);
2177 }
2178#else
2179 if (IS_ERR(*hpage))
2180 break;
2181#endif
2182
2183 if (unlikely(kthread_should_stop() || freezing(current)))
2184 break;
2185
2186 spin_lock(&khugepaged_mm_lock);
2187 if (!khugepaged_scan.mm_slot)
2188 pass_through_head++;
2189 if (khugepaged_has_work() &&
2190 pass_through_head < 2)
2191 progress += khugepaged_scan_mm_slot(pages - progress,
2192 hpage);
2193 else
2194 progress = pages;
2195 spin_unlock(&khugepaged_mm_lock);
2196 }
2197}
2198
2199static void khugepaged_alloc_sleep(void)
2200{
2201 DEFINE_WAIT(wait);
2202 add_wait_queue(&khugepaged_wait, &wait);
2203 schedule_timeout_interruptible(
2204 msecs_to_jiffies(
2205 khugepaged_alloc_sleep_millisecs));
2206 remove_wait_queue(&khugepaged_wait, &wait);
2207}
2208
2209#ifndef CONFIG_NUMA
2210static struct page *khugepaged_alloc_hugepage(void)
2211{
2212 struct page *hpage;
2213
2214 do {
2215 hpage = alloc_hugepage(khugepaged_defrag());
2216 if (!hpage) {
2217 count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
2218 khugepaged_alloc_sleep();
2219 } else
2220 count_vm_event(THP_COLLAPSE_ALLOC);
2221 } while (unlikely(!hpage) &&
2222 likely(khugepaged_enabled()));
2223 return hpage;
2224}
2225#endif
2226
2227static void khugepaged_loop(void)
2228{
2229 struct page *hpage;
2230
2231#ifdef CONFIG_NUMA
2232 hpage = NULL;
2233#endif
2234 while (likely(khugepaged_enabled())) {
2235#ifndef CONFIG_NUMA
2236 hpage = khugepaged_alloc_hugepage();
2237 if (unlikely(!hpage))
2238 break;
2239#else
2240 if (IS_ERR(hpage)) {
2241 khugepaged_alloc_sleep();
2242 hpage = NULL;
2243 }
2244#endif
2245
2246 khugepaged_do_scan(&hpage);
2247#ifndef CONFIG_NUMA
2248 if (hpage)
2249 put_page(hpage);
2250#endif
2251 try_to_freeze();
2252 if (unlikely(kthread_should_stop()))
2253 break;
2254 if (khugepaged_has_work()) {
2255 DEFINE_WAIT(wait);
2256 if (!khugepaged_scan_sleep_millisecs)
2257 continue;
2258 add_wait_queue(&khugepaged_wait, &wait);
2259 schedule_timeout_interruptible(
2260 msecs_to_jiffies(
2261 khugepaged_scan_sleep_millisecs));
2262 remove_wait_queue(&khugepaged_wait, &wait);
2263 } else if (khugepaged_enabled())
2264 wait_event_freezable(khugepaged_wait,
2265 khugepaged_wait_event());
2266 }
2267}
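/*
 * Putting khugepaged_do_scan() and khugepaged_loop() together: each
 * pass scans up to pages_to_scan worth of ptes across the registered
 * mms, then the daemon sleeps for scan_sleep_millisecs before the next
 * pass (or for alloc_sleep_millisecs after a failed hugepage
 * allocation), so the sysfs knobs directly bound how much work
 * khugepaged does per wakeup.
 */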
2268
2269static int khugepaged(void *none)
2270{
2271 struct mm_slot *mm_slot;
2272
2273 set_freezable();
2274 set_user_nice(current, 19);
2275
2276 /* serialize with start_khugepaged() */
2277 mutex_lock(&khugepaged_mutex);
2278
2279 for (;;) {
2280 mutex_unlock(&khugepaged_mutex);
2281 VM_BUG_ON(khugepaged_thread != current);
2282 khugepaged_loop();
2283 VM_BUG_ON(khugepaged_thread != current);
2284
2285 mutex_lock(&khugepaged_mutex);
2286 if (!khugepaged_enabled())
2287 break;
2288 if (unlikely(kthread_should_stop()))
2289 break;
2290 }
2291
2292 spin_lock(&khugepaged_mm_lock);
2293 mm_slot = khugepaged_scan.mm_slot;
2294 khugepaged_scan.mm_slot = NULL;
2295 if (mm_slot)
2296 collect_mm_slot(mm_slot);
2297 spin_unlock(&khugepaged_mm_lock);
2298
2299 khugepaged_thread = NULL;
2300 mutex_unlock(&khugepaged_mutex);
2301
2302 return 0;
2303}
2304
2305void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd)
2306{
2307 struct page *page;
2308
2309 spin_lock(&mm->page_table_lock);
2310 if (unlikely(!pmd_trans_huge(*pmd))) {
2311 spin_unlock(&mm->page_table_lock);
2312 return;
2313 }
2314 page = pmd_page(*pmd);
2315 VM_BUG_ON(!page_count(page));
2316 get_page(page);
2317 spin_unlock(&mm->page_table_lock);
2318
2319 split_huge_page(page);
2320
2321 put_page(page);
2322 BUG_ON(pmd_trans_huge(*pmd));
2323}
2324
2325static void split_huge_page_address(struct mm_struct *mm,
2326 unsigned long address)
2327{
2328 pgd_t *pgd;
2329 pud_t *pud;
2330 pmd_t *pmd;
2331
2332 VM_BUG_ON(!(address & ~HPAGE_PMD_MASK));
2333
2334 pgd = pgd_offset(mm, address);
2335 if (!pgd_present(*pgd))
2336 return;
2337
2338 pud = pud_offset(pgd, address);
2339 if (!pud_present(*pud))
2340 return;
2341
2342 pmd = pmd_offset(pud, address);
2343 if (!pmd_present(*pmd))
2344 return;
2345 /*
2346 * Caller holds the mmap_sem in write mode, so a huge pmd cannot
2347 * materialize from under us.
2348 */
2349 split_huge_page_pmd(mm, pmd);
2350}
2351
2352void __vma_adjust_trans_huge(struct vm_area_struct *vma,
2353 unsigned long start,
2354 unsigned long end,
2355 long adjust_next)
2356{
2357 /*
2358 * If the new start address isn't hpage aligned and it could
2359 * previously contain a hugepage: check if we need to split
2360 * a huge pmd.
2361 */
2362 if (start & ~HPAGE_PMD_MASK &&
2363 (start & HPAGE_PMD_MASK) >= vma->vm_start &&
2364 (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2365 split_huge_page_address(vma->vm_mm, start);
2366
2367 /*
2368 * If the new end address isn't hpage aligned and it could
2369 * previously contain a hugepage: check if we need to split
2370 * a huge pmd.
2371 */
2372 if (end & ~HPAGE_PMD_MASK &&
2373 (end & HPAGE_PMD_MASK) >= vma->vm_start &&
2374 (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end)
2375 split_huge_page_address(vma->vm_mm, end);
2376
2377 /*
2378 * If we're also updating vma->vm_next->vm_start, and the new
2379 * vm_next->vm_start isn't hpage aligned and it could previously
2380 * contain a hugepage: check if we need to split a huge pmd.
2381 */
2382 if (adjust_next > 0) {
2383 struct vm_area_struct *next = vma->vm_next;
2384 unsigned long nstart = next->vm_start;
2385 nstart += adjust_next << PAGE_SHIFT;
2386 if (nstart & ~HPAGE_PMD_MASK &&
2387 (nstart & HPAGE_PMD_MASK) >= next->vm_start &&
2388 (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
2389 split_huge_page_address(next->vm_mm, nstart);
2390 }
2391}
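The three checks in __vma_adjust_trans_huge() above all apply the same test: an address that is not HPAGE_PMD-aligned, but whose surrounding PMD-sized range still fits inside the VMA, may sit in the middle of a huge pmd that has to be split before the VMA is resized. A minimal restatement of that test as a hypothetical helper (not part of the patch, which open-codes it three times) could look like:

static bool addr_splits_huge_pmd(struct vm_area_struct *vma,
				 unsigned long addr)
{
	unsigned long haddr = addr & HPAGE_PMD_MASK;

	/* only an unaligned address can cut a huge pmd in half */
	if (!(addr & ~HPAGE_PMD_MASK))
		return false;
	/* the whole PMD-sized range must lie inside this vma */
	return haddr >= vma->vm_start &&
	       haddr + HPAGE_PMD_SIZE <= vma->vm_end;
}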
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c03273807182..bfcf153bc829 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -146,7 +146,7 @@ static long region_chg(struct list_head *head, long f, long t)
146 if (rg->from > t) 146 if (rg->from > t)
147 return chg; 147 return chg;
148 148
149 /* We overlap with this area, if it extends futher than 149 /* We overlap with this area, if it extends further than
150 * us then we must extend ourselves. Account for its 150 * us then we must extend ourselves. Account for its
151 * existing reservation. */ 151 * existing reservation. */
152 if (rg->to > t) { 152 if (rg->to > t) {
@@ -394,67 +394,37 @@ static int vma_has_reserves(struct vm_area_struct *vma)
394 return 0; 394 return 0;
395} 395}
396 396
397static void clear_gigantic_page(struct page *page, 397static void copy_gigantic_page(struct page *dst, struct page *src)
398 unsigned long addr, unsigned long sz)
399{ 398{
400 int i; 399 int i;
401 struct page *p = page; 400 struct hstate *h = page_hstate(src);
402
403 might_sleep();
404 for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
405 cond_resched();
406 clear_user_highpage(p, addr + i * PAGE_SIZE);
407 }
408}
409static void clear_huge_page(struct page *page,
410 unsigned long addr, unsigned long sz)
411{
412 int i;
413
414 if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) {
415 clear_gigantic_page(page, addr, sz);
416 return;
417 }
418
419 might_sleep();
420 for (i = 0; i < sz/PAGE_SIZE; i++) {
421 cond_resched();
422 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
423 }
424}
425
426static void copy_gigantic_page(struct page *dst, struct page *src,
427 unsigned long addr, struct vm_area_struct *vma)
428{
429 int i;
430 struct hstate *h = hstate_vma(vma);
431 struct page *dst_base = dst; 401 struct page *dst_base = dst;
432 struct page *src_base = src; 402 struct page *src_base = src;
433 might_sleep(); 403
434 for (i = 0; i < pages_per_huge_page(h); ) { 404 for (i = 0; i < pages_per_huge_page(h); ) {
435 cond_resched(); 405 cond_resched();
436 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); 406 copy_highpage(dst, src);
437 407
438 i++; 408 i++;
439 dst = mem_map_next(dst, dst_base, i); 409 dst = mem_map_next(dst, dst_base, i);
440 src = mem_map_next(src, src_base, i); 410 src = mem_map_next(src, src_base, i);
441 } 411 }
442} 412}
443static void copy_huge_page(struct page *dst, struct page *src, 413
444 unsigned long addr, struct vm_area_struct *vma) 414void copy_huge_page(struct page *dst, struct page *src)
445{ 415{
446 int i; 416 int i;
447 struct hstate *h = hstate_vma(vma); 417 struct hstate *h = page_hstate(src);
448 418
449 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { 419 if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
450 copy_gigantic_page(dst, src, addr, vma); 420 copy_gigantic_page(dst, src);
451 return; 421 return;
452 } 422 }
453 423
454 might_sleep(); 424 might_sleep();
455 for (i = 0; i < pages_per_huge_page(h); i++) { 425 for (i = 0; i < pages_per_huge_page(h); i++) {
456 cond_resched(); 426 cond_resched();
457 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 427 copy_highpage(dst + i, src + i);
458 } 428 }
459} 429}
460 430
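The user-copy variants clear_huge_page() and copy_user_huge_page() disappear from this file; later hunks call them with a pages_per_huge_page(h) argument, so they presumably now live in common code. The retained copy_huge_page() derives the hstate from the source page and needs only the two pages, e.g. (illustrative call, not taken from this hunk):

	/* dst_hpage and src_hpage are huge pages of the same hstate */
	copy_huge_page(dst_hpage, src_hpage);	/* kernel-to-kernel copy, no vma or address */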
@@ -466,11 +436,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
466 h->free_huge_pages_node[nid]++; 436 h->free_huge_pages_node[nid]++;
467} 437}
468 438
439static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
440{
441 struct page *page;
442
443 if (list_empty(&h->hugepage_freelists[nid]))
444 return NULL;
445 page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
446 list_del(&page->lru);
447 set_page_refcounted(page);
448 h->free_huge_pages--;
449 h->free_huge_pages_node[nid]--;
450 return page;
451}
452
469static struct page *dequeue_huge_page_vma(struct hstate *h, 453static struct page *dequeue_huge_page_vma(struct hstate *h,
470 struct vm_area_struct *vma, 454 struct vm_area_struct *vma,
471 unsigned long address, int avoid_reserve) 455 unsigned long address, int avoid_reserve)
472{ 456{
473 int nid;
474 struct page *page = NULL; 457 struct page *page = NULL;
475 struct mempolicy *mpol; 458 struct mempolicy *mpol;
476 nodemask_t *nodemask; 459 nodemask_t *nodemask;
@@ -492,23 +475,17 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
492 475
493 /* If reserves cannot be used, ensure enough pages are in the pool */ 476 /* If reserves cannot be used, ensure enough pages are in the pool */
494 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) 477 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
495 goto err;; 478 goto err;
496 479
497 for_each_zone_zonelist_nodemask(zone, z, zonelist, 480 for_each_zone_zonelist_nodemask(zone, z, zonelist,
498 MAX_NR_ZONES - 1, nodemask) { 481 MAX_NR_ZONES - 1, nodemask) {
499 nid = zone_to_nid(zone); 482 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
500 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && 483 page = dequeue_huge_page_node(h, zone_to_nid(zone));
501 !list_empty(&h->hugepage_freelists[nid])) { 484 if (page) {
502 page = list_entry(h->hugepage_freelists[nid].next, 485 if (!avoid_reserve)
503 struct page, lru); 486 decrement_hugepage_resv_vma(h, vma);
504 list_del(&page->lru); 487 break;
505 h->free_huge_pages--; 488 }
506 h->free_huge_pages_node[nid]--;
507
508 if (!avoid_reserve)
509 decrement_hugepage_resv_vma(h, vma);
510
511 break;
512 } 489 }
513 } 490 }
514err: 491err:
@@ -770,11 +747,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
770 return ret; 747 return ret;
771} 748}
772 749
773static struct page *alloc_buddy_huge_page(struct hstate *h, 750static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
774 struct vm_area_struct *vma, unsigned long address)
775{ 751{
776 struct page *page; 752 struct page *page;
777 unsigned int nid; 753 unsigned int r_nid;
778 754
779 if (h->order >= MAX_ORDER) 755 if (h->order >= MAX_ORDER)
780 return NULL; 756 return NULL;
@@ -812,9 +788,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
812 } 788 }
813 spin_unlock(&hugetlb_lock); 789 spin_unlock(&hugetlb_lock);
814 790
815 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 791 if (nid == NUMA_NO_NODE)
816 __GFP_REPEAT|__GFP_NOWARN, 792 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
817 huge_page_order(h)); 793 __GFP_REPEAT|__GFP_NOWARN,
794 huge_page_order(h));
795 else
796 page = alloc_pages_exact_node(nid,
797 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
798 __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
818 799
819 if (page && arch_prepare_hugepage(page)) { 800 if (page && arch_prepare_hugepage(page)) {
820 __free_pages(page, huge_page_order(h)); 801 __free_pages(page, huge_page_order(h));
@@ -823,19 +804,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
823 804
824 spin_lock(&hugetlb_lock); 805 spin_lock(&hugetlb_lock);
825 if (page) { 806 if (page) {
826 /* 807 r_nid = page_to_nid(page);
827 * This page is now managed by the hugetlb allocator and has
828 * no users -- drop the buddy allocator's reference.
829 */
830 put_page_testzero(page);
831 VM_BUG_ON(page_count(page));
832 nid = page_to_nid(page);
833 set_compound_page_dtor(page, free_huge_page); 808 set_compound_page_dtor(page, free_huge_page);
834 /* 809 /*
835 * We incremented the global counters already 810 * We incremented the global counters already
836 */ 811 */
837 h->nr_huge_pages_node[nid]++; 812 h->nr_huge_pages_node[r_nid]++;
838 h->surplus_huge_pages_node[nid]++; 813 h->surplus_huge_pages_node[r_nid]++;
839 __count_vm_event(HTLB_BUDDY_PGALLOC); 814 __count_vm_event(HTLB_BUDDY_PGALLOC);
840 } else { 815 } else {
841 h->nr_huge_pages--; 816 h->nr_huge_pages--;
@@ -848,7 +823,26 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
848} 823}
849 824
850/* 825/*
851 * Increase the hugetlb pool such that it can accomodate a reservation 826 * This allocation function is useful in the context where vma is irrelevant.
827 * E.g. soft-offlining uses this function because it only cares physical
828 * address of error page.
829 */
830struct page *alloc_huge_page_node(struct hstate *h, int nid)
831{
832 struct page *page;
833
834 spin_lock(&hugetlb_lock);
835 page = dequeue_huge_page_node(h, nid);
836 spin_unlock(&hugetlb_lock);
837
838 if (!page)
839 page = alloc_buddy_huge_page(h, nid);
840
841 return page;
842}
843
844/*
845 * Increase the hugetlb pool such that it can accommodate a reservation
852 * of size 'delta'. 846 * of size 'delta'.
853 */ 847 */
854static int gather_surplus_pages(struct hstate *h, int delta) 848static int gather_surplus_pages(struct hstate *h, int delta)
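alloc_huge_page_node() gives vma-less callers such as soft-offlining a way to get a huge page on a specific node: it first tries that node's free list and only then falls back to the buddy allocator. A hedged sketch of a caller, assuming hpage is the page being handled (variable names are placeholders, not from the patch):

	int nid = page_to_nid(hpage);
	struct page *new_hpage;

	/* illustrative only: real callers also decide on retry policy */
	new_hpage = alloc_huge_page_node(page_hstate(hpage), nid);
	if (!new_hpage)
		return -ENOMEM;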
@@ -871,17 +865,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
871retry: 865retry:
872 spin_unlock(&hugetlb_lock); 866 spin_unlock(&hugetlb_lock);
873 for (i = 0; i < needed; i++) { 867 for (i = 0; i < needed; i++) {
874 page = alloc_buddy_huge_page(h, NULL, 0); 868 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
875 if (!page) { 869 if (!page)
876 /* 870 /*
877 * We were not able to allocate enough pages to 871 * We were not able to allocate enough pages to
878 * satisfy the entire reservation so we free what 872 * satisfy the entire reservation so we free what
879 * we've allocated so far. 873 * we've allocated so far.
880 */ 874 */
881 spin_lock(&hugetlb_lock);
882 needed = 0;
883 goto free; 875 goto free;
884 }
885 876
886 list_add(&page->lru, &surplus_list); 877 list_add(&page->lru, &surplus_list);
887 } 878 }
@@ -899,7 +890,7 @@ retry:
899 890
900 /* 891 /*
901 * The surplus_list now contains _at_least_ the number of extra pages 892 * The surplus_list now contains _at_least_ the number of extra pages
902 * needed to accomodate the reservation. Add the appropriate number 893 * needed to accommodate the reservation. Add the appropriate number
903 * of pages to the hugetlb pool and free the extras back to the buddy 894 * of pages to the hugetlb pool and free the extras back to the buddy
904 * allocator. Commit the entire reservation here to prevent another 895 * allocator. Commit the entire reservation here to prevent another
905 * process from stealing the pages as they are added to the pool but 896 * process from stealing the pages as they are added to the pool but
@@ -908,31 +899,31 @@ retry:
908 needed += allocated; 899 needed += allocated;
909 h->resv_huge_pages += delta; 900 h->resv_huge_pages += delta;
910 ret = 0; 901 ret = 0;
911free: 902
903 spin_unlock(&hugetlb_lock);
912 /* Free the needed pages to the hugetlb pool */ 904 /* Free the needed pages to the hugetlb pool */
913 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 905 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
914 if ((--needed) < 0) 906 if ((--needed) < 0)
915 break; 907 break;
916 list_del(&page->lru); 908 list_del(&page->lru);
909 /*
910 * This page is now managed by the hugetlb allocator and has
911 * no users -- drop the buddy allocator's reference.
912 */
913 put_page_testzero(page);
914 VM_BUG_ON(page_count(page));
917 enqueue_huge_page(h, page); 915 enqueue_huge_page(h, page);
918 } 916 }
919 917
920 /* Free unnecessary surplus pages to the buddy allocator */ 918 /* Free unnecessary surplus pages to the buddy allocator */
919free:
921 if (!list_empty(&surplus_list)) { 920 if (!list_empty(&surplus_list)) {
922 spin_unlock(&hugetlb_lock);
923 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 921 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
924 list_del(&page->lru); 922 list_del(&page->lru);
925 /* 923 put_page(page);
926 * The page has a reference count of zero already, so
927 * call free_huge_page directly instead of using
928 * put_page. This must be done with hugetlb_lock
929 * unlocked which is safe because free_huge_page takes
930 * hugetlb_lock before deciding how to free the page.
931 */
932 free_huge_page(page);
933 } 924 }
934 spin_lock(&hugetlb_lock);
935 } 925 }
926 spin_lock(&hugetlb_lock);
936 927
937 return ret; 928 return ret;
938} 929}
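Because alloc_buddy_huge_page() no longer drops the buddy allocator's reference itself, gather_surplus_pages() is now the point where ownership is handed to the hugetlb pool, and leftover surplus pages can be released with a plain put_page(). The hand-off idiom, restated as a standalone fragment for clarity (same calls as in the hunk above):

	list_del(&page->lru);
	/* page arrived from the buddy allocator with one reference held */
	put_page_testzero(page);		/* drop it; the count must reach zero */
	VM_BUG_ON(page_count(page));
	enqueue_huge_page(h, page);		/* the pool now owns the page */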
@@ -1042,24 +1033,23 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
1042 */ 1033 */
1043 chg = vma_needs_reservation(h, vma, addr); 1034 chg = vma_needs_reservation(h, vma, addr);
1044 if (chg < 0) 1035 if (chg < 0)
1045 return ERR_PTR(chg); 1036 return ERR_PTR(-VM_FAULT_OOM);
1046 if (chg) 1037 if (chg)
1047 if (hugetlb_get_quota(inode->i_mapping, chg)) 1038 if (hugetlb_get_quota(inode->i_mapping, chg))
1048 return ERR_PTR(-ENOSPC); 1039 return ERR_PTR(-VM_FAULT_SIGBUS);
1049 1040
1050 spin_lock(&hugetlb_lock); 1041 spin_lock(&hugetlb_lock);
1051 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); 1042 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
1052 spin_unlock(&hugetlb_lock); 1043 spin_unlock(&hugetlb_lock);
1053 1044
1054 if (!page) { 1045 if (!page) {
1055 page = alloc_buddy_huge_page(h, vma, addr); 1046 page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
1056 if (!page) { 1047 if (!page) {
1057 hugetlb_put_quota(inode->i_mapping, chg); 1048 hugetlb_put_quota(inode->i_mapping, chg);
1058 return ERR_PTR(-VM_FAULT_SIGBUS); 1049 return ERR_PTR(-VM_FAULT_SIGBUS);
1059 } 1050 }
1060 } 1051 }
1061 1052
1062 set_page_refcounted(page);
1063 set_page_private(page, (unsigned long) mapping); 1053 set_page_private(page, (unsigned long) mapping);
1064 1054
1065 vma_commit_reservation(h, vma, addr); 1055 vma_commit_reservation(h, vma, addr);
@@ -1121,6 +1111,14 @@ static void __init gather_bootmem_prealloc(void)
1121 WARN_ON(page_count(page) != 1); 1111 WARN_ON(page_count(page) != 1);
1122 prep_compound_huge_page(page, h->order); 1112 prep_compound_huge_page(page, h->order);
1123 prep_new_huge_page(h, page, page_to_nid(page)); 1113 prep_new_huge_page(h, page, page_to_nid(page));
1114 /*
1115 * If we had gigantic hugepages allocated at boot time, we need
1116 * to restore the 'stolen' pages to totalram_pages in order to
1117 * fix confusing memory reports from free(1) and another
1118 * side-effects, like CommitLimit going negative.
1119 */
1120 if (h->order > (MAX_ORDER - 1))
1121 totalram_pages += 1 << h->order;
1124 } 1122 }
1125} 1123}
1126 1124
@@ -1373,6 +1371,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
1373 1371
1374 return sprintf(buf, "%lu\n", nr_huge_pages); 1372 return sprintf(buf, "%lu\n", nr_huge_pages);
1375} 1373}
1374
1376static ssize_t nr_hugepages_store_common(bool obey_mempolicy, 1375static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1377 struct kobject *kobj, struct kobj_attribute *attr, 1376 struct kobject *kobj, struct kobj_attribute *attr,
1378 const char *buf, size_t len) 1377 const char *buf, size_t len)
@@ -1385,9 +1384,14 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1385 1384
1386 err = strict_strtoul(buf, 10, &count); 1385 err = strict_strtoul(buf, 10, &count);
1387 if (err) 1386 if (err)
1388 return 0; 1387 goto out;
1389 1388
1390 h = kobj_to_hstate(kobj, &nid); 1389 h = kobj_to_hstate(kobj, &nid);
1390 if (h->order >= MAX_ORDER) {
1391 err = -EINVAL;
1392 goto out;
1393 }
1394
1391 if (nid == NUMA_NO_NODE) { 1395 if (nid == NUMA_NO_NODE) {
1392 /* 1396 /*
1393 * global hstate attribute 1397 * global hstate attribute
@@ -1413,6 +1417,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
1413 NODEMASK_FREE(nodes_allowed); 1417 NODEMASK_FREE(nodes_allowed);
1414 1418
1415 return len; 1419 return len;
1420out:
1421 NODEMASK_FREE(nodes_allowed);
1422 return err;
1416} 1423}
1417 1424
1418static ssize_t nr_hugepages_show(struct kobject *kobj, 1425static ssize_t nr_hugepages_show(struct kobject *kobj,
@@ -1455,6 +1462,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1455 struct hstate *h = kobj_to_hstate(kobj, NULL); 1462 struct hstate *h = kobj_to_hstate(kobj, NULL);
1456 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); 1463 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1457} 1464}
1465
1458static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, 1466static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1459 struct kobj_attribute *attr, const char *buf, size_t count) 1467 struct kobj_attribute *attr, const char *buf, size_t count)
1460{ 1468{
@@ -1462,9 +1470,12 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1462 unsigned long input; 1470 unsigned long input;
1463 struct hstate *h = kobj_to_hstate(kobj, NULL); 1471 struct hstate *h = kobj_to_hstate(kobj, NULL);
1464 1472
1473 if (h->order >= MAX_ORDER)
1474 return -EINVAL;
1475
1465 err = strict_strtoul(buf, 10, &input); 1476 err = strict_strtoul(buf, 10, &input);
1466 if (err) 1477 if (err)
1467 return 0; 1478 return err;
1468 1479
1469 spin_lock(&hugetlb_lock); 1480 spin_lock(&hugetlb_lock);
1470 h->nr_overcommit_huge_pages = input; 1481 h->nr_overcommit_huge_pages = input;
@@ -1867,13 +1878,18 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1867{ 1878{
1868 struct hstate *h = &default_hstate; 1879 struct hstate *h = &default_hstate;
1869 unsigned long tmp; 1880 unsigned long tmp;
1881 int ret;
1882
1883 tmp = h->max_huge_pages;
1870 1884
1871 if (!write) 1885 if (write && h->order >= MAX_ORDER)
1872 tmp = h->max_huge_pages; 1886 return -EINVAL;
1873 1887
1874 table->data = &tmp; 1888 table->data = &tmp;
1875 table->maxlen = sizeof(unsigned long); 1889 table->maxlen = sizeof(unsigned long);
1876 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1890 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
1891 if (ret)
1892 goto out;
1877 1893
1878 if (write) { 1894 if (write) {
1879 NODEMASK_ALLOC(nodemask_t, nodes_allowed, 1895 NODEMASK_ALLOC(nodemask_t, nodes_allowed,
@@ -1888,8 +1904,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
1888 if (nodes_allowed != &node_states[N_HIGH_MEMORY]) 1904 if (nodes_allowed != &node_states[N_HIGH_MEMORY])
1889 NODEMASK_FREE(nodes_allowed); 1905 NODEMASK_FREE(nodes_allowed);
1890 } 1906 }
1891 1907out:
1892 return 0; 1908 return ret;
1893} 1909}
1894 1910
1895int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1911int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -1927,21 +1943,26 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
1927{ 1943{
1928 struct hstate *h = &default_hstate; 1944 struct hstate *h = &default_hstate;
1929 unsigned long tmp; 1945 unsigned long tmp;
1946 int ret;
1930 1947
1931 if (!write) 1948 tmp = h->nr_overcommit_huge_pages;
1932 tmp = h->nr_overcommit_huge_pages; 1949
1950 if (write && h->order >= MAX_ORDER)
1951 return -EINVAL;
1933 1952
1934 table->data = &tmp; 1953 table->data = &tmp;
1935 table->maxlen = sizeof(unsigned long); 1954 table->maxlen = sizeof(unsigned long);
1936 proc_doulongvec_minmax(table, write, buffer, length, ppos); 1955 ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
1956 if (ret)
1957 goto out;
1937 1958
1938 if (write) { 1959 if (write) {
1939 spin_lock(&hugetlb_lock); 1960 spin_lock(&hugetlb_lock);
1940 h->nr_overcommit_huge_pages = tmp; 1961 h->nr_overcommit_huge_pages = tmp;
1941 spin_unlock(&hugetlb_lock); 1962 spin_unlock(&hugetlb_lock);
1942 } 1963 }
1943 1964out:
1944 return 0; 1965 return ret;
1945} 1966}
1946 1967
1947#endif /* CONFIG_SYSCTL */ 1968#endif /* CONFIG_SYSCTL */
@@ -2030,7 +2051,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
2030 * This new VMA should share its siblings reservation map if present. 2051 * This new VMA should share its siblings reservation map if present.
2031 * The VMA will only ever have a valid reservation map pointer where 2052 * The VMA will only ever have a valid reservation map pointer where
2032 * it is being copied for another still existing VMA. As that VMA 2053 * it is being copied for another still existing VMA. As that VMA
2033 * has a reference to the reservation map it cannot dissappear until 2054 * has a reference to the reservation map it cannot disappear until
2034 * after this open call completes. It is therefore safe to take a 2055 * after this open call completes. It is therefore safe to take a
2035 * new reference here without additional locking. 2056 * new reference here without additional locking.
2036 */ 2057 */
@@ -2153,6 +2174,19 @@ nomem:
2153 return -ENOMEM; 2174 return -ENOMEM;
2154} 2175}
2155 2176
2177static int is_hugetlb_entry_migration(pte_t pte)
2178{
2179 swp_entry_t swp;
2180
2181 if (huge_pte_none(pte) || pte_present(pte))
2182 return 0;
2183 swp = pte_to_swp_entry(pte);
2184 if (non_swap_entry(swp) && is_migration_entry(swp)) {
2185 return 1;
2186 } else
2187 return 0;
2188}
2189
2156static int is_hugetlb_entry_hwpoisoned(pte_t pte) 2190static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2157{ 2191{
2158 swp_entry_t swp; 2192 swp_entry_t swp;
@@ -2179,7 +2213,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2179 unsigned long sz = huge_page_size(h); 2213 unsigned long sz = huge_page_size(h);
2180 2214
2181 /* 2215 /*
2182 * A page gathering list, protected by per file i_mmap_lock. The 2216 * A page gathering list, protected by per file i_mmap_mutex. The
2183 * lock is used to avoid list corruption from multiple unmapping 2217 * lock is used to avoid list corruption from multiple unmapping
2184 * of the same page since we are using page->lru. 2218 * of the same page since we are using page->lru.
2185 */ 2219 */
@@ -2248,9 +2282,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2248void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2282void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2249 unsigned long end, struct page *ref_page) 2283 unsigned long end, struct page *ref_page)
2250{ 2284{
2251 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 2285 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
2252 __unmap_hugepage_range(vma, start, end, ref_page); 2286 __unmap_hugepage_range(vma, start, end, ref_page);
2253 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 2287 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2254} 2288}
2255 2289
2256/* 2290/*
@@ -2282,7 +2316,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2282 * this mapping should be shared between all the VMAs, 2316 * this mapping should be shared between all the VMAs,
2283 * __unmap_hugepage_range() is called as the lock is already held 2317 * __unmap_hugepage_range() is called as the lock is already held
2284 */ 2318 */
2285 spin_lock(&mapping->i_mmap_lock); 2319 mutex_lock(&mapping->i_mmap_mutex);
2286 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 2320 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
2287 /* Do not unmap the current VMA */ 2321 /* Do not unmap the current VMA */
2288 if (iter_vma == vma) 2322 if (iter_vma == vma)
@@ -2300,7 +2334,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2300 address, address + huge_page_size(h), 2334 address, address + huge_page_size(h),
2301 page); 2335 page);
2302 } 2336 }
2303 spin_unlock(&mapping->i_mmap_lock); 2337 mutex_unlock(&mapping->i_mmap_mutex);
2304 2338
2305 return 1; 2339 return 1;
2306} 2340}
@@ -2380,10 +2414,14 @@ retry_avoidcopy:
2380 * When the original hugepage is shared one, it does not have 2414 * When the original hugepage is shared one, it does not have
2381 * anon_vma prepared. 2415 * anon_vma prepared.
2382 */ 2416 */
2383 if (unlikely(anon_vma_prepare(vma))) 2417 if (unlikely(anon_vma_prepare(vma))) {
2418 /* Caller expects lock to be held */
2419 spin_lock(&mm->page_table_lock);
2384 return VM_FAULT_OOM; 2420 return VM_FAULT_OOM;
2421 }
2385 2422
2386 copy_huge_page(new_page, old_page, address, vma); 2423 copy_user_huge_page(new_page, old_page, address, vma,
2424 pages_per_huge_page(h));
2387 __SetPageUptodate(new_page); 2425 __SetPageUptodate(new_page);
2388 2426
2389 /* 2427 /*
@@ -2460,7 +2498,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
2460 /* 2498 /*
2461 * Currently, we are forced to kill the process in the event the 2499 * Currently, we are forced to kill the process in the event the
2462 * original mapper has unmapped pages from the child due to a failed 2500 * original mapper has unmapped pages from the child due to a failed
2463 * COW. Warn that such a situation has occured as it may not be obvious 2501 * COW. Warn that such a situation has occurred as it may not be obvious
2464 */ 2502 */
2465 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { 2503 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
2466 printk(KERN_WARNING 2504 printk(KERN_WARNING
@@ -2487,7 +2525,7 @@ retry:
2487 ret = -PTR_ERR(page); 2525 ret = -PTR_ERR(page);
2488 goto out; 2526 goto out;
2489 } 2527 }
2490 clear_huge_page(page, address, huge_page_size(h)); 2528 clear_huge_page(page, address, pages_per_huge_page(h));
2491 __SetPageUptodate(page); 2529 __SetPageUptodate(page);
2492 2530
2493 if (vma->vm_flags & VM_MAYSHARE) { 2531 if (vma->vm_flags & VM_MAYSHARE) {
@@ -2515,22 +2553,20 @@ retry:
2515 hugepage_add_new_anon_rmap(page, vma, address); 2553 hugepage_add_new_anon_rmap(page, vma, address);
2516 } 2554 }
2517 } else { 2555 } else {
2556 /*
2557 * If memory error occurs between mmap() and fault, some process
2558 * don't have hwpoisoned swap entry for errored virtual address.
2559 * So we need to block hugepage fault by PG_hwpoison bit check.
2560 */
2561 if (unlikely(PageHWPoison(page))) {
2562 ret = VM_FAULT_HWPOISON |
2563 VM_FAULT_SET_HINDEX(h - hstates);
2564 goto backout_unlocked;
2565 }
2518 page_dup_rmap(page); 2566 page_dup_rmap(page);
2519 } 2567 }
2520 2568
2521 /* 2569 /*
2522 * Since memory error handler replaces pte into hwpoison swap entry
2523 * at the time of error handling, a process which reserved but not have
2524 * the mapping to the error hugepage does not have hwpoison swap entry.
2525 * So we need to block accesses from such a process by checking
2526 * PG_hwpoison bit here.
2527 */
2528 if (unlikely(PageHWPoison(page))) {
2529 ret = VM_FAULT_HWPOISON;
2530 goto backout_unlocked;
2531 }
2532
2533 /*
2534 * If we are going to COW a private mapping later, we examine the 2570 * If we are going to COW a private mapping later, we examine the
2535 * pending reservations for this page now. This will ensure that 2571 * pending reservations for this page now. This will ensure that
2536 * any allocations necessary to record that reservation occur outside 2572 * any allocations necessary to record that reservation occur outside
@@ -2587,8 +2623,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2587 ptep = huge_pte_offset(mm, address); 2623 ptep = huge_pte_offset(mm, address);
2588 if (ptep) { 2624 if (ptep) {
2589 entry = huge_ptep_get(ptep); 2625 entry = huge_ptep_get(ptep);
2590 if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) 2626 if (unlikely(is_hugetlb_entry_migration(entry))) {
2591 return VM_FAULT_HWPOISON; 2627 migration_entry_wait(mm, (pmd_t *)ptep, address);
2628 return 0;
2629 } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2630 return VM_FAULT_HWPOISON_LARGE |
2631 VM_FAULT_SET_HINDEX(h - hstates);
2592 } 2632 }
2593 2633
2594 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 2634 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
@@ -2665,7 +2705,8 @@ out_page_table_lock:
2665 unlock_page(pagecache_page); 2705 unlock_page(pagecache_page);
2666 put_page(pagecache_page); 2706 put_page(pagecache_page);
2667 } 2707 }
2668 unlock_page(page); 2708 if (page != pagecache_page)
2709 unlock_page(page);
2669 2710
2670out_mutex: 2711out_mutex:
2671 mutex_unlock(&hugetlb_instantiation_mutex); 2712 mutex_unlock(&hugetlb_instantiation_mutex);
@@ -2777,7 +2818,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2777 BUG_ON(address >= end); 2818 BUG_ON(address >= end);
2778 flush_cache_range(vma, address, end); 2819 flush_cache_range(vma, address, end);
2779 2820
2780 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 2821 mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
2781 spin_lock(&mm->page_table_lock); 2822 spin_lock(&mm->page_table_lock);
2782 for (; address < end; address += huge_page_size(h)) { 2823 for (; address < end; address += huge_page_size(h)) {
2783 ptep = huge_pte_offset(mm, address); 2824 ptep = huge_pte_offset(mm, address);
@@ -2792,7 +2833,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2792 } 2833 }
2793 } 2834 }
2794 spin_unlock(&mm->page_table_lock); 2835 spin_unlock(&mm->page_table_lock);
2795 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 2836 mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex);
2796 2837
2797 flush_tlb_range(vma, start, end); 2838 flush_tlb_range(vma, start, end);
2798} 2839}
@@ -2800,7 +2841,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2800int hugetlb_reserve_pages(struct inode *inode, 2841int hugetlb_reserve_pages(struct inode *inode,
2801 long from, long to, 2842 long from, long to,
2802 struct vm_area_struct *vma, 2843 struct vm_area_struct *vma,
2803 int acctflag) 2844 vm_flags_t vm_flags)
2804{ 2845{
2805 long ret, chg; 2846 long ret, chg;
2806 struct hstate *h = hstate_inode(inode); 2847 struct hstate *h = hstate_inode(inode);
@@ -2810,7 +2851,7 @@ int hugetlb_reserve_pages(struct inode *inode,
2810 * attempt will be made for VM_NORESERVE to allocate a page 2851 * attempt will be made for VM_NORESERVE to allocate a page
2811 * and filesystem quota without using reserves 2852 * and filesystem quota without using reserves
2812 */ 2853 */
2813 if (acctflag & VM_NORESERVE) 2854 if (vm_flags & VM_NORESERVE)
2814 return 0; 2855 return 0;
2815 2856
2816 /* 2857 /*
@@ -2878,18 +2919,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2878 hugetlb_acct_memory(h, -(chg - freed)); 2919 hugetlb_acct_memory(h, -(chg - freed));
2879} 2920}
2880 2921
2922#ifdef CONFIG_MEMORY_FAILURE
2923
2924/* Should be called in hugetlb_lock */
2925static int is_hugepage_on_freelist(struct page *hpage)
2926{
2927 struct page *page;
2928 struct page *tmp;
2929 struct hstate *h = page_hstate(hpage);
2930 int nid = page_to_nid(hpage);
2931
2932 list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
2933 if (page == hpage)
2934 return 1;
2935 return 0;
2936}
2937
2881/* 2938/*
2882 * This function is called from memory failure code. 2939 * This function is called from memory failure code.
2883 * Assume the caller holds page lock of the head page. 2940 * Assume the caller holds page lock of the head page.
2884 */ 2941 */
2885void __isolate_hwpoisoned_huge_page(struct page *hpage) 2942int dequeue_hwpoisoned_huge_page(struct page *hpage)
2886{ 2943{
2887 struct hstate *h = page_hstate(hpage); 2944 struct hstate *h = page_hstate(hpage);
2888 int nid = page_to_nid(hpage); 2945 int nid = page_to_nid(hpage);
2946 int ret = -EBUSY;
2889 2947
2890 spin_lock(&hugetlb_lock); 2948 spin_lock(&hugetlb_lock);
2891 list_del(&hpage->lru); 2949 if (is_hugepage_on_freelist(hpage)) {
2892 h->free_huge_pages--; 2950 list_del(&hpage->lru);
2893 h->free_huge_pages_node[nid]--; 2951 set_page_refcounted(hpage);
2952 h->free_huge_pages--;
2953 h->free_huge_pages_node[nid]--;
2954 ret = 0;
2955 }
2894 spin_unlock(&hugetlb_lock); 2956 spin_unlock(&hugetlb_lock);
2957 return ret;
2895} 2958}
2959#endif
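dequeue_hwpoisoned_huge_page() now reports whether the poisoned huge page was actually sitting on a free list, returning 0 if it was isolated and -EBUSY otherwise, so the memory-failure code can tell a contained error from one whose page still has users. A hedged sketch of a caller, assuming hpage is the locked head page:

	if (!dequeue_hwpoisoned_huge_page(hpage)) {
		/* page was free: it is now off the pool and isolated */
	} else {
		/* -EBUSY: the huge page is still in use, needs further handling */
	}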
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 0948f1072d6b..c7fc7fd00e32 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -1,4 +1,4 @@
1/* Inject a hwpoison memory failure on a arbitary pfn */ 1/* Inject a hwpoison memory failure on a arbitrary pfn */
2#include <linux/module.h> 2#include <linux/module.h>
3#include <linux/debugfs.h> 3#include <linux/debugfs.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 1d29cdfe8ebb..4019979b2637 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -21,6 +21,5 @@ struct mm_struct init_mm = {
21 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), 21 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
22 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), 22 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
23 .mmlist = LIST_HEAD_INIT(init_mm.mmlist), 23 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
24 .cpu_vm_mask = CPU_MASK_ALL,
25 INIT_MM_CONTEXT(init_mm) 24 INIT_MM_CONTEXT(init_mm)
26}; 25};
diff --git a/mm/internal.h b/mm/internal.h
index 6a697bb97fc5..d071d380fb49 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -62,10 +62,14 @@ extern bool is_free_buddy_page(struct page *page);
62 */ 62 */
63static inline unsigned long page_order(struct page *page) 63static inline unsigned long page_order(struct page *page)
64{ 64{
65 VM_BUG_ON(!PageBuddy(page)); 65 /* PageBuddy() must be checked by the caller */
66 return page_private(page); 66 return page_private(page);
67} 67}
68 68
69/* mm/util.c */
70void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
71 struct vm_area_struct *prev, struct rb_node *rb_parent);
72
69#ifdef CONFIG_MMU 73#ifdef CONFIG_MMU
70extern long mlock_vma_pages_range(struct vm_area_struct *vma, 74extern long mlock_vma_pages_range(struct vm_area_struct *vma,
71 unsigned long start, unsigned long end); 75 unsigned long start, unsigned long end);
@@ -134,6 +138,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
134 } 138 }
135} 139}
136 140
141#ifdef CONFIG_TRANSPARENT_HUGEPAGE
142extern unsigned long vma_address(struct page *page,
143 struct vm_area_struct *vma);
144#endif
137#else /* !CONFIG_MMU */ 145#else /* !CONFIG_MMU */
138static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 146static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
139{ 147{
@@ -158,7 +166,7 @@ static inline struct page *mem_map_offset(struct page *base, int offset)
158} 166}
159 167
160/* 168/*
161 * Iterator over all subpages withing the maximally aligned gigantic 169 * Iterator over all subpages within the maximally aligned gigantic
162 * page 'base'. Handle any discontiguity in the mem_map. 170 * page 'base'. Handle any discontiguity in the mem_map.
163 */ 171 */
164static inline struct page *mem_map_next(struct page *iter, 172static inline struct page *mem_map_next(struct page *iter,
@@ -241,10 +249,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
241} 249}
242#endif /* CONFIG_SPARSEMEM */ 250#endif /* CONFIG_SPARSEMEM */
243 251
244int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
245 unsigned long start, int len, unsigned int foll_flags,
246 struct page **pages, struct vm_area_struct **vmas);
247
248#define ZONE_RECLAIM_NOSCAN -2 252#define ZONE_RECLAIM_NOSCAN -2
249#define ZONE_RECLAIM_FULL -1 253#define ZONE_RECLAIM_FULL -1
250#define ZONE_RECLAIM_SOME 0 254#define ZONE_RECLAIM_SOME 0
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index 177a5169bbde..ff0d9779cec8 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void)
75 * after the module is removed. 75 * after the module is removed.
76 */ 76 */
77 for (i = 0; i < 10; i++) { 77 for (i = 0; i < 10; i++) {
78 elem = kmalloc(sizeof(*elem), GFP_KERNEL); 78 elem = kzalloc(sizeof(*elem), GFP_KERNEL);
79 pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem); 79 pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem);
80 if (!elem) 80 if (!elem)
81 return -ENOMEM; 81 return -ENOMEM;
82 memset(elem, 0, sizeof(*elem));
83 INIT_LIST_HEAD(&elem->list); 82 INIT_LIST_HEAD(&elem->list);
84
85 list_add_tail(&elem->list, &test_list); 83 list_add_tail(&elem->list, &test_list);
86 } 84 }
87 85
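The kmemleak-test change from kmalloc() plus memset() to kzalloc() is purely a simplification; the two forms are equivalent. For reference (illustrative, not from the patch):

	/* before: allocate, then zero */
	elem = kmalloc(sizeof(*elem), GFP_KERNEL);
	if (elem)
		memset(elem, 0, sizeof(*elem));

	/* after: one zeroing allocation */
	elem = kzalloc(sizeof(*elem), GFP_KERNEL);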
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index bd9bc214091b..aacee45616fc 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -113,7 +113,9 @@
113#define BYTES_PER_POINTER sizeof(void *) 113#define BYTES_PER_POINTER sizeof(void *)
114 114
115/* GFP bitmask for kmemleak internal allocations */ 115/* GFP bitmask for kmemleak internal allocations */
116#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC) 116#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
117 __GFP_NORETRY | __GFP_NOMEMALLOC | \
118 __GFP_NOWARN)
117 119
118/* scanning area inside a memory block */ 120/* scanning area inside a memory block */
119struct kmemleak_scan_area { 121struct kmemleak_scan_area {
@@ -263,7 +265,7 @@ static void kmemleak_disable(void);
263} while (0) 265} while (0)
264 266
265/* 267/*
266 * Macro invoked when a serious kmemleak condition occured and cannot be 268 * Macro invoked when a serious kmemleak condition occurred and cannot be
267 * recovered from. Kmemleak will be disabled and further allocation/freeing 269 * recovered from. Kmemleak will be disabled and further allocation/freeing
268 * tracing no longer available. 270 * tracing no longer available.
269 */ 271 */
@@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
511 struct kmemleak_object *object; 513 struct kmemleak_object *object;
512 struct prio_tree_node *node; 514 struct prio_tree_node *node;
513 515
514 object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); 516 object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
515 if (!object) { 517 if (!object) {
516 kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); 518 pr_warning("Cannot allocate a kmemleak_object structure\n");
519 kmemleak_disable();
517 return NULL; 520 return NULL;
518 } 521 }
519 522
@@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
734 return; 737 return;
735 } 738 }
736 739
737 area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); 740 area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
738 if (!area) { 741 if (!area) {
739 kmemleak_warn("Cannot allocate a scan area\n"); 742 pr_warning("Cannot allocate a scan area\n");
740 goto out; 743 goto out;
741 } 744 }
742 745
@@ -1003,7 +1006,7 @@ static bool update_checksum(struct kmemleak_object *object)
1003 1006
1004/* 1007/*
1005 * Memory scanning is a long process and it needs to be interruptable. This 1008 * Memory scanning is a long process and it needs to be interruptable. This
1006 * function checks whether such interrupt condition occured. 1009 * function checks whether such interrupt condition occurred.
1007 */ 1010 */
1008static int scan_should_stop(void) 1011static int scan_should_stop(void)
1009{ 1012{
@@ -1411,9 +1414,12 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1411 ++(*pos); 1414 ++(*pos);
1412 1415
1413 list_for_each_continue_rcu(n, &object_list) { 1416 list_for_each_continue_rcu(n, &object_list) {
1414 next_obj = list_entry(n, struct kmemleak_object, object_list); 1417 struct kmemleak_object *obj =
1415 if (get_object(next_obj)) 1418 list_entry(n, struct kmemleak_object, object_list);
1419 if (get_object(obj)) {
1420 next_obj = obj;
1416 break; 1421 break;
1422 }
1417 } 1423 }
1418 1424
1419 put_object(prev_obj); 1425 put_object(prev_obj);
@@ -1730,7 +1736,7 @@ static int __init kmemleak_late_init(void)
1730 1736
1731 if (atomic_read(&kmemleak_error)) { 1737 if (atomic_read(&kmemleak_error)) {
1732 /* 1738 /*
1733 * Some error occured and kmemleak was disabled. There is a 1739 * Some error occurred and kmemleak was disabled. There is a
1734 * small chance that kmemleak_disable() was called immediately 1740 * small chance that kmemleak_disable() was called immediately
1735 * after setting kmemleak_initialized and we may end up with 1741 * after setting kmemleak_initialized and we may end up with
1736 * two clean-up threads but serialized by scan_mutex. 1742 * two clean-up threads but serialized by scan_mutex.
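gfp_kmemleak_mask() keeps the caller's GFP_KERNEL/GFP_ATOMIC bits but forces __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN, so kmemleak's metadata allocations neither retry aggressively nor dip into emergency reserves under memory pressure; a failed allocation now just disables kmemleak via pr_warning() and kmemleak_disable() instead of the fatal kmemleak_stop() path. Usage inside kmemleak, as in the hunks above:

	object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
	if (!object) {
		pr_warning("Cannot allocate a kmemleak_object structure\n");
		kmemleak_disable();
		return NULL;
	}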
diff --git a/mm/ksm.c b/mm/ksm.c
index 65ab5c7067d9..9a68b0cf0a1c 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -34,6 +34,8 @@
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h> 36#include <linux/hash.h>
37#include <linux/freezer.h>
38#include <linux/oom.h>
37 39
38#include <asm/tlbflush.h> 40#include <asm/tlbflush.h>
39#include "internal.h" 41#include "internal.h"
@@ -300,20 +302,6 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
300 return rmap_item->address & STABLE_FLAG; 302 return rmap_item->address & STABLE_FLAG;
301} 303}
302 304
303static void hold_anon_vma(struct rmap_item *rmap_item,
304 struct anon_vma *anon_vma)
305{
306 rmap_item->anon_vma = anon_vma;
307 get_anon_vma(anon_vma);
308}
309
310static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
311{
312 struct anon_vma *anon_vma = rmap_item->anon_vma;
313
314 drop_anon_vma(anon_vma);
315}
316
317/* 305/*
318 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's 306 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
319 * page tables after it has passed through ksm_exit() - which, if necessary, 307 * page tables after it has passed through ksm_exit() - which, if necessary,
@@ -396,7 +384,7 @@ static void break_cow(struct rmap_item *rmap_item)
396 * It is not an accident that whenever we want to break COW 384 * It is not an accident that whenever we want to break COW
397 * to undo, we also need to drop a reference to the anon_vma. 385 * to undo, we also need to drop a reference to the anon_vma.
398 */ 386 */
399 ksm_drop_anon_vma(rmap_item); 387 put_anon_vma(rmap_item->anon_vma);
400 388
401 down_read(&mm->mmap_sem); 389 down_read(&mm->mmap_sem);
402 if (ksm_test_exit(mm)) 390 if (ksm_test_exit(mm))
@@ -411,6 +399,20 @@ out:
411 up_read(&mm->mmap_sem); 399 up_read(&mm->mmap_sem);
412} 400}
413 401
402static struct page *page_trans_compound_anon(struct page *page)
403{
404 if (PageTransCompound(page)) {
405 struct page *head = compound_trans_head(page);
406 /*
407 * head may actually be splitted and freed from under
408 * us but it's ok here.
409 */
410 if (PageAnon(head))
411 return head;
412 }
413 return NULL;
414}
415
414static struct page *get_mergeable_page(struct rmap_item *rmap_item) 416static struct page *get_mergeable_page(struct rmap_item *rmap_item)
415{ 417{
416 struct mm_struct *mm = rmap_item->mm; 418 struct mm_struct *mm = rmap_item->mm;
@@ -430,7 +432,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
430 page = follow_page(vma, addr, FOLL_GET); 432 page = follow_page(vma, addr, FOLL_GET);
431 if (IS_ERR_OR_NULL(page)) 433 if (IS_ERR_OR_NULL(page))
432 goto out; 434 goto out;
433 if (PageAnon(page)) { 435 if (PageAnon(page) || page_trans_compound_anon(page)) {
434 flush_anon_page(vma, page, addr); 436 flush_anon_page(vma, page, addr);
435 flush_dcache_page(page); 437 flush_dcache_page(page);
436 } else { 438 } else {
@@ -451,7 +453,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
451 ksm_pages_sharing--; 453 ksm_pages_sharing--;
452 else 454 else
453 ksm_pages_shared--; 455 ksm_pages_shared--;
454 ksm_drop_anon_vma(rmap_item); 456 put_anon_vma(rmap_item->anon_vma);
455 rmap_item->address &= PAGE_MASK; 457 rmap_item->address &= PAGE_MASK;
456 cond_resched(); 458 cond_resched();
457 } 459 }
@@ -539,7 +541,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
539 else 541 else
540 ksm_pages_shared--; 542 ksm_pages_shared--;
541 543
542 ksm_drop_anon_vma(rmap_item); 544 put_anon_vma(rmap_item->anon_vma);
543 rmap_item->address &= PAGE_MASK; 545 rmap_item->address &= PAGE_MASK;
544 546
545 } else if (rmap_item->address & UNSTABLE_FLAG) { 547 } else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -708,6 +710,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
708 if (addr == -EFAULT) 710 if (addr == -EFAULT)
709 goto out; 711 goto out;
710 712
713 BUG_ON(PageTransCompound(page));
711 ptep = page_check_address(page, mm, addr, &ptl, 0); 714 ptep = page_check_address(page, mm, addr, &ptl, 0);
712 if (!ptep) 715 if (!ptep)
713 goto out; 716 goto out;
@@ -718,7 +721,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
718 swapped = PageSwapCache(page); 721 swapped = PageSwapCache(page);
719 flush_cache_page(vma, addr, page_to_pfn(page)); 722 flush_cache_page(vma, addr, page_to_pfn(page));
720 /* 723 /*
721 * Ok this is tricky, when get_user_pages_fast() run it doesnt 724 * Ok this is tricky, when get_user_pages_fast() run it doesn't
722 * take any lock, therefore the check that we are going to make 725 * take any lock, therefore the check that we are going to make
723 * with the pagecount against the mapcount is racey and 726 * with the pagecount against the mapcount is racey and
724 * O_DIRECT can happen right after the check. 727 * O_DIRECT can happen right after the check.
@@ -783,6 +786,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
783 goto out; 786 goto out;
784 787
785 pmd = pmd_offset(pud, addr); 788 pmd = pmd_offset(pud, addr);
789 BUG_ON(pmd_trans_huge(*pmd));
786 if (!pmd_present(*pmd)) 790 if (!pmd_present(*pmd))
787 goto out; 791 goto out;
788 792
@@ -800,6 +804,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
800 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); 804 set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
801 805
802 page_remove_rmap(page); 806 page_remove_rmap(page);
807 if (!page_mapped(page))
808 try_to_free_swap(page);
803 put_page(page); 809 put_page(page);
804 810
805 pte_unmap_unlock(ptep, ptl); 811 pte_unmap_unlock(ptep, ptl);
@@ -808,6 +814,33 @@ out:
808 return err; 814 return err;
809} 815}
810 816
817static int page_trans_compound_anon_split(struct page *page)
818{
819 int ret = 0;
820 struct page *transhuge_head = page_trans_compound_anon(page);
821 if (transhuge_head) {
822 /* Get the reference on the head to split it. */
823 if (get_page_unless_zero(transhuge_head)) {
824 /*
825 * Recheck we got the reference while the head
826 * was still anonymous.
827 */
828 if (PageAnon(transhuge_head))
829 ret = split_huge_page(transhuge_head);
830 else
831 /*
832 * Retry later if split_huge_page run
833 * from under us.
834 */
835 ret = 1;
836 put_page(transhuge_head);
837 } else
838 /* Retry later if split_huge_page run from under us. */
839 ret = 1;
840 }
841 return ret;
842}
843
811/* 844/*
812 * try_to_merge_one_page - take two pages and merge them into one 845 * try_to_merge_one_page - take two pages and merge them into one
813 * @vma: the vma that holds the pte pointing to page 846 * @vma: the vma that holds the pte pointing to page
@@ -828,6 +861,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
828 861
829 if (!(vma->vm_flags & VM_MERGEABLE)) 862 if (!(vma->vm_flags & VM_MERGEABLE))
830 goto out; 863 goto out;
864 if (PageTransCompound(page) && page_trans_compound_anon_split(page))
865 goto out;
866 BUG_ON(PageTransCompound(page));
831 if (!PageAnon(page)) 867 if (!PageAnon(page))
832 goto out; 868 goto out;
833 869
@@ -900,7 +936,8 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
900 goto out; 936 goto out;
901 937
902 /* Must get reference to anon_vma while still holding mmap_sem */ 938 /* Must get reference to anon_vma while still holding mmap_sem */
903 hold_anon_vma(rmap_item, vma->anon_vma); 939 rmap_item->anon_vma = vma->anon_vma;
940 get_anon_vma(vma->anon_vma);
904out: 941out:
905 up_read(&mm->mmap_sem); 942 up_read(&mm->mmap_sem);
906 return err; 943 return err;
@@ -1247,12 +1284,30 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
1247 1284
1248 slot = ksm_scan.mm_slot; 1285 slot = ksm_scan.mm_slot;
1249 if (slot == &ksm_mm_head) { 1286 if (slot == &ksm_mm_head) {
1287 /*
1288 * A number of pages can hang around indefinitely on per-cpu
1289 * pagevecs, raised page count preventing write_protect_page
1290 * from merging them. Though it doesn't really matter much,
1291 * it is puzzling to see some stuck in pages_volatile until
1292 * other activity jostles them out, and they also prevented
1293 * LTP's KSM test from succeeding deterministically; so drain
1294 * them here (here rather than on entry to ksm_do_scan(),
1295 * so we don't IPI too often when pages_to_scan is set low).
1296 */
1297 lru_add_drain_all();
1298
1250 root_unstable_tree = RB_ROOT; 1299 root_unstable_tree = RB_ROOT;
1251 1300
1252 spin_lock(&ksm_mmlist_lock); 1301 spin_lock(&ksm_mmlist_lock);
1253 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); 1302 slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list);
1254 ksm_scan.mm_slot = slot; 1303 ksm_scan.mm_slot = slot;
1255 spin_unlock(&ksm_mmlist_lock); 1304 spin_unlock(&ksm_mmlist_lock);
1305 /*
1306 * Although we tested list_empty() above, a racing __ksm_exit
1307 * of the last mm on the list may have removed it since then.
1308 */
1309 if (slot == &ksm_mm_head)
1310 return NULL;
1256next_mm: 1311next_mm:
1257 ksm_scan.address = 0; 1312 ksm_scan.address = 0;
1258 ksm_scan.rmap_list = &slot->rmap_list; 1313 ksm_scan.rmap_list = &slot->rmap_list;
@@ -1277,7 +1332,13 @@ next_mm:
1277 if (ksm_test_exit(mm)) 1332 if (ksm_test_exit(mm))
1278 break; 1333 break;
1279 *page = follow_page(vma, ksm_scan.address, FOLL_GET); 1334 *page = follow_page(vma, ksm_scan.address, FOLL_GET);
1280 if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { 1335 if (IS_ERR_OR_NULL(*page)) {
1336 ksm_scan.address += PAGE_SIZE;
1337 cond_resched();
1338 continue;
1339 }
1340 if (PageAnon(*page) ||
1341 page_trans_compound_anon(*page)) {
1281 flush_anon_page(vma, *page, ksm_scan.address); 1342 flush_anon_page(vma, *page, ksm_scan.address);
1282 flush_dcache_page(*page); 1343 flush_dcache_page(*page);
1283 rmap_item = get_next_rmap_item(slot, 1344 rmap_item = get_next_rmap_item(slot,
@@ -1291,8 +1352,7 @@ next_mm:
1291 up_read(&mm->mmap_sem); 1352 up_read(&mm->mmap_sem);
1292 return rmap_item; 1353 return rmap_item;
1293 } 1354 }
1294 if (!IS_ERR_OR_NULL(*page)) 1355 put_page(*page);
1295 put_page(*page);
1296 ksm_scan.address += PAGE_SIZE; 1356 ksm_scan.address += PAGE_SIZE;
1297 cond_resched(); 1357 cond_resched();
1298 } 1358 }
@@ -1352,7 +1412,7 @@ static void ksm_do_scan(unsigned int scan_npages)
1352 struct rmap_item *rmap_item; 1412 struct rmap_item *rmap_item;
1353 struct page *uninitialized_var(page); 1413 struct page *uninitialized_var(page);
1354 1414
1355 while (scan_npages--) { 1415 while (scan_npages-- && likely(!freezing(current))) {
1356 cond_resched(); 1416 cond_resched();
1357 rmap_item = scan_get_next_rmap_item(&page); 1417 rmap_item = scan_get_next_rmap_item(&page);
1358 if (!rmap_item) 1418 if (!rmap_item)
@@ -1370,6 +1430,7 @@ static int ksmd_should_run(void)
1370 1430
1371static int ksm_scan_thread(void *nothing) 1431static int ksm_scan_thread(void *nothing)
1372{ 1432{
1433 set_freezable();
1373 set_user_nice(current, 5); 1434 set_user_nice(current, 5);
1374 1435
1375 while (!kthread_should_stop()) { 1436 while (!kthread_should_stop()) {
@@ -1378,11 +1439,13 @@ static int ksm_scan_thread(void *nothing)
1378 ksm_do_scan(ksm_thread_pages_to_scan); 1439 ksm_do_scan(ksm_thread_pages_to_scan);
1379 mutex_unlock(&ksm_thread_mutex); 1440 mutex_unlock(&ksm_thread_mutex);
1380 1441
1442 try_to_freeze();
1443
1381 if (ksmd_should_run()) { 1444 if (ksmd_should_run()) {
1382 schedule_timeout_interruptible( 1445 schedule_timeout_interruptible(
1383 msecs_to_jiffies(ksm_thread_sleep_millisecs)); 1446 msecs_to_jiffies(ksm_thread_sleep_millisecs));
1384 } else { 1447 } else {
1385 wait_event_interruptible(ksm_thread_wait, 1448 wait_event_freezable(ksm_thread_wait,
1386 ksmd_should_run() || kthread_should_stop()); 1449 ksmd_should_run() || kthread_should_stop());
1387 } 1450 }
1388 } 1451 }
@@ -1724,8 +1787,13 @@ static int ksm_memory_callback(struct notifier_block *self,
1724 /* 1787 /*
1725 * Keep it very simple for now: just lock out ksmd and 1788 * Keep it very simple for now: just lock out ksmd and
1726 * MADV_UNMERGEABLE while any memory is going offline. 1789 * MADV_UNMERGEABLE while any memory is going offline.
1790 * mutex_lock_nested() is necessary because lockdep was alarmed
1791 * that here we take ksm_thread_mutex inside notifier chain
1792 * mutex, and later take notifier chain mutex inside
1793 * ksm_thread_mutex to unlock it. But that's safe because both
1794 * are inside mem_hotplug_mutex.
1727 */ 1795 */
1728 mutex_lock(&ksm_thread_mutex); 1796 mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING);
1729 break; 1797 break;
1730 1798
1731 case MEM_OFFLINE: 1799 case MEM_OFFLINE:
@@ -1833,9 +1901,11 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
1833 if (ksm_run != flags) { 1901 if (ksm_run != flags) {
1834 ksm_run = flags; 1902 ksm_run = flags;
1835 if (flags & KSM_RUN_UNMERGE) { 1903 if (flags & KSM_RUN_UNMERGE) {
1836 current->flags |= PF_OOM_ORIGIN; 1904 int oom_score_adj;
1905
1906 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1837 err = unmerge_and_remove_all_rmap_items(); 1907 err = unmerge_and_remove_all_rmap_items();
1838 current->flags &= ~PF_OOM_ORIGIN; 1908 test_set_oom_score_adj(oom_score_adj);
1839 if (err) { 1909 if (err) {
1840 ksm_run = KSM_RUN_STOP; 1910 ksm_run = KSM_RUN_STOP;
1841 count = err; 1911 count = err;
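ksmd now cooperates with system suspend: set_freezable() marks the kthread, try_to_freeze() gives the freezer a safe point after each batch, and wait_event_freezable() replaces wait_event_interruptible() so an idle ksmd does not block freezing. The generic shape of such a freezable scanner thread, sketched independently of KSM's details (work_available(), do_one_batch() and scan_wait are placeholders, not real symbols):

static int scanner_thread(void *unused)
{
	set_freezable();

	while (!kthread_should_stop()) {
		if (work_available())
			do_one_batch();		/* hypothetical helpers */

		try_to_freeze();

		wait_event_freezable(scan_wait,
				     work_available() || kthread_should_stop());
	}
	return 0;
}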
diff --git a/mm/maccess.c b/mm/maccess.c
index 4e348dbaecd7..4cee182ab5f3 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,9 +1,9 @@
1/* 1/*
2 * Access kernel memory without faulting. 2 * Access kernel memory without faulting.
3 */ 3 */
4#include <linux/uaccess.h>
5#include <linux/module.h> 4#include <linux/module.h>
6#include <linux/mm.h> 5#include <linux/mm.h>
6#include <linux/uaccess.h>
7 7
8/** 8/**
9 * probe_kernel_read(): safely attempt to read from a location 9 * probe_kernel_read(): safely attempt to read from a location
@@ -15,10 +15,10 @@
15 * happens, handle that and return -EFAULT. 15 * happens, handle that and return -EFAULT.
16 */ 16 */
17 17
18long __weak probe_kernel_read(void *dst, void *src, size_t size) 18long __weak probe_kernel_read(void *dst, const void *src, size_t size)
19 __attribute__((alias("__probe_kernel_read"))); 19 __attribute__((alias("__probe_kernel_read")));
20 20
21long __probe_kernel_read(void *dst, void *src, size_t size) 21long __probe_kernel_read(void *dst, const void *src, size_t size)
22{ 22{
23 long ret; 23 long ret;
24 mm_segment_t old_fs = get_fs(); 24 mm_segment_t old_fs = get_fs();
@@ -43,10 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read);
43 * Safely write to address @dst from the buffer at @src. If a kernel fault 43 * Safely write to address @dst from the buffer at @src. If a kernel fault
44 * happens, handle that and return -EFAULT. 44 * happens, handle that and return -EFAULT.
45 */ 45 */
46long __weak probe_kernel_write(void *dst, void *src, size_t size) 46long __weak probe_kernel_write(void *dst, const void *src, size_t size)
47 __attribute__((alias("__probe_kernel_write"))); 47 __attribute__((alias("__probe_kernel_write")));
48 48
49long __probe_kernel_write(void *dst, void *src, size_t size) 49long __probe_kernel_write(void *dst, const void *src, size_t size)
50{ 50{
51 long ret; 51 long ret;
52 mm_segment_t old_fs = get_fs(); 52 mm_segment_t old_fs = get_fs();
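Constifying the source pointer lets probe_kernel_read()/probe_kernel_write() accept pointers to const data without casts; behaviour is unchanged: they return 0 on success and -EFAULT if the access faults. Typical (illustrative) use:

	unsigned long val;

	if (probe_kernel_read(&val, (const void *)addr, sizeof(val)))
		return -EFAULT;		/* address was not safely readable */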
diff --git a/mm/madvise.c b/mm/madvise.c
index 319528b8db74..2221491ed503 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -71,6 +71,12 @@ static long madvise_behavior(struct vm_area_struct * vma,
71 if (error) 71 if (error)
72 goto out; 72 goto out;
73 break; 73 break;
74 case MADV_HUGEPAGE:
75 case MADV_NOHUGEPAGE:
76 error = hugepage_madvise(vma, &new_flags, behavior);
77 if (error)
78 goto out;
79 break;
74 } 80 }
75 81
76 if (new_flags == vma->vm_flags) { 82 if (new_flags == vma->vm_flags) {
@@ -283,6 +289,10 @@ madvise_behavior_valid(int behavior)
283 case MADV_MERGEABLE: 289 case MADV_MERGEABLE:
284 case MADV_UNMERGEABLE: 290 case MADV_UNMERGEABLE:
285#endif 291#endif
292#ifdef CONFIG_TRANSPARENT_HUGEPAGE
293 case MADV_HUGEPAGE:
294 case MADV_NOHUGEPAGE:
295#endif
286 return 1; 296 return 1;
287 297
288 default: 298 default:
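With MADV_HUGEPAGE and MADV_NOHUGEPAGE accepted by madvise_behavior(), userspace can opt a mapping in or out of transparent hugepages per VMA. A minimal userspace sketch, assuming the installed headers export MADV_HUGEPAGE (otherwise the constant would have to come from the kernel's uapi headers):

#include <stdlib.h>
#include <sys/mman.h>

int request_thp(void *buf, size_t len)
{
	/* advisory only: the kernel may still decline to use huge pages */
	return madvise(buf, len, MADV_HUGEPAGE);
}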
diff --git a/mm/memblock.c b/mm/memblock.c
index 43840b305ecb..a0562d1a6ad4 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -11,446 +11,634 @@
11 */ 11 */
12 12
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/slab.h>
14#include <linux/init.h> 15#include <linux/init.h>
15#include <linux/bitops.h> 16#include <linux/bitops.h>
17#include <linux/poison.h>
18#include <linux/pfn.h>
19#include <linux/debugfs.h>
20#include <linux/seq_file.h>
16#include <linux/memblock.h> 21#include <linux/memblock.h>
17 22
18#define MEMBLOCK_ALLOC_ANYWHERE 0 23struct memblock memblock __initdata_memblock;
19 24
20struct memblock memblock; 25int memblock_debug __initdata_memblock;
26int memblock_can_resize __initdata_memblock;
27static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
28static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
21 29
22static int memblock_debug; 30/* inline so we don't get a warning when pr_debug is compiled out */
31static inline const char *memblock_type_name(struct memblock_type *type)
32{
33 if (type == &memblock.memory)
34 return "memory";
35 else if (type == &memblock.reserved)
36 return "reserved";
37 else
38 return "unknown";
39}
23 40
24static int __init early_memblock(char *p) 41/*
42 * Address comparison utilities
43 */
44
45static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size)
25{ 46{
26 if (p && strstr(p, "debug")) 47 return addr & ~(size - 1);
27 memblock_debug = 1;
28 return 0;
29} 48}
30early_param("memblock", early_memblock);
31 49
32static void memblock_dump(struct memblock_region *region, char *name) 50static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size)
33{ 51{
34 unsigned long long base, size; 52 return (addr + (size - 1)) & ~(size - 1);
35 int i; 53}
36 54
37 pr_info(" %s.cnt = 0x%lx\n", name, region->cnt); 55static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
56 phys_addr_t base2, phys_addr_t size2)
57{
58 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
59}
38 60
39 for (i = 0; i < region->cnt; i++) { 61long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
40 base = region->region[i].base; 62{
41 size = region->region[i].size; 63 unsigned long i;
42 64
43 pr_info(" %s[0x%x]\t0x%016llx - 0x%016llx, 0x%llx bytes\n", 65 for (i = 0; i < type->cnt; i++) {
44 name, i, base, base + size - 1, size); 66 phys_addr_t rgnbase = type->regions[i].base;
67 phys_addr_t rgnsize = type->regions[i].size;
68 if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
69 break;
45 } 70 }
71
72 return (i < type->cnt) ? i : -1;
46} 73}
47 74
48void memblock_dump_all(void) 75/*
76 * Find, allocate, deallocate or reserve unreserved regions. All allocations
77 * are top-down.
78 */
79
80static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end,
81 phys_addr_t size, phys_addr_t align)
49{ 82{
50 if (!memblock_debug) 83 phys_addr_t base, res_base;
51 return; 84 long j;
52 85
53 pr_info("MEMBLOCK configuration:\n"); 86 /* In case, huge size is requested */
54 pr_info(" rmo_size = 0x%llx\n", (unsigned long long)memblock.rmo_size); 87 if (end < size)
55 pr_info(" memory.size = 0x%llx\n", (unsigned long long)memblock.memory.size); 88 return MEMBLOCK_ERROR;
56 89
57 memblock_dump(&memblock.memory, "memory"); 90 base = memblock_align_down((end - size), align);
58 memblock_dump(&memblock.reserved, "reserved"); 91
92 /* Prevent allocations returning 0 as it's also used to
93 * indicate an allocation failure
94 */
95 if (start == 0)
96 start = PAGE_SIZE;
97
98 while (start <= base) {
99 j = memblock_overlaps_region(&memblock.reserved, base, size);
100 if (j < 0)
101 return base;
102 res_base = memblock.reserved.regions[j].base;
103 if (res_base < size)
104 break;
105 base = memblock_align_down(res_base - size, align);
106 }
107
108 return MEMBLOCK_ERROR;
59} 109}
60 110
61static unsigned long memblock_addrs_overlap(u64 base1, u64 size1, u64 base2, 111static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
62 u64 size2) 112 phys_addr_t align, phys_addr_t start, phys_addr_t end)
63{ 113{
64 return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); 114 long i;
115
116 BUG_ON(0 == size);
117
118 /* Pump up max_addr */
119 if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
120 end = memblock.current_limit;
121
122 /* We do a top-down search, this tends to limit memory
123 * fragmentation by keeping early boot allocs near the
124 * top of memory
125 */
126 for (i = memblock.memory.cnt - 1; i >= 0; i--) {
127 phys_addr_t memblockbase = memblock.memory.regions[i].base;
128 phys_addr_t memblocksize = memblock.memory.regions[i].size;
129 phys_addr_t bottom, top, found;
130
131 if (memblocksize < size)
132 continue;
133 if ((memblockbase + memblocksize) <= start)
134 break;
135 bottom = max(memblockbase, start);
136 top = min(memblockbase + memblocksize, end);
137 if (bottom >= top)
138 continue;
139 found = memblock_find_region(bottom, top, size, align);
140 if (found != MEMBLOCK_ERROR)
141 return found;
142 }
143 return MEMBLOCK_ERROR;
65} 144}
66 145
67static long memblock_addrs_adjacent(u64 base1, u64 size1, u64 base2, u64 size2) 146/*
147 * Find a free area with specified alignment in a specific range.
148 */
149u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align)
68{ 150{
69 if (base2 == base1 + size1) 151 return memblock_find_base(size, align, start, end);
70 return 1; 152}
71 else if (base1 == base2 + size2)
72 return -1;
73 153
74 return 0; 154/*
155 * Free memblock.reserved.regions
156 */
157int __init_memblock memblock_free_reserved_regions(void)
158{
159 if (memblock.reserved.regions == memblock_reserved_init_regions)
160 return 0;
161
162 return memblock_free(__pa(memblock.reserved.regions),
163 sizeof(struct memblock_region) * memblock.reserved.max);
75} 164}
76 165
77static long memblock_regions_adjacent(struct memblock_region *rgn, 166/*
78 unsigned long r1, unsigned long r2) 167 * Reserve memblock.reserved.regions
168 */
169int __init_memblock memblock_reserve_reserved_regions(void)
79{ 170{
80 u64 base1 = rgn->region[r1].base; 171 if (memblock.reserved.regions == memblock_reserved_init_regions)
81 u64 size1 = rgn->region[r1].size; 172 return 0;
82 u64 base2 = rgn->region[r2].base;
83 u64 size2 = rgn->region[r2].size;
84 173
85 return memblock_addrs_adjacent(base1, size1, base2, size2); 174 return memblock_reserve(__pa(memblock.reserved.regions),
175 sizeof(struct memblock_region) * memblock.reserved.max);
86} 176}
87 177
88static void memblock_remove_region(struct memblock_region *rgn, unsigned long r) 178static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
89{ 179{
90 unsigned long i; 180 unsigned long i;
91 181
92 for (i = r; i < rgn->cnt - 1; i++) { 182 for (i = r; i < type->cnt - 1; i++) {
93 rgn->region[i].base = rgn->region[i + 1].base; 183 type->regions[i].base = type->regions[i + 1].base;
94 rgn->region[i].size = rgn->region[i + 1].size; 184 type->regions[i].size = type->regions[i + 1].size;
95 } 185 }
96 rgn->cnt--; 186 type->cnt--;
97}
98 187
99/* Assumption: base addr of region 1 < base addr of region 2 */ 188 /* Special case for empty arrays */
100static void memblock_coalesce_regions(struct memblock_region *rgn, 189 if (type->cnt == 0) {
101 unsigned long r1, unsigned long r2) 190 type->cnt = 1;
102{ 191 type->regions[0].base = 0;
103 rgn->region[r1].size += rgn->region[r2].size; 192 type->regions[0].size = 0;
104 memblock_remove_region(rgn, r2); 193 }
105} 194}
106 195
107void __init memblock_init(void) 196/* Defined below but needed now */
197static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size);
198
199static int __init_memblock memblock_double_array(struct memblock_type *type)
108{ 200{
109 /* Create a dummy zero size MEMBLOCK which will get coalesced away later. 201 struct memblock_region *new_array, *old_array;
110 * This simplifies the memblock_add() code below... 202 phys_addr_t old_size, new_size, addr;
203 int use_slab = slab_is_available();
204
205 /* We don't allow resizing until we know about the reserved regions
206 * of memory that aren't suitable for allocation
111 */ 207 */
112 memblock.memory.region[0].base = 0; 208 if (!memblock_can_resize)
113 memblock.memory.region[0].size = 0; 209 return -1;
114 memblock.memory.cnt = 1;
115 210
116 /* Ditto. */ 211 /* Calculate new doubled size */
117 memblock.reserved.region[0].base = 0; 212 old_size = type->max * sizeof(struct memblock_region);
118 memblock.reserved.region[0].size = 0; 213 new_size = old_size << 1;
119 memblock.reserved.cnt = 1; 214
120} 215 /* Try to find some space for it.
216 *
 217 * WARNING: We assume that we either use slab (once slab_is_available()) or
 218 * we use MEMBLOCK for allocations. That means that this is unsafe to use
219 * when bootmem is currently active (unless bootmem itself is implemented
220 * on top of MEMBLOCK which isn't the case yet)
221 *
222 * This should however not be an issue for now, as we currently only
223 * call into MEMBLOCK while it's still active, or much later when slab is
224 * active for memory hotplug operations
225 */
226 if (use_slab) {
227 new_array = kmalloc(new_size, GFP_KERNEL);
228 addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array);
229 } else
230 addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE);
231 if (addr == MEMBLOCK_ERROR) {
232 pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
233 memblock_type_name(type), type->max, type->max * 2);
234 return -1;
235 }
236 new_array = __va(addr);
121 237
122void __init memblock_analyze(void) 238 memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
123{ 239 memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
124 int i;
125 240
126 memblock.memory.size = 0; 241 /* Found space, we now need to move the array over before
242 * we add the reserved region since it may be our reserved
243 * array itself that is full.
244 */
245 memcpy(new_array, type->regions, old_size);
246 memset(new_array + type->max, 0, old_size);
247 old_array = type->regions;
248 type->regions = new_array;
249 type->max <<= 1;
250
251 /* If we use SLAB that's it, we are done */
252 if (use_slab)
253 return 0;
127 254
128 for (i = 0; i < memblock.memory.cnt; i++) 255 /* Add the new reserved region now. Should not fail ! */
129 memblock.memory.size += memblock.memory.region[i].size; 256 BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size));
257
258 /* If the array wasn't our static init one, then free it. We only do
259 * that before SLAB is available as later on, we don't know whether
260 * to use kfree or free_bootmem_pages(). Shouldn't be a big deal
261 * anyways
262 */
263 if (old_array != memblock_memory_init_regions &&
264 old_array != memblock_reserved_init_regions)
265 memblock_free(__pa(old_array), old_size);
266
267 return 0;
268}
269
270extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
271 phys_addr_t addr2, phys_addr_t size2)
272{
273 return 1;
130} 274}
131 275
132static long memblock_add_region(struct memblock_region *rgn, u64 base, u64 size) 276static long __init_memblock memblock_add_region(struct memblock_type *type,
277 phys_addr_t base, phys_addr_t size)
133{ 278{
134 unsigned long coalesced = 0; 279 phys_addr_t end = base + size;
135 long adjacent, i; 280 int i, slot = -1;
136 281
137 if ((rgn->cnt == 1) && (rgn->region[0].size == 0)) { 282 /* First try and coalesce this MEMBLOCK with others */
138 rgn->region[0].base = base; 283 for (i = 0; i < type->cnt; i++) {
139 rgn->region[0].size = size; 284 struct memblock_region *rgn = &type->regions[i];
140 return 0; 285 phys_addr_t rend = rgn->base + rgn->size;
141 }
142 286
143 /* First try and coalesce this MEMBLOCK with another. */ 287 /* Exit if there's no possible hits */
144 for (i = 0; i < rgn->cnt; i++) { 288 if (rgn->base > end || rgn->size == 0)
145 u64 rgnbase = rgn->region[i].base; 289 break;
146 u64 rgnsize = rgn->region[i].size;
147 290
148 if ((rgnbase == base) && (rgnsize == size)) 291 /* Check if we are fully enclosed within an existing
149 /* Already have this region, so we're done */ 292 * block
293 */
294 if (rgn->base <= base && rend >= end)
150 return 0; 295 return 0;
151 296
152 adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize); 297 /* Check if we overlap or are adjacent with the bottom
153 if (adjacent > 0) { 298 * of a block.
154 rgn->region[i].base -= size; 299 */
155 rgn->region[i].size += size; 300 if (base < rgn->base && end >= rgn->base) {
156 coalesced++; 301 /* If we can't coalesce, create a new block */
157 break; 302 if (!memblock_memory_can_coalesce(base, size,
158 } else if (adjacent < 0) { 303 rgn->base,
159 rgn->region[i].size += size; 304 rgn->size)) {
160 coalesced++; 305 /* Overlap & can't coalesce are mutually
161 break; 306 * exclusive, if you do that, be prepared
307 * for trouble
308 */
309 WARN_ON(end != rgn->base);
310 goto new_block;
311 }
312 /* We extend the bottom of the block down to our
313 * base
314 */
315 rgn->base = base;
316 rgn->size = rend - base;
317
318 /* Return if we have nothing else to allocate
319 * (fully coalesced)
320 */
321 if (rend >= end)
322 return 0;
323
324 /* We continue processing from the end of the
325 * coalesced block.
326 */
327 base = rend;
328 size = end - base;
329 }
330
331 /* Now check if we overlap or are adjacent with the
332 * top of a block
333 */
334 if (base <= rend && end >= rend) {
335 /* If we can't coalesce, create a new block */
336 if (!memblock_memory_can_coalesce(rgn->base,
337 rgn->size,
338 base, size)) {
339 /* Overlap & can't coalesce are mutually
340 * exclusive, if you do that, be prepared
341 * for trouble
342 */
343 WARN_ON(rend != base);
344 goto new_block;
345 }
346 /* We adjust our base down to enclose the
347 * original block and destroy it. It will be
348 * part of our new allocation. Since we've
349 * freed an entry, we know we won't fail
350 * to allocate one later, so we won't risk
351 * losing the original block allocation.
352 */
353 size += (base - rgn->base);
354 base = rgn->base;
355 memblock_remove_region(type, i--);
162 } 356 }
163 } 357 }
164 358
165 if ((i < rgn->cnt - 1) && memblock_regions_adjacent(rgn, i, i+1)) { 359 /* If the array is empty, special case, replace the fake
166 memblock_coalesce_regions(rgn, i, i+1); 360 * filler region and return
167 coalesced++; 361 */
362 if ((type->cnt == 1) && (type->regions[0].size == 0)) {
363 type->regions[0].base = base;
364 type->regions[0].size = size;
365 return 0;
168 } 366 }
169 367
170 if (coalesced) 368 new_block:
171 return coalesced; 369 /* If we are out of space, we fail. It's too late to resize the array
172 if (rgn->cnt >= MAX_MEMBLOCK_REGIONS) 370 * but then this shouldn't have happened in the first place.
371 */
372 if (WARN_ON(type->cnt >= type->max))
173 return -1; 373 return -1;
174 374
175 /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */ 375 /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */
176 for (i = rgn->cnt - 1; i >= 0; i--) { 376 for (i = type->cnt - 1; i >= 0; i--) {
177 if (base < rgn->region[i].base) { 377 if (base < type->regions[i].base) {
178 rgn->region[i+1].base = rgn->region[i].base; 378 type->regions[i+1].base = type->regions[i].base;
179 rgn->region[i+1].size = rgn->region[i].size; 379 type->regions[i+1].size = type->regions[i].size;
180 } else { 380 } else {
181 rgn->region[i+1].base = base; 381 type->regions[i+1].base = base;
182 rgn->region[i+1].size = size; 382 type->regions[i+1].size = size;
383 slot = i + 1;
183 break; 384 break;
184 } 385 }
185 } 386 }
387 if (base < type->regions[0].base) {
388 type->regions[0].base = base;
389 type->regions[0].size = size;
390 slot = 0;
391 }
392 type->cnt++;
186 393
187 if (base < rgn->region[0].base) { 394 /* The array is full ? Try to resize it. If that fails, we undo
188 rgn->region[0].base = base; 395 * our allocation and return an error
189 rgn->region[0].size = size; 396 */
397 if (type->cnt == type->max && memblock_double_array(type)) {
398 BUG_ON(slot < 0);
399 memblock_remove_region(type, slot);
400 return -1;
190 } 401 }
191 rgn->cnt++;
192 402
193 return 0; 403 return 0;
194} 404}
195 405
196long memblock_add(u64 base, u64 size) 406long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
197{ 407{
198 struct memblock_region *_rgn = &memblock.memory; 408 return memblock_add_region(&memblock.memory, base, size);
199
200 /* On pSeries LPAR systems, the first MEMBLOCK is our RMO region. */
201 if (base == 0)
202 memblock.rmo_size = size;
203
204 return memblock_add_region(_rgn, base, size);
205 409
206} 410}
207 411
208static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size) 412static long __init_memblock __memblock_remove(struct memblock_type *type,
413 phys_addr_t base, phys_addr_t size)
209{ 414{
210 u64 rgnbegin, rgnend; 415 phys_addr_t end = base + size;
211 u64 end = base + size;
212 int i; 416 int i;
213 417
214 rgnbegin = rgnend = 0; /* supress gcc warnings */ 418 /* Walk through the array for collisions */
215 419 for (i = 0; i < type->cnt; i++) {
216 /* Find the region where (base, size) belongs to */ 420 struct memblock_region *rgn = &type->regions[i];
217 for (i=0; i < rgn->cnt; i++) { 421 phys_addr_t rend = rgn->base + rgn->size;
218 rgnbegin = rgn->region[i].base;
219 rgnend = rgnbegin + rgn->region[i].size;
220 422
221 if ((rgnbegin <= base) && (end <= rgnend)) 423 /* Nothing more to do, exit */
424 if (rgn->base > end || rgn->size == 0)
222 break; 425 break;
223 }
224 426
225 /* Didn't find the region */ 427 /* If we fully enclose the block, drop it */
226 if (i == rgn->cnt) 428 if (base <= rgn->base && end >= rend) {
227 return -1; 429 memblock_remove_region(type, i--);
430 continue;
431 }
228 432
229 /* Check to see if we are removing entire region */ 433 /* If we are fully enclosed within a block
230 if ((rgnbegin == base) && (rgnend == end)) { 434 * then we need to split it and we are done
231 memblock_remove_region(rgn, i); 435 */
232 return 0; 436 if (base > rgn->base && end < rend) {
233 } 437 rgn->size = base - rgn->base;
438 if (!memblock_add_region(type, end, rend - end))
439 return 0;
440 /* Failure to split is bad, we at least
441 * restore the block before erroring
442 */
443 rgn->size = rend - rgn->base;
444 WARN_ON(1);
445 return -1;
446 }
234 447
235 /* Check to see if region is matching at the front */ 448 /* Check if we need to trim the bottom of a block */
236 if (rgnbegin == base) { 449 if (rgn->base < end && rend > end) {
237 rgn->region[i].base = end; 450 rgn->size -= end - rgn->base;
238 rgn->region[i].size -= size; 451 rgn->base = end;
239 return 0; 452 break;
240 } 453 }
241 454
242 /* Check to see if the region is matching at the end */ 455 /* And check if we need to trim the top of a block */
243 if (rgnend == end) { 456 if (base < rend)
244 rgn->region[i].size -= size; 457 rgn->size -= rend - base;
245 return 0;
246 }
247 458
248 /* 459 }
249 * We need to split the entry - adjust the current one to the 460 return 0;
250 * beginging of the hole and add the region after hole.
251 */
252 rgn->region[i].size = base - rgn->region[i].base;
253 return memblock_add_region(rgn, end, rgnend - end);
254} 461}
255 462
256long memblock_remove(u64 base, u64 size) 463long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
257{ 464{
258 return __memblock_remove(&memblock.memory, base, size); 465 return __memblock_remove(&memblock.memory, base, size);
259} 466}
260 467
261long __init memblock_free(u64 base, u64 size) 468long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
262{ 469{
263 return __memblock_remove(&memblock.reserved, base, size); 470 return __memblock_remove(&memblock.reserved, base, size);
264} 471}
265 472
266long __init memblock_reserve(u64 base, u64 size) 473long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
267{ 474{
268 struct memblock_region *_rgn = &memblock.reserved; 475 struct memblock_type *_rgn = &memblock.reserved;
269 476
270 BUG_ON(0 == size); 477 BUG_ON(0 == size);
271 478
272 return memblock_add_region(_rgn, base, size); 479 return memblock_add_region(_rgn, base, size);
273} 480}
274 481
275long memblock_overlaps_region(struct memblock_region *rgn, u64 base, u64 size) 482phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
276{ 483{
277 unsigned long i; 484 phys_addr_t found;
278 485
279 for (i = 0; i < rgn->cnt; i++) { 486 /* We align the size to limit fragmentation. Without this, a lot of
280 u64 rgnbase = rgn->region[i].base; 487 * small allocs quickly eat up the whole reserve array on sparc
281 u64 rgnsize = rgn->region[i].size; 488 */
282 if (memblock_addrs_overlap(base, size, rgnbase, rgnsize)) 489 size = memblock_align_up(size, align);
283 break;
284 }
285 490
286 return (i < rgn->cnt) ? i : -1; 491 found = memblock_find_base(size, align, 0, max_addr);
492 if (found != MEMBLOCK_ERROR &&
493 !memblock_add_region(&memblock.reserved, found, size))
494 return found;
495
496 return 0;
287} 497}
288 498
289static u64 memblock_align_down(u64 addr, u64 size) 499phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
290{ 500{
291 return addr & ~(size - 1); 501 phys_addr_t alloc;
502
503 alloc = __memblock_alloc_base(size, align, max_addr);
504
505 if (alloc == 0)
506 panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
507 (unsigned long long) size, (unsigned long long) max_addr);
508
509 return alloc;
292} 510}
293 511
294static u64 memblock_align_up(u64 addr, u64 size) 512phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
295{ 513{
296 return (addr + (size - 1)) & ~(size - 1); 514 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
297} 515}
298 516
299static u64 __init memblock_alloc_nid_unreserved(u64 start, u64 end, 517
300 u64 size, u64 align) 518/*
519 * Additional node-local allocators. Search for node memory is bottom up
520 * and walks memblock regions within that node bottom-up as well, but allocation
 521 * within a memblock region is top-down. XXX I plan to fix that at some stage
522 *
523 * WARNING: Only available after early_node_map[] has been populated,
524 * on some architectures, that is after all the calls to add_active_range()
525 * have been done to populate it.
526 */
527
528phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid)
301{ 529{
302 u64 base, res_base; 530#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
303 long j; 531 /*
 532 * This code originates from sparc which really wants us to walk by addresses
533 * and returns the nid. This is not very convenient for early_pfn_map[] users
534 * as the map isn't sorted yet, and it really wants to be walked by nid.
535 *
536 * For now, I implement the inefficient method below which walks the early
537 * map multiple times. Eventually we may want to use an ARCH config option
 538 * to implement a completely different method for both cases.
539 */
540 unsigned long start_pfn, end_pfn;
541 int i;
304 542
305 base = memblock_align_down((end - size), align); 543 for (i = 0; i < MAX_NUMNODES; i++) {
306 while (start <= base) { 544 get_pfn_range_for_nid(i, &start_pfn, &end_pfn);
307 j = memblock_overlaps_region(&memblock.reserved, base, size); 545 if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn))
308 if (j < 0) { 546 continue;
309 /* this area isn't reserved, take it */ 547 *nid = i;
310 if (memblock_add_region(&memblock.reserved, base, size) < 0) 548 return min(end, PFN_PHYS(end_pfn));
311 base = ~(u64)0;
312 return base;
313 }
314 res_base = memblock.reserved.region[j].base;
315 if (res_base < size)
316 break;
317 base = memblock_align_down(res_base - size, align);
318 } 549 }
550#endif
551 *nid = 0;
319 552
320 return ~(u64)0; 553 return end;
321} 554}
322 555
323static u64 __init memblock_alloc_nid_region(struct memblock_property *mp, 556static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp,
324 u64 (*nid_range)(u64, u64, int *), 557 phys_addr_t size,
325 u64 size, u64 align, int nid) 558 phys_addr_t align, int nid)
326{ 559{
327 u64 start, end; 560 phys_addr_t start, end;
328 561
329 start = mp->base; 562 start = mp->base;
330 end = start + mp->size; 563 end = start + mp->size;
331 564
332 start = memblock_align_up(start, align); 565 start = memblock_align_up(start, align);
333 while (start < end) { 566 while (start < end) {
334 u64 this_end; 567 phys_addr_t this_end;
335 int this_nid; 568 int this_nid;
336 569
337 this_end = nid_range(start, end, &this_nid); 570 this_end = memblock_nid_range(start, end, &this_nid);
338 if (this_nid == nid) { 571 if (this_nid == nid) {
339 u64 ret = memblock_alloc_nid_unreserved(start, this_end, 572 phys_addr_t ret = memblock_find_region(start, this_end, size, align);
340 size, align); 573 if (ret != MEMBLOCK_ERROR &&
341 if (ret != ~(u64)0) 574 !memblock_add_region(&memblock.reserved, ret, size))
342 return ret; 575 return ret;
343 } 576 }
344 start = this_end; 577 start = this_end;
345 } 578 }
346 579
347 return ~(u64)0; 580 return MEMBLOCK_ERROR;
348} 581}
349 582
350u64 __init memblock_alloc_nid(u64 size, u64 align, int nid, 583phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
351 u64 (*nid_range)(u64 start, u64 end, int *nid))
352{ 584{
353 struct memblock_region *mem = &memblock.memory; 585 struct memblock_type *mem = &memblock.memory;
354 int i; 586 int i;
355 587
356 BUG_ON(0 == size); 588 BUG_ON(0 == size);
357 589
590 /* We align the size to limit fragmentation. Without this, a lot of
591 * small allocs quickly eat up the whole reserve array on sparc
592 */
358 size = memblock_align_up(size, align); 593 size = memblock_align_up(size, align);
359 594
595 /* We do a bottom-up search for a region with the right
596 * nid since that's easier considering how memblock_nid_range()
597 * works
598 */
360 for (i = 0; i < mem->cnt; i++) { 599 for (i = 0; i < mem->cnt; i++) {
361 u64 ret = memblock_alloc_nid_region(&mem->region[i], 600 phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i],
362 nid_range,
363 size, align, nid); 601 size, align, nid);
364 if (ret != ~(u64)0) 602 if (ret != MEMBLOCK_ERROR)
365 return ret; 603 return ret;
366 } 604 }
367 605
368 return memblock_alloc(size, align); 606 return 0;
369}
370
371u64 __init memblock_alloc(u64 size, u64 align)
372{
373 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
374} 607}
375 608
376u64 __init memblock_alloc_base(u64 size, u64 align, u64 max_addr) 609phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
377{ 610{
378 u64 alloc; 611 phys_addr_t res = memblock_alloc_nid(size, align, nid);
379 612
380 alloc = __memblock_alloc_base(size, align, max_addr); 613 if (res)
381 614 return res;
382 if (alloc == 0) 615 return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
383 panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
384 (unsigned long long) size, (unsigned long long) max_addr);
385
386 return alloc;
387} 616}
388 617
389u64 __init __memblock_alloc_base(u64 size, u64 align, u64 max_addr)
390{
391 long i, j;
392 u64 base = 0;
393 u64 res_base;
394
395 BUG_ON(0 == size);
396
397 size = memblock_align_up(size, align);
398
399 /* On some platforms, make sure we allocate lowmem */
400 /* Note that MEMBLOCK_REAL_LIMIT may be MEMBLOCK_ALLOC_ANYWHERE */
401 if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
402 max_addr = MEMBLOCK_REAL_LIMIT;
403 618
404 for (i = memblock.memory.cnt - 1; i >= 0; i--) { 619/*
405 u64 memblockbase = memblock.memory.region[i].base; 620 * Remaining API functions
406 u64 memblocksize = memblock.memory.region[i].size; 621 */
407
408 if (memblocksize < size)
409 continue;
410 if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
411 base = memblock_align_down(memblockbase + memblocksize - size, align);
412 else if (memblockbase < max_addr) {
413 base = min(memblockbase + memblocksize, max_addr);
414 base = memblock_align_down(base - size, align);
415 } else
416 continue;
417
418 while (base && memblockbase <= base) {
419 j = memblock_overlaps_region(&memblock.reserved, base, size);
420 if (j < 0) {
421 /* this area isn't reserved, take it */
422 if (memblock_add_region(&memblock.reserved, base, size) < 0)
423 return 0;
424 return base;
425 }
426 res_base = memblock.reserved.region[j].base;
427 if (res_base < size)
428 break;
429 base = memblock_align_down(res_base - size, align);
430 }
431 }
432 return 0;
433}
434 622
435/* You must call memblock_analyze() before this. */ 623/* You must call memblock_analyze() before this. */
436u64 __init memblock_phys_mem_size(void) 624phys_addr_t __init memblock_phys_mem_size(void)
437{ 625{
438 return memblock.memory.size; 626 return memblock.memory_size;
439} 627}
440 628
441u64 memblock_end_of_DRAM(void) 629phys_addr_t __init_memblock memblock_end_of_DRAM(void)
442{ 630{
443 int idx = memblock.memory.cnt - 1; 631 int idx = memblock.memory.cnt - 1;
444 632
445 return (memblock.memory.region[idx].base + memblock.memory.region[idx].size); 633 return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size);
446} 634}
447 635
448/* You must call memblock_analyze() after this. */ 636/* You must call memblock_analyze() after this. */
449void __init memblock_enforce_memory_limit(u64 memory_limit) 637void __init memblock_enforce_memory_limit(phys_addr_t memory_limit)
450{ 638{
451 unsigned long i; 639 unsigned long i;
452 u64 limit; 640 phys_addr_t limit;
453 struct memblock_property *p; 641 struct memblock_region *p;
454 642
455 if (!memory_limit) 643 if (!memory_limit)
456 return; 644 return;
@@ -458,24 +646,21 @@ void __init memblock_enforce_memory_limit(u64 memory_limit)
458 /* Truncate the memblock regions to satisfy the memory limit. */ 646 /* Truncate the memblock regions to satisfy the memory limit. */
459 limit = memory_limit; 647 limit = memory_limit;
460 for (i = 0; i < memblock.memory.cnt; i++) { 648 for (i = 0; i < memblock.memory.cnt; i++) {
461 if (limit > memblock.memory.region[i].size) { 649 if (limit > memblock.memory.regions[i].size) {
462 limit -= memblock.memory.region[i].size; 650 limit -= memblock.memory.regions[i].size;
463 continue; 651 continue;
464 } 652 }
465 653
466 memblock.memory.region[i].size = limit; 654 memblock.memory.regions[i].size = limit;
467 memblock.memory.cnt = i + 1; 655 memblock.memory.cnt = i + 1;
468 break; 656 break;
469 } 657 }
470 658
471 if (memblock.memory.region[0].size < memblock.rmo_size)
472 memblock.rmo_size = memblock.memory.region[0].size;
473
474 memory_limit = memblock_end_of_DRAM(); 659 memory_limit = memblock_end_of_DRAM();
475 660
476 /* And truncate any reserves above the limit also. */ 661 /* And truncate any reserves above the limit also. */
477 for (i = 0; i < memblock.reserved.cnt; i++) { 662 for (i = 0; i < memblock.reserved.cnt; i++) {
478 p = &memblock.reserved.region[i]; 663 p = &memblock.reserved.regions[i];
479 664
480 if (p->base > memory_limit) 665 if (p->base > memory_limit)
481 p->size = 0; 666 p->size = 0;
@@ -489,53 +674,190 @@ void __init memblock_enforce_memory_limit(u64 memory_limit)
489 } 674 }
490} 675}
491 676
492int __init memblock_is_reserved(u64 addr) 677static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
678{
679 unsigned int left = 0, right = type->cnt;
680
681 do {
682 unsigned int mid = (right + left) / 2;
683
684 if (addr < type->regions[mid].base)
685 right = mid;
686 else if (addr >= (type->regions[mid].base +
687 type->regions[mid].size))
688 left = mid + 1;
689 else
690 return mid;
691 } while (left < right);
692 return -1;
693}
694
695int __init memblock_is_reserved(phys_addr_t addr)
696{
697 return memblock_search(&memblock.reserved, addr) != -1;
698}
699
700int __init_memblock memblock_is_memory(phys_addr_t addr)
701{
702 return memblock_search(&memblock.memory, addr) != -1;
703}
704
705int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
706{
707 int idx = memblock_search(&memblock.memory, base);
708
709 if (idx == -1)
710 return 0;
711 return memblock.memory.regions[idx].base <= base &&
712 (memblock.memory.regions[idx].base +
713 memblock.memory.regions[idx].size) >= (base + size);
714}
715
716int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
717{
718 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
719}
720
721
722void __init_memblock memblock_set_current_limit(phys_addr_t limit)
723{
724 memblock.current_limit = limit;
725}
726
727static void __init_memblock memblock_dump(struct memblock_type *region, char *name)
493{ 728{
729 unsigned long long base, size;
494 int i; 730 int i;
495 731
496 for (i = 0; i < memblock.reserved.cnt; i++) { 732 pr_info(" %s.cnt = 0x%lx\n", name, region->cnt);
497 u64 upper = memblock.reserved.region[i].base + 733
498 memblock.reserved.region[i].size - 1; 734 for (i = 0; i < region->cnt; i++) {
499 if ((addr >= memblock.reserved.region[i].base) && (addr <= upper)) 735 base = region->regions[i].base;
500 return 1; 736 size = region->regions[i].size;
737
738 pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n",
739 name, i, base, base + size - 1, size);
501 } 740 }
502 return 0;
503} 741}
504 742
505int memblock_is_region_reserved(u64 base, u64 size) 743void __init_memblock memblock_dump_all(void)
506{ 744{
507 return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; 745 if (!memblock_debug)
746 return;
747
748 pr_info("MEMBLOCK configuration:\n");
749 pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size);
750
751 memblock_dump(&memblock.memory, "memory");
752 memblock_dump(&memblock.reserved, "reserved");
508} 753}
509 754
510/* 755void __init memblock_analyze(void)
511 * Given a <base, len>, find which memory regions belong to this range.
512 * Adjust the request and return a contiguous chunk.
513 */
514int memblock_find(struct memblock_property *res)
515{ 756{
516 int i; 757 int i;
517 u64 rstart, rend;
518 758
519 rstart = res->base; 759 /* Check marker in the unused last array entry */
520 rend = rstart + res->size - 1; 760 WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
761 != (phys_addr_t)RED_INACTIVE);
762 WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
763 != (phys_addr_t)RED_INACTIVE);
521 764
522 for (i = 0; i < memblock.memory.cnt; i++) { 765 memblock.memory_size = 0;
523 u64 start = memblock.memory.region[i].base;
524 u64 end = start + memblock.memory.region[i].size - 1;
525 766
526 if (start > rend) 767 for (i = 0; i < memblock.memory.cnt; i++)
527 return -1; 768 memblock.memory_size += memblock.memory.regions[i].size;
769
770 /* We allow resizing from there */
771 memblock_can_resize = 1;
772}
773
774void __init memblock_init(void)
775{
776 static int init_done __initdata = 0;
777
778 if (init_done)
779 return;
780 init_done = 1;
781
782 /* Hookup the initial arrays */
783 memblock.memory.regions = memblock_memory_init_regions;
784 memblock.memory.max = INIT_MEMBLOCK_REGIONS;
785 memblock.reserved.regions = memblock_reserved_init_regions;
786 memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
787
788 /* Write a marker in the unused last array entry */
789 memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
790 memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
791
792 /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
793 * This simplifies the memblock_add() code below...
794 */
795 memblock.memory.regions[0].base = 0;
796 memblock.memory.regions[0].size = 0;
797 memblock.memory.cnt = 1;
798
799 /* Ditto. */
800 memblock.reserved.regions[0].base = 0;
801 memblock.reserved.regions[0].size = 0;
802 memblock.reserved.cnt = 1;
803
804 memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE;
805}
806
807static int __init early_memblock(char *p)
808{
809 if (p && strstr(p, "debug"))
810 memblock_debug = 1;
811 return 0;
812}
813early_param("memblock", early_memblock);
814
815#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK)
816
817static int memblock_debug_show(struct seq_file *m, void *private)
818{
819 struct memblock_type *type = m->private;
820 struct memblock_region *reg;
821 int i;
822
823 for (i = 0; i < type->cnt; i++) {
824 reg = &type->regions[i];
825 seq_printf(m, "%4d: ", i);
826 if (sizeof(phys_addr_t) == 4)
827 seq_printf(m, "0x%08lx..0x%08lx\n",
828 (unsigned long)reg->base,
829 (unsigned long)(reg->base + reg->size - 1));
830 else
831 seq_printf(m, "0x%016llx..0x%016llx\n",
832 (unsigned long long)reg->base,
833 (unsigned long long)(reg->base + reg->size - 1));
528 834
529 if ((end >= rstart) && (start < rend)) {
530 /* adjust the request */
531 if (rstart < start)
532 rstart = start;
533 if (rend > end)
534 rend = end;
535 res->base = rstart;
536 res->size = rend - rstart + 1;
537 return 0;
538 }
539 } 835 }
540 return -1; 836 return 0;
837}
838
839static int memblock_debug_open(struct inode *inode, struct file *file)
840{
841 return single_open(file, memblock_debug_show, inode->i_private);
541} 842}
843
844static const struct file_operations memblock_debug_fops = {
845 .open = memblock_debug_open,
846 .read = seq_read,
847 .llseek = seq_lseek,
848 .release = single_release,
849};
850
851static int __init memblock_init_debugfs(void)
852{
853 struct dentry *root = debugfs_create_dir("memblock", NULL);
854 if (!root)
855 return -ENXIO;
856 debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops);
857 debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops);
858
859 return 0;
860}
861__initcall(memblock_init_debugfs);
862
863#endif /* CONFIG_DEBUG_FS */
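Editor's note: the lookup helpers added above (memblock_is_region_memory()/memblock_is_region_reserved()) give callers a cheap validity check over the sorted region arrays. A hypothetical driver-side use, not taken from this patch:

#include <linux/errno.h>
#include <linux/memblock.h>

/* Accept a platform-provided buffer only if it lies entirely in RAM and
 * does not collide with an early reservation. */
static int validate_phys_buffer(phys_addr_t base, phys_addr_t size)
{
	if (!memblock_is_region_memory(base, size))
		return -EINVAL;		/* not fully covered by memblock.memory */
	if (memblock_is_region_reserved(base, size))
		return -EBUSY;		/* overlaps memblock.reserved */
	return 0;
}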
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9be3cf8a5da4..e013b8e57d25 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,6 +35,7 @@
35#include <linux/limits.h> 35#include <linux/limits.h>
36#include <linux/mutex.h> 36#include <linux/mutex.h>
37#include <linux/rbtree.h> 37#include <linux/rbtree.h>
38#include <linux/shmem_fs.h>
38#include <linux/slab.h> 39#include <linux/slab.h>
39#include <linux/swap.h> 40#include <linux/swap.h>
40#include <linux/swapops.h> 41#include <linux/swapops.h>
@@ -61,20 +62,18 @@ struct mem_cgroup *root_mem_cgroup __read_mostly;
61#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 62#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
62/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 63/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
63int do_swap_account __read_mostly; 64int do_swap_account __read_mostly;
64static int really_do_swap_account __initdata = 1; /* for remember boot option*/ 65
66/* for remembering the boot option */
67#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED
68static int really_do_swap_account __initdata = 1;
69#else
70static int really_do_swap_account __initdata = 0;
71#endif
72
65#else 73#else
66#define do_swap_account (0) 74#define do_swap_account (0)
67#endif 75#endif
68 76
69/*
70 * Per memcg event counter is incremented at every pagein/pageout. This counter
71 * is used for trigger some periodic events. This is straightforward and better
72 * than using jiffies etc. to handle periodic memcg event.
73 *
74 * These values will be used as !((event) & ((1 <<(thresh)) - 1))
75 */
76#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
77#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
78 77
79/* 78/*
80 * Statistics for memory cgroup. 79 * Statistics for memory cgroup.
@@ -86,16 +85,40 @@ enum mem_cgroup_stat_index {
86 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ 85 MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */
87 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 86 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
88 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 87 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
89 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
90 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
91 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 88 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
92 MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ 89 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
93 90 MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */
94 MEM_CGROUP_STAT_NSTATS, 91 MEM_CGROUP_STAT_NSTATS,
95}; 92};
96 93
94enum mem_cgroup_events_index {
95 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
96 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
97 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
98 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
99 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
100 MEM_CGROUP_EVENTS_NSTATS,
101};
102/*
103 * Per memcg event counter is incremented at every pagein/pageout. With THP,
 104 * it will be incremented by the number of pages. This counter is used
 105 * to trigger some periodic events. This is straightforward and better
106 * than using jiffies etc. to handle periodic memcg event.
107 */
108enum mem_cgroup_events_target {
109 MEM_CGROUP_TARGET_THRESH,
110 MEM_CGROUP_TARGET_SOFTLIMIT,
111 MEM_CGROUP_TARGET_NUMAINFO,
112 MEM_CGROUP_NTARGETS,
113};
114#define THRESHOLDS_EVENTS_TARGET (128)
115#define SOFTLIMIT_EVENTS_TARGET (1024)
116#define NUMAINFO_EVENTS_TARGET (1024)
117
97struct mem_cgroup_stat_cpu { 118struct mem_cgroup_stat_cpu {
98 s64 count[MEM_CGROUP_STAT_NSTATS]; 119 long count[MEM_CGROUP_STAT_NSTATS];
120 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
121 unsigned long targets[MEM_CGROUP_NTARGETS];
99}; 122};
100 123
101/* 124/*
@@ -208,17 +231,17 @@ struct mem_cgroup {
208 * per zone LRU lists. 231 * per zone LRU lists.
209 */ 232 */
210 struct mem_cgroup_lru_info info; 233 struct mem_cgroup_lru_info info;
211
212 /*
213 protect against reclaim related member.
214 */
215 spinlock_t reclaim_param_lock;
216
217 /* 234 /*
218 * While reclaiming in a hierarchy, we cache the last child we 235 * While reclaiming in a hierarchy, we cache the last child we
219 * reclaimed from. 236 * reclaimed from.
220 */ 237 */
221 int last_scanned_child; 238 int last_scanned_child;
239 int last_scanned_node;
240#if MAX_NUMNODES > 1
241 nodemask_t scan_nodes;
242 atomic_t numainfo_events;
243 atomic_t numainfo_updating;
244#endif
222 /* 245 /*
223 * Should the accounting and control be hierarchical, per subtree? 246 * Should the accounting and control be hierarchical, per subtree?
224 */ 247 */
@@ -254,6 +277,12 @@ struct mem_cgroup {
254 * percpu counter. 277 * percpu counter.
255 */ 278 */
256 struct mem_cgroup_stat_cpu *stat; 279 struct mem_cgroup_stat_cpu *stat;
280 /*
281 * used when a cpu is offlined or other synchronizations
282 * See mem_cgroup_read_stat().
283 */
284 struct mem_cgroup_stat_cpu nocpu_base;
285 spinlock_t pcp_counter_lock;
257}; 286};
258 287
259/* Stuffs for move charges at task migration. */ 288/* Stuffs for move charges at task migration. */
@@ -269,7 +298,7 @@ enum move_type {
269 298
270/* "mc" and its members are protected by cgroup_mutex */ 299/* "mc" and its members are protected by cgroup_mutex */
271static struct move_charge_struct { 300static struct move_charge_struct {
272 spinlock_t lock; /* for from, to, moving_task */ 301 spinlock_t lock; /* for from, to */
273 struct mem_cgroup *from; 302 struct mem_cgroup *from;
274 struct mem_cgroup *to; 303 struct mem_cgroup *to;
275 unsigned long precharge; 304 unsigned long precharge;
@@ -311,13 +340,6 @@ enum charge_type {
311 NR_CHARGE_TYPE, 340 NR_CHARGE_TYPE,
312}; 341};
313 342
314/* only for here (for easy reading.) */
315#define PCGF_CACHE (1UL << PCG_CACHE)
316#define PCGF_USED (1UL << PCG_USED)
317#define PCGF_LOCK (1UL << PCG_LOCK)
318/* Not used, but added here for completeness */
319#define PCGF_ACCT (1UL << PCG_ACCT)
320
321/* for encoding cft->private value on file */ 343/* for encoding cft->private value on file */
322#define _MEM (0) 344#define _MEM (0)
323#define _MEMSWAP (1) 345#define _MEMSWAP (1)
@@ -341,7 +363,7 @@ enum charge_type {
341static void mem_cgroup_get(struct mem_cgroup *mem); 363static void mem_cgroup_get(struct mem_cgroup *mem);
342static void mem_cgroup_put(struct mem_cgroup *mem); 364static void mem_cgroup_put(struct mem_cgroup *mem);
343static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 365static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
344static void drain_all_stock_async(void); 366static void drain_all_stock_async(struct mem_cgroup *mem);
345 367
346static struct mem_cgroup_per_zone * 368static struct mem_cgroup_per_zone *
347mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 369mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -355,14 +377,10 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
355} 377}
356 378
357static struct mem_cgroup_per_zone * 379static struct mem_cgroup_per_zone *
358page_cgroup_zoneinfo(struct page_cgroup *pc) 380page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page)
359{ 381{
360 struct mem_cgroup *mem = pc->mem_cgroup; 382 int nid = page_to_nid(page);
361 int nid = page_cgroup_nid(pc); 383 int zid = page_zonenum(page);
362 int zid = page_cgroup_zid(pc);
363
364 if (!mem)
365 return NULL;
366 384
367 return mem_cgroup_zoneinfo(mem, nid, zid); 385 return mem_cgroup_zoneinfo(mem, nid, zid);
368} 386}
@@ -488,11 +506,6 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
488 } 506 }
489} 507}
490 508
491static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
492{
493 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
494}
495
496static struct mem_cgroup_per_zone * 509static struct mem_cgroup_per_zone *
497__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) 510__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
498{ 511{
@@ -530,26 +543,43 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
530 return mz; 543 return mz;
531} 544}
532 545
533static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, 546/*
534 enum mem_cgroup_stat_index idx) 547 * Implementation Note: reading percpu statistics for memcg.
548 *
549 * Both of vmstat[] and percpu_counter has threshold and do periodic
 550 * synchronization to implement "quick" reads. There is a trade-off between
 551 * reading cost and precision of value. Then, we may have a chance to implement
 552 * a periodic synchronization of the counter in memcg's counter.
553 *
554 * But this _read() function is used for user interface now. The user accounts
 555 * memory usage by memory cgroup and _always_ requires an exact value because
 556 * they account memory. Even if we provide a quick-and-fuzzy read, we always
 557 * have to visit all online cpus and make the sum. So, for now, unnecessary
558 * synchronization is not implemented. (just implemented for cpu hotplug)
559 *
560 * If there are kernel internal actions which can make use of some not-exact
 561 * value, and reading all cpu values can be a performance bottleneck in some
 562 * common workloads, a threshold and synchronization as in vmstat[] should be
563 * implemented.
564 */
565static long mem_cgroup_read_stat(struct mem_cgroup *mem,
566 enum mem_cgroup_stat_index idx)
535{ 567{
568 long val = 0;
536 int cpu; 569 int cpu;
537 s64 val = 0;
538 570
539 for_each_possible_cpu(cpu) 571 get_online_cpus();
572 for_each_online_cpu(cpu)
540 val += per_cpu(mem->stat->count[idx], cpu); 573 val += per_cpu(mem->stat->count[idx], cpu);
574#ifdef CONFIG_HOTPLUG_CPU
575 spin_lock(&mem->pcp_counter_lock);
576 val += mem->nocpu_base.count[idx];
577 spin_unlock(&mem->pcp_counter_lock);
578#endif
579 put_online_cpus();
541 return val; 580 return val;
542} 581}
543 582
544static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
545{
546 s64 ret;
547
548 ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
549 ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
550 return ret;
551}
552
553static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, 583static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
554 bool charge) 584 bool charge)
555{ 585{
@@ -557,50 +587,110 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
557 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); 587 this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
558} 588}
559 589
560static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 590void mem_cgroup_pgfault(struct mem_cgroup *mem, int val)
561 struct page_cgroup *pc,
562 bool charge)
563{ 591{
564 int val = (charge) ? 1 : -1; 592 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val);
593}
594
595void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val)
596{
597 this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val);
598}
599
600static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem,
601 enum mem_cgroup_events_index idx)
602{
603 unsigned long val = 0;
604 int cpu;
605
606 for_each_online_cpu(cpu)
607 val += per_cpu(mem->stat->events[idx], cpu);
608#ifdef CONFIG_HOTPLUG_CPU
609 spin_lock(&mem->pcp_counter_lock);
610 val += mem->nocpu_base.events[idx];
611 spin_unlock(&mem->pcp_counter_lock);
612#endif
613 return val;
614}
565 615
616static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
617 bool file, int nr_pages)
618{
566 preempt_disable(); 619 preempt_disable();
567 620
568 if (PageCgroupCache(pc)) 621 if (file)
569 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); 622 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages);
570 else 623 else
571 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); 624 __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages);
625
626 /* pagein of a big page is an event. So, ignore page size */
627 if (nr_pages > 0)
628 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
629 else {
630 __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
631 nr_pages = -nr_pages; /* for event */
632 }
572 633
573 if (charge) 634 __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages);
574 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
575 else
576 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
577 __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
578 635
579 preempt_enable(); 636 preempt_enable();
580} 637}
581 638
639static unsigned long
640mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
641{
642 struct mem_cgroup_per_zone *mz;
643 u64 total = 0;
644 int zid;
645
646 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
647 mz = mem_cgroup_zoneinfo(mem, nid, zid);
648 total += MEM_CGROUP_ZSTAT(mz, idx);
649 }
650 return total;
651}
582static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 652static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
583 enum lru_list idx) 653 enum lru_list idx)
584{ 654{
585 int nid, zid; 655 int nid;
586 struct mem_cgroup_per_zone *mz;
587 u64 total = 0; 656 u64 total = 0;
588 657
589 for_each_online_node(nid) 658 for_each_online_node(nid)
590 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 659 total += mem_cgroup_get_zonestat_node(mem, nid, idx);
591 mz = mem_cgroup_zoneinfo(mem, nid, zid);
592 total += MEM_CGROUP_ZSTAT(mz, idx);
593 }
594 return total; 660 return total;
595} 661}
596 662
597static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) 663static bool __memcg_event_check(struct mem_cgroup *mem, int target)
598{ 664{
599 s64 val; 665 unsigned long val, next;
666
667 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
668 next = this_cpu_read(mem->stat->targets[target]);
669 /* from time_after() in jiffies.h */
670 return ((long)next - (long)val < 0);
671}
672
673static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target)
674{
675 unsigned long val, next;
600 676
601 val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); 677 val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]);
602 678
603 return !(val & ((1 << event_mask_shift) - 1)); 679 switch (target) {
680 case MEM_CGROUP_TARGET_THRESH:
681 next = val + THRESHOLDS_EVENTS_TARGET;
682 break;
683 case MEM_CGROUP_TARGET_SOFTLIMIT:
684 next = val + SOFTLIMIT_EVENTS_TARGET;
685 break;
686 case MEM_CGROUP_TARGET_NUMAINFO:
687 next = val + NUMAINFO_EVENTS_TARGET;
688 break;
689 default:
690 return;
691 }
692
693 this_cpu_write(mem->stat->targets[target], next);
604} 694}
605 695
606/* 696/*
@@ -610,10 +700,23 @@ static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
610static void memcg_check_events(struct mem_cgroup *mem, struct page *page) 700static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
611{ 701{
612 /* threshold event is triggered in finer grain than soft limit */ 702 /* threshold event is triggered in finer grain than soft limit */
613 if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { 703 if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) {
614 mem_cgroup_threshold(mem); 704 mem_cgroup_threshold(mem);
615 if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) 705 __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH);
706 if (unlikely(__memcg_event_check(mem,
707 MEM_CGROUP_TARGET_SOFTLIMIT))) {
616 mem_cgroup_update_tree(mem, page); 708 mem_cgroup_update_tree(mem, page);
709 __mem_cgroup_target_update(mem,
710 MEM_CGROUP_TARGET_SOFTLIMIT);
711 }
712#if MAX_NUMNODES > 1
713 if (unlikely(__memcg_event_check(mem,
714 MEM_CGROUP_TARGET_NUMAINFO))) {
715 atomic_inc(&mem->numainfo_events);
716 __mem_cgroup_target_update(mem,
717 MEM_CGROUP_TARGET_NUMAINFO);
718 }
719#endif
617 } 720 }
618} 721}
619 722
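Editor's note: the __memcg_event_check() test above borrows the time_after() trick from jiffies.h, so the comparison stays correct after the unsigned event counter wraps. A small standalone illustration (userspace C, values made up, assuming the usual two's-complement targets):

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	unsigned long target = ULONG_MAX - 5;	/* threshold target set near the wrap point */
	unsigned long events = target + 128;	/* running counter has since wrapped past it */

	/* Same test shape as __memcg_event_check(): a negative signed
	 * difference means the counter has passed the target. */
	bool due = ((long)target - (long)events < 0);

	printf("threshold due: %s\n", due ? "yes" : "no");	/* prints "yes" */
	return 0;
}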
@@ -638,7 +741,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
638 struct mem_cgroup, css); 741 struct mem_cgroup, css);
639} 742}
640 743
641static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 744struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
642{ 745{
643 struct mem_cgroup *mem = NULL; 746 struct mem_cgroup *mem = NULL;
644 747
@@ -659,46 +762,116 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
659 return mem; 762 return mem;
660} 763}
661 764
662/* 765/* The caller has to guarantee "mem" exists before calling this */
663 * Call callback function against all cgroup under hierarchy tree. 766static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
664 */
665static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
666 int (*func)(struct mem_cgroup *, void *))
667{ 767{
668 int found, ret, nextid;
669 struct cgroup_subsys_state *css; 768 struct cgroup_subsys_state *css;
670 struct mem_cgroup *mem; 769 int found;
671
672 if (!root->use_hierarchy)
673 return (*func)(root, data);
674 770
675 nextid = 1; 771 if (!mem) /* ROOT cgroup has the smallest ID */
676 do { 772 return root_mem_cgroup; /*css_put/get against root is ignored*/
677 ret = 0; 773 if (!mem->use_hierarchy) {
774 if (css_tryget(&mem->css))
775 return mem;
776 return NULL;
777 }
778 rcu_read_lock();
779 /*
780 * searching a memory cgroup which has the smallest ID under given
781 * ROOT cgroup. (ID >= 1)
782 */
783 css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
784 if (css && css_tryget(css))
785 mem = container_of(css, struct mem_cgroup, css);
786 else
678 mem = NULL; 787 mem = NULL;
788 rcu_read_unlock();
789 return mem;
790}
791
792static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
793 struct mem_cgroup *root,
794 bool cond)
795{
796 int nextid = css_id(&iter->css) + 1;
797 int found;
798 int hierarchy_used;
799 struct cgroup_subsys_state *css;
800
801 hierarchy_used = iter->use_hierarchy;
679 802
803 css_put(&iter->css);
804 /* If no ROOT, walk all, ignore hierarchy */
805 if (!cond || (root && !hierarchy_used))
806 return NULL;
807
808 if (!root)
809 root = root_mem_cgroup;
810
811 do {
812 iter = NULL;
680 rcu_read_lock(); 813 rcu_read_lock();
681 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, 814
682 &found); 815 css = css_get_next(&mem_cgroup_subsys, nextid,
816 &root->css, &found);
683 if (css && css_tryget(css)) 817 if (css && css_tryget(css))
684 mem = container_of(css, struct mem_cgroup, css); 818 iter = container_of(css, struct mem_cgroup, css);
685 rcu_read_unlock(); 819 rcu_read_unlock();
686 820 /* If css is NULL, no more cgroups will be found */
687 if (mem) {
688 ret = (*func)(mem, data);
689 css_put(&mem->css);
690 }
691 nextid = found + 1; 821 nextid = found + 1;
692 } while (!ret && css); 822 } while (css && !iter);
693 823
694 return ret; 824 return iter;
695} 825}
826/*
 827 * for_each_mem_cgroup_tree() for visiting all cgroups under a tree. Please
 828 * be careful that breaking out of the loop is not allowed; we hold a reference count.
 829 * Instead, modify "cond" to be false and "continue" to exit the loop.
830 */
831#define for_each_mem_cgroup_tree_cond(iter, root, cond) \
832 for (iter = mem_cgroup_start_loop(root);\
833 iter != NULL;\
834 iter = mem_cgroup_get_next(iter, root, cond))
835
836#define for_each_mem_cgroup_tree(iter, root) \
837 for_each_mem_cgroup_tree_cond(iter, root, true)
838
839#define for_each_mem_cgroup_all(iter) \
840 for_each_mem_cgroup_tree_cond(iter, NULL, true)
841
696 842
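As a rough illustration of the iteration contract these macros impose (never break; drop "cond" and continue so the reference taken on the current iterator is still released), here is a minimal userspace model; struct group, next_group_by_id() and the refcnt field are made-up stand-ins for the css/css_get_next() machinery, not the kernel API.

/* Minimal userspace model of the for_each_mem_cgroup_tree_cond() contract. */
#include <stdbool.h>
#include <stdio.h>

struct group { int id; int refcnt; };

static struct group groups[] = { {1, 0}, {2, 0}, {3, 0}, {4, 0} };
#define NGROUPS (sizeof(groups) / sizeof(groups[0]))

static struct group *next_group_by_id(int nextid)
{
	for (size_t i = 0; i < NGROUPS; i++)
		if (groups[i].id >= nextid) {
			groups[i].refcnt++;	/* models css_tryget() */
			return &groups[i];
		}
	return NULL;
}

static struct group *get_next(struct group *iter, bool cond)
{
	int nextid = iter->id + 1;

	iter->refcnt--;				/* models css_put() */
	return cond ? next_group_by_id(nextid) : NULL;
}

#define for_each_group_cond(iter, cond) \
	for (iter = next_group_by_id(1); iter; iter = get_next(iter, cond))

int main(void)
{
	struct group *it;
	bool cond = true;

	for_each_group_cond(it, cond) {
		printf("visiting %d\n", it->id);
		if (it->id == 3)
			cond = false;	/* never "break": flip cond and continue */
	}
	return 0;
}

Because "cond" is re-read by the macro on every step, the reference on the last visited group is still dropped before the walk stops, which is exactly why the comment forbids a plain break.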
697static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) 843static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
698{ 844{
699 return (mem == root_mem_cgroup); 845 return (mem == root_mem_cgroup);
700} 846}
701 847
848void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
849{
850 struct mem_cgroup *mem;
851
852 if (!mm)
853 return;
854
855 rcu_read_lock();
856 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
857 if (unlikely(!mem))
858 goto out;
859
860 switch (idx) {
861 case PGMAJFAULT:
862 mem_cgroup_pgmajfault(mem, 1);
863 break;
864 case PGFAULT:
865 mem_cgroup_pgfault(mem, 1);
866 break;
867 default:
868 BUG();
869 }
870out:
871 rcu_read_unlock();
872}
873EXPORT_SYMBOL(mem_cgroup_count_vm_event);
874
702/* 875/*
703 * Following LRU functions are allowed to be used without PCG_LOCK. 876 * Following LRU functions are allowed to be used without PCG_LOCK.
704 * Operations are called by routine of global LRU independently from memcg. 877 * Operations are called by routine of global LRU independently from memcg.
@@ -729,13 +902,13 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
729 * We don't check PCG_USED bit. It's cleared when the "page" is finally 902 * We don't check PCG_USED bit. It's cleared when the "page" is finally
730 * removed from global LRU. 903 * removed from global LRU.
731 */ 904 */
732 mz = page_cgroup_zoneinfo(pc); 905 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
733 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 906 /* huge page split is done under lru_lock. so, we have no races. */
907 MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page);
734 if (mem_cgroup_is_root(pc->mem_cgroup)) 908 if (mem_cgroup_is_root(pc->mem_cgroup))
735 return; 909 return;
736 VM_BUG_ON(list_empty(&pc->lru)); 910 VM_BUG_ON(list_empty(&pc->lru));
737 list_del_init(&pc->lru); 911 list_del_init(&pc->lru);
738 return;
739} 912}
740 913
741void mem_cgroup_del_lru(struct page *page) 914void mem_cgroup_del_lru(struct page *page)
@@ -743,24 +916,49 @@ void mem_cgroup_del_lru(struct page *page)
743 mem_cgroup_del_lru_list(page, page_lru(page)); 916 mem_cgroup_del_lru_list(page, page_lru(page));
744} 917}
745 918
746void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) 919/*
920 * Writeback is about to end against a page which has been marked for immediate
921 * reclaim. If it still appears to be reclaimable, move it to the tail of the
922 * inactive list.
923 */
924void mem_cgroup_rotate_reclaimable_page(struct page *page)
747{ 925{
748 struct mem_cgroup_per_zone *mz; 926 struct mem_cgroup_per_zone *mz;
749 struct page_cgroup *pc; 927 struct page_cgroup *pc;
928 enum lru_list lru = page_lru(page);
750 929
751 if (mem_cgroup_disabled()) 930 if (mem_cgroup_disabled())
752 return; 931 return;
753 932
754 pc = lookup_page_cgroup(page); 933 pc = lookup_page_cgroup(page);
755 /* 934 /* unused or root page is not rotated. */
756 * Used bit is set without atomic ops but after smp_wmb(). 935 if (!PageCgroupUsed(pc))
757 * For making pc->mem_cgroup visible, insert smp_rmb() here. 936 return;
758 */ 937 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
759 smp_rmb(); 938 smp_rmb();
939 if (mem_cgroup_is_root(pc->mem_cgroup))
940 return;
941 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
942 list_move_tail(&pc->lru, &mz->lists[lru]);
943}
944
945void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
946{
947 struct mem_cgroup_per_zone *mz;
948 struct page_cgroup *pc;
949
950 if (mem_cgroup_disabled())
951 return;
952
953 pc = lookup_page_cgroup(page);
760 /* unused or root page is not rotated. */ 954 /* unused or root page is not rotated. */
761 if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) 955 if (!PageCgroupUsed(pc))
956 return;
957 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
958 smp_rmb();
959 if (mem_cgroup_is_root(pc->mem_cgroup))
762 return; 960 return;
763 mz = page_cgroup_zoneinfo(pc); 961 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
764 list_move(&pc->lru, &mz->lists[lru]); 962 list_move(&pc->lru, &mz->lists[lru]);
765} 963}
766 964
@@ -773,16 +971,13 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
773 return; 971 return;
774 pc = lookup_page_cgroup(page); 972 pc = lookup_page_cgroup(page);
775 VM_BUG_ON(PageCgroupAcctLRU(pc)); 973 VM_BUG_ON(PageCgroupAcctLRU(pc));
776 /*
777 * Used bit is set without atomic ops but after smp_wmb().
778 * For making pc->mem_cgroup visible, insert smp_rmb() here.
779 */
780 smp_rmb();
781 if (!PageCgroupUsed(pc)) 974 if (!PageCgroupUsed(pc))
782 return; 975 return;
783 976 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
784 mz = page_cgroup_zoneinfo(pc); 977 smp_rmb();
785 MEM_CGROUP_ZSTAT(mz, lru) += 1; 978 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
979 /* huge page split is done under lru_lock. so, we have no races. */
980 MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page);
786 SetPageCgroupAcctLRU(pc); 981 SetPageCgroupAcctLRU(pc);
787 if (mem_cgroup_is_root(pc->mem_cgroup)) 982 if (mem_cgroup_is_root(pc->mem_cgroup))
788 return; 983 return;
@@ -790,18 +985,28 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
790} 985}
791 986
792/* 987/*
793 * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to 988 * At handling SwapCache and other FUSE stuff, pc->mem_cgroup may be changed
794 * lru because the page may.be reused after it's fully uncharged (because of 989 * while it's linked to lru because the page may be reused after it's fully
795 * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge 990 * uncharged. To handle that, unlink page_cgroup from LRU when charging it again.
796 * it again. This function is only used to charge SwapCache. It's done under 991 * It's done under lock_page and expected that zone->lru_lock is never held.
797 * lock_page and expected that zone->lru_lock is never held.
798 */ 992 */
799static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page) 993static void mem_cgroup_lru_del_before_commit(struct page *page)
800{ 994{
801 unsigned long flags; 995 unsigned long flags;
802 struct zone *zone = page_zone(page); 996 struct zone *zone = page_zone(page);
803 struct page_cgroup *pc = lookup_page_cgroup(page); 997 struct page_cgroup *pc = lookup_page_cgroup(page);
804 998
999 /*
1000 * Doing this check without taking ->lru_lock seems wrong but this
1001 * is safe. Because if page_cgroup's USED bit is unset, the page
1002 * will not be added to any memcg's LRU. If page_cgroup's USED bit is
1003 * set, the commit after this will fail, anyway.
1004 * All of this charge/uncharge is done under some mutual exclusion.
1005 * So, we don't need to take care of changes in the USED bit.
1006 */
1007 if (likely(!PageLRU(page)))
1008 return;
1009
805 spin_lock_irqsave(&zone->lru_lock, flags); 1010 spin_lock_irqsave(&zone->lru_lock, flags);
806 /* 1011 /*
807 * Forget old LRU when this page_cgroup is *not* used. This Used bit 1012 * Forget old LRU when this page_cgroup is *not* used. This Used bit
@@ -812,12 +1017,15 @@ static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
812 spin_unlock_irqrestore(&zone->lru_lock, flags); 1017 spin_unlock_irqrestore(&zone->lru_lock, flags);
813} 1018}
814 1019
815static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) 1020static void mem_cgroup_lru_add_after_commit(struct page *page)
816{ 1021{
817 unsigned long flags; 1022 unsigned long flags;
818 struct zone *zone = page_zone(page); 1023 struct zone *zone = page_zone(page);
819 struct page_cgroup *pc = lookup_page_cgroup(page); 1024 struct page_cgroup *pc = lookup_page_cgroup(page);
820 1025
1026 /* take care of the case where the page is added to the LRU while we commit it */
1027 if (likely(!PageLRU(page)))
1028 return;
821 spin_lock_irqsave(&zone->lru_lock, flags); 1029 spin_lock_irqsave(&zone->lru_lock, flags);
822 /* link when the page is linked to LRU but page_cgroup isn't */ 1030 /* link when the page is linked to LRU but page_cgroup isn't */
823 if (PageLRU(page) && !PageCgroupAcctLRU(pc)) 1031 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
@@ -915,9 +1123,9 @@ int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg)
915 return (active > inactive); 1123 return (active > inactive);
916} 1124}
917 1125
918unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, 1126unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
919 struct zone *zone, 1127 struct zone *zone,
920 enum lru_list lru) 1128 enum lru_list lru)
921{ 1129{
922 int nid = zone_to_nid(zone); 1130 int nid = zone_to_nid(zone);
923 int zid = zone_idx(zone); 1131 int zid = zone_idx(zone);
@@ -926,6 +1134,92 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
926 return MEM_CGROUP_ZSTAT(mz, lru); 1134 return MEM_CGROUP_ZSTAT(mz, lru);
927} 1135}
928 1136
1137static unsigned long mem_cgroup_node_nr_file_lru_pages(struct mem_cgroup *memcg,
1138 int nid)
1139{
1140 unsigned long ret;
1141
1142 ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_FILE) +
1143 mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_FILE);
1144
1145 return ret;
1146}
1147
1148static unsigned long mem_cgroup_node_nr_anon_lru_pages(struct mem_cgroup *memcg,
1149 int nid)
1150{
1151 unsigned long ret;
1152
1153 ret = mem_cgroup_get_zonestat_node(memcg, nid, LRU_INACTIVE_ANON) +
1154 mem_cgroup_get_zonestat_node(memcg, nid, LRU_ACTIVE_ANON);
1155 return ret;
1156}
1157
1158#if MAX_NUMNODES > 1
1159static unsigned long mem_cgroup_nr_file_lru_pages(struct mem_cgroup *memcg)
1160{
1161 u64 total = 0;
1162 int nid;
1163
1164 for_each_node_state(nid, N_HIGH_MEMORY)
1165 total += mem_cgroup_node_nr_file_lru_pages(memcg, nid);
1166
1167 return total;
1168}
1169
1170static unsigned long mem_cgroup_nr_anon_lru_pages(struct mem_cgroup *memcg)
1171{
1172 u64 total = 0;
1173 int nid;
1174
1175 for_each_node_state(nid, N_HIGH_MEMORY)
1176 total += mem_cgroup_node_nr_anon_lru_pages(memcg, nid);
1177
1178 return total;
1179}
1180
1181static unsigned long
1182mem_cgroup_node_nr_unevictable_lru_pages(struct mem_cgroup *memcg, int nid)
1183{
1184 return mem_cgroup_get_zonestat_node(memcg, nid, LRU_UNEVICTABLE);
1185}
1186
1187static unsigned long
1188mem_cgroup_nr_unevictable_lru_pages(struct mem_cgroup *memcg)
1189{
1190 u64 total = 0;
1191 int nid;
1192
1193 for_each_node_state(nid, N_HIGH_MEMORY)
1194 total += mem_cgroup_node_nr_unevictable_lru_pages(memcg, nid);
1195
1196 return total;
1197}
1198
1199static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
1200 int nid)
1201{
1202 enum lru_list l;
1203 u64 total = 0;
1204
1205 for_each_lru(l)
1206 total += mem_cgroup_get_zonestat_node(memcg, nid, l);
1207
1208 return total;
1209}
1210
1211static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg)
1212{
1213 u64 total = 0;
1214 int nid;
1215
1216 for_each_node_state(nid, N_HIGH_MEMORY)
1217 total += mem_cgroup_node_nr_lru_pages(memcg, nid);
1218
1219 return total;
1220}
1221#endif /* MAX_NUMNODES > 1 */
1222
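The per-node helpers above just sum per-zone LRU counters along one axis or the other; a rough self-contained sketch of that aggregation, with a plain 2-D array standing in for the memcg's per-zone statistics.

/* Illustrative aggregation of per-node, per-LRU page counts (not kernel code). */
#include <stdio.h>

enum lru { INACTIVE_ANON, ACTIVE_ANON, INACTIVE_FILE, ACTIVE_FILE, UNEVICTABLE, NR_LRU };
#define NR_NODES 2

static unsigned long zstat[NR_NODES][NR_LRU] = {
	{ 10, 5, 100, 40, 0 },
	{  2, 1,  30, 10, 3 },
};

static unsigned long node_nr_file(int nid)
{
	return zstat[nid][INACTIVE_FILE] + zstat[nid][ACTIVE_FILE];
}

static unsigned long nr_lru_pages(void)
{
	unsigned long total = 0;

	for (int nid = 0; nid < NR_NODES; nid++)
		for (int l = 0; l < NR_LRU; l++)
			total += zstat[nid][l];
	return total;
}

int main(void)
{
	printf("file pages on node 0: %lu\n", node_nr_file(0));
	printf("all LRU pages:        %lu\n", nr_lru_pages());
	return 0;
}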
929struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 1223struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
930 struct zone *zone) 1224 struct zone *zone)
931{ 1225{
@@ -946,18 +1240,11 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
946 return NULL; 1240 return NULL;
947 1241
948 pc = lookup_page_cgroup(page); 1242 pc = lookup_page_cgroup(page);
949 /*
950 * Used bit is set without atomic ops but after smp_wmb().
951 * For making pc->mem_cgroup visible, insert smp_rmb() here.
952 */
953 smp_rmb();
954 if (!PageCgroupUsed(pc)) 1243 if (!PageCgroupUsed(pc))
955 return NULL; 1244 return NULL;
956 1245 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
957 mz = page_cgroup_zoneinfo(pc); 1246 smp_rmb();
958 if (!mz) 1247 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
959 return NULL;
960
961 return &mz->reclaim_stat; 1248 return &mz->reclaim_stat;
962} 1249}
963 1250
@@ -989,9 +1276,11 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
989 if (scan >= nr_to_scan) 1276 if (scan >= nr_to_scan)
990 break; 1277 break;
991 1278
992 page = pc->page;
993 if (unlikely(!PageCgroupUsed(pc))) 1279 if (unlikely(!PageCgroupUsed(pc)))
994 continue; 1280 continue;
1281
1282 page = lookup_cgroup_page(pc);
1283
995 if (unlikely(!PageLRU(page))) 1284 if (unlikely(!PageLRU(page)))
996 continue; 1285 continue;
997 1286
@@ -1001,7 +1290,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1001 case 0: 1290 case 0:
1002 list_move(&page->lru, dst); 1291 list_move(&page->lru, dst);
1003 mem_cgroup_del_lru(page); 1292 mem_cgroup_del_lru(page);
1004 nr_taken++; 1293 nr_taken += hpage_nr_pages(page);
1005 break; 1294 break;
1006 case -EBUSY: 1295 case -EBUSY:
1007 /* we don't affect global LRU but rotate in our LRU */ 1296 /* we don't affect global LRU but rotate in our LRU */
@@ -1023,35 +1312,80 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1023#define mem_cgroup_from_res_counter(counter, member) \ 1312#define mem_cgroup_from_res_counter(counter, member) \
1024 container_of(counter, struct mem_cgroup, member) 1313 container_of(counter, struct mem_cgroup, member)
1025 1314
1026static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) 1315/**
1316 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1317 * @mem: the memory cgroup
1318 *
1319 * Returns the maximum amount of memory @mem can be charged with, in
1320 * pages.
1321 */
1322static unsigned long mem_cgroup_margin(struct mem_cgroup *mem)
1027{ 1323{
1028 if (do_swap_account) { 1324 unsigned long long margin;
1029 if (res_counter_check_under_limit(&mem->res) && 1325
1030 res_counter_check_under_limit(&mem->memsw)) 1326 margin = res_counter_margin(&mem->res);
1031 return true; 1327 if (do_swap_account)
1032 } else 1328 margin = min(margin, res_counter_margin(&mem->memsw));
1033 if (res_counter_check_under_limit(&mem->res)) 1329 return margin >> PAGE_SHIFT;
1034 return true;
1035 return false;
1036} 1330}
1037 1331
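In byte terms the margin is min(limit - usage) over the res and, when swap accounting is on, the memsw counters, then converted to pages. A minimal sketch with made-up counter values; counter_margin() plays the role of res_counter_margin() (limit minus usage, clamped at zero).

/* Sketch of the margin calculation in pages; the counter values are hypothetical. */
#include <stdio.h>

#define PAGE_SHIFT 12

struct counter { unsigned long long usage, limit; };

static unsigned long long counter_margin(const struct counter *c)
{
	return c->limit > c->usage ? c->limit - c->usage : 0;
}

int main(void)
{
	struct counter res   = { .usage = 200ULL << 20, .limit = 256ULL << 20 };
	struct counter memsw = { .usage = 220ULL << 20, .limit = 240ULL << 20 };
	int do_swap_account = 1;

	unsigned long long margin = counter_margin(&res);
	if (do_swap_account && counter_margin(&memsw) < margin)
		margin = counter_margin(&memsw);

	/* 20 MiB of headroom -> 5120 pages with 4 KiB pages */
	printf("margin: %llu pages\n", margin >> PAGE_SHIFT);
	return 0;
}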
1038static unsigned int get_swappiness(struct mem_cgroup *memcg) 1332static unsigned int get_swappiness(struct mem_cgroup *memcg)
1039{ 1333{
1040 struct cgroup *cgrp = memcg->css.cgroup; 1334 struct cgroup *cgrp = memcg->css.cgroup;
1041 unsigned int swappiness;
1042 1335
1043 /* root ? */ 1336 /* root ? */
1044 if (cgrp->parent == NULL) 1337 if (cgrp->parent == NULL)
1045 return vm_swappiness; 1338 return vm_swappiness;
1046 1339
1047 spin_lock(&memcg->reclaim_param_lock); 1340 return memcg->swappiness;
1048 swappiness = memcg->swappiness; 1341}
1049 spin_unlock(&memcg->reclaim_param_lock); 1342
1343static void mem_cgroup_start_move(struct mem_cgroup *mem)
1344{
1345 int cpu;
1346
1347 get_online_cpus();
1348 spin_lock(&mem->pcp_counter_lock);
1349 for_each_online_cpu(cpu)
1350 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
1351 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
1352 spin_unlock(&mem->pcp_counter_lock);
1353 put_online_cpus();
1050 1354
1051 return swappiness; 1355 synchronize_rcu();
1052} 1356}
1053 1357
1054/* A routine for testing mem is not under move_account */ 1358static void mem_cgroup_end_move(struct mem_cgroup *mem)
1359{
1360 int cpu;
1361
1362 if (!mem)
1363 return;
1364 get_online_cpus();
1365 spin_lock(&mem->pcp_counter_lock);
1366 for_each_online_cpu(cpu)
1367 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
1368 mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
1369 spin_unlock(&mem->pcp_counter_lock);
1370 put_online_cpus();
1371}
1372/*
1373 * 2 routines for checking "mem" is under move_account() or not.
1374 *
1375 * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
1376 * for avoiding race in accounting. If true,
1377 * pc->mem_cgroup may be overwritten.
1378 *
1379 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
1380 * under hierarchy of moving cgroups. This is for
1381 * waiting at hith-memory prressure caused by "move".
1382 */
1383
1384static bool mem_cgroup_stealed(struct mem_cgroup *mem)
1385{
1386 VM_BUG_ON(!rcu_read_lock_held());
1387 return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
1388}
1055 1389
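The start/end pair above effectively raises a per-cpu "an account move may be in flight" counter on every cpu so that readers only have to look at their local copy. A rough single-file model of that shape, with plain arrays in place of the percpu variables and without the spinlock or synchronize_rcu(); it is an illustration of the pattern, not the kernel code.

/* Userspace model of the MEM_CGROUP_ON_MOVE pattern; arrays replace percpu data. */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

static long on_move[NR_CPUS];	/* per-cpu counter, raised while a move is running */

static void start_move(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		on_move[cpu] += 1;
	/* the kernel also waits for readers here (synchronize_rcu()) */
}

static void end_move(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		on_move[cpu] -= 1;
}

static bool maybe_stolen(int this_cpu)
{
	/* cheap local read: pc->mem_cgroup may be rewritten if this is nonzero */
	return on_move[this_cpu] > 0;
}

int main(void)
{
	printf("before move: %d\n", maybe_stolen(0));
	start_move();
	printf("during move: %d\n", maybe_stolen(0));
	end_move();
	printf("after move:  %d\n", maybe_stolen(0));
	return 0;
}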
1056static bool mem_cgroup_under_move(struct mem_cgroup *mem) 1390static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1057{ 1391{
@@ -1092,13 +1426,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
1092 return false; 1426 return false;
1093} 1427}
1094 1428
1095static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
1096{
1097 int *val = data;
1098 (*val)++;
1099 return 0;
1100}
1101
1102/** 1429/**
1103 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. 1430 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
1104 * @memcg: The memory cgroup that went over limit 1431 * @memcg: The memory cgroup that went over limit
@@ -1173,7 +1500,10 @@ done:
1173static int mem_cgroup_count_children(struct mem_cgroup *mem) 1500static int mem_cgroup_count_children(struct mem_cgroup *mem)
1174{ 1501{
1175 int num = 0; 1502 int num = 0;
1176 mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); 1503 struct mem_cgroup *iter;
1504
1505 for_each_mem_cgroup_tree(iter, mem)
1506 num++;
1177 return num; 1507 return num;
1178} 1508}
1179 1509
@@ -1185,8 +1515,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1185 u64 limit; 1515 u64 limit;
1186 u64 memsw; 1516 u64 memsw;
1187 1517
1188 limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + 1518 limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1189 total_swap_pages; 1519 limit += total_swap_pages << PAGE_SHIFT;
1520
1190 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1521 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1191 /* 1522 /*
1192 * If memsw is finite and limits the amount of swap space available 1523 * If memsw is finite and limits the amount of swap space available
@@ -1222,18 +1553,153 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1222 1553
1223 rcu_read_unlock(); 1554 rcu_read_unlock();
1224 /* Updates scanning parameter */ 1555 /* Updates scanning parameter */
1225 spin_lock(&root_mem->reclaim_param_lock);
1226 if (!css) { 1556 if (!css) {
1227 /* this means start scan from ID:1 */ 1557 /* this means start scan from ID:1 */
1228 root_mem->last_scanned_child = 0; 1558 root_mem->last_scanned_child = 0;
1229 } else 1559 } else
1230 root_mem->last_scanned_child = found; 1560 root_mem->last_scanned_child = found;
1231 spin_unlock(&root_mem->reclaim_param_lock);
1232 } 1561 }
1233 1562
1234 return ret; 1563 return ret;
1235} 1564}
1236 1565
1566/**
1567 * test_mem_cgroup_node_reclaimable
1568 * @mem: the target memcg
1569 * @nid: the node ID to be checked.
1570 * @noswap: specify true here if the caller wants file-only information.
1571 *
1572 * This function returns whether the specified memcg contains any
1573 * reclaimable pages on a node. Returns true if there are any reclaimable
1574 * pages in the node.
1575 */
1576static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem,
1577 int nid, bool noswap)
1578{
1579 if (mem_cgroup_node_nr_file_lru_pages(mem, nid))
1580 return true;
1581 if (noswap || !total_swap_pages)
1582 return false;
1583 if (mem_cgroup_node_nr_anon_lru_pages(mem, nid))
1584 return true;
1585 return false;
1586
1587}
1588#if MAX_NUMNODES > 1
1589
1590/*
1591 * Always updating the nodemask is not very good - even if we have an empty
1592 * list or the wrong list here, we can start from some node and traverse all
1593 * nodes based on the zonelist. So update the list loosely once per 10 secs.
1594 *
1595 */
1596static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
1597{
1598 int nid;
1599 /*
1600 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1601 * pagein/pageout changes since the last update.
1602 */
1603 if (!atomic_read(&mem->numainfo_events))
1604 return;
1605 if (atomic_inc_return(&mem->numainfo_updating) > 1)
1606 return;
1607
1608 /* make a nodemask where this memcg uses memory from */
1609 mem->scan_nodes = node_states[N_HIGH_MEMORY];
1610
1611 for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
1612
1613 if (!test_mem_cgroup_node_reclaimable(mem, nid, false))
1614 node_clear(nid, mem->scan_nodes);
1615 }
1616
1617 atomic_set(&mem->numainfo_events, 0);
1618 atomic_set(&mem->numainfo_updating, 0);
1619}
1620
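A rough model of the lazy rebuild gate: do nothing unless enough events have accumulated, let only the first caller through the atomic "updating" flag, and clear nodes with nothing reclaimable from the mask. The C11 atomics and node_reclaimable() stub are simplified stand-ins for the kernel primitives.

/* Sketch of the lazy nodemask refresh gate using C11 atomics (illustrative only). */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

static atomic_int numainfo_events;	/* bumped after enough pagein/pageout events */
static atomic_int numainfo_updating;
static bool scan_node[NR_NODES];

static bool node_reclaimable(int nid)
{
	return nid != 2;		/* pretend node 2 has nothing to reclaim */
}

static void may_update_nodemask(void)
{
	if (!atomic_load(&numainfo_events))
		return;			/* nothing changed since the last rebuild */
	if (atomic_fetch_add(&numainfo_updating, 1) > 0)
		return;			/* someone else is already rebuilding */

	for (int nid = 0; nid < NR_NODES; nid++)
		scan_node[nid] = node_reclaimable(nid);

	atomic_store(&numainfo_events, 0);
	atomic_store(&numainfo_updating, 0);
}

int main(void)
{
	atomic_store(&numainfo_events, 1);
	may_update_nodemask();
	for (int nid = 0; nid < NR_NODES; nid++)
		printf("node %d: %s\n", nid, scan_node[nid] ? "scan" : "skip");
	return 0;
}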
1621/*
1622 * Selecting a node to start reclaim from. Because we only need to reduce the
1623 * usage counter, starting anywhere is OK. Reclaiming memory from the current
1624 * node has both pros and cons.
1625 *
1626 * Freeing memory from the current node means freeing memory from a node which
1627 * we'll use or have used. So, it may make the LRU bad. And if several threads
1628 * hit their limits, they will contend on one node. But freeing from a remote
1629 * node means more costs for memory reclaim because of memory latency.
1630 *
1631 * Now, we use round-robin. Better algorithms are welcome.
1632 */
1633int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1634{
1635 int node;
1636
1637 mem_cgroup_may_update_nodemask(mem);
1638 node = mem->last_scanned_node;
1639
1640 node = next_node(node, mem->scan_nodes);
1641 if (node == MAX_NUMNODES)
1642 node = first_node(mem->scan_nodes);
1643 /*
1644 * We call this when we hit the limit, not when pages are added to the LRU.
1645 * No LRU may hold pages because all pages are UNEVICTABLE, or
1646 * the memcg is too small and all pages are not on the LRU. In that case,
1647 * we use the current node.
1648 */
1649 if (unlikely(node == MAX_NUMNODES))
1650 node = numa_node_id();
1651
1652 mem->last_scanned_node = node;
1653 return node;
1654}
1655
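A minimal sketch of the round-robin step: advance to the next set node after the last one scanned, wrap to the first set node, and fall back to the local node when the mask turns out to be empty. The bitmask and helpers below are simplified stand-ins for nodemask_t, next_node() and first_node().

/* Round-robin selection over a node mask, falling back to the local node. */
#include <stdio.h>

#define MAX_NODES 8

static int next_node_in(int prev, unsigned mask)
{
	for (int nid = prev + 1; nid < MAX_NODES; nid++)
		if (mask & (1u << nid))
			return nid;
	return MAX_NODES;		/* ran off the end of the mask */
}

static int first_node_in(unsigned mask)
{
	return next_node_in(-1, mask);
}

static int select_victim_node(int *last_scanned, unsigned scan_mask, int local_node)
{
	int node = next_node_in(*last_scanned, scan_mask);

	if (node == MAX_NODES)
		node = first_node_in(scan_mask);
	if (node == MAX_NODES)		/* empty mask: everything unevictable, etc. */
		node = local_node;
	*last_scanned = node;
	return node;
}

int main(void)
{
	unsigned mask = (1u << 0) | (1u << 3);	/* nodes 0 and 3 have reclaimable pages */
	int last = MAX_NODES;

	for (int i = 0; i < 4; i++)
		printf("victim: %d\n", select_victim_node(&last, mask, 0));
	return 0;
}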
1656/*
1657 * Check all nodes for reclaimable pages.
1658 * For a quick scan, we make use of scan_nodes. This allows us to skip
1659 * unused nodes. But scan_nodes is lazily updated and may not contain
1660 * enough new information. We need to double check.
1661 */
1662bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1663{
1664 int nid;
1665
1666 /*
1667 * quick check...making use of scan_node.
1668 * We can skip unused nodes.
1669 */
1670 if (!nodes_empty(mem->scan_nodes)) {
1671 for (nid = first_node(mem->scan_nodes);
1672 nid < MAX_NUMNODES;
1673 nid = next_node(nid, mem->scan_nodes)) {
1674
1675 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
1676 return true;
1677 }
1678 }
1679 /*
1680 * Check rest of nodes.
1681 */
1682 for_each_node_state(nid, N_HIGH_MEMORY) {
1683 if (node_isset(nid, mem->scan_nodes))
1684 continue;
1685 if (test_mem_cgroup_node_reclaimable(mem, nid, noswap))
1686 return true;
1687 }
1688 return false;
1689}
1690
1691#else
1692int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
1693{
1694 return 0;
1695}
1696
1697bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap)
1698{
1699 return test_mem_cgroup_node_reclaimable(mem, 0, noswap);
1700}
1701#endif
1702
1237/* 1703/*
1238 * Scan the hierarchy if needed to reclaim memory. We remember the last child 1704 * Scan the hierarchy if needed to reclaim memory. We remember the last child
1239 * we reclaimed from, so that we don't end up penalizing one child extensively 1705 * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -1249,7 +1715,8 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
1249static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1715static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1250 struct zone *zone, 1716 struct zone *zone,
1251 gfp_t gfp_mask, 1717 gfp_t gfp_mask,
1252 unsigned long reclaim_options) 1718 unsigned long reclaim_options,
1719 unsigned long *total_scanned)
1253{ 1720{
1254 struct mem_cgroup *victim; 1721 struct mem_cgroup *victim;
1255 int ret, total = 0; 1722 int ret, total = 0;
@@ -1257,18 +1724,27 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1257 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; 1724 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1258 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; 1725 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1259 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; 1726 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1260 unsigned long excess = mem_cgroup_get_excess(root_mem); 1727 unsigned long excess;
1728 unsigned long nr_scanned;
1729
1730 excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
1261 1731
1262 /* If memsw_is_minimum==1, swap-out is of-no-use. */ 1732 /* If memsw_is_minimum==1, swap-out is of-no-use. */
1263 if (root_mem->memsw_is_minimum) 1733 if (!check_soft && root_mem->memsw_is_minimum)
1264 noswap = true; 1734 noswap = true;
1265 1735
1266 while (1) { 1736 while (1) {
1267 victim = mem_cgroup_select_victim(root_mem); 1737 victim = mem_cgroup_select_victim(root_mem);
1268 if (victim == root_mem) { 1738 if (victim == root_mem) {
1269 loop++; 1739 loop++;
1270 if (loop >= 1) 1740 /*
1271 drain_all_stock_async(); 1741 * We are not draining per cpu cached charges during
1742 * soft limit reclaim because global reclaim doesn't
1743 * care about charges. It tries to free some memory and
1744 * charges will not give any.
1745 */
1746 if (!check_soft && loop >= 1)
1747 drain_all_stock_async(root_mem);
1272 if (loop >= 2) { 1748 if (loop >= 2) {
1273 /* 1749 /*
1274 * If we have not been able to reclaim 1750 * If we have not been able to reclaim
@@ -1280,7 +1756,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1280 break; 1756 break;
1281 } 1757 }
1282 /* 1758 /*
1283 * We want to do more targetted reclaim. 1759 * We want to do more targeted reclaim.
1284 * excess >> 2 is not too excessive so as to 1760 * excess >> 2 is not too excessive so as to
1285 * reclaim too much, nor too little so that we keep 1761 * reclaim too much, nor too little so that we keep
1286 * coming back to reclaim from this cgroup 1762 * coming back to reclaim from this cgroup
@@ -1292,16 +1768,18 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1292 } 1768 }
1293 } 1769 }
1294 } 1770 }
1295 if (!mem_cgroup_local_usage(victim)) { 1771 if (!mem_cgroup_reclaimable(victim, noswap)) {
1296 /* this cgroup's local usage == 0 */ 1772 /* this cgroup's local usage == 0 */
1297 css_put(&victim->css); 1773 css_put(&victim->css);
1298 continue; 1774 continue;
1299 } 1775 }
1300 /* we use swappiness of local cgroup */ 1776 /* we use swappiness of local cgroup */
1301 if (check_soft) 1777 if (check_soft) {
1302 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1778 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1303 noswap, get_swappiness(victim), zone); 1779 noswap, get_swappiness(victim), zone,
1304 else 1780 &nr_scanned);
1781 *total_scanned += nr_scanned;
1782 } else
1305 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1783 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1306 noswap, get_swappiness(victim)); 1784 noswap, get_swappiness(victim));
1307 css_put(&victim->css); 1785 css_put(&victim->css);
@@ -1314,57 +1792,47 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1314 return ret; 1792 return ret;
1315 total += ret; 1793 total += ret;
1316 if (check_soft) { 1794 if (check_soft) {
1317 if (res_counter_check_under_soft_limit(&root_mem->res)) 1795 if (!res_counter_soft_limit_excess(&root_mem->res))
1318 return total; 1796 return total;
1319 } else if (mem_cgroup_check_under_limit(root_mem)) 1797 } else if (mem_cgroup_margin(root_mem))
1320 return 1 + total; 1798 return total;
1321 } 1799 }
1322 return total; 1800 return total;
1323} 1801}
1324 1802
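A stripped-down model of the reclaim loop's control flow: walk victims round robin, skip those with nothing reclaimable, accumulate freed pages, and stop once the root has margin again or after enough futile passes. Every helper here is a stub standing in for the memcg machinery (victim selection, reclaimability checks, the actual shrinkers), so this only mirrors the shape of the loop, not its behaviour.

/* Control-flow sketch of hierarchical reclaim (stubs, not the kernel routines). */
#include <stdio.h>

#define NR_GROUPS 3
#define MAX_LOOPS 2

static unsigned long usage[NR_GROUPS] = { 50, 0, 30 };	/* pages charged per group */
static const unsigned long limit = 60;			/* shared (root) limit */

static unsigned long total_usage(void)
{
	unsigned long sum = 0;

	for (int i = 0; i < NR_GROUPS; i++)
		sum += usage[i];
	return sum;
}

static unsigned long margin(void)
{
	unsigned long u = total_usage();

	return u < limit ? limit - u : 0;
}

static unsigned long reclaim_from(int victim)
{
	unsigned long freed = usage[victim] < 10 ? usage[victim] : 10;

	usage[victim] -= freed;
	return freed;
}

int main(void)
{
	unsigned long total = 0;
	int victim = -1, loops = 0;

	while (!margin() && loops < MAX_LOOPS * NR_GROUPS) {
		victim = (victim + 1) % NR_GROUPS;	/* round robin over the tree */
		loops++;
		if (!usage[victim])
			continue;			/* nothing reclaimable here */
		total += reclaim_from(victim);
	}
	printf("reclaimed %lu pages, margin now %lu\n", total, margin());
	return 0;
}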
1325static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
1326{
1327 int *val = (int *)data;
1328 int x;
1329 /*
1330 * Logically, we can stop scanning immediately when we find
1331 * a memcg is already locked. But condidering unlock ops and
1332 * creation/removal of memcg, scan-all is simple operation.
1333 */
1334 x = atomic_inc_return(&mem->oom_lock);
1335 *val = max(x, *val);
1336 return 0;
1337}
1338/* 1803/*
1339 * Check OOM-Killer is already running under our hierarchy. 1804 * Check OOM-Killer is already running under our hierarchy.
1340 * If someone is running, return false. 1805 * If someone is running, return false.
1341 */ 1806 */
1342static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) 1807static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
1343{ 1808{
1344 int lock_count = 0; 1809 int x, lock_count = 0;
1810 struct mem_cgroup *iter;
1345 1811
1346 mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); 1812 for_each_mem_cgroup_tree(iter, mem) {
1813 x = atomic_inc_return(&iter->oom_lock);
1814 lock_count = max(x, lock_count);
1815 }
1347 1816
1348 if (lock_count == 1) 1817 if (lock_count == 1)
1349 return true; 1818 return true;
1350 return false; 1819 return false;
1351} 1820}
1352 1821
1353static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) 1822static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1354{ 1823{
1824 struct mem_cgroup *iter;
1825
1355 /* 1826 /*
1356 * When a new child is created while the hierarchy is under oom, 1827 * When a new child is created while the hierarchy is under oom,
1357 * mem_cgroup_oom_lock() may not be called. We have to use 1828 * mem_cgroup_oom_lock() may not be called. We have to use
1358 * atomic_add_unless() here. 1829 * atomic_add_unless() here.
1359 */ 1830 */
1360 atomic_add_unless(&mem->oom_lock, -1, 0); 1831 for_each_mem_cgroup_tree(iter, mem)
1832 atomic_add_unless(&iter->oom_lock, -1, 0);
1361 return 0; 1833 return 0;
1362} 1834}
1363 1835
1364static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
1365{
1366 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
1367}
1368 1836
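The lock is taken by incrementing every memcg's oom_lock in the subtree; it only succeeds when the largest resulting value is 1, i.e. nobody in the hierarchy already held it. A small model with an array in place of the cgroup tree and plain increments in place of the atomics.

/* Model of the hierarchical OOM lock: succeed only if no member was locked. */
#include <stdbool.h>
#include <stdio.h>

#define NR_MEMBERS 3

static int oom_lock[NR_MEMBERS];	/* one counter per memcg in the subtree */

static bool oom_trylock(void)
{
	int lock_count = 0;

	for (int i = 0; i < NR_MEMBERS; i++) {
		int x = ++oom_lock[i];		/* atomic_inc_return() in the kernel */
		if (x > lock_count)
			lock_count = x;
	}
	return lock_count == 1;			/* someone else holds it if > 1 */
}

static void oom_unlock(void)
{
	for (int i = 0; i < NR_MEMBERS; i++)
		if (oom_lock[i] > 0)		/* mirrors atomic_add_unless(.., -1, 0) */
			oom_lock[i]--;
}

int main(void)
{
	printf("first lock:  %s\n", oom_trylock() ? "got it" : "busy");
	printf("second lock: %s\n", oom_trylock() ? "got it" : "busy");
	oom_unlock();
	oom_unlock();
	printf("after unlocks: %s\n", oom_trylock() ? "got it" : "busy");
	return 0;
}

Scanning the whole subtree even when one member is already locked looks wasteful, but as the original comment notes it keeps the unlock path trivially symmetric.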
1369static DEFINE_MUTEX(memcg_oom_mutex); 1837static DEFINE_MUTEX(memcg_oom_mutex);
1370static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 1838static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@ -1462,51 +1930,91 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
1462/* 1930/*
1463 * Currently used to update mapped file statistics, but the routine can be 1931 * Currently used to update mapped file statistics, but the routine can be
1464 * generalized to update other statistics as well. 1932 * generalized to update other statistics as well.
1933 *
1934 * Notes: Race condition
1935 *
1936 * We usually use page_cgroup_lock() for accessing page_cgroup members, but
1937 * it tends to be costly. Under some conditions, we don't need
1938 * to do so _always_.
1939 *
1940 * Considering "charge", lock_page_cgroup() is not required because all
1941 * file-stat operations happen after a page is attached to the radix-tree. There
1942 * is no race with "charge".
1943 *
1944 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
1945 * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup even
1946 * if there is a race with "uncharge". The statistics themselves are properly
1947 * handled by flags.
1948 *
1949 * Considering "move", this is the only case where we see a race. To make the
1950 * race window small, we check the MEM_CGROUP_ON_MOVE percpu value and detect
1951 * the possibility of a race condition. If there is one, we take a lock.
1465 */ 1952 */
1466void mem_cgroup_update_file_mapped(struct page *page, int val) 1953
1954void mem_cgroup_update_page_stat(struct page *page,
1955 enum mem_cgroup_page_stat_item idx, int val)
1467{ 1956{
1468 struct mem_cgroup *mem; 1957 struct mem_cgroup *mem;
1469 struct page_cgroup *pc; 1958 struct page_cgroup *pc = lookup_page_cgroup(page);
1959 bool need_unlock = false;
1960 unsigned long uninitialized_var(flags);
1470 1961
1471 pc = lookup_page_cgroup(page);
1472 if (unlikely(!pc)) 1962 if (unlikely(!pc))
1473 return; 1963 return;
1474 1964
1475 lock_page_cgroup(pc); 1965 rcu_read_lock();
1476 mem = pc->mem_cgroup; 1966 mem = pc->mem_cgroup;
1477 if (!mem || !PageCgroupUsed(pc)) 1967 if (unlikely(!mem || !PageCgroupUsed(pc)))
1478 goto done; 1968 goto out;
1969 /* pc->mem_cgroup is unstable ? */
1970 if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) {
1971 /* take a lock to safely access pc->mem_cgroup */
1972 move_lock_page_cgroup(pc, &flags);
1973 need_unlock = true;
1974 mem = pc->mem_cgroup;
1975 if (!mem || !PageCgroupUsed(pc))
1976 goto out;
1977 }
1479 1978
1480 /* 1979 switch (idx) {
1481 * Preemption is already disabled. We can use __this_cpu_xxx 1980 case MEMCG_NR_FILE_MAPPED:
1482 */ 1981 if (val > 0)
1483 if (val > 0) { 1982 SetPageCgroupFileMapped(pc);
1484 __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1983 else if (!page_mapped(page))
1485 SetPageCgroupFileMapped(pc); 1984 ClearPageCgroupFileMapped(pc);
1486 } else { 1985 idx = MEM_CGROUP_STAT_FILE_MAPPED;
1487 __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 1986 break;
1488 ClearPageCgroupFileMapped(pc); 1987 default:
1988 BUG();
1489 } 1989 }
1490 1990
1491done: 1991 this_cpu_add(mem->stat->count[idx], val);
1492 unlock_page_cgroup(pc); 1992
1993out:
1994 if (unlikely(need_unlock))
1995 move_unlock_page_cgroup(pc, &flags);
1996 rcu_read_unlock();
1997 return;
1493} 1998}
1999EXPORT_SYMBOL(mem_cgroup_update_page_stat);
1494 2000
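The update path is optimistic: under rcu_read_lock() it uses pc->mem_cgroup directly, and only when a move might be in flight (or the page is huge) does it fall back to the per-page move lock and re-read the owner. A rough userspace model of that "fast path unless a mover is active" shape, with a global flag in place of the percpu ON_MOVE counter and a mutex in place of move_lock_page_cgroup(); build with -pthread.

/* Sketch of an optimistic stat update with a locked fallback (illustrative). */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct page_info {
	pthread_mutex_t move_lock;
	int owner;			/* stands in for pc->mem_cgroup */
	long file_mapped;		/* the statistic being updated */
};

static bool move_in_flight;		/* stands in for MEM_CGROUP_ON_MOVE > 0 */

static void update_file_mapped(struct page_info *pi, int val)
{
	bool locked = false;

	if (move_in_flight) {		/* owner may be rewritten: take the lock */
		pthread_mutex_lock(&pi->move_lock);
		locked = true;
	}
	if (pi->owner >= 0)		/* re-checked under the lock when needed */
		pi->file_mapped += val;
	if (locked)
		pthread_mutex_unlock(&pi->move_lock);
}

int main(void)
{
	struct page_info pi = { .owner = 1 };

	pthread_mutex_init(&pi.move_lock, NULL);
	update_file_mapped(&pi, 1);	/* fast path */
	move_in_flight = true;
	update_file_mapped(&pi, 1);	/* slow path under the move lock */
	printf("file_mapped = %ld\n", pi.file_mapped);
	return 0;
}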
1495/* 2001/*
1496 * size of first charge trial. "32" comes from vmscan.c's magic value. 2002 * size of first charge trial. "32" comes from vmscan.c's magic value.
1497 * TODO: maybe necessary to use big numbers in big irons. 2003 * TODO: maybe necessary to use big numbers in big irons.
1498 */ 2004 */
1499#define CHARGE_SIZE (32 * PAGE_SIZE) 2005#define CHARGE_BATCH 32U
1500struct memcg_stock_pcp { 2006struct memcg_stock_pcp {
1501 struct mem_cgroup *cached; /* this never be root cgroup */ 2007 struct mem_cgroup *cached; /* this never be root cgroup */
1502 int charge; 2008 unsigned int nr_pages;
1503 struct work_struct work; 2009 struct work_struct work;
2010 unsigned long flags;
2011#define FLUSHING_CACHED_CHARGE (0)
1504}; 2012};
1505static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2013static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1506static atomic_t memcg_drain_count; 2014static DEFINE_MUTEX(percpu_charge_mutex);
1507 2015
1508/* 2016/*
1509 * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed 2017 * Try to consume stocked charge on this cpu. If success, one page is consumed
1510 * from local stock and true is returned. If the stock is 0 or charges from a 2018 * from local stock and true is returned. If the stock is 0 or charges from a
1511 * cgroup which is not current target, returns false. This stock will be 2019 * cgroup which is not current target, returns false. This stock will be
1512 * refilled. 2020 * refilled.
@@ -1517,8 +2025,8 @@ static bool consume_stock(struct mem_cgroup *mem)
1517 bool ret = true; 2025 bool ret = true;
1518 2026
1519 stock = &get_cpu_var(memcg_stock); 2027 stock = &get_cpu_var(memcg_stock);
1520 if (mem == stock->cached && stock->charge) 2028 if (mem == stock->cached && stock->nr_pages)
1521 stock->charge -= PAGE_SIZE; 2029 stock->nr_pages--;
1522 else /* need to call res_counter_charge */ 2030 else /* need to call res_counter_charge */
1523 ret = false; 2031 ret = false;
1524 put_cpu_var(memcg_stock); 2032 put_cpu_var(memcg_stock);
@@ -1532,13 +2040,15 @@ static void drain_stock(struct memcg_stock_pcp *stock)
1532{ 2040{
1533 struct mem_cgroup *old = stock->cached; 2041 struct mem_cgroup *old = stock->cached;
1534 2042
1535 if (stock->charge) { 2043 if (stock->nr_pages) {
1536 res_counter_uncharge(&old->res, stock->charge); 2044 unsigned long bytes = stock->nr_pages * PAGE_SIZE;
2045
2046 res_counter_uncharge(&old->res, bytes);
1537 if (do_swap_account) 2047 if (do_swap_account)
1538 res_counter_uncharge(&old->memsw, stock->charge); 2048 res_counter_uncharge(&old->memsw, bytes);
2049 stock->nr_pages = 0;
1539 } 2050 }
1540 stock->cached = NULL; 2051 stock->cached = NULL;
1541 stock->charge = 0;
1542} 2052}
1543 2053
1544/* 2054/*
@@ -1549,13 +2059,14 @@ static void drain_local_stock(struct work_struct *dummy)
1549{ 2059{
1550 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2060 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
1551 drain_stock(stock); 2061 drain_stock(stock);
2062 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
1552} 2063}
1553 2064
1554/* 2065/*
1555 * Cache charges(val) which is from res_counter, to local per_cpu area. 2066 * Cache charges(val) which is from res_counter, to local per_cpu area.
1556 * This will be consumed by consume_stock() function, later. 2067 * This will be consumed by consume_stock() function, later.
1557 */ 2068 */
1558static void refill_stock(struct mem_cgroup *mem, int val) 2069static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages)
1559{ 2070{
1560 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2071 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
1561 2072
@@ -1563,7 +2074,7 @@ static void refill_stock(struct mem_cgroup *mem, int val)
1563 drain_stock(stock); 2074 drain_stock(stock);
1564 stock->cached = mem; 2075 stock->cached = mem;
1565 } 2076 }
1566 stock->charge += val; 2077 stock->nr_pages += nr_pages;
1567 put_cpu_var(memcg_stock); 2078 put_cpu_var(memcg_stock);
1568} 2079}
1569 2080
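The stock is a per-cpu cache of pre-charged pages: a single-page charge first tries to take one page from the local stock, and any surplus charged as part of a batch is parked back in it for later. A compact single-cpu model with plain variables in place of the percpu machinery and the res_counter; the numbers and the charge_one_page() helper are only for illustration.

/* Single-cpu model of the memcg charge stock (consume/refill), not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define CHARGE_BATCH 32U

static struct {
	int cached;			/* which group the stock belongs to */
	unsigned int nr_pages;		/* pre-charged pages available locally */
} stock = { .cached = -1 };

static unsigned long res_usage;		/* stands in for the res_counter */

static bool consume_stock(int group)
{
	if (group == stock.cached && stock.nr_pages) {
		stock.nr_pages--;
		return true;		/* charged without touching the counter */
	}
	return false;
}

static void refill_stock(int group, unsigned int nr_pages)
{
	if (stock.cached != group) {
		res_usage -= stock.nr_pages;	/* drain_stock(): return old charges */
		stock.nr_pages = 0;
		stock.cached = group;
	}
	stock.nr_pages += nr_pages;
}

static void charge_one_page(int group)
{
	if (consume_stock(group))
		return;
	res_usage += CHARGE_BATCH;		/* charge a whole batch at once */
	refill_stock(group, CHARGE_BATCH - 1);	/* keep the surplus for later */
}

int main(void)
{
	for (int i = 0; i < 40; i++)
		charge_one_page(0);
	printf("res_usage=%lu stock=%u\n", res_usage, stock.nr_pages);
	return 0;
}

Forty single-page charges end up as two counter updates of 32 pages each, with the unused 24 pages left in the stock; that amortisation is the whole point of the batching.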
@@ -1573,26 +2084,45 @@ static void refill_stock(struct mem_cgroup *mem, int val)
1573 * expects some charges will be back to res_counter later but cannot wait for 2084 * expects some charges will be back to res_counter later but cannot wait for
1574 * it. 2085 * it.
1575 */ 2086 */
1576static void drain_all_stock_async(void) 2087static void drain_all_stock_async(struct mem_cgroup *root_mem)
1577{ 2088{
1578 int cpu; 2089 int cpu, curcpu;
1579 /* This function is for scheduling "drain" in asynchronous way. 2090 /*
1580 * The result of "drain" is not directly handled by callers. Then, 2091 * If someone calls draining, avoid adding more kworker runs.
1581 * if someone is calling drain, we don't have to call drain more.
1582 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
1583 * there is a race. We just do loose check here.
1584 */ 2092 */
1585 if (atomic_read(&memcg_drain_count)) 2093 if (!mutex_trylock(&percpu_charge_mutex))
1586 return; 2094 return;
1587 /* Notify other cpus that system-wide "drain" is running */ 2095 /* Notify other cpus that system-wide "drain" is running */
1588 atomic_inc(&memcg_drain_count);
1589 get_online_cpus(); 2096 get_online_cpus();
2097 /*
2098 * Get a hint for avoiding draining charges on the current cpu,
2099 * which must be exhausted by our charging. It is not required that
2100 * this be a precise check, so we use raw_smp_processor_id() instead of
2101 * getcpu()/putcpu().
2102 */
2103 curcpu = raw_smp_processor_id();
1590 for_each_online_cpu(cpu) { 2104 for_each_online_cpu(cpu) {
1591 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2105 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1592 schedule_work_on(cpu, &stock->work); 2106 struct mem_cgroup *mem;
2107
2108 if (cpu == curcpu)
2109 continue;
2110
2111 mem = stock->cached;
2112 if (!mem)
2113 continue;
2114 if (mem != root_mem) {
2115 if (!root_mem->use_hierarchy)
2116 continue;
2117 /* check whether "mem" is under tree of "root_mem" */
2118 if (!css_is_ancestor(&mem->css, &root_mem->css))
2119 continue;
2120 }
2121 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
2122 schedule_work_on(cpu, &stock->work);
1593 } 2123 }
1594 put_online_cpus(); 2124 put_online_cpus();
1595 atomic_dec(&memcg_drain_count); 2125 mutex_unlock(&percpu_charge_mutex);
1596 /* We don't wait for flush_work */ 2126 /* We don't wait for flush_work */
1597} 2127}
1598 2128
@@ -1600,20 +2130,66 @@ static void drain_all_stock_async(void)
1600static void drain_all_stock_sync(void) 2130static void drain_all_stock_sync(void)
1601{ 2131{
1602 /* called when force_empty is called */ 2132 /* called when force_empty is called */
1603 atomic_inc(&memcg_drain_count); 2133 mutex_lock(&percpu_charge_mutex);
1604 schedule_on_each_cpu(drain_local_stock); 2134 schedule_on_each_cpu(drain_local_stock);
1605 atomic_dec(&memcg_drain_count); 2135 mutex_unlock(&percpu_charge_mutex);
1606} 2136}
1607 2137
1608static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, 2138/*
2139 * This function drains percpu counter value from DEAD cpu and
2140 * move it to local cpu. Note that this function can be preempted.
2141 */
2142static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
2143{
2144 int i;
2145
2146 spin_lock(&mem->pcp_counter_lock);
2147 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
2148 long x = per_cpu(mem->stat->count[i], cpu);
2149
2150 per_cpu(mem->stat->count[i], cpu) = 0;
2151 mem->nocpu_base.count[i] += x;
2152 }
2153 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
2154 unsigned long x = per_cpu(mem->stat->events[i], cpu);
2155
2156 per_cpu(mem->stat->events[i], cpu) = 0;
2157 mem->nocpu_base.events[i] += x;
2158 }
2159 /* need to clear ON_MOVE value, works as a kind of lock. */
2160 per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
2161 spin_unlock(&mem->pcp_counter_lock);
2162}
2163
2164static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
2165{
2166 int idx = MEM_CGROUP_ON_MOVE;
2167
2168 spin_lock(&mem->pcp_counter_lock);
2169 per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
2170 spin_unlock(&mem->pcp_counter_lock);
2171}
2172
2173static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
1609 unsigned long action, 2174 unsigned long action,
1610 void *hcpu) 2175 void *hcpu)
1611{ 2176{
1612 int cpu = (unsigned long)hcpu; 2177 int cpu = (unsigned long)hcpu;
1613 struct memcg_stock_pcp *stock; 2178 struct memcg_stock_pcp *stock;
2179 struct mem_cgroup *iter;
1614 2180
1615 if (action != CPU_DEAD) 2181 if ((action == CPU_ONLINE)) {
2182 for_each_mem_cgroup_all(iter)
2183 synchronize_mem_cgroup_on_move(iter, cpu);
1616 return NOTIFY_OK; 2184 return NOTIFY_OK;
2185 }
2186
2187 if ((action != CPU_DEAD) && (action != CPU_DEAD_FROZEN))
2188 return NOTIFY_OK;
2189
2190 for_each_mem_cgroup_all(iter)
2191 mem_cgroup_drain_pcp_counter(iter, cpu);
2192
1617 stock = &per_cpu(memcg_stock, cpu); 2193 stock = &per_cpu(memcg_stock, cpu);
1618 drain_stock(stock); 2194 drain_stock(stock);
1619 return NOTIFY_OK; 2195 return NOTIFY_OK;
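When a cpu goes away its per-cpu statistics are folded into a "nocpu_base" accumulator so nothing is lost from the totals; a small model of that folding with a 2-D array in place of the percpu counters (the ON_MOVE hand-back on CPU_ONLINE is not modelled here).

/* Model of folding a dead cpu's counters into a fallback accumulator. */
#include <stdio.h>

#define NR_CPUS 2
#define NR_STATS 3

static long percpu[NR_CPUS][NR_STATS] = {
	{ 5, 7, 1 },
	{ 2, 3, 4 },
};
static long nocpu_base[NR_STATS];

static void drain_pcp_counter(int cpu)
{
	for (int i = 0; i < NR_STATS; i++) {
		nocpu_base[i] += percpu[cpu][i];
		percpu[cpu][i] = 0;	/* the dead cpu no longer contributes */
	}
}

static long read_stat(int i)
{
	long sum = nocpu_base[i];

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		sum += percpu[cpu][i];
	return sum;
}

int main(void)
{
	printf("stat 0 before: %ld\n", read_stat(0));
	drain_pcp_counter(1);		/* cpu 1 went offline */
	printf("stat 0 after:  %ld\n", read_stat(0));	/* total is unchanged */
	return 0;
}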
@@ -1629,9 +2205,10 @@ enum {
1629 CHARGE_OOM_DIE, /* the current is killed because of OOM */ 2205 CHARGE_OOM_DIE, /* the current is killed because of OOM */
1630}; 2206};
1631 2207
1632static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, 2208static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1633 int csize, bool oom_check) 2209 unsigned int nr_pages, bool oom_check)
1634{ 2210{
2211 unsigned long csize = nr_pages * PAGE_SIZE;
1635 struct mem_cgroup *mem_over_limit; 2212 struct mem_cgroup *mem_over_limit;
1636 struct res_counter *fail_res; 2213 struct res_counter *fail_res;
1637 unsigned long flags = 0; 2214 unsigned long flags = 0;
@@ -1646,27 +2223,38 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1646 if (likely(!ret)) 2223 if (likely(!ret))
1647 return CHARGE_OK; 2224 return CHARGE_OK;
1648 2225
2226 res_counter_uncharge(&mem->res, csize);
1649 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2227 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
1650 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2228 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1651 } else 2229 } else
1652 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2230 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
1653 2231 /*
1654 if (csize > PAGE_SIZE) /* change csize and retry */ 2232 * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch
2233 * of regular pages (CHARGE_BATCH), or a single regular page (1).
2234 *
2235 * Never reclaim on behalf of optional batching, retry with a
2236 * single page instead.
2237 */
2238 if (nr_pages == CHARGE_BATCH)
1655 return CHARGE_RETRY; 2239 return CHARGE_RETRY;
1656 2240
1657 if (!(gfp_mask & __GFP_WAIT)) 2241 if (!(gfp_mask & __GFP_WAIT))
1658 return CHARGE_WOULDBLOCK; 2242 return CHARGE_WOULDBLOCK;
1659 2243
1660 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, 2244 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1661 gfp_mask, flags); 2245 gfp_mask, flags, NULL);
2246 if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2247 return CHARGE_RETRY;
1662 /* 2248 /*
1663 * try_to_free_mem_cgroup_pages() might not give us a full 2249 * Even though the limit is exceeded at this point, reclaim
1664 * picture of reclaim. Some pages are reclaimed and might be 2250 * may have been able to free some pages. Retry the charge
1665 * moved to swap cache or just unmapped from the cgroup. 2251 * before killing the task.
1666 * Check the limit again to see if the reclaim reduced the 2252 *
1667 * current usage of the cgroup before giving up 2253 * Only for regular pages, though: huge pages are rather
2254 * unlikely to succeed so close to the limit, and we fall back
2255 * to regular pages anyway in case of failure.
1668 */ 2256 */
1669 if (ret || mem_cgroup_check_under_limit(mem_over_limit)) 2257 if (nr_pages == 1 && ret)
1670 return CHARGE_RETRY; 2258 return CHARGE_RETRY;
1671 2259
1672 /* 2260 /*
@@ -1691,12 +2279,15 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1691 * oom-killer can be invoked. 2279 * oom-killer can be invoked.
1692 */ 2280 */
1693static int __mem_cgroup_try_charge(struct mm_struct *mm, 2281static int __mem_cgroup_try_charge(struct mm_struct *mm,
1694 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) 2282 gfp_t gfp_mask,
2283 unsigned int nr_pages,
2284 struct mem_cgroup **memcg,
2285 bool oom)
1695{ 2286{
2287 unsigned int batch = max(CHARGE_BATCH, nr_pages);
1696 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2288 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1697 struct mem_cgroup *mem = NULL; 2289 struct mem_cgroup *mem = NULL;
1698 int ret; 2290 int ret;
1699 int csize = CHARGE_SIZE;
1700 2291
1701 /* 2292 /*
1702 * Unlike the global VM's OOM-kill, we're not in memory shortage 2293 * Unlike the global VM's OOM-kill, we're not in memory shortage
@@ -1721,7 +2312,7 @@ again:
1721 VM_BUG_ON(css_is_removed(&mem->css)); 2312 VM_BUG_ON(css_is_removed(&mem->css));
1722 if (mem_cgroup_is_root(mem)) 2313 if (mem_cgroup_is_root(mem))
1723 goto done; 2314 goto done;
1724 if (consume_stock(mem)) 2315 if (nr_pages == 1 && consume_stock(mem))
1725 goto done; 2316 goto done;
1726 css_get(&mem->css); 2317 css_get(&mem->css);
1727 } else { 2318 } else {
@@ -1729,23 +2320,22 @@ again:
1729 2320
1730 rcu_read_lock(); 2321 rcu_read_lock();
1731 p = rcu_dereference(mm->owner); 2322 p = rcu_dereference(mm->owner);
1732 VM_BUG_ON(!p);
1733 /* 2323 /*
1734 * because we don't have task_lock(), "p" can exit while 2324 * Because we don't have task_lock(), "p" can exit.
1735 * we're here. In that case, "mem" can point to root 2325 * In that case, "mem" can point to root or p can be NULL with
1736 * cgroup but never be NULL. (and task_struct itself is freed 2326 * race with swapoff. Then, we have small risk of mis-accounting.
1737 * by RCU, cgroup itself is RCU safe.) Then, we have small 2327 * But such kind of mis-account by race always happens because
1738 * risk here to get wrong cgroup. But such kind of mis-account 2328 * we don't have cgroup_mutex(). It's overkill and we allow that
1739 * by race always happens because we don't have cgroup_mutex(). 2329 * small race, here.
1740 * It's overkill and we allow that small race, here. 2330 * (*) swapoff et al. will charge against mm-struct not against
2331 * task-struct. So, mm->owner can be NULL.
1741 */ 2332 */
1742 mem = mem_cgroup_from_task(p); 2333 mem = mem_cgroup_from_task(p);
1743 VM_BUG_ON(!mem); 2334 if (!mem || mem_cgroup_is_root(mem)) {
1744 if (mem_cgroup_is_root(mem)) {
1745 rcu_read_unlock(); 2335 rcu_read_unlock();
1746 goto done; 2336 goto done;
1747 } 2337 }
1748 if (consume_stock(mem)) { 2338 if (nr_pages == 1 && consume_stock(mem)) {
1749 /* 2339 /*
1750 * It seems dangerous to access memcg without css_get(). 2340 * It seems dangerous to access memcg without css_get().
1751 * But considering how consume_stock works, it's not 2341 * But considering how consume_stock works, it's not
@@ -1780,13 +2370,12 @@ again:
1780 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; 2370 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1781 } 2371 }
1782 2372
1783 ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check); 2373 ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check);
1784
1785 switch (ret) { 2374 switch (ret) {
1786 case CHARGE_OK: 2375 case CHARGE_OK:
1787 break; 2376 break;
1788 case CHARGE_RETRY: /* not in OOM situation but retry */ 2377 case CHARGE_RETRY: /* not in OOM situation but retry */
1789 csize = PAGE_SIZE; 2378 batch = nr_pages;
1790 css_put(&mem->css); 2379 css_put(&mem->css);
1791 mem = NULL; 2380 mem = NULL;
1792 goto again; 2381 goto again;
@@ -1807,8 +2396,8 @@ again:
1807 } 2396 }
1808 } while (ret != CHARGE_OK); 2397 } while (ret != CHARGE_OK);
1809 2398
1810 if (csize > PAGE_SIZE) 2399 if (batch > nr_pages)
1811 refill_stock(mem, csize - PAGE_SIZE); 2400 refill_stock(mem, batch - nr_pages);
1812 css_put(&mem->css); 2401 css_put(&mem->css);
1813done: 2402done:
1814 *memcg = mem; 2403 *memcg = mem;
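The batch size is simply the larger of the normal batch and the request, so a single-page fault charges 32 pages and parks 31 in the stock, while a request for a 512-page huge page (e.g. a 2 MiB page with 4 KiB base pages) charges exactly 512 and parks nothing. A two-line check of that arithmetic; the show() helper is just for illustration.

/* Batch sizing: charge max(CHARGE_BATCH, nr_pages), park the surplus in the stock. */
#include <stdio.h>

#define CHARGE_BATCH 32U

static void show(unsigned int nr_pages)
{
	unsigned int batch = nr_pages > CHARGE_BATCH ? nr_pages : CHARGE_BATCH;

	printf("request %3u -> charge %3u, refill stock with %3u\n",
	       nr_pages, batch, batch - nr_pages);
}

int main(void)
{
	show(1);	/* ordinary page fault */
	show(512);	/* e.g. a 2 MiB huge page with 4 KiB base pages */
	return 0;
}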
@@ -1827,20 +2416,17 @@ bypass:
1827 * gotten by try_charge(). 2416 * gotten by try_charge().
1828 */ 2417 */
1829static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, 2418static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
1830 unsigned long count) 2419 unsigned int nr_pages)
1831{ 2420{
1832 if (!mem_cgroup_is_root(mem)) { 2421 if (!mem_cgroup_is_root(mem)) {
1833 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 2422 unsigned long bytes = nr_pages * PAGE_SIZE;
2423
2424 res_counter_uncharge(&mem->res, bytes);
1834 if (do_swap_account) 2425 if (do_swap_account)
1835 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); 2426 res_counter_uncharge(&mem->memsw, bytes);
1836 } 2427 }
1837} 2428}
1838 2429
1839static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
1840{
1841 __mem_cgroup_cancel_charge(mem, 1);
1842}
1843
1844/* 2430/*
1845 * A helper function to get mem_cgroup from ID. must be called under 2431 * A helper function to get mem_cgroup from ID. must be called under
1846 * rcu_read_lock(). The caller must check css_is_removed() or some if 2432 * rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -1888,26 +2474,22 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
1888 return mem; 2474 return mem;
1889} 2475}
1890 2476
1891/*
1892 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
1893 * USED state. If already USED, uncharge and return.
1894 */
1895
1896static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, 2477static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1897 struct page_cgroup *pc, 2478 struct page *page,
1898 enum charge_type ctype) 2479 unsigned int nr_pages,
2480 struct page_cgroup *pc,
2481 enum charge_type ctype)
1899{ 2482{
1900 /* try_charge() can return NULL to *memcg, taking care of it. */
1901 if (!mem)
1902 return;
1903
1904 lock_page_cgroup(pc); 2483 lock_page_cgroup(pc);
1905 if (unlikely(PageCgroupUsed(pc))) { 2484 if (unlikely(PageCgroupUsed(pc))) {
1906 unlock_page_cgroup(pc); 2485 unlock_page_cgroup(pc);
1907 mem_cgroup_cancel_charge(mem); 2486 __mem_cgroup_cancel_charge(mem, nr_pages);
1908 return; 2487 return;
1909 } 2488 }
1910 2489 /*
2490 * we don't need lock_page_cgroup() on tail pages, because they are not
2491 * accessed by any other context at this point.
2492 */
1911 pc->mem_cgroup = mem; 2493 pc->mem_cgroup = mem;
1912 /* 2494 /*
1913 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2495 * We access a page_cgroup asynchronously without lock_page_cgroup().
@@ -1931,19 +2513,62 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1931 break; 2513 break;
1932 } 2514 }
1933 2515
1934 mem_cgroup_charge_statistics(mem, pc, true); 2516 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages);
1935
1936 unlock_page_cgroup(pc); 2517 unlock_page_cgroup(pc);
1937 /* 2518 /*
1938 * "charge_statistics" updated event counter. Then, check it. 2519 * "charge_statistics" updated event counter. Then, check it.
1939 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2520 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
1940 * if they exceeds softlimit. 2521 * if they exceeds softlimit.
1941 */ 2522 */
1942 memcg_check_events(mem, pc->page); 2523 memcg_check_events(mem, page);
2524}
2525
2526#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2527
2528#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\
2529 (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION))
2530/*
2531 * Because tail pages are not marked as "used", set them here. We're under
2532 * zone->lru_lock, 'splitting on pmd' and compound_lock.
2533 */
2534void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail)
2535{
2536 struct page_cgroup *head_pc = lookup_page_cgroup(head);
2537 struct page_cgroup *tail_pc = lookup_page_cgroup(tail);
2538 unsigned long flags;
2539
2540 if (mem_cgroup_disabled())
2541 return;
2542 /*
2543 * We have no races with charge/uncharge but will have races with
2544 * page state accounting.
2545 */
2546 move_lock_page_cgroup(head_pc, &flags);
2547
2548 tail_pc->mem_cgroup = head_pc->mem_cgroup;
2549 smp_wmb(); /* see __commit_charge() */
2550 if (PageCgroupAcctLRU(head_pc)) {
2551 enum lru_list lru;
2552 struct mem_cgroup_per_zone *mz;
2553
2554 /*
2555 * LRU flags cannot be copied because we need to add the tail
2556 * page to the LRU by a generic call, and our hook will be called.
2557 * We hold lru_lock, so reduce the counter directly.
2558 */
2559 lru = page_lru(head);
2560 mz = page_cgroup_zoneinfo(head_pc->mem_cgroup, head);
2561 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
2562 }
2563 tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
2564 move_unlock_page_cgroup(head_pc, &flags);
1943} 2565}
2566#endif
1944 2567
1945/** 2568/**
1946 * __mem_cgroup_move_account - move account of the page 2569 * mem_cgroup_move_account - move account of the page
2570 * @page: the page
2571 * @nr_pages: number of regular pages (>1 for huge pages)
1947 * @pc: page_cgroup of the page. 2572 * @pc: page_cgroup of the page.
1948 * @from: mem_cgroup which the page is moved from. 2573 * @from: mem_cgroup which the page is moved from.
1949 * @to: mem_cgroup which the page is moved to. @from != @to. 2574 * @to: mem_cgroup which the page is moved to. @from != @to.
@@ -1951,22 +2576,42 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1951 * 2576 *
1952 * The caller must confirm following. 2577 * The caller must confirm following.
1953 * - page is not on LRU (isolate_page() is useful.) 2578 * - page is not on LRU (isolate_page() is useful.)
1954 * - the pc is locked, used, and ->mem_cgroup points to @from. 2579 * - compound_lock is held when nr_pages > 1
1955 * 2580 *
1956 * This function doesn't do "charge" nor css_get to new cgroup. It should be 2581 * This function doesn't do "charge" nor css_get to new cgroup. It should be
1957 * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is 2582 * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is
1958 * true, this function does "uncharge" from old cgroup, but it doesn't if 2583 * true, this function does "uncharge" from old cgroup, but it doesn't if
1959 * @uncharge is false, so a caller should do "uncharge". 2584 * @uncharge is false, so a caller should do "uncharge".
1960 */ 2585 */
1961 2586static int mem_cgroup_move_account(struct page *page,
1962static void __mem_cgroup_move_account(struct page_cgroup *pc, 2587 unsigned int nr_pages,
1963 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) 2588 struct page_cgroup *pc,
2589 struct mem_cgroup *from,
2590 struct mem_cgroup *to,
2591 bool uncharge)
1964{ 2592{
2593 unsigned long flags;
2594 int ret;
2595
1965 VM_BUG_ON(from == to); 2596 VM_BUG_ON(from == to);
1966 VM_BUG_ON(PageLRU(pc->page)); 2597 VM_BUG_ON(PageLRU(page));
1967 VM_BUG_ON(!PageCgroupLocked(pc)); 2598 /*
1968 VM_BUG_ON(!PageCgroupUsed(pc)); 2599 * The page is isolated from LRU. So, collapse function
1969 VM_BUG_ON(pc->mem_cgroup != from); 2600 * will not handle this page. But page splitting can happen.
2601 * Do this check under compound_page_lock(). The caller should
2602 * hold it.
2603 */
2604 ret = -EBUSY;
2605 if (nr_pages > 1 && !PageTransHuge(page))
2606 goto out;
2607
2608 lock_page_cgroup(pc);
2609
2610 ret = -EINVAL;
2611 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
2612 goto unlock;
2613
2614 move_lock_page_cgroup(pc, &flags);
1970 2615
1971 if (PageCgroupFileMapped(pc)) { 2616 if (PageCgroupFileMapped(pc)) {
1972 /* Update mapped_file data for mem_cgroup */ 2617 /* Update mapped_file data for mem_cgroup */
@@ -1975,42 +2620,31 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc,
1975 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); 2620 __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
1976 preempt_enable(); 2621 preempt_enable();
1977 } 2622 }
1978 mem_cgroup_charge_statistics(from, pc, false); 2623 mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages);
1979 if (uncharge) 2624 if (uncharge)
1980 /* This is not "cancel", but cancel_charge does all we need. */ 2625 /* This is not "cancel", but cancel_charge does all we need. */
1981 mem_cgroup_cancel_charge(from); 2626 __mem_cgroup_cancel_charge(from, nr_pages);
1982 2627
1983 /* caller should have done css_get */ 2628 /* caller should have done css_get */
1984 pc->mem_cgroup = to; 2629 pc->mem_cgroup = to;
1985 mem_cgroup_charge_statistics(to, pc, true); 2630 mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages);
1986 /* 2631 /*
1987 * We charge against "to" which may not have any tasks. Then, "to" 2632 * We charge against "to" which may not have any tasks. Then, "to"
1988 * can be under rmdir(). But in current implementation, caller of 2633 * can be under rmdir(). But in current implementation, caller of
1989 * this function is just force_empty() and move charge, so it's 2634 * this function is just force_empty() and move charge, so it's
1990 * garanteed that "to" is never removed. So, we don't check rmdir 2635 * guaranteed that "to" is never removed. So, we don't check rmdir
1991 * status here. 2636 * status here.
1992 */ 2637 */
1993} 2638 move_unlock_page_cgroup(pc, &flags);
1994 2639 ret = 0;
1995/* 2640unlock:
1996 * check whether the @pc is valid for moving account and call
1997 * __mem_cgroup_move_account()
1998 */
1999static int mem_cgroup_move_account(struct page_cgroup *pc,
2000 struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
2001{
2002 int ret = -EINVAL;
2003 lock_page_cgroup(pc);
2004 if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
2005 __mem_cgroup_move_account(pc, from, to, uncharge);
2006 ret = 0;
2007 }
2008 unlock_page_cgroup(pc); 2641 unlock_page_cgroup(pc);
2009 /* 2642 /*
2010 * check events 2643 * check events
2011 */ 2644 */
2012 memcg_check_events(to, pc->page); 2645 memcg_check_events(to, page);
2013 memcg_check_events(from, pc->page); 2646 memcg_check_events(from, page);
2647out:
2014 return ret; 2648 return ret;
2015} 2649}
2016 2650
@@ -2018,14 +2652,16 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
2018 * move charges to its parent. 2652 * move charges to its parent.
2019 */ 2653 */
2020 2654
2021static int mem_cgroup_move_parent(struct page_cgroup *pc, 2655static int mem_cgroup_move_parent(struct page *page,
2656 struct page_cgroup *pc,
2022 struct mem_cgroup *child, 2657 struct mem_cgroup *child,
2023 gfp_t gfp_mask) 2658 gfp_t gfp_mask)
2024{ 2659{
2025 struct page *page = pc->page;
2026 struct cgroup *cg = child->css.cgroup; 2660 struct cgroup *cg = child->css.cgroup;
2027 struct cgroup *pcg = cg->parent; 2661 struct cgroup *pcg = cg->parent;
2028 struct mem_cgroup *parent; 2662 struct mem_cgroup *parent;
2663 unsigned int nr_pages;
2664 unsigned long uninitialized_var(flags);
2029 int ret; 2665 int ret;
2030 2666
2031 /* Is ROOT ? */ 2667 /* Is ROOT ? */
@@ -2038,14 +2674,22 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
2038 if (isolate_lru_page(page)) 2674 if (isolate_lru_page(page))
2039 goto put; 2675 goto put;
2040 2676
2677 nr_pages = hpage_nr_pages(page);
2678
2041 parent = mem_cgroup_from_cont(pcg); 2679 parent = mem_cgroup_from_cont(pcg);
2042 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); 2680 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false);
2043 if (ret || !parent) 2681 if (ret || !parent)
2044 goto put_back; 2682 goto put_back;
2045 2683
2046 ret = mem_cgroup_move_account(pc, child, parent, true); 2684 if (nr_pages > 1)
2685 flags = compound_lock_irqsave(page);
2686
2687 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true);
2047 if (ret) 2688 if (ret)
2048 mem_cgroup_cancel_charge(parent); 2689 __mem_cgroup_cancel_charge(parent, nr_pages);
2690
2691 if (nr_pages > 1)
2692 compound_unlock_irqrestore(page, flags);
2049put_back: 2693put_back:
2050 putback_lru_page(page); 2694 putback_lru_page(page);
2051put: 2695put:
@@ -2064,20 +2708,29 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
2064 gfp_t gfp_mask, enum charge_type ctype) 2708 gfp_t gfp_mask, enum charge_type ctype)
2065{ 2709{
2066 struct mem_cgroup *mem = NULL; 2710 struct mem_cgroup *mem = NULL;
2711 unsigned int nr_pages = 1;
2067 struct page_cgroup *pc; 2712 struct page_cgroup *pc;
2713 bool oom = true;
2068 int ret; 2714 int ret;
2069 2715
2716 if (PageTransHuge(page)) {
2717 nr_pages <<= compound_order(page);
2718 VM_BUG_ON(!PageTransHuge(page));
2719 /*
2720 * Never OOM-kill a process for a huge page. The
2721 * fault handler will fall back to regular pages.
2722 */
2723 oom = false;
2724 }
2725
2070 pc = lookup_page_cgroup(page); 2726 pc = lookup_page_cgroup(page);
2071 /* can happen at boot */ 2727 BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */
2072 if (unlikely(!pc))
2073 return 0;
2074 prefetchw(pc);
2075 2728
2076 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 2729 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom);
2077 if (ret || !mem) 2730 if (ret || !mem)
2078 return ret; 2731 return ret;
2079 2732
2080 __mem_cgroup_commit_charge(mem, pc, ctype); 2733 __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype);
2081 return 0; 2734 return 0;
2082} 2735}
2083 2736
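To make the nr_pages arithmetic in mem_cgroup_charge_common() concrete, here is a worked sketch under the usual x86 assumptions (4 KiB base pages, 2 MiB transparent huge pages); the helper name and numbers are illustrative only:

	/* Sketch: how many base pages one charge covers for a THP head page. */
	static unsigned int thp_charge_pages(struct page *page)
	{
		unsigned int nr_pages = 1;

		if (PageTransHuge(page))
			nr_pages <<= compound_order(page);	/* 1 << 9 == 512 */
		return nr_pages;	/* 512 * PAGE_SIZE == 2 MiB charged at once */
	}

Because oom is forced to false for the huge case, a failed 2 MiB charge does not OOM-kill anything; the fault handler simply retries with regular 4 KiB pages.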
@@ -2086,8 +2739,6 @@ int mem_cgroup_newpage_charge(struct page *page,
2086{ 2739{
2087 if (mem_cgroup_disabled()) 2740 if (mem_cgroup_disabled())
2088 return 0; 2741 return 0;
2089 if (PageCompound(page))
2090 return 0;
2091 /* 2742 /*
2092 * If already mapped, we don't have to account. 2743 * If already mapped, we don't have to account.
2093 * If page cache, page->mapping has address_space. 2744 * If page cache, page->mapping has address_space.
@@ -2107,9 +2758,26 @@ static void
2107__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2758__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2108 enum charge_type ctype); 2759 enum charge_type ctype);
2109 2760
2761static void
2762__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem,
2763 enum charge_type ctype)
2764{
2765 struct page_cgroup *pc = lookup_page_cgroup(page);
2766 /*
2767 * In some case, SwapCache, FUSE(splice_buf->radixtree), the page
2768 * is already on LRU. It means the page may on some other page_cgroup's
2769 * LRU. Take care of it.
2770 */
2771 mem_cgroup_lru_del_before_commit(page);
2772 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
2773 mem_cgroup_lru_add_after_commit(page);
2774 return;
2775}
2776
2110int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2777int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2111 gfp_t gfp_mask) 2778 gfp_t gfp_mask)
2112{ 2779{
2780 struct mem_cgroup *mem = NULL;
2113 int ret; 2781 int ret;
2114 2782
2115 if (mem_cgroup_disabled()) 2783 if (mem_cgroup_disabled())
@@ -2144,14 +2812,22 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2144 if (unlikely(!mm)) 2812 if (unlikely(!mm))
2145 mm = &init_mm; 2813 mm = &init_mm;
2146 2814
2147 if (page_is_file_cache(page)) 2815 if (page_is_file_cache(page)) {
2148 return mem_cgroup_charge_common(page, mm, gfp_mask, 2816 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true);
2149 MEM_CGROUP_CHARGE_TYPE_CACHE); 2817 if (ret || !mem)
2818 return ret;
2150 2819
2820 /*
2821 * FUSE reuses pages without going through the final
2822 * put that would remove them from the LRU list, make
2823 * sure that they get relinked properly.
2824 */
2825 __mem_cgroup_commit_charge_lrucare(page, mem,
2826 MEM_CGROUP_CHARGE_TYPE_CACHE);
2827 return ret;
2828 }
2151 /* shmem */ 2829 /* shmem */
2152 if (PageSwapCache(page)) { 2830 if (PageSwapCache(page)) {
2153 struct mem_cgroup *mem = NULL;
2154
2155 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2831 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2156 if (!ret) 2832 if (!ret)
2157 __mem_cgroup_commit_charge_swapin(page, mem, 2833 __mem_cgroup_commit_charge_swapin(page, mem,
@@ -2176,6 +2852,8 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2176 struct mem_cgroup *mem; 2852 struct mem_cgroup *mem;
2177 int ret; 2853 int ret;
2178 2854
2855 *ptr = NULL;
2856
2179 if (mem_cgroup_disabled()) 2857 if (mem_cgroup_disabled())
2180 return 0; 2858 return 0;
2181 2859
@@ -2193,30 +2871,26 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2193 if (!mem) 2871 if (!mem)
2194 goto charge_cur_mm; 2872 goto charge_cur_mm;
2195 *ptr = mem; 2873 *ptr = mem;
2196 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 2874 ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true);
2197 css_put(&mem->css); 2875 css_put(&mem->css);
2198 return ret; 2876 return ret;
2199charge_cur_mm: 2877charge_cur_mm:
2200 if (unlikely(!mm)) 2878 if (unlikely(!mm))
2201 mm = &init_mm; 2879 mm = &init_mm;
2202 return __mem_cgroup_try_charge(mm, mask, ptr, true); 2880 return __mem_cgroup_try_charge(mm, mask, 1, ptr, true);
2203} 2881}
2204 2882
2205static void 2883static void
2206__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, 2884__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2207 enum charge_type ctype) 2885 enum charge_type ctype)
2208{ 2886{
2209 struct page_cgroup *pc;
2210
2211 if (mem_cgroup_disabled()) 2887 if (mem_cgroup_disabled())
2212 return; 2888 return;
2213 if (!ptr) 2889 if (!ptr)
2214 return; 2890 return;
2215 cgroup_exclude_rmdir(&ptr->css); 2891 cgroup_exclude_rmdir(&ptr->css);
2216 pc = lookup_page_cgroup(page); 2892
2217 mem_cgroup_lru_del_before_commit_swapcache(page); 2893 __mem_cgroup_commit_charge_lrucare(page, ptr, ctype);
2218 __mem_cgroup_commit_charge(ptr, pc, ctype);
2219 mem_cgroup_lru_add_after_commit_swapcache(page);
2220 /* 2894 /*
2221 * Now swap is on-memory. This means this page may be 2895 * Now swap is on-memory. This means this page may be
2222 * counted both as mem and swap....double count. 2896 * counted both as mem and swap....double count.
@@ -2264,14 +2938,16 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
2264 return; 2938 return;
2265 if (!mem) 2939 if (!mem)
2266 return; 2940 return;
2267 mem_cgroup_cancel_charge(mem); 2941 __mem_cgroup_cancel_charge(mem, 1);
2268} 2942}
2269 2943
2270static void 2944static void mem_cgroup_do_uncharge(struct mem_cgroup *mem,
2271__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) 2945 unsigned int nr_pages,
2946 const enum charge_type ctype)
2272{ 2947{
2273 struct memcg_batch_info *batch = NULL; 2948 struct memcg_batch_info *batch = NULL;
2274 bool uncharge_memsw = true; 2949 bool uncharge_memsw = true;
2950
2275 /* If swapout, usage of swap doesn't decrease */ 2951 /* If swapout, usage of swap doesn't decrease */
2276 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2952 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2277 uncharge_memsw = false; 2953 uncharge_memsw = false;
@@ -2286,7 +2962,7 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2286 batch->memcg = mem; 2962 batch->memcg = mem;
2287 /* 2963 /*
2288 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 2964 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
2289 * In those cases, all pages freed continously can be expected to be in 2965 * In those cases, all pages freed continuously can be expected to be in
2290 * the same cgroup and we have chance to coalesce uncharges. 2966 * the same cgroup and we have chance to coalesce uncharges.
2291 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) 2967 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
2292 * because we want to do uncharge as soon as possible. 2968 * because we want to do uncharge as soon as possible.
@@ -2295,6 +2971,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2295 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 2971 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
2296 goto direct_uncharge; 2972 goto direct_uncharge;
2297 2973
2974 if (nr_pages > 1)
2975 goto direct_uncharge;
2976
2298 /* 2977 /*
2299 * In typical case, batch->memcg == mem. This means we can 2978 * In typical case, batch->memcg == mem. This means we can
2300 * merge a series of uncharges to an uncharge of res_counter. 2979 * merge a series of uncharges to an uncharge of res_counter.
@@ -2303,14 +2982,14 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
2303 if (batch->memcg != mem) 2982 if (batch->memcg != mem)
2304 goto direct_uncharge; 2983 goto direct_uncharge;
2305 /* remember freed charge and uncharge it later */ 2984 /* remember freed charge and uncharge it later */
2306 batch->bytes += PAGE_SIZE; 2985 batch->nr_pages++;
2307 if (uncharge_memsw) 2986 if (uncharge_memsw)
2308 batch->memsw_bytes += PAGE_SIZE; 2987 batch->memsw_nr_pages++;
2309 return; 2988 return;
2310direct_uncharge: 2989direct_uncharge:
2311 res_counter_uncharge(&mem->res, PAGE_SIZE); 2990 res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE);
2312 if (uncharge_memsw) 2991 if (uncharge_memsw)
2313 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 2992 res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE);
2314 if (unlikely(batch->memcg != mem)) 2993 if (unlikely(batch->memcg != mem))
2315 memcg_oom_recover(mem); 2994 memcg_oom_recover(mem);
2316 return; 2995 return;
@@ -2322,8 +3001,9 @@ direct_uncharge:
2322static struct mem_cgroup * 3001static struct mem_cgroup *
2323__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 3002__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2324{ 3003{
2325 struct page_cgroup *pc;
2326 struct mem_cgroup *mem = NULL; 3004 struct mem_cgroup *mem = NULL;
3005 unsigned int nr_pages = 1;
3006 struct page_cgroup *pc;
2327 3007
2328 if (mem_cgroup_disabled()) 3008 if (mem_cgroup_disabled())
2329 return NULL; 3009 return NULL;
@@ -2331,6 +3011,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2331 if (PageSwapCache(page)) 3011 if (PageSwapCache(page))
2332 return NULL; 3012 return NULL;
2333 3013
3014 if (PageTransHuge(page)) {
3015 nr_pages <<= compound_order(page);
3016 VM_BUG_ON(!PageTransHuge(page));
3017 }
2334 /* 3018 /*
2335 * Check if our page_cgroup is valid 3019 * Check if our page_cgroup is valid
2336 */ 3020 */
@@ -2363,7 +3047,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2363 break; 3047 break;
2364 } 3048 }
2365 3049
2366 mem_cgroup_charge_statistics(mem, pc, false); 3050 mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages);
2367 3051
2368 ClearPageCgroupUsed(pc); 3052 ClearPageCgroupUsed(pc);
2369 /* 3053 /*
@@ -2384,7 +3068,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2384 mem_cgroup_get(mem); 3068 mem_cgroup_get(mem);
2385 } 3069 }
2386 if (!mem_cgroup_is_root(mem)) 3070 if (!mem_cgroup_is_root(mem))
2387 __do_uncharge(mem, ctype); 3071 mem_cgroup_do_uncharge(mem, nr_pages, ctype);
2388 3072
2389 return mem; 3073 return mem;
2390 3074
@@ -2424,8 +3108,8 @@ void mem_cgroup_uncharge_start(void)
2424 /* We can do nest. */ 3108 /* We can do nest. */
2425 if (current->memcg_batch.do_batch == 1) { 3109 if (current->memcg_batch.do_batch == 1) {
2426 current->memcg_batch.memcg = NULL; 3110 current->memcg_batch.memcg = NULL;
2427 current->memcg_batch.bytes = 0; 3111 current->memcg_batch.nr_pages = 0;
2428 current->memcg_batch.memsw_bytes = 0; 3112 current->memcg_batch.memsw_nr_pages = 0;
2429 } 3113 }
2430} 3114}
2431 3115
@@ -2446,10 +3130,12 @@ void mem_cgroup_uncharge_end(void)
2446 * This "batch->memcg" is valid without any css_get/put etc... 3130 * This "batch->memcg" is valid without any css_get/put etc...
2447 * because we hide charges behind us. 3131 * because we hide charges behind us.
2448 */ 3132 */
2449 if (batch->bytes) 3133 if (batch->nr_pages)
2450 res_counter_uncharge(&batch->memcg->res, batch->bytes); 3134 res_counter_uncharge(&batch->memcg->res,
2451 if (batch->memsw_bytes) 3135 batch->nr_pages * PAGE_SIZE);
2452 res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); 3136 if (batch->memsw_nr_pages)
3137 res_counter_uncharge(&batch->memcg->memsw,
3138 batch->memsw_nr_pages * PAGE_SIZE);
2453 memcg_oom_recover(batch->memcg); 3139 memcg_oom_recover(batch->memcg);
2454 /* forget this pointer (for sanity check) */ 3140 /* forget this pointer (for sanity check) */
2455 batch->memcg = NULL; 3141 batch->memcg = NULL;
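The nr_pages/memsw_nr_pages renaming is easiest to read next to the calling pattern. A rough sketch of how callers such as the truncate and unmap paths are expected to drive the batching (illustrative, not part of this hunk; note that huge pages bypass the batch and take the direct_uncharge path):

	/* Illustrative helper, assuming an array of already-unmapped anon pages. */
	static void uncharge_many(struct page **pages, int nr)
	{
		int i;

		mem_cgroup_uncharge_start();	/* current->memcg_batch.do_batch++ */
		for (i = 0; i < nr; i++)
			mem_cgroup_uncharge_page(pages[i]);	/* bumps batch->nr_pages */
		mem_cgroup_uncharge_end();	/* one res_counter_uncharge() of
						 * batch->nr_pages * PAGE_SIZE */
	}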
@@ -2572,13 +3258,16 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2572 * page belongs to. 3258 * page belongs to.
2573 */ 3259 */
2574int mem_cgroup_prepare_migration(struct page *page, 3260int mem_cgroup_prepare_migration(struct page *page,
2575 struct page *newpage, struct mem_cgroup **ptr) 3261 struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask)
2576{ 3262{
2577 struct page_cgroup *pc;
2578 struct mem_cgroup *mem = NULL; 3263 struct mem_cgroup *mem = NULL;
3264 struct page_cgroup *pc;
2579 enum charge_type ctype; 3265 enum charge_type ctype;
2580 int ret = 0; 3266 int ret = 0;
2581 3267
3268 *ptr = NULL;
3269
3270 VM_BUG_ON(PageTransHuge(page));
2582 if (mem_cgroup_disabled()) 3271 if (mem_cgroup_disabled())
2583 return 0; 3272 return 0;
2584 3273
@@ -2628,7 +3317,7 @@ int mem_cgroup_prepare_migration(struct page *page,
2628 return 0; 3317 return 0;
2629 3318
2630 *ptr = mem; 3319 *ptr = mem;
2631 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); 3320 ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false);
2632 css_put(&mem->css);/* drop extra refcnt */ 3321 css_put(&mem->css);/* drop extra refcnt */
2633 if (ret || *ptr == NULL) { 3322 if (ret || *ptr == NULL) {
2634 if (PageAnon(page)) { 3323 if (PageAnon(page)) {
@@ -2655,13 +3344,13 @@ int mem_cgroup_prepare_migration(struct page *page,
2655 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 3344 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
2656 else 3345 else
2657 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3346 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
2658 __mem_cgroup_commit_charge(mem, pc, ctype); 3347 __mem_cgroup_commit_charge(mem, page, 1, pc, ctype);
2659 return ret; 3348 return ret;
2660} 3349}
2661 3350
2662/* remove redundant charge if migration failed*/ 3351/* remove redundant charge if migration failed*/
2663void mem_cgroup_end_migration(struct mem_cgroup *mem, 3352void mem_cgroup_end_migration(struct mem_cgroup *mem,
2664 struct page *oldpage, struct page *newpage) 3353 struct page *oldpage, struct page *newpage, bool migration_ok)
2665{ 3354{
2666 struct page *used, *unused; 3355 struct page *used, *unused;
2667 struct page_cgroup *pc; 3356 struct page_cgroup *pc;
@@ -2670,8 +3359,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
2670 return; 3359 return;
2671 /* blocks rmdir() */ 3360 /* blocks rmdir() */
2672 cgroup_exclude_rmdir(&mem->css); 3361 cgroup_exclude_rmdir(&mem->css);
2673 /* at migration success, oldpage->mapping is NULL. */ 3362 if (!migration_ok) {
2674 if (oldpage->mapping) {
2675 used = oldpage; 3363 used = oldpage;
2676 unused = newpage; 3364 unused = newpage;
2677 } else { 3365 } else {
@@ -2721,7 +3409,7 @@ int mem_cgroup_shmem_charge_fallback(struct page *page,
2721 struct mm_struct *mm, 3409 struct mm_struct *mm,
2722 gfp_t gfp_mask) 3410 gfp_t gfp_mask)
2723{ 3411{
2724 struct mem_cgroup *mem = NULL; 3412 struct mem_cgroup *mem;
2725 int ret; 3413 int ret;
2726 3414
2727 if (mem_cgroup_disabled()) 3415 if (mem_cgroup_disabled())
@@ -2734,6 +3422,52 @@ int mem_cgroup_shmem_charge_fallback(struct page *page,
2734 return ret; 3422 return ret;
2735} 3423}
2736 3424
3425#ifdef CONFIG_DEBUG_VM
3426static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3427{
3428 struct page_cgroup *pc;
3429
3430 pc = lookup_page_cgroup(page);
3431 if (likely(pc) && PageCgroupUsed(pc))
3432 return pc;
3433 return NULL;
3434}
3435
3436bool mem_cgroup_bad_page_check(struct page *page)
3437{
3438 if (mem_cgroup_disabled())
3439 return false;
3440
3441 return lookup_page_cgroup_used(page) != NULL;
3442}
3443
3444void mem_cgroup_print_bad_page(struct page *page)
3445{
3446 struct page_cgroup *pc;
3447
3448 pc = lookup_page_cgroup_used(page);
3449 if (pc) {
3450 int ret = -1;
3451 char *path;
3452
3453 printk(KERN_ALERT "pc:%p pc->flags:%lx pc->mem_cgroup:%p",
3454 pc, pc->flags, pc->mem_cgroup);
3455
3456 path = kmalloc(PATH_MAX, GFP_KERNEL);
3457 if (path) {
3458 rcu_read_lock();
3459 ret = cgroup_path(pc->mem_cgroup->css.cgroup,
3460 path, PATH_MAX);
3461 rcu_read_unlock();
3462 }
3463
3464 printk(KERN_CONT "(%s)\n",
3465 (ret < 0) ? "cannot get the path" : path);
3466 kfree(path);
3467 }
3468}
3469#endif
3470
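For illustration only (addresses and cgroup path are invented), a hit in mem_cgroup_print_bad_page() would print a single line of roughly this shape:

	pc:ffff88007f9f2a30 pc->flags:45 pc->mem_cgroup:ffff880079c32000(/user/test)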
2737static DEFINE_MUTEX(set_limit_mutex); 3471static DEFINE_MUTEX(set_limit_mutex);
2738 3472
2739static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 3473static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
@@ -2791,7 +3525,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2791 break; 3525 break;
2792 3526
2793 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3527 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2794 MEM_CGROUP_RECLAIM_SHRINK); 3528 MEM_CGROUP_RECLAIM_SHRINK,
3529 NULL);
2795 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 3530 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
2796 /* Usage is reduced ? */ 3531 /* Usage is reduced ? */
2797 if (curusage >= oldusage) 3532 if (curusage >= oldusage)
@@ -2851,7 +3586,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2851 3586
2852 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, 3587 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2853 MEM_CGROUP_RECLAIM_NOSWAP | 3588 MEM_CGROUP_RECLAIM_NOSWAP |
2854 MEM_CGROUP_RECLAIM_SHRINK); 3589 MEM_CGROUP_RECLAIM_SHRINK,
3590 NULL);
2855 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 3591 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
2856 /* Usage is reduced ? */ 3592 /* Usage is reduced ? */
2857 if (curusage >= oldusage) 3593 if (curusage >= oldusage)
@@ -2865,7 +3601,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2865} 3601}
2866 3602
2867unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 3603unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2868 gfp_t gfp_mask) 3604 gfp_t gfp_mask,
3605 unsigned long *total_scanned)
2869{ 3606{
2870 unsigned long nr_reclaimed = 0; 3607 unsigned long nr_reclaimed = 0;
2871 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 3608 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -2873,6 +3610,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2873 int loop = 0; 3610 int loop = 0;
2874 struct mem_cgroup_tree_per_zone *mctz; 3611 struct mem_cgroup_tree_per_zone *mctz;
2875 unsigned long long excess; 3612 unsigned long long excess;
3613 unsigned long nr_scanned;
2876 3614
2877 if (order > 0) 3615 if (order > 0)
2878 return 0; 3616 return 0;
@@ -2891,10 +3629,13 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2891 if (!mz) 3629 if (!mz)
2892 break; 3630 break;
2893 3631
3632 nr_scanned = 0;
2894 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, 3633 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
2895 gfp_mask, 3634 gfp_mask,
2896 MEM_CGROUP_RECLAIM_SOFT); 3635 MEM_CGROUP_RECLAIM_SOFT,
3636 &nr_scanned);
2897 nr_reclaimed += reclaimed; 3637 nr_reclaimed += reclaimed;
3638 *total_scanned += nr_scanned;
2898 spin_lock(&mctz->lock); 3639 spin_lock(&mctz->lock);
2899 3640
2900 /* 3641 /*
@@ -2917,10 +3658,9 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2917 */ 3658 */
2918 next_mz = 3659 next_mz =
2919 __mem_cgroup_largest_soft_limit_node(mctz); 3660 __mem_cgroup_largest_soft_limit_node(mctz);
2920 if (next_mz == mz) { 3661 if (next_mz == mz)
2921 css_put(&next_mz->mem->css); 3662 css_put(&next_mz->mem->css);
2922 next_mz = NULL; 3663 else /* next_mz == NULL or other memcg */
2923 } else /* next_mz == NULL or other memcg */
2924 break; 3664 break;
2925 } while (1); 3665 } while (1);
2926 } 3666 }
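The new total_scanned argument only pays off together with its caller. The matching change on the mm/vmscan.c side (not shown in this hunk) is expected to look roughly like the following, feeding the soft-limit scan count back into the scan-control bookkeeping:

	nr_soft_scanned = 0;
	nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone, order,
						sc.gfp_mask, &nr_soft_scanned);
	sc.nr_reclaimed += nr_soft_reclaimed;
	sc.nr_scanned += nr_soft_scanned;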
@@ -2977,6 +3717,8 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
2977 loop += 256; 3717 loop += 256;
2978 busy = NULL; 3718 busy = NULL;
2979 while (loop--) { 3719 while (loop--) {
3720 struct page *page;
3721
2980 ret = 0; 3722 ret = 0;
2981 spin_lock_irqsave(&zone->lru_lock, flags); 3723 spin_lock_irqsave(&zone->lru_lock, flags);
2982 if (list_empty(list)) { 3724 if (list_empty(list)) {
@@ -2992,7 +3734,9 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
2992 } 3734 }
2993 spin_unlock_irqrestore(&zone->lru_lock, flags); 3735 spin_unlock_irqrestore(&zone->lru_lock, flags);
2994 3736
2995 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL); 3737 page = lookup_cgroup_page(pc);
3738
3739 ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL);
2996 if (ret == -ENOMEM) 3740 if (ret == -ENOMEM)
2997 break; 3741 break;
2998 3742
@@ -3038,6 +3782,7 @@ move_account:
3038 lru_add_drain_all(); 3782 lru_add_drain_all();
3039 drain_all_stock_sync(); 3783 drain_all_stock_sync();
3040 ret = 0; 3784 ret = 0;
3785 mem_cgroup_start_move(mem);
3041 for_each_node_state(node, N_HIGH_MEMORY) { 3786 for_each_node_state(node, N_HIGH_MEMORY) {
3042 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { 3787 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
3043 enum lru_list l; 3788 enum lru_list l;
@@ -3051,6 +3796,7 @@ move_account:
3051 if (ret) 3796 if (ret)
3052 break; 3797 break;
3053 } 3798 }
3799 mem_cgroup_end_move(mem);
3054 memcg_oom_recover(mem); 3800 memcg_oom_recover(mem);
3055 /* it seems parent cgroup doesn't have enough mem */ 3801 /* it seems parent cgroup doesn't have enough mem */
3056 if (ret == -ENOMEM) 3802 if (ret == -ENOMEM)
@@ -3137,33 +3883,25 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
3137 return retval; 3883 return retval;
3138} 3884}
3139 3885
3140struct mem_cgroup_idx_data {
3141 s64 val;
3142 enum mem_cgroup_stat_index idx;
3143};
3144 3886
3145static int 3887static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem,
3146mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) 3888 enum mem_cgroup_stat_index idx)
3147{ 3889{
3148 struct mem_cgroup_idx_data *d = data; 3890 struct mem_cgroup *iter;
3149 d->val += mem_cgroup_read_stat(mem, d->idx); 3891 long val = 0;
3150 return 0;
3151}
3152 3892
3153static void 3893 /* Per-cpu values can be negative, use a signed accumulator */
3154mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, 3894 for_each_mem_cgroup_tree(iter, mem)
3155 enum mem_cgroup_stat_index idx, s64 *val) 3895 val += mem_cgroup_read_stat(iter, idx);
3156{ 3896
3157 struct mem_cgroup_idx_data d; 3897 if (val < 0) /* race ? */
3158 d.idx = idx; 3898 val = 0;
3159 d.val = 0; 3899 return val;
3160 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
3161 *val = d.val;
3162} 3900}
3163 3901
3164static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) 3902static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3165{ 3903{
3166 u64 idx_val, val; 3904 u64 val;
3167 3905
3168 if (!mem_cgroup_is_root(mem)) { 3906 if (!mem_cgroup_is_root(mem)) {
3169 if (!swap) 3907 if (!swap)
@@ -3172,16 +3910,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
3172 return res_counter_read_u64(&mem->memsw, RES_USAGE); 3910 return res_counter_read_u64(&mem->memsw, RES_USAGE);
3173 } 3911 }
3174 3912
3175 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); 3913 val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE);
3176 val = idx_val; 3914 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS);
3177 mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
3178 val += idx_val;
3179 3915
3180 if (swap) { 3916 if (swap)
3181 mem_cgroup_get_recursive_idx_stat(mem, 3917 val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3182 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
3183 val += idx_val;
3184 }
3185 3918
3186 return val << PAGE_SHIFT; 3919 return val << PAGE_SHIFT;
3187} 3920}
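Since the recursive statistics are now kept in pages, the final shift converts pages to bytes. A quick worked example, assuming x86 with PAGE_SHIFT == 12 and invented counter values:

	/* cache = 300 pages, rss = 200 pages, swap accounting disabled */
	val = 300 + 200;		/* 500 pages */
	return val << PAGE_SHIFT;	/* 500 << 12 == 2048000 bytes */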
@@ -3359,6 +4092,8 @@ enum {
3359 MCS_PGPGIN, 4092 MCS_PGPGIN,
3360 MCS_PGPGOUT, 4093 MCS_PGPGOUT,
3361 MCS_SWAP, 4094 MCS_SWAP,
4095 MCS_PGFAULT,
4096 MCS_PGMAJFAULT,
3362 MCS_INACTIVE_ANON, 4097 MCS_INACTIVE_ANON,
3363 MCS_ACTIVE_ANON, 4098 MCS_ACTIVE_ANON,
3364 MCS_INACTIVE_FILE, 4099 MCS_INACTIVE_FILE,
@@ -3381,6 +4116,8 @@ struct {
3381 {"pgpgin", "total_pgpgin"}, 4116 {"pgpgin", "total_pgpgin"},
3382 {"pgpgout", "total_pgpgout"}, 4117 {"pgpgout", "total_pgpgout"},
3383 {"swap", "total_swap"}, 4118 {"swap", "total_swap"},
4119 {"pgfault", "total_pgfault"},
4120 {"pgmajfault", "total_pgmajfault"},
3384 {"inactive_anon", "total_inactive_anon"}, 4121 {"inactive_anon", "total_inactive_anon"},
3385 {"active_anon", "total_active_anon"}, 4122 {"active_anon", "total_active_anon"},
3386 {"inactive_file", "total_inactive_file"}, 4123 {"inactive_file", "total_inactive_file"},
@@ -3389,9 +4126,9 @@ struct {
3389}; 4126};
3390 4127
3391 4128
3392static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) 4129static void
4130mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3393{ 4131{
3394 struct mcs_total_stat *s = data;
3395 s64 val; 4132 s64 val;
3396 4133
3397 /* per cpu stat */ 4134 /* per cpu stat */
@@ -3401,14 +4138,18 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
3401 s->stat[MCS_RSS] += val * PAGE_SIZE; 4138 s->stat[MCS_RSS] += val * PAGE_SIZE;
3402 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); 4139 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
3403 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; 4140 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
3404 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); 4141 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN);
3405 s->stat[MCS_PGPGIN] += val; 4142 s->stat[MCS_PGPGIN] += val;
3406 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); 4143 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT);
3407 s->stat[MCS_PGPGOUT] += val; 4144 s->stat[MCS_PGPGOUT] += val;
3408 if (do_swap_account) { 4145 if (do_swap_account) {
3409 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); 4146 val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
3410 s->stat[MCS_SWAP] += val * PAGE_SIZE; 4147 s->stat[MCS_SWAP] += val * PAGE_SIZE;
3411 } 4148 }
4149 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT);
4150 s->stat[MCS_PGFAULT] += val;
4151 val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT);
4152 s->stat[MCS_PGMAJFAULT] += val;
3412 4153
3413 /* per zone stat */ 4154 /* per zone stat */
3414 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 4155 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -3421,15 +4162,62 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
3421 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; 4162 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
3422 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); 4163 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
3423 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; 4164 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
3424 return 0;
3425} 4165}
3426 4166
3427static void 4167static void
3428mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) 4168mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
3429{ 4169{
3430 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); 4170 struct mem_cgroup *iter;
4171
4172 for_each_mem_cgroup_tree(iter, mem)
4173 mem_cgroup_get_local_stat(iter, s);
3431} 4174}
3432 4175
4176#ifdef CONFIG_NUMA
4177static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4178{
4179 int nid;
4180 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4181 unsigned long node_nr;
4182 struct cgroup *cont = m->private;
4183 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
4184
4185 total_nr = mem_cgroup_nr_lru_pages(mem_cont);
4186 seq_printf(m, "total=%lu", total_nr);
4187 for_each_node_state(nid, N_HIGH_MEMORY) {
4188 node_nr = mem_cgroup_node_nr_lru_pages(mem_cont, nid);
4189 seq_printf(m, " N%d=%lu", nid, node_nr);
4190 }
4191 seq_putc(m, '\n');
4192
4193 file_nr = mem_cgroup_nr_file_lru_pages(mem_cont);
4194 seq_printf(m, "file=%lu", file_nr);
4195 for_each_node_state(nid, N_HIGH_MEMORY) {
4196 node_nr = mem_cgroup_node_nr_file_lru_pages(mem_cont, nid);
4197 seq_printf(m, " N%d=%lu", nid, node_nr);
4198 }
4199 seq_putc(m, '\n');
4200
4201 anon_nr = mem_cgroup_nr_anon_lru_pages(mem_cont);
4202 seq_printf(m, "anon=%lu", anon_nr);
4203 for_each_node_state(nid, N_HIGH_MEMORY) {
4204 node_nr = mem_cgroup_node_nr_anon_lru_pages(mem_cont, nid);
4205 seq_printf(m, " N%d=%lu", nid, node_nr);
4206 }
4207 seq_putc(m, '\n');
4208
4209 unevictable_nr = mem_cgroup_nr_unevictable_lru_pages(mem_cont);
4210 seq_printf(m, "unevictable=%lu", unevictable_nr);
4211 for_each_node_state(nid, N_HIGH_MEMORY) {
4212 node_nr = mem_cgroup_node_nr_unevictable_lru_pages(mem_cont,
4213 nid);
4214 seq_printf(m, " N%d=%lu", nid, node_nr);
4215 }
4216 seq_putc(m, '\n');
4217 return 0;
4218}
4219#endif /* CONFIG_NUMA */
4220
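Given the seq_printf() calls above, reading the new numa_stat file should yield output of the following shape on a two-node machine; all counts are invented and the cgroupfs mount point is an assumption:

	# cat /cgroup/memory/<group>/memory.numa_stat
	total=2048 N0=1024 N1=1024
	file=1536 N0=768 N1=768
	anon=512 N0=256 N1=256
	unevictable=0 N0=0 N1=0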
3433static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4221static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3434 struct cgroup_map_cb *cb) 4222 struct cgroup_map_cb *cb)
3435{ 4223{
@@ -3440,6 +4228,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
3440 memset(&mystat, 0, sizeof(mystat)); 4228 memset(&mystat, 0, sizeof(mystat));
3441 mem_cgroup_get_local_stat(mem_cont, &mystat); 4229 mem_cgroup_get_local_stat(mem_cont, &mystat);
3442 4230
4231
3443 for (i = 0; i < NR_MCS_STAT; i++) { 4232 for (i = 0; i < NR_MCS_STAT; i++) {
3444 if (i == MCS_SWAP && !do_swap_account) 4233 if (i == MCS_SWAP && !do_swap_account)
3445 continue; 4234 continue;
@@ -3525,9 +4314,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
3525 return -EINVAL; 4314 return -EINVAL;
3526 } 4315 }
3527 4316
3528 spin_lock(&memcg->reclaim_param_lock);
3529 memcg->swappiness = val; 4317 memcg->swappiness = val;
3530 spin_unlock(&memcg->reclaim_param_lock);
3531 4318
3532 cgroup_unlock(); 4319 cgroup_unlock();
3533 4320
@@ -3604,7 +4391,7 @@ static int compare_thresholds(const void *a, const void *b)
3604 return _a->threshold - _b->threshold; 4391 return _a->threshold - _b->threshold;
3605} 4392}
3606 4393
3607static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) 4394static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
3608{ 4395{
3609 struct mem_cgroup_eventfd_list *ev; 4396 struct mem_cgroup_eventfd_list *ev;
3610 4397
@@ -3615,7 +4402,10 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
3615 4402
3616static void mem_cgroup_oom_notify(struct mem_cgroup *mem) 4403static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
3617{ 4404{
3618 mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); 4405 struct mem_cgroup *iter;
4406
4407 for_each_mem_cgroup_tree(iter, mem)
4408 mem_cgroup_oom_notify_cb(iter);
3619} 4409}
3620 4410
3621static int mem_cgroup_usage_register_event(struct cgroup *cgrp, 4411static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
@@ -3862,6 +4652,22 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
3862 return 0; 4652 return 0;
3863} 4653}
3864 4654
4655#ifdef CONFIG_NUMA
4656static const struct file_operations mem_control_numa_stat_file_operations = {
4657 .read = seq_read,
4658 .llseek = seq_lseek,
4659 .release = single_release,
4660};
4661
4662static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4663{
4664 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
4665
4666 file->f_op = &mem_control_numa_stat_file_operations;
4667 return single_open(file, mem_control_numa_stat_show, cont);
4668}
4669#endif /* CONFIG_NUMA */
4670
3865static struct cftype mem_cgroup_files[] = { 4671static struct cftype mem_cgroup_files[] = {
3866 { 4672 {
3867 .name = "usage_in_bytes", 4673 .name = "usage_in_bytes",
@@ -3925,6 +4731,13 @@ static struct cftype mem_cgroup_files[] = {
3925 .unregister_event = mem_cgroup_oom_unregister_event, 4731 .unregister_event = mem_cgroup_oom_unregister_event,
3926 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 4732 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
3927 }, 4733 },
4734#ifdef CONFIG_NUMA
4735 {
4736 .name = "numa_stat",
4737 .open = mem_control_numa_stat_open,
4738 .mode = S_IRUGO,
4739 },
4740#endif
3928}; 4741};
3929 4742
3930#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4743#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -3986,13 +4799,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
3986 */ 4799 */
3987 if (!node_state(node, N_NORMAL_MEMORY)) 4800 if (!node_state(node, N_NORMAL_MEMORY))
3988 tmp = -1; 4801 tmp = -1;
3989 pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 4802 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
3990 if (!pn) 4803 if (!pn)
3991 return 1; 4804 return 1;
3992 4805
3993 mem->info.nodeinfo[node] = pn; 4806 mem->info.nodeinfo[node] = pn;
3994 memset(pn, 0, sizeof(*pn));
3995
3996 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4807 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3997 mz = &pn->zoneinfo[zone]; 4808 mz = &pn->zoneinfo[zone];
3998 for_each_lru(l) 4809 for_each_lru(l)
@@ -4016,23 +4827,25 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
4016 4827
4017 /* Can be very big if MAX_NUMNODES is very big */ 4828 /* Can be very big if MAX_NUMNODES is very big */
4018 if (size < PAGE_SIZE) 4829 if (size < PAGE_SIZE)
4019 mem = kmalloc(size, GFP_KERNEL); 4830 mem = kzalloc(size, GFP_KERNEL);
4020 else 4831 else
4021 mem = vmalloc(size); 4832 mem = vzalloc(size);
4022 4833
4023 if (!mem) 4834 if (!mem)
4024 return NULL; 4835 return NULL;
4025 4836
4026 memset(mem, 0, size);
4027 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 4837 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4028 if (!mem->stat) { 4838 if (!mem->stat)
4029 if (size < PAGE_SIZE) 4839 goto out_free;
4030 kfree(mem); 4840 spin_lock_init(&mem->pcp_counter_lock);
4031 else
4032 vfree(mem);
4033 mem = NULL;
4034 }
4035 return mem; 4841 return mem;
4842
4843out_free:
4844 if (size < PAGE_SIZE)
4845 kfree(mem);
4846 else
4847 vfree(mem);
4848 return NULL;
4036} 4849}
4037 4850
4038/* 4851/*
@@ -4158,7 +4971,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4158 &per_cpu(memcg_stock, cpu); 4971 &per_cpu(memcg_stock, cpu);
4159 INIT_WORK(&stock->work, drain_local_stock); 4972 INIT_WORK(&stock->work, drain_local_stock);
4160 } 4973 }
4161 hotcpu_notifier(memcg_stock_cpu_callback, 0); 4974 hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
4162 } else { 4975 } else {
4163 parent = mem_cgroup_from_cont(cont->parent); 4976 parent = mem_cgroup_from_cont(cont->parent);
4164 mem->use_hierarchy = parent->use_hierarchy; 4977 mem->use_hierarchy = parent->use_hierarchy;
@@ -4180,7 +4993,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
4180 res_counter_init(&mem->memsw, NULL); 4993 res_counter_init(&mem->memsw, NULL);
4181 } 4994 }
4182 mem->last_scanned_child = 0; 4995 mem->last_scanned_child = 0;
4183 spin_lock_init(&mem->reclaim_param_lock); 4996 mem->last_scanned_node = MAX_NUMNODES;
4184 INIT_LIST_HEAD(&mem->oom_notify); 4997 INIT_LIST_HEAD(&mem->oom_notify);
4185 4998
4186 if (parent) 4999 if (parent)
@@ -4268,7 +5081,7 @@ one_by_one:
4268 batch_count = PRECHARGE_COUNT_AT_ONCE; 5081 batch_count = PRECHARGE_COUNT_AT_ONCE;
4269 cond_resched(); 5082 cond_resched();
4270 } 5083 }
4271 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); 5084 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false);
4272 if (ret || !mem) 5085 if (ret || !mem)
4273 /* mem_cgroup_clear_mc() will do uncharge later */ 5086 /* mem_cgroup_clear_mc() will do uncharge later */
4274 return -ENOMEM; 5087 return -ENOMEM;
@@ -4430,6 +5243,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4430 pte_t *pte; 5243 pte_t *pte;
4431 spinlock_t *ptl; 5244 spinlock_t *ptl;
4432 5245
5246 split_huge_page_pmd(walk->mm, pmd);
5247
4433 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5248 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4434 for (; addr != end; pte++, addr += PAGE_SIZE) 5249 for (; addr != end; pte++, addr += PAGE_SIZE)
4435 if (is_target_pte_for_mc(vma, addr, *pte, NULL)) 5250 if (is_target_pte_for_mc(vma, addr, *pte, NULL))
@@ -4467,10 +5282,15 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4467 5282
4468static int mem_cgroup_precharge_mc(struct mm_struct *mm) 5283static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4469{ 5284{
4470 return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); 5285 unsigned long precharge = mem_cgroup_count_precharge(mm);
5286
5287 VM_BUG_ON(mc.moving_task);
5288 mc.moving_task = current;
5289 return mem_cgroup_do_precharge(precharge);
4471} 5290}
4472 5291
4473static void mem_cgroup_clear_mc(void) 5292/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
5293static void __mem_cgroup_clear_mc(void)
4474{ 5294{
4475 struct mem_cgroup *from = mc.from; 5295 struct mem_cgroup *from = mc.from;
4476 struct mem_cgroup *to = mc.to; 5296 struct mem_cgroup *to = mc.to;
@@ -4505,23 +5325,33 @@ static void mem_cgroup_clear_mc(void)
4505 PAGE_SIZE * mc.moved_swap); 5325 PAGE_SIZE * mc.moved_swap);
4506 } 5326 }
4507 /* we've already done mem_cgroup_get(mc.to) */ 5327 /* we've already done mem_cgroup_get(mc.to) */
4508
4509 mc.moved_swap = 0; 5328 mc.moved_swap = 0;
4510 } 5329 }
5330 memcg_oom_recover(from);
5331 memcg_oom_recover(to);
5332 wake_up_all(&mc.waitq);
5333}
5334
5335static void mem_cgroup_clear_mc(void)
5336{
5337 struct mem_cgroup *from = mc.from;
5338
5339 /*
5340 * we must clear moving_task before waking up waiters at the end of
5341 * task migration.
5342 */
5343 mc.moving_task = NULL;
5344 __mem_cgroup_clear_mc();
4511 spin_lock(&mc.lock); 5345 spin_lock(&mc.lock);
4512 mc.from = NULL; 5346 mc.from = NULL;
4513 mc.to = NULL; 5347 mc.to = NULL;
4514 mc.moving_task = NULL;
4515 spin_unlock(&mc.lock); 5348 spin_unlock(&mc.lock);
4516 memcg_oom_recover(from); 5349 mem_cgroup_end_move(from);
4517 memcg_oom_recover(to);
4518 wake_up_all(&mc.waitq);
4519} 5350}
4520 5351
4521static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5352static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4522 struct cgroup *cgroup, 5353 struct cgroup *cgroup,
4523 struct task_struct *p, 5354 struct task_struct *p)
4524 bool threadgroup)
4525{ 5355{
4526 int ret = 0; 5356 int ret = 0;
4527 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); 5357 struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup);
@@ -4542,15 +5372,12 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4542 VM_BUG_ON(mc.precharge); 5372 VM_BUG_ON(mc.precharge);
4543 VM_BUG_ON(mc.moved_charge); 5373 VM_BUG_ON(mc.moved_charge);
4544 VM_BUG_ON(mc.moved_swap); 5374 VM_BUG_ON(mc.moved_swap);
4545 VM_BUG_ON(mc.moving_task); 5375 mem_cgroup_start_move(from);
4546 spin_lock(&mc.lock); 5376 spin_lock(&mc.lock);
4547 mc.from = from; 5377 mc.from = from;
4548 mc.to = mem; 5378 mc.to = mem;
4549 mc.precharge = 0;
4550 mc.moved_charge = 0;
4551 mc.moved_swap = 0;
4552 mc.moving_task = current;
4553 spin_unlock(&mc.lock); 5379 spin_unlock(&mc.lock);
5380 /* We set mc.moving_task later */
4554 5381
4555 ret = mem_cgroup_precharge_mc(mm); 5382 ret = mem_cgroup_precharge_mc(mm);
4556 if (ret) 5383 if (ret)
@@ -4563,8 +5390,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4563 5390
4564static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5391static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4565 struct cgroup *cgroup, 5392 struct cgroup *cgroup,
4566 struct task_struct *p, 5393 struct task_struct *p)
4567 bool threadgroup)
4568{ 5394{
4569 mem_cgroup_clear_mc(); 5395 mem_cgroup_clear_mc();
4570} 5396}
@@ -4578,6 +5404,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4578 pte_t *pte; 5404 pte_t *pte;
4579 spinlock_t *ptl; 5405 spinlock_t *ptl;
4580 5406
5407 split_huge_page_pmd(walk->mm, pmd);
4581retry: 5408retry:
4582 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 5409 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4583 for (; addr != end; addr += PAGE_SIZE) { 5410 for (; addr != end; addr += PAGE_SIZE) {
@@ -4598,8 +5425,8 @@ retry:
4598 if (isolate_lru_page(page)) 5425 if (isolate_lru_page(page))
4599 goto put; 5426 goto put;
4600 pc = lookup_page_cgroup(page); 5427 pc = lookup_page_cgroup(page);
4601 if (!mem_cgroup_move_account(pc, 5428 if (!mem_cgroup_move_account(page, 1, pc,
4602 mc.from, mc.to, false)) { 5429 mc.from, mc.to, false)) {
4603 mc.precharge--; 5430 mc.precharge--;
4604 /* we uncharge from mc.from later. */ 5431 /* we uncharge from mc.from later. */
4605 mc.moved_charge++; 5432 mc.moved_charge++;
@@ -4644,7 +5471,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
4644 struct vm_area_struct *vma; 5471 struct vm_area_struct *vma;
4645 5472
4646 lru_add_drain_all(); 5473 lru_add_drain_all();
4647 down_read(&mm->mmap_sem); 5474retry:
5475 if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
5476 /*
5477 * Someone who is holding the mmap_sem might be waiting in
5478 * waitq. So we cancel all extra charges, wake up all waiters,
5479 * and retry. Because we cancel precharges, we might not be able
5480 * to move enough charges, but moving charge is a best-effort
5481 * feature anyway, so it wouldn't be a big problem.
5482 */
5483 __mem_cgroup_clear_mc();
5484 cond_resched();
5485 goto retry;
5486 }
4648 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5487 for (vma = mm->mmap; vma; vma = vma->vm_next) {
4649 int ret; 5488 int ret;
4650 struct mm_walk mem_cgroup_move_charge_walk = { 5489 struct mm_walk mem_cgroup_move_charge_walk = {
@@ -4669,41 +5508,35 @@ static void mem_cgroup_move_charge(struct mm_struct *mm)
4669static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5508static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4670 struct cgroup *cont, 5509 struct cgroup *cont,
4671 struct cgroup *old_cont, 5510 struct cgroup *old_cont,
4672 struct task_struct *p, 5511 struct task_struct *p)
4673 bool threadgroup)
4674{ 5512{
4675 struct mm_struct *mm; 5513 struct mm_struct *mm = get_task_mm(p);
4676 5514
4677 if (!mc.to)
4678 /* no need to move charge */
4679 return;
4680
4681 mm = get_task_mm(p);
4682 if (mm) { 5515 if (mm) {
4683 mem_cgroup_move_charge(mm); 5516 if (mc.to)
5517 mem_cgroup_move_charge(mm);
5518 put_swap_token(mm);
4684 mmput(mm); 5519 mmput(mm);
4685 } 5520 }
4686 mem_cgroup_clear_mc(); 5521 if (mc.to)
5522 mem_cgroup_clear_mc();
4687} 5523}
4688#else /* !CONFIG_MMU */ 5524#else /* !CONFIG_MMU */
4689static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5525static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4690 struct cgroup *cgroup, 5526 struct cgroup *cgroup,
4691 struct task_struct *p, 5527 struct task_struct *p)
4692 bool threadgroup)
4693{ 5528{
4694 return 0; 5529 return 0;
4695} 5530}
4696static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5531static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4697 struct cgroup *cgroup, 5532 struct cgroup *cgroup,
4698 struct task_struct *p, 5533 struct task_struct *p)
4699 bool threadgroup)
4700{ 5534{
4701} 5535}
4702static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5536static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4703 struct cgroup *cont, 5537 struct cgroup *cont,
4704 struct cgroup *old_cont, 5538 struct cgroup *old_cont,
4705 struct task_struct *p, 5539 struct task_struct *p)
4706 bool threadgroup)
4707{ 5540{
4708} 5541}
4709#endif 5542#endif
@@ -4723,11 +5556,15 @@ struct cgroup_subsys mem_cgroup_subsys = {
4723}; 5556};
4724 5557
4725#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5558#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4726 5559static int __init enable_swap_account(char *s)
4727static int __init disable_swap_account(char *s)
4728{ 5560{
4729 really_do_swap_account = 0; 5561 /* consider enabled if no parameter or 1 is given */
5562 if (!strcmp(s, "1"))
5563 really_do_swap_account = 1;
5564 else if (!strcmp(s, "0"))
5565 really_do_swap_account = 0;
4730 return 1; 5566 return 1;
4731} 5567}
4732__setup("noswapaccount", disable_swap_account); 5568__setup("swapaccount=", enable_swap_account);
5569
4733#endif 5570#endif
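With the old noswapaccount parameter gone, the strcmp() checks above mean swap accounting is now toggled from the kernel command line roughly as follows (illustrative):

	swapaccount=1	# force memsw accounting on
	swapaccount=0	# disable memsw accounting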
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 757f6b0accfe..740c4f52059c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -7,21 +7,26 @@
7 * Free Software Foundation. 7 * Free Software Foundation.
8 * 8 *
9 * High level machine check handler. Handles pages reported by the 9 * High level machine check handler. Handles pages reported by the
10 * hardware as being corrupted usually due to a 2bit ECC memory or cache 10 * hardware as being corrupted usually due to a multi-bit ECC memory or cache
11 * failure. 11 * failure.
12 *
13 * In addition there is a "soft offline" entry point that allows us to stop
14 * using suspicious, not-yet-corrupted pages without killing anything.
12 * 15 *
13 * Handles page cache pages in various states. The tricky part 16 * Handles page cache pages in various states. The tricky part
14 * here is that we can access any page asynchronous to other VM 17 * here is that we can access any page asynchronously with respect to
15 * users, because memory failures could happen anytime and anywhere, 18 * other VM users, because memory failures could happen anytime and
16 * possibly violating some of their assumptions. This is why this code 19 * anywhere. This could violate some of their assumptions. This is why
17 * has to be extremely careful. Generally it tries to use normal locking 20 * this code has to be extremely careful. Generally it tries to use
18 * rules, as in get the standard locks, even if that means the 21 * normal locking rules, as in get the standard locks, even if that means
19 * error handling takes potentially a long time. 22 * the error handling takes potentially a long time.
20 * 23 *
21 * The operation to map back from RMAP chains to processes has to walk 24 * There are several operations here with exponential complexity because
22 * the complete process list and has non linear complexity with the number 25 * of unsuitable VM data structures. For example the operation to map back
23 * mappings. In short it can be quite slow. But since memory corruptions 26 * from RMAP chains to processes has to walk the complete process list and
24 * are rare we hope to get away with this. 27 * has non linear complexity with the number of mappings. But since memory corruptions
28 * are rare we hope to get away with this. This avoids impacting the core
29 * VM.
25 */ 30 */
26 31
27/* 32/*
@@ -30,7 +35,6 @@
30 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages 35 * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
31 * - pass bad pages to kdump next kernel 36 * - pass bad pages to kdump next kernel
32 */ 37 */
33#define DEBUG 1 /* remove me in 2.6.34 */
34#include <linux/kernel.h> 38#include <linux/kernel.h>
35#include <linux/mm.h> 39#include <linux/mm.h>
36#include <linux/page-flags.h> 40#include <linux/page-flags.h>
@@ -47,6 +51,8 @@
47#include <linux/slab.h> 51#include <linux/slab.h>
48#include <linux/swapops.h> 52#include <linux/swapops.h>
49#include <linux/hugetlb.h> 53#include <linux/hugetlb.h>
54#include <linux/memory_hotplug.h>
55#include <linux/mm_inline.h>
50#include "internal.h" 56#include "internal.h"
51 57
52int sysctl_memory_failure_early_kill __read_mostly = 0; 58int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -78,7 +84,7 @@ static int hwpoison_filter_dev(struct page *p)
78 return 0; 84 return 0;
79 85
80 /* 86 /*
81 * page_mapping() does not accept slab page 87 * page_mapping() does not accept slab pages.
82 */ 88 */
83 if (PageSlab(p)) 89 if (PageSlab(p))
84 return -EINVAL; 90 return -EINVAL;
@@ -198,12 +204,12 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
198#ifdef __ARCH_SI_TRAPNO 204#ifdef __ARCH_SI_TRAPNO
199 si.si_trapno = trapno; 205 si.si_trapno = trapno;
200#endif 206#endif
201 si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; 207 si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
202 /* 208 /*
203 * Don't use force here, it's convenient if the signal 209 * Don't use force here, it's convenient if the signal
204 * can be temporarily blocked. 210 * can be temporarily blocked.
205 * This could cause a loop when the user sets SIGBUS 211 * This could cause a loop when the user sets SIGBUS
206 * to SIG_IGN, but hopefully noone will do that? 212 * to SIG_IGN, but hopefully no one will do that?
207 */ 213 */
208 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ 214 ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */
209 if (ret < 0) 215 if (ret < 0)
@@ -228,13 +234,17 @@ void shake_page(struct page *p, int access)
228 } 234 }
229 235
230 /* 236 /*
231 * Only all shrink_slab here (which would also 237 * Only call shrink_slab here (which would also shrink other caches) if
232 * shrink other caches) if access is not potentially fatal. 238 * access is not potentially fatal.
233 */ 239 */
234 if (access) { 240 if (access) {
235 int nr; 241 int nr;
236 do { 242 do {
237 nr = shrink_slab(1000, GFP_KERNEL, 1000); 243 struct shrink_control shrink = {
244 .gfp_mask = GFP_KERNEL,
245 };
246
247 nr = shrink_slab(&shrink, 1000, 1000);
238 if (page_count(p) == 1) 248 if (page_count(p) == 1)
239 break; 249 break;
240 } while (nr > 10); 250 } while (nr > 10);
@@ -268,7 +278,7 @@ struct to_kill {
268 struct list_head nd; 278 struct list_head nd;
269 struct task_struct *tsk; 279 struct task_struct *tsk;
270 unsigned long addr; 280 unsigned long addr;
271 unsigned addr_valid:1; 281 char addr_valid;
272}; 282};
273 283
274/* 284/*
@@ -309,7 +319,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
309 * a SIGKILL because the error is not contained anymore. 319 * a SIGKILL because the error is not contained anymore.
310 */ 320 */
311 if (tk->addr == -EFAULT) { 321 if (tk->addr == -EFAULT) {
312 pr_debug("MCE: Unable to find user space address %lx in %s\n", 322 pr_info("MCE: Unable to find user space address %lx in %s\n",
313 page_to_pfn(p), tsk->comm); 323 page_to_pfn(p), tsk->comm);
314 tk->addr_valid = 0; 324 tk->addr_valid = 0;
315 } 325 }
@@ -381,10 +391,11 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
381 struct task_struct *tsk; 391 struct task_struct *tsk;
382 struct anon_vma *av; 392 struct anon_vma *av;
383 393
384 read_lock(&tasklist_lock);
385 av = page_lock_anon_vma(page); 394 av = page_lock_anon_vma(page);
386 if (av == NULL) /* Not actually mapped anymore */ 395 if (av == NULL) /* Not actually mapped anymore */
387 goto out; 396 return;
397
398 read_lock(&tasklist_lock);
388 for_each_process (tsk) { 399 for_each_process (tsk) {
389 struct anon_vma_chain *vmac; 400 struct anon_vma_chain *vmac;
390 401
@@ -398,9 +409,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
398 add_to_kill(tsk, page, vma, to_kill, tkc); 409 add_to_kill(tsk, page, vma, to_kill, tkc);
399 } 410 }
400 } 411 }
401 page_unlock_anon_vma(av);
402out:
403 read_unlock(&tasklist_lock); 412 read_unlock(&tasklist_lock);
413 page_unlock_anon_vma(av);
404} 414}
405 415
406/* 416/*
@@ -414,17 +424,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
414 struct prio_tree_iter iter; 424 struct prio_tree_iter iter;
415 struct address_space *mapping = page->mapping; 425 struct address_space *mapping = page->mapping;
416 426
417 /* 427 mutex_lock(&mapping->i_mmap_mutex);
418 * A note on the locking order between the two locks.
419 * We don't rely on this particular order.
420 * If you have some other code that needs a different order
421 * feel free to switch them around. Or add a reverse link
422 * from mm_struct to task_struct, then this could be all
423 * done without taking tasklist_lock and looping over all tasks.
424 */
425
426 read_lock(&tasklist_lock); 428 read_lock(&tasklist_lock);
427 spin_lock(&mapping->i_mmap_lock);
428 for_each_process(tsk) { 429 for_each_process(tsk) {
429 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 430 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
430 431
@@ -444,8 +445,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
444 add_to_kill(tsk, page, vma, to_kill, tkc); 445 add_to_kill(tsk, page, vma, to_kill, tkc);
445 } 446 }
446 } 447 }
447 spin_unlock(&mapping->i_mmap_lock);
448 read_unlock(&tasklist_lock); 448 read_unlock(&tasklist_lock);
449 mutex_unlock(&mapping->i_mmap_mutex);
449} 450}
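The hunks above also settle the nesting between the mapping-level lock (now i_mmap_mutex here, or the anon_vma lock in collect_procs_anon()) and tasklist_lock: the mapping lock is taken first and released last. A condensed sketch of the resulting pattern, with the per-task work elided:

#include <linux/fs.h>
#include <linux/sched.h>

/* Sketch: walk every task while the mapping's vma tree is stabilized. */
static void walk_mappers_sketch(struct address_space *mapping)
{
	struct task_struct *tsk;

	mutex_lock(&mapping->i_mmap_mutex);	/* outer: pins the vma tree */
	read_lock(&tasklist_lock);		/* inner: pins the task list */
	for_each_process(tsk) {
		/* ... vma lookup and add_to_kill() as in the hunk ... */
	}
	read_unlock(&tasklist_lock);
	mutex_unlock(&mapping->i_mmap_mutex);
}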
450 451
451/* 452/*
@@ -577,7 +578,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
577 pfn, err); 578 pfn, err);
578 } else if (page_has_private(p) && 579 } else if (page_has_private(p) &&
579 !try_to_release_page(p, GFP_NOIO)) { 580 !try_to_release_page(p, GFP_NOIO)) {
580 pr_debug("MCE %#lx: failed to release buffers\n", pfn); 581 pr_info("MCE %#lx: failed to release buffers\n", pfn);
581 } else { 582 } else {
582 ret = RECOVERED; 583 ret = RECOVERED;
583 } 584 }
@@ -629,7 +630,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
629 * when the page is reread or dropped. If an 630 * when the page is reread or dropped. If an
630 * application assumes it will always get error on 631 * application assumes it will always get error on
631 * fsync, but does other operations on the fd before 632 * fsync, but does other operations on the fd before
632 * and the page is dropped inbetween then the error 633 * and the page is dropped between then the error
633 * will not be properly reported. 634 * will not be properly reported.
634 * 635 *
635 * This can already happen even without hwpoisoned 636 * This can already happen even without hwpoisoned
@@ -693,11 +694,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
693 * Issues: 694 * Issues:
694 * - Error on hugepage is contained in hugepage unit (not in raw page unit.) 695 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
695 * To narrow down kill region to one page, we need to break up pmd. 696 * To narrow down kill region to one page, we need to break up pmd.
696 * - To support soft-offlining for hugepage, we need to support hugepage
697 * migration.
698 */ 697 */
699static int me_huge_page(struct page *p, unsigned long pfn) 698static int me_huge_page(struct page *p, unsigned long pfn)
700{ 699{
700 int res = 0;
701 struct page *hpage = compound_head(p); 701 struct page *hpage = compound_head(p);
702 /* 702 /*
703 * We can safely recover from error on free or reserved (i.e. 703 * We can safely recover from error on free or reserved (i.e.
@@ -710,8 +710,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
710 * so there is no race between isolation and mapping/unmapping. 710 * so there is no race between isolation and mapping/unmapping.
711 */ 711 */
712 if (!(page_mapping(hpage) || PageAnon(hpage))) { 712 if (!(page_mapping(hpage) || PageAnon(hpage))) {
713 __isolate_hwpoisoned_huge_page(hpage); 713 res = dequeue_hwpoisoned_huge_page(hpage);
714 return RECOVERED; 714 if (!res)
715 return RECOVERED;
715 } 716 }
716 return DELAYED; 717 return DELAYED;
717} 718}
@@ -723,7 +724,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
723 * The table matches them in order and calls the right handler. 724 * The table matches them in order and calls the right handler.
724 * 725 *
725 * This is quite tricky because we can access page at any time 726 * This is quite tricky because we can access page at any time
726 * in its live cycle, so all accesses have to be extremly careful. 727 * in its live cycle, so all accesses have to be extremely careful.
727 * 728 *
728 * This is not complete. More states could be added. 729 * This is not complete. More states could be added.
729 * For any missing state don't attempt recovery. 730 * For any missing state don't attempt recovery.
@@ -836,8 +837,6 @@ static int page_action(struct page_state *ps, struct page *p,
836 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; 837 return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
837} 838}
838 839
839#define N_UNMAP_TRIES 5
840
841/* 840/*
842 * Do all that is necessary to remove user space mappings. Unmap 841 * Do all that is necessary to remove user space mappings. Unmap
843 * the pages and send SIGBUS to the processes if the data was dirty. 842 * the pages and send SIGBUS to the processes if the data was dirty.
@@ -849,9 +848,9 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
849 struct address_space *mapping; 848 struct address_space *mapping;
850 LIST_HEAD(tokill); 849 LIST_HEAD(tokill);
851 int ret; 850 int ret;
852 int i;
853 int kill = 1; 851 int kill = 1;
854 struct page *hpage = compound_head(p); 852 struct page *hpage = compound_head(p);
853 struct page *ppage;
855 854
856 if (PageReserved(p) || PageSlab(p)) 855 if (PageReserved(p) || PageSlab(p))
857 return SWAP_SUCCESS; 856 return SWAP_SUCCESS;
@@ -893,6 +892,44 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
893 } 892 }
894 893
895 /* 894 /*
 895 * ppage: the poisoned page we actually unmap.
 896 * If p is a regular (4k) page,
 897 * ppage == the real poisoned page;
 898 * otherwise p is hugetlb or THP, and ppage == the head page.
899 */
900 ppage = hpage;
901
902 if (PageTransHuge(hpage)) {
903 /*
 904 * Verify that this isn't a hugetlbfs head page; the check for
 905 * PageAnon is just to avoid tripping a split_huge_page
 906 * internal debug check, as split_huge_page refuses to deal with
 907 * anything that isn't an anon page. PageAnon can't go away from
 908 * under us because we hold a refcount on the hpage. Without a
 909 * refcount on the hpage, split_huge_page can't be safely called
 910 * in the first place, and having a refcount on a tail page isn't
 911 * enough to be safe.
912 */
913 if (!PageHuge(hpage) && PageAnon(hpage)) {
914 if (unlikely(split_huge_page(hpage))) {
915 /*
 916 * FIXME: if splitting the THP fails, it would be
 917 * better to stop the following operation rather
 918 * than cause a panic by unmapping. The system might
 919 * survive if the page is freed later.
920 */
921 printk(KERN_INFO
922 "MCE %#lx: failed to split THP\n", pfn);
923
924 BUG_ON(!PageHWPoison(p));
925 return SWAP_FAIL;
926 }
927 /* THP is split, so ppage should be the real poisoned page. */
928 ppage = p;
929 }
930 }
931
932 /*
896 * First collect all the processes that have the page 933 * First collect all the processes that have the page
897 * mapped in dirty form. This has to be done before try_to_unmap, 934 * mapped in dirty form. This has to be done before try_to_unmap,
898 * because ttu takes the rmap data structures down. 935 * because ttu takes the rmap data structures down.
@@ -901,22 +938,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
901 * there's nothing that can be done. 938 * there's nothing that can be done.
902 */ 939 */
903 if (kill) 940 if (kill)
904 collect_procs(hpage, &tokill); 941 collect_procs(ppage, &tokill);
905 942
906 /* 943 if (hpage != ppage)
907 * try_to_unmap can fail temporarily due to races. 944 lock_page(ppage);
908 * Try a few times (RED-PEN better strategy?)
909 */
910 for (i = 0; i < N_UNMAP_TRIES; i++) {
911 ret = try_to_unmap(hpage, ttu);
912 if (ret == SWAP_SUCCESS)
913 break;
914 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
915 }
916 945
946 ret = try_to_unmap(ppage, ttu);
917 if (ret != SWAP_SUCCESS) 947 if (ret != SWAP_SUCCESS)
918 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 948 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
919 pfn, page_mapcount(hpage)); 949 pfn, page_mapcount(ppage));
950
951 if (hpage != ppage)
952 unlock_page(ppage);
920 953
921 /* 954 /*
922 * Now that the dirty bit has been propagated to the 955 * Now that the dirty bit has been propagated to the
@@ -927,7 +960,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
927 * use a more force-full uncatchable kill to prevent 960 * use a more force-full uncatchable kill to prevent
928 * any accesses to the poisoned memory. 961 * any accesses to the poisoned memory.
929 */ 962 */
930 kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, 963 kill_procs_ao(&tokill, !!PageDirty(ppage), trapno,
931 ret != SWAP_SUCCESS, p, pfn); 964 ret != SWAP_SUCCESS, p, pfn);
932 965
933 return ret; 966 return ret;
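To restate the new transparent-hugepage handling above in one place: the page that is collected against and unmapped is normally the compound head, but an error inside an anonymous THP first splits the huge page so that only the affected 4k page stays poisoned. A sketch of that selection, with names borrowed from the hunk and error handling reduced to a NULL return:

#include <linux/mm.h>
#include <linux/hugetlb.h>

/* Sketch: choose the page hwpoison_user_mappings() should unmap. */
static struct page *pick_unmap_target(struct page *p)
{
	struct page *hpage = compound_head(p);

	if (PageTransHuge(hpage) && !PageHuge(hpage) && PageAnon(hpage)) {
		if (unlikely(split_huge_page(hpage)))
			return NULL;	/* split failed: caller gives up */
		return p;		/* split done: p is a plain 4k page */
	}
	return hpage;			/* 4k page or hugetlb head page */
}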
@@ -936,7 +969,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
936static void set_page_hwpoison_huge_page(struct page *hpage) 969static void set_page_hwpoison_huge_page(struct page *hpage)
937{ 970{
938 int i; 971 int i;
939 int nr_pages = 1 << compound_order(hpage); 972 int nr_pages = 1 << compound_trans_order(hpage);
940 for (i = 0; i < nr_pages; i++) 973 for (i = 0; i < nr_pages; i++)
941 SetPageHWPoison(hpage + i); 974 SetPageHWPoison(hpage + i);
942} 975}
@@ -944,7 +977,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage)
944static void clear_page_hwpoison_huge_page(struct page *hpage) 977static void clear_page_hwpoison_huge_page(struct page *hpage)
945{ 978{
946 int i; 979 int i;
947 int nr_pages = 1 << compound_order(hpage); 980 int nr_pages = 1 << compound_trans_order(hpage);
948 for (i = 0; i < nr_pages; i++) 981 for (i = 0; i < nr_pages; i++)
949 ClearPageHWPoison(hpage + i); 982 ClearPageHWPoison(hpage + i);
950} 983}
@@ -974,14 +1007,17 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
974 return 0; 1007 return 0;
975 } 1008 }
976 1009
977 nr_pages = 1 << compound_order(hpage); 1010 nr_pages = 1 << compound_trans_order(hpage);
978 atomic_long_add(nr_pages, &mce_bad_pages); 1011 atomic_long_add(nr_pages, &mce_bad_pages);
979 1012
980 /* 1013 /*
981 * We need/can do nothing about count=0 pages. 1014 * We need/can do nothing about count=0 pages.
982 * 1) it's a free page, and therefore in safe hand: 1015 * 1) it's a free page, and therefore in safe hand:
983 * prep_new_page() will be the gate keeper. 1016 * prep_new_page() will be the gate keeper.
984 * 2) it's part of a non-compound high order page. 1017 * 2) it's a free hugepage, which is also safe:
1018 * an affected hugepage will be dequeued from hugepage freelist,
1019 * so there's no concern about reusing it ever after.
1020 * 3) it's part of a non-compound high order page.
985 * Implies some kernel user: cannot stop them from 1021 * Implies some kernel user: cannot stop them from
986 * R/W the page; let's pray that the page has been 1022 * R/W the page; let's pray that the page has been
987 * used and will be freed some time later. 1023 * used and will be freed some time later.
@@ -993,6 +1029,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
993 if (is_free_buddy_page(p)) { 1029 if (is_free_buddy_page(p)) {
994 action_result(pfn, "free buddy", DELAYED); 1030 action_result(pfn, "free buddy", DELAYED);
995 return 0; 1031 return 0;
1032 } else if (PageHuge(hpage)) {
1033 /*
1034 * Check "just unpoisoned", "filter hit", and
1035 * "race with other subpage."
1036 */
1037 lock_page(hpage);
1038 if (!PageHWPoison(hpage)
1039 || (hwpoison_filter(p) && TestClearPageHWPoison(p))
1040 || (p != hpage && TestSetPageHWPoison(hpage))) {
1041 atomic_long_sub(nr_pages, &mce_bad_pages);
1042 return 0;
1043 }
1044 set_page_hwpoison_huge_page(hpage);
1045 res = dequeue_hwpoisoned_huge_page(hpage);
1046 action_result(pfn, "free huge",
1047 res ? IGNORED : DELAYED);
1048 unlock_page(hpage);
1049 return res;
996 } else { 1050 } else {
997 action_result(pfn, "high order kernel", IGNORED); 1051 action_result(pfn, "high order kernel", IGNORED);
998 return -EBUSY; 1052 return -EBUSY;
@@ -1007,19 +1061,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1007 * The check (unnecessarily) ignores LRU pages being isolated and 1061 * The check (unnecessarily) ignores LRU pages being isolated and
1008 * walked by the page reclaim code, however that's not a big loss. 1062 * walked by the page reclaim code, however that's not a big loss.
1009 */ 1063 */
1010 if (!PageLRU(p) && !PageHuge(p)) 1064 if (!PageHuge(p) && !PageTransCompound(p)) {
1011 shake_page(p, 0); 1065 if (!PageLRU(p))
1012 if (!PageLRU(p) && !PageHuge(p)) { 1066 shake_page(p, 0);
1013 /* 1067 if (!PageLRU(p)) {
1014 * shake_page could have turned it free. 1068 /*
1015 */ 1069 * shake_page could have turned it free.
1016 if (is_free_buddy_page(p)) { 1070 */
1017 action_result(pfn, "free buddy, 2nd try", DELAYED); 1071 if (is_free_buddy_page(p)) {
1018 return 0; 1072 action_result(pfn, "free buddy, 2nd try",
1073 DELAYED);
1074 return 0;
1075 }
1076 action_result(pfn, "non LRU", IGNORED);
1077 put_page(p);
1078 return -EBUSY;
1019 } 1079 }
1020 action_result(pfn, "non LRU", IGNORED);
1021 put_page(p);
1022 return -EBUSY;
1023 } 1080 }
1024 1081
1025 /* 1082 /*
@@ -1027,7 +1084,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1027 * It's very difficult to mess with pages currently under IO 1084 * It's very difficult to mess with pages currently under IO
1028 * and in many cases impossible, so we just avoid it here. 1085 * and in many cases impossible, so we just avoid it here.
1029 */ 1086 */
1030 lock_page_nosync(hpage); 1087 lock_page(hpage);
1031 1088
1032 /* 1089 /*
1033 * unpoison always clear PG_hwpoison inside page lock 1090 * unpoison always clear PG_hwpoison inside page lock
@@ -1049,7 +1106,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1049 * For error on the tail page, we should set PG_hwpoison 1106 * For error on the tail page, we should set PG_hwpoison
1050 * on the head page to show that the hugepage is hwpoisoned 1107 * on the head page to show that the hugepage is hwpoisoned
1051 */ 1108 */
1052 if (PageTail(p) && TestSetPageHWPoison(hpage)) { 1109 if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
1053 action_result(pfn, "hugepage already hardware poisoned", 1110 action_result(pfn, "hugepage already hardware poisoned",
1054 IGNORED); 1111 IGNORED);
1055 unlock_page(hpage); 1112 unlock_page(hpage);
@@ -1069,7 +1126,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1069 1126
1070 /* 1127 /*
1071 * Now take care of user space mappings. 1128 * Now take care of user space mappings.
1072 * Abort on fail: __remove_from_page_cache() assumes unmapped page. 1129 * Abort on fail: __delete_from_page_cache() assumes unmapped page.
1073 */ 1130 */
1074 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { 1131 if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
1075 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); 1132 printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
@@ -1147,20 +1204,30 @@ int unpoison_memory(unsigned long pfn)
1147 page = compound_head(p); 1204 page = compound_head(p);
1148 1205
1149 if (!PageHWPoison(p)) { 1206 if (!PageHWPoison(p)) {
1150 pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); 1207 pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
1151 return 0; 1208 return 0;
1152 } 1209 }
1153 1210
1154 nr_pages = 1 << compound_order(page); 1211 nr_pages = 1 << compound_trans_order(page);
1155 1212
1156 if (!get_page_unless_zero(page)) { 1213 if (!get_page_unless_zero(page)) {
1214 /*
 1215 * Since a hwpoisoned hugepage should have a non-zero refcount,
 1216 * failing to get one here means memory failure is racing with
 1217 * unpoison. In that case unpoison fails and memory failure runs
 1218 * to the end.
1219 */
1220 if (PageHuge(page)) {
1221 pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
1222 return 0;
1223 }
1157 if (TestClearPageHWPoison(p)) 1224 if (TestClearPageHWPoison(p))
1158 atomic_long_sub(nr_pages, &mce_bad_pages); 1225 atomic_long_sub(nr_pages, &mce_bad_pages);
1159 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); 1226 pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
1160 return 0; 1227 return 0;
1161 } 1228 }
1162 1229
1163 lock_page_nosync(page); 1230 lock_page(page);
1164 /* 1231 /*
1165 * This test is racy because PG_hwpoison is set outside of page lock. 1232 * This test is racy because PG_hwpoison is set outside of page lock.
1166 * That's acceptable because that won't trigger kernel panic. Instead, 1233 * That's acceptable because that won't trigger kernel panic. Instead,
@@ -1168,12 +1235,12 @@ int unpoison_memory(unsigned long pfn)
1168 * the free buddy page pool. 1235 * the free buddy page pool.
1169 */ 1236 */
1170 if (TestClearPageHWPoison(page)) { 1237 if (TestClearPageHWPoison(page)) {
1171 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); 1238 pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
1172 atomic_long_sub(nr_pages, &mce_bad_pages); 1239 atomic_long_sub(nr_pages, &mce_bad_pages);
1173 freeit = 1; 1240 freeit = 1;
1241 if (PageHuge(page))
1242 clear_page_hwpoison_huge_page(page);
1174 } 1243 }
1175 if (PageHuge(p))
1176 clear_page_hwpoison_huge_page(page);
1177 unlock_page(page); 1244 unlock_page(page);
1178 1245
1179 put_page(page); 1246 put_page(page);
@@ -1187,7 +1254,11 @@ EXPORT_SYMBOL(unpoison_memory);
1187static struct page *new_page(struct page *p, unsigned long private, int **x) 1254static struct page *new_page(struct page *p, unsigned long private, int **x)
1188{ 1255{
1189 int nid = page_to_nid(p); 1256 int nid = page_to_nid(p);
1190 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); 1257 if (PageHuge(p))
1258 return alloc_huge_page_node(page_hstate(compound_head(p)),
1259 nid);
1260 else
1261 return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
1191} 1262}
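new_page() is the allocation callback the migration core invokes for every page it moves; soft_offline_huge_page() below and the LRU path further down both pass it in. A minimal sketch of that contract for a single already-isolated base page (the helper is hypothetical and error handling is trimmed):

#include <linux/migrate.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>

/* Sketch: move one isolated LRU page away from its current page frame. */
static int migrate_one_sketch(struct page *page)
{
	LIST_HEAD(pagelist);
	int ret;

	list_add(&page->lru, &pagelist);
	/* new_page() above supplies the destination; sync=true as in the hunk */
	ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, true);
	if (ret)
		putback_lru_pages(&pagelist);	/* failed: return pages to the LRU */
	return ret;
}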
1192 1263
1193/* 1264/*
@@ -1204,25 +1275,31 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1204 return 1; 1275 return 1;
1205 1276
1206 /* 1277 /*
1207 * The lock_system_sleep prevents a race with memory hotplug, 1278 * The lock_memory_hotplug prevents a race with memory hotplug.
1208 * because the isolation assumes there's only a single user.
 1209 * This is a big hammer; something finer-grained would be nicer. 1279 * This is a big hammer; something finer-grained would be nicer.
1210 */ 1280 */
1211 lock_system_sleep(); 1281 lock_memory_hotplug();
1212 1282
1213 /* 1283 /*
1214 * Isolate the page, so that it doesn't get reallocated if it 1284 * Isolate the page, so that it doesn't get reallocated if it
1215 * was free. 1285 * was free.
1216 */ 1286 */
1217 set_migratetype_isolate(p); 1287 set_migratetype_isolate(p);
1288 /*
1289 * When the target page is a free hugepage, just remove it
1290 * from free hugepage list.
1291 */
1218 if (!get_page_unless_zero(compound_head(p))) { 1292 if (!get_page_unless_zero(compound_head(p))) {
1219 if (is_free_buddy_page(p)) { 1293 if (PageHuge(p)) {
1220 pr_debug("get_any_page: %#lx free buddy page\n", pfn); 1294 pr_info("get_any_page: %#lx free huge page\n", pfn);
1295 ret = dequeue_hwpoisoned_huge_page(compound_head(p));
1296 } else if (is_free_buddy_page(p)) {
1297 pr_info("get_any_page: %#lx free buddy page\n", pfn);
1221 /* Set hwpoison bit while page is still isolated */ 1298 /* Set hwpoison bit while page is still isolated */
1222 SetPageHWPoison(p); 1299 SetPageHWPoison(p);
1223 ret = 0; 1300 ret = 0;
1224 } else { 1301 } else {
1225 pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", 1302 pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
1226 pfn, p->flags); 1303 pfn, p->flags);
1227 ret = -EIO; 1304 ret = -EIO;
1228 } 1305 }
@@ -1231,7 +1308,51 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
1231 ret = 1; 1308 ret = 1;
1232 } 1309 }
1233 unset_migratetype_isolate(p); 1310 unset_migratetype_isolate(p);
1234 unlock_system_sleep(); 1311 unlock_memory_hotplug();
1312 return ret;
1313}
1314
1315static int soft_offline_huge_page(struct page *page, int flags)
1316{
1317 int ret;
1318 unsigned long pfn = page_to_pfn(page);
1319 struct page *hpage = compound_head(page);
1320 LIST_HEAD(pagelist);
1321
1322 ret = get_any_page(page, pfn, flags);
1323 if (ret < 0)
1324 return ret;
1325 if (ret == 0)
1326 goto done;
1327
1328 if (PageHWPoison(hpage)) {
1329 put_page(hpage);
1330 pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
1331 return -EBUSY;
1332 }
1333
1334 /* Keep page count to indicate a given hugepage is isolated. */
1335
1336 list_add(&hpage->lru, &pagelist);
1337 ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0,
1338 true);
1339 if (ret) {
1340 struct page *page1, *page2;
1341 list_for_each_entry_safe(page1, page2, &pagelist, lru)
1342 put_page(page1);
1343
1344 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
1345 pfn, ret, page->flags);
1346 if (ret > 0)
1347 ret = -EIO;
1348 return ret;
1349 }
1350done:
1351 if (!PageHWPoison(hpage))
1352 atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages);
1353 set_page_hwpoison_huge_page(hpage);
1354 dequeue_hwpoisoned_huge_page(hpage);
1355 /* keep elevated page count for bad page */
1235 return ret; 1356 return ret;
1236} 1357}
1237 1358
@@ -1262,6 +1383,9 @@ int soft_offline_page(struct page *page, int flags)
1262 int ret; 1383 int ret;
1263 unsigned long pfn = page_to_pfn(page); 1384 unsigned long pfn = page_to_pfn(page);
1264 1385
1386 if (PageHuge(page))
1387 return soft_offline_huge_page(page, flags);
1388
1265 ret = get_any_page(page, pfn, flags); 1389 ret = get_any_page(page, pfn, flags);
1266 if (ret < 0) 1390 if (ret < 0)
1267 return ret; 1391 return ret;
@@ -1288,7 +1412,7 @@ int soft_offline_page(struct page *page, int flags)
1288 goto done; 1412 goto done;
1289 } 1413 }
1290 if (!PageLRU(page)) { 1414 if (!PageLRU(page)) {
1291 pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", 1415 pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
1292 pfn, page->flags); 1416 pfn, page->flags);
1293 return -EIO; 1417 return -EIO;
1294 } 1418 }
@@ -1302,7 +1426,7 @@ int soft_offline_page(struct page *page, int flags)
1302 if (PageHWPoison(page)) { 1426 if (PageHWPoison(page)) {
1303 unlock_page(page); 1427 unlock_page(page);
1304 put_page(page); 1428 put_page(page);
1305 pr_debug("soft offline: %#lx page already poisoned\n", pfn); 1429 pr_info("soft offline: %#lx page already poisoned\n", pfn);
1306 return -EBUSY; 1430 return -EBUSY;
1307 } 1431 }
1308 1432
@@ -1312,18 +1436,14 @@ int soft_offline_page(struct page *page, int flags)
1312 */ 1436 */
1313 ret = invalidate_inode_page(page); 1437 ret = invalidate_inode_page(page);
1314 unlock_page(page); 1438 unlock_page(page);
1315
1316 /* 1439 /*
1317 * Drop count because page migration doesn't like raised
1318 * counts. The page could get re-allocated, but if it becomes
1319 * LRU the isolation will just fail.
1320 * RED-PEN would be better to keep it isolated here, but we 1440 * RED-PEN would be better to keep it isolated here, but we
1321 * would need to fix isolation locking first. 1441 * would need to fix isolation locking first.
1322 */ 1442 */
1323 put_page(page);
1324 if (ret == 1) { 1443 if (ret == 1) {
1444 put_page(page);
1325 ret = 0; 1445 ret = 0;
1326 pr_debug("soft_offline: %#lx: invalidated\n", pfn); 1446 pr_info("soft_offline: %#lx: invalidated\n", pfn);
1327 goto done; 1447 goto done;
1328 } 1448 }
1329 1449
@@ -1333,19 +1453,27 @@ int soft_offline_page(struct page *page, int flags)
1333 * handles a large number of cases for us. 1453 * handles a large number of cases for us.
1334 */ 1454 */
1335 ret = isolate_lru_page(page); 1455 ret = isolate_lru_page(page);
1456 /*
 1457 * Drop the page reference taken by get_any_page();
 1458 * a successful isolate_lru_page() already took another one.
1459 */
1460 put_page(page);
1336 if (!ret) { 1461 if (!ret) {
1337 LIST_HEAD(pagelist); 1462 LIST_HEAD(pagelist);
1338 1463 inc_zone_page_state(page, NR_ISOLATED_ANON +
1464 page_is_file_cache(page));
1339 list_add(&page->lru, &pagelist); 1465 list_add(&page->lru, &pagelist);
1340 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); 1466 ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
1467 0, true);
1341 if (ret) { 1468 if (ret) {
1342 pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", 1469 putback_lru_pages(&pagelist);
1470 pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
1343 pfn, ret, page->flags); 1471 pfn, ret, page->flags);
1344 if (ret > 0) 1472 if (ret > 0)
1345 ret = -EIO; 1473 ret = -EIO;
1346 } 1474 }
1347 } else { 1475 } else {
1348 pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", 1476 pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
1349 pfn, ret, page_count(page), page->flags); 1477 pfn, ret, page_count(page), page->flags);
1350 } 1478 }
1351 if (ret) 1479 if (ret)
@@ -1357,35 +1485,3 @@ done:
1357 /* keep elevated page count for bad page */ 1485 /* keep elevated page count for bad page */
1358 return ret; 1486 return ret;
1359} 1487}
1360
1361/*
1362 * The caller must hold current->mm->mmap_sem in read mode.
1363 */
1364int is_hwpoison_address(unsigned long addr)
1365{
1366 pgd_t *pgdp;
1367 pud_t pud, *pudp;
1368 pmd_t pmd, *pmdp;
1369 pte_t pte, *ptep;
1370 swp_entry_t entry;
1371
1372 pgdp = pgd_offset(current->mm, addr);
1373 if (!pgd_present(*pgdp))
1374 return 0;
1375 pudp = pud_offset(pgdp, addr);
1376 pud = *pudp;
1377 if (!pud_present(pud) || pud_large(pud))
1378 return 0;
1379 pmdp = pmd_offset(pudp, addr);
1380 pmd = *pmdp;
1381 if (!pmd_present(pmd) || pmd_large(pmd))
1382 return 0;
1383 ptep = pte_offset_map(pmdp, addr);
1384 pte = *ptep;
1385 pte_unmap(ptep);
1386 if (!is_swap_pte(pte))
1387 return 0;
1388 entry = pte_to_swp_entry(pte);
1389 return is_hwpoison_entry(entry);
1390}
1391EXPORT_SYMBOL_GPL(is_hwpoison_address);
diff --git a/mm/memory.c b/mm/memory.c
index 0e18b4d649ec..9b8a01d941cb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -182,7 +182,7 @@ void sync_mm_rss(struct task_struct *task, struct mm_struct *mm)
182{ 182{
183 __sync_task_rss_stat(task, mm); 183 __sync_task_rss_stat(task, mm);
184} 184}
185#else 185#else /* SPLIT_RSS_COUNTING */
186 186
187#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) 187#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member)
188#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) 188#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
@@ -191,8 +191,206 @@ static void check_sync_rss_stat(struct task_struct *task)
191{ 191{
192} 192}
193 193
194#endif /* SPLIT_RSS_COUNTING */
195
196#ifdef HAVE_GENERIC_MMU_GATHER
197
198static int tlb_next_batch(struct mmu_gather *tlb)
199{
200 struct mmu_gather_batch *batch;
201
202 batch = tlb->active;
203 if (batch->next) {
204 tlb->active = batch->next;
205 return 1;
206 }
207
208 batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
209 if (!batch)
210 return 0;
211
212 batch->next = NULL;
213 batch->nr = 0;
214 batch->max = MAX_GATHER_BATCH;
215
216 tlb->active->next = batch;
217 tlb->active = batch;
218
219 return 1;
220}
221
222/* tlb_gather_mmu
223 * Called to initialize an (on-stack) mmu_gather structure for page-table
224 * tear-down from @mm. The @fullmm argument is used when @mm is without
225 * users and we're going to destroy the full address space (exit/execve).
226 */
227void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm)
228{
229 tlb->mm = mm;
230
231 tlb->fullmm = fullmm;
232 tlb->need_flush = 0;
233 tlb->fast_mode = (num_possible_cpus() == 1);
234 tlb->local.next = NULL;
235 tlb->local.nr = 0;
236 tlb->local.max = ARRAY_SIZE(tlb->__pages);
237 tlb->active = &tlb->local;
238
239#ifdef CONFIG_HAVE_RCU_TABLE_FREE
240 tlb->batch = NULL;
241#endif
242}
243
244void tlb_flush_mmu(struct mmu_gather *tlb)
245{
246 struct mmu_gather_batch *batch;
247
248 if (!tlb->need_flush)
249 return;
250 tlb->need_flush = 0;
251 tlb_flush(tlb);
252#ifdef CONFIG_HAVE_RCU_TABLE_FREE
253 tlb_table_flush(tlb);
194#endif 254#endif
195 255
256 if (tlb_fast_mode(tlb))
257 return;
258
259 for (batch = &tlb->local; batch; batch = batch->next) {
260 free_pages_and_swap_cache(batch->pages, batch->nr);
261 batch->nr = 0;
262 }
263 tlb->active = &tlb->local;
264}
265
266/* tlb_finish_mmu
267 * Called at the end of the shootdown operation to free up any resources
268 * that were required.
269 */
270void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end)
271{
272 struct mmu_gather_batch *batch, *next;
273
274 tlb_flush_mmu(tlb);
275
276 /* keep the page table cache within bounds */
277 check_pgt_cache();
278
279 for (batch = tlb->local.next; batch; batch = next) {
280 next = batch->next;
281 free_pages((unsigned long)batch, 0);
282 }
283 tlb->local.next = NULL;
284}
285
286/* __tlb_remove_page
287 * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
288 * handling the additional races in SMP caused by other CPUs caching valid
289 * mappings in their TLBs. Returns the number of free page slots left.
290 * When out of page slots we must call tlb_flush_mmu().
291 */
292int __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
293{
294 struct mmu_gather_batch *batch;
295
296 tlb->need_flush = 1;
297
298 if (tlb_fast_mode(tlb)) {
299 free_page_and_swap_cache(page);
300 return 1; /* avoid calling tlb_flush_mmu() */
301 }
302
303 batch = tlb->active;
304 batch->pages[batch->nr++] = page;
305 if (batch->nr == batch->max) {
306 if (!tlb_next_batch(tlb))
307 return 0;
308 batch = tlb->active;
309 }
310 VM_BUG_ON(batch->nr > batch->max);
311
312 return batch->max - batch->nr;
313}
314
315#endif /* HAVE_GENERIC_MMU_GATHER */
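Putting the pieces above together, the generic gather is used in three steps: initialize an on-stack mmu_gather, feed it each page whose PTE has been cleared, and finish, which performs the final TLB flush and frees the batched pages. A minimal caller sketch under that assumption (the helper and its page array are hypothetical; real callers walk page tables as zap_pte_range() does below):

#include <linux/mm.h>
#include <asm/tlb.h>

/* Sketch: release a set of pages whose PTEs the caller already cleared. */
static void teardown_sketch(struct mm_struct *mm, struct page **pages, int nr,
			    unsigned long start, unsigned long end)
{
	struct mmu_gather tlb;
	int i;

	tlb_gather_mmu(&tlb, mm, 0);		/* 0: not a full-mm teardown */
	for (i = 0; i < nr; i++) {
		/* returns 0 once the batch is full: flush before continuing */
		if (!__tlb_remove_page(&tlb, pages[i]))
			tlb_flush_mmu(&tlb);
	}
	tlb_finish_mmu(&tlb, start, end);	/* final flush + frees the batches */
}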
316
317#ifdef CONFIG_HAVE_RCU_TABLE_FREE
318
319/*
320 * See the comment near struct mmu_table_batch.
321 */
322
323static void tlb_remove_table_smp_sync(void *arg)
324{
325 /* Simply deliver the interrupt */
326}
327
328static void tlb_remove_table_one(void *table)
329{
330 /*
331 * This isn't an RCU grace period and hence the page-tables cannot be
332 * assumed to be actually RCU-freed.
333 *
334 * It is however sufficient for software page-table walkers that rely on
335 * IRQ disabling. See the comment near struct mmu_table_batch.
336 */
337 smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
338 __tlb_remove_table(table);
339}
340
341static void tlb_remove_table_rcu(struct rcu_head *head)
342{
343 struct mmu_table_batch *batch;
344 int i;
345
346 batch = container_of(head, struct mmu_table_batch, rcu);
347
348 for (i = 0; i < batch->nr; i++)
349 __tlb_remove_table(batch->tables[i]);
350
351 free_page((unsigned long)batch);
352}
353
354void tlb_table_flush(struct mmu_gather *tlb)
355{
356 struct mmu_table_batch **batch = &tlb->batch;
357
358 if (*batch) {
359 call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
360 *batch = NULL;
361 }
362}
363
364void tlb_remove_table(struct mmu_gather *tlb, void *table)
365{
366 struct mmu_table_batch **batch = &tlb->batch;
367
368 tlb->need_flush = 1;
369
370 /*
 371 * When there are fewer than two users of this mm, there cannot be a
 372 * concurrent page-table walk.
373 */
374 if (atomic_read(&tlb->mm->mm_users) < 2) {
375 __tlb_remove_table(table);
376 return;
377 }
378
379 if (*batch == NULL) {
380 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
381 if (*batch == NULL) {
382 tlb_remove_table_one(table);
383 return;
384 }
385 (*batch)->nr = 0;
386 }
387 (*batch)->tables[(*batch)->nr++] = table;
388 if ((*batch)->nr == MAX_TABLE_BATCH)
389 tlb_table_flush(tlb);
390}
391
392#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
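The comments above lean on one invariant: a software page-table walker running with interrupts disabled cannot see a table freed under it, because tlb_remove_table() either waits for an RCU-sched grace period or, in the fallback path, IPIs every CPU before freeing. A sketch of such a walker, loosely modeled on the lockless gup_fast() style; it assumes the mm is live and the page-table helpers behave as on x86-like configurations:

#include <linux/mm.h>
#include <asm/pgtable.h>

/* Sketch: peek at a PTE without mmap_sem or the page table lock. */
static int pte_present_lockless(struct mm_struct *mm, unsigned long addr)
{
	unsigned long flags;
	int present = 0;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	local_irq_save(flags);	/* blocks the IPI / marks an RCU-sched read side */
	pgd = pgd_offset(mm, addr);
	if (!pgd_present(*pgd))
		goto out;
	pud = pud_offset(pgd, addr);
	if (!pud_present(*pud))
		goto out;
	pmd = pmd_offset(pud, addr);
	if (!pmd_present(*pmd) || pmd_trans_huge(*pmd))
		goto out;		/* huge pmd: there is no pte table to walk */
	pte = pte_offset_map(pmd, addr);
	present = pte_present(*pte);
	pte_unmap(pte);
out:
	local_irq_restore(flags);
	return present;
}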
393
196/* 394/*
197 * If a p?d_bad entry is found while walking page tables, report 395 * If a p?d_bad entry is found while walking page tables, report
198 * the error, before resetting entry to p?d_none. Usually (but 396 * the error, before resetting entry to p?d_none. Usually (but
@@ -394,9 +592,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
394 } 592 }
395} 593}
396 594
397int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) 595int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
596 pmd_t *pmd, unsigned long address)
398{ 597{
399 pgtable_t new = pte_alloc_one(mm, address); 598 pgtable_t new = pte_alloc_one(mm, address);
599 int wait_split_huge_page;
400 if (!new) 600 if (!new)
401 return -ENOMEM; 601 return -ENOMEM;
402 602
@@ -416,14 +616,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
416 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ 616 smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
417 617
418 spin_lock(&mm->page_table_lock); 618 spin_lock(&mm->page_table_lock);
419 if (!pmd_present(*pmd)) { /* Has another populated it ? */ 619 wait_split_huge_page = 0;
620 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
420 mm->nr_ptes++; 621 mm->nr_ptes++;
421 pmd_populate(mm, pmd, new); 622 pmd_populate(mm, pmd, new);
422 new = NULL; 623 new = NULL;
423 } 624 } else if (unlikely(pmd_trans_splitting(*pmd)))
625 wait_split_huge_page = 1;
424 spin_unlock(&mm->page_table_lock); 626 spin_unlock(&mm->page_table_lock);
425 if (new) 627 if (new)
426 pte_free(mm, new); 628 pte_free(mm, new);
629 if (wait_split_huge_page)
630 wait_split_huge_page(vma->anon_vma, pmd);
427 return 0; 631 return 0;
428} 632}
429 633
@@ -436,10 +640,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
436 smp_wmb(); /* See comment in __pte_alloc */ 640 smp_wmb(); /* See comment in __pte_alloc */
437 641
438 spin_lock(&init_mm.page_table_lock); 642 spin_lock(&init_mm.page_table_lock);
439 if (!pmd_present(*pmd)) { /* Has another populated it ? */ 643 if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
440 pmd_populate_kernel(&init_mm, pmd, new); 644 pmd_populate_kernel(&init_mm, pmd, new);
441 new = NULL; 645 new = NULL;
442 } 646 } else
647 VM_BUG_ON(pmd_trans_splitting(*pmd));
443 spin_unlock(&init_mm.page_table_lock); 648 spin_unlock(&init_mm.page_table_lock);
444 if (new) 649 if (new)
445 pte_free_kernel(&init_mm, new); 650 pte_free_kernel(&init_mm, new);
@@ -526,7 +731,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
526 add_taint(TAINT_BAD_PAGE); 731 add_taint(TAINT_BAD_PAGE);
527} 732}
528 733
529static inline int is_cow_mapping(unsigned int flags) 734static inline int is_cow_mapping(vm_flags_t flags)
530{ 735{
531 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 736 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
532} 737}
@@ -719,9 +924,9 @@ out_set_pte:
719 return 0; 924 return 0;
720} 925}
721 926
722static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, 927int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
723 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, 928 pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
724 unsigned long addr, unsigned long end) 929 unsigned long addr, unsigned long end)
725{ 930{
726 pte_t *orig_src_pte, *orig_dst_pte; 931 pte_t *orig_src_pte, *orig_dst_pte;
727 pte_t *src_pte, *dst_pte; 932 pte_t *src_pte, *dst_pte;
@@ -736,7 +941,7 @@ again:
736 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); 941 dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
737 if (!dst_pte) 942 if (!dst_pte)
738 return -ENOMEM; 943 return -ENOMEM;
739 src_pte = pte_offset_map_nested(src_pmd, addr); 944 src_pte = pte_offset_map(src_pmd, addr);
740 src_ptl = pte_lockptr(src_mm, src_pmd); 945 src_ptl = pte_lockptr(src_mm, src_pmd);
741 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); 946 spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
742 orig_src_pte = src_pte; 947 orig_src_pte = src_pte;
@@ -767,7 +972,7 @@ again:
767 972
768 arch_leave_lazy_mmu_mode(); 973 arch_leave_lazy_mmu_mode();
769 spin_unlock(src_ptl); 974 spin_unlock(src_ptl);
770 pte_unmap_nested(orig_src_pte); 975 pte_unmap(orig_src_pte);
771 add_mm_rss_vec(dst_mm, rss); 976 add_mm_rss_vec(dst_mm, rss);
772 pte_unmap_unlock(orig_dst_pte, dst_ptl); 977 pte_unmap_unlock(orig_dst_pte, dst_ptl);
773 cond_resched(); 978 cond_resched();
@@ -795,6 +1000,17 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
795 src_pmd = pmd_offset(src_pud, addr); 1000 src_pmd = pmd_offset(src_pud, addr);
796 do { 1001 do {
797 next = pmd_addr_end(addr, end); 1002 next = pmd_addr_end(addr, end);
1003 if (pmd_trans_huge(*src_pmd)) {
1004 int err;
1005 VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
1006 err = copy_huge_pmd(dst_mm, src_mm,
1007 dst_pmd, src_pmd, addr, vma);
1008 if (err == -ENOMEM)
1009 return -ENOMEM;
1010 if (!err)
1011 continue;
1012 /* fall through */
1013 }
798 if (pmd_none_or_clear_bad(src_pmd)) 1014 if (pmd_none_or_clear_bad(src_pmd))
799 continue; 1015 continue;
800 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, 1016 if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
@@ -891,26 +1107,26 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
891static unsigned long zap_pte_range(struct mmu_gather *tlb, 1107static unsigned long zap_pte_range(struct mmu_gather *tlb,
892 struct vm_area_struct *vma, pmd_t *pmd, 1108 struct vm_area_struct *vma, pmd_t *pmd,
893 unsigned long addr, unsigned long end, 1109 unsigned long addr, unsigned long end,
894 long *zap_work, struct zap_details *details) 1110 struct zap_details *details)
895{ 1111{
896 struct mm_struct *mm = tlb->mm; 1112 struct mm_struct *mm = tlb->mm;
897 pte_t *pte; 1113 int force_flush = 0;
898 spinlock_t *ptl;
899 int rss[NR_MM_COUNTERS]; 1114 int rss[NR_MM_COUNTERS];
1115 spinlock_t *ptl;
1116 pte_t *start_pte;
1117 pte_t *pte;
900 1118
1119again:
901 init_rss_vec(rss); 1120 init_rss_vec(rss);
902 1121 start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
903 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 1122 pte = start_pte;
904 arch_enter_lazy_mmu_mode(); 1123 arch_enter_lazy_mmu_mode();
905 do { 1124 do {
906 pte_t ptent = *pte; 1125 pte_t ptent = *pte;
907 if (pte_none(ptent)) { 1126 if (pte_none(ptent)) {
908 (*zap_work)--;
909 continue; 1127 continue;
910 } 1128 }
911 1129
912 (*zap_work) -= PAGE_SIZE;
913
914 if (pte_present(ptent)) { 1130 if (pte_present(ptent)) {
915 struct page *page; 1131 struct page *page;
916 1132
@@ -956,7 +1172,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
956 page_remove_rmap(page); 1172 page_remove_rmap(page);
957 if (unlikely(page_mapcount(page) < 0)) 1173 if (unlikely(page_mapcount(page) < 0))
958 print_bad_pte(vma, addr, ptent, page); 1174 print_bad_pte(vma, addr, ptent, page);
959 tlb_remove_page(tlb, page); 1175 force_flush = !__tlb_remove_page(tlb, page);
1176 if (force_flush)
1177 break;
960 continue; 1178 continue;
961 } 1179 }
962 /* 1180 /*
@@ -977,11 +1195,23 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
977 print_bad_pte(vma, addr, ptent, NULL); 1195 print_bad_pte(vma, addr, ptent, NULL);
978 } 1196 }
979 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 1197 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
980 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); 1198 } while (pte++, addr += PAGE_SIZE, addr != end);
981 1199
982 add_mm_rss_vec(mm, rss); 1200 add_mm_rss_vec(mm, rss);
983 arch_leave_lazy_mmu_mode(); 1201 arch_leave_lazy_mmu_mode();
984 pte_unmap_unlock(pte - 1, ptl); 1202 pte_unmap_unlock(start_pte, ptl);
1203
1204 /*
 1205 * mmu_gather ran out of room to batch pages; we break out of
 1206 * the PTE lock to avoid doing the potentially expensive TLB invalidate
 1207 * and page-free while holding it.
1208 */
1209 if (force_flush) {
1210 force_flush = 0;
1211 tlb_flush_mmu(tlb);
1212 if (addr != end)
1213 goto again;
1214 }
985 1215
986 return addr; 1216 return addr;
987} 1217}
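The force_flush handling above is the other half of the __tlb_remove_page() contract: a zero return means the gather is full, so the loop stops, drops the PTE lock, and only then pays for the TLB invalidate and page freeing. Reduced to a skeleton; page_for() is a hypothetical stand-in for "the page this PTE mapped":

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <asm/tlb.h>

/* Sketch: batch under the PTE lock, do the expensive flush after dropping it. */
static void zap_batching_sketch(struct mmu_gather *tlb, spinlock_t *ptl,
				unsigned long addr, unsigned long end)
{
	int force_flush;

again:
	force_flush = 0;
	spin_lock(ptl);
	for (; addr != end; addr += PAGE_SIZE) {
		struct page *page = page_for(addr);	/* hypothetical helper */

		if (!__tlb_remove_page(tlb, page)) {
			force_flush = 1;		/* batch full, page queued */
			addr += PAGE_SIZE;
			break;
		}
	}
	spin_unlock(ptl);

	if (force_flush) {
		tlb_flush_mmu(tlb);	/* the expensive part, done unlocked */
		if (addr != end)
			goto again;	/* resume where the batch filled up */
	}
}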
@@ -989,7 +1219,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
989static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, 1219static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
990 struct vm_area_struct *vma, pud_t *pud, 1220 struct vm_area_struct *vma, pud_t *pud,
991 unsigned long addr, unsigned long end, 1221 unsigned long addr, unsigned long end,
992 long *zap_work, struct zap_details *details) 1222 struct zap_details *details)
993{ 1223{
994 pmd_t *pmd; 1224 pmd_t *pmd;
995 unsigned long next; 1225 unsigned long next;
@@ -997,13 +1227,19 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
997 pmd = pmd_offset(pud, addr); 1227 pmd = pmd_offset(pud, addr);
998 do { 1228 do {
999 next = pmd_addr_end(addr, end); 1229 next = pmd_addr_end(addr, end);
1000 if (pmd_none_or_clear_bad(pmd)) { 1230 if (pmd_trans_huge(*pmd)) {
1001 (*zap_work)--; 1231 if (next-addr != HPAGE_PMD_SIZE) {
1002 continue; 1232 VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
1233 split_huge_page_pmd(vma->vm_mm, pmd);
1234 } else if (zap_huge_pmd(tlb, vma, pmd))
1235 continue;
1236 /* fall through */
1003 } 1237 }
1004 next = zap_pte_range(tlb, vma, pmd, addr, next, 1238 if (pmd_none_or_clear_bad(pmd))
1005 zap_work, details); 1239 continue;
1006 } while (pmd++, addr = next, (addr != end && *zap_work > 0)); 1240 next = zap_pte_range(tlb, vma, pmd, addr, next, details);
1241 cond_resched();
1242 } while (pmd++, addr = next, addr != end);
1007 1243
1008 return addr; 1244 return addr;
1009} 1245}
@@ -1011,7 +1247,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
1011static inline unsigned long zap_pud_range(struct mmu_gather *tlb, 1247static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1012 struct vm_area_struct *vma, pgd_t *pgd, 1248 struct vm_area_struct *vma, pgd_t *pgd,
1013 unsigned long addr, unsigned long end, 1249 unsigned long addr, unsigned long end,
1014 long *zap_work, struct zap_details *details) 1250 struct zap_details *details)
1015{ 1251{
1016 pud_t *pud; 1252 pud_t *pud;
1017 unsigned long next; 1253 unsigned long next;
@@ -1019,13 +1255,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1019 pud = pud_offset(pgd, addr); 1255 pud = pud_offset(pgd, addr);
1020 do { 1256 do {
1021 next = pud_addr_end(addr, end); 1257 next = pud_addr_end(addr, end);
1022 if (pud_none_or_clear_bad(pud)) { 1258 if (pud_none_or_clear_bad(pud))
1023 (*zap_work)--;
1024 continue; 1259 continue;
1025 } 1260 next = zap_pmd_range(tlb, vma, pud, addr, next, details);
1026 next = zap_pmd_range(tlb, vma, pud, addr, next, 1261 } while (pud++, addr = next, addr != end);
1027 zap_work, details);
1028 } while (pud++, addr = next, (addr != end && *zap_work > 0));
1029 1262
1030 return addr; 1263 return addr;
1031} 1264}
@@ -1033,7 +1266,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
1033static unsigned long unmap_page_range(struct mmu_gather *tlb, 1266static unsigned long unmap_page_range(struct mmu_gather *tlb,
1034 struct vm_area_struct *vma, 1267 struct vm_area_struct *vma,
1035 unsigned long addr, unsigned long end, 1268 unsigned long addr, unsigned long end,
1036 long *zap_work, struct zap_details *details) 1269 struct zap_details *details)
1037{ 1270{
1038 pgd_t *pgd; 1271 pgd_t *pgd;
1039 unsigned long next; 1272 unsigned long next;
@@ -1047,13 +1280,10 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1047 pgd = pgd_offset(vma->vm_mm, addr); 1280 pgd = pgd_offset(vma->vm_mm, addr);
1048 do { 1281 do {
1049 next = pgd_addr_end(addr, end); 1282 next = pgd_addr_end(addr, end);
1050 if (pgd_none_or_clear_bad(pgd)) { 1283 if (pgd_none_or_clear_bad(pgd))
1051 (*zap_work)--;
1052 continue; 1284 continue;
1053 } 1285 next = zap_pud_range(tlb, vma, pgd, addr, next, details);
1054 next = zap_pud_range(tlb, vma, pgd, addr, next, 1286 } while (pgd++, addr = next, addr != end);
1055 zap_work, details);
1056 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
1057 tlb_end_vma(tlb, vma); 1287 tlb_end_vma(tlb, vma);
1058 mem_cgroup_uncharge_end(); 1288 mem_cgroup_uncharge_end();
1059 1289
@@ -1069,7 +1299,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1069 1299
1070/** 1300/**
1071 * unmap_vmas - unmap a range of memory covered by a list of vma's 1301 * unmap_vmas - unmap a range of memory covered by a list of vma's
1072 * @tlbp: address of the caller's struct mmu_gather 1302 * @tlb: address of the caller's struct mmu_gather
1073 * @vma: the starting vma 1303 * @vma: the starting vma
1074 * @start_addr: virtual address at which to start unmapping 1304 * @start_addr: virtual address at which to start unmapping
1075 * @end_addr: virtual address at which to end unmapping 1305 * @end_addr: virtual address at which to end unmapping
@@ -1093,17 +1323,12 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
1093 * ensure that any thus-far unmapped pages are flushed before unmap_vmas() 1323 * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
1094 * drops the lock and schedules. 1324 * drops the lock and schedules.
1095 */ 1325 */
1096unsigned long unmap_vmas(struct mmu_gather **tlbp, 1326unsigned long unmap_vmas(struct mmu_gather *tlb,
1097 struct vm_area_struct *vma, unsigned long start_addr, 1327 struct vm_area_struct *vma, unsigned long start_addr,
1098 unsigned long end_addr, unsigned long *nr_accounted, 1328 unsigned long end_addr, unsigned long *nr_accounted,
1099 struct zap_details *details) 1329 struct zap_details *details)
1100{ 1330{
1101 long zap_work = ZAP_BLOCK_SIZE;
1102 unsigned long tlb_start = 0; /* For tlb_finish_mmu */
1103 int tlb_start_valid = 0;
1104 unsigned long start = start_addr; 1331 unsigned long start = start_addr;
1105 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
1106 int fullmm = (*tlbp)->fullmm;
1107 struct mm_struct *mm = vma->vm_mm; 1332 struct mm_struct *mm = vma->vm_mm;
1108 1333
1109 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); 1334 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
@@ -1124,11 +1349,6 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
1124 untrack_pfn_vma(vma, 0, 0); 1349 untrack_pfn_vma(vma, 0, 0);
1125 1350
1126 while (start != end) { 1351 while (start != end) {
1127 if (!tlb_start_valid) {
1128 tlb_start = start;
1129 tlb_start_valid = 1;
1130 }
1131
1132 if (unlikely(is_vm_hugetlb_page(vma))) { 1352 if (unlikely(is_vm_hugetlb_page(vma))) {
1133 /* 1353 /*
1134 * It is undesirable to test vma->vm_file as it 1354 * It is undesirable to test vma->vm_file as it
@@ -1141,39 +1361,15 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
1141 * Since no pte has actually been setup, it is 1361 * Since no pte has actually been setup, it is
1142 * safe to do nothing in this case. 1362 * safe to do nothing in this case.
1143 */ 1363 */
1144 if (vma->vm_file) { 1364 if (vma->vm_file)
1145 unmap_hugepage_range(vma, start, end, NULL); 1365 unmap_hugepage_range(vma, start, end, NULL);
1146 zap_work -= (end - start) /
1147 pages_per_huge_page(hstate_vma(vma));
1148 }
1149 1366
1150 start = end; 1367 start = end;
1151 } else 1368 } else
1152 start = unmap_page_range(*tlbp, vma, 1369 start = unmap_page_range(tlb, vma, start, end, details);
1153 start, end, &zap_work, details);
1154
1155 if (zap_work > 0) {
1156 BUG_ON(start != end);
1157 break;
1158 }
1159
1160 tlb_finish_mmu(*tlbp, tlb_start, start);
1161
1162 if (need_resched() ||
1163 (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
1164 if (i_mmap_lock) {
1165 *tlbp = NULL;
1166 goto out;
1167 }
1168 cond_resched();
1169 }
1170
1171 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
1172 tlb_start_valid = 0;
1173 zap_work = ZAP_BLOCK_SIZE;
1174 } 1370 }
1175 } 1371 }
1176out: 1372
1177 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); 1373 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
1178 return start; /* which is now the end (or restart) address */ 1374 return start; /* which is now the end (or restart) address */
1179} 1375}
@@ -1189,16 +1385,15 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
1189 unsigned long size, struct zap_details *details) 1385 unsigned long size, struct zap_details *details)
1190{ 1386{
1191 struct mm_struct *mm = vma->vm_mm; 1387 struct mm_struct *mm = vma->vm_mm;
1192 struct mmu_gather *tlb; 1388 struct mmu_gather tlb;
1193 unsigned long end = address + size; 1389 unsigned long end = address + size;
1194 unsigned long nr_accounted = 0; 1390 unsigned long nr_accounted = 0;
1195 1391
1196 lru_add_drain(); 1392 lru_add_drain();
1197 tlb = tlb_gather_mmu(mm, 0); 1393 tlb_gather_mmu(&tlb, mm, 0);
1198 update_hiwater_rss(mm); 1394 update_hiwater_rss(mm);
1199 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); 1395 end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details);
1200 if (tlb) 1396 tlb_finish_mmu(&tlb, address, end);
1201 tlb_finish_mmu(tlb, address, end);
1202 return end; 1397 return end;
1203} 1398}
1204 1399
@@ -1262,7 +1457,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1262 pud = pud_offset(pgd, address); 1457 pud = pud_offset(pgd, address);
1263 if (pud_none(*pud)) 1458 if (pud_none(*pud))
1264 goto no_page_table; 1459 goto no_page_table;
1265 if (pud_huge(*pud)) { 1460 if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
1266 BUG_ON(flags & FOLL_GET); 1461 BUG_ON(flags & FOLL_GET);
1267 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); 1462 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1268 goto out; 1463 goto out;
@@ -1273,11 +1468,32 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1273 pmd = pmd_offset(pud, address); 1468 pmd = pmd_offset(pud, address);
1274 if (pmd_none(*pmd)) 1469 if (pmd_none(*pmd))
1275 goto no_page_table; 1470 goto no_page_table;
1276 if (pmd_huge(*pmd)) { 1471 if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
1277 BUG_ON(flags & FOLL_GET); 1472 BUG_ON(flags & FOLL_GET);
1278 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1473 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
1279 goto out; 1474 goto out;
1280 } 1475 }
1476 if (pmd_trans_huge(*pmd)) {
1477 if (flags & FOLL_SPLIT) {
1478 split_huge_page_pmd(mm, pmd);
1479 goto split_fallthrough;
1480 }
1481 spin_lock(&mm->page_table_lock);
1482 if (likely(pmd_trans_huge(*pmd))) {
1483 if (unlikely(pmd_trans_splitting(*pmd))) {
1484 spin_unlock(&mm->page_table_lock);
1485 wait_split_huge_page(vma->anon_vma, pmd);
1486 } else {
1487 page = follow_trans_huge_pmd(mm, address,
1488 pmd, flags);
1489 spin_unlock(&mm->page_table_lock);
1490 goto out;
1491 }
1492 } else
1493 spin_unlock(&mm->page_table_lock);
1494 /* fall through */
1495 }
1496split_fallthrough:
1281 if (unlikely(pmd_bad(*pmd))) 1497 if (unlikely(pmd_bad(*pmd)))
1282 goto no_page_table; 1498 goto no_page_table;
1283 1499
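With the branch above, follow_page() has three outcomes for a huge pmd: split it when the caller passed FOLL_SPLIT, wait for a concurrent split to finish, or return the page via follow_trans_huge_pmd(). A caller-side sketch for code that only wants base pages; the wrapper is hypothetical, and FOLL_SPLIT, FOLL_GET and FOLL_WRITE are the flags consulted in the hunk:

#include <linux/mm.h>

/* Sketch: look up a 4k page, forcing any THP covering it to be split.
 * Caller must hold mm->mmap_sem. */
static struct page *follow_small_page(struct vm_area_struct *vma,
				      unsigned long addr, int write)
{
	unsigned int flags = FOLL_GET | FOLL_SPLIT;

	if (write)
		flags |= FOLL_WRITE;
	/* returns a referenced 4k page, or NULL if nothing is mapped there */
	return follow_page(vma, addr, flags);
}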
@@ -1310,6 +1526,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1310 */ 1526 */
1311 mark_page_accessed(page); 1527 mark_page_accessed(page);
1312 } 1528 }
1529 if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
1530 /*
1531 * The preliminary mapping check is mainly to avoid the
1532 * pointless overhead of lock_page on the ZERO_PAGE
1533 * which might bounce very badly if there is contention.
1534 *
1535 * If the page is already locked, we don't need to
1536 * handle it now - vmscan will handle it later if and
1537 * when it attempts to reclaim the page.
1538 */
1539 if (page->mapping && trylock_page(page)) {
1540 lru_add_drain(); /* push cached pages to LRU */
1541 /*
1542 * Because we lock page here and migration is
1543 * blocked by the pte's page reference, we need
1544 * only check for file-cache page truncation.
1545 */
1546 if (page->mapping)
1547 mlock_vma_page(page);
1548 unlock_page(page);
1549 }
1550 }
1313unlock: 1551unlock:
1314 pte_unmap_unlock(ptep, ptl); 1552 pte_unmap_unlock(ptep, ptl);
1315out: 1553out:
@@ -1339,9 +1577,65 @@ no_page_table:
1339 return page; 1577 return page;
1340} 1578}
1341 1579
1580static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
1581{
1582 return stack_guard_page_start(vma, addr) ||
1583 stack_guard_page_end(vma, addr+PAGE_SIZE);
1584}
1585
1586/**
1587 * __get_user_pages() - pin user pages in memory
1588 * @tsk: task_struct of target task
1589 * @mm: mm_struct of target mm
1590 * @start: starting user address
1591 * @nr_pages: number of pages from start to pin
1592 * @gup_flags: flags modifying pin behaviour
1593 * @pages: array that receives pointers to the pages pinned.
1594 * Should be at least nr_pages long. Or NULL, if caller
1595 * only intends to ensure the pages are faulted in.
1596 * @vmas: array of pointers to vmas corresponding to each page.
1597 * Or NULL if the caller does not require them.
1598 * @nonblocking: whether waiting for disk IO or mmap_sem contention
1599 *
1600 * Returns number of pages pinned. This may be fewer than the number
1601 * requested. If nr_pages is 0 or negative, returns 0. If no pages
1602 * were pinned, returns -errno. Each page returned must be released
1603 * with a put_page() call when it is finished with. vmas will only
1604 * remain valid while mmap_sem is held.
1605 *
1606 * Must be called with mmap_sem held for read or write.
1607 *
1608 * __get_user_pages walks a process's page tables and takes a reference to
1609 * each struct page that each user address corresponds to at a given
1610 * instant. That is, it takes the page that would be accessed if a user
1611 * thread accesses the given user virtual address at that instant.
1612 *
1613 * This does not guarantee that the page exists in the user mappings when
1614 * __get_user_pages returns, and there may even be a completely different
 1615 * page there in some cases (e.g. if mmapped pagecache has been
 1616 * invalidated and subsequently re-faulted). However, it does guarantee that the page
 1617 * won't be freed completely. Most callers simply care that the page
1618 * contains data that was valid *at some point in time*. Typically, an IO
1619 * or similar operation cannot guarantee anything stronger anyway because
1620 * locks can't be held over the syscall boundary.
1621 *
1622 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
1623 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
1624 * appropriate) must be called after the page is finished with, and
1625 * before put_page is called.
1626 *
1627 * If @nonblocking != NULL, __get_user_pages will not wait for disk IO
1628 * or mmap_sem contention, and if waiting is needed to pin all pages,
1629 * *@nonblocking will be set to 0.
1630 *
1631 * In most cases, get_user_pages or get_user_pages_fast should be used
1632 * instead of __get_user_pages. __get_user_pages should be used only if
1633 * you need some special @gup_flags.
1634 */
1342int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1635int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1343 unsigned long start, int nr_pages, unsigned int gup_flags, 1636 unsigned long start, int nr_pages, unsigned int gup_flags,
1344 struct page **pages, struct vm_area_struct **vmas) 1637 struct page **pages, struct vm_area_struct **vmas,
1638 int *nonblocking)
1345{ 1639{
1346 int i; 1640 int i;
1347 unsigned long vm_flags; 1641 unsigned long vm_flags;
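The kernel-doc above spells out the pinning contract; the sketch below follows it using the public get_user_pages() wrapper documented right after this hunk: take mmap_sem, pin, use the pages, mark them dirty if written, then drop every reference. The helper itself is illustrative, not part of the patch:

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/slab.h>

/* Sketch: pin nr_pages starting at a page-aligned user address and dirty them. */
static int pin_and_touch_sketch(struct task_struct *tsk, struct mm_struct *mm,
				unsigned long start, int nr_pages)
{
	struct page **pages;
	int i, got;

	pages = kmalloc(nr_pages * sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	down_read(&mm->mmap_sem);
	got = get_user_pages(tsk, mm, start, nr_pages, 1 /* write */,
			     0 /* force */, pages, NULL);
	up_read(&mm->mmap_sem);

	for (i = 0; i < got; i++) {
		/* ... kmap() and write into the page here ... */
		set_page_dirty_lock(pages[i]);	/* we wrote to it */
		put_page(pages[i]);		/* drop the pin */
	}
	kfree(pages);
	return got < 0 ? got : 0;
}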
@@ -1365,9 +1659,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1365 struct vm_area_struct *vma; 1659 struct vm_area_struct *vma;
1366 1660
1367 vma = find_extend_vma(mm, start); 1661 vma = find_extend_vma(mm, start);
1368 if (!vma && in_gate_area(tsk, start)) { 1662 if (!vma && in_gate_area(mm, start)) {
1369 unsigned long pg = start & PAGE_MASK; 1663 unsigned long pg = start & PAGE_MASK;
1370 struct vm_area_struct *gate_vma = get_gate_vma(tsk);
1371 pgd_t *pgd; 1664 pgd_t *pgd;
1372 pud_t *pud; 1665 pud_t *pud;
1373 pmd_t *pmd; 1666 pmd_t *pmd;
@@ -1386,15 +1679,17 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1386 pmd = pmd_offset(pud, pg); 1679 pmd = pmd_offset(pud, pg);
1387 if (pmd_none(*pmd)) 1680 if (pmd_none(*pmd))
1388 return i ? : -EFAULT; 1681 return i ? : -EFAULT;
1682 VM_BUG_ON(pmd_trans_huge(*pmd));
1389 pte = pte_offset_map(pmd, pg); 1683 pte = pte_offset_map(pmd, pg);
1390 if (pte_none(*pte)) { 1684 if (pte_none(*pte)) {
1391 pte_unmap(pte); 1685 pte_unmap(pte);
1392 return i ? : -EFAULT; 1686 return i ? : -EFAULT;
1393 } 1687 }
1688 vma = get_gate_vma(mm);
1394 if (pages) { 1689 if (pages) {
1395 struct page *page; 1690 struct page *page;
1396 1691
1397 page = vm_normal_page(gate_vma, start, *pte); 1692 page = vm_normal_page(vma, start, *pte);
1398 if (!page) { 1693 if (!page) {
1399 if (!(gup_flags & FOLL_DUMP) && 1694 if (!(gup_flags & FOLL_DUMP) &&
1400 is_zero_pfn(pte_pfn(*pte))) 1695 is_zero_pfn(pte_pfn(*pte)))
@@ -1408,12 +1703,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1408 get_page(page); 1703 get_page(page);
1409 } 1704 }
1410 pte_unmap(pte); 1705 pte_unmap(pte);
1411 if (vmas) 1706 goto next_page;
1412 vmas[i] = gate_vma;
1413 i++;
1414 start += PAGE_SIZE;
1415 nr_pages--;
1416 continue;
1417 } 1707 }
1418 1708
1419 if (!vma || 1709 if (!vma ||
@@ -1441,23 +1731,52 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1441 cond_resched(); 1731 cond_resched();
1442 while (!(page = follow_page(vma, start, foll_flags))) { 1732 while (!(page = follow_page(vma, start, foll_flags))) {
1443 int ret; 1733 int ret;
1734 unsigned int fault_flags = 0;
1735
1736 /* For mlock, just skip the stack guard page. */
1737 if (foll_flags & FOLL_MLOCK) {
1738 if (stack_guard_page(vma, start))
1739 goto next_page;
1740 }
1741 if (foll_flags & FOLL_WRITE)
1742 fault_flags |= FAULT_FLAG_WRITE;
1743 if (nonblocking)
1744 fault_flags |= FAULT_FLAG_ALLOW_RETRY;
1745 if (foll_flags & FOLL_NOWAIT)
1746 fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT);
1444 1747
1445 ret = handle_mm_fault(mm, vma, start, 1748 ret = handle_mm_fault(mm, vma, start,
1446 (foll_flags & FOLL_WRITE) ? 1749 fault_flags);
1447 FAULT_FLAG_WRITE : 0);
1448 1750
1449 if (ret & VM_FAULT_ERROR) { 1751 if (ret & VM_FAULT_ERROR) {
1450 if (ret & VM_FAULT_OOM) 1752 if (ret & VM_FAULT_OOM)
1451 return i ? i : -ENOMEM; 1753 return i ? i : -ENOMEM;
1452 if (ret & 1754 if (ret & (VM_FAULT_HWPOISON |
1453 (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) 1755 VM_FAULT_HWPOISON_LARGE)) {
1756 if (i)
1757 return i;
1758 else if (gup_flags & FOLL_HWPOISON)
1759 return -EHWPOISON;
1760 else
1761 return -EFAULT;
1762 }
1763 if (ret & VM_FAULT_SIGBUS)
1454 return i ? i : -EFAULT; 1764 return i ? i : -EFAULT;
1455 BUG(); 1765 BUG();
1456 } 1766 }
1457 if (ret & VM_FAULT_MAJOR) 1767
1458 tsk->maj_flt++; 1768 if (tsk) {
1459 else 1769 if (ret & VM_FAULT_MAJOR)
1460 tsk->min_flt++; 1770 tsk->maj_flt++;
1771 else
1772 tsk->min_flt++;
1773 }
1774
1775 if (ret & VM_FAULT_RETRY) {
1776 if (nonblocking)
1777 *nonblocking = 0;
1778 return i;
1779 }
1461 1780
1462 /* 1781 /*
1463 * The VM_FAULT_WRITE bit tells us that 1782 * The VM_FAULT_WRITE bit tells us that
@@ -1485,6 +1804,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1485 flush_anon_page(vma, page, start); 1804 flush_anon_page(vma, page, start);
1486 flush_dcache_page(page); 1805 flush_dcache_page(page);
1487 } 1806 }
1807next_page:
1488 if (vmas) 1808 if (vmas)
1489 vmas[i] = vma; 1809 vmas[i] = vma;
1490 i++; 1810 i++;
@@ -1494,10 +1814,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1494 } while (nr_pages); 1814 } while (nr_pages);
1495 return i; 1815 return i;
1496} 1816}
1817EXPORT_SYMBOL(__get_user_pages);
1497 1818
1498/** 1819/**
1499 * get_user_pages() - pin user pages in memory 1820 * get_user_pages() - pin user pages in memory
1500 * @tsk: task_struct of target task 1821 * @tsk: the task_struct to use for page fault accounting, or
1822 * NULL if faults are not to be recorded.
1501 * @mm: mm_struct of target mm 1823 * @mm: mm_struct of target mm
1502 * @start: starting user address 1824 * @start: starting user address
1503 * @nr_pages: number of pages from start to pin 1825 * @nr_pages: number of pages from start to pin
@@ -1558,7 +1880,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1558 if (force) 1880 if (force)
1559 flags |= FOLL_FORCE; 1881 flags |= FOLL_FORCE;
1560 1882
1561 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 1883 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
1884 NULL);
1562} 1885}
1563EXPORT_SYMBOL(get_user_pages); 1886EXPORT_SYMBOL(get_user_pages);
1564 1887
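For context, the calling contract spelled out in the kernel-doc above (pin under mmap_sem, mark the page dirty before releasing it, drop the reference with put_page) looks roughly like this from a caller's side. This is a hedged sketch, not part of the commit: it assumes in-kernel (driver/module) context with the usual mm headers, and the eight-argument get_user_pages() of this kernel series; zero_user_page_pinned() and its user_addr parameter are hypothetical names used only for illustration.

/* needs <linux/mm.h>, <linux/highmem.h>, <linux/sched.h> in a real module */
static int zero_user_page_pinned(unsigned long user_addr)
{
        struct page *page;
        void *kaddr;
        int ret;

        down_read(&current->mm->mmap_sem);
        ret = get_user_pages(current, current->mm, user_addr & PAGE_MASK,
                             1 /* nr_pages */, 1 /* write */, 0 /* force */,
                             &page, NULL);
        up_read(&current->mm->mmap_sem);
        if (ret < 1)
                return ret < 0 ? ret : -EFAULT;

        kaddr = kmap(page);
        memset(kaddr, 0, PAGE_SIZE);    /* use the pinned page */
        kunmap(page);

        set_page_dirty_lock(page);      /* dirty it before dropping the pin */
        put_page(page);                 /* release the reference taken by GUP */
        return 0;
}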
@@ -1583,22 +1906,25 @@ struct page *get_dump_page(unsigned long addr)
1583 struct page *page; 1906 struct page *page;
1584 1907
1585 if (__get_user_pages(current, current->mm, addr, 1, 1908 if (__get_user_pages(current, current->mm, addr, 1,
1586 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) 1909 FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
1910 NULL) < 1)
1587 return NULL; 1911 return NULL;
1588 flush_cache_page(vma, addr, page_to_pfn(page)); 1912 flush_cache_page(vma, addr, page_to_pfn(page));
1589 return page; 1913 return page;
1590} 1914}
1591#endif /* CONFIG_ELF_CORE */ 1915#endif /* CONFIG_ELF_CORE */
1592 1916
1593pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1917pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr,
1594 spinlock_t **ptl) 1918 spinlock_t **ptl)
1595{ 1919{
1596 pgd_t * pgd = pgd_offset(mm, addr); 1920 pgd_t * pgd = pgd_offset(mm, addr);
1597 pud_t * pud = pud_alloc(mm, pgd, addr); 1921 pud_t * pud = pud_alloc(mm, pgd, addr);
1598 if (pud) { 1922 if (pud) {
1599 pmd_t * pmd = pmd_alloc(mm, pud, addr); 1923 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1600 if (pmd) 1924 if (pmd) {
1925 VM_BUG_ON(pmd_trans_huge(*pmd));
1601 return pte_alloc_map_lock(mm, pmd, addr, ptl); 1926 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1927 }
1602 } 1928 }
1603 return NULL; 1929 return NULL;
1604} 1930}
@@ -1817,6 +2143,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud,
1817 pmd = pmd_alloc(mm, pud, addr); 2143 pmd = pmd_alloc(mm, pud, addr);
1818 if (!pmd) 2144 if (!pmd)
1819 return -ENOMEM; 2145 return -ENOMEM;
2146 VM_BUG_ON(pmd_trans_huge(*pmd));
1820 do { 2147 do {
1821 next = pmd_addr_end(addr, end); 2148 next = pmd_addr_end(addr, end);
1822 if (remap_pte_range(mm, pmd, addr, next, 2149 if (remap_pte_range(mm, pmd, addr, next,
@@ -2026,10 +2353,10 @@ EXPORT_SYMBOL_GPL(apply_to_page_range);
2026 * handle_pte_fault chooses page fault handler according to an entry 2353 * handle_pte_fault chooses page fault handler according to an entry
2027 * which was read non-atomically. Before making any commitment, on 2354 * which was read non-atomically. Before making any commitment, on
2028 * those architectures or configurations (e.g. i386 with PAE) which 2355 * those architectures or configurations (e.g. i386 with PAE) which
2029 * might give a mix of unmatched parts, do_swap_page and do_file_page 2356 * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
2030 * must check under lock before unmapping the pte and proceeding 2357 * must check under lock before unmapping the pte and proceeding
2031 * (but do_wp_page is only called after already making such a check; 2358 * (but do_wp_page is only called after already making such a check;
2032 * and do_anonymous_page and do_no_page can safely check later on). 2359 * and do_anonymous_page can safely check later on).
2033 */ 2360 */
2034static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, 2361static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2035 pte_t *page_table, pte_t orig_pte) 2362 pte_t *page_table, pte_t orig_pte)
@@ -2047,19 +2374,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
2047 return same; 2374 return same;
2048} 2375}
2049 2376
2050/*
2051 * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when
2052 * servicing faults for write access. In the normal case, do always want
2053 * pte_mkwrite. But get_user_pages can cause write faults for mappings
2054 * that do not have writing enabled, when used by access_process_vm.
2055 */
2056static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
2057{
2058 if (likely(vma->vm_flags & VM_WRITE))
2059 pte = pte_mkwrite(pte);
2060 return pte;
2061}
2062
2063static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) 2377static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
2064{ 2378{
2065 /* 2379 /*
@@ -2079,7 +2393,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
2079 * zeroes. 2393 * zeroes.
2080 */ 2394 */
2081 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) 2395 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
2082 memset(kaddr, 0, PAGE_SIZE); 2396 clear_page(kaddr);
2083 kunmap_atomic(kaddr, KM_USER0); 2397 kunmap_atomic(kaddr, KM_USER0);
2084 flush_dcache_page(dst); 2398 flush_dcache_page(dst);
2085 } else 2399 } else
@@ -2107,10 +2421,11 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
2107static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, 2421static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2108 unsigned long address, pte_t *page_table, pmd_t *pmd, 2422 unsigned long address, pte_t *page_table, pmd_t *pmd,
2109 spinlock_t *ptl, pte_t orig_pte) 2423 spinlock_t *ptl, pte_t orig_pte)
2424 __releases(ptl)
2110{ 2425{
2111 struct page *old_page, *new_page; 2426 struct page *old_page, *new_page;
2112 pte_t entry; 2427 pte_t entry;
2113 int reuse = 0, ret = 0; 2428 int ret = 0;
2114 int page_mkwrite = 0; 2429 int page_mkwrite = 0;
2115 struct page *dirty_page = NULL; 2430 struct page *dirty_page = NULL;
2116 2431
@@ -2142,19 +2457,20 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2142 &ptl); 2457 &ptl);
2143 if (!pte_same(*page_table, orig_pte)) { 2458 if (!pte_same(*page_table, orig_pte)) {
2144 unlock_page(old_page); 2459 unlock_page(old_page);
2145 page_cache_release(old_page);
2146 goto unlock; 2460 goto unlock;
2147 } 2461 }
2148 page_cache_release(old_page); 2462 page_cache_release(old_page);
2149 } 2463 }
2150 reuse = reuse_swap_page(old_page); 2464 if (reuse_swap_page(old_page)) {
2151 if (reuse)
2152 /* 2465 /*
2153 * The page is all ours. Move it to our anon_vma so 2466 * The page is all ours. Move it to our anon_vma so
2154 * the rmap code will not search our parent or siblings. 2467 * the rmap code will not search our parent or siblings.
2155 * Protected against the rmap code by the page lock. 2468 * Protected against the rmap code by the page lock.
2156 */ 2469 */
2157 page_move_anon_rmap(old_page, vma, address); 2470 page_move_anon_rmap(old_page, vma, address);
2471 unlock_page(old_page);
2472 goto reuse;
2473 }
2158 unlock_page(old_page); 2474 unlock_page(old_page);
2159 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 2475 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
2160 (VM_WRITE|VM_SHARED))) { 2476 (VM_WRITE|VM_SHARED))) {
@@ -2210,7 +2526,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2210 &ptl); 2526 &ptl);
2211 if (!pte_same(*page_table, orig_pte)) { 2527 if (!pte_same(*page_table, orig_pte)) {
2212 unlock_page(old_page); 2528 unlock_page(old_page);
2213 page_cache_release(old_page);
2214 goto unlock; 2529 goto unlock;
2215 } 2530 }
2216 2531
@@ -2218,18 +2533,52 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
2218 } 2533 }
2219 dirty_page = old_page; 2534 dirty_page = old_page;
2220 get_page(dirty_page); 2535 get_page(dirty_page);
2221 reuse = 1;
2222 }
2223 2536
2224 if (reuse) {
2225reuse: 2537reuse:
2226 flush_cache_page(vma, address, pte_pfn(orig_pte)); 2538 flush_cache_page(vma, address, pte_pfn(orig_pte));
2227 entry = pte_mkyoung(orig_pte); 2539 entry = pte_mkyoung(orig_pte);
2228 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2540 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2229 if (ptep_set_access_flags(vma, address, page_table, entry,1)) 2541 if (ptep_set_access_flags(vma, address, page_table, entry,1))
2230 update_mmu_cache(vma, address, page_table); 2542 update_mmu_cache(vma, address, page_table);
2543 pte_unmap_unlock(page_table, ptl);
2231 ret |= VM_FAULT_WRITE; 2544 ret |= VM_FAULT_WRITE;
2232 goto unlock; 2545
2546 if (!dirty_page)
2547 return ret;
2548
2549 /*
2550 * Yes, Virginia, this is actually required to prevent a race
2551 * with clear_page_dirty_for_io() from clearing the page dirty
2552 * bit after it clears all dirty ptes, but before a racing
2553 * do_wp_page installs a dirty pte.
2554 *
2555 * __do_fault is protected similarly.
2556 */
2557 if (!page_mkwrite) {
2558 wait_on_page_locked(dirty_page);
2559 set_page_dirty_balance(dirty_page, page_mkwrite);
2560 }
2561 put_page(dirty_page);
2562 if (page_mkwrite) {
2563 struct address_space *mapping = dirty_page->mapping;
2564
2565 set_page_dirty(dirty_page);
2566 unlock_page(dirty_page);
2567 page_cache_release(dirty_page);
2568 if (mapping) {
2569 /*
2570 * Some device drivers do not set page.mapping
2571 * but still dirty their pages
2572 */
2573 balance_dirty_pages_ratelimited(mapping);
2574 }
2575 }
2576
2577 /* file_update_time outside page_lock */
2578 if (vma->vm_file)
2579 file_update_time(vma->vm_file);
2580
2581 return ret;
2233 } 2582 }
2234 2583
2235 /* 2584 /*
@@ -2254,16 +2603,6 @@ gotten:
2254 } 2603 }
2255 __SetPageUptodate(new_page); 2604 __SetPageUptodate(new_page);
2256 2605
2257 /*
2258 * Don't let another task, with possibly unlocked vma,
2259 * keep the mlocked page.
2260 */
2261 if ((vma->vm_flags & VM_LOCKED) && old_page) {
2262 lock_page(old_page); /* for LRU manipulation */
2263 clear_page_mlock(old_page);
2264 unlock_page(old_page);
2265 }
2266
2267 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) 2606 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
2268 goto oom_free_new; 2607 goto oom_free_new;
2269 2608
@@ -2331,42 +2670,19 @@ gotten:
2331 2670
2332 if (new_page) 2671 if (new_page)
2333 page_cache_release(new_page); 2672 page_cache_release(new_page);
2334 if (old_page)
2335 page_cache_release(old_page);
2336unlock: 2673unlock:
2337 pte_unmap_unlock(page_table, ptl); 2674 pte_unmap_unlock(page_table, ptl);
2338 if (dirty_page) { 2675 if (old_page) {
2339 /* 2676 /*
2340 * Yes, Virginia, this is actually required to prevent a race 2677 * Don't let another task, with possibly unlocked vma,
2341 * with clear_page_dirty_for_io() from clearing the page dirty 2678 * keep the mlocked page.
2342 * bit after it clear all dirty ptes, but before a racing
2343 * do_wp_page installs a dirty pte.
2344 *
2345 * do_no_page is protected similarly.
2346 */ 2679 */
2347 if (!page_mkwrite) { 2680 if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
2348 wait_on_page_locked(dirty_page); 2681 lock_page(old_page); /* LRU manipulation */
2349 set_page_dirty_balance(dirty_page, page_mkwrite); 2682 munlock_vma_page(old_page);
2350 } 2683 unlock_page(old_page);
2351 put_page(dirty_page);
2352 if (page_mkwrite) {
2353 struct address_space *mapping = dirty_page->mapping;
2354
2355 set_page_dirty(dirty_page);
2356 unlock_page(dirty_page);
2357 page_cache_release(dirty_page);
2358 if (mapping) {
2359 /*
2360 * Some device drivers do not set page.mapping
2361 * but still dirty their pages
2362 */
2363 balance_dirty_pages_ratelimited(mapping);
2364 }
2365 } 2684 }
2366 2685 page_cache_release(old_page);
2367 /* file_update_time outside page_lock */
2368 if (vma->vm_file)
2369 file_update_time(vma->vm_file);
2370 } 2686 }
2371 return ret; 2687 return ret;
2372oom_free_new: 2688oom_free_new:
@@ -2386,96 +2702,11 @@ unwritable_page:
2386 return ret; 2702 return ret;
2387} 2703}
2388 2704
2389/* 2705static void unmap_mapping_range_vma(struct vm_area_struct *vma,
2390 * Helper functions for unmap_mapping_range().
2391 *
2392 * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
2393 *
2394 * We have to restart searching the prio_tree whenever we drop the lock,
2395 * since the iterator is only valid while the lock is held, and anyway
2396 * a later vma might be split and reinserted earlier while lock dropped.
2397 *
2398 * The list of nonlinear vmas could be handled more efficiently, using
2399 * a placeholder, but handle it in the same way until a need is shown.
2400 * It is important to search the prio_tree before nonlinear list: a vma
2401 * may become nonlinear and be shifted from prio_tree to nonlinear list
2402 * while the lock is dropped; but never shifted from list to prio_tree.
2403 *
2404 * In order to make forward progress despite restarting the search,
2405 * vm_truncate_count is used to mark a vma as now dealt with, so we can
2406 * quickly skip it next time around. Since the prio_tree search only
2407 * shows us those vmas affected by unmapping the range in question, we
2408 * can't efficiently keep all vmas in step with mapping->truncate_count:
2409 * so instead reset them all whenever it wraps back to 0 (then go to 1).
2410 * mapping->truncate_count and vma->vm_truncate_count are protected by
2411 * i_mmap_lock.
2412 *
2413 * In order to make forward progress despite repeatedly restarting some
2414 * large vma, note the restart_addr from unmap_vmas when it breaks out:
2415 * and restart from that address when we reach that vma again. It might
2416 * have been split or merged, shrunk or extended, but never shifted: so
2417 * restart_addr remains valid so long as it remains in the vma's range.
2418 * unmap_mapping_range forces truncate_count to leap over page-aligned
2419 * values so we can save vma's restart_addr in its truncate_count field.
2420 */
2421#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
2422
2423static void reset_vma_truncate_counts(struct address_space *mapping)
2424{
2425 struct vm_area_struct *vma;
2426 struct prio_tree_iter iter;
2427
2428 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
2429 vma->vm_truncate_count = 0;
2430 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
2431 vma->vm_truncate_count = 0;
2432}
2433
2434static int unmap_mapping_range_vma(struct vm_area_struct *vma,
2435 unsigned long start_addr, unsigned long end_addr, 2706 unsigned long start_addr, unsigned long end_addr,
2436 struct zap_details *details) 2707 struct zap_details *details)
2437{ 2708{
2438 unsigned long restart_addr; 2709 zap_page_range(vma, start_addr, end_addr - start_addr, details);
2439 int need_break;
2440
2441 /*
2442 * files that support invalidating or truncating portions of the
2443 * file from under mmaped areas must have their ->fault function
2444 * return a locked page (and set VM_FAULT_LOCKED in the return).
2445 * This provides synchronisation against concurrent unmapping here.
2446 */
2447
2448again:
2449 restart_addr = vma->vm_truncate_count;
2450 if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
2451 start_addr = restart_addr;
2452 if (start_addr >= end_addr) {
2453 /* Top of vma has been split off since last time */
2454 vma->vm_truncate_count = details->truncate_count;
2455 return 0;
2456 }
2457 }
2458
2459 restart_addr = zap_page_range(vma, start_addr,
2460 end_addr - start_addr, details);
2461 need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
2462
2463 if (restart_addr >= end_addr) {
2464 /* We have now completed this vma: mark it so */
2465 vma->vm_truncate_count = details->truncate_count;
2466 if (!need_break)
2467 return 0;
2468 } else {
2469 /* Note restart_addr in vma's truncate_count field */
2470 vma->vm_truncate_count = restart_addr;
2471 if (!need_break)
2472 goto again;
2473 }
2474
2475 spin_unlock(details->i_mmap_lock);
2476 cond_resched();
2477 spin_lock(details->i_mmap_lock);
2478 return -EINTR;
2479} 2710}
2480 2711
2481static inline void unmap_mapping_range_tree(struct prio_tree_root *root, 2712static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
@@ -2485,12 +2716,8 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
2485 struct prio_tree_iter iter; 2716 struct prio_tree_iter iter;
2486 pgoff_t vba, vea, zba, zea; 2717 pgoff_t vba, vea, zba, zea;
2487 2718
2488restart:
2489 vma_prio_tree_foreach(vma, &iter, root, 2719 vma_prio_tree_foreach(vma, &iter, root,
2490 details->first_index, details->last_index) { 2720 details->first_index, details->last_index) {
2491 /* Skip quickly over those we have already dealt with */
2492 if (vma->vm_truncate_count == details->truncate_count)
2493 continue;
2494 2721
2495 vba = vma->vm_pgoff; 2722 vba = vma->vm_pgoff;
2496 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; 2723 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
@@ -2502,11 +2729,10 @@ restart:
2502 if (zea > vea) 2729 if (zea > vea)
2503 zea = vea; 2730 zea = vea;
2504 2731
2505 if (unmap_mapping_range_vma(vma, 2732 unmap_mapping_range_vma(vma,
2506 ((zba - vba) << PAGE_SHIFT) + vma->vm_start, 2733 ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
2507 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, 2734 ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
2508 details) < 0) 2735 details);
2509 goto restart;
2510 } 2736 }
2511} 2737}
2512 2738
@@ -2521,15 +2747,9 @@ static inline void unmap_mapping_range_list(struct list_head *head,
2521 * across *all* the pages in each nonlinear VMA, not just the pages 2747 * across *all* the pages in each nonlinear VMA, not just the pages
2522 * whose virtual address lies outside the file truncation point. 2748 * whose virtual address lies outside the file truncation point.
2523 */ 2749 */
2524restart:
2525 list_for_each_entry(vma, head, shared.vm_set.list) { 2750 list_for_each_entry(vma, head, shared.vm_set.list) {
2526 /* Skip quickly over those we have already dealt with */
2527 if (vma->vm_truncate_count == details->truncate_count)
2528 continue;
2529 details->nonlinear_vma = vma; 2751 details->nonlinear_vma = vma;
2530 if (unmap_mapping_range_vma(vma, vma->vm_start, 2752 unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
2531 vma->vm_end, details) < 0)
2532 goto restart;
2533 } 2753 }
2534} 2754}
2535 2755
@@ -2568,51 +2788,17 @@ void unmap_mapping_range(struct address_space *mapping,
2568 details.last_index = hba + hlen - 1; 2788 details.last_index = hba + hlen - 1;
2569 if (details.last_index < details.first_index) 2789 if (details.last_index < details.first_index)
2570 details.last_index = ULONG_MAX; 2790 details.last_index = ULONG_MAX;
2571 details.i_mmap_lock = &mapping->i_mmap_lock;
2572
2573 spin_lock(&mapping->i_mmap_lock);
2574 2791
2575 /* Protect against endless unmapping loops */
2576 mapping->truncate_count++;
2577 if (unlikely(is_restart_addr(mapping->truncate_count))) {
2578 if (mapping->truncate_count == 0)
2579 reset_vma_truncate_counts(mapping);
2580 mapping->truncate_count++;
2581 }
2582 details.truncate_count = mapping->truncate_count;
2583 2792
2793 mutex_lock(&mapping->i_mmap_mutex);
2584 if (unlikely(!prio_tree_empty(&mapping->i_mmap))) 2794 if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
2585 unmap_mapping_range_tree(&mapping->i_mmap, &details); 2795 unmap_mapping_range_tree(&mapping->i_mmap, &details);
2586 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear))) 2796 if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
2587 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details); 2797 unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
2588 spin_unlock(&mapping->i_mmap_lock); 2798 mutex_unlock(&mapping->i_mmap_mutex);
2589} 2799}
2590EXPORT_SYMBOL(unmap_mapping_range); 2800EXPORT_SYMBOL(unmap_mapping_range);
2591 2801
2592int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2593{
2594 struct address_space *mapping = inode->i_mapping;
2595
2596 /*
2597 * If the underlying filesystem is not going to provide
2598 * a way to truncate a range of blocks (punch a hole) -
2599 * we should return failure right now.
2600 */
2601 if (!inode->i_op->truncate_range)
2602 return -ENOSYS;
2603
2604 mutex_lock(&inode->i_mutex);
2605 down_write(&inode->i_alloc_sem);
2606 unmap_mapping_range(mapping, offset, (end - offset), 1);
2607 truncate_inode_pages_range(mapping, offset, end);
2608 unmap_mapping_range(mapping, offset, (end - offset), 1);
2609 inode->i_op->truncate_range(inode, offset, end);
2610 up_write(&inode->i_alloc_sem);
2611 mutex_unlock(&inode->i_mutex);
2612
2613 return 0;
2614}
2615
2616/* 2802/*
2617 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2803 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2618 * but allow concurrent faults), and pte mapped but not yet locked. 2804 * but allow concurrent faults), and pte mapped but not yet locked.
@@ -2626,7 +2812,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2626 struct page *page, *swapcache = NULL; 2812 struct page *page, *swapcache = NULL;
2627 swp_entry_t entry; 2813 swp_entry_t entry;
2628 pte_t pte; 2814 pte_t pte;
2629 struct mem_cgroup *ptr = NULL; 2815 int locked;
2816 struct mem_cgroup *ptr;
2630 int exclusive = 0; 2817 int exclusive = 0;
2631 int ret = 0; 2818 int ret = 0;
2632 2819
@@ -2666,6 +2853,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2666 /* Had to read the page from swap area: Major fault */ 2853 /* Had to read the page from swap area: Major fault */
2667 ret = VM_FAULT_MAJOR; 2854 ret = VM_FAULT_MAJOR;
2668 count_vm_event(PGMAJFAULT); 2855 count_vm_event(PGMAJFAULT);
2856 mem_cgroup_count_vm_event(mm, PGMAJFAULT);
2669 } else if (PageHWPoison(page)) { 2857 } else if (PageHWPoison(page)) {
2670 /* 2858 /*
2671 * hwpoisoned dirty swapcache pages are kept for killing 2859 * hwpoisoned dirty swapcache pages are kept for killing
@@ -2676,8 +2864,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2676 goto out_release; 2864 goto out_release;
2677 } 2865 }
2678 2866
2679 lock_page(page); 2867 locked = lock_page_or_retry(page, mm, flags);
2680 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2868 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2869 if (!locked) {
2870 ret |= VM_FAULT_RETRY;
2871 goto out_release;
2872 }
2681 2873
2682 /* 2874 /*
2683 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not 2875 * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
@@ -2810,7 +3002,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
2810 if (prev && prev->vm_end == address) 3002 if (prev && prev->vm_end == address)
2811 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; 3003 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
2812 3004
2813 expand_stack(vma, address - PAGE_SIZE); 3005 expand_downwards(vma, address - PAGE_SIZE);
2814 } 3006 }
2815 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { 3007 if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
2816 struct vm_area_struct *next = vma->vm_next; 3008 struct vm_area_struct *next = vma->vm_next;
@@ -2926,7 +3118,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2926 vmf.page = NULL; 3118 vmf.page = NULL;
2927 3119
2928 ret = vma->vm_ops->fault(vma, &vmf); 3120 ret = vma->vm_ops->fault(vma, &vmf);
2929 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 3121 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
3122 VM_FAULT_RETRY)))
2930 return ret; 3123 return ret;
2931 3124
2932 if (unlikely(PageHWPoison(vmf.page))) { 3125 if (unlikely(PageHWPoison(vmf.page))) {
@@ -2967,12 +3160,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2967 goto out; 3160 goto out;
2968 } 3161 }
2969 charged = 1; 3162 charged = 1;
2970 /*
2971 * Don't let another task, with possibly unlocked vma,
2972 * keep the mlocked page.
2973 */
2974 if (vma->vm_flags & VM_LOCKED)
2975 clear_page_mlock(vmf.page);
2976 copy_user_highpage(page, vmf.page, address, vma); 3163 copy_user_highpage(page, vmf.page, address, vma);
2977 __SetPageUptodate(page); 3164 __SetPageUptodate(page);
2978 } else { 3165 } else {
@@ -3139,9 +3326,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3139 * but allow concurrent faults), and pte mapped but not yet locked. 3326 * but allow concurrent faults), and pte mapped but not yet locked.
3140 * We return with mmap_sem still held, but pte unmapped and unlocked. 3327 * We return with mmap_sem still held, but pte unmapped and unlocked.
3141 */ 3328 */
3142static inline int handle_pte_fault(struct mm_struct *mm, 3329int handle_pte_fault(struct mm_struct *mm,
3143 struct vm_area_struct *vma, unsigned long address, 3330 struct vm_area_struct *vma, unsigned long address,
3144 pte_t *pte, pmd_t *pmd, unsigned int flags) 3331 pte_t *pte, pmd_t *pmd, unsigned int flags)
3145{ 3332{
3146 pte_t entry; 3333 pte_t entry;
3147 spinlock_t *ptl; 3334 spinlock_t *ptl;
@@ -3185,7 +3372,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
3185 * with threads. 3372 * with threads.
3186 */ 3373 */
3187 if (flags & FAULT_FLAG_WRITE) 3374 if (flags & FAULT_FLAG_WRITE)
3188 flush_tlb_page(vma, address); 3375 flush_tlb_fix_spurious_fault(vma, address);
3189 } 3376 }
3190unlock: 3377unlock:
3191 pte_unmap_unlock(pte, ptl); 3378 pte_unmap_unlock(pte, ptl);
@@ -3206,6 +3393,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3206 __set_current_state(TASK_RUNNING); 3393 __set_current_state(TASK_RUNNING);
3207 3394
3208 count_vm_event(PGFAULT); 3395 count_vm_event(PGFAULT);
3396 mem_cgroup_count_vm_event(mm, PGFAULT);
3209 3397
3210 /* do counter updates before entering really critical section. */ 3398 /* do counter updates before entering really critical section. */
3211 check_sync_rss_stat(current); 3399 check_sync_rss_stat(current);
@@ -3220,9 +3408,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3220 pmd = pmd_alloc(mm, pud, address); 3408 pmd = pmd_alloc(mm, pud, address);
3221 if (!pmd) 3409 if (!pmd)
3222 return VM_FAULT_OOM; 3410 return VM_FAULT_OOM;
3223 pte = pte_alloc_map(mm, pmd, address); 3411 if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
3224 if (!pte) 3412 if (!vma->vm_ops)
3413 return do_huge_pmd_anonymous_page(mm, vma, address,
3414 pmd, flags);
3415 } else {
3416 pmd_t orig_pmd = *pmd;
3417 barrier();
3418 if (pmd_trans_huge(orig_pmd)) {
3419 if (flags & FAULT_FLAG_WRITE &&
3420 !pmd_write(orig_pmd) &&
3421 !pmd_trans_splitting(orig_pmd))
3422 return do_huge_pmd_wp_page(mm, vma, address,
3423 pmd, orig_pmd);
3424 return 0;
3425 }
3426 }
3427
3428 /*
3429 * Use __pte_alloc instead of pte_alloc_map, because we can't
3430 * run pte_offset_map on the pmd, if a huge pmd could
3431 * materialize from under us from a different thread.
3432 */
3433 if (unlikely(pmd_none(*pmd)) && __pte_alloc(mm, vma, pmd, address))
3225 return VM_FAULT_OOM; 3434 return VM_FAULT_OOM;
3435 /* if a huge pmd materialized from under us just retry later */
3436 if (unlikely(pmd_trans_huge(*pmd)))
3437 return 0;
3438 /*
3439 * A regular pmd is established and it can't morph into a huge pmd
3440 * from under us anymore at this point because we hold the mmap_sem
3441 * read mode and khugepaged takes it in write mode. So now it's
3442 * safe to run pte_offset_map().
3443 */
3444 pte = pte_offset_map(pmd, address);
3226 3445
3227 return handle_pte_fault(mm, vma, address, pte, pmd, flags); 3446 return handle_pte_fault(mm, vma, address, pte, pmd, flags);
3228} 3447}
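The transparent-hugepage branch added to handle_mm_fault() above is taken when an anonymous, pmd-aligned range faults with THP enabled. A minimal userspace sketch of exercising it (hypothetical, not part of this commit; it assumes a kernel built with CONFIG_TRANSPARENT_HUGEPAGE and a libc that exposes MADV_HUGEPAGE):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t len = 4UL << 20;   /* 4 MiB: room for a couple of 2 MiB huge pages */
        void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (buf == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        /* Hint that this range may be backed by huge pages; the first write
         * fault can then be served by do_huge_pmd_anonymous_page(). */
        if (madvise(buf, len, MADV_HUGEPAGE))
                perror("madvise(MADV_HUGEPAGE)");   /* non-fatal on older kernels */

        memset(buf, 0xab, len);   /* trigger the anonymous write faults */
        printf("check AnonHugePages in /proc/self/smaps while this sleeps\n");
        sleep(30);
        munmap(buf, len);
        return 0;
}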
@@ -3288,7 +3507,12 @@ int make_pages_present(unsigned long addr, unsigned long end)
3288 vma = find_vma(current->mm, addr); 3507 vma = find_vma(current->mm, addr);
3289 if (!vma) 3508 if (!vma)
3290 return -ENOMEM; 3509 return -ENOMEM;
3291 write = (vma->vm_flags & VM_WRITE) != 0; 3510 /*
3511 * We want to touch writable mappings with a write fault in order
3512 * to break COW, except for shared mappings because these don't COW
3513 * and we would not want to dirty them for nothing.
3514 */
3515 write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE;
3292 BUG_ON(addr >= end); 3516 BUG_ON(addr >= end);
3293 BUG_ON(end > vma->vm_end); 3517 BUG_ON(end > vma->vm_end);
3294 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; 3518 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
@@ -3323,7 +3547,7 @@ static int __init gate_vma_init(void)
3323__initcall(gate_vma_init); 3547__initcall(gate_vma_init);
3324#endif 3548#endif
3325 3549
3326struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 3550struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
3327{ 3551{
3328#ifdef AT_SYSINFO_EHDR 3552#ifdef AT_SYSINFO_EHDR
3329 return &gate_vma; 3553 return &gate_vma;
@@ -3332,7 +3556,7 @@ struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
3332#endif 3556#endif
3333} 3557}
3334 3558
3335int in_gate_area_no_task(unsigned long addr) 3559int in_gate_area_no_mm(unsigned long addr)
3336{ 3560{
3337#ifdef AT_SYSINFO_EHDR 3561#ifdef AT_SYSINFO_EHDR
3338 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) 3562 if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END))
@@ -3343,7 +3567,7 @@ int in_gate_area_no_task(unsigned long addr)
3343 3567
3344#endif /* __HAVE_ARCH_GATE_AREA */ 3568#endif /* __HAVE_ARCH_GATE_AREA */
3345 3569
3346static int follow_pte(struct mm_struct *mm, unsigned long address, 3570static int __follow_pte(struct mm_struct *mm, unsigned long address,
3347 pte_t **ptepp, spinlock_t **ptlp) 3571 pte_t **ptepp, spinlock_t **ptlp)
3348{ 3572{
3349 pgd_t *pgd; 3573 pgd_t *pgd;
@@ -3360,6 +3584,7 @@ static int follow_pte(struct mm_struct *mm, unsigned long address,
3360 goto out; 3584 goto out;
3361 3585
3362 pmd = pmd_offset(pud, address); 3586 pmd = pmd_offset(pud, address);
3587 VM_BUG_ON(pmd_trans_huge(*pmd));
3363 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) 3588 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
3364 goto out; 3589 goto out;
3365 3590
@@ -3380,6 +3605,17 @@ out:
3380 return -EINVAL; 3605 return -EINVAL;
3381} 3606}
3382 3607
3608static inline int follow_pte(struct mm_struct *mm, unsigned long address,
3609 pte_t **ptepp, spinlock_t **ptlp)
3610{
3611 int res;
3612
3613 /* (void) is needed to make gcc happy */
3614 (void) __cond_lock(*ptlp,
3615 !(res = __follow_pte(mm, address, ptepp, ptlp)));
3616 return res;
3617}
3618
3383/** 3619/**
3384 * follow_pfn - look up PFN at a user virtual address 3620 * follow_pfn - look up PFN at a user virtual address
3385 * @vma: memory mapping 3621 * @vma: memory mapping
@@ -3461,20 +3697,15 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3461#endif 3697#endif
3462 3698
3463/* 3699/*
3464 * Access another process' address space. 3700 * Access another process' address space as given in mm. If non-NULL, use the
3465 * Source/target buffer must be kernel space, 3701 * given task for page fault accounting.
3466 * Do not walk the page table directly, use get_user_pages
3467 */ 3702 */
3468int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) 3703static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
3704 unsigned long addr, void *buf, int len, int write)
3469{ 3705{
3470 struct mm_struct *mm;
3471 struct vm_area_struct *vma; 3706 struct vm_area_struct *vma;
3472 void *old_buf = buf; 3707 void *old_buf = buf;
3473 3708
3474 mm = get_task_mm(tsk);
3475 if (!mm)
3476 return 0;
3477
3478 down_read(&mm->mmap_sem); 3709 down_read(&mm->mmap_sem);
3479 /* ignore errors, just check how much was successfully transferred */ 3710 /* ignore errors, just check how much was successfully transferred */
3480 while (len) { 3711 while (len) {
@@ -3491,7 +3722,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
3491 */ 3722 */
3492#ifdef CONFIG_HAVE_IOREMAP_PROT 3723#ifdef CONFIG_HAVE_IOREMAP_PROT
3493 vma = find_vma(mm, addr); 3724 vma = find_vma(mm, addr);
3494 if (!vma) 3725 if (!vma || vma->vm_start > addr)
3495 break; 3726 break;
3496 if (vma->vm_ops && vma->vm_ops->access) 3727 if (vma->vm_ops && vma->vm_ops->access)
3497 ret = vma->vm_ops->access(vma, addr, buf, 3728 ret = vma->vm_ops->access(vma, addr, buf,
@@ -3523,11 +3754,47 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
3523 addr += bytes; 3754 addr += bytes;
3524 } 3755 }
3525 up_read(&mm->mmap_sem); 3756 up_read(&mm->mmap_sem);
3526 mmput(mm);
3527 3757
3528 return buf - old_buf; 3758 return buf - old_buf;
3529} 3759}
3530 3760
3761/**
3762 * access_remote_vm - access another process' address space
3763 * @mm: the mm_struct of the target address space
3764 * @addr: start address to access
3765 * @buf: source or destination buffer
3766 * @len: number of bytes to transfer
3767 * @write: whether the access is a write
3768 *
3769 * The caller must hold a reference on @mm.
3770 */
3771int access_remote_vm(struct mm_struct *mm, unsigned long addr,
3772 void *buf, int len, int write)
3773{
3774 return __access_remote_vm(NULL, mm, addr, buf, len, write);
3775}
3776
3777/*
3778 * Access another process' address space.
3779 * Source/target buffer must be kernel space,
3780 * Do not walk the page table directly, use get_user_pages
3781 */
3782int access_process_vm(struct task_struct *tsk, unsigned long addr,
3783 void *buf, int len, int write)
3784{
3785 struct mm_struct *mm;
3786 int ret;
3787
3788 mm = get_task_mm(tsk);
3789 if (!mm)
3790 return 0;
3791
3792 ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
3793 mmput(mm);
3794
3795 return ret;
3796}
3797
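access_process_vm() (and the new access_remote_vm()) is the path behind ptrace data access and reads of /proc/<pid>/mem. As a rough userspace illustration (hypothetical, not part of this commit), a process can read its own address space through that interface, which ends up walking the mm with get_user_pages() as above:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char secret[] = "read back via access_process_vm";
        char copy[sizeof(secret)];
        int fd = open("/proc/self/mem", O_RDONLY);

        if (fd < 0) {
                perror("open /proc/self/mem");
                return 1;
        }
        /* The pread() offset is the virtual address to read; the kernel
         * services it with access_remote_vm() on our own mm. */
        if (pread(fd, copy, sizeof(copy), (off_t)(uintptr_t)secret) < 0) {
                perror("pread");
                close(fd);
                return 1;
        }
        printf("%s\n", copy);
        close(fd);
        return 0;
}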
3531/* 3798/*
3532 * Print the name of a VMA. 3799 * Print the name of a VMA.
3533 */ 3800 */
@@ -3589,3 +3856,74 @@ void might_fault(void)
3589} 3856}
3590EXPORT_SYMBOL(might_fault); 3857EXPORT_SYMBOL(might_fault);
3591#endif 3858#endif
3859
3860#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
3861static void clear_gigantic_page(struct page *page,
3862 unsigned long addr,
3863 unsigned int pages_per_huge_page)
3864{
3865 int i;
3866 struct page *p = page;
3867
3868 might_sleep();
3869 for (i = 0; i < pages_per_huge_page;
3870 i++, p = mem_map_next(p, page, i)) {
3871 cond_resched();
3872 clear_user_highpage(p, addr + i * PAGE_SIZE);
3873 }
3874}
3875void clear_huge_page(struct page *page,
3876 unsigned long addr, unsigned int pages_per_huge_page)
3877{
3878 int i;
3879
3880 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3881 clear_gigantic_page(page, addr, pages_per_huge_page);
3882 return;
3883 }
3884
3885 might_sleep();
3886 for (i = 0; i < pages_per_huge_page; i++) {
3887 cond_resched();
3888 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
3889 }
3890}
3891
3892static void copy_user_gigantic_page(struct page *dst, struct page *src,
3893 unsigned long addr,
3894 struct vm_area_struct *vma,
3895 unsigned int pages_per_huge_page)
3896{
3897 int i;
3898 struct page *dst_base = dst;
3899 struct page *src_base = src;
3900
3901 for (i = 0; i < pages_per_huge_page; ) {
3902 cond_resched();
3903 copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
3904
3905 i++;
3906 dst = mem_map_next(dst, dst_base, i);
3907 src = mem_map_next(src, src_base, i);
3908 }
3909}
3910
3911void copy_user_huge_page(struct page *dst, struct page *src,
3912 unsigned long addr, struct vm_area_struct *vma,
3913 unsigned int pages_per_huge_page)
3914{
3915 int i;
3916
3917 if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) {
3918 copy_user_gigantic_page(dst, src, addr, vma,
3919 pages_per_huge_page);
3920 return;
3921 }
3922
3923 might_sleep();
3924 for (i = 0; i < pages_per_huge_page; i++) {
3925 cond_resched();
3926 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
3927 }
3928}
3929#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index dd186c1a5d53..c46887b5a11e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -34,6 +34,23 @@
34 34
35#include "internal.h" 35#include "internal.h"
36 36
37DEFINE_MUTEX(mem_hotplug_mutex);
38
39void lock_memory_hotplug(void)
40{
41 mutex_lock(&mem_hotplug_mutex);
42
43 /* for exclusive hibernation if CONFIG_HIBERNATION=y */
44 lock_system_sleep();
45}
46
47void unlock_memory_hotplug(void)
48{
49 unlock_system_sleep();
50 mutex_unlock(&mem_hotplug_mutex);
51}
52
53
37/* add this memory to iomem resource */ 54/* add this memory to iomem resource */
38static struct resource *register_memory_resource(u64 start, u64 size) 55static struct resource *register_memory_resource(u64 start, u64 size)
39{ 56{
@@ -65,9 +82,10 @@ static void release_memory_resource(struct resource *res)
65 82
66#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 83#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
67#ifndef CONFIG_SPARSEMEM_VMEMMAP 84#ifndef CONFIG_SPARSEMEM_VMEMMAP
68static void get_page_bootmem(unsigned long info, struct page *page, int type) 85static void get_page_bootmem(unsigned long info, struct page *page,
86 unsigned long type)
69{ 87{
70 atomic_set(&page->_mapcount, type); 88 page->lru.next = (struct list_head *) type;
71 SetPagePrivate(page); 89 SetPagePrivate(page);
72 set_page_private(page, info); 90 set_page_private(page, info);
73 atomic_inc(&page->_count); 91 atomic_inc(&page->_count);
@@ -77,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type)
77 * so use __ref to tell modpost not to generate a warning */ 95 * so use __ref to tell modpost not to generate a warning */
78void __ref put_page_bootmem(struct page *page) 96void __ref put_page_bootmem(struct page *page)
79{ 97{
80 int type; 98 unsigned long type;
81 99
82 type = atomic_read(&page->_mapcount); 100 type = (unsigned long) page->lru.next;
83 BUG_ON(type >= -1); 101 BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
102 type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
84 103
85 if (atomic_dec_return(&page->_count) == 1) { 104 if (atomic_dec_return(&page->_count) == 1) {
86 ClearPagePrivate(page); 105 ClearPagePrivate(page);
87 set_page_private(page, 0); 106 set_page_private(page, 0);
88 reset_page_mapcount(page); 107 INIT_LIST_HEAD(&page->lru);
89 __free_pages_bootmem(page, 0); 108 __free_pages_bootmem(page, 0);
90 } 109 }
91 110
@@ -355,10 +374,6 @@ void online_page(struct page *page)
355 totalhigh_pages++; 374 totalhigh_pages++;
356#endif 375#endif
357 376
358#ifdef CONFIG_FLATMEM
359 max_mapnr = max(page_to_pfn(page), max_mapnr);
360#endif
361
362 ClearPageReserved(page); 377 ClearPageReserved(page);
363 init_page_count(page); 378 init_page_count(page);
364 __free_page(page); 379 __free_page(page);
@@ -381,7 +396,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
381} 396}
382 397
383 398
384int online_pages(unsigned long pfn, unsigned long nr_pages) 399int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
385{ 400{
386 unsigned long onlined_pages = 0; 401 unsigned long onlined_pages = 0;
387 struct zone *zone; 402 struct zone *zone;
@@ -390,6 +405,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
390 int ret; 405 int ret;
391 struct memory_notify arg; 406 struct memory_notify arg;
392 407
408 lock_memory_hotplug();
393 arg.start_pfn = pfn; 409 arg.start_pfn = pfn;
394 arg.nr_pages = nr_pages; 410 arg.nr_pages = nr_pages;
395 arg.status_change_nid = -1; 411 arg.status_change_nid = -1;
@@ -402,6 +418,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
402 ret = notifier_to_errno(ret); 418 ret = notifier_to_errno(ret);
403 if (ret) { 419 if (ret) {
404 memory_notify(MEM_CANCEL_ONLINE, &arg); 420 memory_notify(MEM_CANCEL_ONLINE, &arg);
421 unlock_memory_hotplug();
405 return ret; 422 return ret;
406 } 423 }
407 /* 424 /*
@@ -426,6 +443,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
426 printk(KERN_DEBUG "online_pages %lx at %lx failed\n", 443 printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
427 nr_pages, pfn); 444 nr_pages, pfn);
428 memory_notify(MEM_CANCEL_ONLINE, &arg); 445 memory_notify(MEM_CANCEL_ONLINE, &arg);
446 unlock_memory_hotplug();
429 return ret; 447 return ret;
430 } 448 }
431 449
@@ -437,8 +455,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
437 zone_pcp_update(zone); 455 zone_pcp_update(zone);
438 456
439 mutex_unlock(&zonelists_mutex); 457 mutex_unlock(&zonelists_mutex);
440 setup_per_zone_wmarks(); 458
441 calculate_zone_inactive_ratio(zone); 459 init_per_zone_wmark_min();
460
442 if (onlined_pages) { 461 if (onlined_pages) {
443 kswapd_run(zone_to_nid(zone)); 462 kswapd_run(zone_to_nid(zone));
444 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); 463 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
@@ -450,6 +469,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
450 469
451 if (onlined_pages) 470 if (onlined_pages)
452 memory_notify(MEM_ONLINE, &arg); 471 memory_notify(MEM_ONLINE, &arg);
472 unlock_memory_hotplug();
453 473
454 return 0; 474 return 0;
455} 475}
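online_pages() is normally reached from the memory sysfs interface when a memory block is switched online, which is where the new lock_memory_hotplug()/unlock_memory_hotplug() pair now serializes against hibernation and other hotplug operations. A hedged userspace sketch of triggering it (not part of this commit; "memory32" is a placeholder block name, and the write needs root on a CONFIG_MEMORY_HOTPLUG_SPARSE kernel):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        /* equivalent to: echo online > /sys/devices/system/memory/memoryN/state */
        const char *path = argc > 1 ? argv[1]
                : "/sys/devices/system/memory/memory32/state";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, "online", 6) != 6)   /* ends up in online_pages() */
                perror("write");
        close(fd);
        return 0;
}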
@@ -474,6 +494,14 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
474 /* init node's zones as empty zones, we don't have any present pages.*/ 494 /* init node's zones as empty zones, we don't have any present pages.*/
475 free_area_init_node(nid, zones_size, start_pfn, zholes_size); 495 free_area_init_node(nid, zones_size, start_pfn, zholes_size);
476 496
497 /*
498 * The node we allocated has no zone fallback lists. To avoid
499 * accessing an uninitialized zonelist, build one here.
500 */
501 mutex_lock(&zonelists_mutex);
502 build_all_zonelists(NULL);
503 mutex_unlock(&zonelists_mutex);
504
477 return pgdat; 505 return pgdat;
478} 506}
479 507
@@ -493,9 +521,9 @@ int mem_online_node(int nid)
493 pg_data_t *pgdat; 521 pg_data_t *pgdat;
494 int ret; 522 int ret;
495 523
496 lock_system_sleep(); 524 lock_memory_hotplug();
497 pgdat = hotadd_new_pgdat(nid, 0); 525 pgdat = hotadd_new_pgdat(nid, 0);
498 if (pgdat) { 526 if (!pgdat) {
499 ret = -ENOMEM; 527 ret = -ENOMEM;
500 goto out; 528 goto out;
501 } 529 }
@@ -504,7 +532,7 @@ int mem_online_node(int nid)
504 BUG_ON(ret); 532 BUG_ON(ret);
505 533
506out: 534out:
507 unlock_system_sleep(); 535 unlock_memory_hotplug();
508 return ret; 536 return ret;
509} 537}
510 538
@@ -516,7 +544,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
516 struct resource *res; 544 struct resource *res;
517 int ret; 545 int ret;
518 546
519 lock_system_sleep(); 547 lock_memory_hotplug();
520 548
521 res = register_memory_resource(start, size); 549 res = register_memory_resource(start, size);
522 ret = -EEXIST; 550 ret = -EEXIST;
@@ -563,7 +591,7 @@ error:
563 release_memory_resource(res); 591 release_memory_resource(res);
564 592
565out: 593out:
566 unlock_system_sleep(); 594 unlock_memory_hotplug();
567 return ret; 595 return ret;
568} 596}
569EXPORT_SYMBOL_GPL(add_memory); 597EXPORT_SYMBOL_GPL(add_memory);
@@ -602,27 +630,14 @@ static struct page *next_active_pageblock(struct page *page)
602/* Checks if this range of memory is likely to be hot-removable. */ 630/* Checks if this range of memory is likely to be hot-removable. */
603int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) 631int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
604{ 632{
605 int type;
606 struct page *page = pfn_to_page(start_pfn); 633 struct page *page = pfn_to_page(start_pfn);
607 struct page *end_page = page + nr_pages; 634 struct page *end_page = page + nr_pages;
608 635
609 /* Check the starting page of each pageblock within the range */ 636 /* Check the starting page of each pageblock within the range */
610 for (; page < end_page; page = next_active_pageblock(page)) { 637 for (; page < end_page; page = next_active_pageblock(page)) {
611 type = get_pageblock_migratetype(page); 638 if (!is_pageblock_removable_nolock(page))
612
613 /*
614 * A pageblock containing MOVABLE or free pages is considered
615 * removable
616 */
617 if (type != MIGRATE_MOVABLE && !pageblock_free(page))
618 return 0;
619
620 /*
621 * A pageblock starting with a PageReserved page is not
622 * considered removable.
623 */
624 if (PageReserved(page))
625 return 0; 639 return 0;
640 cond_resched();
626 } 641 }
627 642
628 /* All pageblocks in the memory block are likely to be hot-removable */ 643 /* All pageblocks in the memory block are likely to be hot-removable */
@@ -659,7 +674,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
659 * Scanning pfn is much easier than scanning lru list. 674 * Scanning pfn is much easier than scanning lru list.
660 * Scan pfn from start to end and Find LRU page. 675 * Scan pfn from start to end and Find LRU page.
661 */ 676 */
662int scan_lru_pages(unsigned long start, unsigned long end) 677static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
663{ 678{
664 unsigned long pfn; 679 unsigned long pfn;
665 struct page *page; 680 struct page *page;
@@ -695,7 +710,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
695 if (!pfn_valid(pfn)) 710 if (!pfn_valid(pfn))
696 continue; 711 continue;
697 page = pfn_to_page(pfn); 712 page = pfn_to_page(pfn);
698 if (!page_count(page)) 713 if (!get_page_unless_zero(page))
699 continue; 714 continue;
700 /* 715 /*
701 * We can skip free pages. And we can only deal with pages on 716 * We can skip free pages. And we can only deal with pages on
@@ -703,35 +718,39 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
703 */ 718 */
704 ret = isolate_lru_page(page); 719 ret = isolate_lru_page(page);
705 if (!ret) { /* Success */ 720 if (!ret) { /* Success */
721 put_page(page);
706 list_add_tail(&page->lru, &source); 722 list_add_tail(&page->lru, &source);
707 move_pages--; 723 move_pages--;
708 inc_zone_page_state(page, NR_ISOLATED_ANON + 724 inc_zone_page_state(page, NR_ISOLATED_ANON +
709 page_is_file_cache(page)); 725 page_is_file_cache(page));
710 726
711 } else { 727 } else {
712 /* Becasue we don't have big zone->lock. we should
713 check this again here. */
714 if (page_count(page))
715 not_managed++;
716#ifdef CONFIG_DEBUG_VM 728#ifdef CONFIG_DEBUG_VM
717 printk(KERN_ALERT "removing pfn %lx from LRU failed\n", 729 printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
718 pfn); 730 pfn);
719 dump_page(page); 731 dump_page(page);
720#endif 732#endif
733 put_page(page);
734 /* Because we don't have the big zone->lock, we should
735 check this again here. */
736 if (page_count(page)) {
737 not_managed++;
738 ret = -EBUSY;
739 break;
740 }
721 } 741 }
722 } 742 }
723 ret = -EBUSY; 743 if (!list_empty(&source)) {
724 if (not_managed) { 744 if (not_managed) {
725 if (!list_empty(&source)) 745 putback_lru_pages(&source);
746 goto out;
747 }
748 /* this function returns # of failed pages */
749 ret = migrate_pages(&source, hotremove_migrate_alloc, 0,
750 true, true);
751 if (ret)
726 putback_lru_pages(&source); 752 putback_lru_pages(&source);
727 goto out;
728 } 753 }
729 ret = 0;
730 if (list_empty(&source))
731 goto out;
732 /* this function returns # of failed pages */
733 ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
734
735out: 754out:
736 return ret; 755 return ret;
737} 756}
@@ -783,7 +802,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
783 return offlined; 802 return offlined;
784} 803}
785 804
786static int offline_pages(unsigned long start_pfn, 805static int __ref offline_pages(unsigned long start_pfn,
787 unsigned long end_pfn, unsigned long timeout) 806 unsigned long end_pfn, unsigned long timeout)
788{ 807{
789 unsigned long pfn, nr_pages, expire; 808 unsigned long pfn, nr_pages, expire;
@@ -803,7 +822,7 @@ static int offline_pages(unsigned long start_pfn,
803 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 822 if (!test_pages_in_a_zone(start_pfn, end_pfn))
804 return -EINVAL; 823 return -EINVAL;
805 824
806 lock_system_sleep(); 825 lock_memory_hotplug();
807 826
808 zone = page_zone(pfn_to_page(start_pfn)); 827 zone = page_zone(pfn_to_page(start_pfn));
809 node = zone_to_nid(zone); 828 node = zone_to_nid(zone);
@@ -840,7 +859,6 @@ repeat:
840 ret = 0; 859 ret = 0;
841 if (drain) { 860 if (drain) {
842 lru_add_drain_all(); 861 lru_add_drain_all();
843 flush_scheduled_work();
844 cond_resched(); 862 cond_resched();
845 drain_all_pages(); 863 drain_all_pages();
846 } 864 }
@@ -862,7 +880,6 @@ repeat:
862 } 880 }
863 /* drain all zone's lru pagevec, this is asyncronous... */ 881 /* drain all zone's lru pagevec, this is asyncronous... */
864 lru_add_drain_all(); 882 lru_add_drain_all();
865 flush_scheduled_work();
866 yield(); 883 yield();
867 /* drain pcp pages , this is synchrouns. */ 884 /* drain pcp pages , this is synchrouns. */
868 drain_all_pages(); 885 drain_all_pages();
@@ -883,8 +900,8 @@ repeat:
883 zone->zone_pgdat->node_present_pages -= offlined_pages; 900 zone->zone_pgdat->node_present_pages -= offlined_pages;
884 totalram_pages -= offlined_pages; 901 totalram_pages -= offlined_pages;
885 902
886 setup_per_zone_wmarks(); 903 init_per_zone_wmark_min();
887 calculate_zone_inactive_ratio(zone); 904
888 if (!node_present_pages(node)) { 905 if (!node_present_pages(node)) {
889 node_clear_state(node, N_HIGH_MEMORY); 906 node_clear_state(node, N_HIGH_MEMORY);
890 kswapd_stop(node); 907 kswapd_stop(node);
@@ -894,7 +911,7 @@ repeat:
894 writeback_set_ratelimit(); 911 writeback_set_ratelimit();
895 912
896 memory_notify(MEM_OFFLINE, &arg); 913 memory_notify(MEM_OFFLINE, &arg);
897 unlock_system_sleep(); 914 unlock_memory_hotplug();
898 return 0; 915 return 0;
899 916
900failed_removal: 917failed_removal:
@@ -905,7 +922,7 @@ failed_removal:
905 undo_isolate_page_range(start_pfn, end_pfn); 922 undo_isolate_page_range(start_pfn, end_pfn);
906 923
907out: 924out:
908 unlock_system_sleep(); 925 unlock_memory_hotplug();
909 return ret; 926 return ret;
910} 927}
911 928
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f969da5dd8a2..e7fb9d25c54e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -99,7 +99,6 @@
99/* Internal flags */ 99/* Internal flags */
100#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ 100#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
101#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 101#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
102#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
103 102
104static struct kmem_cache *policy_cache; 103static struct kmem_cache *policy_cache;
105static struct kmem_cache *sn_cache; 104static struct kmem_cache *sn_cache;
@@ -457,7 +456,6 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
457 }, 456 },
458}; 457};
459 458
460static void gather_stats(struct page *, void *, int pte_dirty);
461static void migrate_page_add(struct page *page, struct list_head *pagelist, 459static void migrate_page_add(struct page *page, struct list_head *pagelist,
462 unsigned long flags); 460 unsigned long flags);
463 461
@@ -492,9 +490,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
492 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) 490 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
493 continue; 491 continue;
494 492
495 if (flags & MPOL_MF_STATS) 493 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
496 gather_stats(page, private, pte_dirty(*pte));
497 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
498 migrate_page_add(page, private, flags); 494 migrate_page_add(page, private, flags);
499 else 495 else
500 break; 496 break;
@@ -514,6 +510,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
514 pmd = pmd_offset(pud, addr); 510 pmd = pmd_offset(pud, addr);
515 do { 511 do {
516 next = pmd_addr_end(addr, end); 512 next = pmd_addr_end(addr, end);
513 split_huge_page_pmd(vma->vm_mm, pmd);
517 if (pmd_none_or_clear_bad(pmd)) 514 if (pmd_none_or_clear_bad(pmd))
518 continue; 515 continue;
519 if (check_pte_range(vma, pmd, addr, next, nodes, 516 if (check_pte_range(vma, pmd, addr, next, nodes,
@@ -924,15 +921,22 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
924 nodemask_t nmask; 921 nodemask_t nmask;
925 LIST_HEAD(pagelist); 922 LIST_HEAD(pagelist);
926 int err = 0; 923 int err = 0;
924 struct vm_area_struct *vma;
927 925
928 nodes_clear(nmask); 926 nodes_clear(nmask);
929 node_set(source, nmask); 927 node_set(source, nmask);
930 928
931 check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, 929 vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
932 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 930 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
931 if (IS_ERR(vma))
932 return PTR_ERR(vma);
933 933
934 if (!list_empty(&pagelist)) 934 if (!list_empty(&pagelist)) {
935 err = migrate_pages(&pagelist, new_node_page, dest, 0); 935 err = migrate_pages(&pagelist, new_node_page, dest,
936 false, true);
937 if (err)
938 putback_lru_pages(&pagelist);
939 }
936 940
937 return err; 941 return err;
938} 942}
@@ -985,7 +989,7 @@ int do_migrate_pages(struct mm_struct *mm,
985 * most recent <s, d> pair that moved (s != d). If we find a pair 989 * most recent <s, d> pair that moved (s != d). If we find a pair
986 * that not only moved, but what's better, moved to an empty slot 990 * that not only moved, but what's better, moved to an empty slot
987 * (d is not set in tmp), then we break out then, with that pair. 991 * (d is not set in tmp), then we break out then, with that pair.
988 * Otherwise when we finish scannng from_tmp, we at least have the 992 * Otherwise when we finish scanning from_tmp, we at least have the
989 * most recent <s, d> pair that moved. If we get all the way through 993 * most recent <s, d> pair that moved. If we get all the way through
990 * the scan of tmp without finding any node that moved, much less 994 * the scan of tmp without finding any node that moved, much less
991 * moved to an empty node, then there is nothing left worth migrating. 995 * moved to an empty node, then there is nothing left worth migrating.
@@ -1147,9 +1151,13 @@ static long do_mbind(unsigned long start, unsigned long len,
1147 1151
1148 err = mbind_range(mm, start, end, new); 1152 err = mbind_range(mm, start, end, new);
1149 1153
1150 if (!list_empty(&pagelist)) 1154 if (!list_empty(&pagelist)) {
1151 nr_failed = migrate_pages(&pagelist, new_vma_page, 1155 nr_failed = migrate_pages(&pagelist, new_vma_page,
1152 (unsigned long)vma, 0); 1156 (unsigned long)vma,
1157 false, true);
1158 if (nr_failed)
1159 putback_lru_pages(&pagelist);
1160 }
1153 1161
1154 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 1162 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1155 err = -EIO; 1163 err = -EIO;
@@ -1298,15 +1306,15 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1298 goto out; 1306 goto out;
1299 1307
1300 /* Find the mm_struct */ 1308 /* Find the mm_struct */
1301 read_lock(&tasklist_lock); 1309 rcu_read_lock();
1302 task = pid ? find_task_by_vpid(pid) : current; 1310 task = pid ? find_task_by_vpid(pid) : current;
1303 if (!task) { 1311 if (!task) {
1304 read_unlock(&tasklist_lock); 1312 rcu_read_unlock();
1305 err = -ESRCH; 1313 err = -ESRCH;
1306 goto out; 1314 goto out;
1307 } 1315 }
1308 mm = get_task_mm(task); 1316 mm = get_task_mm(task);
1309 read_unlock(&tasklist_lock); 1317 rcu_read_unlock();
1310 1318
1311 err = -EINVAL; 1319 err = -EINVAL;
1312 if (!mm) 1320 if (!mm)
@@ -1477,7 +1485,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1477 * freeing by another task. It is the caller's responsibility to free the 1485 * freeing by another task. It is the caller's responsibility to free the
1478 * extra reference for shared policies. 1486 * extra reference for shared policies.
1479 */ 1487 */
1480static struct mempolicy *get_vma_policy(struct task_struct *task, 1488struct mempolicy *get_vma_policy(struct task_struct *task,
1481 struct vm_area_struct *vma, unsigned long addr) 1489 struct vm_area_struct *vma, unsigned long addr)
1482{ 1490{
1483 struct mempolicy *pol = task->mempolicy; 1491 struct mempolicy *pol = task->mempolicy;
@@ -1512,10 +1520,9 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1512} 1520}
1513 1521
1514/* Return a zonelist indicated by gfp for node representing a mempolicy */ 1522/* Return a zonelist indicated by gfp for node representing a mempolicy */
1515static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) 1523static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
1524 int nd)
1516{ 1525{
1517 int nd = numa_node_id();
1518
1519 switch (policy->mode) { 1526 switch (policy->mode) {
1520 case MPOL_PREFERRED: 1527 case MPOL_PREFERRED:
1521 if (!(policy->flags & MPOL_F_LOCAL)) 1528 if (!(policy->flags & MPOL_F_LOCAL))
@@ -1588,7 +1595,7 @@ unsigned slab_node(struct mempolicy *policy)
1588 (void)first_zones_zonelist(zonelist, highest_zoneidx, 1595 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1589 &policy->v.nodes, 1596 &policy->v.nodes,
1590 &zone); 1597 &zone);
1591 return zone->node; 1598 return zone ? zone->node : numa_node_id();
1592 } 1599 }
1593 1600
1594 default: 1601 default:
@@ -1667,7 +1674,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1667 zl = node_zonelist(interleave_nid(*mpol, vma, addr, 1674 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1668 huge_page_shift(hstate_vma(vma))), gfp_flags); 1675 huge_page_shift(hstate_vma(vma))), gfp_flags);
1669 } else { 1676 } else {
1670 zl = policy_zonelist(gfp_flags, *mpol); 1677 zl = policy_zonelist(gfp_flags, *mpol, numa_node_id());
1671 if ((*mpol)->mode == MPOL_BIND) 1678 if ((*mpol)->mode == MPOL_BIND)
1672 *nodemask = &(*mpol)->v.nodes; 1679 *nodemask = &(*mpol)->v.nodes;
1673 } 1680 }
@@ -1784,7 +1791,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1784} 1791}
1785 1792
1786/** 1793/**
1787 * alloc_page_vma - Allocate a page for a VMA. 1794 * alloc_pages_vma - Allocate a page for a VMA.
1788 * 1795 *
1789 * @gfp: 1796 * @gfp:
1790 * %GFP_USER user allocation. 1797 * %GFP_USER user allocation.
@@ -1793,6 +1800,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1793 * %GFP_FS allocation should not call back into a file system. 1800 * %GFP_FS allocation should not call back into a file system.
1794 * %GFP_ATOMIC don't sleep. 1801 * %GFP_ATOMIC don't sleep.
1795 * 1802 *
1803 * @order:Order of the GFP allocation.
1796 * @vma: Pointer to VMA or NULL if not available. 1804 * @vma: Pointer to VMA or NULL if not available.
1797 * @addr: Virtual Address of the allocation. Must be inside the VMA. 1805 * @addr: Virtual Address of the allocation. Must be inside the VMA.
1798 * 1806 *
@@ -1806,7 +1814,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1806 * Should be called with the mm_sem of the vma hold. 1814 * Should be called with the mm_sem of the vma hold.
1807 */ 1815 */
1808struct page * 1816struct page *
1809alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) 1817alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
1818 unsigned long addr, int node)
1810{ 1819{
1811 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1820 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1812 struct zonelist *zl; 1821 struct zonelist *zl;
@@ -1816,18 +1825,18 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1816 if (unlikely(pol->mode == MPOL_INTERLEAVE)) { 1825 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1817 unsigned nid; 1826 unsigned nid;
1818 1827
1819 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); 1828 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1820 mpol_cond_put(pol); 1829 mpol_cond_put(pol);
1821 page = alloc_page_interleave(gfp, 0, nid); 1830 page = alloc_page_interleave(gfp, order, nid);
1822 put_mems_allowed(); 1831 put_mems_allowed();
1823 return page; 1832 return page;
1824 } 1833 }
1825 zl = policy_zonelist(gfp, pol); 1834 zl = policy_zonelist(gfp, pol, node);
1826 if (unlikely(mpol_needs_cond_ref(pol))) { 1835 if (unlikely(mpol_needs_cond_ref(pol))) {
1827 /* 1836 /*
1828 * slow path: ref counted shared policy 1837 * slow path: ref counted shared policy
1829 */ 1838 */
1830 struct page *page = __alloc_pages_nodemask(gfp, 0, 1839 struct page *page = __alloc_pages_nodemask(gfp, order,
1831 zl, policy_nodemask(gfp, pol)); 1840 zl, policy_nodemask(gfp, pol));
1832 __mpol_put(pol); 1841 __mpol_put(pol);
1833 put_mems_allowed(); 1842 put_mems_allowed();
@@ -1836,7 +1845,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1836 /* 1845 /*
1837 * fast path: default or task policy 1846 * fast path: default or task policy
1838 */ 1847 */
1839 page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); 1848 page = __alloc_pages_nodemask(gfp, order, zl,
1849 policy_nodemask(gfp, pol));
1840 put_mems_allowed(); 1850 put_mems_allowed();
1841 return page; 1851 return page;
1842} 1852}
@@ -1877,7 +1887,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1877 page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); 1887 page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
1878 else 1888 else
1879 page = __alloc_pages_nodemask(gfp, order, 1889 page = __alloc_pages_nodemask(gfp, order,
1880 policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); 1890 policy_zonelist(gfp, pol, numa_node_id()),
1891 policy_nodemask(gfp, pol));
1881 put_mems_allowed(); 1892 put_mems_allowed();
1882 return page; 1893 return page;
1883} 1894}
@@ -1964,8 +1975,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1964 case MPOL_INTERLEAVE: 1975 case MPOL_INTERLEAVE:
1965 return nodes_equal(a->v.nodes, b->v.nodes); 1976 return nodes_equal(a->v.nodes, b->v.nodes);
1966 case MPOL_PREFERRED: 1977 case MPOL_PREFERRED:
1967 return a->v.preferred_node == b->v.preferred_node && 1978 return a->v.preferred_node == b->v.preferred_node;
1968 a->flags == b->flags;
1969 default: 1979 default:
1970 BUG(); 1980 BUG();
1971 return 0; 1981 return 0;
@@ -2515,159 +2525,3 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2515 } 2525 }
2516 return p - buffer; 2526 return p - buffer;
2517} 2527}
2518
2519struct numa_maps {
2520 unsigned long pages;
2521 unsigned long anon;
2522 unsigned long active;
2523 unsigned long writeback;
2524 unsigned long mapcount_max;
2525 unsigned long dirty;
2526 unsigned long swapcache;
2527 unsigned long node[MAX_NUMNODES];
2528};
2529
2530static void gather_stats(struct page *page, void *private, int pte_dirty)
2531{
2532 struct numa_maps *md = private;
2533 int count = page_mapcount(page);
2534
2535 md->pages++;
2536 if (pte_dirty || PageDirty(page))
2537 md->dirty++;
2538
2539 if (PageSwapCache(page))
2540 md->swapcache++;
2541
2542 if (PageActive(page) || PageUnevictable(page))
2543 md->active++;
2544
2545 if (PageWriteback(page))
2546 md->writeback++;
2547
2548 if (PageAnon(page))
2549 md->anon++;
2550
2551 if (count > md->mapcount_max)
2552 md->mapcount_max = count;
2553
2554 md->node[page_to_nid(page)]++;
2555}
2556
2557#ifdef CONFIG_HUGETLB_PAGE
2558static void check_huge_range(struct vm_area_struct *vma,
2559 unsigned long start, unsigned long end,
2560 struct numa_maps *md)
2561{
2562 unsigned long addr;
2563 struct page *page;
2564 struct hstate *h = hstate_vma(vma);
2565 unsigned long sz = huge_page_size(h);
2566
2567 for (addr = start; addr < end; addr += sz) {
2568 pte_t *ptep = huge_pte_offset(vma->vm_mm,
2569 addr & huge_page_mask(h));
2570 pte_t pte;
2571
2572 if (!ptep)
2573 continue;
2574
2575 pte = *ptep;
2576 if (pte_none(pte))
2577 continue;
2578
2579 page = pte_page(pte);
2580 if (!page)
2581 continue;
2582
2583 gather_stats(page, md, pte_dirty(*ptep));
2584 }
2585}
2586#else
2587static inline void check_huge_range(struct vm_area_struct *vma,
2588 unsigned long start, unsigned long end,
2589 struct numa_maps *md)
2590{
2591}
2592#endif
2593
2594/*
2595 * Display pages allocated per node and memory policy via /proc.
2596 */
2597int show_numa_map(struct seq_file *m, void *v)
2598{
2599 struct proc_maps_private *priv = m->private;
2600 struct vm_area_struct *vma = v;
2601 struct numa_maps *md;
2602 struct file *file = vma->vm_file;
2603 struct mm_struct *mm = vma->vm_mm;
2604 struct mempolicy *pol;
2605 int n;
2606 char buffer[50];
2607
2608 if (!mm)
2609 return 0;
2610
2611 md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2612 if (!md)
2613 return 0;
2614
2615 pol = get_vma_policy(priv->task, vma, vma->vm_start);
2616 mpol_to_str(buffer, sizeof(buffer), pol, 0);
2617 mpol_cond_put(pol);
2618
2619 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2620
2621 if (file) {
2622 seq_printf(m, " file=");
2623 seq_path(m, &file->f_path, "\n\t= ");
2624 } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2625 seq_printf(m, " heap");
2626 } else if (vma->vm_start <= mm->start_stack &&
2627 vma->vm_end >= mm->start_stack) {
2628 seq_printf(m, " stack");
2629 }
2630
2631 if (is_vm_hugetlb_page(vma)) {
2632 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2633 seq_printf(m, " huge");
2634 } else {
2635 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2636 &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2637 }
2638
2639 if (!md->pages)
2640 goto out;
2641
2642 if (md->anon)
2643 seq_printf(m," anon=%lu",md->anon);
2644
2645 if (md->dirty)
2646 seq_printf(m," dirty=%lu",md->dirty);
2647
2648 if (md->pages != md->anon && md->pages != md->dirty)
2649 seq_printf(m, " mapped=%lu", md->pages);
2650
2651 if (md->mapcount_max > 1)
2652 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2653
2654 if (md->swapcache)
2655 seq_printf(m," swapcache=%lu", md->swapcache);
2656
2657 if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2658 seq_printf(m," active=%lu", md->active);
2659
2660 if (md->writeback)
2661 seq_printf(m," writeback=%lu", md->writeback);
2662
2663 for_each_node_state(n, N_HIGH_MEMORY)
2664 if (md->node[n])
2665 seq_printf(m, " N%d=%lu", n, md->node[n]);
2666out:
2667 seq_putc(m, '\n');
2668 kfree(md);
2669
2670 if (m->count < m->size)
2671 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2672 return 0;
2673}
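
The mempolicy.c hunks above widen the page-allocation entry point: alloc_page_vma() becomes alloc_pages_vma() with an explicit order and target node, and policy_zonelist() now takes the node id from its caller instead of always using numa_node_id(). The header-side shim for the rename is not part of this diff; the sketch below is a hedged guess at how the old single-page interface is kept and how a new-style caller passes order and node (the wrapper macro and the helper function are assumptions, not lines from this patch).

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/mempolicy.h>

/* assumed compatibility wrapper: old name = order 0 on the local node */
#define alloc_page_vma(gfp_mask, vma, addr)				\
	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())

/*
 * New-style caller: request a higher-order page on an explicit node,
 * which is what the transparent-hugepage fault path needs.  'order'
 * and 'node' here are plain parameters, not values from this diff.
 */
static struct page *alloc_order_on_node(gfp_t gfp, int order,
					struct vm_area_struct *vma,
					unsigned long addr, int node)
{
	return alloc_pages_vma(gfp, order, vma, addr, node);
}
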
diff --git a/mm/migrate.c b/mm/migrate.c
index 38e7cad782f4..666e4e677414 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -32,8 +32,11 @@
32#include <linux/security.h> 32#include <linux/security.h>
33#include <linux/memcontrol.h> 33#include <linux/memcontrol.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/hugetlb.h>
35#include <linux/gfp.h> 36#include <linux/gfp.h>
36 37
38#include <asm/tlbflush.h>
39
37#include "internal.h" 40#include "internal.h"
38 41
39#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 42#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -95,26 +98,36 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
95 pte_t *ptep, pte; 98 pte_t *ptep, pte;
96 spinlock_t *ptl; 99 spinlock_t *ptl;
97 100
98 pgd = pgd_offset(mm, addr); 101 if (unlikely(PageHuge(new))) {
99 if (!pgd_present(*pgd)) 102 ptep = huge_pte_offset(mm, addr);
100 goto out; 103 if (!ptep)
104 goto out;
105 ptl = &mm->page_table_lock;
106 } else {
107 pgd = pgd_offset(mm, addr);
108 if (!pgd_present(*pgd))
109 goto out;
101 110
102 pud = pud_offset(pgd, addr); 111 pud = pud_offset(pgd, addr);
103 if (!pud_present(*pud)) 112 if (!pud_present(*pud))
104 goto out; 113 goto out;
105 114
106 pmd = pmd_offset(pud, addr); 115 pmd = pmd_offset(pud, addr);
107 if (!pmd_present(*pmd)) 116 if (pmd_trans_huge(*pmd))
108 goto out; 117 goto out;
118 if (!pmd_present(*pmd))
119 goto out;
109 120
110 ptep = pte_offset_map(pmd, addr); 121 ptep = pte_offset_map(pmd, addr);
111 122
112 if (!is_swap_pte(*ptep)) { 123 if (!is_swap_pte(*ptep)) {
113 pte_unmap(ptep); 124 pte_unmap(ptep);
114 goto out; 125 goto out;
115 } 126 }
127
128 ptl = pte_lockptr(mm, pmd);
129 }
116 130
117 ptl = pte_lockptr(mm, pmd);
118 spin_lock(ptl); 131 spin_lock(ptl);
119 pte = *ptep; 132 pte = *ptep;
120 if (!is_swap_pte(pte)) 133 if (!is_swap_pte(pte))
@@ -130,10 +143,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
130 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 143 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
131 if (is_write_migration_entry(entry)) 144 if (is_write_migration_entry(entry))
132 pte = pte_mkwrite(pte); 145 pte = pte_mkwrite(pte);
146#ifdef CONFIG_HUGETLB_PAGE
147 if (PageHuge(new))
148 pte = pte_mkhuge(pte);
149#endif
133 flush_cache_page(vma, addr, pte_pfn(pte)); 150 flush_cache_page(vma, addr, pte_pfn(pte));
134 set_pte_at(mm, addr, ptep, pte); 151 set_pte_at(mm, addr, ptep, pte);
135 152
136 if (PageAnon(new)) 153 if (PageHuge(new)) {
154 if (PageAnon(new))
155 hugepage_add_anon_rmap(new, vma, addr);
156 else
157 page_dup_rmap(new);
158 } else if (PageAnon(new))
137 page_add_anon_rmap(new, vma, addr); 159 page_add_anon_rmap(new, vma, addr);
138 else 160 else
139 page_add_file_rmap(new); 161 page_add_file_rmap(new);
@@ -226,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
226 248
227 expected_count = 2 + page_has_private(page); 249 expected_count = 2 + page_has_private(page);
228 if (page_count(page) != expected_count || 250 if (page_count(page) != expected_count ||
229 (struct page *)radix_tree_deref_slot(pslot) != page) { 251 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
230 spin_unlock_irq(&mapping->tree_lock); 252 spin_unlock_irq(&mapping->tree_lock);
231 return -EAGAIN; 253 return -EAGAIN;
232 } 254 }
@@ -266,7 +288,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
266 */ 288 */
267 __dec_zone_page_state(page, NR_FILE_PAGES); 289 __dec_zone_page_state(page, NR_FILE_PAGES);
268 __inc_zone_page_state(newpage, NR_FILE_PAGES); 290 __inc_zone_page_state(newpage, NR_FILE_PAGES);
269 if (PageSwapBacked(page)) { 291 if (!PageSwapCache(page) && PageSwapBacked(page)) {
270 __dec_zone_page_state(page, NR_SHMEM); 292 __dec_zone_page_state(page, NR_SHMEM);
271 __inc_zone_page_state(newpage, NR_SHMEM); 293 __inc_zone_page_state(newpage, NR_SHMEM);
272 } 294 }
@@ -276,11 +298,59 @@ static int migrate_page_move_mapping(struct address_space *mapping,
276} 298}
277 299
278/* 300/*
301 * The expected number of remaining references is the same as that
302 * of migrate_page_move_mapping().
303 */
304int migrate_huge_page_move_mapping(struct address_space *mapping,
305 struct page *newpage, struct page *page)
306{
307 int expected_count;
308 void **pslot;
309
310 if (!mapping) {
311 if (page_count(page) != 1)
312 return -EAGAIN;
313 return 0;
314 }
315
316 spin_lock_irq(&mapping->tree_lock);
317
318 pslot = radix_tree_lookup_slot(&mapping->page_tree,
319 page_index(page));
320
321 expected_count = 2 + page_has_private(page);
322 if (page_count(page) != expected_count ||
323 radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) {
324 spin_unlock_irq(&mapping->tree_lock);
325 return -EAGAIN;
326 }
327
328 if (!page_freeze_refs(page, expected_count)) {
329 spin_unlock_irq(&mapping->tree_lock);
330 return -EAGAIN;
331 }
332
333 get_page(newpage);
334
335 radix_tree_replace_slot(pslot, newpage);
336
337 page_unfreeze_refs(page, expected_count);
338
339 __put_page(page);
340
341 spin_unlock_irq(&mapping->tree_lock);
342 return 0;
343}
344
345/*
279 * Copy the page to its new location 346 * Copy the page to its new location
280 */ 347 */
281static void migrate_page_copy(struct page *newpage, struct page *page) 348void migrate_page_copy(struct page *newpage, struct page *page)
282{ 349{
283 copy_highpage(newpage, page); 350 if (PageHuge(page))
351 copy_huge_page(newpage, page);
352 else
353 copy_highpage(newpage, page);
284 354
285 if (PageError(page)) 355 if (PageError(page))
286 SetPageError(newpage); 356 SetPageError(newpage);
@@ -305,7 +375,7 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
305 * redo the accounting that clear_page_dirty_for_io undid, 375 * redo the accounting that clear_page_dirty_for_io undid,
306 * but we can't use set_page_dirty because that function 376 * but we can't use set_page_dirty because that function
307 * is actually a signal that all of the page has become dirty. 377 * is actually a signal that all of the page has become dirty.
308 * Wheras only part of our page may be dirty. 378 * Whereas only part of our page may be dirty.
309 */ 379 */
310 __set_page_dirty_nobuffers(newpage); 380 __set_page_dirty_nobuffers(newpage);
311 } 381 }
@@ -431,7 +501,6 @@ static int writeout(struct address_space *mapping, struct page *page)
431 .nr_to_write = 1, 501 .nr_to_write = 1,
432 .range_start = 0, 502 .range_start = 0,
433 .range_end = LLONG_MAX, 503 .range_end = LLONG_MAX,
434 .nonblocking = 1,
435 .for_reclaim = 1 504 .for_reclaim = 1
436 }; 505 };
437 int rc; 506 int rc;
@@ -495,7 +564,7 @@ static int fallback_migrate_page(struct address_space *mapping,
495 * == 0 - success 564 * == 0 - success
496 */ 565 */
497static int move_to_new_page(struct page *newpage, struct page *page, 566static int move_to_new_page(struct page *newpage, struct page *page,
498 int remap_swapcache) 567 int remap_swapcache, bool sync)
499{ 568{
500 struct address_space *mapping; 569 struct address_space *mapping;
501 int rc; 570 int rc;
@@ -517,18 +586,28 @@ static int move_to_new_page(struct page *newpage, struct page *page,
517 mapping = page_mapping(page); 586 mapping = page_mapping(page);
518 if (!mapping) 587 if (!mapping)
519 rc = migrate_page(mapping, newpage, page); 588 rc = migrate_page(mapping, newpage, page);
520 else if (mapping->a_ops->migratepage) 589 else {
521 /* 590 /*
522 * Most pages have a mapping and most filesystems 591 * Do not writeback pages if !sync and migratepage is
523 * should provide a migration function. Anonymous 592 * not pointing to migrate_page() which is nonblocking
524 * pages are part of swap space which also has its 593 * (swapcache/tmpfs uses migratepage = migrate_page).
525 * own migration function. This is the most common
526 * path for page migration.
527 */ 594 */
528 rc = mapping->a_ops->migratepage(mapping, 595 if (PageDirty(page) && !sync &&
529 newpage, page); 596 mapping->a_ops->migratepage != migrate_page)
530 else 597 rc = -EBUSY;
531 rc = fallback_migrate_page(mapping, newpage, page); 598 else if (mapping->a_ops->migratepage)
599 /*
600 * Most pages have a mapping and most filesystems
601 * should provide a migration function. Anonymous
602 * pages are part of swap space which also has its
603 * own migration function. This is the most common
604 * path for page migration.
605 */
606 rc = mapping->a_ops->migratepage(mapping,
607 newpage, page);
608 else
609 rc = fallback_migrate_page(mapping, newpage, page);
610 }
532 611
533 if (rc) { 612 if (rc) {
534 newpage->mapping = NULL; 613 newpage->mapping = NULL;
@@ -547,15 +626,14 @@ static int move_to_new_page(struct page *newpage, struct page *page,
547 * to the newly allocated page in newpage. 626 * to the newly allocated page in newpage.
548 */ 627 */
549static int unmap_and_move(new_page_t get_new_page, unsigned long private, 628static int unmap_and_move(new_page_t get_new_page, unsigned long private,
550 struct page *page, int force, int offlining) 629 struct page *page, int force, bool offlining, bool sync)
551{ 630{
552 int rc = 0; 631 int rc = 0;
553 int *result = NULL; 632 int *result = NULL;
554 struct page *newpage = get_new_page(page, private, &result); 633 struct page *newpage = get_new_page(page, private, &result);
555 int remap_swapcache = 1; 634 int remap_swapcache = 1;
556 int rcu_locked = 0;
557 int charge = 0; 635 int charge = 0;
558 struct mem_cgroup *mem = NULL; 636 struct mem_cgroup *mem;
559 struct anon_vma *anon_vma = NULL; 637 struct anon_vma *anon_vma = NULL;
560 638
561 if (!newpage) 639 if (!newpage)
@@ -565,13 +643,33 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
565 /* page was freed from under us. So we are done. */ 643 /* page was freed from under us. So we are done. */
566 goto move_newpage; 644 goto move_newpage;
567 } 645 }
646 if (unlikely(PageTransHuge(page)))
647 if (unlikely(split_huge_page(page)))
648 goto move_newpage;
568 649
569 /* prepare cgroup just returns 0 or -ENOMEM */ 650 /* prepare cgroup just returns 0 or -ENOMEM */
570 rc = -EAGAIN; 651 rc = -EAGAIN;
571 652
572 if (!trylock_page(page)) { 653 if (!trylock_page(page)) {
573 if (!force) 654 if (!force || !sync)
655 goto move_newpage;
656
657 /*
658 * It's not safe for direct compaction to call lock_page.
659 * For example, during page readahead pages are added locked
660 * to the LRU. Later, when the IO completes the pages are
661 * marked uptodate and unlocked. However, the queueing
662 * could be merging multiple pages for one bio (e.g.
663 * mpage_readpages). If an allocation happens for the
664 * second or third page, the process can end up locking
665 * the same page twice and deadlocking. Rather than
666 * trying to be clever about what pages can be locked,
667 * avoid the use of lock_page for direct compaction
668 * altogether.
669 */
670 if (current->flags & PF_MEMALLOC)
574 goto move_newpage; 671 goto move_newpage;
672
575 lock_page(page); 673 lock_page(page);
576 } 674 }
577 675
@@ -590,7 +688,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
590 } 688 }
591 689
592 /* charge against new page */ 690 /* charge against new page */
593 charge = mem_cgroup_prepare_migration(page, newpage, &mem); 691 charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL);
594 if (charge == -ENOMEM) { 692 if (charge == -ENOMEM) {
595 rc = -ENOMEM; 693 rc = -ENOMEM;
596 goto unlock; 694 goto unlock;
@@ -598,6 +696,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
598 BUG_ON(charge); 696 BUG_ON(charge);
599 697
600 if (PageWriteback(page)) { 698 if (PageWriteback(page)) {
699 /*
700 * For !sync, there is no point retrying as the retry loop
701 * is expected to be too short for PageWriteback to be cleared
702 */
703 if (!sync) {
704 rc = -EBUSY;
705 goto uncharge;
706 }
601 if (!force) 707 if (!force)
602 goto uncharge; 708 goto uncharge;
603 wait_on_page_writeback(page); 709 wait_on_page_writeback(page);
@@ -605,20 +711,22 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
605 /* 711 /*
606 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, 712 * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
607 * we cannot notice that anon_vma is freed while we migrates a page. 713 * we cannot notice that anon_vma is freed while we migrates a page.
608 * This rcu_read_lock() delays freeing anon_vma pointer until the end 714 * This get_anon_vma() delays freeing anon_vma pointer until the end
609 * of migration. File cache pages are no problem because of page_lock() 715 * of migration. File cache pages are no problem because of page_lock()
610 * File Caches may use write_page() or lock_page() in migration, then, 716 * File Caches may use write_page() or lock_page() in migration, then,
611 * just care Anon page here. 717 * just care Anon page here.
612 */ 718 */
613 if (PageAnon(page)) { 719 if (PageAnon(page)) {
614 rcu_read_lock(); 720 /*
615 rcu_locked = 1; 721 * Only page_lock_anon_vma() understands the subtleties of
616 722 * getting a hold on an anon_vma from outside one of its mms.
617 /* Determine how to safely use anon_vma */ 723 */
618 if (!page_mapped(page)) { 724 anon_vma = page_get_anon_vma(page);
619 if (!PageSwapCache(page)) 725 if (anon_vma) {
620 goto rcu_unlock; 726 /*
621 727 * Anon page
728 */
729 } else if (PageSwapCache(page)) {
622 /* 730 /*
623 * We cannot be sure that the anon_vma of an unmapped 731 * We cannot be sure that the anon_vma of an unmapped
624 * swapcache page is safe to use because we don't 732 * swapcache page is safe to use because we don't
@@ -633,13 +741,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
633 */ 741 */
634 remap_swapcache = 0; 742 remap_swapcache = 0;
635 } else { 743 } else {
636 /* 744 goto uncharge;
637 * Take a reference count on the anon_vma if the
638 * page is mapped so that it is guaranteed to
639 * exist when the page is remapped later
640 */
641 anon_vma = page_anon_vma(page);
642 get_anon_vma(anon_vma);
643 } 745 }
644 } 746 }
645 747
@@ -656,16 +758,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
656 * free the metadata, so the page can be freed. 758 * free the metadata, so the page can be freed.
657 */ 759 */
658 if (!page->mapping) { 760 if (!page->mapping) {
659 if (!PageAnon(page) && page_has_private(page)) { 761 VM_BUG_ON(PageAnon(page));
660 /* 762 if (page_has_private(page)) {
661 * Go direct to try_to_free_buffers() here because
662 * a) that's what try_to_release_page() would do anyway
663 * b) we may be under rcu_read_lock() here, so we can't
664 * use GFP_KERNEL which is what try_to_release_page()
665 * needs to be effective.
666 */
667 try_to_free_buffers(page); 763 try_to_free_buffers(page);
668 goto rcu_unlock; 764 goto uncharge;
669 } 765 }
670 goto skip_unmap; 766 goto skip_unmap;
671 } 767 }
@@ -675,24 +771,22 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
675 771
676skip_unmap: 772skip_unmap:
677 if (!page_mapped(page)) 773 if (!page_mapped(page))
678 rc = move_to_new_page(newpage, page, remap_swapcache); 774 rc = move_to_new_page(newpage, page, remap_swapcache, sync);
679 775
680 if (rc && remap_swapcache) 776 if (rc && remap_swapcache)
681 remove_migration_ptes(page, page); 777 remove_migration_ptes(page, page);
682rcu_unlock:
683 778
684 /* Drop an anon_vma reference if we took one */ 779 /* Drop an anon_vma reference if we took one */
685 if (anon_vma) 780 if (anon_vma)
686 drop_anon_vma(anon_vma); 781 put_anon_vma(anon_vma);
687 782
688 if (rcu_locked)
689 rcu_read_unlock();
690uncharge: 783uncharge:
691 if (!charge) 784 if (!charge)
692 mem_cgroup_end_migration(mem, page, newpage); 785 mem_cgroup_end_migration(mem, page, newpage, rc == 0);
693unlock: 786unlock:
694 unlock_page(page); 787 unlock_page(page);
695 788
789move_newpage:
696 if (rc != -EAGAIN) { 790 if (rc != -EAGAIN) {
697 /* 791 /*
698 * A page that has been migrated has all references 792 * A page that has been migrated has all references
@@ -706,8 +800,6 @@ unlock:
706 putback_lru_page(page); 800 putback_lru_page(page);
707 } 801 }
708 802
709move_newpage:
710
711 /* 803 /*
712 * Move the new page to the LRU. If migration was not successful 804 * Move the new page to the LRU. If migration was not successful
713 * then this will free the page. 805 * then this will free the page.
@@ -724,6 +816,76 @@ move_newpage:
724} 816}
725 817
726/* 818/*
819 * Counterpart of unmap_and_move_page() for hugepage migration.
820 *
821 * This function doesn't wait the completion of hugepage I/O
822 * because there is no race between I/O and migration for hugepage.
823 * Note that currently hugepage I/O occurs only in direct I/O
824 * where no lock is held and PG_writeback is irrelevant,
825 * and writeback status of all subpages are counted in the reference
826 * count of the head page (i.e. if all subpages of a 2MB hugepage are
827 * under direct I/O, the reference of the head page is 512 and a bit more.)
828 * This means that when we try to migrate hugepage whose subpages are
829 * doing direct I/O, some references remain after try_to_unmap() and
830 * hugepage migration fails without data corruption.
831 *
832 * There is also no race when direct I/O is issued on the page under migration,
833 * because then pte is replaced with migration swap entry and direct I/O code
834 * will wait in the page fault for migration to complete.
835 */
836static int unmap_and_move_huge_page(new_page_t get_new_page,
837 unsigned long private, struct page *hpage,
838 int force, bool offlining, bool sync)
839{
840 int rc = 0;
841 int *result = NULL;
842 struct page *new_hpage = get_new_page(hpage, private, &result);
843 struct anon_vma *anon_vma = NULL;
844
845 if (!new_hpage)
846 return -ENOMEM;
847
848 rc = -EAGAIN;
849
850 if (!trylock_page(hpage)) {
851 if (!force || !sync)
852 goto out;
853 lock_page(hpage);
854 }
855
856 if (PageAnon(hpage))
857 anon_vma = page_get_anon_vma(hpage);
858
859 try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
860
861 if (!page_mapped(hpage))
862 rc = move_to_new_page(new_hpage, hpage, 1, sync);
863
864 if (rc)
865 remove_migration_ptes(hpage, hpage);
866
867 if (anon_vma)
868 put_anon_vma(anon_vma);
869out:
870 unlock_page(hpage);
871
872 if (rc != -EAGAIN) {
873 list_del(&hpage->lru);
874 put_page(hpage);
875 }
876
877 put_page(new_hpage);
878
879 if (result) {
880 if (rc)
881 *result = rc;
882 else
883 *result = page_to_nid(new_hpage);
884 }
885 return rc;
886}
887
888/*
727 * migrate_pages 889 * migrate_pages
728 * 890 *
729 * The function takes one list of pages to migrate and a function 891 * The function takes one list of pages to migrate and a function
@@ -732,13 +894,15 @@ move_newpage:
732 * 894 *
733 * The function returns after 10 attempts or if no pages 895 * The function returns after 10 attempts or if no pages
734 * are movable anymore because to has become empty 896 * are movable anymore because to has become empty
735 * or no retryable pages exist anymore. All pages will be 897 * or no retryable pages exist anymore.
736 * returned to the LRU or freed. 898 * Caller should call putback_lru_pages to return pages to the LRU
899 * or free list only if ret != 0.
737 * 900 *
738 * Return: Number of pages not migrated or error code. 901 * Return: Number of pages not migrated or error code.
739 */ 902 */
740int migrate_pages(struct list_head *from, 903int migrate_pages(struct list_head *from,
741 new_page_t get_new_page, unsigned long private, int offlining) 904 new_page_t get_new_page, unsigned long private, bool offlining,
905 bool sync)
742{ 906{
743 int retry = 1; 907 int retry = 1;
744 int nr_failed = 0; 908 int nr_failed = 0;
@@ -758,7 +922,8 @@ int migrate_pages(struct list_head *from,
758 cond_resched(); 922 cond_resched();
759 923
760 rc = unmap_and_move(get_new_page, private, 924 rc = unmap_and_move(get_new_page, private,
761 page, pass > 2, offlining); 925 page, pass > 2, offlining,
926 sync);
762 927
763 switch(rc) { 928 switch(rc) {
764 case -ENOMEM: 929 case -ENOMEM:
@@ -780,8 +945,50 @@ out:
780 if (!swapwrite) 945 if (!swapwrite)
781 current->flags &= ~PF_SWAPWRITE; 946 current->flags &= ~PF_SWAPWRITE;
782 947
783 putback_lru_pages(from); 948 if (rc)
949 return rc;
950
951 return nr_failed + retry;
952}
953
954int migrate_huge_pages(struct list_head *from,
955 new_page_t get_new_page, unsigned long private, bool offlining,
956 bool sync)
957{
958 int retry = 1;
959 int nr_failed = 0;
960 int pass = 0;
961 struct page *page;
962 struct page *page2;
963 int rc;
964
965 for (pass = 0; pass < 10 && retry; pass++) {
966 retry = 0;
967
968 list_for_each_entry_safe(page, page2, from, lru) {
969 cond_resched();
970
971 rc = unmap_and_move_huge_page(get_new_page,
972 private, page, pass > 2, offlining,
973 sync);
784 974
975 switch(rc) {
976 case -ENOMEM:
977 goto out;
978 case -EAGAIN:
979 retry++;
980 break;
981 case 0:
982 break;
983 default:
984 /* Permanent failure */
985 nr_failed++;
986 break;
987 }
988 }
989 }
990 rc = 0;
991out:
785 if (rc) 992 if (rc)
786 return rc; 993 return rc;
787 994
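
migrate_huge_pages(), added in the hunk above, follows the same ten-pass retry loop as migrate_pages() but never puts leftover pages back on the LRU, since hugepages do not live there; per the earlier comment in this file, writeback is also a non-issue because hugepage I/O is direct I/O accounted in the head page's reference count. Below is a hedged sketch of a caller in the spirit of the soft-offline path: the alloc_huge_page_node() callback and the simplified clean-up loop are assumptions, not code from this diff.

#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/mm.h>

/* new_page_t callback: allocate a destination hugepage of the same size */
static struct page *new_hugepage(struct page *hpage, unsigned long private,
				 int **result)
{
	return alloc_huge_page_node(page_hstate(compound_head(hpage)),
				    numa_node_id());
}

/* caller is assumed to already hold a reference on hpage */
static int migrate_one_hugepage(struct page *hpage)
{
	LIST_HEAD(pagelist);
	struct page *p, *p2;
	int ret;

	list_add(&hpage->lru, &pagelist);
	ret = migrate_huge_pages(&pagelist, new_hugepage, 0,
				 false /* offlining */, true /* sync */);
	if (ret) {
		/* simplified: drop our reference on anything left over */
		list_for_each_entry_safe(p, p2, &pagelist, lru) {
			list_del(&p->lru);
			put_page(p);
		}
	}
	return ret;
}
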
@@ -841,10 +1048,10 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
841 1048
842 err = -EFAULT; 1049 err = -EFAULT;
843 vma = find_vma(mm, pp->addr); 1050 vma = find_vma(mm, pp->addr);
844 if (!vma || !vma_migratable(vma)) 1051 if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma))
845 goto set_status; 1052 goto set_status;
846 1053
847 page = follow_page(vma, pp->addr, FOLL_GET); 1054 page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT);
848 1055
849 err = PTR_ERR(page); 1056 err = PTR_ERR(page);
850 if (IS_ERR(page)) 1057 if (IS_ERR(page))
@@ -890,9 +1097,12 @@ set_status:
890 } 1097 }
891 1098
892 err = 0; 1099 err = 0;
893 if (!list_empty(&pagelist)) 1100 if (!list_empty(&pagelist)) {
894 err = migrate_pages(&pagelist, new_page_node, 1101 err = migrate_pages(&pagelist, new_page_node,
895 (unsigned long)pm, 0); 1102 (unsigned long)pm, 0, true);
1103 if (err)
1104 putback_lru_pages(&pagelist);
1105 }
896 1106
897 up_read(&mm->mmap_sem); 1107 up_read(&mm->mmap_sem);
898 return err; 1108 return err;
@@ -1005,7 +1215,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
1005 int err = -EFAULT; 1215 int err = -EFAULT;
1006 1216
1007 vma = find_vma(mm, addr); 1217 vma = find_vma(mm, addr);
1008 if (!vma) 1218 if (!vma || addr < vma->vm_start)
1009 goto set_status; 1219 goto set_status;
1010 1220
1011 page = follow_page(vma, addr, 0); 1221 page = follow_page(vma, addr, 0);
@@ -1086,14 +1296,14 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1086 return -EPERM; 1296 return -EPERM;
1087 1297
1088 /* Find the mm_struct */ 1298 /* Find the mm_struct */
1089 read_lock(&tasklist_lock); 1299 rcu_read_lock();
1090 task = pid ? find_task_by_vpid(pid) : current; 1300 task = pid ? find_task_by_vpid(pid) : current;
1091 if (!task) { 1301 if (!task) {
1092 read_unlock(&tasklist_lock); 1302 rcu_read_unlock();
1093 return -ESRCH; 1303 return -ESRCH;
1094 } 1304 }
1095 mm = get_task_mm(task); 1305 mm = get_task_mm(task);
1096 read_unlock(&tasklist_lock); 1306 rcu_read_unlock();
1097 1307
1098 if (!mm) 1308 if (!mm)
1099 return -EINVAL; 1309 return -EINVAL;
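
Taken together, the migrate.c hunks change the migrate_pages() contract: it gains explicit offlining and sync booleans and no longer calls putback_lru_pages() on the leftover list itself, so every caller that can see a non-zero return now does that, as the mempolicy.c and do_move_page_to_node_array() hunks above already show. A minimal caller sketch under those assumptions; new_page_on_node() is a placeholder modelled on the node-targeted allocation pattern, not a function from this diff.

#include <linux/migrate.h>
#include <linux/mm.h>

/* placeholder new_page_t callback: allocate on a caller-chosen node */
static struct page *new_page_on_node(struct page *page, unsigned long node,
				     int **result)
{
	return alloc_pages_exact_node((int)node,
				      GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, 0);
}

static int move_isolated_pages(struct list_head *pagelist, int node)
{
	int err;

	if (list_empty(pagelist))
		return 0;

	err = migrate_pages(pagelist, new_page_on_node, node,
			    false,	/* offlining: not a hot-remove caller */
			    true);	/* sync: may wait on writeback/locks */
	if (err)
		/* new rule: leftover pages are the caller's to put back */
		putback_lru_pages(pagelist);

	return err;
}
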
diff --git a/mm/mincore.c b/mm/mincore.c
index 9ac42dc6d7b6..a4e6b9d75c76 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -154,6 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
154 pmd = pmd_offset(pud, addr); 154 pmd = pmd_offset(pud, addr);
155 do { 155 do {
156 next = pmd_addr_end(addr, end); 156 next = pmd_addr_end(addr, end);
157 if (pmd_trans_huge(*pmd)) {
158 if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
159 vec += (next - addr) >> PAGE_SHIFT;
160 continue;
161 }
162 /* fall through */
163 }
157 if (pmd_none_or_clear_bad(pmd)) 164 if (pmd_none_or_clear_bad(pmd))
158 mincore_unmapped_range(vma, addr, next, vec); 165 mincore_unmapped_range(vma, addr, next, vec);
159 else 166 else
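
The mempolicy.c and mincore.c hunks above illustrate the two ways a pmd walker is taught about transparent huge pages: check_pmd_range() simply splits any huge pmd and keeps walking ptes as before, while mincore_pmd_range() handles the huge pmd in place and only falls through to the pte loop if it was split underneath. A schematic walker showing both options side by side; handle_huge_pmd() and the per-pte body are placeholders, not code from this patch.

#include <linux/mm.h>
#include <linux/huge_mm.h>

static void walk_pmd_range(struct vm_area_struct *vma, pud_t *pud,
			   unsigned long addr, unsigned long end)
{
	pmd_t *pmd = pmd_offset(pud, addr);
	unsigned long next;

	do {
		next = pmd_addr_end(addr, end);
		if (pmd_trans_huge(*pmd)) {
			/* Option A (mempolicy style): force a split, then
			 * walk the resulting ptes below. */
			split_huge_page_pmd(vma->vm_mm, pmd);
			/* Option B (mincore style) would instead be:
			 *   if (handle_huge_pmd(vma, pmd, addr, next))
			 *           continue;
			 * falling through only on a racing split. */
		}
		if (pmd_none_or_clear_bad(pmd))
			continue;
		/* ... per-pte work for [addr, next) goes here ... */
	} while (pmd++, addr = next, addr != end);
}
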
diff --git a/mm/mlock.c b/mm/mlock.c
index b70919ce4f72..048260c4e02e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -135,13 +135,6 @@ void munlock_vma_page(struct page *page)
135 } 135 }
136} 136}
137 137
138static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
139{
140 return (vma->vm_flags & VM_GROWSDOWN) &&
141 (vma->vm_start == addr) &&
142 !vma_stack_continue(vma->vm_prev, addr);
143}
144
145/** 138/**
146 * __mlock_vma_pages_range() - mlock a range of pages in the vma. 139 * __mlock_vma_pages_range() - mlock a range of pages in the vma.
147 * @vma: target vma 140 * @vma: target vma
@@ -155,13 +148,12 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add
155 * vma->vm_mm->mmap_sem must be held for at least read. 148 * vma->vm_mm->mmap_sem must be held for at least read.
156 */ 149 */
157static long __mlock_vma_pages_range(struct vm_area_struct *vma, 150static long __mlock_vma_pages_range(struct vm_area_struct *vma,
158 unsigned long start, unsigned long end) 151 unsigned long start, unsigned long end,
152 int *nonblocking)
159{ 153{
160 struct mm_struct *mm = vma->vm_mm; 154 struct mm_struct *mm = vma->vm_mm;
161 unsigned long addr = start; 155 unsigned long addr = start;
162 struct page *pages[16]; /* 16 gives a reasonable batch */
163 int nr_pages = (end - start) / PAGE_SIZE; 156 int nr_pages = (end - start) / PAGE_SIZE;
164 int ret = 0;
165 int gup_flags; 157 int gup_flags;
166 158
167 VM_BUG_ON(start & ~PAGE_MASK); 159 VM_BUG_ON(start & ~PAGE_MASK);
@@ -170,73 +162,24 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
170 VM_BUG_ON(end > vma->vm_end); 162 VM_BUG_ON(end > vma->vm_end);
171 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); 163 VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
172 164
173 gup_flags = FOLL_TOUCH | FOLL_GET; 165 gup_flags = FOLL_TOUCH | FOLL_MLOCK;
174 if (vma->vm_flags & VM_WRITE) 166 /*
167 * We want to touch writable mappings with a write fault in order
168 * to break COW, except for shared mappings because these don't COW
169 * and we would not want to dirty them for nothing.
170 */
171 if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
175 gup_flags |= FOLL_WRITE; 172 gup_flags |= FOLL_WRITE;
176 173
177 /* We don't try to access the guard page of a stack vma */ 174 /*
178 if (stack_guard_page(vma, start)) { 175 * We want mlock to succeed for regions that have any permissions
179 addr += PAGE_SIZE; 176 * other than PROT_NONE.
180 nr_pages--; 177 */
181 } 178 if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC))
182 179 gup_flags |= FOLL_FORCE;
183 while (nr_pages > 0) {
184 int i;
185
186 cond_resched();
187
188 /*
189 * get_user_pages makes pages present if we are
190 * setting mlock. and this extra reference count will
191 * disable migration of this page. However, page may
192 * still be truncated out from under us.
193 */
194 ret = __get_user_pages(current, mm, addr,
195 min_t(int, nr_pages, ARRAY_SIZE(pages)),
196 gup_flags, pages, NULL);
197 /*
198 * This can happen for, e.g., VM_NONLINEAR regions before
199 * a page has been allocated and mapped at a given offset,
200 * or for addresses that map beyond end of a file.
201 * We'll mlock the pages if/when they get faulted in.
202 */
203 if (ret < 0)
204 break;
205
206 lru_add_drain(); /* push cached pages to LRU */
207
208 for (i = 0; i < ret; i++) {
209 struct page *page = pages[i];
210
211 if (page->mapping) {
212 /*
213 * That preliminary check is mainly to avoid
214 * the pointless overhead of lock_page on the
215 * ZERO_PAGE: which might bounce very badly if
216 * there is contention. However, we're still
217 * dirtying its cacheline with get/put_page:
218 * we'll add another __get_user_pages flag to
219 * avoid it if that case turns out to matter.
220 */
221 lock_page(page);
222 /*
223 * Because we lock page here and migration is
224 * blocked by the elevated reference, we need
225 * only check for file-cache page truncation.
226 */
227 if (page->mapping)
228 mlock_vma_page(page);
229 unlock_page(page);
230 }
231 put_page(page); /* ref from get_user_pages() */
232 }
233
234 addr += ret * PAGE_SIZE;
235 nr_pages -= ret;
236 ret = 0;
237 }
238 180
239 return ret; /* 0 or negative error code */ 181 return __get_user_pages(current, mm, addr, nr_pages, gup_flags,
182 NULL, NULL, nonblocking);
240} 183}
241 184
242/* 185/*
@@ -278,9 +221,9 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
278 221
279 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || 222 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
280 is_vm_hugetlb_page(vma) || 223 is_vm_hugetlb_page(vma) ||
281 vma == get_gate_vma(current))) { 224 vma == get_gate_vma(current->mm))) {
282 225
283 __mlock_vma_pages_range(vma, start, end); 226 __mlock_vma_pages_range(vma, start, end, NULL);
284 227
285 /* Hide errors from mmap() and other callers */ 228 /* Hide errors from mmap() and other callers */
286 return 0; 229 return 0;
@@ -364,26 +307,18 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
364 * For vmas that pass the filters, merge/split as appropriate. 307 * For vmas that pass the filters, merge/split as appropriate.
365 */ 308 */
366static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, 309static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
367 unsigned long start, unsigned long end, unsigned int newflags) 310 unsigned long start, unsigned long end, vm_flags_t newflags)
368{ 311{
369 struct mm_struct *mm = vma->vm_mm; 312 struct mm_struct *mm = vma->vm_mm;
370 pgoff_t pgoff; 313 pgoff_t pgoff;
371 int nr_pages; 314 int nr_pages;
372 int ret = 0; 315 int ret = 0;
373 int lock = newflags & VM_LOCKED; 316 int lock = !!(newflags & VM_LOCKED);
374 317
375 if (newflags == vma->vm_flags || 318 if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
376 (vma->vm_flags & (VM_IO | VM_PFNMAP))) 319 is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
377 goto out; /* don't set VM_LOCKED, don't count */ 320 goto out; /* don't set VM_LOCKED, don't count */
378 321
379 if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
380 is_vm_hugetlb_page(vma) ||
381 vma == get_gate_vma(current)) {
382 if (lock)
383 make_pages_present(start, end);
384 goto out; /* don't set VM_LOCKED, don't count */
385 }
386
387 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 322 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
388 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, 323 *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
389 vma->vm_file, pgoff, vma_policy(vma)); 324 vma->vm_file, pgoff, vma_policy(vma));
@@ -419,14 +354,10 @@ success:
419 * set VM_LOCKED, __mlock_vma_pages_range will bring it back. 354 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
420 */ 355 */
421 356
422 if (lock) { 357 if (lock)
423 vma->vm_flags = newflags; 358 vma->vm_flags = newflags;
424 ret = __mlock_vma_pages_range(vma, start, end); 359 else
425 if (ret < 0)
426 ret = __mlock_posix_error_return(ret);
427 } else {
428 munlock_vma_pages_range(vma, start, end); 360 munlock_vma_pages_range(vma, start, end);
429 }
430 361
431out: 362out:
432 *prev = vma; 363 *prev = vma;
@@ -439,7 +370,8 @@ static int do_mlock(unsigned long start, size_t len, int on)
439 struct vm_area_struct * vma, * prev; 370 struct vm_area_struct * vma, * prev;
440 int error; 371 int error;
441 372
442 len = PAGE_ALIGN(len); 373 VM_BUG_ON(start & ~PAGE_MASK);
374 VM_BUG_ON(len != PAGE_ALIGN(len));
443 end = start + len; 375 end = start + len;
444 if (end < start) 376 if (end < start)
445 return -EINVAL; 377 return -EINVAL;
@@ -453,7 +385,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
453 prev = vma; 385 prev = vma;
454 386
455 for (nstart = start ; ; ) { 387 for (nstart = start ; ; ) {
456 unsigned int newflags; 388 vm_flags_t newflags;
457 389
458 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 390 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
459 391
@@ -482,6 +414,62 @@ static int do_mlock(unsigned long start, size_t len, int on)
482 return error; 414 return error;
483} 415}
484 416
417static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors)
418{
419 struct mm_struct *mm = current->mm;
420 unsigned long end, nstart, nend;
421 struct vm_area_struct *vma = NULL;
422 int locked = 0;
423 int ret = 0;
424
425 VM_BUG_ON(start & ~PAGE_MASK);
426 VM_BUG_ON(len != PAGE_ALIGN(len));
427 end = start + len;
428
429 for (nstart = start; nstart < end; nstart = nend) {
430 /*
431 * We want to fault in pages for [nstart; end) address range.
432 * Find first corresponding VMA.
433 */
434 if (!locked) {
435 locked = 1;
436 down_read(&mm->mmap_sem);
437 vma = find_vma(mm, nstart);
438 } else if (nstart >= vma->vm_end)
439 vma = vma->vm_next;
440 if (!vma || vma->vm_start >= end)
441 break;
442 /*
443 * Set [nstart; nend) to intersection of desired address
444 * range with the first VMA. Also, skip undesirable VMA types.
445 */
446 nend = min(end, vma->vm_end);
447 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
448 continue;
449 if (nstart < vma->vm_start)
450 nstart = vma->vm_start;
451 /*
452 * Now fault in a range of pages. __mlock_vma_pages_range()
453 * double checks the vma flags, so that it won't mlock pages
454 * if the vma was already munlocked.
455 */
456 ret = __mlock_vma_pages_range(vma, nstart, nend, &locked);
457 if (ret < 0) {
458 if (ignore_errors) {
459 ret = 0;
460 continue; /* continue at next VMA */
461 }
462 ret = __mlock_posix_error_return(ret);
463 break;
464 }
465 nend = nstart + ret * PAGE_SIZE;
466 ret = 0;
467 }
468 if (locked)
469 up_read(&mm->mmap_sem);
470 return ret; /* 0 or negative error code */
471}
472
485SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) 473SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
486{ 474{
487 unsigned long locked; 475 unsigned long locked;
@@ -507,6 +495,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
507 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) 495 if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
508 error = do_mlock(start, len, 1); 496 error = do_mlock(start, len, 1);
509 up_write(&current->mm->mmap_sem); 497 up_write(&current->mm->mmap_sem);
498 if (!error)
499 error = do_mlock_pages(start, len, 0);
510 return error; 500 return error;
511} 501}
512 502
@@ -534,7 +524,7 @@ static int do_mlockall(int flags)
534 goto out; 524 goto out;
535 525
536 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { 526 for (vma = current->mm->mmap; vma ; vma = prev->vm_next) {
537 unsigned int newflags; 527 vm_flags_t newflags;
538 528
539 newflags = vma->vm_flags | VM_LOCKED; 529 newflags = vma->vm_flags | VM_LOCKED;
540 if (!(flags & MCL_CURRENT)) 530 if (!(flags & MCL_CURRENT))
@@ -571,6 +561,10 @@ SYSCALL_DEFINE1(mlockall, int, flags)
571 capable(CAP_IPC_LOCK)) 561 capable(CAP_IPC_LOCK))
572 ret = do_mlockall(flags); 562 ret = do_mlockall(flags);
573 up_write(&current->mm->mmap_sem); 563 up_write(&current->mm->mmap_sem);
564 if (!ret && (flags & MCL_CURRENT)) {
565 /* Ignore errors */
566 do_mlock_pages(0, TASK_SIZE, 1);
567 }
574out: 568out:
575 return ret; 569 return ret;
576} 570}
diff --git a/mm/mmap.c b/mm/mmap.c
index 00161a48a451..d49736ff8a8d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,8 @@
28#include <linux/rmap.h> 28#include <linux/rmap.h>
29#include <linux/mmu_notifier.h> 29#include <linux/mmu_notifier.h>
30#include <linux/perf_event.h> 30#include <linux/perf_event.h>
31#include <linux/audit.h>
32#include <linux/khugepaged.h>
31 33
32#include <asm/uaccess.h> 34#include <asm/uaccess.h>
33#include <asm/cacheflush.h> 35#include <asm/cacheflush.h>
@@ -82,10 +84,14 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags)
82} 84}
83EXPORT_SYMBOL(vm_get_page_prot); 85EXPORT_SYMBOL(vm_get_page_prot);
84 86
85int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 87int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */
86int sysctl_overcommit_ratio = 50; /* default is 50% */ 88int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */
87int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; 89int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
88struct percpu_counter vm_committed_as; 90/*
91 * Make sure vm_committed_as in one cacheline and not cacheline shared with
92 * other variables. It can be updated by several CPUs frequently.
93 */
94struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
89 95
90/* 96/*
91 * Check that a process has enough memory to allocate a new virtual 97 * Check that a process has enough memory to allocate a new virtual
@@ -188,7 +194,7 @@ error:
188} 194}
189 195
190/* 196/*
191 * Requires inode->i_mapping->i_mmap_lock 197 * Requires inode->i_mapping->i_mmap_mutex
192 */ 198 */
193static void __remove_shared_vm_struct(struct vm_area_struct *vma, 199static void __remove_shared_vm_struct(struct vm_area_struct *vma,
194 struct file *file, struct address_space *mapping) 200 struct file *file, struct address_space *mapping)
@@ -216,9 +222,9 @@ void unlink_file_vma(struct vm_area_struct *vma)
216 222
217 if (file) { 223 if (file) {
218 struct address_space *mapping = file->f_mapping; 224 struct address_space *mapping = file->f_mapping;
219 spin_lock(&mapping->i_mmap_lock); 225 mutex_lock(&mapping->i_mmap_mutex);
220 __remove_shared_vm_struct(vma, file, mapping); 226 __remove_shared_vm_struct(vma, file, mapping);
221 spin_unlock(&mapping->i_mmap_lock); 227 mutex_unlock(&mapping->i_mmap_mutex);
222 } 228 }
223} 229}
224 230
@@ -252,7 +258,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
252 down_write(&mm->mmap_sem); 258 down_write(&mm->mmap_sem);
253 259
254#ifdef CONFIG_COMPAT_BRK 260#ifdef CONFIG_COMPAT_BRK
255 min_brk = mm->end_code; 261 /*
262 * CONFIG_COMPAT_BRK can still be overridden by setting
263 * randomize_va_space to 2, which will still cause mm->start_brk
264 * to be arbitrarily shifted
265 */
266 if (current->brk_randomized)
267 min_brk = mm->start_brk;
268 else
269 min_brk = mm->end_data;
256#else 270#else
257 min_brk = mm->start_brk; 271 min_brk = mm->start_brk;
258#endif 272#endif
@@ -384,29 +398,6 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
384 return vma; 398 return vma;
385} 399}
386 400
387static inline void
388__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
389 struct vm_area_struct *prev, struct rb_node *rb_parent)
390{
391 struct vm_area_struct *next;
392
393 vma->vm_prev = prev;
394 if (prev) {
395 next = prev->vm_next;
396 prev->vm_next = vma;
397 } else {
398 mm->mmap = vma;
399 if (rb_parent)
400 next = rb_entry(rb_parent,
401 struct vm_area_struct, vm_rb);
402 else
403 next = NULL;
404 }
405 vma->vm_next = next;
406 if (next)
407 next->vm_prev = vma;
408}
409
410void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 401void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
411 struct rb_node **rb_link, struct rb_node *rb_parent) 402 struct rb_node **rb_link, struct rb_node *rb_parent)
412{ 403{
@@ -454,16 +445,14 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
454 if (vma->vm_file) 445 if (vma->vm_file)
455 mapping = vma->vm_file->f_mapping; 446 mapping = vma->vm_file->f_mapping;
456 447
457 if (mapping) { 448 if (mapping)
458 spin_lock(&mapping->i_mmap_lock); 449 mutex_lock(&mapping->i_mmap_mutex);
459 vma->vm_truncate_count = mapping->truncate_count;
460 }
461 450
462 __vma_link(mm, vma, prev, rb_link, rb_parent); 451 __vma_link(mm, vma, prev, rb_link, rb_parent);
463 __vma_link_file(vma); 452 __vma_link_file(vma);
464 453
465 if (mapping) 454 if (mapping)
466 spin_unlock(&mapping->i_mmap_lock); 455 mutex_unlock(&mapping->i_mmap_mutex);
467 456
468 mm->map_count++; 457 mm->map_count++;
469 validate_mm(mm); 458 validate_mm(mm);
@@ -566,17 +555,8 @@ again: remove_next = 1 + (end > next->vm_end);
566 mapping = file->f_mapping; 555 mapping = file->f_mapping;
567 if (!(vma->vm_flags & VM_NONLINEAR)) 556 if (!(vma->vm_flags & VM_NONLINEAR))
568 root = &mapping->i_mmap; 557 root = &mapping->i_mmap;
569 spin_lock(&mapping->i_mmap_lock); 558 mutex_lock(&mapping->i_mmap_mutex);
570 if (importer &&
571 vma->vm_truncate_count != next->vm_truncate_count) {
572 /*
573 * unmap_mapping_range might be in progress:
574 * ensure that the expanding vma is rescanned.
575 */
576 importer->vm_truncate_count = 0;
577 }
578 if (insert) { 559 if (insert) {
579 insert->vm_truncate_count = vma->vm_truncate_count;
580 /* 560 /*
581 * Put into prio_tree now, so instantiated pages 561 * Put into prio_tree now, so instantiated pages
582 * are visible to arm/parisc __flush_dcache_page 562 * are visible to arm/parisc __flush_dcache_page
@@ -587,13 +567,15 @@ again: remove_next = 1 + (end > next->vm_end);
587 } 567 }
588 } 568 }
589 569
570 vma_adjust_trans_huge(vma, start, end, adjust_next);
571
590 /* 572 /*
591 * When changing only vma->vm_end, we don't really need anon_vma 573 * When changing only vma->vm_end, we don't really need anon_vma
592 * lock. This is a fairly rare case by itself, but the anon_vma 574 * lock. This is a fairly rare case by itself, but the anon_vma
593 * lock may be shared between many sibling processes. Skipping 575 * lock may be shared between many sibling processes. Skipping
594 * the lock for brk adjustments makes a difference sometimes. 576 * the lock for brk adjustments makes a difference sometimes.
595 */ 577 */
596 if (vma->anon_vma && (insert || importer || start != vma->vm_start)) { 578 if (vma->anon_vma && (importer || start != vma->vm_start)) {
597 anon_vma = vma->anon_vma; 579 anon_vma = vma->anon_vma;
598 anon_vma_lock(anon_vma); 580 anon_vma_lock(anon_vma);
599 } 581 }
@@ -640,7 +622,7 @@ again: remove_next = 1 + (end > next->vm_end);
640 if (anon_vma) 622 if (anon_vma)
641 anon_vma_unlock(anon_vma); 623 anon_vma_unlock(anon_vma);
642 if (mapping) 624 if (mapping)
643 spin_unlock(&mapping->i_mmap_lock); 625 mutex_unlock(&mapping->i_mmap_mutex);
644 626
645 if (remove_next) { 627 if (remove_next) {
646 if (file) { 628 if (file) {
@@ -687,9 +669,17 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
687} 669}
688 670
689static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1, 671static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
690 struct anon_vma *anon_vma2) 672 struct anon_vma *anon_vma2,
673 struct vm_area_struct *vma)
691{ 674{
692 return !anon_vma1 || !anon_vma2 || (anon_vma1 == anon_vma2); 675 /*
676 * The list_is_singular() test is to avoid merging VMA cloned from
677 * parents. This can improve scalability caused by anon_vma lock.
678 */
679 if ((!anon_vma1 || !anon_vma2) && (!vma ||
680 list_is_singular(&vma->anon_vma_chain)))
681 return 1;
682 return anon_vma1 == anon_vma2;
693} 683}
694 684
695/* 685/*
@@ -708,7 +698,7 @@ can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
708 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 698 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
709{ 699{
710 if (is_mergeable_vma(vma, file, vm_flags) && 700 if (is_mergeable_vma(vma, file, vm_flags) &&
711 is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { 701 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
712 if (vma->vm_pgoff == vm_pgoff) 702 if (vma->vm_pgoff == vm_pgoff)
713 return 1; 703 return 1;
714 } 704 }
@@ -727,7 +717,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
727 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff) 717 struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff)
728{ 718{
729 if (is_mergeable_vma(vma, file, vm_flags) && 719 if (is_mergeable_vma(vma, file, vm_flags) &&
730 is_mergeable_anon_vma(anon_vma, vma->anon_vma)) { 720 is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
731 pgoff_t vm_pglen; 721 pgoff_t vm_pglen;
732 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; 722 vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
733 if (vma->vm_pgoff + vm_pglen == vm_pgoff) 723 if (vma->vm_pgoff + vm_pglen == vm_pgoff)
@@ -805,7 +795,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
805 can_vma_merge_before(next, vm_flags, 795 can_vma_merge_before(next, vm_flags,
806 anon_vma, file, pgoff+pglen) && 796 anon_vma, file, pgoff+pglen) &&
807 is_mergeable_anon_vma(prev->anon_vma, 797 is_mergeable_anon_vma(prev->anon_vma,
808 next->anon_vma)) { 798 next->anon_vma, NULL)) {
809 /* cases 1, 6 */ 799 /* cases 1, 6 */
810 err = vma_adjust(prev, prev->vm_start, 800 err = vma_adjust(prev, prev->vm_start,
811 next->vm_end, prev->vm_pgoff, NULL); 801 next->vm_end, prev->vm_pgoff, NULL);
@@ -814,6 +804,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
814 end, prev->vm_pgoff, NULL); 804 end, prev->vm_pgoff, NULL);
815 if (err) 805 if (err)
816 return NULL; 806 return NULL;
807 khugepaged_enter_vma_merge(prev);
817 return prev; 808 return prev;
818 } 809 }
819 810
@@ -832,6 +823,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
832 next->vm_pgoff - pglen, NULL); 823 next->vm_pgoff - pglen, NULL);
833 if (err) 824 if (err)
834 return NULL; 825 return NULL;
826 khugepaged_enter_vma_merge(area);
835 return area; 827 return area;
836 } 828 }
837 829
@@ -914,14 +906,7 @@ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
914 if (anon_vma) 906 if (anon_vma)
915 return anon_vma; 907 return anon_vma;
916try_prev: 908try_prev:
917 /* 909 near = vma->vm_prev;
918 * It is potentially slow to have to call find_vma_prev here.
919 * But it's only on the first write fault on the vma, not
920 * every time, and we could devise a way to avoid it later
921 * (e.g. stash info in next's anon_vma_node when assigning
922 * an anon_vma, or when trying vma_merge). Another time.
923 */
924 BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma);
925 if (!near) 910 if (!near)
926 goto none; 911 goto none;
927 912
@@ -968,7 +953,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
968{ 953{
969 struct mm_struct * mm = current->mm; 954 struct mm_struct * mm = current->mm;
970 struct inode *inode; 955 struct inode *inode;
971 unsigned int vm_flags; 956 vm_flags_t vm_flags;
972 int error; 957 int error;
973 unsigned long reqprot = prot; 958 unsigned long reqprot = prot;
974 959
@@ -1108,6 +1093,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1108 unsigned long retval = -EBADF; 1093 unsigned long retval = -EBADF;
1109 1094
1110 if (!(flags & MAP_ANONYMOUS)) { 1095 if (!(flags & MAP_ANONYMOUS)) {
1096 audit_mmap_fd(fd, flags);
1111 if (unlikely(flags & MAP_HUGETLB)) 1097 if (unlikely(flags & MAP_HUGETLB))
1112 return -EINVAL; 1098 return -EINVAL;
1113 file = fget(fd); 1099 file = fget(fd);
@@ -1172,7 +1158,7 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
1172 */ 1158 */
1173int vma_wants_writenotify(struct vm_area_struct *vma) 1159int vma_wants_writenotify(struct vm_area_struct *vma)
1174{ 1160{
1175 unsigned int vm_flags = vma->vm_flags; 1161 vm_flags_t vm_flags = vma->vm_flags;
1176 1162
1177 /* If it was private or non-writable, the write bit is already clear */ 1163 /* If it was private or non-writable, the write bit is already clear */
1178 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED))) 1164 if ((vm_flags & (VM_WRITE|VM_SHARED)) != ((VM_WRITE|VM_SHARED)))
@@ -1200,7 +1186,7 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
1200 * We account for memory if it's a private writeable mapping, 1186 * We account for memory if it's a private writeable mapping,
1201 * not hugepages and VM_NORESERVE wasn't set. 1187 * not hugepages and VM_NORESERVE wasn't set.
1202 */ 1188 */
1203static inline int accountable_mapping(struct file *file, unsigned int vm_flags) 1189static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
1204{ 1190{
1205 /* 1191 /*
1206 * hugetlb has its own accounting separate from the core VM 1192 * hugetlb has its own accounting separate from the core VM
@@ -1214,7 +1200,7 @@ static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
1214 1200
1215unsigned long mmap_region(struct file *file, unsigned long addr, 1201unsigned long mmap_region(struct file *file, unsigned long addr,
1216 unsigned long len, unsigned long flags, 1202 unsigned long len, unsigned long flags,
1217 unsigned int vm_flags, unsigned long pgoff) 1203 vm_flags_t vm_flags, unsigned long pgoff)
1218{ 1204{
1219 struct mm_struct *mm = current->mm; 1205 struct mm_struct *mm = current->mm;
1220 struct vm_area_struct *vma, *prev; 1206 struct vm_area_struct *vma, *prev;
@@ -1752,13 +1738,17 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1752 size = address - vma->vm_start; 1738 size = address - vma->vm_start;
1753 grow = (address - vma->vm_end) >> PAGE_SHIFT; 1739 grow = (address - vma->vm_end) >> PAGE_SHIFT;
1754 1740
1755 error = acct_stack_growth(vma, size, grow); 1741 error = -ENOMEM;
1756 if (!error) { 1742 if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) {
1757 vma->vm_end = address; 1743 error = acct_stack_growth(vma, size, grow);
1758 perf_event_mmap(vma); 1744 if (!error) {
1745 vma->vm_end = address;
1746 perf_event_mmap(vma);
1747 }
1759 } 1748 }
1760 } 1749 }
1761 vma_unlock_anon_vma(vma); 1750 vma_unlock_anon_vma(vma);
1751 khugepaged_enter_vma_merge(vma);
1762 return error; 1752 return error;
1763} 1753}
1764#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 1754#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1766,7 +1756,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1766/* 1756/*
1767 * vma is the first one with address < vma->vm_start. Have to extend vma. 1757 * vma is the first one with address < vma->vm_start. Have to extend vma.
1768 */ 1758 */
1769static int expand_downwards(struct vm_area_struct *vma, 1759int expand_downwards(struct vm_area_struct *vma,
1770 unsigned long address) 1760 unsigned long address)
1771{ 1761{
1772 int error; 1762 int error;
@@ -1798,22 +1788,21 @@ static int expand_downwards(struct vm_area_struct *vma,
1798 size = vma->vm_end - address; 1788 size = vma->vm_end - address;
1799 grow = (vma->vm_start - address) >> PAGE_SHIFT; 1789 grow = (vma->vm_start - address) >> PAGE_SHIFT;
1800 1790
1801 error = acct_stack_growth(vma, size, grow); 1791 error = -ENOMEM;
1802 if (!error) { 1792 if (grow <= vma->vm_pgoff) {
1803 vma->vm_start = address; 1793 error = acct_stack_growth(vma, size, grow);
1804 vma->vm_pgoff -= grow; 1794 if (!error) {
1805 perf_event_mmap(vma); 1795 vma->vm_start = address;
1796 vma->vm_pgoff -= grow;
1797 perf_event_mmap(vma);
1798 }
1806 } 1799 }
1807 } 1800 }
1808 vma_unlock_anon_vma(vma); 1801 vma_unlock_anon_vma(vma);
1802 khugepaged_enter_vma_merge(vma);
1809 return error; 1803 return error;
1810} 1804}
1811 1805
1812int expand_stack_downwards(struct vm_area_struct *vma, unsigned long address)
1813{
1814 return expand_downwards(vma, address);
1815}
1816
1817#ifdef CONFIG_STACK_GROWSUP 1806#ifdef CONFIG_STACK_GROWSUP
1818int expand_stack(struct vm_area_struct *vma, unsigned long address) 1807int expand_stack(struct vm_area_struct *vma, unsigned long address)
1819{ 1808{
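Both growth paths now refuse adjustments whose page count would wrap the VMA's page offset. A standalone sketch of the two wrap checks (the function names here are illustrative, only the arithmetic mirrors the hunks above):

    #include <stdio.h>

    #define PAGE_SHIFT 12

    /* expand_upwards-style check: pgoff plus the size in pages must not wrap */
    static int grow_up_ok(unsigned long pgoff, unsigned long size)
    {
        return pgoff + (size >> PAGE_SHIFT) >= pgoff;
    }

    /* expand_downwards-style check: cannot grow down by more pages than pgoff holds */
    static int grow_down_ok(unsigned long pgoff, unsigned long grow)
    {
        return grow <= pgoff;
    }

    int main(void)
    {
        printf("%d %d\n", grow_up_ok(~0UL, 1UL << PAGE_SHIFT),  /* 0: would wrap   */
                          grow_down_ok(4, 8));                  /* 0: grows too far */
        return 0;
    }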
@@ -1896,17 +1885,17 @@ static void unmap_region(struct mm_struct *mm,
1896 unsigned long start, unsigned long end) 1885 unsigned long start, unsigned long end)
1897{ 1886{
1898 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; 1887 struct vm_area_struct *next = prev? prev->vm_next: mm->mmap;
1899 struct mmu_gather *tlb; 1888 struct mmu_gather tlb;
1900 unsigned long nr_accounted = 0; 1889 unsigned long nr_accounted = 0;
1901 1890
1902 lru_add_drain(); 1891 lru_add_drain();
1903 tlb = tlb_gather_mmu(mm, 0); 1892 tlb_gather_mmu(&tlb, mm, 0);
1904 update_hiwater_rss(mm); 1893 update_hiwater_rss(mm);
1905 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); 1894 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1906 vm_unacct_memory(nr_accounted); 1895 vm_unacct_memory(nr_accounted);
1907 free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, 1896 free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
1908 next? next->vm_start: 0); 1897 next ? next->vm_start : 0);
1909 tlb_finish_mmu(tlb, start, end); 1898 tlb_finish_mmu(&tlb, start, end);
1910} 1899}
1911 1900
1912/* 1901/*
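tlb_gather_mmu() now fills in a caller-provided struct mmu_gather instead of returning a per-CPU pointer. A hedged sketch of the resulting call shape, following the calls visible in unmap_region() above (kernel context only, not a standalone program):

    struct mmu_gather tlb;

    lru_add_drain();
    tlb_gather_mmu(&tlb, mm, 0);          /* initialise caller-owned gather state */
    unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
    free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
                  next ? next->vm_start : 0);
    tlb_finish_mmu(&tlb, start, end);     /* flush TLBs and release gathered pages */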
@@ -2048,9 +2037,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
2048 return -EINVAL; 2037 return -EINVAL;
2049 2038
2050 /* Find the first overlapping VMA */ 2039 /* Find the first overlapping VMA */
2051 vma = find_vma_prev(mm, start, &prev); 2040 vma = find_vma(mm, start);
2052 if (!vma) 2041 if (!vma)
2053 return 0; 2042 return 0;
2043 prev = vma->vm_prev;
2054 /* we have start < vma->vm_end */ 2044 /* we have start < vma->vm_end */
2055 2045
2056 /* if it doesn't overlap, we have nothing.. */ 2046 /* if it doesn't overlap, we have nothing.. */
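Because every VMA now carries ->vm_prev, callers no longer need find_vma_prev(); the substitution used in do_munmap() above is simply the following (kernel context, shown only to make the pattern explicit):

    vma = find_vma(mm, start);
    if (!vma)
        return 0;
    prev = vma->vm_prev;      /* may be NULL when vma is the first mapping */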
@@ -2248,7 +2238,7 @@ EXPORT_SYMBOL(do_brk);
2248/* Release all mmaps. */ 2238/* Release all mmaps. */
2249void exit_mmap(struct mm_struct *mm) 2239void exit_mmap(struct mm_struct *mm)
2250{ 2240{
2251 struct mmu_gather *tlb; 2241 struct mmu_gather tlb;
2252 struct vm_area_struct *vma; 2242 struct vm_area_struct *vma;
2253 unsigned long nr_accounted = 0; 2243 unsigned long nr_accounted = 0;
2254 unsigned long end; 2244 unsigned long end;
@@ -2273,14 +2263,14 @@ void exit_mmap(struct mm_struct *mm)
2273 2263
2274 lru_add_drain(); 2264 lru_add_drain();
2275 flush_cache_mm(mm); 2265 flush_cache_mm(mm);
2276 tlb = tlb_gather_mmu(mm, 1); 2266 tlb_gather_mmu(&tlb, mm, 1);
2277 /* update_hiwater_rss(mm) here? but nobody should be looking */ 2267 /* update_hiwater_rss(mm) here? but nobody should be looking */
2278 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2268 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2279 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2269 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2280 vm_unacct_memory(nr_accounted); 2270 vm_unacct_memory(nr_accounted);
2281 2271
2282 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); 2272 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
2283 tlb_finish_mmu(tlb, 0, end); 2273 tlb_finish_mmu(&tlb, 0, end);
2284 2274
2285 /* 2275 /*
2286 * Walk the list again, actually closing and freeing it, 2276 * Walk the list again, actually closing and freeing it,
@@ -2294,7 +2284,7 @@ void exit_mmap(struct mm_struct *mm)
2294 2284
2295/* Insert vm structure into process list sorted by address 2285/* Insert vm structure into process list sorted by address
2296 * and into the inode's i_mmap tree. If vm_file is non-NULL 2286 * and into the inode's i_mmap tree. If vm_file is non-NULL
2297 * then i_mmap_lock is taken here. 2287 * then i_mmap_mutex is taken here.
2298 */ 2288 */
2299int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) 2289int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
2300{ 2290{
@@ -2460,6 +2450,7 @@ int install_special_mapping(struct mm_struct *mm,
2460 unsigned long addr, unsigned long len, 2450 unsigned long addr, unsigned long len,
2461 unsigned long vm_flags, struct page **pages) 2451 unsigned long vm_flags, struct page **pages)
2462{ 2452{
2453 int ret;
2463 struct vm_area_struct *vma; 2454 struct vm_area_struct *vma;
2464 2455
2465 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); 2456 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
@@ -2477,16 +2468,23 @@ int install_special_mapping(struct mm_struct *mm,
2477 vma->vm_ops = &special_mapping_vmops; 2468 vma->vm_ops = &special_mapping_vmops;
2478 vma->vm_private_data = pages; 2469 vma->vm_private_data = pages;
2479 2470
2480 if (unlikely(insert_vm_struct(mm, vma))) { 2471 ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
2481 kmem_cache_free(vm_area_cachep, vma); 2472 if (ret)
2482 return -ENOMEM; 2473 goto out;
2483 } 2474
2475 ret = insert_vm_struct(mm, vma);
2476 if (ret)
2477 goto out;
2484 2478
2485 mm->total_vm += len >> PAGE_SHIFT; 2479 mm->total_vm += len >> PAGE_SHIFT;
2486 2480
2487 perf_event_mmap(vma); 2481 perf_event_mmap(vma);
2488 2482
2489 return 0; 2483 return 0;
2484
2485out:
2486 kmem_cache_free(vm_area_cachep, vma);
2487 return ret;
2490} 2488}
2491 2489
2492static DEFINE_MUTEX(mm_all_locks_mutex); 2490static DEFINE_MUTEX(mm_all_locks_mutex);
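install_special_mapping() now funnels both failure cases through a single cleanup label. A small self-contained sketch of that goto-unwind idiom, with malloc standing in for the slab cache and the failure checks as placeholders (none of these names are the kernel's):

    #include <stdlib.h>

    static int install_thing(int fail_security, int fail_insert)
    {
        int ret;
        void *obj = calloc(1, 64);        /* stands in for kmem_cache_zalloc() */
        if (!obj)
            return -12;                   /* -ENOMEM */

        ret = fail_security ? -13 : 0;    /* stands in for security_file_mmap() */
        if (ret)
            goto out;

        ret = fail_insert ? -12 : 0;      /* stands in for insert_vm_struct() */
        if (ret)
            goto out;

        return 0;                         /* obj stays installed on success */
    out:
        free(obj);                        /* single unwind path for all failures */
        return ret;
    }

    int main(void) { return install_thing(0, 0); }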
@@ -2498,15 +2496,15 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2498 * The LSB of head.next can't change from under us 2496 * The LSB of head.next can't change from under us
2499 * because we hold the mm_all_locks_mutex. 2497 * because we hold the mm_all_locks_mutex.
2500 */ 2498 */
2501 spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem); 2499 mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
2502 /* 2500 /*
2503 * We can safely modify head.next after taking the 2501 * We can safely modify head.next after taking the
2504 * anon_vma->root->lock. If some other vma in this mm shares 2502 * anon_vma->root->mutex. If some other vma in this mm shares
2505 * the same anon_vma we won't take it again. 2503 * the same anon_vma we won't take it again.
2506 * 2504 *
2507 * No need of atomic instructions here, head.next 2505 * No need of atomic instructions here, head.next
2508 * can't change from under us thanks to the 2506 * can't change from under us thanks to the
2509 * anon_vma->root->lock. 2507 * anon_vma->root->mutex.
2510 */ 2508 */
2511 if (__test_and_set_bit(0, (unsigned long *) 2509 if (__test_and_set_bit(0, (unsigned long *)
2512 &anon_vma->root->head.next)) 2510 &anon_vma->root->head.next))
@@ -2528,7 +2526,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2528 */ 2526 */
2529 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags)) 2527 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
2530 BUG(); 2528 BUG();
2531 spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem); 2529 mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
2532 } 2530 }
2533} 2531}
2534 2532
@@ -2555,7 +2553,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2555 * vma in this mm is backed by the same anon_vma or address_space. 2553 * vma in this mm is backed by the same anon_vma or address_space.
2556 * 2554 *
2557 * We can take all the locks in random order because the VM code 2555 * We can take all the locks in random order because the VM code
2558 * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never 2556 * taking i_mmap_mutex or anon_vma->mutex outside the mmap_sem never
2559 * takes more than one of them in a row. Secondly we're protected 2557 * takes more than one of them in a row. Secondly we're protected
2560 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex. 2558 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
2561 * 2559 *
@@ -2611,7 +2609,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2611 * 2609 *
2612 * No need of atomic instructions here, head.next 2610 * No need of atomic instructions here, head.next
2613 * can't change from under us until we release the 2611 * can't change from under us until we release the
2614 * anon_vma->root->lock. 2612 * anon_vma->root->mutex.
2615 */ 2613 */
2616 if (!__test_and_clear_bit(0, (unsigned long *) 2614 if (!__test_and_clear_bit(0, (unsigned long *)
2617 &anon_vma->root->head.next)) 2615 &anon_vma->root->head.next))
@@ -2627,7 +2625,7 @@ static void vm_unlock_mapping(struct address_space *mapping)
2627 * AS_MM_ALL_LOCKS can't change to 0 from under us 2625 * AS_MM_ALL_LOCKS can't change to 0 from under us
2628 * because we hold the mm_all_locks_mutex. 2626 * because we hold the mm_all_locks_mutex.
2629 */ 2627 */
2630 spin_unlock(&mapping->i_mmap_lock); 2628 mutex_unlock(&mapping->i_mmap_mutex);
2631 if (!test_and_clear_bit(AS_MM_ALL_LOCKS, 2629 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
2632 &mapping->flags)) 2630 &mapping->flags))
2633 BUG(); 2631 BUG();
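With i_mmap_lock and the anon_vma root lock converted to sleeping mutexes, the mm_take_all_locks() paths switch to mutex_lock_nest_lock(), keeping mmap_sem as the lockdep "nest" lock. The call shape, copied from the hunks above as a hedged kernel-context sketch:

    mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);
    mutex_lock_nest_lock(&mapping->i_mmap_mutex, &mm->mmap_sem);
    /* ... walk or modify the rmap structures, possibly sleeping ... */
    mutex_unlock(&mapping->i_mmap_mutex);
    mutex_unlock(&anon_vma->root->mutex);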
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 438951d366f2..8d032de4088e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
100 return young; 100 return young;
101} 101}
102 102
103int __mmu_notifier_test_young(struct mm_struct *mm,
104 unsigned long address)
105{
106 struct mmu_notifier *mn;
107 struct hlist_node *n;
108 int young = 0;
109
110 rcu_read_lock();
111 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
112 if (mn->ops->test_young) {
113 young = mn->ops->test_young(mn, mm, address);
114 if (young)
115 break;
116 }
117 }
118 rcu_read_unlock();
119
120 return young;
121}
122
103void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, 123void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
104 pte_t pte) 124 pte_t pte)
105{ 125{
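The new hook lets a secondary MMU report whether a page was referenced without clearing the access state. A hedged sketch of wiring up the callback: the ops layout matches the loop above, while my_test_young and my_notifier are illustrative names and the handler body is a placeholder.

    #include <linux/mmu_notifier.h>

    static int my_test_young(struct mmu_notifier *mn, struct mm_struct *mm,
                             unsigned long address)
    {
        /* Return non-zero if the secondary MMU observed an access at address. */
        return 0;
    }

    static const struct mmu_notifier_ops my_ops = {
        .test_young = my_test_young,
    };

    static struct mmu_notifier my_notifier = { .ops = &my_ops };

    /* mmu_notifier_register(&my_notifier, mm) would add this notifier to the
     * hlist walked by __mmu_notifier_test_young() above. */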
diff --git a/mm/mmzone.c b/mm/mmzone.c
index e35bfb82c855..f5b7d1760213 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn,
87 return 1; 87 return 1;
88} 88}
89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ 89#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
90
91#ifdef CONFIG_SMP
92/* Called when a more accurate view of NR_FREE_PAGES is needed */
93unsigned long zone_nr_free_pages(struct zone *zone)
94{
95 unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
96
97 /*
98 * While kswapd is awake, it is considered the zone is under some
99 * memory pressure. Under pressure, there is a risk that
100 * per-cpu-counter-drift will allow the min watermark to be breached
101 * potentially causing a live-lock. While kswapd is awake and
102 * free pages are low, get a better estimate for free pages
103 */
104 if (nr_free_pages < zone->percpu_drift_mark &&
105 !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
106 return zone_page_state_snapshot(zone, NR_FREE_PAGES);
107
108 return nr_free_pages;
109}
110#endif /* CONFIG_SMP */
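zone_nr_free_pages() disappears because the drift handling moved into the generic zone_page_state helpers; the underlying idea is "global counter plus unsynchronised per-CPU deltas". A standalone toy model of that arithmetic (names are illustrative, cf. zone_page_state_snapshot()):

    #include <stdio.h>

    #define NR_CPUS 4

    struct toy_zone {
        long global_free;
        signed char cpu_delta[NR_CPUS];   /* batched, not yet folded in */
    };

    static long snapshot_free(const struct toy_zone *z)
    {
        long x = z->global_free;
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            x += z->cpu_delta[cpu];       /* fold in the per-CPU drift */
        return x < 0 ? 0 : x;
    }

    int main(void)
    {
        struct toy_zone z = { .global_free = 100,
                              .cpu_delta = { -3, 5, 0, -1 } };
        printf("cheap=%ld exact=%ld\n", z.global_free, snapshot_free(&z));
        return 0;
    }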
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 2d1bf7cf8851..5a688a2756be 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
78 pte_unmap_unlock(pte - 1, ptl); 78 pte_unmap_unlock(pte - 1, ptl);
79} 79}
80 80
81static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, 81static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud,
82 unsigned long addr, unsigned long end, pgprot_t newprot, 82 unsigned long addr, unsigned long end, pgprot_t newprot,
83 int dirty_accountable) 83 int dirty_accountable)
84{ 84{
@@ -88,13 +88,21 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud,
88 pmd = pmd_offset(pud, addr); 88 pmd = pmd_offset(pud, addr);
89 do { 89 do {
90 next = pmd_addr_end(addr, end); 90 next = pmd_addr_end(addr, end);
91 if (pmd_trans_huge(*pmd)) {
92 if (next - addr != HPAGE_PMD_SIZE)
93 split_huge_page_pmd(vma->vm_mm, pmd);
94 else if (change_huge_pmd(vma, pmd, addr, newprot))
95 continue;
96 /* fall through */
97 }
91 if (pmd_none_or_clear_bad(pmd)) 98 if (pmd_none_or_clear_bad(pmd))
92 continue; 99 continue;
93 change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); 100 change_pte_range(vma->vm_mm, pmd, addr, next, newprot,
101 dirty_accountable);
94 } while (pmd++, addr = next, addr != end); 102 } while (pmd++, addr = next, addr != end);
95} 103}
96 104
97static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, 105static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
98 unsigned long addr, unsigned long end, pgprot_t newprot, 106 unsigned long addr, unsigned long end, pgprot_t newprot,
99 int dirty_accountable) 107 int dirty_accountable)
100{ 108{
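Every pagetable walker that can encounter a transparent huge pmd must now either handle it as a unit or split it and fall through. The recurring shape of that check, copied from the hunk above as a kernel-context fragment (it lives inside the pmd walk loop and is not standalone):

    if (pmd_trans_huge(*pmd)) {
        if (next - addr != HPAGE_PMD_SIZE)
            split_huge_page_pmd(vma->vm_mm, pmd);     /* partial range: split first */
        else if (change_huge_pmd(vma, pmd, addr, newprot))
            continue;                                 /* handled the whole huge pmd */
        /* fall through to the pte-level loop */
    }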
@@ -106,7 +114,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd,
106 next = pud_addr_end(addr, end); 114 next = pud_addr_end(addr, end);
107 if (pud_none_or_clear_bad(pud)) 115 if (pud_none_or_clear_bad(pud))
108 continue; 116 continue;
109 change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); 117 change_pmd_range(vma, pud, addr, next, newprot,
118 dirty_accountable);
110 } while (pud++, addr = next, addr != end); 119 } while (pud++, addr = next, addr != end);
111} 120}
112 121
@@ -126,7 +135,8 @@ static void change_protection(struct vm_area_struct *vma,
126 next = pgd_addr_end(addr, end); 135 next = pgd_addr_end(addr, end);
127 if (pgd_none_or_clear_bad(pgd)) 136 if (pgd_none_or_clear_bad(pgd))
128 continue; 137 continue;
129 change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); 138 change_pud_range(vma, pgd, addr, next, newprot,
139 dirty_accountable);
130 } while (pgd++, addr = next, addr != end); 140 } while (pgd++, addr = next, addr != end);
131 flush_tlb_range(vma, start, end); 141 flush_tlb_range(vma, start, end);
132} 142}
@@ -211,6 +221,7 @@ success:
211 mmu_notifier_invalidate_range_end(mm, start, end); 221 mmu_notifier_invalidate_range_end(mm, start, end);
212 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 222 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
213 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 223 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
224 perf_event_mmap(vma);
214 return 0; 225 return 0;
215 226
216fail: 227fail:
@@ -299,7 +310,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
299 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); 310 error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
300 if (error) 311 if (error)
301 goto out; 312 goto out;
302 perf_event_mmap(vma);
303 nstart = tmp; 313 nstart = tmp;
304 314
305 if (nstart < prev->vm_end) 315 if (nstart < prev->vm_end)
diff --git a/mm/mremap.c b/mm/mremap.c
index cde56ee51ef7..506fa44403df 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -41,13 +41,15 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
41 return NULL; 41 return NULL;
42 42
43 pmd = pmd_offset(pud, addr); 43 pmd = pmd_offset(pud, addr);
44 split_huge_page_pmd(mm, pmd);
44 if (pmd_none_or_clear_bad(pmd)) 45 if (pmd_none_or_clear_bad(pmd))
45 return NULL; 46 return NULL;
46 47
47 return pmd; 48 return pmd;
48} 49}
49 50
50static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) 51static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
52 unsigned long addr)
51{ 53{
52 pgd_t *pgd; 54 pgd_t *pgd;
53 pud_t *pud; 55 pud_t *pud;
@@ -62,7 +64,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
62 if (!pmd) 64 if (!pmd)
63 return NULL; 65 return NULL;
64 66
65 if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) 67 VM_BUG_ON(pmd_trans_huge(*pmd));
68 if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
66 return NULL; 69 return NULL;
67 70
68 return pmd; 71 return pmd;
@@ -90,10 +93,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
90 * and we propagate stale pages into the dst afterward. 93 * and we propagate stale pages into the dst afterward.
91 */ 94 */
92 mapping = vma->vm_file->f_mapping; 95 mapping = vma->vm_file->f_mapping;
93 spin_lock(&mapping->i_mmap_lock); 96 mutex_lock(&mapping->i_mmap_mutex);
94 if (new_vma->vm_truncate_count &&
95 new_vma->vm_truncate_count != vma->vm_truncate_count)
96 new_vma->vm_truncate_count = 0;
97 } 97 }
98 98
99 /* 99 /*
@@ -101,7 +101,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
101 * pte locks because exclusive mmap_sem prevents deadlock. 101 * pte locks because exclusive mmap_sem prevents deadlock.
102 */ 102 */
103 old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); 103 old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
104 new_pte = pte_offset_map_nested(new_pmd, new_addr); 104 new_pte = pte_offset_map(new_pmd, new_addr);
105 new_ptl = pte_lockptr(mm, new_pmd); 105 new_ptl = pte_lockptr(mm, new_pmd);
106 if (new_ptl != old_ptl) 106 if (new_ptl != old_ptl)
107 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); 107 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
@@ -119,10 +119,10 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
119 arch_leave_lazy_mmu_mode(); 119 arch_leave_lazy_mmu_mode();
120 if (new_ptl != old_ptl) 120 if (new_ptl != old_ptl)
121 spin_unlock(new_ptl); 121 spin_unlock(new_ptl);
122 pte_unmap_nested(new_pte - 1); 122 pte_unmap(new_pte - 1);
123 pte_unmap_unlock(old_pte - 1, old_ptl); 123 pte_unmap_unlock(old_pte - 1, old_ptl);
124 if (mapping) 124 if (mapping)
125 spin_unlock(&mapping->i_mmap_lock); 125 mutex_unlock(&mapping->i_mmap_mutex);
126 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); 126 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
127} 127}
128 128
@@ -147,7 +147,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
147 old_pmd = get_old_pmd(vma->vm_mm, old_addr); 147 old_pmd = get_old_pmd(vma->vm_mm, old_addr);
148 if (!old_pmd) 148 if (!old_pmd)
149 continue; 149 continue;
150 new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); 150 new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
151 if (!new_pmd) 151 if (!new_pmd)
152 break; 152 break;
153 next = (new_addr + PMD_SIZE) & PMD_MASK; 153 next = (new_addr + PMD_SIZE) & PMD_MASK;
@@ -276,9 +276,16 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
276 if (old_len > vma->vm_end - addr) 276 if (old_len > vma->vm_end - addr)
277 goto Efault; 277 goto Efault;
278 278
279 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { 279 /* Need to be careful about a growing mapping */
280 if (new_len > old_len) 280 if (new_len > old_len) {
281 unsigned long pgoff;
282
283 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
281 goto Efault; 284 goto Efault;
285 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
286 pgoff += vma->vm_pgoff;
287 if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
288 goto Einval;
282 } 289 }
283 290
284 if (vma->vm_flags & VM_LOCKED) { 291 if (vma->vm_flags & VM_LOCKED) {
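The growth path in vma_to_resize() now guards against the grown mapping's last page offset wrapping around. A standalone sketch of that arithmetic (PAGE_SHIFT and the field names are the only kernel assumptions; the function name is illustrative):

    #include <stdio.h>

    #define PAGE_SHIFT 12

    static int grow_would_wrap(unsigned long vm_start, unsigned long vm_pgoff,
                               unsigned long addr, unsigned long new_len)
    {
        unsigned long pgoff = (addr - vm_start) >> PAGE_SHIFT;

        pgoff += vm_pgoff;
        /* true if the file offset of the last page overflows */
        return pgoff + (new_len >> PAGE_SHIFT) < pgoff;
    }

    int main(void)
    {
        printf("%d\n", grow_would_wrap(0x1000, ~0UL - 1, 0x1000,
                                       16UL << PAGE_SHIFT)); /* 1: wraps */
        return 0;
    }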
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
new file mode 100644
index 000000000000..6e93dc7f2586
--- /dev/null
+++ b/mm/nobootmem.c
@@ -0,0 +1,404 @@
1/*
2 * bootmem - A boot-time physical memory allocator and configurator
3 *
4 * Copyright (C) 1999 Ingo Molnar
5 * 1999 Kanoj Sarcar, SGI
6 * 2008 Johannes Weiner
7 *
8 * Access to this subsystem has to be serialized externally (which is true
9 * for the boot process anyway).
10 */
11#include <linux/init.h>
12#include <linux/pfn.h>
13#include <linux/slab.h>
14#include <linux/bootmem.h>
15#include <linux/module.h>
16#include <linux/kmemleak.h>
17#include <linux/range.h>
18#include <linux/memblock.h>
19
20#include <asm/bug.h>
21#include <asm/io.h>
22#include <asm/processor.h>
23
24#include "internal.h"
25
26#ifndef CONFIG_NEED_MULTIPLE_NODES
27struct pglist_data __refdata contig_page_data;
28EXPORT_SYMBOL(contig_page_data);
29#endif
30
31unsigned long max_low_pfn;
32unsigned long min_low_pfn;
33unsigned long max_pfn;
34
35static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
36 u64 goal, u64 limit)
37{
38 void *ptr;
39 u64 addr;
40
41 if (limit > memblock.current_limit)
42 limit = memblock.current_limit;
43
44 addr = find_memory_core_early(nid, size, align, goal, limit);
45
46 if (addr == MEMBLOCK_ERROR)
47 return NULL;
48
49 ptr = phys_to_virt(addr);
50 memset(ptr, 0, size);
51 memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
52 /*
53 * The min_count is set to 0 so that bootmem allocated blocks
54 * are never reported as leaks.
55 */
56 kmemleak_alloc(ptr, size, 0, 0);
57 return ptr;
58}
59
60/*
61 * free_bootmem_late - free bootmem pages directly to page allocator
62 * @addr: starting address of the range
63 * @size: size of the range in bytes
64 *
65 * This is only useful when the bootmem allocator has already been torn
66 * down, but we are still initializing the system. Pages are given directly
67 * to the page allocator, no bootmem metadata is updated because it is gone.
68 */
69void __init free_bootmem_late(unsigned long addr, unsigned long size)
70{
71 unsigned long cursor, end;
72
73 kmemleak_free_part(__va(addr), size);
74
75 cursor = PFN_UP(addr);
76 end = PFN_DOWN(addr + size);
77
78 for (; cursor < end; cursor++) {
79 __free_pages_bootmem(pfn_to_page(cursor), 0);
80 totalram_pages++;
81 }
82}
83
84static void __init __free_pages_memory(unsigned long start, unsigned long end)
85{
86 int i;
87 unsigned long start_aligned, end_aligned;
88 int order = ilog2(BITS_PER_LONG);
89
90 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
91 end_aligned = end & ~(BITS_PER_LONG - 1);
92
93 if (end_aligned <= start_aligned) {
94 for (i = start; i < end; i++)
95 __free_pages_bootmem(pfn_to_page(i), 0);
96
97 return;
98 }
99
100 for (i = start; i < start_aligned; i++)
101 __free_pages_bootmem(pfn_to_page(i), 0);
102
103 for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
104 __free_pages_bootmem(pfn_to_page(i), order);
105
106 for (i = end_aligned; i < end; i++)
107 __free_pages_bootmem(pfn_to_page(i), 0);
108}
109
110unsigned long __init free_all_memory_core_early(int nodeid)
111{
112 int i;
113 u64 start, end;
114 unsigned long count = 0;
115 struct range *range = NULL;
116 int nr_range;
117
118 nr_range = get_free_all_memory_range(&range, nodeid);
119
120 for (i = 0; i < nr_range; i++) {
121 start = range[i].start;
122 end = range[i].end;
123 count += end - start;
124 __free_pages_memory(start, end);
125 }
126
127 return count;
128}
129
130/**
131 * free_all_bootmem_node - release a node's free pages to the buddy allocator
132 * @pgdat: node to be released
133 *
134 * Returns the number of pages actually released.
135 */
136unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
137{
138 register_page_bootmem_info_node(pgdat);
139
140 /* free_all_memory_core_early(MAX_NUMNODES) will be called later */
141 return 0;
142}
143
144/**
145 * free_all_bootmem - release free pages to the buddy allocator
146 *
147 * Returns the number of pages actually released.
148 */
149unsigned long __init free_all_bootmem(void)
150{
151 /*
152 * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id
 153 * because in some cases, such as when Node0 has no RAM installed,
 154 * low RAM will be on Node1.
 155 * Using MAX_NUMNODES makes sure all ranges in early_node_map[]
 156 * are used, instead of only the Node0-related ones.
157 */
158 return free_all_memory_core_early(MAX_NUMNODES);
159}
160
161/**
162 * free_bootmem_node - mark a page range as usable
163 * @pgdat: node the range resides on
164 * @physaddr: starting address of the range
165 * @size: size of the range in bytes
166 *
167 * Partial pages will be considered reserved and left as they are.
168 *
169 * The range must reside completely on the specified node.
170 */
171void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
172 unsigned long size)
173{
174 kmemleak_free_part(__va(physaddr), size);
175 memblock_x86_free_range(physaddr, physaddr + size);
176}
177
178/**
179 * free_bootmem - mark a page range as usable
180 * @addr: starting address of the range
181 * @size: size of the range in bytes
182 *
183 * Partial pages will be considered reserved and left as they are.
184 *
185 * The range must be contiguous but may span node boundaries.
186 */
187void __init free_bootmem(unsigned long addr, unsigned long size)
188{
189 kmemleak_free_part(__va(addr), size);
190 memblock_x86_free_range(addr, addr + size);
191}
192
193static void * __init ___alloc_bootmem_nopanic(unsigned long size,
194 unsigned long align,
195 unsigned long goal,
196 unsigned long limit)
197{
198 void *ptr;
199
200 if (WARN_ON_ONCE(slab_is_available()))
201 return kzalloc(size, GFP_NOWAIT);
202
203restart:
204
205 ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit);
206
207 if (ptr)
208 return ptr;
209
210 if (goal != 0) {
211 goal = 0;
212 goto restart;
213 }
214
215 return NULL;
216}
217
218/**
219 * __alloc_bootmem_nopanic - allocate boot memory without panicking
220 * @size: size of the request in bytes
221 * @align: alignment of the region
222 * @goal: preferred starting address of the region
223 *
224 * The goal is dropped if it can not be satisfied and the allocation will
225 * fall back to memory below @goal.
226 *
227 * Allocation may happen on any node in the system.
228 *
229 * Returns NULL on failure.
230 */
231void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
232 unsigned long goal)
233{
234 unsigned long limit = -1UL;
235
236 return ___alloc_bootmem_nopanic(size, align, goal, limit);
237}
238
239static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
240 unsigned long goal, unsigned long limit)
241{
242 void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
243
244 if (mem)
245 return mem;
246 /*
247 * Whoops, we cannot satisfy the allocation request.
248 */
249 printk(KERN_ALERT "bootmem alloc of %lu bytes failed!\n", size);
250 panic("Out of memory");
251 return NULL;
252}
253
254/**
255 * __alloc_bootmem - allocate boot memory
256 * @size: size of the request in bytes
257 * @align: alignment of the region
258 * @goal: preferred starting address of the region
259 *
260 * The goal is dropped if it can not be satisfied and the allocation will
261 * fall back to memory below @goal.
262 *
263 * Allocation may happen on any node in the system.
264 *
265 * The function panics if the request can not be satisfied.
266 */
267void * __init __alloc_bootmem(unsigned long size, unsigned long align,
268 unsigned long goal)
269{
270 unsigned long limit = -1UL;
271
272 return ___alloc_bootmem(size, align, goal, limit);
273}
274
275/**
276 * __alloc_bootmem_node - allocate boot memory from a specific node
277 * @pgdat: node to allocate from
278 * @size: size of the request in bytes
279 * @align: alignment of the region
280 * @goal: preferred starting address of the region
281 *
282 * The goal is dropped if it can not be satisfied and the allocation will
283 * fall back to memory below @goal.
284 *
285 * Allocation may fall back to any node in the system if the specified node
286 * can not hold the requested memory.
287 *
288 * The function panics if the request can not be satisfied.
289 */
290void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
291 unsigned long align, unsigned long goal)
292{
293 void *ptr;
294
295 if (WARN_ON_ONCE(slab_is_available()))
296 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
297
298 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
299 goal, -1ULL);
300 if (ptr)
301 return ptr;
302
303 return __alloc_memory_core_early(MAX_NUMNODES, size, align,
304 goal, -1ULL);
305}
306
307void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
308 unsigned long align, unsigned long goal)
309{
310 return __alloc_bootmem_node(pgdat, size, align, goal);
311}
312
313#ifdef CONFIG_SPARSEMEM
314/**
315 * alloc_bootmem_section - allocate boot memory from a specific section
316 * @size: size of the request in bytes
317 * @section_nr: sparse map section to allocate from
318 *
319 * Return NULL on failure.
320 */
321void * __init alloc_bootmem_section(unsigned long size,
322 unsigned long section_nr)
323{
324 unsigned long pfn, goal, limit;
325
326 pfn = section_nr_to_pfn(section_nr);
327 goal = pfn << PAGE_SHIFT;
328 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
329
330 return __alloc_memory_core_early(early_pfn_to_nid(pfn), size,
331 SMP_CACHE_BYTES, goal, limit);
332}
333#endif
334
335void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
336 unsigned long align, unsigned long goal)
337{
338 void *ptr;
339
340 if (WARN_ON_ONCE(slab_is_available()))
341 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
342
343 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
344 goal, -1ULL);
345 if (ptr)
346 return ptr;
347
348 return __alloc_bootmem_nopanic(size, align, goal);
349}
350
351#ifndef ARCH_LOW_ADDRESS_LIMIT
352#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
353#endif
354
355/**
356 * __alloc_bootmem_low - allocate low boot memory
357 * @size: size of the request in bytes
358 * @align: alignment of the region
359 * @goal: preferred starting address of the region
360 *
361 * The goal is dropped if it can not be satisfied and the allocation will
362 * fall back to memory below @goal.
363 *
364 * Allocation may happen on any node in the system.
365 *
366 * The function panics if the request can not be satisfied.
367 */
368void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
369 unsigned long goal)
370{
371 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
372}
373
374/**
375 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
376 * @pgdat: node to allocate from
377 * @size: size of the request in bytes
378 * @align: alignment of the region
379 * @goal: preferred starting address of the region
380 *
381 * The goal is dropped if it can not be satisfied and the allocation will
382 * fall back to memory below @goal.
383 *
384 * Allocation may fall back to any node in the system if the specified node
385 * can not hold the requested memory.
386 *
387 * The function panics if the request can not be satisfied.
388 */
389void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
390 unsigned long align, unsigned long goal)
391{
392 void *ptr;
393
394 if (WARN_ON_ONCE(slab_is_available()))
395 return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
396
397 ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
398 goal, ARCH_LOW_ADDRESS_LIMIT);
399 if (ptr)
400 return ptr;
401
402 return __alloc_memory_core_early(MAX_NUMNODES, size, align,
403 goal, ARCH_LOW_ADDRESS_LIMIT);
404}
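All of the bootmem entry points above reduce to one memblock-backed core helper with a "retry without the goal" fallback. A hedged standalone sketch of that retry shape; find_region() is a toy stand-in for find_memory_core_early() and always fails while a goal is set:

    #include <stdio.h>

    static unsigned long find_region(unsigned long size, unsigned long goal)
    {
        (void)size;                       /* unused in this toy model */
        return goal ? 0 : 0x100000;       /* 0 means "not found" here */
    }

    static unsigned long alloc_nopanic(unsigned long size, unsigned long goal)
    {
        unsigned long addr;
    restart:
        addr = find_region(size, goal);
        if (addr)
            return addr;
        if (goal) {                       /* drop the goal and retry once */
            goal = 0;
            goto restart;
        }
        return 0;
    }

    int main(void)
    {
        printf("%#lx\n", alloc_nopanic(4096, 0x1000000));
        return 0;
    }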
diff --git a/mm/nommu.c b/mm/nommu.c
index 88ff091eb07a..9edc897a3970 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -10,7 +10,7 @@
10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> 10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> 11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> 12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
13 * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org> 13 * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org>
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/module.h>
@@ -29,6 +29,7 @@
29#include <linux/personality.h> 29#include <linux/personality.h>
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/audit.h>
32 33
33#include <asm/uaccess.h> 34#include <asm/uaccess.h>
34#include <asm/tlb.h> 35#include <asm/tlb.h>
@@ -126,7 +127,8 @@ unsigned int kobjsize(const void *objp)
126 127
127int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 128int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
128 unsigned long start, int nr_pages, unsigned int foll_flags, 129 unsigned long start, int nr_pages, unsigned int foll_flags,
129 struct page **pages, struct vm_area_struct **vmas) 130 struct page **pages, struct vm_area_struct **vmas,
131 int *retry)
130{ 132{
131 struct vm_area_struct *vma; 133 struct vm_area_struct *vma;
132 unsigned long vm_flags; 134 unsigned long vm_flags;
@@ -184,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
184 if (force) 186 if (force)
185 flags |= FOLL_FORCE; 187 flags |= FOLL_FORCE;
186 188
187 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); 189 return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
190 NULL);
188} 191}
189EXPORT_SYMBOL(get_user_pages); 192EXPORT_SYMBOL(get_user_pages);
190 193
@@ -293,12 +296,60 @@ void *vmalloc(unsigned long size)
293} 296}
294EXPORT_SYMBOL(vmalloc); 297EXPORT_SYMBOL(vmalloc);
295 298
299/*
 300 * vzalloc - allocate virtually contiguous memory with zero fill
301 *
302 * @size: allocation size
303 *
304 * Allocate enough pages to cover @size from the page level
 305 * allocator and map them into contiguous kernel virtual space.
306 * The memory allocated is set to zero.
307 *
308 * For tight control over page level allocator and protection flags
309 * use __vmalloc() instead.
310 */
311void *vzalloc(unsigned long size)
312{
313 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
314 PAGE_KERNEL);
315}
316EXPORT_SYMBOL(vzalloc);
317
318/**
319 * vmalloc_node - allocate memory on a specific node
320 * @size: allocation size
321 * @node: numa node
322 *
323 * Allocate enough pages to cover @size from the page level
324 * allocator and map them into contiguous kernel virtual space.
325 *
326 * For tight control over page level allocator and protection flags
327 * use __vmalloc() instead.
328 */
296void *vmalloc_node(unsigned long size, int node) 329void *vmalloc_node(unsigned long size, int node)
297{ 330{
298 return vmalloc(size); 331 return vmalloc(size);
299} 332}
300EXPORT_SYMBOL(vmalloc_node); 333EXPORT_SYMBOL(vmalloc_node);
301 334
335/**
336 * vzalloc_node - allocate memory on a specific node with zero fill
337 * @size: allocation size
338 * @node: numa node
339 *
340 * Allocate enough pages to cover @size from the page level
341 * allocator and map them into contiguous kernel virtual space.
342 * The memory allocated is set to zero.
343 *
344 * For tight control over page level allocator and protection flags
345 * use __vmalloc() instead.
346 */
347void *vzalloc_node(unsigned long size, int node)
348{
349 return vzalloc(size);
350}
351EXPORT_SYMBOL(vzalloc_node);
352
302#ifndef PAGE_KERNEL_EXEC 353#ifndef PAGE_KERNEL_EXEC
303# define PAGE_KERNEL_EXEC PAGE_KERNEL 354# define PAGE_KERNEL_EXEC PAGE_KERNEL
304#endif 355#endif
@@ -392,6 +443,31 @@ void __attribute__((weak)) vmalloc_sync_all(void)
392{ 443{
393} 444}
394 445
446/**
447 * alloc_vm_area - allocate a range of kernel address space
448 * @size: size of the area
449 *
450 * Returns: NULL on failure, vm_struct on success
451 *
452 * This function reserves a range of kernel address space, and
453 * allocates pagetables to map that range. No actual mappings
454 * are created. If the kernel address space is not shared
455 * between processes, it syncs the pagetable across all
456 * processes.
457 */
458struct vm_struct *alloc_vm_area(size_t size)
459{
460 BUG();
461 return NULL;
462}
463EXPORT_SYMBOL_GPL(alloc_vm_area);
464
465void free_vm_area(struct vm_struct *area)
466{
467 BUG();
468}
469EXPORT_SYMBOL_GPL(free_vm_area);
470
395int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, 471int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
396 struct page *page) 472 struct page *page)
397{ 473{
@@ -604,9 +680,9 @@ static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
604 */ 680 */
605static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) 681static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
606{ 682{
607 struct vm_area_struct *pvma, **pp, *next; 683 struct vm_area_struct *pvma, *prev;
608 struct address_space *mapping; 684 struct address_space *mapping;
609 struct rb_node **p, *parent; 685 struct rb_node **p, *parent, *rb_prev;
610 686
611 kenter(",%p", vma); 687 kenter(",%p", vma);
612 688
@@ -627,7 +703,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
627 } 703 }
628 704
629 /* add the VMA to the tree */ 705 /* add the VMA to the tree */
630 parent = NULL; 706 parent = rb_prev = NULL;
631 p = &mm->mm_rb.rb_node; 707 p = &mm->mm_rb.rb_node;
632 while (*p) { 708 while (*p) {
633 parent = *p; 709 parent = *p;
@@ -637,17 +713,20 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
637 * (the latter is necessary as we may get identical VMAs) */ 713 * (the latter is necessary as we may get identical VMAs) */
638 if (vma->vm_start < pvma->vm_start) 714 if (vma->vm_start < pvma->vm_start)
639 p = &(*p)->rb_left; 715 p = &(*p)->rb_left;
640 else if (vma->vm_start > pvma->vm_start) 716 else if (vma->vm_start > pvma->vm_start) {
717 rb_prev = parent;
641 p = &(*p)->rb_right; 718 p = &(*p)->rb_right;
642 else if (vma->vm_end < pvma->vm_end) 719 } else if (vma->vm_end < pvma->vm_end)
643 p = &(*p)->rb_left; 720 p = &(*p)->rb_left;
644 else if (vma->vm_end > pvma->vm_end) 721 else if (vma->vm_end > pvma->vm_end) {
722 rb_prev = parent;
645 p = &(*p)->rb_right; 723 p = &(*p)->rb_right;
646 else if (vma < pvma) 724 } else if (vma < pvma)
647 p = &(*p)->rb_left; 725 p = &(*p)->rb_left;
648 else if (vma > pvma) 726 else if (vma > pvma) {
727 rb_prev = parent;
649 p = &(*p)->rb_right; 728 p = &(*p)->rb_right;
650 else 729 } else
651 BUG(); 730 BUG();
652 } 731 }
653 732
@@ -655,20 +734,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
655 rb_insert_color(&vma->vm_rb, &mm->mm_rb); 734 rb_insert_color(&vma->vm_rb, &mm->mm_rb);
656 735
657 /* add VMA to the VMA list also */ 736 /* add VMA to the VMA list also */
658 for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) { 737 prev = NULL;
659 if (pvma->vm_start > vma->vm_start) 738 if (rb_prev)
660 break; 739 prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb);
661 if (pvma->vm_start < vma->vm_start)
662 continue;
663 if (pvma->vm_end < vma->vm_end)
664 break;
665 }
666 740
667 next = *pp; 741 __vma_link_list(mm, vma, prev, parent);
668 *pp = vma;
669 vma->vm_next = next;
670 if (next)
671 next->vm_prev = vma;
672} 742}
673 743
674/* 744/*
@@ -676,7 +746,6 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
676 */ 746 */
677static void delete_vma_from_mm(struct vm_area_struct *vma) 747static void delete_vma_from_mm(struct vm_area_struct *vma)
678{ 748{
679 struct vm_area_struct **pp;
680 struct address_space *mapping; 749 struct address_space *mapping;
681 struct mm_struct *mm = vma->vm_mm; 750 struct mm_struct *mm = vma->vm_mm;
682 751
@@ -699,12 +768,14 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
699 768
700 /* remove from the MM's tree and list */ 769 /* remove from the MM's tree and list */
701 rb_erase(&vma->vm_rb, &mm->mm_rb); 770 rb_erase(&vma->vm_rb, &mm->mm_rb);
702 for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) { 771
703 if (*pp == vma) { 772 if (vma->vm_prev)
704 *pp = vma->vm_next; 773 vma->vm_prev->vm_next = vma->vm_next;
705 break; 774 else
706 } 775 mm->mmap = vma->vm_next;
707 } 776
777 if (vma->vm_next)
778 vma->vm_next->vm_prev = vma->vm_prev;
708 779
709 vma->vm_mm = NULL; 780 vma->vm_mm = NULL;
710} 781}
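delete_vma_from_mm() now unlinks from the doubly-linked VMA list in O(1) via ->vm_prev instead of rescanning the list. A minimal standalone sketch of that unlink; the node type is illustrative and only mirrors the two pointer fields:

    struct toy_vma { struct toy_vma *vm_next, *vm_prev; };

    static void unlink_vma(struct toy_vma **mmap, struct toy_vma *vma)
    {
        if (vma->vm_prev)
            vma->vm_prev->vm_next = vma->vm_next;
        else
            *mmap = vma->vm_next;          /* vma was the list head */

        if (vma->vm_next)
            vma->vm_next->vm_prev = vma->vm_prev;
    }

    int main(void)
    {
        struct toy_vma a = { 0 }, b = { 0 };
        struct toy_vma *mmap = &a;
        a.vm_next = &b;
        b.vm_prev = &a;
        unlink_vma(&mmap, &a);             /* mmap now points at b */
        return mmap == &b ? 0 : 1;
    }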
@@ -733,17 +804,15 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
733struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) 804struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
734{ 805{
735 struct vm_area_struct *vma; 806 struct vm_area_struct *vma;
736 struct rb_node *n = mm->mm_rb.rb_node;
737 807
738 /* check the cache first */ 808 /* check the cache first */
739 vma = mm->mmap_cache; 809 vma = mm->mmap_cache;
740 if (vma && vma->vm_start <= addr && vma->vm_end > addr) 810 if (vma && vma->vm_start <= addr && vma->vm_end > addr)
741 return vma; 811 return vma;
742 812
743 /* trawl the tree (there may be multiple mappings in which addr 813 /* trawl the list (there may be multiple mappings in which addr
744 * resides) */ 814 * resides) */
745 for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { 815 for (vma = mm->mmap; vma; vma = vma->vm_next) {
746 vma = rb_entry(n, struct vm_area_struct, vm_rb);
747 if (vma->vm_start > addr) 816 if (vma->vm_start > addr)
748 return NULL; 817 return NULL;
749 if (vma->vm_end > addr) { 818 if (vma->vm_end > addr) {
@@ -783,7 +852,6 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
783 unsigned long len) 852 unsigned long len)
784{ 853{
785 struct vm_area_struct *vma; 854 struct vm_area_struct *vma;
786 struct rb_node *n = mm->mm_rb.rb_node;
787 unsigned long end = addr + len; 855 unsigned long end = addr + len;
788 856
789 /* check the cache first */ 857 /* check the cache first */
@@ -791,10 +859,9 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
791 if (vma && vma->vm_start == addr && vma->vm_end == end) 859 if (vma && vma->vm_start == addr && vma->vm_end == end)
792 return vma; 860 return vma;
793 861
794 /* trawl the tree (there may be multiple mappings in which addr 862 /* trawl the list (there may be multiple mappings in which addr
795 * resides) */ 863 * resides) */
796 for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) { 864 for (vma = mm->mmap; vma; vma = vma->vm_next) {
797 vma = rb_entry(n, struct vm_area_struct, vm_rb);
798 if (vma->vm_start < addr) 865 if (vma->vm_start < addr)
799 continue; 866 continue;
800 if (vma->vm_start > addr) 867 if (vma->vm_start > addr)
@@ -1057,7 +1124,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1057 unsigned long capabilities) 1124 unsigned long capabilities)
1058{ 1125{
1059 struct page *pages; 1126 struct page *pages;
1060 unsigned long total, point, n, rlen; 1127 unsigned long total, point, n;
1061 void *base; 1128 void *base;
1062 int ret, order; 1129 int ret, order;
1063 1130
@@ -1081,13 +1148,12 @@ static int do_mmap_private(struct vm_area_struct *vma,
1081 * make a private copy of the data and map that instead */ 1148 * make a private copy of the data and map that instead */
1082 } 1149 }
1083 1150
1084 rlen = PAGE_ALIGN(len);
1085 1151
1086 /* allocate some memory to hold the mapping 1152 /* allocate some memory to hold the mapping
1087 * - note that this may not return a page-aligned address if the object 1153 * - note that this may not return a page-aligned address if the object
1088 * we're allocating is smaller than a page 1154 * we're allocating is smaller than a page
1089 */ 1155 */
1090 order = get_order(rlen); 1156 order = get_order(len);
1091 kdebug("alloc order %d for %lx", order, len); 1157 kdebug("alloc order %d for %lx", order, len);
1092 1158
1093 pages = alloc_pages(GFP_KERNEL, order); 1159 pages = alloc_pages(GFP_KERNEL, order);
@@ -1097,7 +1163,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1097 total = 1 << order; 1163 total = 1 << order;
1098 atomic_long_add(total, &mmap_pages_allocated); 1164 atomic_long_add(total, &mmap_pages_allocated);
1099 1165
1100 point = rlen >> PAGE_SHIFT; 1166 point = len >> PAGE_SHIFT;
1101 1167
1102 /* we allocated a power-of-2 sized page set, so we may want to trim off 1168 /* we allocated a power-of-2 sized page set, so we may want to trim off
1103 * the excess */ 1169 * the excess */
@@ -1119,7 +1185,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1119 base = page_address(pages); 1185 base = page_address(pages);
1120 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY; 1186 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
1121 region->vm_start = (unsigned long) base; 1187 region->vm_start = (unsigned long) base;
1122 region->vm_end = region->vm_start + rlen; 1188 region->vm_end = region->vm_start + len;
1123 region->vm_top = region->vm_start + (total << PAGE_SHIFT); 1189 region->vm_top = region->vm_start + (total << PAGE_SHIFT);
1124 1190
1125 vma->vm_start = region->vm_start; 1191 vma->vm_start = region->vm_start;
@@ -1135,22 +1201,22 @@ static int do_mmap_private(struct vm_area_struct *vma,
1135 1201
1136 old_fs = get_fs(); 1202 old_fs = get_fs();
1137 set_fs(KERNEL_DS); 1203 set_fs(KERNEL_DS);
1138 ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos); 1204 ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos);
1139 set_fs(old_fs); 1205 set_fs(old_fs);
1140 1206
1141 if (ret < 0) 1207 if (ret < 0)
1142 goto error_free; 1208 goto error_free;
1143 1209
1144 /* clear the last little bit */ 1210 /* clear the last little bit */
1145 if (ret < rlen) 1211 if (ret < len)
1146 memset(base + ret, 0, rlen - ret); 1212 memset(base + ret, 0, len - ret);
1147 1213
1148 } 1214 }
1149 1215
1150 return 0; 1216 return 0;
1151 1217
1152error_free: 1218error_free:
1153 free_page_series(region->vm_start, region->vm_end); 1219 free_page_series(region->vm_start, region->vm_top);
1154 region->vm_start = vma->vm_start = 0; 1220 region->vm_start = vma->vm_start = 0;
1155 region->vm_end = vma->vm_end = 0; 1221 region->vm_end = vma->vm_end = 0;
1156 region->vm_top = 0; 1222 region->vm_top = 0;
@@ -1159,7 +1225,7 @@ error_free:
1159enomem: 1225enomem:
1160 printk("Allocation of length %lu from process %d (%s) failed\n", 1226 printk("Allocation of length %lu from process %d (%s) failed\n",
1161 len, current->pid, current->comm); 1227 len, current->pid, current->comm);
1162 show_free_areas(); 1228 show_free_areas(0);
1163 return -ENOMEM; 1229 return -ENOMEM;
1164} 1230}
1165 1231
@@ -1192,6 +1258,7 @@ unsigned long do_mmap_pgoff(struct file *file,
1192 1258
1193 /* we ignore the address hint */ 1259 /* we ignore the address hint */
1194 addr = 0; 1260 addr = 0;
1261 len = PAGE_ALIGN(len);
1195 1262
1196 /* we've determined that we can make the mapping, now translate what we 1263 /* we've determined that we can make the mapping, now translate what we
1197 * now know into VMA flags */ 1264 * now know into VMA flags */
@@ -1309,15 +1376,15 @@ unsigned long do_mmap_pgoff(struct file *file,
1309 if (capabilities & BDI_CAP_MAP_DIRECT) { 1376 if (capabilities & BDI_CAP_MAP_DIRECT) {
1310 addr = file->f_op->get_unmapped_area(file, addr, len, 1377 addr = file->f_op->get_unmapped_area(file, addr, len,
1311 pgoff, flags); 1378 pgoff, flags);
1312 if (IS_ERR((void *) addr)) { 1379 if (IS_ERR_VALUE(addr)) {
1313 ret = addr; 1380 ret = addr;
1314 if (ret != (unsigned long) -ENOSYS) 1381 if (ret != -ENOSYS)
1315 goto error_just_free; 1382 goto error_just_free;
1316 1383
1317 /* the driver refused to tell us where to site 1384 /* the driver refused to tell us where to site
1318 * the mapping so we'll have to attempt to copy 1385 * the mapping so we'll have to attempt to copy
1319 * it */ 1386 * it */
1320 ret = (unsigned long) -ENODEV; 1387 ret = -ENODEV;
1321 if (!(capabilities & BDI_CAP_MAP_COPY)) 1388 if (!(capabilities & BDI_CAP_MAP_COPY))
1322 goto error_just_free; 1389 goto error_just_free;
1323 1390
@@ -1392,14 +1459,14 @@ error_getting_vma:
1392 printk(KERN_WARNING "Allocation of vma for %lu byte allocation" 1459 printk(KERN_WARNING "Allocation of vma for %lu byte allocation"
1393 " from process %d failed\n", 1460 " from process %d failed\n",
1394 len, current->pid); 1461 len, current->pid);
1395 show_free_areas(); 1462 show_free_areas(0);
1396 return -ENOMEM; 1463 return -ENOMEM;
1397 1464
1398error_getting_region: 1465error_getting_region:
1399 printk(KERN_WARNING "Allocation of vm region for %lu byte allocation" 1466 printk(KERN_WARNING "Allocation of vm region for %lu byte allocation"
1400 " from process %d failed\n", 1467 " from process %d failed\n",
1401 len, current->pid); 1468 len, current->pid);
1402 show_free_areas(); 1469 show_free_areas(0);
1403 return -ENOMEM; 1470 return -ENOMEM;
1404} 1471}
1405EXPORT_SYMBOL(do_mmap_pgoff); 1472EXPORT_SYMBOL(do_mmap_pgoff);
@@ -1411,6 +1478,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1411 struct file *file = NULL; 1478 struct file *file = NULL;
1412 unsigned long retval = -EBADF; 1479 unsigned long retval = -EBADF;
1413 1480
1481 audit_mmap_fd(fd, flags);
1414 if (!(flags & MAP_ANONYMOUS)) { 1482 if (!(flags & MAP_ANONYMOUS)) {
1415 file = fget(fd); 1483 file = fget(fd);
1416 if (!file) 1484 if (!file)
@@ -1567,15 +1635,17 @@ static int shrink_vma(struct mm_struct *mm,
1567int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) 1635int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1568{ 1636{
1569 struct vm_area_struct *vma; 1637 struct vm_area_struct *vma;
1570 struct rb_node *rb; 1638 unsigned long end;
1571 unsigned long end = start + len;
1572 int ret; 1639 int ret;
1573 1640
1574 kenter(",%lx,%zx", start, len); 1641 kenter(",%lx,%zx", start, len);
1575 1642
1643 len = PAGE_ALIGN(len);
1576 if (len == 0) 1644 if (len == 0)
1577 return -EINVAL; 1645 return -EINVAL;
1578 1646
1647 end = start + len;
1648
1579 /* find the first potentially overlapping VMA */ 1649 /* find the first potentially overlapping VMA */
1580 vma = find_vma(mm, start); 1650 vma = find_vma(mm, start);
1581 if (!vma) { 1651 if (!vma) {
@@ -1600,9 +1670,8 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1600 } 1670 }
1601 if (end == vma->vm_end) 1671 if (end == vma->vm_end)
1602 goto erase_whole_vma; 1672 goto erase_whole_vma;
1603 rb = rb_next(&vma->vm_rb); 1673 vma = vma->vm_next;
1604 vma = rb_entry(rb, struct vm_area_struct, vm_rb); 1674 } while (vma);
1605 } while (rb);
1606 kleave(" = -EINVAL [split file]"); 1675 kleave(" = -EINVAL [split file]");
1607 return -EINVAL; 1676 return -EINVAL;
1608 } else { 1677 } else {
@@ -1668,6 +1737,7 @@ void exit_mmap(struct mm_struct *mm)
1668 mm->mmap = vma->vm_next; 1737 mm->mmap = vma->vm_next;
1669 delete_vma_from_mm(vma); 1738 delete_vma_from_mm(vma);
1670 delete_vma(mm, vma); 1739 delete_vma(mm, vma);
1740 cond_resched();
1671 } 1741 }
1672 1742
1673 kleave(""); 1743 kleave("");
@@ -1695,6 +1765,8 @@ unsigned long do_mremap(unsigned long addr,
1695 struct vm_area_struct *vma; 1765 struct vm_area_struct *vma;
1696 1766
1697 /* insanity checks first */ 1767 /* insanity checks first */
1768 old_len = PAGE_ALIGN(old_len);
1769 new_len = PAGE_ALIGN(new_len);
1698 if (old_len == 0 || new_len == 0) 1770 if (old_len == 0 || new_len == 0)
1699 return (unsigned long) -EINVAL; 1771 return (unsigned long) -EINVAL;
1700 1772
@@ -1741,10 +1813,13 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1741 return NULL; 1813 return NULL;
1742} 1814}
1743 1815
1744int remap_pfn_range(struct vm_area_struct *vma, unsigned long from, 1816int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1745 unsigned long to, unsigned long size, pgprot_t prot) 1817 unsigned long pfn, unsigned long size, pgprot_t prot)
1746{ 1818{
1747 vma->vm_start = vma->vm_pgoff << PAGE_SHIFT; 1819 if (addr != (pfn << PAGE_SHIFT))
1820 return -EINVAL;
1821
1822 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1748 return 0; 1823 return 0;
1749} 1824}
1750EXPORT_SYMBOL(remap_pfn_range); 1825EXPORT_SYMBOL(remap_pfn_range);
@@ -1764,10 +1839,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
1764} 1839}
1765EXPORT_SYMBOL(remap_vmalloc_range); 1840EXPORT_SYMBOL(remap_vmalloc_range);
1766 1841
1767void swap_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1768{
1769}
1770
1771unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, 1842unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
1772 unsigned long len, unsigned long pgoff, unsigned long flags) 1843 unsigned long len, unsigned long pgoff, unsigned long flags)
1773{ 1844{
@@ -1885,7 +1956,7 @@ error:
1885 return -ENOMEM; 1956 return -ENOMEM;
1886} 1957}
1887 1958
1888int in_gate_area_no_task(unsigned long addr) 1959int in_gate_area_no_mm(unsigned long addr)
1889{ 1960{
1890 return 0; 1961 return 0;
1891} 1962}
@@ -1897,21 +1968,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1897} 1968}
1898EXPORT_SYMBOL(filemap_fault); 1969EXPORT_SYMBOL(filemap_fault);
1899 1970
1900/* 1971static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
1901 * Access another process' address space. 1972 unsigned long addr, void *buf, int len, int write)
1902 * - source/target buffer must be kernel space
1903 */
1904int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
1905{ 1973{
1906 struct vm_area_struct *vma; 1974 struct vm_area_struct *vma;
1907 struct mm_struct *mm;
1908
1909 if (addr + len < addr)
1910 return 0;
1911
1912 mm = get_task_mm(tsk);
1913 if (!mm)
1914 return 0;
1915 1975
1916 down_read(&mm->mmap_sem); 1976 down_read(&mm->mmap_sem);
1917 1977
@@ -1936,6 +1996,43 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
1936 } 1996 }
1937 1997
1938 up_read(&mm->mmap_sem); 1998 up_read(&mm->mmap_sem);
1999
2000 return len;
2001}
2002
2003/**
2004 * @access_remote_vm - access another process' address space
2005 * @mm: the mm_struct of the target address space
2006 * @addr: start address to access
2007 * @buf: source or destination buffer
2008 * @len: number of bytes to transfer
2009 * @write: whether the access is a write
2010 *
2011 * The caller must hold a reference on @mm.
2012 */
2013int access_remote_vm(struct mm_struct *mm, unsigned long addr,
2014 void *buf, int len, int write)
2015{
2016 return __access_remote_vm(NULL, mm, addr, buf, len, write);
2017}
2018
2019/*
2020 * Access another process' address space.
2021 * - source/target buffer must be kernel space
2022 */
2023int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
2024{
2025 struct mm_struct *mm;
2026
2027 if (addr + len < addr)
2028 return 0;
2029
2030 mm = get_task_mm(tsk);
2031 if (!mm)
2032 return 0;
2033
2034 len = __access_remote_vm(tsk, mm, addr, buf, len, write);
2035
1939 mmput(mm); 2036 mmput(mm);
1940 return len; 2037 return len;
1941} 2038}
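
The nommu version now mirrors the MMU side: a single worker, __access_remote_vm(), does the copy against a caller-supplied mm, access_remote_vm() is a thin wrapper for callers that already hold an mm reference, and access_process_vm() keeps the old behaviour of looking the mm up with get_task_mm()/mmput(). The following userspace sketch models only that wrapper structure; the struct definitions and the byte-copy body are stand-ins, not kernel code.

#include <stdio.h>
#include <string.h>

/* Stand-ins for the kernel structures; only what the sketch needs. */
struct mm_struct { char space[256]; int users; };
struct task_struct { struct mm_struct *mm; };

/* Common worker: copies against an mm the caller has already pinned. */
static int fake_access_remote_vm(struct mm_struct *mm, unsigned long addr,
                                 void *buf, int len, int write)
{
    if (addr >= sizeof(mm->space))
        return 0;
    if (addr + len > sizeof(mm->space))
        len = (int)(sizeof(mm->space) - addr);
    if (write)
        memcpy(mm->space + addr, buf, len);
    else
        memcpy(buf, mm->space + addr, len);
    return len;                              /* bytes transferred */
}

/* Entry point 1: the caller already holds a reference on mm. */
static int fake_access_remote(struct mm_struct *mm, unsigned long addr,
                              void *buf, int len, int write)
{
    return fake_access_remote_vm(mm, addr, buf, len, write);
}

/* Entry point 2: resolve and pin the task's mm, copy, then drop it. */
static int fake_access_process(struct task_struct *tsk, unsigned long addr,
                               void *buf, int len, int write)
{
    struct mm_struct *mm = tsk->mm;          /* get_task_mm() stand-in */

    if (!mm)
        return 0;
    mm->users++;
    len = fake_access_remote_vm(mm, addr, buf, len, write);
    mm->users--;                             /* mmput() stand-in */
    return len;
}

int main(void)
{
    struct mm_struct mm = { .users = 1 };
    struct task_struct tsk = { .mm = &mm };
    char msg[] = "hello";
    char out[8] = "";

    fake_access_process(&tsk, 16, msg, sizeof(msg), 1);  /* write via the task */
    fake_access_remote(&mm, 16, out, sizeof(msg), 0);    /* read via the mm */
    printf("%s\n", out);
    return 0;
}
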
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4029583a1024..e4b0991ca351 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -31,12 +31,40 @@
31#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/mempolicy.h> 32#include <linux/mempolicy.h>
33#include <linux/security.h> 33#include <linux/security.h>
34#include <linux/ptrace.h>
34 35
35int sysctl_panic_on_oom; 36int sysctl_panic_on_oom;
36int sysctl_oom_kill_allocating_task; 37int sysctl_oom_kill_allocating_task;
37int sysctl_oom_dump_tasks = 1; 38int sysctl_oom_dump_tasks = 1;
38static DEFINE_SPINLOCK(zone_scan_lock); 39static DEFINE_SPINLOCK(zone_scan_lock);
39 40
41/**
42 * test_set_oom_score_adj() - set current's oom_score_adj and return old value
43 * @new_val: new oom_score_adj value
44 *
45 * Sets the oom_score_adj value for current to @new_val with proper
46 * synchronization and returns the old value. Usually used to temporarily
47 * set a value, save the old value in the caller, and then reinstate it later.
48 */
49int test_set_oom_score_adj(int new_val)
50{
51 struct sighand_struct *sighand = current->sighand;
52 int old_val;
53
54 spin_lock_irq(&sighand->siglock);
55 old_val = current->signal->oom_score_adj;
56 if (new_val != old_val) {
57 if (new_val == OOM_SCORE_ADJ_MIN)
58 atomic_inc(&current->mm->oom_disable_count);
59 else if (old_val == OOM_SCORE_ADJ_MIN)
60 atomic_dec(&current->mm->oom_disable_count);
61 current->signal->oom_score_adj = new_val;
62 }
63 spin_unlock_irq(&sighand->siglock);
64
65 return old_val;
66}
67
40#ifdef CONFIG_NUMA 68#ifdef CONFIG_NUMA
41/** 69/**
42 * has_intersects_mems_allowed() - check task eligiblity for kill 70 * has_intersects_mems_allowed() - check task eligiblity for kill
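
test_set_oom_score_adj() exists for the set-temporarily-then-restore idiom its kernel-doc describes: callers stash the returned old value and write it back later, and the siglock keeps oom_score_adj and the mm's oom_disable_count in step. A small userspace model of that idiom follows; the mutex, globals and helper name are placeholders rather than kernel interfaces.

#include <stdio.h>
#include <pthread.h>

#define OOM_SCORE_ADJ_MIN (-1000)

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; /* siglock stand-in */
static int oom_score_adj;        /* models current->signal->oom_score_adj */
static int oom_disable_count;    /* models current->mm->oom_disable_count */

/* Set a new value, keep the disable counter in sync, return the old value. */
static int test_set_adj(int new_val)
{
    int old_val;

    pthread_mutex_lock(&lock);
    old_val = oom_score_adj;
    if (new_val != old_val) {
        if (new_val == OOM_SCORE_ADJ_MIN)
            oom_disable_count++;
        else if (old_val == OOM_SCORE_ADJ_MIN)
            oom_disable_count--;
        oom_score_adj = new_val;
    }
    pthread_mutex_unlock(&lock);
    return old_val;
}

int main(void)
{
    /* Temporarily make the task unkillable, then restore the saved value. */
    int saved = test_set_adj(OOM_SCORE_ADJ_MIN);

    printf("disabled: adj=%d count=%d\n", oom_score_adj, oom_disable_count);
    test_set_adj(saved);
    printf("restored: adj=%d count=%d\n", oom_score_adj, oom_disable_count);
    return 0;
}
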
@@ -83,24 +111,6 @@ static bool has_intersects_mems_allowed(struct task_struct *tsk,
83#endif /* CONFIG_NUMA */ 111#endif /* CONFIG_NUMA */
84 112
85/* 113/*
86 * If this is a system OOM (not a memcg OOM) and the task selected to be
87 * killed is not already running at high (RT) priorities, speed up the
88 * recovery by boosting the dying task to the lowest FIFO priority.
89 * That helps with the recovery and avoids interfering with RT tasks.
90 */
91static void boost_dying_task_prio(struct task_struct *p,
92 struct mem_cgroup *mem)
93{
94 struct sched_param param = { .sched_priority = 1 };
95
96 if (mem)
97 return;
98
99 if (!rt_task(p))
100 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
101}
102
103/*
104 * The process p may have detached its own ->mm while exiting or through 114 * The process p may have detached its own ->mm while exiting or through
105 * use_mm(), but one or more of its subthreads may still have a valid 115 * use_mm(), but one or more of its subthreads may still have a valid
106 * pointer. Return p, or any of its subthreads with a valid ->mm, with 116 * pointer. Return p, or any of its subthreads with a valid ->mm, with
@@ -162,24 +172,16 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
162 return 0; 172 return 0;
163 173
164 /* 174 /*
165 * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't 175 * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
166 * need to be executed for something that cannot be killed. 176 * so the entire heuristic doesn't need to be executed for something
177 * that cannot be killed.
167 */ 178 */
168 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { 179 if (atomic_read(&p->mm->oom_disable_count)) {
169 task_unlock(p); 180 task_unlock(p);
170 return 0; 181 return 0;
171 } 182 }
172 183
173 /* 184 /*
174 * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
175 * priority for oom killing.
176 */
177 if (p->flags & PF_OOM_ORIGIN) {
178 task_unlock(p);
179 return 1000;
180 }
181
182 /*
183 * The memory controller may have a limit of 0 bytes, so avoid a divide 185 * The memory controller may have a limit of 0 bytes, so avoid a divide
184 * by zero, if necessary. 186 * by zero, if necessary.
185 */ 187 */
@@ -188,10 +190,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
188 190
189 /* 191 /*
190 * The baseline for the badness score is the proportion of RAM that each 192 * The baseline for the badness score is the proportion of RAM that each
191 * task's rss and swap space use. 193 * task's rss, pagetable and swap space use.
192 */ 194 */
193 points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 / 195 points = get_mm_rss(p->mm) + p->mm->nr_ptes;
194 totalpages; 196 points += get_mm_counter(p->mm, MM_SWAPENTS);
197
198 points *= 1000;
199 points /= totalpages;
195 task_unlock(p); 200 task_unlock(p);
196 201
197 /* 202 /*
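
With the shortcut and PF_OOM_ORIGIN special cases gone, the score is a plain proportion: rss, page-table pages and swap entries are summed, scaled to 0..1000 and divided by the pages available to the task. A worked example with invented numbers (roughly 611 of 1000 for the values below):

#include <stdio.h>

/* Proportional badness: (rss + nr_ptes + swapents) scaled to 0..1000. */
static unsigned int badness(unsigned long rss, unsigned long nr_ptes,
                            unsigned long swapents, unsigned long totalpages)
{
    unsigned long points = rss + nr_ptes + swapents;

    points *= 1000;
    points /= totalpages ? totalpages : 1;  /* mirror the divide-by-zero guard */
    return (unsigned int)points;
}

int main(void)
{
    /* A task with 300000 RSS pages, 600 page-table pages and 20000 swap
     * entries on a 2 GiB (524288-page) machine: illustrative values only. */
    printf("score = %u / 1000\n", badness(300000, 600, 20000, 524288));
    return 0;
}
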
@@ -291,13 +296,15 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
291 unsigned long totalpages, struct mem_cgroup *mem, 296 unsigned long totalpages, struct mem_cgroup *mem,
292 const nodemask_t *nodemask) 297 const nodemask_t *nodemask)
293{ 298{
294 struct task_struct *p; 299 struct task_struct *g, *p;
295 struct task_struct *chosen = NULL; 300 struct task_struct *chosen = NULL;
296 *ppoints = 0; 301 *ppoints = 0;
297 302
298 for_each_process(p) { 303 do_each_thread(g, p) {
299 unsigned int points; 304 unsigned int points;
300 305
306 if (!p->mm)
307 continue;
301 if (oom_unkillable_task(p, mem, nodemask)) 308 if (oom_unkillable_task(p, mem, nodemask))
302 continue; 309 continue;
303 310
@@ -313,22 +320,29 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
313 if (test_tsk_thread_flag(p, TIF_MEMDIE)) 320 if (test_tsk_thread_flag(p, TIF_MEMDIE))
314 return ERR_PTR(-1UL); 321 return ERR_PTR(-1UL);
315 322
316 /* 323 if (p->flags & PF_EXITING) {
317 * This is in the process of releasing memory so wait for it 324 /*
318 * to finish before killing some other task by mistake. 325 * If p is the current task and is in the process of
319 * 326 * releasing memory, we allow the "kill" to set
320 * However, if p is the current task, we allow the 'kill' to 327 * TIF_MEMDIE, which will allow it to gain access to
321 * go ahead if it is exiting: this will simply set TIF_MEMDIE, 328 * memory reserves. Otherwise, it may stall forever.
322 * which will allow it to gain access to memory reserves in 329 *
323 * the process of exiting and releasing its resources. 330 * The loop isn't broken here, however, in case other
324 * Otherwise we could get an easy OOM deadlock. 331 * threads are found to have already been oom killed.
325 */ 332 */
326 if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) { 333 if (p == current) {
327 if (p != current) 334 chosen = p;
328 return ERR_PTR(-1UL); 335 *ppoints = 1000;
329 336 } else {
330 chosen = p; 337 /*
331 *ppoints = 1000; 338 * If this task is not being ptraced on exit,
339 * then wait for it to finish before killing
340 * some other task unnecessarily.
341 */
342 if (!(task_ptrace(p->group_leader) &
343 PT_TRACE_EXIT))
344 return ERR_PTR(-1UL);
345 }
332 } 346 }
333 347
334 points = oom_badness(p, mem, nodemask, totalpages); 348 points = oom_badness(p, mem, nodemask, totalpages);
@@ -336,7 +350,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
336 chosen = p; 350 chosen = p;
337 *ppoints = points; 351 *ppoints = points;
338 } 352 }
339 } 353 } while_each_thread(g, p);
340 354
341 return chosen; 355 return chosen;
342} 356}
@@ -395,7 +409,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
395 task_unlock(current); 409 task_unlock(current);
396 dump_stack(); 410 dump_stack();
397 mem_cgroup_print_oom_info(mem, p); 411 mem_cgroup_print_oom_info(mem, p);
398 show_mem(); 412 show_mem(SHOW_MEM_FILTER_NODES);
399 if (sysctl_oom_dump_tasks) 413 if (sysctl_oom_dump_tasks)
400 dump_tasks(mem, nodemask); 414 dump_tasks(mem, nodemask);
401} 415}
@@ -403,27 +417,44 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
403#define K(x) ((x) << (PAGE_SHIFT-10)) 417#define K(x) ((x) << (PAGE_SHIFT-10))
404static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) 418static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
405{ 419{
420 struct task_struct *q;
421 struct mm_struct *mm;
422
406 p = find_lock_task_mm(p); 423 p = find_lock_task_mm(p);
407 if (!p) 424 if (!p)
408 return 1; 425 return 1;
409 426
427 /* mm cannot be safely dereferenced after task_unlock(p) */
428 mm = p->mm;
429
410 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 430 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
411 task_pid_nr(p), p->comm, K(p->mm->total_vm), 431 task_pid_nr(p), p->comm, K(p->mm->total_vm),
412 K(get_mm_counter(p->mm, MM_ANONPAGES)), 432 K(get_mm_counter(p->mm, MM_ANONPAGES)),
413 K(get_mm_counter(p->mm, MM_FILEPAGES))); 433 K(get_mm_counter(p->mm, MM_FILEPAGES)));
414 task_unlock(p); 434 task_unlock(p);
415 435
436 /*
437 * Kill all processes sharing p->mm in other thread groups, if any.
438 * They don't get access to memory reserves or a higher scheduler
439 * priority, though, to avoid depletion of all memory or task
440 * starvation. This prevents mm->mmap_sem livelock when an oom killed
441 * task cannot exit because it requires the semaphore and its contended
442 * by another thread trying to allocate memory itself. That thread will
443 * now get access to memory reserves since it has a pending fatal
444 * signal.
445 */
446 for_each_process(q)
447 if (q->mm == mm && !same_thread_group(q, p)) {
448 task_lock(q); /* Protect ->comm from prctl() */
449 pr_err("Kill process %d (%s) sharing same memory\n",
450 task_pid_nr(q), q->comm);
451 task_unlock(q);
452 force_sig(SIGKILL, q);
453 }
416 454
417 set_tsk_thread_flag(p, TIF_MEMDIE); 455 set_tsk_thread_flag(p, TIF_MEMDIE);
418 force_sig(SIGKILL, p); 456 force_sig(SIGKILL, p);
419 457
420 /*
421 * We give our sacrificial lamb high priority and access to
422 * all the memory it needs. That way it should be able to
423 * exit() and clear out its resources quickly...
424 */
425 boost_dying_task_prio(p, mem);
426
427 return 0; 458 return 0;
428} 459}
429#undef K 460#undef K
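
In place of the priority boost, oom_kill_task() now sweeps the process list and SIGKILLs every task outside the victim's thread group that shares its mm, so mmap_sem cannot stay held by a sibling that was never told to exit. A userspace model of that sweep over a fake task table (the struct and helpers are stand-ins):

#include <stdio.h>

struct fake_task {
    int pid;
    int tgid;          /* thread group id */
    const void *mm;    /* shared address space identity */
    int killed;
};

/* Kill every task sharing the victim's mm that is not in its thread group. */
static void kill_mm_sharers(struct fake_task *tasks, int n,
                            const struct fake_task *victim)
{
    for (int i = 0; i < n; i++) {
        struct fake_task *q = &tasks[i];

        if (q->mm == victim->mm && q->tgid != victim->tgid) {
            q->killed = 1;   /* force_sig(SIGKILL, q) stand-in */
            printf("Kill process %d sharing same memory\n", q->pid);
        }
    }
}

int main(void)
{
    static int mm_a, mm_b;   /* addresses used only as mm identities */
    struct fake_task tasks[] = {
        { 100, 100, &mm_a, 0 },  /* victim's group leader */
        { 101, 100, &mm_a, 0 },  /* same thread group: skipped */
        { 200, 200, &mm_a, 0 },  /* shares the mm (CLONE_VM): killed */
        { 300, 300, &mm_b, 0 },  /* unrelated: untouched */
    };

    kill_mm_sharers(tasks, 4, &tasks[0]);
    return 0;
}
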
@@ -447,7 +478,6 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
447 */ 478 */
448 if (p->flags & PF_EXITING) { 479 if (p->flags & PF_EXITING) {
449 set_tsk_thread_flag(p, TIF_MEMDIE); 480 set_tsk_thread_flag(p, TIF_MEMDIE);
450 boost_dying_task_prio(p, mem);
451 return 0; 481 return 0;
452 } 482 }
453 483
@@ -466,6 +496,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
466 list_for_each_entry(child, &t->children, sibling) { 496 list_for_each_entry(child, &t->children, sibling) {
467 unsigned int child_points; 497 unsigned int child_points;
468 498
499 if (child->mm == p->mm)
500 continue;
469 /* 501 /*
470 * oom_badness() returns 0 if the thread is unkillable 502 * oom_badness() returns 0 if the thread is unkillable
471 */ 503 */
@@ -512,6 +544,16 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
512 unsigned int points = 0; 544 unsigned int points = 0;
513 struct task_struct *p; 545 struct task_struct *p;
514 546
547 /*
548 * If current has a pending SIGKILL, then automatically select it. The
549 * goal is to allow it to allocate so that it may quickly exit and free
550 * its memory.
551 */
552 if (fatal_signal_pending(current)) {
553 set_thread_flag(TIF_MEMDIE);
554 return;
555 }
556
515 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); 557 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
516 limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; 558 limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
517 read_lock(&tasklist_lock); 559 read_lock(&tasklist_lock);
@@ -664,7 +706,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
664 */ 706 */
665 if (fatal_signal_pending(current)) { 707 if (fatal_signal_pending(current)) {
666 set_thread_flag(TIF_MEMDIE); 708 set_thread_flag(TIF_MEMDIE);
667 boost_dying_task_prio(current, NULL);
668 return; 709 return;
669 } 710 }
670 711
@@ -680,7 +721,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
680 read_lock(&tasklist_lock); 721 read_lock(&tasklist_lock);
681 if (sysctl_oom_kill_allocating_task && 722 if (sysctl_oom_kill_allocating_task &&
682 !oom_unkillable_task(current, NULL, nodemask) && 723 !oom_unkillable_task(current, NULL, nodemask) &&
683 (current->signal->oom_adj != OOM_DISABLE)) { 724 current->mm && !atomic_read(&current->mm->oom_disable_count)) {
684 /* 725 /*
685 * oom_kill_process() needs tasklist_lock held. If it returns 726 * oom_kill_process() needs tasklist_lock held. If it returns
686 * non-zero, current could not be killed so we must fallback to 727 * non-zero, current could not be killed so we must fallback to
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e3bccac1f025..31f698862420 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -404,25 +404,22 @@ unsigned long determine_dirtyable_memory(void)
404 * - vm.dirty_background_ratio or vm.dirty_background_bytes 404 * - vm.dirty_background_ratio or vm.dirty_background_bytes
405 * - vm.dirty_ratio or vm.dirty_bytes 405 * - vm.dirty_ratio or vm.dirty_bytes
406 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and 406 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
407 * runtime tasks. 407 * real-time tasks.
408 */ 408 */
409void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) 409void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
410{ 410{
411 unsigned long background; 411 unsigned long background;
412 unsigned long dirty; 412 unsigned long dirty;
413 unsigned long available_memory = determine_dirtyable_memory(); 413 unsigned long uninitialized_var(available_memory);
414 struct task_struct *tsk; 414 struct task_struct *tsk;
415 415
416 if (!vm_dirty_bytes || !dirty_background_bytes)
417 available_memory = determine_dirtyable_memory();
418
416 if (vm_dirty_bytes) 419 if (vm_dirty_bytes)
417 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); 420 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
418 else { 421 else
419 int dirty_ratio; 422 dirty = (vm_dirty_ratio * available_memory) / 100;
420
421 dirty_ratio = vm_dirty_ratio;
422 if (dirty_ratio < 5)
423 dirty_ratio = 5;
424 dirty = (dirty_ratio * available_memory) / 100;
425 }
426 423
427 if (dirty_background_bytes) 424 if (dirty_background_bytes)
428 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); 425 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
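
determine_dirtyable_memory() is now consulted only when a ratio-based limit is in force; a byte-based limit is simply rounded up to pages, and the old 5% floor on vm_dirty_ratio is gone. The dirty threshold the function ends up with can be reproduced in a few lines (sample values, not taken from any real system):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Byte-based limit wins if set; otherwise take a percentage of the
 * dirtyable memory, exactly as global_dirty_limits() does for 'dirty'. */
static unsigned long dirty_limit(unsigned long vm_dirty_bytes,
                                 unsigned long vm_dirty_ratio,
                                 unsigned long available_memory)
{
    if (vm_dirty_bytes)
        return DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
    return vm_dirty_ratio * available_memory / 100;
}

int main(void)
{
    unsigned long available = 2000000;  /* dirtyable pages, ~7.6 GiB */

    printf("ratio 20%%     -> %lu pages\n", dirty_limit(0, 20, available));
    printf("bytes 256 MiB -> %lu pages\n",
           dirty_limit(256UL << 20, 20, available));
    return 0;
}
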
@@ -510,7 +507,7 @@ static void balance_dirty_pages(struct address_space *mapping,
510 * catch-up. This avoids (excessively) small writeouts 507 * catch-up. This avoids (excessively) small writeouts
511 * when the bdi limits are ramping up. 508 * when the bdi limits are ramping up.
512 */ 509 */
513 if (nr_reclaimable + nr_writeback < 510 if (nr_reclaimable + nr_writeback <=
514 (background_thresh + dirty_thresh) / 2) 511 (background_thresh + dirty_thresh) / 2)
515 break; 512 break;
516 513
@@ -542,8 +539,8 @@ static void balance_dirty_pages(struct address_space *mapping,
542 * the last resort safeguard. 539 * the last resort safeguard.
543 */ 540 */
544 dirty_exceeded = 541 dirty_exceeded =
545 (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh) 542 (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
546 || (nr_reclaimable + nr_writeback >= dirty_thresh); 543 || (nr_reclaimable + nr_writeback > dirty_thresh);
547 544
548 if (!dirty_exceeded) 545 if (!dirty_exceeded)
549 break; 546 break;
@@ -569,7 +566,7 @@ static void balance_dirty_pages(struct address_space *mapping,
569 break; /* We've done our duty */ 566 break; /* We've done our duty */
570 } 567 }
571 trace_wbc_balance_dirty_wait(&wbc, bdi); 568 trace_wbc_balance_dirty_wait(&wbc, bdi);
572 __set_current_state(TASK_INTERRUPTIBLE); 569 __set_current_state(TASK_UNINTERRUPTIBLE);
573 io_schedule_timeout(pause); 570 io_schedule_timeout(pause);
574 571
575 /* 572 /*
@@ -930,7 +927,7 @@ retry:
930 break; 927 break;
931 } 928 }
932 929
933 done_index = page->index + 1; 930 done_index = page->index;
934 931
935 lock_page(page); 932 lock_page(page);
936 933
@@ -980,6 +977,7 @@ continue_unlock:
980 * not be suitable for data integrity 977 * not be suitable for data integrity
981 * writeout). 978 * writeout).
982 */ 979 */
980 done_index = page->index + 1;
983 done = 1; 981 done = 1;
984 break; 982 break;
985 } 983 }
@@ -1042,11 +1040,17 @@ static int __writepage(struct page *page, struct writeback_control *wbc,
1042int generic_writepages(struct address_space *mapping, 1040int generic_writepages(struct address_space *mapping,
1043 struct writeback_control *wbc) 1041 struct writeback_control *wbc)
1044{ 1042{
1043 struct blk_plug plug;
1044 int ret;
1045
1045 /* deal with chardevs and other special file */ 1046 /* deal with chardevs and other special file */
1046 if (!mapping->a_ops->writepage) 1047 if (!mapping->a_ops->writepage)
1047 return 0; 1048 return 0;
1048 1049
1049 return write_cache_pages(mapping, wbc, __writepage, mapping); 1050 blk_start_plug(&plug);
1051 ret = write_cache_pages(mapping, wbc, __writepage, mapping);
1052 blk_finish_plug(&plug);
1053 return ret;
1050} 1054}
1051 1055
1052EXPORT_SYMBOL(generic_writepages); 1056EXPORT_SYMBOL(generic_writepages);
@@ -1109,7 +1113,7 @@ EXPORT_SYMBOL(write_one_page);
1109int __set_page_dirty_no_writeback(struct page *page) 1113int __set_page_dirty_no_writeback(struct page *page)
1110{ 1114{
1111 if (!PageDirty(page)) 1115 if (!PageDirty(page))
1112 SetPageDirty(page); 1116 return !TestSetPageDirty(page);
1113 return 0; 1117 return 0;
1114} 1118}
1115 1119
@@ -1121,6 +1125,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
1121{ 1125{
1122 if (mapping_cap_account_dirty(mapping)) { 1126 if (mapping_cap_account_dirty(mapping)) {
1123 __inc_zone_page_state(page, NR_FILE_DIRTY); 1127 __inc_zone_page_state(page, NR_FILE_DIRTY);
1128 __inc_zone_page_state(page, NR_DIRTIED);
1124 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); 1129 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1125 task_dirty_inc(current); 1130 task_dirty_inc(current);
1126 task_io_account_write(PAGE_CACHE_SIZE); 1131 task_io_account_write(PAGE_CACHE_SIZE);
@@ -1129,6 +1134,18 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
1129EXPORT_SYMBOL(account_page_dirtied); 1134EXPORT_SYMBOL(account_page_dirtied);
1130 1135
1131/* 1136/*
1137 * Helper function for set_page_writeback family.
1138 * NOTE: Unlike account_page_dirtied this does not rely on being atomic
1139 * wrt interrupts.
1140 */
1141void account_page_writeback(struct page *page)
1142{
1143 inc_zone_page_state(page, NR_WRITEBACK);
1144 inc_zone_page_state(page, NR_WRITTEN);
1145}
1146EXPORT_SYMBOL(account_page_writeback);
1147
1148/*
1132 * For address_spaces which do not use buffers. Just tag the page as dirty in 1149 * For address_spaces which do not use buffers. Just tag the page as dirty in
1133 * its radix tree. 1150 * its radix tree.
1134 * 1151 *
@@ -1201,6 +1218,17 @@ int set_page_dirty(struct page *page)
1201 1218
1202 if (likely(mapping)) { 1219 if (likely(mapping)) {
1203 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty; 1220 int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
1221 /*
1222 * readahead/lru_deactivate_page could remain
1223 * PG_readahead/PG_reclaim due to race with end_page_writeback
1224 * About readahead, if the page is written, the flags would be
1225 * reset. So no problem.
1226 * About lru_deactivate_page, if the page is redirty, the flag
1227 * will be reset. So no problem. but if the page is used by readahead
1228 * it will confuse readahead and make it restart the size rampup
1229 * process. But it's a trivial problem.
1230 */
1231 ClearPageReclaim(page);
1204#ifdef CONFIG_BLOCK 1232#ifdef CONFIG_BLOCK
1205 if (!spd) 1233 if (!spd)
1206 spd = __set_page_dirty_buffers; 1234 spd = __set_page_dirty_buffers;
@@ -1229,7 +1257,7 @@ int set_page_dirty_lock(struct page *page)
1229{ 1257{
1230 int ret; 1258 int ret;
1231 1259
1232 lock_page_nosync(page); 1260 lock_page(page);
1233 ret = set_page_dirty(page); 1261 ret = set_page_dirty(page);
1234 unlock_page(page); 1262 unlock_page(page);
1235 return ret; 1263 return ret;
@@ -1256,7 +1284,6 @@ int clear_page_dirty_for_io(struct page *page)
1256 1284
1257 BUG_ON(!PageLocked(page)); 1285 BUG_ON(!PageLocked(page));
1258 1286
1259 ClearPageReclaim(page);
1260 if (mapping && mapping_cap_account_dirty(mapping)) { 1287 if (mapping && mapping_cap_account_dirty(mapping)) {
1261 /* 1288 /*
1262 * Yes, Virginia, this is indeed insane. 1289 * Yes, Virginia, this is indeed insane.
@@ -1366,7 +1393,7 @@ int test_set_page_writeback(struct page *page)
1366 ret = TestSetPageWriteback(page); 1393 ret = TestSetPageWriteback(page);
1367 } 1394 }
1368 if (!ret) 1395 if (!ret)
1369 inc_zone_page_state(page, NR_WRITEBACK); 1396 account_page_writeback(page);
1370 return ret; 1397 return ret;
1371 1398
1372} 1399}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f12ad1836abe..4e8985acdab8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -21,6 +21,7 @@
21#include <linux/pagemap.h> 21#include <linux/pagemap.h>
22#include <linux/jiffies.h> 22#include <linux/jiffies.h>
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/memblock.h>
24#include <linux/compiler.h> 25#include <linux/compiler.h>
25#include <linux/kernel.h> 26#include <linux/kernel.h>
26#include <linux/kmemcheck.h> 27#include <linux/kmemcheck.h>
@@ -29,6 +30,7 @@
29#include <linux/pagevec.h> 30#include <linux/pagevec.h>
30#include <linux/blkdev.h> 31#include <linux/blkdev.h>
31#include <linux/slab.h> 32#include <linux/slab.h>
33#include <linux/ratelimit.h>
32#include <linux/oom.h> 34#include <linux/oom.h>
33#include <linux/notifier.h> 35#include <linux/notifier.h>
34#include <linux/topology.h> 36#include <linux/topology.h>
@@ -38,6 +40,7 @@
38#include <linux/memory_hotplug.h> 40#include <linux/memory_hotplug.h>
39#include <linux/nodemask.h> 41#include <linux/nodemask.h>
40#include <linux/vmalloc.h> 42#include <linux/vmalloc.h>
43#include <linux/vmstat.h>
41#include <linux/mempolicy.h> 44#include <linux/mempolicy.h>
42#include <linux/stop_machine.h> 45#include <linux/stop_machine.h>
43#include <linux/sort.h> 46#include <linux/sort.h>
@@ -52,6 +55,8 @@
52#include <linux/compaction.h> 55#include <linux/compaction.h>
53#include <trace/events/kmem.h> 56#include <trace/events/kmem.h>
54#include <linux/ftrace_event.h> 57#include <linux/ftrace_event.h>
58#include <linux/memcontrol.h>
59#include <linux/prefetch.h>
55 60
56#include <asm/tlbflush.h> 61#include <asm/tlbflush.h>
57#include <asm/div64.h> 62#include <asm/div64.h>
@@ -103,19 +108,24 @@ gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
103 * only be modified with pm_mutex held, unless the suspend/hibernate code is 108 * only be modified with pm_mutex held, unless the suspend/hibernate code is
104 * guaranteed not to run in parallel with that modification). 109 * guaranteed not to run in parallel with that modification).
105 */ 110 */
106void set_gfp_allowed_mask(gfp_t mask) 111
112static gfp_t saved_gfp_mask;
113
114void pm_restore_gfp_mask(void)
107{ 115{
108 WARN_ON(!mutex_is_locked(&pm_mutex)); 116 WARN_ON(!mutex_is_locked(&pm_mutex));
109 gfp_allowed_mask = mask; 117 if (saved_gfp_mask) {
118 gfp_allowed_mask = saved_gfp_mask;
119 saved_gfp_mask = 0;
120 }
110} 121}
111 122
112gfp_t clear_gfp_allowed_mask(gfp_t mask) 123void pm_restrict_gfp_mask(void)
113{ 124{
114 gfp_t ret = gfp_allowed_mask;
115
116 WARN_ON(!mutex_is_locked(&pm_mutex)); 125 WARN_ON(!mutex_is_locked(&pm_mutex));
117 gfp_allowed_mask &= ~mask; 126 WARN_ON(saved_gfp_mask);
118 return ret; 127 saved_gfp_mask = gfp_allowed_mask;
128 gfp_allowed_mask &= ~GFP_IOFS;
119} 129}
120#endif /* CONFIG_PM_SLEEP */ 130#endif /* CONFIG_PM_SLEEP */
121 131
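
Instead of handing a mask around, the suspend code now calls pm_restrict_gfp_mask() to stash gfp_allowed_mask and strip GFP_IOFS, and pm_restore_gfp_mask() to put the saved value back only if a restrict actually happened. A userspace sketch of that save/strip/restore pattern, with made-up flag bits standing in for the real GFP values:

#include <stdio.h>
#include <assert.h>

/* Placeholder flag bits; the real values live in <linux/gfp.h>. */
#define FAKE_GFP_IO   0x1u
#define FAKE_GFP_FS   0x2u
#define FAKE_GFP_WAIT 0x4u
#define FAKE_GFP_IOFS (FAKE_GFP_IO | FAKE_GFP_FS)

static unsigned int gfp_allowed_mask = FAKE_GFP_IO | FAKE_GFP_FS | FAKE_GFP_WAIT;
static unsigned int saved_gfp_mask;

static void pm_restrict(void)
{
    assert(!saved_gfp_mask);            /* analogue of WARN_ON(saved_gfp_mask) */
    saved_gfp_mask = gfp_allowed_mask;  /* remember what was allowed */
    gfp_allowed_mask &= ~FAKE_GFP_IOFS; /* no IO/FS while devices are asleep */
}

static void pm_restore(void)
{
    if (saved_gfp_mask) {               /* restore only if a restrict happened */
        gfp_allowed_mask = saved_gfp_mask;
        saved_gfp_mask = 0;
    }
}

int main(void)
{
    printf("before suspend: %#x\n", gfp_allowed_mask);
    pm_restrict();
    printf("during suspend: %#x (no IO/FS)\n", gfp_allowed_mask);
    pm_restore();
    printf("after resume:   %#x\n", gfp_allowed_mask);
    return 0;
}
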
@@ -280,7 +290,7 @@ static void bad_page(struct page *page)
280 290
281 /* Don't complain about poisoned pages */ 291 /* Don't complain about poisoned pages */
282 if (PageHWPoison(page)) { 292 if (PageHWPoison(page)) {
283 __ClearPageBuddy(page); 293 reset_page_mapcount(page); /* remove PageBuddy */
284 return; 294 return;
285 } 295 }
286 296
@@ -311,7 +321,7 @@ static void bad_page(struct page *page)
311 dump_stack(); 321 dump_stack();
312out: 322out:
313 /* Leave bad fields for debug, except PageBuddy could make trouble */ 323 /* Leave bad fields for debug, except PageBuddy could make trouble */
314 __ClearPageBuddy(page); 324 reset_page_mapcount(page); /* remove PageBuddy */
315 add_taint(TAINT_BAD_PAGE); 325 add_taint(TAINT_BAD_PAGE);
316} 326}
317 327
@@ -351,6 +361,7 @@ void prep_compound_page(struct page *page, unsigned long order)
351 } 361 }
352} 362}
353 363
364/* update __split_huge_page_refcount if you change this function */
354static int destroy_compound_page(struct page *page, unsigned long order) 365static int destroy_compound_page(struct page *page, unsigned long order)
355{ 366{
356 int i; 367 int i;
@@ -420,18 +431,10 @@ static inline void rmv_page_order(struct page *page)
420 * 431 *
421 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER 432 * Assumption: *_mem_map is contiguous at least up to MAX_ORDER
422 */ 433 */
423static inline struct page *
424__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
425{
426 unsigned long buddy_idx = page_idx ^ (1 << order);
427
428 return page + (buddy_idx - page_idx);
429}
430
431static inline unsigned long 434static inline unsigned long
432__find_combined_index(unsigned long page_idx, unsigned int order) 435__find_buddy_index(unsigned long page_idx, unsigned int order)
433{ 436{
434 return (page_idx & ~(1 << order)); 437 return page_idx ^ (1 << order);
435} 438}
436 439
437/* 440/*
@@ -442,8 +445,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
442 * (c) a page and its buddy have the same order && 445 * (c) a page and its buddy have the same order &&
443 * (d) a page and its buddy are in the same zone. 446 * (d) a page and its buddy are in the same zone.
444 * 447 *
445 * For recording whether a page is in the buddy system, we use PG_buddy. 448 * For recording whether a page is in the buddy system, we set ->_mapcount -2.
446 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 449 * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock.
447 * 450 *
448 * For recording page's order, we use page_private(page). 451 * For recording page's order, we use page_private(page).
449 */ 452 */
@@ -476,7 +479,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
476 * as necessary, plus some accounting needed to play nicely with other 479 * as necessary, plus some accounting needed to play nicely with other
477 * parts of the VM system. 480 * parts of the VM system.
478 * At each level, we keep a list of pages, which are heads of continuous 481 * At each level, we keep a list of pages, which are heads of continuous
479 * free pages of length of (1 << order) and marked with PG_buddy. Page's 482 * free pages of length of (1 << order) and marked with _mapcount -2. Page's
480 * order is recorded in page_private(page) field. 483 * order is recorded in page_private(page) field.
481 * So when we are allocating or freeing one, we can derive the state of the 484 * So when we are allocating or freeing one, we can derive the state of the
482 * other. That is, if we allocate a small block, and both were 485 * other. That is, if we allocate a small block, and both were
@@ -493,6 +496,7 @@ static inline void __free_one_page(struct page *page,
493{ 496{
494 unsigned long page_idx; 497 unsigned long page_idx;
495 unsigned long combined_idx; 498 unsigned long combined_idx;
499 unsigned long uninitialized_var(buddy_idx);
496 struct page *buddy; 500 struct page *buddy;
497 501
498 if (unlikely(PageCompound(page))) 502 if (unlikely(PageCompound(page)))
@@ -507,7 +511,8 @@ static inline void __free_one_page(struct page *page,
507 VM_BUG_ON(bad_range(zone, page)); 511 VM_BUG_ON(bad_range(zone, page));
508 512
509 while (order < MAX_ORDER-1) { 513 while (order < MAX_ORDER-1) {
510 buddy = __page_find_buddy(page, page_idx, order); 514 buddy_idx = __find_buddy_index(page_idx, order);
515 buddy = page + (buddy_idx - page_idx);
511 if (!page_is_buddy(page, buddy, order)) 516 if (!page_is_buddy(page, buddy, order))
512 break; 517 break;
513 518
@@ -515,7 +520,7 @@ static inline void __free_one_page(struct page *page,
515 list_del(&buddy->lru); 520 list_del(&buddy->lru);
516 zone->free_area[order].nr_free--; 521 zone->free_area[order].nr_free--;
517 rmv_page_order(buddy); 522 rmv_page_order(buddy);
518 combined_idx = __find_combined_index(page_idx, order); 523 combined_idx = buddy_idx & page_idx;
519 page = page + (combined_idx - page_idx); 524 page = page + (combined_idx - page_idx);
520 page_idx = combined_idx; 525 page_idx = combined_idx;
521 order++; 526 order++;
@@ -530,11 +535,12 @@ static inline void __free_one_page(struct page *page,
530 * so it's less likely to be used soon and more likely to be merged 535 * so it's less likely to be used soon and more likely to be merged
531 * as a higher order page 536 * as a higher order page
532 */ 537 */
533 if ((order < MAX_ORDER-1) && pfn_valid_within(page_to_pfn(buddy))) { 538 if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) {
534 struct page *higher_page, *higher_buddy; 539 struct page *higher_page, *higher_buddy;
535 combined_idx = __find_combined_index(page_idx, order); 540 combined_idx = buddy_idx & page_idx;
536 higher_page = page + combined_idx - page_idx; 541 higher_page = page + (combined_idx - page_idx);
537 higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); 542 buddy_idx = __find_buddy_index(combined_idx, order + 1);
543 higher_buddy = page + (buddy_idx - combined_idx);
538 if (page_is_buddy(higher_page, higher_buddy, order + 1)) { 544 if (page_is_buddy(higher_page, higher_buddy, order + 1)) {
539 list_add_tail(&page->lru, 545 list_add_tail(&page->lru,
540 &zone->free_area[order].free_list[migratetype]); 546 &zone->free_area[order].free_list[migratetype]);
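
Both of these hunks lean on the same two identities: a block's buddy differs from it only in bit 'order', so buddy_idx = page_idx ^ (1 << order), and the merged block starts at the lower of the two indices, which is buddy_idx & page_idx. A standalone demonstration of the arithmetic on plain integers (no struct page involved):

#include <stdio.h>

static unsigned long find_buddy_index(unsigned long page_idx, unsigned int order)
{
    return page_idx ^ (1UL << order);          /* flip the 'order' bit */
}

int main(void)
{
    unsigned long page_idx = 12;               /* a free block starting at index 12 */

    for (unsigned int order = 2; order < 5; order++) {
        unsigned long buddy_idx = find_buddy_index(page_idx, order);
        unsigned long combined  = buddy_idx & page_idx;  /* start of merged block */

        printf("order %u: page %2lu buddy %2lu -> merged block at %2lu\n",
               order, page_idx, buddy_idx, combined);
        page_idx = combined;                   /* keep merging upwards */
    }
    return 0;
}

With page_idx 12, the order-2 merge lands at index 8, the order-3 merge at index 0, and so on up the free lists, which is exactly the walk __free_one_page() performs.
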
@@ -563,7 +569,8 @@ static inline int free_pages_check(struct page *page)
563 if (unlikely(page_mapcount(page) | 569 if (unlikely(page_mapcount(page) |
564 (page->mapping != NULL) | 570 (page->mapping != NULL) |
565 (atomic_read(&page->_count) != 0) | 571 (atomic_read(&page->_count) != 0) |
566 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { 572 (page->flags & PAGE_FLAGS_CHECK_AT_FREE) |
573 (mem_cgroup_bad_page_check(page)))) {
567 bad_page(page); 574 bad_page(page);
568 return 1; 575 return 1;
569 } 576 }
@@ -612,6 +619,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
612 list = &pcp->lists[migratetype]; 619 list = &pcp->lists[migratetype];
613 } while (list_empty(list)); 620 } while (list_empty(list));
614 621
622 /* This is the only non-empty list. Free them all. */
623 if (batch_free == MIGRATE_PCPTYPES)
624 batch_free = to_free;
625
615 do { 626 do {
616 page = list_entry(list->prev, struct page, lru); 627 page = list_entry(list->prev, struct page, lru);
617 /* must delete as __free_one_page list manipulates */ 628 /* must delete as __free_one_page list manipulates */
@@ -645,13 +656,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
645 trace_mm_page_free_direct(page, order); 656 trace_mm_page_free_direct(page, order);
646 kmemcheck_free_shadow(page, order); 657 kmemcheck_free_shadow(page, order);
647 658
648 for (i = 0; i < (1 << order); i++) { 659 if (PageAnon(page))
649 struct page *pg = page + i; 660 page->mapping = NULL;
650 661 for (i = 0; i < (1 << order); i++)
651 if (PageAnon(pg)) 662 bad += free_pages_check(page + i);
652 pg->mapping = NULL;
653 bad += free_pages_check(pg);
654 }
655 if (bad) 663 if (bad)
656 return false; 664 return false;
657 665
@@ -751,7 +759,8 @@ static inline int check_new_page(struct page *page)
751 if (unlikely(page_mapcount(page) | 759 if (unlikely(page_mapcount(page) |
752 (page->mapping != NULL) | 760 (page->mapping != NULL) |
753 (atomic_read(&page->_count) != 0) | 761 (atomic_read(&page->_count) != 0) |
754 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { 762 (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
763 (mem_cgroup_bad_page_check(page)))) {
755 bad_page(page); 764 bad_page(page);
756 return 1; 765 return 1;
757 } 766 }
@@ -864,9 +873,8 @@ static int move_freepages(struct zone *zone,
864 } 873 }
865 874
866 order = page_order(page); 875 order = page_order(page);
867 list_del(&page->lru); 876 list_move(&page->lru,
868 list_add(&page->lru, 877 &zone->free_area[order].free_list[migratetype]);
869 &zone->free_area[order].free_list[migratetype]);
870 page += 1 << order; 878 page += 1 << order;
871 pages_moved += 1 << order; 879 pages_moved += 1 << order;
872 } 880 }
@@ -937,7 +945,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
937 * If breaking a large block of pages, move all free 945 * If breaking a large block of pages, move all free
938 * pages to the preferred allocation list. If falling 946 * pages to the preferred allocation list. If falling
939 * back for a reclaimable kernel allocation, be more 947 * back for a reclaimable kernel allocation, be more
940 * agressive about taking ownership of free pages 948 * aggressive about taking ownership of free pages
941 */ 949 */
942 if (unlikely(current_order >= (pageblock_order >> 1)) || 950 if (unlikely(current_order >= (pageblock_order >> 1)) ||
943 start_migratetype == MIGRATE_RECLAIMABLE || 951 start_migratetype == MIGRATE_RECLAIMABLE ||
@@ -1089,8 +1097,10 @@ static void drain_pages(unsigned int cpu)
1089 pset = per_cpu_ptr(zone->pageset, cpu); 1097 pset = per_cpu_ptr(zone->pageset, cpu);
1090 1098
1091 pcp = &pset->pcp; 1099 pcp = &pset->pcp;
1092 free_pcppages_bulk(zone, pcp->count, pcp); 1100 if (pcp->count) {
1093 pcp->count = 0; 1101 free_pcppages_bulk(zone, pcp->count, pcp);
1102 pcp->count = 0;
1103 }
1094 local_irq_restore(flags); 1104 local_irq_restore(flags);
1095 } 1105 }
1096} 1106}
@@ -1332,7 +1342,7 @@ again:
1332 } 1342 }
1333 1343
1334 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1344 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1335 zone_statistics(preferred_zone, zone); 1345 zone_statistics(preferred_zone, zone, gfp_flags);
1336 local_irq_restore(flags); 1346 local_irq_restore(flags);
1337 1347
1338 VM_BUG_ON(bad_range(zone, page)); 1348 VM_BUG_ON(bad_range(zone, page));
@@ -1454,24 +1464,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
1454#endif /* CONFIG_FAIL_PAGE_ALLOC */ 1464#endif /* CONFIG_FAIL_PAGE_ALLOC */
1455 1465
1456/* 1466/*
1457 * Return 1 if free pages are above 'mark'. This takes into account the order 1467 * Return true if free pages are above 'mark'. This takes into account the order
1458 * of the allocation. 1468 * of the allocation.
1459 */ 1469 */
1460int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 1470static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1461 int classzone_idx, int alloc_flags) 1471 int classzone_idx, int alloc_flags, long free_pages)
1462{ 1472{
1463 /* free_pages my go negative - that's OK */ 1473 /* free_pages my go negative - that's OK */
1464 long min = mark; 1474 long min = mark;
1465 long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
1466 int o; 1475 int o;
1467 1476
1477 free_pages -= (1 << order) + 1;
1468 if (alloc_flags & ALLOC_HIGH) 1478 if (alloc_flags & ALLOC_HIGH)
1469 min -= min / 2; 1479 min -= min / 2;
1470 if (alloc_flags & ALLOC_HARDER) 1480 if (alloc_flags & ALLOC_HARDER)
1471 min -= min / 4; 1481 min -= min / 4;
1472 1482
1473 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 1483 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
1474 return 0; 1484 return false;
1475 for (o = 0; o < order; o++) { 1485 for (o = 0; o < order; o++) {
1476 /* At the next order, this order's pages become unavailable */ 1486 /* At the next order, this order's pages become unavailable */
1477 free_pages -= z->free_area[o].nr_free << o; 1487 free_pages -= z->free_area[o].nr_free << o;
@@ -1480,9 +1490,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1480 min >>= 1; 1490 min >>= 1;
1481 1491
1482 if (free_pages <= min) 1492 if (free_pages <= min)
1483 return 0; 1493 return false;
1484 } 1494 }
1485 return 1; 1495 return true;
1496}
1497
1498bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1499 int classzone_idx, int alloc_flags)
1500{
1501 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1502 zone_page_state(z, NR_FREE_PAGES));
1503}
1504
1505bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
1506 int classzone_idx, int alloc_flags)
1507{
1508 long free_pages = zone_page_state(z, NR_FREE_PAGES);
1509
1510 if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
1511 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
1512
1513 return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
1514 free_pages);
1486} 1515}
1487 1516
1488#ifdef CONFIG_NUMA 1517#ifdef CONFIG_NUMA
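
zone_watermark_ok() and zone_watermark_ok_safe() now share __zone_watermark_ok(), which takes the free-page count as an argument so the _safe variant can pass a drift-corrected snapshot. The check itself is unchanged: set aside the requested block, then for each lower order subtract the pages that are too small to help and insist the remainder still clears a halving minimum. The sketch below reproduces that loop in userspace, with the block reservation simplified to a plain 2^order subtraction and made-up free-area counts:

#include <stdio.h>
#include <stdbool.h>

#define MAX_ORDER 11

/* 'mark' is the watermark in pages; free_area[o] counts free blocks of
 * order o; free_pages is the zone-wide free page count. */
static bool watermark_ok(unsigned int order, long mark, long lowmem_reserve,
                         long free_pages, const long free_area[MAX_ORDER])
{
    long min = mark;

    free_pages -= 1L << order;           /* the block this allocation would take */
    if (free_pages <= min + lowmem_reserve)
        return false;

    for (unsigned int o = 0; o < order; o++) {
        free_pages -= free_area[o] << o; /* blocks of order o cannot serve us */
        min >>= 1;                       /* the bar halves at each order */
        if (free_pages <= min)
            return false;
    }
    return true;
}

int main(void)
{
    /* Mostly order-0 pages: plenty of memory, little of it contiguous. */
    long free_area[MAX_ORDER] = { 900, 200, 40, 8, 2, 1 };
    long free_pages = 900 + 200*2 + 40*4 + 8*8 + 2*16 + 1*32;   /* 1588 */

    printf("order-0 request passes: %d\n",
           watermark_ok(0, 1024, 0, free_pages, free_area));
    printf("order-3 request passes: %d\n",
           watermark_ok(3, 1024, 0, free_pages, free_area));
    return 0;
}

The point of the example is that an order-3 request can fail the watermark even though the zone holds plenty of order-0 pages.
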
@@ -1694,6 +1723,59 @@ try_next_zone:
1694 return page; 1723 return page;
1695} 1724}
1696 1725
1726/*
1727 * Large machines with many possible nodes should not always dump per-node
1728 * meminfo in irq context.
1729 */
1730static inline bool should_suppress_show_mem(void)
1731{
1732 bool ret = false;
1733
1734#if NODES_SHIFT > 8
1735 ret = in_interrupt();
1736#endif
1737 return ret;
1738}
1739
1740static DEFINE_RATELIMIT_STATE(nopage_rs,
1741 DEFAULT_RATELIMIT_INTERVAL,
1742 DEFAULT_RATELIMIT_BURST);
1743
1744void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
1745{
1746 va_list args;
1747 unsigned int filter = SHOW_MEM_FILTER_NODES;
1748
1749 if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
1750 return;
1751
1752 /*
1753 * This documents exceptions given to allocations in certain
1754 * contexts that are allowed to allocate outside current's set
1755 * of allowed nodes.
1756 */
1757 if (!(gfp_mask & __GFP_NOMEMALLOC))
1758 if (test_thread_flag(TIF_MEMDIE) ||
1759 (current->flags & (PF_MEMALLOC | PF_EXITING)))
1760 filter &= ~SHOW_MEM_FILTER_NODES;
1761 if (in_interrupt() || !(gfp_mask & __GFP_WAIT))
1762 filter &= ~SHOW_MEM_FILTER_NODES;
1763
1764 if (fmt) {
1765 printk(KERN_WARNING);
1766 va_start(args, fmt);
1767 vprintk(fmt, args);
1768 va_end(args);
1769 }
1770
1771 pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n",
1772 current->comm, order, gfp_mask);
1773
1774 dump_stack();
1775 if (!should_suppress_show_mem())
1776 show_mem(filter);
1777}
1778
1697static inline int 1779static inline int
1698should_alloc_retry(gfp_t gfp_mask, unsigned int order, 1780should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1699 unsigned long pages_reclaimed) 1781 unsigned long pages_reclaimed)
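
Allocation-failure reporting now funnels through warn_alloc_failed(), which honours __GFP_NOWARN, rate-limits the message, and filters show_mem() to the allowed nodes unless the context may legitimately allocate elsewhere. The interval-plus-burst gate it relies on comes from <linux/ratelimit.h>; the userspace model below only approximates that behaviour and is not the kernel implementation:

#include <stdio.h>
#include <time.h>

/* Allow at most 'burst' messages per 'interval' seconds. */
struct ratelimit {
    time_t interval;
    int    burst;
    time_t window_start;
    int    printed;
    int    missed;
};

static int ratelimit_ok(struct ratelimit *rs)
{
    time_t now = time(NULL);

    if (now - rs->window_start >= rs->interval) {
        if (rs->missed)
            fprintf(stderr, "%d callbacks suppressed\n", rs->missed);
        rs->window_start = now;   /* open a new window */
        rs->printed = 0;
        rs->missed = 0;
    }
    if (rs->printed < rs->burst) {
        rs->printed++;
        return 1;                 /* caller may print */
    }
    rs->missed++;
    return 0;                     /* suppress this one */
}

int main(void)
{
    struct ratelimit rs = { .interval = 5, .burst = 3 };

    for (int i = 0; i < 10; i++)
        if (ratelimit_ok(&rs))
            printf("page allocation failure: order:%d\n", i);
    return 0;
}
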
@@ -1787,15 +1869,18 @@ static struct page *
1787__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1869__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1788 struct zonelist *zonelist, enum zone_type high_zoneidx, 1870 struct zonelist *zonelist, enum zone_type high_zoneidx,
1789 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1871 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1790 int migratetype, unsigned long *did_some_progress) 1872 int migratetype, unsigned long *did_some_progress,
1873 bool sync_migration)
1791{ 1874{
1792 struct page *page; 1875 struct page *page;
1793 1876
1794 if (!order || compaction_deferred(preferred_zone)) 1877 if (!order || compaction_deferred(preferred_zone))
1795 return NULL; 1878 return NULL;
1796 1879
1880 current->flags |= PF_MEMALLOC;
1797 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, 1881 *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
1798 nodemask); 1882 nodemask, sync_migration);
1883 current->flags &= ~PF_MEMALLOC;
1799 if (*did_some_progress != COMPACT_SKIPPED) { 1884 if (*did_some_progress != COMPACT_SKIPPED) {
1800 1885
1801 /* Page migration frees to the PCP lists but we want merging */ 1886 /* Page migration frees to the PCP lists but we want merging */
@@ -1831,7 +1916,8 @@ static inline struct page *
1831__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, 1916__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
1832 struct zonelist *zonelist, enum zone_type high_zoneidx, 1917 struct zonelist *zonelist, enum zone_type high_zoneidx,
1833 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, 1918 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1834 int migratetype, unsigned long *did_some_progress) 1919 int migratetype, unsigned long *did_some_progress,
1920 bool sync_migration)
1835{ 1921{
1836 return NULL; 1922 return NULL;
1837} 1923}
@@ -1846,23 +1932,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1846{ 1932{
1847 struct page *page = NULL; 1933 struct page *page = NULL;
1848 struct reclaim_state reclaim_state; 1934 struct reclaim_state reclaim_state;
1849 struct task_struct *p = current;
1850 bool drained = false; 1935 bool drained = false;
1851 1936
1852 cond_resched(); 1937 cond_resched();
1853 1938
1854 /* We now go into synchronous reclaim */ 1939 /* We now go into synchronous reclaim */
1855 cpuset_memory_pressure_bump(); 1940 cpuset_memory_pressure_bump();
1856 p->flags |= PF_MEMALLOC; 1941 current->flags |= PF_MEMALLOC;
1857 lockdep_set_current_reclaim_state(gfp_mask); 1942 lockdep_set_current_reclaim_state(gfp_mask);
1858 reclaim_state.reclaimed_slab = 0; 1943 reclaim_state.reclaimed_slab = 0;
1859 p->reclaim_state = &reclaim_state; 1944 current->reclaim_state = &reclaim_state;
1860 1945
1861 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); 1946 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1862 1947
1863 p->reclaim_state = NULL; 1948 current->reclaim_state = NULL;
1864 lockdep_clear_current_reclaim_state(); 1949 lockdep_clear_current_reclaim_state();
1865 p->flags &= ~PF_MEMALLOC; 1950 current->flags &= ~PF_MEMALLOC;
1866 1951
1867 cond_resched(); 1952 cond_resched();
1868 1953
@@ -1906,7 +1991,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1906 preferred_zone, migratetype); 1991 preferred_zone, migratetype);
1907 1992
1908 if (!page && gfp_mask & __GFP_NOFAIL) 1993 if (!page && gfp_mask & __GFP_NOFAIL)
1909 congestion_wait(BLK_RW_ASYNC, HZ/50); 1994 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
1910 } while (!page && (gfp_mask & __GFP_NOFAIL)); 1995 } while (!page && (gfp_mask & __GFP_NOFAIL));
1911 1996
1912 return page; 1997 return page;
@@ -1914,24 +1999,24 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1914 1999
1915static inline 2000static inline
1916void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, 2001void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1917 enum zone_type high_zoneidx) 2002 enum zone_type high_zoneidx,
2003 enum zone_type classzone_idx)
1918{ 2004{
1919 struct zoneref *z; 2005 struct zoneref *z;
1920 struct zone *zone; 2006 struct zone *zone;
1921 2007
1922 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 2008 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1923 wakeup_kswapd(zone, order); 2009 wakeup_kswapd(zone, order, classzone_idx);
1924} 2010}
1925 2011
1926static inline int 2012static inline int
1927gfp_to_alloc_flags(gfp_t gfp_mask) 2013gfp_to_alloc_flags(gfp_t gfp_mask)
1928{ 2014{
1929 struct task_struct *p = current;
1930 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; 2015 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1931 const gfp_t wait = gfp_mask & __GFP_WAIT; 2016 const gfp_t wait = gfp_mask & __GFP_WAIT;
1932 2017
1933 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ 2018 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
1934 BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); 2019 BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
1935 2020
1936 /* 2021 /*
1937 * The caller may dip into page reserves a bit more if the caller 2022 * The caller may dip into page reserves a bit more if the caller
@@ -1939,21 +2024,26 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
1939 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will 2024 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1940 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). 2025 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1941 */ 2026 */
1942 alloc_flags |= (gfp_mask & __GFP_HIGH); 2027 alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
1943 2028
1944 if (!wait) { 2029 if (!wait) {
1945 alloc_flags |= ALLOC_HARDER; 2030 /*
2031 * Not worth trying to allocate harder for
2032 * __GFP_NOMEMALLOC even if it can't schedule.
2033 */
2034 if (!(gfp_mask & __GFP_NOMEMALLOC))
2035 alloc_flags |= ALLOC_HARDER;
1946 /* 2036 /*
1947 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 2037 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1948 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 2038 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1949 */ 2039 */
1950 alloc_flags &= ~ALLOC_CPUSET; 2040 alloc_flags &= ~ALLOC_CPUSET;
1951 } else if (unlikely(rt_task(p)) && !in_interrupt()) 2041 } else if (unlikely(rt_task(current)) && !in_interrupt())
1952 alloc_flags |= ALLOC_HARDER; 2042 alloc_flags |= ALLOC_HARDER;
1953 2043
1954 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { 2044 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1955 if (!in_interrupt() && 2045 if (!in_interrupt() &&
1956 ((p->flags & PF_MEMALLOC) || 2046 ((current->flags & PF_MEMALLOC) ||
1957 unlikely(test_thread_flag(TIF_MEMDIE)))) 2047 unlikely(test_thread_flag(TIF_MEMDIE))))
1958 alloc_flags |= ALLOC_NO_WATERMARKS; 2048 alloc_flags |= ALLOC_NO_WATERMARKS;
1959 } 2049 }
@@ -1972,7 +2062,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1972 int alloc_flags; 2062 int alloc_flags;
1973 unsigned long pages_reclaimed = 0; 2063 unsigned long pages_reclaimed = 0;
1974 unsigned long did_some_progress; 2064 unsigned long did_some_progress;
1975 struct task_struct *p = current; 2065 bool sync_migration = false;
1976 2066
1977 /* 2067 /*
1978 * In the slowpath, we sanity check order to avoid ever trying to 2068 * In the slowpath, we sanity check order to avoid ever trying to
@@ -1997,7 +2087,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1997 goto nopage; 2087 goto nopage;
1998 2088
1999restart: 2089restart:
2000 wake_all_kswapd(order, zonelist, high_zoneidx); 2090 if (!(gfp_mask & __GFP_NO_KSWAPD))
2091 wake_all_kswapd(order, zonelist, high_zoneidx,
2092 zone_idx(preferred_zone));
2001 2093
2002 /* 2094 /*
2003 * OK, we're below the kswapd watermark and have kicked background 2095 * OK, we're below the kswapd watermark and have kicked background
@@ -2006,6 +2098,15 @@ restart:
2006 */ 2098 */
2007 alloc_flags = gfp_to_alloc_flags(gfp_mask); 2099 alloc_flags = gfp_to_alloc_flags(gfp_mask);
2008 2100
2101 /*
2102 * Find the true preferred zone if the allocation is unconstrained by
2103 * cpusets.
2104 */
2105 if (!(alloc_flags & ALLOC_CPUSET) && !nodemask)
2106 first_zones_zonelist(zonelist, high_zoneidx, NULL,
2107 &preferred_zone);
2108
2109rebalance:
2009 /* This is the last chance, in general, before the goto nopage. */ 2110 /* This is the last chance, in general, before the goto nopage. */
2010 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 2111 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
2011 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, 2112 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2013,7 +2114,6 @@ restart:
2013 if (page) 2114 if (page)
2014 goto got_pg; 2115 goto got_pg;
2015 2116
2016rebalance:
2017 /* Allocate without watermarks if the context allows */ 2117 /* Allocate without watermarks if the context allows */
2018 if (alloc_flags & ALLOC_NO_WATERMARKS) { 2118 if (alloc_flags & ALLOC_NO_WATERMARKS) {
2019 page = __alloc_pages_high_priority(gfp_mask, order, 2119 page = __alloc_pages_high_priority(gfp_mask, order,
@@ -2028,21 +2128,26 @@ rebalance:
2028 goto nopage; 2128 goto nopage;
2029 2129
2030 /* Avoid recursion of direct reclaim */ 2130 /* Avoid recursion of direct reclaim */
2031 if (p->flags & PF_MEMALLOC) 2131 if (current->flags & PF_MEMALLOC)
2032 goto nopage; 2132 goto nopage;
2033 2133
2034 /* Avoid allocations with no watermarks from looping endlessly */ 2134 /* Avoid allocations with no watermarks from looping endlessly */
2035 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) 2135 if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
2036 goto nopage; 2136 goto nopage;
2037 2137
2038 /* Try direct compaction */ 2138 /*
2139 * Try direct compaction. The first pass is asynchronous. Subsequent
2140 * attempts after direct reclaim are synchronous
2141 */
2039 page = __alloc_pages_direct_compact(gfp_mask, order, 2142 page = __alloc_pages_direct_compact(gfp_mask, order,
2040 zonelist, high_zoneidx, 2143 zonelist, high_zoneidx,
2041 nodemask, 2144 nodemask,
2042 alloc_flags, preferred_zone, 2145 alloc_flags, preferred_zone,
2043 migratetype, &did_some_progress); 2146 migratetype, &did_some_progress,
2147 sync_migration);
2044 if (page) 2148 if (page)
2045 goto got_pg; 2149 goto got_pg;
2150 sync_migration = true;
2046 2151
2047 /* Try direct reclaim and then allocating */ 2152 /* Try direct reclaim and then allocating */
2048 page = __alloc_pages_direct_reclaim(gfp_mask, order, 2153 page = __alloc_pages_direct_reclaim(gfp_mask, order,
@@ -2094,18 +2199,26 @@ rebalance:
2094 pages_reclaimed += did_some_progress; 2199 pages_reclaimed += did_some_progress;
2095 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { 2200 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
2096 /* Wait for some write requests to complete then retry */ 2201 /* Wait for some write requests to complete then retry */
2097 congestion_wait(BLK_RW_ASYNC, HZ/50); 2202 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
2098 goto rebalance; 2203 goto rebalance;
2204 } else {
2205 /*
2206 * High-order allocations do not necessarily loop after
2207 * direct reclaim and reclaim/compaction depends on compaction
2208 * being called after reclaim so call directly if necessary
2209 */
2210 page = __alloc_pages_direct_compact(gfp_mask, order,
2211 zonelist, high_zoneidx,
2212 nodemask,
2213 alloc_flags, preferred_zone,
2214 migratetype, &did_some_progress,
2215 sync_migration);
2216 if (page)
2217 goto got_pg;
2099 } 2218 }
2100 2219
2101nopage: 2220nopage:
2102 if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { 2221 warn_alloc_failed(gfp_mask, order, NULL);
2103 printk(KERN_WARNING "%s: page allocation failure."
2104 " order:%d, mode:0x%x\n",
2105 p->comm, order, gfp_mask);
2106 dump_stack();
2107 show_mem();
2108 }
2109 return page; 2222 return page;
2110got_pg: 2223got_pg:
2111 if (kmemcheck_enabled) 2224 if (kmemcheck_enabled)
@@ -2145,7 +2258,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
2145 2258
2146 get_mems_allowed(); 2259 get_mems_allowed();
2147 /* The preferred zone is used for statistics later */ 2260 /* The preferred zone is used for statistics later */
2148 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); 2261 first_zones_zonelist(zonelist, high_zoneidx,
2262 nodemask ? : &cpuset_current_mems_allowed,
2263 &preferred_zone);
2149 if (!preferred_zone) { 2264 if (!preferred_zone) {
2150 put_mems_allowed(); 2265 put_mems_allowed();
2151 return NULL; 2266 return NULL;
@@ -2224,6 +2339,21 @@ void free_pages(unsigned long addr, unsigned int order)
2224 2339
2225EXPORT_SYMBOL(free_pages); 2340EXPORT_SYMBOL(free_pages);
2226 2341
2342static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
2343{
2344 if (addr) {
2345 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2346 unsigned long used = addr + PAGE_ALIGN(size);
2347
2348 split_page(virt_to_page((void *)addr), order);
2349 while (used < alloc_end) {
2350 free_page(used);
2351 used += PAGE_SIZE;
2352 }
2353 }
2354 return (void *)addr;
2355}
2356
2227/** 2357/**
2228 * alloc_pages_exact - allocate an exact number physically-contiguous pages. 2358 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
2229 * @size: the number of bytes to allocate 2359 * @size: the number of bytes to allocate
@@ -2243,22 +2373,33 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
2243 unsigned long addr; 2373 unsigned long addr;
2244 2374
2245 addr = __get_free_pages(gfp_mask, order); 2375 addr = __get_free_pages(gfp_mask, order);
2246 if (addr) { 2376 return make_alloc_exact(addr, order, size);
2247 unsigned long alloc_end = addr + (PAGE_SIZE << order);
2248 unsigned long used = addr + PAGE_ALIGN(size);
2249
2250 split_page(virt_to_page((void *)addr), order);
2251 while (used < alloc_end) {
2252 free_page(used);
2253 used += PAGE_SIZE;
2254 }
2255 }
2256
2257 return (void *)addr;
2258} 2377}
2259EXPORT_SYMBOL(alloc_pages_exact); 2378EXPORT_SYMBOL(alloc_pages_exact);
2260 2379
2261/** 2380/**
2381 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
2382 * pages on a node.
2383 * @nid: the preferred node ID where memory should be allocated
2384 * @size: the number of bytes to allocate
2385 * @gfp_mask: GFP flags for the allocation
2386 *
2387 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
2388 * back.
2389 * Note this is not alloc_pages_exact_node() which allocates on a specific node,
2390 * but is not exact.
2391 */
2392void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
2393{
2394 unsigned order = get_order(size);
2395 struct page *p = alloc_pages_node(nid, gfp_mask, order);
2396 if (!p)
2397 return NULL;
2398 return make_alloc_exact((unsigned long)page_address(p), order, size);
2399}
2400EXPORT_SYMBOL(alloc_pages_exact_nid);
2401
2402/**
2262 * free_pages_exact - release memory allocated via alloc_pages_exact() 2403 * free_pages_exact - release memory allocated via alloc_pages_exact()
2263 * @virt: the value returned by alloc_pages_exact. 2404 * @virt: the value returned by alloc_pages_exact.
2264 * @size: size of allocation, same value as passed to alloc_pages_exact(). 2405 * @size: size of allocation, same value as passed to alloc_pages_exact().
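
A short usage sketch of the helper added above; the node id, buffer size and error handling are illustrative only, not taken from the patch.

    #include <linux/errno.h>
    #include <linux/gfp.h>

    static void *example_buf;

    static int example_alloc(void)
    {
        /* 20 KB of physically contiguous memory, preferring node 0;
         * the pages beyond PAGE_ALIGN(20 KB) are trimmed off. */
        example_buf = alloc_pages_exact_nid(0, 20 * 1024, GFP_KERNEL);
        if (!example_buf)
            return -ENOMEM;
        return 0;
    }

    static void example_release(void)
    {
        /* must pass the same size that was requested */
        free_pages_exact(example_buf, 20 * 1024);
    }
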
@@ -2352,19 +2493,41 @@ void si_meminfo_node(struct sysinfo *val, int nid)
2352} 2493}
2353#endif 2494#endif
2354 2495
2496/*
2497 * Determine whether the node should be displayed or not, depending on whether
2498 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
2499 */
2500bool skip_free_areas_node(unsigned int flags, int nid)
2501{
2502 bool ret = false;
2503
2504 if (!(flags & SHOW_MEM_FILTER_NODES))
2505 goto out;
2506
2507 get_mems_allowed();
2508 ret = !node_isset(nid, cpuset_current_mems_allowed);
2509 put_mems_allowed();
2510out:
2511 return ret;
2512}
2513
2355#define K(x) ((x) << (PAGE_SHIFT-10)) 2514#define K(x) ((x) << (PAGE_SHIFT-10))
2356 2515
2357/* 2516/*
2358 * Show free area list (used inside shift_scroll-lock stuff) 2517 * Show free area list (used inside shift_scroll-lock stuff)
2359 * We also calculate the percentage fragmentation. We do this by counting the 2518 * We also calculate the percentage fragmentation. We do this by counting the
2360 * memory on each free list with the exception of the first item on the list. 2519 * memory on each free list with the exception of the first item on the list.
2520 * Suppresses nodes that are not allowed by current's cpuset if
2521 * SHOW_MEM_FILTER_NODES is passed.
2361 */ 2522 */
2362void show_free_areas(void) 2523void show_free_areas(unsigned int filter)
2363{ 2524{
2364 int cpu; 2525 int cpu;
2365 struct zone *zone; 2526 struct zone *zone;
2366 2527
2367 for_each_populated_zone(zone) { 2528 for_each_populated_zone(zone) {
2529 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2530 continue;
2368 show_node(zone); 2531 show_node(zone);
2369 printk("%s per-cpu:\n", zone->name); 2532 printk("%s per-cpu:\n", zone->name);
2370 2533
@@ -2406,6 +2569,8 @@ void show_free_areas(void)
2406 for_each_populated_zone(zone) { 2569 for_each_populated_zone(zone) {
2407 int i; 2570 int i;
2408 2571
2572 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2573 continue;
2409 show_node(zone); 2574 show_node(zone);
2410 printk("%s" 2575 printk("%s"
2411 " free:%lukB" 2576 " free:%lukB"
@@ -2436,7 +2601,7 @@ void show_free_areas(void)
2436 " all_unreclaimable? %s" 2601 " all_unreclaimable? %s"
2437 "\n", 2602 "\n",
2438 zone->name, 2603 zone->name,
2439 K(zone_nr_free_pages(zone)), 2604 K(zone_page_state(zone, NR_FREE_PAGES)),
2440 K(min_wmark_pages(zone)), 2605 K(min_wmark_pages(zone)),
2441 K(low_wmark_pages(zone)), 2606 K(low_wmark_pages(zone)),
2442 K(high_wmark_pages(zone)), 2607 K(high_wmark_pages(zone)),
@@ -2473,6 +2638,8 @@ void show_free_areas(void)
2473 for_each_populated_zone(zone) { 2638 for_each_populated_zone(zone) {
2474 unsigned long nr[MAX_ORDER], flags, order, total = 0; 2639 unsigned long nr[MAX_ORDER], flags, order, total = 0;
2475 2640
2641 if (skip_free_areas_node(filter, zone_to_nid(zone)))
2642 continue;
2476 show_node(zone); 2643 show_node(zone);
2477 printk("%s: ", zone->name); 2644 printk("%s: ", zone->name);
2478 2645
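
With the new argument, callers decide whether the dump is filtered by the current task's cpuset; a minimal illustration of the two call styles (the surrounding error path is assumed, not shown in the patch):

    /* behave like the old show_free_areas(): report every populated zone */
    show_free_areas(0);

    /* report only zones on nodes allowed by current's cpuset, e.g. when
     * warning about an allocation that was confined to that cpuset */
    show_free_areas(SHOW_MEM_FILTER_NODES);
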
@@ -2579,9 +2746,16 @@ static int __parse_numa_zonelist_order(char *s)
2579 2746
2580static __init int setup_numa_zonelist_order(char *s) 2747static __init int setup_numa_zonelist_order(char *s)
2581{ 2748{
2582 if (s) 2749 int ret;
2583 return __parse_numa_zonelist_order(s); 2750
2584 return 0; 2751 if (!s)
2752 return 0;
2753
2754 ret = __parse_numa_zonelist_order(s);
2755 if (ret == 0)
2756 strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
2757
2758 return ret;
2585} 2759}
2586early_param("numa_zonelist_order", setup_numa_zonelist_order); 2760early_param("numa_zonelist_order", setup_numa_zonelist_order);
2587 2761
@@ -3007,14 +3181,6 @@ static __init_refok int __build_all_zonelists(void *data)
3007 build_zonelist_cache(pgdat); 3181 build_zonelist_cache(pgdat);
3008 } 3182 }
3009 3183
3010#ifdef CONFIG_MEMORY_HOTPLUG
3011 /* Setup real pagesets for the new zone */
3012 if (data) {
3013 struct zone *zone = data;
3014 setup_zone_pageset(zone);
3015 }
3016#endif
3017
3018 /* 3184 /*
3019 * Initialize the boot_pagesets that are going to be used 3185 * Initialize the boot_pagesets that are going to be used
3020 * for bootstrapping processors. The real pagesets for 3186 * for bootstrapping processors. The real pagesets for
@@ -3052,7 +3218,7 @@ static __init_refok int __build_all_zonelists(void *data)
3052 * Called with zonelists_mutex held always 3218 * Called with zonelists_mutex held always
3053 * unless system_state == SYSTEM_BOOTING. 3219 * unless system_state == SYSTEM_BOOTING.
3054 */ 3220 */
3055void build_all_zonelists(void *data) 3221void __ref build_all_zonelists(void *data)
3056{ 3222{
3057 set_zonelist_order(); 3223 set_zonelist_order();
3058 3224
@@ -3063,7 +3229,11 @@ void build_all_zonelists(void *data)
3063 } else { 3229 } else {
3064 /* we have to stop all cpus to guarantee there is no user 3230 /* we have to stop all cpus to guarantee there is no user
3065 of zonelist */ 3231 of zonelist */
3066 stop_machine(__build_all_zonelists, data, NULL); 3232#ifdef CONFIG_MEMORY_HOTPLUG
3233 if (data)
3234 setup_zone_pageset((struct zone *)data);
3235#endif
3236 stop_machine(__build_all_zonelists, NULL, NULL);
3067 /* cpuset refresh routine should be here */ 3237 /* cpuset refresh routine should be here */
3068 } 3238 }
3069 vm_total_pages = nr_free_pagecache_pages(); 3239 vm_total_pages = nr_free_pagecache_pages();
@@ -3159,6 +3329,20 @@ static inline unsigned long wait_table_bits(unsigned long size)
3159#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 3329#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
3160 3330
3161/* 3331/*
3332 * Check if a pageblock contains reserved pages
3333 */
3334static int pageblock_is_reserved(unsigned long start_pfn, unsigned long end_pfn)
3335{
3336 unsigned long pfn;
3337
3338 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
3339 if (!pfn_valid_within(pfn) || PageReserved(pfn_to_page(pfn)))
3340 return 1;
3341 }
3342 return 0;
3343}
3344
3345/*
3162 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 3346 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
3163 * of blocks reserved is based on min_wmark_pages(zone). The memory within 3347 * of blocks reserved is based on min_wmark_pages(zone). The memory within
3164 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes 3348 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
@@ -3167,7 +3351,7 @@ static inline unsigned long wait_table_bits(unsigned long size)
3167 */ 3351 */
3168static void setup_zone_migrate_reserve(struct zone *zone) 3352static void setup_zone_migrate_reserve(struct zone *zone)
3169{ 3353{
3170 unsigned long start_pfn, pfn, end_pfn; 3354 unsigned long start_pfn, pfn, end_pfn, block_end_pfn;
3171 struct page *page; 3355 struct page *page;
3172 unsigned long block_migratetype; 3356 unsigned long block_migratetype;
3173 int reserve; 3357 int reserve;
@@ -3197,7 +3381,8 @@ static void setup_zone_migrate_reserve(struct zone *zone)
3197 continue; 3381 continue;
3198 3382
3199 /* Blocks with reserved pages will never free, skip them. */ 3383 /* Blocks with reserved pages will never free, skip them. */
3200 if (PageReserved(page)) 3384 block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn);
3385 if (pageblock_is_reserved(pfn, block_end_pfn))
3201 continue; 3386 continue;
3202 3387
3203 block_migratetype = get_pageblock_migratetype(page); 3388 block_migratetype = get_pageblock_migratetype(page);
@@ -3386,7 +3571,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
3386 pcp->batch = PAGE_SHIFT * 8; 3571 pcp->batch = PAGE_SHIFT * 8;
3387} 3572}
3388 3573
3389static __meminit void setup_zone_pageset(struct zone *zone) 3574static void setup_zone_pageset(struct zone *zone)
3390{ 3575{
3391 int cpu; 3576 int cpu;
3392 3577
@@ -3436,7 +3621,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
3436 3621
3437 if (!slab_is_available()) { 3622 if (!slab_is_available()) {
3438 zone->wait_table = (wait_queue_head_t *) 3623 zone->wait_table = (wait_queue_head_t *)
3439 alloc_bootmem_node(pgdat, alloc_size); 3624 alloc_bootmem_node_nopanic(pgdat, alloc_size);
3440 } else { 3625 } else {
3441 /* 3626 /*
3442 * This case means that a zone whose size was 0 gets new memory 3627 * This case means that a zone whose size was 0 gets new memory
@@ -3636,68 +3821,87 @@ void __init free_bootmem_with_active_regions(int nid,
3636 } 3821 }
3637} 3822}
3638 3823
3639int __init add_from_early_node_map(struct range *range, int az, 3824#ifdef CONFIG_HAVE_MEMBLOCK
3640 int nr_range, int nid) 3825/*
3826 * Basic iterator support. Return the last range of PFNs for a node
3827 * Note: nid == MAX_NUMNODES returns last region regardless of node
3828 */
3829static int __meminit last_active_region_index_in_nid(int nid)
3641{ 3830{
3642 int i; 3831 int i;
3643 u64 start, end;
3644 3832
3645 /* need to go over early_node_map to find out good range for node */ 3833 for (i = nr_nodemap_entries - 1; i >= 0; i--)
3646 for_each_active_range_index_in_nid(i, nid) { 3834 if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)
3647 start = early_node_map[i].start_pfn; 3835 return i;
3648 end = early_node_map[i].end_pfn; 3836
3649 nr_range = add_range(range, az, nr_range, start, end); 3837 return -1;
3650 }
3651 return nr_range;
3652} 3838}
3653 3839
3654#ifdef CONFIG_NO_BOOTMEM 3840/*
3655void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, 3841 * Basic iterator support. Return the previous active range of PFNs for a node
3842 * Note: nid == MAX_NUMNODES returns next region regardless of node
3843 */
3844static int __meminit previous_active_region_index_in_nid(int index, int nid)
3845{
3846 for (index = index - 1; index >= 0; index--)
3847 if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)
3848 return index;
3849
3850 return -1;
3851}
3852
3853#define for_each_active_range_index_in_nid_reverse(i, nid) \
3854 for (i = last_active_region_index_in_nid(nid); i != -1; \
3855 i = previous_active_region_index_in_nid(i, nid))
3856
3857u64 __init find_memory_core_early(int nid, u64 size, u64 align,
3656 u64 goal, u64 limit) 3858 u64 goal, u64 limit)
3657{ 3859{
3658 int i; 3860 int i;
3659 void *ptr;
3660
3661 if (limit > get_max_mapped())
3662 limit = get_max_mapped();
3663 3861
3664 /* need to go over early_node_map to find out good range for node */ 3862 /* Need to go over early_node_map to find out good range for node */
3665 for_each_active_range_index_in_nid(i, nid) { 3863 for_each_active_range_index_in_nid_reverse(i, nid) {
3666 u64 addr; 3864 u64 addr;
3667 u64 ei_start, ei_last; 3865 u64 ei_start, ei_last;
3866 u64 final_start, final_end;
3668 3867
3669 ei_last = early_node_map[i].end_pfn; 3868 ei_last = early_node_map[i].end_pfn;
3670 ei_last <<= PAGE_SHIFT; 3869 ei_last <<= PAGE_SHIFT;
3671 ei_start = early_node_map[i].start_pfn; 3870 ei_start = early_node_map[i].start_pfn;
3672 ei_start <<= PAGE_SHIFT; 3871 ei_start <<= PAGE_SHIFT;
3673 addr = find_early_area(ei_start, ei_last,
3674 goal, limit, size, align);
3675 3872
3676 if (addr == -1ULL) 3873 final_start = max(ei_start, goal);
3874 final_end = min(ei_last, limit);
3875
3876 if (final_start >= final_end)
3677 continue; 3877 continue;
3678 3878
3679#if 0 3879 addr = memblock_find_in_range(final_start, final_end, size, align);
3680 printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
3681 nid,
3682 ei_start, ei_last, goal, limit, size,
3683 align, addr);
3684#endif
3685 3880
3686 ptr = phys_to_virt(addr); 3881 if (addr == MEMBLOCK_ERROR)
3687 memset(ptr, 0, size); 3882 continue;
3688 reserve_early_without_check(addr, addr + size, "BOOTMEM"); 3883
3689 /* 3884 return addr;
3690 * The min_count is set to 0 so that bootmem allocated blocks
3691 * are never reported as leaks.
3692 */
3693 kmemleak_alloc(ptr, size, 0, 0);
3694 return ptr;
3695 } 3885 }
3696 3886
3697 return NULL; 3887 return MEMBLOCK_ERROR;
3698} 3888}
3699#endif 3889#endif
3700 3890
3891int __init add_from_early_node_map(struct range *range, int az,
3892 int nr_range, int nid)
3893{
3894 int i;
3895 u64 start, end;
3896
3897 /* need to go over early_node_map to find out good range for node */
3898 for_each_active_range_index_in_nid(i, nid) {
3899 start = early_node_map[i].start_pfn;
3900 end = early_node_map[i].end_pfn;
3901 nr_range = add_range(range, az, nr_range, start, end);
3902 }
3903 return nr_range;
3904}
3701 3905
3702void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) 3906void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data)
3703{ 3907{
@@ -3779,7 +3983,7 @@ static void __init find_usable_zone_for_movable(void)
3779 3983
3780/* 3984/*
3781 * The zone ranges provided by the architecture do not include ZONE_MOVABLE 3985 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
3782 * because it is sized independant of architecture. Unlike the other zones, 3986 * because it is sized independent of architecture. Unlike the other zones,
3783 * the starting point for ZONE_MOVABLE is not fixed. It may be different 3987 * the starting point for ZONE_MOVABLE is not fixed. It may be different
3784 * in each node depending on the size of each node and how evenly kernelcore 3988 * in each node depending on the size of each node and how evenly kernelcore
3785 * is distributed. This helper function adjusts the zone ranges 3989 * is distributed. This helper function adjusts the zone ranges
@@ -3994,10 +4198,11 @@ static void __init setup_usemap(struct pglist_data *pgdat,
3994 unsigned long usemapsize = usemap_size(zonesize); 4198 unsigned long usemapsize = usemap_size(zonesize);
3995 zone->pageblock_flags = NULL; 4199 zone->pageblock_flags = NULL;
3996 if (usemapsize) 4200 if (usemapsize)
3997 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); 4201 zone->pageblock_flags = alloc_bootmem_node_nopanic(pgdat,
4202 usemapsize);
3998} 4203}
3999#else 4204#else
4000static void inline setup_usemap(struct pglist_data *pgdat, 4205static inline void setup_usemap(struct pglist_data *pgdat,
4001 struct zone *zone, unsigned long zonesize) {} 4206 struct zone *zone, unsigned long zonesize) {}
4002#endif /* CONFIG_SPARSEMEM */ 4207#endif /* CONFIG_SPARSEMEM */
4003 4208
@@ -4114,10 +4319,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4114 zone->zone_pgdat = pgdat; 4319 zone->zone_pgdat = pgdat;
4115 4320
4116 zone_pcp_init(zone); 4321 zone_pcp_init(zone);
4117 for_each_lru(l) { 4322 for_each_lru(l)
4118 INIT_LIST_HEAD(&zone->lru[l].list); 4323 INIT_LIST_HEAD(&zone->lru[l].list);
4119 zone->reclaim_stat.nr_saved_scan[l] = 0;
4120 }
4121 zone->reclaim_stat.recent_rotated[0] = 0; 4324 zone->reclaim_stat.recent_rotated[0] = 0;
4122 zone->reclaim_stat.recent_rotated[1] = 0; 4325 zone->reclaim_stat.recent_rotated[1] = 0;
4123 zone->reclaim_stat.recent_scanned[0] = 0; 4326 zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4160,7 +4363,7 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
4160 size = (end - start) * sizeof(struct page); 4363 size = (end - start) * sizeof(struct page);
4161 map = alloc_remap(pgdat->node_id, size); 4364 map = alloc_remap(pgdat->node_id, size);
4162 if (!map) 4365 if (!map)
4163 map = alloc_bootmem_node(pgdat, size); 4366 map = alloc_bootmem_node_nopanic(pgdat, size);
4164 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); 4367 pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
4165 } 4368 }
4166#ifndef CONFIG_NEED_MULTIPLE_NODES 4369#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -4732,15 +4935,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4732 dma_reserve = new_dma_reserve; 4935 dma_reserve = new_dma_reserve;
4733} 4936}
4734 4937
4735#ifndef CONFIG_NEED_MULTIPLE_NODES
4736struct pglist_data __refdata contig_page_data = {
4737#ifndef CONFIG_NO_BOOTMEM
4738 .bdata = &bootmem_node_data[0]
4739#endif
4740 };
4741EXPORT_SYMBOL(contig_page_data);
4742#endif
4743
4744void __init free_area_init(unsigned long *zones_size) 4938void __init free_area_init(unsigned long *zones_size)
4745{ 4939{
4746 free_area_init_node(0, zones_size, 4940 free_area_init_node(0, zones_size,
@@ -4934,7 +5128,7 @@ void setup_per_zone_wmarks(void)
4934 * 1TB 101 10GB 5128 * 1TB 101 10GB
4935 * 10TB 320 32GB 5129 * 10TB 320 32GB
4936 */ 5130 */
4937void calculate_zone_inactive_ratio(struct zone *zone) 5131static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
4938{ 5132{
4939 unsigned int gb, ratio; 5133 unsigned int gb, ratio;
4940 5134
@@ -4948,7 +5142,7 @@ void calculate_zone_inactive_ratio(struct zone *zone)
4948 zone->inactive_ratio = ratio; 5142 zone->inactive_ratio = ratio;
4949} 5143}
4950 5144
4951static void __init setup_per_zone_inactive_ratio(void) 5145static void __meminit setup_per_zone_inactive_ratio(void)
4952{ 5146{
4953 struct zone *zone; 5147 struct zone *zone;
4954 5148
@@ -4980,7 +5174,7 @@ static void __init setup_per_zone_inactive_ratio(void)
4980 * 8192MB: 11584k 5174 * 8192MB: 11584k
4981 * 16384MB: 16384k 5175 * 16384MB: 16384k
4982 */ 5176 */
4983static int __init init_per_zone_wmark_min(void) 5177int __meminit init_per_zone_wmark_min(void)
4984{ 5178{
4985 unsigned long lowmem_kbytes; 5179 unsigned long lowmem_kbytes;
4986 5180
@@ -4992,6 +5186,7 @@ static int __init init_per_zone_wmark_min(void)
4992 if (min_free_kbytes > 65536) 5186 if (min_free_kbytes > 65536)
4993 min_free_kbytes = 65536; 5187 min_free_kbytes = 65536;
4994 setup_per_zone_wmarks(); 5188 setup_per_zone_wmarks();
5189 refresh_zone_stat_thresholds();
4995 setup_per_zone_lowmem_reserve(); 5190 setup_per_zone_lowmem_reserve();
4996 setup_per_zone_inactive_ratio(); 5191 setup_per_zone_inactive_ratio();
4997 return 0; 5192 return 0;
@@ -5281,26 +5476,71 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
5281 * page allocater never alloc memory from ISOLATE block. 5476 * page allocater never alloc memory from ISOLATE block.
5282 */ 5477 */
5283 5478
5479static int
5480__count_immobile_pages(struct zone *zone, struct page *page, int count)
5481{
5482 unsigned long pfn, iter, found;
5483 /*
5484 * For avoiding noise data, lru_add_drain_all() should be called
5485 * If ZONE_MOVABLE, the zone never contains immobile pages
5486 */
5487 if (zone_idx(zone) == ZONE_MOVABLE)
5488 return true;
5489
5490 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE)
5491 return true;
5492
5493 pfn = page_to_pfn(page);
5494 for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
5495 unsigned long check = pfn + iter;
5496
5497 if (!pfn_valid_within(check))
5498 continue;
5499
5500 page = pfn_to_page(check);
5501 if (!page_count(page)) {
5502 if (PageBuddy(page))
5503 iter += (1 << page_order(page)) - 1;
5504 continue;
5505 }
5506 if (!PageLRU(page))
5507 found++;
5508 /*
5509 * If there are RECLAIMABLE pages, we need to check it.
5510 * But now, memory offline itself doesn't call shrink_slab()
 5511 * and this still needs to be fixed.
5512 */
5513 /*
 5514 * If the page is not RAM, page_count() should be 0.
 5515 * We don't need any further check. This is a _used_ not-movable page.
5516 *
5517 * The problematic thing here is PG_reserved pages. PG_reserved
5518 * is set to both of a memory hole page and a _used_ kernel
5519 * page at boot.
5520 */
5521 if (found > count)
5522 return false;
5523 }
5524 return true;
5525}
5526
5527bool is_pageblock_removable_nolock(struct page *page)
5528{
5529 struct zone *zone = page_zone(page);
5530 return __count_immobile_pages(zone, page, 0);
5531}
5532
5284int set_migratetype_isolate(struct page *page) 5533int set_migratetype_isolate(struct page *page)
5285{ 5534{
5286 struct zone *zone; 5535 struct zone *zone;
5287 struct page *curr_page; 5536 unsigned long flags, pfn;
5288 unsigned long flags, pfn, iter;
5289 unsigned long immobile = 0;
5290 struct memory_isolate_notify arg; 5537 struct memory_isolate_notify arg;
5291 int notifier_ret; 5538 int notifier_ret;
5292 int ret = -EBUSY; 5539 int ret = -EBUSY;
5293 int zone_idx;
5294 5540
5295 zone = page_zone(page); 5541 zone = page_zone(page);
5296 zone_idx = zone_idx(zone);
5297 5542
5298 spin_lock_irqsave(&zone->lock, flags); 5543 spin_lock_irqsave(&zone->lock, flags);
5299 if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
5300 zone_idx == ZONE_MOVABLE) {
5301 ret = 0;
5302 goto out;
5303 }
5304 5544
5305 pfn = page_to_pfn(page); 5545 pfn = page_to_pfn(page);
5306 arg.start_pfn = pfn; 5546 arg.start_pfn = pfn;
@@ -5320,23 +5560,20 @@ int set_migratetype_isolate(struct page *page)
5320 */ 5560 */
5321 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); 5561 notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
5322 notifier_ret = notifier_to_errno(notifier_ret); 5562 notifier_ret = notifier_to_errno(notifier_ret);
5323 if (notifier_ret || !arg.pages_found) 5563 if (notifier_ret)
5324 goto out; 5564 goto out;
5325 5565 /*
5326 for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) { 5566 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
5327 if (!pfn_valid_within(pfn)) 5567 * We just check MOVABLE pages.
5328 continue; 5568 */
5329 5569 if (__count_immobile_pages(zone, page, arg.pages_found))
5330 curr_page = pfn_to_page(iter);
5331 if (!page_count(curr_page) || PageLRU(curr_page))
5332 continue;
5333
5334 immobile++;
5335 }
5336
5337 if (arg.pages_found == immobile)
5338 ret = 0; 5570 ret = 0;
5339 5571
5572 /*
 5573 * immobile means "not-on-lru" pages. If immobile is larger than
5574 * removable-by-driver pages reported by notifier, we'll fail.
5575 */
5576
5340out: 5577out:
5341 if (!ret) { 5578 if (!ret) {
5342 set_pageblock_migratetype(page, MIGRATE_ISOLATE); 5579 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
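
The __count_immobile_pages()/is_pageblock_removable_nolock() pair lets code outside the allocator, such as memory offlining, probe whether a pageblock contains pinned non-LRU pages without taking zone->lock. A rough, illustrative caller; the range walk and return convention are assumptions, not the real hotplug code:

    static bool range_looks_removable(unsigned long start_pfn,
                                      unsigned long end_pfn)
    {
        unsigned long pfn;

        for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
            if (!pfn_valid(pfn))
                continue;
            if (!is_pageblock_removable_nolock(pfn_to_page(pfn)))
                return false;   /* pinned, non-LRU page found */
        }
        return true;            /* everything movable or free */
    }
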
@@ -5455,7 +5692,6 @@ static struct trace_print_flags pageflag_names[] = {
5455 {1UL << PG_swapcache, "swapcache" }, 5692 {1UL << PG_swapcache, "swapcache" },
5456 {1UL << PG_mappedtodisk, "mappedtodisk" }, 5693 {1UL << PG_mappedtodisk, "mappedtodisk" },
5457 {1UL << PG_reclaim, "reclaim" }, 5694 {1UL << PG_reclaim, "reclaim" },
5458 {1UL << PG_buddy, "buddy" },
5459 {1UL << PG_swapbacked, "swapbacked" }, 5695 {1UL << PG_swapbacked, "swapbacked" },
5460 {1UL << PG_unevictable, "unevictable" }, 5696 {1UL << PG_unevictable, "unevictable" },
5461#ifdef CONFIG_MMU 5697#ifdef CONFIG_MMU
@@ -5503,7 +5739,8 @@ void dump_page(struct page *page)
5503{ 5739{
5504 printk(KERN_ALERT 5740 printk(KERN_ALERT
5505 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", 5741 "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n",
5506 page, page_count(page), page_mapcount(page), 5742 page, atomic_read(&page->_count), page_mapcount(page),
5507 page->mapping, page->index); 5743 page->mapping, page->index);
5508 dump_page_flags(page->flags); 5744 dump_page_flags(page->flags);
5745 mem_cgroup_print_bad_page(page);
5509} 5746}
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5bffada7cde1..53bffc6c293e 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -11,12 +11,11 @@
11#include <linux/swapops.h> 11#include <linux/swapops.h>
12#include <linux/kmemleak.h> 12#include <linux/kmemleak.h>
13 13
14static void __meminit 14static void __meminit init_page_cgroup(struct page_cgroup *pc, unsigned long id)
15__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
16{ 15{
17 pc->flags = 0; 16 pc->flags = 0;
17 set_page_cgroup_array_id(pc, id);
18 pc->mem_cgroup = NULL; 18 pc->mem_cgroup = NULL;
19 pc->page = pfn_to_page(pfn);
20 INIT_LIST_HEAD(&pc->lru); 19 INIT_LIST_HEAD(&pc->lru);
21} 20}
22static unsigned long total_usage; 21static unsigned long total_usage;
@@ -43,6 +42,19 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
43 return base + offset; 42 return base + offset;
44} 43}
45 44
45struct page *lookup_cgroup_page(struct page_cgroup *pc)
46{
47 unsigned long pfn;
48 struct page *page;
49 pg_data_t *pgdat;
50
51 pgdat = NODE_DATA(page_cgroup_array_id(pc));
52 pfn = pc - pgdat->node_page_cgroup + pgdat->node_start_pfn;
53 page = pfn_to_page(pfn);
54 VM_BUG_ON(pc != lookup_page_cgroup(page));
55 return page;
56}
57
46static int __init alloc_node_page_cgroup(int nid) 58static int __init alloc_node_page_cgroup(int nid)
47{ 59{
48 struct page_cgroup *base, *pc; 60 struct page_cgroup *base, *pc;
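
Both the flat and the sparsemem variants of the new lookup_cgroup_page() recover the owning struct page from a page_cgroup through the array id recorded by set_page_cgroup_array_id(), so the two lookups are exact inverses. A minimal, illustrative round trip for some page known to have a page_cgroup:

    struct page_cgroup *pc = lookup_page_cgroup(page);
    struct page *back = lookup_cgroup_page(pc);

    /* holds by construction; the VM_BUG_ON() inside lookup_cgroup_page()
     * asserts exactly this invariant */
    BUG_ON(back != page);
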
@@ -63,7 +75,7 @@ static int __init alloc_node_page_cgroup(int nid)
63 return -ENOMEM; 75 return -ENOMEM;
64 for (index = 0; index < nr_pages; index++) { 76 for (index = 0; index < nr_pages; index++) {
65 pc = base + index; 77 pc = base + index;
66 __init_page_cgroup(pc, start_pfn + index); 78 init_page_cgroup(pc, nid);
67 } 79 }
68 NODE_DATA(nid)->node_page_cgroup = base; 80 NODE_DATA(nid)->node_page_cgroup = base;
69 total_usage += table_size; 81 total_usage += table_size;
@@ -105,46 +117,74 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
105 return section->page_cgroup + pfn; 117 return section->page_cgroup + pfn;
106} 118}
107 119
108/* __alloc_bootmem...() is protected by !slab_available() */ 120struct page *lookup_cgroup_page(struct page_cgroup *pc)
109static int __init_refok init_section_page_cgroup(unsigned long pfn)
110{ 121{
111 struct mem_section *section = __pfn_to_section(pfn); 122 struct mem_section *section;
112 struct page_cgroup *base, *pc; 123 struct page *page;
113 unsigned long table_size; 124 unsigned long nr;
114 int nid, index; 125
115 126 nr = page_cgroup_array_id(pc);
116 if (!section->page_cgroup) { 127 section = __nr_to_section(nr);
117 nid = page_to_nid(pfn_to_page(pfn)); 128 page = pfn_to_page(pc - section->page_cgroup);
118 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; 129 VM_BUG_ON(pc != lookup_page_cgroup(page));
119 VM_BUG_ON(!slab_is_available()); 130 return page;
120 if (node_state(nid, N_HIGH_MEMORY)) { 131}
121 base = kmalloc_node(table_size, 132
122 GFP_KERNEL | __GFP_NOWARN, nid); 133static void *__meminit alloc_page_cgroup(size_t size, int nid)
123 if (!base) 134{
124 base = vmalloc_node(table_size, nid); 135 void *addr = NULL;
125 } else { 136
126 base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); 137 addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN);
127 if (!base) 138 if (addr)
128 base = vmalloc(table_size); 139 return addr;
129 } 140
130 /* 141 if (node_state(nid, N_HIGH_MEMORY))
131 * The value stored in section->page_cgroup is (base - pfn) 142 addr = vmalloc_node(size, nid);
132 * and it does not point to the memory block allocated above, 143 else
133 * causing kmemleak false positives. 144 addr = vmalloc(size);
134 */ 145
135 kmemleak_not_leak(base); 146 return addr;
147}
148
149#ifdef CONFIG_MEMORY_HOTPLUG
150static void free_page_cgroup(void *addr)
151{
152 if (is_vmalloc_addr(addr)) {
153 vfree(addr);
136 } else { 154 } else {
137 /* 155 struct page *page = virt_to_page(addr);
138 * We don't have to allocate page_cgroup again, but 156 size_t table_size =
139 * address of memmap may be changed. So, we have to initialize 157 sizeof(struct page_cgroup) * PAGES_PER_SECTION;
140 * again. 158
141 */ 159 BUG_ON(PageReserved(page));
142 base = section->page_cgroup + pfn; 160 free_pages_exact(addr, table_size);
143 table_size = 0;
144 /* check address of memmap is changed or not. */
145 if (base->page == pfn_to_page(pfn))
146 return 0;
147 } 161 }
162}
163#endif
164
165static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
166{
167 struct page_cgroup *base, *pc;
168 struct mem_section *section;
169 unsigned long table_size;
170 unsigned long nr;
171 int index;
172
173 nr = pfn_to_section_nr(pfn);
174 section = __nr_to_section(nr);
175
176 if (section->page_cgroup)
177 return 0;
178
179 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
180 base = alloc_page_cgroup(table_size, nid);
181
182 /*
183 * The value stored in section->page_cgroup is (base - pfn)
184 * and it does not point to the memory block allocated above,
185 * causing kmemleak false positives.
186 */
187 kmemleak_not_leak(base);
148 188
149 if (!base) { 189 if (!base) {
150 printk(KERN_ERR "page cgroup allocation failure\n"); 190 printk(KERN_ERR "page cgroup allocation failure\n");
@@ -153,9 +193,13 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn)
153 193
154 for (index = 0; index < PAGES_PER_SECTION; index++) { 194 for (index = 0; index < PAGES_PER_SECTION; index++) {
155 pc = base + index; 195 pc = base + index;
156 __init_page_cgroup(pc, pfn + index); 196 init_page_cgroup(pc, nr);
157 } 197 }
158 198 /*
199 * The passed "pfn" may not be aligned to SECTION. For the calculation
200 * we need to apply a mask.
201 */
202 pfn &= PAGE_SECTION_MASK;
159 section->page_cgroup = base - pfn; 203 section->page_cgroup = base - pfn;
160 total_usage += table_size; 204 total_usage += table_size;
161 return 0; 205 return 0;
@@ -170,16 +214,8 @@ void __free_page_cgroup(unsigned long pfn)
170 if (!ms || !ms->page_cgroup) 214 if (!ms || !ms->page_cgroup)
171 return; 215 return;
172 base = ms->page_cgroup + pfn; 216 base = ms->page_cgroup + pfn;
173 if (is_vmalloc_addr(base)) { 217 free_page_cgroup(base);
174 vfree(base); 218 ms->page_cgroup = NULL;
175 ms->page_cgroup = NULL;
176 } else {
177 struct page *page = virt_to_page(base);
178 if (!PageReserved(page)) { /* Is bootmem ? */
179 kfree(base);
180 ms->page_cgroup = NULL;
181 }
182 }
183} 219}
184 220
185int __meminit online_page_cgroup(unsigned long start_pfn, 221int __meminit online_page_cgroup(unsigned long start_pfn,
@@ -192,10 +228,20 @@ int __meminit online_page_cgroup(unsigned long start_pfn,
192 start = start_pfn & ~(PAGES_PER_SECTION - 1); 228 start = start_pfn & ~(PAGES_PER_SECTION - 1);
193 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); 229 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
194 230
231 if (nid == -1) {
232 /*
233 * In this case, "nid" already exists and contains valid memory.
234 * "start_pfn" passed to us is a pfn which is an arg for
235 * online__pages(), and start_pfn should exist.
236 */
237 nid = pfn_to_nid(start_pfn);
238 VM_BUG_ON(!node_state(nid, N_ONLINE));
239 }
240
195 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { 241 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
196 if (!pfn_present(pfn)) 242 if (!pfn_present(pfn))
197 continue; 243 continue;
198 fail = init_section_page_cgroup(pfn); 244 fail = init_section_page_cgroup(pfn, nid);
199 } 245 }
200 if (!fail) 246 if (!fail)
201 return 0; 247 return 0;
@@ -243,12 +289,7 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
243 break; 289 break;
244 } 290 }
245 291
246 if (ret) 292 return notifier_from_errno(ret);
247 ret = notifier_from_errno(ret);
248 else
249 ret = NOTIFY_OK;
250
251 return ret;
252} 293}
253 294
254#endif 295#endif
@@ -256,25 +297,47 @@ static int __meminit page_cgroup_callback(struct notifier_block *self,
256void __init page_cgroup_init(void) 297void __init page_cgroup_init(void)
257{ 298{
258 unsigned long pfn; 299 unsigned long pfn;
259 int fail = 0; 300 int nid;
260 301
261 if (mem_cgroup_disabled()) 302 if (mem_cgroup_disabled())
262 return; 303 return;
263 304
264 for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { 305 for_each_node_state(nid, N_HIGH_MEMORY) {
265 if (!pfn_present(pfn)) 306 unsigned long start_pfn, end_pfn;
266 continue; 307
267 fail = init_section_page_cgroup(pfn); 308 start_pfn = node_start_pfn(nid);
268 } 309 end_pfn = node_end_pfn(nid);
269 if (fail) { 310 /*
270 printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n"); 311 * start_pfn and end_pfn may not be aligned to SECTION and the
271 panic("Out of memory"); 312 * page->flags of out of node pages are not initialized. So we
272 } else { 313 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
273 hotplug_memory_notifier(page_cgroup_callback, 0); 314 */
315 for (pfn = start_pfn;
316 pfn < end_pfn;
317 pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
318
319 if (!pfn_valid(pfn))
320 continue;
321 /*
 322 * Nodes' pfns can overlap.
 323 * We know some architectures can have a node layout such as
324 * -------------pfn-------------->
325 * N0 | N1 | N2 | N0 | N1 | N2|....
326 */
327 if (pfn_to_nid(pfn) != nid)
328 continue;
329 if (init_section_page_cgroup(pfn, nid))
330 goto oom;
331 }
274 } 332 }
333 hotplug_memory_notifier(page_cgroup_callback, 0);
275 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); 334 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
276 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you don't" 335 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
277 " want memory cgroups\n"); 336 "don't want memory cgroups\n");
337 return;
338oom:
339 printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
340 panic("Out of memory");
278} 341}
279 342
280void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) 343void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
@@ -349,7 +412,7 @@ not_enough_page:
349 * @new: new id 412 * @new: new id
350 * 413 *
351 * Returns old id at success, 0 at failure. 414 * Returns old id at success, 0 at failure.
352 * (There is no mem_cgroup useing 0 as its id) 415 * (There is no mem_cgroup using 0 as its id)
353 */ 416 */
354unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, 417unsigned short swap_cgroup_cmpxchg(swp_entry_t ent,
355 unsigned short old, unsigned short new) 418 unsigned short old, unsigned short new)
@@ -447,7 +510,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
447 if (!do_swap_account) 510 if (!do_swap_account)
448 return 0; 511 return 0;
449 512
450 length = ((max_pages/SC_PER_PAGE) + 1); 513 length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
451 array_size = length * sizeof(void *); 514 array_size = length * sizeof(void *);
452 515
453 array = vmalloc(array_size); 516 array = vmalloc(array_size);
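
The switch to DIV_ROUND_UP() stops the map from growing by one extra page whenever max_pages is an exact multiple of SC_PER_PAGE. Assuming 4 KiB pages and a two-byte struct swap_cgroup, so SC_PER_PAGE == 2048, the two formulas compare as follows (numbers are for illustration only):

    /* max_pages == 4096 (exact multiple):                      */
    /*   old: (4096 / 2048) + 1        == 3 pages, one wasted   */
    /*   new: DIV_ROUND_UP(4096, 2048) == 2 pages               */
    /* max_pages == 4097:                                       */
    /*   old: (4097 / 2048) + 1        == 3 pages               */
    /*   new: DIV_ROUND_UP(4097, 2048) == 3 pages               */
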
@@ -464,8 +527,8 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
464 /* memory shortage */ 527 /* memory shortage */
465 ctrl->map = NULL; 528 ctrl->map = NULL;
466 ctrl->length = 0; 529 ctrl->length = 0;
467 vfree(array);
468 mutex_unlock(&swap_cgroup_mutex); 530 mutex_unlock(&swap_cgroup_mutex);
531 vfree(array);
469 goto nomem; 532 goto nomem;
470 } 533 }
471 mutex_unlock(&swap_cgroup_mutex); 534 mutex_unlock(&swap_cgroup_mutex);
@@ -480,7 +543,8 @@ nomem:
480 543
481void swap_cgroup_swapoff(int type) 544void swap_cgroup_swapoff(int type)
482{ 545{
483 int i; 546 struct page **map;
547 unsigned long i, length;
484 struct swap_cgroup_ctrl *ctrl; 548 struct swap_cgroup_ctrl *ctrl;
485 549
486 if (!do_swap_account) 550 if (!do_swap_account)
@@ -488,17 +552,20 @@ void swap_cgroup_swapoff(int type)
488 552
489 mutex_lock(&swap_cgroup_mutex); 553 mutex_lock(&swap_cgroup_mutex);
490 ctrl = &swap_cgroup_ctrl[type]; 554 ctrl = &swap_cgroup_ctrl[type];
491 if (ctrl->map) { 555 map = ctrl->map;
492 for (i = 0; i < ctrl->length; i++) { 556 length = ctrl->length;
493 struct page *page = ctrl->map[i]; 557 ctrl->map = NULL;
558 ctrl->length = 0;
559 mutex_unlock(&swap_cgroup_mutex);
560
561 if (map) {
562 for (i = 0; i < length; i++) {
563 struct page *page = map[i];
494 if (page) 564 if (page)
495 __free_page(page); 565 __free_page(page);
496 } 566 }
497 vfree(ctrl->map); 567 vfree(map);
498 ctrl->map = NULL;
499 ctrl->length = 0;
500 } 568 }
501 mutex_unlock(&swap_cgroup_mutex);
502} 569}
503 570
504#endif 571#endif
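
The reworked swapoff path detaches ctrl->map and ctrl->length while holding swap_cgroup_mutex and frees the pages only after dropping it, keeping the expensive loop out of the locked region. A generic sketch of that detach-then-free pattern; lock, obj and release() are placeholders, not the real swap_cgroup types:

    mutex_lock(&lock);
    tmp = obj->data;        /* detach while other users are excluded */
    obj->data = NULL;
    obj->len = 0;
    mutex_unlock(&lock);

    release(tmp);           /* heavy freeing done without the lock held */
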
diff --git a/mm/page_io.c b/mm/page_io.c
index 2dee975bf469..dc76b4d0611e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
106 goto out; 106 goto out;
107 } 107 }
108 if (wbc->sync_mode == WB_SYNC_ALL) 108 if (wbc->sync_mode == WB_SYNC_ALL)
109 rw |= REQ_SYNC | REQ_UNPLUG; 109 rw |= REQ_SYNC;
110 count_vm_event(PSWPOUT); 110 count_vm_event(PSWPOUT);
111 set_page_writeback(page); 111 set_page_writeback(page);
112 unlock_page(page); 112 unlock_page(page);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 5e0ffd967452..4ae42bb40892 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -86,7 +86,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
86 * all pages in [start_pfn...end_pfn) must be in the same zone. 86 * all pages in [start_pfn...end_pfn) must be in the same zone.
87 * zone->lock must be held before call this. 87 * zone->lock must be held before call this.
88 * 88 *
89 * Returns 0 if all pages in the range is isolated. 89 * Returns 1 if all pages in the range is isolated.
90 */ 90 */
91static int 91static int
92__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) 92__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
@@ -119,7 +119,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
119 struct zone *zone; 119 struct zone *zone;
120 int ret; 120 int ret;
121 121
122 pfn = start_pfn;
123 /* 122 /*
124 * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page 123 * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page
125 * is not aligned to pageblock_nr_pages. 124 * is not aligned to pageblock_nr_pages.
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 8b1a2ce21ee5..c3450d533611 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -33,18 +33,35 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
33 33
34 pmd = pmd_offset(pud, addr); 34 pmd = pmd_offset(pud, addr);
35 do { 35 do {
36again:
36 next = pmd_addr_end(addr, end); 37 next = pmd_addr_end(addr, end);
37 if (pmd_none_or_clear_bad(pmd)) { 38 if (pmd_none(*pmd)) {
38 if (walk->pte_hole) 39 if (walk->pte_hole)
39 err = walk->pte_hole(addr, next, walk); 40 err = walk->pte_hole(addr, next, walk);
40 if (err) 41 if (err)
41 break; 42 break;
42 continue; 43 continue;
43 } 44 }
45 /*
46 * This implies that each ->pmd_entry() handler
47 * needs to know about pmd_trans_huge() pmds
48 */
44 if (walk->pmd_entry) 49 if (walk->pmd_entry)
45 err = walk->pmd_entry(pmd, addr, next, walk); 50 err = walk->pmd_entry(pmd, addr, next, walk);
46 if (!err && walk->pte_entry) 51 if (err)
47 err = walk_pte_range(pmd, addr, next, walk); 52 break;
53
54 /*
55 * Check this here so we only break down trans_huge
56 * pages when we _need_ to
57 */
58 if (!walk->pte_entry)
59 continue;
60
61 split_huge_page_pmd(walk->mm, pmd);
62 if (pmd_none_or_clear_bad(pmd))
63 goto again;
64 err = walk_pte_range(pmd, addr, next, walk);
48 if (err) 65 if (err)
49 break; 66 break;
50 } while (pmd++, addr = next, addr != end); 67 } while (pmd++, addr = next, addr != end);
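
As the new comment says, a handler registered through ->pmd_entry is now expected to recognise transparent-huge pmds itself, because the walker no longer clears them before calling it. A minimal, hypothetical handler:

    static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
                                 unsigned long next, struct mm_walk *walk)
    {
        if (pmd_trans_huge(*pmd)) {
            /* either account the whole huge mapping here, or split
             * it first: split_huge_page_pmd(walk->mm, pmd); */
            return 0;
        }
        /* regular pmd: the pte walk (->pte_entry) runs as before */
        return 0;
    }
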
@@ -139,7 +156,6 @@ int walk_page_range(unsigned long addr, unsigned long end,
139 pgd_t *pgd; 156 pgd_t *pgd;
140 unsigned long next; 157 unsigned long next;
141 int err = 0; 158 int err = 0;
142 struct vm_area_struct *vma;
143 159
144 if (addr >= end) 160 if (addr >= end)
145 return err; 161 return err;
@@ -149,15 +165,17 @@ int walk_page_range(unsigned long addr, unsigned long end,
149 165
150 pgd = pgd_offset(walk->mm, addr); 166 pgd = pgd_offset(walk->mm, addr);
151 do { 167 do {
168 struct vm_area_struct *uninitialized_var(vma);
169
152 next = pgd_addr_end(addr, end); 170 next = pgd_addr_end(addr, end);
153 171
172#ifdef CONFIG_HUGETLB_PAGE
154 /* 173 /*
155 * handle hugetlb vma individually because pagetable walk for 174 * handle hugetlb vma individually because pagetable walk for
156 * the hugetlb page is dependent on the architecture and 175 * the hugetlb page is dependent on the architecture and
157 * we can't handled it in the same manner as non-huge pages. 176 * we can't handled it in the same manner as non-huge pages.
158 */ 177 */
159 vma = find_vma(walk->mm, addr); 178 vma = find_vma(walk->mm, addr);
160#ifdef CONFIG_HUGETLB_PAGE
161 if (vma && is_vm_hugetlb_page(vma)) { 179 if (vma && is_vm_hugetlb_page(vma)) {
162 if (vma->vm_end < next) 180 if (vma->vm_end < next)
163 next = vma->vm_end; 181 next = vma->vm_end;
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index df680855540a..89633fefc6a2 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -27,7 +27,7 @@
27 * chunk size is not aligned. percpu-km code will whine about it. 27 * chunk size is not aligned. percpu-km code will whine about it.
28 */ 28 */
29 29
30#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 30#if defined(CONFIG_SMP) && defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
31#error "contiguous percpu allocation is incompatible with paged first chunk" 31#error "contiguous percpu allocation is incompatible with paged first chunk"
32#endif 32#endif
33 33
@@ -35,7 +35,11 @@
35 35
36static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 36static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
37{ 37{
38 /* noop */ 38 unsigned int cpu;
39
40 for_each_possible_cpu(cpu)
41 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
42
39 return 0; 43 return 0;
40} 44}
41 45
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 7d9c1d0ebd3f..ea534960a04b 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
421 return NULL; 421 return NULL;
422 422
423 vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, 423 vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes,
424 pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); 424 pcpu_nr_groups, pcpu_atom_size);
425 if (!vms) { 425 if (!vms) {
426 pcpu_free_chunk(chunk); 426 pcpu_free_chunk(chunk);
427 return NULL; 427 return NULL;
diff --git a/mm/percpu.c b/mm/percpu.c
index c76ef3891e0d..bf80e55dbed7 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -31,7 +31,7 @@
31 * as small as 4 bytes. The allocator organizes chunks into lists 31 * as small as 4 bytes. The allocator organizes chunks into lists
32 * according to free size and tries to allocate from the fullest one. 32 * according to free size and tries to allocate from the fullest one.
33 * Each chunk keeps the maximum contiguous area size hint which is 33 * Each chunk keeps the maximum contiguous area size hint which is
34 * guaranteed to be eqaul to or larger than the maximum contiguous 34 * guaranteed to be equal to or larger than the maximum contiguous
35 * area in the chunk. This helps the allocator not to iterate the 35 * area in the chunk. This helps the allocator not to iterate the
36 * chunk maps unnecessarily. 36 * chunk maps unnecessarily.
37 * 37 *
@@ -76,6 +76,7 @@
76#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 76#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
77#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 77#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
78 78
79#ifdef CONFIG_SMP
79/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ 80/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
80#ifndef __addr_to_pcpu_ptr 81#ifndef __addr_to_pcpu_ptr
81#define __addr_to_pcpu_ptr(addr) \ 82#define __addr_to_pcpu_ptr(addr) \
@@ -89,6 +90,11 @@
89 (unsigned long)pcpu_base_addr - \ 90 (unsigned long)pcpu_base_addr - \
90 (unsigned long)__per_cpu_start) 91 (unsigned long)__per_cpu_start)
91#endif 92#endif
93#else /* CONFIG_SMP */
94/* on UP, it's always identity mapped */
95#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr)
96#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr)
97#endif /* CONFIG_SMP */
92 98
93struct pcpu_chunk { 99struct pcpu_chunk {
94 struct list_head list; /* linked to pcpu_slot lists */ 100 struct list_head list; /* linked to pcpu_slot lists */
@@ -252,7 +258,7 @@ static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk,
252 258
253/* 259/*
254 * (Un)populated page region iterators. Iterate over (un)populated 260 * (Un)populated page region iterators. Iterate over (un)populated
255 * page regions betwen @start and @end in @chunk. @rs and @re should 261 * page regions between @start and @end in @chunk. @rs and @re should
256 * be integer variables and will be set to start and end page index of 262 * be integer variables and will be set to start and end page index of
257 * the current region. 263 * the current region.
258 */ 264 */
@@ -287,12 +293,8 @@ static void *pcpu_mem_alloc(size_t size)
287 293
288 if (size <= PAGE_SIZE) 294 if (size <= PAGE_SIZE)
289 return kzalloc(size, GFP_KERNEL); 295 return kzalloc(size, GFP_KERNEL);
290 else { 296 else
291 void *ptr = vmalloc(size); 297 return vzalloc(size);
292 if (ptr)
293 memset(ptr, 0, size);
294 return ptr;
295 }
296} 298}
297 299
298/** 300/**
@@ -340,7 +342,7 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
340 * @chunk: chunk of interest 342 * @chunk: chunk of interest
341 * 343 *
342 * Determine whether area map of @chunk needs to be extended to 344 * Determine whether area map of @chunk needs to be extended to
343 * accomodate a new allocation. 345 * accommodate a new allocation.
344 * 346 *
345 * CONTEXT: 347 * CONTEXT:
346 * pcpu_lock. 348 * pcpu_lock.
@@ -429,7 +431,7 @@ out_unlock:
429 * depending on @head, is reduced by @tail bytes and @tail byte block 431 * depending on @head, is reduced by @tail bytes and @tail byte block
430 * is inserted after the target block. 432 * is inserted after the target block.
431 * 433 *
432 * @chunk->map must have enough free slots to accomodate the split. 434 * @chunk->map must have enough free slots to accommodate the split.
433 * 435 *
434 * CONTEXT: 436 * CONTEXT:
435 * pcpu_lock. 437 * pcpu_lock.
@@ -820,8 +822,8 @@ fail_unlock_mutex:
820 * @size: size of area to allocate in bytes 822 * @size: size of area to allocate in bytes
821 * @align: alignment of area (max PAGE_SIZE) 823 * @align: alignment of area (max PAGE_SIZE)
822 * 824 *
823 * Allocate percpu area of @size bytes aligned at @align. Might 825 * Allocate zero-filled percpu area of @size bytes aligned at @align.
824 * sleep. Might trigger writeouts. 826 * Might sleep. Might trigger writeouts.
825 * 827 *
826 * CONTEXT: 828 * CONTEXT:
827 * Does GFP_KERNEL allocation. 829 * Does GFP_KERNEL allocation.
@@ -840,9 +842,10 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
840 * @size: size of area to allocate in bytes 842 * @size: size of area to allocate in bytes
841 * @align: alignment of area (max PAGE_SIZE) 843 * @align: alignment of area (max PAGE_SIZE)
842 * 844 *
843 * Allocate percpu area of @size bytes aligned at @align from reserved 845 * Allocate zero-filled percpu area of @size bytes aligned at @align
844 * percpu area if arch has set it up; otherwise, allocation is served 846 * from reserved percpu area if arch has set it up; otherwise,
845 * from the same dynamic area. Might sleep. Might trigger writeouts. 847 * allocation is served from the same dynamic area. Might sleep.
848 * Might trigger writeouts.
846 * 849 *
847 * CONTEXT: 850 * CONTEXT:
848 * Does GFP_KERNEL allocation. 851 * Does GFP_KERNEL allocation.
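
The updated comments make the zero-fill guarantee of dynamic percpu allocations explicit. A minimal usage sketch that relies on it; the counter type and error handling are illustrative:

    static int counters_example(void)
    {
        unsigned long __percpu *counters;

        counters = alloc_percpu(unsigned long); /* returned zero-filled */
        if (!counters)
            return -ENOMEM;

        this_cpu_inc(*counters);        /* no explicit initialisation needed */

        free_percpu(counters);
        return 0;
    }
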
@@ -949,6 +952,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
949 */ 952 */
950bool is_kernel_percpu_address(unsigned long addr) 953bool is_kernel_percpu_address(unsigned long addr)
951{ 954{
955#ifdef CONFIG_SMP
952 const size_t static_size = __per_cpu_end - __per_cpu_start; 956 const size_t static_size = __per_cpu_end - __per_cpu_start;
953 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); 957 void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
954 unsigned int cpu; 958 unsigned int cpu;
@@ -959,6 +963,8 @@ bool is_kernel_percpu_address(unsigned long addr)
959 if ((void *)addr >= start && (void *)addr < start + static_size) 963 if ((void *)addr >= start && (void *)addr < start + static_size)
960 return true; 964 return true;
961 } 965 }
966#endif
967 /* on UP, can't distinguish from other static vars, always false */
962 return false; 968 return false;
963} 969}
964 970
@@ -1002,8 +1008,7 @@ phys_addr_t per_cpu_ptr_to_phys(void *addr)
1002 } 1008 }
1003 1009
1004 if (in_first_chunk) { 1010 if (in_first_chunk) {
1005 if ((unsigned long)addr < VMALLOC_START || 1011 if (!is_vmalloc_addr(addr))
1006 (unsigned long)addr >= VMALLOC_END)
1007 return __pa(addr); 1012 return __pa(addr);
1008 else 1013 else
1009 return page_to_phys(vmalloc_to_page(addr)); 1014 return page_to_phys(vmalloc_to_page(addr));
@@ -1067,161 +1072,6 @@ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
1067} 1072}
1068 1073
1069/** 1074/**
1070 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
1071 * @reserved_size: the size of reserved percpu area in bytes
1072 * @dyn_size: minimum free size for dynamic allocation in bytes
1073 * @atom_size: allocation atom size
1074 * @cpu_distance_fn: callback to determine distance between cpus, optional
1075 *
1076 * This function determines grouping of units, their mappings to cpus
1077 * and other parameters considering needed percpu size, allocation
1078 * atom size and distances between CPUs.
1079 *
1080 * Groups are always mutliples of atom size and CPUs which are of
1081 * LOCAL_DISTANCE both ways are grouped together and share space for
1082 * units in the same group. The returned configuration is guaranteed
1083 * to have CPUs on different nodes on different groups and >=75% usage
1084 * of allocated virtual address space.
1085 *
1086 * RETURNS:
1087 * On success, pointer to the new allocation_info is returned. On
1088 * failure, ERR_PTR value is returned.
1089 */
1090static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1091 size_t reserved_size, size_t dyn_size,
1092 size_t atom_size,
1093 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1094{
1095 static int group_map[NR_CPUS] __initdata;
1096 static int group_cnt[NR_CPUS] __initdata;
1097 const size_t static_size = __per_cpu_end - __per_cpu_start;
1098 int nr_groups = 1, nr_units = 0;
1099 size_t size_sum, min_unit_size, alloc_size;
1100 int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
1101 int last_allocs, group, unit;
1102 unsigned int cpu, tcpu;
1103 struct pcpu_alloc_info *ai;
1104 unsigned int *cpu_map;
1105
1106 /* this function may be called multiple times */
1107 memset(group_map, 0, sizeof(group_map));
1108 memset(group_cnt, 0, sizeof(group_cnt));
1109
1110 /* calculate size_sum and ensure dyn_size is enough for early alloc */
1111 size_sum = PFN_ALIGN(static_size + reserved_size +
1112 max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
1113 dyn_size = size_sum - static_size - reserved_size;
1114
1115 /*
1116 * Determine min_unit_size, alloc_size and max_upa such that
1117 * alloc_size is multiple of atom_size and is the smallest
1118 * which can accomodate 4k aligned segments which are equal to
1119 * or larger than min_unit_size.
1120 */
1121 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1122
1123 alloc_size = roundup(min_unit_size, atom_size);
1124 upa = alloc_size / min_unit_size;
1125 while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1126 upa--;
1127 max_upa = upa;
1128
1129 /* group cpus according to their proximity */
1130 for_each_possible_cpu(cpu) {
1131 group = 0;
1132 next_group:
1133 for_each_possible_cpu(tcpu) {
1134 if (cpu == tcpu)
1135 break;
1136 if (group_map[tcpu] == group && cpu_distance_fn &&
1137 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1138 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1139 group++;
1140 nr_groups = max(nr_groups, group + 1);
1141 goto next_group;
1142 }
1143 }
1144 group_map[cpu] = group;
1145 group_cnt[group]++;
1146 }
1147
1148 /*
1149 * Expand unit size until address space usage goes over 75%
1150 * and then as much as possible without using more address
1151 * space.
1152 */
1153 last_allocs = INT_MAX;
1154 for (upa = max_upa; upa; upa--) {
1155 int allocs = 0, wasted = 0;
1156
1157 if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1158 continue;
1159
1160 for (group = 0; group < nr_groups; group++) {
1161 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1162 allocs += this_allocs;
1163 wasted += this_allocs * upa - group_cnt[group];
1164 }
1165
1166 /*
1167 * Don't accept if wastage is over 1/3. The
1168 * greater-than comparison ensures upa==1 always
1169 * passes the following check.
1170 */
1171 if (wasted > num_possible_cpus() / 3)
1172 continue;
1173
1174 /* and then don't consume more memory */
1175 if (allocs > last_allocs)
1176 break;
1177 last_allocs = allocs;
1178 best_upa = upa;
1179 }
1180 upa = best_upa;
1181
1182 /* allocate and fill alloc_info */
1183 for (group = 0; group < nr_groups; group++)
1184 nr_units += roundup(group_cnt[group], upa);
1185
1186 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1187 if (!ai)
1188 return ERR_PTR(-ENOMEM);
1189 cpu_map = ai->groups[0].cpu_map;
1190
1191 for (group = 0; group < nr_groups; group++) {
1192 ai->groups[group].cpu_map = cpu_map;
1193 cpu_map += roundup(group_cnt[group], upa);
1194 }
1195
1196 ai->static_size = static_size;
1197 ai->reserved_size = reserved_size;
1198 ai->dyn_size = dyn_size;
1199 ai->unit_size = alloc_size / upa;
1200 ai->atom_size = atom_size;
1201 ai->alloc_size = alloc_size;
1202
1203 for (group = 0, unit = 0; group_cnt[group]; group++) {
1204 struct pcpu_group_info *gi = &ai->groups[group];
1205
1206 /*
1207 * Initialize base_offset as if all groups are located
1208 * back-to-back. The caller should update this to
1209 * reflect actual allocation.
1210 */
1211 gi->base_offset = unit * ai->unit_size;
1212
1213 for_each_possible_cpu(cpu)
1214 if (group_map[cpu] == group)
1215 gi->cpu_map[gi->nr_units++] = cpu;
1216 gi->nr_units = roundup(gi->nr_units, upa);
1217 unit += gi->nr_units;
1218 }
1219 BUG_ON(unit != nr_units);
1220
1221 return ai;
1222}
1223
1224/**
1225 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info 1075 * pcpu_dump_alloc_info - print out information about pcpu_alloc_info
1226 * @lvl: loglevel 1076 * @lvl: loglevel
1227 * @ai: allocation info to dump 1077 * @ai: allocation info to dump
@@ -1363,8 +1213,12 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1363 1213
1364 /* sanity checks */ 1214 /* sanity checks */
1365 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); 1215 PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
1216#ifdef CONFIG_SMP
1366 PCPU_SETUP_BUG_ON(!ai->static_size); 1217 PCPU_SETUP_BUG_ON(!ai->static_size);
1218 PCPU_SETUP_BUG_ON((unsigned long)__per_cpu_start & ~PAGE_MASK);
1219#endif
1367 PCPU_SETUP_BUG_ON(!base_addr); 1220 PCPU_SETUP_BUG_ON(!base_addr);
1221 PCPU_SETUP_BUG_ON((unsigned long)base_addr & ~PAGE_MASK);
1368 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); 1222 PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
1369 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); 1223 PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
1370 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); 1224 PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
@@ -1411,7 +1265,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1411 1265
1412 /* we're done parsing the input, undefine BUG macro and dump config */ 1266 /* we're done parsing the input, undefine BUG macro and dump config */
1413#undef PCPU_SETUP_BUG_ON 1267#undef PCPU_SETUP_BUG_ON
1414 pcpu_dump_alloc_info(KERN_INFO, ai); 1268 pcpu_dump_alloc_info(KERN_DEBUG, ai);
1415 1269
1416 pcpu_nr_groups = ai->nr_groups; 1270 pcpu_nr_groups = ai->nr_groups;
1417 pcpu_group_offsets = group_offsets; 1271 pcpu_group_offsets = group_offsets;
@@ -1488,6 +1342,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
1488 return 0; 1342 return 0;
1489} 1343}
1490 1344
1345#ifdef CONFIG_SMP
1346
1491const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { 1347const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
1492 [PCPU_FC_AUTO] = "auto", 1348 [PCPU_FC_AUTO] = "auto",
1493 [PCPU_FC_EMBED] = "embed", 1349 [PCPU_FC_EMBED] = "embed",
@@ -1515,8 +1371,180 @@ static int __init percpu_alloc_setup(char *str)
1515} 1371}
1516early_param("percpu_alloc", percpu_alloc_setup); 1372early_param("percpu_alloc", percpu_alloc_setup);
1517 1373
1374/*
1375 * pcpu_embed_first_chunk() is used by the generic percpu setup.
1376 * Build it if needed by the arch config or if the generic setup is
1377 * going to be used.
1378 */
1518#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ 1379#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
1519 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) 1380 !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
1381#define BUILD_EMBED_FIRST_CHUNK
1382#endif
1383
1384/* build pcpu_page_first_chunk() iff needed by the arch config */
1385#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
1386#define BUILD_PAGE_FIRST_CHUNK
1387#endif
1388
1389/* pcpu_build_alloc_info() is used by both embed and page first chunk */
1390#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
1391/**
1392 * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
1393 * @reserved_size: the size of reserved percpu area in bytes
1394 * @dyn_size: minimum free size for dynamic allocation in bytes
1395 * @atom_size: allocation atom size
1396 * @cpu_distance_fn: callback to determine distance between cpus, optional
1397 *
1398 * This function determines grouping of units, their mappings to cpus
1399 * and other parameters considering needed percpu size, allocation
1400 * atom size and distances between CPUs.
1401 *
1402 * Groups are always multiples of atom size and CPUs which are of
1403 * LOCAL_DISTANCE both ways are grouped together and share space for
1404 * units in the same group. The returned configuration is guaranteed
1405 * to have CPUs on different nodes in different groups and >=75% usage
1406 * of allocated virtual address space.
1407 *
1408 * RETURNS:
1409 * On success, pointer to the new allocation_info is returned. On
1410 * failure, ERR_PTR value is returned.
1411 */
1412static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
1413 size_t reserved_size, size_t dyn_size,
1414 size_t atom_size,
1415 pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
1416{
1417 static int group_map[NR_CPUS] __initdata;
1418 static int group_cnt[NR_CPUS] __initdata;
1419 const size_t static_size = __per_cpu_end - __per_cpu_start;
1420 int nr_groups = 1, nr_units = 0;
1421 size_t size_sum, min_unit_size, alloc_size;
1422 int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
1423 int last_allocs, group, unit;
1424 unsigned int cpu, tcpu;
1425 struct pcpu_alloc_info *ai;
1426 unsigned int *cpu_map;
1427
1428 /* this function may be called multiple times */
1429 memset(group_map, 0, sizeof(group_map));
1430 memset(group_cnt, 0, sizeof(group_cnt));
1431
1432 /* calculate size_sum and ensure dyn_size is enough for early alloc */
1433 size_sum = PFN_ALIGN(static_size + reserved_size +
1434 max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
1435 dyn_size = size_sum - static_size - reserved_size;
1436
1437 /*
1438 * Determine min_unit_size, alloc_size and max_upa such that
1439 * alloc_size is multiple of atom_size and is the smallest
1440 * which can accommodate 4k aligned segments which are equal to
1441 * or larger than min_unit_size.
1442 */
1443 min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
1444
1445 alloc_size = roundup(min_unit_size, atom_size);
1446 upa = alloc_size / min_unit_size;
1447 while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1448 upa--;
1449 max_upa = upa;
1450
1451 /* group cpus according to their proximity */
1452 for_each_possible_cpu(cpu) {
1453 group = 0;
1454 next_group:
1455 for_each_possible_cpu(tcpu) {
1456 if (cpu == tcpu)
1457 break;
1458 if (group_map[tcpu] == group && cpu_distance_fn &&
1459 (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
1460 cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
1461 group++;
1462 nr_groups = max(nr_groups, group + 1);
1463 goto next_group;
1464 }
1465 }
1466 group_map[cpu] = group;
1467 group_cnt[group]++;
1468 }
1469
1470 /*
1471 * Expand unit size until address space usage goes over 75%
1472 * and then as much as possible without using more address
1473 * space.
1474 */
1475 last_allocs = INT_MAX;
1476 for (upa = max_upa; upa; upa--) {
1477 int allocs = 0, wasted = 0;
1478
1479 if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
1480 continue;
1481
1482 for (group = 0; group < nr_groups; group++) {
1483 int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
1484 allocs += this_allocs;
1485 wasted += this_allocs * upa - group_cnt[group];
1486 }
1487
1488 /*
1489 * Don't accept if wastage is over 1/3. The
1490 * greater-than comparison ensures upa==1 always
1491 * passes the following check.
1492 */
1493 if (wasted > num_possible_cpus() / 3)
1494 continue;
1495
1496 /* and then don't consume more memory */
1497 if (allocs > last_allocs)
1498 break;
1499 last_allocs = allocs;
1500 best_upa = upa;
1501 }
1502 upa = best_upa;
1503
1504 /* allocate and fill alloc_info */
1505 for (group = 0; group < nr_groups; group++)
1506 nr_units += roundup(group_cnt[group], upa);
1507
1508 ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
1509 if (!ai)
1510 return ERR_PTR(-ENOMEM);
1511 cpu_map = ai->groups[0].cpu_map;
1512
1513 for (group = 0; group < nr_groups; group++) {
1514 ai->groups[group].cpu_map = cpu_map;
1515 cpu_map += roundup(group_cnt[group], upa);
1516 }
1517
1518 ai->static_size = static_size;
1519 ai->reserved_size = reserved_size;
1520 ai->dyn_size = dyn_size;
1521 ai->unit_size = alloc_size / upa;
1522 ai->atom_size = atom_size;
1523 ai->alloc_size = alloc_size;
1524
1525 for (group = 0, unit = 0; group_cnt[group]; group++) {
1526 struct pcpu_group_info *gi = &ai->groups[group];
1527
1528 /*
1529 * Initialize base_offset as if all groups are located
1530 * back-to-back. The caller should update this to
1531 * reflect actual allocation.
1532 */
1533 gi->base_offset = unit * ai->unit_size;
1534
1535 for_each_possible_cpu(cpu)
1536 if (group_map[cpu] == group)
1537 gi->cpu_map[gi->nr_units++] = cpu;
1538 gi->nr_units = roundup(gi->nr_units, upa);
1539 unit += gi->nr_units;
1540 }
1541 BUG_ON(unit != nr_units);
1542
1543 return ai;
1544}
1545#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
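Before sizing units, pcpu_build_alloc_info() partitions the possible CPUs so that only CPUs within LOCAL_DISTANCE of each other (both directions) share a group. A rough user-space model of that grouping pass follows; the 4x4 distance table and the cpu_distance() helper are invented example data, not real topology.

/*
 * User-space sketch of the CPU grouping step in pcpu_build_alloc_info():
 * CPUs whose pairwise distance stays at LOCAL_DISTANCE land in the same
 * group, anything further away starts or joins another group.  The
 * distance table is an invented two-node example and cpu_distance()
 * merely stands in for cpu_distance_fn.
 */
#include <stdio.h>

#define NR_CPUS        4
#define LOCAL_DISTANCE 10

static const int distance[NR_CPUS][NR_CPUS] = {
	{ 10, 10, 20, 20 },
	{ 10, 10, 20, 20 },
	{ 20, 20, 10, 10 },
	{ 20, 20, 10, 10 },
};

static int cpu_distance(int a, int b) { return distance[a][b]; }

int main(void)
{
	int group_map[NR_CPUS] = { 0 };
	int group_cnt[NR_CPUS] = { 0 };
	int nr_groups = 1, cpu, tcpu, group;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		group = 0;
	next_group:
		for (tcpu = 0; tcpu < NR_CPUS; tcpu++) {
			if (cpu == tcpu)
				break;
			/* an earlier CPU already sits in this group but is remote */
			if (group_map[tcpu] == group &&
			    (cpu_distance(cpu, tcpu) > LOCAL_DISTANCE ||
			     cpu_distance(tcpu, cpu) > LOCAL_DISTANCE)) {
				group++;
				if (group + 1 > nr_groups)
					nr_groups = group + 1;
				goto next_group;
			}
		}
		group_map[cpu] = group;
		group_cnt[group]++;
	}

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d -> group %d\n", cpu, group_map[cpu]);
	printf("nr_groups=%d (sizes: %d, %d)\n",
	       nr_groups, group_cnt[0], group_cnt[1]);
	return 0;
}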
1546
1547#if defined(BUILD_EMBED_FIRST_CHUNK)
1520/** 1548/**
1521 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem 1549 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
1522 * @reserved_size: the size of reserved percpu area in bytes 1550 * @reserved_size: the size of reserved percpu area in bytes
@@ -1524,7 +1552,7 @@ early_param("percpu_alloc", percpu_alloc_setup);
1524 * @atom_size: allocation atom size 1552 * @atom_size: allocation atom size
1525 * @cpu_distance_fn: callback to determine distance between cpus, optional 1553 * @cpu_distance_fn: callback to determine distance between cpus, optional
1526 * @alloc_fn: function to allocate percpu page 1554 * @alloc_fn: function to allocate percpu page
1527 * @free_fn: funtion to free percpu page 1555 * @free_fn: function to free percpu page
1528 * 1556 *
1529 * This is a helper to ease setting up embedded first percpu chunk and 1557 * This is a helper to ease setting up embedded first percpu chunk and
1530 * can be called where pcpu_setup_first_chunk() is expected. 1558 * can be called where pcpu_setup_first_chunk() is expected.
@@ -1619,8 +1647,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
1619 /* warn if maximum distance is further than 75% of vmalloc space */ 1647 /* warn if maximum distance is further than 75% of vmalloc space */
1620 if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { 1648 if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) {
1621 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " 1649 pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc "
1622 "space 0x%lx\n", 1650 "space 0x%lx\n", max_distance,
1623 max_distance, VMALLOC_END - VMALLOC_START); 1651 (unsigned long)(VMALLOC_END - VMALLOC_START));
1624#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 1652#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
1625 /* and fail if we have fallback */ 1653 /* and fail if we have fallback */
1626 rc = -EINVAL; 1654 rc = -EINVAL;
@@ -1645,15 +1673,14 @@ out_free:
1645 free_bootmem(__pa(areas), areas_size); 1673 free_bootmem(__pa(areas), areas_size);
1646 return rc; 1674 return rc;
1647} 1675}
1648#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK || 1676#endif /* BUILD_EMBED_FIRST_CHUNK */
1649 !CONFIG_HAVE_SETUP_PER_CPU_AREA */
1650 1677
1651#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK 1678#ifdef BUILD_PAGE_FIRST_CHUNK
1652/** 1679/**
1653 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages 1680 * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
1654 * @reserved_size: the size of reserved percpu area in bytes 1681 * @reserved_size: the size of reserved percpu area in bytes
1655 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE 1682 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
1656 * @free_fn: funtion to free percpu page, always called with PAGE_SIZE 1683 * @free_fn: function to free percpu page, always called with PAGE_SIZE
1657 * @populate_pte_fn: function to populate pte 1684 * @populate_pte_fn: function to populate pte
1658 * 1685 *
1659 * This is a helper to ease setting up page-remapped first percpu 1686 * This is a helper to ease setting up page-remapped first percpu
@@ -1756,10 +1783,11 @@ out_free_ar:
1756 pcpu_free_alloc_info(ai); 1783 pcpu_free_alloc_info(ai);
1757 return rc; 1784 return rc;
1758} 1785}
1759#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */ 1786#endif /* BUILD_PAGE_FIRST_CHUNK */
1760 1787
1788#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
1761/* 1789/*
1762 * Generic percpu area setup. 1790 * Generic SMP percpu area setup.
1763 * 1791 *
1764 * The embedding helper is used because its behavior closely resembles 1792 * The embedding helper is used because its behavior closely resembles
1765 * the original non-dynamic generic percpu area setup. This is 1793 * the original non-dynamic generic percpu area setup. This is
@@ -1770,7 +1798,6 @@ out_free_ar:
1770 * on the physical linear memory mapping which uses large page 1798 * on the physical linear memory mapping which uses large page
1771 * mappings on applicable archs. 1799 * mappings on applicable archs.
1772 */ 1800 */
1773#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
1774unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; 1801unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
1775EXPORT_SYMBOL(__per_cpu_offset); 1802EXPORT_SYMBOL(__per_cpu_offset);
1776 1803
@@ -1799,13 +1826,48 @@ void __init setup_per_cpu_areas(void)
1799 PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, 1826 PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
1800 pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); 1827 pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
1801 if (rc < 0) 1828 if (rc < 0)
1802 panic("Failed to initialized percpu areas."); 1829 panic("Failed to initialize percpu areas.");
1803 1830
1804 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; 1831 delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
1805 for_each_possible_cpu(cpu) 1832 for_each_possible_cpu(cpu)
1806 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; 1833 __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
1807} 1834}
1808#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ 1835#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
1836
1837#else /* CONFIG_SMP */
1838
1839/*
1840 * UP percpu area setup.
1841 *
1842 * UP always uses km-based percpu allocator with identity mapping.
1843 * Static percpu variables are indistinguishable from the usual static
1844 * variables and don't require any special preparation.
1845 */
1846void __init setup_per_cpu_areas(void)
1847{
1848 const size_t unit_size =
1849 roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
1850 PERCPU_DYNAMIC_RESERVE));
1851 struct pcpu_alloc_info *ai;
1852 void *fc;
1853
1854 ai = pcpu_alloc_alloc_info(1, 1);
1855 fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
1856 if (!ai || !fc)
1857 panic("Failed to allocate memory for percpu areas.");
1858
1859 ai->dyn_size = unit_size;
1860 ai->unit_size = unit_size;
1861 ai->atom_size = unit_size;
1862 ai->alloc_size = unit_size;
1863 ai->groups[0].nr_units = 1;
1864 ai->groups[0].cpu_map[0] = 0;
1865
1866 if (pcpu_setup_first_chunk(ai, fc) < 0)
1867 panic("Failed to initialize percpu areas.");
1868}
1869
1870#endif /* CONFIG_SMP */
1809 1871
1810/* 1872/*
1811 * First and reserved chunks are initialized with temporary allocation 1873 * First and reserved chunks are initialized with temporary allocation
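Whether the SMP path or the new UP setup_per_cpu_areas() above runs, the resulting addressing model is the same: a per-cpu variable lives at its base address plus a per-CPU offset, which on UP collapses to a single zero-offset unit. The sketch below models that scheme in user space; NR_CPUS_DEMO, UNIT_SIZE and the per_cpu_counter() accessor are invented for the example and do not reflect the kernel implementation.

/*
 * User-space model of the addressing scheme set up by
 * setup_per_cpu_areas(): one unit per possible CPU carved out of a
 * single first-chunk allocation, and a per-CPU offset added to a
 * variable's base address.  All names and sizes here are invented; on
 * UP the scheme degenerates to one unit with offset 0, which is why the
 * dedicated UP allocator could be dropped.
 */
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS_DEMO 4
#define UNIT_SIZE    4096

static char *base;                       /* first chunk */
static long unit_off[NR_CPUS_DEMO];      /* pcpu_unit_offsets[] stand-in */

/* "static per-cpu variable": a fixed offset inside each unit */
static const size_t counter_off = 128;

static long *per_cpu_counter(int cpu)
{
	return (long *)(base + unit_off[cpu] + counter_off);
}

int main(void)
{
	int cpu;

	base = calloc(NR_CPUS_DEMO, UNIT_SIZE);
	if (!base)
		return 1;
	for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
		unit_off[cpu] = (long)cpu * UNIT_SIZE;   /* back-to-back units */

	for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
		*per_cpu_counter(cpu) = 100 + cpu;

	for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
		printf("cpu%d counter=%ld\n", cpu, *per_cpu_counter(cpu));
	free(base);
	return 0;
}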
diff --git a/mm/percpu_up.c b/mm/percpu_up.c
deleted file mode 100644
index db884fae5721..000000000000
--- a/mm/percpu_up.c
+++ /dev/null
@@ -1,30 +0,0 @@
1/*
2 * mm/percpu_up.c - dummy percpu memory allocator implementation for UP
3 */
4
5#include <linux/module.h>
6#include <linux/percpu.h>
7#include <linux/slab.h>
8
9void __percpu *__alloc_percpu(size_t size, size_t align)
10{
11 /*
12 * Can't easily make larger alignment work with kmalloc. WARN
13 * on it. Larger alignment should only be used for module
14 * percpu sections on SMP for which this path isn't used.
15 */
16 WARN_ON_ONCE(align > SMP_CACHE_BYTES);
17 return (void __percpu __force *)kzalloc(size, GFP_KERNEL);
18}
19EXPORT_SYMBOL_GPL(__alloc_percpu);
20
21void free_percpu(void __percpu *p)
22{
23 kfree(this_cpu_ptr(p));
24}
25EXPORT_SYMBOL_GPL(free_percpu);
26
27phys_addr_t per_cpu_ptr_to_phys(void *addr)
28{
29 return __pa(addr);
30}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
new file mode 100644
index 000000000000..eb663fb533e0
--- /dev/null
+++ b/mm/pgtable-generic.c
@@ -0,0 +1,121 @@
1/*
2 * mm/pgtable-generic.c
3 *
4 * Generic pgtable methods declared in asm-generic/pgtable.h
5 *
6 * Copyright (C) 2010 Linus Torvalds
7 */
8
9#include <linux/pagemap.h>
10#include <asm/tlb.h>
11#include <asm-generic/pgtable.h>
12
13#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
14/*
15 * Only sets the access flags (dirty, accessed, and
16 * writable). Furthermore, we know it always gets set to a "more
17 * permissive" setting, which allows most architectures to optimize
18 * this. We return whether the PTE actually changed, which in turn
19 * instructs the caller to do things like update_mmu_cache. This
20 * used to be done in the caller, but sparc needs minor faults to
21 * force that call on sun4c so we changed this macro slightly
22 */
23int ptep_set_access_flags(struct vm_area_struct *vma,
24 unsigned long address, pte_t *ptep,
25 pte_t entry, int dirty)
26{
27 int changed = !pte_same(*ptep, entry);
28 if (changed) {
29 set_pte_at(vma->vm_mm, address, ptep, entry);
30 flush_tlb_page(vma, address);
31 }
32 return changed;
33}
34#endif
35
36#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
37int pmdp_set_access_flags(struct vm_area_struct *vma,
38 unsigned long address, pmd_t *pmdp,
39 pmd_t entry, int dirty)
40{
41#ifdef CONFIG_TRANSPARENT_HUGEPAGE
42 int changed = !pmd_same(*pmdp, entry);
43 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
44 if (changed) {
45 set_pmd_at(vma->vm_mm, address, pmdp, entry);
46 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
47 }
48 return changed;
49#else /* CONFIG_TRANSPARENT_HUGEPAGE */
50 BUG();
51 return 0;
52#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
53}
54#endif
55
56#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
57int ptep_clear_flush_young(struct vm_area_struct *vma,
58 unsigned long address, pte_t *ptep)
59{
60 int young;
61 young = ptep_test_and_clear_young(vma, address, ptep);
62 if (young)
63 flush_tlb_page(vma, address);
64 return young;
65}
66#endif
67
68#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
69int pmdp_clear_flush_young(struct vm_area_struct *vma,
70 unsigned long address, pmd_t *pmdp)
71{
72 int young;
73#ifndef CONFIG_TRANSPARENT_HUGEPAGE
74 BUG();
75#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
76 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
77 young = pmdp_test_and_clear_young(vma, address, pmdp);
78 if (young)
79 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
80 return young;
81}
82#endif
83
84#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
85pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
86 pte_t *ptep)
87{
88 pte_t pte;
89 pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
90 flush_tlb_page(vma, address);
91 return pte;
92}
93#endif
94
95#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH
96#ifdef CONFIG_TRANSPARENT_HUGEPAGE
97pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address,
98 pmd_t *pmdp)
99{
100 pmd_t pmd;
101 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
102 pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp);
103 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
104 return pmd;
105}
106#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
107#endif
108
109#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH
110#ifdef CONFIG_TRANSPARENT_HUGEPAGE
111pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
112 pmd_t *pmdp)
113{
114 pmd_t pmd = pmd_mksplitting(*pmdp);
115 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
116 set_pmd_at(vma->vm_mm, address, pmdp, pmd);
117 /* tlb flush only to serialize against gup-fast */
118 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
119}
120#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
121#endif
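The new file above supplies generic page-table helpers that are compiled out whenever an architecture defines the matching __HAVE_ARCH_* macro and provides its own version. A minimal user-space sketch of that override pattern follows; ARCH_FAST, HAVE_ARCH_FLUSH_ONE and flush_one() are made-up names used only to show the preprocessor structure (build with and without -DARCH_FAST to exercise both paths).

/*
 * Sketch of the override pattern used by mm/pgtable-generic.c: a generic
 * helper is compiled only when the "architecture" has not announced its
 * own via a HAVE_ARCH-style macro.  All names here are invented for
 * illustration.
 */
#include <stdio.h>

#ifdef ARCH_FAST
#define HAVE_ARCH_FLUSH_ONE
static void flush_one(unsigned long addr)
{
	printf("arch-specific flush of %#lx\n", addr);
}
#endif

#ifndef HAVE_ARCH_FLUSH_ONE
/* fallback used by every arch that did not provide its own */
static void flush_one(unsigned long addr)
{
	printf("generic flush of %#lx\n", addr);
}
#endif

int main(void)
{
	flush_one(0x1000UL);
	return 0;
}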
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
index 603ae98d9694..799dcfd7cd8c 100644
--- a/mm/prio_tree.c
+++ b/mm/prio_tree.c
@@ -13,6 +13,7 @@
13 13
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/prio_tree.h> 15#include <linux/prio_tree.h>
16#include <linux/prefetch.h>
16 17
17/* 18/*
18 * See lib/prio_tree.c for details on the general radix priority search tree 19 * See lib/prio_tree.c for details on the general radix priority search tree
diff --git a/mm/readahead.c b/mm/readahead.c
index 77506a291a2d..867f9dd82dcd 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -109,9 +109,12 @@ EXPORT_SYMBOL(read_cache_pages);
109static int read_pages(struct address_space *mapping, struct file *filp, 109static int read_pages(struct address_space *mapping, struct file *filp,
110 struct list_head *pages, unsigned nr_pages) 110 struct list_head *pages, unsigned nr_pages)
111{ 111{
112 struct blk_plug plug;
112 unsigned page_idx; 113 unsigned page_idx;
113 int ret; 114 int ret;
114 115
116 blk_start_plug(&plug);
117
115 if (mapping->a_ops->readpages) { 118 if (mapping->a_ops->readpages) {
116 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); 119 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
117 /* Clean up the remaining pages */ 120 /* Clean up the remaining pages */
@@ -129,7 +132,10 @@ static int read_pages(struct address_space *mapping, struct file *filp,
129 page_cache_release(page); 132 page_cache_release(page);
130 } 133 }
131 ret = 0; 134 ret = 0;
135
132out: 136out:
137 blk_finish_plug(&plug);
138
133 return ret; 139 return ret;
134} 140}
135 141
@@ -174,7 +180,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
174 if (page) 180 if (page)
175 continue; 181 continue;
176 182
177 page = page_cache_alloc_cold(mapping); 183 page = page_cache_alloc_readahead(mapping);
178 if (!page) 184 if (!page)
179 break; 185 break;
180 page->index = page_offset; 186 page->index = page_offset;
@@ -554,17 +560,5 @@ page_cache_async_readahead(struct address_space *mapping,
554 560
555 /* do read-ahead */ 561 /* do read-ahead */
556 ondemand_readahead(mapping, ra, filp, true, offset, req_size); 562 ondemand_readahead(mapping, ra, filp, true, offset, req_size);
557
558#ifdef CONFIG_BLOCK
559 /*
560 * Normally the current page is !uptodate and lock_page() will be
561 * immediately called to implicitly unplug the device. However this
562 * is not always true for RAID configurations, where data arrives
563 * not strictly in their submission order. In this case we need to
564 * explicitly kick off the IO.
565 */
566 if (PageUptodate(page))
567 blk_run_backing_dev(mapping->backing_dev_info, NULL);
568#endif
569} 563}
570EXPORT_SYMBOL_GPL(page_cache_async_readahead); 564EXPORT_SYMBOL_GPL(page_cache_async_readahead);
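With the explicit blk_run_backing_dev() kick removed, read_pages() above relies on plugging instead: requests issued between blk_start_plug() and blk_finish_plug() are batched and dispatched together. The user-space sketch below models only that collect-then-dispatch idea; the demo_plug structure, queue_io() and submit_io() are invented stand-ins, not block-layer API.

/*
 * User-space model of the plugging idea behind the readahead change:
 * work issued inside a plug scope is only queued, and the whole batch is
 * dispatched when the plug is finished.  Everything here is an invented
 * stand-in for the real plug machinery.
 */
#include <stdio.h>

#define MAX_BATCH 16

struct demo_plug {
	int nr;
	int reqs[MAX_BATCH];
};

static void plug_start(struct demo_plug *p) { p->nr = 0; }

static void submit_io(int sector)
{
	printf("dispatching I/O for sector %d\n", sector);
}

static void plug_finish(struct demo_plug *p)
{
	int i;

	/* one dispatch pass for everything queued while plugged */
	for (i = 0; i < p->nr; i++)
		submit_io(p->reqs[i]);
	p->nr = 0;
}

static void queue_io(struct demo_plug *p, int sector)
{
	if (p->nr < MAX_BATCH)
		p->reqs[p->nr++] = sector;
	else
		submit_io(sector);      /* batch full: dispatch directly */
}

int main(void)
{
	struct demo_plug plug;
	int i;

	plug_start(&plug);
	for (i = 0; i < 8; i++)         /* "readpage" for 8 pages */
		queue_io(&plug, 1000 + i);
	plug_finish(&plug);             /* single dispatch for the batch */
	return 0;
}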
diff --git a/mm/rmap.c b/mm/rmap.c
index 92e6757f196e..23295f65ae43 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -24,22 +24,22 @@
24 * inode->i_alloc_sem (vmtruncate_range) 24 * inode->i_alloc_sem (vmtruncate_range)
25 * mm->mmap_sem 25 * mm->mmap_sem
26 * page->flags PG_locked (lock_page) 26 * page->flags PG_locked (lock_page)
27 * mapping->i_mmap_lock 27 * mapping->i_mmap_mutex
28 * anon_vma->lock 28 * anon_vma->mutex
29 * mm->page_table_lock or pte_lock 29 * mm->page_table_lock or pte_lock
30 * zone->lru_lock (in mark_page_accessed, isolate_lru_page) 30 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
31 * swap_lock (in swap_duplicate, swap_info_get) 31 * swap_lock (in swap_duplicate, swap_info_get)
32 * mmlist_lock (in mmput, drain_mmlist and others) 32 * mmlist_lock (in mmput, drain_mmlist and others)
33 * mapping->private_lock (in __set_page_dirty_buffers) 33 * mapping->private_lock (in __set_page_dirty_buffers)
34 * inode_lock (in set_page_dirty's __mark_inode_dirty) 34 * inode->i_lock (in set_page_dirty's __mark_inode_dirty)
35 * inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
35 * sb_lock (within inode_lock in fs/fs-writeback.c) 36 * sb_lock (within inode_lock in fs/fs-writeback.c)
36 * mapping->tree_lock (widely used, in set_page_dirty, 37 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 38 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_lock in __sync_single_inode) 39 * within inode_wb_list_lock in __sync_single_inode)
39 * 40 *
40 * (code doesn't rely on that order so it could be switched around) 41 * anon_vma->mutex,mapping->i_mutex (memory_failure, collect_procs_anon)
41 * ->tasklist_lock 42 * ->tasklist_lock
42 * anon_vma->lock (memory_failure, collect_procs_anon)
43 * pte map lock 43 * pte map lock
44 */ 44 */
45 45
@@ -67,20 +67,56 @@ static struct kmem_cache *anon_vma_chain_cachep;
67 67
68static inline struct anon_vma *anon_vma_alloc(void) 68static inline struct anon_vma *anon_vma_alloc(void)
69{ 69{
70 return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); 70 struct anon_vma *anon_vma;
71
72 anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
73 if (anon_vma) {
74 atomic_set(&anon_vma->refcount, 1);
75 /*
76 * Initialise the anon_vma root to point to itself. If called
77 * from fork, the root will be reset to the parent's anon_vma.
78 */
79 anon_vma->root = anon_vma;
80 }
81
82 return anon_vma;
71} 83}
72 84
73void anon_vma_free(struct anon_vma *anon_vma) 85static inline void anon_vma_free(struct anon_vma *anon_vma)
74{ 86{
87 VM_BUG_ON(atomic_read(&anon_vma->refcount));
88
89 /*
90 * Synchronize against page_lock_anon_vma() such that
91 * we can safely hold the lock without the anon_vma getting
92 * freed.
93 *
94 * Relies on the full mb implied by the atomic_dec_and_test() from
95 * put_anon_vma() against the acquire barrier implied by
96 * mutex_trylock() from page_lock_anon_vma(). This orders:
97 *
98 * page_lock_anon_vma() VS put_anon_vma()
99 * mutex_trylock() atomic_dec_and_test()
100 * LOCK MB
101 * atomic_read() mutex_is_locked()
102 *
103 * LOCK should suffice since the actual taking of the lock must
104 * happen _before_ what follows.
105 */
106 if (mutex_is_locked(&anon_vma->root->mutex)) {
107 anon_vma_lock(anon_vma);
108 anon_vma_unlock(anon_vma);
109 }
110
75 kmem_cache_free(anon_vma_cachep, anon_vma); 111 kmem_cache_free(anon_vma_cachep, anon_vma);
76} 112}
77 113
78static inline struct anon_vma_chain *anon_vma_chain_alloc(void) 114static inline struct anon_vma_chain *anon_vma_chain_alloc(gfp_t gfp)
79{ 115{
80 return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); 116 return kmem_cache_alloc(anon_vma_chain_cachep, gfp);
81} 117}
82 118
83void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) 119static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
84{ 120{
85 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); 121 kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain);
86} 122}
@@ -94,7 +130,7 @@ void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain)
94 * anonymous pages mapped into it with that anon_vma. 130 * anonymous pages mapped into it with that anon_vma.
95 * 131 *
96 * The common case will be that we already have one, but if 132 * The common case will be that we already have one, but if
97 * if not we either need to find an adjacent mapping that we 133 * not we either need to find an adjacent mapping that we
98 * can re-use the anon_vma from (very common when the only 134 * can re-use the anon_vma from (very common when the only
99 * reason for splitting a vma has been mprotect()), or we 135 * reason for splitting a vma has been mprotect()), or we
100 * allocate a new one. 136 * allocate a new one.
@@ -122,7 +158,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
122 struct mm_struct *mm = vma->vm_mm; 158 struct mm_struct *mm = vma->vm_mm;
123 struct anon_vma *allocated; 159 struct anon_vma *allocated;
124 160
125 avc = anon_vma_chain_alloc(); 161 avc = anon_vma_chain_alloc(GFP_KERNEL);
126 if (!avc) 162 if (!avc)
127 goto out_enomem; 163 goto out_enomem;
128 164
@@ -133,11 +169,6 @@ int anon_vma_prepare(struct vm_area_struct *vma)
133 if (unlikely(!anon_vma)) 169 if (unlikely(!anon_vma))
134 goto out_enomem_free_avc; 170 goto out_enomem_free_avc;
135 allocated = anon_vma; 171 allocated = anon_vma;
136 /*
137 * This VMA had no anon_vma yet. This anon_vma is
138 * the root of any anon_vma tree that might form.
139 */
140 anon_vma->root = anon_vma;
141 } 172 }
142 173
143 anon_vma_lock(anon_vma); 174 anon_vma_lock(anon_vma);
@@ -156,7 +187,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
156 anon_vma_unlock(anon_vma); 187 anon_vma_unlock(anon_vma);
157 188
158 if (unlikely(allocated)) 189 if (unlikely(allocated))
159 anon_vma_free(allocated); 190 put_anon_vma(allocated);
160 if (unlikely(avc)) 191 if (unlikely(avc))
161 anon_vma_chain_free(avc); 192 anon_vma_chain_free(avc);
162 } 193 }
@@ -168,6 +199,32 @@ int anon_vma_prepare(struct vm_area_struct *vma)
168 return -ENOMEM; 199 return -ENOMEM;
169} 200}
170 201
202/*
203 * This is a useful helper function for locking the anon_vma root as
204 * we traverse the vma->anon_vma_chain, looping over anon_vma's that
205 * have the same vma.
206 *
207 * Such anon_vma's should have the same root, so you'd expect to see
208 * just a single mutex_lock for the whole traversal.
209 */
210static inline struct anon_vma *lock_anon_vma_root(struct anon_vma *root, struct anon_vma *anon_vma)
211{
212 struct anon_vma *new_root = anon_vma->root;
213 if (new_root != root) {
214 if (WARN_ON_ONCE(root))
215 mutex_unlock(&root->mutex);
216 root = new_root;
217 mutex_lock(&root->mutex);
218 }
219 return root;
220}
221
222static inline void unlock_anon_vma_root(struct anon_vma *root)
223{
224 if (root)
225 mutex_unlock(&root->mutex);
226}
227
171static void anon_vma_chain_link(struct vm_area_struct *vma, 228static void anon_vma_chain_link(struct vm_area_struct *vma,
172 struct anon_vma_chain *avc, 229 struct anon_vma_chain *avc,
173 struct anon_vma *anon_vma) 230 struct anon_vma *anon_vma)
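lock_anon_vma_root()/unlock_anon_vma_root() above exist so that a chain whose elements usually share one root does not re-take the same mutex for every element: the lock is only dropped and retaken when the root actually changes. The sketch below models that batching in user space with pthread mutexes (build with -pthread); the root/item structures and their names are invented for the example.

/*
 * User-space model of "only relock when the root changes" while walking
 * a chain.  All types and names are invented stand-ins for the anon_vma
 * root mutex batching.
 */
#include <pthread.h>
#include <stdio.h>
#include <stddef.h>

struct root {
	pthread_mutex_t mutex;
	const char *name;
};

struct item {
	struct root *root;
};

static struct root *lock_root(struct root *held, struct root *wanted)
{
	if (wanted != held) {
		if (held)
			pthread_mutex_unlock(&held->mutex);
		pthread_mutex_lock(&wanted->mutex);
		printf("took lock on %s\n", wanted->name);
		held = wanted;
	}
	return held;
}

static void unlock_root(struct root *held)
{
	if (held)
		pthread_mutex_unlock(&held->mutex);
}

int main(void)
{
	static struct root a = { PTHREAD_MUTEX_INITIALIZER, "root A" };
	static struct root b = { PTHREAD_MUTEX_INITIALIZER, "root B" };
	struct item chain[] = { { &a }, { &a }, { &a }, { &b } };
	struct root *held = NULL;
	size_t i;

	for (i = 0; i < sizeof(chain) / sizeof(chain[0]); i++) {
		held = lock_root(held, chain[i].root);
		/* ... work on chain[i] under its root lock ... */
	}
	unlock_root(held);      /* only two lock acquisitions for four items */
	return 0;
}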
@@ -176,9 +233,11 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
176 avc->anon_vma = anon_vma; 233 avc->anon_vma = anon_vma;
177 list_add(&avc->same_vma, &vma->anon_vma_chain); 234 list_add(&avc->same_vma, &vma->anon_vma_chain);
178 235
179 anon_vma_lock(anon_vma); 236 /*
237 * It's critical to add new vmas to the tail of the anon_vma,
238 * see comment in huge_memory.c:__split_huge_page().
239 */
180 list_add_tail(&avc->same_anon_vma, &anon_vma->head); 240 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
181 anon_vma_unlock(anon_vma);
182} 241}
183 242
184/* 243/*
@@ -188,13 +247,24 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
188int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) 247int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
189{ 248{
190 struct anon_vma_chain *avc, *pavc; 249 struct anon_vma_chain *avc, *pavc;
250 struct anon_vma *root = NULL;
191 251
192 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { 252 list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) {
193 avc = anon_vma_chain_alloc(); 253 struct anon_vma *anon_vma;
194 if (!avc) 254
195 goto enomem_failure; 255 avc = anon_vma_chain_alloc(GFP_NOWAIT | __GFP_NOWARN);
196 anon_vma_chain_link(dst, avc, pavc->anon_vma); 256 if (unlikely(!avc)) {
257 unlock_anon_vma_root(root);
258 root = NULL;
259 avc = anon_vma_chain_alloc(GFP_KERNEL);
260 if (!avc)
261 goto enomem_failure;
262 }
263 anon_vma = pavc->anon_vma;
264 root = lock_anon_vma_root(root, anon_vma);
265 anon_vma_chain_link(dst, avc, anon_vma);
197 } 266 }
267 unlock_anon_vma_root(root);
198 return 0; 268 return 0;
199 269
200 enomem_failure: 270 enomem_failure:
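anon_vma_clone() above first tries a GFP_NOWAIT | __GFP_NOWARN allocation so it can stay under the batched root lock, and only when that fails drops the lock and retries with a blocking GFP_KERNEL allocation. A user-space model of the same try-fast-then-block-without-the-lock pattern is sketched below; alloc_fast(), alloc_slow() and the failure counter are invented stand-ins for the GFP flags and allocator behaviour.

/*
 * Sketch of "opportunistic non-blocking allocation under a lock, with a
 * blocking fallback after dropping it".  The allocators and the lock
 * here are invented stand-ins.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t root_lock = PTHREAD_MUTEX_INITIALIZER;
static int fast_failures_left = 2;      /* pretend the fast pool is short */

static void *alloc_fast(size_t sz)      /* may fail, never "sleeps" */
{
	if (fast_failures_left > 0) {
		fast_failures_left--;
		return NULL;
	}
	return malloc(sz);
}

static void *alloc_slow(size_t sz)      /* may "sleep", expected to succeed */
{
	return malloc(sz);
}

int main(void)
{
	int i;

	pthread_mutex_lock(&root_lock);
	for (i = 0; i < 4; i++) {
		void *p = alloc_fast(64);

		if (!p) {
			/* can't block with the lock held: drop, alloc, retake */
			pthread_mutex_unlock(&root_lock);
			p = alloc_slow(64);
			if (!p)
				return 1;
			pthread_mutex_lock(&root_lock);
			printf("item %d: used blocking fallback\n", i);
		} else {
			printf("item %d: fast allocation\n", i);
		}
		free(p);
	}
	pthread_mutex_unlock(&root_lock);
	return 0;
}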
@@ -227,7 +297,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
227 anon_vma = anon_vma_alloc(); 297 anon_vma = anon_vma_alloc();
228 if (!anon_vma) 298 if (!anon_vma)
229 goto out_error; 299 goto out_error;
230 avc = anon_vma_chain_alloc(); 300 avc = anon_vma_chain_alloc(GFP_KERNEL);
231 if (!avc) 301 if (!avc)
232 goto out_error_free_anon_vma; 302 goto out_error_free_anon_vma;
233 303
@@ -237,58 +307,63 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
237 */ 307 */
238 anon_vma->root = pvma->anon_vma->root; 308 anon_vma->root = pvma->anon_vma->root;
239 /* 309 /*
240 * With KSM refcounts, an anon_vma can stay around longer than the 310 * With refcounts, an anon_vma can stay around longer than the
241 * process it belongs to. The root anon_vma needs to be pinned 311 * process it belongs to. The root anon_vma needs to be pinned until
242 * until this anon_vma is freed, because the lock lives in the root. 312 * this anon_vma is freed, because the lock lives in the root.
243 */ 313 */
244 get_anon_vma(anon_vma->root); 314 get_anon_vma(anon_vma->root);
245 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 315 /* Mark this anon_vma as the one where our new (COWed) pages go. */
246 vma->anon_vma = anon_vma; 316 vma->anon_vma = anon_vma;
317 anon_vma_lock(anon_vma);
247 anon_vma_chain_link(vma, avc, anon_vma); 318 anon_vma_chain_link(vma, avc, anon_vma);
319 anon_vma_unlock(anon_vma);
248 320
249 return 0; 321 return 0;
250 322
251 out_error_free_anon_vma: 323 out_error_free_anon_vma:
252 anon_vma_free(anon_vma); 324 put_anon_vma(anon_vma);
253 out_error: 325 out_error:
254 unlink_anon_vmas(vma); 326 unlink_anon_vmas(vma);
255 return -ENOMEM; 327 return -ENOMEM;
256} 328}
257 329
258static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) 330void unlink_anon_vmas(struct vm_area_struct *vma)
259{ 331{
260 struct anon_vma *anon_vma = anon_vma_chain->anon_vma; 332 struct anon_vma_chain *avc, *next;
261 int empty; 333 struct anon_vma *root = NULL;
262 334
263 /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */ 335 /*
264 if (!anon_vma) 336 * Unlink each anon_vma chained to the VMA. This list is ordered
265 return; 337 * from newest to oldest, ensuring the root anon_vma gets freed last.
338 */
339 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
340 struct anon_vma *anon_vma = avc->anon_vma;
266 341
267 anon_vma_lock(anon_vma); 342 root = lock_anon_vma_root(root, anon_vma);
268 list_del(&anon_vma_chain->same_anon_vma); 343 list_del(&avc->same_anon_vma);
269 344
270 /* We must garbage collect the anon_vma if it's empty */ 345 /*
271 empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); 346 * Leave empty anon_vmas on the list - we'll need
272 anon_vma_unlock(anon_vma); 347 * to free them outside the lock.
348 */
349 if (list_empty(&anon_vma->head))
350 continue;
273 351
274 if (empty) { 352 list_del(&avc->same_vma);
275 /* We no longer need the root anon_vma */ 353 anon_vma_chain_free(avc);
276 if (anon_vma->root != anon_vma)
277 drop_anon_vma(anon_vma->root);
278 anon_vma_free(anon_vma);
279 } 354 }
280} 355 unlock_anon_vma_root(root);
281
282void unlink_anon_vmas(struct vm_area_struct *vma)
283{
284 struct anon_vma_chain *avc, *next;
285 356
286 /* 357 /*
287 * Unlink each anon_vma chained to the VMA. This list is ordered 358 * Iterate the list once more, it now only contains empty and unlinked
288 * from newest to oldest, ensuring the root anon_vma gets freed last. 359 * anon_vmas, destroy them. Could not do before due to __put_anon_vma()
360 * needing to acquire the anon_vma->root->mutex.
289 */ 361 */
290 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 362 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
291 anon_vma_unlink(avc); 363 struct anon_vma *anon_vma = avc->anon_vma;
364
365 put_anon_vma(anon_vma);
366
292 list_del(&avc->same_vma); 367 list_del(&avc->same_vma);
293 anon_vma_chain_free(avc); 368 anon_vma_chain_free(avc);
294 } 369 }
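The reworked unlink_anon_vmas() above tears the chain down in two passes: entries are unlinked while the root mutex is held, but anon_vmas whose final put could itself need that mutex are only freed after the lock is dropped. A rough user-space sketch of that split follows; the node list, the needs_late_free flag and free_node() are invented to show the shape of the two passes, not the kernel logic itself.

/*
 * Sketch of a two-pass teardown: unlink under a shared lock, defer any
 * freeing that itself needs locking until after the lock is dropped.
 * All names here are invented for the example.
 */
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

struct node {
	struct node *next;
	int needs_late_free;    /* e.g. the final put may take the lock */
	int id;
};

static pthread_mutex_t shared_lock = PTHREAD_MUTEX_INITIALIZER;

static void free_node(struct node *n)
{
	printf("freeing node %d\n", n->id);
	free(n);
}

int main(void)
{
	struct node *head = NULL, *n, *next, *deferred = NULL;
	int i;

	for (i = 0; i < 4; i++) {
		n = malloc(sizeof(*n));
		if (!n)
			return 1;
		n->id = i;
		n->needs_late_free = (i % 2);   /* pretend half need deferral */
		n->next = head;
		head = n;
	}

	/* pass 1: unlink under the lock, defer the awkward frees */
	pthread_mutex_lock(&shared_lock);
	for (n = head; n; n = next) {
		next = n->next;
		if (n->needs_late_free) {
			n->next = deferred;
			deferred = n;
		} else {
			free_node(n);
		}
	}
	pthread_mutex_unlock(&shared_lock);

	/* pass 2: with the lock dropped, free what was deferred */
	for (n = deferred; n; n = next) {
		next = n->next;
		free_node(n);
	}
	return 0;
}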
@@ -298,8 +373,8 @@ static void anon_vma_ctor(void *data)
298{ 373{
299 struct anon_vma *anon_vma = data; 374 struct anon_vma *anon_vma = data;
300 375
301 spin_lock_init(&anon_vma->lock); 376 mutex_init(&anon_vma->mutex);
302 anonvma_external_refcount_init(anon_vma); 377 atomic_set(&anon_vma->refcount, 0);
303 INIT_LIST_HEAD(&anon_vma->head); 378 INIT_LIST_HEAD(&anon_vma->head);
304} 379}
305 380
@@ -311,12 +386,31 @@ void __init anon_vma_init(void)
311} 386}
312 387
313/* 388/*
314 * Getting a lock on a stable anon_vma from a page off the LRU is 389 * Getting a lock on a stable anon_vma from a page off the LRU is tricky!
315 * tricky: page_lock_anon_vma rely on RCU to guard against the races. 390 *
391 * Since there is no serialization whatsoever against page_remove_rmap()
392 * the best this function can do is return a locked anon_vma that might
393 * have been relevant to this page.
394 *
395 * The page might have been remapped to a different anon_vma or the anon_vma
396 * returned may already be freed (and even reused).
397 *
398 * In case it was remapped to a different anon_vma, the new anon_vma will be a
399 * child of the old anon_vma, and the anon_vma lifetime rules will therefore
400 * ensure that any anon_vma obtained from the page will still be valid for as
401 * long as we observe page_mapped() [ hence all those page_mapped() tests ].
402 *
403 * All users of this function must be very careful when walking the anon_vma
404 * chain and verify that the page in question is indeed mapped in it
405 * [ something equivalent to page_mapped_in_vma() ].
406 *
407 * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
408 * that the anon_vma pointer from page->mapping is valid if there is a
409 * mapcount, we can dereference the anon_vma after observing those.
316 */ 410 */
317struct anon_vma *page_lock_anon_vma(struct page *page) 411struct anon_vma *page_get_anon_vma(struct page *page)
318{ 412{
319 struct anon_vma *anon_vma, *root_anon_vma; 413 struct anon_vma *anon_vma = NULL;
320 unsigned long anon_mapping; 414 unsigned long anon_mapping;
321 415
322 rcu_read_lock(); 416 rcu_read_lock();
@@ -327,30 +421,100 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
327 goto out; 421 goto out;
328 422
329 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 423 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
330 root_anon_vma = ACCESS_ONCE(anon_vma->root); 424 if (!atomic_inc_not_zero(&anon_vma->refcount)) {
331 spin_lock(&root_anon_vma->lock); 425 anon_vma = NULL;
426 goto out;
427 }
332 428
333 /* 429 /*
334 * If this page is still mapped, then its anon_vma cannot have been 430 * If this page is still mapped, then its anon_vma cannot have been
335 * freed. But if it has been unmapped, we have no security against 431 * freed. But if it has been unmapped, we have no security against the
336 * the anon_vma structure being freed and reused (for another anon_vma: 432 * anon_vma structure being freed and reused (for another anon_vma:
337 * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot 433 * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero()
338 * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting 434 * above cannot corrupt).
339 * anon_vma->root before page_unlock_anon_vma() is called to unlock.
340 */ 435 */
341 if (page_mapped(page)) 436 if (!page_mapped(page)) {
342 return anon_vma; 437 put_anon_vma(anon_vma);
438 anon_vma = NULL;
439 }
440out:
441 rcu_read_unlock();
442
443 return anon_vma;
444}
445
446/*
447 * Similar to page_get_anon_vma() except it locks the anon_vma.
448 *
449 * It's a little more complex as it tries to keep the fast path to a single
450 * atomic op -- the trylock. If we fail the trylock, we fall back to getting a
451 * reference like with page_get_anon_vma() and then block on the mutex.
452 */
453struct anon_vma *page_lock_anon_vma(struct page *page)
454{
455 struct anon_vma *anon_vma = NULL;
456 struct anon_vma *root_anon_vma;
457 unsigned long anon_mapping;
458
459 rcu_read_lock();
460 anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
461 if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
462 goto out;
463 if (!page_mapped(page))
464 goto out;
465
466 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
467 root_anon_vma = ACCESS_ONCE(anon_vma->root);
468 if (mutex_trylock(&root_anon_vma->mutex)) {
469 /*
470 * If the page is still mapped, then this anon_vma is still
471 * its anon_vma, and holding the mutex ensures that it will
472 * not go away, see anon_vma_free().
473 */
474 if (!page_mapped(page)) {
475 mutex_unlock(&root_anon_vma->mutex);
476 anon_vma = NULL;
477 }
478 goto out;
479 }
480
481 /* trylock failed, we got to sleep */
482 if (!atomic_inc_not_zero(&anon_vma->refcount)) {
483 anon_vma = NULL;
484 goto out;
485 }
486
487 if (!page_mapped(page)) {
488 put_anon_vma(anon_vma);
489 anon_vma = NULL;
490 goto out;
491 }
492
493 /* we pinned the anon_vma, it's safe to sleep */
494 rcu_read_unlock();
495 anon_vma_lock(anon_vma);
496
497 if (atomic_dec_and_test(&anon_vma->refcount)) {
498 /*
499 * Oops, we held the last refcount, release the lock
500 * and bail -- can't simply use put_anon_vma() because
501 * we'll deadlock on the anon_vma_lock() recursion.
502 */
503 anon_vma_unlock(anon_vma);
504 __put_anon_vma(anon_vma);
505 anon_vma = NULL;
506 }
507
508 return anon_vma;
343 509
344 spin_unlock(&root_anon_vma->lock);
345out: 510out:
346 rcu_read_unlock(); 511 rcu_read_unlock();
347 return NULL; 512 return anon_vma;
348} 513}
349 514
350void page_unlock_anon_vma(struct anon_vma *anon_vma) 515void page_unlock_anon_vma(struct anon_vma *anon_vma)
351{ 516{
352 anon_vma_unlock(anon_vma); 517 anon_vma_unlock(anon_vma);
353 rcu_read_unlock();
354} 518}
355 519
356/* 520/*
@@ -358,7 +522,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma)
358 * Returns virtual address or -EFAULT if page's index/offset is not 522 * Returns virtual address or -EFAULT if page's index/offset is not
359 * within the range mapped the @vma. 523 * within the range mapped the @vma.
360 */ 524 */
361static inline unsigned long 525inline unsigned long
362vma_address(struct page *page, struct vm_area_struct *vma) 526vma_address(struct page *page, struct vm_area_struct *vma)
363{ 527{
364 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 528 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -407,7 +571,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
407 * 571 *
408 * On success returns with pte mapped and locked. 572 * On success returns with pte mapped and locked.
409 */ 573 */
410pte_t *page_check_address(struct page *page, struct mm_struct *mm, 574pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
411 unsigned long address, spinlock_t **ptlp, int sync) 575 unsigned long address, spinlock_t **ptlp, int sync)
412{ 576{
413 pgd_t *pgd; 577 pgd_t *pgd;
@@ -433,6 +597,8 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
433 pmd = pmd_offset(pud, address); 597 pmd = pmd_offset(pud, address);
434 if (!pmd_present(*pmd)) 598 if (!pmd_present(*pmd))
435 return NULL; 599 return NULL;
600 if (pmd_trans_huge(*pmd))
601 return NULL;
436 602
437 pte = pte_offset_map(pmd, address); 603 pte = pte_offset_map(pmd, address);
438 /* Make a quick check before getting the lock */ 604 /* Make a quick check before getting the lock */
@@ -487,35 +653,65 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
487 unsigned long *vm_flags) 653 unsigned long *vm_flags)
488{ 654{
489 struct mm_struct *mm = vma->vm_mm; 655 struct mm_struct *mm = vma->vm_mm;
490 pte_t *pte;
491 spinlock_t *ptl;
492 int referenced = 0; 656 int referenced = 0;
493 657
494 pte = page_check_address(page, mm, address, &ptl, 0); 658 if (unlikely(PageTransHuge(page))) {
495 if (!pte) 659 pmd_t *pmd;
496 goto out;
497
498 /*
499 * Don't want to elevate referenced for mlocked page that gets this far,
500 * in order that it progresses to try_to_unmap and is moved to the
501 * unevictable list.
502 */
503 if (vma->vm_flags & VM_LOCKED) {
504 *mapcount = 1; /* break early from loop */
505 *vm_flags |= VM_LOCKED;
506 goto out_unmap;
507 }
508 660
509 if (ptep_clear_flush_young_notify(vma, address, pte)) { 661 spin_lock(&mm->page_table_lock);
510 /* 662 /*
511 * Don't treat a reference through a sequentially read 663 * rmap might return false positives; we must filter
512 * mapping as such. If the page has been used in 664 * these out using page_check_address_pmd().
513 * another mapping, we will catch it; if this other
514 * mapping is already gone, the unmap path will have
515 * set PG_referenced or activated the page.
516 */ 665 */
517 if (likely(!VM_SequentialReadHint(vma))) 666 pmd = page_check_address_pmd(page, mm, address,
667 PAGE_CHECK_ADDRESS_PMD_FLAG);
668 if (!pmd) {
669 spin_unlock(&mm->page_table_lock);
670 goto out;
671 }
672
673 if (vma->vm_flags & VM_LOCKED) {
674 spin_unlock(&mm->page_table_lock);
675 *mapcount = 0; /* break early from loop */
676 *vm_flags |= VM_LOCKED;
677 goto out;
678 }
679
680 /* go ahead even if the pmd is pmd_trans_splitting() */
681 if (pmdp_clear_flush_young_notify(vma, address, pmd))
518 referenced++; 682 referenced++;
683 spin_unlock(&mm->page_table_lock);
684 } else {
685 pte_t *pte;
686 spinlock_t *ptl;
687
688 /*
689 * rmap might return false positives; we must filter
690 * these out using page_check_address().
691 */
692 pte = page_check_address(page, mm, address, &ptl, 0);
693 if (!pte)
694 goto out;
695
696 if (vma->vm_flags & VM_LOCKED) {
697 pte_unmap_unlock(pte, ptl);
698 *mapcount = 0; /* break early from loop */
699 *vm_flags |= VM_LOCKED;
700 goto out;
701 }
702
703 if (ptep_clear_flush_young_notify(vma, address, pte)) {
704 /*
705 * Don't treat a reference through a sequentially read
706 * mapping as such. If the page has been used in
707 * another mapping, we will catch it; if this other
708 * mapping is already gone, the unmap path will have
709 * set PG_referenced or activated the page.
710 */
711 if (likely(!VM_SequentialReadHint(vma)))
712 referenced++;
713 }
714 pte_unmap_unlock(pte, ptl);
519 } 715 }
520 716
521 /* Pretend the page is referenced if the task has the 717 /* Pretend the page is referenced if the task has the
@@ -524,9 +720,7 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
524 rwsem_is_locked(&mm->mmap_sem)) 720 rwsem_is_locked(&mm->mmap_sem))
525 referenced++; 721 referenced++;
526 722
527out_unmap:
528 (*mapcount)--; 723 (*mapcount)--;
529 pte_unmap_unlock(pte, ptl);
530 724
531 if (referenced) 725 if (referenced)
532 *vm_flags |= vma->vm_flags; 726 *vm_flags |= vma->vm_flags;
@@ -605,14 +799,14 @@ static int page_referenced_file(struct page *page,
605 * The page lock not only makes sure that page->mapping cannot 799 * The page lock not only makes sure that page->mapping cannot
606 * suddenly be NULLified by truncation, it makes sure that the 800 * suddenly be NULLified by truncation, it makes sure that the
607 * structure at mapping cannot be freed and reused yet, 801 * structure at mapping cannot be freed and reused yet,
608 * so we can safely take mapping->i_mmap_lock. 802 * so we can safely take mapping->i_mmap_mutex.
609 */ 803 */
610 BUG_ON(!PageLocked(page)); 804 BUG_ON(!PageLocked(page));
611 805
612 spin_lock(&mapping->i_mmap_lock); 806 mutex_lock(&mapping->i_mmap_mutex);
613 807
614 /* 808 /*
615 * i_mmap_lock does not stabilize mapcount at all, but mapcount 809 * i_mmap_mutex does not stabilize mapcount at all, but mapcount
616 * is more likely to be accurate if we note it after spinning. 810 * is more likely to be accurate if we note it after spinning.
617 */ 811 */
618 mapcount = page_mapcount(page); 812 mapcount = page_mapcount(page);
@@ -634,7 +828,7 @@ static int page_referenced_file(struct page *page,
634 break; 828 break;
635 } 829 }
636 830
637 spin_unlock(&mapping->i_mmap_lock); 831 mutex_unlock(&mapping->i_mmap_mutex);
638 return referenced; 832 return referenced;
639} 833}
640 834
@@ -678,7 +872,7 @@ int page_referenced(struct page *page,
678 unlock_page(page); 872 unlock_page(page);
679 } 873 }
680out: 874out:
681 if (page_test_and_clear_young(page)) 875 if (page_test_and_clear_young(page_to_pfn(page)))
682 referenced++; 876 referenced++;
683 877
684 return referenced; 878 return referenced;
@@ -721,7 +915,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
721 915
722 BUG_ON(PageAnon(page)); 916 BUG_ON(PageAnon(page));
723 917
724 spin_lock(&mapping->i_mmap_lock); 918 mutex_lock(&mapping->i_mmap_mutex);
725 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 919 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
726 if (vma->vm_flags & VM_SHARED) { 920 if (vma->vm_flags & VM_SHARED) {
727 unsigned long address = vma_address(page, vma); 921 unsigned long address = vma_address(page, vma);
@@ -730,7 +924,7 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
730 ret += page_mkclean_one(page, vma, address); 924 ret += page_mkclean_one(page, vma, address);
731 } 925 }
732 } 926 }
733 spin_unlock(&mapping->i_mmap_lock); 927 mutex_unlock(&mapping->i_mmap_mutex);
734 return ret; 928 return ret;
735} 929}
736 930
@@ -744,10 +938,8 @@ int page_mkclean(struct page *page)
744 struct address_space *mapping = page_mapping(page); 938 struct address_space *mapping = page_mapping(page);
745 if (mapping) { 939 if (mapping) {
746 ret = page_mkclean_file(mapping, page); 940 ret = page_mkclean_file(mapping, page);
747 if (page_test_dirty(page)) { 941 if (page_test_and_clear_dirty(page_to_pfn(page), 1))
748 page_clear_dirty(page);
749 ret = 1; 942 ret = 1;
750 }
751 } 943 }
752 } 944 }
753 945
@@ -780,10 +972,10 @@ void page_move_anon_rmap(struct page *page,
780} 972}
781 973
782/** 974/**
783 * __page_set_anon_rmap - setup new anonymous rmap 975 * __page_set_anon_rmap - set up new anonymous rmap
784 * @page: the page to add the mapping to 976 * @page: Page to add to rmap
785 * @vma: the vm area in which the mapping is added 977 * @vma: VM area to add page to.
786 * @address: the user virtual address mapped 978 * @address: User virtual address of the mapping
787 * @exclusive: the page is exclusively owned by the current process 979 * @exclusive: the page is exclusively owned by the current process
788 */ 980 */
789static void __page_set_anon_rmap(struct page *page, 981static void __page_set_anon_rmap(struct page *page,
@@ -793,25 +985,16 @@ static void __page_set_anon_rmap(struct page *page,
793 985
794 BUG_ON(!anon_vma); 986 BUG_ON(!anon_vma);
795 987
988 if (PageAnon(page))
989 return;
990
796 /* 991 /*
797 * If the page isn't exclusively mapped into this vma, 992 * If the page isn't exclusively mapped into this vma,
798 * we must use the _oldest_ possible anon_vma for the 993 * we must use the _oldest_ possible anon_vma for the
799 * page mapping! 994 * page mapping!
800 */ 995 */
801 if (!exclusive) { 996 if (!exclusive)
802 if (PageAnon(page))
803 return;
804 anon_vma = anon_vma->root; 997 anon_vma = anon_vma->root;
805 } else {
806 /*
807 * In this case, swapped-out-but-not-discarded swap-cache
808 * is remapped. So, no need to update page->mapping here.
809 * We convice anon_vma poitned by page->mapping is not obsolete
810 * because vma->anon_vma is necessary to be a family of it.
811 */
812 if (PageAnon(page))
813 return;
814 }
815 998
816 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 999 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
817 page->mapping = (struct address_space *) anon_vma; 1000 page->mapping = (struct address_space *) anon_vma;
@@ -871,13 +1054,18 @@ void do_page_add_anon_rmap(struct page *page,
871 struct vm_area_struct *vma, unsigned long address, int exclusive) 1054 struct vm_area_struct *vma, unsigned long address, int exclusive)
872{ 1055{
873 int first = atomic_inc_and_test(&page->_mapcount); 1056 int first = atomic_inc_and_test(&page->_mapcount);
874 if (first) 1057 if (first) {
875 __inc_zone_page_state(page, NR_ANON_PAGES); 1058 if (!PageTransHuge(page))
1059 __inc_zone_page_state(page, NR_ANON_PAGES);
1060 else
1061 __inc_zone_page_state(page,
1062 NR_ANON_TRANSPARENT_HUGEPAGES);
1063 }
876 if (unlikely(PageKsm(page))) 1064 if (unlikely(PageKsm(page)))
877 return; 1065 return;
878 1066
879 VM_BUG_ON(!PageLocked(page)); 1067 VM_BUG_ON(!PageLocked(page));
880 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1068 /* address might be in next vma when migration races vma_adjust */
881 if (first) 1069 if (first)
882 __page_set_anon_rmap(page, vma, address, exclusive); 1070 __page_set_anon_rmap(page, vma, address, exclusive);
883 else 1071 else
@@ -900,7 +1088,10 @@ void page_add_new_anon_rmap(struct page *page,
900 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1088 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
901 SetPageSwapBacked(page); 1089 SetPageSwapBacked(page);
902 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ 1090 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
903 __inc_zone_page_state(page, NR_ANON_PAGES); 1091 if (!PageTransHuge(page))
1092 __inc_zone_page_state(page, NR_ANON_PAGES);
1093 else
1094 __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
904 __page_set_anon_rmap(page, vma, address, 1); 1095 __page_set_anon_rmap(page, vma, address, 1);
905 if (page_evictable(page, vma)) 1096 if (page_evictable(page, vma))
906 lru_cache_add_lru(page, LRU_ACTIVE_ANON); 1097 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
@@ -918,7 +1109,7 @@ void page_add_file_rmap(struct page *page)
918{ 1109{
919 if (atomic_inc_and_test(&page->_mapcount)) { 1110 if (atomic_inc_and_test(&page->_mapcount)) {
920 __inc_zone_page_state(page, NR_FILE_MAPPED); 1111 __inc_zone_page_state(page, NR_FILE_MAPPED);
921 mem_cgroup_update_file_mapped(page, 1); 1112 mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED);
922 } 1113 }
923} 1114}
924 1115
@@ -941,10 +1132,9 @@ void page_remove_rmap(struct page *page)
941 * not if it's in swapcache - there might be another pte slot 1132 * not if it's in swapcache - there might be another pte slot
942 * containing the swap entry, but page not yet written to swap. 1133 * containing the swap entry, but page not yet written to swap.
943 */ 1134 */
944 if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { 1135 if ((!PageAnon(page) || PageSwapCache(page)) &&
945 page_clear_dirty(page); 1136 page_test_and_clear_dirty(page_to_pfn(page), 1))
946 set_page_dirty(page); 1137 set_page_dirty(page);
947 }
948 /* 1138 /*
949 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED 1139 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
950 * and not charged by memcg for now. 1140 * and not charged by memcg for now.
@@ -953,10 +1143,14 @@ void page_remove_rmap(struct page *page)
953 return; 1143 return;
954 if (PageAnon(page)) { 1144 if (PageAnon(page)) {
955 mem_cgroup_uncharge_page(page); 1145 mem_cgroup_uncharge_page(page);
956 __dec_zone_page_state(page, NR_ANON_PAGES); 1146 if (!PageTransHuge(page))
1147 __dec_zone_page_state(page, NR_ANON_PAGES);
1148 else
1149 __dec_zone_page_state(page,
1150 NR_ANON_TRANSPARENT_HUGEPAGES);
957 } else { 1151 } else {
958 __dec_zone_page_state(page, NR_FILE_MAPPED); 1152 __dec_zone_page_state(page, NR_FILE_MAPPED);
959 mem_cgroup_update_file_mapped(page, -1); 1153 mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED);
960 } 1154 }
961 /* 1155 /*
962 * It would be tidy to reset the PageAnon mapping here, 1156 * It would be tidy to reset the PageAnon mapping here,
@@ -1078,7 +1272,7 @@ out_mlock:
1078 /* 1272 /*
1079 * We need mmap_sem locking, Otherwise VM_LOCKED check makes 1273 * We need mmap_sem locking, Otherwise VM_LOCKED check makes
1080 * unstable result and race. Plus, We can't wait here because 1274 * unstable result and race. Plus, We can't wait here because
1081 * we now hold anon_vma->lock or mapping->i_mmap_lock. 1275 * we now hold anon_vma->mutex or mapping->i_mmap_mutex.
1082 * if trylock failed, the page remain in evictable lru and later 1276 * if trylock failed, the page remain in evictable lru and later
1083 * vmscan could retry to move the page to unevictable lru if the 1277 * vmscan could retry to move the page to unevictable lru if the
1084 * page is actually mlocked. 1278 * page is actually mlocked.
@@ -1209,7 +1403,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
1209 return ret; 1403 return ret;
1210} 1404}
1211 1405
1212static bool is_vma_temporary_stack(struct vm_area_struct *vma) 1406bool is_vma_temporary_stack(struct vm_area_struct *vma)
1213{ 1407{
1214 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); 1408 int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
1215 1409
@@ -1304,7 +1498,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1304 unsigned long max_nl_size = 0; 1498 unsigned long max_nl_size = 0;
1305 unsigned int mapcount; 1499 unsigned int mapcount;
1306 1500
1307 spin_lock(&mapping->i_mmap_lock); 1501 mutex_lock(&mapping->i_mmap_mutex);
1308 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1502 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1309 unsigned long address = vma_address(page, vma); 1503 unsigned long address = vma_address(page, vma);
1310 if (address == -EFAULT) 1504 if (address == -EFAULT)
@@ -1350,7 +1544,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1350 mapcount = page_mapcount(page); 1544 mapcount = page_mapcount(page);
1351 if (!mapcount) 1545 if (!mapcount)
1352 goto out; 1546 goto out;
1353 cond_resched_lock(&mapping->i_mmap_lock); 1547 cond_resched();
1354 1548
1355 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK; 1549 max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
1356 if (max_nl_cursor == 0) 1550 if (max_nl_cursor == 0)
@@ -1372,7 +1566,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1372 } 1566 }
1373 vma->vm_private_data = (void *) max_nl_cursor; 1567 vma->vm_private_data = (void *) max_nl_cursor;
1374 } 1568 }
1375 cond_resched_lock(&mapping->i_mmap_lock); 1569 cond_resched();
1376 max_nl_cursor += CLUSTER_SIZE; 1570 max_nl_cursor += CLUSTER_SIZE;
1377 } while (max_nl_cursor <= max_nl_size); 1571 } while (max_nl_cursor <= max_nl_size);
1378 1572
@@ -1384,7 +1578,7 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
1384 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) 1578 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
1385 vma->vm_private_data = NULL; 1579 vma->vm_private_data = NULL;
1386out: 1580out:
1387 spin_unlock(&mapping->i_mmap_lock); 1581 mutex_unlock(&mapping->i_mmap_mutex);
1388 return ret; 1582 return ret;
1389} 1583}
1390 1584
@@ -1407,6 +1601,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
1407 int ret; 1601 int ret;
1408 1602
1409 BUG_ON(!PageLocked(page)); 1603 BUG_ON(!PageLocked(page));
1604 VM_BUG_ON(!PageHuge(page) && PageTransHuge(page));
1410 1605
1411 if (unlikely(PageKsm(page))) 1606 if (unlikely(PageKsm(page)))
1412 ret = try_to_unmap_ksm(page, flags); 1607 ret = try_to_unmap_ksm(page, flags);
@@ -1446,41 +1641,15 @@ int try_to_munlock(struct page *page)
1446 return try_to_unmap_file(page, TTU_MUNLOCK); 1641 return try_to_unmap_file(page, TTU_MUNLOCK);
1447} 1642}
1448 1643
1449#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) 1644void __put_anon_vma(struct anon_vma *anon_vma)
1450/*
1451 * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root
1452 * if necessary. Be careful to do all the tests under the lock. Once
1453 * we know we are the last user, nobody else can get a reference and we
1454 * can do the freeing without the lock.
1455 */
1456void drop_anon_vma(struct anon_vma *anon_vma)
1457{ 1645{
1458 BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0); 1646 struct anon_vma *root = anon_vma->root;
1459 if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
1460 struct anon_vma *root = anon_vma->root;
1461 int empty = list_empty(&anon_vma->head);
1462 int last_root_user = 0;
1463 int root_empty = 0;
1464 1647
1465 /* 1648 if (root != anon_vma && atomic_dec_and_test(&root->refcount))
1466 * The refcount on a non-root anon_vma got dropped. Drop 1649 anon_vma_free(root);
1467 * the refcount on the root and check if we need to free it.
1468 */
1469 if (empty && anon_vma != root) {
1470 BUG_ON(atomic_read(&root->external_refcount) <= 0);
1471 last_root_user = atomic_dec_and_test(&root->external_refcount);
1472 root_empty = list_empty(&root->head);
1473 }
1474 anon_vma_unlock(anon_vma);
1475 1650
1476 if (empty) { 1651 anon_vma_free(anon_vma);
1477 anon_vma_free(anon_vma);
1478 if (root_empty && last_root_user)
1479 anon_vma_free(root);
1480 }
1481 }
1482} 1652}
1483#endif
1484 1653
1485#ifdef CONFIG_MIGRATION 1654#ifdef CONFIG_MIGRATION
1486/* 1655/*
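For orientation: the hunk above replaces the old drop_anon_vma() external_refcount dance with a single refcount per anon_vma, where the final put frees the anon_vma and, if it is not its own root, also drops one reference on the root. A minimal user-space sketch of that shape, using C11 atomics and made-up names rather than the kernel API:

#include <stdatomic.h>
#include <stdlib.h>

struct node {
        atomic_int refcount;
        struct node *root;              /* a root node points at itself */
};

static void node_free(struct node *n)
{
        free(n);
}

/* Mirrors the shape of __put_anon_vma(): called once n's own count hit zero. */
static void __put_node(struct node *n)
{
        struct node *root = n->root;

        if (root != n && atomic_fetch_sub(&root->refcount, 1) == 1)
                node_free(root);        /* that was the root's last reference */
        node_free(n);
}

static void put_node(struct node *n)
{
        if (atomic_fetch_sub(&n->refcount, 1) == 1)
                __put_node(n);
}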
@@ -1528,7 +1697,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
1528 1697
1529 if (!mapping) 1698 if (!mapping)
1530 return ret; 1699 return ret;
1531 spin_lock(&mapping->i_mmap_lock); 1700 mutex_lock(&mapping->i_mmap_mutex);
1532 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1701 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1533 unsigned long address = vma_address(page, vma); 1702 unsigned long address = vma_address(page, vma);
1534 if (address == -EFAULT) 1703 if (address == -EFAULT)
@@ -1542,7 +1711,7 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
1542 * never contain migration ptes. Decide what to do about this 1711 * never contain migration ptes. Decide what to do about this
1543 * limitation to linear when we need rmap_walk() on nonlinear. 1712 * limitation to linear when we need rmap_walk() on nonlinear.
1544 */ 1713 */
1545 spin_unlock(&mapping->i_mmap_lock); 1714 mutex_unlock(&mapping->i_mmap_mutex);
1546 return ret; 1715 return ret;
1547} 1716}
1548 1717
@@ -1591,7 +1760,7 @@ void hugepage_add_anon_rmap(struct page *page,
1591 1760
1592 BUG_ON(!PageLocked(page)); 1761 BUG_ON(!PageLocked(page));
1593 BUG_ON(!anon_vma); 1762 BUG_ON(!anon_vma);
1594 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 1763 /* address might be in next vma when migration races vma_adjust */
1595 first = atomic_inc_and_test(&page->_mapcount); 1764 first = atomic_inc_and_test(&page->_mapcount);
1596 if (first) 1765 if (first)
1597 __hugepage_set_anon_rmap(page, vma, address, 0); 1766 __hugepage_set_anon_rmap(page, vma, address, 0);
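A theme running through this file is the conversion of mapping->i_mmap_lock (a spinlock) to mapping->i_mmap_mutex (a sleeping lock), which is also why the walkers switch from cond_resched_lock() to a bare cond_resched(). Roughly, in user-space pthread terms (illustrative only, not the kernel API):

#include <pthread.h>
#include <sched.h>

static pthread_mutex_t i_mmap_mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Under a spinlock the walker had to drop the lock to let others run;
 * under a sleeping mutex it can simply yield while still holding it.
 */
static void walk_mappings(void (*visit)(int index), int nr)
{
        int i;

        pthread_mutex_lock(&i_mmap_mutex);
        for (i = 0; i < nr; i++) {
                visit(i);
                sched_yield();          /* analogue of cond_resched() */
        }
        pthread_mutex_unlock(&i_mmap_mutex);
}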
diff --git a/mm/shmem.c b/mm/shmem.c
index 080b09a57a8f..fcedf5464eb7 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -99,6 +99,13 @@ static struct vfsmount *shm_mnt;
99/* Pretend that each entry is of this size in directory's i_size */ 99/* Pretend that each entry is of this size in directory's i_size */
100#define BOGO_DIRENT_SIZE 20 100#define BOGO_DIRENT_SIZE 20
101 101
102struct shmem_xattr {
103 struct list_head list; /* anchored by shmem_inode_info->xattr_list */
104 char *name; /* xattr name */
105 size_t size;
106 char value[0];
107};
108
102/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */ 109/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
103enum sgp_type { 110enum sgp_type {
104 SGP_READ, /* don't exceed i_size, don't allocate page */ 111 SGP_READ, /* don't exceed i_size, don't allocate page */
@@ -224,7 +231,6 @@ static const struct vm_operations_struct shmem_vm_ops;
224static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 231static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
225 .ra_pages = 0, /* No readahead */ 232 .ra_pages = 0, /* No readahead */
226 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 233 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
227 .unplug_io_fn = default_unplug_io_fn,
228}; 234};
229 235
230static LIST_HEAD(shmem_swaplist); 236static LIST_HEAD(shmem_swaplist);
@@ -422,7 +428,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
422 * a waste to allocate index if we cannot allocate data. 428 * a waste to allocate index if we cannot allocate data.
423 */ 429 */
424 if (sbinfo->max_blocks) { 430 if (sbinfo->max_blocks) {
425 if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0) 431 if (percpu_counter_compare(&sbinfo->used_blocks,
432 sbinfo->max_blocks - 1) >= 0)
426 return ERR_PTR(-ENOSPC); 433 return ERR_PTR(-ENOSPC);
427 percpu_counter_inc(&sbinfo->used_blocks); 434 percpu_counter_inc(&sbinfo->used_blocks);
428 spin_lock(&inode->i_lock); 435 spin_lock(&inode->i_lock);
@@ -532,7 +539,7 @@ static void shmem_free_pages(struct list_head *next)
532 } while (next); 539 } while (next);
533} 540}
534 541
535static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end) 542void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
536{ 543{
537 struct shmem_inode_info *info = SHMEM_I(inode); 544 struct shmem_inode_info *info = SHMEM_I(inode);
538 unsigned long idx; 545 unsigned long idx;
@@ -555,6 +562,8 @@ static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
555 spinlock_t *punch_lock; 562 spinlock_t *punch_lock;
556 unsigned long upper_limit; 563 unsigned long upper_limit;
557 564
565 truncate_inode_pages_range(inode->i_mapping, start, end);
566
558 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 567 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
559 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 568 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
560 if (idx >= info->next_index) 569 if (idx >= info->next_index)
@@ -731,16 +740,8 @@ done2:
731 * lowered next_index. Also, though shmem_getpage checks 740 * lowered next_index. Also, though shmem_getpage checks
732 * i_size before adding to cache, no recheck after: so fix the 741 * i_size before adding to cache, no recheck after: so fix the
733 * narrow window there too. 742 * narrow window there too.
734 *
735 * Recalling truncate_inode_pages_range and unmap_mapping_range
736 * every time for punch_hole (which never got a chance to clear
737 * SHMEM_PAGEIN at the start of vmtruncate_range) is expensive,
738 * yet hardly ever necessary: try to optimize them out later.
739 */ 743 */
740 truncate_inode_pages_range(inode->i_mapping, start, end); 744 truncate_inode_pages_range(inode->i_mapping, start, end);
741 if (punch_hole)
742 unmap_mapping_range(inode->i_mapping, start,
743 end - start, 1);
744 } 745 }
745 746
746 spin_lock(&info->lock); 747 spin_lock(&info->lock);
@@ -759,27 +760,28 @@ done2:
759 shmem_free_pages(pages_to_free.next); 760 shmem_free_pages(pages_to_free.next);
760 } 761 }
761} 762}
763EXPORT_SYMBOL_GPL(shmem_truncate_range);
762 764
763static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) 765static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
764{ 766{
765 struct inode *inode = dentry->d_inode; 767 struct inode *inode = dentry->d_inode;
766 loff_t newsize = attr->ia_size;
767 int error; 768 int error;
768 769
769 error = inode_change_ok(inode, attr); 770 error = inode_change_ok(inode, attr);
770 if (error) 771 if (error)
771 return error; 772 return error;
772 773
773 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE) 774 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
774 && newsize != inode->i_size) { 775 loff_t oldsize = inode->i_size;
776 loff_t newsize = attr->ia_size;
775 struct page *page = NULL; 777 struct page *page = NULL;
776 778
777 if (newsize < inode->i_size) { 779 if (newsize < oldsize) {
778 /* 780 /*
779 * If truncating down to a partial page, then 781 * If truncating down to a partial page, then
780 * if that page is already allocated, hold it 782 * if that page is already allocated, hold it
781 * in memory until the truncation is over, so 783 * in memory until the truncation is over, so
782 * truncate_partial_page cannnot miss it were 784 * truncate_partial_page cannot miss it were
783 * it assigned to swap. 785 * it assigned to swap.
784 */ 786 */
785 if (newsize & (PAGE_CACHE_SIZE-1)) { 787 if (newsize & (PAGE_CACHE_SIZE-1)) {
@@ -803,12 +805,19 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
803 spin_unlock(&info->lock); 805 spin_unlock(&info->lock);
804 } 806 }
805 } 807 }
806 808 if (newsize != oldsize) {
807 /* XXX(truncate): truncate_setsize should be called last */ 809 i_size_write(inode, newsize);
808 truncate_setsize(inode, newsize); 810 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
811 }
812 if (newsize < oldsize) {
813 loff_t holebegin = round_up(newsize, PAGE_SIZE);
814 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
815 shmem_truncate_range(inode, newsize, (loff_t)-1);
816 /* unmap again to remove racily COWed private pages */
817 unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
818 }
809 if (page) 819 if (page)
810 page_cache_release(page); 820 page_cache_release(page);
811 shmem_truncate_range(inode, newsize, (loff_t)-1);
812 } 821 }
813 822
814 setattr_copy(inode, attr); 823 setattr_copy(inode, attr);
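The rewritten shmem_setattr() above orders a size reduction as: publish the new i_size, unmap the truncated tail, free the backing pages, then unmap once more to catch private pages COWed racily in between. A stub sketch of that ordering (the helpers are placeholders, not kernel calls):

#include <stdio.h>

static long i_size;

static void unmap_tail(long from)
{
        printf("unmap mappings beyond %ld\n", from);
}

static void free_pages(long from)
{
        printf("free pages beyond %ld\n", from);
}

static void truncate_down(long newsize)
{
        i_size = newsize;       /* publish the new size first */
        unmap_tail(newsize);    /* drop existing mappings of the tail */
        free_pages(newsize);    /* then release the backing pages */
        unmap_tail(newsize);    /* again: catch racily COWed private pages */
}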
@@ -822,9 +831,9 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
822static void shmem_evict_inode(struct inode *inode) 831static void shmem_evict_inode(struct inode *inode)
823{ 832{
824 struct shmem_inode_info *info = SHMEM_I(inode); 833 struct shmem_inode_info *info = SHMEM_I(inode);
834 struct shmem_xattr *xattr, *nxattr;
825 835
826 if (inode->i_mapping->a_ops == &shmem_aops) { 836 if (inode->i_mapping->a_ops == &shmem_aops) {
827 truncate_inode_pages(inode->i_mapping, 0);
828 shmem_unacct_size(info->flags, inode->i_size); 837 shmem_unacct_size(info->flags, inode->i_size);
829 inode->i_size = 0; 838 inode->i_size = 0;
830 shmem_truncate_range(inode, 0, (loff_t)-1); 839 shmem_truncate_range(inode, 0, (loff_t)-1);
@@ -834,6 +843,11 @@ static void shmem_evict_inode(struct inode *inode)
834 mutex_unlock(&shmem_swaplist_mutex); 843 mutex_unlock(&shmem_swaplist_mutex);
835 } 844 }
836 } 845 }
846
847 list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
848 kfree(xattr->name);
849 kfree(xattr);
850 }
837 BUG_ON(inode->i_blocks); 851 BUG_ON(inode->i_blocks);
838 shmem_free_inode(inode->i_sb); 852 shmem_free_inode(inode->i_sb);
839 end_writeback(inode); 853 end_writeback(inode);
@@ -852,7 +866,7 @@ static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_
852 866
853static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) 867static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
854{ 868{
855 struct inode *inode; 869 struct address_space *mapping;
856 unsigned long idx; 870 unsigned long idx;
857 unsigned long size; 871 unsigned long size;
858 unsigned long limit; 872 unsigned long limit;
@@ -875,8 +889,10 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
875 if (size > SHMEM_NR_DIRECT) 889 if (size > SHMEM_NR_DIRECT)
876 size = SHMEM_NR_DIRECT; 890 size = SHMEM_NR_DIRECT;
877 offset = shmem_find_swp(entry, ptr, ptr+size); 891 offset = shmem_find_swp(entry, ptr, ptr+size);
878 if (offset >= 0) 892 if (offset >= 0) {
893 shmem_swp_balance_unmap();
879 goto found; 894 goto found;
895 }
880 if (!info->i_indirect) 896 if (!info->i_indirect)
881 goto lost2; 897 goto lost2;
882 898
@@ -917,6 +933,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
917 shmem_swp_unmap(ptr); 933 shmem_swp_unmap(ptr);
918 if (offset >= 0) { 934 if (offset >= 0) {
919 shmem_dir_unmap(dir); 935 shmem_dir_unmap(dir);
936 ptr = shmem_swp_map(subdir);
920 goto found; 937 goto found;
921 } 938 }
922 } 939 }
@@ -928,8 +945,7 @@ lost2:
928 return 0; 945 return 0;
929found: 946found:
930 idx += offset; 947 idx += offset;
931 inode = igrab(&info->vfs_inode); 948 ptr += offset;
932 spin_unlock(&info->lock);
933 949
934 /* 950 /*
935 * Move _head_ to start search for next from here. 951 * Move _head_ to start search for next from here.
@@ -940,37 +956,18 @@ found:
940 */ 956 */
941 if (shmem_swaplist.next != &info->swaplist) 957 if (shmem_swaplist.next != &info->swaplist)
942 list_move_tail(&shmem_swaplist, &info->swaplist); 958 list_move_tail(&shmem_swaplist, &info->swaplist);
943 mutex_unlock(&shmem_swaplist_mutex);
944 959
945 error = 1;
946 if (!inode)
947 goto out;
948 /* 960 /*
949 * Charge page using GFP_KERNEL while we can wait. 961 * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
950 * Charged back to the user(not to caller) when swap account is used. 962 * but also to hold up shmem_evict_inode(): so inode cannot be freed
951 * add_to_page_cache() will be called with GFP_NOWAIT. 963 * beneath us (pagelock doesn't help until the page is in pagecache).
952 */ 964 */
953 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 965 mapping = info->vfs_inode.i_mapping;
954 if (error) 966 error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
955 goto out; 967 /* which does mem_cgroup_uncharge_cache_page on error */
956 error = radix_tree_preload(GFP_KERNEL);
957 if (error) {
958 mem_cgroup_uncharge_cache_page(page);
959 goto out;
960 }
961 error = 1;
962
963 spin_lock(&info->lock);
964 ptr = shmem_swp_entry(info, idx, NULL);
965 if (ptr && ptr->val == entry.val) {
966 error = add_to_page_cache_locked(page, inode->i_mapping,
967 idx, GFP_NOWAIT);
968 /* does mem_cgroup_uncharge_cache_page on error */
969 } else /* we must compensate for our precharge above */
970 mem_cgroup_uncharge_cache_page(page);
971 968
972 if (error == -EEXIST) { 969 if (error == -EEXIST) {
973 struct page *filepage = find_get_page(inode->i_mapping, idx); 970 struct page *filepage = find_get_page(mapping, idx);
974 error = 1; 971 error = 1;
975 if (filepage) { 972 if (filepage) {
976 /* 973 /*
@@ -990,14 +987,8 @@ found:
990 swap_free(entry); 987 swap_free(entry);
991 error = 1; /* not an error, but entry was found */ 988 error = 1; /* not an error, but entry was found */
992 } 989 }
993 if (ptr) 990 shmem_swp_unmap(ptr);
994 shmem_swp_unmap(ptr);
995 spin_unlock(&info->lock); 991 spin_unlock(&info->lock);
996 radix_tree_preload_end();
997out:
998 unlock_page(page);
999 page_cache_release(page);
1000 iput(inode); /* allows for NULL */
1001 return error; 992 return error;
1002} 993}
1003 994
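The comment added in the hunk above states the new lifetime rule: shmem_unuse_inode() no longer pins the inode with igrab()/iput(), it relies on shmem_swaplist_mutex, which shmem_evict_inode() must also take, to keep the inode alive for the duration of the walk. The same pattern in a small user-space model with made-up names:

#include <pthread.h>
#include <stdlib.h>

struct obj {
        struct obj *next;
        int swapped;
};

static struct obj *list_head;
static pthread_mutex_t list_mutex = PTHREAD_MUTEX_INITIALIZER;

/* The "evict" side: must take the same mutex before freeing, so a walker
 * holding the mutex can never see a freed object. */
static void destroy(struct obj *victim)
{
        struct obj **p;

        pthread_mutex_lock(&list_mutex);
        for (p = &list_head; *p; p = &(*p)->next) {
                if (*p == victim) {
                        *p = victim->next;
                        break;
                }
        }
        pthread_mutex_unlock(&list_mutex);
        free(victim);
}

/* The "shmem_unuse" side: no reference taken, the mutex alone keeps every
 * object on the list alive while it is being visited. */
static int walk(int (*fn)(struct obj *))
{
        struct obj *o;
        int found = 0;

        pthread_mutex_lock(&list_mutex);
        for (o = list_head; o && !found; o = o->next)
                found = fn(o);
        pthread_mutex_unlock(&list_mutex);
        return found;
}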
@@ -1009,6 +1000,26 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
1009 struct list_head *p, *next; 1000 struct list_head *p, *next;
1010 struct shmem_inode_info *info; 1001 struct shmem_inode_info *info;
1011 int found = 0; 1002 int found = 0;
1003 int error;
1004
1005 /*
1006 * Charge page using GFP_KERNEL while we can wait, before taking
1007 * the shmem_swaplist_mutex which might hold up shmem_writepage().
1008 * Charged back to the user (not to caller) when swap account is used.
1009 * add_to_page_cache() will be called with GFP_NOWAIT.
1010 */
1011 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
1012 if (error)
1013 goto out;
1014 /*
1015 * Try to preload while we can wait, to not make a habit of
1016 * draining atomic reserves; but don't latch on to this cpu,
1017 * it's okay if sometimes we get rescheduled after this.
1018 */
1019 error = radix_tree_preload(GFP_KERNEL);
1020 if (error)
1021 goto uncharge;
1022 radix_tree_preload_end();
1012 1023
1013 mutex_lock(&shmem_swaplist_mutex); 1024 mutex_lock(&shmem_swaplist_mutex);
1014 list_for_each_safe(p, next, &shmem_swaplist) { 1025 list_for_each_safe(p, next, &shmem_swaplist) {
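The block added to shmem_unuse() above performs the work that may sleep (the memcg charge and the radix-tree preload, both GFP_KERNEL) before shmem_swaplist_mutex is taken, leaving only the non-blocking GFP_NOWAIT insertion for the locked region. The general shape, modeled in user space with placeholder helpers:

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

static pthread_mutex_t swaplist_mutex = PTHREAD_MUTEX_INITIALIZER;

/* May block and allocate: done up front, like the GFP_KERNEL charge/preload. */
static void *reserve(size_t size)
{
        return malloc(size);
}

/* Must not block: done under the lock, like add_to_page_cache(GFP_NOWAIT). */
static bool insert_nowait(void **table, int idx, void *slot)
{
        if (table[idx])
                return false;           /* already occupied */
        table[idx] = slot;
        return true;
}

static int add_entry(void **table, int idx)
{
        void *slot = reserve(64);       /* all blocking work before the lock */
        bool ok;

        if (!slot)
                return -1;

        pthread_mutex_lock(&swaplist_mutex);
        ok = insert_nowait(table, idx, slot);
        pthread_mutex_unlock(&swaplist_mutex);

        if (!ok)
                free(slot);             /* undo the reservation, like the uncharge */
        return ok ? 0 : -1;
}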
@@ -1016,17 +1027,19 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
1016 found = shmem_unuse_inode(info, entry, page); 1027 found = shmem_unuse_inode(info, entry, page);
1017 cond_resched(); 1028 cond_resched();
1018 if (found) 1029 if (found)
1019 goto out; 1030 break;
1020 } 1031 }
1021 mutex_unlock(&shmem_swaplist_mutex); 1032 mutex_unlock(&shmem_swaplist_mutex);
1022 /* 1033
1023 * Can some race bring us here? We've been holding page lock, 1034uncharge:
1024 * so I think not; but would rather try again later than BUG() 1035 if (!found)
1025 */ 1036 mem_cgroup_uncharge_cache_page(page);
1037 if (found < 0)
1038 error = found;
1039out:
1026 unlock_page(page); 1040 unlock_page(page);
1027 page_cache_release(page); 1041 page_cache_release(page);
1028out: 1042 return error;
1029 return (found < 0) ? found : 0;
1030} 1043}
1031 1044
1032/* 1045/*
@@ -1064,7 +1077,25 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1064 else 1077 else
1065 swap.val = 0; 1078 swap.val = 0;
1066 1079
1080 /*
1081 * Add inode to shmem_unuse()'s list of swapped-out inodes,
1082 * if it's not already there. Do it now because we cannot take
1083 * mutex while holding spinlock, and must do so before the page
1084 * is moved to swap cache, when its pagelock no longer protects
1085 * the inode from eviction. But don't unlock the mutex until
1086 * we've taken the spinlock, because shmem_unuse_inode() will
1087 * prune a !swapped inode from the swaplist under both locks.
1088 */
1089 if (swap.val) {
1090 mutex_lock(&shmem_swaplist_mutex);
1091 if (list_empty(&info->swaplist))
1092 list_add_tail(&info->swaplist, &shmem_swaplist);
1093 }
1094
1067 spin_lock(&info->lock); 1095 spin_lock(&info->lock);
1096 if (swap.val)
1097 mutex_unlock(&shmem_swaplist_mutex);
1098
1068 if (index >= info->next_index) { 1099 if (index >= info->next_index) {
1069 BUG_ON(!(info->flags & SHMEM_TRUNCATE)); 1100 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
1070 goto unlock; 1101 goto unlock;
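The hunk above takes shmem_swaplist_mutex before info->lock and drops it only once the spinlock is held: a mutex cannot be acquired under a spinlock, and releasing it any earlier would let shmem_unuse_inode() prune a !swapped inode in the gap. The ordering, modeled with a pthread mutex and spinlock (illustrative only; pthread_spin_init() is assumed to have run once at start-up):

#include <pthread.h>

static pthread_mutex_t swaplist_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_spinlock_t info_lock;

static void writepage_path(int adding_to_swaplist)
{
        if (adding_to_swaplist)
                pthread_mutex_lock(&swaplist_mutex);    /* may sleep: take it first */

        pthread_spin_lock(&info_lock);                  /* no sleeping from here on */
        if (adding_to_swaplist)
                pthread_mutex_unlock(&swaplist_mutex);  /* unlocking never sleeps */

        /* ... work that must see a stable swaplist membership ... */

        pthread_spin_unlock(&info_lock);
}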
@@ -1081,25 +1112,13 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1081 shmem_recalc_inode(inode); 1112 shmem_recalc_inode(inode);
1082 1113
1083 if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { 1114 if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
1084 remove_from_page_cache(page); 1115 delete_from_page_cache(page);
1085 shmem_swp_set(info, entry, swap.val); 1116 shmem_swp_set(info, entry, swap.val);
1086 shmem_swp_unmap(entry); 1117 shmem_swp_unmap(entry);
1087 if (list_empty(&info->swaplist))
1088 inode = igrab(inode);
1089 else
1090 inode = NULL;
1091 spin_unlock(&info->lock);
1092 swap_shmem_alloc(swap); 1118 swap_shmem_alloc(swap);
1119 spin_unlock(&info->lock);
1093 BUG_ON(page_mapped(page)); 1120 BUG_ON(page_mapped(page));
1094 page_cache_release(page); /* pagecache ref */
1095 swap_writepage(page, wbc); 1121 swap_writepage(page, wbc);
1096 if (inode) {
1097 mutex_lock(&shmem_swaplist_mutex);
1098 /* move instead of add in case we're racing */
1099 list_move_tail(&info->swaplist, &shmem_swaplist);
1100 mutex_unlock(&shmem_swaplist_mutex);
1101 iput(inode);
1102 }
1103 return 0; 1122 return 0;
1104 } 1123 }
1105 1124
@@ -1287,12 +1306,10 @@ repeat:
1287 swappage = lookup_swap_cache(swap); 1306 swappage = lookup_swap_cache(swap);
1288 if (!swappage) { 1307 if (!swappage) {
1289 shmem_swp_unmap(entry); 1308 shmem_swp_unmap(entry);
1309 spin_unlock(&info->lock);
1290 /* here we actually do the io */ 1310 /* here we actually do the io */
1291 if (type && !(*type & VM_FAULT_MAJOR)) { 1311 if (type)
1292 __count_vm_event(PGMAJFAULT);
1293 *type |= VM_FAULT_MAJOR; 1312 *type |= VM_FAULT_MAJOR;
1294 }
1295 spin_unlock(&info->lock);
1296 swappage = shmem_swapin(swap, gfp, info, idx); 1313 swappage = shmem_swapin(swap, gfp, info, idx);
1297 if (!swappage) { 1314 if (!swappage) {
1298 spin_lock(&info->lock); 1315 spin_lock(&info->lock);
@@ -1399,21 +1416,16 @@ repeat:
1399 shmem_swp_unmap(entry); 1416 shmem_swp_unmap(entry);
1400 sbinfo = SHMEM_SB(inode->i_sb); 1417 sbinfo = SHMEM_SB(inode->i_sb);
1401 if (sbinfo->max_blocks) { 1418 if (sbinfo->max_blocks) {
1402 if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) || 1419 if (percpu_counter_compare(&sbinfo->used_blocks,
1403 shmem_acct_block(info->flags)) { 1420 sbinfo->max_blocks) >= 0 ||
1404 spin_unlock(&info->lock); 1421 shmem_acct_block(info->flags))
1405 error = -ENOSPC; 1422 goto nospace;
1406 goto failed;
1407 }
1408 percpu_counter_inc(&sbinfo->used_blocks); 1423 percpu_counter_inc(&sbinfo->used_blocks);
1409 spin_lock(&inode->i_lock); 1424 spin_lock(&inode->i_lock);
1410 inode->i_blocks += BLOCKS_PER_PAGE; 1425 inode->i_blocks += BLOCKS_PER_PAGE;
1411 spin_unlock(&inode->i_lock); 1426 spin_unlock(&inode->i_lock);
1412 } else if (shmem_acct_block(info->flags)) { 1427 } else if (shmem_acct_block(info->flags))
1413 spin_unlock(&info->lock); 1428 goto nospace;
1414 error = -ENOSPC;
1415 goto failed;
1416 }
1417 1429
1418 if (!filepage) { 1430 if (!filepage) {
1419 int ret; 1431 int ret;
@@ -1493,6 +1505,24 @@ done:
1493 error = 0; 1505 error = 0;
1494 goto out; 1506 goto out;
1495 1507
1508nospace:
1509 /*
1510 * Perhaps the page was brought in from swap between find_lock_page
1511 * and taking info->lock? We allow for that at add_to_page_cache_lru,
1512 * but must also avoid reporting a spurious ENOSPC while working on a
1513 * full tmpfs. (When filepage has been passed in to shmem_getpage, it
1514 * is already in page cache, which prevents this race from occurring.)
1515 */
1516 if (!filepage) {
1517 struct page *page = find_get_page(mapping, idx);
1518 if (page) {
1519 spin_unlock(&info->lock);
1520 page_cache_release(page);
1521 goto repeat;
1522 }
1523 }
1524 spin_unlock(&info->lock);
1525 error = -ENOSPC;
1496failed: 1526failed:
1497 if (*pagep != filepage) { 1527 if (*pagep != filepage) {
1498 unlock_page(filepage); 1528 unlock_page(filepage);
@@ -1518,7 +1548,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1518 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); 1548 error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
1519 if (error) 1549 if (error)
1520 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1550 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1521 1551 if (ret & VM_FAULT_MAJOR) {
1552 count_vm_event(PGMAJFAULT);
1553 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
1554 }
1522 return ret | VM_FAULT_LOCKED; 1555 return ret | VM_FAULT_LOCKED;
1523} 1556}
1524 1557
@@ -1586,6 +1619,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1586 1619
1587 inode = new_inode(sb); 1620 inode = new_inode(sb);
1588 if (inode) { 1621 if (inode) {
1622 inode->i_ino = get_next_ino();
1589 inode_init_owner(inode, dir, mode); 1623 inode_init_owner(inode, dir, mode);
1590 inode->i_blocks = 0; 1624 inode->i_blocks = 0;
1591 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1625 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
@@ -1596,6 +1630,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
1596 spin_lock_init(&info->lock); 1630 spin_lock_init(&info->lock);
1597 info->flags = flags & VM_NORESERVE; 1631 info->flags = flags & VM_NORESERVE;
1598 INIT_LIST_HEAD(&info->swaplist); 1632 INIT_LIST_HEAD(&info->swaplist);
1633 INIT_LIST_HEAD(&info->xattr_list);
1599 cache_no_acl(inode); 1634 cache_no_acl(inode);
1600 1635
1601 switch (mode & S_IFMT) { 1636 switch (mode & S_IFMT) {
@@ -1842,8 +1877,9 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1842 1877
1843 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); 1878 inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
1844 if (inode) { 1879 if (inode) {
1845 error = security_inode_init_security(inode, dir, NULL, NULL, 1880 error = security_inode_init_security(inode, dir,
1846 NULL); 1881 &dentry->d_name, NULL,
1882 NULL, NULL);
1847 if (error) { 1883 if (error) {
1848 if (error != -EOPNOTSUPP) { 1884 if (error != -EOPNOTSUPP) {
1849 iput(inode); 1885 iput(inode);
@@ -1903,7 +1939,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
1903 dir->i_size += BOGO_DIRENT_SIZE; 1939 dir->i_size += BOGO_DIRENT_SIZE;
1904 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; 1940 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1905 inc_nlink(inode); 1941 inc_nlink(inode);
1906 atomic_inc(&inode->i_count); /* New dentry reference */ 1942 ihold(inode); /* New dentry reference */
1907 dget(dentry); /* Extra pinning count for the created dentry */ 1943 dget(dentry); /* Extra pinning count for the created dentry */
1908 d_instantiate(dentry, inode); 1944 d_instantiate(dentry, inode);
1909out: 1945out:
@@ -1982,8 +2018,8 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1982 if (!inode) 2018 if (!inode)
1983 return -ENOSPC; 2019 return -ENOSPC;
1984 2020
1985 error = security_inode_init_security(inode, dir, NULL, NULL, 2021 error = security_inode_init_security(inode, dir, &dentry->d_name, NULL,
1986 NULL); 2022 NULL, NULL);
1987 if (error) { 2023 if (error) {
1988 if (error != -EOPNOTSUPP) { 2024 if (error != -EOPNOTSUPP) {
1989 iput(inode); 2025 iput(inode);
@@ -1994,9 +2030,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1994 2030
1995 info = SHMEM_I(inode); 2031 info = SHMEM_I(inode);
1996 inode->i_size = len-1; 2032 inode->i_size = len-1;
1997 if (len <= (char *)inode - (char *)info) { 2033 if (len <= SHMEM_SYMLINK_INLINE_LEN) {
1998 /* do it inline */ 2034 /* do it inline */
1999 memcpy(info, symname, len); 2035 memcpy(info->inline_symlink, symname, len);
2000 inode->i_op = &shmem_symlink_inline_operations; 2036 inode->i_op = &shmem_symlink_inline_operations;
2001 } else { 2037 } else {
2002 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); 2038 error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
@@ -2022,7 +2058,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
2022 2058
2023static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd) 2059static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
2024{ 2060{
2025 nd_set_link(nd, (char *)SHMEM_I(dentry->d_inode)); 2061 nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink);
2026 return NULL; 2062 return NULL;
2027} 2063}
2028 2064
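With the change above, a short symlink target is copied into a dedicated info->inline_symlink buffer of SHMEM_SYMLINK_INLINE_LEN bytes rather than being laid over the tail of the inode, while longer targets still go to a page. The decision, sketched with stand-in types and a made-up inline size:

#include <stdlib.h>
#include <string.h>

#define SYMLINK_INLINE_LEN 64           /* stand-in for SHMEM_SYMLINK_INLINE_LEN */

struct link_info {
        char inline_symlink[SYMLINK_INLINE_LEN];
        char *out_of_line;              /* used when the target does not fit */
};

static int store_symlink(struct link_info *info, const char *target)
{
        size_t len = strlen(target) + 1;        /* include the trailing NUL */

        if (len <= SYMLINK_INLINE_LEN) {
                memcpy(info->inline_symlink, target, len);
                info->out_of_line = NULL;
                return 0;
        }
        info->out_of_line = malloc(len);
        if (!info->out_of_line)
                return -1;
        memcpy(info->out_of_line, target, len);
        return 0;
}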
@@ -2046,63 +2082,253 @@ static void shmem_put_link(struct dentry *dentry, struct nameidata *nd, void *co
2046 } 2082 }
2047} 2083}
2048 2084
2049static const struct inode_operations shmem_symlink_inline_operations = { 2085#ifdef CONFIG_TMPFS_XATTR
2050 .readlink = generic_readlink,
2051 .follow_link = shmem_follow_link_inline,
2052};
2053
2054static const struct inode_operations shmem_symlink_inode_operations = {
2055 .readlink = generic_readlink,
2056 .follow_link = shmem_follow_link,
2057 .put_link = shmem_put_link,
2058};
2059
2060#ifdef CONFIG_TMPFS_POSIX_ACL
2061/* 2086/*
2062 * Superblocks without xattr inode operations will get security.* xattr 2087 * Superblocks without xattr inode operations may get some security.* xattr
2063 * support from the VFS "for free". As soon as we have any other xattrs 2088 * support from the LSM "for free". As soon as we have any other xattrs
2064 * like ACLs, we also need to implement the security.* handlers at 2089 * like ACLs, we also need to implement the security.* handlers at
2065 * filesystem level, though. 2090 * filesystem level, though.
2066 */ 2091 */
2067 2092
2068static size_t shmem_xattr_security_list(struct dentry *dentry, char *list, 2093static int shmem_xattr_get(struct dentry *dentry, const char *name,
2069 size_t list_len, const char *name, 2094 void *buffer, size_t size)
2070 size_t name_len, int handler_flags)
2071{ 2095{
2072 return security_inode_listsecurity(dentry->d_inode, list, list_len); 2096 struct shmem_inode_info *info;
2073} 2097 struct shmem_xattr *xattr;
2098 int ret = -ENODATA;
2074 2099
2075static int shmem_xattr_security_get(struct dentry *dentry, const char *name, 2100 info = SHMEM_I(dentry->d_inode);
2076 void *buffer, size_t size, int handler_flags) 2101
2077{ 2102 spin_lock(&info->lock);
2078 if (strcmp(name, "") == 0) 2103 list_for_each_entry(xattr, &info->xattr_list, list) {
2079 return -EINVAL; 2104 if (strcmp(name, xattr->name))
2080 return xattr_getsecurity(dentry->d_inode, name, buffer, size); 2105 continue;
2106
2107 ret = xattr->size;
2108 if (buffer) {
2109 if (size < xattr->size)
2110 ret = -ERANGE;
2111 else
2112 memcpy(buffer, xattr->value, xattr->size);
2113 }
2114 break;
2115 }
2116 spin_unlock(&info->lock);
2117 return ret;
2081} 2118}
2082 2119
2083static int shmem_xattr_security_set(struct dentry *dentry, const char *name, 2120static int shmem_xattr_set(struct dentry *dentry, const char *name,
2084 const void *value, size_t size, int flags, int handler_flags) 2121 const void *value, size_t size, int flags)
2085{ 2122{
2086 if (strcmp(name, "") == 0) 2123 struct inode *inode = dentry->d_inode;
2087 return -EINVAL; 2124 struct shmem_inode_info *info = SHMEM_I(inode);
2088 return security_inode_setsecurity(dentry->d_inode, name, value, 2125 struct shmem_xattr *xattr;
2089 size, flags); 2126 struct shmem_xattr *new_xattr = NULL;
2127 size_t len;
2128 int err = 0;
2129
2130 /* value == NULL means remove */
2131 if (value) {
2132 /* wrap around? */
2133 len = sizeof(*new_xattr) + size;
2134 if (len <= sizeof(*new_xattr))
2135 return -ENOMEM;
2136
2137 new_xattr = kmalloc(len, GFP_KERNEL);
2138 if (!new_xattr)
2139 return -ENOMEM;
2140
2141 new_xattr->name = kstrdup(name, GFP_KERNEL);
2142 if (!new_xattr->name) {
2143 kfree(new_xattr);
2144 return -ENOMEM;
2145 }
2146
2147 new_xattr->size = size;
2148 memcpy(new_xattr->value, value, size);
2149 }
2150
2151 spin_lock(&info->lock);
2152 list_for_each_entry(xattr, &info->xattr_list, list) {
2153 if (!strcmp(name, xattr->name)) {
2154 if (flags & XATTR_CREATE) {
2155 xattr = new_xattr;
2156 err = -EEXIST;
2157 } else if (new_xattr) {
2158 list_replace(&xattr->list, &new_xattr->list);
2159 } else {
2160 list_del(&xattr->list);
2161 }
2162 goto out;
2163 }
2164 }
2165 if (flags & XATTR_REPLACE) {
2166 xattr = new_xattr;
2167 err = -ENODATA;
2168 } else {
2169 list_add(&new_xattr->list, &info->xattr_list);
2170 xattr = NULL;
2171 }
2172out:
2173 spin_unlock(&info->lock);
2174 if (xattr)
2175 kfree(xattr->name);
2176 kfree(xattr);
2177 return err;
2090} 2178}
2091 2179
2092static const struct xattr_handler shmem_xattr_security_handler = {
2093 .prefix = XATTR_SECURITY_PREFIX,
2094 .list = shmem_xattr_security_list,
2095 .get = shmem_xattr_security_get,
2096 .set = shmem_xattr_security_set,
2097};
2098 2180
2099static const struct xattr_handler *shmem_xattr_handlers[] = { 2181static const struct xattr_handler *shmem_xattr_handlers[] = {
2182#ifdef CONFIG_TMPFS_POSIX_ACL
2100 &generic_acl_access_handler, 2183 &generic_acl_access_handler,
2101 &generic_acl_default_handler, 2184 &generic_acl_default_handler,
2102 &shmem_xattr_security_handler, 2185#endif
2103 NULL 2186 NULL
2104}; 2187};
2188
2189static int shmem_xattr_validate(const char *name)
2190{
2191 struct { const char *prefix; size_t len; } arr[] = {
2192 { XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN },
2193 { XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN }
2194 };
2195 int i;
2196
2197 for (i = 0; i < ARRAY_SIZE(arr); i++) {
2198 size_t preflen = arr[i].len;
2199 if (strncmp(name, arr[i].prefix, preflen) == 0) {
2200 if (!name[preflen])
2201 return -EINVAL;
2202 return 0;
2203 }
2204 }
2205 return -EOPNOTSUPP;
2206}
2207
2208static ssize_t shmem_getxattr(struct dentry *dentry, const char *name,
2209 void *buffer, size_t size)
2210{
2211 int err;
2212
2213 /*
2214 * If this is a request for a synthetic attribute in the system.*
2215 * namespace use the generic infrastructure to resolve a handler
2216 * for it via sb->s_xattr.
2217 */
2218 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2219 return generic_getxattr(dentry, name, buffer, size);
2220
2221 err = shmem_xattr_validate(name);
2222 if (err)
2223 return err;
2224
2225 return shmem_xattr_get(dentry, name, buffer, size);
2226}
2227
2228static int shmem_setxattr(struct dentry *dentry, const char *name,
2229 const void *value, size_t size, int flags)
2230{
2231 int err;
2232
2233 /*
2234 * If this is a request for a synthetic attribute in the system.*
2235 * namespace use the generic infrastructure to resolve a handler
2236 * for it via sb->s_xattr.
2237 */
2238 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2239 return generic_setxattr(dentry, name, value, size, flags);
2240
2241 err = shmem_xattr_validate(name);
2242 if (err)
2243 return err;
2244
2245 if (size == 0)
2246 value = ""; /* empty EA, do not remove */
2247
2248 return shmem_xattr_set(dentry, name, value, size, flags);
2249
2250}
2251
2252static int shmem_removexattr(struct dentry *dentry, const char *name)
2253{
2254 int err;
2255
2256 /*
2257 * If this is a request for a synthetic attribute in the system.*
2258 * namespace use the generic infrastructure to resolve a handler
2259 * for it via sb->s_xattr.
2260 */
2261 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
2262 return generic_removexattr(dentry, name);
2263
2264 err = shmem_xattr_validate(name);
2265 if (err)
2266 return err;
2267
2268 return shmem_xattr_set(dentry, name, NULL, 0, XATTR_REPLACE);
2269}
2270
2271static bool xattr_is_trusted(const char *name)
2272{
2273 return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
2274}
2275
2276static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
2277{
2278 bool trusted = capable(CAP_SYS_ADMIN);
2279 struct shmem_xattr *xattr;
2280 struct shmem_inode_info *info;
2281 size_t used = 0;
2282
2283 info = SHMEM_I(dentry->d_inode);
2284
2285 spin_lock(&info->lock);
2286 list_for_each_entry(xattr, &info->xattr_list, list) {
2287 size_t len;
2288
2289 /* skip "trusted." attributes for unprivileged callers */
2290 if (!trusted && xattr_is_trusted(xattr->name))
2291 continue;
2292
2293 len = strlen(xattr->name) + 1;
2294 used += len;
2295 if (buffer) {
2296 if (size < used) {
2297 used = -ERANGE;
2298 break;
2299 }
2300 memcpy(buffer, xattr->name, len);
2301 buffer += len;
2302 }
2303 }
2304 spin_unlock(&info->lock);
2305
2306 return used;
2307}
2308#endif /* CONFIG_TMPFS_XATTR */
2309
2310static const struct inode_operations shmem_symlink_inline_operations = {
2311 .readlink = generic_readlink,
2312 .follow_link = shmem_follow_link_inline,
2313#ifdef CONFIG_TMPFS_XATTR
2314 .setxattr = shmem_setxattr,
2315 .getxattr = shmem_getxattr,
2316 .listxattr = shmem_listxattr,
2317 .removexattr = shmem_removexattr,
2318#endif
2319};
2320
2321static const struct inode_operations shmem_symlink_inode_operations = {
2322 .readlink = generic_readlink,
2323 .follow_link = shmem_follow_link,
2324 .put_link = shmem_put_link,
2325#ifdef CONFIG_TMPFS_XATTR
2326 .setxattr = shmem_setxattr,
2327 .getxattr = shmem_getxattr,
2328 .listxattr = shmem_listxattr,
2329 .removexattr = shmem_removexattr,
2105#endif 2330#endif
2331};
2106 2332
2107static struct dentry *shmem_get_parent(struct dentry *child) 2333static struct dentry *shmem_get_parent(struct dentry *child)
2108{ 2334{
@@ -2143,10 +2369,12 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2143{ 2369{
2144 struct inode *inode = dentry->d_inode; 2370 struct inode *inode = dentry->d_inode;
2145 2371
2146 if (*len < 3) 2372 if (*len < 3) {
2373 *len = 3;
2147 return 255; 2374 return 255;
2375 }
2148 2376
2149 if (hlist_unhashed(&inode->i_hash)) { 2377 if (inode_unhashed(inode)) {
2150 /* Unfortunately insert_inode_hash is not idempotent, 2378 /* Unfortunately insert_inode_hash is not idempotent,
2151 * so as we hash inodes here rather than at creation 2379 * so as we hash inodes here rather than at creation
2152 * time, we need a lock to ensure we only try 2380 * time, we need a lock to ensure we only try
@@ -2154,7 +2382,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
2154 */ 2382 */
2155 static DEFINE_SPINLOCK(lock); 2383 static DEFINE_SPINLOCK(lock);
2156 spin_lock(&lock); 2384 spin_lock(&lock);
2157 if (hlist_unhashed(&inode->i_hash)) 2385 if (inode_unhashed(inode))
2158 __insert_inode_hash(inode, 2386 __insert_inode_hash(inode,
2159 inode->i_ino + inode->i_generation); 2387 inode->i_ino + inode->i_generation);
2160 spin_unlock(&lock); 2388 spin_unlock(&lock);
@@ -2380,8 +2608,10 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2380 sb->s_magic = TMPFS_MAGIC; 2608 sb->s_magic = TMPFS_MAGIC;
2381 sb->s_op = &shmem_ops; 2609 sb->s_op = &shmem_ops;
2382 sb->s_time_gran = 1; 2610 sb->s_time_gran = 1;
2383#ifdef CONFIG_TMPFS_POSIX_ACL 2611#ifdef CONFIG_TMPFS_XATTR
2384 sb->s_xattr = shmem_xattr_handlers; 2612 sb->s_xattr = shmem_xattr_handlers;
2613#endif
2614#ifdef CONFIG_TMPFS_POSIX_ACL
2385 sb->s_flags |= MS_POSIXACL; 2615 sb->s_flags |= MS_POSIXACL;
2386#endif 2616#endif
2387 2617
@@ -2414,13 +2644,20 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
2414 return &p->vfs_inode; 2644 return &p->vfs_inode;
2415} 2645}
2416 2646
2647static void shmem_i_callback(struct rcu_head *head)
2648{
2649 struct inode *inode = container_of(head, struct inode, i_rcu);
2650 INIT_LIST_HEAD(&inode->i_dentry);
2651 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2652}
2653
2417static void shmem_destroy_inode(struct inode *inode) 2654static void shmem_destroy_inode(struct inode *inode)
2418{ 2655{
2419 if ((inode->i_mode & S_IFMT) == S_IFREG) { 2656 if ((inode->i_mode & S_IFMT) == S_IFREG) {
2420 /* only struct inode is valid if it's an inline symlink */ 2657 /* only struct inode is valid if it's an inline symlink */
2421 mpol_free_shared_policy(&SHMEM_I(inode)->policy); 2658 mpol_free_shared_policy(&SHMEM_I(inode)->policy);
2422 } 2659 }
2423 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2660 call_rcu(&inode->i_rcu, shmem_i_callback);
2424} 2661}
2425 2662
2426static void init_once(void *foo) 2663static void init_once(void *foo)
@@ -2470,13 +2707,15 @@ static const struct file_operations shmem_file_operations = {
2470}; 2707};
2471 2708
2472static const struct inode_operations shmem_inode_operations = { 2709static const struct inode_operations shmem_inode_operations = {
2473 .setattr = shmem_notify_change, 2710 .setattr = shmem_setattr,
2474 .truncate_range = shmem_truncate_range, 2711 .truncate_range = shmem_truncate_range,
2712#ifdef CONFIG_TMPFS_XATTR
2713 .setxattr = shmem_setxattr,
2714 .getxattr = shmem_getxattr,
2715 .listxattr = shmem_listxattr,
2716 .removexattr = shmem_removexattr,
2717#endif
2475#ifdef CONFIG_TMPFS_POSIX_ACL 2718#ifdef CONFIG_TMPFS_POSIX_ACL
2476 .setxattr = generic_setxattr,
2477 .getxattr = generic_getxattr,
2478 .listxattr = generic_listxattr,
2479 .removexattr = generic_removexattr,
2480 .check_acl = generic_check_acl, 2719 .check_acl = generic_check_acl,
2481#endif 2720#endif
2482 2721
@@ -2494,23 +2733,27 @@ static const struct inode_operations shmem_dir_inode_operations = {
2494 .mknod = shmem_mknod, 2733 .mknod = shmem_mknod,
2495 .rename = shmem_rename, 2734 .rename = shmem_rename,
2496#endif 2735#endif
2736#ifdef CONFIG_TMPFS_XATTR
2737 .setxattr = shmem_setxattr,
2738 .getxattr = shmem_getxattr,
2739 .listxattr = shmem_listxattr,
2740 .removexattr = shmem_removexattr,
2741#endif
2497#ifdef CONFIG_TMPFS_POSIX_ACL 2742#ifdef CONFIG_TMPFS_POSIX_ACL
2498 .setattr = shmem_notify_change, 2743 .setattr = shmem_setattr,
2499 .setxattr = generic_setxattr,
2500 .getxattr = generic_getxattr,
2501 .listxattr = generic_listxattr,
2502 .removexattr = generic_removexattr,
2503 .check_acl = generic_check_acl, 2744 .check_acl = generic_check_acl,
2504#endif 2745#endif
2505}; 2746};
2506 2747
2507static const struct inode_operations shmem_special_inode_operations = { 2748static const struct inode_operations shmem_special_inode_operations = {
2749#ifdef CONFIG_TMPFS_XATTR
2750 .setxattr = shmem_setxattr,
2751 .getxattr = shmem_getxattr,
2752 .listxattr = shmem_listxattr,
2753 .removexattr = shmem_removexattr,
2754#endif
2508#ifdef CONFIG_TMPFS_POSIX_ACL 2755#ifdef CONFIG_TMPFS_POSIX_ACL
2509 .setattr = shmem_notify_change, 2756 .setattr = shmem_setattr,
2510 .setxattr = generic_setxattr,
2511 .getxattr = generic_getxattr,
2512 .listxattr = generic_listxattr,
2513 .removexattr = generic_removexattr,
2514 .check_acl = generic_check_acl, 2757 .check_acl = generic_check_acl,
2515#endif 2758#endif
2516}; 2759};
@@ -2537,16 +2780,16 @@ static const struct vm_operations_struct shmem_vm_ops = {
2537}; 2780};
2538 2781
2539 2782
2540static int shmem_get_sb(struct file_system_type *fs_type, 2783static struct dentry *shmem_mount(struct file_system_type *fs_type,
2541 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 2784 int flags, const char *dev_name, void *data)
2542{ 2785{
2543 return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); 2786 return mount_nodev(fs_type, flags, data, shmem_fill_super);
2544} 2787}
2545 2788
2546static struct file_system_type tmpfs_fs_type = { 2789static struct file_system_type tmpfs_fs_type = {
2547 .owner = THIS_MODULE, 2790 .owner = THIS_MODULE,
2548 .name = "tmpfs", 2791 .name = "tmpfs",
2549 .get_sb = shmem_get_sb, 2792 .mount = shmem_mount,
2550 .kill_sb = kill_litter_super, 2793 .kill_sb = kill_litter_super,
2551}; 2794};
2552 2795
@@ -2642,7 +2885,7 @@ out:
2642 2885
2643static struct file_system_type tmpfs_fs_type = { 2886static struct file_system_type tmpfs_fs_type = {
2644 .name = "tmpfs", 2887 .name = "tmpfs",
2645 .get_sb = ramfs_get_sb, 2888 .mount = ramfs_mount,
2646 .kill_sb = kill_litter_super, 2889 .kill_sb = kill_litter_super,
2647}; 2890};
2648 2891
@@ -2666,6 +2909,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
2666 return 0; 2909 return 0;
2667} 2910}
2668 2911
2912void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
2913{
2914 truncate_inode_pages_range(inode->i_mapping, start, end);
2915}
2916EXPORT_SYMBOL_GPL(shmem_truncate_range);
2917
2669#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2918#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2670/** 2919/**
2671 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file 2920 * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
@@ -2783,5 +3032,29 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2783 fput(vma->vm_file); 3032 fput(vma->vm_file);
2784 vma->vm_file = file; 3033 vma->vm_file = file;
2785 vma->vm_ops = &shmem_vm_ops; 3034 vma->vm_ops = &shmem_vm_ops;
3035 vma->vm_flags |= VM_CAN_NONLINEAR;
2786 return 0; 3036 return 0;
2787} 3037}
3038
3039/**
3040 * shmem_read_mapping_page_gfp - read into page cache, using specified page allocation flags.
3041 * @mapping: the page's address_space
3042 * @index: the page index
3043 * @gfp: the page allocator flags to use if allocating
3044 *
3045 * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
3046 * with any new page allocations done using the specified allocation flags.
3047 * But read_cache_page_gfp() uses the ->readpage() method: which does not
3048 * suit tmpfs, since it may have pages in swapcache, and needs to find those
3049 * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
3050 *
3051 * Provide a stub for those callers to start using now, then later
3052 * flesh it out to call shmem_getpage() with additional gfp mask, when
3053 * shmem_file_splice_read() is added and shmem_readpage() is removed.
3054 */
3055struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
3056 pgoff_t index, gfp_t gfp)
3057{
3058 return read_cache_page_gfp(mapping, index, gfp);
3059}
3060EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
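The CONFIG_TMPFS_XATTR code added above keeps extended attributes on a per-inode in-memory list of struct shmem_xattr entries guarded by info->lock, accepts only security.* and trusted.* names directly (system.* is routed to the generic handlers), and hides trusted.* from unprivileged listxattr callers. A user-space model of the list-backed set path, with the locking and the XATTR_CREATE/XATTR_REPLACE flag handling left out and made-up names throughout:

#include <errno.h>
#include <stdlib.h>
#include <string.h>

struct xattr {
        struct xattr *next;
        char *name;
        size_t size;
        char value[];                   /* flexible array, like value[0] above */
};

static struct xattr *xattr_list;        /* per inode in the kernel code */

static int xattr_set(const char *name, const void *value, size_t size)
{
        struct xattr **p, *old, *new = NULL;

        if (value) {                    /* value == NULL means remove */
                new = malloc(sizeof(*new) + size);
                if (!new)
                        return -ENOMEM;
                new->name = strdup(name);
                if (!new->name) {
                        free(new);
                        return -ENOMEM;
                }
                new->size = size;
                memcpy(new->value, value, size);
        }

        for (p = &xattr_list; (old = *p) != NULL; p = &old->next) {
                if (strcmp(old->name, name))
                        continue;
                if (new) {              /* replace the existing entry */
                        new->next = old->next;
                        *p = new;
                } else {                /* remove it */
                        *p = old->next;
                }
                free(old->name);
                free(old);
                return 0;
        }

        if (!new)
                return -ENODATA;        /* removing an attribute that is not there */
        new->next = xattr_list;         /* brand-new attribute: add to the list */
        xattr_list = new;
        return 0;
}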
diff --git a/mm/slab.c b/mm/slab.c
index fcae9815d3b3..d96e223de775 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -115,6 +115,7 @@
115#include <linux/debugobjects.h> 115#include <linux/debugobjects.h>
116#include <linux/kmemcheck.h> 116#include <linux/kmemcheck.h>
117#include <linux/memory.h> 117#include <linux/memory.h>
118#include <linux/prefetch.h>
118 119
119#include <asm/cacheflush.h> 120#include <asm/cacheflush.h>
120#include <asm/tlbflush.h> 121#include <asm/tlbflush.h>
@@ -191,22 +192,6 @@ typedef unsigned int kmem_bufctl_t;
191#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) 192#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
192 193
193/* 194/*
194 * struct slab
195 *
196 * Manages the objs in a slab. Placed either at the beginning of mem allocated
197 * for a slab, or allocated from an general cache.
198 * Slabs are chained into three list: fully used, partial, fully free slabs.
199 */
200struct slab {
201 struct list_head list;
202 unsigned long colouroff;
203 void *s_mem; /* including colour offset */
204 unsigned int inuse; /* num of objs active in slab */
205 kmem_bufctl_t free;
206 unsigned short nodeid;
207};
208
209/*
210 * struct slab_rcu 195 * struct slab_rcu
211 * 196 *
212 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to 197 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
@@ -219,8 +204,6 @@ struct slab {
219 * 204 *
220 * rcu_read_lock before reading the address, then rcu_read_unlock after 205 * rcu_read_lock before reading the address, then rcu_read_unlock after
221 * taking the spinlock within the structure expected at that address. 206 * taking the spinlock within the structure expected at that address.
222 *
223 * We assume struct slab_rcu can overlay struct slab when destroying.
224 */ 207 */
225struct slab_rcu { 208struct slab_rcu {
226 struct rcu_head head; 209 struct rcu_head head;
@@ -229,6 +212,27 @@ struct slab_rcu {
229}; 212};
230 213
231/* 214/*
215 * struct slab
216 *
217 * Manages the objs in a slab. Placed either at the beginning of mem allocated
218 * for a slab, or allocated from an general cache.
219 * Slabs are chained into three list: fully used, partial, fully free slabs.
220 */
221struct slab {
222 union {
223 struct {
224 struct list_head list;
225 unsigned long colouroff;
226 void *s_mem; /* including colour offset */
227 unsigned int inuse; /* num of objs active in slab */
228 kmem_bufctl_t free;
229 unsigned short nodeid;
230 };
231 struct slab_rcu __slab_cover_slab_rcu;
232 };
233};
234
235/*
232 * struct array_cache 236 * struct array_cache
233 * 237 *
234 * Purpose: 238 * Purpose:
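Moving struct slab below struct slab_rcu and wrapping its fields in an anonymous union (above) replaces the old "we assume struct slab_rcu can overlay struct slab" comment with something the compiler enforces. The same trick in miniature, with simplified field types:

struct rcu_variant {
        void (*func)(struct rcu_variant *head);
        void *cachep;
        void *addr;
};

struct descriptor {
        union {
                struct {                        /* the normal descriptor fields */
                        struct descriptor *next;
                        unsigned long colouroff;
                        void *s_mem;
                        unsigned int inuse;
                };
                struct rcu_variant rcu;         /* reuses the same storage for RCU freeing */
        };
};

/* The union guarantees the RCU variant fits; no size assumption left to document. */
_Static_assert(sizeof(struct descriptor) >= sizeof(struct rcu_variant),
               "rcu variant must fit inside the descriptor");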
@@ -284,7 +288,7 @@ struct kmem_list3 {
284 * Need this for bootstrapping a per node allocator. 288 * Need this for bootstrapping a per node allocator.
285 */ 289 */
286#define NUM_INIT_LISTS (3 * MAX_NUMNODES) 290#define NUM_INIT_LISTS (3 * MAX_NUMNODES)
287struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; 291static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
288#define CACHE_CACHE 0 292#define CACHE_CACHE 0
289#define SIZE_AC MAX_NUMNODES 293#define SIZE_AC MAX_NUMNODES
290#define SIZE_L3 (2 * MAX_NUMNODES) 294#define SIZE_L3 (2 * MAX_NUMNODES)
@@ -829,12 +833,12 @@ static void init_reap_node(int cpu)
829 833
830static void next_reap_node(void) 834static void next_reap_node(void)
831{ 835{
832 int node = __get_cpu_var(slab_reap_node); 836 int node = __this_cpu_read(slab_reap_node);
833 837
834 node = next_node(node, node_online_map); 838 node = next_node(node, node_online_map);
835 if (unlikely(node >= MAX_NUMNODES)) 839 if (unlikely(node >= MAX_NUMNODES))
836 node = first_node(node_online_map); 840 node = first_node(node_online_map);
837 __get_cpu_var(slab_reap_node) = node; 841 __this_cpu_write(slab_reap_node, node);
838} 842}
839 843
840#else 844#else
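The __get_cpu_var() read and write above become __this_cpu_read()/__this_cpu_write(), which operate on the per-CPU slot directly instead of first forming its address. A rough user-space analogue using thread-local storage (the wrap-around helper stands in for next_node()/first_node()):

static __thread int reap_node;                  /* stands in for the per-CPU variable */

static int next_node_wrapping(int node, int nr_nodes)
{
        return (node + 1) % nr_nodes;
}

static void next_reap_node(int nr_nodes)
{
        int node = reap_node;                           /* like __this_cpu_read()  */

        reap_node = next_node_wrapping(node, nr_nodes); /* like __this_cpu_write() */
}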
@@ -875,7 +879,7 @@ static struct array_cache *alloc_arraycache(int node, int entries,
875 nc = kmalloc_node(memsize, gfp, node); 879 nc = kmalloc_node(memsize, gfp, node);
876 /* 880 /*
877 * The array_cache structures contain pointers to free object. 881 * The array_cache structures contain pointers to free object.
878 * However, when such objects are allocated or transfered to another 882 * However, when such objects are allocated or transferred to another
879 * cache the pointers are not cleared and they could be counted as 883 * cache the pointers are not cleared and they could be counted as
880 * valid references during a kmemleak scan. Therefore, kmemleak must 884 * valid references during a kmemleak scan. Therefore, kmemleak must
881 * not scan such objects. 885 * not scan such objects.
@@ -901,7 +905,7 @@ static int transfer_objects(struct array_cache *to,
901 struct array_cache *from, unsigned int max) 905 struct array_cache *from, unsigned int max)
902{ 906{
903 /* Figure out how many entries to transfer */ 907 /* Figure out how many entries to transfer */
904 int nr = min(min(from->avail, max), to->limit - to->avail); 908 int nr = min3(from->avail, max, to->limit - to->avail);
905 909
906 if (!nr) 910 if (!nr)
907 return 0; 911 return 0;
@@ -1012,7 +1016,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
1012 */ 1016 */
1013static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) 1017static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
1014{ 1018{
1015 int node = __get_cpu_var(slab_reap_node); 1019 int node = __this_cpu_read(slab_reap_node);
1016 1020
1017 if (l3->alien) { 1021 if (l3->alien) {
1018 struct array_cache *ac = l3->alien[node]; 1022 struct array_cache *ac = l3->alien[node];
@@ -1293,7 +1297,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
1293 * anything expensive but will only modify reap_work 1297 * anything expensive but will only modify reap_work
1294 * and reschedule the timer. 1298 * and reschedule the timer.
1295 */ 1299 */
1296 cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu)); 1300 cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu));
1297 /* Now the cache_reaper is guaranteed to be not running. */ 1301 /* Now the cache_reaper is guaranteed to be not running. */
1298 per_cpu(slab_reap_work, cpu).work.func = NULL; 1302 per_cpu(slab_reap_work, cpu).work.func = NULL;
1299 break; 1303 break;
@@ -1387,7 +1391,7 @@ static int __meminit slab_memory_callback(struct notifier_block *self,
1387 break; 1391 break;
1388 } 1392 }
1389out: 1393out:
1390 return ret ? notifier_from_errno(ret) : NOTIFY_OK; 1394 return notifier_from_errno(ret);
1391} 1395}
1392#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ 1396#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */
1393 1397
@@ -2147,8 +2151,6 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
2147 * 2151 *
2148 * @name must be valid until the cache is destroyed. This implies that 2152 * @name must be valid until the cache is destroyed. This implies that
2149 * the module calling this has to destroy the cache before getting unloaded. 2153 * the module calling this has to destroy the cache before getting unloaded.
2150 * Note that kmem_cache_name() is not guaranteed to return the same pointer,
2151 * therefore applications must manage it themselves.
2152 * 2154 *
2153 * The flags are 2155 * The flags are
2154 * 2156 *
@@ -2288,8 +2290,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2288 if (ralign < align) { 2290 if (ralign < align) {
2289 ralign = align; 2291 ralign = align;
2290 } 2292 }
2291 /* disable debug if not aligning with REDZONE_ALIGN */ 2293 /* disable debug if necessary */
2292 if (ralign & (__alignof__(unsigned long long) - 1)) 2294 if (ralign > __alignof__(unsigned long long))
2293 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 2295 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
2294 /* 2296 /*
2295 * 4) Store it. 2297 * 4) Store it.
@@ -2315,8 +2317,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2315 */ 2317 */
2316 if (flags & SLAB_RED_ZONE) { 2318 if (flags & SLAB_RED_ZONE) {
2317 /* add space for red zone words */ 2319 /* add space for red zone words */
2318 cachep->obj_offset += align; 2320 cachep->obj_offset += sizeof(unsigned long long);
2319 size += align + sizeof(unsigned long long); 2321 size += 2 * sizeof(unsigned long long);
2320 } 2322 }
2321 if (flags & SLAB_STORE_USER) { 2323 if (flags & SLAB_STORE_USER) {
2322 /* user store requires one word storage behind the end of 2324 /* user store requires one word storage behind the end of
@@ -2605,7 +2607,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2605 * 2607 *
2606 * The cache must be empty before calling this function. 2608 * The cache must be empty before calling this function.
2607 * 2609 *
2608 * The caller must guarantee that noone will allocate memory from the cache 2610 * The caller must guarantee that no one will allocate memory from the cache
2609 * during the kmem_cache_destroy(). 2611 * during the kmem_cache_destroy().
2610 */ 2612 */
2611void kmem_cache_destroy(struct kmem_cache *cachep) 2613void kmem_cache_destroy(struct kmem_cache *cachep)
@@ -2781,7 +2783,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2781/* 2783/*
2782 * Map pages beginning at addr to the given cache and slab. This is required 2784 * Map pages beginning at addr to the given cache and slab. This is required
2783 * for the slab allocator to be able to lookup the cache and slab of a 2785 * for the slab allocator to be able to lookup the cache and slab of a
2784 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. 2786 * virtual address for kfree, ksize, and slab debugging.
2785 */ 2787 */
2786static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, 2788static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2787 void *addr) 2789 void *addr)
@@ -3602,13 +3604,14 @@ free_done:
3602 * Release an obj back to its cache. If the obj has a constructed state, it must 3604 * Release an obj back to its cache. If the obj has a constructed state, it must
3603 * be in this state _before_ it is released. Called with disabled ints. 3605 * be in this state _before_ it is released. Called with disabled ints.
3604 */ 3606 */
3605static inline void __cache_free(struct kmem_cache *cachep, void *objp) 3607static inline void __cache_free(struct kmem_cache *cachep, void *objp,
3608 void *caller)
3606{ 3609{
3607 struct array_cache *ac = cpu_cache_get(cachep); 3610 struct array_cache *ac = cpu_cache_get(cachep);
3608 3611
3609 check_irq_off(); 3612 check_irq_off();
3610 kmemleak_free_recursive(objp, cachep->flags); 3613 kmemleak_free_recursive(objp, cachep->flags);
3611 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3614 objp = cache_free_debugcheck(cachep, objp, caller);
3612 3615
3613 kmemcheck_slab_free(cachep, objp, obj_size(cachep)); 3616 kmemcheck_slab_free(cachep, objp, obj_size(cachep));
3614 3617
@@ -3653,42 +3656,19 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3653EXPORT_SYMBOL(kmem_cache_alloc); 3656EXPORT_SYMBOL(kmem_cache_alloc);
3654 3657
3655#ifdef CONFIG_TRACING 3658#ifdef CONFIG_TRACING
3656void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) 3659void *
3660kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags)
3657{ 3661{
3658 return __cache_alloc(cachep, flags, __builtin_return_address(0)); 3662 void *ret;
3659}
3660EXPORT_SYMBOL(kmem_cache_alloc_notrace);
3661#endif
3662 3663
3663/** 3664 ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3664 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
3665 * @cachep: the cache we're checking against
3666 * @ptr: pointer to validate
3667 *
3668 * This verifies that the untrusted pointer looks sane;
3669 * it is _not_ a guarantee that the pointer is actually
3670 * part of the slab cache in question, but it at least
3671 * validates that the pointer can be dereferenced and
3672 * looks half-way sane.
3673 *
3674 * Currently only used for dentry validation.
3675 */
3676int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
3677{
3678 unsigned long size = cachep->buffer_size;
3679 struct page *page;
3680 3665
3681 if (unlikely(!kern_ptr_validate(ptr, size))) 3666 trace_kmalloc(_RET_IP_, ret,
3682 goto out; 3667 size, slab_buffer_size(cachep), flags);
3683 page = virt_to_page(ptr); 3668 return ret;
3684 if (unlikely(!PageSlab(page)))
3685 goto out;
3686 if (unlikely(page_get_cache(page) != cachep))
3687 goto out;
3688 return 1;
3689out:
3690 return 0;
3691} 3669}
3670EXPORT_SYMBOL(kmem_cache_alloc_trace);
3671#endif
3692 3672
3693#ifdef CONFIG_NUMA 3673#ifdef CONFIG_NUMA
3694void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3674void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
@@ -3705,31 +3685,32 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3705EXPORT_SYMBOL(kmem_cache_alloc_node); 3685EXPORT_SYMBOL(kmem_cache_alloc_node);
3706 3686
3707#ifdef CONFIG_TRACING 3687#ifdef CONFIG_TRACING
3708void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, 3688void *kmem_cache_alloc_node_trace(size_t size,
3709 gfp_t flags, 3689 struct kmem_cache *cachep,
3710 int nodeid) 3690 gfp_t flags,
3691 int nodeid)
3711{ 3692{
3712 return __cache_alloc_node(cachep, flags, nodeid, 3693 void *ret;
3694
3695 ret = __cache_alloc_node(cachep, flags, nodeid,
3713 __builtin_return_address(0)); 3696 __builtin_return_address(0));
3697 trace_kmalloc_node(_RET_IP_, ret,
3698 size, slab_buffer_size(cachep),
3699 flags, nodeid);
3700 return ret;
3714} 3701}
3715EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); 3702EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
3716#endif 3703#endif
3717 3704
3718static __always_inline void * 3705static __always_inline void *
3719__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) 3706__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3720{ 3707{
3721 struct kmem_cache *cachep; 3708 struct kmem_cache *cachep;
3722 void *ret;
3723 3709
3724 cachep = kmem_find_general_cachep(size, flags); 3710 cachep = kmem_find_general_cachep(size, flags);
3725 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3711 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3726 return cachep; 3712 return cachep;
3727 ret = kmem_cache_alloc_node_notrace(cachep, flags, node); 3713 return kmem_cache_alloc_node_trace(size, cachep, flags, node);
3728
3729 trace_kmalloc_node((unsigned long) caller, ret,
3730 size, cachep->buffer_size, flags, node);
3731
3732 return ret;
3733} 3714}
3734 3715
3735#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) 3716#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
@@ -3821,7 +3802,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3821 debug_check_no_locks_freed(objp, obj_size(cachep)); 3802 debug_check_no_locks_freed(objp, obj_size(cachep));
3822 if (!(cachep->flags & SLAB_DEBUG_OBJECTS)) 3803 if (!(cachep->flags & SLAB_DEBUG_OBJECTS))
3823 debug_check_no_obj_freed(objp, obj_size(cachep)); 3804 debug_check_no_obj_freed(objp, obj_size(cachep));
3824 __cache_free(cachep, objp); 3805 __cache_free(cachep, objp, __builtin_return_address(0));
3825 local_irq_restore(flags); 3806 local_irq_restore(flags);
3826 3807
3827 trace_kmem_cache_free(_RET_IP_, objp); 3808 trace_kmem_cache_free(_RET_IP_, objp);
@@ -3851,7 +3832,7 @@ void kfree(const void *objp)
3851 c = virt_to_cache(objp); 3832 c = virt_to_cache(objp);
3852 debug_check_no_locks_freed(objp, obj_size(c)); 3833 debug_check_no_locks_freed(objp, obj_size(c));
3853 debug_check_no_obj_freed(objp, obj_size(c)); 3834 debug_check_no_obj_freed(objp, obj_size(c));
3854 __cache_free(c, (void *)objp); 3835 __cache_free(c, (void *)objp, __builtin_return_address(0));
3855 local_irq_restore(flags); 3836 local_irq_restore(flags);
3856} 3837}
3857EXPORT_SYMBOL(kfree); 3838EXPORT_SYMBOL(kfree);
@@ -3862,12 +3843,6 @@ unsigned int kmem_cache_size(struct kmem_cache *cachep)
3862} 3843}
3863EXPORT_SYMBOL(kmem_cache_size); 3844EXPORT_SYMBOL(kmem_cache_size);
3864 3845
3865const char *kmem_cache_name(struct kmem_cache *cachep)
3866{
3867 return cachep->name;
3868}
3869EXPORT_SYMBOL_GPL(kmem_cache_name);
3870
3871/* 3846/*
3872 * This initializes kmem_list3 or resizes various caches for all nodes. 3847 * This initializes kmem_list3 or resizes various caches for all nodes.
3873 */ 3848 */
@@ -4075,7 +4050,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
4075 * necessary. Note that the l3 listlock also protects the array_cache 4050 * necessary. Note that the l3 listlock also protects the array_cache
4076 * if drain_array() is used on the shared array. 4051 * if drain_array() is used on the shared array.
4077 */ 4052 */
4078void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, 4053static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
4079 struct array_cache *ac, int force, int node) 4054 struct array_cache *ac, int force, int node)
4080{ 4055{
4081 int tofree; 4056 int tofree;
@@ -4339,7 +4314,7 @@ static const struct seq_operations slabinfo_op = {
4339 * @count: data length 4314 * @count: data length
4340 * @ppos: unused 4315 * @ppos: unused
4341 */ 4316 */
4342ssize_t slabinfo_write(struct file *file, const char __user * buffer, 4317static ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4343 size_t count, loff_t *ppos) 4318 size_t count, loff_t *ppos)
4344{ 4319{
4345 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 4320 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
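The slab.c hunks above fold the old *_notrace entry points into *_trace wrappers and thread a caller pointer through __cache_free(), so the debug checks report the real call site rather than kfree() itself. As a loose userspace sketch of that pattern (not kernel code; free_debug() and my_free() are made-up names for illustration), the public entry point captures __builtin_return_address(0) once and hands it down:

#include <stdio.h>
#include <stdlib.h>

/* Inner helper: receives the address of whoever called the public API. */
static void free_debug(void *obj, void *caller)
{
	printf("freeing %p, called from %p\n", obj, caller);
	free(obj);
}

/* Public entry point: record the caller exactly once, at the API boundary. */
void my_free(void *obj)
{
	free_debug(obj, __builtin_return_address(0));
}

int main(void)
{
	void *p = malloc(32);
	my_free(p);	/* the report points at main(), not at my_free() */
	return 0;
}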
diff --git a/mm/slob.c b/mm/slob.c
index d582171c8101..46e0aee33a23 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -500,7 +500,9 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
500 } else { 500 } else {
501 unsigned int order = get_order(size); 501 unsigned int order = get_order(size);
502 502
503 ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); 503 if (likely(order))
504 gfp |= __GFP_COMP;
505 ret = slob_new_pages(gfp, order, node);
504 if (ret) { 506 if (ret) {
505 struct page *page; 507 struct page *page;
506 page = virt_to_page(ret); 508 page = virt_to_page(ret);
@@ -664,23 +666,12 @@ unsigned int kmem_cache_size(struct kmem_cache *c)
664} 666}
665EXPORT_SYMBOL(kmem_cache_size); 667EXPORT_SYMBOL(kmem_cache_size);
666 668
667const char *kmem_cache_name(struct kmem_cache *c)
668{
669 return c->name;
670}
671EXPORT_SYMBOL(kmem_cache_name);
672
673int kmem_cache_shrink(struct kmem_cache *d) 669int kmem_cache_shrink(struct kmem_cache *d)
674{ 670{
675 return 0; 671 return 0;
676} 672}
677EXPORT_SYMBOL(kmem_cache_shrink); 673EXPORT_SYMBOL(kmem_cache_shrink);
678 674
679int kmem_ptr_validate(struct kmem_cache *a, const void *b)
680{
681 return 0;
682}
683
684static unsigned int slob_ready __read_mostly; 675static unsigned int slob_ready __read_mostly;
685 676
686int slab_is_available(void) 677int slab_is_available(void)
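The slob.c change above only sets __GFP_COMP when the allocation actually spans more than one page (order > 0), since compound-page metadata is pointless for a single page. A minimal userspace sketch of the same conditional-flag idiom, with assumed constants (PAGE_SHIFT 12 and FLAG_COMP are stand-ins, not the kernel's values):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define FLAG_COMP  0x4000u		/* stand-in for __GFP_COMP */

/* Smallest order such that (PAGE_SIZE << order) >= size, like get_order(). */
static unsigned int order_for(unsigned long size)
{
	unsigned int order = 0;

	while ((PAGE_SIZE << order) < size)
		order++;
	return order;
}

int main(void)
{
	unsigned long sizes[] = { 3000, 5000, 70000 };

	for (int i = 0; i < 3; i++) {
		unsigned int order = order_for(sizes[i]);
		unsigned int flags = 0;

		if (order)		/* only multi-page allocations */
			flags |= FLAG_COMP;
		printf("size %lu -> order %u flags %#x\n",
		       sizes[i], order, flags);
	}
	return 0;
}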
diff --git a/mm/slub.c b/mm/slub.c
index 13fffe1f0f3d..35f351f26193 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -28,6 +28,8 @@
28#include <linux/math64.h> 28#include <linux/math64.h>
29#include <linux/fault-inject.h> 29#include <linux/fault-inject.h>
30 30
31#include <trace/events/kmem.h>
32
31/* 33/*
32 * Lock order: 34 * Lock order:
33 * 1. slab_lock(page) 35 * 1. slab_lock(page)
@@ -62,7 +64,7 @@
62 * we must stay away from it for a while since we may cause a bouncing 64 * we must stay away from it for a while since we may cause a bouncing
63 * cacheline if we try to acquire the lock. So go onto the next slab. 65 * cacheline if we try to acquire the lock. So go onto the next slab.
64 * If all pages are busy then we may allocate a new slab instead of reusing 66 * If all pages are busy then we may allocate a new slab instead of reusing
65 * a partial slab. A new slab has noone operating on it and thus there is 67 * a partial slab. A new slab has no one operating on it and thus there is
66 * no danger of cacheline contention. 68 * no danger of cacheline contention.
67 * 69 *
68 * Interrupts are disabled during allocation and deallocation in order to 70 * Interrupts are disabled during allocation and deallocation in order to
@@ -168,7 +170,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
168 170
169/* Internal SLUB flags */ 171/* Internal SLUB flags */
170#define __OBJECT_POISON 0x80000000UL /* Poison object */ 172#define __OBJECT_POISON 0x80000000UL /* Poison object */
171#define __SYSFS_ADD_DEFERRED 0x40000000UL /* Not yet visible via sysfs */
172 173
173static int kmem_size = sizeof(struct kmem_cache); 174static int kmem_size = sizeof(struct kmem_cache);
174 175
@@ -178,7 +179,7 @@ static struct notifier_block slab_notifier;
178 179
179static enum { 180static enum {
180 DOWN, /* No slab functionality available */ 181 DOWN, /* No slab functionality available */
181 PARTIAL, /* kmem_cache_open() works but kmalloc does not */ 182 PARTIAL, /* Kmem_cache_node works */
182 UP, /* Everything works but does not show up in sysfs */ 183 UP, /* Everything works but does not show up in sysfs */
183 SYSFS /* Sysfs up */ 184 SYSFS /* Sysfs up */
184} slab_state = DOWN; 185} slab_state = DOWN;
@@ -199,7 +200,7 @@ struct track {
199 200
200enum track_item { TRACK_ALLOC, TRACK_FREE }; 201enum track_item { TRACK_ALLOC, TRACK_FREE };
201 202
202#ifdef CONFIG_SLUB_DEBUG 203#ifdef CONFIG_SYSFS
203static int sysfs_slab_add(struct kmem_cache *); 204static int sysfs_slab_add(struct kmem_cache *);
204static int sysfs_slab_alias(struct kmem_cache *, const char *); 205static int sysfs_slab_alias(struct kmem_cache *, const char *);
205static void sysfs_slab_remove(struct kmem_cache *); 206static void sysfs_slab_remove(struct kmem_cache *);
@@ -210,12 +211,13 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
210 { return 0; } 211 { return 0; }
211static inline void sysfs_slab_remove(struct kmem_cache *s) 212static inline void sysfs_slab_remove(struct kmem_cache *s)
212{ 213{
214 kfree(s->name);
213 kfree(s); 215 kfree(s);
214} 216}
215 217
216#endif 218#endif
217 219
218static inline void stat(struct kmem_cache *s, enum stat_item si) 220static inline void stat(const struct kmem_cache *s, enum stat_item si)
219{ 221{
220#ifdef CONFIG_SLUB_STATS 222#ifdef CONFIG_SLUB_STATS
221 __this_cpu_inc(s->cpu_slab->stat[si]); 223 __this_cpu_inc(s->cpu_slab->stat[si]);
@@ -233,11 +235,7 @@ int slab_is_available(void)
233 235
234static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 236static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
235{ 237{
236#ifdef CONFIG_NUMA
237 return s->node[node]; 238 return s->node[node];
238#else
239 return &s->local_node;
240#endif
241} 239}
242 240
243/* Verify that a pointer has an address that is valid within a slab page */ 241/* Verify that a pointer has an address that is valid within a slab page */
@@ -263,6 +261,18 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
263 return *(void **)(object + s->offset); 261 return *(void **)(object + s->offset);
264} 262}
265 263
264static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
265{
266 void *p;
267
268#ifdef CONFIG_DEBUG_PAGEALLOC
269 probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
270#else
271 p = get_freepointer(s, object);
272#endif
273 return p;
274}
275
266static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) 276static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
267{ 277{
268 *(void **)(object + s->offset) = fp; 278 *(void **)(object + s->offset) = fp;
@@ -273,21 +283,46 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
273 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ 283 for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
274 __p += (__s)->size) 284 __p += (__s)->size)
275 285
276/* Scan freelist */
277#define for_each_free_object(__p, __s, __free) \
278 for (__p = (__free); __p; __p = get_freepointer((__s), __p))
279
280/* Determine object index from a given position */ 286/* Determine object index from a given position */
281static inline int slab_index(void *p, struct kmem_cache *s, void *addr) 287static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
282{ 288{
283 return (p - addr) / s->size; 289 return (p - addr) / s->size;
284} 290}
285 291
292static inline size_t slab_ksize(const struct kmem_cache *s)
293{
294#ifdef CONFIG_SLUB_DEBUG
295 /*
296 * Debugging requires use of the padding between object
297 * and whatever may come after it.
298 */
299 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
300 return s->objsize;
301
302#endif
303 /*
304 * If we have the need to store the freelist pointer
305 * back there or track user information then we can
306 * only use the space before that information.
307 */
308 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
309 return s->inuse;
310 /*
311 * Else we can use all the padding etc for the allocation
312 */
313 return s->size;
314}
315
316static inline int order_objects(int order, unsigned long size, int reserved)
317{
318 return ((PAGE_SIZE << order) - reserved) / size;
319}
320
286static inline struct kmem_cache_order_objects oo_make(int order, 321static inline struct kmem_cache_order_objects oo_make(int order,
287 unsigned long size) 322 unsigned long size, int reserved)
288{ 323{
289 struct kmem_cache_order_objects x = { 324 struct kmem_cache_order_objects x = {
290 (order << OO_SHIFT) + (PAGE_SIZE << order) / size 325 (order << OO_SHIFT) + order_objects(order, size, reserved)
291 }; 326 };
292 327
293 return x; 328 return x;
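The hunk above introduces order_objects(), which subtracts a per-slab reserved area before dividing by the object size, and oo_make(), which packs order and object count into one word. A small arithmetic sketch under assumed values (PAGE_SIZE 4096 and OO_SHIFT 16 are assumptions for the demo, not taken from the source):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define OO_SHIFT  16
#define OO_MASK   ((1UL << OO_SHIFT) - 1)

static unsigned long order_objects(int order, unsigned long size, int reserved)
{
	return ((PAGE_SIZE << order) - reserved) / size;
}

/* Pack order and object count into a single word, as oo_make() does. */
static unsigned long oo_make(int order, unsigned long size, int reserved)
{
	return ((unsigned long)order << OO_SHIFT) |
	       order_objects(order, size, reserved);
}

int main(void)
{
	/* e.g. a 256-byte object cache with 16 bytes reserved at the slab end */
	unsigned long oo = oo_make(1, 256, 16);

	printf("order=%lu objects=%lu\n", oo >> OO_SHIFT, oo & OO_MASK);
	return 0;
}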
@@ -305,6 +340,21 @@ static inline int oo_objects(struct kmem_cache_order_objects x)
305 340
306#ifdef CONFIG_SLUB_DEBUG 341#ifdef CONFIG_SLUB_DEBUG
307/* 342/*
343 * Determine a map of object in use on a page.
344 *
345 * Slab lock or node listlock must be held to guarantee that the page does
346 * not vanish from under us.
347 */
348static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
349{
350 void *p;
351 void *addr = page_address(page);
352
353 for (p = page->freelist; p; p = get_freepointer(s, p))
354 set_bit(slab_index(p, s, addr), map);
355}
356
357/*
308 * Debug settings: 358 * Debug settings:
309 */ 359 */
310#ifdef CONFIG_SLUB_DEBUG_ON 360#ifdef CONFIG_SLUB_DEBUG_ON
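get_map(), added above, turns the per-page freelist into a bitmap: every object reachable from page->freelist gets its index bit set, so "bit clear" later means "object in use". A self-contained userspace sketch of the same walk over a toy slab (all sizes and names here are invented for the demo):

#include <stdio.h>
#include <string.h>

#define NOBJ 8
#define SIZE 32				/* object size in the toy slab */

static unsigned char slab[NOBJ * SIZE];

/* Index of an object from its address, like slab_index(). */
static int obj_index(void *p) { return (int)(((unsigned char *)p - slab) / SIZE); }

/* First word of a free object stores the next free object, like SLUB's freelist. */
static void *get_free(void *p) { void *n; memcpy(&n, p, sizeof(n)); return n; }
static void set_free(void *p, void *n) { memcpy(p, &n, sizeof(n)); }

int main(void)
{
	void *freelist = NULL;
	unsigned long map = 0;		/* one bit per object, like get_map()'s bitmap */
	int free_idx[] = { 1, 4, 6 };

	/* Pretend objects 1, 4 and 6 are free: push them on the freelist. */
	for (int i = 0; i < 3; i++) {
		void *p = slab + free_idx[i] * SIZE;
		set_free(p, freelist);
		freelist = p;
	}

	/* The get_map() step: walk the freelist and set a bit per free object. */
	for (void *p = freelist; p; p = get_free(p))
		map |= 1UL << obj_index(p);

	for (int i = 0; i < NOBJ; i++)
		printf("object %d: %s\n", i, (map >> i) & 1 ? "free" : "in use");
	return 0;
}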
@@ -494,7 +544,7 @@ static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
494 dump_stack(); 544 dump_stack();
495} 545}
496 546
497static void init_object(struct kmem_cache *s, void *object, int active) 547static void init_object(struct kmem_cache *s, void *object, u8 val)
498{ 548{
499 u8 *p = object; 549 u8 *p = object;
500 550
@@ -504,9 +554,7 @@ static void init_object(struct kmem_cache *s, void *object, int active)
504 } 554 }
505 555
506 if (s->flags & SLAB_RED_ZONE) 556 if (s->flags & SLAB_RED_ZONE)
507 memset(p + s->objsize, 557 memset(p + s->objsize, val, s->inuse - s->objsize);
508 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
509 s->inuse - s->objsize);
510} 558}
511 559
512static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) 560static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
@@ -621,7 +669,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
621 return 1; 669 return 1;
622 670
623 start = page_address(page); 671 start = page_address(page);
624 length = (PAGE_SIZE << compound_order(page)); 672 length = (PAGE_SIZE << compound_order(page)) - s->reserved;
625 end = start + length; 673 end = start + length;
626 remainder = length % s->size; 674 remainder = length % s->size;
627 if (!remainder) 675 if (!remainder)
@@ -641,17 +689,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
641} 689}
642 690
643static int check_object(struct kmem_cache *s, struct page *page, 691static int check_object(struct kmem_cache *s, struct page *page,
644 void *object, int active) 692 void *object, u8 val)
645{ 693{
646 u8 *p = object; 694 u8 *p = object;
647 u8 *endobject = object + s->objsize; 695 u8 *endobject = object + s->objsize;
648 696
649 if (s->flags & SLAB_RED_ZONE) { 697 if (s->flags & SLAB_RED_ZONE) {
650 unsigned int red =
651 active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
652
653 if (!check_bytes_and_report(s, page, object, "Redzone", 698 if (!check_bytes_and_report(s, page, object, "Redzone",
654 endobject, red, s->inuse - s->objsize)) 699 endobject, val, s->inuse - s->objsize))
655 return 0; 700 return 0;
656 } else { 701 } else {
657 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { 702 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
@@ -661,7 +706,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
661 } 706 }
662 707
663 if (s->flags & SLAB_POISON) { 708 if (s->flags & SLAB_POISON) {
664 if (!active && (s->flags & __OBJECT_POISON) && 709 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
665 (!check_bytes_and_report(s, page, p, "Poison", p, 710 (!check_bytes_and_report(s, page, p, "Poison", p,
666 POISON_FREE, s->objsize - 1) || 711 POISON_FREE, s->objsize - 1) ||
667 !check_bytes_and_report(s, page, p, "Poison", 712 !check_bytes_and_report(s, page, p, "Poison",
@@ -673,7 +718,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
673 check_pad_bytes(s, page, p); 718 check_pad_bytes(s, page, p);
674 } 719 }
675 720
676 if (!s->offset && active) 721 if (!s->offset && val == SLUB_RED_ACTIVE)
677 /* 722 /*
678 * Object and freepointer overlap. Cannot check 723 * Object and freepointer overlap. Cannot check
679 * freepointer while object is allocated. 724 * freepointer while object is allocated.
@@ -705,7 +750,7 @@ static int check_slab(struct kmem_cache *s, struct page *page)
705 return 0; 750 return 0;
706 } 751 }
707 752
708 maxobj = (PAGE_SIZE << compound_order(page)) / s->size; 753 maxobj = order_objects(compound_order(page), s->size, s->reserved);
709 if (page->objects > maxobj) { 754 if (page->objects > maxobj) {
710 slab_err(s, page, "objects %u > max %u", 755 slab_err(s, page, "objects %u > max %u",
711 s->name, page->objects, maxobj); 756 s->name, page->objects, maxobj);
@@ -755,7 +800,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search)
755 nr++; 800 nr++;
756 } 801 }
757 802
758 max_objects = (PAGE_SIZE << compound_order(page)) / s->size; 803 max_objects = order_objects(compound_order(page), s->size, s->reserved);
759 if (max_objects > MAX_OBJS_PER_PAGE) 804 if (max_objects > MAX_OBJS_PER_PAGE)
760 max_objects = MAX_OBJS_PER_PAGE; 805 max_objects = MAX_OBJS_PER_PAGE;
761 806
@@ -792,6 +837,49 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
792} 837}
793 838
794/* 839/*
840 * Hooks for other subsystems that check memory allocations. In a typical
 841 * production configuration these hooks should all produce no code at all.
842 */
843static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
844{
845 flags &= gfp_allowed_mask;
846 lockdep_trace_alloc(flags);
847 might_sleep_if(flags & __GFP_WAIT);
848
849 return should_failslab(s->objsize, flags, s->flags);
850}
851
852static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
853{
854 flags &= gfp_allowed_mask;
855 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
856 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
857}
858
859static inline void slab_free_hook(struct kmem_cache *s, void *x)
860{
861 kmemleak_free_recursive(x, s->flags);
862
863 /*
 863 * Trouble is that we may no longer disable interrupts in the fast path
865 * So in order to make the debug calls that expect irqs to be
866 * disabled we need to disable interrupts temporarily.
867 */
868#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
869 {
870 unsigned long flags;
871
872 local_irq_save(flags);
873 kmemcheck_slab_free(s, x, s->objsize);
874 debug_check_no_locks_freed(x, s->objsize);
875 local_irq_restore(flags);
876 }
877#endif
878 if (!(s->flags & SLAB_DEBUG_OBJECTS))
879 debug_check_no_obj_freed(x, s->objsize);
880}
881
882/*
795 * Tracking of fully allocated slabs for debugging purposes. 883 * Tracking of fully allocated slabs for debugging purposes.
796 */ 884 */
797static void add_full(struct kmem_cache_node *n, struct page *page) 885static void add_full(struct kmem_cache_node *n, struct page *page)
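The hooks added above gather the debug-only work (failslab, kmemcheck, kmemleak, lockdep checks) into slab_pre_alloc_hook()/slab_post_alloc_hook()/slab_free_hook(); with debugging disabled they become empty inlines and vanish from the build. A minimal sketch of that compile-out idiom, with made-up hook names:

#include <stdio.h>
#include <stdlib.h>

/* Toggle this to see the hooks disappear from the build entirely. */
/* #define MY_DEBUG 1 */

#ifdef MY_DEBUG
static inline int pre_alloc_hook(size_t size)
{
	printf("about to allocate %zu bytes\n", size);
	return 0;			/* non-zero would mean "fail this allocation" */
}
static inline void post_alloc_hook(void *obj)
{
	printf("allocated %p\n", obj);
}
#else
/* Empty inlines: in a production build these produce no code at all. */
static inline int pre_alloc_hook(size_t size) { (void)size; return 0; }
static inline void post_alloc_hook(void *obj) { (void)obj; }
#endif

int main(void)
{
	if (pre_alloc_hook(64))
		return 1;
	void *p = malloc(64);
	post_alloc_hook(p);
	free(p);
	return 0;
}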
@@ -838,7 +926,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
838 * dilemma by deferring the increment of the count during 926 * dilemma by deferring the increment of the count during
839 * bootstrap (see early_kmem_cache_node_alloc). 927 * bootstrap (see early_kmem_cache_node_alloc).
840 */ 928 */
841 if (!NUMA_BUILD || n) { 929 if (n) {
842 atomic_long_inc(&n->nr_slabs); 930 atomic_long_inc(&n->nr_slabs);
843 atomic_long_add(objects, &n->total_objects); 931 atomic_long_add(objects, &n->total_objects);
844 } 932 }
@@ -858,11 +946,11 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
858 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) 946 if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
859 return; 947 return;
860 948
861 init_object(s, object, 0); 949 init_object(s, object, SLUB_RED_INACTIVE);
862 init_tracking(s, object); 950 init_tracking(s, object);
863} 951}
864 952
865static int alloc_debug_processing(struct kmem_cache *s, struct page *page, 953static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page,
866 void *object, unsigned long addr) 954 void *object, unsigned long addr)
867{ 955{
868 if (!check_slab(s, page)) 956 if (!check_slab(s, page))
@@ -878,14 +966,14 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
878 goto bad; 966 goto bad;
879 } 967 }
880 968
881 if (!check_object(s, page, object, 0)) 969 if (!check_object(s, page, object, SLUB_RED_INACTIVE))
882 goto bad; 970 goto bad;
883 971
884 /* Success perform special debug activities for allocs */ 972 /* Success perform special debug activities for allocs */
885 if (s->flags & SLAB_STORE_USER) 973 if (s->flags & SLAB_STORE_USER)
886 set_track(s, object, TRACK_ALLOC, addr); 974 set_track(s, object, TRACK_ALLOC, addr);
887 trace(s, page, object, 1); 975 trace(s, page, object, 1);
888 init_object(s, object, 1); 976 init_object(s, object, SLUB_RED_ACTIVE);
889 return 1; 977 return 1;
890 978
891bad: 979bad:
@@ -902,8 +990,8 @@ bad:
902 return 0; 990 return 0;
903} 991}
904 992
905static int free_debug_processing(struct kmem_cache *s, struct page *page, 993static noinline int free_debug_processing(struct kmem_cache *s,
906 void *object, unsigned long addr) 994 struct page *page, void *object, unsigned long addr)
907{ 995{
908 if (!check_slab(s, page)) 996 if (!check_slab(s, page))
909 goto fail; 997 goto fail;
@@ -918,7 +1006,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
918 goto fail; 1006 goto fail;
919 } 1007 }
920 1008
921 if (!check_object(s, page, object, 1)) 1009 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
922 return 0; 1010 return 0;
923 1011
924 if (unlikely(s != page->slab)) { 1012 if (unlikely(s != page->slab)) {
@@ -942,7 +1030,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
942 if (s->flags & SLAB_STORE_USER) 1030 if (s->flags & SLAB_STORE_USER)
943 set_track(s, object, TRACK_FREE, addr); 1031 set_track(s, object, TRACK_FREE, addr);
944 trace(s, page, object, 0); 1032 trace(s, page, object, 0);
945 init_object(s, object, 0); 1033 init_object(s, object, SLUB_RED_INACTIVE);
946 return 1; 1034 return 1;
947 1035
948fail: 1036fail:
@@ -1046,7 +1134,7 @@ static inline int free_debug_processing(struct kmem_cache *s,
1046static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1134static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1047 { return 1; } 1135 { return 1; }
1048static inline int check_object(struct kmem_cache *s, struct page *page, 1136static inline int check_object(struct kmem_cache *s, struct page *page,
1049 void *object, int active) { return 1; } 1137 void *object, u8 val) { return 1; }
1050static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1138static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1051static inline unsigned long kmem_cache_flags(unsigned long objsize, 1139static inline unsigned long kmem_cache_flags(unsigned long objsize,
1052 unsigned long flags, const char *name, 1140 unsigned long flags, const char *name,
@@ -1066,7 +1154,16 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
1066 int objects) {} 1154 int objects) {}
1067static inline void dec_slabs_node(struct kmem_cache *s, int node, 1155static inline void dec_slabs_node(struct kmem_cache *s, int node,
1068 int objects) {} 1156 int objects) {}
1069#endif 1157
1158static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
1159 { return 0; }
1160
1161static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
1162 void *object) {}
1163
1164static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
1165
1166#endif /* CONFIG_SLUB_DEBUG */
1070 1167
1071/* 1168/*
1072 * Slab allocation and freeing 1169 * Slab allocation and freeing
@@ -1194,7 +1291,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1194 slab_pad_check(s, page); 1291 slab_pad_check(s, page);
1195 for_each_object(p, s, page_address(page), 1292 for_each_object(p, s, page_address(page),
1196 page->objects) 1293 page->objects)
1197 check_object(s, page, p, 0); 1294 check_object(s, page, p, SLUB_RED_INACTIVE);
1198 } 1295 }
1199 1296
1200 kmemcheck_free_shadow(page, compound_order(page)); 1297 kmemcheck_free_shadow(page, compound_order(page));
@@ -1211,21 +1308,38 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1211 __free_pages(page, order); 1308 __free_pages(page, order);
1212} 1309}
1213 1310
1311#define need_reserve_slab_rcu \
1312 (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head))
1313
1214static void rcu_free_slab(struct rcu_head *h) 1314static void rcu_free_slab(struct rcu_head *h)
1215{ 1315{
1216 struct page *page; 1316 struct page *page;
1217 1317
1218 page = container_of((struct list_head *)h, struct page, lru); 1318 if (need_reserve_slab_rcu)
1319 page = virt_to_head_page(h);
1320 else
1321 page = container_of((struct list_head *)h, struct page, lru);
1322
1219 __free_slab(page->slab, page); 1323 __free_slab(page->slab, page);
1220} 1324}
1221 1325
1222static void free_slab(struct kmem_cache *s, struct page *page) 1326static void free_slab(struct kmem_cache *s, struct page *page)
1223{ 1327{
1224 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { 1328 if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) {
1225 /* 1329 struct rcu_head *head;
1226 * RCU free overloads the RCU head over the LRU 1330
1227 */ 1331 if (need_reserve_slab_rcu) {
1228 struct rcu_head *head = (void *)&page->lru; 1332 int order = compound_order(page);
1333 int offset = (PAGE_SIZE << order) - s->reserved;
1334
1335 VM_BUG_ON(s->reserved != sizeof(*head));
1336 head = page_address(page) + offset;
1337 } else {
1338 /*
1339 * RCU free overloads the RCU head over the LRU
1340 */
1341 head = (void *)&page->lru;
1342 }
1229 1343
1230 call_rcu(head, rcu_free_slab); 1344 call_rcu(head, rcu_free_slab);
1231 } else 1345 } else
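The RCU changes above handle the case where struct rcu_head no longer fits inside page->lru: the cache then reserves sizeof(struct rcu_head) bytes at the end of every slab and places the callback head there, and rcu_free_slab() recovers the page with virt_to_head_page(). A userspace sketch of carving that footer out of a page-sized buffer (fake_rcu_head and the 4096-byte page are assumptions for the demo):

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

struct fake_rcu_head { void *next; void (*func)(void *); };

int main(void)
{
	int order = 0;
	unsigned long reserved = sizeof(struct fake_rcu_head);
	unsigned char *page = malloc(PAGE_SIZE << order);

	if (!page)
		return 1;

	/* Place the callback head in the reserved tail of the slab itself,
	 * mirroring: head = page_address(page) + (PAGE_SIZE << order) - s->reserved */
	unsigned long offset = (PAGE_SIZE << order) - reserved;
	struct fake_rcu_head *head = (struct fake_rcu_head *)(page + offset);

	printf("page %p, head at %p, usable bytes %lu\n",
	       (void *)page, (void *)head, offset);
	free(page);
	return 0;
}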
@@ -1274,13 +1388,19 @@ static void add_partial(struct kmem_cache_node *n,
1274 spin_unlock(&n->list_lock); 1388 spin_unlock(&n->list_lock);
1275} 1389}
1276 1390
1391static inline void __remove_partial(struct kmem_cache_node *n,
1392 struct page *page)
1393{
1394 list_del(&page->lru);
1395 n->nr_partial--;
1396}
1397
1277static void remove_partial(struct kmem_cache *s, struct page *page) 1398static void remove_partial(struct kmem_cache *s, struct page *page)
1278{ 1399{
1279 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1400 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1280 1401
1281 spin_lock(&n->list_lock); 1402 spin_lock(&n->list_lock);
1282 list_del(&page->lru); 1403 __remove_partial(n, page);
1283 n->nr_partial--;
1284 spin_unlock(&n->list_lock); 1404 spin_unlock(&n->list_lock);
1285} 1405}
1286 1406
@@ -1293,8 +1413,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
1293 struct page *page) 1413 struct page *page)
1294{ 1414{
1295 if (slab_trylock(page)) { 1415 if (slab_trylock(page)) {
1296 list_del(&page->lru); 1416 __remove_partial(n, page);
1297 n->nr_partial--;
1298 __SetPageSlubFrozen(page); 1417 __SetPageSlubFrozen(page);
1299 return 1; 1418 return 1;
1300 } 1419 }
@@ -1391,7 +1510,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1391 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; 1510 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
1392 1511
1393 page = get_partial_node(get_node(s, searchnode)); 1512 page = get_partial_node(get_node(s, searchnode));
1394 if (page || node != -1) 1513 if (page || node != NUMA_NO_NODE)
1395 return page; 1514 return page;
1396 1515
1397 return get_any_partial(s, flags); 1516 return get_any_partial(s, flags);
@@ -1405,6 +1524,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1405 * On exit the slab lock will have been dropped. 1524 * On exit the slab lock will have been dropped.
1406 */ 1525 */
1407static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) 1526static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1527 __releases(bitlock)
1408{ 1528{
1409 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1529 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1410 1530
@@ -1443,10 +1563,77 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1443 } 1563 }
1444} 1564}
1445 1565
1566#ifdef CONFIG_PREEMPT
1567/*
 1568 * Calculate the next globally unique transaction for disambiguation
1569 * during cmpxchg. The transactions start with the cpu number and are then
1570 * incremented by CONFIG_NR_CPUS.
1571 */
1572#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS)
1573#else
1574/*
1575 * No preemption supported therefore also no need to check for
1576 * different cpus.
1577 */
1578#define TID_STEP 1
1579#endif
1580
1581static inline unsigned long next_tid(unsigned long tid)
1582{
1583 return tid + TID_STEP;
1584}
1585
1586static inline unsigned int tid_to_cpu(unsigned long tid)
1587{
1588 return tid % TID_STEP;
1589}
1590
1591static inline unsigned long tid_to_event(unsigned long tid)
1592{
1593 return tid / TID_STEP;
1594}
1595
1596static inline unsigned int init_tid(int cpu)
1597{
1598 return cpu;
1599}
1600
1601static inline void note_cmpxchg_failure(const char *n,
1602 const struct kmem_cache *s, unsigned long tid)
1603{
1604#ifdef SLUB_DEBUG_CMPXCHG
1605 unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
1606
1607 printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
1608
1609#ifdef CONFIG_PREEMPT
1610 if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
1611 printk("due to cpu change %d -> %d\n",
1612 tid_to_cpu(tid), tid_to_cpu(actual_tid));
1613 else
1614#endif
1615 if (tid_to_event(tid) != tid_to_event(actual_tid))
1616 printk("due to cpu running other code. Event %ld->%ld\n",
1617 tid_to_event(tid), tid_to_event(actual_tid));
1618 else
1619 printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
1620 actual_tid, tid, next_tid(tid));
1621#endif
1622 stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
1623}
1624
1625void init_kmem_cache_cpus(struct kmem_cache *s)
1626{
1627 int cpu;
1628
1629 for_each_possible_cpu(cpu)
1630 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
1631}
1446/* 1632/*
1447 * Remove the cpu slab 1633 * Remove the cpu slab
1448 */ 1634 */
1449static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1635static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1636 __releases(bitlock)
1450{ 1637{
1451 struct page *page = c->page; 1638 struct page *page = c->page;
1452 int tail = 1; 1639 int tail = 1;
@@ -1473,6 +1660,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1473 page->inuse--; 1660 page->inuse--;
1474 } 1661 }
1475 c->page = NULL; 1662 c->page = NULL;
1663 c->tid = next_tid(c->tid);
1476 unfreeze_slab(s, page, tail); 1664 unfreeze_slab(s, page, tail);
1477} 1665}
1478 1666
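With preemption enabled, each cpu's transaction id starts at the cpu number and advances in steps of roundup_pow_of_two(CONFIG_NR_CPUS), so the originating cpu and the event count can be recovered by modulo and division, exactly as tid_to_cpu() and tid_to_event() do above. A short arithmetic demo (NR_CPUS == 6 is just an example value):

#include <stdio.h>

#define NR_CPUS 6

/* Round NR_CPUS up to a power of two, like roundup_pow_of_two(). */
static unsigned long tid_step(void)
{
	unsigned long step = 1;

	while (step < NR_CPUS)
		step <<= 1;
	return step;
}

int main(void)
{
	unsigned long step = tid_step();	/* 8 for NR_CPUS == 6 */
	unsigned long tid = 3;			/* init_tid() for cpu 3 */

	for (int i = 0; i < 4; i++) {
		printf("tid=%lu cpu=%lu event=%lu\n",
		       tid, tid % step, tid / step);
		tid += step;			/* next_tid() */
	}
	return 0;
}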
@@ -1606,33 +1794,46 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
1606 unsigned long addr, struct kmem_cache_cpu *c) 1794 unsigned long addr, struct kmem_cache_cpu *c)
1607{ 1795{
1608 void **object; 1796 void **object;
1609 struct page *new; 1797 struct page *page;
1798 unsigned long flags;
1799
1800 local_irq_save(flags);
1801#ifdef CONFIG_PREEMPT
1802 /*
1803 * We may have been preempted and rescheduled on a different
1804 * cpu before disabling interrupts. Need to reload cpu area
1805 * pointer.
1806 */
1807 c = this_cpu_ptr(s->cpu_slab);
1808#endif
1610 1809
1611 /* We handle __GFP_ZERO in the caller */ 1810 /* We handle __GFP_ZERO in the caller */
1612 gfpflags &= ~__GFP_ZERO; 1811 gfpflags &= ~__GFP_ZERO;
1613 1812
1614 if (!c->page) 1813 page = c->page;
1814 if (!page)
1615 goto new_slab; 1815 goto new_slab;
1616 1816
1617 slab_lock(c->page); 1817 slab_lock(page);
1618 if (unlikely(!node_match(c, node))) 1818 if (unlikely(!node_match(c, node)))
1619 goto another_slab; 1819 goto another_slab;
1620 1820
1621 stat(s, ALLOC_REFILL); 1821 stat(s, ALLOC_REFILL);
1622 1822
1623load_freelist: 1823load_freelist:
1624 object = c->page->freelist; 1824 object = page->freelist;
1625 if (unlikely(!object)) 1825 if (unlikely(!object))
1626 goto another_slab; 1826 goto another_slab;
1627 if (kmem_cache_debug(s)) 1827 if (kmem_cache_debug(s))
1628 goto debug; 1828 goto debug;
1629 1829
1630 c->freelist = get_freepointer(s, object); 1830 c->freelist = get_freepointer(s, object);
1631 c->page->inuse = c->page->objects; 1831 page->inuse = page->objects;
1632 c->page->freelist = NULL; 1832 page->freelist = NULL;
1633 c->node = page_to_nid(c->page); 1833
1634unlock_out: 1834 slab_unlock(page);
1635 slab_unlock(c->page); 1835 c->tid = next_tid(c->tid);
1836 local_irq_restore(flags);
1636 stat(s, ALLOC_SLOWPATH); 1837 stat(s, ALLOC_SLOWPATH);
1637 return object; 1838 return object;
1638 1839
@@ -1640,42 +1841,50 @@ another_slab:
1640 deactivate_slab(s, c); 1841 deactivate_slab(s, c);
1641 1842
1642new_slab: 1843new_slab:
1643 new = get_partial(s, gfpflags, node); 1844 page = get_partial(s, gfpflags, node);
1644 if (new) { 1845 if (page) {
1645 c->page = new;
1646 stat(s, ALLOC_FROM_PARTIAL); 1846 stat(s, ALLOC_FROM_PARTIAL);
1847 c->node = page_to_nid(page);
1848 c->page = page;
1647 goto load_freelist; 1849 goto load_freelist;
1648 } 1850 }
1649 1851
1852 gfpflags &= gfp_allowed_mask;
1650 if (gfpflags & __GFP_WAIT) 1853 if (gfpflags & __GFP_WAIT)
1651 local_irq_enable(); 1854 local_irq_enable();
1652 1855
1653 new = new_slab(s, gfpflags, node); 1856 page = new_slab(s, gfpflags, node);
1654 1857
1655 if (gfpflags & __GFP_WAIT) 1858 if (gfpflags & __GFP_WAIT)
1656 local_irq_disable(); 1859 local_irq_disable();
1657 1860
1658 if (new) { 1861 if (page) {
1659 c = __this_cpu_ptr(s->cpu_slab); 1862 c = __this_cpu_ptr(s->cpu_slab);
1660 stat(s, ALLOC_SLAB); 1863 stat(s, ALLOC_SLAB);
1661 if (c->page) 1864 if (c->page)
1662 flush_slab(s, c); 1865 flush_slab(s, c);
1663 slab_lock(new); 1866
1664 __SetPageSlubFrozen(new); 1867 slab_lock(page);
1665 c->page = new; 1868 __SetPageSlubFrozen(page);
1869 c->node = page_to_nid(page);
1870 c->page = page;
1666 goto load_freelist; 1871 goto load_freelist;
1667 } 1872 }
1668 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 1873 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
1669 slab_out_of_memory(s, gfpflags, node); 1874 slab_out_of_memory(s, gfpflags, node);
1875 local_irq_restore(flags);
1670 return NULL; 1876 return NULL;
1671debug: 1877debug:
1672 if (!alloc_debug_processing(s, c->page, object, addr)) 1878 if (!alloc_debug_processing(s, page, object, addr))
1673 goto another_slab; 1879 goto another_slab;
1674 1880
1675 c->page->inuse++; 1881 page->inuse++;
1676 c->page->freelist = get_freepointer(s, object); 1882 page->freelist = get_freepointer(s, object);
1677 c->node = -1; 1883 deactivate_slab(s, c);
1678 goto unlock_out; 1884 c->page = NULL;
1885 c->node = NUMA_NO_NODE;
1886 local_irq_restore(flags);
1887 return object;
1679} 1888}
1680 1889
1681/* 1890/*
@@ -1693,34 +1902,63 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1693{ 1902{
1694 void **object; 1903 void **object;
1695 struct kmem_cache_cpu *c; 1904 struct kmem_cache_cpu *c;
1696 unsigned long flags; 1905 unsigned long tid;
1697
1698 gfpflags &= gfp_allowed_mask;
1699 1906
1700 lockdep_trace_alloc(gfpflags); 1907 if (slab_pre_alloc_hook(s, gfpflags))
1701 might_sleep_if(gfpflags & __GFP_WAIT);
1702
1703 if (should_failslab(s->objsize, gfpflags, s->flags))
1704 return NULL; 1908 return NULL;
1705 1909
1706 local_irq_save(flags); 1910redo:
1911
1912 /*
1913 * Must read kmem_cache cpu data via this cpu ptr. Preemption is
1914 * enabled. We may switch back and forth between cpus while
1915 * reading from one cpu area. That does not matter as long
1916 * as we end up on the original cpu again when doing the cmpxchg.
1917 */
1707 c = __this_cpu_ptr(s->cpu_slab); 1918 c = __this_cpu_ptr(s->cpu_slab);
1919
1920 /*
1921 * The transaction ids are globally unique per cpu and per operation on
1922 * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
1923 * occurs on the right processor and that there was no operation on the
1924 * linked list in between.
1925 */
1926 tid = c->tid;
1927 barrier();
1928
1708 object = c->freelist; 1929 object = c->freelist;
1709 if (unlikely(!object || !node_match(c, node))) 1930 if (unlikely(!object || !node_match(c, node)))
1710 1931
1711 object = __slab_alloc(s, gfpflags, node, addr, c); 1932 object = __slab_alloc(s, gfpflags, node, addr, c);
1712 1933
1713 else { 1934 else {
1714 c->freelist = get_freepointer(s, object); 1935 /*
1936 * The cmpxchg will only match if there was no additional
1937 * operation and if we are on the right processor.
1938 *
1939 * The cmpxchg does the following atomically (without lock semantics!)
1940 * 1. Relocate first pointer to the current per cpu area.
1941 * 2. Verify that tid and freelist have not been changed
1942 * 3. If they were not changed replace tid and freelist
1943 *
1944 * Since this is without lock semantics the protection is only against
1945 * code executing on this cpu *not* from access by other cpus.
1946 */
1947 if (unlikely(!irqsafe_cpu_cmpxchg_double(
1948 s->cpu_slab->freelist, s->cpu_slab->tid,
1949 object, tid,
1950 get_freepointer_safe(s, object), next_tid(tid)))) {
1951
1952 note_cmpxchg_failure("slab_alloc", s, tid);
1953 goto redo;
1954 }
1715 stat(s, ALLOC_FASTPATH); 1955 stat(s, ALLOC_FASTPATH);
1716 } 1956 }
1717 local_irq_restore(flags);
1718 1957
1719 if (unlikely(gfpflags & __GFP_ZERO) && object) 1958 if (unlikely(gfpflags & __GFP_ZERO) && object)
1720 memset(object, 0, s->objsize); 1959 memset(object, 0, s->objsize);
1721 1960
1722 kmemcheck_slab_alloc(s, gfpflags, object, s->objsize); 1961 slab_post_alloc_hook(s, gfpflags, object);
1723 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags);
1724 1962
1725 return object; 1963 return object;
1726} 1964}
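The reworked slab_alloc() fast path above no longer disables interrupts: it reads the per-cpu freelist and tid, then commits with a double-word cmpxchg that only succeeds if neither changed, retrying otherwise. Since cmpxchg_double is architecture specific, the following is only a loose userspace analogue of the idea: pack a small freelist index and a version tag into one 64-bit word so a single compare-and-swap checks both, which is what defeats ABA on the freelist head. All names and sizes here are invented for the sketch.

#include <stdio.h>
#include <stdint.h>
#include <stdatomic.h>

#define NIL 0xffffffffu

static _Atomic uint64_t slot;

static uint64_t pack(uint32_t idx, uint32_t tid) { return ((uint64_t)tid << 32) | idx; }
static uint32_t idx_of(uint64_t v) { return (uint32_t)v; }
static uint32_t tid_of(uint64_t v) { return (uint32_t)(v >> 32); }

static uint32_t next[8] = { 1, 2, 3, 4, 5, 6, 7, NIL };	/* toy freelist chain */

static uint32_t alloc_index(void)
{
	uint64_t old, want;

	do {
		old = atomic_load(&slot);
		uint32_t head = idx_of(old);

		if (head == NIL)
			return NIL;		/* empty: fall back to a slow path */
		/* advance the head and bump the tag in one atomic step */
		want = pack(next[head], tid_of(old) + 1);
	} while (!atomic_compare_exchange_weak(&slot, &old, want));

	return idx_of(old);
}

int main(void)
{
	atomic_store(&slot, pack(0, 0));
	for (int i = 0; i < 3; i++)
		printf("allocated object %u\n", (unsigned)alloc_index());
	return 0;
}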
@@ -1736,11 +1974,21 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1736EXPORT_SYMBOL(kmem_cache_alloc); 1974EXPORT_SYMBOL(kmem_cache_alloc);
1737 1975
1738#ifdef CONFIG_TRACING 1976#ifdef CONFIG_TRACING
1739void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) 1977void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
1740{ 1978{
1741 return slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); 1979 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
1980 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
1981 return ret;
1982}
1983EXPORT_SYMBOL(kmem_cache_alloc_trace);
1984
1985void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
1986{
1987 void *ret = kmalloc_order(size, flags, order);
1988 trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
1989 return ret;
1742} 1990}
1743EXPORT_SYMBOL(kmem_cache_alloc_notrace); 1991EXPORT_SYMBOL(kmalloc_order_trace);
1744#endif 1992#endif
1745 1993
1746#ifdef CONFIG_NUMA 1994#ifdef CONFIG_NUMA
@@ -1754,16 +2002,20 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1754 return ret; 2002 return ret;
1755} 2003}
1756EXPORT_SYMBOL(kmem_cache_alloc_node); 2004EXPORT_SYMBOL(kmem_cache_alloc_node);
1757#endif
1758 2005
1759#ifdef CONFIG_TRACING 2006#ifdef CONFIG_TRACING
1760void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, 2007void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
1761 gfp_t gfpflags, 2008 gfp_t gfpflags,
1762 int node) 2009 int node, size_t size)
1763{ 2010{
1764 return slab_alloc(s, gfpflags, node, _RET_IP_); 2011 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
2012
2013 trace_kmalloc_node(_RET_IP_, ret,
2014 size, s->size, gfpflags, node);
2015 return ret;
1765} 2016}
1766EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); 2017EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
2018#endif
1767#endif 2019#endif
1768 2020
1769/* 2021/*
@@ -1779,14 +2031,15 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1779{ 2031{
1780 void *prior; 2032 void *prior;
1781 void **object = (void *)x; 2033 void **object = (void *)x;
2034 unsigned long flags;
1782 2035
1783 stat(s, FREE_SLOWPATH); 2036 local_irq_save(flags);
1784 slab_lock(page); 2037 slab_lock(page);
2038 stat(s, FREE_SLOWPATH);
1785 2039
1786 if (kmem_cache_debug(s)) 2040 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
1787 goto debug; 2041 goto out_unlock;
1788 2042
1789checks_ok:
1790 prior = page->freelist; 2043 prior = page->freelist;
1791 set_freepointer(s, object, prior); 2044 set_freepointer(s, object, prior);
1792 page->freelist = object; 2045 page->freelist = object;
@@ -1811,6 +2064,7 @@ checks_ok:
1811 2064
1812out_unlock: 2065out_unlock:
1813 slab_unlock(page); 2066 slab_unlock(page);
2067 local_irq_restore(flags);
1814 return; 2068 return;
1815 2069
1816slab_empty: 2070slab_empty:
@@ -1822,14 +2076,9 @@ slab_empty:
1822 stat(s, FREE_REMOVE_PARTIAL); 2076 stat(s, FREE_REMOVE_PARTIAL);
1823 } 2077 }
1824 slab_unlock(page); 2078 slab_unlock(page);
2079 local_irq_restore(flags);
1825 stat(s, FREE_SLAB); 2080 stat(s, FREE_SLAB);
1826 discard_slab(s, page); 2081 discard_slab(s, page);
1827 return;
1828
1829debug:
1830 if (!free_debug_processing(s, page, x, addr))
1831 goto out_unlock;
1832 goto checks_ok;
1833} 2082}
1834 2083
1835/* 2084/*
@@ -1848,23 +2097,38 @@ static __always_inline void slab_free(struct kmem_cache *s,
1848{ 2097{
1849 void **object = (void *)x; 2098 void **object = (void *)x;
1850 struct kmem_cache_cpu *c; 2099 struct kmem_cache_cpu *c;
1851 unsigned long flags; 2100 unsigned long tid;
1852 2101
1853 kmemleak_free_recursive(x, s->flags); 2102 slab_free_hook(s, x);
1854 local_irq_save(flags); 2103
2104redo:
2105
2106 /*
 2107 * Determine the current cpu's per cpu slab.
 2108 * The cpu may change afterward. However, that does not matter since
 2109 * data is retrieved via this pointer. If we are on the same cpu
 2110 * during the cmpxchg then the free will succeed.
2111 */
1855 c = __this_cpu_ptr(s->cpu_slab); 2112 c = __this_cpu_ptr(s->cpu_slab);
1856 kmemcheck_slab_free(s, object, s->objsize); 2113
1857 debug_check_no_locks_freed(object, s->objsize); 2114 tid = c->tid;
1858 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 2115 barrier();
1859 debug_check_no_obj_freed(object, s->objsize); 2116
1860 if (likely(page == c->page && c->node >= 0)) { 2117 if (likely(page == c->page)) {
1861 set_freepointer(s, object, c->freelist); 2118 set_freepointer(s, object, c->freelist);
1862 c->freelist = object; 2119
2120 if (unlikely(!irqsafe_cpu_cmpxchg_double(
2121 s->cpu_slab->freelist, s->cpu_slab->tid,
2122 c->freelist, tid,
2123 object, next_tid(tid)))) {
2124
2125 note_cmpxchg_failure("slab_free", s, tid);
2126 goto redo;
2127 }
1863 stat(s, FREE_FASTPATH); 2128 stat(s, FREE_FASTPATH);
1864 } else 2129 } else
1865 __slab_free(s, page, x, addr); 2130 __slab_free(s, page, x, addr);
1866 2131
1867 local_irq_restore(flags);
1868} 2132}
1869 2133
1870void kmem_cache_free(struct kmem_cache *s, void *x) 2134void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -1879,17 +2143,6 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
1879} 2143}
1880EXPORT_SYMBOL(kmem_cache_free); 2144EXPORT_SYMBOL(kmem_cache_free);
1881 2145
1882/* Figure out on which slab page the object resides */
1883static struct page *get_object_page(const void *x)
1884{
1885 struct page *page = virt_to_head_page(x);
1886
1887 if (!PageSlab(page))
1888 return NULL;
1889
1890 return page;
1891}
1892
1893/* 2146/*
1894 * Object placement in a slab is made very easy because we always start at 2147 * Object placement in a slab is made very easy because we always start at
1895 * offset 0. If we tune the size of the object to the alignment then we can 2148 * offset 0. If we tune the size of the object to the alignment then we can
@@ -1945,13 +2198,13 @@ static int slub_nomerge;
1945 * the smallest order which will fit the object. 2198 * the smallest order which will fit the object.
1946 */ 2199 */
1947static inline int slab_order(int size, int min_objects, 2200static inline int slab_order(int size, int min_objects,
1948 int max_order, int fract_leftover) 2201 int max_order, int fract_leftover, int reserved)
1949{ 2202{
1950 int order; 2203 int order;
1951 int rem; 2204 int rem;
1952 int min_order = slub_min_order; 2205 int min_order = slub_min_order;
1953 2206
1954 if ((PAGE_SIZE << min_order) / size > MAX_OBJS_PER_PAGE) 2207 if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE)
1955 return get_order(size * MAX_OBJS_PER_PAGE) - 1; 2208 return get_order(size * MAX_OBJS_PER_PAGE) - 1;
1956 2209
1957 for (order = max(min_order, 2210 for (order = max(min_order,
@@ -1960,10 +2213,10 @@ static inline int slab_order(int size, int min_objects,
1960 2213
1961 unsigned long slab_size = PAGE_SIZE << order; 2214 unsigned long slab_size = PAGE_SIZE << order;
1962 2215
1963 if (slab_size < min_objects * size) 2216 if (slab_size < min_objects * size + reserved)
1964 continue; 2217 continue;
1965 2218
1966 rem = slab_size % size; 2219 rem = (slab_size - reserved) % size;
1967 2220
1968 if (rem <= slab_size / fract_leftover) 2221 if (rem <= slab_size / fract_leftover)
1969 break; 2222 break;
@@ -1973,7 +2226,7 @@ static inline int slab_order(int size, int min_objects,
1973 return order; 2226 return order;
1974} 2227}
1975 2228
1976static inline int calculate_order(int size) 2229static inline int calculate_order(int size, int reserved)
1977{ 2230{
1978 int order; 2231 int order;
1979 int min_objects; 2232 int min_objects;
@@ -1991,14 +2244,14 @@ static inline int calculate_order(int size)
1991 min_objects = slub_min_objects; 2244 min_objects = slub_min_objects;
1992 if (!min_objects) 2245 if (!min_objects)
1993 min_objects = 4 * (fls(nr_cpu_ids) + 1); 2246 min_objects = 4 * (fls(nr_cpu_ids) + 1);
1994 max_objects = (PAGE_SIZE << slub_max_order)/size; 2247 max_objects = order_objects(slub_max_order, size, reserved);
1995 min_objects = min(min_objects, max_objects); 2248 min_objects = min(min_objects, max_objects);
1996 2249
1997 while (min_objects > 1) { 2250 while (min_objects > 1) {
1998 fraction = 16; 2251 fraction = 16;
1999 while (fraction >= 4) { 2252 while (fraction >= 4) {
2000 order = slab_order(size, min_objects, 2253 order = slab_order(size, min_objects,
2001 slub_max_order, fraction); 2254 slub_max_order, fraction, reserved);
2002 if (order <= slub_max_order) 2255 if (order <= slub_max_order)
2003 return order; 2256 return order;
2004 fraction /= 2; 2257 fraction /= 2;
@@ -2010,14 +2263,14 @@ static inline int calculate_order(int size)
2010 * We were unable to place multiple objects in a slab. Now 2263 * We were unable to place multiple objects in a slab. Now
2011 * lets see if we can place a single object there. 2264 * lets see if we can place a single object there.
2012 */ 2265 */
2013 order = slab_order(size, 1, slub_max_order, 1); 2266 order = slab_order(size, 1, slub_max_order, 1, reserved);
2014 if (order <= slub_max_order) 2267 if (order <= slub_max_order)
2015 return order; 2268 return order;
2016 2269
2017 /* 2270 /*
2018 * Doh this slab cannot be placed using slub_max_order. 2271 * Doh this slab cannot be placed using slub_max_order.
2019 */ 2272 */
2020 order = slab_order(size, 1, MAX_ORDER, 1); 2273 order = slab_order(size, 1, MAX_ORDER, 1, reserved);
2021 if (order < MAX_ORDER) 2274 if (order < MAX_ORDER)
2022 return order; 2275 return order;
2023 return -ENOSYS; 2276 return -ENOSYS;
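slab_order() above now subtracts the reserved bytes before computing the leftover, and accepts an order when the waste is at most slab_size/fract_leftover; calculate_order() then relaxes that fraction (16, 8, 4) at decreasing object counts until something fits. A simplified sketch of that search, starting the order scan at 0 and capping it at an assumed MAX_ORDER of 3:

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define MAX_ORDER 3

static int slab_order(unsigned long size, int min_objects,
		      int max_order, int fract_leftover, int reserved)
{
	for (int order = 0; order <= max_order; order++) {
		unsigned long slab_size = PAGE_SIZE << order;

		if (slab_size < min_objects * size + reserved)
			continue;

		unsigned long rem = (slab_size - reserved) % size;

		if (rem <= slab_size / fract_leftover)
			return order;
	}
	return max_order + 1;	/* nothing acceptable at these constraints */
}

int main(void)
{
	unsigned long size = 700;
	int reserved = 16;

	/* Relax the waste limit (1/16 -> 1/8 -> 1/4) until some order is
	 * acceptable, mirroring the fraction loop in calculate_order(). */
	for (int fraction = 16; fraction >= 4; fraction /= 2) {
		int order = slab_order(size, 4, MAX_ORDER, fraction, reserved);

		printf("fraction 1/%d -> order %d\n", fraction, order);
		if (order <= MAX_ORDER)
			break;
	}
	return 0;
}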
@@ -2062,26 +2315,28 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2062#endif 2315#endif
2063} 2316}
2064 2317
2065static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]); 2318static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
2066
2067static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2068{ 2319{
2069 if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) 2320 BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
2070 /* 2321 SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
2071 * Boot time creation of the kmalloc array. Use static per cpu data 2322
2072 * since the per cpu allocator is not available yet. 2323 /*
2073 */ 2324 * Must align to double word boundary for the double cmpxchg
2074 s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches); 2325 * instructions to work; see __pcpu_double_call_return_bool().
2075 else 2326 */
2076 s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); 2327 s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu),
2328 2 * sizeof(void *));
2077 2329
2078 if (!s->cpu_slab) 2330 if (!s->cpu_slab)
2079 return 0; 2331 return 0;
2080 2332
2333 init_kmem_cache_cpus(s);
2334
2081 return 1; 2335 return 1;
2082} 2336}
2083 2337
2084#ifdef CONFIG_NUMA 2338static struct kmem_cache *kmem_cache_node;
2339
2085/* 2340/*
2086 * No kmalloc_node yet so do it by hand. We know that this is the first 2341 * No kmalloc_node yet so do it by hand. We know that this is the first
2087 * slab on the node for this slabcache. There are no concurrent accesses 2342 * slab on the node for this slabcache. There are no concurrent accesses
@@ -2091,15 +2346,15 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
2091 * when allocating for the kmalloc_node_cache. This is used for bootstrapping 2346 * when allocating for the kmalloc_node_cache. This is used for bootstrapping
2092 * memory on a fresh node that has no slab structures yet. 2347 * memory on a fresh node that has no slab structures yet.
2093 */ 2348 */
2094static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node) 2349static void early_kmem_cache_node_alloc(int node)
2095{ 2350{
2096 struct page *page; 2351 struct page *page;
2097 struct kmem_cache_node *n; 2352 struct kmem_cache_node *n;
2098 unsigned long flags; 2353 unsigned long flags;
2099 2354
2100 BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); 2355 BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
2101 2356
2102 page = new_slab(kmalloc_caches, gfpflags, node); 2357 page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
2103 2358
2104 BUG_ON(!page); 2359 BUG_ON(!page);
2105 if (page_to_nid(page) != node) { 2360 if (page_to_nid(page) != node) {
@@ -2111,15 +2366,15 @@ static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node)
2111 2366
2112 n = page->freelist; 2367 n = page->freelist;
2113 BUG_ON(!n); 2368 BUG_ON(!n);
2114 page->freelist = get_freepointer(kmalloc_caches, n); 2369 page->freelist = get_freepointer(kmem_cache_node, n);
2115 page->inuse++; 2370 page->inuse++;
2116 kmalloc_caches->node[node] = n; 2371 kmem_cache_node->node[node] = n;
2117#ifdef CONFIG_SLUB_DEBUG 2372#ifdef CONFIG_SLUB_DEBUG
2118 init_object(kmalloc_caches, n, 1); 2373 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
2119 init_tracking(kmalloc_caches, n); 2374 init_tracking(kmem_cache_node, n);
2120#endif 2375#endif
2121 init_kmem_cache_node(n, kmalloc_caches); 2376 init_kmem_cache_node(n, kmem_cache_node);
2122 inc_slabs_node(kmalloc_caches, node, page->objects); 2377 inc_slabs_node(kmem_cache_node, node, page->objects);
2123 2378
2124 /* 2379 /*
2125 * lockdep requires consistent irq usage for each lock 2380 * lockdep requires consistent irq usage for each lock
@@ -2137,13 +2392,15 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
2137 2392
2138 for_each_node_state(node, N_NORMAL_MEMORY) { 2393 for_each_node_state(node, N_NORMAL_MEMORY) {
2139 struct kmem_cache_node *n = s->node[node]; 2394 struct kmem_cache_node *n = s->node[node];
2395
2140 if (n) 2396 if (n)
2141 kmem_cache_free(kmalloc_caches, n); 2397 kmem_cache_free(kmem_cache_node, n);
2398
2142 s->node[node] = NULL; 2399 s->node[node] = NULL;
2143 } 2400 }
2144} 2401}
2145 2402
2146static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 2403static int init_kmem_cache_nodes(struct kmem_cache *s)
2147{ 2404{
2148 int node; 2405 int node;
2149 2406
@@ -2151,11 +2408,11 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2151 struct kmem_cache_node *n; 2408 struct kmem_cache_node *n;
2152 2409
2153 if (slab_state == DOWN) { 2410 if (slab_state == DOWN) {
2154 early_kmem_cache_node_alloc(gfpflags, node); 2411 early_kmem_cache_node_alloc(node);
2155 continue; 2412 continue;
2156 } 2413 }
2157 n = kmem_cache_alloc_node(kmalloc_caches, 2414 n = kmem_cache_alloc_node(kmem_cache_node,
2158 gfpflags, node); 2415 GFP_KERNEL, node);
2159 2416
2160 if (!n) { 2417 if (!n) {
2161 free_kmem_cache_nodes(s); 2418 free_kmem_cache_nodes(s);
@@ -2167,17 +2424,6 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2167 } 2424 }
2168 return 1; 2425 return 1;
2169} 2426}
2170#else
2171static void free_kmem_cache_nodes(struct kmem_cache *s)
2172{
2173}
2174
2175static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2176{
2177 init_kmem_cache_node(&s->local_node, s);
2178 return 1;
2179}
2180#endif
2181 2427
2182static void set_min_partial(struct kmem_cache *s, unsigned long min) 2428static void set_min_partial(struct kmem_cache *s, unsigned long min)
2183{ 2429{
@@ -2285,7 +2531,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2285 if (forced_order >= 0) 2531 if (forced_order >= 0)
2286 order = forced_order; 2532 order = forced_order;
2287 else 2533 else
2288 order = calculate_order(size); 2534 order = calculate_order(size, s->reserved);
2289 2535
2290 if (order < 0) 2536 if (order < 0)
2291 return 0; 2537 return 0;
@@ -2303,8 +2549,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2303 /* 2549 /*
2304 * Determine the number of objects per slab 2550 * Determine the number of objects per slab
2305 */ 2551 */
2306 s->oo = oo_make(order, size); 2552 s->oo = oo_make(order, size, s->reserved);
2307 s->min = oo_make(get_order(size), size); 2553 s->min = oo_make(get_order(size), size, s->reserved);
2308 if (oo_objects(s->oo) > oo_objects(s->max)) 2554 if (oo_objects(s->oo) > oo_objects(s->max))
2309 s->max = s->oo; 2555 s->max = s->oo;
2310 2556
@@ -2312,7 +2558,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2312 2558
2313} 2559}
2314 2560
2315static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2561static int kmem_cache_open(struct kmem_cache *s,
2316 const char *name, size_t size, 2562 const char *name, size_t size,
2317 size_t align, unsigned long flags, 2563 size_t align, unsigned long flags,
2318 void (*ctor)(void *)) 2564 void (*ctor)(void *))
@@ -2323,6 +2569,10 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2323 s->objsize = size; 2569 s->objsize = size;
2324 s->align = align; 2570 s->align = align;
2325 s->flags = kmem_cache_flags(size, flags, name, ctor); 2571 s->flags = kmem_cache_flags(size, flags, name, ctor);
2572 s->reserved = 0;
2573
2574 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
2575 s->reserved = sizeof(struct rcu_head);
2326 2576
2327 if (!calculate_sizes(s, -1)) 2577 if (!calculate_sizes(s, -1))
2328 goto error; 2578 goto error;
@@ -2348,10 +2598,10 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2348#ifdef CONFIG_NUMA 2598#ifdef CONFIG_NUMA
2349 s->remote_node_defrag_ratio = 1000; 2599 s->remote_node_defrag_ratio = 1000;
2350#endif 2600#endif
2351 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 2601 if (!init_kmem_cache_nodes(s))
2352 goto error; 2602 goto error;
2353 2603
2354 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) 2604 if (alloc_kmem_cache_cpus(s))
2355 return 1; 2605 return 1;
2356 2606
2357 free_kmem_cache_nodes(s); 2607 free_kmem_cache_nodes(s);
@@ -2365,35 +2615,6 @@ error:
2365} 2615}
2366 2616
2367/* 2617/*
2368 * Check if a given pointer is valid
2369 */
2370int kmem_ptr_validate(struct kmem_cache *s, const void *object)
2371{
2372 struct page *page;
2373
2374 if (!kern_ptr_validate(object, s->size))
2375 return 0;
2376
2377 page = get_object_page(object);
2378
2379 if (!page || s != page->slab)
2380 /* No slab or wrong slab */
2381 return 0;
2382
2383 if (!check_valid_pointer(s, page, object))
2384 return 0;
2385
2386 /*
2387 * We could also check if the object is on the slabs freelist.
2388 * But this would be too expensive and it seems that the main
2389 * purpose of kmem_ptr_valid() is to check if the object belongs
2390 * to a certain slab.
2391 */
2392 return 1;
2393}
2394EXPORT_SYMBOL(kmem_ptr_validate);
2395
2396/*
2397 * Determine the size of a slab object 2618 * Determine the size of a slab object
2398 */ 2619 */
2399unsigned int kmem_cache_size(struct kmem_cache *s) 2620unsigned int kmem_cache_size(struct kmem_cache *s)
@@ -2402,28 +2623,20 @@ unsigned int kmem_cache_size(struct kmem_cache *s)
2402} 2623}
2403EXPORT_SYMBOL(kmem_cache_size); 2624EXPORT_SYMBOL(kmem_cache_size);
2404 2625
2405const char *kmem_cache_name(struct kmem_cache *s)
2406{
2407 return s->name;
2408}
2409EXPORT_SYMBOL(kmem_cache_name);
2410
2411static void list_slab_objects(struct kmem_cache *s, struct page *page, 2626static void list_slab_objects(struct kmem_cache *s, struct page *page,
2412 const char *text) 2627 const char *text)
2413{ 2628{
2414#ifdef CONFIG_SLUB_DEBUG 2629#ifdef CONFIG_SLUB_DEBUG
2415 void *addr = page_address(page); 2630 void *addr = page_address(page);
2416 void *p; 2631 void *p;
2417 long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long), 2632 unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
2418 GFP_ATOMIC); 2633 sizeof(long), GFP_ATOMIC);
2419
2420 if (!map) 2634 if (!map)
2421 return; 2635 return;
2422 slab_err(s, page, "%s", text); 2636 slab_err(s, page, "%s", text);
2423 slab_lock(page); 2637 slab_lock(page);
2424 for_each_free_object(p, s, page->freelist)
2425 set_bit(slab_index(p, s, addr), map);
2426 2638
2639 get_map(s, page, map);
2427 for_each_object(p, s, addr, page->objects) { 2640 for_each_object(p, s, addr, page->objects) {
2428 2641
2429 if (!test_bit(slab_index(p, s, addr), map)) { 2642 if (!test_bit(slab_index(p, s, addr), map)) {
@@ -2448,9 +2661,8 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
2448 spin_lock_irqsave(&n->list_lock, flags); 2661 spin_lock_irqsave(&n->list_lock, flags);
2449 list_for_each_entry_safe(page, h, &n->partial, lru) { 2662 list_for_each_entry_safe(page, h, &n->partial, lru) {
2450 if (!page->inuse) { 2663 if (!page->inuse) {
2451 list_del(&page->lru); 2664 __remove_partial(n, page);
2452 discard_slab(s, page); 2665 discard_slab(s, page);
2453 n->nr_partial--;
2454 } else { 2666 } else {
2455 list_slab_objects(s, page, 2667 list_slab_objects(s, page,
2456 "Objects remaining on kmem_cache_close()"); 2668 "Objects remaining on kmem_cache_close()");
@@ -2507,9 +2719,15 @@ EXPORT_SYMBOL(kmem_cache_destroy);
2507 * Kmalloc subsystem 2719 * Kmalloc subsystem
2508 *******************************************************************/ 2720 *******************************************************************/
2509 2721
2510struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned; 2722struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
2511EXPORT_SYMBOL(kmalloc_caches); 2723EXPORT_SYMBOL(kmalloc_caches);
2512 2724
2725static struct kmem_cache *kmem_cache;
2726
2727#ifdef CONFIG_ZONE_DMA
2728static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
2729#endif
2730
2513static int __init setup_slub_min_order(char *str) 2731static int __init setup_slub_min_order(char *str)
2514{ 2732{
2515 get_option(&str, &slub_min_order); 2733 get_option(&str, &slub_min_order);
@@ -2546,116 +2764,29 @@ static int __init setup_slub_nomerge(char *str)
2546 2764
2547__setup("slub_nomerge", setup_slub_nomerge); 2765__setup("slub_nomerge", setup_slub_nomerge);
2548 2766
2549static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, 2767static struct kmem_cache *__init create_kmalloc_cache(const char *name,
2550 const char *name, int size, gfp_t gfp_flags) 2768 int size, unsigned int flags)
2551{ 2769{
2552 unsigned int flags = 0; 2770 struct kmem_cache *s;
2553 2771
2554 if (gfp_flags & SLUB_DMA) 2772 s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
2555 flags = SLAB_CACHE_DMA;
2556 2773
2557 /* 2774 /*
2558 * This function is called with IRQs disabled during early-boot on 2775 * This function is called with IRQs disabled during early-boot on
2559 * single CPU so there's no need to take slub_lock here. 2776 * single CPU so there's no need to take slub_lock here.
2560 */ 2777 */
2561 if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, 2778 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
2562 flags, NULL)) 2779 flags, NULL))
2563 goto panic; 2780 goto panic;
2564 2781
2565 list_add(&s->list, &slab_caches); 2782 list_add(&s->list, &slab_caches);
2566
2567 if (sysfs_slab_add(s))
2568 goto panic;
2569 return s; 2783 return s;
2570 2784
2571panic: 2785panic:
2572 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); 2786 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
2787 return NULL;
2573} 2788}
2574 2789
2575#ifdef CONFIG_ZONE_DMA
2576static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT];
2577
2578static void sysfs_add_func(struct work_struct *w)
2579{
2580 struct kmem_cache *s;
2581
2582 down_write(&slub_lock);
2583 list_for_each_entry(s, &slab_caches, list) {
2584 if (s->flags & __SYSFS_ADD_DEFERRED) {
2585 s->flags &= ~__SYSFS_ADD_DEFERRED;
2586 sysfs_slab_add(s);
2587 }
2588 }
2589 up_write(&slub_lock);
2590}
2591
2592static DECLARE_WORK(sysfs_add_work, sysfs_add_func);
2593
2594static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
2595{
2596 struct kmem_cache *s;
2597 char *text;
2598 size_t realsize;
2599 unsigned long slabflags;
2600 int i;
2601
2602 s = kmalloc_caches_dma[index];
2603 if (s)
2604 return s;
2605
2606 /* Dynamically create dma cache */
2607 if (flags & __GFP_WAIT)
2608 down_write(&slub_lock);
2609 else {
2610 if (!down_write_trylock(&slub_lock))
2611 goto out;
2612 }
2613
2614 if (kmalloc_caches_dma[index])
2615 goto unlock_out;
2616
2617 realsize = kmalloc_caches[index].objsize;
2618 text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
2619 (unsigned int)realsize);
2620
2621 s = NULL;
2622 for (i = 0; i < KMALLOC_CACHES; i++)
2623 if (!kmalloc_caches[i].size)
2624 break;
2625
2626 BUG_ON(i >= KMALLOC_CACHES);
2627 s = kmalloc_caches + i;
2628
2629 /*
2630 * Must defer sysfs creation to a workqueue because we don't know
2631 * what context we are called from. Before sysfs comes up, we don't
2632 * need to do anything because our sysfs initcall will start by
2633 * adding all existing slabs to sysfs.
2634 */
2635 slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK;
2636 if (slab_state >= SYSFS)
2637 slabflags |= __SYSFS_ADD_DEFERRED;
2638
2639 if (!text || !kmem_cache_open(s, flags, text,
2640 realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
2641 s->size = 0;
2642 kfree(text);
2643 goto unlock_out;
2644 }
2645
2646 list_add(&s->list, &slab_caches);
2647 kmalloc_caches_dma[index] = s;
2648
2649 if (slab_state >= SYSFS)
2650 schedule_work(&sysfs_add_work);
2651
2652unlock_out:
2653 up_write(&slub_lock);
2654out:
2655 return kmalloc_caches_dma[index];
2656}
2657#endif
2658
2659/* 2790/*
2660 * Conversion table for small slabs sizes / 8 to the index in the 2791 * Conversion table for small slabs sizes / 8 to the index in the
2661 * kmalloc array. This is necessary for slabs < 192 since we have non power 2792 * kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -2708,10 +2839,10 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2708 2839
2709#ifdef CONFIG_ZONE_DMA 2840#ifdef CONFIG_ZONE_DMA
2710 if (unlikely((flags & SLUB_DMA))) 2841 if (unlikely((flags & SLUB_DMA)))
2711 return dma_kmalloc_cache(index, flags); 2842 return kmalloc_dma_caches[index];
2712 2843
2713#endif 2844#endif
2714 return &kmalloc_caches[index]; 2845 return kmalloc_caches[index];
2715} 2846}
2716 2847
2717void *__kmalloc(size_t size, gfp_t flags) 2848void *__kmalloc(size_t size, gfp_t flags)
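For readers tracing the get_slab() change just above: the DMA variants are no longer created lazily, so the size-class lookup collapses into indexing one of two pre-populated arrays of cache pointers. Below is a minimal userspace sketch of that lookup pattern; the names (struct cache, lookup_cache, WANT_DMA, NUM_CLASSES) are illustrative stand-ins rather than the kernel API, and the power-of-two size-class mapping is simplified compared to SLUB's size_index table.

#include <stdio.h>
#include <stddef.h>

#define NUM_CLASSES 12
#define WANT_DMA    0x1u

struct cache {
	const char *name;
	size_t object_size;
};

static struct cache *normal_caches[NUM_CLASSES];
static struct cache *dma_caches[NUM_CLASSES];

/* Map an allocation size to a power-of-two size class (simplified). */
static int size_to_index(size_t size)
{
	int i;

	for (i = 3; i < NUM_CLASSES; i++)
		if (size <= ((size_t)1 << i))
			return i;
	return -1;
}

/* Analogue of the reworked get_slab(): a plain table lookup, no lazy creation. */
static struct cache *lookup_cache(size_t size, unsigned int flags)
{
	int index = size_to_index(size);

	if (index < 0)
		return NULL;
	if (flags & WANT_DMA)
		return dma_caches[index];
	return normal_caches[index];
}

int main(void)
{
	static struct cache k64 = { "kmalloc-64", 64 };
	struct cache *c;

	normal_caches[6] = &k64;	/* 2^6 == 64-byte class */
	c = lookup_cache(40, 0);	/* rounds up to the 64-byte class */
	printf("%s\n", c ? c->name : "(none)");
	return 0;
}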
@@ -2735,6 +2866,7 @@ void *__kmalloc(size_t size, gfp_t flags)
2735} 2866}
2736EXPORT_SYMBOL(__kmalloc); 2867EXPORT_SYMBOL(__kmalloc);
2737 2868
2869#ifdef CONFIG_NUMA
2738static void *kmalloc_large_node(size_t size, gfp_t flags, int node) 2870static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2739{ 2871{
2740 struct page *page; 2872 struct page *page;
@@ -2749,7 +2881,6 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2749 return ptr; 2881 return ptr;
2750} 2882}
2751 2883
2752#ifdef CONFIG_NUMA
2753void *__kmalloc_node(size_t size, gfp_t flags, int node) 2884void *__kmalloc_node(size_t size, gfp_t flags, int node)
2754{ 2885{
2755 struct kmem_cache *s; 2886 struct kmem_cache *s;
@@ -2782,7 +2913,6 @@ EXPORT_SYMBOL(__kmalloc_node);
2782size_t ksize(const void *object) 2913size_t ksize(const void *object)
2783{ 2914{
2784 struct page *page; 2915 struct page *page;
2785 struct kmem_cache *s;
2786 2916
2787 if (unlikely(object == ZERO_SIZE_PTR)) 2917 if (unlikely(object == ZERO_SIZE_PTR))
2788 return 0; 2918 return 0;
@@ -2793,28 +2923,8 @@ size_t ksize(const void *object)
2793 WARN_ON(!PageCompound(page)); 2923 WARN_ON(!PageCompound(page));
2794 return PAGE_SIZE << compound_order(page); 2924 return PAGE_SIZE << compound_order(page);
2795 } 2925 }
2796 s = page->slab;
2797 2926
2798#ifdef CONFIG_SLUB_DEBUG 2927 return slab_ksize(page->slab);
2799 /*
2800 * Debugging requires use of the padding between object
2801 * and whatever may come after it.
2802 */
2803 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
2804 return s->objsize;
2805
2806#endif
2807 /*
2808 * If we have the need to store the freelist pointer
2809 * back there or track user information then we can
2810 * only use the space before that information.
2811 */
2812 if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
2813 return s->inuse;
2814 /*
2815 * Else we can use all the padding etc for the allocation
2816 */
2817 return s->size;
2818} 2928}
2819EXPORT_SYMBOL(ksize); 2929EXPORT_SYMBOL(ksize);
2820 2930
@@ -2889,8 +2999,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
2889 * may have freed the last object and be 2999 * may have freed the last object and be
2890 * waiting to release the slab. 3000 * waiting to release the slab.
2891 */ 3001 */
2892 list_del(&page->lru); 3002 __remove_partial(n, page);
2893 n->nr_partial--;
2894 slab_unlock(page); 3003 slab_unlock(page);
2895 discard_slab(s, page); 3004 discard_slab(s, page);
2896 } else { 3005 } else {
@@ -2914,7 +3023,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
2914} 3023}
2915EXPORT_SYMBOL(kmem_cache_shrink); 3024EXPORT_SYMBOL(kmem_cache_shrink);
2916 3025
2917#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) 3026#if defined(CONFIG_MEMORY_HOTPLUG)
2918static int slab_mem_going_offline_callback(void *arg) 3027static int slab_mem_going_offline_callback(void *arg)
2919{ 3028{
2920 struct kmem_cache *s; 3029 struct kmem_cache *s;
@@ -2956,7 +3065,7 @@ static void slab_mem_offline_callback(void *arg)
2956 BUG_ON(slabs_node(s, offline_node)); 3065 BUG_ON(slabs_node(s, offline_node));
2957 3066
2958 s->node[offline_node] = NULL; 3067 s->node[offline_node] = NULL;
2959 kmem_cache_free(kmalloc_caches, n); 3068 kmem_cache_free(kmem_cache_node, n);
2960 } 3069 }
2961 } 3070 }
2962 up_read(&slub_lock); 3071 up_read(&slub_lock);
@@ -2989,7 +3098,7 @@ static int slab_mem_going_online_callback(void *arg)
2989 * since memory is not yet available from the node that 3098 * since memory is not yet available from the node that
2990 * is brought up. 3099 * is brought up.
2991 */ 3100 */
2992 n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL); 3101 n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
2993 if (!n) { 3102 if (!n) {
2994 ret = -ENOMEM; 3103 ret = -ENOMEM;
2995 goto out; 3104 goto out;
@@ -3035,46 +3144,92 @@ static int slab_memory_callback(struct notifier_block *self,
3035 * Basic setup of slabs 3144 * Basic setup of slabs
3036 *******************************************************************/ 3145 *******************************************************************/
3037 3146
3147/*
3148 * Used for early kmem_cache structures that were allocated using
3149 * the page allocator
3150 */
3151
3152static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
3153{
3154 int node;
3155
3156 list_add(&s->list, &slab_caches);
3157 s->refcount = -1;
3158
3159 for_each_node_state(node, N_NORMAL_MEMORY) {
3160 struct kmem_cache_node *n = get_node(s, node);
3161 struct page *p;
3162
3163 if (n) {
3164 list_for_each_entry(p, &n->partial, lru)
3165 p->slab = s;
3166
3167#ifdef CONFIG_SLUB_DEBUG
3168 list_for_each_entry(p, &n->full, lru)
3169 p->slab = s;
3170#endif
3171 }
3172 }
3173}
3174
3038void __init kmem_cache_init(void) 3175void __init kmem_cache_init(void)
3039{ 3176{
3040 int i; 3177 int i;
3041 int caches = 0; 3178 int caches = 0;
3179 struct kmem_cache *temp_kmem_cache;
3180 int order;
3181 struct kmem_cache *temp_kmem_cache_node;
3182 unsigned long kmalloc_size;
3183
3184 kmem_size = offsetof(struct kmem_cache, node) +
3185 nr_node_ids * sizeof(struct kmem_cache_node *);
3186
3187 /* Allocate two kmem_caches from the page allocator */
3188 kmalloc_size = ALIGN(kmem_size, cache_line_size());
3189 order = get_order(2 * kmalloc_size);
3190 kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order);
3042 3191
3043#ifdef CONFIG_NUMA
3044 /* 3192 /*
3045 * Must first have the slab cache available for the allocations of the 3193 * Must first have the slab cache available for the allocations of the
3046 * struct kmem_cache_node's. There is special bootstrap code in 3194 * struct kmem_cache_node's. There is special bootstrap code in
3047 * kmem_cache_open for slab_state == DOWN. 3195 * kmem_cache_open for slab_state == DOWN.
3048 */ 3196 */
3049 create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", 3197 kmem_cache_node = (void *)kmem_cache + kmalloc_size;
3050 sizeof(struct kmem_cache_node), GFP_NOWAIT); 3198
3051 kmalloc_caches[0].refcount = -1; 3199 kmem_cache_open(kmem_cache_node, "kmem_cache_node",
3052 caches++; 3200 sizeof(struct kmem_cache_node),
3201 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3053 3202
3054 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3203 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
3055#endif
3056 3204
3057 /* Able to allocate the per node structures */ 3205 /* Able to allocate the per node structures */
3058 slab_state = PARTIAL; 3206 slab_state = PARTIAL;
3059 3207
3060 /* Caches that are not of the two-to-the-power-of size */ 3208 temp_kmem_cache = kmem_cache;
3061 if (KMALLOC_MIN_SIZE <= 32) { 3209 kmem_cache_open(kmem_cache, "kmem_cache", kmem_size,
3062 create_kmalloc_cache(&kmalloc_caches[1], 3210 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3063 "kmalloc-96", 96, GFP_NOWAIT); 3211 kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3064 caches++; 3212 memcpy(kmem_cache, temp_kmem_cache, kmem_size);
3065 }
3066 if (KMALLOC_MIN_SIZE <= 64) {
3067 create_kmalloc_cache(&kmalloc_caches[2],
3068 "kmalloc-192", 192, GFP_NOWAIT);
3069 caches++;
3070 }
3071 3213
3072 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3214 /*
3073 create_kmalloc_cache(&kmalloc_caches[i], 3215 * Allocate kmem_cache_node properly from the kmem_cache slab.
3074 "kmalloc", 1 << i, GFP_NOWAIT); 3216 * kmem_cache_node is separately allocated so no need to
3075 caches++; 3217 * update any list pointers.
3076 } 3218 */
3219 temp_kmem_cache_node = kmem_cache_node;
3220
3221 kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3222 memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
3223
3224 kmem_cache_bootstrap_fixup(kmem_cache_node);
3225
3226 caches++;
3227 kmem_cache_bootstrap_fixup(kmem_cache);
3228 caches++;
3229 /* Free temporary boot structure */
3230 free_pages((unsigned long)temp_kmem_cache, order);
3077 3231
3232 /* Now we can use the kmem_cache to allocate kmalloc slabs */
3078 3233
3079 /* 3234 /*
3080 * Patch up the size_index table if we have strange large alignment 3235 * Patch up the size_index table if we have strange large alignment
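The kmem_cache_init() rework above resolves a chicken-and-egg problem: the descriptors for kmem_cache and kmem_cache_node are first carved out of raw pages, used to open the caches, and then replaced by copies allocated from the now-working kmem_cache before the temporary pages are freed. A rough userspace analogy of that sequence follows; struct allocator and alloc_from are invented names, and plain malloc stands in for the page allocator.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct allocator {
	const char *name;
	size_t object_size;
};

/* Stand-in for allocating one object from a working allocator. */
static void *alloc_from(const struct allocator *a)
{
	return malloc(a->object_size);
}

int main(void)
{
	/* Step 1: temporary descriptor in raw ("page allocator") memory. */
	struct allocator *tmp = malloc(sizeof(*tmp));
	struct allocator *real;

	if (!tmp)
		return 1;
	tmp->name = "allocator-cache";
	tmp->object_size = sizeof(struct allocator);

	/*
	 * Step 2: let the temporary descriptor allocate the permanent one,
	 * then copy the metadata across (the kernel additionally patches the
	 * page->slab back-pointers in kmem_cache_bootstrap_fixup()).
	 */
	real = alloc_from(tmp);
	if (!real)
		return 1;
	memcpy(real, tmp, sizeof(*real));

	/* Step 3: release the bootstrap memory; only "real" remains. */
	free(tmp);

	printf("%s manages %zu-byte objects\n", real->name, real->object_size);
	free(real);
	return 0;
}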
@@ -3114,26 +3269,60 @@ void __init kmem_cache_init(void)
3114 size_index[size_index_elem(i)] = 8; 3269 size_index[size_index_elem(i)] = 8;
3115 } 3270 }
3116 3271
3272 /* Caches that are not of the two-to-the-power-of size */
3273 if (KMALLOC_MIN_SIZE <= 32) {
3274 kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0);
3275 caches++;
3276 }
3277
3278 if (KMALLOC_MIN_SIZE <= 64) {
3279 kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0);
3280 caches++;
3281 }
3282
3283 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
3284 kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0);
3285 caches++;
3286 }
3287
3117 slab_state = UP; 3288 slab_state = UP;
3118 3289
3119 /* Provide the correct kmalloc names now that the caches are up */ 3290 /* Provide the correct kmalloc names now that the caches are up */
3291 if (KMALLOC_MIN_SIZE <= 32) {
3292 kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT);
3293 BUG_ON(!kmalloc_caches[1]->name);
3294 }
3295
3296 if (KMALLOC_MIN_SIZE <= 64) {
3297 kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT);
3298 BUG_ON(!kmalloc_caches[2]->name);
3299 }
3300
3120 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { 3301 for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
3121 char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); 3302 char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
3122 3303
3123 BUG_ON(!s); 3304 BUG_ON(!s);
3124 kmalloc_caches[i].name = s; 3305 kmalloc_caches[i]->name = s;
3125 } 3306 }
3126 3307
3127#ifdef CONFIG_SMP 3308#ifdef CONFIG_SMP
3128 register_cpu_notifier(&slab_notifier); 3309 register_cpu_notifier(&slab_notifier);
3129#endif 3310#endif
3130#ifdef CONFIG_NUMA
3131 kmem_size = offsetof(struct kmem_cache, node) +
3132 nr_node_ids * sizeof(struct kmem_cache_node *);
3133#else
3134 kmem_size = sizeof(struct kmem_cache);
3135#endif
3136 3311
3312#ifdef CONFIG_ZONE_DMA
3313 for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
3314 struct kmem_cache *s = kmalloc_caches[i];
3315
3316 if (s && s->size) {
3317 char *name = kasprintf(GFP_NOWAIT,
3318 "dma-kmalloc-%d", s->objsize);
3319
3320 BUG_ON(!name);
3321 kmalloc_dma_caches[i] = create_kmalloc_cache(name,
3322 s->objsize, SLAB_CACHE_DMA);
3323 }
3324 }
3325#endif
3137 printk(KERN_INFO 3326 printk(KERN_INFO
3138 "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 3327 "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
3139 " CPUs=%d, Nodes=%d\n", 3328 " CPUs=%d, Nodes=%d\n",
@@ -3211,6 +3400,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3211 size_t align, unsigned long flags, void (*ctor)(void *)) 3400 size_t align, unsigned long flags, void (*ctor)(void *))
3212{ 3401{
3213 struct kmem_cache *s; 3402 struct kmem_cache *s;
3403 char *n;
3214 3404
3215 if (WARN_ON(!name)) 3405 if (WARN_ON(!name))
3216 return NULL; 3406 return NULL;
@@ -3234,24 +3424,30 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3234 return s; 3424 return s;
3235 } 3425 }
3236 3426
3427 n = kstrdup(name, GFP_KERNEL);
3428 if (!n)
3429 goto err;
3430
3237 s = kmalloc(kmem_size, GFP_KERNEL); 3431 s = kmalloc(kmem_size, GFP_KERNEL);
3238 if (s) { 3432 if (s) {
3239 if (kmem_cache_open(s, GFP_KERNEL, name, 3433 if (kmem_cache_open(s, n,
3240 size, align, flags, ctor)) { 3434 size, align, flags, ctor)) {
3241 list_add(&s->list, &slab_caches); 3435 list_add(&s->list, &slab_caches);
3242 if (sysfs_slab_add(s)) { 3436 if (sysfs_slab_add(s)) {
3243 list_del(&s->list); 3437 list_del(&s->list);
3438 kfree(n);
3244 kfree(s); 3439 kfree(s);
3245 goto err; 3440 goto err;
3246 } 3441 }
3247 up_write(&slub_lock); 3442 up_write(&slub_lock);
3248 return s; 3443 return s;
3249 } 3444 }
3445 kfree(n);
3250 kfree(s); 3446 kfree(s);
3251 } 3447 }
3448err:
3252 up_write(&slub_lock); 3449 up_write(&slub_lock);
3253 3450
3254err:
3255 if (flags & SLAB_PANIC) 3451 if (flags & SLAB_PANIC)
3256 panic("Cannot create slabcache %s\n", name); 3452 panic("Cannot create slabcache %s\n", name);
3257 else 3453 else
@@ -3312,12 +3508,13 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3312 3508
3313 ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); 3509 ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller);
3314 3510
3315 /* Honor the call site pointer we recieved. */ 3511 /* Honor the call site pointer we received. */
3316 trace_kmalloc(caller, ret, size, s->size, gfpflags); 3512 trace_kmalloc(caller, ret, size, s->size, gfpflags);
3317 3513
3318 return ret; 3514 return ret;
3319} 3515}
3320 3516
3517#ifdef CONFIG_NUMA
3321void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 3518void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3322 int node, unsigned long caller) 3519 int node, unsigned long caller)
3323{ 3520{
@@ -3341,13 +3538,14 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3341 3538
3342 ret = slab_alloc(s, gfpflags, node, caller); 3539 ret = slab_alloc(s, gfpflags, node, caller);
3343 3540
3344 /* Honor the call site pointer we recieved. */ 3541 /* Honor the call site pointer we received. */
3345 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); 3542 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
3346 3543
3347 return ret; 3544 return ret;
3348} 3545}
3546#endif
3349 3547
3350#ifdef CONFIG_SLUB_DEBUG 3548#ifdef CONFIG_SYSFS
3351static int count_inuse(struct page *page) 3549static int count_inuse(struct page *page)
3352{ 3550{
3353 return page->inuse; 3551 return page->inuse;
@@ -3357,7 +3555,9 @@ static int count_total(struct page *page)
3357{ 3555{
3358 return page->objects; 3556 return page->objects;
3359} 3557}
3558#endif
3360 3559
3560#ifdef CONFIG_SLUB_DEBUG
3361static int validate_slab(struct kmem_cache *s, struct page *page, 3561static int validate_slab(struct kmem_cache *s, struct page *page,
3362 unsigned long *map) 3562 unsigned long *map)
3363{ 3563{
@@ -3371,15 +3571,16 @@ static int validate_slab(struct kmem_cache *s, struct page *page,
3371 /* Now we know that a valid freelist exists */ 3571 /* Now we know that a valid freelist exists */
3372 bitmap_zero(map, page->objects); 3572 bitmap_zero(map, page->objects);
3373 3573
3374 for_each_free_object(p, s, page->freelist) { 3574 get_map(s, page, map);
3375 set_bit(slab_index(p, s, addr), map); 3575 for_each_object(p, s, addr, page->objects) {
3376 if (!check_object(s, page, p, 0)) 3576 if (test_bit(slab_index(p, s, addr), map))
3377 return 0; 3577 if (!check_object(s, page, p, SLUB_RED_INACTIVE))
3578 return 0;
3378 } 3579 }
3379 3580
3380 for_each_object(p, s, addr, page->objects) 3581 for_each_object(p, s, addr, page->objects)
3381 if (!test_bit(slab_index(p, s, addr), map)) 3582 if (!test_bit(slab_index(p, s, addr), map))
3382 if (!check_object(s, page, p, 1)) 3583 if (!check_object(s, page, p, SLUB_RED_ACTIVE))
3383 return 0; 3584 return 0;
3384 return 1; 3585 return 1;
3385} 3586}
@@ -3448,65 +3649,6 @@ static long validate_slab_cache(struct kmem_cache *s)
3448 kfree(map); 3649 kfree(map);
3449 return count; 3650 return count;
3450} 3651}
3451
3452#ifdef SLUB_RESILIENCY_TEST
3453static void resiliency_test(void)
3454{
3455 u8 *p;
3456
3457 printk(KERN_ERR "SLUB resiliency testing\n");
3458 printk(KERN_ERR "-----------------------\n");
3459 printk(KERN_ERR "A. Corruption after allocation\n");
3460
3461 p = kzalloc(16, GFP_KERNEL);
3462 p[16] = 0x12;
3463 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
3464 " 0x12->0x%p\n\n", p + 16);
3465
3466 validate_slab_cache(kmalloc_caches + 4);
3467
3468 /* Hmmm... The next two are dangerous */
3469 p = kzalloc(32, GFP_KERNEL);
3470 p[32 + sizeof(void *)] = 0x34;
3471 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
3472 " 0x34 -> -0x%p\n", p);
3473 printk(KERN_ERR
3474 "If allocated object is overwritten then not detectable\n\n");
3475
3476 validate_slab_cache(kmalloc_caches + 5);
3477 p = kzalloc(64, GFP_KERNEL);
3478 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
3479 *p = 0x56;
3480 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
3481 p);
3482 printk(KERN_ERR
3483 "If allocated object is overwritten then not detectable\n\n");
3484 validate_slab_cache(kmalloc_caches + 6);
3485
3486 printk(KERN_ERR "\nB. Corruption after free\n");
3487 p = kzalloc(128, GFP_KERNEL);
3488 kfree(p);
3489 *p = 0x78;
3490 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
3491 validate_slab_cache(kmalloc_caches + 7);
3492
3493 p = kzalloc(256, GFP_KERNEL);
3494 kfree(p);
3495 p[50] = 0x9a;
3496 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
3497 p);
3498 validate_slab_cache(kmalloc_caches + 8);
3499
3500 p = kzalloc(512, GFP_KERNEL);
3501 kfree(p);
3502 p[512] = 0xab;
3503 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
3504 validate_slab_cache(kmalloc_caches + 9);
3505}
3506#else
3507static void resiliency_test(void) {};
3508#endif
3509
3510/* 3652/*
3511 * Generate lists of code addresses where slabcache objects are allocated 3653 * Generate lists of code addresses where slabcache objects are allocated
3512 * and freed. 3654 * and freed.
@@ -3635,14 +3777,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
3635 3777
3636static void process_slab(struct loc_track *t, struct kmem_cache *s, 3778static void process_slab(struct loc_track *t, struct kmem_cache *s,
3637 struct page *page, enum track_item alloc, 3779 struct page *page, enum track_item alloc,
3638 long *map) 3780 unsigned long *map)
3639{ 3781{
3640 void *addr = page_address(page); 3782 void *addr = page_address(page);
3641 void *p; 3783 void *p;
3642 3784
3643 bitmap_zero(map, page->objects); 3785 bitmap_zero(map, page->objects);
3644 for_each_free_object(p, s, page->freelist) 3786 get_map(s, page, map);
3645 set_bit(slab_index(p, s, addr), map);
3646 3787
3647 for_each_object(p, s, addr, page->objects) 3788 for_each_object(p, s, addr, page->objects)
3648 if (!test_bit(slab_index(p, s, addr), map)) 3789 if (!test_bit(slab_index(p, s, addr), map))
@@ -3691,7 +3832,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
3691 len += sprintf(buf + len, "%7ld ", l->count); 3832 len += sprintf(buf + len, "%7ld ", l->count);
3692 3833
3693 if (l->addr) 3834 if (l->addr)
3694 len += sprint_symbol(buf + len, (unsigned long)l->addr); 3835 len += sprintf(buf + len, "%pS", (void *)l->addr);
3695 else 3836 else
3696 len += sprintf(buf + len, "<not-available>"); 3837 len += sprintf(buf + len, "<not-available>");
3697 3838
@@ -3735,7 +3876,71 @@ static int list_locations(struct kmem_cache *s, char *buf,
3735 len += sprintf(buf, "No data\n"); 3876 len += sprintf(buf, "No data\n");
3736 return len; 3877 return len;
3737} 3878}
3879#endif
3880
3881#ifdef SLUB_RESILIENCY_TEST
3882static void resiliency_test(void)
3883{
3884 u8 *p;
3885
3886 BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10);
3887
3888 printk(KERN_ERR "SLUB resiliency testing\n");
3889 printk(KERN_ERR "-----------------------\n");
3890 printk(KERN_ERR "A. Corruption after allocation\n");
3891
3892 p = kzalloc(16, GFP_KERNEL);
3893 p[16] = 0x12;
3894 printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
3895 " 0x12->0x%p\n\n", p + 16);
3896
3897 validate_slab_cache(kmalloc_caches[4]);
3898
3899 /* Hmmm... The next two are dangerous */
3900 p = kzalloc(32, GFP_KERNEL);
3901 p[32 + sizeof(void *)] = 0x34;
3902 printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
3903 " 0x34 -> -0x%p\n", p);
3904 printk(KERN_ERR
3905 "If allocated object is overwritten then not detectable\n\n");
3906
3907 validate_slab_cache(kmalloc_caches[5]);
3908 p = kzalloc(64, GFP_KERNEL);
3909 p += 64 + (get_cycles() & 0xff) * sizeof(void *);
3910 *p = 0x56;
3911 printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
3912 p);
3913 printk(KERN_ERR
3914 "If allocated object is overwritten then not detectable\n\n");
3915 validate_slab_cache(kmalloc_caches[6]);
3916
3917 printk(KERN_ERR "\nB. Corruption after free\n");
3918 p = kzalloc(128, GFP_KERNEL);
3919 kfree(p);
3920 *p = 0x78;
3921 printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
3922 validate_slab_cache(kmalloc_caches[7]);
3923
3924 p = kzalloc(256, GFP_KERNEL);
3925 kfree(p);
3926 p[50] = 0x9a;
3927 printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
3928 p);
3929 validate_slab_cache(kmalloc_caches[8]);
3930
3931 p = kzalloc(512, GFP_KERNEL);
3932 kfree(p);
3933 p[512] = 0xab;
3934 printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
3935 validate_slab_cache(kmalloc_caches[9]);
3936}
3937#else
3938#ifdef CONFIG_SYSFS
3939static void resiliency_test(void) {};
3940#endif
3941#endif
3738 3942
3943#ifdef CONFIG_SYSFS
3739enum slab_stat_type { 3944enum slab_stat_type {
3740 SL_ALL, /* All slabs */ 3945 SL_ALL, /* All slabs */
3741 SL_PARTIAL, /* Only partially allocated slabs */ 3946 SL_PARTIAL, /* Only partially allocated slabs */
@@ -3788,6 +3993,8 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3788 } 3993 }
3789 } 3994 }
3790 3995
3996 lock_memory_hotplug();
3997#ifdef CONFIG_SLUB_DEBUG
3791 if (flags & SO_ALL) { 3998 if (flags & SO_ALL) {
3792 for_each_node_state(node, N_NORMAL_MEMORY) { 3999 for_each_node_state(node, N_NORMAL_MEMORY) {
3793 struct kmem_cache_node *n = get_node(s, node); 4000 struct kmem_cache_node *n = get_node(s, node);
@@ -3804,7 +4011,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3804 nodes[node] += x; 4011 nodes[node] += x;
3805 } 4012 }
3806 4013
3807 } else if (flags & SO_PARTIAL) { 4014 } else
4015#endif
4016 if (flags & SO_PARTIAL) {
3808 for_each_node_state(node, N_NORMAL_MEMORY) { 4017 for_each_node_state(node, N_NORMAL_MEMORY) {
3809 struct kmem_cache_node *n = get_node(s, node); 4018 struct kmem_cache_node *n = get_node(s, node);
3810 4019
@@ -3825,10 +4034,12 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
3825 x += sprintf(buf + x, " N%d=%lu", 4034 x += sprintf(buf + x, " N%d=%lu",
3826 node, nodes[node]); 4035 node, nodes[node]);
3827#endif 4036#endif
4037 unlock_memory_hotplug();
3828 kfree(nodes); 4038 kfree(nodes);
3829 return x + sprintf(buf + x, "\n"); 4039 return x + sprintf(buf + x, "\n");
3830} 4040}
3831 4041
4042#ifdef CONFIG_SLUB_DEBUG
3832static int any_slab_objects(struct kmem_cache *s) 4043static int any_slab_objects(struct kmem_cache *s)
3833{ 4044{
3834 int node; 4045 int node;
@@ -3844,6 +4055,7 @@ static int any_slab_objects(struct kmem_cache *s)
3844 } 4055 }
3845 return 0; 4056 return 0;
3846} 4057}
4058#endif
3847 4059
3848#define to_slab_attr(n) container_of(n, struct slab_attribute, attr) 4060#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
3849#define to_slab(n) container_of(n, struct kmem_cache, kobj); 4061#define to_slab(n) container_of(n, struct kmem_cache, kobj);
@@ -3930,12 +4142,9 @@ SLAB_ATTR(min_partial);
3930 4142
3931static ssize_t ctor_show(struct kmem_cache *s, char *buf) 4143static ssize_t ctor_show(struct kmem_cache *s, char *buf)
3932{ 4144{
3933 if (s->ctor) { 4145 if (!s->ctor)
3934 int n = sprint_symbol(buf, (unsigned long)s->ctor); 4146 return 0;
3935 4147 return sprintf(buf, "%pS\n", s->ctor);
3936 return n + sprintf(buf + n, "\n");
3937 }
3938 return 0;
3939} 4148}
3940SLAB_ATTR_RO(ctor); 4149SLAB_ATTR_RO(ctor);
3941 4150
@@ -3945,12 +4154,6 @@ static ssize_t aliases_show(struct kmem_cache *s, char *buf)
3945} 4154}
3946SLAB_ATTR_RO(aliases); 4155SLAB_ATTR_RO(aliases);
3947 4156
3948static ssize_t slabs_show(struct kmem_cache *s, char *buf)
3949{
3950 return show_slab_objects(s, buf, SO_ALL);
3951}
3952SLAB_ATTR_RO(slabs);
3953
3954static ssize_t partial_show(struct kmem_cache *s, char *buf) 4157static ssize_t partial_show(struct kmem_cache *s, char *buf)
3955{ 4158{
3956 return show_slab_objects(s, buf, SO_PARTIAL); 4159 return show_slab_objects(s, buf, SO_PARTIAL);
@@ -3975,93 +4178,89 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
3975} 4178}
3976SLAB_ATTR_RO(objects_partial); 4179SLAB_ATTR_RO(objects_partial);
3977 4180
3978static ssize_t total_objects_show(struct kmem_cache *s, char *buf) 4181static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
3979{
3980 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
3981}
3982SLAB_ATTR_RO(total_objects);
3983
3984static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
3985{ 4182{
3986 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE)); 4183 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
3987} 4184}
3988 4185
3989static ssize_t sanity_checks_store(struct kmem_cache *s, 4186static ssize_t reclaim_account_store(struct kmem_cache *s,
3990 const char *buf, size_t length) 4187 const char *buf, size_t length)
3991{ 4188{
3992 s->flags &= ~SLAB_DEBUG_FREE; 4189 s->flags &= ~SLAB_RECLAIM_ACCOUNT;
3993 if (buf[0] == '1') 4190 if (buf[0] == '1')
3994 s->flags |= SLAB_DEBUG_FREE; 4191 s->flags |= SLAB_RECLAIM_ACCOUNT;
3995 return length; 4192 return length;
3996} 4193}
3997SLAB_ATTR(sanity_checks); 4194SLAB_ATTR(reclaim_account);
3998 4195
3999static ssize_t trace_show(struct kmem_cache *s, char *buf) 4196static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
4000{ 4197{
4001 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE)); 4198 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
4002} 4199}
4200SLAB_ATTR_RO(hwcache_align);
4003 4201
4004static ssize_t trace_store(struct kmem_cache *s, const char *buf, 4202#ifdef CONFIG_ZONE_DMA
4005 size_t length) 4203static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
4006{ 4204{
4007 s->flags &= ~SLAB_TRACE; 4205 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
4008 if (buf[0] == '1')
4009 s->flags |= SLAB_TRACE;
4010 return length;
4011} 4206}
4012SLAB_ATTR(trace); 4207SLAB_ATTR_RO(cache_dma);
4208#endif
4013 4209
4014#ifdef CONFIG_FAILSLAB 4210static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
4015static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4016{ 4211{
4017 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); 4212 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
4018} 4213}
4214SLAB_ATTR_RO(destroy_by_rcu);
4019 4215
4020static ssize_t failslab_store(struct kmem_cache *s, const char *buf, 4216static ssize_t reserved_show(struct kmem_cache *s, char *buf)
4021 size_t length)
4022{ 4217{
4023 s->flags &= ~SLAB_FAILSLAB; 4218 return sprintf(buf, "%d\n", s->reserved);
4024 if (buf[0] == '1')
4025 s->flags |= SLAB_FAILSLAB;
4026 return length;
4027} 4219}
4028SLAB_ATTR(failslab); 4220SLAB_ATTR_RO(reserved);
4029#endif
4030 4221
4031static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4222#ifdef CONFIG_SLUB_DEBUG
4223static ssize_t slabs_show(struct kmem_cache *s, char *buf)
4032{ 4224{
4033 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4225 return show_slab_objects(s, buf, SO_ALL);
4034} 4226}
4227SLAB_ATTR_RO(slabs);
4035 4228
4036static ssize_t reclaim_account_store(struct kmem_cache *s, 4229static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
4037 const char *buf, size_t length)
4038{ 4230{
4039 s->flags &= ~SLAB_RECLAIM_ACCOUNT; 4231 return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
4040 if (buf[0] == '1')
4041 s->flags |= SLAB_RECLAIM_ACCOUNT;
4042 return length;
4043} 4232}
4044SLAB_ATTR(reclaim_account); 4233SLAB_ATTR_RO(total_objects);
4045 4234
4046static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) 4235static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
4047{ 4236{
4048 return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); 4237 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
4049} 4238}
4050SLAB_ATTR_RO(hwcache_align);
4051 4239
4052#ifdef CONFIG_ZONE_DMA 4240static ssize_t sanity_checks_store(struct kmem_cache *s,
4053static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) 4241 const char *buf, size_t length)
4054{ 4242{
4055 return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); 4243 s->flags &= ~SLAB_DEBUG_FREE;
4244 if (buf[0] == '1')
4245 s->flags |= SLAB_DEBUG_FREE;
4246 return length;
4056} 4247}
4057SLAB_ATTR_RO(cache_dma); 4248SLAB_ATTR(sanity_checks);
4058#endif
4059 4249
4060static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) 4250static ssize_t trace_show(struct kmem_cache *s, char *buf)
4061{ 4251{
4062 return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); 4252 return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
4063} 4253}
4064SLAB_ATTR_RO(destroy_by_rcu); 4254
4255static ssize_t trace_store(struct kmem_cache *s, const char *buf,
4256 size_t length)
4257{
4258 s->flags &= ~SLAB_TRACE;
4259 if (buf[0] == '1')
4260 s->flags |= SLAB_TRACE;
4261 return length;
4262}
4263SLAB_ATTR(trace);
4065 4264
4066static ssize_t red_zone_show(struct kmem_cache *s, char *buf) 4265static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
4067{ 4266{
@@ -4139,6 +4338,40 @@ static ssize_t validate_store(struct kmem_cache *s,
4139} 4338}
4140SLAB_ATTR(validate); 4339SLAB_ATTR(validate);
4141 4340
4341static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
4342{
4343 if (!(s->flags & SLAB_STORE_USER))
4344 return -ENOSYS;
4345 return list_locations(s, buf, TRACK_ALLOC);
4346}
4347SLAB_ATTR_RO(alloc_calls);
4348
4349static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
4350{
4351 if (!(s->flags & SLAB_STORE_USER))
4352 return -ENOSYS;
4353 return list_locations(s, buf, TRACK_FREE);
4354}
4355SLAB_ATTR_RO(free_calls);
4356#endif /* CONFIG_SLUB_DEBUG */
4357
4358#ifdef CONFIG_FAILSLAB
4359static ssize_t failslab_show(struct kmem_cache *s, char *buf)
4360{
4361 return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
4362}
4363
4364static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
4365 size_t length)
4366{
4367 s->flags &= ~SLAB_FAILSLAB;
4368 if (buf[0] == '1')
4369 s->flags |= SLAB_FAILSLAB;
4370 return length;
4371}
4372SLAB_ATTR(failslab);
4373#endif
4374
4142static ssize_t shrink_show(struct kmem_cache *s, char *buf) 4375static ssize_t shrink_show(struct kmem_cache *s, char *buf)
4143{ 4376{
4144 return 0; 4377 return 0;
@@ -4158,22 +4391,6 @@ static ssize_t shrink_store(struct kmem_cache *s,
4158} 4391}
4159SLAB_ATTR(shrink); 4392SLAB_ATTR(shrink);
4160 4393
4161static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
4162{
4163 if (!(s->flags & SLAB_STORE_USER))
4164 return -ENOSYS;
4165 return list_locations(s, buf, TRACK_ALLOC);
4166}
4167SLAB_ATTR_RO(alloc_calls);
4168
4169static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
4170{
4171 if (!(s->flags & SLAB_STORE_USER))
4172 return -ENOSYS;
4173 return list_locations(s, buf, TRACK_FREE);
4174}
4175SLAB_ATTR_RO(free_calls);
4176
4177#ifdef CONFIG_NUMA 4394#ifdef CONFIG_NUMA
4178static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) 4395static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
4179{ 4396{
@@ -4279,25 +4496,28 @@ static struct attribute *slab_attrs[] = {
4279 &min_partial_attr.attr, 4496 &min_partial_attr.attr,
4280 &objects_attr.attr, 4497 &objects_attr.attr,
4281 &objects_partial_attr.attr, 4498 &objects_partial_attr.attr,
4282 &total_objects_attr.attr,
4283 &slabs_attr.attr,
4284 &partial_attr.attr, 4499 &partial_attr.attr,
4285 &cpu_slabs_attr.attr, 4500 &cpu_slabs_attr.attr,
4286 &ctor_attr.attr, 4501 &ctor_attr.attr,
4287 &aliases_attr.attr, 4502 &aliases_attr.attr,
4288 &align_attr.attr, 4503 &align_attr.attr,
4289 &sanity_checks_attr.attr,
4290 &trace_attr.attr,
4291 &hwcache_align_attr.attr, 4504 &hwcache_align_attr.attr,
4292 &reclaim_account_attr.attr, 4505 &reclaim_account_attr.attr,
4293 &destroy_by_rcu_attr.attr, 4506 &destroy_by_rcu_attr.attr,
4507 &shrink_attr.attr,
4508 &reserved_attr.attr,
4509#ifdef CONFIG_SLUB_DEBUG
4510 &total_objects_attr.attr,
4511 &slabs_attr.attr,
4512 &sanity_checks_attr.attr,
4513 &trace_attr.attr,
4294 &red_zone_attr.attr, 4514 &red_zone_attr.attr,
4295 &poison_attr.attr, 4515 &poison_attr.attr,
4296 &store_user_attr.attr, 4516 &store_user_attr.attr,
4297 &validate_attr.attr, 4517 &validate_attr.attr,
4298 &shrink_attr.attr,
4299 &alloc_calls_attr.attr, 4518 &alloc_calls_attr.attr,
4300 &free_calls_attr.attr, 4519 &free_calls_attr.attr,
4520#endif
4301#ifdef CONFIG_ZONE_DMA 4521#ifdef CONFIG_ZONE_DMA
4302 &cache_dma_attr.attr, 4522 &cache_dma_attr.attr,
4303#endif 4523#endif
@@ -4377,6 +4597,7 @@ static void kmem_cache_release(struct kobject *kobj)
4377{ 4597{
4378 struct kmem_cache *s = to_slab(kobj); 4598 struct kmem_cache *s = to_slab(kobj);
4379 4599
4600 kfree(s->name);
4380 kfree(s); 4601 kfree(s);
4381} 4602}
4382 4603
@@ -4579,7 +4800,7 @@ static int __init slab_sysfs_init(void)
4579} 4800}
4580 4801
4581__initcall(slab_sysfs_init); 4802__initcall(slab_sysfs_init);
4582#endif 4803#endif /* CONFIG_SYSFS */
4583 4804
4584/* 4805/*
4585 * The /proc/slabinfo ABI 4806 * The /proc/slabinfo ABI
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index aa33fd67fa41..64b984091edb 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -9,7 +9,7 @@
9 * 9 *
10 * However, virtual mappings need a page table and TLBs. Many Linux 10 * However, virtual mappings need a page table and TLBs. Many Linux
11 * architectures already map their physical space using 1-1 mappings 11 * architectures already map their physical space using 1-1 mappings
12 * via TLBs. For those arches the virtual memmory map is essentially 12 * via TLBs. For those arches the virtual memory map is essentially
13 * for free if we use the same page size as the 1-1 mappings. In that 13 * for free if we use the same page size as the 1-1 mappings. In that
14 * case the overhead consists of a few additional pages that are 14 * case the overhead consists of a few additional pages that are
15 * allocated to create a view of memory for vmemmap. 15 * allocated to create a view of memory for vmemmap.
@@ -220,18 +220,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
220 220
221 if (vmemmap_buf_start) { 221 if (vmemmap_buf_start) {
222 /* need to free left buf */ 222 /* need to free left buf */
223#ifdef CONFIG_NO_BOOTMEM
224 free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end));
225 if (vmemmap_buf_start < vmemmap_buf) {
226 char name[15];
227
228 snprintf(name, sizeof(name), "MEMMAP %d", nodeid);
229 reserve_early_without_check(__pa(vmemmap_buf_start),
230 __pa(vmemmap_buf), name);
231 }
232#else
233 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); 223 free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
234#endif
235 vmemmap_buf = NULL; 224 vmemmap_buf = NULL;
236 vmemmap_buf_end = NULL; 225 vmemmap_buf_end = NULL;
237 } 226 }
diff --git a/mm/sparse.c b/mm/sparse.c
index 95ac219af379..aa64b12831a2 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -500,7 +500,7 @@ void __init sparse_init(void)
500 * so alloc 2M (with 2M align) and 24 bytes in turn will 500 * so alloc 2M (with 2M align) and 24 bytes in turn will
501 * make next 2M slip to one more 2M later. 501 * make next 2M slip to one more 2M later.
502 * then in big system, the memory will have a lot of holes... 502 * then in big system, the memory will have a lot of holes...
503 * here try to allocate 2M pages continously. 503 * here try to allocate 2M pages continuously.
504 * 504 *
505 * powerpc need to call sparse_init_one_section right after each 505 * powerpc need to call sparse_init_one_section right after each
506 * sparse_early_mem_map_alloc, so allocate usemap_map at first. 506 * sparse_early_mem_map_alloc, so allocate usemap_map at first.
@@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
671static void free_map_bootmem(struct page *page, unsigned long nr_pages) 671static void free_map_bootmem(struct page *page, unsigned long nr_pages)
672{ 672{
673 unsigned long maps_section_nr, removing_section_nr, i; 673 unsigned long maps_section_nr, removing_section_nr, i;
674 int magic; 674 unsigned long magic;
675 675
676 for (i = 0; i < nr_pages; i++, page++) { 676 for (i = 0; i < nr_pages; i++, page++) {
677 magic = atomic_read(&page->_mapcount); 677 magic = (unsigned long) page->lru.next;
678 678
679 BUG_ON(magic == NODE_INFO); 679 BUG_ON(magic == NODE_INFO);
680 680
diff --git a/mm/swap.c b/mm/swap.c
index 3ce7bc373a52..3a442f18b0b3 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -39,6 +39,7 @@ int page_cluster;
39 39
40static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); 40static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
41static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 41static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
42static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
42 43
43/* 44/*
44 * This path almost never happens for VM activity - pages are normally 45 * This path almost never happens for VM activity - pages are normally
@@ -56,17 +57,97 @@ static void __page_cache_release(struct page *page)
56 del_page_from_lru(zone, page); 57 del_page_from_lru(zone, page);
57 spin_unlock_irqrestore(&zone->lru_lock, flags); 58 spin_unlock_irqrestore(&zone->lru_lock, flags);
58 } 59 }
60}
61
62static void __put_single_page(struct page *page)
63{
64 __page_cache_release(page);
59 free_hot_cold_page(page, 0); 65 free_hot_cold_page(page, 0);
60} 66}
61 67
62static void put_compound_page(struct page *page) 68static void __put_compound_page(struct page *page)
63{ 69{
64 page = compound_head(page); 70 compound_page_dtor *dtor;
65 if (put_page_testzero(page)) {
66 compound_page_dtor *dtor;
67 71
68 dtor = get_compound_page_dtor(page); 72 __page_cache_release(page);
69 (*dtor)(page); 73 dtor = get_compound_page_dtor(page);
74 (*dtor)(page);
75}
76
77static void put_compound_page(struct page *page)
78{
79 if (unlikely(PageTail(page))) {
80 /* __split_huge_page_refcount can run under us */
81 struct page *page_head = page->first_page;
82 smp_rmb();
83 /*
84 * If PageTail is still set after smp_rmb() we can be sure
85 * that the page->first_page we read wasn't a dangling pointer.
86 * See __split_huge_page_refcount() smp_wmb().
87 */
88 if (likely(PageTail(page) && get_page_unless_zero(page_head))) {
89 unsigned long flags;
90 /*
91 * Verify that our page_head wasn't converted
92 * to a regular page before we got a
93 * reference on it.
94 */
95 if (unlikely(!PageHead(page_head))) {
96 /* PageHead is cleared after PageTail */
97 smp_rmb();
98 VM_BUG_ON(PageTail(page));
99 goto out_put_head;
100 }
101 /*
102 * Only run compound_lock on a valid PageHead,
103 * after having it pinned with
104 * get_page_unless_zero() above.
105 */
106 smp_mb();
107 /* page_head wasn't a dangling pointer */
108 flags = compound_lock_irqsave(page_head);
109 if (unlikely(!PageTail(page))) {
110 /* __split_huge_page_refcount run before us */
111 compound_unlock_irqrestore(page_head, flags);
112 VM_BUG_ON(PageHead(page_head));
113 out_put_head:
114 if (put_page_testzero(page_head))
115 __put_single_page(page_head);
116 out_put_single:
117 if (put_page_testzero(page))
118 __put_single_page(page);
119 return;
120 }
121 VM_BUG_ON(page_head != page->first_page);
122 /*
123 * We can release the refcount taken by
124 * get_page_unless_zero now that
125 * split_huge_page_refcount is blocked on the
126 * compound_lock.
127 */
128 if (put_page_testzero(page_head))
129 VM_BUG_ON(1);
130 /* __split_huge_page_refcount will wait now */
131 VM_BUG_ON(atomic_read(&page->_count) <= 0);
132 atomic_dec(&page->_count);
133 VM_BUG_ON(atomic_read(&page_head->_count) <= 0);
134 compound_unlock_irqrestore(page_head, flags);
135 if (put_page_testzero(page_head)) {
136 if (PageHead(page_head))
137 __put_compound_page(page_head);
138 else
139 __put_single_page(page_head);
140 }
141 } else {
142 /* page_head is a dangling pointer */
143 VM_BUG_ON(PageTail(page));
144 goto out_put_single;
145 }
146 } else if (put_page_testzero(page)) {
147 if (PageHead(page))
148 __put_compound_page(page);
149 else
150 __put_single_page(page);
70 } 151 }
71} 152}
72 153
@@ -75,7 +156,7 @@ void put_page(struct page *page)
75 if (unlikely(PageCompound(page))) 156 if (unlikely(PageCompound(page)))
76 put_compound_page(page); 157 put_compound_page(page);
77 else if (put_page_testzero(page)) 158 else if (put_page_testzero(page))
78 __page_cache_release(page); 159 __put_single_page(page);
79} 160}
80EXPORT_SYMBOL(put_page); 161EXPORT_SYMBOL(put_page);
81 162
@@ -98,15 +179,13 @@ void put_pages_list(struct list_head *pages)
98} 179}
99EXPORT_SYMBOL(put_pages_list); 180EXPORT_SYMBOL(put_pages_list);
100 181
101/* 182static void pagevec_lru_move_fn(struct pagevec *pvec,
102 * pagevec_move_tail() must be called with IRQ disabled. 183 void (*move_fn)(struct page *page, void *arg),
103 * Otherwise this may cause nasty races. 184 void *arg)
104 */
105static void pagevec_move_tail(struct pagevec *pvec)
106{ 185{
107 int i; 186 int i;
108 int pgmoved = 0;
109 struct zone *zone = NULL; 187 struct zone *zone = NULL;
188 unsigned long flags = 0;
110 189
111 for (i = 0; i < pagevec_count(pvec); i++) { 190 for (i = 0; i < pagevec_count(pvec); i++) {
112 struct page *page = pvec->pages[i]; 191 struct page *page = pvec->pages[i];
@@ -114,29 +193,50 @@ static void pagevec_move_tail(struct pagevec *pvec)
114 193
115 if (pagezone != zone) { 194 if (pagezone != zone) {
116 if (zone) 195 if (zone)
117 spin_unlock(&zone->lru_lock); 196 spin_unlock_irqrestore(&zone->lru_lock, flags);
118 zone = pagezone; 197 zone = pagezone;
119 spin_lock(&zone->lru_lock); 198 spin_lock_irqsave(&zone->lru_lock, flags);
120 }
121 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
122 int lru = page_lru_base_type(page);
123 list_move_tail(&page->lru, &zone->lru[lru].list);
124 pgmoved++;
125 } 199 }
200
201 (*move_fn)(page, arg);
126 } 202 }
127 if (zone) 203 if (zone)
128 spin_unlock(&zone->lru_lock); 204 spin_unlock_irqrestore(&zone->lru_lock, flags);
129 __count_vm_events(PGROTATED, pgmoved);
130 release_pages(pvec->pages, pvec->nr, pvec->cold); 205 release_pages(pvec->pages, pvec->nr, pvec->cold);
131 pagevec_reinit(pvec); 206 pagevec_reinit(pvec);
132} 207}
133 208
209static void pagevec_move_tail_fn(struct page *page, void *arg)
210{
211 int *pgmoved = arg;
212 struct zone *zone = page_zone(page);
213
214 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
215 enum lru_list lru = page_lru_base_type(page);
216 list_move_tail(&page->lru, &zone->lru[lru].list);
217 mem_cgroup_rotate_reclaimable_page(page);
218 (*pgmoved)++;
219 }
220}
221
222/*
223 * pagevec_move_tail() must be called with IRQ disabled.
224 * Otherwise this may cause nasty races.
225 */
226static void pagevec_move_tail(struct pagevec *pvec)
227{
228 int pgmoved = 0;
229
230 pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
231 __count_vm_events(PGROTATED, pgmoved);
232}
233
134/* 234/*
135 * Writeback is about to end against a page which has been marked for immediate 235 * Writeback is about to end against a page which has been marked for immediate
136 * reclaim. If it still appears to be reclaimable, move it to the tail of the 236 * reclaim. If it still appears to be reclaimable, move it to the tail of the
137 * inactive list. 237 * inactive list.
138 */ 238 */
139void rotate_reclaimable_page(struct page *page) 239void rotate_reclaimable_page(struct page *page)
140{ 240{
141 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && 241 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
142 !PageUnevictable(page) && PageLRU(page)) { 242 !PageUnevictable(page) && PageLRU(page)) {
@@ -172,14 +272,10 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page,
172 memcg_reclaim_stat->recent_rotated[file]++; 272 memcg_reclaim_stat->recent_rotated[file]++;
173} 273}
174 274
175/* 275static void __activate_page(struct page *page, void *arg)
176 * FIXME: speed this up?
177 */
178void activate_page(struct page *page)
179{ 276{
180 struct zone *zone = page_zone(page); 277 struct zone *zone = page_zone(page);
181 278
182 spin_lock_irq(&zone->lru_lock);
183 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { 279 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
184 int file = page_is_file_cache(page); 280 int file = page_is_file_cache(page);
185 int lru = page_lru_base_type(page); 281 int lru = page_lru_base_type(page);
@@ -192,8 +288,45 @@ void activate_page(struct page *page)
192 288
193 update_page_reclaim_stat(zone, page, file, 1); 289 update_page_reclaim_stat(zone, page, file, 1);
194 } 290 }
291}
292
293#ifdef CONFIG_SMP
294static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
295
296static void activate_page_drain(int cpu)
297{
298 struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
299
300 if (pagevec_count(pvec))
301 pagevec_lru_move_fn(pvec, __activate_page, NULL);
302}
303
304void activate_page(struct page *page)
305{
306 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
307 struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
308
309 page_cache_get(page);
310 if (!pagevec_add(pvec, page))
311 pagevec_lru_move_fn(pvec, __activate_page, NULL);
312 put_cpu_var(activate_page_pvecs);
313 }
314}
315
316#else
317static inline void activate_page_drain(int cpu)
318{
319}
320
321void activate_page(struct page *page)
322{
323 struct zone *zone = page_zone(page);
324
325 spin_lock_irq(&zone->lru_lock);
326 __activate_page(page, NULL);
195 spin_unlock_irq(&zone->lru_lock); 327 spin_unlock_irq(&zone->lru_lock);
196} 328}
329#endif
197 330
198/* 331/*
199 * Mark a page as having seen activity. 332 * Mark a page as having seen activity.
@@ -267,6 +400,74 @@ void add_page_to_unevictable_list(struct page *page)
267} 400}
268 401
269/* 402/*
403 * If the page can not be invalidated, it is moved to the
404 * inactive list to speed up its reclaim. It is moved to the
405 * head of the list, rather than the tail, to give the flusher
406 * threads some time to write it out, as this is much more
407 * effective than the single-page writeout from reclaim.
408 *
409 * If the page isn't page_mapped and is dirty/writeback, the page
410 * can be reclaimed asap using PG_reclaim.
411 *
412 * 1. active, mapped page -> none
413 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
414 * 3. inactive, mapped page -> none
415 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
416 * 5. inactive, clean -> inactive, tail
417 * 6. Others -> none
418 *
419 * In case 4, the page is moved to the head of the inactive list because
420 * the VM expects flusher threads to write it out, which is much more
421 * effective than the single-page writeout from reclaim.
422 */
423static void lru_deactivate_fn(struct page *page, void *arg)
424{
425 int lru, file;
426 bool active;
427 struct zone *zone = page_zone(page);
428
429 if (!PageLRU(page))
430 return;
431
432 if (PageUnevictable(page))
433 return;
434
435 /* Some processes are using the page */
436 if (page_mapped(page))
437 return;
438
439 active = PageActive(page);
440
441 file = page_is_file_cache(page);
442 lru = page_lru_base_type(page);
443 del_page_from_lru_list(zone, page, lru + active);
444 ClearPageActive(page);
445 ClearPageReferenced(page);
446 add_page_to_lru_list(zone, page, lru);
447
448 if (PageWriteback(page) || PageDirty(page)) {
449 /*
450 * PG_reclaim can race with end_page_writeback().
451 * That can confuse readahead, but the race window
452 * is _really_ small and it's a non-critical problem.
453 */
454 SetPageReclaim(page);
455 } else {
456 /*
457 * The page's writeback finished while it sat in the pagevec,
458 * so move the page to the tail of the inactive list.
459 */
460 list_move_tail(&page->lru, &zone->lru[lru].list);
461 mem_cgroup_rotate_reclaimable_page(page);
462 __count_vm_event(PGROTATED);
463 }
464
465 if (active)
466 __count_vm_event(PGDEACTIVATE);
467 update_page_reclaim_stat(zone, page, file, 0);
468}
469
470/*
270 * Drain pages out of the cpu's pagevecs. 471 * Drain pages out of the cpu's pagevecs.
271 * Either "cpu" is the current CPU, and preemption has already been 472 * Either "cpu" is the current CPU, and preemption has already been
272 * disabled; or "cpu" is being hot-unplugged, and is already dead. 473 * disabled; or "cpu" is being hot-unplugged, and is already dead.
@@ -292,6 +493,38 @@ static void drain_cpu_pagevecs(int cpu)
292 pagevec_move_tail(pvec); 493 pagevec_move_tail(pvec);
293 local_irq_restore(flags); 494 local_irq_restore(flags);
294 } 495 }
496
497 pvec = &per_cpu(lru_deactivate_pvecs, cpu);
498 if (pagevec_count(pvec))
499 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
500
501 activate_page_drain(cpu);
502}
503
504/**
505 * deactivate_page - forcefully deactivate a page
506 * @page: page to deactivate
507 *
508 * This function hints to the VM that @page is a good reclaim candidate,
509 * for example if its invalidation fails due to the page being dirty
510 * or under writeback.
511 */
512void deactivate_page(struct page *page)
513{
514 /*
515 * In a workload with many unevictable pages (such as mprotect), deactivating
516 * unevictable pages to accelerate reclaim is pointless.
517 */
518 if (PageUnevictable(page))
519 return;
520
521 if (likely(get_page_unless_zero(page))) {
522 struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
523
524 if (!pagevec_add(pvec, page))
525 pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
526 put_cpu_var(lru_deactivate_pvecs);
527 }
295} 528}
296 529
297void lru_add_drain(void) 530void lru_add_drain(void)
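deactivate_page() only queues a page after get_page_unless_zero() succeeds, i.e. it takes a reference only while the refcount is still non-zero and leaves pages that are already on their way to being freed alone. A sketch of that "increment unless zero" primitive using C11 atomics (illustrative, not the kernel's implementation):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Take a reference only if the object is still live (count > 0). */
static bool get_unless_zero(atomic_int *refcount)
{
	int old = atomic_load(refcount);

	while (old != 0) {
		/* try old -> old + 1; on failure, old is reloaded and rechecked */
		if (atomic_compare_exchange_weak(refcount, &old, old + 1))
			return true;
	}
	return false;   /* already zero: the object is being freed, hands off */
}

int main(void)
{
	atomic_int live = 2, dying = 0;

	printf("live:  %s\n", get_unless_zero(&live) ? "got ref" : "refused");
	printf("dying: %s\n", get_unless_zero(&dying) ? "got ref" : "refused");
	return 0;
}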
@@ -378,6 +611,7 @@ void release_pages(struct page **pages, int nr, int cold)
378 611
379 pagevec_free(&pages_to_free); 612 pagevec_free(&pages_to_free);
380} 613}
614EXPORT_SYMBOL(release_pages);
381 615
382/* 616/*
383 * The pages which we're about to release may be in the deferred lru-addition 617 * The pages which we're about to release may be in the deferred lru-addition
@@ -398,44 +632,70 @@ void __pagevec_release(struct pagevec *pvec)
398 632
399EXPORT_SYMBOL(__pagevec_release); 633EXPORT_SYMBOL(__pagevec_release);
400 634
635/* used by __split_huge_page_refcount() */
636void lru_add_page_tail(struct zone* zone,
637 struct page *page, struct page *page_tail)
638{
639 int active;
640 enum lru_list lru;
641 const int file = 0;
642 struct list_head *head;
643
644 VM_BUG_ON(!PageHead(page));
645 VM_BUG_ON(PageCompound(page_tail));
646 VM_BUG_ON(PageLRU(page_tail));
647 VM_BUG_ON(!spin_is_locked(&zone->lru_lock));
648
649 SetPageLRU(page_tail);
650
651 if (page_evictable(page_tail, NULL)) {
652 if (PageActive(page)) {
653 SetPageActive(page_tail);
654 active = 1;
655 lru = LRU_ACTIVE_ANON;
656 } else {
657 active = 0;
658 lru = LRU_INACTIVE_ANON;
659 }
660 update_page_reclaim_stat(zone, page_tail, file, active);
661 if (likely(PageLRU(page)))
662 head = page->lru.prev;
663 else
664 head = &zone->lru[lru].list;
665 __add_page_to_lru_list(zone, page_tail, lru, head);
666 } else {
667 SetPageUnevictable(page_tail);
668 add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE);
669 }
670}
671
672static void ____pagevec_lru_add_fn(struct page *page, void *arg)
673{
674 enum lru_list lru = (enum lru_list)arg;
675 struct zone *zone = page_zone(page);
676 int file = is_file_lru(lru);
677 int active = is_active_lru(lru);
678
679 VM_BUG_ON(PageActive(page));
680 VM_BUG_ON(PageUnevictable(page));
681 VM_BUG_ON(PageLRU(page));
682
683 SetPageLRU(page);
684 if (active)
685 SetPageActive(page);
686 update_page_reclaim_stat(zone, page, file, active);
687 add_page_to_lru_list(zone, page, lru);
688}
689
401/* 690/*
402 * Add the passed pages to the LRU, then drop the caller's refcount 691 * Add the passed pages to the LRU, then drop the caller's refcount
403 * on them. Reinitialises the caller's pagevec. 692 * on them. Reinitialises the caller's pagevec.
404 */ 693 */
405void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) 694void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
406{ 695{
407 int i;
408 struct zone *zone = NULL;
409
410 VM_BUG_ON(is_unevictable_lru(lru)); 696 VM_BUG_ON(is_unevictable_lru(lru));
411 697
412 for (i = 0; i < pagevec_count(pvec); i++) { 698 pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru);
413 struct page *page = pvec->pages[i];
414 struct zone *pagezone = page_zone(page);
415 int file;
416 int active;
417
418 if (pagezone != zone) {
419 if (zone)
420 spin_unlock_irq(&zone->lru_lock);
421 zone = pagezone;
422 spin_lock_irq(&zone->lru_lock);
423 }
424 VM_BUG_ON(PageActive(page));
425 VM_BUG_ON(PageUnevictable(page));
426 VM_BUG_ON(PageLRU(page));
427 SetPageLRU(page);
428 active = is_active_lru(lru);
429 file = is_file_lru(lru);
430 if (active)
431 SetPageActive(page);
432 update_page_reclaim_stat(zone, page, file, active);
433 add_page_to_lru_list(zone, page, lru);
434 }
435 if (zone)
436 spin_unlock_irq(&zone->lru_lock);
437 release_pages(pvec->pages, pvec->nr, pvec->cold);
438 pagevec_reinit(pvec);
439} 699}
440 700
441EXPORT_SYMBOL(____pagevec_lru_add); 701EXPORT_SYMBOL(____pagevec_lru_add);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e10f5833167f..46680461785b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -24,12 +24,10 @@
24 24
25/* 25/*
26 * swapper_space is a fiction, retained to simplify the path through 26 * swapper_space is a fiction, retained to simplify the path through
27 * vmscan's shrink_page_list, to make sync_page look nicer, and to allow 27 * vmscan's shrink_page_list.
28 * future use of radix_tree tags in the swap cache.
29 */ 28 */
30static const struct address_space_operations swap_aops = { 29static const struct address_space_operations swap_aops = {
31 .writepage = swap_writepage, 30 .writepage = swap_writepage,
32 .sync_page = block_sync_page,
33 .set_page_dirty = __set_page_dirty_nobuffers, 31 .set_page_dirty = __set_page_dirty_nobuffers,
34 .migratepage = migrate_page, 32 .migratepage = migrate_page,
35}; 33};
@@ -37,7 +35,6 @@ static const struct address_space_operations swap_aops = {
37static struct backing_dev_info swap_backing_dev_info = { 35static struct backing_dev_info swap_backing_dev_info = {
38 .name = "swap", 36 .name = "swap",
39 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, 37 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
40 .unplug_io_fn = swap_unplug_io_fn,
41}; 38};
42 39
43struct address_space swapper_space = { 40struct address_space swapper_space = {
@@ -157,6 +154,12 @@ int add_to_swap(struct page *page)
157 if (!entry.val) 154 if (!entry.val)
158 return 0; 155 return 0;
159 156
157 if (unlikely(PageTransHuge(page)))
158 if (unlikely(split_huge_page(page))) {
159 swapcache_free(entry, NULL);
160 return 0;
161 }
162
160 /* 163 /*
161 * Radix-tree node allocations from PF_MEMALLOC contexts could 164 * Radix-tree node allocations from PF_MEMALLOC contexts could
162 * completely exhaust the page allocator. __GFP_NOMEMALLOC 165 * completely exhaust the page allocator. __GFP_NOMEMALLOC
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 7c703ff2f36f..ff8dc1a18cb4 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -14,7 +14,7 @@
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/pagemap.h> 15#include <linux/pagemap.h>
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/shm.h> 17#include <linux/shmem_fs.h>
18#include <linux/blkdev.h> 18#include <linux/blkdev.h>
19#include <linux/random.h> 19#include <linux/random.h>
20#include <linux/writeback.h> 20#include <linux/writeback.h>
@@ -30,6 +30,8 @@
30#include <linux/capability.h> 30#include <linux/capability.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/poll.h>
34#include <linux/oom.h>
33 35
34#include <asm/pgtable.h> 36#include <asm/pgtable.h>
35#include <asm/tlbflush.h> 37#include <asm/tlbflush.h>
@@ -58,6 +60,10 @@ static struct swap_info_struct *swap_info[MAX_SWAPFILES];
58 60
59static DEFINE_MUTEX(swapon_mutex); 61static DEFINE_MUTEX(swapon_mutex);
60 62
63static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
64/* Activity counter to indicate that a swapon or swapoff has occurred */
65static atomic_t proc_poll_event = ATOMIC_INIT(0);
66
61static inline unsigned char swap_count(unsigned char ent) 67static inline unsigned char swap_count(unsigned char ent)
62{ 68{
63 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ 69 return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */
@@ -90,39 +96,6 @@ __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
90} 96}
91 97
92/* 98/*
93 * We need this because the bdev->unplug_fn can sleep and we cannot
94 * hold swap_lock while calling the unplug_fn. And swap_lock
95 * cannot be turned into a mutex.
96 */
97static DECLARE_RWSEM(swap_unplug_sem);
98
99void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
100{
101 swp_entry_t entry;
102
103 down_read(&swap_unplug_sem);
104 entry.val = page_private(page);
105 if (PageSwapCache(page)) {
106 struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
107 struct backing_dev_info *bdi;
108
109 /*
110 * If the page is removed from swapcache from under us (with a
111 * racy try_to_unuse/swapoff) we need an additional reference
112 * count to avoid reading garbage from page_private(page) above.
113 * If the WARN_ON triggers during a swapoff it maybe the race
114 * condition and it's harmless. However if it triggers without
115 * swapoff it signals a problem.
116 */
117 WARN_ON(page_count(page) <= 1);
118
119 bdi = bdev->bd_inode->i_mapping->backing_dev_info;
120 blk_run_backing_dev(bdi, page);
121 }
122 up_read(&swap_unplug_sem);
123}
124
125/*
126 * swapon tell device that all the old swap contents can be discarded, 99 * swapon tell device that all the old swap contents can be discarded,
127 * to allow the swap device to optimize its wear-levelling. 100 * to allow the swap device to optimize its wear-levelling.
128 */ 101 */
@@ -139,7 +112,7 @@ static int discard_swap(struct swap_info_struct *si)
139 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); 112 nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
140 if (nr_blocks) { 113 if (nr_blocks) {
141 err = blkdev_issue_discard(si->bdev, start_block, 114 err = blkdev_issue_discard(si->bdev, start_block,
142 nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); 115 nr_blocks, GFP_KERNEL, 0);
143 if (err) 116 if (err)
144 return err; 117 return err;
145 cond_resched(); 118 cond_resched();
@@ -150,7 +123,7 @@ static int discard_swap(struct swap_info_struct *si)
150 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); 123 nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
151 124
152 err = blkdev_issue_discard(si->bdev, start_block, 125 err = blkdev_issue_discard(si->bdev, start_block,
153 nr_blocks, GFP_KERNEL, BLKDEV_IFL_WAIT); 126 nr_blocks, GFP_KERNEL, 0);
154 if (err) 127 if (err)
155 break; 128 break;
156 129
@@ -189,7 +162,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
189 start_block <<= PAGE_SHIFT - 9; 162 start_block <<= PAGE_SHIFT - 9;
190 nr_blocks <<= PAGE_SHIFT - 9; 163 nr_blocks <<= PAGE_SHIFT - 9;
191 if (blkdev_issue_discard(si->bdev, start_block, 164 if (blkdev_issue_discard(si->bdev, start_block,
192 nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT)) 165 nr_blocks, GFP_NOIO, 0))
193 break; 166 break;
194 } 167 }
195 168
@@ -207,8 +180,8 @@ static int wait_for_discard(void *word)
207#define SWAPFILE_CLUSTER 256 180#define SWAPFILE_CLUSTER 256
208#define LATENCY_LIMIT 256 181#define LATENCY_LIMIT 256
209 182
210static inline unsigned long scan_swap_map(struct swap_info_struct *si, 183static unsigned long scan_swap_map(struct swap_info_struct *si,
211 unsigned char usage) 184 unsigned char usage)
212{ 185{
213 unsigned long offset; 186 unsigned long offset;
214 unsigned long scan_base; 187 unsigned long scan_base;
@@ -875,7 +848,7 @@ unsigned int count_swap_pages(int type, int free)
875static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 848static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
876 unsigned long addr, swp_entry_t entry, struct page *page) 849 unsigned long addr, swp_entry_t entry, struct page *page)
877{ 850{
878 struct mem_cgroup *ptr = NULL; 851 struct mem_cgroup *ptr;
879 spinlock_t *ptl; 852 spinlock_t *ptl;
880 pte_t *pte; 853 pte_t *pte;
881 int ret = 1; 854 int ret = 1;
@@ -959,6 +932,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
959 pmd = pmd_offset(pud, addr); 932 pmd = pmd_offset(pud, addr);
960 do { 933 do {
961 next = pmd_addr_end(addr, end); 934 next = pmd_addr_end(addr, end);
935 if (unlikely(pmd_trans_huge(*pmd)))
936 continue;
962 if (pmd_none_or_clear_bad(pmd)) 937 if (pmd_none_or_clear_bad(pmd))
963 continue; 938 continue;
964 ret = unuse_pte_range(vma, pmd, addr, next, entry, page); 939 ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
@@ -1543,6 +1518,36 @@ bad_bmap:
1543 goto out; 1518 goto out;
1544} 1519}
1545 1520
1521static void enable_swap_info(struct swap_info_struct *p, int prio,
1522 unsigned char *swap_map)
1523{
1524 int i, prev;
1525
1526 spin_lock(&swap_lock);
1527 if (prio >= 0)
1528 p->prio = prio;
1529 else
1530 p->prio = --least_priority;
1531 p->swap_map = swap_map;
1532 p->flags |= SWP_WRITEOK;
1533 nr_swap_pages += p->pages;
1534 total_swap_pages += p->pages;
1535
1536 /* insert swap space into swap_list: */
1537 prev = -1;
1538 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1539 if (p->prio >= swap_info[i]->prio)
1540 break;
1541 prev = i;
1542 }
1543 p->next = i;
1544 if (prev < 0)
1545 swap_list.head = swap_list.next = p->type;
1546 else
1547 swap_info[prev]->next = p->type;
1548 spin_unlock(&swap_lock);
1549}
1550
1546SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) 1551SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1547{ 1552{
1548 struct swap_info_struct *p = NULL; 1553 struct swap_info_struct *p = NULL;
@@ -1551,6 +1556,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1551 struct address_space *mapping; 1556 struct address_space *mapping;
1552 struct inode *inode; 1557 struct inode *inode;
1553 char *pathname; 1558 char *pathname;
1559 int oom_score_adj;
1554 int i, type, prev; 1560 int i, type, prev;
1555 int err; 1561 int err;
1556 1562
@@ -1609,37 +1615,22 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1609 p->flags &= ~SWP_WRITEOK; 1615 p->flags &= ~SWP_WRITEOK;
1610 spin_unlock(&swap_lock); 1616 spin_unlock(&swap_lock);
1611 1617
1612 current->flags |= PF_OOM_ORIGIN; 1618 oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
1613 err = try_to_unuse(type); 1619 err = try_to_unuse(type);
1614 current->flags &= ~PF_OOM_ORIGIN; 1620 test_set_oom_score_adj(oom_score_adj);
1615 1621
1616 if (err) { 1622 if (err) {
1623 /*
1624 * reading p->prio and p->swap_map outside the lock is
1625 * safe here because only sys_swapon and sys_swapoff
1626 * change them, and there can be no other sys_swapon or
1627 * sys_swapoff for this swap_info_struct at this point.
1628 */
1617 /* re-insert swap space back into swap_list */ 1629 /* re-insert swap space back into swap_list */
1618 spin_lock(&swap_lock); 1630 enable_swap_info(p, p->prio, p->swap_map);
1619 if (p->prio < 0)
1620 p->prio = --least_priority;
1621 prev = -1;
1622 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
1623 if (p->prio >= swap_info[i]->prio)
1624 break;
1625 prev = i;
1626 }
1627 p->next = i;
1628 if (prev < 0)
1629 swap_list.head = swap_list.next = type;
1630 else
1631 swap_info[prev]->next = type;
1632 nr_swap_pages += p->pages;
1633 total_swap_pages += p->pages;
1634 p->flags |= SWP_WRITEOK;
1635 spin_unlock(&swap_lock);
1636 goto out_dput; 1631 goto out_dput;
1637 } 1632 }
1638 1633
1639 /* wait for any unplug function to finish */
1640 down_write(&swap_unplug_sem);
1641 up_write(&swap_unplug_sem);
1642
1643 destroy_swap_extents(p); 1634 destroy_swap_extents(p);
1644 if (p->flags & SWP_CONTINUED) 1635 if (p->flags & SWP_CONTINUED)
1645 free_swap_count_continuations(p); 1636 free_swap_count_continuations(p);
@@ -1672,7 +1663,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1672 if (S_ISBLK(inode->i_mode)) { 1663 if (S_ISBLK(inode->i_mode)) {
1673 struct block_device *bdev = I_BDEV(inode); 1664 struct block_device *bdev = I_BDEV(inode);
1674 set_blocksize(bdev, p->old_block_size); 1665 set_blocksize(bdev, p->old_block_size);
1675 bd_release(bdev); 1666 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
1676 } else { 1667 } else {
1677 mutex_lock(&inode->i_mutex); 1668 mutex_lock(&inode->i_mutex);
1678 inode->i_flags &= ~S_SWAPFILE; 1669 inode->i_flags &= ~S_SWAPFILE;
@@ -1680,6 +1671,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1680 } 1671 }
1681 filp_close(swap_file, NULL); 1672 filp_close(swap_file, NULL);
1682 err = 0; 1673 err = 0;
1674 atomic_inc(&proc_poll_event);
1675 wake_up_interruptible(&proc_poll_wait);
1683 1676
1684out_dput: 1677out_dput:
1685 filp_close(victim, NULL); 1678 filp_close(victim, NULL);
@@ -1688,6 +1681,25 @@ out:
1688} 1681}
1689 1682
1690#ifdef CONFIG_PROC_FS 1683#ifdef CONFIG_PROC_FS
1684struct proc_swaps {
1685 struct seq_file seq;
1686 int event;
1687};
1688
1689static unsigned swaps_poll(struct file *file, poll_table *wait)
1690{
1691 struct proc_swaps *s = file->private_data;
1692
1693 poll_wait(file, &proc_poll_wait, wait);
1694
1695 if (s->event != atomic_read(&proc_poll_event)) {
1696 s->event = atomic_read(&proc_poll_event);
1697 return POLLIN | POLLRDNORM | POLLERR | POLLPRI;
1698 }
1699
1700 return POLLIN | POLLRDNORM;
1701}
1702
1691/* iterator */ 1703/* iterator */
1692static void *swap_start(struct seq_file *swap, loff_t *pos) 1704static void *swap_start(struct seq_file *swap, loff_t *pos)
1693{ 1705{
@@ -1771,7 +1783,24 @@ static const struct seq_operations swaps_op = {
1771 1783
1772static int swaps_open(struct inode *inode, struct file *file) 1784static int swaps_open(struct inode *inode, struct file *file)
1773{ 1785{
1774 return seq_open(file, &swaps_op); 1786 struct proc_swaps *s;
1787 int ret;
1788
1789 s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL);
1790 if (!s)
1791 return -ENOMEM;
1792
1793 file->private_data = s;
1794
1795 ret = seq_open(file, &swaps_op);
1796 if (ret) {
1797 kfree(s);
1798 return ret;
1799 }
1800
1801 s->seq.private = s;
1802 s->event = atomic_read(&proc_poll_event);
1803 return ret;
1775} 1804}
1776 1805
1777static const struct file_operations proc_swaps_operations = { 1806static const struct file_operations proc_swaps_operations = {
@@ -1779,6 +1808,7 @@ static const struct file_operations proc_swaps_operations = {
1779 .read = seq_read, 1808 .read = seq_read,
1780 .llseek = seq_lseek, 1809 .llseek = seq_lseek,
1781 .release = seq_release, 1810 .release = seq_release,
1811 .poll = swaps_poll,
1782}; 1812};
1783 1813
1784static int __init procswaps_init(void) 1814static int __init procswaps_init(void)
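With swaps_poll() wired up, both swapoff (above) and swapon bump proc_poll_event and wake proc_poll_wait, so a monitor can sleep in poll(2) on /proc/swaps and re-read it only when the swap configuration changes. A hedged userspace example:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	struct pollfd pfd;

	pfd.fd = open("/proc/swaps", O_RDONLY);
	if (pfd.fd < 0) {
		perror("open /proc/swaps");
		return 1;
	}
	pfd.events = POLLPRI;   /* swaps_poll() raises POLLERR|POLLPRI on change */

	for (;;) {
		if (poll(&pfd, 1, -1) < 0) {
			perror("poll");
			return 1;
		}
		if (pfd.revents & (POLLERR | POLLPRI)) {
			lseek(pfd.fd, 0, SEEK_SET);
			n = read(pfd.fd, buf, sizeof(buf) - 1);
			if (n > 0) {
				buf[n] = '\0';
				printf("swap configuration changed:\n%s", buf);
			}
		}
	}
}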
@@ -1798,49 +1828,24 @@ static int __init max_swapfiles_check(void)
1798late_initcall(max_swapfiles_check); 1828late_initcall(max_swapfiles_check);
1799#endif 1829#endif
1800 1830
1801/* 1831static struct swap_info_struct *alloc_swap_info(void)
1802 * Written 01/25/92 by Simmule Turner, heavily changed by Linus.
1803 *
1804 * The swapon system call
1805 */
1806SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1807{ 1832{
1808 struct swap_info_struct *p; 1833 struct swap_info_struct *p;
1809 char *name = NULL;
1810 struct block_device *bdev = NULL;
1811 struct file *swap_file = NULL;
1812 struct address_space *mapping;
1813 unsigned int type; 1834 unsigned int type;
1814 int i, prev;
1815 int error;
1816 union swap_header *swap_header;
1817 unsigned int nr_good_pages;
1818 int nr_extents = 0;
1819 sector_t span;
1820 unsigned long maxpages;
1821 unsigned long swapfilepages;
1822 unsigned char *swap_map = NULL;
1823 struct page *page = NULL;
1824 struct inode *inode = NULL;
1825 int did_down = 0;
1826
1827 if (!capable(CAP_SYS_ADMIN))
1828 return -EPERM;
1829 1835
1830 p = kzalloc(sizeof(*p), GFP_KERNEL); 1836 p = kzalloc(sizeof(*p), GFP_KERNEL);
1831 if (!p) 1837 if (!p)
1832 return -ENOMEM; 1838 return ERR_PTR(-ENOMEM);
1833 1839
1834 spin_lock(&swap_lock); 1840 spin_lock(&swap_lock);
1835 for (type = 0; type < nr_swapfiles; type++) { 1841 for (type = 0; type < nr_swapfiles; type++) {
1836 if (!(swap_info[type]->flags & SWP_USED)) 1842 if (!(swap_info[type]->flags & SWP_USED))
1837 break; 1843 break;
1838 } 1844 }
1839 error = -EPERM;
1840 if (type >= MAX_SWAPFILES) { 1845 if (type >= MAX_SWAPFILES) {
1841 spin_unlock(&swap_lock); 1846 spin_unlock(&swap_lock);
1842 kfree(p); 1847 kfree(p);
1843 goto out; 1848 return ERR_PTR(-EPERM);
1844 } 1849 }
1845 if (type >= nr_swapfiles) { 1850 if (type >= nr_swapfiles) {
1846 p->type = type; 1851 p->type = type;
@@ -1865,80 +1870,49 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1865 p->next = -1; 1870 p->next = -1;
1866 spin_unlock(&swap_lock); 1871 spin_unlock(&swap_lock);
1867 1872
1868 name = getname(specialfile); 1873 return p;
1869 error = PTR_ERR(name); 1874}
1870 if (IS_ERR(name)) {
1871 name = NULL;
1872 goto bad_swap_2;
1873 }
1874 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
1875 error = PTR_ERR(swap_file);
1876 if (IS_ERR(swap_file)) {
1877 swap_file = NULL;
1878 goto bad_swap_2;
1879 }
1880
1881 p->swap_file = swap_file;
1882 mapping = swap_file->f_mapping;
1883 inode = mapping->host;
1884
1885 error = -EBUSY;
1886 for (i = 0; i < nr_swapfiles; i++) {
1887 struct swap_info_struct *q = swap_info[i];
1888 1875
1889 if (i == type || !q->swap_file) 1876static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
1890 continue; 1877{
1891 if (mapping == q->swap_file->f_mapping) 1878 int error;
1892 goto bad_swap;
1893 }
1894 1879
1895 error = -EINVAL;
1896 if (S_ISBLK(inode->i_mode)) { 1880 if (S_ISBLK(inode->i_mode)) {
1897 bdev = I_BDEV(inode); 1881 p->bdev = bdgrab(I_BDEV(inode));
1898 error = bd_claim(bdev, sys_swapon); 1882 error = blkdev_get(p->bdev,
1883 FMODE_READ | FMODE_WRITE | FMODE_EXCL,
1884 sys_swapon);
1899 if (error < 0) { 1885 if (error < 0) {
1900 bdev = NULL; 1886 p->bdev = NULL;
1901 error = -EINVAL; 1887 return -EINVAL;
1902 goto bad_swap;
1903 } 1888 }
1904 p->old_block_size = block_size(bdev); 1889 p->old_block_size = block_size(p->bdev);
1905 error = set_blocksize(bdev, PAGE_SIZE); 1890 error = set_blocksize(p->bdev, PAGE_SIZE);
1906 if (error < 0) 1891 if (error < 0)
1907 goto bad_swap; 1892 return error;
1908 p->bdev = bdev;
1909 p->flags |= SWP_BLKDEV; 1893 p->flags |= SWP_BLKDEV;
1910 } else if (S_ISREG(inode->i_mode)) { 1894 } else if (S_ISREG(inode->i_mode)) {
1911 p->bdev = inode->i_sb->s_bdev; 1895 p->bdev = inode->i_sb->s_bdev;
1912 mutex_lock(&inode->i_mutex); 1896 mutex_lock(&inode->i_mutex);
1913 did_down = 1; 1897 if (IS_SWAPFILE(inode))
1914 if (IS_SWAPFILE(inode)) { 1898 return -EBUSY;
1915 error = -EBUSY; 1899 } else
1916 goto bad_swap; 1900 return -EINVAL;
1917 }
1918 } else {
1919 goto bad_swap;
1920 }
1921 1901
1922 swapfilepages = i_size_read(inode) >> PAGE_SHIFT; 1902 return 0;
1903}
1923 1904
1924 /* 1905static unsigned long read_swap_header(struct swap_info_struct *p,
1925 * Read the swap header. 1906 union swap_header *swap_header,
1926 */ 1907 struct inode *inode)
1927 if (!mapping->a_ops->readpage) { 1908{
1928 error = -EINVAL; 1909 int i;
1929 goto bad_swap; 1910 unsigned long maxpages;
1930 } 1911 unsigned long swapfilepages;
1931 page = read_mapping_page(mapping, 0, swap_file);
1932 if (IS_ERR(page)) {
1933 error = PTR_ERR(page);
1934 goto bad_swap;
1935 }
1936 swap_header = kmap(page);
1937 1912
1938 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { 1913 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1939 printk(KERN_ERR "Unable to find swap-space signature\n"); 1914 printk(KERN_ERR "Unable to find swap-space signature\n");
1940 error = -EINVAL; 1915 return 0;
1941 goto bad_swap;
1942 } 1916 }
1943 1917
1944 /* swap partition endianess hack... */ 1918 /* swap partition endianess hack... */
@@ -1954,8 +1928,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1954 printk(KERN_WARNING 1928 printk(KERN_WARNING
1955 "Unable to handle swap header version %d\n", 1929 "Unable to handle swap header version %d\n",
1956 swap_header->info.version); 1930 swap_header->info.version);
1957 error = -EINVAL; 1931 return 0;
1958 goto bad_swap;
1959 } 1932 }
1960 1933
1961 p->lowest_bit = 1; 1934 p->lowest_bit = 1;
@@ -1986,62 +1959,156 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1986 } 1959 }
1987 p->highest_bit = maxpages - 1; 1960 p->highest_bit = maxpages - 1;
1988 1961
1989 error = -EINVAL;
1990 if (!maxpages) 1962 if (!maxpages)
1991 goto bad_swap; 1963 return 0;
1964 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1992 if (swapfilepages && maxpages > swapfilepages) { 1965 if (swapfilepages && maxpages > swapfilepages) {
1993 printk(KERN_WARNING 1966 printk(KERN_WARNING
1994 "Swap area shorter than signature indicates\n"); 1967 "Swap area shorter than signature indicates\n");
1995 goto bad_swap; 1968 return 0;
1996 } 1969 }
1997 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 1970 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1998 goto bad_swap; 1971 return 0;
1999 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1972 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
2000 goto bad_swap; 1973 return 0;
2001 1974
2002 /* OK, set up the swap map and apply the bad block list */ 1975 return maxpages;
2003 swap_map = vmalloc(maxpages); 1976}
2004 if (!swap_map) { 1977
2005 error = -ENOMEM; 1978static int setup_swap_map_and_extents(struct swap_info_struct *p,
2006 goto bad_swap; 1979 union swap_header *swap_header,
2007 } 1980 unsigned char *swap_map,
1981 unsigned long maxpages,
1982 sector_t *span)
1983{
1984 int i;
1985 unsigned int nr_good_pages;
1986 int nr_extents;
2008 1987
2009 memset(swap_map, 0, maxpages);
2010 nr_good_pages = maxpages - 1; /* omit header page */ 1988 nr_good_pages = maxpages - 1; /* omit header page */
2011 1989
2012 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1990 for (i = 0; i < swap_header->info.nr_badpages; i++) {
2013 unsigned int page_nr = swap_header->info.badpages[i]; 1991 unsigned int page_nr = swap_header->info.badpages[i];
2014 if (page_nr == 0 || page_nr > swap_header->info.last_page) { 1992 if (page_nr == 0 || page_nr > swap_header->info.last_page)
2015 error = -EINVAL; 1993 return -EINVAL;
2016 goto bad_swap;
2017 }
2018 if (page_nr < maxpages) { 1994 if (page_nr < maxpages) {
2019 swap_map[page_nr] = SWAP_MAP_BAD; 1995 swap_map[page_nr] = SWAP_MAP_BAD;
2020 nr_good_pages--; 1996 nr_good_pages--;
2021 } 1997 }
2022 } 1998 }
2023 1999
2024 error = swap_cgroup_swapon(type, maxpages);
2025 if (error)
2026 goto bad_swap;
2027
2028 if (nr_good_pages) { 2000 if (nr_good_pages) {
2029 swap_map[0] = SWAP_MAP_BAD; 2001 swap_map[0] = SWAP_MAP_BAD;
2030 p->max = maxpages; 2002 p->max = maxpages;
2031 p->pages = nr_good_pages; 2003 p->pages = nr_good_pages;
2032 nr_extents = setup_swap_extents(p, &span); 2004 nr_extents = setup_swap_extents(p, span);
2033 if (nr_extents < 0) { 2005 if (nr_extents < 0)
2034 error = nr_extents; 2006 return nr_extents;
2035 goto bad_swap;
2036 }
2037 nr_good_pages = p->pages; 2007 nr_good_pages = p->pages;
2038 } 2008 }
2039 if (!nr_good_pages) { 2009 if (!nr_good_pages) {
2040 printk(KERN_WARNING "Empty swap-file\n"); 2010 printk(KERN_WARNING "Empty swap-file\n");
2011 return -EINVAL;
2012 }
2013
2014 return nr_extents;
2015}
2016
2017SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2018{
2019 struct swap_info_struct *p;
2020 char *name;
2021 struct file *swap_file = NULL;
2022 struct address_space *mapping;
2023 int i;
2024 int prio;
2025 int error;
2026 union swap_header *swap_header;
2027 int nr_extents;
2028 sector_t span;
2029 unsigned long maxpages;
2030 unsigned char *swap_map = NULL;
2031 struct page *page = NULL;
2032 struct inode *inode = NULL;
2033
2034 if (!capable(CAP_SYS_ADMIN))
2035 return -EPERM;
2036
2037 p = alloc_swap_info();
2038 if (IS_ERR(p))
2039 return PTR_ERR(p);
2040
2041 name = getname(specialfile);
2042 if (IS_ERR(name)) {
2043 error = PTR_ERR(name);
2044 name = NULL;
2045 goto bad_swap;
2046 }
2047 swap_file = filp_open(name, O_RDWR|O_LARGEFILE, 0);
2048 if (IS_ERR(swap_file)) {
2049 error = PTR_ERR(swap_file);
2050 swap_file = NULL;
2051 goto bad_swap;
2052 }
2053
2054 p->swap_file = swap_file;
2055 mapping = swap_file->f_mapping;
2056
2057 for (i = 0; i < nr_swapfiles; i++) {
2058 struct swap_info_struct *q = swap_info[i];
2059
2060 if (q == p || !q->swap_file)
2061 continue;
2062 if (mapping == q->swap_file->f_mapping) {
2063 error = -EBUSY;
2064 goto bad_swap;
2065 }
2066 }
2067
2068 inode = mapping->host;
2069 /* If S_ISREG(inode->i_mode), claim_swapfile() will do mutex_lock(&inode->i_mutex) */
2070 error = claim_swapfile(p, inode);
2071 if (unlikely(error))
2072 goto bad_swap;
2073
2074 /*
2075 * Read the swap header.
2076 */
2077 if (!mapping->a_ops->readpage) {
2078 error = -EINVAL;
2079 goto bad_swap;
2080 }
2081 page = read_mapping_page(mapping, 0, swap_file);
2082 if (IS_ERR(page)) {
2083 error = PTR_ERR(page);
2084 goto bad_swap;
2085 }
2086 swap_header = kmap(page);
2087
2088 maxpages = read_swap_header(p, swap_header, inode);
2089 if (unlikely(!maxpages)) {
2041 error = -EINVAL; 2090 error = -EINVAL;
2042 goto bad_swap; 2091 goto bad_swap;
2043 } 2092 }
2044 2093
2094 /* OK, set up the swap map and apply the bad block list */
2095 swap_map = vzalloc(maxpages);
2096 if (!swap_map) {
2097 error = -ENOMEM;
2098 goto bad_swap;
2099 }
2100
2101 error = swap_cgroup_swapon(p->type, maxpages);
2102 if (error)
2103 goto bad_swap;
2104
2105 nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
2106 maxpages, &span);
2107 if (unlikely(nr_extents < 0)) {
2108 error = nr_extents;
2109 goto bad_swap;
2110 }
2111
2045 if (p->bdev) { 2112 if (p->bdev) {
2046 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { 2113 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
2047 p->flags |= SWP_SOLIDSTATE; 2114 p->flags |= SWP_SOLIDSTATE;
@@ -2052,55 +2119,46 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
2052 } 2119 }
2053 2120
2054 mutex_lock(&swapon_mutex); 2121 mutex_lock(&swapon_mutex);
2055 spin_lock(&swap_lock); 2122 prio = -1;
2056 if (swap_flags & SWAP_FLAG_PREFER) 2123 if (swap_flags & SWAP_FLAG_PREFER)
2057 p->prio = 2124 prio =
2058 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; 2125 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
2059 else 2126 enable_swap_info(p, prio, swap_map);
2060 p->prio = --least_priority;
2061 p->swap_map = swap_map;
2062 p->flags |= SWP_WRITEOK;
2063 nr_swap_pages += nr_good_pages;
2064 total_swap_pages += nr_good_pages;
2065 2127
2066 printk(KERN_INFO "Adding %uk swap on %s. " 2128 printk(KERN_INFO "Adding %uk swap on %s. "
2067 "Priority:%d extents:%d across:%lluk %s%s\n", 2129 "Priority:%d extents:%d across:%lluk %s%s\n",
2068 nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, 2130 p->pages<<(PAGE_SHIFT-10), name, p->prio,
2069 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), 2131 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
2070 (p->flags & SWP_SOLIDSTATE) ? "SS" : "", 2132 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
2071 (p->flags & SWP_DISCARDABLE) ? "D" : ""); 2133 (p->flags & SWP_DISCARDABLE) ? "D" : "");
2072 2134
2073 /* insert swap space into swap_list: */
2074 prev = -1;
2075 for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
2076 if (p->prio >= swap_info[i]->prio)
2077 break;
2078 prev = i;
2079 }
2080 p->next = i;
2081 if (prev < 0)
2082 swap_list.head = swap_list.next = type;
2083 else
2084 swap_info[prev]->next = type;
2085 spin_unlock(&swap_lock);
2086 mutex_unlock(&swapon_mutex); 2135 mutex_unlock(&swapon_mutex);
2136 atomic_inc(&proc_poll_event);
2137 wake_up_interruptible(&proc_poll_wait);
2138
2139 if (S_ISREG(inode->i_mode))
2140 inode->i_flags |= S_SWAPFILE;
2087 error = 0; 2141 error = 0;
2088 goto out; 2142 goto out;
2089bad_swap: 2143bad_swap:
2090 if (bdev) { 2144 if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
2091 set_blocksize(bdev, p->old_block_size); 2145 set_blocksize(p->bdev, p->old_block_size);
2092 bd_release(bdev); 2146 blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2093 } 2147 }
2094 destroy_swap_extents(p); 2148 destroy_swap_extents(p);
2095 swap_cgroup_swapoff(type); 2149 swap_cgroup_swapoff(p->type);
2096bad_swap_2:
2097 spin_lock(&swap_lock); 2150 spin_lock(&swap_lock);
2098 p->swap_file = NULL; 2151 p->swap_file = NULL;
2099 p->flags = 0; 2152 p->flags = 0;
2100 spin_unlock(&swap_lock); 2153 spin_unlock(&swap_lock);
2101 vfree(swap_map); 2154 vfree(swap_map);
2102 if (swap_file) 2155 if (swap_file) {
2156 if (inode && S_ISREG(inode->i_mode)) {
2157 mutex_unlock(&inode->i_mutex);
2158 inode = NULL;
2159 }
2103 filp_close(swap_file, NULL); 2160 filp_close(swap_file, NULL);
2161 }
2104out: 2162out:
2105 if (page && !IS_ERR(page)) { 2163 if (page && !IS_ERR(page)) {
2106 kunmap(page); 2164 kunmap(page);
@@ -2108,11 +2166,8 @@ out:
2108 } 2166 }
2109 if (name) 2167 if (name)
2110 putname(name); 2168 putname(name);
2111 if (did_down) { 2169 if (inode && S_ISREG(inode->i_mode))
2112 if (!error)
2113 inode->i_flags |= S_SWAPFILE;
2114 mutex_unlock(&inode->i_mutex); 2170 mutex_unlock(&inode->i_mutex);
2115 }
2116 return error; 2171 return error;
2117} 2172}
2118 2173
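The prio value decoded from swap_flags above comes straight from the swapon(2) flags argument: userspace selects an explicit priority by OR-ing SWAP_FLAG_PREFER with a shifted priority, otherwise the kernel falls back to --least_priority in enable_swap_info(). A small example (requires CAP_SYS_ADMIN and an already mkswap'ed file; error handling kept minimal):

#include <stdio.h>
#include <sys/swap.h>   /* swapon(), SWAP_FLAG_PREFER, SWAP_FLAG_PRIO_* */

int main(int argc, char **argv)
{
	int prio = 5;   /* illustrative priority, 0..SWAP_FLAG_PRIO_MASK */
	int flags = SWAP_FLAG_PREFER |
		    ((prio << SWAP_FLAG_PRIO_SHIFT) & SWAP_FLAG_PRIO_MASK);

	if (argc != 2) {
		fprintf(stderr, "usage: %s <swapfile>\n", argv[0]);
		return 1;
	}
	/* lands in SYSCALL_DEFINE2(swapon, ...) above; without
	 * SWAP_FLAG_PREFER the kernel assigns --least_priority instead */
	if (swapon(argv[1], flags) < 0) {
		perror("swapon");
		return 1;
	}
	printf("%s enabled at priority %d\n", argv[1], prio);
	return 0;
}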
diff --git a/mm/thrash.c b/mm/thrash.c
index 2372d4ed5dd8..fabf2d0f5169 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -21,14 +21,40 @@
21#include <linux/mm.h> 21#include <linux/mm.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/memcontrol.h>
25
26#include <trace/events/vmscan.h>
27
28#define TOKEN_AGING_INTERVAL (0xFF)
24 29
25static DEFINE_SPINLOCK(swap_token_lock); 30static DEFINE_SPINLOCK(swap_token_lock);
26struct mm_struct *swap_token_mm; 31struct mm_struct *swap_token_mm;
32struct mem_cgroup *swap_token_memcg;
27static unsigned int global_faults; 33static unsigned int global_faults;
34static unsigned int last_aging;
35
36#ifdef CONFIG_CGROUP_MEM_RES_CTLR
37static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
38{
39 struct mem_cgroup *memcg;
40
41 memcg = try_get_mem_cgroup_from_mm(mm);
42 if (memcg)
43 css_put(mem_cgroup_css(memcg));
44
45 return memcg;
46}
47#else
48static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
49{
50 return NULL;
51}
52#endif
28 53
29void grab_swap_token(struct mm_struct *mm) 54void grab_swap_token(struct mm_struct *mm)
30{ 55{
31 int current_interval; 56 int current_interval;
57 unsigned int old_prio = mm->token_priority;
32 58
33 global_faults++; 59 global_faults++;
34 60
@@ -38,40 +64,81 @@ void grab_swap_token(struct mm_struct *mm)
38 return; 64 return;
39 65
40 /* First come first served */ 66 /* First come first served */
41 if (swap_token_mm == NULL) { 67 if (!swap_token_mm)
42 mm->token_priority = mm->token_priority + 2; 68 goto replace_token;
43 swap_token_mm = mm; 69
44 goto out; 70 if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) {
71 swap_token_mm->token_priority /= 2;
72 last_aging = global_faults;
45 } 73 }
46 74
47 if (mm != swap_token_mm) { 75 if (mm == swap_token_mm) {
48 if (current_interval < mm->last_interval)
49 mm->token_priority++;
50 else {
51 if (likely(mm->token_priority > 0))
52 mm->token_priority--;
53 }
54 /* Check if we deserve the token */
55 if (mm->token_priority > swap_token_mm->token_priority) {
56 mm->token_priority += 2;
57 swap_token_mm = mm;
58 }
59 } else {
60 /* Token holder came in again! */
61 mm->token_priority += 2; 76 mm->token_priority += 2;
77 goto update_priority;
78 }
79
80 if (current_interval < mm->last_interval)
81 mm->token_priority++;
82 else {
83 if (likely(mm->token_priority > 0))
84 mm->token_priority--;
62 } 85 }
63 86
87 /* Check if we deserve the token */
88 if (mm->token_priority > swap_token_mm->token_priority)
89 goto replace_token;
90
91update_priority:
92 trace_update_swap_token_priority(mm, old_prio, swap_token_mm);
93
64out: 94out:
65 mm->faultstamp = global_faults; 95 mm->faultstamp = global_faults;
66 mm->last_interval = current_interval; 96 mm->last_interval = current_interval;
67 spin_unlock(&swap_token_lock); 97 spin_unlock(&swap_token_lock);
98 return;
99
100replace_token:
101 mm->token_priority += 2;
102 trace_replace_swap_token(swap_token_mm, mm);
103 swap_token_mm = mm;
104 swap_token_memcg = swap_token_memcg_from_mm(mm);
105 last_aging = global_faults;
106 goto out;
68} 107}
69 108
70/* Called on process exit. */ 109/* Called on process exit. */
71void __put_swap_token(struct mm_struct *mm) 110void __put_swap_token(struct mm_struct *mm)
72{ 111{
73 spin_lock(&swap_token_lock); 112 spin_lock(&swap_token_lock);
74 if (likely(mm == swap_token_mm)) 113 if (likely(mm == swap_token_mm)) {
114 trace_put_swap_token(swap_token_mm);
75 swap_token_mm = NULL; 115 swap_token_mm = NULL;
116 swap_token_memcg = NULL;
117 }
76 spin_unlock(&swap_token_lock); 118 spin_unlock(&swap_token_lock);
77} 119}
120
121static bool match_memcg(struct mem_cgroup *a, struct mem_cgroup *b)
122{
123 if (!a)
124 return true;
125 if (!b)
126 return true;
127 if (a == b)
128 return true;
129 return false;
130}
131
132void disable_swap_token(struct mem_cgroup *memcg)
133{
134 /* memcg reclaim doesn't disable an unrelated mm's token. */
135 if (match_memcg(memcg, swap_token_memcg)) {
136 spin_lock(&swap_token_lock);
137 if (match_memcg(memcg, swap_token_memcg)) {
138 trace_disable_swap_token(swap_token_mm);
139 swap_token_mm = NULL;
140 swap_token_memcg = NULL;
141 }
142 spin_unlock(&swap_token_lock);
143 }
144}
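The aging added above halves the token holder's priority every TOKEN_AGING_INTERVAL (0xFF) global faults, so an idle holder eventually loses the token to a task that keeps faulting. A toy simulation of that policy (the faultstamp/last_interval bookkeeping is omitted and names are illustrative, so this is only an approximation of the kernel logic):

#include <stdio.h>

#define TOKEN_AGING_INTERVAL 0xFF

struct task { const char *name; unsigned prio; };

static struct task a = { "A", 0 }, b = { "B", 0 };
static struct task *holder;
static unsigned global_faults, last_aging;

/* toy version of grab_swap_token(): age the holder, maybe steal the token */
static void fault(struct task *t)
{
	global_faults++;
	if (!holder)
		goto replace;
	if (global_faults - last_aging > TOKEN_AGING_INTERVAL) {
		holder->prio /= 2;              /* holder decays over time */
		last_aging = global_faults;
	}
	if (t == holder) {
		t->prio += 2;                   /* holder reinforces itself */
		return;
	}
	t->prio++;                              /* contender builds priority */
	if (t->prio > holder->prio)
		goto replace;
	return;
replace:
	t->prio += 2;
	holder = t;
	last_aging = global_faults;
	printf("fault %5u: token -> %s (prio %u)\n",
	       global_faults, t->name, t->prio);
}

int main(void)
{
	/* A faults heavily first, then goes quiet while B keeps faulting */
	for (int i = 0; i < 2000; i++)
		fault(&a);
	for (int i = 0; i < 20000; i++)
		fault(&b);
	return 0;
}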
diff --git a/mm/truncate.c b/mm/truncate.c
index ba887bff48c5..e13f22efaad7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -19,6 +19,7 @@
19#include <linux/task_io_accounting_ops.h> 19#include <linux/task_io_accounting_ops.h>
20#include <linux/buffer_head.h> /* grr. try_to_release_page, 20#include <linux/buffer_head.h> /* grr. try_to_release_page,
21 do_invalidatepage */ 21 do_invalidatepage */
22#include <linux/cleancache.h>
22#include "internal.h" 23#include "internal.h"
23 24
24 25
@@ -51,6 +52,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
51static inline void truncate_partial_page(struct page *page, unsigned partial) 52static inline void truncate_partial_page(struct page *page, unsigned partial)
52{ 53{
53 zero_user_segment(page, partial, PAGE_CACHE_SIZE); 54 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
55 cleancache_flush_page(page->mapping, page);
54 if (page_has_private(page)) 56 if (page_has_private(page))
55 do_invalidatepage(page, partial); 57 do_invalidatepage(page, partial);
56} 58}
@@ -106,9 +108,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
106 cancel_dirty_page(page, PAGE_CACHE_SIZE); 108 cancel_dirty_page(page, PAGE_CACHE_SIZE);
107 109
108 clear_page_mlock(page); 110 clear_page_mlock(page);
109 remove_from_page_cache(page);
110 ClearPageMappedToDisk(page); 111 ClearPageMappedToDisk(page);
111 page_cache_release(page); /* pagecache ref */ 112 delete_from_page_cache(page);
112 return 0; 113 return 0;
113} 114}
114 115
@@ -215,6 +216,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
215 pgoff_t next; 216 pgoff_t next;
216 int i; 217 int i;
217 218
219 cleancache_flush_inode(mapping);
218 if (mapping->nrpages == 0) 220 if (mapping->nrpages == 0)
219 return; 221 return;
220 222
@@ -225,6 +227,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
225 next = start; 227 next = start;
226 while (next <= end && 228 while (next <= end &&
227 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 229 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
230 mem_cgroup_uncharge_start();
228 for (i = 0; i < pagevec_count(&pvec); i++) { 231 for (i = 0; i < pagevec_count(&pvec); i++) {
229 struct page *page = pvec.pages[i]; 232 struct page *page = pvec.pages[i];
230 pgoff_t page_index = page->index; 233 pgoff_t page_index = page->index;
@@ -247,6 +250,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
247 unlock_page(page); 250 unlock_page(page);
248 } 251 }
249 pagevec_release(&pvec); 252 pagevec_release(&pvec);
253 mem_cgroup_uncharge_end();
250 cond_resched(); 254 cond_resched();
251 } 255 }
252 256
@@ -290,6 +294,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
290 pagevec_release(&pvec); 294 pagevec_release(&pvec);
291 mem_cgroup_uncharge_end(); 295 mem_cgroup_uncharge_end();
292 } 296 }
297 cleancache_flush_inode(mapping);
293} 298}
294EXPORT_SYMBOL(truncate_inode_pages_range); 299EXPORT_SYMBOL(truncate_inode_pages_range);
295 300
@@ -299,6 +304,11 @@ EXPORT_SYMBOL(truncate_inode_pages_range);
299 * @lstart: offset from which to truncate 304 * @lstart: offset from which to truncate
300 * 305 *
301 * Called under (and serialised by) inode->i_mutex. 306 * Called under (and serialised by) inode->i_mutex.
307 *
308 * Note: When this function returns, there can be a page in the process of
309 * deletion (inside __delete_from_page_cache()) in the specified range. Thus
310 * mapping->nrpages can be non-zero when this function returns even after
311 * truncation of the whole mapping.
302 */ 312 */
303void truncate_inode_pages(struct address_space *mapping, loff_t lstart) 313void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
304{ 314{
@@ -320,11 +330,12 @@ EXPORT_SYMBOL(truncate_inode_pages);
320 * pagetables. 330 * pagetables.
321 */ 331 */
322unsigned long invalidate_mapping_pages(struct address_space *mapping, 332unsigned long invalidate_mapping_pages(struct address_space *mapping,
323 pgoff_t start, pgoff_t end) 333 pgoff_t start, pgoff_t end)
324{ 334{
325 struct pagevec pvec; 335 struct pagevec pvec;
326 pgoff_t next = start; 336 pgoff_t next = start;
327 unsigned long ret = 0; 337 unsigned long ret;
338 unsigned long count = 0;
328 int i; 339 int i;
329 340
330 pagevec_init(&pvec, 0); 341 pagevec_init(&pvec, 0);
@@ -351,9 +362,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
351 if (lock_failed) 362 if (lock_failed)
352 continue; 363 continue;
353 364
354 ret += invalidate_inode_page(page); 365 ret = invalidate_inode_page(page);
355
356 unlock_page(page); 366 unlock_page(page);
367 /*
368 * Invalidation is a hint that the page is no longer
369 * of interest, so try to speed up its reclaim.
370 */
371 if (!ret)
372 deactivate_page(page);
373 count += ret;
357 if (next > end) 374 if (next > end)
358 break; 375 break;
359 } 376 }
@@ -361,7 +378,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
361 mem_cgroup_uncharge_end(); 378 mem_cgroup_uncharge_end();
362 cond_resched(); 379 cond_resched();
363 } 380 }
364 return ret; 381 return count;
365} 382}
366EXPORT_SYMBOL(invalidate_mapping_pages); 383EXPORT_SYMBOL(invalidate_mapping_pages);
367 384
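invalidate_mapping_pages() is the path behind hints such as posix_fadvise(POSIX_FADV_DONTNEED); with this change, pages that cannot be invalidated (dirty or under writeback) are at least pushed toward reclaim via deactivate_page(). A hedged example of issuing that hint from userspace:

#define _XOPEN_SOURCE 600
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct stat st;
	int fd, err;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || fstat(fd, &st) < 0) {
		perror(argv[1]);
		return 1;
	}
	/* Ask the kernel to drop this file's clean page cache; dirty or
	 * writeback pages can't be invalidated and are only deactivated. */
	err = posix_fadvise(fd, 0, st.st_size, POSIX_FADV_DONTNEED);
	if (err)
		fprintf(stderr, "posix_fadvise: %s\n", strerror(err));
	return err ? 1 : 0;
}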
@@ -387,9 +404,13 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
387 404
388 clear_page_mlock(page); 405 clear_page_mlock(page);
389 BUG_ON(page_has_private(page)); 406 BUG_ON(page_has_private(page));
390 __remove_from_page_cache(page); 407 __delete_from_page_cache(page);
391 spin_unlock_irq(&mapping->tree_lock); 408 spin_unlock_irq(&mapping->tree_lock);
392 mem_cgroup_uncharge_cache_page(page); 409 mem_cgroup_uncharge_cache_page(page);
410
411 if (mapping->a_ops->freepage)
412 mapping->a_ops->freepage(page);
413
393 page_cache_release(page); /* pagecache ref */ 414 page_cache_release(page); /* pagecache ref */
394 return 1; 415 return 1;
395failed: 416failed:
@@ -428,6 +449,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
428 int did_range_unmap = 0; 449 int did_range_unmap = 0;
429 int wrapped = 0; 450 int wrapped = 0;
430 451
452 cleancache_flush_inode(mapping);
431 pagevec_init(&pvec, 0); 453 pagevec_init(&pvec, 0);
432 next = start; 454 next = start;
433 while (next <= end && !wrapped && 455 while (next <= end && !wrapped &&
@@ -486,6 +508,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
486 mem_cgroup_uncharge_end(); 508 mem_cgroup_uncharge_end();
487 cond_resched(); 509 cond_resched();
488 } 510 }
511 cleancache_flush_inode(mapping);
489 return ret; 512 return ret;
490} 513}
491EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); 514EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
@@ -545,13 +568,12 @@ EXPORT_SYMBOL(truncate_pagecache);
545 * @inode: inode 568 * @inode: inode
546 * @newsize: new file size 569 * @newsize: new file size
547 * 570 *
548 * truncate_setsize updastes i_size update and performs pagecache 571 * truncate_setsize updates i_size and performs pagecache truncation (if
549 * truncation (if necessary) for a file size updates. It will be 572 * necessary) to @newsize. It will be typically be called from the filesystem's
550 * typically be called from the filesystem's setattr function when 573 * setattr function when ATTR_SIZE is passed in.
551 * ATTR_SIZE is passed in.
552 * 574 *
553 * Must be called with inode_mutex held and after all filesystem 575 * Must be called with inode_mutex held and before all filesystem specific
554 * specific block truncation has been performed. 576 * block truncation has been performed.
555 */ 577 */
556void truncate_setsize(struct inode *inode, loff_t newsize) 578void truncate_setsize(struct inode *inode, loff_t newsize)
557{ 579{
@@ -586,3 +608,27 @@ int vmtruncate(struct inode *inode, loff_t offset)
586 return 0; 608 return 0;
587} 609}
588EXPORT_SYMBOL(vmtruncate); 610EXPORT_SYMBOL(vmtruncate);
611
612int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
613{
614 struct address_space *mapping = inode->i_mapping;
615
616 /*
617 * If the underlying filesystem is not going to provide
618 * a way to truncate a range of blocks (punch a hole) -
619 * we should return failure right now.
620 */
621 if (!inode->i_op->truncate_range)
622 return -ENOSYS;
623
624 mutex_lock(&inode->i_mutex);
625 down_write(&inode->i_alloc_sem);
626 unmap_mapping_range(mapping, offset, (end - offset), 1);
627 inode->i_op->truncate_range(inode, offset, end);
628 /* unmap again to remove racily COWed private pages */
629 unmap_mapping_range(mapping, offset, (end - offset), 1);
630 up_write(&inode->i_alloc_sem);
631 mutex_unlock(&inode->i_mutex);
632
633 return 0;
634}
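vmtruncate_range() is the hole-punching backend for filesystems that implement ->truncate_range (shmem/tmpfs at this point); the usual userspace trigger in this era is madvise(MADV_REMOVE) on a shared mapping. A hedged example that punches a hole in shmem-backed memory:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	size_t len = 16 * pg;
	char *p;

	/* Anonymous shared memory is shmem-backed, so ->truncate_range exists */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0xaa, len);                   /* fault in and dirty the pages */

	/* Punch out pages 4..7: frees the backing store, later reads see zeroes */
	if (madvise(p + 4 * pg, 4 * pg, MADV_REMOVE) != 0) {
		perror("madvise(MADV_REMOVE)");  /* EOPNOTSUPP if unsupported */
		return 1;
	}
	printf("byte after punch: %#x (expect 0)\n", p[4 * pg]);
	return 0;
}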
diff --git a/mm/util.c b/mm/util.c
index 4735ea481816..88ea1bd661c0 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -6,6 +6,8 @@
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <asm/uaccess.h> 7#include <asm/uaccess.h>
8 8
9#include "internal.h"
10
9#define CREATE_TRACE_POINTS 11#define CREATE_TRACE_POINTS
10#include <trace/events/kmem.h> 12#include <trace/events/kmem.h>
11 13
@@ -186,27 +188,6 @@ void kzfree(const void *p)
186} 188}
187EXPORT_SYMBOL(kzfree); 189EXPORT_SYMBOL(kzfree);
188 190
189int kern_ptr_validate(const void *ptr, unsigned long size)
190{
191 unsigned long addr = (unsigned long)ptr;
192 unsigned long min_addr = PAGE_OFFSET;
193 unsigned long align_mask = sizeof(void *) - 1;
194
195 if (unlikely(addr < min_addr))
196 goto out;
197 if (unlikely(addr > (unsigned long)high_memory - size))
198 goto out;
199 if (unlikely(addr & align_mask))
200 goto out;
201 if (unlikely(!kern_addr_valid(addr)))
202 goto out;
203 if (unlikely(!kern_addr_valid(addr + size - 1)))
204 goto out;
205 return 1;
206out:
207 return 0;
208}
209
210/* 191/*
211 * strndup_user - duplicate an existing string from user space 192 * strndup_user - duplicate an existing string from user space
212 * @s: The string to duplicate 193 * @s: The string to duplicate
@@ -236,6 +217,28 @@ char *strndup_user(const char __user *s, long n)
236} 217}
237EXPORT_SYMBOL(strndup_user); 218EXPORT_SYMBOL(strndup_user);
238 219
220void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
221 struct vm_area_struct *prev, struct rb_node *rb_parent)
222{
223 struct vm_area_struct *next;
224
225 vma->vm_prev = prev;
226 if (prev) {
227 next = prev->vm_next;
228 prev->vm_next = vma;
229 } else {
230 mm->mmap = vma;
231 if (rb_parent)
232 next = rb_entry(rb_parent,
233 struct vm_area_struct, vm_rb);
234 else
235 next = NULL;
236 }
237 vma->vm_next = next;
238 if (next)
239 next->vm_prev = vma;
240}
241
239#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) 242#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
240void arch_pick_mmap_layout(struct mm_struct *mm) 243void arch_pick_mmap_layout(struct mm_struct *mm)
241{ 244{
@@ -245,6 +248,19 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
245} 248}
246#endif 249#endif
247 250
251/*
252 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
253 * back to the regular GUP.
254 * If the architecture does not support this function, simply return with no
255 * pages pinned.
256 */
257int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
258 int nr_pages, int write, struct page **pages)
259{
260 return 0;
261}
262EXPORT_SYMBOL_GPL(__get_user_pages_fast);
263
248/** 264/**
249 * get_user_pages_fast() - pin user pages in memory 265 * get_user_pages_fast() - pin user pages in memory
250 * @start: starting user address 266 * @start: starting user address
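The __attribute__((weak)) stub gives every architecture a default __get_user_pages_fast() that pins nothing, and an architecture with a real fast-GUP simply provides a strong definition that overrides it at link time. The same pattern in a standalone form (GCC/Clang on ELF; names are illustrative):

#include <stdio.h>

/* Weak default: used only if no other object file defines fast_pin(). */
int __attribute__((weak)) fast_pin(int nr_pages)
{
	return 0;       /* "architecture doesn't support it": nothing pinned */
}

int main(void)
{
	int pinned = fast_pin(8);

	if (pinned < 8)
		printf("fast path pinned %d/8, falling back to slow path\n",
		       pinned);
	return 0;
}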
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6b8889da69a6..1d34d75366a7 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -31,8 +31,6 @@
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/shmparam.h> 32#include <asm/shmparam.h>
33 33
34bool vmap_lazy_unmap __read_mostly = true;
35
36/*** Page table manipulation functions ***/ 34/*** Page table manipulation functions ***/
37 35
38static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) 36static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
@@ -263,8 +261,15 @@ struct vmap_area {
263}; 261};
264 262
265static DEFINE_SPINLOCK(vmap_area_lock); 263static DEFINE_SPINLOCK(vmap_area_lock);
266static struct rb_root vmap_area_root = RB_ROOT;
267static LIST_HEAD(vmap_area_list); 264static LIST_HEAD(vmap_area_list);
265static struct rb_root vmap_area_root = RB_ROOT;
266
267/* The vmap cache globals are protected by vmap_area_lock */
268static struct rb_node *free_vmap_cache;
269static unsigned long cached_hole_size;
270static unsigned long cached_vstart;
271static unsigned long cached_align;
272
268static unsigned long vmap_area_pcpu_hole; 273static unsigned long vmap_area_pcpu_hole;
269 274
270static struct vmap_area *__find_vmap_area(unsigned long addr) 275static struct vmap_area *__find_vmap_area(unsigned long addr)
@@ -293,13 +298,13 @@ static void __insert_vmap_area(struct vmap_area *va)
293 struct rb_node *tmp; 298 struct rb_node *tmp;
294 299
295 while (*p) { 300 while (*p) {
296 struct vmap_area *tmp; 301 struct vmap_area *tmp_va;
297 302
298 parent = *p; 303 parent = *p;
299 tmp = rb_entry(parent, struct vmap_area, rb_node); 304 tmp_va = rb_entry(parent, struct vmap_area, rb_node);
300 if (va->va_start < tmp->va_end) 305 if (va->va_start < tmp_va->va_end)
301 p = &(*p)->rb_left; 306 p = &(*p)->rb_left;
302 else if (va->va_end > tmp->va_start) 307 else if (va->va_end > tmp_va->va_start)
303 p = &(*p)->rb_right; 308 p = &(*p)->rb_right;
304 else 309 else
305 BUG(); 310 BUG();
@@ -333,9 +338,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
333 struct rb_node *n; 338 struct rb_node *n;
334 unsigned long addr; 339 unsigned long addr;
335 int purged = 0; 340 int purged = 0;
341 struct vmap_area *first;
336 342
337 BUG_ON(!size); 343 BUG_ON(!size);
338 BUG_ON(size & ~PAGE_MASK); 344 BUG_ON(size & ~PAGE_MASK);
345 BUG_ON(!is_power_of_2(align));
339 346
340 va = kmalloc_node(sizeof(struct vmap_area), 347 va = kmalloc_node(sizeof(struct vmap_area),
341 gfp_mask & GFP_RECLAIM_MASK, node); 348 gfp_mask & GFP_RECLAIM_MASK, node);
@@ -343,79 +350,106 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
343 return ERR_PTR(-ENOMEM); 350 return ERR_PTR(-ENOMEM);
344 351
345retry: 352retry:
346 addr = ALIGN(vstart, align);
347
348 spin_lock(&vmap_area_lock); 353 spin_lock(&vmap_area_lock);
349 if (addr + size - 1 < addr) 354 /*
350 goto overflow; 355 * Invalidate cache if we have more permissive parameters.
356 * cached_hole_size notes the largest hole noticed _below_
357 * the vmap_area cached in free_vmap_cache: if size fits
358 * into that hole, we want to scan from vstart to reuse
359 * the hole instead of allocating above free_vmap_cache.
360 * Note that __free_vmap_area may update free_vmap_cache
361 * without updating cached_hole_size or cached_align.
362 */
363 if (!free_vmap_cache ||
364 size < cached_hole_size ||
365 vstart < cached_vstart ||
366 align < cached_align) {
367nocache:
368 cached_hole_size = 0;
369 free_vmap_cache = NULL;
370 }
371 /* record if we encounter less permissive parameters */
372 cached_vstart = vstart;
373 cached_align = align;
374
375 /* find starting point for our search */
376 if (free_vmap_cache) {
377 first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
378 addr = ALIGN(first->va_end, align);
379 if (addr < vstart)
380 goto nocache;
381 if (addr + size - 1 < addr)
382 goto overflow;
383
384 } else {
385 addr = ALIGN(vstart, align);
386 if (addr + size - 1 < addr)
387 goto overflow;
351 388
352 /* XXX: could have a last_hole cache */ 389 n = vmap_area_root.rb_node;
353 n = vmap_area_root.rb_node; 390 first = NULL;
354 if (n) {
355 struct vmap_area *first = NULL;
356 391
357 do { 392 while (n) {
358 struct vmap_area *tmp; 393 struct vmap_area *tmp;
359 tmp = rb_entry(n, struct vmap_area, rb_node); 394 tmp = rb_entry(n, struct vmap_area, rb_node);
360 if (tmp->va_end >= addr) { 395 if (tmp->va_end >= addr) {
361 if (!first && tmp->va_start < addr + size)
362 first = tmp;
363 n = n->rb_left;
364 } else {
365 first = tmp; 396 first = tmp;
397 if (tmp->va_start <= addr)
398 break;
399 n = n->rb_left;
400 } else
366 n = n->rb_right; 401 n = n->rb_right;
367 } 402 }
368 } while (n);
369 403
370 if (!first) 404 if (!first)
371 goto found; 405 goto found;
372
373 if (first->va_end < addr) {
374 n = rb_next(&first->rb_node);
375 if (n)
376 first = rb_entry(n, struct vmap_area, rb_node);
377 else
378 goto found;
379 }
380
381 while (addr + size > first->va_start && addr + size <= vend) {
382 addr = ALIGN(first->va_end + PAGE_SIZE, align);
383 if (addr + size - 1 < addr)
384 goto overflow;
385
386 n = rb_next(&first->rb_node);
387 if (n)
388 first = rb_entry(n, struct vmap_area, rb_node);
389 else
390 goto found;
391 }
392 } 406 }
393found: 407
394 if (addr + size > vend) { 408 /* from the starting point, walk areas until a suitable hole is found */
395overflow: 409 while (addr + size > first->va_start && addr + size <= vend) {
396 spin_unlock(&vmap_area_lock); 410 if (addr + cached_hole_size < first->va_start)
397 if (!purged) { 411 cached_hole_size = first->va_start - addr;
398 purge_vmap_area_lazy(); 412 addr = ALIGN(first->va_end, align);
399 purged = 1; 413 if (addr + size - 1 < addr)
400 goto retry; 414 goto overflow;
401 } 415
402 if (printk_ratelimit()) 416 n = rb_next(&first->rb_node);
403 printk(KERN_WARNING 417 if (n)
404 "vmap allocation for size %lu failed: " 418 first = rb_entry(n, struct vmap_area, rb_node);
405 "use vmalloc=<size> to increase size.\n", size); 419 else
406 kfree(va); 420 goto found;
407 return ERR_PTR(-EBUSY);
408 } 421 }
409 422
410 BUG_ON(addr & (align-1)); 423found:
424 if (addr + size > vend)
425 goto overflow;
411 426
412 va->va_start = addr; 427 va->va_start = addr;
413 va->va_end = addr + size; 428 va->va_end = addr + size;
414 va->flags = 0; 429 va->flags = 0;
415 __insert_vmap_area(va); 430 __insert_vmap_area(va);
431 free_vmap_cache = &va->rb_node;
416 spin_unlock(&vmap_area_lock); 432 spin_unlock(&vmap_area_lock);
417 433
434 BUG_ON(va->va_start & (align-1));
435 BUG_ON(va->va_start < vstart);
436 BUG_ON(va->va_end > vend);
437
418 return va; 438 return va;
439
440overflow:
441 spin_unlock(&vmap_area_lock);
442 if (!purged) {
443 purge_vmap_area_lazy();
444 purged = 1;
445 goto retry;
446 }
447 if (printk_ratelimit())
448 printk(KERN_WARNING
449 "vmap allocation for size %lu failed: "
450 "use vmalloc=<size> to increase size.\n", size);
451 kfree(va);
452 return ERR_PTR(-EBUSY);
419} 453}
420 454
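The cache test at the top of alloc_vmap_area() above boils down to one rule: the cached search start is only reusable when the new request is at least as restrictive as the one that populated the cache. A minimal standalone sketch of that predicate (a hypothetical helper; the parameter names mirror the variables in the hunk above):

/* Illustrative only: restates the nocache test in alloc_vmap_area(). */
static int can_reuse_free_vmap_cache(unsigned long size, unsigned long align,
				     unsigned long vstart,
				     unsigned long cached_hole_size,
				     unsigned long cached_vstart,
				     unsigned long cached_align,
				     const void *free_vmap_cache)
{
	if (!free_vmap_cache)
		return 0;	/* nothing cached yet */
	if (size < cached_hole_size)
		return 0;	/* a known hole below the cache could fit this */
	if (vstart < cached_vstart)
		return 0;	/* caller wants to search lower than last time */
	if (align < cached_align)
		return 0;	/* looser alignment may fit in gaps we skipped */
	return 1;
}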
421static void rcu_free_va(struct rcu_head *head) 455static void rcu_free_va(struct rcu_head *head)
@@ -428,6 +462,22 @@ static void rcu_free_va(struct rcu_head *head)
428static void __free_vmap_area(struct vmap_area *va) 462static void __free_vmap_area(struct vmap_area *va)
429{ 463{
430 BUG_ON(RB_EMPTY_NODE(&va->rb_node)); 464 BUG_ON(RB_EMPTY_NODE(&va->rb_node));
465
466 if (free_vmap_cache) {
467 if (va->va_end < cached_vstart) {
468 free_vmap_cache = NULL;
469 } else {
470 struct vmap_area *cache;
471 cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
472 if (va->va_start <= cache->va_start) {
473 free_vmap_cache = rb_prev(&va->rb_node);
474 /*
475 * We don't try to update cached_hole_size or
476 * cached_align, but it won't go very wrong.
477 */
478 }
479 }
480 }
431 rb_erase(&va->rb_node, &vmap_area_root); 481 rb_erase(&va->rb_node, &vmap_area_root);
432 RB_CLEAR_NODE(&va->rb_node); 482 RB_CLEAR_NODE(&va->rb_node);
433 list_del_rcu(&va->list); 483 list_del_rcu(&va->list);
@@ -503,9 +553,6 @@ static unsigned long lazy_max_pages(void)
503{ 553{
504 unsigned int log; 554 unsigned int log;
505 555
506 if (!vmap_lazy_unmap)
507 return 0;
508
509 log = fls(num_online_cpus()); 556 log = fls(num_online_cpus());
510 557
511 return log * (32UL * 1024 * 1024 / PAGE_SIZE); 558 return log * (32UL * 1024 * 1024 / PAGE_SIZE);
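As a quick sanity check of the arithmetic above, a standalone user-space model (assuming 4 KiB pages and treating fls() as the index of the highest set bit):

#include <stdio.h>

/* Illustrative only: user-space model of lazy_max_pages(). */
static unsigned long model_lazy_max_pages(unsigned int online_cpus,
					  unsigned long page_size)
{
	unsigned int log = 0;

	while (online_cpus) {		/* fls(): index of the highest set bit */
		log++;
		online_cpus >>= 1;
	}
	return log * (32UL * 1024 * 1024 / page_size);
}

int main(void)
{
	/* 8 CPUs, 4 KiB pages: fls(8) = 4, so 4 * 32 MiB = 32768 pages */
	printf("%lu\n", model_lazy_max_pages(8, 4096));
	return 0;
}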
@@ -517,6 +564,15 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
517static void purge_fragmented_blocks_allcpus(void); 564static void purge_fragmented_blocks_allcpus(void);
518 565
519/* 566/*
567 * called before a call to iounmap() if the caller wants vm_area_struct's
568 * immediately freed.
569 */
570void set_iounmap_nonlazy(void)
571{
572 atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
573}
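A caller that needs the mapping torn down right away (the crash-dump read path is the motivating user) would pair the two calls roughly like this hypothetical sketch:

/* Illustrative only: force the next unmap to purge lazily-freed areas. */
static void unmap_regs_now(void __iomem *regs)
{
	set_iounmap_nonlazy();
	iounmap(regs);
}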
574
575/*
520 * Purges all lazily-freed vmap areas. 576 * Purges all lazily-freed vmap areas.
521 * 577 *
522 * If sync is 0 then don't purge if there is already a purge in progress. 578 * If sync is 0 then don't purge if there is already a purge in progress.
@@ -557,7 +613,6 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
557 if (va->va_end > *end) 613 if (va->va_end > *end)
558 *end = va->va_end; 614 *end = va->va_end;
559 nr += (va->va_end - va->va_start) >> PAGE_SHIFT; 615 nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
560 unmap_vmap_area(va);
561 list_add_tail(&va->purge_list, &valist); 616 list_add_tail(&va->purge_list, &valist);
562 va->flags |= VM_LAZY_FREEING; 617 va->flags |= VM_LAZY_FREEING;
563 va->flags &= ~VM_LAZY_FREE; 618 va->flags &= ~VM_LAZY_FREE;
@@ -602,10 +657,11 @@ static void purge_vmap_area_lazy(void)
602} 657}
603 658
604/* 659/*
605 * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been 660 * Free a vmap area, caller ensuring that the area has been unmapped
606 * called for the correct range previously. 661 * and flush_cache_vunmap had been called for the correct range
662 * previously.
607 */ 663 */
608static void free_unmap_vmap_area_noflush(struct vmap_area *va) 664static void free_vmap_area_noflush(struct vmap_area *va)
609{ 665{
610 va->flags |= VM_LAZY_FREE; 666 va->flags |= VM_LAZY_FREE;
611 atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); 667 atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
@@ -614,6 +670,16 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va)
614} 670}
615 671
616/* 672/*
673 * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been
674 * called for the correct range previously.
675 */
676static void free_unmap_vmap_area_noflush(struct vmap_area *va)
677{
678 unmap_vmap_area(va);
679 free_vmap_area_noflush(va);
680}
681
682/*
617 * Free and unmap a vmap area 683 * Free and unmap a vmap area
618 */ 684 */
619static void free_unmap_vmap_area(struct vmap_area *va) 685static void free_unmap_vmap_area(struct vmap_area *va)
@@ -734,7 +800,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
734 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, 800 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
735 VMALLOC_START, VMALLOC_END, 801 VMALLOC_START, VMALLOC_END,
736 node, gfp_mask); 802 node, gfp_mask);
737 if (unlikely(IS_ERR(va))) { 803 if (IS_ERR(va)) {
738 kfree(vb); 804 kfree(vb);
739 return ERR_CAST(va); 805 return ERR_CAST(va);
740 } 806 }
@@ -789,7 +855,7 @@ static void free_vmap_block(struct vmap_block *vb)
789 spin_unlock(&vmap_block_tree_lock); 855 spin_unlock(&vmap_block_tree_lock);
790 BUG_ON(tmp != vb); 856 BUG_ON(tmp != vb);
791 857
792 free_unmap_vmap_area_noflush(vb->va); 858 free_vmap_area_noflush(vb->va);
793 call_rcu(&vb->rcu_head, rcu_free_vb); 859 call_rcu(&vb->rcu_head, rcu_free_vb);
794} 860}
795 861
@@ -927,6 +993,8 @@ static void vb_free(const void *addr, unsigned long size)
927 rcu_read_unlock(); 993 rcu_read_unlock();
928 BUG_ON(!vb); 994 BUG_ON(!vb);
929 995
996 vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
997
930 spin_lock(&vb->lock); 998 spin_lock(&vb->lock);
931 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); 999 BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order));
932 1000
@@ -979,7 +1047,6 @@ void vm_unmap_aliases(void)
979 1047
980 s = vb->va->va_start + (i << PAGE_SHIFT); 1048 s = vb->va->va_start + (i << PAGE_SHIFT);
981 e = vb->va->va_start + (j << PAGE_SHIFT); 1049 e = vb->va->va_start + (j << PAGE_SHIFT);
982 vunmap_page_range(s, e);
983 flush = 1; 1050 flush = 1;
984 1051
985 if (s < start) 1052 if (s < start)
@@ -1160,6 +1227,7 @@ void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
1160{ 1227{
1161 vunmap_page_range(addr, addr + size); 1228 vunmap_page_range(addr, addr + size);
1162} 1229}
1230EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
1163 1231
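With the symbol exported, a module can tear down page tables itself and do the cache/TLB maintenance around it; a hedged sketch (the flush calls follow the usual contract for the _noflush variants, they are not something this patch adds):

/* Illustrative only: caller handles cache and TLB flushing around the unmap. */
static void unmap_region_explicit_flush(unsigned long addr, unsigned long size)
{
	flush_cache_vunmap(addr, addr + size);
	unmap_kernel_range_noflush(addr, size);
	flush_tlb_kernel_range(addr, addr + size);
}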
1164/** 1232/**
1165 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB 1233 * unmap_kernel_range - unmap kernel VM area and flush cache and TLB
@@ -1300,13 +1368,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1300 -1, GFP_KERNEL, caller); 1368 -1, GFP_KERNEL, caller);
1301} 1369}
1302 1370
1303struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
1304 int node, gfp_t gfp_mask)
1305{
1306 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1307 node, gfp_mask, __builtin_return_address(0));
1308}
1309
1310static struct vm_struct *find_vm_area(const void *addr) 1371static struct vm_struct *find_vm_area(const void *addr)
1311{ 1372{
1312 struct vmap_area *va; 1373 struct vmap_area *va;
@@ -1473,6 +1534,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1473static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1534static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1474 pgprot_t prot, int node, void *caller) 1535 pgprot_t prot, int node, void *caller)
1475{ 1536{
1537 const int order = 0;
1476 struct page **pages; 1538 struct page **pages;
1477 unsigned int nr_pages, array_size, i; 1539 unsigned int nr_pages, array_size, i;
1478 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; 1540 gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
@@ -1499,11 +1561,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1499 1561
1500 for (i = 0; i < area->nr_pages; i++) { 1562 for (i = 0; i < area->nr_pages; i++) {
1501 struct page *page; 1563 struct page *page;
1564 gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
1502 1565
1503 if (node < 0) 1566 if (node < 0)
1504 page = alloc_page(gfp_mask); 1567 page = alloc_page(tmp_mask);
1505 else 1568 else
1506 page = alloc_pages_node(node, gfp_mask, 0); 1569 page = alloc_pages_node(node, tmp_mask, order);
1507 1570
1508 if (unlikely(!page)) { 1571 if (unlikely(!page)) {
1509 /* Successfully allocated i pages, free them in __vunmap() */ 1572 /* Successfully allocated i pages, free them in __vunmap() */
@@ -1518,29 +1581,19 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1518 return area->addr; 1581 return area->addr;
1519 1582
1520fail: 1583fail:
1584 warn_alloc_failed(gfp_mask, order, "vmalloc: allocation failure, "
1585 "allocated %ld of %ld bytes\n",
1586 (area->nr_pages*PAGE_SIZE), area->size);
1521 vfree(area->addr); 1587 vfree(area->addr);
1522 return NULL; 1588 return NULL;
1523} 1589}
1524 1590
1525void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1526{
1527 void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
1528 __builtin_return_address(0));
1529
1530 /*
1531 * A ref_count = 3 is needed because the vm_struct and vmap_area
1532 * structures allocated in the __get_vm_area_node() function contain
1533 * references to the virtual address of the vmalloc'ed block.
1534 */
1535 kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
1536
1537 return addr;
1538}
1539
1540/** 1591/**
1541 * __vmalloc_node - allocate virtually contiguous memory 1592 * __vmalloc_node_range - allocate virtually contiguous memory
1542 * @size: allocation size 1593 * @size: allocation size
1543 * @align: desired alignment 1594 * @align: desired alignment
1595 * @start: vm area range start
1596 * @end: vm area range end
1544 * @gfp_mask: flags for the page level allocator 1597 * @gfp_mask: flags for the page level allocator
1545 * @prot: protection mask for the allocated pages 1598 * @prot: protection mask for the allocated pages
1546 * @node: node to use for allocation or -1 1599 * @node: node to use for allocation or -1
@@ -1550,9 +1603,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1550 * allocator with @gfp_mask flags. Map them into contiguous 1603 * allocator with @gfp_mask flags. Map them into contiguous
1551 * kernel virtual space, using a pagetable protection of @prot. 1604 * kernel virtual space, using a pagetable protection of @prot.
1552 */ 1605 */
1553static void *__vmalloc_node(unsigned long size, unsigned long align, 1606void *__vmalloc_node_range(unsigned long size, unsigned long align,
1554 gfp_t gfp_mask, pgprot_t prot, 1607 unsigned long start, unsigned long end, gfp_t gfp_mask,
1555 int node, void *caller) 1608 pgprot_t prot, int node, void *caller)
1556{ 1609{
1557 struct vm_struct *area; 1610 struct vm_struct *area;
1558 void *addr; 1611 void *addr;
@@ -1562,8 +1615,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1562 if (!size || (size >> PAGE_SHIFT) > totalram_pages) 1615 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1563 return NULL; 1616 return NULL;
1564 1617
1565 area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, 1618 area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node,
1566 VMALLOC_END, node, gfp_mask, caller); 1619 gfp_mask, caller);
1567 1620
1568 if (!area) 1621 if (!area)
1569 return NULL; 1622 return NULL;
@@ -1580,6 +1633,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
1580 return addr; 1633 return addr;
1581} 1634}
1582 1635
1636/**
1637 * __vmalloc_node - allocate virtually contiguous memory
1638 * @size: allocation size
1639 * @align: desired alignment
1640 * @gfp_mask: flags for the page level allocator
1641 * @prot: protection mask for the allocated pages
1642 * @node: node to use for allocation or -1
1643 * @caller: caller's return address
1644 *
1645 * Allocate enough pages to cover @size from the page level
1646 * allocator with @gfp_mask flags. Map them into contiguous
1647 * kernel virtual space, using a pagetable protection of @prot.
1648 */
1649static void *__vmalloc_node(unsigned long size, unsigned long align,
1650 gfp_t gfp_mask, pgprot_t prot,
1651 int node, void *caller)
1652{
1653 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
1654 gfp_mask, prot, node, caller);
1655}
1656
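One intended user of the range-aware entry point is arch code that must keep allocations inside a dedicated window, e.g. module space. A hedged sketch (MODULES_VADDR/MODULES_END stand in for whatever window the architecture defines):

/* Illustrative only: place an allocation inside an arch-specific window. */
static void *alloc_in_module_space(unsigned long size)
{
	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
				    GFP_KERNEL, PAGE_KERNEL_EXEC, -1,
				    __builtin_return_address(0));
}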
1583void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1657void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1584{ 1658{
1585 return __vmalloc_node(size, 1, gfp_mask, prot, -1, 1659 return __vmalloc_node(size, 1, gfp_mask, prot, -1,
@@ -1587,6 +1661,13 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1587} 1661}
1588EXPORT_SYMBOL(__vmalloc); 1662EXPORT_SYMBOL(__vmalloc);
1589 1663
1664static inline void *__vmalloc_node_flags(unsigned long size,
1665 int node, gfp_t flags)
1666{
1667 return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
1668 node, __builtin_return_address(0));
1669}
1670
1590/** 1671/**
1591 * vmalloc - allocate virtually contiguous memory 1672 * vmalloc - allocate virtually contiguous memory
1592 * @size: allocation size 1673 * @size: allocation size
@@ -1598,12 +1679,28 @@ EXPORT_SYMBOL(__vmalloc);
1598 */ 1679 */
1599void *vmalloc(unsigned long size) 1680void *vmalloc(unsigned long size)
1600{ 1681{
1601 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1682 return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM);
1602 -1, __builtin_return_address(0));
1603} 1683}
1604EXPORT_SYMBOL(vmalloc); 1684EXPORT_SYMBOL(vmalloc);
1605 1685
1606/** 1686/**
1687 * vzalloc - allocate virtually contiguous memory with zero fill
1688 * @size: allocation size
1689 * Allocate enough pages to cover @size from the page level
1690 * allocator and map them into contiguous kernel virtual space.
1691 * The memory allocated is set to zero.
1692 *
1693 * For tight control over page level allocator and protection flags
1694 * use __vmalloc() instead.
1695 */
1696void *vzalloc(unsigned long size)
1697{
1698 return __vmalloc_node_flags(size, -1,
1699 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
1700}
1701EXPORT_SYMBOL(vzalloc);
1702
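vzalloc() is simply the vmalloc()-plus-memset() idiom folded into one call; a minimal sketch of a hypothetical caller:

/* Illustrative only: zeroed, virtually contiguous allocation. */
static unsigned long *alloc_counters(unsigned long nr)
{
	return vzalloc(nr * sizeof(unsigned long));	/* freed later with vfree() */
}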
1703/**
1607 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace 1704 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
1608 * @size: allocation size 1705 * @size: allocation size
1609 * 1706 *
@@ -1644,6 +1741,25 @@ void *vmalloc_node(unsigned long size, int node)
1644} 1741}
1645EXPORT_SYMBOL(vmalloc_node); 1742EXPORT_SYMBOL(vmalloc_node);
1646 1743
1744/**
1745 * vzalloc_node - allocate memory on a specific node with zero fill
1746 * @size: allocation size
1747 * @node: numa node
1748 *
1749 * Allocate enough pages to cover @size from the page level
1750 * allocator and map them into contiguous kernel virtual space.
1751 * The memory allocated is set to zero.
1752 *
1753 * For tight control over page level allocator and protection flags
1754 * use __vmalloc_node() instead.
1755 */
1756void *vzalloc_node(unsigned long size, int node)
1757{
1758 return __vmalloc_node_flags(size, node,
1759 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
1760}
1761EXPORT_SYMBOL(vzalloc_node);
1762
1647#ifndef PAGE_KERNEL_EXEC 1763#ifndef PAGE_KERNEL_EXEC
1648# define PAGE_KERNEL_EXEC PAGE_KERNEL 1764# define PAGE_KERNEL_EXEC PAGE_KERNEL
1649#endif 1765#endif
@@ -1892,8 +2008,6 @@ finished:
1892 * should know vmalloc() area is valid and can use memcpy(). 2008 * should know vmalloc() area is valid and can use memcpy().
1893 * This is for routines which have to access vmalloc area without 2009 * This is for routines which have to access vmalloc area without
1894 * any informaion, as /dev/kmem. 2010 * any informaion, as /dev/kmem.
1895 *
1896 * The caller should guarantee KM_USER1 is not used.
1897 */ 2011 */
1898 2012
1899long vwrite(char *buf, char *addr, unsigned long count) 2013long vwrite(char *buf, char *addr, unsigned long count)
@@ -2039,10 +2153,6 @@ struct vm_struct *alloc_vm_area(size_t size)
2039 return NULL; 2153 return NULL;
2040 } 2154 }
2041 2155
2042 /* Make sure the pagetables are constructed in process kernel
2043 mappings */
2044 vmalloc_sync_all();
2045
2046 return area; 2156 return area;
2047} 2157}
2048EXPORT_SYMBOL_GPL(alloc_vm_area); 2158EXPORT_SYMBOL_GPL(alloc_vm_area);
@@ -2056,6 +2166,7 @@ void free_vm_area(struct vm_struct *area)
2056} 2166}
2057EXPORT_SYMBOL_GPL(free_vm_area); 2167EXPORT_SYMBOL_GPL(free_vm_area);
2058 2168
2169#ifdef CONFIG_SMP
2059static struct vmap_area *node_to_va(struct rb_node *n) 2170static struct vmap_area *node_to_va(struct rb_node *n)
2060{ 2171{
2061 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; 2172 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
@@ -2145,17 +2256,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
2145 * @sizes: array containing size of each area 2256 * @sizes: array containing size of each area
2146 * @nr_vms: the number of areas to allocate 2257 * @nr_vms: the number of areas to allocate
2147 * @align: alignment, all entries in @offsets and @sizes must be aligned to this 2258 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
2148 * @gfp_mask: allocation mask
2149 * 2259 *
2150 * Returns: kmalloc'd vm_struct pointer array pointing to allocated 2260 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
2151 * vm_structs on success, %NULL on failure 2261 * vm_structs on success, %NULL on failure
2152 * 2262 *
2153 * Percpu allocator wants to use congruent vm areas so that it can 2263 * Percpu allocator wants to use congruent vm areas so that it can
2154 * maintain the offsets among percpu areas. This function allocates 2264 * maintain the offsets among percpu areas. This function allocates
2155 * congruent vmalloc areas for it. These areas tend to be scattered 2265 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
2156 * pretty far, distance between two areas easily going up to 2266 * be scattered pretty far, distance between two areas easily going up
2157 * gigabytes. To avoid interacting with regular vmallocs, these areas 2267 * to gigabytes. To avoid interacting with regular vmallocs, these
2158 * are allocated from top. 2268 * areas are allocated from top.
2159 * 2269 *
2160 * Despite its complicated look, this allocator is rather simple. It 2270 * Despite its complicated look, this allocator is rather simple. It
2161 * does everything top-down and scans areas from the end looking for 2271 * does everything top-down and scans areas from the end looking for
@@ -2166,7 +2276,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext,
2166 */ 2276 */
2167struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, 2277struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2168 const size_t *sizes, int nr_vms, 2278 const size_t *sizes, int nr_vms,
2169 size_t align, gfp_t gfp_mask) 2279 size_t align)
2170{ 2280{
2171 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); 2281 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
2172 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); 2282 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
@@ -2176,8 +2286,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2176 unsigned long base, start, end, last_end; 2286 unsigned long base, start, end, last_end;
2177 bool purged = false; 2287 bool purged = false;
2178 2288
2179 gfp_mask &= GFP_RECLAIM_MASK;
2180
2181 /* verify parameters and allocate data structures */ 2289 /* verify parameters and allocate data structures */
2182 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); 2290 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
2183 for (last_area = 0, area = 0; area < nr_vms; area++) { 2291 for (last_area = 0, area = 0; area < nr_vms; area++) {
@@ -2210,14 +2318,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2210 return NULL; 2318 return NULL;
2211 } 2319 }
2212 2320
2213 vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); 2321 vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL);
2214 vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); 2322 vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL);
2215 if (!vas || !vms) 2323 if (!vas || !vms)
2216 goto err_free; 2324 goto err_free;
2217 2325
2218 for (area = 0; area < nr_vms; area++) { 2326 for (area = 0; area < nr_vms; area++) {
2219 vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); 2327 vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
2220 vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); 2328 vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
2221 if (!vas[area] || !vms[area]) 2329 if (!vas[area] || !vms[area])
2222 goto err_free; 2330 goto err_free;
2223 } 2331 }
@@ -2336,9 +2444,11 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2336 free_vm_area(vms[i]); 2444 free_vm_area(vms[i]);
2337 kfree(vms); 2445 kfree(vms);
2338} 2446}
2447#endif /* CONFIG_SMP */
2339 2448
2340#ifdef CONFIG_PROC_FS 2449#ifdef CONFIG_PROC_FS
2341static void *s_start(struct seq_file *m, loff_t *pos) 2450static void *s_start(struct seq_file *m, loff_t *pos)
2451 __acquires(&vmlist_lock)
2342{ 2452{
2343 loff_t n = *pos; 2453 loff_t n = *pos;
2344 struct vm_struct *v; 2454 struct vm_struct *v;
@@ -2365,6 +2475,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
2365} 2475}
2366 2476
2367static void s_stop(struct seq_file *m, void *p) 2477static void s_stop(struct seq_file *m, void *p)
2478 __releases(&vmlist_lock)
2368{ 2479{
2369 read_unlock(&vmlist_lock); 2480 read_unlock(&vmlist_lock);
2370} 2481}
@@ -2395,13 +2506,8 @@ static int s_show(struct seq_file *m, void *p)
2395 seq_printf(m, "0x%p-0x%p %7ld", 2506 seq_printf(m, "0x%p-0x%p %7ld",
2396 v->addr, v->addr + v->size, v->size); 2507 v->addr, v->addr + v->size, v->size);
2397 2508
2398 if (v->caller) { 2509 if (v->caller)
2399 char buff[KSYM_SYMBOL_LEN]; 2510 seq_printf(m, " %pS", v->caller);
2400
2401 seq_putc(m, ' ');
2402 sprint_symbol(buff, (unsigned long)v->caller);
2403 seq_puts(m, buff);
2404 }
2405 2511
2406 if (v->nr_pages) 2512 if (v->nr_pages)
2407 seq_printf(m, " pages=%d", v->nr_pages); 2513 seq_printf(m, " pages=%d", v->nr_pages);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c5dfabf25f11..d036e59d302b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@
32#include <linux/topology.h> 32#include <linux/topology.h>
33#include <linux/cpu.h> 33#include <linux/cpu.h>
34#include <linux/cpuset.h> 34#include <linux/cpuset.h>
35#include <linux/compaction.h>
35#include <linux/notifier.h> 36#include <linux/notifier.h>
36#include <linux/rwsem.h> 37#include <linux/rwsem.h>
37#include <linux/delay.h> 38#include <linux/delay.h>
@@ -40,6 +41,8 @@
40#include <linux/memcontrol.h> 41#include <linux/memcontrol.h>
41#include <linux/delayacct.h> 42#include <linux/delayacct.h>
42#include <linux/sysctl.h> 43#include <linux/sysctl.h>
44#include <linux/oom.h>
45#include <linux/prefetch.h>
43 46
44#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
45#include <asm/div64.h> 48#include <asm/div64.h>
@@ -51,6 +54,24 @@
51#define CREATE_TRACE_POINTS 54#define CREATE_TRACE_POINTS
52#include <trace/events/vmscan.h> 55#include <trace/events/vmscan.h>
53 56
57/*
58 * reclaim_mode determines how the inactive list is shrunk
59 * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages
60 * RECLAIM_MODE_ASYNC: Do not block
61 * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback
62 * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference
63 * page from the LRU and reclaim all pages within a
64 * naturally aligned range
65 * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of
66 * order-0 pages and then compact the zone
67 */
68typedef unsigned __bitwise__ reclaim_mode_t;
69#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u)
70#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u)
71#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u)
72#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u)
73#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u)
74
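Because reclaim_mode is a bitmask, a mode is always one "what" flag (SINGLE, LUMPYRECLAIM or COMPACTION) combined with one "how" flag (ASYNC or SYNC), and callers test it with a bitwise AND, for example:

/* Illustrative only: testing the combined mode bits. */
static int reclaim_may_block(reclaim_mode_t mode)
{
	/* e.g. RECLAIM_MODE_COMPACTION | RECLAIM_MODE_SYNC for costly orders */
	return (mode & RECLAIM_MODE_SYNC) != 0;
}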
54struct scan_control { 75struct scan_control {
55 /* Incremented by the number of inactive pages that were scanned */ 76 /* Incremented by the number of inactive pages that were scanned */
56 unsigned long nr_scanned; 77 unsigned long nr_scanned;
@@ -79,10 +100,10 @@ struct scan_control {
79 int order; 100 int order;
80 101
81 /* 102 /*
82 * Intend to reclaim enough contenious memory rather than to reclaim 103 * Intend to reclaim enough continuous memory rather than reclaim
 83 * enough amount memory. I.e, it's the mode for high order allocation. 104 * enough amount of memory, i.e., the mode for high-order allocation.
84 */ 105 */
85 bool lumpy_reclaim_mode; 106 reclaim_mode_t reclaim_mode;
86 107
87 /* Which cgroup do we reclaim from */ 108 /* Which cgroup do we reclaim from */
88 struct mem_cgroup *mem_cgroup; 109 struct mem_cgroup *mem_cgroup;
@@ -152,7 +173,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
152 struct scan_control *sc, enum lru_list lru) 173 struct scan_control *sc, enum lru_list lru)
153{ 174{
154 if (!scanning_global_lru(sc)) 175 if (!scanning_global_lru(sc))
155 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); 176 return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru);
156 177
157 return zone_page_state(zone, NR_LRU_BASE + lru); 178 return zone_page_state(zone, NR_LRU_BASE + lru);
158} 179}
@@ -181,6 +202,14 @@ void unregister_shrinker(struct shrinker *shrinker)
181} 202}
182EXPORT_SYMBOL(unregister_shrinker); 203EXPORT_SYMBOL(unregister_shrinker);
183 204
205static inline int do_shrinker_shrink(struct shrinker *shrinker,
206 struct shrink_control *sc,
207 unsigned long nr_to_scan)
208{
209 sc->nr_to_scan = nr_to_scan;
210 return (*shrinker->shrink)(shrinker, sc);
211}
212
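do_shrinker_shrink() wraps the new callback convention: the ->shrink hook is handed a struct shrink_control, and a zero nr_to_scan means "just report how many objects you have". A hedged sketch of a hypothetical cache using it (my_cache_count()/my_cache_evict() are made-up helpers):

/* Illustrative only: hypothetical shrinker on the shrink_control API. */
static int my_cache_shrink(struct shrinker *s, struct shrink_control *sc)
{
	if (!sc->nr_to_scan)
		return my_cache_count();	/* query pass: report pool size */

	my_cache_evict(sc->nr_to_scan);		/* try to drop that many objects */
	return my_cache_count();		/* objects left (or -1 on no progress) */
}

static struct shrinker my_cache_shrinker = {
	.shrink	= my_cache_shrink,
	.seeks	= DEFAULT_SEEKS,		/* registered via register_shrinker() */
};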
184#define SHRINK_BATCH 128 213#define SHRINK_BATCH 128
185/* 214/*
186 * Call the shrink functions to age shrinkable caches 215 * Call the shrink functions to age shrinkable caches
@@ -201,25 +230,29 @@ EXPORT_SYMBOL(unregister_shrinker);
201 * 230 *
202 * Returns the number of slab objects which we shrunk. 231 * Returns the number of slab objects which we shrunk.
203 */ 232 */
204unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, 233unsigned long shrink_slab(struct shrink_control *shrink,
205 unsigned long lru_pages) 234 unsigned long nr_pages_scanned,
235 unsigned long lru_pages)
206{ 236{
207 struct shrinker *shrinker; 237 struct shrinker *shrinker;
208 unsigned long ret = 0; 238 unsigned long ret = 0;
209 239
210 if (scanned == 0) 240 if (nr_pages_scanned == 0)
211 scanned = SWAP_CLUSTER_MAX; 241 nr_pages_scanned = SWAP_CLUSTER_MAX;
212 242
213 if (!down_read_trylock(&shrinker_rwsem)) 243 if (!down_read_trylock(&shrinker_rwsem)) {
214 return 1; /* Assume we'll be able to shrink next time */ 244 /* Assume we'll be able to shrink next time */
245 ret = 1;
246 goto out;
247 }
215 248
216 list_for_each_entry(shrinker, &shrinker_list, list) { 249 list_for_each_entry(shrinker, &shrinker_list, list) {
217 unsigned long long delta; 250 unsigned long long delta;
218 unsigned long total_scan; 251 unsigned long total_scan;
219 unsigned long max_pass; 252 unsigned long max_pass;
220 253
221 max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask); 254 max_pass = do_shrinker_shrink(shrinker, shrink, 0);
222 delta = (4 * scanned) / shrinker->seeks; 255 delta = (4 * nr_pages_scanned) / shrinker->seeks;
223 delta *= max_pass; 256 delta *= max_pass;
224 do_div(delta, lru_pages + 1); 257 do_div(delta, lru_pages + 1);
225 shrinker->nr += delta; 258 shrinker->nr += delta;
@@ -246,9 +279,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
246 int shrink_ret; 279 int shrink_ret;
247 int nr_before; 280 int nr_before;
248 281
249 nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask); 282 nr_before = do_shrinker_shrink(shrinker, shrink, 0);
250 shrink_ret = (*shrinker->shrink)(shrinker, this_scan, 283 shrink_ret = do_shrinker_shrink(shrinker, shrink,
251 gfp_mask); 284 this_scan);
252 if (shrink_ret == -1) 285 if (shrink_ret == -1)
253 break; 286 break;
254 if (shrink_ret < nr_before) 287 if (shrink_ret < nr_before)
@@ -262,9 +295,44 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
262 shrinker->nr += total_scan; 295 shrinker->nr += total_scan;
263 } 296 }
264 up_read(&shrinker_rwsem); 297 up_read(&shrinker_rwsem);
298out:
299 cond_resched();
265 return ret; 300 return ret;
266} 301}
267 302
303static void set_reclaim_mode(int priority, struct scan_control *sc,
304 bool sync)
305{
306 reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC;
307
308 /*
309 * Initially assume we are entering either lumpy reclaim or
 310 * reclaim/compaction. Depending on the order, we will either set the
311 * sync mode or just reclaim order-0 pages later.
312 */
313 if (COMPACTION_BUILD)
314 sc->reclaim_mode = RECLAIM_MODE_COMPACTION;
315 else
316 sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM;
317
318 /*
319 * Avoid using lumpy reclaim or reclaim/compaction if possible by
 320 * restricting when it's set to either costly allocations or when
 321 * under memory pressure.
322 */
323 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
324 sc->reclaim_mode |= syncmode;
325 else if (sc->order && priority < DEF_PRIORITY - 2)
326 sc->reclaim_mode |= syncmode;
327 else
328 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
329}
330
331static void reset_reclaim_mode(struct scan_control *sc)
332{
333 sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC;
334}
335
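Spelled out, set_reclaim_mode() resolves to three outcomes; a standalone model of the decision (assuming the usual PAGE_ALLOC_COSTLY_ORDER of 3 and DEF_PRIORITY of 12):

/* Illustrative only: user-space model of the set_reclaim_mode() decision. */
#define MODEL_COSTLY_ORDER	3		/* PAGE_ALLOC_COSTLY_ORDER */
#define MODEL_DEF_PRIORITY	12		/* DEF_PRIORITY */

static unsigned int model_reclaim_mode(int order, int priority,
				       int compaction_build, int sync)
{
	unsigned int base = compaction_build ? 0x10u /* COMPACTION */
					     : 0x08u /* LUMPYRECLAIM */;
	unsigned int how = sync ? 0x04u /* SYNC */ : 0x02u /* ASYNC */;

	if (order > MODEL_COSTLY_ORDER)
		return base | how;		/* costly allocation */
	if (order && priority < MODEL_DEF_PRIORITY - 2)
		return base | how;		/* under sustained pressure */
	return 0x01u | 0x02u;			/* SINGLE | ASYNC */
}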
268static inline int is_page_cache_freeable(struct page *page) 336static inline int is_page_cache_freeable(struct page *page)
269{ 337{
270 /* 338 /*
@@ -275,7 +343,8 @@ static inline int is_page_cache_freeable(struct page *page)
275 return page_count(page) - page_has_private(page) == 2; 343 return page_count(page) - page_has_private(page) == 2;
276} 344}
277 345
278static int may_write_to_queue(struct backing_dev_info *bdi) 346static int may_write_to_queue(struct backing_dev_info *bdi,
347 struct scan_control *sc)
279{ 348{
280 if (current->flags & PF_SWAPWRITE) 349 if (current->flags & PF_SWAPWRITE)
281 return 1; 350 return 1;
@@ -283,6 +352,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
283 return 1; 352 return 1;
284 if (bdi == current->backing_dev_info) 353 if (bdi == current->backing_dev_info)
285 return 1; 354 return 1;
355
 356 /* lumpy reclaim for hugepages often needs a lot of writes */
357 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
358 return 1;
286 return 0; 359 return 0;
287} 360}
288 361
@@ -301,18 +374,12 @@ static int may_write_to_queue(struct backing_dev_info *bdi)
301static void handle_write_error(struct address_space *mapping, 374static void handle_write_error(struct address_space *mapping,
302 struct page *page, int error) 375 struct page *page, int error)
303{ 376{
304 lock_page_nosync(page); 377 lock_page(page);
305 if (page_mapping(page) == mapping) 378 if (page_mapping(page) == mapping)
306 mapping_set_error(mapping, error); 379 mapping_set_error(mapping, error);
307 unlock_page(page); 380 unlock_page(page);
308} 381}
309 382
310/* Request for sync pageout. */
311enum pageout_io {
312 PAGEOUT_IO_ASYNC,
313 PAGEOUT_IO_SYNC,
314};
315
316/* possible outcome of pageout() */ 383/* possible outcome of pageout() */
317typedef enum { 384typedef enum {
318 /* failed to write page out, page is locked */ 385 /* failed to write page out, page is locked */
@@ -330,7 +397,7 @@ typedef enum {
330 * Calls ->writepage(). 397 * Calls ->writepage().
331 */ 398 */
332static pageout_t pageout(struct page *page, struct address_space *mapping, 399static pageout_t pageout(struct page *page, struct address_space *mapping,
333 enum pageout_io sync_writeback) 400 struct scan_control *sc)
334{ 401{
335 /* 402 /*
336 * If the page is dirty, only perform writeback if that write 403 * If the page is dirty, only perform writeback if that write
@@ -366,7 +433,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
366 } 433 }
367 if (mapping->a_ops->writepage == NULL) 434 if (mapping->a_ops->writepage == NULL)
368 return PAGE_ACTIVATE; 435 return PAGE_ACTIVATE;
369 if (!may_write_to_queue(mapping->backing_dev_info)) 436 if (!may_write_to_queue(mapping->backing_dev_info, sc))
370 return PAGE_KEEP; 437 return PAGE_KEEP;
371 438
372 if (clear_page_dirty_for_io(page)) { 439 if (clear_page_dirty_for_io(page)) {
@@ -376,7 +443,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
376 .nr_to_write = SWAP_CLUSTER_MAX, 443 .nr_to_write = SWAP_CLUSTER_MAX,
377 .range_start = 0, 444 .range_start = 0,
378 .range_end = LLONG_MAX, 445 .range_end = LLONG_MAX,
379 .nonblocking = 1,
380 .for_reclaim = 1, 446 .for_reclaim = 1,
381 }; 447 };
382 448
@@ -394,7 +460,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
394 * direct reclaiming a large contiguous area and the 460 * direct reclaiming a large contiguous area and the
395 * first attempt to free a range of pages fails. 461 * first attempt to free a range of pages fails.
396 */ 462 */
397 if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) 463 if (PageWriteback(page) &&
464 (sc->reclaim_mode & RECLAIM_MODE_SYNC))
398 wait_on_page_writeback(page); 465 wait_on_page_writeback(page);
399 466
400 if (!PageWriteback(page)) { 467 if (!PageWriteback(page)) {
@@ -402,7 +469,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
402 ClearPageReclaim(page); 469 ClearPageReclaim(page);
403 } 470 }
404 trace_mm_vmscan_writepage(page, 471 trace_mm_vmscan_writepage(page,
405 trace_reclaim_flags(page, sync_writeback)); 472 trace_reclaim_flags(page, sc->reclaim_mode));
406 inc_zone_page_state(page, NR_VMSCAN_WRITE); 473 inc_zone_page_state(page, NR_VMSCAN_WRITE);
407 return PAGE_SUCCESS; 474 return PAGE_SUCCESS;
408 } 475 }
@@ -459,9 +526,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
459 spin_unlock_irq(&mapping->tree_lock); 526 spin_unlock_irq(&mapping->tree_lock);
460 swapcache_free(swap, page); 527 swapcache_free(swap, page);
461 } else { 528 } else {
462 __remove_from_page_cache(page); 529 void (*freepage)(struct page *);
530
531 freepage = mapping->a_ops->freepage;
532
533 __delete_from_page_cache(page);
463 spin_unlock_irq(&mapping->tree_lock); 534 spin_unlock_irq(&mapping->tree_lock);
464 mem_cgroup_uncharge_cache_page(page); 535 mem_cgroup_uncharge_cache_page(page);
536
537 if (freepage != NULL)
538 freepage(page);
465 } 539 }
466 540
467 return 1; 541 return 1;
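The a_ops->freepage hook used above lets a filesystem drop per-page metadata once the page has left the page cache; note that it is called only after tree_lock has been released. A hypothetical sketch (the myfs_* names are made up):

/* Illustrative only: hypothetical ->freepage implementation. */
static void myfs_freepage(struct page *page)
{
	myfs_drop_page_metadata(page);		/* fs-private bookkeeping */
}

static const struct address_space_operations myfs_aops = {
	.freepage	= myfs_freepage,
	/* .readpage, .writepage, ... */
};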
@@ -580,7 +654,7 @@ static enum page_references page_check_references(struct page *page,
580 referenced_page = TestClearPageReferenced(page); 654 referenced_page = TestClearPageReferenced(page);
581 655
582 /* Lumpy reclaim - ignore references */ 656 /* Lumpy reclaim - ignore references */
583 if (sc->lumpy_reclaim_mode) 657 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
584 return PAGEREF_RECLAIM; 658 return PAGEREF_RECLAIM;
585 659
586 /* 660 /*
@@ -616,7 +690,7 @@ static enum page_references page_check_references(struct page *page,
616 } 690 }
617 691
618 /* Reclaim if clean, defer dirty pages to writeback */ 692 /* Reclaim if clean, defer dirty pages to writeback */
619 if (referenced_page) 693 if (referenced_page && !PageSwapBacked(page))
620 return PAGEREF_RECLAIM_CLEAN; 694 return PAGEREF_RECLAIM_CLEAN;
621 695
622 return PAGEREF_RECLAIM; 696 return PAGEREF_RECLAIM;
@@ -644,12 +718,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
644 * shrink_page_list() returns the number of reclaimed pages 718 * shrink_page_list() returns the number of reclaimed pages
645 */ 719 */
646static unsigned long shrink_page_list(struct list_head *page_list, 720static unsigned long shrink_page_list(struct list_head *page_list,
647 struct scan_control *sc, 721 struct zone *zone,
648 enum pageout_io sync_writeback) 722 struct scan_control *sc)
649{ 723{
650 LIST_HEAD(ret_pages); 724 LIST_HEAD(ret_pages);
651 LIST_HEAD(free_pages); 725 LIST_HEAD(free_pages);
652 int pgactivate = 0; 726 int pgactivate = 0;
727 unsigned long nr_dirty = 0;
728 unsigned long nr_congested = 0;
653 unsigned long nr_reclaimed = 0; 729 unsigned long nr_reclaimed = 0;
654 730
655 cond_resched(); 731 cond_resched();
@@ -669,6 +745,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
669 goto keep; 745 goto keep;
670 746
671 VM_BUG_ON(PageActive(page)); 747 VM_BUG_ON(PageActive(page));
748 VM_BUG_ON(page_zone(page) != zone);
672 749
673 sc->nr_scanned++; 750 sc->nr_scanned++;
674 751
@@ -694,10 +771,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
694 * for any page for which writeback has already 771 * for any page for which writeback has already
695 * started. 772 * started.
696 */ 773 */
697 if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) 774 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
775 may_enter_fs)
698 wait_on_page_writeback(page); 776 wait_on_page_writeback(page);
699 else 777 else {
700 goto keep_locked; 778 unlock_page(page);
779 goto keep_lumpy;
780 }
701 } 781 }
702 782
703 references = page_check_references(page, sc); 783 references = page_check_references(page, sc);
@@ -743,6 +823,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
743 } 823 }
744 824
745 if (PageDirty(page)) { 825 if (PageDirty(page)) {
826 nr_dirty++;
827
746 if (references == PAGEREF_RECLAIM_CLEAN) 828 if (references == PAGEREF_RECLAIM_CLEAN)
747 goto keep_locked; 829 goto keep_locked;
748 if (!may_enter_fs) 830 if (!may_enter_fs)
@@ -751,14 +833,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
751 goto keep_locked; 833 goto keep_locked;
752 834
753 /* Page is dirty, try to write it out here */ 835 /* Page is dirty, try to write it out here */
754 switch (pageout(page, mapping, sync_writeback)) { 836 switch (pageout(page, mapping, sc)) {
755 case PAGE_KEEP: 837 case PAGE_KEEP:
838 nr_congested++;
756 goto keep_locked; 839 goto keep_locked;
757 case PAGE_ACTIVATE: 840 case PAGE_ACTIVATE:
758 goto activate_locked; 841 goto activate_locked;
759 case PAGE_SUCCESS: 842 case PAGE_SUCCESS:
760 if (PageWriteback(page) || PageDirty(page)) 843 if (PageWriteback(page))
844 goto keep_lumpy;
845 if (PageDirty(page))
761 goto keep; 846 goto keep;
847
762 /* 848 /*
763 * A synchronous write - probably a ramdisk. Go 849 * A synchronous write - probably a ramdisk. Go
764 * ahead and try to reclaim the page. 850 * ahead and try to reclaim the page.
@@ -841,6 +927,7 @@ cull_mlocked:
841 try_to_free_swap(page); 927 try_to_free_swap(page);
842 unlock_page(page); 928 unlock_page(page);
843 putback_lru_page(page); 929 putback_lru_page(page);
930 reset_reclaim_mode(sc);
844 continue; 931 continue;
845 932
846activate_locked: 933activate_locked:
@@ -853,10 +940,21 @@ activate_locked:
853keep_locked: 940keep_locked:
854 unlock_page(page); 941 unlock_page(page);
855keep: 942keep:
943 reset_reclaim_mode(sc);
944keep_lumpy:
856 list_add(&page->lru, &ret_pages); 945 list_add(&page->lru, &ret_pages);
857 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 946 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
858 } 947 }
859 948
949 /*
950 * Tag a zone as congested if all the dirty pages encountered were
951 * backed by a congested BDI. In this case, reclaimers should just
952 * back off and wait for congestion to clear because further reclaim
953 * will encounter the same problem
954 */
955 if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc))
956 zone_set_flag(zone, ZONE_CONGESTED);
957
860 free_page_list(&free_pages); 958 free_page_list(&free_pages);
861 959
862 list_splice(&ret_pages, page_list); 960 list_splice(&ret_pages, page_list);
@@ -962,7 +1060,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
962 case 0: 1060 case 0:
963 list_move(&page->lru, dst); 1061 list_move(&page->lru, dst);
964 mem_cgroup_del_lru(page); 1062 mem_cgroup_del_lru(page);
965 nr_taken++; 1063 nr_taken += hpage_nr_pages(page);
966 break; 1064 break;
967 1065
968 case -EBUSY: 1066 case -EBUSY:
@@ -983,7 +1081,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
983 * surrounding the tag page. Only take those pages of 1081 * surrounding the tag page. Only take those pages of
984 * the same active state as that tag page. We may safely 1082 * the same active state as that tag page. We may safely
985 * round the target page pfn down to the requested order 1083 * round the target page pfn down to the requested order
986 * as the mem_map is guarenteed valid out to MAX_ORDER, 1084 * as the mem_map is guaranteed valid out to MAX_ORDER,
987 * where that page is in a different zone we will detect 1085 * where that page is in a different zone we will detect
988 * it from its zone id and abort this block scan. 1086 * it from its zone id and abort this block scan.
989 */ 1087 */
@@ -1006,7 +1104,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1006 1104
1007 /* Check that we have not crossed a zone boundary. */ 1105 /* Check that we have not crossed a zone boundary. */
1008 if (unlikely(page_zone_id(cursor_page) != zone_id)) 1106 if (unlikely(page_zone_id(cursor_page) != zone_id))
1009 continue; 1107 break;
1010 1108
1011 /* 1109 /*
1012 * If we don't have enough swap space, reclaiming of 1110 * If we don't have enough swap space, reclaiming of
@@ -1014,23 +1112,40 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1014 * pointless. 1112 * pointless.
1015 */ 1113 */
1016 if (nr_swap_pages <= 0 && PageAnon(cursor_page) && 1114 if (nr_swap_pages <= 0 && PageAnon(cursor_page) &&
1017 !PageSwapCache(cursor_page)) 1115 !PageSwapCache(cursor_page))
1018 continue; 1116 break;
1019 1117
1020 if (__isolate_lru_page(cursor_page, mode, file) == 0) { 1118 if (__isolate_lru_page(cursor_page, mode, file) == 0) {
1021 list_move(&cursor_page->lru, dst); 1119 list_move(&cursor_page->lru, dst);
1022 mem_cgroup_del_lru(cursor_page); 1120 mem_cgroup_del_lru(cursor_page);
1023 nr_taken++; 1121 nr_taken += hpage_nr_pages(page);
1024 nr_lumpy_taken++; 1122 nr_lumpy_taken++;
1025 if (PageDirty(cursor_page)) 1123 if (PageDirty(cursor_page))
1026 nr_lumpy_dirty++; 1124 nr_lumpy_dirty++;
1027 scan++; 1125 scan++;
1028 } else { 1126 } else {
1029 if (mode == ISOLATE_BOTH && 1127 /*
1030 page_count(cursor_page)) 1128 * Check if the page is freed already.
1031 nr_lumpy_failed++; 1129 *
1130 * We can't use page_count() as that
1131 * requires compound_head and we don't
1132 * have a pin on the page here. If a
1133 * page is tail, we may or may not
1134 * have isolated the head, so assume
1135 * it's not free, it'd be tricky to
1136 * track the head status without a
1137 * page pin.
1138 */
1139 if (!PageTail(cursor_page) &&
1140 !atomic_read(&cursor_page->_count))
1141 continue;
1142 break;
1032 } 1143 }
1033 } 1144 }
1145
1146 /* If we break out of the loop above, lumpy reclaim failed */
1147 if (pfn < end_pfn)
1148 nr_lumpy_failed++;
1034 } 1149 }
1035 1150
1036 *scanned = scan; 1151 *scanned = scan;
@@ -1070,14 +1185,15 @@ static unsigned long clear_active_flags(struct list_head *page_list,
1070 struct page *page; 1185 struct page *page;
1071 1186
1072 list_for_each_entry(page, page_list, lru) { 1187 list_for_each_entry(page, page_list, lru) {
1188 int numpages = hpage_nr_pages(page);
1073 lru = page_lru_base_type(page); 1189 lru = page_lru_base_type(page);
1074 if (PageActive(page)) { 1190 if (PageActive(page)) {
1075 lru += LRU_ACTIVE; 1191 lru += LRU_ACTIVE;
1076 ClearPageActive(page); 1192 ClearPageActive(page);
1077 nr_active++; 1193 nr_active += numpages;
1078 } 1194 }
1079 if (count) 1195 if (count)
1080 count[lru]++; 1196 count[lru] += numpages;
1081 } 1197 }
1082 1198
1083 return nr_active; 1199 return nr_active;
@@ -1112,13 +1228,16 @@ int isolate_lru_page(struct page *page)
1112{ 1228{
1113 int ret = -EBUSY; 1229 int ret = -EBUSY;
1114 1230
1231 VM_BUG_ON(!page_count(page));
1232
1115 if (PageLRU(page)) { 1233 if (PageLRU(page)) {
1116 struct zone *zone = page_zone(page); 1234 struct zone *zone = page_zone(page);
1117 1235
1118 spin_lock_irq(&zone->lru_lock); 1236 spin_lock_irq(&zone->lru_lock);
1119 if (PageLRU(page) && get_page_unless_zero(page)) { 1237 if (PageLRU(page)) {
1120 int lru = page_lru(page); 1238 int lru = page_lru(page);
1121 ret = 0; 1239 ret = 0;
1240 get_page(page);
1122 ClearPageLRU(page); 1241 ClearPageLRU(page);
1123 1242
1124 del_page_from_lru_list(zone, page, lru); 1243 del_page_from_lru_list(zone, page, lru);
@@ -1187,7 +1306,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
1187 add_page_to_lru_list(zone, page, lru); 1306 add_page_to_lru_list(zone, page, lru);
1188 if (is_active_lru(lru)) { 1307 if (is_active_lru(lru)) {
1189 int file = is_file_lru(lru); 1308 int file = is_file_lru(lru);
1190 reclaim_stat->recent_rotated[file]++; 1309 int numpages = hpage_nr_pages(page);
1310 reclaim_stat->recent_rotated[file] += numpages;
1191 } 1311 }
1192 if (!pagevec_add(&pvec, page)) { 1312 if (!pagevec_add(&pvec, page)) {
1193 spin_unlock_irq(&zone->lru_lock); 1313 spin_unlock_irq(&zone->lru_lock);
@@ -1253,7 +1373,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1253 return false; 1373 return false;
1254 1374
1255 /* Only stall on lumpy reclaim */ 1375 /* Only stall on lumpy reclaim */
1256 if (!sc->lumpy_reclaim_mode) 1376 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1257 return false; 1377 return false;
1258 1378
1259 /* If we have relaimed everything on the isolated list, no stall */ 1379 /* If we have relaimed everything on the isolated list, no stall */
@@ -1286,7 +1406,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1286 unsigned long nr_scanned; 1406 unsigned long nr_scanned;
1287 unsigned long nr_reclaimed = 0; 1407 unsigned long nr_reclaimed = 0;
1288 unsigned long nr_taken; 1408 unsigned long nr_taken;
1289 unsigned long nr_active;
1290 unsigned long nr_anon; 1409 unsigned long nr_anon;
1291 unsigned long nr_file; 1410 unsigned long nr_file;
1292 1411
@@ -1298,15 +1417,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1298 return SWAP_CLUSTER_MAX; 1417 return SWAP_CLUSTER_MAX;
1299 } 1418 }
1300 1419
1301 1420 set_reclaim_mode(priority, sc, false);
1302 lru_add_drain(); 1421 lru_add_drain();
1303 spin_lock_irq(&zone->lru_lock); 1422 spin_lock_irq(&zone->lru_lock);
1304 1423
1305 if (scanning_global_lru(sc)) { 1424 if (scanning_global_lru(sc)) {
1306 nr_taken = isolate_pages_global(nr_to_scan, 1425 nr_taken = isolate_pages_global(nr_to_scan,
1307 &page_list, &nr_scanned, sc->order, 1426 &page_list, &nr_scanned, sc->order,
1308 sc->lumpy_reclaim_mode ? 1427 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1309 ISOLATE_BOTH : ISOLATE_INACTIVE, 1428 ISOLATE_BOTH : ISOLATE_INACTIVE,
1310 zone, 0, file); 1429 zone, 0, file);
1311 zone->pages_scanned += nr_scanned; 1430 zone->pages_scanned += nr_scanned;
1312 if (current_is_kswapd()) 1431 if (current_is_kswapd())
@@ -1318,8 +1437,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1318 } else { 1437 } else {
1319 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1438 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1320 &page_list, &nr_scanned, sc->order, 1439 &page_list, &nr_scanned, sc->order,
1321 sc->lumpy_reclaim_mode ? 1440 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1322 ISOLATE_BOTH : ISOLATE_INACTIVE, 1441 ISOLATE_BOTH : ISOLATE_INACTIVE,
1323 zone, sc->mem_cgroup, 1442 zone, sc->mem_cgroup,
1324 0, file); 1443 0, file);
1325 /* 1444 /*
@@ -1337,20 +1456,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1337 1456
1338 spin_unlock_irq(&zone->lru_lock); 1457 spin_unlock_irq(&zone->lru_lock);
1339 1458
1340 nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); 1459 nr_reclaimed = shrink_page_list(&page_list, zone, sc);
1341 1460
1342 /* Check if we should syncronously wait for writeback */ 1461 /* Check if we should syncronously wait for writeback */
1343 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1462 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1344 congestion_wait(BLK_RW_ASYNC, HZ/10); 1463 set_reclaim_mode(priority, sc, true);
1345 1464 nr_reclaimed += shrink_page_list(&page_list, zone, sc);
1346 /*
1347 * The attempt at page out may have made some
1348 * of the pages active, mark them inactive again.
1349 */
1350 nr_active = clear_active_flags(&page_list, NULL);
1351 count_vm_events(PGDEACTIVATE, nr_active);
1352
1353 nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
1354 } 1465 }
1355 1466
1356 local_irq_disable(); 1467 local_irq_disable();
@@ -1359,6 +1470,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1359 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); 1470 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1360 1471
1361 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); 1472 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1473
1474 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1475 zone_idx(zone),
1476 nr_scanned, nr_reclaimed,
1477 priority,
1478 trace_shrink_flags(file, sc->reclaim_mode));
1362 return nr_reclaimed; 1479 return nr_reclaimed;
1363} 1480}
1364 1481
@@ -1398,7 +1515,7 @@ static void move_active_pages_to_lru(struct zone *zone,
1398 1515
1399 list_move(&page->lru, &zone->lru[lru].list); 1516 list_move(&page->lru, &zone->lru[lru].list);
1400 mem_cgroup_add_lru_list(page, lru); 1517 mem_cgroup_add_lru_list(page, lru);
1401 pgmoved++; 1518 pgmoved += hpage_nr_pages(page);
1402 1519
1403 if (!pagevec_add(&pvec, page) || list_empty(list)) { 1520 if (!pagevec_add(&pvec, page) || list_empty(list)) {
1404 spin_unlock_irq(&zone->lru_lock); 1521 spin_unlock_irq(&zone->lru_lock);
@@ -1466,7 +1583,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1466 } 1583 }
1467 1584
1468 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { 1585 if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
1469 nr_rotated++; 1586 nr_rotated += hpage_nr_pages(page);
1470 /* 1587 /*
1471 * Identify referenced, file-backed active pages and 1588 * Identify referenced, file-backed active pages and
1472 * give them one more trip around the active list. So 1589 * give them one more trip around the active list. So
@@ -1506,6 +1623,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1506 spin_unlock_irq(&zone->lru_lock); 1623 spin_unlock_irq(&zone->lru_lock);
1507} 1624}
1508 1625
1626#ifdef CONFIG_SWAP
1509static int inactive_anon_is_low_global(struct zone *zone) 1627static int inactive_anon_is_low_global(struct zone *zone)
1510{ 1628{
1511 unsigned long active, inactive; 1629 unsigned long active, inactive;
@@ -1531,12 +1649,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1531{ 1649{
1532 int low; 1650 int low;
1533 1651
1652 /*
1653 * If we don't have swap space, anonymous page deactivation
1654 * is pointless.
1655 */
1656 if (!total_swap_pages)
1657 return 0;
1658
1534 if (scanning_global_lru(sc)) 1659 if (scanning_global_lru(sc))
1535 low = inactive_anon_is_low_global(zone); 1660 low = inactive_anon_is_low_global(zone);
1536 else 1661 else
1537 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); 1662 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1538 return low; 1663 return low;
1539} 1664}
1665#else
1666static inline int inactive_anon_is_low(struct zone *zone,
1667 struct scan_control *sc)
1668{
1669 return 0;
1670}
1671#endif
1540 1672
1541static int inactive_file_is_low_global(struct zone *zone) 1673static int inactive_file_is_low_global(struct zone *zone)
1542{ 1674{
@@ -1598,26 +1730,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1598} 1730}
1599 1731
1600/* 1732/*
1601 * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
1602 * until we collected @swap_cluster_max pages to scan.
1603 */
1604static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
1605 unsigned long *nr_saved_scan)
1606{
1607 unsigned long nr;
1608
1609 *nr_saved_scan += nr_to_scan;
1610 nr = *nr_saved_scan;
1611
1612 if (nr >= SWAP_CLUSTER_MAX)
1613 *nr_saved_scan = 0;
1614 else
1615 nr = 0;
1616
1617 return nr;
1618}
1619
1620/*
1621 * Determine how aggressively the anon and file LRU lists should be 1733 * Determine how aggressively the anon and file LRU lists should be
1622 * scanned. The relative value of each set of LRU lists is determined 1734 * scanned. The relative value of each set of LRU lists is determined
1623 * by looking at the fraction of the pages scanned we did rotate back 1735 * by looking at the fraction of the pages scanned we did rotate back
@@ -1635,6 +1747,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1635 u64 fraction[2], denominator; 1747 u64 fraction[2], denominator;
1636 enum lru_list l; 1748 enum lru_list l;
1637 int noswap = 0; 1749 int noswap = 0;
1750 int force_scan = 0;
1751
1752
1753 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1754 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1755 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1756 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1757
1758 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
1759 /* kswapd does zone balancing and need to scan this zone */
1760 if (scanning_global_lru(sc) && current_is_kswapd())
1761 force_scan = 1;
1762 /* memcg may have small limit and need to avoid priority drop */
1763 if (!scanning_global_lru(sc))
1764 force_scan = 1;
1765 }
1638 1766
1639 /* If we have no swap space, do not bother scanning anon pages. */ 1767 /* If we have no swap space, do not bother scanning anon pages. */
1640 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1768 if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1645,11 +1773,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1645 goto out; 1773 goto out;
1646 } 1774 }
1647 1775
1648 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1649 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1650 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1651 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1652
1653 if (scanning_global_lru(sc)) { 1776 if (scanning_global_lru(sc)) {
1654 free = zone_page_state(zone, NR_FREE_PAGES); 1777 free = zone_page_state(zone, NR_FREE_PAGES);
1655 /* If we have very few page cache pages, 1778 /* If we have very few page cache pages,
@@ -1716,24 +1839,87 @@ out:
1716 scan >>= priority; 1839 scan >>= priority;
1717 scan = div64_u64(scan * fraction[file], denominator); 1840 scan = div64_u64(scan * fraction[file], denominator);
1718 } 1841 }
1719 nr[l] = nr_scan_try_batch(scan, 1842
1720 &reclaim_stat->nr_saved_scan[l]); 1843 /*
1844 * If zone is small or memcg is small, nr[l] can be 0.
 1845 * This results in no scanning at this priority and the priority dropping.
 1846 * Global direct reclaim can simply visit the next zone, so this tends
 1847 * not to be a problem there. Global kswapd, however, is doing zone
 1848 * balancing and needs to scan small amounts. When using
 1849 * memcg, a priority drop can cause large latency, so it is better
 1850 * to scan a small amount. See force_scan above.
1851 */
1852 if (!scan && force_scan) {
1853 if (file)
1854 scan = SWAP_CLUSTER_MAX;
1855 else if (!noswap)
1856 scan = SWAP_CLUSTER_MAX;
1857 }
1858 nr[l] = scan;
1721 } 1859 }
1722} 1860}
1723 1861
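Worked example of the fallback above, assuming SWAP_CLUSTER_MAX is its usual value of 32: a memcg holding 1000 LRU pages scanned at priority 12 gives (1000 >> 12) = 0, well below 32, so force_scan is set; when the proportional calculation then yields scan == 0, it is bumped to SWAP_CLUSTER_MAX so the small group still makes progress instead of dropping priority.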
1724static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc) 1862/*
1863 * Reclaim/compaction depends on a number of pages being freed. To avoid
1864 * disruption to the system, a small number of order-0 pages continue to be
1865 * rotated and reclaimed in the normal fashion. However, by the time we get
1866 * back to the allocator and call try_to_compact_zone(), we ensure that
1867 * there are enough free pages for it to be likely successful
1868 */
1869static inline bool should_continue_reclaim(struct zone *zone,
1870 unsigned long nr_reclaimed,
1871 unsigned long nr_scanned,
1872 struct scan_control *sc)
1725{ 1873{
1874 unsigned long pages_for_compaction;
1875 unsigned long inactive_lru_pages;
1876
1877 /* If not in reclaim/compaction mode, stop */
1878 if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION))
1879 return false;
1880
1881 /* Consider stopping depending on scan and reclaim activity */
1882 if (sc->gfp_mask & __GFP_REPEAT) {
1883 /*
1884 * For __GFP_REPEAT allocations, stop reclaiming if the
1885 * full LRU list has been scanned and we are still failing
1886 * to reclaim pages. This full LRU scan is potentially
1887 * expensive but a __GFP_REPEAT caller really wants to succeed
1888 */
1889 if (!nr_reclaimed && !nr_scanned)
1890 return false;
1891 } else {
1892 /*
1893 * For non-__GFP_REPEAT allocations which can presumably
1894 * fail without consequence, stop if we failed to reclaim
1895 * any pages from the last SWAP_CLUSTER_MAX number of
1896 * pages that were scanned. This will return to the
1897 * caller faster, at the risk that reclaim/compaction and
1898 * the resulting allocation attempt fail
1899 */
1900 if (!nr_reclaimed)
1901 return false;
1902 }
1903
1726 /* 1904 /*
1727 * If we need a large contiguous chunk of memory, or have 1905 * If we have not reclaimed enough pages for compaction and the
1728 * trouble getting a small set of contiguous pages, we 1906 * inactive lists are large enough, continue reclaiming
1729 * will reclaim both active and inactive pages.
1730 */ 1907 */
1731 if (sc->order > PAGE_ALLOC_COSTLY_ORDER) 1908 pages_for_compaction = (2UL << sc->order);
1732 sc->lumpy_reclaim_mode = 1; 1909 inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
1733 else if (sc->order && priority < DEF_PRIORITY - 2) 1910 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1734 sc->lumpy_reclaim_mode = 1; 1911 if (sc->nr_reclaimed < pages_for_compaction &&
1735 else 1912 inactive_lru_pages > pages_for_compaction)
1736 sc->lumpy_reclaim_mode = 0; 1913 return true;
1914
1915 /* If compaction would go ahead or the allocation would succeed, stop */
1916 switch (compaction_suitable(zone, sc->order)) {
1917 case COMPACT_PARTIAL:
1918 case COMPACT_CONTINUE:
1919 return false;
1920 default:
1921 return true;
1922 }
1737} 1923}
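
A rough sketch of the continuation test introduced above, simplified for illustration: the __GFP_REPEAT/scan-activity checks are omitted and compaction_suitable() is replaced by a plain flag, so only the pages_for_compaction arithmetic mirrors the kernel.

    #include <stdbool.h>
    #include <stdio.h>

    /*
     * Keep reclaiming for compaction until roughly 2^(order+1) pages have
     * been freed, unless the inactive lists are already too small to make
     * progress or compaction reports it could run right now.
     */
    static bool continue_reclaim(int order, unsigned long nr_reclaimed,
                                 unsigned long inactive_lru_pages,
                                 bool compaction_ready)
    {
            unsigned long pages_for_compaction = 2UL << order;

            if (compaction_ready)
                    return false;
            return nr_reclaimed < pages_for_compaction &&
                   inactive_lru_pages > pages_for_compaction;
    }

    int main(void)
    {
            /* order-9 request (2MB huge page on x86): target is 1024 pages */
            printf("%d\n", continue_reclaim(9, 300, 5000, false));  /* 1: keep going */
            printf("%d\n", continue_reclaim(9, 1200, 5000, false)); /* 0: enough reclaimed */
            printf("%d\n", continue_reclaim(9, 300, 5000, true));   /* 0: compaction ready */
            return 0;
    }
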
1738 1924
1739/* 1925/*
@@ -1745,13 +1931,14 @@ static void shrink_zone(int priority, struct zone *zone,
1745 unsigned long nr[NR_LRU_LISTS]; 1931 unsigned long nr[NR_LRU_LISTS];
1746 unsigned long nr_to_scan; 1932 unsigned long nr_to_scan;
1747 enum lru_list l; 1933 enum lru_list l;
1748 unsigned long nr_reclaimed = sc->nr_reclaimed; 1934 unsigned long nr_reclaimed, nr_scanned;
1749 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 1935 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
1750 1936
1937restart:
1938 nr_reclaimed = 0;
1939 nr_scanned = sc->nr_scanned;
1751 get_scan_count(zone, sc, nr, priority); 1940 get_scan_count(zone, sc, nr, priority);
1752 1941
1753 set_lumpy_reclaim_mode(priority, sc);
1754
1755 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1942 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1756 nr[LRU_INACTIVE_FILE]) { 1943 nr[LRU_INACTIVE_FILE]) {
1757 for_each_evictable_lru(l) { 1944 for_each_evictable_lru(l) {
@@ -1775,16 +1962,20 @@ static void shrink_zone(int priority, struct zone *zone,
1775 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 1962 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
1776 break; 1963 break;
1777 } 1964 }
1778 1965 sc->nr_reclaimed += nr_reclaimed;
1779 sc->nr_reclaimed = nr_reclaimed;
1780 1966
1781 /* 1967 /*
1782 * Even if we did not try to evict anon pages at all, we want to 1968 * Even if we did not try to evict anon pages at all, we want to
1783 * rebalance the anon lru active/inactive ratio. 1969 * rebalance the anon lru active/inactive ratio.
1784 */ 1970 */
1785 if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) 1971 if (inactive_anon_is_low(zone, sc))
1786 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1972 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1787 1973
1974 /* reclaim/compaction might need reclaim to continue */
1975 if (should_continue_reclaim(zone, nr_reclaimed,
1976 sc->nr_scanned - nr_scanned, sc))
1977 goto restart;
1978
1788 throttle_vm_writeout(sc->gfp_mask); 1979 throttle_vm_writeout(sc->gfp_mask);
1789} 1980}
1790 1981
@@ -1809,6 +2000,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1809{ 2000{
1810 struct zoneref *z; 2001 struct zoneref *z;
1811 struct zone *zone; 2002 struct zone *zone;
2003 unsigned long nr_soft_reclaimed;
2004 unsigned long nr_soft_scanned;
1812 2005
1813 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2006 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1814 gfp_zone(sc->gfp_mask), sc->nodemask) { 2007 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1823,6 +2016,19 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1823 continue; 2016 continue;
1824 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2017 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1825 continue; /* Let kswapd poll it */ 2018 continue; /* Let kswapd poll it */
2019 /*
2020 * This steals pages from memory cgroups over their soft limit
2021 * and returns the number of reclaimed pages and
2022 * scanned pages. This works for global memory pressure
2023 * and balancing, not for a memcg's limit.
2024 */
2025 nr_soft_scanned = 0;
2026 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2027 sc->order, sc->gfp_mask,
2028 &nr_soft_scanned);
2029 sc->nr_reclaimed += nr_soft_reclaimed;
2030 sc->nr_scanned += nr_soft_scanned;
2031 /* need some check to avoid more shrink_zone() calls */
1826 } 2032 }
1827 2033
1828 shrink_zone(priority, zone, sc); 2034 shrink_zone(priority, zone, sc);
@@ -1834,17 +2040,12 @@ static bool zone_reclaimable(struct zone *zone)
1834 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; 2040 return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
1835} 2041}
1836 2042
1837/* 2043/* All zones in zonelist are unreclaimable? */
1838 * As hibernation is going on, kswapd is freezed so that it can't mark
1839 * the zone into all_unreclaimable. It can't handle OOM during hibernation.
1840 * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
1841 */
1842static bool all_unreclaimable(struct zonelist *zonelist, 2044static bool all_unreclaimable(struct zonelist *zonelist,
1843 struct scan_control *sc) 2045 struct scan_control *sc)
1844{ 2046{
1845 struct zoneref *z; 2047 struct zoneref *z;
1846 struct zone *zone; 2048 struct zone *zone;
1847 bool all_unreclaimable = true;
1848 2049
1849 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2050 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1850 gfp_zone(sc->gfp_mask), sc->nodemask) { 2051 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1852,13 +2053,11 @@ static bool all_unreclaimable(struct zonelist *zonelist,
1852 continue; 2053 continue;
1853 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2054 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1854 continue; 2055 continue;
1855 if (zone_reclaimable(zone)) { 2056 if (!zone->all_unreclaimable)
1856 all_unreclaimable = false; 2057 return false;
1857 break;
1858 }
1859 } 2058 }
1860 2059
1861 return all_unreclaimable; 2060 return true;
1862} 2061}
1863 2062
1864/* 2063/*
@@ -1878,7 +2077,8 @@ static bool all_unreclaimable(struct zonelist *zonelist,
1878 * else, the number of pages reclaimed 2077 * else, the number of pages reclaimed
1879 */ 2078 */
1880static unsigned long do_try_to_free_pages(struct zonelist *zonelist, 2079static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1881 struct scan_control *sc) 2080 struct scan_control *sc,
2081 struct shrink_control *shrink)
1882{ 2082{
1883 int priority; 2083 int priority;
1884 unsigned long total_scanned = 0; 2084 unsigned long total_scanned = 0;
@@ -1896,7 +2096,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1896 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2096 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1897 sc->nr_scanned = 0; 2097 sc->nr_scanned = 0;
1898 if (!priority) 2098 if (!priority)
1899 disable_swap_token(); 2099 disable_swap_token(sc->mem_cgroup);
1900 shrink_zones(priority, zonelist, sc); 2100 shrink_zones(priority, zonelist, sc);
1901 /* 2101 /*
1902 * Don't shrink slabs when reclaiming memory from 2102 * Don't shrink slabs when reclaiming memory from
@@ -1912,7 +2112,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1912 lru_pages += zone_reclaimable_pages(zone); 2112 lru_pages += zone_reclaimable_pages(zone);
1913 } 2113 }
1914 2114
1915 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); 2115 shrink_slab(shrink, sc->nr_scanned, lru_pages);
1916 if (reclaim_state) { 2116 if (reclaim_state) {
1917 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2117 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
1918 reclaim_state->reclaimed_slab = 0; 2118 reclaim_state->reclaimed_slab = 0;
@@ -1937,27 +2137,31 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1937 2137
1938 /* Take a nap, wait for some writeback to complete */ 2138 /* Take a nap, wait for some writeback to complete */
1939 if (!sc->hibernation_mode && sc->nr_scanned && 2139 if (!sc->hibernation_mode && sc->nr_scanned &&
1940 priority < DEF_PRIORITY - 2) 2140 priority < DEF_PRIORITY - 2) {
1941 congestion_wait(BLK_RW_ASYNC, HZ/10); 2141 struct zone *preferred_zone;
2142
2143 first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
2144 &cpuset_current_mems_allowed,
2145 &preferred_zone);
2146 wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
2147 }
1942 } 2148 }
1943 2149
1944out: 2150out:
1945 /*
1946 * Now that we've scanned all the zones at this priority level, note
1947 * that level within the zone so that the next thread which performs
1948 * scanning of this zone will immediately start out at this priority
1949 * level. This affects only the decision whether or not to bring
1950 * mapped pages onto the inactive list.
1951 */
1952 if (priority < 0)
1953 priority = 0;
1954
1955 delayacct_freepages_end(); 2151 delayacct_freepages_end();
1956 put_mems_allowed(); 2152 put_mems_allowed();
1957 2153
1958 if (sc->nr_reclaimed) 2154 if (sc->nr_reclaimed)
1959 return sc->nr_reclaimed; 2155 return sc->nr_reclaimed;
1960 2156
2157 /*
2158 * While hibernation is in progress, kswapd is frozen and cannot mark
2159 * zones all_unreclaimable, so the all_unreclaimable check is bypassed
2160 * here.
2161 */
2162 if (oom_killer_disabled)
2163 return 0;
2164
1961 /* top priority shrink_zones still had more to do? don't OOM, then */ 2165 /* top priority shrink_zones still had more to do? don't OOM, then */
1962 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) 2166 if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
1963 return 1; 2167 return 1;
@@ -1980,12 +2184,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1980 .mem_cgroup = NULL, 2184 .mem_cgroup = NULL,
1981 .nodemask = nodemask, 2185 .nodemask = nodemask,
1982 }; 2186 };
2187 struct shrink_control shrink = {
2188 .gfp_mask = sc.gfp_mask,
2189 };
1983 2190
1984 trace_mm_vmscan_direct_reclaim_begin(order, 2191 trace_mm_vmscan_direct_reclaim_begin(order,
1985 sc.may_writepage, 2192 sc.may_writepage,
1986 gfp_mask); 2193 gfp_mask);
1987 2194
1988 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2195 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
1989 2196
1990 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); 2197 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
1991 2198
@@ -1997,9 +2204,11 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1997unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2204unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1998 gfp_t gfp_mask, bool noswap, 2205 gfp_t gfp_mask, bool noswap,
1999 unsigned int swappiness, 2206 unsigned int swappiness,
2000 struct zone *zone) 2207 struct zone *zone,
2208 unsigned long *nr_scanned)
2001{ 2209{
2002 struct scan_control sc = { 2210 struct scan_control sc = {
2211 .nr_scanned = 0,
2003 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2212 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2004 .may_writepage = !laptop_mode, 2213 .may_writepage = !laptop_mode,
2005 .may_unmap = 1, 2214 .may_unmap = 1,
@@ -2008,6 +2217,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2008 .order = 0, 2217 .order = 0,
2009 .mem_cgroup = mem, 2218 .mem_cgroup = mem,
2010 }; 2219 };
2220
2011 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2221 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2012 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2222 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
2013 2223
@@ -2026,6 +2236,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2026 2236
2027 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2237 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2028 2238
2239 *nr_scanned = sc.nr_scanned;
2029 return sc.nr_reclaimed; 2240 return sc.nr_reclaimed;
2030} 2241}
2031 2242
@@ -2036,6 +2247,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2036{ 2247{
2037 struct zonelist *zonelist; 2248 struct zonelist *zonelist;
2038 unsigned long nr_reclaimed; 2249 unsigned long nr_reclaimed;
2250 int nid;
2039 struct scan_control sc = { 2251 struct scan_control sc = {
2040 .may_writepage = !laptop_mode, 2252 .may_writepage = !laptop_mode,
2041 .may_unmap = 1, 2253 .may_unmap = 1,
@@ -2045,17 +2257,27 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2045 .order = 0, 2257 .order = 0,
2046 .mem_cgroup = mem_cont, 2258 .mem_cgroup = mem_cont,
2047 .nodemask = NULL, /* we don't care the placement */ 2259 .nodemask = NULL, /* we don't care the placement */
2260 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2261 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
2262 };
2263 struct shrink_control shrink = {
2264 .gfp_mask = sc.gfp_mask,
2048 }; 2265 };
2049 2266
2050 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2267 /*
2051 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2268 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
2052 zonelist = NODE_DATA(numa_node_id())->node_zonelists; 2269 * take care of from where we get pages. So the node where we start the
2270 * scan does not need to be the current node.
2271 */
2272 nid = mem_cgroup_select_victim_node(mem_cont);
2273
2274 zonelist = NODE_DATA(nid)->node_zonelists;
2053 2275
2054 trace_mm_vmscan_memcg_reclaim_begin(0, 2276 trace_mm_vmscan_memcg_reclaim_begin(0,
2055 sc.may_writepage, 2277 sc.may_writepage,
2056 sc.gfp_mask); 2278 sc.gfp_mask);
2057 2279
2058 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2280 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2059 2281
2060 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 2282 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2061 2283
@@ -2063,38 +2285,88 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2063} 2285}
2064#endif 2286#endif
2065 2287
2288/*
2289 * pgdat_balanced is used when checking if a node is balanced for high-order
2290 * allocations. Only zones that meet watermarks and are in a zone allowed
2291 * by the callers classzone_idx are added to balanced_pages. The total of
2292 * balanced pages must be at least 25% of the zones allowed by classzone_idx
2293 * for the node to be considered balanced. Forcing all zones to be balanced
2294 * for high orders can cause excessive reclaim when there are imbalanced zones.
2295 * The choice of 25% is due to
2296 * o a balanced 16M DMA zone will not on its own balance a node on any
2297 * reasonably sized machine
2298 * o On all other machines, the top zone must be at least a reasonable
2299 * percentage of the middle zones. For example, on 32-bit x86, highmem
2300 * would need to be at least 256M for it to balance a whole node.
2301 * Similarly, on x86-64 the Normal zone would need to be at least 1G
2302 * to balance a node on its own. These seemed like reasonable ratios.
2303 */
2304static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2305 int classzone_idx)
2306{
2307 unsigned long present_pages = 0;
2308 int i;
2309
2310 for (i = 0; i <= classzone_idx; i++)
2311 present_pages += pgdat->node_zones[i].present_pages;
2312
2313 /* A special case: if the zone has no pages, consider it balanced */
2314 return balanced_pages >= (present_pages >> 2);
2315}
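
A minimal userspace sketch of the 25% rule described above, using made-up zone sizes; only the shift-by-two test mirrors the kernel function, and node_balanced() is an invented name.

    #include <stdbool.h>
    #include <stdio.h>

    /* Sum the present pages of zones 0..classzone_idx and require that
     * the balanced zones cover at least a quarter of them. */
    static bool node_balanced(const unsigned long *present, int classzone_idx,
                              unsigned long balanced_pages)
    {
            unsigned long present_pages = 0;
            int i;

            for (i = 0; i <= classzone_idx; i++)
                    present_pages += present[i];

            return balanced_pages >= (present_pages >> 2);
    }

    int main(void)
    {
            /* hypothetical node: DMA, DMA32, Normal sizes in pages */
            unsigned long present[] = { 4096, 262144, 786432 };

            /* a balanced 16M DMA zone alone cannot balance the node... */
            printf("%d\n", node_balanced(present, 2, 4096));   /* 0 */
            /* ...but a balanced Normal zone can */
            printf("%d\n", node_balanced(present, 2, 786432)); /* 1 */
            return 0;
    }
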
2316
2066/* is kswapd sleeping prematurely? */ 2317/* is kswapd sleeping prematurely? */
2067static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) 2318static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2319 int classzone_idx)
2068{ 2320{
2069 int i; 2321 int i;
2322 unsigned long balanced = 0;
2323 bool all_zones_ok = true;
2070 2324
2071 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2325 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2072 if (remaining) 2326 if (remaining)
2073 return 1; 2327 return true;
2074 2328
2075 /* If after HZ/10, a zone is below the high mark, it's premature */ 2329 /* Check the watermark levels */
2076 for (i = 0; i < pgdat->nr_zones; i++) { 2330 for (i = 0; i <= classzone_idx; i++) {
2077 struct zone *zone = pgdat->node_zones + i; 2331 struct zone *zone = pgdat->node_zones + i;
2078 2332
2079 if (!populated_zone(zone)) 2333 if (!populated_zone(zone))
2080 continue; 2334 continue;
2081 2335
2082 if (zone->all_unreclaimable) 2336 /*
2337 * balance_pgdat() skips over all_unreclaimable zones after
2338 * DEF_PRIORITY. Effectively, it considers them balanced so
2339 * they must be considered balanced here as well if kswapd
2340 * is to sleep
2341 */
2342 if (zone->all_unreclaimable) {
2343 balanced += zone->present_pages;
2083 continue; 2344 continue;
2345 }
2084 2346
2085 if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), 2347 if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone),
2086 0, 0)) 2348 i, 0))
2087 return 1; 2349 all_zones_ok = false;
2350 else
2351 balanced += zone->present_pages;
2088 } 2352 }
2089 2353
2090 return 0; 2354 /*
2355 * For high-order requests, the balanced zones must contain at least
2356 * 25% of the node's pages for kswapd to sleep. For order-0, all zones
2357 * must be balanced
2358 */
2359 if (order)
2360 return !pgdat_balanced(pgdat, balanced, classzone_idx);
2361 else
2362 return !all_zones_ok;
2091} 2363}
2092 2364
2093/* 2365/*
2094 * For kswapd, balance_pgdat() will work across all this node's zones until 2366 * For kswapd, balance_pgdat() will work across all this node's zones until
2095 * they are all at high_wmark_pages(zone). 2367 * they are all at high_wmark_pages(zone).
2096 * 2368 *
2097 * Returns the number of pages which were actually freed. 2369 * Returns the final order kswapd was reclaiming at
2098 * 2370 *
2099 * There is special handling here for zones which are full of pinned pages. 2371 * There is special handling here for zones which are full of pinned pages.
2100 * This can happen if the pages are all mlocked, or if they are all used by 2372 * This can happen if the pages are all mlocked, or if they are all used by
@@ -2111,13 +2383,18 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
2111 * interoperates with the page allocator fallback scheme to ensure that aging 2383 * interoperates with the page allocator fallback scheme to ensure that aging
2112 * of pages is balanced across the zones. 2384 * of pages is balanced across the zones.
2113 */ 2385 */
2114static unsigned long balance_pgdat(pg_data_t *pgdat, int order) 2386static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
2387 int *classzone_idx)
2115{ 2388{
2116 int all_zones_ok; 2389 int all_zones_ok;
2390 unsigned long balanced;
2117 int priority; 2391 int priority;
2118 int i; 2392 int i;
2393 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2119 unsigned long total_scanned; 2394 unsigned long total_scanned;
2120 struct reclaim_state *reclaim_state = current->reclaim_state; 2395 struct reclaim_state *reclaim_state = current->reclaim_state;
2396 unsigned long nr_soft_reclaimed;
2397 unsigned long nr_soft_scanned;
2121 struct scan_control sc = { 2398 struct scan_control sc = {
2122 .gfp_mask = GFP_KERNEL, 2399 .gfp_mask = GFP_KERNEL,
2123 .may_unmap = 1, 2400 .may_unmap = 1,
@@ -2131,6 +2408,9 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
2131 .order = order, 2408 .order = order,
2132 .mem_cgroup = NULL, 2409 .mem_cgroup = NULL,
2133 }; 2410 };
2411 struct shrink_control shrink = {
2412 .gfp_mask = sc.gfp_mask,
2413 };
2134loop_again: 2414loop_again:
2135 total_scanned = 0; 2415 total_scanned = 0;
2136 sc.nr_reclaimed = 0; 2416 sc.nr_reclaimed = 0;
@@ -2138,15 +2418,15 @@ loop_again:
2138 count_vm_event(PAGEOUTRUN); 2418 count_vm_event(PAGEOUTRUN);
2139 2419
2140 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2420 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2141 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2142 unsigned long lru_pages = 0; 2421 unsigned long lru_pages = 0;
2143 int has_under_min_watermark_zone = 0; 2422 int has_under_min_watermark_zone = 0;
2144 2423
2145 /* The swap token gets in the way of swapout... */ 2424 /* The swap token gets in the way of swapout... */
2146 if (!priority) 2425 if (!priority)
2147 disable_swap_token(); 2426 disable_swap_token(NULL);
2148 2427
2149 all_zones_ok = 1; 2428 all_zones_ok = 1;
2429 balanced = 0;
2150 2430
2151 /* 2431 /*
2152 * Scan in the highmem->dma direction for the highest 2432 * Scan in the highmem->dma direction for the highest
@@ -2169,7 +2449,7 @@ loop_again:
2169 shrink_active_list(SWAP_CLUSTER_MAX, zone, 2449 shrink_active_list(SWAP_CLUSTER_MAX, zone,
2170 &sc, priority, 0); 2450 &sc, priority, 0);
2171 2451
2172 if (!zone_watermark_ok(zone, order, 2452 if (!zone_watermark_ok_safe(zone, order,
2173 high_wmark_pages(zone), 0, 0)) { 2453 high_wmark_pages(zone), 0, 0)) {
2174 end_zone = i; 2454 end_zone = i;
2175 break; 2455 break;
@@ -2196,6 +2476,7 @@ loop_again:
2196 for (i = 0; i <= end_zone; i++) { 2476 for (i = 0; i <= end_zone; i++) {
2197 struct zone *zone = pgdat->node_zones + i; 2477 struct zone *zone = pgdat->node_zones + i;
2198 int nr_slab; 2478 int nr_slab;
2479 unsigned long balance_gap;
2199 2480
2200 if (!populated_zone(zone)) 2481 if (!populated_zone(zone))
2201 continue; 2482 continue;
@@ -2205,28 +2486,42 @@ loop_again:
2205 2486
2206 sc.nr_scanned = 0; 2487 sc.nr_scanned = 0;
2207 2488
2489 nr_soft_scanned = 0;
2208 /* 2490 /*
2209 * Call soft limit reclaim before calling shrink_zone. 2491 * Call soft limit reclaim before calling shrink_zone.
2210 * For now we ignore the return value
2211 */ 2492 */
2212 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); 2493 nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone,
2494 order, sc.gfp_mask,
2495 &nr_soft_scanned);
2496 sc.nr_reclaimed += nr_soft_reclaimed;
2497 total_scanned += nr_soft_scanned;
2213 2498
2214 /* 2499 /*
2215 * We put equal pressure on every zone, unless one 2500 * We put equal pressure on every zone, unless
2216 * zone has way too many pages free already. 2501 * one zone has way too many pages free
2502 * already. The "too many pages" is defined
2503 * as the high wmark plus a "gap" where the
2504 * gap is either the low watermark or 1%
2505 * of the zone, whichever is smaller.
2217 */ 2506 */
2218 if (!zone_watermark_ok(zone, order, 2507 balance_gap = min(low_wmark_pages(zone),
2219 8*high_wmark_pages(zone), end_zone, 0)) 2508 (zone->present_pages +
2509 KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
2510 KSWAPD_ZONE_BALANCE_GAP_RATIO);
2511 if (!zone_watermark_ok_safe(zone, order,
2512 high_wmark_pages(zone) + balance_gap,
2513 end_zone, 0)) {
2220 shrink_zone(priority, zone, &sc); 2514 shrink_zone(priority, zone, &sc);
2221 reclaim_state->reclaimed_slab = 0; 2515
2222 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 2516 reclaim_state->reclaimed_slab = 0;
2223 lru_pages); 2517 nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
2224 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 2518 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
2225 total_scanned += sc.nr_scanned; 2519 total_scanned += sc.nr_scanned;
2226 if (zone->all_unreclaimable) 2520
2227 continue; 2521 if (nr_slab == 0 && !zone_reclaimable(zone))
2228 if (nr_slab == 0 && !zone_reclaimable(zone)) 2522 zone->all_unreclaimable = 1;
2229 zone->all_unreclaimable = 1; 2523 }
2524
2230 /* 2525 /*
2231 * If we've done a decent amount of scanning and 2526 * If we've done a decent amount of scanning and
2232 * the reclaim ratio is low, start doing writepage 2527 * the reclaim ratio is low, start doing writepage
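
The balance gap arithmetic in the hunk above can be checked with a few lines of plain C. The sketch below assumes KSWAPD_ZONE_BALANCE_GAP_RATIO is 100, consistent with the "1% of the zone" wording in the comment; treat it as an illustration, not the kernel's definition.

    #include <stdio.h>

    #define KSWAPD_ZONE_BALANCE_GAP_RATIO 100   /* assumed: gap is 1% of the zone */

    /* Gap added on top of the high watermark before kswapd leaves a zone
     * alone: the low watermark or 1% of the zone, whichever is smaller. */
    static unsigned long balance_gap(unsigned long low_wmark,
                                     unsigned long present_pages)
    {
            unsigned long one_percent =
                    (present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
                    KSWAPD_ZONE_BALANCE_GAP_RATIO;

            return low_wmark < one_percent ? low_wmark : one_percent;
    }

    int main(void)
    {
            /* ~4GB zone with a 4000-page low watermark: the watermark wins */
            printf("%lu\n", balance_gap(4000, 1048576));   /* 4000 */
            /* tiny 16MB zone: 1% of the zone (41 pages) wins instead */
            printf("%lu\n", balance_gap(4000, 4096));      /* 41 */
            return 0;
    }
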
@@ -2236,7 +2531,13 @@ loop_again:
2236 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) 2531 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
2237 sc.may_writepage = 1; 2532 sc.may_writepage = 1;
2238 2533
2239 if (!zone_watermark_ok(zone, order, 2534 if (zone->all_unreclaimable) {
2535 if (end_zone && end_zone == i)
2536 end_zone--;
2537 continue;
2538 }
2539
2540 if (!zone_watermark_ok_safe(zone, order,
2240 high_wmark_pages(zone), end_zone, 0)) { 2541 high_wmark_pages(zone), end_zone, 0)) {
2241 all_zones_ok = 0; 2542 all_zones_ok = 0;
2242 /* 2543 /*
@@ -2244,13 +2545,24 @@ loop_again:
2244 * means that we have a GFP_ATOMIC allocation 2545 * means that we have a GFP_ATOMIC allocation
2245 * failure risk. Hurry up! 2546 * failure risk. Hurry up!
2246 */ 2547 */
2247 if (!zone_watermark_ok(zone, order, 2548 if (!zone_watermark_ok_safe(zone, order,
2248 min_wmark_pages(zone), end_zone, 0)) 2549 min_wmark_pages(zone), end_zone, 0))
2249 has_under_min_watermark_zone = 1; 2550 has_under_min_watermark_zone = 1;
2551 } else {
2552 /*
2553 * If a zone reaches its high watermark,
2554 * consider it to be no longer congested. It's
2555 * possible there are dirty pages backed by
2556 * congested BDIs but as pressure is relieved,
2557 * speculatively avoid congestion waits
2558 */
2559 zone_clear_flag(zone, ZONE_CONGESTED);
2560 if (i <= *classzone_idx)
2561 balanced += zone->present_pages;
2250 } 2562 }
2251 2563
2252 } 2564 }
2253 if (all_zones_ok) 2565 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2254 break; /* kswapd: all done */ 2566 break; /* kswapd: all done */
2255 /* 2567 /*
2256 * OK, kswapd is getting into trouble. Take a nap, then take 2568 * OK, kswapd is getting into trouble. Take a nap, then take
@@ -2273,7 +2585,13 @@ loop_again:
2273 break; 2585 break;
2274 } 2586 }
2275out: 2587out:
2276 if (!all_zones_ok) { 2588
2589 /*
2590 * order-0: All zones must meet high watermark for a balanced node
2591 * high-order: Balanced zones must make up at least 25% of the node
2592 * for the node to be balanced
2593 */
2594 if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) {
2277 cond_resched(); 2595 cond_resched();
2278 2596
2279 try_to_freeze(); 2597 try_to_freeze();
@@ -2298,7 +2616,88 @@ out:
2298 goto loop_again; 2616 goto loop_again;
2299 } 2617 }
2300 2618
2301 return sc.nr_reclaimed; 2619 /*
2620 * If kswapd was reclaiming at a higher order, it has the option of
2621 * sleeping without all zones being balanced. Before it does, it must
2622 * ensure that the watermarks for order-0 on *all* zones are met and
2623 * that the congestion flags are cleared. The congestion flag must
2624 * be cleared as kswapd is the only mechanism that clears the flag
2625 * and it is potentially going to sleep here.
2626 */
2627 if (order) {
2628 for (i = 0; i <= end_zone; i++) {
2629 struct zone *zone = pgdat->node_zones + i;
2630
2631 if (!populated_zone(zone))
2632 continue;
2633
2634 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2635 continue;
2636
2637 /* Confirm the zone is balanced for order-0 */
2638 if (!zone_watermark_ok(zone, 0,
2639 high_wmark_pages(zone), 0, 0)) {
2640 order = sc.order = 0;
2641 goto loop_again;
2642 }
2643
2644 /* If balanced, clear the congested flag */
2645 zone_clear_flag(zone, ZONE_CONGESTED);
2646 }
2647 }
2648
2649 /*
2650 * Return the order we were reclaiming at so sleeping_prematurely()
2651 * makes a decision on the order we were last reclaiming at. However,
2652 * if another caller entered the allocator slow path while kswapd
2653 * was awake, order will remain at the higher level
2654 */
2655 *classzone_idx = end_zone;
2656 return order;
2657}
2658
2659static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2660{
2661 long remaining = 0;
2662 DEFINE_WAIT(wait);
2663
2664 if (freezing(current) || kthread_should_stop())
2665 return;
2666
2667 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2668
2669 /* Try to sleep for a short interval */
2670 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2671 remaining = schedule_timeout(HZ/10);
2672 finish_wait(&pgdat->kswapd_wait, &wait);
2673 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2674 }
2675
2676 /*
2677 * After a short sleep, check if it was a premature sleep. If not, then
2678 * go fully to sleep until explicitly woken up.
2679 */
2680 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
2681 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2682
2683 /*
2684 * vmstat counters are not perfectly accurate and the estimated
2685 * value for counters such as NR_FREE_PAGES can deviate from the
2686 * true value by nr_online_cpus * threshold. To avoid the zone
2687 * watermarks being breached while under pressure, we reduce the
2688 * per-cpu vmstat threshold while kswapd is awake and restore
2689 * them before going back to sleep.
2690 */
2691 set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
2692 schedule();
2693 set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
2694 } else {
2695 if (remaining)
2696 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2697 else
2698 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2699 }
2700 finish_wait(&pgdat->kswapd_wait, &wait);
2302} 2701}
2303 2702
2304/* 2703/*
@@ -2316,10 +2715,11 @@ out:
2316 */ 2715 */
2317static int kswapd(void *p) 2716static int kswapd(void *p)
2318{ 2717{
2319 unsigned long order; 2718 unsigned long order, new_order;
2719 int classzone_idx, new_classzone_idx;
2320 pg_data_t *pgdat = (pg_data_t*)p; 2720 pg_data_t *pgdat = (pg_data_t*)p;
2321 struct task_struct *tsk = current; 2721 struct task_struct *tsk = current;
2322 DEFINE_WAIT(wait); 2722
2323 struct reclaim_state reclaim_state = { 2723 struct reclaim_state reclaim_state = {
2324 .reclaimed_slab = 0, 2724 .reclaimed_slab = 0,
2325 }; 2725 };
@@ -2346,50 +2746,37 @@ static int kswapd(void *p)
2346 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; 2746 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2347 set_freezable(); 2747 set_freezable();
2348 2748
2349 order = 0; 2749 order = new_order = 0;
2750 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2350 for ( ; ; ) { 2751 for ( ; ; ) {
2351 unsigned long new_order;
2352 int ret; 2752 int ret;
2353 2753
2354 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2754 /*
2355 new_order = pgdat->kswapd_max_order; 2755 * If the last balance_pgdat was unsuccessful it's unlikely a
2356 pgdat->kswapd_max_order = 0; 2756 * new request of a similar or harder type will succeed soon
2357 if (order < new_order) { 2757 * so consider going to sleep on the basis of the order we reclaimed at
2758 */
2759 if (classzone_idx >= new_classzone_idx && order == new_order) {
2760 new_order = pgdat->kswapd_max_order;
2761 new_classzone_idx = pgdat->classzone_idx;
2762 pgdat->kswapd_max_order = 0;
2763 pgdat->classzone_idx = pgdat->nr_zones - 1;
2764 }
2765
2766 if (order < new_order || classzone_idx > new_classzone_idx) {
2358 /* 2767 /*
2359 * Don't sleep if someone wants a larger 'order' 2768 * Don't sleep if someone wants a larger 'order'
2360 * allocation 2769 * allocation or has tighter zone constraints
2361 */ 2770 */
2362 order = new_order; 2771 order = new_order;
2772 classzone_idx = new_classzone_idx;
2363 } else { 2773 } else {
2364 if (!freezing(current) && !kthread_should_stop()) { 2774 kswapd_try_to_sleep(pgdat, order, classzone_idx);
2365 long remaining = 0;
2366
2367 /* Try to sleep for a short interval */
2368 if (!sleeping_prematurely(pgdat, order, remaining)) {
2369 remaining = schedule_timeout(HZ/10);
2370 finish_wait(&pgdat->kswapd_wait, &wait);
2371 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2372 }
2373
2374 /*
2375 * After a short sleep, check if it was a
2376 * premature sleep. If not, then go fully
2377 * to sleep until explicitly woken up
2378 */
2379 if (!sleeping_prematurely(pgdat, order, remaining)) {
2380 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2381 schedule();
2382 } else {
2383 if (remaining)
2384 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2385 else
2386 count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
2387 }
2388 }
2389
2390 order = pgdat->kswapd_max_order; 2775 order = pgdat->kswapd_max_order;
2776 classzone_idx = pgdat->classzone_idx;
2777 pgdat->kswapd_max_order = 0;
2778 pgdat->classzone_idx = pgdat->nr_zones - 1;
2391 } 2779 }
2392 finish_wait(&pgdat->kswapd_wait, &wait);
2393 2780
2394 ret = try_to_freeze(); 2781 ret = try_to_freeze();
2395 if (kthread_should_stop()) 2782 if (kthread_should_stop())
@@ -2401,7 +2788,7 @@ static int kswapd(void *p)
2401 */ 2788 */
2402 if (!ret) { 2789 if (!ret) {
2403 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 2790 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2404 balance_pgdat(pgdat, order); 2791 order = balance_pgdat(pgdat, order, &classzone_idx);
2405 } 2792 }
2406 } 2793 }
2407 return 0; 2794 return 0;
@@ -2410,23 +2797,26 @@ static int kswapd(void *p)
2410/* 2797/*
2411 * A zone is low on free memory, so wake its kswapd task to service it. 2798 * A zone is low on free memory, so wake its kswapd task to service it.
2412 */ 2799 */
2413void wakeup_kswapd(struct zone *zone, int order) 2800void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
2414{ 2801{
2415 pg_data_t *pgdat; 2802 pg_data_t *pgdat;
2416 2803
2417 if (!populated_zone(zone)) 2804 if (!populated_zone(zone))
2418 return; 2805 return;
2419 2806
2420 pgdat = zone->zone_pgdat;
2421 if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
2422 return;
2423 if (pgdat->kswapd_max_order < order)
2424 pgdat->kswapd_max_order = order;
2425 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2426 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2807 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2427 return; 2808 return;
2809 pgdat = zone->zone_pgdat;
2810 if (pgdat->kswapd_max_order < order) {
2811 pgdat->kswapd_max_order = order;
2812 pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx);
2813 }
2428 if (!waitqueue_active(&pgdat->kswapd_wait)) 2814 if (!waitqueue_active(&pgdat->kswapd_wait))
2429 return; 2815 return;
2816 if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0))
2817 return;
2818
2819 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2430 wake_up_interruptible(&pgdat->kswapd_wait); 2820 wake_up_interruptible(&pgdat->kswapd_wait);
2431} 2821}
2432 2822
@@ -2487,7 +2877,10 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2487 .swappiness = vm_swappiness, 2877 .swappiness = vm_swappiness,
2488 .order = 0, 2878 .order = 0,
2489 }; 2879 };
2490 struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); 2880 struct shrink_control shrink = {
2881 .gfp_mask = sc.gfp_mask,
2882 };
2883 struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
2491 struct task_struct *p = current; 2884 struct task_struct *p = current;
2492 unsigned long nr_reclaimed; 2885 unsigned long nr_reclaimed;
2493 2886
@@ -2496,7 +2889,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2496 reclaim_state.reclaimed_slab = 0; 2889 reclaim_state.reclaimed_slab = 0;
2497 p->reclaim_state = &reclaim_state; 2890 p->reclaim_state = &reclaim_state;
2498 2891
2499 nr_reclaimed = do_try_to_free_pages(zonelist, &sc); 2892 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2500 2893
2501 p->reclaim_state = NULL; 2894 p->reclaim_state = NULL;
2502 lockdep_clear_current_reclaim_state(); 2895 lockdep_clear_current_reclaim_state();
@@ -2671,6 +3064,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2671 .swappiness = vm_swappiness, 3064 .swappiness = vm_swappiness,
2672 .order = order, 3065 .order = order,
2673 }; 3066 };
3067 struct shrink_control shrink = {
3068 .gfp_mask = sc.gfp_mask,
3069 };
2674 unsigned long nr_slab_pages0, nr_slab_pages1; 3070 unsigned long nr_slab_pages0, nr_slab_pages1;
2675 3071
2676 cond_resched(); 3072 cond_resched();
@@ -2712,7 +3108,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2712 unsigned long lru_pages = zone_reclaimable_pages(zone); 3108 unsigned long lru_pages = zone_reclaimable_pages(zone);
2713 3109
2714 /* No reclaimable slab or very low memory pressure */ 3110 /* No reclaimable slab or very low memory pressure */
2715 if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages)) 3111 if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
2716 break; 3112 break;
2717 3113
2718 /* Freed enough memory */ 3114 /* Freed enough memory */
@@ -2987,6 +3383,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
2987 return 0; 3383 return 0;
2988} 3384}
2989 3385
3386#ifdef CONFIG_NUMA
2990/* 3387/*
2991 * per node 'scan_unevictable_pages' attribute. On demand re-scan of 3388 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
2992 * a specified node's per zone unevictable lists for evictable pages. 3389 * a specified node's per zone unevictable lists for evictable pages.
@@ -3033,4 +3430,4 @@ void scan_unevictable_unregister_node(struct node *node)
3033{ 3430{
3034 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); 3431 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
3035} 3432}
3036 3433#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 355a9e669aaa..20c18b7694b2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -17,6 +17,8 @@
17#include <linux/vmstat.h> 17#include <linux/vmstat.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/math64.h> 19#include <linux/math64.h>
20#include <linux/writeback.h>
21#include <linux/compaction.h>
20 22
21#ifdef CONFIG_VM_EVENT_COUNTERS 23#ifdef CONFIG_VM_EVENT_COUNTERS
22DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 24DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
@@ -81,7 +83,31 @@ EXPORT_SYMBOL(vm_stat);
81 83
82#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP
83 85
84static int calculate_threshold(struct zone *zone) 86int calculate_pressure_threshold(struct zone *zone)
87{
88 int threshold;
89 int watermark_distance;
90
91 /*
92 * As vmstats are not up to date, there is drift between the estimated
93 * and real values. For high thresholds and a high number of CPUs, it
94 * is possible for the min watermark to be breached while the estimated
95 * value looks fine. The pressure threshold is a reduced value such
96 * that even the maximum amount of drift will not accidentally breach
97 * the min watermark
98 */
99 watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone);
100 threshold = max(1, (int)(watermark_distance / num_online_cpus()));
101
102 /*
103 * Maximum threshold is 125
104 */
105 threshold = min(125, threshold);
106
107 return threshold;
108}
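
A small sketch of the arithmetic above, with made-up watermark values; num_online_cpus() is replaced by a plain parameter, so this only demonstrates the clamping, not the real per-zone inputs.

    #include <stdio.h>

    /* Reduced per-cpu stat threshold used while kswapd is awake: spread the
     * low-to-min watermark distance across CPUs so that even maximum drift
     * cannot silently breach the min watermark; never below 1, capped at 125. */
    static int pressure_threshold(int low_wmark, int min_wmark, int online_cpus)
    {
            int watermark_distance = low_wmark - min_wmark;
            int threshold = watermark_distance / online_cpus;

            if (threshold < 1)
                    threshold = 1;
            if (threshold > 125)
                    threshold = 125;
            return threshold;
    }

    int main(void)
    {
            printf("%d\n", pressure_threshold(4000, 3000, 8));   /* 125 (capped) */
            printf("%d\n", pressure_threshold(400, 300, 64));    /* 1 (floor) */
            printf("%d\n", pressure_threshold(4000, 3000, 16));  /* 62 */
            return 0;
    }
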
109
110int calculate_normal_threshold(struct zone *zone)
85{ 111{
86 int threshold; 112 int threshold;
87 int mem; /* memory in 128 MB units */ 113 int mem; /* memory in 128 MB units */
@@ -131,7 +157,7 @@ static int calculate_threshold(struct zone *zone)
131/* 157/*
132 * Refresh the thresholds for each zone. 158 * Refresh the thresholds for each zone.
133 */ 159 */
134static void refresh_zone_stat_thresholds(void) 160void refresh_zone_stat_thresholds(void)
135{ 161{
136 struct zone *zone; 162 struct zone *zone;
137 int cpu; 163 int cpu;
@@ -140,7 +166,7 @@ static void refresh_zone_stat_thresholds(void)
140 for_each_populated_zone(zone) { 166 for_each_populated_zone(zone) {
141 unsigned long max_drift, tolerate_drift; 167 unsigned long max_drift, tolerate_drift;
142 168
143 threshold = calculate_threshold(zone); 169 threshold = calculate_normal_threshold(zone);
144 170
145 for_each_online_cpu(cpu) 171 for_each_online_cpu(cpu)
146 per_cpu_ptr(zone->pageset, cpu)->stat_threshold 172 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
@@ -159,42 +185,50 @@ static void refresh_zone_stat_thresholds(void)
159 } 185 }
160} 186}
161 187
188void set_pgdat_percpu_threshold(pg_data_t *pgdat,
189 int (*calculate_pressure)(struct zone *))
190{
191 struct zone *zone;
192 int cpu;
193 int threshold;
194 int i;
195
196 for (i = 0; i < pgdat->nr_zones; i++) {
197 zone = &pgdat->node_zones[i];
198 if (!zone->percpu_drift_mark)
199 continue;
200
201 threshold = (*calculate_pressure)(zone);
202 for_each_possible_cpu(cpu)
203 per_cpu_ptr(zone->pageset, cpu)->stat_threshold
204 = threshold;
205 }
206}
207
162/* 208/*
163 * For use when we know that interrupts are disabled. 209 * For use when we know that interrupts are disabled.
164 */ 210 */
165void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, 211void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
166 int delta) 212 int delta)
167{ 213{
168 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); 214 struct per_cpu_pageset __percpu *pcp = zone->pageset;
169 215 s8 __percpu *p = pcp->vm_stat_diff + item;
170 s8 *p = pcp->vm_stat_diff + item;
171 long x; 216 long x;
217 long t;
172 218
173 x = delta + *p; 219 x = delta + __this_cpu_read(*p);
174 220
175 if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { 221 t = __this_cpu_read(pcp->stat_threshold);
222
223 if (unlikely(x > t || x < -t)) {
176 zone_page_state_add(x, zone, item); 224 zone_page_state_add(x, zone, item);
177 x = 0; 225 x = 0;
178 } 226 }
179 *p = x; 227 __this_cpu_write(*p, x);
180} 228}
181EXPORT_SYMBOL(__mod_zone_page_state); 229EXPORT_SYMBOL(__mod_zone_page_state);
182 230
183/* 231/*
184 * For an unknown interrupt state
185 */
186void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
187 int delta)
188{
189 unsigned long flags;
190
191 local_irq_save(flags);
192 __mod_zone_page_state(zone, item, delta);
193 local_irq_restore(flags);
194}
195EXPORT_SYMBOL(mod_zone_page_state);
196
197/*
198 * Optimized increment and decrement functions. 232 * Optimized increment and decrement functions.
199 * 233 *
200 * These are only for a single page and therefore can take a struct page * 234 * These are only for a single page and therefore can take a struct page *
@@ -219,16 +253,17 @@ EXPORT_SYMBOL(mod_zone_page_state);
219 */ 253 */
220void __inc_zone_state(struct zone *zone, enum zone_stat_item item) 254void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
221{ 255{
222 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); 256 struct per_cpu_pageset __percpu *pcp = zone->pageset;
223 s8 *p = pcp->vm_stat_diff + item; 257 s8 __percpu *p = pcp->vm_stat_diff + item;
224 258 s8 v, t;
225 (*p)++;
226 259
227 if (unlikely(*p > pcp->stat_threshold)) { 260 v = __this_cpu_inc_return(*p);
228 int overstep = pcp->stat_threshold / 2; 261 t = __this_cpu_read(pcp->stat_threshold);
262 if (unlikely(v > t)) {
263 s8 overstep = t >> 1;
229 264
230 zone_page_state_add(*p + overstep, zone, item); 265 zone_page_state_add(v + overstep, zone, item);
231 *p = -overstep; 266 __this_cpu_write(*p, -overstep);
232 } 267 }
233} 268}
234 269
@@ -240,16 +275,17 @@ EXPORT_SYMBOL(__inc_zone_page_state);
240 275
241void __dec_zone_state(struct zone *zone, enum zone_stat_item item) 276void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
242{ 277{
243 struct per_cpu_pageset *pcp = this_cpu_ptr(zone->pageset); 278 struct per_cpu_pageset __percpu *pcp = zone->pageset;
244 s8 *p = pcp->vm_stat_diff + item; 279 s8 __percpu *p = pcp->vm_stat_diff + item;
280 s8 v, t;
245 281
246 (*p)--; 282 v = __this_cpu_dec_return(*p);
283 t = __this_cpu_read(pcp->stat_threshold);
284 if (unlikely(v < - t)) {
285 s8 overstep = t >> 1;
247 286
248 if (unlikely(*p < - pcp->stat_threshold)) { 287 zone_page_state_add(v - overstep, zone, item);
249 int overstep = pcp->stat_threshold / 2; 288 __this_cpu_write(*p, overstep);
250
251 zone_page_state_add(*p - overstep, zone, item);
252 *p = overstep;
253 } 289 }
254} 290}
255 291
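
The __inc/__dec conversion above keeps the long-standing batching scheme: each CPU accumulates a small signed diff and only folds it into the global counter once the diff crosses the threshold, overstepping by half the threshold so the next fold is pushed further away. Below is a single-threaded userspace model of that folding (all names invented, no per-cpu semantics):

    #include <stdio.h>

    static long global_count;       /* stands in for the zone-wide counter */
    static signed char cpu_diff;    /* stands in for the per-cpu vm_stat_diff */

    /* Fold the per-cpu diff into the global counter once it exceeds the
     * threshold, leaving -overstep behind so increments keep batching. */
    static void counter_inc(signed char threshold)
    {
            signed char v = ++cpu_diff;

            if (v > threshold) {
                    signed char overstep = threshold / 2;

                    global_count += v + overstep;
                    cpu_diff = -overstep;
            }
    }

    int main(void)
    {
            int i;

            for (i = 0; i < 100; i++)
                    counter_inc(32);
            /* global + local diff always equals the true count */
            printf("global=%ld diff=%d total=%ld\n",
                   global_count, cpu_diff, global_count + cpu_diff);
            return 0;
    }
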
@@ -259,6 +295,95 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
259} 295}
260EXPORT_SYMBOL(__dec_zone_page_state); 296EXPORT_SYMBOL(__dec_zone_page_state);
261 297
298#ifdef CONFIG_CMPXCHG_LOCAL
299/*
300 * If we have cmpxchg_local support then we do not need to incur the overhead
301 * that comes with local_irq_save/restore if we use this_cpu_cmpxchg.
302 *
303 * mod_state() modifies the zone counter state through atomic per cpu
304 * operations.
305 *
307 * Overstep mode specifies how overstep should be handled:
307 * 0 No overstepping
308 * 1 Overstepping half of threshold
309 * -1 Overstepping minus half of threshold
310*/
311static inline void mod_state(struct zone *zone,
312 enum zone_stat_item item, int delta, int overstep_mode)
313{
314 struct per_cpu_pageset __percpu *pcp = zone->pageset;
315 s8 __percpu *p = pcp->vm_stat_diff + item;
316 long o, n, t, z;
317
318 do {
319 z = 0; /* overflow to zone counters */
320
321 /*
322 * The fetching of the stat_threshold is racy. We may apply
323 * a counter threshold to the wrong cpu if we get
324 * rescheduled while executing here. However, the next
325 * counter update will apply the threshold again and
326 * therefore bring the counter under the threshold again.
327 *
328 * Most of the time the thresholds are the same anyway
329 * for all cpus in a zone.
330 */
331 t = this_cpu_read(pcp->stat_threshold);
332
333 o = this_cpu_read(*p);
334 n = delta + o;
335
336 if (n > t || n < -t) {
337 int os = overstep_mode * (t >> 1) ;
338
339 /* Overflow must be added to zone counters */
340 z = n + os;
341 n = -os;
342 }
343 } while (this_cpu_cmpxchg(*p, o, n) != o);
344
345 if (z)
346 zone_page_state_add(z, zone, item);
347}
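
A hedged sketch of the lockless update loop above, using GCC's __sync_val_compare_and_swap on an ordinary variable in place of this_cpu_cmpxchg; it only models the retry-and-spill logic, not the per-cpu addressing or preemption behaviour, and mod_counter() is an invented name.

    #include <stdio.h>

    static long global_count;
    static long cpu_diff;           /* models one CPU's vm_stat_diff slot */

    /* Retry-with-cmpxchg update: compute the new diff, publish it only if
     * nobody changed the slot in between, and spill to the global counter
     * when the threshold is crossed (overstep_mode as described above). */
    static void mod_counter(int delta, long threshold, int overstep_mode)
    {
            long o, n, z;

            do {
                    z = 0;
                    o = cpu_diff;
                    n = delta + o;
                    if (n > threshold || n < -threshold) {
                            long os = overstep_mode * (threshold / 2);

                            z = n + os;     /* overflow goes to the global counter */
                            n = -os;
                    }
            } while (__sync_val_compare_and_swap(&cpu_diff, o, n) != o);

            if (z)
                    global_count += z;
    }

    int main(void)
    {
            int i;

            for (i = 0; i < 1000; i++)
                    mod_counter(+1, 125, 1);
            printf("global=%ld diff=%ld total=%ld\n",
                   global_count, cpu_diff, global_count + cpu_diff);
            return 0;
    }
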
348
349void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
350 int delta)
351{
352 mod_state(zone, item, delta, 0);
353}
354EXPORT_SYMBOL(mod_zone_page_state);
355
356void inc_zone_state(struct zone *zone, enum zone_stat_item item)
357{
358 mod_state(zone, item, 1, 1);
359}
360
361void inc_zone_page_state(struct page *page, enum zone_stat_item item)
362{
363 mod_state(page_zone(page), item, 1, 1);
364}
365EXPORT_SYMBOL(inc_zone_page_state);
366
367void dec_zone_page_state(struct page *page, enum zone_stat_item item)
368{
369 mod_state(page_zone(page), item, -1, -1);
370}
371EXPORT_SYMBOL(dec_zone_page_state);
372#else
373/*
374 * Use interrupt disable to serialize counter updates
375 */
376void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
377 int delta)
378{
379 unsigned long flags;
380
381 local_irq_save(flags);
382 __mod_zone_page_state(zone, item, delta);
383 local_irq_restore(flags);
384}
385EXPORT_SYMBOL(mod_zone_page_state);
386
262void inc_zone_state(struct zone *zone, enum zone_stat_item item) 387void inc_zone_state(struct zone *zone, enum zone_stat_item item)
263{ 388{
264 unsigned long flags; 389 unsigned long flags;
@@ -289,6 +414,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
289 local_irq_restore(flags); 414 local_irq_restore(flags);
290} 415}
291EXPORT_SYMBOL(dec_zone_page_state); 416EXPORT_SYMBOL(dec_zone_page_state);
417#endif
292 418
293/* 419/*
294 * Update the zone counters for one cpu. 420 * Update the zone counters for one cpu.
@@ -377,8 +503,12 @@ void refresh_cpu_vm_stats(int cpu)
377 * z = the zone from which the allocation occurred. 503 * z = the zone from which the allocation occurred.
378 * 504 *
379 * Must be called with interrupts disabled. 505 * Must be called with interrupts disabled.
506 *
507 * When __GFP_OTHER_NODE is set assume the node of the preferred
508 * zone is the local node. This is useful for daemons that allocate
509 * memory on behalf of other processes.
380 */ 510 */
381void zone_statistics(struct zone *preferred_zone, struct zone *z) 511void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
382{ 512{
383 if (z->zone_pgdat == preferred_zone->zone_pgdat) { 513 if (z->zone_pgdat == preferred_zone->zone_pgdat) {
384 __inc_zone_state(z, NUMA_HIT); 514 __inc_zone_state(z, NUMA_HIT);
@@ -386,7 +516,8 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
386 __inc_zone_state(z, NUMA_MISS); 516 __inc_zone_state(z, NUMA_MISS);
387 __inc_zone_state(preferred_zone, NUMA_FOREIGN); 517 __inc_zone_state(preferred_zone, NUMA_FOREIGN);
388 } 518 }
389 if (z->node == numa_node_id()) 519 if (z->node == ((flags & __GFP_OTHER_NODE) ?
520 preferred_zone->node : numa_node_id()))
390 __inc_zone_state(z, NUMA_LOCAL); 521 __inc_zone_state(z, NUMA_LOCAL);
391 else 522 else
392 __inc_zone_state(z, NUMA_OTHER); 523 __inc_zone_state(z, NUMA_OTHER);
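
A compact sketch of the hit/miss/local/other classification after the __GFP_OTHER_NODE change; node ids and the gfp flag are plain ints here, the flag bit value is made up, and the counter names are printed rather than incremented per zone.

    #include <stdio.h>

    #define GFP_OTHER_NODE 0x1      /* stand-in for the real __GFP_OTHER_NODE bit */

    /*
     * Classify one allocation: hit/miss depends on whether the zone we got
     * belongs to the preferred node, local/other on whether it belongs to
     * the node doing the work - which, with the other-node flag set, is
     * taken to be the preferred node rather than the calling CPU's node.
     */
    static void classify(int preferred_node, int zone_node, int current_node,
                         unsigned flags)
    {
            int local_node = (flags & GFP_OTHER_NODE) ? preferred_node
                                                      : current_node;

            printf("%s, %s\n",
                   zone_node == preferred_node ? "NUMA_HIT" : "NUMA_MISS",
                   zone_node == local_node ? "NUMA_LOCAL" : "NUMA_OTHER");
    }

    int main(void)
    {
            /* kswapd on node 0 allocating on behalf of a task on node 1 */
            classify(1, 1, 0, 0);              /* NUMA_HIT, NUMA_OTHER */
            classify(1, 1, 0, GFP_OTHER_NODE); /* NUMA_HIT, NUMA_LOCAL */
            return 0;
    }
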
@@ -394,6 +525,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
394#endif 525#endif
395 526
396#ifdef CONFIG_COMPACTION 527#ifdef CONFIG_COMPACTION
528
397struct contig_page_info { 529struct contig_page_info {
398 unsigned long free_pages; 530 unsigned long free_pages;
399 unsigned long free_blocks_total; 531 unsigned long free_blocks_total;
@@ -527,6 +659,138 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
527} 659}
528#endif 660#endif
529 661
662#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS)
663#ifdef CONFIG_ZONE_DMA
664#define TEXT_FOR_DMA(xx) xx "_dma",
665#else
666#define TEXT_FOR_DMA(xx)
667#endif
668
669#ifdef CONFIG_ZONE_DMA32
670#define TEXT_FOR_DMA32(xx) xx "_dma32",
671#else
672#define TEXT_FOR_DMA32(xx)
673#endif
674
675#ifdef CONFIG_HIGHMEM
676#define TEXT_FOR_HIGHMEM(xx) xx "_high",
677#else
678#define TEXT_FOR_HIGHMEM(xx)
679#endif
680
681#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
682 TEXT_FOR_HIGHMEM(xx) xx "_movable",
683
684const char * const vmstat_text[] = {
685 /* Zoned VM counters */
686 "nr_free_pages",
687 "nr_inactive_anon",
688 "nr_active_anon",
689 "nr_inactive_file",
690 "nr_active_file",
691 "nr_unevictable",
692 "nr_mlock",
693 "nr_anon_pages",
694 "nr_mapped",
695 "nr_file_pages",
696 "nr_dirty",
697 "nr_writeback",
698 "nr_slab_reclaimable",
699 "nr_slab_unreclaimable",
700 "nr_page_table_pages",
701 "nr_kernel_stack",
702 "nr_unstable",
703 "nr_bounce",
704 "nr_vmscan_write",
705 "nr_writeback_temp",
706 "nr_isolated_anon",
707 "nr_isolated_file",
708 "nr_shmem",
709 "nr_dirtied",
710 "nr_written",
711
712#ifdef CONFIG_NUMA
713 "numa_hit",
714 "numa_miss",
715 "numa_foreign",
716 "numa_interleave",
717 "numa_local",
718 "numa_other",
719#endif
720 "nr_anon_transparent_hugepages",
721 "nr_dirty_threshold",
722 "nr_dirty_background_threshold",
723
724#ifdef CONFIG_VM_EVENT_COUNTERS
725 "pgpgin",
726 "pgpgout",
727 "pswpin",
728 "pswpout",
729
730 TEXTS_FOR_ZONES("pgalloc")
731
732 "pgfree",
733 "pgactivate",
734 "pgdeactivate",
735
736 "pgfault",
737 "pgmajfault",
738
739 TEXTS_FOR_ZONES("pgrefill")
740 TEXTS_FOR_ZONES("pgsteal")
741 TEXTS_FOR_ZONES("pgscan_kswapd")
742 TEXTS_FOR_ZONES("pgscan_direct")
743
744#ifdef CONFIG_NUMA
745 "zone_reclaim_failed",
746#endif
747 "pginodesteal",
748 "slabs_scanned",
749 "kswapd_steal",
750 "kswapd_inodesteal",
751 "kswapd_low_wmark_hit_quickly",
752 "kswapd_high_wmark_hit_quickly",
753 "kswapd_skip_congestion_wait",
754 "pageoutrun",
755 "allocstall",
756
757 "pgrotated",
758
759#ifdef CONFIG_COMPACTION
760 "compact_blocks_moved",
761 "compact_pages_moved",
762 "compact_pagemigrate_failed",
763 "compact_stall",
764 "compact_fail",
765 "compact_success",
766#endif
767
768#ifdef CONFIG_HUGETLB_PAGE
769 "htlb_buddy_alloc_success",
770 "htlb_buddy_alloc_fail",
771#endif
772 "unevictable_pgs_culled",
773 "unevictable_pgs_scanned",
774 "unevictable_pgs_rescued",
775 "unevictable_pgs_mlocked",
776 "unevictable_pgs_munlocked",
777 "unevictable_pgs_cleared",
778 "unevictable_pgs_stranded",
779 "unevictable_pgs_mlockfreed",
780
781#ifdef CONFIG_TRANSPARENT_HUGEPAGE
782 "thp_fault_alloc",
783 "thp_fault_fallback",
784 "thp_collapse_alloc",
785 "thp_collapse_alloc_failed",
786 "thp_split",
787#endif
788
789#endif /* CONFIG_VM_EVENT_COUNTERS */
790};
791#endif /* CONFIG_PROC_FS || CONFIG_SYSFS */
792
793
530#ifdef CONFIG_PROC_FS 794#ifdef CONFIG_PROC_FS
531static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, 795static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
532 struct zone *zone) 796 struct zone *zone)
@@ -699,120 +963,6 @@ static const struct file_operations pagetypeinfo_file_ops = {
699 .release = seq_release, 963 .release = seq_release,
700}; 964};
701 965
702#ifdef CONFIG_ZONE_DMA
703#define TEXT_FOR_DMA(xx) xx "_dma",
704#else
705#define TEXT_FOR_DMA(xx)
706#endif
707
708#ifdef CONFIG_ZONE_DMA32
709#define TEXT_FOR_DMA32(xx) xx "_dma32",
710#else
711#define TEXT_FOR_DMA32(xx)
712#endif
713
714#ifdef CONFIG_HIGHMEM
715#define TEXT_FOR_HIGHMEM(xx) xx "_high",
716#else
717#define TEXT_FOR_HIGHMEM(xx)
718#endif
719
720#define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \
721 TEXT_FOR_HIGHMEM(xx) xx "_movable",
722
723static const char * const vmstat_text[] = {
724 /* Zoned VM counters */
725 "nr_free_pages",
726 "nr_inactive_anon",
727 "nr_active_anon",
728 "nr_inactive_file",
729 "nr_active_file",
730 "nr_unevictable",
731 "nr_mlock",
732 "nr_anon_pages",
733 "nr_mapped",
734 "nr_file_pages",
735 "nr_dirty",
736 "nr_writeback",
737 "nr_slab_reclaimable",
738 "nr_slab_unreclaimable",
739 "nr_page_table_pages",
740 "nr_kernel_stack",
741 "nr_unstable",
742 "nr_bounce",
743 "nr_vmscan_write",
744 "nr_writeback_temp",
745 "nr_isolated_anon",
746 "nr_isolated_file",
747 "nr_shmem",
748#ifdef CONFIG_NUMA
749 "numa_hit",
750 "numa_miss",
751 "numa_foreign",
752 "numa_interleave",
753 "numa_local",
754 "numa_other",
755#endif
756
757#ifdef CONFIG_VM_EVENT_COUNTERS
758 "pgpgin",
759 "pgpgout",
760 "pswpin",
761 "pswpout",
762
763 TEXTS_FOR_ZONES("pgalloc")
764
765 "pgfree",
766 "pgactivate",
767 "pgdeactivate",
768
769 "pgfault",
770 "pgmajfault",
771
772 TEXTS_FOR_ZONES("pgrefill")
773 TEXTS_FOR_ZONES("pgsteal")
774 TEXTS_FOR_ZONES("pgscan_kswapd")
775 TEXTS_FOR_ZONES("pgscan_direct")
776
777#ifdef CONFIG_NUMA
778 "zone_reclaim_failed",
779#endif
780 "pginodesteal",
781 "slabs_scanned",
782 "kswapd_steal",
783 "kswapd_inodesteal",
784 "kswapd_low_wmark_hit_quickly",
785 "kswapd_high_wmark_hit_quickly",
786 "kswapd_skip_congestion_wait",
787 "pageoutrun",
788 "allocstall",
789
790 "pgrotated",
791
792#ifdef CONFIG_COMPACTION
793 "compact_blocks_moved",
794 "compact_pages_moved",
795 "compact_pagemigrate_failed",
796 "compact_stall",
797 "compact_fail",
798 "compact_success",
799#endif
800
801#ifdef CONFIG_HUGETLB_PAGE
802 "htlb_buddy_alloc_success",
803 "htlb_buddy_alloc_fail",
804#endif
805 "unevictable_pgs_culled",
806 "unevictable_pgs_scanned",
807 "unevictable_pgs_rescued",
808 "unevictable_pgs_mlocked",
809 "unevictable_pgs_munlocked",
810 "unevictable_pgs_cleared",
811 "unevictable_pgs_stranded",
812 "unevictable_pgs_mlockfreed",
813#endif
814};
815
816static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, 966static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
817 struct zone *zone) 967 struct zone *zone)
818{ 968{
@@ -826,7 +976,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
826 "\n scanned %lu" 976 "\n scanned %lu"
827 "\n spanned %lu" 977 "\n spanned %lu"
828 "\n present %lu", 978 "\n present %lu",
829 zone_nr_free_pages(zone), 979 zone_page_state(zone, NR_FREE_PAGES),
830 min_wmark_pages(zone), 980 min_wmark_pages(zone),
831 low_wmark_pages(zone), 981 low_wmark_pages(zone),
832 high_wmark_pages(zone), 982 high_wmark_pages(zone),
@@ -904,36 +1054,44 @@ static const struct file_operations proc_zoneinfo_file_operations = {
904 .release = seq_release, 1054 .release = seq_release,
905}; 1055};
906 1056
1057enum writeback_stat_item {
1058 NR_DIRTY_THRESHOLD,
1059 NR_DIRTY_BG_THRESHOLD,
1060 NR_VM_WRITEBACK_STAT_ITEMS,
1061};
1062
907static void *vmstat_start(struct seq_file *m, loff_t *pos) 1063static void *vmstat_start(struct seq_file *m, loff_t *pos)
908{ 1064{
909 unsigned long *v; 1065 unsigned long *v;
910#ifdef CONFIG_VM_EVENT_COUNTERS 1066 int i, stat_items_size;
911 unsigned long *e;
912#endif
913 int i;
914 1067
915 if (*pos >= ARRAY_SIZE(vmstat_text)) 1068 if (*pos >= ARRAY_SIZE(vmstat_text))
916 return NULL; 1069 return NULL;
1070 stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
1071 NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
917 1072
918#ifdef CONFIG_VM_EVENT_COUNTERS 1073#ifdef CONFIG_VM_EVENT_COUNTERS
919 v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) 1074 stat_items_size += sizeof(struct vm_event_state);
920 + sizeof(struct vm_event_state), GFP_KERNEL);
921#else
922 v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long),
923 GFP_KERNEL);
924#endif 1075#endif
1076
1077 v = kmalloc(stat_items_size, GFP_KERNEL);
925 m->private = v; 1078 m->private = v;
926 if (!v) 1079 if (!v)
927 return ERR_PTR(-ENOMEM); 1080 return ERR_PTR(-ENOMEM);
928 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 1081 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
929 v[i] = global_page_state(i); 1082 v[i] = global_page_state(i);
1083 v += NR_VM_ZONE_STAT_ITEMS;
1084
1085 global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
1086 v + NR_DIRTY_THRESHOLD);
1087 v += NR_VM_WRITEBACK_STAT_ITEMS;
1088
930#ifdef CONFIG_VM_EVENT_COUNTERS 1089#ifdef CONFIG_VM_EVENT_COUNTERS
931 e = v + NR_VM_ZONE_STAT_ITEMS; 1090 all_vm_events(v);
932 all_vm_events(e); 1091 v[PGPGIN] /= 2; /* sectors -> kbytes */
933 e[PGPGIN] /= 2; /* sectors -> kbytes */ 1092 v[PGPGOUT] /= 2;
934 e[PGPGOUT] /= 2;
935#endif 1093#endif
936 return v + *pos; 1094 return (unsigned long *)m->private + *pos;
937} 1095}
938 1096
939static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) 1097static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos)
@@ -1017,7 +1175,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
1017 break; 1175 break;
1018 case CPU_DOWN_PREPARE: 1176 case CPU_DOWN_PREPARE:
1019 case CPU_DOWN_PREPARE_FROZEN: 1177 case CPU_DOWN_PREPARE_FROZEN:
1020 cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu)); 1178 cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
1021 per_cpu(vmstat_work, cpu).work.func = NULL; 1179 per_cpu(vmstat_work, cpu).work.func = NULL;
1022 break; 1180 break;
1023 case CPU_DOWN_FAILED: 1181 case CPU_DOWN_FAILED:
@@ -1043,7 +1201,6 @@ static int __init setup_vmstat(void)
1043#ifdef CONFIG_SMP 1201#ifdef CONFIG_SMP
1044 int cpu; 1202 int cpu;
1045 1203
1046 refresh_zone_stat_thresholds();
1047 register_cpu_notifier(&vmstat_notifier); 1204 register_cpu_notifier(&vmstat_notifier);
1048 1205
1049 for_each_online_cpu(cpu) 1206 for_each_online_cpu(cpu)