42 files changed, 4091 insertions, 2315 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index fd3386242cf0..17b8947aa7da 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -128,12 +128,9 @@ config SPARSEMEM_VMEMMAP
 config MEMORY_HOTPLUG
        bool "Allow for memory hot-add"
        depends on SPARSEMEM || X86_64_ACPI_NUMA
-        depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG
+        depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG
        depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390)
-comment "Memory hotplug is currently incompatible with Software Suspend"
-        depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390
 config MEMORY_HOTPLUG_SPARSE
        def_bool y
        depends on SPARSEMEM && MEMORY_HOTPLUG
@@ -161,11 +158,13 @@ config PAGEFLAGS_EXTENDED
 # Default to 4 for wider testing, though 8 might be more appropriate.
 # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
 # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
+# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page.
 #
 config SPLIT_PTLOCK_CPUS
        int
-        default "4096" if ARM && !CPU_CACHE_VIPT
+        default "999999" if ARM && !CPU_CACHE_VIPT
-        default "4096" if PARISC && !PA20
+        default "999999" if PARISC && !PA20
+        default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC
        default "4"
 #
@@ -203,14 +202,6 @@ config VIRT_TO_BUS
        def_bool y
        depends on !ARCH_NO_VIRT_TO_BUS
-config HAVE_MLOCK
-        bool
-        default y if MMU=y
-config HAVE_MLOCKED_PAGE_BIT
-        bool
-        default y if HAVE_MLOCK=y
 config MMU_NOTIFIER
        bool
@@ -221,7 +212,7 @@ config KSM
          Enable Kernel Samepage Merging: KSM periodically scans those areas
          of an application's address space that an app has advised may be
          mergeable.  When it finds pages of identical content, it replaces
-          the many instances by a single resident page with that content, so
+          the many instances by a single page with that content, so
          saving memory until one or another app needs to modify the content.
          Recommended for use with KVM, or with other duplicative applications.
          See Documentation/vm/ksm.txt for more information: KSM is inactive
@@ -230,6 +221,7 @@ config KSM
 config DEFAULT_MMAP_MIN_ADDR
        int "Low address space to protect from user allocation"
+        depends on MMU
        default 4096
        help
          This is the portion of low virtual memory which should be protected
@@ -260,8 +252,9 @@ config MEMORY_FAILURE
          special hardware support and typically ECC memory.
 config HWPOISON_INJECT
-        tristate "Poison pages injector"
+        tristate "HWPoison pages injector"
-        depends on MEMORY_FAILURE && DEBUG_KERNEL
+        depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS
+        select PROC_PAGE_MONITOR
 config NOMMU_INITIAL_TRIM_EXCESS
        int "Turn on mmap() excess space trimming before booting"
diff --git a/mm/Makefile b/mm/Makefile
index ebf849042ed3..7a68d2ab5560 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -22,7 +22,6 @@ obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA)      += mempolicy.o
 obj-$(CONFIG_SPARSEMEM) += sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
-obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_KSM) += ksm.o
@@ -34,11 +33,7 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
-ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
 obj-$(CONFIG_SMP) += percpu.o
-else
-obj-$(CONFIG_SMP) += allocpercpu.o
-endif
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
 obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
deleted file mode 100644
index df34ceae0c67..000000000000
--- a/mm/allocpercpu.c
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * linux/mm/allocpercpu.c
- *
- * Separated from slab.c August 11, 2006 Christoph Lameter
- */
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/bootmem.h>
-#include <asm/sections.h>
-#ifndef cache_line_size
-#define cache_line_size()       L1_CACHE_BYTES
-#endif
-/**
- * percpu_depopulate - depopulate per-cpu data for given cpu
- * @__pdata: per-cpu data to depopulate
- * @cpu: depopulate per-cpu data for this cpu
- *
- * Depopulating per-cpu data for a cpu going offline would be a typical
- * use case. You need to register a cpu hotplug handler for that purpose.
- */
-static void percpu_depopulate(void *__pdata, int cpu)
-{
-        struct percpu_data *pdata = __percpu_disguise(__pdata);
-        kfree(pdata->ptrs[cpu]);
-        pdata->ptrs[cpu] = NULL;
-}
-/**
- * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
- * @__pdata: per-cpu data to depopulate
- * @mask: depopulate per-cpu data for cpu's selected through mask bits
- */
-static void __percpu_depopulate_mask(void *__pdata, const cpumask_t *mask)
-{
-        int cpu;
-        for_each_cpu_mask_nr(cpu, *mask)
-                percpu_depopulate(__pdata, cpu);
-}
-#define percpu_depopulate_mask(__pdata, mask) \
-        __percpu_depopulate_mask((__pdata), &(mask))
-/**
- * percpu_populate - populate per-cpu data for given cpu
- * @__pdata: per-cpu data to populate further
- * @size: size of per-cpu object
- * @gfp: may sleep or not etc.
- * @cpu: populate per-data for this cpu
- *
- * Populating per-cpu data for a cpu coming online would be a typical
- * use case. You need to register a cpu hotplug handler for that purpose.
- * Per-cpu object is populated with zeroed buffer.
- */
-static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
-{
-        struct percpu_data *pdata = __percpu_disguise(__pdata);
-        int node = cpu_to_node(cpu);
-        /*
-         * We should make sure each CPU gets private memory.
-         */
-        size = roundup(size, cache_line_size());
-        BUG_ON(pdata->ptrs[cpu]);
-        if (node_online(node))
-                pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node);
-        else
-                pdata->ptrs[cpu] = kzalloc(size, gfp);
-        return pdata->ptrs[cpu];
-}
-/**
- * percpu_populate_mask - populate per-cpu data for more cpu's
- * @__pdata: per-cpu data to populate further
- * @size: size of per-cpu object
- * @gfp: may sleep or not etc.
- * @mask: populate per-cpu data for cpu's selected through mask bits
- *
- * Per-cpu objects are populated with zeroed buffers.
- */
-static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
-                                  cpumask_t *mask)
-{
-        cpumask_t populated;
-        int cpu;
-        cpus_clear(populated);
-        for_each_cpu_mask_nr(cpu, *mask)
-                if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
-                        __percpu_depopulate_mask(__pdata, &populated);
-                        return -ENOMEM;
-                } else
-                        cpu_set(cpu, populated);
-        return 0;
-}
-#define percpu_populate_mask(__pdata, size, gfp, mask) \
-        __percpu_populate_mask((__pdata), (size), (gfp), &(mask))
-/**
- * alloc_percpu - initial setup of per-cpu data
- * @size: size of per-cpu object
- * @align: alignment
- *
- * Allocate dynamic percpu area.  Percpu objects are populated with
- * zeroed buffers.
- */
-void *__alloc_percpu(size_t size, size_t align)
-{
-        /*
-         * We allocate whole cache lines to avoid false sharing
-         */
-        size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
-        void *pdata = kzalloc(sz, GFP_KERNEL);
-        void *__pdata = __percpu_disguise(pdata);
-        /*
-         * Can't easily make larger alignment work with kmalloc.  WARN
-         * on it.  Larger alignment should only be used for module
-         * percpu sections on SMP for which this path isn't used.
-         */
-        WARN_ON_ONCE(align > SMP_CACHE_BYTES);
-        if (unlikely(!pdata))
-                return NULL;
-        if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL,
-                                           &cpu_possible_map)))
-                return __pdata;
-        kfree(pdata);
-        return NULL;
-}
-EXPORT_SYMBOL_GPL(__alloc_percpu);
-/**
- * free_percpu - final cleanup of per-cpu data
- * @__pdata: object to clean up
- *
- * We simply clean up any per-cpu object left. No need for the client to
- * track and specify through a bis mask which per-cpu objects are to free.
- */
-void free_percpu(void *__pdata)
-{
-        if (unlikely(!__pdata))
-                return;
-        __percpu_depopulate_mask(__pdata, cpu_possible_mask);
-        kfree(__percpu_disguise(__pdata));
-}
-EXPORT_SYMBOL_GPL(free_percpu);
-/*
- * Generic percpu area setup.
- */
-#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
-unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(__per_cpu_offset);
-void __init setup_per_cpu_areas(void)
-{
-        unsigned long size, i;
-        char *ptr;
-        unsigned long nr_possible_cpus = num_possible_cpus();
-        /* Copy section for each CPU (we discard the original) */
-        size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
-        ptr = alloc_bootmem_pages(size * nr_possible_cpus);
-        for_each_possible_cpu(i) {
-                __per_cpu_offset[i] = ptr - __per_cpu_start;
-                memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-                ptr += size;
-        }
-}
-#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1065b715ef64..0e8ca0347707 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -604,10 +604,14 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
        /*
         * Finally, kill the kernel threads. We don't need to be RCU
-         * safe anymore, since the bdi is gone from visibility.
+         * safe anymore, since the bdi is gone from visibility. Force
+         * unfreeze of the thread before calling kthread_stop(), otherwise
+         * it would never exit if it is currently stuck in the refrigerator.
         */
-        list_for_each_entry(wb, &bdi->wb_list, list)
+        list_for_each_entry(wb, &bdi->wb_list, list) {
+                thaw_process(wb->task);
                kthread_stop(wb->task);
+        }
 }
 /*
@@ -628,6 +632,8 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
 void bdi_unregister(struct backing_dev_info *bdi)
 {
        if (bdi->dev) {
+                bdi_prune_sb(bdi);
                if (!bdi_cap_flush_forker(bdi))
                        bdi_wb_shutdown(bdi);
                bdi_debug_unregister(bdi);
@@ -697,7 +703,6 @@ void bdi_destroy(struct backing_dev_info *bdi)
                spin_unlock(&inode_lock);
        }
-        bdi_prune_sb(bdi);
        bdi_unregister(bdi);
        for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 555d5d2731c6..7d1486875e1c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -143,6 +143,30 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
        return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
 }
+/**
+ * free_bootmem_late - free bootmem pages directly to page allocator
+ * @addr: starting address of the range
+ * @size: size of the range in bytes
+ *
+ * This is only useful when the bootmem allocator has already been torn
+ * down, but we are still initializing the system.  Pages are given directly
+ * to the page allocator, no bootmem metadata is updated because it is gone.
+ */
+void __init free_bootmem_late(unsigned long addr, unsigned long size)
+{
+        unsigned long cursor, end;
+        kmemleak_free_part(__va(addr), size);
+        cursor = PFN_UP(addr);
+        end = PFN_DOWN(addr + size);
+        for (; cursor < end; cursor++) {
+                __free_pages_bootmem(pfn_to_page(cursor), 0);
+                totalram_pages++;
+        }
+}
 static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
 {
        int aligned;
@@ -408,8 +432,8 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
        return mark_bootmem(start, end, 1, flags);
 }
-static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
+static unsigned long __init align_idx(struct bootmem_data *bdata,
-                        unsigned long step)
+                                      unsigned long idx, unsigned long step)
 {
        unsigned long base = bdata->node_min_pfn;
@@ -421,8 +445,8 @@ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
        return ALIGN(base + idx, step) - base;
 }
-static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
+static unsigned long __init align_off(struct bootmem_data *bdata,
-                        unsigned long align)
+                                      unsigned long off, unsigned long align)
 {
        unsigned long base = PFN_PHYS(bdata->node_min_pfn);
diff --git a/mm/filemap.c b/mm/filemap.c
index ef169f37156d..96ac6b0eb6cb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -260,27 +260,27 @@ int filemap_flush(struct address_space *mapping)
 EXPORT_SYMBOL(filemap_flush);
 /**
- * wait_on_page_writeback_range - wait for writeback to complete
+ * filemap_fdatawait_range - wait for writeback to complete
- * @mapping:    target address_space
+ * @mapping:            address space structure to wait for
- * @start:      beginning page index
+ * @start_byte:         offset in bytes where the range starts
- * @end:        ending page index
+ * @end_byte:           offset in bytes where the range ends (inclusive)
 *
- * Wait for writeback to complete against pages indexed by start->end
+ * Walk the list of under-writeback pages of the given address space
- * inclusive
+ * in the given range and wait for all of them.
 */
-int wait_on_page_writeback_range(struct address_space *mapping,
+int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
-                                pgoff_t start, pgoff_t end)
+                            loff_t end_byte)
 {
+        pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
+        pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
        struct pagevec pvec;
        int nr_pages;
        int ret = 0;
-        pgoff_t index;
-        if (end < start)
+        if (end_byte < start_byte)
                return 0;
        pagevec_init(&pvec, 0);
-        index = start;
        while ((index <= end) &&
                        (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
                        PAGECACHE_TAG_WRITEBACK,
@@ -310,25 +310,6 @@ int wait_on_page_writeback_range(struct address_space *mapping,
        return ret;
 }
-/**
- * filemap_fdatawait_range - wait for all under-writeback pages to complete in a given range
- * @mapping: address space structure to wait for
- * @start:      offset in bytes where the range starts
- * @end:        offset in bytes where the range ends (inclusive)
- *
- * Walk the list of under-writeback pages of the given address space
- * in the given range and wait for all of them.
- *
- * This is just a simple wrapper so that callers don't have to convert offsets
- * to page indexes themselves
- */
-int filemap_fdatawait_range(struct address_space *mapping, loff_t start,
-                            loff_t end)
-{
-        return wait_on_page_writeback_range(mapping, start >> PAGE_CACHE_SHIFT,
-                                            end >> PAGE_CACHE_SHIFT);
-}
 EXPORT_SYMBOL(filemap_fdatawait_range);
 /**
@@ -345,8 +326,7 @@ int filemap_fdatawait(struct address_space *mapping)
        if (i_size == 0)
                return 0;
-        return wait_on_page_writeback_range(mapping, 0,
+        return filemap_fdatawait_range(mapping, 0, i_size - 1);
-                                (i_size - 1) >> PAGE_CACHE_SHIFT);
 }
 EXPORT_SYMBOL(filemap_fdatawait);
@@ -393,9 +373,8 @@ int filemap_write_and_wait_range(struct address_space *mapping,
                                                 WB_SYNC_ALL);
                /* See comment of filemap_write_and_wait() */
                if (err != -EIO) {
-                        int err2 = wait_on_page_writeback_range(mapping,
+                        int err2 = filemap_fdatawait_range(mapping,
-                                                lstart >> PAGE_CACHE_SHIFT,
+                                                lstart, lend);
-                                                lend >> PAGE_CACHE_SHIFT);
                        if (!err)
                                err = err2;
                }
@@ -1844,7 +1823,7 @@ static size_t __iovec_copy_from_user_inatomic(char *vaddr,
 /*
 * Copy as much as we can into the page and return the number of bytes which
- * were sucessfully copied.  If a fault is encountered then return the number of
+ * were successfully copied.  If a fault is encountered then return the number of
 * bytes which were copied.
 */
 size_t iov_iter_copy_from_user_atomic(struct page *page,
@@ -2261,7 +2240,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                size_t count, ssize_t written)
 {
        struct file *file = iocb->ki_filp;
-        struct address_space *mapping = file->f_mapping;
        ssize_t status;
        struct iov_iter i;
@@ -2273,15 +2251,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                *ppos = pos + status;
        }
        
-        /*
-         * If we get here for O_DIRECT writes then we must have fallen through
-         * to buffered writes (block instantiation inside i_size).  So we sync
-         * the file data here, to try to honour O_DIRECT expectations.
-         */
-        if (unlikely(file->f_flags & O_DIRECT) && written)
-                status = filemap_write_and_wait_range(mapping,
-                                        pos, pos + written - 1);
        return written ? written : status;
 }
 EXPORT_SYMBOL(generic_file_buffered_write);
@@ -2380,10 +2349,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
                 * semantics.
                 */
                endbyte = pos + written_buffered - written - 1;
-                err = do_sync_mapping_range(file->f_mapping, pos, endbyte,
+                err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
-                                            SYNC_FILE_RANGE_WAIT_BEFORE|
-                                            SYNC_FILE_RANGE_WRITE|
-                                            SYNC_FILE_RANGE_WAIT_AFTER);
                if (err == 0) {
                        written = written_buffered;
                        invalidate_mapping_pages(mapping,
diff --git a/mm/highmem.c b/mm/highmem.c
index 25878cc49daa..9c1e627f282e 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -426,16 +426,21 @@ void __init page_address_init(void)
 void debug_kmap_atomic(enum km_type type)
 {
-        static unsigned warn_count = 10;
+        static int warn_count = 10;
-        if (unlikely(warn_count == 0))
+        if (unlikely(warn_count < 0))
                return;
        if (unlikely(in_interrupt())) {
-                if (in_irq()) {
+                if (in_nmi()) {
+                        if (type != KM_NMI && type != KM_NMI_PTE) {
+                                WARN_ON(1);
+                                warn_count--;
+                        }
+                } else if (in_irq()) {
                        if (type != KM_IRQ0 && type != KM_IRQ1 &&
                            type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
-                            type != KM_BOUNCE_READ) {
+                            type != KM_BOUNCE_READ && type != KM_IRQ_PTE) {
                                WARN_ON(1);
                                warn_count--;
                        }
@@ -452,7 +457,9 @@ void debug_kmap_atomic(enum km_type type)
        }
        if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
-                        type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
+                        type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ ||
+                        type == KM_IRQ_PTE || type == KM_NMI ||
+                        type == KM_NMI_PTE ) {
                if (!irqs_disabled()) {
                        WARN_ON(1);
                        warn_count--;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5d7601b02874..65f38c218207 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -24,6 +24,7 @@
 #include <asm/io.h>
 #include <linux/hugetlb.h>
+#include <linux/node.h>
 #include "internal.h"
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
@@ -622,42 +623,66 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 }
 /*
- * Use a helper variable to find the next node and then
+ * common helper functions for hstate_next_node_to_{alloc|free}.
- * copy it back to next_nid_to_alloc afterwards:
+ * We may have allocated or freed a huge page based on a different
- * otherwise there's a window in which a racer might
+ * nodes_allowed previously, so h->next_node_to_{alloc|free} might
- * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
+ * be outside of *nodes_allowed.  Ensure that we use an allowed
- * But we don't need to use a spin_lock here: it really
+ * node for alloc or free.
- * doesn't matter if occasionally a racer chooses the
- * same nid as we do.  Move nid forward in the mask even
- * if we just successfully allocated a hugepage so that
- * the next caller gets hugepages on the next node.
 */
-static int hstate_next_node_to_alloc(struct hstate *h)
+static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
 {
-        int next_nid;
+        nid = next_node(nid, *nodes_allowed);
-        next_nid = next_node(h->next_nid_to_alloc, node_online_map);
+        if (nid == MAX_NUMNODES)
-        if (next_nid == MAX_NUMNODES)
+                nid = first_node(*nodes_allowed);
-                next_nid = first_node(node_online_map);
+        VM_BUG_ON(nid >= MAX_NUMNODES);
-        h->next_nid_to_alloc = next_nid;
-        return next_nid;
+        return nid;
+}
+static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+        if (!node_isset(nid, *nodes_allowed))
+                nid = next_node_allowed(nid, nodes_allowed);
+        return nid;
+}
+/*
+ * returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool and advance the
+ * next node from which to allocate, handling wrap at end of node
+ * mask.
+ */
+static int hstate_next_node_to_alloc(struct hstate *h,
+                                        nodemask_t *nodes_allowed)
+{
+        int nid;
+        VM_BUG_ON(!nodes_allowed);
+        nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
+        h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
+        return nid;
 }
-static int alloc_fresh_huge_page(struct hstate *h)
+static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
 {
        struct page *page;
        int start_nid;
        int next_nid;
        int ret = 0;
-        start_nid = h->next_nid_to_alloc;
+        start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
        next_nid = start_nid;
        do {
                page = alloc_fresh_huge_page_node(h, next_nid);
-                if (page)
+                if (page) {
                        ret = 1;
-                next_nid = hstate_next_node_to_alloc(h);
+                        break;
-        } while (!page && next_nid != start_nid);
+                }
+                next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
+        } while (next_nid != start_nid);
        if (ret)
                count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -668,17 +693,21 @@ static int alloc_fresh_huge_page(struct hstate *h)
 }
 /*
- * helper for free_pool_huge_page() - find next node
+ * helper for free_pool_huge_page() - return the previously saved
- * from which to free a huge page
+ * node ["this node"] from which to free a huge page.  Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
 */
-static int hstate_next_node_to_free(struct hstate *h)
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
 {
-        int next_nid;
+        int nid;
-        next_nid = next_node(h->next_nid_to_free, node_online_map);
-        if (next_nid == MAX_NUMNODES)
+        VM_BUG_ON(!nodes_allowed);
-                next_nid = first_node(node_online_map);
-        h->next_nid_to_free = next_nid;
+        nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
-        return next_nid;
+        h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
+        return nid;
 }
 /*
@@ -687,13 +716,14 @@ static int hstate_next_node_to_free(struct hstate *h)
 * balanced over allowed nodes.
 * Called with hugetlb_lock locked.
 */
-static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
+static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+                                                         bool acct_surplus)
 {
        int start_nid;
        int next_nid;
        int ret = 0;
-        start_nid = h->next_nid_to_free;
+        start_nid = hstate_next_node_to_free(h, nodes_allowed);
        next_nid = start_nid;
        do {
@@ -715,9 +745,10 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
                        }
                        update_and_free_page(h, page);
                        ret = 1;
+                        break;
                }
-                next_nid = hstate_next_node_to_free(h);
+                next_nid = hstate_next_node_to_free(h, nodes_allowed);
-        } while (!ret && next_nid != start_nid);
+        } while (next_nid != start_nid);
        return ret;
 }
@@ -911,14 +942,14 @@ static void return_unused_surplus_pages(struct hstate *h,
        /*
         * We want to release as many surplus pages as possible, spread
-         * evenly across all nodes. Iterate across all nodes until we
+         * evenly across all nodes with memory. Iterate across these nodes
-         * can no longer free unreserved surplus pages. This occurs when
+         * until we can no longer free unreserved surplus pages. This occurs
-         * the nodes with surplus pages have no free pages.
+         * when the nodes with surplus pages have no free pages.
-         * free_pool_huge_page() will balance the the frees across the
+         * free_pool_huge_page() will balance the freed pages across the
-         * on-line nodes for us and will handle the hstate accounting.
+         * on-line nodes with memory and will handle the hstate accounting.
         */
        while (nr_pages--) {
-                if (!free_pool_huge_page(h, 1))
+                if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1))
                        break;
        }
 }
@@ -1022,16 +1053,16 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 int __weak alloc_bootmem_huge_page(struct hstate *h)
 {
        struct huge_bootmem_page *m;
-        int nr_nodes = nodes_weight(node_online_map);
+        int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
        while (nr_nodes) {
                void *addr;
                addr = __alloc_bootmem_node_nopanic(
-                                NODE_DATA(h->next_nid_to_alloc),
+                                NODE_DATA(hstate_next_node_to_alloc(h,
+                                                &node_states[N_HIGH_MEMORY])),
                                huge_page_size(h), huge_page_size(h), 0);
-                hstate_next_node_to_alloc(h);
                if (addr) {
                        /*
                         * Use the beginning of the huge page to store the
@@ -1084,7 +1115,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
                if (h->order >= MAX_ORDER) {
                        if (!alloc_bootmem_huge_page(h))
                                break;
-                } else if (!alloc_fresh_huge_page(h))
+                } else if (!alloc_fresh_huge_page(h,
+                                         &node_states[N_HIGH_MEMORY]))
                        break;
        }
        h->max_huge_pages = i;
@@ -1126,14 +1158,15 @@ static void __init report_hugepages(void)
 }
 #ifdef CONFIG_HIGHMEM
-static void try_to_free_low(struct hstate *h, unsigned long count)
+static void try_to_free_low(struct hstate *h, unsigned long count,
+                                                nodemask_t *nodes_allowed)
 {
        int i;
        if (h->order >= MAX_ORDER)
                return;
-        for (i = 0; i < MAX_NUMNODES; ++i) {
+        for_each_node_mask(i, *nodes_allowed) {
                struct page *page, *next;
                struct list_head *freel = &h->hugepage_freelists[i];
                list_for_each_entry_safe(page, next, freel, lru) {
@@ -1149,7 +1182,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count)
        }
 }
 #else
-static inline void try_to_free_low(struct hstate *h, unsigned long count)
+static inline void try_to_free_low(struct hstate *h, unsigned long count,
+                                                nodemask_t *nodes_allowed)
 {
 }
 #endif
@@ -1159,7 +1193,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
 * balanced by operating on them in a round-robin fashion.
 * Returns 1 if an adjustment was made.
 */
-static int adjust_pool_surplus(struct hstate *h, int delta)
+static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
+                                int delta)
 {
        int start_nid, next_nid;
        int ret = 0;
@@ -1167,29 +1202,33 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
        VM_BUG_ON(delta != -1 && delta != 1);
        if (delta < 0)
-                start_nid = h->next_nid_to_alloc;
+                start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
        else
-                start_nid = h->next_nid_to_free;
+                start_nid = hstate_next_node_to_free(h, nodes_allowed);
        next_nid = start_nid;
        do {
                int nid = next_nid;
                if (delta < 0)  {
-                        next_nid = hstate_next_node_to_alloc(h);
                        /*
                         * To shrink on this node, there must be a surplus page
                         */
-                        if (!h->surplus_huge_pages_node[nid])
+                        if (!h->surplus_huge_pages_node[nid]) {
+                                next_nid = hstate_next_node_to_alloc(h,
+                                                                nodes_allowed);
                                continue;
+                        }
                }
                if (delta > 0) {
-                        next_nid = hstate_next_node_to_free(h);
                        /*
                         * Surplus cannot exceed the total number of pages
                         */
                        if (h->surplus_huge_pages_node[nid] >=
-                                                h->nr_huge_pages_node[nid])
+                                                h->nr_huge_pages_node[nid]) {
+                                next_nid = hstate_next_node_to_free(h,
+                                                                nodes_allowed);
                                continue;
+                        }
                }
                h->surplus_huge_pages += delta;
@@ -1202,7 +1241,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
 }
 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
-static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
+static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
+                                                nodemask_t *nodes_allowed)
 {
        unsigned long min_count, ret;
@@ -1222,7 +1262,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
         */
        spin_lock(&hugetlb_lock);
        while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
-                if (!adjust_pool_surplus(h, -1))
+                if (!adjust_pool_surplus(h, nodes_allowed, -1))
                        break;
        }
@@ -1233,11 +1273,14 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
                 * and reducing the surplus.
                 */
                spin_unlock(&hugetlb_lock);
-                ret = alloc_fresh_huge_page(h);
+                ret = alloc_fresh_huge_page(h, nodes_allowed);
                spin_lock(&hugetlb_lock);
                if (!ret)
                        goto out;
+                /* Bail for signals. Probably ctrl-c from user */
+                if (signal_pending(current))
+                        goto out;
        }
        /*
@@ -1257,13 +1300,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
         */
        min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
        min_count = max(count, min_count);
-        try_to_free_low(h, min_count);
+        try_to_free_low(h, min_count, nodes_allowed);
        while (min_count < persistent_huge_pages(h)) {
-                if (!free_pool_huge_page(h, 0))
+                if (!free_pool_huge_page(h, nodes_allowed, 0))
                        break;
        }
        while (count < persistent_huge_pages(h)) {
-                if (!adjust_pool_surplus(h, 1))
+                if (!adjust_pool_surplus(h, nodes_allowed, 1))
                        break;
        }
 out:
@@ -1282,43 +1325,117 @@ out:
 static struct kobject *hugepages_kobj;
 static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
-static struct hstate *kobj_to_hstate(struct kobject *kobj)
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
+/*
+ * kobj_to_hstate - look up the hstate that owns an hstate attribute kobject.
+ * A match in the global hstate_kobjs[] stores NUMA_NO_NODE through a non-NULL
+ * nidp.  Any other kobject is passed on to kobj_to_node_hstate(), which is
+ * forward declared above.
+ */
+static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
 {
        int i;
        for (i = 0; i < HUGE_MAX_HSTATE; i++)
-                if (hstate_kobjs[i] == kobj)
+                if (hstate_kobjs[i] == kobj) {
+                        if (nidp)
+                                *nidp = NUMA_NO_NODE;
                        return &hstates[i];
-        BUG();
+                }
-        return NULL;
+        return kobj_to_node_hstate(kobj, nidp);
 }
-static ssize_t nr_hugepages_show(struct kobject *kobj,
+/*
+ * Common show handler for the nr_hugepages style attributes.  Prints the
+ * global h->nr_huge_pages when kobj_to_hstate() reports NUMA_NO_NODE,
+ * otherwise the h->nr_huge_pages_node[] entry of the reported node.
+ */
+static ssize_t nr_hugepages_show_common(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
 {
-        struct hstate *h = kobj_to_hstate(kobj);
+        struct hstate *h;
-        return sprintf(buf, "%lu\n", h->nr_huge_pages);
+        unsigned long nr_huge_pages;
+        int nid;
+        h = kobj_to_hstate(kobj, &nid);
+        if (nid == NUMA_NO_NODE)
+                nr_huge_pages = h->nr_huge_pages;
+        else
+                nr_huge_pages = h->nr_huge_pages_node[nid];
+        return sprintf(buf, "%lu\n", nr_huge_pages);
 }
-static ssize_t nr_hugepages_store(struct kobject *kobj,
+/*
+ * Common store handler for the nr_hugepages and nr_hugepages_mempolicy
+ * attributes.  Parses the requested page count from @buf and resizes the
+ * persistent pool of the hstate behind @kobj via set_max_huge_pages().
+ *
+ * For a global attribute the pool is adjusted over node_states[N_HIGH_MEMORY],
+ * or over the mask filled in by init_nodemask_of_mempolicy() when
+ * @obey_mempolicy is true and that call succeeds.  For a per node attribute
+ * the count is converted to a global count and alloc/free is restricted to
+ * that node.
+ *
+ * Returns @len on success, or the strict_strtoul() error code when @buf does
+ * not parse as a decimal number.
+ */
+static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
-                struct kobj_attribute *attr, const char *buf, size_t count)
+                        struct kobject *kobj, struct kobj_attribute *attr,
+                        const char *buf, size_t len)
 {
        int err;
-        unsigned long input;
+        int nid;
-        struct hstate *h = kobj_to_hstate(kobj);
+        unsigned long count;
+        struct hstate *h;
+        NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
-        err = strict_strtoul(buf, 10, &input);
+        err = strict_strtoul(buf, 10, &count);
-        if (err)
-                return 0;
+        if (err) {
+                /*
+                 * Drop the mask obtained by NODEMASK_ALLOC() before bailing
+                 * out, and report the parse error instead of returning 0,
+                 * which would neither consume input nor signal a failure.
+                 */
+                NODEMASK_FREE(nodes_allowed);
+                return err;
+        }
-        h->max_huge_pages = set_max_huge_pages(h, input);
+        h = kobj_to_hstate(kobj, &nid);
+        if (nid == NUMA_NO_NODE) {
+                /*
+                 * global hstate attribute
+                 */
+                if (!(obey_mempolicy &&
+                                init_nodemask_of_mempolicy(nodes_allowed))) {
+                        NODEMASK_FREE(nodes_allowed);
+                        nodes_allowed = &node_states[N_HIGH_MEMORY];
+                }
+        } else if (nodes_allowed) {
+                /*
+                 * per node hstate attribute: adjust count to global,
+                 * but restrict alloc/free to the specified node.
+                 */
+                count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
+                init_nodemask_of_node(nodes_allowed, nid);
+        } else {
+                /*
+                 * No mask could be allocated for a per node attribute.
+                 * NOTE(review): count is not converted to a global count on
+                 * this path - confirm this fallback is intended.
+                 */
+                nodes_allowed = &node_states[N_HIGH_MEMORY];
+        }
+        h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
-        return count;
+        /* only free a mask we own, never the shared node_states[] entry */
+        if (nodes_allowed != &node_states[N_HIGH_MEMORY])
+                NODEMASK_FREE(nodes_allowed);
+        return len;
+}
+/* sysfs show for nr_hugepages: forwards to nr_hugepages_show_common(). */
+static ssize_t nr_hugepages_show(struct kobject *kobj,
+                                       struct kobj_attribute *attr, char *buf)
+{
+        return nr_hugepages_show_common(kobj, attr, buf);
+}
+/*
+ * sysfs store for nr_hugepages: forwards to nr_hugepages_store_common() with
+ * obey_mempolicy set to false.
+ */
+static ssize_t nr_hugepages_store(struct kobject *kobj,
+               struct kobj_attribute *attr, const char *buf, size_t len)
+{
+        return nr_hugepages_store_common(false, kobj, attr, buf, len);
 }
 HSTATE_ATTR(nr_hugepages);
+#ifdef CONFIG_NUMA
+/*
+ * hstate attribute for optionally mempolicy-based constraint on persistent
+ * huge page alloc/free.  Reading is identical to nr_hugepages; writing goes
+ * through nr_hugepages_store_common() with obey_mempolicy set to true.
+ */
+static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
+                                       struct kobj_attribute *attr, char *buf)
+{
+        return nr_hugepages_show_common(kobj, attr, buf);
+}
+static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
+               struct kobj_attribute *attr, const char *buf, size_t len)
+{
+        return nr_hugepages_store_common(true, kobj, attr, buf, len);
+}
+HSTATE_ATTR(nr_hugepages_mempolicy);
+#endif
 static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
 {
-        struct hstate *h = kobj_to_hstate(kobj);
+        /* NULL nidp: only the hstate is needed here, not a node id */
+        struct hstate *h = kobj_to_hstate(kobj, NULL);
        return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
 }
 static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
@@ -1326,7 +1443,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
 {
        int err;
        unsigned long input;
-        struct hstate *h = kobj_to_hstate(kobj);
+        struct hstate *h = kobj_to_hstate(kobj, NULL);
        err = strict_strtoul(buf, 10, &input);
        if (err)
@@ -1343,15 +1460,24 @@ HSTATE_ATTR(nr_overcommit_hugepages);
 static ssize_t free_hugepages_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
 {
-        struct hstate *h = kobj_to_hstate(kobj);
+        struct hstate *h;
-        return sprintf(buf, "%lu\n", h->free_huge_pages);
+        unsigned long free_huge_pages;
+        int nid;
+        /*
+         * Report the global free count for NUMA_NO_NODE, otherwise the
+         * free_huge_pages_node[] entry of the node returned in nid.
+         */
+        h = kobj_to_hstate(kobj, &nid);
+        if (nid == NUMA_NO_NODE)
+                free_huge_pages = h->free_huge_pages;
+        else
+                free_huge_pages = h->free_huge_pages_node[nid];
+        return sprintf(buf, "%lu\n", free_huge_pages);
 }
 HSTATE_ATTR_RO(free_hugepages);
 static ssize_t resv_hugepages_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
 {
-        struct hstate *h = kobj_to_hstate(kobj);
+        /* NULL nidp: the reserve count printed below is the global one */
+        struct hstate *h = kobj_to_hstate(kobj, NULL);
        return sprintf(buf, "%lu\n", h->resv_huge_pages);
 }
 HSTATE_ATTR_RO(resv_hugepages);
@@ -1359,8 +1485,17 @@ HSTATE_ATTR_RO(resv_hugepages);
 static ssize_t surplus_hugepages_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
 {
-        struct hstate *h = kobj_to_hstate(kobj);
+        struct hstate *h;
-        return sprintf(buf, "%lu\n", h->surplus_huge_pages);
+        unsigned long surplus_huge_pages;
+        int nid;
+        /*
+         * Report the global surplus count for NUMA_NO_NODE, otherwise the
+         * surplus_huge_pages_node[] entry of the node returned in nid.
+         */
+        h = kobj_to_hstate(kobj, &nid);
+        if (nid == NUMA_NO_NODE)
+                surplus_huge_pages = h->surplus_huge_pages;
+        else
+                surplus_huge_pages = h->surplus_huge_pages_node[nid];
+        return sprintf(buf, "%lu\n", surplus_huge_pages);
 }
 HSTATE_ATTR_RO(surplus_hugepages);
@@ -1370,6 +1505,9 @@ static struct attribute *hstate_attrs[] = {
        &free_hugepages_attr.attr,
        &resv_hugepages_attr.attr,
        &surplus_hugepages_attr.attr,
+#ifdef CONFIG_NUMA
+        &nr_hugepages_mempolicy_attr.attr,
+#endif
        NULL,
 };
@@ -1377,19 +1515,21 @@ static struct attribute_group hstate_attr_group = {
        .attrs = hstate_attrs,
 };
-static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
+/*
+ * Create the kobject named h->name under @parent, record it in
+ * @hstate_kobjs at the index of @h within hstates[], and attach
+ * @hstate_attr_group to it.  The @hstate_kobjs and @hstate_attr_group
+ * parameters shadow the file-scope objects of the same name, so callers
+ * choose which array and attribute group are used.
+ *
+ * Returns 0 on success, -ENOMEM if the kobject cannot be created, or the
+ * sysfs_create_group() error after dropping the new kobject.
+ */
+static int __init hugetlb_sysfs_add_hstate(struct hstate *h,
+                                struct kobject *parent,
+                                struct kobject **hstate_kobjs,
+                                struct attribute_group *hstate_attr_group)
 {
        int retval;
+        int hi = h - hstates;
-        hstate_kobjs[h - hstates] = kobject_create_and_add(h->name,
+        hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
-                                                        hugepages_kobj);
+        if (!hstate_kobjs[hi])
-        if (!hstate_kobjs[h - hstates])
                return -ENOMEM;
-        retval = sysfs_create_group(hstate_kobjs[h - hstates],
+        retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
-                                                        &hstate_attr_group);
        if (retval)
-                kobject_put(hstate_kobjs[h - hstates]);
+                kobject_put(hstate_kobjs[hi]);
        return retval;
 }
@@ -1404,17 +1544,184 @@ static void __init hugetlb_sysfs_init(void)
                return;
        for_each_hstate(h) {
-                err = hugetlb_sysfs_add_hstate(h);
+                err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
+                                         hstate_kobjs, &hstate_attr_group);
                if (err)
                        printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
                                                                h->name);
        }
 }
+#ifdef CONFIG_NUMA
+/*
+ * node_hstate/s - associate per node hstate attributes, via their kobjects,
+ * with node sysdevs in node_devices[] using a parallel array.  The array
+ * index of a node sysdev or _hstate == node id.
+ * This is here to avoid any static dependency of the node sysdev driver, in
+ * the base kernel, on the hugetlb module.
+ *
+ * hugepages_kobj is the per node "hugepages" directory kobject and
+ * hstate_kobjs[] holds one child kobject per hstate, indexed like hstates[].
+ */
+struct node_hstate {
+        struct kobject          *hugepages_kobj;
+        struct kobject          *hstate_kobjs[HUGE_MAX_HSTATE];
+};
+struct node_hstate node_hstates[MAX_NUMNODES];
+/*
+ * A subset of global hstate attributes for node sysdevs: only the
+ * nr_hugepages, free_hugepages and surplus_hugepages attributes are
+ * exposed per node.
+ */
+static struct attribute *per_node_hstate_attrs[] = {
+        &nr_hugepages_attr.attr,
+        &free_hugepages_attr.attr,
+        &surplus_hugepages_attr.attr,
+        NULL,
+};
+static struct attribute_group per_node_hstate_attr_group = {
+        .attrs = per_node_hstate_attrs,
+};
+/*
+ * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj.
+ * Scans node_hstates[] for every node id below nr_node_ids.
+ * Returns node id via non-NULL nidp.  A kobject that is not found in any
+ * node's hstate_kobjs[] triggers BUG().
+ */
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
+{
+        int nid;
+        for (nid = 0; nid < nr_node_ids; nid++) {
+                struct node_hstate *nhs = &node_hstates[nid];
+                int i;
+                for (i = 0; i < HUGE_MAX_HSTATE; i++)
+                        if (nhs->hstate_kobjs[i] == kobj) {
+                                if (nidp)
+                                        *nidp = nid;
+                                return &hstates[i];
+                        }
+        }
+        BUG();
+        return NULL;
+}
+/*
+ * Unregister hstate attributes from a single node sysdev.
+ * No-op if no hstate attributes attached.
+ * Drops every per hstate kobject recorded for the node, then the node's
+ * "hugepages" kobject, clearing each pointer afterwards.
+ */
+void hugetlb_unregister_node(struct node *node)
+{
+        struct hstate *h;
+        struct node_hstate *nhs = &node_hstates[node->sysdev.id];
+        if (!nhs->hugepages_kobj)
+                return;         /* no hstate attributes */
+        for_each_hstate(h)
+                if (nhs->hstate_kobjs[h - hstates]) {
+                        kobject_put(nhs->hstate_kobjs[h - hstates]);
+                        nhs->hstate_kobjs[h - hstates] = NULL;
+                }
+        kobject_put(nhs->hugepages_kobj);
+        nhs->hugepages_kobj = NULL;
+}
+/*
+ * hugetlb module exit:  unregister hstate attributes from node sysdevs
+ * that have them.
+ */
+static void hugetlb_unregister_all_nodes(void)
+{
+        int nid;
+        /*
+         * disable node sysdev registrations by clearing both callbacks.
+         */
+        register_hugetlbfs_with_node(NULL, NULL);
+        /*
+         * remove hstate attributes from any nodes that have them;
+         * hugetlb_unregister_node() is a no-op for the others.
+         */
+        for (nid = 0; nid < nr_node_ids; nid++)
+                hugetlb_unregister_node(&node_devices[nid]);
+}
+/*
+ * Register hstate attributes for a single node sysdev.
+ * No-op if attributes already registered.
+ * Creates a "hugepages" kobject under the node's sysdev kobject and adds one
+ * child per hstate with the per node attribute group.  If any hstate cannot
+ * be added, everything registered for this node is removed again.
+ */
+void hugetlb_register_node(struct node *node)
+{
+        struct hstate *h;
+        struct node_hstate *nhs = &node_hstates[node->sysdev.id];
+        int err;
+        if (nhs->hugepages_kobj)
+                return;         /* already allocated */
+        nhs->hugepages_kobj = kobject_create_and_add("hugepages",
+                                                        &node->sysdev.kobj);
+        if (!nhs->hugepages_kobj)
+                return;
+        for_each_hstate(h) {
+                err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
+                                                nhs->hstate_kobjs,
+                                                &per_node_hstate_attr_group);
+                if (err) {
+                        printk(KERN_ERR "Hugetlb: Unable to add hstate %s"
+                                        " for node %d\n",
+                                                h->name, node->sysdev.id);
+                        hugetlb_unregister_node(node);
+                        break;
+                }
+        }
+}
+/*
+ * hugetlb init time:  register hstate attributes for all registered node
+ * sysdevs of nodes that have memory.  All on-line nodes should have
+ * registered their associated sysdev by this time.
+ * A node is only registered when its node_devices[] entry carries the
+ * matching sysdev id.
+ */
+static void hugetlb_register_all_nodes(void)
+{
+        int nid;
+        for_each_node_state(nid, N_HIGH_MEMORY) {
+                struct node *node = &node_devices[nid];
+                if (node->sysdev.id == nid)
+                        hugetlb_register_node(node);
+        }
+        /*
+         * Let the node sysdev driver know we're here so it can
+         * [un]register hstate attributes on node hotplug.
+         */
+        register_hugetlbfs_with_node(hugetlb_register_node,
+                                     hugetlb_unregister_node);
+}
+#else   /* !CONFIG_NUMA */
+/*
+ * !CONFIG_NUMA stub: there are no per node hstate kobjects, so reaching
+ * this lookup is treated as a bug.
+ */
+static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
+{
+        BUG();
+        if (nidp)
+                *nidp = -1;
+        return NULL;
+}
+/* !CONFIG_NUMA: node registration helpers are empty. */
+static void hugetlb_unregister_all_nodes(void) { }
+static void hugetlb_register_all_nodes(void) { }
+#endif
 static void __exit hugetlb_exit(void)
 {
        struct hstate *h;
+        hugetlb_unregister_all_nodes();
        for_each_hstate(h) {
                kobject_put(hstate_kobjs[h - hstates]);
        }
@@ -1449,6 +1756,8 @@ static int __init hugetlb_init(void)
        hugetlb_sysfs_init();
+        hugetlb_register_all_nodes();
        return 0;
 }
 module_init(hugetlb_init);
@@ -1472,8 +1781,8 @@ void __init hugetlb_add_hstate(unsigned order)
        h->free_huge_pages = 0;
        for (i = 0; i < MAX_NUMNODES; ++i)
                INIT_LIST_HEAD(&h->hugepage_freelists[i]);
-        h->next_nid_to_alloc = first_node(node_online_map);
+        h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]);
-        h->next_nid_to_free = first_node(node_online_map);
+        h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]);
        snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
                                        huge_page_size(h)/1024);
@@ -1536,9 +1845,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 }
 #ifdef CONFIG_SYSCTL
-int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
-                           void __user *buffer,
+                         struct ctl_table *table, int write,
-                           size_t *length, loff_t *ppos)
+                         void __user *buffer, size_t *length, loff_t *ppos)
 {
        struct hstate *h = &default_hstate;
        unsigned long tmp;
@@ -1550,12 +1859,40 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
        table->maxlen = sizeof(unsigned long);
        proc_doulongvec_minmax(table, write, buffer, length, ppos);
-        if (write)
+        if (write) {
-                h->max_huge_pages = set_max_huge_pages(h, tmp);
+                NODEMASK_ALLOC(nodemask_t, nodes_allowed,
+                                                GFP_KERNEL | __GFP_NORETRY);
+                if (!(obey_mempolicy &&
+                               init_nodemask_of_mempolicy(nodes_allowed))) {
+                        NODEMASK_FREE(nodes_allowed);
+                        nodes_allowed = &node_states[N_HIGH_MEMORY];
+                }
+                h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed);
+                if (nodes_allowed != &node_states[N_HIGH_MEMORY])
+                        NODEMASK_FREE(nodes_allowed);
+        }
        return 0;
 }
+/*
+ * sysctl handler that forwards to hugetlb_sysctl_handler_common() with
+ * obey_mempolicy set to false.
+ */
+int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+                          void __user *buffer, size_t *length, loff_t *ppos)
+{
+        return hugetlb_sysctl_handler_common(false, table, write,
+                                                        buffer, length, ppos);
+}
+#ifdef CONFIG_NUMA
+/*
+ * sysctl handler that forwards to hugetlb_sysctl_handler_common() with
+ * obey_mempolicy set to true.
+ */
+int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
+                          void __user *buffer, size_t *length, loff_t *ppos)
+{
+        return hugetlb_sysctl_handler_common(true, table, write,
+                                                        buffer, length, ppos);
+}
+#endif /* CONFIG_NUMA */
 int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
                        void __user *buffer,
                        size_t *length, loff_t *ppos)
@@ -1903,6 +2240,12 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
                + (vma->vm_pgoff >> PAGE_SHIFT);
        mapping = (struct address_space *)page_private(page);
+        /*
+         * Take the mapping lock for the duration of the table walk. As
+         * this mapping should be shared between all the VMAs,
+         * __unmap_hugepage_range() is called directly because the lock
+         * is already held.
+         */
+        spin_lock(&mapping->i_mmap_lock);
        vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
                /* Do not unmap the current VMA */
                if (iter_vma == vma)
@@ -1916,10 +2259,11 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
                 * from the time of fork. This would look like data corruption
                 */
                if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
-                        unmap_hugepage_range(iter_vma,
+                        __unmap_hugepage_range(iter_vma,
                                address, address + huge_page_size(h),
                                page);
        }
+        spin_unlock(&mapping->i_mmap_lock);
        return 1;
 }
@@ -1959,6 +2303,9 @@ retry_avoidcopy:
                outside_reserve = 1;
        page_cache_get(old_page);
+        /* Drop page_table_lock as buddy allocator may be called */
+        spin_unlock(&mm->page_table_lock);
        new_page = alloc_huge_page(vma, address, outside_reserve);
        if (IS_ERR(new_page)) {
@@ -1976,19 +2323,25 @@ retry_avoidcopy:
                        if (unmap_ref_private(mm, vma, old_page, address)) {
                                BUG_ON(page_count(old_page) != 1);
                                BUG_ON(huge_pte_none(pte));
+                                spin_lock(&mm->page_table_lock);
                                goto retry_avoidcopy;
                        }
                        WARN_ON_ONCE(1);
                }
+                /* Caller expects lock to be held */
+                spin_lock(&mm->page_table_lock);
                return -PTR_ERR(new_page);
        }
-        spin_unlock(&mm->page_table_lock);
        copy_huge_page(new_page, old_page, address, vma);
        __SetPageUptodate(new_page);
-        spin_lock(&mm->page_table_lock);
+        /*
+         * Retake the page_table_lock to check for racing updates
+         * before the page tables are altered
+         */
+        spin_lock(&mm->page_table_lock);
        ptep = huge_pte_offset(mm, address & huge_page_mask(h));
        if (likely(pte_same(huge_ptep_get(ptep), pte))) {
                /* Break COW */
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index e1d85137f086..10ea71905c1f 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -3,18 +3,68 @@
 #include <linux/debugfs.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include "internal.h"
-static struct dentry *hwpoison_dir, *corrupt_pfn;
+static struct dentry *hwpoison_dir;
 static int hwpoison_inject(void *data, u64 val)
 {
+        /*
+         * debugfs write handler: inject a memory failure at pfn @val.
+         * Requires CAP_SYS_ADMIN.  When hwpoison_filter_enable is set, the
+         * page is pinned and checked with hwpoison_filter() first; pages
+         * that cannot be pinned, are not on the LRU, or are rejected by the
+         * filter are skipped and 0 is returned.  Returns -EPERM, -ENXIO or
+         * the result of __memory_failure() otherwise.
+         */
+        unsigned long pfn = val;
+        struct page *p;
+        int flags = 0;
+        int err;
+        if (!capable(CAP_SYS_ADMIN))
+                return -EPERM;
+        /* No filtering: inject directly, without holding a page reference */
+        if (!hwpoison_filter_enable)
+                goto inject;
+        if (!pfn_valid(pfn))
+                return -ENXIO;
+        p = pfn_to_page(pfn);
+        /*
+         * A page whose refcount is zero cannot be pinned, which means
+         * free buddy pages cannot be filtered and are skipped.
+         */
+        if (!get_page_unless_zero(p))
+                return 0;
+        /*
+         * We now hold a reference: it is either handed to
+         * __memory_failure() via MF_COUNT_INCREASED or dropped at put_out.
+         */
+        flags = MF_COUNT_INCREASED;
+        if (!PageLRU(p))
+                shake_page(p, 0);
+        /*
+         * Pages that are still not on the LRU are not supported.
+         */
+        if (!PageLRU(p))
+                goto put_out;
+        /*
+         * do a racy check with elevated page count, to make sure PG_hwpoison
+         * will only be set for the targeted owner (or on a free page).
+         * We temporarily take page lock for try_get_mem_cgroup_from_page().
+         * __memory_failure() will redo the check reliably inside page lock.
+         */
+        lock_page(p);
+        err = hwpoison_filter(p);
+        unlock_page(p);
+        if (err)
+                goto put_out;
+inject:
+        printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn);
+        return __memory_failure(pfn, 18, flags);
+put_out:
+        /* page was filtered out: release the reference taken above */
+        put_page(p);
+        return 0;
+}
+/*
+ * debugfs write handler: pass @val to unpoison_memory().
+ * Requires CAP_SYS_ADMIN, otherwise -EPERM is returned.
+ */
+static int hwpoison_unpoison(void *data, u64 val)
+{
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
-        printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val);
-        return __memory_failure(val, 18, 0);
+        return unpoison_memory(val);
 }
 DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n");
+DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n");
 static void pfn_inject_exit(void)
 {
@@ -24,16 +74,63 @@ static void pfn_inject_exit(void)
 static int pfn_inject_init(void)
 {
+        struct dentry *dentry;
        hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
        if (hwpoison_dir == NULL)
                return -ENOMEM;
-        corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
+        /*
+         * Note that the poison/unpoison interfaces below do not involve
+         * any hardware status change, hence do not require hardware support.
+         * They are mainly for testing hwpoison at the software level.
+         * Every file is created inside hwpoison_dir; the first creation
+         * that fails jumps to the common fail path.
+         */
+        dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
                                          NULL, &hwpoison_fops);
-        if (corrupt_pfn == NULL) {
+        if (!dentry)
-                pfn_inject_exit();
+                goto fail;
-                return -ENOMEM;
-        }
+        dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir,
+                                     NULL, &unpoison_fops);
+        if (!dentry)
+                goto fail;
+        dentry = debugfs_create_u32("corrupt-filter-enable", 0600,
+                                    hwpoison_dir, &hwpoison_filter_enable);
+        if (!dentry)
+                goto fail;
+        dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600,
+                                    hwpoison_dir, &hwpoison_filter_dev_major);
+        if (!dentry)
+                goto fail;
+        dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600,
+                                    hwpoison_dir, &hwpoison_filter_dev_minor);
+        if (!dentry)
+                goto fail;
+        dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600,
+                                    hwpoison_dir, &hwpoison_filter_flags_mask);
+        if (!dentry)
+                goto fail;
+        dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600,
+                                    hwpoison_dir, &hwpoison_filter_flags_value);
+        if (!dentry)
+                goto fail;
+#ifdef  CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+        dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
+                                    hwpoison_dir, &hwpoison_filter_memcg);
+        if (!dentry)
+                goto fail;
+#endif
        return 0;
+fail:
+        /* undo via the module exit helper and report the failure */
+        pfn_inject_exit();
+        return -ENOMEM;
 }
 module_init(pfn_inject_init);
diff --git a/mm/internal.h b/mm/internal.h
index 22ec8d2b0fb8..6a697bb97fc5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -50,6 +50,9 @@ extern void putback_lru_page(struct page *page);
 */
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
 extern void prep_compound_page(struct page *page, unsigned long order);
+#ifdef CONFIG_MEMORY_FAILURE
+extern bool is_free_buddy_page(struct page *page);
+#endif
 /*
@@ -63,7 +66,7 @@ static inline unsigned long page_order(struct page *page)
        return page_private(page);
 }
-#ifdef CONFIG_HAVE_MLOCK
+#ifdef CONFIG_MMU
 extern long mlock_vma_pages_range(struct vm_area_struct *vma,
                        unsigned long start, unsigned long end);
 extern void munlock_vma_pages_range(struct vm_area_struct *vma,
@@ -72,22 +75,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
 {
        munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
 }
-#endif
 /*
- * unevictable_migrate_page() called only from migrate_page_copy() to
- * migrate unevictable flag to new page.
- * Note that the old page has been isolated from the LRU lists at this
- * point so we don't need to worry about LRU statistics.
- */
-static inline void unevictable_migrate_page(struct page *new, struct page *old)
-{
-        if (TestClearPageUnevictable(old))
-                SetPageUnevictable(new);
-}
-#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
-/*
 * Called only in fault path via page_evictable() for a new page
 * to determine if it's being mapped into a LOCKED vma.
 * If so, mark page as mlocked.
@@ -107,9 +96,10 @@ static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
 }
 /*
- * must be called with vma's mmap_sem held for read, and page locked.
+ * must be called with vma's mmap_sem held for read or write, and page locked.
 */
 extern void mlock_vma_page(struct page *page);
+extern void munlock_vma_page(struct page *page);
 /*
 * Clear the page's PageMlocked().  This can be useful in a situation where
@@ -144,7 +134,7 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page)
        }
 }
-#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
+#else /* !CONFIG_MMU */
 static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
 {
        return 0;
@@ -153,7 +143,7 @@ static inline void clear_page_mlock(struct page *page) { }
 static inline void mlock_vma_page(struct page *page) { }
 static inline void mlock_migrate_page(struct page *new, struct page *old) { }
-#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
+#endif /* !CONFIG_MMU */
 /*
 * Return the mem_map entry representing the 'offset' subpage within
@@ -260,3 +250,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 #define ZONE_RECLAIM_SOME       0
 #define ZONE_RECLAIM_SUCCESS    1
 #endif
+extern int hwpoison_filter(struct page *p);
+extern u32 hwpoison_filter_dev_major;
+extern u32 hwpoison_filter_dev_minor;
+extern u64 hwpoison_filter_flags_mask;
+extern u64 hwpoison_filter_flags_value;
+extern u64 hwpoison_filter_memcg;
+extern u32 hwpoison_filter_enable;
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 8bf765c4f58d..5b069e4f5e48 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -93,6 +93,7 @@
 #include <linux/nodemask.h>
 #include <linux/mm.h>
 #include <linux/workqueue.h>
+#include <linux/crc32.h>
 #include <asm/sections.h>
 #include <asm/processor.h>
@@ -108,7 +109,6 @@
 #define MSECS_MIN_AGE           5000    /* minimum object age for reporting */
 #define SECS_FIRST_SCAN         60      /* delay before the first scan */
 #define SECS_SCAN_WAIT          600     /* subsequent auto scanning delay */
-#define GRAY_LIST_PASSES        25      /* maximum number of gray list scans */
 #define MAX_SCAN_SIZE           4096    /* maximum size of a scanned block */
 #define BYTES_PER_POINTER       sizeof(void *)
@@ -119,8 +119,8 @@
 /* scanning area inside a memory block */
 struct kmemleak_scan_area {
        struct hlist_node node;
-        unsigned long offset;
+        unsigned long start;
-        size_t length;
+        size_t size;
 };
 #define KMEMLEAK_GREY   0
@@ -149,6 +149,8 @@ struct kmemleak_object {
        int min_count;
        /* the total number of pointers found pointing to this object */
        int count;
+        /* checksum for detecting modified objects */
+        u32 checksum;
        /* memory ranges to be scanned inside an object (empty for all) */
        struct hlist_head area_list;
        unsigned long trace[MAX_TRACE];
@@ -164,8 +166,6 @@ struct kmemleak_object {
 #define OBJECT_REPORTED         (1 << 1)
 /* flag set to not scan the object */
 #define OBJECT_NO_SCAN          (1 << 2)
-/* flag set on newly allocated objects */
-#define OBJECT_NEW              (1 << 3)
 /* number of bytes to print per line; must be 16 or 32 */
 #define HEX_ROW_SIZE            16
@@ -241,8 +241,6 @@ struct early_log {
        const void *ptr;                /* allocated/freed memory block */
        size_t size;                    /* memory block size */
        int min_count;                  /* minimum reference count */
-        unsigned long offset;           /* scan area offset */
-        size_t length;                  /* scan area length */
        unsigned long trace[MAX_TRACE]; /* stack trace */
        unsigned int trace_len;         /* stack trace length */
 };
@@ -323,11 +321,6 @@ static bool color_gray(const struct kmemleak_object *object)
                object->count >= object->min_count;
 }
-static bool color_black(const struct kmemleak_object *object)
-{
-        return object->min_count == KMEMLEAK_BLACK;
-}
 /*
 * Objects are considered unreferenced only if their color is white, they have
 * not be deleted and have a minimum age to avoid false positives caused by
@@ -335,7 +328,7 @@ static bool color_black(const struct kmemleak_object *object)
 */
 static bool unreferenced_object(struct kmemleak_object *object)
 {
-        return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
+        return (color_white(object) && object->flags & OBJECT_ALLOCATED) &&
                time_before_eq(object->jiffies + jiffies_min_age,
                               jiffies_last_scan);
 }
@@ -348,11 +341,13 @@ static void print_unreferenced(struct seq_file *seq,
                               struct kmemleak_object *object)
 {
        int i;
+        unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);
        seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
                   object->pointer, object->size);
-        seq_printf(seq, "  comm \"%s\", pid %d, jiffies %lu\n",
+        seq_printf(seq, "  comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
-                   object->comm, object->pid, object->jiffies);
+                   object->comm, object->pid, object->jiffies,
+                   msecs_age / 1000, msecs_age % 1000);
        hex_dump_object(seq, object);
        seq_printf(seq, "  backtrace:\n");
@@ -381,6 +376,7 @@ static void dump_object_info(struct kmemleak_object *object)
        pr_notice("  min_count = %d\n", object->min_count);
        pr_notice("  count = %d\n", object->count);
        pr_notice("  flags = 0x%lx\n", object->flags);
+        pr_notice("  checksum = %u\n", object->checksum);
        pr_notice("  backtrace:\n");
        print_stack_trace(&trace, 4);
 }
@@ -522,12 +518,13 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
        INIT_HLIST_HEAD(&object->area_list);
        spin_lock_init(&object->lock);
        atomic_set(&object->use_count, 1);
-        object->flags = OBJECT_ALLOCATED | OBJECT_NEW;
+        object->flags = OBJECT_ALLOCATED;
        object->pointer = ptr;
        object->size = size;
        object->min_count = min_count;
-        object->count = -1;                     /* no color initially */
+        object->count = 0;                      /* white color initially */
        object->jiffies = jiffies;
+        object->checksum = 0;
        /* task information */
        if (in_irq()) {
@@ -720,14 +717,13 @@ static void make_black_object(unsigned long ptr)
 * Add a scanning area to the object. If at least one such area is added,
 * kmemleak will only scan these ranges rather than the whole memory block.
 */
-static void add_scan_area(unsigned long ptr, unsigned long offset,
+static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
-                          size_t length, gfp_t gfp)
 {
        unsigned long flags;
        struct kmemleak_object *object;
        struct kmemleak_scan_area *area;
-        object = find_and_get_object(ptr, 0);
+        object = find_and_get_object(ptr, 1);
        if (!object) {
                kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n",
                              ptr);
@@ -741,7 +737,7 @@ static void add_scan_area(unsigned long ptr, unsigned long offset,
        }
        spin_lock_irqsave(&object->lock, flags);
-        if (offset + length > object->size) {
+        if (ptr + size > object->pointer + object->size) {
                kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
                dump_object_info(object);
                kmem_cache_free(scan_area_cache, area);
@@ -749,8 +745,8 @@ static void add_scan_area(unsigned long ptr, unsigned long offset,
        }
        INIT_HLIST_NODE(&area->node);
-        area->offset = offset;
+        area->start = ptr;
-        area->length = length;
+        area->size = size;
        hlist_add_head(&area->node, &object->area_list);
 out_unlock:
@@ -786,7 +782,7 @@ static void object_no_scan(unsigned long ptr)
 * processed later once kmemleak is fully initialized.
 */
 static void __init log_early(int op_type, const void *ptr, size_t size,
-                             int min_count, unsigned long offset, size_t length)
+                             int min_count)
 {
        unsigned long flags;
        struct early_log *log;
@@ -808,8 +804,6 @@ static void __init log_early(int op_type, const void *ptr, size_t size,
        log->ptr = ptr;
        log->size = size;
        log->min_count = min_count;
-        log->offset = offset;
-        log->length = length;
        if (op_type == KMEMLEAK_ALLOC)
                log->trace_len = __save_stack_trace(log->trace);
        crt_early_log++;
@@ -858,7 +852,7 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
        if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
                create_object((unsigned long)ptr, size, min_count, gfp);
        else if (atomic_read(&kmemleak_early_log))
-                log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0);
+                log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
 }
 EXPORT_SYMBOL_GPL(kmemleak_alloc);
@@ -873,7 +867,7 @@ void __ref kmemleak_free(const void *ptr)
        if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
                delete_object_full((unsigned long)ptr);
        else if (atomic_read(&kmemleak_early_log))
-                log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0);
+                log_early(KMEMLEAK_FREE, ptr, 0, 0);
 }
 EXPORT_SYMBOL_GPL(kmemleak_free);
@@ -888,7 +882,7 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
        if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
                delete_object_part((unsigned long)ptr, size);
        else if (atomic_read(&kmemleak_early_log))
-                log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0);
+                log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
 }
 EXPORT_SYMBOL_GPL(kmemleak_free_part);
@@ -903,7 +897,7 @@ void __ref kmemleak_not_leak(const void *ptr)
        if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
                make_gray_object((unsigned long)ptr);
        else if (atomic_read(&kmemleak_early_log))
-                log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0);
+                log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
 }
 EXPORT_SYMBOL(kmemleak_not_leak);
@@ -919,22 +913,21 @@ void __ref kmemleak_ignore(const void *ptr)
        if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
                make_black_object((unsigned long)ptr);
        else if (atomic_read(&kmemleak_early_log))
-                log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0);
+                log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
 }
 EXPORT_SYMBOL(kmemleak_ignore);
 /*
 * Limit the range to be scanned in an allocated memory block.
 */
-void __ref kmemleak_scan_area(const void *ptr, unsigned long offset,
+void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
-                              size_t length, gfp_t gfp)
 {
        pr_debug("%s(0x%p)\n", __func__, ptr);
        if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
-                add_scan_area((unsigned long)ptr, offset, length, gfp);
+                add_scan_area((unsigned long)ptr, size, gfp);
        else if (atomic_read(&kmemleak_early_log))
-                log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length);
+                log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
 }
 EXPORT_SYMBOL(kmemleak_scan_area);
@@ -948,11 +941,25 @@ void __ref kmemleak_no_scan(const void *ptr)
        if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
                object_no_scan((unsigned long)ptr);
        else if (atomic_read(&kmemleak_early_log))
-                log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0);
+                log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
 }
 EXPORT_SYMBOL(kmemleak_no_scan);
 /*
+ * Update an object's checksum and return true if it was modified.
+ */
+static bool update_checksum(struct kmemleak_object *object)
+{
+        u32 new_csum;
+        bool changed;
+        /* without this check passing, keep the stored value and report "unchanged" */
+        if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
+                return false;
+        /* checksum the whole block, from its start address over its full size */
+        new_csum = crc32(0, (void *)object->pointer, object->size);
+        changed = (new_csum != object->checksum);
+        /* always store the fresh value so the next call compares against it */
+        object->checksum = new_csum;
+        return changed;
+}
+/*
 * Memory scanning is a long process and it needs to be interruptable. This
 * function checks whether such interrupt condition occured.
 */
@@ -1031,11 +1038,14 @@ static void scan_block(void *_start, void *_end,
                 * added to the gray_list.
                 */
                object->count++;
-                if (color_gray(object))
+                if (color_gray(object)) {
                        list_add_tail(&object->gray_list, &gray_list);
-                else
+                        spin_unlock_irqrestore(&object->lock, flags);
-                        put_object(object);
+                        continue;
+                }
                spin_unlock_irqrestore(&object->lock, flags);
+                put_object(object);
        }
 }
@@ -1050,8 +1060,8 @@ static void scan_object(struct kmemleak_object *object)
        unsigned long flags;
        /*
-         * Once the object->lock is aquired, the corresponding memory block
+         * Once the object->lock is acquired, the corresponding memory block
-         * cannot be freed (the same lock is aquired in delete_object).
+         * cannot be freed (the same lock is acquired in delete_object).
         */
        spin_lock_irqsave(&object->lock, flags);
        if (object->flags & OBJECT_NO_SCAN)
@@ -1075,14 +1085,47 @@ static void scan_object(struct kmemleak_object *object)
                }
        } else
                hlist_for_each_entry(area, elem, &object->area_list, node)
-                        scan_block((void *)(object->pointer + area->offset),
+                        scan_block((void *)area->start,
-                                   (void *)(object->pointer + area->offset
+                                   (void *)(area->start + area->size),
-                                            + area->length), object, 0);
+                                   object, 0);
 out:
        spin_unlock_irqrestore(&object->lock, flags);
 }
 /*
+ * Scan the objects already referenced (gray objects). More objects will be
+ * referenced and, if there are no memory leaks, all the objects are scanned.
+ */
+static void scan_gray_list(void)
+{
+        struct kmemleak_object *object, *tmp;
+        /*
+         * Walk gray_list from its head until the walk wraps back to the
+         * list head. Objects appended to the tail during the walk are still
+         * visited, and every visited object is unlinked before moving on.
+         * The objects cannot be freed from outside the loop because their
+         * use_count was incremented when they were put on the list.
+         */
+        for (object = list_entry(gray_list.next, typeof(*object), gray_list);
+             &object->gray_list != &gray_list;
+             object = tmp) {
+                cond_resched();
+                /* scanning may append further objects to the list */
+                if (!scan_should_stop())
+                        scan_object(object);
+                /* look up the successor only after the scan above has run */
+                tmp = list_entry(object->gray_list.next, typeof(*object),
+                                 gray_list);
+                /* unlink the object and release it */
+                list_del(&object->gray_list);
+                put_object(object);
+        }
+        WARN_ON(!list_empty(&gray_list));
+}
+/*
 * Scan data sections and all the referenced memory blocks allocated via the
 * kernel's standard allocators. This function must be called with the
 * scan_mutex held.
@@ -1090,10 +1133,9 @@ out:
 static void kmemleak_scan(void)
 {
        unsigned long flags;
-        struct kmemleak_object *object, *tmp;
+        struct kmemleak_object *object;
        int i;
        int new_leaks = 0;
-        int gray_list_pass = 0;
        jiffies_last_scan = jiffies;
@@ -1114,7 +1156,6 @@ static void kmemleak_scan(void)
 #endif
                /* reset the reference count (whiten the object) */
                object->count = 0;
-                object->flags &= ~OBJECT_NEW;
                if (color_gray(object) && get_object(object))
                        list_add_tail(&object->gray_list, &gray_list);
@@ -1172,62 +1213,36 @@ static void kmemleak_scan(void)
        /*
         * Scan the objects already referenced from the sections scanned
-         * above. More objects will be referenced and, if there are no memory
+         * above.
-         * leaks, all the objects will be scanned. The list traversal is safe
-         * for both tail additions and removals from inside the loop. The
-         * kmemleak objects cannot be freed from outside the loop because their
-         * use_count was increased.
         */
-repeat:
+        scan_gray_list();
-        object = list_entry(gray_list.next, typeof(*object), gray_list);
-        while (&object->gray_list != &gray_list) {
-                cond_resched();
-                /* may add new objects to the list */
-                if (!scan_should_stop())
-                        scan_object(object);
-                tmp = list_entry(object->gray_list.next, typeof(*object),
-                                 gray_list);
-                /* remove the object from the list and release it */
-                list_del(&object->gray_list);
-                put_object(object);
-                object = tmp;
-        }
-        if (scan_should_stop() || ++gray_list_pass >= GRAY_LIST_PASSES)
-                goto scan_end;
        /*
-         * Check for new objects allocated during this scanning and add them
+         * Check for new or unreferenced objects modified since the previous
-         * to the gray list.
+         * scan and color them gray until the next scan.
         */
        rcu_read_lock();
        list_for_each_entry_rcu(object, &object_list, object_list) {
                spin_lock_irqsave(&object->lock, flags);
-                if ((object->flags & OBJECT_NEW) && !color_black(object) &&
+                if (color_white(object) && (object->flags & OBJECT_ALLOCATED)
-                    get_object(object)) {
+                    && update_checksum(object) && get_object(object)) {
-                        object->flags &= ~OBJECT_NEW;
+                        /* color it gray temporarily */
+                        object->count = object->min_count;
                        list_add_tail(&object->gray_list, &gray_list);
                }
                spin_unlock_irqrestore(&object->lock, flags);
        }
        rcu_read_unlock();
-        if (!list_empty(&gray_list))
+        /*
-                goto repeat;
+         * Re-scan the gray list for modified unreferenced objects.
+         */
-scan_end:
+        scan_gray_list();
-        WARN_ON(!list_empty(&gray_list));
        /*
-         * If scanning was stopped or new objects were being allocated at a
+         * If scanning was stopped do not report any new unreferenced objects.
-         * higher rate than gray list scanning, do not report any new
-         * unreferenced objects.
         */
-        if (scan_should_stop() || gray_list_pass >= GRAY_LIST_PASSES)
+        if (scan_should_stop())
                return;
        /*
@@ -1642,8 +1657,7 @@ void __init kmemleak_init(void)
                        kmemleak_ignore(log->ptr);
                        break;
                case KMEMLEAK_SCAN_AREA:
-                        kmemleak_scan_area(log->ptr, log->offset, log->length,
+                        kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
-                                           GFP_KERNEL);
                        break;
                case KMEMLEAK_NO_SCAN:
                        kmemleak_no_scan(log->ptr);
diff --git a/mm/ksm.c b/mm/ksm.c
index bef1af4f77e3..56a0da1f9979 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -29,11 +29,13 @@
 #include <linux/wait.h>
 #include <linux/slab.h>
 #include <linux/rbtree.h>
+#include <linux/memory.h>
 #include <linux/mmu_notifier.h>
 #include <linux/swap.h>
 #include <linux/ksm.h>
 #include <asm/tlbflush.h>
+#include "internal.h"
 /*
 * A few notes about the KSM scanning process,
@@ -79,13 +81,13 @@
 * struct mm_slot - ksm information per mm that is being scanned
 * @link: link to the mm_slots hash list
 * @mm_list: link into the mm_slots list, rooted in ksm_mm_head
- * @rmap_list: head for this mm_slot's list of rmap_items
+ * @rmap_list: head for this mm_slot's singly-linked list of rmap_items
 * @mm: the mm that this information is valid for
 */
 struct mm_slot {
        struct hlist_node link;
        struct list_head mm_list;
-        struct list_head rmap_list;
+        struct rmap_item *rmap_list;
        struct mm_struct *mm;
 };
@@ -93,7 +95,7 @@ struct mm_slot {
 * struct ksm_scan - cursor for scanning
 * @mm_slot: the current mm_slot we are scanning
 * @address: the next address inside that to be scanned
- * @rmap_item: the current rmap that we are scanning inside the rmap_list
+ * @rmap_list: link to the next rmap to be scanned in the rmap_list
 * @seqnr: count of completed full scans (needed when removing unstable node)
 *
 * There is only the one ksm_scan instance of this cursor structure.
@@ -101,37 +103,51 @@ struct mm_slot {
 struct ksm_scan {
        struct mm_slot *mm_slot;
        unsigned long address;
-        struct rmap_item *rmap_item;
+        struct rmap_item **rmap_list;
        unsigned long seqnr;
 };
 /**
+ * struct stable_node - node of the stable rbtree
+ * @node: rb node of this ksm page in the stable tree
+ * @hlist: hlist head of rmap_items using this ksm page
+ * @kpfn: page frame number of this ksm page
+ */
+struct stable_node {
+        struct rb_node node;            /* linkage in the stable rbtree */
+        struct hlist_head hlist;        /* rmap_items using this ksm page */
+        unsigned long kpfn;             /* pfn of this ksm page */
+};
+/**
 * struct rmap_item - reverse mapping item for virtual addresses
- * @link: link into mm_slot's rmap_list (rmap_list is per mm)
+ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
 * @mm: the memory structure this rmap_item is pointing into
 * @address: the virtual address this rmap_item tracks (+ flags in low bits)
 * @oldchecksum: previous checksum of the page at that virtual address
- * @node: rb_node of this rmap_item in either unstable or stable tree
+ * @node: rb node of this rmap_item in the unstable tree
- * @next: next rmap_item hanging off the same node of the stable tree
+ * @head: pointer to stable_node heading this list in the stable tree
- * @prev: previous rmap_item hanging off the same node of the stable tree
+ * @hlist: link into hlist of rmap_items hanging off that stable_node
 */
 struct rmap_item {
-        struct list_head link;
+        struct rmap_item *rmap_list;
+        struct anon_vma *anon_vma;      /* when stable */
        struct mm_struct *mm;
        unsigned long address;          /* + low bits used for flags below */
+        unsigned int oldchecksum;       /* when unstable */
        union {
-                unsigned int oldchecksum;               /* when unstable */
+                struct rb_node node;    /* when node of unstable tree */
-                struct rmap_item *next;                 /* when stable */
+                struct {                /* when listed from stable tree */
-        };
+                        struct stable_node *head;
-        union {
+                        struct hlist_node hlist;
-                struct rb_node node;                    /* when tree node */
+                };
-                struct rmap_item *prev;                 /* in stable list */
        };
 };
 #define SEQNR_MASK      0x0ff   /* low bits of unstable tree seqnr */
-#define NODE_FLAG       0x100   /* is a node of unstable or stable tree */
+#define UNSTABLE_FLAG   0x100   /* is a node of the unstable tree */
-#define STABLE_FLAG     0x200   /* is a node or list item of stable tree */
+#define STABLE_FLAG     0x200   /* is listed from the stable tree */
 /* The stable and unstable tree heads */
 static struct rb_root root_stable_tree = RB_ROOT;
@@ -148,6 +164,7 @@ static struct ksm_scan ksm_scan = {
 };
 static struct kmem_cache *rmap_item_cache;
+static struct kmem_cache *stable_node_cache;
 static struct kmem_cache *mm_slot_cache;
 /* The number of nodes in the stable tree */
@@ -162,9 +179,6 @@ static unsigned long ksm_pages_unshared;
 /* The number of rmap_items in use: to calculate pages_volatile */
 static unsigned long ksm_rmap_items;
-/* Limit on the number of unswappable pages used */
-static unsigned long ksm_max_kernel_pages;
 /* Number of pages ksmd should scan in one batch */
 static unsigned int ksm_thread_pages_to_scan = 100;
@@ -190,13 +204,19 @@ static int __init ksm_slab_init(void)
        if (!rmap_item_cache)
                goto out;
+        stable_node_cache = KSM_KMEM_CACHE(stable_node, 0);
+        if (!stable_node_cache)
+                goto out_free1;
        mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0);
        if (!mm_slot_cache)
-                goto out_free;
+                goto out_free2;
        return 0;
-out_free:
+out_free2:
+        kmem_cache_destroy(stable_node_cache);
+out_free1:
        kmem_cache_destroy(rmap_item_cache);
 out:
        return -ENOMEM;
@@ -205,6 +225,7 @@ out:
 static void __init ksm_slab_free(void)
 {
        kmem_cache_destroy(mm_slot_cache);
+        kmem_cache_destroy(stable_node_cache);
        kmem_cache_destroy(rmap_item_cache);
        mm_slot_cache = NULL;
 }
@@ -226,6 +247,16 @@ static inline void free_rmap_item(struct rmap_item *rmap_item)
        kmem_cache_free(rmap_item_cache, rmap_item);
 }
+/*
+ * Allocate a stable_node from stable_node_cache with GFP_KERNEL and return
+ * the result of kmem_cache_alloc() unchanged; the object is not initialized
+ * here, and callers should expect NULL on allocation failure.
+ */
+static inline struct stable_node *alloc_stable_node(void)
+{
+        return kmem_cache_alloc(stable_node_cache, GFP_KERNEL);
+}
+/*
+ * Return a stable_node to stable_node_cache. Nothing else is undone here:
+ * the caller is responsible for any unlinking that must happen first.
+ */
+static inline void free_stable_node(struct stable_node *stable_node)
+{
+        kmem_cache_free(stable_node_cache, stable_node);
+}
 static inline struct mm_slot *alloc_mm_slot(void)
 {
        if (!mm_slot_cache)     /* initialization failed */
@@ -275,7 +306,6 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
        bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct))
                                % MM_SLOTS_HASH_HEADS];
        mm_slot->mm = mm;
-        INIT_LIST_HEAD(&mm_slot->rmap_list);
        hlist_add_head(&mm_slot->link, bucket);
 }
@@ -284,6 +314,25 @@ static inline int in_stable_tree(struct rmap_item *rmap_item)
        return rmap_item->address & STABLE_FLAG;
 }
+/*
+ * Record anon_vma in rmap_item and take a reference on it by incrementing
+ * anon_vma->ksm_refcount. The reference is released by drop_anon_vma().
+ */
+static void hold_anon_vma(struct rmap_item *rmap_item,
+                          struct anon_vma *anon_vma)
+{
+        rmap_item->anon_vma = anon_vma;
+        atomic_inc(&anon_vma->ksm_refcount);
+}
+/*
+ * Release the ksm_refcount reference that rmap_item holds on its anon_vma.
+ * If that was the last such reference, the anon_vma is freed, but only when
+ * its ->head list is empty. rmap_item->anon_vma itself is left untouched.
+ */
+static void drop_anon_vma(struct rmap_item *rmap_item)
+{
+        struct anon_vma *anon_vma = rmap_item->anon_vma;
+        /* true only when the count reached zero; anon_vma->lock is then held */
+        if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) {
+                /* sample emptiness under the lock, free only after unlocking */
+                int empty = list_empty(&anon_vma->head);
+                spin_unlock(&anon_vma->lock);
+                if (empty)
+                        anon_vma_free(anon_vma);
+        }
+}
 /*
 * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
 * page tables after it has passed through ksm_exit() - which, if necessary,
@@ -356,10 +405,18 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
        return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
 }
-static void break_cow(struct mm_struct *mm, unsigned long addr)
+static void break_cow(struct rmap_item *rmap_item)
 {
+        struct mm_struct *mm = rmap_item->mm;
+        unsigned long addr = rmap_item->address;
        struct vm_area_struct *vma;
+        /*
+         * It is not an accident that whenever we want to break COW
+         * to undo, we also need to drop a reference to the anon_vma.
+         */
+        drop_anon_vma(rmap_item);
        down_read(&mm->mmap_sem);
        if (ksm_test_exit(mm))
                goto out;
@@ -403,21 +460,77 @@ out:		page = NULL;
        return page;
 }
+/*
+ * Tear down a stable tree node. For every rmap_item still listed on it,
+ * adjust the page counters, drop the rmap_item's anon_vma reference and
+ * clear the flag bits kept in the low bits of its address. Then erase the
+ * node from root_stable_tree and free it. The rmap_items themselves are
+ * neither unlinked from the hlist nor freed here.
+ */
+static void remove_node_from_stable_tree(struct stable_node *stable_node)
+{
+        struct rmap_item *rmap_item;
+        struct hlist_node *hlist;
+        hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+                /*
+                 * Every entry except the last decrements ksm_pages_sharing;
+                 * the last one (no successor) decrements ksm_pages_shared.
+                 */
+                if (rmap_item->hlist.next)
+                        ksm_pages_sharing--;
+                else
+                        ksm_pages_shared--;
+                drop_anon_vma(rmap_item);
+                /* keep only the page-aligned address, dropping the flags */
+                rmap_item->address &= PAGE_MASK;
+                cond_resched();
+        }
+        rb_erase(&stable_node->node, &root_stable_tree);
+        free_stable_node(stable_node);
+}
 /*
- * get_ksm_page: checks if the page at the virtual address in rmap_item
+ * get_ksm_page: checks if the page indicated by the stable node
- * is still PageKsm, in which case we can trust the content of the page,
+ * is still its ksm page, despite having held no reference to it.
- * and it returns the gotten page; but NULL if the page has been zapped.
+ * In which case we can trust the content of the page, and it
+ * returns the gotten page; but if the page has now been zapped,
+ * remove the stale node from the stable tree and return NULL.
+ *
+ * You would expect the stable_node to hold a reference to the ksm page.
+ * But if it increments the page's count, swapping out has to wait for
+ * ksmd to come around again before it can free the page, which may take
+ * seconds or even minutes: much too unresponsive.  So instead we use a
+ * "keyhole reference": access to the ksm page from the stable node peeps
+ * out through its keyhole to see if that page still holds the right key,
+ * pointing back to this stable node.  This relies on freeing a PageAnon
+ * page to reset its page->mapping to NULL, and relies on no other use of
+ * a page to put something that might look like our key in page->mapping.
+ *
+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
+ * but this is different - made simpler by ksm_thread_mutex being held, but
+ * interesting for assuming that no other use of the struct page could ever
+ * put our expected_mapping into page->mapping (or a field of the union which
+ * coincides with page->mapping).  The RCU calls are not for KSM at all, but
+ * to keep the page_count protocol described with page_cache_get_speculative.
+ *
+ * Note: it is possible that get_ksm_page() will return NULL one moment,
+ * then page the next, if the page is in between page_freeze_refs() and
+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
+ * is on its way to being freed; but it is an anomaly to bear in mind.
 */
-static struct page *get_ksm_page(struct rmap_item *rmap_item)
+static struct page *get_ksm_page(struct stable_node *stable_node)
 {
        struct page *page;
+        void *expected_mapping;
-        page = get_mergeable_page(rmap_item);
-        if (page && !PageKsm(page)) {
+        page = pfn_to_page(stable_node->kpfn);
+        expected_mapping = (void *)stable_node +
+                                (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
+        rcu_read_lock();
+        if (page->mapping != expected_mapping)
+                goto stale;
+        if (!get_page_unless_zero(page))
+                goto stale;
+        if (page->mapping != expected_mapping) {
                put_page(page);
-                page = NULL;
+                goto stale;
        }
+        rcu_read_unlock();
        return page;
+stale:
+        rcu_read_unlock();
+        remove_node_from_stable_tree(stable_node);
+        return NULL;
 }
 /*
@@ -426,35 +539,29 @@ static struct page *get_ksm_page(struct rmap_item *rmap_item)
 */
 static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 {
-        if (in_stable_tree(rmap_item)) {
+        if (rmap_item->address & STABLE_FLAG) {
-                struct rmap_item *next_item = rmap_item->next;
+                struct stable_node *stable_node;
+                struct page *page;
-                if (rmap_item->address & NODE_FLAG) {
-                        if (next_item) {
-                                rb_replace_node(&rmap_item->node,
-                                                &next_item->node,
-                                                &root_stable_tree);
-                                next_item->address |= NODE_FLAG;
-                                ksm_pages_sharing--;
-                        } else {
-                                rb_erase(&rmap_item->node, &root_stable_tree);
-                                ksm_pages_shared--;
-                        }
-                } else {
-                        struct rmap_item *prev_item = rmap_item->prev;
-                        BUG_ON(prev_item->next != rmap_item);
+                stable_node = rmap_item->head;
-                        prev_item->next = next_item;
+                page = get_ksm_page(stable_node);
-                        if (next_item) {
+                if (!page)
-                                BUG_ON(next_item->prev != rmap_item);
+                        goto out;
-                                next_item->prev = rmap_item->prev;
-                        }
+                lock_page(page);
+                hlist_del(&rmap_item->hlist);
+                unlock_page(page);
+                put_page(page);
+                if (stable_node->hlist.first)
                        ksm_pages_sharing--;
-                }
+                else
+                        ksm_pages_shared--;
-                rmap_item->next = NULL;
+                drop_anon_vma(rmap_item);
+                rmap_item->address &= PAGE_MASK;
-        } else if (rmap_item->address & NODE_FLAG) {
+        } else if (rmap_item->address & UNSTABLE_FLAG) {
                unsigned char age;
                /*
                 * Usually ksmd can and must skip the rb_erase, because
@@ -467,24 +574,21 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
                BUG_ON(age > 1);
                if (!age)
                        rb_erase(&rmap_item->node, &root_unstable_tree);
                ksm_pages_unshared--;
+                rmap_item->address &= PAGE_MASK;
        }
+out:
-        rmap_item->address &= PAGE_MASK;
        cond_resched();         /* we're called from many long loops */
 }
 static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
-                                       struct list_head *cur)
+                                       struct rmap_item **rmap_list)
 {
-        struct rmap_item *rmap_item;
+        while (*rmap_list) {
+                struct rmap_item *rmap_item = *rmap_list;
-        while (cur != &mm_slot->rmap_list) {
+                *rmap_list = rmap_item->rmap_list;
-                rmap_item = list_entry(cur, struct rmap_item, link);
-                cur = cur->next;
                remove_rmap_item_from_tree(rmap_item);
-                list_del(&rmap_item->link);
                free_rmap_item(rmap_item);
        }
 }
@@ -550,7 +654,7 @@ static int unmerge_and_remove_all_rmap_items(void)
                                goto error;
                }
-                remove_trailing_rmap_items(mm_slot, mm_slot->rmap_list.next);
+                remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
                spin_lock(&ksm_mmlist_lock);
                ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
@@ -646,7 +750,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
                 * Check that no O_DIRECT or similar I/O is in progress on the
                 * page
                 */
-                if ((page_mapcount(page) + 2 + swapped) != page_count(page)) {
+                if (page_mapcount(page) + 1 + swapped != page_count(page)) {
                        set_pte_at_notify(mm, addr, ptep, entry);
                        goto out_unlock;
                }
@@ -664,15 +768,15 @@ out:
 /**
 * replace_page - replace page in vma by new ksm page
- * @vma:      vma that holds the pte pointing to oldpage
+ * @vma:      vma that holds the pte pointing to page
- * @oldpage:  the page we are replacing by newpage
+ * @page:     the page we are replacing by kpage
- * @newpage:  the ksm page we replace oldpage by
+ * @kpage:    the ksm page we replace page by
 * @orig_pte: the original value of the pte
 *
 * Returns 0 on success, -EFAULT on failure.
 */
-static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
+static int replace_page(struct vm_area_struct *vma, struct page *page,
-                        struct page *newpage, pte_t orig_pte)
+                        struct page *kpage, pte_t orig_pte)
 {
        struct mm_struct *mm = vma->vm_mm;
        pgd_t *pgd;
@@ -681,12 +785,9 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
        pte_t *ptep;
        spinlock_t *ptl;
        unsigned long addr;
-        pgprot_t prot;
        int err = -EFAULT;
-        prot = vm_get_page_prot(vma->vm_flags & ~VM_WRITE);
+        addr = page_address_in_vma(page, vma);
-        addr = page_address_in_vma(oldpage, vma);
        if (addr == -EFAULT)
                goto out;
@@ -708,15 +809,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *oldpage,
                goto out;
        }
-        get_page(newpage);
+        get_page(kpage);
-        page_add_ksm_rmap(newpage);
+        page_add_anon_rmap(kpage, vma, addr);
        flush_cache_page(vma, addr, pte_pfn(*ptep));
        ptep_clear_flush(vma, addr, ptep);
-        set_pte_at_notify(mm, addr, ptep, mk_pte(newpage, prot));
+        set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot));
-        page_remove_rmap(oldpage);
+        page_remove_rmap(page);
-        put_page(oldpage);
+        put_page(page);
        pte_unmap_unlock(ptep, ptl);
        err = 0;
@@ -726,32 +827,27 @@ out:
 /*
 * try_to_merge_one_page - take two pages and merge them into one
- * @vma: the vma that hold the pte pointing into oldpage
+ * @vma: the vma that holds the pte pointing to page
- * @oldpage: the page that we want to replace with newpage
+ * @page: the PageAnon page that we want to replace with kpage
- * @newpage: the page that we want to map instead of oldpage
+ * @kpage: the PageKsm page that we want to map instead of page,
- *
+ *         or NULL the first time when we want to use page as kpage.
- * Note:
- * oldpage should be a PageAnon page, while newpage should be a PageKsm page,
- * or a newly allocated kernel page which page_add_ksm_rmap will make PageKsm.
 *
 * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
 static int try_to_merge_one_page(struct vm_area_struct *vma,
-                                 struct page *oldpage,
+                                 struct page *page, struct page *kpage)
-                                 struct page *newpage)
 {
        pte_t orig_pte = __pte(0);
        int err = -EFAULT;
+        if (page == kpage)                      /* ksm page forked */
+                return 0;
        if (!(vma->vm_flags & VM_MERGEABLE))
                goto out;
+        if (!PageAnon(page))
-        if (!PageAnon(oldpage))
                goto out;
-        get_page(newpage);
-        get_page(oldpage);
        /*
         * We need the page lock to read a stable PageSwapCache in
         * write_protect_page().  We use trylock_page() instead of
@@ -759,26 +855,39 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
         * prefer to continue scanning and merging different pages,
         * then come back to this page when it is unlocked.
         */
-        if (!trylock_page(oldpage))
+        if (!trylock_page(page))
-                goto out_putpage;
+                goto out;
        /*
         * If this anonymous page is mapped only here, its pte may need
         * to be write-protected.  If it's mapped elsewhere, all of its
         * ptes are necessarily already write-protected.  But in either
         * case, we need to lock and check page_count is not raised.
         */
-        if (write_protect_page(vma, oldpage, &orig_pte)) {
+        if (write_protect_page(vma, page, &orig_pte) == 0) {
-                unlock_page(oldpage);
+                if (!kpage) {
-                goto out_putpage;
+                        /*
+                         * While we hold page lock, upgrade page from
+                         * PageAnon+anon_vma to PageKsm+NULL stable_node:
+                         * stable_tree_insert() will update stable_node.
+                         */
+                        set_page_stable_node(page, NULL);
+                        mark_page_accessed(page);
+                        err = 0;
+                } else if (pages_identical(page, kpage))
+                        err = replace_page(vma, page, kpage, orig_pte);
        }
-        unlock_page(oldpage);
-        if (pages_identical(oldpage, newpage))
+        if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
-                err = replace_page(vma, oldpage, newpage, orig_pte);
+                munlock_vma_page(page);
+                if (!PageMlocked(kpage)) {
+                        unlock_page(page);
+                        lock_page(kpage);
+                        mlock_vma_page(kpage);
+                        page = kpage;           /* for final unlock */
+                }
+        }
-out_putpage:
+        unlock_page(page);
-        put_page(oldpage);
-        put_page(newpage);
 out:
        return err;
 }
@@ -786,26 +895,31 @@ out:
 /*
 * try_to_merge_with_ksm_page - like try_to_merge_two_pages,
 * but no new kernel page is allocated: kpage must already be a ksm page.
+ *
+ * This function returns 0 if the pages were merged, -EFAULT otherwise.
 */
-static int try_to_merge_with_ksm_page(struct mm_struct *mm1,
+static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
-                                      unsigned long addr1,
+                                      struct page *page, struct page *kpage)
-                                      struct page *page1,
-                                      struct page *kpage)
 {
+        struct mm_struct *mm = rmap_item->mm;
        struct vm_area_struct *vma;
        int err = -EFAULT;
-        down_read(&mm1->mmap_sem);
+        down_read(&mm->mmap_sem);
-        if (ksm_test_exit(mm1))
+        if (ksm_test_exit(mm))
+                goto out;
+        vma = find_vma(mm, rmap_item->address);
+        if (!vma || vma->vm_start > rmap_item->address)
                goto out;
-        vma = find_vma(mm1, addr1);
+        err = try_to_merge_one_page(vma, page, kpage);
-        if (!vma || vma->vm_start > addr1)
+        if (err)
                goto out;
-        err = try_to_merge_one_page(vma, page1, kpage);
+        /* Must get reference to anon_vma while still holding mmap_sem */
+        hold_anon_vma(rmap_item, vma->anon_vma);
 out:
-        up_read(&mm1->mmap_sem);
+        up_read(&mm->mmap_sem);
        return err;
 }
@@ -813,109 +927,73 @@ out:
 * try_to_merge_two_pages - take two identical pages and prepare them
 * to be merged into one page.
 *
- * This function returns 0 if we successfully mapped two identical pages
+ * This function returns the kpage if we successfully merged two identical
- * into one page, -EFAULT otherwise.
+ * pages into one ksm page, NULL otherwise.
 *
- * Note that this function allocates a new kernel page: if one of the pages
+ * Note that this function upgrades page to ksm page: if one of the pages
 * is already a ksm page, try_to_merge_with_ksm_page should be used.
 */
-static int try_to_merge_two_pages(struct mm_struct *mm1, unsigned long addr1,
+static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
-                                  struct page *page1, struct mm_struct *mm2,
+                                           struct page *page,
-                                  unsigned long addr2, struct page *page2)
+                                           struct rmap_item *tree_rmap_item,
+                                           struct page *tree_page)
 {
-        struct vm_area_struct *vma;
+        int err;
-        struct page *kpage;
-        int err = -EFAULT;
-        /*
-         * The number of nodes in the stable tree
-         * is the number of kernel pages that we hold.
-         */
-        if (ksm_max_kernel_pages &&
-            ksm_max_kernel_pages <= ksm_pages_shared)
-                return err;
-        kpage = alloc_page(GFP_HIGHUSER);
-        if (!kpage)
-                return err;
-        down_read(&mm1->mmap_sem);
-        if (ksm_test_exit(mm1)) {
-                up_read(&mm1->mmap_sem);
-                goto out;
-        }
-        vma = find_vma(mm1, addr1);
-        if (!vma || vma->vm_start > addr1) {
-                up_read(&mm1->mmap_sem);
-                goto out;
-        }
-        copy_user_highpage(kpage, page1, addr1, vma);
-        err = try_to_merge_one_page(vma, page1, kpage);
-        up_read(&mm1->mmap_sem);
+        err = try_to_merge_with_ksm_page(rmap_item, page, NULL);
        if (!err) {
-                err = try_to_merge_with_ksm_page(mm2, addr2, page2, kpage);
+                err = try_to_merge_with_ksm_page(tree_rmap_item,
+                                                        tree_page, page);
                /*
                 * If that fails, we have a ksm page with only one pte
                 * pointing to it: so break it.
                 */
                if (err)
-                        break_cow(mm1, addr1);
+                        break_cow(rmap_item);
        }
-out:
+        return err ? NULL : page;
-        put_page(kpage);
-        return err;
 }
 /*
- * stable_tree_search - search page inside the stable tree
+ * stable_tree_search - search for page inside the stable tree
- * @page: the page that we are searching identical pages to.
- * @page2: pointer into identical page that we are holding inside the stable
- *         tree that we have found.
- * @rmap_item: the reverse mapping item
 *
 * This function checks if there is a page inside the stable tree
 * with identical content to the page that we are scanning right now.
 *
- * This function return rmap_item pointer to the identical item if found,
+ * This function returns the stable tree page of identical content if found,
 * NULL otherwise.
 */
-static struct rmap_item *stable_tree_search(struct page *page,
+static struct page *stable_tree_search(struct page *page)
-                                            struct page **page2,
-                                            struct rmap_item *rmap_item)
 {
        struct rb_node *node = root_stable_tree.rb_node;
+        struct stable_node *stable_node;
+        stable_node = page_stable_node(page);
+        if (stable_node) {                      /* ksm page forked */
+                get_page(page);
+                return page;
+        }
        while (node) {
-                struct rmap_item *tree_rmap_item, *next_rmap_item;
+                struct page *tree_page;
                int ret;
-                tree_rmap_item = rb_entry(node, struct rmap_item, node);
+                cond_resched();
-                while (tree_rmap_item) {
+                stable_node = rb_entry(node, struct stable_node, node);
-                        BUG_ON(!in_stable_tree(tree_rmap_item));
+                tree_page = get_ksm_page(stable_node);
-                        cond_resched();
+                if (!tree_page)
-                        page2[0] = get_ksm_page(tree_rmap_item);
-                        if (page2[0])
-                                break;
-                        next_rmap_item = tree_rmap_item->next;
-                        remove_rmap_item_from_tree(tree_rmap_item);
-                        tree_rmap_item = next_rmap_item;
-                }
-                if (!tree_rmap_item)
                        return NULL;
-                ret = memcmp_pages(page, page2[0]);
+                ret = memcmp_pages(page, tree_page);
                if (ret < 0) {
-                        put_page(page2[0]);
+                        put_page(tree_page);
                        node = node->rb_left;
                } else if (ret > 0) {
-                        put_page(page2[0]);
+                        put_page(tree_page);
                        node = node->rb_right;
-                } else {
+                } else
-                        return tree_rmap_item;
+                        return tree_page;
-                }
        }
        return NULL;
@@ -925,38 +1003,26 @@ static struct rmap_item *stable_tree_search(struct page *page,
 * stable_tree_insert - insert rmap_item pointing to new ksm page
 * into the stable tree.
 *
- * @page: the page that we are searching identical page to inside the stable
+ * This function returns the stable tree node just allocated on success,
- *        tree.
+ * NULL otherwise.
- * @rmap_item: pointer to the reverse mapping item.
- *
- * This function returns rmap_item if success, NULL otherwise.
 */
-static struct rmap_item *stable_tree_insert(struct page *page,
+static struct stable_node *stable_tree_insert(struct page *kpage)
-                                            struct rmap_item *rmap_item)
 {
        struct rb_node **new = &root_stable_tree.rb_node;
        struct rb_node *parent = NULL;
+        struct stable_node *stable_node;
        while (*new) {
-                struct rmap_item *tree_rmap_item, *next_rmap_item;
                struct page *tree_page;
                int ret;
-                tree_rmap_item = rb_entry(*new, struct rmap_item, node);
+                cond_resched();
-                while (tree_rmap_item) {
+                stable_node = rb_entry(*new, struct stable_node, node);
-                        BUG_ON(!in_stable_tree(tree_rmap_item));
+                tree_page = get_ksm_page(stable_node);
-                        cond_resched();
+                if (!tree_page)
-                        tree_page = get_ksm_page(tree_rmap_item);
-                        if (tree_page)
-                                break;
-                        next_rmap_item = tree_rmap_item->next;
-                        remove_rmap_item_from_tree(tree_rmap_item);
-                        tree_rmap_item = next_rmap_item;
-                }
-                if (!tree_rmap_item)
                        return NULL;
-                ret = memcmp_pages(page, tree_page);
+                ret = memcmp_pages(kpage, tree_page);
                put_page(tree_page);
                parent = *new;
@@ -974,22 +1040,24 @@ static struct rmap_item *stable_tree_insert(struct page *page,
                }
        }
-        rmap_item->address |= NODE_FLAG | STABLE_FLAG;
+        stable_node = alloc_stable_node();
-        rmap_item->next = NULL;
+        if (!stable_node)
-        rb_link_node(&rmap_item->node, parent, new);
+                return NULL;
-        rb_insert_color(&rmap_item->node, &root_stable_tree);
-        ksm_pages_shared++;
+        rb_link_node(&stable_node->node, parent, new);
-        return rmap_item;
+        rb_insert_color(&stable_node->node, &root_stable_tree);
+        INIT_HLIST_HEAD(&stable_node->hlist);
+        stable_node->kpfn = page_to_pfn(kpage);
+        set_page_stable_node(kpage, stable_node);
+        return stable_node;
 }
 /*
- * unstable_tree_search_insert - search and insert items into the unstable tree.
+ * unstable_tree_search_insert - search for identical page,
- *
+ * else insert rmap_item into the unstable tree.
- * @page: the page that we are going to search for identical page or to insert
- *        into the unstable tree
- * @page2: pointer into identical page that was found inside the unstable tree
- * @rmap_item: the reverse mapping item of page
 *
 * This function searches for a page in the unstable tree identical to the
 * page currently being scanned; and if no identical page is found in the
@@ -1001,46 +1069,50 @@ static struct rmap_item *stable_tree_insert(struct page *page,
 * This function does both searching and inserting, because they share
 * the same walking algorithm in an rbtree.
 */
-static struct rmap_item *unstable_tree_search_insert(struct page *page,
+static
-                                                struct page **page2,
+struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
-                                                struct rmap_item *rmap_item)
+                                              struct page *page,
+                                              struct page **tree_pagep)
 {
        struct rb_node **new = &root_unstable_tree.rb_node;
        struct rb_node *parent = NULL;
        while (*new) {
                struct rmap_item *tree_rmap_item;
+                struct page *tree_page;
                int ret;
+                cond_resched();
                tree_rmap_item = rb_entry(*new, struct rmap_item, node);
-                page2[0] = get_mergeable_page(tree_rmap_item);
+                tree_page = get_mergeable_page(tree_rmap_item);
-                if (!page2[0])
+                if (!tree_page)
                        return NULL;
                /*
-                 * Don't substitute an unswappable ksm page
+                 * Don't substitute a ksm page for a forked page.
-                 * just for one good swappable forked page.
                 */
-                if (page == page2[0]) {
+                if (page == tree_page) {
-                        put_page(page2[0]);
+                        put_page(tree_page);
                        return NULL;
                }
-                ret = memcmp_pages(page, page2[0]);
+                ret = memcmp_pages(page, tree_page);
                parent = *new;
                if (ret < 0) {
-                        put_page(page2[0]);
+                        put_page(tree_page);
                        new = &parent->rb_left;
                } else if (ret > 0) {
-                        put_page(page2[0]);
+                        put_page(tree_page);
                        new = &parent->rb_right;
                } else {
+                        *tree_pagep = tree_page;
                        return tree_rmap_item;
                }
        }
-        rmap_item->address |= NODE_FLAG;
+        rmap_item->address |= UNSTABLE_FLAG;
        rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK);
        rb_link_node(&rmap_item->node, parent, new);
        rb_insert_color(&rmap_item->node, &root_unstable_tree);
@@ -1055,18 +1127,16 @@ static struct rmap_item *unstable_tree_search_insert(struct page *page,
 * the same ksm page.
 */
 static void stable_tree_append(struct rmap_item *rmap_item,
-                               struct rmap_item *tree_rmap_item)
+                               struct stable_node *stable_node)
 {
-        rmap_item->next = tree_rmap_item->next;
+        rmap_item->head = stable_node;
-        rmap_item->prev = tree_rmap_item;
-        if (tree_rmap_item->next)
-                tree_rmap_item->next->prev = rmap_item;
-        tree_rmap_item->next = rmap_item;
        rmap_item->address |= STABLE_FLAG;
+        hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
-        ksm_pages_sharing++;
+        if (rmap_item->hlist.next)
+                ksm_pages_sharing++;
+        else
+                ksm_pages_shared++;
 }
 /*
@@ -1080,49 +1150,37 @@ static void stable_tree_append(struct rmap_item *rmap_item,
 */
 static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 {
-        struct page *page2[1];
        struct rmap_item *tree_rmap_item;
+        struct page *tree_page = NULL;
+        struct stable_node *stable_node;
+        struct page *kpage;
        unsigned int checksum;
        int err;
-        if (in_stable_tree(rmap_item))
+        remove_rmap_item_from_tree(rmap_item);
-                remove_rmap_item_from_tree(rmap_item);
        /* We first start with searching the page inside the stable tree */
-        tree_rmap_item = stable_tree_search(page, page2, rmap_item);
+        kpage = stable_tree_search(page);
-        if (tree_rmap_item) {
+        if (kpage) {
-                if (page == page2[0])                   /* forked */
+                err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
-                        err = 0;
-                else
-                        err = try_to_merge_with_ksm_page(rmap_item->mm,
-                                                         rmap_item->address,
-                                                         page, page2[0]);
-                put_page(page2[0]);
                if (!err) {
                        /*
                         * The page was successfully merged:
                         * add its rmap_item to the stable tree.
                         */
-                        stable_tree_append(rmap_item, tree_rmap_item);
+                        lock_page(kpage);
+                        stable_tree_append(rmap_item, page_stable_node(kpage));
+                        unlock_page(kpage);
                }
+                put_page(kpage);
                return;
        }
        /*
-         * A ksm page might have got here by fork, but its other
+         * If the hash value of the page has changed from the last time
-         * references have already been removed from the stable tree.
+         * we calculated it, this page is changing frequently: therefore we
-         * Or it might be left over from a break_ksm which failed
+         * don't want to insert it in the unstable tree, and we don't want
-         * when the mem_cgroup had reached its limit: try again now.
+         * to waste our time searching for something identical to it there.
-         */
-        if (PageKsm(page))
-                break_cow(rmap_item->mm, rmap_item->address);
-        /*
-         * In case the hash value of the page was changed from the last time we
-         * have calculated it, this page to be changed frequely, therefore we
-         * don't want to insert it to the unstable tree, and we don't want to
-         * waste our time to search if there is something identical to it there.
         */
        checksum = calc_checksum(page);
        if (rmap_item->oldchecksum != checksum) {
@@ -1130,21 +1188,27 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
                return;
        }
-        tree_rmap_item = unstable_tree_search_insert(page, page2, rmap_item);
+        tree_rmap_item =
+                unstable_tree_search_insert(rmap_item, page, &tree_page);
        if (tree_rmap_item) {
-                err = try_to_merge_two_pages(rmap_item->mm,
+                kpage = try_to_merge_two_pages(rmap_item, page,
-                                             rmap_item->address, page,
+                                                tree_rmap_item, tree_page);
-                                             tree_rmap_item->mm,
+                put_page(tree_page);
-                                             tree_rmap_item->address, page2[0]);
                /*
                 * As soon as we merge this page, we want to remove the
                 * rmap_item of the page we have merged with from the unstable
                 * tree, and insert it instead as new node in the stable tree.
                 */
-                if (!err) {
+                if (kpage) {
-                        rb_erase(&tree_rmap_item->node, &root_unstable_tree);
+                        remove_rmap_item_from_tree(tree_rmap_item);
-                        tree_rmap_item->address &= ~NODE_FLAG;
-                        ksm_pages_unshared--;
+                        lock_page(kpage);
+                        stable_node = stable_tree_insert(kpage);
+                        if (stable_node) {
+                                stable_tree_append(tree_rmap_item, stable_node);
+                                stable_tree_append(rmap_item, stable_node);
+                        }
+                        unlock_page(kpage);
                        /*
                         * If we fail to insert the page into the stable tree,
@@ -1152,37 +1216,28 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
                         * to a ksm page left outside the stable tree,
                         * in which case we need to break_cow on both.
                         */
-                        if (stable_tree_insert(page2[0], tree_rmap_item))
+                        if (!stable_node) {
-                                stable_tree_append(rmap_item, tree_rmap_item);
+                                break_cow(tree_rmap_item);
-                        else {
+                                break_cow(rmap_item);
-                                break_cow(tree_rmap_item->mm,
-                                                tree_rmap_item->address);
-                                break_cow(rmap_item->mm, rmap_item->address);
                        }
                }
-                put_page(page2[0]);
        }
 }
 static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
-                                            struct list_head *cur,
+                                            struct rmap_item **rmap_list,
                                            unsigned long addr)
 {
        struct rmap_item *rmap_item;
-        while (cur != &mm_slot->rmap_list) {
+        while (*rmap_list) {
-                rmap_item = list_entry(cur, struct rmap_item, link);
+                rmap_item = *rmap_list;
-                if ((rmap_item->address & PAGE_MASK) == addr) {
+                if ((rmap_item->address & PAGE_MASK) == addr)
-                        if (!in_stable_tree(rmap_item))
-                                remove_rmap_item_from_tree(rmap_item);
                        return rmap_item;
-                }
                if (rmap_item->address > addr)
                        break;
-                cur = cur->next;
+                *rmap_list = rmap_item->rmap_list;
                remove_rmap_item_from_tree(rmap_item);
-                list_del(&rmap_item->link);
                free_rmap_item(rmap_item);
        }
@@ -1191,7 +1246,8 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot,
                /* It has already been zeroed */
                rmap_item->mm = mm_slot->mm;
                rmap_item->address = addr;
-                list_add_tail(&rmap_item->link, cur);
+                rmap_item->rmap_list = *rmap_list;
+                *rmap_list = rmap_item;
        }
        return rmap_item;
 }
@@ -1216,8 +1272,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page)
                spin_unlock(&ksm_mmlist_lock);
 next_mm:
                ksm_scan.address = 0;
-                ksm_scan.rmap_item = list_entry(&slot->rmap_list,
+                ksm_scan.rmap_list = &slot->rmap_list;
-                                                struct rmap_item, link);
        }
        mm = slot->mm;
@@ -1243,10 +1298,10 @@ next_mm:
                                flush_anon_page(vma, *page, ksm_scan.address);
                                flush_dcache_page(*page);
                                rmap_item = get_next_rmap_item(slot,
-                                        ksm_scan.rmap_item->link.next,
+                                        ksm_scan.rmap_list, ksm_scan.address);
-                                        ksm_scan.address);
                                if (rmap_item) {
-                                        ksm_scan.rmap_item = rmap_item;
+                                        ksm_scan.rmap_list =
+                                                        &rmap_item->rmap_list;
                                        ksm_scan.address += PAGE_SIZE;
                                } else
                                        put_page(*page);
@@ -1262,14 +1317,13 @@ next_mm:
        if (ksm_test_exit(mm)) {
                ksm_scan.address = 0;
-                ksm_scan.rmap_item = list_entry(&slot->rmap_list,
+                ksm_scan.rmap_list = &slot->rmap_list;
-                                                struct rmap_item, link);
        }
        /*
         * Nuke all the rmap_items that are above this current rmap:
         * because there were no VM_MERGEABLE vmas with such addresses.
         */
-        remove_trailing_rmap_items(slot, ksm_scan.rmap_item->link.next);
+        remove_trailing_rmap_items(slot, ksm_scan.rmap_list);
        spin_lock(&ksm_mmlist_lock);
        ksm_scan.mm_slot = list_entry(slot->mm_list.next,
@@ -1322,14 +1376,6 @@ static void ksm_do_scan(unsigned int scan_npages)
                        return;
                if (!PageKsm(page) || !in_stable_tree(rmap_item))
                        cmp_and_merge_page(page, rmap_item);
-                else if (page_mapcount(page) == 1) {
-                        /*
-                         * Replace now-unshared ksm page by ordinary page.
-                         */
-                        break_cow(rmap_item->mm, rmap_item->address);
-                        remove_rmap_item_from_tree(rmap_item);
-                        rmap_item->oldchecksum = calc_checksum(page);
-                }
                put_page(page);
        }
 }
@@ -1374,7 +1420,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
                if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
                                 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
                                 VM_RESERVED  | VM_HUGETLB | VM_INSERTPAGE |
-                                 VM_MIXEDMAP  | VM_SAO))
+                                 VM_NONLINEAR | VM_MIXEDMAP | VM_SAO))
                        return 0;               /* just ignore the advice */
                if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
@@ -1451,7 +1497,7 @@ void __ksm_exit(struct mm_struct *mm)
        spin_lock(&ksm_mmlist_lock);
        mm_slot = get_mm_slot(mm);
        if (mm_slot && ksm_scan.mm_slot != mm_slot) {
-                if (list_empty(&mm_slot->rmap_list)) {
+                if (!mm_slot->rmap_list) {
                        hlist_del(&mm_slot->link);
                        list_del(&mm_slot->mm_list);
                        easy_to_free = 1;
@@ -1472,6 +1518,249 @@ void __ksm_exit(struct mm_struct *mm)
        }
 }
+/*
+ * Give the caller a private copy of @page.
+ *
+ * @page is unlocked first; a fresh page is then allocated for @vma at
+ * @address and filled from @page.  The copy is marked dirty, uptodate and
+ * swap-backed, is locked, and is put on the active anon LRU or on the
+ * unevictable list, depending on page_evictable().  page_cache_release()
+ * is called on @page whether or not the allocation succeeded.
+ *
+ * Returns the new page, or NULL if alloc_page_vma() failed.
+ */
+struct page *ksm_does_need_to_copy(struct page *page,
+                        struct vm_area_struct *vma, unsigned long address)
+{
+        struct page *copy;
+        /* once unlocked, any racers will COW the page, not modify it */
+        unlock_page(page);
+        copy = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+        if (!copy)
+                goto release;
+        copy_user_highpage(copy, page, address, vma);
+        SetPageDirty(copy);
+        __SetPageUptodate(copy);
+        SetPageSwapBacked(copy);
+        __set_page_locked(copy);
+        if (!page_evictable(copy, vma))
+                add_page_to_unevictable_list(copy);
+        else
+                lru_cache_add_lru(copy, LRU_ACTIVE_ANON);
+release:
+        page_cache_release(page);
+        return copy;
+}
+/*
+ * Sum the page_referenced_one() results over the vmas that may map the
+ * KSM page @page.
+ *
+ * @page:     a locked KSM page (both are asserted with VM_BUG_ON).
+ * @memcg:    if non-NULL, vmas whose mm does not match it are skipped.
+ * @vm_flags: handed through to page_referenced_one().
+ *
+ * The stable_node's rmap_items are walked twice.  The first pass looks
+ * only at the vma belonging to the rmap_item's own mm; the second pass
+ * looks only at covering vmas in other mms, in case they were forked
+ * from the original since ksmd passed.  The walk ends as soon as
+ * mapcount, which page_referenced_one() updates through its pointer,
+ * has dropped to zero.
+ *
+ * Returns the accumulated count, or 0 if the page has no stable_node.
+ */
+int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
+                        unsigned long *vm_flags)
+{
+        struct stable_node *stable_node;
+        struct rmap_item *item;
+        struct hlist_node *pos;
+        unsigned int mapcount = page_mapcount(page);
+        int referenced = 0;
+        int search_new_forks;
+        VM_BUG_ON(!PageKsm(page));
+        VM_BUG_ON(!PageLocked(page));
+        stable_node = page_stable_node(page);
+        if (!stable_node)
+                return 0;
+        for (search_new_forks = 0; search_new_forks < 2; search_new_forks++) {
+                hlist_for_each_entry(item, pos, &stable_node->hlist, hlist) {
+                        struct anon_vma *anon_vma = item->anon_vma;
+                        struct vm_area_struct *vma;
+                        spin_lock(&anon_vma->lock);
+                        list_for_each_entry(vma, &anon_vma->head,
+                                            anon_vma_node) {
+                                /* vma must cover the rmap_item's address */
+                                if (item->address < vma->vm_start ||
+                                    item->address >= vma->vm_end)
+                                        continue;
+                                /*
+                                 * Pass 0 takes only the rmap_item's own mm,
+                                 * pass 1 takes only the other mms.
+                                 */
+                                if ((item->mm == vma->vm_mm) ==
+                                    search_new_forks)
+                                        continue;
+                                if (memcg &&
+                                    !mm_match_cgroup(vma->vm_mm, memcg))
+                                        continue;
+                                referenced += page_referenced_one(page, vma,
+                                        item->address, &mapcount, vm_flags);
+                                if (!search_new_forks || !mapcount)
+                                        break;
+                        }
+                        spin_unlock(&anon_vma->lock);
+                        if (!mapcount)
+                                return referenced;
+                }
+        }
+        return referenced;
+}
+/*
+ * Call try_to_unmap_one() on the vmas that may map the KSM page @page.
+ *
+ * @page:  a locked KSM page (both are asserted with VM_BUG_ON).
+ * @flags: handed through to try_to_unmap_one().
+ *
+ * The stable_node's rmap_items are walked twice: first only the vma in
+ * the rmap_item's own mm is tried, then only covering vmas in other mms,
+ * in case they were forked from the original since ksmd passed.  The
+ * walk stops at the first result other than SWAP_AGAIN, or as soon as
+ * the page is no longer mapped.
+ *
+ * Returns SWAP_FAIL if the page has no stable_node, otherwise the last
+ * try_to_unmap_one() result (SWAP_AGAIN if it was never called).
+ */
+int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
+{
+        struct stable_node *stable_node;
+        struct hlist_node *pos;
+        struct rmap_item *item;
+        int ret = SWAP_AGAIN;
+        int search_new_forks;
+        VM_BUG_ON(!PageKsm(page));
+        VM_BUG_ON(!PageLocked(page));
+        stable_node = page_stable_node(page);
+        if (!stable_node)
+                return SWAP_FAIL;
+        for (search_new_forks = 0; search_new_forks < 2; search_new_forks++) {
+                hlist_for_each_entry(item, pos, &stable_node->hlist, hlist) {
+                        struct anon_vma *anon_vma = item->anon_vma;
+                        struct vm_area_struct *vma;
+                        int stop = 0;
+                        spin_lock(&anon_vma->lock);
+                        list_for_each_entry(vma, &anon_vma->head,
+                                            anon_vma_node) {
+                                /* vma must cover the rmap_item's address */
+                                if (item->address < vma->vm_start ||
+                                    item->address >= vma->vm_end)
+                                        continue;
+                                /*
+                                 * Pass 0 takes only the rmap_item's own mm,
+                                 * pass 1 takes only the other mms.
+                                 */
+                                if ((item->mm == vma->vm_mm) ==
+                                    search_new_forks)
+                                        continue;
+                                ret = try_to_unmap_one(page, vma,
+                                                item->address, flags);
+                                if (ret != SWAP_AGAIN || !page_mapped(page)) {
+                                        stop = 1;
+                                        break;
+                                }
+                        }
+                        spin_unlock(&anon_vma->lock);
+                        if (stop)
+                                return ret;
+                }
+        }
+        return ret;
+}
+#ifdef CONFIG_MIGRATION
+/*
+ * Invoke @rmap_one(page, vma, address, @arg) on the vmas that may map the
+ * KSM page @page.
+ *
+ * @page must be a locked KSM page (both are asserted with VM_BUG_ON).
+ *
+ * The stable_node's rmap_items are walked twice: first only the vma in
+ * the rmap_item's own mm is visited, then only covering vmas in other
+ * mms, in case they were forked from the original since ksmd passed.
+ * The walk stops at the first callback result other than SWAP_AGAIN.
+ *
+ * Returns the last callback result, or SWAP_AGAIN if the page has no
+ * stable_node or the callback was never invoked.
+ */
+int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
+                  struct vm_area_struct *, unsigned long, void *), void *arg)
+{
+        struct stable_node *stable_node;
+        struct hlist_node *pos;
+        struct rmap_item *item;
+        int ret = SWAP_AGAIN;
+        int search_new_forks;
+        VM_BUG_ON(!PageKsm(page));
+        VM_BUG_ON(!PageLocked(page));
+        stable_node = page_stable_node(page);
+        if (!stable_node)
+                return ret;
+        for (search_new_forks = 0; search_new_forks < 2; search_new_forks++) {
+                hlist_for_each_entry(item, pos, &stable_node->hlist, hlist) {
+                        struct anon_vma *anon_vma = item->anon_vma;
+                        struct vm_area_struct *vma;
+                        int stop = 0;
+                        spin_lock(&anon_vma->lock);
+                        list_for_each_entry(vma, &anon_vma->head,
+                                            anon_vma_node) {
+                                /* vma must cover the rmap_item's address */
+                                if (item->address < vma->vm_start ||
+                                    item->address >= vma->vm_end)
+                                        continue;
+                                /*
+                                 * Pass 0 takes only the rmap_item's own mm,
+                                 * pass 1 takes only the other mms.
+                                 */
+                                if ((item->mm == vma->vm_mm) ==
+                                    search_new_forks)
+                                        continue;
+                                ret = rmap_one(page, vma, item->address, arg);
+                                if (ret != SWAP_AGAIN) {
+                                        stop = 1;
+                                        break;
+                                }
+                        }
+                        spin_unlock(&anon_vma->lock);
+                        if (stop)
+                                return ret;
+                }
+        }
+        return ret;
+}
+/*
+ * Repoint a stable_node at @newpage after its contents moved there from
+ * @oldpage.  Both pages must be locked and must share the same ->mapping
+ * (asserted with VM_BUG_ON).  Nothing is done if @newpage has no
+ * stable_node.
+ */
+void ksm_migrate_page(struct page *newpage, struct page *oldpage)
+{
+        struct stable_node *node;
+        VM_BUG_ON(!PageLocked(oldpage));
+        VM_BUG_ON(!PageLocked(newpage));
+        VM_BUG_ON(newpage->mapping != oldpage->mapping);
+        node = page_stable_node(newpage);
+        if (!node)
+                return;
+        /* until now the node must still have recorded the old pfn */
+        VM_BUG_ON(node->kpfn != page_to_pfn(oldpage));
+        node->kpfn = page_to_pfn(newpage);
+}
+#endif /* CONFIG_MIGRATION */
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/*
+ * Walk root_stable_tree from rb_first() onwards and return the first
+ * stable_node whose kpfn lies in [@start_pfn, @end_pfn), or NULL if the
+ * tree holds no such node.
+ */
+static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn,
+                                                 unsigned long end_pfn)
+{
+        struct rb_node *cur = rb_first(&root_stable_tree);
+        while (cur) {
+                struct stable_node *candidate;
+                candidate = rb_entry(cur, struct stable_node, node);
+                if (start_pfn <= candidate->kpfn && candidate->kpfn < end_pfn)
+                        return candidate;
+                cur = rb_next(cur);
+        }
+        return NULL;
+}
+/*
+ * Memory hotplug notifier callback.
+ *
+ * MEM_GOING_OFFLINE takes ksm_thread_mutex; MEM_OFFLINE prunes stale
+ * stable_nodes and then releases the mutex; MEM_CANCEL_OFFLINE just
+ * releases it.  Every other action is ignored.  @arg is the struct
+ * memory_notify describing the pfn range; @self is unused.
+ *
+ * Always returns NOTIFY_OK.
+ */
+static int ksm_memory_callback(struct notifier_block *self,
+                               unsigned long action, void *arg)
+{
+        struct memory_notify *mn = arg;
+        struct stable_node *stale;
+        if (action == MEM_GOING_OFFLINE) {
+                /*
+                 * Keep it very simple for now: just lock out ksmd and
+                 * MADV_UNMERGEABLE while any memory is going offline.
+                 */
+                mutex_lock(&ksm_thread_mutex);
+                return NOTIFY_OK;
+        }
+        if (action == MEM_OFFLINE) {
+                /*
+                 * Most of the work is done by page migration; but there might
+                 * be a few stable_nodes left over, still pointing to struct
+                 * pages which have been offlined: prune those from the tree.
+                 */
+                for (;;) {
+                        stale = ksm_check_stable_tree(mn->start_pfn,
+                                        mn->start_pfn + mn->nr_pages);
+                        if (stale == NULL)
+                                break;
+                        remove_node_from_stable_tree(stale);
+                }
+        }
+        /* both the completed and the cancelled offline drop the mutex */
+        if (action == MEM_OFFLINE || action == MEM_CANCEL_OFFLINE)
+                mutex_unlock(&ksm_thread_mutex);
+        return NOTIFY_OK;
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
 #ifdef CONFIG_SYSFS
 /*
 * This all compiles without CONFIG_SYSFS, but is a waste of space.
@@ -1550,8 +1839,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
        /*
         * KSM_RUN_MERGE sets ksmd running, and 0 stops it running.
         * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items,
-         * breaking COW to free the unswappable pages_shared (but leaves
+         * breaking COW to free the pages_shared (but leaves mm_slots
-         * mm_slots on the list for when ksmd may be set running again).
+         * on the list for when ksmd may be set running again).
         */
        mutex_lock(&ksm_thread_mutex);
@@ -1576,29 +1865,6 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
 }
 KSM_ATTR(run);
-static ssize_t max_kernel_pages_store(struct kobject *kobj,
-                                      struct kobj_attribute *attr,
-                                      const char *buf, size_t count)
-{
-        int err;
-        unsigned long nr_pages;
-        err = strict_strtoul(buf, 10, &nr_pages);
-        if (err)
-                return -EINVAL;
-        ksm_max_kernel_pages = nr_pages;
-        return count;
-}
-static ssize_t max_kernel_pages_show(struct kobject *kobj,
-                                     struct kobj_attribute *attr, char *buf)
-{
-        return sprintf(buf, "%lu\n", ksm_max_kernel_pages);
-}
-KSM_ATTR(max_kernel_pages);
 static ssize_t pages_shared_show(struct kobject *kobj,
                                 struct kobj_attribute *attr, char *buf)
 {
@@ -1648,7 +1914,6 @@ static struct attribute *ksm_attrs[] = {
        &sleep_millisecs_attr.attr,
        &pages_to_scan_attr.attr,
        &run_attr.attr,
-        &max_kernel_pages_attr.attr,
        &pages_shared_attr.attr,
        &pages_sharing_attr.attr,
        &pages_unshared_attr.attr,
@@ -1668,8 +1933,6 @@ static int __init ksm_init(void)
        struct task_struct *ksm_thread;
        int err;
-        ksm_max_kernel_pages = totalram_pages / 4;
        err = ksm_slab_init();
        if (err)
                goto out;
@@ -1697,6 +1960,13 @@ static int __init ksm_init(void)
 #endif /* CONFIG_SYSFS */
+#ifdef CONFIG_MEMORY_HOTREMOVE
+        /*
+         * Choose a high priority since the callback takes ksm_thread_mutex:
+         * later callbacks could only be taking locks which nest within that.
+         */
+        hotplug_memory_notifier(ksm_memory_callback, 100);
+#endif
        return 0;
 out_free2:
diff --git a/mm/madvise.c b/mm/madvise.c
index 35b1479b7c9d..319528b8db74 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -9,6 +9,7 @@
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
 #include <linux/mempolicy.h>
+#include <linux/page-isolation.h>
 #include <linux/hugetlb.h>
 #include <linux/sched.h>
 #include <linux/ksm.h>
@@ -222,7 +223,7 @@ static long madvise_remove(struct vm_area_struct *vma,
 /*
 * Error injection support for memory error handling.
 */
-static int madvise_hwpoison(unsigned long start, unsigned long end)
+static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
 {
        int ret = 0;
@@ -230,15 +231,21 @@ static int madvise_hwpoison(unsigned long start, unsigned long end)
                return -EPERM;
        for (; start < end; start += PAGE_SIZE) {
                struct page *p;
-                int ret = get_user_pages(current, current->mm, start, 1,
+                int ret = get_user_pages_fast(start, 1, 0, &p);
-                                                0, 0, &p, NULL);
                if (ret != 1)
                        return ret;
+                if (bhv == MADV_SOFT_OFFLINE) {
+                        printk(KERN_INFO "Soft offlining page %lx at %lx\n",
+                                page_to_pfn(p), start);
+                        ret = soft_offline_page(p, MF_COUNT_INCREASED);
+                        if (ret)
+                                break;
+                        continue;
+                }
                printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n",
                       page_to_pfn(p), start);
                /* Ignore return value for now */
-                __memory_failure(page_to_pfn(p), 0, 1);
+                __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
-                put_page(p);
        }
        return ret;
 }
@@ -335,8 +342,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
        size_t len;
 #ifdef CONFIG_MEMORY_FAILURE
-        if (behavior == MADV_HWPOISON)
+        if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
-                return madvise_hwpoison(start, start+len_in);
+                return madvise_hwpoison(behavior, start, start+len_in);
 #endif
        if (!madvise_behavior_valid(behavior))
                return error;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f99f5991d6bb..488b644e0e8e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -38,6 +38,7 @@
 #include <linux/vmalloc.h>
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
+#include <linux/cpu.h>
 #include "internal.h"
 #include <asm/uaccess.h>
@@ -54,7 +55,6 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
 #define do_swap_account         (0)
 #endif
-static DEFINE_MUTEX(memcg_tasklist);    /* can be hold under cgroup_mutex */
 #define SOFTLIMIT_EVENTS_THRESH (1000)
 /*
@@ -66,7 +66,7 @@ enum mem_cgroup_stat_index {
         */
        MEM_CGROUP_STAT_CACHE,     /* # of pages charged as cache */
        MEM_CGROUP_STAT_RSS,       /* # of pages charged as anon rss */
-        MEM_CGROUP_STAT_MAPPED_FILE,  /* # of pages charged as file rss */
+        MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
        MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
        MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
        MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
@@ -209,7 +209,7 @@ struct mem_cgroup {
        int     prev_priority;  /* for recording reclaim priority */
        /*
-         * While reclaiming in a hiearchy, we cache the last child we
+         * While reclaiming in a hierarchy, we cache the last child we
         * reclaimed from.
         */
        int last_scanned_child;
@@ -275,6 +275,7 @@ enum charge_type {
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
+static void drain_all_stock_async(void);
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -282,6 +283,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
        return &mem->info.nodeinfo[nid]->zoneinfo[zid];
 }
+/* Return the address of the cgroup_subsys_state embedded in @mem. */
+struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem)
+{
+        struct cgroup_subsys_state *css = &mem->css;
+        return css;
+}
 static struct mem_cgroup_per_zone *
 page_cgroup_zoneinfo(struct page_cgroup *pc)
 {
@@ -758,7 +764,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
        task_unlock(task);
        if (!curr)
                return 0;
-        if (curr->use_hierarchy)
+        /*
+         * We should check use_hierarchy of "mem", not "curr": checking that
+         * of "curr" would make this function return true if hierarchy is
+         * enabled in "curr" and "curr" is a child of "mem" in the *cgroup*
+         * hierarchy (even if use_hierarchy is disabled in "mem").
+         */
+        if (mem->use_hierarchy)
                ret = css_is_ancestor(&curr->css, &mem->css);
        else
                ret = (curr == mem);
@@ -1007,7 +1019,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
        static char memcg_name[PATH_MAX];
        int ret;
-        if (!memcg)
+        if (!memcg || !p)
                return;
@@ -1137,6 +1149,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
                victim = mem_cgroup_select_victim(root_mem);
                if (victim == root_mem) {
                        loop++;
+                        if (loop >= 1)
+                                drain_all_stock_async();
                        if (loop >= 2) {
                                /*
                                 * If we have not been able to reclaim
@@ -1223,7 +1237,7 @@ static void record_last_oom(struct mem_cgroup *mem)
 * Currently used to update mapped file statistics, but the routine can be
 * generalized to update other statistics as well.
 */
-void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
+void mem_cgroup_update_file_mapped(struct page *page, int val)
 {
        struct mem_cgroup *mem;
        struct mem_cgroup_stat *stat;
@@ -1231,9 +1245,6 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
        int cpu;
        struct page_cgroup *pc;
-        if (!page_is_file_cache(page))
-                return;
        pc = lookup_page_cgroup(page);
        if (unlikely(!pc))
                return;
@@ -1253,12 +1264,139 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
        stat = &mem->stat;
        cpustat = &stat->cpustat[cpu];
-        __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val);
+        __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
 done:
        unlock_page_cgroup(pc);
 }
 /*
+ * Size of the first charge attempt. "32" comes from vmscan.c's magic value.
+ * TODO: bigger values may be necessary on big-iron machines.
+ */
+#define CHARGE_SIZE     (32 * PAGE_SIZE)
+struct memcg_stock_pcp {
+        struct mem_cgroup *cached; /* memcg the stock was charged to; never the root cgroup */
+        int charge; /* stocked charge; taken PAGE_SIZE at a time by consume_stock() */
+        struct work_struct work; /* queued on its cpu by drain_all_stock_async() */
+};
+static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
+static atomic_t memcg_drain_count; /* non-zero while a drain of all cpus is being issued */
+/*
+ * Try to take one page worth of charge out of this cpu's stock.
+ *
+ * Succeeds only when the stock is cached for @mem and is not empty; then
+ * PAGE_SIZE is subtracted from it and true is returned.  Otherwise the
+ * stock is left alone and false is returned, which tells the caller it
+ * has to go to res_counter_charge() itself.
+ */
+static bool consume_stock(struct mem_cgroup *mem)
+{
+        struct memcg_stock_pcp *pcp = &get_cpu_var(memcg_stock);
+        bool hit = (pcp->cached == mem) && pcp->charge;
+        if (hit)
+                pcp->charge -= PAGE_SIZE;
+        put_cpu_var(memcg_stock);
+        return hit;
+}
+/*
+ * Give whatever charge is held in @stock back to the res_counter(s) of
+ * the memcg it was cached for (->memsw too when do_swap_account is set),
+ * then reset the stock so it holds no charge and no owner.
+ */
+static void drain_stock(struct memcg_stock_pcp *stock)
+{
+        struct mem_cgroup *owner = stock->cached;
+        int stocked = stock->charge;
+        if (stocked) {
+                res_counter_uncharge(&owner->res, stocked);
+                if (do_swap_account)
+                        res_counter_uncharge(&owner->memsw, stocked);
+        }
+        stock->charge = 0;
+        stock->cached = NULL;
+}
+/*
+ * Drain the stock of the cpu this runs on; @dummy is not used.
+ * The caller must have preemption disabled, or must be a thread which
+ * is pinned to the local cpu.
+ */
+static void drain_local_stock(struct work_struct *dummy)
+{
+        drain_stock(&__get_cpu_var(memcg_stock));
+}
+/*
+ * Put @val of charge, already obtained from the res_counter, into this
+ * cpu's stock for @mem, to be used up by consume_stock() later.  If the
+ * stock currently belongs to another memcg it is drained first.
+ */
+static void refill_stock(struct mem_cgroup *mem, int val)
+{
+        struct memcg_stock_pcp *pcp = &get_cpu_var(memcg_stock);
+        bool other_owner = (pcp->cached != mem);
+        if (other_owner) {
+                /* return the previous owner's charge before switching */
+                drain_stock(pcp);
+                pcp->cached = mem;
+        }
+        pcp->charge += val;
+        put_cpu_var(memcg_stock);
+}
+/*
+ * Ask every online cpu to drain its stocked charges, without waiting for
+ * any of them: one work item per cpu is queued and the function returns.
+ * Callers may expect some charge to come back to the res_counter later,
+ * but cannot wait for it.
+ */
+static void drain_all_stock_async(void)
+{
+        int cpu;
+        /*
+         * Nobody handles the result of a drain directly, so if somebody is
+         * already issuing one there is no need to issue another.  This is
+         * only a loose check: the WORK_STRUCT_PENDING test in
+         * queue_work_on() will catch a race anyway.
+         */
+        if (atomic_read(&memcg_drain_count) != 0)
+                return;
+        /* Notify other cpus that a system-wide "drain" is running */
+        atomic_inc(&memcg_drain_count);
+        get_online_cpus();
+        for_each_online_cpu(cpu)
+                schedule_work_on(cpu, &per_cpu(memcg_stock, cpu).work);
+        put_online_cpus();
+        atomic_dec(&memcg_drain_count);
+        /* deliberately no flush_work(): we do not wait for the drains */
+}
+/*
+ * This is a synchronous drain interface: drain_local_stock() is handed to
+ * schedule_on_each_cpu(), with memcg_drain_count raised around that call.
+ * NOTE(review): presumably schedule_on_each_cpu() returns only after every
+ * cpu has run the handler - confirm against the workqueue code.
+ */
+static void drain_all_stock_sync(void)
+{
+        /* called when force_empty is called */
+        atomic_inc(&memcg_drain_count); /* drain_all_stock_async() backs off */
+        schedule_on_each_cpu(drain_local_stock);
+        atomic_dec(&memcg_drain_count);
+}
+/*
+ * CPU notifier callback.  @hcpu carries the cpu number.  For CPU_DEAD the
+ * stock of that cpu is drained; every other @action is ignored.
+ * Always returns NOTIFY_OK.
+ */
+static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
+                                        unsigned long action,
+                                        void *hcpu)
+{
+        int cpu = (unsigned long)hcpu;
+        if (action == CPU_DEAD)
+                drain_stock(&per_cpu(memcg_stock, cpu));
+        return NOTIFY_OK;
+}
+/*
 * Unlike exported interface, "oom" parameter is added. if oom==true,
 * oom-killer can be invoked.
 */
@@ -1269,6 +1407,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
        struct mem_cgroup *mem, *mem_over_limit;
        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
        struct res_counter *fail_res;
+        int csize = CHARGE_SIZE;
        if (unlikely(test_thread_flag(TIF_MEMDIE))) {
                /* Don't account this! */
@@ -1293,23 +1432,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                return 0;
        VM_BUG_ON(css_is_removed(&mem->css));
+        if (mem_cgroup_is_root(mem))
+                goto done;
        while (1) {
                int ret = 0;
                unsigned long flags = 0;
-                if (mem_cgroup_is_root(mem))
+                if (consume_stock(mem))
-                        goto done;
+                        goto charged;
-                ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
+                ret = res_counter_charge(&mem->res, csize, &fail_res);
                if (likely(!ret)) {
                        if (!do_swap_account)
                                break;
-                        ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
+                        ret = res_counter_charge(&mem->memsw, csize, &fail_res);
-                                                        &fail_res);
                        if (likely(!ret))
                                break;
                        /* mem+swap counter fails */
-                        res_counter_uncharge(&mem->res, PAGE_SIZE);
+                        res_counter_uncharge(&mem->res, csize);
                        flags |= MEM_CGROUP_RECLAIM_NOSWAP;
                        mem_over_limit = mem_cgroup_from_res_counter(fail_res,
                                                                        memsw);
@@ -1318,6 +1459,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                        mem_over_limit = mem_cgroup_from_res_counter(fail_res,
                                                                        res);
+                /* reduce request size and retry */
+                if (csize > PAGE_SIZE) {
+                        csize = PAGE_SIZE;
+                        continue;
+                }
                if (!(gfp_mask & __GFP_WAIT))
                        goto nomem;
@@ -1339,14 +1485,15 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                if (!nr_retries--) {
                        if (oom) {
-                                mutex_lock(&memcg_tasklist);
                                mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
-                                mutex_unlock(&memcg_tasklist);
                                record_last_oom(mem_over_limit);
                        }
                        goto nomem;
                }
        }
+        if (csize > PAGE_SIZE)
+                refill_stock(mem, csize - PAGE_SIZE);
+charged:
        /*
         * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
         * if they exceeds softlimit.
@@ -1361,6 +1508,21 @@ nomem:
 }
 /*
+ * Sometimes we have to undo a charge we got by try_charge().
+ * This function does that: it uncharges the page and puts the css
+ * refcnt gotten by try_charge().
+ */
+static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
+{
+        /* the root cgroup is never uncharged here; only its css ref is put */
+        if (mem_cgroup_is_root(mem))
+                goto put_css;
+        res_counter_uncharge(&mem->res, PAGE_SIZE);
+        if (do_swap_account)
+                res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+put_css:
+        css_put(&mem->css);
+}
+/*
 * A helper function to get mem_cgroup from ID. must be called under
 * rcu_read_lock(). The caller must check css_is_removed() or some if
 * it's concern. (dropping refcnt from swap can be called against removed
@@ -1379,25 +1541,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
        return container_of(css, struct mem_cgroup, css);
 }
-static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
+struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 {
-        struct mem_cgroup *mem;
+        struct mem_cgroup *mem = NULL;
        struct page_cgroup *pc;
        unsigned short id;
        swp_entry_t ent;
        VM_BUG_ON(!PageLocked(page));
-        if (!PageSwapCache(page))
-                return NULL;
        pc = lookup_page_cgroup(page);
        lock_page_cgroup(pc);
        if (PageCgroupUsed(pc)) {
                mem = pc->mem_cgroup;
                if (mem && !css_tryget(&mem->css))
                        mem = NULL;
-        } else {
+        } else if (PageSwapCache(page)) {
                ent.val = page_private(page);
                id = lookup_swap_cgroup(ent);
                rcu_read_lock();
@@ -1426,12 +1585,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
        lock_page_cgroup(pc);
        if (unlikely(PageCgroupUsed(pc))) {
                unlock_page_cgroup(pc);
-                if (!mem_cgroup_is_root(mem)) {
+                mem_cgroup_cancel_charge(mem);
-                        res_counter_uncharge(&mem->res, PAGE_SIZE);
-                        if (do_swap_account)
-                                res_counter_uncharge(&mem->memsw, PAGE_SIZE);
-                }
-                css_put(&mem->css);
                return;
        }
@@ -1464,27 +1618,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 }
 /**
- * mem_cgroup_move_account - move account of the page
+ * __mem_cgroup_move_account - move account of the page
 * @pc: page_cgroup of the page.
 * @from: mem_cgroup which the page is moved from.
 * @to: mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must confirm following.
 * - page is not on LRU (isolate_page() is useful.)
- *
+ * - the pc is locked, used, and ->mem_cgroup points to @from.
- * returns 0 at success,
- * returns -EBUSY when lock is busy or "pc" is unstable.
 *
 * This function does "uncharge" from old cgroup but doesn't do "charge" to
 * new cgroup. It should be done by a caller.
 */
-static int mem_cgroup_move_account(struct page_cgroup *pc,
+static void __mem_cgroup_move_account(struct page_cgroup *pc,
        struct mem_cgroup *from, struct mem_cgroup *to)
 {
-        struct mem_cgroup_per_zone *from_mz, *to_mz;
-        int nid, zid;
-        int ret = -EBUSY;
        struct page *page;
        int cpu;
        struct mem_cgroup_stat *stat;
@@ -1492,38 +1641,27 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
        VM_BUG_ON(from == to);
        VM_BUG_ON(PageLRU(pc->page));
+        VM_BUG_ON(!PageCgroupLocked(pc));
-        nid = page_cgroup_nid(pc);
+        VM_BUG_ON(!PageCgroupUsed(pc));
-        zid = page_cgroup_zid(pc);
+        VM_BUG_ON(pc->mem_cgroup != from);
-        from_mz =  mem_cgroup_zoneinfo(from, nid, zid);
-        to_mz =  mem_cgroup_zoneinfo(to, nid, zid);
-        if (!trylock_page_cgroup(pc))
-                return ret;
-        if (!PageCgroupUsed(pc))
-                goto out;
-        if (pc->mem_cgroup != from)
-                goto out;
        if (!mem_cgroup_is_root(from))
                res_counter_uncharge(&from->res, PAGE_SIZE);
        mem_cgroup_charge_statistics(from, pc, false);
        page = pc->page;
-        if (page_is_file_cache(page) && page_mapped(page)) {
+        if (page_mapped(page) && !PageAnon(page)) {
                cpu = smp_processor_id();
                /* Update mapped_file data for mem_cgroup "from" */
                stat = &from->stat;
                cpustat = &stat->cpustat[cpu];
-                __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
+                __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
                                                -1);
                /* Update mapped_file data for mem_cgroup "to" */
                stat = &to->stat;
                cpustat = &stat->cpustat[cpu];
-                __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
+                __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
                                                1);
        }
@@ -1534,15 +1672,28 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
        css_get(&to->css);
        pc->mem_cgroup = to;
        mem_cgroup_charge_statistics(to, pc, true);
-        ret = 0;
-out:
-        unlock_page_cgroup(pc);
        /*
         * We charges against "to" which may not have any tasks. Then, "to"
         * can be under rmdir(). But in current implementation, caller of
         * this function is just force_empty() and it's garanteed that
         * "to" is never removed. So, we don't check rmdir status here.
         */
+}
+/*
+ * check whether the @pc is valid for moving account and call
+ * __mem_cgroup_move_account()
+ */
+static int mem_cgroup_move_account(struct page_cgroup *pc,
+                                struct mem_cgroup *from, struct mem_cgroup *to)
+{
+        int ret = -EINVAL;
+        lock_page_cgroup(pc);
+        if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
+                __mem_cgroup_move_account(pc, from, to);
+                ret = 0;
+        }
+        unlock_page_cgroup(pc);
        return ret;
 }
@@ -1564,45 +1715,27 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
        if (!pcg)
                return -EINVAL;
+        ret = -EBUSY;
+        if (!get_page_unless_zero(page))
+                goto out;
+        if (isolate_lru_page(page))
+                goto put;
        parent = mem_cgroup_from_cont(pcg);
        ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
        if (ret || !parent)
-                return ret;
+                goto put_back;
-        if (!get_page_unless_zero(page)) {
-                ret = -EBUSY;
-                goto uncharge;
-        }
-        ret = isolate_lru_page(page);
-        if (ret)
-                goto cancel;
        ret = mem_cgroup_move_account(pc, child, parent);
+        if (!ret)
+                css_put(&parent->css);  /* drop extra refcnt by try_charge() */
+        else
+                mem_cgroup_cancel_charge(parent);       /* does css_put */
+put_back:
        putback_lru_page(page);
-        if (!ret) {
+put:
-                put_page(page);
-                /* drop extra refcnt by try_charge() */
-                css_put(&parent->css);
-                return 0;
-        }
-cancel:
        put_page(page);
-uncharge:
+out:
-        /* drop extra refcnt by try_charge() */
-        css_put(&parent->css);
-        /* uncharge if move fails */
-        if (!mem_cgroup_is_root(parent)) {
-                res_counter_uncharge(&parent->res, PAGE_SIZE);
-                if (do_swap_account)
-                        res_counter_uncharge(&parent->memsw, PAGE_SIZE);
-        }
        return ret;
 }
@@ -1720,7 +1853,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 /*
 * While swap-in, try_charge -> commit or cancel, the page is locked.
 * And when try_charge() successfully returns, one refcnt to memcg without
- * struct page_cgroup is aquired. This refcnt will be cumsumed by
+ * struct page_cgroup is acquired. This refcnt will be consumed by
 * "commit()" or removed by "cancel()"
 */
 int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
@@ -1737,12 +1870,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
                goto charge_cur_mm;
        /*
         * A racing thread's fault, or swapoff, may have already updated
-         * the pte, and even removed page from swap cache: return success
+         * the pte, and even removed page from swap cache: in those cases
-         * to go on to do_swap_page()'s pte_same() test, which should fail.
+         * do_swap_page()'s pte_same() test will fail; but there's also a
+         * KSM case which does need to charge the page.
         */
        if (!PageSwapCache(page))
-                return 0;
+                goto charge_cur_mm;
-        mem = try_get_mem_cgroup_from_swapcache(page);
+        mem = try_get_mem_cgroup_from_page(page);
        if (!mem)
                goto charge_cur_mm;
        *ptr = mem;
@@ -1818,14 +1952,53 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
                return;
        if (!mem)
                return;
-        if (!mem_cgroup_is_root(mem)) {
+        mem_cgroup_cancel_charge(mem);
-                res_counter_uncharge(&mem->res, PAGE_SIZE);
-                if (do_swap_account)
-                        res_counter_uncharge(&mem->memsw, PAGE_SIZE);
-        }
-        css_put(&mem->css);
 }
+static void
+__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+{
+        struct memcg_batch_info *batch = NULL;
+        bool uncharge_memsw = true;
+        /* If swapout, usage of swap doesn't decrease */
+        if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
+                uncharge_memsw = false;
+        /*
+         * do_batch > 0 when unmapping pages or inode invalidate/truncate.
+         * In those cases, all pages freed continuously can be expected to be
+         * in the same cgroup and we have a chance to coalesce uncharges.
+         * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
+         * because we want to do uncharge as soon as possible.
+         */
+        if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
+                goto direct_uncharge;
+        batch = &current->memcg_batch;
+        /*
+         * Usually, we do css_get() when we remember a memcg pointer.
+         * But in this case, we keep res->usage until end of a series of
+         * uncharges. Then, it's ok to ignore memcg's refcnt.
+         */
+        if (!batch->memcg)
+                batch->memcg = mem;
+        /*
+         * In typical case, batch->memcg == mem. This means we can
+         * merge a series of uncharges to an uncharge of res_counter.
+         * If not, we uncharge res_counter one by one.
+         */
+        if (batch->memcg != mem)
+                goto direct_uncharge;
+        /* remember freed charge and uncharge it later */
+        batch->bytes += PAGE_SIZE;
+        if (uncharge_memsw)
+                batch->memsw_bytes += PAGE_SIZE;
+        return;
+direct_uncharge:
+        res_counter_uncharge(&mem->res, PAGE_SIZE);
+        if (uncharge_memsw)
+                res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+        return;
+}
 /*
 * uncharge if !page_mapped(page)
@@ -1874,12 +2047,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
                break;
        }
-        if (!mem_cgroup_is_root(mem)) {
+        if (!mem_cgroup_is_root(mem))
-                res_counter_uncharge(&mem->res, PAGE_SIZE);
+                __do_uncharge(mem, ctype);
-                if (do_swap_account &&
-                                (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
-                        res_counter_uncharge(&mem->memsw, PAGE_SIZE);
-        }
        if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
                mem_cgroup_swap_statistics(mem, true);
        mem_cgroup_charge_statistics(mem, pc, false);
@@ -1925,6 +2094,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
 }
+/*
+ * Batch_start/batch_end are called in unmap_page_range/invalidate/truncate.
+ * In those cases, pages are freed continuously and we can expect pages
+ * are in the same memcg. Each of these callers itself limits the number of
+ * pages freed at once, so uncharge_start/end() is called properly.
+ * This may be called multiple times (nested) in one context.
+ */
+void mem_cgroup_uncharge_start(void)
+{
+        current->memcg_batch.do_batch++;
+        /* Calls may nest: only the outermost start resets the batch state. */
+        if (current->memcg_batch.do_batch == 1) {
+                current->memcg_batch.memcg = NULL;
+                current->memcg_batch.bytes = 0;
+                current->memcg_batch.memsw_bytes = 0;
+        }
+}
+void mem_cgroup_uncharge_end(void)
+{
+        struct memcg_batch_info *batch = &current->memcg_batch;
+        if (!batch->do_batch)
+                return;
+        batch->do_batch--;
+        if (batch->do_batch) /* If stacked, do nothing. */
+                return;
+        if (!batch->memcg)
+                return;
+        /*
+         * This "batch->memcg" is valid without any css_get/put etc...
+         * because the charges we still hold keep it in use.
+         */
+        if (batch->bytes)
+                res_counter_uncharge(&batch->memcg->res, batch->bytes);
+        if (batch->memsw_bytes)
+                res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
+        /* forget this pointer (for sanity check) */
+        batch->memcg = NULL;
+}
 #ifdef CONFIG_SWAP
 /*
 * called after __delete_from_swap_cache() and drop "page" account.
@@ -2100,7 +2313,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
                                unsigned long long val)
 {
        int retry_count;
-        int progress;
        u64 memswlimit;
        int ret = 0;
        int children = mem_cgroup_count_children(memcg);
@@ -2144,8 +2356,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
                if (!ret)
                        break;
-                progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
+                mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
-                                                GFP_KERNEL,
                                                MEM_CGROUP_RECLAIM_SHRINK);
                curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
                /* Usage is reduced ? */
@@ -2384,6 +2595,7 @@ move_account:
                        goto out;
                /* This is for making all *used* pages to be on LRU. */
                lru_add_drain_all();
+                drain_all_stock_sync();
                ret = 0;
                for_each_node_state(node, N_HIGH_MEMORY) {
                        for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
@@ -2466,7 +2678,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
        cgroup_lock();
        /*
-         * If parent's use_hiearchy is set, we can't make any modifications
+         * If parent's use_hierarchy is set, we can't make any modifications
         * in the child subtrees. If it is unset, then the change can
         * occur, provided the current cgroup has no children.
         *
@@ -2541,6 +2753,7 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
                        val += idx_val;
                        mem_cgroup_get_recursive_idx_stat(mem,
                                MEM_CGROUP_STAT_SWAPOUT, &idx_val);
+                        val += idx_val;
                        val <<= PAGE_SHIFT;
                } else
                        val = res_counter_read_u64(&mem->memsw, name);
@@ -2660,7 +2873,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 enum {
        MCS_CACHE,
        MCS_RSS,
-        MCS_MAPPED_FILE,
+        MCS_FILE_MAPPED,
        MCS_PGPGIN,
        MCS_PGPGOUT,
        MCS_SWAP,
@@ -2704,8 +2917,8 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
        s->stat[MCS_CACHE] += val * PAGE_SIZE;
        val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
        s->stat[MCS_RSS] += val * PAGE_SIZE;
-        val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE);
+        val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED);
-        s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE;
+        s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
        val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
        s->stat[MCS_PGPGIN] += val;
        val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
@@ -3097,11 +3310,18 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
        /* root ? */
        if (cont->parent == NULL) {
+                int cpu;
                enable_swap_cgroup();
                parent = NULL;
                root_mem_cgroup = mem;
                if (mem_cgroup_soft_limit_tree_init())
                        goto free_out;
+                for_each_possible_cpu(cpu) {
+                        struct memcg_stock_pcp *stock =
+                                                &per_cpu(memcg_stock, cpu);
+                        INIT_WORK(&stock->work, drain_local_stock);
+                }
+                hotcpu_notifier(memcg_stock_cpu_callback, 0);
        } else {
                parent = mem_cgroup_from_cont(cont->parent);
@@ -3170,12 +3390,10 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
                                struct task_struct *p,
                                bool threadgroup)
 {
-        mutex_lock(&memcg_tasklist);
        /*
         * FIXME: It's better to move charges of this process from old
         * memcg to new memcg. But it's just on TODO-List now.
         */
-        mutex_unlock(&memcg_tasklist);
 }
 struct cgroup_subsys mem_cgroup_subsys = {
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index dacc64183874..17299fd4577c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -34,12 +34,16 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/page-flags.h>
+#include <linux/kernel-page-flags.h>
 #include <linux/sched.h>
 #include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 #include <linux/backing-dev.h>
+#include <linux/migrate.h>
+#include <linux/page-isolation.h>
+#include <linux/suspend.h>
 #include "internal.h"
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -48,6 +52,129 @@ int sysctl_memory_failure_recovery __read_mostly = 1;
 atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0);
+#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE)
+u32 hwpoison_filter_enable = 0;
+u32 hwpoison_filter_dev_major = ~0U;
+u32 hwpoison_filter_dev_minor = ~0U;
+u64 hwpoison_filter_flags_mask;
+u64 hwpoison_filter_flags_value;
+EXPORT_SYMBOL_GPL(hwpoison_filter_enable);
+EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major);
+EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor);
+EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask);
+EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value);
+static int hwpoison_filter_dev(struct page *p)
+{
+        struct address_space *mapping;
+        dev_t dev;
+        if (hwpoison_filter_dev_major == ~0U &&
+            hwpoison_filter_dev_minor == ~0U)
+                return 0;
+        /*
+         * page_mapping() does not accept slab page
+         */
+        if (PageSlab(p))
+                return -EINVAL;
+        mapping = page_mapping(p);
+        if (mapping == NULL || mapping->host == NULL)
+                return -EINVAL;
+        dev = mapping->host->i_sb->s_dev;
+        if (hwpoison_filter_dev_major != ~0U &&
+            hwpoison_filter_dev_major != MAJOR(dev))
+                return -EINVAL;
+        if (hwpoison_filter_dev_minor != ~0U &&
+            hwpoison_filter_dev_minor != MINOR(dev))
+                return -EINVAL;
+        return 0;
+}
+static int hwpoison_filter_flags(struct page *p)
+{
+        if (!hwpoison_filter_flags_mask)
+                return 0;
+        if ((stable_page_flags(p) & hwpoison_filter_flags_mask) ==
+                                    hwpoison_filter_flags_value)
+                return 0;
+        else
+                return -EINVAL;
+}
+/*
+ * This allows stress tests to limit test scope to a collection of tasks
+ * by putting them under some memcg. This prevents killing unrelated/important
+ * processes such as /sbin/init. Note that the target task may share clean
+ * pages with init (eg. libc text), which is harmless. If the target task
+ * share _dirty_ pages with another task B, the test scheme must make sure B
+ * is also included in the memcg. At last, due to race conditions this filter
+ * can only guarantee that the page either belongs to the memcg tasks, or is
+ * a freed page.
+ */
+#ifdef  CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+u64 hwpoison_filter_memcg;
+EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
+static int hwpoison_filter_task(struct page *p)
+{
+        struct mem_cgroup *mem;
+        struct cgroup_subsys_state *css;
+        unsigned long ino = 0;  /* stays 0 (never matches) without a dentry */
+        if (!hwpoison_filter_memcg)
+                return 0;
+        mem = try_get_mem_cgroup_from_page(p);
+        if (!mem)
+                return -EINVAL;
+        css = mem_cgroup_css(mem);
+        /* root_mem_cgroup has NULL dentries */
+        if (css->cgroup->dentry)
+                ino = css->cgroup->dentry->d_inode->i_ino;
+        /* drop the css reference on every path, including the NULL-dentry one */
+        css_put(css);
+        if (ino != hwpoison_filter_memcg)
+                return -EINVAL;
+        return 0;
+}
+#else
+static int hwpoison_filter_task(struct page *p) { return 0; }
+#endif
+int hwpoison_filter(struct page *p)
+{
+        if (!hwpoison_filter_enable)
+                return 0;
+        if (hwpoison_filter_dev(p))
+                return -EINVAL;
+        if (hwpoison_filter_flags(p))
+                return -EINVAL;
+        if (hwpoison_filter_task(p))
+                return -EINVAL;
+        return 0;
+}
+#else
+int hwpoison_filter(struct page *p)
+{
+        return 0;
+}
+#endif
+EXPORT_SYMBOL_GPL(hwpoison_filter);
 /*
 * Send all the processes who have the page mapped an ``action optional''
 * signal.
@@ -83,6 +210,36 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
 }
 /*
+ * When an unknown page type is encountered drain as many buffers as possible
+ * in the hope to turn the page into a LRU or free page, which we can handle.
+ */
+void shake_page(struct page *p, int access)
+{
+        if (!PageSlab(p)) {
+                lru_add_drain_all();
+                if (PageLRU(p))
+                        return;
+                drain_all_pages();
+                if (PageLRU(p) || is_free_buddy_page(p))
+                        return;
+        }
+        /*
+         * Only call shrink_slab here (which would also
+         * shrink other caches) if access is not potentially fatal.
+         */
+        if (access) {
+                int nr;
+                do {
+                        nr = shrink_slab(1000, GFP_KERNEL, 1000);
+                        if (page_count(p) == 0)
+                                break;
+                } while (nr > 10);
+        }
+}
+EXPORT_SYMBOL_GPL(shake_page);
+/*
 * Kill all processes that have a poisoned page mapped and then isolate
 * the page.
 *
@@ -174,10 +331,9 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
        list_for_each_entry_safe (tk, next, to_kill, nd) {
                if (doit) {
                        /*
-                         * In case something went wrong with munmaping
+                         * In case something went wrong with munmapping
                         * make sure the process doesn't catch the
                         * signal and then access the memory. Just kill it.
-                         * the signal handlers
                         */
                        if (fail || tk->addr_valid == 0) {
                                printk(KERN_ERR
@@ -314,33 +470,49 @@ static void collect_procs(struct page *page, struct list_head *tokill)
 */
 enum outcome {
-        FAILED,         /* Error handling failed */
+        IGNORED,        /* Error: cannot be handled */
+        FAILED,         /* Error: handling failed */
        DELAYED,        /* Will be handled later */
-        IGNORED,        /* Error safely ignored */
        RECOVERED,      /* Successfully recovered */
 };
 static const char *action_name[] = {
+        [IGNORED] = "Ignored",
        [FAILED] = "Failed",
        [DELAYED] = "Delayed",
-        [IGNORED] = "Ignored",
        [RECOVERED] = "Recovered",
 };
 /*
- * Error hit kernel page.
+ * XXX: It is possible that a page is isolated from LRU cache,
- * Do nothing, try to be lucky and not touch this instead. For a few cases we
+ * and then kept in swap cache or failed to remove from page cache.
- * could be more sophisticated.
+ * The page count will stop it from being freed by unpoison.
+ * Stress tests should be aware of this memory leak problem.
 */
-static int me_kernel(struct page *p, unsigned long pfn)
+static int delete_from_lru_cache(struct page *p)
 {
-        return DELAYED;
+        if (!isolate_lru_page(p)) {
+                /*
+                 * Clear sensible page flags, so that the buddy system won't
+                 * complain when the page is unpoison-and-freed.
+                 */
+                ClearPageActive(p);
+                ClearPageUnevictable(p);
+                /*
+                 * drop the page count elevated by isolate_lru_page()
+                 */
+                page_cache_release(p);
+                return 0;
+        }
+        return -EIO;
 }
 /*
- * Already poisoned page.
+ * Error hit kernel page.
+ * Do nothing, try to be lucky and not touch this instead. For a few cases we
+ * could be more sophisticated.
 */
-static int me_ignore(struct page *p, unsigned long pfn)
+static int me_kernel(struct page *p, unsigned long pfn)
 {
        return IGNORED;
 }
@@ -355,14 +527,6 @@ static int me_unknown(struct page *p, unsigned long pfn)
 }
 /*
- * Free memory
- */
-static int me_free(struct page *p, unsigned long pfn)
-{
-        return DELAYED;
-}
-/*
 * Clean (or cleaned) page cache page.
 */
 static int me_pagecache_clean(struct page *p, unsigned long pfn)
@@ -371,6 +535,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
        int ret = FAILED;
        struct address_space *mapping;
+        delete_from_lru_cache(p);
        /*
         * For anonymous pages we're done the only reference left
         * should be the one m_f() holds.
@@ -500,14 +666,20 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
        /* Trigger EIO in shmem: */
        ClearPageUptodate(p);
-        return DELAYED;
+        if (!delete_from_lru_cache(p))
+                return DELAYED;
+        else
+                return FAILED;
 }
 static int me_swapcache_clean(struct page *p, unsigned long pfn)
 {
        delete_from_swap_cache(p);
-        return RECOVERED;
+        if (!delete_from_lru_cache(p))
+                return RECOVERED;
+        else
+                return FAILED;
 }
 /*
@@ -550,7 +722,6 @@ static int me_huge_page(struct page *p, unsigned long pfn)
 #define tail            (1UL << PG_tail)
 #define compound        (1UL << PG_compound)
 #define slab            (1UL << PG_slab)
-#define buddy           (1UL << PG_buddy)
 #define reserved        (1UL << PG_reserved)
 static struct page_state {
@@ -559,8 +730,11 @@ static struct page_state {
        char *msg;
        int (*action)(struct page *p, unsigned long pfn);
 } error_states[] = {
-        { reserved,     reserved,       "reserved kernel",      me_ignore },
+        { reserved,     reserved,       "reserved kernel",      me_kernel },
-        { buddy,        buddy,          "free kernel",  me_free },
+        /*
+         * free pages are specially detected outside this table:
+         * PG_buddy pages only make a small fraction of all free pages.
+         */
        /*
         * Could in theory check if slab page is free or if we can drop
@@ -582,14 +756,11 @@ static struct page_state {
        { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty},
        { unevict,      unevict,        "unevictable LRU", me_pagecache_clean},
-#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
        { mlock|dirty,  mlock|dirty,    "mlocked LRU",  me_pagecache_dirty },
        { mlock,        mlock,          "mlocked LRU",  me_pagecache_clean },
-#endif
        { lru|dirty,    lru|dirty,      "LRU",          me_pagecache_dirty },
        { lru|dirty,    lru,            "clean LRU",    me_pagecache_clean },
-        { swapbacked,   swapbacked,     "anonymous",    me_pagecache_clean },
        /*
         * Catchall entry: must be at end.
@@ -597,20 +768,31 @@ static struct page_state {
        { 0,            0,              "unknown page state",   me_unknown },
 };
+#undef dirty
+#undef sc
+#undef unevict
+#undef mlock
+#undef writeback
+#undef lru
+#undef swapbacked
+#undef head
+#undef tail
+#undef compound
+#undef slab
+#undef reserved
 static void action_result(unsigned long pfn, char *msg, int result)
 {
-        struct page *page = NULL;
+        struct page *page = pfn_to_page(pfn);
-        if (pfn_valid(pfn))
-                page = pfn_to_page(pfn);
        printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n",
                pfn,
-                page && PageDirty(page) ? "dirty " : "",
+                PageDirty(page) ? "dirty " : "",
                msg, action_name[result]);
 }
 static int page_action(struct page_state *ps, struct page *p,
-                        unsigned long pfn, int ref)
+                        unsigned long pfn)
 {
        int result;
        int count;
@@ -618,18 +800,22 @@ static int page_action(struct page_state *ps, struct page *p,
        result = ps->action(p, pfn);
        action_result(pfn, ps->msg, result);
-        count = page_count(p) - 1 - ref;
+        count = page_count(p) - 1;
-        if (count != 0)
+        if (ps->action == me_swapcache_dirty && result == DELAYED)
+                count--;
+        if (count != 0) {
                printk(KERN_ERR
                       "MCE %#lx: %s page still referenced by %d users\n",
                       pfn, ps->msg, count);
+                result = FAILED;
+        }
        /* Could do more checks here if page looks ok */
        /*
         * Could adjust zone counters here to correct for the missing page.
         */
-        return result == RECOVERED ? 0 : -EBUSY;
+        return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
 }
 #define N_UNMAP_TRIES 5
@@ -638,7 +824,7 @@ static int page_action(struct page_state *ps, struct page *p,
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 */
-static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
+static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
                                  int trapno)
 {
        enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
@@ -648,15 +834,18 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
        int i;
        int kill = 1;
-        if (PageReserved(p) || PageCompound(p) || PageSlab(p) || PageKsm(p))
+        if (PageReserved(p) || PageSlab(p))
-                return;
+                return SWAP_SUCCESS;
        /*
         * This check implies we don't kill processes if their pages
         * are in the swap cache early. Those are always late kills.
         */
        if (!page_mapped(p))
-                return;
+                return SWAP_SUCCESS;
+        if (PageCompound(p) || PageKsm(p))
+                return SWAP_FAIL;
        if (PageSwapCache(p)) {
                printk(KERN_ERR
@@ -667,6 +856,8 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
        /*
         * Propagate the dirty bit from PTEs to struct page first, because we
         * need this to decide if we should kill or just drop the page.
+         * XXX: the dirty test could be racy: set_page_dirty() may not always
+         * be called inside page lock (it's recommended but not enforced).
         */
        mapping = page_mapping(p);
        if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
@@ -718,11 +909,12 @@ static void hwpoison_user_mappings(struct page *p, unsigned long pfn,
         */
        kill_procs_ao(&tokill, !!PageDirty(p), trapno,
                      ret != SWAP_SUCCESS, pfn);
+        return ret;
 }
-int __memory_failure(unsigned long pfn, int trapno, int ref)
+int __memory_failure(unsigned long pfn, int trapno, int flags)
 {
-        unsigned long lru_flag;
        struct page_state *ps;
        struct page *p;
        int res;
@@ -731,13 +923,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
                panic("Memory failure from trap %d on page %lx", trapno, pfn);
        if (!pfn_valid(pfn)) {
-                action_result(pfn, "memory outside kernel control", IGNORED);
+                printk(KERN_ERR
-                return -EIO;
+                       "MCE %#lx: memory outside kernel control\n",
+                       pfn);
+                return -ENXIO;
        }
        p = pfn_to_page(pfn);
        if (TestSetPageHWPoison(p)) {
-                action_result(pfn, "already hardware poisoned", IGNORED);
+                printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
                return 0;
        }
@@ -754,9 +948,15 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
         * In fact it's dangerous to directly bump up page count from 0,
         * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
         */
-        if (!get_page_unless_zero(compound_head(p))) {
+        if (!(flags & MF_COUNT_INCREASED) &&
-                action_result(pfn, "free or high order kernel", IGNORED);
+                !get_page_unless_zero(compound_head(p))) {
-                return PageBuddy(compound_head(p)) ? 0 : -EBUSY;
+                if (is_free_buddy_page(p)) {
+                        action_result(pfn, "free buddy", DELAYED);
+                        return 0;
+                } else {
+                        action_result(pfn, "high order kernel", IGNORED);
+                        return -EBUSY;
+                }
        }
        /*
@@ -768,14 +968,19 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
         * walked by the page reclaim code, however that's not a big loss.
         */
        if (!PageLRU(p))
-                lru_add_drain_all();
+                shake_page(p, 0);
-        lru_flag = p->flags & lru;
+        if (!PageLRU(p)) {
-        if (isolate_lru_page(p)) {
+                /*
+                 * shake_page could have turned it free.
+                 */
+                if (is_free_buddy_page(p)) {
+                        action_result(pfn, "free buddy, 2nd try", DELAYED);
+                        return 0;
+                }
                action_result(pfn, "non LRU", IGNORED);
                put_page(p);
                return -EBUSY;
        }
-        page_cache_release(p);
        /*
         * Lock the page and wait for writeback to finish.
@@ -783,26 +988,48 @@ int __memory_failure(unsigned long pfn, int trapno, int ref)
         * and in many cases impossible, so we just avoid it here.
         */
        lock_page_nosync(p);
+        /*
+         * unpoison always clears PG_hwpoison inside page lock
+         */
+        if (!PageHWPoison(p)) {
+                printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
+                res = 0;
+                goto out;
+        }
+        if (hwpoison_filter(p)) {
+                if (TestClearPageHWPoison(p))
+                        atomic_long_dec(&mce_bad_pages);
+                unlock_page(p);
+                put_page(p);
+                return 0;
+        }
        wait_on_page_writeback(p);
        /*
         * Now take care of user space mappings.
+         * Abort on fail: __remove_from_page_cache() assumes unmapped page.
         */
-        hwpoison_user_mappings(p, pfn, trapno);
+        if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) {
+                printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
+                res = -EBUSY;
+                goto out;
+        }
        /*
         * Torn down by someone else?
         */
-        if ((lru_flag & lru) && !PageSwapCache(p) && p->mapping == NULL) {
+        if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
                action_result(pfn, "already truncated LRU", IGNORED);
-                res = 0;
+                res = -EBUSY;
                goto out;
        }
        res = -EBUSY;
        for (ps = error_states;; ps++) {
-                if (((p->flags | lru_flag)& ps->mask) == ps->res) {
+                if ((p->flags & ps->mask) == ps->res) {
-                        res = page_action(ps, p, pfn, ref);
+                        res = page_action(ps, p, pfn);
                        break;
                }
        }
@@ -833,3 +1060,235 @@ void memory_failure(unsigned long pfn, int trapno)
 {
        __memory_failure(pfn, trapno, 0);
 }
+/**
+ * unpoison_memory - Unpoison a previously poisoned page
+ * @pfn: Page frame number of the page to be unpoisoned
+ *
+ * Software-unpoison a page that has been poisoned by
+ * memory_failure() earlier.
+ *
+ * This is only done at the software level, so it only works
+ * for Linux-injected failures, not real hardware failures.
+ *
+ * PG_hwpoison is tested and cleared on the page for @pfn itself, while
+ * the reference count and the page lock are taken on its compound head.
+ * Each time the flag is actually cleared, mce_bad_pages is decremented.
+ * If the compound head has a zero reference count, the flag is cleared
+ * without taking the page lock.
+ *
+ * When the flag is cleared under the page lock, one reference is dropped
+ * in addition to the one taken here - presumably the reference that was
+ * kept when the page was poisoned (TODO confirm against memory_failure()).
+ *
+ * Returns -ENXIO if @pfn is not valid, otherwise 0 (including when the
+ * page turns out not to be poisoned).
+ */
+int unpoison_memory(unsigned long pfn)
+{
+        struct page *page;
+        struct page *p;
+        int freeit = 0;
+        if (!pfn_valid(pfn))
+                return -ENXIO;
+        p = pfn_to_page(pfn);
+        page = compound_head(p);
+        if (!PageHWPoison(p)) {
+                pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
+                return 0;
+        }
+        if (!get_page_unless_zero(page)) {
+                if (TestClearPageHWPoison(p))
+                        atomic_long_dec(&mce_bad_pages);
+                pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
+                return 0;
+        }
+        lock_page_nosync(page);
+        /*
+         * This test is racy because PG_hwpoison is set outside of page lock.
+         * That's acceptable because that won't trigger kernel panic. Instead,
+         * the PG_hwpoison page will be caught and isolated on the entrance to
+         * the free buddy page pool.
+         */
+        if (TestClearPageHWPoison(p)) {
+                pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
+                atomic_long_dec(&mce_bad_pages);
+                freeit = 1;
+        }
+        unlock_page(page);
+        put_page(page);
+        if (freeit)
+                put_page(page);
+        return 0;
+}
+EXPORT_SYMBOL(unpoison_memory);
+/*
+ * Allocation callback handed to migrate_pages(): get one movable
+ * highuser page on the same node as @p.  @private and @x are unused.
+ */
+static struct page *new_page(struct page *p, unsigned long private, int **x)
+{
+        return alloc_pages_exact_node(page_to_nid(p), GFP_HIGHUSER_MOVABLE, 0);
+}
+/*
+ * Safely get reference count of an arbitrary page.
+ * Returns 0 for a free page, -EIO for a zero refcount page
+ * that is not free, and 1 for any other page type.
+ * For 1 the page is returned with increased page count, otherwise not.
+ *
+ * If MF_COUNT_INCREASED is set in @flags, 1 is returned immediately and
+ * no reference is taken here (presumably the caller already holds one -
+ * TODO confirm).  Otherwise the reference is taken on the compound head
+ * of @p.  A free buddy page is marked PG_hwpoison before 0 is returned.
+ * @pfn is only used in the debug messages.
+ */
+static int get_any_page(struct page *p, unsigned long pfn, int flags)
+{
+        int ret;
+        if (flags & MF_COUNT_INCREASED)
+                return 1;
+        /*
+         * The lock_system_sleep prevents a race with memory hotplug,
+         * because the isolation assumes there's only a single user.
+         * This is a big hammer; a better solution would be nicer.
+         */
+        lock_system_sleep();
+        /*
+         * Isolate the page, so that it doesn't get reallocated if it
+         * was free.
+         */
+        set_migratetype_isolate(p);
+        if (!get_page_unless_zero(compound_head(p))) {
+                if (is_free_buddy_page(p)) {
+                        pr_debug("get_any_page: %#lx free buddy page\n", pfn);
+                        /* Set hwpoison bit while page is still isolated */
+                        SetPageHWPoison(p);
+                        ret = 0;
+                } else {
+                        pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
+                                pfn, p->flags);
+                        ret = -EIO;
+                }
+        } else {
+                /* Not a free page */
+                ret = 1;
+        }
+        unset_migratetype_isolate(p);
+        unlock_system_sleep();
+        return ret;
+}
+/**
+ * soft_offline_page - Soft offline a page.
+ * @page: page to offline
+ * @flags: flags. Same as memory_failure().
+ *
+ * Returns 0 on success, otherwise negated errno.
+ *
+ * Soft offline a page, by migration or invalidation,
+ * without killing anything. This is for the case when
+ * a page is not corrupted yet (so it's still valid to access),
+ * but has had a number of corrected errors and is better taken
+ * out.
+ *
+ * The actual policy on when to do that is maintained by
+ * user space.
+ *
+ * This should never impact any application or cause data loss,
+ * however it might take some time.
+ *
+ * This is not a 100% solution for all memory, but tries to be
+ * ``good enough'' for the majority of memory.
+ *
+ * Sequence: take a reference via get_any_page(); if the page is not on
+ * the LRU, drop the reference, shake_page() it and retry once; then,
+ * under the page lock, try invalidate_inode_page(); if that does not
+ * succeed, isolate the page from the LRU and migrate it.  On success
+ * mce_bad_pages is incremented and PG_hwpoison is set on @page.
+ *
+ * Error returns visible here: -EIO for a page that is still not on the
+ * LRU or when migrate_pages() reports pages left unmigrated, -EBUSY when
+ * the page is already poisoned, or the negative value passed through
+ * from get_any_page(), isolate_lru_page() or migrate_pages().
+ *
+ * NOTE(review): the -EIO return for a non-LRU page happens after
+ * get_any_page() returned 1, i.e. with the page count raised, and no
+ * put_page() is done on that path - confirm whether that reference is
+ * meant to stay held.
+ */
+int soft_offline_page(struct page *page, int flags)
+{
+        int ret;
+        unsigned long pfn = page_to_pfn(page);
+        ret = get_any_page(page, pfn, flags);
+        if (ret < 0)
+                return ret;
+        if (ret == 0)
+                goto done;
+        /*
+         * Page cache page we can handle?
+         */
+        if (!PageLRU(page)) {
+                /*
+                 * Try to free it.
+                 */
+                put_page(page);
+                shake_page(page, 1);
+                /*
+                 * Did it turn free?
+                 */
+                ret = get_any_page(page, pfn, 0);
+                if (ret < 0)
+                        return ret;
+                if (ret == 0)
+                        goto done;
+        }
+        if (!PageLRU(page)) {
+                pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
+                                pfn, page->flags);
+                return -EIO;
+        }
+        lock_page(page);
+        wait_on_page_writeback(page);
+        /*
+         * Synchronized using the page lock with memory_failure()
+         */
+        if (PageHWPoison(page)) {
+                unlock_page(page);
+                put_page(page);
+                pr_debug("soft offline: %#lx page already poisoned\n", pfn);
+                return -EBUSY;
+        }
+        /*
+         * Try to invalidate first. This should work for
+         * non dirty unmapped page cache pages.
+         */
+        ret = invalidate_inode_page(page);
+        unlock_page(page);
+        /*
+         * Drop count because page migration doesn't like raised
+         * counts. The page could get re-allocated, but if it becomes
+         * LRU the isolation will just fail.
+         * RED-PEN would be better to keep it isolated here, but we
+         * would need to fix isolation locking first.
+         */
+        put_page(page);
+        if (ret == 1) {
+                ret = 0;
+                pr_debug("soft_offline: %#lx: invalidated\n", pfn);
+                goto done;
+        }
+        /*
+         * Simple invalidation didn't work.
+         * Try to migrate to a new page instead. migrate.c
+         * handles a large number of cases for us.
+         *
+         * A positive return from migrate_pages() (pages that could not
+         * be migrated) is turned into -EIO below.
+         *
+         * NOTE(review): the other migrate_pages() callers touched by this
+         * patch bump NR_ISOLATED_ANON/NR_ISOLATED_FILE right after putting
+         * an isolated page on their list, and migrate_pages() no longer
+         * does that itself; this caller does not - confirm whether the
+         * counter needs to be incremented here as well.
+         */
+        ret = isolate_lru_page(page);
+        if (!ret) {
+                LIST_HEAD(pagelist);
+                list_add(&page->lru, &pagelist);
+                ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+                if (ret) {
+                        pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+                                pfn, ret, page->flags);
+                        if (ret > 0)
+                                ret = -EIO;
+                }
+        } else {
+                pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
+                                pfn, ret, page_count(page), page->flags);
+        }
+        if (ret)
+                return ret;
+done:
+        atomic_long_add(1, &mce_bad_pages);
+        SetPageHWPoison(page);
+        /* keep elevated page count for bad page */
+        return ret;
+}
diff --git a/mm/memory.c b/mm/memory.c
index 6ab19dd4a199..09e4b1be7b67 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -572,7 +572,7 @@ out:
 * covered by this vma.
 */
-static inline void
+static inline unsigned long
 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
                unsigned long addr, int *rss)
@@ -586,7 +586,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                if (!pte_file(pte)) {
                        swp_entry_t entry = pte_to_swp_entry(pte);
-                        swap_duplicate(entry);
+                        if (swap_duplicate(entry) < 0)
+                                return entry.val;
                        /* make sure dst_mm is on swapoff's mmlist. */
                        if (unlikely(list_empty(&dst_mm->mmlist))) {
                                spin_lock(&mmlist_lock);
@@ -635,6 +637,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 out_set_pte:
        set_pte_at(dst_mm, addr, dst_pte, pte);
+        return 0;
 }
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -646,6 +649,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        spinlock_t *src_ptl, *dst_ptl;
        int progress = 0;
        int rss[2];
+        swp_entry_t entry = (swp_entry_t){0};
 again:
        rss[1] = rss[0] = 0;
@@ -674,7 +678,10 @@ again:
                        progress++;
                        continue;
                }
-                copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
+                entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
+                                                        vma, addr, rss);
+                if (entry.val)
+                        break;
                progress += 8;
        } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
@@ -684,6 +691,12 @@ again:
        add_mm_rss(dst_mm, rss[0], rss[1]);
        pte_unmap_unlock(orig_dst_pte, dst_ptl);
        cond_resched();
+        if (entry.val) {
+                if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
+                        return -ENOMEM;
+                progress = 0;
+        }
        if (addr != end)
                goto again;
        return 0;
@@ -943,6 +956,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
                details = NULL;
        BUG_ON(addr >= end);
+        mem_cgroup_uncharge_start();
        tlb_start_vma(tlb, vma);
        pgd = pgd_offset(vma->vm_mm, addr);
        do {
@@ -955,6 +969,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
                                                zap_work, details);
        } while (pgd++, addr = next, (addr != end && *zap_work > 0));
        tlb_end_vma(tlb, vma);
+        mem_cgroup_uncharge_end();
        return addr;
 }
@@ -2514,7 +2529,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        ret = VM_FAULT_HWPOISON;
                } else {
                        print_bad_pte(vma, address, orig_pte, NULL);
-                        ret = VM_FAULT_OOM;
+                        ret = VM_FAULT_SIGBUS;
                }
                goto out;
        }
@@ -2540,6 +2555,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                ret = VM_FAULT_MAJOR;
                count_vm_event(PGMAJFAULT);
        } else if (PageHWPoison(page)) {
+                /*
+                 * hwpoisoned dirty swapcache pages are kept for killing
+                 * owner processes (which may be unknown at hwpoison time)
+                 */
                ret = VM_FAULT_HWPOISON;
                delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
                goto out_release;
@@ -2548,6 +2567,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        lock_page(page);
        delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+        page = ksm_might_need_to_copy(page, vma, address);
+        if (!page) {
+                ret = VM_FAULT_OOM;
+                goto out;
+        }
        if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
                ret = VM_FAULT_OOM;
                goto out_page;
@@ -2910,7 +2935,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 * Page table corrupted: show pte and kill process.
                 */
                print_bad_pte(vma, address, orig_pte, NULL);
-                return VM_FAULT_OOM;
+                return VM_FAULT_SIGBUS;
        }
        pgoff = pte_to_pgoff(orig_pte);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 821dee596377..030ce8a5bb0e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,6 +26,8 @@
 #include <linux/migrate.h>
 #include <linux/page-isolation.h>
 #include <linux/pfn.h>
+#include <linux/suspend.h>
+#include <linux/mm_inline.h>
 #include <asm/tlbflush.h>
@@ -70,7 +72,9 @@ static void get_page_bootmem(unsigned long info,  struct page *page, int type)
        atomic_inc(&page->_count);
 }
-void put_page_bootmem(struct page *page)
+/* reference to __meminit __free_pages_bootmem is valid
+ * so use __ref to tell modpost not to generate a warning */
+void __ref put_page_bootmem(struct page *page)
 {
        int type;
@@ -447,7 +451,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
 }
 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
-static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
+/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
+static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 {
        struct pglist_data *pgdat;
        unsigned long zones_size[MAX_NR_ZONES] = {0};
@@ -484,14 +489,18 @@ int __ref add_memory(int nid, u64 start, u64 size)
        struct resource *res;
        int ret;
+        lock_system_sleep();
        res = register_memory_resource(start, size);
+        ret = -EEXIST;
        if (!res)
-                return -EEXIST;
+                goto out;
        if (!node_online(nid)) {
                pgdat = hotadd_new_pgdat(nid, start);
+                ret = -ENOMEM;
                if (!pgdat)
-                        return -ENOMEM;
+                        goto out;
                new_pgdat = 1;
        }
@@ -514,7 +523,8 @@ int __ref add_memory(int nid, u64 start, u64 size)
                BUG_ON(ret);
        }
-        return ret;
+        goto out;
 error:
        /* rollback pgdat allocation and others */
        if (new_pgdat)
@@ -522,6 +532,8 @@ error:
        if (res)
                release_memory_resource(res);
+out:
+        unlock_system_sleep();
        return ret;
 }
 EXPORT_SYMBOL_GPL(add_memory);
@@ -663,6 +675,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
                if (!ret) { /* Success */
                        list_add_tail(&page->lru, &source);
                        move_pages--;
+                        inc_zone_page_state(page, NR_ISOLATED_ANON +
+                                            page_is_file_cache(page));
                } else {
                        /* Becasue we don't have big zone->lock. we should
                           check this again here. */
@@ -685,7 +700,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
        if (list_empty(&source))
                goto out;
        /* this function returns # of failed pages */
-        ret = migrate_pages(&source, hotremove_migrate_alloc, 0);
+        ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
 out:
        return ret;
@@ -738,7 +753,7 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
        return offlined;
 }
-int offline_pages(unsigned long start_pfn,
+static int offline_pages(unsigned long start_pfn,
                  unsigned long end_pfn, unsigned long timeout)
 {
        unsigned long pfn, nr_pages, expire;
@@ -758,6 +773,8 @@ int offline_pages(unsigned long start_pfn,
        if (!test_pages_in_a_zone(start_pfn, end_pfn))
                return -EINVAL;
+        lock_system_sleep();
        zone = page_zone(pfn_to_page(start_pfn));
        node = zone_to_nid(zone);
        nr_pages = end_pfn - start_pfn;
@@ -765,7 +782,7 @@ int offline_pages(unsigned long start_pfn,
        /* set above range as isolated */
        ret = start_isolate_page_range(start_pfn, end_pfn);
        if (ret)
-                return ret;
+                goto out;
        arg.start_pfn = start_pfn;
        arg.nr_pages = nr_pages;
@@ -838,11 +855,16 @@ repeat:
        setup_per_zone_wmarks();
        calculate_zone_inactive_ratio(zone);
+        if (!node_present_pages(node)) {
+                node_clear_state(node, N_HIGH_MEMORY);
+                kswapd_stop(node);
+        }
        vm_total_pages = nr_free_pagecache_pages();
        writeback_set_ratelimit();
        memory_notify(MEM_OFFLINE, &arg);
+        unlock_system_sleep();
        return 0;
 failed_removal:
@@ -852,6 +874,8 @@ failed_removal:
        /* pushback to free area */
        undo_isolate_page_range(start_pfn, end_pfn);
+out:
+        unlock_system_sleep();
        return ret;
 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4545d5944243..290fb5bf0440 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -85,10 +85,12 @@
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 #include <linux/migrate.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/ctype.h>
+#include <linux/mm_inline.h>
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
@@ -412,17 +414,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                if (!page)
                        continue;
                /*
-                 * The check for PageReserved here is important to avoid
+                 * vm_normal_page() filters out zero pages, but there might
-                 * handling zero pages and other pages that may have been
+                 * still be PageReserved pages to skip, perhaps in a VDSO.
-                 * marked special by the system.
+                 * And we cannot move PageKsm pages sensibly or safely yet.
-                 *
-                 * If the PageReserved would not be checked here then f.e.
-                 * the location of the zero page could have an influence
-                 * on MPOL_MF_STRICT, zero pages would be counted for
-                 * the per node stats, and there would be useless attempts
-                 * to put zero pages on the migration list.
                 */
-                if (PageReserved(page))
+                if (PageReserved(page) || PageKsm(page))
                        continue;
                nid = page_to_nid(page);
                if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -809,6 +805,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
                if (!isolate_lru_page(page)) {
                        list_add_tail(&page->lru, pagelist);
+                        inc_zone_page_state(page, NR_ISOLATED_ANON +
+                                            page_is_file_cache(page));
                }
        }
 }
@@ -836,7 +834,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
                        flags | MPOL_MF_DISCONTIG_OK, &pagelist);
        if (!list_empty(&pagelist))
-                err = migrate_pages(&pagelist, new_node_page, dest);
+                err = migrate_pages(&pagelist, new_node_page, dest, 0);
        return err;
 }
@@ -1053,7 +1051,7 @@ static long do_mbind(unsigned long start, unsigned long len,
                if (!list_empty(&pagelist))
                        nr_failed = migrate_pages(&pagelist, new_vma_page,
-                                                (unsigned long)vma);
+                                                (unsigned long)vma, 0);
                if (!err && nr_failed && (flags & MPOL_MF_STRICT))
                        err = -EIO;
@@ -1565,6 +1563,53 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
        }
        return zl;
 }
+/*
+ * init_nodemask_of_mempolicy
+ *
+ * Fill @mask from the current task's mempolicy.
+ *
+ * Returns 'false' when @mask is NULL or the task has the "default"
+ * [NULL] mempolicy; @mask is left untouched in that case.  Otherwise
+ * returns 'true' after setting @mask to:
+ *   - the policy's nodemask for 'bind' or 'interleave' policy, or
+ *   - the single preferred node for 'preferred' policy, which is the
+ *     local node when MPOL_F_LOCAL is set.
+ * Any other policy mode is a BUG().
+ *
+ * No reference is taken on the mempolicy [mpol_get/put]: the current
+ * task is examining its own mempolicy, and a task's mempolicy is only
+ * ever changed by the task itself.
+ *
+ * N.B., the nodemask belongs to the caller, who remains responsible
+ * for freeing it.
+ */
+bool init_nodemask_of_mempolicy(nodemask_t *mask)
+{
+        struct mempolicy *pol;
+        int node;
+        if (!mask)
+                return false;
+        pol = current->mempolicy;
+        if (!pol)
+                return false;
+        switch (pol->mode) {
+        case MPOL_BIND:
+                /* Fall through */
+        case MPOL_INTERLEAVE:
+                *mask = pol->v.nodes;
+                break;
+        case MPOL_PREFERRED:
+                node = (pol->flags & MPOL_F_LOCAL) ? numa_node_id()
+                                                   : pol->v.preferred_node;
+                init_nodemask_of_node(mask, node);
+                break;
+        default:
+                BUG();
+        }
+        return true;
+}
 #endif
 /* Allocate a page in interleaved policy.
diff --git a/mm/migrate.c b/mm/migrate.c
index 1a4bf4813780..efddbf0926b2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -21,6 +21,7 @@
 #include <linux/mm_inline.h>
 #include <linux/nsproxy.h>
 #include <linux/pagevec.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/topology.h>
 #include <linux/cpu.h>
@@ -78,8 +79,8 @@ int putback_lru_pages(struct list_head *l)
 /*
 * Restore a potential migration pte to a working pte entry
 */
-static void remove_migration_pte(struct vm_area_struct *vma,
+static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
-                struct page *old, struct page *new)
+                                 unsigned long addr, void *old)
 {
        struct mm_struct *mm = vma->vm_mm;
        swp_entry_t entry;
@@ -88,40 +89,37 @@ static void remove_migration_pte(struct vm_area_struct *vma,
        pmd_t *pmd;
        pte_t *ptep, pte;
        spinlock_t *ptl;
-        unsigned long addr = page_address_in_vma(new, vma);
-        if (addr == -EFAULT)
-                return;
        pgd = pgd_offset(mm, addr);
        if (!pgd_present(*pgd))
-                return;
+                goto out;
        pud = pud_offset(pgd, addr);
        if (!pud_present(*pud))
-                return;
+                goto out;
        pmd = pmd_offset(pud, addr);
        if (!pmd_present(*pmd))
-                return;
+                goto out;
        ptep = pte_offset_map(pmd, addr);
        if (!is_swap_pte(*ptep)) {
                pte_unmap(ptep);
-                return;
+                goto out;
        }
        ptl = pte_lockptr(mm, pmd);
        spin_lock(ptl);
        pte = *ptep;
        if (!is_swap_pte(pte))
-                goto out;
+                goto unlock;
        entry = pte_to_swp_entry(pte);
-        if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
+        if (!is_migration_entry(entry) ||
-                goto out;
+            migration_entry_to_page(entry) != old)
+                goto unlock;
        get_page(new);
        pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
@@ -137,58 +135,10 @@ static void remove_migration_pte(struct vm_area_struct *vma,
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, addr, pte);
+unlock:
-out:
        pte_unmap_unlock(ptep, ptl);
-}
+out:
+        return SWAP_AGAIN;
-/*
- * Note that remove_file_migration_ptes will only work on regular mappings,
- * Nonlinear mappings do not use migration entries.
- */
-static void remove_file_migration_ptes(struct page *old, struct page *new)
-{
-        struct vm_area_struct *vma;
-        struct address_space *mapping = new->mapping;
-        struct prio_tree_iter iter;
-        pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-        if (!mapping)
-                return;
-        spin_lock(&mapping->i_mmap_lock);
-        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
-                remove_migration_pte(vma, old, new);
-        spin_unlock(&mapping->i_mmap_lock);
-}
-/*
- * Must hold mmap_sem lock on at least one of the vmas containing
- * the page so that the anon_vma cannot vanish.
- */
-static void remove_anon_migration_ptes(struct page *old, struct page *new)
-{
-        struct anon_vma *anon_vma;
-        struct vm_area_struct *vma;
-        unsigned long mapping;
-        mapping = (unsigned long)new->mapping;
-        if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
-                return;
-        /*
-         * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
-         */
-        anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
-        spin_lock(&anon_vma->lock);
-        list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
-                remove_migration_pte(vma, old, new);
-        spin_unlock(&anon_vma->lock);
 }
 /*
@@ -197,10 +147,7 @@ static void remove_anon_migration_ptes(struct page *old, struct page *new)
 */
 static void remove_migration_ptes(struct page *old, struct page *new)
 {
-        if (PageAnon(new))
+        rmap_walk(new, remove_migration_pte, old);
-                remove_anon_migration_ptes(old, new);
-        else
-                remove_file_migration_ptes(old, new);
 }
 /*
@@ -341,8 +288,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
        if (TestClearPageActive(page)) {
                VM_BUG_ON(PageUnevictable(page));
                SetPageActive(newpage);
-        } else
+        } else if (TestClearPageUnevictable(page))
-                unevictable_migrate_page(newpage, page);
+                SetPageUnevictable(newpage);
        if (PageChecked(page))
                SetPageChecked(newpage);
        if (PageMappedToDisk(page))
@@ -361,6 +308,7 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
        }
        mlock_migrate_page(newpage, page);
+        ksm_migrate_page(newpage, page);
        ClearPageSwapCache(page);
        ClearPagePrivate(page);
@@ -580,9 +528,9 @@ static int move_to_new_page(struct page *newpage, struct page *page)
        else
                rc = fallback_migrate_page(mapping, newpage, page);
-        if (!rc) {
+        if (!rc)
                remove_migration_ptes(page, newpage);
-        } else
+        else
                newpage->mapping = NULL;
        unlock_page(newpage);
@@ -595,14 +543,14 @@ static int move_to_new_page(struct page *newpage, struct page *page)
 * to the newly allocated page in newpage.
 */
 static int unmap_and_move(new_page_t get_new_page, unsigned long private,
-                        struct page *page, int force)
+                        struct page *page, int force, int offlining)
 {
        int rc = 0;
        int *result = NULL;
        struct page *newpage = get_new_page(page, private, &result);
        int rcu_locked = 0;
        int charge = 0;
-        struct mem_cgroup *mem;
+        struct mem_cgroup *mem = NULL;
        if (!newpage)
                return -ENOMEM;
@@ -621,6 +569,20 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
                lock_page(page);
        }
+        /*
+         * Only memory hotplug's offline_pages() caller has locked out KSM,
+         * and can safely migrate a KSM page.  The other cases have skipped
+         * PageKsm along with PageReserved - but it is only now when we have
+         * the page lock that we can be certain it will not go KSM beneath us
+         * (KSM will not upgrade a page from PageAnon to PageKsm when it sees
+         * its pagecount raised, but only here do we take the page lock which
+         * serializes that).
+         */
+        if (PageKsm(page) && !offlining) {
+                rc = -EBUSY;
+                goto unlock;
+        }
        /* charge against new page */
        charge = mem_cgroup_prepare_migration(page, &mem);
        if (charge == -ENOMEM) {
@@ -737,7 +699,7 @@ move_newpage:
 * Return: Number of pages not migrated or error code.
 */
 int migrate_pages(struct list_head *from,
-                new_page_t get_new_page, unsigned long private)
+                new_page_t get_new_page, unsigned long private, int offlining)
 {
        int retry = 1;
        int nr_failed = 0;
@@ -746,13 +708,6 @@ int migrate_pages(struct list_head *from,
        struct page *page2;
        int swapwrite = current->flags & PF_SWAPWRITE;
        int rc;
-        unsigned long flags;
-        local_irq_save(flags);
-        list_for_each_entry(page, from, lru)
-                __inc_zone_page_state(page, NR_ISOLATED_ANON +
-                                page_is_file_cache(page));
-        local_irq_restore(flags);
        if (!swapwrite)
                current->flags |= PF_SWAPWRITE;
@@ -764,7 +719,7 @@ int migrate_pages(struct list_head *from,
                        cond_resched();
                        rc = unmap_and_move(get_new_page, private,
-                                                page, pass > 2);
+                                                page, pass > 2, offlining);
                        switch(rc) {
                        case -ENOMEM:
@@ -860,7 +815,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
                if (!page)
                        goto set_status;
-                if (PageReserved(page))         /* Check for zero page */
+                /* Use PageReserved to check for zero page */
+                if (PageReserved(page) || PageKsm(page))
                        goto put_and_set;
                pp->page = page;
@@ -878,8 +834,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
                        goto put_and_set;
                err = isolate_lru_page(page);
-                if (!err)
+                if (!err) {
                        list_add_tail(&page->lru, &pagelist);
+                        inc_zone_page_state(page, NR_ISOLATED_ANON +
+                                            page_is_file_cache(page));
+                }
 put_and_set:
                /*
                 * Either remove the duplicate refcount from
@@ -894,7 +853,7 @@ set_status:
        err = 0;
        if (!list_empty(&pagelist))
                err = migrate_pages(&pagelist, new_page_node,
-                                (unsigned long)pm);
+                                (unsigned long)pm, 0);
        up_read(&mm->mmap_sem);
        return err;
@@ -1015,7 +974,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
                err = -ENOENT;
                /* Use PageReserved to check for zero page */
-                if (!page || PageReserved(page))
+                if (!page || PageReserved(page) || PageKsm(page))
                        goto set_status;
                err = page_to_nid(page);
@@ -1044,7 +1003,7 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
        int err;
        for (i = 0; i < nr_pages; i += chunk_nr) {
-                if (chunk_nr + i > nr_pages)
+                if (chunk_nr > nr_pages - i)
                        chunk_nr = nr_pages - i;
                err = copy_from_user(chunk_pages, &pages[i],
diff --git a/mm/mincore.c b/mm/mincore.c
index 8cb508f84ea4..7a3436ef39eb 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -14,6 +14,7 @@
 #include <linux/syscalls.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/hugetlb.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -72,6 +73,42 @@ static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pag
        if (!vma || addr < vma->vm_start)
                return -ENOMEM;
+#ifdef CONFIG_HUGETLB_PAGE
+        if (is_vm_hugetlb_page(vma)) {
+                struct hstate *h;
+                unsigned long nr_huge;
+                unsigned char present;
+                i = 0;
+                nr = min(pages, (vma->vm_end - addr) >> PAGE_SHIFT);
+                h = hstate_vma(vma);
+                nr_huge = ((addr + pages * PAGE_SIZE - 1) >> huge_page_shift(h))
+                          - (addr >> huge_page_shift(h)) + 1;
+                nr_huge = min(nr_huge,
+                              (vma->vm_end - addr) >> huge_page_shift(h));
+                while (1) {
+                        /* hugepage always in RAM for now,
+                         * but generally it needs to be checked */
+                        ptep = huge_pte_offset(current->mm,
+                                               addr & huge_page_mask(h));
+                        present = !!(ptep &&
+                                     !huge_pte_none(huge_ptep_get(ptep)));
+                        while (1) {
+                                vec[i++] = present;
+                                addr += PAGE_SIZE;
+                                /* reach buffer limit */
+                                if (i == nr)
+                                        return nr;
+                                /* check hugepage border */
+                                if (!((addr & ~huge_page_mask(h))
+                                      >> PAGE_SHIFT))
+                                        break;
+                        }
+                }
+                return nr;
+        }
+#endif
        /*
         * Calculate how many pages there are left in the last level of the
         * PTE array for our address.
diff --git a/mm/mlock.c b/mm/mlock.c
index bd6f0e466f6c..2b8335a89400 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -88,25 +88,22 @@ void mlock_vma_page(struct page *page)
        }
 }
-/*
+/**
- * called from munlock()/munmap() path with page supposedly on the LRU.
+ * munlock_vma_page - munlock a vma page
+ * @page: page to be unlocked
 *
- * Note:  unlike mlock_vma_page(), we can't just clear the PageMlocked
+ * called from munlock()/munmap() path with page supposedly on the LRU.
- * [in try_to_munlock()] and then attempt to isolate the page.  We must
+ * When we munlock a page, because the vma where we found the page is being
- * isolate the page to keep others from messing with its unevictable
+ * munlock()ed or munmap()ed, we want to check whether other vmas hold the
- * and mlocked state while trying to munlock.  However, we pre-clear the
+ * page locked so that we can leave it on the unevictable lru list and not
- * mlocked state anyway as we might lose the isolation race and we might
+ * bother vmscan with it.  However, to walk the page's rmap list in
- * not get another chance to clear PageMlocked.  If we successfully
+ * try_to_munlock() we must isolate the page from the LRU.  If some other
- * isolate the page and try_to_munlock() detects other VM_LOCKED vmas
+ * task has removed the page from the LRU, we won't be able to do that.
- * mapping the page, it will restore the PageMlocked state, unless the page
+ * So we clear the PageMlocked as we might not get another chance.  If we
- * is mapped in a non-linear vma.  So, we go ahead and SetPageMlocked(),
+ * can't isolate the page, we leave it for putback_lru_page() and vmscan
- * perhaps redundantly.
+ * [page_referenced()/try_to_unmap()] to deal with.
- * If we lose the isolation race, and the page is mapped by other VM_LOCKED
- * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
- * either of which will restore the PageMlocked state by calling
- * mlock_vma_page() above, if it can grab the vma's mmap sem.
 */
-static void munlock_vma_page(struct page *page)
+void munlock_vma_page(struct page *page)
 {
        BUG_ON(!PageLocked(page));
@@ -117,18 +114,18 @@ static void munlock_vma_page(struct page *page)
                        /*
                         * did try_to_unlock() succeed or punt?
                         */
-                        if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN)
+                        if (ret != SWAP_MLOCK)
                                count_vm_event(UNEVICTABLE_PGMUNLOCKED);
                        putback_lru_page(page);
                } else {
                        /*
-                         * We lost the race.  let try_to_unmap() deal
+                         * Some other task has removed the page from the LRU.
-                         * with it.  At least we get the page state and
+                         * putback_lru_page() will take care of removing the
-                         * mlock stats right.  However, page is still on
+                         * page from the unevictable list, if necessary.
-                         * the noreclaim list.  We'll fix that up when
+                         * vmscan [page_referenced()] will move the page back
-                         * the page is eventually freed or we scan the
+                         * to the unevictable list if some other vma has it
-                         * noreclaim list.
+                         * mlocked.
                         */
                        if (PageUnevictable(page))
                                count_vm_event(UNEVICTABLE_PGSTRANDED);
diff --git a/mm/mmap.c b/mm/mmap.c
index 73f5e4b64010..d9c77b2dbe9d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -20,7 +20,6 @@
 #include <linux/fs.h>
 #include <linux/personality.h>
 #include <linux/security.h>
-#include <linux/ima.h>
 #include <linux/hugetlb.h>
 #include <linux/profile.h>
 #include <linux/module.h>
@@ -932,13 +931,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
        if (!(flags & MAP_FIXED))
                addr = round_hint_to_min(addr);
-        error = arch_mmap_check(addr, len, flags);
-        if (error)
-                return error;
        /* Careful about overflows.. */
        len = PAGE_ALIGN(len);
-        if (!len || len > TASK_SIZE)
+        if (!len)
                return -ENOMEM;
        /* offset overflow? */
@@ -949,24 +944,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
        if (mm->map_count > sysctl_max_map_count)
                return -ENOMEM;
-        if (flags & MAP_HUGETLB) {
-                struct user_struct *user = NULL;
-                if (file)
-                        return -EINVAL;
-                /*
-                 * VM_NORESERVE is used because the reservations will be
-                 * taken when vm_ops->mmap() is called
-                 * A dummy user value is used because we are not locking
-                 * memory so no accounting is necessary
-                 */
-                len = ALIGN(len, huge_page_size(&default_hstate));
-                file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
-                                                &user, HUGETLB_ANONHUGE_INODE);
-                if (IS_ERR(file))
-                        return PTR_ERR(file);
-        }
        /* Obtain the address to map to. we verify (or select) it and ensure
         * that it represents a valid section of the address space.
         */
@@ -1061,9 +1038,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
        error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
        if (error)
                return error;
-        error = ima_file_mmap(file, prot);
-        if (error)
-                return error;
        return mmap_region(file, addr, len, flags, vm_flags, pgoff);
 }
@@ -1224,8 +1198,20 @@ munmap_back:
                        goto free_vma;
        }
-        if (vma_wants_writenotify(vma))
+        if (vma_wants_writenotify(vma)) {
+                pgprot_t pprot = vma->vm_page_prot;
+                /* Can vma->vm_page_prot have changed??
+                 *
+                 * Answer: Yes, drivers may have changed it in their
+                 *         f_op->mmap method.
+                 *
+                 * Ensures that vmas marked as uncached stay that way.
+                 */
                vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
+                if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot)))
+                        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+        }
        vma_link(mm, vma, prev, rb_link, rb_parent);
        file = vma->vm_file;
@@ -1459,6 +1445,14 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
        unsigned long (*get_area)(struct file *, unsigned long,
                                  unsigned long, unsigned long, unsigned long);
+        unsigned long error = arch_mmap_check(addr, len, flags);
+        if (error)
+                return error;
+        /* Careful about overflows.. */
+        if (len > TASK_SIZE)
+                return -ENOMEM;
        get_area = current->mm->get_unmapped_area;
        if (file && file->f_op && file->f_op->get_unmapped_area)
                get_area = file->f_op->get_unmapped_area;
@@ -1829,10 +1823,10 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 /*
- * Split a vma into two pieces at address 'addr', a new vma is allocated
+ * __split_vma() bypasses sysctl_max_map_count checking.  We use this on the
- * either for the first part or the tail.
+ * munmap path where it doesn't make sense to fail.
 */
-int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
+static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
              unsigned long addr, int new_below)
 {
        struct mempolicy *pol;
@@ -1842,9 +1836,6 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
                                        ~(huge_page_mask(hstate_vma(vma)))))
                return -EINVAL;
-        if (mm->map_count >= sysctl_max_map_count)
-                return -ENOMEM;
        new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
        if (!new)
                return -ENOMEM;
@@ -1884,6 +1875,19 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
        return 0;
 }
+/*
+ * Split a vma into two pieces at address 'addr', a new vma is allocated
+ * either for the first part or the tail.
+ *
+ * This is the limit-checking entry point: it returns -ENOMEM without
+ * splitting when mm->map_count has already reached sysctl_max_map_count,
+ * and otherwise returns whatever __split_vma() returns.
+ */
+int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+              unsigned long addr, int new_below)
+{
+        if (mm->map_count >= sysctl_max_map_count)
+                return -ENOMEM;
+        return __split_vma(mm, vma, addr, new_below);
+}
 /* Munmap is split into 2 main parts -- this part which finds
 * what needs doing, and the areas themselves, which do the
 * work.  This now handles partial unmappings.
@@ -1919,7 +1923,17 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
         * places tmp vma above, and higher split_vma places tmp vma below.
         */
        if (start > vma->vm_start) {
-                int error = split_vma(mm, vma, start, 0);
+                int error;
+                /*
+                 * Make sure that map_count on return from munmap() will
+                 * not exceed its limit; but let map_count go just above
+                 * its limit temporarily, to help free resources as expected.
+                 */
+                if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
+                        return -ENOMEM;
+                error = __split_vma(mm, vma, start, 0);
                if (error)
                        return error;
                prev = vma;
@@ -1928,7 +1942,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
        /* Does it split the last one? */
        last = find_vma(mm, end);
        if (last && end > last->vm_start) {
-                int error = split_vma(mm, last, end, 1);
+                int error = __split_vma(mm, last, end, 1);
                if (error)
                        return error;
        }
@@ -2003,20 +2017,14 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
        if (!len)
                return addr;
-        if ((addr + len) > TASK_SIZE || (addr + len) < addr)
-                return -EINVAL;
-        if (is_hugepage_only_range(mm, addr, len))
-                return -EINVAL;
        error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
        if (error)
                return error;
        flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
-        error = arch_mmap_check(addr, len, flags);
+        error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
-        if (error)
+        if (error & ~PAGE_MASK)
                return error;
        /*
diff --git a/mm/mremap.c b/mm/mremap.c
index 97bff2547719..845190898d59 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -261,6 +261,137 @@ static unsigned long move_vma(struct vm_area_struct *vma,
        return new_addr;
 }
+/*
+ * vma_to_resize - find and validate the vma that is about to be resized
+ * @addr:    start address of the range being remapped
+ * @old_len: current length of the range
+ * @new_len: requested length of the range
+ * @p:       written with the number of pages that passed the
+ *           security_vm_enough_memory() check when the vma has VM_ACCOUNT
+ *           set; left untouched on every other path
+ *
+ * Looks the vma up in current->mm.  Returns the vma on success, or an
+ * ERR_PTR():
+ *   -EFAULT  no vma contains @addr, @old_len runs past the end of the vma,
+ *            growth was requested on a VM_DONTEXPAND/VM_PFNMAP vma, or the
+ *            security_vm_enough_memory() check failed
+ *   -EINVAL  the vma is a hugetlb mapping
+ *   -EAGAIN  the vma is VM_LOCKED and the growth would exceed
+ *            RLIMIT_MEMLOCK while the caller lacks CAP_IPC_LOCK
+ *   -ENOMEM  may_expand_vm() refused the growth
+ */
+static struct vm_area_struct *vma_to_resize(unsigned long addr,
+        unsigned long old_len, unsigned long new_len, unsigned long *p)
+{
+        struct mm_struct *mm = current->mm;
+        struct vm_area_struct *vma = find_vma(mm, addr);
+        /* a vma that starts above addr does not cover addr: treat as unmapped */
+        if (!vma || vma->vm_start > addr)
+                goto Efault;
+        if (is_vm_hugetlb_page(vma))
+                goto Einval;
+        /* We can't remap across vm area boundaries */
+        if (old_len > vma->vm_end - addr)
+                goto Efault;
+        /* these vmas may shrink or keep their size, but never grow */
+        if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
+                if (new_len > old_len)
+                        goto Efault;
+        }
+        if (vma->vm_flags & VM_LOCKED) {
+                unsigned long locked, lock_limit;
+                /* locked_vm is shifted by PAGE_SHIFT so the byte growth can be added */
+                locked = mm->locked_vm << PAGE_SHIFT;
+                lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+                locked += new_len - old_len;
+                if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+                        goto Eagain;
+        }
+        if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT))
+                goto Enomem;
+        if (vma->vm_flags & VM_ACCOUNT) {
+                unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
+                /*
+                 * NOTE(review): an accounting failure is reported as -EFAULT
+                 * here; -ENOMEM may be the more natural code - confirm
+                 * against the documented mremap() errors before changing.
+                 */
+                if (security_vm_enough_memory(charged))
+                        goto Efault;
+                /* report the charge so the caller can undo it on later failure */
+                *p = charged;
+        }
+        return vma;
+Efault: /* very odd choice for most of the cases, but... */
+        return ERR_PTR(-EFAULT);
+Einval:
+        return ERR_PTR(-EINVAL);
+Enomem:
+        return ERR_PTR(-ENOMEM);
+Eagain:
+        return ERR_PTR(-EAGAIN);
+}
+/*
+ * mremap_to - move a mapping to the caller-chosen address @new_addr
+ * @addr:     start of the existing range
+ * @old_len:  length of the existing range
+ * @new_addr: requested destination address
+ * @new_len:  requested length at the destination
+ *
+ * Sequence: reject an unaligned or out-of-range destination and any overlap
+ * between the old and new ranges, run the security_file_mmap() check, unmap
+ * whatever currently occupies the destination, trim the source if it is
+ * being shrunk, validate the source vma with vma_to_resize(), confirm the
+ * destination with get_unmapped_area(MAP_FIXED), then call move_vma().
+ *
+ * Returns the result of move_vma() when that result is page aligned.  Any
+ * value with bits set in ~PAGE_MASK is treated as an error code; the early
+ * checks fail with -EINVAL.
+ */
+static unsigned long mremap_to(unsigned long addr,
+        unsigned long old_len, unsigned long new_addr,
+        unsigned long new_len)
+{
+        struct mm_struct *mm = current->mm;
+        struct vm_area_struct *vma;
+        unsigned long ret = -EINVAL;
+        unsigned long charged = 0;
+        unsigned long map_flags;
+        if (new_addr & ~PAGE_MASK)
+                goto out;
+        /* written as a subtraction so new_addr + new_len cannot wrap */
+        if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
+                goto out;
+        /* Check if the location we're moving into overlaps the
+         * old location at all, and fail if it does.
+         */
+        if ((new_addr <= addr) && (new_addr+new_len) > addr)
+                goto out;
+        if ((addr <= new_addr) && (addr+old_len) > new_addr)
+                goto out;
+        ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
+        if (ret)
+                goto out;
+        /* clear out the destination range before moving into it */
+        ret = do_munmap(mm, new_addr, new_len);
+        if (ret)
+                goto out;
+        /* shrinking (or same size): drop the tail of the source first */
+        if (old_len >= new_len) {
+                ret = do_munmap(mm, addr+new_len, old_len - new_len);
+                if (ret && old_len != new_len)
+                        goto out;
+                old_len = new_len;
+        }
+        vma = vma_to_resize(addr, old_len, new_len, &charged);
+        if (IS_ERR(vma)) {
+                ret = PTR_ERR(vma);
+                goto out;
+        }
+        map_flags = MAP_FIXED;
+        if (vma->vm_flags & VM_MAYSHARE)
+                map_flags |= MAP_SHARED;
+        /* the pgoff passed is that of addr within the vma, not of the vma start */
+        ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
+                                ((addr - vma->vm_start) >> PAGE_SHIFT),
+                                map_flags);
+        if (ret & ~PAGE_MASK)
+                goto out1;
+        ret = move_vma(vma, addr, old_len, new_len, new_addr);
+        /* a page-aligned result is success: keep the charge and return it */
+        if (!(ret & ~PAGE_MASK))
+                goto out;
+out1:
+        /* failure after vma_to_resize() succeeded: give back what it charged */
+        vm_unacct_memory(charged);
+out:
+        return ret;
+}
+/*
+ * vma_expandable - can @vma be grown in place by @delta bytes?
+ *
+ * Returns 1 when vm_end + @delta does not wrap, the enlarged range does not
+ * run into the next vma, and get_unmapped_area(MAP_FIXED) returns a
+ * page-aligned value for the enlarged range; returns 0 otherwise.
+ */
+static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
+{
+        unsigned long end = vma->vm_end + delta;
+        if (end < vma->vm_end) /* overflow */
+                return 0;
+        if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */
+                return 0;
+        /* bits set in ~PAGE_MASK mark an error return, not an address */
+        if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
+                              0, MAP_FIXED) & ~PAGE_MASK)
+                return 0;
+        return 1;
+}
 /*
 * Expand (or shrink) an existing mapping, potentially moving it at the
 * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
@@ -294,32 +425,10 @@ unsigned long do_mremap(unsigned long addr,
        if (!new_len)
                goto out;
-        /* new_addr is only valid if MREMAP_FIXED is specified */
        if (flags & MREMAP_FIXED) {
-                if (new_addr & ~PAGE_MASK)
+                if (flags & MREMAP_MAYMOVE)
-                        goto out;
+                        ret = mremap_to(addr, old_len, new_addr, new_len);
-                if (!(flags & MREMAP_MAYMOVE))
+                goto out;
-                        goto out;
-                if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
-                        goto out;
-                /* Check if the location we're moving into overlaps the
-                 * old location at all, and fail if it does.
-                 */
-                if ((new_addr <= addr) && (new_addr+new_len) > addr)
-                        goto out;
-                if ((addr <= new_addr) && (addr+old_len) > new_addr)
-                        goto out;
-                ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
-                if (ret)
-                        goto out;
-                ret = do_munmap(mm, new_addr, new_len);
-                if (ret)
-                        goto out;
        }
        /*
@@ -332,60 +441,23 @@ unsigned long do_mremap(unsigned long addr,
                if (ret && old_len != new_len)
                        goto out;
                ret = addr;
-                if (!(flags & MREMAP_FIXED) || (new_addr == addr))
+                goto out;
-                        goto out;
-                old_len = new_len;
        }
        /*
-         * Ok, we need to grow..  or relocate.
+         * Ok, we need to grow..
         */
-        ret = -EFAULT;
+        vma = vma_to_resize(addr, old_len, new_len, &charged);
-        vma = find_vma(mm, addr);
+        if (IS_ERR(vma)) {
-        if (!vma || vma->vm_start > addr)
+                ret = PTR_ERR(vma);
-                goto out;
-        if (is_vm_hugetlb_page(vma)) {
-                ret = -EINVAL;
-                goto out;
-        }
-        /* We can't remap across vm area boundaries */
-        if (old_len > vma->vm_end - addr)
-                goto out;
-        if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
-                if (new_len > old_len)
-                        goto out;
-        }
-        if (vma->vm_flags & VM_LOCKED) {
-                unsigned long locked, lock_limit;
-                locked = mm->locked_vm << PAGE_SHIFT;
-                lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
-                locked += new_len - old_len;
-                ret = -EAGAIN;
-                if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-                        goto out;
-        }
-        if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) {
-                ret = -ENOMEM;
                goto out;
        }
-        if (vma->vm_flags & VM_ACCOUNT) {
-                charged = (new_len - old_len) >> PAGE_SHIFT;
-                if (security_vm_enough_memory(charged))
-                        goto out_nc;
-        }
        /* old_len exactly to the end of the area..
-         * And we're not relocating the area.
         */
-        if (old_len == vma->vm_end - addr &&
+        if (old_len == vma->vm_end - addr) {
-            !((flags & MREMAP_FIXED) && (addr != new_addr)) &&
-            (old_len != new_len || !(flags & MREMAP_MAYMOVE))) {
-                unsigned long max_addr = TASK_SIZE;
-                if (vma->vm_next)
-                        max_addr = vma->vm_next->vm_start;
                /* can we just expand the current mapping? */
-                if (max_addr - addr >= new_len) {
+                if (vma_expandable(vma, new_len - old_len)) {
                        int pages = (new_len - old_len) >> PAGE_SHIFT;
                        vma_adjust(vma, vma->vm_start,
@@ -409,28 +481,27 @@ unsigned long do_mremap(unsigned long addr,
         */
        ret = -ENOMEM;
        if (flags & MREMAP_MAYMOVE) {
-                if (!(flags & MREMAP_FIXED)) {
+                unsigned long map_flags = 0;
-                        unsigned long map_flags = 0;
+                if (vma->vm_flags & VM_MAYSHARE)
-                        if (vma->vm_flags & VM_MAYSHARE)
+                        map_flags |= MAP_SHARED;
-                                map_flags |= MAP_SHARED;
+                new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
-                        new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
+                                        vma->vm_pgoff +
-                                                vma->vm_pgoff, map_flags);
+                                        ((addr - vma->vm_start) >> PAGE_SHIFT),
-                        if (new_addr & ~PAGE_MASK) {
+                                        map_flags);
-                                ret = new_addr;
+                if (new_addr & ~PAGE_MASK) {
-                                goto out;
+                        ret = new_addr;
-                        }
+                        goto out;
-                        ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
-                        if (ret)
-                                goto out;
                }
+                ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
+                if (ret)
+                        goto out;
                ret = move_vma(vma, addr, old_len, new_len, new_addr);
        }
 out:
        if (ret & ~PAGE_MASK)
                vm_unacct_memory(charged);
-out_nc:
        return ret;
 }
diff --git a/mm/nommu.c b/mm/nommu.c
index 9876fa0c3ad3..8687973462bb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1143,9 +1143,6 @@ static int do_mmap_private(struct vm_area_struct *vma,
                if (ret < rlen)
                        memset(base + ret, 0, rlen - ret);
-        } else {
-                /* if it's an anonymous mapping, then just clear it */
-                memset(base, 0, rlen);
        }
        return 0;
@@ -1343,6 +1340,11 @@ unsigned long do_mmap_pgoff(struct file *file,
                goto error_just_free;
        add_nommu_region(region);
+        /* clear anonymous mappings that don't ask for uninitialized data */
+        if (!vma->vm_file && !(flags & MAP_UNINITIALIZED))
+                memset((void *)region->vm_start, 0,
+                       region->vm_end - region->vm_start);
        /* okay... we have a mapping; now we have to register it */
        result = vma->vm_start;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ea2147dabba6..f52481b1c1e5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -196,27 +196,46 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 /*
 * Determine the type of allocation constraint.
 */
-static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
-                                                    gfp_t gfp_mask)
-{
 #ifdef CONFIG_NUMA
+static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
+                                    gfp_t gfp_mask, nodemask_t *nodemask)
+{
        struct zone *zone;
        struct zoneref *z;
        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
-        nodemask_t nodes = node_states[N_HIGH_MEMORY];
-        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+        /*
-                if (cpuset_zone_allowed_softwall(zone, gfp_mask))
+         * Reach here only when __GFP_NOFAIL is used. So, we should avoid
-                        node_clear(zone_to_nid(zone), nodes);
+         * killing current. We have to kill a random task in this case.
-                else
+         * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
-                        return CONSTRAINT_CPUSET;
+         */
+        if (gfp_mask & __GFP_THISNODE)
+                return CONSTRAINT_NONE;
-        if (!nodes_empty(nodes))
+        /*
+         * The nodemask here is a nodemask passed to alloc_pages(). Now,
+         * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
+         * feature. mempolicy is the only user of nodemask here.
+         * Check whether mempolicy's nodemask contains all N_HIGH_MEMORY nodes.
+         */
+        if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask))
                return CONSTRAINT_MEMORY_POLICY;
-#endif
+        /*
+         * Check whether this allocation failure is caused by cpuset's wall
+         * function.
+         */
+        for_each_zone_zonelist_nodemask(zone, z, zonelist,
+                        high_zoneidx, nodemask)
+                if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
+                        return CONSTRAINT_CPUSET;
+        return CONSTRAINT_NONE;
+}
+#else
+static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
+                                gfp_t gfp_mask, nodemask_t *nodemask)
+{
        return CONSTRAINT_NONE;
 }
+#endif
 /*
 * Simple selection loop. We chose the process with the highest
@@ -337,6 +356,24 @@ static void dump_tasks(const struct mem_cgroup *mem)
        } while_each_thread(g, p);
 }
+/*
+ * dump_header - print the common out-of-memory report
+ * @p:        task handed to mem_cgroup_print_oom_info()
+ * @gfp_mask: gfp mask of the allocation, printed in the first line
+ * @order:    order of the allocation, printed in the first line
+ * @mem:      memory cgroup handed to mem_cgroup_print_oom_info() and
+ *            dump_tasks()
+ *
+ * Prints a warning naming current and its oom_adj, current's allowed cpuset
+ * mems (under task_lock), a stack dump, the memory cgroup OOM info, the
+ * show_mem() summary and, when sysctl_oom_dump_tasks is set, the task list.
+ * The panic paths in this patch call it with @p and @mem both NULL.
+ */
+static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
+                                                        struct mem_cgroup *mem)
+{
+        pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
+                "oom_adj=%d\n",
+                current->comm, gfp_mask, order, current->signal->oom_adj);
+        task_lock(current);
+        cpuset_print_task_mems_allowed(current);
+        task_unlock(current);
+        dump_stack();
+        mem_cgroup_print_oom_info(mem, p);
+        show_mem();
+        if (sysctl_oom_dump_tasks)
+                dump_tasks(mem);
+}
+#define K(x) ((x) << (PAGE_SHIFT-10))
 /*
 * Send SIGKILL to the selected  process irrespective of  CAP_SYS_RAW_IO
 * flag though it's unlikely that  we select a process with CAP_SYS_RAW_IO
@@ -350,15 +387,23 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
                return;
        }
+        task_lock(p);
        if (!p->mm) {
                WARN_ON(1);
-                printk(KERN_WARNING "tried to kill an mm-less task!\n");
+                printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n",
+                        task_pid_nr(p), p->comm);
+                task_unlock(p);
                return;
        }
        if (verbose)
-                printk(KERN_ERR "Killed process %d (%s)\n",
+                printk(KERN_ERR "Killed process %d (%s) "
-                                task_pid_nr(p), p->comm);
+                       "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+                       task_pid_nr(p), p->comm,
+                       K(p->mm->total_vm),
+                       K(get_mm_counter(p->mm, anon_rss)),
+                       K(get_mm_counter(p->mm, file_rss)));
+        task_unlock(p);
        /*
         * We give our sacrificial lamb high priority and access to
@@ -395,20 +440,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 {
        struct task_struct *c;
-        if (printk_ratelimit()) {
+        if (printk_ratelimit())
-                printk(KERN_WARNING "%s invoked oom-killer: "
+                dump_header(p, gfp_mask, order, mem);
-                        "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
-                        current->comm, gfp_mask, order,
-                        current->signal->oom_adj);
-                task_lock(current);
-                cpuset_print_task_mems_allowed(current);
-                task_unlock(current);
-                dump_stack();
-                mem_cgroup_print_oom_info(mem, current);
-                show_mem();
-                if (sysctl_oom_dump_tasks)
-                        dump_tasks(mem);
-        }
        /*
         * If the task is already exiting, don't alarm the sysadmin or kill
@@ -544,6 +577,7 @@ retry:
        /* Found nothing?!?! Either we hang forever, or we panic. */
        if (!p) {
                read_unlock(&tasklist_lock);
+                dump_header(NULL, gfp_mask, order, NULL);
                panic("Out of memory and no killable processes...\n");
        }
@@ -599,7 +633,8 @@ rest_and_return:
 * OR try to be smart about which process to kill. Note that we
 * don't have to be perfect here, we just have to be good.
 */
-void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
+void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+                int order, nodemask_t *nodemask)
 {
        unsigned long freed = 0;
        enum oom_constraint constraint;
@@ -609,14 +644,16 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
                /* Got some memory back in the last second. */
                return;
-        if (sysctl_panic_on_oom == 2)
+        if (sysctl_panic_on_oom == 2) {
+                dump_header(NULL, gfp_mask, order, NULL);
                panic("out of memory. Compulsory panic_on_oom is selected.\n");
+        }
        /*
         * Check if there were limitations on the allocation (only relevant for
         * NUMA) that may require different handling.
         */
-        constraint = constrained_alloc(zonelist, gfp_mask);
+        constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
        read_lock(&tasklist_lock);
        switch (constraint) {
@@ -626,8 +663,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
                break;
        case CONSTRAINT_NONE:
-                if (sysctl_panic_on_oom)
+                if (sysctl_panic_on_oom) {
+                        dump_header(NULL, gfp_mask, order, NULL);
                        panic("out of memory. panic_on_oom is selected\n");
+                }
                /* Fall-through */
        case CONSTRAINT_CPUSET:
                __out_of_memory(gfp_mask, order);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2c5d79236ead..0b19943ecf8b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -821,7 +821,6 @@ int write_cache_pages(struct address_space *mapping,
                      struct writeback_control *wbc, writepage_t writepage,
                      void *data)
 {
-        struct backing_dev_info *bdi = mapping->backing_dev_info;
        int ret = 0;
        int done = 0;
        struct pagevec pvec;
@@ -834,11 +833,6 @@ int write_cache_pages(struct address_space *mapping,
        int range_whole = 0;
        long nr_to_write = wbc->nr_to_write;
-        if (wbc->nonblocking && bdi_write_congested(bdi)) {
-                wbc->encountered_congestion = 1;
-                return 0;
-        }
        pagevec_init(&pvec, 0);
        if (wbc->range_cyclic) {
                writeback_index = mapping->writeback_index; /* prev offset */
@@ -957,12 +951,6 @@ continue_unlock:
                                        break;
                                }
                        }
-                        if (wbc->nonblocking && bdi_write_congested(bdi)) {
-                                wbc->encountered_congestion = 1;
-                                done = 1;
-                                break;
-                        }
                }
                pagevec_release(&pvec);
                cond_resched();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cdcedf661616..4e9f5cc5fb59 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
 #include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
+#include <linux/memory.h>
 #include <trace/events/kmem.h>
 #include <asm/tlbflush.h>
@@ -486,7 +487,6 @@ static inline void __free_one_page(struct page *page,
        zone->free_area[order].nr_free++;
 }
-#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
 /*
 * free_page_mlock() -- clean up attempts to free and mlocked() page.
 * Page should not be on lru, so no need to fix that up.
@@ -497,9 +497,6 @@ static inline void free_page_mlock(struct page *page)
        __dec_zone_page_state(page, NR_MLOCK);
        __count_vm_event(UNEVICTABLE_MLOCKFREED);
 }
-#else
-static void free_page_mlock(struct page *page) { }
-#endif
 static inline int free_pages_check(struct page *page)
 {
@@ -1658,12 +1655,22 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
        if (page)
                goto out;
-        /* The OOM killer will not help higher order allocs */
+        if (!(gfp_mask & __GFP_NOFAIL)) {
-        if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
+                /* The OOM killer will not help higher order allocs */
-                goto out;
+                if (order > PAGE_ALLOC_COSTLY_ORDER)
+                        goto out;
+                /*
+                 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
+                 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
+                 * The caller should handle page allocation failure by itself if
+                 * it specifies __GFP_THISNODE.
+                 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
+                 */
+                if (gfp_mask & __GFP_THISNODE)
+                        goto out;
+        }
        /* Exhausted what can be done so it's blamo time */
-        out_of_memory(zonelist, gfp_mask, order);
+        out_of_memory(zonelist, gfp_mask, order, nodemask);
 out:
        clear_zonelist_oom(zonelist, gfp_mask);
@@ -1769,7 +1776,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
                 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
                 */
                alloc_flags &= ~ALLOC_CPUSET;
-        } else if (unlikely(rt_task(p)))
+        } else if (unlikely(rt_task(p)) && !in_interrupt())
                alloc_flags |= ALLOC_HARDER;
        if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
@@ -1817,9 +1824,9 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
        if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
                goto nopage;
+restart:
        wake_all_kswapd(order, zonelist, high_zoneidx);
-restart:
        /*
         * OK, we're below the kswapd watermark and have kicked background
         * reclaim. Now things get more complex, so set up alloc_flags according
@@ -2395,13 +2402,14 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
 {
        char saved_string[NUMA_ZONELIST_ORDER_LEN];
        int ret;
+        static DEFINE_MUTEX(zl_order_mutex);
+        mutex_lock(&zl_order_mutex);
        if (write)
-                strncpy(saved_string, (char*)table->data,
+                strcpy(saved_string, (char*)table->data);
-                        NUMA_ZONELIST_ORDER_LEN);
        ret = proc_dostring(table, write, buffer, length, ppos);
        if (ret)
-                return ret;
+                goto out;
        if (write) {
                int oldval = user_zonelist_order;
                if (__parse_numa_zonelist_order((char*)table->data)) {
@@ -2414,7 +2422,9 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
                } else if (oldval != user_zonelist_order)
                        build_all_zonelists();
        }
-        return 0;
+out:
+        mutex_unlock(&zl_order_mutex);
+        return ret;
 }
@@ -3127,7 +3137,7 @@ static int __cpuinit process_zones(int cpu)
                if (percpu_pagelist_fraction)
                        setup_pagelist_highmark(zone_pcp(zone, cpu),
-                                (zone->present_pages / percpu_pagelist_fraction));
+                            (zone->present_pages / percpu_pagelist_fraction));
        }
        return 0;
@@ -3573,7 +3583,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
 * then all holes in the requested range will be accounted for.
 */
-static unsigned long __meminit __absent_pages_in_range(int nid,
+unsigned long __meminit __absent_pages_in_range(int nid,
                                unsigned long range_start_pfn,
                                unsigned long range_end_pfn)
 {
@@ -4102,7 +4112,7 @@ static int __init cmp_node_active_region(const void *a, const void *b)
 }
 /* sort the node_map by start_pfn */
-static void __init sort_node_map(void)
+void __init sort_node_map(void)
 {
        sort(early_node_map, (size_t)nr_nodemap_entries,
                        sizeof(struct node_active_region),
@@ -5002,23 +5012,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
 int set_migratetype_isolate(struct page *page)
 {
         struct zone *zone;
-        unsigned long flags;
+        struct page *curr_page;
+        unsigned long flags, pfn, iter;
+        unsigned long immobile = 0;
+        struct memory_isolate_notify arg;
+        int notifier_ret;
         int ret = -EBUSY;
         int zone_idx;
         zone = page_zone(page);
         zone_idx = zone_idx(zone);
         spin_lock_irqsave(&zone->lock, flags);
+        if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
+            zone_idx == ZONE_MOVABLE) {
+                ret = 0;
+                goto out;
+        }
+        pfn = page_to_pfn(page);
+        arg.start_pfn = pfn;
+        arg.nr_pages = pageblock_nr_pages;
+        arg.pages_found = 0;
         /*
-         * In future, more migrate types will be able to be isolation target.
+         * It may be possible to isolate a pageblock even if the
+         * migratetype is not MIGRATE_MOVABLE. The memory isolation
+         * notifier chain is used by balloon drivers to return the
+         * number of pages in a range that are held by the balloon
+         * driver to shrink memory. If all the pages are accounted for
+         * by balloons, are free, or on the LRU, isolation can continue.
+         * Later, for example, when memory hotplug notifier runs, these
+         * pages reported as "can be isolated" should be isolated(freed)
+         * by the balloon driver through the memory notifier chain.
          */
-        if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
+        notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
-            zone_idx != ZONE_MOVABLE)
+        notifier_ret = notifier_to_errno(notifier_ret);
+        if (notifier_ret || !arg.pages_found)
                 goto out;
-        set_pageblock_migratetype(page, MIGRATE_ISOLATE);
-        move_freepages_block(zone, page, MIGRATE_ISOLATE);
+        for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
-        ret = 0;
+                if (!pfn_valid_within(iter))
+                        continue;
+                curr_page = pfn_to_page(iter);
+                if (!page_count(curr_page) || PageLRU(curr_page))
+                        continue;
+                immobile++;
+        }
+        if (arg.pages_found == immobile)
+                ret = 0;
 out:
+        if (!ret) {
+                set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+                move_freepages_block(zone, page, MIGRATE_ISOLATE);
+        }
         spin_unlock_irqrestore(&zone->lock, flags);
         if (!ret)
                 drain_all_pages();
@@ -5085,3 +5137,24 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
        spin_unlock_irqrestore(&zone->lock, flags);
 }
 #endif
+#ifdef CONFIG_MEMORY_FAILURE
+bool is_free_buddy_page(struct page *page)
+{
+        struct zone *zone = page_zone(page);
+        unsigned long pfn = page_to_pfn(page);
+        unsigned long flags;
+        int order;
+        spin_lock_irqsave(&zone->lock, flags);
+        for (order = 0; order < MAX_ORDER; order++) {
+                struct page *page_head = page - (pfn & ((1 << order) - 1));
+                if (PageBuddy(page_head) && page_order(page_head) >= order)
+                        break;
+        }
+        spin_unlock_irqrestore(&zone->lock, flags);
+        return order < MAX_ORDER;
+}
+#endif
diff --git a/mm/page_io.c b/mm/page_io.c
index c6f3e5071de3..a19af956ee1b 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -19,20 +19,15 @@
 #include <linux/writeback.h>
 #include <asm/pgtable.h>
-static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
+static struct bio *get_swap_bio(gfp_t gfp_flags,
                                struct page *page, bio_end_io_t end_io)
 {
        struct bio *bio;
        bio = bio_alloc(gfp_flags, 1);
        if (bio) {
-                struct swap_info_struct *sis;
+                bio->bi_sector = map_swap_page(page, &bio->bi_bdev);
-                swp_entry_t entry = { .val = index, };
+                bio->bi_sector <<= PAGE_SHIFT - 9;
-                sis = get_swap_info_struct(swp_type(entry));
-                bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
-                                        (PAGE_SIZE >> 9);
-                bio->bi_bdev = sis->bdev;
                bio->bi_io_vec[0].bv_page = page;
                bio->bi_io_vec[0].bv_len = PAGE_SIZE;
                bio->bi_io_vec[0].bv_offset = 0;
@@ -102,8 +97,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
                unlock_page(page);
                goto out;
        }
-        bio = get_swap_bio(GFP_NOIO, page_private(page), page,
+        bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
-                                end_swap_bio_write);
        if (bio == NULL) {
                set_page_dirty(page);
                unlock_page(page);
@@ -127,8 +121,7 @@ int swap_readpage(struct page *page)
        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(PageUptodate(page));
-        bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
+        bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
-                                end_swap_bio_read);
        if (bio == NULL) {
                unlock_page(page);
                ret = -ENOMEM;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index d5878bed7841..7b47a57b6646 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -1,6 +1,7 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/sched.h>
+#include <linux/hugetlb.h>
 static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
                          struct mm_walk *walk)
@@ -107,6 +108,7 @@ int walk_page_range(unsigned long addr, unsigned long end,
        pgd_t *pgd;
        unsigned long next;
        int err = 0;
+        struct vm_area_struct *vma;
        if (addr >= end)
                return err;
@@ -117,11 +119,43 @@ int walk_page_range(unsigned long addr, unsigned long end,
         pgd = pgd_offset(walk->mm, addr);
         do {
                 next = pgd_addr_end(addr, end);
+                /*
+                 * handle hugetlb vma individually because pagetable walk for
+                 * the hugetlb page is dependent on the architecture and
+                 * we can't handle it in the same manner as non-huge pages.
+                 */
+                vma = find_vma(walk->mm, addr);
+#ifdef CONFIG_HUGETLB_PAGE
+                if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) {
+                        pte_t *pte;
+                        struct hstate *hs;
+                        if (vma->vm_end < next)
+                                next = vma->vm_end;
+                        hs = hstate_vma(vma);
+                        pte = huge_pte_offset(walk->mm,
+                                              addr & huge_page_mask(hs));
+                        if (pte && !huge_pte_none(huge_ptep_get(pte))
+                            && walk->hugetlb_entry)
+                                err = walk->hugetlb_entry(pte, addr,
+                                                          next, walk);
+                        if (err)
+                                break;
+                        /*
+                         * next may or may not have crossed a pgd boundary,
+                         * so recompute pgd from the address.
+                         */
+                        pgd = pgd_offset(walk->mm, next);
+                        continue;
+                }
+#endif
                 if (pgd_none_or_clear_bad(pgd)) {
                         if (walk->pte_hole)
                                 err = walk->pte_hole(addr, next, walk);
                         if (err)
                                 break;
+                        pgd++;
                         continue;
                 }
                 if (walk->pgd_entry)
@@ -131,7 +165,8 @@ int walk_page_range(unsigned long addr, unsigned long end,
                         err = walk_pud_range(pgd, addr, next, walk);
                 if (err)
                         break;
-        } while (pgd++, addr = next, addr != end);
+                pgd++;
+        } while (addr = next, addr != end);
         return err;
 }
 }
diff --git a/mm/percpu.c b/mm/percpu.c
index d90797160c2a..442010cc91c6 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -46,8 +46,6 @@
 *
 * To use this allocator, arch code should do the followings.
 *
- * - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
- *
 * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
 *   regular address to percpu pointer and back if they need to be
 *   different from the default
@@ -74,6 +72,7 @@
 #include <asm/cacheflush.h>
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
+#include <asm/io.h>
 #define PCPU_SLOT_BASE_SHIFT            5       /* 1-31 shares the same slot */
 #define PCPU_DFL_MAP_ALLOC              16      /* start a map with 16 ents */
@@ -355,62 +354,86 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 }
 /**
- * pcpu_extend_area_map - extend area map for allocation
+ * pcpu_need_to_extend - determine whether chunk area map needs to be extended
- * @chunk: target chunk
+ * @chunk: chunk of interest
 *
- * Extend area map of @chunk so that it can accomodate an allocation.
+ * Determine whether area map of @chunk needs to be extended to
- * A single allocation can split an area into three areas, so this
+ * accommodate a new allocation.
- * function makes sure that @chunk->map has at least two extra slots.
 *
 * CONTEXT:
- * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
+ * pcpu_lock.
- * if area map is extended.
 *
 * RETURNS:
- * 0 if noop, 1 if successfully extended, -errno on failure.
+ * New target map allocation length if extension is necessary, 0
+ * otherwise.
 */
-static int pcpu_extend_area_map(struct pcpu_chunk *chunk, unsigned long *flags)
+static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
 {
        int new_alloc;
-        int *new;
-        size_t size;
-        /* has enough? */
        if (chunk->map_alloc >= chunk->map_used + 2)
                return 0;
-        spin_unlock_irqrestore(&pcpu_lock, *flags);
        new_alloc = PCPU_DFL_MAP_ALLOC;
        while (new_alloc < chunk->map_used + 2)
                new_alloc *= 2;
-        new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
+        return new_alloc;
-        if (!new) {
+}
-                spin_lock_irqsave(&pcpu_lock, *flags);
+/**
+ * pcpu_extend_area_map - extend area map of a chunk
+ * @chunk: chunk of interest
+ * @new_alloc: new target allocation length of the area map
+ *
+ * Extend area map of @chunk to have @new_alloc entries.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.  Grabs and releases pcpu_lock.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
+{
+        int *old = NULL, *new = NULL;
+        size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
+        unsigned long flags;
+        new = pcpu_mem_alloc(new_size);
+        if (!new)
                return -ENOMEM;
-        }
-        /*
+        /* acquire pcpu_lock and switch to new area map */
-         * Acquire pcpu_lock and switch to new area map.  Only free
+        spin_lock_irqsave(&pcpu_lock, flags);
-         * could have happened inbetween, so map_used couldn't have
-         * grown.
-         */
-        spin_lock_irqsave(&pcpu_lock, *flags);
-        BUG_ON(new_alloc < chunk->map_used + 2);
-        size = chunk->map_alloc * sizeof(chunk->map[0]);
+        if (new_alloc <= chunk->map_alloc)
-        memcpy(new, chunk->map, size);
+                goto out_unlock;
+        old_size = chunk->map_alloc * sizeof(chunk->map[0]);
+        memcpy(new, chunk->map, old_size);
        /*
         * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
         * one of the first chunks and still using static map.
         */
        if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
-                pcpu_mem_free(chunk->map, size);
+                old = chunk->map;
        chunk->map_alloc = new_alloc;
        chunk->map = new;
+        new = NULL;
+out_unlock:
+        spin_unlock_irqrestore(&pcpu_lock, flags);
+        /*
+         * pcpu_mem_free() might end up calling vfree() which uses
+         * IRQ-unsafe lock and thus can't be called under pcpu_lock.
+         */
+        pcpu_mem_free(old, old_size);
+        pcpu_mem_free(new, new_size);
        return 0;
 }
@@ -1049,7 +1072,7 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
        static int warn_limit = 10;
        struct pcpu_chunk *chunk;
        const char *err;
-        int slot, off;
+        int slot, off, new_alloc;
        unsigned long flags;
        if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
@@ -1064,14 +1087,25 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
        /* serve reserved allocations from the reserved chunk if available */
        if (reserved && pcpu_reserved_chunk) {
                chunk = pcpu_reserved_chunk;
-                if (size > chunk->contig_hint ||
-                    pcpu_extend_area_map(chunk, &flags) < 0) {
+                if (size > chunk->contig_hint) {
-                        err = "failed to extend area map of reserved chunk";
+                        err = "alloc from reserved chunk failed";
                        goto fail_unlock;
                }
+                while ((new_alloc = pcpu_need_to_extend(chunk))) {
+                        spin_unlock_irqrestore(&pcpu_lock, flags);
+                        if (pcpu_extend_area_map(chunk, new_alloc) < 0) {
+                                err = "failed to extend area map of reserved chunk";
+                                goto fail_unlock_mutex;
+                        }
+                        spin_lock_irqsave(&pcpu_lock, flags);
+                }
                off = pcpu_alloc_area(chunk, size, align);
                if (off >= 0)
                        goto area_found;
                err = "alloc from reserved chunk failed";
                goto fail_unlock;
        }
@@ -1083,14 +1117,20 @@ restart:
                        if (size > chunk->contig_hint)
                                continue;
-                        switch (pcpu_extend_area_map(chunk, &flags)) {
+                        new_alloc = pcpu_need_to_extend(chunk);
-                        case 0:
+                        if (new_alloc) {
-                                break;
+                                spin_unlock_irqrestore(&pcpu_lock, flags);
-                        case 1:
+                                if (pcpu_extend_area_map(chunk,
-                                goto restart;   /* pcpu_lock dropped, restart */
+                                                         new_alloc) < 0) {
-                        default:
+                                        err = "failed to extend area map";
-                                err = "failed to extend area map";
+                                        goto fail_unlock_mutex;
-                                goto fail_unlock;
+                                }
+                                spin_lock_irqsave(&pcpu_lock, flags);
+                                /*
+                                 * pcpu_lock has been dropped, need to
+                                 * restart cpu_slot list walking.
+                                 */
+                                goto restart;
                        }
                        off = pcpu_alloc_area(chunk, size, align);
@@ -1261,6 +1301,27 @@ void free_percpu(void *ptr)
 }
 EXPORT_SYMBOL_GPL(free_percpu);
+/**
+ * per_cpu_ptr_to_phys - convert translated percpu address to physical address
+ * @addr: the address to be converted to physical address
+ *
+ * Given @addr which is dereferenceable address obtained via one of
+ * percpu access macros, this function translates it into its physical
+ * address.  The caller is responsible for ensuring @addr stays valid
+ * until this function finishes.
+ *
+ * RETURNS:
+ * The physical address for @addr.
+ */
+phys_addr_t per_cpu_ptr_to_phys(void *addr)
+{
+        if ((unsigned long)addr < VMALLOC_START ||
+                        (unsigned long)addr >= VMALLOC_END)
+                return __pa(addr);
+        else
+                return page_to_phys(vmalloc_to_page(addr));
+}
 static inline size_t pcpu_calc_fc_sizes(size_t static_size,
                                        size_t reserved_size,
                                        ssize_t *dyn_sizep)
diff --git a/mm/readahead.c b/mm/readahead.c
index aa1aa2345235..033bc135a41f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -547,5 +547,17 @@ page_cache_async_readahead(struct address_space *mapping,
        /* do read-ahead */
        ondemand_readahead(mapping, ra, filp, true, offset, req_size);
+#ifdef CONFIG_BLOCK
+        /*
+         * Normally the current page is !uptodate and lock_page() will be
+         * immediately called to implicitly unplug the device. However this
+         * is not always true for RAID conifgurations, where data arrives
+         * not strictly in their submission order. In this case we need to
+         * explicitly kick off the IO.
+         */
+        if (PageUptodate(page))
+                blk_run_backing_dev(mapping->backing_dev_info, NULL);
+#endif
 }
 EXPORT_SYMBOL_GPL(page_cache_async_readahead);
diff --git a/mm/rmap.c b/mm/rmap.c
index dd43373a483f..278cd277bdec 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,6 +49,7 @@
 #include <linux/swapops.h>
 #include <linux/slab.h>
 #include <linux/init.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/rcupdate.h>
 #include <linux/module.h>
@@ -67,7 +68,7 @@ static inline struct anon_vma *anon_vma_alloc(void)
        return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
 }
-static inline void anon_vma_free(struct anon_vma *anon_vma)
+void anon_vma_free(struct anon_vma *anon_vma)
 {
        kmem_cache_free(anon_vma_cachep, anon_vma);
 }
@@ -171,7 +172,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
        list_del(&vma->anon_vma_node);
        /* We must garbage collect the anon_vma if it's empty */
-        empty = list_empty(&anon_vma->head);
+        empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
        spin_unlock(&anon_vma->lock);
        if (empty)
@@ -183,6 +184,7 @@ static void anon_vma_ctor(void *data)
        struct anon_vma *anon_vma = data;
        spin_lock_init(&anon_vma->lock);
+        ksm_refcount_init(anon_vma);
        INIT_LIST_HEAD(&anon_vma->head);
 }
@@ -202,8 +204,8 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
        unsigned long anon_mapping;
        rcu_read_lock();
-        anon_mapping = (unsigned long) page->mapping;
+        anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping);
-        if (!(anon_mapping & PAGE_MAPPING_ANON))
+        if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
                goto out;
        if (!page_mapped(page))
                goto out;
@@ -248,8 +250,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 {
        if (PageAnon(page)) {
-                if ((void *)vma->anon_vma !=
+                if (vma->anon_vma != page_anon_vma(page))
-                    (void *)page->mapping - PAGE_MAPPING_ANON)
                        return -EFAULT;
        } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
                if (!vma->vm_file ||
@@ -337,21 +338,15 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
 * Subfunctions of page_referenced: page_referenced_one called
 * repeatedly from either page_referenced_anon or page_referenced_file.
 */
-static int page_referenced_one(struct page *page,
+int page_referenced_one(struct page *page, struct vm_area_struct *vma,
-                               struct vm_area_struct *vma,
+                        unsigned long address, unsigned int *mapcount,
-                               unsigned int *mapcount,
+                        unsigned long *vm_flags)
-                               unsigned long *vm_flags)
 {
        struct mm_struct *mm = vma->vm_mm;
-        unsigned long address;
        pte_t *pte;
        spinlock_t *ptl;
        int referenced = 0;
-        address = vma_address(page, vma);
-        if (address == -EFAULT)
-                goto out;
        pte = page_check_address(page, mm, address, &ptl, 0);
        if (!pte)
                goto out;
@@ -388,9 +383,10 @@ static int page_referenced_one(struct page *page,
 out_unmap:
        (*mapcount)--;
        pte_unmap_unlock(pte, ptl);
-out:
        if (referenced)
                *vm_flags |= vma->vm_flags;
+out:
        return referenced;
 }
@@ -409,6 +405,9 @@ static int page_referenced_anon(struct page *page,
        mapcount = page_mapcount(page);
        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+                unsigned long address = vma_address(page, vma);
+                if (address == -EFAULT)
+                        continue;
                /*
                 * If we are reclaiming on behalf of a cgroup, skip
                 * counting on behalf of references from different
@@ -416,7 +415,7 @@ static int page_referenced_anon(struct page *page,
                 */
                if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
                        continue;
-                referenced += page_referenced_one(page, vma,
+                referenced += page_referenced_one(page, vma, address,
                                                  &mapcount, vm_flags);
                if (!mapcount)
                        break;
@@ -474,6 +473,9 @@ static int page_referenced_file(struct page *page,
        mapcount = page_mapcount(page);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+                unsigned long address = vma_address(page, vma);
+                if (address == -EFAULT)
+                        continue;
                /*
                 * If we are reclaiming on behalf of a cgroup, skip
                 * counting on behalf of references from different
@@ -481,7 +483,7 @@ static int page_referenced_file(struct page *page,
                 */
                if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
                        continue;
-                referenced += page_referenced_one(page, vma,
+                referenced += page_referenced_one(page, vma, address,
                                                  &mapcount, vm_flags);
                if (!mapcount)
                        break;
@@ -507,46 +509,47 @@ int page_referenced(struct page *page,
                    unsigned long *vm_flags)
 {
        int referenced = 0;
+        int we_locked = 0;
        if (TestClearPageReferenced(page))
                referenced++;
        *vm_flags = 0;
-        if (page_mapped(page) && page->mapping) {
+        if (page_mapped(page) && page_rmapping(page)) {
-                if (PageAnon(page))
+                if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
+                        we_locked = trylock_page(page);
+                        if (!we_locked) {
+                                referenced++;
+                                goto out;
+                        }
+                }
+                if (unlikely(PageKsm(page)))
+                        referenced += page_referenced_ksm(page, mem_cont,
+                                                                vm_flags);
+                else if (PageAnon(page))
                        referenced += page_referenced_anon(page, mem_cont,
                                                                vm_flags);
-                else if (is_locked)
+                else if (page->mapping)
                        referenced += page_referenced_file(page, mem_cont,
                                                                vm_flags);
-                else if (!trylock_page(page))
+                if (we_locked)
-                        referenced++;
-                else {
-                        if (page->mapping)
-                                referenced += page_referenced_file(page,
-                                                        mem_cont, vm_flags);
                        unlock_page(page);
-                }
        }
+out:
        if (page_test_and_clear_young(page))
                referenced++;
        return referenced;
 }
-static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
+static int page_mkclean_one(struct page *page, struct vm_area_struct *vma,
+                            unsigned long address)
 {
        struct mm_struct *mm = vma->vm_mm;
-        unsigned long address;
        pte_t *pte;
        spinlock_t *ptl;
        int ret = 0;
-        address = vma_address(page, vma);
-        if (address == -EFAULT)
-                goto out;
        pte = page_check_address(page, mm, address, &ptl, 1);
        if (!pte)
                goto out;
@@ -578,8 +581,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
        spin_lock(&mapping->i_mmap_lock);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
-                if (vma->vm_flags & VM_SHARED)
+                if (vma->vm_flags & VM_SHARED) {
-                        ret += page_mkclean_one(page, vma);
+                        unsigned long address = vma_address(page, vma);
+                        if (address == -EFAULT)
+                                continue;
+                        ret += page_mkclean_one(page, vma, address);
+                }
        }
        spin_unlock(&mapping->i_mmap_lock);
        return ret;
@@ -620,14 +627,7 @@ static void __page_set_anon_rmap(struct page *page,
        BUG_ON(!anon_vma);
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
        page->mapping = (struct address_space *) anon_vma;
        page->index = linear_page_index(vma, address);
-        /*
-         * nr_mapped state can be updated without turning off
-         * interrupts because it is not modified via interrupt.
-         */
-        __inc_zone_page_state(page, NR_ANON_PAGES);
 }
 /**
@@ -665,14 +665,23 @@ static void __page_check_anon_rmap(struct page *page,
 * @vma:        the vm area in which the mapping is added
 * @address:    the user virtual address mapped
 *
- * The caller needs to hold the pte lock and the page must be locked.
+ * The caller needs to hold the pte lock, and the page must be locked in
+ * the anon_vma case: to serialize mapping and index checking after setting,
+ * and to ensure that PageAnon is not being upgraded racily to PageKsm
+ * (but PageKsm is never downgraded to PageAnon).
 */
 void page_add_anon_rmap(struct page *page,
        struct vm_area_struct *vma, unsigned long address)
 {
+        int first = atomic_inc_and_test(&page->_mapcount);
+        if (first)
+                __inc_zone_page_state(page, NR_ANON_PAGES);
+        if (unlikely(PageKsm(page)))
+                return;
        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
-        if (atomic_inc_and_test(&page->_mapcount))
+        if (first)
                __page_set_anon_rmap(page, vma, address);
        else
                __page_check_anon_rmap(page, vma, address);
@@ -694,6 +703,7 @@ void page_add_new_anon_rmap(struct page *page,
        VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
        SetPageSwapBacked(page);
        atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
+        __inc_zone_page_state(page, NR_ANON_PAGES);
        __page_set_anon_rmap(page, vma, address);
        if (page_evictable(page, vma))
                lru_cache_add_lru(page, LRU_ACTIVE_ANON);
@@ -711,7 +721,7 @@ void page_add_file_rmap(struct page *page)
 {
        if (atomic_inc_and_test(&page->_mapcount)) {
                __inc_zone_page_state(page, NR_FILE_MAPPED);
-                mem_cgroup_update_mapped_file_stat(page, 1);
+                mem_cgroup_update_file_mapped(page, 1);
        }
 }
@@ -743,8 +753,8 @@ void page_remove_rmap(struct page *page)
                __dec_zone_page_state(page, NR_ANON_PAGES);
        } else {
                __dec_zone_page_state(page, NR_FILE_MAPPED);
+                mem_cgroup_update_file_mapped(page, -1);
        }
-        mem_cgroup_update_mapped_file_stat(page, -1);
        /*
         * It would be tidy to reset the PageAnon mapping here,
         * but that might overwrite a racing page_add_anon_rmap
@@ -760,20 +770,15 @@ void page_remove_rmap(struct page *page)
 * Subfunctions of try_to_unmap: try_to_unmap_one called
 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
 */
-static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
+int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
-                                enum ttu_flags flags)
+                     unsigned long address, enum ttu_flags flags)
 {
        struct mm_struct *mm = vma->vm_mm;
-        unsigned long address;
        pte_t *pte;
        pte_t pteval;
        spinlock_t *ptl;
        int ret = SWAP_AGAIN;
-        address = vma_address(page, vma);
-        if (address == -EFAULT)
-                goto out;
        pte = page_check_address(page, mm, address, &ptl, 0);
        if (!pte)
                goto out;
@@ -784,10 +789,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
         * skipped over this mm) then we should reactivate it.
         */
        if (!(flags & TTU_IGNORE_MLOCK)) {
-                if (vma->vm_flags & VM_LOCKED) {
+                if (vma->vm_flags & VM_LOCKED)
-                        ret = SWAP_MLOCK;
+                        goto out_mlock;
+                if (TTU_ACTION(flags) == TTU_MUNLOCK)
                        goto out_unmap;
-                }
        }
        if (!(flags & TTU_IGNORE_ACCESS)) {
                if (ptep_clear_flush_young_notify(vma, address, pte)) {
@@ -822,7 +828,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                         * Store the swap location in the pte.
                         * See handle_pte_fault() ...
                         */
-                        swap_duplicate(entry);
+                        if (swap_duplicate(entry) < 0) {
+                                set_pte_at(mm, address, pte, pteval);
+                                ret = SWAP_FAIL;
+                                goto out_unmap;
+                        }
                        if (list_empty(&mm->mmlist)) {
                                spin_lock(&mmlist_lock);
                                if (list_empty(&mm->mmlist))
@@ -849,7 +859,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
        } else
                dec_mm_counter(mm, file_rss);
        page_remove_rmap(page);
        page_cache_release(page);
@@ -857,6 +866,27 @@ out_unmap:
        pte_unmap_unlock(pte, ptl);
 out:
        return ret;
+out_mlock:
+        pte_unmap_unlock(pte, ptl);
+        /*
+         * We need mmap_sem locking; otherwise the VM_LOCKED check gives
+         * an unstable result and races. But we can't wait here, because
+         * we now hold anon_vma->lock or mapping->i_mmap_lock.
+         * If the trylock fails, the page remains on the evictable lru and
+         * later vmscan can retry moving the page to the unevictable lru if
+         * the page is actually mlocked.
+         */
+        if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
+                if (vma->vm_flags & VM_LOCKED) {
+                        mlock_vma_page(page);
+                        ret = SWAP_MLOCK;
+                }
+                up_read(&vma->vm_mm->mmap_sem);
+        }
+        return ret;
 }
 /*
@@ -922,11 +952,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
                return ret;
        /*
-         * MLOCK_PAGES => feature is configured.
+         * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
-         * if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
         * keep the sem while scanning the cluster for mlocking pages.
         */
-        if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) {
+        if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
                locked_vma = (vma->vm_flags & VM_LOCKED);
                if (!locked_vma)
                        up_read(&vma->vm_mm->mmap_sem); /* don't need it */
@@ -976,29 +1005,11 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
        return ret;
 }
-/*
- * common handling for pages mapped in VM_LOCKED vmas
- */
-static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
-{
-        int mlocked = 0;
-        if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
-                if (vma->vm_flags & VM_LOCKED) {
-                        mlock_vma_page(page);
-                        mlocked++;      /* really mlocked the page */
-                }
-                up_read(&vma->vm_mm->mmap_sem);
-        }
-        return mlocked;
-}
 /**
 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
 * rmap method
 * @page: the page to unmap/unlock
- * @unlock:  request for unlock rather than unmap [unlikely]
+ * @flags: action and flags
- * @migration:  unmapping for migration - ignored if @unlock
 *
 * Find all the mappings of a page using the mapping pointer and the vma chains
 * contained in the anon_vma struct it points to.
@@ -1014,42 +1025,22 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 {
        struct anon_vma *anon_vma;
        struct vm_area_struct *vma;
-        unsigned int mlocked = 0;
        int ret = SWAP_AGAIN;
-        int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
-        if (MLOCK_PAGES && unlikely(unlock))
-                ret = SWAP_SUCCESS;     /* default for try_to_munlock() */
        anon_vma = page_lock_anon_vma(page);
        if (!anon_vma)
                return ret;
        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
-                if (MLOCK_PAGES && unlikely(unlock)) {
+                unsigned long address = vma_address(page, vma);
-                        if (!((vma->vm_flags & VM_LOCKED) &&
+                if (address == -EFAULT)
-                              page_mapped_in_vma(page, vma)))
+                        continue;
-                                continue;  /* must visit all unlocked vmas */
+                ret = try_to_unmap_one(page, vma, address, flags);
-                        ret = SWAP_MLOCK;  /* saw at least one mlocked vma */
+                if (ret != SWAP_AGAIN || !page_mapped(page))
-                } else {
+                        break;
-                        ret = try_to_unmap_one(page, vma, flags);
-                        if (ret == SWAP_FAIL || !page_mapped(page))
-                                break;
-                }
-                if (ret == SWAP_MLOCK) {
-                        mlocked = try_to_mlock_page(page, vma);
-                        if (mlocked)
-                                break;  /* stop if actually mlocked page */
-                }
        }
        page_unlock_anon_vma(anon_vma);
-        if (mlocked)
-                ret = SWAP_MLOCK;       /* actually mlocked the page */
-        else if (ret == SWAP_MLOCK)
-                ret = SWAP_AGAIN;       /* saw VM_LOCKED vma */
        return ret;
 }
@@ -1079,48 +1070,30 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
        unsigned long max_nl_cursor = 0;
        unsigned long max_nl_size = 0;
        unsigned int mapcount;
-        unsigned int mlocked = 0;
-        int unlock = TTU_ACTION(flags) == TTU_MUNLOCK;
-        if (MLOCK_PAGES && unlikely(unlock))
-                ret = SWAP_SUCCESS;     /* default for try_to_munlock() */
        spin_lock(&mapping->i_mmap_lock);
        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
-                if (MLOCK_PAGES && unlikely(unlock)) {
+                unsigned long address = vma_address(page, vma);
-                        if (!((vma->vm_flags & VM_LOCKED) &&
+                if (address == -EFAULT)
-                                                page_mapped_in_vma(page, vma)))
+                        continue;
-                                continue;       /* must visit all vmas */
+                ret = try_to_unmap_one(page, vma, address, flags);
-                        ret = SWAP_MLOCK;
+                if (ret != SWAP_AGAIN || !page_mapped(page))
-                } else {
+                        goto out;
-                        ret = try_to_unmap_one(page, vma, flags);
-                        if (ret == SWAP_FAIL || !page_mapped(page))
-                                goto out;
-                }
-                if (ret == SWAP_MLOCK) {
-                        mlocked = try_to_mlock_page(page, vma);
-                        if (mlocked)
-                                break;  /* stop if actually mlocked page */
-                }
        }
-        if (mlocked)
+        if (list_empty(&mapping->i_mmap_nonlinear))
                goto out;
-        if (list_empty(&mapping->i_mmap_nonlinear))
+        /*
+         * We don't bother trying to find the munlocked page in nonlinear
+         * vmas: it's costly.  Instead, page reclaim logic may later call
+         * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
+         */
+        if (TTU_ACTION(flags) == TTU_MUNLOCK)
                goto out;
        list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
                                                shared.vm_set.list) {
-                if (MLOCK_PAGES && unlikely(unlock)) {
-                        if (!(vma->vm_flags & VM_LOCKED))
-                                continue;       /* must visit all vmas */
-                        ret = SWAP_MLOCK;       /* leave mlocked == 0 */
-                        goto out;               /* no need to look further */
-                }
-                if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
-                        (vma->vm_flags & VM_LOCKED))
-                        continue;
                cursor = (unsigned long) vma->vm_private_data;
                if (cursor > max_nl_cursor)
                        max_nl_cursor = cursor;
@@ -1153,16 +1126,12 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
        do {
                list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
                                                shared.vm_set.list) {
-                        if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) &&
-                            (vma->vm_flags & VM_LOCKED))
-                                continue;
                        cursor = (unsigned long) vma->vm_private_data;
                        while ( cursor < max_nl_cursor &&
                                cursor < vma->vm_end - vma->vm_start) {
-                                ret = try_to_unmap_cluster(cursor, &mapcount,
+                                if (try_to_unmap_cluster(cursor, &mapcount,
-                                                                vma, page);
+                                                vma, page) == SWAP_MLOCK)
-                                if (ret == SWAP_MLOCK)
+                                        ret = SWAP_MLOCK;
-                                        mlocked = 2;    /* to return below */
                                cursor += CLUSTER_SIZE;
                                vma->vm_private_data = (void *) cursor;
                                if ((int)mapcount <= 0)
@@ -1183,10 +1152,6 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
                vma->vm_private_data = NULL;
 out:
        spin_unlock(&mapping->i_mmap_lock);
-        if (mlocked)
-                ret = SWAP_MLOCK;       /* actually mlocked the page */
-        else if (ret == SWAP_MLOCK)
-                ret = SWAP_AGAIN;       /* saw VM_LOCKED vma */
        return ret;
 }
@@ -1210,7 +1175,9 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
        BUG_ON(!PageLocked(page));
-        if (PageAnon(page))
+        if (unlikely(PageKsm(page)))
+                ret = try_to_unmap_ksm(page, flags);
+        else if (PageAnon(page))
                ret = try_to_unmap_anon(page, flags);
        else
                ret = try_to_unmap_file(page, flags);
@@ -1229,17 +1196,98 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
 *
 * Return values are:
 *
- * SWAP_SUCCESS - no vma's holding page mlocked.
+ * SWAP_AGAIN   - no vma is holding page mlocked, or,
 * SWAP_AGAIN   - page mapped in mlocked vma -- couldn't acquire mmap sem
+ * SWAP_FAIL    - page cannot be located at present
 * SWAP_MLOCK   - page is now mlocked.
 */
 int try_to_munlock(struct page *page)
 {
        VM_BUG_ON(!PageLocked(page) || PageLRU(page));
-        if (PageAnon(page))
+        if (unlikely(PageKsm(page)))
+                return try_to_unmap_ksm(page, TTU_MUNLOCK);
+        else if (PageAnon(page))
                return try_to_unmap_anon(page, TTU_MUNLOCK);
        else
                return try_to_unmap_file(page, TTU_MUNLOCK);
 }
+#ifdef CONFIG_MIGRATION
+/*
+ * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
+ * Called by migrate.c to remove migration ptes, but might be used more later.
+ */
+static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
+                struct vm_area_struct *, unsigned long, void *), void *arg)
+{
+        struct anon_vma *anon_vma;
+        struct vm_area_struct *vma;
+        int ret = SWAP_AGAIN;
+        /*
+         * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
+         * because that depends on page_mapped(); but not all its usages
+         * are holding mmap_sem, which also gave the necessary guarantee
+         * (that this anon_vma's slab has not already been destroyed).
+         * This needs to be reviewed later: avoiding page_lock_anon_vma()
+         * is risky, and currently limits the usefulness of rmap_walk().
+         */
+        anon_vma = page_anon_vma(page);
+        if (!anon_vma)
+                return ret;
+        spin_lock(&anon_vma->lock);
+        list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
+                unsigned long address = vma_address(page, vma);
+                if (address == -EFAULT)
+                        continue;
+                ret = rmap_one(page, vma, address, arg);
+                if (ret != SWAP_AGAIN)
+                        break;
+        }
+        spin_unlock(&anon_vma->lock);
+        return ret;
+}
+static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
+                struct vm_area_struct *, unsigned long, void *), void *arg)
+{
+        struct address_space *mapping = page->mapping;
+        pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+        struct vm_area_struct *vma;
+        struct prio_tree_iter iter;
+        int ret = SWAP_AGAIN;
+        if (!mapping)
+                return ret;
+        spin_lock(&mapping->i_mmap_lock);
+        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+                unsigned long address = vma_address(page, vma);
+                if (address == -EFAULT)
+                        continue;
+                ret = rmap_one(page, vma, address, arg);
+                if (ret != SWAP_AGAIN)
+                        break;
+        }
+        /*
+         * No nonlinear handling: being always shared, nonlinear vmas
+         * never contain migration ptes.  Decide what to do about this
+         * limitation to linear when we need rmap_walk() on nonlinear.
+         */
+        spin_unlock(&mapping->i_mmap_lock);
+        return ret;
+}
+int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
+                struct vm_area_struct *, unsigned long, void *), void *arg)
+{
+        VM_BUG_ON(!PageLocked(page));
+        if (unlikely(PageKsm(page)))
+                return rmap_walk_ksm(page, rmap_one, arg);
+        else if (PageAnon(page))
+                return rmap_walk_anon(page, rmap_one, arg);
+        else
+                return rmap_walk_file(page, rmap_one, arg);
+}
+#endif /* CONFIG_MIGRATION */
diff --git a/mm/shmem.c b/mm/shmem.c
index 356dd99566ec..eef4ebea5158 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -29,7 +29,6 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/swap.h>
-#include <linux/ima.h>
 static struct vfsmount *shm_mnt;
@@ -42,6 +41,7 @@ static struct vfsmount *shm_mnt;
 #include <linux/xattr.h>
 #include <linux/exportfs.h>
+#include <linux/posix_acl.h>
 #include <linux/generic_acl.h>
 #include <linux/mman.h>
 #include <linux/string.h>
@@ -810,7 +810,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
                error = inode_setattr(inode, attr);
 #ifdef CONFIG_TMPFS_POSIX_ACL
        if (!error && (attr->ia_valid & ATTR_MODE))
-                error = generic_acl_chmod(inode, &shmem_acl_ops);
+                error = generic_acl_chmod(inode);
 #endif
        if (page)
                page_cache_release(page);
@@ -1017,7 +1017,14 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
                        goto out;
        }
        mutex_unlock(&shmem_swaplist_mutex);
-out:    return found;   /* 0 or 1 or -ENOMEM */
+        /*
+         * Can some race bring us here?  We've been holding page lock,
+         * so I think not; but would rather try again later than BUG()
+         */
+        unlock_page(page);
+        page_cache_release(page);
+out:
+        return (found < 0) ? found : 0;
 }
 /*
@@ -1080,7 +1087,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
                else
                        inode = NULL;
                spin_unlock(&info->lock);
-                swap_duplicate(swap);
+                swap_shmem_alloc(swap);
                BUG_ON(page_mapped(page));
                page_cache_release(page);       /* pagecache ref */
                swap_writepage(page, wbc);
@@ -1817,11 +1824,15 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
                                return error;
                        }
                }
-                error = shmem_acl_init(inode, dir);
+#ifdef CONFIG_TMPFS_POSIX_ACL
+                error = generic_acl_init(inode, dir);
                if (error) {
                        iput(inode);
                        return error;
                }
+#else
+                error = 0;
+#endif
                if (dir->i_mode & S_ISGID) {
                        inode->i_gid = dir->i_gid;
                        if (S_ISDIR(mode))
@@ -2036,27 +2047,28 @@ static const struct inode_operations shmem_symlink_inode_operations = {
 * filesystem level, though.
 */
-static size_t shmem_xattr_security_list(struct inode *inode, char *list,
+static size_t shmem_xattr_security_list(struct dentry *dentry, char *list,
                                        size_t list_len, const char *name,
-                                        size_t name_len)
+                                        size_t name_len, int handler_flags)
 {
-        return security_inode_listsecurity(inode, list, list_len);
+        return security_inode_listsecurity(dentry->d_inode, list, list_len);
 }
-static int shmem_xattr_security_get(struct inode *inode, const char *name,
+static int shmem_xattr_security_get(struct dentry *dentry, const char *name,
-                                    void *buffer, size_t size)
+                void *buffer, size_t size, int handler_flags)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        return xattr_getsecurity(inode, name, buffer, size);
+        return xattr_getsecurity(dentry->d_inode, name, buffer, size);
 }
-static int shmem_xattr_security_set(struct inode *inode, const char *name,
+static int shmem_xattr_security_set(struct dentry *dentry, const char *name,
-                                    const void *value, size_t size, int flags)
+                const void *value, size_t size, int flags, int handler_flags)
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-        return security_inode_setsecurity(inode, name, value, size, flags);
+        return security_inode_setsecurity(dentry->d_inode, name, value,
+                                          size, flags);
 }
 static struct xattr_handler shmem_xattr_security_handler = {
@@ -2067,8 +2079,8 @@ static struct xattr_handler shmem_xattr_security_handler = {
 };
 static struct xattr_handler *shmem_xattr_handlers[] = {
-        &shmem_xattr_acl_access_handler,
+        &generic_acl_access_handler,
-        &shmem_xattr_acl_default_handler,
+        &generic_acl_default_handler,
        &shmem_xattr_security_handler,
        NULL
 };
@@ -2447,7 +2459,7 @@ static const struct inode_operations shmem_inode_operations = {
        .getxattr       = generic_getxattr,
        .listxattr      = generic_listxattr,
        .removexattr    = generic_removexattr,
-        .check_acl      = shmem_check_acl,
+        .check_acl      = generic_check_acl,
 #endif
 };
@@ -2470,7 +2482,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
        .getxattr       = generic_getxattr,
        .listxattr      = generic_listxattr,
        .removexattr    = generic_removexattr,
-        .check_acl      = shmem_check_acl,
+        .check_acl      = generic_check_acl,
 #endif
 };
@@ -2481,7 +2493,7 @@ static const struct inode_operations shmem_special_inode_operations = {
        .getxattr       = generic_getxattr,
        .listxattr      = generic_listxattr,
        .removexattr    = generic_removexattr,
-        .check_acl      = shmem_check_acl,
+        .check_acl      = generic_check_acl,
 #endif
 };
@@ -2619,7 +2631,8 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
        int error;
        struct file *file;
        struct inode *inode;
-        struct dentry *dentry, *root;
+        struct path path;
+        struct dentry *root;
        struct qstr this;
        if (IS_ERR(shm_mnt))
@@ -2636,38 +2649,35 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
        this.len = strlen(name);
        this.hash = 0; /* will go */
        root = shm_mnt->mnt_root;
-        dentry = d_alloc(root, &this);
+        path.dentry = d_alloc(root, &this);
-        if (!dentry)
+        if (!path.dentry)
                goto put_memory;
+        path.mnt = mntget(shm_mnt);
-        error = -ENFILE;
-        file = get_empty_filp();
-        if (!file)
-                goto put_dentry;
        error = -ENOSPC;
        inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags);
        if (!inode)
-                goto close_file;
+                goto put_dentry;
-        d_instantiate(dentry, inode);
+        d_instantiate(path.dentry, inode);
        inode->i_size = size;
        inode->i_nlink = 0;     /* It is unlinked */
-        init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
-                  &shmem_file_operations);
 #ifndef CONFIG_MMU
        error = ramfs_nommu_expand_for_mapping(inode, size);
        if (error)
-                goto close_file;
+                goto put_dentry;
 #endif
-        ima_counts_get(file);
+        error = -ENFILE;
+        file = alloc_file(&path, FMODE_WRITE | FMODE_READ,
+                  &shmem_file_operations);
+        if (!file)
+                goto put_dentry;
        return file;
-close_file:
-        put_filp(file);
 put_dentry:
-        dput(dentry);
+        path_put(&path);
 put_memory:
        shmem_unacct_size(flags, size);
        return ERR_PTR(error);
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
deleted file mode 100644
index df2c87fdae50..000000000000
--- a/mm/shmem_acl.c
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * mm/shmem_acl.c
- *
- * (C) 2005 Andreas Gruenbacher <agruen@suse.de>
- *
- * This file is released under the GPL.
- */
-#include <linux/fs.h>
-#include <linux/shmem_fs.h>
-#include <linux/xattr.h>
-#include <linux/generic_acl.h>
-/**
- * shmem_get_acl  -   generic_acl_operations->getacl() operation
- */
-static struct posix_acl *
-shmem_get_acl(struct inode *inode, int type)
-{
-        struct posix_acl *acl = NULL;
-        spin_lock(&inode->i_lock);
-        switch(type) {
-                case ACL_TYPE_ACCESS:
-                        acl = posix_acl_dup(inode->i_acl);
-                        break;
-                case ACL_TYPE_DEFAULT:
-                        acl = posix_acl_dup(inode->i_default_acl);
-                        break;
-        }
-        spin_unlock(&inode->i_lock);
-        return acl;
-}
-/**
- * shmem_set_acl  -   generic_acl_operations->setacl() operation
- */
-static void
-shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl)
-{
-        struct posix_acl *free = NULL;
-        spin_lock(&inode->i_lock);
-        switch(type) {
-                case ACL_TYPE_ACCESS:
-                        free = inode->i_acl;
-                        inode->i_acl = posix_acl_dup(acl);
-                        break;
-                case ACL_TYPE_DEFAULT:
-                        free = inode->i_default_acl;
-                        inode->i_default_acl = posix_acl_dup(acl);
-                        break;
-        }
-        spin_unlock(&inode->i_lock);
-        posix_acl_release(free);
-}
-struct generic_acl_operations shmem_acl_ops = {
-        .getacl = shmem_get_acl,
-        .setacl = shmem_set_acl,
-};
-/**
- * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access,
- * shmem_xattr_acl_access_handler  -  plumbing code to implement the
- * system.posix_acl_access xattr using the generic acl functions.
- */
-static size_t
-shmem_list_acl_access(struct inode *inode, char *list, size_t list_size,
-                      const char *name, size_t name_len)
-{
-        return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS,
-                                list, list_size);
-}
-static int
-shmem_get_acl_access(struct inode *inode, const char *name, void *buffer,
-                     size_t size)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer,
-                               size);
-}
-static int
-shmem_set_acl_access(struct inode *inode, const char *name, const void *value,
-                     size_t size, int flags)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value,
-                               size);
-}
-struct xattr_handler shmem_xattr_acl_access_handler = {
-        .prefix = POSIX_ACL_XATTR_ACCESS,
-        .list   = shmem_list_acl_access,
-        .get    = shmem_get_acl_access,
-        .set    = shmem_set_acl_access,
-};
-/**
- * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default,
- * shmem_xattr_acl_default_handler  -  plumbing code to implement the
- * system.posix_acl_default xattr using the generic acl functions.
- */
-static size_t
-shmem_list_acl_default(struct inode *inode, char *list, size_t list_size,
-                       const char *name, size_t name_len)
-{
-        return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT,
-                                list, list_size);
-}
-static int
-shmem_get_acl_default(struct inode *inode, const char *name, void *buffer,
-                      size_t size)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer,
-                               size);
-}
-static int
-shmem_set_acl_default(struct inode *inode, const char *name, const void *value,
-                      size_t size, int flags)
-{
-        if (strcmp(name, "") != 0)
-                return -EINVAL;
-        return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value,
-                               size);
-}
-struct xattr_handler shmem_xattr_acl_default_handler = {
-        .prefix = POSIX_ACL_XATTR_DEFAULT,
-        .list   = shmem_list_acl_default,
-        .get    = shmem_get_acl_default,
-        .set    = shmem_set_acl_default,
-};
-/**
- * shmem_acl_init  -  Inizialize the acl(s) of a new inode
- */
-int
-shmem_acl_init(struct inode *inode, struct inode *dir)
-{
-        return generic_acl_init(inode, dir, &shmem_acl_ops);
-}
-/**
- * shmem_check_acl  -  check_acl() callback for generic_permission()
- */
-int
-shmem_check_acl(struct inode *inode, int mask)
-{
-        struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS);
-        if (acl) {
-                int error = posix_acl_permission(inode, acl, mask);
-                posix_acl_release(acl);
-                return error;
-        }
-        return -EAGAIN;
-}
diff --git a/mm/slab.c b/mm/slab.c
index 7dfa481c96ba..7d41f15b48d3 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -490,7 +490,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 #endif
-#ifdef CONFIG_KMEMTRACE
+#ifdef CONFIG_TRACING
 size_t slab_buffer_size(struct kmem_cache *cachep)
 {
        return cachep->buffer_size;
@@ -604,6 +604,26 @@ static struct kmem_cache cache_cache = {
 #define BAD_ALIEN_MAGIC 0x01020304ul
+/*
+ * Chicken-and-egg problem: delay the per-cpu array allocation
+ * until the general caches are up.
+ *
+ * The enumerators are declared in ascending order and are compared
+ * numerically: slab_is_available() tests for >= EARLY, and
+ * init_node_lock_keys() does nothing unless the value is FULL.
+ */
+static enum {
+        NONE,
+        PARTIAL_AC,
+        PARTIAL_L3,
+        EARLY,
+        FULL
+} g_cpucache_up;
+/*
+ * Used by boot code to determine if it can use the slab based allocator.
+ * Returns 1 once g_cpucache_up is EARLY or FULL, and 0 before that.
+ */
+int slab_is_available(void)
+{
+        return g_cpucache_up >= EARLY;
+}
 #ifdef CONFIG_LOCKDEP
 /*
@@ -620,40 +640,52 @@ static struct kmem_cache cache_cache = {
 static struct lock_class_key on_slab_l3_key;
 static struct lock_class_key on_slab_alc_key;
-static inline void init_lock_keys(void)
+/*
+ * Assign the on-slab lockdep classes to the locks reached through node q
+ * in every cache listed in malloc_sizes: the kmem_list3 list_lock and
+ * the lock of each alien array_cache.  Does nothing until g_cpucache_up
+ * has reached FULL.
+ *
+ * A cache that has no list for this node, is off-slab, or has no usable
+ * alien array is skipped on its own; the remaining caches are still
+ * processed.
+ */
+static void init_node_lock_keys(int q)
 {
-        int q;
         struct cache_sizes *s = malloc_sizes;
-        while (s->cs_size != ULONG_MAX) {
+        if (g_cpucache_up != FULL)
-                for_each_node(q) {
+                return;
-                        struct array_cache **alc;
-                        int r;
+        for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) {
-                        struct kmem_list3 *l3 = s->cs_cachep->nodelists[q];
+                struct array_cache **alc;
-                        if (!l3 || OFF_SLAB(s->cs_cachep))
+                struct kmem_list3 *l3;
-                                continue;
+                int r;
-                        lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
-                        alc = l3->alien;
+                l3 = s->cs_cachep->nodelists[q];
-                        /*
+                if (!l3 || OFF_SLAB(s->cs_cachep))
-                         * FIXME: This check for BAD_ALIEN_MAGIC
+                        continue;
-                         * should go away when common slab code is taught to
+                lockdep_set_class(&l3->list_lock, &on_slab_l3_key);
-                         * work even without alien caches.
+                alc = l3->alien;
-                         * Currently, non NUMA code returns BAD_ALIEN_MAGIC
+                /*
-                         * for alloc_alien_cache,
+                 * FIXME: This check for BAD_ALIEN_MAGIC
-                         */
+                 * should go away when common slab code is taught to
-                        if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
+                 * work even without alien caches.
-                                continue;
+                 * Currently, non NUMA code returns BAD_ALIEN_MAGIC
-                        for_each_node(r) {
+                 * for alloc_alien_cache,
-                                if (alc[r])
+                 */
-                                        lockdep_set_class(&alc[r]->lock,
+                if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC)
-                                             &on_slab_alc_key);
+                        continue;
-                        }
+                for_each_node(r) {
+                        if (alc[r])
+                                lockdep_set_class(&alc[r]->lock,
+                                        &on_slab_alc_key);
                 }
-                s++;
         }
 }
+/* Run init_node_lock_keys() for every node. */
+static inline void init_lock_keys(void)
+{
+        int node;
+        for_each_node(node)
+                init_node_lock_keys(node);
+}
 #else
+static void init_node_lock_keys(int q)
+{
+}
 static inline void init_lock_keys(void)
 {
 }
@@ -665,27 +697,7 @@ static inline void init_lock_keys(void)
 static DEFINE_MUTEX(cache_chain_mutex);
 static struct list_head cache_chain;
-/*
+static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
- * chicken and egg problem: delay the per-cpu array allocation
- * until the general caches are up.
- */
-static enum {
-        NONE,
-        PARTIAL_AC,
-        PARTIAL_L3,
-        EARLY,
-        FULL
-} g_cpucache_up;
-/*
- * used by boot code to determine if it can use slab based allocator
- */
-int slab_is_available(void)
-{
-        return g_cpucache_up >= EARLY;
-}
-static DEFINE_PER_CPU(struct delayed_work, reap_work);
 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
 {
@@ -826,7 +838,7 @@ __setup("noaliencache", noaliencache_setup);
 * objects freed on different nodes from which they were allocated) and the
 * flushing of remote pcps by calling drain_node_pages.
 */
-static DEFINE_PER_CPU(unsigned long, reap_node);
+static DEFINE_PER_CPU(unsigned long, slab_reap_node);
 static void init_reap_node(int cpu)
 {
@@ -836,17 +848,17 @@ static void init_reap_node(int cpu)
        if (node == MAX_NUMNODES)
                node = first_node(node_online_map);
-        per_cpu(reap_node, cpu) = node;
+        per_cpu(slab_reap_node, cpu) = node;
 }
 static void next_reap_node(void)
 {
-        int node = __get_cpu_var(reap_node);
+        int node = __get_cpu_var(slab_reap_node);
        node = next_node(node, node_online_map);
        if (unlikely(node >= MAX_NUMNODES))
                node = first_node(node_online_map);
-        __get_cpu_var(reap_node) = node;
+        __get_cpu_var(slab_reap_node) = node;
 }
 #else
@@ -863,7 +875,7 @@ static void next_reap_node(void)
 */
 static void __cpuinit start_cpu_timer(int cpu)
 {
-        struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
+        struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu);
        /*
         * When this gets called from do_initcalls via cpucache_init(),
@@ -1027,7 +1039,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
 */
 static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
 {
-        int node = __get_cpu_var(reap_node);
+        int node = __get_cpu_var(slab_reap_node);
        if (l3->alien) {
                struct array_cache *ac = l3->alien[node];
@@ -1120,7 +1132,7 @@ static void __cpuinit cpuup_canceled(long cpu)
                if (nc)
                        free_block(cachep, nc->entry, nc->avail, node);
-                if (!cpus_empty(*mask)) {
+                if (!cpumask_empty(mask)) {
                        spin_unlock_irq(&l3->list_lock);
                        goto free_array_cache;
                }
@@ -1254,6 +1266,8 @@ static int __cpuinit cpuup_prepare(long cpu)
                kfree(shared);
                free_alien_cache(alien);
        }
+        init_node_lock_keys(node);
        return 0;
 bad:
        cpuup_canceled(cpu);
@@ -1286,9 +1300,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
                 * anything expensive but will only modify reap_work
                 * and reschedule the timer.
                */
-                cancel_rearming_delayed_work(&per_cpu(reap_work, cpu));
+                cancel_rearming_delayed_work(&per_cpu(slab_reap_work, cpu));
                /* Now the cache_reaper is guaranteed to be not running. */
-                per_cpu(reap_work, cpu).work.func = NULL;
+                per_cpu(slab_reap_work, cpu).work.func = NULL;
                break;
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
@@ -2261,9 +2275,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
        /*
         * Determine if the slab management is 'on' or 'off' slab.
         * (bootstrapping cannot cope with offslab caches so don't do
-         * it too early on.)
+         * it too early on. Always use on-slab management when
+         * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
         */
-        if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
+        if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
+            !(flags & SLAB_NOLEAKTRACE))
                /*
                 * Size is large, assume best to place the slab management obj
                 * off-slab (should allow better packing of objs).
@@ -2582,8 +2598,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
                 * kmemleak does not treat the ->s_mem pointer as a reference
                 * to the object. Otherwise we will not report the leak.
                 */
-                kmemleak_scan_area(slabp, offsetof(struct slab, list),
+                kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
-                                   sizeof(struct list_head), local_flags);
+                                   local_flags);
                if (!slabp)
                        return NULL;
        } else {
@@ -3103,13 +3119,19 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
        } else {
                STATS_INC_ALLOCMISS(cachep);
                objp = cache_alloc_refill(cachep, flags);
+                /*
+                 * the 'ac' may be updated by cache_alloc_refill(),
+                 * and kmemleak_erase() requires its correct value.
+                 */
+                ac = cpu_cache_get(cachep);
        }
        /*
         * To avoid a false negative, if an object that is in one of the
         * per-CPU caches is leaked, we need to make sure kmemleak doesn't
         * treat the array pointers as a reference to the object.
         */
-        kmemleak_erase(&ac->entry[ac->avail]);
+        if (objp)
+                kmemleak_erase(&ac->entry[ac->avail]);
        return objp;
 }
@@ -3306,7 +3328,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
        cache_alloc_debugcheck_before(cachep, flags);
        local_irq_save(save_flags);
-        if (unlikely(nodeid == -1))
+        if (nodeid == -1)
                nodeid = numa_node_id();
        if (unlikely(!cachep->nodelists[nodeid])) {
@@ -3558,7 +3580,7 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 }
 EXPORT_SYMBOL(kmem_cache_alloc);
-#ifdef CONFIG_KMEMTRACE
+#ifdef CONFIG_TRACING
 void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
 {
        return __cache_alloc(cachep, flags, __builtin_return_address(0));
@@ -3621,7 +3643,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node);
-#ifdef CONFIG_KMEMTRACE
+#ifdef CONFIG_TRACING
 void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
                                    gfp_t flags,
                                    int nodeid)
@@ -3649,7 +3671,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
        return ret;
 }
-#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE)
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
 void *__kmalloc_node(size_t size, gfp_t flags, int node)
 {
        return __do_kmalloc_node(size, flags, node,
@@ -3669,7 +3691,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
        return __do_kmalloc_node(size, flags, node, NULL);
 }
 EXPORT_SYMBOL(__kmalloc_node);
-#endif /* CONFIG_DEBUG_SLAB */
+#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */
 #endif /* CONFIG_NUMA */
 /**
@@ -3701,7 +3723,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
 }
-#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE)
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING)
 void *__kmalloc(size_t size, gfp_t flags)
 {
        return __do_kmalloc(size, flags, __builtin_return_address(0));
diff --git a/mm/slub.c b/mm/slub.c
index 4996fc719552..8d71aaf888d7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1735,7 +1735,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
        }
        local_irq_restore(flags);
-        if (unlikely((gfpflags & __GFP_ZERO) && object))
+        if (unlikely(gfpflags & __GFP_ZERO) && object)
                memset(object, 0, objsize);
        kmemcheck_slab_alloc(s, gfpflags, object, c->objsize);
@@ -1754,7 +1754,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
 }
 EXPORT_SYMBOL(kmem_cache_alloc);
-#ifdef CONFIG_KMEMTRACE
+#ifdef CONFIG_TRACING
 void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
 {
        return slab_alloc(s, gfpflags, -1, _RET_IP_);
@@ -1775,7 +1775,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
 EXPORT_SYMBOL(kmem_cache_alloc_node);
 #endif
-#ifdef CONFIG_KMEMTRACE
+#ifdef CONFIG_TRACING
 void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
                                    gfp_t gfpflags,
                                    int node)
@@ -4371,12 +4371,28 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
        return len + sprintf(buf + len, "\n");
 }
+static void clear_stat(struct kmem_cache *s, enum stat_item si)
+{       /* Zero stat[si] of cache s for each online CPU only. */
+        int cpu;
+        for_each_online_cpu(cpu)
+                get_cpu_slab(s, cpu)->stat[si] = 0;
+}
 #define STAT_ATTR(si, text)                                     \
 static ssize_t text##_show(struct kmem_cache *s, char *buf)     \
 {                                                               \
        return show_stat(s, buf, si);                           \
 }                                                               \
-SLAB_ATTR_RO(text);                                             \
+static ssize_t text##_store(struct kmem_cache *s,               \
+                                const char *buf, size_t length) \
+{                                                               \
+        if (buf[0] != '0')                                      \
+                return -EINVAL;                                 \
+        clear_stat(s, si);                                      \
+        return length;                                          \
+}                                                               \
+SLAB_ATTR(text);                                                \
 STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath);
 STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 9c590eef7912..6c0585b16418 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -22,6 +22,7 @@
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/security.h>
 #include <linux/backing-dev.h>
@@ -35,11 +36,15 @@
 #include <linux/swapops.h>
 #include <linux/page_cgroup.h>
+static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
+                                 unsigned char);
+static void free_swap_count_continuations(struct swap_info_struct *);
+static sector_t map_swap_entry(swp_entry_t, struct block_device**);
 static DEFINE_SPINLOCK(swap_lock);
 static unsigned int nr_swapfiles;
 long nr_swap_pages;
 long total_swap_pages;
-static int swap_overflow;
 static int least_priority;
 static const char Bad_file[] = "Bad swap file entry ";
@@ -49,42 +54,20 @@ static const char Unused_offset[] = "Unused swap offset entry ";
 static struct swap_list_t swap_list = {-1, -1};
-static struct swap_info_struct swap_info[MAX_SWAPFILES];
+static struct swap_info_struct *swap_info[MAX_SWAPFILES];
 static DEFINE_MUTEX(swapon_mutex);
-/* For reference count accounting in swap_map */
+static inline unsigned char swap_count(unsigned char ent)
-/* enum for swap_map[] handling. internal use only */
-enum {
-        SWAP_MAP = 0,   /* ops for reference from swap users */
-        SWAP_CACHE,     /* ops for reference from swap cache */
-};
-static inline int swap_count(unsigned short ent)
-{
-        return ent & SWAP_COUNT_MASK;
-}
-static inline bool swap_has_cache(unsigned short ent)
 {
-        return !!(ent & SWAP_HAS_CACHE);
+        return ent & ~SWAP_HAS_CACHE;   /* may include COUNT_CONTINUED flag */
 }
-static inline unsigned short encode_swapmap(int count, bool has_cache)
+/* returns 1 if swap entry is freed */
-{
-        unsigned short ret = count;
-        if (has_cache)
-                return SWAP_HAS_CACHE | ret;
-        return ret;
-}
-/* returnes 1 if swap entry is freed */
 static int
 __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
 {
-        int type = si - swap_info;
+        swp_entry_t entry = swp_entry(si->type, offset);
-        swp_entry_t entry = swp_entry(type, offset);
        struct page *page;
        int ret = 0;
@@ -120,7 +103,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
        down_read(&swap_unplug_sem);
        entry.val = page_private(page);
        if (PageSwapCache(page)) {
-                struct block_device *bdev = swap_info[swp_type(entry)].bdev;
+                struct block_device *bdev = swap_info[swp_type(entry)]->bdev;
                struct backing_dev_info *bdi;
                /*
@@ -146,23 +129,28 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
 static int discard_swap(struct swap_info_struct *si)
 {
        struct swap_extent *se;
+        sector_t start_block;
+        sector_t nr_blocks;
        int err = 0;
-        list_for_each_entry(se, &si->extent_list, list) {
+        /* Do not discard the swap header page! */
-                sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
+        se = &si->first_swap_extent;
-                sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
+        start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
+        nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
+        if (nr_blocks) {
+                err = blkdev_issue_discard(si->bdev, start_block,
+                                nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
+                if (err)
+                        return err;
+                cond_resched();
+        }
-                if (se->start_page == 0) {
+        list_for_each_entry(se, &si->first_swap_extent.list, list) {
-                        /* Do not discard the swap header page! */
+                start_block = se->start_block << (PAGE_SHIFT - 9);
-                        start_block += 1 << (PAGE_SHIFT - 9);
+                nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
-                        nr_blocks -= 1 << (PAGE_SHIFT - 9);
-                        if (!nr_blocks)
-                                continue;
-                }
                err = blkdev_issue_discard(si->bdev, start_block,
-                                                nr_blocks, GFP_KERNEL,
+                                nr_blocks, GFP_KERNEL, DISCARD_FL_BARRIER);
-                                                DISCARD_FL_BARRIER);
                if (err)
                        break;
@@ -201,14 +189,11 @@ static void discard_swap_cluster(struct swap_info_struct *si,
                        start_block <<= PAGE_SHIFT - 9;
                        nr_blocks <<= PAGE_SHIFT - 9;
                        if (blkdev_issue_discard(si->bdev, start_block,
-                                                        nr_blocks, GFP_NOIO,
+                                    nr_blocks, GFP_NOIO, DISCARD_FL_BARRIER))
-                                                        DISCARD_FL_BARRIER))
                                break;
                }
                lh = se->list.next;
-                if (lh == &si->extent_list)
-                        lh = lh->next;
                se = list_entry(lh, struct swap_extent, list);
        }
 }
@@ -223,7 +208,7 @@ static int wait_for_discard(void *word)
 #define LATENCY_LIMIT           256
 static inline unsigned long scan_swap_map(struct swap_info_struct *si,
-                                          int cache)
+                                          unsigned char usage)
 {
        unsigned long offset;
        unsigned long scan_base;
@@ -354,10 +339,7 @@ checks:
                si->lowest_bit = si->max;
                si->highest_bit = 0;
        }
-        if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */
+        si->swap_map[offset] = usage;
-                si->swap_map[offset] = encode_swapmap(0, true);
-        else /* at suspend */
-                si->swap_map[offset] = encode_swapmap(1, false);
        si->cluster_next = offset + 1;
        si->flags -= SWP_SCANNING;
@@ -467,10 +449,10 @@ swp_entry_t get_swap_page(void)
        nr_swap_pages--;
        for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
-                si = swap_info + type;
+                si = swap_info[type];
                next = si->next;
                if (next < 0 ||
-                    (!wrapped && si->prio != swap_info[next].prio)) {
+                    (!wrapped && si->prio != swap_info[next]->prio)) {
                        next = swap_list.head;
                        wrapped++;
                }
@@ -482,7 +464,7 @@ swp_entry_t get_swap_page(void)
                swap_list.next = next;
                /* This is called for allocating swap entry for cache */
-                offset = scan_swap_map(si, SWAP_CACHE);
+                offset = scan_swap_map(si, SWAP_HAS_CACHE);
                if (offset) {
                        spin_unlock(&swap_lock);
                        return swp_entry(type, offset);
@@ -503,11 +485,11 @@ swp_entry_t get_swap_page_of_type(int type)
        pgoff_t offset;
        spin_lock(&swap_lock);
-        si = swap_info + type;
+        si = swap_info[type];
-        if (si->flags & SWP_WRITEOK) {
+        if (si && (si->flags & SWP_WRITEOK)) {
                nr_swap_pages--;
                /* This is called for allocating swap entry, not cache */
-                offset = scan_swap_map(si, SWAP_MAP);
+                offset = scan_swap_map(si, 1);
                if (offset) {
                        spin_unlock(&swap_lock);
                        return swp_entry(type, offset);
@@ -518,9 +500,9 @@ swp_entry_t get_swap_page_of_type(int type)
        return (swp_entry_t) {0};
 }
-static struct swap_info_struct * swap_info_get(swp_entry_t entry)
+static struct swap_info_struct *swap_info_get(swp_entry_t entry)
 {
-        struct swap_info_struct * p;
+        struct swap_info_struct *p;
        unsigned long offset, type;
        if (!entry.val)
@@ -528,7 +510,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry)
        type = swp_type(entry);
        if (type >= nr_swapfiles)
                goto bad_nofile;
-        p = & swap_info[type];
+        p = swap_info[type];
        if (!(p->flags & SWP_USED))
                goto bad_device;
        offset = swp_offset(entry);
@@ -554,41 +536,56 @@ out:
        return NULL;
 }
-static int swap_entry_free(struct swap_info_struct *p,
+static unsigned char swap_entry_free(struct swap_info_struct *p,
-                           swp_entry_t ent, int cache)
+                                     swp_entry_t entry, unsigned char usage)
 {
-        unsigned long offset = swp_offset(ent);
+        unsigned long offset = swp_offset(entry);
-        int count = swap_count(p->swap_map[offset]);
+        unsigned char count;
-        bool has_cache;
+        unsigned char has_cache;
-        has_cache = swap_has_cache(p->swap_map[offset]);
+        count = p->swap_map[offset];
+        has_cache = count & SWAP_HAS_CACHE;
+        count &= ~SWAP_HAS_CACHE;
-        if (cache == SWAP_MAP) { /* dropping usage count of swap */
+        if (usage == SWAP_HAS_CACHE) {
-                if (count < SWAP_MAP_MAX) {
-                        count--;
-                        p->swap_map[offset] = encode_swapmap(count, has_cache);
-                }
-        } else { /* dropping swap cache flag */
                VM_BUG_ON(!has_cache);
-                p->swap_map[offset] = encode_swapmap(count, false);
+                has_cache = 0;
+        } else if (count == SWAP_MAP_SHMEM) {
+                /*
+                 * Or we could insist on shmem.c using a special
+                 * swap_shmem_free() and free_shmem_swap_and_cache()...
+                 */
+                count = 0;
+        } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
+                if (count == COUNT_CONTINUED) {
+                        if (swap_count_continued(p, offset, count))
+                                count = SWAP_MAP_MAX | COUNT_CONTINUED;
+                        else
+                                count = SWAP_MAP_MAX;
+                } else
+                        count--;
        }
-        /* return code. */
-        count = p->swap_map[offset];
+        if (!count)
+                mem_cgroup_uncharge_swap(entry);
+        usage = count | has_cache;
+        p->swap_map[offset] = usage;
        /* free if no reference */
-        if (!count) {
+        if (!usage) {
                if (offset < p->lowest_bit)
                        p->lowest_bit = offset;
                if (offset > p->highest_bit)
                        p->highest_bit = offset;
-                if (p->prio > swap_info[swap_list.next].prio)
+                if (swap_list.next >= 0 &&
-                        swap_list.next = p - swap_info;
+                    p->prio > swap_info[swap_list.next]->prio)
+                        swap_list.next = p->type;
                nr_swap_pages++;
                p->inuse_pages--;
        }
-        if (!swap_count(count))
-                mem_cgroup_uncharge_swap(ent);
+        return usage;
-        return count;
 }
 /*
@@ -597,11 +594,11 @@ static int swap_entry_free(struct swap_info_struct *p,
 */
 void swap_free(swp_entry_t entry)
 {
-        struct swap_info_struct * p;
+        struct swap_info_struct *p;
        p = swap_info_get(entry);
        if (p) {
-                swap_entry_free(p, entry, SWAP_MAP);
+                swap_entry_free(p, entry, 1);
                spin_unlock(&swap_lock);
        }
 }
@@ -612,26 +609,21 @@ void swap_free(swp_entry_t entry)
 void swapcache_free(swp_entry_t entry, struct page *page)
 {
        struct swap_info_struct *p;
-        int ret;
+        unsigned char count;
        p = swap_info_get(entry);
        if (p) {
-                ret = swap_entry_free(p, entry, SWAP_CACHE);
+                count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
-                if (page) {
+                if (page)
-                        bool swapout;
+                        mem_cgroup_uncharge_swapcache(page, entry, count != 0);
-                        if (ret)
-                                swapout = true; /* the end of swap out */
-                        else
-                                swapout = false; /* no more swap users! */
-                        mem_cgroup_uncharge_swapcache(page, entry, swapout);
-                }
                spin_unlock(&swap_lock);
        }
-        return;
 }
 /*
 * How many references to page are currently swapped out?
+ * This does not give an exact answer when swap count is continued,
+ * but does include the high COUNT_CONTINUED flag to allow for that.
 */
 static inline int page_swapcount(struct page *page)
 {
@@ -659,6 +651,8 @@ int reuse_swap_page(struct page *page)
        int count;
        VM_BUG_ON(!PageLocked(page));
+        if (unlikely(PageKsm(page)))
+                return 0;
        count = page_mapcount(page);
        if (count <= 1 && PageSwapCache(page)) {
                count += page_swapcount(page);
@@ -667,7 +661,7 @@ int reuse_swap_page(struct page *page)
                        SetPageDirty(page);
                }
        }
-        return count == 1;
+        return count <= 1;
 }
 /*
@@ -704,7 +698,7 @@ int free_swap_and_cache(swp_entry_t entry)
        p = swap_info_get(entry);
        if (p) {
-                if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) {
+                if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) {
                        page = find_get_page(&swapper_space, entry.val);
                        if (page && !trylock_page(page)) {
                                page_cache_release(page);
@@ -741,14 +735,14 @@ int free_swap_and_cache(swp_entry_t entry)
 int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 {
        struct block_device *bdev = NULL;
-        int i;
+        int type;
        if (device)
                bdev = bdget(device);
        spin_lock(&swap_lock);
-        for (i = 0; i < nr_swapfiles; i++) {
+        for (type = 0; type < nr_swapfiles; type++) {
-                struct swap_info_struct *sis = swap_info + i;
+                struct swap_info_struct *sis = swap_info[type];
                if (!(sis->flags & SWP_WRITEOK))
                        continue;
@@ -758,20 +752,18 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
                                *bdev_p = bdgrab(sis->bdev);
                        spin_unlock(&swap_lock);
-                        return i;
+                        return type;
                }
                if (bdev == sis->bdev) {
-                        struct swap_extent *se;
+                        struct swap_extent *se = &sis->first_swap_extent;
-                        se = list_entry(sis->extent_list.next,
-                                        struct swap_extent, list);
                        if (se->start_block == offset) {
                                if (bdev_p)
                                        *bdev_p = bdgrab(sis->bdev);
                                spin_unlock(&swap_lock);
                                bdput(bdev);
-                                return i;
+                                return type;
                        }
                }
        }
@@ -783,6 +775,21 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p)
 }
 /*
+ * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
+ * corresponding to given index in swap_info (swap type).
+ *
+ * Returns 0 if type is negative or not below nr_swapfiles, or if that
+ * swap area does not have SWP_WRITEOK set; otherwise returns the value
+ * of map_swap_entry() for swp_entry(type, offset).  The local bdev that
+ * is passed to map_swap_entry() is not looked at afterwards.
+ */
+sector_t swapdev_block(int type, pgoff_t offset)
+{
+        struct block_device *bdev;
+        if ((unsigned int)type >= nr_swapfiles)
+                return 0;
+        if (!(swap_info[type]->flags & SWP_WRITEOK))
+                return 0;
+        return map_swap_entry(swp_entry(type, offset), &bdev);
+}
+/*
 * Return either the total number of swap pages of given type, or the number
 * of free pages of that type (depending on @free)
 *
@@ -792,18 +799,20 @@ unsigned int count_swap_pages(int type, int free)
 {
        unsigned int n = 0;
-        if (type < nr_swapfiles) {
+        spin_lock(&swap_lock);
-                spin_lock(&swap_lock);
+        if ((unsigned int)type < nr_swapfiles) {
-                if (swap_info[type].flags & SWP_WRITEOK) {
+                struct swap_info_struct *sis = swap_info[type];
-                        n = swap_info[type].pages;
+                if (sis->flags & SWP_WRITEOK) {
+                        n = sis->pages;
                        if (free)
-                                n -= swap_info[type].inuse_pages;
+                                n -= sis->inuse_pages;
                }
-                spin_unlock(&swap_lock);
        }
+        spin_unlock(&swap_lock);
        return n;
 }
-#endif
+#endif /* CONFIG_HIBERNATION */
 /*
 * No need to decide whether this PTE shares the swap entry with others,
@@ -932,7 +941,7 @@ static int unuse_vma(struct vm_area_struct *vma,
        unsigned long addr, end, next;
        int ret;
-        if (page->mapping) {
+        if (page_anon_vma(page)) {
                addr = page_address_in_vma(page, vma);
                if (addr == -EFAULT)
                        return 0;
@@ -988,7 +997,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
 {
        unsigned int max = si->max;
        unsigned int i = prev;
-        int count;
+        unsigned char count;
        /*
         * No need for swap_lock here: we're just looking
@@ -1024,16 +1033,14 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
 */
 static int try_to_unuse(unsigned int type)
 {
-        struct swap_info_struct * si = &swap_info[type];
+        struct swap_info_struct *si = swap_info[type];
        struct mm_struct *start_mm;
-        unsigned short *swap_map;
+        unsigned char *swap_map;
-        unsigned short swcount;
+        unsigned char swcount;
        struct page *page;
        swp_entry_t entry;
        unsigned int i = 0;
        int retval = 0;
-        int reset_overflow = 0;
-        int shmem;
        /*
         * When searching mms for an entry, a good strategy is to
@@ -1047,8 +1054,7 @@ static int try_to_unuse(unsigned int type)
         * together, child after parent.  If we race with dup_mmap(), we
         * prefer to resolve parent before child, lest we miss entries
         * duplicated after we scanned child: using last mm would invert
-         * that.  Though it's only a serious concern when an overflowed
+         * that.
-         * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
         */
        start_mm = &init_mm;
        atomic_inc(&init_mm.mm_users);
@@ -1110,17 +1116,18 @@ static int try_to_unuse(unsigned int type)
                /*
                 * Remove all references to entry.
-                 * Whenever we reach init_mm, there's no address space
-                 * to search, but use it as a reminder to search shmem.
                 */
-                shmem = 0;
                swcount = *swap_map;
-                if (swap_count(swcount)) {
+                if (swap_count(swcount) == SWAP_MAP_SHMEM) {
-                        if (start_mm == &init_mm)
+                        retval = shmem_unuse(entry, page);
-                                shmem = shmem_unuse(entry, page);
+                        /* page has already been unlocked and released */
-                        else
+                        if (retval < 0)
-                                retval = unuse_mm(start_mm, entry, page);
+                                break;
+                        continue;
                }
+                if (swap_count(swcount) && start_mm != &init_mm)
+                        retval = unuse_mm(start_mm, entry, page);
                if (swap_count(*swap_map)) {
                        int set_start_mm = (*swap_map >= swcount);
                        struct list_head *p = &start_mm->mmlist;
@@ -1131,7 +1138,7 @@ static int try_to_unuse(unsigned int type)
                        atomic_inc(&new_start_mm->mm_users);
                        atomic_inc(&prev_mm->mm_users);
                        spin_lock(&mmlist_lock);
-                        while (swap_count(*swap_map) && !retval && !shmem &&
+                        while (swap_count(*swap_map) && !retval &&
                                        (p = p->next) != &start_mm->mmlist) {
                                mm = list_entry(p, struct mm_struct, mmlist);
                                if (!atomic_inc_not_zero(&mm->mm_users))
@@ -1145,10 +1152,9 @@ static int try_to_unuse(unsigned int type)
                                swcount = *swap_map;
                                if (!swap_count(swcount)) /* any usage ? */
                                        ;
-                                else if (mm == &init_mm) {
+                                else if (mm == &init_mm)
                                        set_start_mm = 1;
-                                        shmem = shmem_unuse(entry, page);
+                                else
-                                } else
                                        retval = unuse_mm(mm, entry, page);
                                if (set_start_mm && *swap_map < swcount) {
@@ -1164,13 +1170,6 @@ static int try_to_unuse(unsigned int type)
                        mmput(start_mm);
                        start_mm = new_start_mm;
                }
-                if (shmem) {
-                        /* page has already been unlocked and released */
-                        if (shmem > 0)
-                                continue;
-                        retval = shmem;
-                        break;
-                }
                if (retval) {
                        unlock_page(page);
                        page_cache_release(page);
@@ -1178,30 +1177,6 @@ static int try_to_unuse(unsigned int type)
                }
                /*
-                 * How could swap count reach 0x7ffe ?
-                 * There's no way to repeat a swap page within an mm
-                 * (except in shmem, where it's the shared object which takes
-                 * the reference count)?
-                 * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned
-                 * short is too small....)
-                 * If that's wrong, then we should worry more about
-                 * exit_mmap() and do_munmap() cases described above:
-                 * we might be resetting SWAP_MAP_MAX too early here.
-                 * We know "Undead"s can happen, they're okay, so don't
-                 * report them; but do report if we reset SWAP_MAP_MAX.
-                 */
-                /* We might release the lock_page() in unuse_mm(). */
-                if (!PageSwapCache(page) || page_private(page) != entry.val)
-                        goto retry;
-                if (swap_count(*swap_map) == SWAP_MAP_MAX) {
-                        spin_lock(&swap_lock);
-                        *swap_map = encode_swapmap(0, true);
-                        spin_unlock(&swap_lock);
-                        reset_overflow = 1;
-                }
-                /*
                 * If a reference remains (rare), we would like to leave
                 * the page in the swap cache; but try_to_unmap could
                 * then re-duplicate the entry once we drop page lock,
@@ -1213,6 +1188,12 @@ static int try_to_unuse(unsigned int type)
                 * read from disk into another page.  Splitting into two
                 * pages would be incorrect if swap supported "shared
                 * private" pages, but they are handled by tmpfs files.
+                 *
+                 * Given how unuse_vma() targets one particular offset
+                 * in an anon_vma, once the anon_vma has been determined,
+                 * this splitting happens to be just what is needed to
+                 * handle where KSM pages have been swapped out: re-reading
+                 * is unnecessarily slow, but we can fix that later on.
                 */
                if (swap_count(*swap_map) &&
                     PageDirty(page) && PageSwapCache(page)) {
@@ -1242,7 +1223,6 @@ static int try_to_unuse(unsigned int type)
                 * mark page dirty so shrink_page_list will preserve it.
                 */
                SetPageDirty(page);
-retry:
                unlock_page(page);
                page_cache_release(page);
@@ -1254,10 +1234,6 @@ retry:
        }
        mmput(start_mm);
-        if (reset_overflow) {
-                printk(KERN_WARNING "swapoff: cleared swap entry overflow\n");
-                swap_overflow = 0;
-        }
        return retval;
 }
@@ -1270,10 +1246,10 @@ retry:
 static void drain_mmlist(void)
 {
        struct list_head *p, *next;
-        unsigned int i;
+        unsigned int type;
-        for (i = 0; i < nr_swapfiles; i++)
+        for (type = 0; type < nr_swapfiles; type++)
-                if (swap_info[i].inuse_pages)
+                if (swap_info[type]->inuse_pages)
                        return;
        spin_lock(&mmlist_lock);
        list_for_each_safe(p, next, &init_mm.mmlist)
@@ -1283,12 +1259,23 @@ static void drain_mmlist(void)
 /*
 * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
- * corresponds to page offset `offset'.
+ * corresponds to page offset for the specified swap entry.
+ * Note that the type of this function is sector_t, but it returns page offset
+ * into the bdev, not sector offset.
 */
-sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
+static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
 {
-        struct swap_extent *se = sis->curr_swap_extent;
+        struct swap_info_struct *sis;
-        struct swap_extent *start_se = se;
+        struct swap_extent *start_se;
+        struct swap_extent *se;
+        pgoff_t offset;
+        sis = swap_info[swp_type(entry)];
+        *bdev = sis->bdev;
+        offset = swp_offset(entry);
+        start_se = sis->curr_swap_extent;
+        se = start_se;
        for ( ; ; ) {
                struct list_head *lh;
@@ -1298,40 +1285,31 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
                        return se->start_block + (offset - se->start_page);
                }
                lh = se->list.next;
-                if (lh == &sis->extent_list)
-                        lh = lh->next;
                se = list_entry(lh, struct swap_extent, list);
                sis->curr_swap_extent = se;
                BUG_ON(se == start_se);         /* It *must* be present */
        }
 }
-#ifdef CONFIG_HIBERNATION
 /*
- * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
+ * Returns the page offset into bdev for the specified page's swap entry.
- * corresponding to given index in swap_info (swap type).
 */
-sector_t swapdev_block(int swap_type, pgoff_t offset)
+sector_t map_swap_page(struct page *page, struct block_device **bdev)
 {
-        struct swap_info_struct *sis;
+        swp_entry_t entry;
+        entry.val = page_private(page);
-        if (swap_type >= nr_swapfiles)
+        return map_swap_entry(entry, bdev);
-                return 0;
-        sis = swap_info + swap_type;
-        return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0;
 }
-#endif /* CONFIG_HIBERNATION */
 /*
 * Free all of a swapdev's extent information
 */
 static void destroy_swap_extents(struct swap_info_struct *sis)
 {
-        while (!list_empty(&sis->extent_list)) {
+        while (!list_empty(&sis->first_swap_extent.list)) {
                struct swap_extent *se;
-                se = list_entry(sis->extent_list.next,
+                se = list_entry(sis->first_swap_extent.list.next,
                                struct swap_extent, list);
                list_del(&se->list);
                kfree(se);
@@ -1352,8 +1330,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
        struct swap_extent *new_se;
        struct list_head *lh;
-        lh = sis->extent_list.prev;     /* The highest page extent */
+        if (start_page == 0) {
-        if (lh != &sis->extent_list) {
+                se = &sis->first_swap_extent;
+                sis->curr_swap_extent = se;
+                se->start_page = 0;
+                se->nr_pages = nr_pages;
+                se->start_block = start_block;
+                return 1;
+        } else {
+                lh = sis->first_swap_extent.list.prev;  /* Highest extent */
                se = list_entry(lh, struct swap_extent, list);
                BUG_ON(se->start_page + se->nr_pages != start_page);
                if (se->start_block + se->nr_pages == start_block) {
@@ -1373,7 +1358,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
        new_se->nr_pages = nr_pages;
        new_se->start_block = start_block;
-        list_add_tail(&new_se->list, &sis->extent_list);
+        list_add_tail(&new_se->list, &sis->first_swap_extent.list);
        return 1;
 }
@@ -1425,7 +1410,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
        if (S_ISBLK(inode->i_mode)) {
                ret = add_swap_extent(sis, 0, sis->max, 0);
                *span = sis->pages;
-                goto done;
+                goto out;
        }
        blkbits = inode->i_blkbits;
@@ -1496,25 +1481,22 @@ reprobe:
        sis->max = page_no;
        sis->pages = page_no - 1;
        sis->highest_bit = page_no - 1;
-done:
+out:
-        sis->curr_swap_extent = list_entry(sis->extent_list.prev,
+        return ret;
-                                        struct swap_extent, list);
-        goto out;
 bad_bmap:
        printk(KERN_ERR "swapon: swapfile has holes\n");
        ret = -EINVAL;
-out:
+        goto out;
-        return ret;
 }
 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 {
-        struct swap_info_struct * p = NULL;
+        struct swap_info_struct *p = NULL;
-        unsigned short *swap_map;
+        unsigned char *swap_map;
        struct file *swap_file, *victim;
        struct address_space *mapping;
        struct inode *inode;
-        char * pathname;
+        char *pathname;
        int i, type, prev;
        int err;
@@ -1535,8 +1517,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        mapping = victim->f_mapping;
        prev = -1;
        spin_lock(&swap_lock);
-        for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
+        for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
-                p = swap_info + type;
+                p = swap_info[type];
                if (p->flags & SWP_WRITEOK) {
                        if (p->swap_file->f_mapping == mapping)
                                break;
@@ -1555,18 +1537,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
                spin_unlock(&swap_lock);
                goto out_dput;
        }
-        if (prev < 0) {
+        if (prev < 0)
                swap_list.head = p->next;
-        } else {
+        else
-                swap_info[prev].next = p->next;
+                swap_info[prev]->next = p->next;
-        }
        if (type == swap_list.next) {
                /* just pick something that's safe... */
                swap_list.next = swap_list.head;
        }
        if (p->prio < 0) {
-                for (i = p->next; i >= 0; i = swap_info[i].next)
+                for (i = p->next; i >= 0; i = swap_info[i]->next)
-                        swap_info[i].prio = p->prio--;
+                        swap_info[i]->prio = p->prio--;
                least_priority++;
        }
        nr_swap_pages -= p->pages;
@@ -1584,16 +1565,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
                if (p->prio < 0)
                        p->prio = --least_priority;
                prev = -1;
-                for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
+                for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
-                        if (p->prio >= swap_info[i].prio)
+                        if (p->prio >= swap_info[i]->prio)
                                break;
                        prev = i;
                }
                p->next = i;
                if (prev < 0)
-                        swap_list.head = swap_list.next = p - swap_info;
+                        swap_list.head = swap_list.next = type;
                else
-                        swap_info[prev].next = p - swap_info;
+                        swap_info[prev]->next = type;
                nr_swap_pages += p->pages;
                total_swap_pages += p->pages;
                p->flags |= SWP_WRITEOK;
@@ -1606,6 +1587,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        up_write(&swap_unplug_sem);
        destroy_swap_extents(p);
+        if (p->flags & SWP_CONTINUED)
+                free_swap_count_continuations(p);
        mutex_lock(&swapon_mutex);
        spin_lock(&swap_lock);
        drain_mmlist();
@@ -1653,8 +1637,8 @@ out:
 /* iterator */
 static void *swap_start(struct seq_file *swap, loff_t *pos)
 {
-        struct swap_info_struct *ptr = swap_info;
+        struct swap_info_struct *si;
-        int i;
+        int type;
        loff_t l = *pos;
        mutex_lock(&swapon_mutex);
@@ -1662,11 +1646,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
        if (!l)
                return SEQ_START_TOKEN;
-        for (i = 0; i < nr_swapfiles; i++, ptr++) {
+        for (type = 0; type < nr_swapfiles; type++) {
-                if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
+                smp_rmb();      /* read nr_swapfiles before swap_info[type] */
+                si = swap_info[type];
+                if (!(si->flags & SWP_USED) || !si->swap_map)
                        continue;
                if (!--l)
-                        return ptr;
+                        return si;
        }
        return NULL;
@@ -1674,21 +1660,21 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
 static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
 {
-        struct swap_info_struct *ptr;
+        struct swap_info_struct *si = v;
-        struct swap_info_struct *endptr = swap_info + nr_swapfiles;
+        int type;
        if (v == SEQ_START_TOKEN)
-                ptr = swap_info;
+                type = 0;
-        else {
+        else
-                ptr = v;
+                type = si->type + 1;
-                ptr++;
-        }
-        for (; ptr < endptr; ptr++) {
+        for (; type < nr_swapfiles; type++) {
-                if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
+                smp_rmb();      /* read nr_swapfiles before swap_info[type] */
+                si = swap_info[type];
+                if (!(si->flags & SWP_USED) || !si->swap_map)
                        continue;
                ++*pos;
-                return ptr;
+                return si;
        }
        return NULL;
@@ -1701,24 +1687,24 @@ static void swap_stop(struct seq_file *swap, void *v)
 static int swap_show(struct seq_file *swap, void *v)
 {
-        struct swap_info_struct *ptr = v;
+        struct swap_info_struct *si = v;
        struct file *file;
        int len;
-        if (ptr == SEQ_START_TOKEN) {
+        if (si == SEQ_START_TOKEN) {
                seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
                return 0;
        }
-        file = ptr->swap_file;
+        file = si->swap_file;
        len = seq_path(swap, &file->f_path, " \t\n\\");
        seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
                        len < 40 ? 40 - len : 1, " ",
                        S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
                                "partition" : "file\t",
-                        ptr->pages << (PAGE_SHIFT - 10),
+                        si->pages << (PAGE_SHIFT - 10),
-                        ptr->inuse_pages << (PAGE_SHIFT - 10),
+                        si->inuse_pages << (PAGE_SHIFT - 10),
-                        ptr->prio);
+                        si->prio);
        return 0;
 }
@@ -1765,7 +1751,7 @@ late_initcall(max_swapfiles_check);
 */
 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 {
-        struct swap_info_struct * p;
+        struct swap_info_struct *p;
        char *name = NULL;
        struct block_device *bdev = NULL;
        struct file *swap_file = NULL;
@@ -1779,30 +1765,52 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        sector_t span;
        unsigned long maxpages = 1;
        unsigned long swapfilepages;
-        unsigned short *swap_map = NULL;
+        unsigned char *swap_map = NULL;
        struct page *page = NULL;
        struct inode *inode = NULL;
        int did_down = 0;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
+        p = kzalloc(sizeof(*p), GFP_KERNEL);
+        if (!p)
+                return -ENOMEM;
        spin_lock(&swap_lock);
-        p = swap_info;
+        for (type = 0; type < nr_swapfiles; type++) {
-        for (type = 0 ; type < nr_swapfiles ; type++,p++)
+                if (!(swap_info[type]->flags & SWP_USED))
-                if (!(p->flags & SWP_USED))
                        break;
+        }
        error = -EPERM;
        if (type >= MAX_SWAPFILES) {
                spin_unlock(&swap_lock);
+                kfree(p);
                goto out;
        }
-        if (type >= nr_swapfiles)
+        if (type >= nr_swapfiles) {
-                nr_swapfiles = type+1;
+                p->type = type;
-        memset(p, 0, sizeof(*p));
+                swap_info[type] = p;
-        INIT_LIST_HEAD(&p->extent_list);
+                /*
+                 * Write swap_info[type] before nr_swapfiles, in case a
+                 * racing procfs swap_start() or swap_next() is reading them.
+                 * (We never shrink nr_swapfiles, we never free this entry.)
+                 */
+                smp_wmb();
+                nr_swapfiles++;
+        } else {
+                kfree(p);
+                p = swap_info[type];
+                /*
+                 * Do not memset this entry: a racing procfs swap_next()
+                 * would be relying on p->type to remain valid.
+                 */
+        }
+        INIT_LIST_HEAD(&p->first_swap_extent.list);
        p->flags = SWP_USED;
        p->next = -1;
        spin_unlock(&swap_lock);
        name = getname(specialfile);
        error = PTR_ERR(name);
        if (IS_ERR(name)) {
@@ -1822,7 +1830,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        error = -EBUSY;
        for (i = 0; i < nr_swapfiles; i++) {
-                struct swap_info_struct *q = &swap_info[i];
+                struct swap_info_struct *q = swap_info[i];
                if (i == type || !q->swap_file)
                        continue;
@@ -1897,6 +1905,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        p->lowest_bit  = 1;
        p->cluster_next = 1;
+        p->cluster_nr = 0;
        /*
         * Find out how many pages are allowed for a single swap
@@ -1932,13 +1941,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
                goto bad_swap;
        /* OK, set up the swap map and apply the bad block list */
-        swap_map = vmalloc(maxpages * sizeof(short));
+        swap_map = vmalloc(maxpages);
        if (!swap_map) {
                error = -ENOMEM;
                goto bad_swap;
        }
-        memset(swap_map, 0, maxpages * sizeof(short));
+        memset(swap_map, 0, maxpages);
        for (i = 0; i < swap_header->info.nr_badpages; i++) {
                int page_nr = swap_header->info.badpages[i];
                if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
@@ -2003,18 +2012,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
        /* insert swap space into swap_list: */
        prev = -1;
-        for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
+        for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
-                if (p->prio >= swap_info[i].prio) {
+                if (p->prio >= swap_info[i]->prio)
                        break;
-                }
                prev = i;
        }
        p->next = i;
-        if (prev < 0) {
+        if (prev < 0)
-                swap_list.head = swap_list.next = p - swap_info;
+                swap_list.head = swap_list.next = type;
-        } else {
+        else
-                swap_info[prev].next = p - swap_info;
+                swap_info[prev]->next = type;
-        }
        spin_unlock(&swap_lock);
        mutex_unlock(&swapon_mutex);
        error = 0;
@@ -2051,15 +2058,15 @@ out:
 void si_swapinfo(struct sysinfo *val)
 {
-        unsigned int i;
+        unsigned int type;
        unsigned long nr_to_be_unused = 0;
        spin_lock(&swap_lock);
-        for (i = 0; i < nr_swapfiles; i++) {
+        for (type = 0; type < nr_swapfiles; type++) {
-                if (!(swap_info[i].flags & SWP_USED) ||
+                struct swap_info_struct *si = swap_info[type];
-                     (swap_info[i].flags & SWP_WRITEOK))
-                        continue;
+                if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
-                nr_to_be_unused += swap_info[i].inuse_pages;
+                        nr_to_be_unused += si->inuse_pages;
        }
        val->freeswap = nr_swap_pages + nr_to_be_unused;
        val->totalswap = total_swap_pages + nr_to_be_unused;
@@ -2069,101 +2076,107 @@ void si_swapinfo(struct sysinfo *val)
 /*
 * Verify that a swap entry is valid and increment its swap map count.
 *
- * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as
- * "permanent", but will be reclaimed by the next swapoff.
 * Returns error code in following case.
 * - success -> 0
 * - swp_entry is invalid -> EINVAL
 * - swp_entry is migration entry -> EINVAL
 * - swap-cache reference is requested but there is already one. -> EEXIST
 * - swap-cache reference is requested but the entry is not used. -> ENOENT
+ * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
 */
-static int __swap_duplicate(swp_entry_t entry, bool cache)
+static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
 {
-        struct swap_info_struct * p;
+        struct swap_info_struct *p;
        unsigned long offset, type;
-        int result = -EINVAL;
+        unsigned char count;
-        int count;
+        unsigned char has_cache;
-        bool has_cache;
+        int err = -EINVAL;
        if (non_swap_entry(entry))
-                return -EINVAL;
+                goto out;
        type = swp_type(entry);
        if (type >= nr_swapfiles)
                goto bad_file;
-        p = type + swap_info;
+        p = swap_info[type];
        offset = swp_offset(entry);
        spin_lock(&swap_lock);
        if (unlikely(offset >= p->max))
                goto unlock_out;
-        count = swap_count(p->swap_map[offset]);
+        count = p->swap_map[offset];
-        has_cache = swap_has_cache(p->swap_map[offset]);
+        has_cache = count & SWAP_HAS_CACHE;
+        count &= ~SWAP_HAS_CACHE;
+        err = 0;
-        if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */
+        if (usage == SWAP_HAS_CACHE) {
                /* set SWAP_HAS_CACHE if there is no cache and entry is used */
-                if (!has_cache && count) {
+                if (!has_cache && count)
-                        p->swap_map[offset] = encode_swapmap(count, true);
+                        has_cache = SWAP_HAS_CACHE;
-                        result = 0;
+                else if (has_cache)             /* someone else added cache */
-                } else if (has_cache) /* someone added cache */
+                        err = -EEXIST;
-                        result = -EEXIST;
+                else                            /* no users remaining */
-                else if (!count) /* no users */
+                        err = -ENOENT;
-                        result = -ENOENT;
        } else if (count || has_cache) {
-                if (count < SWAP_MAP_MAX - 1) {
-                        p->swap_map[offset] = encode_swapmap(count + 1,
+                if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
-                                                             has_cache);
+                        count += usage;
-                        result = 0;
+                else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
-                } else if (count <= SWAP_MAP_MAX) {
+                        err = -EINVAL;
-                        if (swap_overflow++ < 5)
+                else if (swap_count_continued(p, offset, count))
-                                printk(KERN_WARNING
+                        count = COUNT_CONTINUED;
-                                       "swap_dup: swap entry overflow\n");
+                else
-                        p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX,
+                        err = -ENOMEM;
-                                                              has_cache);
-                        result = 0;
-                }
        } else
-                result = -ENOENT; /* unused swap entry */
+                err = -ENOENT;                  /* unused swap entry */
+        p->swap_map[offset] = count | has_cache;
 unlock_out:
        spin_unlock(&swap_lock);
 out:
-        return result;
+        return err;
 bad_file:
        printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
        goto out;
 }
+/*
+ * Help swapoff by noting that swap entry belongs to shmem/tmpfs
+ * (in which case its reference count is never incremented).
+ */
+void swap_shmem_alloc(swp_entry_t entry)
+{
+        __swap_duplicate(entry, SWAP_MAP_SHMEM);
+}
 /*
 * increase reference count of swap entry by 1.
 */
-void swap_duplicate(swp_entry_t entry)
+int swap_duplicate(swp_entry_t entry)
 {
-        __swap_duplicate(entry, SWAP_MAP);
+        int err = 0;
+        while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
+                err = add_swap_count_continuation(entry, GFP_ATOMIC);
+        return err;
 }
 /*
 * @entry: swap entry for which we allocate swap cache.
 *
- * Called when allocating swap cache for exising swap entry,
+ * Called when allocating swap cache for existing swap entry,
 * This can return error codes. Returns 0 at success.
- * -EBUSY means there is a swap cache.
+ * -EEXIST means there is a swap cache.
 * Note: return code is different from swap_duplicate().
 */
 int swapcache_prepare(swp_entry_t entry)
 {
-        return __swap_duplicate(entry, SWAP_CACHE);
+        return __swap_duplicate(entry, SWAP_HAS_CACHE);
-}
-struct swap_info_struct *
-get_swap_info_struct(unsigned type)
-{
-        return &swap_info[type];
 }
 /*
@@ -2181,7 +2194,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
        if (!our_page_cluster)  /* no readahead */
                return 0;
-        si = &swap_info[swp_type(entry)];
+        si = swap_info[swp_type(entry)];
        target = swp_offset(entry);
        base = (target >> our_page_cluster) << our_page_cluster;
        end = base + (1 << our_page_cluster);
@@ -2217,3 +2230,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
        *offset = ++toff;
        return nr_pages? ++nr_pages: 0;
 }
+/*
+ * add_swap_count_continuation - called when a swap count is duplicated
+ * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
+ * page of the original vmalloc'ed swap_map, to hold the continuation count
+ * (for that entry and for its neighbouring PAGE_SIZE swap entries).  Called
+ * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
+ *
+ * These continuation pages are seldom referenced: the common paths all work
+ * on the original swap_map, only referring to a continuation page when the
+ * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
+ *
+ * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
+ * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
+ * can be called after dropping locks.
+ */
+int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
+{
+        struct swap_info_struct *si;
+        struct page *head;
+        struct page *page;
+        struct page *list_page;
+        pgoff_t offset;
+        unsigned char count;
+        /*
+         * When debugging, it's easier to use __GFP_ZERO here; but it's better
+         * for latency not to zero a page while GFP_ATOMIC and holding locks.
+         */
+        page = alloc_page(gfp_mask | __GFP_HIGHMEM);
+        si = swap_info_get(entry);
+        if (!si) {
+                /*
+                 * An acceptable race has occurred since the failing
+                 * __swap_duplicate(): the swap entry has been freed,
+                 * perhaps even the whole swap_map cleared for swapoff.
+                 */
+                goto outer;
+        }
+        offset = swp_offset(entry);
+        count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
+        if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
+                /*
+                 * The higher the swap count, the more likely it is that tasks
+                 * will race to add swap count continuation: we need to avoid
+                 * over-provisioning.
+                 */
+                goto out;
+        }
+        if (!page) {
+                spin_unlock(&swap_lock);
+                return -ENOMEM;
+        }
+        /*
+         * We are fortunate that although vmalloc_to_page uses pte_offset_map,
+         * no architecture is using highmem pages for kernel pagetables: so it
+         * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
+         */
+        head = vmalloc_to_page(si->swap_map + offset);
+        offset &= ~PAGE_MASK;
+        /*
+         * Page allocation does not initialize the page's lru field,
+         * but it does always reset its private field.
+         */
+        if (!page_private(head)) {
+                BUG_ON(count & COUNT_CONTINUED);
+                INIT_LIST_HEAD(&head->lru);
+                set_page_private(head, SWP_CONTINUED);
+                si->flags |= SWP_CONTINUED;
+        }
+        list_for_each_entry(list_page, &head->lru, lru) {
+                unsigned char *map;
+                /*
+                 * If the previous map said no continuation, but we've found
+                 * a continuation page, free our allocation and use this one.
+                 */
+                if (!(count & COUNT_CONTINUED))
+                        goto out;
+                map = kmap_atomic(list_page, KM_USER0) + offset;
+                count = *map;
+                kunmap_atomic(map, KM_USER0);
+                /*
+                 * If this continuation count now has some space in it,
+                 * free our allocation and use this one.
+                 */
+                if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
+                        goto out;
+        }
+        list_add_tail(&page->lru, &head->lru);
+        page = NULL;                    /* now it's attached, don't free it */
+out:
+        spin_unlock(&swap_lock);
+outer:
+        if (page)
+                __free_page(page);
+        return 0;
+}
+/*
+ * swap_count_continued - when the original swap_map count is incremented
+ * from SWAP_MAP_MAX, check if there is already a continuation page to carry
+ * into, carry if so, or else fail until a new continuation page is allocated;
+ * when the original swap_map count is decremented from 0 with continuation,
+ * borrow from the continuation and report whether it still holds more.
+ * Called while __swap_duplicate() or swap_entry_free() holds swap_lock.
+ */
+static bool swap_count_continued(struct swap_info_struct *si,
+                                 pgoff_t offset, unsigned char count)
+{
+        struct page *head;
+        struct page *page;
+        unsigned char *map;
+        head = vmalloc_to_page(si->swap_map + offset);
+        if (page_private(head) != SWP_CONTINUED) {
+                BUG_ON(count & COUNT_CONTINUED);
+                return false;           /* need to add count continuation */
+        }
+        offset &= ~PAGE_MASK;
+        page = list_entry(head->lru.next, struct page, lru);
+        map = kmap_atomic(page, KM_USER0) + offset;
+        if (count == SWAP_MAP_MAX)      /* initial increment from swap_map */
+                goto init_map;          /* jump over SWAP_CONT_MAX checks */
+        if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
+                /*
+                 * Think of how you add 1 to 999
+                 */
+                while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
+                        kunmap_atomic(map, KM_USER0);
+                        page = list_entry(page->lru.next, struct page, lru);
+                        BUG_ON(page == head);
+                        map = kmap_atomic(page, KM_USER0) + offset;
+                }
+                if (*map == SWAP_CONT_MAX) {
+                        kunmap_atomic(map, KM_USER0);
+                        page = list_entry(page->lru.next, struct page, lru);
+                        if (page == head)
+                                return false;   /* add count continuation */
+                        map = kmap_atomic(page, KM_USER0) + offset;
+init_map:               *map = 0;               /* we didn't zero the page */
+                }
+                *map += 1;
+                kunmap_atomic(map, KM_USER0);
+                page = list_entry(page->lru.prev, struct page, lru);
+                while (page != head) {
+                        map = kmap_atomic(page, KM_USER0) + offset;
+                        *map = COUNT_CONTINUED;
+                        kunmap_atomic(map, KM_USER0);
+                        page = list_entry(page->lru.prev, struct page, lru);
+                }
+                return true;                    /* incremented */
+        } else {                                /* decrementing */
+                /*
+                 * Think of how you subtract 1 from 1000
+                 */
+                BUG_ON(count != COUNT_CONTINUED);
+                while (*map == COUNT_CONTINUED) {
+                        kunmap_atomic(map, KM_USER0);
+                        page = list_entry(page->lru.next, struct page, lru);
+                        BUG_ON(page == head);
+                        map = kmap_atomic(page, KM_USER0) + offset;
+                }
+                BUG_ON(*map == 0);
+                *map -= 1;
+                if (*map == 0)
+                        count = 0;
+                kunmap_atomic(map, KM_USER0);
+                page = list_entry(page->lru.prev, struct page, lru);
+                while (page != head) {
+                        map = kmap_atomic(page, KM_USER0) + offset;
+                        *map = SWAP_CONT_MAX | count;
+                        count = COUNT_CONTINUED;
+                        kunmap_atomic(map, KM_USER0);
+                        page = list_entry(page->lru.prev, struct page, lru);
+                }
+                return count == COUNT_CONTINUED;
+        }
+}
+/*
+ * free_swap_count_continuations - swapoff frees all the continuation pages
+ * appended to the swap_map, after swap_map is quiesced, before vfree'ing it.
+ */
+static void free_swap_count_continuations(struct swap_info_struct *si)
+{
+        pgoff_t offset;
+        for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
+                struct page *head;
+                head = vmalloc_to_page(si->swap_map + offset);
+                if (page_private(head)) {
+                        struct list_head *this, *next;
+                        list_for_each_safe(this, next, &head->lru) {
+                                struct page *page;
+                                page = list_entry(this, struct page, lru);
+                                list_del(this);
+                                __free_page(page);
+                        }
+                }
+        }
+}
diff --git a/mm/truncate.c b/mm/truncate.c
index 450cebdabfc0..342deee22684 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -272,6 +272,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
                        pagevec_release(&pvec);
                        break;
                }
+                mem_cgroup_uncharge_start();
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];
@@ -286,6 +287,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
                        unlock_page(page);
                }
                pagevec_release(&pvec);
+                mem_cgroup_uncharge_end();
        }
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -327,6 +329,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
        pagevec_init(&pvec, 0);
        while (next <= end &&
                        pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+                mem_cgroup_uncharge_start();
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];
                        pgoff_t index;
@@ -354,6 +357,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
                                break;
                }
                pagevec_release(&pvec);
+                mem_cgroup_uncharge_end();
                cond_resched();
        }
        return ret;
@@ -428,6 +432,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
        while (next <= end && !wrapped &&
                pagevec_lookup(&pvec, mapping, next,
                        min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+                mem_cgroup_uncharge_start();
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];
                        pgoff_t page_index;
@@ -477,6 +482,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
                        unlock_page(page);
                }
                pagevec_release(&pvec);
+                mem_cgroup_uncharge_end();
                cond_resched();
        }
        return ret;
@@ -490,7 +496,7 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
 * Any pages which are found to be mapped into pagetables are unmapped prior to
 * invalidation.
 *
- * Returns -EIO if any pages could not be invalidated.
+ * Returns -EBUSY if any pages could not be invalidated.
 */
 int invalidate_inode_pages2(struct address_space *mapping)
 {
diff --git a/mm/util.c b/mm/util.c
index 7c35ad95f927..b377ce430803 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,6 +4,10 @@
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/sched.h>
+#include <linux/hugetlb.h>
+#include <linux/syscalls.h>
+#include <linux/mman.h>
+#include <linux/file.h>
 #include <asm/uaccess.h>
 #define CREATE_TRACE_POINTS
@@ -268,6 +272,46 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
 }
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
+SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
+                unsigned long, prot, unsigned long, flags,
+                unsigned long, fd, unsigned long, pgoff)
+{
+        struct file * file = NULL;
+        unsigned long retval = -EBADF;
+        if (!(flags & MAP_ANONYMOUS)) {
+                if (unlikely(flags & MAP_HUGETLB))
+                        return -EINVAL;
+                file = fget(fd);
+                if (!file)
+                        goto out;
+        } else if (flags & MAP_HUGETLB) {
+                struct user_struct *user = NULL;
+                /*
+                 * VM_NORESERVE is used because the reservations will be
+                 * taken when vm_ops->mmap() is called.
+                 * A dummy user value is used because we are not locking
+                 * memory, so no accounting is necessary.
+                 */
+                len = ALIGN(len, huge_page_size(&default_hstate));
+                file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE,
+                                                &user, HUGETLB_ANONHUGE_INODE);
+                if (IS_ERR(file))
+                        return PTR_ERR(file);
+        }
+        flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
+        down_write(&current->mm->mmap_sem);
+        retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+        up_write(&current->mm->mmap_sem);
+        if (file)
+                fput(file);
+out:
+        return retval;
+}
 /* Tracepoints definitions. */
 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0f551a4a44cd..37e69295f250 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -761,7 +761,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
        spin_lock(&vbq->lock);
        list_add(&vb->free_list, &vbq->free);
        spin_unlock(&vbq->lock);
-        put_cpu_var(vmap_cpu_blocks);
+        put_cpu_var(vmap_block_queue);
        return vb;
 }
@@ -826,7 +826,7 @@ again:
                }
                spin_unlock(&vb->lock);
        }
-        put_cpu_var(vmap_cpu_blocks);
+        put_cpu_var(vmap_block_queue);
        rcu_read_unlock();
        if (!addr) {
@@ -1411,6 +1411,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 {
        struct page **pages;
        unsigned int nr_pages, array_size, i;
+        gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
        nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
        array_size = (nr_pages * sizeof(struct page *));
@@ -1418,13 +1419,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
        area->nr_pages = nr_pages;
        /* Please note that the recursion is strictly bounded. */
        if (array_size > PAGE_SIZE) {
-                pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO,
+                pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
                                PAGE_KERNEL, node, caller);
                area->flags |= VM_VPAGES;
        } else {
-                pages = kmalloc_node(array_size,
+                pages = kmalloc_node(array_size, nested_gfp, node);
-                                (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO,
-                                node);
        }
        area->pages = pages;
        area->caller = caller;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 777af57fd8c8..885207a6b6b7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -55,6 +55,11 @@ struct scan_control {
        /* Number of pages freed so far during a call to shrink_zones() */
        unsigned long nr_reclaimed;
+        /* How many pages shrink_list() should reclaim */
+        unsigned long nr_to_reclaim;
+        unsigned long hibernation_mode;
        /* This context's GFP mask */
        gfp_t gfp_mask;
@@ -66,12 +71,6 @@ struct scan_control {
        /* Can pages be swapped as part of reclaim? */
        int may_swap;
-        /* This context's SWAP_CLUSTER_MAX. If freeing memory for
-         * suspend, we effectively ignore SWAP_CLUSTER_MAX.
-         * In this context, it doesn't matter that we scan the
-         * whole list at once. */
-        int swap_cluster_max;
        int swappiness;
        int all_unreclaimable;
@@ -358,7 +357,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
         * stalls if we need to run get_block().  We could test
         * PagePrivate for that.
         *
-         * If this process is currently in generic_file_write() against
+         * If this process is currently in __generic_file_aio_write() against
         * this page's queue, we can perform writeback even if that
         * will block.
         *
@@ -1132,7 +1131,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                unsigned long nr_anon;
                unsigned long nr_file;
-                nr_taken = sc->isolate_pages(sc->swap_cluster_max,
+                nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX,
                             &page_list, &nr_scan, sc->order, mode,
                                zone, sc->mem_cgroup, 0, file);
@@ -1166,10 +1165,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
                __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
-                reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
+                reclaim_stat->recent_scanned[0] += nr_anon;
-                reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
+                reclaim_stat->recent_scanned[1] += nr_file;
-                reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE];
-                reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE];
                spin_unlock_irq(&zone->lru_lock);
@@ -1464,20 +1461,26 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
        return low;
 }
+static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
+                                int file)
+{
+        if (file)
+                return inactive_file_is_low(zone, sc);
+        else
+                return inactive_anon_is_low(zone, sc);
+}
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
        struct zone *zone, struct scan_control *sc, int priority)
 {
        int file = is_file_lru(lru);
-        if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) {
+        if (is_active_lru(lru)) {
-                shrink_active_list(nr_to_scan, zone, sc, priority, file);
+                if (inactive_list_is_low(zone, sc, file))
+                    shrink_active_list(nr_to_scan, zone, sc, priority, file);
                return 0;
        }
-        if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) {
-                shrink_active_list(nr_to_scan, zone, sc, priority, file);
-                return 0;
-        }
        return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
 }
@@ -1567,15 +1570,14 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 * until we collected @swap_cluster_max pages to scan.
 */
 static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
-                                       unsigned long *nr_saved_scan,
+                                       unsigned long *nr_saved_scan)
-                                       unsigned long swap_cluster_max)
 {
        unsigned long nr;
        *nr_saved_scan += nr_to_scan;
        nr = *nr_saved_scan;
-        if (nr >= swap_cluster_max)
+        if (nr >= SWAP_CLUSTER_MAX)
                *nr_saved_scan = 0;
        else
                nr = 0;
@@ -1594,7 +1596,7 @@ static void shrink_zone(int priority, struct zone *zone,
        unsigned long percent[2];       /* anon @ 0; file @ 1 */
        enum lru_list l;
        unsigned long nr_reclaimed = sc->nr_reclaimed;
-        unsigned long swap_cluster_max = sc->swap_cluster_max;
+        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
        int noswap = 0;
@@ -1616,15 +1618,15 @@ static void shrink_zone(int priority, struct zone *zone,
                        scan = (scan * percent[file]) / 100;
                }
                nr[l] = nr_scan_try_batch(scan,
-                                          &reclaim_stat->nr_saved_scan[l],
+                                          &reclaim_stat->nr_saved_scan[l]);
-                                          swap_cluster_max);
        }
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                                        nr[LRU_INACTIVE_FILE]) {
                for_each_evictable_lru(l) {
                        if (nr[l]) {
-                                nr_to_scan = min(nr[l], swap_cluster_max);
+                                nr_to_scan = min_t(unsigned long,
+                                                   nr[l], SWAP_CLUSTER_MAX);
                                nr[l] -= nr_to_scan;
                                nr_reclaimed += shrink_list(l, nr_to_scan,
@@ -1639,8 +1641,7 @@ static void shrink_zone(int priority, struct zone *zone,
                 * with multiple processes reclaiming pages, the total
                 * freeing target can get unreasonably large.
                 */
-                if (nr_reclaimed > swap_cluster_max &&
+                if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
-                        priority < DEF_PRIORITY && !current_is_kswapd())
                        break;
        }
@@ -1738,6 +1739,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
        struct zoneref *z;
        struct zone *zone;
        enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
+        unsigned long writeback_threshold;
        delayacct_freepages_start();
@@ -1773,7 +1775,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                        }
                }
                total_scanned += sc->nr_scanned;
-                if (sc->nr_reclaimed >= sc->swap_cluster_max) {
+                if (sc->nr_reclaimed >= sc->nr_to_reclaim) {
                        ret = sc->nr_reclaimed;
                        goto out;
                }
@@ -1785,14 +1787,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                 * that's undesirable in laptop mode, where we *want* lumpy
                 * writeout.  So in laptop mode, write out the whole world.
                 */
-                if (total_scanned > sc->swap_cluster_max +
+                writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
-                                        sc->swap_cluster_max / 2) {
+                if (total_scanned > writeback_threshold) {
                        wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
                        sc->may_writepage = 1;
                }
                /* Take a nap, wait for some writeback to complete */
-                if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
+                if (!sc->hibernation_mode && sc->nr_scanned &&
+                    priority < DEF_PRIORITY - 2)
                        congestion_wait(BLK_RW_ASYNC, HZ/10);
        }
        /* top priority shrink_zones still had more to do? don't OOM, then */
@@ -1831,7 +1834,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
        struct scan_control sc = {
                .gfp_mask = gfp_mask,
                .may_writepage = !laptop_mode,
-                .swap_cluster_max = SWAP_CLUSTER_MAX,
+                .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .may_unmap = 1,
                .may_swap = 1,
                .swappiness = vm_swappiness,
@@ -1855,7 +1858,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
                .may_writepage = !laptop_mode,
                .may_unmap = 1,
                .may_swap = !noswap,
-                .swap_cluster_max = SWAP_CLUSTER_MAX,
                .swappiness = swappiness,
                .order = 0,
                .mem_cgroup = mem,
@@ -1889,7 +1891,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                .may_writepage = !laptop_mode,
                .may_unmap = 1,
                .may_swap = !noswap,
-                .swap_cluster_max = SWAP_CLUSTER_MAX,
+                .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .swappiness = swappiness,
                .order = 0,
                .mem_cgroup = mem_cont,
@@ -1904,6 +1906,30 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 }
 #endif
+/* is kswapd sleeping prematurely? */
+static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
+{
+        int i;
+        /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
+        if (remaining)
+                return 1;
+        /* If after HZ/10, a zone is below the high mark, it's premature */
+        for (i = 0; i < pgdat->nr_zones; i++) {
+                struct zone *zone = pgdat->node_zones + i;
+                if (!populated_zone(zone))
+                        continue;
+                if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
+                                                                0, 0))
+                        return 1;
+        }
+        return 0;
+}
 /*
 * For kswapd, balance_pgdat() will work across all this node's zones until
 * they are all at high_wmark_pages(zone).
@@ -1936,7 +1962,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
                .gfp_mask = GFP_KERNEL,
                .may_unmap = 1,
                .may_swap = 1,
-                .swap_cluster_max = SWAP_CLUSTER_MAX,
+                /*
+                 * kswapd must not bail out of reclaim early, because
+                 * we want to put equal scanning pressure on each zone.
+                 */
+                .nr_to_reclaim = ULONG_MAX,
                .swappiness = vm_swappiness,
                .order = order,
                .mem_cgroup = NULL,
@@ -1961,6 +1991,7 @@ loop_again:
        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
                int end_zone = 0;       /* Inclusive.  0 = ZONE_DMA */
                unsigned long lru_pages = 0;
+                int has_under_min_watermark_zone = 0;
                /* The swap token gets in the way of swapout... */
                if (!priority)
@@ -2067,6 +2098,15 @@ loop_again:
                        if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
                            total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
                                sc.may_writepage = 1;
+                        /*
+                         * We are still under the min watermark, which means we
+                         * risk GFP_ATOMIC allocation failures.  Hurry up!
+                         */
+                        if (!zone_watermark_ok(zone, order, min_wmark_pages(zone),
+                                              end_zone, 0))
+                                has_under_min_watermark_zone = 1;
                }
                if (all_zones_ok)
                        break;          /* kswapd: all done */
@@ -2074,8 +2114,12 @@ loop_again:
                 * OK, kswapd is getting into trouble.  Take a nap, then take
                 * another pass across the zones.
                 */
-                if (total_scanned && priority < DEF_PRIORITY - 2)
+                if (total_scanned && (priority < DEF_PRIORITY - 2)) {
-                        congestion_wait(BLK_RW_ASYNC, HZ/10);
+                        if (has_under_min_watermark_zone)
+                                count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
+                        else
+                                congestion_wait(BLK_RW_ASYNC, HZ/10);
+                }
                /*
                 * We do this so kswapd doesn't build up large priorities for
@@ -2173,6 +2217,7 @@ static int kswapd(void *p)
        order = 0;
        for ( ; ; ) {
                unsigned long new_order;
+                int ret;
                prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
                new_order = pgdat->kswapd_max_order;
@@ -2184,19 +2229,45 @@ static int kswapd(void *p)
                         */
                        order = new_order;
                } else {
-                        if (!freezing(current))
+                        if (!freezing(current) && !kthread_should_stop()) {
-                                schedule();
+                                long remaining = 0;
+                                /* Try to sleep for a short interval */
+                                if (!sleeping_prematurely(pgdat, order, remaining)) {
+                                        remaining = schedule_timeout(HZ/10);
+                                        finish_wait(&pgdat->kswapd_wait, &wait);
+                                        prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
+                                }
+                                /*
+                                 * After a short sleep, check if it was a
+                                 * premature sleep. If not, then go fully
+                                 * to sleep until explicitly woken up
+                                 */
+                                if (!sleeping_prematurely(pgdat, order, remaining))
+                                        schedule();
+                                else {
+                                        if (remaining)
+                                                count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
+                                        else
+                                                count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
+                                }
+                        }
                        order = pgdat->kswapd_max_order;
                }
                finish_wait(&pgdat->kswapd_wait, &wait);
-                if (!try_to_freeze()) {
+                ret = try_to_freeze();
-                        /* We can speed up thawing tasks if we don't call
+                if (kthread_should_stop())
-                         * balance_pgdat after returning from the refrigerator
+                        break;
-                         */
+                /*
+                 * We can speed up thawing tasks if we don't call balance_pgdat
+                 * after returning from the refrigerator
+                 */
+                if (!ret)
                        balance_pgdat(pgdat, order);
-                }
        }
        return 0;
 }
@@ -2260,148 +2331,43 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
 #ifdef CONFIG_HIBERNATION
 /*
- * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
+ * Try to free `nr_to_reclaim' pages, system-wide, and return the number of
- * from LRU lists system-wide, for given pass and priority.
- *
- * For pass > 3 we also try to shrink the LRU lists that contain a few pages
- */
-static void shrink_all_zones(unsigned long nr_pages, int prio,
-                                      int pass, struct scan_control *sc)
-{
-        struct zone *zone;
-        unsigned long nr_reclaimed = 0;
-        struct zone_reclaim_stat *reclaim_stat;
-        for_each_populated_zone(zone) {
-                enum lru_list l;
-                if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
-                        continue;
-                for_each_evictable_lru(l) {
-                        enum zone_stat_item ls = NR_LRU_BASE + l;
-                        unsigned long lru_pages = zone_page_state(zone, ls);
-                        /* For pass = 0, we don't shrink the active list */
-                        if (pass == 0 && (l == LRU_ACTIVE_ANON ||
-                                                l == LRU_ACTIVE_FILE))
-                                continue;
-                        reclaim_stat = get_reclaim_stat(zone, sc);
-                        reclaim_stat->nr_saved_scan[l] +=
-                                                (lru_pages >> prio) + 1;
-                        if (reclaim_stat->nr_saved_scan[l]
-                                                >= nr_pages || pass > 3) {
-                                unsigned long nr_to_scan;
-                                reclaim_stat->nr_saved_scan[l] = 0;
-                                nr_to_scan = min(nr_pages, lru_pages);
-                                nr_reclaimed += shrink_list(l, nr_to_scan, zone,
-                                                                sc, prio);
-                                if (nr_reclaimed >= nr_pages) {
-                                        sc->nr_reclaimed += nr_reclaimed;
-                                        return;
-                                }
-                        }
-                }
-        }
-        sc->nr_reclaimed += nr_reclaimed;
-}
-/*
- * Try to free `nr_pages' of memory, system-wide, and return the number of
 * freed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */
-unsigned long shrink_all_memory(unsigned long nr_pages)
+unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 {
-        unsigned long lru_pages, nr_slab;
-        int pass;
        struct reclaim_state reclaim_state;
        struct scan_control sc = {
-                .gfp_mask = GFP_KERNEL,
+                .gfp_mask = GFP_HIGHUSER_MOVABLE,
-                .may_unmap = 0,
+                .may_swap = 1,
+                .may_unmap = 1,
                .may_writepage = 1,
+                .nr_to_reclaim = nr_to_reclaim,
+                .hibernation_mode = 1,
+                .swappiness = vm_swappiness,
+                .order = 0,
                .isolate_pages = isolate_pages_global,
-                .nr_reclaimed = 0,
        };
+        struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
+        struct task_struct *p = current;
+        unsigned long nr_reclaimed;
-        current->reclaim_state = &reclaim_state;
+        p->flags |= PF_MEMALLOC;
+        lockdep_set_current_reclaim_state(sc.gfp_mask);
-        lru_pages = global_reclaimable_pages();
+        reclaim_state.reclaimed_slab = 0;
-        nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
+        p->reclaim_state = &reclaim_state;
-        /* If slab caches are huge, it's better to hit them first */
-        while (nr_slab >= lru_pages) {
-                reclaim_state.reclaimed_slab = 0;
-                shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
-                if (!reclaim_state.reclaimed_slab)
-                        break;
-                sc.nr_reclaimed += reclaim_state.reclaimed_slab;
-                if (sc.nr_reclaimed >= nr_pages)
-                        goto out;
-                nr_slab -= reclaim_state.reclaimed_slab;
-        }
-        /*
-         * We try to shrink LRUs in 5 passes:
-         * 0 = Reclaim from inactive_list only
-         * 1 = Reclaim from active list but don't reclaim mapped
-         * 2 = 2nd pass of type 1
-         * 3 = Reclaim mapped (normal reclaim)
-         * 4 = 2nd pass of type 3
-         */
-        for (pass = 0; pass < 5; pass++) {
-                int prio;
-                /* Force reclaiming mapped pages in the passes #3 and #4 */
-                if (pass > 2)
-                        sc.may_unmap = 1;
-                for (prio = DEF_PRIORITY; prio >= 0; prio--) {
-                        unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed;
-                        sc.nr_scanned = 0;
-                        sc.swap_cluster_max = nr_to_scan;
-                        shrink_all_zones(nr_to_scan, prio, pass, &sc);
-                        if (sc.nr_reclaimed >= nr_pages)
-                                goto out;
-                        reclaim_state.reclaimed_slab = 0;
-                        shrink_slab(sc.nr_scanned, sc.gfp_mask,
-                                    global_reclaimable_pages());
-                        sc.nr_reclaimed += reclaim_state.reclaimed_slab;
-                        if (sc.nr_reclaimed >= nr_pages)
-                                goto out;
-                        if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
-                                congestion_wait(BLK_RW_ASYNC, HZ / 10);
-                }
-        }
-        /*
-         * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be
-         * something in slab caches
-         */
-        if (!sc.nr_reclaimed) {
-                do {
-                        reclaim_state.reclaimed_slab = 0;
-                        shrink_slab(nr_pages, sc.gfp_mask,
-                                    global_reclaimable_pages());
-                        sc.nr_reclaimed += reclaim_state.reclaimed_slab;
-                } while (sc.nr_reclaimed < nr_pages &&
-                                reclaim_state.reclaimed_slab > 0);
-        }
+        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
-out:
+        p->reclaim_state = NULL;
-        current->reclaim_state = NULL;
+        lockdep_clear_current_reclaim_state();
+        p->flags &= ~PF_MEMALLOC;
-        return sc.nr_reclaimed;
+        return nr_reclaimed;
 }
 #endif /* CONFIG_HIBERNATION */
@@ -2451,6 +2417,17 @@ int kswapd_run(int nid)
        return ret;
 }
+/*
+ * Stop the kswapd thread recorded for node `nid', if there is one.
+ *
+ * Called by memory hotplug when all memory in a node is offlined.
+ */
+void kswapd_stop(int nid)
+{
+        struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
+        if (kswapd) {
+                kthread_stop(kswapd);
+                /*
+                 * Do not leave a stale task pointer behind once the thread
+                 * has been stopped.
+                 * NOTE(review): presumably kswapd_run() tests this pointer to
+                 * decide whether a thread must be started -- confirm.
+                 */
+                NODE_DATA(nid)->kswapd = NULL;
+        }
+}
 static int __init kswapd_init(void)
 {
        int nid;
@@ -2553,8 +2530,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
                .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
                .may_swap = 1,
-                .swap_cluster_max = max_t(unsigned long, nr_pages,
+                .nr_to_reclaim = max_t(unsigned long, nr_pages,
-                                        SWAP_CLUSTER_MAX),
+                                       SWAP_CLUSTER_MAX),
                .gfp_mask = gfp_mask,
                .swappiness = vm_swappiness,
                .order = order,
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c81321f9feec..6051fbab67ba 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -683,6 +683,9 @@ static const char * const vmstat_text[] = {
        "slabs_scanned",
        "kswapd_steal",
        "kswapd_inodesteal",
+        "kswapd_low_wmark_hit_quickly",
+        "kswapd_high_wmark_hit_quickly",
+        "kswapd_skip_congestion_wait",
        "pageoutrun",
        "allocstall",
@@ -883,11 +886,10 @@ static void vmstat_update(struct work_struct *w)
 static void __cpuinit start_cpu_timer(int cpu)
 {
-        struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu);
+        struct delayed_work *work = &per_cpu(vmstat_work, cpu);
-        INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update);
+        INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update);
-        schedule_delayed_work_on(cpu, vmstat_work,
+        schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
-                                 __round_jiffies_relative(HZ, cpu));
 }
 /*