path: root/mm
author     Ingo Molnar <mingo@elte.hu>    2008-10-28 11:26:12 -0400
committer  Ingo Molnar <mingo@elte.hu>    2008-10-28 11:26:12 -0400
commit     7a9787e1eba95a166265e6a260cf30af04ef0a99 (patch)
tree       e730a4565e0318140d2fbd2f0415d18a339d7336 /mm
parent     41b9eb264c8407655db57b60b4457fe1b2ec9977 (diff)
parent     0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff)
Merge commit 'v2.6.28-rc2' into x86/pci-ioapic-boot-irq-quirks
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |   23
-rw-r--r--  mm/Makefile          |    6
-rw-r--r--  mm/allocpercpu.c     |   24
-rw-r--r--  mm/bootmem.c         |  948
-rw-r--r--  mm/bounce.c          |    2
-rw-r--r--  mm/fadvise.c         |    2
-rw-r--r--  mm/filemap.c         |  474
-rw-r--r--  mm/filemap_xip.c     |   70
-rw-r--r--  mm/fremap.c          |   30
-rw-r--r--  mm/highmem.c         |    6
-rw-r--r--  mm/hugetlb.c         | 1733
-rw-r--r--  mm/internal.h        |  192
-rw-r--r--  mm/madvise.c         |    4
-rw-r--r--  mm/memcontrol.c      |  772
-rw-r--r--  mm/memory.c          |  421
-rw-r--r--  mm/memory_hotplug.c  |   99
-rw-r--r--  mm/mempolicy.c       |   21
-rw-r--r--  mm/migrate.c         |  323
-rw-r--r--  mm/mlock.c           |  445
-rw-r--r--  mm/mm_init.c         |  152
-rw-r--r--  mm/mmap.c            |  267
-rw-r--r--  mm/mmu_notifier.c    |  277
-rw-r--r--  mm/mmzone.c          |    2
-rw-r--r--  mm/mprotect.c        |    9
-rw-r--r--  mm/mremap.c          |   14
-rw-r--r--  mm/nommu.c           |   69
-rw-r--r--  mm/oom_kill.c        |    6
-rw-r--r--  mm/page-writeback.c  |   34
-rw-r--r--  mm/page_alloc.c      |  304
-rw-r--r--  mm/page_cgroup.c     |  256
-rw-r--r--  mm/page_isolation.c  |   13
-rw-r--r--  mm/pdflush.c         |    6
-rw-r--r--  mm/quicklist.c       |    9
-rw-r--r--  mm/readahead.c       |   10
-rw-r--r--  mm/rmap.c            |  380
-rw-r--r--  mm/shmem.c           |  118
-rw-r--r--  mm/shmem_acl.c       |    2
-rw-r--r--  mm/slab.c            |   64
-rw-r--r--  mm/slob.c            |   28
-rw-r--r--  mm/slub.c            |  149
-rw-r--r--  mm/sparse.c          |  116
-rw-r--r--  mm/swap.c            |  183
-rw-r--r--  mm/swap_state.c      |   47
-rw-r--r--  mm/swapfile.c        |   90
-rw-r--r--  mm/tiny-shmem.c      |   27
-rw-r--r--  mm/truncate.c        |   22
-rw-r--r--  mm/util.c            |   70
-rw-r--r--  mm/vmalloc.c         | 1056
-rw-r--r--  mm/vmscan.c          | 1117
-rw-r--r--  mm/vmstat.c          |  124
50 files changed, 7869 insertions(+), 2747 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index c4de85285bb4..5b5790f8a816 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -101,7 +101,7 @@ config HAVE_MEMORY_PRESENT
101# with gcc 3.4 and later. 101# with gcc 3.4 and later.
102# 102#
103config SPARSEMEM_STATIC 103config SPARSEMEM_STATIC
104 def_bool n 104 bool
105 105
106# 106#
107# Architecture platforms which require a two level mem_section in SPARSEMEM 107# Architecture platforms which require a two level mem_section in SPARSEMEM
@@ -113,7 +113,7 @@ config SPARSEMEM_EXTREME
113 depends on SPARSEMEM && !SPARSEMEM_STATIC 113 depends on SPARSEMEM && !SPARSEMEM_STATIC
114 114
115config SPARSEMEM_VMEMMAP_ENABLE 115config SPARSEMEM_VMEMMAP_ENABLE
116 def_bool n 116 bool
117 117
118config SPARSEMEM_VMEMMAP 118config SPARSEMEM_VMEMMAP
119 bool "Sparse Memory virtual memmap" 119 bool "Sparse Memory virtual memmap"
@@ -174,7 +174,7 @@ config SPLIT_PTLOCK_CPUS
174config MIGRATION 174config MIGRATION
175 bool "Page migration" 175 bool "Page migration"
176 def_bool y 176 def_bool y
177 depends on NUMA 177 depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
178 help 178 help
179 Allows the migration of the physical location of pages of processes 179 Allows the migration of the physical location of pages of processes
180 while the virtual addresses are not changed. This is useful for 180 while the virtual addresses are not changed. This is useful for
@@ -187,6 +187,9 @@ config RESOURCES_64BIT
187 help 187 help
188 This option allows memory and IO resources to be 64 bit. 188 This option allows memory and IO resources to be 64 bit.
189 189
190config PHYS_ADDR_T_64BIT
191 def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
192
190config ZONE_DMA_FLAG 193config ZONE_DMA_FLAG
191 int 194 int
192 default "0" if !ZONE_DMA 195 default "0" if !ZONE_DMA
@@ -205,3 +208,17 @@ config NR_QUICK
205config VIRT_TO_BUS 208config VIRT_TO_BUS
206 def_bool y 209 def_bool y
207 depends on !ARCH_NO_VIRT_TO_BUS 210 depends on !ARCH_NO_VIRT_TO_BUS
211
212config UNEVICTABLE_LRU
213 bool "Add LRU list to track non-evictable pages"
214 default y
215 depends on MMU
216 help
217 Keeps unevictable pages off of the active and inactive pageout
218 lists, so kswapd will not waste CPU time or have its balancing
219 algorithms thrown off by scanning these pages. Selecting this
220 will use one page flag and increase the code size a little,
221 say Y unless you know what you are doing.
222
223config MMU_NOTIFIER
224 bool
diff --git a/mm/Makefile b/mm/Makefile
index 18c143b3c46c..c06b45a1ff5f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o pdflush.o \ 11 maccess.o page_alloc.o page-writeback.o pdflush.o \
12 readahead.o swap.o truncate.o vmscan.o \ 12 readahead.o swap.o truncate.o vmscan.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 page_isolation.o $(mmu-y) 14 page_isolation.o mm_init.o $(mmu-y)
15 15
16obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o 16obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
17obj-$(CONFIG_BOUNCE) += bounce.o 17obj-$(CONFIG_BOUNCE) += bounce.o
@@ -25,6 +25,7 @@ obj-$(CONFIG_SHMEM) += shmem.o
25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o 25obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
26obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 26obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
27obj-$(CONFIG_SLOB) += slob.o 27obj-$(CONFIG_SLOB) += slob.o
28obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
28obj-$(CONFIG_SLAB) += slab.o 29obj-$(CONFIG_SLAB) += slab.o
29obj-$(CONFIG_SLUB) += slub.o 30obj-$(CONFIG_SLUB) += slub.o
30obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 31obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
@@ -32,5 +33,4 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
32obj-$(CONFIG_MIGRATION) += migrate.o 33obj-$(CONFIG_MIGRATION) += migrate.o
33obj-$(CONFIG_SMP) += allocpercpu.o 34obj-$(CONFIG_SMP) += allocpercpu.o
34obj-$(CONFIG_QUICKLIST) += quicklist.o 35obj-$(CONFIG_QUICKLIST) += quicklist.o
35obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o 36obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
36
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 05f2b4009ccc..4297bc41bfd2 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -18,27 +18,28 @@
18 * Depopulating per-cpu data for a cpu going offline would be a typical 18 * Depopulating per-cpu data for a cpu going offline would be a typical
19 * use case. You need to register a cpu hotplug handler for that purpose. 19 * use case. You need to register a cpu hotplug handler for that purpose.
20 */ 20 */
21void percpu_depopulate(void *__pdata, int cpu) 21static void percpu_depopulate(void *__pdata, int cpu)
22{ 22{
23 struct percpu_data *pdata = __percpu_disguise(__pdata); 23 struct percpu_data *pdata = __percpu_disguise(__pdata);
24 24
25 kfree(pdata->ptrs[cpu]); 25 kfree(pdata->ptrs[cpu]);
26 pdata->ptrs[cpu] = NULL; 26 pdata->ptrs[cpu] = NULL;
27} 27}
28EXPORT_SYMBOL_GPL(percpu_depopulate);
29 28
30/** 29/**
31 * percpu_depopulate_mask - depopulate per-cpu data for some cpu's 30 * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
32 * @__pdata: per-cpu data to depopulate 31 * @__pdata: per-cpu data to depopulate
33 * @mask: depopulate per-cpu data for cpu's selected through mask bits 32 * @mask: depopulate per-cpu data for cpu's selected through mask bits
34 */ 33 */
35void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) 34static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
36{ 35{
37 int cpu; 36 int cpu;
38 for_each_cpu_mask(cpu, *mask) 37 for_each_cpu_mask_nr(cpu, *mask)
39 percpu_depopulate(__pdata, cpu); 38 percpu_depopulate(__pdata, cpu);
40} 39}
41EXPORT_SYMBOL_GPL(__percpu_depopulate_mask); 40
41#define percpu_depopulate_mask(__pdata, mask) \
42 __percpu_depopulate_mask((__pdata), &(mask))
42 43
43/** 44/**
44 * percpu_populate - populate per-cpu data for given cpu 45 * percpu_populate - populate per-cpu data for given cpu
@@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
51 * use case. You need to register a cpu hotplug handler for that purpose. 52 * use case. You need to register a cpu hotplug handler for that purpose.
52 * Per-cpu object is populated with zeroed buffer. 53 * Per-cpu object is populated with zeroed buffer.
53 */ 54 */
54void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) 55static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
55{ 56{
56 struct percpu_data *pdata = __percpu_disguise(__pdata); 57 struct percpu_data *pdata = __percpu_disguise(__pdata);
57 int node = cpu_to_node(cpu); 58 int node = cpu_to_node(cpu);
@@ -68,7 +69,6 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
68 pdata->ptrs[cpu] = kzalloc(size, gfp); 69 pdata->ptrs[cpu] = kzalloc(size, gfp);
69 return pdata->ptrs[cpu]; 70 return pdata->ptrs[cpu];
70} 71}
71EXPORT_SYMBOL_GPL(percpu_populate);
72 72
73/** 73/**
74 * percpu_populate_mask - populate per-cpu data for more cpu's 74 * percpu_populate_mask - populate per-cpu data for more cpu's
@@ -79,14 +79,14 @@ EXPORT_SYMBOL_GPL(percpu_populate);
79 * 79 *
80 * Per-cpu objects are populated with zeroed buffers. 80 * Per-cpu objects are populated with zeroed buffers.
81 */ 81 */
82int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, 82static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
83 cpumask_t *mask) 83 cpumask_t *mask)
84{ 84{
85 cpumask_t populated; 85 cpumask_t populated;
86 int cpu; 86 int cpu;
87 87
88 cpus_clear(populated); 88 cpus_clear(populated);
89 for_each_cpu_mask(cpu, *mask) 89 for_each_cpu_mask_nr(cpu, *mask)
90 if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { 90 if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
91 __percpu_depopulate_mask(__pdata, &populated); 91 __percpu_depopulate_mask(__pdata, &populated);
92 return -ENOMEM; 92 return -ENOMEM;
@@ -94,7 +94,9 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
94 cpu_set(cpu, populated); 94 cpu_set(cpu, populated);
95 return 0; 95 return 0;
96} 96}
97EXPORT_SYMBOL_GPL(__percpu_populate_mask); 97
98#define percpu_populate_mask(__pdata, size, gfp, mask) \
99 __percpu_populate_mask((__pdata), (size), (gfp), &(mask))
98 100
99/** 101/**
100 * percpu_alloc_mask - initial setup of per-cpu data 102 * percpu_alloc_mask - initial setup of per-cpu data
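
Aside, as an illustration only (not part of the patch): a userspace sketch of the populate/rollback idea behind __percpu_populate_mask() above. A plain unsigned long bitmask and calloc() stand in for cpumask_t and kzalloc(); NR_CPUS and the helper names are local to this sketch.

#include <errno.h>
#include <stdlib.h>

#define NR_CPUS	8

struct percpu_data {
	void *ptrs[NR_CPUS];
};

static void percpu_depopulate(struct percpu_data *pdata, int cpu)
{
	free(pdata->ptrs[cpu]);
	pdata->ptrs[cpu] = NULL;
}

static void percpu_depopulate_mask(struct percpu_data *pdata,
				   unsigned long mask)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		if (mask & (1UL << cpu))
			percpu_depopulate(pdata, cpu);
}

static int percpu_populate_mask(struct percpu_data *pdata, size_t size,
				unsigned long mask)
{
	unsigned long populated = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!(mask & (1UL << cpu)))
			continue;
		pdata->ptrs[cpu] = calloc(1, size);	/* kzalloc() analogue */
		if (!pdata->ptrs[cpu]) {
			/* roll back only the CPUs populated so far */
			percpu_depopulate_mask(pdata, populated);
			return -ENOMEM;
		}
		populated |= 1UL << cpu;
	}
	return 0;
}

int main(void)
{
	struct percpu_data pd = { { NULL } };

	if (percpu_populate_mask(&pd, 64, 0x5) == 0)	/* CPUs 0 and 2 */
		percpu_depopulate_mask(&pd, 0x5);
	return 0;
}
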
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 8d9f60e06f62..ac5a891f142a 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * linux/mm/bootmem.c 2 * bootmem - A boot-time physical memory allocator and configurator
3 * 3 *
4 * Copyright (C) 1999 Ingo Molnar 4 * Copyright (C) 1999 Ingo Molnar
5 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 5 * 1999 Kanoj Sarcar, SGI
6 * 2008 Johannes Weiner
6 * 7 *
7 * simple boot-time physical memory area allocator and 8 * Access to this subsystem has to be serialized externally (which is true
8 * free memory collector. It's used to deal with reserved 9 * for the boot process anyway).
9 * system memory and memory holes as well.
10 */ 10 */
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/pfn.h> 12#include <linux/pfn.h>
@@ -19,15 +19,10 @@
19 19
20#include "internal.h" 20#include "internal.h"
21 21
22/*
23 * Access to this subsystem has to be serialized externally. (this is
24 * true for the boot process anyway)
25 */
26unsigned long max_low_pfn; 22unsigned long max_low_pfn;
27unsigned long min_low_pfn; 23unsigned long min_low_pfn;
28unsigned long max_pfn; 24unsigned long max_pfn;
29 25
30static LIST_HEAD(bdata_list);
31#ifdef CONFIG_CRASH_DUMP 26#ifdef CONFIG_CRASH_DUMP
32/* 27/*
33 * If we have booted due to a crash, max_pfn will be a very low value. We need 28 * If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -36,63 +31,72 @@ static LIST_HEAD(bdata_list);
36unsigned long saved_max_pfn; 31unsigned long saved_max_pfn;
37#endif 32#endif
38 33
39/* return the number of _pages_ that will be allocated for the boot bitmap */ 34bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
40unsigned long __init bootmem_bootmap_pages(unsigned long pages) 35
36static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
37
38static int bootmem_debug;
39
40static int __init bootmem_debug_setup(char *buf)
41{ 41{
42 unsigned long mapsize; 42 bootmem_debug = 1;
43 return 0;
44}
45early_param("bootmem_debug", bootmem_debug_setup);
43 46
44 mapsize = (pages+7)/8; 47#define bdebug(fmt, args...) ({ \
45 mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; 48 if (unlikely(bootmem_debug)) \
46 mapsize >>= PAGE_SHIFT; 49 printk(KERN_INFO \
50 "bootmem::%s " fmt, \
51 __func__, ## args); \
52})
47 53
48 return mapsize; 54static unsigned long __init bootmap_bytes(unsigned long pages)
55{
56 unsigned long bytes = (pages + 7) / 8;
57
58 return ALIGN(bytes, sizeof(long));
49} 59}
50 60
51/* 61/**
52 * link bdata in order 62 * bootmem_bootmap_pages - calculate bitmap size in pages
63 * @pages: number of pages the bitmap has to represent
53 */ 64 */
54static void __init link_bootmem(bootmem_data_t *bdata) 65unsigned long __init bootmem_bootmap_pages(unsigned long pages)
55{ 66{
56 bootmem_data_t *ent; 67 unsigned long bytes = bootmap_bytes(pages);
57 68
58 if (list_empty(&bdata_list)) { 69 return PAGE_ALIGN(bytes) >> PAGE_SHIFT;
59 list_add(&bdata->list, &bdata_list);
60 return;
61 }
62 /* insert in order */
63 list_for_each_entry(ent, &bdata_list, list) {
64 if (bdata->node_boot_start < ent->node_boot_start) {
65 list_add_tail(&bdata->list, &ent->list);
66 return;
67 }
68 }
69 list_add_tail(&bdata->list, &bdata_list);
70} 70}
71 71
72/* 72/*
73 * Given an initialised bdata, it returns the size of the boot bitmap 73 * link bdata in order
74 */ 74 */
75static unsigned long __init get_mapsize(bootmem_data_t *bdata) 75static void __init link_bootmem(bootmem_data_t *bdata)
76{ 76{
77 unsigned long mapsize; 77 struct list_head *iter;
78 unsigned long start = PFN_DOWN(bdata->node_boot_start);
79 unsigned long end = bdata->node_low_pfn;
80 78
81 mapsize = ((end - start) + 7) / 8; 79 list_for_each(iter, &bdata_list) {
82 return ALIGN(mapsize, sizeof(long)); 80 bootmem_data_t *ent;
81
82 ent = list_entry(iter, bootmem_data_t, list);
83 if (bdata->node_min_pfn < ent->node_min_pfn)
84 break;
85 }
86 list_add_tail(&bdata->list, iter);
83} 87}
84 88
85/* 89/*
86 * Called once to set up the allocator itself. 90 * Called once to set up the allocator itself.
87 */ 91 */
88static unsigned long __init init_bootmem_core(pg_data_t *pgdat, 92static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
89 unsigned long mapstart, unsigned long start, unsigned long end) 93 unsigned long mapstart, unsigned long start, unsigned long end)
90{ 94{
91 bootmem_data_t *bdata = pgdat->bdata;
92 unsigned long mapsize; 95 unsigned long mapsize;
93 96
97 mminit_validate_memmodel_limits(&start, &end);
94 bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); 98 bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
95 bdata->node_boot_start = PFN_PHYS(start); 99 bdata->node_min_pfn = start;
96 bdata->node_low_pfn = end; 100 bdata->node_low_pfn = end;
97 link_bootmem(bdata); 101 link_bootmem(bdata);
98 102
@@ -100,429 +104,484 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
100 * Initially all pages are reserved - setup_arch() has to 104 * Initially all pages are reserved - setup_arch() has to
101 * register free RAM areas explicitly. 105 * register free RAM areas explicitly.
102 */ 106 */
103 mapsize = get_mapsize(bdata); 107 mapsize = bootmap_bytes(end - start);
104 memset(bdata->node_bootmem_map, 0xff, mapsize); 108 memset(bdata->node_bootmem_map, 0xff, mapsize);
105 109
110 bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
111 bdata - bootmem_node_data, start, mapstart, end, mapsize);
112
106 return mapsize; 113 return mapsize;
107} 114}
108 115
109/* 116/**
110 * Marks a particular physical memory range as unallocatable. Usable RAM 117 * init_bootmem_node - register a node as boot memory
111 * might be used for boot-time allocations - or it might get added 118 * @pgdat: node to register
112 * to the free page pool later on. 119 * @freepfn: pfn where the bitmap for this node is to be placed
120 * @startpfn: first pfn on the node
121 * @endpfn: first pfn after the node
122 *
123 * Returns the number of bytes needed to hold the bitmap for this node.
113 */ 124 */
114static int __init can_reserve_bootmem_core(bootmem_data_t *bdata, 125unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
115 unsigned long addr, unsigned long size, int flags) 126 unsigned long startpfn, unsigned long endpfn)
116{ 127{
117 unsigned long sidx, eidx; 128 return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
118 unsigned long i; 129}
119 130
120 BUG_ON(!size); 131/**
132 * init_bootmem - register boot memory
133 * @start: pfn where the bitmap is to be placed
134 * @pages: number of available physical pages
135 *
136 * Returns the number of bytes needed to hold the bitmap.
137 */
138unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
139{
140 max_low_pfn = pages;
141 min_low_pfn = start;
142 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
143}
144
145static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
146{
147 int aligned;
148 struct page *page;
149 unsigned long start, end, pages, count = 0;
121 150
122 /* out of range, don't hold other */ 151 if (!bdata->node_bootmem_map)
123 if (addr + size < bdata->node_boot_start ||
124 PFN_DOWN(addr) > bdata->node_low_pfn)
125 return 0; 152 return 0;
126 153
154 start = bdata->node_min_pfn;
155 end = bdata->node_low_pfn;
156
127 /* 157 /*
128 * Round up to index to the range. 158 * If the start is aligned to the machines wordsize, we might
159 * be able to free pages in bulks of that order.
129 */ 160 */
130 if (addr > bdata->node_boot_start) 161 aligned = !(start & (BITS_PER_LONG - 1));
131 sidx= PFN_DOWN(addr - bdata->node_boot_start);
132 else
133 sidx = 0;
134 162
135 eidx = PFN_UP(addr + size - bdata->node_boot_start); 163 bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
136 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 164 bdata - bootmem_node_data, start, end, aligned);
137 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
138 165
139 for (i = sidx; i < eidx; i++) { 166 while (start < end) {
140 if (test_bit(i, bdata->node_bootmem_map)) { 167 unsigned long *map, idx, vec;
141 if (flags & BOOTMEM_EXCLUSIVE) 168
142 return -EBUSY; 169 map = bdata->node_bootmem_map;
170 idx = start - bdata->node_min_pfn;
171 vec = ~map[idx / BITS_PER_LONG];
172
173 if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
174 int order = ilog2(BITS_PER_LONG);
175
176 __free_pages_bootmem(pfn_to_page(start), order);
177 count += BITS_PER_LONG;
178 } else {
179 unsigned long off = 0;
180
181 while (vec && off < BITS_PER_LONG) {
182 if (vec & 1) {
183 page = pfn_to_page(start + off);
184 __free_pages_bootmem(page, 0);
185 count++;
186 }
187 vec >>= 1;
188 off++;
189 }
143 } 190 }
191 start += BITS_PER_LONG;
144 } 192 }
145 193
146 return 0; 194 page = virt_to_page(bdata->node_bootmem_map);
195 pages = bdata->node_low_pfn - bdata->node_min_pfn;
196 pages = bootmem_bootmap_pages(pages);
197 count += pages;
198 while (pages--)
199 __free_pages_bootmem(page++, 0);
147 200
201 bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
202
203 return count;
148} 204}
149 205
150static void __init reserve_bootmem_core(bootmem_data_t *bdata, 206/**
151 unsigned long addr, unsigned long size, int flags) 207 * free_all_bootmem_node - release a node's free pages to the buddy allocator
208 * @pgdat: node to be released
209 *
210 * Returns the number of pages actually released.
211 */
212unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
152{ 213{
153 unsigned long sidx, eidx; 214 register_page_bootmem_info_node(pgdat);
154 unsigned long i; 215 return free_all_bootmem_core(pgdat->bdata);
216}
155 217
156 BUG_ON(!size); 218/**
219 * free_all_bootmem - release free pages to the buddy allocator
220 *
221 * Returns the number of pages actually released.
222 */
223unsigned long __init free_all_bootmem(void)
224{
225 return free_all_bootmem_core(NODE_DATA(0)->bdata);
226}
157 227
158 /* out of range */ 228static void __init __free(bootmem_data_t *bdata,
159 if (addr + size < bdata->node_boot_start || 229 unsigned long sidx, unsigned long eidx)
160 PFN_DOWN(addr) > bdata->node_low_pfn) 230{
161 return; 231 unsigned long idx;
162 232
163 /* 233 bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
164 * Round up to index to the range. 234 sidx + bdata->node_min_pfn,
165 */ 235 eidx + bdata->node_min_pfn);
166 if (addr > bdata->node_boot_start)
167 sidx= PFN_DOWN(addr - bdata->node_boot_start);
168 else
169 sidx = 0;
170 236
171 eidx = PFN_UP(addr + size - bdata->node_boot_start); 237 if (bdata->hint_idx > sidx)
172 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 238 bdata->hint_idx = sidx;
173 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
174 239
175 for (i = sidx; i < eidx; i++) { 240 for (idx = sidx; idx < eidx; idx++)
176 if (test_and_set_bit(i, bdata->node_bootmem_map)) { 241 if (!test_and_clear_bit(idx, bdata->node_bootmem_map))
177#ifdef CONFIG_DEBUG_BOOTMEM 242 BUG();
178 printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); 243}
179#endif 244
245static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
246 unsigned long eidx, int flags)
247{
248 unsigned long idx;
249 int exclusive = flags & BOOTMEM_EXCLUSIVE;
250
251 bdebug("nid=%td start=%lx end=%lx flags=%x\n",
252 bdata - bootmem_node_data,
253 sidx + bdata->node_min_pfn,
254 eidx + bdata->node_min_pfn,
255 flags);
256
257 for (idx = sidx; idx < eidx; idx++)
258 if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
259 if (exclusive) {
260 __free(bdata, sidx, idx);
261 return -EBUSY;
262 }
263 bdebug("silent double reserve of PFN %lx\n",
264 idx + bdata->node_min_pfn);
180 } 265 }
181 } 266 return 0;
182} 267}
183 268
184static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, 269static int __init mark_bootmem_node(bootmem_data_t *bdata,
185 unsigned long size) 270 unsigned long start, unsigned long end,
271 int reserve, int flags)
186{ 272{
187 unsigned long sidx, eidx; 273 unsigned long sidx, eidx;
188 unsigned long i;
189 274
190 BUG_ON(!size); 275 bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n",
276 bdata - bootmem_node_data, start, end, reserve, flags);
191 277
192 /* out range */ 278 BUG_ON(start < bdata->node_min_pfn);
193 if (addr + size < bdata->node_boot_start || 279 BUG_ON(end > bdata->node_low_pfn);
194 PFN_DOWN(addr) > bdata->node_low_pfn)
195 return;
196 /*
197 * round down end of usable mem, partially free pages are
198 * considered reserved.
199 */
200 280
201 if (addr >= bdata->node_boot_start && addr < bdata->last_success) 281 sidx = start - bdata->node_min_pfn;
202 bdata->last_success = addr; 282 eidx = end - bdata->node_min_pfn;
203 283
204 /* 284 if (reserve)
205 * Round up to index to the range. 285 return __reserve(bdata, sidx, eidx, flags);
206 */
207 if (PFN_UP(addr) > PFN_DOWN(bdata->node_boot_start))
208 sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
209 else 286 else
210 sidx = 0; 287 __free(bdata, sidx, eidx);
288 return 0;
289}
211 290
212 eidx = PFN_DOWN(addr + size - bdata->node_boot_start); 291static int __init mark_bootmem(unsigned long start, unsigned long end,
213 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 292 int reserve, int flags)
214 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start); 293{
294 unsigned long pos;
295 bootmem_data_t *bdata;
215 296
216 for (i = sidx; i < eidx; i++) { 297 pos = start;
217 if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) 298 list_for_each_entry(bdata, &bdata_list, list) {
218 BUG(); 299 int err;
300 unsigned long max;
301
302 if (pos < bdata->node_min_pfn ||
303 pos >= bdata->node_low_pfn) {
304 BUG_ON(pos != start);
305 continue;
306 }
307
308 max = min(bdata->node_low_pfn, end);
309
310 err = mark_bootmem_node(bdata, pos, max, reserve, flags);
311 if (reserve && err) {
312 mark_bootmem(start, pos, 0, 0);
313 return err;
314 }
315
316 if (max == end)
317 return 0;
318 pos = bdata->node_low_pfn;
219 } 319 }
320 BUG();
220} 321}
221 322
222/* 323/**
223 * We 'merge' subsequent allocations to save space. We might 'lose' 324 * free_bootmem_node - mark a page range as usable
224 * some fraction of a page if allocations cannot be satisfied due to 325 * @pgdat: node the range resides on
225 * size constraints on boxes where there is physical RAM space 326 * @physaddr: starting address of the range
226 * fragmentation - in these cases (mostly large memory boxes) this 327 * @size: size of the range in bytes
227 * is not a problem.
228 *
229 * On low memory boxes we get it right in 100% of the cases.
230 * 328 *
231 * alignment has to be a power of 2 value. 329 * Partial pages will be considered reserved and left as they are.
232 * 330 *
233 * NOTE: This function is _not_ reentrant. 331 * The range must reside completely on the specified node.
234 */ 332 */
235void * __init 333void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
236__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, 334 unsigned long size)
237 unsigned long align, unsigned long goal, unsigned long limit)
238{ 335{
239 unsigned long areasize, preferred; 336 unsigned long start, end;
240 unsigned long i, start = 0, incr, eidx, end_pfn;
241 void *ret;
242 unsigned long node_boot_start;
243 void *node_bootmem_map;
244
245 if (!size) {
246 printk("__alloc_bootmem_core(): zero-sized request\n");
247 BUG();
248 }
249 BUG_ON(align & (align-1));
250 337
251 /* on nodes without memory - bootmem_map is NULL */ 338 start = PFN_UP(physaddr);
252 if (!bdata->node_bootmem_map) 339 end = PFN_DOWN(physaddr + size);
253 return NULL;
254
255 /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */
256 node_boot_start = bdata->node_boot_start;
257 node_bootmem_map = bdata->node_bootmem_map;
258 if (align) {
259 node_boot_start = ALIGN(bdata->node_boot_start, align);
260 if (node_boot_start > bdata->node_boot_start)
261 node_bootmem_map = (unsigned long *)bdata->node_bootmem_map +
262 PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG;
263 }
264
265 if (limit && node_boot_start >= limit)
266 return NULL;
267 340
268 end_pfn = bdata->node_low_pfn; 341 mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
269 limit = PFN_DOWN(limit); 342}
270 if (limit && end_pfn > limit)
271 end_pfn = limit;
272 343
273 eidx = end_pfn - PFN_DOWN(node_boot_start); 344/**
345 * free_bootmem - mark a page range as usable
346 * @addr: starting address of the range
347 * @size: size of the range in bytes
348 *
349 * Partial pages will be considered reserved and left as they are.
350 *
351 * The range must be contiguous but may span node boundaries.
352 */
353void __init free_bootmem(unsigned long addr, unsigned long size)
354{
355 unsigned long start, end;
274 356
275 /* 357 start = PFN_UP(addr);
276 * We try to allocate bootmem pages above 'goal' 358 end = PFN_DOWN(addr + size);
277 * first, then we try to allocate lower pages.
278 */
279 preferred = 0;
280 if (goal && PFN_DOWN(goal) < end_pfn) {
281 if (goal > node_boot_start)
282 preferred = goal - node_boot_start;
283
284 if (bdata->last_success > node_boot_start &&
285 bdata->last_success - node_boot_start >= preferred)
286 if (!limit || (limit && limit > bdata->last_success))
287 preferred = bdata->last_success - node_boot_start;
288 }
289 359
290 preferred = PFN_DOWN(ALIGN(preferred, align)); 360 mark_bootmem(start, end, 0, 0);
291 areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; 361}
292 incr = align >> PAGE_SHIFT ? : 1;
293 362
294restart_scan: 363/**
295 for (i = preferred; i < eidx;) { 364 * reserve_bootmem_node - mark a page range as reserved
296 unsigned long j; 365 * @pgdat: node the range resides on
366 * @physaddr: starting address of the range
367 * @size: size of the range in bytes
368 * @flags: reservation flags (see linux/bootmem.h)
369 *
370 * Partial pages will be reserved.
371 *
372 * The range must reside completely on the specified node.
373 */
374int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
375 unsigned long size, int flags)
376{
377 unsigned long start, end;
297 378
298 i = find_next_zero_bit(node_bootmem_map, eidx, i); 379 start = PFN_DOWN(physaddr);
299 i = ALIGN(i, incr); 380 end = PFN_UP(physaddr + size);
300 if (i >= eidx)
301 break;
302 if (test_bit(i, node_bootmem_map)) {
303 i += incr;
304 continue;
305 }
306 for (j = i + 1; j < i + areasize; ++j) {
307 if (j >= eidx)
308 goto fail_block;
309 if (test_bit(j, node_bootmem_map))
310 goto fail_block;
311 }
312 start = i;
313 goto found;
314 fail_block:
315 i = ALIGN(j, incr);
316 if (i == j)
317 i += incr;
318 }
319 381
320 if (preferred > 0) { 382 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
321 preferred = 0; 383}
322 goto restart_scan;
323 }
324 return NULL;
325 384
326found: 385#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
327 bdata->last_success = PFN_PHYS(start) + node_boot_start; 386/**
328 BUG_ON(start >= eidx); 387 * reserve_bootmem - mark a page range as usable
388 * @addr: starting address of the range
389 * @size: size of the range in bytes
390 * @flags: reservation flags (see linux/bootmem.h)
391 *
392 * Partial pages will be reserved.
393 *
394 * The range must be contiguous but may span node boundaries.
395 */
396int __init reserve_bootmem(unsigned long addr, unsigned long size,
397 int flags)
398{
399 unsigned long start, end;
329 400
330 /* 401 start = PFN_DOWN(addr);
331 * Is the next page of the previous allocation-end the start 402 end = PFN_UP(addr + size);
332 * of this allocation's buffer? If yes then we can 'merge'
333 * the previous partial page with this allocation.
334 */
335 if (align < PAGE_SIZE &&
336 bdata->last_offset && bdata->last_pos+1 == start) {
337 unsigned long offset, remaining_size;
338 offset = ALIGN(bdata->last_offset, align);
339 BUG_ON(offset > PAGE_SIZE);
340 remaining_size = PAGE_SIZE - offset;
341 if (size < remaining_size) {
342 areasize = 0;
343 /* last_pos unchanged */
344 bdata->last_offset = offset + size;
345 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
346 offset + node_boot_start);
347 } else {
348 remaining_size = size - remaining_size;
349 areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
350 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
351 offset + node_boot_start);
352 bdata->last_pos = start + areasize - 1;
353 bdata->last_offset = remaining_size;
354 }
355 bdata->last_offset &= ~PAGE_MASK;
356 } else {
357 bdata->last_pos = start + areasize - 1;
358 bdata->last_offset = size & ~PAGE_MASK;
359 ret = phys_to_virt(start * PAGE_SIZE + node_boot_start);
360 }
361 403
362 /* 404 return mark_bootmem(start, end, 1, flags);
363 * Reserve the area now:
364 */
365 for (i = start; i < start + areasize; i++)
366 if (unlikely(test_and_set_bit(i, node_bootmem_map)))
367 BUG();
368 memset(ret, 0, size);
369 return ret;
370} 405}
406#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
371 407
372static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) 408static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
409 unsigned long step)
373{ 410{
374 struct page *page; 411 unsigned long base = bdata->node_min_pfn;
375 unsigned long pfn;
376 bootmem_data_t *bdata = pgdat->bdata;
377 unsigned long i, count, total = 0;
378 unsigned long idx;
379 unsigned long *map;
380 int gofast = 0;
381
382 BUG_ON(!bdata->node_bootmem_map);
383
384 count = 0;
385 /* first extant page of the node */
386 pfn = PFN_DOWN(bdata->node_boot_start);
387 idx = bdata->node_low_pfn - pfn;
388 map = bdata->node_bootmem_map;
389 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
390 if (bdata->node_boot_start == 0 ||
391 ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
392 gofast = 1;
393 for (i = 0; i < idx; ) {
394 unsigned long v = ~map[i / BITS_PER_LONG];
395
396 if (gofast && v == ~0UL) {
397 int order;
398
399 page = pfn_to_page(pfn);
400 count += BITS_PER_LONG;
401 order = ffs(BITS_PER_LONG) - 1;
402 __free_pages_bootmem(page, order);
403 i += BITS_PER_LONG;
404 page += BITS_PER_LONG;
405 } else if (v) {
406 unsigned long m;
407
408 page = pfn_to_page(pfn);
409 for (m = 1; m && i < idx; m<<=1, page++, i++) {
410 if (v & m) {
411 count++;
412 __free_pages_bootmem(page, 0);
413 }
414 }
415 } else {
416 i += BITS_PER_LONG;
417 }
418 pfn += BITS_PER_LONG;
419 }
420 total += count;
421 412
422 /* 413 /*
423 * Now free the allocator bitmap itself, it's not 414 * Align the index with respect to the node start so that the
424 * needed anymore: 415 * combination of both satisfies the requested alignment.
425 */ 416 */
426 page = virt_to_page(bdata->node_bootmem_map);
427 count = 0;
428 idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT;
429 for (i = 0; i < idx; i++, page++) {
430 __free_pages_bootmem(page, 0);
431 count++;
432 }
433 total += count;
434 bdata->node_bootmem_map = NULL;
435 417
436 return total; 418 return ALIGN(base + idx, step) - base;
437} 419}
438 420
439unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, 421static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
440 unsigned long startpfn, unsigned long endpfn) 422 unsigned long align)
441{ 423{
442 return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); 424 unsigned long base = PFN_PHYS(bdata->node_min_pfn);
425
426 /* Same as align_idx for byte offsets */
427
428 return ALIGN(base + off, align) - base;
443} 429}
444 430
445int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 431static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
446 unsigned long size, int flags) 432 unsigned long size, unsigned long align,
433 unsigned long goal, unsigned long limit)
447{ 434{
448 int ret; 435 unsigned long fallback = 0;
436 unsigned long min, max, start, sidx, midx, step;
449 437
450 ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); 438 BUG_ON(!size);
451 if (ret < 0) 439 BUG_ON(align & (align - 1));
452 return -ENOMEM; 440 BUG_ON(limit && goal + size > limit);
453 reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
454 441
455 return 0; 442 if (!bdata->node_bootmem_map)
456} 443 return NULL;
457 444
458void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 445 bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
459 unsigned long size) 446 bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
460{ 447 align, goal, limit);
461 free_bootmem_core(pgdat->bdata, physaddr, size);
462}
463 448
464unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 449 min = bdata->node_min_pfn;
465{ 450 max = bdata->node_low_pfn;
466 register_page_bootmem_info_node(pgdat);
467 return free_all_bootmem_core(pgdat);
468}
469 451
470unsigned long __init init_bootmem(unsigned long start, unsigned long pages) 452 goal >>= PAGE_SHIFT;
471{ 453 limit >>= PAGE_SHIFT;
472 max_low_pfn = pages;
473 min_low_pfn = start;
474 return init_bootmem_core(NODE_DATA(0), start, 0, pages);
475}
476 454
477#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 455 if (limit && max > limit)
478int __init reserve_bootmem(unsigned long addr, unsigned long size, 456 max = limit;
479 int flags) 457 if (max <= min)
480{ 458 return NULL;
481 bootmem_data_t *bdata;
482 int ret;
483 459
484 list_for_each_entry(bdata, &bdata_list, list) { 460 step = max(align >> PAGE_SHIFT, 1UL);
485 ret = can_reserve_bootmem_core(bdata, addr, size, flags); 461
486 if (ret < 0) 462 if (goal && min < goal && goal < max)
487 return ret; 463 start = ALIGN(goal, step);
464 else
465 start = ALIGN(min, step);
466
467 sidx = start - bdata->node_min_pfn;
468 midx = max - bdata->node_min_pfn;
469
470 if (bdata->hint_idx > sidx) {
471 /*
472 * Handle the valid case of sidx being zero and still
473 * catch the fallback below.
474 */
475 fallback = sidx + 1;
476 sidx = align_idx(bdata, bdata->hint_idx, step);
488 } 477 }
489 list_for_each_entry(bdata, &bdata_list, list)
490 reserve_bootmem_core(bdata, addr, size, flags);
491 478
492 return 0; 479 while (1) {
493} 480 int merge;
494#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 481 void *region;
482 unsigned long eidx, i, start_off, end_off;
483find_block:
484 sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
485 sidx = align_idx(bdata, sidx, step);
486 eidx = sidx + PFN_UP(size);
495 487
496void __init free_bootmem(unsigned long addr, unsigned long size) 488 if (sidx >= midx || eidx > midx)
497{ 489 break;
498 bootmem_data_t *bdata;
499 list_for_each_entry(bdata, &bdata_list, list)
500 free_bootmem_core(bdata, addr, size);
501}
502 490
503unsigned long __init free_all_bootmem(void) 491 for (i = sidx; i < eidx; i++)
504{ 492 if (test_bit(i, bdata->node_bootmem_map)) {
505 return free_all_bootmem_core(NODE_DATA(0)); 493 sidx = align_idx(bdata, i, step);
494 if (sidx == i)
495 sidx += step;
496 goto find_block;
497 }
498
499 if (bdata->last_end_off & (PAGE_SIZE - 1) &&
500 PFN_DOWN(bdata->last_end_off) + 1 == sidx)
501 start_off = align_off(bdata, bdata->last_end_off, align);
502 else
503 start_off = PFN_PHYS(sidx);
504
505 merge = PFN_DOWN(start_off) < sidx;
506 end_off = start_off + size;
507
508 bdata->last_end_off = end_off;
509 bdata->hint_idx = PFN_UP(end_off);
510
511 /*
512 * Reserve the area now:
513 */
514 if (__reserve(bdata, PFN_DOWN(start_off) + merge,
515 PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
516 BUG();
517
518 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
519 start_off);
520 memset(region, 0, size);
521 return region;
522 }
523
524 if (fallback) {
525 sidx = align_idx(bdata, fallback - 1, step);
526 fallback = 0;
527 goto find_block;
528 }
529
530 return NULL;
506} 531}
507 532
508void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, 533static void * __init ___alloc_bootmem_nopanic(unsigned long size,
509 unsigned long goal) 534 unsigned long align,
535 unsigned long goal,
536 unsigned long limit)
510{ 537{
511 bootmem_data_t *bdata; 538 bootmem_data_t *bdata;
512 void *ptr;
513 539
540restart:
514 list_for_each_entry(bdata, &bdata_list, list) { 541 list_for_each_entry(bdata, &bdata_list, list) {
515 ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); 542 void *region;
516 if (ptr) 543
517 return ptr; 544 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
545 continue;
546 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
547 break;
548
549 region = alloc_bootmem_core(bdata, size, align, goal, limit);
550 if (region)
551 return region;
552 }
553
554 if (goal) {
555 goal = 0;
556 goto restart;
518 } 557 }
558
519 return NULL; 559 return NULL;
520} 560}
521 561
522void * __init __alloc_bootmem(unsigned long size, unsigned long align, 562/**
523 unsigned long goal) 563 * __alloc_bootmem_nopanic - allocate boot memory without panicking
564 * @size: size of the request in bytes
565 * @align: alignment of the region
566 * @goal: preferred starting address of the region
567 *
568 * The goal is dropped if it can not be satisfied and the allocation will
569 * fall back to memory below @goal.
570 *
571 * Allocation may happen on any node in the system.
572 *
573 * Returns NULL on failure.
574 */
575void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
576 unsigned long goal)
524{ 577{
525 void *mem = __alloc_bootmem_nopanic(size,align,goal); 578 return ___alloc_bootmem_nopanic(size, align, goal, 0);
579}
580
581static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
582 unsigned long goal, unsigned long limit)
583{
584 void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
526 585
527 if (mem) 586 if (mem)
528 return mem; 587 return mem;
@@ -534,78 +593,135 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
534 return NULL; 593 return NULL;
535} 594}
536 595
596/**
597 * __alloc_bootmem - allocate boot memory
598 * @size: size of the request in bytes
599 * @align: alignment of the region
600 * @goal: preferred starting address of the region
601 *
602 * The goal is dropped if it can not be satisfied and the allocation will
603 * fall back to memory below @goal.
604 *
605 * Allocation may happen on any node in the system.
606 *
607 * The function panics if the request can not be satisfied.
608 */
609void * __init __alloc_bootmem(unsigned long size, unsigned long align,
610 unsigned long goal)
611{
612 return ___alloc_bootmem(size, align, goal, 0);
613}
537 614
538void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, 615static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
539 unsigned long align, unsigned long goal) 616 unsigned long size, unsigned long align,
617 unsigned long goal, unsigned long limit)
540{ 618{
541 void *ptr; 619 void *ptr;
542 620
543 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 621 ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
544 if (ptr) 622 if (ptr)
545 return ptr; 623 return ptr;
546 624
547 return __alloc_bootmem(size, align, goal); 625 return ___alloc_bootmem(size, align, goal, limit);
626}
627
628/**
629 * __alloc_bootmem_node - allocate boot memory from a specific node
630 * @pgdat: node to allocate from
631 * @size: size of the request in bytes
632 * @align: alignment of the region
633 * @goal: preferred starting address of the region
634 *
635 * The goal is dropped if it can not be satisfied and the allocation will
636 * fall back to memory below @goal.
637 *
638 * Allocation may fall back to any node in the system if the specified node
639 * can not hold the requested memory.
640 *
641 * The function panics if the request can not be satisfied.
642 */
643void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
644 unsigned long align, unsigned long goal)
645{
646 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
548} 647}
549 648
550#ifdef CONFIG_SPARSEMEM 649#ifdef CONFIG_SPARSEMEM
650/**
651 * alloc_bootmem_section - allocate boot memory from a specific section
652 * @size: size of the request in bytes
653 * @section_nr: sparse map section to allocate from
654 *
655 * Return NULL on failure.
656 */
551void * __init alloc_bootmem_section(unsigned long size, 657void * __init alloc_bootmem_section(unsigned long size,
552 unsigned long section_nr) 658 unsigned long section_nr)
553{ 659{
554 void *ptr; 660 bootmem_data_t *bdata;
555 unsigned long limit, goal, start_nr, end_nr, pfn; 661 unsigned long pfn, goal, limit;
556 struct pglist_data *pgdat;
557 662
558 pfn = section_nr_to_pfn(section_nr); 663 pfn = section_nr_to_pfn(section_nr);
559 goal = PFN_PHYS(pfn); 664 goal = pfn << PAGE_SHIFT;
560 limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1; 665 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
561 pgdat = NODE_DATA(early_pfn_to_nid(pfn)); 666 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
562 ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal,
563 limit);
564 667
565 if (!ptr) 668 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
566 return NULL; 669}
670#endif
567 671
568 start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr))); 672void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
569 end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size)); 673 unsigned long align, unsigned long goal)
570 if (start_nr != section_nr || end_nr != section_nr) { 674{
571 printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n", 675 void *ptr;
572 section_nr);
573 free_bootmem_core(pgdat->bdata, __pa(ptr), size);
574 ptr = NULL;
575 }
576 676
577 return ptr; 677 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
678 if (ptr)
679 return ptr;
680
681 return __alloc_bootmem_nopanic(size, align, goal);
578} 682}
579#endif
580 683
581#ifndef ARCH_LOW_ADDRESS_LIMIT 684#ifndef ARCH_LOW_ADDRESS_LIMIT
582#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL 685#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
583#endif 686#endif
584 687
688/**
689 * __alloc_bootmem_low - allocate low boot memory
690 * @size: size of the request in bytes
691 * @align: alignment of the region
692 * @goal: preferred starting address of the region
693 *
694 * The goal is dropped if it can not be satisfied and the allocation will
695 * fall back to memory below @goal.
696 *
697 * Allocation may happen on any node in the system.
698 *
699 * The function panics if the request can not be satisfied.
700 */
585void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, 701void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
586 unsigned long goal) 702 unsigned long goal)
587{ 703{
588 bootmem_data_t *bdata; 704 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
589 void *ptr;
590
591 list_for_each_entry(bdata, &bdata_list, list) {
592 ptr = __alloc_bootmem_core(bdata, size, align, goal,
593 ARCH_LOW_ADDRESS_LIMIT);
594 if (ptr)
595 return ptr;
596 }
597
598 /*
599 * Whoops, we cannot satisfy the allocation request.
600 */
601 printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
602 panic("Out of low memory");
603 return NULL;
604} 705}
605 706
707/**
708 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
709 * @pgdat: node to allocate from
710 * @size: size of the request in bytes
711 * @align: alignment of the region
712 * @goal: preferred starting address of the region
713 *
714 * The goal is dropped if it can not be satisfied and the allocation will
715 * fall back to memory below @goal.
716 *
717 * Allocation may fall back to any node in the system if the specified node
718 * can not hold the requested memory.
719 *
720 * The function panics if the request can not be satisfied.
721 */
606void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 722void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
607 unsigned long align, unsigned long goal) 723 unsigned long align, unsigned long goal)
608{ 724{
609 return __alloc_bootmem_core(pgdat->bdata, size, align, goal, 725 return ___alloc_bootmem_node(pgdat->bdata, size, align,
610 ARCH_LOW_ADDRESS_LIMIT); 726 goal, ARCH_LOW_ADDRESS_LIMIT);
611} 727}
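
Aside, as an illustration only (not part of the patch): how the new bootmap_bytes()/bootmem_bootmap_pages() pair sizes the boot bitmap, open-coded for a userspace build. PAGE_SHIFT, PAGE_SIZE and ALIGN() are redefined locally and are assumptions of this sketch, not kernel headers.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))
#define PAGE_ALIGN(x)	ALIGN((x), PAGE_SIZE)

/* One bit per page, rounded up to a whole word so the free loop in
 * free_all_bootmem_core() can release BITS_PER_LONG pages at a time. */
static unsigned long bootmap_bytes(unsigned long pages)
{
	unsigned long bytes = (pages + 7) / 8;

	return ALIGN(bytes, sizeof(long));
}

/* Number of whole pages needed to hold that bitmap. */
static unsigned long bootmem_bootmap_pages(unsigned long pages)
{
	unsigned long bytes = bootmap_bytes(pages);

	return PAGE_ALIGN(bytes) >> PAGE_SHIFT;
}

int main(void)
{
	unsigned long pages = 1UL << 20;	/* 4 GiB worth of 4 KiB pages */

	printf("%lu pages -> %lu bitmap bytes -> %lu bitmap pages\n",
	       pages, bootmap_bytes(pages), bootmem_bootmap_pages(pages));
	return 0;
}
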
diff --git a/mm/bounce.c b/mm/bounce.c
index b6d2d0f1019b..06722c403058 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
267 /* 267 /*
268 * Data-less bio, nothing to bounce 268 * Data-less bio, nothing to bounce
269 */ 269 */
270 if (bio_empty_barrier(*bio_orig)) 270 if (!bio_has_data(*bio_orig))
271 return; 271 return;
272 272
273 /* 273 /*
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 343cfdfebd9e..a1da969bd980 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds 4 * Copyright (C) 2002, Linus Torvalds
5 * 5 *
6 * 11Jan2003 akpm@digeo.com 6 * 11Jan2003 Andrew Morton
7 * Initial version. 7 * Initial version.
8 */ 8 */
9 9
diff --git a/mm/filemap.c b/mm/filemap.c
index 65d9d9e2b755..ab8553658af3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
33#include <linux/cpuset.h> 33#include <linux/cpuset.h>
34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ 34#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
35#include <linux/memcontrol.h> 35#include <linux/memcontrol.h>
36#include <linux/mm_inline.h> /* for page_is_file_cache() */
36#include "internal.h" 37#include "internal.h"
37 38
38/* 39/*
@@ -42,9 +43,6 @@
42 43
43#include <asm/mman.h> 44#include <asm/mman.h>
44 45
45static ssize_t
46generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
47 loff_t offset, unsigned long nr_segs);
48 46
49/* 47/*
50 * Shared mappings implemented 30.11.1994. It's not fully working yet, 48 * Shared mappings implemented 30.11.1994. It's not fully working yet,
@@ -112,18 +110,18 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
112/* 110/*
113 * Remove a page from the page cache and free it. Caller has to make 111 * Remove a page from the page cache and free it. Caller has to make
114 * sure the page is locked and that nobody else uses it - or that usage 112 * sure the page is locked and that nobody else uses it - or that usage
115 * is safe. The caller must hold a write_lock on the mapping's tree_lock. 113 * is safe. The caller must hold the mapping's tree_lock.
116 */ 114 */
117void __remove_from_page_cache(struct page *page) 115void __remove_from_page_cache(struct page *page)
118{ 116{
119 struct address_space *mapping = page->mapping; 117 struct address_space *mapping = page->mapping;
120 118
121 mem_cgroup_uncharge_page(page);
122 radix_tree_delete(&mapping->page_tree, page->index); 119 radix_tree_delete(&mapping->page_tree, page->index);
123 page->mapping = NULL; 120 page->mapping = NULL;
124 mapping->nrpages--; 121 mapping->nrpages--;
125 __dec_zone_page_state(page, NR_FILE_PAGES); 122 __dec_zone_page_state(page, NR_FILE_PAGES);
126 BUG_ON(page_mapped(page)); 123 BUG_ON(page_mapped(page));
124 mem_cgroup_uncharge_cache_page(page);
127 125
128 /* 126 /*
129 * Some filesystems seem to re-dirty the page even after 127 * Some filesystems seem to re-dirty the page even after
@@ -144,9 +142,9 @@ void remove_from_page_cache(struct page *page)
144 142
145 BUG_ON(!PageLocked(page)); 143 BUG_ON(!PageLocked(page));
146 144
147 write_lock_irq(&mapping->tree_lock); 145 spin_lock_irq(&mapping->tree_lock);
148 __remove_from_page_cache(page); 146 __remove_from_page_cache(page);
149 write_unlock_irq(&mapping->tree_lock); 147 spin_unlock_irq(&mapping->tree_lock);
150} 148}
151 149
152static int sync_page(void *word) 150static int sync_page(void *word)
@@ -445,55 +443,74 @@ int filemap_write_and_wait_range(struct address_space *mapping,
445} 443}
446 444
447/** 445/**
448 * add_to_page_cache - add newly allocated pagecache pages 446 * add_to_page_cache_locked - add a locked page to the pagecache
449 * @page: page to add 447 * @page: page to add
450 * @mapping: the page's address_space 448 * @mapping: the page's address_space
451 * @offset: page index 449 * @offset: page index
452 * @gfp_mask: page allocation mode 450 * @gfp_mask: page allocation mode
453 * 451 *
454 * This function is used to add newly allocated pagecache pages; 452 * This function is used to add a page to the pagecache. It must be locked.
455 * the page is new, so we can just run SetPageLocked() against it.
456 * The other page state flags were set by rmqueue().
457 *
458 * This function does not add the page to the LRU. The caller must do that. 453 * This function does not add the page to the LRU. The caller must do that.
459 */ 454 */
460int add_to_page_cache(struct page *page, struct address_space *mapping, 455int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
461 pgoff_t offset, gfp_t gfp_mask) 456 pgoff_t offset, gfp_t gfp_mask)
462{ 457{
463 int error = mem_cgroup_cache_charge(page, current->mm, 458 int error;
459
460 VM_BUG_ON(!PageLocked(page));
461
462 error = mem_cgroup_cache_charge(page, current->mm,
464 gfp_mask & ~__GFP_HIGHMEM); 463 gfp_mask & ~__GFP_HIGHMEM);
465 if (error) 464 if (error)
466 goto out; 465 goto out;
467 466
468 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 467 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
469 if (error == 0) { 468 if (error == 0) {
470 write_lock_irq(&mapping->tree_lock); 469 page_cache_get(page);
470 page->mapping = mapping;
471 page->index = offset;
472
473 spin_lock_irq(&mapping->tree_lock);
471 error = radix_tree_insert(&mapping->page_tree, offset, page); 474 error = radix_tree_insert(&mapping->page_tree, offset, page);
472 if (!error) { 475 if (likely(!error)) {
473 page_cache_get(page);
474 SetPageLocked(page);
475 page->mapping = mapping;
476 page->index = offset;
477 mapping->nrpages++; 476 mapping->nrpages++;
478 __inc_zone_page_state(page, NR_FILE_PAGES); 477 __inc_zone_page_state(page, NR_FILE_PAGES);
479 } else 478 } else {
480 mem_cgroup_uncharge_page(page); 479 page->mapping = NULL;
480 mem_cgroup_uncharge_cache_page(page);
481 page_cache_release(page);
482 }
481 483
482 write_unlock_irq(&mapping->tree_lock); 484 spin_unlock_irq(&mapping->tree_lock);
483 radix_tree_preload_end(); 485 radix_tree_preload_end();
484 } else 486 } else
485 mem_cgroup_uncharge_page(page); 487 mem_cgroup_uncharge_cache_page(page);
486out: 488out:
487 return error; 489 return error;
488} 490}
489EXPORT_SYMBOL(add_to_page_cache); 491EXPORT_SYMBOL(add_to_page_cache_locked);
490 492
491int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 493int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
492 pgoff_t offset, gfp_t gfp_mask) 494 pgoff_t offset, gfp_t gfp_mask)
493{ 495{
494 int ret = add_to_page_cache(page, mapping, offset, gfp_mask); 496 int ret;
495 if (ret == 0) 497
496 lru_cache_add(page); 498 /*
499 * Splice_read and readahead add shmem/tmpfs pages into the page cache
500 * before shmem_readpage has a chance to mark them as SwapBacked: they
501 * need to go on the active_anon lru below, and mem_cgroup_cache_charge
502 * (called in add_to_page_cache) needs to know where they're going too.
503 */
504 if (mapping_cap_swap_backed(mapping))
505 SetPageSwapBacked(page);
506
507 ret = add_to_page_cache(page, mapping, offset, gfp_mask);
508 if (ret == 0) {
509 if (page_is_file_cache(page))
510 lru_cache_add_file(page);
511 else
512 lru_cache_add_active_anon(page);
513 }
497 return ret; 514 return ret;
498} 515}
499 516
@@ -556,17 +573,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
556 * mechananism between PageLocked pages and PageWriteback pages is shared. 573 * mechananism between PageLocked pages and PageWriteback pages is shared.
557 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. 574 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
558 * 575 *
559 * The first mb is necessary to safely close the critical section opened by the 576 * The mb is necessary to enforce ordering between the clear_bit and the read
560 * TestSetPageLocked(), the second mb is necessary to enforce ordering between 577 * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
561 * the clear_bit and the read of the waitqueue (to avoid SMP races with a
562 * parallel wait_on_page_locked()).
563 */ 578 */
564void unlock_page(struct page *page) 579void unlock_page(struct page *page)
565{ 580{
566 smp_mb__before_clear_bit(); 581 VM_BUG_ON(!PageLocked(page));
567 if (!TestClearPageLocked(page)) 582 clear_bit_unlock(PG_locked, &page->flags);
568 BUG(); 583 smp_mb__after_clear_bit();
569 smp_mb__after_clear_bit();
570 wake_up_page(page, PG_locked); 584 wake_up_page(page, PG_locked);
571} 585}
572EXPORT_SYMBOL(unlock_page); 586EXPORT_SYMBOL(unlock_page);
@@ -636,15 +650,35 @@ void __lock_page_nosync(struct page *page)
636 * Is there a pagecache struct page at the given (mapping, offset) tuple? 650 * Is there a pagecache struct page at the given (mapping, offset) tuple?
637 * If yes, increment its refcount and return it; if no, return NULL. 651 * If yes, increment its refcount and return it; if no, return NULL.
638 */ 652 */
639struct page * find_get_page(struct address_space *mapping, pgoff_t offset) 653struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
640{ 654{
655 void **pagep;
641 struct page *page; 656 struct page *page;
642 657
643 read_lock_irq(&mapping->tree_lock); 658 rcu_read_lock();
644 page = radix_tree_lookup(&mapping->page_tree, offset); 659repeat:
645 if (page) 660 page = NULL;
646 page_cache_get(page); 661 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
647 read_unlock_irq(&mapping->tree_lock); 662 if (pagep) {
663 page = radix_tree_deref_slot(pagep);
664 if (unlikely(!page || page == RADIX_TREE_RETRY))
665 goto repeat;
666
667 if (!page_cache_get_speculative(page))
668 goto repeat;
669
670 /*
671 * Has the page moved?
672 * This is part of the lockless pagecache protocol. See
673 * include/linux/pagemap.h for details.
674 */
675 if (unlikely(page != *pagep)) {
676 page_cache_release(page);
677 goto repeat;
678 }
679 }
680 rcu_read_unlock();
681
648 return page; 682 return page;
649} 683}
650EXPORT_SYMBOL(find_get_page); 684EXPORT_SYMBOL(find_get_page);
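
Aside, as an illustration only (not part of the patch): the speculative-get/re-check pattern that the lockless find_get_page() above relies on, reduced to a generic refcounted object with C11 atomics. The RCU read-side protection that keeps a kernel page from being freed and reused mid-lookup is omitted here, and try_get_ref() is only a stand-in for page_cache_get_speculative().

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct obj {
	atomic_int refcount;
};

/* Stand-in for page_cache_get_speculative(): take a reference only if the
 * object still has one, i.e. never resurrect an object that hit zero. */
static bool try_get_ref(struct obj *obj)
{
	int old = atomic_load(&obj->refcount);

	while (old > 0)
		if (atomic_compare_exchange_weak(&obj->refcount, &old, old + 1))
			return true;
	return false;
}

/* Lockless lookup skeleton: read the slot, take a speculative reference,
 * then re-check that the slot still points at the same object. */
static struct obj *lockless_lookup(struct obj * _Atomic *slot)
{
	struct obj *obj;

repeat:
	obj = atomic_load(slot);
	if (!obj)
		return NULL;
	if (!try_get_ref(obj))
		goto repeat;			/* object was going away, retry */
	if (obj != atomic_load(slot)) {		/* "has the page moved?" */
		atomic_fetch_sub(&obj->refcount, 1);
		goto repeat;
	}
	return obj;
}

int main(void)
{
	static struct obj o = { 1 };		/* one reference held by the "tree" */
	struct obj * _Atomic slot = &o;
	struct obj *found = lockless_lookup(&slot);

	if (found)
		atomic_fetch_sub(&found->refcount, 1);	/* drop our reference */
	return 0;
}
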
@@ -659,32 +693,22 @@ EXPORT_SYMBOL(find_get_page);
659 * 693 *
660 * Returns zero if the page was not present. find_lock_page() may sleep. 694 * Returns zero if the page was not present. find_lock_page() may sleep.
661 */ 695 */
662struct page *find_lock_page(struct address_space *mapping, 696struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
663 pgoff_t offset)
664{ 697{
665 struct page *page; 698 struct page *page;
666 699
667repeat: 700repeat:
668 read_lock_irq(&mapping->tree_lock); 701 page = find_get_page(mapping, offset);
669 page = radix_tree_lookup(&mapping->page_tree, offset);
670 if (page) { 702 if (page) {
671 page_cache_get(page); 703 lock_page(page);
672 if (TestSetPageLocked(page)) { 704 /* Has the page been truncated? */
673 read_unlock_irq(&mapping->tree_lock); 705 if (unlikely(page->mapping != mapping)) {
674 __lock_page(page); 706 unlock_page(page);
675 707 page_cache_release(page);
676 /* Has the page been truncated while we slept? */ 708 goto repeat;
677 if (unlikely(page->mapping != mapping)) {
678 unlock_page(page);
679 page_cache_release(page);
680 goto repeat;
681 }
682 VM_BUG_ON(page->index != offset);
683 goto out;
684 } 709 }
710 VM_BUG_ON(page->index != offset);
685 } 711 }
686 read_unlock_irq(&mapping->tree_lock);
687out:
688 return page; 712 return page;
689} 713}
690EXPORT_SYMBOL(find_lock_page); 714EXPORT_SYMBOL(find_lock_page);
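
With the tree lock gone, find_lock_page() becomes find_get_page() plus lock_page() plus a re-check of page->mapping: between the lookup and acquiring the page lock the page may have been truncated, in which case the reference is dropped and the whole lookup restarted. A toy, single-threaded rendering of that retry shape (the helpers are stand-ins, not the kernel API):

#include <stdio.h>

struct object { void *owner; int locked; int refs; };

static char mapping;                        /* the "address_space" we expect */
static struct object obj = { &mapping, 0, 1 };

static struct object *lookup_get(void)   { obj.refs++; return &obj; }
static void lock_obj(struct object *o)   { o->locked = 1; }
static void unlock_obj(struct object *o) { o->locked = 0; }
static void put_obj(struct object *o)    { o->refs--; }

static struct object *find_lock(void *owner)
{
    struct object *o;
repeat:
    o = lookup_get();
    if (!o)
        return NULL;
    lock_obj(o);                  /* may sleep in the real code */
    if (o->owner != owner) {      /* "truncated" while we slept */
        unlock_obj(o);
        put_obj(o);
        goto repeat;
    }
    return o;                     /* locked and still attached to owner */
}

int main(void)
{
    printf("locked=%d\n", find_lock(&mapping)->locked);
    return 0;
}
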
@@ -750,13 +774,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
750{ 774{
751 unsigned int i; 775 unsigned int i;
752 unsigned int ret; 776 unsigned int ret;
777 unsigned int nr_found;
778
779 rcu_read_lock();
780restart:
781 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
782 (void ***)pages, start, nr_pages);
783 ret = 0;
784 for (i = 0; i < nr_found; i++) {
785 struct page *page;
786repeat:
787 page = radix_tree_deref_slot((void **)pages[i]);
788 if (unlikely(!page))
789 continue;
790 /*
791 * this can only trigger if nr_found == 1, making livelock
792 * a non issue.
793 */
794 if (unlikely(page == RADIX_TREE_RETRY))
795 goto restart;
753 796
754 read_lock_irq(&mapping->tree_lock); 797 if (!page_cache_get_speculative(page))
755 ret = radix_tree_gang_lookup(&mapping->page_tree, 798 goto repeat;
756 (void **)pages, start, nr_pages); 799
757 for (i = 0; i < ret; i++) 800 /* Has the page moved? */
758 page_cache_get(pages[i]); 801 if (unlikely(page != *((void **)pages[i]))) {
759 read_unlock_irq(&mapping->tree_lock); 802 page_cache_release(page);
803 goto repeat;
804 }
805
806 pages[ret] = page;
807 ret++;
808 }
809 rcu_read_unlock();
760 return ret; 810 return ret;
761} 811}
762 812
@@ -777,19 +827,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
777{ 827{
778 unsigned int i; 828 unsigned int i;
779 unsigned int ret; 829 unsigned int ret;
830 unsigned int nr_found;
831
832 rcu_read_lock();
833restart:
834 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
835 (void ***)pages, index, nr_pages);
836 ret = 0;
837 for (i = 0; i < nr_found; i++) {
838 struct page *page;
839repeat:
840 page = radix_tree_deref_slot((void **)pages[i]);
841 if (unlikely(!page))
842 continue;
843 /*
844 * this can only trigger if nr_found == 1, making livelock
845 * a non issue.
846 */
847 if (unlikely(page == RADIX_TREE_RETRY))
848 goto restart;
780 849
781 read_lock_irq(&mapping->tree_lock); 850 if (page->mapping == NULL || page->index != index)
782 ret = radix_tree_gang_lookup(&mapping->page_tree,
783 (void **)pages, index, nr_pages);
784 for (i = 0; i < ret; i++) {
785 if (pages[i]->mapping == NULL || pages[i]->index != index)
786 break; 851 break;
787 852
788 page_cache_get(pages[i]); 853 if (!page_cache_get_speculative(page))
854 goto repeat;
855
856 /* Has the page moved? */
857 if (unlikely(page != *((void **)pages[i]))) {
858 page_cache_release(page);
859 goto repeat;
860 }
861
862 pages[ret] = page;
863 ret++;
789 index++; 864 index++;
790 } 865 }
791 read_unlock_irq(&mapping->tree_lock); 866 rcu_read_unlock();
792 return i; 867 return ret;
793} 868}
794EXPORT_SYMBOL(find_get_pages_contig); 869EXPORT_SYMBOL(find_get_pages_contig);
795 870
@@ -809,15 +884,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
809{ 884{
810 unsigned int i; 885 unsigned int i;
811 unsigned int ret; 886 unsigned int ret;
887 unsigned int nr_found;
888
889 rcu_read_lock();
890restart:
891 nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
892 (void ***)pages, *index, nr_pages, tag);
893 ret = 0;
894 for (i = 0; i < nr_found; i++) {
895 struct page *page;
896repeat:
897 page = radix_tree_deref_slot((void **)pages[i]);
898 if (unlikely(!page))
899 continue;
900 /*
901 * this can only trigger if nr_found == 1, making livelock
902 * a non issue.
903 */
904 if (unlikely(page == RADIX_TREE_RETRY))
905 goto restart;
906
907 if (!page_cache_get_speculative(page))
908 goto repeat;
909
910 /* Has the page moved? */
911 if (unlikely(page != *((void **)pages[i]))) {
912 page_cache_release(page);
913 goto repeat;
914 }
915
916 pages[ret] = page;
917 ret++;
918 }
919 rcu_read_unlock();
812 920
813 read_lock_irq(&mapping->tree_lock);
814 ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
815 (void **)pages, *index, nr_pages, tag);
816 for (i = 0; i < ret; i++)
817 page_cache_get(pages[i]);
818 if (ret) 921 if (ret)
819 *index = pages[ret - 1]->index + 1; 922 *index = pages[ret - 1]->index + 1;
820 read_unlock_irq(&mapping->tree_lock); 923
821 return ret; 924 return ret;
822} 925}
823EXPORT_SYMBOL(find_get_pages_tag); 926EXPORT_SYMBOL(find_get_pages_tag);
@@ -841,7 +944,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
841 struct page *page = find_get_page(mapping, index); 944 struct page *page = find_get_page(mapping, index);
842 945
843 if (page) { 946 if (page) {
844 if (!TestSetPageLocked(page)) 947 if (trylock_page(page))
845 return page; 948 return page;
846 page_cache_release(page); 949 page_cache_release(page);
847 return NULL; 950 return NULL;
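
grab_cache_page_nowait() now uses trylock_page(): take the page only if its lock can be acquired without sleeping, otherwise drop the reference and report failure so the caller can carry on. The same non-blocking pattern with a POSIX mutex, purely as an illustration of trylock semantics:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int resource = 42;

/* Hand back the resource only if it can be locked right now;
 * NULL means "busy, do not wait here". */
static int *grab_nowait(void)
{
    if (pthread_mutex_trylock(&lock) == 0)
        return &resource;
    return NULL;
}

int main(void)
{
    int *r = grab_nowait();
    printf("got=%d\n", r ? *r : -1);
    if (r)
        pthread_mutex_unlock(&lock);
    return 0;
}
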
@@ -933,8 +1036,17 @@ find_page:
933 ra, filp, page, 1036 ra, filp, page,
934 index, last_index - index); 1037 index, last_index - index);
935 } 1038 }
936 if (!PageUptodate(page)) 1039 if (!PageUptodate(page)) {
937 goto page_not_up_to_date; 1040 if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
1041 !mapping->a_ops->is_partially_uptodate)
1042 goto page_not_up_to_date;
1043 if (!trylock_page(page))
1044 goto page_not_up_to_date;
1045 if (!mapping->a_ops->is_partially_uptodate(page,
1046 desc, offset))
1047 goto page_not_up_to_date_locked;
1048 unlock_page(page);
1049 }
938page_ok: 1050page_ok:
939 /* 1051 /*
940 * i_size must be checked after we know the page is Uptodate. 1052 * i_size must be checked after we know the page is Uptodate.
@@ -1001,9 +1113,11 @@ page_ok:
1001 1113
1002page_not_up_to_date: 1114page_not_up_to_date:
1003 /* Get exclusive access to the page ... */ 1115 /* Get exclusive access to the page ... */
1004 if (lock_page_killable(page)) 1116 error = lock_page_killable(page);
1005 goto readpage_eio; 1117 if (unlikely(error))
1118 goto readpage_error;
1006 1119
1120page_not_up_to_date_locked:
1007 /* Did it get truncated before we got the lock? */ 1121 /* Did it get truncated before we got the lock? */
1008 if (!page->mapping) { 1122 if (!page->mapping) {
1009 unlock_page(page); 1123 unlock_page(page);
@@ -1030,8 +1144,9 @@ readpage:
1030 } 1144 }
1031 1145
1032 if (!PageUptodate(page)) { 1146 if (!PageUptodate(page)) {
1033 if (lock_page_killable(page)) 1147 error = lock_page_killable(page);
1034 goto readpage_eio; 1148 if (unlikely(error))
1149 goto readpage_error;
1035 if (!PageUptodate(page)) { 1150 if (!PageUptodate(page)) {
1036 if (page->mapping == NULL) { 1151 if (page->mapping == NULL) {
1037 /* 1152 /*
@@ -1043,15 +1158,14 @@ readpage:
1043 } 1158 }
1044 unlock_page(page); 1159 unlock_page(page);
1045 shrink_readahead_size_eio(filp, ra); 1160 shrink_readahead_size_eio(filp, ra);
1046 goto readpage_eio; 1161 error = -EIO;
1162 goto readpage_error;
1047 } 1163 }
1048 unlock_page(page); 1164 unlock_page(page);
1049 } 1165 }
1050 1166
1051 goto page_ok; 1167 goto page_ok;
1052 1168
1053readpage_eio:
1054 error = -EIO;
1055readpage_error: 1169readpage_error:
1056 /* UHHUH! A synchronous read error occurred. Report it */ 1170 /* UHHUH! A synchronous read error occurred. Report it */
1057 desc->error = error; 1171 desc->error = error;
@@ -1086,8 +1200,7 @@ out:
1086 ra->prev_pos |= prev_offset; 1200 ra->prev_pos |= prev_offset;
1087 1201
1088 *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; 1202 *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
1089 if (filp) 1203 file_accessed(filp);
1090 file_accessed(filp);
1091} 1204}
1092 1205
1093int file_read_actor(read_descriptor_t *desc, struct page *page, 1206int file_read_actor(read_descriptor_t *desc, struct page *page,
@@ -1200,42 +1313,41 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1200 1313
1201 mapping = filp->f_mapping; 1314 mapping = filp->f_mapping;
1202 inode = mapping->host; 1315 inode = mapping->host;
1203 retval = 0;
1204 if (!count) 1316 if (!count)
1205 goto out; /* skip atime */ 1317 goto out; /* skip atime */
1206 size = i_size_read(inode); 1318 size = i_size_read(inode);
1207 if (pos < size) { 1319 if (pos < size) {
1208 retval = generic_file_direct_IO(READ, iocb, 1320 retval = filemap_write_and_wait(mapping);
1209 iov, pos, nr_segs); 1321 if (!retval) {
1322 retval = mapping->a_ops->direct_IO(READ, iocb,
1323 iov, pos, nr_segs);
1324 }
1210 if (retval > 0) 1325 if (retval > 0)
1211 *ppos = pos + retval; 1326 *ppos = pos + retval;
1212 } 1327 if (retval) {
1213 if (likely(retval != 0)) { 1328 file_accessed(filp);
1214 file_accessed(filp); 1329 goto out;
1215 goto out; 1330 }
1216 } 1331 }
1217 } 1332 }
1218 1333
1219 retval = 0; 1334 for (seg = 0; seg < nr_segs; seg++) {
1220 if (count) { 1335 read_descriptor_t desc;
1221 for (seg = 0; seg < nr_segs; seg++) {
1222 read_descriptor_t desc;
1223 1336
1224 desc.written = 0; 1337 desc.written = 0;
1225 desc.arg.buf = iov[seg].iov_base; 1338 desc.arg.buf = iov[seg].iov_base;
1226 desc.count = iov[seg].iov_len; 1339 desc.count = iov[seg].iov_len;
1227 if (desc.count == 0) 1340 if (desc.count == 0)
1228 continue; 1341 continue;
1229 desc.error = 0; 1342 desc.error = 0;
1230 do_generic_file_read(filp,ppos,&desc,file_read_actor); 1343 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1231 retval += desc.written; 1344 retval += desc.written;
1232 if (desc.error) { 1345 if (desc.error) {
1233 retval = retval ?: desc.error; 1346 retval = retval ?: desc.error;
1234 break; 1347 break;
1235 }
1236 if (desc.count > 0)
1237 break;
1238 } 1348 }
1349 if (desc.count > 0)
1350 break;
1239 } 1351 }
1240out: 1352out:
1241 return retval; 1353 return retval;
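
generic_file_aio_read() now performs the O_DIRECT sequencing inline instead of calling the removed generic_file_direct_IO(): flush dirty pagecache with filemap_write_and_wait(), invoke ->direct_IO(READ, ...), and fall back to the buffered path only when the direct read produced neither data nor an error. A compressed sketch of that decision flow; the helpers below are stubs, not the kernel API:

#include <stdio.h>

/* Stubs: the flush succeeds and the direct read returns nothing,
 * so the buffered path ends up servicing the request. */
static long flush_dirty(void)                 { return 0; }
static long direct_read(long pos, long len)   { (void)pos; (void)len; return 0; }
static long buffered_read(long pos, long len) { (void)pos; return len; }

static long read_prefer_direct(long pos, long len)
{
    long ret = flush_dirty();          /* filemap_write_and_wait()  */
    if (ret)
        return ret;                    /* flush failed: report it   */
    ret = direct_read(pos, len);       /* ->direct_IO(READ, ...)    */
    if (ret)
        return ret;                    /* data or error: we're done */
    return buffered_read(pos, len);    /* do_generic_file_read()    */
}

int main(void)
{
    printf("read %ld bytes\n", read_prefer_direct(0, 4096));
    return 0;
}
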
@@ -1669,8 +1781,9 @@ static int __remove_suid(struct dentry *dentry, int kill)
1669 return notify_change(dentry, &newattrs); 1781 return notify_change(dentry, &newattrs);
1670} 1782}
1671 1783
1672int remove_suid(struct dentry *dentry) 1784int file_remove_suid(struct file *file)
1673{ 1785{
1786 struct dentry *dentry = file->f_path.dentry;
1674 int killsuid = should_remove_suid(dentry); 1787 int killsuid = should_remove_suid(dentry);
1675 int killpriv = security_inode_need_killpriv(dentry); 1788 int killpriv = security_inode_need_killpriv(dentry);
1676 int error = 0; 1789 int error = 0;
@@ -1684,7 +1797,7 @@ int remove_suid(struct dentry *dentry)
1684 1797
1685 return error; 1798 return error;
1686} 1799}
1687EXPORT_SYMBOL(remove_suid); 1800EXPORT_SYMBOL(file_remove_suid);
1688 1801
1689static size_t __iovec_copy_from_user_inatomic(char *vaddr, 1802static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1690 const struct iovec *iov, size_t base, size_t bytes) 1803 const struct iovec *iov, size_t base, size_t bytes)
@@ -1779,7 +1892,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
1779 * The !iov->iov_len check ensures we skip over unlikely 1892 * The !iov->iov_len check ensures we skip over unlikely
1780 * zero-length segments (without overruning the iovec). 1893 * zero-length segments (without overruning the iovec).
1781 */ 1894 */
1782 while (bytes || unlikely(!iov->iov_len && i->count)) { 1895 while (bytes || unlikely(i->count && !iov->iov_len)) {
1783 int copy; 1896 int copy;
1784 1897
1785 copy = min(bytes, iov->iov_len - base); 1898 copy = min(bytes, iov->iov_len - base);
@@ -2004,11 +2117,62 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2004 struct address_space *mapping = file->f_mapping; 2117 struct address_space *mapping = file->f_mapping;
2005 struct inode *inode = mapping->host; 2118 struct inode *inode = mapping->host;
2006 ssize_t written; 2119 ssize_t written;
2120 size_t write_len;
2121 pgoff_t end;
2007 2122
2008 if (count != ocount) 2123 if (count != ocount)
2009 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); 2124 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2010 2125
2011 written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); 2126 /*
2127 * Unmap all mmappings of the file up-front.
2128 *
2129 * This will cause any pte dirty bits to be propagated into the
2130 * pageframes for the subsequent filemap_write_and_wait().
2131 */
2132 write_len = iov_length(iov, *nr_segs);
2133 end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
2134 if (mapping_mapped(mapping))
2135 unmap_mapping_range(mapping, pos, write_len, 0);
2136
2137 written = filemap_write_and_wait(mapping);
2138 if (written)
2139 goto out;
2140
2141 /*
2142 * After a write we want buffered reads to be sure to go to disk to get
2143 * the new data. We invalidate clean cached page from the region we're
2144 * about to write. We do this *before* the write so that we can return
2145 * without clobbering -EIOCBQUEUED from ->direct_IO().
2146 */
2147 if (mapping->nrpages) {
2148 written = invalidate_inode_pages2_range(mapping,
2149 pos >> PAGE_CACHE_SHIFT, end);
2150 /*
2151 * If a page can not be invalidated, return 0 to fall back
2152 * to buffered write.
2153 */
2154 if (written) {
2155 if (written == -EBUSY)
2156 return 0;
2157 goto out;
2158 }
2159 }
2160
2161 written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
2162
2163 /*
2164 * Finally, try again to invalidate clean pages which might have been
2165 * cached by non-direct readahead, or faulted in by get_user_pages()
2166 * if the source of the write was an mmap'ed region of the file
2167 * we're writing. Either one is a pretty crazy thing to do,
2168 * so we don't support it 100%. If this invalidation
2169 * fails, tough, the write still worked...
2170 */
2171 if (mapping->nrpages) {
2172 invalidate_inode_pages2_range(mapping,
2173 pos >> PAGE_CACHE_SHIFT, end);
2174 }
2175
2012 if (written > 0) { 2176 if (written > 0) {
2013 loff_t end = pos + written; 2177 loff_t end = pos + written;
2014 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2178 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
@@ -2024,6 +2188,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2024 * i_mutex is held, which protects generic_osync_inode() from 2188 * i_mutex is held, which protects generic_osync_inode() from
2025 * livelocking. AIO O_DIRECT ops attempt to sync metadata here. 2189 * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
2026 */ 2190 */
2191out:
2027 if ((written >= 0 || written == -EIOCBQUEUED) && 2192 if ((written >= 0 || written == -EIOCBQUEUED) &&
2028 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2193 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2029 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); 2194 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
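
The write-side sequencing that used to live in generic_file_direct_IO() is now inlined in generic_file_direct_write(): unmap the file's mappings so dirty PTE bits reach the pageframes, flush, invalidate the target range before the write so a failure there cannot clobber -EIOCBQUEUED from ->direct_IO() (an -EBUSY turns into "return 0, fall back to buffered"), issue ->direct_IO(WRITE, ...), then invalidate once more to drop pages that raced in via readahead or get_user_pages(). A flow sketch with placeholder callbacks, not the kernel API:

#include <stdio.h>

#define TOY_EBUSY (-16)

/* Stubs for unmap_mapping_range(), filemap_write_and_wait(),
 * invalidate_inode_pages2_range() and ->direct_IO(WRITE, ...). */
static void unmap_range(long pos, long len)      { (void)pos; (void)len; }
static long flush_dirty(void)                    { return 0; }
static long invalidate_range(long pos, long len) { (void)pos; (void)len; return 0; }
static long direct_write(long pos, long len)     { (void)pos; return len; }

static long write_direct(long pos, long len)
{
    long ret;

    unmap_range(pos, len);             /* propagate pte dirty bits */
    ret = flush_dirty();
    if (ret)
        return ret;

    ret = invalidate_range(pos, len);  /* before the write, so the write's
                                        * own return value is never clobbered */
    if (ret == TOY_EBUSY)
        return 0;                      /* caller falls back to buffered write */
    if (ret)
        return ret;

    ret = direct_write(pos, len);      /* ->direct_IO(WRITE, ...) */

    invalidate_range(pos, len);        /* best effort: drop pages cached or
                                        * faulted in while we were writing */
    return ret;
}

int main(void)
{
    printf("wrote %ld bytes\n", write_direct(0, 4096));
    return 0;
}
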
@@ -2395,7 +2560,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2395 if (count == 0) 2560 if (count == 0)
2396 goto out; 2561 goto out;
2397 2562
2398 err = remove_suid(file->f_path.dentry); 2563 err = file_remove_suid(file);
2399 if (err) 2564 if (err)
2400 goto out; 2565 goto out;
2401 2566
@@ -2511,66 +2676,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2511} 2676}
2512EXPORT_SYMBOL(generic_file_aio_write); 2677EXPORT_SYMBOL(generic_file_aio_write);
2513 2678
2514/*
2515 * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
2516 * went wrong during pagecache shootdown.
2517 */
2518static ssize_t
2519generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2520 loff_t offset, unsigned long nr_segs)
2521{
2522 struct file *file = iocb->ki_filp;
2523 struct address_space *mapping = file->f_mapping;
2524 ssize_t retval;
2525 size_t write_len;
2526 pgoff_t end = 0; /* silence gcc */
2527
2528 /*
2529 * If it's a write, unmap all mmappings of the file up-front. This
2530 * will cause any pte dirty bits to be propagated into the pageframes
2531 * for the subsequent filemap_write_and_wait().
2532 */
2533 if (rw == WRITE) {
2534 write_len = iov_length(iov, nr_segs);
2535 end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
2536 if (mapping_mapped(mapping))
2537 unmap_mapping_range(mapping, offset, write_len, 0);
2538 }
2539
2540 retval = filemap_write_and_wait(mapping);
2541 if (retval)
2542 goto out;
2543
2544 /*
2545 * After a write we want buffered reads to be sure to go to disk to get
2546 * the new data. We invalidate clean cached page from the region we're
2547 * about to write. We do this *before* the write so that we can return
2548 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
2549 */
2550 if (rw == WRITE && mapping->nrpages) {
2551 retval = invalidate_inode_pages2_range(mapping,
2552 offset >> PAGE_CACHE_SHIFT, end);
2553 if (retval)
2554 goto out;
2555 }
2556
2557 retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
2558
2559 /*
2560 * Finally, try again to invalidate clean pages which might have been
2561 * cached by non-direct readahead, or faulted in by get_user_pages()
2562 * if the source of the write was an mmap'ed region of the file
2563 * we're writing. Either one is a pretty crazy thing to do,
2564 * so we don't support it 100%. If this invalidation
2565 * fails, tough, the write still worked...
2566 */
2567 if (rw == WRITE && mapping->nrpages) {
2568 invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end);
2569 }
2570out:
2571 return retval;
2572}
2573
2574/** 2679/**
2575 * try_to_release_page() - release old fs-specific metadata on a page 2680 * try_to_release_page() - release old fs-specific metadata on a page
2576 * 2681 *
@@ -2582,9 +2687,8 @@ out:
2582 * Otherwise return zero. 2687 * Otherwise return zero.
2583 * 2688 *
2584 * The @gfp_mask argument specifies whether I/O may be performed to release 2689 * The @gfp_mask argument specifies whether I/O may be performed to release
2585 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). 2690 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
2586 * 2691 *
2587 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
2588 */ 2692 */
2589int try_to_release_page(struct page *page, gfp_t gfp_mask) 2693int try_to_release_page(struct page *page, gfp_t gfp_mask)
2590{ 2694{
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 3e744abcce9d..b5167dfb2f2d 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -13,7 +13,10 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/uio.h> 14#include <linux/uio.h>
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/mmu_notifier.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/seqlock.h>
19#include <linux/mutex.h>
17#include <asm/tlbflush.h> 20#include <asm/tlbflush.h>
18#include <asm/io.h> 21#include <asm/io.h>
19 22
@@ -21,22 +24,18 @@
21 * We do use our own empty page to avoid interference with other users 24 * We do use our own empty page to avoid interference with other users
22 * of ZERO_PAGE(), such as /dev/zero 25 * of ZERO_PAGE(), such as /dev/zero
23 */ 26 */
27static DEFINE_MUTEX(xip_sparse_mutex);
28static seqcount_t xip_sparse_seq = SEQCNT_ZERO;
24static struct page *__xip_sparse_page; 29static struct page *__xip_sparse_page;
25 30
31/* called under xip_sparse_mutex */
26static struct page *xip_sparse_page(void) 32static struct page *xip_sparse_page(void)
27{ 33{
28 if (!__xip_sparse_page) { 34 if (!__xip_sparse_page) {
29 struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO); 35 struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
30 36
31 if (page) { 37 if (page)
32 static DEFINE_SPINLOCK(xip_alloc_lock); 38 __xip_sparse_page = page;
33 spin_lock(&xip_alloc_lock);
34 if (!__xip_sparse_page)
35 __xip_sparse_page = page;
36 else
37 __free_page(page);
38 spin_unlock(&xip_alloc_lock);
39 }
40 } 39 }
41 return __xip_sparse_page; 40 return __xip_sparse_page;
42} 41}
@@ -173,22 +172,27 @@ __xip_unmap (struct address_space * mapping,
173 pte_t pteval; 172 pte_t pteval;
174 spinlock_t *ptl; 173 spinlock_t *ptl;
175 struct page *page; 174 struct page *page;
175 unsigned count;
176 int locked = 0;
177
178 count = read_seqcount_begin(&xip_sparse_seq);
176 179
177 page = __xip_sparse_page; 180 page = __xip_sparse_page;
178 if (!page) 181 if (!page)
179 return; 182 return;
180 183
184retry:
181 spin_lock(&mapping->i_mmap_lock); 185 spin_lock(&mapping->i_mmap_lock);
182 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 186 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
183 mm = vma->vm_mm; 187 mm = vma->vm_mm;
184 address = vma->vm_start + 188 address = vma->vm_start +
185 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 189 ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
186 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 190 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
187 pte = page_check_address(page, mm, address, &ptl); 191 pte = page_check_address(page, mm, address, &ptl, 1);
188 if (pte) { 192 if (pte) {
189 /* Nuke the page table entry. */ 193 /* Nuke the page table entry. */
190 flush_cache_page(vma, address, pte_pfn(*pte)); 194 flush_cache_page(vma, address, pte_pfn(*pte));
191 pteval = ptep_clear_flush(vma, address, pte); 195 pteval = ptep_clear_flush_notify(vma, address, pte);
192 page_remove_rmap(page, vma); 196 page_remove_rmap(page, vma);
193 dec_mm_counter(mm, file_rss); 197 dec_mm_counter(mm, file_rss);
194 BUG_ON(pte_dirty(pteval)); 198 BUG_ON(pte_dirty(pteval));
@@ -197,6 +201,14 @@ __xip_unmap (struct address_space * mapping,
197 } 201 }
198 } 202 }
199 spin_unlock(&mapping->i_mmap_lock); 203 spin_unlock(&mapping->i_mmap_lock);
204
205 if (locked) {
206 mutex_unlock(&xip_sparse_mutex);
207 } else if (read_seqcount_retry(&xip_sparse_seq, count)) {
208 mutex_lock(&xip_sparse_mutex);
209 locked = 1;
210 goto retry;
211 }
200} 212}
201 213
202/* 214/*
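
__xip_unmap() has become an optimistic reader of xip_sparse_seq: it samples the count, does an unlocked unmap pass, and if read_seqcount_retry() shows that the fault path (which bumps the count under xip_sparse_mutex) ran concurrently, it takes the mutex and retries once. The seqcount discipline itself, reduced to self-contained C11 (seq_cst atomics for simplicity; the kernel primitives are weaker and cheaper):

#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;                 /* odd while a writer is mid-update */
static atomic_int value_a, value_b;     /* writer keeps a + b == 0 */

static void write_pair(int v)
{
    atomic_fetch_add(&seq, 1);          /* write_seqcount_begin(): odd  */
    atomic_store(&value_a, v);
    atomic_store(&value_b, -v);
    atomic_fetch_add(&seq, 1);          /* write_seqcount_end(): even   */
}

static int read_pair(int *a, int *b)
{
    for (;;) {
        unsigned start = atomic_load(&seq);   /* read_seqcount_begin() */
        if (start & 1)
            continue;                         /* writer active, retry  */
        *a = atomic_load(&value_a);
        *b = atomic_load(&value_b);
        if (atomic_load(&seq) == start)       /* read_seqcount_retry() */
            return *a + *b;                   /* consistent: always 0  */
    }
}

int main(void)
{
    int a, b;
    write_pair(7);
    int sum = read_pair(&a, &b);
    printf("sum=%d (a=%d, b=%d)\n", sum, a, b);
    return 0;
}
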
@@ -217,7 +229,7 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
217 int error; 229 int error;
218 230
219 /* XXX: are VM_FAULT_ codes OK? */ 231 /* XXX: are VM_FAULT_ codes OK? */
220 232again:
221 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 233 size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
222 if (vmf->pgoff >= size) 234 if (vmf->pgoff >= size)
223 return VM_FAULT_SIGBUS; 235 return VM_FAULT_SIGBUS;
@@ -236,8 +248,10 @@ static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
236 int err; 248 int err;
237 249
238 /* maybe shared writable, allocate new block */ 250 /* maybe shared writable, allocate new block */
251 mutex_lock(&xip_sparse_mutex);
239 error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1, 252 error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
240 &xip_mem, &xip_pfn); 253 &xip_mem, &xip_pfn);
254 mutex_unlock(&xip_sparse_mutex);
241 if (error) 255 if (error)
242 return VM_FAULT_SIGBUS; 256 return VM_FAULT_SIGBUS;
243 /* unmap sparse mappings at pgoff from all other vmas */ 257 /* unmap sparse mappings at pgoff from all other vmas */
@@ -251,14 +265,34 @@ found:
251 BUG_ON(err); 265 BUG_ON(err);
252 return VM_FAULT_NOPAGE; 266 return VM_FAULT_NOPAGE;
253 } else { 267 } else {
268 int err, ret = VM_FAULT_OOM;
269
270 mutex_lock(&xip_sparse_mutex);
271 write_seqcount_begin(&xip_sparse_seq);
272 error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
273 &xip_mem, &xip_pfn);
274 if (unlikely(!error)) {
275 write_seqcount_end(&xip_sparse_seq);
276 mutex_unlock(&xip_sparse_mutex);
277 goto again;
278 }
279 if (error != -ENODATA)
280 goto out;
254 /* not shared and writable, use xip_sparse_page() */ 281 /* not shared and writable, use xip_sparse_page() */
255 page = xip_sparse_page(); 282 page = xip_sparse_page();
256 if (!page) 283 if (!page)
257 return VM_FAULT_OOM; 284 goto out;
285 err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
286 page);
287 if (err == -ENOMEM)
288 goto out;
258 289
259 page_cache_get(page); 290 ret = VM_FAULT_NOPAGE;
260 vmf->page = page; 291out:
261 return 0; 292 write_seqcount_end(&xip_sparse_seq);
293 mutex_unlock(&xip_sparse_mutex);
294
295 return ret;
262 } 296 }
263} 297}
264 298
@@ -307,8 +341,10 @@ __xip_file_write(struct file *filp, const char __user *buf,
307 &xip_mem, &xip_pfn); 341 &xip_mem, &xip_pfn);
308 if (status == -ENODATA) { 342 if (status == -ENODATA) {
309 /* we allocate a new page unmap it */ 343 /* we allocate a new page unmap it */
344 mutex_lock(&xip_sparse_mutex);
310 status = a_ops->get_xip_mem(mapping, index, 1, 345 status = a_ops->get_xip_mem(mapping, index, 1,
311 &xip_mem, &xip_pfn); 346 &xip_mem, &xip_pfn);
347 mutex_unlock(&xip_sparse_mutex);
312 if (!status) 348 if (!status)
313 /* unmap page at pgoff from all other vmas */ 349 /* unmap page at pgoff from all other vmas */
314 __xip_unmap(mapping, index); 350 __xip_unmap(mapping, index);
@@ -380,7 +416,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
380 if (count == 0) 416 if (count == 0)
381 goto out_backing; 417 goto out_backing;
382 418
383 ret = remove_suid(filp->f_path.dentry); 419 ret = file_remove_suid(filp);
384 if (ret) 420 if (ret)
385 goto out_backing; 421 goto out_backing;
386 422
diff --git a/mm/fremap.c b/mm/fremap.c
index 07a9c82ce1a3..7d12ca70ef7b 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -15,11 +15,14 @@
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/mmu_notifier.h>
18 19
19#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
20#include <asm/cacheflush.h> 21#include <asm/cacheflush.h>
21#include <asm/tlbflush.h> 22#include <asm/tlbflush.h>
22 23
24#include "internal.h"
25
23static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, 26static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
24 unsigned long addr, pte_t *ptep) 27 unsigned long addr, pte_t *ptep)
25{ 28{
@@ -214,13 +217,31 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
214 spin_unlock(&mapping->i_mmap_lock); 217 spin_unlock(&mapping->i_mmap_lock);
215 } 218 }
216 219
220 if (vma->vm_flags & VM_LOCKED) {
221 /*
222 * drop PG_Mlocked flag for over-mapped range
223 */
224 unsigned int saved_flags = vma->vm_flags;
225 munlock_vma_pages_range(vma, start, start + size);
226 vma->vm_flags = saved_flags;
227 }
228
229 mmu_notifier_invalidate_range_start(mm, start, start + size);
217 err = populate_range(mm, vma, start, size, pgoff); 230 err = populate_range(mm, vma, start, size, pgoff);
231 mmu_notifier_invalidate_range_end(mm, start, start + size);
218 if (!err && !(flags & MAP_NONBLOCK)) { 232 if (!err && !(flags & MAP_NONBLOCK)) {
219 if (unlikely(has_write_lock)) { 233 if (vma->vm_flags & VM_LOCKED) {
220 downgrade_write(&mm->mmap_sem); 234 /*
221 has_write_lock = 0; 235 * might be mapping previously unmapped range of file
236 */
237 mlock_vma_pages_range(vma, start, start + size);
238 } else {
239 if (unlikely(has_write_lock)) {
240 downgrade_write(&mm->mmap_sem);
241 has_write_lock = 0;
242 }
243 make_pages_present(start, start+size);
222 } 244 }
223 make_pages_present(start, start+size);
224 } 245 }
225 246
226 /* 247 /*
@@ -237,4 +258,3 @@ out:
237 258
238 return err; 259 return err;
239} 260}
240
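
The sys_remap_file_pages() hunk brackets populate_range() with mmu_notifier_invalidate_range_start()/end(), so secondary MMUs that mirror the address space are told before and after a range is rewritten, and it drops and re-establishes mlock state around the over-mapped range. The begin/end bracketing pattern on its own, as a small callback-based toy (names and types are made up):

#include <stdio.h>

struct range_listener {
    void (*range_start)(long start, long end);
    void (*range_end)(long start, long end);
};

static void log_start(long s, long e) { printf("invalidate start [%ld, %ld)\n", s, e); }
static void log_end(long s, long e)   { printf("invalidate end   [%ld, %ld)\n", s, e); }

static struct range_listener listeners[] = { { log_start, log_end } };
#define NLISTENERS (sizeof(listeners) / sizeof(listeners[0]))

static void remap_range(long start, long end)
{
    for (unsigned i = 0; i < NLISTENERS; i++)
        listeners[i].range_start(start, end);   /* ..._invalidate_range_start() */

    printf("rewriting mappings for [%ld, %ld)\n", start, end);

    for (unsigned i = 0; i < NLISTENERS; i++)
        listeners[i].range_end(start, end);     /* ..._invalidate_range_end() */
}

int main(void)
{
    remap_range(4096, 8192);
    return 0;
}
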
diff --git a/mm/highmem.c b/mm/highmem.c
index 7da4a7b6af11..b36b83b920ff 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -40,6 +40,7 @@
40#ifdef CONFIG_HIGHMEM 40#ifdef CONFIG_HIGHMEM
41 41
42unsigned long totalhigh_pages __read_mostly; 42unsigned long totalhigh_pages __read_mostly;
43EXPORT_SYMBOL(totalhigh_pages);
43 44
44unsigned int nr_free_highpages (void) 45unsigned int nr_free_highpages (void)
45{ 46{
@@ -69,6 +70,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
69static void flush_all_zero_pkmaps(void) 70static void flush_all_zero_pkmaps(void)
70{ 71{
71 int i; 72 int i;
73 int need_flush = 0;
72 74
73 flush_cache_kmaps(); 75 flush_cache_kmaps();
74 76
@@ -100,8 +102,10 @@ static void flush_all_zero_pkmaps(void)
100 &pkmap_page_table[i]); 102 &pkmap_page_table[i]);
101 103
102 set_page_address(page, NULL); 104 set_page_address(page, NULL);
105 need_flush = 1;
103 } 106 }
104 flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); 107 if (need_flush)
108 flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
105} 109}
106 110
107/** 111/**
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ab171274ef21..421aee99b84a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7,45 +7,360 @@
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/seq_file.h>
10#include <linux/sysctl.h> 11#include <linux/sysctl.h>
11#include <linux/highmem.h> 12#include <linux/highmem.h>
13#include <linux/mmu_notifier.h>
12#include <linux/nodemask.h> 14#include <linux/nodemask.h>
13#include <linux/pagemap.h> 15#include <linux/pagemap.h>
14#include <linux/mempolicy.h> 16#include <linux/mempolicy.h>
15#include <linux/cpuset.h> 17#include <linux/cpuset.h>
16#include <linux/mutex.h> 18#include <linux/mutex.h>
19#include <linux/bootmem.h>
20#include <linux/sysfs.h>
17 21
18#include <asm/page.h> 22#include <asm/page.h>
19#include <asm/pgtable.h> 23#include <asm/pgtable.h>
24#include <asm/io.h>
20 25
21#include <linux/hugetlb.h> 26#include <linux/hugetlb.h>
22#include "internal.h" 27#include "internal.h"
23 28
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 29const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
26static unsigned long surplus_huge_pages;
27static unsigned long nr_overcommit_huge_pages;
28unsigned long max_huge_pages;
29unsigned long sysctl_overcommit_huge_pages;
30static struct list_head hugepage_freelists[MAX_NUMNODES];
31static unsigned int nr_huge_pages_node[MAX_NUMNODES];
32static unsigned int free_huge_pages_node[MAX_NUMNODES];
33static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
34static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 30static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
35unsigned long hugepages_treat_as_movable; 31unsigned long hugepages_treat_as_movable;
36static int hugetlb_next_nid; 32
33static int max_hstate;
34unsigned int default_hstate_idx;
35struct hstate hstates[HUGE_MAX_HSTATE];
36
37__initdata LIST_HEAD(huge_boot_pages);
38
39/* for command line parsing */
40static struct hstate * __initdata parsed_hstate;
41static unsigned long __initdata default_hstate_max_huge_pages;
42static unsigned long __initdata default_hstate_size;
43
44#define for_each_hstate(h) \
45 for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
37 46
38/* 47/*
39 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 48 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
40 */ 49 */
41static DEFINE_SPINLOCK(hugetlb_lock); 50static DEFINE_SPINLOCK(hugetlb_lock);
42 51
43static void clear_huge_page(struct page *page, unsigned long addr) 52/*
53 * Region tracking -- allows tracking of reservations and instantiated pages
54 * across the pages in a mapping.
55 *
56 * The region data structures are protected by a combination of the mmap_sem
 57 * and the hugetlb_instantiation_mutex. To access or modify a region the caller
58 * must either hold the mmap_sem for write, or the mmap_sem for read and
59 * the hugetlb_instantiation mutex:
60 *
61 * down_write(&mm->mmap_sem);
62 * or
63 * down_read(&mm->mmap_sem);
64 * mutex_lock(&hugetlb_instantiation_mutex);
65 */
66struct file_region {
67 struct list_head link;
68 long from;
69 long to;
70};
71
72static long region_add(struct list_head *head, long f, long t)
73{
74 struct file_region *rg, *nrg, *trg;
75
76 /* Locate the region we are either in or before. */
77 list_for_each_entry(rg, head, link)
78 if (f <= rg->to)
79 break;
80
81 /* Round our left edge to the current segment if it encloses us. */
82 if (f > rg->from)
83 f = rg->from;
84
85 /* Check for and consume any regions we now overlap with. */
86 nrg = rg;
87 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
88 if (&rg->link == head)
89 break;
90 if (rg->from > t)
91 break;
92
93 /* If this area reaches higher then extend our area to
94 * include it completely. If this is not the first area
95 * which we intend to reuse, free it. */
96 if (rg->to > t)
97 t = rg->to;
98 if (rg != nrg) {
99 list_del(&rg->link);
100 kfree(rg);
101 }
102 }
103 nrg->from = f;
104 nrg->to = t;
105 return 0;
106}
107
108static long region_chg(struct list_head *head, long f, long t)
109{
110 struct file_region *rg, *nrg;
111 long chg = 0;
112
113 /* Locate the region we are before or in. */
114 list_for_each_entry(rg, head, link)
115 if (f <= rg->to)
116 break;
117
118 /* If we are below the current region then a new region is required.
119 * Subtle, allocate a new region at the position but make it zero
120 * size such that we can guarantee to record the reservation. */
121 if (&rg->link == head || t < rg->from) {
122 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
123 if (!nrg)
124 return -ENOMEM;
125 nrg->from = f;
126 nrg->to = f;
127 INIT_LIST_HEAD(&nrg->link);
128 list_add(&nrg->link, rg->link.prev);
129
130 return t - f;
131 }
132
133 /* Round our left edge to the current segment if it encloses us. */
134 if (f > rg->from)
135 f = rg->from;
136 chg = t - f;
137
138 /* Check for and consume any regions we now overlap with. */
139 list_for_each_entry(rg, rg->link.prev, link) {
140 if (&rg->link == head)
141 break;
142 if (rg->from > t)
143 return chg;
144
 145 /* We overlap with this area, if it extends further than
146 * us then we must extend ourselves. Account for its
147 * existing reservation. */
148 if (rg->to > t) {
149 chg += rg->to - t;
150 t = rg->to;
151 }
152 chg -= rg->to - rg->from;
153 }
154 return chg;
155}
156
157static long region_truncate(struct list_head *head, long end)
158{
159 struct file_region *rg, *trg;
160 long chg = 0;
161
162 /* Locate the region we are either in or before. */
163 list_for_each_entry(rg, head, link)
164 if (end <= rg->to)
165 break;
166 if (&rg->link == head)
167 return 0;
168
169 /* If we are in the middle of a region then adjust it. */
170 if (end > rg->from) {
171 chg = rg->to - end;
172 rg->to = end;
173 rg = list_entry(rg->link.next, typeof(*rg), link);
174 }
175
176 /* Drop any remaining regions. */
177 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
178 if (&rg->link == head)
179 break;
180 chg += rg->to - rg->from;
181 list_del(&rg->link);
182 kfree(rg);
183 }
184 return chg;
185}
186
187static long region_count(struct list_head *head, long f, long t)
188{
189 struct file_region *rg;
190 long chg = 0;
191
192 /* Locate each segment we overlap with, and count that overlap. */
193 list_for_each_entry(rg, head, link) {
194 int seg_from;
195 int seg_to;
196
197 if (rg->to <= f)
198 continue;
199 if (rg->from >= t)
200 break;
201
202 seg_from = max(rg->from, f);
203 seg_to = min(rg->to, t);
204
205 chg += seg_to - seg_from;
206 }
207
208 return chg;
209}
210
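
The region_* helpers above keep, per mapping, a sorted list of disjoint [from, to) spans. region_chg() reports how many pages of a requested range are not yet covered (that is what gets charged against quota and reservations), and region_add() then merges the range into the map. A self-contained toy with the same semantics, using a small flat array instead of a list_head (illustrative only):

#include <stdio.h>

#define MAXR 16
static long rfrom[MAXR], rto[MAXR];     /* disjoint [from, to) spans */
static int nr;

/* How many units of [f, t) are not yet covered? (region_chg() analogue) */
static long chg(long f, long t)
{
    long covered = 0;
    for (int i = 0; i < nr; i++) {
        long lo = f > rfrom[i] ? f : rfrom[i];
        long hi = t < rto[i] ? t : rto[i];
        if (hi > lo)
            covered += hi - lo;
    }
    return (t - f) - covered;
}

/* Merge [f, t) into the set of spans. (region_add() analogue) */
static void add(long f, long t)
{
    long nf = f, nt = t;
    int j = 0;
    for (int i = 0; i < nr; i++) {
        if (rto[i] < nf || rfrom[i] > nt) {     /* disjoint: keep as-is */
            rfrom[j] = rfrom[i];
            rto[j] = rto[i];
            j++;
        } else {                                /* overlapping or adjacent: absorb */
            if (rfrom[i] < nf) nf = rfrom[i];
            if (rto[i] > nt)   nt = rto[i];
        }
    }
    rfrom[j] = nf;
    rto[j] = nt;
    nr = j + 1;
}

int main(void)
{
    printf("chg(0,3) = %ld\n", chg(0, 3));   /* 3: nothing reserved yet     */
    add(0, 3);
    printf("chg(2,5) = %ld\n", chg(2, 5));   /* 2: only offsets 3,4 are new */
    add(2, 5);
    printf("chg(0,5) = %ld\n", chg(0, 5));   /* 0: fully covered            */
    return 0;
}
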
211/*
212 * Convert the address within this vma to the page offset within
213 * the mapping, in pagecache page units; huge pages here.
214 */
215static pgoff_t vma_hugecache_offset(struct hstate *h,
216 struct vm_area_struct *vma, unsigned long address)
217{
218 return ((address - vma->vm_start) >> huge_page_shift(h)) +
219 (vma->vm_pgoff >> huge_page_order(h));
220}
221
222/*
223 * Flags for MAP_PRIVATE reservations. These are stored in the bottom
224 * bits of the reservation map pointer, which are always clear due to
225 * alignment.
226 */
227#define HPAGE_RESV_OWNER (1UL << 0)
228#define HPAGE_RESV_UNMAPPED (1UL << 1)
229#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
230
231/*
232 * These helpers are used to track how many pages are reserved for
233 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
234 * is guaranteed to have their future faults succeed.
235 *
236 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
237 * the reserve counters are updated with the hugetlb_lock held. It is safe
238 * to reset the VMA at fork() time as it is not in use yet and there is no
239 * chance of the global counters getting corrupted as a result of the values.
240 *
241 * The private mapping reservation is represented in a subtly different
242 * manner to a shared mapping. A shared mapping has a region map associated
243 * with the underlying file, this region map represents the backing file
244 * pages which have ever had a reservation assigned which this persists even
245 * after the page is instantiated. A private mapping has a region map
246 * associated with the original mmap which is attached to all VMAs which
247 * reference it, this region map represents those offsets which have consumed
248 * reservation ie. where pages have been instantiated.
249 */
250static unsigned long get_vma_private_data(struct vm_area_struct *vma)
251{
252 return (unsigned long)vma->vm_private_data;
253}
254
255static void set_vma_private_data(struct vm_area_struct *vma,
256 unsigned long value)
257{
258 vma->vm_private_data = (void *)value;
259}
260
261struct resv_map {
262 struct kref refs;
263 struct list_head regions;
264};
265
266static struct resv_map *resv_map_alloc(void)
267{
268 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
269 if (!resv_map)
270 return NULL;
271
272 kref_init(&resv_map->refs);
273 INIT_LIST_HEAD(&resv_map->regions);
274
275 return resv_map;
276}
277
278static void resv_map_release(struct kref *ref)
279{
280 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
281
282 /* Clear out any active regions before we release the map. */
283 region_truncate(&resv_map->regions, 0);
284 kfree(resv_map);
285}
286
287static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
288{
289 VM_BUG_ON(!is_vm_hugetlb_page(vma));
290 if (!(vma->vm_flags & VM_SHARED))
291 return (struct resv_map *)(get_vma_private_data(vma) &
292 ~HPAGE_RESV_MASK);
293 return NULL;
294}
295
296static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
297{
298 VM_BUG_ON(!is_vm_hugetlb_page(vma));
299 VM_BUG_ON(vma->vm_flags & VM_SHARED);
300
301 set_vma_private_data(vma, (get_vma_private_data(vma) &
302 HPAGE_RESV_MASK) | (unsigned long)map);
303}
304
305static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
306{
307 VM_BUG_ON(!is_vm_hugetlb_page(vma));
308 VM_BUG_ON(vma->vm_flags & VM_SHARED);
309
310 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
311}
312
313static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
314{
315 VM_BUG_ON(!is_vm_hugetlb_page(vma));
316
317 return (get_vma_private_data(vma) & flag) != 0;
318}
319
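
Because the resv_map is allocated with normal kmalloc() alignment, its address always has the bottom bits clear, so HPAGE_RESV_OWNER and HPAGE_RESV_UNMAPPED can ride in the low bits of vm_private_data alongside the pointer. The tagging trick in isolation, as standard C (the flag names mirror the patch, everything else is illustrative):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define RESV_OWNER    ((uintptr_t)1 << 0)
#define RESV_UNMAPPED ((uintptr_t)1 << 1)
#define RESV_MASK     (RESV_OWNER | RESV_UNMAPPED)

int main(void)
{
    void *map = malloc(64);                    /* malloc returns aligned memory */
    uintptr_t priv = (uintptr_t)map;

    assert((priv & RESV_MASK) == 0);           /* low bits are free for flags   */
    priv |= RESV_OWNER;                        /* set_vma_resv_flags() analogue */

    void *back = (void *)(priv & ~RESV_MASK);  /* vma_resv_map() analogue       */
    printf("owner=%d, pointer intact=%d\n",
           (int)(priv & RESV_OWNER), back == map);
    free(back);
    return 0;
}
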
320/* Decrement the reserved pages in the hugepage pool by one */
321static void decrement_hugepage_resv_vma(struct hstate *h,
322 struct vm_area_struct *vma)
323{
324 if (vma->vm_flags & VM_NORESERVE)
325 return;
326
327 if (vma->vm_flags & VM_SHARED) {
328 /* Shared mappings always use reserves */
329 h->resv_huge_pages--;
330 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
331 /*
332 * Only the process that called mmap() has reserves for
333 * private mappings.
334 */
335 h->resv_huge_pages--;
336 }
337}
338
339/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
340void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
341{
342 VM_BUG_ON(!is_vm_hugetlb_page(vma));
343 if (!(vma->vm_flags & VM_SHARED))
344 vma->vm_private_data = (void *)0;
345}
346
347/* Returns true if the VMA has associated reserve pages */
348static int vma_has_reserves(struct vm_area_struct *vma)
349{
350 if (vma->vm_flags & VM_SHARED)
351 return 1;
352 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
353 return 1;
354 return 0;
355}
356
357static void clear_huge_page(struct page *page,
358 unsigned long addr, unsigned long sz)
44{ 359{
45 int i; 360 int i;
46 361
47 might_sleep(); 362 might_sleep();
48 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { 363 for (i = 0; i < sz/PAGE_SIZE; i++) {
49 cond_resched(); 364 cond_resched();
50 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 365 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
51 } 366 }
@@ -55,42 +370,44 @@ static void copy_huge_page(struct page *dst, struct page *src,
55 unsigned long addr, struct vm_area_struct *vma) 370 unsigned long addr, struct vm_area_struct *vma)
56{ 371{
57 int i; 372 int i;
373 struct hstate *h = hstate_vma(vma);
58 374
59 might_sleep(); 375 might_sleep();
60 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { 376 for (i = 0; i < pages_per_huge_page(h); i++) {
61 cond_resched(); 377 cond_resched();
62 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 378 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
63 } 379 }
64} 380}
65 381
66static void enqueue_huge_page(struct page *page) 382static void enqueue_huge_page(struct hstate *h, struct page *page)
67{ 383{
68 int nid = page_to_nid(page); 384 int nid = page_to_nid(page);
69 list_add(&page->lru, &hugepage_freelists[nid]); 385 list_add(&page->lru, &h->hugepage_freelists[nid]);
70 free_huge_pages++; 386 h->free_huge_pages++;
71 free_huge_pages_node[nid]++; 387 h->free_huge_pages_node[nid]++;
72} 388}
73 389
74static struct page *dequeue_huge_page(void) 390static struct page *dequeue_huge_page(struct hstate *h)
75{ 391{
76 int nid; 392 int nid;
77 struct page *page = NULL; 393 struct page *page = NULL;
78 394
79 for (nid = 0; nid < MAX_NUMNODES; ++nid) { 395 for (nid = 0; nid < MAX_NUMNODES; ++nid) {
80 if (!list_empty(&hugepage_freelists[nid])) { 396 if (!list_empty(&h->hugepage_freelists[nid])) {
81 page = list_entry(hugepage_freelists[nid].next, 397 page = list_entry(h->hugepage_freelists[nid].next,
82 struct page, lru); 398 struct page, lru);
83 list_del(&page->lru); 399 list_del(&page->lru);
84 free_huge_pages--; 400 h->free_huge_pages--;
85 free_huge_pages_node[nid]--; 401 h->free_huge_pages_node[nid]--;
86 break; 402 break;
87 } 403 }
88 } 404 }
89 return page; 405 return page;
90} 406}
91 407
92static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, 408static struct page *dequeue_huge_page_vma(struct hstate *h,
93 unsigned long address) 409 struct vm_area_struct *vma,
410 unsigned long address, int avoid_reserve)
94{ 411{
95 int nid; 412 int nid;
96 struct page *page = NULL; 413 struct page *page = NULL;
@@ -101,18 +418,33 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
101 struct zone *zone; 418 struct zone *zone;
102 struct zoneref *z; 419 struct zoneref *z;
103 420
421 /*
422 * A child process with MAP_PRIVATE mappings created by their parent
423 * have no page reserves. This check ensures that reservations are
424 * not "stolen". The child may still get SIGKILLed
425 */
426 if (!vma_has_reserves(vma) &&
427 h->free_huge_pages - h->resv_huge_pages == 0)
428 return NULL;
429
430 /* If reserves cannot be used, ensure enough pages are in the pool */
431 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
432 return NULL;
433
104 for_each_zone_zonelist_nodemask(zone, z, zonelist, 434 for_each_zone_zonelist_nodemask(zone, z, zonelist,
105 MAX_NR_ZONES - 1, nodemask) { 435 MAX_NR_ZONES - 1, nodemask) {
106 nid = zone_to_nid(zone); 436 nid = zone_to_nid(zone);
107 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && 437 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
108 !list_empty(&hugepage_freelists[nid])) { 438 !list_empty(&h->hugepage_freelists[nid])) {
109 page = list_entry(hugepage_freelists[nid].next, 439 page = list_entry(h->hugepage_freelists[nid].next,
110 struct page, lru); 440 struct page, lru);
111 list_del(&page->lru); 441 list_del(&page->lru);
112 free_huge_pages--; 442 h->free_huge_pages--;
113 free_huge_pages_node[nid]--; 443 h->free_huge_pages_node[nid]--;
114 if (vma && vma->vm_flags & VM_MAYSHARE) 444
115 resv_huge_pages--; 445 if (!avoid_reserve)
446 decrement_hugepage_resv_vma(h, vma);
447
116 break; 448 break;
117 } 449 }
118 } 450 }
@@ -120,12 +452,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
120 return page; 452 return page;
121} 453}
122 454
123static void update_and_free_page(struct page *page) 455static void update_and_free_page(struct hstate *h, struct page *page)
124{ 456{
125 int i; 457 int i;
126 nr_huge_pages--; 458
127 nr_huge_pages_node[page_to_nid(page)]--; 459 h->nr_huge_pages--;
128 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { 460 h->nr_huge_pages_node[page_to_nid(page)]--;
461 for (i = 0; i < pages_per_huge_page(h); i++) {
129 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 462 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
130 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 463 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
131 1 << PG_private | 1<< PG_writeback); 464 1 << PG_private | 1<< PG_writeback);
@@ -133,11 +466,27 @@ static void update_and_free_page(struct page *page)
133 set_compound_page_dtor(page, NULL); 466 set_compound_page_dtor(page, NULL);
134 set_page_refcounted(page); 467 set_page_refcounted(page);
135 arch_release_hugepage(page); 468 arch_release_hugepage(page);
136 __free_pages(page, HUGETLB_PAGE_ORDER); 469 __free_pages(page, huge_page_order(h));
470}
471
472struct hstate *size_to_hstate(unsigned long size)
473{
474 struct hstate *h;
475
476 for_each_hstate(h) {
477 if (huge_page_size(h) == size)
478 return h;
479 }
480 return NULL;
137} 481}
138 482
139static void free_huge_page(struct page *page) 483static void free_huge_page(struct page *page)
140{ 484{
485 /*
486 * Can't pass hstate in here because it is called from the
487 * compound page destructor.
488 */
489 struct hstate *h = page_hstate(page);
141 int nid = page_to_nid(page); 490 int nid = page_to_nid(page);
142 struct address_space *mapping; 491 struct address_space *mapping;
143 492
@@ -147,12 +496,12 @@ static void free_huge_page(struct page *page)
147 INIT_LIST_HEAD(&page->lru); 496 INIT_LIST_HEAD(&page->lru);
148 497
149 spin_lock(&hugetlb_lock); 498 spin_lock(&hugetlb_lock);
150 if (surplus_huge_pages_node[nid]) { 499 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
151 update_and_free_page(page); 500 update_and_free_page(h, page);
152 surplus_huge_pages--; 501 h->surplus_huge_pages--;
153 surplus_huge_pages_node[nid]--; 502 h->surplus_huge_pages_node[nid]--;
154 } else { 503 } else {
155 enqueue_huge_page(page); 504 enqueue_huge_page(h, page);
156 } 505 }
157 spin_unlock(&hugetlb_lock); 506 spin_unlock(&hugetlb_lock);
158 if (mapping) 507 if (mapping)
@@ -164,7 +513,7 @@ static void free_huge_page(struct page *page)
164 * balanced by operating on them in a round-robin fashion. 513 * balanced by operating on them in a round-robin fashion.
165 * Returns 1 if an adjustment was made. 514 * Returns 1 if an adjustment was made.
166 */ 515 */
167static int adjust_pool_surplus(int delta) 516static int adjust_pool_surplus(struct hstate *h, int delta)
168{ 517{
169 static int prev_nid; 518 static int prev_nid;
170 int nid = prev_nid; 519 int nid = prev_nid;
@@ -177,15 +526,15 @@ static int adjust_pool_surplus(int delta)
177 nid = first_node(node_online_map); 526 nid = first_node(node_online_map);
178 527
179 /* To shrink on this node, there must be a surplus page */ 528 /* To shrink on this node, there must be a surplus page */
180 if (delta < 0 && !surplus_huge_pages_node[nid]) 529 if (delta < 0 && !h->surplus_huge_pages_node[nid])
181 continue; 530 continue;
182 /* Surplus cannot exceed the total number of pages */ 531 /* Surplus cannot exceed the total number of pages */
183 if (delta > 0 && surplus_huge_pages_node[nid] >= 532 if (delta > 0 && h->surplus_huge_pages_node[nid] >=
184 nr_huge_pages_node[nid]) 533 h->nr_huge_pages_node[nid])
185 continue; 534 continue;
186 535
187 surplus_huge_pages += delta; 536 h->surplus_huge_pages += delta;
188 surplus_huge_pages_node[nid] += delta; 537 h->surplus_huge_pages_node[nid] += delta;
189 ret = 1; 538 ret = 1;
190 break; 539 break;
191 } while (nid != prev_nid); 540 } while (nid != prev_nid);
@@ -194,59 +543,74 @@ static int adjust_pool_surplus(int delta)
194 return ret; 543 return ret;
195} 544}
196 545
197static struct page *alloc_fresh_huge_page_node(int nid) 546static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
547{
548 set_compound_page_dtor(page, free_huge_page);
549 spin_lock(&hugetlb_lock);
550 h->nr_huge_pages++;
551 h->nr_huge_pages_node[nid]++;
552 spin_unlock(&hugetlb_lock);
553 put_page(page); /* free it into the hugepage allocator */
554}
555
556static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
198{ 557{
199 struct page *page; 558 struct page *page;
200 559
560 if (h->order >= MAX_ORDER)
561 return NULL;
562
201 page = alloc_pages_node(nid, 563 page = alloc_pages_node(nid,
202 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 564 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
203 __GFP_REPEAT|__GFP_NOWARN, 565 __GFP_REPEAT|__GFP_NOWARN,
204 HUGETLB_PAGE_ORDER); 566 huge_page_order(h));
205 if (page) { 567 if (page) {
206 if (arch_prepare_hugepage(page)) { 568 if (arch_prepare_hugepage(page)) {
207 __free_pages(page, HUGETLB_PAGE_ORDER); 569 __free_pages(page, huge_page_order(h));
208 return NULL; 570 return NULL;
209 } 571 }
210 set_compound_page_dtor(page, free_huge_page); 572 prep_new_huge_page(h, page, nid);
211 spin_lock(&hugetlb_lock);
212 nr_huge_pages++;
213 nr_huge_pages_node[nid]++;
214 spin_unlock(&hugetlb_lock);
215 put_page(page); /* free it into the hugepage allocator */
216 } 573 }
217 574
218 return page; 575 return page;
219} 576}
220 577
221static int alloc_fresh_huge_page(void) 578/*
579 * Use a helper variable to find the next node and then
580 * copy it back to hugetlb_next_nid afterwards:
581 * otherwise there's a window in which a racer might
582 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
583 * But we don't need to use a spin_lock here: it really
584 * doesn't matter if occasionally a racer chooses the
585 * same nid as we do. Move nid forward in the mask even
586 * if we just successfully allocated a hugepage so that
587 * the next caller gets hugepages on the next node.
588 */
589static int hstate_next_node(struct hstate *h)
590{
591 int next_nid;
592 next_nid = next_node(h->hugetlb_next_nid, node_online_map);
593 if (next_nid == MAX_NUMNODES)
594 next_nid = first_node(node_online_map);
595 h->hugetlb_next_nid = next_nid;
596 return next_nid;
597}
598
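
hstate_next_node() factors the round-robin walk over node_online_map out of alloc_fresh_huge_page(), computing the next node first and writing it back in one step; as the original comment notes, a racing caller may at worst pick the same node twice, which is harmless. The wrap-around walk over an online mask, as a tiny standalone example (the mask value is invented):

#include <stdio.h>

#define MAX_NODES 8
static unsigned online_mask = 0x2d;     /* nodes 0, 2, 3, 5 "online" */

/* Next online node after nid, wrapping around at MAX_NODES. */
static int next_online_node(int nid)
{
    do {
        nid = (nid + 1) % MAX_NODES;
    } while (!(online_mask & (1u << nid)));
    return nid;
}

int main(void)
{
    int nid = 0;
    for (int i = 0; i < 6; i++) {
        nid = next_online_node(nid);
        printf("%d ", nid);             /* prints: 2 3 5 0 2 3 */
    }
    printf("\n");
    return 0;
}
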
599static int alloc_fresh_huge_page(struct hstate *h)
222{ 600{
223 struct page *page; 601 struct page *page;
224 int start_nid; 602 int start_nid;
225 int next_nid; 603 int next_nid;
226 int ret = 0; 604 int ret = 0;
227 605
228 start_nid = hugetlb_next_nid; 606 start_nid = h->hugetlb_next_nid;
229 607
230 do { 608 do {
231 page = alloc_fresh_huge_page_node(hugetlb_next_nid); 609 page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
232 if (page) 610 if (page)
233 ret = 1; 611 ret = 1;
234 /* 612 next_nid = hstate_next_node(h);
235 * Use a helper variable to find the next node and then 613 } while (!page && h->hugetlb_next_nid != start_nid);
236 * copy it back to hugetlb_next_nid afterwards:
237 * otherwise there's a window in which a racer might
238 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
239 * But we don't need to use a spin_lock here: it really
240 * doesn't matter if occasionally a racer chooses the
241 * same nid as we do. Move nid forward in the mask even
242 * if we just successfully allocated a hugepage so that
243 * the next caller gets hugepages on the next node.
244 */
245 next_nid = next_node(hugetlb_next_nid, node_online_map);
246 if (next_nid == MAX_NUMNODES)
247 next_nid = first_node(node_online_map);
248 hugetlb_next_nid = next_nid;
249 } while (!page && hugetlb_next_nid != start_nid);
250 614
251 if (ret) 615 if (ret)
252 count_vm_event(HTLB_BUDDY_PGALLOC); 616 count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -256,12 +620,15 @@ static int alloc_fresh_huge_page(void)
256 return ret; 620 return ret;
257} 621}
258 622
259static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, 623static struct page *alloc_buddy_huge_page(struct hstate *h,
260 unsigned long address) 624 struct vm_area_struct *vma, unsigned long address)
261{ 625{
262 struct page *page; 626 struct page *page;
263 unsigned int nid; 627 unsigned int nid;
264 628
629 if (h->order >= MAX_ORDER)
630 return NULL;
631
265 /* 632 /*
266 * Assume we will successfully allocate the surplus page to 633 * Assume we will successfully allocate the surplus page to
267 * prevent racing processes from causing the surplus to exceed 634 * prevent racing processes from causing the surplus to exceed
@@ -286,18 +653,23 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
286 * per-node value is checked there. 653 * per-node value is checked there.
287 */ 654 */
288 spin_lock(&hugetlb_lock); 655 spin_lock(&hugetlb_lock);
289 if (surplus_huge_pages >= nr_overcommit_huge_pages) { 656 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
290 spin_unlock(&hugetlb_lock); 657 spin_unlock(&hugetlb_lock);
291 return NULL; 658 return NULL;
292 } else { 659 } else {
293 nr_huge_pages++; 660 h->nr_huge_pages++;
294 surplus_huge_pages++; 661 h->surplus_huge_pages++;
295 } 662 }
296 spin_unlock(&hugetlb_lock); 663 spin_unlock(&hugetlb_lock);
297 664
298 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 665 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
299 __GFP_REPEAT|__GFP_NOWARN, 666 __GFP_REPEAT|__GFP_NOWARN,
300 HUGETLB_PAGE_ORDER); 667 huge_page_order(h));
668
669 if (page && arch_prepare_hugepage(page)) {
670 __free_pages(page, huge_page_order(h));
671 return NULL;
672 }
301 673
302 spin_lock(&hugetlb_lock); 674 spin_lock(&hugetlb_lock);
303 if (page) { 675 if (page) {
@@ -312,12 +684,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
312 /* 684 /*
313 * We incremented the global counters already 685 * We incremented the global counters already
314 */ 686 */
315 nr_huge_pages_node[nid]++; 687 h->nr_huge_pages_node[nid]++;
316 surplus_huge_pages_node[nid]++; 688 h->surplus_huge_pages_node[nid]++;
317 __count_vm_event(HTLB_BUDDY_PGALLOC); 689 __count_vm_event(HTLB_BUDDY_PGALLOC);
318 } else { 690 } else {
319 nr_huge_pages--; 691 h->nr_huge_pages--;
320 surplus_huge_pages--; 692 h->surplus_huge_pages--;
321 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 693 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
322 } 694 }
323 spin_unlock(&hugetlb_lock); 695 spin_unlock(&hugetlb_lock);
@@ -329,16 +701,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
329 * Increase the hugetlb pool such that it can accomodate a reservation 701 * Increase the hugetlb pool such that it can accomodate a reservation
330 * of size 'delta'. 702 * of size 'delta'.
331 */ 703 */
332static int gather_surplus_pages(int delta) 704static int gather_surplus_pages(struct hstate *h, int delta)
333{ 705{
334 struct list_head surplus_list; 706 struct list_head surplus_list;
335 struct page *page, *tmp; 707 struct page *page, *tmp;
336 int ret, i; 708 int ret, i;
337 int needed, allocated; 709 int needed, allocated;
338 710
339 needed = (resv_huge_pages + delta) - free_huge_pages; 711 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
340 if (needed <= 0) { 712 if (needed <= 0) {
341 resv_huge_pages += delta; 713 h->resv_huge_pages += delta;
342 return 0; 714 return 0;
343 } 715 }
344 716
@@ -349,7 +721,7 @@ static int gather_surplus_pages(int delta)
349retry: 721retry:
350 spin_unlock(&hugetlb_lock); 722 spin_unlock(&hugetlb_lock);
351 for (i = 0; i < needed; i++) { 723 for (i = 0; i < needed; i++) {
352 page = alloc_buddy_huge_page(NULL, 0); 724 page = alloc_buddy_huge_page(h, NULL, 0);
353 if (!page) { 725 if (!page) {
354 /* 726 /*
355 * We were not able to allocate enough pages to 727 * We were not able to allocate enough pages to
@@ -370,7 +742,8 @@ retry:
370 * because either resv_huge_pages or free_huge_pages may have changed. 742 * because either resv_huge_pages or free_huge_pages may have changed.
371 */ 743 */
372 spin_lock(&hugetlb_lock); 744 spin_lock(&hugetlb_lock);
373 needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); 745 needed = (h->resv_huge_pages + delta) -
746 (h->free_huge_pages + allocated);
374 if (needed > 0) 747 if (needed > 0)
375 goto retry; 748 goto retry;
376 749
@@ -383,7 +756,7 @@ retry:
383 * before they are reserved. 756 * before they are reserved.
384 */ 757 */
385 needed += allocated; 758 needed += allocated;
386 resv_huge_pages += delta; 759 h->resv_huge_pages += delta;
387 ret = 0; 760 ret = 0;
388free: 761free:
389 /* Free the needed pages to the hugetlb pool */ 762 /* Free the needed pages to the hugetlb pool */
@@ -391,7 +764,7 @@ free:
391 if ((--needed) < 0) 764 if ((--needed) < 0)
392 break; 765 break;
393 list_del(&page->lru); 766 list_del(&page->lru);
394 enqueue_huge_page(page); 767 enqueue_huge_page(h, page);
395 } 768 }
396 769
397 /* Free unnecessary surplus pages to the buddy allocator */ 770 /* Free unnecessary surplus pages to the buddy allocator */
@@ -419,7 +792,8 @@ free:
419 * allocated to satisfy the reservation must be explicitly freed if they were 792 * allocated to satisfy the reservation must be explicitly freed if they were
420 * never used. 793 * never used.
421 */ 794 */
422static void return_unused_surplus_pages(unsigned long unused_resv_pages) 795static void return_unused_surplus_pages(struct hstate *h,
796 unsigned long unused_resv_pages)
423{ 797{
424 static int nid = -1; 798 static int nid = -1;
425 struct page *page; 799 struct page *page;
@@ -434,157 +808,269 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
434 unsigned long remaining_iterations = num_online_nodes(); 808 unsigned long remaining_iterations = num_online_nodes();
435 809
436 /* Uncommit the reservation */ 810 /* Uncommit the reservation */
437 resv_huge_pages -= unused_resv_pages; 811 h->resv_huge_pages -= unused_resv_pages;
438 812
439 nr_pages = min(unused_resv_pages, surplus_huge_pages); 813 /* Cannot return gigantic pages currently */
814 if (h->order >= MAX_ORDER)
815 return;
816
817 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
440 818
441 while (remaining_iterations-- && nr_pages) { 819 while (remaining_iterations-- && nr_pages) {
442 nid = next_node(nid, node_online_map); 820 nid = next_node(nid, node_online_map);
443 if (nid == MAX_NUMNODES) 821 if (nid == MAX_NUMNODES)
444 nid = first_node(node_online_map); 822 nid = first_node(node_online_map);
445 823
446 if (!surplus_huge_pages_node[nid]) 824 if (!h->surplus_huge_pages_node[nid])
447 continue; 825 continue;
448 826
449 if (!list_empty(&hugepage_freelists[nid])) { 827 if (!list_empty(&h->hugepage_freelists[nid])) {
450 page = list_entry(hugepage_freelists[nid].next, 828 page = list_entry(h->hugepage_freelists[nid].next,
451 struct page, lru); 829 struct page, lru);
452 list_del(&page->lru); 830 list_del(&page->lru);
453 update_and_free_page(page); 831 update_and_free_page(h, page);
454 free_huge_pages--; 832 h->free_huge_pages--;
455 free_huge_pages_node[nid]--; 833 h->free_huge_pages_node[nid]--;
456 surplus_huge_pages--; 834 h->surplus_huge_pages--;
457 surplus_huge_pages_node[nid]--; 835 h->surplus_huge_pages_node[nid]--;
458 nr_pages--; 836 nr_pages--;
459 remaining_iterations = num_online_nodes(); 837 remaining_iterations = num_online_nodes();
460 } 838 }
461 } 839 }
462} 840}
463 841
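
return_unused_surplus_pages() spreads the trimming across nodes with a static cursor so no single node is drained first. A rough userspace model of that round-robin (the node mask and surplus counts below are invented):

/* Sketch of the node round-robin used when trimming surplus pages:
 * a cursor walks the nodes and wraps, returning at most one surplus
 * page per node per pass.  The online map is a plain array here. */
#include <stdio.h>

#define NR_NODES 4

int main(void)
{
	int online[NR_NODES] = { 1, 1, 0, 1 };	/* node 2 offline (assumed) */
	int surplus[NR_NODES] = { 0, 2, 0, 1 };	/* surplus pages per node */
	int nid = -1, remaining = NR_NODES, to_return = 3;

	while (remaining-- && to_return) {
		nid = (nid + 1) % NR_NODES;	/* next_node with wrap-around */
		if (!online[nid] || !surplus[nid])
			continue;

		surplus[nid]--;			/* free one page from this node */
		to_return--;
		remaining = NR_NODES;		/* progress made: restart count */
		printf("returned a surplus page from node %d\n", nid);
	}
	return 0;
}
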
842/*
843 * Determine if the huge page at addr within the vma has an associated
 844 * reservation. Where it does not, we will need to logically increase the
845 * reservation and actually increase quota before an allocation can occur.
846 * Where any new reservation would be required the reservation change is
 847 * prepared, but not committed. Once the page has been quota'd, allocated
 848 * and instantiated, the change should be committed via vma_commit_reservation.
849 * No action is required on failure.
850 */
851static int vma_needs_reservation(struct hstate *h,
852 struct vm_area_struct *vma, unsigned long addr)
853{
854 struct address_space *mapping = vma->vm_file->f_mapping;
855 struct inode *inode = mapping->host;
856
857 if (vma->vm_flags & VM_SHARED) {
858 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
859 return region_chg(&inode->i_mapping->private_list,
860 idx, idx + 1);
464 861
465static struct page *alloc_huge_page_shared(struct vm_area_struct *vma, 862 } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
466 unsigned long addr) 863 return 1;
864
865 } else {
866 int err;
867 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
868 struct resv_map *reservations = vma_resv_map(vma);
869
870 err = region_chg(&reservations->regions, idx, idx + 1);
871 if (err < 0)
872 return err;
873 return 0;
874 }
875}
876static void vma_commit_reservation(struct hstate *h,
877 struct vm_area_struct *vma, unsigned long addr)
467{ 878{
468 struct page *page; 879 struct address_space *mapping = vma->vm_file->f_mapping;
880 struct inode *inode = mapping->host;
469 881
470 spin_lock(&hugetlb_lock); 882 if (vma->vm_flags & VM_SHARED) {
471 page = dequeue_huge_page_vma(vma, addr); 883 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
472 spin_unlock(&hugetlb_lock); 884 region_add(&inode->i_mapping->private_list, idx, idx + 1);
473 return page ? page : ERR_PTR(-VM_FAULT_OOM); 885
886 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
887 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
888 struct resv_map *reservations = vma_resv_map(vma);
889
890 /* Mark this page used in the map. */
891 region_add(&reservations->regions, idx, idx + 1);
892 }
474} 893}
475 894
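
The new helpers split reservation handling into a prepare step (vma_needs_reservation) and a commit step (vma_commit_reservation), with nothing to undo on failure. A sketch of the ordering alloc_huge_page() follows, with stubs standing in for the kernel helpers:

/* Sketch only: charge is computed first, quota taken for the uncovered
 * part, the page allocated, and the reservation committed last. */
#include <stdio.h>
#include <stdlib.h>

static int vma_needs_reservation_stub(void)   { return 1; }  /* 1 page uncovered */
static int get_quota_stub(int pages)          { return 0; }  /* 0 on success */
static void *dequeue_or_buddy_alloc_stub(void){ return malloc(1); }
static void vma_commit_reservation_stub(void) { puts("reservation committed"); }

int main(void)
{
	int chg = vma_needs_reservation_stub();
	void *page;

	if (chg && get_quota_stub(chg))
		return 1;			/* -ENOSPC in the kernel */

	page = dequeue_or_buddy_alloc_stub();
	if (!page)
		return 1;			/* quota would be released here */

	vma_commit_reservation_stub();		/* only after the page exists */
	free(page);
	return 0;
}

Committing only after the page exists is what keeps the failure paths above safe: an aborted allocation leaves the reservation map untouched.
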
476static struct page *alloc_huge_page_private(struct vm_area_struct *vma, 895static struct page *alloc_huge_page(struct vm_area_struct *vma,
477 unsigned long addr) 896 unsigned long addr, int avoid_reserve)
478{ 897{
479 struct page *page = NULL; 898 struct hstate *h = hstate_vma(vma);
899 struct page *page;
900 struct address_space *mapping = vma->vm_file->f_mapping;
901 struct inode *inode = mapping->host;
902 unsigned int chg;
480 903
481 if (hugetlb_get_quota(vma->vm_file->f_mapping, 1)) 904 /*
482 return ERR_PTR(-VM_FAULT_SIGBUS); 905 * Processes that did not create the mapping will have no reserves and
906 * will not have accounted against quota. Check that the quota can be
907 * made before satisfying the allocation
908 * MAP_NORESERVE mappings may also need pages and quota allocated
909 * if no reserve mapping overlaps.
910 */
911 chg = vma_needs_reservation(h, vma, addr);
912 if (chg < 0)
913 return ERR_PTR(chg);
914 if (chg)
915 if (hugetlb_get_quota(inode->i_mapping, chg))
916 return ERR_PTR(-ENOSPC);
483 917
484 spin_lock(&hugetlb_lock); 918 spin_lock(&hugetlb_lock);
485 if (free_huge_pages > resv_huge_pages) 919 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
486 page = dequeue_huge_page_vma(vma, addr);
487 spin_unlock(&hugetlb_lock); 920 spin_unlock(&hugetlb_lock);
921
488 if (!page) { 922 if (!page) {
489 page = alloc_buddy_huge_page(vma, addr); 923 page = alloc_buddy_huge_page(h, vma, addr);
490 if (!page) { 924 if (!page) {
491 hugetlb_put_quota(vma->vm_file->f_mapping, 1); 925 hugetlb_put_quota(inode->i_mapping, chg);
492 return ERR_PTR(-VM_FAULT_OOM); 926 return ERR_PTR(-VM_FAULT_OOM);
493 } 927 }
494 } 928 }
929
930 set_page_refcounted(page);
931 set_page_private(page, (unsigned long) mapping);
932
933 vma_commit_reservation(h, vma, addr);
934
495 return page; 935 return page;
496} 936}
497 937
498static struct page *alloc_huge_page(struct vm_area_struct *vma, 938__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
499 unsigned long addr)
500{ 939{
501 struct page *page; 940 struct huge_bootmem_page *m;
502 struct address_space *mapping = vma->vm_file->f_mapping; 941 int nr_nodes = nodes_weight(node_online_map);
503 942
504 if (vma->vm_flags & VM_MAYSHARE) 943 while (nr_nodes) {
505 page = alloc_huge_page_shared(vma, addr); 944 void *addr;
506 else 945
507 page = alloc_huge_page_private(vma, addr); 946 addr = __alloc_bootmem_node_nopanic(
947 NODE_DATA(h->hugetlb_next_nid),
948 huge_page_size(h), huge_page_size(h), 0);
508 949
509 if (!IS_ERR(page)) { 950 if (addr) {
510 set_page_refcounted(page); 951 /*
511 set_page_private(page, (unsigned long) mapping); 952 * Use the beginning of the huge page to store the
953 * huge_bootmem_page struct (until gather_bootmem
954 * puts them into the mem_map).
955 */
956 m = addr;
957 if (m)
958 goto found;
959 }
960 hstate_next_node(h);
961 nr_nodes--;
512 } 962 }
513 return page; 963 return 0;
964
965found:
966 BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
967 /* Put them into a private list first because mem_map is not up yet */
968 list_add(&m->list, &huge_boot_pages);
969 m->hstate = h;
970 return 1;
514} 971}
515 972
516static int __init hugetlb_init(void) 973/* Put bootmem huge pages into the standard lists after mem_map is up */
974static void __init gather_bootmem_prealloc(void)
517{ 975{
518 unsigned long i; 976 struct huge_bootmem_page *m;
519 977
520 if (HPAGE_SHIFT == 0) 978 list_for_each_entry(m, &huge_boot_pages, list) {
521 return 0; 979 struct page *page = virt_to_page(m);
522 980 struct hstate *h = m->hstate;
523 for (i = 0; i < MAX_NUMNODES; ++i) 981 __ClearPageReserved(page);
524 INIT_LIST_HEAD(&hugepage_freelists[i]); 982 WARN_ON(page_count(page) != 1);
983 prep_compound_page(page, h->order);
984 prep_new_huge_page(h, page, page_to_nid(page));
985 }
986}
525 987
526 hugetlb_next_nid = first_node(node_online_map); 988static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
989{
990 unsigned long i;
527 991
528 for (i = 0; i < max_huge_pages; ++i) { 992 for (i = 0; i < h->max_huge_pages; ++i) {
529 if (!alloc_fresh_huge_page()) 993 if (h->order >= MAX_ORDER) {
994 if (!alloc_bootmem_huge_page(h))
995 break;
996 } else if (!alloc_fresh_huge_page(h))
530 break; 997 break;
531 } 998 }
532 max_huge_pages = free_huge_pages = nr_huge_pages = i; 999 h->max_huge_pages = i;
533 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
534 return 0;
535} 1000}
536module_init(hugetlb_init);
537 1001
538static int __init hugetlb_setup(char *s) 1002static void __init hugetlb_init_hstates(void)
539{ 1003{
540 if (sscanf(s, "%lu", &max_huge_pages) <= 0) 1004 struct hstate *h;
541 max_huge_pages = 0; 1005
542 return 1; 1006 for_each_hstate(h) {
1007 /* oversize hugepages were init'ed in early boot */
1008 if (h->order < MAX_ORDER)
1009 hugetlb_hstate_alloc_pages(h);
1010 }
543} 1011}
544__setup("hugepages=", hugetlb_setup);
545 1012
546static unsigned int cpuset_mems_nr(unsigned int *array) 1013static char * __init memfmt(char *buf, unsigned long n)
547{ 1014{
548 int node; 1015 if (n >= (1UL << 30))
549 unsigned int nr = 0; 1016 sprintf(buf, "%lu GB", n >> 30);
550 1017 else if (n >= (1UL << 20))
551 for_each_node_mask(node, cpuset_current_mems_allowed) 1018 sprintf(buf, "%lu MB", n >> 20);
552 nr += array[node]; 1019 else
1020 sprintf(buf, "%lu KB", n >> 10);
1021 return buf;
1022}
553 1023
554 return nr; 1024static void __init report_hugepages(void)
1025{
1026 struct hstate *h;
1027
1028 for_each_hstate(h) {
1029 char buf[32];
1030 printk(KERN_INFO "HugeTLB registered %s page size, "
1031 "pre-allocated %ld pages\n",
1032 memfmt(buf, huge_page_size(h)),
1033 h->free_huge_pages);
1034 }
555} 1035}
556 1036
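
memfmt() and report_hugepages() above produce the boot-time banner for each registered page size. A userspace copy of memfmt(), fed the common 2 MB and 1 GB sizes (example values, not probed from hardware):

/* Userspace copy of memfmt() from the hunk above, just to show the
 * strings the "HugeTLB registered ... page size" banner would carry. */
#include <stdio.h>

static char *memfmt(char *buf, unsigned long n)
{
	if (n >= (1UL << 30))
		sprintf(buf, "%lu GB", n >> 30);
	else if (n >= (1UL << 20))
		sprintf(buf, "%lu MB", n >> 20);
	else
		sprintf(buf, "%lu KB", n >> 10);
	return buf;
}

int main(void)
{
	char buf[32];

	printf("%s\n", memfmt(buf, 2UL << 20));		/* prints "2 MB" */
	printf("%s\n", memfmt(buf, 1UL << 30));		/* prints "1 GB" */
	return 0;
}
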
557#ifdef CONFIG_SYSCTL
558#ifdef CONFIG_HIGHMEM 1037#ifdef CONFIG_HIGHMEM
559static void try_to_free_low(unsigned long count) 1038static void try_to_free_low(struct hstate *h, unsigned long count)
560{ 1039{
561 int i; 1040 int i;
562 1041
1042 if (h->order >= MAX_ORDER)
1043 return;
1044
563 for (i = 0; i < MAX_NUMNODES; ++i) { 1045 for (i = 0; i < MAX_NUMNODES; ++i) {
564 struct page *page, *next; 1046 struct page *page, *next;
565 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { 1047 struct list_head *freel = &h->hugepage_freelists[i];
566 if (count >= nr_huge_pages) 1048 list_for_each_entry_safe(page, next, freel, lru) {
1049 if (count >= h->nr_huge_pages)
567 return; 1050 return;
568 if (PageHighMem(page)) 1051 if (PageHighMem(page))
569 continue; 1052 continue;
570 list_del(&page->lru); 1053 list_del(&page->lru);
571 update_and_free_page(page); 1054 update_and_free_page(h, page);
572 free_huge_pages--; 1055 h->free_huge_pages--;
573 free_huge_pages_node[page_to_nid(page)]--; 1056 h->free_huge_pages_node[page_to_nid(page)]--;
574 } 1057 }
575 } 1058 }
576} 1059}
577#else 1060#else
578static inline void try_to_free_low(unsigned long count) 1061static inline void try_to_free_low(struct hstate *h, unsigned long count)
579{ 1062{
580} 1063}
581#endif 1064#endif
582 1065
583#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) 1066#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
584static unsigned long set_max_huge_pages(unsigned long count) 1067static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
585{ 1068{
586 unsigned long min_count, ret; 1069 unsigned long min_count, ret;
587 1070
1071 if (h->order >= MAX_ORDER)
1072 return h->max_huge_pages;
1073
588 /* 1074 /*
589 * Increase the pool size 1075 * Increase the pool size
590 * First take pages out of surplus state. Then make up the 1076 * First take pages out of surplus state. Then make up the
@@ -597,20 +1083,19 @@ static unsigned long set_max_huge_pages(unsigned long count)
597 * within all the constraints specified by the sysctls. 1083 * within all the constraints specified by the sysctls.
598 */ 1084 */
599 spin_lock(&hugetlb_lock); 1085 spin_lock(&hugetlb_lock);
600 while (surplus_huge_pages && count > persistent_huge_pages) { 1086 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
601 if (!adjust_pool_surplus(-1)) 1087 if (!adjust_pool_surplus(h, -1))
602 break; 1088 break;
603 } 1089 }
604 1090
605 while (count > persistent_huge_pages) { 1091 while (count > persistent_huge_pages(h)) {
606 int ret;
607 /* 1092 /*
608 * If this allocation races such that we no longer need the 1093 * If this allocation races such that we no longer need the
609 * page, free_huge_page will handle it by freeing the page 1094 * page, free_huge_page will handle it by freeing the page
610 * and reducing the surplus. 1095 * and reducing the surplus.
611 */ 1096 */
612 spin_unlock(&hugetlb_lock); 1097 spin_unlock(&hugetlb_lock);
613 ret = alloc_fresh_huge_page(); 1098 ret = alloc_fresh_huge_page(h);
614 spin_lock(&hugetlb_lock); 1099 spin_lock(&hugetlb_lock);
615 if (!ret) 1100 if (!ret)
616 goto out; 1101 goto out;
@@ -632,31 +1117,305 @@ static unsigned long set_max_huge_pages(unsigned long count)
632 * and won't grow the pool anywhere else. Not until one of the 1117 * and won't grow the pool anywhere else. Not until one of the
633 * sysctls are changed, or the surplus pages go out of use. 1118 * sysctls are changed, or the surplus pages go out of use.
634 */ 1119 */
635 min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; 1120 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
636 min_count = max(count, min_count); 1121 min_count = max(count, min_count);
637 try_to_free_low(min_count); 1122 try_to_free_low(h, min_count);
638 while (min_count < persistent_huge_pages) { 1123 while (min_count < persistent_huge_pages(h)) {
639 struct page *page = dequeue_huge_page(); 1124 struct page *page = dequeue_huge_page(h);
640 if (!page) 1125 if (!page)
641 break; 1126 break;
642 update_and_free_page(page); 1127 update_and_free_page(h, page);
643 } 1128 }
644 while (count < persistent_huge_pages) { 1129 while (count < persistent_huge_pages(h)) {
645 if (!adjust_pool_surplus(1)) 1130 if (!adjust_pool_surplus(h, 1))
646 break; 1131 break;
647 } 1132 }
648out: 1133out:
649 ret = persistent_huge_pages; 1134 ret = persistent_huge_pages(h);
650 spin_unlock(&hugetlb_lock); 1135 spin_unlock(&hugetlb_lock);
651 return ret; 1136 return ret;
652} 1137}
653 1138
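
When shrinking the pool, set_max_huge_pages() must never free pages that are in use or reserved, hence the min_count clamp. The arithmetic in isolation, with invented counters:

/* Sketch of the shrink clamp: the target is raised to at least
 * resv + (nr - free), i.e. reservations plus pages currently in use. */
#include <stdio.h>

int main(void)
{
	unsigned long nr_huge_pages = 10, free_huge_pages = 4, resv_huge_pages = 2;
	unsigned long count = 1;		/* admin asked for a pool of 1 */
	unsigned long min_count;

	/* in-use pages (nr - free) plus outstanding reservations */
	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
	if (count < min_count)
		count = min_count;		/* max(count, min_count) */

	printf("pool can only shrink to %lu pages\n", count);	/* 8 here */
	return 0;
}
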
1139#define HSTATE_ATTR_RO(_name) \
1140 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1141
1142#define HSTATE_ATTR(_name) \
1143 static struct kobj_attribute _name##_attr = \
1144 __ATTR(_name, 0644, _name##_show, _name##_store)
1145
1146static struct kobject *hugepages_kobj;
1147static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1148
1149static struct hstate *kobj_to_hstate(struct kobject *kobj)
1150{
1151 int i;
1152 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1153 if (hstate_kobjs[i] == kobj)
1154 return &hstates[i];
1155 BUG();
1156 return NULL;
1157}
1158
1159static ssize_t nr_hugepages_show(struct kobject *kobj,
1160 struct kobj_attribute *attr, char *buf)
1161{
1162 struct hstate *h = kobj_to_hstate(kobj);
1163 return sprintf(buf, "%lu\n", h->nr_huge_pages);
1164}
1165static ssize_t nr_hugepages_store(struct kobject *kobj,
1166 struct kobj_attribute *attr, const char *buf, size_t count)
1167{
1168 int err;
1169 unsigned long input;
1170 struct hstate *h = kobj_to_hstate(kobj);
1171
1172 err = strict_strtoul(buf, 10, &input);
1173 if (err)
1174 return 0;
1175
1176 h->max_huge_pages = set_max_huge_pages(h, input);
1177
1178 return count;
1179}
1180HSTATE_ATTR(nr_hugepages);
1181
1182static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1183 struct kobj_attribute *attr, char *buf)
1184{
1185 struct hstate *h = kobj_to_hstate(kobj);
1186 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1187}
1188static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1189 struct kobj_attribute *attr, const char *buf, size_t count)
1190{
1191 int err;
1192 unsigned long input;
1193 struct hstate *h = kobj_to_hstate(kobj);
1194
1195 err = strict_strtoul(buf, 10, &input);
1196 if (err)
1197 return 0;
1198
1199 spin_lock(&hugetlb_lock);
1200 h->nr_overcommit_huge_pages = input;
1201 spin_unlock(&hugetlb_lock);
1202
1203 return count;
1204}
1205HSTATE_ATTR(nr_overcommit_hugepages);
1206
1207static ssize_t free_hugepages_show(struct kobject *kobj,
1208 struct kobj_attribute *attr, char *buf)
1209{
1210 struct hstate *h = kobj_to_hstate(kobj);
1211 return sprintf(buf, "%lu\n", h->free_huge_pages);
1212}
1213HSTATE_ATTR_RO(free_hugepages);
1214
1215static ssize_t resv_hugepages_show(struct kobject *kobj,
1216 struct kobj_attribute *attr, char *buf)
1217{
1218 struct hstate *h = kobj_to_hstate(kobj);
1219 return sprintf(buf, "%lu\n", h->resv_huge_pages);
1220}
1221HSTATE_ATTR_RO(resv_hugepages);
1222
1223static ssize_t surplus_hugepages_show(struct kobject *kobj,
1224 struct kobj_attribute *attr, char *buf)
1225{
1226 struct hstate *h = kobj_to_hstate(kobj);
1227 return sprintf(buf, "%lu\n", h->surplus_huge_pages);
1228}
1229HSTATE_ATTR_RO(surplus_hugepages);
1230
1231static struct attribute *hstate_attrs[] = {
1232 &nr_hugepages_attr.attr,
1233 &nr_overcommit_hugepages_attr.attr,
1234 &free_hugepages_attr.attr,
1235 &resv_hugepages_attr.attr,
1236 &surplus_hugepages_attr.attr,
1237 NULL,
1238};
1239
1240static struct attribute_group hstate_attr_group = {
1241 .attrs = hstate_attrs,
1242};
1243
1244static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
1245{
1246 int retval;
1247
1248 hstate_kobjs[h - hstates] = kobject_create_and_add(h->name,
1249 hugepages_kobj);
1250 if (!hstate_kobjs[h - hstates])
1251 return -ENOMEM;
1252
1253 retval = sysfs_create_group(hstate_kobjs[h - hstates],
1254 &hstate_attr_group);
1255 if (retval)
1256 kobject_put(hstate_kobjs[h - hstates]);
1257
1258 return retval;
1259}
1260
1261static void __init hugetlb_sysfs_init(void)
1262{
1263 struct hstate *h;
1264 int err;
1265
1266 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
1267 if (!hugepages_kobj)
1268 return;
1269
1270 for_each_hstate(h) {
1271 err = hugetlb_sysfs_add_hstate(h);
1272 if (err)
1273 printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
1274 h->name);
1275 }
1276}
1277
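
The per-hstate sysfs directories registered above can be read like any other attribute file; on x86 the 2 MB hstate usually shows up as hugepages-2048kB, though the name depends on which page sizes the architecture registered. A small reader (the path is only an example):

/* Reads the nr_hugepages attribute created by hugetlb_sysfs_add_hstate().
 * "hugepages-2048kB" is the common x86 2 MB hstate, used here as an
 * example directory name. */
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages";
	unsigned long nr;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);		/* kernel too old or no such hstate */
		return 1;
	}
	if (fscanf(f, "%lu", &nr) == 1)
		printf("%lu huge pages in the pool\n", nr);
	fclose(f);
	return 0;
}
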
1278static void __exit hugetlb_exit(void)
1279{
1280 struct hstate *h;
1281
1282 for_each_hstate(h) {
1283 kobject_put(hstate_kobjs[h - hstates]);
1284 }
1285
1286 kobject_put(hugepages_kobj);
1287}
1288module_exit(hugetlb_exit);
1289
1290static int __init hugetlb_init(void)
1291{
 1292 /* Some platforms decide whether they support huge pages at boot
 1293 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
 1294 * there is no such support.
1295 */
1296 if (HPAGE_SHIFT == 0)
1297 return 0;
1298
1299 if (!size_to_hstate(default_hstate_size)) {
1300 default_hstate_size = HPAGE_SIZE;
1301 if (!size_to_hstate(default_hstate_size))
1302 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1303 }
1304 default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
1305 if (default_hstate_max_huge_pages)
1306 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1307
1308 hugetlb_init_hstates();
1309
1310 gather_bootmem_prealloc();
1311
1312 report_hugepages();
1313
1314 hugetlb_sysfs_init();
1315
1316 return 0;
1317}
1318module_init(hugetlb_init);
1319
1320/* Should be called on processing a hugepagesz=... option */
1321void __init hugetlb_add_hstate(unsigned order)
1322{
1323 struct hstate *h;
1324 unsigned long i;
1325
1326 if (size_to_hstate(PAGE_SIZE << order)) {
1327 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
1328 return;
1329 }
1330 BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
1331 BUG_ON(order == 0);
1332 h = &hstates[max_hstate++];
1333 h->order = order;
1334 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
1335 h->nr_huge_pages = 0;
1336 h->free_huge_pages = 0;
1337 for (i = 0; i < MAX_NUMNODES; ++i)
1338 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1339 h->hugetlb_next_nid = first_node(node_online_map);
1340 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1341 huge_page_size(h)/1024);
1342
1343 parsed_hstate = h;
1344}
1345
1346static int __init hugetlb_nrpages_setup(char *s)
1347{
1348 unsigned long *mhp;
1349 static unsigned long *last_mhp;
1350
1351 /*
1352 * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
1353 * so this hugepages= parameter goes to the "default hstate".
1354 */
1355 if (!max_hstate)
1356 mhp = &default_hstate_max_huge_pages;
1357 else
1358 mhp = &parsed_hstate->max_huge_pages;
1359
1360 if (mhp == last_mhp) {
1361 printk(KERN_WARNING "hugepages= specified twice without "
1362 "interleaving hugepagesz=, ignoring\n");
1363 return 1;
1364 }
1365
1366 if (sscanf(s, "%lu", mhp) <= 0)
1367 *mhp = 0;
1368
1369 /*
1370 * Global state is always initialized later in hugetlb_init.
1371 * But we need to allocate >= MAX_ORDER hstates here early to still
1372 * use the bootmem allocator.
1373 */
1374 if (max_hstate && parsed_hstate->order >= MAX_ORDER)
1375 hugetlb_hstate_alloc_pages(parsed_hstate);
1376
1377 last_mhp = mhp;
1378
1379 return 1;
1380}
1381__setup("hugepages=", hugetlb_nrpages_setup);
1382
1383static int __init hugetlb_default_setup(char *s)
1384{
1385 default_hstate_size = memparse(s, &s);
1386 return 1;
1387}
1388__setup("default_hugepagesz=", hugetlb_default_setup);
1389
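
hugepagesz= and hugepages= pair up positionally: each count applies to the most recently parsed size, and a repeated hugepages= without a new hugepagesz= in between is dropped by the last_mhp check. A naive userspace parser of an example command line (the options shown are invented):

/* Sketch of the pairing rule only; real parsing uses memparse() and
 * the hstate machinery above. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* example command line, not taken from a real system */
	const char *opts[] = { "hugepagesz=1G", "hugepages=4",
			       "hugepagesz=2M", "hugepages=512",
			       "hugepages=9" /* ignored: no new hugepagesz= */ };
	const char *cur_size = "default";	/* no hugepagesz= seen yet */
	const char *last_counted = NULL;	/* plays the role of last_mhp */
	int i;

	for (i = 0; i < 5; i++) {
		if (!strncmp(opts[i], "hugepagesz=", 11)) {
			cur_size = opts[i] + 11;
		} else if (!strncmp(opts[i], "hugepages=", 10)) {
			if (cur_size == last_counted) {
				printf("hugepages= specified twice, ignoring\n");
				continue;
			}
			printf("%s pages of size %s\n",
			       opts[i] + 10, cur_size);
			last_counted = cur_size;
		}
	}
	return 0;
}
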
1390static unsigned int cpuset_mems_nr(unsigned int *array)
1391{
1392 int node;
1393 unsigned int nr = 0;
1394
1395 for_each_node_mask(node, cpuset_current_mems_allowed)
1396 nr += array[node];
1397
1398 return nr;
1399}
1400
1401#ifdef CONFIG_SYSCTL
654int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1402int hugetlb_sysctl_handler(struct ctl_table *table, int write,
655 struct file *file, void __user *buffer, 1403 struct file *file, void __user *buffer,
656 size_t *length, loff_t *ppos) 1404 size_t *length, loff_t *ppos)
657{ 1405{
1406 struct hstate *h = &default_hstate;
1407 unsigned long tmp;
1408
1409 if (!write)
1410 tmp = h->max_huge_pages;
1411
1412 table->data = &tmp;
1413 table->maxlen = sizeof(unsigned long);
658 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1414 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
659 max_huge_pages = set_max_huge_pages(max_huge_pages); 1415
1416 if (write)
1417 h->max_huge_pages = set_max_huge_pages(h, tmp);
1418
660 return 0; 1419 return 0;
661} 1420}
662 1421
@@ -676,45 +1435,141 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
676 struct file *file, void __user *buffer, 1435 struct file *file, void __user *buffer,
677 size_t *length, loff_t *ppos) 1436 size_t *length, loff_t *ppos)
678{ 1437{
1438 struct hstate *h = &default_hstate;
1439 unsigned long tmp;
1440
1441 if (!write)
1442 tmp = h->nr_overcommit_huge_pages;
1443
1444 table->data = &tmp;
1445 table->maxlen = sizeof(unsigned long);
679 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1446 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
680 spin_lock(&hugetlb_lock); 1447
681 nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; 1448 if (write) {
682 spin_unlock(&hugetlb_lock); 1449 spin_lock(&hugetlb_lock);
1450 h->nr_overcommit_huge_pages = tmp;
1451 spin_unlock(&hugetlb_lock);
1452 }
1453
683 return 0; 1454 return 0;
684} 1455}
685 1456
686#endif /* CONFIG_SYSCTL */ 1457#endif /* CONFIG_SYSCTL */
687 1458
688int hugetlb_report_meminfo(char *buf) 1459void hugetlb_report_meminfo(struct seq_file *m)
689{ 1460{
690 return sprintf(buf, 1461 struct hstate *h = &default_hstate;
691 "HugePages_Total: %5lu\n" 1462 seq_printf(m,
692 "HugePages_Free: %5lu\n" 1463 "HugePages_Total: %5lu\n"
693 "HugePages_Rsvd: %5lu\n" 1464 "HugePages_Free: %5lu\n"
694 "HugePages_Surp: %5lu\n" 1465 "HugePages_Rsvd: %5lu\n"
695 "Hugepagesize: %5lu kB\n", 1466 "HugePages_Surp: %5lu\n"
696 nr_huge_pages, 1467 "Hugepagesize: %8lu kB\n",
697 free_huge_pages, 1468 h->nr_huge_pages,
698 resv_huge_pages, 1469 h->free_huge_pages,
699 surplus_huge_pages, 1470 h->resv_huge_pages,
700 HPAGE_SIZE/1024); 1471 h->surplus_huge_pages,
1472 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
701} 1473}
702 1474
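
hugetlb_report_meminfo() now writes the HugePages_* lines through seq_file. A trivial reader that filters those lines back out of /proc/meminfo:

/* Prints the HugePages_* and Hugepagesize lines from /proc/meminfo. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f) {
		perror("/proc/meminfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "HugePages_", 10) ||
		    !strncmp(line, "Hugepagesize", 12))
			fputs(line, stdout);
	fclose(f);
	return 0;
}
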
703int hugetlb_report_node_meminfo(int nid, char *buf) 1475int hugetlb_report_node_meminfo(int nid, char *buf)
704{ 1476{
1477 struct hstate *h = &default_hstate;
705 return sprintf(buf, 1478 return sprintf(buf,
706 "Node %d HugePages_Total: %5u\n" 1479 "Node %d HugePages_Total: %5u\n"
707 "Node %d HugePages_Free: %5u\n" 1480 "Node %d HugePages_Free: %5u\n"
708 "Node %d HugePages_Surp: %5u\n", 1481 "Node %d HugePages_Surp: %5u\n",
709 nid, nr_huge_pages_node[nid], 1482 nid, h->nr_huge_pages_node[nid],
710 nid, free_huge_pages_node[nid], 1483 nid, h->free_huge_pages_node[nid],
711 nid, surplus_huge_pages_node[nid]); 1484 nid, h->surplus_huge_pages_node[nid]);
712} 1485}
713 1486
714/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ 1487
715unsigned long hugetlb_total_pages(void) 1488unsigned long hugetlb_total_pages(void)
716{ 1489{
717 return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); 1490 struct hstate *h = &default_hstate;
1491 return h->nr_huge_pages * pages_per_huge_page(h);
1492}
1493
1494static int hugetlb_acct_memory(struct hstate *h, long delta)
1495{
1496 int ret = -ENOMEM;
1497
1498 spin_lock(&hugetlb_lock);
1499 /*
1500 * When cpuset is configured, it breaks the strict hugetlb page
1501 * reservation as the accounting is done on a global variable. Such
1502 * reservation is completely rubbish in the presence of cpuset because
1503 * the reservation is not checked against page availability for the
1504 * current cpuset. Application can still potentially OOM'ed by kernel
1505 * with lack of free htlb page in cpuset that the task is in.
1506 * Attempt to enforce strict accounting with cpuset is almost
1507 * impossible (or too ugly) because cpuset is too fluid that
1508 * task or memory node can be dynamically moved between cpusets.
1509 *
1510 * The change of semantics for shared hugetlb mapping with cpuset is
1511 * undesirable. However, in order to preserve some of the semantics,
1512 * we fall back to check against current free page availability as
1513 * a best attempt and hopefully to minimize the impact of changing
1514 * semantics that cpuset has.
1515 */
1516 if (delta > 0) {
1517 if (gather_surplus_pages(h, delta) < 0)
1518 goto out;
1519
1520 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
1521 return_unused_surplus_pages(h, delta);
1522 goto out;
1523 }
1524 }
1525
1526 ret = 0;
1527 if (delta < 0)
1528 return_unused_surplus_pages(h, (unsigned long) -delta);
1529
1530out:
1531 spin_unlock(&hugetlb_lock);
1532 return ret;
1533}
1534
1535static void hugetlb_vm_op_open(struct vm_area_struct *vma)
1536{
1537 struct resv_map *reservations = vma_resv_map(vma);
1538
1539 /*
 1540 * This new VMA should share its sibling's reservation map if present.
 1541 * The VMA will only ever have a valid reservation map pointer where
 1542 * it is being copied for another still existing VMA. As that VMA
 1543 * has a reference to the reservation map it cannot disappear until
1544 * after this open call completes. It is therefore safe to take a
1545 * new reference here without additional locking.
1546 */
1547 if (reservations)
1548 kref_get(&reservations->refs);
1549}
1550
1551static void hugetlb_vm_op_close(struct vm_area_struct *vma)
1552{
1553 struct hstate *h = hstate_vma(vma);
1554 struct resv_map *reservations = vma_resv_map(vma);
1555 unsigned long reserve;
1556 unsigned long start;
1557 unsigned long end;
1558
1559 if (reservations) {
1560 start = vma_hugecache_offset(h, vma, vma->vm_start);
1561 end = vma_hugecache_offset(h, vma, vma->vm_end);
1562
1563 reserve = (end - start) -
1564 region_count(&reservations->regions, start, end);
1565
1566 kref_put(&reservations->refs, resv_map_release);
1567
1568 if (reserve) {
1569 hugetlb_acct_memory(h, -reserve);
1570 hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
1571 }
1572 }
718} 1573}
719 1574
720/* 1575/*
@@ -731,6 +1586,8 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
731 1586
732struct vm_operations_struct hugetlb_vm_ops = { 1587struct vm_operations_struct hugetlb_vm_ops = {
733 .fault = hugetlb_vm_op_fault, 1588 .fault = hugetlb_vm_op_fault,
1589 .open = hugetlb_vm_op_open,
1590 .close = hugetlb_vm_op_close,
734}; 1591};
735 1592
736static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 1593static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -769,14 +1626,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
769 struct page *ptepage; 1626 struct page *ptepage;
770 unsigned long addr; 1627 unsigned long addr;
771 int cow; 1628 int cow;
1629 struct hstate *h = hstate_vma(vma);
1630 unsigned long sz = huge_page_size(h);
772 1631
773 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 1632 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
774 1633
775 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 1634 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
776 src_pte = huge_pte_offset(src, addr); 1635 src_pte = huge_pte_offset(src, addr);
777 if (!src_pte) 1636 if (!src_pte)
778 continue; 1637 continue;
779 dst_pte = huge_pte_alloc(dst, addr); 1638 dst_pte = huge_pte_alloc(dst, addr, sz);
780 if (!dst_pte) 1639 if (!dst_pte)
781 goto nomem; 1640 goto nomem;
782 1641
@@ -804,7 +1663,7 @@ nomem:
804} 1663}
805 1664
806void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 1665void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
807 unsigned long end) 1666 unsigned long end, struct page *ref_page)
808{ 1667{
809 struct mm_struct *mm = vma->vm_mm; 1668 struct mm_struct *mm = vma->vm_mm;
810 unsigned long address; 1669 unsigned long address;
@@ -812,6 +1671,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
812 pte_t pte; 1671 pte_t pte;
813 struct page *page; 1672 struct page *page;
814 struct page *tmp; 1673 struct page *tmp;
1674 struct hstate *h = hstate_vma(vma);
1675 unsigned long sz = huge_page_size(h);
1676
815 /* 1677 /*
816 * A page gathering list, protected by per file i_mmap_lock. The 1678 * A page gathering list, protected by per file i_mmap_lock. The
817 * lock is used to avoid list corruption from multiple unmapping 1679 * lock is used to avoid list corruption from multiple unmapping
@@ -820,11 +1682,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
820 LIST_HEAD(page_list); 1682 LIST_HEAD(page_list);
821 1683
822 WARN_ON(!is_vm_hugetlb_page(vma)); 1684 WARN_ON(!is_vm_hugetlb_page(vma));
823 BUG_ON(start & ~HPAGE_MASK); 1685 BUG_ON(start & ~huge_page_mask(h));
824 BUG_ON(end & ~HPAGE_MASK); 1686 BUG_ON(end & ~huge_page_mask(h));
825 1687
1688 mmu_notifier_invalidate_range_start(mm, start, end);
826 spin_lock(&mm->page_table_lock); 1689 spin_lock(&mm->page_table_lock);
827 for (address = start; address < end; address += HPAGE_SIZE) { 1690 for (address = start; address < end; address += sz) {
828 ptep = huge_pte_offset(mm, address); 1691 ptep = huge_pte_offset(mm, address);
829 if (!ptep) 1692 if (!ptep)
830 continue; 1693 continue;
@@ -832,6 +1695,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
832 if (huge_pmd_unshare(mm, &address, ptep)) 1695 if (huge_pmd_unshare(mm, &address, ptep))
833 continue; 1696 continue;
834 1697
1698 /*
1699 * If a reference page is supplied, it is because a specific
1700 * page is being unmapped, not a range. Ensure the page we
1701 * are about to unmap is the actual page of interest.
1702 */
1703 if (ref_page) {
1704 pte = huge_ptep_get(ptep);
1705 if (huge_pte_none(pte))
1706 continue;
1707 page = pte_page(pte);
1708 if (page != ref_page)
1709 continue;
1710
1711 /*
1712 * Mark the VMA as having unmapped its page so that
1713 * future faults in this VMA will fail rather than
1714 * looking like data was lost
1715 */
1716 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
1717 }
1718
835 pte = huge_ptep_get_and_clear(mm, address, ptep); 1719 pte = huge_ptep_get_and_clear(mm, address, ptep);
836 if (huge_pte_none(pte)) 1720 if (huge_pte_none(pte))
837 continue; 1721 continue;
@@ -843,6 +1727,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
843 } 1727 }
844 spin_unlock(&mm->page_table_lock); 1728 spin_unlock(&mm->page_table_lock);
845 flush_tlb_range(vma, start, end); 1729 flush_tlb_range(vma, start, end);
1730 mmu_notifier_invalidate_range_end(mm, start, end);
846 list_for_each_entry_safe(page, tmp, &page_list, lru) { 1731 list_for_each_entry_safe(page, tmp, &page_list, lru) {
847 list_del(&page->lru); 1732 list_del(&page->lru);
848 put_page(page); 1733 put_page(page);
@@ -850,31 +1735,69 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
850} 1735}
851 1736
852void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 1737void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
853 unsigned long end) 1738 unsigned long end, struct page *ref_page)
854{ 1739{
1740 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1741 __unmap_hugepage_range(vma, start, end, ref_page);
1742 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
1743}
1744
1745/*
1746 * This is called when the original mapper is failing to COW a MAP_PRIVATE
 1747 * mapping it owns the reserve page for. The intention is to unmap the page
1748 * from other VMAs and let the children be SIGKILLed if they are faulting the
1749 * same region.
1750 */
1751static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
1752 struct page *page, unsigned long address)
1753{
1754 struct vm_area_struct *iter_vma;
1755 struct address_space *mapping;
1756 struct prio_tree_iter iter;
1757 pgoff_t pgoff;
1758
855 /* 1759 /*
856 * It is undesirable to test vma->vm_file as it should be non-null 1760 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
857 * for valid hugetlb area. However, vm_file will be NULL in the error 1761 * from page cache lookup which is in HPAGE_SIZE units.
858 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
859 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
860 * to clean up. Since no pte has actually been setup, it is safe to
861 * do nothing in this case.
862 */ 1762 */
863 if (vma->vm_file) { 1763 address = address & huge_page_mask(hstate_vma(vma));
864 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 1764 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
865 __unmap_hugepage_range(vma, start, end); 1765 + (vma->vm_pgoff >> PAGE_SHIFT);
866 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 1766 mapping = (struct address_space *)page_private(page);
1767
1768 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1769 /* Do not unmap the current VMA */
1770 if (iter_vma == vma)
1771 continue;
1772
1773 /*
1774 * Unmap the page from other VMAs without their own reserves.
1775 * They get marked to be SIGKILLed if they fault in these
1776 * areas. This is because a future no-page fault on this VMA
1777 * could insert a zeroed page instead of the data existing
 1778 * from the time of fork. This would look like data corruption.
1779 */
1780 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
1781 unmap_hugepage_range(iter_vma,
1782 address, address + HPAGE_SIZE,
1783 page);
867 } 1784 }
1785
1786 return 1;
868} 1787}
869 1788
870static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 1789static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
871 unsigned long address, pte_t *ptep, pte_t pte) 1790 unsigned long address, pte_t *ptep, pte_t pte,
1791 struct page *pagecache_page)
872{ 1792{
1793 struct hstate *h = hstate_vma(vma);
873 struct page *old_page, *new_page; 1794 struct page *old_page, *new_page;
874 int avoidcopy; 1795 int avoidcopy;
1796 int outside_reserve = 0;
875 1797
876 old_page = pte_page(pte); 1798 old_page = pte_page(pte);
877 1799
1800retry_avoidcopy:
878 /* If no-one else is actually using this page, avoid the copy 1801 /* If no-one else is actually using this page, avoid the copy
879 * and just make the page writable */ 1802 * and just make the page writable */
880 avoidcopy = (page_count(old_page) == 1); 1803 avoidcopy = (page_count(old_page) == 1);
@@ -883,11 +1806,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
883 return 0; 1806 return 0;
884 } 1807 }
885 1808
1809 /*
1810 * If the process that created a MAP_PRIVATE mapping is about to
1811 * perform a COW due to a shared page count, attempt to satisfy
1812 * the allocation without using the existing reserves. The pagecache
1813 * page is used to determine if the reserve at this address was
1814 * consumed or not. If reserves were used, a partial faulted mapping
1815 * at the time of fork() could consume its reserves on COW instead
1816 * of the full address range.
1817 */
1818 if (!(vma->vm_flags & VM_SHARED) &&
1819 is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
1820 old_page != pagecache_page)
1821 outside_reserve = 1;
1822
886 page_cache_get(old_page); 1823 page_cache_get(old_page);
887 new_page = alloc_huge_page(vma, address); 1824 new_page = alloc_huge_page(vma, address, outside_reserve);
888 1825
889 if (IS_ERR(new_page)) { 1826 if (IS_ERR(new_page)) {
890 page_cache_release(old_page); 1827 page_cache_release(old_page);
1828
1829 /*
1830 * If a process owning a MAP_PRIVATE mapping fails to COW,
1831 * it is due to references held by a child and an insufficient
 1832 * huge page pool. To guarantee the original mapper's
1833 * reliability, unmap the page from child processes. The child
1834 * may get SIGKILLed if it later faults.
1835 */
1836 if (outside_reserve) {
1837 BUG_ON(huge_pte_none(pte));
1838 if (unmap_ref_private(mm, vma, old_page, address)) {
1839 BUG_ON(page_count(old_page) != 1);
1840 BUG_ON(huge_pte_none(pte));
1841 goto retry_avoidcopy;
1842 }
1843 WARN_ON_ONCE(1);
1844 }
1845
891 return -PTR_ERR(new_page); 1846 return -PTR_ERR(new_page);
892 } 1847 }
893 1848
@@ -896,7 +1851,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
896 __SetPageUptodate(new_page); 1851 __SetPageUptodate(new_page);
897 spin_lock(&mm->page_table_lock); 1852 spin_lock(&mm->page_table_lock);
898 1853
899 ptep = huge_pte_offset(mm, address & HPAGE_MASK); 1854 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
900 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 1855 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
901 /* Break COW */ 1856 /* Break COW */
902 huge_ptep_clear_flush(vma, address, ptep); 1857 huge_ptep_clear_flush(vma, address, ptep);
@@ -910,19 +1865,44 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
910 return 0; 1865 return 0;
911} 1866}
912 1867
1868/* Return the pagecache page at a given address within a VMA */
1869static struct page *hugetlbfs_pagecache_page(struct hstate *h,
1870 struct vm_area_struct *vma, unsigned long address)
1871{
1872 struct address_space *mapping;
1873 pgoff_t idx;
1874
1875 mapping = vma->vm_file->f_mapping;
1876 idx = vma_hugecache_offset(h, vma, address);
1877
1878 return find_lock_page(mapping, idx);
1879}
1880
913static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 1881static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
914 unsigned long address, pte_t *ptep, int write_access) 1882 unsigned long address, pte_t *ptep, int write_access)
915{ 1883{
1884 struct hstate *h = hstate_vma(vma);
916 int ret = VM_FAULT_SIGBUS; 1885 int ret = VM_FAULT_SIGBUS;
917 unsigned long idx; 1886 pgoff_t idx;
918 unsigned long size; 1887 unsigned long size;
919 struct page *page; 1888 struct page *page;
920 struct address_space *mapping; 1889 struct address_space *mapping;
921 pte_t new_pte; 1890 pte_t new_pte;
922 1891
1892 /*
1893 * Currently, we are forced to kill the process in the event the
1894 * original mapper has unmapped pages from the child due to a failed
 1895 * COW. Warn that such a situation has occurred as it may not be obvious
1896 */
1897 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
1898 printk(KERN_WARNING
1899 "PID %d killed due to inadequate hugepage pool\n",
1900 current->pid);
1901 return ret;
1902 }
1903
923 mapping = vma->vm_file->f_mapping; 1904 mapping = vma->vm_file->f_mapping;
924 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 1905 idx = vma_hugecache_offset(h, vma, address);
925 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
926 1906
927 /* 1907 /*
928 * Use page lock to guard against racing truncation 1908 * Use page lock to guard against racing truncation
@@ -931,15 +1911,15 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
931retry: 1911retry:
932 page = find_lock_page(mapping, idx); 1912 page = find_lock_page(mapping, idx);
933 if (!page) { 1913 if (!page) {
934 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1914 size = i_size_read(mapping->host) >> huge_page_shift(h);
935 if (idx >= size) 1915 if (idx >= size)
936 goto out; 1916 goto out;
937 page = alloc_huge_page(vma, address); 1917 page = alloc_huge_page(vma, address, 0);
938 if (IS_ERR(page)) { 1918 if (IS_ERR(page)) {
939 ret = -PTR_ERR(page); 1919 ret = -PTR_ERR(page);
940 goto out; 1920 goto out;
941 } 1921 }
942 clear_huge_page(page, address); 1922 clear_huge_page(page, address, huge_page_size(h));
943 __SetPageUptodate(page); 1923 __SetPageUptodate(page);
944 1924
945 if (vma->vm_flags & VM_SHARED) { 1925 if (vma->vm_flags & VM_SHARED) {
@@ -955,14 +1935,26 @@ retry:
955 } 1935 }
956 1936
957 spin_lock(&inode->i_lock); 1937 spin_lock(&inode->i_lock);
958 inode->i_blocks += BLOCKS_PER_HUGEPAGE; 1938 inode->i_blocks += blocks_per_huge_page(h);
959 spin_unlock(&inode->i_lock); 1939 spin_unlock(&inode->i_lock);
960 } else 1940 } else
961 lock_page(page); 1941 lock_page(page);
962 } 1942 }
963 1943
1944 /*
1945 * If we are going to COW a private mapping later, we examine the
1946 * pending reservations for this page now. This will ensure that
1947 * any allocations necessary to record that reservation occur outside
1948 * the spinlock.
1949 */
1950 if (write_access && !(vma->vm_flags & VM_SHARED))
1951 if (vma_needs_reservation(h, vma, address) < 0) {
1952 ret = VM_FAULT_OOM;
1953 goto backout_unlocked;
1954 }
1955
964 spin_lock(&mm->page_table_lock); 1956 spin_lock(&mm->page_table_lock);
965 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1957 size = i_size_read(mapping->host) >> huge_page_shift(h);
966 if (idx >= size) 1958 if (idx >= size)
967 goto backout; 1959 goto backout;
968 1960
@@ -976,7 +1968,7 @@ retry:
976 1968
977 if (write_access && !(vma->vm_flags & VM_SHARED)) { 1969 if (write_access && !(vma->vm_flags & VM_SHARED)) {
978 /* Optimization, do the COW without a second fault */ 1970 /* Optimization, do the COW without a second fault */
979 ret = hugetlb_cow(mm, vma, address, ptep, new_pte); 1971 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
980 } 1972 }
981 1973
982 spin_unlock(&mm->page_table_lock); 1974 spin_unlock(&mm->page_table_lock);
@@ -986,6 +1978,7 @@ out:
986 1978
987backout: 1979backout:
988 spin_unlock(&mm->page_table_lock); 1980 spin_unlock(&mm->page_table_lock);
1981backout_unlocked:
989 unlock_page(page); 1982 unlock_page(page);
990 put_page(page); 1983 put_page(page);
991 goto out; 1984 goto out;
@@ -997,9 +1990,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
997 pte_t *ptep; 1990 pte_t *ptep;
998 pte_t entry; 1991 pte_t entry;
999 int ret; 1992 int ret;
1993 struct page *pagecache_page = NULL;
1000 static DEFINE_MUTEX(hugetlb_instantiation_mutex); 1994 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
1995 struct hstate *h = hstate_vma(vma);
1001 1996
1002 ptep = huge_pte_alloc(mm, address); 1997 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
1003 if (!ptep) 1998 if (!ptep)
1004 return VM_FAULT_OOM; 1999 return VM_FAULT_OOM;
1005 2000
@@ -1012,23 +2007,79 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
1012 entry = huge_ptep_get(ptep); 2007 entry = huge_ptep_get(ptep);
1013 if (huge_pte_none(entry)) { 2008 if (huge_pte_none(entry)) {
1014 ret = hugetlb_no_page(mm, vma, address, ptep, write_access); 2009 ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
1015 mutex_unlock(&hugetlb_instantiation_mutex); 2010 goto out_mutex;
1016 return ret;
1017 } 2011 }
1018 2012
1019 ret = 0; 2013 ret = 0;
1020 2014
2015 /*
2016 * If we are going to COW the mapping later, we examine the pending
2017 * reservations for this page now. This will ensure that any
2018 * allocations necessary to record that reservation occur outside the
2019 * spinlock. For private mappings, we also lookup the pagecache
2020 * page now as it is used to determine if a reservation has been
2021 * consumed.
2022 */
2023 if (write_access && !pte_write(entry)) {
2024 if (vma_needs_reservation(h, vma, address) < 0) {
2025 ret = VM_FAULT_OOM;
2026 goto out_mutex;
2027 }
2028
2029 if (!(vma->vm_flags & VM_SHARED))
2030 pagecache_page = hugetlbfs_pagecache_page(h,
2031 vma, address);
2032 }
2033
1021 spin_lock(&mm->page_table_lock); 2034 spin_lock(&mm->page_table_lock);
1022 /* Check for a racing update before calling hugetlb_cow */ 2035 /* Check for a racing update before calling hugetlb_cow */
1023 if (likely(pte_same(entry, huge_ptep_get(ptep)))) 2036 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
1024 if (write_access && !pte_write(entry)) 2037 goto out_page_table_lock;
1025 ret = hugetlb_cow(mm, vma, address, ptep, entry); 2038
2039
2040 if (write_access) {
2041 if (!pte_write(entry)) {
2042 ret = hugetlb_cow(mm, vma, address, ptep, entry,
2043 pagecache_page);
2044 goto out_page_table_lock;
2045 }
2046 entry = pte_mkdirty(entry);
2047 }
2048 entry = pte_mkyoung(entry);
2049 if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access))
2050 update_mmu_cache(vma, address, entry);
2051
2052out_page_table_lock:
1026 spin_unlock(&mm->page_table_lock); 2053 spin_unlock(&mm->page_table_lock);
2054
2055 if (pagecache_page) {
2056 unlock_page(pagecache_page);
2057 put_page(pagecache_page);
2058 }
2059
2060out_mutex:
1027 mutex_unlock(&hugetlb_instantiation_mutex); 2061 mutex_unlock(&hugetlb_instantiation_mutex);
1028 2062
1029 return ret; 2063 return ret;
1030} 2064}
1031 2065
2066/* Can be overriden by architectures */
2067__attribute__((weak)) struct page *
2068follow_huge_pud(struct mm_struct *mm, unsigned long address,
2069 pud_t *pud, int write)
2070{
2071 BUG();
2072 return NULL;
2073}
2074
2075static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
2076{
2077 if (!ptep || write || shared)
2078 return 0;
2079 else
2080 return huge_pte_none(huge_ptep_get(ptep));
2081}
2082
1032int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2083int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1033 struct page **pages, struct vm_area_struct **vmas, 2084 struct page **pages, struct vm_area_struct **vmas,
1034 unsigned long *position, int *length, int i, 2085 unsigned long *position, int *length, int i,
@@ -1037,6 +2088,9 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1037 unsigned long pfn_offset; 2088 unsigned long pfn_offset;
1038 unsigned long vaddr = *position; 2089 unsigned long vaddr = *position;
1039 int remainder = *length; 2090 int remainder = *length;
2091 struct hstate *h = hstate_vma(vma);
2092 int zeropage_ok = 0;
2093 int shared = vma->vm_flags & VM_SHARED;
1040 2094
1041 spin_lock(&mm->page_table_lock); 2095 spin_lock(&mm->page_table_lock);
1042 while (vaddr < vma->vm_end && remainder) { 2096 while (vaddr < vma->vm_end && remainder) {
@@ -1048,9 +2102,12 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 1048 * each hugepage. We have to make sure we get the 2102
1049 * first, for the page indexing below to work. 2103 * first, for the page indexing below to work.
1050 */ 2104 */
1051 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); 2105 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
2106 if (huge_zeropage_ok(pte, write, shared))
2107 zeropage_ok = 1;
1052 2108
1053 if (!pte || huge_pte_none(huge_ptep_get(pte)) || 2109 if (!pte ||
2110 (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
1054 (write && !pte_write(huge_ptep_get(pte)))) { 2111 (write && !pte_write(huge_ptep_get(pte)))) {
1055 int ret; 2112 int ret;
1056 2113
@@ -1066,12 +2123,15 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1066 break; 2123 break;
1067 } 2124 }
1068 2125
1069 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; 2126 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
1070 page = pte_page(huge_ptep_get(pte)); 2127 page = pte_page(huge_ptep_get(pte));
1071same_page: 2128same_page:
1072 if (pages) { 2129 if (pages) {
1073 get_page(page); 2130 if (zeropage_ok)
1074 pages[i] = page + pfn_offset; 2131 pages[i] = ZERO_PAGE(0);
2132 else
2133 pages[i] = page + pfn_offset;
2134 get_page(pages[i]);
1075 } 2135 }
1076 2136
1077 if (vmas) 2137 if (vmas)
@@ -1082,7 +2142,7 @@ same_page:
1082 --remainder; 2142 --remainder;
1083 ++i; 2143 ++i;
1084 if (vaddr < vma->vm_end && remainder && 2144 if (vaddr < vma->vm_end && remainder &&
1085 pfn_offset < HPAGE_SIZE/PAGE_SIZE) { 2145 pfn_offset < pages_per_huge_page(h)) {
1086 /* 2146 /*
1087 * We use pfn_offset to avoid touching the pageframes 2147 * We use pfn_offset to avoid touching the pageframes
1088 * of this compound page. 2148 * of this compound page.
@@ -1104,13 +2164,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1104 unsigned long start = address; 2164 unsigned long start = address;
1105 pte_t *ptep; 2165 pte_t *ptep;
1106 pte_t pte; 2166 pte_t pte;
2167 struct hstate *h = hstate_vma(vma);
1107 2168
1108 BUG_ON(address >= end); 2169 BUG_ON(address >= end);
1109 flush_cache_range(vma, address, end); 2170 flush_cache_range(vma, address, end);
1110 2171
1111 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 2172 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1112 spin_lock(&mm->page_table_lock); 2173 spin_lock(&mm->page_table_lock);
1113 for (; address < end; address += HPAGE_SIZE) { 2174 for (; address < end; address += huge_page_size(h)) {
1114 ptep = huge_pte_offset(mm, address); 2175 ptep = huge_pte_offset(mm, address);
1115 if (!ptep) 2176 if (!ptep)
1116 continue; 2177 continue;
@@ -1128,195 +2189,59 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1128 flush_tlb_range(vma, start, end); 2189 flush_tlb_range(vma, start, end);
1129} 2190}
1130 2191
1131struct file_region { 2192int hugetlb_reserve_pages(struct inode *inode,
1132 struct list_head link; 2193 long from, long to,
1133 long from; 2194 struct vm_area_struct *vma)
1134 long to;
1135};
1136
1137static long region_add(struct list_head *head, long f, long t)
1138{
1139 struct file_region *rg, *nrg, *trg;
1140
1141 /* Locate the region we are either in or before. */
1142 list_for_each_entry(rg, head, link)
1143 if (f <= rg->to)
1144 break;
1145
1146 /* Round our left edge to the current segment if it encloses us. */
1147 if (f > rg->from)
1148 f = rg->from;
1149
1150 /* Check for and consume any regions we now overlap with. */
1151 nrg = rg;
1152 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
1153 if (&rg->link == head)
1154 break;
1155 if (rg->from > t)
1156 break;
1157
1158 /* If this area reaches higher then extend our area to
1159 * include it completely. If this is not the first area
1160 * which we intend to reuse, free it. */
1161 if (rg->to > t)
1162 t = rg->to;
1163 if (rg != nrg) {
1164 list_del(&rg->link);
1165 kfree(rg);
1166 }
1167 }
1168 nrg->from = f;
1169 nrg->to = t;
1170 return 0;
1171}
1172
1173static long region_chg(struct list_head *head, long f, long t)
1174{
1175 struct file_region *rg, *nrg;
1176 long chg = 0;
1177
1178 /* Locate the region we are before or in. */
1179 list_for_each_entry(rg, head, link)
1180 if (f <= rg->to)
1181 break;
1182
1183 /* If we are below the current region then a new region is required.
1184 * Subtle, allocate a new region at the position but make it zero
1185 * size such that we can guarantee to record the reservation. */
1186 if (&rg->link == head || t < rg->from) {
1187 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
1188 if (!nrg)
1189 return -ENOMEM;
1190 nrg->from = f;
1191 nrg->to = f;
1192 INIT_LIST_HEAD(&nrg->link);
1193 list_add(&nrg->link, rg->link.prev);
1194
1195 return t - f;
1196 }
1197
1198 /* Round our left edge to the current segment if it encloses us. */
1199 if (f > rg->from)
1200 f = rg->from;
1201 chg = t - f;
1202
1203 /* Check for and consume any regions we now overlap with. */
1204 list_for_each_entry(rg, rg->link.prev, link) {
1205 if (&rg->link == head)
1206 break;
1207 if (rg->from > t)
1208 return chg;
1209
 1210 /* We overlap with this area, if it extends further than
1211 * us then we must extend ourselves. Account for its
1212 * existing reservation. */
1213 if (rg->to > t) {
1214 chg += rg->to - t;
1215 t = rg->to;
1216 }
1217 chg -= rg->to - rg->from;
1218 }
1219 return chg;
1220}
1221
1222static long region_truncate(struct list_head *head, long end)
1223{ 2195{
1224 struct file_region *rg, *trg; 2196 long ret, chg;
1225 long chg = 0; 2197 struct hstate *h = hstate_inode(inode);
1226 2198
1227 /* Locate the region we are either in or before. */ 2199 if (vma && vma->vm_flags & VM_NORESERVE)
1228 list_for_each_entry(rg, head, link)
1229 if (end <= rg->to)
1230 break;
1231 if (&rg->link == head)
1232 return 0; 2200 return 0;
1233 2201
1234 /* If we are in the middle of a region then adjust it. */
1235 if (end > rg->from) {
1236 chg = rg->to - end;
1237 rg->to = end;
1238 rg = list_entry(rg->link.next, typeof(*rg), link);
1239 }
1240
1241 /* Drop any remaining regions. */
1242 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
1243 if (&rg->link == head)
1244 break;
1245 chg += rg->to - rg->from;
1246 list_del(&rg->link);
1247 kfree(rg);
1248 }
1249 return chg;
1250}
1251
1252static int hugetlb_acct_memory(long delta)
1253{
1254 int ret = -ENOMEM;
1255
1256 spin_lock(&hugetlb_lock);
1257 /* 2202 /*
1258 * When cpuset is configured, it breaks the strict hugetlb page 2203 * Shared mappings base their reservation on the number of pages that
1259 * reservation as the accounting is done on a global variable. Such 2204 * are already allocated on behalf of the file. Private mappings need
1260 * reservation is completely rubbish in the presence of cpuset because 2205 * to reserve the full area even if read-only as mprotect() may be
1261 * the reservation is not checked against page availability for the 2206 * called to make the mapping read-write. Assume !vma is a shm mapping
1262 * current cpuset. Application can still potentially OOM'ed by kernel
1263 * with lack of free htlb page in cpuset that the task is in.
1264 * Attempt to enforce strict accounting with cpuset is almost
1265 * impossible (or too ugly) because cpuset is too fluid that
1266 * task or memory node can be dynamically moved between cpusets.
1267 *
1268 * The change of semantics for shared hugetlb mapping with cpuset is
1269 * undesirable. However, in order to preserve some of the semantics,
1270 * we fall back to check against current free page availability as
1271 * a best attempt and hopefully to minimize the impact of changing
1272 * semantics that cpuset has.
1273 */ 2207 */
1274 if (delta > 0) { 2208 if (!vma || vma->vm_flags & VM_SHARED)
1275 if (gather_surplus_pages(delta) < 0) 2209 chg = region_chg(&inode->i_mapping->private_list, from, to);
1276 goto out; 2210 else {
1277 2211 struct resv_map *resv_map = resv_map_alloc();
1278 if (delta > cpuset_mems_nr(free_huge_pages_node)) { 2212 if (!resv_map)
1279 return_unused_surplus_pages(delta); 2213 return -ENOMEM;
1280 goto out;
1281 }
1282 }
1283 2214
1284 ret = 0; 2215 chg = to - from;
1285 if (delta < 0)
1286 return_unused_surplus_pages((unsigned long) -delta);
1287 2216
1288out: 2217 set_vma_resv_map(vma, resv_map);
1289 spin_unlock(&hugetlb_lock); 2218 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
1290 return ret; 2219 }
1291}
1292
1293int hugetlb_reserve_pages(struct inode *inode, long from, long to)
1294{
1295 long ret, chg;
1296 2220
1297 chg = region_chg(&inode->i_mapping->private_list, from, to);
1298 if (chg < 0) 2221 if (chg < 0)
1299 return chg; 2222 return chg;
1300 2223
1301 if (hugetlb_get_quota(inode->i_mapping, chg)) 2224 if (hugetlb_get_quota(inode->i_mapping, chg))
1302 return -ENOSPC; 2225 return -ENOSPC;
1303 ret = hugetlb_acct_memory(chg); 2226 ret = hugetlb_acct_memory(h, chg);
1304 if (ret < 0) { 2227 if (ret < 0) {
1305 hugetlb_put_quota(inode->i_mapping, chg); 2228 hugetlb_put_quota(inode->i_mapping, chg);
1306 return ret; 2229 return ret;
1307 } 2230 }
1308 region_add(&inode->i_mapping->private_list, from, to); 2231 if (!vma || vma->vm_flags & VM_SHARED)
2232 region_add(&inode->i_mapping->private_list, from, to);
1309 return 0; 2233 return 0;
1310} 2234}
1311 2235
1312void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 2236void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
1313{ 2237{
2238 struct hstate *h = hstate_inode(inode);
1314 long chg = region_truncate(&inode->i_mapping->private_list, offset); 2239 long chg = region_truncate(&inode->i_mapping->private_list, offset);
1315 2240
1316 spin_lock(&inode->i_lock); 2241 spin_lock(&inode->i_lock);
1317 inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; 2242 inode->i_blocks -= blocks_per_huge_page(h);
1318 spin_unlock(&inode->i_lock); 2243 spin_unlock(&inode->i_lock);
1319 2244
1320 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2245 hugetlb_put_quota(inode->i_mapping, (chg - freed));
1321 hugetlb_acct_memory(-(chg - freed)); 2246 hugetlb_acct_memory(h, -(chg - freed));
1322} 2247}
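
The region_chg()/region_add()/region_truncate() hunks above all manipulate the same structure: a list of non-overlapping [from, to) page ranges recording which parts of a hugetlbfs file already carry a reservation. As a rough illustration of that bookkeeping, here is a minimal userspace sketch using a simplified singly linked list and illustrative names; it is not the kernel code and omits locking and the zero-size placeholder trick.

/*
 * Minimal model of the reservation map: region_needed() plays the role
 * of region_chg() ("how many pages in [f, t) are not yet covered?") and
 * region_merge() plays the role of region_add() (fold [f, t) into the map).
 */
#include <stdio.h>
#include <stdlib.h>

struct region {
	long from, to;
	struct region *next;
};

/* Pages in [f, t) not already covered by the map. */
static long region_needed(struct region *head, long f, long t)
{
	long covered = 0;

	for (struct region *rg = head; rg; rg = rg->next) {
		long lo = rg->from > f ? rg->from : f;
		long hi = rg->to < t ? rg->to : t;

		if (lo < hi)
			covered += hi - lo;
	}
	return (t - f) - covered;
}

/* Fold [f, t) into the map, absorbing any overlapping or adjacent region. */
static struct region *region_merge(struct region *head, long f, long t)
{
	struct region *rg = head, *nrg, **prev = &head;

	while (rg) {
		if (rg->to >= f && rg->from <= t) {
			if (rg->from < f)
				f = rg->from;
			if (rg->to > t)
				t = rg->to;
			*prev = rg->next;
			free(rg);
			rg = *prev;
		} else {
			prev = &rg->next;
			rg = rg->next;
		}
	}
	nrg = malloc(sizeof(*nrg));
	if (!nrg)
		abort();	/* sketch only: no graceful -ENOMEM path */
	nrg->from = f;
	nrg->to = t;
	nrg->next = head;
	return nrg;
}

int main(void)
{
	struct region *map = NULL;

	printf("need %ld\n", region_needed(map, 0, 4));	/* 4 */
	map = region_merge(map, 0, 4);
	printf("need %ld\n", region_needed(map, 2, 8));	/* 4 */
	map = region_merge(map, 2, 8);
	printf("need %ld\n", region_needed(map, 0, 8));	/* 0 */
	return 0;
}

The kernel version differs in one important way: region_chg() pre-allocates a zero-size placeholder entry, so the later region_add() can update the list without allocating and therefore cannot fail.
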
diff --git a/mm/internal.h b/mm/internal.h
index 0034e947e4bc..e4e728bdf324 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,6 +13,11 @@
13 13
14#include <linux/mm.h> 14#include <linux/mm.h>
15 15
16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
17 unsigned long floor, unsigned long ceiling);
18
19extern void prep_compound_page(struct page *page, unsigned long order);
20
16static inline void set_page_count(struct page *page, int v) 21static inline void set_page_count(struct page *page, int v)
17{ 22{
18 atomic_set(&page->_count, v); 23 atomic_set(&page->_count, v);
@@ -34,6 +39,15 @@ static inline void __put_page(struct page *page)
34 atomic_dec(&page->_count); 39 atomic_dec(&page->_count);
35} 40}
36 41
42/*
43 * in mm/vmscan.c:
44 */
45extern int isolate_lru_page(struct page *page);
46extern void putback_lru_page(struct page *page);
47
48/*
49 * in mm/page_alloc.c
50 */
37extern void __free_pages_bootmem(struct page *page, unsigned int order); 51extern void __free_pages_bootmem(struct page *page, unsigned int order);
38 52
39/* 53/*
@@ -47,6 +61,120 @@ static inline unsigned long page_order(struct page *page)
47 return page_private(page); 61 return page_private(page);
48} 62}
49 63
64extern long mlock_vma_pages_range(struct vm_area_struct *vma,
65 unsigned long start, unsigned long end);
66extern void munlock_vma_pages_range(struct vm_area_struct *vma,
67 unsigned long start, unsigned long end);
68static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
69{
70 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
71}
72
73#ifdef CONFIG_UNEVICTABLE_LRU
74/*
75 * unevictable_migrate_page() is called only from migrate_page_copy() to
76 * migrate the unevictable flag to the new page.
77 * Note that the old page has been isolated from the LRU lists at this
78 * point so we don't need to worry about LRU statistics.
79 */
80static inline void unevictable_migrate_page(struct page *new, struct page *old)
81{
82 if (TestClearPageUnevictable(old))
83 SetPageUnevictable(new);
84}
85#else
86static inline void unevictable_migrate_page(struct page *new, struct page *old)
87{
88}
89#endif
90
91#ifdef CONFIG_UNEVICTABLE_LRU
92/*
93 * Called only in fault path via page_evictable() for a new page
94 * to determine if it's being mapped into a LOCKED vma.
95 * If so, mark page as mlocked.
96 */
97static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
98{
99 VM_BUG_ON(PageLRU(page));
100
101 if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
102 return 0;
103
104 if (!TestSetPageMlocked(page)) {
105 inc_zone_page_state(page, NR_MLOCK);
106 count_vm_event(UNEVICTABLE_PGMLOCKED);
107 }
108 return 1;
109}
110
111/*
112 * must be called with vma's mmap_sem held for read, and page locked.
113 */
114extern void mlock_vma_page(struct page *page);
115
116/*
117 * Clear the page's PageMlocked(). This can be useful in a situation where
118 * we want to unconditionally remove a page from the pagecache -- e.g.,
119 * on truncation or freeing.
120 *
121 * It is legal to call this function for any page, mlocked or not.
122 * If called for a page that is still mapped by mlocked vmas, all we do
123 * is revert to lazy LRU behaviour -- semantics are not broken.
124 */
125extern void __clear_page_mlock(struct page *page);
126static inline void clear_page_mlock(struct page *page)
127{
128 if (unlikely(TestClearPageMlocked(page)))
129 __clear_page_mlock(page);
130}
131
132/*
133 * mlock_migrate_page - called only from migrate_page_copy() to
134 * migrate the Mlocked page flag; update statistics.
135 */
136static inline void mlock_migrate_page(struct page *newpage, struct page *page)
137{
138 if (TestClearPageMlocked(page)) {
139 unsigned long flags;
140
141 local_irq_save(flags);
142 __dec_zone_page_state(page, NR_MLOCK);
143 SetPageMlocked(newpage);
144 __inc_zone_page_state(newpage, NR_MLOCK);
145 local_irq_restore(flags);
146 }
147}
148
149/*
150 * free_page_mlock() -- clean up attempts to free an mlocked() page.
151 * Page should not be on lru, so no need to fix that up.
152 * free_pages_check() will verify...
153 */
154static inline void free_page_mlock(struct page *page)
155{
156 if (unlikely(TestClearPageMlocked(page))) {
157 unsigned long flags;
158
159 local_irq_save(flags);
160 __dec_zone_page_state(page, NR_MLOCK);
161 __count_vm_event(UNEVICTABLE_MLOCKFREED);
162 local_irq_restore(flags);
163 }
164}
165
166#else /* CONFIG_UNEVICTABLE_LRU */
167static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
168{
169 return 0;
170}
171static inline void clear_page_mlock(struct page *page) { }
172static inline void mlock_vma_page(struct page *page) { }
173static inline void mlock_migrate_page(struct page *new, struct page *old) { }
174static inline void free_page_mlock(struct page *page) { }
175
176#endif /* CONFIG_UNEVICTABLE_LRU */
177
50/* 178/*
51 * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, 179 * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
52 * so all functions starting at paging_init should be marked __init 180 * so all functions starting at paging_init should be marked __init
@@ -59,4 +187,68 @@ static inline unsigned long page_order(struct page *page)
59#define __paginginit __init 187#define __paginginit __init
60#endif 188#endif
61 189
190/* Memory initialisation debug and verification */
191enum mminit_level {
192 MMINIT_WARNING,
193 MMINIT_VERIFY,
194 MMINIT_TRACE
195};
196
197#ifdef CONFIG_DEBUG_MEMORY_INIT
198
199extern int mminit_loglevel;
200
201#define mminit_dprintk(level, prefix, fmt, arg...) \
202do { \
203 if (level < mminit_loglevel) { \
204 printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \
205 printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \
206 } \
207} while (0)
208
209extern void mminit_verify_pageflags_layout(void);
210extern void mminit_verify_page_links(struct page *page,
211 enum zone_type zone, unsigned long nid, unsigned long pfn);
212extern void mminit_verify_zonelist(void);
213
214#else
215
216static inline void mminit_dprintk(enum mminit_level level,
217 const char *prefix, const char *fmt, ...)
218{
219}
220
221static inline void mminit_verify_pageflags_layout(void)
222{
223}
224
225static inline void mminit_verify_page_links(struct page *page,
226 enum zone_type zone, unsigned long nid, unsigned long pfn)
227{
228}
229
230static inline void mminit_verify_zonelist(void)
231{
232}
233#endif /* CONFIG_DEBUG_MEMORY_INIT */
234
235/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
236#if defined(CONFIG_SPARSEMEM)
237extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
238 unsigned long *end_pfn);
239#else
240static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
241 unsigned long *end_pfn)
242{
243}
244#endif /* CONFIG_SPARSEMEM */
245
246#define GUP_FLAGS_WRITE 0x1
247#define GUP_FLAGS_FORCE 0x2
248#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
249
250int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
251 unsigned long start, int len, int flags,
252 struct page **pages, struct vm_area_struct **vmas);
253
62#endif 254#endif
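
The is_mlocked_vma() helper added above hinges on one flag test: masking vm_flags with (VM_LOCKED | VM_SPECIAL) and comparing the result against VM_LOCKED alone, so a VMA only counts as mlocked when VM_LOCKED is set and none of the special bits are. A standalone sketch of just that test, using made-up flag values rather than the kernel's headers:

#include <stdio.h>

#define VM_LOCKED  0x00002000UL		/* illustrative values only */
#define VM_IO      0x00004000UL
#define VM_PFNMAP  0x00000400UL
#define VM_SPECIAL (VM_IO | VM_PFNMAP)	/* simplified; the real mask has more bits */

static int counts_as_mlocked(unsigned long vm_flags)
{
	return (vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
}

int main(void)
{
	printf("%d\n", counts_as_mlocked(VM_LOCKED));		/* 1 */
	printf("%d\n", counts_as_mlocked(VM_LOCKED | VM_IO));	/* 0 */
	printf("%d\n", counts_as_mlocked(VM_IO));		/* 0 */
	return 0;
}
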
diff --git a/mm/madvise.c b/mm/madvise.c
index 23a0ec3e0ea0..f9349c18a1b5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -132,10 +132,10 @@ static long madvise_willneed(struct vm_area_struct * vma,
132 * Application no longer needs these pages. If the pages are dirty, 132 * Application no longer needs these pages. If the pages are dirty,
133 * it's OK to just throw them away. The app will be more careful about 133 * it's OK to just throw them away. The app will be more careful about
134 * data it wants to keep. Be sure to free swap resources too. The 134 * data it wants to keep. Be sure to free swap resources too. The
135 * zap_page_range call sets things up for refill_inactive to actually free 135 * zap_page_range call sets things up for shrink_active_list to actually free
136 * these pages later if no one else has touched them in the meantime, 136 * these pages later if no one else has touched them in the meantime,
137 * although we could add these pages to a global reuse list for 137 * although we could add these pages to a global reuse list for
138 * refill_inactive to pick up before reclaiming other pages. 138 * shrink_active_list to pick up before reclaiming other pages.
139 * 139 *
140 * NB: This interface discards data rather than pushes it out to swap, 140 * NB: This interface discards data rather than pushes it out to swap,
141 * as some implementations do. This has performance implications for 141 * as some implementations do. This has performance implications for
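
The madvise_dontneed() comment updated above describes MADV_DONTNEED's discard semantics, which are easy to observe from userspace: after MADV_DONTNEED on a private anonymous mapping, the next touch faults in fresh zero pages instead of the old data. A small Linux-specific demonstration (error handling kept minimal):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 'x', len);
	/* Tell the kernel the contents are disposable... */
	if (madvise(p, len, MADV_DONTNEED))
		return 1;
	/* ...and a later read faults in zero pages, not the old 'x' bytes. */
	printf("after MADV_DONTNEED: first byte = %d\n", p[0]);	/* 0 */
	munmap(p, len);
	return 0;
}
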
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e46451e1d9b7..866dcc7eeb0c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -32,12 +32,13 @@
32#include <linux/fs.h> 32#include <linux/fs.h>
33#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/vmalloc.h> 34#include <linux/vmalloc.h>
35#include <linux/mm_inline.h>
36#include <linux/page_cgroup.h>
35 37
36#include <asm/uaccess.h> 38#include <asm/uaccess.h>
37 39
38struct cgroup_subsys mem_cgroup_subsys; 40struct cgroup_subsys mem_cgroup_subsys __read_mostly;
39static const int MEM_CGROUP_RECLAIM_RETRIES = 5; 41#define MEM_CGROUP_RECLAIM_RETRIES 5
40static struct kmem_cache *page_cgroup_cache;
41 42
42/* 43/*
43 * Statistics for memory cgroup. 44 * Statistics for memory cgroup.
@@ -65,11 +66,10 @@ struct mem_cgroup_stat {
65/* 66/*
66 * For accounting under irq disable, no need for increment preempt count. 67 * For accounting under irq disable, no need for increment preempt count.
67 */ 68 */
68static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat, 69static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
69 enum mem_cgroup_stat_index idx, int val) 70 enum mem_cgroup_stat_index idx, int val)
70{ 71{
71 int cpu = smp_processor_id(); 72 stat->count[idx] += val;
72 stat->cpustat[cpu].count[idx] += val;
73} 73}
74 74
75static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, 75static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
@@ -85,22 +85,13 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
85/* 85/*
86 * per-zone information in memory controller. 86 * per-zone information in memory controller.
87 */ 87 */
88
89enum mem_cgroup_zstat_index {
90 MEM_CGROUP_ZSTAT_ACTIVE,
91 MEM_CGROUP_ZSTAT_INACTIVE,
92
93 NR_MEM_CGROUP_ZSTAT,
94};
95
96struct mem_cgroup_per_zone { 88struct mem_cgroup_per_zone {
97 /* 89 /*
98 * spin_lock to protect the per cgroup LRU 90 * spin_lock to protect the per cgroup LRU
99 */ 91 */
100 spinlock_t lru_lock; 92 spinlock_t lru_lock;
101 struct list_head active_list; 93 struct list_head lists[NR_LRU_LISTS];
102 struct list_head inactive_list; 94 unsigned long count[NR_LRU_LISTS];
103 unsigned long count[NR_MEM_CGROUP_ZSTAT];
104}; 95};
105/* Macro for accessing counter */ 96/* Macro for accessing counter */
106#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 97#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -144,69 +135,52 @@ struct mem_cgroup {
144}; 135};
145static struct mem_cgroup init_mem_cgroup; 136static struct mem_cgroup init_mem_cgroup;
146 137
147/*
148 * We use the lower bit of the page->page_cgroup pointer as a bit spin
149 * lock. We need to ensure that page->page_cgroup is at least two
150 * byte aligned (based on comments from Nick Piggin). But since
151 * bit_spin_lock doesn't actually set that lock bit in a non-debug
152 * uniprocessor kernel, we should avoid setting it here too.
153 */
154#define PAGE_CGROUP_LOCK_BIT 0x0
155#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
156#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
157#else
158#define PAGE_CGROUP_LOCK 0x0
159#endif
160
161/*
162 * A page_cgroup page is associated with every page descriptor. The
163 * page_cgroup helps us identify information about the cgroup
164 */
165struct page_cgroup {
166 struct list_head lru; /* per cgroup LRU list */
167 struct page *page;
168 struct mem_cgroup *mem_cgroup;
169 int ref_cnt; /* cached, mapped, migrating */
170 int flags;
171};
172#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
173#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
174
175static int page_cgroup_nid(struct page_cgroup *pc)
176{
177 return page_to_nid(pc->page);
178}
179
180static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
181{
182 return page_zonenum(pc->page);
183}
184
185enum charge_type { 138enum charge_type {
186 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 139 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
187 MEM_CGROUP_CHARGE_TYPE_MAPPED, 140 MEM_CGROUP_CHARGE_TYPE_MAPPED,
141 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
142 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
143 NR_CHARGE_TYPE,
144};
145
146/* only for here (for easy reading.) */
147#define PCGF_CACHE (1UL << PCG_CACHE)
148#define PCGF_USED (1UL << PCG_USED)
149#define PCGF_ACTIVE (1UL << PCG_ACTIVE)
150#define PCGF_LOCK (1UL << PCG_LOCK)
151#define PCGF_FILE (1UL << PCG_FILE)
152static const unsigned long
153pcg_default_flags[NR_CHARGE_TYPE] = {
154 PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
155 PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
156 PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
157 0, /* FORCE */
188}; 158};
189 159
190/* 160/*
191 * Always modified under lru lock. Then, not necessary to preempt_disable() 161 * Always modified under lru lock. Then, not necessary to preempt_disable()
192 */ 162 */
193static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, 163static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
194 bool charge) 164 struct page_cgroup *pc,
165 bool charge)
195{ 166{
196 int val = (charge)? 1 : -1; 167 int val = (charge)? 1 : -1;
197 struct mem_cgroup_stat *stat = &mem->stat; 168 struct mem_cgroup_stat *stat = &mem->stat;
169 struct mem_cgroup_stat_cpu *cpustat;
198 170
199 VM_BUG_ON(!irqs_disabled()); 171 VM_BUG_ON(!irqs_disabled());
200 if (flags & PAGE_CGROUP_FLAG_CACHE) 172
201 __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val); 173 cpustat = &stat->cpustat[smp_processor_id()];
174 if (PageCgroupCache(pc))
175 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
202 else 176 else
203 __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); 177 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
204 178
205 if (charge) 179 if (charge)
206 __mem_cgroup_stat_add_safe(stat, 180 __mem_cgroup_stat_add_safe(cpustat,
207 MEM_CGROUP_STAT_PGPGIN_COUNT, 1); 181 MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
208 else 182 else
209 __mem_cgroup_stat_add_safe(stat, 183 __mem_cgroup_stat_add_safe(cpustat,
210 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 184 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
211} 185}
212 186
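
The hunk above makes __mem_cgroup_stat_add_safe() operate on a single per-CPU slot, with the caller picking the slot via stat->cpustat[smp_processor_id()]. A plain-C model of that layout, with a fixed fake CPU count and illustrative names rather than the kernel structures:

#include <stdio.h>

#define NR_FAKE_CPUS 4
enum stat_index { STAT_CACHE, STAT_RSS, NR_STATS };

struct stat_cpu { long count[NR_STATS]; };
struct stat_all { struct stat_cpu cpustat[NR_FAKE_CPUS]; };

/* Like __mem_cgroup_stat_add_safe(): bump a counter in one CPU's slot. */
static void stat_add(struct stat_cpu *stat, enum stat_index idx, long val)
{
	stat->count[idx] += val;
}

/* Reads sum over all per-CPU slots. */
static long stat_read(const struct stat_all *s, enum stat_index idx)
{
	long sum = 0;

	for (int cpu = 0; cpu < NR_FAKE_CPUS; cpu++)
		sum += s->cpustat[cpu].count[idx];
	return sum;
}

int main(void)
{
	struct stat_all s = { { { { 0 } } } };

	stat_add(&s.cpustat[0], STAT_RSS, 1);	/* "CPU 0" charges a page */
	stat_add(&s.cpustat[2], STAT_RSS, 1);	/* "CPU 2" charges another */
	printf("rss = %ld\n", stat_read(&s, STAT_RSS));	/* 2 */
	return 0;
}
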
@@ -227,7 +201,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
227} 201}
228 202
229static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, 203static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
230 enum mem_cgroup_zstat_index idx) 204 enum lru_list idx)
231{ 205{
232 int nid, zid; 206 int nid, zid;
233 struct mem_cgroup_per_zone *mz; 207 struct mem_cgroup_per_zone *mz;
@@ -250,89 +224,89 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
250 224
251struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 225struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
252{ 226{
227 /*
228 * mm_update_next_owner() may clear mm->owner to NULL
229 * if it races with swapoff, page migration, etc.
230 * So this can be called with p == NULL.
231 */
232 if (unlikely(!p))
233 return NULL;
234
253 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 235 return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
254 struct mem_cgroup, css); 236 struct mem_cgroup, css);
255} 237}
256 238
257static inline int page_cgroup_locked(struct page *page)
258{
259 return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
260}
261
262static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
263{
264 VM_BUG_ON(!page_cgroup_locked(page));
265 page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
266}
267
268struct page_cgroup *page_get_page_cgroup(struct page *page)
269{
270 return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
271}
272
273static void lock_page_cgroup(struct page *page)
274{
275 bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
276}
277
278static int try_lock_page_cgroup(struct page *page)
279{
280 return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
281}
282
283static void unlock_page_cgroup(struct page *page)
284{
285 bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
286}
287
288static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, 239static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
289 struct page_cgroup *pc) 240 struct page_cgroup *pc)
290{ 241{
291 int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; 242 int lru = LRU_BASE;
243
244 if (PageCgroupUnevictable(pc))
245 lru = LRU_UNEVICTABLE;
246 else {
247 if (PageCgroupActive(pc))
248 lru += LRU_ACTIVE;
249 if (PageCgroupFile(pc))
250 lru += LRU_FILE;
251 }
292 252
293 if (from) 253 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
294 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
295 else
296 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
297 254
298 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); 255 mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
299 list_del_init(&pc->lru); 256 list_del(&pc->lru);
300} 257}
301 258
302static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, 259static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
303 struct page_cgroup *pc) 260 struct page_cgroup *pc)
304{ 261{
305 int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; 262 int lru = LRU_BASE;
306 263
307 if (!to) { 264 if (PageCgroupUnevictable(pc))
308 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; 265 lru = LRU_UNEVICTABLE;
309 list_add(&pc->lru, &mz->inactive_list); 266 else {
310 } else { 267 if (PageCgroupActive(pc))
311 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; 268 lru += LRU_ACTIVE;
312 list_add(&pc->lru, &mz->active_list); 269 if (PageCgroupFile(pc))
270 lru += LRU_FILE;
313 } 271 }
314 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true); 272
273 MEM_CGROUP_ZSTAT(mz, lru) += 1;
274 list_add(&pc->lru, &mz->lists[lru]);
275
276 mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
315} 277}
316 278
317static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) 279static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
318{ 280{
319 int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
320 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); 281 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
282 int active = PageCgroupActive(pc);
283 int file = PageCgroupFile(pc);
284 int unevictable = PageCgroupUnevictable(pc);
285 enum lru_list from = unevictable ? LRU_UNEVICTABLE :
286 (LRU_FILE * !!file + !!active);
321 287
322 if (from) 288 if (lru == from)
323 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; 289 return;
324 else
325 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
326 290
327 if (active) { 291 MEM_CGROUP_ZSTAT(mz, from) -= 1;
328 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; 292 /*
329 pc->flags |= PAGE_CGROUP_FLAG_ACTIVE; 293 * However this is done under mz->lru_lock, another flags, which
330 list_move(&pc->lru, &mz->active_list); 294 * are not related to LRU, will be modified from out-of-lock.
295 * We have to use atomic set/clear flags.
296 */
297 if (is_unevictable_lru(lru)) {
298 ClearPageCgroupActive(pc);
299 SetPageCgroupUnevictable(pc);
331 } else { 300 } else {
332 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; 301 if (is_active_lru(lru))
333 pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE; 302 SetPageCgroupActive(pc);
334 list_move(&pc->lru, &mz->inactive_list); 303 else
304 ClearPageCgroupActive(pc);
305 ClearPageCgroupUnevictable(pc);
335 } 306 }
307
308 MEM_CGROUP_ZSTAT(mz, lru) += 1;
309 list_move(&pc->lru, &mz->lists[lru]);
336} 310}
337 311
338int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 312int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
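
Several hunks above compute an LRU index as "LRU_FILE * !!file + !!active", falling back to LRU_UNEVICTABLE for unevictable pages. A standalone sketch of that arithmetic; the enum below mirrors the split-LRU ordering this series introduces (inactive/active anon, then inactive/active file, then unevictable), written out here purely for illustration:

#include <stdio.h>

enum lru_list {
	LRU_INACTIVE_ANON,	/* 0: base */
	LRU_ACTIVE_ANON,	/* 1: base + active */
	LRU_INACTIVE_FILE,	/* 2: base + file */
	LRU_ACTIVE_FILE,	/* 3: base + file + active */
	LRU_UNEVICTABLE,	/* 4 */
	NR_LRU_LISTS
};
#define LRU_BASE	LRU_INACTIVE_ANON
#define LRU_ACTIVE	1
#define LRU_FILE	2

static int pick_lru(int active, int file, int unevictable)
{
	if (unevictable)
		return LRU_UNEVICTABLE;
	return LRU_BASE + LRU_FILE * !!file + LRU_ACTIVE * !!active;
}

int main(void)
{
	printf("anon inactive -> %d\n", pick_lru(0, 0, 0));	/* 0 */
	printf("anon active   -> %d\n", pick_lru(1, 0, 0));	/* 1 */
	printf("file inactive -> %d\n", pick_lru(0, 1, 0));	/* 2 */
	printf("file active   -> %d\n", pick_lru(1, 1, 0));	/* 3 */
	printf("unevictable   -> %d\n", pick_lru(0, 0, 1));	/* 4 */
	return 0;
}
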
@@ -348,12 +322,15 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
348/* 322/*
349 * This routine assumes that the appropriate zone's lru lock is already held 323 * This routine assumes that the appropriate zone's lru lock is already held
350 */ 324 */
351void mem_cgroup_move_lists(struct page *page, bool active) 325void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
352{ 326{
353 struct page_cgroup *pc; 327 struct page_cgroup *pc;
354 struct mem_cgroup_per_zone *mz; 328 struct mem_cgroup_per_zone *mz;
355 unsigned long flags; 329 unsigned long flags;
356 330
331 if (mem_cgroup_subsys.disabled)
332 return;
333
357 /* 334 /*
358 * We cannot lock_page_cgroup while holding zone's lru_lock, 335 * We cannot lock_page_cgroup while holding zone's lru_lock,
359 * because other holders of lock_page_cgroup can be interrupted 336 * because other holders of lock_page_cgroup can be interrupted
@@ -361,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, bool active)
361 * safely get to page_cgroup without it, so just try_lock it: 338 * safely get to page_cgroup without it, so just try_lock it:
362 * mem_cgroup_isolate_pages allows for page left on wrong list. 339 * mem_cgroup_isolate_pages allows for page left on wrong list.
363 */ 340 */
364 if (!try_lock_page_cgroup(page)) 341 pc = lookup_page_cgroup(page);
342 if (!trylock_page_cgroup(pc))
365 return; 343 return;
366 344 if (pc && PageCgroupUsed(pc)) {
367 pc = page_get_page_cgroup(page);
368 if (pc) {
369 mz = page_cgroup_zoneinfo(pc); 345 mz = page_cgroup_zoneinfo(pc);
370 spin_lock_irqsave(&mz->lru_lock, flags); 346 spin_lock_irqsave(&mz->lru_lock, flags);
371 __mem_cgroup_move_lists(pc, active); 347 __mem_cgroup_move_lists(pc, lru);
372 spin_unlock_irqrestore(&mz->lru_lock, flags); 348 spin_unlock_irqrestore(&mz->lru_lock, flags);
373 } 349 }
374 unlock_page_cgroup(page); 350 unlock_page_cgroup(pc);
375} 351}
376 352
377/* 353/*
@@ -392,21 +368,6 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
392} 368}
393 369
394/* 370/*
395 * This function is called from vmscan.c. In page reclaiming loop. balance
396 * between active and inactive list is calculated. For memory controller
397 * page reclaiming, we should use using mem_cgroup's imbalance rather than
398 * zone's global lru imbalance.
399 */
400long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
401{
402 unsigned long active, inactive;
403 /* active and inactive are the number of pages. 'long' is ok.*/
404 active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
405 inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
406 return (long) (active / (inactive + 1));
407}
408
409/*
410 * prev_priority control...this will be used in memory reclaim path. 371 * prev_priority control...this will be used in memory reclaim path.
411 */ 372 */
412int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 373int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
@@ -433,28 +394,17 @@ void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
433 * (see include/linux/mmzone.h) 394 * (see include/linux/mmzone.h)
434 */ 395 */
435 396
436long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, 397long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
437 struct zone *zone, int priority) 398 int priority, enum lru_list lru)
438{ 399{
439 long nr_active; 400 long nr_pages;
440 int nid = zone->zone_pgdat->node_id; 401 int nid = zone->zone_pgdat->node_id;
441 int zid = zone_idx(zone); 402 int zid = zone_idx(zone);
442 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); 403 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
443 404
444 nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE); 405 nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
445 return (nr_active >> priority);
446}
447 406
448long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, 407 return (nr_pages >> priority);
449 struct zone *zone, int priority)
450{
451 long nr_inactive;
452 int nid = zone->zone_pgdat->node_id;
453 int zid = zone_idx(zone);
454 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
455
456 nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
457 return (nr_inactive >> priority);
458} 408}
459 409
460unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 410unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
@@ -462,7 +412,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
462 unsigned long *scanned, int order, 412 unsigned long *scanned, int order,
463 int mode, struct zone *z, 413 int mode, struct zone *z,
464 struct mem_cgroup *mem_cont, 414 struct mem_cgroup *mem_cont,
465 int active) 415 int active, int file)
466{ 416{
467 unsigned long nr_taken = 0; 417 unsigned long nr_taken = 0;
468 struct page *page; 418 struct page *page;
@@ -473,38 +423,38 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
473 int nid = z->zone_pgdat->node_id; 423 int nid = z->zone_pgdat->node_id;
474 int zid = zone_idx(z); 424 int zid = zone_idx(z);
475 struct mem_cgroup_per_zone *mz; 425 struct mem_cgroup_per_zone *mz;
426 int lru = LRU_FILE * !!file + !!active;
476 427
477 BUG_ON(!mem_cont); 428 BUG_ON(!mem_cont);
478 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 429 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
479 if (active) 430 src = &mz->lists[lru];
480 src = &mz->active_list;
481 else
482 src = &mz->inactive_list;
483
484 431
485 spin_lock(&mz->lru_lock); 432 spin_lock(&mz->lru_lock);
486 scan = 0; 433 scan = 0;
487 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 434 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
488 if (scan >= nr_to_scan) 435 if (scan >= nr_to_scan)
489 break; 436 break;
437 if (unlikely(!PageCgroupUsed(pc)))
438 continue;
490 page = pc->page; 439 page = pc->page;
491 440
492 if (unlikely(!PageLRU(page))) 441 if (unlikely(!PageLRU(page)))
493 continue; 442 continue;
494 443
495 if (PageActive(page) && !active) { 444 /*
496 __mem_cgroup_move_lists(pc, true); 445 * TODO: play better with lumpy reclaim, grabbing anything.
497 continue; 446 */
498 } 447 if (PageUnevictable(page) ||
499 if (!PageActive(page) && active) { 448 (PageActive(page) && !active) ||
500 __mem_cgroup_move_lists(pc, false); 449 (!PageActive(page) && active)) {
450 __mem_cgroup_move_lists(pc, page_lru(page));
501 continue; 451 continue;
502 } 452 }
503 453
504 scan++; 454 scan++;
505 list_move(&pc->lru, &pc_list); 455 list_move(&pc->lru, &pc_list);
506 456
507 if (__isolate_lru_page(page, mode) == 0) { 457 if (__isolate_lru_page(page, mode, file) == 0) {
508 list_move(&page->lru, dst); 458 list_move(&page->lru, dst);
509 nr_taken++; 459 nr_taken++;
510 } 460 }
@@ -524,63 +474,45 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
524 * < 0 if the cgroup is over its limit 474 * < 0 if the cgroup is over its limit
525 */ 475 */
526static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 476static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
527 gfp_t gfp_mask, enum charge_type ctype) 477 gfp_t gfp_mask, enum charge_type ctype,
478 struct mem_cgroup *memcg)
528{ 479{
529 struct mem_cgroup *mem; 480 struct mem_cgroup *mem;
530 struct page_cgroup *pc; 481 struct page_cgroup *pc;
531 unsigned long flags;
532 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 482 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
533 struct mem_cgroup_per_zone *mz; 483 struct mem_cgroup_per_zone *mz;
484 unsigned long flags;
534 485
535 if (mem_cgroup_subsys.disabled) 486 pc = lookup_page_cgroup(page);
487 /* can happen at boot */
488 if (unlikely(!pc))
536 return 0; 489 return 0;
537 490 prefetchw(pc);
538 /*
539 * Should page_cgroup's go to their own slab?
540 * One could optimize the performance of the charging routine
541 * by saving a bit in the page_flags and using it as a lock
542 * to see if the cgroup page already has a page_cgroup associated
543 * with it
544 */
545retry:
546 lock_page_cgroup(page);
547 pc = page_get_page_cgroup(page);
548 /*
549 * The page_cgroup exists and
550 * the page has already been accounted.
551 */
552 if (pc) {
553 VM_BUG_ON(pc->page != page);
554 VM_BUG_ON(pc->ref_cnt <= 0);
555
556 pc->ref_cnt++;
557 unlock_page_cgroup(page);
558 goto done;
559 }
560 unlock_page_cgroup(page);
561
562 pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask);
563 if (pc == NULL)
564 goto err;
565
566 /* 491 /*
567 * We always charge the cgroup the mm_struct belongs to. 492 * We always charge the cgroup the mm_struct belongs to.
568 * The mm_struct's mem_cgroup changes on task migration if the 493 * The mm_struct's mem_cgroup changes on task migration if the
569 * thread group leader migrates. It's possible that mm is not 494 * thread group leader migrates. It's possible that mm is not
570 * set, if so charge the init_mm (happens for pagecache usage). 495 * set, if so charge the init_mm (happens for pagecache usage).
571 */ 496 */
572 if (!mm)
573 mm = &init_mm;
574 497
575 rcu_read_lock(); 498 if (likely(!memcg)) {
576 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 499 rcu_read_lock();
577 /* 500 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
578 * For every charge from the cgroup, increment reference count 501 if (unlikely(!mem)) {
579 */ 502 rcu_read_unlock();
580 css_get(&mem->css); 503 return 0;
581 rcu_read_unlock(); 504 }
505 /*
506 * For every charge from the cgroup, increment reference count
507 */
508 css_get(&mem->css);
509 rcu_read_unlock();
510 } else {
511 mem = memcg;
512 css_get(&memcg->css);
513 }
582 514
583 while (res_counter_charge(&mem->res, PAGE_SIZE)) { 515 while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
584 if (!(gfp_mask & __GFP_WAIT)) 516 if (!(gfp_mask & __GFP_WAIT))
585 goto out; 517 goto out;
586 518
@@ -603,63 +535,104 @@ retry:
603 } 535 }
604 } 536 }
605 537
606 pc->ref_cnt = 1; 538
607 pc->mem_cgroup = mem; 539 lock_page_cgroup(pc);
608 pc->page = page; 540 if (unlikely(PageCgroupUsed(pc))) {
609 pc->flags = PAGE_CGROUP_FLAG_ACTIVE; 541 unlock_page_cgroup(pc);
610 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
611 pc->flags = PAGE_CGROUP_FLAG_CACHE;
612
613 lock_page_cgroup(page);
614 if (page_get_page_cgroup(page)) {
615 unlock_page_cgroup(page);
616 /*
617 * Another charge has been added to this page already.
618 * We take lock_page_cgroup(page) again and read
619 * page->cgroup, increment refcnt.... just retry is OK.
620 */
621 res_counter_uncharge(&mem->res, PAGE_SIZE); 542 res_counter_uncharge(&mem->res, PAGE_SIZE);
622 css_put(&mem->css); 543 css_put(&mem->css);
623 kmem_cache_free(page_cgroup_cache, pc); 544
624 goto retry; 545 goto done;
625 } 546 }
626 page_assign_page_cgroup(page, pc); 547 pc->mem_cgroup = mem;
548 /*
549 * If a page is accounted as a page cache, insert to inactive list.
550 * If anon, insert to active list.
551 */
552 pc->flags = pcg_default_flags[ctype];
627 553
628 mz = page_cgroup_zoneinfo(pc); 554 mz = page_cgroup_zoneinfo(pc);
555
629 spin_lock_irqsave(&mz->lru_lock, flags); 556 spin_lock_irqsave(&mz->lru_lock, flags);
630 __mem_cgroup_add_list(mz, pc); 557 __mem_cgroup_add_list(mz, pc);
631 spin_unlock_irqrestore(&mz->lru_lock, flags); 558 spin_unlock_irqrestore(&mz->lru_lock, flags);
559 unlock_page_cgroup(pc);
632 560
633 unlock_page_cgroup(page);
634done: 561done:
635 return 0; 562 return 0;
636out: 563out:
637 css_put(&mem->css); 564 css_put(&mem->css);
638 kmem_cache_free(page_cgroup_cache, pc);
639err:
640 return -ENOMEM; 565 return -ENOMEM;
641} 566}
642 567
643int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) 568int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
644{ 569{
570 if (mem_cgroup_subsys.disabled)
571 return 0;
572 if (PageCompound(page))
573 return 0;
574 /*
575 * If already mapped, we don't have to account.
576 * If page cache, page->mapping has address_space.
577	 * But page->mapping may have an out-of-use anon_vma pointer;
578	 * detect it by a PageAnon() check. A newly-mapped anon page's
579	 * page->mapping is NULL.
580 */
581 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
582 return 0;
583 if (unlikely(!mm))
584 mm = &init_mm;
645 return mem_cgroup_charge_common(page, mm, gfp_mask, 585 return mem_cgroup_charge_common(page, mm, gfp_mask,
646 MEM_CGROUP_CHARGE_TYPE_MAPPED); 586 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
647} 587}
648 588
649int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 589int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
650 gfp_t gfp_mask) 590 gfp_t gfp_mask)
651{ 591{
652 if (!mm) 592 if (mem_cgroup_subsys.disabled)
593 return 0;
594 if (PageCompound(page))
595 return 0;
596 /*
597	 * Corner case handling. This is usually called from add_to_page_cache(),
598	 * but some filesystems (shmem) precharge the page before calling it
599	 * and then call add_to_page_cache() with GFP_NOWAIT.
600	 *
601	 * In the GFP_NOWAIT case, the page may be pre-charged before calling
602	 * add_to_page_cache() (see shmem.c); check it here and avoid charging
603	 * twice. (It works but has to pay a slightly larger cost.)
604 */
605 if (!(gfp_mask & __GFP_WAIT)) {
606 struct page_cgroup *pc;
607
608
609 pc = lookup_page_cgroup(page);
610 if (!pc)
611 return 0;
612 lock_page_cgroup(pc);
613 if (PageCgroupUsed(pc)) {
614 unlock_page_cgroup(pc);
615 return 0;
616 }
617 unlock_page_cgroup(pc);
618 }
619
620 if (unlikely(!mm))
653 mm = &init_mm; 621 mm = &init_mm;
654 return mem_cgroup_charge_common(page, mm, gfp_mask, 622
655 MEM_CGROUP_CHARGE_TYPE_CACHE); 623 if (page_is_file_cache(page))
624 return mem_cgroup_charge_common(page, mm, gfp_mask,
625 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
626 else
627 return mem_cgroup_charge_common(page, mm, gfp_mask,
628 MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
656} 629}
657 630
658/* 631/*
659 * Uncharging is always a welcome operation, we never complain, simply 632 * uncharge if !page_mapped(page)
660 * uncharge.
661 */ 633 */
662void mem_cgroup_uncharge_page(struct page *page) 634static void
635__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
663{ 636{
664 struct page_cgroup *pc; 637 struct page_cgroup *pc;
665 struct mem_cgroup *mem; 638 struct mem_cgroup *mem;
@@ -672,106 +645,172 @@ void mem_cgroup_uncharge_page(struct page *page)
672 /* 645 /*
673 * Check if our page_cgroup is valid 646 * Check if our page_cgroup is valid
674 */ 647 */
675 lock_page_cgroup(page); 648 pc = lookup_page_cgroup(page);
676 pc = page_get_page_cgroup(page); 649 if (unlikely(!pc || !PageCgroupUsed(pc)))
677 if (!pc) 650 return;
678 goto unlock;
679 651
680 VM_BUG_ON(pc->page != page); 652 lock_page_cgroup(pc);
681 VM_BUG_ON(pc->ref_cnt <= 0); 653 if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
654 || !PageCgroupUsed(pc)) {
655 /* This happens at race in zap_pte_range() and do_swap_page()*/
656 unlock_page_cgroup(pc);
657 return;
658 }
659 ClearPageCgroupUsed(pc);
660 mem = pc->mem_cgroup;
682 661
683 if (--(pc->ref_cnt) == 0) { 662 mz = page_cgroup_zoneinfo(pc);
684 mz = page_cgroup_zoneinfo(pc); 663 spin_lock_irqsave(&mz->lru_lock, flags);
685 spin_lock_irqsave(&mz->lru_lock, flags); 664 __mem_cgroup_remove_list(mz, pc);
686 __mem_cgroup_remove_list(mz, pc); 665 spin_unlock_irqrestore(&mz->lru_lock, flags);
687 spin_unlock_irqrestore(&mz->lru_lock, flags); 666 unlock_page_cgroup(pc);
688 667
689 page_assign_page_cgroup(page, NULL); 668 res_counter_uncharge(&mem->res, PAGE_SIZE);
690 unlock_page_cgroup(page); 669 css_put(&mem->css);
691 670
692 mem = pc->mem_cgroup; 671 return;
693 res_counter_uncharge(&mem->res, PAGE_SIZE); 672}
694 css_put(&mem->css);
695 673
696 kmem_cache_free(page_cgroup_cache, pc); 674void mem_cgroup_uncharge_page(struct page *page)
675{
676 /* early check. */
677 if (page_mapped(page))
697 return; 678 return;
698 } 679 if (page->mapping && !PageAnon(page))
680 return;
681 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
682}
699 683
700unlock: 684void mem_cgroup_uncharge_cache_page(struct page *page)
701 unlock_page_cgroup(page); 685{
686 VM_BUG_ON(page_mapped(page));
687 VM_BUG_ON(page->mapping);
688 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
702} 689}
703 690
704/* 691/*
705 * Returns non-zero if a page (under migration) has valid page_cgroup member. 692 * Before starting migration, account against new page.
706 * Refcnt of page_cgroup is incremented.
707 */ 693 */
708int mem_cgroup_prepare_migration(struct page *page) 694int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
709{ 695{
710 struct page_cgroup *pc; 696 struct page_cgroup *pc;
697 struct mem_cgroup *mem = NULL;
698 enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
699 int ret = 0;
711 700
712 if (mem_cgroup_subsys.disabled) 701 if (mem_cgroup_subsys.disabled)
713 return 0; 702 return 0;
714 703
715 lock_page_cgroup(page); 704 pc = lookup_page_cgroup(page);
716 pc = page_get_page_cgroup(page); 705 lock_page_cgroup(pc);
717 if (pc) 706 if (PageCgroupUsed(pc)) {
718 pc->ref_cnt++; 707 mem = pc->mem_cgroup;
719 unlock_page_cgroup(page); 708 css_get(&mem->css);
720 return pc != NULL; 709 if (PageCgroupCache(pc)) {
710 if (page_is_file_cache(page))
711 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
712 else
713 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
714 }
715 }
716 unlock_page_cgroup(pc);
717 if (mem) {
718 ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
719 ctype, mem);
720 css_put(&mem->css);
721 }
722 return ret;
721} 723}
722 724
723void mem_cgroup_end_migration(struct page *page) 725/* remove redundant charge if migration failed*/
726void mem_cgroup_end_migration(struct page *newpage)
724{ 727{
725 mem_cgroup_uncharge_page(page); 728 /*
729	 * On success, page->mapping is not NULL.
730	 * Special rollback care is necessary when
731	 * 1. migration fails (newpage->mapping is cleared in this case), or
732	 * 2. the newpage was moved but not remapped again because the task
733	 * exits and the newpage is obsolete. In this case, the new page
734	 * may be a swapcache. So, we always call mem_cgroup_uncharge_page()
735	 * to avoid a mess; the page_cgroup will be removed if it turns out
736	 * to be unnecessary. File cache pages are still on the radix tree,
737	 * so don't worry about them.
738 */
739 if (!newpage->mapping)
740 __mem_cgroup_uncharge_common(newpage,
741 MEM_CGROUP_CHARGE_TYPE_FORCE);
742 else if (PageAnon(newpage))
743 mem_cgroup_uncharge_page(newpage);
726} 744}
727 745
728/* 746/*
729 * We know both *page* and *newpage* are now not-on-LRU and PG_locked. 747 * A call to try to shrink memory usage under specified resource controller.
730 * And no race with uncharge() routines because page_cgroup for *page* 748 * This is typically used for page reclaiming for shmem for reducing side
731 * has extra one reference by mem_cgroup_prepare_migration. 749 * effect of page allocation from shmem, which is used by some mem_cgroup.
732 */ 750 */
733void mem_cgroup_page_migration(struct page *page, struct page *newpage) 751int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
734{ 752{
735 struct page_cgroup *pc; 753 struct mem_cgroup *mem;
736 struct mem_cgroup_per_zone *mz; 754 int progress = 0;
737 unsigned long flags; 755 int retry = MEM_CGROUP_RECLAIM_RETRIES;
738 756
739 lock_page_cgroup(page); 757 if (mem_cgroup_subsys.disabled)
740 pc = page_get_page_cgroup(page); 758 return 0;
741 if (!pc) { 759 if (!mm)
742 unlock_page_cgroup(page); 760 return 0;
743 return; 761
762 rcu_read_lock();
763 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
764 if (unlikely(!mem)) {
765 rcu_read_unlock();
766 return 0;
744 } 767 }
768 css_get(&mem->css);
769 rcu_read_unlock();
745 770
746 mz = page_cgroup_zoneinfo(pc); 771 do {
747 spin_lock_irqsave(&mz->lru_lock, flags); 772 progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
748 __mem_cgroup_remove_list(mz, pc); 773 progress += res_counter_check_under_limit(&mem->res);
749 spin_unlock_irqrestore(&mz->lru_lock, flags); 774 } while (!progress && --retry);
750 775
751 page_assign_page_cgroup(page, NULL); 776 css_put(&mem->css);
752 unlock_page_cgroup(page); 777 if (!retry)
778 return -ENOMEM;
779 return 0;
780}
753 781
754 pc->page = newpage; 782int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
755 lock_page_cgroup(newpage); 783{
756 page_assign_page_cgroup(newpage, pc);
757 784
758 mz = page_cgroup_zoneinfo(pc); 785 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
759 spin_lock_irqsave(&mz->lru_lock, flags); 786 int progress;
760 __mem_cgroup_add_list(mz, pc); 787 int ret = 0;
761 spin_unlock_irqrestore(&mz->lru_lock, flags);
762 788
763 unlock_page_cgroup(newpage); 789 while (res_counter_set_limit(&memcg->res, val)) {
790 if (signal_pending(current)) {
791 ret = -EINTR;
792 break;
793 }
794 if (!retry_count) {
795 ret = -EBUSY;
796 break;
797 }
798 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL);
799 if (!progress)
800 retry_count--;
801 }
802 return ret;
764} 803}
765 804
805
766/* 806/*
767 * This routine traverses the page_cgroups in the given list and drops them all. 807 * This routine traverses the page_cgroups in the given list and drops them all.
768 * This routine ignores page_cgroup->ref_cnt.
769 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 808 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
770 */ 809 */
771#define FORCE_UNCHARGE_BATCH (128) 810#define FORCE_UNCHARGE_BATCH (128)
772static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, 811static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
773 struct mem_cgroup_per_zone *mz, 812 struct mem_cgroup_per_zone *mz,
774 int active) 813 enum lru_list lru)
775{ 814{
776 struct page_cgroup *pc; 815 struct page_cgroup *pc;
777 struct page *page; 816 struct page *page;
@@ -779,22 +818,31 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
779 unsigned long flags; 818 unsigned long flags;
780 struct list_head *list; 819 struct list_head *list;
781 820
782 if (active) 821 list = &mz->lists[lru];
783 list = &mz->active_list;
784 else
785 list = &mz->inactive_list;
786 822
787 spin_lock_irqsave(&mz->lru_lock, flags); 823 spin_lock_irqsave(&mz->lru_lock, flags);
788 while (!list_empty(list)) { 824 while (!list_empty(list)) {
789 pc = list_entry(list->prev, struct page_cgroup, lru); 825 pc = list_entry(list->prev, struct page_cgroup, lru);
790 page = pc->page; 826 page = pc->page;
827 if (!PageCgroupUsed(pc))
828 break;
791 get_page(page); 829 get_page(page);
792 spin_unlock_irqrestore(&mz->lru_lock, flags); 830 spin_unlock_irqrestore(&mz->lru_lock, flags);
793 mem_cgroup_uncharge_page(page); 831 /*
794 put_page(page); 832 * Check if this page is on LRU. !LRU page can be found
795 if (--count <= 0) { 833 * if it's under page migration.
796 count = FORCE_UNCHARGE_BATCH; 834 */
797 cond_resched(); 835 if (PageLRU(page)) {
836 __mem_cgroup_uncharge_common(page,
837 MEM_CGROUP_CHARGE_TYPE_FORCE);
838 put_page(page);
839 if (--count <= 0) {
840 count = FORCE_UNCHARGE_BATCH;
841 cond_resched();
842 }
843 } else {
844 spin_lock_irqsave(&mz->lru_lock, flags);
845 break;
798 } 846 }
799 spin_lock_irqsave(&mz->lru_lock, flags); 847 spin_lock_irqsave(&mz->lru_lock, flags);
800 } 848 }
@@ -810,9 +858,6 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
810 int ret = -EBUSY; 858 int ret = -EBUSY;
811 int node, zid; 859 int node, zid;
812 860
813 if (mem_cgroup_subsys.disabled)
814 return 0;
815
816 css_get(&mem->css); 861 css_get(&mem->css);
817 /* 862 /*
818 * page reclaim code (kswapd etc..) will move pages between 863 * page reclaim code (kswapd etc..) will move pages between
@@ -822,15 +867,17 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
822 while (mem->res.usage > 0) { 867 while (mem->res.usage > 0) {
823 if (atomic_read(&mem->css.cgroup->count) > 0) 868 if (atomic_read(&mem->css.cgroup->count) > 0)
824 goto out; 869 goto out;
870 /* This is for making all *used* pages to be on LRU. */
871 lru_add_drain_all();
825 for_each_node_state(node, N_POSSIBLE) 872 for_each_node_state(node, N_POSSIBLE)
826 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 873 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
827 struct mem_cgroup_per_zone *mz; 874 struct mem_cgroup_per_zone *mz;
875 enum lru_list l;
828 mz = mem_cgroup_zoneinfo(mem, node, zid); 876 mz = mem_cgroup_zoneinfo(mem, node, zid);
829 /* drop all page_cgroup in active_list */ 877 for_each_lru(l)
830 mem_cgroup_force_empty_list(mem, mz, 1); 878 mem_cgroup_force_empty_list(mem, mz, l);
831 /* drop all page_cgroup in inactive_list */
832 mem_cgroup_force_empty_list(mem, mz, 0);
833 } 879 }
880 cond_resched();
834 } 881 }
835 ret = 0; 882 ret = 0;
836out: 883out:
@@ -838,32 +885,34 @@ out:
838 return ret; 885 return ret;
839} 886}
840 887
841static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
842{
843 *tmp = memparse(buf, &buf);
844 if (*buf != '\0')
845 return -EINVAL;
846
847 /*
848 * Round up the value to the closest page size
849 */
850 *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
851 return 0;
852}
853
854static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 888static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
855{ 889{
856 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, 890 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
857 cft->private); 891 cft->private);
858} 892}
859 893/*
860static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 894 * The user of this function is...
861 struct file *file, const char __user *userbuf, 895 * RES_LIMIT.
862 size_t nbytes, loff_t *ppos) 896 */
897static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
898 const char *buffer)
863{ 899{
864 return res_counter_write(&mem_cgroup_from_cont(cont)->res, 900 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
865 cft->private, userbuf, nbytes, ppos, 901 unsigned long long val;
866 mem_cgroup_write_strategy); 902 int ret;
903
904 switch (cft->private) {
905 case RES_LIMIT:
906 /* This function does all necessary parse...reuse it */
907 ret = res_counter_memparse_write_strategy(buffer, &val);
908 if (!ret)
909 ret = mem_cgroup_resize_limit(memcg, val);
910 break;
911 default:
912 ret = -EINVAL; /* should be BUG() ? */
913 break;
914 }
915 return ret;
867} 916}
868 917
869static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 918static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
@@ -913,14 +962,27 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
913 } 962 }
914 /* showing # of active pages */ 963 /* showing # of active pages */
915 { 964 {
916 unsigned long active, inactive; 965 unsigned long active_anon, inactive_anon;
917 966 unsigned long active_file, inactive_file;
918 inactive = mem_cgroup_get_all_zonestat(mem_cont, 967 unsigned long unevictable;
919 MEM_CGROUP_ZSTAT_INACTIVE); 968
920 active = mem_cgroup_get_all_zonestat(mem_cont, 969 inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
921 MEM_CGROUP_ZSTAT_ACTIVE); 970 LRU_INACTIVE_ANON);
922 cb->fill(cb, "active", (active) * PAGE_SIZE); 971 active_anon = mem_cgroup_get_all_zonestat(mem_cont,
923 cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); 972 LRU_ACTIVE_ANON);
973 inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
974 LRU_INACTIVE_FILE);
975 active_file = mem_cgroup_get_all_zonestat(mem_cont,
976 LRU_ACTIVE_FILE);
977 unevictable = mem_cgroup_get_all_zonestat(mem_cont,
978 LRU_UNEVICTABLE);
979
980 cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
981 cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
982 cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
983 cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
984 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
985
924 } 986 }
925 return 0; 987 return 0;
926} 988}
@@ -940,7 +1002,7 @@ static struct cftype mem_cgroup_files[] = {
940 { 1002 {
941 .name = "limit_in_bytes", 1003 .name = "limit_in_bytes",
942 .private = RES_LIMIT, 1004 .private = RES_LIMIT,
943 .write = mem_cgroup_write, 1005 .write_string = mem_cgroup_write,
944 .read_u64 = mem_cgroup_read, 1006 .read_u64 = mem_cgroup_read,
945 }, 1007 },
946 { 1008 {
@@ -963,6 +1025,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
963{ 1025{
964 struct mem_cgroup_per_node *pn; 1026 struct mem_cgroup_per_node *pn;
965 struct mem_cgroup_per_zone *mz; 1027 struct mem_cgroup_per_zone *mz;
1028 enum lru_list l;
966 int zone, tmp = node; 1029 int zone, tmp = node;
967 /* 1030 /*
968 * This routine is called against possible nodes. 1031 * This routine is called against possible nodes.
@@ -983,9 +1046,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
983 1046
984 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 1047 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
985 mz = &pn->zoneinfo[zone]; 1048 mz = &pn->zoneinfo[zone];
986 INIT_LIST_HEAD(&mz->active_list);
987 INIT_LIST_HEAD(&mz->inactive_list);
988 spin_lock_init(&mz->lru_lock); 1049 spin_lock_init(&mz->lru_lock);
1050 for_each_lru(l)
1051 INIT_LIST_HEAD(&mz->lists[l]);
989 } 1052 }
990 return 0; 1053 return 0;
991} 1054}
@@ -1026,7 +1089,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
1026 1089
1027 if (unlikely((cont->parent) == NULL)) { 1090 if (unlikely((cont->parent) == NULL)) {
1028 mem = &init_mem_cgroup; 1091 mem = &init_mem_cgroup;
1029 page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
1030 } else { 1092 } else {
1031 mem = mem_cgroup_alloc(); 1093 mem = mem_cgroup_alloc();
1032 if (!mem) 1094 if (!mem)
@@ -1070,8 +1132,6 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1070static int mem_cgroup_populate(struct cgroup_subsys *ss, 1132static int mem_cgroup_populate(struct cgroup_subsys *ss,
1071 struct cgroup *cont) 1133 struct cgroup *cont)
1072{ 1134{
1073 if (mem_cgroup_subsys.disabled)
1074 return 0;
1075 return cgroup_add_files(cont, ss, mem_cgroup_files, 1135 return cgroup_add_files(cont, ss, mem_cgroup_files,
1076 ARRAY_SIZE(mem_cgroup_files)); 1136 ARRAY_SIZE(mem_cgroup_files));
1077} 1137}
@@ -1084,9 +1144,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1084 struct mm_struct *mm; 1144 struct mm_struct *mm;
1085 struct mem_cgroup *mem, *old_mem; 1145 struct mem_cgroup *mem, *old_mem;
1086 1146
1087 if (mem_cgroup_subsys.disabled)
1088 return;
1089
1090 mm = get_task_mm(p); 1147 mm = get_task_mm(p);
1091 if (mm == NULL) 1148 if (mm == NULL)
1092 return; 1149 return;
@@ -1094,9 +1151,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1094 mem = mem_cgroup_from_cont(cont); 1151 mem = mem_cgroup_from_cont(cont);
1095 old_mem = mem_cgroup_from_cont(old_cont); 1152 old_mem = mem_cgroup_from_cont(old_cont);
1096 1153
1097 if (mem == old_mem)
1098 goto out;
1099
1100 /* 1154 /*
1101 * Only thread group leaders are allowed to migrate, the mm_struct is 1155 * Only thread group leaders are allowed to migrate, the mm_struct is
1102 * in effect owned by the leader 1156 * in effect owned by the leader
diff --git a/mm/memory.c b/mm/memory.c
index 2302d228fe04..164951c47305 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -51,6 +51,7 @@
51#include <linux/init.h> 51#include <linux/init.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/memcontrol.h> 53#include <linux/memcontrol.h>
54#include <linux/mmu_notifier.h>
54 55
55#include <asm/pgalloc.h> 56#include <asm/pgalloc.h>
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
@@ -61,6 +62,8 @@
61#include <linux/swapops.h> 62#include <linux/swapops.h>
62#include <linux/elf.h> 63#include <linux/elf.h>
63 64
65#include "internal.h"
66
64#ifndef CONFIG_NEED_MULTIPLE_NODES 67#ifndef CONFIG_NEED_MULTIPLE_NODES
65/* use the per-pgdat data instead for discontigmem - mbligh */ 68/* use the per-pgdat data instead for discontigmem - mbligh */
66unsigned long max_mapnr; 69unsigned long max_mapnr;
@@ -211,7 +214,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
211 * 214 *
212 * Must be called with pagetable lock held. 215 * Must be called with pagetable lock held.
213 */ 216 */
214void free_pgd_range(struct mmu_gather **tlb, 217void free_pgd_range(struct mmu_gather *tlb,
215 unsigned long addr, unsigned long end, 218 unsigned long addr, unsigned long end,
216 unsigned long floor, unsigned long ceiling) 219 unsigned long floor, unsigned long ceiling)
217{ 220{
@@ -262,16 +265,16 @@ void free_pgd_range(struct mmu_gather **tlb,
262 return; 265 return;
263 266
264 start = addr; 267 start = addr;
265 pgd = pgd_offset((*tlb)->mm, addr); 268 pgd = pgd_offset(tlb->mm, addr);
266 do { 269 do {
267 next = pgd_addr_end(addr, end); 270 next = pgd_addr_end(addr, end);
268 if (pgd_none_or_clear_bad(pgd)) 271 if (pgd_none_or_clear_bad(pgd))
269 continue; 272 continue;
270 free_pud_range(*tlb, pgd, addr, next, floor, ceiling); 273 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
271 } while (pgd++, addr = next, addr != end); 274 } while (pgd++, addr = next, addr != end);
272} 275}
273 276
274void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, 277void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
275 unsigned long floor, unsigned long ceiling) 278 unsigned long floor, unsigned long ceiling)
276{ 279{
277 while (vma) { 280 while (vma) {
@@ -372,7 +375,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
372 * 375 *
373 * The calling function must still handle the error. 376 * The calling function must still handle the error.
374 */ 377 */
375void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) 378static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
379 unsigned long vaddr)
376{ 380{
377 printk(KERN_ERR "Bad pte = %08llx, process = %s, " 381 printk(KERN_ERR "Bad pte = %08llx, process = %s, "
378 "vm_flags = %lx, vaddr = %lx\n", 382 "vm_flags = %lx, vaddr = %lx\n",
@@ -649,6 +653,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
649 unsigned long next; 653 unsigned long next;
650 unsigned long addr = vma->vm_start; 654 unsigned long addr = vma->vm_start;
651 unsigned long end = vma->vm_end; 655 unsigned long end = vma->vm_end;
656 int ret;
652 657
653 /* 658 /*
654 * Don't copy ptes where a page fault will fill them correctly. 659 * Don't copy ptes where a page fault will fill them correctly.
@@ -664,17 +669,33 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
664 if (is_vm_hugetlb_page(vma)) 669 if (is_vm_hugetlb_page(vma))
665 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 670 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
666 671
672 /*
673 * We need to invalidate the secondary MMU mappings only when
674 * there could be a permission downgrade on the ptes of the
675 * parent mm. And a permission downgrade will only happen if
676 * is_cow_mapping() returns true.
677 */
678 if (is_cow_mapping(vma->vm_flags))
679 mmu_notifier_invalidate_range_start(src_mm, addr, end);
680
681 ret = 0;
667 dst_pgd = pgd_offset(dst_mm, addr); 682 dst_pgd = pgd_offset(dst_mm, addr);
668 src_pgd = pgd_offset(src_mm, addr); 683 src_pgd = pgd_offset(src_mm, addr);
669 do { 684 do {
670 next = pgd_addr_end(addr, end); 685 next = pgd_addr_end(addr, end);
671 if (pgd_none_or_clear_bad(src_pgd)) 686 if (pgd_none_or_clear_bad(src_pgd))
672 continue; 687 continue;
673 if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, 688 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
674 vma, addr, next)) 689 vma, addr, next))) {
675 return -ENOMEM; 690 ret = -ENOMEM;
691 break;
692 }
676 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 693 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
677 return 0; 694
695 if (is_cow_mapping(vma->vm_flags))
696 mmu_notifier_invalidate_range_end(src_mm,
697 vma->vm_start, end);
698 return ret;
678} 699}
679 700
680static unsigned long zap_pte_range(struct mmu_gather *tlb, 701static unsigned long zap_pte_range(struct mmu_gather *tlb,
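The comment block added in this hunk states the rule precisely: secondary-MMU (mmu notifier) invalidation is only required while the parent's PTEs may be write-protected, which only happens for COW mappings. A hedged sketch of that bracketing pattern, using only the helpers visible in the hunk above; copy_body() is a hypothetical stand-in for the pgd/pud/pmd/pte copy walk, not a kernel function:

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

/* Hedged sketch, not the mainline copy_page_range(): pair the _start/_end
 * notifier calls around any step that may downgrade PTE permissions in
 * src_mm, and only when the mapping is COW. */
static int copy_range_sketch(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                             struct vm_area_struct *vma,
                             unsigned long start, unsigned long end)
{
        int cow = is_cow_mapping(vma->vm_flags);
        int ret;

        if (cow)        /* parent PTEs may be write-protected below */
                mmu_notifier_invalidate_range_start(src_mm, start, end);

        ret = copy_body(dst_mm, src_mm, vma, start, end); /* hypothetical walk */

        if (cow)        /* always paired with the _start call above */
                mmu_notifier_invalidate_range_end(src_mm, start, end);
        return ret;
}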
@@ -878,7 +899,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
878 unsigned long start = start_addr; 899 unsigned long start = start_addr;
879 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; 900 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
880 int fullmm = (*tlbp)->fullmm; 901 int fullmm = (*tlbp)->fullmm;
902 struct mm_struct *mm = vma->vm_mm;
881 903
904 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
882 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { 905 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
883 unsigned long end; 906 unsigned long end;
884 907
@@ -899,9 +922,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
899 } 922 }
900 923
901 if (unlikely(is_vm_hugetlb_page(vma))) { 924 if (unlikely(is_vm_hugetlb_page(vma))) {
902 unmap_hugepage_range(vma, start, end); 925 /*
903 zap_work -= (end - start) / 926 * It is undesirable to test vma->vm_file as it
904 (HPAGE_SIZE / PAGE_SIZE); 927 * should be non-null for valid hugetlb area.
928 * However, vm_file will be NULL in the error
929 * cleanup path of do_mmap_pgoff. When
930 * hugetlbfs ->mmap method fails,
931 * do_mmap_pgoff() nullifies vma->vm_file
932 * before calling this function to clean up.
933 * Since no pte has actually been setup, it is
934 * safe to do nothing in this case.
935 */
936 if (vma->vm_file) {
937 unmap_hugepage_range(vma, start, end, NULL);
938 zap_work -= (end - start) /
939 pages_per_huge_page(hstate_vma(vma));
940 }
941
905 start = end; 942 start = end;
906 } else 943 } else
907 start = unmap_page_range(*tlbp, vma, 944 start = unmap_page_range(*tlbp, vma,
@@ -929,6 +966,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
929 } 966 }
930 } 967 }
931out: 968out:
969 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
932 return start; /* which is now the end (or restart) address */ 970 return start; /* which is now the end (or restart) address */
933} 971}
934 972
@@ -956,6 +994,29 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
956 return end; 994 return end;
957} 995}
958 996
997/**
998 * zap_vma_ptes - remove ptes mapping the vma
999 * @vma: vm_area_struct holding ptes to be zapped
1000 * @address: starting address of pages to zap
1001 * @size: number of bytes to zap
1002 *
1003 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
1004 *
1005 * The entire address range must be fully contained within the vma.
1006 *
1007 * Returns 0 if successful.
1008 */
1009int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1010 unsigned long size)
1011{
1012 if (address < vma->vm_start || address + size > vma->vm_end ||
1013 !(vma->vm_flags & VM_PFNMAP))
1014 return -1;
1015 zap_page_range(vma, address, size, NULL);
1016 return 0;
1017}
1018EXPORT_SYMBOL_GPL(zap_vma_ptes);
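zap_vma_ptes() gives drivers that maintain VM_PFNMAP mappings a supported way to drop the PTEs they installed, for example before revoking access to device memory. A hedged usage sketch; drv_revoke_mapping() and the surrounding driver context are hypothetical, only the zap_vma_ptes() call and its return convention come from the export above:

#include <linux/mm.h>

/* Hypothetical driver helper: remove every PTE the driver previously set up
 * on this VM_PFNMAP vma (e.g. via remap_pfn_range()/vm_insert_pfn()). */
static int drv_revoke_mapping(struct vm_area_struct *vma)
{
        unsigned long len = vma->vm_end - vma->vm_start;

        /* The whole range must lie inside the vma; returns 0 on success. */
        return zap_vma_ptes(vma, vma->vm_start, len);
}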
1019
959/* 1020/*
960 * Do a quick page-table lookup for a single page. 1021 * Do a quick page-table lookup for a single page.
961 */ 1022 */
@@ -982,19 +1043,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
982 goto no_page_table; 1043 goto no_page_table;
983 1044
984 pud = pud_offset(pgd, address); 1045 pud = pud_offset(pgd, address);
985 if (pud_none(*pud) || unlikely(pud_bad(*pud))) 1046 if (pud_none(*pud))
986 goto no_page_table; 1047 goto no_page_table;
987 1048 if (pud_huge(*pud)) {
1049 BUG_ON(flags & FOLL_GET);
1050 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1051 goto out;
1052 }
1053 if (unlikely(pud_bad(*pud)))
1054 goto no_page_table;
1055
988 pmd = pmd_offset(pud, address); 1056 pmd = pmd_offset(pud, address);
989 if (pmd_none(*pmd)) 1057 if (pmd_none(*pmd))
990 goto no_page_table; 1058 goto no_page_table;
991
992 if (pmd_huge(*pmd)) { 1059 if (pmd_huge(*pmd)) {
993 BUG_ON(flags & FOLL_GET); 1060 BUG_ON(flags & FOLL_GET);
994 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1061 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
995 goto out; 1062 goto out;
996 } 1063 }
997
998 if (unlikely(pmd_bad(*pmd))) 1064 if (unlikely(pmd_bad(*pmd)))
999 goto no_page_table; 1065 goto no_page_table;
1000 1066
@@ -1058,19 +1124,22 @@ static inline int use_zero_page(struct vm_area_struct *vma)
1058 if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) 1124 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
1059 return 0; 1125 return 0;
1060 /* 1126 /*
1061 * And if we have a fault or a nopfn routine, it's not an 1127 * And if we have a fault routine, it's not an anonymous region.
1062 * anonymous region.
1063 */ 1128 */
1064 return !vma->vm_ops || 1129 return !vma->vm_ops || !vma->vm_ops->fault;
1065 (!vma->vm_ops->fault && !vma->vm_ops->nopfn);
1066} 1130}
1067 1131
1068int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1132
1069 unsigned long start, int len, int write, int force, 1133
1134int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1135 unsigned long start, int len, int flags,
1070 struct page **pages, struct vm_area_struct **vmas) 1136 struct page **pages, struct vm_area_struct **vmas)
1071{ 1137{
1072 int i; 1138 int i;
1073 unsigned int vm_flags; 1139 unsigned int vm_flags = 0;
1140 int write = !!(flags & GUP_FLAGS_WRITE);
1141 int force = !!(flags & GUP_FLAGS_FORCE);
1142 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1074 1143
1075 if (len <= 0) 1144 if (len <= 0)
1076 return 0; 1145 return 0;
@@ -1094,7 +1163,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1094 pud_t *pud; 1163 pud_t *pud;
1095 pmd_t *pmd; 1164 pmd_t *pmd;
1096 pte_t *pte; 1165 pte_t *pte;
1097 if (write) /* user gate pages are read-only */ 1166
1167 /* user gate pages are read-only */
1168 if (!ignore && write)
1098 return i ? : -EFAULT; 1169 return i ? : -EFAULT;
1099 if (pg > TASK_SIZE) 1170 if (pg > TASK_SIZE)
1100 pgd = pgd_offset_k(pg); 1171 pgd = pgd_offset_k(pg);
@@ -1126,8 +1197,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1126 continue; 1197 continue;
1127 } 1198 }
1128 1199
1129 if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) 1200 if (!vma ||
1130 || !(vm_flags & vma->vm_flags)) 1201 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1202 (!ignore && !(vm_flags & vma->vm_flags)))
1131 return i ? : -EFAULT; 1203 return i ? : -EFAULT;
1132 1204
1133 if (is_vm_hugetlb_page(vma)) { 1205 if (is_vm_hugetlb_page(vma)) {
@@ -1202,6 +1274,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1202 } while (len); 1274 } while (len);
1203 return i; 1275 return i;
1204} 1276}
1277
1278int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1279 unsigned long start, int len, int write, int force,
1280 struct page **pages, struct vm_area_struct **vmas)
1281{
1282 int flags = 0;
1283
1284 if (write)
1285 flags |= GUP_FLAGS_WRITE;
1286 if (force)
1287 flags |= GUP_FLAGS_FORCE;
1288
1289 return __get_user_pages(tsk, mm,
1290 start, len, flags,
1291 pages, vmas);
1292}
1293
1205EXPORT_SYMBOL(get_user_pages); 1294EXPORT_SYMBOL(get_user_pages);
1206 1295
1207pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1296pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
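The write/force integer arguments of get_user_pages() are folded into a single flags word understood by __get_user_pages(), and the exported function becomes the thin wrapper shown above; the internal-only GUP_FLAGS_IGNORE_VMA_PERMISSIONS bit has no equivalent in the public signature. The same bool-to-flags refactoring pattern in a self-contained user-space C sketch (all names below are illustrative, not kernel API):

#include <stdio.h>

#define XFLAG_WRITE        0x1
#define XFLAG_FORCE        0x2
#define XFLAG_IGNORE_PERMS 0x4   /* internal-only bit, no bool equivalent */

/* Internal worker: one flags word instead of a growing list of int args. */
static int do_pin(unsigned long start, int len, int flags)
{
        int write = !!(flags & XFLAG_WRITE);
        int force = !!(flags & XFLAG_FORCE);

        printf("pin %#lx len=%d write=%d force=%d ignore=%d\n",
               start, len, write, force, !!(flags & XFLAG_IGNORE_PERMS));
        return len;     /* pretend every page was pinned */
}

/* Public wrapper keeps the historical bool-style signature. */
static int pin_pages(unsigned long start, int len, int write, int force)
{
        int flags = 0;

        if (write)
                flags |= XFLAG_WRITE;
        if (force)
                flags |= XFLAG_FORCE;
        return do_pin(start, len, flags);
}

int main(void)
{
        return pin_pages(0x1000, 4, 1, 0) == 4 ? 0 : 1;
}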
@@ -1232,18 +1321,14 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1232 pte_t *pte; 1321 pte_t *pte;
1233 spinlock_t *ptl; 1322 spinlock_t *ptl;
1234 1323
1235 retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
1236 if (retval)
1237 goto out;
1238
1239 retval = -EINVAL; 1324 retval = -EINVAL;
1240 if (PageAnon(page)) 1325 if (PageAnon(page))
1241 goto out_uncharge; 1326 goto out;
1242 retval = -ENOMEM; 1327 retval = -ENOMEM;
1243 flush_dcache_page(page); 1328 flush_dcache_page(page);
1244 pte = get_locked_pte(mm, addr, &ptl); 1329 pte = get_locked_pte(mm, addr, &ptl);
1245 if (!pte) 1330 if (!pte)
1246 goto out_uncharge; 1331 goto out;
1247 retval = -EBUSY; 1332 retval = -EBUSY;
1248 if (!pte_none(*pte)) 1333 if (!pte_none(*pte))
1249 goto out_unlock; 1334 goto out_unlock;
@@ -1259,8 +1344,6 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1259 return retval; 1344 return retval;
1260out_unlock: 1345out_unlock:
1261 pte_unmap_unlock(pte, ptl); 1346 pte_unmap_unlock(pte, ptl);
1262out_uncharge:
1263 mem_cgroup_uncharge_page(page);
1264out: 1347out:
1265 return retval; 1348 return retval;
1266} 1349}
@@ -1338,6 +1421,11 @@ out:
1338 * 1421 *
1339 * This function should only be called from a vm_ops->fault handler, and 1422 * This function should only be called from a vm_ops->fault handler, and
1340 * in that case the handler should return NULL. 1423 * in that case the handler should return NULL.
1424 *
1425 * vma cannot be a COW mapping.
1426 *
1427 * As this is called only for pages that do not currently exist, we
1428 * do not need to flush old virtual caches or the TLB.
1341 */ 1429 */
1342int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1430int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1343 unsigned long pfn) 1431 unsigned long pfn)
@@ -1548,6 +1636,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1548 unsigned long next; 1636 unsigned long next;
1549 int err; 1637 int err;
1550 1638
1639 BUG_ON(pud_huge(*pud));
1640
1551 pmd = pmd_alloc(mm, pud, addr); 1641 pmd = pmd_alloc(mm, pud, addr);
1552 if (!pmd) 1642 if (!pmd)
1553 return -ENOMEM; 1643 return -ENOMEM;
@@ -1589,10 +1679,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1589{ 1679{
1590 pgd_t *pgd; 1680 pgd_t *pgd;
1591 unsigned long next; 1681 unsigned long next;
1592 unsigned long end = addr + size; 1682 unsigned long start = addr, end = addr + size;
1593 int err; 1683 int err;
1594 1684
1595 BUG_ON(addr >= end); 1685 BUG_ON(addr >= end);
1686 mmu_notifier_invalidate_range_start(mm, start, end);
1596 pgd = pgd_offset(mm, addr); 1687 pgd = pgd_offset(mm, addr);
1597 do { 1688 do {
1598 next = pgd_addr_end(addr, end); 1689 next = pgd_addr_end(addr, end);
@@ -1600,6 +1691,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1600 if (err) 1691 if (err)
1601 break; 1692 break;
1602 } while (pgd++, addr = next, addr != end); 1693 } while (pgd++, addr = next, addr != end);
1694 mmu_notifier_invalidate_range_end(mm, start, end);
1603 return err; 1695 return err;
1604} 1696}
1605EXPORT_SYMBOL_GPL(apply_to_page_range); 1697EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1716,7 +1808,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1716 * not dirty accountable. 1808 * not dirty accountable.
1717 */ 1809 */
1718 if (PageAnon(old_page)) { 1810 if (PageAnon(old_page)) {
1719 if (!TestSetPageLocked(old_page)) { 1811 if (trylock_page(old_page)) {
1720 reuse = can_share_swap_page(old_page); 1812 reuse = can_share_swap_page(old_page);
1721 unlock_page(old_page); 1813 unlock_page(old_page);
1722 } 1814 }
@@ -1785,6 +1877,15 @@ gotten:
1785 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1877 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1786 if (!new_page) 1878 if (!new_page)
1787 goto oom; 1879 goto oom;
1880 /*
1881 * Don't let another task, with possibly unlocked vma,
1882 * keep the mlocked page.
1883 */
1884 if (vma->vm_flags & VM_LOCKED) {
1885 lock_page(old_page); /* for LRU manipulation */
1886 clear_page_mlock(old_page);
1887 unlock_page(old_page);
1888 }
1788 cow_user_page(new_page, old_page, address, vma); 1889 cow_user_page(new_page, old_page, address, vma);
1789 __SetPageUptodate(new_page); 1890 __SetPageUptodate(new_page);
1790 1891
@@ -1812,12 +1913,14 @@ gotten:
1812 * seen in the presence of one thread doing SMC and another 1913 * seen in the presence of one thread doing SMC and another
1813 * thread doing COW. 1914 * thread doing COW.
1814 */ 1915 */
1815 ptep_clear_flush(vma, address, page_table); 1916 ptep_clear_flush_notify(vma, address, page_table);
1816 set_pte_at(mm, address, page_table, entry); 1917 SetPageSwapBacked(new_page);
1817 update_mmu_cache(vma, address, entry); 1918 lru_cache_add_active_or_unevictable(new_page, vma);
1818 lru_cache_add_active(new_page);
1819 page_add_new_anon_rmap(new_page, vma, address); 1919 page_add_new_anon_rmap(new_page, vma, address);
1820 1920
1921//TODO: is this safe? do_anonymous_page() does it this way.
1922 set_pte_at(mm, address, page_table, entry);
1923 update_mmu_cache(vma, address, entry);
1821 if (old_page) { 1924 if (old_page) {
1822 /* 1925 /*
1823 * Only after switching the pte to the new page may 1926 * Only after switching the pte to the new page may
@@ -2215,16 +2318,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2215 count_vm_event(PGMAJFAULT); 2318 count_vm_event(PGMAJFAULT);
2216 } 2319 }
2217 2320
2321 mark_page_accessed(page);
2322
2323 lock_page(page);
2324 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2325
2218 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { 2326 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2219 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2220 ret = VM_FAULT_OOM; 2327 ret = VM_FAULT_OOM;
2328 unlock_page(page);
2221 goto out; 2329 goto out;
2222 } 2330 }
2223 2331
2224 mark_page_accessed(page);
2225 lock_page(page);
2226 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2227
2228 /* 2332 /*
2229 * Back out if somebody else already faulted in this pte. 2333 * Back out if somebody else already faulted in this pte.
2230 */ 2334 */
@@ -2251,7 +2355,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2251 page_add_anon_rmap(page, vma, address); 2355 page_add_anon_rmap(page, vma, address);
2252 2356
2253 swap_free(entry); 2357 swap_free(entry);
2254 if (vm_swap_full()) 2358 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2255 remove_exclusive_swap_page(page); 2359 remove_exclusive_swap_page(page);
2256 unlock_page(page); 2360 unlock_page(page);
2257 2361
@@ -2309,7 +2413,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2309 if (!pte_none(*page_table)) 2413 if (!pte_none(*page_table))
2310 goto release; 2414 goto release;
2311 inc_mm_counter(mm, anon_rss); 2415 inc_mm_counter(mm, anon_rss);
2312 lru_cache_add_active(page); 2416 SetPageSwapBacked(page);
2417 lru_cache_add_active_or_unevictable(page, vma);
2313 page_add_new_anon_rmap(page, vma, address); 2418 page_add_new_anon_rmap(page, vma, address);
2314 set_pte_at(mm, address, page_table, entry); 2419 set_pte_at(mm, address, page_table, entry);
2315 2420
@@ -2350,6 +2455,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2350 struct page *page; 2455 struct page *page;
2351 pte_t entry; 2456 pte_t entry;
2352 int anon = 0; 2457 int anon = 0;
2458 int charged = 0;
2353 struct page *dirty_page = NULL; 2459 struct page *dirty_page = NULL;
2354 struct vm_fault vmf; 2460 struct vm_fault vmf;
2355 int ret; 2461 int ret;
@@ -2390,6 +2496,18 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2390 ret = VM_FAULT_OOM; 2496 ret = VM_FAULT_OOM;
2391 goto out; 2497 goto out;
2392 } 2498 }
2499 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2500 ret = VM_FAULT_OOM;
2501 page_cache_release(page);
2502 goto out;
2503 }
2504 charged = 1;
2505 /*
2506 * Don't let another task, with possibly unlocked vma,
2507 * keep the mlocked page.
2508 */
2509 if (vma->vm_flags & VM_LOCKED)
2510 clear_page_mlock(vmf.page);
2393 copy_user_highpage(page, vmf.page, address, vma); 2511 copy_user_highpage(page, vmf.page, address, vma);
2394 __SetPageUptodate(page); 2512 __SetPageUptodate(page);
2395 } else { 2513 } else {
@@ -2424,11 +2542,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2424 2542
2425 } 2543 }
2426 2544
2427 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2428 ret = VM_FAULT_OOM;
2429 goto out;
2430 }
2431
2432 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2545 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2433 2546
2434 /* 2547 /*
@@ -2447,11 +2560,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2447 entry = mk_pte(page, vma->vm_page_prot); 2560 entry = mk_pte(page, vma->vm_page_prot);
2448 if (flags & FAULT_FLAG_WRITE) 2561 if (flags & FAULT_FLAG_WRITE)
2449 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2562 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2450 set_pte_at(mm, address, page_table, entry);
2451 if (anon) { 2563 if (anon) {
2452 inc_mm_counter(mm, anon_rss); 2564 inc_mm_counter(mm, anon_rss);
2453 lru_cache_add_active(page); 2565 SetPageSwapBacked(page);
2454 page_add_new_anon_rmap(page, vma, address); 2566 lru_cache_add_active_or_unevictable(page, vma);
2567 page_add_new_anon_rmap(page, vma, address);
2455 } else { 2568 } else {
2456 inc_mm_counter(mm, file_rss); 2569 inc_mm_counter(mm, file_rss);
2457 page_add_file_rmap(page); 2570 page_add_file_rmap(page);
@@ -2460,11 +2573,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2460 get_page(dirty_page); 2573 get_page(dirty_page);
2461 } 2574 }
2462 } 2575 }
2576//TODO: is this safe? do_anonymous_page() does it this way.
2577 set_pte_at(mm, address, page_table, entry);
2463 2578
2464 /* no need to invalidate: a not-present page won't be cached */ 2579 /* no need to invalidate: a not-present page won't be cached */
2465 update_mmu_cache(vma, address, entry); 2580 update_mmu_cache(vma, address, entry);
2466 } else { 2581 } else {
2467 mem_cgroup_uncharge_page(page); 2582 if (charged)
2583 mem_cgroup_uncharge_page(page);
2468 if (anon) 2584 if (anon)
2469 page_cache_release(page); 2585 page_cache_release(page);
2470 else 2586 else
@@ -2501,59 +2617,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2501 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 2617 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2502} 2618}
2503 2619
2504
2505/*
2506 * do_no_pfn() tries to create a new page mapping for a page without
2507 * a struct_page backing it
2508 *
2509 * As this is called only for pages that do not currently exist, we
2510 * do not need to flush old virtual caches or the TLB.
2511 *
2512 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2513 * but allow concurrent faults), and pte mapped but not yet locked.
2514 * We return with mmap_sem still held, but pte unmapped and unlocked.
2515 *
2516 * It is expected that the ->nopfn handler always returns the same pfn
2517 * for a given virtual mapping.
2518 *
2519 * Mark this `noinline' to prevent it from bloating the main pagefault code.
2520 */
2521static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
2522 unsigned long address, pte_t *page_table, pmd_t *pmd,
2523 int write_access)
2524{
2525 spinlock_t *ptl;
2526 pte_t entry;
2527 unsigned long pfn;
2528
2529 pte_unmap(page_table);
2530 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2531 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2532
2533 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
2534
2535 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2536
2537 if (unlikely(pfn == NOPFN_OOM))
2538 return VM_FAULT_OOM;
2539 else if (unlikely(pfn == NOPFN_SIGBUS))
2540 return VM_FAULT_SIGBUS;
2541 else if (unlikely(pfn == NOPFN_REFAULT))
2542 return 0;
2543
2544 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2545
2546 /* Only go through if we didn't race with anybody else... */
2547 if (pte_none(*page_table)) {
2548 entry = pfn_pte(pfn, vma->vm_page_prot);
2549 if (write_access)
2550 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2551 set_pte_at(mm, address, page_table, entry);
2552 }
2553 pte_unmap_unlock(page_table, ptl);
2554 return 0;
2555}
2556
2557/* 2620/*
2558 * Fault of a previously existing named mapping. Repopulate the pte 2621 * Fault of a previously existing named mapping. Repopulate the pte
2559 * from the encoded file_pte if possible. This enables swappable 2622 * from the encoded file_pte if possible. This enables swappable
@@ -2614,9 +2677,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2614 if (likely(vma->vm_ops->fault)) 2677 if (likely(vma->vm_ops->fault))
2615 return do_linear_fault(mm, vma, address, 2678 return do_linear_fault(mm, vma, address,
2616 pte, pmd, write_access, entry); 2679 pte, pmd, write_access, entry);
2617 if (unlikely(vma->vm_ops->nopfn))
2618 return do_no_pfn(mm, vma, address, pte,
2619 pmd, write_access);
2620 } 2680 }
2621 return do_anonymous_page(mm, vma, address, 2681 return do_anonymous_page(mm, vma, address,
2622 pte, pmd, write_access); 2682 pte, pmd, write_access);
@@ -2748,7 +2808,7 @@ int make_pages_present(unsigned long addr, unsigned long end)
2748 2808
2749 vma = find_vma(current->mm, addr); 2809 vma = find_vma(current->mm, addr);
2750 if (!vma) 2810 if (!vma)
2751 return -1; 2811 return -ENOMEM;
2752 write = (vma->vm_flags & VM_WRITE) != 0; 2812 write = (vma->vm_flags & VM_WRITE) != 0;
2753 BUG_ON(addr >= end); 2813 BUG_ON(addr >= end);
2754 BUG_ON(end > vma->vm_end); 2814 BUG_ON(end > vma->vm_end);
@@ -2757,7 +2817,7 @@ int make_pages_present(unsigned long addr, unsigned long end)
2757 len, write, 0, NULL, NULL); 2817 len, write, 0, NULL, NULL);
2758 if (ret < 0) 2818 if (ret < 0)
2759 return ret; 2819 return ret;
2760 return ret == len ? 0 : -1; 2820 return ret == len ? 0 : -EFAULT;
2761} 2821}
2762 2822
2763#if !defined(__HAVE_ARCH_GATE_AREA) 2823#if !defined(__HAVE_ARCH_GATE_AREA)
@@ -2804,6 +2864,86 @@ int in_gate_area_no_task(unsigned long addr)
2804 2864
2805#endif /* __HAVE_ARCH_GATE_AREA */ 2865#endif /* __HAVE_ARCH_GATE_AREA */
2806 2866
2867#ifdef CONFIG_HAVE_IOREMAP_PROT
2868static resource_size_t follow_phys(struct vm_area_struct *vma,
2869 unsigned long address, unsigned int flags,
2870 unsigned long *prot)
2871{
2872 pgd_t *pgd;
2873 pud_t *pud;
2874 pmd_t *pmd;
2875 pte_t *ptep, pte;
2876 spinlock_t *ptl;
2877 resource_size_t phys_addr = 0;
2878 struct mm_struct *mm = vma->vm_mm;
2879
2880 VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
2881
2882 pgd = pgd_offset(mm, address);
2883 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
2884 goto no_page_table;
2885
2886 pud = pud_offset(pgd, address);
2887 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
2888 goto no_page_table;
2889
2890 pmd = pmd_offset(pud, address);
2891 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
2892 goto no_page_table;
2893
2894 /* We cannot handle huge page PFN maps. Luckily they don't exist. */
2895 if (pmd_huge(*pmd))
2896 goto no_page_table;
2897
2898 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
2899 if (!ptep)
2900 goto out;
2901
2902 pte = *ptep;
2903 if (!pte_present(pte))
2904 goto unlock;
2905 if ((flags & FOLL_WRITE) && !pte_write(pte))
2906 goto unlock;
2907 phys_addr = pte_pfn(pte);
2908 phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
2909
2910 *prot = pgprot_val(pte_pgprot(pte));
2911
2912unlock:
2913 pte_unmap_unlock(ptep, ptl);
2914out:
2915 return phys_addr;
2916no_page_table:
2917 return 0;
2918}
2919
2920int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
2921 void *buf, int len, int write)
2922{
2923 resource_size_t phys_addr;
2924 unsigned long prot = 0;
2925 void *maddr;
2926 int offset = addr & (PAGE_SIZE-1);
2927
2928 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
2929 return -EINVAL;
2930
2931 phys_addr = follow_phys(vma, addr, write, &prot);
2932
2933 if (!phys_addr)
2934 return -EINVAL;
2935
2936 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
2937 if (write)
2938 memcpy_toio(maddr + offset, buf, len);
2939 else
2940 memcpy_fromio(buf, maddr + offset, len);
2941 iounmap(maddr);
2942
2943 return len;
2944}
2945#endif
2946
2807/* 2947/*
2808 * Access another process' address space. 2948 * Access another process' address space.
2809 * Source/target buffer must be kernel space, 2949 * Source/target buffer must be kernel space,
@@ -2813,7 +2953,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2813{ 2953{
2814 struct mm_struct *mm; 2954 struct mm_struct *mm;
2815 struct vm_area_struct *vma; 2955 struct vm_area_struct *vma;
2816 struct page *page;
2817 void *old_buf = buf; 2956 void *old_buf = buf;
2818 2957
2819 mm = get_task_mm(tsk); 2958 mm = get_task_mm(tsk);
@@ -2825,28 +2964,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2825 while (len) { 2964 while (len) {
2826 int bytes, ret, offset; 2965 int bytes, ret, offset;
2827 void *maddr; 2966 void *maddr;
2967 struct page *page = NULL;
2828 2968
2829 ret = get_user_pages(tsk, mm, addr, 1, 2969 ret = get_user_pages(tsk, mm, addr, 1,
2830 write, 1, &page, &vma); 2970 write, 1, &page, &vma);
2831 if (ret <= 0) 2971 if (ret <= 0) {
2832 break; 2972 /*
2833 2973 * Check if this is a VM_IO | VM_PFNMAP VMA, which
2834 bytes = len; 2974 * we can access using slightly different code.
2835 offset = addr & (PAGE_SIZE-1); 2975 */
2836 if (bytes > PAGE_SIZE-offset) 2976#ifdef CONFIG_HAVE_IOREMAP_PROT
2837 bytes = PAGE_SIZE-offset; 2977 vma = find_vma(mm, addr);
2838 2978 if (!vma)
2839 maddr = kmap(page); 2979 break;
2840 if (write) { 2980 if (vma->vm_ops && vma->vm_ops->access)
2841 copy_to_user_page(vma, page, addr, 2981 ret = vma->vm_ops->access(vma, addr, buf,
2842 maddr + offset, buf, bytes); 2982 len, write);
2843 set_page_dirty_lock(page); 2983 if (ret <= 0)
2984#endif
2985 break;
2986 bytes = ret;
2844 } else { 2987 } else {
2845 copy_from_user_page(vma, page, addr, 2988 bytes = len;
2846 buf, maddr + offset, bytes); 2989 offset = addr & (PAGE_SIZE-1);
2990 if (bytes > PAGE_SIZE-offset)
2991 bytes = PAGE_SIZE-offset;
2992
2993 maddr = kmap(page);
2994 if (write) {
2995 copy_to_user_page(vma, page, addr,
2996 maddr + offset, buf, bytes);
2997 set_page_dirty_lock(page);
2998 } else {
2999 copy_from_user_page(vma, page, addr,
3000 buf, maddr + offset, bytes);
3001 }
3002 kunmap(page);
3003 page_cache_release(page);
2847 } 3004 }
2848 kunmap(page);
2849 page_cache_release(page);
2850 len -= bytes; 3005 len -= bytes;
2851 buf += bytes; 3006 buf += bytes;
2852 addr += bytes; 3007 addr += bytes;
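With follow_phys() and generic_access_phys() in place, access_process_vm() can fall back to a vma's ->access method for VM_IO/VM_PFNMAP areas, so ptrace/gdb can read memory that was mapped with remap_pfn_range(). A hedged sketch of how a driver on a CONFIG_HAVE_IOREMAP_PROT architecture would opt in; drivers/char/mem.c is believed to be the typical in-tree user, and the drv_* names here are made up:

#include <linux/mm.h>

static const struct vm_operations_struct drv_phys_vm_ops = {
#ifdef CONFIG_HAVE_IOREMAP_PROT
        .access = generic_access_phys,  /* signature as added in this diff */
#endif
};

static int drv_mmap(struct file *file, struct vm_area_struct *vma)
{
        vma->vm_ops = &drv_phys_vm_ops;
        /* remap_pfn_range() of the device aperture would follow here. */
        return 0;
}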
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 833f854eabe5..6837a1014372 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,6 +26,7 @@
26#include <linux/delay.h> 26#include <linux/delay.h>
27#include <linux/migrate.h> 27#include <linux/migrate.h>
28#include <linux/page-isolation.h> 28#include <linux/page-isolation.h>
29#include <linux/pfn.h>
29 30
30#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
31 32
@@ -62,9 +63,9 @@ static void release_memory_resource(struct resource *res)
62 63
63#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 64#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
64#ifndef CONFIG_SPARSEMEM_VMEMMAP 65#ifndef CONFIG_SPARSEMEM_VMEMMAP
65static void get_page_bootmem(unsigned long info, struct page *page, int magic) 66static void get_page_bootmem(unsigned long info, struct page *page, int type)
66{ 67{
67 atomic_set(&page->_mapcount, magic); 68 atomic_set(&page->_mapcount, type);
68 SetPagePrivate(page); 69 SetPagePrivate(page);
69 set_page_private(page, info); 70 set_page_private(page, info);
70 atomic_inc(&page->_count); 71 atomic_inc(&page->_count);
@@ -72,10 +73,10 @@ static void get_page_bootmem(unsigned long info, struct page *page, int magic)
72 73
73void put_page_bootmem(struct page *page) 74void put_page_bootmem(struct page *page)
74{ 75{
75 int magic; 76 int type;
76 77
77 magic = atomic_read(&page->_mapcount); 78 type = atomic_read(&page->_mapcount);
78 BUG_ON(magic >= -1); 79 BUG_ON(type >= -1);
79 80
80 if (atomic_dec_return(&page->_count) == 1) { 81 if (atomic_dec_return(&page->_count) == 1) {
81 ClearPagePrivate(page); 82 ClearPagePrivate(page);
@@ -86,7 +87,7 @@ void put_page_bootmem(struct page *page)
86 87
87} 88}
88 89
89void register_page_bootmem_info_section(unsigned long start_pfn) 90static void register_page_bootmem_info_section(unsigned long start_pfn)
90{ 91{
91 unsigned long *usemap, mapsize, section_nr, i; 92 unsigned long *usemap, mapsize, section_nr, i;
92 struct mem_section *ms; 93 struct mem_section *ms;
@@ -119,7 +120,7 @@ void register_page_bootmem_info_section(unsigned long start_pfn)
119 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 120 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
120 121
121 for (i = 0; i < mapsize; i++, page++) 122 for (i = 0; i < mapsize; i++, page++)
122 get_page_bootmem(section_nr, page, MIX_INFO); 123 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
123 124
124} 125}
125 126
@@ -323,11 +324,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
323 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 324 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
324 BUG_ON(nr_pages % PAGES_PER_SECTION); 325 BUG_ON(nr_pages % PAGES_PER_SECTION);
325 326
326 release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
327
328 sections_to_remove = nr_pages / PAGES_PER_SECTION; 327 sections_to_remove = nr_pages / PAGES_PER_SECTION;
329 for (i = 0; i < sections_to_remove; i++) { 328 for (i = 0; i < sections_to_remove; i++) {
330 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 329 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
330 release_mem_region(pfn << PAGE_SHIFT,
331 PAGES_PER_SECTION << PAGE_SHIFT);
331 ret = __remove_section(zone, __pfn_to_section(pfn)); 332 ret = __remove_section(zone, __pfn_to_section(pfn));
332 if (ret) 333 if (ret)
333 break; 334 break;
@@ -429,7 +430,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
429 430
430 if (need_zonelists_rebuild) 431 if (need_zonelists_rebuild)
431 build_all_zonelists(); 432 build_all_zonelists();
432 vm_total_pages = nr_free_pagecache_pages(); 433 else
434 vm_total_pages = nr_free_pagecache_pages();
435
433 writeback_set_ratelimit(); 436 writeback_set_ratelimit();
434 437
435 if (onlined_pages) 438 if (onlined_pages)
@@ -455,7 +458,7 @@ static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
455 /* we can use NODE_DATA(nid) from here */ 458 /* we can use NODE_DATA(nid) from here */
456 459
457 /* init node's zones as empty zones, we don't have any present pages.*/ 460 /* init node's zones as empty zones, we don't have any present pages.*/
458 free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); 461 free_area_init_node(nid, zones_size, start_pfn, zholes_size);
459 462
460 return pgdat; 463 return pgdat;
461} 464}
@@ -521,6 +524,66 @@ EXPORT_SYMBOL_GPL(add_memory);
521 524
522#ifdef CONFIG_MEMORY_HOTREMOVE 525#ifdef CONFIG_MEMORY_HOTREMOVE
523/* 526/*
527 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
528 * set and the size of the free page is given by page_order(). Using this,
529 * the function determines if the pageblock contains only free pages.
 530 * Due to buddy constraints, a free page at least the size of a pageblock will
 531 * be located at the start of the pageblock.
532 */
533static inline int pageblock_free(struct page *page)
534{
535 return PageBuddy(page) && page_order(page) >= pageblock_order;
536}
537
538/* Return the start of the next active pageblock after a given page */
539static struct page *next_active_pageblock(struct page *page)
540{
541 int pageblocks_stride;
542
543 /* Ensure the starting page is pageblock-aligned */
544 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
545
546 /* Move forward by at least 1 * pageblock_nr_pages */
547 pageblocks_stride = 1;
548
549 /* If the entire pageblock is free, move to the end of free page */
550 if (pageblock_free(page))
551 pageblocks_stride += page_order(page) - pageblock_order;
552
553 return page + (pageblocks_stride * pageblock_nr_pages);
554}
555
556/* Checks if this range of memory is likely to be hot-removable. */
557int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
558{
559 int type;
560 struct page *page = pfn_to_page(start_pfn);
561 struct page *end_page = page + nr_pages;
562
563 /* Check the starting page of each pageblock within the range */
564 for (; page < end_page; page = next_active_pageblock(page)) {
565 type = get_pageblock_migratetype(page);
566
567 /*
568 * A pageblock containing MOVABLE or free pages is considered
569 * removable
570 */
571 if (type != MIGRATE_MOVABLE && !pageblock_free(page))
572 return 0;
573
574 /*
575 * A pageblock starting with a PageReserved page is not
576 * considered removable.
577 */
578 if (PageReserved(page))
579 return 0;
580 }
581
582 /* All pageblocks in the memory block are likely to be hot-removable */
583 return 1;
584}
585
586/*
 524 * Confirm all pages in a range [start, end) belong to the same zone. 587 * Confirm all pages in a range [start, end) belong to the same zone.
525 */ 588 */
526static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) 589static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
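is_mem_section_removable() walks a memory block pageblock by pageblock and reports whether everything in it is either free or MIGRATE_MOVABLE. Its usual consumer is the per-memory-block "removable" attribute in sysfs; the path in the sketch below is an assumption based on that interface, so treat it as illustrative rather than guaranteed:

#include <stdio.h>

/* Print whether one memory block looks hot-removable (1) or not (0).
 * Assumed path: /sys/devices/system/memory/memoryN/removable */
int main(void)
{
        char path[128];
        int block = 0;
        int removable = 0;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/memory/memory%d/removable", block);

        FILE *f = fopen(path, "r");
        if (!f) {
                perror(path);
                return 1;
        }
        if (fscanf(f, "%d", &removable) == 1)
                printf("memory%d removable=%d\n", block, removable);
        fclose(f);
        return 0;
}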
@@ -595,8 +658,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
595 * We can skip free pages. And we can only deal with pages on 658 * We can skip free pages. And we can only deal with pages on
596 * LRU. 659 * LRU.
597 */ 660 */
598 ret = isolate_lru_page(page, &source); 661 ret = isolate_lru_page(page);
599 if (!ret) { /* Success */ 662 if (!ret) { /* Success */
663 list_add_tail(&page->lru, &source);
600 move_pages--; 664 move_pages--;
601 } else { 665 } else {
 602 /* Because we don't have big zone->lock, we should 666 /* Because we don't have big zone->lock, we should
@@ -787,10 +851,19 @@ failed_removal:
787 851
788 return ret; 852 return ret;
789} 853}
854
855int remove_memory(u64 start, u64 size)
856{
857 unsigned long start_pfn, end_pfn;
858
859 start_pfn = PFN_DOWN(start);
860 end_pfn = start_pfn + PFN_DOWN(size);
861 return offline_pages(start_pfn, end_pfn, 120 * HZ);
862}
790#else 863#else
791int remove_memory(u64 start, u64 size) 864int remove_memory(u64 start, u64 size)
792{ 865{
793 return -EINVAL; 866 return -EINVAL;
794} 867}
795EXPORT_SYMBOL_GPL(remove_memory);
796#endif /* CONFIG_MEMORY_HOTREMOVE */ 868#endif /* CONFIG_MEMORY_HOTREMOVE */
869EXPORT_SYMBOL_GPL(remove_memory);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c94e58b192c3..36f42573a335 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,6 +93,8 @@
93#include <asm/tlbflush.h> 93#include <asm/tlbflush.h>
94#include <asm/uaccess.h> 94#include <asm/uaccess.h>
95 95
96#include "internal.h"
97
96/* Internal flags */ 98/* Internal flags */
97#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ 99#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
98#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 100#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
@@ -762,8 +764,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
762 /* 764 /*
763 * Avoid migrating a page that is shared with others. 765 * Avoid migrating a page that is shared with others.
764 */ 766 */
765 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) 767 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
766 isolate_lru_page(page, pagelist); 768 if (!isolate_lru_page(page)) {
769 list_add_tail(&page->lru, pagelist);
770 }
771 }
767} 772}
768 773
769static struct page *new_node_page(struct page *page, unsigned long node, int **x) 774static struct page *new_node_page(struct page *page, unsigned long node, int **x)
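isolate_lru_page() no longer takes a target list: it only detaches the page from its LRU (returning 0 on success), and the caller now chains the page onto its own private list, later handing any leftovers back with putback_lru_page()/putback_lru_pages(). The same two-step pattern appears in memory_hotplug.c above and migrate.c below; a hedged sketch of the new calling convention (collect_page() is a hypothetical helper, not kernel code):

#include <linux/list.h>
#include <linux/mm.h>

/* Hedged sketch of the new calling convention used throughout this merge:
 * isolate first, then queue on the caller's own list. */
static void collect_page(struct page *page, struct list_head *pagelist)
{
        /* isolate_lru_page() returns 0 on success, -EBUSY if the page is
         * not on an LRU; it no longer queues the page anywhere itself. */
        if (!isolate_lru_page(page))
                list_add_tail(&page->lru, pagelist);
}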
@@ -803,7 +808,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
803int do_migrate_pages(struct mm_struct *mm, 808int do_migrate_pages(struct mm_struct *mm,
804 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 809 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
805{ 810{
806 LIST_HEAD(pagelist);
807 int busy = 0; 811 int busy = 0;
808 int err = 0; 812 int err = 0;
809 nodemask_t tmp; 813 nodemask_t tmp;
@@ -1481,7 +1485,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1481 1485
1482 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { 1486 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1483 zl = node_zonelist(interleave_nid(*mpol, vma, addr, 1487 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1484 HPAGE_SHIFT), gfp_flags); 1488 huge_page_shift(hstate_vma(vma))), gfp_flags);
1485 } else { 1489 } else {
1486 zl = policy_zonelist(gfp_flags, *mpol); 1490 zl = policy_zonelist(gfp_flags, *mpol);
1487 if ((*mpol)->mode == MPOL_BIND) 1491 if ((*mpol)->mode == MPOL_BIND)
@@ -2198,7 +2202,7 @@ static void gather_stats(struct page *page, void *private, int pte_dirty)
2198 if (PageSwapCache(page)) 2202 if (PageSwapCache(page))
2199 md->swapcache++; 2203 md->swapcache++;
2200 2204
2201 if (PageActive(page)) 2205 if (PageActive(page) || PageUnevictable(page))
2202 md->active++; 2206 md->active++;
2203 2207
2204 if (PageWriteback(page)) 2208 if (PageWriteback(page))
@@ -2220,9 +2224,12 @@ static void check_huge_range(struct vm_area_struct *vma,
2220{ 2224{
2221 unsigned long addr; 2225 unsigned long addr;
2222 struct page *page; 2226 struct page *page;
2227 struct hstate *h = hstate_vma(vma);
2228 unsigned long sz = huge_page_size(h);
2223 2229
2224 for (addr = start; addr < end; addr += HPAGE_SIZE) { 2230 for (addr = start; addr < end; addr += sz) {
2225 pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); 2231 pte_t *ptep = huge_pte_offset(vma->vm_mm,
2232 addr & huge_page_mask(h));
2226 pte_t pte; 2233 pte_t pte;
2227 2234
2228 if (!ptep) 2235 if (!ptep)
diff --git a/mm/migrate.c b/mm/migrate.c
index 55bd355d170d..6602941bfab0 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -30,42 +30,13 @@
30#include <linux/vmalloc.h> 30#include <linux/vmalloc.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/syscalls.h>
33 34
34#include "internal.h" 35#include "internal.h"
35 36
36#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 37#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
37 38
38/* 39/*
39 * Isolate one page from the LRU lists. If successful put it onto
40 * the indicated list with elevated page count.
41 *
42 * Result:
43 * -EBUSY: page not on LRU list
44 * 0: page removed from LRU list and added to the specified list.
45 */
46int isolate_lru_page(struct page *page, struct list_head *pagelist)
47{
48 int ret = -EBUSY;
49
50 if (PageLRU(page)) {
51 struct zone *zone = page_zone(page);
52
53 spin_lock_irq(&zone->lru_lock);
54 if (PageLRU(page) && get_page_unless_zero(page)) {
55 ret = 0;
56 ClearPageLRU(page);
57 if (PageActive(page))
58 del_page_from_active_list(zone, page);
59 else
60 del_page_from_inactive_list(zone, page);
61 list_add_tail(&page->lru, pagelist);
62 }
63 spin_unlock_irq(&zone->lru_lock);
64 }
65 return ret;
66}
67
68/*
69 * migrate_prep() needs to be called before we start compiling a list of pages 40 * migrate_prep() needs to be called before we start compiling a list of pages
70 * to be migrated using isolate_lru_page(). 41 * to be migrated using isolate_lru_page().
71 */ 42 */
@@ -82,23 +53,9 @@ int migrate_prep(void)
82 return 0; 53 return 0;
83} 54}
84 55
85static inline void move_to_lru(struct page *page)
86{
87 if (PageActive(page)) {
88 /*
89 * lru_cache_add_active checks that
90 * the PG_active bit is off.
91 */
92 ClearPageActive(page);
93 lru_cache_add_active(page);
94 } else {
95 lru_cache_add(page);
96 }
97 put_page(page);
98}
99
100/* 56/*
101 * Add isolated pages on the list back to the LRU. 57 * Add isolated pages on the list back to the LRU under page lock
58 * to avoid leaking evictable pages back onto unevictable list.
102 * 59 *
103 * returns the number of pages put back. 60 * returns the number of pages put back.
104 */ 61 */
@@ -110,7 +67,7 @@ int putback_lru_pages(struct list_head *l)
110 67
111 list_for_each_entry_safe(page, page2, l, lru) { 68 list_for_each_entry_safe(page, page2, l, lru) {
112 list_del(&page->lru); 69 list_del(&page->lru);
113 move_to_lru(page); 70 putback_lru_page(page);
114 count++; 71 count++;
115 } 72 }
116 return count; 73 return count;
@@ -284,7 +241,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
284 241
285 page = migration_entry_to_page(entry); 242 page = migration_entry_to_page(entry);
286 243
287 get_page(page); 244 /*
245 * Once radix-tree replacement of page migration started, page_count
246 * *must* be zero. And, we don't want to call wait_on_page_locked()
247 * against a page without get_page().
 248 * So, we use get_page_unless_zero() here. Even if it fails, the page
 249 * fault will simply occur again.
250 */
251 if (!get_page_unless_zero(page))
252 goto out;
288 pte_unmap_unlock(ptep, ptl); 253 pte_unmap_unlock(ptep, ptl);
289 wait_on_page_locked(page); 254 wait_on_page_locked(page);
290 put_page(page); 255 put_page(page);
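The new comment explains why migration_entry_wait() must use get_page_unless_zero(): once migration has frozen the page count to zero, a plain get_page() would resurrect a page that is in the middle of being replaced. The primitive is essentially "increment only if still nonzero"; below is a self-contained user-space model of that semantic, not the kernel helper itself:

#include <stdatomic.h>
#include <stdio.h>

/* Model of get_page_unless_zero(): take a reference only if the count
 * has not already been frozen or dropped to zero. */
static int get_unless_zero(atomic_int *count)
{
        int old = atomic_load(count);

        while (old != 0) {
                if (atomic_compare_exchange_weak(count, &old, old + 1))
                        return 1;       /* reference taken */
        }
        return 0;                       /* too late: count was zero */
}

int main(void)
{
        atomic_int live = 2, frozen = 0;

        printf("live:   %d (count now %d)\n",
               get_unless_zero(&live), atomic_load(&live));
        printf("frozen: %d (count now %d)\n",
               get_unless_zero(&frozen), atomic_load(&frozen));
        return 0;
}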
@@ -304,6 +269,7 @@ out:
304static int migrate_page_move_mapping(struct address_space *mapping, 269static int migrate_page_move_mapping(struct address_space *mapping,
305 struct page *newpage, struct page *page) 270 struct page *newpage, struct page *page)
306{ 271{
272 int expected_count;
307 void **pslot; 273 void **pslot;
308 274
309 if (!mapping) { 275 if (!mapping) {
@@ -313,14 +279,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
313 return 0; 279 return 0;
314 } 280 }
315 281
316 write_lock_irq(&mapping->tree_lock); 282 spin_lock_irq(&mapping->tree_lock);
317 283
318 pslot = radix_tree_lookup_slot(&mapping->page_tree, 284 pslot = radix_tree_lookup_slot(&mapping->page_tree,
319 page_index(page)); 285 page_index(page));
320 286
321 if (page_count(page) != 2 + !!PagePrivate(page) || 287 expected_count = 2 + !!PagePrivate(page);
288 if (page_count(page) != expected_count ||
322 (struct page *)radix_tree_deref_slot(pslot) != page) { 289 (struct page *)radix_tree_deref_slot(pslot) != page) {
323 write_unlock_irq(&mapping->tree_lock); 290 spin_unlock_irq(&mapping->tree_lock);
291 return -EAGAIN;
292 }
293
294 if (!page_freeze_refs(page, expected_count)) {
295 spin_unlock_irq(&mapping->tree_lock);
324 return -EAGAIN; 296 return -EAGAIN;
325 } 297 }
326 298
@@ -337,6 +309,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
337 309
338 radix_tree_replace_slot(pslot, newpage); 310 radix_tree_replace_slot(pslot, newpage);
339 311
312 page_unfreeze_refs(page, expected_count);
340 /* 313 /*
341 * Drop cache reference from old page. 314 * Drop cache reference from old page.
342 * We know this isn't the last reference. 315 * We know this isn't the last reference.
@@ -356,7 +329,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
356 __dec_zone_page_state(page, NR_FILE_PAGES); 329 __dec_zone_page_state(page, NR_FILE_PAGES);
357 __inc_zone_page_state(newpage, NR_FILE_PAGES); 330 __inc_zone_page_state(newpage, NR_FILE_PAGES);
358 331
359 write_unlock_irq(&mapping->tree_lock); 332 spin_unlock_irq(&mapping->tree_lock);
360 333
361 return 0; 334 return 0;
362} 335}
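migrate_page_move_mapping() now freezes the page's reference count at the expected value before swapping the radix-tree slot and unfreezes it afterwards, so a concurrent get_page_unless_zero() loses the race cleanly instead of pinning a half-migrated page. page_freeze_refs()/page_unfreeze_refs() behave like the compare-and-swap model below, which is a user-space sketch of the idea rather than the kernel implementation:

#include <stdatomic.h>
#include <stdio.h>

/* freeze: atomically replace an exact expected count with 0. */
static int freeze_refs(atomic_int *count, int expected)
{
        return atomic_compare_exchange_strong(count, &expected, 0);
}

/* unfreeze: publish the count again once the replacement is done. */
static void unfreeze_refs(atomic_int *count, int expected)
{
        atomic_store(count, expected);
}

int main(void)
{
        atomic_int refs = 3;            /* e.g. 2 + !!PagePrivate */

        if (!freeze_refs(&refs, 2))     /* wrong expectation: extra reference */
                printf("busy (count=%d), would return -EAGAIN\n",
                       atomic_load(&refs));

        if (freeze_refs(&refs, 3)) {    /* matches: count is now 0 */
                /* ... replace the radix-tree slot while frozen ... */
                unfreeze_refs(&refs, 3);
                printf("replaced, count back to %d\n", atomic_load(&refs));
        }
        return 0;
}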
@@ -366,6 +339,8 @@ static int migrate_page_move_mapping(struct address_space *mapping,
366 */ 339 */
367static void migrate_page_copy(struct page *newpage, struct page *page) 340static void migrate_page_copy(struct page *newpage, struct page *page)
368{ 341{
342 int anon;
343
369 copy_highpage(newpage, page); 344 copy_highpage(newpage, page);
370 345
371 if (PageError(page)) 346 if (PageError(page))
@@ -374,8 +349,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
374 SetPageReferenced(newpage); 349 SetPageReferenced(newpage);
375 if (PageUptodate(page)) 350 if (PageUptodate(page))
376 SetPageUptodate(newpage); 351 SetPageUptodate(newpage);
377 if (PageActive(page)) 352 if (TestClearPageActive(page)) {
353 VM_BUG_ON(PageUnevictable(page));
378 SetPageActive(newpage); 354 SetPageActive(newpage);
355 } else
356 unevictable_migrate_page(newpage, page);
379 if (PageChecked(page)) 357 if (PageChecked(page))
380 SetPageChecked(newpage); 358 SetPageChecked(newpage);
381 if (PageMappedToDisk(page)) 359 if (PageMappedToDisk(page))
@@ -393,14 +371,20 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
393 __set_page_dirty_nobuffers(newpage); 371 __set_page_dirty_nobuffers(newpage);
394 } 372 }
395 373
374 mlock_migrate_page(newpage, page);
375
396#ifdef CONFIG_SWAP 376#ifdef CONFIG_SWAP
397 ClearPageSwapCache(page); 377 ClearPageSwapCache(page);
398#endif 378#endif
399 ClearPageActive(page);
400 ClearPagePrivate(page); 379 ClearPagePrivate(page);
401 set_page_private(page, 0); 380 set_page_private(page, 0);
381 /* page->mapping contains a flag for PageAnon() */
382 anon = PageAnon(page);
402 page->mapping = NULL; 383 page->mapping = NULL;
403 384
385 if (!anon) /* This page was removed from radix-tree. */
386 mem_cgroup_uncharge_cache_page(page);
387
404 /* 388 /*
405 * If any waiters have accumulated on the new page then 389 * If any waiters have accumulated on the new page then
406 * wake them up. 390 * wake them up.
@@ -575,6 +559,10 @@ static int fallback_migrate_page(struct address_space *mapping,
575 * 559 *
576 * The new page will have replaced the old page if this function 560 * The new page will have replaced the old page if this function
577 * is successful. 561 * is successful.
562 *
563 * Return value:
564 * < 0 - error code
565 * == 0 - success
578 */ 566 */
579static int move_to_new_page(struct page *newpage, struct page *page) 567static int move_to_new_page(struct page *newpage, struct page *page)
580{ 568{
@@ -586,12 +574,14 @@ static int move_to_new_page(struct page *newpage, struct page *page)
586 * establishing additional references. We are the only one 574 * establishing additional references. We are the only one
587 * holding a reference to the new page at this point. 575 * holding a reference to the new page at this point.
588 */ 576 */
589 if (TestSetPageLocked(newpage)) 577 if (!trylock_page(newpage))
590 BUG(); 578 BUG();
591 579
592 /* Prepare mapping for the new page.*/ 580 /* Prepare mapping for the new page.*/
593 newpage->index = page->index; 581 newpage->index = page->index;
594 newpage->mapping = page->mapping; 582 newpage->mapping = page->mapping;
583 if (PageSwapBacked(page))
584 SetPageSwapBacked(newpage);
595 585
596 mapping = page_mapping(page); 586 mapping = page_mapping(page);
597 if (!mapping) 587 if (!mapping)
@@ -610,7 +600,6 @@ static int move_to_new_page(struct page *newpage, struct page *page)
610 rc = fallback_migrate_page(mapping, newpage, page); 600 rc = fallback_migrate_page(mapping, newpage, page);
611 601
612 if (!rc) { 602 if (!rc) {
613 mem_cgroup_page_migration(page, newpage);
614 remove_migration_ptes(page, newpage); 603 remove_migration_ptes(page, newpage);
615 } else 604 } else
616 newpage->mapping = NULL; 605 newpage->mapping = NULL;
@@ -636,12 +625,21 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
636 if (!newpage) 625 if (!newpage)
637 return -ENOMEM; 626 return -ENOMEM;
638 627
639 if (page_count(page) == 1) 628 if (page_count(page) == 1) {
640 /* page was freed from under us. So we are done. */ 629 /* page was freed from under us. So we are done. */
641 goto move_newpage; 630 goto move_newpage;
631 }
632
633 charge = mem_cgroup_prepare_migration(page, newpage);
634 if (charge == -ENOMEM) {
635 rc = -ENOMEM;
636 goto move_newpage;
637 }
638 /* prepare cgroup just returns 0 or -ENOMEM */
639 BUG_ON(charge);
642 640
643 rc = -EAGAIN; 641 rc = -EAGAIN;
644 if (TestSetPageLocked(page)) { 642 if (!trylock_page(page)) {
645 if (!force) 643 if (!force)
646 goto move_newpage; 644 goto move_newpage;
647 lock_page(page); 645 lock_page(page);
@@ -691,25 +689,19 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
691 goto rcu_unlock; 689 goto rcu_unlock;
692 } 690 }
693 691
694 charge = mem_cgroup_prepare_migration(page);
695 /* Establish migration ptes or remove ptes */ 692 /* Establish migration ptes or remove ptes */
696 try_to_unmap(page, 1); 693 try_to_unmap(page, 1);
697 694
698 if (!page_mapped(page)) 695 if (!page_mapped(page))
699 rc = move_to_new_page(newpage, page); 696 rc = move_to_new_page(newpage, page);
700 697
701 if (rc) { 698 if (rc)
702 remove_migration_ptes(page, page); 699 remove_migration_ptes(page, page);
703 if (charge)
704 mem_cgroup_end_migration(page);
705 } else if (charge)
706 mem_cgroup_end_migration(newpage);
707rcu_unlock: 700rcu_unlock:
708 if (rcu_locked) 701 if (rcu_locked)
709 rcu_read_unlock(); 702 rcu_read_unlock();
710 703
711unlock: 704unlock:
712
713 unlock_page(page); 705 unlock_page(page);
714 706
715 if (rc != -EAGAIN) { 707 if (rc != -EAGAIN) {
@@ -720,15 +712,19 @@ unlock:
720 * restored. 712 * restored.
721 */ 713 */
722 list_del(&page->lru); 714 list_del(&page->lru);
723 move_to_lru(page); 715 putback_lru_page(page);
724 } 716 }
725 717
726move_newpage: 718move_newpage:
719 if (!charge)
720 mem_cgroup_end_migration(newpage);
721
727 /* 722 /*
728 * Move the new page to the LRU. If migration was not successful 723 * Move the new page to the LRU. If migration was not successful
729 * then this will free the page. 724 * then this will free the page.
730 */ 725 */
731 move_to_lru(newpage); 726 putback_lru_page(newpage);
727
732 if (result) { 728 if (result) {
733 if (rc) 729 if (rc)
734 *result = rc; 730 *result = rc;
@@ -835,9 +831,11 @@ static struct page *new_page_node(struct page *p, unsigned long private,
835 * Move a set of pages as indicated in the pm array. The addr 831 * Move a set of pages as indicated in the pm array. The addr
836 * field must be set to the virtual address of the page to be moved 832 * field must be set to the virtual address of the page to be moved
837 * and the node number must contain a valid target node. 833 * and the node number must contain a valid target node.
834 * The pm array ends with node = MAX_NUMNODES.
838 */ 835 */
839static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, 836static int do_move_page_to_node_array(struct mm_struct *mm,
840 int migrate_all) 837 struct page_to_node *pm,
838 int migrate_all)
841{ 839{
842 int err; 840 int err;
843 struct page_to_node *pp; 841 struct page_to_node *pp;
@@ -891,7 +889,9 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
891 !migrate_all) 889 !migrate_all)
892 goto put_and_set; 890 goto put_and_set;
893 891
894 err = isolate_lru_page(page, &pagelist); 892 err = isolate_lru_page(page);
893 if (!err)
894 list_add_tail(&page->lru, &pagelist);
895put_and_set: 895put_and_set:
896 /* 896 /*
897 * Either remove the duplicate refcount from 897 * Either remove the duplicate refcount from
@@ -903,36 +903,118 @@ set_status:
903 pp->status = err; 903 pp->status = err;
904 } 904 }
905 905
906 err = 0;
906 if (!list_empty(&pagelist)) 907 if (!list_empty(&pagelist))
907 err = migrate_pages(&pagelist, new_page_node, 908 err = migrate_pages(&pagelist, new_page_node,
908 (unsigned long)pm); 909 (unsigned long)pm);
909 else
910 err = -ENOENT;
911 910
912 up_read(&mm->mmap_sem); 911 up_read(&mm->mmap_sem);
913 return err; 912 return err;
914} 913}
915 914
916/* 915/*
917 * Determine the nodes of a list of pages. The addr in the pm array 916 * Migrate an array of page address onto an array of nodes and fill
918 * must have been set to the virtual address of which we want to determine 917 * the corresponding array of status.
919 * the node number.
920 */ 918 */
921static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) 919static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
920 unsigned long nr_pages,
921 const void __user * __user *pages,
922 const int __user *nodes,
923 int __user *status, int flags)
922{ 924{
925 struct page_to_node *pm = NULL;
926 nodemask_t task_nodes;
927 int err = 0;
928 int i;
929
930 task_nodes = cpuset_mems_allowed(task);
931
932 /* Limit nr_pages so that the multiplication may not overflow */
933 if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
934 err = -E2BIG;
935 goto out;
936 }
937
938 pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
939 if (!pm) {
940 err = -ENOMEM;
941 goto out;
942 }
943
944 /*
945 * Get parameters from user space and initialize the pm
946 * array. Return various errors if the user did something wrong.
947 */
948 for (i = 0; i < nr_pages; i++) {
949 const void __user *p;
950
951 err = -EFAULT;
952 if (get_user(p, pages + i))
953 goto out_pm;
954
955 pm[i].addr = (unsigned long)p;
956 if (nodes) {
957 int node;
958
959 if (get_user(node, nodes + i))
960 goto out_pm;
961
962 err = -ENODEV;
963 if (!node_state(node, N_HIGH_MEMORY))
964 goto out_pm;
965
966 err = -EACCES;
967 if (!node_isset(node, task_nodes))
968 goto out_pm;
969
970 pm[i].node = node;
971 } else
972 pm[i].node = 0; /* anything to not match MAX_NUMNODES */
973 }
974 /* End marker */
975 pm[nr_pages].node = MAX_NUMNODES;
976
977 err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL);
978 if (err >= 0)
979 /* Return status information */
980 for (i = 0; i < nr_pages; i++)
981 if (put_user(pm[i].status, status + i))
982 err = -EFAULT;
983
984out_pm:
985 vfree(pm);
986out:
987 return err;
988}
989
990/*
991 * Determine the nodes of an array of pages and store it in an array of status.
992 */
993static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
994 const void __user * __user *pages,
995 int __user *status)
996{
997 unsigned long i;
998 int err;
999
923 down_read(&mm->mmap_sem); 1000 down_read(&mm->mmap_sem);
924 1001
925 for ( ; pm->node != MAX_NUMNODES; pm++) { 1002 for (i = 0; i < nr_pages; i++) {
1003 const void __user *p;
1004 unsigned long addr;
926 struct vm_area_struct *vma; 1005 struct vm_area_struct *vma;
927 struct page *page; 1006 struct page *page;
928 int err;
929 1007
930 err = -EFAULT; 1008 err = -EFAULT;
931 vma = find_vma(mm, pm->addr); 1009 if (get_user(p, pages+i))
1010 goto out;
1011 addr = (unsigned long) p;
1012
1013 vma = find_vma(mm, addr);
932 if (!vma) 1014 if (!vma)
933 goto set_status; 1015 goto set_status;
934 1016
935 page = follow_page(vma, pm->addr, 0); 1017 page = follow_page(vma, addr, 0);
936 1018
937 err = PTR_ERR(page); 1019 err = PTR_ERR(page);
938 if (IS_ERR(page)) 1020 if (IS_ERR(page))
@@ -945,11 +1027,13 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
945 1027
946 err = page_to_nid(page); 1028 err = page_to_nid(page);
947set_status: 1029set_status:
948 pm->status = err; 1030 put_user(err, status+i);
949 } 1031 }
1032 err = 0;
950 1033
1034out:
951 up_read(&mm->mmap_sem); 1035 up_read(&mm->mmap_sem);
952 return 0; 1036 return err;
953} 1037}
954 1038
955/* 1039/*
@@ -961,12 +1045,9 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
961 const int __user *nodes, 1045 const int __user *nodes,
962 int __user *status, int flags) 1046 int __user *status, int flags)
963{ 1047{
964 int err = 0;
965 int i;
966 struct task_struct *task; 1048 struct task_struct *task;
967 nodemask_t task_nodes;
968 struct mm_struct *mm; 1049 struct mm_struct *mm;
969 struct page_to_node *pm = NULL; 1050 int err;
970 1051
971 /* Check flags */ 1052 /* Check flags */
972 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) 1053 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
@@ -998,79 +1079,24 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
998 (current->uid != task->suid) && (current->uid != task->uid) && 1079 (current->uid != task->suid) && (current->uid != task->uid) &&
999 !capable(CAP_SYS_NICE)) { 1080 !capable(CAP_SYS_NICE)) {
1000 err = -EPERM; 1081 err = -EPERM;
1001 goto out2; 1082 goto out;
1002 } 1083 }
1003 1084
1004 err = security_task_movememory(task); 1085 err = security_task_movememory(task);
1005 if (err) 1086 if (err)
1006 goto out2; 1087 goto out;
1007
1008
1009 task_nodes = cpuset_mems_allowed(task);
1010
1011 /* Limit nr_pages so that the multiplication may not overflow */
1012 if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
1013 err = -E2BIG;
1014 goto out2;
1015 }
1016
1017 pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
1018 if (!pm) {
1019 err = -ENOMEM;
1020 goto out2;
1021 }
1022
1023 /*
1024 * Get parameters from user space and initialize the pm
1025 * array. Return various errors if the user did something wrong.
1026 */
1027 for (i = 0; i < nr_pages; i++) {
1028 const void __user *p;
1029
1030 err = -EFAULT;
1031 if (get_user(p, pages + i))
1032 goto out;
1033
1034 pm[i].addr = (unsigned long)p;
1035 if (nodes) {
1036 int node;
1037
1038 if (get_user(node, nodes + i))
1039 goto out;
1040
1041 err = -ENODEV;
1042 if (!node_state(node, N_HIGH_MEMORY))
1043 goto out;
1044
1045 err = -EACCES;
1046 if (!node_isset(node, task_nodes))
1047 goto out;
1048 1088
1049 pm[i].node = node; 1089 if (nodes) {
1050 } else 1090 err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
1051 pm[i].node = 0; /* anything to not match MAX_NUMNODES */ 1091 flags);
1092 } else {
1093 err = do_pages_stat(mm, nr_pages, pages, status);
1052 } 1094 }
1053 /* End marker */
1054 pm[nr_pages].node = MAX_NUMNODES;
1055
1056 if (nodes)
1057 err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
1058 else
1059 err = do_pages_stat(mm, pm);
1060
1061 if (err >= 0)
1062 /* Return status information */
1063 for (i = 0; i < nr_pages; i++)
1064 if (put_user(pm[i].status, status + i))
1065 err = -EFAULT;
1066 1095
1067out: 1096out:
1068 vfree(pm);
1069out2:
1070 mmput(mm); 1097 mmput(mm);
1071 return err; 1098 return err;
1072} 1099}
1073#endif
1074 1100
1075/* 1101/*
1076 * Call migration functions in the vma_ops that may prepare 1102 * Call migration functions in the vma_ops that may prepare
@@ -1092,3 +1118,4 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1092 } 1118 }
1093 return err; 1119 return err;
1094} 1120}
1121#endif
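
The refactor above splits the old monolithic sys_move_pages() into do_pages_move() (a node array was supplied) and do_pages_stat() (nodes == NULL, status query only). For orientation, a minimal user-space sketch of the corresponding call through the libnuma move_pages() wrapper; the wrapper and MPOL_MF_MOVE come from <numaif.h> and are not part of this patch:

/* Hedged sketch: query, then migrate, a single page via move_pages(2).
 * Build with -lnuma; error handling kept minimal on purpose. */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	void *page = NULL;
	void *pages[1];
	int nodes[1] = { 0 };
	int status[1];

	if (posix_memalign(&page, 4096, 4096))
		return 1;
	((char *)page)[0] = 1;			/* fault the page in */
	pages[0] = page;

	/* nodes == NULL: status-only path (do_pages_stat() in the kernel) */
	if (move_pages(0, 1, pages, NULL, status, 0) == 0)
		printf("page currently on node %d\n", status[0]);

	/* nodes supplied: migration path (do_pages_move() in the kernel) */
	if (move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE) == 0)
		printf("after move: status %d (node or -errno)\n", status[0]);

	free(page);
	return 0;
}
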
diff --git a/mm/mlock.c b/mm/mlock.c
index 7b2656055d6a..008ea70b7afa 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -8,10 +8,18 @@
8#include <linux/capability.h> 8#include <linux/capability.h>
9#include <linux/mman.h> 9#include <linux/mman.h>
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/swap.h>
12#include <linux/swapops.h>
13#include <linux/pagemap.h>
11#include <linux/mempolicy.h> 14#include <linux/mempolicy.h>
12#include <linux/syscalls.h> 15#include <linux/syscalls.h>
13#include <linux/sched.h> 16#include <linux/sched.h>
14#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/rmap.h>
19#include <linux/mmzone.h>
20#include <linux/hugetlb.h>
21
22#include "internal.h"
15 23
16int can_do_mlock(void) 24int can_do_mlock(void)
17{ 25{
@@ -23,17 +31,381 @@ int can_do_mlock(void)
23} 31}
24EXPORT_SYMBOL(can_do_mlock); 32EXPORT_SYMBOL(can_do_mlock);
25 33
34#ifdef CONFIG_UNEVICTABLE_LRU
35/*
36 * Mlocked pages are marked with PageMlocked() flag for efficient testing
37 * in vmscan and, possibly, the fault path; and to support semi-accurate
38 * statistics.
39 *
40 * An mlocked page [PageMlocked(page)] is unevictable. As such, it will
41 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
42 * The unevictable list is an LRU sibling list to the [in]active lists.
43 * PageUnevictable is set to indicate the unevictable state.
44 *
 45 * When lazily mlocking via vmscan, it is important to ensure that the
46 * vma's VM_LOCKED status is not concurrently being modified, otherwise we
47 * may have mlocked a page that is being munlocked. So lazy mlock must take
48 * the mmap_sem for read, and verify that the vma really is locked
49 * (see mm/rmap.c).
50 */
51
52/*
53 * LRU accounting for clear_page_mlock()
54 */
55void __clear_page_mlock(struct page *page)
56{
57 VM_BUG_ON(!PageLocked(page));
58
59 if (!page->mapping) { /* truncated ? */
60 return;
61 }
62
63 dec_zone_page_state(page, NR_MLOCK);
64 count_vm_event(UNEVICTABLE_PGCLEARED);
65 if (!isolate_lru_page(page)) {
66 putback_lru_page(page);
67 } else {
68 /*
69 * Page not on the LRU yet. Flush all pagevecs and retry.
70 */
71 lru_add_drain_all();
72 if (!isolate_lru_page(page))
73 putback_lru_page(page);
74 else if (PageUnevictable(page))
75 count_vm_event(UNEVICTABLE_PGSTRANDED);
76
77 }
78}
79
80/*
81 * Mark page as mlocked if not already.
82 * If page on LRU, isolate and putback to move to unevictable list.
83 */
84void mlock_vma_page(struct page *page)
85{
86 BUG_ON(!PageLocked(page));
87
88 if (!TestSetPageMlocked(page)) {
89 inc_zone_page_state(page, NR_MLOCK);
90 count_vm_event(UNEVICTABLE_PGMLOCKED);
91 if (!isolate_lru_page(page))
92 putback_lru_page(page);
93 }
94}
95
96/*
97 * called from munlock()/munmap() path with page supposedly on the LRU.
98 *
99 * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked
100 * [in try_to_munlock()] and then attempt to isolate the page. We must
101 * isolate the page to keep others from messing with its unevictable
102 * and mlocked state while trying to munlock. However, we pre-clear the
103 * mlocked state anyway as we might lose the isolation race and we might
104 * not get another chance to clear PageMlocked. If we successfully
105 * isolate the page and try_to_munlock() detects other VM_LOCKED vmas
106 * mapping the page, it will restore the PageMlocked state, unless the page
107 * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(),
108 * perhaps redundantly.
109 * If we lose the isolation race, and the page is mapped by other VM_LOCKED
110 * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
111 * either of which will restore the PageMlocked state by calling
112 * mlock_vma_page() above, if it can grab the vma's mmap sem.
113 */
114static void munlock_vma_page(struct page *page)
115{
116 BUG_ON(!PageLocked(page));
117
118 if (TestClearPageMlocked(page)) {
119 dec_zone_page_state(page, NR_MLOCK);
120 if (!isolate_lru_page(page)) {
121 int ret = try_to_munlock(page);
122 /*
 123 * did try_to_munlock() succeed or punt?
124 */
125 if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN)
126 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
127
128 putback_lru_page(page);
129 } else {
130 /*
 131 * We lost the race. Let try_to_unmap() deal
 132 * with it. At least we get the page state and
 133 * mlock stats right. However, the page is still on
 134 * the unevictable list. We'll fix that up when
 135 * the page is eventually freed or we scan the
 136 * unevictable list.
137 */
138 if (PageUnevictable(page))
139 count_vm_event(UNEVICTABLE_PGSTRANDED);
140 else
141 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
142 }
143 }
144}
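
mlock_vma_page() and munlock_vma_page() above keep NR_MLOCK and the UNEVICTABLE_* event counters in sync with the page flags. A small user-space sketch for watching the effect; it assumes the companion /proc changes in this series export the counters as the "Unevictable:" and "Mlocked:" lines of /proc/meminfo:

/* Hedged sketch: print the unevictable/mlocked accounting lines, if present. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "Unevictable:", 12) ||
		    !strncmp(line, "Mlocked:", 8))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}
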
145
146/**
147 * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma.
148 * @vma: target vma
149 * @start: start address
150 * @end: end address
 151 * @mlock: 0 indicates munlock, otherwise mlock.
 152 *
 153 * If @mlock == 0, unlock an mlocked range;
 154 * else mlock the range of pages. This takes care of making the pages
 155 * present, too.
156 *
157 * return 0 on success, negative error code on error.
158 *
159 * vma->vm_mm->mmap_sem must be held for at least read.
160 */
161static long __mlock_vma_pages_range(struct vm_area_struct *vma,
162 unsigned long start, unsigned long end,
163 int mlock)
164{
165 struct mm_struct *mm = vma->vm_mm;
166 unsigned long addr = start;
167 struct page *pages[16]; /* 16 gives a reasonable batch */
168 int nr_pages = (end - start) / PAGE_SIZE;
169 int ret;
170 int gup_flags = 0;
171
172 VM_BUG_ON(start & ~PAGE_MASK);
173 VM_BUG_ON(end & ~PAGE_MASK);
174 VM_BUG_ON(start < vma->vm_start);
175 VM_BUG_ON(end > vma->vm_end);
176 VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) &&
177 (atomic_read(&mm->mm_users) != 0));
178
179 /*
 180 * mlock: don't populate pages if the vma has PROT_NONE permission.
 181 * munlock: always munlock the pages, even though they
 182 * have PROT_NONE permission.
183 */
184 if (!mlock)
185 gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS;
186
187 if (vma->vm_flags & VM_WRITE)
188 gup_flags |= GUP_FLAGS_WRITE;
189
190 lru_add_drain_all(); /* push cached pages to LRU */
191
192 while (nr_pages > 0) {
193 int i;
194
195 cond_resched();
196
197 /*
 198 * get_user_pages() makes pages present if we are
 199 * setting mlock, and the extra reference count
 200 * disables migration of the page. However, the page may
 201 * still be truncated out from under us.
202 */
203 ret = __get_user_pages(current, mm, addr,
204 min_t(int, nr_pages, ARRAY_SIZE(pages)),
205 gup_flags, pages, NULL);
206 /*
207 * This can happen for, e.g., VM_NONLINEAR regions before
208 * a page has been allocated and mapped at a given offset,
209 * or for addresses that map beyond end of a file.
 210 * We'll mlock the pages if/when they get faulted in.
211 */
212 if (ret < 0)
213 break;
214 if (ret == 0) {
215 /*
216 * We know the vma is there, so the only time
217 * we cannot get a single page should be an
218 * error (ret < 0) case.
219 */
220 WARN_ON(1);
221 break;
222 }
223
224 lru_add_drain(); /* push cached pages to LRU */
225
226 for (i = 0; i < ret; i++) {
227 struct page *page = pages[i];
228
229 lock_page(page);
230 /*
231 * Because we lock page here and migration is blocked
232 * by the elevated reference, we need only check for
233 * page truncation (file-cache only).
234 */
235 if (page->mapping) {
236 if (mlock)
237 mlock_vma_page(page);
238 else
239 munlock_vma_page(page);
240 }
241 unlock_page(page);
242 put_page(page); /* ref from get_user_pages() */
243
244 /*
245 * here we assume that get_user_pages() has given us
246 * a list of virtually contiguous pages.
247 */
248 addr += PAGE_SIZE; /* for next get_user_pages() */
249 nr_pages--;
250 }
251 ret = 0;
252 }
253
254 lru_add_drain_all(); /* to update stats */
255
256 return ret; /* count entire vma as locked_vm */
257}
258
259/*
260 * convert get_user_pages() return value to posix mlock() error
261 */
262static int __mlock_posix_error_return(long retval)
263{
264 if (retval == -EFAULT)
265 retval = -ENOMEM;
266 else if (retval == -ENOMEM)
267 retval = -EAGAIN;
268 return retval;
269}
270
271#else /* CONFIG_UNEVICTABLE_LRU */
272
273/*
274 * Just make pages present if VM_LOCKED. No-op if unlocking.
275 */
276static long __mlock_vma_pages_range(struct vm_area_struct *vma,
277 unsigned long start, unsigned long end,
278 int mlock)
279{
280 if (mlock && (vma->vm_flags & VM_LOCKED))
281 return make_pages_present(start, end);
282 return 0;
283}
284
285static inline int __mlock_posix_error_return(long retval)
286{
287 return 0;
288}
289
290#endif /* CONFIG_UNEVICTABLE_LRU */
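
__mlock_posix_error_return() exists because __get_user_pages() reports failures as -EFAULT or -ENOMEM, while POSIX expects mlock() to return ENOMEM for an unmapped range and EAGAIN when pages could not be locked. A user-space sketch of what a caller observes (the probe address is illustrative):

/* Hedged sketch: exercise the errno translation from user space. */
#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* Unmapped address: the -EFAULT from get_user_pages() is translated
	 * to ENOMEM before mlock() returns, per __mlock_posix_error_return(). */
	if (mlock((void *)0x1000, 4096) != 0)
		printf("mlock: %s\n",
		       errno == ENOMEM ? "ENOMEM (range not mapped)" :
		       errno == EAGAIN ? "EAGAIN (could not lock)" :
		       "other error");
	return 0;
}
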
291
292/**
293 * mlock_vma_pages_range() - mlock pages in specified vma range.
 294 * @vma - the vma containing the specified address range
295 * @start - starting address in @vma to mlock
296 * @end - end address [+1] in @vma to mlock
297 *
298 * For mmap()/mremap()/expansion of mlocked vma.
299 *
300 * return 0 on success for "normal" vmas.
301 *
302 * return number of pages [> 0] to be removed from locked_vm on success
303 * of "special" vmas.
304 *
 305 * return negative error if the vma spanning @start-@end disappears while
306 * mmap semaphore is dropped. Unlikely?
307 */
308long mlock_vma_pages_range(struct vm_area_struct *vma,
309 unsigned long start, unsigned long end)
310{
311 struct mm_struct *mm = vma->vm_mm;
312 int nr_pages = (end - start) / PAGE_SIZE;
313 BUG_ON(!(vma->vm_flags & VM_LOCKED));
314
315 /*
316 * filter unlockable vmas
317 */
318 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
319 goto no_mlock;
320
321 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
322 is_vm_hugetlb_page(vma) ||
323 vma == get_gate_vma(current))) {
324 long error;
325 downgrade_write(&mm->mmap_sem);
326
327 error = __mlock_vma_pages_range(vma, start, end, 1);
328
329 up_read(&mm->mmap_sem);
330 /* vma can change or disappear */
331 down_write(&mm->mmap_sem);
332 vma = find_vma(mm, start);
333 /* non-NULL vma must contain @start, but need to check @end */
334 if (!vma || end > vma->vm_end)
335 return -ENOMEM;
336
337 return 0; /* hide other errors from mmap(), et al */
338 }
339
340 /*
341 * User mapped kernel pages or huge pages:
342 * make these pages present to populate the ptes, but
 343 * fall through to reset VM_LOCKED--no need to unlock, and
 344 * return nr_pages so these don't get counted against the task's
 345 * locked limit. Huge pages are already counted against the
346 * locked vm limit.
347 */
348 make_pages_present(start, end);
349
350no_mlock:
351 vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */
352 return nr_pages; /* error or pages NOT mlocked */
353}
354
355
356/*
 357 * munlock_vma_pages_range() - munlock all pages in the vma range.
358 * @vma - vma containing range to be munlock()ed.
359 * @start - start address in @vma of the range
360 * @end - end of range in @vma.
361 *
362 * For mremap(), munmap() and exit().
363 *
364 * Called with @vma VM_LOCKED.
365 *
366 * Returns with VM_LOCKED cleared. Callers must be prepared to
367 * deal with this.
368 *
369 * We don't save and restore VM_LOCKED here because pages are
370 * still on lru. In unmap path, pages might be scanned by reclaim
371 * and re-mlocked by try_to_{munlock|unmap} before we unmap and
372 * free them. This will result in freeing mlocked pages.
373 */
374void munlock_vma_pages_range(struct vm_area_struct *vma,
375 unsigned long start, unsigned long end)
376{
377 vma->vm_flags &= ~VM_LOCKED;
378 __mlock_vma_pages_range(vma, start, end, 0);
379}
380
381/*
382 * mlock_fixup - handle mlock[all]/munlock[all] requests.
383 *
384 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
385 * munlock is a no-op. However, for some special vmas, we go ahead and
386 * populate the ptes via make_pages_present().
387 *
388 * For vmas that pass the filters, merge/split as appropriate.
389 */
26static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, 390static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
27 unsigned long start, unsigned long end, unsigned int newflags) 391 unsigned long start, unsigned long end, unsigned int newflags)
28{ 392{
29 struct mm_struct * mm = vma->vm_mm; 393 struct mm_struct *mm = vma->vm_mm;
30 pgoff_t pgoff; 394 pgoff_t pgoff;
31 int pages; 395 int nr_pages;
32 int ret = 0; 396 int ret = 0;
33 397 int lock = newflags & VM_LOCKED;
34 if (newflags == vma->vm_flags) { 398
35 *prev = vma; 399 if (newflags == vma->vm_flags ||
36 goto out; 400 (vma->vm_flags & (VM_IO | VM_PFNMAP)))
401 goto out; /* don't set VM_LOCKED, don't count */
402
403 if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
404 is_vm_hugetlb_page(vma) ||
405 vma == get_gate_vma(current)) {
406 if (lock)
407 make_pages_present(start, end);
408 goto out; /* don't set VM_LOCKED, don't count */
37 } 409 }
38 410
39 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 411 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
@@ -44,8 +416,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
44 goto success; 416 goto success;
45 } 417 }
46 418
47 *prev = vma;
48
49 if (start != vma->vm_start) { 419 if (start != vma->vm_start) {
50 ret = split_vma(mm, vma, start, 1); 420 ret = split_vma(mm, vma, start, 1);
51 if (ret) 421 if (ret)
@@ -60,26 +430,61 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
60 430
61success: 431success:
62 /* 432 /*
433 * Keep track of amount of locked VM.
434 */
435 nr_pages = (end - start) >> PAGE_SHIFT;
436 if (!lock)
437 nr_pages = -nr_pages;
438 mm->locked_vm += nr_pages;
439
440 /*
63 * vm_flags is protected by the mmap_sem held in write mode. 441 * vm_flags is protected by the mmap_sem held in write mode.
64 * It's okay if try_to_unmap_one unmaps a page just after we 442 * It's okay if try_to_unmap_one unmaps a page just after we
65 * set VM_LOCKED, make_pages_present below will bring it back. 443 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
66 */ 444 */
67 vma->vm_flags = newflags; 445 vma->vm_flags = newflags;
68 446
69 /* 447 if (lock) {
70 * Keep track of amount of locked VM. 448 /*
71 */ 449 * mmap_sem is currently held for write. Downgrade the write
72 pages = (end - start) >> PAGE_SHIFT; 450 * lock to a read lock so that other faults, mmap scans, ...
73 if (newflags & VM_LOCKED) { 451 * while we fault in all pages.
74 pages = -pages; 452 */
75 if (!(newflags & VM_IO)) 453 downgrade_write(&mm->mmap_sem);
76 ret = make_pages_present(start, end); 454
455 ret = __mlock_vma_pages_range(vma, start, end, 1);
456
457 /*
458 * Need to reacquire mmap sem in write mode, as our callers
459 * expect this. We have no support for atomically upgrading
 460 * a sem to write, so we must re-validate the range after
 461 * the sem is retaken.
462 */
463 up_read(&mm->mmap_sem);
464 /* vma can change or disappear */
465 down_write(&mm->mmap_sem);
466 *prev = find_vma(mm, start);
467 /* non-NULL *prev must contain @start, but need to check @end */
468 if (!(*prev) || end > (*prev)->vm_end)
469 ret = -ENOMEM;
470 else if (ret > 0) {
471 mm->locked_vm -= ret;
472 ret = 0;
473 } else
474 ret = __mlock_posix_error_return(ret); /* translate if needed */
475 } else {
476 /*
477 * TODO: for unlocking, pages will already be resident, so
478 * we don't need to wait for allocations/reclaim/pagein, ...
479 * However, unlocking a very large region can still take a
480 * while. Should we downgrade the semaphore for both lock
 481 * AND unlock?
482 */
483 __mlock_vma_pages_range(vma, start, end, 0);
77 } 484 }
78 485
79 mm->locked_vm -= pages;
80out: 486out:
81 if (ret == -ENOMEM) 487 *prev = vma;
82 ret = -EAGAIN;
83 return ret; 488 return ret;
84} 489}
85 490
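
Both mlock_vma_pages_range() and the locking branch of mlock_fixup() above use the same dance: downgrade mmap_sem to read while faulting pages in, then retake it for write and re-validate the vma, since there is no atomic read-to-write upgrade. A condensed sketch of that pattern, written as if it lived in mm/mlock.c; the helper name is illustrative and the locked_vm adjustment for a positive return is elided:

/* Sketch only: the downgrade/revalidate pattern used by the code above. */
static int example_fault_in_locked_range(struct mm_struct *mm,
					 struct vm_area_struct *vma,
					 unsigned long start,
					 unsigned long end)
{
	long ret;

	/* caller holds mmap_sem for write */
	downgrade_write(&mm->mmap_sem);		/* let faults and scans proceed */

	ret = __mlock_vma_pages_range(vma, start, end, 1);

	up_read(&mm->mmap_sem);			/* no atomic read->write upgrade */
	down_write(&mm->mmap_sem);

	vma = find_vma(mm, start);		/* vma may have changed or gone */
	if (!vma || end > vma->vm_end)
		return -ENOMEM;

	return ret < 0 ? __mlock_posix_error_return(ret) : 0;
}
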
diff --git a/mm/mm_init.c b/mm/mm_init.c
new file mode 100644
index 000000000000..4e0e26591dfa
--- /dev/null
+++ b/mm/mm_init.c
@@ -0,0 +1,152 @@
1/*
2 * mm_init.c - Memory initialisation verification and debugging
3 *
4 * Copyright 2008 IBM Corporation, 2008
5 * Author Mel Gorman <mel@csn.ul.ie>
6 *
7 */
8#include <linux/kernel.h>
9#include <linux/init.h>
10#include <linux/kobject.h>
11#include <linux/module.h>
12#include "internal.h"
13
14#ifdef CONFIG_DEBUG_MEMORY_INIT
15int mminit_loglevel;
16
17#ifndef SECTIONS_SHIFT
18#define SECTIONS_SHIFT 0
19#endif
20
21/* The zonelists are simply reported, validation is manual. */
22void mminit_verify_zonelist(void)
23{
24 int nid;
25
26 if (mminit_loglevel < MMINIT_VERIFY)
27 return;
28
29 for_each_online_node(nid) {
30 pg_data_t *pgdat = NODE_DATA(nid);
31 struct zone *zone;
32 struct zoneref *z;
33 struct zonelist *zonelist;
34 int i, listid, zoneid;
35
36 BUG_ON(MAX_ZONELISTS > 2);
37 for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
38
39 /* Identify the zone and nodelist */
40 zoneid = i % MAX_NR_ZONES;
41 listid = i / MAX_NR_ZONES;
42 zonelist = &pgdat->node_zonelists[listid];
43 zone = &pgdat->node_zones[zoneid];
44 if (!populated_zone(zone))
45 continue;
46
47 /* Print information about the zonelist */
48 printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
49 listid > 0 ? "thisnode" : "general", nid,
50 zone->name);
51
52 /* Iterate the zonelist */
53 for_each_zone_zonelist(zone, z, zonelist, zoneid) {
54#ifdef CONFIG_NUMA
55 printk(KERN_CONT "%d:%s ",
56 zone->node, zone->name);
57#else
58 printk(KERN_CONT "0:%s ", zone->name);
59#endif /* CONFIG_NUMA */
60 }
61 printk(KERN_CONT "\n");
62 }
63 }
64}
65
66void __init mminit_verify_pageflags_layout(void)
67{
68 int shift, width;
69 unsigned long or_mask, add_mask;
70
71 shift = 8 * sizeof(unsigned long);
72 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH;
73 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
74 "Section %d Node %d Zone %d Flags %d\n",
75 SECTIONS_WIDTH,
76 NODES_WIDTH,
77 ZONES_WIDTH,
78 NR_PAGEFLAGS);
79 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
80 "Section %d Node %d Zone %d\n",
81 SECTIONS_SHIFT,
82 NODES_SHIFT,
83 ZONES_SHIFT);
84 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets",
85 "Section %lu Node %lu Zone %lu\n",
86 (unsigned long)SECTIONS_PGSHIFT,
87 (unsigned long)NODES_PGSHIFT,
88 (unsigned long)ZONES_PGSHIFT);
89 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid",
90 "Zone ID: %lu -> %lu\n",
91 (unsigned long)ZONEID_PGOFF,
92 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT));
93 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
94 "location: %d -> %d unused %d -> %d flags %d -> %d\n",
95 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
96#ifdef NODE_NOT_IN_PAGE_FLAGS
97 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
98 "Node not in page flags");
99#endif
100
101 if (SECTIONS_WIDTH) {
102 shift -= SECTIONS_WIDTH;
103 BUG_ON(shift != SECTIONS_PGSHIFT);
104 }
105 if (NODES_WIDTH) {
106 shift -= NODES_WIDTH;
107 BUG_ON(shift != NODES_PGSHIFT);
108 }
109 if (ZONES_WIDTH) {
110 shift -= ZONES_WIDTH;
111 BUG_ON(shift != ZONES_PGSHIFT);
112 }
113
114 /* Check for bitmask overlaps */
115 or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
116 (NODES_MASK << NODES_PGSHIFT) |
117 (SECTIONS_MASK << SECTIONS_PGSHIFT);
118 add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
119 (NODES_MASK << NODES_PGSHIFT) +
120 (SECTIONS_MASK << SECTIONS_PGSHIFT);
121 BUG_ON(or_mask != add_mask);
122}
123
124void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone,
125 unsigned long nid, unsigned long pfn)
126{
127 BUG_ON(page_to_nid(page) != nid);
128 BUG_ON(page_zonenum(page) != zone);
129 BUG_ON(page_to_pfn(page) != pfn);
130}
131
132static __init int set_mminit_loglevel(char *str)
133{
134 get_option(&str, &mminit_loglevel);
135 return 0;
136}
137early_param("mminit_loglevel", set_mminit_loglevel);
138#endif /* CONFIG_DEBUG_MEMORY_INIT */
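
mminit_dprintk() output is gated by mminit_loglevel, which set_mminit_loglevel() wires up as the "mminit_loglevel=" boot parameter. A small sketch of how extra verification output could be emitted under CONFIG_DEBUG_MEMORY_INIT, assuming it sits in mm/ next to this file; the helper and the fields it prints are illustrative:

#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/nodemask.h>
#include "internal.h"

/* Sketch (assumed helper): report per-node spans at MMINIT_VERIFY level. */
static void __init example_report_node_spans(void)
{
	int nid;

	for_each_online_node(nid) {
		pg_data_t *pgdat = NODE_DATA(nid);

		mminit_dprintk(MMINIT_VERIFY, "node_span",
			       "node %d: start pfn %lu, %lu spanned pages\n",
			       nid, pgdat->node_start_pfn,
			       pgdat->node_spanned_pages);
	}
}
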
139
140struct kobject *mm_kobj;
141EXPORT_SYMBOL_GPL(mm_kobj);
142
143static int __init mm_sysfs_init(void)
144{
145 mm_kobj = kobject_create_and_add("mm", kernel_kobj);
146 if (!mm_kobj)
147 return -ENOMEM;
148
149 return 0;
150}
151
152__initcall(mm_sysfs_init);
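
mm_sysfs_init() creates the /sys/kernel/mm directory and exports mm_kobj so other mm code can attach entries underneath it. A hypothetical module-style sketch of hanging a child kobject off it; where the extern declaration really lives in the headers is assumed:

#include <linux/errno.h>
#include <linux/kobject.h>
#include <linux/module.h>

extern struct kobject *mm_kobj;	/* exported above; declaration location assumed */

static struct kobject *example_kobj;

static int __init example_init(void)
{
	/* appears as /sys/kernel/mm/example */
	example_kobj = kobject_create_and_add("example", mm_kobj);
	return example_kobj ? 0 : -ENOMEM;
}

static void __exit example_exit(void)
{
	kobject_put(example_kobj);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
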
diff --git a/mm/mmap.c b/mm/mmap.c
index 1d102b956fd8..74f4d158022e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -26,12 +26,15 @@
26#include <linux/mount.h> 26#include <linux/mount.h>
27#include <linux/mempolicy.h> 27#include <linux/mempolicy.h>
28#include <linux/rmap.h> 28#include <linux/rmap.h>
29#include <linux/mmu_notifier.h>
29 30
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/cacheflush.h> 32#include <asm/cacheflush.h>
32#include <asm/tlb.h> 33#include <asm/tlb.h>
33#include <asm/mmu_context.h> 34#include <asm/mmu_context.h>
34 35
36#include "internal.h"
37
35#ifndef arch_mmap_check 38#ifndef arch_mmap_check
36#define arch_mmap_check(addr, len, flags) (0) 39#define arch_mmap_check(addr, len, flags) (0)
37#endif 40#endif
@@ -367,7 +370,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
367 if (vma_tmp->vm_end > addr) { 370 if (vma_tmp->vm_end > addr) {
368 vma = vma_tmp; 371 vma = vma_tmp;
369 if (vma_tmp->vm_start <= addr) 372 if (vma_tmp->vm_start <= addr)
370 return vma; 373 break;
371 __rb_link = &__rb_parent->rb_left; 374 __rb_link = &__rb_parent->rb_left;
372 } else { 375 } else {
373 rb_prev = __rb_parent; 376 rb_prev = __rb_parent;
@@ -407,7 +410,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
407 rb_insert_color(&vma->vm_rb, &mm->mm_rb); 410 rb_insert_color(&vma->vm_rb, &mm->mm_rb);
408} 411}
409 412
410static inline void __vma_link_file(struct vm_area_struct *vma) 413static void __vma_link_file(struct vm_area_struct *vma)
411{ 414{
412 struct file * file; 415 struct file * file;
413 416
@@ -659,8 +662,6 @@ again: remove_next = 1 + (end > next->vm_end);
659 * If the vma has a ->close operation then the driver probably needs to release 662 * If the vma has a ->close operation then the driver probably needs to release
660 * per-vma resources, so we don't attempt to merge those. 663 * per-vma resources, so we don't attempt to merge those.
661 */ 664 */
662#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
663
664static inline int is_mergeable_vma(struct vm_area_struct *vma, 665static inline int is_mergeable_vma(struct vm_area_struct *vma,
665 struct file *file, unsigned long vm_flags) 666 struct file *file, unsigned long vm_flags)
666{ 667{
@@ -969,6 +970,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
969 return -EPERM; 970 return -EPERM;
970 vm_flags |= VM_LOCKED; 971 vm_flags |= VM_LOCKED;
971 } 972 }
973
972 /* mlock MCL_FUTURE? */ 974 /* mlock MCL_FUTURE? */
973 if (vm_flags & VM_LOCKED) { 975 if (vm_flags & VM_LOCKED) {
974 unsigned long locked, lock_limit; 976 unsigned long locked, lock_limit;
@@ -1027,6 +1029,10 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
1027 } else { 1029 } else {
1028 switch (flags & MAP_TYPE) { 1030 switch (flags & MAP_TYPE) {
1029 case MAP_SHARED: 1031 case MAP_SHARED:
1032 /*
1033 * Ignore pgoff.
1034 */
1035 pgoff = 0;
1030 vm_flags |= VM_SHARED | VM_MAYSHARE; 1036 vm_flags |= VM_SHARED | VM_MAYSHARE;
1031 break; 1037 break;
1032 case MAP_PRIVATE: 1038 case MAP_PRIVATE:
@@ -1108,6 +1114,9 @@ munmap_back:
1108 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) 1114 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1109 return -ENOMEM; 1115 return -ENOMEM;
1110 1116
1117 if (flags & MAP_NORESERVE)
1118 vm_flags |= VM_NORESERVE;
1119
1111 if (accountable && (!(flags & MAP_NORESERVE) || 1120 if (accountable && (!(flags & MAP_NORESERVE) ||
1112 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { 1121 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
1113 if (vm_flags & VM_SHARED) { 1122 if (vm_flags & VM_SHARED) {
@@ -1129,10 +1138,12 @@ munmap_back:
1129 * The VM_SHARED test is necessary because shmem_zero_setup 1138 * The VM_SHARED test is necessary because shmem_zero_setup
1130 * will create the file object for a shared anonymous map below. 1139 * will create the file object for a shared anonymous map below.
1131 */ 1140 */
1132 if (!file && !(vm_flags & VM_SHARED) && 1141 if (!file && !(vm_flags & VM_SHARED)) {
1133 vma_merge(mm, prev, addr, addr + len, vm_flags, 1142 vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
1134 NULL, NULL, pgoff, NULL)) 1143 NULL, NULL, pgoff, NULL);
1135 goto out; 1144 if (vma)
1145 goto out;
1146 }
1136 1147
1137 /* 1148 /*
1138 * Determine the object being mapped and call the appropriate 1149 * Determine the object being mapped and call the appropriate
@@ -1214,10 +1225,14 @@ out:
1214 mm->total_vm += len >> PAGE_SHIFT; 1225 mm->total_vm += len >> PAGE_SHIFT;
1215 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1226 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1216 if (vm_flags & VM_LOCKED) { 1227 if (vm_flags & VM_LOCKED) {
1217 mm->locked_vm += len >> PAGE_SHIFT; 1228 /*
1218 make_pages_present(addr, addr + len); 1229 * makes pages present; downgrades, drops, reacquires mmap_sem
1219 } 1230 */
1220 if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) 1231 long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
1232 if (nr_pages < 0)
1233 return nr_pages; /* vma gone! */
1234 mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
1235 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
1221 make_pages_present(addr, addr + len); 1236 make_pages_present(addr, addr + len);
1222 return addr; 1237 return addr;
1223 1238
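
Seen from user space, the hunk above means a MAP_LOCKED mapping is populated through mlock_vma_pages_range() and charged to locked_vm at mmap() time, while MAP_POPULATE (without MAP_NONBLOCK) is merely pre-faulted. A minimal sketch of the two requests:

/* Sketch: the two population requests handled in the hunk above. */
#include <stddef.h>
#include <sys/mman.h>

void *map_locked(size_t len)
{
	/* VM_LOCKED path: populated via mlock_vma_pages_range(), counted in locked_vm */
	return mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, -1, 0);
}

void *map_populated(size_t len)
{
	/* MAP_POPULATE path: pre-faulted via make_pages_present(), not locked */
	return mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
}
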
@@ -1576,7 +1591,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
1576 * vma is the last one with address > vma->vm_end. Have to extend vma. 1591 * vma is the last one with address > vma->vm_end. Have to extend vma.
1577 */ 1592 */
1578#ifndef CONFIG_IA64 1593#ifndef CONFIG_IA64
1579static inline 1594static
1580#endif 1595#endif
1581int expand_upwards(struct vm_area_struct *vma, unsigned long address) 1596int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1582{ 1597{
@@ -1626,7 +1641,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1626/* 1641/*
1627 * vma is the first one with address < vma->vm_start. Have to extend vma. 1642 * vma is the first one with address < vma->vm_start. Have to extend vma.
1628 */ 1643 */
1629static inline int expand_downwards(struct vm_area_struct *vma, 1644static int expand_downwards(struct vm_area_struct *vma,
1630 unsigned long address) 1645 unsigned long address)
1631{ 1646{
1632 int error; 1647 int error;
@@ -1688,10 +1703,12 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
1688 vma = find_vma_prev(mm, addr, &prev); 1703 vma = find_vma_prev(mm, addr, &prev);
1689 if (vma && (vma->vm_start <= addr)) 1704 if (vma && (vma->vm_start <= addr))
1690 return vma; 1705 return vma;
1691 if (!prev || expand_stack(prev, addr)) 1706 if (expand_stack(prev, addr))
1692 return NULL; 1707 return NULL;
1693 if (prev->vm_flags & VM_LOCKED) 1708 if (prev->vm_flags & VM_LOCKED) {
1694 make_pages_present(addr, prev->vm_end); 1709 if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0)
1710 return NULL; /* vma gone! */
1711 }
1695 return prev; 1712 return prev;
1696} 1713}
1697#else 1714#else
@@ -1717,8 +1734,10 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
1717 start = vma->vm_start; 1734 start = vma->vm_start;
1718 if (expand_stack(vma, addr)) 1735 if (expand_stack(vma, addr))
1719 return NULL; 1736 return NULL;
1720 if (vma->vm_flags & VM_LOCKED) 1737 if (vma->vm_flags & VM_LOCKED) {
1721 make_pages_present(addr, start); 1738 if (mlock_vma_pages_range(vma, addr, start) < 0)
1739 return NULL; /* vma gone! */
1740 }
1722 return vma; 1741 return vma;
1723} 1742}
1724#endif 1743#endif
@@ -1737,8 +1756,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1737 long nrpages = vma_pages(vma); 1756 long nrpages = vma_pages(vma);
1738 1757
1739 mm->total_vm -= nrpages; 1758 mm->total_vm -= nrpages;
1740 if (vma->vm_flags & VM_LOCKED)
1741 mm->locked_vm -= nrpages;
1742 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 1759 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
1743 vma = remove_vma(vma); 1760 vma = remove_vma(vma);
1744 } while (vma); 1761 } while (vma);
@@ -1763,7 +1780,7 @@ static void unmap_region(struct mm_struct *mm,
1763 update_hiwater_rss(mm); 1780 update_hiwater_rss(mm);
1764 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); 1781 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1765 vm_unacct_memory(nr_accounted); 1782 vm_unacct_memory(nr_accounted);
1766 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, 1783 free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
1767 next? next->vm_start: 0); 1784 next? next->vm_start: 0);
1768 tlb_finish_mmu(tlb, start, end); 1785 tlb_finish_mmu(tlb, start, end);
1769} 1786}
@@ -1807,7 +1824,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1807 struct mempolicy *pol; 1824 struct mempolicy *pol;
1808 struct vm_area_struct *new; 1825 struct vm_area_struct *new;
1809 1826
1810 if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) 1827 if (is_vm_hugetlb_page(vma) && (addr &
1828 ~(huge_page_mask(hstate_vma(vma)))))
1811 return -EINVAL; 1829 return -EINVAL;
1812 1830
1813 if (mm->map_count >= sysctl_max_map_count) 1831 if (mm->map_count >= sysctl_max_map_count)
@@ -1903,6 +1921,20 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1903 vma = prev? prev->vm_next: mm->mmap; 1921 vma = prev? prev->vm_next: mm->mmap;
1904 1922
1905 /* 1923 /*
1924 * unlock any mlock()ed ranges before detaching vmas
1925 */
1926 if (mm->locked_vm) {
1927 struct vm_area_struct *tmp = vma;
1928 while (tmp && tmp->vm_start < end) {
1929 if (tmp->vm_flags & VM_LOCKED) {
1930 mm->locked_vm -= vma_pages(tmp);
1931 munlock_vma_pages_all(tmp);
1932 }
1933 tmp = tmp->vm_next;
1934 }
1935 }
1936
1937 /*
1906 * Remove the vma's, and unmap the actual pages 1938 * Remove the vma's, and unmap the actual pages
1907 */ 1939 */
1908 detach_vmas_to_be_unmapped(mm, vma, prev, end); 1940 detach_vmas_to_be_unmapped(mm, vma, prev, end);
@@ -2014,8 +2046,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2014 return -ENOMEM; 2046 return -ENOMEM;
2015 2047
2016 /* Can we just expand an old private anonymous mapping? */ 2048 /* Can we just expand an old private anonymous mapping? */
2017 if (vma_merge(mm, prev, addr, addr + len, flags, 2049 vma = vma_merge(mm, prev, addr, addr + len, flags,
2018 NULL, NULL, pgoff, NULL)) 2050 NULL, NULL, pgoff, NULL);
2051 if (vma)
2019 goto out; 2052 goto out;
2020 2053
2021 /* 2054 /*
@@ -2037,8 +2070,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2037out: 2070out:
2038 mm->total_vm += len >> PAGE_SHIFT; 2071 mm->total_vm += len >> PAGE_SHIFT;
2039 if (flags & VM_LOCKED) { 2072 if (flags & VM_LOCKED) {
2040 mm->locked_vm += len >> PAGE_SHIFT; 2073 if (!mlock_vma_pages_range(vma, addr, addr + len))
2041 make_pages_present(addr, addr + len); 2074 mm->locked_vm += (len >> PAGE_SHIFT);
2042 } 2075 }
2043 return addr; 2076 return addr;
2044} 2077}
@@ -2049,13 +2082,23 @@ EXPORT_SYMBOL(do_brk);
2049void exit_mmap(struct mm_struct *mm) 2082void exit_mmap(struct mm_struct *mm)
2050{ 2083{
2051 struct mmu_gather *tlb; 2084 struct mmu_gather *tlb;
2052 struct vm_area_struct *vma = mm->mmap; 2085 struct vm_area_struct *vma;
2053 unsigned long nr_accounted = 0; 2086 unsigned long nr_accounted = 0;
2054 unsigned long end; 2087 unsigned long end;
2055 2088
2056 /* mm's last user has gone, and its about to be pulled down */ 2089 /* mm's last user has gone, and its about to be pulled down */
2057 arch_exit_mmap(mm); 2090 arch_exit_mmap(mm);
2058 2091 mmu_notifier_release(mm);
2092
2093 if (mm->locked_vm) {
2094 vma = mm->mmap;
2095 while (vma) {
2096 if (vma->vm_flags & VM_LOCKED)
2097 munlock_vma_pages_all(vma);
2098 vma = vma->vm_next;
2099 }
2100 }
2101 vma = mm->mmap;
2059 lru_add_drain(); 2102 lru_add_drain();
2060 flush_cache_mm(mm); 2103 flush_cache_mm(mm);
2061 tlb = tlb_gather_mmu(mm, 1); 2104 tlb = tlb_gather_mmu(mm, 1);
@@ -2063,7 +2106,7 @@ void exit_mmap(struct mm_struct *mm)
2063 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2106 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2064 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2107 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2065 vm_unacct_memory(nr_accounted); 2108 vm_unacct_memory(nr_accounted);
2066 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 2109 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
2067 tlb_finish_mmu(tlb, 0, end); 2110 tlb_finish_mmu(tlb, 0, end);
2068 2111
2069 /* 2112 /*
@@ -2262,3 +2305,167 @@ int install_special_mapping(struct mm_struct *mm,
2262 2305
2263 return 0; 2306 return 0;
2264} 2307}
2308
2309static DEFINE_MUTEX(mm_all_locks_mutex);
2310
2311static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2312{
2313 if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
2314 /*
2315 * The LSB of head.next can't change from under us
2316 * because we hold the mm_all_locks_mutex.
2317 */
2318 spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
2319 /*
2320 * We can safely modify head.next after taking the
2321 * anon_vma->lock. If some other vma in this mm shares
2322 * the same anon_vma we won't take it again.
2323 *
2324 * No need of atomic instructions here, head.next
2325 * can't change from under us thanks to the
2326 * anon_vma->lock.
2327 */
2328 if (__test_and_set_bit(0, (unsigned long *)
2329 &anon_vma->head.next))
2330 BUG();
2331 }
2332}
2333
2334static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2335{
2336 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2337 /*
2338 * AS_MM_ALL_LOCKS can't change from under us because
2339 * we hold the mm_all_locks_mutex.
2340 *
2341 * Operations on ->flags have to be atomic because
2342 * even if AS_MM_ALL_LOCKS is stable thanks to the
2343 * mm_all_locks_mutex, there may be other cpus
2344 * changing other bitflags in parallel to us.
2345 */
2346 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
2347 BUG();
2348 spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
2349 }
2350}
2351
2352/*
2353 * This operation locks against the VM for all pte/vma/mm related
2354 * operations that could ever happen on a certain mm. This includes
2355 * vmtruncate, try_to_unmap, and all page faults.
2356 *
2357 * The caller must take the mmap_sem in write mode before calling
2358 * mm_take_all_locks(). The caller isn't allowed to release the
2359 * mmap_sem until mm_drop_all_locks() returns.
2360 *
2361 * mmap_sem in write mode is required in order to block all operations
2362 * that could modify pagetables and free pages without need of
2363 * altering the vma layout (for example populate_range() with
 2364 * nonlinear vmas). It's also needed in write mode to prevent new
 2365 * anon_vmas from being associated with existing vmas.
2366 *
2367 * A single task can't take more than one mm_take_all_locks() in a row
2368 * or it would deadlock.
2369 *
2370 * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
 2371 * mapping->flags avoid taking the same lock twice, if more than one
2372 * vma in this mm is backed by the same anon_vma or address_space.
2373 *
2374 * We can take all the locks in random order because the VM code
2375 * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
2376 * takes more than one of them in a row. Secondly we're protected
2377 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
2378 *
2379 * mm_take_all_locks() and mm_drop_all_locks are expensive operations
 2380 * that may have to take thousands of locks.
2381 *
2382 * mm_take_all_locks() can fail if it's interrupted by signals.
2383 */
2384int mm_take_all_locks(struct mm_struct *mm)
2385{
2386 struct vm_area_struct *vma;
2387 int ret = -EINTR;
2388
2389 BUG_ON(down_read_trylock(&mm->mmap_sem));
2390
2391 mutex_lock(&mm_all_locks_mutex);
2392
2393 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2394 if (signal_pending(current))
2395 goto out_unlock;
2396 if (vma->vm_file && vma->vm_file->f_mapping)
2397 vm_lock_mapping(mm, vma->vm_file->f_mapping);
2398 }
2399
2400 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2401 if (signal_pending(current))
2402 goto out_unlock;
2403 if (vma->anon_vma)
2404 vm_lock_anon_vma(mm, vma->anon_vma);
2405 }
2406
2407 ret = 0;
2408
2409out_unlock:
2410 if (ret)
2411 mm_drop_all_locks(mm);
2412
2413 return ret;
2414}
2415
2416static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2417{
2418 if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
2419 /*
2420 * The LSB of head.next can't change to 0 from under
2421 * us because we hold the mm_all_locks_mutex.
2422 *
2423 * We must however clear the bitflag before unlocking
2424 * the vma so the users using the anon_vma->head will
2425 * never see our bitflag.
2426 *
2427 * No need of atomic instructions here, head.next
2428 * can't change from under us until we release the
2429 * anon_vma->lock.
2430 */
2431 if (!__test_and_clear_bit(0, (unsigned long *)
2432 &anon_vma->head.next))
2433 BUG();
2434 spin_unlock(&anon_vma->lock);
2435 }
2436}
2437
2438static void vm_unlock_mapping(struct address_space *mapping)
2439{
2440 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2441 /*
2442 * AS_MM_ALL_LOCKS can't change to 0 from under us
2443 * because we hold the mm_all_locks_mutex.
2444 */
2445 spin_unlock(&mapping->i_mmap_lock);
2446 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
2447 &mapping->flags))
2448 BUG();
2449 }
2450}
2451
2452/*
2453 * The mmap_sem cannot be released by the caller until
2454 * mm_drop_all_locks() returns.
2455 */
2456void mm_drop_all_locks(struct mm_struct *mm)
2457{
2458 struct vm_area_struct *vma;
2459
2460 BUG_ON(down_read_trylock(&mm->mmap_sem));
2461 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
2462
2463 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2464 if (vma->anon_vma)
2465 vm_unlock_anon_vma(vma->anon_vma);
2466 if (vma->vm_file && vma->vm_file->f_mapping)
2467 vm_unlock_mapping(vma->vm_file->f_mapping);
2468 }
2469
2470 mutex_unlock(&mm_all_locks_mutex);
2471}
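
The contract spelled out in the comment above boils down to: take mmap_sem for write, call mm_take_all_locks() (which may fail with -EINTR), do the work, then mm_drop_all_locks() before releasing mmap_sem. A condensed sketch of a caller; do_mmu_notifier_register() in the next file follows exactly this shape:

/* Sketch: expected calling pattern for mm_take_all_locks(). */
#include <linux/mm.h>
#include <linux/sched.h>

static int example_with_all_vm_locks(struct mm_struct *mm,
				     void (*work)(struct mm_struct *))
{
	int ret;

	down_write(&mm->mmap_sem);
	ret = mm_take_all_locks(mm);	/* may fail with -EINTR on a signal */
	if (ret)
		goto out;

	work(mm);			/* no pte/vma/mm operation can race with us */

	mm_drop_all_locks(mm);
out:
	up_write(&mm->mmap_sem);
	return ret;
}
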
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
index 000000000000..5f4ef0250bee
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,277 @@
1/*
2 * linux/mm/mmu_notifier.c
3 *
4 * Copyright (C) 2008 Qumranet, Inc.
5 * Copyright (C) 2008 SGI
6 * Christoph Lameter <clameter@sgi.com>
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2. See
9 * the COPYING file in the top-level directory.
10 */
11
12#include <linux/rculist.h>
13#include <linux/mmu_notifier.h>
14#include <linux/module.h>
15#include <linux/mm.h>
16#include <linux/err.h>
17#include <linux/rcupdate.h>
18#include <linux/sched.h>
19
20/*
21 * This function can't run concurrently against mmu_notifier_register
22 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
23 * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
24 * in parallel despite there being no task using this mm any more,
25 * through the vmas outside of the exit_mmap context, such as with
26 * vmtruncate. This serializes against mmu_notifier_unregister with
27 * the mmu_notifier_mm->lock in addition to RCU and it serializes
28 * against the other mmu notifiers with RCU. struct mmu_notifier_mm
29 * can't go away from under us as exit_mmap holds an mm_count pin
30 * itself.
31 */
32void __mmu_notifier_release(struct mm_struct *mm)
33{
34 struct mmu_notifier *mn;
35
36 spin_lock(&mm->mmu_notifier_mm->lock);
37 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
38 mn = hlist_entry(mm->mmu_notifier_mm->list.first,
39 struct mmu_notifier,
40 hlist);
41 /*
42 * We arrived before mmu_notifier_unregister so
43 * mmu_notifier_unregister will do nothing other than
44 * to wait ->release to finish and
45 * mmu_notifier_unregister to return.
46 */
47 hlist_del_init_rcu(&mn->hlist);
48 /*
49 * RCU here will block mmu_notifier_unregister until
50 * ->release returns.
51 */
52 rcu_read_lock();
53 spin_unlock(&mm->mmu_notifier_mm->lock);
54 /*
55 * if ->release runs before mmu_notifier_unregister it
56 * must be handled as it's the only way for the driver
57 * to flush all existing sptes and stop the driver
58 * from establishing any more sptes before all the
59 * pages in the mm are freed.
60 */
61 if (mn->ops->release)
62 mn->ops->release(mn, mm);
63 rcu_read_unlock();
64 spin_lock(&mm->mmu_notifier_mm->lock);
65 }
66 spin_unlock(&mm->mmu_notifier_mm->lock);
67
68 /*
 69 * synchronize_rcu here prevents mmu_notifier_release from
 70 * returning to exit_mmap (which would proceed to free all pages
71 * in the mm) until the ->release method returns, if it was
72 * invoked by mmu_notifier_unregister.
73 *
74 * The mmu_notifier_mm can't go away from under us because one
 75 * mm_count is held by exit_mmap.
76 */
77 synchronize_rcu();
78}
79
80/*
81 * If no young bitflag is supported by the hardware, ->clear_flush_young can
 82 * unmap the address and return 1 or 0 depending on whether the mapping
 83 * previously existed or not.
84 */
85int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
86 unsigned long address)
87{
88 struct mmu_notifier *mn;
89 struct hlist_node *n;
90 int young = 0;
91
92 rcu_read_lock();
93 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
94 if (mn->ops->clear_flush_young)
95 young |= mn->ops->clear_flush_young(mn, mm, address);
96 }
97 rcu_read_unlock();
98
99 return young;
100}
101
102void __mmu_notifier_invalidate_page(struct mm_struct *mm,
103 unsigned long address)
104{
105 struct mmu_notifier *mn;
106 struct hlist_node *n;
107
108 rcu_read_lock();
109 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
110 if (mn->ops->invalidate_page)
111 mn->ops->invalidate_page(mn, mm, address);
112 }
113 rcu_read_unlock();
114}
115
116void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
117 unsigned long start, unsigned long end)
118{
119 struct mmu_notifier *mn;
120 struct hlist_node *n;
121
122 rcu_read_lock();
123 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
124 if (mn->ops->invalidate_range_start)
125 mn->ops->invalidate_range_start(mn, mm, start, end);
126 }
127 rcu_read_unlock();
128}
129
130void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
131 unsigned long start, unsigned long end)
132{
133 struct mmu_notifier *mn;
134 struct hlist_node *n;
135
136 rcu_read_lock();
137 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
138 if (mn->ops->invalidate_range_end)
139 mn->ops->invalidate_range_end(mn, mm, start, end);
140 }
141 rcu_read_unlock();
142}
143
144static int do_mmu_notifier_register(struct mmu_notifier *mn,
145 struct mm_struct *mm,
146 int take_mmap_sem)
147{
148 struct mmu_notifier_mm *mmu_notifier_mm;
149 int ret;
150
151 BUG_ON(atomic_read(&mm->mm_users) <= 0);
152
153 ret = -ENOMEM;
154 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
155 if (unlikely(!mmu_notifier_mm))
156 goto out;
157
158 if (take_mmap_sem)
159 down_write(&mm->mmap_sem);
160 ret = mm_take_all_locks(mm);
161 if (unlikely(ret))
162 goto out_cleanup;
163
164 if (!mm_has_notifiers(mm)) {
165 INIT_HLIST_HEAD(&mmu_notifier_mm->list);
166 spin_lock_init(&mmu_notifier_mm->lock);
167 mm->mmu_notifier_mm = mmu_notifier_mm;
168 mmu_notifier_mm = NULL;
169 }
170 atomic_inc(&mm->mm_count);
171
172 /*
173 * Serialize the update against mmu_notifier_unregister. A
174 * side note: mmu_notifier_release can't run concurrently with
175 * us because we hold the mm_users pin (either implicitly as
176 * current->mm or explicitly with get_task_mm() or similar).
177 * We can't race against any other mmu notifier method either
178 * thanks to mm_take_all_locks().
179 */
180 spin_lock(&mm->mmu_notifier_mm->lock);
181 hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
182 spin_unlock(&mm->mmu_notifier_mm->lock);
183
184 mm_drop_all_locks(mm);
185out_cleanup:
186 if (take_mmap_sem)
187 up_write(&mm->mmap_sem);
188 /* kfree() does nothing if mmu_notifier_mm is NULL */
189 kfree(mmu_notifier_mm);
190out:
191 BUG_ON(atomic_read(&mm->mm_users) <= 0);
192 return ret;
193}
194
195/*
196 * Must not hold mmap_sem nor any other VM related lock when calling
197 * this registration function. Must also ensure mm_users can't go down
198 * to zero while this runs to avoid races with mmu_notifier_release,
199 * so mm has to be current->mm or the mm should be pinned safely such
200 * as with get_task_mm(). If the mm is not current->mm, the mm_users
201 * pin should be released by calling mmput after mmu_notifier_register
 202 * returns. mmu_notifier_unregister must always be called to
203 * unregister the notifier. mm_count is automatically pinned to allow
204 * mmu_notifier_unregister to safely run at any time later, before or
205 * after exit_mmap. ->release will always be called before exit_mmap
206 * frees the pages.
207 */
208int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
209{
210 return do_mmu_notifier_register(mn, mm, 1);
211}
212EXPORT_SYMBOL_GPL(mmu_notifier_register);
213
214/*
215 * Same as mmu_notifier_register but here the caller must hold the
216 * mmap_sem in write mode.
217 */
218int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
219{
220 return do_mmu_notifier_register(mn, mm, 0);
221}
222EXPORT_SYMBOL_GPL(__mmu_notifier_register);
223
224/* this is called after the last mmu_notifier_unregister() returned */
225void __mmu_notifier_mm_destroy(struct mm_struct *mm)
226{
227 BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list));
228 kfree(mm->mmu_notifier_mm);
229 mm->mmu_notifier_mm = LIST_POISON1; /* debug */
230}
231
232/*
233 * This releases the mm_count pin automatically and frees the mm
234 * structure if it was the last user of it. It serializes against
235 * running mmu notifiers with RCU and against mmu_notifier_unregister
236 * with the unregister lock + RCU. All sptes must be dropped before
237 * calling mmu_notifier_unregister. ->release or any other notifier
238 * method may be invoked concurrently with mmu_notifier_unregister,
239 * and only after mmu_notifier_unregister returned we're guaranteed
240 * that ->release or any other method can't run anymore.
241 */
242void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
243{
244 BUG_ON(atomic_read(&mm->mm_count) <= 0);
245
246 spin_lock(&mm->mmu_notifier_mm->lock);
247 if (!hlist_unhashed(&mn->hlist)) {
248 hlist_del_rcu(&mn->hlist);
249
250 /*
 251 * RCU here will force exit_mmap to wait for ->release to finish
252 * before freeing the pages.
253 */
254 rcu_read_lock();
255 spin_unlock(&mm->mmu_notifier_mm->lock);
256 /*
257 * exit_mmap will block in mmu_notifier_release to
258 * guarantee ->release is called before freeing the
259 * pages.
260 */
261 if (mn->ops->release)
262 mn->ops->release(mn, mm);
263 rcu_read_unlock();
264 } else
265 spin_unlock(&mm->mmu_notifier_mm->lock);
266
267 /*
 268 * Wait for any running method to finish, of course including
 269 * ->release if it was run by mmu_notifier_release instead of us.
270 */
271 synchronize_rcu();
272
273 BUG_ON(atomic_read(&mm->mm_count) <= 0);
274
275 mmdrop(mm);
276}
277EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
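
Putting the registration API above together, a driver supplies a struct mmu_notifier whose ops are invoked by the __mmu_notifier_*() hooks in this file. A minimal, hedged sketch of such a user; the struct layout and callback signatures are inferred from the calls above, and the callback bodies are placeholders:

#include <linux/mm.h>
#include <linux/mmu_notifier.h>

static void example_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	/* last chance to tear down all secondary mappings (sptes) for this mm */
}

static void example_invalidate_range_start(struct mmu_notifier *mn,
					   struct mm_struct *mm,
					   unsigned long start,
					   unsigned long end)
{
	/* stop using [start, end) in the secondary MMU */
}

static void example_invalidate_range_end(struct mmu_notifier *mn,
					 struct mm_struct *mm,
					 unsigned long start,
					 unsigned long end)
{
	/* primary ptes are stable again; secondary faults may repopulate */
}

static const struct mmu_notifier_ops example_ops = {
	.release		= example_release,
	.invalidate_range_start	= example_invalidate_range_start,
	.invalidate_range_end	= example_invalidate_range_end,
};

static struct mmu_notifier example_mn = {
	.ops = &example_ops,
};

/* caller must hold an mm_users reference, e.g. on current->mm */
static int example_attach(struct mm_struct *mm)
{
	return mmu_notifier_register(&example_mn, mm);
}
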
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 486ed595ee6f..16ce8b955dcf 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -69,6 +69,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
69 (z->zone && !zref_in_nodemask(z, nodes))) 69 (z->zone && !zref_in_nodemask(z, nodes)))
70 z++; 70 z++;
71 71
72 *zone = zonelist_zone(z++); 72 *zone = zonelist_zone(z);
73 return z; 73 return z;
74} 74}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 360d9cc8b38c..fded06f923f4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -21,6 +21,7 @@
21#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/swap.h> 22#include <linux/swap.h>
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h>
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25#include <asm/pgtable.h> 26#include <asm/pgtable.h>
26#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
@@ -153,12 +154,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
153 * If we make a private mapping writable we increase our commit; 154 * If we make a private mapping writable we increase our commit;
154 * but (without finer accounting) cannot reduce our commit if we 155 * but (without finer accounting) cannot reduce our commit if we
155 * make it unwritable again. 156 * make it unwritable again.
156 *
157 * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
158 * a MAP_NORESERVE private mapping to writable will now reserve.
159 */ 157 */
160 if (newflags & VM_WRITE) { 158 if (newflags & VM_WRITE) {
161 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { 159 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|
160 VM_SHARED|VM_NORESERVE))) {
162 charged = nrpages; 161 charged = nrpages;
163 if (security_vm_enough_memory(charged)) 162 if (security_vm_enough_memory(charged))
164 return -ENOMEM; 163 return -ENOMEM;
@@ -205,10 +204,12 @@ success:
205 dirty_accountable = 1; 204 dirty_accountable = 1;
206 } 205 }
207 206
207 mmu_notifier_invalidate_range_start(mm, start, end);
208 if (is_vm_hugetlb_page(vma)) 208 if (is_vm_hugetlb_page(vma))
209 hugetlb_change_protection(vma, start, end, vma->vm_page_prot); 209 hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
210 else 210 else
211 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); 211 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
212 mmu_notifier_invalidate_range_end(mm, start, end);
212 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 213 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
213 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 214 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
214 return 0; 215 return 0;
diff --git a/mm/mremap.c b/mm/mremap.c
index 08e3c7f2bd15..58a2908f42f5 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -18,11 +18,14 @@
18#include <linux/highmem.h> 18#include <linux/highmem.h>
19#include <linux/security.h> 19#include <linux/security.h>
20#include <linux/syscalls.h> 20#include <linux/syscalls.h>
21#include <linux/mmu_notifier.h>
21 22
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23#include <asm/cacheflush.h> 24#include <asm/cacheflush.h>
24#include <asm/tlbflush.h> 25#include <asm/tlbflush.h>
25 26
27#include "internal.h"
28
26static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) 29static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
27{ 30{
28 pgd_t *pgd; 31 pgd_t *pgd;
@@ -74,7 +77,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
74 struct mm_struct *mm = vma->vm_mm; 77 struct mm_struct *mm = vma->vm_mm;
75 pte_t *old_pte, *new_pte, pte; 78 pte_t *old_pte, *new_pte, pte;
76 spinlock_t *old_ptl, *new_ptl; 79 spinlock_t *old_ptl, *new_ptl;
80 unsigned long old_start;
77 81
82 old_start = old_addr;
83 mmu_notifier_invalidate_range_start(vma->vm_mm,
84 old_start, old_end);
78 if (vma->vm_file) { 85 if (vma->vm_file) {
79 /* 86 /*
80 * Subtle point from Rajesh Venkatasubramanian: before 87 * Subtle point from Rajesh Venkatasubramanian: before
@@ -116,6 +123,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
116 pte_unmap_unlock(old_pte - 1, old_ptl); 123 pte_unmap_unlock(old_pte - 1, old_ptl);
117 if (mapping) 124 if (mapping)
118 spin_unlock(&mapping->i_mmap_lock); 125 spin_unlock(&mapping->i_mmap_lock);
126 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
119} 127}
120 128
121#define LATENCY_LIMIT (64 * PAGE_SIZE) 129#define LATENCY_LIMIT (64 * PAGE_SIZE)
@@ -232,8 +240,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
232 if (vm_flags & VM_LOCKED) { 240 if (vm_flags & VM_LOCKED) {
233 mm->locked_vm += new_len >> PAGE_SHIFT; 241 mm->locked_vm += new_len >> PAGE_SHIFT;
234 if (new_len > old_len) 242 if (new_len > old_len)
235 make_pages_present(new_addr + old_len, 243 mlock_vma_pages_range(new_vma, new_addr + old_len,
236 new_addr + new_len); 244 new_addr + new_len);
237 } 245 }
238 246
239 return new_addr; 247 return new_addr;
@@ -373,7 +381,7 @@ unsigned long do_mremap(unsigned long addr,
373 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 381 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
374 if (vma->vm_flags & VM_LOCKED) { 382 if (vma->vm_flags & VM_LOCKED) {
375 mm->locked_vm += pages; 383 mm->locked_vm += pages;
376 make_pages_present(addr + old_len, 384 mlock_vma_pages_range(vma, addr + old_len,
377 addr + new_len); 385 addr + new_len);
378 } 386 }
379 ret = addr; 387 ret = addr;
diff --git a/mm/nommu.c b/mm/nommu.c
index 4462b6a3fcb9..2696b24f2bb3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,7 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
25#include <linux/ptrace.h> 25#include <linux/tracehook.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/mount.h> 28#include <linux/mount.h>
@@ -34,6 +34,8 @@
34#include <asm/tlb.h> 34#include <asm/tlb.h>
35#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
36 36
37#include "internal.h"
38
37void *high_memory; 39void *high_memory;
38struct page *mem_map; 40struct page *mem_map;
39unsigned long max_mapnr; 41unsigned long max_mapnr;
@@ -128,20 +130,16 @@ unsigned int kobjsize(const void *objp)
128 return PAGE_SIZE << compound_order(page); 130 return PAGE_SIZE << compound_order(page);
129} 131}
130 132
131/* 133int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
132 * get a list of pages in an address range belonging to the specified process 134 unsigned long start, int len, int flags,
133 * and indicate the VMA that covers each page 135 struct page **pages, struct vm_area_struct **vmas)
134 * - this is potentially dodgy as we may end incrementing the page count of a
135 * slab page or a secondary page from a compound page
136 * - don't permit access to VMAs that don't support it, such as I/O mappings
137 */
138int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
139 unsigned long start, int len, int write, int force,
140 struct page **pages, struct vm_area_struct **vmas)
141{ 136{
142 struct vm_area_struct *vma; 137 struct vm_area_struct *vma;
143 unsigned long vm_flags; 138 unsigned long vm_flags;
144 int i; 139 int i;
140 int write = !!(flags & GUP_FLAGS_WRITE);
141 int force = !!(flags & GUP_FLAGS_FORCE);
142 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
145 143
146 /* calculate required read or write permissions. 144 /* calculate required read or write permissions.
147 * - if 'force' is set, we only require the "MAY" flags. 145 * - if 'force' is set, we only require the "MAY" flags.
@@ -156,7 +154,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
156 154
157 /* protect what we can, including chardevs */ 155 /* protect what we can, including chardevs */
158 if (vma->vm_flags & (VM_IO | VM_PFNMAP) || 156 if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
159 !(vm_flags & vma->vm_flags)) 157 (!ignore && !(vm_flags & vma->vm_flags)))
160 goto finish_or_fault; 158 goto finish_or_fault;
161 159
162 if (pages) { 160 if (pages) {
@@ -174,6 +172,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
174finish_or_fault: 172finish_or_fault:
175 return i ? : -EFAULT; 173 return i ? : -EFAULT;
176} 174}
175
176
177/*
178 * get a list of pages in an address range belonging to the specified process
179 * and indicate the VMA that covers each page
180 * - this is potentially dodgy as we may end incrementing the page count of a
181 * slab page or a secondary page from a compound page
182 * - don't permit access to VMAs that don't support it, such as I/O mappings
183 */
184int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
185 unsigned long start, int len, int write, int force,
186 struct page **pages, struct vm_area_struct **vmas)
187{
188 int flags = 0;
189
190 if (write)
191 flags |= GUP_FLAGS_WRITE;
192 if (force)
193 flags |= GUP_FLAGS_FORCE;
194
195 return __get_user_pages(tsk, mm,
196 start, len, flags,
197 pages, vmas);
198}
177EXPORT_SYMBOL(get_user_pages); 199EXPORT_SYMBOL(get_user_pages);
178 200
179DEFINE_RWLOCK(vmlist_lock); 201DEFINE_RWLOCK(vmlist_lock);
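
The nommu get_user_pages() above is now a thin wrapper that folds the old write/force booleans into a flags word before calling __get_user_pages(). As a stand-alone illustration of that translation (the flag values below are made up for the sketch; the real GUP_FLAGS_* constants are kernel-internal, in mm/internal.h):

#include <stdio.h>

/* Illustrative values only; the real GUP_FLAGS_* constants live in mm/internal.h. */
#define GUP_FLAGS_WRITE 0x1
#define GUP_FLAGS_FORCE 0x2

static int gup_flags(int write, int force)
{
        int flags = 0;

        if (write)
                flags |= GUP_FLAGS_WRITE;
        if (force)
                flags |= GUP_FLAGS_FORCE;
        return flags;
}

int main(void)
{
        printf("write, no force -> 0x%x\n", gup_flags(1, 0));
        printf("write + force   -> 0x%x\n", gup_flags(1, 1));
        return 0;
}
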
@@ -266,6 +288,27 @@ void *vmalloc_node(unsigned long size, int node)
266} 288}
267EXPORT_SYMBOL(vmalloc_node); 289EXPORT_SYMBOL(vmalloc_node);
268 290
291#ifndef PAGE_KERNEL_EXEC
292# define PAGE_KERNEL_EXEC PAGE_KERNEL
293#endif
294
295/**
296 * vmalloc_exec - allocate virtually contiguous, executable memory
297 * @size: allocation size
298 *
299 * Kernel-internal function to allocate enough pages to cover @size
300 * the page level allocator and map them into contiguous and
301 * executable kernel virtual space.
302 *
303 * For tight control over page level allocator and protection flags
304 * use __vmalloc() instead.
305 */
306
307void *vmalloc_exec(unsigned long size)
308{
309 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
310}
311
269/** 312/**
270 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 313 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
271 * @size: allocation size 314 * @size: allocation size
@@ -745,7 +788,7 @@ static unsigned long determine_vm_flags(struct file *file,
745 * it's being traced - otherwise breakpoints set in it may interfere 788 * it's being traced - otherwise breakpoints set in it may interfere
746 * with another untraced process 789 * with another untraced process
747 */ 790 */
748 if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED)) 791 if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
749 vm_flags &= ~VM_MAYSHARE; 792 vm_flags &= ~VM_MAYSHARE;
750 793
751 return vm_flags; 794 return vm_flags;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8a5467ee6265..64e5b4bcd964 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,6 +26,7 @@
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/notifier.h> 27#include <linux/notifier.h>
28#include <linux/memcontrol.h> 28#include <linux/memcontrol.h>
29#include <linux/security.h>
29 30
30int sysctl_panic_on_oom; 31int sysctl_panic_on_oom;
31int sysctl_oom_kill_allocating_task; 32int sysctl_oom_kill_allocating_task;
@@ -128,7 +129,8 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
128 * Superuser processes are usually more important, so we make it 129 * Superuser processes are usually more important, so we make it
129 * less likely that we kill those. 130 * less likely that we kill those.
130 */ 131 */
131 if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE)) 132 if (has_capability(p, CAP_SYS_ADMIN) ||
133 has_capability(p, CAP_SYS_RESOURCE))
132 points /= 4; 134 points /= 4;
133 135
134 /* 136 /*
@@ -137,7 +139,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
137 * tend to only have this flag set on applications they think 139 * tend to only have this flag set on applications they think
138 * of as important. 140 * of as important.
139 */ 141 */
140 if (__capable(p, CAP_SYS_RAWIO)) 142 if (has_capability(p, CAP_SYS_RAWIO))
141 points /= 4; 143 points /= 4;
142 144
143 /* 145 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 94c6d8988ab3..2970e35fd03f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -7,7 +7,7 @@
7 * Contains functions related to writing back dirty pages at the 7 * Contains functions related to writing back dirty pages at the
8 * address_space level. 8 * address_space level.
9 * 9 *
10 * 10Apr2002 akpm@zip.com.au 10 * 10Apr2002 Andrew Morton
11 * Initial version 11 * Initial version
12 */ 12 */
13 13
@@ -329,9 +329,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
329 struct zone *z = 329 struct zone *z =
330 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; 330 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
331 331
332 x += zone_page_state(z, NR_FREE_PAGES) 332 x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z);
333 + zone_page_state(z, NR_INACTIVE)
334 + zone_page_state(z, NR_ACTIVE);
335 } 333 }
336 /* 334 /*
337 * Make sure that the number of highmem pages is never larger 335 * Make sure that the number of highmem pages is never larger
@@ -355,9 +353,7 @@ unsigned long determine_dirtyable_memory(void)
355{ 353{
356 unsigned long x; 354 unsigned long x;
357 355
358 x = global_page_state(NR_FREE_PAGES) 356 x = global_page_state(NR_FREE_PAGES) + global_lru_pages();
359 + global_page_state(NR_INACTIVE)
360 + global_page_state(NR_ACTIVE);
361 357
362 if (!vm_highmem_is_dirtyable) 358 if (!vm_highmem_is_dirtyable)
363 x -= highmem_dirtyable_memory(x); 359 x -= highmem_dirtyable_memory(x);
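
With the split LRU, both helpers above now count dirtyable memory as free pages plus everything on the LRU lists (zone_lru_pages()/global_lru_pages()) rather than summing the old single active/inactive counters. A toy calculation with made-up page counts, leaving out the clamp the real highmem helper applies:

#include <stdio.h>

int main(void)
{
        /* Made-up page counts for a lowmem/highmem split. */
        unsigned long lowmem_free = 30000, lowmem_lru = 120000;
        unsigned long highmem_free = 80000, highmem_lru = 400000;
        int highmem_is_dirtyable = 0;   /* mirrors vm_highmem_is_dirtyable == 0 */

        unsigned long x = (lowmem_free + highmem_free) + (lowmem_lru + highmem_lru);

        if (!highmem_is_dirtyable)
                x -= highmem_free + highmem_lru;        /* like highmem_dirtyable_memory() */

        printf("dirtyable memory: %lu pages\n", x);
        return 0;
}
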
@@ -876,6 +872,7 @@ int write_cache_pages(struct address_space *mapping,
876 pgoff_t end; /* Inclusive */ 872 pgoff_t end; /* Inclusive */
877 int scanned = 0; 873 int scanned = 0;
878 int range_whole = 0; 874 int range_whole = 0;
875 long nr_to_write = wbc->nr_to_write;
879 876
880 if (wbc->nonblocking && bdi_write_congested(bdi)) { 877 if (wbc->nonblocking && bdi_write_congested(bdi)) {
881 wbc->encountered_congestion = 1; 878 wbc->encountered_congestion = 1;
@@ -939,7 +936,7 @@ retry:
939 unlock_page(page); 936 unlock_page(page);
940 ret = 0; 937 ret = 0;
941 } 938 }
942 if (ret || (--(wbc->nr_to_write) <= 0)) 939 if (ret || (--nr_to_write <= 0))
943 done = 1; 940 done = 1;
944 if (wbc->nonblocking && bdi_write_congested(bdi)) { 941 if (wbc->nonblocking && bdi_write_congested(bdi)) {
945 wbc->encountered_congestion = 1; 942 wbc->encountered_congestion = 1;
@@ -958,11 +955,12 @@ retry:
958 index = 0; 955 index = 0;
959 goto retry; 956 goto retry;
960 } 957 }
961 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 958 if (!wbc->no_nrwrite_index_update) {
962 mapping->writeback_index = index; 959 if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
960 mapping->writeback_index = index;
961 wbc->nr_to_write = nr_to_write;
962 }
963 963
964 if (wbc->range_cont)
965 wbc->range_start = index << PAGE_CACHE_SHIFT;
966 return ret; 964 return ret;
967} 965}
968EXPORT_SYMBOL(write_cache_pages); 966EXPORT_SYMBOL(write_cache_pages);
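
write_cache_pages() now decrements a local copy of nr_to_write and only publishes the remaining budget (and the cyclic writeback index) back into the writeback_control when no_nrwrite_index_update is clear. A pared-down sketch of that pattern, with a hypothetical struct standing in for struct writeback_control:

#include <stdio.h>

/* Hypothetical, pared-down writeback control; field names mirror the patch. */
struct wbc {
        long nr_to_write;
        int  no_nrwrite_index_update;
};

/* Pretend to write up to wbc->nr_to_write pages, touching the caller's
 * counter only if it asked for the update. Returns pages written. */
static long write_some(struct wbc *wbc, long dirty_pages)
{
        long nr_to_write = wbc->nr_to_write;    /* work on a local copy */
        long written = 0;

        while (dirty_pages-- > 0 && nr_to_write-- > 0)
                written++;

        if (!wbc->no_nrwrite_index_update)
                wbc->nr_to_write = nr_to_write; /* publish the remaining budget */
        return written;
}

int main(void)
{
        struct wbc wbc = { .nr_to_write = 16, .no_nrwrite_index_update = 0 };
        long written = write_some(&wbc, 5);

        printf("wrote %ld, budget left %ld\n", written, wbc.nr_to_write);
        return 0;
}
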
@@ -1088,7 +1086,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1088 if (!mapping) 1086 if (!mapping)
1089 return 1; 1087 return 1;
1090 1088
1091 write_lock_irq(&mapping->tree_lock); 1089 spin_lock_irq(&mapping->tree_lock);
1092 mapping2 = page_mapping(page); 1090 mapping2 = page_mapping(page);
1093 if (mapping2) { /* Race with truncate? */ 1091 if (mapping2) { /* Race with truncate? */
1094 BUG_ON(mapping2 != mapping); 1092 BUG_ON(mapping2 != mapping);
@@ -1102,7 +1100,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1102 radix_tree_tag_set(&mapping->page_tree, 1100 radix_tree_tag_set(&mapping->page_tree,
1103 page_index(page), PAGECACHE_TAG_DIRTY); 1101 page_index(page), PAGECACHE_TAG_DIRTY);
1104 } 1102 }
1105 write_unlock_irq(&mapping->tree_lock); 1103 spin_unlock_irq(&mapping->tree_lock);
1106 if (mapping->host) { 1104 if (mapping->host) {
1107 /* !PageAnon && !swapper_space */ 1105 /* !PageAnon && !swapper_space */
1108 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1106 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1258,7 +1256,7 @@ int test_clear_page_writeback(struct page *page)
1258 struct backing_dev_info *bdi = mapping->backing_dev_info; 1256 struct backing_dev_info *bdi = mapping->backing_dev_info;
1259 unsigned long flags; 1257 unsigned long flags;
1260 1258
1261 write_lock_irqsave(&mapping->tree_lock, flags); 1259 spin_lock_irqsave(&mapping->tree_lock, flags);
1262 ret = TestClearPageWriteback(page); 1260 ret = TestClearPageWriteback(page);
1263 if (ret) { 1261 if (ret) {
1264 radix_tree_tag_clear(&mapping->page_tree, 1262 radix_tree_tag_clear(&mapping->page_tree,
@@ -1269,7 +1267,7 @@ int test_clear_page_writeback(struct page *page)
1269 __bdi_writeout_inc(bdi); 1267 __bdi_writeout_inc(bdi);
1270 } 1268 }
1271 } 1269 }
1272 write_unlock_irqrestore(&mapping->tree_lock, flags); 1270 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1273 } else { 1271 } else {
1274 ret = TestClearPageWriteback(page); 1272 ret = TestClearPageWriteback(page);
1275 } 1273 }
@@ -1287,7 +1285,7 @@ int test_set_page_writeback(struct page *page)
1287 struct backing_dev_info *bdi = mapping->backing_dev_info; 1285 struct backing_dev_info *bdi = mapping->backing_dev_info;
1288 unsigned long flags; 1286 unsigned long flags;
1289 1287
1290 write_lock_irqsave(&mapping->tree_lock, flags); 1288 spin_lock_irqsave(&mapping->tree_lock, flags);
1291 ret = TestSetPageWriteback(page); 1289 ret = TestSetPageWriteback(page);
1292 if (!ret) { 1290 if (!ret) {
1293 radix_tree_tag_set(&mapping->page_tree, 1291 radix_tree_tag_set(&mapping->page_tree,
@@ -1300,7 +1298,7 @@ int test_set_page_writeback(struct page *page)
1300 radix_tree_tag_clear(&mapping->page_tree, 1298 radix_tree_tag_clear(&mapping->page_tree,
1301 page_index(page), 1299 page_index(page),
1302 PAGECACHE_TAG_DIRTY); 1300 PAGECACHE_TAG_DIRTY);
1303 write_unlock_irqrestore(&mapping->tree_lock, flags); 1301 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1304 } else { 1302 } else {
1305 ret = TestSetPageWriteback(page); 1303 ret = TestSetPageWriteback(page);
1306 } 1304 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 79ac4afc908c..d0a240fbb8bf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -44,7 +44,7 @@
44#include <linux/backing-dev.h> 44#include <linux/backing-dev.h>
45#include <linux/fault-inject.h> 45#include <linux/fault-inject.h>
46#include <linux/page-isolation.h> 46#include <linux/page-isolation.h>
47#include <linux/memcontrol.h> 47#include <linux/page_cgroup.h>
48#include <linux/debugobjects.h> 48#include <linux/debugobjects.h>
49 49
50#include <asm/tlbflush.h> 50#include <asm/tlbflush.h>
@@ -153,9 +153,9 @@ static unsigned long __meminitdata dma_reserve;
153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; 153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; 154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
156 unsigned long __initdata required_kernelcore; 156 static unsigned long __initdata required_kernelcore;
157 static unsigned long __initdata required_movablecore; 157 static unsigned long __initdata required_movablecore;
158 unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 158 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
159 159
160 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 160 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
161 int movable_zone; 161 int movable_zone;
@@ -223,17 +223,12 @@ static inline int bad_range(struct zone *zone, struct page *page)
223 223
224static void bad_page(struct page *page) 224static void bad_page(struct page *page)
225{ 225{
226 void *pc = page_get_page_cgroup(page);
227
228 printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG 226 printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
229 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 227 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
230 current->comm, page, (int)(2*sizeof(unsigned long)), 228 current->comm, page, (int)(2*sizeof(unsigned long)),
231 (unsigned long)page->flags, page->mapping, 229 (unsigned long)page->flags, page->mapping,
232 page_mapcount(page), page_count(page)); 230 page_mapcount(page), page_count(page));
233 if (pc) { 231
234 printk(KERN_EMERG "cgroup:%p\n", pc);
235 page_reset_bad_cgroup(page);
236 }
237 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" 232 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
238 KERN_EMERG "Backtrace:\n"); 233 KERN_EMERG "Backtrace:\n");
239 dump_stack(); 234 dump_stack();
@@ -264,17 +259,18 @@ static void free_compound_page(struct page *page)
264 __free_pages_ok(page, compound_order(page)); 259 __free_pages_ok(page, compound_order(page));
265} 260}
266 261
267static void prep_compound_page(struct page *page, unsigned long order) 262void prep_compound_page(struct page *page, unsigned long order)
268{ 263{
269 int i; 264 int i;
270 int nr_pages = 1 << order; 265 int nr_pages = 1 << order;
266 struct page *p = page + 1;
271 267
272 set_compound_page_dtor(page, free_compound_page); 268 set_compound_page_dtor(page, free_compound_page);
273 set_compound_order(page, order); 269 set_compound_order(page, order);
274 __SetPageHead(page); 270 __SetPageHead(page);
275 for (i = 1; i < nr_pages; i++) { 271 for (i = 1; i < nr_pages; i++, p++) {
276 struct page *p = page + i; 272 if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
277 273 p = pfn_to_page(page_to_pfn(page) + i);
278 __SetPageTail(p); 274 __SetPageTail(p);
279 p->first_page = page; 275 p->first_page = page;
280 } 276 }
@@ -284,6 +280,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
284{ 280{
285 int i; 281 int i;
286 int nr_pages = 1 << order; 282 int nr_pages = 1 << order;
283 struct page *p = page + 1;
287 284
288 if (unlikely(compound_order(page) != order)) 285 if (unlikely(compound_order(page) != order))
289 bad_page(page); 286 bad_page(page);
@@ -291,8 +288,9 @@ static void destroy_compound_page(struct page *page, unsigned long order)
291 if (unlikely(!PageHead(page))) 288 if (unlikely(!PageHead(page)))
292 bad_page(page); 289 bad_page(page);
293 __ClearPageHead(page); 290 __ClearPageHead(page);
294 for (i = 1; i < nr_pages; i++) { 291 for (i = 1; i < nr_pages; i++, p++) {
295 struct page *p = page + i; 292 if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0))
293 p = pfn_to_page(page_to_pfn(page) + i);
296 294
297 if (unlikely(!PageTail(p) | 295 if (unlikely(!PageTail(p) |
298 (p->first_page != page))) 296 (p->first_page != page)))
@@ -432,8 +430,9 @@ static inline void __free_one_page(struct page *page,
432 430
433 buddy = __page_find_buddy(page, page_idx, order); 431 buddy = __page_find_buddy(page, page_idx, order);
434 if (!page_is_buddy(page, buddy, order)) 432 if (!page_is_buddy(page, buddy, order))
435 break; /* Move the buddy up one level. */ 433 break;
436 434
435 /* Our buddy is free, merge with it and move up one order. */
437 list_del(&buddy->lru); 436 list_del(&buddy->lru);
438 zone->free_area[order].nr_free--; 437 zone->free_area[order].nr_free--;
439 rmv_page_order(buddy); 438 rmv_page_order(buddy);
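
The reworded comment above describes the merge step of the buddy allocator: when the buddy block is free it is unlinked and the pair coalesces one order up. The underlying index arithmetic is just bit manipulation, as in this small sketch (same XOR/mask math as the kernel's __page_find_buddy() and __find_combined_index(), struct page handling omitted):

#include <stdio.h>

/* Buddy index within a max-order block: flip the bit for this order. */
static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);
}

/* Index of the merged (combined) block one order up. */
static unsigned long combined_index(unsigned long page_idx, unsigned int order)
{
        return page_idx & ~(1UL << order);
}

int main(void)
{
        unsigned long idx = 12;         /* block of 4 pages starting at page 12 */
        unsigned int order = 2;

        printf("buddy of %lu at order %u: %lu\n", idx, order, buddy_index(idx, order));
        printf("merged block starts at:   %lu (order %u)\n",
               combined_index(idx, order), order + 1);
        return 0;
}
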
@@ -450,14 +449,16 @@ static inline void __free_one_page(struct page *page,
450 449
451static inline int free_pages_check(struct page *page) 450static inline int free_pages_check(struct page *page)
452{ 451{
452 free_page_mlock(page);
453 if (unlikely(page_mapcount(page) | 453 if (unlikely(page_mapcount(page) |
454 (page->mapping != NULL) | 454 (page->mapping != NULL) |
455 (page_get_page_cgroup(page) != NULL) |
456 (page_count(page) != 0) | 455 (page_count(page) != 0) |
457 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) 456 (page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
458 bad_page(page); 457 bad_page(page);
459 if (PageDirty(page)) 458 if (PageDirty(page))
460 __ClearPageDirty(page); 459 __ClearPageDirty(page);
460 if (PageSwapBacked(page))
461 __ClearPageSwapBacked(page);
461 /* 462 /*
462 * For now, we report if PG_reserved was found set, but do not 463 * For now, we report if PG_reserved was found set, but do not
463 * clear it, and do not free the page. But we shall soon need 464 * clear it, and do not free the page. But we shall soon need
@@ -532,7 +533,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
532/* 533/*
533 * permit the bootmem allocator to evade page validation on high-order frees 534 * permit the bootmem allocator to evade page validation on high-order frees
534 */ 535 */
535void __free_pages_bootmem(struct page *page, unsigned int order) 536void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
536{ 537{
537 if (order == 0) { 538 if (order == 0) {
538 __ClearPageReserved(page); 539 __ClearPageReserved(page);
@@ -596,7 +597,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
596{ 597{
597 if (unlikely(page_mapcount(page) | 598 if (unlikely(page_mapcount(page) |
598 (page->mapping != NULL) | 599 (page->mapping != NULL) |
599 (page_get_page_cgroup(page) != NULL) |
600 (page_count(page) != 0) | 600 (page_count(page) != 0) |
601 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) 601 (page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
602 bad_page(page); 602 bad_page(page);
@@ -610,7 +610,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
610 610
611 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | 611 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
612 1 << PG_referenced | 1 << PG_arch_1 | 612 1 << PG_referenced | 1 << PG_arch_1 |
613 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); 613 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk
614#ifdef CONFIG_UNEVICTABLE_LRU
615 | 1 << PG_mlocked
616#endif
617 );
614 set_page_private(page, 0); 618 set_page_private(page, 0);
615 set_page_refcounted(page); 619 set_page_refcounted(page);
616 620
@@ -673,9 +677,9 @@ static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
673 * Note that start_page and end_pages are not aligned on a pageblock 677 * Note that start_page and end_pages are not aligned on a pageblock
674 * boundary. If alignment is required, use move_freepages_block() 678 * boundary. If alignment is required, use move_freepages_block()
675 */ 679 */
676int move_freepages(struct zone *zone, 680static int move_freepages(struct zone *zone,
677 struct page *start_page, struct page *end_page, 681 struct page *start_page, struct page *end_page,
678 int migratetype) 682 int migratetype)
679{ 683{
680 struct page *page; 684 struct page *page;
681 unsigned long order; 685 unsigned long order;
@@ -693,6 +697,9 @@ int move_freepages(struct zone *zone,
693#endif 697#endif
694 698
695 for (page = start_page; page <= end_page;) { 699 for (page = start_page; page <= end_page;) {
700 /* Make sure we are not inadvertently changing nodes */
701 VM_BUG_ON(page_to_nid(page) != zone_to_nid(zone));
702
696 if (!pfn_valid_within(page_to_pfn(page))) { 703 if (!pfn_valid_within(page_to_pfn(page))) {
697 page++; 704 page++;
698 continue; 705 continue;
@@ -714,7 +721,8 @@ int move_freepages(struct zone *zone,
714 return pages_moved; 721 return pages_moved;
715} 722}
716 723
717int move_freepages_block(struct zone *zone, struct page *page, int migratetype) 724static int move_freepages_block(struct zone *zone, struct page *page,
725 int migratetype)
718{ 726{
719 unsigned long start_pfn, end_pfn; 727 unsigned long start_pfn, end_pfn;
720 struct page *start_page, *end_page; 728 struct page *start_page, *end_page;
@@ -1429,7 +1437,7 @@ try_next_zone:
1429/* 1437/*
1430 * This is the 'heart' of the zoned buddy allocator. 1438 * This is the 'heart' of the zoned buddy allocator.
1431 */ 1439 */
1432static struct page * 1440struct page *
1433__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1441__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
1434 struct zonelist *zonelist, nodemask_t *nodemask) 1442 struct zonelist *zonelist, nodemask_t *nodemask)
1435{ 1443{
@@ -1632,22 +1640,7 @@ nopage:
1632got_pg: 1640got_pg:
1633 return page; 1641 return page;
1634} 1642}
1635 1643EXPORT_SYMBOL(__alloc_pages_internal);
1636struct page *
1637__alloc_pages(gfp_t gfp_mask, unsigned int order,
1638 struct zonelist *zonelist)
1639{
1640 return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
1641}
1642
1643struct page *
1644__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1645 struct zonelist *zonelist, nodemask_t *nodemask)
1646{
1647 return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
1648}
1649
1650EXPORT_SYMBOL(__alloc_pages);
1651 1644
1652/* 1645/*
1653 * Common helper functions. 1646 * Common helper functions.
@@ -1711,6 +1704,59 @@ void free_pages(unsigned long addr, unsigned int order)
1711 1704
1712EXPORT_SYMBOL(free_pages); 1705EXPORT_SYMBOL(free_pages);
1713 1706
1707/**
1708 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
1709 * @size: the number of bytes to allocate
1710 * @gfp_mask: GFP flags for the allocation
1711 *
1712 * This function is similar to alloc_pages(), except that it allocates the
1713 * minimum number of pages to satisfy the request. alloc_pages() can only
1714 * allocate memory in power-of-two pages.
1715 *
1716 * This function is also limited by MAX_ORDER.
1717 *
1718 * Memory allocated by this function must be released by free_pages_exact().
1719 */
1720void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
1721{
1722 unsigned int order = get_order(size);
1723 unsigned long addr;
1724
1725 addr = __get_free_pages(gfp_mask, order);
1726 if (addr) {
1727 unsigned long alloc_end = addr + (PAGE_SIZE << order);
1728 unsigned long used = addr + PAGE_ALIGN(size);
1729
1730 split_page(virt_to_page(addr), order);
1731 while (used < alloc_end) {
1732 free_page(used);
1733 used += PAGE_SIZE;
1734 }
1735 }
1736
1737 return (void *)addr;
1738}
1739EXPORT_SYMBOL(alloc_pages_exact);
1740
1741/**
1742 * free_pages_exact - release memory allocated via alloc_pages_exact()
1743 * @virt: the value returned by alloc_pages_exact.
1744 * @size: size of allocation, same value as passed to alloc_pages_exact().
1745 *
1746 * Release the memory allocated by a previous call to alloc_pages_exact.
1747 */
1748void free_pages_exact(void *virt, size_t size)
1749{
1750 unsigned long addr = (unsigned long)virt;
1751 unsigned long end = addr + PAGE_ALIGN(size);
1752
1753 while (addr < end) {
1754 free_page(addr);
1755 addr += PAGE_SIZE;
1756 }
1757}
1758EXPORT_SYMBOL(free_pages_exact);
1759
1714static unsigned int nr_free_zone_pages(int offset) 1760static unsigned int nr_free_zone_pages(int offset)
1715{ 1761{
1716 struct zoneref *z; 1762 struct zoneref *z;
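
alloc_pages_exact(), added above, rounds the request up to a power-of-two order, splits the high-order block, and hands back the tail pages beyond PAGE_ALIGN(size). The page accounting can be reproduced in user space; PAGE_SIZE and get_order() below are local stand-ins for the kernel definitions:

#include <stdio.h>

#define PAGE_SIZE       4096UL
#define PAGE_ALIGN(x)   (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* Smallest order such that (PAGE_SIZE << order) >= size, like get_order(). */
static unsigned int get_order(unsigned long size)
{
        unsigned int order = 0;

        while ((PAGE_SIZE << order) < size)
                order++;
        return order;
}

int main(void)
{
        unsigned long size = 5 * PAGE_SIZE + 100;       /* needs 6 pages */
        unsigned int order = get_order(size);
        unsigned long alloc_pages = 1UL << order;
        unsigned long used_pages  = PAGE_ALIGN(size) / PAGE_SIZE;

        printf("order %u allocation: %lu pages, kept %lu, freed back %lu\n",
               order, alloc_pages, used_pages, alloc_pages - used_pages);
        return 0;
}

For the five-pages-and-a-bit request in the example, an order-3 (eight page) block is split and two pages go straight back to the allocator.
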
@@ -1816,10 +1862,21 @@ void show_free_areas(void)
1816 } 1862 }
1817 } 1863 }
1818 1864
1819 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" 1865 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
1866 " inactive_file:%lu"
1867//TODO: check/adjust line lengths
1868#ifdef CONFIG_UNEVICTABLE_LRU
1869 " unevictable:%lu"
1870#endif
1871 " dirty:%lu writeback:%lu unstable:%lu\n"
1820 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 1872 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
1821 global_page_state(NR_ACTIVE), 1873 global_page_state(NR_ACTIVE_ANON),
1822 global_page_state(NR_INACTIVE), 1874 global_page_state(NR_ACTIVE_FILE),
1875 global_page_state(NR_INACTIVE_ANON),
1876 global_page_state(NR_INACTIVE_FILE),
1877#ifdef CONFIG_UNEVICTABLE_LRU
1878 global_page_state(NR_UNEVICTABLE),
1879#endif
1823 global_page_state(NR_FILE_DIRTY), 1880 global_page_state(NR_FILE_DIRTY),
1824 global_page_state(NR_WRITEBACK), 1881 global_page_state(NR_WRITEBACK),
1825 global_page_state(NR_UNSTABLE_NFS), 1882 global_page_state(NR_UNSTABLE_NFS),
@@ -1842,8 +1899,13 @@ void show_free_areas(void)
1842 " min:%lukB" 1899 " min:%lukB"
1843 " low:%lukB" 1900 " low:%lukB"
1844 " high:%lukB" 1901 " high:%lukB"
1845 " active:%lukB" 1902 " active_anon:%lukB"
1846 " inactive:%lukB" 1903 " inactive_anon:%lukB"
1904 " active_file:%lukB"
1905 " inactive_file:%lukB"
1906#ifdef CONFIG_UNEVICTABLE_LRU
1907 " unevictable:%lukB"
1908#endif
1847 " present:%lukB" 1909 " present:%lukB"
1848 " pages_scanned:%lu" 1910 " pages_scanned:%lu"
1849 " all_unreclaimable? %s" 1911 " all_unreclaimable? %s"
@@ -1853,8 +1915,13 @@ void show_free_areas(void)
1853 K(zone->pages_min), 1915 K(zone->pages_min),
1854 K(zone->pages_low), 1916 K(zone->pages_low),
1855 K(zone->pages_high), 1917 K(zone->pages_high),
1856 K(zone_page_state(zone, NR_ACTIVE)), 1918 K(zone_page_state(zone, NR_ACTIVE_ANON)),
1857 K(zone_page_state(zone, NR_INACTIVE)), 1919 K(zone_page_state(zone, NR_INACTIVE_ANON)),
1920 K(zone_page_state(zone, NR_ACTIVE_FILE)),
1921 K(zone_page_state(zone, NR_INACTIVE_FILE)),
1922#ifdef CONFIG_UNEVICTABLE_LRU
1923 K(zone_page_state(zone, NR_UNEVICTABLE)),
1924#endif
1858 K(zone->present_pages), 1925 K(zone->present_pages),
1859 zone->pages_scanned, 1926 zone->pages_scanned,
1860 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 1927 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
@@ -2332,7 +2399,7 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2332 2399
2333#endif /* CONFIG_NUMA */ 2400#endif /* CONFIG_NUMA */
2334 2401
2335/* return values int ....just for stop_machine_run() */ 2402/* return values int ....just for stop_machine() */
2336static int __build_all_zonelists(void *dummy) 2403static int __build_all_zonelists(void *dummy)
2337{ 2404{
2338 int nid; 2405 int nid;
@@ -2352,11 +2419,12 @@ void build_all_zonelists(void)
2352 2419
2353 if (system_state == SYSTEM_BOOTING) { 2420 if (system_state == SYSTEM_BOOTING) {
2354 __build_all_zonelists(NULL); 2421 __build_all_zonelists(NULL);
2422 mminit_verify_zonelist();
2355 cpuset_init_current_mems_allowed(); 2423 cpuset_init_current_mems_allowed();
2356 } else { 2424 } else {
2357 /* we have to stop all cpus to guarantee there is no user 2425 /* we have to stop all cpus to guarantee there is no user
2358 of zonelist */ 2426 of zonelist */
2359 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); 2427 stop_machine(__build_all_zonelists, NULL, NULL);
2360 /* cpuset refresh routine should be here */ 2428 /* cpuset refresh routine should be here */
2361 } 2429 }
2362 vm_total_pages = nr_free_pagecache_pages(); 2430 vm_total_pages = nr_free_pagecache_pages();
@@ -2475,6 +2543,10 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2475 continue; 2543 continue;
2476 page = pfn_to_page(pfn); 2544 page = pfn_to_page(pfn);
2477 2545
2546 /* Watch out for overlapping nodes */
2547 if (page_to_nid(page) != zone_to_nid(zone))
2548 continue;
2549
2478 /* Blocks with reserved pages will never free, skip them. */ 2550 /* Blocks with reserved pages will never free, skip them. */
2479 if (PageReserved(page)) 2551 if (PageReserved(page))
2480 continue; 2552 continue;
@@ -2534,6 +2606,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2534 } 2606 }
2535 page = pfn_to_page(pfn); 2607 page = pfn_to_page(pfn);
2536 set_page_links(page, zone, nid, pfn); 2608 set_page_links(page, zone, nid, pfn);
2609 mminit_verify_page_links(page, zone, nid, pfn);
2537 init_page_count(page); 2610 init_page_count(page);
2538 reset_page_mapcount(page); 2611 reset_page_mapcount(page);
2539 SetPageReserved(page); 2612 SetPageReserved(page);
@@ -2611,7 +2684,7 @@ static int zone_batchsize(struct zone *zone)
2611 return batch; 2684 return batch;
2612} 2685}
2613 2686
2614inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 2687static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2615{ 2688{
2616 struct per_cpu_pages *pcp; 2689 struct per_cpu_pages *pcp;
2617 2690
@@ -2836,6 +2909,12 @@ __meminit int init_currently_empty_zone(struct zone *zone,
2836 2909
2837 zone->zone_start_pfn = zone_start_pfn; 2910 zone->zone_start_pfn = zone_start_pfn;
2838 2911
2912 mminit_dprintk(MMINIT_TRACE, "memmap_init",
2913 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
2914 pgdat->node_id,
2915 (unsigned long)zone_idx(zone),
2916 zone_start_pfn, (zone_start_pfn + size));
2917
2839 zone_init_free_lists(zone); 2918 zone_init_free_lists(zone);
2840 2919
2841 return 0; 2920 return 0;
@@ -2975,7 +3054,8 @@ void __init sparse_memory_present_with_active_regions(int nid)
2975void __init push_node_boundaries(unsigned int nid, 3054void __init push_node_boundaries(unsigned int nid,
2976 unsigned long start_pfn, unsigned long end_pfn) 3055 unsigned long start_pfn, unsigned long end_pfn)
2977{ 3056{
2978 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", 3057 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3058 "Entering push_node_boundaries(%u, %lu, %lu)\n",
2979 nid, start_pfn, end_pfn); 3059 nid, start_pfn, end_pfn);
2980 3060
2981 /* Initialise the boundary for this node if necessary */ 3061 /* Initialise the boundary for this node if necessary */
@@ -2993,7 +3073,8 @@ void __init push_node_boundaries(unsigned int nid,
2993static void __meminit account_node_boundary(unsigned int nid, 3073static void __meminit account_node_boundary(unsigned int nid,
2994 unsigned long *start_pfn, unsigned long *end_pfn) 3074 unsigned long *start_pfn, unsigned long *end_pfn)
2995{ 3075{
2996 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", 3076 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3077 "Entering account_node_boundary(%u, %lu, %lu)\n",
2997 nid, *start_pfn, *end_pfn); 3078 nid, *start_pfn, *end_pfn);
2998 3079
2999 /* Return if boundary information has not been provided */ 3080 /* Return if boundary information has not been provided */
@@ -3050,7 +3131,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
3050 * assumption is made that zones within a node are ordered in monotonic 3131 * assumption is made that zones within a node are ordered in monotonic
3051 * increasing memory addresses so that the "highest" populated zone is used 3132 * increasing memory addresses so that the "highest" populated zone is used
3052 */ 3133 */
3053void __init find_usable_zone_for_movable(void) 3134static void __init find_usable_zone_for_movable(void)
3054{ 3135{
3055 int zone_index; 3136 int zone_index;
3056 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 3137 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
@@ -3076,7 +3157,7 @@ void __init find_usable_zone_for_movable(void)
3076 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 3157 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
3077 * zones within a node are in order of monotonic increases memory addresses 3158 * zones within a node are in order of monotonic increases memory addresses
3078 */ 3159 */
3079void __meminit adjust_zone_range_for_zone_movable(int nid, 3160static void __meminit adjust_zone_range_for_zone_movable(int nid,
3080 unsigned long zone_type, 3161 unsigned long zone_type,
3081 unsigned long node_start_pfn, 3162 unsigned long node_start_pfn,
3082 unsigned long node_end_pfn, 3163 unsigned long node_end_pfn,
@@ -3137,7 +3218,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3137 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3218 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3138 * then all holes in the requested range will be accounted for. 3219 * then all holes in the requested range will be accounted for.
3139 */ 3220 */
3140unsigned long __meminit __absent_pages_in_range(int nid, 3221static unsigned long __meminit __absent_pages_in_range(int nid,
3141 unsigned long range_start_pfn, 3222 unsigned long range_start_pfn,
3142 unsigned long range_end_pfn) 3223 unsigned long range_end_pfn)
3143{ 3224{
@@ -3350,10 +3431,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3350 pgdat->nr_zones = 0; 3431 pgdat->nr_zones = 0;
3351 init_waitqueue_head(&pgdat->kswapd_wait); 3432 init_waitqueue_head(&pgdat->kswapd_wait);
3352 pgdat->kswapd_max_order = 0; 3433 pgdat->kswapd_max_order = 0;
3434 pgdat_page_cgroup_init(pgdat);
3353 3435
3354 for (j = 0; j < MAX_NR_ZONES; j++) { 3436 for (j = 0; j < MAX_NR_ZONES; j++) {
3355 struct zone *zone = pgdat->node_zones + j; 3437 struct zone *zone = pgdat->node_zones + j;
3356 unsigned long size, realsize, memmap_pages; 3438 unsigned long size, realsize, memmap_pages;
3439 enum lru_list l;
3357 3440
3358 size = zone_spanned_pages_in_node(nid, j, zones_size); 3441 size = zone_spanned_pages_in_node(nid, j, zones_size);
3359 realsize = size - zone_absent_pages_in_node(nid, j, 3442 realsize = size - zone_absent_pages_in_node(nid, j,
@@ -3404,10 +3487,14 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3404 zone->prev_priority = DEF_PRIORITY; 3487 zone->prev_priority = DEF_PRIORITY;
3405 3488
3406 zone_pcp_init(zone); 3489 zone_pcp_init(zone);
3407 INIT_LIST_HEAD(&zone->active_list); 3490 for_each_lru(l) {
3408 INIT_LIST_HEAD(&zone->inactive_list); 3491 INIT_LIST_HEAD(&zone->lru[l].list);
3409 zone->nr_scan_active = 0; 3492 zone->lru[l].nr_scan = 0;
3410 zone->nr_scan_inactive = 0; 3493 }
3494 zone->recent_rotated[0] = 0;
3495 zone->recent_rotated[1] = 0;
3496 zone->recent_scanned[0] = 0;
3497 zone->recent_scanned[1] = 0;
3411 zap_zone_vm_stats(zone); 3498 zap_zone_vm_stats(zone);
3412 zone->flags = 0; 3499 zone->flags = 0;
3413 if (!size) 3500 if (!size)
@@ -3464,10 +3551,11 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
3464#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 3551#endif /* CONFIG_FLAT_NODE_MEM_MAP */
3465} 3552}
3466 3553
3467void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, 3554void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
3468 unsigned long *zones_size, unsigned long node_start_pfn, 3555 unsigned long node_start_pfn, unsigned long *zholes_size)
3469 unsigned long *zholes_size)
3470{ 3556{
3557 pg_data_t *pgdat = NODE_DATA(nid);
3558
3471 pgdat->node_id = nid; 3559 pgdat->node_id = nid;
3472 pgdat->node_start_pfn = node_start_pfn; 3560 pgdat->node_start_pfn = node_start_pfn;
3473 calculate_node_totalpages(pgdat, zones_size, zholes_size); 3561 calculate_node_totalpages(pgdat, zones_size, zholes_size);
@@ -3520,10 +3608,13 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3520{ 3608{
3521 int i; 3609 int i;
3522 3610
3523 printk(KERN_DEBUG "Entering add_active_range(%d, %#lx, %#lx) " 3611 mminit_dprintk(MMINIT_TRACE, "memory_register",
3524 "%d entries of %d used\n", 3612 "Entering add_active_range(%d, %#lx, %#lx) "
3525 nid, start_pfn, end_pfn, 3613 "%d entries of %d used\n",
3526 nr_nodemap_entries, MAX_ACTIVE_REGIONS); 3614 nid, start_pfn, end_pfn,
3615 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
3616
3617 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
3527 3618
3528 /* Merge with existing active regions if possible */ 3619 /* Merge with existing active regions if possible */
3529 for (i = 0; i < nr_nodemap_entries; i++) { 3620 for (i = 0; i < nr_nodemap_entries; i++) {
@@ -3669,7 +3760,7 @@ static void __init sort_node_map(void)
3669} 3760}
3670 3761
3671/* Find the lowest pfn for a node */ 3762/* Find the lowest pfn for a node */
3672unsigned long __init find_min_pfn_for_node(int nid) 3763static unsigned long __init find_min_pfn_for_node(int nid)
3673{ 3764{
3674 int i; 3765 int i;
3675 unsigned long min_pfn = ULONG_MAX; 3766 unsigned long min_pfn = ULONG_MAX;
@@ -3698,23 +3789,6 @@ unsigned long __init find_min_pfn_with_active_regions(void)
3698 return find_min_pfn_for_node(MAX_NUMNODES); 3789 return find_min_pfn_for_node(MAX_NUMNODES);
3699} 3790}
3700 3791
3701/**
3702 * find_max_pfn_with_active_regions - Find the maximum PFN registered
3703 *
3704 * It returns the maximum PFN based on information provided via
3705 * add_active_range().
3706 */
3707unsigned long __init find_max_pfn_with_active_regions(void)
3708{
3709 int i;
3710 unsigned long max_pfn = 0;
3711
3712 for (i = 0; i < nr_nodemap_entries; i++)
3713 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
3714
3715 return max_pfn;
3716}
3717
3718/* 3792/*
3719 * early_calculate_totalpages() 3793 * early_calculate_totalpages()
3720 * Sum pages in active regions for movable zone. 3794 * Sum pages in active regions for movable zone.
@@ -3741,7 +3815,7 @@ static unsigned long __init early_calculate_totalpages(void)
3741 * memory. When they don't, some nodes will have more kernelcore than 3815 * memory. When they don't, some nodes will have more kernelcore than
3742 * others 3816 * others
3743 */ 3817 */
3744void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) 3818static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3745{ 3819{
3746 int i, nid; 3820 int i, nid;
3747 unsigned long usable_startpfn; 3821 unsigned long usable_startpfn;
@@ -3904,7 +3978,7 @@ static void check_for_regular_memory(pg_data_t *pgdat)
3904void __init free_area_init_nodes(unsigned long *max_zone_pfn) 3978void __init free_area_init_nodes(unsigned long *max_zone_pfn)
3905{ 3979{
3906 unsigned long nid; 3980 unsigned long nid;
3907 enum zone_type i; 3981 int i;
3908 3982
3909 /* Sort early_node_map as initialisation assumes it is sorted */ 3983 /* Sort early_node_map as initialisation assumes it is sorted */
3910 sort_node_map(); 3984 sort_node_map();
@@ -3957,10 +4031,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
3957 early_node_map[i].end_pfn); 4031 early_node_map[i].end_pfn);
3958 4032
3959 /* Initialise every node */ 4033 /* Initialise every node */
4034 mminit_verify_pageflags_layout();
3960 setup_nr_node_ids(); 4035 setup_nr_node_ids();
3961 for_each_online_node(nid) { 4036 for_each_online_node(nid) {
3962 pg_data_t *pgdat = NODE_DATA(nid); 4037 pg_data_t *pgdat = NODE_DATA(nid);
3963 free_area_init_node(nid, pgdat, NULL, 4038 free_area_init_node(nid, NULL,
3964 find_min_pfn_for_node(nid), NULL); 4039 find_min_pfn_for_node(nid), NULL);
3965 4040
3966 /* Any memory on that node */ 4041 /* Any memory on that node */
@@ -4025,15 +4100,13 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4025} 4100}
4026 4101
4027#ifndef CONFIG_NEED_MULTIPLE_NODES 4102#ifndef CONFIG_NEED_MULTIPLE_NODES
4028static bootmem_data_t contig_bootmem_data; 4103struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] };
4029struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
4030
4031EXPORT_SYMBOL(contig_page_data); 4104EXPORT_SYMBOL(contig_page_data);
4032#endif 4105#endif
4033 4106
4034void __init free_area_init(unsigned long *zones_size) 4107void __init free_area_init(unsigned long *zones_size)
4035{ 4108{
4036 free_area_init_node(0, NODE_DATA(0), zones_size, 4109 free_area_init_node(0, zones_size,
4037 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 4110 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
4038} 4111}
4039 4112
@@ -4163,7 +4236,7 @@ void setup_per_zone_pages_min(void)
4163 for_each_zone(zone) { 4236 for_each_zone(zone) {
4164 u64 tmp; 4237 u64 tmp;
4165 4238
4166 spin_lock_irqsave(&zone->lru_lock, flags); 4239 spin_lock_irqsave(&zone->lock, flags);
4167 tmp = (u64)pages_min * zone->present_pages; 4240 tmp = (u64)pages_min * zone->present_pages;
4168 do_div(tmp, lowmem_pages); 4241 do_div(tmp, lowmem_pages);
4169 if (is_highmem(zone)) { 4242 if (is_highmem(zone)) {
@@ -4195,13 +4268,53 @@ void setup_per_zone_pages_min(void)
4195 zone->pages_low = zone->pages_min + (tmp >> 2); 4268 zone->pages_low = zone->pages_min + (tmp >> 2);
4196 zone->pages_high = zone->pages_min + (tmp >> 1); 4269 zone->pages_high = zone->pages_min + (tmp >> 1);
4197 setup_zone_migrate_reserve(zone); 4270 setup_zone_migrate_reserve(zone);
4198 spin_unlock_irqrestore(&zone->lru_lock, flags); 4271 spin_unlock_irqrestore(&zone->lock, flags);
4199 } 4272 }
4200 4273
4201 /* update totalreserve_pages */ 4274 /* update totalreserve_pages */
4202 calculate_totalreserve_pages(); 4275 calculate_totalreserve_pages();
4203} 4276}
4204 4277
4278/**
4279 * setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
4280 *
4281 * The inactive anon list should be small enough that the VM never has to
4282 * do too much work, but large enough that each inactive page has a chance
4283 * to be referenced again before it is swapped out.
4284 *
4285 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
4286 * INACTIVE_ANON pages on this zone's LRU, maintained by the
4287 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
4288 * the anonymous pages are kept on the inactive list.
4289 *
4290 * total target max
4291 * memory ratio inactive anon
4292 * -------------------------------------
4293 * 10MB 1 5MB
4294 * 100MB 1 50MB
4295 * 1GB 3 250MB
4296 * 10GB 10 0.9GB
4297 * 100GB 31 3GB
4298 * 1TB 101 10GB
4299 * 10TB 320 32GB
4300 */
4301void setup_per_zone_inactive_ratio(void)
4302{
4303 struct zone *zone;
4304
4305 for_each_zone(zone) {
4306 unsigned int gb, ratio;
4307
4308 /* Zone size in gigabytes */
4309 gb = zone->present_pages >> (30 - PAGE_SHIFT);
4310 ratio = int_sqrt(10 * gb);
4311 if (!ratio)
4312 ratio = 1;
4313
4314 zone->inactive_ratio = ratio;
4315 }
4316}
4317
4205/* 4318/*
4206 * Initialise min_free_kbytes. 4319 * Initialise min_free_kbytes.
4207 * 4320 *
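
setup_per_zone_inactive_ratio() above derives the target ratio as int_sqrt(10 * zone size in GB), clamped to at least 1, which is where the table in the comment comes from. The short program below recomputes those table entries (int_sqrt approximated with a naive loop):

#include <stdio.h>

/* Integer square root by simple search; good enough to mirror int_sqrt() here. */
static unsigned int isqrt(unsigned long long x)
{
        unsigned long long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return (unsigned int)r;
}

int main(void)
{
        /* Zone sizes from the table in the comment, in gigabytes. */
        unsigned long long sizes_gb[] = { 1, 10, 100, 1024, 10240 };
        unsigned int i;

        for (i = 0; i < sizeof(sizes_gb) / sizeof(sizes_gb[0]); i++) {
                unsigned int ratio = isqrt(10 * sizes_gb[i]);

                if (!ratio)
                        ratio = 1;
                printf("%6lluGB -> inactive_ratio %u\n", sizes_gb[i], ratio);
        }
        return 0;
}

The sub-gigabyte rows of the table fall out of the clamp: a zone size of 0 GB gives a square root of 0, which is raised to 1.
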
@@ -4239,6 +4352,7 @@ static int __init init_per_zone_pages_min(void)
4239 min_free_kbytes = 65536; 4352 min_free_kbytes = 65536;
4240 setup_per_zone_pages_min(); 4353 setup_per_zone_pages_min();
4241 setup_per_zone_lowmem_reserve(); 4354 setup_per_zone_lowmem_reserve();
4355 setup_per_zone_inactive_ratio();
4242 return 0; 4356 return 0;
4243} 4357}
4244module_init(init_per_zone_pages_min) 4358module_init(init_per_zone_pages_min)
@@ -4400,7 +4514,7 @@ void *__init alloc_large_system_hash(const char *tablename,
4400 do { 4514 do {
4401 size = bucketsize << log2qty; 4515 size = bucketsize << log2qty;
4402 if (flags & HASH_EARLY) 4516 if (flags & HASH_EARLY)
4403 table = alloc_bootmem(size); 4517 table = alloc_bootmem_nopanic(size);
4404 else if (hashdist) 4518 else if (hashdist)
4405 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4519 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4406 else { 4520 else {
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
new file mode 100644
index 000000000000..f59d797dc5a9
--- /dev/null
+++ b/mm/page_cgroup.c
@@ -0,0 +1,256 @@
1#include <linux/mm.h>
2#include <linux/mmzone.h>
3#include <linux/bootmem.h>
4#include <linux/bit_spinlock.h>
5#include <linux/page_cgroup.h>
6#include <linux/hash.h>
7#include <linux/slab.h>
8#include <linux/memory.h>
9#include <linux/vmalloc.h>
10#include <linux/cgroup.h>
11
12static void __meminit
13__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
14{
15 pc->flags = 0;
16 pc->mem_cgroup = NULL;
17 pc->page = pfn_to_page(pfn);
18}
19static unsigned long total_usage;
20
21#if !defined(CONFIG_SPARSEMEM)
22
23
24void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
25{
26 pgdat->node_page_cgroup = NULL;
27}
28
29struct page_cgroup *lookup_page_cgroup(struct page *page)
30{
31 unsigned long pfn = page_to_pfn(page);
32 unsigned long offset;
33 struct page_cgroup *base;
34
35 base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
36 if (unlikely(!base))
37 return NULL;
38
39 offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
40 return base + offset;
41}
42
43static int __init alloc_node_page_cgroup(int nid)
44{
45 struct page_cgroup *base, *pc;
46 unsigned long table_size;
47 unsigned long start_pfn, nr_pages, index;
48
49 start_pfn = NODE_DATA(nid)->node_start_pfn;
50 nr_pages = NODE_DATA(nid)->node_spanned_pages;
51
52 table_size = sizeof(struct page_cgroup) * nr_pages;
53
54 base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
55 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
56 if (!base)
57 return -ENOMEM;
58 for (index = 0; index < nr_pages; index++) {
59 pc = base + index;
60 __init_page_cgroup(pc, start_pfn + index);
61 }
62 NODE_DATA(nid)->node_page_cgroup = base;
63 total_usage += table_size;
64 return 0;
65}
66
67void __init page_cgroup_init(void)
68{
69
70 int nid, fail;
71
72 if (mem_cgroup_subsys.disabled)
73 return;
74
75 for_each_online_node(nid) {
76 fail = alloc_node_page_cgroup(nid);
77 if (fail)
78 goto fail;
79 }
80 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
81 printk(KERN_INFO "please try cgroup_disable=memory option if you"
82 " don't want\n");
83 return;
84fail:
85 printk(KERN_CRIT "allocation of page_cgroup was failed.\n");
86 printk(KERN_CRIT "please try cgroup_disable=memory boot option\n");
87 panic("Out of memory");
88}
89
90#else /* CONFIG_FLAT_NODE_MEM_MAP */
91
92struct page_cgroup *lookup_page_cgroup(struct page *page)
93{
94 unsigned long pfn = page_to_pfn(page);
95 struct mem_section *section = __pfn_to_section(pfn);
96
97 return section->page_cgroup + pfn;
98}
99
100int __meminit init_section_page_cgroup(unsigned long pfn)
101{
102 struct mem_section *section;
103 struct page_cgroup *base, *pc;
104 unsigned long table_size;
105 int nid, index;
106
107 section = __pfn_to_section(pfn);
108
109 if (section->page_cgroup)
110 return 0;
111
112 nid = page_to_nid(pfn_to_page(pfn));
113
114 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
115 if (slab_is_available()) {
116 base = kmalloc_node(table_size, GFP_KERNEL, nid);
117 if (!base)
118 base = vmalloc_node(table_size, nid);
119 } else {
120 base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), table_size,
121 PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
122 }
123
124 if (!base) {
125 printk(KERN_ERR "page cgroup allocation failure\n");
126 return -ENOMEM;
127 }
128
129 for (index = 0; index < PAGES_PER_SECTION; index++) {
130 pc = base + index;
131 __init_page_cgroup(pc, pfn + index);
132 }
133
134 section = __pfn_to_section(pfn);
135 section->page_cgroup = base - pfn;
136 total_usage += table_size;
137 return 0;
138}
139#ifdef CONFIG_MEMORY_HOTPLUG
140void __free_page_cgroup(unsigned long pfn)
141{
142 struct mem_section *ms;
143 struct page_cgroup *base;
144
145 ms = __pfn_to_section(pfn);
146 if (!ms || !ms->page_cgroup)
147 return;
148 base = ms->page_cgroup + pfn;
149 if (is_vmalloc_addr(base)) {
150 vfree(base);
151 ms->page_cgroup = NULL;
152 } else {
153 struct page *page = virt_to_page(base);
154 if (!PageReserved(page)) { /* Is bootmem ? */
155 kfree(base);
156 ms->page_cgroup = NULL;
157 }
158 }
159}
160
161int online_page_cgroup(unsigned long start_pfn,
162 unsigned long nr_pages,
163 int nid)
164{
165 unsigned long start, end, pfn;
166 int fail = 0;
167
168 start = start_pfn & ~(PAGES_PER_SECTION - 1);

169 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
170
171 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
172 if (!pfn_present(pfn))
173 continue;
174 fail = init_section_page_cgroup(pfn);
175 }
176 if (!fail)
177 return 0;
178
179 /* rollback */
180 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
181 __free_page_cgroup(pfn);
182
183 return -ENOMEM;
184}
185
186int offline_page_cgroup(unsigned long start_pfn,
187 unsigned long nr_pages, int nid)
188{
189 unsigned long start, end, pfn;
190
191 start = start_pfn & ~(PAGES_PER_SECTION - 1);
192 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
193
194 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
195 __free_page_cgroup(pfn);
196 return 0;
197
198}
199
200static int page_cgroup_callback(struct notifier_block *self,
201 unsigned long action, void *arg)
202{
203 struct memory_notify *mn = arg;
204 int ret = 0;
205 switch (action) {
206 case MEM_GOING_ONLINE:
207 ret = online_page_cgroup(mn->start_pfn,
208 mn->nr_pages, mn->status_change_nid);
209 break;
210 case MEM_CANCEL_ONLINE:
211 case MEM_OFFLINE:
212 offline_page_cgroup(mn->start_pfn,
213 mn->nr_pages, mn->status_change_nid);
214 break;
215 case MEM_GOING_OFFLINE:
216 break;
217 case MEM_ONLINE:
218 case MEM_CANCEL_OFFLINE:
219 break;
220 }
221 ret = notifier_from_errno(ret);
222 return ret;
223}
224
225#endif
226
227void __init page_cgroup_init(void)
228{
229 unsigned long pfn;
230 int fail = 0;
231
232 if (mem_cgroup_subsys.disabled)
233 return;
234
235 for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
236 if (!pfn_present(pfn))
237 continue;
238 fail = init_section_page_cgroup(pfn);
239 }
240 if (fail) {
241 printk(KERN_CRIT "try cgroup_disable=memory boot option\n");
242 panic("Out of memory");
243 } else {
244 hotplug_memory_notifier(page_cgroup_callback, 0);
245 }
246 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
247 printk(KERN_INFO "please try cgroup_disable=memory option if you don't"
248 " want\n");
249}
250
251void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
252{
253 return;
254}
255
256#endif
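
One detail of the new sparsemem lookup path worth spelling out: init_section_page_cgroup() stores base - pfn in the section, so lookup_page_cgroup() can index with the absolute pfn and skip a subtraction on every lookup. A minimal sketch of that biased-pointer trick (strictly speaking the out-of-range pointer arithmetic is undefined in ISO C, but it mirrors what the kernel relies on):

#include <stdio.h>
#include <stdlib.h>

struct page_cgroup {
        unsigned long pfn;      /* stand-in for the real fields */
};

int main(void)
{
        unsigned long section_start_pfn = 0x10000;      /* first pfn covered by this section */
        unsigned long pages_per_section = 8;            /* tiny, for the demo */
        struct page_cgroup *base, *biased;
        unsigned long i;

        base = calloc(pages_per_section, sizeof(*base));
        if (!base)
                return 1;
        for (i = 0; i < pages_per_section; i++)
                base[i].pfn = section_start_pfn + i;

        /* Store the biased pointer, as section->page_cgroup = base - pfn does. */
        biased = base - section_start_pfn;

        /* Any pfn in the section now indexes the array without subtracting the start. */
        printf("pfn 0x%lx -> entry with pfn 0x%lx\n",
               section_start_pfn + 3, biased[section_start_pfn + 3].pfn);

        free(base);
        return 0;
}
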
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 3444b58033c8..b70a7fec1ff6 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -2,7 +2,6 @@
2 * linux/mm/page_isolation.c 2 * linux/mm/page_isolation.c
3 */ 3 */
4 4
5#include <stddef.h>
6#include <linux/mm.h> 5#include <linux/mm.h>
7#include <linux/page-isolation.h> 6#include <linux/page-isolation.h>
8#include <linux/pageblock-flags.h> 7#include <linux/pageblock-flags.h>
@@ -115,8 +114,10 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
115 114
116int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 115int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
117{ 116{
118 unsigned long pfn; 117 unsigned long pfn, flags;
119 struct page *page; 118 struct page *page;
119 struct zone *zone;
120 int ret;
120 121
121 pfn = start_pfn; 122 pfn = start_pfn;
122 /* 123 /*
@@ -132,7 +133,9 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
132 if (pfn < end_pfn) 133 if (pfn < end_pfn)
133 return -EBUSY; 134 return -EBUSY;
134 /* Check all pages are free or Marked as ISOLATED */ 135 /* Check all pages are free or Marked as ISOLATED */
135 if (__test_page_isolated_in_pageblock(start_pfn, end_pfn)) 136 zone = page_zone(pfn_to_page(pfn));
136 return 0; 137 spin_lock_irqsave(&zone->lock, flags);
137 return -EBUSY; 138 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn);
139 spin_unlock_irqrestore(&zone->lock, flags);
140 return ret ? 0 : -EBUSY;
138} 141}
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 9d834aa4b979..a0a14c4d5072 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds. 4 * Copyright (C) 2002, Linus Torvalds.
5 * 5 *
6 * 09Apr2002 akpm@zip.com.au 6 * 09Apr2002 Andrew Morton
7 * Initial version 7 * Initial version
8 * 29Feb2004 kaos@sgi.com 8 * 29Feb2004 kaos@sgi.com
9 * Move worker thread creation to kthread to avoid chewing 9 * Move worker thread creation to kthread to avoid chewing
@@ -130,7 +130,7 @@ static int __pdflush(struct pdflush_work *my_work)
130 * Thread creation: For how long have there been zero 130 * Thread creation: For how long have there been zero
131 * available threads? 131 * available threads?
132 */ 132 */
133 if (jiffies - last_empty_jifs > 1 * HZ) { 133 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
134 /* unlocked list_empty() test is OK here */ 134 /* unlocked list_empty() test is OK here */
135 if (list_empty(&pdflush_list)) { 135 if (list_empty(&pdflush_list)) {
136 /* unlocked test is OK here */ 136 /* unlocked test is OK here */
@@ -151,7 +151,7 @@ static int __pdflush(struct pdflush_work *my_work)
151 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) 151 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
152 continue; 152 continue;
153 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); 153 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
154 if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) { 154 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
155 /* Limit exit rate */ 155 /* Limit exit rate */
156 pdf->when_i_went_to_sleep = jiffies; 156 pdf->when_i_went_to_sleep = jiffies;
157 break; /* exeunt */ 157 break; /* exeunt */
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 3f703f7cb398..8dbb6805ef35 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -26,7 +26,10 @@ DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK];
26static unsigned long max_pages(unsigned long min_pages) 26static unsigned long max_pages(unsigned long min_pages)
27{ 27{
28 unsigned long node_free_pages, max; 28 unsigned long node_free_pages, max;
29 struct zone *zones = NODE_DATA(numa_node_id())->node_zones; 29 int node = numa_node_id();
30 struct zone *zones = NODE_DATA(node)->node_zones;
31 int num_cpus_on_node;
32 node_to_cpumask_ptr(cpumask_on_node, node);
30 33
31 node_free_pages = 34 node_free_pages =
32#ifdef CONFIG_ZONE_DMA 35#ifdef CONFIG_ZONE_DMA
@@ -38,6 +41,10 @@ static unsigned long max_pages(unsigned long min_pages)
38 zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES); 41 zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);
39 42
40 max = node_free_pages / FRACTION_OF_NODE_MEM; 43 max = node_free_pages / FRACTION_OF_NODE_MEM;
44
45 num_cpus_on_node = cpus_weight_nr(*cpumask_on_node);
46 max /= num_cpus_on_node;
47
41 return max(max, min_pages); 48 return max(max, min_pages);
42} 49}
43 50
diff --git a/mm/readahead.c b/mm/readahead.c
index d8723a5f6496..bec83c15a78f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds 4 * Copyright (C) 2002, Linus Torvalds
5 * 5 *
6 * 09Apr2002 akpm@zip.com.au 6 * 09Apr2002 Andrew Morton
7 * Initial version. 7 * Initial version.
8 */ 8 */
9 9
@@ -229,7 +229,7 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
229 */ 229 */
230unsigned long max_sane_readahead(unsigned long nr) 230unsigned long max_sane_readahead(unsigned long nr)
231{ 231{
232 return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE) 232 return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
233 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); 233 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
234} 234}
235 235
@@ -382,9 +382,9 @@ ondemand_readahead(struct address_space *mapping,
382 if (hit_readahead_marker) { 382 if (hit_readahead_marker) {
383 pgoff_t start; 383 pgoff_t start;
384 384
385 read_lock_irq(&mapping->tree_lock); 385 rcu_read_lock();
386 start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); 386 start = radix_tree_next_hole(&mapping->page_tree, offset,max+1);
387 read_unlock_irq(&mapping->tree_lock); 387 rcu_read_unlock();
388 388
389 if (!start || start - offset > max) 389 if (!start || start - offset > max)
390 return 0; 390 return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index bf0a5b7cfb8e..10993942d6c9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,12 +49,51 @@
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kallsyms.h> 50#include <linux/kallsyms.h>
51#include <linux/memcontrol.h> 51#include <linux/memcontrol.h>
52#include <linux/mmu_notifier.h>
52 53
53#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
54 55
55struct kmem_cache *anon_vma_cachep; 56#include "internal.h"
56 57
57/* This must be called under the mmap_sem. */ 58static struct kmem_cache *anon_vma_cachep;
59
60static inline struct anon_vma *anon_vma_alloc(void)
61{
62 return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
63}
64
65static inline void anon_vma_free(struct anon_vma *anon_vma)
66{
67 kmem_cache_free(anon_vma_cachep, anon_vma);
68}
69
70/**
71 * anon_vma_prepare - attach an anon_vma to a memory region
72 * @vma: the memory region in question
73 *
74 * This makes sure the memory mapping described by 'vma' has
75 * an 'anon_vma' attached to it, so that we can associate the
76 * anonymous pages mapped into it with that anon_vma.
77 *
78 * The common case will be that we already have one, but if
79 * not we either need to find an adjacent mapping that we
80 * can re-use the anon_vma from (very common when the only
81 * reason for splitting a vma has been mprotect()), or we
82 * allocate a new one.
83 *
84 * Anon-vma allocations are very subtle, because we may have
85 * optimistically looked up an anon_vma in page_lock_anon_vma()
86 * and that may actually touch the spinlock even in the newly
87 * allocated vma (it depends on RCU to make sure that the
88 * anon_vma isn't actually destroyed).
89 *
90 * As a result, we need to do proper anon_vma locking even
91 * for the new allocation. At the same time, we do not want
92 * to do any locking for the common case of already having
93 * an anon_vma.
94 *
95 * This must be called with the mmap_sem held for reading.
96 */
58int anon_vma_prepare(struct vm_area_struct *vma) 97int anon_vma_prepare(struct vm_area_struct *vma)
59{ 98{
60 struct anon_vma *anon_vma = vma->anon_vma; 99 struct anon_vma *anon_vma = vma->anon_vma;
@@ -62,20 +101,17 @@ int anon_vma_prepare(struct vm_area_struct *vma)
62 might_sleep(); 101 might_sleep();
63 if (unlikely(!anon_vma)) { 102 if (unlikely(!anon_vma)) {
64 struct mm_struct *mm = vma->vm_mm; 103 struct mm_struct *mm = vma->vm_mm;
65 struct anon_vma *allocated, *locked; 104 struct anon_vma *allocated;
66 105
67 anon_vma = find_mergeable_anon_vma(vma); 106 anon_vma = find_mergeable_anon_vma(vma);
68 if (anon_vma) { 107 allocated = NULL;
69 allocated = NULL; 108 if (!anon_vma) {
70 locked = anon_vma;
71 spin_lock(&locked->lock);
72 } else {
73 anon_vma = anon_vma_alloc(); 109 anon_vma = anon_vma_alloc();
74 if (unlikely(!anon_vma)) 110 if (unlikely(!anon_vma))
75 return -ENOMEM; 111 return -ENOMEM;
76 allocated = anon_vma; 112 allocated = anon_vma;
77 locked = NULL;
78 } 113 }
114 spin_lock(&anon_vma->lock);
79 115
80 /* page_table_lock to protect against threads */ 116 /* page_table_lock to protect against threads */
81 spin_lock(&mm->page_table_lock); 117 spin_lock(&mm->page_table_lock);
@@ -86,8 +122,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
86 } 122 }
87 spin_unlock(&mm->page_table_lock); 123 spin_unlock(&mm->page_table_lock);
88 124
89 if (locked) 125 spin_unlock(&anon_vma->lock);
90 spin_unlock(&locked->lock);
91 if (unlikely(allocated)) 126 if (unlikely(allocated))
92 anon_vma_free(allocated); 127 anon_vma_free(allocated);
93 } 128 }
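
The rewritten anon_vma_prepare() above drops the separate "locked" pointer: it reuses or optimistically allocates an anon_vma, always takes its lock, publishes it under page_table_lock, and frees the spare copy if another thread won the race. A rough userspace analogue of that allocate-then-publish pattern, using pthread mutexes in place of the kernel spinlocks; all names here are invented for illustration:

#include <pthread.h>
#include <stdlib.h>

struct anon_ctx {
    pthread_mutex_t lock;                   /* plays the role of anon_vma->lock */
};

static pthread_mutex_t publish_lock = PTHREAD_MUTEX_INITIALIZER;
static struct anon_ctx *shared_ctx;         /* plays the role of vma->anon_vma */

static struct anon_ctx *ctx_prepare(void)
{
    struct anon_ctx *ctx = shared_ctx;
    struct anon_ctx *allocated = NULL;

    if (!ctx) {
        allocated = calloc(1, sizeof(*allocated));
        if (!allocated)
            return NULL;                    /* -ENOMEM in the kernel version */
        pthread_mutex_init(&allocated->lock, NULL);
        ctx = allocated;
    }

    pthread_mutex_lock(&publish_lock);      /* stands in for page_table_lock */
    if (!shared_ctx) {
        shared_ctx = ctx;
        allocated = NULL;                   /* our copy got installed */
    }
    pthread_mutex_unlock(&publish_lock);

    free(allocated);                        /* lost the race: discard the spare */
    return shared_ctx;
}

int main(void)
{
    return ctx_prepare() ? 0 : 1;
}
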
@@ -138,7 +173,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
138 anon_vma_free(anon_vma); 173 anon_vma_free(anon_vma);
139} 174}
140 175
141static void anon_vma_ctor(struct kmem_cache *cachep, void *data) 176static void anon_vma_ctor(void *data)
142{ 177{
143 struct anon_vma *anon_vma = data; 178 struct anon_vma *anon_vma = data;
144 179
@@ -156,7 +191,7 @@ void __init anon_vma_init(void)
156 * Getting a lock on a stable anon_vma from a page off the LRU is 191 * Getting a lock on a stable anon_vma from a page off the LRU is
157 * tricky: page_lock_anon_vma relies on RCU to guard against the races. 192 * tricky: page_lock_anon_vma relies on RCU to guard against the races.
158 */ 193 */
159static struct anon_vma *page_lock_anon_vma(struct page *page) 194struct anon_vma *page_lock_anon_vma(struct page *page)
160{ 195{
161 struct anon_vma *anon_vma; 196 struct anon_vma *anon_vma;
162 unsigned long anon_mapping; 197 unsigned long anon_mapping;
@@ -176,7 +211,7 @@ out:
176 return NULL; 211 return NULL;
177} 212}
178 213
179static void page_unlock_anon_vma(struct anon_vma *anon_vma) 214void page_unlock_anon_vma(struct anon_vma *anon_vma)
180{ 215{
181 spin_unlock(&anon_vma->lock); 216 spin_unlock(&anon_vma->lock);
182 rcu_read_unlock(); 217 rcu_read_unlock();
@@ -223,10 +258,14 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
223/* 258/*
224 * Check that @page is mapped at @address into @mm. 259 * Check that @page is mapped at @address into @mm.
225 * 260 *
261 * If @sync is false, page_check_address may perform a racy check to avoid
262 * the page table lock when the pte is not present (helpful when reclaiming
263 * highly shared pages).
264 *
226 * On success returns with pte mapped and locked. 265 * On success returns with pte mapped and locked.
227 */ 266 */
228pte_t *page_check_address(struct page *page, struct mm_struct *mm, 267pte_t *page_check_address(struct page *page, struct mm_struct *mm,
229 unsigned long address, spinlock_t **ptlp) 268 unsigned long address, spinlock_t **ptlp, int sync)
230{ 269{
231 pgd_t *pgd; 270 pgd_t *pgd;
232 pud_t *pud; 271 pud_t *pud;
@@ -248,7 +287,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
248 287
249 pte = pte_offset_map(pmd, address); 288 pte = pte_offset_map(pmd, address);
250 /* Make a quick check before getting the lock */ 289 /* Make a quick check before getting the lock */
251 if (!pte_present(*pte)) { 290 if (!sync && !pte_present(*pte)) {
252 pte_unmap(pte); 291 pte_unmap(pte);
253 return NULL; 292 return NULL;
254 } 293 }
@@ -263,6 +302,32 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
263 return NULL; 302 return NULL;
264} 303}
265 304
305/**
306 * page_mapped_in_vma - check whether a page is really mapped in a VMA
307 * @page: the page to test
308 * @vma: the VMA to test
309 *
310 * Returns 1 if the page is mapped into the page tables of the VMA, 0
311 * if the page is not mapped into the page tables of this VMA. Only
312 * valid for normal file or anonymous VMAs.
313 */
314static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
315{
316 unsigned long address;
317 pte_t *pte;
318 spinlock_t *ptl;
319
320 address = vma_address(page, vma);
321 if (address == -EFAULT) /* out of vma range */
322 return 0;
323 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
324 if (!pte) /* the page is not in this mm */
325 return 0;
326 pte_unmap_unlock(pte, ptl);
327
328 return 1;
329}
330
266/* 331/*
267 * Subfunctions of page_referenced: page_referenced_one called 332 * Subfunctions of page_referenced: page_referenced_one called
268 * repeatedly from either page_referenced_anon or page_referenced_file. 333 * repeatedly from either page_referenced_anon or page_referenced_file.
@@ -280,14 +345,21 @@ static int page_referenced_one(struct page *page,
280 if (address == -EFAULT) 345 if (address == -EFAULT)
281 goto out; 346 goto out;
282 347
283 pte = page_check_address(page, mm, address, &ptl); 348 pte = page_check_address(page, mm, address, &ptl, 0);
284 if (!pte) 349 if (!pte)
285 goto out; 350 goto out;
286 351
352 /*
353 * Don't want to elevate referenced for mlocked page that gets this far,
354 * in order that it progresses to try_to_unmap and is moved to the
355 * unevictable list.
356 */
287 if (vma->vm_flags & VM_LOCKED) { 357 if (vma->vm_flags & VM_LOCKED) {
288 referenced++;
289 *mapcount = 1; /* break early from loop */ 358 *mapcount = 1; /* break early from loop */
290 } else if (ptep_clear_flush_young(vma, address, pte)) 359 goto out_unmap;
360 }
361
362 if (ptep_clear_flush_young_notify(vma, address, pte))
291 referenced++; 363 referenced++;
292 364
293 /* Pretend the page is referenced if the task has the 365 /* Pretend the page is referenced if the task has the
@@ -296,6 +368,7 @@ static int page_referenced_one(struct page *page,
296 rwsem_is_locked(&mm->mmap_sem)) 368 rwsem_is_locked(&mm->mmap_sem))
297 referenced++; 369 referenced++;
298 370
371out_unmap:
299 (*mapcount)--; 372 (*mapcount)--;
300 pte_unmap_unlock(pte, ptl); 373 pte_unmap_unlock(pte, ptl);
301out: 374out:
@@ -385,11 +458,6 @@ static int page_referenced_file(struct page *page,
385 */ 458 */
386 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 459 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
387 continue; 460 continue;
388 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
389 == (VM_LOCKED|VM_MAYSHARE)) {
390 referenced++;
391 break;
392 }
393 referenced += page_referenced_one(page, vma, &mapcount); 461 referenced += page_referenced_one(page, vma, &mapcount);
394 if (!mapcount) 462 if (!mapcount)
395 break; 463 break;
@@ -421,7 +489,7 @@ int page_referenced(struct page *page, int is_locked,
421 referenced += page_referenced_anon(page, mem_cont); 489 referenced += page_referenced_anon(page, mem_cont);
422 else if (is_locked) 490 else if (is_locked)
423 referenced += page_referenced_file(page, mem_cont); 491 referenced += page_referenced_file(page, mem_cont);
424 else if (TestSetPageLocked(page)) 492 else if (!trylock_page(page))
425 referenced++; 493 referenced++;
426 else { 494 else {
427 if (page->mapping) 495 if (page->mapping)
@@ -449,7 +517,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
449 if (address == -EFAULT) 517 if (address == -EFAULT)
450 goto out; 518 goto out;
451 519
452 pte = page_check_address(page, mm, address, &ptl); 520 pte = page_check_address(page, mm, address, &ptl, 1);
453 if (!pte) 521 if (!pte)
454 goto out; 522 goto out;
455 523
@@ -457,7 +525,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
457 pte_t entry; 525 pte_t entry;
458 526
459 flush_cache_page(vma, address, pte_pfn(*pte)); 527 flush_cache_page(vma, address, pte_pfn(*pte));
460 entry = ptep_clear_flush(vma, address, pte); 528 entry = ptep_clear_flush_notify(vma, address, pte);
461 entry = pte_wrprotect(entry); 529 entry = pte_wrprotect(entry);
462 entry = pte_mkclean(entry); 530 entry = pte_mkclean(entry);
463 set_pte_at(mm, address, pte, entry); 531 set_pte_at(mm, address, pte, entry);
@@ -576,14 +644,8 @@ void page_add_anon_rmap(struct page *page,
576 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 644 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
577 if (atomic_inc_and_test(&page->_mapcount)) 645 if (atomic_inc_and_test(&page->_mapcount))
578 __page_set_anon_rmap(page, vma, address); 646 __page_set_anon_rmap(page, vma, address);
579 else { 647 else
580 __page_check_anon_rmap(page, vma, address); 648 __page_check_anon_rmap(page, vma, address);
581 /*
582 * We unconditionally charged during prepare, we uncharge here
583 * This takes care of balancing the reference counts
584 */
585 mem_cgroup_uncharge_page(page);
586 }
587} 649}
588 650
589/** 651/**
@@ -614,12 +676,6 @@ void page_add_file_rmap(struct page *page)
614{ 676{
615 if (atomic_inc_and_test(&page->_mapcount)) 677 if (atomic_inc_and_test(&page->_mapcount))
616 __inc_zone_page_state(page, NR_FILE_MAPPED); 678 __inc_zone_page_state(page, NR_FILE_MAPPED);
617 else
618 /*
619 * We unconditionally charged during prepare, we uncharge here
620 * This takes care of balancing the reference counts
621 */
622 mem_cgroup_uncharge_page(page);
623} 679}
624 680
625#ifdef CONFIG_DEBUG_VM 681#ifdef CONFIG_DEBUG_VM
@@ -670,6 +726,22 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
670 } 726 }
671 727
672 /* 728 /*
729 * Now that the last pte has gone, s390 must transfer dirty
730 * flag from storage key to struct page. We can usually skip
731 * this if the page is anon, so about to be freed; but perhaps
732 * not if it's in swapcache - there might be another pte slot
733 * containing the swap entry, but page not yet written to swap.
734 */
735 if ((!PageAnon(page) || PageSwapCache(page)) &&
736 page_test_dirty(page)) {
737 page_clear_dirty(page);
738 set_page_dirty(page);
739 }
740 if (PageAnon(page))
741 mem_cgroup_uncharge_page(page);
742 __dec_zone_page_state(page,
743 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
744 /*
673 * It would be tidy to reset the PageAnon mapping here, 745 * It would be tidy to reset the PageAnon mapping here,
674 * but that might overwrite a racing page_add_anon_rmap 746 * but that might overwrite a racing page_add_anon_rmap
675 * which increments mapcount after us but sets mapping 747 * which increments mapcount after us but sets mapping
@@ -678,14 +750,6 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
678 * Leaving it set also helps swapoff to reinstate ptes 750 * Leaving it set also helps swapoff to reinstate ptes
679 * faster for those pages still in swapcache. 751 * faster for those pages still in swapcache.
680 */ 752 */
681 if (page_test_dirty(page)) {
682 page_clear_dirty(page);
683 set_page_dirty(page);
684 }
685 mem_cgroup_uncharge_page(page);
686
687 __dec_zone_page_state(page,
688 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
689 } 753 }
690} 754}
691 755
@@ -707,7 +771,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
707 if (address == -EFAULT) 771 if (address == -EFAULT)
708 goto out; 772 goto out;
709 773
710 pte = page_check_address(page, mm, address, &ptl); 774 pte = page_check_address(page, mm, address, &ptl, 0);
711 if (!pte) 775 if (!pte)
712 goto out; 776 goto out;
713 777
@@ -716,15 +780,20 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
716 * If it's recently referenced (perhaps page_referenced 780 * If it's recently referenced (perhaps page_referenced
717 * skipped over this mm) then we should reactivate it. 781 * skipped over this mm) then we should reactivate it.
718 */ 782 */
719 if (!migration && ((vma->vm_flags & VM_LOCKED) || 783 if (!migration) {
720 (ptep_clear_flush_young(vma, address, pte)))) { 784 if (vma->vm_flags & VM_LOCKED) {
721 ret = SWAP_FAIL; 785 ret = SWAP_MLOCK;
722 goto out_unmap; 786 goto out_unmap;
723 } 787 }
788 if (ptep_clear_flush_young_notify(vma, address, pte)) {
789 ret = SWAP_FAIL;
790 goto out_unmap;
791 }
792 }
724 793
725 /* Nuke the page table entry. */ 794 /* Nuke the page table entry. */
726 flush_cache_page(vma, address, page_to_pfn(page)); 795 flush_cache_page(vma, address, page_to_pfn(page));
727 pteval = ptep_clear_flush(vma, address, pte); 796 pteval = ptep_clear_flush_notify(vma, address, pte);
728 797
729 /* Move the dirty bit to the physical page now the pte is gone. */ 798 /* Move the dirty bit to the physical page now the pte is gone. */
730 if (pte_dirty(pteval)) 799 if (pte_dirty(pteval))
@@ -801,12 +870,17 @@ out:
801 * For very sparsely populated VMAs this is a little inefficient - chances are 870 * For very sparsely populated VMAs this is a little inefficient - chances are
802 * there won't be many ptes located within the scan cluster. In this case 871 * there won't be many ptes located within the scan cluster. In this case
803 * maybe we could scan further - to the end of the pte page, perhaps. 872 * maybe we could scan further - to the end of the pte page, perhaps.
873 *
874 * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
875 * acquire it without blocking. If vma locked, mlock the pages in the cluster,
876 * rather than unmapping them. If we encounter the "check_page" that vmscan is
877 * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
804 */ 878 */
805#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) 879#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
806#define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) 880#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
807 881
808static void try_to_unmap_cluster(unsigned long cursor, 882static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
809 unsigned int *mapcount, struct vm_area_struct *vma) 883 struct vm_area_struct *vma, struct page *check_page)
810{ 884{
811 struct mm_struct *mm = vma->vm_mm; 885 struct mm_struct *mm = vma->vm_mm;
812 pgd_t *pgd; 886 pgd_t *pgd;
@@ -818,6 +892,8 @@ static void try_to_unmap_cluster(unsigned long cursor,
818 struct page *page; 892 struct page *page;
819 unsigned long address; 893 unsigned long address;
820 unsigned long end; 894 unsigned long end;
895 int ret = SWAP_AGAIN;
896 int locked_vma = 0;
821 897
822 address = (vma->vm_start + cursor) & CLUSTER_MASK; 898 address = (vma->vm_start + cursor) & CLUSTER_MASK;
823 end = address + CLUSTER_SIZE; 899 end = address + CLUSTER_SIZE;
@@ -828,15 +904,26 @@ static void try_to_unmap_cluster(unsigned long cursor,
828 904
829 pgd = pgd_offset(mm, address); 905 pgd = pgd_offset(mm, address);
830 if (!pgd_present(*pgd)) 906 if (!pgd_present(*pgd))
831 return; 907 return ret;
832 908
833 pud = pud_offset(pgd, address); 909 pud = pud_offset(pgd, address);
834 if (!pud_present(*pud)) 910 if (!pud_present(*pud))
835 return; 911 return ret;
836 912
837 pmd = pmd_offset(pud, address); 913 pmd = pmd_offset(pud, address);
838 if (!pmd_present(*pmd)) 914 if (!pmd_present(*pmd))
839 return; 915 return ret;
916
917 /*
918 * MLOCK_PAGES => feature is configured.
919 * if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
920 * keep the sem while scanning the cluster for mlocking pages.
921 */
922 if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) {
923 locked_vma = (vma->vm_flags & VM_LOCKED);
924 if (!locked_vma)
925 up_read(&vma->vm_mm->mmap_sem); /* don't need it */
926 }
840 927
841 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 928 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
842 929
@@ -849,12 +936,19 @@ static void try_to_unmap_cluster(unsigned long cursor,
849 page = vm_normal_page(vma, address, *pte); 936 page = vm_normal_page(vma, address, *pte);
850 BUG_ON(!page || PageAnon(page)); 937 BUG_ON(!page || PageAnon(page));
851 938
852 if (ptep_clear_flush_young(vma, address, pte)) 939 if (locked_vma) {
940 mlock_vma_page(page); /* no-op if already mlocked */
941 if (page == check_page)
942 ret = SWAP_MLOCK;
943 continue; /* don't unmap */
944 }
945
946 if (ptep_clear_flush_young_notify(vma, address, pte))
853 continue; 947 continue;
854 948
855 /* Nuke the page table entry. */ 949 /* Nuke the page table entry. */
856 flush_cache_page(vma, address, pte_pfn(*pte)); 950 flush_cache_page(vma, address, pte_pfn(*pte));
857 pteval = ptep_clear_flush(vma, address, pte); 951 pteval = ptep_clear_flush_notify(vma, address, pte);
858 952
859 /* If nonlinear, store the file page offset in the pte. */ 953 /* If nonlinear, store the file page offset in the pte. */
860 if (page->index != linear_page_index(vma, address)) 954 if (page->index != linear_page_index(vma, address))
@@ -870,39 +964,104 @@ static void try_to_unmap_cluster(unsigned long cursor,
870 (*mapcount)--; 964 (*mapcount)--;
871 } 965 }
872 pte_unmap_unlock(pte - 1, ptl); 966 pte_unmap_unlock(pte - 1, ptl);
967 if (locked_vma)
968 up_read(&vma->vm_mm->mmap_sem);
969 return ret;
873} 970}
874 971
875static int try_to_unmap_anon(struct page *page, int migration) 972/*
973 * common handling for pages mapped in VM_LOCKED vmas
974 */
975static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
976{
977 int mlocked = 0;
978
979 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
980 if (vma->vm_flags & VM_LOCKED) {
981 mlock_vma_page(page);
982 mlocked++; /* really mlocked the page */
983 }
984 up_read(&vma->vm_mm->mmap_sem);
985 }
986 return mlocked;
987}
988
989/**
990 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
991 * rmap method
992 * @page: the page to unmap/unlock
993 * @unlock: request for unlock rather than unmap [unlikely]
994 * @migration: unmapping for migration - ignored if @unlock
995 *
996 * Find all the mappings of a page using the mapping pointer and the vma chains
997 * contained in the anon_vma struct it points to.
998 *
999 * This function is only called from try_to_unmap/try_to_munlock for
1000 * anonymous pages.
1001 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1002 * where the page was found will be held for write. So, we won't recheck
1003 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1004 * VM_LOCKED.
1005 */
1006static int try_to_unmap_anon(struct page *page, int unlock, int migration)
876{ 1007{
877 struct anon_vma *anon_vma; 1008 struct anon_vma *anon_vma;
878 struct vm_area_struct *vma; 1009 struct vm_area_struct *vma;
1010 unsigned int mlocked = 0;
879 int ret = SWAP_AGAIN; 1011 int ret = SWAP_AGAIN;
880 1012
1013 if (MLOCK_PAGES && unlikely(unlock))
1014 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
1015
881 anon_vma = page_lock_anon_vma(page); 1016 anon_vma = page_lock_anon_vma(page);
882 if (!anon_vma) 1017 if (!anon_vma)
883 return ret; 1018 return ret;
884 1019
885 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1020 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
886 ret = try_to_unmap_one(page, vma, migration); 1021 if (MLOCK_PAGES && unlikely(unlock)) {
887 if (ret == SWAP_FAIL || !page_mapped(page)) 1022 if (!((vma->vm_flags & VM_LOCKED) &&
888 break; 1023 page_mapped_in_vma(page, vma)))
1024 continue; /* must visit all unlocked vmas */
1025 ret = SWAP_MLOCK; /* saw at least one mlocked vma */
1026 } else {
1027 ret = try_to_unmap_one(page, vma, migration);
1028 if (ret == SWAP_FAIL || !page_mapped(page))
1029 break;
1030 }
1031 if (ret == SWAP_MLOCK) {
1032 mlocked = try_to_mlock_page(page, vma);
1033 if (mlocked)
1034 break; /* stop if actually mlocked page */
1035 }
889 } 1036 }
890 1037
891 page_unlock_anon_vma(anon_vma); 1038 page_unlock_anon_vma(anon_vma);
1039
1040 if (mlocked)
1041 ret = SWAP_MLOCK; /* actually mlocked the page */
1042 else if (ret == SWAP_MLOCK)
1043 ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
1044
892 return ret; 1045 return ret;
893} 1046}
894 1047
895/** 1048/**
896 * try_to_unmap_file - unmap file page using the object-based rmap method 1049 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
897 * @page: the page to unmap 1050 * @page: the page to unmap/unlock
898 * @migration: migration flag 1051 * @unlock: request for unlock rather than unmap [unlikely]
1052 * @migration: unmapping for migration - ignored if @unlock
899 * 1053 *
900 * Find all the mappings of a page using the mapping pointer and the vma chains 1054 * Find all the mappings of a page using the mapping pointer and the vma chains
901 * contained in the address_space struct it points to. 1055 * contained in the address_space struct it points to.
902 * 1056 *
903 * This function is only called from try_to_unmap for object-based pages. 1057 * This function is only called from try_to_unmap/try_to_munlock for
1058 * object-based pages.
1059 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1060 * where the page was found will be held for write. So, we won't recheck
1061 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
1062 * VM_LOCKED.
904 */ 1063 */
905static int try_to_unmap_file(struct page *page, int migration) 1064static int try_to_unmap_file(struct page *page, int unlock, int migration)
906{ 1065{
907 struct address_space *mapping = page->mapping; 1066 struct address_space *mapping = page->mapping;
908 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1067 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -913,20 +1072,44 @@ static int try_to_unmap_file(struct page *page, int migration)
913 unsigned long max_nl_cursor = 0; 1072 unsigned long max_nl_cursor = 0;
914 unsigned long max_nl_size = 0; 1073 unsigned long max_nl_size = 0;
915 unsigned int mapcount; 1074 unsigned int mapcount;
1075 unsigned int mlocked = 0;
1076
1077 if (MLOCK_PAGES && unlikely(unlock))
1078 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
916 1079
917 spin_lock(&mapping->i_mmap_lock); 1080 spin_lock(&mapping->i_mmap_lock);
918 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1081 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
919 ret = try_to_unmap_one(page, vma, migration); 1082 if (MLOCK_PAGES && unlikely(unlock)) {
920 if (ret == SWAP_FAIL || !page_mapped(page)) 1083 if (!(vma->vm_flags & VM_LOCKED))
921 goto out; 1084 continue; /* must visit all vmas */
1085 ret = SWAP_MLOCK;
1086 } else {
1087 ret = try_to_unmap_one(page, vma, migration);
1088 if (ret == SWAP_FAIL || !page_mapped(page))
1089 goto out;
1090 }
1091 if (ret == SWAP_MLOCK) {
1092 mlocked = try_to_mlock_page(page, vma);
1093 if (mlocked)
1094 break; /* stop if actually mlocked page */
1095 }
922 } 1096 }
923 1097
1098 if (mlocked)
1099 goto out;
1100
924 if (list_empty(&mapping->i_mmap_nonlinear)) 1101 if (list_empty(&mapping->i_mmap_nonlinear))
925 goto out; 1102 goto out;
926 1103
927 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1104 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
928 shared.vm_set.list) { 1105 shared.vm_set.list) {
929 if ((vma->vm_flags & VM_LOCKED) && !migration) 1106 if (MLOCK_PAGES && unlikely(unlock)) {
1107 if (!(vma->vm_flags & VM_LOCKED))
1108 continue; /* must visit all vmas */
1109 ret = SWAP_MLOCK; /* leave mlocked == 0 */
1110 goto out; /* no need to look further */
1111 }
1112 if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
930 continue; 1113 continue;
931 cursor = (unsigned long) vma->vm_private_data; 1114 cursor = (unsigned long) vma->vm_private_data;
932 if (cursor > max_nl_cursor) 1115 if (cursor > max_nl_cursor)
@@ -936,7 +1119,7 @@ static int try_to_unmap_file(struct page *page, int migration)
936 max_nl_size = cursor; 1119 max_nl_size = cursor;
937 } 1120 }
938 1121
939 if (max_nl_size == 0) { /* any nonlinears locked or reserved */ 1122 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
940 ret = SWAP_FAIL; 1123 ret = SWAP_FAIL;
941 goto out; 1124 goto out;
942 } 1125 }
@@ -960,12 +1143,16 @@ static int try_to_unmap_file(struct page *page, int migration)
960 do { 1143 do {
961 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1144 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
962 shared.vm_set.list) { 1145 shared.vm_set.list) {
963 if ((vma->vm_flags & VM_LOCKED) && !migration) 1146 if (!MLOCK_PAGES && !migration &&
1147 (vma->vm_flags & VM_LOCKED))
964 continue; 1148 continue;
965 cursor = (unsigned long) vma->vm_private_data; 1149 cursor = (unsigned long) vma->vm_private_data;
966 while ( cursor < max_nl_cursor && 1150 while ( cursor < max_nl_cursor &&
967 cursor < vma->vm_end - vma->vm_start) { 1151 cursor < vma->vm_end - vma->vm_start) {
968 try_to_unmap_cluster(cursor, &mapcount, vma); 1152 ret = try_to_unmap_cluster(cursor, &mapcount,
1153 vma, page);
1154 if (ret == SWAP_MLOCK)
1155 mlocked = 2; /* to return below */
969 cursor += CLUSTER_SIZE; 1156 cursor += CLUSTER_SIZE;
970 vma->vm_private_data = (void *) cursor; 1157 vma->vm_private_data = (void *) cursor;
971 if ((int)mapcount <= 0) 1158 if ((int)mapcount <= 0)
@@ -986,6 +1173,10 @@ static int try_to_unmap_file(struct page *page, int migration)
986 vma->vm_private_data = NULL; 1173 vma->vm_private_data = NULL;
987out: 1174out:
988 spin_unlock(&mapping->i_mmap_lock); 1175 spin_unlock(&mapping->i_mmap_lock);
1176 if (mlocked)
1177 ret = SWAP_MLOCK; /* actually mlocked the page */
1178 else if (ret == SWAP_MLOCK)
1179 ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
989 return ret; 1180 return ret;
990} 1181}
991 1182
@@ -1001,6 +1192,7 @@ out:
1001 * SWAP_SUCCESS - we succeeded in removing all mappings 1192 * SWAP_SUCCESS - we succeeded in removing all mappings
1002 * SWAP_AGAIN - we missed a mapping, try again later 1193 * SWAP_AGAIN - we missed a mapping, try again later
1003 * SWAP_FAIL - the page is unswappable 1194 * SWAP_FAIL - the page is unswappable
1195 * SWAP_MLOCK - page is mlocked.
1004 */ 1196 */
1005int try_to_unmap(struct page *page, int migration) 1197int try_to_unmap(struct page *page, int migration)
1006{ 1198{
@@ -1009,12 +1201,36 @@ int try_to_unmap(struct page *page, int migration)
1009 BUG_ON(!PageLocked(page)); 1201 BUG_ON(!PageLocked(page));
1010 1202
1011 if (PageAnon(page)) 1203 if (PageAnon(page))
1012 ret = try_to_unmap_anon(page, migration); 1204 ret = try_to_unmap_anon(page, 0, migration);
1013 else 1205 else
1014 ret = try_to_unmap_file(page, migration); 1206 ret = try_to_unmap_file(page, 0, migration);
1015 1207 if (ret != SWAP_MLOCK && !page_mapped(page))
1016 if (!page_mapped(page))
1017 ret = SWAP_SUCCESS; 1208 ret = SWAP_SUCCESS;
1018 return ret; 1209 return ret;
1019} 1210}
1020 1211
1212#ifdef CONFIG_UNEVICTABLE_LRU
1213/**
1214 * try_to_munlock - try to munlock a page
1215 * @page: the page to be munlocked
1216 *
1217 * Called from munlock code. Checks all of the VMAs mapping the page
1218 * to make sure nobody else has this page mlocked. The page will be
1219 * returned with PG_mlocked cleared if no other vmas have it mlocked.
1220 *
1221 * Return values are:
1222 *
1223 * SWAP_SUCCESS - no vma's holding page mlocked.
1224 * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
1225 * SWAP_MLOCK - page is now mlocked.
1226 */
1227int try_to_munlock(struct page *page)
1228{
1229 VM_BUG_ON(!PageLocked(page) || PageLRU(page));
1230
1231 if (PageAnon(page))
1232 return try_to_unmap_anon(page, 1, 0);
1233 else
1234 return try_to_unmap_file(page, 1, 0);
1235}
1236#endif
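
The rmap changes above teach try_to_unmap()/try_to_munlock() to report SWAP_MLOCK and to mlock pages found in VM_LOCKED vmas rather than reclaim them. From userspace, the state they track is driven by plain mlock()/munlock(); a small illustrative program, not part of the patch:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    size_t len = 4096 * 16;
    void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

    if (buf == MAP_FAILED)
        return 1;

    memset(buf, 0, len);              /* fault the pages in */
    if (mlock(buf, len))              /* pin: try_to_unmap() now sees VM_LOCKED */
        perror("mlock");
    /* ... use the pinned buffer ... */
    munlock(buf, len);                /* pages become evictable again */
    munmap(buf, len);
    return 0;
}
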
diff --git a/mm/shmem.c b/mm/shmem.c
index e2a6ae1a44e9..d38d7e61fcd0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -50,14 +50,12 @@
50#include <linux/migrate.h> 50#include <linux/migrate.h>
51#include <linux/highmem.h> 51#include <linux/highmem.h>
52#include <linux/seq_file.h> 52#include <linux/seq_file.h>
53#include <linux/magic.h>
53 54
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55#include <asm/div64.h> 56#include <asm/div64.h>
56#include <asm/pgtable.h> 57#include <asm/pgtable.h>
57 58
58/* This magic number is used in glibc for posix shared memory */
59#define TMPFS_MAGIC 0x01021994
60
61#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) 59#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
62#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) 60#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
63#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) 61#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
@@ -201,7 +199,7 @@ static struct vm_operations_struct shmem_vm_ops;
201 199
202static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 200static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
203 .ra_pages = 0, /* No readahead */ 201 .ra_pages = 0, /* No readahead */
204 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 202 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
205 .unplug_io_fn = default_unplug_io_fn, 203 .unplug_io_fn = default_unplug_io_fn,
206}; 204};
207 205
@@ -922,20 +920,26 @@ found:
922 error = 1; 920 error = 1;
923 if (!inode) 921 if (!inode)
924 goto out; 922 goto out;
925 /* Precharge page while we can wait, compensate afterwards */ 923 /* Precharge page using GFP_KERNEL while we can wait */
926 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 924 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
927 if (error) 925 if (error)
928 goto out; 926 goto out;
929 error = radix_tree_preload(GFP_KERNEL); 927 error = radix_tree_preload(GFP_KERNEL);
930 if (error) 928 if (error) {
931 goto uncharge; 929 mem_cgroup_uncharge_cache_page(page);
930 goto out;
931 }
932 error = 1; 932 error = 1;
933 933
934 spin_lock(&info->lock); 934 spin_lock(&info->lock);
935 ptr = shmem_swp_entry(info, idx, NULL); 935 ptr = shmem_swp_entry(info, idx, NULL);
936 if (ptr && ptr->val == entry.val) 936 if (ptr && ptr->val == entry.val) {
937 error = add_to_page_cache(page, inode->i_mapping, 937 error = add_to_page_cache_locked(page, inode->i_mapping,
938 idx, GFP_NOWAIT); 938 idx, GFP_NOWAIT);
939 /* does mem_cgroup_uncharge_cache_page on error */
940 } else /* we must compensate for our precharge above */
941 mem_cgroup_uncharge_cache_page(page);
942
939 if (error == -EEXIST) { 943 if (error == -EEXIST) {
940 struct page *filepage = find_get_page(inode->i_mapping, idx); 944 struct page *filepage = find_get_page(inode->i_mapping, idx);
941 error = 1; 945 error = 1;
@@ -961,8 +965,6 @@ found:
961 shmem_swp_unmap(ptr); 965 shmem_swp_unmap(ptr);
962 spin_unlock(&info->lock); 966 spin_unlock(&info->lock);
963 radix_tree_preload_end(); 967 radix_tree_preload_end();
964uncharge:
965 mem_cgroup_uncharge_page(page);
966out: 968out:
967 unlock_page(page); 969 unlock_page(page);
968 page_cache_release(page); 970 page_cache_release(page);
@@ -1261,7 +1263,7 @@ repeat:
1261 } 1263 }
1262 1264
1263 /* We have to do this with page locked to prevent races */ 1265 /* We have to do this with page locked to prevent races */
1264 if (TestSetPageLocked(swappage)) { 1266 if (!trylock_page(swappage)) {
1265 shmem_swp_unmap(entry); 1267 shmem_swp_unmap(entry);
1266 spin_unlock(&info->lock); 1268 spin_unlock(&info->lock);
1267 wait_on_page_locked(swappage); 1269 wait_on_page_locked(swappage);
@@ -1297,8 +1299,8 @@ repeat:
1297 SetPageUptodate(filepage); 1299 SetPageUptodate(filepage);
1298 set_page_dirty(filepage); 1300 set_page_dirty(filepage);
1299 swap_free(swap); 1301 swap_free(swap);
1300 } else if (!(error = add_to_page_cache( 1302 } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1301 swappage, mapping, idx, GFP_NOWAIT))) { 1303 idx, GFP_NOWAIT))) {
1302 info->flags |= SHMEM_PAGEIN; 1304 info->flags |= SHMEM_PAGEIN;
1303 shmem_swp_set(info, entry, 0); 1305 shmem_swp_set(info, entry, 0);
1304 shmem_swp_unmap(entry); 1306 shmem_swp_unmap(entry);
@@ -1311,24 +1313,21 @@ repeat:
1311 shmem_swp_unmap(entry); 1313 shmem_swp_unmap(entry);
1312 spin_unlock(&info->lock); 1314 spin_unlock(&info->lock);
1313 unlock_page(swappage); 1315 unlock_page(swappage);
1316 page_cache_release(swappage);
1314 if (error == -ENOMEM) { 1317 if (error == -ENOMEM) {
1315 /* allow reclaim from this memory cgroup */ 1318 /* allow reclaim from this memory cgroup */
1316 error = mem_cgroup_cache_charge(swappage, 1319 error = mem_cgroup_shrink_usage(current->mm,
1317 current->mm, gfp & ~__GFP_HIGHMEM); 1320 gfp);
1318 if (error) { 1321 if (error)
1319 page_cache_release(swappage);
1320 goto failed; 1322 goto failed;
1321 }
1322 mem_cgroup_uncharge_page(swappage);
1323 } 1323 }
1324 page_cache_release(swappage);
1325 goto repeat; 1324 goto repeat;
1326 } 1325 }
1327 } else if (sgp == SGP_READ && !filepage) { 1326 } else if (sgp == SGP_READ && !filepage) {
1328 shmem_swp_unmap(entry); 1327 shmem_swp_unmap(entry);
1329 filepage = find_get_page(mapping, idx); 1328 filepage = find_get_page(mapping, idx);
1330 if (filepage && 1329 if (filepage &&
1331 (!PageUptodate(filepage) || TestSetPageLocked(filepage))) { 1330 (!PageUptodate(filepage) || !trylock_page(filepage))) {
1332 spin_unlock(&info->lock); 1331 spin_unlock(&info->lock);
1333 wait_on_page_locked(filepage); 1332 wait_on_page_locked(filepage);
1334 page_cache_release(filepage); 1333 page_cache_release(filepage);
@@ -1358,6 +1357,8 @@ repeat:
1358 } 1357 }
1359 1358
1360 if (!filepage) { 1359 if (!filepage) {
1360 int ret;
1361
1361 spin_unlock(&info->lock); 1362 spin_unlock(&info->lock);
1362 filepage = shmem_alloc_page(gfp, info, idx); 1363 filepage = shmem_alloc_page(gfp, info, idx);
1363 if (!filepage) { 1364 if (!filepage) {
@@ -1366,6 +1367,7 @@ repeat:
1366 error = -ENOMEM; 1367 error = -ENOMEM;
1367 goto failed; 1368 goto failed;
1368 } 1369 }
1370 SetPageSwapBacked(filepage);
1369 1371
1370 /* Precharge page while we can wait, compensate after */ 1372 /* Precharge page while we can wait, compensate after */
1371 error = mem_cgroup_cache_charge(filepage, current->mm, 1373 error = mem_cgroup_cache_charge(filepage, current->mm,
@@ -1386,10 +1388,18 @@ repeat:
1386 swap = *entry; 1388 swap = *entry;
1387 shmem_swp_unmap(entry); 1389 shmem_swp_unmap(entry);
1388 } 1390 }
1389 if (error || swap.val || 0 != add_to_page_cache_lru( 1391 ret = error || swap.val;
1390 filepage, mapping, idx, GFP_NOWAIT)) { 1392 if (ret)
1393 mem_cgroup_uncharge_cache_page(filepage);
1394 else
1395 ret = add_to_page_cache_lru(filepage, mapping,
1396 idx, GFP_NOWAIT);
1397 /*
1398 * At add_to_page_cache_lru() failure, uncharge will
1399 * be done automatically.
1400 */
1401 if (ret) {
1391 spin_unlock(&info->lock); 1402 spin_unlock(&info->lock);
1392 mem_cgroup_uncharge_page(filepage);
1393 page_cache_release(filepage); 1403 page_cache_release(filepage);
1394 shmem_unacct_blocks(info->flags, 1); 1404 shmem_unacct_blocks(info->flags, 1);
1395 shmem_free_blocks(inode, 1); 1405 shmem_free_blocks(inode, 1);
@@ -1398,7 +1408,6 @@ repeat:
1398 goto failed; 1408 goto failed;
1399 goto repeat; 1409 goto repeat;
1400 } 1410 }
1401 mem_cgroup_uncharge_page(filepage);
1402 info->flags |= SHMEM_PAGEIN; 1411 info->flags |= SHMEM_PAGEIN;
1403 } 1412 }
1404 1413
@@ -1468,12 +1477,16 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
1468 if (!user_shm_lock(inode->i_size, user)) 1477 if (!user_shm_lock(inode->i_size, user))
1469 goto out_nomem; 1478 goto out_nomem;
1470 info->flags |= VM_LOCKED; 1479 info->flags |= VM_LOCKED;
1480 mapping_set_unevictable(file->f_mapping);
1471 } 1481 }
1472 if (!lock && (info->flags & VM_LOCKED) && user) { 1482 if (!lock && (info->flags & VM_LOCKED) && user) {
1473 user_shm_unlock(inode->i_size, user); 1483 user_shm_unlock(inode->i_size, user);
1474 info->flags &= ~VM_LOCKED; 1484 info->flags &= ~VM_LOCKED;
1485 mapping_clear_unevictable(file->f_mapping);
1486 scan_mapping_unevictable_pages(file->f_mapping);
1475 } 1487 }
1476 retval = 0; 1488 retval = 0;
1489
1477out_nomem: 1490out_nomem:
1478 spin_unlock(&info->lock); 1491 spin_unlock(&info->lock);
1479 return retval; 1492 return retval;
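
shmem_lock() above now also flags the mapping unevictable and, on unlock, rescans it so previously locked pages can return to the normal LRUs. The user-visible trigger is SysV shared memory locking; a minimal illustration (needs CAP_IPC_LOCK or a sufficient RLIMIT_MEMLOCK):

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
    int id = shmget(IPC_PRIVATE, 1 << 20, IPC_CREAT | 0600);

    if (id < 0)
        return 1;
    if (shmctl(id, SHM_LOCK, NULL))      /* reaches shmem_lock(file, 1, user) */
        perror("SHM_LOCK");
    if (shmctl(id, SHM_UNLOCK, NULL))    /* reaches shmem_lock(file, 0, user) */
        perror("SHM_UNLOCK");
    shmctl(id, IPC_RMID, NULL);
    return 0;
}
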
@@ -1503,7 +1516,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1503 inode->i_uid = current->fsuid; 1516 inode->i_uid = current->fsuid;
1504 inode->i_gid = current->fsgid; 1517 inode->i_gid = current->fsgid;
1505 inode->i_blocks = 0; 1518 inode->i_blocks = 0;
1506 inode->i_mapping->a_ops = &shmem_aops;
1507 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1519 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1508 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1520 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1509 inode->i_generation = get_seconds(); 1521 inode->i_generation = get_seconds();
@@ -1518,6 +1530,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1518 init_special_inode(inode, mode, dev); 1530 init_special_inode(inode, mode, dev);
1519 break; 1531 break;
1520 case S_IFREG: 1532 case S_IFREG:
1533 inode->i_mapping->a_ops = &shmem_aops;
1521 inode->i_op = &shmem_inode_operations; 1534 inode->i_op = &shmem_inode_operations;
1522 inode->i_fop = &shmem_file_operations; 1535 inode->i_fop = &shmem_file_operations;
1523 mpol_shared_policy_init(&info->policy, 1536 mpol_shared_policy_init(&info->policy,
@@ -1690,26 +1703,38 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1690 file_accessed(filp); 1703 file_accessed(filp);
1691} 1704}
1692 1705
1693static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) 1706static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1707 const struct iovec *iov, unsigned long nr_segs, loff_t pos)
1694{ 1708{
1695 read_descriptor_t desc; 1709 struct file *filp = iocb->ki_filp;
1710 ssize_t retval;
1711 unsigned long seg;
1712 size_t count;
1713 loff_t *ppos = &iocb->ki_pos;
1696 1714
1697 if ((ssize_t) count < 0) 1715 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1698 return -EINVAL; 1716 if (retval)
1699 if (!access_ok(VERIFY_WRITE, buf, count)) 1717 return retval;
1700 return -EFAULT;
1701 if (!count)
1702 return 0;
1703 1718
1704 desc.written = 0; 1719 for (seg = 0; seg < nr_segs; seg++) {
1705 desc.count = count; 1720 read_descriptor_t desc;
1706 desc.arg.buf = buf;
1707 desc.error = 0;
1708 1721
1709 do_shmem_file_read(filp, ppos, &desc, file_read_actor); 1722 desc.written = 0;
1710 if (desc.written) 1723 desc.arg.buf = iov[seg].iov_base;
1711 return desc.written; 1724 desc.count = iov[seg].iov_len;
1712 return desc.error; 1725 if (desc.count == 0)
1726 continue;
1727 desc.error = 0;
1728 do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1729 retval += desc.written;
1730 if (desc.error) {
1731 retval = retval ?: desc.error;
1732 break;
1733 }
1734 if (desc.count > 0)
1735 break;
1736 }
1737 return retval;
1713} 1738}
1714 1739
1715static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1740static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
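
Replacing shmem_file_read() with do_sync_read plus shmem_file_aio_read() routes tmpfs reads through the generic sync/aio iovec path, so vectored reads no longer need a special case. An illustrative userspace use of a vectored read against a tmpfs-backed file; the /dev/shm path is an assumption about where tmpfs is mounted:

#include <fcntl.h>
#include <stdio.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
    char a[8] = {0}, b[8] = {0};
    struct iovec iov[2] = {
        { .iov_base = a, .iov_len = sizeof(a) - 1 },
        { .iov_base = b, .iov_len = sizeof(b) - 1 },
    };
    int fd = open("/dev/shm/readv-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

    if (fd < 0)
        return 1;
    write(fd, "hello, tmpfs!", 13);
    lseek(fd, 0, SEEK_SET);
    if (readv(fd, iov, 2) < 0)           /* one call fills both buffers in order */
        perror("readv");
    printf("'%s' + '%s'\n", a, b);
    close(fd);
    unlink("/dev/shm/readv-demo");
    return 0;
}
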
@@ -1907,6 +1932,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1907 return error; 1932 return error;
1908 } 1933 }
1909 unlock_page(page); 1934 unlock_page(page);
1935 inode->i_mapping->a_ops = &shmem_aops;
1910 inode->i_op = &shmem_symlink_inode_operations; 1936 inode->i_op = &shmem_symlink_inode_operations;
1911 kaddr = kmap_atomic(page, KM_USER0); 1937 kaddr = kmap_atomic(page, KM_USER0);
1912 memcpy(kaddr, symname, len); 1938 memcpy(kaddr, symname, len);
@@ -2330,7 +2356,7 @@ static void shmem_destroy_inode(struct inode *inode)
2330 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2356 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2331} 2357}
2332 2358
2333static void init_once(struct kmem_cache *cachep, void *foo) 2359static void init_once(void *foo)
2334{ 2360{
2335 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2361 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2336 2362
@@ -2369,8 +2395,9 @@ static const struct file_operations shmem_file_operations = {
2369 .mmap = shmem_mmap, 2395 .mmap = shmem_mmap,
2370#ifdef CONFIG_TMPFS 2396#ifdef CONFIG_TMPFS
2371 .llseek = generic_file_llseek, 2397 .llseek = generic_file_llseek,
2372 .read = shmem_file_read, 2398 .read = do_sync_read,
2373 .write = do_sync_write, 2399 .write = do_sync_write,
2400 .aio_read = shmem_file_aio_read,
2374 .aio_write = generic_file_aio_write, 2401 .aio_write = generic_file_aio_write,
2375 .fsync = simple_sync_file, 2402 .fsync = simple_sync_file,
2376 .splice_read = generic_file_splice_read, 2403 .splice_read = generic_file_splice_read,
@@ -2558,6 +2585,7 @@ put_memory:
2558 shmem_unacct_size(flags, size); 2585 shmem_unacct_size(flags, size);
2559 return ERR_PTR(error); 2586 return ERR_PTR(error);
2560} 2587}
2588EXPORT_SYMBOL_GPL(shmem_file_setup);
2561 2589
2562/** 2590/**
2563 * shmem_zero_setup - setup a shared anonymous mapping 2591 * shmem_zero_setup - setup a shared anonymous mapping
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index f5664c5b9eb1..8e5aadd7dcd6 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -191,7 +191,7 @@ shmem_check_acl(struct inode *inode, int mask)
191 * shmem_permission - permission() inode operation 191 * shmem_permission - permission() inode operation
192 */ 192 */
193int 193int
194shmem_permission(struct inode *inode, int mask, struct nameidata *nd) 194shmem_permission(struct inode *inode, int mask)
195{ 195{
196 return generic_permission(inode, mask, shmem_check_acl); 196 return generic_permission(inode, mask, shmem_check_acl);
197} 197}
diff --git a/mm/slab.c b/mm/slab.c
index 052e7d64537e..09187517f9dc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -95,6 +95,7 @@
95#include <linux/init.h> 95#include <linux/init.h>
96#include <linux/compiler.h> 96#include <linux/compiler.h>
97#include <linux/cpuset.h> 97#include <linux/cpuset.h>
98#include <linux/proc_fs.h>
98#include <linux/seq_file.h> 99#include <linux/seq_file.h>
99#include <linux/notifier.h> 100#include <linux/notifier.h>
100#include <linux/kallsyms.h> 101#include <linux/kallsyms.h>
@@ -406,7 +407,7 @@ struct kmem_cache {
406 unsigned int dflags; /* dynamic flags */ 407 unsigned int dflags; /* dynamic flags */
407 408
408 /* constructor func */ 409 /* constructor func */
409 void (*ctor)(struct kmem_cache *, void *); 410 void (*ctor)(void *obj);
410 411
411/* 5) cache creation/removal */ 412/* 5) cache creation/removal */
412 const char *name; 413 const char *name;
@@ -2137,8 +2138,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2137 */ 2138 */
2138struct kmem_cache * 2139struct kmem_cache *
2139kmem_cache_create (const char *name, size_t size, size_t align, 2140kmem_cache_create (const char *name, size_t size, size_t align,
2140 unsigned long flags, 2141 unsigned long flags, void (*ctor)(void *))
2141 void (*ctor)(struct kmem_cache *, void *))
2142{ 2142{
2143 size_t left_over, slab_size, ralign; 2143 size_t left_over, slab_size, ralign;
2144 struct kmem_cache *cachep = NULL, *pc; 2144 struct kmem_cache *cachep = NULL, *pc;
@@ -2653,7 +2653,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2653 * They must also be threaded. 2653 * They must also be threaded.
2654 */ 2654 */
2655 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2655 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2656 cachep->ctor(cachep, objp + obj_offset(cachep)); 2656 cachep->ctor(objp + obj_offset(cachep));
2657 2657
2658 if (cachep->flags & SLAB_RED_ZONE) { 2658 if (cachep->flags & SLAB_RED_ZONE) {
2659 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2659 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2669,7 +2669,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2669 cachep->buffer_size / PAGE_SIZE, 0); 2669 cachep->buffer_size / PAGE_SIZE, 0);
2670#else 2670#else
2671 if (cachep->ctor) 2671 if (cachep->ctor)
2672 cachep->ctor(cachep, objp); 2672 cachep->ctor(objp);
2673#endif 2673#endif
2674 slab_bufctl(slabp)[i] = i + 1; 2674 slab_bufctl(slabp)[i] = i + 1;
2675 } 2675 }
@@ -3093,7 +3093,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3093#endif 3093#endif
3094 objp += obj_offset(cachep); 3094 objp += obj_offset(cachep);
3095 if (cachep->ctor && cachep->flags & SLAB_POISON) 3095 if (cachep->ctor && cachep->flags & SLAB_POISON)
3096 cachep->ctor(cachep, objp); 3096 cachep->ctor(objp);
3097#if ARCH_SLAB_MINALIGN 3097#if ARCH_SLAB_MINALIGN
3098 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3098 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3099 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3099 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
@@ -4259,7 +4259,7 @@ static int s_show(struct seq_file *m, void *p)
4259 * + further values on SMP and with statistics enabled 4259 * + further values on SMP and with statistics enabled
4260 */ 4260 */
4261 4261
4262const struct seq_operations slabinfo_op = { 4262static const struct seq_operations slabinfo_op = {
4263 .start = s_start, 4263 .start = s_start,
4264 .next = s_next, 4264 .next = s_next,
4265 .stop = s_stop, 4265 .stop = s_stop,
@@ -4316,6 +4316,19 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4316 return res; 4316 return res;
4317} 4317}
4318 4318
4319static int slabinfo_open(struct inode *inode, struct file *file)
4320{
4321 return seq_open(file, &slabinfo_op);
4322}
4323
4324static const struct file_operations proc_slabinfo_operations = {
4325 .open = slabinfo_open,
4326 .read = seq_read,
4327 .write = slabinfo_write,
4328 .llseek = seq_lseek,
4329 .release = seq_release,
4330};
4331
4319#ifdef CONFIG_DEBUG_SLAB_LEAK 4332#ifdef CONFIG_DEBUG_SLAB_LEAK
4320 4333
4321static void *leaks_start(struct seq_file *m, loff_t *pos) 4334static void *leaks_start(struct seq_file *m, loff_t *pos)
@@ -4444,13 +4457,47 @@ static int leaks_show(struct seq_file *m, void *p)
4444 return 0; 4457 return 0;
4445} 4458}
4446 4459
4447const struct seq_operations slabstats_op = { 4460static const struct seq_operations slabstats_op = {
4448 .start = leaks_start, 4461 .start = leaks_start,
4449 .next = s_next, 4462 .next = s_next,
4450 .stop = s_stop, 4463 .stop = s_stop,
4451 .show = leaks_show, 4464 .show = leaks_show,
4452}; 4465};
4466
4467static int slabstats_open(struct inode *inode, struct file *file)
4468{
4469 unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
4470 int ret = -ENOMEM;
4471 if (n) {
4472 ret = seq_open(file, &slabstats_op);
4473 if (!ret) {
4474 struct seq_file *m = file->private_data;
4475 *n = PAGE_SIZE / (2 * sizeof(unsigned long));
4476 m->private = n;
4477 n = NULL;
4478 }
4479 kfree(n);
4480 }
4481 return ret;
4482}
4483
4484static const struct file_operations proc_slabstats_operations = {
4485 .open = slabstats_open,
4486 .read = seq_read,
4487 .llseek = seq_lseek,
4488 .release = seq_release_private,
4489};
4490#endif
4491
4492static int __init slab_proc_init(void)
4493{
4494 proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
4495#ifdef CONFIG_DEBUG_SLAB_LEAK
4496 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
4453#endif 4497#endif
4498 return 0;
4499}
4500module_init(slab_proc_init);
4454#endif 4501#endif
4455 4502
4456/** 4503/**
@@ -4473,4 +4520,3 @@ size_t ksize(const void *objp)
4473 4520
4474 return obj_size(virt_to_cache(objp)); 4521 return obj_size(virt_to_cache(objp));
4475} 4522}
4476EXPORT_SYMBOL(ksize);
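
The slab.c hunks above stop exporting slabinfo_op/slabstats_op and instead register /proc/slabinfo and /proc/slab_allocators from within slab.c via proc_create(). The files read the same as before; for example (usually requires root):

#include <stdio.h>

int main(void)
{
    char line[512];
    FILE *f = fopen("/proc/slabinfo", "r");

    if (!f) {
        perror("/proc/slabinfo");
        return 1;
    }
    for (int i = 0; i < 5 && fgets(line, sizeof(line), f); i++)
        fputs(line, stdout);             /* header plus a few cache lines */
    fclose(f);
    return 0;
}
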
diff --git a/mm/slob.c b/mm/slob.c
index a3ad6671adf1..cb675d126791 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -130,17 +130,17 @@ static LIST_HEAD(free_slob_large);
130 */ 130 */
131static inline int slob_page(struct slob_page *sp) 131static inline int slob_page(struct slob_page *sp)
132{ 132{
133 return test_bit(PG_active, &sp->flags); 133 return PageSlobPage((struct page *)sp);
134} 134}
135 135
136static inline void set_slob_page(struct slob_page *sp) 136static inline void set_slob_page(struct slob_page *sp)
137{ 137{
138 __set_bit(PG_active, &sp->flags); 138 __SetPageSlobPage((struct page *)sp);
139} 139}
140 140
141static inline void clear_slob_page(struct slob_page *sp) 141static inline void clear_slob_page(struct slob_page *sp)
142{ 142{
143 __clear_bit(PG_active, &sp->flags); 143 __ClearPageSlobPage((struct page *)sp);
144} 144}
145 145
146/* 146/*
@@ -148,19 +148,19 @@ static inline void clear_slob_page(struct slob_page *sp)
148 */ 148 */
149static inline int slob_page_free(struct slob_page *sp) 149static inline int slob_page_free(struct slob_page *sp)
150{ 150{
151 return test_bit(PG_private, &sp->flags); 151 return PageSlobFree((struct page *)sp);
152} 152}
153 153
154static void set_slob_page_free(struct slob_page *sp, struct list_head *list) 154static void set_slob_page_free(struct slob_page *sp, struct list_head *list)
155{ 155{
156 list_add(&sp->list, list); 156 list_add(&sp->list, list);
157 __set_bit(PG_private, &sp->flags); 157 __SetPageSlobFree((struct page *)sp);
158} 158}
159 159
160static inline void clear_slob_page_free(struct slob_page *sp) 160static inline void clear_slob_page_free(struct slob_page *sp)
161{ 161{
162 list_del(&sp->list); 162 list_del(&sp->list);
163 __clear_bit(PG_private, &sp->flags); 163 __ClearPageSlobFree((struct page *)sp);
164} 164}
165 165
166#define SLOB_UNIT sizeof(slob_t) 166#define SLOB_UNIT sizeof(slob_t)
@@ -514,23 +514,23 @@ size_t ksize(const void *block)
514 return 0; 514 return 0;
515 515
516 sp = (struct slob_page *)virt_to_page(block); 516 sp = (struct slob_page *)virt_to_page(block);
517 if (slob_page(sp)) 517 if (slob_page(sp)) {
518 return ((slob_t *)block - 1)->units + SLOB_UNIT; 518 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
519 else 519 unsigned int *m = (unsigned int *)(block - align);
520 return SLOB_UNITS(*m) * SLOB_UNIT;
521 } else
520 return sp->page.private; 522 return sp->page.private;
521} 523}
522EXPORT_SYMBOL(ksize);
523 524
524struct kmem_cache { 525struct kmem_cache {
525 unsigned int size, align; 526 unsigned int size, align;
526 unsigned long flags; 527 unsigned long flags;
527 const char *name; 528 const char *name;
528 void (*ctor)(struct kmem_cache *, void *); 529 void (*ctor)(void *);
529}; 530};
530 531
531struct kmem_cache *kmem_cache_create(const char *name, size_t size, 532struct kmem_cache *kmem_cache_create(const char *name, size_t size,
532 size_t align, unsigned long flags, 533 size_t align, unsigned long flags, void (*ctor)(void *))
533 void (*ctor)(struct kmem_cache *, void *))
534{ 534{
535 struct kmem_cache *c; 535 struct kmem_cache *c;
536 536
@@ -575,7 +575,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
575 b = slob_new_page(flags, get_order(c->size), node); 575 b = slob_new_page(flags, get_order(c->size), node);
576 576
577 if (c->ctor) 577 if (c->ctor)
578 c->ctor(c, b); 578 c->ctor(b);
579 579
580 return b; 580 return b;
581} 581}
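
Across slab, slob and slub the constructor callback loses its struct kmem_cache argument and becomes a plain void (*ctor)(void *). A toy userspace cache showing the new single-argument convention; the cache structure below is invented for illustration and is not the kernel API:

#include <stdio.h>
#include <stdlib.h>

struct obj_cache {
    size_t size;
    void (*ctor)(void *);        /* new-style: constructor gets only the object */
};

struct foo { int refcount; };

static void foo_ctor(void *p)
{
    ((struct foo *)p)->refcount = 1;
}

static void *cache_alloc(struct obj_cache *c)
{
    void *p = malloc(c->size);

    if (p && c->ctor)
        c->ctor(p);              /* no cache pointer passed any more */
    return p;
}

int main(void)
{
    struct obj_cache cache = { sizeof(struct foo), foo_ctor };
    struct foo *f = cache_alloc(&cache);

    printf("%d\n", f ? f->refcount : -1);
    free(f);
    return 0;
}
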
diff --git a/mm/slub.c b/mm/slub.c
index 35ab38a94b46..7ad489af9561 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -14,6 +14,7 @@
14#include <linux/interrupt.h> 14#include <linux/interrupt.h>
15#include <linux/bitops.h> 15#include <linux/bitops.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h>
17#include <linux/seq_file.h> 18#include <linux/seq_file.h>
18#include <linux/cpu.h> 19#include <linux/cpu.h>
19#include <linux/cpuset.h> 20#include <linux/cpuset.h>
@@ -102,44 +103,12 @@
102 * the fast path and disables lockless freelists. 103 * the fast path and disables lockless freelists.
103 */ 104 */
104 105
105#define FROZEN (1 << PG_active)
106
107#ifdef CONFIG_SLUB_DEBUG 106#ifdef CONFIG_SLUB_DEBUG
108#define SLABDEBUG (1 << PG_error) 107#define SLABDEBUG 1
109#else 108#else
110#define SLABDEBUG 0 109#define SLABDEBUG 0
111#endif 110#endif
112 111
113static inline int SlabFrozen(struct page *page)
114{
115 return page->flags & FROZEN;
116}
117
118static inline void SetSlabFrozen(struct page *page)
119{
120 page->flags |= FROZEN;
121}
122
123static inline void ClearSlabFrozen(struct page *page)
124{
125 page->flags &= ~FROZEN;
126}
127
128static inline int SlabDebug(struct page *page)
129{
130 return page->flags & SLABDEBUG;
131}
132
133static inline void SetSlabDebug(struct page *page)
134{
135 page->flags |= SLABDEBUG;
136}
137
138static inline void ClearSlabDebug(struct page *page)
139{
140 page->flags &= ~SLABDEBUG;
141}
142
143/* 112/*
144 * Issues still to be resolved: 113 * Issues still to be resolved:
145 * 114 *
@@ -492,7 +461,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
492 if (p > addr + 16) 461 if (p > addr + 16)
493 print_section("Bytes b4", p - 16, 16); 462 print_section("Bytes b4", p - 16, 16);
494 463
495 print_section("Object", p, min(s->objsize, 128)); 464 print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE));
496 465
497 if (s->flags & SLAB_RED_ZONE) 466 if (s->flags & SLAB_RED_ZONE)
498 print_section("Redzone", p + s->objsize, 467 print_section("Redzone", p + s->objsize,
@@ -971,7 +940,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
971 } 940 }
972 941
973 /* Special debug activities for freeing objects */ 942 /* Special debug activities for freeing objects */
974 if (!SlabFrozen(page) && !page->freelist) 943 if (!PageSlubFrozen(page) && !page->freelist)
975 remove_full(s, page); 944 remove_full(s, page);
976 if (s->flags & SLAB_STORE_USER) 945 if (s->flags & SLAB_STORE_USER)
977 set_track(s, object, TRACK_FREE, addr); 946 set_track(s, object, TRACK_FREE, addr);
@@ -1044,7 +1013,7 @@ __setup("slub_debug", setup_slub_debug);
1044 1013
1045static unsigned long kmem_cache_flags(unsigned long objsize, 1014static unsigned long kmem_cache_flags(unsigned long objsize,
1046 unsigned long flags, const char *name, 1015 unsigned long flags, const char *name,
1047 void (*ctor)(struct kmem_cache *, void *)) 1016 void (*ctor)(void *))
1048{ 1017{
1049 /* 1018 /*
1050 * Enable debugging if selected on the kernel commandline. 1019 * Enable debugging if selected on the kernel commandline.
@@ -1072,7 +1041,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1072static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1041static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1073static inline unsigned long kmem_cache_flags(unsigned long objsize, 1042static inline unsigned long kmem_cache_flags(unsigned long objsize,
1074 unsigned long flags, const char *name, 1043 unsigned long flags, const char *name,
1075 void (*ctor)(struct kmem_cache *, void *)) 1044 void (*ctor)(void *))
1076{ 1045{
1077 return flags; 1046 return flags;
1078} 1047}
@@ -1135,7 +1104,7 @@ static void setup_object(struct kmem_cache *s, struct page *page,
1135{ 1104{
1136 setup_object_debug(s, page, object); 1105 setup_object_debug(s, page, object);
1137 if (unlikely(s->ctor)) 1106 if (unlikely(s->ctor))
1138 s->ctor(s, object); 1107 s->ctor(object);
1139} 1108}
1140 1109
1141static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1110static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1157,7 +1126,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1157 page->flags |= 1 << PG_slab; 1126 page->flags |= 1 << PG_slab;
1158 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 1127 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
1159 SLAB_STORE_USER | SLAB_TRACE)) 1128 SLAB_STORE_USER | SLAB_TRACE))
1160 SetSlabDebug(page); 1129 __SetPageSlubDebug(page);
1161 1130
1162 start = page_address(page); 1131 start = page_address(page);
1163 1132
@@ -1184,14 +1153,14 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1184 int order = compound_order(page); 1153 int order = compound_order(page);
1185 int pages = 1 << order; 1154 int pages = 1 << order;
1186 1155
1187 if (unlikely(SlabDebug(page))) { 1156 if (unlikely(SLABDEBUG && PageSlubDebug(page))) {
1188 void *p; 1157 void *p;
1189 1158
1190 slab_pad_check(s, page); 1159 slab_pad_check(s, page);
1191 for_each_object(p, s, page_address(page), 1160 for_each_object(p, s, page_address(page),
1192 page->objects) 1161 page->objects)
1193 check_object(s, page, p, 0); 1162 check_object(s, page, p, 0);
1194 ClearSlabDebug(page); 1163 __ClearPageSlubDebug(page);
1195 } 1164 }
1196 1165
1197 mod_zone_page_state(page_zone(page), 1166 mod_zone_page_state(page_zone(page),
@@ -1288,7 +1257,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
1288 if (slab_trylock(page)) { 1257 if (slab_trylock(page)) {
1289 list_del(&page->lru); 1258 list_del(&page->lru);
1290 n->nr_partial--; 1259 n->nr_partial--;
1291 SetSlabFrozen(page); 1260 __SetPageSlubFrozen(page);
1292 return 1; 1261 return 1;
1293 } 1262 }
1294 return 0; 1263 return 0;
@@ -1361,7 +1330,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1361 n = get_node(s, zone_to_nid(zone)); 1330 n = get_node(s, zone_to_nid(zone));
1362 1331
1363 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1332 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1364 n->nr_partial > MIN_PARTIAL) { 1333 n->nr_partial > n->min_partial) {
1365 page = get_partial_node(n); 1334 page = get_partial_node(n);
1366 if (page) 1335 if (page)
1367 return page; 1336 return page;
@@ -1398,7 +1367,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1398 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1367 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1399 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); 1368 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
1400 1369
1401 ClearSlabFrozen(page); 1370 __ClearPageSlubFrozen(page);
1402 if (page->inuse) { 1371 if (page->inuse) {
1403 1372
1404 if (page->freelist) { 1373 if (page->freelist) {
@@ -1406,13 +1375,14 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1406 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1375 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1407 } else { 1376 } else {
1408 stat(c, DEACTIVATE_FULL); 1377 stat(c, DEACTIVATE_FULL);
1409 if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) 1378 if (SLABDEBUG && PageSlubDebug(page) &&
1379 (s->flags & SLAB_STORE_USER))
1410 add_full(n, page); 1380 add_full(n, page);
1411 } 1381 }
1412 slab_unlock(page); 1382 slab_unlock(page);
1413 } else { 1383 } else {
1414 stat(c, DEACTIVATE_EMPTY); 1384 stat(c, DEACTIVATE_EMPTY);
1415 if (n->nr_partial < MIN_PARTIAL) { 1385 if (n->nr_partial < n->min_partial) {
1416 /* 1386 /*
1417 * Adding an empty slab to the partial slabs in order 1387 * Adding an empty slab to the partial slabs in order
1418 * to avoid page allocator overhead. This slab needs 1388 * to avoid page allocator overhead. This slab needs
@@ -1495,15 +1465,7 @@ static void flush_cpu_slab(void *d)
1495 1465
1496static void flush_all(struct kmem_cache *s) 1466static void flush_all(struct kmem_cache *s)
1497{ 1467{
1498#ifdef CONFIG_SMP
1499 on_each_cpu(flush_cpu_slab, s, 1); 1468 on_each_cpu(flush_cpu_slab, s, 1);
1500#else
1501 unsigned long flags;
1502
1503 local_irq_save(flags);
1504 flush_cpu_slab(s);
1505 local_irq_restore(flags);
1506#endif
1507} 1469}
1508 1470
1509/* 1471/*
@@ -1559,7 +1521,7 @@ load_freelist:
1559 object = c->page->freelist; 1521 object = c->page->freelist;
1560 if (unlikely(!object)) 1522 if (unlikely(!object))
1561 goto another_slab; 1523 goto another_slab;
1562 if (unlikely(SlabDebug(c->page))) 1524 if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
1563 goto debug; 1525 goto debug;
1564 1526
1565 c->freelist = object[c->offset]; 1527 c->freelist = object[c->offset];
@@ -1596,7 +1558,7 @@ new_slab:
1596 if (c->page) 1558 if (c->page)
1597 flush_slab(s, c); 1559 flush_slab(s, c);
1598 slab_lock(new); 1560 slab_lock(new);
1599 SetSlabFrozen(new); 1561 __SetPageSlubFrozen(new);
1600 c->page = new; 1562 c->page = new;
1601 goto load_freelist; 1563 goto load_freelist;
1602 } 1564 }
@@ -1682,7 +1644,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1682 stat(c, FREE_SLOWPATH); 1644 stat(c, FREE_SLOWPATH);
1683 slab_lock(page); 1645 slab_lock(page);
1684 1646
1685 if (unlikely(SlabDebug(page))) 1647 if (unlikely(SLABDEBUG && PageSlubDebug(page)))
1686 goto debug; 1648 goto debug;
1687 1649
1688checks_ok: 1650checks_ok:
@@ -1690,7 +1652,7 @@ checks_ok:
1690 page->freelist = object; 1652 page->freelist = object;
1691 page->inuse--; 1653 page->inuse--;
1692 1654
1693 if (unlikely(SlabFrozen(page))) { 1655 if (unlikely(PageSlubFrozen(page))) {
1694 stat(c, FREE_FROZEN); 1656 stat(c, FREE_FROZEN);
1695 goto out_unlock; 1657 goto out_unlock;
1696 } 1658 }
@@ -1952,13 +1914,26 @@ static void init_kmem_cache_cpu(struct kmem_cache *s,
1952#endif 1914#endif
1953} 1915}
1954 1916
1955static void init_kmem_cache_node(struct kmem_cache_node *n) 1917static void
1918init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
1956{ 1919{
1957 n->nr_partial = 0; 1920 n->nr_partial = 0;
1921
1922 /*
1923 * The larger the object size is, the more pages we want on the partial
1924 * list to avoid pounding the page allocator excessively.
1925 */
1926 n->min_partial = ilog2(s->size);
1927 if (n->min_partial < MIN_PARTIAL)
1928 n->min_partial = MIN_PARTIAL;
1929 else if (n->min_partial > MAX_PARTIAL)
1930 n->min_partial = MAX_PARTIAL;
1931
1958 spin_lock_init(&n->list_lock); 1932 spin_lock_init(&n->list_lock);
1959 INIT_LIST_HEAD(&n->partial); 1933 INIT_LIST_HEAD(&n->partial);
1960#ifdef CONFIG_SLUB_DEBUG 1934#ifdef CONFIG_SLUB_DEBUG
1961 atomic_long_set(&n->nr_slabs, 0); 1935 atomic_long_set(&n->nr_slabs, 0);
1936 atomic_long_set(&n->total_objects, 0);
1962 INIT_LIST_HEAD(&n->full); 1937 INIT_LIST_HEAD(&n->full);
1963#endif 1938#endif
1964} 1939}
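
Editor's note: the new per-node min_partial floor scales with ilog2(s->size), clamped between MIN_PARTIAL and MAX_PARTIAL. A minimal standalone sketch of that clamp follows (the MIN_PARTIAL/MAX_PARTIAL values of 5 and 10 are assumptions taken from slub.c of this era, and the shift loop stands in for ilog2()):

#include <stdio.h>

#define MIN_PARTIAL 5			/* assumed to match slub.c */
#define MAX_PARTIAL 10

static unsigned long min_partial_for(unsigned long size)
{
	unsigned long n = 0;

	while (size >>= 1)		/* open-coded ilog2() for the sketch */
		n++;
	if (n < MIN_PARTIAL)
		n = MIN_PARTIAL;
	else if (n > MAX_PARTIAL)
		n = MAX_PARTIAL;
	return n;
}

int main(void)
{
	/* 64-byte objects keep at least 6 partial slabs; 4096-byte ones cap at 10 */
	printf("%lu %lu\n", min_partial_for(64), min_partial_for(4096));
	return 0;
}
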
@@ -2126,7 +2101,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
2126 init_object(kmalloc_caches, n, 1); 2101 init_object(kmalloc_caches, n, 1);
2127 init_tracking(kmalloc_caches, n); 2102 init_tracking(kmalloc_caches, n);
2128#endif 2103#endif
2129 init_kmem_cache_node(n); 2104 init_kmem_cache_node(n, kmalloc_caches);
2130 inc_slabs_node(kmalloc_caches, node, page->objects); 2105 inc_slabs_node(kmalloc_caches, node, page->objects);
2131 2106
2132 /* 2107 /*
@@ -2183,7 +2158,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2183 2158
2184 } 2159 }
2185 s->node[node] = n; 2160 s->node[node] = n;
2186 init_kmem_cache_node(n); 2161 init_kmem_cache_node(n, s);
2187 } 2162 }
2188 return 1; 2163 return 1;
2189} 2164}
@@ -2194,7 +2169,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
2194 2169
2195static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 2170static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2196{ 2171{
2197 init_kmem_cache_node(&s->local_node); 2172 init_kmem_cache_node(&s->local_node, s);
2198 return 1; 2173 return 1;
2199} 2174}
2200#endif 2175#endif
@@ -2325,7 +2300,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2325static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2300static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2326 const char *name, size_t size, 2301 const char *name, size_t size,
2327 size_t align, unsigned long flags, 2302 size_t align, unsigned long flags,
2328 void (*ctor)(struct kmem_cache *, void *)) 2303 void (*ctor)(void *))
2329{ 2304{
2330 memset(s, 0, kmem_size); 2305 memset(s, 0, kmem_size);
2331 s->name = name; 2306 s->name = name;
@@ -2339,7 +2314,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2339 2314
2340 s->refcount = 1; 2315 s->refcount = 1;
2341#ifdef CONFIG_NUMA 2316#ifdef CONFIG_NUMA
2342 s->remote_node_defrag_ratio = 100; 2317 s->remote_node_defrag_ratio = 1000;
2343#endif 2318#endif
2344 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 2319 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
2345 goto error; 2320 goto error;
@@ -2754,7 +2729,6 @@ size_t ksize(const void *object)
2754 */ 2729 */
2755 return s->size; 2730 return s->size;
2756} 2731}
2757EXPORT_SYMBOL(ksize);
2758 2732
2759void kfree(const void *x) 2733void kfree(const void *x)
2760{ 2734{
@@ -2929,7 +2903,7 @@ static int slab_mem_going_online_callback(void *arg)
2929 ret = -ENOMEM; 2903 ret = -ENOMEM;
2930 goto out; 2904 goto out;
2931 } 2905 }
2932 init_kmem_cache_node(n); 2906 init_kmem_cache_node(n, s);
2933 s->node[nid] = n; 2907 s->node[nid] = n;
2934 } 2908 }
2935out: 2909out:
@@ -3081,7 +3055,7 @@ static int slab_unmergeable(struct kmem_cache *s)
3081 3055
3082static struct kmem_cache *find_mergeable(size_t size, 3056static struct kmem_cache *find_mergeable(size_t size,
3083 size_t align, unsigned long flags, const char *name, 3057 size_t align, unsigned long flags, const char *name,
3084 void (*ctor)(struct kmem_cache *, void *)) 3058 void (*ctor)(void *))
3085{ 3059{
3086 struct kmem_cache *s; 3060 struct kmem_cache *s;
3087 3061
@@ -3121,8 +3095,7 @@ static struct kmem_cache *find_mergeable(size_t size,
3121} 3095}
3122 3096
3123struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3097struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3124 size_t align, unsigned long flags, 3098 size_t align, unsigned long flags, void (*ctor)(void *))
3125 void (*ctor)(struct kmem_cache *, void *))
3126{ 3099{
3127 struct kmem_cache *s; 3100 struct kmem_cache *s;
3128 3101
@@ -3325,12 +3298,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3325 s->name, page); 3298 s->name, page);
3326 3299
3327 if (s->flags & DEBUG_DEFAULT_FLAGS) { 3300 if (s->flags & DEBUG_DEFAULT_FLAGS) {
3328 if (!SlabDebug(page)) 3301 if (!PageSlubDebug(page))
3329 printk(KERN_ERR "SLUB %s: SlabDebug not set " 3302 printk(KERN_ERR "SLUB %s: SlubDebug not set "
3330 "on slab 0x%p\n", s->name, page); 3303 "on slab 0x%p\n", s->name, page);
3331 } else { 3304 } else {
3332 if (SlabDebug(page)) 3305 if (PageSlubDebug(page))
3333 printk(KERN_ERR "SLUB %s: SlabDebug set on " 3306 printk(KERN_ERR "SLUB %s: SlubDebug set on "
3334 "slab 0x%p\n", s->name, page); 3307 "slab 0x%p\n", s->name, page);
3335 } 3308 }
3336} 3309}
@@ -4087,7 +4060,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
4087 if (err) 4060 if (err)
4088 return err; 4061 return err;
4089 4062
4090 if (ratio < 100) 4063 if (ratio <= 100)
4091 s->remote_node_defrag_ratio = ratio * 10; 4064 s->remote_node_defrag_ratio = ratio * 10;
4092 4065
4093 return length; 4066 return length;
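
Editor's note: with the default remote_node_defrag_ratio raised to 1000 in kmem_cache_open() and the sysfs store now accepting values up to and including 100, the file takes a percentage while the cache keeps tenths of a percent internally. A hedged standalone sketch of that conversion (struct cache_sketch and store_defrag_ratio are made-up stand-ins):

struct cache_sketch {			/* stand-in for struct kmem_cache */
	unsigned int remote_node_defrag_ratio;
};

/* Accept 0..100 percent from sysfs and keep tenths internally, so the
 * new default of 1000 corresponds to 100%. Mirrors the store hunk above. */
static void store_defrag_ratio(struct cache_sketch *s, unsigned long ratio)
{
	if (ratio <= 100)
		s->remote_node_defrag_ratio = ratio * 10;
}
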
@@ -4445,14 +4418,6 @@ __initcall(slab_sysfs_init);
4445 * The /proc/slabinfo ABI 4418 * The /proc/slabinfo ABI
4446 */ 4419 */
4447#ifdef CONFIG_SLABINFO 4420#ifdef CONFIG_SLABINFO
4448
4449ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4450 size_t count, loff_t *ppos)
4451{
4452 return -EINVAL;
4453}
4454
4455
4456static void print_slabinfo_header(struct seq_file *m) 4421static void print_slabinfo_header(struct seq_file *m)
4457{ 4422{
4458 seq_puts(m, "slabinfo - version: 2.1\n"); 4423 seq_puts(m, "slabinfo - version: 2.1\n");
@@ -4520,11 +4485,29 @@ static int s_show(struct seq_file *m, void *p)
4520 return 0; 4485 return 0;
4521} 4486}
4522 4487
4523const struct seq_operations slabinfo_op = { 4488static const struct seq_operations slabinfo_op = {
4524 .start = s_start, 4489 .start = s_start,
4525 .next = s_next, 4490 .next = s_next,
4526 .stop = s_stop, 4491 .stop = s_stop,
4527 .show = s_show, 4492 .show = s_show,
4528}; 4493};
4529 4494
4495static int slabinfo_open(struct inode *inode, struct file *file)
4496{
4497 return seq_open(file, &slabinfo_op);
4498}
4499
4500static const struct file_operations proc_slabinfo_operations = {
4501 .open = slabinfo_open,
4502 .read = seq_read,
4503 .llseek = seq_lseek,
4504 .release = seq_release,
4505};
4506
4507static int __init slab_proc_init(void)
4508{
4509 proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
4510 return 0;
4511}
4512module_init(slab_proc_init);
4530#endif /* CONFIG_SLABINFO */ 4513#endif /* CONFIG_SLABINFO */
diff --git a/mm/sparse.c b/mm/sparse.c
index 36511c7b5e2c..39db301b920d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -147,22 +147,41 @@ static inline int sparse_early_nid(struct mem_section *section)
147 return (section->section_mem_map >> SECTION_NID_SHIFT); 147 return (section->section_mem_map >> SECTION_NID_SHIFT);
148} 148}
149 149
150/* Record a memory area against a node. */ 150/* Validate the physical addressing limitations of the model */
151void __init memory_present(int nid, unsigned long start, unsigned long end) 151void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
152 unsigned long *end_pfn)
152{ 153{
153 unsigned long max_arch_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); 154 unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
154 unsigned long pfn;
155 155
156 /* 156 /*
157 * Sanity checks - do not allow an architecture to pass 157 * Sanity checks - do not allow an architecture to pass
158 * in larger pfns than the maximum scope of sparsemem: 158 * in larger pfns than the maximum scope of sparsemem:
159 */ 159 */
160 if (start >= max_arch_pfn) 160 if (*start_pfn > max_sparsemem_pfn) {
161 return; 161 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
162 if (end >= max_arch_pfn) 162 "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
163 end = max_arch_pfn; 163 *start_pfn, *end_pfn, max_sparsemem_pfn);
164 WARN_ON_ONCE(1);
165 *start_pfn = max_sparsemem_pfn;
166 *end_pfn = max_sparsemem_pfn;
167 }
168
169 if (*end_pfn > max_sparsemem_pfn) {
170 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
171 "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
172 *start_pfn, *end_pfn, max_sparsemem_pfn);
173 WARN_ON_ONCE(1);
174 *end_pfn = max_sparsemem_pfn;
175 }
176}
177
178/* Record a memory area against a node. */
179void __init memory_present(int nid, unsigned long start, unsigned long end)
180{
181 unsigned long pfn;
164 182
165 start &= PAGE_SECTION_MASK; 183 start &= PAGE_SECTION_MASK;
184 mminit_validate_memmodel_limits(&start, &end);
166 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { 185 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
167 unsigned long section = pfn_to_section_nr(pfn); 186 unsigned long section = pfn_to_section_nr(pfn);
168 struct mem_section *ms; 187 struct mem_section *ms;
@@ -187,6 +206,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
187 unsigned long pfn; 206 unsigned long pfn;
188 unsigned long nr_pages = 0; 207 unsigned long nr_pages = 0;
189 208
209 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
190 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 210 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
191 if (nid != early_pfn_to_nid(pfn)) 211 if (nid != early_pfn_to_nid(pfn))
192 continue; 212 continue;
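
Editor's note: mminit_validate_memmodel_limits() clamps whatever pfn range an architecture passes in to the maximum SPARSEMEM can address. A standalone sketch of the clamping behaviour (MAX_PHYSMEM_BITS of 44 and PAGE_SHIFT of 12 are example values only; assumes a 64-bit unsigned long):

#include <stdio.h>

#define EX_MAX_PHYSMEM_BITS 44		/* arch-specific in reality */
#define EX_PAGE_SHIFT       12

static void validate_limits(unsigned long *start_pfn, unsigned long *end_pfn)
{
	unsigned long max_pfn = 1UL << (EX_MAX_PHYSMEM_BITS - EX_PAGE_SHIFT);

	if (*start_pfn > max_pfn)
		*start_pfn = *end_pfn = max_pfn;	/* range entirely out of scope */
	if (*end_pfn > max_pfn)
		*end_pfn = max_pfn;			/* truncate the tail */
}

int main(void)
{
	unsigned long s = 1UL << 40, e = 1UL << 41;	/* both past the 2^32-pfn limit */

	validate_limits(&s, &e);
	printf("%lx-%lx\n", s, e);	/* both clamped to 100000000 */
	return 0;
}
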
@@ -248,16 +268,92 @@ static unsigned long *__kmalloc_section_usemap(void)
248} 268}
249#endif /* CONFIG_MEMORY_HOTPLUG */ 269#endif /* CONFIG_MEMORY_HOTPLUG */
250 270
271#ifdef CONFIG_MEMORY_HOTREMOVE
272static unsigned long * __init
273sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
274{
275 unsigned long section_nr;
276
277 /*
278 * A page may contain usemaps for other sections preventing the
279 * page being freed and making a section unremovable while
280 * other sections referencing the usemap remain active. Similarly,

281 * a pgdat can prevent a section being removed. If section A
282 * contains a pgdat and section B contains the usemap, both
283 * sections become inter-dependent. This allocates usemaps
284 * from the same section as the pgdat where possible to avoid
285 * this problem.
286 */
287 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
288 return alloc_bootmem_section(usemap_size(), section_nr);
289}
290
291static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
292{
293 unsigned long usemap_snr, pgdat_snr;
294 static unsigned long old_usemap_snr = NR_MEM_SECTIONS;
295 static unsigned long old_pgdat_snr = NR_MEM_SECTIONS;
296 struct pglist_data *pgdat = NODE_DATA(nid);
297 int usemap_nid;
298
299 usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
300 pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
301 if (usemap_snr == pgdat_snr)
302 return;
303
304 if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
305 /* skip redundant message */
306 return;
307
308 old_usemap_snr = usemap_snr;
309 old_pgdat_snr = pgdat_snr;
310
311 usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
312 if (usemap_nid != nid) {
313 printk(KERN_INFO
314 "node %d must be removed before remove section %ld\n",
315 nid, usemap_snr);
316 return;
317 }
318 /*
319 * There is a circular dependency.
320 * Some platforms allow un-removable sections because they will just
321 * gather other removable sections for dynamic partitioning.
322 * Just report the un-removable section's number here.
323 */
324 printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr,
325 pgdat_snr, nid);
326 printk(KERN_CONT
327 " have a circular dependency on usemap and pgdat allocations\n");
328}
329#else
330static unsigned long * __init
331sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
332{
333 return NULL;
334}
335
336static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
337{
338}
339#endif /* CONFIG_MEMORY_HOTREMOVE */
340
251static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) 341static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
252{ 342{
253 unsigned long *usemap; 343 unsigned long *usemap;
254 struct mem_section *ms = __nr_to_section(pnum); 344 struct mem_section *ms = __nr_to_section(pnum);
255 int nid = sparse_early_nid(ms); 345 int nid = sparse_early_nid(ms);
256 346
257 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); 347 usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
258 if (usemap) 348 if (usemap)
259 return usemap; 349 return usemap;
260 350
351 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
352 if (usemap) {
353 check_usemap_section_nr(nid, usemap);
354 return usemap;
355 }
356
261 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ 357 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
262 nid = 0; 358 nid = 0;
263 359
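
Editor's note: the hot-remove check added above reduces to asking whether the usemap and the node's pgdat land in the same SPARSEMEM section, i.e. whether their physical addresses share a section number. A sketch of that mapping (SECTION_SIZE_BITS of 27 and PAGE_SHIFT of 12 are example values only):

#define EX_SECTION_SIZE_BITS 27		/* arch-specific in reality */
#define EX_PAGE_SHIFT        12

static unsigned long paddr_to_section(unsigned long paddr)
{
	unsigned long pfn = paddr >> EX_PAGE_SHIFT;

	return pfn >> (EX_SECTION_SIZE_BITS - EX_PAGE_SHIFT);	/* == paddr >> 27 */
}

/* paddr_to_section(__pa(usemap)) == paddr_to_section(__pa(pgdat)) means the
 * usemap pins no section other than the pgdat's own, so hot-remove sees no
 * extra cross-section dependency and check_usemap_section_nr() stays quiet. */
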
@@ -280,7 +376,7 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
280} 376}
281#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 377#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
282 378
283struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 379static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
284{ 380{
285 struct page *map; 381 struct page *map;
286 struct mem_section *ms = __nr_to_section(pnum); 382 struct mem_section *ms = __nr_to_section(pnum);
diff --git a/mm/swap.c b/mm/swap.c
index 45c9f25a8a3b..2152e48a7b8f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,12 +31,13 @@
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33 33
34#include "internal.h"
35
34/* How many pages do we try to swap or page in/out together? */ 36/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 37int page_cluster;
36 38
37static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; 39static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
38static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; 40static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
39static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, };
40 41
41/* 42/*
42 * This path almost never happens for VM activity - pages are normally 43 * This path almost never happens for VM activity - pages are normally
@@ -116,8 +117,9 @@ static void pagevec_move_tail(struct pagevec *pvec)
116 zone = pagezone; 117 zone = pagezone;
117 spin_lock(&zone->lru_lock); 118 spin_lock(&zone->lru_lock);
118 } 119 }
119 if (PageLRU(page) && !PageActive(page)) { 120 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
120 list_move_tail(&page->lru, &zone->inactive_list); 121 int lru = page_is_file_cache(page);
122 list_move_tail(&page->lru, &zone->lru[lru].list);
121 pgmoved++; 123 pgmoved++;
122 } 124 }
123 } 125 }
@@ -136,7 +138,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
136void rotate_reclaimable_page(struct page *page) 138void rotate_reclaimable_page(struct page *page)
137{ 139{
138 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && 140 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
139 PageLRU(page)) { 141 !PageUnevictable(page) && PageLRU(page)) {
140 struct pagevec *pvec; 142 struct pagevec *pvec;
141 unsigned long flags; 143 unsigned long flags;
142 144
@@ -157,12 +159,19 @@ void activate_page(struct page *page)
157 struct zone *zone = page_zone(page); 159 struct zone *zone = page_zone(page);
158 160
159 spin_lock_irq(&zone->lru_lock); 161 spin_lock_irq(&zone->lru_lock);
160 if (PageLRU(page) && !PageActive(page)) { 162 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
161 del_page_from_inactive_list(zone, page); 163 int file = page_is_file_cache(page);
164 int lru = LRU_BASE + file;
165 del_page_from_lru_list(zone, page, lru);
166
162 SetPageActive(page); 167 SetPageActive(page);
163 add_page_to_active_list(zone, page); 168 lru += LRU_ACTIVE;
169 add_page_to_lru_list(zone, page, lru);
164 __count_vm_event(PGACTIVATE); 170 __count_vm_event(PGACTIVATE);
165 mem_cgroup_move_lists(page, true); 171 mem_cgroup_move_lists(page, lru);
172
173 zone->recent_rotated[!!file]++;
174 zone->recent_scanned[!!file]++;
166 } 175 }
167 spin_unlock_irq(&zone->lru_lock); 176 spin_unlock_irq(&zone->lru_lock);
168} 177}
@@ -176,7 +185,8 @@ void activate_page(struct page *page)
176 */ 185 */
177void mark_page_accessed(struct page *page) 186void mark_page_accessed(struct page *page)
178{ 187{
179 if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { 188 if (!PageActive(page) && !PageUnevictable(page) &&
189 PageReferenced(page) && PageLRU(page)) {
180 activate_page(page); 190 activate_page(page);
181 ClearPageReferenced(page); 191 ClearPageReferenced(page);
182 } else if (!PageReferenced(page)) { 192 } else if (!PageReferenced(page)) {
@@ -186,28 +196,73 @@ void mark_page_accessed(struct page *page)
186 196
187EXPORT_SYMBOL(mark_page_accessed); 197EXPORT_SYMBOL(mark_page_accessed);
188 198
189/** 199void __lru_cache_add(struct page *page, enum lru_list lru)
190 * lru_cache_add: add a page to the page lists
191 * @page: the page to add
192 */
193void lru_cache_add(struct page *page)
194{ 200{
195 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); 201 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
196 202
197 page_cache_get(page); 203 page_cache_get(page);
198 if (!pagevec_add(pvec, page)) 204 if (!pagevec_add(pvec, page))
199 __pagevec_lru_add(pvec); 205 ____pagevec_lru_add(pvec, lru);
200 put_cpu_var(lru_add_pvecs); 206 put_cpu_var(lru_add_pvecs);
201} 207}
202 208
203void lru_cache_add_active(struct page *page) 209/**
210 * lru_cache_add_lru - add a page to a page list
211 * @page: the page to be added to the LRU.
212 * @lru: the LRU list to which the page is added.
213 */
214void lru_cache_add_lru(struct page *page, enum lru_list lru)
204{ 215{
205 struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); 216 if (PageActive(page)) {
217 VM_BUG_ON(PageUnevictable(page));
218 ClearPageActive(page);
219 } else if (PageUnevictable(page)) {
220 VM_BUG_ON(PageActive(page));
221 ClearPageUnevictable(page);
222 }
206 223
207 page_cache_get(page); 224 VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
208 if (!pagevec_add(pvec, page)) 225 __lru_cache_add(page, lru);
209 __pagevec_lru_add_active(pvec); 226}
210 put_cpu_var(lru_add_active_pvecs); 227
228/**
229 * add_page_to_unevictable_list - add a page to the unevictable list
230 * @page: the page to be added to the unevictable list
231 *
232 * Add page directly to its zone's unevictable list. To avoid races with
233 * tasks that might be making the page evictable, through eg. munlock,
234 * munmap or exit, while it's not on the lru, we want to add the page
235 * while it's locked or otherwise "invisible" to other tasks. This is
236 * difficult to do when using the pagevec cache, so bypass that.
237 */
238void add_page_to_unevictable_list(struct page *page)
239{
240 struct zone *zone = page_zone(page);
241
242 spin_lock_irq(&zone->lru_lock);
243 SetPageUnevictable(page);
244 SetPageLRU(page);
245 add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
246 spin_unlock_irq(&zone->lru_lock);
247}
248
249/**
250 * lru_cache_add_active_or_unevictable
251 * @page: the page to be added to LRU
252 * @vma: vma in which page is mapped for determining reclaimability
253 *
254 * place @page on active or unevictable LRU list, depending on
255 * page_evictable(). Note that if the page is not evictable,
256 * it goes directly back onto its zone's unevictable list. It does
257 * NOT use a per cpu pagevec.
258 */
259void lru_cache_add_active_or_unevictable(struct page *page,
260 struct vm_area_struct *vma)
261{
262 if (page_evictable(page, vma))
263 lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page));
264 else
265 add_page_to_unevictable_list(page);
211} 266}
212 267
213/* 268/*
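
Editor's note: taken together, lru_cache_add_lru() and add_page_to_unevictable_list() mean a newly faulted page is routed by two bits: is it evictable, and is it file backed. A small standalone sketch of that decision (the enum ordering mirrors the new lru_list layout but is an assumption here):

enum lru_sketch { INACTIVE_ANON, ACTIVE_ANON, INACTIVE_FILE, ACTIVE_FILE, UNEVICTABLE };

static enum lru_sketch pick_lru(int evictable, int file_backed)
{
	if (!evictable)
		return UNEVICTABLE;	/* added to the zone list directly, bypassing the pagevecs */
	return file_backed ? ACTIVE_FILE : ACTIVE_ANON;
}
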
@@ -217,15 +272,15 @@ void lru_cache_add_active(struct page *page)
217 */ 272 */
218static void drain_cpu_pagevecs(int cpu) 273static void drain_cpu_pagevecs(int cpu)
219{ 274{
275 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
220 struct pagevec *pvec; 276 struct pagevec *pvec;
277 int lru;
221 278
222 pvec = &per_cpu(lru_add_pvecs, cpu); 279 for_each_lru(lru) {
223 if (pagevec_count(pvec)) 280 pvec = &pvecs[lru - LRU_BASE];
224 __pagevec_lru_add(pvec); 281 if (pagevec_count(pvec))
225 282 ____pagevec_lru_add(pvec, lru);
226 pvec = &per_cpu(lru_add_active_pvecs, cpu); 283 }
227 if (pagevec_count(pvec))
228 __pagevec_lru_add_active(pvec);
229 284
230 pvec = &per_cpu(lru_rotate_pvecs, cpu); 285 pvec = &per_cpu(lru_rotate_pvecs, cpu);
231 if (pagevec_count(pvec)) { 286 if (pagevec_count(pvec)) {
@@ -244,7 +299,7 @@ void lru_add_drain(void)
244 put_cpu(); 299 put_cpu();
245} 300}
246 301
247#ifdef CONFIG_NUMA 302#if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU)
248static void lru_add_drain_per_cpu(struct work_struct *dummy) 303static void lru_add_drain_per_cpu(struct work_struct *dummy)
249{ 304{
250 lru_add_drain(); 305 lru_add_drain();
@@ -278,9 +333,10 @@ int lru_add_drain_all(void)
278 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it 333 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
279 * for the remainder of the operation. 334 * for the remainder of the operation.
280 * 335 *
281 * The locking in this function is against shrink_cache(): we recheck the 336 * The locking in this function is against shrink_inactive_list(): we recheck
282 * page count inside the lock to see whether shrink_cache grabbed the page 337 * the page count inside the lock to see whether shrink_inactive_list()
283 * via the LRU. If it did, give up: shrink_cache will free it. 338 * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
339 * will free it.
284 */ 340 */
285void release_pages(struct page **pages, int nr, int cold) 341void release_pages(struct page **pages, int nr, int cold)
286{ 342{
@@ -307,6 +363,7 @@ void release_pages(struct page **pages, int nr, int cold)
307 363
308 if (PageLRU(page)) { 364 if (PageLRU(page)) {
309 struct zone *pagezone = page_zone(page); 365 struct zone *pagezone = page_zone(page);
366
310 if (pagezone != zone) { 367 if (pagezone != zone) {
311 if (zone) 368 if (zone)
312 spin_unlock_irqrestore(&zone->lru_lock, 369 spin_unlock_irqrestore(&zone->lru_lock,
@@ -379,10 +436,11 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
379 * Add the passed pages to the LRU, then drop the caller's refcount 436 * Add the passed pages to the LRU, then drop the caller's refcount
380 * on them. Reinitialises the caller's pagevec. 437 * on them. Reinitialises the caller's pagevec.
381 */ 438 */
382void __pagevec_lru_add(struct pagevec *pvec) 439void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
383{ 440{
384 int i; 441 int i;
385 struct zone *zone = NULL; 442 struct zone *zone = NULL;
443 VM_BUG_ON(is_unevictable_lru(lru));
386 444
387 for (i = 0; i < pagevec_count(pvec); i++) { 445 for (i = 0; i < pagevec_count(pvec); i++) {
388 struct page *page = pvec->pages[i]; 446 struct page *page = pvec->pages[i];
@@ -394,9 +452,13 @@ void __pagevec_lru_add(struct pagevec *pvec)
394 zone = pagezone; 452 zone = pagezone;
395 spin_lock_irq(&zone->lru_lock); 453 spin_lock_irq(&zone->lru_lock);
396 } 454 }
455 VM_BUG_ON(PageActive(page));
456 VM_BUG_ON(PageUnevictable(page));
397 VM_BUG_ON(PageLRU(page)); 457 VM_BUG_ON(PageLRU(page));
398 SetPageLRU(page); 458 SetPageLRU(page);
399 add_page_to_inactive_list(zone, page); 459 if (is_active_lru(lru))
460 SetPageActive(page);
461 add_page_to_lru_list(zone, page, lru);
400 } 462 }
401 if (zone) 463 if (zone)
402 spin_unlock_irq(&zone->lru_lock); 464 spin_unlock_irq(&zone->lru_lock);
@@ -404,48 +466,45 @@ void __pagevec_lru_add(struct pagevec *pvec)
404 pagevec_reinit(pvec); 466 pagevec_reinit(pvec);
405} 467}
406 468
407EXPORT_SYMBOL(__pagevec_lru_add); 469EXPORT_SYMBOL(____pagevec_lru_add);
408 470
409void __pagevec_lru_add_active(struct pagevec *pvec) 471/*
472 * Try to drop buffers from the pages in a pagevec
473 */
474void pagevec_strip(struct pagevec *pvec)
410{ 475{
411 int i; 476 int i;
412 struct zone *zone = NULL;
413 477
414 for (i = 0; i < pagevec_count(pvec); i++) { 478 for (i = 0; i < pagevec_count(pvec); i++) {
415 struct page *page = pvec->pages[i]; 479 struct page *page = pvec->pages[i];
416 struct zone *pagezone = page_zone(page);
417 480
418 if (pagezone != zone) { 481 if (PagePrivate(page) && trylock_page(page)) {
419 if (zone) 482 if (PagePrivate(page))
420 spin_unlock_irq(&zone->lru_lock); 483 try_to_release_page(page, 0);
421 zone = pagezone; 484 unlock_page(page);
422 spin_lock_irq(&zone->lru_lock);
423 } 485 }
424 VM_BUG_ON(PageLRU(page));
425 SetPageLRU(page);
426 VM_BUG_ON(PageActive(page));
427 SetPageActive(page);
428 add_page_to_active_list(zone, page);
429 } 486 }
430 if (zone)
431 spin_unlock_irq(&zone->lru_lock);
432 release_pages(pvec->pages, pvec->nr, pvec->cold);
433 pagevec_reinit(pvec);
434} 487}
435 488
436/* 489/**
437 * Try to drop buffers from the pages in a pagevec 490 * pagevec_swap_free - try to free swap space from the pages in a pagevec
491 * @pvec: pagevec with swapcache pages to free the swap space of
492 *
493 * The caller needs to hold an extra reference to each page and
494 * not hold the page lock on the pages. This function uses a
495 * trylock on the page lock so it may not always free the swap
496 * space associated with a page.
438 */ 497 */
439void pagevec_strip(struct pagevec *pvec) 498void pagevec_swap_free(struct pagevec *pvec)
440{ 499{
441 int i; 500 int i;
442 501
443 for (i = 0; i < pagevec_count(pvec); i++) { 502 for (i = 0; i < pagevec_count(pvec); i++) {
444 struct page *page = pvec->pages[i]; 503 struct page *page = pvec->pages[i];
445 504
446 if (PagePrivate(page) && !TestSetPageLocked(page)) { 505 if (PageSwapCache(page) && trylock_page(page)) {
447 if (PagePrivate(page)) 506 if (PageSwapCache(page))
448 try_to_release_page(page, 0); 507 remove_exclusive_swap_page_ref(page);
449 unlock_page(page); 508 unlock_page(page);
450 } 509 }
451 } 510 }
@@ -493,7 +552,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
493 */ 552 */
494#define ACCT_THRESHOLD max(16, NR_CPUS * 2) 553#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
495 554
496static DEFINE_PER_CPU(long, committed_space) = 0; 555static DEFINE_PER_CPU(long, committed_space);
497 556
498void vm_acct_memory(long pages) 557void vm_acct_memory(long pages)
499{ 558{
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d8aadaf2a0ba..3353c9029cef 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -33,13 +33,13 @@ static const struct address_space_operations swap_aops = {
33}; 33};
34 34
35static struct backing_dev_info swap_backing_dev_info = { 35static struct backing_dev_info swap_backing_dev_info = {
36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
37 .unplug_io_fn = swap_unplug_io_fn, 37 .unplug_io_fn = swap_unplug_io_fn,
38}; 38};
39 39
40struct address_space swapper_space = { 40struct address_space swapper_space = {
41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
42 .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), 42 .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
43 .a_ops = &swap_aops, 43 .a_ops = &swap_aops,
44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), 44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
45 .backing_dev_info = &swap_backing_dev_info, 45 .backing_dev_info = &swap_backing_dev_info,
@@ -56,15 +56,16 @@ static struct {
56 56
57void show_swap_cache_info(void) 57void show_swap_cache_info(void)
58{ 58{
59 printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n", 59 printk("%lu pages in swap cache\n", total_swapcache_pages);
60 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
60 swap_cache_info.add_total, swap_cache_info.del_total, 61 swap_cache_info.add_total, swap_cache_info.del_total,
61 swap_cache_info.find_success, swap_cache_info.find_total); 62 swap_cache_info.find_success, swap_cache_info.find_total);
62 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); 63 printk("Free swap = %ldkB\n", nr_swap_pages << (PAGE_SHIFT - 10));
63 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); 64 printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
64} 65}
65 66
66/* 67/*
67 * add_to_swap_cache resembles add_to_page_cache on swapper_space, 68 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
68 * but sets SwapCache flag and private instead of mapping and index. 69 * but sets SwapCache flag and private instead of mapping and index.
69 */ 70 */
70int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) 71int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
@@ -74,21 +75,29 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
74 BUG_ON(!PageLocked(page)); 75 BUG_ON(!PageLocked(page));
75 BUG_ON(PageSwapCache(page)); 76 BUG_ON(PageSwapCache(page));
76 BUG_ON(PagePrivate(page)); 77 BUG_ON(PagePrivate(page));
78 BUG_ON(!PageSwapBacked(page));
77 error = radix_tree_preload(gfp_mask); 79 error = radix_tree_preload(gfp_mask);
78 if (!error) { 80 if (!error) {
79 write_lock_irq(&swapper_space.tree_lock); 81 page_cache_get(page);
82 SetPageSwapCache(page);
83 set_page_private(page, entry.val);
84
85 spin_lock_irq(&swapper_space.tree_lock);
80 error = radix_tree_insert(&swapper_space.page_tree, 86 error = radix_tree_insert(&swapper_space.page_tree,
81 entry.val, page); 87 entry.val, page);
82 if (!error) { 88 if (likely(!error)) {
83 page_cache_get(page);
84 SetPageSwapCache(page);
85 set_page_private(page, entry.val);
86 total_swapcache_pages++; 89 total_swapcache_pages++;
87 __inc_zone_page_state(page, NR_FILE_PAGES); 90 __inc_zone_page_state(page, NR_FILE_PAGES);
88 INC_CACHE_INFO(add_total); 91 INC_CACHE_INFO(add_total);
89 } 92 }
90 write_unlock_irq(&swapper_space.tree_lock); 93 spin_unlock_irq(&swapper_space.tree_lock);
91 radix_tree_preload_end(); 94 radix_tree_preload_end();
95
96 if (unlikely(error)) {
97 set_page_private(page, 0UL);
98 ClearPageSwapCache(page);
99 page_cache_release(page);
100 }
92 } 101 }
93 return error; 102 return error;
94} 103}
@@ -175,9 +184,9 @@ void delete_from_swap_cache(struct page *page)
175 184
176 entry.val = page_private(page); 185 entry.val = page_private(page);
177 186
178 write_lock_irq(&swapper_space.tree_lock); 187 spin_lock_irq(&swapper_space.tree_lock);
179 __delete_from_swap_cache(page); 188 __delete_from_swap_cache(page);
180 write_unlock_irq(&swapper_space.tree_lock); 189 spin_unlock_irq(&swapper_space.tree_lock);
181 190
182 swap_free(entry); 191 swap_free(entry);
183 page_cache_release(page); 192 page_cache_release(page);
@@ -193,7 +202,7 @@ void delete_from_swap_cache(struct page *page)
193 */ 202 */
194static inline void free_swap_cache(struct page *page) 203static inline void free_swap_cache(struct page *page)
195{ 204{
196 if (PageSwapCache(page) && !TestSetPageLocked(page)) { 205 if (PageSwapCache(page) && trylock_page(page)) {
197 remove_exclusive_swap_page(page); 206 remove_exclusive_swap_page(page);
198 unlock_page(page); 207 unlock_page(page);
199 } 208 }
@@ -294,17 +303,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
294 * re-using the just freed swap entry for an existing page. 303 * re-using the just freed swap entry for an existing page.
295 * May fail (-ENOMEM) if radix-tree node allocation failed. 304 * May fail (-ENOMEM) if radix-tree node allocation failed.
296 */ 305 */
297 SetPageLocked(new_page); 306 __set_page_locked(new_page);
307 SetPageSwapBacked(new_page);
298 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); 308 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
299 if (!err) { 309 if (likely(!err)) {
300 /* 310 /*
301 * Initiate read into locked page and return. 311 * Initiate read into locked page and return.
302 */ 312 */
303 lru_cache_add_active(new_page); 313 lru_cache_add_anon(new_page);
304 swap_readpage(NULL, new_page); 314 swap_readpage(NULL, new_page);
305 return new_page; 315 return new_page;
306 } 316 }
307 ClearPageLocked(new_page); 317 ClearPageSwapBacked(new_page);
318 __clear_page_locked(new_page);
308 swap_free(entry); 319 swap_free(entry);
309 } while (err != -ENOMEM); 320 } while (err != -ENOMEM);
310 321
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bd1bb5920306..90cb67a5417c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,17 +33,18 @@
33#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
34#include <linux/swapops.h> 34#include <linux/swapops.h>
35 35
36DEFINE_SPINLOCK(swap_lock); 36static DEFINE_SPINLOCK(swap_lock);
37unsigned int nr_swapfiles; 37static unsigned int nr_swapfiles;
38long total_swap_pages; 38long total_swap_pages;
39static int swap_overflow; 39static int swap_overflow;
40static int least_priority;
40 41
41static const char Bad_file[] = "Bad swap file entry "; 42static const char Bad_file[] = "Bad swap file entry ";
42static const char Unused_file[] = "Unused swap file entry "; 43static const char Unused_file[] = "Unused swap file entry ";
43static const char Bad_offset[] = "Bad swap offset entry "; 44static const char Bad_offset[] = "Bad swap offset entry ";
44static const char Unused_offset[] = "Unused swap offset entry "; 45static const char Unused_offset[] = "Unused swap offset entry ";
45 46
46struct swap_list_t swap_list = {-1, -1}; 47static struct swap_list_t swap_list = {-1, -1};
47 48
48static struct swap_info_struct swap_info[MAX_SWAPFILES]; 49static struct swap_info_struct swap_info[MAX_SWAPFILES];
49 50
@@ -343,7 +344,7 @@ int can_share_swap_page(struct page *page)
343 * Work out if there are any other processes sharing this 344 * Work out if there are any other processes sharing this
344 * swap cache page. Free it if you can. Return success. 345 * swap cache page. Free it if you can. Return success.
345 */ 346 */
346int remove_exclusive_swap_page(struct page *page) 347static int remove_exclusive_swap_page_count(struct page *page, int count)
347{ 348{
348 int retval; 349 int retval;
349 struct swap_info_struct * p; 350 struct swap_info_struct * p;
@@ -356,7 +357,7 @@ int remove_exclusive_swap_page(struct page *page)
356 return 0; 357 return 0;
357 if (PageWriteback(page)) 358 if (PageWriteback(page))
358 return 0; 359 return 0;
359 if (page_count(page) != 2) /* 2: us + cache */ 360 if (page_count(page) != count) /* us + cache + ptes */
360 return 0; 361 return 0;
361 362
362 entry.val = page_private(page); 363 entry.val = page_private(page);
@@ -368,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page)
368 retval = 0; 369 retval = 0;
369 if (p->swap_map[swp_offset(entry)] == 1) { 370 if (p->swap_map[swp_offset(entry)] == 1) {
370 /* Recheck the page count with the swapcache lock held.. */ 371 /* Recheck the page count with the swapcache lock held.. */
371 write_lock_irq(&swapper_space.tree_lock); 372 spin_lock_irq(&swapper_space.tree_lock);
372 if ((page_count(page) == 2) && !PageWriteback(page)) { 373 if ((page_count(page) == count) && !PageWriteback(page)) {
373 __delete_from_swap_cache(page); 374 __delete_from_swap_cache(page);
374 SetPageDirty(page); 375 SetPageDirty(page);
375 retval = 1; 376 retval = 1;
376 } 377 }
377 write_unlock_irq(&swapper_space.tree_lock); 378 spin_unlock_irq(&swapper_space.tree_lock);
378 } 379 }
379 spin_unlock(&swap_lock); 380 spin_unlock(&swap_lock);
380 381
@@ -387,6 +388,25 @@ int remove_exclusive_swap_page(struct page *page)
387} 388}
388 389
389/* 390/*
391 * Most of the time the page should have two references: one for the
392 * process and one for the swap cache.
393 */
394int remove_exclusive_swap_page(struct page *page)
395{
396 return remove_exclusive_swap_page_count(page, 2);
397}
398
399/*
400 * The pageout code holds an extra reference to the page. That raises
401 * the reference count to test for to 2 for a page that is only in the
402 * swap cache plus 1 for each process that maps the page.
403 */
404int remove_exclusive_swap_page_ref(struct page *page)
405{
406 return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page));
407}
408
409/*
390 * Free the swap entry like above, but also try to 410 * Free the swap entry like above, but also try to
391 * free the page cache entry if it is the last user. 411 * free the page cache entry if it is the last user.
392 */ 412 */
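
Editor's note: the two wrappers encode a small piece of reference-count arithmetic. A page living only in the swap cache is held by the cache (1), by each process mapping it, and by whatever the caller holds. A sketch of the count each wrapper tests against (page_count_expected is a made-up helper name):

/* remove_exclusive_swap_page():     caller + swap cache        -> 2
 * remove_exclusive_swap_page_ref(): caller + swap cache + ptes -> 2 + mapcount
 * e.g. a swapcache page mapped by two tasks, seen from pagevec_swap_free()
 * (which holds its own reference), should show page_count == 4 before it can
 * be pulled out of the swap cache. */
static int page_count_expected(int mapcount)
{
	return 2 + mapcount;
}
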
@@ -402,7 +422,7 @@ void free_swap_and_cache(swp_entry_t entry)
402 if (p) { 422 if (p) {
403 if (swap_entry_free(p, swp_offset(entry)) == 1) { 423 if (swap_entry_free(p, swp_offset(entry)) == 1) {
404 page = find_get_page(&swapper_space, entry.val); 424 page = find_get_page(&swapper_space, entry.val);
405 if (page && unlikely(TestSetPageLocked(page))) { 425 if (page && !trylock_page(page)) {
406 page_cache_release(page); 426 page_cache_release(page);
407 page = NULL; 427 page = NULL;
408 } 428 }
@@ -655,8 +675,8 @@ static int unuse_mm(struct mm_struct *mm,
655 675
656 if (!down_read_trylock(&mm->mmap_sem)) { 676 if (!down_read_trylock(&mm->mmap_sem)) {
657 /* 677 /*
658 * Activate page so shrink_cache is unlikely to unmap its 678 * Activate page so shrink_inactive_list is unlikely to unmap
659 * ptes while lock is dropped, so swapoff can make progress. 679 * its ptes while lock is dropped, so swapoff can make progress.
660 */ 680 */
661 activate_page(page); 681 activate_page(page);
662 unlock_page(page); 682 unlock_page(page);
@@ -1260,6 +1280,11 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1260 /* just pick something that's safe... */ 1280 /* just pick something that's safe... */
1261 swap_list.next = swap_list.head; 1281 swap_list.next = swap_list.head;
1262 } 1282 }
1283 if (p->prio < 0) {
1284 for (i = p->next; i >= 0; i = swap_info[i].next)
1285 swap_info[i].prio = p->prio--;
1286 least_priority++;
1287 }
1263 nr_swap_pages -= p->pages; 1288 nr_swap_pages -= p->pages;
1264 total_swap_pages -= p->pages; 1289 total_swap_pages -= p->pages;
1265 p->flags &= ~SWP_WRITEOK; 1290 p->flags &= ~SWP_WRITEOK;
@@ -1272,9 +1297,14 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1272 if (err) { 1297 if (err) {
1273 /* re-insert swap space back into swap_list */ 1298 /* re-insert swap space back into swap_list */
1274 spin_lock(&swap_lock); 1299 spin_lock(&swap_lock);
1275 for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) 1300 if (p->prio < 0)
1301 p->prio = --least_priority;
1302 prev = -1;
1303 for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
1276 if (p->prio >= swap_info[i].prio) 1304 if (p->prio >= swap_info[i].prio)
1277 break; 1305 break;
1306 prev = i;
1307 }
1278 p->next = i; 1308 p->next = i;
1279 if (prev < 0) 1309 if (prev < 0)
1280 swap_list.head = swap_list.next = p - swap_info; 1310 swap_list.head = swap_list.next = p - swap_info;
@@ -1447,7 +1477,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1447 unsigned int type; 1477 unsigned int type;
1448 int i, prev; 1478 int i, prev;
1449 int error; 1479 int error;
1450 static int least_priority;
1451 union swap_header *swap_header = NULL; 1480 union swap_header *swap_header = NULL;
1452 int swap_header_version; 1481 int swap_header_version;
1453 unsigned int nr_good_pages = 0; 1482 unsigned int nr_good_pages = 0;
@@ -1455,7 +1484,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1455 sector_t span; 1484 sector_t span;
1456 unsigned long maxpages = 1; 1485 unsigned long maxpages = 1;
1457 int swapfilesize; 1486 int swapfilesize;
1458 unsigned short *swap_map; 1487 unsigned short *swap_map = NULL;
1459 struct page *page = NULL; 1488 struct page *page = NULL;
1460 struct inode *inode = NULL; 1489 struct inode *inode = NULL;
1461 int did_down = 0; 1490 int did_down = 0;
@@ -1474,22 +1503,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1474 } 1503 }
1475 if (type >= nr_swapfiles) 1504 if (type >= nr_swapfiles)
1476 nr_swapfiles = type+1; 1505 nr_swapfiles = type+1;
1506 memset(p, 0, sizeof(*p));
1477 INIT_LIST_HEAD(&p->extent_list); 1507 INIT_LIST_HEAD(&p->extent_list);
1478 p->flags = SWP_USED; 1508 p->flags = SWP_USED;
1479 p->swap_file = NULL;
1480 p->old_block_size = 0;
1481 p->swap_map = NULL;
1482 p->lowest_bit = 0;
1483 p->highest_bit = 0;
1484 p->cluster_nr = 0;
1485 p->inuse_pages = 0;
1486 p->next = -1; 1509 p->next = -1;
1487 if (swap_flags & SWAP_FLAG_PREFER) {
1488 p->prio =
1489 (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
1490 } else {
1491 p->prio = --least_priority;
1492 }
1493 spin_unlock(&swap_lock); 1510 spin_unlock(&swap_lock);
1494 name = getname(specialfile); 1511 name = getname(specialfile);
1495 error = PTR_ERR(name); 1512 error = PTR_ERR(name);
@@ -1632,19 +1649,20 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1632 goto bad_swap; 1649 goto bad_swap;
1633 1650
1634 /* OK, set up the swap map and apply the bad block list */ 1651 /* OK, set up the swap map and apply the bad block list */
1635 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { 1652 swap_map = vmalloc(maxpages * sizeof(short));
1653 if (!swap_map) {
1636 error = -ENOMEM; 1654 error = -ENOMEM;
1637 goto bad_swap; 1655 goto bad_swap;
1638 } 1656 }
1639 1657
1640 error = 0; 1658 error = 0;
1641 memset(p->swap_map, 0, maxpages * sizeof(short)); 1659 memset(swap_map, 0, maxpages * sizeof(short));
1642 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1660 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1643 int page_nr = swap_header->info.badpages[i]; 1661 int page_nr = swap_header->info.badpages[i];
1644 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) 1662 if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
1645 error = -EINVAL; 1663 error = -EINVAL;
1646 else 1664 else
1647 p->swap_map[page_nr] = SWAP_MAP_BAD; 1665 swap_map[page_nr] = SWAP_MAP_BAD;
1648 } 1666 }
1649 nr_good_pages = swap_header->info.last_page - 1667 nr_good_pages = swap_header->info.last_page -
1650 swap_header->info.nr_badpages - 1668 swap_header->info.nr_badpages -
@@ -1654,7 +1672,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1654 } 1672 }
1655 1673
1656 if (nr_good_pages) { 1674 if (nr_good_pages) {
1657 p->swap_map[0] = SWAP_MAP_BAD; 1675 swap_map[0] = SWAP_MAP_BAD;
1658 p->max = maxpages; 1676 p->max = maxpages;
1659 p->pages = nr_good_pages; 1677 p->pages = nr_good_pages;
1660 nr_extents = setup_swap_extents(p, &span); 1678 nr_extents = setup_swap_extents(p, &span);
@@ -1672,6 +1690,12 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1672 1690
1673 mutex_lock(&swapon_mutex); 1691 mutex_lock(&swapon_mutex);
1674 spin_lock(&swap_lock); 1692 spin_lock(&swap_lock);
1693 if (swap_flags & SWAP_FLAG_PREFER)
1694 p->prio =
1695 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
1696 else
1697 p->prio = --least_priority;
1698 p->swap_map = swap_map;
1675 p->flags = SWP_ACTIVE; 1699 p->flags = SWP_ACTIVE;
1676 nr_swap_pages += nr_good_pages; 1700 nr_swap_pages += nr_good_pages;
1677 total_swap_pages += nr_good_pages; 1701 total_swap_pages += nr_good_pages;
@@ -1707,12 +1731,8 @@ bad_swap:
1707 destroy_swap_extents(p); 1731 destroy_swap_extents(p);
1708bad_swap_2: 1732bad_swap_2:
1709 spin_lock(&swap_lock); 1733 spin_lock(&swap_lock);
1710 swap_map = p->swap_map;
1711 p->swap_file = NULL; 1734 p->swap_file = NULL;
1712 p->swap_map = NULL;
1713 p->flags = 0; 1735 p->flags = 0;
1714 if (!(swap_flags & SWAP_FLAG_PREFER))
1715 ++least_priority;
1716 spin_unlock(&swap_lock); 1736 spin_unlock(&swap_lock);
1717 vfree(swap_map); 1737 vfree(swap_map);
1718 if (swap_file) 1738 if (swap_file)
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index ae532f501943..3e67d575ee6e 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -65,36 +65,37 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
65 if (!dentry) 65 if (!dentry)
66 goto put_memory; 66 goto put_memory;
67 67
68 error = -ENFILE;
69 file = get_empty_filp();
70 if (!file)
71 goto put_dentry;
72
68 error = -ENOSPC; 73 error = -ENOSPC;
69 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); 74 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
70 if (!inode) 75 if (!inode)
71 goto put_dentry; 76 goto close_file;
72 77
73 d_instantiate(dentry, inode); 78 d_instantiate(dentry, inode);
74 error = -ENFILE; 79 inode->i_size = size;
75 file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
76 &ramfs_file_operations);
77 if (!file)
78 goto put_dentry;
79
80 inode->i_nlink = 0; /* It is unlinked */ 80 inode->i_nlink = 0; /* It is unlinked */
81 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
82 &ramfs_file_operations);
81 83
82 /* notify everyone as to the change of file size */ 84#ifndef CONFIG_MMU
83 error = do_truncate(dentry, size, 0, file); 85 error = ramfs_nommu_expand_for_mapping(inode, size);
84 if (error < 0) 86 if (error)
85 goto close_file; 87 goto close_file;
86 88#endif
87 return file; 89 return file;
88 90
89close_file: 91close_file:
90 put_filp(file); 92 put_filp(file);
91 return ERR_PTR(error);
92
93put_dentry: 93put_dentry:
94 dput(dentry); 94 dput(dentry);
95put_memory: 95put_memory:
96 return ERR_PTR(error); 96 return ERR_PTR(error);
97} 97}
98EXPORT_SYMBOL_GPL(shmem_file_setup);
98 99
99/** 100/**
100 * shmem_zero_setup - setup a shared anonymous mapping 101 * shmem_zero_setup - setup a shared anonymous mapping
diff --git a/mm/truncate.c b/mm/truncate.c
index b8961cb63414..1229211104f8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds 4 * Copyright (C) 2002, Linus Torvalds
5 * 5 *
6 * 10Sep2002 akpm@zip.com.au 6 * 10Sep2002 Andrew Morton
7 * Initial version. 7 * Initial version.
8 */ 8 */
9 9
@@ -18,6 +18,7 @@
18#include <linux/task_io_accounting_ops.h> 18#include <linux/task_io_accounting_ops.h>
19#include <linux/buffer_head.h> /* grr. try_to_release_page, 19#include <linux/buffer_head.h> /* grr. try_to_release_page,
20 do_invalidatepage */ 20 do_invalidatepage */
21#include "internal.h"
21 22
22 23
23/** 24/**
@@ -103,8 +104,8 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
103 104
104 cancel_dirty_page(page, PAGE_CACHE_SIZE); 105 cancel_dirty_page(page, PAGE_CACHE_SIZE);
105 106
107 clear_page_mlock(page);
106 remove_from_page_cache(page); 108 remove_from_page_cache(page);
107 ClearPageUptodate(page);
108 ClearPageMappedToDisk(page); 109 ClearPageMappedToDisk(page);
109 page_cache_release(page); /* pagecache ref */ 110 page_cache_release(page); /* pagecache ref */
110} 111}
@@ -128,6 +129,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
128 if (PagePrivate(page) && !try_to_release_page(page, 0)) 129 if (PagePrivate(page) && !try_to_release_page(page, 0))
129 return 0; 130 return 0;
130 131
132 clear_page_mlock(page);
131 ret = remove_mapping(mapping, page); 133 ret = remove_mapping(mapping, page);
132 134
133 return ret; 135 return ret;
@@ -188,7 +190,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
188 if (page_index > next) 190 if (page_index > next)
189 next = page_index; 191 next = page_index;
190 next++; 192 next++;
191 if (TestSetPageLocked(page)) 193 if (!trylock_page(page))
192 continue; 194 continue;
193 if (PageWriteback(page)) { 195 if (PageWriteback(page)) {
194 unlock_page(page); 196 unlock_page(page);
@@ -281,7 +283,7 @@ unsigned long __invalidate_mapping_pages(struct address_space *mapping,
281 pgoff_t index; 283 pgoff_t index;
282 int lock_failed; 284 int lock_failed;
283 285
284 lock_failed = TestSetPageLocked(page); 286 lock_failed = !trylock_page(page);
285 287
286 /* 288 /*
287 * We really shouldn't be looking at the ->index of an 289 * We really shouldn't be looking at the ->index of an
@@ -349,18 +351,18 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
349 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) 351 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
350 return 0; 352 return 0;
351 353
352 write_lock_irq(&mapping->tree_lock); 354 spin_lock_irq(&mapping->tree_lock);
353 if (PageDirty(page)) 355 if (PageDirty(page))
354 goto failed; 356 goto failed;
355 357
358 clear_page_mlock(page);
356 BUG_ON(PagePrivate(page)); 359 BUG_ON(PagePrivate(page));
357 __remove_from_page_cache(page); 360 __remove_from_page_cache(page);
358 write_unlock_irq(&mapping->tree_lock); 361 spin_unlock_irq(&mapping->tree_lock);
359 ClearPageUptodate(page);
360 page_cache_release(page); /* pagecache ref */ 362 page_cache_release(page); /* pagecache ref */
361 return 1; 363 return 1;
362failed: 364failed:
363 write_unlock_irq(&mapping->tree_lock); 365 spin_unlock_irq(&mapping->tree_lock);
364 return 0; 366 return 0;
365} 367}
366 368
@@ -382,7 +384,7 @@ static int do_launder_page(struct address_space *mapping, struct page *page)
382 * Any pages which are found to be mapped into pagetables are unmapped prior to 384 * Any pages which are found to be mapped into pagetables are unmapped prior to
383 * invalidation. 385 * invalidation.
384 * 386 *
385 * Returns -EIO if any pages could not be invalidated. 387 * Returns -EBUSY if any pages could not be invalidated.
386 */ 388 */
387int invalidate_inode_pages2_range(struct address_space *mapping, 389int invalidate_inode_pages2_range(struct address_space *mapping,
388 pgoff_t start, pgoff_t end) 390 pgoff_t start, pgoff_t end)
@@ -442,7 +444,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
442 ret2 = do_launder_page(mapping, page); 444 ret2 = do_launder_page(mapping, page);
443 if (ret2 == 0) { 445 if (ret2 == 0) {
444 if (!invalidate_complete_page2(mapping, page)) 446 if (!invalidate_complete_page2(mapping, page))
445 ret2 = -EIO; 447 ret2 = -EBUSY;
446 } 448 }
447 if (ret2 < 0) 449 if (ret2 < 0)
448 ret = ret2; 450 ret = ret2;
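/*
 * A minimal userspace model (sketch only; plain bools, no atomicity and no
 * lock waitqueue) of the locking-predicate change applied throughout this
 * series: the old TestSetPageLocked() returned the previous lock bit, so a
 * nonzero result meant the page was already locked, while the new
 * trylock_page() returns nonzero only when the lock was taken. Hence
 * "if (TestSetPageLocked(page))" becomes "if (!trylock_page(page))".
 */
#include <assert.h>
#include <stdbool.h>

struct fake_page { bool locked; };

static bool test_set_locked(struct fake_page *p)	/* old style: returns previous bit */
{
	bool was = p->locked;
	p->locked = true;
	return was;
}

static bool trylock(struct fake_page *p)		/* new style: returns true on success */
{
	if (p->locked)
		return false;
	p->locked = true;
	return true;
}

int main(void)
{
	struct fake_page free_page = { .locked = false };
	struct fake_page busy_page = { .locked = true };

	/* old predicate: TestSetPageLocked() != 0 means the lock attempt failed */
	assert(test_set_locked(&busy_page));
	/* new predicate: !trylock_page() means the same thing */
	assert(!trylock(&busy_page));

	/* and both report success (predicate false) on an unlocked page */
	assert(!test_set_locked(&free_page));
	free_page.locked = false;
	assert(trylock(&free_page));
	return 0;
}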
diff --git a/mm/util.c b/mm/util.c
index 8f18683825bc..cb00b748ce47 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,7 +1,9 @@
1#include <linux/mm.h>
1#include <linux/slab.h> 2#include <linux/slab.h>
2#include <linux/string.h> 3#include <linux/string.h>
3#include <linux/module.h> 4#include <linux/module.h>
4#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h>
5#include <asm/uaccess.h> 7#include <asm/uaccess.h>
6 8
7/** 9/**
@@ -68,25 +70,22 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
68EXPORT_SYMBOL(kmemdup); 70EXPORT_SYMBOL(kmemdup);
69 71
70/** 72/**
71 * krealloc - reallocate memory. The contents will remain unchanged. 73 * __krealloc - like krealloc() but don't free @p.
72 * @p: object to reallocate memory for. 74 * @p: object to reallocate memory for.
73 * @new_size: how many bytes of memory are required. 75 * @new_size: how many bytes of memory are required.
74 * @flags: the type of memory to allocate. 76 * @flags: the type of memory to allocate.
75 * 77 *
76 * The contents of the object pointed to are preserved up to the 78 * This function is like krealloc() except it never frees the originally
77 * lesser of the new and old sizes. If @p is %NULL, krealloc() 79 * allocated buffer. Use this if you don't want to free the buffer immediately
78 * behaves exactly like kmalloc(). If @size is 0 and @p is not a 80 * like, for example, with RCU.
79 * %NULL pointer, the object pointed to is freed.
80 */ 81 */
81void *krealloc(const void *p, size_t new_size, gfp_t flags) 82void *__krealloc(const void *p, size_t new_size, gfp_t flags)
82{ 83{
83 void *ret; 84 void *ret;
84 size_t ks = 0; 85 size_t ks = 0;
85 86
86 if (unlikely(!new_size)) { 87 if (unlikely(!new_size))
87 kfree(p);
88 return ZERO_SIZE_PTR; 88 return ZERO_SIZE_PTR;
89 }
90 89
91 if (p) 90 if (p)
92 ks = ksize(p); 91 ks = ksize(p);
@@ -95,10 +94,37 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
95 return (void *)p; 94 return (void *)p;
96 95
97 ret = kmalloc_track_caller(new_size, flags); 96 ret = kmalloc_track_caller(new_size, flags);
98 if (ret && p) { 97 if (ret && p)
99 memcpy(ret, p, ks); 98 memcpy(ret, p, ks);
99
100 return ret;
101}
102EXPORT_SYMBOL(__krealloc);
103
104/**
105 * krealloc - reallocate memory. The contents will remain unchanged.
106 * @p: object to reallocate memory for.
107 * @new_size: how many bytes of memory are required.
108 * @flags: the type of memory to allocate.
109 *
110 * The contents of the object pointed to are preserved up to the
111 * lesser of the new and old sizes. If @p is %NULL, krealloc()
112 * behaves exactly like kmalloc(). If @size is 0 and @p is not a
113 * %NULL pointer, the object pointed to is freed.
114 */
115void *krealloc(const void *p, size_t new_size, gfp_t flags)
116{
117 void *ret;
118
119 if (unlikely(!new_size)) {
100 kfree(p); 120 kfree(p);
121 return ZERO_SIZE_PTR;
101 } 122 }
123
124 ret = __krealloc(p, new_size, flags);
125 if (ret && p != ret)
126 kfree(p);
127
102 return ret; 128 return ret;
103} 129}
104EXPORT_SYMBOL(krealloc); 130EXPORT_SYMBOL(krealloc);
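/*
 * Minimal userspace model (sketch only; standard C library instead of the
 * slab allocator, names grow_no_free()/grow() are invented) of the
 * __krealloc()/krealloc() split introduced above: __krealloc() never frees
 * the old buffer, which is what an RCU user wants, and krealloc() is layered
 * on top of it, freeing the old buffer only when a different one was handed
 * back. The kernel version can also return the old pointer unchanged when
 * the existing slab object is already big enough, which is why krealloc()
 * checks ret != p before freeing.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* grow-only reallocation: never frees old_buf (caller may still need it) */
static void *grow_no_free(const void *old_buf, size_t old_size, size_t new_size)
{
	void *ret;

	if (new_size == 0)
		return NULL;			/* the kernel returns ZERO_SIZE_PTR here */

	ret = malloc(new_size);
	if (ret && old_buf)
		memcpy(ret, old_buf, old_size < new_size ? old_size : new_size);
	return ret;
}

/* conventional realloc-like wrapper, mirroring the new krealloc() structure */
static void *grow(void *old_buf, size_t old_size, size_t new_size)
{
	void *ret;

	if (new_size == 0) {
		free(old_buf);
		return NULL;
	}
	ret = grow_no_free(old_buf, old_size, new_size);
	if (ret && ret != old_buf)
		free(old_buf);
	return ret;
}

int main(void)
{
	char *p = grow(NULL, 0, 8);

	snprintf(p, 8, "abc");
	p = grow(p, 8, 32);			/* old buffer freed for us */
	printf("%s\n", p);
	free(p);
	return 0;
}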
@@ -136,3 +162,27 @@ char *strndup_user(const char __user *s, long n)
136 return p; 162 return p;
137} 163}
138EXPORT_SYMBOL(strndup_user); 164EXPORT_SYMBOL(strndup_user);
165
166#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT
167void arch_pick_mmap_layout(struct mm_struct *mm)
168{
169 mm->mmap_base = TASK_UNMAPPED_BASE;
170 mm->get_unmapped_area = arch_get_unmapped_area;
171 mm->unmap_area = arch_unmap_area;
172}
173#endif
174
175int __attribute__((weak)) get_user_pages_fast(unsigned long start,
176 int nr_pages, int write, struct page **pages)
177{
178 struct mm_struct *mm = current->mm;
179 int ret;
180
181 down_read(&mm->mmap_sem);
182 ret = get_user_pages(current, mm, start, nr_pages,
183 write, 0, pages, NULL);
184 up_read(&mm->mmap_sem);
185
186 return ret;
187}
188EXPORT_SYMBOL_GPL(get_user_pages_fast);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6e45b0f3d125..036536945dd9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -8,26 +8,28 @@
8 * Numa awareness, Christoph Lameter, SGI, June 2005 8 * Numa awareness, Christoph Lameter, SGI, June 2005
9 */ 9 */
10 10
11#include <linux/vmalloc.h>
11#include <linux/mm.h> 12#include <linux/mm.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/highmem.h> 14#include <linux/highmem.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/spinlock.h> 16#include <linux/spinlock.h>
16#include <linux/interrupt.h> 17#include <linux/interrupt.h>
18#include <linux/proc_fs.h>
17#include <linux/seq_file.h> 19#include <linux/seq_file.h>
18#include <linux/debugobjects.h> 20#include <linux/debugobjects.h>
19#include <linux/vmalloc.h>
20#include <linux/kallsyms.h> 21#include <linux/kallsyms.h>
22#include <linux/list.h>
23#include <linux/rbtree.h>
24#include <linux/radix-tree.h>
25#include <linux/rcupdate.h>
21 26
27#include <asm/atomic.h>
22#include <asm/uaccess.h> 28#include <asm/uaccess.h>
23#include <asm/tlbflush.h> 29#include <asm/tlbflush.h>
24 30
25 31
26DEFINE_RWLOCK(vmlist_lock); 32/*** Page table manipulation functions ***/
27struct vm_struct *vmlist;
28
29static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
30 int node, void *caller);
31 33
32static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) 34static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
33{ 35{
@@ -40,8 +42,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
40 } while (pte++, addr += PAGE_SIZE, addr != end); 42 } while (pte++, addr += PAGE_SIZE, addr != end);
41} 43}
42 44
43static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, 45static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
44 unsigned long end)
45{ 46{
46 pmd_t *pmd; 47 pmd_t *pmd;
47 unsigned long next; 48 unsigned long next;
@@ -55,8 +56,7 @@ static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr,
55 } while (pmd++, addr = next, addr != end); 56 } while (pmd++, addr = next, addr != end);
56} 57}
57 58
58static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, 59static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
59 unsigned long end)
60{ 60{
61 pud_t *pud; 61 pud_t *pud;
62 unsigned long next; 62 unsigned long next;
@@ -70,12 +70,10 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
70 } while (pud++, addr = next, addr != end); 70 } while (pud++, addr = next, addr != end);
71} 71}
72 72
73void unmap_kernel_range(unsigned long addr, unsigned long size) 73static void vunmap_page_range(unsigned long addr, unsigned long end)
74{ 74{
75 pgd_t *pgd; 75 pgd_t *pgd;
76 unsigned long next; 76 unsigned long next;
77 unsigned long start = addr;
78 unsigned long end = addr + size;
79 77
80 BUG_ON(addr >= end); 78 BUG_ON(addr >= end);
81 pgd = pgd_offset_k(addr); 79 pgd = pgd_offset_k(addr);
@@ -86,35 +84,36 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
86 continue; 84 continue;
87 vunmap_pud_range(pgd, addr, next); 85 vunmap_pud_range(pgd, addr, next);
88 } while (pgd++, addr = next, addr != end); 86 } while (pgd++, addr = next, addr != end);
89 flush_tlb_kernel_range(start, end);
90}
91
92static void unmap_vm_area(struct vm_struct *area)
93{
94 unmap_kernel_range((unsigned long)area->addr, area->size);
95} 87}
96 88
97static int vmap_pte_range(pmd_t *pmd, unsigned long addr, 89static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
98 unsigned long end, pgprot_t prot, struct page ***pages) 90 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
99{ 91{
100 pte_t *pte; 92 pte_t *pte;
101 93
94 /*
95 * nr is a running index into the array which helps higher level
96 * callers keep track of where we're up to.
97 */
98
102 pte = pte_alloc_kernel(pmd, addr); 99 pte = pte_alloc_kernel(pmd, addr);
103 if (!pte) 100 if (!pte)
104 return -ENOMEM; 101 return -ENOMEM;
105 do { 102 do {
106 struct page *page = **pages; 103 struct page *page = pages[*nr];
107 WARN_ON(!pte_none(*pte)); 104
108 if (!page) 105 if (WARN_ON(!pte_none(*pte)))
106 return -EBUSY;
107 if (WARN_ON(!page))
109 return -ENOMEM; 108 return -ENOMEM;
110 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); 109 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
111 (*pages)++; 110 (*nr)++;
112 } while (pte++, addr += PAGE_SIZE, addr != end); 111 } while (pte++, addr += PAGE_SIZE, addr != end);
113 return 0; 112 return 0;
114} 113}
115 114
116static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, 115static int vmap_pmd_range(pud_t *pud, unsigned long addr,
117 unsigned long end, pgprot_t prot, struct page ***pages) 116 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
118{ 117{
119 pmd_t *pmd; 118 pmd_t *pmd;
120 unsigned long next; 119 unsigned long next;
@@ -124,14 +123,14 @@ static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
124 return -ENOMEM; 123 return -ENOMEM;
125 do { 124 do {
126 next = pmd_addr_end(addr, end); 125 next = pmd_addr_end(addr, end);
127 if (vmap_pte_range(pmd, addr, next, prot, pages)) 126 if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
128 return -ENOMEM; 127 return -ENOMEM;
129 } while (pmd++, addr = next, addr != end); 128 } while (pmd++, addr = next, addr != end);
130 return 0; 129 return 0;
131} 130}
132 131
133static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, 132static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
134 unsigned long end, pgprot_t prot, struct page ***pages) 133 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
135{ 134{
136 pud_t *pud; 135 pud_t *pud;
137 unsigned long next; 136 unsigned long next;
@@ -141,50 +140,78 @@ static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr,
141 return -ENOMEM; 140 return -ENOMEM;
142 do { 141 do {
143 next = pud_addr_end(addr, end); 142 next = pud_addr_end(addr, end);
144 if (vmap_pmd_range(pud, addr, next, prot, pages)) 143 if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
145 return -ENOMEM; 144 return -ENOMEM;
146 } while (pud++, addr = next, addr != end); 145 } while (pud++, addr = next, addr != end);
147 return 0; 146 return 0;
148} 147}
149 148
150int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) 149/*
150 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
151 * will have pfns corresponding to the "pages" array.
152 *
153 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
154 */
155static int vmap_page_range(unsigned long addr, unsigned long end,
156 pgprot_t prot, struct page **pages)
151{ 157{
152 pgd_t *pgd; 158 pgd_t *pgd;
153 unsigned long next; 159 unsigned long next;
154 unsigned long addr = (unsigned long) area->addr; 160 int err = 0;
155 unsigned long end = addr + area->size - PAGE_SIZE; 161 int nr = 0;
156 int err;
157 162
158 BUG_ON(addr >= end); 163 BUG_ON(addr >= end);
159 pgd = pgd_offset_k(addr); 164 pgd = pgd_offset_k(addr);
160 do { 165 do {
161 next = pgd_addr_end(addr, end); 166 next = pgd_addr_end(addr, end);
162 err = vmap_pud_range(pgd, addr, next, prot, pages); 167 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
163 if (err) 168 if (err)
164 break; 169 break;
165 } while (pgd++, addr = next, addr != end); 170 } while (pgd++, addr = next, addr != end);
166 flush_cache_vmap((unsigned long) area->addr, end); 171 flush_cache_vmap(addr, end);
167 return err; 172
173 if (unlikely(err))
174 return err;
175 return nr;
176}
177
178static inline int is_vmalloc_or_module_addr(const void *x)
179{
180 /*
181 * x86-64 and sparc64 put modules in a special place,
182 * and fall back on vmalloc() if that fails. Others
183 * just put it in the vmalloc space.
184 */
185#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
186 unsigned long addr = (unsigned long)x;
187 if (addr >= MODULES_VADDR && addr < MODULES_END)
188 return 1;
189#endif
190 return is_vmalloc_addr(x);
168} 191}
169EXPORT_SYMBOL_GPL(map_vm_area);
170 192
171/* 193/*
172 * Map a vmalloc()-space virtual address to the physical page. 194 * Walk a vmap address to the struct page it maps.
173 */ 195 */
174struct page *vmalloc_to_page(const void *vmalloc_addr) 196struct page *vmalloc_to_page(const void *vmalloc_addr)
175{ 197{
176 unsigned long addr = (unsigned long) vmalloc_addr; 198 unsigned long addr = (unsigned long) vmalloc_addr;
177 struct page *page = NULL; 199 struct page *page = NULL;
178 pgd_t *pgd = pgd_offset_k(addr); 200 pgd_t *pgd = pgd_offset_k(addr);
179 pud_t *pud; 201
180 pmd_t *pmd; 202 /*
181 pte_t *ptep, pte; 203 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
204 * architectures that do not vmalloc module space
205 */
206 VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
182 207
183 if (!pgd_none(*pgd)) { 208 if (!pgd_none(*pgd)) {
184 pud = pud_offset(pgd, addr); 209 pud_t *pud = pud_offset(pgd, addr);
185 if (!pud_none(*pud)) { 210 if (!pud_none(*pud)) {
186 pmd = pmd_offset(pud, addr); 211 pmd_t *pmd = pmd_offset(pud, addr);
187 if (!pmd_none(*pmd)) { 212 if (!pmd_none(*pmd)) {
213 pte_t *ptep, pte;
214
188 ptep = pte_offset_map(pmd, addr); 215 ptep = pte_offset_map(pmd, addr);
189 pte = *ptep; 216 pte = *ptep;
190 if (pte_present(pte)) 217 if (pte_present(pte))
@@ -206,13 +233,751 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
206} 233}
207EXPORT_SYMBOL(vmalloc_to_pfn); 234EXPORT_SYMBOL(vmalloc_to_pfn);
208 235
209static struct vm_struct * 236
210__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, 237/*** Global kva allocator ***/
211 unsigned long end, int node, gfp_t gfp_mask, void *caller) 238
239#define VM_LAZY_FREE 0x01
240#define VM_LAZY_FREEING 0x02
241#define VM_VM_AREA 0x04
242
243struct vmap_area {
244 unsigned long va_start;
245 unsigned long va_end;
246 unsigned long flags;
247 struct rb_node rb_node; /* address sorted rbtree */
248 struct list_head list; /* address sorted list */
249 struct list_head purge_list; /* "lazy purge" list */
250 void *private;
251 struct rcu_head rcu_head;
252};
253
254static DEFINE_SPINLOCK(vmap_area_lock);
255static struct rb_root vmap_area_root = RB_ROOT;
256static LIST_HEAD(vmap_area_list);
257
258static struct vmap_area *__find_vmap_area(unsigned long addr)
212{ 259{
213 struct vm_struct **p, *tmp, *area; 260 struct rb_node *n = vmap_area_root.rb_node;
214 unsigned long align = 1; 261
262 while (n) {
263 struct vmap_area *va;
264
265 va = rb_entry(n, struct vmap_area, rb_node);
266 if (addr < va->va_start)
267 n = n->rb_left;
268 else if (addr > va->va_start)
269 n = n->rb_right;
270 else
271 return va;
272 }
273
274 return NULL;
275}
276
277static void __insert_vmap_area(struct vmap_area *va)
278{
279 struct rb_node **p = &vmap_area_root.rb_node;
280 struct rb_node *parent = NULL;
281 struct rb_node *tmp;
282
283 while (*p) {
284 struct vmap_area *tmp;
285
286 parent = *p;
287 tmp = rb_entry(parent, struct vmap_area, rb_node);
288 if (va->va_start < tmp->va_end)
289 p = &(*p)->rb_left;
290 else if (va->va_end > tmp->va_start)
291 p = &(*p)->rb_right;
292 else
293 BUG();
294 }
295
296 rb_link_node(&va->rb_node, parent, p);
297 rb_insert_color(&va->rb_node, &vmap_area_root);
298
299 /* address-sort this list so it is usable like the vmlist */
300 tmp = rb_prev(&va->rb_node);
301 if (tmp) {
302 struct vmap_area *prev;
303 prev = rb_entry(tmp, struct vmap_area, rb_node);
304 list_add_rcu(&va->list, &prev->list);
305 } else
306 list_add_rcu(&va->list, &vmap_area_list);
307}
308
309static void purge_vmap_area_lazy(void);
310
311/*
312 * Allocate a region of KVA of the specified size and alignment, within the
313 * vstart and vend.
314 */
315static struct vmap_area *alloc_vmap_area(unsigned long size,
316 unsigned long align,
317 unsigned long vstart, unsigned long vend,
318 int node, gfp_t gfp_mask)
319{
320 struct vmap_area *va;
321 struct rb_node *n;
322 unsigned long addr;
323 int purged = 0;
324
325 BUG_ON(size & ~PAGE_MASK);
326
327 addr = ALIGN(vstart, align);
328
329 va = kmalloc_node(sizeof(struct vmap_area),
330 gfp_mask & GFP_RECLAIM_MASK, node);
331 if (unlikely(!va))
332 return ERR_PTR(-ENOMEM);
333
334retry:
335 spin_lock(&vmap_area_lock);
336 /* XXX: could have a last_hole cache */
337 n = vmap_area_root.rb_node;
338 if (n) {
339 struct vmap_area *first = NULL;
340
341 do {
342 struct vmap_area *tmp;
343 tmp = rb_entry(n, struct vmap_area, rb_node);
344 if (tmp->va_end >= addr) {
345 if (!first && tmp->va_start < addr + size)
346 first = tmp;
347 n = n->rb_left;
348 } else {
349 first = tmp;
350 n = n->rb_right;
351 }
352 } while (n);
353
354 if (!first)
355 goto found;
356
357 if (first->va_end < addr) {
358 n = rb_next(&first->rb_node);
359 if (n)
360 first = rb_entry(n, struct vmap_area, rb_node);
361 else
362 goto found;
363 }
364
365 while (addr + size >= first->va_start && addr + size <= vend) {
366 addr = ALIGN(first->va_end + PAGE_SIZE, align);
367
368 n = rb_next(&first->rb_node);
369 if (n)
370 first = rb_entry(n, struct vmap_area, rb_node);
371 else
372 goto found;
373 }
374 }
375found:
376 if (addr + size > vend) {
377 spin_unlock(&vmap_area_lock);
378 if (!purged) {
379 purge_vmap_area_lazy();
380 purged = 1;
381 goto retry;
382 }
383 if (printk_ratelimit())
384 printk(KERN_WARNING "vmap allocation failed: "
385 "use vmalloc=<size> to increase size.\n");
386 return ERR_PTR(-EBUSY);
387 }
388
389 BUG_ON(addr & (align-1));
390
391 va->va_start = addr;
392 va->va_end = addr + size;
393 va->flags = 0;
394 __insert_vmap_area(va);
395 spin_unlock(&vmap_area_lock);
396
397 return va;
398}
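/*
 * Sketch of the first-fit search that alloc_vmap_area() performs over the
 * address-sorted tree. This is a simplified userspace model: a sorted array
 * of busy [start, end) ranges instead of an rbtree, no locking, no guard
 * page, and no lazy purge-and-retry path.
 */
#include <stdio.h>

struct range { unsigned long start, end; };	/* existing allocations, sorted by start */

static unsigned long align_up(unsigned long x, unsigned long a)
{
	return (x + a - 1) & ~(a - 1);
}

/* returns an address in [vstart, vend) that overlaps no busy range, or 0 */
static unsigned long first_fit(const struct range *busy, int n,
			       unsigned long size, unsigned long align,
			       unsigned long vstart, unsigned long vend)
{
	unsigned long addr = align_up(vstart, align);

	for (int i = 0; i < n; i++) {
		if (addr + size <= busy[i].start)
			break;					/* hole before busy[i] is big enough */
		if (addr < busy[i].end)
			addr = align_up(busy[i].end, align);	/* skip past this allocation */
	}
	if (addr + size > vend)
		return 0;					/* the kernel purges lazy areas and retries */
	return addr;
}

int main(void)
{
	struct range busy[] = { { 0x1000, 0x3000 }, { 0x5000, 0x6000 } };

	printf("%#lx\n", first_fit(busy, 2, 0x2000, 0x1000, 0x1000, 0x10000)); /* 0x3000 */
	printf("%#lx\n", first_fit(busy, 2, 0x1000, 0x1000, 0x1000, 0x10000)); /* 0x3000 */
	return 0;
}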
399
400static void rcu_free_va(struct rcu_head *head)
401{
402 struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
403
404 kfree(va);
405}
406
407static void __free_vmap_area(struct vmap_area *va)
408{
409 BUG_ON(RB_EMPTY_NODE(&va->rb_node));
410 rb_erase(&va->rb_node, &vmap_area_root);
411 RB_CLEAR_NODE(&va->rb_node);
412 list_del_rcu(&va->list);
413
414 call_rcu(&va->rcu_head, rcu_free_va);
415}
416
417/*
418 * Free a region of KVA allocated by alloc_vmap_area
419 */
420static void free_vmap_area(struct vmap_area *va)
421{
422 spin_lock(&vmap_area_lock);
423 __free_vmap_area(va);
424 spin_unlock(&vmap_area_lock);
425}
426
427/*
428 * Clear the pagetable entries of a given vmap_area
429 */
430static void unmap_vmap_area(struct vmap_area *va)
431{
432 vunmap_page_range(va->va_start, va->va_end);
433}
434
435/*
436 * lazy_max_pages is the maximum amount of virtual address space we gather up
437 * before attempting to purge with a TLB flush.
438 *
439 * There is a tradeoff here: a larger number will cover more kernel page tables
440 * and take slightly longer to purge, but it will linearly reduce the number of
441 * global TLB flushes that must be performed. It would seem natural to scale
442 * this number up linearly with the number of CPUs (because vmapping activity
443 * could also scale linearly with the number of CPUs), however it is likely
444 * that in practice, workloads might be constrained in other ways that mean
445 * vmap activity will not scale linearly with CPUs. Also, I want to be
446 * conservative and not introduce a big latency on huge systems, so go with
447 * a less aggressive log scale. It will still be an improvement over the old
448 * code, and it will be simple to change the scale factor if we find that it
449 * becomes a problem on bigger systems.
450 */
451static unsigned long lazy_max_pages(void)
452{
453 unsigned int log;
454
455 log = fls(num_online_cpus());
456
457 return log * (32UL * 1024 * 1024 / PAGE_SIZE);
458}
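/*
 * Sketch of the lazy-purge threshold above as a userspace model, assuming
 * 4K pages (PAGE_SIZE is per-arch): the amount of stale KVA allowed to
 * accumulate before a global TLB flush grows only logarithmically with the
 * CPU count, 32MB worth of pages per log2 step.
 */
#include <stdio.h>

#define MODEL_PAGE_SIZE 4096UL			/* assumption, arch dependent */

static unsigned int fls_model(unsigned int x)	/* position of highest set bit, 1-based */
{
	unsigned int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

static unsigned long lazy_max_pages_model(unsigned int online_cpus)
{
	return fls_model(online_cpus) * (32UL * 1024 * 1024 / MODEL_PAGE_SIZE);
}

int main(void)
{
	for (unsigned int cpus = 1; cpus <= 64; cpus *= 4)
		printf("%2u cpus -> purge after %lu lazily freed pages (%lu MB)\n",
		       cpus, lazy_max_pages_model(cpus),
		       lazy_max_pages_model(cpus) * MODEL_PAGE_SIZE >> 20);
	return 0;
}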
459
460static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
461
462/*
463 * Purges all lazily-freed vmap areas.
464 *
465 * If sync is 0 then don't purge if there is already a purge in progress.
466 * If force_flush is 1, then flush kernel TLBs between *start and *end even
467 * if we found no lazy vmap areas to unmap (callers can use this to optimise
468 * their own TLB flushing).
469 * Returns with *start = min(*start, lowest purged address)
470 * *end = max(*end, highest purged address)
471 */
472static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
473 int sync, int force_flush)
474{
475 static DEFINE_SPINLOCK(purge_lock);
476 LIST_HEAD(valist);
477 struct vmap_area *va;
478 int nr = 0;
479
480 /*
481 * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
482 * should not expect such behaviour. This just simplifies locking for
483 * the case that isn't actually used at the moment anyway.
484 */
485 if (!sync && !force_flush) {
486 if (!spin_trylock(&purge_lock))
487 return;
488 } else
489 spin_lock(&purge_lock);
490
491 rcu_read_lock();
492 list_for_each_entry_rcu(va, &vmap_area_list, list) {
493 if (va->flags & VM_LAZY_FREE) {
494 if (va->va_start < *start)
495 *start = va->va_start;
496 if (va->va_end > *end)
497 *end = va->va_end;
498 nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
499 unmap_vmap_area(va);
500 list_add_tail(&va->purge_list, &valist);
501 va->flags |= VM_LAZY_FREEING;
502 va->flags &= ~VM_LAZY_FREE;
503 }
504 }
505 rcu_read_unlock();
506
507 if (nr) {
508 BUG_ON(nr > atomic_read(&vmap_lazy_nr));
509 atomic_sub(nr, &vmap_lazy_nr);
510 }
511
512 if (nr || force_flush)
513 flush_tlb_kernel_range(*start, *end);
514
515 if (nr) {
516 spin_lock(&vmap_area_lock);
517 list_for_each_entry(va, &valist, purge_list)
518 __free_vmap_area(va);
519 spin_unlock(&vmap_area_lock);
520 }
521 spin_unlock(&purge_lock);
522}
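/*
 * Sketch of the range accumulation done by __purge_vmap_area_lazy() above
 * (userspace model, no RCU or locking): rather than flushing per area, widen
 * a single [start, end) window over every lazily freed region and issue one
 * TLB flush covering all of them.
 */
#include <limits.h>
#include <stdio.h>

struct lazy_area { unsigned long start, end; int lazy_free; };

static void purge(struct lazy_area *areas, int n)
{
	unsigned long start = ULONG_MAX, end = 0;
	int nr = 0;

	for (int i = 0; i < n; i++) {
		if (!areas[i].lazy_free)
			continue;
		if (areas[i].start < start)
			start = areas[i].start;
		if (areas[i].end > end)
			end = areas[i].end;
		areas[i].lazy_free = 0;
		nr++;
	}
	if (nr)		/* one flush covers every purged area */
		printf("flush_tlb_kernel_range(%#lx, %#lx) for %d areas\n",
		       start, end, nr);
}

int main(void)
{
	struct lazy_area areas[] = {
		{ 0xa000, 0xc000, 1 },
		{ 0x4000, 0x5000, 0 },
		{ 0x1000, 0x2000, 1 },
	};

	purge(areas, 3);	/* flushes [0x1000, 0xc000) once */
	return 0;
}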
523
524/*
525 * Kick off a purge of the outstanding lazy areas.
526 */
527static void purge_vmap_area_lazy(void)
528{
529 unsigned long start = ULONG_MAX, end = 0;
530
531 __purge_vmap_area_lazy(&start, &end, 0, 0);
532}
533
534/*
535 * Free and unmap a vmap area
536 */
537static void free_unmap_vmap_area(struct vmap_area *va)
538{
539 va->flags |= VM_LAZY_FREE;
540 atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
541 if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
542 purge_vmap_area_lazy();
543}
544
545static struct vmap_area *find_vmap_area(unsigned long addr)
546{
547 struct vmap_area *va;
548
549 spin_lock(&vmap_area_lock);
550 va = __find_vmap_area(addr);
551 spin_unlock(&vmap_area_lock);
552
553 return va;
554}
555
556static void free_unmap_vmap_area_addr(unsigned long addr)
557{
558 struct vmap_area *va;
559
560 va = find_vmap_area(addr);
561 BUG_ON(!va);
562 free_unmap_vmap_area(va);
563}
564
565
566/*** Per cpu kva allocator ***/
567
568/*
569 * vmap space is limited especially on 32 bit architectures. Ensure there is
570 * room for at least 16 percpu vmap blocks per CPU.
571 */
572/*
573 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
574 * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess
575 * instead (we just need a rough idea)
576 */
577#if BITS_PER_LONG == 32
578#define VMALLOC_SPACE (128UL*1024*1024)
579#else
580#define VMALLOC_SPACE (128UL*1024*1024*1024)
581#endif
582
583#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE)
584#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */
585#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */
586#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
587#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
588#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
589#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
590 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
591 VMALLOC_PAGES / NR_CPUS / 16))
592
593#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
594
595struct vmap_block_queue {
596 spinlock_t lock;
597 struct list_head free;
598 struct list_head dirty;
599 unsigned int nr_dirty;
600};
601
602struct vmap_block {
603 spinlock_t lock;
604 struct vmap_area *va;
605 struct vmap_block_queue *vbq;
606 unsigned long free, dirty;
607 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
608 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
609 union {
610 struct {
611 struct list_head free_list;
612 struct list_head dirty_list;
613 };
614 struct rcu_head rcu_head;
615 };
616};
617
618/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
619static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
620
621/*
622 * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
623 * in the free path. Could get rid of this if we change the API to return a
624 * "cookie" from alloc, to be passed to free. But no big deal yet.
625 */
626static DEFINE_SPINLOCK(vmap_block_tree_lock);
627static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
628
629/*
630 * We should probably have a fallback mechanism to allocate virtual memory
631 * out of partially filled vmap blocks. However vmap block sizing should be
632 * fairly reasonable according to the vmalloc size, so it shouldn't be a
633 * big problem.
634 */
635
636static unsigned long addr_to_vb_idx(unsigned long addr)
637{
638 addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
639 addr /= VMAP_BLOCK_SIZE;
640 return addr;
641}
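/*
 * Sketch of the per-cpu vmap block sizing above, as userspace arithmetic.
 * The 4K page size, the 128MB 32-bit VMALLOC_SPACE guess, VMALLOC_START and
 * NR_CPUS values are assumptions just to make the numbers concrete: the
 * bitmap per block is clamped between 2*VMAP_MAX_ALLOC and 1024 bits while
 * aiming for at least 16 blocks per CPU, and a vmap address maps to a block
 * index by subtracting an aligned base and dividing by the block size, as
 * addr_to_vb_idx() does.
 */
#include <stdio.h>

#define MODEL_PAGE_SIZE		4096UL
#define MODEL_VMALLOC_SPACE	(128UL * 1024 * 1024)	/* 32-bit guess from the code above */
#define MODEL_VMALLOC_START	0xf0000000UL		/* assumption, arch dependent */
#define MODEL_NR_CPUS		4
#define MODEL_MAX_ALLOC		32			/* BITS_PER_LONG on 32-bit */

#define MODEL_MIN(x, y)		((x) < (y) ? (x) : (y))
#define MODEL_MAX(x, y)		((x) > (y) ? (x) : (y))

int main(void)
{
	unsigned long vmalloc_pages = MODEL_VMALLOC_SPACE / MODEL_PAGE_SIZE;
	unsigned long bbmap_bits = MODEL_MIN(1024UL,
				   MODEL_MAX(2UL * MODEL_MAX_ALLOC,
					     vmalloc_pages / MODEL_NR_CPUS / 16));
	unsigned long block_size = bbmap_bits * MODEL_PAGE_SIZE;
	unsigned long addr = MODEL_VMALLOC_START + 5 * block_size + 3 * MODEL_PAGE_SIZE;

	/* same computation as addr_to_vb_idx(): offset from an aligned base, then divide */
	unsigned long idx = (addr - (MODEL_VMALLOC_START & ~(block_size - 1))) / block_size;

	printf("bitmap bits per block: %lu, block size: %lu KB, idx(addr) = %lu\n",
	       bbmap_bits, block_size >> 10, idx);
	return 0;
}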
642
643static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
644{
645 struct vmap_block_queue *vbq;
646 struct vmap_block *vb;
647 struct vmap_area *va;
648 unsigned long vb_idx;
649 int node, err;
650
651 node = numa_node_id();
652
653 vb = kmalloc_node(sizeof(struct vmap_block),
654 gfp_mask & GFP_RECLAIM_MASK, node);
655 if (unlikely(!vb))
656 return ERR_PTR(-ENOMEM);
657
658 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
659 VMALLOC_START, VMALLOC_END,
660 node, gfp_mask);
661 if (unlikely(IS_ERR(va))) {
662 kfree(vb);
663 return ERR_PTR(PTR_ERR(va));
664 }
665
666 err = radix_tree_preload(gfp_mask);
667 if (unlikely(err)) {
668 kfree(vb);
669 free_vmap_area(va);
670 return ERR_PTR(err);
671 }
672
673 spin_lock_init(&vb->lock);
674 vb->va = va;
675 vb->free = VMAP_BBMAP_BITS;
676 vb->dirty = 0;
677 bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
678 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
679 INIT_LIST_HEAD(&vb->free_list);
680 INIT_LIST_HEAD(&vb->dirty_list);
681
682 vb_idx = addr_to_vb_idx(va->va_start);
683 spin_lock(&vmap_block_tree_lock);
684 err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
685 spin_unlock(&vmap_block_tree_lock);
686 BUG_ON(err);
687 radix_tree_preload_end();
688
689 vbq = &get_cpu_var(vmap_block_queue);
690 vb->vbq = vbq;
691 spin_lock(&vbq->lock);
692 list_add(&vb->free_list, &vbq->free);
693 spin_unlock(&vbq->lock);
694 put_cpu_var(vmap_cpu_blocks);
695
696 return vb;
697}
698
699static void rcu_free_vb(struct rcu_head *head)
700{
701 struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
702
703 kfree(vb);
704}
705
706static void free_vmap_block(struct vmap_block *vb)
707{
708 struct vmap_block *tmp;
709 unsigned long vb_idx;
710
711 spin_lock(&vb->vbq->lock);
712 if (!list_empty(&vb->free_list))
713 list_del(&vb->free_list);
714 if (!list_empty(&vb->dirty_list))
715 list_del(&vb->dirty_list);
716 spin_unlock(&vb->vbq->lock);
717
718 vb_idx = addr_to_vb_idx(vb->va->va_start);
719 spin_lock(&vmap_block_tree_lock);
720 tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
721 spin_unlock(&vmap_block_tree_lock);
722 BUG_ON(tmp != vb);
723
724 free_unmap_vmap_area(vb->va);
725 call_rcu(&vb->rcu_head, rcu_free_vb);
726}
727
728static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
729{
730 struct vmap_block_queue *vbq;
731 struct vmap_block *vb;
732 unsigned long addr = 0;
733 unsigned int order;
734
735 BUG_ON(size & ~PAGE_MASK);
736 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
737 order = get_order(size);
738
739again:
740 rcu_read_lock();
741 vbq = &get_cpu_var(vmap_block_queue);
742 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
743 int i;
744
745 spin_lock(&vb->lock);
746 i = bitmap_find_free_region(vb->alloc_map,
747 VMAP_BBMAP_BITS, order);
748
749 if (i >= 0) {
750 addr = vb->va->va_start + (i << PAGE_SHIFT);
751 BUG_ON(addr_to_vb_idx(addr) !=
752 addr_to_vb_idx(vb->va->va_start));
753 vb->free -= 1UL << order;
754 if (vb->free == 0) {
755 spin_lock(&vbq->lock);
756 list_del_init(&vb->free_list);
757 spin_unlock(&vbq->lock);
758 }
759 spin_unlock(&vb->lock);
760 break;
761 }
762 spin_unlock(&vb->lock);
763 }
764 put_cpu_var(vmap_cpu_blocks);
765 rcu_read_unlock();
766
767 if (!addr) {
768 vb = new_vmap_block(gfp_mask);
769 if (IS_ERR(vb))
770 return vb;
771 goto again;
772 }
773
774 return (void *)addr;
775}
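/*
 * Sketch of the order-sized bitmap allocation that vb_alloc() relies on: a
 * userspace stand-in for bitmap_find_free_region(), which claims a naturally
 * aligned run of 1 << order free bits and returns its offset, or -1 if no
 * such run exists. NBITS stands in for VMAP_BBMAP_BITS.
 */
#include <stdio.h>
#include <string.h>

#define NBITS 64

static int test_bit_model(const unsigned char *map, int bit)
{
	return map[bit / 8] & (1 << (bit % 8));
}

static void set_bit_model(unsigned char *map, int bit)
{
	map[bit / 8] |= 1 << (bit % 8);
}

static int find_free_region(unsigned char *map, int nbits, int order)
{
	int step = 1 << order;

	for (int pos = 0; pos + step <= nbits; pos += step) {	/* aligned to the region size */
		int busy = 0;

		for (int i = 0; i < step && !busy; i++)
			busy = test_bit_model(map, pos + i);
		if (!busy) {
			for (int i = 0; i < step; i++)
				set_bit_model(map, pos + i);
			return pos;
		}
	}
	return -1;
}

int main(void)
{
	unsigned char map[NBITS / 8];

	memset(map, 0, sizeof(map));
	printf("order 2 -> bit %d\n", find_free_region(map, NBITS, 2));	/* 0 */
	printf("order 0 -> bit %d\n", find_free_region(map, NBITS, 0));	/* 4 */
	printf("order 2 -> bit %d\n", find_free_region(map, NBITS, 2));	/* 8 */
	return 0;
}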
776
777static void vb_free(const void *addr, unsigned long size)
778{
779 unsigned long offset;
780 unsigned long vb_idx;
781 unsigned int order;
782 struct vmap_block *vb;
783
784 BUG_ON(size & ~PAGE_MASK);
785 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
786 order = get_order(size);
787
788 offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
789
790 vb_idx = addr_to_vb_idx((unsigned long)addr);
791 rcu_read_lock();
792 vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
793 rcu_read_unlock();
794 BUG_ON(!vb);
795
796 spin_lock(&vb->lock);
797 bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
798 if (!vb->dirty) {
799 spin_lock(&vb->vbq->lock);
800 list_add(&vb->dirty_list, &vb->vbq->dirty);
801 spin_unlock(&vb->vbq->lock);
802 }
803 vb->dirty += 1UL << order;
804 if (vb->dirty == VMAP_BBMAP_BITS) {
805 BUG_ON(vb->free || !list_empty(&vb->free_list));
806 spin_unlock(&vb->lock);
807 free_vmap_block(vb);
808 } else
809 spin_unlock(&vb->lock);
810}
811
812/**
813 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
814 *
815 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
816 * to amortize TLB flushing overheads. What this means is that any page you
817 * have now, may, in a former life, have been mapped into kernel virtual
818 * address by the vmap layer and so there might be some CPUs with TLB entries
819 * still referencing that page (additional to the regular 1:1 kernel mapping).
820 *
821 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
822 * be sure that none of the pages we have control over will have any aliases
823 * from the vmap layer.
824 */
825void vm_unmap_aliases(void)
826{
827 unsigned long start = ULONG_MAX, end = 0;
828 int cpu;
829 int flush = 0;
830
831 for_each_possible_cpu(cpu) {
832 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
833 struct vmap_block *vb;
834
835 rcu_read_lock();
836 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
837 int i;
838
839 spin_lock(&vb->lock);
840 i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
841 while (i < VMAP_BBMAP_BITS) {
842 unsigned long s, e;
843 int j;
844 j = find_next_zero_bit(vb->dirty_map,
845 VMAP_BBMAP_BITS, i);
846
847 s = vb->va->va_start + (i << PAGE_SHIFT);
848 e = vb->va->va_start + (j << PAGE_SHIFT);
849 vunmap_page_range(s, e);
850 flush = 1;
851
852 if (s < start)
853 start = s;
854 if (e > end)
855 end = e;
856
857 i = j;
858 i = find_next_bit(vb->dirty_map,
859 VMAP_BBMAP_BITS, i);
860 }
861 spin_unlock(&vb->lock);
862 }
863 rcu_read_unlock();
864 }
865
866 __purge_vmap_area_lazy(&start, &end, 1, flush);
867}
868EXPORT_SYMBOL_GPL(vm_unmap_aliases);
869
870/**
871 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
872 * @mem: the pointer returned by vm_map_ram
873 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
874 */
875void vm_unmap_ram(const void *mem, unsigned int count)
876{
877 unsigned long size = count << PAGE_SHIFT;
878 unsigned long addr = (unsigned long)mem;
879
880 BUG_ON(!addr);
881 BUG_ON(addr < VMALLOC_START);
882 BUG_ON(addr > VMALLOC_END);
883 BUG_ON(addr & (PAGE_SIZE-1));
884
885 debug_check_no_locks_freed(mem, size);
886
887 if (likely(count <= VMAP_MAX_ALLOC))
888 vb_free(mem, size);
889 else
890 free_unmap_vmap_area_addr(addr);
891}
892EXPORT_SYMBOL(vm_unmap_ram);
893
894/**
895 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
896 * @pages: an array of pointers to the pages to be mapped
897 * @count: number of pages
898 * @node: prefer to allocate data structures on this node
899 * @prot: memory protection to use. PAGE_KERNEL for regular RAM
900 * @returns: a pointer to the address that has been mapped, or NULL on failure
901 */
902void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
903{
904 unsigned long size = count << PAGE_SHIFT;
215 unsigned long addr; 905 unsigned long addr;
906 void *mem;
907
908 if (likely(count <= VMAP_MAX_ALLOC)) {
909 mem = vb_alloc(size, GFP_KERNEL);
910 if (IS_ERR(mem))
911 return NULL;
912 addr = (unsigned long)mem;
913 } else {
914 struct vmap_area *va;
915 va = alloc_vmap_area(size, PAGE_SIZE,
916 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
917 if (IS_ERR(va))
918 return NULL;
919
920 addr = va->va_start;
921 mem = (void *)addr;
922 }
923 if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
924 vm_unmap_ram(mem, count);
925 return NULL;
926 }
927 return mem;
928}
929EXPORT_SYMBOL(vm_map_ram);
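/*
 * Hedged usage sketch, not part of the patch: the intended pairing of the
 * new vm_map_ram()/vm_unmap_ram() interfaces documented above. The helper
 * names are invented, a node of -1 is used for "no preference", and error
 * handling plus the page allocation strategy are left to the caller.
 */
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *map_four_pages_example(struct page *pages[4])
{
	/* the count passed here must be remembered for vm_unmap_ram() */
	return vm_map_ram(pages, 4, -1, PAGE_KERNEL);
}

static void unmap_four_pages_example(void *mem)
{
	vm_unmap_ram(mem, 4);
}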
930
931void __init vmalloc_init(void)
932{
933 int i;
934
935 for_each_possible_cpu(i) {
936 struct vmap_block_queue *vbq;
937
938 vbq = &per_cpu(vmap_block_queue, i);
939 spin_lock_init(&vbq->lock);
940 INIT_LIST_HEAD(&vbq->free);
941 INIT_LIST_HEAD(&vbq->dirty);
942 vbq->nr_dirty = 0;
943 }
944}
945
946void unmap_kernel_range(unsigned long addr, unsigned long size)
947{
948 unsigned long end = addr + size;
949 vunmap_page_range(addr, end);
950 flush_tlb_kernel_range(addr, end);
951}
952
953int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
954{
955 unsigned long addr = (unsigned long)area->addr;
956 unsigned long end = addr + area->size - PAGE_SIZE;
957 int err;
958
959 err = vmap_page_range(addr, end, prot, *pages);
960 if (err > 0) {
961 *pages += err;
962 err = 0;
963 }
964
965 return err;
966}
967EXPORT_SYMBOL_GPL(map_vm_area);
968
969/*** Old vmalloc interfaces ***/
970DEFINE_RWLOCK(vmlist_lock);
971struct vm_struct *vmlist;
972
973static struct vm_struct *__get_vm_area_node(unsigned long size,
974 unsigned long flags, unsigned long start, unsigned long end,
975 int node, gfp_t gfp_mask, void *caller)
976{
977 static struct vmap_area *va;
978 struct vm_struct *area;
979 struct vm_struct *tmp, **p;
980 unsigned long align = 1;
216 981
217 BUG_ON(in_interrupt()); 982 BUG_ON(in_interrupt());
218 if (flags & VM_IOREMAP) { 983 if (flags & VM_IOREMAP) {
@@ -225,13 +990,12 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
225 990
226 align = 1ul << bit; 991 align = 1ul << bit;
227 } 992 }
228 addr = ALIGN(start, align); 993
229 size = PAGE_ALIGN(size); 994 size = PAGE_ALIGN(size);
230 if (unlikely(!size)) 995 if (unlikely(!size))
231 return NULL; 996 return NULL;
232 997
233 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 998 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
234
235 if (unlikely(!area)) 999 if (unlikely(!area))
236 return NULL; 1000 return NULL;
237 1001
@@ -240,48 +1004,32 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
240 */ 1004 */
241 size += PAGE_SIZE; 1005 size += PAGE_SIZE;
242 1006
243 write_lock(&vmlist_lock); 1007 va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
244 for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) { 1008 if (IS_ERR(va)) {
245 if ((unsigned long)tmp->addr < addr) { 1009 kfree(area);
246 if((unsigned long)tmp->addr + tmp->size >= addr) 1010 return NULL;
247 addr = ALIGN(tmp->size +
248 (unsigned long)tmp->addr, align);
249 continue;
250 }
251 if ((size + addr) < addr)
252 goto out;
253 if (size + addr <= (unsigned long)tmp->addr)
254 goto found;
255 addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align);
256 if (addr > end - size)
257 goto out;
258 } 1011 }
259 if ((size + addr) < addr)
260 goto out;
261 if (addr > end - size)
262 goto out;
263
264found:
265 area->next = *p;
266 *p = area;
267 1012
268 area->flags = flags; 1013 area->flags = flags;
269 area->addr = (void *)addr; 1014 area->addr = (void *)va->va_start;
270 area->size = size; 1015 area->size = size;
271 area->pages = NULL; 1016 area->pages = NULL;
272 area->nr_pages = 0; 1017 area->nr_pages = 0;
273 area->phys_addr = 0; 1018 area->phys_addr = 0;
274 area->caller = caller; 1019 area->caller = caller;
1020 va->private = area;
1021 va->flags |= VM_VM_AREA;
1022
1023 write_lock(&vmlist_lock);
1024 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1025 if (tmp->addr >= area->addr)
1026 break;
1027 }
1028 area->next = *p;
1029 *p = area;
275 write_unlock(&vmlist_lock); 1030 write_unlock(&vmlist_lock);
276 1031
277 return area; 1032 return area;
278
279out:
280 write_unlock(&vmlist_lock);
281 kfree(area);
282 if (printk_ratelimit())
283 printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n");
284 return NULL;
285} 1033}
286 1034
287struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 1035struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
@@ -321,39 +1069,15 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
321 gfp_mask, __builtin_return_address(0)); 1069 gfp_mask, __builtin_return_address(0));
322} 1070}
323 1071
324/* Caller must hold vmlist_lock */ 1072static struct vm_struct *find_vm_area(const void *addr)
325static struct vm_struct *__find_vm_area(const void *addr)
326{ 1073{
327 struct vm_struct *tmp; 1074 struct vmap_area *va;
328 1075
329 for (tmp = vmlist; tmp != NULL; tmp = tmp->next) { 1076 va = find_vmap_area((unsigned long)addr);
330 if (tmp->addr == addr) 1077 if (va && va->flags & VM_VM_AREA)
331 break; 1078 return va->private;
332 }
333 1079
334 return tmp;
335}
336
337/* Caller must hold vmlist_lock */
338static struct vm_struct *__remove_vm_area(const void *addr)
339{
340 struct vm_struct **p, *tmp;
341
342 for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) {
343 if (tmp->addr == addr)
344 goto found;
345 }
346 return NULL; 1080 return NULL;
347
348found:
349 unmap_vm_area(tmp);
350 *p = tmp->next;
351
352 /*
353 * Remove the guard page.
354 */
355 tmp->size -= PAGE_SIZE;
356 return tmp;
357} 1081}
358 1082
359/** 1083/**
@@ -366,11 +1090,24 @@ found:
366 */ 1090 */
367struct vm_struct *remove_vm_area(const void *addr) 1091struct vm_struct *remove_vm_area(const void *addr)
368{ 1092{
369 struct vm_struct *v; 1093 struct vmap_area *va;
370 write_lock(&vmlist_lock); 1094
371 v = __remove_vm_area(addr); 1095 va = find_vmap_area((unsigned long)addr);
372 write_unlock(&vmlist_lock); 1096 if (va && va->flags & VM_VM_AREA) {
373 return v; 1097 struct vm_struct *vm = va->private;
1098 struct vm_struct *tmp, **p;
1099 free_unmap_vmap_area(va);
1100 vm->size -= PAGE_SIZE;
1101
1102 write_lock(&vmlist_lock);
1103 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1104 ;
1105 *p = tmp->next;
1106 write_unlock(&vmlist_lock);
1107
1108 return vm;
1109 }
1110 return NULL;
374} 1111}
375 1112
376static void __vunmap(const void *addr, int deallocate_pages) 1113static void __vunmap(const void *addr, int deallocate_pages)
@@ -381,16 +1118,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
381 return; 1118 return;
382 1119
383 if ((PAGE_SIZE-1) & (unsigned long)addr) { 1120 if ((PAGE_SIZE-1) & (unsigned long)addr) {
384 printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); 1121 WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
385 WARN_ON(1);
386 return; 1122 return;
387 } 1123 }
388 1124
389 area = remove_vm_area(addr); 1125 area = remove_vm_area(addr);
390 if (unlikely(!area)) { 1126 if (unlikely(!area)) {
391 printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 1127 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
392 addr); 1128 addr);
393 WARN_ON(1);
394 return; 1129 return;
395 } 1130 }
396 1131
@@ -482,6 +1217,8 @@ void *vmap(struct page **pages, unsigned int count,
482} 1217}
483EXPORT_SYMBOL(vmap); 1218EXPORT_SYMBOL(vmap);
484 1219
1220static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1221 int node, void *caller);
485static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1222static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
486 pgprot_t prot, int node, void *caller) 1223 pgprot_t prot, int node, void *caller)
487{ 1224{
@@ -608,10 +1345,8 @@ void *vmalloc_user(unsigned long size)
608 1345
609 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); 1346 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
610 if (ret) { 1347 if (ret) {
611 write_lock(&vmlist_lock); 1348 area = find_vm_area(ret);
612 area = __find_vm_area(ret);
613 area->flags |= VM_USERMAP; 1349 area->flags |= VM_USERMAP;
614 write_unlock(&vmlist_lock);
615 } 1350 }
616 return ret; 1351 return ret;
617} 1352}
@@ -691,10 +1426,8 @@ void *vmalloc_32_user(unsigned long size)
691 1426
692 ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); 1427 ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL);
693 if (ret) { 1428 if (ret) {
694 write_lock(&vmlist_lock); 1429 area = find_vm_area(ret);
695 area = __find_vm_area(ret);
696 area->flags |= VM_USERMAP; 1430 area->flags |= VM_USERMAP;
697 write_unlock(&vmlist_lock);
698 } 1431 }
699 return ret; 1432 return ret;
700} 1433}
@@ -795,26 +1528,25 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
795 struct vm_struct *area; 1528 struct vm_struct *area;
796 unsigned long uaddr = vma->vm_start; 1529 unsigned long uaddr = vma->vm_start;
797 unsigned long usize = vma->vm_end - vma->vm_start; 1530 unsigned long usize = vma->vm_end - vma->vm_start;
798 int ret;
799 1531
800 if ((PAGE_SIZE-1) & (unsigned long)addr) 1532 if ((PAGE_SIZE-1) & (unsigned long)addr)
801 return -EINVAL; 1533 return -EINVAL;
802 1534
803 read_lock(&vmlist_lock); 1535 area = find_vm_area(addr);
804 area = __find_vm_area(addr);
805 if (!area) 1536 if (!area)
806 goto out_einval_locked; 1537 return -EINVAL;
807 1538
808 if (!(area->flags & VM_USERMAP)) 1539 if (!(area->flags & VM_USERMAP))
809 goto out_einval_locked; 1540 return -EINVAL;
810 1541
811 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) 1542 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
812 goto out_einval_locked; 1543 return -EINVAL;
813 read_unlock(&vmlist_lock);
814 1544
815 addr += pgoff << PAGE_SHIFT; 1545 addr += pgoff << PAGE_SHIFT;
816 do { 1546 do {
817 struct page *page = vmalloc_to_page(addr); 1547 struct page *page = vmalloc_to_page(addr);
1548 int ret;
1549
818 ret = vm_insert_page(vma, uaddr, page); 1550 ret = vm_insert_page(vma, uaddr, page);
819 if (ret) 1551 if (ret)
820 return ret; 1552 return ret;
@@ -827,11 +1559,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
827 /* Prevent "things" like memory migration? VM_flags need a cleanup... */ 1559 /* Prevent "things" like memory migration? VM_flags need a cleanup... */
828 vma->vm_flags |= VM_RESERVED; 1560 vma->vm_flags |= VM_RESERVED;
829 1561
830 return ret; 1562 return 0;
831
832out_einval_locked:
833 read_unlock(&vmlist_lock);
834 return -EINVAL;
835} 1563}
836EXPORT_SYMBOL(remap_vmalloc_range); 1564EXPORT_SYMBOL(remap_vmalloc_range);
837 1565
@@ -931,6 +1659,25 @@ static void s_stop(struct seq_file *m, void *p)
931 read_unlock(&vmlist_lock); 1659 read_unlock(&vmlist_lock);
932} 1660}
933 1661
1662static void show_numa_info(struct seq_file *m, struct vm_struct *v)
1663{
1664 if (NUMA_BUILD) {
1665 unsigned int nr, *counters = m->private;
1666
1667 if (!counters)
1668 return;
1669
1670 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
1671
1672 for (nr = 0; nr < v->nr_pages; nr++)
1673 counters[page_to_nid(v->pages[nr])]++;
1674
1675 for_each_node_state(nr, N_HIGH_MEMORY)
1676 if (counters[nr])
1677 seq_printf(m, " N%u=%u", nr, counters[nr]);
1678 }
1679}
1680
934static int s_show(struct seq_file *m, void *p) 1681static int s_show(struct seq_file *m, void *p)
935{ 1682{
936 struct vm_struct *v = p; 1683 struct vm_struct *v = p;
@@ -967,15 +1714,46 @@ static int s_show(struct seq_file *m, void *p)
967 if (v->flags & VM_VPAGES) 1714 if (v->flags & VM_VPAGES)
968 seq_printf(m, " vpages"); 1715 seq_printf(m, " vpages");
969 1716
1717 show_numa_info(m, v);
970 seq_putc(m, '\n'); 1718 seq_putc(m, '\n');
971 return 0; 1719 return 0;
972} 1720}
973 1721
974const struct seq_operations vmalloc_op = { 1722static const struct seq_operations vmalloc_op = {
975 .start = s_start, 1723 .start = s_start,
976 .next = s_next, 1724 .next = s_next,
977 .stop = s_stop, 1725 .stop = s_stop,
978 .show = s_show, 1726 .show = s_show,
979}; 1727};
1728
1729static int vmalloc_open(struct inode *inode, struct file *file)
1730{
1731 unsigned int *ptr = NULL;
1732 int ret;
1733
1734 if (NUMA_BUILD)
1735 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
1736 ret = seq_open(file, &vmalloc_op);
1737 if (!ret) {
1738 struct seq_file *m = file->private_data;
1739 m->private = ptr;
1740 } else
1741 kfree(ptr);
1742 return ret;
1743}
1744
1745static const struct file_operations proc_vmalloc_operations = {
1746 .open = vmalloc_open,
1747 .read = seq_read,
1748 .llseek = seq_lseek,
1749 .release = seq_release_private,
1750};
1751
1752static int __init proc_vmalloc_init(void)
1753{
1754 proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
1755 return 0;
1756}
1757module_init(proc_vmalloc_init);
980#endif 1758#endif
981 1759
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 967d30ccd92b..3b5860294bb6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -38,6 +38,8 @@
38#include <linux/kthread.h> 38#include <linux/kthread.h>
39#include <linux/freezer.h> 39#include <linux/freezer.h>
40#include <linux/memcontrol.h> 40#include <linux/memcontrol.h>
41#include <linux/delayacct.h>
42#include <linux/sysctl.h>
41 43
42#include <asm/tlbflush.h> 44#include <asm/tlbflush.h>
43#include <asm/div64.h> 45#include <asm/div64.h>
@@ -77,7 +79,7 @@ struct scan_control {
77 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, 79 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
78 unsigned long *scanned, int order, int mode, 80 unsigned long *scanned, int order, int mode,
79 struct zone *z, struct mem_cgroup *mem_cont, 81 struct zone *z, struct mem_cgroup *mem_cont,
80 int active); 82 int active, int file);
81}; 83};
82 84
83#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 85#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -390,17 +392,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
390} 392}
391 393
392/* 394/*
393 * Attempt to detach a locked page from its ->mapping. If it is dirty or if 395 * Same as remove_mapping, but if the page is removed from the mapping, it
394 * someone else has a ref on the page, abort and return 0. If it was 396 * gets returned with a refcount of 0.
395 * successfully detached, return 1. Assumes the caller has a single ref on
396 * this page.
397 */ 397 */
398int remove_mapping(struct address_space *mapping, struct page *page) 398static int __remove_mapping(struct address_space *mapping, struct page *page)
399{ 399{
400 BUG_ON(!PageLocked(page)); 400 BUG_ON(!PageLocked(page));
401 BUG_ON(mapping != page_mapping(page)); 401 BUG_ON(mapping != page_mapping(page));
402 402
403 write_lock_irq(&mapping->tree_lock); 403 spin_lock_irq(&mapping->tree_lock);
404 /* 404 /*
405 * The non racy check for a busy page. 405 * The non racy check for a busy page.
406 * 406 *
@@ -426,32 +426,131 @@ int remove_mapping(struct address_space *mapping, struct page *page)
426 * Note that if SetPageDirty is always performed via set_page_dirty, 426 * Note that if SetPageDirty is always performed via set_page_dirty,
427 * and thus under tree_lock, then this ordering is not required. 427 * and thus under tree_lock, then this ordering is not required.
428 */ 428 */
429 if (unlikely(page_count(page) != 2)) 429 if (!page_freeze_refs(page, 2))
430 goto cannot_free; 430 goto cannot_free;
431 smp_rmb(); 431 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
432 if (unlikely(PageDirty(page))) 432 if (unlikely(PageDirty(page))) {
433 page_unfreeze_refs(page, 2);
433 goto cannot_free; 434 goto cannot_free;
435 }
434 436
435 if (PageSwapCache(page)) { 437 if (PageSwapCache(page)) {
436 swp_entry_t swap = { .val = page_private(page) }; 438 swp_entry_t swap = { .val = page_private(page) };
437 __delete_from_swap_cache(page); 439 __delete_from_swap_cache(page);
438 write_unlock_irq(&mapping->tree_lock); 440 spin_unlock_irq(&mapping->tree_lock);
439 swap_free(swap); 441 swap_free(swap);
440 __put_page(page); /* The pagecache ref */ 442 } else {
441 return 1; 443 __remove_from_page_cache(page);
444 spin_unlock_irq(&mapping->tree_lock);
442 } 445 }
443 446
444 __remove_from_page_cache(page);
445 write_unlock_irq(&mapping->tree_lock);
446 __put_page(page);
447 return 1; 447 return 1;
448 448
449cannot_free: 449cannot_free:
450 write_unlock_irq(&mapping->tree_lock); 450 spin_unlock_irq(&mapping->tree_lock);
451 return 0; 451 return 0;
452} 452}
453 453
454/* 454/*
455 * Attempt to detach a locked page from its ->mapping. If it is dirty or if
456 * someone else has a ref on the page, abort and return 0. If it was
457 * successfully detached, return 1. Assumes the caller has a single ref on
458 * this page.
459 */
460int remove_mapping(struct address_space *mapping, struct page *page)
461{
462 if (__remove_mapping(mapping, page)) {
463 /*
464 * Unfreezing the refcount with 1 rather than 2 effectively
465 * drops the pagecache ref for us without requiring another
466 * atomic operation.
467 */
468 page_unfreeze_refs(page, 1);
469 return 1;
470 }
471 return 0;
472}
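/*
 * Userspace model (sketch only; C11 atomics instead of page_freeze_refs()
 * and page_unfreeze_refs()) of the refcount trick used by __remove_mapping()
 * above: freezing succeeds only if the count is exactly the expected value
 * and drops it to 0, and unfreezing with expected-1 re-publishes the count
 * with the pagecache reference already gone, saving a second atomic op.
 */
#include <stdatomic.h>
#include <stdio.h>

static int freeze_refs(atomic_int *count, int expected)
{
	/* succeeds only when nobody else holds an extra reference */
	return atomic_compare_exchange_strong(count, &expected, 0);
}

static void unfreeze_refs(atomic_int *count, int value)
{
	atomic_store(count, value);
}

int main(void)
{
	atomic_int refs = 2;	/* caller's ref + pagecache ref */

	if (freeze_refs(&refs, 2)) {
		/* ... page removed from the mapping while frozen ... */
		unfreeze_refs(&refs, 1);	/* pagecache ref dropped implicitly */
	}
	printf("refcount now %d\n", atomic_load(&refs));	/* 1 */
	return 0;
}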
473
474/**
475 * putback_lru_page - put previously isolated page onto appropriate LRU list
476 * @page: page to be put back to appropriate lru list
477 *
478 * Add previously isolated @page to appropriate LRU list.
479 * Page may still be unevictable for other reasons.
480 *
481 * lru_lock must not be held, interrupts must be enabled.
482 */
483#ifdef CONFIG_UNEVICTABLE_LRU
484void putback_lru_page(struct page *page)
485{
486 int lru;
487 int active = !!TestClearPageActive(page);
488 int was_unevictable = PageUnevictable(page);
489
490 VM_BUG_ON(PageLRU(page));
491
492redo:
493 ClearPageUnevictable(page);
494
495 if (page_evictable(page, NULL)) {
496 /*
497 * For evictable pages, we can use the cache.
498 * In event of a race, worst case is we end up with an
499 * unevictable page on [in]active list.
500 * We know how to handle that.
501 */
502 lru = active + page_is_file_cache(page);
503 lru_cache_add_lru(page, lru);
504 } else {
505 /*
506 * Put unevictable pages directly on zone's unevictable
507 * list.
508 */
509 lru = LRU_UNEVICTABLE;
510 add_page_to_unevictable_list(page);
511 }
512 mem_cgroup_move_lists(page, lru);
513
514 /*
515 * page's status can change while we move it among lru. If an evictable
516 * page is on unevictable list, it never be freed. To avoid that,
517 * check after we added it to the list, again.
518 */
519 if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
520 if (!isolate_lru_page(page)) {
521 put_page(page);
522 goto redo;
523 }
524 /* This means someone else dropped this page from LRU
525 * So, it will be freed or putback to LRU again. There is
526 * nothing to do here.
527 */
528 }
529
530 if (was_unevictable && lru != LRU_UNEVICTABLE)
531 count_vm_event(UNEVICTABLE_PGRESCUED);
532 else if (!was_unevictable && lru == LRU_UNEVICTABLE)
533 count_vm_event(UNEVICTABLE_PGCULLED);
534
535 put_page(page); /* drop ref from isolate */
536}
537
538#else /* CONFIG_UNEVICTABLE_LRU */
539
540void putback_lru_page(struct page *page)
541{
542 int lru;
543 VM_BUG_ON(PageLRU(page));
544
545 lru = !!TestClearPageActive(page) + page_is_file_cache(page);
546 lru_cache_add_lru(page, lru);
547 mem_cgroup_move_lists(page, lru);
548 put_page(page);
549}
550#endif /* CONFIG_UNEVICTABLE_LRU */
551
552
553/*
455 * shrink_page_list() returns the number of reclaimed pages 554 * shrink_page_list() returns the number of reclaimed pages
456 */ 555 */
457static unsigned long shrink_page_list(struct list_head *page_list, 556static unsigned long shrink_page_list(struct list_head *page_list,
@@ -477,13 +576,16 @@ static unsigned long shrink_page_list(struct list_head *page_list,
477 page = lru_to_page(page_list); 576 page = lru_to_page(page_list);
478 list_del(&page->lru); 577 list_del(&page->lru);
479 578
480 if (TestSetPageLocked(page)) 579 if (!trylock_page(page))
481 goto keep; 580 goto keep;
482 581
483 VM_BUG_ON(PageActive(page)); 582 VM_BUG_ON(PageActive(page));
484 583
485 sc->nr_scanned++; 584 sc->nr_scanned++;
486 585
586 if (unlikely(!page_evictable(page, NULL)))
587 goto cull_mlocked;
588
487 if (!sc->may_swap && page_mapped(page)) 589 if (!sc->may_swap && page_mapped(page))
488 goto keep_locked; 590 goto keep_locked;
489 591
@@ -520,9 +622,19 @@ static unsigned long shrink_page_list(struct list_head *page_list,
520 * Anonymous process memory has backing store? 622 * Anonymous process memory has backing store?
521 * Try to allocate it some swap space here. 623 * Try to allocate it some swap space here.
522 */ 624 */
523 if (PageAnon(page) && !PageSwapCache(page)) 625 if (PageAnon(page) && !PageSwapCache(page)) {
626 switch (try_to_munlock(page)) {
627 case SWAP_FAIL: /* shouldn't happen */
628 case SWAP_AGAIN:
629 goto keep_locked;
630 case SWAP_MLOCK:
631 goto cull_mlocked;
632 case SWAP_SUCCESS:
633 ; /* fall thru'; add to swap cache */
634 }
524 if (!add_to_swap(page, GFP_ATOMIC)) 635 if (!add_to_swap(page, GFP_ATOMIC))
525 goto activate_locked; 636 goto activate_locked;
637 }
526#endif /* CONFIG_SWAP */ 638#endif /* CONFIG_SWAP */
527 639
528 mapping = page_mapping(page); 640 mapping = page_mapping(page);
@@ -537,6 +649,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
537 goto activate_locked; 649 goto activate_locked;
538 case SWAP_AGAIN: 650 case SWAP_AGAIN:
539 goto keep_locked; 651 goto keep_locked;
652 case SWAP_MLOCK:
653 goto cull_mlocked;
540 case SWAP_SUCCESS: 654 case SWAP_SUCCESS:
541 ; /* try to free the page below */ 655 ; /* try to free the page below */
542 } 656 }
@@ -563,7 +677,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
563 * A synchronous write - probably a ramdisk. Go 677 * A synchronous write - probably a ramdisk. Go
564 * ahead and try to reclaim the page. 678 * ahead and try to reclaim the page.
565 */ 679 */
566 if (TestSetPageLocked(page)) 680 if (!trylock_page(page))
567 goto keep; 681 goto keep;
568 if (PageDirty(page) || PageWriteback(page)) 682 if (PageDirty(page) || PageWriteback(page))
569 goto keep_locked; 683 goto keep_locked;
@@ -583,7 +697,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
583 * possible for a page to have PageDirty set, but it is actually 697 * possible for a page to have PageDirty set, but it is actually
584 * clean (all its buffers are clean). This happens if the 698 * clean (all its buffers are clean). This happens if the
585 * buffers were written out directly, with submit_bh(). ext3 699 * buffers were written out directly, with submit_bh(). ext3
586 * will do this, as well as the blockdev mapping. 700 * will do this, as well as the blockdev mapping.
587 * try_to_release_page() will discover that cleanness and will 701 * try_to_release_page() will discover that cleanness and will
588 * drop the buffers and mark the page clean - it can be freed. 702 * drop the buffers and mark the page clean - it can be freed.
589 * 703 *
@@ -597,32 +711,64 @@ static unsigned long shrink_page_list(struct list_head *page_list,
597 if (PagePrivate(page)) { 711 if (PagePrivate(page)) {
598 if (!try_to_release_page(page, sc->gfp_mask)) 712 if (!try_to_release_page(page, sc->gfp_mask))
599 goto activate_locked; 713 goto activate_locked;
600 if (!mapping && page_count(page) == 1) 714 if (!mapping && page_count(page) == 1) {
601 goto free_it; 715 unlock_page(page);
716 if (put_page_testzero(page))
717 goto free_it;
718 else {
719 /*
720 * rare race with speculative reference.
721 * the speculative reference will free
722 * this page shortly, so we may
723 * increment nr_reclaimed here (and
724 * leave it off the LRU).
725 */
726 nr_reclaimed++;
727 continue;
728 }
729 }
602 } 730 }
603 731
604 if (!mapping || !remove_mapping(mapping, page)) 732 if (!mapping || !__remove_mapping(mapping, page))
605 goto keep_locked; 733 goto keep_locked;
606 734
735 /*
736 * At this point, we have no other references and there is
737 * no way to pick any more up (removed from LRU, removed
738 * from pagecache). Can use non-atomic bitops now (and
739 * we obviously don't have to worry about waking up a process
740 * waiting on the page lock, because there are no references.
741 */
742 __clear_page_locked(page);
607free_it: 743free_it:
608 unlock_page(page);
609 nr_reclaimed++; 744 nr_reclaimed++;
610 if (!pagevec_add(&freed_pvec, page)) 745 if (!pagevec_add(&freed_pvec, page)) {
611 __pagevec_release_nonlru(&freed_pvec); 746 __pagevec_free(&freed_pvec);
747 pagevec_reinit(&freed_pvec);
748 }
749 continue;
750
751cull_mlocked:
752 unlock_page(page);
753 putback_lru_page(page);
612 continue; 754 continue;
613 755
614activate_locked: 756activate_locked:
757 /* Not a candidate for swapping, so reclaim swap space. */
758 if (PageSwapCache(page) && vm_swap_full())
759 remove_exclusive_swap_page_ref(page);
760 VM_BUG_ON(PageActive(page));
615 SetPageActive(page); 761 SetPageActive(page);
616 pgactivate++; 762 pgactivate++;
617keep_locked: 763keep_locked:
618 unlock_page(page); 764 unlock_page(page);
619keep: 765keep:
620 list_add(&page->lru, &ret_pages); 766 list_add(&page->lru, &ret_pages);
621 VM_BUG_ON(PageLRU(page)); 767 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
622 } 768 }
623 list_splice(&ret_pages, page_list); 769 list_splice(&ret_pages, page_list);
624 if (pagevec_count(&freed_pvec)) 770 if (pagevec_count(&freed_pvec))
625 __pagevec_release_nonlru(&freed_pvec); 771 __pagevec_free(&freed_pvec);
626 count_vm_events(PGACTIVATE, pgactivate); 772 count_vm_events(PGACTIVATE, pgactivate);
627 return nr_reclaimed; 773 return nr_reclaimed;
628} 774}
@@ -642,7 +788,7 @@ keep:
642 * 788 *
643 * returns 0 on success, -ve errno on failure. 789 * returns 0 on success, -ve errno on failure.
644 */ 790 */
645int __isolate_lru_page(struct page *page, int mode) 791int __isolate_lru_page(struct page *page, int mode, int file)
646{ 792{
647 int ret = -EINVAL; 793 int ret = -EINVAL;
648 794
@@ -658,6 +804,17 @@ int __isolate_lru_page(struct page *page, int mode)
658 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 804 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
659 return ret; 805 return ret;
660 806
807 if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
808 return ret;
809
810 /*
811 * When this function is being called for lumpy reclaim, we
812 * initially look into all LRU pages, active, inactive and
813 * unevictable; only give shrink_page_list evictable pages.
814 */
815 if (PageUnevictable(page))
816 return ret;
817
661 ret = -EBUSY; 818 ret = -EBUSY;
662 if (likely(get_page_unless_zero(page))) { 819 if (likely(get_page_unless_zero(page))) {
663 /* 820 /*
@@ -688,12 +845,13 @@ int __isolate_lru_page(struct page *page, int mode)
688 * @scanned: The number of pages that were scanned. 845 * @scanned: The number of pages that were scanned.
689 * @order: The caller's attempted allocation order 846 * @order: The caller's attempted allocation order
690 * @mode: One of the LRU isolation modes 847 * @mode: One of the LRU isolation modes
848 * @file: True [1] if isolating file [!anon] pages
691 * 849 *
692 * returns how many pages were moved onto *@dst. 850 * returns how many pages were moved onto *@dst.
693 */ 851 */
694static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 852static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
695 struct list_head *src, struct list_head *dst, 853 struct list_head *src, struct list_head *dst,
696 unsigned long *scanned, int order, int mode) 854 unsigned long *scanned, int order, int mode, int file)
697{ 855{
698 unsigned long nr_taken = 0; 856 unsigned long nr_taken = 0;
699 unsigned long scan; 857 unsigned long scan;
@@ -710,7 +868,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
710 868
711 VM_BUG_ON(!PageLRU(page)); 869 VM_BUG_ON(!PageLRU(page));
712 870
713 switch (__isolate_lru_page(page, mode)) { 871 switch (__isolate_lru_page(page, mode, file)) {
714 case 0: 872 case 0:
715 list_move(&page->lru, dst); 873 list_move(&page->lru, dst);
716 nr_taken++; 874 nr_taken++;
@@ -753,10 +911,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
753 break; 911 break;
754 912
755 cursor_page = pfn_to_page(pfn); 913 cursor_page = pfn_to_page(pfn);
914
756 /* Check that we have not crossed a zone boundary. */ 915 /* Check that we have not crossed a zone boundary. */
757 if (unlikely(page_zone_id(cursor_page) != zone_id)) 916 if (unlikely(page_zone_id(cursor_page) != zone_id))
758 continue; 917 continue;
759 switch (__isolate_lru_page(cursor_page, mode)) { 918 switch (__isolate_lru_page(cursor_page, mode, file)) {
760 case 0: 919 case 0:
761 list_move(&cursor_page->lru, dst); 920 list_move(&cursor_page->lru, dst);
762 nr_taken++; 921 nr_taken++;
@@ -767,7 +926,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
767 /* else it is being freed elsewhere */ 926 /* else it is being freed elsewhere */
768 list_move(&cursor_page->lru, src); 927 list_move(&cursor_page->lru, src);
769 default: 928 default:
770 break; 929 break; /* ! on LRU or wrong list */
771 } 930 }
772 } 931 }
773 } 932 }
@@ -781,40 +940,93 @@ static unsigned long isolate_pages_global(unsigned long nr,
781 unsigned long *scanned, int order, 940 unsigned long *scanned, int order,
782 int mode, struct zone *z, 941 int mode, struct zone *z,
783 struct mem_cgroup *mem_cont, 942 struct mem_cgroup *mem_cont,
784 int active) 943 int active, int file)
785{ 944{
945 int lru = LRU_BASE;
786 if (active) 946 if (active)
787 return isolate_lru_pages(nr, &z->active_list, dst, 947 lru += LRU_ACTIVE;
788 scanned, order, mode); 948 if (file)
789 else 949 lru += LRU_FILE;
790 return isolate_lru_pages(nr, &z->inactive_list, dst, 950 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
791 scanned, order, mode); 951 mode, !!file);
792} 952}
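
The hunk above collapses the old per-zone active/inactive list pair into an array of LRU lists indexed as a base plus LRU_ACTIVE and LRU_FILE offsets. A minimal userspace sketch of that indexing, assuming the conventional base/offset layout; the enum values and pick_lru() here are illustrative stand-ins for the kernel's definitions (added elsewhere in this series), not copies of them:

#include <stdio.h>

/* Illustrative constants only; the real LRU_* values come from the series. */
enum lru_list {
        LRU_INACTIVE_ANON,      /* LRU_BASE */
        LRU_ACTIVE_ANON,        /* LRU_BASE + LRU_ACTIVE */
        LRU_INACTIVE_FILE,      /* LRU_BASE + LRU_FILE */
        LRU_ACTIVE_FILE,        /* LRU_BASE + LRU_ACTIVE + LRU_FILE */
        NR_LRU_LISTS
};
#define LRU_BASE        LRU_INACTIVE_ANON
#define LRU_ACTIVE      1
#define LRU_FILE        2

/* Same shape as the index computation in isolate_pages_global() above. */
static int pick_lru(int active, int file)
{
        int lru = LRU_BASE;

        if (active)
                lru += LRU_ACTIVE;
        if (file)
                lru += LRU_FILE;
        return lru;
}

int main(void)
{
        static const char *name[NR_LRU_LISTS] = {
                "inactive_anon", "active_anon", "inactive_file", "active_file"
        };
        int active, file;

        for (file = 0; file <= 1; file++)
                for (active = 0; active <= 1; active++)
                        printf("active=%d file=%d -> %s\n",
                               active, file, name[pick_lru(active, file)]);
        return 0;
}

Running it prints the four (active, file) combinations mapped to their list names, which is exactly what the two if statements in isolate_pages_global() compute.
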
793 953
794/* 954/*
795 * clear_active_flags() is a helper for shrink_active_list(), clearing 955 * clear_active_flags() is a helper for shrink_active_list(), clearing
796 * any active bits from the pages in the list. 956 * any active bits from the pages in the list.
797 */ 957 */
798static unsigned long clear_active_flags(struct list_head *page_list) 958static unsigned long clear_active_flags(struct list_head *page_list,
959 unsigned int *count)
799{ 960{
800 int nr_active = 0; 961 int nr_active = 0;
962 int lru;
801 struct page *page; 963 struct page *page;
802 964
803 list_for_each_entry(page, page_list, lru) 965 list_for_each_entry(page, page_list, lru) {
966 lru = page_is_file_cache(page);
804 if (PageActive(page)) { 967 if (PageActive(page)) {
968 lru += LRU_ACTIVE;
805 ClearPageActive(page); 969 ClearPageActive(page);
806 nr_active++; 970 nr_active++;
807 } 971 }
972 count[lru]++;
973 }
808 974
809 return nr_active; 975 return nr_active;
810} 976}
811 977
978/**
979 * isolate_lru_page - tries to isolate a page from its LRU list
980 * @page: page to isolate from its LRU list
981 *
982 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
983 * vmstat statistic corresponding to whatever LRU list the page was on.
984 *
985 * Returns 0 if the page was removed from an LRU list.
986 * Returns -EBUSY if the page was not on an LRU list.
987 *
988 * The returned page will have PageLRU() cleared. If it was found on
989 * the active list, it will have PageActive set. If it was found on
990 * the unevictable list, it will have the PageUnevictable bit set. That flag
991 * may need to be cleared by the caller before letting the page go.
992 *
993 * The vmstat statistic corresponding to the list on which the page was
994 * found will be decremented.
995 *
996 * Restrictions:
997 * (1) Must be called with an elevated refcount on the page. This is a
 998 * fundamental difference from isolate_lru_pages (which is called
999 * without a stable reference).
1000 * (2) the lru_lock must not be held.
1001 * (3) interrupts must be enabled.
1002 */
1003int isolate_lru_page(struct page *page)
1004{
1005 int ret = -EBUSY;
1006
1007 if (PageLRU(page)) {
1008 struct zone *zone = page_zone(page);
1009
1010 spin_lock_irq(&zone->lru_lock);
1011 if (PageLRU(page) && get_page_unless_zero(page)) {
1012 int lru = page_lru(page);
1013 ret = 0;
1014 ClearPageLRU(page);
1015
1016 del_page_from_lru_list(zone, page, lru);
1017 }
1018 spin_unlock_irq(&zone->lru_lock);
1019 }
1020 return ret;
1021}
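
isolate_lru_page() above follows a check/lock/recheck pattern: PageLRU is tested once without the lock as a cheap filter, then tested again under zone->lru_lock before the page is pinned and taken off its list. A userspace sketch of that shape; struct toy_page, toy_isolate_lru_page() and the pthread mutex are stand-ins for the kernel's page flags, isolate_lru_page() and lru_lock, not real kernel interfaces:

#include <pthread.h>
#include <stdio.h>

/* Toy page: an "on the LRU" flag plus a reference count. */
struct toy_page {
        int on_lru;
        int refcount;
};

/* Stand-in for zone->lru_lock (an IRQ-disabling spinlock in the kernel). */
static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;

static int toy_isolate_lru_page(struct toy_page *page)
{
        int ret = -1;                           /* -EBUSY in the kernel */

        if (page->on_lru) {                     /* cheap unlocked filter */
                pthread_mutex_lock(&lru_lock);
                if (page->on_lru && page->refcount > 0) {
                        /* get_page_unless_zero() + ClearPageLRU + list del */
                        page->refcount++;
                        page->on_lru = 0;
                        ret = 0;
                }
                pthread_mutex_unlock(&lru_lock);
        }
        return ret;
}

int main(void)
{
        struct toy_page page = { .on_lru = 1, .refcount = 1 };

        printf("first isolate:  %d\n", toy_isolate_lru_page(&page));    /* 0 */
        printf("second isolate: %d\n", toy_isolate_lru_page(&page));    /* -1 */
        return 0;
}

The unlocked test only filters out pages that are clearly not on any LRU; the authoritative check and the actual removal always happen under the lock.
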
1022
812/* 1023/*
813 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1024 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
814 * of reclaimed pages 1025 * of reclaimed pages
815 */ 1026 */
816static unsigned long shrink_inactive_list(unsigned long max_scan, 1027static unsigned long shrink_inactive_list(unsigned long max_scan,
817 struct zone *zone, struct scan_control *sc) 1028 struct zone *zone, struct scan_control *sc,
1029 int priority, int file)
818{ 1030{
819 LIST_HEAD(page_list); 1031 LIST_HEAD(page_list);
820 struct pagevec pvec; 1032 struct pagevec pvec;
@@ -831,20 +1043,43 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
831 unsigned long nr_scan; 1043 unsigned long nr_scan;
832 unsigned long nr_freed; 1044 unsigned long nr_freed;
833 unsigned long nr_active; 1045 unsigned long nr_active;
1046 unsigned int count[NR_LRU_LISTS] = { 0, };
1047 int mode = ISOLATE_INACTIVE;
1048
1049 /*
1050 * If we need a large contiguous chunk of memory, or have
1051 * trouble getting a small set of contiguous pages, we
1052 * will reclaim both active and inactive pages.
1053 *
1054 * We use the same threshold as pageout congestion_wait below.
1055 */
1056 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1057 mode = ISOLATE_BOTH;
1058 else if (sc->order && priority < DEF_PRIORITY - 2)
1059 mode = ISOLATE_BOTH;
834 1060
835 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 1061 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
836 &page_list, &nr_scan, sc->order, 1062 &page_list, &nr_scan, sc->order, mode,
837 (sc->order > PAGE_ALLOC_COSTLY_ORDER)? 1063 zone, sc->mem_cgroup, 0, file);
838 ISOLATE_BOTH : ISOLATE_INACTIVE, 1064 nr_active = clear_active_flags(&page_list, count);
839 zone, sc->mem_cgroup, 0);
840 nr_active = clear_active_flags(&page_list);
841 __count_vm_events(PGDEACTIVATE, nr_active); 1065 __count_vm_events(PGDEACTIVATE, nr_active);
842 1066
843 __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); 1067 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
844 __mod_zone_page_state(zone, NR_INACTIVE, 1068 -count[LRU_ACTIVE_FILE]);
845 -(nr_taken - nr_active)); 1069 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
846 if (scan_global_lru(sc)) 1070 -count[LRU_INACTIVE_FILE]);
1071 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1072 -count[LRU_ACTIVE_ANON]);
1073 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1074 -count[LRU_INACTIVE_ANON]);
1075
1076 if (scan_global_lru(sc)) {
847 zone->pages_scanned += nr_scan; 1077 zone->pages_scanned += nr_scan;
1078 zone->recent_scanned[0] += count[LRU_INACTIVE_ANON];
1079 zone->recent_scanned[0] += count[LRU_ACTIVE_ANON];
1080 zone->recent_scanned[1] += count[LRU_INACTIVE_FILE];
1081 zone->recent_scanned[1] += count[LRU_ACTIVE_FILE];
1082 }
848 spin_unlock_irq(&zone->lru_lock); 1083 spin_unlock_irq(&zone->lru_lock);
849 1084
850 nr_scanned += nr_scan; 1085 nr_scanned += nr_scan;
@@ -864,7 +1099,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
864 * The attempt at page out may have made some 1099 * The attempt at page out may have made some
865 * of the pages active, mark them inactive again. 1100 * of the pages active, mark them inactive again.
866 */ 1101 */
867 nr_active = clear_active_flags(&page_list); 1102 nr_active = clear_active_flags(&page_list, count);
868 count_vm_events(PGDEACTIVATE, nr_active); 1103 count_vm_events(PGDEACTIVATE, nr_active);
869 1104
870 nr_freed += shrink_page_list(&page_list, sc, 1105 nr_freed += shrink_page_list(&page_list, sc,
@@ -889,14 +1124,24 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
889 * Put back any unfreeable pages. 1124 * Put back any unfreeable pages.
890 */ 1125 */
891 while (!list_empty(&page_list)) { 1126 while (!list_empty(&page_list)) {
1127 int lru;
892 page = lru_to_page(&page_list); 1128 page = lru_to_page(&page_list);
893 VM_BUG_ON(PageLRU(page)); 1129 VM_BUG_ON(PageLRU(page));
894 SetPageLRU(page);
895 list_del(&page->lru); 1130 list_del(&page->lru);
896 if (PageActive(page)) 1131 if (unlikely(!page_evictable(page, NULL))) {
897 add_page_to_active_list(zone, page); 1132 spin_unlock_irq(&zone->lru_lock);
898 else 1133 putback_lru_page(page);
899 add_page_to_inactive_list(zone, page); 1134 spin_lock_irq(&zone->lru_lock);
1135 continue;
1136 }
1137 SetPageLRU(page);
1138 lru = page_lru(page);
1139 add_page_to_lru_list(zone, page, lru);
1140 mem_cgroup_move_lists(page, lru);
1141 if (PageActive(page) && scan_global_lru(sc)) {
1142 int file = !!page_is_file_cache(page);
1143 zone->recent_rotated[file]++;
1144 }
900 if (!pagevec_add(&pvec, page)) { 1145 if (!pagevec_add(&pvec, page)) {
901 spin_unlock_irq(&zone->lru_lock); 1146 spin_unlock_irq(&zone->lru_lock);
902 __pagevec_release(&pvec); 1147 __pagevec_release(&pvec);
@@ -927,115 +1172,7 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
927 1172
928static inline int zone_is_near_oom(struct zone *zone) 1173static inline int zone_is_near_oom(struct zone *zone)
929{ 1174{
930 return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE) 1175 return zone->pages_scanned >= (zone_lru_pages(zone) * 3);
931 + zone_page_state(zone, NR_INACTIVE))*3;
932}
933
934/*
935 * Determine we should try to reclaim mapped pages.
936 * This is called only when sc->mem_cgroup is NULL.
937 */
938static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
939 int priority)
940{
941 long mapped_ratio;
942 long distress;
943 long swap_tendency;
944 long imbalance;
945 int reclaim_mapped = 0;
946 int prev_priority;
947
948 if (scan_global_lru(sc) && zone_is_near_oom(zone))
949 return 1;
950 /*
951 * `distress' is a measure of how much trouble we're having
952 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
953 */
954 if (scan_global_lru(sc))
955 prev_priority = zone->prev_priority;
956 else
957 prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
958
959 distress = 100 >> min(prev_priority, priority);
960
961 /*
962 * The point of this algorithm is to decide when to start
963 * reclaiming mapped memory instead of just pagecache. Work out
964 * how much memory
965 * is mapped.
966 */
967 if (scan_global_lru(sc))
968 mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
969 global_page_state(NR_ANON_PAGES)) * 100) /
970 vm_total_pages;
971 else
972 mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);
973
974 /*
975 * Now decide how much we really want to unmap some pages. The
976 * mapped ratio is downgraded - just because there's a lot of
977 * mapped memory doesn't necessarily mean that page reclaim
978 * isn't succeeding.
979 *
980 * The distress ratio is important - we don't want to start
981 * going oom.
982 *
983 * A 100% value of vm_swappiness overrides this algorithm
984 * altogether.
985 */
986 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
987
988 /*
989 * If there's huge imbalance between active and inactive
990 * (think active 100 times larger than inactive) we should
991 * become more permissive, or the system will take too much
992 * cpu before it start swapping during memory pressure.
993 * Distress is about avoiding early-oom, this is about
994 * making swappiness graceful despite setting it to low
995 * values.
996 *
997 * Avoid div by zero with nr_inactive+1, and max resulting
998 * value is vm_total_pages.
999 */
1000 if (scan_global_lru(sc)) {
1001 imbalance = zone_page_state(zone, NR_ACTIVE);
1002 imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
1003 } else
1004 imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);
1005
1006 /*
1007 * Reduce the effect of imbalance if swappiness is low,
1008 * this means for a swappiness very low, the imbalance
1009 * must be much higher than 100 for this logic to make
1010 * the difference.
1011 *
1012 * Max temporary value is vm_total_pages*100.
1013 */
1014 imbalance *= (vm_swappiness + 1);
1015 imbalance /= 100;
1016
1017 /*
1018 * If not much of the ram is mapped, makes the imbalance
1019 * less relevant, it's high priority we refill the inactive
1020 * list with mapped pages only in presence of high ratio of
1021 * mapped pages.
1022 *
1023 * Max temporary value is vm_total_pages*100.
1024 */
1025 imbalance *= mapped_ratio;
1026 imbalance /= 100;
1027
1028 /* apply imbalance feedback to swap_tendency */
1029 swap_tendency += imbalance;
1030
1031 /*
1032 * Now use this metric to decide whether to start moving mapped
1033 * memory onto the inactive list.
1034 */
1035 if (swap_tendency >= 100)
1036 reclaim_mapped = 1;
1037
1038 return reclaim_mapped;
1039} 1176}
1040 1177
1041/* 1178/*
@@ -1058,53 +1195,71 @@ static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
1058 1195
1059 1196
1060static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1197static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1061 struct scan_control *sc, int priority) 1198 struct scan_control *sc, int priority, int file)
1062{ 1199{
1063 unsigned long pgmoved; 1200 unsigned long pgmoved;
1064 int pgdeactivate = 0; 1201 int pgdeactivate = 0;
1065 unsigned long pgscanned; 1202 unsigned long pgscanned;
1066 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1203 LIST_HEAD(l_hold); /* The pages which were snipped off */
1067 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ 1204 LIST_HEAD(l_inactive);
1068 LIST_HEAD(l_active); /* Pages to go onto the active_list */
1069 struct page *page; 1205 struct page *page;
1070 struct pagevec pvec; 1206 struct pagevec pvec;
1071 int reclaim_mapped = 0; 1207 enum lru_list lru;
1072
1073 if (sc->may_swap)
1074 reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
1075 1208
1076 lru_add_drain(); 1209 lru_add_drain();
1077 spin_lock_irq(&zone->lru_lock); 1210 spin_lock_irq(&zone->lru_lock);
1078 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, 1211 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1079 ISOLATE_ACTIVE, zone, 1212 ISOLATE_ACTIVE, zone,
1080 sc->mem_cgroup, 1); 1213 sc->mem_cgroup, 1, file);
1081 /* 1214 /*
1082 * zone->pages_scanned is used to detect zone's oom 1215 * zone->pages_scanned is used to detect zone's oom
1083 * mem_cgroup remembers nr_scan by itself. 1216 * mem_cgroup remembers nr_scan by itself.
1084 */ 1217 */
1085 if (scan_global_lru(sc)) 1218 if (scan_global_lru(sc)) {
1086 zone->pages_scanned += pgscanned; 1219 zone->pages_scanned += pgscanned;
1220 zone->recent_scanned[!!file] += pgmoved;
1221 }
1087 1222
1088 __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); 1223 if (file)
1224 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
1225 else
1226 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
1089 spin_unlock_irq(&zone->lru_lock); 1227 spin_unlock_irq(&zone->lru_lock);
1090 1228
1229 pgmoved = 0;
1091 while (!list_empty(&l_hold)) { 1230 while (!list_empty(&l_hold)) {
1092 cond_resched(); 1231 cond_resched();
1093 page = lru_to_page(&l_hold); 1232 page = lru_to_page(&l_hold);
1094 list_del(&page->lru); 1233 list_del(&page->lru);
1095 if (page_mapped(page)) { 1234
1096 if (!reclaim_mapped || 1235 if (unlikely(!page_evictable(page, NULL))) {
1097 (total_swap_pages == 0 && PageAnon(page)) || 1236 putback_lru_page(page);
1098 page_referenced(page, 0, sc->mem_cgroup)) { 1237 continue;
1099 list_add(&page->lru, &l_active);
1100 continue;
1101 }
1102 } 1238 }
1239
1240 /* page_referenced clears PageReferenced */
1241 if (page_mapping_inuse(page) &&
1242 page_referenced(page, 0, sc->mem_cgroup))
1243 pgmoved++;
1244
1103 list_add(&page->lru, &l_inactive); 1245 list_add(&page->lru, &l_inactive);
1104 } 1246 }
1105 1247
1248 /*
1249 * Count referenced pages from currently used mappings as
1250 * rotated, even though they are moved to the inactive list.
1251 * This helps balance scan pressure between file and anonymous
1252 * pages in get_scan_ratio.
1253 */
1254 zone->recent_rotated[!!file] += pgmoved;
1255
1256 /*
1257 * Move the pages to the [file or anon] inactive list.
1258 */
1106 pagevec_init(&pvec, 1); 1259 pagevec_init(&pvec, 1);
1260
1107 pgmoved = 0; 1261 pgmoved = 0;
1262 lru = LRU_BASE + file * LRU_FILE;
1108 spin_lock_irq(&zone->lru_lock); 1263 spin_lock_irq(&zone->lru_lock);
1109 while (!list_empty(&l_inactive)) { 1264 while (!list_empty(&l_inactive)) {
1110 page = lru_to_page(&l_inactive); 1265 page = lru_to_page(&l_inactive);
@@ -1114,11 +1269,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1114 VM_BUG_ON(!PageActive(page)); 1269 VM_BUG_ON(!PageActive(page));
1115 ClearPageActive(page); 1270 ClearPageActive(page);
1116 1271
1117 list_move(&page->lru, &zone->inactive_list); 1272 list_move(&page->lru, &zone->lru[lru].list);
1118 mem_cgroup_move_lists(page, false); 1273 mem_cgroup_move_lists(page, lru);
1119 pgmoved++; 1274 pgmoved++;
1120 if (!pagevec_add(&pvec, page)) { 1275 if (!pagevec_add(&pvec, page)) {
1121 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1276 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1122 spin_unlock_irq(&zone->lru_lock); 1277 spin_unlock_irq(&zone->lru_lock);
1123 pgdeactivate += pgmoved; 1278 pgdeactivate += pgmoved;
1124 pgmoved = 0; 1279 pgmoved = 0;
@@ -1128,104 +1283,189 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1128 spin_lock_irq(&zone->lru_lock); 1283 spin_lock_irq(&zone->lru_lock);
1129 } 1284 }
1130 } 1285 }
1131 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1286 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1132 pgdeactivate += pgmoved; 1287 pgdeactivate += pgmoved;
1133 if (buffer_heads_over_limit) { 1288 if (buffer_heads_over_limit) {
1134 spin_unlock_irq(&zone->lru_lock); 1289 spin_unlock_irq(&zone->lru_lock);
1135 pagevec_strip(&pvec); 1290 pagevec_strip(&pvec);
1136 spin_lock_irq(&zone->lru_lock); 1291 spin_lock_irq(&zone->lru_lock);
1137 } 1292 }
1138
1139 pgmoved = 0;
1140 while (!list_empty(&l_active)) {
1141 page = lru_to_page(&l_active);
1142 prefetchw_prev_lru_page(page, &l_active, flags);
1143 VM_BUG_ON(PageLRU(page));
1144 SetPageLRU(page);
1145 VM_BUG_ON(!PageActive(page));
1146
1147 list_move(&page->lru, &zone->active_list);
1148 mem_cgroup_move_lists(page, true);
1149 pgmoved++;
1150 if (!pagevec_add(&pvec, page)) {
1151 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
1152 pgmoved = 0;
1153 spin_unlock_irq(&zone->lru_lock);
1154 __pagevec_release(&pvec);
1155 spin_lock_irq(&zone->lru_lock);
1156 }
1157 }
1158 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
1159
1160 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1293 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1161 __count_vm_events(PGDEACTIVATE, pgdeactivate); 1294 __count_vm_events(PGDEACTIVATE, pgdeactivate);
1162 spin_unlock_irq(&zone->lru_lock); 1295 spin_unlock_irq(&zone->lru_lock);
1296 if (vm_swap_full())
1297 pagevec_swap_free(&pvec);
1163 1298
1164 pagevec_release(&pvec); 1299 pagevec_release(&pvec);
1165} 1300}
1166 1301
1302static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1303 struct zone *zone, struct scan_control *sc, int priority)
1304{
1305 int file = is_file_lru(lru);
1306
1307 if (lru == LRU_ACTIVE_FILE) {
1308 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1309 return 0;
1310 }
1311
1312 if (lru == LRU_ACTIVE_ANON &&
1313 (!scan_global_lru(sc) || inactive_anon_is_low(zone))) {
1314 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1315 return 0;
1316 }
1317 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1318}
1319
1320/*
1321 * Determine how aggressively the anon and file LRU lists should be
1322 * scanned. The relative value of each set of LRU lists is determined
1323 * by looking at the fraction of the pages scanned we did rotate back
1324 * onto the active list instead of evicting.
1325 *
1326 * percent[0] specifies how much pressure to put on ram/swap backed
1327 * memory, while percent[1] determines pressure on the file LRUs.
1328 */
1329static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1330 unsigned long *percent)
1331{
1332 unsigned long anon, file, free;
1333 unsigned long anon_prio, file_prio;
1334 unsigned long ap, fp;
1335
1336 anon = zone_page_state(zone, NR_ACTIVE_ANON) +
1337 zone_page_state(zone, NR_INACTIVE_ANON);
1338 file = zone_page_state(zone, NR_ACTIVE_FILE) +
1339 zone_page_state(zone, NR_INACTIVE_FILE);
1340 free = zone_page_state(zone, NR_FREE_PAGES);
1341
1342 /* If we have no swap space, do not bother scanning anon pages. */
1343 if (nr_swap_pages <= 0) {
1344 percent[0] = 0;
1345 percent[1] = 100;
1346 return;
1347 }
1348
1349 /* If we have very few page cache pages, force-scan anon pages. */
1350 if (unlikely(file + free <= zone->pages_high)) {
1351 percent[0] = 100;
1352 percent[1] = 0;
1353 return;
1354 }
1355
1356 /*
1357 * OK, so we have swap space and a fair amount of page cache
1358 * pages. We use the recently rotated / recently scanned
1359 * ratios to determine how valuable each cache is.
1360 *
1361 * Because workloads change over time (and to avoid overflow)
1362 * we keep these statistics as a floating average, which ends
1363 * up weighing recent references more than old ones.
1364 *
1365 * anon in [0], file in [1]
1366 */
1367 if (unlikely(zone->recent_scanned[0] > anon / 4)) {
1368 spin_lock_irq(&zone->lru_lock);
1369 zone->recent_scanned[0] /= 2;
1370 zone->recent_rotated[0] /= 2;
1371 spin_unlock_irq(&zone->lru_lock);
1372 }
1373
1374 if (unlikely(zone->recent_scanned[1] > file / 4)) {
1375 spin_lock_irq(&zone->lru_lock);
1376 zone->recent_scanned[1] /= 2;
1377 zone->recent_rotated[1] /= 2;
1378 spin_unlock_irq(&zone->lru_lock);
1379 }
1380
1381 /*
1382 * With swappiness at 100, anonymous and file have the same priority.
1383 * This scanning priority is essentially the inverse of IO cost.
1384 */
1385 anon_prio = sc->swappiness;
1386 file_prio = 200 - sc->swappiness;
1387
1388 /*
1389 *                  anon       recent_rotated[0]
1390 * %anon = 100 * ----------- / ----------------- * IO cost
1391 *               anon + file      rotate_sum
1392 */
1393 ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1);
1394 ap /= zone->recent_rotated[0] + 1;
1395
1396 fp = (file_prio + 1) * (zone->recent_scanned[1] + 1);
1397 fp /= zone->recent_rotated[1] + 1;
1398
1399 /* Normalize to percentages */
1400 percent[0] = 100 * ap / (ap + fp + 1);
1401 percent[1] = 100 - percent[0];
1402}
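
get_scan_ratio() above boils swappiness and the recent_scanned/recent_rotated counters down to two percentages that split reclaim pressure between anon and file pages. A standalone sketch of just that arithmetic; scan_ratio() and the sample counter values are invented for illustration, while the formula itself follows the lines shown above:

#include <stdio.h>

/*
 * Model of the percentage split computed above. The variable names follow
 * the kernel code; the inputs are arbitrary sample values.
 */
static void scan_ratio(unsigned long swappiness,
                       unsigned long recent_scanned_anon,
                       unsigned long recent_rotated_anon,
                       unsigned long recent_scanned_file,
                       unsigned long recent_rotated_file,
                       unsigned long percent[2])
{
        unsigned long anon_prio = swappiness;           /* 0..100 */
        unsigned long file_prio = 200 - swappiness;
        unsigned long ap, fp;

        ap = (anon_prio + 1) * (recent_scanned_anon + 1);
        ap /= recent_rotated_anon + 1;

        fp = (file_prio + 1) * (recent_scanned_file + 1);
        fp /= recent_rotated_file + 1;

        percent[0] = 100 * ap / (ap + fp + 1);          /* anon */
        percent[1] = 100 - percent[0];                  /* file */
}

int main(void)
{
        unsigned long percent[2];

        /* Mostly-rotated anon, mostly-evicted file cache, default swappiness. */
        scan_ratio(60, 1000, 900, 1000, 100, percent);
        printf("anon %lu%%  file %lu%%\n", percent[0], percent[1]);
        return 0;
}

With mostly-rotated anon pages and mostly-evicted file pages, the split leans heavily toward scanning the file lists, which is the behaviour the comment block above describes.
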
1403
1404
1167/* 1405/*
1168 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1406 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1169 */ 1407 */
1170static unsigned long shrink_zone(int priority, struct zone *zone, 1408static unsigned long shrink_zone(int priority, struct zone *zone,
1171 struct scan_control *sc) 1409 struct scan_control *sc)
1172{ 1410{
1173 unsigned long nr_active; 1411 unsigned long nr[NR_LRU_LISTS];
1174 unsigned long nr_inactive;
1175 unsigned long nr_to_scan; 1412 unsigned long nr_to_scan;
1176 unsigned long nr_reclaimed = 0; 1413 unsigned long nr_reclaimed = 0;
1414 unsigned long percent[2]; /* anon @ 0; file @ 1 */
1415 enum lru_list l;
1177 1416
1178 if (scan_global_lru(sc)) { 1417 get_scan_ratio(zone, sc, percent);
1179 /*
1180 * Add one to nr_to_scan just to make sure that the kernel
1181 * will slowly sift through the active list.
1182 */
1183 zone->nr_scan_active +=
1184 (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
1185 nr_active = zone->nr_scan_active;
1186 zone->nr_scan_inactive +=
1187 (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
1188 nr_inactive = zone->nr_scan_inactive;
1189 if (nr_inactive >= sc->swap_cluster_max)
1190 zone->nr_scan_inactive = 0;
1191 else
1192 nr_inactive = 0;
1193
1194 if (nr_active >= sc->swap_cluster_max)
1195 zone->nr_scan_active = 0;
1196 else
1197 nr_active = 0;
1198 } else {
1199 /*
1200 * This reclaim occurs not because zone memory shortage but
1201 * because memory controller hits its limit.
1202 * Then, don't modify zone reclaim related data.
1203 */
1204 nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup,
1205 zone, priority);
1206
1207 nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup,
1208 zone, priority);
1209 }
1210 1418
1419 for_each_evictable_lru(l) {
1420 if (scan_global_lru(sc)) {
1421 int file = is_file_lru(l);
1422 int scan;
1211 1423
1212 while (nr_active || nr_inactive) { 1424 scan = zone_page_state(zone, NR_LRU_BASE + l);
1213 if (nr_active) { 1425 if (priority) {
1214 nr_to_scan = min(nr_active, 1426 scan >>= priority;
1215 (unsigned long)sc->swap_cluster_max); 1427 scan = (scan * percent[file]) / 100;
1216 nr_active -= nr_to_scan; 1428 }
1217 shrink_active_list(nr_to_scan, zone, sc, priority); 1429 zone->lru[l].nr_scan += scan;
1430 nr[l] = zone->lru[l].nr_scan;
1431 if (nr[l] >= sc->swap_cluster_max)
1432 zone->lru[l].nr_scan = 0;
1433 else
1434 nr[l] = 0;
1435 } else {
1436 /*
1437 * This reclaim occurs not because zone memory shortage
1438 * but because memory controller hits its limit.
1439 * Don't modify zone reclaim related data.
1440 */
1441 nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
1442 priority, l);
1218 } 1443 }
1444 }
1219 1445
1220 if (nr_inactive) { 1446 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1221 nr_to_scan = min(nr_inactive, 1447 nr[LRU_INACTIVE_FILE]) {
1448 for_each_evictable_lru(l) {
1449 if (nr[l]) {
1450 nr_to_scan = min(nr[l],
1222 (unsigned long)sc->swap_cluster_max); 1451 (unsigned long)sc->swap_cluster_max);
1223 nr_inactive -= nr_to_scan; 1452 nr[l] -= nr_to_scan;
1224 nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, 1453
1225 sc); 1454 nr_reclaimed += shrink_list(l, nr_to_scan,
1455 zone, sc, priority);
1456 }
1226 } 1457 }
1227 } 1458 }
1228 1459
1460 /*
1461 * Even if we did not try to evict anon pages at all, we want to
1462 * rebalance the anon lru active/inactive ratio.
1463 */
1464 if (!scan_global_lru(sc) || inactive_anon_is_low(zone))
1465 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1466 else if (!scan_global_lru(sc))
1467 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1468
1229 throttle_vm_writeout(sc->gfp_mask); 1469 throttle_vm_writeout(sc->gfp_mask);
1230 return nr_reclaimed; 1470 return nr_reclaimed;
1231} 1471}
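
The rewritten shrink_zone() drains the per-list scan targets in chunks of sc->swap_cluster_max, round-robin across the evictable lists, so no single list monopolizes a reclaim pass. A toy model of that draining loop, as a sketch only: the list names, CHUNK value and starting targets are made up, and the real loop keys its exit condition on just three of the four lists because active anon aging is handled separately at the end of the function:

#include <stdio.h>

#define NR_LISTS        4
#define CHUNK           32UL    /* plays the role of sc->swap_cluster_max */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* Scan targets per list, as get_scan_ratio()/priority would set them. */
        unsigned long nr[NR_LISTS] = { 100, 0, 250, 40 };
        static const char *name[NR_LISTS] = {
                "inactive_anon", "active_anon", "inactive_file", "active_file"
        };
        unsigned long nr_to_scan;
        int l, work = 1;

        while (work) {
                work = 0;
                for (l = 0; l < NR_LISTS; l++) {
                        if (!nr[l])
                                continue;
                        nr_to_scan = min_ul(nr[l], CHUNK);
                        nr[l] -= nr_to_scan;
                        work = 1;
                        printf("scan %3lu from %s (left %lu)\n",
                               nr_to_scan, name[l], nr[l]);
                }
        }
        return 0;
}
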
@@ -1286,7 +1526,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1286 1526
1287 return nr_reclaimed; 1527 return nr_reclaimed;
1288} 1528}
1289 1529
1290/* 1530/*
1291 * This is the main entry point to direct page reclaim. 1531 * This is the main entry point to direct page reclaim.
1292 * 1532 *
@@ -1316,6 +1556,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1316 struct zone *zone; 1556 struct zone *zone;
1317 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1557 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1318 1558
1559 delayacct_freepages_start();
1560
1319 if (scan_global_lru(sc)) 1561 if (scan_global_lru(sc))
1320 count_vm_event(ALLOCSTALL); 1562 count_vm_event(ALLOCSTALL);
1321 /* 1563 /*
@@ -1327,8 +1569,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1327 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1569 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1328 continue; 1570 continue;
1329 1571
1330 lru_pages += zone_page_state(zone, NR_ACTIVE) 1572 lru_pages += zone_lru_pages(zone);
1331 + zone_page_state(zone, NR_INACTIVE);
1332 } 1573 }
1333 } 1574 }
1334 1575
@@ -1371,7 +1612,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1371 if (sc->nr_scanned && priority < DEF_PRIORITY - 2) 1612 if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
1372 congestion_wait(WRITE, HZ/10); 1613 congestion_wait(WRITE, HZ/10);
1373 } 1614 }
1374 /* top priority shrink_caches still had more to do? don't OOM, then */ 1615 /* top priority shrink_zones still had more to do? don't OOM, then */
1375 if (!sc->all_unreclaimable && scan_global_lru(sc)) 1616 if (!sc->all_unreclaimable && scan_global_lru(sc))
1376 ret = nr_reclaimed; 1617 ret = nr_reclaimed;
1377out: 1618out:
@@ -1396,6 +1637,8 @@ out:
1396 } else 1637 } else
1397 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); 1638 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1398 1639
1640 delayacct_freepages_end();
1641
1399 return ret; 1642 return ret;
1400} 1643}
1401 1644
@@ -1516,6 +1759,14 @@ loop_again:
1516 priority != DEF_PRIORITY) 1759 priority != DEF_PRIORITY)
1517 continue; 1760 continue;
1518 1761
1762 /*
1763 * Do some background aging of the anon list, to give
1764 * pages a chance to be referenced before reclaiming.
1765 */
1766 if (inactive_anon_is_low(zone))
1767 shrink_active_list(SWAP_CLUSTER_MAX, zone,
1768 &sc, priority, 0);
1769
1519 if (!zone_watermark_ok(zone, order, zone->pages_high, 1770 if (!zone_watermark_ok(zone, order, zone->pages_high,
1520 0, 0)) { 1771 0, 0)) {
1521 end_zone = i; 1772 end_zone = i;
@@ -1528,8 +1779,7 @@ loop_again:
1528 for (i = 0; i <= end_zone; i++) { 1779 for (i = 0; i <= end_zone; i++) {
1529 struct zone *zone = pgdat->node_zones + i; 1780 struct zone *zone = pgdat->node_zones + i;
1530 1781
1531 lru_pages += zone_page_state(zone, NR_ACTIVE) 1782 lru_pages += zone_lru_pages(zone);
1532 + zone_page_state(zone, NR_INACTIVE);
1533 } 1783 }
1534 1784
1535 /* 1785 /*
@@ -1573,8 +1823,7 @@ loop_again:
1573 if (zone_is_all_unreclaimable(zone)) 1823 if (zone_is_all_unreclaimable(zone))
1574 continue; 1824 continue;
1575 if (nr_slab == 0 && zone->pages_scanned >= 1825 if (nr_slab == 0 && zone->pages_scanned >=
1576 (zone_page_state(zone, NR_ACTIVE) 1826 (zone_lru_pages(zone) * 6))
1577 + zone_page_state(zone, NR_INACTIVE)) * 6)
1578 zone_set_flag(zone, 1827 zone_set_flag(zone,
1579 ZONE_ALL_UNRECLAIMABLE); 1828 ZONE_ALL_UNRECLAIMABLE);
1580 /* 1829 /*
@@ -1628,7 +1877,7 @@ out:
1628 1877
1629/* 1878/*
1630 * The background pageout daemon, started as a kernel thread 1879 * The background pageout daemon, started as a kernel thread
1631 * from the init process. 1880 * from the init process.
1632 * 1881 *
1633 * This basically trickles out pages so that we have _some_ 1882 * This basically trickles out pages so that we have _some_
1634 * free memory available even if there is no other activity 1883 * free memory available even if there is no other activity
@@ -1722,6 +1971,14 @@ void wakeup_kswapd(struct zone *zone, int order)
1722 wake_up_interruptible(&pgdat->kswapd_wait); 1971 wake_up_interruptible(&pgdat->kswapd_wait);
1723} 1972}
1724 1973
1974unsigned long global_lru_pages(void)
1975{
1976 return global_page_state(NR_ACTIVE_ANON)
1977 + global_page_state(NR_ACTIVE_FILE)
1978 + global_page_state(NR_INACTIVE_ANON)
1979 + global_page_state(NR_INACTIVE_FILE);
1980}
1981
1725#ifdef CONFIG_PM 1982#ifdef CONFIG_PM
1726/* 1983/*
1727 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages 1984 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
@@ -1735,6 +1992,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1735{ 1992{
1736 struct zone *zone; 1993 struct zone *zone;
1737 unsigned long nr_to_scan, ret = 0; 1994 unsigned long nr_to_scan, ret = 0;
1995 enum lru_list l;
1738 1996
1739 for_each_zone(zone) { 1997 for_each_zone(zone) {
1740 1998
@@ -1744,38 +2002,31 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1744 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) 2002 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
1745 continue; 2003 continue;
1746 2004
1747 /* For pass = 0 we don't shrink the active list */ 2005 for_each_evictable_lru(l) {
1748 if (pass > 0) { 2006 /* For pass = 0, we don't shrink the active list */
1749 zone->nr_scan_active += 2007 if (pass == 0 &&
1750 (zone_page_state(zone, NR_ACTIVE) >> prio) + 1; 2008 (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE))
1751 if (zone->nr_scan_active >= nr_pages || pass > 3) { 2009 continue;
1752 zone->nr_scan_active = 0; 2010
2011 zone->lru[l].nr_scan +=
2012 (zone_page_state(zone, NR_LRU_BASE + l)
2013 >> prio) + 1;
2014 if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
2015 zone->lru[l].nr_scan = 0;
1753 nr_to_scan = min(nr_pages, 2016 nr_to_scan = min(nr_pages,
1754 zone_page_state(zone, NR_ACTIVE)); 2017 zone_page_state(zone,
1755 shrink_active_list(nr_to_scan, zone, sc, prio); 2018 NR_LRU_BASE + l));
2019 ret += shrink_list(l, nr_to_scan, zone,
2020 sc, prio);
2021 if (ret >= nr_pages)
2022 return ret;
1756 } 2023 }
1757 } 2024 }
1758
1759 zone->nr_scan_inactive +=
1760 (zone_page_state(zone, NR_INACTIVE) >> prio) + 1;
1761 if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
1762 zone->nr_scan_inactive = 0;
1763 nr_to_scan = min(nr_pages,
1764 zone_page_state(zone, NR_INACTIVE));
1765 ret += shrink_inactive_list(nr_to_scan, zone, sc);
1766 if (ret >= nr_pages)
1767 return ret;
1768 }
1769 } 2025 }
1770 2026
1771 return ret; 2027 return ret;
1772} 2028}
1773 2029
1774static unsigned long count_lru_pages(void)
1775{
1776 return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE);
1777}
1778
1779/* 2030/*
1780 * Try to free `nr_pages' of memory, system-wide, and return the number of 2031 * Try to free `nr_pages' of memory, system-wide, and return the number of
1781 * freed pages. 2032 * freed pages.
@@ -1801,7 +2052,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1801 2052
1802 current->reclaim_state = &reclaim_state; 2053 current->reclaim_state = &reclaim_state;
1803 2054
1804 lru_pages = count_lru_pages(); 2055 lru_pages = global_lru_pages();
1805 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 2056 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
1806 /* If slab caches are huge, it's better to hit them first */ 2057 /* If slab caches are huge, it's better to hit them first */
1807 while (nr_slab >= lru_pages) { 2058 while (nr_slab >= lru_pages) {
@@ -1844,7 +2095,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1844 2095
1845 reclaim_state.reclaimed_slab = 0; 2096 reclaim_state.reclaimed_slab = 0;
1846 shrink_slab(sc.nr_scanned, sc.gfp_mask, 2097 shrink_slab(sc.nr_scanned, sc.gfp_mask,
1847 count_lru_pages()); 2098 global_lru_pages());
1848 ret += reclaim_state.reclaimed_slab; 2099 ret += reclaim_state.reclaimed_slab;
1849 if (ret >= nr_pages) 2100 if (ret >= nr_pages)
1850 goto out; 2101 goto out;
@@ -1861,7 +2112,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1861 if (!ret) { 2112 if (!ret) {
1862 do { 2113 do {
1863 reclaim_state.reclaimed_slab = 0; 2114 reclaim_state.reclaimed_slab = 0;
1864 shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages()); 2115 shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
1865 ret += reclaim_state.reclaimed_slab; 2116 ret += reclaim_state.reclaimed_slab;
1866 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); 2117 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
1867 } 2118 }
@@ -1940,7 +2191,7 @@ module_init(kswapd_init)
1940int zone_reclaim_mode __read_mostly; 2191int zone_reclaim_mode __read_mostly;
1941 2192
1942#define RECLAIM_OFF 0 2193#define RECLAIM_OFF 0
1943#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ 2194#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
1944#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 2195#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
1945#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 2196#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
1946 2197
@@ -2089,3 +2340,285 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2089 return ret; 2340 return ret;
2090} 2341}
2091#endif 2342#endif
2343
2344#ifdef CONFIG_UNEVICTABLE_LRU
2345/*
2346 * page_evictable - test whether a page is evictable
2347 * @page: the page to test
2348 * @vma: the VMA in which the page is or will be mapped, may be NULL
2349 *
2350 * Test whether page is evictable--i.e., should be placed on active/inactive
2351 * lists vs unevictable list. The vma argument is !NULL when called from the
2352 * fault path to determine how to instantiate a new page.
2353 *
2354 * Reasons page might not be evictable:
2355 * (1) page's mapping marked unevictable
2356 * (2) page is part of an mlocked VMA
2357 *
2358 */
2359int page_evictable(struct page *page, struct vm_area_struct *vma)
2360{
2361
2362 if (mapping_unevictable(page_mapping(page)))
2363 return 0;
2364
2365 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
2366 return 0;
2367
2368 return 1;
2369}
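
The two reasons listed in the comment above reduce to a small predicate: a page stays off the normal LRUs when its mapping has been marked unevictable (e.g. a SHM_LOCKed segment) or when it belongs to an mlocked VMA. A trivial model of that decision; toy_page and its two flags are invented for illustration and stand in for the real page and mapping state:

#include <stdio.h>

/* Toy page state: just the two properties page_evictable() looks at. */
struct toy_page {
        int mapping_unevictable;        /* mapping flagged unevictable */
        int mlocked;                    /* page belongs to an mlocked VMA */
};

static int toy_page_evictable(const struct toy_page *page)
{
        if (page->mapping_unevictable)
                return 0;
        if (page->mlocked)
                return 0;
        return 1;
}

int main(void)
{
        struct toy_page locked_mapping = { 1, 0 };
        struct toy_page mlocked        = { 0, 1 };
        struct toy_page ordinary       = { 0, 0 };

        printf("locked mapping: %d, mlocked: %d, ordinary: %d\n",
               toy_page_evictable(&locked_mapping),
               toy_page_evictable(&mlocked),
               toy_page_evictable(&ordinary));
        return 0;
}
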
2370
2371static void show_page_path(struct page *page)
2372{
2373 char buf[256];
2374 if (page_is_file_cache(page)) {
2375 struct address_space *mapping = page->mapping;
2376 struct dentry *dentry;
2377 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
2378
2379 spin_lock(&mapping->i_mmap_lock);
2380 dentry = d_find_alias(mapping->host);
2381 printk(KERN_INFO "rescued: %s %lu\n",
2382 dentry_path(dentry, buf, 256), pgoff);
2383 spin_unlock(&mapping->i_mmap_lock);
2384 } else {
2385#if defined(CONFIG_MM_OWNER) && defined(CONFIG_MMU)
2386 struct anon_vma *anon_vma;
2387 struct vm_area_struct *vma;
2388
2389 anon_vma = page_lock_anon_vma(page);
2390 if (!anon_vma)
2391 return;
2392
2393 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
2394 printk(KERN_INFO "rescued: anon %s\n",
2395 vma->vm_mm->owner->comm);
2396 break;
2397 }
2398 page_unlock_anon_vma(anon_vma);
2399#endif
2400 }
2401}
2402
2403
2404/**
2405 * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
2406 * @page: page to check evictability and move to appropriate lru list
2407 * @zone: zone page is in
2408 *
2409 * Checks a page for evictability and moves the page to the appropriate
2410 * zone lru list.
2411 *
2412 * Restrictions: zone->lru_lock must be held, page must be on LRU and must
2413 * have PageUnevictable set.
2414 */
2415static void check_move_unevictable_page(struct page *page, struct zone *zone)
2416{
2417 VM_BUG_ON(PageActive(page));
2418
2419retry:
2420 ClearPageUnevictable(page);
2421 if (page_evictable(page, NULL)) {
2422 enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page);
2423
2424 show_page_path(page);
2425
2426 __dec_zone_state(zone, NR_UNEVICTABLE);
2427 list_move(&page->lru, &zone->lru[l].list);
2428 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
2429 __count_vm_event(UNEVICTABLE_PGRESCUED);
2430 } else {
2431 /*
2432 * rotate unevictable list
2433 */
2434 SetPageUnevictable(page);
2435 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
2436 if (page_evictable(page, NULL))
2437 goto retry;
2438 }
2439}
2440
2441/**
2442 * scan_mapping_unevictable_pages - scan an address space for evictable pages
2443 * @mapping: struct address_space to scan for evictable pages
2444 *
2445 * Scan all pages in mapping. Check unevictable pages for
2446 * evictability and move them to the appropriate zone lru list.
2447 */
2448void scan_mapping_unevictable_pages(struct address_space *mapping)
2449{
2450 pgoff_t next = 0;
2451 pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
2452 PAGE_CACHE_SHIFT;
2453 struct zone *zone;
2454 struct pagevec pvec;
2455
2456 if (mapping->nrpages == 0)
2457 return;
2458
2459 pagevec_init(&pvec, 0);
2460 while (next < end &&
2461 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
2462 int i;
2463 int pg_scanned = 0;
2464
2465 zone = NULL;
2466
2467 for (i = 0; i < pagevec_count(&pvec); i++) {
2468 struct page *page = pvec.pages[i];
2469 pgoff_t page_index = page->index;
2470 struct zone *pagezone = page_zone(page);
2471
2472 pg_scanned++;
2473 if (page_index > next)
2474 next = page_index;
2475 next++;
2476
2477 if (pagezone != zone) {
2478 if (zone)
2479 spin_unlock_irq(&zone->lru_lock);
2480 zone = pagezone;
2481 spin_lock_irq(&zone->lru_lock);
2482 }
2483
2484 if (PageLRU(page) && PageUnevictable(page))
2485 check_move_unevictable_page(page, zone);
2486 }
2487 if (zone)
2488 spin_unlock_irq(&zone->lru_lock);
2489 pagevec_release(&pvec);
2490
2491 count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
2492 }
2493
2494}
2495
2496/**
2497 * scan_zone_unevictable_pages - check unevictable list for evictable pages
2498 * @zone - zone of which to scan the unevictable list
2499 *
2500 * Scan @zone's unevictable LRU lists to check for pages that have become
2501 * evictable. Move those that have to @zone's inactive list where they
2502 * become candidates for reclaim, unless shrink_inactive_zone() decides
2503 * to reactivate them. Pages that are still unevictable are rotated
2504 * back onto @zone's unevictable list.
2505 */
2506#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
2507void scan_zone_unevictable_pages(struct zone *zone)
2508{
2509 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
2510 unsigned long scan;
2511 unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
2512
2513 while (nr_to_scan > 0) {
2514 unsigned long batch_size = min(nr_to_scan,
2515 SCAN_UNEVICTABLE_BATCH_SIZE);
2516
2517 spin_lock_irq(&zone->lru_lock);
2518 for (scan = 0; scan < batch_size; scan++) {
2519 struct page *page = lru_to_page(l_unevictable);
2520
2521 if (!trylock_page(page))
2522 continue;
2523
2524 prefetchw_prev_lru_page(page, l_unevictable, flags);
2525
2526 if (likely(PageLRU(page) && PageUnevictable(page)))
2527 check_move_unevictable_page(page, zone);
2528
2529 unlock_page(page);
2530 }
2531 spin_unlock_irq(&zone->lru_lock);
2532
2533 nr_to_scan -= batch_size;
2534 }
2535}
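
scan_zone_unevictable_pages() may walk a very long list under zone->lru_lock, so it holds the lock for at most SCAN_UNEVICTABLE_BATCH_SIZE pages at a time and releases it between batches. A userspace sketch of that batching pattern; the lock()/unlock()/check_one() placeholders and the page count are illustrative only:

#include <stdio.h>

#define BATCH 16UL      /* mirrors SCAN_UNEVICTABLE_BATCH_SIZE */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

/* Placeholders for the lock and the per-page evictability check. */
static void lock(void)      { printf("  lock\n"); }
static void unlock(void)    { printf("  unlock\n"); }
static void check_one(void) { /* would test and maybe move one page */ }

int main(void)
{
        unsigned long nr_to_scan = 40;  /* pages currently on the unevictable list */

        while (nr_to_scan > 0) {
                unsigned long batch_size = min_ul(nr_to_scan, BATCH);
                unsigned long scan;

                lock();
                for (scan = 0; scan < batch_size; scan++)
                        check_one();
                unlock();

                printf("processed batch of %lu\n", batch_size);
                nr_to_scan -= batch_size;
        }
        return 0;
}

Dropping the lock between batches bounds how long interrupts stay disabled (the real lock is taken with spin_lock_irq) while still making forward progress through the list.
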
2536
2537
2538/**
2539 * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
2540 *
2541 * A really big hammer: scan all zones' unevictable LRU lists to check for
2542 * pages that have become evictable. Move those back to the zones'
2543 * inactive list where they become candidates for reclaim.
2544 * This occurs when, e.g., we have unswappable pages on the unevictable lists,
2545 * and we add swap to the system. As such, it runs in the context of a task
2546 * that has possibly/probably made some previously unevictable pages
2547 * evictable.
2548 */
2549void scan_all_zones_unevictable_pages(void)
2550{
2551 struct zone *zone;
2552
2553 for_each_zone(zone) {
2554 scan_zone_unevictable_pages(zone);
2555 }
2556}
2557
2558/*
2559 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
2560 * all nodes' unevictable lists for evictable pages
2561 */
2562unsigned long scan_unevictable_pages;
2563
2564int scan_unevictable_handler(struct ctl_table *table, int write,
2565 struct file *file, void __user *buffer,
2566 size_t *length, loff_t *ppos)
2567{
2568 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
2569
2570 if (write && *(unsigned long *)table->data)
2571 scan_all_zones_unevictable_pages();
2572
2573 scan_unevictable_pages = 0;
2574 return 0;
2575}
2576
2577/*
2578 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
2579 * a specified node's per zone unevictable lists for evictable pages.
2580 */
2581
2582static ssize_t read_scan_unevictable_node(struct sys_device *dev,
2583 struct sysdev_attribute *attr,
2584 char *buf)
2585{
2586 return sprintf(buf, "0\n"); /* always zero; should fit... */
2587}
2588
2589static ssize_t write_scan_unevictable_node(struct sys_device *dev,
2590 struct sysdev_attribute *attr,
2591 const char *buf, size_t count)
2592{
2593 struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
2594 struct zone *zone;
2595 unsigned long res;
2596 unsigned long req = strict_strtoul(buf, 10, &res);
2597
2598 if (!req)
2599 return 1; /* zero is no-op */
2600
2601 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2602 if (!populated_zone(zone))
2603 continue;
2604 scan_zone_unevictable_pages(zone);
2605 }
2606 return 1;
2607}
2608
2609
2610static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
2611 read_scan_unevictable_node,
2612 write_scan_unevictable_node);
2613
2614int scan_unevictable_register_node(struct node *node)
2615{
2616 return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
2617}
2618
2619void scan_unevictable_unregister_node(struct node *node)
2620{
2621 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
2622}
2623
2624#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index db9eabb2c5b3..c3ccfda23adc 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -8,11 +8,12 @@
8 * Copyright (C) 2006 Silicon Graphics, Inc., 8 * Copyright (C) 2006 Silicon Graphics, Inc.,
9 * Christoph Lameter <christoph@lameter.com> 9 * Christoph Lameter <christoph@lameter.com>
10 */ 10 */
11 11#include <linux/fs.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/err.h> 13#include <linux/err.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/cpu.h> 15#include <linux/cpu.h>
16#include <linux/vmstat.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17 18
18#ifdef CONFIG_VM_EVENT_COUNTERS 19#ifdef CONFIG_VM_EVENT_COUNTERS
@@ -26,7 +27,7 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
26 27
27 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); 28 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
28 29
29 for_each_cpu_mask(cpu, *cpumask) { 30 for_each_cpu_mask_nr(cpu, *cpumask) {
30 struct vm_event_state *this = &per_cpu(vm_event_states, cpu); 31 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
31 32
32 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 33 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
@@ -383,7 +384,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
383#endif 384#endif
384 385
385#ifdef CONFIG_PROC_FS 386#ifdef CONFIG_PROC_FS
386 387#include <linux/proc_fs.h>
387#include <linux/seq_file.h> 388#include <linux/seq_file.h>
388 389
389static char * const migratetype_names[MIGRATE_TYPES] = { 390static char * const migratetype_names[MIGRATE_TYPES] = {
@@ -515,9 +516,26 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
515 continue; 516 continue;
516 517
517 page = pfn_to_page(pfn); 518 page = pfn_to_page(pfn);
519#ifdef CONFIG_ARCH_FLATMEM_HAS_HOLES
520 /*
521 * Ordinarily, memory holes in flatmem still have a valid
522 * memmap for the PFN range. However, an architecture for
523 * embedded systems (e.g. ARM) can free up the memmap backing
524 * holes to save memory on the assumption the memmap is
525 * never used. The page_zone linkages are then broken even
526 * though pfn_valid() returns true. Skip the page if the
527 * linkages are broken. Even if this test passed, the impact
528 * is that the counters for the movable type are off but
529 * fragmentation monitoring is likely meaningless on small
530 * systems.
531 */
532 if (page_zone(page) != zone)
533 continue;
534#endif
518 mtype = get_pageblock_migratetype(page); 535 mtype = get_pageblock_migratetype(page);
519 536
520 count[mtype]++; 537 if (mtype < MIGRATE_TYPES)
538 count[mtype]++;
521 } 539 }
522 540
523 /* Print counts */ 541 /* Print counts */
@@ -563,20 +581,44 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
563 return 0; 581 return 0;
564} 582}
565 583
566const struct seq_operations fragmentation_op = { 584static const struct seq_operations fragmentation_op = {
567 .start = frag_start, 585 .start = frag_start,
568 .next = frag_next, 586 .next = frag_next,
569 .stop = frag_stop, 587 .stop = frag_stop,
570 .show = frag_show, 588 .show = frag_show,
571}; 589};
572 590
573const struct seq_operations pagetypeinfo_op = { 591static int fragmentation_open(struct inode *inode, struct file *file)
592{
593 return seq_open(file, &fragmentation_op);
594}
595
596static const struct file_operations fragmentation_file_operations = {
597 .open = fragmentation_open,
598 .read = seq_read,
599 .llseek = seq_lseek,
600 .release = seq_release,
601};
602
603static const struct seq_operations pagetypeinfo_op = {
574 .start = frag_start, 604 .start = frag_start,
575 .next = frag_next, 605 .next = frag_next,
576 .stop = frag_stop, 606 .stop = frag_stop,
577 .show = pagetypeinfo_show, 607 .show = pagetypeinfo_show,
578}; 608};
579 609
610static int pagetypeinfo_open(struct inode *inode, struct file *file)
611{
612 return seq_open(file, &pagetypeinfo_op);
613}
614
615static const struct file_operations pagetypeinfo_file_ops = {
616 .open = pagetypeinfo_open,
617 .read = seq_read,
618 .llseek = seq_lseek,
619 .release = seq_release,
620};
621
580#ifdef CONFIG_ZONE_DMA 622#ifdef CONFIG_ZONE_DMA
581#define TEXT_FOR_DMA(xx) xx "_dma", 623#define TEXT_FOR_DMA(xx) xx "_dma",
582#else 624#else
@@ -601,8 +643,14 @@ const struct seq_operations pagetypeinfo_op = {
601static const char * const vmstat_text[] = { 643static const char * const vmstat_text[] = {
602 /* Zoned VM counters */ 644 /* Zoned VM counters */
603 "nr_free_pages", 645 "nr_free_pages",
604 "nr_inactive", 646 "nr_inactive_anon",
605 "nr_active", 647 "nr_active_anon",
648 "nr_inactive_file",
649 "nr_active_file",
650#ifdef CONFIG_UNEVICTABLE_LRU
651 "nr_unevictable",
652 "nr_mlock",
653#endif
606 "nr_anon_pages", 654 "nr_anon_pages",
607 "nr_mapped", 655 "nr_mapped",
608 "nr_file_pages", 656 "nr_file_pages",
@@ -657,6 +705,16 @@ static const char * const vmstat_text[] = {
657 "htlb_buddy_alloc_success", 705 "htlb_buddy_alloc_success",
658 "htlb_buddy_alloc_fail", 706 "htlb_buddy_alloc_fail",
659#endif 707#endif
708#ifdef CONFIG_UNEVICTABLE_LRU
709 "unevictable_pgs_culled",
710 "unevictable_pgs_scanned",
711 "unevictable_pgs_rescued",
712 "unevictable_pgs_mlocked",
713 "unevictable_pgs_munlocked",
714 "unevictable_pgs_cleared",
715 "unevictable_pgs_stranded",
716 "unevictable_pgs_mlockfreed",
717#endif
660#endif 718#endif
661}; 719};
662 720
@@ -670,7 +728,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
670 "\n min %lu" 728 "\n min %lu"
671 "\n low %lu" 729 "\n low %lu"
672 "\n high %lu" 730 "\n high %lu"
673 "\n scanned %lu (a: %lu i: %lu)" 731 "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)"
674 "\n spanned %lu" 732 "\n spanned %lu"
675 "\n present %lu", 733 "\n present %lu",
676 zone_page_state(zone, NR_FREE_PAGES), 734 zone_page_state(zone, NR_FREE_PAGES),
@@ -678,7 +736,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
678 zone->pages_low, 736 zone->pages_low,
679 zone->pages_high, 737 zone->pages_high,
680 zone->pages_scanned, 738 zone->pages_scanned,
681 zone->nr_scan_active, zone->nr_scan_inactive, 739 zone->lru[LRU_ACTIVE_ANON].nr_scan,
740 zone->lru[LRU_INACTIVE_ANON].nr_scan,
741 zone->lru[LRU_ACTIVE_FILE].nr_scan,
742 zone->lru[LRU_INACTIVE_FILE].nr_scan,
682 zone->spanned_pages, 743 zone->spanned_pages,
683 zone->present_pages); 744 zone->present_pages);
684 745
@@ -715,10 +776,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
715 seq_printf(m, 776 seq_printf(m,
716 "\n all_unreclaimable: %u" 777 "\n all_unreclaimable: %u"
717 "\n prev_priority: %i" 778 "\n prev_priority: %i"
718 "\n start_pfn: %lu", 779 "\n start_pfn: %lu"
780 "\n inactive_ratio: %u",
719 zone_is_all_unreclaimable(zone), 781 zone_is_all_unreclaimable(zone),
720 zone->prev_priority, 782 zone->prev_priority,
721 zone->zone_start_pfn); 783 zone->zone_start_pfn,
784 zone->inactive_ratio);
722 seq_putc(m, '\n'); 785 seq_putc(m, '\n');
723} 786}
724 787
@@ -732,7 +795,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
732 return 0; 795 return 0;
733} 796}
734 797
735const struct seq_operations zoneinfo_op = { 798static const struct seq_operations zoneinfo_op = {
736 .start = frag_start, /* iterate over all zones. The same as in 799 .start = frag_start, /* iterate over all zones. The same as in
737 * fragmentation. */ 800 * fragmentation. */
738 .next = frag_next, 801 .next = frag_next,
@@ -740,6 +803,18 @@ const struct seq_operations zoneinfo_op = {
740 .show = zoneinfo_show, 803 .show = zoneinfo_show,
741}; 804};
742 805
806static int zoneinfo_open(struct inode *inode, struct file *file)
807{
808 return seq_open(file, &zoneinfo_op);
809}
810
811static const struct file_operations proc_zoneinfo_file_operations = {
812 .open = zoneinfo_open,
813 .read = seq_read,
814 .llseek = seq_lseek,
815 .release = seq_release,
816};
817
743static void *vmstat_start(struct seq_file *m, loff_t *pos) 818static void *vmstat_start(struct seq_file *m, loff_t *pos)
744{ 819{
745 unsigned long *v; 820 unsigned long *v;
@@ -795,13 +870,24 @@ static void vmstat_stop(struct seq_file *m, void *arg)
795 m->private = NULL; 870 m->private = NULL;
796} 871}
797 872
798const struct seq_operations vmstat_op = { 873static const struct seq_operations vmstat_op = {
799 .start = vmstat_start, 874 .start = vmstat_start,
800 .next = vmstat_next, 875 .next = vmstat_next,
801 .stop = vmstat_stop, 876 .stop = vmstat_stop,
802 .show = vmstat_show, 877 .show = vmstat_show,
803}; 878};
804 879
880static int vmstat_open(struct inode *inode, struct file *file)
881{
882 return seq_open(file, &vmstat_op);
883}
884
885static const struct file_operations proc_vmstat_file_operations = {
886 .open = vmstat_open,
887 .read = seq_read,
888 .llseek = seq_lseek,
889 .release = seq_release,
890};
805#endif /* CONFIG_PROC_FS */ 891#endif /* CONFIG_PROC_FS */
806 892
807#ifdef CONFIG_SMP 893#ifdef CONFIG_SMP
@@ -859,9 +945,11 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
859 945
860static struct notifier_block __cpuinitdata vmstat_notifier = 946static struct notifier_block __cpuinitdata vmstat_notifier =
861 { &vmstat_cpuup_callback, NULL, 0 }; 947 { &vmstat_cpuup_callback, NULL, 0 };
948#endif
862 949
863static int __init setup_vmstat(void) 950static int __init setup_vmstat(void)
864{ 951{
952#ifdef CONFIG_SMP
865 int cpu; 953 int cpu;
866 954
867 refresh_zone_stat_thresholds(); 955 refresh_zone_stat_thresholds();
@@ -869,7 +957,13 @@ static int __init setup_vmstat(void)
869 957
870 for_each_online_cpu(cpu) 958 for_each_online_cpu(cpu)
871 start_cpu_timer(cpu); 959 start_cpu_timer(cpu);
960#endif
961#ifdef CONFIG_PROC_FS
962 proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
963 proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
964 proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
965 proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
966#endif
872 return 0; 967 return 0;
873} 968}
874module_init(setup_vmstat) 969module_init(setup_vmstat)
875#endif