author    Ingo Molnar <mingo@elte.hu>  2008-08-14 06:19:59 -0400
committer Ingo Molnar <mingo@elte.hu>  2008-08-14 06:19:59 -0400
commit    8d7ccaa545490cdffdfaff0842436a8dd85cf47b (patch)
tree      8129b5907161bc6ae26deb3645ce1e280c5e1f51 /mm
parent    b2139aa0eec330c711c5a279db361e5ef1178e78 (diff)
parent    30a2f3c60a84092c8084dfe788b710f8d0768cd4 (diff)

Merge commit 'v2.6.27-rc3' into x86/prototypes

Conflicts:
	include/asm-x86/dma-mapping.h

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |    5
-rw-r--r--  mm/Makefile           |    3
-rw-r--r--  mm/allocpercpu.c      |   24
-rw-r--r--  mm/bootmem.c          |  935
-rw-r--r--  mm/filemap.c          |  422
-rw-r--r--  mm/filemap_xip.c      |    5
-rw-r--r--  mm/fremap.c           |    3
-rw-r--r--  mm/hugetlb.c          | 1681
-rw-r--r--  mm/internal.h         |   61
-rw-r--r--  mm/madvise.c          |    4
-rw-r--r--  mm/memcontrol.c       |  369
-rw-r--r--  mm/memory.c           |  322
-rw-r--r--  mm/memory_hotplug.c   |   80
-rw-r--r--  mm/mempolicy.c        |   10
-rw-r--r--  mm/migrate.c          |   53
-rw-r--r--  mm/mlock.c            |    2
-rw-r--r--  mm/mm_init.c          |  152
-rw-r--r--  mm/mmap.c             |  180
-rw-r--r--  mm/mmu_notifier.c     |  277
-rw-r--r--  mm/mprotect.c         |    9
-rw-r--r--  mm/mremap.c           |    6
-rw-r--r--  mm/nommu.c            |   25
-rw-r--r--  mm/page-writeback.c   |   12
-rw-r--r--  mm/page_alloc.c       |  175
-rw-r--r--  mm/pdflush.c          |    4
-rw-r--r--  mm/readahead.c        |    6
-rw-r--r--  mm/rmap.c             |   34
-rw-r--r--  mm/shmem.c            |  106
-rw-r--r--  mm/shmem_acl.c        |    2
-rw-r--r--  mm/slab.c             |   12
-rw-r--r--  mm/slob.c             |   20
-rw-r--r--  mm/slub.c             |  105
-rw-r--r--  mm/sparse.c           |  116
-rw-r--r--  mm/swap.c             |   17
-rw-r--r--  mm/swap_state.c       |   38
-rw-r--r--  mm/swapfile.c         |   65
-rw-r--r--  mm/truncate.c         |   12
-rw-r--r--  mm/util.c             |   70
-rw-r--r--  mm/vmalloc.c          |   26
-rw-r--r--  mm/vmscan.c           |   93
-rw-r--r--  mm/vmstat.c           |    3
41 files changed, 3918 insertions(+), 1626 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index c4de85285bb4..0bd9c2dbb2a0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -174,7 +174,7 @@ config SPLIT_PTLOCK_CPUS
 config MIGRATION
 	bool "Page migration"
 	def_bool y
-	depends on NUMA
+	depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
 	help
 	  Allows the migration of the physical location of pages of processes
 	  while the virtual addresses are not changed. This is useful for
@@ -205,3 +205,6 @@ config NR_QUICK
 config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
+
+config MMU_NOTIFIER
+	bool
diff --git a/mm/Makefile b/mm/Makefile
index 18c143b3c46c..da4ccf015aea 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o pdflush.o \
 			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-			   page_isolation.o $(mmu-y)
+			   page_isolation.o mm_init.o $(mmu-y)
 
 obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
 obj-$(CONFIG_BOUNCE) += bounce.o
@@ -25,6 +25,7 @@ obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
 obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 obj-$(CONFIG_SLOB) += slob.o
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 05f2b4009ccc..4297bc41bfd2 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -18,27 +18,28 @@
  * Depopulating per-cpu data for a cpu going offline would be a typical
  * use case. You need to register a cpu hotplug handler for that purpose.
  */
-void percpu_depopulate(void *__pdata, int cpu)
+static void percpu_depopulate(void *__pdata, int cpu)
 {
 	struct percpu_data *pdata = __percpu_disguise(__pdata);
 
 	kfree(pdata->ptrs[cpu]);
 	pdata->ptrs[cpu] = NULL;
 }
-EXPORT_SYMBOL_GPL(percpu_depopulate);
 
 /**
  * percpu_depopulate_mask - depopulate per-cpu data for some cpu's
  * @__pdata: per-cpu data to depopulate
  * @mask: depopulate per-cpu data for cpu's selected through mask bits
  */
-void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
+static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask)
 {
 	int cpu;
-	for_each_cpu_mask(cpu, *mask)
+	for_each_cpu_mask_nr(cpu, *mask)
 		percpu_depopulate(__pdata, cpu);
 }
-EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
+
+#define percpu_depopulate_mask(__pdata, mask) \
+	__percpu_depopulate_mask((__pdata), &(mask))
 
 /**
  * percpu_populate - populate per-cpu data for given cpu
@@ -51,7 +52,7 @@ EXPORT_SYMBOL_GPL(__percpu_depopulate_mask);
  * use case. You need to register a cpu hotplug handler for that purpose.
  * Per-cpu object is populated with zeroed buffer.
  */
-void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
+static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
 {
 	struct percpu_data *pdata = __percpu_disguise(__pdata);
 	int node = cpu_to_node(cpu);
@@ -68,7 +69,6 @@ void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu)
 	pdata->ptrs[cpu] = kzalloc(size, gfp);
 	return pdata->ptrs[cpu];
 }
-EXPORT_SYMBOL_GPL(percpu_populate);
 
 /**
  * percpu_populate_mask - populate per-cpu data for more cpu's
@@ -79,14 +79,14 @@ EXPORT_SYMBOL_GPL(percpu_populate);
  *
  * Per-cpu objects are populated with zeroed buffers.
  */
-int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
+static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
 			cpumask_t *mask)
 {
 	cpumask_t populated;
 	int cpu;
 
 	cpus_clear(populated);
-	for_each_cpu_mask(cpu, *mask)
+	for_each_cpu_mask_nr(cpu, *mask)
 		if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) {
 			__percpu_depopulate_mask(__pdata, &populated);
 			return -ENOMEM;
@@ -94,7 +94,9 @@ int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
 			cpu_set(cpu, populated);
 	return 0;
 }
-EXPORT_SYMBOL_GPL(__percpu_populate_mask);
+
+#define percpu_populate_mask(__pdata, size, gfp, mask) \
+	__percpu_populate_mask((__pdata), (size), (gfp), &(mask))
 
 /**
  * percpu_alloc_mask - initial setup of per-cpu data
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 8d9f60e06f62..4af15d0340ad 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * linux/mm/bootmem.c 2 * bootmem - A boot-time physical memory allocator and configurator
3 * 3 *
4 * Copyright (C) 1999 Ingo Molnar 4 * Copyright (C) 1999 Ingo Molnar
5 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 5 * 1999 Kanoj Sarcar, SGI
6 * 2008 Johannes Weiner
6 * 7 *
7 * simple boot-time physical memory area allocator and 8 * Access to this subsystem has to be serialized externally (which is true
8 * free memory collector. It's used to deal with reserved 9 * for the boot process anyway).
9 * system memory and memory holes as well.
10 */ 10 */
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/pfn.h> 12#include <linux/pfn.h>
@@ -19,15 +19,10 @@
19 19
20#include "internal.h" 20#include "internal.h"
21 21
22/*
23 * Access to this subsystem has to be serialized externally. (this is
24 * true for the boot process anyway)
25 */
26unsigned long max_low_pfn; 22unsigned long max_low_pfn;
27unsigned long min_low_pfn; 23unsigned long min_low_pfn;
28unsigned long max_pfn; 24unsigned long max_pfn;
29 25
30static LIST_HEAD(bdata_list);
31#ifdef CONFIG_CRASH_DUMP 26#ifdef CONFIG_CRASH_DUMP
32/* 27/*
33 * If we have booted due to a crash, max_pfn will be a very low value. We need 28 * If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -36,63 +31,72 @@ static LIST_HEAD(bdata_list);
36unsigned long saved_max_pfn; 31unsigned long saved_max_pfn;
37#endif 32#endif
38 33
39/* return the number of _pages_ that will be allocated for the boot bitmap */ 34bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
40unsigned long __init bootmem_bootmap_pages(unsigned long pages) 35
36static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
37
38static int bootmem_debug;
39
40static int __init bootmem_debug_setup(char *buf)
41{ 41{
42 unsigned long mapsize; 42 bootmem_debug = 1;
43 return 0;
44}
45early_param("bootmem_debug", bootmem_debug_setup);
43 46
44 mapsize = (pages+7)/8; 47#define bdebug(fmt, args...) ({ \
45 mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK; 48 if (unlikely(bootmem_debug)) \
46 mapsize >>= PAGE_SHIFT; 49 printk(KERN_INFO \
50 "bootmem::%s " fmt, \
51 __FUNCTION__, ## args); \
52})
47 53
48 return mapsize; 54static unsigned long __init bootmap_bytes(unsigned long pages)
55{
56 unsigned long bytes = (pages + 7) / 8;
57
58 return ALIGN(bytes, sizeof(long));
49} 59}
50 60
51/* 61/**
52 * link bdata in order 62 * bootmem_bootmap_pages - calculate bitmap size in pages
63 * @pages: number of pages the bitmap has to represent
53 */ 64 */
54static void __init link_bootmem(bootmem_data_t *bdata) 65unsigned long __init bootmem_bootmap_pages(unsigned long pages)
55{ 66{
56 bootmem_data_t *ent; 67 unsigned long bytes = bootmap_bytes(pages);
57 68
58 if (list_empty(&bdata_list)) { 69 return PAGE_ALIGN(bytes) >> PAGE_SHIFT;
59 list_add(&bdata->list, &bdata_list);
60 return;
61 }
62 /* insert in order */
63 list_for_each_entry(ent, &bdata_list, list) {
64 if (bdata->node_boot_start < ent->node_boot_start) {
65 list_add_tail(&bdata->list, &ent->list);
66 return;
67 }
68 }
69 list_add_tail(&bdata->list, &bdata_list);
70} 70}
71 71
72/* 72/*
73 * Given an initialised bdata, it returns the size of the boot bitmap 73 * link bdata in order
74 */ 74 */
75static unsigned long __init get_mapsize(bootmem_data_t *bdata) 75static void __init link_bootmem(bootmem_data_t *bdata)
76{ 76{
77 unsigned long mapsize; 77 struct list_head *iter;
78 unsigned long start = PFN_DOWN(bdata->node_boot_start);
79 unsigned long end = bdata->node_low_pfn;
80 78
81 mapsize = ((end - start) + 7) / 8; 79 list_for_each(iter, &bdata_list) {
82 return ALIGN(mapsize, sizeof(long)); 80 bootmem_data_t *ent;
81
82 ent = list_entry(iter, bootmem_data_t, list);
83 if (bdata->node_min_pfn < ent->node_min_pfn)
84 break;
85 }
86 list_add_tail(&bdata->list, iter);
83} 87}
84 88
85/* 89/*
86 * Called once to set up the allocator itself. 90 * Called once to set up the allocator itself.
87 */ 91 */
88static unsigned long __init init_bootmem_core(pg_data_t *pgdat, 92static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
89 unsigned long mapstart, unsigned long start, unsigned long end) 93 unsigned long mapstart, unsigned long start, unsigned long end)
90{ 94{
91 bootmem_data_t *bdata = pgdat->bdata;
92 unsigned long mapsize; 95 unsigned long mapsize;
93 96
97 mminit_validate_memmodel_limits(&start, &end);
94 bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart)); 98 bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
95 bdata->node_boot_start = PFN_PHYS(start); 99 bdata->node_min_pfn = start;
96 bdata->node_low_pfn = end; 100 bdata->node_low_pfn = end;
97 link_bootmem(bdata); 101 link_bootmem(bdata);
98 102
@@ -100,429 +104,461 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
100 * Initially all pages are reserved - setup_arch() has to 104 * Initially all pages are reserved - setup_arch() has to
101 * register free RAM areas explicitly. 105 * register free RAM areas explicitly.
102 */ 106 */
103 mapsize = get_mapsize(bdata); 107 mapsize = bootmap_bytes(end - start);
104 memset(bdata->node_bootmem_map, 0xff, mapsize); 108 memset(bdata->node_bootmem_map, 0xff, mapsize);
105 109
110 bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
111 bdata - bootmem_node_data, start, mapstart, end, mapsize);
112
106 return mapsize; 113 return mapsize;
107} 114}
108 115
109/* 116/**
110 * Marks a particular physical memory range as unallocatable. Usable RAM 117 * init_bootmem_node - register a node as boot memory
111 * might be used for boot-time allocations - or it might get added 118 * @pgdat: node to register
112 * to the free page pool later on. 119 * @freepfn: pfn where the bitmap for this node is to be placed
120 * @startpfn: first pfn on the node
121 * @endpfn: first pfn after the node
122 *
123 * Returns the number of bytes needed to hold the bitmap for this node.
113 */ 124 */
114static int __init can_reserve_bootmem_core(bootmem_data_t *bdata, 125unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
115 unsigned long addr, unsigned long size, int flags) 126 unsigned long startpfn, unsigned long endpfn)
116{ 127{
117 unsigned long sidx, eidx; 128 return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
118 unsigned long i; 129}
119 130
120 BUG_ON(!size); 131/**
132 * init_bootmem - register boot memory
133 * @start: pfn where the bitmap is to be placed
134 * @pages: number of available physical pages
135 *
136 * Returns the number of bytes needed to hold the bitmap.
137 */
138unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
139{
140 max_low_pfn = pages;
141 min_low_pfn = start;
142 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
143}
121 144
122 /* out of range, don't hold other */ 145static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
123 if (addr + size < bdata->node_boot_start || 146{
124 PFN_DOWN(addr) > bdata->node_low_pfn) 147 int aligned;
148 struct page *page;
149 unsigned long start, end, pages, count = 0;
150
151 if (!bdata->node_bootmem_map)
125 return 0; 152 return 0;
126 153
154 start = bdata->node_min_pfn;
155 end = bdata->node_low_pfn;
156
127 /* 157 /*
128 * Round up to index to the range. 158 * If the start is aligned to the machines wordsize, we might
159 * be able to free pages in bulks of that order.
129 */ 160 */
130 if (addr > bdata->node_boot_start) 161 aligned = !(start & (BITS_PER_LONG - 1));
131 sidx= PFN_DOWN(addr - bdata->node_boot_start);
132 else
133 sidx = 0;
134 162
135 eidx = PFN_UP(addr + size - bdata->node_boot_start); 163 bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
136 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 164 bdata - bootmem_node_data, start, end, aligned);
137 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
138 165
139 for (i = sidx; i < eidx; i++) { 166 while (start < end) {
140 if (test_bit(i, bdata->node_bootmem_map)) { 167 unsigned long *map, idx, vec;
141 if (flags & BOOTMEM_EXCLUSIVE)
142 return -EBUSY;
143 }
144 }
145 168
146 return 0; 169 map = bdata->node_bootmem_map;
170 idx = start - bdata->node_min_pfn;
171 vec = ~map[idx / BITS_PER_LONG];
147 172
148} 173 if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
174 int order = ilog2(BITS_PER_LONG);
149 175
150static void __init reserve_bootmem_core(bootmem_data_t *bdata, 176 __free_pages_bootmem(pfn_to_page(start), order);
151 unsigned long addr, unsigned long size, int flags) 177 count += BITS_PER_LONG;
152{ 178 } else {
153 unsigned long sidx, eidx; 179 unsigned long off = 0;
154 unsigned long i;
155
156 BUG_ON(!size);
157 180
158 /* out of range */ 181 while (vec && off < BITS_PER_LONG) {
159 if (addr + size < bdata->node_boot_start || 182 if (vec & 1) {
160 PFN_DOWN(addr) > bdata->node_low_pfn) 183 page = pfn_to_page(start + off);
161 return; 184 __free_pages_bootmem(page, 0);
185 count++;
186 }
187 vec >>= 1;
188 off++;
189 }
190 }
191 start += BITS_PER_LONG;
192 }
162 193
163 /* 194 page = virt_to_page(bdata->node_bootmem_map);
164 * Round up to index to the range. 195 pages = bdata->node_low_pfn - bdata->node_min_pfn;
165 */ 196 pages = bootmem_bootmap_pages(pages);
166 if (addr > bdata->node_boot_start) 197 count += pages;
167 sidx= PFN_DOWN(addr - bdata->node_boot_start); 198 while (pages--)
168 else 199 __free_pages_bootmem(page++, 0);
169 sidx = 0;
170 200
171 eidx = PFN_UP(addr + size - bdata->node_boot_start); 201 bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
172 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start))
173 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
174 202
175 for (i = sidx; i < eidx; i++) { 203 return count;
176 if (test_and_set_bit(i, bdata->node_bootmem_map)) {
177#ifdef CONFIG_DEBUG_BOOTMEM
178 printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
179#endif
180 }
181 }
182} 204}
183 205
184static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, 206/**
185 unsigned long size) 207 * free_all_bootmem_node - release a node's free pages to the buddy allocator
208 * @pgdat: node to be released
209 *
210 * Returns the number of pages actually released.
211 */
212unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
186{ 213{
187 unsigned long sidx, eidx; 214 register_page_bootmem_info_node(pgdat);
188 unsigned long i; 215 return free_all_bootmem_core(pgdat->bdata);
189 216}
190 BUG_ON(!size);
191 217
192 /* out range */ 218/**
193 if (addr + size < bdata->node_boot_start || 219 * free_all_bootmem - release free pages to the buddy allocator
194 PFN_DOWN(addr) > bdata->node_low_pfn) 220 *
195 return; 221 * Returns the number of pages actually released.
196 /* 222 */
197 * round down end of usable mem, partially free pages are 223unsigned long __init free_all_bootmem(void)
198 * considered reserved. 224{
199 */ 225 return free_all_bootmem_core(NODE_DATA(0)->bdata);
226}
200 227
201 if (addr >= bdata->node_boot_start && addr < bdata->last_success) 228static void __init __free(bootmem_data_t *bdata,
202 bdata->last_success = addr; 229 unsigned long sidx, unsigned long eidx)
230{
231 unsigned long idx;
203 232
204 /* 233 bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
205 * Round up to index to the range. 234 sidx + bdata->node_min_pfn,
206 */ 235 eidx + bdata->node_min_pfn);
207 if (PFN_UP(addr) > PFN_DOWN(bdata->node_boot_start))
208 sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
209 else
210 sidx = 0;
211 236
212 eidx = PFN_DOWN(addr + size - bdata->node_boot_start); 237 if (bdata->hint_idx > sidx)
213 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 238 bdata->hint_idx = sidx;
214 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
215 239
216 for (i = sidx; i < eidx; i++) { 240 for (idx = sidx; idx < eidx; idx++)
217 if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) 241 if (!test_and_clear_bit(idx, bdata->node_bootmem_map))
218 BUG(); 242 BUG();
219 }
220} 243}
221 244
222/* 245static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
223 * We 'merge' subsequent allocations to save space. We might 'lose' 246 unsigned long eidx, int flags)
224 * some fraction of a page if allocations cannot be satisfied due to
225 * size constraints on boxes where there is physical RAM space
226 * fragmentation - in these cases (mostly large memory boxes) this
227 * is not a problem.
228 *
229 * On low memory boxes we get it right in 100% of the cases.
230 *
231 * alignment has to be a power of 2 value.
232 *
233 * NOTE: This function is _not_ reentrant.
234 */
235void * __init
236__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
237 unsigned long align, unsigned long goal, unsigned long limit)
238{ 247{
239 unsigned long areasize, preferred; 248 unsigned long idx;
240 unsigned long i, start = 0, incr, eidx, end_pfn; 249 int exclusive = flags & BOOTMEM_EXCLUSIVE;
241 void *ret; 250
242 unsigned long node_boot_start; 251 bdebug("nid=%td start=%lx end=%lx flags=%x\n",
243 void *node_bootmem_map; 252 bdata - bootmem_node_data,
244 253 sidx + bdata->node_min_pfn,
245 if (!size) { 254 eidx + bdata->node_min_pfn,
246 printk("__alloc_bootmem_core(): zero-sized request\n"); 255 flags);
247 BUG(); 256
248 } 257 for (idx = sidx; idx < eidx; idx++)
249 BUG_ON(align & (align-1)); 258 if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
250 259 if (exclusive) {
251 /* on nodes without memory - bootmem_map is NULL */ 260 __free(bdata, sidx, idx);
252 if (!bdata->node_bootmem_map) 261 return -EBUSY;
253 return NULL; 262 }
263 bdebug("silent double reserve of PFN %lx\n",
264 idx + bdata->node_min_pfn);
265 }
266 return 0;
267}
254 268
255 /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */ 269static int __init mark_bootmem_node(bootmem_data_t *bdata,
256 node_boot_start = bdata->node_boot_start; 270 unsigned long start, unsigned long end,
257 node_bootmem_map = bdata->node_bootmem_map; 271 int reserve, int flags)
258 if (align) { 272{
259 node_boot_start = ALIGN(bdata->node_boot_start, align); 273 unsigned long sidx, eidx;
260 if (node_boot_start > bdata->node_boot_start)
261 node_bootmem_map = (unsigned long *)bdata->node_bootmem_map +
262 PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG;
263 }
264 274
265 if (limit && node_boot_start >= limit) 275 bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n",
266 return NULL; 276 bdata - bootmem_node_data, start, end, reserve, flags);
267 277
268 end_pfn = bdata->node_low_pfn; 278 BUG_ON(start < bdata->node_min_pfn);
269 limit = PFN_DOWN(limit); 279 BUG_ON(end > bdata->node_low_pfn);
270 if (limit && end_pfn > limit)
271 end_pfn = limit;
272 280
273 eidx = end_pfn - PFN_DOWN(node_boot_start); 281 sidx = start - bdata->node_min_pfn;
282 eidx = end - bdata->node_min_pfn;
274 283
275 /* 284 if (reserve)
276 * We try to allocate bootmem pages above 'goal' 285 return __reserve(bdata, sidx, eidx, flags);
277 * first, then we try to allocate lower pages. 286 else
278 */ 287 __free(bdata, sidx, eidx);
279 preferred = 0; 288 return 0;
280 if (goal && PFN_DOWN(goal) < end_pfn) { 289}
281 if (goal > node_boot_start)
282 preferred = goal - node_boot_start;
283
284 if (bdata->last_success > node_boot_start &&
285 bdata->last_success - node_boot_start >= preferred)
286 if (!limit || (limit && limit > bdata->last_success))
287 preferred = bdata->last_success - node_boot_start;
288 }
289 290
290 preferred = PFN_DOWN(ALIGN(preferred, align)); 291static int __init mark_bootmem(unsigned long start, unsigned long end,
291 areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; 292 int reserve, int flags)
292 incr = align >> PAGE_SHIFT ? : 1; 293{
294 unsigned long pos;
295 bootmem_data_t *bdata;
293 296
294restart_scan: 297 pos = start;
295 for (i = preferred; i < eidx;) { 298 list_for_each_entry(bdata, &bdata_list, list) {
296 unsigned long j; 299 int err;
300 unsigned long max;
297 301
298 i = find_next_zero_bit(node_bootmem_map, eidx, i); 302 if (pos < bdata->node_min_pfn ||
299 i = ALIGN(i, incr); 303 pos >= bdata->node_low_pfn) {
300 if (i >= eidx) 304 BUG_ON(pos != start);
301 break;
302 if (test_bit(i, node_bootmem_map)) {
303 i += incr;
304 continue; 305 continue;
305 } 306 }
306 for (j = i + 1; j < i + areasize; ++j) {
307 if (j >= eidx)
308 goto fail_block;
309 if (test_bit(j, node_bootmem_map))
310 goto fail_block;
311 }
312 start = i;
313 goto found;
314 fail_block:
315 i = ALIGN(j, incr);
316 if (i == j)
317 i += incr;
318 }
319 307
320 if (preferred > 0) { 308 max = min(bdata->node_low_pfn, end);
321 preferred = 0;
322 goto restart_scan;
323 }
324 return NULL;
325 309
326found: 310 err = mark_bootmem_node(bdata, pos, max, reserve, flags);
327 bdata->last_success = PFN_PHYS(start) + node_boot_start; 311 if (reserve && err) {
328 BUG_ON(start >= eidx); 312 mark_bootmem(start, pos, 0, 0);
329 313 return err;
330 /*
331 * Is the next page of the previous allocation-end the start
332 * of this allocation's buffer? If yes then we can 'merge'
333 * the previous partial page with this allocation.
334 */
335 if (align < PAGE_SIZE &&
336 bdata->last_offset && bdata->last_pos+1 == start) {
337 unsigned long offset, remaining_size;
338 offset = ALIGN(bdata->last_offset, align);
339 BUG_ON(offset > PAGE_SIZE);
340 remaining_size = PAGE_SIZE - offset;
341 if (size < remaining_size) {
342 areasize = 0;
343 /* last_pos unchanged */
344 bdata->last_offset = offset + size;
345 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
346 offset + node_boot_start);
347 } else {
348 remaining_size = size - remaining_size;
349 areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
350 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
351 offset + node_boot_start);
352 bdata->last_pos = start + areasize - 1;
353 bdata->last_offset = remaining_size;
354 } 314 }
355 bdata->last_offset &= ~PAGE_MASK;
356 } else {
357 bdata->last_pos = start + areasize - 1;
358 bdata->last_offset = size & ~PAGE_MASK;
359 ret = phys_to_virt(start * PAGE_SIZE + node_boot_start);
360 }
361 315
362 /* 316 if (max == end)
363 * Reserve the area now: 317 return 0;
364 */ 318 pos = bdata->node_low_pfn;
365 for (i = start; i < start + areasize; i++) 319 }
366 if (unlikely(test_and_set_bit(i, node_bootmem_map))) 320 BUG();
367 BUG();
368 memset(ret, 0, size);
369 return ret;
370} 321}
371 322
372static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) 323/**
324 * free_bootmem_node - mark a page range as usable
325 * @pgdat: node the range resides on
326 * @physaddr: starting address of the range
327 * @size: size of the range in bytes
328 *
329 * Partial pages will be considered reserved and left as they are.
330 *
331 * The range must reside completely on the specified node.
332 */
333void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
334 unsigned long size)
373{ 335{
374 struct page *page; 336 unsigned long start, end;
375 unsigned long pfn;
376 bootmem_data_t *bdata = pgdat->bdata;
377 unsigned long i, count, total = 0;
378 unsigned long idx;
379 unsigned long *map;
380 int gofast = 0;
381
382 BUG_ON(!bdata->node_bootmem_map);
383
384 count = 0;
385 /* first extant page of the node */
386 pfn = PFN_DOWN(bdata->node_boot_start);
387 idx = bdata->node_low_pfn - pfn;
388 map = bdata->node_bootmem_map;
389 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
390 if (bdata->node_boot_start == 0 ||
391 ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
392 gofast = 1;
393 for (i = 0; i < idx; ) {
394 unsigned long v = ~map[i / BITS_PER_LONG];
395
396 if (gofast && v == ~0UL) {
397 int order;
398
399 page = pfn_to_page(pfn);
400 count += BITS_PER_LONG;
401 order = ffs(BITS_PER_LONG) - 1;
402 __free_pages_bootmem(page, order);
403 i += BITS_PER_LONG;
404 page += BITS_PER_LONG;
405 } else if (v) {
406 unsigned long m;
407
408 page = pfn_to_page(pfn);
409 for (m = 1; m && i < idx; m<<=1, page++, i++) {
410 if (v & m) {
411 count++;
412 __free_pages_bootmem(page, 0);
413 }
414 }
415 } else {
416 i += BITS_PER_LONG;
417 }
418 pfn += BITS_PER_LONG;
419 }
420 total += count;
421 337
422 /* 338 start = PFN_UP(physaddr);
423 * Now free the allocator bitmap itself, it's not 339 end = PFN_DOWN(physaddr + size);
424 * needed anymore:
425 */
426 page = virt_to_page(bdata->node_bootmem_map);
427 count = 0;
428 idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT;
429 for (i = 0; i < idx; i++, page++) {
430 __free_pages_bootmem(page, 0);
431 count++;
432 }
433 total += count;
434 bdata->node_bootmem_map = NULL;
435 340
436 return total; 341 mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
437} 342}
438 343
439unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, 344/**
440 unsigned long startpfn, unsigned long endpfn) 345 * free_bootmem - mark a page range as usable
441{ 346 * @addr: starting address of the range
442 return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); 347 * @size: size of the range in bytes
443} 348 *
444 349 * Partial pages will be considered reserved and left as they are.
445int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 350 *
446 unsigned long size, int flags) 351 * The range must be contiguous but may span node boundaries.
352 */
353void __init free_bootmem(unsigned long addr, unsigned long size)
447{ 354{
448 int ret; 355 unsigned long start, end;
449 356
450 ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); 357 start = PFN_UP(addr);
451 if (ret < 0) 358 end = PFN_DOWN(addr + size);
452 return -ENOMEM;
453 reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
454 359
455 return 0; 360 mark_bootmem(start, end, 0, 0);
456} 361}
457 362
458void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 363/**
459 unsigned long size) 364 * reserve_bootmem_node - mark a page range as reserved
365 * @pgdat: node the range resides on
366 * @physaddr: starting address of the range
367 * @size: size of the range in bytes
368 * @flags: reservation flags (see linux/bootmem.h)
369 *
370 * Partial pages will be reserved.
371 *
372 * The range must reside completely on the specified node.
373 */
374int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
375 unsigned long size, int flags)
460{ 376{
461 free_bootmem_core(pgdat->bdata, physaddr, size); 377 unsigned long start, end;
462}
463 378
464unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 379 start = PFN_DOWN(physaddr);
465{ 380 end = PFN_UP(physaddr + size);
466 register_page_bootmem_info_node(pgdat);
467 return free_all_bootmem_core(pgdat);
468}
469 381
470unsigned long __init init_bootmem(unsigned long start, unsigned long pages) 382 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
471{
472 max_low_pfn = pages;
473 min_low_pfn = start;
474 return init_bootmem_core(NODE_DATA(0), start, 0, pages);
475} 383}
476 384
477#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 385#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
386/**
387 * reserve_bootmem - mark a page range as usable
388 * @addr: starting address of the range
389 * @size: size of the range in bytes
390 * @flags: reservation flags (see linux/bootmem.h)
391 *
392 * Partial pages will be reserved.
393 *
394 * The range must be contiguous but may span node boundaries.
395 */
478int __init reserve_bootmem(unsigned long addr, unsigned long size, 396int __init reserve_bootmem(unsigned long addr, unsigned long size,
479 int flags) 397 int flags)
480{ 398{
481 bootmem_data_t *bdata; 399 unsigned long start, end;
482 int ret;
483 400
484 list_for_each_entry(bdata, &bdata_list, list) { 401 start = PFN_DOWN(addr);
485 ret = can_reserve_bootmem_core(bdata, addr, size, flags); 402 end = PFN_UP(addr + size);
486 if (ret < 0)
487 return ret;
488 }
489 list_for_each_entry(bdata, &bdata_list, list)
490 reserve_bootmem_core(bdata, addr, size, flags);
491 403
492 return 0; 404 return mark_bootmem(start, end, 1, flags);
493} 405}
494#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 406#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
495 407
496void __init free_bootmem(unsigned long addr, unsigned long size) 408static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
409 unsigned long size, unsigned long align,
410 unsigned long goal, unsigned long limit)
497{ 411{
498 bootmem_data_t *bdata; 412 unsigned long fallback = 0;
499 list_for_each_entry(bdata, &bdata_list, list) 413 unsigned long min, max, start, sidx, midx, step;
500 free_bootmem_core(bdata, addr, size);
501}
502 414
503unsigned long __init free_all_bootmem(void) 415 BUG_ON(!size);
504{ 416 BUG_ON(align & (align - 1));
505 return free_all_bootmem_core(NODE_DATA(0)); 417 BUG_ON(limit && goal + size > limit);
418
419 if (!bdata->node_bootmem_map)
420 return NULL;
421
422 bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
423 bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
424 align, goal, limit);
425
426 min = bdata->node_min_pfn;
427 max = bdata->node_low_pfn;
428
429 goal >>= PAGE_SHIFT;
430 limit >>= PAGE_SHIFT;
431
432 if (limit && max > limit)
433 max = limit;
434 if (max <= min)
435 return NULL;
436
437 step = max(align >> PAGE_SHIFT, 1UL);
438
439 if (goal && min < goal && goal < max)
440 start = ALIGN(goal, step);
441 else
442 start = ALIGN(min, step);
443
444 sidx = start - bdata->node_min_pfn;;
445 midx = max - bdata->node_min_pfn;
446
447 if (bdata->hint_idx > sidx) {
448 /*
449 * Handle the valid case of sidx being zero and still
450 * catch the fallback below.
451 */
452 fallback = sidx + 1;
453 sidx = ALIGN(bdata->hint_idx, step);
454 }
455
456 while (1) {
457 int merge;
458 void *region;
459 unsigned long eidx, i, start_off, end_off;
460find_block:
461 sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
462 sidx = ALIGN(sidx, step);
463 eidx = sidx + PFN_UP(size);
464
465 if (sidx >= midx || eidx > midx)
466 break;
467
468 for (i = sidx; i < eidx; i++)
469 if (test_bit(i, bdata->node_bootmem_map)) {
470 sidx = ALIGN(i, step);
471 if (sidx == i)
472 sidx += step;
473 goto find_block;
474 }
475
476 if (bdata->last_end_off &&
477 PFN_DOWN(bdata->last_end_off) + 1 == sidx)
478 start_off = ALIGN(bdata->last_end_off, align);
479 else
480 start_off = PFN_PHYS(sidx);
481
482 merge = PFN_DOWN(start_off) < sidx;
483 end_off = start_off + size;
484
485 bdata->last_end_off = end_off;
486 bdata->hint_idx = PFN_UP(end_off);
487
488 /*
489 * Reserve the area now:
490 */
491 if (__reserve(bdata, PFN_DOWN(start_off) + merge,
492 PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
493 BUG();
494
495 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
496 start_off);
497 memset(region, 0, size);
498 return region;
499 }
500
501 if (fallback) {
502 sidx = ALIGN(fallback - 1, step);
503 fallback = 0;
504 goto find_block;
505 }
506
507 return NULL;
506} 508}
507 509
508void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, 510static void * __init ___alloc_bootmem_nopanic(unsigned long size,
509 unsigned long goal) 511 unsigned long align,
512 unsigned long goal,
513 unsigned long limit)
510{ 514{
511 bootmem_data_t *bdata; 515 bootmem_data_t *bdata;
512 void *ptr;
513 516
517restart:
514 list_for_each_entry(bdata, &bdata_list, list) { 518 list_for_each_entry(bdata, &bdata_list, list) {
515 ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); 519 void *region;
516 if (ptr) 520
517 return ptr; 521 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
522 continue;
523 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
524 break;
525
526 region = alloc_bootmem_core(bdata, size, align, goal, limit);
527 if (region)
528 return region;
529 }
530
531 if (goal) {
532 goal = 0;
533 goto restart;
518 } 534 }
535
519 return NULL; 536 return NULL;
520} 537}
521 538
522void * __init __alloc_bootmem(unsigned long size, unsigned long align, 539/**
523 unsigned long goal) 540 * __alloc_bootmem_nopanic - allocate boot memory without panicking
541 * @size: size of the request in bytes
542 * @align: alignment of the region
543 * @goal: preferred starting address of the region
544 *
545 * The goal is dropped if it can not be satisfied and the allocation will
546 * fall back to memory below @goal.
547 *
548 * Allocation may happen on any node in the system.
549 *
550 * Returns NULL on failure.
551 */
552void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
553 unsigned long goal)
524{ 554{
525 void *mem = __alloc_bootmem_nopanic(size,align,goal); 555 return ___alloc_bootmem_nopanic(size, align, goal, 0);
556}
557
558static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
559 unsigned long goal, unsigned long limit)
560{
561 void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
526 562
527 if (mem) 563 if (mem)
528 return mem; 564 return mem;
@@ -534,78 +570,135 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
534 return NULL; 570 return NULL;
535} 571}
536 572
573/**
574 * __alloc_bootmem - allocate boot memory
575 * @size: size of the request in bytes
576 * @align: alignment of the region
577 * @goal: preferred starting address of the region
578 *
579 * The goal is dropped if it can not be satisfied and the allocation will
580 * fall back to memory below @goal.
581 *
582 * Allocation may happen on any node in the system.
583 *
584 * The function panics if the request can not be satisfied.
585 */
586void * __init __alloc_bootmem(unsigned long size, unsigned long align,
587 unsigned long goal)
588{
589 return ___alloc_bootmem(size, align, goal, 0);
590}
537 591
538void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, 592static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
539 unsigned long align, unsigned long goal) 593 unsigned long size, unsigned long align,
594 unsigned long goal, unsigned long limit)
540{ 595{
541 void *ptr; 596 void *ptr;
542 597
543 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 598 ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
544 if (ptr) 599 if (ptr)
545 return ptr; 600 return ptr;
546 601
547 return __alloc_bootmem(size, align, goal); 602 return ___alloc_bootmem(size, align, goal, limit);
603}
604
605/**
606 * __alloc_bootmem_node - allocate boot memory from a specific node
607 * @pgdat: node to allocate from
608 * @size: size of the request in bytes
609 * @align: alignment of the region
610 * @goal: preferred starting address of the region
611 *
612 * The goal is dropped if it can not be satisfied and the allocation will
613 * fall back to memory below @goal.
614 *
615 * Allocation may fall back to any node in the system if the specified node
616 * can not hold the requested memory.
617 *
618 * The function panics if the request can not be satisfied.
619 */
620void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
621 unsigned long align, unsigned long goal)
622{
623 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
548} 624}
549 625
550#ifdef CONFIG_SPARSEMEM 626#ifdef CONFIG_SPARSEMEM
627/**
628 * alloc_bootmem_section - allocate boot memory from a specific section
629 * @size: size of the request in bytes
630 * @section_nr: sparse map section to allocate from
631 *
632 * Return NULL on failure.
633 */
551void * __init alloc_bootmem_section(unsigned long size, 634void * __init alloc_bootmem_section(unsigned long size,
552 unsigned long section_nr) 635 unsigned long section_nr)
553{ 636{
554 void *ptr; 637 bootmem_data_t *bdata;
555 unsigned long limit, goal, start_nr, end_nr, pfn; 638 unsigned long pfn, goal, limit;
556 struct pglist_data *pgdat;
557 639
558 pfn = section_nr_to_pfn(section_nr); 640 pfn = section_nr_to_pfn(section_nr);
559 goal = PFN_PHYS(pfn); 641 goal = pfn << PAGE_SHIFT;
560 limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1; 642 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
561 pgdat = NODE_DATA(early_pfn_to_nid(pfn)); 643 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
562 ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal,
563 limit);
564 644
565 if (!ptr) 645 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
566 return NULL; 646}
647#endif
567 648
568 start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr))); 649void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
569 end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size)); 650 unsigned long align, unsigned long goal)
570 if (start_nr != section_nr || end_nr != section_nr) { 651{
571 printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n", 652 void *ptr;
572 section_nr);
573 free_bootmem_core(pgdat->bdata, __pa(ptr), size);
574 ptr = NULL;
575 }
576 653
577 return ptr; 654 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
655 if (ptr)
656 return ptr;
657
658 return __alloc_bootmem_nopanic(size, align, goal);
578} 659}
579#endif
580 660
581#ifndef ARCH_LOW_ADDRESS_LIMIT 661#ifndef ARCH_LOW_ADDRESS_LIMIT
582#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL 662#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
583#endif 663#endif
584 664
665/**
666 * __alloc_bootmem_low - allocate low boot memory
667 * @size: size of the request in bytes
668 * @align: alignment of the region
669 * @goal: preferred starting address of the region
670 *
671 * The goal is dropped if it can not be satisfied and the allocation will
672 * fall back to memory below @goal.
673 *
674 * Allocation may happen on any node in the system.
675 *
676 * The function panics if the request can not be satisfied.
677 */
585void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, 678void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
586 unsigned long goal) 679 unsigned long goal)
587{ 680{
588 bootmem_data_t *bdata; 681 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
589 void *ptr;
590
591 list_for_each_entry(bdata, &bdata_list, list) {
592 ptr = __alloc_bootmem_core(bdata, size, align, goal,
593 ARCH_LOW_ADDRESS_LIMIT);
594 if (ptr)
595 return ptr;
596 }
597
598 /*
599 * Whoops, we cannot satisfy the allocation request.
600 */
601 printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
602 panic("Out of low memory");
603 return NULL;
604} 682}
605 683
684/**
685 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
686 * @pgdat: node to allocate from
687 * @size: size of the request in bytes
688 * @align: alignment of the region
689 * @goal: preferred starting address of the region
690 *
691 * The goal is dropped if it can not be satisfied and the allocation will
692 * fall back to memory below @goal.
693 *
694 * Allocation may fall back to any node in the system if the specified node
695 * can not hold the requested memory.
696 *
697 * The function panics if the request can not be satisfied.
698 */
606void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 699void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
607 unsigned long align, unsigned long goal) 700 unsigned long align, unsigned long goal)
608{ 701{
609 return __alloc_bootmem_core(pgdat->bdata, size, align, goal, 702 return ___alloc_bootmem_node(pgdat->bdata, size, align,
610 ARCH_LOW_ADDRESS_LIMIT); 703 goal, ARCH_LOW_ADDRESS_LIMIT);
611} 704}
diff --git a/mm/filemap.c b/mm/filemap.c
index 65d9d9e2b755..54e968650855 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,9 +42,6 @@
42 42
43#include <asm/mman.h> 43#include <asm/mman.h>
44 44
45static ssize_t
46generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
47 loff_t offset, unsigned long nr_segs);
48 45
49/* 46/*
50 * Shared mappings implemented 30.11.1994. It's not fully working yet, 47 * Shared mappings implemented 30.11.1994. It's not fully working yet,
@@ -112,13 +109,13 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
112/* 109/*
113 * Remove a page from the page cache and free it. Caller has to make 110 * Remove a page from the page cache and free it. Caller has to make
114 * sure the page is locked and that nobody else uses it - or that usage 111 * sure the page is locked and that nobody else uses it - or that usage
115 * is safe. The caller must hold a write_lock on the mapping's tree_lock. 112 * is safe. The caller must hold the mapping's tree_lock.
116 */ 113 */
117void __remove_from_page_cache(struct page *page) 114void __remove_from_page_cache(struct page *page)
118{ 115{
119 struct address_space *mapping = page->mapping; 116 struct address_space *mapping = page->mapping;
120 117
121 mem_cgroup_uncharge_page(page); 118 mem_cgroup_uncharge_cache_page(page);
122 radix_tree_delete(&mapping->page_tree, page->index); 119 radix_tree_delete(&mapping->page_tree, page->index);
123 page->mapping = NULL; 120 page->mapping = NULL;
124 mapping->nrpages--; 121 mapping->nrpages--;
@@ -144,9 +141,9 @@ void remove_from_page_cache(struct page *page)
144 141
145 BUG_ON(!PageLocked(page)); 142 BUG_ON(!PageLocked(page));
146 143
147 write_lock_irq(&mapping->tree_lock); 144 spin_lock_irq(&mapping->tree_lock);
148 __remove_from_page_cache(page); 145 __remove_from_page_cache(page);
149 write_unlock_irq(&mapping->tree_lock); 146 spin_unlock_irq(&mapping->tree_lock);
150} 147}
151 148
152static int sync_page(void *word) 149static int sync_page(void *word)
@@ -445,48 +442,52 @@ int filemap_write_and_wait_range(struct address_space *mapping,
445} 442}
446 443
447/** 444/**
448 * add_to_page_cache - add newly allocated pagecache pages 445 * add_to_page_cache_locked - add a locked page to the pagecache
449 * @page: page to add 446 * @page: page to add
450 * @mapping: the page's address_space 447 * @mapping: the page's address_space
451 * @offset: page index 448 * @offset: page index
452 * @gfp_mask: page allocation mode 449 * @gfp_mask: page allocation mode
453 * 450 *
454 * This function is used to add newly allocated pagecache pages; 451 * This function is used to add a page to the pagecache. It must be locked.
455 * the page is new, so we can just run SetPageLocked() against it.
456 * The other page state flags were set by rmqueue().
457 *
458 * This function does not add the page to the LRU. The caller must do that. 452 * This function does not add the page to the LRU. The caller must do that.
459 */ 453 */
460int add_to_page_cache(struct page *page, struct address_space *mapping, 454int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
461 pgoff_t offset, gfp_t gfp_mask) 455 pgoff_t offset, gfp_t gfp_mask)
462{ 456{
463 int error = mem_cgroup_cache_charge(page, current->mm, 457 int error;
458
459 VM_BUG_ON(!PageLocked(page));
460
461 error = mem_cgroup_cache_charge(page, current->mm,
464 gfp_mask & ~__GFP_HIGHMEM); 462 gfp_mask & ~__GFP_HIGHMEM);
465 if (error) 463 if (error)
466 goto out; 464 goto out;
467 465
468 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); 466 error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
469 if (error == 0) { 467 if (error == 0) {
470 write_lock_irq(&mapping->tree_lock); 468 page_cache_get(page);
469 page->mapping = mapping;
470 page->index = offset;
471
472 spin_lock_irq(&mapping->tree_lock);
471 error = radix_tree_insert(&mapping->page_tree, offset, page); 473 error = radix_tree_insert(&mapping->page_tree, offset, page);
472 if (!error) { 474 if (likely(!error)) {
473 page_cache_get(page);
474 SetPageLocked(page);
475 page->mapping = mapping;
476 page->index = offset;
477 mapping->nrpages++; 475 mapping->nrpages++;
478 __inc_zone_page_state(page, NR_FILE_PAGES); 476 __inc_zone_page_state(page, NR_FILE_PAGES);
479 } else 477 } else {
480 mem_cgroup_uncharge_page(page); 478 page->mapping = NULL;
479 mem_cgroup_uncharge_cache_page(page);
480 page_cache_release(page);
481 }
481 482
482 write_unlock_irq(&mapping->tree_lock); 483 spin_unlock_irq(&mapping->tree_lock);
483 radix_tree_preload_end(); 484 radix_tree_preload_end();
484 } else 485 } else
485 mem_cgroup_uncharge_page(page); 486 mem_cgroup_uncharge_cache_page(page);
486out: 487out:
487 return error; 488 return error;
488} 489}
489EXPORT_SYMBOL(add_to_page_cache); 490EXPORT_SYMBOL(add_to_page_cache_locked);
490 491
491int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 492int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
492 pgoff_t offset, gfp_t gfp_mask) 493 pgoff_t offset, gfp_t gfp_mask)
@@ -557,14 +558,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
557 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. 558 * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
558 * 559 *
559 * The first mb is necessary to safely close the critical section opened by the 560 * The first mb is necessary to safely close the critical section opened by the
560 * TestSetPageLocked(), the second mb is necessary to enforce ordering between 561 * test_and_set_bit() to lock the page; the second mb is necessary to enforce
561 * the clear_bit and the read of the waitqueue (to avoid SMP races with a 562 * ordering between the clear_bit and the read of the waitqueue (to avoid SMP
562 * parallel wait_on_page_locked()). 563 * races with a parallel wait_on_page_locked()).
563 */ 564 */
564void unlock_page(struct page *page) 565void unlock_page(struct page *page)
565{ 566{
566 smp_mb__before_clear_bit(); 567 smp_mb__before_clear_bit();
567 if (!TestClearPageLocked(page)) 568 if (!test_and_clear_bit(PG_locked, &page->flags))
568 BUG(); 569 BUG();
569 smp_mb__after_clear_bit(); 570 smp_mb__after_clear_bit();
570 wake_up_page(page, PG_locked); 571 wake_up_page(page, PG_locked);
@@ -636,15 +637,35 @@ void __lock_page_nosync(struct page *page)
636 * Is there a pagecache struct page at the given (mapping, offset) tuple? 637 * Is there a pagecache struct page at the given (mapping, offset) tuple?
637 * If yes, increment its refcount and return it; if no, return NULL. 638 * If yes, increment its refcount and return it; if no, return NULL.
638 */ 639 */
639struct page * find_get_page(struct address_space *mapping, pgoff_t offset) 640struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
640{ 641{
642 void **pagep;
641 struct page *page; 643 struct page *page;
642 644
643 read_lock_irq(&mapping->tree_lock); 645 rcu_read_lock();
644 page = radix_tree_lookup(&mapping->page_tree, offset); 646repeat:
645 if (page) 647 page = NULL;
646 page_cache_get(page); 648 pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
647 read_unlock_irq(&mapping->tree_lock); 649 if (pagep) {
650 page = radix_tree_deref_slot(pagep);
651 if (unlikely(!page || page == RADIX_TREE_RETRY))
652 goto repeat;
653
654 if (!page_cache_get_speculative(page))
655 goto repeat;
656
657 /*
658 * Has the page moved?
659 * This is part of the lockless pagecache protocol. See
660 * include/linux/pagemap.h for details.
661 */
662 if (unlikely(page != *pagep)) {
663 page_cache_release(page);
664 goto repeat;
665 }
666 }
667 rcu_read_unlock();
668
648 return page; 669 return page;
649} 670}
650EXPORT_SYMBOL(find_get_page); 671EXPORT_SYMBOL(find_get_page);
@@ -659,32 +680,22 @@ EXPORT_SYMBOL(find_get_page);
659 * 680 *
660 * Returns zero if the page was not present. find_lock_page() may sleep. 681 * Returns zero if the page was not present. find_lock_page() may sleep.
661 */ 682 */
662struct page *find_lock_page(struct address_space *mapping, 683struct page *find_lock_page(struct address_space *mapping, pgoff_t offset)
663 pgoff_t offset)
664{ 684{
665 struct page *page; 685 struct page *page;
666 686
667repeat: 687repeat:
668 read_lock_irq(&mapping->tree_lock); 688 page = find_get_page(mapping, offset);
669 page = radix_tree_lookup(&mapping->page_tree, offset);
670 if (page) { 689 if (page) {
671 page_cache_get(page); 690 lock_page(page);
672 if (TestSetPageLocked(page)) { 691 /* Has the page been truncated? */
673 read_unlock_irq(&mapping->tree_lock); 692 if (unlikely(page->mapping != mapping)) {
674 __lock_page(page); 693 unlock_page(page);
675 694 page_cache_release(page);
676 /* Has the page been truncated while we slept? */ 695 goto repeat;
677 if (unlikely(page->mapping != mapping)) {
678 unlock_page(page);
679 page_cache_release(page);
680 goto repeat;
681 }
682 VM_BUG_ON(page->index != offset);
683 goto out;
684 } 696 }
697 VM_BUG_ON(page->index != offset);
685 } 698 }
686 read_unlock_irq(&mapping->tree_lock);
687out:
688 return page; 699 return page;
689} 700}
690EXPORT_SYMBOL(find_lock_page); 701EXPORT_SYMBOL(find_lock_page);
@@ -750,13 +761,39 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
750{ 761{
751 unsigned int i; 762 unsigned int i;
752 unsigned int ret; 763 unsigned int ret;
764 unsigned int nr_found;
765
766 rcu_read_lock();
767restart:
768 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
769 (void ***)pages, start, nr_pages);
770 ret = 0;
771 for (i = 0; i < nr_found; i++) {
772 struct page *page;
773repeat:
774 page = radix_tree_deref_slot((void **)pages[i]);
775 if (unlikely(!page))
776 continue;
777 /*
778 * this can only trigger if nr_found == 1, making livelock
779 * a non issue.
780 */
781 if (unlikely(page == RADIX_TREE_RETRY))
782 goto restart;
753 783
754 read_lock_irq(&mapping->tree_lock); 784 if (!page_cache_get_speculative(page))
755 ret = radix_tree_gang_lookup(&mapping->page_tree, 785 goto repeat;
756 (void **)pages, start, nr_pages); 786
757 for (i = 0; i < ret; i++) 787 /* Has the page moved? */
758 page_cache_get(pages[i]); 788 if (unlikely(page != *((void **)pages[i]))) {
759 read_unlock_irq(&mapping->tree_lock); 789 page_cache_release(page);
790 goto repeat;
791 }
792
793 pages[ret] = page;
794 ret++;
795 }
796 rcu_read_unlock();
760 return ret; 797 return ret;
761} 798}
762 799
@@ -777,19 +814,44 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
777{ 814{
778 unsigned int i; 815 unsigned int i;
779 unsigned int ret; 816 unsigned int ret;
817 unsigned int nr_found;
818
819 rcu_read_lock();
820restart:
821 nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
822 (void ***)pages, index, nr_pages);
823 ret = 0;
824 for (i = 0; i < nr_found; i++) {
825 struct page *page;
826repeat:
827 page = radix_tree_deref_slot((void **)pages[i]);
828 if (unlikely(!page))
829 continue;
830 /*
831 * this can only trigger if nr_found == 1, making livelock
832 * a non issue.
833 */
834 if (unlikely(page == RADIX_TREE_RETRY))
835 goto restart;
780 836
781 read_lock_irq(&mapping->tree_lock); 837 if (page->mapping == NULL || page->index != index)
782 ret = radix_tree_gang_lookup(&mapping->page_tree,
783 (void **)pages, index, nr_pages);
784 for (i = 0; i < ret; i++) {
785 if (pages[i]->mapping == NULL || pages[i]->index != index)
786 break; 838 break;
787 839
788 page_cache_get(pages[i]); 840 if (!page_cache_get_speculative(page))
841 goto repeat;
842
843 /* Has the page moved? */
844 if (unlikely(page != *((void **)pages[i]))) {
845 page_cache_release(page);
846 goto repeat;
847 }
848
849 pages[ret] = page;
850 ret++;
789 index++; 851 index++;
790 } 852 }
791 read_unlock_irq(&mapping->tree_lock); 853 rcu_read_unlock();
792 return i; 854 return ret;
793} 855}
794EXPORT_SYMBOL(find_get_pages_contig); 856EXPORT_SYMBOL(find_get_pages_contig);
795 857
@@ -809,15 +871,43 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
809{ 871{
810 unsigned int i; 872 unsigned int i;
811 unsigned int ret; 873 unsigned int ret;
874 unsigned int nr_found;
875
876 rcu_read_lock();
877restart:
878 nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree,
879 (void ***)pages, *index, nr_pages, tag);
880 ret = 0;
881 for (i = 0; i < nr_found; i++) {
882 struct page *page;
883repeat:
884 page = radix_tree_deref_slot((void **)pages[i]);
885 if (unlikely(!page))
886 continue;
887 /*
888 * this can only trigger if nr_found == 1, making livelock
889 * a non issue.
890 */
891 if (unlikely(page == RADIX_TREE_RETRY))
892 goto restart;
893
894 if (!page_cache_get_speculative(page))
895 goto repeat;
896
897 /* Has the page moved? */
898 if (unlikely(page != *((void **)pages[i]))) {
899 page_cache_release(page);
900 goto repeat;
901 }
902
903 pages[ret] = page;
904 ret++;
905 }
906 rcu_read_unlock();
812 907
813 read_lock_irq(&mapping->tree_lock);
814 ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
815 (void **)pages, *index, nr_pages, tag);
816 for (i = 0; i < ret; i++)
817 page_cache_get(pages[i]);
818 if (ret) 908 if (ret)
819 *index = pages[ret - 1]->index + 1; 909 *index = pages[ret - 1]->index + 1;
820 read_unlock_irq(&mapping->tree_lock); 910
821 return ret; 911 return ret;
822} 912}
823EXPORT_SYMBOL(find_get_pages_tag); 913EXPORT_SYMBOL(find_get_pages_tag);
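
The three gang-lookup loops above (find_get_pages, find_get_pages_contig, find_get_pages_tag) all switch to the same lockless pattern: walk the radix-tree slots under rcu_read_lock(), restart the walk when a slot reads back as RADIX_TREE_RETRY, take a speculative reference with page_cache_get_speculative(), then re-read the slot to confirm the page has not moved before keeping it. The following is a minimal user-space sketch of that try-get-and-recheck step, using C11 atomics and made-up names (obj, obj_get_speculative, lookup_stable) rather than kernel API:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
        atomic_int refcount;            /* 0 means the object is being freed */
};

/* Take a reference only if the object is still live (page_cache_get_speculative analogue). */
static bool obj_get_speculative(struct obj *o)
{
        int c = atomic_load(&o->refcount);

        while (c > 0)
                if (atomic_compare_exchange_weak(&o->refcount, &c, c + 1))
                        return true;
        return false;                   /* raced with the final put */
}

/* Look up through a published slot and return a referenced, stable object. */
static struct obj *lookup_stable(struct obj *_Atomic *slot)
{
        struct obj *o;

        for (;;) {
                o = atomic_load(slot);
                if (!o)
                        return NULL;
                if (!obj_get_speculative(o))
                        continue;       /* object being freed: retry the lookup */
                if (o == atomic_load(slot))
                        return o;       /* slot unchanged: the reference is valid */
                atomic_fetch_sub(&o->refcount, 1);  /* "page moved": drop and retry */
        }
}

int main(void)
{
        static struct obj page = { .refcount = 1 };
        struct obj *_Atomic slot = &page;
        struct obj *o = lookup_stable(&slot);

        printf("got object, refcount now %d\n", o ? atomic_load(&o->refcount) : 0);
        return 0;
}

If the recheck fails, the reference is dropped and the lookup retried, which is exactly what the "Has the page moved?" branches above do.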
@@ -841,7 +931,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
841 struct page *page = find_get_page(mapping, index); 931 struct page *page = find_get_page(mapping, index);
842 932
843 if (page) { 933 if (page) {
844 if (!TestSetPageLocked(page)) 934 if (trylock_page(page))
845 return page; 935 return page;
846 page_cache_release(page); 936 page_cache_release(page);
847 return NULL; 937 return NULL;
@@ -933,8 +1023,17 @@ find_page:
933 ra, filp, page, 1023 ra, filp, page,
934 index, last_index - index); 1024 index, last_index - index);
935 } 1025 }
936 if (!PageUptodate(page)) 1026 if (!PageUptodate(page)) {
937 goto page_not_up_to_date; 1027 if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
1028 !mapping->a_ops->is_partially_uptodate)
1029 goto page_not_up_to_date;
1030 if (!trylock_page(page))
1031 goto page_not_up_to_date;
1032 if (!mapping->a_ops->is_partially_uptodate(page,
1033 desc, offset))
1034 goto page_not_up_to_date_locked;
1035 unlock_page(page);
1036 }
938page_ok: 1037page_ok:
939 /* 1038 /*
940 * i_size must be checked after we know the page is Uptodate. 1039 * i_size must be checked after we know the page is Uptodate.
@@ -1004,6 +1103,7 @@ page_not_up_to_date:
1004 if (lock_page_killable(page)) 1103 if (lock_page_killable(page))
1005 goto readpage_eio; 1104 goto readpage_eio;
1006 1105
1106page_not_up_to_date_locked:
1007 /* Did it get truncated before we got the lock? */ 1107 /* Did it get truncated before we got the lock? */
1008 if (!page->mapping) { 1108 if (!page->mapping) {
1009 unlock_page(page); 1109 unlock_page(page);
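
The two hunks above change do_generic_file_read() so that a page which is not fully uptodate can still satisfy the read: only when the filesystem provides ->is_partially_uptodate and its block size is smaller than the page size (i_blkbits < PAGE_CACHE_SHIFT), and only if trylock_page() succeeds without sleeping, is the per-block check consulted; otherwise the code falls back to the existing slow path, now also reachable via the new page_not_up_to_date_locked label. A compact user-space restatement of that decision, with hypothetical parameter names standing in for the inode and address_space fields:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12   /* assumption: 4 KB pages */

/* Can this read be served from a page that is not fully up to date? */
static bool try_partial_uptodate(unsigned int blkbits,
                                 bool has_is_partially_uptodate,
                                 bool got_page_lock,
                                 bool blocks_covering_range_uptodate)
{
        if (blkbits == PAGE_SHIFT || !has_is_partially_uptodate)
                return false;   /* whole-page uptodate is the only option */
        if (!got_page_lock)
                return false;   /* never sleep on the fast path */
        return blocks_covering_range_uptodate;  /* the fs's per-block verdict */
}

int main(void)
{
        /* ext-style fs: 1 KB blocks (blkbits = 10), per-block check available. */
        printf("%d\n", try_partial_uptodate(10, true, true, true));  /* 1 */
        printf("%d\n", try_partial_uptodate(12, true, true, true));  /* 0 */
        return 0;
}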
@@ -1200,42 +1300,41 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1200 1300
1201 mapping = filp->f_mapping; 1301 mapping = filp->f_mapping;
1202 inode = mapping->host; 1302 inode = mapping->host;
1203 retval = 0;
1204 if (!count) 1303 if (!count)
1205 goto out; /* skip atime */ 1304 goto out; /* skip atime */
1206 size = i_size_read(inode); 1305 size = i_size_read(inode);
1207 if (pos < size) { 1306 if (pos < size) {
1208 retval = generic_file_direct_IO(READ, iocb, 1307 retval = filemap_write_and_wait(mapping);
1209 iov, pos, nr_segs); 1308 if (!retval) {
1309 retval = mapping->a_ops->direct_IO(READ, iocb,
1310 iov, pos, nr_segs);
1311 }
1210 if (retval > 0) 1312 if (retval > 0)
1211 *ppos = pos + retval; 1313 *ppos = pos + retval;
1212 } 1314 if (retval) {
1213 if (likely(retval != 0)) { 1315 file_accessed(filp);
1214 file_accessed(filp); 1316 goto out;
1215 goto out; 1317 }
1216 } 1318 }
1217 } 1319 }
1218 1320
1219 retval = 0; 1321 for (seg = 0; seg < nr_segs; seg++) {
1220 if (count) { 1322 read_descriptor_t desc;
1221 for (seg = 0; seg < nr_segs; seg++) {
1222 read_descriptor_t desc;
1223 1323
1224 desc.written = 0; 1324 desc.written = 0;
1225 desc.arg.buf = iov[seg].iov_base; 1325 desc.arg.buf = iov[seg].iov_base;
1226 desc.count = iov[seg].iov_len; 1326 desc.count = iov[seg].iov_len;
1227 if (desc.count == 0) 1327 if (desc.count == 0)
1228 continue; 1328 continue;
1229 desc.error = 0; 1329 desc.error = 0;
1230 do_generic_file_read(filp,ppos,&desc,file_read_actor); 1330 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1231 retval += desc.written; 1331 retval += desc.written;
1232 if (desc.error) { 1332 if (desc.error) {
1233 retval = retval ?: desc.error; 1333 retval = retval ?: desc.error;
1234 break; 1334 break;
1235 }
1236 if (desc.count > 0)
1237 break;
1238 } 1335 }
1336 if (desc.count > 0)
1337 break;
1239 } 1338 }
1240out: 1339out:
1241 return retval; 1340 return retval;
@@ -1669,8 +1768,9 @@ static int __remove_suid(struct dentry *dentry, int kill)
1669 return notify_change(dentry, &newattrs); 1768 return notify_change(dentry, &newattrs);
1670} 1769}
1671 1770
1672int remove_suid(struct dentry *dentry) 1771int file_remove_suid(struct file *file)
1673{ 1772{
1773 struct dentry *dentry = file->f_path.dentry;
1674 int killsuid = should_remove_suid(dentry); 1774 int killsuid = should_remove_suid(dentry);
1675 int killpriv = security_inode_need_killpriv(dentry); 1775 int killpriv = security_inode_need_killpriv(dentry);
1676 int error = 0; 1776 int error = 0;
@@ -1684,7 +1784,7 @@ int remove_suid(struct dentry *dentry)
1684 1784
1685 return error; 1785 return error;
1686} 1786}
1687EXPORT_SYMBOL(remove_suid); 1787EXPORT_SYMBOL(file_remove_suid);
1688 1788
1689static size_t __iovec_copy_from_user_inatomic(char *vaddr, 1789static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1690 const struct iovec *iov, size_t base, size_t bytes) 1790 const struct iovec *iov, size_t base, size_t bytes)
@@ -1779,7 +1879,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
1779 * The !iov->iov_len check ensures we skip over unlikely 1879 * The !iov->iov_len check ensures we skip over unlikely
 1780 * zero-length segments (without overrunning the iovec). 1880 * zero-length segments (without overrunning the iovec).
1781 */ 1881 */
1782 while (bytes || unlikely(!iov->iov_len && i->count)) { 1882 while (bytes || unlikely(i->count && !iov->iov_len)) {
1783 int copy; 1883 int copy;
1784 1884
1785 copy = min(bytes, iov->iov_len - base); 1885 copy = min(bytes, iov->iov_len - base);
@@ -2004,11 +2104,55 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2004 struct address_space *mapping = file->f_mapping; 2104 struct address_space *mapping = file->f_mapping;
2005 struct inode *inode = mapping->host; 2105 struct inode *inode = mapping->host;
2006 ssize_t written; 2106 ssize_t written;
2107 size_t write_len;
2108 pgoff_t end;
2007 2109
2008 if (count != ocount) 2110 if (count != ocount)
2009 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); 2111 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2010 2112
2011 written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); 2113 /*
2114 * Unmap all mmappings of the file up-front.
2115 *
2116 * This will cause any pte dirty bits to be propagated into the
2117 * pageframes for the subsequent filemap_write_and_wait().
2118 */
2119 write_len = iov_length(iov, *nr_segs);
2120 end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
2121 if (mapping_mapped(mapping))
2122 unmap_mapping_range(mapping, pos, write_len, 0);
2123
2124 written = filemap_write_and_wait(mapping);
2125 if (written)
2126 goto out;
2127
2128 /*
2129 * After a write we want buffered reads to be sure to go to disk to get
 2130 * the new data. We invalidate clean cached pages from the region we're
2131 * about to write. We do this *before* the write so that we can return
2132 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
2133 */
2134 if (mapping->nrpages) {
2135 written = invalidate_inode_pages2_range(mapping,
2136 pos >> PAGE_CACHE_SHIFT, end);
2137 if (written)
2138 goto out;
2139 }
2140
2141 written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
2142
2143 /*
2144 * Finally, try again to invalidate clean pages which might have been
2145 * cached by non-direct readahead, or faulted in by get_user_pages()
2146 * if the source of the write was an mmap'ed region of the file
2147 * we're writing. Either one is a pretty crazy thing to do,
2148 * so we don't support it 100%. If this invalidation
2149 * fails, tough, the write still worked...
2150 */
2151 if (mapping->nrpages) {
2152 invalidate_inode_pages2_range(mapping,
2153 pos >> PAGE_CACHE_SHIFT, end);
2154 }
2155
2012 if (written > 0) { 2156 if (written > 0) {
2013 loff_t end = pos + written; 2157 loff_t end = pos + written;
2014 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2158 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
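
With generic_file_direct_IO() folded away (its removal appears further down), generic_file_direct_write() now performs the whole sequence itself: unmap the file's mappings so pte dirty bits reach the pageframes, flush with filemap_write_and_wait(), invalidate the target range before the write so a failure cannot clobber -EIOCBQUEUED, call ->direct_IO(), then invalidate once more on a best-effort basis. The stub model below only illustrates that ordering and its error handling; every helper is a placeholder for the kernel call named in its comment, not a real implementation:

#include <stdio.h>

/* Stand-ins for the kernel calls; each returns 0 on success. */
static int unmap_file_range(void)  { return 0; }   /* unmap_mapping_range()           */
static int write_and_wait(void)    { return 0; }   /* filemap_write_and_wait()        */
static int invalidate_range(void)  { return 0; }   /* invalidate_inode_pages2_range() */
static long do_direct_io(long len) { return len; } /* ->direct_IO(WRITE, ...)         */

static long direct_write(long len, int mapped, int cached)
{
        long written;

        if (mapped)
                unmap_file_range();     /* push pte dirty bits into the pageframes */

        written = write_and_wait();     /* flush dirty pagecache to disk first */
        if (written)
                return written;

        if (cached) {
                written = invalidate_range();   /* drop clean pages *before* writing */
                if (written)
                        return written; /* so a later -EIO can't clobber -EIOCBQUEUED */
        }

        written = do_direct_io(len);

        if (cached)
                invalidate_range();     /* best effort; a failure here is tolerated */

        return written;
}

int main(void)
{
        printf("wrote %ld bytes\n", direct_write(4096, 1, 1));
        return 0;
}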
@@ -2024,6 +2168,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2024 * i_mutex is held, which protects generic_osync_inode() from 2168 * i_mutex is held, which protects generic_osync_inode() from
2025 * livelocking. AIO O_DIRECT ops attempt to sync metadata here. 2169 * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
2026 */ 2170 */
2171out:
2027 if ((written >= 0 || written == -EIOCBQUEUED) && 2172 if ((written >= 0 || written == -EIOCBQUEUED) &&
2028 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2173 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2029 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); 2174 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
@@ -2395,7 +2540,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2395 if (count == 0) 2540 if (count == 0)
2396 goto out; 2541 goto out;
2397 2542
2398 err = remove_suid(file->f_path.dentry); 2543 err = file_remove_suid(file);
2399 if (err) 2544 if (err)
2400 goto out; 2545 goto out;
2401 2546
@@ -2511,66 +2656,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2511} 2656}
2512EXPORT_SYMBOL(generic_file_aio_write); 2657EXPORT_SYMBOL(generic_file_aio_write);
2513 2658
2514/*
2515 * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
2516 * went wrong during pagecache shootdown.
2517 */
2518static ssize_t
2519generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2520 loff_t offset, unsigned long nr_segs)
2521{
2522 struct file *file = iocb->ki_filp;
2523 struct address_space *mapping = file->f_mapping;
2524 ssize_t retval;
2525 size_t write_len;
2526 pgoff_t end = 0; /* silence gcc */
2527
2528 /*
2529 * If it's a write, unmap all mmappings of the file up-front. This
2530 * will cause any pte dirty bits to be propagated into the pageframes
2531 * for the subsequent filemap_write_and_wait().
2532 */
2533 if (rw == WRITE) {
2534 write_len = iov_length(iov, nr_segs);
2535 end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
2536 if (mapping_mapped(mapping))
2537 unmap_mapping_range(mapping, offset, write_len, 0);
2538 }
2539
2540 retval = filemap_write_and_wait(mapping);
2541 if (retval)
2542 goto out;
2543
2544 /*
2545 * After a write we want buffered reads to be sure to go to disk to get
2546 * the new data. We invalidate clean cached page from the region we're
2547 * about to write. We do this *before* the write so that we can return
2548 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
2549 */
2550 if (rw == WRITE && mapping->nrpages) {
2551 retval = invalidate_inode_pages2_range(mapping,
2552 offset >> PAGE_CACHE_SHIFT, end);
2553 if (retval)
2554 goto out;
2555 }
2556
2557 retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
2558
2559 /*
2560 * Finally, try again to invalidate clean pages which might have been
2561 * cached by non-direct readahead, or faulted in by get_user_pages()
2562 * if the source of the write was an mmap'ed region of the file
2563 * we're writing. Either one is a pretty crazy thing to do,
2564 * so we don't support it 100%. If this invalidation
2565 * fails, tough, the write still worked...
2566 */
2567 if (rw == WRITE && mapping->nrpages) {
2568 invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end);
2569 }
2570out:
2571 return retval;
2572}
2573
2574/** 2659/**
2575 * try_to_release_page() - release old fs-specific metadata on a page 2660 * try_to_release_page() - release old fs-specific metadata on a page
2576 * 2661 *
@@ -2582,9 +2667,8 @@ out:
2582 * Otherwise return zero. 2667 * Otherwise return zero.
2583 * 2668 *
2584 * The @gfp_mask argument specifies whether I/O may be performed to release 2669 * The @gfp_mask argument specifies whether I/O may be performed to release
2585 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT). 2670 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
2586 * 2671 *
2587 * NOTE: @gfp_mask may go away, and this function may become non-blocking.
2588 */ 2672 */
2589int try_to_release_page(struct page *page, gfp_t gfp_mask) 2673int try_to_release_page(struct page *page, gfp_t gfp_mask)
2590{ 2674{
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 3e744abcce9d..380ab402d711 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/uio.h> 14#include <linux/uio.h>
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/mmu_notifier.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17#include <asm/tlbflush.h> 18#include <asm/tlbflush.h>
18#include <asm/io.h> 19#include <asm/io.h>
@@ -188,7 +189,7 @@ __xip_unmap (struct address_space * mapping,
188 if (pte) { 189 if (pte) {
189 /* Nuke the page table entry. */ 190 /* Nuke the page table entry. */
190 flush_cache_page(vma, address, pte_pfn(*pte)); 191 flush_cache_page(vma, address, pte_pfn(*pte));
191 pteval = ptep_clear_flush(vma, address, pte); 192 pteval = ptep_clear_flush_notify(vma, address, pte);
192 page_remove_rmap(page, vma); 193 page_remove_rmap(page, vma);
193 dec_mm_counter(mm, file_rss); 194 dec_mm_counter(mm, file_rss);
194 BUG_ON(pte_dirty(pteval)); 195 BUG_ON(pte_dirty(pteval));
@@ -380,7 +381,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
380 if (count == 0) 381 if (count == 0)
381 goto out_backing; 382 goto out_backing;
382 383
383 ret = remove_suid(filp->f_path.dentry); 384 ret = file_remove_suid(filp);
384 if (ret) 385 if (ret)
385 goto out_backing; 386 goto out_backing;
386 387
diff --git a/mm/fremap.c b/mm/fremap.c
index 07a9c82ce1a3..7881638e4a12 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -15,6 +15,7 @@
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/mmu_notifier.h>
18 19
19#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
20#include <asm/cacheflush.h> 21#include <asm/cacheflush.h>
@@ -214,7 +215,9 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
214 spin_unlock(&mapping->i_mmap_lock); 215 spin_unlock(&mapping->i_mmap_lock);
215 } 216 }
216 217
218 mmu_notifier_invalidate_range_start(mm, start, start + size);
217 err = populate_range(mm, vma, start, size, pgoff); 219 err = populate_range(mm, vma, start, size, pgoff);
220 mmu_notifier_invalidate_range_end(mm, start, start + size);
218 if (!err && !(flags & MAP_NONBLOCK)) { 221 if (!err && !(flags & MAP_NONBLOCK)) {
219 if (unlikely(has_write_lock)) { 222 if (unlikely(has_write_lock)) {
220 downgrade_write(&mm->mmap_sem); 223 downgrade_write(&mm->mmap_sem);
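
The filemap_xip.c and fremap.c hunks above hook existing pte-modifying paths into the mmu_notifier infrastructure introduced by this merge: either through the combined ptep_clear_flush_notify(), or by bracketing a bulk update with mmu_notifier_invalidate_range_start()/_end() so that secondary MMUs (KVM, GRU, XPMEM and the like) can drop their stale translations before the primary ptes change. A user-space caricature of that bracketing pattern, with stub notifier hooks standing in for the registered notifier list:

#include <stdio.h>

/* Stub "secondary MMU" hooks; the kernel walks a list of registered notifiers. */
static void notifier_range_start(unsigned long start, unsigned long end)
{
        printf("secondary MMU: invalidate [%#lx, %#lx)\n", start, end);
}

static void notifier_range_end(unsigned long start, unsigned long end)
{
        printf("secondary MMU: range [%#lx, %#lx) may be re-faulted\n", start, end);
}

/* Placeholder for the primary-MMU work (populate_range() in the hunk above). */
static int rewrite_ptes(unsigned long start, unsigned long size)
{
        printf("primary MMU: rewriting ptes for %#lx bytes at %#lx\n", size, start);
        return 0;
}

int main(void)
{
        unsigned long start = 0x700000000000UL, size = 0x4000;
        int err;

        notifier_range_start(start, start + size);
        err = rewrite_ptes(start, size);
        notifier_range_end(start, start + size);

        return err;
}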
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ab171274ef21..67a71191136e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -9,43 +9,357 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/sysctl.h> 10#include <linux/sysctl.h>
11#include <linux/highmem.h> 11#include <linux/highmem.h>
12#include <linux/mmu_notifier.h>
12#include <linux/nodemask.h> 13#include <linux/nodemask.h>
13#include <linux/pagemap.h> 14#include <linux/pagemap.h>
14#include <linux/mempolicy.h> 15#include <linux/mempolicy.h>
15#include <linux/cpuset.h> 16#include <linux/cpuset.h>
16#include <linux/mutex.h> 17#include <linux/mutex.h>
18#include <linux/bootmem.h>
19#include <linux/sysfs.h>
17 20
18#include <asm/page.h> 21#include <asm/page.h>
19#include <asm/pgtable.h> 22#include <asm/pgtable.h>
23#include <asm/io.h>
20 24
21#include <linux/hugetlb.h> 25#include <linux/hugetlb.h>
22#include "internal.h" 26#include "internal.h"
23 27
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 28const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
26static unsigned long surplus_huge_pages;
27static unsigned long nr_overcommit_huge_pages;
28unsigned long max_huge_pages;
29unsigned long sysctl_overcommit_huge_pages;
30static struct list_head hugepage_freelists[MAX_NUMNODES];
31static unsigned int nr_huge_pages_node[MAX_NUMNODES];
32static unsigned int free_huge_pages_node[MAX_NUMNODES];
33static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
34static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 29static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
35unsigned long hugepages_treat_as_movable; 30unsigned long hugepages_treat_as_movable;
36static int hugetlb_next_nid; 31
32static int max_hstate;
33unsigned int default_hstate_idx;
34struct hstate hstates[HUGE_MAX_HSTATE];
35
36__initdata LIST_HEAD(huge_boot_pages);
37
38/* for command line parsing */
39static struct hstate * __initdata parsed_hstate;
40static unsigned long __initdata default_hstate_max_huge_pages;
41static unsigned long __initdata default_hstate_size;
42
43#define for_each_hstate(h) \
44 for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
37 45
38/* 46/*
39 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 47 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
40 */ 48 */
41static DEFINE_SPINLOCK(hugetlb_lock); 49static DEFINE_SPINLOCK(hugetlb_lock);
42 50
43static void clear_huge_page(struct page *page, unsigned long addr) 51/*
52 * Region tracking -- allows tracking of reservations and instantiated pages
53 * across the pages in a mapping.
54 *
55 * The region data structures are protected by a combination of the mmap_sem
 56 * and the hugetlb_instantiation_mutex. To access or modify a region, the caller
57 * must either hold the mmap_sem for write, or the mmap_sem for read and
58 * the hugetlb_instantiation mutex:
59 *
60 * down_write(&mm->mmap_sem);
61 * or
62 * down_read(&mm->mmap_sem);
63 * mutex_lock(&hugetlb_instantiation_mutex);
64 */
65struct file_region {
66 struct list_head link;
67 long from;
68 long to;
69};
70
71static long region_add(struct list_head *head, long f, long t)
72{
73 struct file_region *rg, *nrg, *trg;
74
75 /* Locate the region we are either in or before. */
76 list_for_each_entry(rg, head, link)
77 if (f <= rg->to)
78 break;
79
80 /* Round our left edge to the current segment if it encloses us. */
81 if (f > rg->from)
82 f = rg->from;
83
84 /* Check for and consume any regions we now overlap with. */
85 nrg = rg;
86 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
87 if (&rg->link == head)
88 break;
89 if (rg->from > t)
90 break;
91
 92 /* If this area reaches higher, then extend our area to
93 * include it completely. If this is not the first area
94 * which we intend to reuse, free it. */
95 if (rg->to > t)
96 t = rg->to;
97 if (rg != nrg) {
98 list_del(&rg->link);
99 kfree(rg);
100 }
101 }
102 nrg->from = f;
103 nrg->to = t;
104 return 0;
105}
106
107static long region_chg(struct list_head *head, long f, long t)
108{
109 struct file_region *rg, *nrg;
110 long chg = 0;
111
112 /* Locate the region we are before or in. */
113 list_for_each_entry(rg, head, link)
114 if (f <= rg->to)
115 break;
116
 117 /* If we are below the current region, then a new region is required.
 118 * Subtle: allocate a new region at the position but make it zero
119 * size such that we can guarantee to record the reservation. */
120 if (&rg->link == head || t < rg->from) {
121 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
122 if (!nrg)
123 return -ENOMEM;
124 nrg->from = f;
125 nrg->to = f;
126 INIT_LIST_HEAD(&nrg->link);
127 list_add(&nrg->link, rg->link.prev);
128
129 return t - f;
130 }
131
132 /* Round our left edge to the current segment if it encloses us. */
133 if (f > rg->from)
134 f = rg->from;
135 chg = t - f;
136
137 /* Check for and consume any regions we now overlap with. */
138 list_for_each_entry(rg, rg->link.prev, link) {
139 if (&rg->link == head)
140 break;
141 if (rg->from > t)
142 return chg;
143
 144 /* We overlap with this area; if it extends further than
145 * us then we must extend ourselves. Account for its
146 * existing reservation. */
147 if (rg->to > t) {
148 chg += rg->to - t;
149 t = rg->to;
150 }
151 chg -= rg->to - rg->from;
152 }
153 return chg;
154}
155
156static long region_truncate(struct list_head *head, long end)
157{
158 struct file_region *rg, *trg;
159 long chg = 0;
160
161 /* Locate the region we are either in or before. */
162 list_for_each_entry(rg, head, link)
163 if (end <= rg->to)
164 break;
165 if (&rg->link == head)
166 return 0;
167
168 /* If we are in the middle of a region then adjust it. */
169 if (end > rg->from) {
170 chg = rg->to - end;
171 rg->to = end;
172 rg = list_entry(rg->link.next, typeof(*rg), link);
173 }
174
175 /* Drop any remaining regions. */
176 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
177 if (&rg->link == head)
178 break;
179 chg += rg->to - rg->from;
180 list_del(&rg->link);
181 kfree(rg);
182 }
183 return chg;
184}
185
186static long region_count(struct list_head *head, long f, long t)
187{
188 struct file_region *rg;
189 long chg = 0;
190
191 /* Locate each segment we overlap with, and count that overlap. */
192 list_for_each_entry(rg, head, link) {
193 int seg_from;
194 int seg_to;
195
196 if (rg->to <= f)
197 continue;
198 if (rg->from >= t)
199 break;
200
201 seg_from = max(rg->from, f);
202 seg_to = min(rg->to, t);
203
204 chg += seg_to - seg_from;
205 }
206
207 return chg;
208}
209
210/*
211 * Convert the address within this vma to the page offset within
212 * the mapping, in pagecache page units; huge pages here.
213 */
214static pgoff_t vma_hugecache_offset(struct hstate *h,
215 struct vm_area_struct *vma, unsigned long address)
216{
217 return ((address - vma->vm_start) >> huge_page_shift(h)) +
218 (vma->vm_pgoff >> huge_page_order(h));
219}
220
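
vma_hugecache_offset() turns a faulting address into the file's page-cache index measured in huge pages: the offset into the VMA is shifted down by the huge page size, and vm_pgoff (kept in base-page units) is shifted down by the huge page order. A quick worked example, assuming 2 MB huge pages on 4 KB base pages (the sizes are an assumption, not fixed by the code above):

#include <stdio.h>

int main(void)
{
        unsigned long huge_shift = 21;                  /* 2 MB huge pages (assumed)      */
        unsigned long huge_order = huge_shift - 12;     /* 4 KB base pages -> order 9     */
        unsigned long vm_start   = 0x40000000UL;
        unsigned long vm_pgoff   = 512;                 /* file offset 2 MB in 4 KB units */
        unsigned long address    = vm_start + (5UL << 20);  /* fault 5 MB into the VMA   */

        unsigned long idx = ((address - vm_start) >> huge_shift) +
                            (vm_pgoff >> huge_order);

        printf("huge-page index in the file: %lu\n", idx);  /* 2 + 1 = 3 */
        return 0;
}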
221/*
222 * Flags for MAP_PRIVATE reservations. These are stored in the bottom
223 * bits of the reservation map pointer, which are always clear due to
224 * alignment.
225 */
226#define HPAGE_RESV_OWNER (1UL << 0)
227#define HPAGE_RESV_UNMAPPED (1UL << 1)
228#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
229
230/*
231 * These helpers are used to track how many pages are reserved for
232 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
 233 * is guaranteed to have its future faults succeed.
234 *
235 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
236 * the reserve counters are updated with the hugetlb_lock held. It is safe
237 * to reset the VMA at fork() time as it is not in use yet and there is no
238 * chance of the global counters getting corrupted as a result of the values.
239 *
240 * The private mapping reservation is represented in a subtly different
241 * manner to a shared mapping. A shared mapping has a region map associated
242 * with the underlying file, this region map represents the backing file
 243 * pages which have ever had a reservation assigned; this persists even
244 * after the page is instantiated. A private mapping has a region map
245 * associated with the original mmap which is attached to all VMAs which
246 * reference it, this region map represents those offsets which have consumed
 247 * reservation, i.e. where pages have been instantiated.
248 */
249static unsigned long get_vma_private_data(struct vm_area_struct *vma)
250{
251 return (unsigned long)vma->vm_private_data;
252}
253
254static void set_vma_private_data(struct vm_area_struct *vma,
255 unsigned long value)
256{
257 vma->vm_private_data = (void *)value;
258}
259
260struct resv_map {
261 struct kref refs;
262 struct list_head regions;
263};
264
265struct resv_map *resv_map_alloc(void)
266{
267 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
268 if (!resv_map)
269 return NULL;
270
271 kref_init(&resv_map->refs);
272 INIT_LIST_HEAD(&resv_map->regions);
273
274 return resv_map;
275}
276
277void resv_map_release(struct kref *ref)
278{
279 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
280
281 /* Clear out any active regions before we release the map. */
282 region_truncate(&resv_map->regions, 0);
283 kfree(resv_map);
284}
285
286static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
287{
288 VM_BUG_ON(!is_vm_hugetlb_page(vma));
289 if (!(vma->vm_flags & VM_SHARED))
290 return (struct resv_map *)(get_vma_private_data(vma) &
291 ~HPAGE_RESV_MASK);
292 return 0;
293}
294
295static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
296{
297 VM_BUG_ON(!is_vm_hugetlb_page(vma));
298 VM_BUG_ON(vma->vm_flags & VM_SHARED);
299
300 set_vma_private_data(vma, (get_vma_private_data(vma) &
301 HPAGE_RESV_MASK) | (unsigned long)map);
302}
303
304static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
305{
306 VM_BUG_ON(!is_vm_hugetlb_page(vma));
307 VM_BUG_ON(vma->vm_flags & VM_SHARED);
308
309 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
310}
311
312static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
313{
314 VM_BUG_ON(!is_vm_hugetlb_page(vma));
315
316 return (get_vma_private_data(vma) & flag) != 0;
317}
318
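
Because a resv_map is kmalloc()ed, its address is aligned far beyond 4 bytes, so the two low bits of vm_private_data can carry HPAGE_RESV_OWNER and HPAGE_RESV_UNMAPPED while the remaining bits still hold the pointer; that is all the get/set_vma_private_data(), vma_resv_map() and set_vma_resv_flags() helpers above do. A small stand-alone demonstration of the same pointer-plus-flags packing:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define RESV_OWNER      0x1UL
#define RESV_UNMAPPED   0x2UL
#define RESV_MASK       (RESV_OWNER | RESV_UNMAPPED)

int main(void)
{
        void *map = malloc(64);         /* stands in for a struct resv_map */
        uintptr_t priv;

        assert(((uintptr_t)map & RESV_MASK) == 0);      /* allocator alignment frees 2 bits */

        priv = ((uintptr_t)map & ~RESV_MASK) | RESV_OWNER;  /* set_vma_resv_map + flag */

        printf("owner flag set: %s\n", (priv & RESV_OWNER) ? "yes" : "no");
        printf("map pointer:    %p\n", (void *)(priv & ~RESV_MASK));

        free((void *)(priv & ~RESV_MASK));
        return 0;
}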
319/* Decrement the reserved pages in the hugepage pool by one */
320static void decrement_hugepage_resv_vma(struct hstate *h,
321 struct vm_area_struct *vma)
322{
323 if (vma->vm_flags & VM_NORESERVE)
324 return;
325
326 if (vma->vm_flags & VM_SHARED) {
327 /* Shared mappings always use reserves */
328 h->resv_huge_pages--;
329 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
330 /*
331 * Only the process that called mmap() has reserves for
332 * private mappings.
333 */
334 h->resv_huge_pages--;
335 }
336}
337
338/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
339void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
340{
341 VM_BUG_ON(!is_vm_hugetlb_page(vma));
342 if (!(vma->vm_flags & VM_SHARED))
343 vma->vm_private_data = (void *)0;
344}
345
346/* Returns true if the VMA has associated reserve pages */
347static int vma_has_reserves(struct vm_area_struct *vma)
348{
349 if (vma->vm_flags & VM_SHARED)
350 return 1;
351 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
352 return 1;
353 return 0;
354}
355
356static void clear_huge_page(struct page *page,
357 unsigned long addr, unsigned long sz)
44{ 358{
45 int i; 359 int i;
46 360
47 might_sleep(); 361 might_sleep();
48 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { 362 for (i = 0; i < sz/PAGE_SIZE; i++) {
49 cond_resched(); 363 cond_resched();
50 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 364 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
51 } 365 }
@@ -55,42 +369,44 @@ static void copy_huge_page(struct page *dst, struct page *src,
55 unsigned long addr, struct vm_area_struct *vma) 369 unsigned long addr, struct vm_area_struct *vma)
56{ 370{
57 int i; 371 int i;
372 struct hstate *h = hstate_vma(vma);
58 373
59 might_sleep(); 374 might_sleep();
60 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { 375 for (i = 0; i < pages_per_huge_page(h); i++) {
61 cond_resched(); 376 cond_resched();
62 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 377 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
63 } 378 }
64} 379}
65 380
66static void enqueue_huge_page(struct page *page) 381static void enqueue_huge_page(struct hstate *h, struct page *page)
67{ 382{
68 int nid = page_to_nid(page); 383 int nid = page_to_nid(page);
69 list_add(&page->lru, &hugepage_freelists[nid]); 384 list_add(&page->lru, &h->hugepage_freelists[nid]);
70 free_huge_pages++; 385 h->free_huge_pages++;
71 free_huge_pages_node[nid]++; 386 h->free_huge_pages_node[nid]++;
72} 387}
73 388
74static struct page *dequeue_huge_page(void) 389static struct page *dequeue_huge_page(struct hstate *h)
75{ 390{
76 int nid; 391 int nid;
77 struct page *page = NULL; 392 struct page *page = NULL;
78 393
79 for (nid = 0; nid < MAX_NUMNODES; ++nid) { 394 for (nid = 0; nid < MAX_NUMNODES; ++nid) {
80 if (!list_empty(&hugepage_freelists[nid])) { 395 if (!list_empty(&h->hugepage_freelists[nid])) {
81 page = list_entry(hugepage_freelists[nid].next, 396 page = list_entry(h->hugepage_freelists[nid].next,
82 struct page, lru); 397 struct page, lru);
83 list_del(&page->lru); 398 list_del(&page->lru);
84 free_huge_pages--; 399 h->free_huge_pages--;
85 free_huge_pages_node[nid]--; 400 h->free_huge_pages_node[nid]--;
86 break; 401 break;
87 } 402 }
88 } 403 }
89 return page; 404 return page;
90} 405}
91 406
92static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, 407static struct page *dequeue_huge_page_vma(struct hstate *h,
93 unsigned long address) 408 struct vm_area_struct *vma,
409 unsigned long address, int avoid_reserve)
94{ 410{
95 int nid; 411 int nid;
96 struct page *page = NULL; 412 struct page *page = NULL;
@@ -101,18 +417,33 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
101 struct zone *zone; 417 struct zone *zone;
102 struct zoneref *z; 418 struct zoneref *z;
103 419
420 /*
 421 * A child process with MAP_PRIVATE mappings created by its parent
 422 * has no page reserves. This check ensures that reservations are
 423 * not "stolen". The child may still get SIGKILLed.
424 */
425 if (!vma_has_reserves(vma) &&
426 h->free_huge_pages - h->resv_huge_pages == 0)
427 return NULL;
428
429 /* If reserves cannot be used, ensure enough pages are in the pool */
430 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
431 return NULL;
432
104 for_each_zone_zonelist_nodemask(zone, z, zonelist, 433 for_each_zone_zonelist_nodemask(zone, z, zonelist,
105 MAX_NR_ZONES - 1, nodemask) { 434 MAX_NR_ZONES - 1, nodemask) {
106 nid = zone_to_nid(zone); 435 nid = zone_to_nid(zone);
107 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && 436 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
108 !list_empty(&hugepage_freelists[nid])) { 437 !list_empty(&h->hugepage_freelists[nid])) {
109 page = list_entry(hugepage_freelists[nid].next, 438 page = list_entry(h->hugepage_freelists[nid].next,
110 struct page, lru); 439 struct page, lru);
111 list_del(&page->lru); 440 list_del(&page->lru);
112 free_huge_pages--; 441 h->free_huge_pages--;
113 free_huge_pages_node[nid]--; 442 h->free_huge_pages_node[nid]--;
114 if (vma && vma->vm_flags & VM_MAYSHARE) 443
115 resv_huge_pages--; 444 if (!avoid_reserve)
445 decrement_hugepage_resv_vma(h, vma);
446
116 break; 447 break;
117 } 448 }
118 } 449 }
@@ -120,12 +451,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
120 return page; 451 return page;
121} 452}
122 453
123static void update_and_free_page(struct page *page) 454static void update_and_free_page(struct hstate *h, struct page *page)
124{ 455{
125 int i; 456 int i;
126 nr_huge_pages--; 457
127 nr_huge_pages_node[page_to_nid(page)]--; 458 h->nr_huge_pages--;
128 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { 459 h->nr_huge_pages_node[page_to_nid(page)]--;
460 for (i = 0; i < pages_per_huge_page(h); i++) {
129 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 461 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
130 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 462 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
131 1 << PG_private | 1<< PG_writeback); 463 1 << PG_private | 1<< PG_writeback);
@@ -133,11 +465,27 @@ static void update_and_free_page(struct page *page)
133 set_compound_page_dtor(page, NULL); 465 set_compound_page_dtor(page, NULL);
134 set_page_refcounted(page); 466 set_page_refcounted(page);
135 arch_release_hugepage(page); 467 arch_release_hugepage(page);
136 __free_pages(page, HUGETLB_PAGE_ORDER); 468 __free_pages(page, huge_page_order(h));
469}
470
471struct hstate *size_to_hstate(unsigned long size)
472{
473 struct hstate *h;
474
475 for_each_hstate(h) {
476 if (huge_page_size(h) == size)
477 return h;
478 }
479 return NULL;
137} 480}
138 481
139static void free_huge_page(struct page *page) 482static void free_huge_page(struct page *page)
140{ 483{
484 /*
485 * Can't pass hstate in here because it is called from the
486 * compound page destructor.
487 */
488 struct hstate *h = page_hstate(page);
141 int nid = page_to_nid(page); 489 int nid = page_to_nid(page);
142 struct address_space *mapping; 490 struct address_space *mapping;
143 491
@@ -147,12 +495,12 @@ static void free_huge_page(struct page *page)
147 INIT_LIST_HEAD(&page->lru); 495 INIT_LIST_HEAD(&page->lru);
148 496
149 spin_lock(&hugetlb_lock); 497 spin_lock(&hugetlb_lock);
150 if (surplus_huge_pages_node[nid]) { 498 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
151 update_and_free_page(page); 499 update_and_free_page(h, page);
152 surplus_huge_pages--; 500 h->surplus_huge_pages--;
153 surplus_huge_pages_node[nid]--; 501 h->surplus_huge_pages_node[nid]--;
154 } else { 502 } else {
155 enqueue_huge_page(page); 503 enqueue_huge_page(h, page);
156 } 504 }
157 spin_unlock(&hugetlb_lock); 505 spin_unlock(&hugetlb_lock);
158 if (mapping) 506 if (mapping)
@@ -164,7 +512,7 @@ static void free_huge_page(struct page *page)
164 * balanced by operating on them in a round-robin fashion. 512 * balanced by operating on them in a round-robin fashion.
165 * Returns 1 if an adjustment was made. 513 * Returns 1 if an adjustment was made.
166 */ 514 */
167static int adjust_pool_surplus(int delta) 515static int adjust_pool_surplus(struct hstate *h, int delta)
168{ 516{
169 static int prev_nid; 517 static int prev_nid;
170 int nid = prev_nid; 518 int nid = prev_nid;
@@ -177,15 +525,15 @@ static int adjust_pool_surplus(int delta)
177 nid = first_node(node_online_map); 525 nid = first_node(node_online_map);
178 526
179 /* To shrink on this node, there must be a surplus page */ 527 /* To shrink on this node, there must be a surplus page */
180 if (delta < 0 && !surplus_huge_pages_node[nid]) 528 if (delta < 0 && !h->surplus_huge_pages_node[nid])
181 continue; 529 continue;
182 /* Surplus cannot exceed the total number of pages */ 530 /* Surplus cannot exceed the total number of pages */
183 if (delta > 0 && surplus_huge_pages_node[nid] >= 531 if (delta > 0 && h->surplus_huge_pages_node[nid] >=
184 nr_huge_pages_node[nid]) 532 h->nr_huge_pages_node[nid])
185 continue; 533 continue;
186 534
187 surplus_huge_pages += delta; 535 h->surplus_huge_pages += delta;
188 surplus_huge_pages_node[nid] += delta; 536 h->surplus_huge_pages_node[nid] += delta;
189 ret = 1; 537 ret = 1;
190 break; 538 break;
191 } while (nid != prev_nid); 539 } while (nid != prev_nid);
@@ -194,59 +542,74 @@ static int adjust_pool_surplus(int delta)
194 return ret; 542 return ret;
195} 543}
196 544
197static struct page *alloc_fresh_huge_page_node(int nid) 545static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
546{
547 set_compound_page_dtor(page, free_huge_page);
548 spin_lock(&hugetlb_lock);
549 h->nr_huge_pages++;
550 h->nr_huge_pages_node[nid]++;
551 spin_unlock(&hugetlb_lock);
552 put_page(page); /* free it into the hugepage allocator */
553}
554
555static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
198{ 556{
199 struct page *page; 557 struct page *page;
200 558
559 if (h->order >= MAX_ORDER)
560 return NULL;
561
201 page = alloc_pages_node(nid, 562 page = alloc_pages_node(nid,
202 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 563 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
203 __GFP_REPEAT|__GFP_NOWARN, 564 __GFP_REPEAT|__GFP_NOWARN,
204 HUGETLB_PAGE_ORDER); 565 huge_page_order(h));
205 if (page) { 566 if (page) {
206 if (arch_prepare_hugepage(page)) { 567 if (arch_prepare_hugepage(page)) {
207 __free_pages(page, HUGETLB_PAGE_ORDER); 568 __free_pages(page, huge_page_order(h));
208 return NULL; 569 return NULL;
209 } 570 }
210 set_compound_page_dtor(page, free_huge_page); 571 prep_new_huge_page(h, page, nid);
211 spin_lock(&hugetlb_lock);
212 nr_huge_pages++;
213 nr_huge_pages_node[nid]++;
214 spin_unlock(&hugetlb_lock);
215 put_page(page); /* free it into the hugepage allocator */
216 } 572 }
217 573
218 return page; 574 return page;
219} 575}
220 576
221static int alloc_fresh_huge_page(void) 577/*
578 * Use a helper variable to find the next node and then
579 * copy it back to hugetlb_next_nid afterwards:
580 * otherwise there's a window in which a racer might
581 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
582 * But we don't need to use a spin_lock here: it really
583 * doesn't matter if occasionally a racer chooses the
584 * same nid as we do. Move nid forward in the mask even
585 * if we just successfully allocated a hugepage so that
586 * the next caller gets hugepages on the next node.
587 */
588static int hstate_next_node(struct hstate *h)
589{
590 int next_nid;
591 next_nid = next_node(h->hugetlb_next_nid, node_online_map);
592 if (next_nid == MAX_NUMNODES)
593 next_nid = first_node(node_online_map);
594 h->hugetlb_next_nid = next_nid;
595 return next_nid;
596}
597
598static int alloc_fresh_huge_page(struct hstate *h)
222{ 599{
223 struct page *page; 600 struct page *page;
224 int start_nid; 601 int start_nid;
225 int next_nid; 602 int next_nid;
226 int ret = 0; 603 int ret = 0;
227 604
228 start_nid = hugetlb_next_nid; 605 start_nid = h->hugetlb_next_nid;
229 606
230 do { 607 do {
231 page = alloc_fresh_huge_page_node(hugetlb_next_nid); 608 page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
232 if (page) 609 if (page)
233 ret = 1; 610 ret = 1;
234 /* 611 next_nid = hstate_next_node(h);
235 * Use a helper variable to find the next node and then 612 } while (!page && h->hugetlb_next_nid != start_nid);
236 * copy it back to hugetlb_next_nid afterwards:
237 * otherwise there's a window in which a racer might
238 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
239 * But we don't need to use a spin_lock here: it really
240 * doesn't matter if occasionally a racer chooses the
241 * same nid as we do. Move nid forward in the mask even
242 * if we just successfully allocated a hugepage so that
243 * the next caller gets hugepages on the next node.
244 */
245 next_nid = next_node(hugetlb_next_nid, node_online_map);
246 if (next_nid == MAX_NUMNODES)
247 next_nid = first_node(node_online_map);
248 hugetlb_next_nid = next_nid;
249 } while (!page && hugetlb_next_nid != start_nid);
250 613
251 if (ret) 614 if (ret)
252 count_vm_event(HTLB_BUDDY_PGALLOC); 615 count_vm_event(HTLB_BUDDY_PGALLOC);
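
The comment block above, now attached to the factored-out hstate_next_node(), describes a deliberately lock-free round robin: the per-hstate cursor always advances to the next online node, wrapping at the end, whether or not the allocation succeeded, so successive callers spread across nodes and a rare race merely makes two callers pick the same node. Reduced to its essentials (four online, consecutively numbered nodes are assumed):

#include <stdio.h>

static int next_node_rr(int cur, int nr_online)
{
        return (cur + 1) % nr_online;   /* wrap like next_node() + first_node() */
}

int main(void)
{
        int nid = 0;

        for (int i = 0; i < 6; i++) {
                printf("try huge page allocation on node %d\n", nid);
                nid = next_node_rr(nid, 4);     /* advance whether or not it worked */
        }
        return 0;
}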
@@ -256,12 +619,15 @@ static int alloc_fresh_huge_page(void)
256 return ret; 619 return ret;
257} 620}
258 621
259static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, 622static struct page *alloc_buddy_huge_page(struct hstate *h,
260 unsigned long address) 623 struct vm_area_struct *vma, unsigned long address)
261{ 624{
262 struct page *page; 625 struct page *page;
263 unsigned int nid; 626 unsigned int nid;
264 627
628 if (h->order >= MAX_ORDER)
629 return NULL;
630
265 /* 631 /*
266 * Assume we will successfully allocate the surplus page to 632 * Assume we will successfully allocate the surplus page to
267 * prevent racing processes from causing the surplus to exceed 633 * prevent racing processes from causing the surplus to exceed
@@ -286,18 +652,23 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
286 * per-node value is checked there. 652 * per-node value is checked there.
287 */ 653 */
288 spin_lock(&hugetlb_lock); 654 spin_lock(&hugetlb_lock);
289 if (surplus_huge_pages >= nr_overcommit_huge_pages) { 655 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
290 spin_unlock(&hugetlb_lock); 656 spin_unlock(&hugetlb_lock);
291 return NULL; 657 return NULL;
292 } else { 658 } else {
293 nr_huge_pages++; 659 h->nr_huge_pages++;
294 surplus_huge_pages++; 660 h->surplus_huge_pages++;
295 } 661 }
296 spin_unlock(&hugetlb_lock); 662 spin_unlock(&hugetlb_lock);
297 663
298 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 664 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
299 __GFP_REPEAT|__GFP_NOWARN, 665 __GFP_REPEAT|__GFP_NOWARN,
300 HUGETLB_PAGE_ORDER); 666 huge_page_order(h));
667
668 if (page && arch_prepare_hugepage(page)) {
669 __free_pages(page, huge_page_order(h));
670 return NULL;
671 }
301 672
302 spin_lock(&hugetlb_lock); 673 spin_lock(&hugetlb_lock);
303 if (page) { 674 if (page) {
@@ -312,12 +683,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
312 /* 683 /*
313 * We incremented the global counters already 684 * We incremented the global counters already
314 */ 685 */
315 nr_huge_pages_node[nid]++; 686 h->nr_huge_pages_node[nid]++;
316 surplus_huge_pages_node[nid]++; 687 h->surplus_huge_pages_node[nid]++;
317 __count_vm_event(HTLB_BUDDY_PGALLOC); 688 __count_vm_event(HTLB_BUDDY_PGALLOC);
318 } else { 689 } else {
319 nr_huge_pages--; 690 h->nr_huge_pages--;
320 surplus_huge_pages--; 691 h->surplus_huge_pages--;
321 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 692 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
322 } 693 }
323 spin_unlock(&hugetlb_lock); 694 spin_unlock(&hugetlb_lock);
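
alloc_buddy_huge_page() accounts optimistically: it raises nr_huge_pages and surplus_huge_pages under hugetlb_lock, drops the lock for the potentially sleeping page allocation, and lowers the counters again if the allocation fails, which is what keeps concurrent callers from pushing the surplus past nr_overcommit_huge_pages. The same bump-outside-lock-then-roll-back idea in a small pthread sketch (illustrative only, with made-up names):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static long surplus, limit = 8;

static void *try_alloc(void)            /* placeholder for the real allocation */
{
        return malloc(64);
}

static void *alloc_surplus_object(void)
{
        void *p;

        pthread_mutex_lock(&lock);
        if (surplus >= limit) {         /* would exceed the overcommit limit */
                pthread_mutex_unlock(&lock);
                return NULL;
        }
        surplus++;                      /* optimistic: assume we will succeed */
        pthread_mutex_unlock(&lock);

        p = try_alloc();                /* may sleep; must not hold the lock */

        if (!p) {
                pthread_mutex_lock(&lock);
                surplus--;              /* roll back the optimistic bump */
                pthread_mutex_unlock(&lock);
        }
        return p;
}

int main(void)
{
        void *p = alloc_surplus_object();

        printf("surplus objects: %ld (%s)\n", surplus, p ? "allocated" : "failed");
        free(p);
        return 0;
}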
@@ -329,16 +700,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
 329 * Increase the hugetlb pool such that it can accommodate a reservation 700 * Increase the hugetlb pool such that it can accommodate a reservation
330 * of size 'delta'. 701 * of size 'delta'.
331 */ 702 */
332static int gather_surplus_pages(int delta) 703static int gather_surplus_pages(struct hstate *h, int delta)
333{ 704{
334 struct list_head surplus_list; 705 struct list_head surplus_list;
335 struct page *page, *tmp; 706 struct page *page, *tmp;
336 int ret, i; 707 int ret, i;
337 int needed, allocated; 708 int needed, allocated;
338 709
339 needed = (resv_huge_pages + delta) - free_huge_pages; 710 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
340 if (needed <= 0) { 711 if (needed <= 0) {
341 resv_huge_pages += delta; 712 h->resv_huge_pages += delta;
342 return 0; 713 return 0;
343 } 714 }
344 715
@@ -349,7 +720,7 @@ static int gather_surplus_pages(int delta)
349retry: 720retry:
350 spin_unlock(&hugetlb_lock); 721 spin_unlock(&hugetlb_lock);
351 for (i = 0; i < needed; i++) { 722 for (i = 0; i < needed; i++) {
352 page = alloc_buddy_huge_page(NULL, 0); 723 page = alloc_buddy_huge_page(h, NULL, 0);
353 if (!page) { 724 if (!page) {
354 /* 725 /*
355 * We were not able to allocate enough pages to 726 * We were not able to allocate enough pages to
@@ -370,7 +741,8 @@ retry:
370 * because either resv_huge_pages or free_huge_pages may have changed. 741 * because either resv_huge_pages or free_huge_pages may have changed.
371 */ 742 */
372 spin_lock(&hugetlb_lock); 743 spin_lock(&hugetlb_lock);
373 needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); 744 needed = (h->resv_huge_pages + delta) -
745 (h->free_huge_pages + allocated);
374 if (needed > 0) 746 if (needed > 0)
375 goto retry; 747 goto retry;
376 748
@@ -383,7 +755,7 @@ retry:
383 * before they are reserved. 755 * before they are reserved.
384 */ 756 */
385 needed += allocated; 757 needed += allocated;
386 resv_huge_pages += delta; 758 h->resv_huge_pages += delta;
387 ret = 0; 759 ret = 0;
388free: 760free:
389 /* Free the needed pages to the hugetlb pool */ 761 /* Free the needed pages to the hugetlb pool */
@@ -391,7 +763,7 @@ free:
391 if ((--needed) < 0) 763 if ((--needed) < 0)
392 break; 764 break;
393 list_del(&page->lru); 765 list_del(&page->lru);
394 enqueue_huge_page(page); 766 enqueue_huge_page(h, page);
395 } 767 }
396 768
397 /* Free unnecessary surplus pages to the buddy allocator */ 769 /* Free unnecessary surplus pages to the buddy allocator */
@@ -419,7 +791,8 @@ free:
419 * allocated to satisfy the reservation must be explicitly freed if they were 791 * allocated to satisfy the reservation must be explicitly freed if they were
420 * never used. 792 * never used.
421 */ 793 */
422static void return_unused_surplus_pages(unsigned long unused_resv_pages) 794static void return_unused_surplus_pages(struct hstate *h,
795 unsigned long unused_resv_pages)
423{ 796{
424 static int nid = -1; 797 static int nid = -1;
425 struct page *page; 798 struct page *page;
@@ -434,157 +807,269 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
434 unsigned long remaining_iterations = num_online_nodes(); 807 unsigned long remaining_iterations = num_online_nodes();
435 808
436 /* Uncommit the reservation */ 809 /* Uncommit the reservation */
437 resv_huge_pages -= unused_resv_pages; 810 h->resv_huge_pages -= unused_resv_pages;
438 811
439 nr_pages = min(unused_resv_pages, surplus_huge_pages); 812 /* Cannot return gigantic pages currently */
813 if (h->order >= MAX_ORDER)
814 return;
815
816 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
440 817
441 while (remaining_iterations-- && nr_pages) { 818 while (remaining_iterations-- && nr_pages) {
442 nid = next_node(nid, node_online_map); 819 nid = next_node(nid, node_online_map);
443 if (nid == MAX_NUMNODES) 820 if (nid == MAX_NUMNODES)
444 nid = first_node(node_online_map); 821 nid = first_node(node_online_map);
445 822
446 if (!surplus_huge_pages_node[nid]) 823 if (!h->surplus_huge_pages_node[nid])
447 continue; 824 continue;
448 825
449 if (!list_empty(&hugepage_freelists[nid])) { 826 if (!list_empty(&h->hugepage_freelists[nid])) {
450 page = list_entry(hugepage_freelists[nid].next, 827 page = list_entry(h->hugepage_freelists[nid].next,
451 struct page, lru); 828 struct page, lru);
452 list_del(&page->lru); 829 list_del(&page->lru);
453 update_and_free_page(page); 830 update_and_free_page(h, page);
454 free_huge_pages--; 831 h->free_huge_pages--;
455 free_huge_pages_node[nid]--; 832 h->free_huge_pages_node[nid]--;
456 surplus_huge_pages--; 833 h->surplus_huge_pages--;
457 surplus_huge_pages_node[nid]--; 834 h->surplus_huge_pages_node[nid]--;
458 nr_pages--; 835 nr_pages--;
459 remaining_iterations = num_online_nodes(); 836 remaining_iterations = num_online_nodes();
460 } 837 }
461 } 838 }
462} 839}
463 840
841/*
842 * Determine if the huge page at addr within the vma has an associated
 843 * reservation. Where it does not, we will need to logically increase
844 * reservation and actually increase quota before an allocation can occur.
 845 * Where any new reservation would be required, the reservation change is
 846 * prepared, but not committed. Once the page has been quota'd, allocated
 847 * and instantiated, the change should be committed via vma_commit_reservation.
848 * No action is required on failure.
849 */
850static int vma_needs_reservation(struct hstate *h,
851 struct vm_area_struct *vma, unsigned long addr)
852{
853 struct address_space *mapping = vma->vm_file->f_mapping;
854 struct inode *inode = mapping->host;
855
856 if (vma->vm_flags & VM_SHARED) {
857 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
858 return region_chg(&inode->i_mapping->private_list,
859 idx, idx + 1);
860
861 } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
862 return 1;
863
864 } else {
865 int err;
866 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
867 struct resv_map *reservations = vma_resv_map(vma);
464 868
465static struct page *alloc_huge_page_shared(struct vm_area_struct *vma, 869 err = region_chg(&reservations->regions, idx, idx + 1);
466 unsigned long addr) 870 if (err < 0)
871 return err;
872 return 0;
873 }
874}
875static void vma_commit_reservation(struct hstate *h,
876 struct vm_area_struct *vma, unsigned long addr)
467{ 877{
468 struct page *page; 878 struct address_space *mapping = vma->vm_file->f_mapping;
879 struct inode *inode = mapping->host;
469 880
470 spin_lock(&hugetlb_lock); 881 if (vma->vm_flags & VM_SHARED) {
471 page = dequeue_huge_page_vma(vma, addr); 882 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
472 spin_unlock(&hugetlb_lock); 883 region_add(&inode->i_mapping->private_list, idx, idx + 1);
473 return page ? page : ERR_PTR(-VM_FAULT_OOM); 884
885 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
886 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
887 struct resv_map *reservations = vma_resv_map(vma);
888
889 /* Mark this page used in the map. */
890 region_add(&reservations->regions, idx, idx + 1);
891 }
474} 892}
475 893
476static struct page *alloc_huge_page_private(struct vm_area_struct *vma, 894static struct page *alloc_huge_page(struct vm_area_struct *vma,
477 unsigned long addr) 895 unsigned long addr, int avoid_reserve)
478{ 896{
479 struct page *page = NULL; 897 struct hstate *h = hstate_vma(vma);
898 struct page *page;
899 struct address_space *mapping = vma->vm_file->f_mapping;
900 struct inode *inode = mapping->host;
901 unsigned int chg;
480 902
481 if (hugetlb_get_quota(vma->vm_file->f_mapping, 1)) 903 /*
482 return ERR_PTR(-VM_FAULT_SIGBUS); 904 * Processes that did not create the mapping will have no reserves and
905 * will not have accounted against quota. Check that the quota can be
906 * made before satisfying the allocation
907 * MAP_NORESERVE mappings may also need pages and quota allocated
908 * if no reserve mapping overlaps.
909 */
910 chg = vma_needs_reservation(h, vma, addr);
911 if (chg < 0)
912 return ERR_PTR(chg);
913 if (chg)
914 if (hugetlb_get_quota(inode->i_mapping, chg))
915 return ERR_PTR(-ENOSPC);
483 916
484 spin_lock(&hugetlb_lock); 917 spin_lock(&hugetlb_lock);
485 if (free_huge_pages > resv_huge_pages) 918 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
486 page = dequeue_huge_page_vma(vma, addr);
487 spin_unlock(&hugetlb_lock); 919 spin_unlock(&hugetlb_lock);
920
488 if (!page) { 921 if (!page) {
489 page = alloc_buddy_huge_page(vma, addr); 922 page = alloc_buddy_huge_page(h, vma, addr);
490 if (!page) { 923 if (!page) {
491 hugetlb_put_quota(vma->vm_file->f_mapping, 1); 924 hugetlb_put_quota(inode->i_mapping, chg);
492 return ERR_PTR(-VM_FAULT_OOM); 925 return ERR_PTR(-VM_FAULT_OOM);
493 } 926 }
494 } 927 }
928
929 set_page_refcounted(page);
930 set_page_private(page, (unsigned long) mapping);
931
932 vma_commit_reservation(h, vma, addr);
933
495 return page; 934 return page;
496} 935}
497 936
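
Taken together, the new alloc_huge_page() is a charge-then-commit sequence: vma_needs_reservation() says how many pages the fault would add, hugetlb_get_quota() charges the filesystem quota for that many, the page comes from the pool or from a surplus allocation, and only on success does vma_commit_reservation() record the consumed offset; on failure the quota is handed back. A stubbed outline of that unwinding, in which every helper is a placeholder rather than the kernel function it is named after:

#include <stdio.h>

static long needs_reservation(void)   { return 1; }            /* vma_needs_reservation() */
static int  charge_quota(long n)      { (void)n; return 0; }    /* hugetlb_get_quota()     */
static void return_quota(long n)      { (void)n; }              /* hugetlb_put_quota()     */
static void commit_reservation(void)  { }                       /* vma_commit_reservation()*/
static void *get_page_from_pool(void)
{
        static char fake_page;          /* dequeue or surplus allocation stand-in */
        return &fake_page;
}

static void *alloc_huge_page_sketch(void)
{
        long chg = needs_reservation();
        void *page;

        if (chg < 0)
                return NULL;            /* ERR_PTR(chg) in the real code */
        if (chg && charge_quota(chg))
                return NULL;            /* ERR_PTR(-ENOSPC)              */

        page = get_page_from_pool();
        if (!page) {
                return_quota(chg);      /* undo the charge on failure    */
                return NULL;            /* ERR_PTR(-VM_FAULT_OOM)        */
        }

        commit_reservation();           /* record the consumed offset    */
        return page;
}

int main(void)
{
        printf("allocation %s\n", alloc_huge_page_sketch() ? "succeeded" : "failed");
        return 0;
}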
498static struct page *alloc_huge_page(struct vm_area_struct *vma, 937__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
499 unsigned long addr)
500{ 938{
501 struct page *page; 939 struct huge_bootmem_page *m;
502 struct address_space *mapping = vma->vm_file->f_mapping; 940 int nr_nodes = nodes_weight(node_online_map);
503 941
504 if (vma->vm_flags & VM_MAYSHARE) 942 while (nr_nodes) {
505 page = alloc_huge_page_shared(vma, addr); 943 void *addr;
506 else 944
507 page = alloc_huge_page_private(vma, addr); 945 addr = __alloc_bootmem_node_nopanic(
946 NODE_DATA(h->hugetlb_next_nid),
947 huge_page_size(h), huge_page_size(h), 0);
508 948
509 if (!IS_ERR(page)) { 949 if (addr) {
510 set_page_refcounted(page); 950 /*
511 set_page_private(page, (unsigned long) mapping); 951 * Use the beginning of the huge page to store the
952 * huge_bootmem_page struct (until gather_bootmem
953 * puts them into the mem_map).
954 */
955 m = addr;
956 if (m)
957 goto found;
958 }
959 hstate_next_node(h);
960 nr_nodes--;
512 } 961 }
513 return page; 962 return 0;
963
964found:
965 BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
966 /* Put them into a private list first because mem_map is not up yet */
967 list_add(&m->list, &huge_boot_pages);
968 m->hstate = h;
969 return 1;
514} 970}
515 971
516static int __init hugetlb_init(void) 972/* Put bootmem huge pages into the standard lists after mem_map is up */
973static void __init gather_bootmem_prealloc(void)
517{ 974{
518 unsigned long i; 975 struct huge_bootmem_page *m;
519 976
520 if (HPAGE_SHIFT == 0) 977 list_for_each_entry(m, &huge_boot_pages, list) {
521 return 0; 978 struct page *page = virt_to_page(m);
522 979 struct hstate *h = m->hstate;
523 for (i = 0; i < MAX_NUMNODES; ++i) 980 __ClearPageReserved(page);
524 INIT_LIST_HEAD(&hugepage_freelists[i]); 981 WARN_ON(page_count(page) != 1);
982 prep_compound_page(page, h->order);
983 prep_new_huge_page(h, page, page_to_nid(page));
984 }
985}
525 986
526 hugetlb_next_nid = first_node(node_online_map); 987static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
988{
989 unsigned long i;
527 990
528 for (i = 0; i < max_huge_pages; ++i) { 991 for (i = 0; i < h->max_huge_pages; ++i) {
529 if (!alloc_fresh_huge_page()) 992 if (h->order >= MAX_ORDER) {
993 if (!alloc_bootmem_huge_page(h))
994 break;
995 } else if (!alloc_fresh_huge_page(h))
530 break; 996 break;
531 } 997 }
532 max_huge_pages = free_huge_pages = nr_huge_pages = i; 998 h->max_huge_pages = i;
533 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
534 return 0;
535} 999}
536module_init(hugetlb_init);
537 1000
538static int __init hugetlb_setup(char *s) 1001static void __init hugetlb_init_hstates(void)
539{ 1002{
540 if (sscanf(s, "%lu", &max_huge_pages) <= 0) 1003 struct hstate *h;
541 max_huge_pages = 0; 1004
542 return 1; 1005 for_each_hstate(h) {
1006 /* oversize hugepages were init'ed in early boot */
1007 if (h->order < MAX_ORDER)
1008 hugetlb_hstate_alloc_pages(h);
1009 }
543} 1010}
544__setup("hugepages=", hugetlb_setup);
545 1011
546static unsigned int cpuset_mems_nr(unsigned int *array) 1012static char * __init memfmt(char *buf, unsigned long n)
547{ 1013{
548 int node; 1014 if (n >= (1UL << 30))
549 unsigned int nr = 0; 1015 sprintf(buf, "%lu GB", n >> 30);
550 1016 else if (n >= (1UL << 20))
551 for_each_node_mask(node, cpuset_current_mems_allowed) 1017 sprintf(buf, "%lu MB", n >> 20);
552 nr += array[node]; 1018 else
1019 sprintf(buf, "%lu KB", n >> 10);
1020 return buf;
1021}
553 1022
554 return nr; 1023static void __init report_hugepages(void)
1024{
1025 struct hstate *h;
1026
1027 for_each_hstate(h) {
1028 char buf[32];
1029 printk(KERN_INFO "HugeTLB registered %s page size, "
1030 "pre-allocated %ld pages\n",
1031 memfmt(buf, huge_page_size(h)),
1032 h->free_huge_pages);
1033 }
555} 1034}
556 1035
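
memfmt() simply scales the byte count by the largest unit that fits, and report_hugepages() prints the result once per hstate at boot. A user-space copy of the helper shows the strings it produces:

#include <stdio.h>

static char *memfmt(char *buf, unsigned long n)
{
        if (n >= (1UL << 30))
                sprintf(buf, "%lu GB", n >> 30);
        else if (n >= (1UL << 20))
                sprintf(buf, "%lu MB", n >> 20);
        else
                sprintf(buf, "%lu KB", n >> 10);
        return buf;
}

int main(void)
{
        char buf[32];

        printf("%s\n", memfmt(buf, 2UL << 20));         /* 2 MB  */
        printf("%s\n", memfmt(buf, 1UL << 30));         /* 1 GB  */
        printf("%s\n", memfmt(buf, 16UL << 30));        /* 16 GB */
        return 0;
}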
557#ifdef CONFIG_SYSCTL
558#ifdef CONFIG_HIGHMEM 1036#ifdef CONFIG_HIGHMEM
559static void try_to_free_low(unsigned long count) 1037static void try_to_free_low(struct hstate *h, unsigned long count)
560{ 1038{
561 int i; 1039 int i;
562 1040
1041 if (h->order >= MAX_ORDER)
1042 return;
1043
563 for (i = 0; i < MAX_NUMNODES; ++i) { 1044 for (i = 0; i < MAX_NUMNODES; ++i) {
564 struct page *page, *next; 1045 struct page *page, *next;
565 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { 1046 struct list_head *freel = &h->hugepage_freelists[i];
566 if (count >= nr_huge_pages) 1047 list_for_each_entry_safe(page, next, freel, lru) {
1048 if (count >= h->nr_huge_pages)
567 return; 1049 return;
568 if (PageHighMem(page)) 1050 if (PageHighMem(page))
569 continue; 1051 continue;
570 list_del(&page->lru); 1052 list_del(&page->lru);
571 update_and_free_page(page); 1053 update_and_free_page(h, page);
572 free_huge_pages--; 1054 h->free_huge_pages--;
573 free_huge_pages_node[page_to_nid(page)]--; 1055 h->free_huge_pages_node[page_to_nid(page)]--;
574 } 1056 }
575 } 1057 }
576} 1058}
577#else 1059#else
578static inline void try_to_free_low(unsigned long count) 1060static inline void try_to_free_low(struct hstate *h, unsigned long count)
579{ 1061{
580} 1062}
581#endif 1063#endif
582 1064
583#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) 1065#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
584static unsigned long set_max_huge_pages(unsigned long count) 1066static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
585{ 1067{
586 unsigned long min_count, ret; 1068 unsigned long min_count, ret;
587 1069
1070 if (h->order >= MAX_ORDER)
1071 return h->max_huge_pages;
1072
588 /* 1073 /*
589 * Increase the pool size 1074 * Increase the pool size
590 * First take pages out of surplus state. Then make up the 1075 * First take pages out of surplus state. Then make up the
@@ -597,20 +1082,19 @@ static unsigned long set_max_huge_pages(unsigned long count)
597 * within all the constraints specified by the sysctls. 1082 * within all the constraints specified by the sysctls.
598 */ 1083 */
599 spin_lock(&hugetlb_lock); 1084 spin_lock(&hugetlb_lock);
600 while (surplus_huge_pages && count > persistent_huge_pages) { 1085 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
601 if (!adjust_pool_surplus(-1)) 1086 if (!adjust_pool_surplus(h, -1))
602 break; 1087 break;
603 } 1088 }
604 1089
605 while (count > persistent_huge_pages) { 1090 while (count > persistent_huge_pages(h)) {
606 int ret;
607 /* 1091 /*
608 * If this allocation races such that we no longer need the 1092 * If this allocation races such that we no longer need the
609 * page, free_huge_page will handle it by freeing the page 1093 * page, free_huge_page will handle it by freeing the page
610 * and reducing the surplus. 1094 * and reducing the surplus.
611 */ 1095 */
612 spin_unlock(&hugetlb_lock); 1096 spin_unlock(&hugetlb_lock);
613 ret = alloc_fresh_huge_page(); 1097 ret = alloc_fresh_huge_page(h);
614 spin_lock(&hugetlb_lock); 1098 spin_lock(&hugetlb_lock);
615 if (!ret) 1099 if (!ret)
616 goto out; 1100 goto out;
@@ -632,31 +1116,305 @@ static unsigned long set_max_huge_pages(unsigned long count)
632 * and won't grow the pool anywhere else. Not until one of the 1116 * and won't grow the pool anywhere else. Not until one of the
633 * sysctls are changed, or the surplus pages go out of use. 1117 * sysctls are changed, or the surplus pages go out of use.
634 */ 1118 */
635 min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; 1119 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
636 min_count = max(count, min_count); 1120 min_count = max(count, min_count);
637 try_to_free_low(min_count); 1121 try_to_free_low(h, min_count);
638 while (min_count < persistent_huge_pages) { 1122 while (min_count < persistent_huge_pages(h)) {
639 struct page *page = dequeue_huge_page(); 1123 struct page *page = dequeue_huge_page(h);
640 if (!page) 1124 if (!page)
641 break; 1125 break;
642 update_and_free_page(page); 1126 update_and_free_page(h, page);
643 } 1127 }
644 while (count < persistent_huge_pages) { 1128 while (count < persistent_huge_pages(h)) {
645 if (!adjust_pool_surplus(1)) 1129 if (!adjust_pool_surplus(h, 1))
646 break; 1130 break;
647 } 1131 }
648out: 1132out:
649 ret = persistent_huge_pages; 1133 ret = persistent_huge_pages(h);
650 spin_unlock(&hugetlb_lock); 1134 spin_unlock(&hugetlb_lock);
651 return ret; 1135 return ret;
652} 1136}
653 1137
1138#define HSTATE_ATTR_RO(_name) \
1139 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1140
1141#define HSTATE_ATTR(_name) \
1142 static struct kobj_attribute _name##_attr = \
1143 __ATTR(_name, 0644, _name##_show, _name##_store)
1144
1145static struct kobject *hugepages_kobj;
1146static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1147
1148static struct hstate *kobj_to_hstate(struct kobject *kobj)
1149{
1150 int i;
1151 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1152 if (hstate_kobjs[i] == kobj)
1153 return &hstates[i];
1154 BUG();
1155 return NULL;
1156}
1157
1158static ssize_t nr_hugepages_show(struct kobject *kobj,
1159 struct kobj_attribute *attr, char *buf)
1160{
1161 struct hstate *h = kobj_to_hstate(kobj);
1162 return sprintf(buf, "%lu\n", h->nr_huge_pages);
1163}
1164static ssize_t nr_hugepages_store(struct kobject *kobj,
1165 struct kobj_attribute *attr, const char *buf, size_t count)
1166{
1167 int err;
1168 unsigned long input;
1169 struct hstate *h = kobj_to_hstate(kobj);
1170
1171 err = strict_strtoul(buf, 10, &input);
1172 if (err)
1173 return 0;
1174
1175 h->max_huge_pages = set_max_huge_pages(h, input);
1176
1177 return count;
1178}
1179HSTATE_ATTR(nr_hugepages);
1180
1181static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1182 struct kobj_attribute *attr, char *buf)
1183{
1184 struct hstate *h = kobj_to_hstate(kobj);
1185 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1186}
1187static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1188 struct kobj_attribute *attr, const char *buf, size_t count)
1189{
1190 int err;
1191 unsigned long input;
1192 struct hstate *h = kobj_to_hstate(kobj);
1193
1194 err = strict_strtoul(buf, 10, &input);
1195 if (err)
1196 return 0;
1197
1198 spin_lock(&hugetlb_lock);
1199 h->nr_overcommit_huge_pages = input;
1200 spin_unlock(&hugetlb_lock);
1201
1202 return count;
1203}
1204HSTATE_ATTR(nr_overcommit_hugepages);
1205
1206static ssize_t free_hugepages_show(struct kobject *kobj,
1207 struct kobj_attribute *attr, char *buf)
1208{
1209 struct hstate *h = kobj_to_hstate(kobj);
1210 return sprintf(buf, "%lu\n", h->free_huge_pages);
1211}
1212HSTATE_ATTR_RO(free_hugepages);
1213
1214static ssize_t resv_hugepages_show(struct kobject *kobj,
1215 struct kobj_attribute *attr, char *buf)
1216{
1217 struct hstate *h = kobj_to_hstate(kobj);
1218 return sprintf(buf, "%lu\n", h->resv_huge_pages);
1219}
1220HSTATE_ATTR_RO(resv_hugepages);
1221
1222static ssize_t surplus_hugepages_show(struct kobject *kobj,
1223 struct kobj_attribute *attr, char *buf)
1224{
1225 struct hstate *h = kobj_to_hstate(kobj);
1226 return sprintf(buf, "%lu\n", h->surplus_huge_pages);
1227}
1228HSTATE_ATTR_RO(surplus_hugepages);
1229
1230static struct attribute *hstate_attrs[] = {
1231 &nr_hugepages_attr.attr,
1232 &nr_overcommit_hugepages_attr.attr,
1233 &free_hugepages_attr.attr,
1234 &resv_hugepages_attr.attr,
1235 &surplus_hugepages_attr.attr,
1236 NULL,
1237};
1238
1239static struct attribute_group hstate_attr_group = {
1240 .attrs = hstate_attrs,
1241};
1242
1243static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
1244{
1245 int retval;
1246
1247 hstate_kobjs[h - hstates] = kobject_create_and_add(h->name,
1248 hugepages_kobj);
1249 if (!hstate_kobjs[h - hstates])
1250 return -ENOMEM;
1251
1252 retval = sysfs_create_group(hstate_kobjs[h - hstates],
1253 &hstate_attr_group);
1254 if (retval)
1255 kobject_put(hstate_kobjs[h - hstates]);
1256
1257 return retval;
1258}
1259
1260static void __init hugetlb_sysfs_init(void)
1261{
1262 struct hstate *h;
1263 int err;
1264
1265 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
1266 if (!hugepages_kobj)
1267 return;
1268
1269 for_each_hstate(h) {
1270 err = hugetlb_sysfs_add_hstate(h);
1271 if (err)
1272 printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
1273 h->name);
1274 }
1275}
1276
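
A minimal userspace sketch (not part of this patch) of reading the per-hstate counters that hugetlb_sysfs_init() exposes under /sys/kernel/mm/hugepages/; the directory name hugepages-2048kB is an assumption that depends on the architecture's supported huge page sizes.

#include <stdio.h>

int main(void)
{
	/* Assumed path; on other architectures the size component differs. */
	const char *base = "/sys/kernel/mm/hugepages/hugepages-2048kB";
	const char *files[] = { "nr_hugepages", "nr_overcommit_hugepages",
				"free_hugepages", "resv_hugepages",
				"surplus_hugepages" };
	char path[256];
	unsigned long val;
	size_t i;

	for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
		FILE *f;

		snprintf(path, sizeof(path), "%s/%s", base, files[i]);
		f = fopen(path, "r");
		if (!f)
			continue;	/* hstate directory or file not present */
		if (fscanf(f, "%lu", &val) == 1)
			printf("%s = %lu\n", files[i], val);
		fclose(f);
	}
	return 0;
}
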
1277static void __exit hugetlb_exit(void)
1278{
1279 struct hstate *h;
1280
1281 for_each_hstate(h) {
1282 kobject_put(hstate_kobjs[h - hstates]);
1283 }
1284
1285 kobject_put(hugepages_kobj);
1286}
1287module_exit(hugetlb_exit);
1288
1289static int __init hugetlb_init(void)
1290{
 1291 /* Some platforms decide whether they support huge pages at boot
 1292 * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
 1293 * there is no such support.
1294 */
1295 if (HPAGE_SHIFT == 0)
1296 return 0;
1297
1298 if (!size_to_hstate(default_hstate_size)) {
1299 default_hstate_size = HPAGE_SIZE;
1300 if (!size_to_hstate(default_hstate_size))
1301 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1302 }
1303 default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
1304 if (default_hstate_max_huge_pages)
1305 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1306
1307 hugetlb_init_hstates();
1308
1309 gather_bootmem_prealloc();
1310
1311 report_hugepages();
1312
1313 hugetlb_sysfs_init();
1314
1315 return 0;
1316}
1317module_init(hugetlb_init);
1318
 1319/* Should be called when processing a hugepagesz=... option */
1320void __init hugetlb_add_hstate(unsigned order)
1321{
1322 struct hstate *h;
1323 unsigned long i;
1324
1325 if (size_to_hstate(PAGE_SIZE << order)) {
1326 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
1327 return;
1328 }
1329 BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
1330 BUG_ON(order == 0);
1331 h = &hstates[max_hstate++];
1332 h->order = order;
1333 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
1334 h->nr_huge_pages = 0;
1335 h->free_huge_pages = 0;
1336 for (i = 0; i < MAX_NUMNODES; ++i)
1337 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1338 h->hugetlb_next_nid = first_node(node_online_map);
1339 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1340 huge_page_size(h)/1024);
1341
1342 parsed_hstate = h;
1343}
1344
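
An illustrative check of the order/mask/name arithmetic above; order 9 and PAGE_SHIFT 12 (the common x86 2 MB case) are assumptions, not values taken from this patch.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned int order = 9, page_shift = 12;		/* assumed x86 2 MB case */
	uint64_t size = 1ULL << (order + page_shift);		/* huge_page_size(h) */
	uint64_t mask = ~((1ULL << (order + page_shift)) - 1);	/* h->mask */

	assert(size == 2 * 1024 * 1024);		/* 2 MB huge page */
	assert((0x200000ULL & mask) == 0x200000ULL);	/* aligned address preserved */
	assert((0x2fffffULL & mask) == 0x200000ULL);	/* in-page offset bits cleared */
	printf("hugepages-%llukB\n", (unsigned long long)(size / 1024));
	return 0;
}
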
1345static int __init hugetlb_nrpages_setup(char *s)
1346{
1347 unsigned long *mhp;
1348 static unsigned long *last_mhp;
1349
1350 /*
1351 * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
1352 * so this hugepages= parameter goes to the "default hstate".
1353 */
1354 if (!max_hstate)
1355 mhp = &default_hstate_max_huge_pages;
1356 else
1357 mhp = &parsed_hstate->max_huge_pages;
1358
1359 if (mhp == last_mhp) {
1360 printk(KERN_WARNING "hugepages= specified twice without "
1361 "interleaving hugepagesz=, ignoring\n");
1362 return 1;
1363 }
1364
1365 if (sscanf(s, "%lu", mhp) <= 0)
1366 *mhp = 0;
1367
1368 /*
1369 * Global state is always initialized later in hugetlb_init.
1370 * But we need to allocate >= MAX_ORDER hstates here early to still
1371 * use the bootmem allocator.
1372 */
1373 if (max_hstate && parsed_hstate->order >= MAX_ORDER)
1374 hugetlb_hstate_alloc_pages(parsed_hstate);
1375
1376 last_mhp = mhp;
1377
1378 return 1;
1379}
1380__setup("hugepages=", hugetlb_nrpages_setup);
1381
1382static int __init hugetlb_default_setup(char *s)
1383{
1384 default_hstate_size = memparse(s, &s);
1385 return 1;
1386}
1387__setup("default_hugepagesz=", hugetlb_default_setup);
1388
1389static unsigned int cpuset_mems_nr(unsigned int *array)
1390{
1391 int node;
1392 unsigned int nr = 0;
1393
1394 for_each_node_mask(node, cpuset_current_mems_allowed)
1395 nr += array[node];
1396
1397 return nr;
1398}
1399
1400#ifdef CONFIG_SYSCTL
654int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1401int hugetlb_sysctl_handler(struct ctl_table *table, int write,
655 struct file *file, void __user *buffer, 1402 struct file *file, void __user *buffer,
656 size_t *length, loff_t *ppos) 1403 size_t *length, loff_t *ppos)
657{ 1404{
1405 struct hstate *h = &default_hstate;
1406 unsigned long tmp;
1407
1408 if (!write)
1409 tmp = h->max_huge_pages;
1410
1411 table->data = &tmp;
1412 table->maxlen = sizeof(unsigned long);
658 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1413 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
659 max_huge_pages = set_max_huge_pages(max_huge_pages); 1414
1415 if (write)
1416 h->max_huge_pages = set_max_huge_pages(h, tmp);
1417
660 return 0; 1418 return 0;
661} 1419}
662 1420
@@ -676,10 +1434,22 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
676 struct file *file, void __user *buffer, 1434 struct file *file, void __user *buffer,
677 size_t *length, loff_t *ppos) 1435 size_t *length, loff_t *ppos)
678{ 1436{
1437 struct hstate *h = &default_hstate;
1438 unsigned long tmp;
1439
1440 if (!write)
1441 tmp = h->nr_overcommit_huge_pages;
1442
1443 table->data = &tmp;
1444 table->maxlen = sizeof(unsigned long);
679 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1445 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
680 spin_lock(&hugetlb_lock); 1446
681 nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; 1447 if (write) {
682 spin_unlock(&hugetlb_lock); 1448 spin_lock(&hugetlb_lock);
1449 h->nr_overcommit_huge_pages = tmp;
1450 spin_unlock(&hugetlb_lock);
1451 }
1452
683 return 0; 1453 return 0;
684} 1454}
685 1455
@@ -687,34 +1457,118 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
687 1457
688int hugetlb_report_meminfo(char *buf) 1458int hugetlb_report_meminfo(char *buf)
689{ 1459{
1460 struct hstate *h = &default_hstate;
690 return sprintf(buf, 1461 return sprintf(buf,
691 "HugePages_Total: %5lu\n" 1462 "HugePages_Total: %5lu\n"
692 "HugePages_Free: %5lu\n" 1463 "HugePages_Free: %5lu\n"
693 "HugePages_Rsvd: %5lu\n" 1464 "HugePages_Rsvd: %5lu\n"
694 "HugePages_Surp: %5lu\n" 1465 "HugePages_Surp: %5lu\n"
695 "Hugepagesize: %5lu kB\n", 1466 "Hugepagesize: %5lu kB\n",
696 nr_huge_pages, 1467 h->nr_huge_pages,
697 free_huge_pages, 1468 h->free_huge_pages,
698 resv_huge_pages, 1469 h->resv_huge_pages,
699 surplus_huge_pages, 1470 h->surplus_huge_pages,
700 HPAGE_SIZE/1024); 1471 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
701} 1472}
702 1473
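
A quick arithmetic check of the "Hugepagesize" expression above (illustrative only; order 9 and PAGE_SHIFT 12 are assumed, matching 2 MB huge pages): shifting by 10 fewer bits divides the huge page size by 1024, so the value is reported in kB just as HPAGE_SIZE/1024 was.

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int order = 9, page_shift = 12;		/* assumed 2 MB huge pages */
	unsigned long hpage_size = 1UL << (order + page_shift);
	unsigned long kb = 1UL << (order + page_shift - 10);	/* expression used above */

	assert(kb == hpage_size / 1024);
	printf("Hugepagesize: %5lu kB\n", kb);	/* prints "Hugepagesize:  2048 kB" */
	return 0;
}
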
703int hugetlb_report_node_meminfo(int nid, char *buf) 1474int hugetlb_report_node_meminfo(int nid, char *buf)
704{ 1475{
1476 struct hstate *h = &default_hstate;
705 return sprintf(buf, 1477 return sprintf(buf,
706 "Node %d HugePages_Total: %5u\n" 1478 "Node %d HugePages_Total: %5u\n"
707 "Node %d HugePages_Free: %5u\n" 1479 "Node %d HugePages_Free: %5u\n"
708 "Node %d HugePages_Surp: %5u\n", 1480 "Node %d HugePages_Surp: %5u\n",
709 nid, nr_huge_pages_node[nid], 1481 nid, h->nr_huge_pages_node[nid],
710 nid, free_huge_pages_node[nid], 1482 nid, h->free_huge_pages_node[nid],
711 nid, surplus_huge_pages_node[nid]); 1483 nid, h->surplus_huge_pages_node[nid]);
712} 1484}
713 1485
714/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 1486/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
715unsigned long hugetlb_total_pages(void) 1487unsigned long hugetlb_total_pages(void)
716{ 1488{
717 return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); 1489 struct hstate *h = &default_hstate;
1490 return h->nr_huge_pages * pages_per_huge_page(h);
1491}
1492
1493static int hugetlb_acct_memory(struct hstate *h, long delta)
1494{
1495 int ret = -ENOMEM;
1496
1497 spin_lock(&hugetlb_lock);
1498 /*
1499 * When cpuset is configured, it breaks the strict hugetlb page
 1500 * reservation as the accounting is done on a global variable. Such a
 1501 * reservation is essentially meaningless in the presence of cpusets because
 1502 * the reservation is not checked against page availability for the
 1503 * current cpuset. An application can still be OOM-killed by the kernel
 1504 * for lack of free huge pages in the cpuset that the task is in.
 1505 * Attempting to enforce strict accounting with cpusets is almost
 1506 * impossible (or too ugly) because cpusets are so fluid that tasks and
 1507 * memory nodes can be moved between them dynamically.
1508 *
1509 * The change of semantics for shared hugetlb mapping with cpuset is
1510 * undesirable. However, in order to preserve some of the semantics,
 1511 * we fall back to checking against the current free page availability
 1512 * as a best effort, hopefully minimizing the impact of the semantic
 1513 * change that cpusets introduce.
1514 */
1515 if (delta > 0) {
1516 if (gather_surplus_pages(h, delta) < 0)
1517 goto out;
1518
1519 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
1520 return_unused_surplus_pages(h, delta);
1521 goto out;
1522 }
1523 }
1524
1525 ret = 0;
1526 if (delta < 0)
1527 return_unused_surplus_pages(h, (unsigned long) -delta);
1528
1529out:
1530 spin_unlock(&hugetlb_lock);
1531 return ret;
1532}
1533
1534static void hugetlb_vm_op_open(struct vm_area_struct *vma)
1535{
1536 struct resv_map *reservations = vma_resv_map(vma);
1537
1538 /*
 1539 * This new VMA should share its sibling's reservation map if present.
1540 * The VMA will only ever have a valid reservation map pointer where
1541 * it is being copied for another still existing VMA. As that VMA
 1542 * has a reference to the reservation map, it cannot disappear until
1543 * after this open call completes. It is therefore safe to take a
1544 * new reference here without additional locking.
1545 */
1546 if (reservations)
1547 kref_get(&reservations->refs);
1548}
1549
1550static void hugetlb_vm_op_close(struct vm_area_struct *vma)
1551{
1552 struct hstate *h = hstate_vma(vma);
1553 struct resv_map *reservations = vma_resv_map(vma);
1554 unsigned long reserve;
1555 unsigned long start;
1556 unsigned long end;
1557
1558 if (reservations) {
1559 start = vma_hugecache_offset(h, vma, vma->vm_start);
1560 end = vma_hugecache_offset(h, vma, vma->vm_end);
1561
1562 reserve = (end - start) -
1563 region_count(&reservations->regions, start, end);
1564
1565 kref_put(&reservations->refs, resv_map_release);
1566
1567 if (reserve) {
1568 hugetlb_acct_memory(h, -reserve);
1569 hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
1570 }
1571 }
718} 1572}
719 1573
720/* 1574/*
@@ -731,6 +1585,8 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
731 1585
732struct vm_operations_struct hugetlb_vm_ops = { 1586struct vm_operations_struct hugetlb_vm_ops = {
733 .fault = hugetlb_vm_op_fault, 1587 .fault = hugetlb_vm_op_fault,
1588 .open = hugetlb_vm_op_open,
1589 .close = hugetlb_vm_op_close,
734}; 1590};
735 1591
736static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 1592static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -769,14 +1625,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
769 struct page *ptepage; 1625 struct page *ptepage;
770 unsigned long addr; 1626 unsigned long addr;
771 int cow; 1627 int cow;
1628 struct hstate *h = hstate_vma(vma);
1629 unsigned long sz = huge_page_size(h);
772 1630
773 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 1631 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
774 1632
775 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 1633 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
776 src_pte = huge_pte_offset(src, addr); 1634 src_pte = huge_pte_offset(src, addr);
777 if (!src_pte) 1635 if (!src_pte)
778 continue; 1636 continue;
779 dst_pte = huge_pte_alloc(dst, addr); 1637 dst_pte = huge_pte_alloc(dst, addr, sz);
780 if (!dst_pte) 1638 if (!dst_pte)
781 goto nomem; 1639 goto nomem;
782 1640
@@ -804,7 +1662,7 @@ nomem:
804} 1662}
805 1663
806void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 1664void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
807 unsigned long end) 1665 unsigned long end, struct page *ref_page)
808{ 1666{
809 struct mm_struct *mm = vma->vm_mm; 1667 struct mm_struct *mm = vma->vm_mm;
810 unsigned long address; 1668 unsigned long address;
@@ -812,6 +1670,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
812 pte_t pte; 1670 pte_t pte;
813 struct page *page; 1671 struct page *page;
814 struct page *tmp; 1672 struct page *tmp;
1673 struct hstate *h = hstate_vma(vma);
1674 unsigned long sz = huge_page_size(h);
1675
815 /* 1676 /*
816 * A page gathering list, protected by per file i_mmap_lock. The 1677 * A page gathering list, protected by per file i_mmap_lock. The
817 * lock is used to avoid list corruption from multiple unmapping 1678 * lock is used to avoid list corruption from multiple unmapping
@@ -820,11 +1681,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
820 LIST_HEAD(page_list); 1681 LIST_HEAD(page_list);
821 1682
822 WARN_ON(!is_vm_hugetlb_page(vma)); 1683 WARN_ON(!is_vm_hugetlb_page(vma));
823 BUG_ON(start & ~HPAGE_MASK); 1684 BUG_ON(start & ~huge_page_mask(h));
824 BUG_ON(end & ~HPAGE_MASK); 1685 BUG_ON(end & ~huge_page_mask(h));
825 1686
1687 mmu_notifier_invalidate_range_start(mm, start, end);
826 spin_lock(&mm->page_table_lock); 1688 spin_lock(&mm->page_table_lock);
827 for (address = start; address < end; address += HPAGE_SIZE) { 1689 for (address = start; address < end; address += sz) {
828 ptep = huge_pte_offset(mm, address); 1690 ptep = huge_pte_offset(mm, address);
829 if (!ptep) 1691 if (!ptep)
830 continue; 1692 continue;
@@ -832,6 +1694,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
832 if (huge_pmd_unshare(mm, &address, ptep)) 1694 if (huge_pmd_unshare(mm, &address, ptep))
833 continue; 1695 continue;
834 1696
1697 /*
1698 * If a reference page is supplied, it is because a specific
1699 * page is being unmapped, not a range. Ensure the page we
1700 * are about to unmap is the actual page of interest.
1701 */
1702 if (ref_page) {
1703 pte = huge_ptep_get(ptep);
1704 if (huge_pte_none(pte))
1705 continue;
1706 page = pte_page(pte);
1707 if (page != ref_page)
1708 continue;
1709
1710 /*
1711 * Mark the VMA as having unmapped its page so that
1712 * future faults in this VMA will fail rather than
1713 * looking like data was lost
1714 */
1715 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
1716 }
1717
835 pte = huge_ptep_get_and_clear(mm, address, ptep); 1718 pte = huge_ptep_get_and_clear(mm, address, ptep);
836 if (huge_pte_none(pte)) 1719 if (huge_pte_none(pte))
837 continue; 1720 continue;
@@ -843,6 +1726,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
843 } 1726 }
844 spin_unlock(&mm->page_table_lock); 1727 spin_unlock(&mm->page_table_lock);
845 flush_tlb_range(vma, start, end); 1728 flush_tlb_range(vma, start, end);
1729 mmu_notifier_invalidate_range_end(mm, start, end);
846 list_for_each_entry_safe(page, tmp, &page_list, lru) { 1730 list_for_each_entry_safe(page, tmp, &page_list, lru) {
847 list_del(&page->lru); 1731 list_del(&page->lru);
848 put_page(page); 1732 put_page(page);
@@ -850,31 +1734,71 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
850} 1734}
851 1735
852void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 1736void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
853 unsigned long end) 1737 unsigned long end, struct page *ref_page)
854{ 1738{
1739 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1740 __unmap_hugepage_range(vma, start, end, ref_page);
1741 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
1742}
1743
1744/*
1745 * This is called when the original mapper is failing to COW a MAP_PRIVATE
 1746 * mapping it owns the reserve page for. The intention is to unmap the page
1747 * from other VMAs and let the children be SIGKILLed if they are faulting the
1748 * same region.
1749 */
1750int unmap_ref_private(struct mm_struct *mm,
1751 struct vm_area_struct *vma,
1752 struct page *page,
1753 unsigned long address)
1754{
1755 struct vm_area_struct *iter_vma;
1756 struct address_space *mapping;
1757 struct prio_tree_iter iter;
1758 pgoff_t pgoff;
1759
855 /* 1760 /*
856 * It is undesirable to test vma->vm_file as it should be non-null 1761 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
857 * for valid hugetlb area. However, vm_file will be NULL in the error 1762 * from page cache lookup which is in HPAGE_SIZE units.
858 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
859 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
860 * to clean up. Since no pte has actually been setup, it is safe to
861 * do nothing in this case.
862 */ 1763 */
863 if (vma->vm_file) { 1764 address = address & huge_page_mask(hstate_vma(vma));
864 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 1765 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
865 __unmap_hugepage_range(vma, start, end); 1766 + (vma->vm_pgoff >> PAGE_SHIFT);
866 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 1767 mapping = (struct address_space *)page_private(page);
1768
1769 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1770 /* Do not unmap the current VMA */
1771 if (iter_vma == vma)
1772 continue;
1773
1774 /*
1775 * Unmap the page from other VMAs without their own reserves.
1776 * They get marked to be SIGKILLed if they fault in these
1777 * areas. This is because a future no-page fault on this VMA
1778 * could insert a zeroed page instead of the data existing
 1779 * from the time of fork. This would look like data corruption.
1780 */
1781 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
1782 unmap_hugepage_range(iter_vma,
1783 address, address + HPAGE_SIZE,
1784 page);
867 } 1785 }
1786
1787 return 1;
868} 1788}
869 1789
870static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 1790static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
871 unsigned long address, pte_t *ptep, pte_t pte) 1791 unsigned long address, pte_t *ptep, pte_t pte,
1792 struct page *pagecache_page)
872{ 1793{
1794 struct hstate *h = hstate_vma(vma);
873 struct page *old_page, *new_page; 1795 struct page *old_page, *new_page;
874 int avoidcopy; 1796 int avoidcopy;
1797 int outside_reserve = 0;
875 1798
876 old_page = pte_page(pte); 1799 old_page = pte_page(pte);
877 1800
1801retry_avoidcopy:
878 /* If no-one else is actually using this page, avoid the copy 1802 /* If no-one else is actually using this page, avoid the copy
879 * and just make the page writable */ 1803 * and just make the page writable */
880 avoidcopy = (page_count(old_page) == 1); 1804 avoidcopy = (page_count(old_page) == 1);
@@ -883,11 +1807,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
883 return 0; 1807 return 0;
884 } 1808 }
885 1809
1810 /*
1811 * If the process that created a MAP_PRIVATE mapping is about to
1812 * perform a COW due to a shared page count, attempt to satisfy
1813 * the allocation without using the existing reserves. The pagecache
1814 * page is used to determine if the reserve at this address was
 1815 * consumed or not. If reserves were used, a partially faulted mapping
1816 * at the time of fork() could consume its reserves on COW instead
1817 * of the full address range.
1818 */
1819 if (!(vma->vm_flags & VM_SHARED) &&
1820 is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
1821 old_page != pagecache_page)
1822 outside_reserve = 1;
1823
886 page_cache_get(old_page); 1824 page_cache_get(old_page);
887 new_page = alloc_huge_page(vma, address); 1825 new_page = alloc_huge_page(vma, address, outside_reserve);
888 1826
889 if (IS_ERR(new_page)) { 1827 if (IS_ERR(new_page)) {
890 page_cache_release(old_page); 1828 page_cache_release(old_page);
1829
1830 /*
1831 * If a process owning a MAP_PRIVATE mapping fails to COW,
1832 * it is due to references held by a child and an insufficient
 1833 * huge page pool. To guarantee the original mapper's
1834 * reliability, unmap the page from child processes. The child
1835 * may get SIGKILLed if it later faults.
1836 */
1837 if (outside_reserve) {
1838 BUG_ON(huge_pte_none(pte));
1839 if (unmap_ref_private(mm, vma, old_page, address)) {
1840 BUG_ON(page_count(old_page) != 1);
1841 BUG_ON(huge_pte_none(pte));
1842 goto retry_avoidcopy;
1843 }
1844 WARN_ON_ONCE(1);
1845 }
1846
891 return -PTR_ERR(new_page); 1847 return -PTR_ERR(new_page);
892 } 1848 }
893 1849
@@ -896,7 +1852,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
896 __SetPageUptodate(new_page); 1852 __SetPageUptodate(new_page);
897 spin_lock(&mm->page_table_lock); 1853 spin_lock(&mm->page_table_lock);
898 1854
899 ptep = huge_pte_offset(mm, address & HPAGE_MASK); 1855 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
900 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 1856 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
901 /* Break COW */ 1857 /* Break COW */
902 huge_ptep_clear_flush(vma, address, ptep); 1858 huge_ptep_clear_flush(vma, address, ptep);
@@ -910,19 +1866,44 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
910 return 0; 1866 return 0;
911} 1867}
912 1868
1869/* Return the pagecache page at a given address within a VMA */
1870static struct page *hugetlbfs_pagecache_page(struct hstate *h,
1871 struct vm_area_struct *vma, unsigned long address)
1872{
1873 struct address_space *mapping;
1874 pgoff_t idx;
1875
1876 mapping = vma->vm_file->f_mapping;
1877 idx = vma_hugecache_offset(h, vma, address);
1878
1879 return find_lock_page(mapping, idx);
1880}
1881
913static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 1882static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
914 unsigned long address, pte_t *ptep, int write_access) 1883 unsigned long address, pte_t *ptep, int write_access)
915{ 1884{
1885 struct hstate *h = hstate_vma(vma);
916 int ret = VM_FAULT_SIGBUS; 1886 int ret = VM_FAULT_SIGBUS;
917 unsigned long idx; 1887 pgoff_t idx;
918 unsigned long size; 1888 unsigned long size;
919 struct page *page; 1889 struct page *page;
920 struct address_space *mapping; 1890 struct address_space *mapping;
921 pte_t new_pte; 1891 pte_t new_pte;
922 1892
1893 /*
1894 * Currently, we are forced to kill the process in the event the
1895 * original mapper has unmapped pages from the child due to a failed
 1896 * COW. Warn that such a situation has occurred as it may not be obvious.
1897 */
1898 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
1899 printk(KERN_WARNING
1900 "PID %d killed due to inadequate hugepage pool\n",
1901 current->pid);
1902 return ret;
1903 }
1904
923 mapping = vma->vm_file->f_mapping; 1905 mapping = vma->vm_file->f_mapping;
924 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 1906 idx = vma_hugecache_offset(h, vma, address);
925 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
926 1907
927 /* 1908 /*
928 * Use page lock to guard against racing truncation 1909 * Use page lock to guard against racing truncation
@@ -931,15 +1912,15 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
931retry: 1912retry:
932 page = find_lock_page(mapping, idx); 1913 page = find_lock_page(mapping, idx);
933 if (!page) { 1914 if (!page) {
934 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1915 size = i_size_read(mapping->host) >> huge_page_shift(h);
935 if (idx >= size) 1916 if (idx >= size)
936 goto out; 1917 goto out;
937 page = alloc_huge_page(vma, address); 1918 page = alloc_huge_page(vma, address, 0);
938 if (IS_ERR(page)) { 1919 if (IS_ERR(page)) {
939 ret = -PTR_ERR(page); 1920 ret = -PTR_ERR(page);
940 goto out; 1921 goto out;
941 } 1922 }
942 clear_huge_page(page, address); 1923 clear_huge_page(page, address, huge_page_size(h));
943 __SetPageUptodate(page); 1924 __SetPageUptodate(page);
944 1925
945 if (vma->vm_flags & VM_SHARED) { 1926 if (vma->vm_flags & VM_SHARED) {
@@ -955,14 +1936,26 @@ retry:
955 } 1936 }
956 1937
957 spin_lock(&inode->i_lock); 1938 spin_lock(&inode->i_lock);
958 inode->i_blocks += BLOCKS_PER_HUGEPAGE; 1939 inode->i_blocks += blocks_per_huge_page(h);
959 spin_unlock(&inode->i_lock); 1940 spin_unlock(&inode->i_lock);
960 } else 1941 } else
961 lock_page(page); 1942 lock_page(page);
962 } 1943 }
963 1944
1945 /*
1946 * If we are going to COW a private mapping later, we examine the
1947 * pending reservations for this page now. This will ensure that
1948 * any allocations necessary to record that reservation occur outside
1949 * the spinlock.
1950 */
1951 if (write_access && !(vma->vm_flags & VM_SHARED))
1952 if (vma_needs_reservation(h, vma, address) < 0) {
1953 ret = VM_FAULT_OOM;
1954 goto backout_unlocked;
1955 }
1956
964 spin_lock(&mm->page_table_lock); 1957 spin_lock(&mm->page_table_lock);
965 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1958 size = i_size_read(mapping->host) >> huge_page_shift(h);
966 if (idx >= size) 1959 if (idx >= size)
967 goto backout; 1960 goto backout;
968 1961
@@ -976,7 +1969,7 @@ retry:
976 1969
977 if (write_access && !(vma->vm_flags & VM_SHARED)) { 1970 if (write_access && !(vma->vm_flags & VM_SHARED)) {
978 /* Optimization, do the COW without a second fault */ 1971 /* Optimization, do the COW without a second fault */
979 ret = hugetlb_cow(mm, vma, address, ptep, new_pte); 1972 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
980 } 1973 }
981 1974
982 spin_unlock(&mm->page_table_lock); 1975 spin_unlock(&mm->page_table_lock);
@@ -986,6 +1979,7 @@ out:
986 1979
987backout: 1980backout:
988 spin_unlock(&mm->page_table_lock); 1981 spin_unlock(&mm->page_table_lock);
1982backout_unlocked:
989 unlock_page(page); 1983 unlock_page(page);
990 put_page(page); 1984 put_page(page);
991 goto out; 1985 goto out;
@@ -997,9 +1991,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
997 pte_t *ptep; 1991 pte_t *ptep;
998 pte_t entry; 1992 pte_t entry;
999 int ret; 1993 int ret;
1994 struct page *pagecache_page = NULL;
1000 static DEFINE_MUTEX(hugetlb_instantiation_mutex); 1995 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
1996 struct hstate *h = hstate_vma(vma);
1001 1997
1002 ptep = huge_pte_alloc(mm, address); 1998 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
1003 if (!ptep) 1999 if (!ptep)
1004 return VM_FAULT_OOM; 2000 return VM_FAULT_OOM;
1005 2001
@@ -1012,23 +2008,58 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
1012 entry = huge_ptep_get(ptep); 2008 entry = huge_ptep_get(ptep);
1013 if (huge_pte_none(entry)) { 2009 if (huge_pte_none(entry)) {
1014 ret = hugetlb_no_page(mm, vma, address, ptep, write_access); 2010 ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
1015 mutex_unlock(&hugetlb_instantiation_mutex); 2011 goto out_unlock;
1016 return ret;
1017 } 2012 }
1018 2013
1019 ret = 0; 2014 ret = 0;
1020 2015
2016 /*
2017 * If we are going to COW the mapping later, we examine the pending
2018 * reservations for this page now. This will ensure that any
2019 * allocations necessary to record that reservation occur outside the
2020 * spinlock. For private mappings, we also lookup the pagecache
2021 * page now as it is used to determine if a reservation has been
2022 * consumed.
2023 */
2024 if (write_access && !pte_write(entry)) {
2025 if (vma_needs_reservation(h, vma, address) < 0) {
2026 ret = VM_FAULT_OOM;
2027 goto out_unlock;
2028 }
2029
2030 if (!(vma->vm_flags & VM_SHARED))
2031 pagecache_page = hugetlbfs_pagecache_page(h,
2032 vma, address);
2033 }
2034
1021 spin_lock(&mm->page_table_lock); 2035 spin_lock(&mm->page_table_lock);
1022 /* Check for a racing update before calling hugetlb_cow */ 2036 /* Check for a racing update before calling hugetlb_cow */
1023 if (likely(pte_same(entry, huge_ptep_get(ptep)))) 2037 if (likely(pte_same(entry, huge_ptep_get(ptep))))
1024 if (write_access && !pte_write(entry)) 2038 if (write_access && !pte_write(entry))
1025 ret = hugetlb_cow(mm, vma, address, ptep, entry); 2039 ret = hugetlb_cow(mm, vma, address, ptep, entry,
2040 pagecache_page);
1026 spin_unlock(&mm->page_table_lock); 2041 spin_unlock(&mm->page_table_lock);
2042
2043 if (pagecache_page) {
2044 unlock_page(pagecache_page);
2045 put_page(pagecache_page);
2046 }
2047
2048out_unlock:
1027 mutex_unlock(&hugetlb_instantiation_mutex); 2049 mutex_unlock(&hugetlb_instantiation_mutex);
1028 2050
1029 return ret; 2051 return ret;
1030} 2052}
1031 2053
 2054/* Can be overridden by architectures */
2055__attribute__((weak)) struct page *
2056follow_huge_pud(struct mm_struct *mm, unsigned long address,
2057 pud_t *pud, int write)
2058{
2059 BUG();
2060 return NULL;
2061}
2062
1032int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2063int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1033 struct page **pages, struct vm_area_struct **vmas, 2064 struct page **pages, struct vm_area_struct **vmas,
1034 unsigned long *position, int *length, int i, 2065 unsigned long *position, int *length, int i,
@@ -1037,6 +2068,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1037 unsigned long pfn_offset; 2068 unsigned long pfn_offset;
1038 unsigned long vaddr = *position; 2069 unsigned long vaddr = *position;
1039 int remainder = *length; 2070 int remainder = *length;
2071 struct hstate *h = hstate_vma(vma);
1040 2072
1041 spin_lock(&mm->page_table_lock); 2073 spin_lock(&mm->page_table_lock);
1042 while (vaddr < vma->vm_end && remainder) { 2074 while (vaddr < vma->vm_end && remainder) {
@@ -1048,7 +2080,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 1048 * each hugepage. We have to make sure we get the 2080 * each hugepage. We have to make sure we get the
1049 * first, for the page indexing below to work. 2081 * first, for the page indexing below to work.
1050 */ 2082 */
1051 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); 2083 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
1052 2084
1053 if (!pte || huge_pte_none(huge_ptep_get(pte)) || 2085 if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
1054 (write && !pte_write(huge_ptep_get(pte)))) { 2086 (write && !pte_write(huge_ptep_get(pte)))) {
@@ -1066,7 +2098,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1066 break; 2098 break;
1067 } 2099 }
1068 2100
1069 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; 2101 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
1070 page = pte_page(huge_ptep_get(pte)); 2102 page = pte_page(huge_ptep_get(pte));
1071same_page: 2103same_page:
1072 if (pages) { 2104 if (pages) {
@@ -1082,7 +2114,7 @@ same_page:
1082 --remainder; 2114 --remainder;
1083 ++i; 2115 ++i;
1084 if (vaddr < vma->vm_end && remainder && 2116 if (vaddr < vma->vm_end && remainder &&
1085 pfn_offset < HPAGE_SIZE/PAGE_SIZE) { 2117 pfn_offset < pages_per_huge_page(h)) {
1086 /* 2118 /*
1087 * We use pfn_offset to avoid touching the pageframes 2119 * We use pfn_offset to avoid touching the pageframes
1088 * of this compound page. 2120 * of this compound page.
@@ -1104,13 +2136,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1104 unsigned long start = address; 2136 unsigned long start = address;
1105 pte_t *ptep; 2137 pte_t *ptep;
1106 pte_t pte; 2138 pte_t pte;
2139 struct hstate *h = hstate_vma(vma);
1107 2140
1108 BUG_ON(address >= end); 2141 BUG_ON(address >= end);
1109 flush_cache_range(vma, address, end); 2142 flush_cache_range(vma, address, end);
1110 2143
1111 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 2144 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1112 spin_lock(&mm->page_table_lock); 2145 spin_lock(&mm->page_table_lock);
1113 for (; address < end; address += HPAGE_SIZE) { 2146 for (; address < end; address += huge_page_size(h)) {
1114 ptep = huge_pte_offset(mm, address); 2147 ptep = huge_pte_offset(mm, address);
1115 if (!ptep) 2148 if (!ptep)
1116 continue; 2149 continue;
@@ -1128,195 +2161,59 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1128 flush_tlb_range(vma, start, end); 2161 flush_tlb_range(vma, start, end);
1129} 2162}
1130 2163
1131struct file_region { 2164int hugetlb_reserve_pages(struct inode *inode,
1132 struct list_head link; 2165 long from, long to,
1133 long from; 2166 struct vm_area_struct *vma)
1134 long to;
1135};
1136
1137static long region_add(struct list_head *head, long f, long t)
1138{
1139 struct file_region *rg, *nrg, *trg;
1140
1141 /* Locate the region we are either in or before. */
1142 list_for_each_entry(rg, head, link)
1143 if (f <= rg->to)
1144 break;
1145
1146 /* Round our left edge to the current segment if it encloses us. */
1147 if (f > rg->from)
1148 f = rg->from;
1149
1150 /* Check for and consume any regions we now overlap with. */
1151 nrg = rg;
1152 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
1153 if (&rg->link == head)
1154 break;
1155 if (rg->from > t)
1156 break;
1157
1158 /* If this area reaches higher then extend our area to
1159 * include it completely. If this is not the first area
1160 * which we intend to reuse, free it. */
1161 if (rg->to > t)
1162 t = rg->to;
1163 if (rg != nrg) {
1164 list_del(&rg->link);
1165 kfree(rg);
1166 }
1167 }
1168 nrg->from = f;
1169 nrg->to = t;
1170 return 0;
1171}
1172
1173static long region_chg(struct list_head *head, long f, long t)
1174{ 2167{
1175 struct file_region *rg, *nrg; 2168 long ret, chg;
1176 long chg = 0; 2169 struct hstate *h = hstate_inode(inode);
1177
1178 /* Locate the region we are before or in. */
1179 list_for_each_entry(rg, head, link)
1180 if (f <= rg->to)
1181 break;
1182
1183 /* If we are below the current region then a new region is required.
1184 * Subtle, allocate a new region at the position but make it zero
1185 * size such that we can guarantee to record the reservation. */
1186 if (&rg->link == head || t < rg->from) {
1187 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
1188 if (!nrg)
1189 return -ENOMEM;
1190 nrg->from = f;
1191 nrg->to = f;
1192 INIT_LIST_HEAD(&nrg->link);
1193 list_add(&nrg->link, rg->link.prev);
1194
1195 return t - f;
1196 }
1197
1198 /* Round our left edge to the current segment if it encloses us. */
1199 if (f > rg->from)
1200 f = rg->from;
1201 chg = t - f;
1202
1203 /* Check for and consume any regions we now overlap with. */
1204 list_for_each_entry(rg, rg->link.prev, link) {
1205 if (&rg->link == head)
1206 break;
1207 if (rg->from > t)
1208 return chg;
1209
1210 /* We overlap with this area, if it extends futher than
1211 * us then we must extend ourselves. Account for its
1212 * existing reservation. */
1213 if (rg->to > t) {
1214 chg += rg->to - t;
1215 t = rg->to;
1216 }
1217 chg -= rg->to - rg->from;
1218 }
1219 return chg;
1220}
1221
1222static long region_truncate(struct list_head *head, long end)
1223{
1224 struct file_region *rg, *trg;
1225 long chg = 0;
1226 2170
1227 /* Locate the region we are either in or before. */ 2171 if (vma && vma->vm_flags & VM_NORESERVE)
1228 list_for_each_entry(rg, head, link)
1229 if (end <= rg->to)
1230 break;
1231 if (&rg->link == head)
1232 return 0; 2172 return 0;
1233 2173
1234 /* If we are in the middle of a region then adjust it. */
1235 if (end > rg->from) {
1236 chg = rg->to - end;
1237 rg->to = end;
1238 rg = list_entry(rg->link.next, typeof(*rg), link);
1239 }
1240
1241 /* Drop any remaining regions. */
1242 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
1243 if (&rg->link == head)
1244 break;
1245 chg += rg->to - rg->from;
1246 list_del(&rg->link);
1247 kfree(rg);
1248 }
1249 return chg;
1250}
1251
1252static int hugetlb_acct_memory(long delta)
1253{
1254 int ret = -ENOMEM;
1255
1256 spin_lock(&hugetlb_lock);
1257 /* 2174 /*
1258 * When cpuset is configured, it breaks the strict hugetlb page 2175 * Shared mappings base their reservation on the number of pages that
1259 * reservation as the accounting is done on a global variable. Such 2176 * are already allocated on behalf of the file. Private mappings need
1260 * reservation is completely rubbish in the presence of cpuset because 2177 * to reserve the full area even if read-only as mprotect() may be
1261 * the reservation is not checked against page availability for the 2178 * called to make the mapping read-write. Assume !vma is a shm mapping
1262 * current cpuset. Application can still potentially OOM'ed by kernel
1263 * with lack of free htlb page in cpuset that the task is in.
1264 * Attempt to enforce strict accounting with cpuset is almost
1265 * impossible (or too ugly) because cpuset is too fluid that
1266 * task or memory node can be dynamically moved between cpusets.
1267 *
1268 * The change of semantics for shared hugetlb mapping with cpuset is
1269 * undesirable. However, in order to preserve some of the semantics,
1270 * we fall back to check against current free page availability as
1271 * a best attempt and hopefully to minimize the impact of changing
1272 * semantics that cpuset has.
1273 */ 2179 */
1274 if (delta > 0) { 2180 if (!vma || vma->vm_flags & VM_SHARED)
1275 if (gather_surplus_pages(delta) < 0) 2181 chg = region_chg(&inode->i_mapping->private_list, from, to);
1276 goto out; 2182 else {
1277 2183 struct resv_map *resv_map = resv_map_alloc();
1278 if (delta > cpuset_mems_nr(free_huge_pages_node)) { 2184 if (!resv_map)
1279 return_unused_surplus_pages(delta); 2185 return -ENOMEM;
1280 goto out;
1281 }
1282 }
1283
1284 ret = 0;
1285 if (delta < 0)
1286 return_unused_surplus_pages((unsigned long) -delta);
1287 2186
1288out: 2187 chg = to - from;
1289 spin_unlock(&hugetlb_lock);
1290 return ret;
1291}
1292 2188
1293int hugetlb_reserve_pages(struct inode *inode, long from, long to) 2189 set_vma_resv_map(vma, resv_map);
1294{ 2190 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
1295 long ret, chg; 2191 }
1296 2192
1297 chg = region_chg(&inode->i_mapping->private_list, from, to);
1298 if (chg < 0) 2193 if (chg < 0)
1299 return chg; 2194 return chg;
1300 2195
1301 if (hugetlb_get_quota(inode->i_mapping, chg)) 2196 if (hugetlb_get_quota(inode->i_mapping, chg))
1302 return -ENOSPC; 2197 return -ENOSPC;
1303 ret = hugetlb_acct_memory(chg); 2198 ret = hugetlb_acct_memory(h, chg);
1304 if (ret < 0) { 2199 if (ret < 0) {
1305 hugetlb_put_quota(inode->i_mapping, chg); 2200 hugetlb_put_quota(inode->i_mapping, chg);
1306 return ret; 2201 return ret;
1307 } 2202 }
1308 region_add(&inode->i_mapping->private_list, from, to); 2203 if (!vma || vma->vm_flags & VM_SHARED)
2204 region_add(&inode->i_mapping->private_list, from, to);
1309 return 0; 2205 return 0;
1310} 2206}
1311 2207
1312void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 2208void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
1313{ 2209{
2210 struct hstate *h = hstate_inode(inode);
1314 long chg = region_truncate(&inode->i_mapping->private_list, offset); 2211 long chg = region_truncate(&inode->i_mapping->private_list, offset);
1315 2212
1316 spin_lock(&inode->i_lock); 2213 spin_lock(&inode->i_lock);
1317 inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; 2214 inode->i_blocks -= blocks_per_huge_page(h);
1318 spin_unlock(&inode->i_lock); 2215 spin_unlock(&inode->i_lock);
1319 2216
1320 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2217 hugetlb_put_quota(inode->i_mapping, (chg - freed));
1321 hugetlb_acct_memory(-(chg - freed)); 2218 hugetlb_acct_memory(h, -(chg - freed));
1322} 2219}
diff --git a/mm/internal.h b/mm/internal.h
index 0034e947e4bc..1f43f7416972 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,6 +13,11 @@
13 13
14#include <linux/mm.h> 14#include <linux/mm.h>
15 15
16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
17 unsigned long floor, unsigned long ceiling);
18
19extern void prep_compound_page(struct page *page, unsigned long order);
20
16static inline void set_page_count(struct page *page, int v) 21static inline void set_page_count(struct page *page, int v)
17{ 22{
18 atomic_set(&page->_count, v); 23 atomic_set(&page->_count, v);
@@ -59,4 +64,60 @@ static inline unsigned long page_order(struct page *page)
59#define __paginginit __init 64#define __paginginit __init
60#endif 65#endif
61 66
67/* Memory initialisation debug and verification */
68enum mminit_level {
69 MMINIT_WARNING,
70 MMINIT_VERIFY,
71 MMINIT_TRACE
72};
73
74#ifdef CONFIG_DEBUG_MEMORY_INIT
75
76extern int mminit_loglevel;
77
78#define mminit_dprintk(level, prefix, fmt, arg...) \
79do { \
80 if (level < mminit_loglevel) { \
81 printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \
82 printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \
83 } \
84} while (0)
85
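
A userspace mock (not kernel code, and only a sketch assuming mminit_loglevel == MMINIT_VERIFY) of the level gating that mminit_dprintk() performs above: a message is emitted only when its level is strictly below mminit_loglevel, with warnings and debug output tagged differently.

#include <stdio.h>

enum mminit_level { MMINIT_WARNING, MMINIT_VERIFY, MMINIT_TRACE };

static int mminit_loglevel = MMINIT_VERIFY;	/* assumed setting */

/* Mirrors the kernel macro's gating; printf stands in for the printk pair. */
#define mminit_dprintk(level, prefix, fmt, ...)				\
do {									\
	if ((level) < mminit_loglevel)					\
		printf("%s mminit::%s " fmt,				\
		       (level) <= MMINIT_WARNING ? "<warn>" : "<dbg>",	\
		       prefix, ##__VA_ARGS__);				\
} while (0)

int main(void)
{
	mminit_dprintk(MMINIT_WARNING, "example", "emitted at level %d\n", 0);
	mminit_dprintk(MMINIT_TRACE, "example", "suppressed at level %d\n", 2);
	return 0;
}
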
86extern void mminit_verify_pageflags_layout(void);
87extern void mminit_verify_page_links(struct page *page,
88 enum zone_type zone, unsigned long nid, unsigned long pfn);
89extern void mminit_verify_zonelist(void);
90
91#else
92
93static inline void mminit_dprintk(enum mminit_level level,
94 const char *prefix, const char *fmt, ...)
95{
96}
97
98static inline void mminit_verify_pageflags_layout(void)
99{
100}
101
102static inline void mminit_verify_page_links(struct page *page,
103 enum zone_type zone, unsigned long nid, unsigned long pfn)
104{
105}
106
107static inline void mminit_verify_zonelist(void)
108{
109}
110#endif /* CONFIG_DEBUG_MEMORY_INIT */
111
112/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
113#if defined(CONFIG_SPARSEMEM)
114extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
115 unsigned long *end_pfn);
116#else
117static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
118 unsigned long *end_pfn)
119{
120}
121#endif /* CONFIG_SPARSEMEM */
122
62#endif 123#endif
diff --git a/mm/madvise.c b/mm/madvise.c
index 23a0ec3e0ea0..f9349c18a1b5 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -132,10 +132,10 @@ static long madvise_willneed(struct vm_area_struct * vma,
132 * Application no longer needs these pages. If the pages are dirty, 132 * Application no longer needs these pages. If the pages are dirty,
133 * it's OK to just throw them away. The app will be more careful about 133 * it's OK to just throw them away. The app will be more careful about
134 * data it wants to keep. Be sure to free swap resources too. The 134 * data it wants to keep. Be sure to free swap resources too. The
135 * zap_page_range call sets things up for refill_inactive to actually free 135 * zap_page_range call sets things up for shrink_active_list to actually free
136 * these pages later if no one else has touched them in the meantime, 136 * these pages later if no one else has touched them in the meantime,
137 * although we could add these pages to a global reuse list for 137 * although we could add these pages to a global reuse list for
138 * refill_inactive to pick up before reclaiming other pages. 138 * shrink_active_list to pick up before reclaiming other pages.
139 * 139 *
140 * NB: This interface discards data rather than pushes it out to swap, 140 * NB: This interface discards data rather than pushes it out to swap,
141 * as some implementations do. This has performance implications for 141 * as some implementations do. This has performance implications for
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e46451e1d9b7..0f1f7a7374ba 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,9 +35,9 @@
35 35
36#include <asm/uaccess.h> 36#include <asm/uaccess.h>
37 37
38struct cgroup_subsys mem_cgroup_subsys; 38struct cgroup_subsys mem_cgroup_subsys __read_mostly;
39static const int MEM_CGROUP_RECLAIM_RETRIES = 5; 39static struct kmem_cache *page_cgroup_cache __read_mostly;
40static struct kmem_cache *page_cgroup_cache; 40#define MEM_CGROUP_RECLAIM_RETRIES 5
41 41
42/* 42/*
43 * Statistics for memory cgroup. 43 * Statistics for memory cgroup.
@@ -166,7 +166,6 @@ struct page_cgroup {
166 struct list_head lru; /* per cgroup LRU list */ 166 struct list_head lru; /* per cgroup LRU list */
167 struct page *page; 167 struct page *page;
168 struct mem_cgroup *mem_cgroup; 168 struct mem_cgroup *mem_cgroup;
169 int ref_cnt; /* cached, mapped, migrating */
170 int flags; 169 int flags;
171}; 170};
172#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ 171#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
@@ -185,6 +184,7 @@ static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
185enum charge_type { 184enum charge_type {
186 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 185 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
187 MEM_CGROUP_CHARGE_TYPE_MAPPED, 186 MEM_CGROUP_CHARGE_TYPE_MAPPED,
187 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
188}; 188};
189 189
190/* 190/*
@@ -296,7 +296,7 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
296 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; 296 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
297 297
298 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); 298 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
299 list_del_init(&pc->lru); 299 list_del(&pc->lru);
300} 300}
301 301
302static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, 302static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
@@ -354,6 +354,9 @@ void mem_cgroup_move_lists(struct page *page, bool active)
354 struct mem_cgroup_per_zone *mz; 354 struct mem_cgroup_per_zone *mz;
355 unsigned long flags; 355 unsigned long flags;
356 356
357 if (mem_cgroup_subsys.disabled)
358 return;
359
357 /* 360 /*
358 * We cannot lock_page_cgroup while holding zone's lru_lock, 361 * We cannot lock_page_cgroup while holding zone's lru_lock,
359 * because other holders of lock_page_cgroup can be interrupted 362 * because other holders of lock_page_cgroup can be interrupted
@@ -524,7 +527,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
524 * < 0 if the cgroup is over its limit 527 * < 0 if the cgroup is over its limit
525 */ 528 */
526static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 529static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
527 gfp_t gfp_mask, enum charge_type ctype) 530 gfp_t gfp_mask, enum charge_type ctype,
531 struct mem_cgroup *memcg)
528{ 532{
529 struct mem_cgroup *mem; 533 struct mem_cgroup *mem;
530 struct page_cgroup *pc; 534 struct page_cgroup *pc;
@@ -532,35 +536,8 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
532 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 536 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
533 struct mem_cgroup_per_zone *mz; 537 struct mem_cgroup_per_zone *mz;
534 538
535 if (mem_cgroup_subsys.disabled) 539 pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
536 return 0; 540 if (unlikely(pc == NULL))
537
538 /*
539 * Should page_cgroup's go to their own slab?
540 * One could optimize the performance of the charging routine
541 * by saving a bit in the page_flags and using it as a lock
542 * to see if the cgroup page already has a page_cgroup associated
543 * with it
544 */
545retry:
546 lock_page_cgroup(page);
547 pc = page_get_page_cgroup(page);
548 /*
549 * The page_cgroup exists and
550 * the page has already been accounted.
551 */
552 if (pc) {
553 VM_BUG_ON(pc->page != page);
554 VM_BUG_ON(pc->ref_cnt <= 0);
555
556 pc->ref_cnt++;
557 unlock_page_cgroup(page);
558 goto done;
559 }
560 unlock_page_cgroup(page);
561
562 pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask);
563 if (pc == NULL)
564 goto err; 541 goto err;
565 542
566 /* 543 /*
@@ -569,16 +546,18 @@ retry:
569 * thread group leader migrates. It's possible that mm is not 546 * thread group leader migrates. It's possible that mm is not
570 * set, if so charge the init_mm (happens for pagecache usage). 547 * set, if so charge the init_mm (happens for pagecache usage).
571 */ 548 */
572 if (!mm) 549 if (likely(!memcg)) {
573 mm = &init_mm; 550 rcu_read_lock();
574 551 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
575 rcu_read_lock(); 552 /*
576 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 553 * For every charge from the cgroup, increment reference count
577 /* 554 */
578 * For every charge from the cgroup, increment reference count 555 css_get(&mem->css);
579 */ 556 rcu_read_unlock();
580 css_get(&mem->css); 557 } else {
581 rcu_read_unlock(); 558 mem = memcg;
559 css_get(&memcg->css);
560 }
582 561
583 while (res_counter_charge(&mem->res, PAGE_SIZE)) { 562 while (res_counter_charge(&mem->res, PAGE_SIZE)) {
584 if (!(gfp_mask & __GFP_WAIT)) 563 if (!(gfp_mask & __GFP_WAIT))
@@ -603,25 +582,24 @@ retry:
603 } 582 }
604 } 583 }
605 584
606 pc->ref_cnt = 1;
607 pc->mem_cgroup = mem; 585 pc->mem_cgroup = mem;
608 pc->page = page; 586 pc->page = page;
609 pc->flags = PAGE_CGROUP_FLAG_ACTIVE; 587 /*
 588 * If a page is accounted as page cache, insert it into the inactive list.
 589 * If anonymous, insert it into the active list.
590 */
610 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) 591 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
611 pc->flags = PAGE_CGROUP_FLAG_CACHE; 592 pc->flags = PAGE_CGROUP_FLAG_CACHE;
593 else
594 pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
612 595
613 lock_page_cgroup(page); 596 lock_page_cgroup(page);
614 if (page_get_page_cgroup(page)) { 597 if (unlikely(page_get_page_cgroup(page))) {
615 unlock_page_cgroup(page); 598 unlock_page_cgroup(page);
616 /*
617 * Another charge has been added to this page already.
618 * We take lock_page_cgroup(page) again and read
619 * page->cgroup, increment refcnt.... just retry is OK.
620 */
621 res_counter_uncharge(&mem->res, PAGE_SIZE); 599 res_counter_uncharge(&mem->res, PAGE_SIZE);
622 css_put(&mem->css); 600 css_put(&mem->css);
623 kmem_cache_free(page_cgroup_cache, pc); 601 kmem_cache_free(page_cgroup_cache, pc);
624 goto retry; 602 goto done;
625 } 603 }
626 page_assign_page_cgroup(page, pc); 604 page_assign_page_cgroup(page, pc);
627 605
@@ -642,24 +620,65 @@ err:
642 620
643int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) 621int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
644{ 622{
623 if (mem_cgroup_subsys.disabled)
624 return 0;
625
626 /*
627 * If already mapped, we don't have to account.
628 * If page cache, page->mapping has address_space.
 629 * But page->mapping may have an out-of-use anon_vma pointer;
 630 * detect that with a PageAnon() check. A newly-mapped anonymous page's
 631 * page->mapping is NULL.
632 */
633 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
634 return 0;
635 if (unlikely(!mm))
636 mm = &init_mm;
645 return mem_cgroup_charge_common(page, mm, gfp_mask, 637 return mem_cgroup_charge_common(page, mm, gfp_mask,
646 MEM_CGROUP_CHARGE_TYPE_MAPPED); 638 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
647} 639}
648 640
649int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 641int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
650 gfp_t gfp_mask) 642 gfp_t gfp_mask)
651{ 643{
652 if (!mm) 644 if (mem_cgroup_subsys.disabled)
645 return 0;
646
647 /*
 648 * Corner case handling. This is usually called from add_to_page_cache(),
 649 * but some filesystems (shmem) precharge the page before calling it
 650 * and then call add_to_page_cache() with GFP_NOWAIT.
 651 *
 652 * In the GFP_NOWAIT case the page may already be charged before
 653 * add_to_page_cache() is called (see shmem.c); check for that here to
 654 * avoid charging twice. (It works but pays a slightly larger cost.)
655 */
656 if (!(gfp_mask & __GFP_WAIT)) {
657 struct page_cgroup *pc;
658
659 lock_page_cgroup(page);
660 pc = page_get_page_cgroup(page);
661 if (pc) {
662 VM_BUG_ON(pc->page != page);
663 VM_BUG_ON(!pc->mem_cgroup);
664 unlock_page_cgroup(page);
665 return 0;
666 }
667 unlock_page_cgroup(page);
668 }
669
670 if (unlikely(!mm))
653 mm = &init_mm; 671 mm = &init_mm;
672
654 return mem_cgroup_charge_common(page, mm, gfp_mask, 673 return mem_cgroup_charge_common(page, mm, gfp_mask,
655 MEM_CGROUP_CHARGE_TYPE_CACHE); 674 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
656} 675}
657 676
658/* 677/*
659 * Uncharging is always a welcome operation, we never complain, simply 678 * uncharge if !page_mapped(page)
660 * uncharge.
661 */ 679 */
662void mem_cgroup_uncharge_page(struct page *page) 680static void
681__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
663{ 682{
664 struct page_cgroup *pc; 683 struct page_cgroup *pc;
665 struct mem_cgroup *mem; 684 struct mem_cgroup *mem;
@@ -674,98 +693,153 @@ void mem_cgroup_uncharge_page(struct page *page)
674 */ 693 */
675 lock_page_cgroup(page); 694 lock_page_cgroup(page);
676 pc = page_get_page_cgroup(page); 695 pc = page_get_page_cgroup(page);
677 if (!pc) 696 if (unlikely(!pc))
678 goto unlock; 697 goto unlock;
679 698
680 VM_BUG_ON(pc->page != page); 699 VM_BUG_ON(pc->page != page);
681 VM_BUG_ON(pc->ref_cnt <= 0);
682 700
683 if (--(pc->ref_cnt) == 0) { 701 if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
684 mz = page_cgroup_zoneinfo(pc); 702 && ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
685 spin_lock_irqsave(&mz->lru_lock, flags); 703 || page_mapped(page)))
686 __mem_cgroup_remove_list(mz, pc); 704 goto unlock;
687 spin_unlock_irqrestore(&mz->lru_lock, flags);
688 705
689 page_assign_page_cgroup(page, NULL); 706 mz = page_cgroup_zoneinfo(pc);
690 unlock_page_cgroup(page); 707 spin_lock_irqsave(&mz->lru_lock, flags);
708 __mem_cgroup_remove_list(mz, pc);
709 spin_unlock_irqrestore(&mz->lru_lock, flags);
691 710
692 mem = pc->mem_cgroup; 711 page_assign_page_cgroup(page, NULL);
693 res_counter_uncharge(&mem->res, PAGE_SIZE); 712 unlock_page_cgroup(page);
694 css_put(&mem->css);
695 713
696 kmem_cache_free(page_cgroup_cache, pc); 714 mem = pc->mem_cgroup;
697 return; 715 res_counter_uncharge(&mem->res, PAGE_SIZE);
698 } 716 css_put(&mem->css);
699 717
718 kmem_cache_free(page_cgroup_cache, pc);
719 return;
700unlock: 720unlock:
701 unlock_page_cgroup(page); 721 unlock_page_cgroup(page);
702} 722}
703 723
724void mem_cgroup_uncharge_page(struct page *page)
725{
726 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
727}
728
729void mem_cgroup_uncharge_cache_page(struct page *page)
730{
731 VM_BUG_ON(page_mapped(page));
732 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
733}
734
704/* 735/*
705 * Returns non-zero if a page (under migration) has valid page_cgroup member. 736 * Before starting migration, account against new page.
706 * Refcnt of page_cgroup is incremented.
707 */ 737 */
708int mem_cgroup_prepare_migration(struct page *page) 738int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
709{ 739{
710 struct page_cgroup *pc; 740 struct page_cgroup *pc;
741 struct mem_cgroup *mem = NULL;
742 enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
743 int ret = 0;
711 744
712 if (mem_cgroup_subsys.disabled) 745 if (mem_cgroup_subsys.disabled)
713 return 0; 746 return 0;
714 747
715 lock_page_cgroup(page); 748 lock_page_cgroup(page);
716 pc = page_get_page_cgroup(page); 749 pc = page_get_page_cgroup(page);
717 if (pc) 750 if (pc) {
718 pc->ref_cnt++; 751 mem = pc->mem_cgroup;
752 css_get(&mem->css);
753 if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
754 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
755 }
719 unlock_page_cgroup(page); 756 unlock_page_cgroup(page);
720 return pc != NULL; 757 if (mem) {
758 ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
759 ctype, mem);
760 css_put(&mem->css);
761 }
762 return ret;
721} 763}
722 764
723void mem_cgroup_end_migration(struct page *page) 765/* remove redundant charge if migration failed*/
766void mem_cgroup_end_migration(struct page *newpage)
724{ 767{
725 mem_cgroup_uncharge_page(page); 768 /*
 769 * On success, page->mapping is not NULL.
 770 * Special rollback care is needed when
 771 * 1. migration failed (newpage->mapping is cleared in this case), or
 772 * 2. the newpage was moved but never remapped again because the task
 773 * exited and the newpage is now obsolete. The new page may then be
 774 * a swapcache page, so we always call mem_cgroup_uncharge_page()
 775 * to avoid a mess; the page_cgroup is removed only when it is
 776 * really unnecessary. File cache pages are still on the radix-tree
 777 * and need no special care.
778 */
779 if (!newpage->mapping)
780 __mem_cgroup_uncharge_common(newpage,
781 MEM_CGROUP_CHARGE_TYPE_FORCE);
782 else if (PageAnon(newpage))
783 mem_cgroup_uncharge_page(newpage);
726} 784}
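
Taken together, prepare_migration() charges the new page up front and end_migration() drops that charge again if the new page ends up unused. The caller's side, mirroring the migrate.c hunk later in this diff but simplified and with error handling elided, looks roughly like this:

#include <linux/memcontrol.h>
#include <linux/mm.h>

/* hypothetical, stripped-down version of the unmap_and_move() flow */
static int example_migrate_one(struct page *page, struct page *newpage)
{
	int charge = mem_cgroup_prepare_migration(page, newpage);

	if (charge == -ENOMEM)
		return -ENOMEM;

	/* ... unmap "page", copy it into "newpage", remap ... */

	/* drop the redundant charge if "newpage" was never remapped */
	if (!charge)
		mem_cgroup_end_migration(newpage);
	return 0;
}
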
727 785
728/* 786/*
729 * We know both *page* and *newpage* are now not-on-LRU and PG_locked. 787 * Try to shrink memory usage under the specified resource controller.
730 * And no race with uncharge() routines because page_cgroup for *page* 788 * This is typically used to reclaim shmem pages, reducing the side
731 * has extra one reference by mem_cgroup_prepare_migration. 789 * effects of page allocation from shmem when it is used by a mem_cgroup.
732 */ 790 */
733void mem_cgroup_page_migration(struct page *page, struct page *newpage) 791int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
734{ 792{
735 struct page_cgroup *pc; 793 struct mem_cgroup *mem;
736 struct mem_cgroup_per_zone *mz; 794 int progress = 0;
737 unsigned long flags; 795 int retry = MEM_CGROUP_RECLAIM_RETRIES;
738 796
739 lock_page_cgroup(page); 797 if (mem_cgroup_subsys.disabled)
740 pc = page_get_page_cgroup(page); 798 return 0;
741 if (!pc) { 799 if (!mm)
742 unlock_page_cgroup(page); 800 return 0;
743 return;
744 }
745 801
746 mz = page_cgroup_zoneinfo(pc); 802 rcu_read_lock();
747 spin_lock_irqsave(&mz->lru_lock, flags); 803 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
748 __mem_cgroup_remove_list(mz, pc); 804 css_get(&mem->css);
749 spin_unlock_irqrestore(&mz->lru_lock, flags); 805 rcu_read_unlock();
750 806
751 page_assign_page_cgroup(page, NULL); 807 do {
752 unlock_page_cgroup(page); 808 progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
809 } while (!progress && --retry);
753 810
754 pc->page = newpage; 811 css_put(&mem->css);
755 lock_page_cgroup(newpage); 812 if (!retry)
756 page_assign_page_cgroup(newpage, pc); 813 return -ENOMEM;
814 return 0;
815}
757 816
758 mz = page_cgroup_zoneinfo(pc); 817int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
759 spin_lock_irqsave(&mz->lru_lock, flags); 818{
760 __mem_cgroup_add_list(mz, pc); 819
761 spin_unlock_irqrestore(&mz->lru_lock, flags); 820 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
821 int progress;
822 int ret = 0;
762 823
763 unlock_page_cgroup(newpage); 824 while (res_counter_set_limit(&memcg->res, val)) {
825 if (signal_pending(current)) {
826 ret = -EINTR;
827 break;
828 }
829 if (!retry_count) {
830 ret = -EBUSY;
831 break;
832 }
833 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL);
834 if (!progress)
835 retry_count--;
836 }
837 return ret;
764} 838}
765 839
840
766/* 841/*
767 * This routine traverses the page_cgroups on the given list and drops them all. 842 * This routine traverses the page_cgroups on the given list and drops them all.
768 * This routine ignores page_cgroup->ref_cnt.
769 * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup. 843 * *And* this routine doesn't reclaim the page itself, it just removes the page_cgroup.
770 */ 844 */
771#define FORCE_UNCHARGE_BATCH (128) 845#define FORCE_UNCHARGE_BATCH (128)
@@ -790,12 +864,20 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
790 page = pc->page; 864 page = pc->page;
791 get_page(page); 865 get_page(page);
792 spin_unlock_irqrestore(&mz->lru_lock, flags); 866 spin_unlock_irqrestore(&mz->lru_lock, flags);
793 mem_cgroup_uncharge_page(page); 867 /*
794 put_page(page); 868 * Check if this page is on LRU. !LRU page can be found
795 if (--count <= 0) { 869 * if it's under page migration.
796 count = FORCE_UNCHARGE_BATCH; 870 */
871 if (PageLRU(page)) {
872 __mem_cgroup_uncharge_common(page,
873 MEM_CGROUP_CHARGE_TYPE_FORCE);
874 put_page(page);
875 if (--count <= 0) {
876 count = FORCE_UNCHARGE_BATCH;
877 cond_resched();
878 }
879 } else
797 cond_resched(); 880 cond_resched();
798 }
799 spin_lock_irqsave(&mz->lru_lock, flags); 881 spin_lock_irqsave(&mz->lru_lock, flags);
800 } 882 }
801 spin_unlock_irqrestore(&mz->lru_lock, flags); 883 spin_unlock_irqrestore(&mz->lru_lock, flags);
@@ -810,9 +892,6 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
810 int ret = -EBUSY; 892 int ret = -EBUSY;
811 int node, zid; 893 int node, zid;
812 894
813 if (mem_cgroup_subsys.disabled)
814 return 0;
815
816 css_get(&mem->css); 895 css_get(&mem->css);
817 /* 896 /*
818 * page reclaim code (kswapd etc..) will move pages between 897 * page reclaim code (kswapd etc..) will move pages between
@@ -838,32 +917,34 @@ out:
838 return ret; 917 return ret;
839} 918}
840 919
841static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
842{
843 *tmp = memparse(buf, &buf);
844 if (*buf != '\0')
845 return -EINVAL;
846
847 /*
848 * Round up the value to the closest page size
849 */
850 *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
851 return 0;
852}
853
854static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 920static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
855{ 921{
856 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, 922 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
857 cft->private); 923 cft->private);
858} 924}
859 925/*
860static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 926 * The user of this function is...
861 struct file *file, const char __user *userbuf, 927 * RES_LIMIT.
862 size_t nbytes, loff_t *ppos) 928 */
929static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
930 const char *buffer)
863{ 931{
864 return res_counter_write(&mem_cgroup_from_cont(cont)->res, 932 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
865 cft->private, userbuf, nbytes, ppos, 933 unsigned long long val;
866 mem_cgroup_write_strategy); 934 int ret;
935
936 switch (cft->private) {
937 case RES_LIMIT:
938 /* This function does all necessary parse...reuse it */
939 ret = res_counter_memparse_write_strategy(buffer, &val);
940 if (!ret)
941 ret = mem_cgroup_resize_limit(memcg, val);
942 break;
943 default:
944 ret = -EINVAL; /* should be BUG() ? */
945 break;
946 }
947 return ret;
867} 948}
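
From userspace this handler is reached by writing a human-readable size to the memory.limit_in_bytes file; res_counter_memparse_write_strategy() parses suffixes such as K/M/G, and mem_cgroup_resize_limit() then reclaims until the limit can be set. A small illustrative program (the cgroup mount point and group name are assumptions):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* assumed layout: memory controller mounted at /cgroups, group "grp0" */
	const char *path = "/cgroups/grp0/memory.limit_in_bytes";
	const char *limit = "64M";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, limit, strlen(limit)) < 0)
		perror("write");	/* EBUSY if reclaim could not make room */
	close(fd);
	return 0;
}
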
868 949
869static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 950static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
@@ -940,7 +1021,7 @@ static struct cftype mem_cgroup_files[] = {
940 { 1021 {
941 .name = "limit_in_bytes", 1022 .name = "limit_in_bytes",
942 .private = RES_LIMIT, 1023 .private = RES_LIMIT,
943 .write = mem_cgroup_write, 1024 .write_string = mem_cgroup_write,
944 .read_u64 = mem_cgroup_read, 1025 .read_u64 = mem_cgroup_read,
945 }, 1026 },
946 { 1027 {
@@ -1070,8 +1151,6 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1070static int mem_cgroup_populate(struct cgroup_subsys *ss, 1151static int mem_cgroup_populate(struct cgroup_subsys *ss,
1071 struct cgroup *cont) 1152 struct cgroup *cont)
1072{ 1153{
1073 if (mem_cgroup_subsys.disabled)
1074 return 0;
1075 return cgroup_add_files(cont, ss, mem_cgroup_files, 1154 return cgroup_add_files(cont, ss, mem_cgroup_files,
1076 ARRAY_SIZE(mem_cgroup_files)); 1155 ARRAY_SIZE(mem_cgroup_files));
1077} 1156}
@@ -1084,9 +1163,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1084 struct mm_struct *mm; 1163 struct mm_struct *mm;
1085 struct mem_cgroup *mem, *old_mem; 1164 struct mem_cgroup *mem, *old_mem;
1086 1165
1087 if (mem_cgroup_subsys.disabled)
1088 return;
1089
1090 mm = get_task_mm(p); 1166 mm = get_task_mm(p);
1091 if (mm == NULL) 1167 if (mm == NULL)
1092 return; 1168 return;
@@ -1094,9 +1170,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1094 mem = mem_cgroup_from_cont(cont); 1170 mem = mem_cgroup_from_cont(cont);
1095 old_mem = mem_cgroup_from_cont(old_cont); 1171 old_mem = mem_cgroup_from_cont(old_cont);
1096 1172
1097 if (mem == old_mem)
1098 goto out;
1099
1100 /* 1173 /*
1101 * Only thread group leaders are allowed to migrate, the mm_struct is 1174 * Only thread group leaders are allowed to migrate, the mm_struct is
1102 * in effect owned by the leader 1175 * in effect owned by the leader
diff --git a/mm/memory.c b/mm/memory.c
index 2302d228fe04..1002f473f497 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -51,6 +51,7 @@
51#include <linux/init.h> 51#include <linux/init.h>
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/memcontrol.h> 53#include <linux/memcontrol.h>
54#include <linux/mmu_notifier.h>
54 55
55#include <asm/pgalloc.h> 56#include <asm/pgalloc.h>
56#include <asm/uaccess.h> 57#include <asm/uaccess.h>
@@ -61,6 +62,8 @@
61#include <linux/swapops.h> 62#include <linux/swapops.h>
62#include <linux/elf.h> 63#include <linux/elf.h>
63 64
65#include "internal.h"
66
64#ifndef CONFIG_NEED_MULTIPLE_NODES 67#ifndef CONFIG_NEED_MULTIPLE_NODES
65/* use the per-pgdat data instead for discontigmem - mbligh */ 68/* use the per-pgdat data instead for discontigmem - mbligh */
66unsigned long max_mapnr; 69unsigned long max_mapnr;
@@ -211,7 +214,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
211 * 214 *
212 * Must be called with pagetable lock held. 215 * Must be called with pagetable lock held.
213 */ 216 */
214void free_pgd_range(struct mmu_gather **tlb, 217void free_pgd_range(struct mmu_gather *tlb,
215 unsigned long addr, unsigned long end, 218 unsigned long addr, unsigned long end,
216 unsigned long floor, unsigned long ceiling) 219 unsigned long floor, unsigned long ceiling)
217{ 220{
@@ -262,16 +265,16 @@ void free_pgd_range(struct mmu_gather **tlb,
262 return; 265 return;
263 266
264 start = addr; 267 start = addr;
265 pgd = pgd_offset((*tlb)->mm, addr); 268 pgd = pgd_offset(tlb->mm, addr);
266 do { 269 do {
267 next = pgd_addr_end(addr, end); 270 next = pgd_addr_end(addr, end);
268 if (pgd_none_or_clear_bad(pgd)) 271 if (pgd_none_or_clear_bad(pgd))
269 continue; 272 continue;
270 free_pud_range(*tlb, pgd, addr, next, floor, ceiling); 273 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
271 } while (pgd++, addr = next, addr != end); 274 } while (pgd++, addr = next, addr != end);
272} 275}
273 276
274void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, 277void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
275 unsigned long floor, unsigned long ceiling) 278 unsigned long floor, unsigned long ceiling)
276{ 279{
277 while (vma) { 280 while (vma) {
@@ -372,7 +375,8 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
372 * 375 *
373 * The calling function must still handle the error. 376 * The calling function must still handle the error.
374 */ 377 */
375void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) 378static void print_bad_pte(struct vm_area_struct *vma, pte_t pte,
379 unsigned long vaddr)
376{ 380{
377 printk(KERN_ERR "Bad pte = %08llx, process = %s, " 381 printk(KERN_ERR "Bad pte = %08llx, process = %s, "
378 "vm_flags = %lx, vaddr = %lx\n", 382 "vm_flags = %lx, vaddr = %lx\n",
@@ -649,6 +653,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
649 unsigned long next; 653 unsigned long next;
650 unsigned long addr = vma->vm_start; 654 unsigned long addr = vma->vm_start;
651 unsigned long end = vma->vm_end; 655 unsigned long end = vma->vm_end;
656 int ret;
652 657
653 /* 658 /*
654 * Don't copy ptes where a page fault will fill them correctly. 659 * Don't copy ptes where a page fault will fill them correctly.
@@ -664,17 +669,33 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
664 if (is_vm_hugetlb_page(vma)) 669 if (is_vm_hugetlb_page(vma))
665 return copy_hugetlb_page_range(dst_mm, src_mm, vma); 670 return copy_hugetlb_page_range(dst_mm, src_mm, vma);
666 671
672 /*
673 * We need to invalidate the secondary MMU mappings only when
674 * there could be a permission downgrade on the ptes of the
675 * parent mm. And a permission downgrade will only happen if
676 * is_cow_mapping() returns true.
677 */
678 if (is_cow_mapping(vma->vm_flags))
679 mmu_notifier_invalidate_range_start(src_mm, addr, end);
680
681 ret = 0;
667 dst_pgd = pgd_offset(dst_mm, addr); 682 dst_pgd = pgd_offset(dst_mm, addr);
668 src_pgd = pgd_offset(src_mm, addr); 683 src_pgd = pgd_offset(src_mm, addr);
669 do { 684 do {
670 next = pgd_addr_end(addr, end); 685 next = pgd_addr_end(addr, end);
671 if (pgd_none_or_clear_bad(src_pgd)) 686 if (pgd_none_or_clear_bad(src_pgd))
672 continue; 687 continue;
673 if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd, 688 if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
674 vma, addr, next)) 689 vma, addr, next))) {
675 return -ENOMEM; 690 ret = -ENOMEM;
691 break;
692 }
676 } while (dst_pgd++, src_pgd++, addr = next, addr != end); 693 } while (dst_pgd++, src_pgd++, addr = next, addr != end);
677 return 0; 694
695 if (is_cow_mapping(vma->vm_flags))
696 mmu_notifier_invalidate_range_end(src_mm,
697 vma->vm_start, end);
698 return ret;
678} 699}
679 700
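
The invalidate_range_start/end calls added here only matter to subsystems that mirror the page tables in a secondary MMU. A rough sketch of such a consumer, using only the mmu_notifier interface introduced by this merge (the callback bodies and all "example_" names are hypothetical):

#include <linux/mmu_notifier.h>

static void example_invalidate_range_start(struct mmu_notifier *mn,
					   struct mm_struct *mm,
					   unsigned long start,
					   unsigned long end)
{
	/* shoot down the device's mappings for [start, end) */
}

static void example_invalidate_range_end(struct mmu_notifier *mn,
					 struct mm_struct *mm,
					 unsigned long start,
					 unsigned long end)
{
	/* the primary ptes are stable again; allow re-faulting */
}

static struct mmu_notifier_ops example_mn_ops = {
	.invalidate_range_start	= example_invalidate_range_start,
	.invalidate_range_end	= example_invalidate_range_end,
};

static struct mmu_notifier example_mn = {
	.ops = &example_mn_ops,
};

static int example_attach(struct mm_struct *mm)
{
	/* once registered, the COW permission downgrade in
	 * copy_page_range() above is bracketed by the two callbacks */
	return mmu_notifier_register(&example_mn, mm);
}
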
680static unsigned long zap_pte_range(struct mmu_gather *tlb, 701static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -878,7 +899,9 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
878 unsigned long start = start_addr; 899 unsigned long start = start_addr;
879 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; 900 spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
880 int fullmm = (*tlbp)->fullmm; 901 int fullmm = (*tlbp)->fullmm;
902 struct mm_struct *mm = vma->vm_mm;
881 903
904 mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
882 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { 905 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
883 unsigned long end; 906 unsigned long end;
884 907
@@ -899,9 +922,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
899 } 922 }
900 923
901 if (unlikely(is_vm_hugetlb_page(vma))) { 924 if (unlikely(is_vm_hugetlb_page(vma))) {
902 unmap_hugepage_range(vma, start, end); 925 /*
903 zap_work -= (end - start) / 926 * It is undesirable to test vma->vm_file as it
904 (HPAGE_SIZE / PAGE_SIZE); 927 * should be non-null for valid hugetlb area.
928 * However, vm_file will be NULL in the error
929 * cleanup path of do_mmap_pgoff. When
930 * hugetlbfs ->mmap method fails,
931 * do_mmap_pgoff() nullifies vma->vm_file
932 * before calling this function to clean up.
933 * Since no pte has actually been setup, it is
934 * safe to do nothing in this case.
935 */
936 if (vma->vm_file) {
937 unmap_hugepage_range(vma, start, end, NULL);
938 zap_work -= (end - start) /
939 pages_per_huge_page(hstate_vma(vma));
940 }
941
905 start = end; 942 start = end;
906 } else 943 } else
907 start = unmap_page_range(*tlbp, vma, 944 start = unmap_page_range(*tlbp, vma,
@@ -929,6 +966,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
929 } 966 }
930 } 967 }
931out: 968out:
969 mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
932 return start; /* which is now the end (or restart) address */ 970 return start; /* which is now the end (or restart) address */
933} 971}
934 972
@@ -956,6 +994,29 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
956 return end; 994 return end;
957} 995}
958 996
997/**
998 * zap_vma_ptes - remove ptes mapping the vma
999 * @vma: vm_area_struct holding ptes to be zapped
1000 * @address: starting address of pages to zap
1001 * @size: number of bytes to zap
1002 *
1003 * This function only unmaps ptes assigned to VM_PFNMAP vmas.
1004 *
1005 * The entire address range must be fully contained within the vma.
1006 *
1007 * Returns 0 if successful.
1008 */
1009int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
1010 unsigned long size)
1011{
1012 if (address < vma->vm_start || address + size > vma->vm_end ||
1013 !(vma->vm_flags & VM_PFNMAP))
1014 return -1;
1015 zap_page_range(vma, address, size, NULL);
1016 return 0;
1017}
1018EXPORT_SYMBOL_GPL(zap_vma_ptes);
1019
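
zap_vma_ptes() is exported for drivers that manage VM_PFNMAP mappings and occasionally need to pull the ptes out from under userspace, for instance on a device reset. A hypothetical use (the driver-side names are invented):

#include <linux/mm.h>

/* hypothetical: invalidate the user's mapping of device memory so the
 * next access faults and the driver can decide what to map again */
static int example_invalidate_user_mapping(struct vm_area_struct *vma)
{
	if (zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start))
		return -EINVAL;	/* not a VM_PFNMAP vma, or range outside it */
	return 0;
}
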
959/* 1020/*
960 * Do a quick page-table lookup for a single page. 1021 * Do a quick page-table lookup for a single page.
961 */ 1022 */
@@ -982,19 +1043,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
982 goto no_page_table; 1043 goto no_page_table;
983 1044
984 pud = pud_offset(pgd, address); 1045 pud = pud_offset(pgd, address);
985 if (pud_none(*pud) || unlikely(pud_bad(*pud))) 1046 if (pud_none(*pud))
986 goto no_page_table; 1047 goto no_page_table;
987 1048 if (pud_huge(*pud)) {
1049 BUG_ON(flags & FOLL_GET);
1050 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1051 goto out;
1052 }
1053 if (unlikely(pud_bad(*pud)))
1054 goto no_page_table;
1055
988 pmd = pmd_offset(pud, address); 1056 pmd = pmd_offset(pud, address);
989 if (pmd_none(*pmd)) 1057 if (pmd_none(*pmd))
990 goto no_page_table; 1058 goto no_page_table;
991
992 if (pmd_huge(*pmd)) { 1059 if (pmd_huge(*pmd)) {
993 BUG_ON(flags & FOLL_GET); 1060 BUG_ON(flags & FOLL_GET);
994 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1061 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
995 goto out; 1062 goto out;
996 } 1063 }
997
998 if (unlikely(pmd_bad(*pmd))) 1064 if (unlikely(pmd_bad(*pmd)))
999 goto no_page_table; 1065 goto no_page_table;
1000 1066
@@ -1058,11 +1124,9 @@ static inline int use_zero_page(struct vm_area_struct *vma)
1058 if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) 1124 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
1059 return 0; 1125 return 0;
1060 /* 1126 /*
1061 * And if we have a fault or a nopfn routine, it's not an 1127 * And if we have a fault routine, it's not an anonymous region.
1062 * anonymous region.
1063 */ 1128 */
1064 return !vma->vm_ops || 1129 return !vma->vm_ops || !vma->vm_ops->fault;
1065 (!vma->vm_ops->fault && !vma->vm_ops->nopfn);
1066} 1130}
1067 1131
1068int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1132int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -1338,6 +1402,11 @@ out:
1338 * 1402 *
1339 * This function should only be called from a vm_ops->fault handler, and 1403 * This function should only be called from a vm_ops->fault handler, and
1340 * in that case the handler should return NULL. 1404 * in that case the handler should return NULL.
1405 *
1406 * vma cannot be a COW mapping.
1407 *
1408 * As this is called only for pages that do not currently exist, we
1409 * do not need to flush old virtual caches or the TLB.
1341 */ 1410 */
1342int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1411int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1343 unsigned long pfn) 1412 unsigned long pfn)
@@ -1548,6 +1617,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1548 unsigned long next; 1617 unsigned long next;
1549 int err; 1618 int err;
1550 1619
1620 BUG_ON(pud_huge(*pud));
1621
1551 pmd = pmd_alloc(mm, pud, addr); 1622 pmd = pmd_alloc(mm, pud, addr);
1552 if (!pmd) 1623 if (!pmd)
1553 return -ENOMEM; 1624 return -ENOMEM;
@@ -1589,10 +1660,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1589{ 1660{
1590 pgd_t *pgd; 1661 pgd_t *pgd;
1591 unsigned long next; 1662 unsigned long next;
1592 unsigned long end = addr + size; 1663 unsigned long start = addr, end = addr + size;
1593 int err; 1664 int err;
1594 1665
1595 BUG_ON(addr >= end); 1666 BUG_ON(addr >= end);
1667 mmu_notifier_invalidate_range_start(mm, start, end);
1596 pgd = pgd_offset(mm, addr); 1668 pgd = pgd_offset(mm, addr);
1597 do { 1669 do {
1598 next = pgd_addr_end(addr, end); 1670 next = pgd_addr_end(addr, end);
@@ -1600,6 +1672,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
1600 if (err) 1672 if (err)
1601 break; 1673 break;
1602 } while (pgd++, addr = next, addr != end); 1674 } while (pgd++, addr = next, addr != end);
1675 mmu_notifier_invalidate_range_end(mm, start, end);
1603 return err; 1676 return err;
1604} 1677}
1605EXPORT_SYMBOL_GPL(apply_to_page_range); 1678EXPORT_SYMBOL_GPL(apply_to_page_range);
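
apply_to_page_range() allocates any missing page-table levels and invokes the callback for every pte in the range, now bracketed by the mmu notifier calls. A sketch under the assumption that the callback type is the kernel's pte_fn_t, taking (pte, page-table token, address, private cookie); the "example_" functions are invented:

#include <linux/mm.h>

/* trivial callback: just count how many pte slots were visited */
static int example_count_pte(pte_t *pte, pgtable_t token,
			     unsigned long addr, void *data)
{
	(*(unsigned long *)data)++;
	return 0;	/* a non-zero return would abort the walk */
}

static unsigned long example_count_ptes(struct mm_struct *mm,
					unsigned long addr,
					unsigned long size)
{
	unsigned long count = 0;

	apply_to_page_range(mm, addr, size, example_count_pte, &count);
	return count;
}
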
@@ -1716,7 +1789,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1716 * not dirty accountable. 1789 * not dirty accountable.
1717 */ 1790 */
1718 if (PageAnon(old_page)) { 1791 if (PageAnon(old_page)) {
1719 if (!TestSetPageLocked(old_page)) { 1792 if (trylock_page(old_page)) {
1720 reuse = can_share_swap_page(old_page); 1793 reuse = can_share_swap_page(old_page);
1721 unlock_page(old_page); 1794 unlock_page(old_page);
1722 } 1795 }
@@ -1812,7 +1885,7 @@ gotten:
1812 * seen in the presence of one thread doing SMC and another 1885 * seen in the presence of one thread doing SMC and another
1813 * thread doing COW. 1886 * thread doing COW.
1814 */ 1887 */
1815 ptep_clear_flush(vma, address, page_table); 1888 ptep_clear_flush_notify(vma, address, page_table);
1816 set_pte_at(mm, address, page_table, entry); 1889 set_pte_at(mm, address, page_table, entry);
1817 update_mmu_cache(vma, address, entry); 1890 update_mmu_cache(vma, address, entry);
1818 lru_cache_add_active(new_page); 1891 lru_cache_add_active(new_page);
@@ -2501,59 +2574,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2501 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 2574 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2502} 2575}
2503 2576
2504
2505/*
2506 * do_no_pfn() tries to create a new page mapping for a page without
2507 * a struct_page backing it
2508 *
2509 * As this is called only for pages that do not currently exist, we
2510 * do not need to flush old virtual caches or the TLB.
2511 *
2512 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2513 * but allow concurrent faults), and pte mapped but not yet locked.
2514 * We return with mmap_sem still held, but pte unmapped and unlocked.
2515 *
2516 * It is expected that the ->nopfn handler always returns the same pfn
2517 * for a given virtual mapping.
2518 *
2519 * Mark this `noinline' to prevent it from bloating the main pagefault code.
2520 */
2521static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
2522 unsigned long address, pte_t *page_table, pmd_t *pmd,
2523 int write_access)
2524{
2525 spinlock_t *ptl;
2526 pte_t entry;
2527 unsigned long pfn;
2528
2529 pte_unmap(page_table);
2530 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2531 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2532
2533 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
2534
2535 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2536
2537 if (unlikely(pfn == NOPFN_OOM))
2538 return VM_FAULT_OOM;
2539 else if (unlikely(pfn == NOPFN_SIGBUS))
2540 return VM_FAULT_SIGBUS;
2541 else if (unlikely(pfn == NOPFN_REFAULT))
2542 return 0;
2543
2544 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2545
2546 /* Only go through if we didn't race with anybody else... */
2547 if (pte_none(*page_table)) {
2548 entry = pfn_pte(pfn, vma->vm_page_prot);
2549 if (write_access)
2550 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2551 set_pte_at(mm, address, page_table, entry);
2552 }
2553 pte_unmap_unlock(page_table, ptl);
2554 return 0;
2555}
2556
2557/* 2577/*
2558 * Fault of a previously existing named mapping. Repopulate the pte 2578 * Fault of a previously existing named mapping. Repopulate the pte
2559 * from the encoded file_pte if possible. This enables swappable 2579 * from the encoded file_pte if possible. This enables swappable
@@ -2614,9 +2634,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2614 if (likely(vma->vm_ops->fault)) 2634 if (likely(vma->vm_ops->fault))
2615 return do_linear_fault(mm, vma, address, 2635 return do_linear_fault(mm, vma, address,
2616 pte, pmd, write_access, entry); 2636 pte, pmd, write_access, entry);
2617 if (unlikely(vma->vm_ops->nopfn))
2618 return do_no_pfn(mm, vma, address, pte,
2619 pmd, write_access);
2620 } 2637 }
2621 return do_anonymous_page(mm, vma, address, 2638 return do_anonymous_page(mm, vma, address,
2622 pte, pmd, write_access); 2639 pte, pmd, write_access);
@@ -2748,16 +2765,26 @@ int make_pages_present(unsigned long addr, unsigned long end)
2748 2765
2749 vma = find_vma(current->mm, addr); 2766 vma = find_vma(current->mm, addr);
2750 if (!vma) 2767 if (!vma)
2751 return -1; 2768 return -ENOMEM;
2752 write = (vma->vm_flags & VM_WRITE) != 0; 2769 write = (vma->vm_flags & VM_WRITE) != 0;
2753 BUG_ON(addr >= end); 2770 BUG_ON(addr >= end);
2754 BUG_ON(end > vma->vm_end); 2771 BUG_ON(end > vma->vm_end);
2755 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; 2772 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
2756 ret = get_user_pages(current, current->mm, addr, 2773 ret = get_user_pages(current, current->mm, addr,
2757 len, write, 0, NULL, NULL); 2774 len, write, 0, NULL, NULL);
2758 if (ret < 0) 2775 if (ret < 0) {
2776 /*
2777 SUS requires unusual return values from mlock:
2778 - an invalid address should generate ENOMEM.
2779 - running out of memory should generate EAGAIN.
2780 */
2781 if (ret == -EFAULT)
2782 ret = -ENOMEM;
2783 else if (ret == -ENOMEM)
2784 ret = -EAGAIN;
2759 return ret; 2785 return ret;
2760 return ret == len ? 0 : -1; 2786 }
2787 return ret == len ? 0 : -ENOMEM;
2761} 2788}
2762 2789
2763#if !defined(__HAVE_ARCH_GATE_AREA) 2790#if !defined(__HAVE_ARCH_GATE_AREA)
@@ -2804,6 +2831,86 @@ int in_gate_area_no_task(unsigned long addr)
2804 2831
2805#endif /* __HAVE_ARCH_GATE_AREA */ 2832#endif /* __HAVE_ARCH_GATE_AREA */
2806 2833
2834#ifdef CONFIG_HAVE_IOREMAP_PROT
2835static resource_size_t follow_phys(struct vm_area_struct *vma,
2836 unsigned long address, unsigned int flags,
2837 unsigned long *prot)
2838{
2839 pgd_t *pgd;
2840 pud_t *pud;
2841 pmd_t *pmd;
2842 pte_t *ptep, pte;
2843 spinlock_t *ptl;
2844 resource_size_t phys_addr = 0;
2845 struct mm_struct *mm = vma->vm_mm;
2846
2847 VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
2848
2849 pgd = pgd_offset(mm, address);
2850 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
2851 goto no_page_table;
2852
2853 pud = pud_offset(pgd, address);
2854 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
2855 goto no_page_table;
2856
2857 pmd = pmd_offset(pud, address);
2858 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
2859 goto no_page_table;
2860
2861 /* We cannot handle huge page PFN maps. Luckily they don't exist. */
2862 if (pmd_huge(*pmd))
2863 goto no_page_table;
2864
2865 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
2866 if (!ptep)
2867 goto out;
2868
2869 pte = *ptep;
2870 if (!pte_present(pte))
2871 goto unlock;
2872 if ((flags & FOLL_WRITE) && !pte_write(pte))
2873 goto unlock;
2874 phys_addr = pte_pfn(pte);
2875 phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
2876
2877 *prot = pgprot_val(pte_pgprot(pte));
2878
2879unlock:
2880 pte_unmap_unlock(ptep, ptl);
2881out:
2882 return phys_addr;
2883no_page_table:
2884 return 0;
2885}
2886
2887int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
2888 void *buf, int len, int write)
2889{
2890 resource_size_t phys_addr;
2891 unsigned long prot = 0;
2892 void *maddr;
2893 int offset = addr & (PAGE_SIZE-1);
2894
2895 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
2896 return -EINVAL;
2897
2898 phys_addr = follow_phys(vma, addr, write, &prot);
2899
2900 if (!phys_addr)
2901 return -EINVAL;
2902
2903 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
2904 if (write)
2905 memcpy_toio(maddr + offset, buf, len);
2906 else
2907 memcpy_fromio(buf, maddr + offset, len);
2908 iounmap(maddr);
2909
2910 return len;
2911}
2912#endif
2913
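
With follow_phys()/generic_access_phys() in place, a driver that maps device memory with VM_IO|VM_PFNMAP can let ptrace and friends peek at it simply by pointing its vm_ops->access at the generic helper. A hypothetical hookup (driver names invented; CONFIG_HAVE_IOREMAP_PROT assumed):

#include <linux/fs.h>
#include <linux/mm.h>

static struct vm_operations_struct example_vm_ops = {
#ifdef CONFIG_HAVE_IOREMAP_PROT
	.access = generic_access_phys,	/* generic helper added above */
#endif
};

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &example_vm_ops;
	/* ... remap the device's pfns here (io_remap_pfn_range()) ... */
	return 0;
}
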
2807/* 2914/*
2808 * Access another process' address space. 2915 * Access another process' address space.
2809 * Source/target buffer must be kernel space, 2916 * Source/target buffer must be kernel space,
@@ -2813,7 +2920,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2813{ 2920{
2814 struct mm_struct *mm; 2921 struct mm_struct *mm;
2815 struct vm_area_struct *vma; 2922 struct vm_area_struct *vma;
2816 struct page *page;
2817 void *old_buf = buf; 2923 void *old_buf = buf;
2818 2924
2819 mm = get_task_mm(tsk); 2925 mm = get_task_mm(tsk);
@@ -2825,28 +2931,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2825 while (len) { 2931 while (len) {
2826 int bytes, ret, offset; 2932 int bytes, ret, offset;
2827 void *maddr; 2933 void *maddr;
2934 struct page *page = NULL;
2828 2935
2829 ret = get_user_pages(tsk, mm, addr, 1, 2936 ret = get_user_pages(tsk, mm, addr, 1,
2830 write, 1, &page, &vma); 2937 write, 1, &page, &vma);
2831 if (ret <= 0) 2938 if (ret <= 0) {
2832 break; 2939 /*
2833 2940 * Check if this is a VM_IO | VM_PFNMAP VMA, which
2834 bytes = len; 2941 * we can access using slightly different code.
2835 offset = addr & (PAGE_SIZE-1); 2942 */
2836 if (bytes > PAGE_SIZE-offset) 2943#ifdef CONFIG_HAVE_IOREMAP_PROT
2837 bytes = PAGE_SIZE-offset; 2944 vma = find_vma(mm, addr);
2838 2945 if (!vma)
2839 maddr = kmap(page); 2946 break;
2840 if (write) { 2947 if (vma->vm_ops && vma->vm_ops->access)
2841 copy_to_user_page(vma, page, addr, 2948 ret = vma->vm_ops->access(vma, addr, buf,
2842 maddr + offset, buf, bytes); 2949 len, write);
2843 set_page_dirty_lock(page); 2950 if (ret <= 0)
2951#endif
2952 break;
2953 bytes = ret;
2844 } else { 2954 } else {
2845 copy_from_user_page(vma, page, addr, 2955 bytes = len;
2846 buf, maddr + offset, bytes); 2956 offset = addr & (PAGE_SIZE-1);
2957 if (bytes > PAGE_SIZE-offset)
2958 bytes = PAGE_SIZE-offset;
2959
2960 maddr = kmap(page);
2961 if (write) {
2962 copy_to_user_page(vma, page, addr,
2963 maddr + offset, buf, bytes);
2964 set_page_dirty_lock(page);
2965 } else {
2966 copy_from_user_page(vma, page, addr,
2967 buf, maddr + offset, bytes);
2968 }
2969 kunmap(page);
2970 page_cache_release(page);
2847 } 2971 }
2848 kunmap(page);
2849 page_cache_release(page);
2850 len -= bytes; 2972 len -= bytes;
2851 buf += bytes; 2973 buf += bytes;
2852 addr += bytes; 2974 addr += bytes;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 833f854eabe5..89fee2dcb039 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -62,9 +62,9 @@ static void release_memory_resource(struct resource *res)
62 62
63#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 63#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
64#ifndef CONFIG_SPARSEMEM_VMEMMAP 64#ifndef CONFIG_SPARSEMEM_VMEMMAP
65static void get_page_bootmem(unsigned long info, struct page *page, int magic) 65static void get_page_bootmem(unsigned long info, struct page *page, int type)
66{ 66{
67 atomic_set(&page->_mapcount, magic); 67 atomic_set(&page->_mapcount, type);
68 SetPagePrivate(page); 68 SetPagePrivate(page);
69 set_page_private(page, info); 69 set_page_private(page, info);
70 atomic_inc(&page->_count); 70 atomic_inc(&page->_count);
@@ -72,10 +72,10 @@ static void get_page_bootmem(unsigned long info, struct page *page, int magic)
72 72
73void put_page_bootmem(struct page *page) 73void put_page_bootmem(struct page *page)
74{ 74{
75 int magic; 75 int type;
76 76
77 magic = atomic_read(&page->_mapcount); 77 type = atomic_read(&page->_mapcount);
78 BUG_ON(magic >= -1); 78 BUG_ON(type >= -1);
79 79
80 if (atomic_dec_return(&page->_count) == 1) { 80 if (atomic_dec_return(&page->_count) == 1) {
81 ClearPagePrivate(page); 81 ClearPagePrivate(page);
@@ -86,7 +86,7 @@ void put_page_bootmem(struct page *page)
86 86
87} 87}
88 88
89void register_page_bootmem_info_section(unsigned long start_pfn) 89static void register_page_bootmem_info_section(unsigned long start_pfn)
90{ 90{
91 unsigned long *usemap, mapsize, section_nr, i; 91 unsigned long *usemap, mapsize, section_nr, i;
92 struct mem_section *ms; 92 struct mem_section *ms;
@@ -119,7 +119,7 @@ void register_page_bootmem_info_section(unsigned long start_pfn)
119 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 119 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
120 120
121 for (i = 0; i < mapsize; i++, page++) 121 for (i = 0; i < mapsize; i++, page++)
122 get_page_bootmem(section_nr, page, MIX_INFO); 122 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
123 123
124} 124}
125 125
@@ -429,7 +429,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
429 429
430 if (need_zonelists_rebuild) 430 if (need_zonelists_rebuild)
431 build_all_zonelists(); 431 build_all_zonelists();
432 vm_total_pages = nr_free_pagecache_pages(); 432 else
433 vm_total_pages = nr_free_pagecache_pages();
434
433 writeback_set_ratelimit(); 435 writeback_set_ratelimit();
434 436
435 if (onlined_pages) 437 if (onlined_pages)
@@ -455,7 +457,7 @@ static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
455 /* we can use NODE_DATA(nid) from here */ 457 /* we can use NODE_DATA(nid) from here */
456 458
457 /* init node's zones as empty zones, we don't have any present pages.*/ 459 /* init node's zones as empty zones, we don't have any present pages.*/
458 free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); 460 free_area_init_node(nid, zones_size, start_pfn, zholes_size);
459 461
460 return pgdat; 462 return pgdat;
461} 463}
@@ -521,6 +523,66 @@ EXPORT_SYMBOL_GPL(add_memory);
521 523
522#ifdef CONFIG_MEMORY_HOTREMOVE 524#ifdef CONFIG_MEMORY_HOTREMOVE
523/* 525/*
526 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
527 * set and the size of the free page is given by page_order(). Using this,
528 * the function determines if the pageblock contains only free pages.
 529 * Due to buddy constraints, a free page at least the size of a pageblock
 530 * will be located at the start of the pageblock.
531 */
532static inline int pageblock_free(struct page *page)
533{
534 return PageBuddy(page) && page_order(page) >= pageblock_order;
535}
536
537/* Return the start of the next active pageblock after a given page */
538static struct page *next_active_pageblock(struct page *page)
539{
540 int pageblocks_stride;
541
542 /* Ensure the starting page is pageblock-aligned */
543 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
544
545 /* Move forward by at least 1 * pageblock_nr_pages */
546 pageblocks_stride = 1;
547
548 /* If the entire pageblock is free, move to the end of free page */
549 if (pageblock_free(page))
550 pageblocks_stride += page_order(page) - pageblock_order;
551
552 return page + (pageblocks_stride * pageblock_nr_pages);
553}
554
555/* Checks if this range of memory is likely to be hot-removable. */
556int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
557{
558 int type;
559 struct page *page = pfn_to_page(start_pfn);
560 struct page *end_page = page + nr_pages;
561
562 /* Check the starting page of each pageblock within the range */
563 for (; page < end_page; page = next_active_pageblock(page)) {
564 type = get_pageblock_migratetype(page);
565
566 /*
567 * A pageblock containing MOVABLE or free pages is considered
568 * removable
569 */
570 if (type != MIGRATE_MOVABLE && !pageblock_free(page))
571 return 0;
572
573 /*
574 * A pageblock starting with a PageReserved page is not
575 * considered removable.
576 */
577 if (PageReserved(page))
578 return 0;
579 }
580
581 /* All pageblocks in the memory block are likely to be hot-removable */
582 return 1;
583}
584
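
is_mem_section_removable() only reports a likelihood, so a consumer (the memory sysfs code is the obvious one) would AND it over every present section of a memory block. A rough sketch, assuming the usual SPARSEMEM helpers; the function itself is hypothetical:

#include <linux/memory_hotplug.h>
#include <linux/mmzone.h>

/* hypothetical: 0 = definitely not removable, 1 = likely removable */
static int example_block_removable(unsigned long start_section_nr,
				   unsigned long nr_sections)
{
	unsigned long nr;
	int removable = 1;

	for (nr = start_section_nr; nr < start_section_nr + nr_sections; nr++) {
		if (!present_section_nr(nr))
			continue;
		removable &= is_mem_section_removable(section_nr_to_pfn(nr),
						      PAGES_PER_SECTION);
	}
	return removable;
}
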
585/*
524 * Confirm all pages in a range [start, end) is belongs to the same zone. 586 * Confirm all pages in a range [start, end) is belongs to the same zone.
525 */ 587 */
526static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) 588static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c94e58b192c3..83369058ec13 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -803,7 +803,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
803int do_migrate_pages(struct mm_struct *mm, 803int do_migrate_pages(struct mm_struct *mm,
804 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 804 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
805{ 805{
806 LIST_HEAD(pagelist);
807 int busy = 0; 806 int busy = 0;
808 int err = 0; 807 int err = 0;
809 nodemask_t tmp; 808 nodemask_t tmp;
@@ -1481,7 +1480,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1481 1480
1482 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { 1481 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1483 zl = node_zonelist(interleave_nid(*mpol, vma, addr, 1482 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1484 HPAGE_SHIFT), gfp_flags); 1483 huge_page_shift(hstate_vma(vma))), gfp_flags);
1485 } else { 1484 } else {
1486 zl = policy_zonelist(gfp_flags, *mpol); 1485 zl = policy_zonelist(gfp_flags, *mpol);
1487 if ((*mpol)->mode == MPOL_BIND) 1486 if ((*mpol)->mode == MPOL_BIND)
@@ -2220,9 +2219,12 @@ static void check_huge_range(struct vm_area_struct *vma,
2220{ 2219{
2221 unsigned long addr; 2220 unsigned long addr;
2222 struct page *page; 2221 struct page *page;
2222 struct hstate *h = hstate_vma(vma);
2223 unsigned long sz = huge_page_size(h);
2223 2224
2224 for (addr = start; addr < end; addr += HPAGE_SIZE) { 2225 for (addr = start; addr < end; addr += sz) {
2225 pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); 2226 pte_t *ptep = huge_pte_offset(vma->vm_mm,
2227 addr & huge_page_mask(h));
2226 pte_t pte; 2228 pte_t pte;
2227 2229
2228 if (!ptep) 2230 if (!ptep)
diff --git a/mm/migrate.c b/mm/migrate.c
index 55bd355d170d..2a80136b23bb 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -30,6 +30,7 @@
30#include <linux/vmalloc.h> 30#include <linux/vmalloc.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/syscalls.h>
33 34
34#include "internal.h" 35#include "internal.h"
35 36
@@ -284,7 +285,15 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
284 285
285 page = migration_entry_to_page(entry); 286 page = migration_entry_to_page(entry);
286 287
287 get_page(page); 288 /*
289 * Once radix-tree replacement of page migration started, page_count
290 * *must* be zero. And, we don't want to call wait_on_page_locked()
291 * against a page without get_page().
 292 * So we use get_page_unless_zero() here. Even if it fails, the page
 293 * fault will simply occur again.
294 */
295 if (!get_page_unless_zero(page))
296 goto out;
288 pte_unmap_unlock(ptep, ptl); 297 pte_unmap_unlock(ptep, ptl);
289 wait_on_page_locked(page); 298 wait_on_page_locked(page);
290 put_page(page); 299 put_page(page);
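
The get_page_unless_zero() change is the usual speculative-reference idiom: take the pin only if the page still has references, otherwise back off and let the caller retry (here, by simply taking the fault again). In isolation the idiom looks like this purely illustrative sketch:

#include <linux/mm.h>
#include <linux/pagemap.h>

static void example_wait_for_page(struct page *page)
{
	/* the page may already be on its way back to the allocator; only
	 * wait on it if we managed to take a real reference */
	if (!get_page_unless_zero(page))
		return;

	wait_on_page_locked(page);
	put_page(page);
}
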
@@ -304,6 +313,7 @@ out:
304static int migrate_page_move_mapping(struct address_space *mapping, 313static int migrate_page_move_mapping(struct address_space *mapping,
305 struct page *newpage, struct page *page) 314 struct page *newpage, struct page *page)
306{ 315{
316 int expected_count;
307 void **pslot; 317 void **pslot;
308 318
309 if (!mapping) { 319 if (!mapping) {
@@ -313,14 +323,20 @@ static int migrate_page_move_mapping(struct address_space *mapping,
313 return 0; 323 return 0;
314 } 324 }
315 325
316 write_lock_irq(&mapping->tree_lock); 326 spin_lock_irq(&mapping->tree_lock);
317 327
318 pslot = radix_tree_lookup_slot(&mapping->page_tree, 328 pslot = radix_tree_lookup_slot(&mapping->page_tree,
319 page_index(page)); 329 page_index(page));
320 330
321 if (page_count(page) != 2 + !!PagePrivate(page) || 331 expected_count = 2 + !!PagePrivate(page);
332 if (page_count(page) != expected_count ||
322 (struct page *)radix_tree_deref_slot(pslot) != page) { 333 (struct page *)radix_tree_deref_slot(pslot) != page) {
323 write_unlock_irq(&mapping->tree_lock); 334 spin_unlock_irq(&mapping->tree_lock);
335 return -EAGAIN;
336 }
337
338 if (!page_freeze_refs(page, expected_count)) {
339 spin_unlock_irq(&mapping->tree_lock);
324 return -EAGAIN; 340 return -EAGAIN;
325 } 341 }
326 342
@@ -337,6 +353,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
337 353
338 radix_tree_replace_slot(pslot, newpage); 354 radix_tree_replace_slot(pslot, newpage);
339 355
356 page_unfreeze_refs(page, expected_count);
340 /* 357 /*
341 * Drop cache reference from old page. 358 * Drop cache reference from old page.
342 * We know this isn't the last reference. 359 * We know this isn't the last reference.
@@ -356,7 +373,9 @@ static int migrate_page_move_mapping(struct address_space *mapping,
356 __dec_zone_page_state(page, NR_FILE_PAGES); 373 __dec_zone_page_state(page, NR_FILE_PAGES);
357 __inc_zone_page_state(newpage, NR_FILE_PAGES); 374 __inc_zone_page_state(newpage, NR_FILE_PAGES);
358 375
359 write_unlock_irq(&mapping->tree_lock); 376 spin_unlock_irq(&mapping->tree_lock);
377 if (!PageSwapCache(newpage))
378 mem_cgroup_uncharge_cache_page(page);
360 379
361 return 0; 380 return 0;
362} 381}
@@ -586,7 +605,7 @@ static int move_to_new_page(struct page *newpage, struct page *page)
586 * establishing additional references. We are the only one 605 * establishing additional references. We are the only one
587 * holding a reference to the new page at this point. 606 * holding a reference to the new page at this point.
588 */ 607 */
589 if (TestSetPageLocked(newpage)) 608 if (!trylock_page(newpage))
590 BUG(); 609 BUG();
591 610
592 /* Prepare mapping for the new page.*/ 611 /* Prepare mapping for the new page.*/
@@ -610,7 +629,6 @@ static int move_to_new_page(struct page *newpage, struct page *page)
610 rc = fallback_migrate_page(mapping, newpage, page); 629 rc = fallback_migrate_page(mapping, newpage, page);
611 630
612 if (!rc) { 631 if (!rc) {
613 mem_cgroup_page_migration(page, newpage);
614 remove_migration_ptes(page, newpage); 632 remove_migration_ptes(page, newpage);
615 } else 633 } else
616 newpage->mapping = NULL; 634 newpage->mapping = NULL;
@@ -640,8 +658,16 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
640 /* page was freed from under us. So we are done. */ 658 /* page was freed from under us. So we are done. */
641 goto move_newpage; 659 goto move_newpage;
642 660
661 charge = mem_cgroup_prepare_migration(page, newpage);
662 if (charge == -ENOMEM) {
663 rc = -ENOMEM;
664 goto move_newpage;
665 }
666 /* prepare cgroup just returns 0 or -ENOMEM */
667 BUG_ON(charge);
668
643 rc = -EAGAIN; 669 rc = -EAGAIN;
644 if (TestSetPageLocked(page)) { 670 if (!trylock_page(page)) {
645 if (!force) 671 if (!force)
646 goto move_newpage; 672 goto move_newpage;
647 lock_page(page); 673 lock_page(page);
@@ -691,19 +717,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
691 goto rcu_unlock; 717 goto rcu_unlock;
692 } 718 }
693 719
694 charge = mem_cgroup_prepare_migration(page);
695 /* Establish migration ptes or remove ptes */ 720 /* Establish migration ptes or remove ptes */
696 try_to_unmap(page, 1); 721 try_to_unmap(page, 1);
697 722
698 if (!page_mapped(page)) 723 if (!page_mapped(page))
699 rc = move_to_new_page(newpage, page); 724 rc = move_to_new_page(newpage, page);
700 725
701 if (rc) { 726 if (rc)
702 remove_migration_ptes(page, page); 727 remove_migration_ptes(page, page);
703 if (charge)
704 mem_cgroup_end_migration(page);
705 } else if (charge)
706 mem_cgroup_end_migration(newpage);
707rcu_unlock: 728rcu_unlock:
708 if (rcu_locked) 729 if (rcu_locked)
709 rcu_read_unlock(); 730 rcu_read_unlock();
@@ -724,6 +745,8 @@ unlock:
724 } 745 }
725 746
726move_newpage: 747move_newpage:
748 if (!charge)
749 mem_cgroup_end_migration(newpage);
727 /* 750 /*
728 * Move the new page to the LRU. If migration was not successful 751 * Move the new page to the LRU. If migration was not successful
729 * then this will free the page. 752 * then this will free the page.
@@ -1070,7 +1093,6 @@ out2:
1070 mmput(mm); 1093 mmput(mm);
1071 return err; 1094 return err;
1072} 1095}
1073#endif
1074 1096
1075/* 1097/*
1076 * Call migration functions in the vma_ops that may prepare 1098 * Call migration functions in the vma_ops that may prepare
@@ -1092,3 +1114,4 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1092 } 1114 }
1093 return err; 1115 return err;
1094} 1116}
1117#endif
diff --git a/mm/mlock.c b/mm/mlock.c
index 7b2656055d6a..01fbe93eff5c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -78,8 +78,6 @@ success:
78 78
79 mm->locked_vm -= pages; 79 mm->locked_vm -= pages;
80out: 80out:
81 if (ret == -ENOMEM)
82 ret = -EAGAIN;
83 return ret; 81 return ret;
84} 82}
85 83
diff --git a/mm/mm_init.c b/mm/mm_init.c
new file mode 100644
index 000000000000..936ef2efd892
--- /dev/null
+++ b/mm/mm_init.c
@@ -0,0 +1,152 @@
1/*
2 * mm_init.c - Memory initialisation verification and debugging
3 *
4 * Copyright 2008 IBM Corporation, 2008
5 * Author Mel Gorman <mel@csn.ul.ie>
6 *
7 */
8#include <linux/kernel.h>
9#include <linux/init.h>
10#include <linux/kobject.h>
11#include <linux/module.h>
12#include "internal.h"
13
14#ifdef CONFIG_DEBUG_MEMORY_INIT
15int __meminitdata mminit_loglevel;
16
17#ifndef SECTIONS_SHIFT
18#define SECTIONS_SHIFT 0
19#endif
20
21/* The zonelists are simply reported, validation is manual. */
22void mminit_verify_zonelist(void)
23{
24 int nid;
25
26 if (mminit_loglevel < MMINIT_VERIFY)
27 return;
28
29 for_each_online_node(nid) {
30 pg_data_t *pgdat = NODE_DATA(nid);
31 struct zone *zone;
32 struct zoneref *z;
33 struct zonelist *zonelist;
34 int i, listid, zoneid;
35
36 BUG_ON(MAX_ZONELISTS > 2);
37 for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
38
39 /* Identify the zone and nodelist */
40 zoneid = i % MAX_NR_ZONES;
41 listid = i / MAX_NR_ZONES;
42 zonelist = &pgdat->node_zonelists[listid];
43 zone = &pgdat->node_zones[zoneid];
44 if (!populated_zone(zone))
45 continue;
46
47 /* Print information about the zonelist */
48 printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
49 listid > 0 ? "thisnode" : "general", nid,
50 zone->name);
51
52 /* Iterate the zonelist */
53 for_each_zone_zonelist(zone, z, zonelist, zoneid) {
54#ifdef CONFIG_NUMA
55 printk(KERN_CONT "%d:%s ",
56 zone->node, zone->name);
57#else
58 printk(KERN_CONT "0:%s ", zone->name);
59#endif /* CONFIG_NUMA */
60 }
61 printk(KERN_CONT "\n");
62 }
63 }
64}
65
66void __init mminit_verify_pageflags_layout(void)
67{
68 int shift, width;
69 unsigned long or_mask, add_mask;
70
71 shift = 8 * sizeof(unsigned long);
72 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH;
73 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
74 "Section %d Node %d Zone %d Flags %d\n",
75 SECTIONS_WIDTH,
76 NODES_WIDTH,
77 ZONES_WIDTH,
78 NR_PAGEFLAGS);
79 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
80 "Section %d Node %d Zone %d\n",
81 SECTIONS_SHIFT,
82 NODES_SHIFT,
83 ZONES_SHIFT);
84 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets",
85 "Section %lu Node %lu Zone %lu\n",
86 (unsigned long)SECTIONS_PGSHIFT,
87 (unsigned long)NODES_PGSHIFT,
88 (unsigned long)ZONES_PGSHIFT);
89 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid",
90 "Zone ID: %lu -> %lu\n",
91 (unsigned long)ZONEID_PGOFF,
92 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT));
93 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
94 "location: %d -> %d unused %d -> %d flags %d -> %d\n",
95 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
96#ifdef NODE_NOT_IN_PAGE_FLAGS
97 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
98 "Node not in page flags");
99#endif
100
101 if (SECTIONS_WIDTH) {
102 shift -= SECTIONS_WIDTH;
103 BUG_ON(shift != SECTIONS_PGSHIFT);
104 }
105 if (NODES_WIDTH) {
106 shift -= NODES_WIDTH;
107 BUG_ON(shift != NODES_PGSHIFT);
108 }
109 if (ZONES_WIDTH) {
110 shift -= ZONES_WIDTH;
111 BUG_ON(shift != ZONES_PGSHIFT);
112 }
113
114 /* Check for bitmask overlaps */
115 or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
116 (NODES_MASK << NODES_PGSHIFT) |
117 (SECTIONS_MASK << SECTIONS_PGSHIFT);
118 add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
119 (NODES_MASK << NODES_PGSHIFT) +
120 (SECTIONS_MASK << SECTIONS_PGSHIFT);
121 BUG_ON(or_mask != add_mask);
122}
123
124void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone,
125 unsigned long nid, unsigned long pfn)
126{
127 BUG_ON(page_to_nid(page) != nid);
128 BUG_ON(page_zonenum(page) != zone);
129 BUG_ON(page_to_pfn(page) != pfn);
130}
131
132static __init int set_mminit_loglevel(char *str)
133{
134 get_option(&str, &mminit_loglevel);
135 return 0;
136}
137early_param("mminit_loglevel", set_mminit_loglevel);
138#endif /* CONFIG_DEBUG_MEMORY_INIT */
139
140struct kobject *mm_kobj;
141EXPORT_SYMBOL_GPL(mm_kobj);
142
143static int __init mm_sysfs_init(void)
144{
145 mm_kobj = kobject_create_and_add("mm", kernel_kobj);
146 if (!mm_kobj)
147 return -ENOMEM;
148
149 return 0;
150}
151
152__initcall(mm_sysfs_init);
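
The checks above only print when the kernel is booted with a sufficiently high mminit_loglevel (the early_param just before this). Other code under mm/ can hook into the same facility; a hypothetical addition, with the prefix string and function invented, would look like:

#include <linux/init.h>
#include "internal.h"	/* mminit_dprintk(), MMINIT_TRACE */

static void __init example_report_node(int nid, unsigned long present_pages)
{
	/* a no-op unless CONFIG_DEBUG_MEMORY_INIT and a high enough
	 * mminit_loglevel= are both in effect */
	mminit_dprintk(MMINIT_TRACE, "example_node",
		       "node %d has %lu present pages\n",
		       nid, present_pages);
}
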
diff --git a/mm/mmap.c b/mm/mmap.c
index 1d102b956fd8..339cf5c4d5d8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -26,12 +26,15 @@
26#include <linux/mount.h> 26#include <linux/mount.h>
27#include <linux/mempolicy.h> 27#include <linux/mempolicy.h>
28#include <linux/rmap.h> 28#include <linux/rmap.h>
29#include <linux/mmu_notifier.h>
29 30
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/cacheflush.h> 32#include <asm/cacheflush.h>
32#include <asm/tlb.h> 33#include <asm/tlb.h>
33#include <asm/mmu_context.h> 34#include <asm/mmu_context.h>
34 35
36#include "internal.h"
37
35#ifndef arch_mmap_check 38#ifndef arch_mmap_check
36#define arch_mmap_check(addr, len, flags) (0) 39#define arch_mmap_check(addr, len, flags) (0)
37#endif 40#endif
@@ -367,7 +370,7 @@ find_vma_prepare(struct mm_struct *mm, unsigned long addr,
367 if (vma_tmp->vm_end > addr) { 370 if (vma_tmp->vm_end > addr) {
368 vma = vma_tmp; 371 vma = vma_tmp;
369 if (vma_tmp->vm_start <= addr) 372 if (vma_tmp->vm_start <= addr)
370 return vma; 373 break;
371 __rb_link = &__rb_parent->rb_left; 374 __rb_link = &__rb_parent->rb_left;
372 } else { 375 } else {
373 rb_prev = __rb_parent; 376 rb_prev = __rb_parent;
@@ -1108,6 +1111,9 @@ munmap_back:
1108 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) 1111 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1109 return -ENOMEM; 1112 return -ENOMEM;
1110 1113
1114 if (flags & MAP_NORESERVE)
1115 vm_flags |= VM_NORESERVE;
1116
1111 if (accountable && (!(flags & MAP_NORESERVE) || 1117 if (accountable && (!(flags & MAP_NORESERVE) ||
1112 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { 1118 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
1113 if (vm_flags & VM_SHARED) { 1119 if (vm_flags & VM_SHARED) {
@@ -1763,7 +1769,7 @@ static void unmap_region(struct mm_struct *mm,
1763 update_hiwater_rss(mm); 1769 update_hiwater_rss(mm);
1764 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); 1770 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1765 vm_unacct_memory(nr_accounted); 1771 vm_unacct_memory(nr_accounted);
1766 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, 1772 free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
1767 next? next->vm_start: 0); 1773 next? next->vm_start: 0);
1768 tlb_finish_mmu(tlb, start, end); 1774 tlb_finish_mmu(tlb, start, end);
1769} 1775}
@@ -1807,7 +1813,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1807 struct mempolicy *pol; 1813 struct mempolicy *pol;
1808 struct vm_area_struct *new; 1814 struct vm_area_struct *new;
1809 1815
1810 if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) 1816 if (is_vm_hugetlb_page(vma) && (addr &
1817 ~(huge_page_mask(hstate_vma(vma)))))
1811 return -EINVAL; 1818 return -EINVAL;
1812 1819
1813 if (mm->map_count >= sysctl_max_map_count) 1820 if (mm->map_count >= sysctl_max_map_count)
@@ -2055,6 +2062,7 @@ void exit_mmap(struct mm_struct *mm)
2055 2062
2056 /* mm's last user has gone, and it's about to be pulled down */ 2063 /* mm's last user has gone, and it's about to be pulled down */
2057 arch_exit_mmap(mm); 2064 arch_exit_mmap(mm);
2065 mmu_notifier_release(mm);
2058 2066
2059 lru_add_drain(); 2067 lru_add_drain();
2060 flush_cache_mm(mm); 2068 flush_cache_mm(mm);
@@ -2063,7 +2071,7 @@ void exit_mmap(struct mm_struct *mm)
2063 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2071 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2064 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2072 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2065 vm_unacct_memory(nr_accounted); 2073 vm_unacct_memory(nr_accounted);
2066 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 2074 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
2067 tlb_finish_mmu(tlb, 0, end); 2075 tlb_finish_mmu(tlb, 0, end);
2068 2076
2069 /* 2077 /*
@@ -2262,3 +2270,167 @@ int install_special_mapping(struct mm_struct *mm,
2262 2270
2263 return 0; 2271 return 0;
2264} 2272}
2273
2274static DEFINE_MUTEX(mm_all_locks_mutex);
2275
2276static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2277{
2278 if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
2279 /*
2280 * The LSB of head.next can't change from under us
2281 * because we hold the mm_all_locks_mutex.
2282 */
2283 spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
2284 /*
2285 * We can safely modify head.next after taking the
2286 * anon_vma->lock. If some other vma in this mm shares
2287 * the same anon_vma we won't take it again.
2288 *
2289 * No need of atomic instructions here, head.next
2290 * can't change from under us thanks to the
2291 * anon_vma->lock.
2292 */
2293 if (__test_and_set_bit(0, (unsigned long *)
2294 &anon_vma->head.next))
2295 BUG();
2296 }
2297}
2298
2299static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
2300{
2301 if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2302 /*
2303 * AS_MM_ALL_LOCKS can't change from under us because
2304 * we hold the mm_all_locks_mutex.
2305 *
2306 * Operations on ->flags have to be atomic because
2307 * even if AS_MM_ALL_LOCKS is stable thanks to the
2308 * mm_all_locks_mutex, there may be other cpus
2309 * changing other bitflags in parallel to us.
2310 */
2311 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
2312 BUG();
2313 spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
2314 }
2315}
2316
2317/*
2318 * This operation locks against the VM for all pte/vma/mm related
2319 * operations that could ever happen on a certain mm. This includes
2320 * vmtruncate, try_to_unmap, and all page faults.
2321 *
2322 * The caller must take the mmap_sem in write mode before calling
2323 * mm_take_all_locks(). The caller isn't allowed to release the
2324 * mmap_sem until mm_drop_all_locks() returns.
2325 *
2326 * mmap_sem in write mode is required in order to block all operations
2327 * that could modify pagetables and free pages without need of
2328 * altering the vma layout (for example populate_range() with
2329 * nonlinear vmas). It's also needed in write mode to avoid new
2330 * anon_vmas being associated with existing vmas.
2331 *
2332 * A single task can't take more than one mm_take_all_locks() in a row
2333 * or it would deadlock.
2334 *
2335 * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
2336 * mapping->flags avoid taking the same lock twice, if more than one
2337 * vma in this mm is backed by the same anon_vma or address_space.
2338 *
2339 * We can take all the locks in random order because the VM code
2340 * taking i_mmap_lock or anon_vma->lock outside the mmap_sem never
2341 * takes more than one of them in a row. Secondly we're protected
2342 * against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
2343 *
2344 * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
2345 * that may have to take thousands of locks.
2346 *
2347 * mm_take_all_locks() can fail if it's interrupted by signals.
2348 */
2349int mm_take_all_locks(struct mm_struct *mm)
2350{
2351 struct vm_area_struct *vma;
2352 int ret = -EINTR;
2353
2354 BUG_ON(down_read_trylock(&mm->mmap_sem));
2355
2356 mutex_lock(&mm_all_locks_mutex);
2357
2358 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2359 if (signal_pending(current))
2360 goto out_unlock;
2361 if (vma->vm_file && vma->vm_file->f_mapping)
2362 vm_lock_mapping(mm, vma->vm_file->f_mapping);
2363 }
2364
2365 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2366 if (signal_pending(current))
2367 goto out_unlock;
2368 if (vma->anon_vma)
2369 vm_lock_anon_vma(mm, vma->anon_vma);
2370 }
2371
2372 ret = 0;
2373
2374out_unlock:
2375 if (ret)
2376 mm_drop_all_locks(mm);
2377
2378 return ret;
2379}
2380
2381static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2382{
2383 if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
2384 /*
2385 * The LSB of head.next can't change to 0 from under
2386 * us because we hold the mm_all_locks_mutex.
2387 *
2388 * We must however clear the bitflag before unlocking
2389 * the vma so the users using the anon_vma->head will
2390 * never see our bitflag.
2391 *
2392 * No need of atomic instructions here, head.next
2393 * can't change from under us until we release the
2394 * anon_vma->lock.
2395 */
2396 if (!__test_and_clear_bit(0, (unsigned long *)
2397 &anon_vma->head.next))
2398 BUG();
2399 spin_unlock(&anon_vma->lock);
2400 }
2401}
2402
2403static void vm_unlock_mapping(struct address_space *mapping)
2404{
2405 if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
2406 /*
2407 * AS_MM_ALL_LOCKS can't change to 0 from under us
2408 * because we hold the mm_all_locks_mutex.
2409 */
2410 spin_unlock(&mapping->i_mmap_lock);
2411 if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
2412 &mapping->flags))
2413 BUG();
2414 }
2415}
2416
2417/*
2418 * The mmap_sem cannot be released by the caller until
2419 * mm_drop_all_locks() returns.
2420 */
2421void mm_drop_all_locks(struct mm_struct *mm)
2422{
2423 struct vm_area_struct *vma;
2424
2425 BUG_ON(down_read_trylock(&mm->mmap_sem));
2426 BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
2427
2428 for (vma = mm->mmap; vma; vma = vma->vm_next) {
2429 if (vma->anon_vma)
2430 vm_unlock_anon_vma(vma->anon_vma);
2431 if (vma->vm_file && vma->vm_file->f_mapping)
2432 vm_unlock_mapping(vma->vm_file->f_mapping);
2433 }
2434
2435 mutex_unlock(&mm_all_locks_mutex);
2436}
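
For reference, a minimal sketch of the calling convention documented above (the wrapper name is hypothetical, and error handling is reduced to the -EINTR case):

	/* Sketch only: the caller must already hold a reference on mm. */
	static int with_all_vm_locks(struct mm_struct *mm)
	{
		int ret;

		down_write(&mm->mmap_sem);
		ret = mm_take_all_locks(mm);	/* fails with -EINTR on a pending signal */
		if (!ret) {
			/*
			 * Page faults, vmtruncate and try_to_unmap are all
			 * excluded here, so mm-wide state can be changed.
			 */
			mm_drop_all_locks(mm);
		}
		up_write(&mm->mmap_sem);
		return ret;
	}

This mirrors what do_mmu_notifier_register() below does around its notifier-list update.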
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
new file mode 100644
index 000000000000..5f4ef0250bee
--- /dev/null
+++ b/mm/mmu_notifier.c
@@ -0,0 +1,277 @@
1/*
2 * linux/mm/mmu_notifier.c
3 *
4 * Copyright (C) 2008 Qumranet, Inc.
5 * Copyright (C) 2008 SGI
6 * Christoph Lameter <clameter@sgi.com>
7 *
8 * This work is licensed under the terms of the GNU GPL, version 2. See
9 * the COPYING file in the top-level directory.
10 */
11
12#include <linux/rculist.h>
13#include <linux/mmu_notifier.h>
14#include <linux/module.h>
15#include <linux/mm.h>
16#include <linux/err.h>
17#include <linux/rcupdate.h>
18#include <linux/sched.h>
19
20/*
21 * This function can't run concurrently against mmu_notifier_register
22 * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
23 * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
24 * in parallel despite there being no task using this mm any more,
25 * through the vmas outside of the exit_mmap context, such as with
26 * vmtruncate. This serializes against mmu_notifier_unregister with
27 * the mmu_notifier_mm->lock in addition to RCU and it serializes
28 * against the other mmu notifiers with RCU. struct mmu_notifier_mm
29 * can't go away from under us as exit_mmap holds an mm_count pin
30 * itself.
31 */
32void __mmu_notifier_release(struct mm_struct *mm)
33{
34 struct mmu_notifier *mn;
35
36 spin_lock(&mm->mmu_notifier_mm->lock);
37 while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) {
38 mn = hlist_entry(mm->mmu_notifier_mm->list.first,
39 struct mmu_notifier,
40 hlist);
41 /*
42 * We arrived before mmu_notifier_unregister so
43 * mmu_notifier_unregister will do nothing other than
44 * wait for ->release to finish and then
45 * return.
46 */
47 hlist_del_init_rcu(&mn->hlist);
48 /*
49 * RCU here will block mmu_notifier_unregister until
50 * ->release returns.
51 */
52 rcu_read_lock();
53 spin_unlock(&mm->mmu_notifier_mm->lock);
54 /*
55 * if ->release runs before mmu_notifier_unregister it
56 * must be handled as it's the only way for the driver
57 * to flush all existing sptes and stop the driver
58 * from establishing any more sptes before all the
59 * pages in the mm are freed.
60 */
61 if (mn->ops->release)
62 mn->ops->release(mn, mm);
63 rcu_read_unlock();
64 spin_lock(&mm->mmu_notifier_mm->lock);
65 }
66 spin_unlock(&mm->mmu_notifier_mm->lock);
67
68 /*
69 * synchronize_rcu here prevents mmu_notifier_release from
70 * returning to exit_mmap (which would proceed to free all pages
71 * in the mm) until the ->release method returns, if it was
72 * invoked by mmu_notifier_unregister.
73 *
74 * The mmu_notifier_mm can't go away from under us because one
75 * mm_count is held by exit_mmap.
76 */
77 synchronize_rcu();
78}
79
80/*
81 * If no young bitflag is supported by the hardware, ->clear_flush_young can
82 * unmap the address and return 1 or 0 depending on whether the mapping
83 * previously existed or not.
84 */
85int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
86 unsigned long address)
87{
88 struct mmu_notifier *mn;
89 struct hlist_node *n;
90 int young = 0;
91
92 rcu_read_lock();
93 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
94 if (mn->ops->clear_flush_young)
95 young |= mn->ops->clear_flush_young(mn, mm, address);
96 }
97 rcu_read_unlock();
98
99 return young;
100}
101
102void __mmu_notifier_invalidate_page(struct mm_struct *mm,
103 unsigned long address)
104{
105 struct mmu_notifier *mn;
106 struct hlist_node *n;
107
108 rcu_read_lock();
109 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
110 if (mn->ops->invalidate_page)
111 mn->ops->invalidate_page(mn, mm, address);
112 }
113 rcu_read_unlock();
114}
115
116void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
117 unsigned long start, unsigned long end)
118{
119 struct mmu_notifier *mn;
120 struct hlist_node *n;
121
122 rcu_read_lock();
123 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
124 if (mn->ops->invalidate_range_start)
125 mn->ops->invalidate_range_start(mn, mm, start, end);
126 }
127 rcu_read_unlock();
128}
129
130void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
131 unsigned long start, unsigned long end)
132{
133 struct mmu_notifier *mn;
134 struct hlist_node *n;
135
136 rcu_read_lock();
137 hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) {
138 if (mn->ops->invalidate_range_end)
139 mn->ops->invalidate_range_end(mn, mm, start, end);
140 }
141 rcu_read_unlock();
142}
143
144static int do_mmu_notifier_register(struct mmu_notifier *mn,
145 struct mm_struct *mm,
146 int take_mmap_sem)
147{
148 struct mmu_notifier_mm *mmu_notifier_mm;
149 int ret;
150
151 BUG_ON(atomic_read(&mm->mm_users) <= 0);
152
153 ret = -ENOMEM;
154 mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
155 if (unlikely(!mmu_notifier_mm))
156 goto out;
157
158 if (take_mmap_sem)
159 down_write(&mm->mmap_sem);
160 ret = mm_take_all_locks(mm);
161 if (unlikely(ret))
162 goto out_cleanup;
163
164 if (!mm_has_notifiers(mm)) {
165 INIT_HLIST_HEAD(&mmu_notifier_mm->list);
166 spin_lock_init(&mmu_notifier_mm->lock);
167 mm->mmu_notifier_mm = mmu_notifier_mm;
168 mmu_notifier_mm = NULL;
169 }
170 atomic_inc(&mm->mm_count);
171
172 /*
173 * Serialize the update against mmu_notifier_unregister. A
174 * side note: mmu_notifier_release can't run concurrently with
175 * us because we hold the mm_users pin (either implicitly as
176 * current->mm or explicitly with get_task_mm() or similar).
177 * We can't race against any other mmu notifier method either
178 * thanks to mm_take_all_locks().
179 */
180 spin_lock(&mm->mmu_notifier_mm->lock);
181 hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
182 spin_unlock(&mm->mmu_notifier_mm->lock);
183
184 mm_drop_all_locks(mm);
185out_cleanup:
186 if (take_mmap_sem)
187 up_write(&mm->mmap_sem);
188 /* kfree() does nothing if mmu_notifier_mm is NULL */
189 kfree(mmu_notifier_mm);
190out:
191 BUG_ON(atomic_read(&mm->mm_users) <= 0);
192 return ret;
193}
194
195/*
196 * Must not hold mmap_sem nor any other VM related lock when calling
197 * this registration function. Must also ensure mm_users can't go down
198 * to zero while this runs to avoid races with mmu_notifier_release,
199 * so mm has to be current->mm or the mm should be pinned safely such
200 * as with get_task_mm(). If the mm is not current->mm, the mm_users
201 * pin should be released by calling mmput after mmu_notifier_register
202 * returns. mmu_notifier_unregister must be always called to
203 * unregister the notifier. mm_count is automatically pinned to allow
204 * mmu_notifier_unregister to safely run at any time later, before or
205 * after exit_mmap. ->release will always be called before exit_mmap
206 * frees the pages.
207 */
208int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
209{
210 return do_mmu_notifier_register(mn, mm, 1);
211}
212EXPORT_SYMBOL_GPL(mmu_notifier_register);
213
214/*
215 * Same as mmu_notifier_register but here the caller must hold the
216 * mmap_sem in write mode.
217 */
218int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
219{
220 return do_mmu_notifier_register(mn, mm, 0);
221}
222EXPORT_SYMBOL_GPL(__mmu_notifier_register);
223
224/* this is called after the last mmu_notifier_unregister() returned */
225void __mmu_notifier_mm_destroy(struct mm_struct *mm)
226{
227 BUG_ON(!hlist_empty(&mm->mmu_notifier_mm->list));
228 kfree(mm->mmu_notifier_mm);
229 mm->mmu_notifier_mm = LIST_POISON1; /* debug */
230}
231
232/*
233 * This releases the mm_count pin automatically and frees the mm
234 * structure if it was the last user of it. It serializes against
235 * running mmu notifiers with RCU and against mmu_notifier_unregister
236 * with the unregister lock + RCU. All sptes must be dropped before
237 * calling mmu_notifier_unregister. ->release or any other notifier
238 * method may be invoked concurrently with mmu_notifier_unregister,
239 * and only after mmu_notifier_unregister has returned are we guaranteed
240 * that ->release or any other method can't run anymore.
241 */
242void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
243{
244 BUG_ON(atomic_read(&mm->mm_count) <= 0);
245
246 spin_lock(&mm->mmu_notifier_mm->lock);
247 if (!hlist_unhashed(&mn->hlist)) {
248 hlist_del_rcu(&mn->hlist);
249
250 /*
251 * RCU here will force exit_mmap to wait for ->release to finish
252 * before freeing the pages.
253 */
254 rcu_read_lock();
255 spin_unlock(&mm->mmu_notifier_mm->lock);
256 /*
257 * exit_mmap will block in mmu_notifier_release to
258 * guarantee ->release is called before freeing the
259 * pages.
260 */
261 if (mn->ops->release)
262 mn->ops->release(mn, mm);
263 rcu_read_unlock();
264 } else
265 spin_unlock(&mm->mmu_notifier_mm->lock);
266
267 /*
268 * Wait for any running method to finish, including
269 * ->release if it was run by mmu_notifier_release instead of us.
270 */
271 synchronize_rcu();
272
273 BUG_ON(atomic_read(&mm->mm_count) <= 0);
274
275 mmdrop(mm);
276}
277EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
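
A hedged driver-side sketch of this interface, with the ops layout inferred from the calls above (->release, ->clear_flush_young, ->invalidate_page, ->invalidate_range_start/end); the my_* names are placeholders, not part of this patch:

	static void my_release(struct mmu_notifier *mn, struct mm_struct *mm)
	{
		/* drop every secondary-MMU mapping (spte) for this mm */
	}

	static void my_range_start(struct mmu_notifier *mn, struct mm_struct *mm,
				   unsigned long start, unsigned long end)
	{
		/* shoot down secondary mappings covering [start, end) */
	}

	static const struct mmu_notifier_ops my_ops = {
		.release		= my_release,
		.invalidate_range_start	= my_range_start,
		/* .invalidate_range_end, .invalidate_page, .clear_flush_young ... */
	};

	static struct mmu_notifier my_mn = { .ops = &my_ops };

	static int my_attach(void)
	{
		/* current->mm carries the required mm_users pin implicitly */
		return mmu_notifier_register(&my_mn, current->mm);
	}

Teardown is the reverse: mmu_notifier_unregister(&my_mn, mm) once all sptes are gone, which also drops the mm_count pin taken at registration time.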
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 360d9cc8b38c..fded06f923f4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -21,6 +21,7 @@
21#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22#include <linux/swap.h> 22#include <linux/swap.h>
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h>
24#include <asm/uaccess.h> 25#include <asm/uaccess.h>
25#include <asm/pgtable.h> 26#include <asm/pgtable.h>
26#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
@@ -153,12 +154,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
153 * If we make a private mapping writable we increase our commit; 154 * If we make a private mapping writable we increase our commit;
154 * but (without finer accounting) cannot reduce our commit if we 155 * but (without finer accounting) cannot reduce our commit if we
155 * make it unwritable again. 156 * make it unwritable again.
156 *
157 * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
158 * a MAP_NORESERVE private mapping to writable will now reserve.
159 */ 157 */
160 if (newflags & VM_WRITE) { 158 if (newflags & VM_WRITE) {
161 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { 159 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|
160 VM_SHARED|VM_NORESERVE))) {
162 charged = nrpages; 161 charged = nrpages;
163 if (security_vm_enough_memory(charged)) 162 if (security_vm_enough_memory(charged))
164 return -ENOMEM; 163 return -ENOMEM;
@@ -205,10 +204,12 @@ success:
205 dirty_accountable = 1; 204 dirty_accountable = 1;
206 } 205 }
207 206
207 mmu_notifier_invalidate_range_start(mm, start, end);
208 if (is_vm_hugetlb_page(vma)) 208 if (is_vm_hugetlb_page(vma))
209 hugetlb_change_protection(vma, start, end, vma->vm_page_prot); 209 hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
210 else 210 else
211 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); 211 change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
212 mmu_notifier_invalidate_range_end(mm, start, end);
212 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 213 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
213 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 214 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
214 return 0; 215 return 0;
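
The pattern here generalizes to every site in this series that rewrites PTEs: bracket the batch with a start/end pair so a secondary MMU can tear down its mappings before the primary PTEs change and re-fault them afterwards. A caller-side sketch (the helper name is made up):

	static void rewrite_range(struct mm_struct *mm,
				  unsigned long start, unsigned long end)
	{
		mmu_notifier_invalidate_range_start(mm, start, end);
		/* ... walk the page tables and modify the PTEs in [start, end) ... */
		mmu_notifier_invalidate_range_end(mm, start, end);
	}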
diff --git a/mm/mremap.c b/mm/mremap.c
index 08e3c7f2bd15..1a7743923c8c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -18,6 +18,7 @@
18#include <linux/highmem.h> 18#include <linux/highmem.h>
19#include <linux/security.h> 19#include <linux/security.h>
20#include <linux/syscalls.h> 20#include <linux/syscalls.h>
21#include <linux/mmu_notifier.h>
21 22
22#include <asm/uaccess.h> 23#include <asm/uaccess.h>
23#include <asm/cacheflush.h> 24#include <asm/cacheflush.h>
@@ -74,7 +75,11 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
74 struct mm_struct *mm = vma->vm_mm; 75 struct mm_struct *mm = vma->vm_mm;
75 pte_t *old_pte, *new_pte, pte; 76 pte_t *old_pte, *new_pte, pte;
76 spinlock_t *old_ptl, *new_ptl; 77 spinlock_t *old_ptl, *new_ptl;
78 unsigned long old_start;
77 79
80 old_start = old_addr;
81 mmu_notifier_invalidate_range_start(vma->vm_mm,
82 old_start, old_end);
78 if (vma->vm_file) { 83 if (vma->vm_file) {
79 /* 84 /*
80 * Subtle point from Rajesh Venkatasubramanian: before 85 * Subtle point from Rajesh Venkatasubramanian: before
@@ -116,6 +121,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
116 pte_unmap_unlock(old_pte - 1, old_ptl); 121 pte_unmap_unlock(old_pte - 1, old_ptl);
117 if (mapping) 122 if (mapping)
118 spin_unlock(&mapping->i_mmap_lock); 123 spin_unlock(&mapping->i_mmap_lock);
124 mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
119} 125}
120 126
121#define LATENCY_LIMIT (64 * PAGE_SIZE) 127#define LATENCY_LIMIT (64 * PAGE_SIZE)
diff --git a/mm/nommu.c b/mm/nommu.c
index 4462b6a3fcb9..ed75bc962fbe 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -22,7 +22,7 @@
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/vmalloc.h> 24#include <linux/vmalloc.h>
25#include <linux/ptrace.h> 25#include <linux/tracehook.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/mount.h> 28#include <linux/mount.h>
@@ -266,6 +266,27 @@ void *vmalloc_node(unsigned long size, int node)
266} 266}
267EXPORT_SYMBOL(vmalloc_node); 267EXPORT_SYMBOL(vmalloc_node);
268 268
269#ifndef PAGE_KERNEL_EXEC
270# define PAGE_KERNEL_EXEC PAGE_KERNEL
271#endif
272
273/**
274 * vmalloc_exec - allocate virtually contiguous, executable memory
275 * @size: allocation size
276 *
277 * Kernel-internal function to allocate enough pages to cover @size
278 * from the page level allocator and map them into contiguous and
279 * executable kernel virtual space.
280 *
281 * For tight control over page level allocator and protection flags
282 * use __vmalloc() instead.
283 */
284
285void *vmalloc_exec(unsigned long size)
286{
287 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
288}
289
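
On nommu this is simply __vmalloc() with an executable protection, falling back to PAGE_KERNEL where the architecture defines no PAGE_KERNEL_EXEC. A hedged usage sketch, with copy_exec() as a made-up helper:

	static void *copy_exec(const void *code, size_t len)
	{
		void *buf = vmalloc_exec(len);

		if (buf) {
			memcpy(buf, code, len);
			flush_icache_range((unsigned long)buf,
					   (unsigned long)buf + len);
		}
		return buf;	/* caller executes it, then vfree()s it */
	}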
269/** 290/**
270 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) 291 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
271 * @size: allocation size 292 * @size: allocation size
@@ -745,7 +766,7 @@ static unsigned long determine_vm_flags(struct file *file,
745 * it's being traced - otherwise breakpoints set in it may interfere 766 * it's being traced - otherwise breakpoints set in it may interfere
746 * with another untraced process 767 * with another untraced process
747 */ 768 */
748 if ((flags & MAP_PRIVATE) && (current->ptrace & PT_PTRACED)) 769 if ((flags & MAP_PRIVATE) && tracehook_expect_breakpoints(current))
749 vm_flags &= ~VM_MAYSHARE; 770 vm_flags &= ~VM_MAYSHARE;
750 771
751 return vm_flags; 772 return vm_flags;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 94c6d8988ab3..24de8b65fdbd 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1088,7 +1088,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1088 if (!mapping) 1088 if (!mapping)
1089 return 1; 1089 return 1;
1090 1090
1091 write_lock_irq(&mapping->tree_lock); 1091 spin_lock_irq(&mapping->tree_lock);
1092 mapping2 = page_mapping(page); 1092 mapping2 = page_mapping(page);
1093 if (mapping2) { /* Race with truncate? */ 1093 if (mapping2) { /* Race with truncate? */
1094 BUG_ON(mapping2 != mapping); 1094 BUG_ON(mapping2 != mapping);
@@ -1102,7 +1102,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1102 radix_tree_tag_set(&mapping->page_tree, 1102 radix_tree_tag_set(&mapping->page_tree,
1103 page_index(page), PAGECACHE_TAG_DIRTY); 1103 page_index(page), PAGECACHE_TAG_DIRTY);
1104 } 1104 }
1105 write_unlock_irq(&mapping->tree_lock); 1105 spin_unlock_irq(&mapping->tree_lock);
1106 if (mapping->host) { 1106 if (mapping->host) {
1107 /* !PageAnon && !swapper_space */ 1107 /* !PageAnon && !swapper_space */
1108 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 1108 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -1258,7 +1258,7 @@ int test_clear_page_writeback(struct page *page)
1258 struct backing_dev_info *bdi = mapping->backing_dev_info; 1258 struct backing_dev_info *bdi = mapping->backing_dev_info;
1259 unsigned long flags; 1259 unsigned long flags;
1260 1260
1261 write_lock_irqsave(&mapping->tree_lock, flags); 1261 spin_lock_irqsave(&mapping->tree_lock, flags);
1262 ret = TestClearPageWriteback(page); 1262 ret = TestClearPageWriteback(page);
1263 if (ret) { 1263 if (ret) {
1264 radix_tree_tag_clear(&mapping->page_tree, 1264 radix_tree_tag_clear(&mapping->page_tree,
@@ -1269,7 +1269,7 @@ int test_clear_page_writeback(struct page *page)
1269 __bdi_writeout_inc(bdi); 1269 __bdi_writeout_inc(bdi);
1270 } 1270 }
1271 } 1271 }
1272 write_unlock_irqrestore(&mapping->tree_lock, flags); 1272 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1273 } else { 1273 } else {
1274 ret = TestClearPageWriteback(page); 1274 ret = TestClearPageWriteback(page);
1275 } 1275 }
@@ -1287,7 +1287,7 @@ int test_set_page_writeback(struct page *page)
1287 struct backing_dev_info *bdi = mapping->backing_dev_info; 1287 struct backing_dev_info *bdi = mapping->backing_dev_info;
1288 unsigned long flags; 1288 unsigned long flags;
1289 1289
1290 write_lock_irqsave(&mapping->tree_lock, flags); 1290 spin_lock_irqsave(&mapping->tree_lock, flags);
1291 ret = TestSetPageWriteback(page); 1291 ret = TestSetPageWriteback(page);
1292 if (!ret) { 1292 if (!ret) {
1293 radix_tree_tag_set(&mapping->page_tree, 1293 radix_tree_tag_set(&mapping->page_tree,
@@ -1300,7 +1300,7 @@ int test_set_page_writeback(struct page *page)
1300 radix_tree_tag_clear(&mapping->page_tree, 1300 radix_tree_tag_clear(&mapping->page_tree,
1301 page_index(page), 1301 page_index(page),
1302 PAGECACHE_TAG_DIRTY); 1302 PAGECACHE_TAG_DIRTY);
1303 write_unlock_irqrestore(&mapping->tree_lock, flags); 1303 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1304 } else { 1304 } else {
1305 ret = TestSetPageWriteback(page); 1305 ret = TestSetPageWriteback(page);
1306 } 1306 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 79ac4afc908c..af982f7cdb2a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -153,9 +153,9 @@ static unsigned long __meminitdata dma_reserve;
153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; 153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; 154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
156 unsigned long __initdata required_kernelcore; 156 static unsigned long __initdata required_kernelcore;
157 static unsigned long __initdata required_movablecore; 157 static unsigned long __initdata required_movablecore;
158 unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 158 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
159 159
160 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 160 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
161 int movable_zone; 161 int movable_zone;
@@ -264,7 +264,7 @@ static void free_compound_page(struct page *page)
264 __free_pages_ok(page, compound_order(page)); 264 __free_pages_ok(page, compound_order(page));
265} 265}
266 266
267static void prep_compound_page(struct page *page, unsigned long order) 267void prep_compound_page(struct page *page, unsigned long order)
268{ 268{
269 int i; 269 int i;
270 int nr_pages = 1 << order; 270 int nr_pages = 1 << order;
@@ -432,8 +432,9 @@ static inline void __free_one_page(struct page *page,
432 432
433 buddy = __page_find_buddy(page, page_idx, order); 433 buddy = __page_find_buddy(page, page_idx, order);
434 if (!page_is_buddy(page, buddy, order)) 434 if (!page_is_buddy(page, buddy, order))
435 break; /* Move the buddy up one level. */ 435 break;
436 436
437 /* Our buddy is free, merge with it and move up one order. */
437 list_del(&buddy->lru); 438 list_del(&buddy->lru);
438 zone->free_area[order].nr_free--; 439 zone->free_area[order].nr_free--;
439 rmv_page_order(buddy); 440 rmv_page_order(buddy);
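
The merge relies on the buddy invariant that a free block of a given order has exactly one buddy, reachable by flipping a single bit of its page index, which is what __page_find_buddy() computes. An illustration of that index arithmetic (a simplified stand-in, not the kernel helper itself):

	static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
	{
		return page_idx ^ (1UL << order);	/* flip the order'th bit */
	}

	/*
	 * Example: page_idx 8 at order 2 has buddy 12; once merged, the
	 * combined order-3 block starts at 8 (page_idx with that bit cleared).
	 */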
@@ -532,7 +533,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
532/* 533/*
533 * permit the bootmem allocator to evade page validation on high-order frees 534 * permit the bootmem allocator to evade page validation on high-order frees
534 */ 535 */
535void __free_pages_bootmem(struct page *page, unsigned int order) 536void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
536{ 537{
537 if (order == 0) { 538 if (order == 0) {
538 __ClearPageReserved(page); 539 __ClearPageReserved(page);
@@ -673,9 +674,9 @@ static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
673 * Note that start_page and end_pages are not aligned on a pageblock 674 * Note that start_page and end_pages are not aligned on a pageblock
674 * boundary. If alignment is required, use move_freepages_block() 675 * boundary. If alignment is required, use move_freepages_block()
675 */ 676 */
676int move_freepages(struct zone *zone, 677static int move_freepages(struct zone *zone,
677 struct page *start_page, struct page *end_page, 678 struct page *start_page, struct page *end_page,
678 int migratetype) 679 int migratetype)
679{ 680{
680 struct page *page; 681 struct page *page;
681 unsigned long order; 682 unsigned long order;
@@ -714,7 +715,8 @@ int move_freepages(struct zone *zone,
714 return pages_moved; 715 return pages_moved;
715} 716}
716 717
717int move_freepages_block(struct zone *zone, struct page *page, int migratetype) 718static int move_freepages_block(struct zone *zone, struct page *page,
719 int migratetype)
718{ 720{
719 unsigned long start_pfn, end_pfn; 721 unsigned long start_pfn, end_pfn;
720 struct page *start_page, *end_page; 722 struct page *start_page, *end_page;
@@ -1429,7 +1431,7 @@ try_next_zone:
1429/* 1431/*
1430 * This is the 'heart' of the zoned buddy allocator. 1432 * This is the 'heart' of the zoned buddy allocator.
1431 */ 1433 */
1432static struct page * 1434struct page *
1433__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1435__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
1434 struct zonelist *zonelist, nodemask_t *nodemask) 1436 struct zonelist *zonelist, nodemask_t *nodemask)
1435{ 1437{
@@ -1632,22 +1634,7 @@ nopage:
1632got_pg: 1634got_pg:
1633 return page; 1635 return page;
1634} 1636}
1635 1637EXPORT_SYMBOL(__alloc_pages_internal);
1636struct page *
1637__alloc_pages(gfp_t gfp_mask, unsigned int order,
1638 struct zonelist *zonelist)
1639{
1640 return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
1641}
1642
1643struct page *
1644__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1645 struct zonelist *zonelist, nodemask_t *nodemask)
1646{
1647 return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
1648}
1649
1650EXPORT_SYMBOL(__alloc_pages);
1651 1638
1652/* 1639/*
1653 * Common helper functions. 1640 * Common helper functions.
@@ -1711,6 +1698,59 @@ void free_pages(unsigned long addr, unsigned int order)
1711 1698
1712EXPORT_SYMBOL(free_pages); 1699EXPORT_SYMBOL(free_pages);
1713 1700
1701/**
1702 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
1703 * @size: the number of bytes to allocate
1704 * @gfp_mask: GFP flags for the allocation
1705 *
1706 * This function is similar to alloc_pages(), except that it allocates the
1707 * minimum number of pages to satisfy the request. alloc_pages() can only
1708 * allocate memory in power-of-two pages.
1709 *
1710 * This function is also limited by MAX_ORDER.
1711 *
1712 * Memory allocated by this function must be released by free_pages_exact().
1713 */
1714void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
1715{
1716 unsigned int order = get_order(size);
1717 unsigned long addr;
1718
1719 addr = __get_free_pages(gfp_mask, order);
1720 if (addr) {
1721 unsigned long alloc_end = addr + (PAGE_SIZE << order);
1722 unsigned long used = addr + PAGE_ALIGN(size);
1723
1724 split_page(virt_to_page(addr), order);
1725 while (used < alloc_end) {
1726 free_page(used);
1727 used += PAGE_SIZE;
1728 }
1729 }
1730
1731 return (void *)addr;
1732}
1733EXPORT_SYMBOL(alloc_pages_exact);
1734
1735/**
1736 * free_pages_exact - release memory allocated via alloc_pages_exact()
1737 * @virt: the value returned by alloc_pages_exact.
1738 * @size: size of allocation, same value as passed to alloc_pages_exact().
1739 *
1740 * Release the memory allocated by a previous call to alloc_pages_exact.
1741 */
1742void free_pages_exact(void *virt, size_t size)
1743{
1744 unsigned long addr = (unsigned long)virt;
1745 unsigned long end = addr + PAGE_ALIGN(size);
1746
1747 while (addr < end) {
1748 free_page(addr);
1749 addr += PAGE_SIZE;
1750 }
1751}
1752EXPORT_SYMBOL(free_pages_exact);
1753
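
Worked example of the saving: a request for five pages makes get_order() round up to order 3 (eight pages); alloc_pages_exact() then splits the block and immediately returns the trailing three pages, so only PAGE_ALIGN(size) stays allocated. A usage sketch (helper names are arbitrary):

	static void *five_page_buffer(void)
	{
		/* alloc_pages(GFP_KERNEL, 3) would pin 8 pages for this */
		return alloc_pages_exact(5 * PAGE_SIZE, GFP_KERNEL);
	}

	static void five_page_buffer_free(void *buf)
	{
		free_pages_exact(buf, 5 * PAGE_SIZE);	/* same size as allocated */
	}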
1714static unsigned int nr_free_zone_pages(int offset) 1754static unsigned int nr_free_zone_pages(int offset)
1715{ 1755{
1716 struct zoneref *z; 1756 struct zoneref *z;
@@ -2332,7 +2372,7 @@ static void build_zonelist_cache(pg_data_t *pgdat)
2332 2372
2333#endif /* CONFIG_NUMA */ 2373#endif /* CONFIG_NUMA */
2334 2374
2335/* return values int ....just for stop_machine_run() */ 2375/* return values int ....just for stop_machine() */
2336static int __build_all_zonelists(void *dummy) 2376static int __build_all_zonelists(void *dummy)
2337{ 2377{
2338 int nid; 2378 int nid;
@@ -2352,11 +2392,12 @@ void build_all_zonelists(void)
2352 2392
2353 if (system_state == SYSTEM_BOOTING) { 2393 if (system_state == SYSTEM_BOOTING) {
2354 __build_all_zonelists(NULL); 2394 __build_all_zonelists(NULL);
2395 mminit_verify_zonelist();
2355 cpuset_init_current_mems_allowed(); 2396 cpuset_init_current_mems_allowed();
2356 } else { 2397 } else {
2357 /* we have to stop all cpus to guarantee there is no user 2398 /* we have to stop all cpus to guarantee there is no user
2358 of zonelist */ 2399 of zonelist */
2359 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); 2400 stop_machine(__build_all_zonelists, NULL, NULL);
2360 /* cpuset refresh routine should be here */ 2401 /* cpuset refresh routine should be here */
2361 } 2402 }
2362 vm_total_pages = nr_free_pagecache_pages(); 2403 vm_total_pages = nr_free_pagecache_pages();
@@ -2534,6 +2575,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2534 } 2575 }
2535 page = pfn_to_page(pfn); 2576 page = pfn_to_page(pfn);
2536 set_page_links(page, zone, nid, pfn); 2577 set_page_links(page, zone, nid, pfn);
2578 mminit_verify_page_links(page, zone, nid, pfn);
2537 init_page_count(page); 2579 init_page_count(page);
2538 reset_page_mapcount(page); 2580 reset_page_mapcount(page);
2539 SetPageReserved(page); 2581 SetPageReserved(page);
@@ -2611,7 +2653,7 @@ static int zone_batchsize(struct zone *zone)
2611 return batch; 2653 return batch;
2612} 2654}
2613 2655
2614inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 2656static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2615{ 2657{
2616 struct per_cpu_pages *pcp; 2658 struct per_cpu_pages *pcp;
2617 2659
@@ -2836,6 +2878,12 @@ __meminit int init_currently_empty_zone(struct zone *zone,
2836 2878
2837 zone->zone_start_pfn = zone_start_pfn; 2879 zone->zone_start_pfn = zone_start_pfn;
2838 2880
2881 mminit_dprintk(MMINIT_TRACE, "memmap_init",
2882 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
2883 pgdat->node_id,
2884 (unsigned long)zone_idx(zone),
2885 zone_start_pfn, (zone_start_pfn + size));
2886
2839 zone_init_free_lists(zone); 2887 zone_init_free_lists(zone);
2840 2888
2841 return 0; 2889 return 0;
@@ -2975,7 +3023,8 @@ void __init sparse_memory_present_with_active_regions(int nid)
2975void __init push_node_boundaries(unsigned int nid, 3023void __init push_node_boundaries(unsigned int nid,
2976 unsigned long start_pfn, unsigned long end_pfn) 3024 unsigned long start_pfn, unsigned long end_pfn)
2977{ 3025{
2978 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", 3026 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3027 "Entering push_node_boundaries(%u, %lu, %lu)\n",
2979 nid, start_pfn, end_pfn); 3028 nid, start_pfn, end_pfn);
2980 3029
2981 /* Initialise the boundary for this node if necessary */ 3030 /* Initialise the boundary for this node if necessary */
@@ -2993,7 +3042,8 @@ void __init push_node_boundaries(unsigned int nid,
2993static void __meminit account_node_boundary(unsigned int nid, 3042static void __meminit account_node_boundary(unsigned int nid,
2994 unsigned long *start_pfn, unsigned long *end_pfn) 3043 unsigned long *start_pfn, unsigned long *end_pfn)
2995{ 3044{
2996 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", 3045 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3046 "Entering account_node_boundary(%u, %lu, %lu)\n",
2997 nid, *start_pfn, *end_pfn); 3047 nid, *start_pfn, *end_pfn);
2998 3048
2999 /* Return if boundary information has not been provided */ 3049 /* Return if boundary information has not been provided */
@@ -3050,7 +3100,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
3050 * assumption is made that zones within a node are ordered in monotonic 3100 * assumption is made that zones within a node are ordered in monotonic
3051 * increasing memory addresses so that the "highest" populated zone is used 3101 * increasing memory addresses so that the "highest" populated zone is used
3052 */ 3102 */
3053void __init find_usable_zone_for_movable(void) 3103static void __init find_usable_zone_for_movable(void)
3054{ 3104{
3055 int zone_index; 3105 int zone_index;
3056 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 3106 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
@@ -3076,7 +3126,7 @@ void __init find_usable_zone_for_movable(void)
3076 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 3126 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
3077 * zones within a node are in order of monotonic increases memory addresses 3127 * zones within a node are in order of monotonic increases memory addresses
3078 */ 3128 */
3079void __meminit adjust_zone_range_for_zone_movable(int nid, 3129static void __meminit adjust_zone_range_for_zone_movable(int nid,
3080 unsigned long zone_type, 3130 unsigned long zone_type,
3081 unsigned long node_start_pfn, 3131 unsigned long node_start_pfn,
3082 unsigned long node_end_pfn, 3132 unsigned long node_end_pfn,
@@ -3137,7 +3187,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3137 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3187 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3138 * then all holes in the requested range will be accounted for. 3188 * then all holes in the requested range will be accounted for.
3139 */ 3189 */
3140unsigned long __meminit __absent_pages_in_range(int nid, 3190static unsigned long __meminit __absent_pages_in_range(int nid,
3141 unsigned long range_start_pfn, 3191 unsigned long range_start_pfn,
3142 unsigned long range_end_pfn) 3192 unsigned long range_end_pfn)
3143{ 3193{
@@ -3368,8 +3418,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3368 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 3418 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
3369 if (realsize >= memmap_pages) { 3419 if (realsize >= memmap_pages) {
3370 realsize -= memmap_pages; 3420 realsize -= memmap_pages;
3371 printk(KERN_DEBUG 3421 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3372 " %s zone: %lu pages used for memmap\n", 3422 "%s zone: %lu pages used for memmap\n",
3373 zone_names[j], memmap_pages); 3423 zone_names[j], memmap_pages);
3374 } else 3424 } else
3375 printk(KERN_WARNING 3425 printk(KERN_WARNING
@@ -3379,7 +3429,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3379 /* Account for reserved pages */ 3429 /* Account for reserved pages */
3380 if (j == 0 && realsize > dma_reserve) { 3430 if (j == 0 && realsize > dma_reserve) {
3381 realsize -= dma_reserve; 3431 realsize -= dma_reserve;
3382 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 3432 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3433 "%s zone: %lu pages reserved\n",
3383 zone_names[0], dma_reserve); 3434 zone_names[0], dma_reserve);
3384 } 3435 }
3385 3436
@@ -3464,10 +3515,11 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
3464#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 3515#endif /* CONFIG_FLAT_NODE_MEM_MAP */
3465} 3516}
3466 3517
3467void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, 3518void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
3468 unsigned long *zones_size, unsigned long node_start_pfn, 3519 unsigned long node_start_pfn, unsigned long *zholes_size)
3469 unsigned long *zholes_size)
3470{ 3520{
3521 pg_data_t *pgdat = NODE_DATA(nid);
3522
3471 pgdat->node_id = nid; 3523 pgdat->node_id = nid;
3472 pgdat->node_start_pfn = node_start_pfn; 3524 pgdat->node_start_pfn = node_start_pfn;
3473 calculate_node_totalpages(pgdat, zones_size, zholes_size); 3525 calculate_node_totalpages(pgdat, zones_size, zholes_size);
@@ -3520,10 +3572,13 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3520{ 3572{
3521 int i; 3573 int i;
3522 3574
3523 printk(KERN_DEBUG "Entering add_active_range(%d, %#lx, %#lx) " 3575 mminit_dprintk(MMINIT_TRACE, "memory_register",
3524 "%d entries of %d used\n", 3576 "Entering add_active_range(%d, %#lx, %#lx) "
3525 nid, start_pfn, end_pfn, 3577 "%d entries of %d used\n",
3526 nr_nodemap_entries, MAX_ACTIVE_REGIONS); 3578 nid, start_pfn, end_pfn,
3579 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
3580
3581 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
3527 3582
3528 /* Merge with existing active regions if possible */ 3583 /* Merge with existing active regions if possible */
3529 for (i = 0; i < nr_nodemap_entries; i++) { 3584 for (i = 0; i < nr_nodemap_entries; i++) {
@@ -3669,7 +3724,7 @@ static void __init sort_node_map(void)
3669} 3724}
3670 3725
3671/* Find the lowest pfn for a node */ 3726/* Find the lowest pfn for a node */
3672unsigned long __init find_min_pfn_for_node(int nid) 3727static unsigned long __init find_min_pfn_for_node(int nid)
3673{ 3728{
3674 int i; 3729 int i;
3675 unsigned long min_pfn = ULONG_MAX; 3730 unsigned long min_pfn = ULONG_MAX;
@@ -3698,23 +3753,6 @@ unsigned long __init find_min_pfn_with_active_regions(void)
3698 return find_min_pfn_for_node(MAX_NUMNODES); 3753 return find_min_pfn_for_node(MAX_NUMNODES);
3699} 3754}
3700 3755
3701/**
3702 * find_max_pfn_with_active_regions - Find the maximum PFN registered
3703 *
3704 * It returns the maximum PFN based on information provided via
3705 * add_active_range().
3706 */
3707unsigned long __init find_max_pfn_with_active_regions(void)
3708{
3709 int i;
3710 unsigned long max_pfn = 0;
3711
3712 for (i = 0; i < nr_nodemap_entries; i++)
3713 max_pfn = max(max_pfn, early_node_map[i].end_pfn);
3714
3715 return max_pfn;
3716}
3717
3718/* 3756/*
3719 * early_calculate_totalpages() 3757 * early_calculate_totalpages()
3720 * Sum pages in active regions for movable zone. 3758 * Sum pages in active regions for movable zone.
@@ -3741,7 +3779,7 @@ static unsigned long __init early_calculate_totalpages(void)
3741 * memory. When they don't, some nodes will have more kernelcore than 3779 * memory. When they don't, some nodes will have more kernelcore than
3742 * others 3780 * others
3743 */ 3781 */
3744void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) 3782static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3745{ 3783{
3746 int i, nid; 3784 int i, nid;
3747 unsigned long usable_startpfn; 3785 unsigned long usable_startpfn;
@@ -3957,10 +3995,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
3957 early_node_map[i].end_pfn); 3995 early_node_map[i].end_pfn);
3958 3996
3959 /* Initialise every node */ 3997 /* Initialise every node */
3998 mminit_verify_pageflags_layout();
3960 setup_nr_node_ids(); 3999 setup_nr_node_ids();
3961 for_each_online_node(nid) { 4000 for_each_online_node(nid) {
3962 pg_data_t *pgdat = NODE_DATA(nid); 4001 pg_data_t *pgdat = NODE_DATA(nid);
3963 free_area_init_node(nid, pgdat, NULL, 4002 free_area_init_node(nid, NULL,
3964 find_min_pfn_for_node(nid), NULL); 4003 find_min_pfn_for_node(nid), NULL);
3965 4004
3966 /* Any memory on that node */ 4005 /* Any memory on that node */
@@ -4025,15 +4064,13 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4025} 4064}
4026 4065
4027#ifndef CONFIG_NEED_MULTIPLE_NODES 4066#ifndef CONFIG_NEED_MULTIPLE_NODES
4028static bootmem_data_t contig_bootmem_data; 4067struct pglist_data contig_page_data = { .bdata = &bootmem_node_data[0] };
4029struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
4030
4031EXPORT_SYMBOL(contig_page_data); 4068EXPORT_SYMBOL(contig_page_data);
4032#endif 4069#endif
4033 4070
4034void __init free_area_init(unsigned long *zones_size) 4071void __init free_area_init(unsigned long *zones_size)
4035{ 4072{
4036 free_area_init_node(0, NODE_DATA(0), zones_size, 4073 free_area_init_node(0, zones_size,
4037 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 4074 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
4038} 4075}
4039 4076
@@ -4400,7 +4437,7 @@ void *__init alloc_large_system_hash(const char *tablename,
4400 do { 4437 do {
4401 size = bucketsize << log2qty; 4438 size = bucketsize << log2qty;
4402 if (flags & HASH_EARLY) 4439 if (flags & HASH_EARLY)
4403 table = alloc_bootmem(size); 4440 table = alloc_bootmem_nopanic(size);
4404 else if (hashdist) 4441 else if (hashdist)
4405 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4442 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4406 else { 4443 else {
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 9d834aa4b979..0cbe0c60c6bf 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -130,7 +130,7 @@ static int __pdflush(struct pdflush_work *my_work)
130 * Thread creation: For how long have there been zero 130 * Thread creation: For how long have there been zero
131 * available threads? 131 * available threads?
132 */ 132 */
133 if (jiffies - last_empty_jifs > 1 * HZ) { 133 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
134 /* unlocked list_empty() test is OK here */ 134 /* unlocked list_empty() test is OK here */
135 if (list_empty(&pdflush_list)) { 135 if (list_empty(&pdflush_list)) {
136 /* unlocked test is OK here */ 136 /* unlocked test is OK here */
@@ -151,7 +151,7 @@ static int __pdflush(struct pdflush_work *my_work)
151 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) 151 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
152 continue; 152 continue;
153 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); 153 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
154 if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) { 154 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
155 /* Limit exit rate */ 155 /* Limit exit rate */
156 pdf->when_i_went_to_sleep = jiffies; 156 pdf->when_i_went_to_sleep = jiffies;
157 break; /* exeunt */ 157 break; /* exeunt */
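
The open-coded "jiffies - x > 1 * HZ" tests are replaced with time_after(), the kernel's standard jiffies-comparison helper, which goes through signed arithmetic and so stays robust against jiffies wraparound. A minimal sketch of the idiom:

	/* One-second idle test, safe across a jiffies wrap. */
	static int idle_longer_than_a_second(unsigned long when_i_went_to_sleep)
	{
		return time_after(jiffies, when_i_went_to_sleep + 1 * HZ);
	}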
diff --git a/mm/readahead.c b/mm/readahead.c
index d8723a5f6496..77e8ddf945e9 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -382,9 +382,9 @@ ondemand_readahead(struct address_space *mapping,
382 if (hit_readahead_marker) { 382 if (hit_readahead_marker) {
383 pgoff_t start; 383 pgoff_t start;
384 384
385 read_lock_irq(&mapping->tree_lock); 385 rcu_read_lock();
386 start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); 386 start = radix_tree_next_hole(&mapping->page_tree, offset,max+1);
387 read_unlock_irq(&mapping->tree_lock); 387 rcu_read_unlock();
388 388
389 if (!start || start - offset > max) 389 if (!start || start - offset > max)
390 return 0; 390 return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
index bf0a5b7cfb8e..1ea4e6fcee77 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -49,6 +49,7 @@
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kallsyms.h> 50#include <linux/kallsyms.h>
51#include <linux/memcontrol.h> 51#include <linux/memcontrol.h>
52#include <linux/mmu_notifier.h>
52 53
53#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
54 55
@@ -138,7 +139,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
138 anon_vma_free(anon_vma); 139 anon_vma_free(anon_vma);
139} 140}
140 141
141static void anon_vma_ctor(struct kmem_cache *cachep, void *data) 142static void anon_vma_ctor(void *data)
142{ 143{
143 struct anon_vma *anon_vma = data; 144 struct anon_vma *anon_vma = data;
144 145
@@ -287,7 +288,7 @@ static int page_referenced_one(struct page *page,
287 if (vma->vm_flags & VM_LOCKED) { 288 if (vma->vm_flags & VM_LOCKED) {
288 referenced++; 289 referenced++;
289 *mapcount = 1; /* break early from loop */ 290 *mapcount = 1; /* break early from loop */
290 } else if (ptep_clear_flush_young(vma, address, pte)) 291 } else if (ptep_clear_flush_young_notify(vma, address, pte))
291 referenced++; 292 referenced++;
292 293
293 /* Pretend the page is referenced if the task has the 294 /* Pretend the page is referenced if the task has the
@@ -421,7 +422,7 @@ int page_referenced(struct page *page, int is_locked,
421 referenced += page_referenced_anon(page, mem_cont); 422 referenced += page_referenced_anon(page, mem_cont);
422 else if (is_locked) 423 else if (is_locked)
423 referenced += page_referenced_file(page, mem_cont); 424 referenced += page_referenced_file(page, mem_cont);
424 else if (TestSetPageLocked(page)) 425 else if (!trylock_page(page))
425 referenced++; 426 referenced++;
426 else { 427 else {
427 if (page->mapping) 428 if (page->mapping)
@@ -457,7 +458,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
457 pte_t entry; 458 pte_t entry;
458 459
459 flush_cache_page(vma, address, pte_pfn(*pte)); 460 flush_cache_page(vma, address, pte_pfn(*pte));
460 entry = ptep_clear_flush(vma, address, pte); 461 entry = ptep_clear_flush_notify(vma, address, pte);
461 entry = pte_wrprotect(entry); 462 entry = pte_wrprotect(entry);
462 entry = pte_mkclean(entry); 463 entry = pte_mkclean(entry);
463 set_pte_at(mm, address, pte, entry); 464 set_pte_at(mm, address, pte, entry);
@@ -576,14 +577,8 @@ void page_add_anon_rmap(struct page *page,
576 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 577 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
577 if (atomic_inc_and_test(&page->_mapcount)) 578 if (atomic_inc_and_test(&page->_mapcount))
578 __page_set_anon_rmap(page, vma, address); 579 __page_set_anon_rmap(page, vma, address);
579 else { 580 else
580 __page_check_anon_rmap(page, vma, address); 581 __page_check_anon_rmap(page, vma, address);
581 /*
582 * We unconditionally charged during prepare, we uncharge here
583 * This takes care of balancing the reference counts
584 */
585 mem_cgroup_uncharge_page(page);
586 }
587} 582}
588 583
589/** 584/**
@@ -614,12 +609,6 @@ void page_add_file_rmap(struct page *page)
614{ 609{
615 if (atomic_inc_and_test(&page->_mapcount)) 610 if (atomic_inc_and_test(&page->_mapcount))
616 __inc_zone_page_state(page, NR_FILE_MAPPED); 611 __inc_zone_page_state(page, NR_FILE_MAPPED);
617 else
618 /*
619 * We unconditionally charged during prepare, we uncharge here
620 * This takes care of balancing the reference counts
621 */
622 mem_cgroup_uncharge_page(page);
623} 612}
624 613
625#ifdef CONFIG_DEBUG_VM 614#ifdef CONFIG_DEBUG_VM
@@ -678,7 +667,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
678 * Leaving it set also helps swapoff to reinstate ptes 667 * Leaving it set also helps swapoff to reinstate ptes
679 * faster for those pages still in swapcache. 668 * faster for those pages still in swapcache.
680 */ 669 */
681 if (page_test_dirty(page)) { 670 if ((!PageAnon(page) || PageSwapCache(page)) &&
671 page_test_dirty(page)) {
682 page_clear_dirty(page); 672 page_clear_dirty(page);
683 set_page_dirty(page); 673 set_page_dirty(page);
684 } 674 }
@@ -717,14 +707,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
717 * skipped over this mm) then we should reactivate it. 707 * skipped over this mm) then we should reactivate it.
718 */ 708 */
719 if (!migration && ((vma->vm_flags & VM_LOCKED) || 709 if (!migration && ((vma->vm_flags & VM_LOCKED) ||
720 (ptep_clear_flush_young(vma, address, pte)))) { 710 (ptep_clear_flush_young_notify(vma, address, pte)))) {
721 ret = SWAP_FAIL; 711 ret = SWAP_FAIL;
722 goto out_unmap; 712 goto out_unmap;
723 } 713 }
724 714
725 /* Nuke the page table entry. */ 715 /* Nuke the page table entry. */
726 flush_cache_page(vma, address, page_to_pfn(page)); 716 flush_cache_page(vma, address, page_to_pfn(page));
727 pteval = ptep_clear_flush(vma, address, pte); 717 pteval = ptep_clear_flush_notify(vma, address, pte);
728 718
729 /* Move the dirty bit to the physical page now the pte is gone. */ 719 /* Move the dirty bit to the physical page now the pte is gone. */
730 if (pte_dirty(pteval)) 720 if (pte_dirty(pteval))
@@ -849,12 +839,12 @@ static void try_to_unmap_cluster(unsigned long cursor,
849 page = vm_normal_page(vma, address, *pte); 839 page = vm_normal_page(vma, address, *pte);
850 BUG_ON(!page || PageAnon(page)); 840 BUG_ON(!page || PageAnon(page));
851 841
852 if (ptep_clear_flush_young(vma, address, pte)) 842 if (ptep_clear_flush_young_notify(vma, address, pte))
853 continue; 843 continue;
854 844
855 /* Nuke the page table entry. */ 845 /* Nuke the page table entry. */
856 flush_cache_page(vma, address, pte_pfn(*pte)); 846 flush_cache_page(vma, address, pte_pfn(*pte));
857 pteval = ptep_clear_flush(vma, address, pte); 847 pteval = ptep_clear_flush_notify(vma, address, pte);
858 848
859 /* If nonlinear, store the file page offset in the pte. */ 849 /* If nonlinear, store the file page offset in the pte. */
860 if (page->index != linear_page_index(vma, address)) 850 if (page->index != linear_page_index(vma, address))
diff --git a/mm/shmem.c b/mm/shmem.c
index e2a6ae1a44e9..04fb4f1ab88e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -922,20 +922,26 @@ found:
922 error = 1; 922 error = 1;
923 if (!inode) 923 if (!inode)
924 goto out; 924 goto out;
925 /* Precharge page while we can wait, compensate afterwards */ 925 /* Precharge page using GFP_KERNEL while we can wait */
926 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 926 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
927 if (error) 927 if (error)
928 goto out; 928 goto out;
929 error = radix_tree_preload(GFP_KERNEL); 929 error = radix_tree_preload(GFP_KERNEL);
930 if (error) 930 if (error) {
931 goto uncharge; 931 mem_cgroup_uncharge_cache_page(page);
932 goto out;
933 }
932 error = 1; 934 error = 1;
933 935
934 spin_lock(&info->lock); 936 spin_lock(&info->lock);
935 ptr = shmem_swp_entry(info, idx, NULL); 937 ptr = shmem_swp_entry(info, idx, NULL);
936 if (ptr && ptr->val == entry.val) 938 if (ptr && ptr->val == entry.val) {
937 error = add_to_page_cache(page, inode->i_mapping, 939 error = add_to_page_cache_locked(page, inode->i_mapping,
938 idx, GFP_NOWAIT); 940 idx, GFP_NOWAIT);
941 /* does mem_cgroup_uncharge_cache_page on error */
942 } else /* we must compensate for our precharge above */
943 mem_cgroup_uncharge_cache_page(page);
944
939 if (error == -EEXIST) { 945 if (error == -EEXIST) {
940 struct page *filepage = find_get_page(inode->i_mapping, idx); 946 struct page *filepage = find_get_page(inode->i_mapping, idx);
941 error = 1; 947 error = 1;
@@ -961,8 +967,6 @@ found:
961 shmem_swp_unmap(ptr); 967 shmem_swp_unmap(ptr);
962 spin_unlock(&info->lock); 968 spin_unlock(&info->lock);
963 radix_tree_preload_end(); 969 radix_tree_preload_end();
964uncharge:
965 mem_cgroup_uncharge_page(page);
966out: 970out:
967 unlock_page(page); 971 unlock_page(page);
968 page_cache_release(page); 972 page_cache_release(page);
@@ -1261,7 +1265,7 @@ repeat:
1261 } 1265 }
1262 1266
1263 /* We have to do this with page locked to prevent races */ 1267 /* We have to do this with page locked to prevent races */
1264 if (TestSetPageLocked(swappage)) { 1268 if (!trylock_page(swappage)) {
1265 shmem_swp_unmap(entry); 1269 shmem_swp_unmap(entry);
1266 spin_unlock(&info->lock); 1270 spin_unlock(&info->lock);
1267 wait_on_page_locked(swappage); 1271 wait_on_page_locked(swappage);
@@ -1297,8 +1301,8 @@ repeat:
1297 SetPageUptodate(filepage); 1301 SetPageUptodate(filepage);
1298 set_page_dirty(filepage); 1302 set_page_dirty(filepage);
1299 swap_free(swap); 1303 swap_free(swap);
1300 } else if (!(error = add_to_page_cache( 1304 } else if (!(error = add_to_page_cache_locked(swappage, mapping,
1301 swappage, mapping, idx, GFP_NOWAIT))) { 1305 idx, GFP_NOWAIT))) {
1302 info->flags |= SHMEM_PAGEIN; 1306 info->flags |= SHMEM_PAGEIN;
1303 shmem_swp_set(info, entry, 0); 1307 shmem_swp_set(info, entry, 0);
1304 shmem_swp_unmap(entry); 1308 shmem_swp_unmap(entry);
@@ -1311,24 +1315,21 @@ repeat:
1311 shmem_swp_unmap(entry); 1315 shmem_swp_unmap(entry);
1312 spin_unlock(&info->lock); 1316 spin_unlock(&info->lock);
1313 unlock_page(swappage); 1317 unlock_page(swappage);
1318 page_cache_release(swappage);
1314 if (error == -ENOMEM) { 1319 if (error == -ENOMEM) {
1315 /* allow reclaim from this memory cgroup */ 1320 /* allow reclaim from this memory cgroup */
1316 error = mem_cgroup_cache_charge(swappage, 1321 error = mem_cgroup_shrink_usage(current->mm,
1317 current->mm, gfp & ~__GFP_HIGHMEM); 1322 gfp);
1318 if (error) { 1323 if (error)
1319 page_cache_release(swappage);
1320 goto failed; 1324 goto failed;
1321 }
1322 mem_cgroup_uncharge_page(swappage);
1323 } 1325 }
1324 page_cache_release(swappage);
1325 goto repeat; 1326 goto repeat;
1326 } 1327 }
1327 } else if (sgp == SGP_READ && !filepage) { 1328 } else if (sgp == SGP_READ && !filepage) {
1328 shmem_swp_unmap(entry); 1329 shmem_swp_unmap(entry);
1329 filepage = find_get_page(mapping, idx); 1330 filepage = find_get_page(mapping, idx);
1330 if (filepage && 1331 if (filepage &&
1331 (!PageUptodate(filepage) || TestSetPageLocked(filepage))) { 1332 (!PageUptodate(filepage) || !trylock_page(filepage))) {
1332 spin_unlock(&info->lock); 1333 spin_unlock(&info->lock);
1333 wait_on_page_locked(filepage); 1334 wait_on_page_locked(filepage);
1334 page_cache_release(filepage); 1335 page_cache_release(filepage);
@@ -1358,6 +1359,8 @@ repeat:
1358 } 1359 }
1359 1360
1360 if (!filepage) { 1361 if (!filepage) {
1362 int ret;
1363
1361 spin_unlock(&info->lock); 1364 spin_unlock(&info->lock);
1362 filepage = shmem_alloc_page(gfp, info, idx); 1365 filepage = shmem_alloc_page(gfp, info, idx);
1363 if (!filepage) { 1366 if (!filepage) {
@@ -1386,10 +1389,18 @@ repeat:
1386 swap = *entry; 1389 swap = *entry;
1387 shmem_swp_unmap(entry); 1390 shmem_swp_unmap(entry);
1388 } 1391 }
1389 if (error || swap.val || 0 != add_to_page_cache_lru( 1392 ret = error || swap.val;
1390 filepage, mapping, idx, GFP_NOWAIT)) { 1393 if (ret)
1394 mem_cgroup_uncharge_cache_page(filepage);
1395 else
1396 ret = add_to_page_cache_lru(filepage, mapping,
1397 idx, GFP_NOWAIT);
1398 /*
1399 * At add_to_page_cache_lru() failure, uncharge will
1400 * be done automatically.
1401 */
1402 if (ret) {
1391 spin_unlock(&info->lock); 1403 spin_unlock(&info->lock);
1392 mem_cgroup_uncharge_page(filepage);
1393 page_cache_release(filepage); 1404 page_cache_release(filepage);
1394 shmem_unacct_blocks(info->flags, 1); 1405 shmem_unacct_blocks(info->flags, 1);
1395 shmem_free_blocks(inode, 1); 1406 shmem_free_blocks(inode, 1);
@@ -1398,7 +1409,6 @@ repeat:
1398 goto failed; 1409 goto failed;
1399 goto repeat; 1410 goto repeat;
1400 } 1411 }
1401 mem_cgroup_uncharge_page(filepage);
1402 info->flags |= SHMEM_PAGEIN; 1412 info->flags |= SHMEM_PAGEIN;
1403 } 1413 }
1404 1414
@@ -1503,7 +1513,6 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1503 inode->i_uid = current->fsuid; 1513 inode->i_uid = current->fsuid;
1504 inode->i_gid = current->fsgid; 1514 inode->i_gid = current->fsgid;
1505 inode->i_blocks = 0; 1515 inode->i_blocks = 0;
1506 inode->i_mapping->a_ops = &shmem_aops;
1507 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; 1516 inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
1508 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 1517 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1509 inode->i_generation = get_seconds(); 1518 inode->i_generation = get_seconds();
@@ -1518,6 +1527,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1518 init_special_inode(inode, mode, dev); 1527 init_special_inode(inode, mode, dev);
1519 break; 1528 break;
1520 case S_IFREG: 1529 case S_IFREG:
1530 inode->i_mapping->a_ops = &shmem_aops;
1521 inode->i_op = &shmem_inode_operations; 1531 inode->i_op = &shmem_inode_operations;
1522 inode->i_fop = &shmem_file_operations; 1532 inode->i_fop = &shmem_file_operations;
1523 mpol_shared_policy_init(&info->policy, 1533 mpol_shared_policy_init(&info->policy,
@@ -1690,26 +1700,38 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1690 file_accessed(filp); 1700 file_accessed(filp);
1691} 1701}
1692 1702
1693static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) 1703static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1704 const struct iovec *iov, unsigned long nr_segs, loff_t pos)
1694{ 1705{
1695 read_descriptor_t desc; 1706 struct file *filp = iocb->ki_filp;
1707 ssize_t retval;
1708 unsigned long seg;
1709 size_t count;
1710 loff_t *ppos = &iocb->ki_pos;
1696 1711
1697 if ((ssize_t) count < 0) 1712 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1698 return -EINVAL; 1713 if (retval)
1699 if (!access_ok(VERIFY_WRITE, buf, count)) 1714 return retval;
1700 return -EFAULT;
1701 if (!count)
1702 return 0;
1703 1715
1704 desc.written = 0; 1716 for (seg = 0; seg < nr_segs; seg++) {
1705 desc.count = count; 1717 read_descriptor_t desc;
1706 desc.arg.buf = buf;
1707 desc.error = 0;
1708 1718
1709 do_shmem_file_read(filp, ppos, &desc, file_read_actor); 1719 desc.written = 0;
1710 if (desc.written) 1720 desc.arg.buf = iov[seg].iov_base;
1711 return desc.written; 1721 desc.count = iov[seg].iov_len;
1712 return desc.error; 1722 if (desc.count == 0)
1723 continue;
1724 desc.error = 0;
1725 do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1726 retval += desc.written;
1727 if (desc.error) {
1728 retval = retval ?: desc.error;
1729 break;
1730 }
1731 if (desc.count > 0)
1732 break;
1733 }
1734 return retval;
1713} 1735}
1714 1736
1715static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1737static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1907,6 +1929,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
1907 return error; 1929 return error;
1908 } 1930 }
1909 unlock_page(page); 1931 unlock_page(page);
1932 inode->i_mapping->a_ops = &shmem_aops;
1910 inode->i_op = &shmem_symlink_inode_operations; 1933 inode->i_op = &shmem_symlink_inode_operations;
1911 kaddr = kmap_atomic(page, KM_USER0); 1934 kaddr = kmap_atomic(page, KM_USER0);
1912 memcpy(kaddr, symname, len); 1935 memcpy(kaddr, symname, len);
@@ -2330,7 +2353,7 @@ static void shmem_destroy_inode(struct inode *inode)
2330 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2353 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2331} 2354}
2332 2355
2333static void init_once(struct kmem_cache *cachep, void *foo) 2356static void init_once(void *foo)
2334{ 2357{
2335 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2358 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2336 2359
@@ -2369,8 +2392,9 @@ static const struct file_operations shmem_file_operations = {
2369 .mmap = shmem_mmap, 2392 .mmap = shmem_mmap,
2370#ifdef CONFIG_TMPFS 2393#ifdef CONFIG_TMPFS
2371 .llseek = generic_file_llseek, 2394 .llseek = generic_file_llseek,
2372 .read = shmem_file_read, 2395 .read = do_sync_read,
2373 .write = do_sync_write, 2396 .write = do_sync_write,
2397 .aio_read = shmem_file_aio_read,
2374 .aio_write = generic_file_aio_write, 2398 .aio_write = generic_file_aio_write,
2375 .fsync = simple_sync_file, 2399 .fsync = simple_sync_file,
2376 .splice_read = generic_file_splice_read, 2400 .splice_read = generic_file_splice_read,
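
The shmem hunks above are part of the tree-wide switch from the old TestSetPageLocked() idiom to trylock_page(), whose sense is inverted: it returns nonzero when PG_locked was actually taken. A minimal caller-side sketch of the new helper (the function name here is invented for illustration and is not part of this patch):

#include <linux/errno.h>
#include <linux/pagemap.h>

/* Do some work on a page only if its lock can be taken without sleeping. */
static int try_touch_page(struct page *page)
{
        if (!trylock_page(page))        /* somebody else holds PG_locked */
                return -EAGAIN;
        /* ... inspect or modify the locked page here ... */
        unlock_page(page);
        return 0;
}
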
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c
index f5664c5b9eb1..8e5aadd7dcd6 100644
--- a/mm/shmem_acl.c
+++ b/mm/shmem_acl.c
@@ -191,7 +191,7 @@ shmem_check_acl(struct inode *inode, int mask)
191 * shmem_permission - permission() inode operation 191 * shmem_permission - permission() inode operation
192 */ 192 */
193int 193int
194shmem_permission(struct inode *inode, int mask, struct nameidata *nd) 194shmem_permission(struct inode *inode, int mask)
195{ 195{
196 return generic_permission(inode, mask, shmem_check_acl); 196 return generic_permission(inode, mask, shmem_check_acl);
197} 197}
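
The shmem_acl.c hunk reflects the VFS change in this release that drops the struct nameidata argument from the ->permission() inode operation; generic_permission() itself keeps its (inode, mask, check_acl) form. A sketch of a filesystem hook written against the new prototype (the myfs_* names are invented for illustration):

#include <linux/errno.h>
#include <linux/fs.h>

static int myfs_check_acl(struct inode *inode, int mask)
{
        /* consult the filesystem's ACLs here: 0 grants, -EACCES denies */
        return -EAGAIN;         /* no ACL found, fall back to mode bits */
}

static int myfs_permission(struct inode *inode, int mask)
{
        return generic_permission(inode, mask, myfs_check_acl);
}
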
diff --git a/mm/slab.c b/mm/slab.c
index 052e7d64537e..e76eee466886 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -406,7 +406,7 @@ struct kmem_cache {
406 unsigned int dflags; /* dynamic flags */ 406 unsigned int dflags; /* dynamic flags */
407 407
408 /* constructor func */ 408 /* constructor func */
409 void (*ctor)(struct kmem_cache *, void *); 409 void (*ctor)(void *obj);
410 410
411/* 5) cache creation/removal */ 411/* 5) cache creation/removal */
412 const char *name; 412 const char *name;
@@ -2137,8 +2137,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2137 */ 2137 */
2138struct kmem_cache * 2138struct kmem_cache *
2139kmem_cache_create (const char *name, size_t size, size_t align, 2139kmem_cache_create (const char *name, size_t size, size_t align,
2140 unsigned long flags, 2140 unsigned long flags, void (*ctor)(void *))
2141 void (*ctor)(struct kmem_cache *, void *))
2142{ 2141{
2143 size_t left_over, slab_size, ralign; 2142 size_t left_over, slab_size, ralign;
2144 struct kmem_cache *cachep = NULL, *pc; 2143 struct kmem_cache *cachep = NULL, *pc;
@@ -2653,7 +2652,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2653 * They must also be threaded. 2652 * They must also be threaded.
2654 */ 2653 */
2655 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2654 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2656 cachep->ctor(cachep, objp + obj_offset(cachep)); 2655 cachep->ctor(objp + obj_offset(cachep));
2657 2656
2658 if (cachep->flags & SLAB_RED_ZONE) { 2657 if (cachep->flags & SLAB_RED_ZONE) {
2659 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2658 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2669,7 +2668,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2669 cachep->buffer_size / PAGE_SIZE, 0); 2668 cachep->buffer_size / PAGE_SIZE, 0);
2670#else 2669#else
2671 if (cachep->ctor) 2670 if (cachep->ctor)
2672 cachep->ctor(cachep, objp); 2671 cachep->ctor(objp);
2673#endif 2672#endif
2674 slab_bufctl(slabp)[i] = i + 1; 2673 slab_bufctl(slabp)[i] = i + 1;
2675 } 2674 }
@@ -3093,7 +3092,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3093#endif 3092#endif
3094 objp += obj_offset(cachep); 3093 objp += obj_offset(cachep);
3095 if (cachep->ctor && cachep->flags & SLAB_POISON) 3094 if (cachep->ctor && cachep->flags & SLAB_POISON)
3096 cachep->ctor(cachep, objp); 3095 cachep->ctor(objp);
3097#if ARCH_SLAB_MINALIGN 3096#if ARCH_SLAB_MINALIGN
3098 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3097 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3099 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3098 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
@@ -4473,4 +4472,3 @@ size_t ksize(const void *objp)
4473 4472
4474 return obj_size(virt_to_cache(objp)); 4473 return obj_size(virt_to_cache(objp));
4475} 4474}
4476EXPORT_SYMBOL(ksize);
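
The slab, slob and slub hunks in this merge all move kmem_cache_create() to a single-argument constructor, void (*ctor)(void *), dropping the struct kmem_cache pointer that constructors rarely used. A caller-side sketch against the new prototype (the foo names are invented for illustration):

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/slab.h>

struct foo {
        struct list_head link;
        int refcount;
};

/* Constructors now receive only the object being initialized. */
static void foo_ctor(void *obj)
{
        struct foo *f = obj;

        INIT_LIST_HEAD(&f->link);
        f->refcount = 0;
}

static struct kmem_cache *foo_cachep;

static int __init foo_cache_init(void)
{
        foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
                                       0, SLAB_HWCACHE_ALIGN, foo_ctor);
        return foo_cachep ? 0 : -ENOMEM;
}
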
diff --git a/mm/slob.c b/mm/slob.c
index a3ad6671adf1..4c82dd41f32e 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -130,17 +130,17 @@ static LIST_HEAD(free_slob_large);
130 */ 130 */
131static inline int slob_page(struct slob_page *sp) 131static inline int slob_page(struct slob_page *sp)
132{ 132{
133 return test_bit(PG_active, &sp->flags); 133 return PageSlobPage((struct page *)sp);
134} 134}
135 135
136static inline void set_slob_page(struct slob_page *sp) 136static inline void set_slob_page(struct slob_page *sp)
137{ 137{
138 __set_bit(PG_active, &sp->flags); 138 __SetPageSlobPage((struct page *)sp);
139} 139}
140 140
141static inline void clear_slob_page(struct slob_page *sp) 141static inline void clear_slob_page(struct slob_page *sp)
142{ 142{
143 __clear_bit(PG_active, &sp->flags); 143 __ClearPageSlobPage((struct page *)sp);
144} 144}
145 145
146/* 146/*
@@ -148,19 +148,19 @@ static inline void clear_slob_page(struct slob_page *sp)
148 */ 148 */
149static inline int slob_page_free(struct slob_page *sp) 149static inline int slob_page_free(struct slob_page *sp)
150{ 150{
151 return test_bit(PG_private, &sp->flags); 151 return PageSlobFree((struct page *)sp);
152} 152}
153 153
154static void set_slob_page_free(struct slob_page *sp, struct list_head *list) 154static void set_slob_page_free(struct slob_page *sp, struct list_head *list)
155{ 155{
156 list_add(&sp->list, list); 156 list_add(&sp->list, list);
157 __set_bit(PG_private, &sp->flags); 157 __SetPageSlobFree((struct page *)sp);
158} 158}
159 159
160static inline void clear_slob_page_free(struct slob_page *sp) 160static inline void clear_slob_page_free(struct slob_page *sp)
161{ 161{
162 list_del(&sp->list); 162 list_del(&sp->list);
163 __clear_bit(PG_private, &sp->flags); 163 __ClearPageSlobFree((struct page *)sp);
164} 164}
165 165
166#define SLOB_UNIT sizeof(slob_t) 166#define SLOB_UNIT sizeof(slob_t)
@@ -519,18 +519,16 @@ size_t ksize(const void *block)
519 else 519 else
520 return sp->page.private; 520 return sp->page.private;
521} 521}
522EXPORT_SYMBOL(ksize);
523 522
524struct kmem_cache { 523struct kmem_cache {
525 unsigned int size, align; 524 unsigned int size, align;
526 unsigned long flags; 525 unsigned long flags;
527 const char *name; 526 const char *name;
528 void (*ctor)(struct kmem_cache *, void *); 527 void (*ctor)(void *);
529}; 528};
530 529
531struct kmem_cache *kmem_cache_create(const char *name, size_t size, 530struct kmem_cache *kmem_cache_create(const char *name, size_t size,
532 size_t align, unsigned long flags, 531 size_t align, unsigned long flags, void (*ctor)(void *))
533 void (*ctor)(struct kmem_cache *, void *))
534{ 532{
535 struct kmem_cache *c; 533 struct kmem_cache *c;
536 534
@@ -575,7 +573,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
575 b = slob_new_page(flags, get_order(c->size), node); 573 b = slob_new_page(flags, get_order(c->size), node);
576 574
577 if (c->ctor) 575 if (c->ctor)
578 c->ctor(c, b); 576 c->ctor(b);
579 577
580 return b; 578 return b;
581} 579}
diff --git a/mm/slub.c b/mm/slub.c
index 6d4a49c1ff2f..4f5b96149458 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -102,44 +102,12 @@
102 * the fast path and disables lockless freelists. 102 * the fast path and disables lockless freelists.
103 */ 103 */
104 104
105#define FROZEN (1 << PG_active)
106
107#ifdef CONFIG_SLUB_DEBUG 105#ifdef CONFIG_SLUB_DEBUG
108#define SLABDEBUG (1 << PG_error) 106#define SLABDEBUG 1
109#else 107#else
110#define SLABDEBUG 0 108#define SLABDEBUG 0
111#endif 109#endif
112 110
113static inline int SlabFrozen(struct page *page)
114{
115 return page->flags & FROZEN;
116}
117
118static inline void SetSlabFrozen(struct page *page)
119{
120 page->flags |= FROZEN;
121}
122
123static inline void ClearSlabFrozen(struct page *page)
124{
125 page->flags &= ~FROZEN;
126}
127
128static inline int SlabDebug(struct page *page)
129{
130 return page->flags & SLABDEBUG;
131}
132
133static inline void SetSlabDebug(struct page *page)
134{
135 page->flags |= SLABDEBUG;
136}
137
138static inline void ClearSlabDebug(struct page *page)
139{
140 page->flags &= ~SLABDEBUG;
141}
142
143/* 111/*
144 * Issues still to be resolved: 112 * Issues still to be resolved:
145 * 113 *
@@ -971,7 +939,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
971 } 939 }
972 940
973 /* Special debug activities for freeing objects */ 941 /* Special debug activities for freeing objects */
974 if (!SlabFrozen(page) && !page->freelist) 942 if (!PageSlubFrozen(page) && !page->freelist)
975 remove_full(s, page); 943 remove_full(s, page);
976 if (s->flags & SLAB_STORE_USER) 944 if (s->flags & SLAB_STORE_USER)
977 set_track(s, object, TRACK_FREE, addr); 945 set_track(s, object, TRACK_FREE, addr);
@@ -1044,7 +1012,7 @@ __setup("slub_debug", setup_slub_debug);
1044 1012
1045static unsigned long kmem_cache_flags(unsigned long objsize, 1013static unsigned long kmem_cache_flags(unsigned long objsize,
1046 unsigned long flags, const char *name, 1014 unsigned long flags, const char *name,
1047 void (*ctor)(struct kmem_cache *, void *)) 1015 void (*ctor)(void *))
1048{ 1016{
1049 /* 1017 /*
1050 * Enable debugging if selected on the kernel commandline. 1018 * Enable debugging if selected on the kernel commandline.
@@ -1072,7 +1040,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1072static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1040static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1073static inline unsigned long kmem_cache_flags(unsigned long objsize, 1041static inline unsigned long kmem_cache_flags(unsigned long objsize,
1074 unsigned long flags, const char *name, 1042 unsigned long flags, const char *name,
1075 void (*ctor)(struct kmem_cache *, void *)) 1043 void (*ctor)(void *))
1076{ 1044{
1077 return flags; 1045 return flags;
1078} 1046}
@@ -1135,7 +1103,7 @@ static void setup_object(struct kmem_cache *s, struct page *page,
1135{ 1103{
1136 setup_object_debug(s, page, object); 1104 setup_object_debug(s, page, object);
1137 if (unlikely(s->ctor)) 1105 if (unlikely(s->ctor))
1138 s->ctor(s, object); 1106 s->ctor(object);
1139} 1107}
1140 1108
1141static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1109static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1157,7 +1125,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1157 page->flags |= 1 << PG_slab; 1125 page->flags |= 1 << PG_slab;
1158 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 1126 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
1159 SLAB_STORE_USER | SLAB_TRACE)) 1127 SLAB_STORE_USER | SLAB_TRACE))
1160 SetSlabDebug(page); 1128 __SetPageSlubDebug(page);
1161 1129
1162 start = page_address(page); 1130 start = page_address(page);
1163 1131
@@ -1184,14 +1152,14 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1184 int order = compound_order(page); 1152 int order = compound_order(page);
1185 int pages = 1 << order; 1153 int pages = 1 << order;
1186 1154
1187 if (unlikely(SlabDebug(page))) { 1155 if (unlikely(SLABDEBUG && PageSlubDebug(page))) {
1188 void *p; 1156 void *p;
1189 1157
1190 slab_pad_check(s, page); 1158 slab_pad_check(s, page);
1191 for_each_object(p, s, page_address(page), 1159 for_each_object(p, s, page_address(page),
1192 page->objects) 1160 page->objects)
1193 check_object(s, page, p, 0); 1161 check_object(s, page, p, 0);
1194 ClearSlabDebug(page); 1162 __ClearPageSlubDebug(page);
1195 } 1163 }
1196 1164
1197 mod_zone_page_state(page_zone(page), 1165 mod_zone_page_state(page_zone(page),
@@ -1288,7 +1256,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
1288 if (slab_trylock(page)) { 1256 if (slab_trylock(page)) {
1289 list_del(&page->lru); 1257 list_del(&page->lru);
1290 n->nr_partial--; 1258 n->nr_partial--;
1291 SetSlabFrozen(page); 1259 __SetPageSlubFrozen(page);
1292 return 1; 1260 return 1;
1293 } 1261 }
1294 return 0; 1262 return 0;
@@ -1361,7 +1329,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1361 n = get_node(s, zone_to_nid(zone)); 1329 n = get_node(s, zone_to_nid(zone));
1362 1330
1363 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1331 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1364 n->nr_partial > MIN_PARTIAL) { 1332 n->nr_partial > n->min_partial) {
1365 page = get_partial_node(n); 1333 page = get_partial_node(n);
1366 if (page) 1334 if (page)
1367 return page; 1335 return page;
@@ -1398,7 +1366,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1398 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1366 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1399 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); 1367 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
1400 1368
1401 ClearSlabFrozen(page); 1369 __ClearPageSlubFrozen(page);
1402 if (page->inuse) { 1370 if (page->inuse) {
1403 1371
1404 if (page->freelist) { 1372 if (page->freelist) {
@@ -1406,13 +1374,14 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1406 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1374 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1407 } else { 1375 } else {
1408 stat(c, DEACTIVATE_FULL); 1376 stat(c, DEACTIVATE_FULL);
1409 if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) 1377 if (SLABDEBUG && PageSlubDebug(page) &&
1378 (s->flags & SLAB_STORE_USER))
1410 add_full(n, page); 1379 add_full(n, page);
1411 } 1380 }
1412 slab_unlock(page); 1381 slab_unlock(page);
1413 } else { 1382 } else {
1414 stat(c, DEACTIVATE_EMPTY); 1383 stat(c, DEACTIVATE_EMPTY);
1415 if (n->nr_partial < MIN_PARTIAL) { 1384 if (n->nr_partial < n->min_partial) {
1416 /* 1385 /*
1417 * Adding an empty slab to the partial slabs in order 1386 * Adding an empty slab to the partial slabs in order
1418 * to avoid page allocator overhead. This slab needs 1387 * to avoid page allocator overhead. This slab needs
@@ -1551,7 +1520,7 @@ load_freelist:
1551 object = c->page->freelist; 1520 object = c->page->freelist;
1552 if (unlikely(!object)) 1521 if (unlikely(!object))
1553 goto another_slab; 1522 goto another_slab;
1554 if (unlikely(SlabDebug(c->page))) 1523 if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
1555 goto debug; 1524 goto debug;
1556 1525
1557 c->freelist = object[c->offset]; 1526 c->freelist = object[c->offset];
@@ -1588,7 +1557,7 @@ new_slab:
1588 if (c->page) 1557 if (c->page)
1589 flush_slab(s, c); 1558 flush_slab(s, c);
1590 slab_lock(new); 1559 slab_lock(new);
1591 SetSlabFrozen(new); 1560 __SetPageSlubFrozen(new);
1592 c->page = new; 1561 c->page = new;
1593 goto load_freelist; 1562 goto load_freelist;
1594 } 1563 }
@@ -1674,7 +1643,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1674 stat(c, FREE_SLOWPATH); 1643 stat(c, FREE_SLOWPATH);
1675 slab_lock(page); 1644 slab_lock(page);
1676 1645
1677 if (unlikely(SlabDebug(page))) 1646 if (unlikely(SLABDEBUG && PageSlubDebug(page)))
1678 goto debug; 1647 goto debug;
1679 1648
1680checks_ok: 1649checks_ok:
@@ -1682,7 +1651,7 @@ checks_ok:
1682 page->freelist = object; 1651 page->freelist = object;
1683 page->inuse--; 1652 page->inuse--;
1684 1653
1685 if (unlikely(SlabFrozen(page))) { 1654 if (unlikely(PageSlubFrozen(page))) {
1686 stat(c, FREE_FROZEN); 1655 stat(c, FREE_FROZEN);
1687 goto out_unlock; 1656 goto out_unlock;
1688 } 1657 }
@@ -1944,9 +1913,21 @@ static void init_kmem_cache_cpu(struct kmem_cache *s,
1944#endif 1913#endif
1945} 1914}
1946 1915
1947static void init_kmem_cache_node(struct kmem_cache_node *n) 1916static void
1917init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
1948{ 1918{
1949 n->nr_partial = 0; 1919 n->nr_partial = 0;
1920
1921 /*
1922 * The larger the object size is, the more pages we want on the partial
1923 * list to avoid pounding the page allocator excessively.
1924 */
1925 n->min_partial = ilog2(s->size);
1926 if (n->min_partial < MIN_PARTIAL)
1927 n->min_partial = MIN_PARTIAL;
1928 else if (n->min_partial > MAX_PARTIAL)
1929 n->min_partial = MAX_PARTIAL;
1930
1950 spin_lock_init(&n->list_lock); 1931 spin_lock_init(&n->list_lock);
1951 INIT_LIST_HEAD(&n->partial); 1932 INIT_LIST_HEAD(&n->partial);
1952#ifdef CONFIG_SLUB_DEBUG 1933#ifdef CONFIG_SLUB_DEBUG
@@ -2118,7 +2099,7 @@ static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
2118 init_object(kmalloc_caches, n, 1); 2099 init_object(kmalloc_caches, n, 1);
2119 init_tracking(kmalloc_caches, n); 2100 init_tracking(kmalloc_caches, n);
2120#endif 2101#endif
2121 init_kmem_cache_node(n); 2102 init_kmem_cache_node(n, kmalloc_caches);
2122 inc_slabs_node(kmalloc_caches, node, page->objects); 2103 inc_slabs_node(kmalloc_caches, node, page->objects);
2123 2104
2124 /* 2105 /*
@@ -2175,7 +2156,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2175 2156
2176 } 2157 }
2177 s->node[node] = n; 2158 s->node[node] = n;
2178 init_kmem_cache_node(n); 2159 init_kmem_cache_node(n, s);
2179 } 2160 }
2180 return 1; 2161 return 1;
2181} 2162}
@@ -2186,7 +2167,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
2186 2167
2187static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) 2168static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
2188{ 2169{
2189 init_kmem_cache_node(&s->local_node); 2170 init_kmem_cache_node(&s->local_node, s);
2190 return 1; 2171 return 1;
2191} 2172}
2192#endif 2173#endif
@@ -2317,7 +2298,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2317static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2298static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2318 const char *name, size_t size, 2299 const char *name, size_t size,
2319 size_t align, unsigned long flags, 2300 size_t align, unsigned long flags,
2320 void (*ctor)(struct kmem_cache *, void *)) 2301 void (*ctor)(void *))
2321{ 2302{
2322 memset(s, 0, kmem_size); 2303 memset(s, 0, kmem_size);
2323 s->name = name; 2304 s->name = name;
@@ -2746,7 +2727,6 @@ size_t ksize(const void *object)
2746 */ 2727 */
2747 return s->size; 2728 return s->size;
2748} 2729}
2749EXPORT_SYMBOL(ksize);
2750 2730
2751void kfree(const void *x) 2731void kfree(const void *x)
2752{ 2732{
@@ -2921,7 +2901,7 @@ static int slab_mem_going_online_callback(void *arg)
2921 ret = -ENOMEM; 2901 ret = -ENOMEM;
2922 goto out; 2902 goto out;
2923 } 2903 }
2924 init_kmem_cache_node(n); 2904 init_kmem_cache_node(n, s);
2925 s->node[nid] = n; 2905 s->node[nid] = n;
2926 } 2906 }
2927out: 2907out:
@@ -3073,7 +3053,7 @@ static int slab_unmergeable(struct kmem_cache *s)
3073 3053
3074static struct kmem_cache *find_mergeable(size_t size, 3054static struct kmem_cache *find_mergeable(size_t size,
3075 size_t align, unsigned long flags, const char *name, 3055 size_t align, unsigned long flags, const char *name,
3076 void (*ctor)(struct kmem_cache *, void *)) 3056 void (*ctor)(void *))
3077{ 3057{
3078 struct kmem_cache *s; 3058 struct kmem_cache *s;
3079 3059
@@ -3113,8 +3093,7 @@ static struct kmem_cache *find_mergeable(size_t size,
3113} 3093}
3114 3094
3115struct kmem_cache *kmem_cache_create(const char *name, size_t size, 3095struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3116 size_t align, unsigned long flags, 3096 size_t align, unsigned long flags, void (*ctor)(void *))
3117 void (*ctor)(struct kmem_cache *, void *))
3118{ 3097{
3119 struct kmem_cache *s; 3098 struct kmem_cache *s;
3120 3099
@@ -3317,12 +3296,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3317 s->name, page); 3296 s->name, page);
3318 3297
3319 if (s->flags & DEBUG_DEFAULT_FLAGS) { 3298 if (s->flags & DEBUG_DEFAULT_FLAGS) {
3320 if (!SlabDebug(page)) 3299 if (!PageSlubDebug(page))
3321 printk(KERN_ERR "SLUB %s: SlabDebug not set " 3300 printk(KERN_ERR "SLUB %s: SlubDebug not set "
3322 "on slab 0x%p\n", s->name, page); 3301 "on slab 0x%p\n", s->name, page);
3323 } else { 3302 } else {
3324 if (SlabDebug(page)) 3303 if (PageSlubDebug(page))
3325 printk(KERN_ERR "SLUB %s: SlabDebug set on " 3304 printk(KERN_ERR "SLUB %s: SlubDebug set on "
3326 "slab 0x%p\n", s->name, page); 3305 "slab 0x%p\n", s->name, page);
3327 } 3306 }
3328} 3307}
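
init_kmem_cache_node() now takes the cache so it can size a per-node floor for the partial list from the object size: larger objects keep more partial slabs around to avoid hammering the page allocator. A standalone sketch of that clamp (the real MIN_PARTIAL/MAX_PARTIAL live in mm/slub.c; the values 5 and 10 are assumed here purely for illustration):

#include <linux/log2.h>

#define ASSUMED_MIN_PARTIAL     5       /* stand-in for slub's MIN_PARTIAL */
#define ASSUMED_MAX_PARTIAL     10      /* stand-in for slub's MAX_PARTIAL */

static unsigned long min_partial_for_size(unsigned long size)
{
        unsigned long min = ilog2(size);        /* e.g. 192-byte objects -> 7 */

        if (min < ASSUMED_MIN_PARTIAL)
                min = ASSUMED_MIN_PARTIAL;
        else if (min > ASSUMED_MAX_PARTIAL)
                min = ASSUMED_MAX_PARTIAL;
        return min;
}
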
diff --git a/mm/sparse.c b/mm/sparse.c
index 36511c7b5e2c..39db301b920d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -147,22 +147,41 @@ static inline int sparse_early_nid(struct mem_section *section)
147 return (section->section_mem_map >> SECTION_NID_SHIFT); 147 return (section->section_mem_map >> SECTION_NID_SHIFT);
148} 148}
149 149
150/* Record a memory area against a node. */ 150/* Validate the physical addressing limitations of the model */
151void __init memory_present(int nid, unsigned long start, unsigned long end) 151void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
152 unsigned long *end_pfn)
152{ 153{
153 unsigned long max_arch_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); 154 unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
154 unsigned long pfn;
155 155
156 /* 156 /*
157 * Sanity checks - do not allow an architecture to pass 157 * Sanity checks - do not allow an architecture to pass
158 * in larger pfns than the maximum scope of sparsemem: 158 * in larger pfns than the maximum scope of sparsemem:
159 */ 159 */
160 if (start >= max_arch_pfn) 160 if (*start_pfn > max_sparsemem_pfn) {
161 return; 161 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
162 if (end >= max_arch_pfn) 162 "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
163 end = max_arch_pfn; 163 *start_pfn, *end_pfn, max_sparsemem_pfn);
164 WARN_ON_ONCE(1);
165 *start_pfn = max_sparsemem_pfn;
166 *end_pfn = max_sparsemem_pfn;
167 }
168
169 if (*end_pfn > max_sparsemem_pfn) {
170 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
171 "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
172 *start_pfn, *end_pfn, max_sparsemem_pfn);
173 WARN_ON_ONCE(1);
174 *end_pfn = max_sparsemem_pfn;
175 }
176}
177
178/* Record a memory area against a node. */
179void __init memory_present(int nid, unsigned long start, unsigned long end)
180{
181 unsigned long pfn;
164 182
165 start &= PAGE_SECTION_MASK; 183 start &= PAGE_SECTION_MASK;
184 mminit_validate_memmodel_limits(&start, &end);
166 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { 185 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
167 unsigned long section = pfn_to_section_nr(pfn); 186 unsigned long section = pfn_to_section_nr(pfn);
168 struct mem_section *ms; 187 struct mem_section *ms;
@@ -187,6 +206,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
187 unsigned long pfn; 206 unsigned long pfn;
188 unsigned long nr_pages = 0; 207 unsigned long nr_pages = 0;
189 208
209 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
190 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 210 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
191 if (nid != early_pfn_to_nid(pfn)) 211 if (nid != early_pfn_to_nid(pfn))
192 continue; 212 continue;
@@ -248,16 +268,92 @@ static unsigned long *__kmalloc_section_usemap(void)
248} 268}
249#endif /* CONFIG_MEMORY_HOTPLUG */ 269#endif /* CONFIG_MEMORY_HOTPLUG */
250 270
271#ifdef CONFIG_MEMORY_HOTREMOVE
272static unsigned long * __init
273sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
274{
275 unsigned long section_nr;
276
277 /*
278 * A page may contain usemaps for other sections preventing the
279 * page being freed and making a section unremovable while
280 * other sections referencing the usemap retmain active. Similarly,
281 * a pgdat can prevent a section being removed. If section A
282 * contains a pgdat and section B contains the usemap, both
283 * sections become inter-dependent. This allocates usemaps
284 * from the same section as the pgdat where possible to avoid
285 * this problem.
286 */
287 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
288 return alloc_bootmem_section(usemap_size(), section_nr);
289}
290
291static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
292{
293 unsigned long usemap_snr, pgdat_snr;
294 static unsigned long old_usemap_snr = NR_MEM_SECTIONS;
295 static unsigned long old_pgdat_snr = NR_MEM_SECTIONS;
296 struct pglist_data *pgdat = NODE_DATA(nid);
297 int usemap_nid;
298
299 usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
300 pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
301 if (usemap_snr == pgdat_snr)
302 return;
303
304 if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
305 /* skip redundant message */
306 return;
307
308 old_usemap_snr = usemap_snr;
309 old_pgdat_snr = pgdat_snr;
310
311 usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
312 if (usemap_nid != nid) {
313 printk(KERN_INFO
314 "node %d must be removed before remove section %ld\n",
315 nid, usemap_snr);
316 return;
317 }
318 /*
319 * There is a circular dependency.
320 * Some platforms allow un-removable section because they will just
321 * gather other removable sections for dynamic partitioning.
322 * Just notify un-removable section's number here.
323 */
324 printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr,
325 pgdat_snr, nid);
326 printk(KERN_CONT
327 " have a circular dependency on usemap and pgdat allocations\n");
328}
329#else
330static unsigned long * __init
331sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
332{
333 return NULL;
334}
335
336static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
337{
338}
339#endif /* CONFIG_MEMORY_HOTREMOVE */
340
251static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) 341static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
252{ 342{
253 unsigned long *usemap; 343 unsigned long *usemap;
254 struct mem_section *ms = __nr_to_section(pnum); 344 struct mem_section *ms = __nr_to_section(pnum);
255 int nid = sparse_early_nid(ms); 345 int nid = sparse_early_nid(ms);
256 346
257 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); 347 usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
258 if (usemap) 348 if (usemap)
259 return usemap; 349 return usemap;
260 350
351 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
352 if (usemap) {
353 check_usemap_section_nr(nid, usemap);
354 return usemap;
355 }
356
261 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ 357 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
262 nid = 0; 358 nid = 0;
263 359
@@ -280,7 +376,7 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
280} 376}
281#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 377#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
282 378
283struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 379static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
284{ 380{
285 struct page *map; 381 struct page *map;
286 struct mem_section *ms = __nr_to_section(pnum); 382 struct mem_section *ms = __nr_to_section(pnum);
diff --git a/mm/swap.c b/mm/swap.c
index 45c9f25a8a3b..9e0cb3118079 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,9 +34,9 @@
34/* How many pages do we try to swap or page in/out together? */ 34/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 35int page_cluster;
36 36
37static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; 37static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs);
38static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; 38static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs);
39static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; 39static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
40 40
41/* 41/*
42 * This path almost never happens for VM activity - pages are normally 42 * This path almost never happens for VM activity - pages are normally
@@ -278,9 +278,10 @@ int lru_add_drain_all(void)
278 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it 278 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
279 * for the remainder of the operation. 279 * for the remainder of the operation.
280 * 280 *
281 * The locking in this function is against shrink_cache(): we recheck the 281 * The locking in this function is against shrink_inactive_list(): we recheck
282 * page count inside the lock to see whether shrink_cache grabbed the page 282 * the page count inside the lock to see whether shrink_inactive_list()
283 * via the LRU. If it did, give up: shrink_cache will free it. 283 * grabbed the page via the LRU. If it did, give up: shrink_inactive_list()
284 * will free it.
284 */ 285 */
285void release_pages(struct page **pages, int nr, int cold) 286void release_pages(struct page **pages, int nr, int cold)
286{ 287{
@@ -443,7 +444,7 @@ void pagevec_strip(struct pagevec *pvec)
443 for (i = 0; i < pagevec_count(pvec); i++) { 444 for (i = 0; i < pagevec_count(pvec); i++) {
444 struct page *page = pvec->pages[i]; 445 struct page *page = pvec->pages[i];
445 446
446 if (PagePrivate(page) && !TestSetPageLocked(page)) { 447 if (PagePrivate(page) && trylock_page(page)) {
447 if (PagePrivate(page)) 448 if (PagePrivate(page))
448 try_to_release_page(page, 0); 449 try_to_release_page(page, 0);
449 unlock_page(page); 450 unlock_page(page);
@@ -493,7 +494,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
493 */ 494 */
494#define ACCT_THRESHOLD max(16, NR_CPUS * 2) 495#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
495 496
496static DEFINE_PER_CPU(long, committed_space) = 0; 497static DEFINE_PER_CPU(long, committed_space);
497 498
498void vm_acct_memory(long pages) 499void vm_acct_memory(long pages)
499{ 500{
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d8aadaf2a0ba..167cf2dc8a03 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -39,7 +39,7 @@ static struct backing_dev_info swap_backing_dev_info = {
39 39
40struct address_space swapper_space = { 40struct address_space swapper_space = {
41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), 41 .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
42 .tree_lock = __RW_LOCK_UNLOCKED(swapper_space.tree_lock), 42 .tree_lock = __SPIN_LOCK_UNLOCKED(swapper_space.tree_lock),
43 .a_ops = &swap_aops, 43 .a_ops = &swap_aops,
44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), 44 .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
45 .backing_dev_info = &swap_backing_dev_info, 45 .backing_dev_info = &swap_backing_dev_info,
@@ -56,7 +56,8 @@ static struct {
56 56
57void show_swap_cache_info(void) 57void show_swap_cache_info(void)
58{ 58{
59 printk("Swap cache: add %lu, delete %lu, find %lu/%lu\n", 59 printk("%lu pages in swap cache\n", total_swapcache_pages);
60 printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
60 swap_cache_info.add_total, swap_cache_info.del_total, 61 swap_cache_info.add_total, swap_cache_info.del_total,
61 swap_cache_info.find_success, swap_cache_info.find_total); 62 swap_cache_info.find_success, swap_cache_info.find_total);
62 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10)); 63 printk("Free swap = %lukB\n", nr_swap_pages << (PAGE_SHIFT - 10));
@@ -64,7 +65,7 @@ void show_swap_cache_info(void)
64} 65}
65 66
66/* 67/*
67 * add_to_swap_cache resembles add_to_page_cache on swapper_space, 68 * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
68 * but sets SwapCache flag and private instead of mapping and index. 69 * but sets SwapCache flag and private instead of mapping and index.
69 */ 70 */
70int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) 71int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
@@ -76,19 +77,26 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
76 BUG_ON(PagePrivate(page)); 77 BUG_ON(PagePrivate(page));
77 error = radix_tree_preload(gfp_mask); 78 error = radix_tree_preload(gfp_mask);
78 if (!error) { 79 if (!error) {
79 write_lock_irq(&swapper_space.tree_lock); 80 page_cache_get(page);
81 SetPageSwapCache(page);
82 set_page_private(page, entry.val);
83
84 spin_lock_irq(&swapper_space.tree_lock);
80 error = radix_tree_insert(&swapper_space.page_tree, 85 error = radix_tree_insert(&swapper_space.page_tree,
81 entry.val, page); 86 entry.val, page);
82 if (!error) { 87 if (likely(!error)) {
83 page_cache_get(page);
84 SetPageSwapCache(page);
85 set_page_private(page, entry.val);
86 total_swapcache_pages++; 88 total_swapcache_pages++;
87 __inc_zone_page_state(page, NR_FILE_PAGES); 89 __inc_zone_page_state(page, NR_FILE_PAGES);
88 INC_CACHE_INFO(add_total); 90 INC_CACHE_INFO(add_total);
89 } 91 }
90 write_unlock_irq(&swapper_space.tree_lock); 92 spin_unlock_irq(&swapper_space.tree_lock);
91 radix_tree_preload_end(); 93 radix_tree_preload_end();
94
95 if (unlikely(error)) {
96 set_page_private(page, 0UL);
97 ClearPageSwapCache(page);
98 page_cache_release(page);
99 }
92 } 100 }
93 return error; 101 return error;
94} 102}
@@ -175,9 +183,9 @@ void delete_from_swap_cache(struct page *page)
175 183
176 entry.val = page_private(page); 184 entry.val = page_private(page);
177 185
178 write_lock_irq(&swapper_space.tree_lock); 186 spin_lock_irq(&swapper_space.tree_lock);
179 __delete_from_swap_cache(page); 187 __delete_from_swap_cache(page);
180 write_unlock_irq(&swapper_space.tree_lock); 188 spin_unlock_irq(&swapper_space.tree_lock);
181 189
182 swap_free(entry); 190 swap_free(entry);
183 page_cache_release(page); 191 page_cache_release(page);
@@ -193,7 +201,7 @@ void delete_from_swap_cache(struct page *page)
193 */ 201 */
194static inline void free_swap_cache(struct page *page) 202static inline void free_swap_cache(struct page *page)
195{ 203{
196 if (PageSwapCache(page) && !TestSetPageLocked(page)) { 204 if (PageSwapCache(page) && trylock_page(page)) {
197 remove_exclusive_swap_page(page); 205 remove_exclusive_swap_page(page);
198 unlock_page(page); 206 unlock_page(page);
199 } 207 }
@@ -294,9 +302,9 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
294 * re-using the just freed swap entry for an existing page. 302 * re-using the just freed swap entry for an existing page.
295 * May fail (-ENOMEM) if radix-tree node allocation failed. 303 * May fail (-ENOMEM) if radix-tree node allocation failed.
296 */ 304 */
297 SetPageLocked(new_page); 305 set_page_locked(new_page);
298 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); 306 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
299 if (!err) { 307 if (likely(!err)) {
300 /* 308 /*
301 * Initiate read into locked page and return. 309 * Initiate read into locked page and return.
302 */ 310 */
@@ -304,7 +312,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
304 swap_readpage(NULL, new_page); 312 swap_readpage(NULL, new_page);
305 return new_page; 313 return new_page;
306 } 314 }
307 ClearPageLocked(new_page); 315 clear_page_locked(new_page);
308 swap_free(entry); 316 swap_free(entry);
309 } while (err != -ENOMEM); 317 } while (err != -ENOMEM);
310 318
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bd1bb5920306..1e330f2998fa 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -33,17 +33,18 @@
33#include <asm/tlbflush.h> 33#include <asm/tlbflush.h>
34#include <linux/swapops.h> 34#include <linux/swapops.h>
35 35
36DEFINE_SPINLOCK(swap_lock); 36static DEFINE_SPINLOCK(swap_lock);
37unsigned int nr_swapfiles; 37static unsigned int nr_swapfiles;
38long total_swap_pages; 38long total_swap_pages;
39static int swap_overflow; 39static int swap_overflow;
40static int least_priority;
40 41
41static const char Bad_file[] = "Bad swap file entry "; 42static const char Bad_file[] = "Bad swap file entry ";
42static const char Unused_file[] = "Unused swap file entry "; 43static const char Unused_file[] = "Unused swap file entry ";
43static const char Bad_offset[] = "Bad swap offset entry "; 44static const char Bad_offset[] = "Bad swap offset entry ";
44static const char Unused_offset[] = "Unused swap offset entry "; 45static const char Unused_offset[] = "Unused swap offset entry ";
45 46
46struct swap_list_t swap_list = {-1, -1}; 47static struct swap_list_t swap_list = {-1, -1};
47 48
48static struct swap_info_struct swap_info[MAX_SWAPFILES]; 49static struct swap_info_struct swap_info[MAX_SWAPFILES];
49 50
@@ -368,13 +369,13 @@ int remove_exclusive_swap_page(struct page *page)
368 retval = 0; 369 retval = 0;
369 if (p->swap_map[swp_offset(entry)] == 1) { 370 if (p->swap_map[swp_offset(entry)] == 1) {
370 /* Recheck the page count with the swapcache lock held.. */ 371 /* Recheck the page count with the swapcache lock held.. */
371 write_lock_irq(&swapper_space.tree_lock); 372 spin_lock_irq(&swapper_space.tree_lock);
372 if ((page_count(page) == 2) && !PageWriteback(page)) { 373 if ((page_count(page) == 2) && !PageWriteback(page)) {
373 __delete_from_swap_cache(page); 374 __delete_from_swap_cache(page);
374 SetPageDirty(page); 375 SetPageDirty(page);
375 retval = 1; 376 retval = 1;
376 } 377 }
377 write_unlock_irq(&swapper_space.tree_lock); 378 spin_unlock_irq(&swapper_space.tree_lock);
378 } 379 }
379 spin_unlock(&swap_lock); 380 spin_unlock(&swap_lock);
380 381
@@ -402,7 +403,7 @@ void free_swap_and_cache(swp_entry_t entry)
402 if (p) { 403 if (p) {
403 if (swap_entry_free(p, swp_offset(entry)) == 1) { 404 if (swap_entry_free(p, swp_offset(entry)) == 1) {
404 page = find_get_page(&swapper_space, entry.val); 405 page = find_get_page(&swapper_space, entry.val);
405 if (page && unlikely(TestSetPageLocked(page))) { 406 if (page && unlikely(!trylock_page(page))) {
406 page_cache_release(page); 407 page_cache_release(page);
407 page = NULL; 408 page = NULL;
408 } 409 }
@@ -655,8 +656,8 @@ static int unuse_mm(struct mm_struct *mm,
655 656
656 if (!down_read_trylock(&mm->mmap_sem)) { 657 if (!down_read_trylock(&mm->mmap_sem)) {
657 /* 658 /*
658 * Activate page so shrink_cache is unlikely to unmap its 659 * Activate page so shrink_inactive_list is unlikely to unmap
659 * ptes while lock is dropped, so swapoff can make progress. 660 * its ptes while lock is dropped, so swapoff can make progress.
660 */ 661 */
661 activate_page(page); 662 activate_page(page);
662 unlock_page(page); 663 unlock_page(page);
@@ -1260,6 +1261,11 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1260 /* just pick something that's safe... */ 1261 /* just pick something that's safe... */
1261 swap_list.next = swap_list.head; 1262 swap_list.next = swap_list.head;
1262 } 1263 }
1264 if (p->prio < 0) {
1265 for (i = p->next; i >= 0; i = swap_info[i].next)
1266 swap_info[i].prio = p->prio--;
1267 least_priority++;
1268 }
1263 nr_swap_pages -= p->pages; 1269 nr_swap_pages -= p->pages;
1264 total_swap_pages -= p->pages; 1270 total_swap_pages -= p->pages;
1265 p->flags &= ~SWP_WRITEOK; 1271 p->flags &= ~SWP_WRITEOK;
@@ -1272,9 +1278,14 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1272 if (err) { 1278 if (err) {
1273 /* re-insert swap space back into swap_list */ 1279 /* re-insert swap space back into swap_list */
1274 spin_lock(&swap_lock); 1280 spin_lock(&swap_lock);
1275 for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) 1281 if (p->prio < 0)
1282 p->prio = --least_priority;
1283 prev = -1;
1284 for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
1276 if (p->prio >= swap_info[i].prio) 1285 if (p->prio >= swap_info[i].prio)
1277 break; 1286 break;
1287 prev = i;
1288 }
1278 p->next = i; 1289 p->next = i;
1279 if (prev < 0) 1290 if (prev < 0)
1280 swap_list.head = swap_list.next = p - swap_info; 1291 swap_list.head = swap_list.next = p - swap_info;
@@ -1447,7 +1458,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1447 unsigned int type; 1458 unsigned int type;
1448 int i, prev; 1459 int i, prev;
1449 int error; 1460 int error;
1450 static int least_priority;
1451 union swap_header *swap_header = NULL; 1461 union swap_header *swap_header = NULL;
1452 int swap_header_version; 1462 int swap_header_version;
1453 unsigned int nr_good_pages = 0; 1463 unsigned int nr_good_pages = 0;
@@ -1455,7 +1465,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1455 sector_t span; 1465 sector_t span;
1456 unsigned long maxpages = 1; 1466 unsigned long maxpages = 1;
1457 int swapfilesize; 1467 int swapfilesize;
1458 unsigned short *swap_map; 1468 unsigned short *swap_map = NULL;
1459 struct page *page = NULL; 1469 struct page *page = NULL;
1460 struct inode *inode = NULL; 1470 struct inode *inode = NULL;
1461 int did_down = 0; 1471 int did_down = 0;
@@ -1474,22 +1484,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1474 } 1484 }
1475 if (type >= nr_swapfiles) 1485 if (type >= nr_swapfiles)
1476 nr_swapfiles = type+1; 1486 nr_swapfiles = type+1;
1487 memset(p, 0, sizeof(*p));
1477 INIT_LIST_HEAD(&p->extent_list); 1488 INIT_LIST_HEAD(&p->extent_list);
1478 p->flags = SWP_USED; 1489 p->flags = SWP_USED;
1479 p->swap_file = NULL;
1480 p->old_block_size = 0;
1481 p->swap_map = NULL;
1482 p->lowest_bit = 0;
1483 p->highest_bit = 0;
1484 p->cluster_nr = 0;
1485 p->inuse_pages = 0;
1486 p->next = -1; 1490 p->next = -1;
1487 if (swap_flags & SWAP_FLAG_PREFER) {
1488 p->prio =
1489 (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
1490 } else {
1491 p->prio = --least_priority;
1492 }
1493 spin_unlock(&swap_lock); 1491 spin_unlock(&swap_lock);
1494 name = getname(specialfile); 1492 name = getname(specialfile);
1495 error = PTR_ERR(name); 1493 error = PTR_ERR(name);
@@ -1632,19 +1630,20 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1632 goto bad_swap; 1630 goto bad_swap;
1633 1631
1634 /* OK, set up the swap map and apply the bad block list */ 1632 /* OK, set up the swap map and apply the bad block list */
1635 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { 1633 swap_map = vmalloc(maxpages * sizeof(short));
1634 if (!swap_map) {
1636 error = -ENOMEM; 1635 error = -ENOMEM;
1637 goto bad_swap; 1636 goto bad_swap;
1638 } 1637 }
1639 1638
1640 error = 0; 1639 error = 0;
1641 memset(p->swap_map, 0, maxpages * sizeof(short)); 1640 memset(swap_map, 0, maxpages * sizeof(short));
1642 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1641 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1643 int page_nr = swap_header->info.badpages[i]; 1642 int page_nr = swap_header->info.badpages[i];
1644 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) 1643 if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
1645 error = -EINVAL; 1644 error = -EINVAL;
1646 else 1645 else
1647 p->swap_map[page_nr] = SWAP_MAP_BAD; 1646 swap_map[page_nr] = SWAP_MAP_BAD;
1648 } 1647 }
1649 nr_good_pages = swap_header->info.last_page - 1648 nr_good_pages = swap_header->info.last_page -
1650 swap_header->info.nr_badpages - 1649 swap_header->info.nr_badpages -
@@ -1654,7 +1653,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1654 } 1653 }
1655 1654
1656 if (nr_good_pages) { 1655 if (nr_good_pages) {
1657 p->swap_map[0] = SWAP_MAP_BAD; 1656 swap_map[0] = SWAP_MAP_BAD;
1658 p->max = maxpages; 1657 p->max = maxpages;
1659 p->pages = nr_good_pages; 1658 p->pages = nr_good_pages;
1660 nr_extents = setup_swap_extents(p, &span); 1659 nr_extents = setup_swap_extents(p, &span);
@@ -1672,6 +1671,12 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1672 1671
1673 mutex_lock(&swapon_mutex); 1672 mutex_lock(&swapon_mutex);
1674 spin_lock(&swap_lock); 1673 spin_lock(&swap_lock);
1674 if (swap_flags & SWAP_FLAG_PREFER)
1675 p->prio =
1676 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
1677 else
1678 p->prio = --least_priority;
1679 p->swap_map = swap_map;
1675 p->flags = SWP_ACTIVE; 1680 p->flags = SWP_ACTIVE;
1676 nr_swap_pages += nr_good_pages; 1681 nr_swap_pages += nr_good_pages;
1677 total_swap_pages += nr_good_pages; 1682 total_swap_pages += nr_good_pages;
@@ -1707,12 +1712,8 @@ bad_swap:
1707 destroy_swap_extents(p); 1712 destroy_swap_extents(p);
1708bad_swap_2: 1713bad_swap_2:
1709 spin_lock(&swap_lock); 1714 spin_lock(&swap_lock);
1710 swap_map = p->swap_map;
1711 p->swap_file = NULL; 1715 p->swap_file = NULL;
1712 p->swap_map = NULL;
1713 p->flags = 0; 1716 p->flags = 0;
1714 if (!(swap_flags & SWAP_FLAG_PREFER))
1715 ++least_priority;
1716 spin_unlock(&swap_lock); 1717 spin_unlock(&swap_lock);
1717 vfree(swap_map); 1718 vfree(swap_map);
1718 if (swap_file) 1719 if (swap_file)
diff --git a/mm/truncate.c b/mm/truncate.c
index b8961cb63414..250505091d37 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -104,7 +104,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
104 cancel_dirty_page(page, PAGE_CACHE_SIZE); 104 cancel_dirty_page(page, PAGE_CACHE_SIZE);
105 105
106 remove_from_page_cache(page); 106 remove_from_page_cache(page);
107 ClearPageUptodate(page);
108 ClearPageMappedToDisk(page); 107 ClearPageMappedToDisk(page);
109 page_cache_release(page); /* pagecache ref */ 108 page_cache_release(page); /* pagecache ref */
110} 109}
@@ -188,7 +187,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
188 if (page_index > next) 187 if (page_index > next)
189 next = page_index; 188 next = page_index;
190 next++; 189 next++;
191 if (TestSetPageLocked(page)) 190 if (!trylock_page(page))
192 continue; 191 continue;
193 if (PageWriteback(page)) { 192 if (PageWriteback(page)) {
194 unlock_page(page); 193 unlock_page(page);
@@ -281,7 +280,7 @@ unsigned long __invalidate_mapping_pages(struct address_space *mapping,
281 pgoff_t index; 280 pgoff_t index;
282 int lock_failed; 281 int lock_failed;
283 282
284 lock_failed = TestSetPageLocked(page); 283 lock_failed = !trylock_page(page);
285 284
286 /* 285 /*
287 * We really shouldn't be looking at the ->index of an 286 * We really shouldn't be looking at the ->index of an
@@ -349,18 +348,17 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
349 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) 348 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL))
350 return 0; 349 return 0;
351 350
352 write_lock_irq(&mapping->tree_lock); 351 spin_lock_irq(&mapping->tree_lock);
353 if (PageDirty(page)) 352 if (PageDirty(page))
354 goto failed; 353 goto failed;
355 354
356 BUG_ON(PagePrivate(page)); 355 BUG_ON(PagePrivate(page));
357 __remove_from_page_cache(page); 356 __remove_from_page_cache(page);
358 write_unlock_irq(&mapping->tree_lock); 357 spin_unlock_irq(&mapping->tree_lock);
359 ClearPageUptodate(page);
360 page_cache_release(page); /* pagecache ref */ 358 page_cache_release(page); /* pagecache ref */
361 return 1; 359 return 1;
362failed: 360failed:
363 write_unlock_irq(&mapping->tree_lock); 361 spin_unlock_irq(&mapping->tree_lock);
364 return 0; 362 return 0;
365} 363}
366 364
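
Like swap_state.c above (and vmscan.c below), truncate.c picks up the conversion of address_space->tree_lock from a rwlock to a spinlock, so radix-tree updates now sit under spin_lock_irq(). A minimal sketch of the resulting pattern, modelled on the invalidate_complete_page2() hunk (the helper name is invented; the caller is assumed to hold the page lock):

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static void drop_clean_page(struct address_space *mapping, struct page *page)
{
        spin_lock_irq(&mapping->tree_lock);     /* was write_lock_irq() */
        __remove_from_page_cache(page);
        spin_unlock_irq(&mapping->tree_lock);
        page_cache_release(page);               /* the pagecache reference */
}
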
diff --git a/mm/util.c b/mm/util.c
index 8f18683825bc..cb00b748ce47 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,7 +1,9 @@
1#include <linux/mm.h>
1#include <linux/slab.h> 2#include <linux/slab.h>
2#include <linux/string.h> 3#include <linux/string.h>
3#include <linux/module.h> 4#include <linux/module.h>
4#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h>
5#include <asm/uaccess.h> 7#include <asm/uaccess.h>
6 8
7/** 9/**
@@ -68,25 +70,22 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
68EXPORT_SYMBOL(kmemdup); 70EXPORT_SYMBOL(kmemdup);
69 71
70/** 72/**
71 * krealloc - reallocate memory. The contents will remain unchanged. 73 * __krealloc - like krealloc() but don't free @p.
72 * @p: object to reallocate memory for. 74 * @p: object to reallocate memory for.
73 * @new_size: how many bytes of memory are required. 75 * @new_size: how many bytes of memory are required.
74 * @flags: the type of memory to allocate. 76 * @flags: the type of memory to allocate.
75 * 77 *
76 * The contents of the object pointed to are preserved up to the 78 * This function is like krealloc() except it never frees the originally
77 * lesser of the new and old sizes. If @p is %NULL, krealloc() 79 * allocated buffer. Use this if you don't want to free the buffer immediately
78 * behaves exactly like kmalloc(). If @size is 0 and @p is not a 80 * like, for example, with RCU.
79 * %NULL pointer, the object pointed to is freed.
80 */ 81 */
81void *krealloc(const void *p, size_t new_size, gfp_t flags) 82void *__krealloc(const void *p, size_t new_size, gfp_t flags)
82{ 83{
83 void *ret; 84 void *ret;
84 size_t ks = 0; 85 size_t ks = 0;
85 86
86 if (unlikely(!new_size)) { 87 if (unlikely(!new_size))
87 kfree(p);
88 return ZERO_SIZE_PTR; 88 return ZERO_SIZE_PTR;
89 }
90 89
91 if (p) 90 if (p)
92 ks = ksize(p); 91 ks = ksize(p);
@@ -95,10 +94,37 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
95 return (void *)p; 94 return (void *)p;
96 95
97 ret = kmalloc_track_caller(new_size, flags); 96 ret = kmalloc_track_caller(new_size, flags);
98 if (ret && p) { 97 if (ret && p)
99 memcpy(ret, p, ks); 98 memcpy(ret, p, ks);
99
100 return ret;
101}
102EXPORT_SYMBOL(__krealloc);
103
104/**
105 * krealloc - reallocate memory. The contents will remain unchanged.
106 * @p: object to reallocate memory for.
107 * @new_size: how many bytes of memory are required.
108 * @flags: the type of memory to allocate.
109 *
110 * The contents of the object pointed to are preserved up to the
111 * lesser of the new and old sizes. If @p is %NULL, krealloc()
112 * behaves exactly like kmalloc(). If @size is 0 and @p is not a
113 * %NULL pointer, the object pointed to is freed.
114 */
115void *krealloc(const void *p, size_t new_size, gfp_t flags)
116{
117 void *ret;
118
119 if (unlikely(!new_size)) {
100 kfree(p); 120 kfree(p);
121 return ZERO_SIZE_PTR;
101 } 122 }
123
124 ret = __krealloc(p, new_size, flags);
125 if (ret && p != ret)
126 kfree(p);
127
102 return ret; 128 return ret;
103} 129}
104EXPORT_SYMBOL(krealloc); 130EXPORT_SYMBOL(krealloc);
@@ -136,3 +162,27 @@ char *strndup_user(const char __user *s, long n)
136 return p; 162 return p;
137} 163}
138EXPORT_SYMBOL(strndup_user); 164EXPORT_SYMBOL(strndup_user);
165
166#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT
167void arch_pick_mmap_layout(struct mm_struct *mm)
168{
169 mm->mmap_base = TASK_UNMAPPED_BASE;
170 mm->get_unmapped_area = arch_get_unmapped_area;
171 mm->unmap_area = arch_unmap_area;
172}
173#endif
174
175int __attribute__((weak)) get_user_pages_fast(unsigned long start,
176 int nr_pages, int write, struct page **pages)
177{
178 struct mm_struct *mm = current->mm;
179 int ret;
180
181 down_read(&mm->mmap_sem);
182 ret = get_user_pages(current, mm, start, nr_pages,
183 write, 0, pages, NULL);
184 up_read(&mm->mmap_sem);
185
186 return ret;
187}
188EXPORT_SYMBOL_GPL(get_user_pages_fast);
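
__krealloc() is split out above precisely because krealloc() frees the old buffer, which is unsafe while concurrent readers may still be dereferencing it. A hedged sketch of the RCU-style usage the new kerneldoc hints at (struct cfg and cfg_resize() are invented for illustration):

#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg {
        char *buf;              /* dereferenced under rcu_read_lock() */
};

static int cfg_resize(struct cfg *c, size_t new_size, gfp_t gfp)
{
        char *old = c->buf;
        char *new = __krealloc(old, new_size, gfp);

        if (!new)
                return -ENOMEM;
        rcu_assign_pointer(c->buf, new);
        synchronize_rcu();              /* wait out readers of the old buffer */
        if (new != old)
                kfree(old);             /* __krealloc() deliberately kept it */
        return 0;
}
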
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6e45b0f3d125..85b9a0d2c877 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -381,16 +381,14 @@ static void __vunmap(const void *addr, int deallocate_pages)
381 return; 381 return;
382 382
383 if ((PAGE_SIZE-1) & (unsigned long)addr) { 383 if ((PAGE_SIZE-1) & (unsigned long)addr) {
384 printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); 384 WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
385 WARN_ON(1);
386 return; 385 return;
387 } 386 }
388 387
389 area = remove_vm_area(addr); 388 area = remove_vm_area(addr);
390 if (unlikely(!area)) { 389 if (unlikely(!area)) {
391 printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", 390 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
392 addr); 391 addr);
393 WARN_ON(1);
394 return; 392 return;
395 } 393 }
396 394
@@ -931,6 +929,25 @@ static void s_stop(struct seq_file *m, void *p)
931 read_unlock(&vmlist_lock); 929 read_unlock(&vmlist_lock);
932} 930}
933 931
932static void show_numa_info(struct seq_file *m, struct vm_struct *v)
933{
934 if (NUMA_BUILD) {
935 unsigned int nr, *counters = m->private;
936
937 if (!counters)
938 return;
939
940 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
941
942 for (nr = 0; nr < v->nr_pages; nr++)
943 counters[page_to_nid(v->pages[nr])]++;
944
945 for_each_node_state(nr, N_HIGH_MEMORY)
946 if (counters[nr])
947 seq_printf(m, " N%u=%u", nr, counters[nr]);
948 }
949}
950
934static int s_show(struct seq_file *m, void *p) 951static int s_show(struct seq_file *m, void *p)
935{ 952{
936 struct vm_struct *v = p; 953 struct vm_struct *v = p;
@@ -967,6 +984,7 @@ static int s_show(struct seq_file *m, void *p)
967 if (v->flags & VM_VPAGES) 984 if (v->flags & VM_VPAGES)
968 seq_printf(m, " vpages"); 985 seq_printf(m, " vpages");
969 986
987 show_numa_info(m, v);
970 seq_putc(m, '\n'); 988 seq_putc(m, '\n');
971 return 0; 989 return 0;
972} 990}
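
Effect of the hunk above, shown on a hypothetical /proc/vmallocinfo line (address range, caller and counts are made up; only the trailing N<node>=<pages> tokens come from show_numa_info()):

0xffffc20000016000-0xffffc2000001d000   28672 alloc_large_system_hash+0x1c2/0x2c0 pages=6 vmalloc N0=4 N1=2
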
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 967d30ccd92b..1ff1a58e7c10 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -38,6 +38,7 @@
38#include <linux/kthread.h> 38#include <linux/kthread.h>
39#include <linux/freezer.h> 39#include <linux/freezer.h>
40#include <linux/memcontrol.h> 40#include <linux/memcontrol.h>
41#include <linux/delayacct.h>
41 42
42#include <asm/tlbflush.h> 43#include <asm/tlbflush.h>
43#include <asm/div64.h> 44#include <asm/div64.h>
@@ -390,17 +391,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
390} 391}
391 392
392/* 393/*
393 * Attempt to detach a locked page from its ->mapping. If it is dirty or if 394 * Same as remove_mapping, but if the page is removed from the mapping, it
394 * someone else has a ref on the page, abort and return 0. If it was 395 * gets returned with a refcount of 0.
395 * successfully detached, return 1. Assumes the caller has a single ref on
396 * this page.
397 */ 396 */
398int remove_mapping(struct address_space *mapping, struct page *page) 397static int __remove_mapping(struct address_space *mapping, struct page *page)
399{ 398{
400 BUG_ON(!PageLocked(page)); 399 BUG_ON(!PageLocked(page));
401 BUG_ON(mapping != page_mapping(page)); 400 BUG_ON(mapping != page_mapping(page));
402 401
403 write_lock_irq(&mapping->tree_lock); 402 spin_lock_irq(&mapping->tree_lock);
404 /* 403 /*
405 * The non racy check for a busy page. 404 * The non racy check for a busy page.
406 * 405 *
@@ -426,28 +425,48 @@ int remove_mapping(struct address_space *mapping, struct page *page)
426 * Note that if SetPageDirty is always performed via set_page_dirty, 425 * Note that if SetPageDirty is always performed via set_page_dirty,
427 * and thus under tree_lock, then this ordering is not required. 426 * and thus under tree_lock, then this ordering is not required.
428 */ 427 */
429 if (unlikely(page_count(page) != 2)) 428 if (!page_freeze_refs(page, 2))
430 goto cannot_free; 429 goto cannot_free;
431 smp_rmb(); 430 /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
432 if (unlikely(PageDirty(page))) 431 if (unlikely(PageDirty(page))) {
432 page_unfreeze_refs(page, 2);
433 goto cannot_free; 433 goto cannot_free;
434 }
434 435
435 if (PageSwapCache(page)) { 436 if (PageSwapCache(page)) {
436 swp_entry_t swap = { .val = page_private(page) }; 437 swp_entry_t swap = { .val = page_private(page) };
437 __delete_from_swap_cache(page); 438 __delete_from_swap_cache(page);
438 write_unlock_irq(&mapping->tree_lock); 439 spin_unlock_irq(&mapping->tree_lock);
439 swap_free(swap); 440 swap_free(swap);
440 __put_page(page); /* The pagecache ref */ 441 } else {
441 return 1; 442 __remove_from_page_cache(page);
443 spin_unlock_irq(&mapping->tree_lock);
442 } 444 }
443 445
444 __remove_from_page_cache(page);
445 write_unlock_irq(&mapping->tree_lock);
446 __put_page(page);
447 return 1; 446 return 1;
448 447
449cannot_free: 448cannot_free:
450 write_unlock_irq(&mapping->tree_lock); 449 spin_unlock_irq(&mapping->tree_lock);
450 return 0;
451}
452
453/*
454 * Attempt to detach a locked page from its ->mapping. If it is dirty or if
455 * someone else has a ref on the page, abort and return 0. If it was
456 * successfully detached, return 1. Assumes the caller has a single ref on
457 * this page.
458 */
459int remove_mapping(struct address_space *mapping, struct page *page)
460{
461 if (__remove_mapping(mapping, page)) {
462 /*
463 * Unfreezing the refcount with 1 rather than 2 effectively
464 * drops the pagecache ref for us without requiring another
465 * atomic operation.
466 */
467 page_unfreeze_refs(page, 1);
468 return 1;
469 }
451 return 0; 470 return 0;
452} 471}
453 472
@@ -477,7 +496,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
477 page = lru_to_page(page_list); 496 page = lru_to_page(page_list);
478 list_del(&page->lru); 497 list_del(&page->lru);
479 498
480 if (TestSetPageLocked(page)) 499 if (!trylock_page(page))
481 goto keep; 500 goto keep;
482 501
483 VM_BUG_ON(PageActive(page)); 502 VM_BUG_ON(PageActive(page));
@@ -563,7 +582,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
563 * A synchronous write - probably a ramdisk. Go 582 * A synchronous write - probably a ramdisk. Go
564 * ahead and try to reclaim the page. 583 * ahead and try to reclaim the page.
565 */ 584 */
566 if (TestSetPageLocked(page)) 585 if (!trylock_page(page))
567 goto keep; 586 goto keep;
568 if (PageDirty(page) || PageWriteback(page)) 587 if (PageDirty(page) || PageWriteback(page))
569 goto keep_locked; 588 goto keep_locked;
@@ -597,18 +616,34 @@ static unsigned long shrink_page_list(struct list_head *page_list,
597 if (PagePrivate(page)) { 616 if (PagePrivate(page)) {
598 if (!try_to_release_page(page, sc->gfp_mask)) 617 if (!try_to_release_page(page, sc->gfp_mask))
599 goto activate_locked; 618 goto activate_locked;
600 if (!mapping && page_count(page) == 1) 619 if (!mapping && page_count(page) == 1) {
601 goto free_it; 620 unlock_page(page);
621 if (put_page_testzero(page))
622 goto free_it;
623 else {
624 /*
625 * rare race with speculative reference.
626 * the speculative reference will free
627 * this page shortly, so we may
628 * increment nr_reclaimed here (and
629 * leave it off the LRU).
630 */
631 nr_reclaimed++;
632 continue;
633 }
634 }
602 } 635 }
603 636
604 if (!mapping || !remove_mapping(mapping, page)) 637 if (!mapping || !__remove_mapping(mapping, page))
605 goto keep_locked; 638 goto keep_locked;
606 639
607free_it:
608 unlock_page(page); 640 unlock_page(page);
641free_it:
609 nr_reclaimed++; 642 nr_reclaimed++;
610 if (!pagevec_add(&freed_pvec, page)) 643 if (!pagevec_add(&freed_pvec, page)) {
611 __pagevec_release_nonlru(&freed_pvec); 644 __pagevec_free(&freed_pvec);
645 pagevec_reinit(&freed_pvec);
646 }
612 continue; 647 continue;
613 648
614activate_locked: 649activate_locked:
@@ -622,7 +657,7 @@ keep:
622 } 657 }
623 list_splice(&ret_pages, page_list); 658 list_splice(&ret_pages, page_list);
624 if (pagevec_count(&freed_pvec)) 659 if (pagevec_count(&freed_pvec))
625 __pagevec_release_nonlru(&freed_pvec); 660 __pagevec_free(&freed_pvec);
626 count_vm_events(PGACTIVATE, pgactivate); 661 count_vm_events(PGACTIVATE, pgactivate);
627 return nr_reclaimed; 662 return nr_reclaimed;
628} 663}
@@ -1316,6 +1351,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1316 struct zone *zone; 1351 struct zone *zone;
1317 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1352 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1318 1353
1354 delayacct_freepages_start();
1355
1319 if (scan_global_lru(sc)) 1356 if (scan_global_lru(sc))
1320 count_vm_event(ALLOCSTALL); 1357 count_vm_event(ALLOCSTALL);
1321 /* 1358 /*
@@ -1371,7 +1408,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1371 if (sc->nr_scanned && priority < DEF_PRIORITY - 2) 1408 if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
1372 congestion_wait(WRITE, HZ/10); 1409 congestion_wait(WRITE, HZ/10);
1373 } 1410 }
1374 /* top priority shrink_caches still had more to do? don't OOM, then */ 1411 /* top priority shrink_zones still had more to do? don't OOM, then */
1375 if (!sc->all_unreclaimable && scan_global_lru(sc)) 1412 if (!sc->all_unreclaimable && scan_global_lru(sc))
1376 ret = nr_reclaimed; 1413 ret = nr_reclaimed;
1377out: 1414out:
@@ -1396,6 +1433,8 @@ out:
1396 } else 1433 } else
1397 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); 1434 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1398 1435
1436 delayacct_freepages_end();
1437
1399 return ret; 1438 return ret;
1400} 1439}
1401 1440
@@ -1940,7 +1979,7 @@ module_init(kswapd_init)
1940int zone_reclaim_mode __read_mostly; 1979int zone_reclaim_mode __read_mostly;
1941 1980
1942#define RECLAIM_OFF 0 1981#define RECLAIM_OFF 0
1943#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ 1982#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
1944#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ 1983#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
1945#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ 1984#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
1946 1985
diff --git a/mm/vmstat.c b/mm/vmstat.c
index db9eabb2c5b3..b0d08e667ece 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -13,6 +13,7 @@
13#include <linux/err.h> 13#include <linux/err.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/cpu.h> 15#include <linux/cpu.h>
16#include <linux/vmstat.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17 18
18#ifdef CONFIG_VM_EVENT_COUNTERS 19#ifdef CONFIG_VM_EVENT_COUNTERS
@@ -26,7 +27,7 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
26 27
27 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); 28 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
28 29
29 for_each_cpu_mask(cpu, *cpumask) { 30 for_each_cpu_mask_nr(cpu, *cpumask) {
30 struct vm_event_state *this = &per_cpu(vm_event_states, cpu); 31 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
31 32
32 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 33 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)