author    Ingo Molnar <mingo@elte.hu>  2008-07-26 11:48:49 -0400
committer Ingo Molnar <mingo@elte.hu>  2008-07-26 11:48:49 -0400
commit    c3cc99ff5d24e2eeaf7ec2032e720681916990e3 (patch)
tree      c3e74171bbbd2adde9d60b9db1c440415c8d2831 /mm
parent    38ffbe66d59051fd9cfcfc8545f164700e2fa3bc (diff)
parent    024e8ac04453b3525448c31ef39848cf675ba6db (diff)
Merge branch 'linus' into x86/xen
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig               2
-rw-r--r--  mm/Makefile              2
-rw-r--r--  mm/bootmem.c           935
-rw-r--r--  mm/filemap.c           168
-rw-r--r--  mm/hugetlb.c          1612
-rw-r--r--  mm/internal.h           61
-rw-r--r--  mm/memcontrol.c        364
-rw-r--r--  mm/memory.c            243
-rw-r--r--  mm/memory_hotplug.c     80
-rw-r--r--  mm/mempolicy.c           9
-rw-r--r--  mm/migrate.c            24
-rw-r--r--  mm/mm_init.c           152
-rw-r--r--  mm/mmap.c               12
-rw-r--r--  mm/mprotect.c            6
-rw-r--r--  mm/page_alloc.c        152
-rw-r--r--  mm/pdflush.c             4
-rw-r--r--  mm/rmap.c               14
-rw-r--r--  mm/shmem.c              91
-rw-r--r--  mm/slob.c               12
-rw-r--r--  mm/slub.c               65
-rw-r--r--  mm/sparse.c            115
-rw-r--r--  mm/swap.c                8
-rw-r--r--  mm/swapfile.c           49
-rw-r--r--  mm/vmalloc.c            20
-rw-r--r--  mm/vmscan.c              5
-rw-r--r--  mm/vmstat.c              1
26 files changed, 2858 insertions, 1348 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c4de85285bb4..aa799007a11b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -174,7 +174,7 @@ config SPLIT_PTLOCK_CPUS
 config MIGRATION
 	bool "Page migration"
 	def_bool y
-	depends on NUMA
+	depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
 	help
 	  Allows the migration of the physical location of pages of processes
 	  while the virtual addresses are not changed. This is useful for
diff --git a/mm/Makefile b/mm/Makefile
index 18c143b3c46c..06ca2381fef1 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o pdflush.o \
 			   readahead.o swap.o truncate.o vmscan.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-			   page_isolation.o $(mmu-y)
+			   page_isolation.o mm_init.o $(mmu-y)
 
 obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
 obj-$(CONFIG_BOUNCE)	+= bounce.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 8d9f60e06f62..4af15d0340ad 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -1,12 +1,12 @@
 /*
- *  linux/mm/bootmem.c
+ *  bootmem - A boot-time physical memory allocator and configurator
  *
  *  Copyright (C) 1999 Ingo Molnar
- *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
+ *                1999 Kanoj Sarcar, SGI
+ *                2008 Johannes Weiner
  *
- *  simple boot-time physical memory area allocator and
- *  free memory collector. It's used to deal with reserved
- *  system memory and memory holes as well.
+ * Access to this subsystem has to be serialized externally (which is true
+ * for the boot process anyway).
  */
 #include <linux/init.h>
 #include <linux/pfn.h>
@@ -19,15 +19,10 @@
 
 #include "internal.h"
 
-/*
- * Access to this subsystem has to be serialized externally. (this is
- * true for the boot process anyway)
- */
 unsigned long max_low_pfn;
 unsigned long min_low_pfn;
 unsigned long max_pfn;
 
-static LIST_HEAD(bdata_list);
 #ifdef CONFIG_CRASH_DUMP
 /*
  * If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -36,63 +31,72 @@ static LIST_HEAD(bdata_list);
 unsigned long saved_max_pfn;
 #endif
 
-/* return the number of _pages_ that will be allocated for the boot bitmap */
-unsigned long __init bootmem_bootmap_pages(unsigned long pages)
+bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
+
+static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
+
+static int bootmem_debug;
+
+static int __init bootmem_debug_setup(char *buf)
 {
-	unsigned long mapsize;
+	bootmem_debug = 1;
+	return 0;
+}
+early_param("bootmem_debug", bootmem_debug_setup);
 
-	mapsize = (pages+7)/8;
-	mapsize = (mapsize + ~PAGE_MASK) & PAGE_MASK;
-	mapsize >>= PAGE_SHIFT;
+#define bdebug(fmt, args...) ({				\
+	if (unlikely(bootmem_debug))			\
+		printk(KERN_INFO			\
+			"bootmem::%s " fmt,		\
+			__FUNCTION__, ## args);		\
+})
 
-	return mapsize;
+static unsigned long __init bootmap_bytes(unsigned long pages)
+{
+	unsigned long bytes = (pages + 7) / 8;
+
+	return ALIGN(bytes, sizeof(long));
 }
 
-/*
- * link bdata in order
+/**
+ * bootmem_bootmap_pages - calculate bitmap size in pages
+ * @pages: number of pages the bitmap has to represent
  */
-static void __init link_bootmem(bootmem_data_t *bdata)
+unsigned long __init bootmem_bootmap_pages(unsigned long pages)
 {
-	bootmem_data_t *ent;
+	unsigned long bytes = bootmap_bytes(pages);
 
-	if (list_empty(&bdata_list)) {
-		list_add(&bdata->list, &bdata_list);
-		return;
-	}
-	/* insert in order */
-	list_for_each_entry(ent, &bdata_list, list) {
-		if (bdata->node_boot_start < ent->node_boot_start) {
-			list_add_tail(&bdata->list, &ent->list);
-			return;
-		}
-	}
-	list_add_tail(&bdata->list, &bdata_list);
+	return PAGE_ALIGN(bytes) >> PAGE_SHIFT;
 }
 
 /*
- * Given an initialised bdata, it returns the size of the boot bitmap
+ * link bdata in order
  */
-static unsigned long __init get_mapsize(bootmem_data_t *bdata)
+static void __init link_bootmem(bootmem_data_t *bdata)
 {
-	unsigned long mapsize;
-	unsigned long start = PFN_DOWN(bdata->node_boot_start);
-	unsigned long end = bdata->node_low_pfn;
+	struct list_head *iter;
 
-	mapsize = ((end - start) + 7) / 8;
-	return ALIGN(mapsize, sizeof(long));
+	list_for_each(iter, &bdata_list) {
+		bootmem_data_t *ent;
+
+		ent = list_entry(iter, bootmem_data_t, list);
+		if (bdata->node_min_pfn < ent->node_min_pfn)
+			break;
+	}
+	list_add_tail(&bdata->list, iter);
 }
 
 /*
  * Called once to set up the allocator itself.
  */
-static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
+static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
 	unsigned long mapstart, unsigned long start, unsigned long end)
 {
-	bootmem_data_t *bdata = pgdat->bdata;
 	unsigned long mapsize;
 
+	mminit_validate_memmodel_limits(&start, &end);
 	bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
-	bdata->node_boot_start = PFN_PHYS(start);
+	bdata->node_min_pfn = start;
 	bdata->node_low_pfn = end;
 	link_bootmem(bdata);
 
@@ -100,429 +104,461 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
100 * Initially all pages are reserved - setup_arch() has to 104 * Initially all pages are reserved - setup_arch() has to
101 * register free RAM areas explicitly. 105 * register free RAM areas explicitly.
102 */ 106 */
103 mapsize = get_mapsize(bdata); 107 mapsize = bootmap_bytes(end - start);
104 memset(bdata->node_bootmem_map, 0xff, mapsize); 108 memset(bdata->node_bootmem_map, 0xff, mapsize);
105 109
110 bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
111 bdata - bootmem_node_data, start, mapstart, end, mapsize);
112
106 return mapsize; 113 return mapsize;
107} 114}
108 115
109/* 116/**
110 * Marks a particular physical memory range as unallocatable. Usable RAM 117 * init_bootmem_node - register a node as boot memory
111 * might be used for boot-time allocations - or it might get added 118 * @pgdat: node to register
112 * to the free page pool later on. 119 * @freepfn: pfn where the bitmap for this node is to be placed
120 * @startpfn: first pfn on the node
121 * @endpfn: first pfn after the node
122 *
123 * Returns the number of bytes needed to hold the bitmap for this node.
113 */ 124 */
114static int __init can_reserve_bootmem_core(bootmem_data_t *bdata, 125unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
115 unsigned long addr, unsigned long size, int flags) 126 unsigned long startpfn, unsigned long endpfn)
116{ 127{
117 unsigned long sidx, eidx; 128 return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
118 unsigned long i; 129}
119 130
120 BUG_ON(!size); 131/**
132 * init_bootmem - register boot memory
133 * @start: pfn where the bitmap is to be placed
134 * @pages: number of available physical pages
135 *
136 * Returns the number of bytes needed to hold the bitmap.
137 */
138unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
139{
140 max_low_pfn = pages;
141 min_low_pfn = start;
142 return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
143}
121 144
122 /* out of range, don't hold other */ 145static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
123 if (addr + size < bdata->node_boot_start || 146{
124 PFN_DOWN(addr) > bdata->node_low_pfn) 147 int aligned;
148 struct page *page;
149 unsigned long start, end, pages, count = 0;
150
151 if (!bdata->node_bootmem_map)
125 return 0; 152 return 0;
126 153
154 start = bdata->node_min_pfn;
155 end = bdata->node_low_pfn;
156
127 /* 157 /*
128 * Round up to index to the range. 158 * If the start is aligned to the machines wordsize, we might
159 * be able to free pages in bulks of that order.
129 */ 160 */
130 if (addr > bdata->node_boot_start) 161 aligned = !(start & (BITS_PER_LONG - 1));
131 sidx= PFN_DOWN(addr - bdata->node_boot_start);
132 else
133 sidx = 0;
134 162
135 eidx = PFN_UP(addr + size - bdata->node_boot_start); 163 bdebug("nid=%td start=%lx end=%lx aligned=%d\n",
136 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 164 bdata - bootmem_node_data, start, end, aligned);
137 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
138 165
139 for (i = sidx; i < eidx; i++) { 166 while (start < end) {
140 if (test_bit(i, bdata->node_bootmem_map)) { 167 unsigned long *map, idx, vec;
141 if (flags & BOOTMEM_EXCLUSIVE)
142 return -EBUSY;
143 }
144 }
145 168
146 return 0; 169 map = bdata->node_bootmem_map;
170 idx = start - bdata->node_min_pfn;
171 vec = ~map[idx / BITS_PER_LONG];
147 172
148} 173 if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) {
174 int order = ilog2(BITS_PER_LONG);
149 175
150static void __init reserve_bootmem_core(bootmem_data_t *bdata, 176 __free_pages_bootmem(pfn_to_page(start), order);
151 unsigned long addr, unsigned long size, int flags) 177 count += BITS_PER_LONG;
152{ 178 } else {
153 unsigned long sidx, eidx; 179 unsigned long off = 0;
154 unsigned long i;
155
156 BUG_ON(!size);
157 180
158 /* out of range */ 181 while (vec && off < BITS_PER_LONG) {
159 if (addr + size < bdata->node_boot_start || 182 if (vec & 1) {
160 PFN_DOWN(addr) > bdata->node_low_pfn) 183 page = pfn_to_page(start + off);
161 return; 184 __free_pages_bootmem(page, 0);
185 count++;
186 }
187 vec >>= 1;
188 off++;
189 }
190 }
191 start += BITS_PER_LONG;
192 }
162 193
163 /* 194 page = virt_to_page(bdata->node_bootmem_map);
164 * Round up to index to the range. 195 pages = bdata->node_low_pfn - bdata->node_min_pfn;
165 */ 196 pages = bootmem_bootmap_pages(pages);
166 if (addr > bdata->node_boot_start) 197 count += pages;
167 sidx= PFN_DOWN(addr - bdata->node_boot_start); 198 while (pages--)
168 else 199 __free_pages_bootmem(page++, 0);
169 sidx = 0;
170 200
171 eidx = PFN_UP(addr + size - bdata->node_boot_start); 201 bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
172 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start))
173 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
174 202
175 for (i = sidx; i < eidx; i++) { 203 return count;
176 if (test_and_set_bit(i, bdata->node_bootmem_map)) {
177#ifdef CONFIG_DEBUG_BOOTMEM
178 printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
179#endif
180 }
181 }
182} 204}
183 205
184static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, 206/**
185 unsigned long size) 207 * free_all_bootmem_node - release a node's free pages to the buddy allocator
208 * @pgdat: node to be released
209 *
210 * Returns the number of pages actually released.
211 */
212unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
186{ 213{
187 unsigned long sidx, eidx; 214 register_page_bootmem_info_node(pgdat);
188 unsigned long i; 215 return free_all_bootmem_core(pgdat->bdata);
189 216}
190 BUG_ON(!size);
191 217
192 /* out range */ 218/**
193 if (addr + size < bdata->node_boot_start || 219 * free_all_bootmem - release free pages to the buddy allocator
194 PFN_DOWN(addr) > bdata->node_low_pfn) 220 *
195 return; 221 * Returns the number of pages actually released.
196 /* 222 */
197 * round down end of usable mem, partially free pages are 223unsigned long __init free_all_bootmem(void)
198 * considered reserved. 224{
199 */ 225 return free_all_bootmem_core(NODE_DATA(0)->bdata);
226}
200 227
201 if (addr >= bdata->node_boot_start && addr < bdata->last_success) 228static void __init __free(bootmem_data_t *bdata,
202 bdata->last_success = addr; 229 unsigned long sidx, unsigned long eidx)
230{
231 unsigned long idx;
203 232
204 /* 233 bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
205 * Round up to index to the range. 234 sidx + bdata->node_min_pfn,
206 */ 235 eidx + bdata->node_min_pfn);
207 if (PFN_UP(addr) > PFN_DOWN(bdata->node_boot_start))
208 sidx = PFN_UP(addr) - PFN_DOWN(bdata->node_boot_start);
209 else
210 sidx = 0;
211 236
212 eidx = PFN_DOWN(addr + size - bdata->node_boot_start); 237 if (bdata->hint_idx > sidx)
213 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start)) 238 bdata->hint_idx = sidx;
214 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
215 239
216 for (i = sidx; i < eidx; i++) { 240 for (idx = sidx; idx < eidx; idx++)
217 if (unlikely(!test_and_clear_bit(i, bdata->node_bootmem_map))) 241 if (!test_and_clear_bit(idx, bdata->node_bootmem_map))
218 BUG(); 242 BUG();
219 }
220} 243}
221 244
222/* 245static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
223 * We 'merge' subsequent allocations to save space. We might 'lose' 246 unsigned long eidx, int flags)
224 * some fraction of a page if allocations cannot be satisfied due to
225 * size constraints on boxes where there is physical RAM space
226 * fragmentation - in these cases (mostly large memory boxes) this
227 * is not a problem.
228 *
229 * On low memory boxes we get it right in 100% of the cases.
230 *
231 * alignment has to be a power of 2 value.
232 *
233 * NOTE: This function is _not_ reentrant.
234 */
235void * __init
236__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
237 unsigned long align, unsigned long goal, unsigned long limit)
238{ 247{
239 unsigned long areasize, preferred; 248 unsigned long idx;
240 unsigned long i, start = 0, incr, eidx, end_pfn; 249 int exclusive = flags & BOOTMEM_EXCLUSIVE;
241 void *ret; 250
242 unsigned long node_boot_start; 251 bdebug("nid=%td start=%lx end=%lx flags=%x\n",
243 void *node_bootmem_map; 252 bdata - bootmem_node_data,
244 253 sidx + bdata->node_min_pfn,
245 if (!size) { 254 eidx + bdata->node_min_pfn,
246 printk("__alloc_bootmem_core(): zero-sized request\n"); 255 flags);
247 BUG(); 256
248 } 257 for (idx = sidx; idx < eidx; idx++)
249 BUG_ON(align & (align-1)); 258 if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
250 259 if (exclusive) {
251 /* on nodes without memory - bootmem_map is NULL */ 260 __free(bdata, sidx, idx);
252 if (!bdata->node_bootmem_map) 261 return -EBUSY;
253 return NULL; 262 }
263 bdebug("silent double reserve of PFN %lx\n",
264 idx + bdata->node_min_pfn);
265 }
266 return 0;
267}
254 268
255 /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */ 269static int __init mark_bootmem_node(bootmem_data_t *bdata,
256 node_boot_start = bdata->node_boot_start; 270 unsigned long start, unsigned long end,
257 node_bootmem_map = bdata->node_bootmem_map; 271 int reserve, int flags)
258 if (align) { 272{
259 node_boot_start = ALIGN(bdata->node_boot_start, align); 273 unsigned long sidx, eidx;
260 if (node_boot_start > bdata->node_boot_start)
261 node_bootmem_map = (unsigned long *)bdata->node_bootmem_map +
262 PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG;
263 }
264 274
265 if (limit && node_boot_start >= limit) 275 bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n",
266 return NULL; 276 bdata - bootmem_node_data, start, end, reserve, flags);
267 277
268 end_pfn = bdata->node_low_pfn; 278 BUG_ON(start < bdata->node_min_pfn);
269 limit = PFN_DOWN(limit); 279 BUG_ON(end > bdata->node_low_pfn);
270 if (limit && end_pfn > limit)
271 end_pfn = limit;
272 280
273 eidx = end_pfn - PFN_DOWN(node_boot_start); 281 sidx = start - bdata->node_min_pfn;
282 eidx = end - bdata->node_min_pfn;
274 283
275 /* 284 if (reserve)
276 * We try to allocate bootmem pages above 'goal' 285 return __reserve(bdata, sidx, eidx, flags);
277 * first, then we try to allocate lower pages. 286 else
278 */ 287 __free(bdata, sidx, eidx);
279 preferred = 0; 288 return 0;
280 if (goal && PFN_DOWN(goal) < end_pfn) { 289}
281 if (goal > node_boot_start)
282 preferred = goal - node_boot_start;
283
284 if (bdata->last_success > node_boot_start &&
285 bdata->last_success - node_boot_start >= preferred)
286 if (!limit || (limit && limit > bdata->last_success))
287 preferred = bdata->last_success - node_boot_start;
288 }
289 290
290 preferred = PFN_DOWN(ALIGN(preferred, align)); 291static int __init mark_bootmem(unsigned long start, unsigned long end,
291 areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; 292 int reserve, int flags)
292 incr = align >> PAGE_SHIFT ? : 1; 293{
294 unsigned long pos;
295 bootmem_data_t *bdata;
293 296
294restart_scan: 297 pos = start;
295 for (i = preferred; i < eidx;) { 298 list_for_each_entry(bdata, &bdata_list, list) {
296 unsigned long j; 299 int err;
300 unsigned long max;
297 301
298 i = find_next_zero_bit(node_bootmem_map, eidx, i); 302 if (pos < bdata->node_min_pfn ||
299 i = ALIGN(i, incr); 303 pos >= bdata->node_low_pfn) {
300 if (i >= eidx) 304 BUG_ON(pos != start);
301 break;
302 if (test_bit(i, node_bootmem_map)) {
303 i += incr;
304 continue; 305 continue;
305 } 306 }
306 for (j = i + 1; j < i + areasize; ++j) {
307 if (j >= eidx)
308 goto fail_block;
309 if (test_bit(j, node_bootmem_map))
310 goto fail_block;
311 }
312 start = i;
313 goto found;
314 fail_block:
315 i = ALIGN(j, incr);
316 if (i == j)
317 i += incr;
318 }
319 307
320 if (preferred > 0) { 308 max = min(bdata->node_low_pfn, end);
321 preferred = 0;
322 goto restart_scan;
323 }
324 return NULL;
325 309
326found: 310 err = mark_bootmem_node(bdata, pos, max, reserve, flags);
327 bdata->last_success = PFN_PHYS(start) + node_boot_start; 311 if (reserve && err) {
328 BUG_ON(start >= eidx); 312 mark_bootmem(start, pos, 0, 0);
329 313 return err;
330 /*
331 * Is the next page of the previous allocation-end the start
332 * of this allocation's buffer? If yes then we can 'merge'
333 * the previous partial page with this allocation.
334 */
335 if (align < PAGE_SIZE &&
336 bdata->last_offset && bdata->last_pos+1 == start) {
337 unsigned long offset, remaining_size;
338 offset = ALIGN(bdata->last_offset, align);
339 BUG_ON(offset > PAGE_SIZE);
340 remaining_size = PAGE_SIZE - offset;
341 if (size < remaining_size) {
342 areasize = 0;
343 /* last_pos unchanged */
344 bdata->last_offset = offset + size;
345 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
346 offset + node_boot_start);
347 } else {
348 remaining_size = size - remaining_size;
349 areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
350 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
351 offset + node_boot_start);
352 bdata->last_pos = start + areasize - 1;
353 bdata->last_offset = remaining_size;
354 } 314 }
355 bdata->last_offset &= ~PAGE_MASK;
356 } else {
357 bdata->last_pos = start + areasize - 1;
358 bdata->last_offset = size & ~PAGE_MASK;
359 ret = phys_to_virt(start * PAGE_SIZE + node_boot_start);
360 }
361 315
362 /* 316 if (max == end)
363 * Reserve the area now: 317 return 0;
364 */ 318 pos = bdata->node_low_pfn;
365 for (i = start; i < start + areasize; i++) 319 }
366 if (unlikely(test_and_set_bit(i, node_bootmem_map))) 320 BUG();
367 BUG();
368 memset(ret, 0, size);
369 return ret;
370} 321}
371 322
372static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat) 323/**
324 * free_bootmem_node - mark a page range as usable
325 * @pgdat: node the range resides on
326 * @physaddr: starting address of the range
327 * @size: size of the range in bytes
328 *
329 * Partial pages will be considered reserved and left as they are.
330 *
331 * The range must reside completely on the specified node.
332 */
333void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
334 unsigned long size)
373{ 335{
374 struct page *page; 336 unsigned long start, end;
375 unsigned long pfn;
376 bootmem_data_t *bdata = pgdat->bdata;
377 unsigned long i, count, total = 0;
378 unsigned long idx;
379 unsigned long *map;
380 int gofast = 0;
381
382 BUG_ON(!bdata->node_bootmem_map);
383
384 count = 0;
385 /* first extant page of the node */
386 pfn = PFN_DOWN(bdata->node_boot_start);
387 idx = bdata->node_low_pfn - pfn;
388 map = bdata->node_bootmem_map;
389 /* Check physaddr is O(LOG2(BITS_PER_LONG)) page aligned */
390 if (bdata->node_boot_start == 0 ||
391 ffs(bdata->node_boot_start) - PAGE_SHIFT > ffs(BITS_PER_LONG))
392 gofast = 1;
393 for (i = 0; i < idx; ) {
394 unsigned long v = ~map[i / BITS_PER_LONG];
395
396 if (gofast && v == ~0UL) {
397 int order;
398
399 page = pfn_to_page(pfn);
400 count += BITS_PER_LONG;
401 order = ffs(BITS_PER_LONG) - 1;
402 __free_pages_bootmem(page, order);
403 i += BITS_PER_LONG;
404 page += BITS_PER_LONG;
405 } else if (v) {
406 unsigned long m;
407
408 page = pfn_to_page(pfn);
409 for (m = 1; m && i < idx; m<<=1, page++, i++) {
410 if (v & m) {
411 count++;
412 __free_pages_bootmem(page, 0);
413 }
414 }
415 } else {
416 i += BITS_PER_LONG;
417 }
418 pfn += BITS_PER_LONG;
419 }
420 total += count;
421 337
422 /* 338 start = PFN_UP(physaddr);
423 * Now free the allocator bitmap itself, it's not 339 end = PFN_DOWN(physaddr + size);
424 * needed anymore:
425 */
426 page = virt_to_page(bdata->node_bootmem_map);
427 count = 0;
428 idx = (get_mapsize(bdata) + PAGE_SIZE-1) >> PAGE_SHIFT;
429 for (i = 0; i < idx; i++, page++) {
430 __free_pages_bootmem(page, 0);
431 count++;
432 }
433 total += count;
434 bdata->node_bootmem_map = NULL;
435 340
436 return total; 341 mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
437} 342}
438 343
439unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn, 344/**
440 unsigned long startpfn, unsigned long endpfn) 345 * free_bootmem - mark a page range as usable
441{ 346 * @addr: starting address of the range
442 return init_bootmem_core(pgdat, freepfn, startpfn, endpfn); 347 * @size: size of the range in bytes
443} 348 *
444 349 * Partial pages will be considered reserved and left as they are.
445int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 350 *
446 unsigned long size, int flags) 351 * The range must be contiguous but may span node boundaries.
352 */
353void __init free_bootmem(unsigned long addr, unsigned long size)
447{ 354{
448 int ret; 355 unsigned long start, end;
449 356
450 ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); 357 start = PFN_UP(addr);
451 if (ret < 0) 358 end = PFN_DOWN(addr + size);
452 return -ENOMEM;
453 reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
454 359
455 return 0; 360 mark_bootmem(start, end, 0, 0);
456} 361}
457 362
458void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 363/**
459 unsigned long size) 364 * reserve_bootmem_node - mark a page range as reserved
365 * @pgdat: node the range resides on
366 * @physaddr: starting address of the range
367 * @size: size of the range in bytes
368 * @flags: reservation flags (see linux/bootmem.h)
369 *
370 * Partial pages will be reserved.
371 *
372 * The range must reside completely on the specified node.
373 */
374int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
375 unsigned long size, int flags)
460{ 376{
461 free_bootmem_core(pgdat->bdata, physaddr, size); 377 unsigned long start, end;
462}
463 378
464unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 379 start = PFN_DOWN(physaddr);
465{ 380 end = PFN_UP(physaddr + size);
466 register_page_bootmem_info_node(pgdat);
467 return free_all_bootmem_core(pgdat);
468}
469 381
470unsigned long __init init_bootmem(unsigned long start, unsigned long pages) 382 return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
471{
472 max_low_pfn = pages;
473 min_low_pfn = start;
474 return init_bootmem_core(NODE_DATA(0), start, 0, pages);
475} 383}
476 384
477#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE 385#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
386/**
387 * reserve_bootmem - mark a page range as usable
388 * @addr: starting address of the range
389 * @size: size of the range in bytes
390 * @flags: reservation flags (see linux/bootmem.h)
391 *
392 * Partial pages will be reserved.
393 *
394 * The range must be contiguous but may span node boundaries.
395 */
478int __init reserve_bootmem(unsigned long addr, unsigned long size, 396int __init reserve_bootmem(unsigned long addr, unsigned long size,
479 int flags) 397 int flags)
480{ 398{
481 bootmem_data_t *bdata; 399 unsigned long start, end;
482 int ret;
483 400
484 list_for_each_entry(bdata, &bdata_list, list) { 401 start = PFN_DOWN(addr);
485 ret = can_reserve_bootmem_core(bdata, addr, size, flags); 402 end = PFN_UP(addr + size);
486 if (ret < 0)
487 return ret;
488 }
489 list_for_each_entry(bdata, &bdata_list, list)
490 reserve_bootmem_core(bdata, addr, size, flags);
491 403
492 return 0; 404 return mark_bootmem(start, end, 1, flags);
493} 405}
494#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 406#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
495 407
496void __init free_bootmem(unsigned long addr, unsigned long size) 408static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
409 unsigned long size, unsigned long align,
410 unsigned long goal, unsigned long limit)
497{ 411{
498 bootmem_data_t *bdata; 412 unsigned long fallback = 0;
499 list_for_each_entry(bdata, &bdata_list, list) 413 unsigned long min, max, start, sidx, midx, step;
500 free_bootmem_core(bdata, addr, size);
501}
502 414
503unsigned long __init free_all_bootmem(void) 415 BUG_ON(!size);
504{ 416 BUG_ON(align & (align - 1));
505 return free_all_bootmem_core(NODE_DATA(0)); 417 BUG_ON(limit && goal + size > limit);
418
419 if (!bdata->node_bootmem_map)
420 return NULL;
421
422 bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
423 bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
424 align, goal, limit);
425
426 min = bdata->node_min_pfn;
427 max = bdata->node_low_pfn;
428
429 goal >>= PAGE_SHIFT;
430 limit >>= PAGE_SHIFT;
431
432 if (limit && max > limit)
433 max = limit;
434 if (max <= min)
435 return NULL;
436
437 step = max(align >> PAGE_SHIFT, 1UL);
438
439 if (goal && min < goal && goal < max)
440 start = ALIGN(goal, step);
441 else
442 start = ALIGN(min, step);
443
444 sidx = start - bdata->node_min_pfn;;
445 midx = max - bdata->node_min_pfn;
446
447 if (bdata->hint_idx > sidx) {
448 /*
449 * Handle the valid case of sidx being zero and still
450 * catch the fallback below.
451 */
452 fallback = sidx + 1;
453 sidx = ALIGN(bdata->hint_idx, step);
454 }
455
456 while (1) {
457 int merge;
458 void *region;
459 unsigned long eidx, i, start_off, end_off;
460find_block:
461 sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
462 sidx = ALIGN(sidx, step);
463 eidx = sidx + PFN_UP(size);
464
465 if (sidx >= midx || eidx > midx)
466 break;
467
468 for (i = sidx; i < eidx; i++)
469 if (test_bit(i, bdata->node_bootmem_map)) {
470 sidx = ALIGN(i, step);
471 if (sidx == i)
472 sidx += step;
473 goto find_block;
474 }
475
476 if (bdata->last_end_off &&
477 PFN_DOWN(bdata->last_end_off) + 1 == sidx)
478 start_off = ALIGN(bdata->last_end_off, align);
479 else
480 start_off = PFN_PHYS(sidx);
481
482 merge = PFN_DOWN(start_off) < sidx;
483 end_off = start_off + size;
484
485 bdata->last_end_off = end_off;
486 bdata->hint_idx = PFN_UP(end_off);
487
488 /*
489 * Reserve the area now:
490 */
491 if (__reserve(bdata, PFN_DOWN(start_off) + merge,
492 PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
493 BUG();
494
495 region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
496 start_off);
497 memset(region, 0, size);
498 return region;
499 }
500
501 if (fallback) {
502 sidx = ALIGN(fallback - 1, step);
503 fallback = 0;
504 goto find_block;
505 }
506
507 return NULL;
506} 508}
507 509
508void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, 510static void * __init ___alloc_bootmem_nopanic(unsigned long size,
509 unsigned long goal) 511 unsigned long align,
512 unsigned long goal,
513 unsigned long limit)
510{ 514{
511 bootmem_data_t *bdata; 515 bootmem_data_t *bdata;
512 void *ptr;
513 516
517restart:
514 list_for_each_entry(bdata, &bdata_list, list) { 518 list_for_each_entry(bdata, &bdata_list, list) {
515 ptr = __alloc_bootmem_core(bdata, size, align, goal, 0); 519 void *region;
516 if (ptr) 520
517 return ptr; 521 if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
522 continue;
523 if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
524 break;
525
526 region = alloc_bootmem_core(bdata, size, align, goal, limit);
527 if (region)
528 return region;
529 }
530
531 if (goal) {
532 goal = 0;
533 goto restart;
518 } 534 }
535
519 return NULL; 536 return NULL;
520} 537}
521 538
522void * __init __alloc_bootmem(unsigned long size, unsigned long align, 539/**
523 unsigned long goal) 540 * __alloc_bootmem_nopanic - allocate boot memory without panicking
541 * @size: size of the request in bytes
542 * @align: alignment of the region
543 * @goal: preferred starting address of the region
544 *
545 * The goal is dropped if it can not be satisfied and the allocation will
546 * fall back to memory below @goal.
547 *
548 * Allocation may happen on any node in the system.
549 *
550 * Returns NULL on failure.
551 */
552void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
553 unsigned long goal)
524{ 554{
525 void *mem = __alloc_bootmem_nopanic(size,align,goal); 555 return ___alloc_bootmem_nopanic(size, align, goal, 0);
556}
557
558static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
559 unsigned long goal, unsigned long limit)
560{
561 void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
526 562
527 if (mem) 563 if (mem)
528 return mem; 564 return mem;
@@ -534,78 +570,135 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align,
534 return NULL; 570 return NULL;
535} 571}
536 572
573/**
574 * __alloc_bootmem - allocate boot memory
575 * @size: size of the request in bytes
576 * @align: alignment of the region
577 * @goal: preferred starting address of the region
578 *
579 * The goal is dropped if it can not be satisfied and the allocation will
580 * fall back to memory below @goal.
581 *
582 * Allocation may happen on any node in the system.
583 *
584 * The function panics if the request can not be satisfied.
585 */
586void * __init __alloc_bootmem(unsigned long size, unsigned long align,
587 unsigned long goal)
588{
589 return ___alloc_bootmem(size, align, goal, 0);
590}
537 591
538void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, 592static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
539 unsigned long align, unsigned long goal) 593 unsigned long size, unsigned long align,
594 unsigned long goal, unsigned long limit)
540{ 595{
541 void *ptr; 596 void *ptr;
542 597
543 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); 598 ptr = alloc_bootmem_core(bdata, size, align, goal, limit);
544 if (ptr) 599 if (ptr)
545 return ptr; 600 return ptr;
546 601
547 return __alloc_bootmem(size, align, goal); 602 return ___alloc_bootmem(size, align, goal, limit);
603}
604
605/**
606 * __alloc_bootmem_node - allocate boot memory from a specific node
607 * @pgdat: node to allocate from
608 * @size: size of the request in bytes
609 * @align: alignment of the region
610 * @goal: preferred starting address of the region
611 *
612 * The goal is dropped if it can not be satisfied and the allocation will
613 * fall back to memory below @goal.
614 *
615 * Allocation may fall back to any node in the system if the specified node
616 * can not hold the requested memory.
617 *
618 * The function panics if the request can not be satisfied.
619 */
620void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
621 unsigned long align, unsigned long goal)
622{
623 return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
548} 624}
549 625
550#ifdef CONFIG_SPARSEMEM 626#ifdef CONFIG_SPARSEMEM
627/**
628 * alloc_bootmem_section - allocate boot memory from a specific section
629 * @size: size of the request in bytes
630 * @section_nr: sparse map section to allocate from
631 *
632 * Return NULL on failure.
633 */
551void * __init alloc_bootmem_section(unsigned long size, 634void * __init alloc_bootmem_section(unsigned long size,
552 unsigned long section_nr) 635 unsigned long section_nr)
553{ 636{
554 void *ptr; 637 bootmem_data_t *bdata;
555 unsigned long limit, goal, start_nr, end_nr, pfn; 638 unsigned long pfn, goal, limit;
556 struct pglist_data *pgdat;
557 639
558 pfn = section_nr_to_pfn(section_nr); 640 pfn = section_nr_to_pfn(section_nr);
559 goal = PFN_PHYS(pfn); 641 goal = pfn << PAGE_SHIFT;
560 limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1; 642 limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT;
561 pgdat = NODE_DATA(early_pfn_to_nid(pfn)); 643 bdata = &bootmem_node_data[early_pfn_to_nid(pfn)];
562 ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal,
563 limit);
564 644
565 if (!ptr) 645 return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit);
566 return NULL; 646}
647#endif
567 648
568 start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr))); 649void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
569 end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size)); 650 unsigned long align, unsigned long goal)
570 if (start_nr != section_nr || end_nr != section_nr) { 651{
571 printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n", 652 void *ptr;
572 section_nr);
573 free_bootmem_core(pgdat->bdata, __pa(ptr), size);
574 ptr = NULL;
575 }
576 653
577 return ptr; 654 ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
655 if (ptr)
656 return ptr;
657
658 return __alloc_bootmem_nopanic(size, align, goal);
578} 659}
579#endif
580 660
581#ifndef ARCH_LOW_ADDRESS_LIMIT 661#ifndef ARCH_LOW_ADDRESS_LIMIT
582#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL 662#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
583#endif 663#endif
584 664
665/**
666 * __alloc_bootmem_low - allocate low boot memory
667 * @size: size of the request in bytes
668 * @align: alignment of the region
669 * @goal: preferred starting address of the region
670 *
671 * The goal is dropped if it can not be satisfied and the allocation will
672 * fall back to memory below @goal.
673 *
674 * Allocation may happen on any node in the system.
675 *
676 * The function panics if the request can not be satisfied.
677 */
585void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, 678void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
586 unsigned long goal) 679 unsigned long goal)
587{ 680{
588 bootmem_data_t *bdata; 681 return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
589 void *ptr;
590
591 list_for_each_entry(bdata, &bdata_list, list) {
592 ptr = __alloc_bootmem_core(bdata, size, align, goal,
593 ARCH_LOW_ADDRESS_LIMIT);
594 if (ptr)
595 return ptr;
596 }
597
598 /*
599 * Whoops, we cannot satisfy the allocation request.
600 */
601 printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
602 panic("Out of low memory");
603 return NULL;
604} 682}
605 683
684/**
685 * __alloc_bootmem_low_node - allocate low boot memory from a specific node
686 * @pgdat: node to allocate from
687 * @size: size of the request in bytes
688 * @align: alignment of the region
689 * @goal: preferred starting address of the region
690 *
691 * The goal is dropped if it can not be satisfied and the allocation will
692 * fall back to memory below @goal.
693 *
694 * Allocation may fall back to any node in the system if the specified node
695 * can not hold the requested memory.
696 *
697 * The function panics if the request can not be satisfied.
698 */
606void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, 699void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
607 unsigned long align, unsigned long goal) 700 unsigned long align, unsigned long goal)
608{ 701{
609 return __alloc_bootmem_core(pgdat->bdata, size, align, goal, 702 return ___alloc_bootmem_node(pgdat->bdata, size, align,
610 ARCH_LOW_ADDRESS_LIMIT); 703 goal, ARCH_LOW_ADDRESS_LIMIT);
611} 704}
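
The kernel-doc comments added above document the external bootmem entry points: init_bootmem() registers the bitmap, free_bootmem() marks usable RAM, reserve_bootmem() marks ranges that must not be handed out, __alloc_bootmem() serves early allocations, and free_all_bootmem() later releases everything to the buddy allocator. The following is a minimal sketch of how a flat-memory architecture's setup code typically drives this interface; the PFN constants and the helper name example_setup_bootmem() are illustrative only and are not taken from this patch.

/*
 * Illustrative sketch only -- not part of this commit. Assumes a single
 * node and made-up memory layout constants.
 */
#include <linux/bootmem.h>
#include <linux/pfn.h>

#define EXAMPLE_RAM_START_PFN	0UL		/* hypothetical */
#define EXAMPLE_RAM_END_PFN	0x8000UL	/* hypothetical: 128 MB of 4k pages */

static unsigned long __init example_setup_bootmem(unsigned long bitmap_pfn)
{
	unsigned long bitmap_size;

	/* Register node 0: bitmap lives at bitmap_pfn, covers pages 0..END. */
	bitmap_size = init_bootmem(bitmap_pfn, EXAMPLE_RAM_END_PFN);

	/* All pages start out reserved; hand the usable RAM range back. */
	free_bootmem(PFN_PHYS(EXAMPLE_RAM_START_PFN),
		     PFN_PHYS(EXAMPLE_RAM_END_PFN - EXAMPLE_RAM_START_PFN));

	/*
	 * Re-reserve the bitmap itself (real setup code would also reserve
	 * the kernel image, initrd, firmware tables, and so on).
	 */
	reserve_bootmem(PFN_PHYS(bitmap_pfn), bitmap_size, BOOTMEM_DEFAULT);

	return bitmap_size;
}

Early boot allocations then go through __alloc_bootmem(size, align, goal), and mem_init() eventually calls free_all_bootmem() to release the remaining pages (and the bitmap) to the page allocator, as shown in free_all_bootmem_core() above.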
diff --git a/mm/filemap.c b/mm/filemap.c
index 65d9d9e2b755..2d3ec1ffc66e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -42,9 +42,6 @@
 
 #include <asm/mman.h>
 
-static ssize_t
-generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
-	loff_t offset, unsigned long nr_segs);
 
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
@@ -118,7 +115,7 @@ void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
-	mem_cgroup_uncharge_page(page);
+	mem_cgroup_uncharge_cache_page(page);
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
@@ -477,12 +474,12 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 			mapping->nrpages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
 		} else
-			mem_cgroup_uncharge_page(page);
+			mem_cgroup_uncharge_cache_page(page);
 
 		write_unlock_irq(&mapping->tree_lock);
 		radix_tree_preload_end();
 	} else
-		mem_cgroup_uncharge_page(page);
+		mem_cgroup_uncharge_cache_page(page);
 out:
 	return error;
 }
@@ -1200,42 +1197,41 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 
 		mapping = filp->f_mapping;
 		inode = mapping->host;
-		retval = 0;
 		if (!count)
 			goto out; /* skip atime */
 		size = i_size_read(inode);
 		if (pos < size) {
-			retval = generic_file_direct_IO(READ, iocb,
-						iov, pos, nr_segs);
+			retval = filemap_write_and_wait(mapping);
+			if (!retval) {
+				retval = mapping->a_ops->direct_IO(READ, iocb,
+							iov, pos, nr_segs);
+			}
 			if (retval > 0)
 				*ppos = pos + retval;
-		}
-		if (likely(retval != 0)) {
-			file_accessed(filp);
-			goto out;
+			if (retval) {
+				file_accessed(filp);
+				goto out;
+			}
 		}
 	}
 
-	retval = 0;
-	if (count) {
-		for (seg = 0; seg < nr_segs; seg++) {
-			read_descriptor_t desc;
+	for (seg = 0; seg < nr_segs; seg++) {
+		read_descriptor_t desc;
 
-			desc.written = 0;
-			desc.arg.buf = iov[seg].iov_base;
-			desc.count = iov[seg].iov_len;
-			if (desc.count == 0)
-				continue;
-			desc.error = 0;
-			do_generic_file_read(filp,ppos,&desc,file_read_actor);
-			retval += desc.written;
-			if (desc.error) {
-				retval = retval ?: desc.error;
-				break;
-			}
-			if (desc.count > 0)
-				break;
+		desc.written = 0;
+		desc.arg.buf = iov[seg].iov_base;
+		desc.count = iov[seg].iov_len;
+		if (desc.count == 0)
+			continue;
+		desc.error = 0;
+		do_generic_file_read(filp, ppos, &desc, file_read_actor);
+		retval += desc.written;
+		if (desc.error) {
+			retval = retval ?: desc.error;
+			break;
 		}
+		if (desc.count > 0)
+			break;
 	}
 out:
 	return retval;
@@ -2004,11 +2000,55 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2004 struct address_space *mapping = file->f_mapping; 2000 struct address_space *mapping = file->f_mapping;
2005 struct inode *inode = mapping->host; 2001 struct inode *inode = mapping->host;
2006 ssize_t written; 2002 ssize_t written;
2003 size_t write_len;
2004 pgoff_t end;
2007 2005
2008 if (count != ocount) 2006 if (count != ocount)
2009 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); 2007 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2010 2008
2011 written = generic_file_direct_IO(WRITE, iocb, iov, pos, *nr_segs); 2009 /*
2010 * Unmap all mmappings of the file up-front.
2011 *
2012 * This will cause any pte dirty bits to be propagated into the
2013 * pageframes for the subsequent filemap_write_and_wait().
2014 */
2015 write_len = iov_length(iov, *nr_segs);
2016 end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
2017 if (mapping_mapped(mapping))
2018 unmap_mapping_range(mapping, pos, write_len, 0);
2019
2020 written = filemap_write_and_wait(mapping);
2021 if (written)
2022 goto out;
2023
2024 /*
2025 * After a write we want buffered reads to be sure to go to disk to get
2026 * the new data. We invalidate clean cached page from the region we're
2027 * about to write. We do this *before* the write so that we can return
2028 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
2029 */
2030 if (mapping->nrpages) {
2031 written = invalidate_inode_pages2_range(mapping,
2032 pos >> PAGE_CACHE_SHIFT, end);
2033 if (written)
2034 goto out;
2035 }
2036
2037 written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
2038
2039 /*
2040 * Finally, try again to invalidate clean pages which might have been
2041 * cached by non-direct readahead, or faulted in by get_user_pages()
2042 * if the source of the write was an mmap'ed region of the file
2043 * we're writing. Either one is a pretty crazy thing to do,
2044 * so we don't support it 100%. If this invalidation
2045 * fails, tough, the write still worked...
2046 */
2047 if (mapping->nrpages) {
2048 invalidate_inode_pages2_range(mapping,
2049 pos >> PAGE_CACHE_SHIFT, end);
2050 }
2051
2012 if (written > 0) { 2052 if (written > 0) {
2013 loff_t end = pos + written; 2053 loff_t end = pos + written;
2014 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { 2054 if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
@@ -2024,6 +2064,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2024 * i_mutex is held, which protects generic_osync_inode() from 2064 * i_mutex is held, which protects generic_osync_inode() from
2025 * livelocking. AIO O_DIRECT ops attempt to sync metadata here. 2065 * livelocking. AIO O_DIRECT ops attempt to sync metadata here.
2026 */ 2066 */
2067out:
2027 if ((written >= 0 || written == -EIOCBQUEUED) && 2068 if ((written >= 0 || written == -EIOCBQUEUED) &&
2028 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2069 ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2029 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); 2070 int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
@@ -2511,66 +2552,6 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2511} 2552}
2512EXPORT_SYMBOL(generic_file_aio_write); 2553EXPORT_SYMBOL(generic_file_aio_write);
2513 2554
2514/*
2515 * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
2516 * went wrong during pagecache shootdown.
2517 */
2518static ssize_t
2519generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
2520 loff_t offset, unsigned long nr_segs)
2521{
2522 struct file *file = iocb->ki_filp;
2523 struct address_space *mapping = file->f_mapping;
2524 ssize_t retval;
2525 size_t write_len;
2526 pgoff_t end = 0; /* silence gcc */
2527
2528 /*
2529 * If it's a write, unmap all mmappings of the file up-front. This
2530 * will cause any pte dirty bits to be propagated into the pageframes
2531 * for the subsequent filemap_write_and_wait().
2532 */
2533 if (rw == WRITE) {
2534 write_len = iov_length(iov, nr_segs);
2535 end = (offset + write_len - 1) >> PAGE_CACHE_SHIFT;
2536 if (mapping_mapped(mapping))
2537 unmap_mapping_range(mapping, offset, write_len, 0);
2538 }
2539
2540 retval = filemap_write_and_wait(mapping);
2541 if (retval)
2542 goto out;
2543
2544 /*
2545 * After a write we want buffered reads to be sure to go to disk to get
2546 * the new data. We invalidate clean cached page from the region we're
2547 * about to write. We do this *before* the write so that we can return
2548 * -EIO without clobbering -EIOCBQUEUED from ->direct_IO().
2549 */
2550 if (rw == WRITE && mapping->nrpages) {
2551 retval = invalidate_inode_pages2_range(mapping,
2552 offset >> PAGE_CACHE_SHIFT, end);
2553 if (retval)
2554 goto out;
2555 }
2556
2557 retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
2558
2559 /*
2560 * Finally, try again to invalidate clean pages which might have been
2561 * cached by non-direct readahead, or faulted in by get_user_pages()
2562 * if the source of the write was an mmap'ed region of the file
2563 * we're writing. Either one is a pretty crazy thing to do,
2564 * so we don't support it 100%. If this invalidation
2565 * fails, tough, the write still worked...
2566 */
2567 if (rw == WRITE && mapping->nrpages) {
2568 invalidate_inode_pages2_range(mapping, offset >> PAGE_CACHE_SHIFT, end);
2569 }
2570out:
2571 return retval;
2572}
2573
2574/** 2555/**
2575 * try_to_release_page() - release old fs-specific metadata on a page 2556 * try_to_release_page() - release old fs-specific metadata on a page
2576 * 2557 *
@@ -2582,9 +2563,8 @@ out:
  * Otherwise return zero.
  *
  * The @gfp_mask argument specifies whether I/O may be performed to release
- * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
+ * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
  *
- * NOTE: @gfp_mask may go away, and this function may become non-blocking.
  */
 int try_to_release_page(struct page *page, gfp_t gfp_mask)
 {
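
With generic_file_direct_IO() removed above, generic_file_direct_write() now open-codes the pagecache coordination around ->direct_IO(). A condensed sketch of that ordering follows, assuming a regular file whose address_space provides ->direct_IO; the helper name example_dio_write_order() is illustrative and does not appear in the patch.

/* Illustrative sketch only -- a restatement of the ordering in the hunk above. */
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static ssize_t example_dio_write_order(struct kiocb *iocb, const struct iovec *iov,
				       loff_t pos, unsigned long nr_segs, size_t len)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	pgoff_t end = (pos + len - 1) >> PAGE_CACHE_SHIFT;
	ssize_t ret;

	/* Propagate pte dirty bits into the pageframes first. */
	if (mapping_mapped(mapping))
		unmap_mapping_range(mapping, pos, len, 0);

	/* Write back and wait for dirty pagecache in the range. */
	ret = filemap_write_and_wait(mapping);
	if (ret)
		return ret;

	/* Drop clean cached pages so later buffered reads go to disk. */
	if (mapping->nrpages) {
		ret = invalidate_inode_pages2_range(mapping,
				pos >> PAGE_CACHE_SHIFT, end);
		if (ret)
			return ret;
	}

	/* The actual direct I/O. */
	ret = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, nr_segs);

	/* Invalidate again: pages may have been re-instantiated meanwhile. */
	if (mapping->nrpages)
		invalidate_inode_pages2_range(mapping,
				pos >> PAGE_CACHE_SHIFT, end);

	return ret;
}

The invalidation is done both before the write (so -EIO can be returned without clobbering -EIOCBQUEUED) and after it (to catch pages faulted back in by readahead or get_user_pages() during the transfer), exactly as the comments in the hunk above explain.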
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ab171274ef21..a8bf4ab01f86 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -14,6 +14,8 @@
 #include <linux/mempolicy.h>
 #include <linux/cpuset.h>
 #include <linux/mutex.h>
+#include <linux/bootmem.h>
+#include <linux/sysfs.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -22,30 +24,340 @@
22#include "internal.h" 24#include "internal.h"
23 25
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 26const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
26static unsigned long surplus_huge_pages;
27static unsigned long nr_overcommit_huge_pages;
28unsigned long max_huge_pages;
29unsigned long sysctl_overcommit_huge_pages;
30static struct list_head hugepage_freelists[MAX_NUMNODES];
31static unsigned int nr_huge_pages_node[MAX_NUMNODES];
32static unsigned int free_huge_pages_node[MAX_NUMNODES];
33static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
34static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 27static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
35unsigned long hugepages_treat_as_movable; 28unsigned long hugepages_treat_as_movable;
36static int hugetlb_next_nid; 29
30static int max_hstate;
31unsigned int default_hstate_idx;
32struct hstate hstates[HUGE_MAX_HSTATE];
33
34__initdata LIST_HEAD(huge_boot_pages);
35
36/* for command line parsing */
37static struct hstate * __initdata parsed_hstate;
38static unsigned long __initdata default_hstate_max_huge_pages;
39static unsigned long __initdata default_hstate_size;
40
41#define for_each_hstate(h) \
42 for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++)
37 43
38/* 44/*
39 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 45 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
40 */ 46 */
41static DEFINE_SPINLOCK(hugetlb_lock); 47static DEFINE_SPINLOCK(hugetlb_lock);
42 48
43static void clear_huge_page(struct page *page, unsigned long addr) 49/*
50 * Region tracking -- allows tracking of reservations and instantiated pages
51 * across the pages in a mapping.
52 *
53 * The region data structures are protected by a combination of the mmap_sem
54 * and the hugetlb_instantion_mutex. To access or modify a region the caller
55 * must either hold the mmap_sem for write, or the mmap_sem for read and
56 * the hugetlb_instantiation mutex:
57 *
58 * down_write(&mm->mmap_sem);
59 * or
60 * down_read(&mm->mmap_sem);
61 * mutex_lock(&hugetlb_instantiation_mutex);
62 */
63struct file_region {
64 struct list_head link;
65 long from;
66 long to;
67};
68
69static long region_add(struct list_head *head, long f, long t)
70{
71 struct file_region *rg, *nrg, *trg;
72
73 /* Locate the region we are either in or before. */
74 list_for_each_entry(rg, head, link)
75 if (f <= rg->to)
76 break;
77
78 /* Round our left edge to the current segment if it encloses us. */
79 if (f > rg->from)
80 f = rg->from;
81
82 /* Check for and consume any regions we now overlap with. */
83 nrg = rg;
84 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
85 if (&rg->link == head)
86 break;
87 if (rg->from > t)
88 break;
89
90 /* If this area reaches higher then extend our area to
91 * include it completely. If this is not the first area
92 * which we intend to reuse, free it. */
93 if (rg->to > t)
94 t = rg->to;
95 if (rg != nrg) {
96 list_del(&rg->link);
97 kfree(rg);
98 }
99 }
100 nrg->from = f;
101 nrg->to = t;
102 return 0;
103}
104
105static long region_chg(struct list_head *head, long f, long t)
106{
107 struct file_region *rg, *nrg;
108 long chg = 0;
109
110 /* Locate the region we are before or in. */
111 list_for_each_entry(rg, head, link)
112 if (f <= rg->to)
113 break;
114
115 /* If we are below the current region then a new region is required.
116 * Subtle, allocate a new region at the position but make it zero
117 * size such that we can guarantee to record the reservation. */
118 if (&rg->link == head || t < rg->from) {
119 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
120 if (!nrg)
121 return -ENOMEM;
122 nrg->from = f;
123 nrg->to = f;
124 INIT_LIST_HEAD(&nrg->link);
125 list_add(&nrg->link, rg->link.prev);
126
127 return t - f;
128 }
129
130 /* Round our left edge to the current segment if it encloses us. */
131 if (f > rg->from)
132 f = rg->from;
133 chg = t - f;
134
135 /* Check for and consume any regions we now overlap with. */
136 list_for_each_entry(rg, rg->link.prev, link) {
137 if (&rg->link == head)
138 break;
139 if (rg->from > t)
140 return chg;
141
142 /* We overlap with this area, if it extends futher than
143 * us then we must extend ourselves. Account for its
144 * existing reservation. */
145 if (rg->to > t) {
146 chg += rg->to - t;
147 t = rg->to;
148 }
149 chg -= rg->to - rg->from;
150 }
151 return chg;
152}
153
154static long region_truncate(struct list_head *head, long end)
155{
156 struct file_region *rg, *trg;
157 long chg = 0;
158
159 /* Locate the region we are either in or before. */
160 list_for_each_entry(rg, head, link)
161 if (end <= rg->to)
162 break;
163 if (&rg->link == head)
164 return 0;
165
166 /* If we are in the middle of a region then adjust it. */
167 if (end > rg->from) {
168 chg = rg->to - end;
169 rg->to = end;
170 rg = list_entry(rg->link.next, typeof(*rg), link);
171 }
172
173 /* Drop any remaining regions. */
174 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
175 if (&rg->link == head)
176 break;
177 chg += rg->to - rg->from;
178 list_del(&rg->link);
179 kfree(rg);
180 }
181 return chg;
182}
183
184static long region_count(struct list_head *head, long f, long t)
185{
186 struct file_region *rg;
187 long chg = 0;
188
189 /* Locate each segment we overlap with, and count that overlap. */
190 list_for_each_entry(rg, head, link) {
191 int seg_from;
192 int seg_to;
193
194 if (rg->to <= f)
195 continue;
196 if (rg->from >= t)
197 break;
198
199 seg_from = max(rg->from, f);
200 seg_to = min(rg->to, t);
201
202 chg += seg_to - seg_from;
203 }
204
205 return chg;
206}
207
208/*
209 * Convert the address within this vma to the page offset within
210 * the mapping, in pagecache page units; huge pages here.
211 */
212static pgoff_t vma_hugecache_offset(struct hstate *h,
213 struct vm_area_struct *vma, unsigned long address)
214{
215 return ((address - vma->vm_start) >> huge_page_shift(h)) +
216 (vma->vm_pgoff >> huge_page_order(h));
217}
218
219/*
220 * Flags for MAP_PRIVATE reservations. These are stored in the bottom
221 * bits of the reservation map pointer, which are always clear due to
222 * alignment.
223 */
224#define HPAGE_RESV_OWNER (1UL << 0)
225#define HPAGE_RESV_UNMAPPED (1UL << 1)
226#define HPAGE_RESV_MASK (HPAGE_RESV_OWNER | HPAGE_RESV_UNMAPPED)
227
228/*
229 * These helpers are used to track how many pages are reserved for
230 * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
231 * is guaranteed to have their future faults succeed.
232 *
233 * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
234 * the reserve counters are updated with the hugetlb_lock held. It is safe
235 * to reset the VMA at fork() time as it is not in use yet and there is no
236 * chance of the global counters getting corrupted as a result of the values.
237 *
238 * The private mapping reservation is represented in a subtly different
239 * manner to a shared mapping. A shared mapping has a region map associated
240 * with the underlying file, this region map represents the backing file
241 * pages which have ever had a reservation assigned which this persists even
242 * after the page is instantiated. A private mapping has a region map
243 * associated with the original mmap which is attached to all VMAs which
244 * reference it, this region map represents those offsets which have consumed
245 * reservation ie. where pages have been instantiated.
246 */
247static unsigned long get_vma_private_data(struct vm_area_struct *vma)
248{
249 return (unsigned long)vma->vm_private_data;
250}
251
252static void set_vma_private_data(struct vm_area_struct *vma,
253 unsigned long value)
254{
255 vma->vm_private_data = (void *)value;
256}
257
258struct resv_map {
259 struct kref refs;
260 struct list_head regions;
261};
262
263struct resv_map *resv_map_alloc(void)
264{
265 struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
266 if (!resv_map)
267 return NULL;
268
269 kref_init(&resv_map->refs);
270 INIT_LIST_HEAD(&resv_map->regions);
271
272 return resv_map;
273}
274
275void resv_map_release(struct kref *ref)
276{
277 struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
278
279 /* Clear out any active regions before we release the map. */
280 region_truncate(&resv_map->regions, 0);
281 kfree(resv_map);
282}
283
284static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
285{
286 VM_BUG_ON(!is_vm_hugetlb_page(vma));
287 if (!(vma->vm_flags & VM_SHARED))
288 return (struct resv_map *)(get_vma_private_data(vma) &
289 ~HPAGE_RESV_MASK);
290 return 0;
291}
292
293static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
294{
295 VM_BUG_ON(!is_vm_hugetlb_page(vma));
296 VM_BUG_ON(vma->vm_flags & VM_SHARED);
297
298 set_vma_private_data(vma, (get_vma_private_data(vma) &
299 HPAGE_RESV_MASK) | (unsigned long)map);
300}
301
302static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
303{
304 VM_BUG_ON(!is_vm_hugetlb_page(vma));
305 VM_BUG_ON(vma->vm_flags & VM_SHARED);
306
307 set_vma_private_data(vma, get_vma_private_data(vma) | flags);
308}
309
310static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
311{
312 VM_BUG_ON(!is_vm_hugetlb_page(vma));
313
314 return (get_vma_private_data(vma) & flag) != 0;
315}
316
317/* Decrement the reserved pages in the hugepage pool by one */
318static void decrement_hugepage_resv_vma(struct hstate *h,
319 struct vm_area_struct *vma)
320{
321 if (vma->vm_flags & VM_NORESERVE)
322 return;
323
324 if (vma->vm_flags & VM_SHARED) {
325 /* Shared mappings always use reserves */
326 h->resv_huge_pages--;
327 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
328 /*
329 * Only the process that called mmap() has reserves for
330 * private mappings.
331 */
332 h->resv_huge_pages--;
333 }
334}
335
336/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
337void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
338{
339 VM_BUG_ON(!is_vm_hugetlb_page(vma));
340 if (!(vma->vm_flags & VM_SHARED))
341 vma->vm_private_data = (void *)0;
342}
343
344/* Returns true if the VMA has associated reserve pages */
345static int vma_has_reserves(struct vm_area_struct *vma)
346{
347 if (vma->vm_flags & VM_SHARED)
348 return 1;
349 if (is_vma_resv_set(vma, HPAGE_RESV_OWNER))
350 return 1;
351 return 0;
352}
353
354static void clear_huge_page(struct page *page,
355 unsigned long addr, unsigned long sz)
44{ 356{
45 int i; 357 int i;
46 358
47 might_sleep(); 359 might_sleep();
48 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { 360 for (i = 0; i < sz/PAGE_SIZE; i++) {
49 cond_resched(); 361 cond_resched();
50 clear_user_highpage(page + i, addr + i * PAGE_SIZE); 362 clear_user_highpage(page + i, addr + i * PAGE_SIZE);
51 } 363 }
@@ -55,42 +367,44 @@ static void copy_huge_page(struct page *dst, struct page *src,
55 unsigned long addr, struct vm_area_struct *vma) 367 unsigned long addr, struct vm_area_struct *vma)
56{ 368{
57 int i; 369 int i;
370 struct hstate *h = hstate_vma(vma);
58 371
59 might_sleep(); 372 might_sleep();
60 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) { 373 for (i = 0; i < pages_per_huge_page(h); i++) {
61 cond_resched(); 374 cond_resched();
62 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); 375 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
63 } 376 }
64} 377}
65 378
66static void enqueue_huge_page(struct page *page) 379static void enqueue_huge_page(struct hstate *h, struct page *page)
67{ 380{
68 int nid = page_to_nid(page); 381 int nid = page_to_nid(page);
69 list_add(&page->lru, &hugepage_freelists[nid]); 382 list_add(&page->lru, &h->hugepage_freelists[nid]);
70 free_huge_pages++; 383 h->free_huge_pages++;
71 free_huge_pages_node[nid]++; 384 h->free_huge_pages_node[nid]++;
72} 385}
73 386
74static struct page *dequeue_huge_page(void) 387static struct page *dequeue_huge_page(struct hstate *h)
75{ 388{
76 int nid; 389 int nid;
77 struct page *page = NULL; 390 struct page *page = NULL;
78 391
79 for (nid = 0; nid < MAX_NUMNODES; ++nid) { 392 for (nid = 0; nid < MAX_NUMNODES; ++nid) {
80 if (!list_empty(&hugepage_freelists[nid])) { 393 if (!list_empty(&h->hugepage_freelists[nid])) {
81 page = list_entry(hugepage_freelists[nid].next, 394 page = list_entry(h->hugepage_freelists[nid].next,
82 struct page, lru); 395 struct page, lru);
83 list_del(&page->lru); 396 list_del(&page->lru);
84 free_huge_pages--; 397 h->free_huge_pages--;
85 free_huge_pages_node[nid]--; 398 h->free_huge_pages_node[nid]--;
86 break; 399 break;
87 } 400 }
88 } 401 }
89 return page; 402 return page;
90} 403}
91 404
92static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma, 405static struct page *dequeue_huge_page_vma(struct hstate *h,
93 unsigned long address) 406 struct vm_area_struct *vma,
407 unsigned long address, int avoid_reserve)
94{ 408{
95 int nid; 409 int nid;
96 struct page *page = NULL; 410 struct page *page = NULL;
@@ -101,18 +415,33 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
101 struct zone *zone; 415 struct zone *zone;
102 struct zoneref *z; 416 struct zoneref *z;
103 417
418 /*
 419 * A child process with MAP_PRIVATE mappings created by its parent
 420 * has no page reserves. This check ensures that reservations are
421 * not "stolen". The child may still get SIGKILLed
422 */
423 if (!vma_has_reserves(vma) &&
424 h->free_huge_pages - h->resv_huge_pages == 0)
425 return NULL;
426
427 /* If reserves cannot be used, ensure enough pages are in the pool */
428 if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
429 return NULL;
430
104 for_each_zone_zonelist_nodemask(zone, z, zonelist, 431 for_each_zone_zonelist_nodemask(zone, z, zonelist,
105 MAX_NR_ZONES - 1, nodemask) { 432 MAX_NR_ZONES - 1, nodemask) {
106 nid = zone_to_nid(zone); 433 nid = zone_to_nid(zone);
107 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && 434 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
108 !list_empty(&hugepage_freelists[nid])) { 435 !list_empty(&h->hugepage_freelists[nid])) {
109 page = list_entry(hugepage_freelists[nid].next, 436 page = list_entry(h->hugepage_freelists[nid].next,
110 struct page, lru); 437 struct page, lru);
111 list_del(&page->lru); 438 list_del(&page->lru);
112 free_huge_pages--; 439 h->free_huge_pages--;
113 free_huge_pages_node[nid]--; 440 h->free_huge_pages_node[nid]--;
114 if (vma && vma->vm_flags & VM_MAYSHARE) 441
115 resv_huge_pages--; 442 if (!avoid_reserve)
443 decrement_hugepage_resv_vma(h, vma);
444
116 break; 445 break;
117 } 446 }
118 } 447 }
@@ -120,12 +449,13 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
120 return page; 449 return page;
121} 450}
122 451
123static void update_and_free_page(struct page *page) 452static void update_and_free_page(struct hstate *h, struct page *page)
124{ 453{
125 int i; 454 int i;
126 nr_huge_pages--; 455
127 nr_huge_pages_node[page_to_nid(page)]--; 456 h->nr_huge_pages--;
128 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { 457 h->nr_huge_pages_node[page_to_nid(page)]--;
458 for (i = 0; i < pages_per_huge_page(h); i++) {
129 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 459 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
130 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 460 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
131 1 << PG_private | 1<< PG_writeback); 461 1 << PG_private | 1<< PG_writeback);
@@ -133,11 +463,27 @@ static void update_and_free_page(struct page *page)
133 set_compound_page_dtor(page, NULL); 463 set_compound_page_dtor(page, NULL);
134 set_page_refcounted(page); 464 set_page_refcounted(page);
135 arch_release_hugepage(page); 465 arch_release_hugepage(page);
136 __free_pages(page, HUGETLB_PAGE_ORDER); 466 __free_pages(page, huge_page_order(h));
467}
468
469struct hstate *size_to_hstate(unsigned long size)
470{
471 struct hstate *h;
472
473 for_each_hstate(h) {
474 if (huge_page_size(h) == size)
475 return h;
476 }
477 return NULL;
137} 478}
138 479
139static void free_huge_page(struct page *page) 480static void free_huge_page(struct page *page)
140{ 481{
482 /*
483 * Can't pass hstate in here because it is called from the
484 * compound page destructor.
485 */
486 struct hstate *h = page_hstate(page);
141 int nid = page_to_nid(page); 487 int nid = page_to_nid(page);
142 struct address_space *mapping; 488 struct address_space *mapping;
143 489
@@ -147,12 +493,12 @@ static void free_huge_page(struct page *page)
147 INIT_LIST_HEAD(&page->lru); 493 INIT_LIST_HEAD(&page->lru);
148 494
149 spin_lock(&hugetlb_lock); 495 spin_lock(&hugetlb_lock);
150 if (surplus_huge_pages_node[nid]) { 496 if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) {
151 update_and_free_page(page); 497 update_and_free_page(h, page);
152 surplus_huge_pages--; 498 h->surplus_huge_pages--;
153 surplus_huge_pages_node[nid]--; 499 h->surplus_huge_pages_node[nid]--;
154 } else { 500 } else {
155 enqueue_huge_page(page); 501 enqueue_huge_page(h, page);
156 } 502 }
157 spin_unlock(&hugetlb_lock); 503 spin_unlock(&hugetlb_lock);
158 if (mapping) 504 if (mapping)
@@ -164,7 +510,7 @@ static void free_huge_page(struct page *page)
164 * balanced by operating on them in a round-robin fashion. 510 * balanced by operating on them in a round-robin fashion.
165 * Returns 1 if an adjustment was made. 511 * Returns 1 if an adjustment was made.
166 */ 512 */
167static int adjust_pool_surplus(int delta) 513static int adjust_pool_surplus(struct hstate *h, int delta)
168{ 514{
169 static int prev_nid; 515 static int prev_nid;
170 int nid = prev_nid; 516 int nid = prev_nid;
@@ -177,15 +523,15 @@ static int adjust_pool_surplus(int delta)
177 nid = first_node(node_online_map); 523 nid = first_node(node_online_map);
178 524
179 /* To shrink on this node, there must be a surplus page */ 525 /* To shrink on this node, there must be a surplus page */
180 if (delta < 0 && !surplus_huge_pages_node[nid]) 526 if (delta < 0 && !h->surplus_huge_pages_node[nid])
181 continue; 527 continue;
182 /* Surplus cannot exceed the total number of pages */ 528 /* Surplus cannot exceed the total number of pages */
183 if (delta > 0 && surplus_huge_pages_node[nid] >= 529 if (delta > 0 && h->surplus_huge_pages_node[nid] >=
184 nr_huge_pages_node[nid]) 530 h->nr_huge_pages_node[nid])
185 continue; 531 continue;
186 532
187 surplus_huge_pages += delta; 533 h->surplus_huge_pages += delta;
188 surplus_huge_pages_node[nid] += delta; 534 h->surplus_huge_pages_node[nid] += delta;
189 ret = 1; 535 ret = 1;
190 break; 536 break;
191 } while (nid != prev_nid); 537 } while (nid != prev_nid);
@@ -194,59 +540,74 @@ static int adjust_pool_surplus(int delta)
194 return ret; 540 return ret;
195} 541}
196 542
197static struct page *alloc_fresh_huge_page_node(int nid) 543static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
544{
545 set_compound_page_dtor(page, free_huge_page);
546 spin_lock(&hugetlb_lock);
547 h->nr_huge_pages++;
548 h->nr_huge_pages_node[nid]++;
549 spin_unlock(&hugetlb_lock);
550 put_page(page); /* free it into the hugepage allocator */
551}
552
553static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
198{ 554{
199 struct page *page; 555 struct page *page;
200 556
557 if (h->order >= MAX_ORDER)
558 return NULL;
559
201 page = alloc_pages_node(nid, 560 page = alloc_pages_node(nid,
202 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| 561 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
203 __GFP_REPEAT|__GFP_NOWARN, 562 __GFP_REPEAT|__GFP_NOWARN,
204 HUGETLB_PAGE_ORDER); 563 huge_page_order(h));
205 if (page) { 564 if (page) {
206 if (arch_prepare_hugepage(page)) { 565 if (arch_prepare_hugepage(page)) {
207 __free_pages(page, HUGETLB_PAGE_ORDER); 566 __free_pages(page, HUGETLB_PAGE_ORDER);
208 return NULL; 567 return NULL;
209 } 568 }
210 set_compound_page_dtor(page, free_huge_page); 569 prep_new_huge_page(h, page, nid);
211 spin_lock(&hugetlb_lock);
212 nr_huge_pages++;
213 nr_huge_pages_node[nid]++;
214 spin_unlock(&hugetlb_lock);
215 put_page(page); /* free it into the hugepage allocator */
216 } 570 }
217 571
218 return page; 572 return page;
219} 573}
220 574
221static int alloc_fresh_huge_page(void) 575/*
576 * Use a helper variable to find the next node and then
577 * copy it back to hugetlb_next_nid afterwards:
578 * otherwise there's a window in which a racer might
579 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
580 * But we don't need to use a spin_lock here: it really
581 * doesn't matter if occasionally a racer chooses the
582 * same nid as we do. Move nid forward in the mask even
583 * if we just successfully allocated a hugepage so that
584 * the next caller gets hugepages on the next node.
585 */
586static int hstate_next_node(struct hstate *h)
587{
588 int next_nid;
589 next_nid = next_node(h->hugetlb_next_nid, node_online_map);
590 if (next_nid == MAX_NUMNODES)
591 next_nid = first_node(node_online_map);
592 h->hugetlb_next_nid = next_nid;
593 return next_nid;
594}
595
596static int alloc_fresh_huge_page(struct hstate *h)
222{ 597{
223 struct page *page; 598 struct page *page;
224 int start_nid; 599 int start_nid;
225 int next_nid; 600 int next_nid;
226 int ret = 0; 601 int ret = 0;
227 602
228 start_nid = hugetlb_next_nid; 603 start_nid = h->hugetlb_next_nid;
229 604
230 do { 605 do {
231 page = alloc_fresh_huge_page_node(hugetlb_next_nid); 606 page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
232 if (page) 607 if (page)
233 ret = 1; 608 ret = 1;
234 /* 609 next_nid = hstate_next_node(h);
235 * Use a helper variable to find the next node and then 610 } while (!page && h->hugetlb_next_nid != start_nid);
236 * copy it back to hugetlb_next_nid afterwards:
237 * otherwise there's a window in which a racer might
238 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
239 * But we don't need to use a spin_lock here: it really
240 * doesn't matter if occasionally a racer chooses the
241 * same nid as we do. Move nid forward in the mask even
242 * if we just successfully allocated a hugepage so that
243 * the next caller gets hugepages on the next node.
244 */
245 next_nid = next_node(hugetlb_next_nid, node_online_map);
246 if (next_nid == MAX_NUMNODES)
247 next_nid = first_node(node_online_map);
248 hugetlb_next_nid = next_nid;
249 } while (!page && hugetlb_next_nid != start_nid);
250 611
251 if (ret) 612 if (ret)
252 count_vm_event(HTLB_BUDDY_PGALLOC); 613 count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -256,12 +617,15 @@ static int alloc_fresh_huge_page(void)
256 return ret; 617 return ret;
257} 618}
258 619
259static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, 620static struct page *alloc_buddy_huge_page(struct hstate *h,
260 unsigned long address) 621 struct vm_area_struct *vma, unsigned long address)
261{ 622{
262 struct page *page; 623 struct page *page;
263 unsigned int nid; 624 unsigned int nid;
264 625
626 if (h->order >= MAX_ORDER)
627 return NULL;
628
265 /* 629 /*
266 * Assume we will successfully allocate the surplus page to 630 * Assume we will successfully allocate the surplus page to
267 * prevent racing processes from causing the surplus to exceed 631 * prevent racing processes from causing the surplus to exceed
@@ -286,18 +650,18 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
286 * per-node value is checked there. 650 * per-node value is checked there.
287 */ 651 */
288 spin_lock(&hugetlb_lock); 652 spin_lock(&hugetlb_lock);
289 if (surplus_huge_pages >= nr_overcommit_huge_pages) { 653 if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
290 spin_unlock(&hugetlb_lock); 654 spin_unlock(&hugetlb_lock);
291 return NULL; 655 return NULL;
292 } else { 656 } else {
293 nr_huge_pages++; 657 h->nr_huge_pages++;
294 surplus_huge_pages++; 658 h->surplus_huge_pages++;
295 } 659 }
296 spin_unlock(&hugetlb_lock); 660 spin_unlock(&hugetlb_lock);
297 661
298 page = alloc_pages(htlb_alloc_mask|__GFP_COMP| 662 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
299 __GFP_REPEAT|__GFP_NOWARN, 663 __GFP_REPEAT|__GFP_NOWARN,
300 HUGETLB_PAGE_ORDER); 664 huge_page_order(h));
301 665
302 spin_lock(&hugetlb_lock); 666 spin_lock(&hugetlb_lock);
303 if (page) { 667 if (page) {
@@ -312,12 +676,12 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
312 /* 676 /*
313 * We incremented the global counters already 677 * We incremented the global counters already
314 */ 678 */
315 nr_huge_pages_node[nid]++; 679 h->nr_huge_pages_node[nid]++;
316 surplus_huge_pages_node[nid]++; 680 h->surplus_huge_pages_node[nid]++;
317 __count_vm_event(HTLB_BUDDY_PGALLOC); 681 __count_vm_event(HTLB_BUDDY_PGALLOC);
318 } else { 682 } else {
319 nr_huge_pages--; 683 h->nr_huge_pages--;
320 surplus_huge_pages--; 684 h->surplus_huge_pages--;
321 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); 685 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
322 } 686 }
323 spin_unlock(&hugetlb_lock); 687 spin_unlock(&hugetlb_lock);
@@ -329,16 +693,16 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
329 * Increase the hugetlb pool such that it can accommodate a reservation 693
330 * of size 'delta'. 694 * of size 'delta'.
331 */ 695 */
332static int gather_surplus_pages(int delta) 696static int gather_surplus_pages(struct hstate *h, int delta)
333{ 697{
334 struct list_head surplus_list; 698 struct list_head surplus_list;
335 struct page *page, *tmp; 699 struct page *page, *tmp;
336 int ret, i; 700 int ret, i;
337 int needed, allocated; 701 int needed, allocated;
338 702
339 needed = (resv_huge_pages + delta) - free_huge_pages; 703 needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
340 if (needed <= 0) { 704 if (needed <= 0) {
341 resv_huge_pages += delta; 705 h->resv_huge_pages += delta;
342 return 0; 706 return 0;
343 } 707 }
344 708
@@ -349,7 +713,7 @@ static int gather_surplus_pages(int delta)
349retry: 713retry:
350 spin_unlock(&hugetlb_lock); 714 spin_unlock(&hugetlb_lock);
351 for (i = 0; i < needed; i++) { 715 for (i = 0; i < needed; i++) {
352 page = alloc_buddy_huge_page(NULL, 0); 716 page = alloc_buddy_huge_page(h, NULL, 0);
353 if (!page) { 717 if (!page) {
354 /* 718 /*
355 * We were not able to allocate enough pages to 719 * We were not able to allocate enough pages to
@@ -370,7 +734,8 @@ retry:
370 * because either resv_huge_pages or free_huge_pages may have changed. 734 * because either resv_huge_pages or free_huge_pages may have changed.
371 */ 735 */
372 spin_lock(&hugetlb_lock); 736 spin_lock(&hugetlb_lock);
373 needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); 737 needed = (h->resv_huge_pages + delta) -
738 (h->free_huge_pages + allocated);
374 if (needed > 0) 739 if (needed > 0)
375 goto retry; 740 goto retry;
376 741
@@ -383,7 +748,7 @@ retry:
383 * before they are reserved. 748 * before they are reserved.
384 */ 749 */
385 needed += allocated; 750 needed += allocated;
386 resv_huge_pages += delta; 751 h->resv_huge_pages += delta;
387 ret = 0; 752 ret = 0;
388free: 753free:
389 /* Free the needed pages to the hugetlb pool */ 754 /* Free the needed pages to the hugetlb pool */
@@ -391,7 +756,7 @@ free:
391 if ((--needed) < 0) 756 if ((--needed) < 0)
392 break; 757 break;
393 list_del(&page->lru); 758 list_del(&page->lru);
394 enqueue_huge_page(page); 759 enqueue_huge_page(h, page);
395 } 760 }
396 761
397 /* Free unnecessary surplus pages to the buddy allocator */ 762 /* Free unnecessary surplus pages to the buddy allocator */
@@ -419,7 +784,8 @@ free:
419 * allocated to satisfy the reservation must be explicitly freed if they were 784 * allocated to satisfy the reservation must be explicitly freed if they were
420 * never used. 785 * never used.
421 */ 786 */
422static void return_unused_surplus_pages(unsigned long unused_resv_pages) 787static void return_unused_surplus_pages(struct hstate *h,
788 unsigned long unused_resv_pages)
423{ 789{
424 static int nid = -1; 790 static int nid = -1;
425 struct page *page; 791 struct page *page;
@@ -434,114 +800,231 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
434 unsigned long remaining_iterations = num_online_nodes(); 800 unsigned long remaining_iterations = num_online_nodes();
435 801
436 /* Uncommit the reservation */ 802 /* Uncommit the reservation */
437 resv_huge_pages -= unused_resv_pages; 803 h->resv_huge_pages -= unused_resv_pages;
804
805 /* Cannot return gigantic pages currently */
806 if (h->order >= MAX_ORDER)
807 return;
438 808
439 nr_pages = min(unused_resv_pages, surplus_huge_pages); 809 nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
440 810
441 while (remaining_iterations-- && nr_pages) { 811 while (remaining_iterations-- && nr_pages) {
442 nid = next_node(nid, node_online_map); 812 nid = next_node(nid, node_online_map);
443 if (nid == MAX_NUMNODES) 813 if (nid == MAX_NUMNODES)
444 nid = first_node(node_online_map); 814 nid = first_node(node_online_map);
445 815
446 if (!surplus_huge_pages_node[nid]) 816 if (!h->surplus_huge_pages_node[nid])
447 continue; 817 continue;
448 818
449 if (!list_empty(&hugepage_freelists[nid])) { 819 if (!list_empty(&h->hugepage_freelists[nid])) {
450 page = list_entry(hugepage_freelists[nid].next, 820 page = list_entry(h->hugepage_freelists[nid].next,
451 struct page, lru); 821 struct page, lru);
452 list_del(&page->lru); 822 list_del(&page->lru);
453 update_and_free_page(page); 823 update_and_free_page(h, page);
454 free_huge_pages--; 824 h->free_huge_pages--;
455 free_huge_pages_node[nid]--; 825 h->free_huge_pages_node[nid]--;
456 surplus_huge_pages--; 826 h->surplus_huge_pages--;
457 surplus_huge_pages_node[nid]--; 827 h->surplus_huge_pages_node[nid]--;
458 nr_pages--; 828 nr_pages--;
459 remaining_iterations = num_online_nodes(); 829 remaining_iterations = num_online_nodes();
460 } 830 }
461 } 831 }
462} 832}
463 833
834/*
835 * Determine if the huge page at addr within the vma has an associated
 836 * reservation. Where it does not, we will need to logically increase the
 837 * reservation and actually increase quota before an allocation can occur.
 838 * Where any new reservation would be required, the reservation change is
 839 * prepared, but not committed. Once the page has been quota'd, allocated
 840 * and instantiated, the change should be committed via vma_commit_reservation.
841 * No action is required on failure.
842 */
843static int vma_needs_reservation(struct hstate *h,
844 struct vm_area_struct *vma, unsigned long addr)
845{
846 struct address_space *mapping = vma->vm_file->f_mapping;
847 struct inode *inode = mapping->host;
848
849 if (vma->vm_flags & VM_SHARED) {
850 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
851 return region_chg(&inode->i_mapping->private_list,
852 idx, idx + 1);
853
854 } else if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
855 return 1;
464 856
465static struct page *alloc_huge_page_shared(struct vm_area_struct *vma, 857 } else {
466 unsigned long addr) 858 int err;
859 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
860 struct resv_map *reservations = vma_resv_map(vma);
861
862 err = region_chg(&reservations->regions, idx, idx + 1);
863 if (err < 0)
864 return err;
865 return 0;
866 }
867}
868static void vma_commit_reservation(struct hstate *h,
869 struct vm_area_struct *vma, unsigned long addr)
467{ 870{
468 struct page *page; 871 struct address_space *mapping = vma->vm_file->f_mapping;
872 struct inode *inode = mapping->host;
469 873
470 spin_lock(&hugetlb_lock); 874 if (vma->vm_flags & VM_SHARED) {
471 page = dequeue_huge_page_vma(vma, addr); 875 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
472 spin_unlock(&hugetlb_lock); 876 region_add(&inode->i_mapping->private_list, idx, idx + 1);
473 return page ? page : ERR_PTR(-VM_FAULT_OOM); 877
878 } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
879 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
880 struct resv_map *reservations = vma_resv_map(vma);
881
882 /* Mark this page used in the map. */
883 region_add(&reservations->regions, idx, idx + 1);
884 }
474} 885}
475 886
476static struct page *alloc_huge_page_private(struct vm_area_struct *vma, 887static struct page *alloc_huge_page(struct vm_area_struct *vma,
477 unsigned long addr) 888 unsigned long addr, int avoid_reserve)
478{ 889{
479 struct page *page = NULL; 890 struct hstate *h = hstate_vma(vma);
891 struct page *page;
892 struct address_space *mapping = vma->vm_file->f_mapping;
893 struct inode *inode = mapping->host;
894 unsigned int chg;
480 895
481 if (hugetlb_get_quota(vma->vm_file->f_mapping, 1)) 896 /*
482 return ERR_PTR(-VM_FAULT_SIGBUS); 897 * Processes that did not create the mapping will have no reserves and
 898 * will not have accounted against quota. Check that the quota charge can
 899 * be made before satisfying the allocation.
900 * MAP_NORESERVE mappings may also need pages and quota allocated
901 * if no reserve mapping overlaps.
902 */
903 chg = vma_needs_reservation(h, vma, addr);
904 if (chg < 0)
905 return ERR_PTR(chg);
906 if (chg)
907 if (hugetlb_get_quota(inode->i_mapping, chg))
908 return ERR_PTR(-ENOSPC);
483 909
484 spin_lock(&hugetlb_lock); 910 spin_lock(&hugetlb_lock);
485 if (free_huge_pages > resv_huge_pages) 911 page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve);
486 page = dequeue_huge_page_vma(vma, addr);
487 spin_unlock(&hugetlb_lock); 912 spin_unlock(&hugetlb_lock);
913
488 if (!page) { 914 if (!page) {
489 page = alloc_buddy_huge_page(vma, addr); 915 page = alloc_buddy_huge_page(h, vma, addr);
490 if (!page) { 916 if (!page) {
491 hugetlb_put_quota(vma->vm_file->f_mapping, 1); 917 hugetlb_put_quota(inode->i_mapping, chg);
492 return ERR_PTR(-VM_FAULT_OOM); 918 return ERR_PTR(-VM_FAULT_OOM);
493 } 919 }
494 } 920 }
921
922 set_page_refcounted(page);
923 set_page_private(page, (unsigned long) mapping);
924
925 vma_commit_reservation(h, vma, addr);
926
495 return page; 927 return page;
496} 928}
497 929
498static struct page *alloc_huge_page(struct vm_area_struct *vma, 930__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
499 unsigned long addr)
500{ 931{
501 struct page *page; 932 struct huge_bootmem_page *m;
502 struct address_space *mapping = vma->vm_file->f_mapping; 933 int nr_nodes = nodes_weight(node_online_map);
503 934
504 if (vma->vm_flags & VM_MAYSHARE) 935 while (nr_nodes) {
505 page = alloc_huge_page_shared(vma, addr); 936 void *addr;
506 else 937
507 page = alloc_huge_page_private(vma, addr); 938 addr = __alloc_bootmem_node_nopanic(
939 NODE_DATA(h->hugetlb_next_nid),
940 huge_page_size(h), huge_page_size(h), 0);
941
942 if (addr) {
943 /*
944 * Use the beginning of the huge page to store the
945 * huge_bootmem_page struct (until gather_bootmem
946 * puts them into the mem_map).
947 */
948 m = addr;
949 if (m)
950 goto found;
951 }
952 hstate_next_node(h);
953 nr_nodes--;
954 }
955 return 0;
956
957found:
958 BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1));
959 /* Put them into a private list first because mem_map is not up yet */
960 list_add(&m->list, &huge_boot_pages);
961 m->hstate = h;
962 return 1;
963}
508 964
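The order >= MAX_ORDER checks scattered through this patch and the bootmem path above are two views of the same limit. A rough back-of-the-envelope (assuming 4 KB base pages and the common MAX_ORDER of 11):

	largest buddy allocation = 2^(MAX_ORDER - 1) pages = 1024 * 4 KB = 4 MB
	2 MB huge page  -> order 9   (fits, alloc_pages() works)
	1 GB huge page  -> order 18  (too big, bootmem only)

which is why gigantic hstates are populated by alloc_bootmem_huge_page() during early boot and are never grown, shrunk or returned as surplus afterwards.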
509 if (!IS_ERR(page)) { 965/* Put bootmem huge pages into the standard lists after mem_map is up */
510 set_page_refcounted(page); 966static void __init gather_bootmem_prealloc(void)
511 set_page_private(page, (unsigned long) mapping); 967{
968 struct huge_bootmem_page *m;
969
970 list_for_each_entry(m, &huge_boot_pages, list) {
971 struct page *page = virt_to_page(m);
972 struct hstate *h = m->hstate;
973 __ClearPageReserved(page);
974 WARN_ON(page_count(page) != 1);
975 prep_compound_page(page, h->order);
976 prep_new_huge_page(h, page, page_to_nid(page));
512 } 977 }
513 return page;
514} 978}
515 979
516static int __init hugetlb_init(void) 980static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
517{ 981{
518 unsigned long i; 982 unsigned long i;
519 983
520 if (HPAGE_SHIFT == 0) 984 for (i = 0; i < h->max_huge_pages; ++i) {
521 return 0; 985 if (h->order >= MAX_ORDER) {
522 986 if (!alloc_bootmem_huge_page(h))
523 for (i = 0; i < MAX_NUMNODES; ++i) 987 break;
524 INIT_LIST_HEAD(&hugepage_freelists[i]); 988 } else if (!alloc_fresh_huge_page(h))
989 break;
990 }
991 h->max_huge_pages = i;
992}
525 993
526 hugetlb_next_nid = first_node(node_online_map); 994static void __init hugetlb_init_hstates(void)
995{
996 struct hstate *h;
527 997
528 for (i = 0; i < max_huge_pages; ++i) { 998 for_each_hstate(h) {
529 if (!alloc_fresh_huge_page()) 999 /* oversize hugepages were init'ed in early boot */
530 break; 1000 if (h->order < MAX_ORDER)
1001 hugetlb_hstate_alloc_pages(h);
531 } 1002 }
532 max_huge_pages = free_huge_pages = nr_huge_pages = i;
533 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
534 return 0;
535} 1003}
536module_init(hugetlb_init);
537 1004
538static int __init hugetlb_setup(char *s) 1005static char * __init memfmt(char *buf, unsigned long n)
539{ 1006{
540 if (sscanf(s, "%lu", &max_huge_pages) <= 0) 1007 if (n >= (1UL << 30))
541 max_huge_pages = 0; 1008 sprintf(buf, "%lu GB", n >> 30);
542 return 1; 1009 else if (n >= (1UL << 20))
1010 sprintf(buf, "%lu MB", n >> 20);
1011 else
1012 sprintf(buf, "%lu KB", n >> 10);
1013 return buf;
1014}
1015
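For reference, a few sample conversions through memfmt() as used by report_hugepages() below (illustrative values): 2097152 (a 2 MB page size) takes the >= 1 MB branch, 2097152 >> 20 = 2, printing "2 MB"; 17179869184 gives 17179869184 >> 30 = 16, printing "16 GB"; 65536 gives 65536 >> 10 = 64, printing "64 KB".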
1016static void __init report_hugepages(void)
1017{
1018 struct hstate *h;
1019
1020 for_each_hstate(h) {
1021 char buf[32];
1022 printk(KERN_INFO "HugeTLB registered %s page size, "
1023 "pre-allocated %ld pages\n",
1024 memfmt(buf, huge_page_size(h)),
1025 h->free_huge_pages);
1026 }
543} 1027}
544__setup("hugepages=", hugetlb_setup);
545 1028
546static unsigned int cpuset_mems_nr(unsigned int *array) 1029static unsigned int cpuset_mems_nr(unsigned int *array)
547{ 1030{
@@ -556,35 +1039,42 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
556 1039
557#ifdef CONFIG_SYSCTL 1040#ifdef CONFIG_SYSCTL
558#ifdef CONFIG_HIGHMEM 1041#ifdef CONFIG_HIGHMEM
559static void try_to_free_low(unsigned long count) 1042static void try_to_free_low(struct hstate *h, unsigned long count)
560{ 1043{
561 int i; 1044 int i;
562 1045
1046 if (h->order >= MAX_ORDER)
1047 return;
1048
563 for (i = 0; i < MAX_NUMNODES; ++i) { 1049 for (i = 0; i < MAX_NUMNODES; ++i) {
564 struct page *page, *next; 1050 struct page *page, *next;
565 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { 1051 struct list_head *freel = &h->hugepage_freelists[i];
566 if (count >= nr_huge_pages) 1052 list_for_each_entry_safe(page, next, freel, lru) {
1053 if (count >= h->nr_huge_pages)
567 return; 1054 return;
568 if (PageHighMem(page)) 1055 if (PageHighMem(page))
569 continue; 1056 continue;
570 list_del(&page->lru); 1057 list_del(&page->lru);
571 update_and_free_page(page); 1058 update_and_free_page(h, page);
572 free_huge_pages--; 1059 h->free_huge_pages--;
573 free_huge_pages_node[page_to_nid(page)]--; 1060 h->free_huge_pages_node[page_to_nid(page)]--;
574 } 1061 }
575 } 1062 }
576} 1063}
577#else 1064#else
578static inline void try_to_free_low(unsigned long count) 1065static inline void try_to_free_low(struct hstate *h, unsigned long count)
579{ 1066{
580} 1067}
581#endif 1068#endif
582 1069
583#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) 1070#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
584static unsigned long set_max_huge_pages(unsigned long count) 1071static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
585{ 1072{
586 unsigned long min_count, ret; 1073 unsigned long min_count, ret;
587 1074
1075 if (h->order >= MAX_ORDER)
1076 return h->max_huge_pages;
1077
588 /* 1078 /*
589 * Increase the pool size 1079 * Increase the pool size
590 * First take pages out of surplus state. Then make up the 1080 * First take pages out of surplus state. Then make up the
@@ -597,20 +1087,19 @@ static unsigned long set_max_huge_pages(unsigned long count)
597 * within all the constraints specified by the sysctls. 1087 * within all the constraints specified by the sysctls.
598 */ 1088 */
599 spin_lock(&hugetlb_lock); 1089 spin_lock(&hugetlb_lock);
600 while (surplus_huge_pages && count > persistent_huge_pages) { 1090 while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
601 if (!adjust_pool_surplus(-1)) 1091 if (!adjust_pool_surplus(h, -1))
602 break; 1092 break;
603 } 1093 }
604 1094
605 while (count > persistent_huge_pages) { 1095 while (count > persistent_huge_pages(h)) {
606 int ret;
607 /* 1096 /*
608 * If this allocation races such that we no longer need the 1097 * If this allocation races such that we no longer need the
609 * page, free_huge_page will handle it by freeing the page 1098 * page, free_huge_page will handle it by freeing the page
610 * and reducing the surplus. 1099 * and reducing the surplus.
611 */ 1100 */
612 spin_unlock(&hugetlb_lock); 1101 spin_unlock(&hugetlb_lock);
613 ret = alloc_fresh_huge_page(); 1102 ret = alloc_fresh_huge_page(h);
614 spin_lock(&hugetlb_lock); 1103 spin_lock(&hugetlb_lock);
615 if (!ret) 1104 if (!ret)
616 goto out; 1105 goto out;
@@ -632,31 +1121,288 @@ static unsigned long set_max_huge_pages(unsigned long count)
632 * and won't grow the pool anywhere else. Not until one of the 1121 * and won't grow the pool anywhere else. Not until one of the
633 * sysctls are changed, or the surplus pages go out of use. 1122 * sysctls are changed, or the surplus pages go out of use.
634 */ 1123 */
635 min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; 1124 min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
636 min_count = max(count, min_count); 1125 min_count = max(count, min_count);
637 try_to_free_low(min_count); 1126 try_to_free_low(h, min_count);
638 while (min_count < persistent_huge_pages) { 1127 while (min_count < persistent_huge_pages(h)) {
639 struct page *page = dequeue_huge_page(); 1128 struct page *page = dequeue_huge_page(h);
640 if (!page) 1129 if (!page)
641 break; 1130 break;
642 update_and_free_page(page); 1131 update_and_free_page(h, page);
643 } 1132 }
644 while (count < persistent_huge_pages) { 1133 while (count < persistent_huge_pages(h)) {
645 if (!adjust_pool_surplus(1)) 1134 if (!adjust_pool_surplus(h, 1))
646 break; 1135 break;
647 } 1136 }
648out: 1137out:
649 ret = persistent_huge_pages; 1138 ret = persistent_huge_pages(h);
650 spin_unlock(&hugetlb_lock); 1139 spin_unlock(&hugetlb_lock);
651 return ret; 1140 return ret;
652} 1141}
653 1142
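A short worked example of the shrink clamp above (illustrative numbers): with nr_huge_pages = 10, free_huge_pages = 4 and resv_huge_pages = 2, six pages are currently mapped and two more are promised to reservations, so

	min_count = resv + nr - free = 2 + 10 - 4 = 8

and a request to shrink the pool to, say, 4 only brings the persistent pool down to 8 immediately; the remaining excess is converted to surplus by adjust_pool_surplus() and drifts back to the buddy allocator as those pages are freed.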
1143#define HSTATE_ATTR_RO(_name) \
1144 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
1145
1146#define HSTATE_ATTR(_name) \
1147 static struct kobj_attribute _name##_attr = \
1148 __ATTR(_name, 0644, _name##_show, _name##_store)
1149
1150static struct kobject *hugepages_kobj;
1151static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
1152
1153static struct hstate *kobj_to_hstate(struct kobject *kobj)
1154{
1155 int i;
1156 for (i = 0; i < HUGE_MAX_HSTATE; i++)
1157 if (hstate_kobjs[i] == kobj)
1158 return &hstates[i];
1159 BUG();
1160 return NULL;
1161}
1162
1163static ssize_t nr_hugepages_show(struct kobject *kobj,
1164 struct kobj_attribute *attr, char *buf)
1165{
1166 struct hstate *h = kobj_to_hstate(kobj);
1167 return sprintf(buf, "%lu\n", h->nr_huge_pages);
1168}
1169static ssize_t nr_hugepages_store(struct kobject *kobj,
1170 struct kobj_attribute *attr, const char *buf, size_t count)
1171{
1172 int err;
1173 unsigned long input;
1174 struct hstate *h = kobj_to_hstate(kobj);
1175
1176 err = strict_strtoul(buf, 10, &input);
1177 if (err)
1178 return 0;
1179
1180 h->max_huge_pages = set_max_huge_pages(h, input);
1181
1182 return count;
1183}
1184HSTATE_ATTR(nr_hugepages);
1185
1186static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
1187 struct kobj_attribute *attr, char *buf)
1188{
1189 struct hstate *h = kobj_to_hstate(kobj);
1190 return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
1191}
1192static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
1193 struct kobj_attribute *attr, const char *buf, size_t count)
1194{
1195 int err;
1196 unsigned long input;
1197 struct hstate *h = kobj_to_hstate(kobj);
1198
1199 err = strict_strtoul(buf, 10, &input);
1200 if (err)
1201 return 0;
1202
1203 spin_lock(&hugetlb_lock);
1204 h->nr_overcommit_huge_pages = input;
1205 spin_unlock(&hugetlb_lock);
1206
1207 return count;
1208}
1209HSTATE_ATTR(nr_overcommit_hugepages);
1210
1211static ssize_t free_hugepages_show(struct kobject *kobj,
1212 struct kobj_attribute *attr, char *buf)
1213{
1214 struct hstate *h = kobj_to_hstate(kobj);
1215 return sprintf(buf, "%lu\n", h->free_huge_pages);
1216}
1217HSTATE_ATTR_RO(free_hugepages);
1218
1219static ssize_t resv_hugepages_show(struct kobject *kobj,
1220 struct kobj_attribute *attr, char *buf)
1221{
1222 struct hstate *h = kobj_to_hstate(kobj);
1223 return sprintf(buf, "%lu\n", h->resv_huge_pages);
1224}
1225HSTATE_ATTR_RO(resv_hugepages);
1226
1227static ssize_t surplus_hugepages_show(struct kobject *kobj,
1228 struct kobj_attribute *attr, char *buf)
1229{
1230 struct hstate *h = kobj_to_hstate(kobj);
1231 return sprintf(buf, "%lu\n", h->surplus_huge_pages);
1232}
1233HSTATE_ATTR_RO(surplus_hugepages);
1234
1235static struct attribute *hstate_attrs[] = {
1236 &nr_hugepages_attr.attr,
1237 &nr_overcommit_hugepages_attr.attr,
1238 &free_hugepages_attr.attr,
1239 &resv_hugepages_attr.attr,
1240 &surplus_hugepages_attr.attr,
1241 NULL,
1242};
1243
1244static struct attribute_group hstate_attr_group = {
1245 .attrs = hstate_attrs,
1246};
1247
1248static int __init hugetlb_sysfs_add_hstate(struct hstate *h)
1249{
1250 int retval;
1251
1252 hstate_kobjs[h - hstates] = kobject_create_and_add(h->name,
1253 hugepages_kobj);
1254 if (!hstate_kobjs[h - hstates])
1255 return -ENOMEM;
1256
1257 retval = sysfs_create_group(hstate_kobjs[h - hstates],
1258 &hstate_attr_group);
1259 if (retval)
1260 kobject_put(hstate_kobjs[h - hstates]);
1261
1262 return retval;
1263}
1264
1265static void __init hugetlb_sysfs_init(void)
1266{
1267 struct hstate *h;
1268 int err;
1269
1270 hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
1271 if (!hugepages_kobj)
1272 return;
1273
1274 for_each_hstate(h) {
1275 err = hugetlb_sysfs_add_hstate(h);
1276 if (err)
1277 printk(KERN_ERR "Hugetlb: Unable to add hstate %s",
1278 h->name);
1279 }
1280}
1281
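The kobject plumbing above ends up as one directory per page size under /sys/kernel/mm/hugepages/, named after h->name (e.g. hugepages-2048kB on x86 with 2 MB pages). A small user-space sketch of consuming that interface; the path is an assumption about which hstates were registered, not something this patch guarantees:

	#include <stdio.h>

	int main(void)
	{
		/* Path assumes a 2 MB hstate was registered; adjust to h->name. */
		const char *path =
			"/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages";
		unsigned long nr;
		FILE *f = fopen(path, "r");

		if (!f) {
			perror(path);
			return 1;
		}
		if (fscanf(f, "%lu", &nr) != 1) {
			fclose(f);
			return 1;
		}
		printf("persistent 2MB huge pages: %lu\n", nr);
		fclose(f);
		return 0;
	}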
1282static void __exit hugetlb_exit(void)
1283{
1284 struct hstate *h;
1285
1286 for_each_hstate(h) {
1287 kobject_put(hstate_kobjs[h - hstates]);
1288 }
1289
1290 kobject_put(hugepages_kobj);
1291}
1292module_exit(hugetlb_exit);
1293
1294static int __init hugetlb_init(void)
1295{
1296 BUILD_BUG_ON(HPAGE_SHIFT == 0);
1297
1298 if (!size_to_hstate(default_hstate_size)) {
1299 default_hstate_size = HPAGE_SIZE;
1300 if (!size_to_hstate(default_hstate_size))
1301 hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
1302 }
1303 default_hstate_idx = size_to_hstate(default_hstate_size) - hstates;
1304 if (default_hstate_max_huge_pages)
1305 default_hstate.max_huge_pages = default_hstate_max_huge_pages;
1306
1307 hugetlb_init_hstates();
1308
1309 gather_bootmem_prealloc();
1310
1311 report_hugepages();
1312
1313 hugetlb_sysfs_init();
1314
1315 return 0;
1316}
1317module_init(hugetlb_init);
1318
1319/* Should be called on processing a hugepagesz=... option */
1320void __init hugetlb_add_hstate(unsigned order)
1321{
1322 struct hstate *h;
1323 unsigned long i;
1324
1325 if (size_to_hstate(PAGE_SIZE << order)) {
1326 printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n");
1327 return;
1328 }
1329 BUG_ON(max_hstate >= HUGE_MAX_HSTATE);
1330 BUG_ON(order == 0);
1331 h = &hstates[max_hstate++];
1332 h->order = order;
1333 h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
1334 h->nr_huge_pages = 0;
1335 h->free_huge_pages = 0;
1336 for (i = 0; i < MAX_NUMNODES; ++i)
1337 INIT_LIST_HEAD(&h->hugepage_freelists[i]);
1338 h->hugetlb_next_nid = first_node(node_online_map);
1339 snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
1340 huge_page_size(h)/1024);
1341
1342 parsed_hstate = h;
1343}
1344
1345static int __init hugetlb_nrpages_setup(char *s)
1346{
1347 unsigned long *mhp;
1348 static unsigned long *last_mhp;
1349
1350 /*
1351 * !max_hstate means we haven't parsed a hugepagesz= parameter yet,
1352 * so this hugepages= parameter goes to the "default hstate".
1353 */
1354 if (!max_hstate)
1355 mhp = &default_hstate_max_huge_pages;
1356 else
1357 mhp = &parsed_hstate->max_huge_pages;
1358
1359 if (mhp == last_mhp) {
1360 printk(KERN_WARNING "hugepages= specified twice without "
1361 "interleaving hugepagesz=, ignoring\n");
1362 return 1;
1363 }
1364
1365 if (sscanf(s, "%lu", mhp) <= 0)
1366 *mhp = 0;
1367
1368 /*
1369 * Global state is always initialized later in hugetlb_init.
1370 * But we need to allocate >= MAX_ORDER hstates here early to still
1371 * use the bootmem allocator.
1372 */
1373 if (max_hstate && parsed_hstate->order >= MAX_ORDER)
1374 hugetlb_hstate_alloc_pages(parsed_hstate);
1375
1376 last_mhp = mhp;
1377
1378 return 1;
1379}
1380__setup("hugepages=", hugetlb_nrpages_setup);
1381
1382static int __init hugetlb_default_setup(char *s)
1383{
1384 default_hstate_size = memparse(s, &s);
1385 return 1;
1386}
1387__setup("default_hugepagesz=", hugetlb_default_setup);
1388
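Putting the three __setup() handlers together, the pool can be described per page size on the kernel command line: each hugepages= count binds to the most recent hugepagesz=, and a bare hugepages= with no preceding hugepagesz= goes to the default hstate. An illustrative line (the sizes assume an architecture whose hugepagesz= handler accepts them, e.g. x86_64 with 1 GB page support):

	default_hugepagesz=2M hugepagesz=2M hugepages=512 hugepagesz=1G hugepages=2

asks for 512 two-megabyte pages and two 1 GB pages; the 1 GB hstate exceeds MAX_ORDER, so hugetlb_nrpages_setup() allocates its pages immediately from bootmem instead of waiting for hugetlb_init().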
654int hugetlb_sysctl_handler(struct ctl_table *table, int write, 1389int hugetlb_sysctl_handler(struct ctl_table *table, int write,
655 struct file *file, void __user *buffer, 1390 struct file *file, void __user *buffer,
656 size_t *length, loff_t *ppos) 1391 size_t *length, loff_t *ppos)
657{ 1392{
1393 struct hstate *h = &default_hstate;
1394 unsigned long tmp;
1395
1396 if (!write)
1397 tmp = h->max_huge_pages;
1398
1399 table->data = &tmp;
1400 table->maxlen = sizeof(unsigned long);
658 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1401 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
659 max_huge_pages = set_max_huge_pages(max_huge_pages); 1402
1403 if (write)
1404 h->max_huge_pages = set_max_huge_pages(h, tmp);
1405
660 return 0; 1406 return 0;
661} 1407}
662 1408
@@ -676,10 +1422,22 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
676 struct file *file, void __user *buffer, 1422 struct file *file, void __user *buffer,
677 size_t *length, loff_t *ppos) 1423 size_t *length, loff_t *ppos)
678{ 1424{
1425 struct hstate *h = &default_hstate;
1426 unsigned long tmp;
1427
1428 if (!write)
1429 tmp = h->nr_overcommit_huge_pages;
1430
1431 table->data = &tmp;
1432 table->maxlen = sizeof(unsigned long);
679 proc_doulongvec_minmax(table, write, file, buffer, length, ppos); 1433 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
680 spin_lock(&hugetlb_lock); 1434
681 nr_overcommit_huge_pages = sysctl_overcommit_huge_pages; 1435 if (write) {
682 spin_unlock(&hugetlb_lock); 1436 spin_lock(&hugetlb_lock);
1437 h->nr_overcommit_huge_pages = tmp;
1438 spin_unlock(&hugetlb_lock);
1439 }
1440
683 return 0; 1441 return 0;
684} 1442}
685 1443
@@ -687,34 +1445,118 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
687 1445
688int hugetlb_report_meminfo(char *buf) 1446int hugetlb_report_meminfo(char *buf)
689{ 1447{
1448 struct hstate *h = &default_hstate;
690 return sprintf(buf, 1449 return sprintf(buf,
691 "HugePages_Total: %5lu\n" 1450 "HugePages_Total: %5lu\n"
692 "HugePages_Free: %5lu\n" 1451 "HugePages_Free: %5lu\n"
693 "HugePages_Rsvd: %5lu\n" 1452 "HugePages_Rsvd: %5lu\n"
694 "HugePages_Surp: %5lu\n" 1453 "HugePages_Surp: %5lu\n"
695 "Hugepagesize: %5lu kB\n", 1454 "Hugepagesize: %5lu kB\n",
696 nr_huge_pages, 1455 h->nr_huge_pages,
697 free_huge_pages, 1456 h->free_huge_pages,
698 resv_huge_pages, 1457 h->resv_huge_pages,
699 surplus_huge_pages, 1458 h->surplus_huge_pages,
700 HPAGE_SIZE/1024); 1459 1UL << (huge_page_order(h) + PAGE_SHIFT - 10));
701} 1460}
702 1461
703int hugetlb_report_node_meminfo(int nid, char *buf) 1462int hugetlb_report_node_meminfo(int nid, char *buf)
704{ 1463{
1464 struct hstate *h = &default_hstate;
705 return sprintf(buf, 1465 return sprintf(buf,
706 "Node %d HugePages_Total: %5u\n" 1466 "Node %d HugePages_Total: %5u\n"
707 "Node %d HugePages_Free: %5u\n" 1467 "Node %d HugePages_Free: %5u\n"
708 "Node %d HugePages_Surp: %5u\n", 1468 "Node %d HugePages_Surp: %5u\n",
709 nid, nr_huge_pages_node[nid], 1469 nid, h->nr_huge_pages_node[nid],
710 nid, free_huge_pages_node[nid], 1470 nid, h->free_huge_pages_node[nid],
711 nid, surplus_huge_pages_node[nid]); 1471 nid, h->surplus_huge_pages_node[nid]);
712} 1472}
713 1473
714/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 1474/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
715unsigned long hugetlb_total_pages(void) 1475unsigned long hugetlb_total_pages(void)
716{ 1476{
717 return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE); 1477 struct hstate *h = &default_hstate;
1478 return h->nr_huge_pages * pages_per_huge_page(h);
1479}
1480
1481static int hugetlb_acct_memory(struct hstate *h, long delta)
1482{
1483 int ret = -ENOMEM;
1484
1485 spin_lock(&hugetlb_lock);
1486 /*
1487 * When cpuset is configured, it breaks the strict hugetlb page
1488 * reservation as the accounting is done on a global variable. Such
1489 * reservation is completely rubbish in the presence of cpuset because
1490 * the reservation is not checked against page availability for the
 1491 * current cpuset. An application can still be OOM'ed by the kernel for
 1492 * lack of free htlb pages in the cpuset that the task is in.
 1493 * Attempting to enforce strict accounting with cpuset is almost
 1494 * impossible (or too ugly) because cpuset is so fluid that a
 1495 * task or memory node can be dynamically moved between cpusets.
1496 *
1497 * The change of semantics for shared hugetlb mapping with cpuset is
1498 * undesirable. However, in order to preserve some of the semantics,
1499 * we fall back to check against current free page availability as
1500 * a best attempt and hopefully to minimize the impact of changing
1501 * semantics that cpuset has.
1502 */
1503 if (delta > 0) {
1504 if (gather_surplus_pages(h, delta) < 0)
1505 goto out;
1506
1507 if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
1508 return_unused_surplus_pages(h, delta);
1509 goto out;
1510 }
1511 }
1512
1513 ret = 0;
1514 if (delta < 0)
1515 return_unused_surplus_pages(h, (unsigned long) -delta);
1516
1517out:
1518 spin_unlock(&hugetlb_lock);
1519 return ret;
1520}
1521
1522static void hugetlb_vm_op_open(struct vm_area_struct *vma)
1523{
1524 struct resv_map *reservations = vma_resv_map(vma);
1525
1526 /*
 1527 * This new VMA should share its sibling's reservation map if present.
1528 * The VMA will only ever have a valid reservation map pointer where
1529 * it is being copied for another still existing VMA. As that VMA
 1530 * has a reference to the reservation map, it cannot disappear until
1531 * after this open call completes. It is therefore safe to take a
1532 * new reference here without additional locking.
1533 */
1534 if (reservations)
1535 kref_get(&reservations->refs);
1536}
1537
1538static void hugetlb_vm_op_close(struct vm_area_struct *vma)
1539{
1540 struct hstate *h = hstate_vma(vma);
1541 struct resv_map *reservations = vma_resv_map(vma);
1542 unsigned long reserve;
1543 unsigned long start;
1544 unsigned long end;
1545
1546 if (reservations) {
1547 start = vma_hugecache_offset(h, vma, vma->vm_start);
1548 end = vma_hugecache_offset(h, vma, vma->vm_end);
1549
1550 reserve = (end - start) -
1551 region_count(&reservations->regions, start, end);
1552
1553 kref_put(&reservations->refs, resv_map_release);
1554
1555 if (reserve) {
1556 hugetlb_acct_memory(h, -reserve);
1557 hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
1558 }
1559 }
718} 1560}
719 1561
720/* 1562/*
@@ -731,6 +1573,8 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
731 1573
732struct vm_operations_struct hugetlb_vm_ops = { 1574struct vm_operations_struct hugetlb_vm_ops = {
733 .fault = hugetlb_vm_op_fault, 1575 .fault = hugetlb_vm_op_fault,
1576 .open = hugetlb_vm_op_open,
1577 .close = hugetlb_vm_op_close,
734}; 1578};
735 1579
736static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, 1580static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -769,14 +1613,16 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
769 struct page *ptepage; 1613 struct page *ptepage;
770 unsigned long addr; 1614 unsigned long addr;
771 int cow; 1615 int cow;
1616 struct hstate *h = hstate_vma(vma);
1617 unsigned long sz = huge_page_size(h);
772 1618
773 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; 1619 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
774 1620
775 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 1621 for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
776 src_pte = huge_pte_offset(src, addr); 1622 src_pte = huge_pte_offset(src, addr);
777 if (!src_pte) 1623 if (!src_pte)
778 continue; 1624 continue;
779 dst_pte = huge_pte_alloc(dst, addr); 1625 dst_pte = huge_pte_alloc(dst, addr, sz);
780 if (!dst_pte) 1626 if (!dst_pte)
781 goto nomem; 1627 goto nomem;
782 1628
@@ -804,7 +1650,7 @@ nomem:
804} 1650}
805 1651
806void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 1652void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
807 unsigned long end) 1653 unsigned long end, struct page *ref_page)
808{ 1654{
809 struct mm_struct *mm = vma->vm_mm; 1655 struct mm_struct *mm = vma->vm_mm;
810 unsigned long address; 1656 unsigned long address;
@@ -812,6 +1658,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
812 pte_t pte; 1658 pte_t pte;
813 struct page *page; 1659 struct page *page;
814 struct page *tmp; 1660 struct page *tmp;
1661 struct hstate *h = hstate_vma(vma);
1662 unsigned long sz = huge_page_size(h);
1663
815 /* 1664 /*
816 * A page gathering list, protected by per file i_mmap_lock. The 1665 * A page gathering list, protected by per file i_mmap_lock. The
817 * lock is used to avoid list corruption from multiple unmapping 1666 * lock is used to avoid list corruption from multiple unmapping
@@ -820,11 +1669,11 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
820 LIST_HEAD(page_list); 1669 LIST_HEAD(page_list);
821 1670
822 WARN_ON(!is_vm_hugetlb_page(vma)); 1671 WARN_ON(!is_vm_hugetlb_page(vma));
823 BUG_ON(start & ~HPAGE_MASK); 1672 BUG_ON(start & ~huge_page_mask(h));
824 BUG_ON(end & ~HPAGE_MASK); 1673 BUG_ON(end & ~huge_page_mask(h));
825 1674
826 spin_lock(&mm->page_table_lock); 1675 spin_lock(&mm->page_table_lock);
827 for (address = start; address < end; address += HPAGE_SIZE) { 1676 for (address = start; address < end; address += sz) {
828 ptep = huge_pte_offset(mm, address); 1677 ptep = huge_pte_offset(mm, address);
829 if (!ptep) 1678 if (!ptep)
830 continue; 1679 continue;
@@ -832,6 +1681,27 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
832 if (huge_pmd_unshare(mm, &address, ptep)) 1681 if (huge_pmd_unshare(mm, &address, ptep))
833 continue; 1682 continue;
834 1683
1684 /*
1685 * If a reference page is supplied, it is because a specific
1686 * page is being unmapped, not a range. Ensure the page we
1687 * are about to unmap is the actual page of interest.
1688 */
1689 if (ref_page) {
1690 pte = huge_ptep_get(ptep);
1691 if (huge_pte_none(pte))
1692 continue;
1693 page = pte_page(pte);
1694 if (page != ref_page)
1695 continue;
1696
1697 /*
1698 * Mark the VMA as having unmapped its page so that
1699 * future faults in this VMA will fail rather than
1700 * looking like data was lost
1701 */
1702 set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED);
1703 }
1704
835 pte = huge_ptep_get_and_clear(mm, address, ptep); 1705 pte = huge_ptep_get_and_clear(mm, address, ptep);
836 if (huge_pte_none(pte)) 1706 if (huge_pte_none(pte))
837 continue; 1707 continue;
@@ -850,31 +1720,71 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
850} 1720}
851 1721
852void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 1722void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
853 unsigned long end) 1723 unsigned long end, struct page *ref_page)
854{ 1724{
1725 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1726 __unmap_hugepage_range(vma, start, end, ref_page);
1727 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
1728}
1729
1730/*
1731 * This is called when the original mapper is failing to COW a MAP_PRIVATE
 1732 * mapping it owns the reserve page for. The intention is to unmap the page
1733 * from other VMAs and let the children be SIGKILLed if they are faulting the
1734 * same region.
1735 */
1736int unmap_ref_private(struct mm_struct *mm,
1737 struct vm_area_struct *vma,
1738 struct page *page,
1739 unsigned long address)
1740{
1741 struct vm_area_struct *iter_vma;
1742 struct address_space *mapping;
1743 struct prio_tree_iter iter;
1744 pgoff_t pgoff;
1745
855 /* 1746 /*
856 * It is undesirable to test vma->vm_file as it should be non-null 1747 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
857 * for valid hugetlb area. However, vm_file will be NULL in the error 1748 * from page cache lookup which is in HPAGE_SIZE units.
858 * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails,
859 * do_mmap_pgoff() nullifies vma->vm_file before calling this function
860 * to clean up. Since no pte has actually been setup, it is safe to
861 * do nothing in this case.
862 */ 1749 */
863 if (vma->vm_file) { 1750 address = address & huge_page_mask(hstate_vma(vma));
864 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 1751 pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
865 __unmap_hugepage_range(vma, start, end); 1752 + (vma->vm_pgoff >> PAGE_SHIFT);
866 spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); 1753 mapping = (struct address_space *)page_private(page);
1754
1755 vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
1756 /* Do not unmap the current VMA */
1757 if (iter_vma == vma)
1758 continue;
1759
1760 /*
1761 * Unmap the page from other VMAs without their own reserves.
1762 * They get marked to be SIGKILLed if they fault in these
1763 * areas. This is because a future no-page fault on this VMA
1764 * could insert a zeroed page instead of the data existing
1765 * from the time of fork. This would look like data corruption
1766 */
1767 if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
1768 unmap_hugepage_range(iter_vma,
1769 address, address + HPAGE_SIZE,
1770 page);
867 } 1771 }
1772
1773 return 1;
868} 1774}
869 1775
870static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 1776static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
871 unsigned long address, pte_t *ptep, pte_t pte) 1777 unsigned long address, pte_t *ptep, pte_t pte,
1778 struct page *pagecache_page)
872{ 1779{
1780 struct hstate *h = hstate_vma(vma);
873 struct page *old_page, *new_page; 1781 struct page *old_page, *new_page;
874 int avoidcopy; 1782 int avoidcopy;
1783 int outside_reserve = 0;
875 1784
876 old_page = pte_page(pte); 1785 old_page = pte_page(pte);
877 1786
1787retry_avoidcopy:
878 /* If no-one else is actually using this page, avoid the copy 1788 /* If no-one else is actually using this page, avoid the copy
879 * and just make the page writable */ 1789 * and just make the page writable */
880 avoidcopy = (page_count(old_page) == 1); 1790 avoidcopy = (page_count(old_page) == 1);
@@ -883,11 +1793,43 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
883 return 0; 1793 return 0;
884 } 1794 }
885 1795
1796 /*
1797 * If the process that created a MAP_PRIVATE mapping is about to
1798 * perform a COW due to a shared page count, attempt to satisfy
1799 * the allocation without using the existing reserves. The pagecache
1800 * page is used to determine if the reserve at this address was
 1801 * consumed or not. If reserves were used, a partially faulted mapping
1802 * at the time of fork() could consume its reserves on COW instead
1803 * of the full address range.
1804 */
1805 if (!(vma->vm_flags & VM_SHARED) &&
1806 is_vma_resv_set(vma, HPAGE_RESV_OWNER) &&
1807 old_page != pagecache_page)
1808 outside_reserve = 1;
1809
886 page_cache_get(old_page); 1810 page_cache_get(old_page);
887 new_page = alloc_huge_page(vma, address); 1811 new_page = alloc_huge_page(vma, address, outside_reserve);
888 1812
889 if (IS_ERR(new_page)) { 1813 if (IS_ERR(new_page)) {
890 page_cache_release(old_page); 1814 page_cache_release(old_page);
1815
1816 /*
1817 * If a process owning a MAP_PRIVATE mapping fails to COW,
1818 * it is due to references held by a child and an insufficient
 1819 * huge page pool. To guarantee the original mapper's
 1820 * reliability, unmap the page from child processes. The child
1821 * may get SIGKILLed if it later faults.
1822 */
1823 if (outside_reserve) {
1824 BUG_ON(huge_pte_none(pte));
1825 if (unmap_ref_private(mm, vma, old_page, address)) {
1826 BUG_ON(page_count(old_page) != 1);
1827 BUG_ON(huge_pte_none(pte));
1828 goto retry_avoidcopy;
1829 }
1830 WARN_ON_ONCE(1);
1831 }
1832
891 return -PTR_ERR(new_page); 1833 return -PTR_ERR(new_page);
892 } 1834 }
893 1835
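
Aside for readers of the COW hunk above: outside_reserve is derived from three facts about the faulting VMA before alloc_huge_page() is called. The fragment below is a standalone userspace sketch of just that predicate, not code from the patch; struct cow_state and its field names are invented for illustration.

#include <stdbool.h>

/* Hypothetical container for the three inputs the COW path consults. */
struct cow_state {
        bool shared;            /* vma->vm_flags & VM_SHARED              */
        bool reserve_owner;     /* is_vma_resv_set(vma, HPAGE_RESV_OWNER) */
        bool old_is_pagecache;  /* old_page == pagecache_page             */
};

/* Only the private-mapping owner, faulting on a page other than the one
 * backing the file offset, may allocate outside its reserve. */
static bool cow_outside_reserve(const struct cow_state *s)
{
        return !s->shared && s->reserve_owner && !s->old_is_pagecache;
}

int main(void)
{
        struct cow_state s = { .reserve_owner = true };
        return cow_outside_reserve(&s) ? 0 : 1;   /* exits 0: outside reserve */
}
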
@@ -896,7 +1838,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
896 __SetPageUptodate(new_page); 1838 __SetPageUptodate(new_page);
897 spin_lock(&mm->page_table_lock); 1839 spin_lock(&mm->page_table_lock);
898 1840
899 ptep = huge_pte_offset(mm, address & HPAGE_MASK); 1841 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
900 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 1842 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
901 /* Break COW */ 1843 /* Break COW */
902 huge_ptep_clear_flush(vma, address, ptep); 1844 huge_ptep_clear_flush(vma, address, ptep);
@@ -910,19 +1852,44 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
910 return 0; 1852 return 0;
911} 1853}
912 1854
1855/* Return the pagecache page at a given address within a VMA */
1856static struct page *hugetlbfs_pagecache_page(struct hstate *h,
1857 struct vm_area_struct *vma, unsigned long address)
1858{
1859 struct address_space *mapping;
1860 pgoff_t idx;
1861
1862 mapping = vma->vm_file->f_mapping;
1863 idx = vma_hugecache_offset(h, vma, address);
1864
1865 return find_lock_page(mapping, idx);
1866}
1867
913static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, 1868static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
914 unsigned long address, pte_t *ptep, int write_access) 1869 unsigned long address, pte_t *ptep, int write_access)
915{ 1870{
1871 struct hstate *h = hstate_vma(vma);
916 int ret = VM_FAULT_SIGBUS; 1872 int ret = VM_FAULT_SIGBUS;
917 unsigned long idx; 1873 pgoff_t idx;
918 unsigned long size; 1874 unsigned long size;
919 struct page *page; 1875 struct page *page;
920 struct address_space *mapping; 1876 struct address_space *mapping;
921 pte_t new_pte; 1877 pte_t new_pte;
922 1878
1879 /*
1880 * Currently, we are forced to kill the process in the event the
1881 * original mapper has unmapped pages from the child due to a failed
 1882 * COW. Warn that such a situation has occurred, as it may not be obvious
1883 */
1884 if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
1885 printk(KERN_WARNING
1886 "PID %d killed due to inadequate hugepage pool\n",
1887 current->pid);
1888 return ret;
1889 }
1890
923 mapping = vma->vm_file->f_mapping; 1891 mapping = vma->vm_file->f_mapping;
924 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 1892 idx = vma_hugecache_offset(h, vma, address);
925 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
926 1893
927 /* 1894 /*
928 * Use page lock to guard against racing truncation 1895 * Use page lock to guard against racing truncation
@@ -931,15 +1898,15 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
931retry: 1898retry:
932 page = find_lock_page(mapping, idx); 1899 page = find_lock_page(mapping, idx);
933 if (!page) { 1900 if (!page) {
934 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1901 size = i_size_read(mapping->host) >> huge_page_shift(h);
935 if (idx >= size) 1902 if (idx >= size)
936 goto out; 1903 goto out;
937 page = alloc_huge_page(vma, address); 1904 page = alloc_huge_page(vma, address, 0);
938 if (IS_ERR(page)) { 1905 if (IS_ERR(page)) {
939 ret = -PTR_ERR(page); 1906 ret = -PTR_ERR(page);
940 goto out; 1907 goto out;
941 } 1908 }
942 clear_huge_page(page, address); 1909 clear_huge_page(page, address, huge_page_size(h));
943 __SetPageUptodate(page); 1910 __SetPageUptodate(page);
944 1911
945 if (vma->vm_flags & VM_SHARED) { 1912 if (vma->vm_flags & VM_SHARED) {
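
The hugetlb_no_page() hunk above swaps the fixed HPAGE_SHIFT for the per-hstate huge_page_shift(h) when bounding the fault index against i_size. Below is a minimal userspace sketch of that arithmetic; the 21-bit shift simply assumes 2 MB huge pages and is not taken from the patch.

#include <stdbool.h>
#include <stdio.h>

/* Is huge-page index idx still inside a file of i_size bytes, given the
 * huge page size expressed as a shift? Mirrors the "idx >= size" check. */
static bool index_in_file(unsigned long long i_size, unsigned int hshift,
                          unsigned long idx)
{
        unsigned long long size = i_size >> hshift;   /* length in huge pages */
        return idx < size;
}

int main(void)
{
        unsigned long long i_size = 5ULL << 21;       /* five 2 MB pages */
        printf("%d %d\n", index_in_file(i_size, 21, 4),
                          index_in_file(i_size, 21, 5));   /* prints "1 0" */
        return 0;
}
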
@@ -955,14 +1922,14 @@ retry:
955 } 1922 }
956 1923
957 spin_lock(&inode->i_lock); 1924 spin_lock(&inode->i_lock);
958 inode->i_blocks += BLOCKS_PER_HUGEPAGE; 1925 inode->i_blocks += blocks_per_huge_page(h);
959 spin_unlock(&inode->i_lock); 1926 spin_unlock(&inode->i_lock);
960 } else 1927 } else
961 lock_page(page); 1928 lock_page(page);
962 } 1929 }
963 1930
964 spin_lock(&mm->page_table_lock); 1931 spin_lock(&mm->page_table_lock);
965 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 1932 size = i_size_read(mapping->host) >> huge_page_shift(h);
966 if (idx >= size) 1933 if (idx >= size)
967 goto backout; 1934 goto backout;
968 1935
@@ -976,7 +1943,7 @@ retry:
976 1943
977 if (write_access && !(vma->vm_flags & VM_SHARED)) { 1944 if (write_access && !(vma->vm_flags & VM_SHARED)) {
978 /* Optimization, do the COW without a second fault */ 1945 /* Optimization, do the COW without a second fault */
979 ret = hugetlb_cow(mm, vma, address, ptep, new_pte); 1946 ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
980 } 1947 }
981 1948
982 spin_unlock(&mm->page_table_lock); 1949 spin_unlock(&mm->page_table_lock);
@@ -998,8 +1965,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
998 pte_t entry; 1965 pte_t entry;
999 int ret; 1966 int ret;
1000 static DEFINE_MUTEX(hugetlb_instantiation_mutex); 1967 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
1968 struct hstate *h = hstate_vma(vma);
1001 1969
1002 ptep = huge_pte_alloc(mm, address); 1970 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
1003 if (!ptep) 1971 if (!ptep)
1004 return VM_FAULT_OOM; 1972 return VM_FAULT_OOM;
1005 1973
@@ -1021,14 +1989,30 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
1021 spin_lock(&mm->page_table_lock); 1989 spin_lock(&mm->page_table_lock);
1022 /* Check for a racing update before calling hugetlb_cow */ 1990 /* Check for a racing update before calling hugetlb_cow */
1023 if (likely(pte_same(entry, huge_ptep_get(ptep)))) 1991 if (likely(pte_same(entry, huge_ptep_get(ptep))))
1024 if (write_access && !pte_write(entry)) 1992 if (write_access && !pte_write(entry)) {
1025 ret = hugetlb_cow(mm, vma, address, ptep, entry); 1993 struct page *page;
1994 page = hugetlbfs_pagecache_page(h, vma, address);
1995 ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
1996 if (page) {
1997 unlock_page(page);
1998 put_page(page);
1999 }
2000 }
1026 spin_unlock(&mm->page_table_lock); 2001 spin_unlock(&mm->page_table_lock);
1027 mutex_unlock(&hugetlb_instantiation_mutex); 2002 mutex_unlock(&hugetlb_instantiation_mutex);
1028 2003
1029 return ret; 2004 return ret;
1030} 2005}
1031 2006
2007/* Can be overriden by architectures */
2008__attribute__((weak)) struct page *
2009follow_huge_pud(struct mm_struct *mm, unsigned long address,
2010 pud_t *pud, int write)
2011{
2012 BUG();
2013 return NULL;
2014}
2015
1032int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 2016int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1033 struct page **pages, struct vm_area_struct **vmas, 2017 struct page **pages, struct vm_area_struct **vmas,
1034 unsigned long *position, int *length, int i, 2018 unsigned long *position, int *length, int i,
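
The follow_huge_pud() stub added above is marked __attribute__((weak)) so an architecture that really has huge PUD entries can link in a strong definition of its own. A self-contained illustration of that link-time behaviour follows; arch_hook() is a made-up name, not a kernel symbol.

#include <stdio.h>

/* Generic fallback: any object linked in that defines a non-weak
 * arch_hook() with the same signature overrides this one. */
__attribute__((weak)) int arch_hook(void)
{
        return 0;
}

int main(void)
{
        printf("arch_hook() = %d\n", arch_hook());
        return 0;
}
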
@@ -1037,6 +2021,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1037 unsigned long pfn_offset; 2021 unsigned long pfn_offset;
1038 unsigned long vaddr = *position; 2022 unsigned long vaddr = *position;
1039 int remainder = *length; 2023 int remainder = *length;
2024 struct hstate *h = hstate_vma(vma);
1040 2025
1041 spin_lock(&mm->page_table_lock); 2026 spin_lock(&mm->page_table_lock);
1042 while (vaddr < vma->vm_end && remainder) { 2027 while (vaddr < vma->vm_end && remainder) {
@@ -1048,7 +2033,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1048 * each hugepage. We have to make * sure we get the 2033 * each hugepage. We have to make * sure we get the
1049 * first, for the page indexing below to work. 2034 * first, for the page indexing below to work.
1050 */ 2035 */
1051 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); 2036 pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
1052 2037
1053 if (!pte || huge_pte_none(huge_ptep_get(pte)) || 2038 if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
1054 (write && !pte_write(huge_ptep_get(pte)))) { 2039 (write && !pte_write(huge_ptep_get(pte)))) {
@@ -1066,7 +2051,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1066 break; 2051 break;
1067 } 2052 }
1068 2053
1069 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; 2054 pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
1070 page = pte_page(huge_ptep_get(pte)); 2055 page = pte_page(huge_ptep_get(pte));
1071same_page: 2056same_page:
1072 if (pages) { 2057 if (pages) {
@@ -1082,7 +2067,7 @@ same_page:
1082 --remainder; 2067 --remainder;
1083 ++i; 2068 ++i;
1084 if (vaddr < vma->vm_end && remainder && 2069 if (vaddr < vma->vm_end && remainder &&
1085 pfn_offset < HPAGE_SIZE/PAGE_SIZE) { 2070 pfn_offset < pages_per_huge_page(h)) {
1086 /* 2071 /*
1087 * We use pfn_offset to avoid touching the pageframes 2072 * We use pfn_offset to avoid touching the pageframes
1088 * of this compound page. 2073 * of this compound page.
@@ -1104,13 +2089,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1104 unsigned long start = address; 2089 unsigned long start = address;
1105 pte_t *ptep; 2090 pte_t *ptep;
1106 pte_t pte; 2091 pte_t pte;
2092 struct hstate *h = hstate_vma(vma);
1107 2093
1108 BUG_ON(address >= end); 2094 BUG_ON(address >= end);
1109 flush_cache_range(vma, address, end); 2095 flush_cache_range(vma, address, end);
1110 2096
1111 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); 2097 spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
1112 spin_lock(&mm->page_table_lock); 2098 spin_lock(&mm->page_table_lock);
1113 for (; address < end; address += HPAGE_SIZE) { 2099 for (; address < end; address += huge_page_size(h)) {
1114 ptep = huge_pte_offset(mm, address); 2100 ptep = huge_pte_offset(mm, address);
1115 if (!ptep) 2101 if (!ptep)
1116 continue; 2102 continue;
@@ -1128,195 +2114,59 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1128 flush_tlb_range(vma, start, end); 2114 flush_tlb_range(vma, start, end);
1129} 2115}
1130 2116
1131struct file_region { 2117int hugetlb_reserve_pages(struct inode *inode,
1132 struct list_head link; 2118 long from, long to,
1133 long from; 2119 struct vm_area_struct *vma)
1134 long to;
1135};
1136
1137static long region_add(struct list_head *head, long f, long t)
1138{
1139 struct file_region *rg, *nrg, *trg;
1140
1141 /* Locate the region we are either in or before. */
1142 list_for_each_entry(rg, head, link)
1143 if (f <= rg->to)
1144 break;
1145
1146 /* Round our left edge to the current segment if it encloses us. */
1147 if (f > rg->from)
1148 f = rg->from;
1149
1150 /* Check for and consume any regions we now overlap with. */
1151 nrg = rg;
1152 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
1153 if (&rg->link == head)
1154 break;
1155 if (rg->from > t)
1156 break;
1157
1158 /* If this area reaches higher then extend our area to
1159 * include it completely. If this is not the first area
1160 * which we intend to reuse, free it. */
1161 if (rg->to > t)
1162 t = rg->to;
1163 if (rg != nrg) {
1164 list_del(&rg->link);
1165 kfree(rg);
1166 }
1167 }
1168 nrg->from = f;
1169 nrg->to = t;
1170 return 0;
1171}
1172
1173static long region_chg(struct list_head *head, long f, long t)
1174{
1175 struct file_region *rg, *nrg;
1176 long chg = 0;
1177
1178 /* Locate the region we are before or in. */
1179 list_for_each_entry(rg, head, link)
1180 if (f <= rg->to)
1181 break;
1182
1183 /* If we are below the current region then a new region is required.
1184 * Subtle, allocate a new region at the position but make it zero
1185 * size such that we can guarantee to record the reservation. */
1186 if (&rg->link == head || t < rg->from) {
1187 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
1188 if (!nrg)
1189 return -ENOMEM;
1190 nrg->from = f;
1191 nrg->to = f;
1192 INIT_LIST_HEAD(&nrg->link);
1193 list_add(&nrg->link, rg->link.prev);
1194
1195 return t - f;
1196 }
1197
1198 /* Round our left edge to the current segment if it encloses us. */
1199 if (f > rg->from)
1200 f = rg->from;
1201 chg = t - f;
1202
1203 /* Check for and consume any regions we now overlap with. */
1204 list_for_each_entry(rg, rg->link.prev, link) {
1205 if (&rg->link == head)
1206 break;
1207 if (rg->from > t)
1208 return chg;
1209
1141 1210 /* We overlap with this area; if it extends further than
1211 * us then we must extend ourselves. Account for its
1212 * existing reservation. */
1213 if (rg->to > t) {
1214 chg += rg->to - t;
1215 t = rg->to;
1216 }
1217 chg -= rg->to - rg->from;
1218 }
1219 return chg;
1220}
1221
1222static long region_truncate(struct list_head *head, long end)
1223{ 2120{
1224 struct file_region *rg, *trg; 2121 long ret, chg;
1225 long chg = 0; 2122 struct hstate *h = hstate_inode(inode);
1226 2123
1227 /* Locate the region we are either in or before. */ 2124 if (vma && vma->vm_flags & VM_NORESERVE)
1228 list_for_each_entry(rg, head, link)
1229 if (end <= rg->to)
1230 break;
1231 if (&rg->link == head)
1232 return 0; 2125 return 0;
1233 2126
1234 /* If we are in the middle of a region then adjust it. */
1235 if (end > rg->from) {
1236 chg = rg->to - end;
1237 rg->to = end;
1238 rg = list_entry(rg->link.next, typeof(*rg), link);
1239 }
1240
1241 /* Drop any remaining regions. */
1242 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
1243 if (&rg->link == head)
1244 break;
1245 chg += rg->to - rg->from;
1246 list_del(&rg->link);
1247 kfree(rg);
1248 }
1249 return chg;
1250}
1251
1252static int hugetlb_acct_memory(long delta)
1253{
1254 int ret = -ENOMEM;
1255
1256 spin_lock(&hugetlb_lock);
1257 /* 2127 /*
1258 * When cpuset is configured, it breaks the strict hugetlb page 2128 * Shared mappings base their reservation on the number of pages that
1259 * reservation as the accounting is done on a global variable. Such 2129 * are already allocated on behalf of the file. Private mappings need
1260 * reservation is completely rubbish in the presence of cpuset because 2130 * to reserve the full area even if read-only as mprotect() may be
1261 * the reservation is not checked against page availability for the 2131 * called to make the mapping read-write. Assume !vma is a shm mapping
1262 * current cpuset. Application can still potentially OOM'ed by kernel
1263 * with lack of free htlb page in cpuset that the task is in.
1264 * Attempt to enforce strict accounting with cpuset is almost
1265 * impossible (or too ugly) because cpuset is too fluid that
1266 * task or memory node can be dynamically moved between cpusets.
1267 *
1268 * The change of semantics for shared hugetlb mapping with cpuset is
1269 * undesirable. However, in order to preserve some of the semantics,
1270 * we fall back to check against current free page availability as
1271 * a best attempt and hopefully to minimize the impact of changing
1272 * semantics that cpuset has.
1273 */ 2132 */
1274 if (delta > 0) { 2133 if (!vma || vma->vm_flags & VM_SHARED)
1275 if (gather_surplus_pages(delta) < 0) 2134 chg = region_chg(&inode->i_mapping->private_list, from, to);
1276 goto out; 2135 else {
1277 2136 struct resv_map *resv_map = resv_map_alloc();
1278 if (delta > cpuset_mems_nr(free_huge_pages_node)) { 2137 if (!resv_map)
1279 return_unused_surplus_pages(delta); 2138 return -ENOMEM;
1280 goto out;
1281 }
1282 }
1283
1284 ret = 0;
1285 if (delta < 0)
1286 return_unused_surplus_pages((unsigned long) -delta);
1287 2139
1288out: 2140 chg = to - from;
1289 spin_unlock(&hugetlb_lock);
1290 return ret;
1291}
1292 2141
1293int hugetlb_reserve_pages(struct inode *inode, long from, long to) 2142 set_vma_resv_map(vma, resv_map);
1294{ 2143 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
1295 long ret, chg; 2144 }
1296 2145
1297 chg = region_chg(&inode->i_mapping->private_list, from, to);
1298 if (chg < 0) 2146 if (chg < 0)
1299 return chg; 2147 return chg;
1300 2148
1301 if (hugetlb_get_quota(inode->i_mapping, chg)) 2149 if (hugetlb_get_quota(inode->i_mapping, chg))
1302 return -ENOSPC; 2150 return -ENOSPC;
1303 ret = hugetlb_acct_memory(chg); 2151 ret = hugetlb_acct_memory(h, chg);
1304 if (ret < 0) { 2152 if (ret < 0) {
1305 hugetlb_put_quota(inode->i_mapping, chg); 2153 hugetlb_put_quota(inode->i_mapping, chg);
1306 return ret; 2154 return ret;
1307 } 2155 }
1308 region_add(&inode->i_mapping->private_list, from, to); 2156 if (!vma || vma->vm_flags & VM_SHARED)
2157 region_add(&inode->i_mapping->private_list, from, to);
1309 return 0; 2158 return 0;
1310} 2159}
1311 2160
1312void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) 2161void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
1313{ 2162{
2163 struct hstate *h = hstate_inode(inode);
1314 long chg = region_truncate(&inode->i_mapping->private_list, offset); 2164 long chg = region_truncate(&inode->i_mapping->private_list, offset);
1315 2165
1316 spin_lock(&inode->i_lock); 2166 spin_lock(&inode->i_lock);
1317 inode->i_blocks -= BLOCKS_PER_HUGEPAGE * freed; 2167 inode->i_blocks -= blocks_per_huge_page(h);
1318 spin_unlock(&inode->i_lock); 2168 spin_unlock(&inode->i_lock);
1319 2169
1320 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2170 hugetlb_put_quota(inode->i_mapping, (chg - freed));
1321 hugetlb_acct_memory(-(chg - freed)); 2171 hugetlb_acct_memory(h, -(chg - freed));
1322} 2172}
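
To summarize the tail of the hugetlb.c diff: the rewritten hugetlb_reserve_pages() charges a shared mapping only for the pages region_chg() finds missing from the file's reservation map, while a private mapping reserves the whole range and is tagged HPAGE_RESV_OWNER. The sketch below is a standalone userspace model of the gap computation only; it does none of the list surgery the kernel's region code performs, and struct region is illustrative.

#include <stdio.h>

struct region { long from, to; };   /* sorted, non-overlapping intervals */

/* How many pages in [f, t) are not yet covered by the existing regions?
 * This is the quantity region_chg() reports for a shared mapping. */
static long region_gap(const struct region *rg, int nr, long f, long t)
{
        long chg = 0, pos = f;

        for (int i = 0; i < nr && pos < t; i++) {
                if (rg[i].to <= pos)
                        continue;                 /* region before our cursor */
                if (rg[i].from >= t)
                        break;                    /* region after our range   */
                if (rg[i].from > pos)
                        chg += rg[i].from - pos;  /* uncovered gap            */
                if (rg[i].to > pos)
                        pos = rg[i].to;
        }
        if (pos < t)
                chg += t - pos;                   /* tail past the last region */
        return chg;
}

int main(void)
{
        struct region existing[] = { { 0, 4 }, { 10, 12 } };

        /* Reserving [2, 11) needs 6 new pages: indexes 4..9. */
        printf("%ld\n", region_gap(existing, 2, 2, 11));
        return 0;
}
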
diff --git a/mm/internal.h b/mm/internal.h
index 0034e947e4bc..1f43f7416972 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,6 +13,11 @@
13 13
14#include <linux/mm.h> 14#include <linux/mm.h>
15 15
16void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
17 unsigned long floor, unsigned long ceiling);
18
19extern void prep_compound_page(struct page *page, unsigned long order);
20
16static inline void set_page_count(struct page *page, int v) 21static inline void set_page_count(struct page *page, int v)
17{ 22{
18 atomic_set(&page->_count, v); 23 atomic_set(&page->_count, v);
@@ -59,4 +64,60 @@ static inline unsigned long page_order(struct page *page)
59#define __paginginit __init 64#define __paginginit __init
60#endif 65#endif
61 66
67/* Memory initialisation debug and verification */
68enum mminit_level {
69 MMINIT_WARNING,
70 MMINIT_VERIFY,
71 MMINIT_TRACE
72};
73
74#ifdef CONFIG_DEBUG_MEMORY_INIT
75
76extern int mminit_loglevel;
77
78#define mminit_dprintk(level, prefix, fmt, arg...) \
79do { \
80 if (level < mminit_loglevel) { \
81 printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \
82 printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \
83 } \
84} while (0)
85
86extern void mminit_verify_pageflags_layout(void);
87extern void mminit_verify_page_links(struct page *page,
88 enum zone_type zone, unsigned long nid, unsigned long pfn);
89extern void mminit_verify_zonelist(void);
90
91#else
92
93static inline void mminit_dprintk(enum mminit_level level,
94 const char *prefix, const char *fmt, ...)
95{
96}
97
98static inline void mminit_verify_pageflags_layout(void)
99{
100}
101
102static inline void mminit_verify_page_links(struct page *page,
103 enum zone_type zone, unsigned long nid, unsigned long pfn)
104{
105}
106
107static inline void mminit_verify_zonelist(void)
108{
109}
110#endif /* CONFIG_DEBUG_MEMORY_INIT */
111
112/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
113#if defined(CONFIG_SPARSEMEM)
114extern void mminit_validate_memmodel_limits(unsigned long *start_pfn,
115 unsigned long *end_pfn);
116#else
117static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
118 unsigned long *end_pfn)
119{
120}
121#endif /* CONFIG_SPARSEMEM */
122
62#endif 123#endif
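
The mminit_dprintk() macro introduced above prints only when the message level is below the runtime mminit_loglevel, choosing KERN_WARNING or KERN_DEBUG and continuing the line with KERN_CONT. Below is a rough userspace model of the same gating idea, not the kernel macro itself; dbg_printk and dbg_loglevel are invented names and fprintf stands in for printk.

#include <stdio.h>

enum dbg_level { DBG_WARNING, DBG_VERIFY, DBG_TRACE };

static int dbg_loglevel = DBG_VERIFY + 1;     /* print WARNING and VERIFY */

/* gcc-style ##__VA_ARGS__ swallows the comma when no arguments follow. */
#define dbg_printk(level, prefix, fmt, ...) \
do { \
        if ((level) < dbg_loglevel) \
                fprintf(stderr, "%s mminit::%s " fmt, \
                        (level) <= DBG_WARNING ? "<warn>" : "<debug>", \
                        prefix, ##__VA_ARGS__); \
} while (0)

int main(void)
{
        dbg_printk(DBG_VERIFY, "zonelist", "checking node %d\n", 0);
        dbg_printk(DBG_TRACE, "zonelist", "suppressed at this loglevel\n");
        return 0;
}
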
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e46451e1d9b7..fba566c51322 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -35,9 +35,9 @@
35 35
36#include <asm/uaccess.h> 36#include <asm/uaccess.h>
37 37
38struct cgroup_subsys mem_cgroup_subsys; 38struct cgroup_subsys mem_cgroup_subsys __read_mostly;
39static const int MEM_CGROUP_RECLAIM_RETRIES = 5; 39static struct kmem_cache *page_cgroup_cache __read_mostly;
40static struct kmem_cache *page_cgroup_cache; 40#define MEM_CGROUP_RECLAIM_RETRIES 5
41 41
42/* 42/*
43 * Statistics for memory cgroup. 43 * Statistics for memory cgroup.
@@ -166,7 +166,6 @@ struct page_cgroup {
166 struct list_head lru; /* per cgroup LRU list */ 166 struct list_head lru; /* per cgroup LRU list */
167 struct page *page; 167 struct page *page;
168 struct mem_cgroup *mem_cgroup; 168 struct mem_cgroup *mem_cgroup;
169 int ref_cnt; /* cached, mapped, migrating */
170 int flags; 169 int flags;
171}; 170};
172#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ 171#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
@@ -185,6 +184,7 @@ static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
185enum charge_type { 184enum charge_type {
186 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 185 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
187 MEM_CGROUP_CHARGE_TYPE_MAPPED, 186 MEM_CGROUP_CHARGE_TYPE_MAPPED,
187 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
188}; 188};
189 189
190/* 190/*
@@ -296,7 +296,7 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
296 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; 296 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
297 297
298 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); 298 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false);
299 list_del_init(&pc->lru); 299 list_del(&pc->lru);
300} 300}
301 301
302static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, 302static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
@@ -354,6 +354,9 @@ void mem_cgroup_move_lists(struct page *page, bool active)
354 struct mem_cgroup_per_zone *mz; 354 struct mem_cgroup_per_zone *mz;
355 unsigned long flags; 355 unsigned long flags;
356 356
357 if (mem_cgroup_subsys.disabled)
358 return;
359
357 /* 360 /*
358 * We cannot lock_page_cgroup while holding zone's lru_lock, 361 * We cannot lock_page_cgroup while holding zone's lru_lock,
359 * because other holders of lock_page_cgroup can be interrupted 362 * because other holders of lock_page_cgroup can be interrupted
@@ -524,7 +527,8 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
524 * < 0 if the cgroup is over its limit 527 * < 0 if the cgroup is over its limit
525 */ 528 */
526static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 529static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
527 gfp_t gfp_mask, enum charge_type ctype) 530 gfp_t gfp_mask, enum charge_type ctype,
531 struct mem_cgroup *memcg)
528{ 532{
529 struct mem_cgroup *mem; 533 struct mem_cgroup *mem;
530 struct page_cgroup *pc; 534 struct page_cgroup *pc;
@@ -532,35 +536,8 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
532 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 536 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
533 struct mem_cgroup_per_zone *mz; 537 struct mem_cgroup_per_zone *mz;
534 538
535 if (mem_cgroup_subsys.disabled) 539 pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask);
536 return 0; 540 if (unlikely(pc == NULL))
537
538 /*
539 * Should page_cgroup's go to their own slab?
540 * One could optimize the performance of the charging routine
541 * by saving a bit in the page_flags and using it as a lock
542 * to see if the cgroup page already has a page_cgroup associated
543 * with it
544 */
545retry:
546 lock_page_cgroup(page);
547 pc = page_get_page_cgroup(page);
548 /*
549 * The page_cgroup exists and
550 * the page has already been accounted.
551 */
552 if (pc) {
553 VM_BUG_ON(pc->page != page);
554 VM_BUG_ON(pc->ref_cnt <= 0);
555
556 pc->ref_cnt++;
557 unlock_page_cgroup(page);
558 goto done;
559 }
560 unlock_page_cgroup(page);
561
562 pc = kmem_cache_zalloc(page_cgroup_cache, gfp_mask);
563 if (pc == NULL)
564 goto err; 541 goto err;
565 542
566 /* 543 /*
@@ -569,16 +546,18 @@ retry:
569 * thread group leader migrates. It's possible that mm is not 546 * thread group leader migrates. It's possible that mm is not
570 * set, if so charge the init_mm (happens for pagecache usage). 547 * set, if so charge the init_mm (happens for pagecache usage).
571 */ 548 */
572 if (!mm) 549 if (likely(!memcg)) {
573 mm = &init_mm; 550 rcu_read_lock();
574 551 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
575 rcu_read_lock(); 552 /*
576 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 553 * For every charge from the cgroup, increment reference count
577 /* 554 */
578 * For every charge from the cgroup, increment reference count 555 css_get(&mem->css);
579 */ 556 rcu_read_unlock();
580 css_get(&mem->css); 557 } else {
581 rcu_read_unlock(); 558 mem = memcg;
559 css_get(&memcg->css);
560 }
582 561
583 while (res_counter_charge(&mem->res, PAGE_SIZE)) { 562 while (res_counter_charge(&mem->res, PAGE_SIZE)) {
584 if (!(gfp_mask & __GFP_WAIT)) 563 if (!(gfp_mask & __GFP_WAIT))
@@ -603,25 +582,24 @@ retry:
603 } 582 }
604 } 583 }
605 584
606 pc->ref_cnt = 1;
607 pc->mem_cgroup = mem; 585 pc->mem_cgroup = mem;
608 pc->page = page; 586 pc->page = page;
609 pc->flags = PAGE_CGROUP_FLAG_ACTIVE; 587 /*
 588 * If a page is accounted as page cache, insert it into the inactive list.
 589 * If anon, insert it into the active list.
590 */
610 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) 591 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
611 pc->flags = PAGE_CGROUP_FLAG_CACHE; 592 pc->flags = PAGE_CGROUP_FLAG_CACHE;
593 else
594 pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
612 595
613 lock_page_cgroup(page); 596 lock_page_cgroup(page);
614 if (page_get_page_cgroup(page)) { 597 if (unlikely(page_get_page_cgroup(page))) {
615 unlock_page_cgroup(page); 598 unlock_page_cgroup(page);
616 /*
617 * Another charge has been added to this page already.
618 * We take lock_page_cgroup(page) again and read
619 * page->cgroup, increment refcnt.... just retry is OK.
620 */
621 res_counter_uncharge(&mem->res, PAGE_SIZE); 599 res_counter_uncharge(&mem->res, PAGE_SIZE);
622 css_put(&mem->css); 600 css_put(&mem->css);
623 kmem_cache_free(page_cgroup_cache, pc); 601 kmem_cache_free(page_cgroup_cache, pc);
624 goto retry; 602 goto done;
625 } 603 }
626 page_assign_page_cgroup(page, pc); 604 page_assign_page_cgroup(page, pc);
627 605
@@ -642,24 +620,65 @@ err:
642 620
643int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) 621int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
644{ 622{
623 if (mem_cgroup_subsys.disabled)
624 return 0;
625
626 /*
627 * If already mapped, we don't have to account.
628 * If page cache, page->mapping has address_space.
 629 * But page->mapping may hold a stale anon_vma pointer;
 630 * detect it with the PageAnon() check. A newly-mapped anon page's
 631 * page->mapping is NULL.
632 */
633 if (page_mapped(page) || (page->mapping && !PageAnon(page)))
634 return 0;
635 if (unlikely(!mm))
636 mm = &init_mm;
645 return mem_cgroup_charge_common(page, mm, gfp_mask, 637 return mem_cgroup_charge_common(page, mm, gfp_mask,
646 MEM_CGROUP_CHARGE_TYPE_MAPPED); 638 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
647} 639}
648 640
649int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 641int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
650 gfp_t gfp_mask) 642 gfp_t gfp_mask)
651{ 643{
652 if (!mm) 644 if (mem_cgroup_subsys.disabled)
645 return 0;
646
647 /*
 648 * Corner case handling. This is usually called from add_to_page_cache(),
 649 * but some filesystems (shmem) precharge the page before calling it
 650 * and then call add_to_page_cache() with GFP_NOWAIT.
 651 *
 652 * In the GFP_NOWAIT case, the page may already be pre-charged before
 653 * add_to_page_cache() runs (see shmem.c). Check for that here to avoid
 654 * charging twice. (It works, but pays a slightly larger cost.)
655 */
656 if (!(gfp_mask & __GFP_WAIT)) {
657 struct page_cgroup *pc;
658
659 lock_page_cgroup(page);
660 pc = page_get_page_cgroup(page);
661 if (pc) {
662 VM_BUG_ON(pc->page != page);
663 VM_BUG_ON(!pc->mem_cgroup);
664 unlock_page_cgroup(page);
665 return 0;
666 }
667 unlock_page_cgroup(page);
668 }
669
670 if (unlikely(!mm))
653 mm = &init_mm; 671 mm = &init_mm;
672
654 return mem_cgroup_charge_common(page, mm, gfp_mask, 673 return mem_cgroup_charge_common(page, mm, gfp_mask,
655 MEM_CGROUP_CHARGE_TYPE_CACHE); 674 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
656} 675}
657 676
658/* 677/*
659 * Uncharging is always a welcome operation, we never complain, simply 678 * uncharge if !page_mapped(page)
660 * uncharge.
661 */ 679 */
662void mem_cgroup_uncharge_page(struct page *page) 680static void
681__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
663{ 682{
664 struct page_cgroup *pc; 683 struct page_cgroup *pc;
665 struct mem_cgroup *mem; 684 struct mem_cgroup *mem;
@@ -674,98 +693,151 @@ void mem_cgroup_uncharge_page(struct page *page)
674 */ 693 */
675 lock_page_cgroup(page); 694 lock_page_cgroup(page);
676 pc = page_get_page_cgroup(page); 695 pc = page_get_page_cgroup(page);
677 if (!pc) 696 if (unlikely(!pc))
678 goto unlock; 697 goto unlock;
679 698
680 VM_BUG_ON(pc->page != page); 699 VM_BUG_ON(pc->page != page);
681 VM_BUG_ON(pc->ref_cnt <= 0);
682 700
683 if (--(pc->ref_cnt) == 0) { 701 if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
684 mz = page_cgroup_zoneinfo(pc); 702 && ((pc->flags & PAGE_CGROUP_FLAG_CACHE)
685 spin_lock_irqsave(&mz->lru_lock, flags); 703 || page_mapped(page)))
686 __mem_cgroup_remove_list(mz, pc); 704 goto unlock;
687 spin_unlock_irqrestore(&mz->lru_lock, flags);
688 705
689 page_assign_page_cgroup(page, NULL); 706 mz = page_cgroup_zoneinfo(pc);
690 unlock_page_cgroup(page); 707 spin_lock_irqsave(&mz->lru_lock, flags);
708 __mem_cgroup_remove_list(mz, pc);
709 spin_unlock_irqrestore(&mz->lru_lock, flags);
691 710
692 mem = pc->mem_cgroup; 711 page_assign_page_cgroup(page, NULL);
693 res_counter_uncharge(&mem->res, PAGE_SIZE); 712 unlock_page_cgroup(page);
694 css_put(&mem->css);
695 713
696 kmem_cache_free(page_cgroup_cache, pc); 714 mem = pc->mem_cgroup;
697 return; 715 res_counter_uncharge(&mem->res, PAGE_SIZE);
698 } 716 css_put(&mem->css);
699 717
718 kmem_cache_free(page_cgroup_cache, pc);
719 return;
700unlock: 720unlock:
701 unlock_page_cgroup(page); 721 unlock_page_cgroup(page);
702} 722}
703 723
724void mem_cgroup_uncharge_page(struct page *page)
725{
726 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
727}
728
729void mem_cgroup_uncharge_cache_page(struct page *page)
730{
731 VM_BUG_ON(page_mapped(page));
732 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
733}
734
704/* 735/*
705 * Returns non-zero if a page (under migration) has valid page_cgroup member. 736 * Before starting migration, account against new page.
706 * Refcnt of page_cgroup is incremented.
707 */ 737 */
708int mem_cgroup_prepare_migration(struct page *page) 738int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
709{ 739{
710 struct page_cgroup *pc; 740 struct page_cgroup *pc;
741 struct mem_cgroup *mem = NULL;
742 enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
743 int ret = 0;
711 744
712 if (mem_cgroup_subsys.disabled) 745 if (mem_cgroup_subsys.disabled)
713 return 0; 746 return 0;
714 747
715 lock_page_cgroup(page); 748 lock_page_cgroup(page);
716 pc = page_get_page_cgroup(page); 749 pc = page_get_page_cgroup(page);
717 if (pc) 750 if (pc) {
718 pc->ref_cnt++; 751 mem = pc->mem_cgroup;
752 css_get(&mem->css);
753 if (pc->flags & PAGE_CGROUP_FLAG_CACHE)
754 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
755 }
719 unlock_page_cgroup(page); 756 unlock_page_cgroup(page);
720 return pc != NULL; 757 if (mem) {
758 ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
759 ctype, mem);
760 css_put(&mem->css);
761 }
762 return ret;
721} 763}
722 764
723void mem_cgroup_end_migration(struct page *page) 765/* remove redundant charge if migration failed*/
766void mem_cgroup_end_migration(struct page *newpage)
724{ 767{
725 mem_cgroup_uncharge_page(page); 768 /*
 769 * On success, page->mapping is not NULL.
 770 * Special rollback care is necessary when
 771 * 1. migration fails (newpage->mapping is cleared in this case), or
 772 * 2. the newpage was moved but not remapped again because the task
 773 * exits and the newpage is obsolete. In this case, the new page
 774 * may be a swapcache. So we always call mem_cgroup_uncharge_page()
 775 * to avoid a mess. The page_cgroup will be removed if it is
 776 * unnecessary. File cache pages are still on the radix tree; don't
 777 * worry about them.
778 */
779 if (!newpage->mapping)
780 __mem_cgroup_uncharge_common(newpage,
781 MEM_CGROUP_CHARGE_TYPE_FORCE);
782 else if (PageAnon(newpage))
783 mem_cgroup_uncharge_page(newpage);
726} 784}
727 785
728/* 786/*
729 * We know both *page* and *newpage* are now not-on-LRU and PG_locked. 787 * A call to try to shrink memory usage under specified resource controller.
730 * And no race with uncharge() routines because page_cgroup for *page* 788 * This is typically used for page reclaiming for shmem for reducing side
731 * has extra one reference by mem_cgroup_prepare_migration. 789 * effect of page allocation from shmem, which is used by some mem_cgroup.
732 */ 790 */
733void mem_cgroup_page_migration(struct page *page, struct page *newpage) 791int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
734{ 792{
735 struct page_cgroup *pc; 793 struct mem_cgroup *mem;
736 struct mem_cgroup_per_zone *mz; 794 int progress = 0;
737 unsigned long flags; 795 int retry = MEM_CGROUP_RECLAIM_RETRIES;
738 796
739 lock_page_cgroup(page); 797 if (mem_cgroup_subsys.disabled)
740 pc = page_get_page_cgroup(page); 798 return 0;
741 if (!pc) {
742 unlock_page_cgroup(page);
743 return;
744 }
745 799
746 mz = page_cgroup_zoneinfo(pc); 800 rcu_read_lock();
747 spin_lock_irqsave(&mz->lru_lock, flags); 801 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
748 __mem_cgroup_remove_list(mz, pc); 802 css_get(&mem->css);
749 spin_unlock_irqrestore(&mz->lru_lock, flags); 803 rcu_read_unlock();
750 804
751 page_assign_page_cgroup(page, NULL); 805 do {
752 unlock_page_cgroup(page); 806 progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
807 } while (!progress && --retry);
753 808
754 pc->page = newpage; 809 css_put(&mem->css);
755 lock_page_cgroup(newpage); 810 if (!retry)
756 page_assign_page_cgroup(newpage, pc); 811 return -ENOMEM;
812 return 0;
813}
757 814
758 mz = page_cgroup_zoneinfo(pc); 815int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
759 spin_lock_irqsave(&mz->lru_lock, flags); 816{
760 __mem_cgroup_add_list(mz, pc); 817
761 spin_unlock_irqrestore(&mz->lru_lock, flags); 818 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
819 int progress;
820 int ret = 0;
762 821
763 unlock_page_cgroup(newpage); 822 while (res_counter_set_limit(&memcg->res, val)) {
823 if (signal_pending(current)) {
824 ret = -EINTR;
825 break;
826 }
827 if (!retry_count) {
828 ret = -EBUSY;
829 break;
830 }
831 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL);
832 if (!progress)
833 retry_count--;
834 }
835 return ret;
764} 836}
765 837
838
766/* 839/*
767 * This routine traverse page_cgroup in given list and drop them all. 840 * This routine traverse page_cgroup in given list and drop them all.
768 * This routine ignores page_cgroup->ref_cnt.
769 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 841 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
770 */ 842 */
771#define FORCE_UNCHARGE_BATCH (128) 843#define FORCE_UNCHARGE_BATCH (128)
@@ -790,12 +862,20 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
790 page = pc->page; 862 page = pc->page;
791 get_page(page); 863 get_page(page);
792 spin_unlock_irqrestore(&mz->lru_lock, flags); 864 spin_unlock_irqrestore(&mz->lru_lock, flags);
793 mem_cgroup_uncharge_page(page); 865 /*
794 put_page(page); 866 * Check if this page is on LRU. !LRU page can be found
795 if (--count <= 0) { 867 * if it's under page migration.
796 count = FORCE_UNCHARGE_BATCH; 868 */
869 if (PageLRU(page)) {
870 __mem_cgroup_uncharge_common(page,
871 MEM_CGROUP_CHARGE_TYPE_FORCE);
872 put_page(page);
873 if (--count <= 0) {
874 count = FORCE_UNCHARGE_BATCH;
875 cond_resched();
876 }
877 } else
797 cond_resched(); 878 cond_resched();
798 }
799 spin_lock_irqsave(&mz->lru_lock, flags); 879 spin_lock_irqsave(&mz->lru_lock, flags);
800 } 880 }
801 spin_unlock_irqrestore(&mz->lru_lock, flags); 881 spin_unlock_irqrestore(&mz->lru_lock, flags);
@@ -810,9 +890,6 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
810 int ret = -EBUSY; 890 int ret = -EBUSY;
811 int node, zid; 891 int node, zid;
812 892
813 if (mem_cgroup_subsys.disabled)
814 return 0;
815
816 css_get(&mem->css); 893 css_get(&mem->css);
817 /* 894 /*
818 * page reclaim code (kswapd etc..) will move pages between 895 * page reclaim code (kswapd etc..) will move pages between
@@ -838,32 +915,34 @@ out:
838 return ret; 915 return ret;
839} 916}
840 917
841static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
842{
843 *tmp = memparse(buf, &buf);
844 if (*buf != '\0')
845 return -EINVAL;
846
847 /*
848 * Round up the value to the closest page size
849 */
850 *tmp = ((*tmp + PAGE_SIZE - 1) >> PAGE_SHIFT) << PAGE_SHIFT;
851 return 0;
852}
853
854static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 918static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
855{ 919{
856 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, 920 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
857 cft->private); 921 cft->private);
858} 922}
859 923/*
860static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, 924 * The user of this function is...
861 struct file *file, const char __user *userbuf, 925 * RES_LIMIT.
862 size_t nbytes, loff_t *ppos) 926 */
927static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
928 const char *buffer)
863{ 929{
864 return res_counter_write(&mem_cgroup_from_cont(cont)->res, 930 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
865 cft->private, userbuf, nbytes, ppos, 931 unsigned long long val;
866 mem_cgroup_write_strategy); 932 int ret;
933
934 switch (cft->private) {
935 case RES_LIMIT:
936 /* This function does all necessary parse...reuse it */
937 ret = res_counter_memparse_write_strategy(buffer, &val);
938 if (!ret)
939 ret = mem_cgroup_resize_limit(memcg, val);
940 break;
941 default:
942 ret = -EINVAL; /* should be BUG() ? */
943 break;
944 }
945 return ret;
867} 946}
868 947
869static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 948static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
@@ -940,7 +1019,7 @@ static struct cftype mem_cgroup_files[] = {
940 { 1019 {
941 .name = "limit_in_bytes", 1020 .name = "limit_in_bytes",
942 .private = RES_LIMIT, 1021 .private = RES_LIMIT,
943 .write = mem_cgroup_write, 1022 .write_string = mem_cgroup_write,
944 .read_u64 = mem_cgroup_read, 1023 .read_u64 = mem_cgroup_read,
945 }, 1024 },
946 { 1025 {
@@ -1070,8 +1149,6 @@ static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1070static int mem_cgroup_populate(struct cgroup_subsys *ss, 1149static int mem_cgroup_populate(struct cgroup_subsys *ss,
1071 struct cgroup *cont) 1150 struct cgroup *cont)
1072{ 1151{
1073 if (mem_cgroup_subsys.disabled)
1074 return 0;
1075 return cgroup_add_files(cont, ss, mem_cgroup_files, 1152 return cgroup_add_files(cont, ss, mem_cgroup_files,
1076 ARRAY_SIZE(mem_cgroup_files)); 1153 ARRAY_SIZE(mem_cgroup_files));
1077} 1154}
@@ -1084,9 +1161,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1084 struct mm_struct *mm; 1161 struct mm_struct *mm;
1085 struct mem_cgroup *mem, *old_mem; 1162 struct mem_cgroup *mem, *old_mem;
1086 1163
1087 if (mem_cgroup_subsys.disabled)
1088 return;
1089
1090 mm = get_task_mm(p); 1164 mm = get_task_mm(p);
1091 if (mm == NULL) 1165 if (mm == NULL)
1092 return; 1166 return;
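
One pattern worth pulling out of the memcontrol.c changes: mem_cgroup_resize_limit() keeps trying to install the new limit, reclaiming pages between attempts, and only counts rounds that made no progress against its retry budget. The fragment below is a standalone sketch of that loop shape; try_set_limit and reclaim_some are function-pointer stand-ins for res_counter_set_limit() and try_to_free_mem_cgroup_pages(), and the signal_pending() bail-out is omitted.

#include <errno.h>
#include <stdbool.h>

#define RECLAIM_RETRIES 5    /* mirrors MEM_CGROUP_RECLAIM_RETRIES */

static int resize_limit(bool (*try_set_limit)(unsigned long long),
                        bool (*reclaim_some)(void),
                        unsigned long long val)
{
        int retries = RECLAIM_RETRIES;

        while (!try_set_limit(val)) {
                if (!retries)
                        return -EBUSY;    /* usage never dropped below val */
                if (!reclaim_some())
                        retries--;        /* only fruitless rounds count */
        }
        return 0;
}

static bool always_fail(unsigned long long v) { (void)v; return false; }
static bool no_progress(void) { return false; }

int main(void)
{
        /* An unreachable limit plus reclaim that never progresses gives up
         * after RECLAIM_RETRIES fruitless rounds. */
        return resize_limit(always_fail, no_progress, 1ULL << 20) == -EBUSY ? 0 : 1;
}
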
diff --git a/mm/memory.c b/mm/memory.c
index 2302d228fe04..262e3eb6601a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -61,6 +61,8 @@
61#include <linux/swapops.h> 61#include <linux/swapops.h>
62#include <linux/elf.h> 62#include <linux/elf.h>
63 63
64#include "internal.h"
65
64#ifndef CONFIG_NEED_MULTIPLE_NODES 66#ifndef CONFIG_NEED_MULTIPLE_NODES
65/* use the per-pgdat data instead for discontigmem - mbligh */ 67/* use the per-pgdat data instead for discontigmem - mbligh */
66unsigned long max_mapnr; 68unsigned long max_mapnr;
@@ -211,7 +213,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
211 * 213 *
212 * Must be called with pagetable lock held. 214 * Must be called with pagetable lock held.
213 */ 215 */
214void free_pgd_range(struct mmu_gather **tlb, 216void free_pgd_range(struct mmu_gather *tlb,
215 unsigned long addr, unsigned long end, 217 unsigned long addr, unsigned long end,
216 unsigned long floor, unsigned long ceiling) 218 unsigned long floor, unsigned long ceiling)
217{ 219{
@@ -262,16 +264,16 @@ void free_pgd_range(struct mmu_gather **tlb,
262 return; 264 return;
263 265
264 start = addr; 266 start = addr;
265 pgd = pgd_offset((*tlb)->mm, addr); 267 pgd = pgd_offset(tlb->mm, addr);
266 do { 268 do {
267 next = pgd_addr_end(addr, end); 269 next = pgd_addr_end(addr, end);
268 if (pgd_none_or_clear_bad(pgd)) 270 if (pgd_none_or_clear_bad(pgd))
269 continue; 271 continue;
270 free_pud_range(*tlb, pgd, addr, next, floor, ceiling); 272 free_pud_range(tlb, pgd, addr, next, floor, ceiling);
271 } while (pgd++, addr = next, addr != end); 273 } while (pgd++, addr = next, addr != end);
272} 274}
273 275
274void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, 276void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
275 unsigned long floor, unsigned long ceiling) 277 unsigned long floor, unsigned long ceiling)
276{ 278{
277 while (vma) { 279 while (vma) {
@@ -899,9 +901,23 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
899 } 901 }
900 902
901 if (unlikely(is_vm_hugetlb_page(vma))) { 903 if (unlikely(is_vm_hugetlb_page(vma))) {
902 unmap_hugepage_range(vma, start, end); 904 /*
903 zap_work -= (end - start) / 905 * It is undesirable to test vma->vm_file as it
904 (HPAGE_SIZE / PAGE_SIZE); 906 * should be non-null for valid hugetlb area.
907 * However, vm_file will be NULL in the error
908 * cleanup path of do_mmap_pgoff. When
909 * hugetlbfs ->mmap method fails,
910 * do_mmap_pgoff() nullifies vma->vm_file
911 * before calling this function to clean up.
912 * Since no pte has actually been setup, it is
913 * safe to do nothing in this case.
914 */
915 if (vma->vm_file) {
916 unmap_hugepage_range(vma, start, end, NULL);
917 zap_work -= (end - start) /
918 pages_per_huge_page(hstate_vma(vma));
919 }
920
905 start = end; 921 start = end;
906 } else 922 } else
907 start = unmap_page_range(*tlbp, vma, 923 start = unmap_page_range(*tlbp, vma,
@@ -982,19 +998,24 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
982 goto no_page_table; 998 goto no_page_table;
983 999
984 pud = pud_offset(pgd, address); 1000 pud = pud_offset(pgd, address);
985 if (pud_none(*pud) || unlikely(pud_bad(*pud))) 1001 if (pud_none(*pud))
986 goto no_page_table; 1002 goto no_page_table;
987 1003 if (pud_huge(*pud)) {
1004 BUG_ON(flags & FOLL_GET);
1005 page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
1006 goto out;
1007 }
1008 if (unlikely(pud_bad(*pud)))
1009 goto no_page_table;
1010
988 pmd = pmd_offset(pud, address); 1011 pmd = pmd_offset(pud, address);
989 if (pmd_none(*pmd)) 1012 if (pmd_none(*pmd))
990 goto no_page_table; 1013 goto no_page_table;
991
992 if (pmd_huge(*pmd)) { 1014 if (pmd_huge(*pmd)) {
993 BUG_ON(flags & FOLL_GET); 1015 BUG_ON(flags & FOLL_GET);
994 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); 1016 page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
995 goto out; 1017 goto out;
996 } 1018 }
997
998 if (unlikely(pmd_bad(*pmd))) 1019 if (unlikely(pmd_bad(*pmd)))
999 goto no_page_table; 1020 goto no_page_table;
1000 1021
@@ -1058,11 +1079,9 @@ static inline int use_zero_page(struct vm_area_struct *vma)
1058 if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) 1079 if (vma->vm_flags & (VM_LOCKED | VM_SHARED))
1059 return 0; 1080 return 0;
1060 /* 1081 /*
1061 * And if we have a fault or a nopfn routine, it's not an 1082 * And if we have a fault routine, it's not an anonymous region.
1062 * anonymous region.
1063 */ 1083 */
1064 return !vma->vm_ops || 1084 return !vma->vm_ops || !vma->vm_ops->fault;
1065 (!vma->vm_ops->fault && !vma->vm_ops->nopfn);
1066} 1085}
1067 1086
1068int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1087int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -1338,6 +1357,11 @@ out:
1338 * 1357 *
1339 * This function should only be called from a vm_ops->fault handler, and 1358 * This function should only be called from a vm_ops->fault handler, and
1340 * in that case the handler should return NULL. 1359 * in that case the handler should return NULL.
1360 *
1361 * vma cannot be a COW mapping.
1362 *
1363 * As this is called only for pages that do not currently exist, we
1364 * do not need to flush old virtual caches or the TLB.
1341 */ 1365 */
1342int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, 1366int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1343 unsigned long pfn) 1367 unsigned long pfn)
@@ -1548,6 +1572,8 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
1548 unsigned long next; 1572 unsigned long next;
1549 int err; 1573 int err;
1550 1574
1575 BUG_ON(pud_huge(*pud));
1576
1551 pmd = pmd_alloc(mm, pud, addr); 1577 pmd = pmd_alloc(mm, pud, addr);
1552 if (!pmd) 1578 if (!pmd)
1553 return -ENOMEM; 1579 return -ENOMEM;
@@ -2501,59 +2527,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2501 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte); 2527 return __do_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
2502} 2528}
2503 2529
2504
2505/*
2506 * do_no_pfn() tries to create a new page mapping for a page without
2507 * a struct_page backing it
2508 *
2509 * As this is called only for pages that do not currently exist, we
2510 * do not need to flush old virtual caches or the TLB.
2511 *
2512 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2513 * but allow concurrent faults), and pte mapped but not yet locked.
2514 * We return with mmap_sem still held, but pte unmapped and unlocked.
2515 *
2516 * It is expected that the ->nopfn handler always returns the same pfn
2517 * for a given virtual mapping.
2518 *
2519 * Mark this `noinline' to prevent it from bloating the main pagefault code.
2520 */
2521static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
2522 unsigned long address, pte_t *page_table, pmd_t *pmd,
2523 int write_access)
2524{
2525 spinlock_t *ptl;
2526 pte_t entry;
2527 unsigned long pfn;
2528
2529 pte_unmap(page_table);
2530 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2531 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2532
2533 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
2534
2535 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2536
2537 if (unlikely(pfn == NOPFN_OOM))
2538 return VM_FAULT_OOM;
2539 else if (unlikely(pfn == NOPFN_SIGBUS))
2540 return VM_FAULT_SIGBUS;
2541 else if (unlikely(pfn == NOPFN_REFAULT))
2542 return 0;
2543
2544 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2545
2546 /* Only go through if we didn't race with anybody else... */
2547 if (pte_none(*page_table)) {
2548 entry = pfn_pte(pfn, vma->vm_page_prot);
2549 if (write_access)
2550 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2551 set_pte_at(mm, address, page_table, entry);
2552 }
2553 pte_unmap_unlock(page_table, ptl);
2554 return 0;
2555}
2556
2557/* 2530/*
2558 * Fault of a previously existing named mapping. Repopulate the pte 2531 * Fault of a previously existing named mapping. Repopulate the pte
2559 * from the encoded file_pte if possible. This enables swappable 2532 * from the encoded file_pte if possible. This enables swappable
@@ -2614,9 +2587,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2614 if (likely(vma->vm_ops->fault)) 2587 if (likely(vma->vm_ops->fault))
2615 return do_linear_fault(mm, vma, address, 2588 return do_linear_fault(mm, vma, address,
2616 pte, pmd, write_access, entry); 2589 pte, pmd, write_access, entry);
2617 if (unlikely(vma->vm_ops->nopfn))
2618 return do_no_pfn(mm, vma, address, pte,
2619 pmd, write_access);
2620 } 2590 }
2621 return do_anonymous_page(mm, vma, address, 2591 return do_anonymous_page(mm, vma, address,
2622 pte, pmd, write_access); 2592 pte, pmd, write_access);
@@ -2804,6 +2774,86 @@ int in_gate_area_no_task(unsigned long addr)
2804 2774
2805#endif /* __HAVE_ARCH_GATE_AREA */ 2775#endif /* __HAVE_ARCH_GATE_AREA */
2806 2776
2777#ifdef CONFIG_HAVE_IOREMAP_PROT
2778static resource_size_t follow_phys(struct vm_area_struct *vma,
2779 unsigned long address, unsigned int flags,
2780 unsigned long *prot)
2781{
2782 pgd_t *pgd;
2783 pud_t *pud;
2784 pmd_t *pmd;
2785 pte_t *ptep, pte;
2786 spinlock_t *ptl;
2787 resource_size_t phys_addr = 0;
2788 struct mm_struct *mm = vma->vm_mm;
2789
2790 VM_BUG_ON(!(vma->vm_flags & (VM_IO | VM_PFNMAP)));
2791
2792 pgd = pgd_offset(mm, address);
2793 if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
2794 goto no_page_table;
2795
2796 pud = pud_offset(pgd, address);
2797 if (pud_none(*pud) || unlikely(pud_bad(*pud)))
2798 goto no_page_table;
2799
2800 pmd = pmd_offset(pud, address);
2801 if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
2802 goto no_page_table;
2803
2804 /* We cannot handle huge page PFN maps. Luckily they don't exist. */
2805 if (pmd_huge(*pmd))
2806 goto no_page_table;
2807
2808 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
2809 if (!ptep)
2810 goto out;
2811
2812 pte = *ptep;
2813 if (!pte_present(pte))
2814 goto unlock;
2815 if ((flags & FOLL_WRITE) && !pte_write(pte))
2816 goto unlock;
2817 phys_addr = pte_pfn(pte);
2818 phys_addr <<= PAGE_SHIFT; /* Shift here to avoid overflow on PAE */
2819
2820 *prot = pgprot_val(pte_pgprot(pte));
2821
2822unlock:
2823 pte_unmap_unlock(ptep, ptl);
2824out:
2825 return phys_addr;
2826no_page_table:
2827 return 0;
2828}
2829
2830int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
2831 void *buf, int len, int write)
2832{
2833 resource_size_t phys_addr;
2834 unsigned long prot = 0;
2835 void *maddr;
2836 int offset = addr & (PAGE_SIZE-1);
2837
2838 if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
2839 return -EINVAL;
2840
2841 phys_addr = follow_phys(vma, addr, write, &prot);
2842
2843 if (!phys_addr)
2844 return -EINVAL;
2845
2846 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
2847 if (write)
2848 memcpy_toio(maddr + offset, buf, len);
2849 else
2850 memcpy_fromio(buf, maddr + offset, len);
2851 iounmap(maddr);
2852
2853 return len;
2854}
2855#endif
2856
2807/* 2857/*
2808 * Access another process' address space. 2858 * Access another process' address space.
2809 * Source/target buffer must be kernel space, 2859 * Source/target buffer must be kernel space,
@@ -2813,7 +2863,6 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2813{ 2863{
2814 struct mm_struct *mm; 2864 struct mm_struct *mm;
2815 struct vm_area_struct *vma; 2865 struct vm_area_struct *vma;
2816 struct page *page;
2817 void *old_buf = buf; 2866 void *old_buf = buf;
2818 2867
2819 mm = get_task_mm(tsk); 2868 mm = get_task_mm(tsk);
@@ -2825,28 +2874,44 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
2825 while (len) { 2874 while (len) {
2826 int bytes, ret, offset; 2875 int bytes, ret, offset;
2827 void *maddr; 2876 void *maddr;
2877 struct page *page = NULL;
2828 2878
2829 ret = get_user_pages(tsk, mm, addr, 1, 2879 ret = get_user_pages(tsk, mm, addr, 1,
2830 write, 1, &page, &vma); 2880 write, 1, &page, &vma);
2831 if (ret <= 0) 2881 if (ret <= 0) {
2832 break; 2882 /*
2833 2883 * Check if this is a VM_IO | VM_PFNMAP VMA, which
2834 bytes = len; 2884 * we can access using slightly different code.
2835 offset = addr & (PAGE_SIZE-1); 2885 */
2836 if (bytes > PAGE_SIZE-offset) 2886#ifdef CONFIG_HAVE_IOREMAP_PROT
2837 bytes = PAGE_SIZE-offset; 2887 vma = find_vma(mm, addr);
2838 2888 if (!vma)
2839 maddr = kmap(page); 2889 break;
2840 if (write) { 2890 if (vma->vm_ops && vma->vm_ops->access)
2841 copy_to_user_page(vma, page, addr, 2891 ret = vma->vm_ops->access(vma, addr, buf,
2842 maddr + offset, buf, bytes); 2892 len, write);
2843 set_page_dirty_lock(page); 2893 if (ret <= 0)
2894#endif
2895 break;
2896 bytes = ret;
2844 } else { 2897 } else {
2845 copy_from_user_page(vma, page, addr, 2898 bytes = len;
2846 buf, maddr + offset, bytes); 2899 offset = addr & (PAGE_SIZE-1);
2900 if (bytes > PAGE_SIZE-offset)
2901 bytes = PAGE_SIZE-offset;
2902
2903 maddr = kmap(page);
2904 if (write) {
2905 copy_to_user_page(vma, page, addr,
2906 maddr + offset, buf, bytes);
2907 set_page_dirty_lock(page);
2908 } else {
2909 copy_from_user_page(vma, page, addr,
2910 buf, maddr + offset, bytes);
2911 }
2912 kunmap(page);
2913 page_cache_release(page);
2847 } 2914 }
2848 kunmap(page);
2849 page_cache_release(page);
2850 len -= bytes; 2915 len -= bytes;
2851 buf += bytes; 2916 buf += bytes;
2852 addr += bytes; 2917 addr += bytes;
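
The reworked access_process_vm() loop above still copies in chunks that never cross a page boundary, whether it goes through get_user_pages() or the new vm_ops->access fallback. A tiny standalone demonstration of that chunking arithmetic, with a stand-in PG_SIZE instead of the kernel's PAGE_SIZE:

#include <stdio.h>

#define PG_SIZE 4096UL    /* stand-in for PAGE_SIZE */

int main(void)
{
        unsigned long addr = 0x1ff0, len = 40;

        while (len) {
                unsigned long offset = addr & (PG_SIZE - 1);
                unsigned long bytes = len;

                if (bytes > PG_SIZE - offset)
                        bytes = PG_SIZE - offset;   /* stop at the boundary */

                printf("copy %lu bytes at page offset %lu\n", bytes, offset);
                addr += bytes;
                len -= bytes;
        }
        return 0;
}
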
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 833f854eabe5..89fee2dcb039 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -62,9 +62,9 @@ static void release_memory_resource(struct resource *res)
62 62
63#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 63#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
64#ifndef CONFIG_SPARSEMEM_VMEMMAP 64#ifndef CONFIG_SPARSEMEM_VMEMMAP
65static void get_page_bootmem(unsigned long info, struct page *page, int magic) 65static void get_page_bootmem(unsigned long info, struct page *page, int type)
66{ 66{
67 atomic_set(&page->_mapcount, magic); 67 atomic_set(&page->_mapcount, type);
68 SetPagePrivate(page); 68 SetPagePrivate(page);
69 set_page_private(page, info); 69 set_page_private(page, info);
70 atomic_inc(&page->_count); 70 atomic_inc(&page->_count);
@@ -72,10 +72,10 @@ static void get_page_bootmem(unsigned long info, struct page *page, int magic)
72 72
73void put_page_bootmem(struct page *page) 73void put_page_bootmem(struct page *page)
74{ 74{
75 int magic; 75 int type;
76 76
77 magic = atomic_read(&page->_mapcount); 77 type = atomic_read(&page->_mapcount);
78 BUG_ON(magic >= -1); 78 BUG_ON(type >= -1);
79 79
80 if (atomic_dec_return(&page->_count) == 1) { 80 if (atomic_dec_return(&page->_count) == 1) {
81 ClearPagePrivate(page); 81 ClearPagePrivate(page);
@@ -86,7 +86,7 @@ void put_page_bootmem(struct page *page)
86 86
87} 87}
88 88
89void register_page_bootmem_info_section(unsigned long start_pfn) 89static void register_page_bootmem_info_section(unsigned long start_pfn)
90{ 90{
91 unsigned long *usemap, mapsize, section_nr, i; 91 unsigned long *usemap, mapsize, section_nr, i;
92 struct mem_section *ms; 92 struct mem_section *ms;
@@ -119,7 +119,7 @@ void register_page_bootmem_info_section(unsigned long start_pfn)
119 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; 119 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
120 120
121 for (i = 0; i < mapsize; i++, page++) 121 for (i = 0; i < mapsize; i++, page++)
122 get_page_bootmem(section_nr, page, MIX_INFO); 122 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
123 123
124} 124}
125 125
@@ -429,7 +429,9 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
429 429
430 if (need_zonelists_rebuild) 430 if (need_zonelists_rebuild)
431 build_all_zonelists(); 431 build_all_zonelists();
432 vm_total_pages = nr_free_pagecache_pages(); 432 else
433 vm_total_pages = nr_free_pagecache_pages();
434
433 writeback_set_ratelimit(); 435 writeback_set_ratelimit();
434 436
435 if (onlined_pages) 437 if (onlined_pages)
@@ -455,7 +457,7 @@ static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
455 /* we can use NODE_DATA(nid) from here */ 457 /* we can use NODE_DATA(nid) from here */
456 458
457 /* init node's zones as empty zones, we don't have any present pages.*/ 459 /* init node's zones as empty zones, we don't have any present pages.*/
458 free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); 460 free_area_init_node(nid, zones_size, start_pfn, zholes_size);
459 461
460 return pgdat; 462 return pgdat;
461} 463}
@@ -521,6 +523,66 @@ EXPORT_SYMBOL_GPL(add_memory);
521 523
522#ifdef CONFIG_MEMORY_HOTREMOVE 524#ifdef CONFIG_MEMORY_HOTREMOVE
523/* 525/*
526 * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
527 * set and the size of the free page is given by page_order(). Using this,
528 * the function determines if the pageblock contains only free pages.
529 * Due to buddy constraints, a free page at least the size of a pageblock will
530 * be located at the start of the pageblock
531 */
532static inline int pageblock_free(struct page *page)
533{
534 return PageBuddy(page) && page_order(page) >= pageblock_order;
535}
536
537/* Return the start of the next active pageblock after a given page */
538static struct page *next_active_pageblock(struct page *page)
539{
540 int pageblocks_stride;
541
542 /* Ensure the starting page is pageblock-aligned */
543 BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
544
545 /* Move forward by at least 1 * pageblock_nr_pages */
546 pageblocks_stride = 1;
547
548 /* If the entire pageblock is free, move to the end of free page */
549 if (pageblock_free(page))
550 pageblocks_stride += page_order(page) - pageblock_order;
551
552 return page + (pageblocks_stride * pageblock_nr_pages);
553}
554
555/* Checks if this range of memory is likely to be hot-removable. */
556int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
557{
558 int type;
559 struct page *page = pfn_to_page(start_pfn);
560 struct page *end_page = page + nr_pages;
561
562 /* Check the starting page of each pageblock within the range */
563 for (; page < end_page; page = next_active_pageblock(page)) {
564 type = get_pageblock_migratetype(page);
565
566 /*
567 * A pageblock containing MOVABLE or free pages is considered
568 * removable
569 */
570 if (type != MIGRATE_MOVABLE && !pageblock_free(page))
571 return 0;
572
573 /*
574 * A pageblock starting with a PageReserved page is not
575 * considered removable.
576 */
577 if (PageReserved(page))
578 return 0;
579 }
580
581 /* All pageblocks in the memory block are likely to be hot-removable */
582 return 1;
583}
584
585/*
524 * Confirm all pages in a range [start, end) is belongs to the same zone. 586 * Confirm all pages in a range [start, end) is belongs to the same zone.
525 */ 587 */
526static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) 588static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
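The removability scan added above walks a memory section one pageblock at a time and jumps further ahead when a block begins with a free buddy page larger than a pageblock. A minimal userspace sketch of that stride arithmetic, with PAGEBLOCK_ORDER assumed to be 9 and PageBuddy()/page_order() reduced to a plain free_order argument:

#include <stdio.h>

#define PAGEBLOCK_ORDER    9                    /* assumption: 2MB blocks of 4KB pages */
#define PAGEBLOCK_NR_PAGES (1UL << PAGEBLOCK_ORDER)

/*
 * Stride used by the removability scan: always at least one pageblock,
 * plus extra blocks when the block starts with a free buddy larger than
 * a pageblock (mirrors next_active_pageblock() in the hunk above).
 */
static unsigned long scan_stride(int free_order)
{
        unsigned long stride = 1;

        if (free_order >= PAGEBLOCK_ORDER)
                stride += free_order - PAGEBLOCK_ORDER;

        return stride * PAGEBLOCK_NR_PAGES;
}

int main(void)
{
        printf("block not free:      advance %lu pages\n", scan_stride(-1));
        printf("order-9 free buddy:  advance %lu pages\n", scan_stride(9));
        printf("order-10 free buddy: advance %lu pages\n", scan_stride(10));
        return 0;
}

scan_stride(-1) models a block that is not a single free buddy; larger orders skip the whole free area in one step.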
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c94e58b192c3..e550bec20582 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1481,7 +1481,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1481 1481
1482 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { 1482 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1483 zl = node_zonelist(interleave_nid(*mpol, vma, addr, 1483 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1484 HPAGE_SHIFT), gfp_flags); 1484 huge_page_shift(hstate_vma(vma))), gfp_flags);
1485 } else { 1485 } else {
1486 zl = policy_zonelist(gfp_flags, *mpol); 1486 zl = policy_zonelist(gfp_flags, *mpol);
1487 if ((*mpol)->mode == MPOL_BIND) 1487 if ((*mpol)->mode == MPOL_BIND)
@@ -2220,9 +2220,12 @@ static void check_huge_range(struct vm_area_struct *vma,
2220{ 2220{
2221 unsigned long addr; 2221 unsigned long addr;
2222 struct page *page; 2222 struct page *page;
2223 struct hstate *h = hstate_vma(vma);
2224 unsigned long sz = huge_page_size(h);
2223 2225
2224 for (addr = start; addr < end; addr += HPAGE_SIZE) { 2226 for (addr = start; addr < end; addr += sz) {
2225 pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); 2227 pte_t *ptep = huge_pte_offset(vma->vm_mm,
2228 addr & huge_page_mask(h));
2226 pte_t pte; 2229 pte_t pte;
2227 2230
2228 if (!ptep) 2231 if (!ptep)
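The mempolicy hunks replace the fixed HPAGE_SHIFT/HPAGE_SIZE constants with the per-VMA hstate, so the step and mask for the walk come from the mapping at run time. A userspace sketch of that parameterised walk; the 2MB and 1GB sizes are illustrative and the pte lookup is reduced to a printf:

#include <stdio.h>

static void walk_range(unsigned long start, unsigned long end, unsigned long sz)
{
        unsigned long mask = ~(sz - 1);         /* huge_page_mask() equivalent */

        for (unsigned long addr = start; addr < end; addr += sz)
                printf("lookup huge pte at %#lx (page base %#lx)\n",
                       addr, addr & mask);
}

int main(void)
{
        walk_range(0x40200000UL, 0x40800000UL, 2UL << 20);      /* 2MB pages */
        walk_range(0x40000000UL, 0x80000000UL, 1UL << 30);      /* 1GB pages */
        return 0;
}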
diff --git a/mm/migrate.c b/mm/migrate.c
index 55bd355d170d..d8c65a65c61d 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -30,6 +30,7 @@
30#include <linux/vmalloc.h> 30#include <linux/vmalloc.h>
31#include <linux/security.h> 31#include <linux/security.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33#include <linux/syscalls.h>
33 34
34#include "internal.h" 35#include "internal.h"
35 36
@@ -357,6 +358,9 @@ static int migrate_page_move_mapping(struct address_space *mapping,
357 __inc_zone_page_state(newpage, NR_FILE_PAGES); 358 __inc_zone_page_state(newpage, NR_FILE_PAGES);
358 359
359 write_unlock_irq(&mapping->tree_lock); 360 write_unlock_irq(&mapping->tree_lock);
361 if (!PageSwapCache(newpage)) {
362 mem_cgroup_uncharge_cache_page(page);
363 }
360 364
361 return 0; 365 return 0;
362} 366}
@@ -610,7 +614,6 @@ static int move_to_new_page(struct page *newpage, struct page *page)
610 rc = fallback_migrate_page(mapping, newpage, page); 614 rc = fallback_migrate_page(mapping, newpage, page);
611 615
612 if (!rc) { 616 if (!rc) {
613 mem_cgroup_page_migration(page, newpage);
614 remove_migration_ptes(page, newpage); 617 remove_migration_ptes(page, newpage);
615 } else 618 } else
616 newpage->mapping = NULL; 619 newpage->mapping = NULL;
@@ -640,6 +643,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
640 /* page was freed from under us. So we are done. */ 643 /* page was freed from under us. So we are done. */
641 goto move_newpage; 644 goto move_newpage;
642 645
646 charge = mem_cgroup_prepare_migration(page, newpage);
647 if (charge == -ENOMEM) {
648 rc = -ENOMEM;
649 goto move_newpage;
650 }
651 /* prepare cgroup just returns 0 or -ENOMEM */
652 BUG_ON(charge);
653
643 rc = -EAGAIN; 654 rc = -EAGAIN;
644 if (TestSetPageLocked(page)) { 655 if (TestSetPageLocked(page)) {
645 if (!force) 656 if (!force)
@@ -691,19 +702,14 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
691 goto rcu_unlock; 702 goto rcu_unlock;
692 } 703 }
693 704
694 charge = mem_cgroup_prepare_migration(page);
695 /* Establish migration ptes or remove ptes */ 705 /* Establish migration ptes or remove ptes */
696 try_to_unmap(page, 1); 706 try_to_unmap(page, 1);
697 707
698 if (!page_mapped(page)) 708 if (!page_mapped(page))
699 rc = move_to_new_page(newpage, page); 709 rc = move_to_new_page(newpage, page);
700 710
701 if (rc) { 711 if (rc)
702 remove_migration_ptes(page, page); 712 remove_migration_ptes(page, page);
703 if (charge)
704 mem_cgroup_end_migration(page);
705 } else if (charge)
706 mem_cgroup_end_migration(newpage);
707rcu_unlock: 713rcu_unlock:
708 if (rcu_locked) 714 if (rcu_locked)
709 rcu_read_unlock(); 715 rcu_read_unlock();
@@ -724,6 +730,8 @@ unlock:
724 } 730 }
725 731
726move_newpage: 732move_newpage:
733 if (!charge)
734 mem_cgroup_end_migration(newpage);
727 /* 735 /*
728 * Move the new page to the LRU. If migration was not successful 736 * Move the new page to the LRU. If migration was not successful
729 * then this will free the page. 737 * then this will free the page.
@@ -1070,7 +1078,6 @@ out2:
1070 mmput(mm); 1078 mmput(mm);
1071 return err; 1079 return err;
1072} 1080}
1073#endif
1074 1081
1075/* 1082/*
1076 * Call migration functions in the vma_ops that may prepare 1083 * Call migration functions in the vma_ops that may prepare
@@ -1092,3 +1099,4 @@ int migrate_vmas(struct mm_struct *mm, const nodemask_t *to,
1092 } 1099 }
1093 return err; 1100 return err;
1094} 1101}
1102#endif
diff --git a/mm/mm_init.c b/mm/mm_init.c
new file mode 100644
index 000000000000..c6af41ea9994
--- /dev/null
+++ b/mm/mm_init.c
@@ -0,0 +1,152 @@
1/*
2 * mm_init.c - Memory initialisation verification and debugging
3 *
4 * Copyright 2008 IBM Corporation, 2008
5 * Author Mel Gorman <mel@csn.ul.ie>
6 *
7 */
8#include <linux/kernel.h>
9#include <linux/init.h>
10#include <linux/kobject.h>
11#include <linux/module.h>
12#include "internal.h"
13
14#ifdef CONFIG_DEBUG_MEMORY_INIT
15int __meminitdata mminit_loglevel;
16
17/* The zonelists are simply reported, validation is manual. */
18void mminit_verify_zonelist(void)
19{
20 int nid;
21
22 if (mminit_loglevel < MMINIT_VERIFY)
23 return;
24
25 for_each_online_node(nid) {
26 pg_data_t *pgdat = NODE_DATA(nid);
27 struct zone *zone;
28 struct zoneref *z;
29 struct zonelist *zonelist;
30 int i, listid, zoneid;
31
32 BUG_ON(MAX_ZONELISTS > 2);
33 for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {
34
35 /* Identify the zone and nodelist */
36 zoneid = i % MAX_NR_ZONES;
37 listid = i / MAX_NR_ZONES;
38 zonelist = &pgdat->node_zonelists[listid];
39 zone = &pgdat->node_zones[zoneid];
40 if (!populated_zone(zone))
41 continue;
42
43 /* Print information about the zonelist */
44 printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
45 listid > 0 ? "thisnode" : "general", nid,
46 zone->name);
47
48 /* Iterate the zonelist */
49 for_each_zone_zonelist(zone, z, zonelist, zoneid) {
50#ifdef CONFIG_NUMA
51 printk(KERN_CONT "%d:%s ",
52 zone->node, zone->name);
53#else
54 printk(KERN_CONT "0:%s ", zone->name);
55#endif /* CONFIG_NUMA */
56 }
57 printk(KERN_CONT "\n");
58 }
59 }
60}
61
62void __init mminit_verify_pageflags_layout(void)
63{
64 int shift, width;
65 unsigned long or_mask, add_mask;
66
67 shift = 8 * sizeof(unsigned long);
68 width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH;
69 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
70 "Section %d Node %d Zone %d Flags %d\n",
71 SECTIONS_WIDTH,
72 NODES_WIDTH,
73 ZONES_WIDTH,
74 NR_PAGEFLAGS);
75 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
76 "Section %d Node %d Zone %d\n",
77#ifdef SECTIONS_SHIFT
78 SECTIONS_SHIFT,
79#else
80 0,
81#endif
82 NODES_SHIFT,
83 ZONES_SHIFT);
84 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_offsets",
85 "Section %lu Node %lu Zone %lu\n",
86 (unsigned long)SECTIONS_PGSHIFT,
87 (unsigned long)NODES_PGSHIFT,
88 (unsigned long)ZONES_PGSHIFT);
89 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_zoneid",
90 "Zone ID: %lu -> %lu\n",
91 (unsigned long)ZONEID_PGOFF,
92 (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT));
93 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_usage",
94 "location: %d -> %d unused %d -> %d flags %d -> %d\n",
95 shift, width, width, NR_PAGEFLAGS, NR_PAGEFLAGS, 0);
96#ifdef NODE_NOT_IN_PAGE_FLAGS
97 mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
98 "Node not in page flags");
99#endif
100
101 if (SECTIONS_WIDTH) {
102 shift -= SECTIONS_WIDTH;
103 BUG_ON(shift != SECTIONS_PGSHIFT);
104 }
105 if (NODES_WIDTH) {
106 shift -= NODES_WIDTH;
107 BUG_ON(shift != NODES_PGSHIFT);
108 }
109 if (ZONES_WIDTH) {
110 shift -= ZONES_WIDTH;
111 BUG_ON(shift != ZONES_PGSHIFT);
112 }
113
114 /* Check for bitmask overlaps */
115 or_mask = (ZONES_MASK << ZONES_PGSHIFT) |
116 (NODES_MASK << NODES_PGSHIFT) |
117 (SECTIONS_MASK << SECTIONS_PGSHIFT);
118 add_mask = (ZONES_MASK << ZONES_PGSHIFT) +
119 (NODES_MASK << NODES_PGSHIFT) +
120 (SECTIONS_MASK << SECTIONS_PGSHIFT);
121 BUG_ON(or_mask != add_mask);
122}
123
124void __meminit mminit_verify_page_links(struct page *page, enum zone_type zone,
125 unsigned long nid, unsigned long pfn)
126{
127 BUG_ON(page_to_nid(page) != nid);
128 BUG_ON(page_zonenum(page) != zone);
129 BUG_ON(page_to_pfn(page) != pfn);
130}
131
132static __init int set_mminit_loglevel(char *str)
133{
134 get_option(&str, &mminit_loglevel);
135 return 0;
136}
137early_param("mminit_loglevel", set_mminit_loglevel);
138#endif /* CONFIG_DEBUG_MEMORY_INIT */
139
140struct kobject *mm_kobj;
141EXPORT_SYMBOL_GPL(mm_kobj);
142
143static int __init mm_sysfs_init(void)
144{
145 mm_kobj = kobject_create_and_add("mm", kernel_kobj);
146 if (!mm_kobj)
147 return -ENOMEM;
148
149 return 0;
150}
151
152__initcall(mm_sysfs_init);
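mminit_verify_pageflags_layout() checks that the section, node and zone fields carved out of page->flags do not overlap by comparing an OR of the shifted masks with their sum: the two agree exactly when the masks are pairwise disjoint. A self-contained demo of that identity with made-up field widths:

#include <stdio.h>

static int fields_overlap(unsigned long long a, unsigned long long b,
                          unsigned long long c)
{
        /* (a | b | c) == (a + b + c) iff no bit is set in two masks */
        return (a | b | c) != (a + b + c);
}

int main(void)
{
        unsigned long long zones = 0x3ULL << 30;        /* bits 30-31 */
        unsigned long long nodes = 0x3ffULL << 20;      /* bits 20-29 */
        unsigned long long sects = 0xfffULL << 8;       /* bits 8-19  */

        printf("disjoint layout overlaps? %d\n", fields_overlap(zones, nodes, sects));
        printf("broken layout overlaps?   %d\n", fields_overlap(zones, nodes, nodes));
        return 0;
}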
diff --git a/mm/mmap.c b/mm/mmap.c
index 1d102b956fd8..5e0cc99e9cd5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -32,6 +32,8 @@
32#include <asm/tlb.h> 32#include <asm/tlb.h>
33#include <asm/mmu_context.h> 33#include <asm/mmu_context.h>
34 34
35#include "internal.h"
36
35#ifndef arch_mmap_check 37#ifndef arch_mmap_check
36#define arch_mmap_check(addr, len, flags) (0) 38#define arch_mmap_check(addr, len, flags) (0)
37#endif 39#endif
@@ -1108,6 +1110,9 @@ munmap_back:
1108 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) 1110 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1109 return -ENOMEM; 1111 return -ENOMEM;
1110 1112
1113 if (flags & MAP_NORESERVE)
1114 vm_flags |= VM_NORESERVE;
1115
1111 if (accountable && (!(flags & MAP_NORESERVE) || 1116 if (accountable && (!(flags & MAP_NORESERVE) ||
1112 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { 1117 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) {
1113 if (vm_flags & VM_SHARED) { 1118 if (vm_flags & VM_SHARED) {
@@ -1763,7 +1768,7 @@ static void unmap_region(struct mm_struct *mm,
1763 update_hiwater_rss(mm); 1768 update_hiwater_rss(mm);
1764 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); 1769 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
1765 vm_unacct_memory(nr_accounted); 1770 vm_unacct_memory(nr_accounted);
1766 free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, 1771 free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
1767 next? next->vm_start: 0); 1772 next? next->vm_start: 0);
1768 tlb_finish_mmu(tlb, start, end); 1773 tlb_finish_mmu(tlb, start, end);
1769} 1774}
@@ -1807,7 +1812,8 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1807 struct mempolicy *pol; 1812 struct mempolicy *pol;
1808 struct vm_area_struct *new; 1813 struct vm_area_struct *new;
1809 1814
1810 if (is_vm_hugetlb_page(vma) && (addr & ~HPAGE_MASK)) 1815 if (is_vm_hugetlb_page(vma) && (addr &
1816 ~(huge_page_mask(hstate_vma(vma)))))
1811 return -EINVAL; 1817 return -EINVAL;
1812 1818
1813 if (mm->map_count >= sysctl_max_map_count) 1819 if (mm->map_count >= sysctl_max_map_count)
@@ -2063,7 +2069,7 @@ void exit_mmap(struct mm_struct *mm)
2063 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2069 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2064 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2070 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2065 vm_unacct_memory(nr_accounted); 2071 vm_unacct_memory(nr_accounted);
2066 free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0); 2072 free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
2067 tlb_finish_mmu(tlb, 0, end); 2073 tlb_finish_mmu(tlb, 0, end);
2068 2074
2069 /* 2075 /*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 360d9cc8b38c..abd645a3b0a0 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -153,12 +153,10 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
153 * If we make a private mapping writable we increase our commit; 153 * If we make a private mapping writable we increase our commit;
154 * but (without finer accounting) cannot reduce our commit if we 154 * but (without finer accounting) cannot reduce our commit if we
155 * make it unwritable again. 155 * make it unwritable again.
156 *
157 * FIXME? We haven't defined a VM_NORESERVE flag, so mprotecting
158 * a MAP_NORESERVE private mapping to writable will now reserve.
159 */ 156 */
160 if (newflags & VM_WRITE) { 157 if (newflags & VM_WRITE) {
161 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) { 158 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|
159 VM_SHARED|VM_NORESERVE))) {
162 charged = nrpages; 160 charged = nrpages;
163 if (security_vm_enough_memory(charged)) 161 if (security_vm_enough_memory(charged))
164 return -ENOMEM; 162 return -ENOMEM;
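Together with the mmap.c hunk that records MAP_NORESERVE as VM_NORESERVE, the condition above decides whether making a private mapping writable must charge commit. A small model of that decision; the flag values are illustrative, not the kernel's:

#include <stdio.h>

#define VM_WRITE     0x00000002UL
#define VM_SHARED    0x00000008UL
#define VM_ACCOUNT   0x00100000UL       /* illustrative bit values */
#define VM_NORESERVE 0x00200000UL

/* Does turning on write permission have to charge commit? */
static int needs_charge(unsigned long oldflags, unsigned long newflags)
{
        return (newflags & VM_WRITE) &&
               !(oldflags & (VM_ACCOUNT | VM_WRITE | VM_SHARED | VM_NORESERVE));
}

int main(void)
{
        printf("plain private ro->rw:  %d\n", needs_charge(0, VM_WRITE));
        printf("MAP_NORESERVE mapping: %d\n",
               needs_charge(VM_NORESERVE, VM_NORESERVE | VM_WRITE));
        printf("shared mapping:        %d\n",
               needs_charge(VM_SHARED, VM_SHARED | VM_WRITE));
        return 0;
}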
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 79ac4afc908c..6da667274df5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -153,9 +153,9 @@ static unsigned long __meminitdata dma_reserve;
153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; 153 static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; 154 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 155#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
156 unsigned long __initdata required_kernelcore; 156 static unsigned long __initdata required_kernelcore;
157 static unsigned long __initdata required_movablecore; 157 static unsigned long __initdata required_movablecore;
158 unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 158 static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
159 159
160 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 160 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
161 int movable_zone; 161 int movable_zone;
@@ -264,7 +264,7 @@ static void free_compound_page(struct page *page)
264 __free_pages_ok(page, compound_order(page)); 264 __free_pages_ok(page, compound_order(page));
265} 265}
266 266
267static void prep_compound_page(struct page *page, unsigned long order) 267void prep_compound_page(struct page *page, unsigned long order)
268{ 268{
269 int i; 269 int i;
270 int nr_pages = 1 << order; 270 int nr_pages = 1 << order;
@@ -432,8 +432,9 @@ static inline void __free_one_page(struct page *page,
432 432
433 buddy = __page_find_buddy(page, page_idx, order); 433 buddy = __page_find_buddy(page, page_idx, order);
434 if (!page_is_buddy(page, buddy, order)) 434 if (!page_is_buddy(page, buddy, order))
435 break; /* Move the buddy up one level. */ 435 break;
436 436
437 /* Our buddy is free, merge with it and move up one order. */
437 list_del(&buddy->lru); 438 list_del(&buddy->lru);
438 zone->free_area[order].nr_free--; 439 zone->free_area[order].nr_free--;
439 rmv_page_order(buddy); 440 rmv_page_order(buddy);
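The new comment spells out the loop invariant: each pass merges the block with its free buddy and moves up one order. The buddy of the 2^order block at index i is i ^ (1 << order), and the merged block starts at i & buddy; a small demo of just that index arithmetic (no struct page involved):

#include <stdio.h>

static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);
}

int main(void)
{
        unsigned long idx = 8;          /* merging upward from page index 8 */

        for (unsigned int order = 0; order < 4; order++) {
                unsigned long buddy = buddy_index(idx, order);

                printf("order %u: block %lu + buddy %lu -> block %lu\n",
                       order, idx, buddy, idx & buddy);
                idx &= buddy;           /* combined block starts at the lower index */
        }
        return 0;
}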
@@ -532,7 +533,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
532/* 533/*
533 * permit the bootmem allocator to evade page validation on high-order frees 534 * permit the bootmem allocator to evade page validation on high-order frees
534 */ 535 */
535void __free_pages_bootmem(struct page *page, unsigned int order) 536void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
536{ 537{
537 if (order == 0) { 538 if (order == 0) {
538 __ClearPageReserved(page); 539 __ClearPageReserved(page);
@@ -673,9 +674,9 @@ static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
673 * Note that start_page and end_pages are not aligned on a pageblock 674 * Note that start_page and end_pages are not aligned on a pageblock
674 * boundary. If alignment is required, use move_freepages_block() 675 * boundary. If alignment is required, use move_freepages_block()
675 */ 676 */
676int move_freepages(struct zone *zone, 677static int move_freepages(struct zone *zone,
677 struct page *start_page, struct page *end_page, 678 struct page *start_page, struct page *end_page,
678 int migratetype) 679 int migratetype)
679{ 680{
680 struct page *page; 681 struct page *page;
681 unsigned long order; 682 unsigned long order;
@@ -714,7 +715,8 @@ int move_freepages(struct zone *zone,
714 return pages_moved; 715 return pages_moved;
715} 716}
716 717
717int move_freepages_block(struct zone *zone, struct page *page, int migratetype) 718static int move_freepages_block(struct zone *zone, struct page *page,
719 int migratetype)
718{ 720{
719 unsigned long start_pfn, end_pfn; 721 unsigned long start_pfn, end_pfn;
720 struct page *start_page, *end_page; 722 struct page *start_page, *end_page;
@@ -1429,7 +1431,7 @@ try_next_zone:
1429/* 1431/*
1430 * This is the 'heart' of the zoned buddy allocator. 1432 * This is the 'heart' of the zoned buddy allocator.
1431 */ 1433 */
1432static struct page * 1434struct page *
1433__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1435__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
1434 struct zonelist *zonelist, nodemask_t *nodemask) 1436 struct zonelist *zonelist, nodemask_t *nodemask)
1435{ 1437{
@@ -1632,22 +1634,7 @@ nopage:
1632got_pg: 1634got_pg:
1633 return page; 1635 return page;
1634} 1636}
1635 1637EXPORT_SYMBOL(__alloc_pages_internal);
1636struct page *
1637__alloc_pages(gfp_t gfp_mask, unsigned int order,
1638 struct zonelist *zonelist)
1639{
1640 return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
1641}
1642
1643struct page *
1644__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1645 struct zonelist *zonelist, nodemask_t *nodemask)
1646{
1647 return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
1648}
1649
1650EXPORT_SYMBOL(__alloc_pages);
1651 1638
1652/* 1639/*
1653 * Common helper functions. 1640 * Common helper functions.
@@ -1711,6 +1698,59 @@ void free_pages(unsigned long addr, unsigned int order)
1711 1698
1712EXPORT_SYMBOL(free_pages); 1699EXPORT_SYMBOL(free_pages);
1713 1700
1701/**
1702 * alloc_pages_exact - allocate an exact number of physically-contiguous pages.
1703 * @size: the number of bytes to allocate
1704 * @gfp_mask: GFP flags for the allocation
1705 *
1706 * This function is similar to alloc_pages(), except that it allocates the
1707 * minimum number of pages to satisfy the request. alloc_pages() can only
1708 * allocate memory in power-of-two pages.
1709 *
1710 * This function is also limited by MAX_ORDER.
1711 *
1712 * Memory allocated by this function must be released by free_pages_exact().
1713 */
1714void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
1715{
1716 unsigned int order = get_order(size);
1717 unsigned long addr;
1718
1719 addr = __get_free_pages(gfp_mask, order);
1720 if (addr) {
1721 unsigned long alloc_end = addr + (PAGE_SIZE << order);
1722 unsigned long used = addr + PAGE_ALIGN(size);
1723
1724 split_page(virt_to_page(addr), order);
1725 while (used < alloc_end) {
1726 free_page(used);
1727 used += PAGE_SIZE;
1728 }
1729 }
1730
1731 return (void *)addr;
1732}
1733EXPORT_SYMBOL(alloc_pages_exact);
1734
1735/**
1736 * free_pages_exact - release memory allocated via alloc_pages_exact()
1737 * @virt: the value returned by alloc_pages_exact.
1738 * @size: size of allocation, same value as passed to alloc_pages_exact().
1739 *
1740 * Release the memory allocated by a previous call to alloc_pages_exact.
1741 */
1742void free_pages_exact(void *virt, size_t size)
1743{
1744 unsigned long addr = (unsigned long)virt;
1745 unsigned long end = addr + PAGE_ALIGN(size);
1746
1747 while (addr < end) {
1748 free_page(addr);
1749 addr += PAGE_SIZE;
1750 }
1751}
1752EXPORT_SYMBOL(free_pages_exact);
1753
1714static unsigned int nr_free_zone_pages(int offset) 1754static unsigned int nr_free_zone_pages(int offset)
1715{ 1755{
1716 struct zoneref *z; 1756 struct zoneref *z;
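alloc_pages_exact() still allocates a power-of-two block, then splits it and hands the tail pages straight back, so only PAGE_ALIGN(size) worth of pages stays allocated. A userspace sketch of that arithmetic, assuming 4KB pages:

#include <stdio.h>

#define PAGE_SHIFT 12                   /* assumption: 4KB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Smallest order whose block covers size bytes (get_order()-style). */
static unsigned int order_for(size_t size)
{
        unsigned int order = 0;

        while ((PAGE_SIZE << order) < size)
                order++;
        return order;
}

/* How many pages the exact allocation keeps and how many it frees back. */
static void exact_alloc(size_t size)
{
        unsigned int order = order_for(size);
        unsigned long total = 1UL << order;
        unsigned long used = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

        printf("size %zu: order-%u block (%lu pages), %lu tail pages freed\n",
               size, order, total, total - used);
}

int main(void)
{
        exact_alloc(5 * PAGE_SIZE);     /* order-3 block, 3 pages handed back */
        exact_alloc(PAGE_SIZE);         /* order-0, nothing to free */
        return 0;
}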
@@ -2352,6 +2392,7 @@ void build_all_zonelists(void)
2352 2392
2353 if (system_state == SYSTEM_BOOTING) { 2393 if (system_state == SYSTEM_BOOTING) {
2354 __build_all_zonelists(NULL); 2394 __build_all_zonelists(NULL);
2395 mminit_verify_zonelist();
2355 cpuset_init_current_mems_allowed(); 2396 cpuset_init_current_mems_allowed();
2356 } else { 2397 } else {
2357 /* we have to stop all cpus to guarantee there is no user 2398 /* we have to stop all cpus to guarantee there is no user
@@ -2534,6 +2575,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2534 } 2575 }
2535 page = pfn_to_page(pfn); 2576 page = pfn_to_page(pfn);
2536 set_page_links(page, zone, nid, pfn); 2577 set_page_links(page, zone, nid, pfn);
2578 mminit_verify_page_links(page, zone, nid, pfn);
2537 init_page_count(page); 2579 init_page_count(page);
2538 reset_page_mapcount(page); 2580 reset_page_mapcount(page);
2539 SetPageReserved(page); 2581 SetPageReserved(page);
@@ -2611,7 +2653,7 @@ static int zone_batchsize(struct zone *zone)
2611 return batch; 2653 return batch;
2612} 2654}
2613 2655
2614inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) 2656static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
2615{ 2657{
2616 struct per_cpu_pages *pcp; 2658 struct per_cpu_pages *pcp;
2617 2659
@@ -2836,6 +2878,12 @@ __meminit int init_currently_empty_zone(struct zone *zone,
2836 2878
2837 zone->zone_start_pfn = zone_start_pfn; 2879 zone->zone_start_pfn = zone_start_pfn;
2838 2880
2881 mminit_dprintk(MMINIT_TRACE, "memmap_init",
2882 "Initialising map node %d zone %lu pfns %lu -> %lu\n",
2883 pgdat->node_id,
2884 (unsigned long)zone_idx(zone),
2885 zone_start_pfn, (zone_start_pfn + size));
2886
2839 zone_init_free_lists(zone); 2887 zone_init_free_lists(zone);
2840 2888
2841 return 0; 2889 return 0;
@@ -2975,7 +3023,8 @@ void __init sparse_memory_present_with_active_regions(int nid)
2975void __init push_node_boundaries(unsigned int nid, 3023void __init push_node_boundaries(unsigned int nid,
2976 unsigned long start_pfn, unsigned long end_pfn) 3024 unsigned long start_pfn, unsigned long end_pfn)
2977{ 3025{
2978 printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n", 3026 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3027 "Entering push_node_boundaries(%u, %lu, %lu)\n",
2979 nid, start_pfn, end_pfn); 3028 nid, start_pfn, end_pfn);
2980 3029
2981 /* Initialise the boundary for this node if necessary */ 3030 /* Initialise the boundary for this node if necessary */
@@ -2993,7 +3042,8 @@ void __init push_node_boundaries(unsigned int nid,
2993static void __meminit account_node_boundary(unsigned int nid, 3042static void __meminit account_node_boundary(unsigned int nid,
2994 unsigned long *start_pfn, unsigned long *end_pfn) 3043 unsigned long *start_pfn, unsigned long *end_pfn)
2995{ 3044{
2996 printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n", 3045 mminit_dprintk(MMINIT_TRACE, "zoneboundary",
3046 "Entering account_node_boundary(%u, %lu, %lu)\n",
2997 nid, *start_pfn, *end_pfn); 3047 nid, *start_pfn, *end_pfn);
2998 3048
2999 /* Return if boundary information has not been provided */ 3049 /* Return if boundary information has not been provided */
@@ -3050,7 +3100,7 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
3050 * assumption is made that zones within a node are ordered in monotonic 3100 * assumption is made that zones within a node are ordered in monotonic
3051 * increasing memory addresses so that the "highest" populated zone is used 3101 * increasing memory addresses so that the "highest" populated zone is used
3052 */ 3102 */
3053void __init find_usable_zone_for_movable(void) 3103static void __init find_usable_zone_for_movable(void)
3054{ 3104{
3055 int zone_index; 3105 int zone_index;
3056 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) { 3106 for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
@@ -3076,7 +3126,7 @@ void __init find_usable_zone_for_movable(void)
3076 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that 3126 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
3077 * zones within a node are in order of monotonic increases memory addresses 3127 * zones within a node are in order of monotonic increases memory addresses
3078 */ 3128 */
3079void __meminit adjust_zone_range_for_zone_movable(int nid, 3129static void __meminit adjust_zone_range_for_zone_movable(int nid,
3080 unsigned long zone_type, 3130 unsigned long zone_type,
3081 unsigned long node_start_pfn, 3131 unsigned long node_start_pfn,
3082 unsigned long node_end_pfn, 3132 unsigned long node_end_pfn,
@@ -3137,7 +3187,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
3137 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, 3187 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
3138 * then all holes in the requested range will be accounted for. 3188 * then all holes in the requested range will be accounted for.
3139 */ 3189 */
3140unsigned long __meminit __absent_pages_in_range(int nid, 3190static unsigned long __meminit __absent_pages_in_range(int nid,
3141 unsigned long range_start_pfn, 3191 unsigned long range_start_pfn,
3142 unsigned long range_end_pfn) 3192 unsigned long range_end_pfn)
3143{ 3193{
@@ -3368,8 +3418,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3368 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 3418 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
3369 if (realsize >= memmap_pages) { 3419 if (realsize >= memmap_pages) {
3370 realsize -= memmap_pages; 3420 realsize -= memmap_pages;
3371 printk(KERN_DEBUG 3421 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3372 " %s zone: %lu pages used for memmap\n", 3422 "%s zone: %lu pages used for memmap\n",
3373 zone_names[j], memmap_pages); 3423 zone_names[j], memmap_pages);
3374 } else 3424 } else
3375 printk(KERN_WARNING 3425 printk(KERN_WARNING
@@ -3379,7 +3429,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3379 /* Account for reserved pages */ 3429 /* Account for reserved pages */
3380 if (j == 0 && realsize > dma_reserve) { 3430 if (j == 0 && realsize > dma_reserve) {
3381 realsize -= dma_reserve; 3431 realsize -= dma_reserve;
3382 printk(KERN_DEBUG " %s zone: %lu pages reserved\n", 3432 mminit_dprintk(MMINIT_TRACE, "memmap_init",
3433 "%s zone: %lu pages reserved\n",
3383 zone_names[0], dma_reserve); 3434 zone_names[0], dma_reserve);
3384 } 3435 }
3385 3436
@@ -3464,10 +3515,11 @@ static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
3464#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 3515#endif /* CONFIG_FLAT_NODE_MEM_MAP */
3465} 3516}
3466 3517
3467void __paginginit free_area_init_node(int nid, struct pglist_data *pgdat, 3518void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
3468 unsigned long *zones_size, unsigned long node_start_pfn, 3519 unsigned long node_start_pfn, unsigned long *zholes_size)
3469 unsigned long *zholes_size)
3470{ 3520{
3521 pg_data_t *pgdat = NODE_DATA(nid);
3522
3471 pgdat->node_id = nid; 3523 pgdat->node_id = nid;
3472 pgdat->node_start_pfn = node_start_pfn; 3524 pgdat->node_start_pfn = node_start_pfn;
3473 calculate_node_totalpages(pgdat, zones_size, zholes_size); 3525 calculate_node_totalpages(pgdat, zones_size, zholes_size);
@@ -3520,10 +3572,13 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
3520{ 3572{
3521 int i; 3573 int i;
3522 3574
3523 printk(KERN_DEBUG "Entering add_active_range(%d, %#lx, %#lx) " 3575 mminit_dprintk(MMINIT_TRACE, "memory_register",
3524 "%d entries of %d used\n", 3576 "Entering add_active_range(%d, %#lx, %#lx) "
3525 nid, start_pfn, end_pfn, 3577 "%d entries of %d used\n",
3526 nr_nodemap_entries, MAX_ACTIVE_REGIONS); 3578 nid, start_pfn, end_pfn,
3579 nr_nodemap_entries, MAX_ACTIVE_REGIONS);
3580
3581 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
3527 3582
3528 /* Merge with existing active regions if possible */ 3583 /* Merge with existing active regions if possible */
3529 for (i = 0; i < nr_nodemap_entries; i++) { 3584 for (i = 0; i < nr_nodemap_entries; i++) {
@@ -3669,7 +3724,7 @@ static void __init sort_node_map(void)
3669} 3724}
3670 3725
3671/* Find the lowest pfn for a node */ 3726/* Find the lowest pfn for a node */
3672unsigned long __init find_min_pfn_for_node(int nid) 3727static unsigned long __init find_min_pfn_for_node(int nid)
3673{ 3728{
3674 int i; 3729 int i;
3675 unsigned long min_pfn = ULONG_MAX; 3730 unsigned long min_pfn = ULONG_MAX;
@@ -3741,7 +3796,7 @@ static unsigned long __init early_calculate_totalpages(void)
3741 * memory. When they don't, some nodes will have more kernelcore than 3796 * memory. When they don't, some nodes will have more kernelcore than
3742 * others 3797 * others
3743 */ 3798 */
3744void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) 3799static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3745{ 3800{
3746 int i, nid; 3801 int i, nid;
3747 unsigned long usable_startpfn; 3802 unsigned long usable_startpfn;
@@ -3957,10 +4012,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
3957 early_node_map[i].end_pfn); 4012 early_node_map[i].end_pfn);
3958 4013
3959 /* Initialise every node */ 4014 /* Initialise every node */
4015 mminit_verify_pageflags_layout();
3960 setup_nr_node_ids(); 4016 setup_nr_node_ids();
3961 for_each_online_node(nid) { 4017 for_each_online_node(nid) {
3962 pg_data_t *pgdat = NODE_DATA(nid); 4018 pg_data_t *pgdat = NODE_DATA(nid);
3963 free_area_init_node(nid, pgdat, NULL, 4019 free_area_init_node(nid, NULL,
3964 find_min_pfn_for_node(nid), NULL); 4020 find_min_pfn_for_node(nid), NULL);
3965 4021
3966 /* Any memory on that node */ 4022 /* Any memory on that node */
@@ -4025,15 +4081,13 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
4025} 4081}
4026 4082
4027#ifndef CONFIG_NEED_MULTIPLE_NODES 4083#ifndef CONFIG_NEED_MULTIPLE_NODES
4028static bootmem_data_t contig_bootmem_data; 4084struct pglist_data contig_page_data = { .bdata = &bootmem_node_data[0] };
4029struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
4030
4031EXPORT_SYMBOL(contig_page_data); 4085EXPORT_SYMBOL(contig_page_data);
4032#endif 4086#endif
4033 4087
4034void __init free_area_init(unsigned long *zones_size) 4088void __init free_area_init(unsigned long *zones_size)
4035{ 4089{
4036 free_area_init_node(0, NODE_DATA(0), zones_size, 4090 free_area_init_node(0, zones_size,
4037 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); 4091 __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
4038} 4092}
4039 4093
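Many of the page_alloc.c hunks convert unconditional printk(KERN_DEBUG ...) calls to mminit_dprintk(), which is gated on a boot-time log level. A rough userspace imitation of that pattern; the level comparison and message prefixing here are simplified assumptions, not the kernel macro:

#include <stdio.h>

enum { MMINIT_WARNING, MMINIT_VERIFY, MMINIT_TRACE };

static int mminit_loglevel = MMINIT_VERIFY;     /* would come from mminit_loglevel= */

#define mminit_dprintk(level, prefix, fmt, ...)                           \
        do {                                                              \
                if ((level) <= mminit_loglevel)                           \
                        printf("mminit::%s " fmt, prefix, ##__VA_ARGS__); \
        } while (0)

int main(void)
{
        mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
                       "range clamped to %lu pfns\n", 1048576UL);
        mminit_dprintk(MMINIT_TRACE, "memmap_init",
                       "suppressed at this log level\n");
        return 0;
}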
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 9d834aa4b979..0cbe0c60c6bf 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -130,7 +130,7 @@ static int __pdflush(struct pdflush_work *my_work)
130 * Thread creation: For how long have there been zero 130 * Thread creation: For how long have there been zero
131 * available threads? 131 * available threads?
132 */ 132 */
133 if (jiffies - last_empty_jifs > 1 * HZ) { 133 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
134 /* unlocked list_empty() test is OK here */ 134 /* unlocked list_empty() test is OK here */
135 if (list_empty(&pdflush_list)) { 135 if (list_empty(&pdflush_list)) {
136 /* unlocked test is OK here */ 136 /* unlocked test is OK here */
@@ -151,7 +151,7 @@ static int __pdflush(struct pdflush_work *my_work)
151 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) 151 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
152 continue; 152 continue;
153 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); 153 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
154 if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) { 154 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
155 /* Limit exit rate */ 155 /* Limit exit rate */
156 pdf->when_i_went_to_sleep = jiffies; 156 pdf->when_i_went_to_sleep = jiffies;
157 break; /* exeunt */ 157 break; /* exeunt */
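time_after() compares two timestamps by the sign of their signed difference, so the result stays correct across a counter wrap; the open-coded jiffies subtractions above are converted to it mainly for clarity and consistency. A userspace model using 8-bit ticks so the wrap is easy to provoke (the cast relies on two's complement, as the kernel macro does):

#include <stdio.h>

typedef unsigned char tick_t;

static int time_after_model(tick_t a, tick_t b)
{
        return (signed char)(tick_t)(b - a) < 0;        /* true if a is later than b */
}

int main(void)
{
        tick_t deadline = (tick_t)(250 + 10);           /* 260 wraps to 4 */

        for (tick_t now = 253; now != 10; now++)
                printf("now=%3d  naive now>deadline: %d  time_after: %d\n",
                       now, now > deadline, time_after_model(now, deadline));
        return 0;
}

The naive direct comparison flips the wrong way while the counter sits just before the wrap; the signed-difference form does not.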
diff --git a/mm/rmap.c b/mm/rmap.c
index bf0a5b7cfb8e..abbd29f7c43f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -576,14 +576,8 @@ void page_add_anon_rmap(struct page *page,
576 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 576 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
577 if (atomic_inc_and_test(&page->_mapcount)) 577 if (atomic_inc_and_test(&page->_mapcount))
578 __page_set_anon_rmap(page, vma, address); 578 __page_set_anon_rmap(page, vma, address);
579 else { 579 else
580 __page_check_anon_rmap(page, vma, address); 580 __page_check_anon_rmap(page, vma, address);
581 /*
582 * We unconditionally charged during prepare, we uncharge here
583 * This takes care of balancing the reference counts
584 */
585 mem_cgroup_uncharge_page(page);
586 }
587} 581}
588 582
589/** 583/**
@@ -614,12 +608,6 @@ void page_add_file_rmap(struct page *page)
614{ 608{
615 if (atomic_inc_and_test(&page->_mapcount)) 609 if (atomic_inc_and_test(&page->_mapcount))
616 __inc_zone_page_state(page, NR_FILE_MAPPED); 610 __inc_zone_page_state(page, NR_FILE_MAPPED);
617 else
618 /*
619 * We unconditionally charged during prepare, we uncharge here
620 * This takes care of balancing the reference counts
621 */
622 mem_cgroup_uncharge_page(page);
623} 611}
624 612
625#ifdef CONFIG_DEBUG_VM 613#ifdef CONFIG_DEBUG_VM
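With the memcg calls gone, page_add_file_rmap() is back to keying everything off atomic_inc_and_test(&page->_mapcount): _mapcount starts at -1, so the increment yields zero exactly for the first mapping. A plain-int model of that convention:

#include <stdio.h>

static int inc_and_test(int *v)
{
        return ++*v == 0;               /* true only when the counter reaches 0 */
}

int main(void)
{
        int mapcount = -1;              /* freshly allocated page */

        for (int i = 1; i <= 3; i++) {
                int first = inc_and_test(&mapcount);

                printf("mapping %d: first mapping? %d (mapcount now %d)\n",
                       i, first, mapcount);
        }
        return 0;
}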
diff --git a/mm/shmem.c b/mm/shmem.c
index e2a6ae1a44e9..f92fea94d037 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -922,20 +922,26 @@ found:
922 error = 1; 922 error = 1;
923 if (!inode) 923 if (!inode)
924 goto out; 924 goto out;
925 /* Precharge page while we can wait, compensate afterwards */ 925 /* Precharge page using GFP_KERNEL while we can wait */
926 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 926 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
927 if (error) 927 if (error)
928 goto out; 928 goto out;
929 error = radix_tree_preload(GFP_KERNEL); 929 error = radix_tree_preload(GFP_KERNEL);
930 if (error) 930 if (error) {
931 goto uncharge; 931 mem_cgroup_uncharge_cache_page(page);
932 goto out;
933 }
932 error = 1; 934 error = 1;
933 935
934 spin_lock(&info->lock); 936 spin_lock(&info->lock);
935 ptr = shmem_swp_entry(info, idx, NULL); 937 ptr = shmem_swp_entry(info, idx, NULL);
936 if (ptr && ptr->val == entry.val) 938 if (ptr && ptr->val == entry.val) {
937 error = add_to_page_cache(page, inode->i_mapping, 939 error = add_to_page_cache(page, inode->i_mapping,
938 idx, GFP_NOWAIT); 940 idx, GFP_NOWAIT);
941 /* does mem_cgroup_uncharge_cache_page on error */
942 } else /* we must compensate for our precharge above */
943 mem_cgroup_uncharge_cache_page(page);
944
939 if (error == -EEXIST) { 945 if (error == -EEXIST) {
940 struct page *filepage = find_get_page(inode->i_mapping, idx); 946 struct page *filepage = find_get_page(inode->i_mapping, idx);
941 error = 1; 947 error = 1;
@@ -961,8 +967,6 @@ found:
961 shmem_swp_unmap(ptr); 967 shmem_swp_unmap(ptr);
962 spin_unlock(&info->lock); 968 spin_unlock(&info->lock);
963 radix_tree_preload_end(); 969 radix_tree_preload_end();
964uncharge:
965 mem_cgroup_uncharge_page(page);
966out: 970out:
967 unlock_page(page); 971 unlock_page(page);
968 page_cache_release(page); 972 page_cache_release(page);
@@ -1311,17 +1315,14 @@ repeat:
1311 shmem_swp_unmap(entry); 1315 shmem_swp_unmap(entry);
1312 spin_unlock(&info->lock); 1316 spin_unlock(&info->lock);
1313 unlock_page(swappage); 1317 unlock_page(swappage);
1318 page_cache_release(swappage);
1314 if (error == -ENOMEM) { 1319 if (error == -ENOMEM) {
1315 /* allow reclaim from this memory cgroup */ 1320 /* allow reclaim from this memory cgroup */
1316 error = mem_cgroup_cache_charge(swappage, 1321 error = mem_cgroup_shrink_usage(current->mm,
1317 current->mm, gfp & ~__GFP_HIGHMEM); 1322 gfp);
1318 if (error) { 1323 if (error)
1319 page_cache_release(swappage);
1320 goto failed; 1324 goto failed;
1321 }
1322 mem_cgroup_uncharge_page(swappage);
1323 } 1325 }
1324 page_cache_release(swappage);
1325 goto repeat; 1326 goto repeat;
1326 } 1327 }
1327 } else if (sgp == SGP_READ && !filepage) { 1328 } else if (sgp == SGP_READ && !filepage) {
@@ -1358,6 +1359,8 @@ repeat:
1358 } 1359 }
1359 1360
1360 if (!filepage) { 1361 if (!filepage) {
1362 int ret;
1363
1361 spin_unlock(&info->lock); 1364 spin_unlock(&info->lock);
1362 filepage = shmem_alloc_page(gfp, info, idx); 1365 filepage = shmem_alloc_page(gfp, info, idx);
1363 if (!filepage) { 1366 if (!filepage) {
@@ -1386,10 +1389,18 @@ repeat:
1386 swap = *entry; 1389 swap = *entry;
1387 shmem_swp_unmap(entry); 1390 shmem_swp_unmap(entry);
1388 } 1391 }
1389 if (error || swap.val || 0 != add_to_page_cache_lru( 1392 ret = error || swap.val;
1390 filepage, mapping, idx, GFP_NOWAIT)) { 1393 if (ret)
1394 mem_cgroup_uncharge_cache_page(filepage);
1395 else
1396 ret = add_to_page_cache_lru(filepage, mapping,
1397 idx, GFP_NOWAIT);
1398 /*
1399 * At add_to_page_cache_lru() failure, uncharge will
1400 * be done automatically.
1401 */
1402 if (ret) {
1391 spin_unlock(&info->lock); 1403 spin_unlock(&info->lock);
1392 mem_cgroup_uncharge_page(filepage);
1393 page_cache_release(filepage); 1404 page_cache_release(filepage);
1394 shmem_unacct_blocks(info->flags, 1); 1405 shmem_unacct_blocks(info->flags, 1);
1395 shmem_free_blocks(inode, 1); 1406 shmem_free_blocks(inode, 1);
@@ -1398,7 +1409,6 @@ repeat:
1398 goto failed; 1409 goto failed;
1399 goto repeat; 1410 goto repeat;
1400 } 1411 }
1401 mem_cgroup_uncharge_page(filepage);
1402 info->flags |= SHMEM_PAGEIN; 1412 info->flags |= SHMEM_PAGEIN;
1403 } 1413 }
1404 1414
@@ -1690,26 +1700,38 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1690 file_accessed(filp); 1700 file_accessed(filp);
1691} 1701}
1692 1702
1693static ssize_t shmem_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) 1703static ssize_t shmem_file_aio_read(struct kiocb *iocb,
1704 const struct iovec *iov, unsigned long nr_segs, loff_t pos)
1694{ 1705{
1695 read_descriptor_t desc; 1706 struct file *filp = iocb->ki_filp;
1707 ssize_t retval;
1708 unsigned long seg;
1709 size_t count;
1710 loff_t *ppos = &iocb->ki_pos;
1696 1711
1697 if ((ssize_t) count < 0) 1712 retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
1698 return -EINVAL; 1713 if (retval)
1699 if (!access_ok(VERIFY_WRITE, buf, count)) 1714 return retval;
1700 return -EFAULT;
1701 if (!count)
1702 return 0;
1703 1715
1704 desc.written = 0; 1716 for (seg = 0; seg < nr_segs; seg++) {
1705 desc.count = count; 1717 read_descriptor_t desc;
1706 desc.arg.buf = buf;
1707 desc.error = 0;
1708 1718
1709 do_shmem_file_read(filp, ppos, &desc, file_read_actor); 1719 desc.written = 0;
1710 if (desc.written) 1720 desc.arg.buf = iov[seg].iov_base;
1711 return desc.written; 1721 desc.count = iov[seg].iov_len;
1712 return desc.error; 1722 if (desc.count == 0)
1723 continue;
1724 desc.error = 0;
1725 do_shmem_file_read(filp, ppos, &desc, file_read_actor);
1726 retval += desc.written;
1727 if (desc.error) {
1728 retval = retval ?: desc.error;
1729 break;
1730 }
1731 if (desc.count > 0)
1732 break;
1733 }
1734 return retval;
1713} 1735}
1714 1736
1715static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) 1737static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -2369,8 +2391,9 @@ static const struct file_operations shmem_file_operations = {
2369 .mmap = shmem_mmap, 2391 .mmap = shmem_mmap,
2370#ifdef CONFIG_TMPFS 2392#ifdef CONFIG_TMPFS
2371 .llseek = generic_file_llseek, 2393 .llseek = generic_file_llseek,
2372 .read = shmem_file_read, 2394 .read = do_sync_read,
2373 .write = do_sync_write, 2395 .write = do_sync_write,
2396 .aio_read = shmem_file_aio_read,
2374 .aio_write = generic_file_aio_write, 2397 .aio_write = generic_file_aio_write,
2375 .fsync = simple_sync_file, 2398 .fsync = simple_sync_file,
2376 .splice_read = generic_file_splice_read, 2399 .splice_read = generic_file_splice_read,
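shmem_file_aio_read() now loops over the iovec itself, accumulating the bytes copied and stopping early on an error or a short segment. A userspace model of that per-segment loop, with the file reduced to a memory buffer and no locking or page cache:

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/uio.h>

static ssize_t read_model(const char *file, size_t filesize, size_t *pos,
                          const struct iovec *iov, int nr_segs)
{
        ssize_t retval = 0;

        for (int seg = 0; seg < nr_segs; seg++) {
                size_t want = iov[seg].iov_len;
                size_t have = (*pos < filesize) ? filesize - *pos : 0;
                size_t n = want < have ? want : have;

                if (want == 0)
                        continue;
                memcpy(iov[seg].iov_base, file + *pos, n);
                *pos += n;
                retval += n;
                if (n < want)           /* short read: keep what was copied */
                        break;
        }
        return retval;
}

int main(void)
{
        char a[4], b[8];
        struct iovec iov[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
        const char *data = "hello shmem";
        size_t pos = 0;

        printf("read %zd bytes\n", read_model(data, strlen(data), &pos, iov, 2));
        return 0;
}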
diff --git a/mm/slob.c b/mm/slob.c
index a3ad6671adf1..de268eb7ac70 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -130,17 +130,17 @@ static LIST_HEAD(free_slob_large);
130 */ 130 */
131static inline int slob_page(struct slob_page *sp) 131static inline int slob_page(struct slob_page *sp)
132{ 132{
133 return test_bit(PG_active, &sp->flags); 133 return PageSlobPage((struct page *)sp);
134} 134}
135 135
136static inline void set_slob_page(struct slob_page *sp) 136static inline void set_slob_page(struct slob_page *sp)
137{ 137{
138 __set_bit(PG_active, &sp->flags); 138 __SetPageSlobPage((struct page *)sp);
139} 139}
140 140
141static inline void clear_slob_page(struct slob_page *sp) 141static inline void clear_slob_page(struct slob_page *sp)
142{ 142{
143 __clear_bit(PG_active, &sp->flags); 143 __ClearPageSlobPage((struct page *)sp);
144} 144}
145 145
146/* 146/*
@@ -148,19 +148,19 @@ static inline void clear_slob_page(struct slob_page *sp)
148 */ 148 */
149static inline int slob_page_free(struct slob_page *sp) 149static inline int slob_page_free(struct slob_page *sp)
150{ 150{
151 return test_bit(PG_private, &sp->flags); 151 return PageSlobFree((struct page *)sp);
152} 152}
153 153
154static void set_slob_page_free(struct slob_page *sp, struct list_head *list) 154static void set_slob_page_free(struct slob_page *sp, struct list_head *list)
155{ 155{
156 list_add(&sp->list, list); 156 list_add(&sp->list, list);
157 __set_bit(PG_private, &sp->flags); 157 __SetPageSlobFree((struct page *)sp);
158} 158}
159 159
160static inline void clear_slob_page_free(struct slob_page *sp) 160static inline void clear_slob_page_free(struct slob_page *sp)
161{ 161{
162 list_del(&sp->list); 162 list_del(&sp->list);
163 __clear_bit(PG_private, &sp->flags); 163 __ClearPageSlobFree((struct page *)sp);
164} 164}
165 165
166#define SLOB_UNIT sizeof(slob_t) 166#define SLOB_UNIT sizeof(slob_t)
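The slob conversion replaces open-coded test_bit/__set_bit/__clear_bit on page->flags with generated PageSlobPage()/PageSlobFree() style accessors. A rough userspace imitation of that macro-generated accessor pattern; the struct, names and bit numbers are made up:

#include <stdio.h>

struct page { unsigned long flags; };

/* Generate PageFoo()/SetPageFoo()/ClearPageFoo() for one flag bit. */
#define PAGEFLAG(uname, bit)                                                             \
        static int  Page##uname(struct page *p)      { return (p->flags >> (bit)) & 1; } \
        static void SetPage##uname(struct page *p)   { p->flags |= 1UL << (bit); }       \
        static void ClearPage##uname(struct page *p) { p->flags &= ~(1UL << (bit)); }

PAGEFLAG(SlobPage, 6)
PAGEFLAG(SlobFree, 7)

int main(void)
{
        struct page pg = { 0 };

        SetPageSlobPage(&pg);
        SetPageSlobFree(&pg);
        printf("slob page? %d  free? %d\n", PageSlobPage(&pg), PageSlobFree(&pg));

        ClearPageSlobFree(&pg);
        ClearPageSlobPage(&pg);
        printf("slob page? %d  free? %d\n", PageSlobPage(&pg), PageSlobFree(&pg));
        return 0;
}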
diff --git a/mm/slub.c b/mm/slub.c
index 6d4a49c1ff2f..77c21cf53ff9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -102,44 +102,12 @@
102 * the fast path and disables lockless freelists. 102 * the fast path and disables lockless freelists.
103 */ 103 */
104 104
105#define FROZEN (1 << PG_active)
106
107#ifdef CONFIG_SLUB_DEBUG 105#ifdef CONFIG_SLUB_DEBUG
108#define SLABDEBUG (1 << PG_error) 106#define SLABDEBUG 1
109#else 107#else
110#define SLABDEBUG 0 108#define SLABDEBUG 0
111#endif 109#endif
112 110
113static inline int SlabFrozen(struct page *page)
114{
115 return page->flags & FROZEN;
116}
117
118static inline void SetSlabFrozen(struct page *page)
119{
120 page->flags |= FROZEN;
121}
122
123static inline void ClearSlabFrozen(struct page *page)
124{
125 page->flags &= ~FROZEN;
126}
127
128static inline int SlabDebug(struct page *page)
129{
130 return page->flags & SLABDEBUG;
131}
132
133static inline void SetSlabDebug(struct page *page)
134{
135 page->flags |= SLABDEBUG;
136}
137
138static inline void ClearSlabDebug(struct page *page)
139{
140 page->flags &= ~SLABDEBUG;
141}
142
143/* 111/*
144 * Issues still to be resolved: 112 * Issues still to be resolved:
145 * 113 *
@@ -971,7 +939,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
971 } 939 }
972 940
973 /* Special debug activities for freeing objects */ 941 /* Special debug activities for freeing objects */
974 if (!SlabFrozen(page) && !page->freelist) 942 if (!PageSlubFrozen(page) && !page->freelist)
975 remove_full(s, page); 943 remove_full(s, page);
976 if (s->flags & SLAB_STORE_USER) 944 if (s->flags & SLAB_STORE_USER)
977 set_track(s, object, TRACK_FREE, addr); 945 set_track(s, object, TRACK_FREE, addr);
@@ -1157,7 +1125,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1157 page->flags |= 1 << PG_slab; 1125 page->flags |= 1 << PG_slab;
1158 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 1126 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
1159 SLAB_STORE_USER | SLAB_TRACE)) 1127 SLAB_STORE_USER | SLAB_TRACE))
1160 SetSlabDebug(page); 1128 __SetPageSlubDebug(page);
1161 1129
1162 start = page_address(page); 1130 start = page_address(page);
1163 1131
@@ -1184,14 +1152,14 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1184 int order = compound_order(page); 1152 int order = compound_order(page);
1185 int pages = 1 << order; 1153 int pages = 1 << order;
1186 1154
1187 if (unlikely(SlabDebug(page))) { 1155 if (unlikely(SLABDEBUG && PageSlubDebug(page))) {
1188 void *p; 1156 void *p;
1189 1157
1190 slab_pad_check(s, page); 1158 slab_pad_check(s, page);
1191 for_each_object(p, s, page_address(page), 1159 for_each_object(p, s, page_address(page),
1192 page->objects) 1160 page->objects)
1193 check_object(s, page, p, 0); 1161 check_object(s, page, p, 0);
1194 ClearSlabDebug(page); 1162 __ClearPageSlubDebug(page);
1195 } 1163 }
1196 1164
1197 mod_zone_page_state(page_zone(page), 1165 mod_zone_page_state(page_zone(page),
@@ -1288,7 +1256,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
1288 if (slab_trylock(page)) { 1256 if (slab_trylock(page)) {
1289 list_del(&page->lru); 1257 list_del(&page->lru);
1290 n->nr_partial--; 1258 n->nr_partial--;
1291 SetSlabFrozen(page); 1259 __SetPageSlubFrozen(page);
1292 return 1; 1260 return 1;
1293 } 1261 }
1294 return 0; 1262 return 0;
@@ -1398,7 +1366,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1398 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1366 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1399 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); 1367 struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id());
1400 1368
1401 ClearSlabFrozen(page); 1369 __ClearPageSlubFrozen(page);
1402 if (page->inuse) { 1370 if (page->inuse) {
1403 1371
1404 if (page->freelist) { 1372 if (page->freelist) {
@@ -1406,7 +1374,8 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
1406 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); 1374 stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1407 } else { 1375 } else {
1408 stat(c, DEACTIVATE_FULL); 1376 stat(c, DEACTIVATE_FULL);
1409 if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) 1377 if (SLABDEBUG && PageSlubDebug(page) &&
1378 (s->flags & SLAB_STORE_USER))
1410 add_full(n, page); 1379 add_full(n, page);
1411 } 1380 }
1412 slab_unlock(page); 1381 slab_unlock(page);
@@ -1551,7 +1520,7 @@ load_freelist:
1551 object = c->page->freelist; 1520 object = c->page->freelist;
1552 if (unlikely(!object)) 1521 if (unlikely(!object))
1553 goto another_slab; 1522 goto another_slab;
1554 if (unlikely(SlabDebug(c->page))) 1523 if (unlikely(SLABDEBUG && PageSlubDebug(c->page)))
1555 goto debug; 1524 goto debug;
1556 1525
1557 c->freelist = object[c->offset]; 1526 c->freelist = object[c->offset];
@@ -1588,7 +1557,7 @@ new_slab:
1588 if (c->page) 1557 if (c->page)
1589 flush_slab(s, c); 1558 flush_slab(s, c);
1590 slab_lock(new); 1559 slab_lock(new);
1591 SetSlabFrozen(new); 1560 __SetPageSlubFrozen(new);
1592 c->page = new; 1561 c->page = new;
1593 goto load_freelist; 1562 goto load_freelist;
1594 } 1563 }
@@ -1674,7 +1643,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1674 stat(c, FREE_SLOWPATH); 1643 stat(c, FREE_SLOWPATH);
1675 slab_lock(page); 1644 slab_lock(page);
1676 1645
1677 if (unlikely(SlabDebug(page))) 1646 if (unlikely(SLABDEBUG && PageSlubDebug(page)))
1678 goto debug; 1647 goto debug;
1679 1648
1680checks_ok: 1649checks_ok:
@@ -1682,7 +1651,7 @@ checks_ok:
1682 page->freelist = object; 1651 page->freelist = object;
1683 page->inuse--; 1652 page->inuse--;
1684 1653
1685 if (unlikely(SlabFrozen(page))) { 1654 if (unlikely(PageSlubFrozen(page))) {
1686 stat(c, FREE_FROZEN); 1655 stat(c, FREE_FROZEN);
1687 goto out_unlock; 1656 goto out_unlock;
1688 } 1657 }
@@ -3317,12 +3286,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page,
3317 s->name, page); 3286 s->name, page);
3318 3287
3319 if (s->flags & DEBUG_DEFAULT_FLAGS) { 3288 if (s->flags & DEBUG_DEFAULT_FLAGS) {
3320 if (!SlabDebug(page)) 3289 if (!PageSlubDebug(page))
3321 printk(KERN_ERR "SLUB %s: SlabDebug not set " 3290 printk(KERN_ERR "SLUB %s: SlubDebug not set "
3322 "on slab 0x%p\n", s->name, page); 3291 "on slab 0x%p\n", s->name, page);
3323 } else { 3292 } else {
3324 if (SlabDebug(page)) 3293 if (PageSlubDebug(page))
3325 printk(KERN_ERR "SLUB %s: SlabDebug set on " 3294 printk(KERN_ERR "SLUB %s: SlubDebug set on "
3326 "slab 0x%p\n", s->name, page); 3295 "slab 0x%p\n", s->name, page);
3327 } 3296 }
3328} 3297}
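The slub hunks guard the debug-only paths with "SLABDEBUG && PageSlubDebug(page)", where SLABDEBUG is a compile-time 0 or 1, so a !CONFIG_SLUB_DEBUG build can drop the whole branch. A minimal illustration of that idiom:

#include <stdio.h>

#ifdef SLUB_DEBUG_MODEL
#define SLABDEBUG 1
#else
#define SLABDEBUG 0
#endif

static int page_debug_flag = 1;         /* stands in for PageSlubDebug(page) */

static void free_slab_model(void)
{
        if (SLABDEBUG && page_debug_flag)
                puts("running slab_pad_check()/check_object()");
        puts("freeing slab pages");
}

int main(void)
{
        free_slab_model();              /* build with -DSLUB_DEBUG_MODEL to enable checks */
        return 0;
}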
diff --git a/mm/sparse.c b/mm/sparse.c
index 36511c7b5e2c..8ffc08990008 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -12,6 +12,7 @@
12#include <asm/dma.h> 12#include <asm/dma.h>
13#include <asm/pgalloc.h> 13#include <asm/pgalloc.h>
14#include <asm/pgtable.h> 14#include <asm/pgtable.h>
15#include "internal.h"
15 16
16/* 17/*
17 * Permanent SPARSEMEM data: 18 * Permanent SPARSEMEM data:
@@ -147,22 +148,41 @@ static inline int sparse_early_nid(struct mem_section *section)
147 return (section->section_mem_map >> SECTION_NID_SHIFT); 148 return (section->section_mem_map >> SECTION_NID_SHIFT);
148} 149}
149 150
150/* Record a memory area against a node. */ 151/* Validate the physical addressing limitations of the model */
151void __init memory_present(int nid, unsigned long start, unsigned long end) 152void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
153 unsigned long *end_pfn)
152{ 154{
153 unsigned long max_arch_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT); 155 unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
154 unsigned long pfn;
155 156
156 /* 157 /*
157 * Sanity checks - do not allow an architecture to pass 158 * Sanity checks - do not allow an architecture to pass
158 * in larger pfns than the maximum scope of sparsemem: 159 * in larger pfns than the maximum scope of sparsemem:
159 */ 160 */
160 if (start >= max_arch_pfn) 161 if (*start_pfn > max_sparsemem_pfn) {
161 return; 162 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
162 if (end >= max_arch_pfn) 163 "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
163 end = max_arch_pfn; 164 *start_pfn, *end_pfn, max_sparsemem_pfn);
165 WARN_ON_ONCE(1);
166 *start_pfn = max_sparsemem_pfn;
167 *end_pfn = max_sparsemem_pfn;
168 }
169
170 if (*end_pfn > max_sparsemem_pfn) {
171 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
172 "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
173 *start_pfn, *end_pfn, max_sparsemem_pfn);
174 WARN_ON_ONCE(1);
175 *end_pfn = max_sparsemem_pfn;
176 }
177}
178
179/* Record a memory area against a node. */
180void __init memory_present(int nid, unsigned long start, unsigned long end)
181{
182 unsigned long pfn;
164 183
165 start &= PAGE_SECTION_MASK; 184 start &= PAGE_SECTION_MASK;
185 mminit_validate_memmodel_limits(&start, &end);
166 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { 186 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
167 unsigned long section = pfn_to_section_nr(pfn); 187 unsigned long section = pfn_to_section_nr(pfn);
168 struct mem_section *ms; 188 struct mem_section *ms;
@@ -187,6 +207,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
187 unsigned long pfn; 207 unsigned long pfn;
188 unsigned long nr_pages = 0; 208 unsigned long nr_pages = 0;
189 209
210 mminit_validate_memmodel_limits(&start_pfn, &end_pfn);
190 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { 211 for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
191 if (nid != early_pfn_to_nid(pfn)) 212 if (nid != early_pfn_to_nid(pfn))
192 continue; 213 continue;
@@ -248,16 +269,92 @@ static unsigned long *__kmalloc_section_usemap(void)
248} 269}
249#endif /* CONFIG_MEMORY_HOTPLUG */ 270#endif /* CONFIG_MEMORY_HOTPLUG */
250 271
272#ifdef CONFIG_MEMORY_HOTREMOVE
273static unsigned long * __init
274sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
275{
276 unsigned long section_nr;
277
278 /*
279 * A page may contain usemaps for other sections preventing the
280 * page being freed and making a section unremovable while
281 * other sections referencing the usemap remain active. Similarly,
282 * a pgdat can prevent a section being removed. If section A
283 * contains a pgdat and section B contains the usemap, both
284 * sections become inter-dependent. This allocates usemaps
285 * from the same section as the pgdat where possible to avoid
286 * this problem.
287 */
288 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
289 return alloc_bootmem_section(usemap_size(), section_nr);
290}
291
292static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
293{
294 unsigned long usemap_snr, pgdat_snr;
295 static unsigned long old_usemap_snr = NR_MEM_SECTIONS;
296 static unsigned long old_pgdat_snr = NR_MEM_SECTIONS;
297 struct pglist_data *pgdat = NODE_DATA(nid);
298 int usemap_nid;
299
300 usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
301 pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
302 if (usemap_snr == pgdat_snr)
303 return;
304
305 if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
306 /* skip redundant message */
307 return;
308
309 old_usemap_snr = usemap_snr;
310 old_pgdat_snr = pgdat_snr;
311
312 usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
313 if (usemap_nid != nid) {
314 printk(KERN_INFO
315 "node %d must be removed before remove section %ld\n",
316 nid, usemap_snr);
317 return;
318 }
319 /*
320 * There is a circular dependency.
321 * Some platforms allow un-removable section because they will just
322 * gather other removable sections for dynamic partitioning.
323 * Just notify un-removable section's number here.
324 */
325 printk(KERN_INFO "Section %ld and %ld (node %d)", usemap_snr,
326 pgdat_snr, nid);
327 printk(KERN_CONT
328 " have a circular dependency on usemap and pgdat allocations\n");
329}
330#else
331static unsigned long * __init
332sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat)
333{
334 return NULL;
335}
336
337static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
338{
339}
340#endif /* CONFIG_MEMORY_HOTREMOVE */
341
251static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) 342static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
252{ 343{
253 unsigned long *usemap; 344 unsigned long *usemap;
254 struct mem_section *ms = __nr_to_section(pnum); 345 struct mem_section *ms = __nr_to_section(pnum);
255 int nid = sparse_early_nid(ms); 346 int nid = sparse_early_nid(ms);
256 347
257 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); 348 usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid));
258 if (usemap) 349 if (usemap)
259 return usemap; 350 return usemap;
260 351
352 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
353 if (usemap) {
354 check_usemap_section_nr(nid, usemap);
355 return usemap;
356 }
357
261 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ 358 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
262 nid = 0; 359 nid = 0;
263 360
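
The sparse.c hunk above makes sparse_early_usemap_alloc() first try to place a section's usemap in the memory section that already holds the node's pgdat, and only then fall back to ordinary node-local bootmem (warning about the cross-section dependency) and, as a last resort, node 0. Below is a minimal userspace sketch of that fallback order; the two stub allocators are hypothetical stand-ins for alloc_bootmem_section() and alloc_bootmem_node(), not kernel APIs.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for alloc_bootmem_section(). */
static void *alloc_from_pgdat_section(int nid)
{
        (void)nid;              /* pretend the pgdat's section has no room left */
        return NULL;
}

/* Hypothetical stand-in for alloc_bootmem_node(). */
static void *alloc_from_node(int nid)
{
        (void)nid;              /* node-local bootmem fallback */
        return malloc(64);
}

/* Models the fallback order in sparse_early_usemap_alloc(). */
static void *usemap_alloc(int nid)
{
        void *usemap;

        /* 1. Try the section holding the node's pgdat, so usemap and
         *    pgdat do not pin two different sections. */
        usemap = alloc_from_pgdat_section(nid);
        if (usemap)
                return usemap;

        /* 2. Fall back to node-local boot memory (the kernel also warns
         *    about the cross-section dependency here, via
         *    check_usemap_section_nr()). */
        usemap = alloc_from_node(nid);
        if (usemap)
                return usemap;

        /* 3. Last resort in the kernel: retry on node 0. */
        return alloc_from_node(0);
}

int main(void)
{
        void *map = usemap_alloc(1);

        printf("usemap %s\n", map ? "allocated" : "missing");
        free(map);
        return 0;
}
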
diff --git a/mm/swap.c b/mm/swap.c
index 45c9f25a8a3b..dd89234ee51f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,9 +34,9 @@
34/* How many pages do we try to swap or page in/out together? */ 34/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 35int page_cluster;
36 36
37static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; 37static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs);
38static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; 38static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs);
39static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; 39static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
40 40
41/* 41/*
42 * This path almost never happens for VM activity - pages are normally 42 * This path almost never happens for VM activity - pages are normally
@@ -493,7 +493,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag);
493 */ 493 */
494#define ACCT_THRESHOLD max(16, NR_CPUS * 2) 494#define ACCT_THRESHOLD max(16, NR_CPUS * 2)
495 495
496static DEFINE_PER_CPU(long, committed_space) = 0; 496static DEFINE_PER_CPU(long, committed_space);
497 497
498void vm_acct_memory(long pages) 498void vm_acct_memory(long pages)
499{ 499{
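
The swap.c hunk only drops redundant initializers: per-CPU and static objects start out zeroed anyway, so the explicit "= { 0, }" / "= 0" adds noise and, depending on the toolchain, can keep the object out of zero-filled storage. A tiny, non-kernel illustration of the same rule for plain static variables:

#include <assert.h>
#include <stdio.h>

/* Objects with static storage duration are zero-initialized by the C
 * standard, so an explicit "= 0" adds nothing. */
static long committed_space;            /* implicitly 0 */
static int  page_cluster_like = 0;      /* explicit 0: same value, noisier */

int main(void)
{
        assert(committed_space == 0);
        assert(page_cluster_like == 0);
        printf("both start at zero\n");
        return 0;
}
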
diff --git a/mm/swapfile.c b/mm/swapfile.c
index bd1bb5920306..2f33edb8bee9 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -37,6 +37,7 @@ DEFINE_SPINLOCK(swap_lock);
37unsigned int nr_swapfiles; 37unsigned int nr_swapfiles;
38long total_swap_pages; 38long total_swap_pages;
39static int swap_overflow; 39static int swap_overflow;
40static int least_priority;
40 41
41static const char Bad_file[] = "Bad swap file entry "; 42static const char Bad_file[] = "Bad swap file entry ";
42static const char Unused_file[] = "Unused swap file entry "; 43static const char Unused_file[] = "Unused swap file entry ";
@@ -1260,6 +1261,11 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1260 /* just pick something that's safe... */ 1261 /* just pick something that's safe... */
1261 swap_list.next = swap_list.head; 1262 swap_list.next = swap_list.head;
1262 } 1263 }
1264 if (p->prio < 0) {
1265 for (i = p->next; i >= 0; i = swap_info[i].next)
1266 swap_info[i].prio = p->prio--;
1267 least_priority++;
1268 }
1263 nr_swap_pages -= p->pages; 1269 nr_swap_pages -= p->pages;
1264 total_swap_pages -= p->pages; 1270 total_swap_pages -= p->pages;
1265 p->flags &= ~SWP_WRITEOK; 1271 p->flags &= ~SWP_WRITEOK;
@@ -1272,9 +1278,14 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1272 if (err) { 1278 if (err) {
1273 /* re-insert swap space back into swap_list */ 1279 /* re-insert swap space back into swap_list */
1274 spin_lock(&swap_lock); 1280 spin_lock(&swap_lock);
1275 for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) 1281 if (p->prio < 0)
1282 p->prio = --least_priority;
1283 prev = -1;
1284 for (i = swap_list.head; i >= 0; i = swap_info[i].next) {
1276 if (p->prio >= swap_info[i].prio) 1285 if (p->prio >= swap_info[i].prio)
1277 break; 1286 break;
1287 prev = i;
1288 }
1278 p->next = i; 1289 p->next = i;
1279 if (prev < 0) 1290 if (prev < 0)
1280 swap_list.head = swap_list.next = p - swap_info; 1291 swap_list.head = swap_list.next = p - swap_info;
@@ -1447,7 +1458,6 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1447 unsigned int type; 1458 unsigned int type;
1448 int i, prev; 1459 int i, prev;
1449 int error; 1460 int error;
1450 static int least_priority;
1451 union swap_header *swap_header = NULL; 1461 union swap_header *swap_header = NULL;
1452 int swap_header_version; 1462 int swap_header_version;
1453 unsigned int nr_good_pages = 0; 1463 unsigned int nr_good_pages = 0;
@@ -1455,7 +1465,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1455 sector_t span; 1465 sector_t span;
1456 unsigned long maxpages = 1; 1466 unsigned long maxpages = 1;
1457 int swapfilesize; 1467 int swapfilesize;
1458 unsigned short *swap_map; 1468 unsigned short *swap_map = NULL;
1459 struct page *page = NULL; 1469 struct page *page = NULL;
1460 struct inode *inode = NULL; 1470 struct inode *inode = NULL;
1461 int did_down = 0; 1471 int did_down = 0;
@@ -1474,22 +1484,10 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1474 } 1484 }
1475 if (type >= nr_swapfiles) 1485 if (type >= nr_swapfiles)
1476 nr_swapfiles = type+1; 1486 nr_swapfiles = type+1;
1487 memset(p, 0, sizeof(*p));
1477 INIT_LIST_HEAD(&p->extent_list); 1488 INIT_LIST_HEAD(&p->extent_list);
1478 p->flags = SWP_USED; 1489 p->flags = SWP_USED;
1479 p->swap_file = NULL;
1480 p->old_block_size = 0;
1481 p->swap_map = NULL;
1482 p->lowest_bit = 0;
1483 p->highest_bit = 0;
1484 p->cluster_nr = 0;
1485 p->inuse_pages = 0;
1486 p->next = -1; 1490 p->next = -1;
1487 if (swap_flags & SWAP_FLAG_PREFER) {
1488 p->prio =
1489 (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT;
1490 } else {
1491 p->prio = --least_priority;
1492 }
1493 spin_unlock(&swap_lock); 1491 spin_unlock(&swap_lock);
1494 name = getname(specialfile); 1492 name = getname(specialfile);
1495 error = PTR_ERR(name); 1493 error = PTR_ERR(name);
@@ -1632,19 +1630,20 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1632 goto bad_swap; 1630 goto bad_swap;
1633 1631
1634 /* OK, set up the swap map and apply the bad block list */ 1632 /* OK, set up the swap map and apply the bad block list */
1635 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { 1633 swap_map = vmalloc(maxpages * sizeof(short));
1634 if (!swap_map) {
1636 error = -ENOMEM; 1635 error = -ENOMEM;
1637 goto bad_swap; 1636 goto bad_swap;
1638 } 1637 }
1639 1638
1640 error = 0; 1639 error = 0;
1641 memset(p->swap_map, 0, maxpages * sizeof(short)); 1640 memset(swap_map, 0, maxpages * sizeof(short));
1642 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1641 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1643 int page_nr = swap_header->info.badpages[i]; 1642 int page_nr = swap_header->info.badpages[i];
1644 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) 1643 if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
1645 error = -EINVAL; 1644 error = -EINVAL;
1646 else 1645 else
1647 p->swap_map[page_nr] = SWAP_MAP_BAD; 1646 swap_map[page_nr] = SWAP_MAP_BAD;
1648 } 1647 }
1649 nr_good_pages = swap_header->info.last_page - 1648 nr_good_pages = swap_header->info.last_page -
1650 swap_header->info.nr_badpages - 1649 swap_header->info.nr_badpages -
@@ -1654,7 +1653,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1654 } 1653 }
1655 1654
1656 if (nr_good_pages) { 1655 if (nr_good_pages) {
1657 p->swap_map[0] = SWAP_MAP_BAD; 1656 swap_map[0] = SWAP_MAP_BAD;
1658 p->max = maxpages; 1657 p->max = maxpages;
1659 p->pages = nr_good_pages; 1658 p->pages = nr_good_pages;
1660 nr_extents = setup_swap_extents(p, &span); 1659 nr_extents = setup_swap_extents(p, &span);
@@ -1672,6 +1671,12 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1672 1671
1673 mutex_lock(&swapon_mutex); 1672 mutex_lock(&swapon_mutex);
1674 spin_lock(&swap_lock); 1673 spin_lock(&swap_lock);
1674 if (swap_flags & SWAP_FLAG_PREFER)
1675 p->prio =
1676 (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
1677 else
1678 p->prio = --least_priority;
1679 p->swap_map = swap_map;
1675 p->flags = SWP_ACTIVE; 1680 p->flags = SWP_ACTIVE;
1676 nr_swap_pages += nr_good_pages; 1681 nr_swap_pages += nr_good_pages;
1677 total_swap_pages += nr_good_pages; 1682 total_swap_pages += nr_good_pages;
@@ -1707,12 +1712,8 @@ bad_swap:
1707 destroy_swap_extents(p); 1712 destroy_swap_extents(p);
1708bad_swap_2: 1713bad_swap_2:
1709 spin_lock(&swap_lock); 1714 spin_lock(&swap_lock);
1710 swap_map = p->swap_map;
1711 p->swap_file = NULL; 1715 p->swap_file = NULL;
1712 p->swap_map = NULL;
1713 p->flags = 0; 1716 p->flags = 0;
1714 if (!(swap_flags & SWAP_FLAG_PREFER))
1715 ++least_priority;
1716 spin_unlock(&swap_lock); 1717 spin_unlock(&swap_lock);
1717 vfree(swap_map); 1718 vfree(swap_map);
1718 if (swap_file) 1719 if (swap_file)
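
The swapfile.c changes move least_priority from a function-local static in sys_swapon() to file scope so that sys_swapoff() can hand back auto-assigned (negative) priorities, and they defer setting p->prio and p->swap_map until the swap area is fully set up under swap_lock. A rough userspace model of just the default-priority bookkeeping, assuming (as the diff does) that user-preferred priorities are non-negative and defaults count down from -1:

#include <stdio.h>

/* File-scope counter, as in the diff: auto-assigned priorities are
 * -1, -2, -3, ... in swapon order. */
static int least_priority;

/* swapon: take the user-supplied priority or the next default. */
static int assign_prio(int has_pref, int pref)
{
        return has_pref ? pref : --least_priority;
}

/* swapoff of an auto-prioritized area: hand its slot back so later
 * swapons can reuse it (the kernel additionally renumbers the
 * remaining lower-priority areas, which this sketch omits). */
static void release_prio(int prio)
{
        if (prio < 0)
                least_priority++;
}

int main(void)
{
        int a = assign_prio(0, 0);      /* -1 */
        int b = assign_prio(0, 0);      /* -2 */
        int c = assign_prio(1, 5);      /*  5, user-preferred */

        release_prio(b);                /* least_priority back to -1 */
        printf("a=%d b(freed)=%d c=%d next default=%d\n",
               a, b, c, least_priority - 1);
        return 0;
}
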
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6e45b0f3d125..35f293816294 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -931,6 +931,25 @@ static void s_stop(struct seq_file *m, void *p)
931 read_unlock(&vmlist_lock); 931 read_unlock(&vmlist_lock);
932} 932}
933 933
934static void show_numa_info(struct seq_file *m, struct vm_struct *v)
935{
936 if (NUMA_BUILD) {
937 unsigned int nr, *counters = m->private;
938
939 if (!counters)
940 return;
941
942 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
943
944 for (nr = 0; nr < v->nr_pages; nr++)
945 counters[page_to_nid(v->pages[nr])]++;
946
947 for_each_node_state(nr, N_HIGH_MEMORY)
948 if (counters[nr])
949 seq_printf(m, " N%u=%u", nr, counters[nr]);
950 }
951}
952
934static int s_show(struct seq_file *m, void *p) 953static int s_show(struct seq_file *m, void *p)
935{ 954{
936 struct vm_struct *v = p; 955 struct vm_struct *v = p;
@@ -967,6 +986,7 @@ static int s_show(struct seq_file *m, void *p)
967 if (v->flags & VM_VPAGES) 986 if (v->flags & VM_VPAGES)
968 seq_printf(m, " vpages"); 987 seq_printf(m, " vpages");
969 988
989 show_numa_info(m, v);
970 seq_putc(m, '\n'); 990 seq_putc(m, '\n');
971 return 0; 991 return 0;
972} 992}
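
show_numa_info(), added above, annotates each /proc/vmallocinfo line with how many of an area's pages came from each NUMA node: it clears a per-node counter array, walks v->pages incrementing the counter for page_to_nid() of each page, and prints only the non-zero entries as " N<node>=<count>". A standalone sketch of the same counting pattern (the node IDs and the NR_NODES bound are made up for illustration):

#include <stdio.h>
#include <string.h>

#define NR_NODES 4

/* Print " N<node>=<count>" for every node that contributed pages,
 * mirroring the loop structure of show_numa_info(). */
static void show_numa_counts(const int *page_nids, unsigned int nr_pages)
{
        unsigned int counters[NR_NODES];
        unsigned int nr;

        memset(counters, 0, sizeof(counters));

        for (nr = 0; nr < nr_pages; nr++)
                counters[page_nids[nr]]++;

        for (nr = 0; nr < NR_NODES; nr++)
                if (counters[nr])
                        printf(" N%u=%u", nr, counters[nr]);
        printf("\n");
}

int main(void)
{
        /* Pretend a vmalloc area got 6 pages: 4 from node 0, 2 from node 2. */
        int nids[] = { 0, 0, 2, 0, 2, 0 };

        show_numa_counts(nids, sizeof(nids) / sizeof(nids[0]));
        return 0;
}
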
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 967d30ccd92b..26672c6cd3ce 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -38,6 +38,7 @@
38#include <linux/kthread.h> 38#include <linux/kthread.h>
39#include <linux/freezer.h> 39#include <linux/freezer.h>
40#include <linux/memcontrol.h> 40#include <linux/memcontrol.h>
41#include <linux/delayacct.h>
41 42
42#include <asm/tlbflush.h> 43#include <asm/tlbflush.h>
43#include <asm/div64.h> 44#include <asm/div64.h>
@@ -1316,6 +1317,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1316 struct zone *zone; 1317 struct zone *zone;
1317 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1318 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1318 1319
1320 delayacct_freepages_start();
1321
1319 if (scan_global_lru(sc)) 1322 if (scan_global_lru(sc))
1320 count_vm_event(ALLOCSTALL); 1323 count_vm_event(ALLOCSTALL);
1321 /* 1324 /*
@@ -1396,6 +1399,8 @@ out:
1396 } else 1399 } else
1397 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); 1400 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1398 1401
1402 delayacct_freepages_end();
1403
1399 return ret; 1404 return ret;
1400} 1405}
1401 1406
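
The vmscan.c hunks bracket do_try_to_free_pages() with delayacct_freepages_start()/delayacct_freepages_end() so per-task delay accounting can report how long a task spent in direct reclaim. The pattern is simply a start/stop pair around the expensive region; the userspace analogue below uses clock_gettime() purely as an illustration and is not how the kernel's delayacct code measures the delay.

#include <stdio.h>
#include <time.h>
#include <unistd.h>

static struct timespec reclaim_start;

static void freepages_start(void)       /* cf. delayacct_freepages_start() */
{
        clock_gettime(CLOCK_MONOTONIC, &reclaim_start);
}

static void freepages_end(void)         /* cf. delayacct_freepages_end() */
{
        struct timespec now;

        clock_gettime(CLOCK_MONOTONIC, &now);
        printf("spent %ld us reclaiming\n",
               (long)((now.tv_sec - reclaim_start.tv_sec) * 1000000L +
                      (now.tv_nsec - reclaim_start.tv_nsec) / 1000L));
}

int main(void)
{
        freepages_start();
        usleep(10000);                  /* stand-in for the actual reclaim work */
        freepages_end();
        return 0;
}
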
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c3d4a781802f..b0d08e667ece 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -13,6 +13,7 @@
13#include <linux/err.h> 13#include <linux/err.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/cpu.h> 15#include <linux/cpu.h>
16#include <linux/vmstat.h>
16#include <linux/sched.h> 17#include <linux/sched.h>
17 18
18#ifdef CONFIG_VM_EVENT_COUNTERS 19#ifdef CONFIG_VM_EVENT_COUNTERS