Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              12
-rw-r--r--  mm/bootmem.c           196
-rw-r--r--  mm/dmapool.c            12
-rw-r--r--  mm/fadvise.c             2
-rw-r--r--  mm/filemap.c            10
-rw-r--r--  mm/filemap_xip.c       200
-rw-r--r--  mm/hugetlb.c            78
-rw-r--r--  mm/internal.h            3
-rw-r--r--  mm/madvise.c             2
-rw-r--r--  mm/memory.c            228
-rw-r--r--  mm/memory_hotplug.c    186
-rw-r--r--  mm/mempolicy.c        1051
-rw-r--r--  mm/mincore.c             2
-rw-r--r--  mm/mmap.c               33
-rw-r--r--  mm/mmzone.c             30
-rw-r--r--  mm/nommu.c               6
-rw-r--r--  mm/oom_kill.c           58
-rw-r--r--  mm/page_alloc.c        274
-rw-r--r--  mm/pagewalk.c            8
-rw-r--r--  mm/rmap.c                8
-rw-r--r--  mm/shmem.c             144
-rw-r--r--  mm/slab.c               17
-rw-r--r--  mm/slub.c               18
-rw-r--r--  mm/sparse.c            145
-rw-r--r--  mm/swap.c               37
-rw-r--r--  mm/swapfile.c            8
-rw-r--r--  mm/truncate.c           11
-rw-r--r--  mm/vmalloc.c           141
-rw-r--r--  mm/vmscan.c             46
-rw-r--r--  mm/vmstat.c             11
30 files changed, 1919 insertions, 1058 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 0016ebd4dcba..3aa819d628c1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -143,6 +143,18 @@ config MEMORY_HOTREMOVE
143 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE 143 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
144 depends on MIGRATION 144 depends on MIGRATION
145 145
146#
147# If we have space for more page flags then we can enable additional
148# optimizations and functionality.
149#
150# Regular Sparsemem takes page flag bits for the sectionid if it does not
151# use a virtual memmap. Disable extended page flags for 32 bit platforms
152# that require the use of a sectionid in the page flags.
153#
154config PAGEFLAGS_EXTENDED
155 def_bool y
156 depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM
157
146# Heavily threaded applications may benefit from splitting the mm-wide 158# Heavily threaded applications may benefit from splitting the mm-wide
147# page_table_lock, so that faults on different parts of the user address 159# page_table_lock, so that faults on different parts of the user address
148# space can be handled with less contention: split it at this NR_CPUS. 160# space can be handled with less contention: split it at this NR_CPUS.
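The PAGEFLAGS_EXTENDED option added above is just a boolean over existing config symbols: extra page-flag optimizations are assumed available unless a 32-bit NUMA SPARSEMEM build without a virtual memmap needs page->flags bits for the section id. A minimal user-space sketch of that def_bool expression (the function name and the sample configurations are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the PAGEFLAGS_EXTENDED "depends on" line:
 * 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM */
static bool pageflags_extended(bool is_64bit, bool sparsemem_vmemmap,
                               bool numa, bool sparsemem)
{
    return is_64bit || sparsemem_vmemmap || !numa || !sparsemem;
}

int main(void)
{
    /* 32-bit NUMA sparsemem without vmemmap: the section id lives in
     * page->flags, so the extended flags stay disabled. */
    printf("32-bit NUMA sparsemem, no vmemmap: %d\n",
           pageflags_extended(false, false, true, true));
    /* 64-bit always qualifies. */
    printf("64-bit: %d\n", pageflags_extended(true, false, true, true));
    return 0;
}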
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2ccea700968f..e8fb927392b9 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -111,44 +111,74 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
111 * might be used for boot-time allocations - or it might get added 111 * might be used for boot-time allocations - or it might get added
112 * to the free page pool later on. 112 * to the free page pool later on.
113 */ 113 */
114static int __init reserve_bootmem_core(bootmem_data_t *bdata, 114static int __init can_reserve_bootmem_core(bootmem_data_t *bdata,
115 unsigned long addr, unsigned long size, int flags) 115 unsigned long addr, unsigned long size, int flags)
116{ 116{
117 unsigned long sidx, eidx; 117 unsigned long sidx, eidx;
118 unsigned long i; 118 unsigned long i;
119 int ret; 119
120 BUG_ON(!size);
121
122 /* out of range, don't hold other */
123 if (addr + size < bdata->node_boot_start ||
124 PFN_DOWN(addr) > bdata->node_low_pfn)
125 return 0;
120 126
121 /* 127 /*
122 * round up, partially reserved pages are considered 128 * Round up to index to the range.
123 * fully reserved.
124 */ 129 */
130 if (addr > bdata->node_boot_start)
131 sidx= PFN_DOWN(addr - bdata->node_boot_start);
132 else
133 sidx = 0;
134
135 eidx = PFN_UP(addr + size - bdata->node_boot_start);
136 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start))
137 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
138
139 for (i = sidx; i < eidx; i++) {
140 if (test_bit(i, bdata->node_bootmem_map)) {
141 if (flags & BOOTMEM_EXCLUSIVE)
142 return -EBUSY;
143 }
144 }
145
146 return 0;
147
148}
149
150static void __init reserve_bootmem_core(bootmem_data_t *bdata,
151 unsigned long addr, unsigned long size, int flags)
152{
153 unsigned long sidx, eidx;
154 unsigned long i;
155
125 BUG_ON(!size); 156 BUG_ON(!size);
126 BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn);
127 BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn);
128 BUG_ON(addr < bdata->node_boot_start);
129 157
130 sidx = PFN_DOWN(addr - bdata->node_boot_start); 158 /* out of range */
159 if (addr + size < bdata->node_boot_start ||
160 PFN_DOWN(addr) > bdata->node_low_pfn)
161 return;
162
163 /*
164 * Round up to index to the range.
165 */
166 if (addr > bdata->node_boot_start)
167 sidx= PFN_DOWN(addr - bdata->node_boot_start);
168 else
169 sidx = 0;
170
131 eidx = PFN_UP(addr + size - bdata->node_boot_start); 171 eidx = PFN_UP(addr + size - bdata->node_boot_start);
172 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start))
173 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
132 174
133 for (i = sidx; i < eidx; i++) 175 for (i = sidx; i < eidx; i++) {
134 if (test_and_set_bit(i, bdata->node_bootmem_map)) { 176 if (test_and_set_bit(i, bdata->node_bootmem_map)) {
135#ifdef CONFIG_DEBUG_BOOTMEM 177#ifdef CONFIG_DEBUG_BOOTMEM
136 printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); 178 printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
137#endif 179#endif
138 if (flags & BOOTMEM_EXCLUSIVE) {
139 ret = -EBUSY;
140 goto err;
141 }
142 } 180 }
143 181 }
144 return 0;
145
146err:
147 /* unreserve memory we accidentally reserved */
148 for (i--; i >= sidx; i--)
149 clear_bit(i, bdata->node_bootmem_map);
150
151 return ret;
152} 182}
153 183
154static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, 184static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
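Both helpers above translate a physical range into bitmap indexes with the same rounding: the start is rounded down and the end rounded up, so partially covered pages count as fully reserved, and the end index is clamped to the node's last pfn. A small stand-alone sketch of that arithmetic (PFN_DOWN/PFN_UP are reimplemented here for the example, with a 4 KiB page size and made-up addresses):

#include <stdio.h>

/* Illustration of the index math in can_reserve_bootmem_core() /
 * reserve_bootmem_core(). PAGE_SHIFT is assumed to be 12 here. */
#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

int main(void)
{
    unsigned long node_boot_start = 0x100000;   /* example values */
    unsigned long node_low_pfn    = 0x1000;
    unsigned long addr = 0x101800, size = 0x2400;
    unsigned long sidx, eidx;

    sidx = addr > node_boot_start ? PFN_DOWN(addr - node_boot_start) : 0;
    eidx = PFN_UP(addr + size - node_boot_start);
    if (eidx > node_low_pfn - PFN_DOWN(node_boot_start))
        eidx = node_low_pfn - PFN_DOWN(node_boot_start);

    /* bits [sidx, eidx) of node_bootmem_map would be set */
    printf("reserve bitmap indexes [%lu, %lu)\n", sidx, eidx);
    return 0;
}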
@@ -206,9 +236,11 @@ void * __init
206__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, 236__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
207 unsigned long align, unsigned long goal, unsigned long limit) 237 unsigned long align, unsigned long goal, unsigned long limit)
208{ 238{
209 unsigned long offset, remaining_size, areasize, preferred; 239 unsigned long areasize, preferred;
210 unsigned long i, start = 0, incr, eidx, end_pfn; 240 unsigned long i, start = 0, incr, eidx, end_pfn;
211 void *ret; 241 void *ret;
242 unsigned long node_boot_start;
243 void *node_bootmem_map;
212 244
213 if (!size) { 245 if (!size) {
214 printk("__alloc_bootmem_core(): zero-sized request\n"); 246 printk("__alloc_bootmem_core(): zero-sized request\n");
@@ -216,70 +248,83 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
216 } 248 }
217 BUG_ON(align & (align-1)); 249 BUG_ON(align & (align-1));
218 250
219 if (limit && bdata->node_boot_start >= limit)
220 return NULL;
221
222 /* on nodes without memory - bootmem_map is NULL */ 251 /* on nodes without memory - bootmem_map is NULL */
223 if (!bdata->node_bootmem_map) 252 if (!bdata->node_bootmem_map)
224 return NULL; 253 return NULL;
225 254
255 /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */
256 node_boot_start = bdata->node_boot_start;
257 node_bootmem_map = bdata->node_bootmem_map;
258 if (align) {
259 node_boot_start = ALIGN(bdata->node_boot_start, align);
260 if (node_boot_start > bdata->node_boot_start)
261 node_bootmem_map = (unsigned long *)bdata->node_bootmem_map +
262 PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG;
263 }
264
265 if (limit && node_boot_start >= limit)
266 return NULL;
267
226 end_pfn = bdata->node_low_pfn; 268 end_pfn = bdata->node_low_pfn;
227 limit = PFN_DOWN(limit); 269 limit = PFN_DOWN(limit);
228 if (limit && end_pfn > limit) 270 if (limit && end_pfn > limit)
229 end_pfn = limit; 271 end_pfn = limit;
230 272
231 eidx = end_pfn - PFN_DOWN(bdata->node_boot_start); 273 eidx = end_pfn - PFN_DOWN(node_boot_start);
232 offset = 0;
233 if (align && (bdata->node_boot_start & (align - 1UL)) != 0)
234 offset = align - (bdata->node_boot_start & (align - 1UL));
235 offset = PFN_DOWN(offset);
236 274
237 /* 275 /*
238 * We try to allocate bootmem pages above 'goal' 276 * We try to allocate bootmem pages above 'goal'
239 * first, then we try to allocate lower pages. 277 * first, then we try to allocate lower pages.
240 */ 278 */
241 if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) { 279 preferred = 0;
242 preferred = goal - bdata->node_boot_start; 280 if (goal && PFN_DOWN(goal) < end_pfn) {
281 if (goal > node_boot_start)
282 preferred = goal - node_boot_start;
243 283
244 if (bdata->last_success >= preferred) 284 if (bdata->last_success > node_boot_start &&
285 bdata->last_success - node_boot_start >= preferred)
245 if (!limit || (limit && limit > bdata->last_success)) 286 if (!limit || (limit && limit > bdata->last_success))
246 preferred = bdata->last_success; 287 preferred = bdata->last_success - node_boot_start;
247 } else 288 }
248 preferred = 0;
249 289
250 preferred = PFN_DOWN(ALIGN(preferred, align)) + offset; 290 preferred = PFN_DOWN(ALIGN(preferred, align));
251 areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; 291 areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
252 incr = align >> PAGE_SHIFT ? : 1; 292 incr = align >> PAGE_SHIFT ? : 1;
253 293
254restart_scan: 294restart_scan:
255 for (i = preferred; i < eidx; i += incr) { 295 for (i = preferred; i < eidx;) {
256 unsigned long j; 296 unsigned long j;
257 i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i); 297
298 i = find_next_zero_bit(node_bootmem_map, eidx, i);
258 i = ALIGN(i, incr); 299 i = ALIGN(i, incr);
259 if (i >= eidx) 300 if (i >= eidx)
260 break; 301 break;
261 if (test_bit(i, bdata->node_bootmem_map)) 302 if (test_bit(i, node_bootmem_map)) {
303 i += incr;
262 continue; 304 continue;
305 }
263 for (j = i + 1; j < i + areasize; ++j) { 306 for (j = i + 1; j < i + areasize; ++j) {
264 if (j >= eidx) 307 if (j >= eidx)
265 goto fail_block; 308 goto fail_block;
266 if (test_bit(j, bdata->node_bootmem_map)) 309 if (test_bit(j, node_bootmem_map))
267 goto fail_block; 310 goto fail_block;
268 } 311 }
269 start = i; 312 start = i;
270 goto found; 313 goto found;
271 fail_block: 314 fail_block:
272 i = ALIGN(j, incr); 315 i = ALIGN(j, incr);
316 if (i == j)
317 i += incr;
273 } 318 }
274 319
275 if (preferred > offset) { 320 if (preferred > 0) {
276 preferred = offset; 321 preferred = 0;
277 goto restart_scan; 322 goto restart_scan;
278 } 323 }
279 return NULL; 324 return NULL;
280 325
281found: 326found:
282 bdata->last_success = PFN_PHYS(start); 327 bdata->last_success = PFN_PHYS(start) + node_boot_start;
283 BUG_ON(start >= eidx); 328 BUG_ON(start >= eidx);
284 329
285 /* 330 /*
@@ -289,6 +334,7 @@ found:
289 */ 334 */
290 if (align < PAGE_SIZE && 335 if (align < PAGE_SIZE &&
291 bdata->last_offset && bdata->last_pos+1 == start) { 336 bdata->last_offset && bdata->last_pos+1 == start) {
337 unsigned long offset, remaining_size;
292 offset = ALIGN(bdata->last_offset, align); 338 offset = ALIGN(bdata->last_offset, align);
293 BUG_ON(offset > PAGE_SIZE); 339 BUG_ON(offset > PAGE_SIZE);
294 remaining_size = PAGE_SIZE - offset; 340 remaining_size = PAGE_SIZE - offset;
@@ -297,14 +343,12 @@ found:
297 /* last_pos unchanged */ 343 /* last_pos unchanged */
298 bdata->last_offset = offset + size; 344 bdata->last_offset = offset + size;
299 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + 345 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
300 offset + 346 offset + node_boot_start);
301 bdata->node_boot_start);
302 } else { 347 } else {
303 remaining_size = size - remaining_size; 348 remaining_size = size - remaining_size;
304 areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE; 349 areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
305 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + 350 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
306 offset + 351 offset + node_boot_start);
307 bdata->node_boot_start);
308 bdata->last_pos = start + areasize - 1; 352 bdata->last_pos = start + areasize - 1;
309 bdata->last_offset = remaining_size; 353 bdata->last_offset = remaining_size;
310 } 354 }
@@ -312,14 +356,14 @@ found:
312 } else { 356 } else {
313 bdata->last_pos = start + areasize - 1; 357 bdata->last_pos = start + areasize - 1;
314 bdata->last_offset = size & ~PAGE_MASK; 358 bdata->last_offset = size & ~PAGE_MASK;
315 ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start); 359 ret = phys_to_virt(start * PAGE_SIZE + node_boot_start);
316 } 360 }
317 361
318 /* 362 /*
319 * Reserve the area now: 363 * Reserve the area now:
320 */ 364 */
321 for (i = start; i < start + areasize; i++) 365 for (i = start; i < start + areasize; i++)
322 if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map))) 366 if (unlikely(test_and_set_bit(i, node_bootmem_map)))
323 BUG(); 367 BUG();
324 memset(ret, 0, size); 368 memset(ret, 0, size);
325 return ret; 369 return ret;
@@ -401,6 +445,11 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
401void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 445void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
402 unsigned long size, int flags) 446 unsigned long size, int flags)
403{ 447{
448 int ret;
449
450 ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
451 if (ret < 0)
452 return;
404 reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); 453 reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
405} 454}
406 455
@@ -412,6 +461,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
412 461
413unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 462unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
414{ 463{
464 register_page_bootmem_info_node(pgdat);
415 return free_all_bootmem_core(pgdat); 465 return free_all_bootmem_core(pgdat);
416} 466}
417 467
@@ -426,7 +476,18 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
426int __init reserve_bootmem(unsigned long addr, unsigned long size, 476int __init reserve_bootmem(unsigned long addr, unsigned long size,
427 int flags) 477 int flags)
428{ 478{
429 return reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size, flags); 479 bootmem_data_t *bdata;
480 int ret;
481
482 list_for_each_entry(bdata, &bdata_list, list) {
483 ret = can_reserve_bootmem_core(bdata, addr, size, flags);
484 if (ret < 0)
485 return ret;
486 }
487 list_for_each_entry(bdata, &bdata_list, list)
488 reserve_bootmem_core(bdata, addr, size, flags);
489
490 return 0;
430} 491}
431#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 492#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
432 493
@@ -484,6 +545,37 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
484 return __alloc_bootmem(size, align, goal); 545 return __alloc_bootmem(size, align, goal);
485} 546}
486 547
548#ifdef CONFIG_SPARSEMEM
549void * __init alloc_bootmem_section(unsigned long size,
550 unsigned long section_nr)
551{
552 void *ptr;
553 unsigned long limit, goal, start_nr, end_nr, pfn;
554 struct pglist_data *pgdat;
555
556 pfn = section_nr_to_pfn(section_nr);
557 goal = PFN_PHYS(pfn);
558 limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1;
559 pgdat = NODE_DATA(early_pfn_to_nid(pfn));
560 ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal,
561 limit);
562
563 if (!ptr)
564 return NULL;
565
566 start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr)));
567 end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size));
568 if (start_nr != section_nr || end_nr != section_nr) {
569 printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n",
570 section_nr);
571 free_bootmem_core(pgdat->bdata, __pa(ptr), size);
572 ptr = NULL;
573 }
574
575 return ptr;
576}
577#endif
578
487#ifndef ARCH_LOW_ADDRESS_LIMIT 579#ifndef ARCH_LOW_ADDRESS_LIMIT
488#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL 580#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
489#endif 581#endif
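The reworked reserve_bootmem() above walks bdata_list twice: can_reserve_bootmem_core() checks every node's bitmap first, and only then does reserve_bootmem_core() commit the bits, which is why the old "unreserve memory we accidentally reserved" rollback could be dropped. A generic sketch of that validate-then-commit pattern (plain arrays stand in for the per-node bitmaps; all names and sizes here are invented):

#include <stdbool.h>
#include <stdio.h>

#define NODES 3
#define BITS  16

static bool used[NODES][BITS];

/* pass 1 helper: report a clash without touching anything */
static int can_reserve(int node, int start, int len, bool exclusive)
{
    for (int i = start; i < start + len; i++)
        if (used[node][i] && exclusive)
            return -1;
    return 0;
}

/* pass 2 helper: actually mark the range */
static void do_reserve(int node, int start, int len)
{
    for (int i = start; i < start + len; i++)
        used[node][i] = true;
}

int main(void)
{
    int start = 4, len = 3;

    for (int n = 0; n < NODES; n++)          /* check everywhere first */
        if (can_reserve(n, start, len, true) < 0)
            return 1;
    for (int n = 0; n < NODES; n++)          /* then commit everywhere */
        do_reserve(n, start, len);

    printf("reserved [%d, %d) on all nodes\n", start, start + len);
    return 0;
}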
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 34aaac451a96..b1f0885dda22 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -37,6 +37,10 @@
37#include <linux/types.h> 37#include <linux/types.h>
38#include <linux/wait.h> 38#include <linux/wait.h>
39 39
40#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
41#define DMAPOOL_DEBUG 1
42#endif
43
40struct dma_pool { /* the pool */ 44struct dma_pool { /* the pool */
41 struct list_head page_list; 45 struct list_head page_list;
42 spinlock_t lock; 46 spinlock_t lock;
@@ -216,7 +220,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
216 page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation, 220 page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation,
217 &page->dma, mem_flags); 221 &page->dma, mem_flags);
218 if (page->vaddr) { 222 if (page->vaddr) {
219#ifdef CONFIG_DEBUG_SLAB 223#ifdef DMAPOOL_DEBUG
220 memset(page->vaddr, POOL_POISON_FREED, pool->allocation); 224 memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
221#endif 225#endif
222 pool_initialise_page(pool, page); 226 pool_initialise_page(pool, page);
@@ -239,7 +243,7 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
239{ 243{
240 dma_addr_t dma = page->dma; 244 dma_addr_t dma = page->dma;
241 245
242#ifdef CONFIG_DEBUG_SLAB 246#ifdef DMAPOOL_DEBUG
243 memset(page->vaddr, POOL_POISON_FREED, pool->allocation); 247 memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
244#endif 248#endif
245 dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma); 249 dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma);
@@ -336,7 +340,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
336 page->offset = *(int *)(page->vaddr + offset); 340 page->offset = *(int *)(page->vaddr + offset);
337 retval = offset + page->vaddr; 341 retval = offset + page->vaddr;
338 *handle = offset + page->dma; 342 *handle = offset + page->dma;
339#ifdef CONFIG_DEBUG_SLAB 343#ifdef DMAPOOL_DEBUG
340 memset(retval, POOL_POISON_ALLOCATED, pool->size); 344 memset(retval, POOL_POISON_ALLOCATED, pool->size);
341#endif 345#endif
342 done: 346 done:
@@ -391,7 +395,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
391 } 395 }
392 396
393 offset = vaddr - page->vaddr; 397 offset = vaddr - page->vaddr;
394#ifdef CONFIG_DEBUG_SLAB 398#ifdef DMAPOOL_DEBUG
395 if ((dma - page->dma) != offset) { 399 if ((dma - page->dma) != offset) {
396 if (pool->dev) 400 if (pool->dev)
397 dev_err(pool->dev, 401 dev_err(pool->dev,
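The DMAPOOL_DEBUG blocks above poison a block with one byte pattern while it sits free in the pool and another once it is handed out, so reads of stale memory are easy to spot; the only change here is that the poisoning now also triggers under SLUB debugging. A stand-alone sketch of that poisoning idea (the byte values and the single-block "pool" are invented for this example; the kernel uses its own POOL_POISON_* constants):

#include <stdio.h>
#include <string.h>

#define POISON_FREED     0xa5   /* arbitrary values for illustration */
#define POISON_ALLOCATED 0x5a
#define BLOCK_SIZE       32

static unsigned char block[BLOCK_SIZE];

static void *toy_alloc(void)
{
    memset(block, POISON_ALLOCATED, BLOCK_SIZE);
    return block;
}

static void toy_free(void *p)
{
    memset(p, POISON_FREED, BLOCK_SIZE);
}

int main(void)
{
    unsigned char *p = toy_alloc();
    toy_free(p);
    /* A later read of p now shows the "freed" pattern instead of old
     * data, flagging a use-after-free. */
    printf("first byte after free: 0x%02x\n", p[0]);
    return 0;
}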
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 3c0f1e99f5e4..343cfdfebd9e 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -49,7 +49,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
49 goto out; 49 goto out;
50 } 50 }
51 51
52 if (mapping->a_ops->get_xip_page) { 52 if (mapping->a_ops->get_xip_mem) {
53 switch (advice) { 53 switch (advice) {
54 case POSIX_FADV_NORMAL: 54 case POSIX_FADV_NORMAL:
55 case POSIX_FADV_RANDOM: 55 case POSIX_FADV_RANDOM:
diff --git a/mm/filemap.c b/mm/filemap.c
index 07e9d9258b48..239d36163bbe 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -576,10 +576,12 @@ EXPORT_SYMBOL(unlock_page);
576 */ 576 */
577void end_page_writeback(struct page *page) 577void end_page_writeback(struct page *page)
578{ 578{
579 if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) { 579 if (TestClearPageReclaim(page))
580 if (!test_clear_page_writeback(page)) 580 rotate_reclaimable_page(page);
581 BUG(); 581
582 } 582 if (!test_clear_page_writeback(page))
583 BUG();
584
583 smp_mb__after_clear_bit(); 585 smp_mb__after_clear_bit();
584 wake_up_page(page, PG_writeback); 586 wake_up_page(page, PG_writeback);
585} 587}
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 5e598c42afd7..3e744abcce9d 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -15,6 +15,7 @@
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <asm/tlbflush.h> 17#include <asm/tlbflush.h>
18#include <asm/io.h>
18 19
19/* 20/*
20 * We do use our own empty page to avoid interference with other users 21 * We do use our own empty page to avoid interference with other users
@@ -42,37 +43,41 @@ static struct page *xip_sparse_page(void)
42 43
43/* 44/*
44 * This is a file read routine for execute in place files, and uses 45 * This is a file read routine for execute in place files, and uses
45 * the mapping->a_ops->get_xip_page() function for the actual low-level 46 * the mapping->a_ops->get_xip_mem() function for the actual low-level
46 * stuff. 47 * stuff.
47 * 48 *
48 * Note the struct file* is not used at all. It may be NULL. 49 * Note the struct file* is not used at all. It may be NULL.
49 */ 50 */
50static void 51static ssize_t
51do_xip_mapping_read(struct address_space *mapping, 52do_xip_mapping_read(struct address_space *mapping,
52 struct file_ra_state *_ra, 53 struct file_ra_state *_ra,
53 struct file *filp, 54 struct file *filp,
54 loff_t *ppos, 55 char __user *buf,
55 read_descriptor_t *desc, 56 size_t len,
56 read_actor_t actor) 57 loff_t *ppos)
57{ 58{
58 struct inode *inode = mapping->host; 59 struct inode *inode = mapping->host;
59 pgoff_t index, end_index; 60 pgoff_t index, end_index;
60 unsigned long offset; 61 unsigned long offset;
61 loff_t isize; 62 loff_t isize, pos;
63 size_t copied = 0, error = 0;
62 64
63 BUG_ON(!mapping->a_ops->get_xip_page); 65 BUG_ON(!mapping->a_ops->get_xip_mem);
64 66
65 index = *ppos >> PAGE_CACHE_SHIFT; 67 pos = *ppos;
66 offset = *ppos & ~PAGE_CACHE_MASK; 68 index = pos >> PAGE_CACHE_SHIFT;
69 offset = pos & ~PAGE_CACHE_MASK;
67 70
68 isize = i_size_read(inode); 71 isize = i_size_read(inode);
69 if (!isize) 72 if (!isize)
70 goto out; 73 goto out;
71 74
72 end_index = (isize - 1) >> PAGE_CACHE_SHIFT; 75 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
73 for (;;) { 76 do {
74 struct page *page; 77 unsigned long nr, left;
75 unsigned long nr, ret; 78 void *xip_mem;
79 unsigned long xip_pfn;
80 int zero = 0;
76 81
77 /* nr is the maximum number of bytes to copy from this page */ 82 /* nr is the maximum number of bytes to copy from this page */
78 nr = PAGE_CACHE_SIZE; 83 nr = PAGE_CACHE_SIZE;
@@ -85,19 +90,17 @@ do_xip_mapping_read(struct address_space *mapping,
85 } 90 }
86 } 91 }
87 nr = nr - offset; 92 nr = nr - offset;
93 if (nr > len)
94 nr = len;
88 95
89 page = mapping->a_ops->get_xip_page(mapping, 96 error = mapping->a_ops->get_xip_mem(mapping, index, 0,
90 index*(PAGE_SIZE/512), 0); 97 &xip_mem, &xip_pfn);
91 if (!page) 98 if (unlikely(error)) {
92 goto no_xip_page; 99 if (error == -ENODATA) {
93 if (unlikely(IS_ERR(page))) {
94 if (PTR_ERR(page) == -ENODATA) {
95 /* sparse */ 100 /* sparse */
96 page = ZERO_PAGE(0); 101 zero = 1;
97 } else { 102 } else
98 desc->error = PTR_ERR(page);
99 goto out; 103 goto out;
100 }
101 } 104 }
102 105
103 /* If users can be writing to this page using arbitrary 106 /* If users can be writing to this page using arbitrary
@@ -105,10 +108,10 @@ do_xip_mapping_read(struct address_space *mapping,
105 * before reading the page on the kernel side. 108 * before reading the page on the kernel side.
106 */ 109 */
107 if (mapping_writably_mapped(mapping)) 110 if (mapping_writably_mapped(mapping))
108 flush_dcache_page(page); 111 /* address based flush */ ;
109 112
110 /* 113 /*
111 * Ok, we have the page, so now we can copy it to user space... 114 * Ok, we have the mem, so now we can copy it to user space...
112 * 115 *
113 * The actor routine returns how many bytes were actually used.. 116 * The actor routine returns how many bytes were actually used..
114 * NOTE! This may not be the same as how much of a user buffer 117 * NOTE! This may not be the same as how much of a user buffer
@@ -116,47 +119,38 @@ do_xip_mapping_read(struct address_space *mapping,
116 * "pos" here (the actor routine has to update the user buffer 119 * "pos" here (the actor routine has to update the user buffer
117 * pointers and the remaining count). 120 * pointers and the remaining count).
118 */ 121 */
119 ret = actor(desc, page, offset, nr); 122 if (!zero)
120 offset += ret; 123 left = __copy_to_user(buf+copied, xip_mem+offset, nr);
121 index += offset >> PAGE_CACHE_SHIFT; 124 else
122 offset &= ~PAGE_CACHE_MASK; 125 left = __clear_user(buf + copied, nr);
123 126
124 if (ret == nr && desc->count) 127 if (left) {
125 continue; 128 error = -EFAULT;
126 goto out; 129 goto out;
130 }
127 131
128no_xip_page: 132 copied += (nr - left);
129 /* Did not get the page. Report it */ 133 offset += (nr - left);
130 desc->error = -EIO; 134 index += offset >> PAGE_CACHE_SHIFT;
131 goto out; 135 offset &= ~PAGE_CACHE_MASK;
132 } 136 } while (copied < len);
133 137
134out: 138out:
135 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; 139 *ppos = pos + copied;
136 if (filp) 140 if (filp)
137 file_accessed(filp); 141 file_accessed(filp);
142
143 return (copied ? copied : error);
138} 144}
139 145
140ssize_t 146ssize_t
141xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) 147xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
142{ 148{
143 read_descriptor_t desc;
144
145 if (!access_ok(VERIFY_WRITE, buf, len)) 149 if (!access_ok(VERIFY_WRITE, buf, len))
146 return -EFAULT; 150 return -EFAULT;
147 151
148 desc.written = 0; 152 return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
149 desc.arg.buf = buf; 153 buf, len, ppos);
150 desc.count = len;
151 desc.error = 0;
152
153 do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
154 ppos, &desc, file_read_actor);
155
156 if (desc.written)
157 return desc.written;
158 else
159 return desc.error;
160} 154}
161EXPORT_SYMBOL_GPL(xip_file_read); 155EXPORT_SYMBOL_GPL(xip_file_read);
162 156
@@ -211,13 +205,16 @@ __xip_unmap (struct address_space * mapping,
211 * 205 *
212 * This function is derived from filemap_fault, but used for execute in place 206 * This function is derived from filemap_fault, but used for execute in place
213 */ 207 */
214static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf) 208static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
215{ 209{
216 struct file *file = area->vm_file; 210 struct file *file = vma->vm_file;
217 struct address_space *mapping = file->f_mapping; 211 struct address_space *mapping = file->f_mapping;
218 struct inode *inode = mapping->host; 212 struct inode *inode = mapping->host;
219 struct page *page;
220 pgoff_t size; 213 pgoff_t size;
214 void *xip_mem;
215 unsigned long xip_pfn;
216 struct page *page;
217 int error;
221 218
222 /* XXX: are VM_FAULT_ codes OK? */ 219 /* XXX: are VM_FAULT_ codes OK? */
223 220
@@ -225,35 +222,44 @@ static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf)
225 if (vmf->pgoff >= size) 222 if (vmf->pgoff >= size)
226 return VM_FAULT_SIGBUS; 223 return VM_FAULT_SIGBUS;
227 224
228 page = mapping->a_ops->get_xip_page(mapping, 225 error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
229 vmf->pgoff*(PAGE_SIZE/512), 0); 226 &xip_mem, &xip_pfn);
230 if (!IS_ERR(page)) 227 if (likely(!error))
231 goto out; 228 goto found;
232 if (PTR_ERR(page) != -ENODATA) 229 if (error != -ENODATA)
233 return VM_FAULT_OOM; 230 return VM_FAULT_OOM;
234 231
235 /* sparse block */ 232 /* sparse block */
236 if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) && 233 if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
237 (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) && 234 (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
238 (!(mapping->host->i_sb->s_flags & MS_RDONLY))) { 235 (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
236 int err;
237
239 /* maybe shared writable, allocate new block */ 238 /* maybe shared writable, allocate new block */
240 page = mapping->a_ops->get_xip_page(mapping, 239 error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
241 vmf->pgoff*(PAGE_SIZE/512), 1); 240 &xip_mem, &xip_pfn);
242 if (IS_ERR(page)) 241 if (error)
243 return VM_FAULT_SIGBUS; 242 return VM_FAULT_SIGBUS;
244 /* unmap page at pgoff from all other vmas */ 243 /* unmap sparse mappings at pgoff from all other vmas */
245 __xip_unmap(mapping, vmf->pgoff); 244 __xip_unmap(mapping, vmf->pgoff);
245
246found:
247 err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
248 xip_pfn);
249 if (err == -ENOMEM)
250 return VM_FAULT_OOM;
251 BUG_ON(err);
252 return VM_FAULT_NOPAGE;
246 } else { 253 } else {
247 /* not shared and writable, use xip_sparse_page() */ 254 /* not shared and writable, use xip_sparse_page() */
248 page = xip_sparse_page(); 255 page = xip_sparse_page();
249 if (!page) 256 if (!page)
250 return VM_FAULT_OOM; 257 return VM_FAULT_OOM;
251 }
252 258
253out: 259 page_cache_get(page);
254 page_cache_get(page); 260 vmf->page = page;
255 vmf->page = page; 261 return 0;
256 return 0; 262 }
257} 263}
258 264
259static struct vm_operations_struct xip_file_vm_ops = { 265static struct vm_operations_struct xip_file_vm_ops = {
@@ -262,11 +268,11 @@ static struct vm_operations_struct xip_file_vm_ops = {
262 268
263int xip_file_mmap(struct file * file, struct vm_area_struct * vma) 269int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
264{ 270{
265 BUG_ON(!file->f_mapping->a_ops->get_xip_page); 271 BUG_ON(!file->f_mapping->a_ops->get_xip_mem);
266 272
267 file_accessed(file); 273 file_accessed(file);
268 vma->vm_ops = &xip_file_vm_ops; 274 vma->vm_ops = &xip_file_vm_ops;
269 vma->vm_flags |= VM_CAN_NONLINEAR; 275 vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP;
270 return 0; 276 return 0;
271} 277}
272EXPORT_SYMBOL_GPL(xip_file_mmap); 278EXPORT_SYMBOL_GPL(xip_file_mmap);
@@ -279,17 +285,17 @@ __xip_file_write(struct file *filp, const char __user *buf,
279 const struct address_space_operations *a_ops = mapping->a_ops; 285 const struct address_space_operations *a_ops = mapping->a_ops;
280 struct inode *inode = mapping->host; 286 struct inode *inode = mapping->host;
281 long status = 0; 287 long status = 0;
282 struct page *page;
283 size_t bytes; 288 size_t bytes;
284 ssize_t written = 0; 289 ssize_t written = 0;
285 290
286 BUG_ON(!mapping->a_ops->get_xip_page); 291 BUG_ON(!mapping->a_ops->get_xip_mem);
287 292
288 do { 293 do {
289 unsigned long index; 294 unsigned long index;
290 unsigned long offset; 295 unsigned long offset;
291 size_t copied; 296 size_t copied;
292 char *kaddr; 297 void *xip_mem;
298 unsigned long xip_pfn;
293 299
294 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ 300 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
295 index = pos >> PAGE_CACHE_SHIFT; 301 index = pos >> PAGE_CACHE_SHIFT;
@@ -297,28 +303,22 @@ __xip_file_write(struct file *filp, const char __user *buf,
297 if (bytes > count) 303 if (bytes > count)
298 bytes = count; 304 bytes = count;
299 305
300 page = a_ops->get_xip_page(mapping, 306 status = a_ops->get_xip_mem(mapping, index, 0,
301 index*(PAGE_SIZE/512), 0); 307 &xip_mem, &xip_pfn);
302 if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) { 308 if (status == -ENODATA) {
303 /* we allocate a new page unmap it */ 309 /* we allocate a new page unmap it */
304 page = a_ops->get_xip_page(mapping, 310 status = a_ops->get_xip_mem(mapping, index, 1,
305 index*(PAGE_SIZE/512), 1); 311 &xip_mem, &xip_pfn);
306 if (!IS_ERR(page)) 312 if (!status)
307 /* unmap page at pgoff from all other vmas */ 313 /* unmap page at pgoff from all other vmas */
308 __xip_unmap(mapping, index); 314 __xip_unmap(mapping, index);
309 } 315 }
310 316
311 if (IS_ERR(page)) { 317 if (status)
312 status = PTR_ERR(page);
313 break; 318 break;
314 }
315 319
316 fault_in_pages_readable(buf, bytes);
317 kaddr = kmap_atomic(page, KM_USER0);
318 copied = bytes - 320 copied = bytes -
319 __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); 321 __copy_from_user_nocache(xip_mem + offset, buf, bytes);
320 kunmap_atomic(kaddr, KM_USER0);
321 flush_dcache_page(page);
322 322
323 if (likely(copied > 0)) { 323 if (likely(copied > 0)) {
324 status = copied; 324 status = copied;
@@ -398,7 +398,7 @@ EXPORT_SYMBOL_GPL(xip_file_write);
398 398
399/* 399/*
400 * truncate a page used for execute in place 400 * truncate a page used for execute in place
401 * functionality is analog to block_truncate_page but does use get_xip_page 401 * functionality is analog to block_truncate_page but does use get_xip_mem
402 * to get the page instead of page cache 402 * to get the page instead of page cache
403 */ 403 */
404int 404int
@@ -408,9 +408,11 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
408 unsigned offset = from & (PAGE_CACHE_SIZE-1); 408 unsigned offset = from & (PAGE_CACHE_SIZE-1);
409 unsigned blocksize; 409 unsigned blocksize;
410 unsigned length; 410 unsigned length;
411 struct page *page; 411 void *xip_mem;
412 unsigned long xip_pfn;
413 int err;
412 414
413 BUG_ON(!mapping->a_ops->get_xip_page); 415 BUG_ON(!mapping->a_ops->get_xip_mem);
414 416
415 blocksize = 1 << mapping->host->i_blkbits; 417 blocksize = 1 << mapping->host->i_blkbits;
416 length = offset & (blocksize - 1); 418 length = offset & (blocksize - 1);
@@ -421,18 +423,16 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
421 423
422 length = blocksize - length; 424 length = blocksize - length;
423 425
424 page = mapping->a_ops->get_xip_page(mapping, 426 err = mapping->a_ops->get_xip_mem(mapping, index, 0,
425 index*(PAGE_SIZE/512), 0); 427 &xip_mem, &xip_pfn);
426 if (!page) 428 if (unlikely(err)) {
427 return -ENOMEM; 429 if (err == -ENODATA)
428 if (unlikely(IS_ERR(page))) {
429 if (PTR_ERR(page) == -ENODATA)
430 /* Hole? No need to truncate */ 430 /* Hole? No need to truncate */
431 return 0; 431 return 0;
432 else 432 else
433 return PTR_ERR(page); 433 return err;
434 } 434 }
435 zero_user(page, offset, length); 435 memset(xip_mem + offset, 0, length);
436 return 0; 436 return 0;
437} 437}
438EXPORT_SYMBOL_GPL(xip_truncate_page); 438EXPORT_SYMBOL_GPL(xip_truncate_page);
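With get_xip_mem() returning a kernel address instead of a struct page, do_xip_mapping_read() above copies straight out of the backing memory, walking the file page by page and carrying an intra-page offset. A simplified user-space model of that copy loop (a flat buffer plays the role of the XIP memory, memcpy stands in for __copy_to_user, and the sizes are made up):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL   /* assumed page size for the example */

static size_t xip_read(const char *xip_mem, size_t isize,
                       char *buf, size_t len, size_t pos)
{
    size_t index  = pos / PAGE_SIZE;
    size_t offset = pos % PAGE_SIZE;
    size_t copied = 0;

    while (copied < len && index * PAGE_SIZE + offset < isize) {
        size_t nr = PAGE_SIZE - offset;          /* bytes left in this page */
        if (index * PAGE_SIZE + offset + nr > isize)
            nr = isize - (index * PAGE_SIZE + offset);
        if (nr > len - copied)
            nr = len - copied;

        memcpy(buf + copied, xip_mem + index * PAGE_SIZE + offset, nr);

        copied += nr;
        offset += nr;
        index  += offset / PAGE_SIZE;            /* advance to the next page */
        offset %= PAGE_SIZE;
    }
    return copied;
}

int main(void)
{
    static char mem[2 * PAGE_SIZE];
    char buf[64];
    memset(mem, 'x', sizeof(mem));
    size_t n = xip_read(mem, sizeof(mem), buf, sizeof(buf), PAGE_SIZE - 10);
    printf("copied %zu bytes across a page boundary\n", n);
    return 0;
}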
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 51c9e2c01640..df28c1773fb2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -95,13 +95,16 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
95 int nid; 95 int nid;
96 struct page *page = NULL; 96 struct page *page = NULL;
97 struct mempolicy *mpol; 97 struct mempolicy *mpol;
98 nodemask_t *nodemask;
98 struct zonelist *zonelist = huge_zonelist(vma, address, 99 struct zonelist *zonelist = huge_zonelist(vma, address,
99 htlb_alloc_mask, &mpol); 100 htlb_alloc_mask, &mpol, &nodemask);
100 struct zone **z; 101 struct zone *zone;
101 102 struct zoneref *z;
102 for (z = zonelist->zones; *z; z++) { 103
103 nid = zone_to_nid(*z); 104 for_each_zone_zonelist_nodemask(zone, z, zonelist,
104 if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) && 105 MAX_NR_ZONES - 1, nodemask) {
106 nid = zone_to_nid(zone);
107 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
105 !list_empty(&hugepage_freelists[nid])) { 108 !list_empty(&hugepage_freelists[nid])) {
106 page = list_entry(hugepage_freelists[nid].next, 109 page = list_entry(hugepage_freelists[nid].next,
107 struct page, lru); 110 struct page, lru);
@@ -113,7 +116,7 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
113 break; 116 break;
114 } 117 }
115 } 118 }
116 mpol_free(mpol); /* unref if mpol !NULL */ 119 mpol_cond_put(mpol);
117 return page; 120 return page;
118} 121}
119 122
@@ -129,6 +132,7 @@ static void update_and_free_page(struct page *page)
129 } 132 }
130 set_compound_page_dtor(page, NULL); 133 set_compound_page_dtor(page, NULL);
131 set_page_refcounted(page); 134 set_page_refcounted(page);
135 arch_release_hugepage(page);
132 __free_pages(page, HUGETLB_PAGE_ORDER); 136 __free_pages(page, HUGETLB_PAGE_ORDER);
133} 137}
134 138
@@ -198,6 +202,10 @@ static struct page *alloc_fresh_huge_page_node(int nid)
198 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN, 202 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
199 HUGETLB_PAGE_ORDER); 203 HUGETLB_PAGE_ORDER);
200 if (page) { 204 if (page) {
205 if (arch_prepare_hugepage(page)) {
206 __free_pages(page, HUGETLB_PAGE_ORDER);
207 return 0;
208 }
201 set_compound_page_dtor(page, free_huge_page); 209 set_compound_page_dtor(page, free_huge_page);
202 spin_lock(&hugetlb_lock); 210 spin_lock(&hugetlb_lock);
203 nr_huge_pages++; 211 nr_huge_pages++;
@@ -239,6 +247,11 @@ static int alloc_fresh_huge_page(void)
239 hugetlb_next_nid = next_nid; 247 hugetlb_next_nid = next_nid;
240 } while (!page && hugetlb_next_nid != start_nid); 248 } while (!page && hugetlb_next_nid != start_nid);
241 249
250 if (ret)
251 count_vm_event(HTLB_BUDDY_PGALLOC);
252 else
253 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
254
242 return ret; 255 return ret;
243} 256}
244 257
@@ -299,9 +312,11 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
299 */ 312 */
300 nr_huge_pages_node[nid]++; 313 nr_huge_pages_node[nid]++;
301 surplus_huge_pages_node[nid]++; 314 surplus_huge_pages_node[nid]++;
315 __count_vm_event(HTLB_BUDDY_PGALLOC);
302 } else { 316 } else {
303 nr_huge_pages--; 317 nr_huge_pages--;
304 surplus_huge_pages--; 318 surplus_huge_pages--;
319 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
305 } 320 }
306 spin_unlock(&hugetlb_lock); 321 spin_unlock(&hugetlb_lock);
307 322
@@ -369,11 +384,19 @@ retry:
369 resv_huge_pages += delta; 384 resv_huge_pages += delta;
370 ret = 0; 385 ret = 0;
371free: 386free:
387 /* Free the needed pages to the hugetlb pool */
372 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 388 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
389 if ((--needed) < 0)
390 break;
373 list_del(&page->lru); 391 list_del(&page->lru);
374 if ((--needed) >= 0) 392 enqueue_huge_page(page);
375 enqueue_huge_page(page); 393 }
376 else { 394
395 /* Free unnecessary surplus pages to the buddy allocator */
396 if (!list_empty(&surplus_list)) {
397 spin_unlock(&hugetlb_lock);
398 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
399 list_del(&page->lru);
377 /* 400 /*
378 * The page has a reference count of zero already, so 401 * The page has a reference count of zero already, so
379 * call free_huge_page directly instead of using 402 * call free_huge_page directly instead of using
@@ -381,10 +404,9 @@ free:
381 * unlocked which is safe because free_huge_page takes 404 * unlocked which is safe because free_huge_page takes
382 * hugetlb_lock before deciding how to free the page. 405 * hugetlb_lock before deciding how to free the page.
383 */ 406 */
384 spin_unlock(&hugetlb_lock);
385 free_huge_page(page); 407 free_huge_page(page);
386 spin_lock(&hugetlb_lock);
387 } 408 }
409 spin_lock(&hugetlb_lock);
388 } 410 }
389 411
390 return ret; 412 return ret;
@@ -718,7 +740,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
718 entry = 740 entry =
719 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); 741 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
720 } else { 742 } else {
721 entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); 743 entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
722 } 744 }
723 entry = pte_mkyoung(entry); 745 entry = pte_mkyoung(entry);
724 entry = pte_mkhuge(entry); 746 entry = pte_mkhuge(entry);
@@ -731,8 +753,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
731{ 753{
732 pte_t entry; 754 pte_t entry;
733 755
734 entry = pte_mkwrite(pte_mkdirty(*ptep)); 756 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
735 if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { 757 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
736 update_mmu_cache(vma, address, entry); 758 update_mmu_cache(vma, address, entry);
737 } 759 }
738} 760}
@@ -762,10 +784,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
762 784
763 spin_lock(&dst->page_table_lock); 785 spin_lock(&dst->page_table_lock);
764 spin_lock(&src->page_table_lock); 786 spin_lock(&src->page_table_lock);
765 if (!pte_none(*src_pte)) { 787 if (!huge_pte_none(huge_ptep_get(src_pte))) {
766 if (cow) 788 if (cow)
767 ptep_set_wrprotect(src, addr, src_pte); 789 huge_ptep_set_wrprotect(src, addr, src_pte);
768 entry = *src_pte; 790 entry = huge_ptep_get(src_pte);
769 ptepage = pte_page(entry); 791 ptepage = pte_page(entry);
770 get_page(ptepage); 792 get_page(ptepage);
771 set_huge_pte_at(dst, addr, dst_pte, entry); 793 set_huge_pte_at(dst, addr, dst_pte, entry);
@@ -809,7 +831,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
809 continue; 831 continue;
810 832
811 pte = huge_ptep_get_and_clear(mm, address, ptep); 833 pte = huge_ptep_get_and_clear(mm, address, ptep);
812 if (pte_none(pte)) 834 if (huge_pte_none(pte))
813 continue; 835 continue;
814 836
815 page = pte_page(pte); 837 page = pte_page(pte);
@@ -873,8 +895,9 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
873 spin_lock(&mm->page_table_lock); 895 spin_lock(&mm->page_table_lock);
874 896
875 ptep = huge_pte_offset(mm, address & HPAGE_MASK); 897 ptep = huge_pte_offset(mm, address & HPAGE_MASK);
876 if (likely(pte_same(*ptep, pte))) { 898 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
877 /* Break COW */ 899 /* Break COW */
900 huge_ptep_clear_flush(vma, address, ptep);
878 set_huge_pte_at(mm, address, ptep, 901 set_huge_pte_at(mm, address, ptep,
879 make_huge_pte(vma, new_page, 1)); 902 make_huge_pte(vma, new_page, 1));
880 /* Make the old page be freed below */ 903 /* Make the old page be freed below */
@@ -942,7 +965,7 @@ retry:
942 goto backout; 965 goto backout;
943 966
944 ret = 0; 967 ret = 0;
945 if (!pte_none(*ptep)) 968 if (!huge_pte_none(huge_ptep_get(ptep)))
946 goto backout; 969 goto backout;
947 970
948 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 971 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
@@ -984,8 +1007,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
984 * the same page in the page cache. 1007 * the same page in the page cache.
985 */ 1008 */
986 mutex_lock(&hugetlb_instantiation_mutex); 1009 mutex_lock(&hugetlb_instantiation_mutex);
987 entry = *ptep; 1010 entry = huge_ptep_get(ptep);
988 if (pte_none(entry)) { 1011 if (huge_pte_none(entry)) {
989 ret = hugetlb_no_page(mm, vma, address, ptep, write_access); 1012 ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
990 mutex_unlock(&hugetlb_instantiation_mutex); 1013 mutex_unlock(&hugetlb_instantiation_mutex);
991 return ret; 1014 return ret;
@@ -995,7 +1018,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
995 1018
996 spin_lock(&mm->page_table_lock); 1019 spin_lock(&mm->page_table_lock);
997 /* Check for a racing update before calling hugetlb_cow */ 1020 /* Check for a racing update before calling hugetlb_cow */
998 if (likely(pte_same(entry, *ptep))) 1021 if (likely(pte_same(entry, huge_ptep_get(ptep))))
999 if (write_access && !pte_write(entry)) 1022 if (write_access && !pte_write(entry))
1000 ret = hugetlb_cow(mm, vma, address, ptep, entry); 1023 ret = hugetlb_cow(mm, vma, address, ptep, entry);
1001 spin_unlock(&mm->page_table_lock); 1024 spin_unlock(&mm->page_table_lock);
@@ -1025,7 +1048,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1025 */ 1048 */
1026 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); 1049 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
1027 1050
1028 if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) { 1051 if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
1052 (write && !pte_write(huge_ptep_get(pte)))) {
1029 int ret; 1053 int ret;
1030 1054
1031 spin_unlock(&mm->page_table_lock); 1055 spin_unlock(&mm->page_table_lock);
@@ -1041,7 +1065,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1041 } 1065 }
1042 1066
1043 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; 1067 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
1044 page = pte_page(*pte); 1068 page = pte_page(huge_ptep_get(pte));
1045same_page: 1069same_page:
1046 if (pages) { 1070 if (pages) {
1047 get_page(page); 1071 get_page(page);
@@ -1090,7 +1114,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1090 continue; 1114 continue;
1091 if (huge_pmd_unshare(mm, &address, ptep)) 1115 if (huge_pmd_unshare(mm, &address, ptep))
1092 continue; 1116 continue;
1093 if (!pte_none(*ptep)) { 1117 if (!huge_pte_none(huge_ptep_get(ptep))) {
1094 pte = huge_ptep_get_and_clear(mm, address, ptep); 1118 pte = huge_ptep_get_and_clear(mm, address, ptep);
1095 pte = pte_mkhuge(pte_modify(pte, newprot)); 1119 pte = pte_mkhuge(pte_modify(pte, newprot));
1096 set_huge_pte_at(mm, address, ptep, pte); 1120 set_huge_pte_at(mm, address, ptep, pte);
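The first hunk above switches dequeue_huge_page_vma() to the zoneref walk: zones are visited in preference order, nodes excluded by the mempolicy nodemask are skipped, and the first node with a free huge page wins. A toy version of that selection loop (zones, node ids, free counts and the bitmask are all invented for the example):

#include <stdio.h>

struct zone { int nid; int free_huge; };

int main(void)
{
    struct zone zonelist[] = { {0, 0}, {1, 0}, {2, 3}, {3, 1} };
    unsigned long nodemask = (1UL << 1) | (1UL << 2);   /* only nodes 1 and 2 allowed */
    int chosen = -1;

    for (unsigned i = 0; i < sizeof(zonelist) / sizeof(zonelist[0]); i++) {
        int nid = zonelist[i].nid;
        if (!(nodemask & (1UL << nid)))     /* cf. for_each_zone_zonelist_nodemask */
            continue;
        if (zonelist[i].free_huge > 0) {    /* cf. !list_empty(&hugepage_freelists[nid]) */
            zonelist[i].free_huge--;
            chosen = nid;
            break;
        }
    }
    printf("dequeued a huge page from node %d\n", chosen);
    return 0;
}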
diff --git a/mm/internal.h b/mm/internal.h
index 789727309f4d..0034e947e4bc 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -34,8 +34,7 @@ static inline void __put_page(struct page *page)
34 atomic_dec(&page->_count); 34 atomic_dec(&page->_count);
35} 35}
36 36
37extern void __init __free_pages_bootmem(struct page *page, 37extern void __free_pages_bootmem(struct page *page, unsigned int order);
38 unsigned int order);
39 38
40/* 39/*
41 * function for dealing with page's order in buddy system. 40 * function for dealing with page's order in buddy system.
diff --git a/mm/madvise.c b/mm/madvise.c
index 93ee375b38e7..23a0ec3e0ea0 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -112,7 +112,7 @@ static long madvise_willneed(struct vm_area_struct * vma,
112 if (!file) 112 if (!file)
113 return -EBADF; 113 return -EBADF;
114 114
115 if (file->f_mapping->a_ops->get_xip_page) { 115 if (file->f_mapping->a_ops->get_xip_mem) {
116 /* no bad return value, but ignore advice */ 116 /* no bad return value, but ignore advice */
117 return 0; 117 return 0;
118 } 118 }
diff --git a/mm/memory.c b/mm/memory.c
index 0d14d1e58a5f..bbab1e37055e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -371,57 +371,93 @@ static inline int is_cow_mapping(unsigned int flags)
371} 371}
372 372
373/* 373/*
374 * This function gets the "struct page" associated with a pte. 374 * vm_normal_page -- This function gets the "struct page" associated with a pte.
375 * 375 *
376 * NOTE! Some mappings do not have "struct pages". A raw PFN mapping 376 * "Special" mappings do not wish to be associated with a "struct page" (either
377 * will have each page table entry just pointing to a raw page frame 377 * it doesn't exist, or it exists but they don't want to touch it). In this
378 * number, and as far as the VM layer is concerned, those do not have 378 * case, NULL is returned here. "Normal" mappings do have a struct page.
379 * pages associated with them - even if the PFN might point to memory
380 * that otherwise is perfectly fine and has a "struct page".
381 * 379 *
382 * The way we recognize those mappings is through the rules set up 380 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
383 * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, 381 * pte bit, in which case this function is trivial. Secondly, an architecture
384 * and the vm_pgoff will point to the first PFN mapped: thus every 382 * may not have a spare pte bit, which requires a more complicated scheme,
385 * page that is a raw mapping will always honor the rule 383 * described below.
384 *
385 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
386 * special mapping (even if there are underlying and valid "struct pages").
387 * COWed pages of a VM_PFNMAP are always normal.
388 *
389 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
390 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
391 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
392 * mapping will always honor the rule
386 * 393 *
387 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) 394 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
388 * 395 *
389 * and if that isn't true, the page has been COW'ed (in which case it 396 * And for normal mappings this is false.
390 * _does_ have a "struct page" associated with it even if it is in a 397 *
391 * VM_PFNMAP range). 398 * This restricts such mappings to be a linear translation from virtual address
399 * to pfn. To get around this restriction, we allow arbitrary mappings so long
400 * as the vma is not a COW mapping; in that case, we know that all ptes are
401 * special (because none can have been COWed).
402 *
403 *
404 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
405 *
406 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
407 * page" backing, however the difference is that _all_ pages with a struct
408 * page (that is, those where pfn_valid is true) are refcounted and considered
409 * normal pages by the VM. The disadvantage is that pages are refcounted
410 * (which can be slower and simply not an option for some PFNMAP users). The
411 * advantage is that we don't have to follow the strict linearity rule of
412 * PFNMAP mappings in order to support COWable mappings.
413 *
392 */ 414 */
393struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) 415#ifdef __HAVE_ARCH_PTE_SPECIAL
416# define HAVE_PTE_SPECIAL 1
417#else
418# define HAVE_PTE_SPECIAL 0
419#endif
420struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
421 pte_t pte)
394{ 422{
395 unsigned long pfn = pte_pfn(pte); 423 unsigned long pfn;
396 424
397 if (unlikely(vma->vm_flags & VM_PFNMAP)) { 425 if (HAVE_PTE_SPECIAL) {
398 unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; 426 if (likely(!pte_special(pte))) {
399 if (pfn == vma->vm_pgoff + off) 427 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
400 return NULL; 428 return pte_page(pte);
401 if (!is_cow_mapping(vma->vm_flags)) 429 }
402 return NULL; 430 VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
431 return NULL;
403 } 432 }
404 433
405#ifdef CONFIG_DEBUG_VM 434 /* !HAVE_PTE_SPECIAL case follows: */
406 /* 435
407 * Add some anal sanity checks for now. Eventually, 436 pfn = pte_pfn(pte);
408 * we should just do "return pfn_to_page(pfn)", but 437
409 * in the meantime we check that we get a valid pfn, 438 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
410 * and that the resulting page looks ok. 439 if (vma->vm_flags & VM_MIXEDMAP) {
411 */ 440 if (!pfn_valid(pfn))
412 if (unlikely(!pfn_valid(pfn))) { 441 return NULL;
413 print_bad_pte(vma, pte, addr); 442 goto out;
414 return NULL; 443 } else {
444 unsigned long off;
445 off = (addr - vma->vm_start) >> PAGE_SHIFT;
446 if (pfn == vma->vm_pgoff + off)
447 return NULL;
448 if (!is_cow_mapping(vma->vm_flags))
449 return NULL;
450 }
415 } 451 }
416#endif 452
453 VM_BUG_ON(!pfn_valid(pfn));
417 454
418 /* 455 /*
419 * NOTE! We still have PageReserved() pages in the page 456 * NOTE! We still have PageReserved() pages in the page tables.
420 * tables.
421 * 457 *
422 * The PAGE_ZERO() pages and various VDSO mappings can 458 * eg. VDSO mappings can cause them to exist.
423 * cause them to exist.
424 */ 459 */
460out:
425 return pfn_to_page(pfn); 461 return pfn_to_page(pfn);
426} 462}
427 463
@@ -1057,8 +1093,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1057 if (pages) 1093 if (pages)
1058 foll_flags |= FOLL_GET; 1094 foll_flags |= FOLL_GET;
1059 if (!write && !(vma->vm_flags & VM_LOCKED) && 1095 if (!write && !(vma->vm_flags & VM_LOCKED) &&
1060 (!vma->vm_ops || (!vma->vm_ops->nopage && 1096 (!vma->vm_ops || !vma->vm_ops->fault))
1061 !vma->vm_ops->fault)))
1062 foll_flags |= FOLL_ANON; 1097 foll_flags |= FOLL_ANON;
1063 1098
1064 do { 1099 do {
@@ -1141,8 +1176,10 @@ pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
1141 * old drivers should use this, and they needed to mark their 1176 * old drivers should use this, and they needed to mark their
1142 * pages reserved for the old functions anyway. 1177 * pages reserved for the old functions anyway.
1143 */ 1178 */
1144static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) 1179static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1180 struct page *page, pgprot_t prot)
1145{ 1181{
1182 struct mm_struct *mm = vma->vm_mm;
1146 int retval; 1183 int retval;
1147 pte_t *pte; 1184 pte_t *pte;
1148 spinlock_t *ptl; 1185 spinlock_t *ptl;
@@ -1202,40 +1239,26 @@ out:
1202 * 1239 *
1203 * The page does not need to be reserved. 1240 * The page does not need to be reserved.
1204 */ 1241 */
1205int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) 1242int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1243 struct page *page)
1206{ 1244{
1207 if (addr < vma->vm_start || addr >= vma->vm_end) 1245 if (addr < vma->vm_start || addr >= vma->vm_end)
1208 return -EFAULT; 1246 return -EFAULT;
1209 if (!page_count(page)) 1247 if (!page_count(page))
1210 return -EINVAL; 1248 return -EINVAL;
1211 vma->vm_flags |= VM_INSERTPAGE; 1249 vma->vm_flags |= VM_INSERTPAGE;
1212 return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot); 1250 return insert_page(vma, addr, page, vma->vm_page_prot);
1213} 1251}
1214EXPORT_SYMBOL(vm_insert_page); 1252EXPORT_SYMBOL(vm_insert_page);
1215 1253
1216/** 1254static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1217 * vm_insert_pfn - insert single pfn into user vma 1255 unsigned long pfn, pgprot_t prot)
1218 * @vma: user vma to map to
1219 * @addr: target user address of this page
1220 * @pfn: source kernel pfn
1221 *
1222 * Similar to vm_inert_page, this allows drivers to insert individual pages
1223 * they've allocated into a user vma. Same comments apply.
1224 *
1225 * This function should only be called from a vm_ops->fault handler, and
1226 * in that case the handler should return NULL.
1227 */
1228int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1229 unsigned long pfn)
1230{ 1256{
1231 struct mm_struct *mm = vma->vm_mm; 1257 struct mm_struct *mm = vma->vm_mm;
1232 int retval; 1258 int retval;
1233 pte_t *pte, entry; 1259 pte_t *pte, entry;
1234 spinlock_t *ptl; 1260 spinlock_t *ptl;
1235 1261
1236 BUG_ON(!(vma->vm_flags & VM_PFNMAP));
1237 BUG_ON(is_cow_mapping(vma->vm_flags));
1238
1239 retval = -ENOMEM; 1262 retval = -ENOMEM;
1240 pte = get_locked_pte(mm, addr, &ptl); 1263 pte = get_locked_pte(mm, addr, &ptl);
1241 if (!pte) 1264 if (!pte)
@@ -1245,19 +1268,74 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1245 goto out_unlock; 1268 goto out_unlock;
1246 1269
1247 /* Ok, finally just insert the thing.. */ 1270 /* Ok, finally just insert the thing.. */
1248 entry = pfn_pte(pfn, vma->vm_page_prot); 1271 entry = pte_mkspecial(pfn_pte(pfn, prot));
1249 set_pte_at(mm, addr, pte, entry); 1272 set_pte_at(mm, addr, pte, entry);
1250 update_mmu_cache(vma, addr, entry); 1273 update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */
1251 1274
1252 retval = 0; 1275 retval = 0;
1253out_unlock: 1276out_unlock:
1254 pte_unmap_unlock(pte, ptl); 1277 pte_unmap_unlock(pte, ptl);
1255
1256out: 1278out:
1257 return retval; 1279 return retval;
1258} 1280}
1281
1282/**
1283 * vm_insert_pfn - insert single pfn into user vma
1284 * @vma: user vma to map to
1285 * @addr: target user address of this page
1286 * @pfn: source kernel pfn
1287 *
1288 * Similar to vm_inert_page, this allows drivers to insert individual pages
1289 * they've allocated into a user vma. Same comments apply.
1290 *
1291 * This function should only be called from a vm_ops->fault handler, and
1292 * in that case the handler should return NULL.
1293 */
1294int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1295 unsigned long pfn)
1296{
1297 /*
1298 * Technically, architectures with pte_special can avoid all these
1299 * restrictions (same for remap_pfn_range). However we would like
1300 * consistency in testing and feature parity among all, so we should
1301 * try to keep these invariants in place for everybody.
1302 */
1303 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1304 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1305 (VM_PFNMAP|VM_MIXEDMAP));
1306 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1307 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1308
1309 if (addr < vma->vm_start || addr >= vma->vm_end)
1310 return -EFAULT;
1311 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
1312}
1259EXPORT_SYMBOL(vm_insert_pfn); 1313EXPORT_SYMBOL(vm_insert_pfn);
1260 1314
1315int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1316 unsigned long pfn)
1317{
1318 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
1319
1320 if (addr < vma->vm_start || addr >= vma->vm_end)
1321 return -EFAULT;
1322
1323 /*
1324 * If we don't have pte special, then we have to use the pfn_valid()
1325 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
1326 * refcount the page if pfn_valid is true (hence insert_page rather
1327 * than insert_pfn).
1328 */
1329 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
1330 struct page *page;
1331
1332 page = pfn_to_page(pfn);
1333 return insert_page(vma, addr, page, vma->vm_page_prot);
1334 }
1335 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
1336}
1337EXPORT_SYMBOL(vm_insert_mixed);
1338
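For contrast, a sketch of the VM_MIXEDMAP setup that vm_insert_mixed() expects; again illustrative only, with mixed_vm_ops standing in for an assumed vm_operations_struct whose .fault handler calls vm_insert_mixed() for each offset, whether or not the pfn has a struct page behind it.

static int mixed_mmap(struct file *file, struct vm_area_struct *vma)
{
	/*
	 * VM_MIXEDMAP (not VM_PFNMAP) tells vm_normal_page() that this vma
	 * mixes refcounted pages and raw pfns; vm_insert_mixed() picks the
	 * right insertion path per pfn, as the comment above explains.
	 */
	vma->vm_flags |= VM_MIXEDMAP;
	vma->vm_ops = &mixed_vm_ops;	/* assumed ops with a .fault hook */
	return 0;
}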
1261/* 1339/*
1262 * maps a range of physical memory into the requested pages. the old 1340 * maps a range of physical memory into the requested pages. the old
1263 * mappings are removed. any references to nonexistent pages results 1341 * mappings are removed. any references to nonexistent pages results
@@ -1276,7 +1354,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1276 arch_enter_lazy_mmu_mode(); 1354 arch_enter_lazy_mmu_mode();
1277 do { 1355 do {
1278 BUG_ON(!pte_none(*pte)); 1356 BUG_ON(!pte_none(*pte));
1279 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); 1357 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1280 pfn++; 1358 pfn++;
1281 } while (pte++, addr += PAGE_SIZE, addr != end); 1359 } while (pte++, addr += PAGE_SIZE, addr != end);
1282 arch_leave_lazy_mmu_mode(); 1360 arch_leave_lazy_mmu_mode();
@@ -2199,20 +2277,9 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2199 2277
2200 BUG_ON(vma->vm_flags & VM_PFNMAP); 2278 BUG_ON(vma->vm_flags & VM_PFNMAP);
2201 2279
2202 if (likely(vma->vm_ops->fault)) { 2280 ret = vma->vm_ops->fault(vma, &vmf);
2203 ret = vma->vm_ops->fault(vma, &vmf); 2281 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2204 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2282 return ret;
2205 return ret;
2206 } else {
2207 /* Legacy ->nopage path */
2208 ret = 0;
2209 vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
2210 /* no page was available -- either SIGBUS or OOM */
2211 if (unlikely(vmf.page == NOPAGE_SIGBUS))
2212 return VM_FAULT_SIGBUS;
2213 else if (unlikely(vmf.page == NOPAGE_OOM))
2214 return VM_FAULT_OOM;
2215 }
2216 2283
2217 /* 2284 /*
2218 * For consistency in subsequent calls, make the faulted page always 2285 * For consistency in subsequent calls, make the faulted page always
@@ -2377,10 +2444,13 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
2377 unsigned long pfn; 2444 unsigned long pfn;
2378 2445
2379 pte_unmap(page_table); 2446 pte_unmap(page_table);
2380 BUG_ON(!(vma->vm_flags & VM_PFNMAP)); 2447 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2381 BUG_ON(is_cow_mapping(vma->vm_flags)); 2448 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2382 2449
2383 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); 2450 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
2451
2452 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2453
2384 if (unlikely(pfn == NOPFN_OOM)) 2454 if (unlikely(pfn == NOPFN_OOM))
2385 return VM_FAULT_OOM; 2455 return VM_FAULT_OOM;
2386 else if (unlikely(pfn == NOPFN_SIGBUS)) 2456 else if (unlikely(pfn == NOPFN_SIGBUS))
@@ -2458,7 +2528,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2458 if (!pte_present(entry)) { 2528 if (!pte_present(entry)) {
2459 if (pte_none(entry)) { 2529 if (pte_none(entry)) {
2460 if (vma->vm_ops) { 2530 if (vma->vm_ops) {
2461 if (vma->vm_ops->fault || vma->vm_ops->nopage) 2531 if (likely(vma->vm_ops->fault))
2462 return do_linear_fault(mm, vma, address, 2532 return do_linear_fault(mm, vma, address,
2463 pte, pmd, write_access, entry); 2533 pte, pmd, write_access, entry);
2464 if (unlikely(vma->vm_ops->nopfn)) 2534 if (unlikely(vma->vm_ops->nopfn))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0fb330271271..b17dca7249f8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -29,6 +29,8 @@
29 29
30#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
31 31
32#include "internal.h"
33
32/* add this memory to iomem resource */ 34/* add this memory to iomem resource */
33static struct resource *register_memory_resource(u64 start, u64 size) 35static struct resource *register_memory_resource(u64 start, u64 size)
34{ 36{
@@ -58,8 +60,105 @@ static void release_memory_resource(struct resource *res)
58 return; 60 return;
59} 61}
60 62
61
62#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 63#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
64#ifndef CONFIG_SPARSEMEM_VMEMMAP
65static void get_page_bootmem(unsigned long info, struct page *page, int magic)
66{
67 atomic_set(&page->_mapcount, magic);
68 SetPagePrivate(page);
69 set_page_private(page, info);
70 atomic_inc(&page->_count);
71}
72
73void put_page_bootmem(struct page *page)
74{
75 int magic;
76
77 magic = atomic_read(&page->_mapcount);
78 BUG_ON(magic >= -1);
79
80 if (atomic_dec_return(&page->_count) == 1) {
81 ClearPagePrivate(page);
82 set_page_private(page, 0);
83 reset_page_mapcount(page);
84 __free_pages_bootmem(page, 0);
85 }
86
87}
88
89void register_page_bootmem_info_section(unsigned long start_pfn)
90{
91 unsigned long *usemap, mapsize, section_nr, i;
92 struct mem_section *ms;
93 struct page *page, *memmap;
94
95 if (!pfn_valid(start_pfn))
96 return;
97
98 section_nr = pfn_to_section_nr(start_pfn);
99 ms = __nr_to_section(section_nr);
100
101 /* Get section's memmap address */
102 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
103
104 /*
105 * Get page for the memmap's phys address
106 * XXX: need more consideration for sparse_vmemmap...
107 */
108 page = virt_to_page(memmap);
109 mapsize = sizeof(struct page) * PAGES_PER_SECTION;
110 mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
111
112 /* remember memmap's page */
113 for (i = 0; i < mapsize; i++, page++)
114 get_page_bootmem(section_nr, page, SECTION_INFO);
115
116 usemap = __nr_to_section(section_nr)->pageblock_flags;
117 page = virt_to_page(usemap);
118
119 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
120
121 for (i = 0; i < mapsize; i++, page++)
122 get_page_bootmem(section_nr, page, MIX_INFO);
123
124}
125
126void register_page_bootmem_info_node(struct pglist_data *pgdat)
127{
128 unsigned long i, pfn, end_pfn, nr_pages;
129 int node = pgdat->node_id;
130 struct page *page;
131 struct zone *zone;
132
133 nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
134 page = virt_to_page(pgdat);
135
136 for (i = 0; i < nr_pages; i++, page++)
137 get_page_bootmem(node, page, NODE_INFO);
138
139 zone = &pgdat->node_zones[0];
140 for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
141 if (zone->wait_table) {
142 nr_pages = zone->wait_table_hash_nr_entries
143 * sizeof(wait_queue_head_t);
144 nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
145 page = virt_to_page(zone->wait_table);
146
147 for (i = 0; i < nr_pages; i++, page++)
148 get_page_bootmem(node, page, NODE_INFO);
149 }
150 }
151
152 pfn = pgdat->node_start_pfn;
153 end_pfn = pfn + pgdat->node_spanned_pages;
154
155 /* register_section info */
156 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION)
157 register_page_bootmem_info_section(pfn);
158
159}
160#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
161
63static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) 162static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
64{ 163{
65 struct pglist_data *pgdat = zone->zone_pgdat; 164 struct pglist_data *pgdat = zone->zone_pgdat;
@@ -101,6 +200,36 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
101 return register_new_memory(__pfn_to_section(phys_start_pfn)); 200 return register_new_memory(__pfn_to_section(phys_start_pfn));
102} 201}
103 202
203#ifdef CONFIG_SPARSEMEM_VMEMMAP
204static int __remove_section(struct zone *zone, struct mem_section *ms)
205{
206 /*
207 * XXX: Freeing memmap with vmemmap is not implemented yet.
208 * This should be removed later.
209 */
210 return -EBUSY;
211}
212#else
213static int __remove_section(struct zone *zone, struct mem_section *ms)
214{
215 unsigned long flags;
216 struct pglist_data *pgdat = zone->zone_pgdat;
217 int ret = -EINVAL;
218
219 if (!valid_section(ms))
220 return ret;
221
222 ret = unregister_memory_section(ms);
223 if (ret)
224 return ret;
225
226 pgdat_resize_lock(pgdat, &flags);
227 sparse_remove_one_section(zone, ms);
228 pgdat_resize_unlock(pgdat, &flags);
229 return 0;
230}
231#endif
232
104/* 233/*
105 * Reasonably generic function for adding memory. It is 234 * Reasonably generic function for adding memory. It is
106 * expected that archs that support memory hotplug will 235 * expected that archs that support memory hotplug will
@@ -134,6 +263,42 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
134} 263}
135EXPORT_SYMBOL_GPL(__add_pages); 264EXPORT_SYMBOL_GPL(__add_pages);
136 265
266/**
267 * __remove_pages() - remove sections of pages from a zone
268 * @zone: zone from which pages need to be removed
269 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
270 * @nr_pages: number of pages to remove (must be multiple of section size)
271 *
272 * Generic helper function to remove section mappings and sysfs entries
273 * for the section of the memory we are removing. Caller needs to make
274 * sure that pages are marked reserved and zones are adjust properly by
275 * calling offline_pages().
276 */
277int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
278 unsigned long nr_pages)
279{
280 unsigned long i, ret = 0;
281 int sections_to_remove;
282
283 /*
284 * We can only remove entire sections
285 */
286 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
287 BUG_ON(nr_pages % PAGES_PER_SECTION);
288
289 release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
290
291 sections_to_remove = nr_pages / PAGES_PER_SECTION;
292 for (i = 0; i < sections_to_remove; i++) {
293 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
294 ret = __remove_section(zone, __pfn_to_section(pfn));
295 if (ret)
296 break;
297 }
298 return ret;
299}
300EXPORT_SYMBOL_GPL(__remove_pages);
301
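A rough sketch of the calling convention the kernel-doc above describes, not taken from this patch: an architecture's hot-remove path offlines the range first (offline_pages()), then passes the section-aligned pfn range down.

static int example_remove(u64 start, u64 size)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;	/* multiple of PAGES_PER_SECTION */
	struct zone *zone = page_zone(pfn_to_page(start_pfn));

	/* range is assumed to be offline and marked reserved already */
	return __remove_pages(zone, start_pfn, nr_pages);
}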
137static void grow_zone_span(struct zone *zone, 302static void grow_zone_span(struct zone *zone,
138 unsigned long start_pfn, unsigned long end_pfn) 303 unsigned long start_pfn, unsigned long end_pfn)
139{ 304{
@@ -164,6 +329,25 @@ static void grow_pgdat_span(struct pglist_data *pgdat,
164 pgdat->node_start_pfn; 329 pgdat->node_start_pfn;
165} 330}
166 331
332void online_page(struct page *page)
333{
334 totalram_pages++;
335 num_physpages++;
336
337#ifdef CONFIG_HIGHMEM
338 if (PageHighMem(page))
339 totalhigh_pages++;
340#endif
341
342#ifdef CONFIG_FLATMEM
343 max_mapnr = max(page_to_pfn(page), max_mapnr);
344#endif
345
346 ClearPageReserved(page);
347 init_page_count(page);
348 __free_page(page);
349}
350
167static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 351static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
168 void *arg) 352 void *arg)
169{ 353{
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3c3601121509..a37a5034f63d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -63,7 +63,6 @@
63 grows down? 63 grows down?
64 make bind policy root only? It can trigger oom much faster and the 64 make bind policy root only? It can trigger oom much faster and the
65 kernel is not always grateful with that. 65 kernel is not always grateful with that.
66 could replace all the switch()es with a mempolicy_ops structure.
67*/ 66*/
68 67
69#include <linux/mempolicy.h> 68#include <linux/mempolicy.h>
@@ -89,6 +88,7 @@
89#include <linux/rmap.h> 88#include <linux/rmap.h>
90#include <linux/security.h> 89#include <linux/security.h>
91#include <linux/syscalls.h> 90#include <linux/syscalls.h>
91#include <linux/ctype.h>
92 92
93#include <asm/tlbflush.h> 93#include <asm/tlbflush.h>
94#include <asm/uaccess.h> 94#include <asm/uaccess.h>
@@ -105,142 +105,264 @@ static struct kmem_cache *sn_cache;
105 policied. */ 105 policied. */
106enum zone_type policy_zone = 0; 106enum zone_type policy_zone = 0;
107 107
108/*
109 * run-time system-wide default policy => local allocation
110 */
108struct mempolicy default_policy = { 111struct mempolicy default_policy = {
109 .refcnt = ATOMIC_INIT(1), /* never free it */ 112 .refcnt = ATOMIC_INIT(1), /* never free it */
110 .policy = MPOL_DEFAULT, 113 .mode = MPOL_PREFERRED,
114 .flags = MPOL_F_LOCAL,
111}; 115};
112 116
113static void mpol_rebind_policy(struct mempolicy *pol, 117static const struct mempolicy_operations {
114 const nodemask_t *newmask); 118 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
119 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
120} mpol_ops[MPOL_MAX];
115 121
116/* Do sanity checking on a policy */ 122/* Check that the nodemask contains at least one populated zone */
117static int mpol_check_policy(int mode, nodemask_t *nodes) 123static int is_valid_nodemask(const nodemask_t *nodemask)
118{ 124{
119 int was_empty, is_empty; 125 int nd, k;
120 126
121 if (!nodes) 127 /* Check that there is something useful in this mask */
122 return 0; 128 k = policy_zone;
123 129
124 /* 130 for_each_node_mask(nd, *nodemask) {
125 * "Contextualize" the in-coming nodemast for cpusets: 131 struct zone *z;
126 * Remember whether in-coming nodemask was empty, If not,
127 * restrict the nodes to the allowed nodes in the cpuset.
128 * This is guaranteed to be a subset of nodes with memory.
129 */
130 cpuset_update_task_memory_state();
131 is_empty = was_empty = nodes_empty(*nodes);
132 if (!was_empty) {
133 nodes_and(*nodes, *nodes, cpuset_current_mems_allowed);
134 is_empty = nodes_empty(*nodes); /* after "contextualization" */
135 }
136 132
137 switch (mode) { 133 for (k = 0; k <= policy_zone; k++) {
138 case MPOL_DEFAULT: 134 z = &NODE_DATA(nd)->node_zones[k];
139 /* 135 if (z->present_pages > 0)
140 * require caller to specify an empty nodemask 136 return 1;
141 * before "contextualization" 137 }
142 */
143 if (!was_empty)
144 return -EINVAL;
145 break;
146 case MPOL_BIND:
147 case MPOL_INTERLEAVE:
148 /*
149 * require at least 1 valid node after "contextualization"
150 */
151 if (is_empty)
152 return -EINVAL;
153 break;
154 case MPOL_PREFERRED:
155 /*
156 * Did caller specify invalid nodes?
157 * Don't silently accept this as "local allocation".
158 */
159 if (!was_empty && is_empty)
160 return -EINVAL;
161 break;
162 } 138 }
139
163 return 0; 140 return 0;
164} 141}
165 142
166/* Generate a custom zonelist for the BIND policy. */ 143static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
167static struct zonelist *bind_zonelist(nodemask_t *nodes)
168{ 144{
169 struct zonelist *zl; 145 return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
170 int num, max, nd; 146}
171 enum zone_type k;
172 147
173 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); 148static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
174 max++; /* space for zlcache_ptr (see mmzone.h) */ 149 const nodemask_t *rel)
175 zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); 150{
176 if (!zl) 151 nodemask_t tmp;
177 return ERR_PTR(-ENOMEM); 152 nodes_fold(tmp, *orig, nodes_weight(*rel));
178 zl->zlcache_ptr = NULL; 153 nodes_onto(*ret, tmp, *rel);
179 num = 0; 154}
180 /* First put in the highest zones from all nodes, then all the next 155
181 lower zones etc. Avoid empty zones because the memory allocator 156static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
182 doesn't like them. If you implement node hot removal you 157{
183 have to fix that. */ 158 if (nodes_empty(*nodes))
184 k = MAX_NR_ZONES - 1; 159 return -EINVAL;
185 while (1) { 160 pol->v.nodes = *nodes;
186 for_each_node_mask(nd, *nodes) { 161 return 0;
187 struct zone *z = &NODE_DATA(nd)->node_zones[k]; 162}
188 if (z->present_pages > 0) 163
189 zl->zones[num++] = z; 164static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
190 } 165{
191 if (k == 0) 166 if (!nodes)
192 break; 167 pol->flags |= MPOL_F_LOCAL; /* local allocation */
193 k--; 168 else if (nodes_empty(*nodes))
194 } 169 return -EINVAL; /* no allowed nodes */
195 if (num == 0) { 170 else
196 kfree(zl); 171 pol->v.preferred_node = first_node(*nodes);
197 return ERR_PTR(-EINVAL); 172 return 0;
198 } 173}
199 zl->zones[num] = NULL; 174
200 return zl; 175static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
176{
177 if (!is_valid_nodemask(nodes))
178 return -EINVAL;
179 pol->v.nodes = *nodes;
180 return 0;
201} 181}
202 182
203/* Create a new policy */ 183/* Create a new policy */
204static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) 184static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
185 nodemask_t *nodes)
205{ 186{
206 struct mempolicy *policy; 187 struct mempolicy *policy;
188 nodemask_t cpuset_context_nmask;
189 int ret;
207 190
208 pr_debug("setting mode %d nodes[0] %lx\n", 191 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
209 mode, nodes ? nodes_addr(*nodes)[0] : -1); 192 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
210 193
211 if (mode == MPOL_DEFAULT) 194 if (mode == MPOL_DEFAULT) {
212 return NULL; 195 if (nodes && !nodes_empty(*nodes))
196 return ERR_PTR(-EINVAL);
197 return NULL; /* simply delete any existing policy */
198 }
199 VM_BUG_ON(!nodes);
200
201 /*
202 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
203 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
204 * All other modes require a valid pointer to a non-empty nodemask.
205 */
206 if (mode == MPOL_PREFERRED) {
207 if (nodes_empty(*nodes)) {
208 if (((flags & MPOL_F_STATIC_NODES) ||
209 (flags & MPOL_F_RELATIVE_NODES)))
210 return ERR_PTR(-EINVAL);
211 nodes = NULL; /* flag local alloc */
212 }
213 } else if (nodes_empty(*nodes))
214 return ERR_PTR(-EINVAL);
213 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 215 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
214 if (!policy) 216 if (!policy)
215 return ERR_PTR(-ENOMEM); 217 return ERR_PTR(-ENOMEM);
216 atomic_set(&policy->refcnt, 1); 218 atomic_set(&policy->refcnt, 1);
217 switch (mode) { 219 policy->mode = mode;
218 case MPOL_INTERLEAVE: 220 policy->flags = flags;
219 policy->v.nodes = *nodes; 221
220 if (nodes_weight(policy->v.nodes) == 0) { 222 if (nodes) {
221 kmem_cache_free(policy_cache, policy); 223 /*
222 return ERR_PTR(-EINVAL); 224 * cpuset related setup doesn't apply to local allocation
223 } 225 */
224 break; 226 cpuset_update_task_memory_state();
225 case MPOL_PREFERRED: 227 if (flags & MPOL_F_RELATIVE_NODES)
226 policy->v.preferred_node = first_node(*nodes); 228 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
227 if (policy->v.preferred_node >= MAX_NUMNODES) 229 &cpuset_current_mems_allowed);
228 policy->v.preferred_node = -1; 230 else
229 break; 231 nodes_and(cpuset_context_nmask, *nodes,
230 case MPOL_BIND: 232 cpuset_current_mems_allowed);
231 policy->v.zonelist = bind_zonelist(nodes); 233 if (mpol_store_user_nodemask(policy))
232 if (IS_ERR(policy->v.zonelist)) { 234 policy->w.user_nodemask = *nodes;
233 void *error_code = policy->v.zonelist; 235 else
234 kmem_cache_free(policy_cache, policy); 236 policy->w.cpuset_mems_allowed =
235 return error_code; 237 cpuset_mems_allowed(current);
236 } 238 }
237 break; 239
240 ret = mpol_ops[mode].create(policy,
241 nodes ? &cpuset_context_nmask : NULL);
242 if (ret < 0) {
243 kmem_cache_free(policy_cache, policy);
244 return ERR_PTR(ret);
238 } 245 }
239 policy->policy = mode;
240 policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
241 return policy; 246 return policy;
242} 247}
243 248
249/* Slow path of a mpol destructor. */
250void __mpol_put(struct mempolicy *p)
251{
252 if (!atomic_dec_and_test(&p->refcnt))
253 return;
254 kmem_cache_free(policy_cache, p);
255}
256
257static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
258{
259}
260
261static void mpol_rebind_nodemask(struct mempolicy *pol,
262 const nodemask_t *nodes)
263{
264 nodemask_t tmp;
265
266 if (pol->flags & MPOL_F_STATIC_NODES)
267 nodes_and(tmp, pol->w.user_nodemask, *nodes);
268 else if (pol->flags & MPOL_F_RELATIVE_NODES)
269 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
270 else {
271 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
272 *nodes);
273 pol->w.cpuset_mems_allowed = *nodes;
274 }
275
276 pol->v.nodes = tmp;
277 if (!node_isset(current->il_next, tmp)) {
278 current->il_next = next_node(current->il_next, tmp);
279 if (current->il_next >= MAX_NUMNODES)
280 current->il_next = first_node(tmp);
281 if (current->il_next >= MAX_NUMNODES)
282 current->il_next = numa_node_id();
283 }
284}
285
286static void mpol_rebind_preferred(struct mempolicy *pol,
287 const nodemask_t *nodes)
288{
289 nodemask_t tmp;
290
291 if (pol->flags & MPOL_F_STATIC_NODES) {
292 int node = first_node(pol->w.user_nodemask);
293
294 if (node_isset(node, *nodes)) {
295 pol->v.preferred_node = node;
296 pol->flags &= ~MPOL_F_LOCAL;
297 } else
298 pol->flags |= MPOL_F_LOCAL;
299 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
300 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
301 pol->v.preferred_node = first_node(tmp);
302 } else if (!(pol->flags & MPOL_F_LOCAL)) {
303 pol->v.preferred_node = node_remap(pol->v.preferred_node,
304 pol->w.cpuset_mems_allowed,
305 *nodes);
306 pol->w.cpuset_mems_allowed = *nodes;
307 }
308}
309
310/* Migrate a policy to a different set of nodes */
311static void mpol_rebind_policy(struct mempolicy *pol,
312 const nodemask_t *newmask)
313{
314 if (!pol)
315 return;
316 if (!mpol_store_user_nodemask(pol) &&
317 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
318 return;
319 mpol_ops[pol->mode].rebind(pol, newmask);
320}
321
322/*
323 * Wrapper for mpol_rebind_policy() that just requires task
324 * pointer, and updates task mempolicy.
325 */
326
327void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
328{
329 mpol_rebind_policy(tsk->mempolicy, new);
330}
331
332/*
333 * Rebind each vma in mm to new nodemask.
334 *
335 * Call holding a reference to mm. Takes mm->mmap_sem during call.
336 */
337
338void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
339{
340 struct vm_area_struct *vma;
341
342 down_write(&mm->mmap_sem);
343 for (vma = mm->mmap; vma; vma = vma->vm_next)
344 mpol_rebind_policy(vma->vm_policy, new);
345 up_write(&mm->mmap_sem);
346}
347
348static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
349 [MPOL_DEFAULT] = {
350 .rebind = mpol_rebind_default,
351 },
352 [MPOL_INTERLEAVE] = {
353 .create = mpol_new_interleave,
354 .rebind = mpol_rebind_nodemask,
355 },
356 [MPOL_PREFERRED] = {
357 .create = mpol_new_preferred,
358 .rebind = mpol_rebind_preferred,
359 },
360 [MPOL_BIND] = {
361 .create = mpol_new_bind,
362 .rebind = mpol_rebind_nodemask,
363 },
364};
365
244static void gather_stats(struct page *, void *, int pte_dirty); 366static void gather_stats(struct page *, void *, int pte_dirty);
245static void migrate_page_add(struct page *page, struct list_head *pagelist, 367static void migrate_page_add(struct page *page, struct list_head *pagelist,
246 unsigned long flags); 368 unsigned long flags);
@@ -421,7 +543,7 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
421 if (!err) { 543 if (!err) {
422 mpol_get(new); 544 mpol_get(new);
423 vma->vm_policy = new; 545 vma->vm_policy = new;
424 mpol_free(old); 546 mpol_put(old);
425 } 547 }
426 return err; 548 return err;
427} 549}
@@ -479,46 +601,55 @@ static void mpol_set_task_struct_flag(void)
479} 601}
480 602
481/* Set the process memory policy */ 603/* Set the process memory policy */
482static long do_set_mempolicy(int mode, nodemask_t *nodes) 604static long do_set_mempolicy(unsigned short mode, unsigned short flags,
605 nodemask_t *nodes)
483{ 606{
484 struct mempolicy *new; 607 struct mempolicy *new;
608 struct mm_struct *mm = current->mm;
485 609
486 if (mpol_check_policy(mode, nodes)) 610 new = mpol_new(mode, flags, nodes);
487 return -EINVAL;
488 new = mpol_new(mode, nodes);
489 if (IS_ERR(new)) 611 if (IS_ERR(new))
490 return PTR_ERR(new); 612 return PTR_ERR(new);
491 mpol_free(current->mempolicy); 613
614 /*
615 * prevent changing our mempolicy while show_numa_maps()
616 * is using it.
617 * Note: do_set_mempolicy() can be called at init time
618 * with no 'mm'.
619 */
620 if (mm)
621 down_write(&mm->mmap_sem);
622 mpol_put(current->mempolicy);
492 current->mempolicy = new; 623 current->mempolicy = new;
493 mpol_set_task_struct_flag(); 624 mpol_set_task_struct_flag();
494 if (new && new->policy == MPOL_INTERLEAVE) 625 if (new && new->mode == MPOL_INTERLEAVE &&
626 nodes_weight(new->v.nodes))
495 current->il_next = first_node(new->v.nodes); 627 current->il_next = first_node(new->v.nodes);
628 if (mm)
629 up_write(&mm->mmap_sem);
630
496 return 0; 631 return 0;
497} 632}
498 633
499/* Fill a zone bitmap for a policy */ 634/*
500static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) 635 * Return nodemask for policy for get_mempolicy() query
636 */
637static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
501{ 638{
502 int i;
503
504 nodes_clear(*nodes); 639 nodes_clear(*nodes);
505 switch (p->policy) { 640 if (p == &default_policy)
641 return;
642
643 switch (p->mode) {
506 case MPOL_BIND: 644 case MPOL_BIND:
507 for (i = 0; p->v.zonelist->zones[i]; i++) 645 /* Fall through */
508 node_set(zone_to_nid(p->v.zonelist->zones[i]),
509 *nodes);
510 break;
511 case MPOL_DEFAULT:
512 break;
513 case MPOL_INTERLEAVE: 646 case MPOL_INTERLEAVE:
514 *nodes = p->v.nodes; 647 *nodes = p->v.nodes;
515 break; 648 break;
516 case MPOL_PREFERRED: 649 case MPOL_PREFERRED:
517 /* or use current node instead of memory_map? */ 650 if (!(p->flags & MPOL_F_LOCAL))
518 if (p->v.preferred_node < 0)
519 *nodes = node_states[N_HIGH_MEMORY];
520 else
521 node_set(p->v.preferred_node, *nodes); 651 node_set(p->v.preferred_node, *nodes);
652 /* else return empty node mask for local allocation */
522 break; 653 break;
523 default: 654 default:
524 BUG(); 655 BUG();
@@ -561,6 +692,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
561 } 692 }
562 693
563 if (flags & MPOL_F_ADDR) { 694 if (flags & MPOL_F_ADDR) {
695 /*
696 * Do NOT fall back to task policy if the
697 * vma/shared policy at addr is NULL. We
698 * want to return MPOL_DEFAULT in this case.
699 */
564 down_read(&mm->mmap_sem); 700 down_read(&mm->mmap_sem);
565 vma = find_vma_intersection(mm, addr, addr+1); 701 vma = find_vma_intersection(mm, addr, addr+1);
566 if (!vma) { 702 if (!vma) {
@@ -575,7 +711,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
575 return -EINVAL; 711 return -EINVAL;
576 712
577 if (!pol) 713 if (!pol)
578 pol = &default_policy; 714 pol = &default_policy; /* indicates default behavior */
579 715
580 if (flags & MPOL_F_NODE) { 716 if (flags & MPOL_F_NODE) {
581 if (flags & MPOL_F_ADDR) { 717 if (flags & MPOL_F_ADDR) {
@@ -584,14 +720,17 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
584 goto out; 720 goto out;
585 *policy = err; 721 *policy = err;
586 } else if (pol == current->mempolicy && 722 } else if (pol == current->mempolicy &&
587 pol->policy == MPOL_INTERLEAVE) { 723 pol->mode == MPOL_INTERLEAVE) {
588 *policy = current->il_next; 724 *policy = current->il_next;
589 } else { 725 } else {
590 err = -EINVAL; 726 err = -EINVAL;
591 goto out; 727 goto out;
592 } 728 }
593 } else 729 } else {
594 *policy = pol->policy; 730 *policy = pol == &default_policy ? MPOL_DEFAULT :
731 pol->mode;
732 *policy |= pol->flags;
733 }
595 734
596 if (vma) { 735 if (vma) {
597 up_read(&current->mm->mmap_sem); 736 up_read(&current->mm->mmap_sem);
@@ -600,9 +739,10 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
600 739
601 err = 0; 740 err = 0;
602 if (nmask) 741 if (nmask)
603 get_zonemask(pol, nmask); 742 get_policy_nodemask(pol, nmask);
604 743
605 out: 744 out:
745 mpol_cond_put(pol);
606 if (vma) 746 if (vma)
607 up_read(&current->mm->mmap_sem); 747 up_read(&current->mm->mmap_sem);
608 return err; 748 return err;
@@ -664,7 +804,7 @@ int do_migrate_pages(struct mm_struct *mm,
664 int err = 0; 804 int err = 0;
665 nodemask_t tmp; 805 nodemask_t tmp;
666 806
667 down_read(&mm->mmap_sem); 807 down_read(&mm->mmap_sem);
668 808
669 err = migrate_vmas(mm, from_nodes, to_nodes, flags); 809 err = migrate_vmas(mm, from_nodes, to_nodes, flags);
670 if (err) 810 if (err)
@@ -781,8 +921,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
781#endif 921#endif
782 922
783static long do_mbind(unsigned long start, unsigned long len, 923static long do_mbind(unsigned long start, unsigned long len,
784 unsigned long mode, nodemask_t *nmask, 924 unsigned short mode, unsigned short mode_flags,
785 unsigned long flags) 925 nodemask_t *nmask, unsigned long flags)
786{ 926{
787 struct vm_area_struct *vma; 927 struct vm_area_struct *vma;
788 struct mm_struct *mm = current->mm; 928 struct mm_struct *mm = current->mm;
@@ -791,9 +931,8 @@ static long do_mbind(unsigned long start, unsigned long len,
791 int err; 931 int err;
792 LIST_HEAD(pagelist); 932 LIST_HEAD(pagelist);
793 933
794 if ((flags & ~(unsigned long)(MPOL_MF_STRICT | 934 if (flags & ~(unsigned long)(MPOL_MF_STRICT |
795 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 935 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
796 || mode > MPOL_MAX)
797 return -EINVAL; 936 return -EINVAL;
798 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 937 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
799 return -EPERM; 938 return -EPERM;
@@ -812,10 +951,7 @@ static long do_mbind(unsigned long start, unsigned long len,
812 if (end == start) 951 if (end == start)
813 return 0; 952 return 0;
814 953
815 if (mpol_check_policy(mode, nmask)) 954 new = mpol_new(mode, mode_flags, nmask);
816 return -EINVAL;
817
818 new = mpol_new(mode, nmask);
819 if (IS_ERR(new)) 955 if (IS_ERR(new))
820 return PTR_ERR(new); 956 return PTR_ERR(new);
821 957
@@ -826,8 +962,9 @@ static long do_mbind(unsigned long start, unsigned long len,
826 if (!new) 962 if (!new)
827 flags |= MPOL_MF_DISCONTIG_OK; 963 flags |= MPOL_MF_DISCONTIG_OK;
828 964
829 pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, 965 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
830 mode, nmask ? nodes_addr(*nmask)[0] : -1); 966 start, start + len, mode, mode_flags,
967 nmask ? nodes_addr(*nmask)[0] : -1);
831 968
832 down_write(&mm->mmap_sem); 969 down_write(&mm->mmap_sem);
833 vma = check_range(mm, start, end, nmask, 970 vma = check_range(mm, start, end, nmask,
@@ -848,7 +985,7 @@ static long do_mbind(unsigned long start, unsigned long len,
848 } 985 }
849 986
850 up_write(&mm->mmap_sem); 987 up_write(&mm->mmap_sem);
851 mpol_free(new); 988 mpol_put(new);
852 return err; 989 return err;
853} 990}
854 991
@@ -926,11 +1063,19 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
926{ 1063{
927 nodemask_t nodes; 1064 nodemask_t nodes;
928 int err; 1065 int err;
1066 unsigned short mode_flags;
929 1067
1068 mode_flags = mode & MPOL_MODE_FLAGS;
1069 mode &= ~MPOL_MODE_FLAGS;
1070 if (mode >= MPOL_MAX)
1071 return -EINVAL;
1072 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1073 (mode_flags & MPOL_F_RELATIVE_NODES))
1074 return -EINVAL;
930 err = get_nodes(&nodes, nmask, maxnode); 1075 err = get_nodes(&nodes, nmask, maxnode);
931 if (err) 1076 if (err)
932 return err; 1077 return err;
933 return do_mbind(start, len, mode, &nodes, flags); 1078 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
934} 1079}
935 1080
936/* Set the process memory policy */ 1081/* Set the process memory policy */
@@ -939,13 +1084,18 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
939{ 1084{
940 int err; 1085 int err;
941 nodemask_t nodes; 1086 nodemask_t nodes;
1087 unsigned short flags;
942 1088
943 if (mode < 0 || mode > MPOL_MAX) 1089 flags = mode & MPOL_MODE_FLAGS;
1090 mode &= ~MPOL_MODE_FLAGS;
1091 if ((unsigned int)mode >= MPOL_MAX)
1092 return -EINVAL;
1093 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
944 return -EINVAL; 1094 return -EINVAL;
945 err = get_nodes(&nodes, nmask, maxnode); 1095 err = get_nodes(&nodes, nmask, maxnode);
946 if (err) 1096 if (err)
947 return err; 1097 return err;
948 return do_set_mempolicy(mode, &nodes); 1098 return do_set_mempolicy(mode, flags, &nodes);
949} 1099}
950 1100
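From userspace the new encoding is just an OR of one optional flag into the mode argument. A hedged illustration, assuming the MPOL_F_* constants are visible to the application (for example through an updated <numaif.h>):

#include <numaif.h>	/* assumed to provide set_mempolicy() and the new MPOL_F_* flags */

/* interleave over nodes 0-1, keeping the mask fixed across cpuset migrations */
static int set_static_interleave(void)
{
	unsigned long nodemask = (1UL << 0) | (1UL << 1);

	return set_mempolicy(MPOL_INTERLEAVE | MPOL_F_STATIC_NODES,
			     &nodemask, sizeof(nodemask) * 8);
}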
951asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, 1101asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
@@ -1131,59 +1281,75 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1131 * 1281 *
1132 * Returns effective policy for a VMA at specified address. 1282 * Returns effective policy for a VMA at specified address.
1133 * Falls back to @task or system default policy, as necessary. 1283 * Falls back to @task or system default policy, as necessary.
1134 * Returned policy has extra reference count if shared, vma, 1284 * Current or other task's task mempolicy and non-shared vma policies
1135 * or some other task's policy [show_numa_maps() can pass 1285 * are protected by the task's mmap_sem, which must be held for read by
1136 * @task != current]. It is the caller's responsibility to 1286 * the caller.
1137 * free the reference in these cases. 1287 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1288 * count--added by the get_policy() vm_op, as appropriate--to protect against
1289 * freeing by another task. It is the caller's responsibility to free the
1290 * extra reference for shared policies.
1138 */ 1291 */
1139static struct mempolicy * get_vma_policy(struct task_struct *task, 1292static struct mempolicy *get_vma_policy(struct task_struct *task,
1140 struct vm_area_struct *vma, unsigned long addr) 1293 struct vm_area_struct *vma, unsigned long addr)
1141{ 1294{
1142 struct mempolicy *pol = task->mempolicy; 1295 struct mempolicy *pol = task->mempolicy;
1143 int shared_pol = 0;
1144 1296
1145 if (vma) { 1297 if (vma) {
1146 if (vma->vm_ops && vma->vm_ops->get_policy) { 1298 if (vma->vm_ops && vma->vm_ops->get_policy) {
1147 pol = vma->vm_ops->get_policy(vma, addr); 1299 struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1148 shared_pol = 1; /* if pol non-NULL, add ref below */ 1300 addr);
1149 } else if (vma->vm_policy && 1301 if (vpol)
1150 vma->vm_policy->policy != MPOL_DEFAULT) 1302 pol = vpol;
1303 } else if (vma->vm_policy)
1151 pol = vma->vm_policy; 1304 pol = vma->vm_policy;
1152 } 1305 }
1153 if (!pol) 1306 if (!pol)
1154 pol = &default_policy; 1307 pol = &default_policy;
1155 else if (!shared_pol && pol != current->mempolicy)
1156 mpol_get(pol); /* vma or other task's policy */
1157 return pol; 1308 return pol;
1158} 1309}
1159 1310
1160/* Return a zonelist representing a mempolicy */ 1311/*
1161static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) 1312 * Return a nodemask representing a mempolicy for filtering nodes for
1313 * page allocation
1314 */
1315static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1162{ 1316{
1163 int nd; 1317 /* Lower zones don't get a nodemask applied for MPOL_BIND */
1318 if (unlikely(policy->mode == MPOL_BIND) &&
1319 gfp_zone(gfp) >= policy_zone &&
1320 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1321 return &policy->v.nodes;
1164 1322
1165 switch (policy->policy) { 1323 return NULL;
1324}
1325
1326/* Return a zonelist indicated by gfp for node representing a mempolicy */
1327static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1328{
1329 int nd = numa_node_id();
1330
1331 switch (policy->mode) {
1166 case MPOL_PREFERRED: 1332 case MPOL_PREFERRED:
1167 nd = policy->v.preferred_node; 1333 if (!(policy->flags & MPOL_F_LOCAL))
1168 if (nd < 0) 1334 nd = policy->v.preferred_node;
1169 nd = numa_node_id();
1170 break; 1335 break;
1171 case MPOL_BIND: 1336 case MPOL_BIND:
1172 /* Lower zones don't get a policy applied */ 1337 /*
1173 /* Careful: current->mems_allowed might have moved */ 1338 * Normally, MPOL_BIND allocations are node-local within the
1174 if (gfp_zone(gfp) >= policy_zone) 1339 * allowed nodemask. However, if __GFP_THISNODE is set and the
1175 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist)) 1340 * current node is part of the mask, we use the zonelist for
1176 return policy->v.zonelist; 1341 * the first node in the mask instead.
1177 /*FALL THROUGH*/ 1342 */
1343 if (unlikely(gfp & __GFP_THISNODE) &&
1344 unlikely(!node_isset(nd, policy->v.nodes)))
1345 nd = first_node(policy->v.nodes);
1346 break;
1178 case MPOL_INTERLEAVE: /* should not happen */ 1347 case MPOL_INTERLEAVE: /* should not happen */
1179 case MPOL_DEFAULT:
1180 nd = numa_node_id();
1181 break; 1348 break;
1182 default: 1349 default:
1183 nd = 0;
1184 BUG(); 1350 BUG();
1185 } 1351 }
1186 return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp); 1352 return node_zonelist(nd, gfp);
1187} 1353}
1188 1354
1189/* Do dynamic interleaving for a process */ 1355/* Do dynamic interleaving for a process */
@@ -1196,36 +1362,51 @@ static unsigned interleave_nodes(struct mempolicy *policy)
1196 next = next_node(nid, policy->v.nodes); 1362 next = next_node(nid, policy->v.nodes);
1197 if (next >= MAX_NUMNODES) 1363 if (next >= MAX_NUMNODES)
1198 next = first_node(policy->v.nodes); 1364 next = first_node(policy->v.nodes);
1199 me->il_next = next; 1365 if (next < MAX_NUMNODES)
1366 me->il_next = next;
1200 return nid; 1367 return nid;
1201} 1368}
1202 1369
1203/* 1370/*
1204 * Depending on the memory policy provide a node from which to allocate the 1371 * Depending on the memory policy provide a node from which to allocate the
1205 * next slab entry. 1372 * next slab entry.
1373 * @policy must be protected against freeing by the caller. If @policy is
1374 * the current task's mempolicy, this protection is implicit, as only the
1375 * task can change its policy. The system default policy requires no
1376 * such protection.
1206 */ 1377 */
1207unsigned slab_node(struct mempolicy *policy) 1378unsigned slab_node(struct mempolicy *policy)
1208{ 1379{
1209 int pol = policy ? policy->policy : MPOL_DEFAULT; 1380 if (!policy || policy->flags & MPOL_F_LOCAL)
1381 return numa_node_id();
1382
1383 switch (policy->mode) {
1384 case MPOL_PREFERRED:
1385 /*
1386 * handled MPOL_F_LOCAL above
1387 */
1388 return policy->v.preferred_node;
1210 1389
1211 switch (pol) {
1212 case MPOL_INTERLEAVE: 1390 case MPOL_INTERLEAVE:
1213 return interleave_nodes(policy); 1391 return interleave_nodes(policy);
1214 1392
1215 case MPOL_BIND: 1393 case MPOL_BIND: {
1216 /* 1394 /*
1217 * Follow bind policy behavior and start allocation at the 1395 * Follow bind policy behavior and start allocation at the
1218 * first node. 1396 * first node.
1219 */ 1397 */
1220 return zone_to_nid(policy->v.zonelist->zones[0]); 1398 struct zonelist *zonelist;
1221 1399 struct zone *zone;
1222 case MPOL_PREFERRED: 1400 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1223 if (policy->v.preferred_node >= 0) 1401 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1224 return policy->v.preferred_node; 1402 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1225 /* Fall through */ 1403 &policy->v.nodes,
1404 &zone);
1405 return zone->node;
1406 }
1226 1407
1227 default: 1408 default:
1228 return numa_node_id(); 1409 BUG();
1229 } 1410 }
1230} 1411}
1231 1412
@@ -1234,10 +1415,13 @@ static unsigned offset_il_node(struct mempolicy *pol,
1234 struct vm_area_struct *vma, unsigned long off) 1415 struct vm_area_struct *vma, unsigned long off)
1235{ 1416{
1236 unsigned nnodes = nodes_weight(pol->v.nodes); 1417 unsigned nnodes = nodes_weight(pol->v.nodes);
1237 unsigned target = (unsigned)off % nnodes; 1418 unsigned target;
1238 int c; 1419 int c;
1239 int nid = -1; 1420 int nid = -1;
1240 1421
1422 if (!nnodes)
1423 return numa_node_id();
1424 target = (unsigned int)off % nnodes;
1241 c = 0; 1425 c = 0;
1242 do { 1426 do {
1243 nid = next_node(nid, pol->v.nodes); 1427 nid = next_node(nid, pol->v.nodes);
@@ -1274,40 +1458,30 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
1274 * @vma = virtual memory area whose policy is sought 1458 * @vma = virtual memory area whose policy is sought
1275 * @addr = address in @vma for shared policy lookup and interleave policy 1459 * @addr = address in @vma for shared policy lookup and interleave policy
1276 * @gfp_flags = for requested zone 1460 * @gfp_flags = for requested zone
1277 * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy 1461 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1462 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1278 * 1463 *
1279 * Returns a zonelist suitable for a huge page allocation. 1464 * Returns a zonelist suitable for a huge page allocation and a pointer
1280 * If the effective policy is 'BIND, returns pointer to policy's zonelist. 1465 * to the struct mempolicy for conditional unref after allocation.
1281 * If it is also a policy for which get_vma_policy() returns an extra 1466 * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1282 * reference, we must hold that reference until after allocation. 1467 * @nodemask for filtering the zonelist.
1283 * In that case, return policy via @mpol so hugetlb allocation can drop
1284 * the reference. For non-'BIND referenced policies, we can/do drop the
1285 * reference here, so the caller doesn't need to know about the special case
1286 * for default and current task policy.
1287 */ 1468 */
1288struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, 1469struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1289 gfp_t gfp_flags, struct mempolicy **mpol) 1470 gfp_t gfp_flags, struct mempolicy **mpol,
1471 nodemask_t **nodemask)
1290{ 1472{
1291 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1292 struct zonelist *zl; 1473 struct zonelist *zl;
1293 1474
1294 *mpol = NULL; /* probably no unref needed */ 1475 *mpol = get_vma_policy(current, vma, addr);
1295 if (pol->policy == MPOL_INTERLEAVE) { 1476 *nodemask = NULL; /* assume !MPOL_BIND */
1296 unsigned nid;
1297
1298 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1299 if (unlikely(pol != &default_policy &&
1300 pol != current->mempolicy))
1301 __mpol_free(pol); /* finished with pol */
1302 return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
1303 }
1304 1477
1305 zl = zonelist_policy(GFP_HIGHUSER, pol); 1478 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1306 if (unlikely(pol != &default_policy && pol != current->mempolicy)) { 1479 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1307 if (pol->policy != MPOL_BIND) 1480 HPAGE_SHIFT), gfp_flags);
1308 __mpol_free(pol); /* finished with pol */ 1481 } else {
1309 else 1482 zl = policy_zonelist(gfp_flags, *mpol);
1310 *mpol = pol; /* unref needed after allocation */ 1483 if ((*mpol)->mode == MPOL_BIND)
1484 *nodemask = &(*mpol)->v.nodes;
1311 } 1485 }
1312 return zl; 1486 return zl;
1313} 1487}
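A condensed sketch of a caller following the convention described in the comment above. The real consumer is mm/hugetlb.c, which differs in detail (it dequeues from the hugetlb pool rather than calling the page allocator), so treat this purely as an illustration of the lookup/unref pattern.

static struct page *example_huge_alloc(struct vm_area_struct *vma,
				       unsigned long addr, gfp_t gfp)
{
	struct mempolicy *mpol;
	nodemask_t *nodemask;	/* non-NULL only for MPOL_BIND */
	struct zonelist *zl;
	struct page *page;

	zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
	page = __alloc_pages_nodemask(gfp, HUGETLB_PAGE_ORDER, zl, nodemask);
	mpol_cond_put(mpol);	/* drops the extra ref only for shared policies */
	return page;
}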
@@ -1321,9 +1495,9 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1321 struct zonelist *zl; 1495 struct zonelist *zl;
1322 struct page *page; 1496 struct page *page;
1323 1497
1324 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); 1498 zl = node_zonelist(nid, gfp);
1325 page = __alloc_pages(gfp, order, zl); 1499 page = __alloc_pages(gfp, order, zl);
1326 if (page && page_zone(page) == zl->zones[0]) 1500 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1327 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); 1501 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1328 return page; 1502 return page;
1329} 1503}
@@ -1358,28 +1532,27 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1358 1532
1359 cpuset_update_task_memory_state(); 1533 cpuset_update_task_memory_state();
1360 1534
1361 if (unlikely(pol->policy == MPOL_INTERLEAVE)) { 1535 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1362 unsigned nid; 1536 unsigned nid;
1363 1537
1364 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); 1538 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1365 if (unlikely(pol != &default_policy && 1539 mpol_cond_put(pol);
1366 pol != current->mempolicy))
1367 __mpol_free(pol); /* finished with pol */
1368 return alloc_page_interleave(gfp, 0, nid); 1540 return alloc_page_interleave(gfp, 0, nid);
1369 } 1541 }
1370 zl = zonelist_policy(gfp, pol); 1542 zl = policy_zonelist(gfp, pol);
1371 if (pol != &default_policy && pol != current->mempolicy) { 1543 if (unlikely(mpol_needs_cond_ref(pol))) {
1372 /* 1544 /*
1373 * slow path: ref counted policy -- shared or vma 1545 * slow path: ref counted shared policy
1374 */ 1546 */
1375 struct page *page = __alloc_pages(gfp, 0, zl); 1547 struct page *page = __alloc_pages_nodemask(gfp, 0,
1376 __mpol_free(pol); 1548 zl, policy_nodemask(gfp, pol));
1549 __mpol_put(pol);
1377 return page; 1550 return page;
1378 } 1551 }
1379 /* 1552 /*
1380 * fast path: default or task policy 1553 * fast path: default or task policy
1381 */ 1554 */
1382 return __alloc_pages(gfp, 0, zl); 1555 return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1383} 1556}
1384 1557
1385/** 1558/**
@@ -1409,22 +1582,28 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1409 cpuset_update_task_memory_state(); 1582 cpuset_update_task_memory_state();
1410 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1583 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1411 pol = &default_policy; 1584 pol = &default_policy;
1412 if (pol->policy == MPOL_INTERLEAVE) 1585
1586 /*
1587 * No reference counting needed for current->mempolicy
1588 * nor system default_policy
1589 */
1590 if (pol->mode == MPOL_INTERLEAVE)
1413 return alloc_page_interleave(gfp, order, interleave_nodes(pol)); 1591 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1414 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol)); 1592 return __alloc_pages_nodemask(gfp, order,
1593 policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1415} 1594}
1416EXPORT_SYMBOL(alloc_pages_current); 1595EXPORT_SYMBOL(alloc_pages_current);
1417 1596
1418/* 1597/*
1419 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it 1598 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1420 * rebinds the mempolicy it's copying by calling mpol_rebind_policy() 1599 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
1421 * with the mems_allowed returned by cpuset_mems_allowed(). This 1600 * with the mems_allowed returned by cpuset_mems_allowed(). This
1422 * keeps mempolicies cpuset relative after its cpuset moves. See 1601 * keeps mempolicies cpuset relative after its cpuset moves. See
1423 * further kernel/cpuset.c update_nodemask(). 1602 * further kernel/cpuset.c update_nodemask().
1424 */ 1603 */
1425 1604
1426/* Slow path of a mempolicy copy */ 1605/* Slow path of a mempolicy duplicate */
1427struct mempolicy *__mpol_copy(struct mempolicy *old) 1606struct mempolicy *__mpol_dup(struct mempolicy *old)
1428{ 1607{
1429 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 1608 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1430 1609
@@ -1436,55 +1615,64 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
1436 } 1615 }
1437 *new = *old; 1616 *new = *old;
1438 atomic_set(&new->refcnt, 1); 1617 atomic_set(&new->refcnt, 1);
1439 if (new->policy == MPOL_BIND) {
1440 int sz = ksize(old->v.zonelist);
1441 new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1442 if (!new->v.zonelist) {
1443 kmem_cache_free(policy_cache, new);
1444 return ERR_PTR(-ENOMEM);
1445 }
1446 }
1447 return new; 1618 return new;
1448} 1619}
1449 1620
1621/*
1622 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1623 * eliminate the MPOL_F_* flags that require conditional ref and
1624 * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly
1625 * after return. Use the returned value.
1626 *
1627 * Allows use of a mempolicy for, e.g., multiple allocations with a single
1628 * policy lookup, even if the policy needs/has extra ref on lookup.
1629 * shmem_readahead needs this.
1630 */
1631struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1632 struct mempolicy *frompol)
1633{
1634 if (!mpol_needs_cond_ref(frompol))
1635 return frompol;
1636
1637 *tompol = *frompol;
1638 tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */
1639 __mpol_put(frompol);
1640 return tompol;
1641}
1642
1643static int mpol_match_intent(const struct mempolicy *a,
1644 const struct mempolicy *b)
1645{
1646 if (a->flags != b->flags)
1647 return 0;
1648 if (!mpol_store_user_nodemask(a))
1649 return 1;
1650 return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1651}
1652
1450/* Slow path of a mempolicy comparison */ 1653/* Slow path of a mempolicy comparison */
1451int __mpol_equal(struct mempolicy *a, struct mempolicy *b) 1654int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1452{ 1655{
1453 if (!a || !b) 1656 if (!a || !b)
1454 return 0; 1657 return 0;
1455 if (a->policy != b->policy) 1658 if (a->mode != b->mode)
1456 return 0; 1659 return 0;
1457 switch (a->policy) { 1660 if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1458 case MPOL_DEFAULT: 1661 return 0;
1459 return 1; 1662 switch (a->mode) {
1663 case MPOL_BIND:
1664 /* Fall through */
1460 case MPOL_INTERLEAVE: 1665 case MPOL_INTERLEAVE:
1461 return nodes_equal(a->v.nodes, b->v.nodes); 1666 return nodes_equal(a->v.nodes, b->v.nodes);
1462 case MPOL_PREFERRED: 1667 case MPOL_PREFERRED:
1463 return a->v.preferred_node == b->v.preferred_node; 1668 return a->v.preferred_node == b->v.preferred_node &&
1464 case MPOL_BIND: { 1669 a->flags == b->flags;
1465 int i;
1466 for (i = 0; a->v.zonelist->zones[i]; i++)
1467 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1468 return 0;
1469 return b->v.zonelist->zones[i] == NULL;
1470 }
1471 default: 1670 default:
1472 BUG(); 1671 BUG();
1473 return 0; 1672 return 0;
1474 } 1673 }
1475} 1674}
1476 1675
1477/* Slow path of a mpol destructor. */
1478void __mpol_free(struct mempolicy *p)
1479{
1480 if (!atomic_dec_and_test(&p->refcnt))
1481 return;
1482 if (p->policy == MPOL_BIND)
1483 kfree(p->v.zonelist);
1484 p->policy = MPOL_DEFAULT;
1485 kmem_cache_free(policy_cache, p);
1486}
1487
1488/* 1676/*
1489 * Shared memory backing store policy support. 1677 * Shared memory backing store policy support.
1490 * 1678 *
@@ -1547,7 +1735,7 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1547 rb_link_node(&new->nd, parent, p); 1735 rb_link_node(&new->nd, parent, p);
1548 rb_insert_color(&new->nd, &sp->root); 1736 rb_insert_color(&new->nd, &sp->root);
1549 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, 1737 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1550 new->policy ? new->policy->policy : 0); 1738 new->policy ? new->policy->mode : 0);
1551} 1739}
1552 1740
1553/* Find shared policy intersecting idx */ 1741/* Find shared policy intersecting idx */
@@ -1573,7 +1761,7 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1573{ 1761{
1574 pr_debug("deleting %lx-l%lx\n", n->start, n->end); 1762 pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1575 rb_erase(&n->nd, &sp->root); 1763 rb_erase(&n->nd, &sp->root);
1576 mpol_free(n->policy); 1764 mpol_put(n->policy);
1577 kmem_cache_free(sn_cache, n); 1765 kmem_cache_free(sn_cache, n);
1578} 1766}
1579 1767
@@ -1587,6 +1775,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1587 n->start = start; 1775 n->start = start;
1588 n->end = end; 1776 n->end = end;
1589 mpol_get(pol); 1777 mpol_get(pol);
1778 pol->flags |= MPOL_F_SHARED; /* for unref */
1590 n->policy = pol; 1779 n->policy = pol;
1591 return n; 1780 return n;
1592} 1781}
@@ -1633,33 +1822,41 @@ restart:
1633 sp_insert(sp, new); 1822 sp_insert(sp, new);
1634 spin_unlock(&sp->lock); 1823 spin_unlock(&sp->lock);
1635 if (new2) { 1824 if (new2) {
1636 mpol_free(new2->policy); 1825 mpol_put(new2->policy);
1637 kmem_cache_free(sn_cache, new2); 1826 kmem_cache_free(sn_cache, new2);
1638 } 1827 }
1639 return 0; 1828 return 0;
1640} 1829}
1641 1830
1642void mpol_shared_policy_init(struct shared_policy *info, int policy, 1831/**
1643 nodemask_t *policy_nodes) 1832 * mpol_shared_policy_init - initialize shared policy for inode
1644{ 1833 * @sp: pointer to inode shared policy
1645 info->root = RB_ROOT; 1834 * @mpol: struct mempolicy to install
1646 spin_lock_init(&info->lock); 1835 *
1647 1836 * Install non-NULL @mpol in inode's shared policy rb-tree.
1648 if (policy != MPOL_DEFAULT) { 1837 * On entry, the current task has a reference on a non-NULL @mpol.
1649 struct mempolicy *newpol; 1838 * This must be released on exit.
1650 1839 */
1651 /* Falls back to MPOL_DEFAULT on any error */ 1840void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1652 newpol = mpol_new(policy, policy_nodes); 1841{
1653 if (!IS_ERR(newpol)) { 1842 sp->root = RB_ROOT; /* empty tree == default mempolicy */
1654 /* Create pseudo-vma that contains just the policy */ 1843 spin_lock_init(&sp->lock);
1655 struct vm_area_struct pvma; 1844
1656 1845 if (mpol) {
1657 memset(&pvma, 0, sizeof(struct vm_area_struct)); 1846 struct vm_area_struct pvma;
1658 /* Policy covers entire file */ 1847 struct mempolicy *new;
1659 pvma.vm_end = TASK_SIZE; 1848
1660 mpol_set_shared_policy(info, &pvma, newpol); 1849 /* contextualize the tmpfs mount point mempolicy */
1661 mpol_free(newpol); 1850 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1662 } 1851 mpol_put(mpol); /* drop our ref on sb mpol */
1852 if (IS_ERR(new))
1853 return; /* no valid nodemask intersection */
1854
1855 /* Create pseudo-vma that contains just the policy */
1856 memset(&pvma, 0, sizeof(struct vm_area_struct));
1857 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
1858 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1859 mpol_put(new); /* drop initial ref */
1663 } 1860 }
1664} 1861}
1665 1862
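On the caller side, roughly what a tmpfs inode-creation path would do; this is a hedged sketch and the exact shmem hook is not shown in this patch. mpol_shared_policy_init() consumes the reference it is handed, so the caller takes one first.

static void example_inode_policy_init(struct shared_policy *sp,
				      struct mempolicy *mnt_mpol)
{
	if (mnt_mpol)
		mpol_get(mnt_mpol);	/* init consumes this reference */
	mpol_shared_policy_init(sp, mnt_mpol);	/* NULL means default policy */
}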
@@ -1670,9 +1867,10 @@ int mpol_set_shared_policy(struct shared_policy *info,
1670 struct sp_node *new = NULL; 1867 struct sp_node *new = NULL;
1671 unsigned long sz = vma_pages(vma); 1868 unsigned long sz = vma_pages(vma);
1672 1869
1673 pr_debug("set_shared_policy %lx sz %lu %d %lx\n", 1870 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1674 vma->vm_pgoff, 1871 vma->vm_pgoff,
1675 sz, npol? npol->policy : -1, 1872 sz, npol ? npol->mode : -1,
1873 npol ? npol->flags : -1,
1676 npol ? nodes_addr(npol->v.nodes)[0] : -1); 1874 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1677 1875
1678 if (npol) { 1876 if (npol) {
@@ -1700,7 +1898,7 @@ void mpol_free_shared_policy(struct shared_policy *p)
1700 n = rb_entry(next, struct sp_node, nd); 1898 n = rb_entry(next, struct sp_node, nd);
1701 next = rb_next(&n->nd); 1899 next = rb_next(&n->nd);
1702 rb_erase(&n->nd, &p->root); 1900 rb_erase(&n->nd, &p->root);
1703 mpol_free(n->policy); 1901 mpol_put(n->policy);
1704 kmem_cache_free(sn_cache, n); 1902 kmem_cache_free(sn_cache, n);
1705 } 1903 }
1706 spin_unlock(&p->lock); 1904 spin_unlock(&p->lock);
@@ -1745,120 +1943,177 @@ void __init numa_policy_init(void)
1745 if (unlikely(nodes_empty(interleave_nodes))) 1943 if (unlikely(nodes_empty(interleave_nodes)))
1746 node_set(prefer, interleave_nodes); 1944 node_set(prefer, interleave_nodes);
1747 1945
1748 if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes)) 1946 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1749 printk("numa_policy_init: interleaving failed\n"); 1947 printk("numa_policy_init: interleaving failed\n");
1750} 1948}
1751 1949
1752/* Reset policy of current process to default */ 1950/* Reset policy of current process to default */
1753void numa_default_policy(void) 1951void numa_default_policy(void)
1754{ 1952{
1755 do_set_mempolicy(MPOL_DEFAULT, NULL); 1953 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1756} 1954}
1757 1955
1758/* Migrate a policy to a different set of nodes */ 1956/*
1759static void mpol_rebind_policy(struct mempolicy *pol, 1957 * Parse and format mempolicy from/to strings
1760 const nodemask_t *newmask) 1958 */
1761{
1762 nodemask_t *mpolmask;
1763 nodemask_t tmp;
1764 1959
1765 if (!pol) 1960/*
1766 return; 1961 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
1767 mpolmask = &pol->cpuset_mems_allowed; 1962 * Used only for mpol_parse_str() and mpol_to_str()
1768 if (nodes_equal(*mpolmask, *newmask)) 1963 */
1769 return; 1964#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
1965static const char * const policy_types[] =
1966 { "default", "prefer", "bind", "interleave", "local" };
1770 1967
1771 switch (pol->policy) {
1772 case MPOL_DEFAULT:
1773 break;
1774 case MPOL_INTERLEAVE:
1775 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1776 pol->v.nodes = tmp;
1777 *mpolmask = *newmask;
1778 current->il_next = node_remap(current->il_next,
1779 *mpolmask, *newmask);
1780 break;
1781 case MPOL_PREFERRED:
1782 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1783 *mpolmask, *newmask);
1784 *mpolmask = *newmask;
1785 break;
1786 case MPOL_BIND: {
1787 nodemask_t nodes;
1788 struct zone **z;
1789 struct zonelist *zonelist;
1790 1968
1969#ifdef CONFIG_TMPFS
1970/**
1971 * mpol_parse_str - parse string to mempolicy
1972 * @str: string containing mempolicy to parse
1973 * @mpol: pointer to struct mempolicy pointer, returned on success.
1974 * @no_context: flag whether to "contextualize" the mempolicy
1975 *
1976 * Format of input:
1977 * <mode>[=<flags>][:<nodelist>]
1978 *
1979 * if @no_context is true, save the input nodemask in w.user_nodemask in
1980 * the returned mempolicy. This will be used to "clone" the mempolicy in
1981 * a specific context [cpuset] at a later time. Used to parse tmpfs mpol
1982 * mount option. Note that if 'static' or 'relative' mode flags were
1983 * specified, the input nodemask will already have been saved. Saving
1984 * it again is redundant, but safe.
1985 *
1986 * On success, returns 0, else 1
1987 */
1988int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
1989{
1990 struct mempolicy *new = NULL;
1991 unsigned short uninitialized_var(mode);
1992 unsigned short uninitialized_var(mode_flags);
1993 nodemask_t nodes;
1994 char *nodelist = strchr(str, ':');
1995 char *flags = strchr(str, '=');
1996 int i;
1997 int err = 1;
1998
1999 if (nodelist) {
2000 /* NUL-terminate mode or flags string */
2001 *nodelist++ = '\0';
2002 if (nodelist_parse(nodelist, nodes))
2003 goto out;
2004 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2005 goto out;
2006 } else
1791 nodes_clear(nodes); 2007 nodes_clear(nodes);
1792 for (z = pol->v.zonelist->zones; *z; z++)
1793 node_set(zone_to_nid(*z), nodes);
1794 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1795 nodes = tmp;
1796 2008
1797 zonelist = bind_zonelist(&nodes); 2009 if (flags)
2010 *flags++ = '\0'; /* terminate mode string */
1798 2011
1799 /* If no mem, then zonelist is NULL and we keep old zonelist. 2012 for (i = 0; i <= MPOL_LOCAL; i++) {
1800 * If that old zonelist has no remaining mems_allowed nodes, 2013 if (!strcmp(str, policy_types[i])) {
1801 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT. 2014 mode = i;
1802 */ 2015 break;
2016 }
2017 }
2018 if (i > MPOL_LOCAL)
2019 goto out;
1803 2020
1804 if (!IS_ERR(zonelist)) { 2021 switch (mode) {
1805 /* Good - got mem - substitute new zonelist */ 2022 case MPOL_PREFERRED:
1806 kfree(pol->v.zonelist); 2023 /*
1807 pol->v.zonelist = zonelist; 2024 * Insist on a nodelist of one node only
2025 */
2026 if (nodelist) {
2027 char *rest = nodelist;
2028 while (isdigit(*rest))
2029 rest++;
2030 if (!*rest)
2031 err = 0;
1808 } 2032 }
1809 *mpolmask = *newmask;
1810 break; 2033 break;
1811 } 2034 case MPOL_INTERLEAVE:
1812 default: 2035 /*
1813 BUG(); 2036 * Default to online nodes with memory if no nodelist
2037 */
2038 if (!nodelist)
2039 nodes = node_states[N_HIGH_MEMORY];
2040 err = 0;
2041 break;
2042 case MPOL_LOCAL:
2043 /*
2044 * Don't allow a nodelist; mpol_new() checks flags
2045 */
2046 if (nodelist)
2047 goto out;
2048 mode = MPOL_PREFERRED;
1814 break; 2049 break;
1815 }
1816}
1817
1818/*
1819 * Wrapper for mpol_rebind_policy() that just requires task
1820 * pointer, and updates task mempolicy.
1821 */
1822 2050
1823void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 2051 /*
1824{ 2052 * case MPOL_BIND: mpol_new() enforces non-empty nodemask.
1825 mpol_rebind_policy(tsk->mempolicy, new); 2053 * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
1826} 2054 */
2055 }
1827 2056
1828/* 2057 mode_flags = 0;
1829 * Rebind each vma in mm to new nodemask. 2058 if (flags) {
1830 * 2059 /*
1831 * Call holding a reference to mm. Takes mm->mmap_sem during call. 2060 * Currently, we only support two mutually exclusive
1832 */ 2061 * mode flags.
2062 */
2063 if (!strcmp(flags, "static"))
2064 mode_flags |= MPOL_F_STATIC_NODES;
2065 else if (!strcmp(flags, "relative"))
2066 mode_flags |= MPOL_F_RELATIVE_NODES;
2067 else
2068 err = 1;
2069 }
1833 2070
1834void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) 2071 new = mpol_new(mode, mode_flags, &nodes);
1835{ 2072 if (IS_ERR(new))
1836 struct vm_area_struct *vma; 2073 err = 1;
2074 else if (no_context)
2075 new->w.user_nodemask = nodes; /* save for contextualization */
1837 2076
1838 down_write(&mm->mmap_sem); 2077out:
1839 for (vma = mm->mmap; vma; vma = vma->vm_next) 2078 /* Restore string for error message */
1840 mpol_rebind_policy(vma->vm_policy, new); 2079 if (nodelist)
1841 up_write(&mm->mmap_sem); 2080 *--nodelist = ':';
2081 if (flags)
2082 *--flags = '=';
2083 if (!err)
2084 *mpol = new;
2085 return err;
1842} 2086}
2087#endif /* CONFIG_TMPFS */
1843 2088
1844/* 2089/**
1845 * Display pages allocated per node and memory policy via /proc. 2090 * mpol_to_str - format a mempolicy structure for printing
1846 */ 2091 * @buffer: to contain formatted mempolicy string
1847 2092 * @maxlen: length of @buffer
1848static const char * const policy_types[] = 2093 * @pol: pointer to mempolicy to be formatted
1849 { "default", "prefer", "bind", "interleave" }; 2094 * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
1850 2095 *
1851/*
1852 * Convert a mempolicy into a string. 2096 * Convert a mempolicy into a string.
1853 * Returns the number of characters in buffer (if positive) 2097 * Returns the number of characters in buffer (if positive)
1854 * or an error (negative) 2098 * or an error (negative)
1855 */ 2099 */
1856static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) 2100int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
1857{ 2101{
1858 char *p = buffer; 2102 char *p = buffer;
1859 int l; 2103 int l;
1860 nodemask_t nodes; 2104 nodemask_t nodes;
1861 int mode = pol ? pol->policy : MPOL_DEFAULT; 2105 unsigned short mode;
2106 unsigned short flags = pol ? pol->flags : 0;
2107
2108 /*
2109 * Sanity check: room for longest mode, flag and some nodes
2110 */
2111 VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2112
2113 if (!pol || pol == &default_policy)
2114 mode = MPOL_DEFAULT;
2115 else
2116 mode = pol->mode;
1862 2117
1863 switch (mode) { 2118 switch (mode) {
1864 case MPOL_DEFAULT: 2119 case MPOL_DEFAULT:
@@ -1867,33 +2122,50 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1867 2122
1868 case MPOL_PREFERRED: 2123 case MPOL_PREFERRED:
1869 nodes_clear(nodes); 2124 nodes_clear(nodes);
1870 node_set(pol->v.preferred_node, nodes); 2125 if (flags & MPOL_F_LOCAL)
2126 mode = MPOL_LOCAL; /* pseudo-policy */
2127 else
2128 node_set(pol->v.preferred_node, nodes);
1871 break; 2129 break;
1872 2130
1873 case MPOL_BIND: 2131 case MPOL_BIND:
1874 get_zonemask(pol, &nodes); 2132 /* Fall through */
1875 break;
1876
1877 case MPOL_INTERLEAVE: 2133 case MPOL_INTERLEAVE:
1878 nodes = pol->v.nodes; 2134 if (no_context)
2135 nodes = pol->w.user_nodemask;
2136 else
2137 nodes = pol->v.nodes;
1879 break; 2138 break;
1880 2139
1881 default: 2140 default:
1882 BUG(); 2141 BUG();
1883 return -EFAULT;
1884 } 2142 }
1885 2143
1886 l = strlen(policy_types[mode]); 2144 l = strlen(policy_types[mode]);
1887 if (buffer + maxlen < p + l + 1) 2145 if (buffer + maxlen < p + l + 1)
1888 return -ENOSPC; 2146 return -ENOSPC;
1889 2147
1890 strcpy(p, policy_types[mode]); 2148 strcpy(p, policy_types[mode]);
1891 p += l; 2149 p += l;
1892 2150
1893 if (!nodes_empty(nodes)) { 2151 if (flags & MPOL_MODE_FLAGS) {
1894 if (buffer + maxlen < p + 2) 2152 if (buffer + maxlen < p + 2)
1895 return -ENOSPC; 2153 return -ENOSPC;
1896 *p++ = '='; 2154 *p++ = '=';
2155
2156 /*
2157 * Currently, the only defined flags are mutually exclusive
2158 */
2159 if (flags & MPOL_F_STATIC_NODES)
2160 p += snprintf(p, buffer + maxlen - p, "static");
2161 else if (flags & MPOL_F_RELATIVE_NODES)
2162 p += snprintf(p, buffer + maxlen - p, "relative");
2163 }
2164
2165 if (!nodes_empty(nodes)) {
2166 if (buffer + maxlen < p + 2)
2167 return -ENOSPC;
2168 *p++ = ':';
1897 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); 2169 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1898 } 2170 }
1899 return p - buffer; 2171 return p - buffer;
@@ -1971,6 +2243,9 @@ static inline void check_huge_range(struct vm_area_struct *vma,
1971} 2243}
1972#endif 2244#endif
1973 2245
2246/*
2247 * Display pages allocated per node and memory policy via /proc.
2248 */
1974int show_numa_map(struct seq_file *m, void *v) 2249int show_numa_map(struct seq_file *m, void *v)
1975{ 2250{
1976 struct proc_maps_private *priv = m->private; 2251 struct proc_maps_private *priv = m->private;
@@ -1990,12 +2265,8 @@ int show_numa_map(struct seq_file *m, void *v)
1990 return 0; 2265 return 0;
1991 2266
1992 pol = get_vma_policy(priv->task, vma, vma->vm_start); 2267 pol = get_vma_policy(priv->task, vma, vma->vm_start);
1993 mpol_to_str(buffer, sizeof(buffer), pol); 2268 mpol_to_str(buffer, sizeof(buffer), pol, 0);
1994 /* 2269 mpol_cond_put(pol);
1995 * unref shared or other task's mempolicy
1996 */
1997 if (pol != &default_policy && pol != current->mempolicy)
1998 __mpol_free(pol);
1999 2270
2000 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 2271 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2001 2272
diff --git a/mm/mincore.c b/mm/mincore.c
index 5efe0ded69b1..5178800bc129 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -33,7 +33,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
33 * When tmpfs swaps out a page from a file, any process mapping that 33 * When tmpfs swaps out a page from a file, any process mapping that
34 * file will not get a swp_entry_t in its pte, but rather it is like 34 * file will not get a swp_entry_t in its pte, but rather it is like
35 * any other file mapping (ie. marked !present and faulted in with 35 * any other file mapping (ie. marked !present and faulted in with
36 * tmpfs's .nopage). So swapped out tmpfs mappings are tested here. 36 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
37 * 37 *
38 * However when tmpfs moves the page from pagecache and into swapcache, 38 * However when tmpfs moves the page from pagecache and into swapcache,
39 * it is still in core, but the find_get_page below won't find it. 39 * it is still in core, but the find_get_page below won't find it.
diff --git a/mm/mmap.c b/mm/mmap.c
index a32d28ce31cd..677d184b0d42 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -232,7 +232,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
232 vma->vm_ops->close(vma); 232 vma->vm_ops->close(vma);
233 if (vma->vm_file) 233 if (vma->vm_file)
234 fput(vma->vm_file); 234 fput(vma->vm_file);
235 mpol_free(vma_policy(vma)); 235 mpol_put(vma_policy(vma));
236 kmem_cache_free(vm_area_cachep, vma); 236 kmem_cache_free(vm_area_cachep, vma);
237 return next; 237 return next;
238} 238}
@@ -626,7 +626,7 @@ again: remove_next = 1 + (end > next->vm_end);
626 if (file) 626 if (file)
627 fput(file); 627 fput(file);
628 mm->map_count--; 628 mm->map_count--;
629 mpol_free(vma_policy(next)); 629 mpol_put(vma_policy(next));
630 kmem_cache_free(vm_area_cachep, next); 630 kmem_cache_free(vm_area_cachep, next);
631 /* 631 /*
632 * In mprotect's case 6 (see comments on vma_merge), 632 * In mprotect's case 6 (see comments on vma_merge),
@@ -1068,7 +1068,6 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
1068 mapping_cap_account_dirty(vma->vm_file->f_mapping); 1068 mapping_cap_account_dirty(vma->vm_file->f_mapping);
1069} 1069}
1070 1070
1071
1072unsigned long mmap_region(struct file *file, unsigned long addr, 1071unsigned long mmap_region(struct file *file, unsigned long addr,
1073 unsigned long len, unsigned long flags, 1072 unsigned long len, unsigned long flags,
1074 unsigned int vm_flags, unsigned long pgoff, 1073 unsigned int vm_flags, unsigned long pgoff,
@@ -1181,22 +1180,20 @@ munmap_back:
1181 if (vma_wants_writenotify(vma)) 1180 if (vma_wants_writenotify(vma))
1182 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); 1181 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
1183 1182
1184 if (!file || !vma_merge(mm, prev, addr, vma->vm_end, 1183 if (file && vma_merge(mm, prev, addr, vma->vm_end,
1185 vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { 1184 vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
1186 file = vma->vm_file; 1185 mpol_put(vma_policy(vma));
1187 vma_link(mm, vma, prev, rb_link, rb_parent);
1188 if (correct_wcount)
1189 atomic_inc(&inode->i_writecount);
1190 } else {
1191 if (file) {
1192 if (correct_wcount)
1193 atomic_inc(&inode->i_writecount);
1194 fput(file);
1195 }
1196 mpol_free(vma_policy(vma));
1197 kmem_cache_free(vm_area_cachep, vma); 1186 kmem_cache_free(vm_area_cachep, vma);
1187 fput(file);
1188 } else {
1189 vma_link(mm, vma, prev, rb_link, rb_parent);
1190 file = vma->vm_file;
1198 } 1191 }
1199out: 1192
1193 /* Once vma denies write, undo our temporary denial count */
1194 if (correct_wcount)
1195 atomic_inc(&inode->i_writecount);
1196out:
1200 mm->total_vm += len >> PAGE_SHIFT; 1197 mm->total_vm += len >> PAGE_SHIFT;
1201 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1198 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1202 if (vm_flags & VM_LOCKED) { 1199 if (vm_flags & VM_LOCKED) {
@@ -1813,7 +1810,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1813 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); 1810 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
1814 } 1811 }
1815 1812
1816 pol = mpol_copy(vma_policy(vma)); 1813 pol = mpol_dup(vma_policy(vma));
1817 if (IS_ERR(pol)) { 1814 if (IS_ERR(pol)) {
1818 kmem_cache_free(vm_area_cachep, new); 1815 kmem_cache_free(vm_area_cachep, new);
1819 return PTR_ERR(pol); 1816 return PTR_ERR(pol);
@@ -2129,7 +2126,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2129 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2126 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2130 if (new_vma) { 2127 if (new_vma) {
2131 *new_vma = *vma; 2128 *new_vma = *vma;
2132 pol = mpol_copy(vma_policy(vma)); 2129 pol = mpol_dup(vma_policy(vma));
2133 if (IS_ERR(pol)) { 2130 if (IS_ERR(pol)) {
2134 kmem_cache_free(vm_area_cachep, new_vma); 2131 kmem_cache_free(vm_area_cachep, new_vma);
2135 return NULL; 2132 return NULL;
diff --git a/mm/mmzone.c b/mm/mmzone.c
index eb5838634f18..486ed595ee6f 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -42,3 +42,33 @@ struct zone *next_zone(struct zone *zone)
42 return zone; 42 return zone;
43} 43}
44 44
45static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
46{
47#ifdef CONFIG_NUMA
48 return node_isset(zonelist_node_idx(zref), *nodes);
49#else
50 return 1;
51#endif /* CONFIG_NUMA */
52}
53
54/* Returns the next zone at or below highest_zoneidx in a zonelist */
55struct zoneref *next_zones_zonelist(struct zoneref *z,
56 enum zone_type highest_zoneidx,
57 nodemask_t *nodes,
58 struct zone **zone)
59{
60 /*
61 * Find the next suitable zone to use for the allocation.
62 * Only filter based on nodemask if it's set
63 */
64 if (likely(nodes == NULL))
65 while (zonelist_zone_idx(z) > highest_zoneidx)
66 z++;
67 else
68 while (zonelist_zone_idx(z) > highest_zoneidx ||
69 (z->zone && !zref_in_nodemask(z, nodes)))
70 z++;
71
72 *zone = zonelist_zone(z++);
73 return z;
74}
diff --git a/mm/nommu.c b/mm/nommu.c
index 5d8ae086f74e..1d32fe89d57b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -105,7 +105,11 @@ unsigned int kobjsize(const void *objp)
105{ 105{
106 struct page *page; 106 struct page *page;
107 107
108 if (!objp || !((page = virt_to_page(objp)))) 108 /*
109 * If the object we have should not have ksize performed on it,
110 * return size of 0
111 */
112 if (!objp || (unsigned long)objp >= memory_end || !((page = virt_to_page(objp))))
109 return 0; 113 return 0;
110 114
111 if (PageSlab(page)) 115 if (PageSlab(page))
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index beb592fe9389..8a5467ee6265 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -53,8 +53,7 @@ static DEFINE_SPINLOCK(zone_scan_mutex);
53 * of least surprise ... (be careful when you change it) 53 * of least surprise ... (be careful when you change it)
54 */ 54 */
55 55
56unsigned long badness(struct task_struct *p, unsigned long uptime, 56unsigned long badness(struct task_struct *p, unsigned long uptime)
57 struct mem_cgroup *mem)
58{ 57{
59 unsigned long points, cpu_time, run_time, s; 58 unsigned long points, cpu_time, run_time, s;
60 struct mm_struct *mm; 59 struct mm_struct *mm;
@@ -175,12 +174,14 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
175 gfp_t gfp_mask) 174 gfp_t gfp_mask)
176{ 175{
177#ifdef CONFIG_NUMA 176#ifdef CONFIG_NUMA
178 struct zone **z; 177 struct zone *zone;
178 struct zoneref *z;
179 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
179 nodemask_t nodes = node_states[N_HIGH_MEMORY]; 180 nodemask_t nodes = node_states[N_HIGH_MEMORY];
180 181
181 for (z = zonelist->zones; *z; z++) 182 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
182 if (cpuset_zone_allowed_softwall(*z, gfp_mask)) 183 if (cpuset_zone_allowed_softwall(zone, gfp_mask))
183 node_clear(zone_to_nid(*z), nodes); 184 node_clear(zone_to_nid(zone), nodes);
184 else 185 else
185 return CONSTRAINT_CPUSET; 186 return CONSTRAINT_CPUSET;
186 187
@@ -254,7 +255,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
254 if (p->oomkilladj == OOM_DISABLE) 255 if (p->oomkilladj == OOM_DISABLE)
255 continue; 256 continue;
256 257
257 points = badness(p, uptime.tv_sec, mem); 258 points = badness(p, uptime.tv_sec);
258 if (points > *ppoints || !chosen) { 259 if (points > *ppoints || !chosen) {
259 chosen = p; 260 chosen = p;
260 *ppoints = points; 261 *ppoints = points;
@@ -460,29 +461,29 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
460 * if a parallel OOM killing is already taking place that includes a zone in 461 * if a parallel OOM killing is already taking place that includes a zone in
461 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. 462 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
462 */ 463 */
463int try_set_zone_oom(struct zonelist *zonelist) 464int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
464{ 465{
465 struct zone **z; 466 struct zoneref *z;
467 struct zone *zone;
466 int ret = 1; 468 int ret = 1;
467 469
468 z = zonelist->zones;
469
470 spin_lock(&zone_scan_mutex); 470 spin_lock(&zone_scan_mutex);
471 do { 471 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
472 if (zone_is_oom_locked(*z)) { 472 if (zone_is_oom_locked(zone)) {
473 ret = 0; 473 ret = 0;
474 goto out; 474 goto out;
475 } 475 }
476 } while (*(++z) != NULL); 476 }
477
478 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
479 /*
480 * Lock each zone in the zonelist under zone_scan_mutex so a
481 * parallel invocation of try_set_zone_oom() doesn't succeed
482 * when it shouldn't.
483 */
484 zone_set_flag(zone, ZONE_OOM_LOCKED);
485 }
477 486
478 /*
479 * Lock each zone in the zonelist under zone_scan_mutex so a parallel
480 * invocation of try_set_zone_oom() doesn't succeed when it shouldn't.
481 */
482 z = zonelist->zones;
483 do {
484 zone_set_flag(*z, ZONE_OOM_LOCKED);
485 } while (*(++z) != NULL);
486out: 487out:
487 spin_unlock(&zone_scan_mutex); 488 spin_unlock(&zone_scan_mutex);
488 return ret; 489 return ret;
@@ -493,16 +494,15 @@ out:
493 * allocation attempts with zonelists containing them may now recall the OOM 494 * allocation attempts with zonelists containing them may now recall the OOM
494 * killer, if necessary. 495 * killer, if necessary.
495 */ 496 */
496void clear_zonelist_oom(struct zonelist *zonelist) 497void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
497{ 498{
498 struct zone **z; 499 struct zoneref *z;
499 500 struct zone *zone;
500 z = zonelist->zones;
501 501
502 spin_lock(&zone_scan_mutex); 502 spin_lock(&zone_scan_mutex);
503 do { 503 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
504 zone_clear_flag(*z, ZONE_OOM_LOCKED); 504 zone_clear_flag(zone, ZONE_OOM_LOCKED);
505 } while (*(++z) != NULL); 505 }
506 spin_unlock(&zone_scan_mutex); 506 spin_unlock(&zone_scan_mutex);
507} 507}
508 508
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 32e796af12a1..d1cf4f05dcda 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -546,7 +546,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
546/* 546/*
547 * permit the bootmem allocator to evade page validation on high-order frees 547 * permit the bootmem allocator to evade page validation on high-order frees
548 */ 548 */
549void __init __free_pages_bootmem(struct page *page, unsigned int order) 549void __free_pages_bootmem(struct page *page, unsigned int order)
550{ 550{
551 if (order == 0) { 551 if (order == 0) {
552 __ClearPageReserved(page); 552 __ClearPageReserved(page);
@@ -632,7 +632,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
632 if (PageReserved(page)) 632 if (PageReserved(page))
633 return 1; 633 return 1;
634 634
635 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead | 635 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
636 1 << PG_referenced | 1 << PG_arch_1 | 636 1 << PG_referenced | 1 << PG_arch_1 |
637 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); 637 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
638 set_page_private(page, 0); 638 set_page_private(page, 0);
@@ -1050,7 +1050,7 @@ void split_page(struct page *page, unsigned int order)
1050 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1050 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1051 * or two. 1051 * or two.
1052 */ 1052 */
1053static struct page *buffered_rmqueue(struct zonelist *zonelist, 1053static struct page *buffered_rmqueue(struct zone *preferred_zone,
1054 struct zone *zone, int order, gfp_t gfp_flags) 1054 struct zone *zone, int order, gfp_t gfp_flags)
1055{ 1055{
1056 unsigned long flags; 1056 unsigned long flags;
@@ -1102,7 +1102,7 @@ again:
1102 } 1102 }
1103 1103
1104 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1104 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1105 zone_statistics(zonelist, zone); 1105 zone_statistics(preferred_zone, zone);
1106 local_irq_restore(flags); 1106 local_irq_restore(flags);
1107 put_cpu(); 1107 put_cpu();
1108 1108
@@ -1284,7 +1284,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1284 if (!zlc) 1284 if (!zlc)
1285 return NULL; 1285 return NULL;
1286 1286
1287 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1287 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1288 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1288 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1289 zlc->last_full_zap = jiffies; 1289 zlc->last_full_zap = jiffies;
1290 } 1290 }
@@ -1317,7 +1317,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1317 * We are low on memory in the second scan, and should leave no stone 1317 * We are low on memory in the second scan, and should leave no stone
1318 * unturned looking for a free page. 1318 * unturned looking for a free page.
1319 */ 1319 */
1320static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, 1320static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1321 nodemask_t *allowednodes) 1321 nodemask_t *allowednodes)
1322{ 1322{
1323 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1323 struct zonelist_cache *zlc; /* cached zonelist speedup info */
@@ -1328,7 +1328,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1328 if (!zlc) 1328 if (!zlc)
1329 return 1; 1329 return 1;
1330 1330
1331 i = z - zonelist->zones; 1331 i = z - zonelist->_zonerefs;
1332 n = zlc->z_to_n[i]; 1332 n = zlc->z_to_n[i];
1333 1333
1334 /* This zone is worth trying if it is allowed but not full */ 1334 /* This zone is worth trying if it is allowed but not full */
@@ -1340,7 +1340,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1340 * zlc->fullzones, so that subsequent attempts to allocate a page 1340 * zlc->fullzones, so that subsequent attempts to allocate a page
1341 * from that zone don't waste time re-examining it. 1341 * from that zone don't waste time re-examining it.
1342 */ 1342 */
1343static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) 1343static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1344{ 1344{
1345 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1345 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1346 int i; /* index of *z in zonelist zones */ 1346 int i; /* index of *z in zonelist zones */
@@ -1349,7 +1349,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1349 if (!zlc) 1349 if (!zlc)
1350 return; 1350 return;
1351 1351
1352 i = z - zonelist->zones; 1352 i = z - zonelist->_zonerefs;
1353 1353
1354 set_bit(i, zlc->fullzones); 1354 set_bit(i, zlc->fullzones);
1355} 1355}
@@ -1361,13 +1361,13 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1361 return NULL; 1361 return NULL;
1362} 1362}
1363 1363
1364static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, 1364static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1365 nodemask_t *allowednodes) 1365 nodemask_t *allowednodes)
1366{ 1366{
1367 return 1; 1367 return 1;
1368} 1368}
1369 1369
1370static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) 1370static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1371{ 1371{
1372} 1372}
1373#endif /* CONFIG_NUMA */ 1373#endif /* CONFIG_NUMA */
@@ -1377,42 +1377,31 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1377 * a page. 1377 * a page.
1378 */ 1378 */
1379static struct page * 1379static struct page *
1380get_page_from_freelist(gfp_t gfp_mask, unsigned int order, 1380get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1381 struct zonelist *zonelist, int alloc_flags) 1381 struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
1382{ 1382{
1383 struct zone **z; 1383 struct zoneref *z;
1384 struct page *page = NULL; 1384 struct page *page = NULL;
1385 int classzone_idx = zone_idx(zonelist->zones[0]); 1385 int classzone_idx;
1386 struct zone *zone; 1386 struct zone *zone, *preferred_zone;
1387 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1387 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1388 int zlc_active = 0; /* set if using zonelist_cache */ 1388 int zlc_active = 0; /* set if using zonelist_cache */
1389 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1389 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1390 enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */ 1390
1391 (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
1392 &preferred_zone);
1393 classzone_idx = zone_idx(preferred_zone);
1391 1394
1392zonelist_scan: 1395zonelist_scan:
1393 /* 1396 /*
1394 * Scan zonelist, looking for a zone with enough free. 1397 * Scan zonelist, looking for a zone with enough free.
1395 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1398 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1396 */ 1399 */
1397 z = zonelist->zones; 1400 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1398 1401 high_zoneidx, nodemask) {
1399 do {
1400 /*
1401 * In NUMA, this could be a policy zonelist which contains
1402 * zones that may not be allowed by the current gfp_mask.
1403 * Check the zone is allowed by the current flags
1404 */
1405 if (unlikely(alloc_should_filter_zonelist(zonelist))) {
1406 if (highest_zoneidx == -1)
1407 highest_zoneidx = gfp_zone(gfp_mask);
1408 if (zone_idx(*z) > highest_zoneidx)
1409 continue;
1410 }
1411
1412 if (NUMA_BUILD && zlc_active && 1402 if (NUMA_BUILD && zlc_active &&
1413 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1403 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1414 continue; 1404 continue;
1415 zone = *z;
1416 if ((alloc_flags & ALLOC_CPUSET) && 1405 if ((alloc_flags & ALLOC_CPUSET) &&
1417 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1406 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1418 goto try_next_zone; 1407 goto try_next_zone;
@@ -1433,7 +1422,7 @@ zonelist_scan:
1433 } 1422 }
1434 } 1423 }
1435 1424
1436 page = buffered_rmqueue(zonelist, zone, order, gfp_mask); 1425 page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask);
1437 if (page) 1426 if (page)
1438 break; 1427 break;
1439this_zone_full: 1428this_zone_full:
@@ -1446,7 +1435,7 @@ try_next_zone:
1446 zlc_active = 1; 1435 zlc_active = 1;
1447 did_zlc_setup = 1; 1436 did_zlc_setup = 1;
1448 } 1437 }
1449 } while (*(++z) != NULL); 1438 }
1450 1439
1451 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1440 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1452 /* Disable zlc cache for second zonelist scan */ 1441 /* Disable zlc cache for second zonelist scan */
@@ -1459,12 +1448,14 @@ try_next_zone:
1459/* 1448/*
1460 * This is the 'heart' of the zoned buddy allocator. 1449 * This is the 'heart' of the zoned buddy allocator.
1461 */ 1450 */
1462struct page * 1451static struct page *
1463__alloc_pages(gfp_t gfp_mask, unsigned int order, 1452__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
1464 struct zonelist *zonelist) 1453 struct zonelist *zonelist, nodemask_t *nodemask)
1465{ 1454{
1466 const gfp_t wait = gfp_mask & __GFP_WAIT; 1455 const gfp_t wait = gfp_mask & __GFP_WAIT;
1467 struct zone **z; 1456 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1457 struct zoneref *z;
1458 struct zone *zone;
1468 struct page *page; 1459 struct page *page;
1469 struct reclaim_state reclaim_state; 1460 struct reclaim_state reclaim_state;
1470 struct task_struct *p = current; 1461 struct task_struct *p = current;
@@ -1478,9 +1469,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
1478 return NULL; 1469 return NULL;
1479 1470
1480restart: 1471restart:
1481 z = zonelist->zones; /* the list of zones suitable for gfp_mask */ 1472 z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */
1482 1473
1483 if (unlikely(*z == NULL)) { 1474 if (unlikely(!z->zone)) {
1484 /* 1475 /*
1485 * Happens if we have an empty zonelist as a result of 1476 * Happens if we have an empty zonelist as a result of
1486 * GFP_THISNODE being used on a memoryless node 1477 * GFP_THISNODE being used on a memoryless node
@@ -1488,8 +1479,8 @@ restart:
1488 return NULL; 1479 return NULL;
1489 } 1480 }
1490 1481
1491 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1482 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1492 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1483 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
1493 if (page) 1484 if (page)
1494 goto got_pg; 1485 goto got_pg;
1495 1486
@@ -1504,8 +1495,8 @@ restart:
1504 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1495 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1505 goto nopage; 1496 goto nopage;
1506 1497
1507 for (z = zonelist->zones; *z; z++) 1498 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1508 wakeup_kswapd(*z, order); 1499 wakeup_kswapd(zone, order);
1509 1500
1510 /* 1501 /*
1511 * OK, we're below the kswapd watermark and have kicked background 1502 * OK, we're below the kswapd watermark and have kicked background
@@ -1533,7 +1524,8 @@ restart:
1533 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 1524 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1534 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1525 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1535 */ 1526 */
1536 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); 1527 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1528 high_zoneidx, alloc_flags);
1537 if (page) 1529 if (page)
1538 goto got_pg; 1530 goto got_pg;
1539 1531
@@ -1545,8 +1537,8 @@ rebalance:
1545 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1537 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
1546nofail_alloc: 1538nofail_alloc:
1547 /* go through the zonelist yet again, ignoring mins */ 1539 /* go through the zonelist yet again, ignoring mins */
1548 page = get_page_from_freelist(gfp_mask, order, 1540 page = get_page_from_freelist(gfp_mask, nodemask, order,
1549 zonelist, ALLOC_NO_WATERMARKS); 1541 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
1550 if (page) 1542 if (page)
1551 goto got_pg; 1543 goto got_pg;
1552 if (gfp_mask & __GFP_NOFAIL) { 1544 if (gfp_mask & __GFP_NOFAIL) {
@@ -1569,7 +1561,7 @@ nofail_alloc:
1569 reclaim_state.reclaimed_slab = 0; 1561 reclaim_state.reclaimed_slab = 0;
1570 p->reclaim_state = &reclaim_state; 1562 p->reclaim_state = &reclaim_state;
1571 1563
1572 did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask); 1564 did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
1573 1565
1574 p->reclaim_state = NULL; 1566 p->reclaim_state = NULL;
1575 p->flags &= ~PF_MEMALLOC; 1567 p->flags &= ~PF_MEMALLOC;
@@ -1580,12 +1572,12 @@ nofail_alloc:
1580 drain_all_pages(); 1572 drain_all_pages();
1581 1573
1582 if (likely(did_some_progress)) { 1574 if (likely(did_some_progress)) {
1583 page = get_page_from_freelist(gfp_mask, order, 1575 page = get_page_from_freelist(gfp_mask, nodemask, order,
1584 zonelist, alloc_flags); 1576 zonelist, high_zoneidx, alloc_flags);
1585 if (page) 1577 if (page)
1586 goto got_pg; 1578 goto got_pg;
1587 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 1579 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1588 if (!try_set_zone_oom(zonelist)) { 1580 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1589 schedule_timeout_uninterruptible(1); 1581 schedule_timeout_uninterruptible(1);
1590 goto restart; 1582 goto restart;
1591 } 1583 }
@@ -1596,21 +1588,22 @@ nofail_alloc:
1596 * a parallel oom killing, we must fail if we're still 1588 * a parallel oom killing, we must fail if we're still
1597 * under heavy pressure. 1589 * under heavy pressure.
1598 */ 1590 */
1599 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1591 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1600 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); 1592 order, zonelist, high_zoneidx,
1593 ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1601 if (page) { 1594 if (page) {
1602 clear_zonelist_oom(zonelist); 1595 clear_zonelist_oom(zonelist, gfp_mask);
1603 goto got_pg; 1596 goto got_pg;
1604 } 1597 }
1605 1598
1606 /* The OOM killer will not help higher order allocs so fail */ 1599 /* The OOM killer will not help higher order allocs so fail */
1607 if (order > PAGE_ALLOC_COSTLY_ORDER) { 1600 if (order > PAGE_ALLOC_COSTLY_ORDER) {
1608 clear_zonelist_oom(zonelist); 1601 clear_zonelist_oom(zonelist, gfp_mask);
1609 goto nopage; 1602 goto nopage;
1610 } 1603 }
1611 1604
1612 out_of_memory(zonelist, gfp_mask, order); 1605 out_of_memory(zonelist, gfp_mask, order);
1613 clear_zonelist_oom(zonelist); 1606 clear_zonelist_oom(zonelist, gfp_mask);
1614 goto restart; 1607 goto restart;
1615 } 1608 }
1616 1609
@@ -1646,6 +1639,20 @@ got_pg:
1646 return page; 1639 return page;
1647} 1640}
1648 1641
1642struct page *
1643__alloc_pages(gfp_t gfp_mask, unsigned int order,
1644 struct zonelist *zonelist)
1645{
1646 return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
1647}
1648
1649struct page *
1650__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1651 struct zonelist *zonelist, nodemask_t *nodemask)
1652{
1653 return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
1654}
1655
1649EXPORT_SYMBOL(__alloc_pages); 1656EXPORT_SYMBOL(__alloc_pages);
1650 1657
1651/* 1658/*
@@ -1712,15 +1719,15 @@ EXPORT_SYMBOL(free_pages);
1712 1719
1713static unsigned int nr_free_zone_pages(int offset) 1720static unsigned int nr_free_zone_pages(int offset)
1714{ 1721{
1722 struct zoneref *z;
1723 struct zone *zone;
1724
1715 /* Just pick one node, since fallback list is circular */ 1725 /* Just pick one node, since fallback list is circular */
1716 pg_data_t *pgdat = NODE_DATA(numa_node_id());
1717 unsigned int sum = 0; 1726 unsigned int sum = 0;
1718 1727
1719 struct zonelist *zonelist = pgdat->node_zonelists + offset; 1728 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
1720 struct zone **zonep = zonelist->zones;
1721 struct zone *zone;
1722 1729
1723 for (zone = *zonep++; zone; zone = *zonep++) { 1730 for_each_zone_zonelist(zone, z, zonelist, offset) {
1724 unsigned long size = zone->present_pages; 1731 unsigned long size = zone->present_pages;
1725 unsigned long high = zone->pages_high; 1732 unsigned long high = zone->pages_high;
1726 if (size > high) 1733 if (size > high)
@@ -1889,6 +1896,12 @@ void show_free_areas(void)
1889 show_swap_cache_info(); 1896 show_swap_cache_info();
1890} 1897}
1891 1898
1899static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
1900{
1901 zoneref->zone = zone;
1902 zoneref->zone_idx = zone_idx(zone);
1903}
1904
1892/* 1905/*
1893 * Builds allocation fallback zone lists. 1906 * Builds allocation fallback zone lists.
1894 * 1907 *
@@ -1906,7 +1919,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
1906 zone_type--; 1919 zone_type--;
1907 zone = pgdat->node_zones + zone_type; 1920 zone = pgdat->node_zones + zone_type;
1908 if (populated_zone(zone)) { 1921 if (populated_zone(zone)) {
1909 zonelist->zones[nr_zones++] = zone; 1922 zoneref_set_zone(zone,
1923 &zonelist->_zonerefs[nr_zones++]);
1910 check_highest_zone(zone_type); 1924 check_highest_zone(zone_type);
1911 } 1925 }
1912 1926
@@ -2078,17 +2092,16 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
2078 */ 2092 */
2079static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 2093static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
2080{ 2094{
2081 enum zone_type i;
2082 int j; 2095 int j;
2083 struct zonelist *zonelist; 2096 struct zonelist *zonelist;
2084 2097
2085 for (i = 0; i < MAX_NR_ZONES; i++) { 2098 zonelist = &pgdat->node_zonelists[0];
2086 zonelist = pgdat->node_zonelists + i; 2099 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
2087 for (j = 0; zonelist->zones[j] != NULL; j++) 2100 ;
2088 ; 2101 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2089 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 2102 MAX_NR_ZONES - 1);
2090 zonelist->zones[j] = NULL; 2103 zonelist->_zonerefs[j].zone = NULL;
2091 } 2104 zonelist->_zonerefs[j].zone_idx = 0;
2092} 2105}
2093 2106
2094/* 2107/*
@@ -2096,15 +2109,13 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
2096 */ 2109 */
2097static void build_thisnode_zonelists(pg_data_t *pgdat) 2110static void build_thisnode_zonelists(pg_data_t *pgdat)
2098{ 2111{
2099 enum zone_type i;
2100 int j; 2112 int j;
2101 struct zonelist *zonelist; 2113 struct zonelist *zonelist;
2102 2114
2103 for (i = 0; i < MAX_NR_ZONES; i++) { 2115 zonelist = &pgdat->node_zonelists[1];
2104 zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i; 2116 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
2105 j = build_zonelists_node(pgdat, zonelist, 0, i); 2117 zonelist->_zonerefs[j].zone = NULL;
2106 zonelist->zones[j] = NULL; 2118 zonelist->_zonerefs[j].zone_idx = 0;
2107 }
2108} 2119}
2109 2120
2110/* 2121/*
@@ -2117,27 +2128,26 @@ static int node_order[MAX_NUMNODES];
2117 2128
2118static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 2129static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
2119{ 2130{
2120 enum zone_type i;
2121 int pos, j, node; 2131 int pos, j, node;
2122 int zone_type; /* needs to be signed */ 2132 int zone_type; /* needs to be signed */
2123 struct zone *z; 2133 struct zone *z;
2124 struct zonelist *zonelist; 2134 struct zonelist *zonelist;
2125 2135
2126 for (i = 0; i < MAX_NR_ZONES; i++) { 2136 zonelist = &pgdat->node_zonelists[0];
2127 zonelist = pgdat->node_zonelists + i; 2137 pos = 0;
2128 pos = 0; 2138 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
2129 for (zone_type = i; zone_type >= 0; zone_type--) { 2139 for (j = 0; j < nr_nodes; j++) {
2130 for (j = 0; j < nr_nodes; j++) { 2140 node = node_order[j];
2131 node = node_order[j]; 2141 z = &NODE_DATA(node)->node_zones[zone_type];
2132 z = &NODE_DATA(node)->node_zones[zone_type]; 2142 if (populated_zone(z)) {
2133 if (populated_zone(z)) { 2143 zoneref_set_zone(z,
2134 zonelist->zones[pos++] = z; 2144 &zonelist->_zonerefs[pos++]);
2135 check_highest_zone(zone_type); 2145 check_highest_zone(zone_type);
2136 }
2137 } 2146 }
2138 } 2147 }
2139 zonelist->zones[pos] = NULL;
2140 } 2148 }
2149 zonelist->_zonerefs[pos].zone = NULL;
2150 zonelist->_zonerefs[pos].zone_idx = 0;
2141} 2151}
2142 2152
2143static int default_zonelist_order(void) 2153static int default_zonelist_order(void)
@@ -2214,7 +2224,8 @@ static void build_zonelists(pg_data_t *pgdat)
2214 /* initialize zonelists */ 2224 /* initialize zonelists */
2215 for (i = 0; i < MAX_ZONELISTS; i++) { 2225 for (i = 0; i < MAX_ZONELISTS; i++) {
2216 zonelist = pgdat->node_zonelists + i; 2226 zonelist = pgdat->node_zonelists + i;
2217 zonelist->zones[0] = NULL; 2227 zonelist->_zonerefs[0].zone = NULL;
2228 zonelist->_zonerefs[0].zone_idx = 0;
2218 } 2229 }
2219 2230
2220 /* NUMA-aware ordering of nodes */ 2231 /* NUMA-aware ordering of nodes */
@@ -2264,19 +2275,15 @@ static void build_zonelists(pg_data_t *pgdat)
2264/* Construct the zonelist performance cache - see further mmzone.h */ 2275/* Construct the zonelist performance cache - see further mmzone.h */
2265static void build_zonelist_cache(pg_data_t *pgdat) 2276static void build_zonelist_cache(pg_data_t *pgdat)
2266{ 2277{
2267 int i; 2278 struct zonelist *zonelist;
2268 2279 struct zonelist_cache *zlc;
2269 for (i = 0; i < MAX_NR_ZONES; i++) { 2280 struct zoneref *z;
2270 struct zonelist *zonelist;
2271 struct zonelist_cache *zlc;
2272 struct zone **z;
2273 2281
2274 zonelist = pgdat->node_zonelists + i; 2282 zonelist = &pgdat->node_zonelists[0];
2275 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 2283 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
2276 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 2284 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
2277 for (z = zonelist->zones; *z; z++) 2285 for (z = zonelist->_zonerefs; z->zone; z++)
2278 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); 2286 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
2279 }
2280} 2287}
2281 2288
2282 2289
@@ -2290,45 +2297,44 @@ static void set_zonelist_order(void)
2290static void build_zonelists(pg_data_t *pgdat) 2297static void build_zonelists(pg_data_t *pgdat)
2291{ 2298{
2292 int node, local_node; 2299 int node, local_node;
2293 enum zone_type i,j; 2300 enum zone_type j;
2301 struct zonelist *zonelist;
2294 2302
2295 local_node = pgdat->node_id; 2303 local_node = pgdat->node_id;
2296 for (i = 0; i < MAX_NR_ZONES; i++) {
2297 struct zonelist *zonelist;
2298 2304
2299 zonelist = pgdat->node_zonelists + i; 2305 zonelist = &pgdat->node_zonelists[0];
2306 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
2300 2307
2301 j = build_zonelists_node(pgdat, zonelist, 0, i); 2308 /*
2302 /* 2309 * Now we build the zonelist so that it contains the zones
2303 * Now we build the zonelist so that it contains the zones 2310 * of all the other nodes.
2304 * of all the other nodes. 2311 * We don't want to pressure a particular node, so when
2305 * We don't want to pressure a particular node, so when 2312 * building the zones for node N, we make sure that the
2306 * building the zones for node N, we make sure that the 2313 * zones coming right after the local ones are those from
2307 * zones coming right after the local ones are those from 2314 * node N+1 (modulo N)
2308 * node N+1 (modulo N) 2315 */
2309 */ 2316 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
2310 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 2317 if (!node_online(node))
2311 if (!node_online(node)) 2318 continue;
2312 continue; 2319 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2313 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 2320 MAX_NR_ZONES - 1);
2314 } 2321 }
2315 for (node = 0; node < local_node; node++) { 2322 for (node = 0; node < local_node; node++) {
2316 if (!node_online(node)) 2323 if (!node_online(node))
2317 continue; 2324 continue;
2318 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 2325 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2319 } 2326 MAX_NR_ZONES - 1);
2320
2321 zonelist->zones[j] = NULL;
2322 } 2327 }
2328
2329 zonelist->_zonerefs[j].zone = NULL;
2330 zonelist->_zonerefs[j].zone_idx = 0;
2323} 2331}
2324 2332
2325/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 2333/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
2326static void build_zonelist_cache(pg_data_t *pgdat) 2334static void build_zonelist_cache(pg_data_t *pgdat)
2327{ 2335{
2328 int i; 2336 pgdat->node_zonelists[0].zlcache_ptr = NULL;
2329 2337 pgdat->node_zonelists[1].zlcache_ptr = NULL;
2330 for (i = 0; i < MAX_NR_ZONES; i++)
2331 pgdat->node_zonelists[i].zlcache_ptr = NULL;
2332} 2338}
2333 2339
2334#endif /* CONFIG_NUMA */ 2340#endif /* CONFIG_NUMA */
@@ -4339,9 +4345,7 @@ void *__init alloc_large_system_hash(const char *tablename,
4339 else if (hashdist) 4345 else if (hashdist)
4340 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4346 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4341 else { 4347 else {
4342 unsigned long order; 4348 unsigned long order = get_order(size);
4343 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
4344 ;
4345 table = (void*) __get_free_pages(GFP_ATOMIC, order); 4349 table = (void*) __get_free_pages(GFP_ATOMIC, order);
4346 /* 4350 /*
4347 * If bucketsize is not a power-of-two, we may free 4351 * If bucketsize is not a power-of-two, we may free
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 1cf1417ef8b7..0afd2387e507 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -9,11 +9,15 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
9 int err = 0; 9 int err = 0;
10 10
11 pte = pte_offset_map(pmd, addr); 11 pte = pte_offset_map(pmd, addr);
12 do { 12 for (;;) {
13 err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private); 13 err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private);
14 if (err) 14 if (err)
15 break; 15 break;
16 } while (pte++, addr += PAGE_SIZE, addr != end); 16 addr += PAGE_SIZE;
17 if (addr == end)
18 break;
19 pte++;
20 }
17 21
18 pte_unmap(pte); 22 pte_unmap(pte);
19 return err; 23 return err;
diff --git a/mm/rmap.c b/mm/rmap.c
index 997f06907b6d..bf0a5b7cfb8e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -413,9 +413,6 @@ int page_referenced(struct page *page, int is_locked,
413{ 413{
414 int referenced = 0; 414 int referenced = 0;
415 415
416 if (page_test_and_clear_young(page))
417 referenced++;
418
419 if (TestClearPageReferenced(page)) 416 if (TestClearPageReferenced(page))
420 referenced++; 417 referenced++;
421 418
@@ -433,6 +430,10 @@ int page_referenced(struct page *page, int is_locked,
433 unlock_page(page); 430 unlock_page(page);
434 } 431 }
435 } 432 }
433
434 if (page_test_and_clear_young(page))
435 referenced++;
436
436 return referenced; 437 return referenced;
437} 438}
438 439
@@ -661,7 +662,6 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
661 printk (KERN_EMERG " page->mapping = %p\n", page->mapping); 662 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
662 print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); 663 print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
663 if (vma->vm_ops) { 664 if (vma->vm_ops) {
664 print_symbol (KERN_EMERG " vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
665 print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault); 665 print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
666 } 666 }
667 if (vma->vm_file && vma->vm_file->f_op) 667 if (vma->vm_file && vma->vm_file->f_op)
diff --git a/mm/shmem.c b/mm/shmem.c
index f514dd392cd9..e6d9298aa22a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1079,104 +1079,47 @@ redirty:
1079 1079
1080#ifdef CONFIG_NUMA 1080#ifdef CONFIG_NUMA
1081#ifdef CONFIG_TMPFS 1081#ifdef CONFIG_TMPFS
1082static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) 1082static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1083{ 1083{
1084 char *nodelist = strchr(value, ':'); 1084 char buffer[64];
1085 int err = 1;
1086 1085
1087 if (nodelist) { 1086 if (!mpol || mpol->mode == MPOL_DEFAULT)
1088 /* NUL-terminate policy string */ 1087 return; /* show nothing */
1089 *nodelist++ = '\0';
1090 if (nodelist_parse(nodelist, *policy_nodes))
1091 goto out;
1092 if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY]))
1093 goto out;
1094 }
1095 if (!strcmp(value, "default")) {
1096 *policy = MPOL_DEFAULT;
1097 /* Don't allow a nodelist */
1098 if (!nodelist)
1099 err = 0;
1100 } else if (!strcmp(value, "prefer")) {
1101 *policy = MPOL_PREFERRED;
1102 /* Insist on a nodelist of one node only */
1103 if (nodelist) {
1104 char *rest = nodelist;
1105 while (isdigit(*rest))
1106 rest++;
1107 if (!*rest)
1108 err = 0;
1109 }
1110 } else if (!strcmp(value, "bind")) {
1111 *policy = MPOL_BIND;
1112 /* Insist on a nodelist */
1113 if (nodelist)
1114 err = 0;
1115 } else if (!strcmp(value, "interleave")) {
1116 *policy = MPOL_INTERLEAVE;
1117 /*
1118 * Default to online nodes with memory if no nodelist
1119 */
1120 if (!nodelist)
1121 *policy_nodes = node_states[N_HIGH_MEMORY];
1122 err = 0;
1123 }
1124out:
1125 /* Restore string for error message */
1126 if (nodelist)
1127 *--nodelist = ':';
1128 return err;
1129}
1130
1131static void shmem_show_mpol(struct seq_file *seq, int policy,
1132 const nodemask_t policy_nodes)
1133{
1134 char *policy_string;
1135 1088
1136 switch (policy) { 1089 mpol_to_str(buffer, sizeof(buffer), mpol, 1);
1137 case MPOL_PREFERRED:
1138 policy_string = "prefer";
1139 break;
1140 case MPOL_BIND:
1141 policy_string = "bind";
1142 break;
1143 case MPOL_INTERLEAVE:
1144 policy_string = "interleave";
1145 break;
1146 default:
1147 /* MPOL_DEFAULT */
1148 return;
1149 }
1150 1090
1151 seq_printf(seq, ",mpol=%s", policy_string); 1091 seq_printf(seq, ",mpol=%s", buffer);
1152 1092}
1153 if (policy != MPOL_INTERLEAVE ||
1154 !nodes_equal(policy_nodes, node_states[N_HIGH_MEMORY])) {
1155 char buffer[64];
1156 int len;
1157 1093
1158 len = nodelist_scnprintf(buffer, sizeof(buffer), policy_nodes); 1094static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1159 if (len < sizeof(buffer)) 1095{
1160 seq_printf(seq, ":%s", buffer); 1096 struct mempolicy *mpol = NULL;
1161 else 1097 if (sbinfo->mpol) {
1162 seq_printf(seq, ":?"); 1098 spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
1099 mpol = sbinfo->mpol;
1100 mpol_get(mpol);
1101 spin_unlock(&sbinfo->stat_lock);
1163 } 1102 }
1103 return mpol;
1164} 1104}
1165#endif /* CONFIG_TMPFS */ 1105#endif /* CONFIG_TMPFS */
1166 1106
1167static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, 1107static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
1168 struct shmem_inode_info *info, unsigned long idx) 1108 struct shmem_inode_info *info, unsigned long idx)
1169{ 1109{
1110 struct mempolicy mpol, *spol;
1170 struct vm_area_struct pvma; 1111 struct vm_area_struct pvma;
1171 struct page *page; 1112 struct page *page;
1172 1113
1114 spol = mpol_cond_copy(&mpol,
1115 mpol_shared_policy_lookup(&info->policy, idx));
1116
1173 /* Create a pseudo vma that just contains the policy */ 1117 /* Create a pseudo vma that just contains the policy */
1174 pvma.vm_start = 0; 1118 pvma.vm_start = 0;
1175 pvma.vm_pgoff = idx; 1119 pvma.vm_pgoff = idx;
1176 pvma.vm_ops = NULL; 1120 pvma.vm_ops = NULL;
1177 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); 1121 pvma.vm_policy = spol;
1178 page = swapin_readahead(entry, gfp, &pvma, 0); 1122 page = swapin_readahead(entry, gfp, &pvma, 0);
1179 mpol_free(pvma.vm_policy);
1180 return page; 1123 return page;
1181} 1124}
1182 1125
@@ -1184,27 +1127,21 @@ static struct page *shmem_alloc_page(gfp_t gfp,
1184 struct shmem_inode_info *info, unsigned long idx) 1127 struct shmem_inode_info *info, unsigned long idx)
1185{ 1128{
1186 struct vm_area_struct pvma; 1129 struct vm_area_struct pvma;
1187 struct page *page;
1188 1130
1189 /* Create a pseudo vma that just contains the policy */ 1131 /* Create a pseudo vma that just contains the policy */
1190 pvma.vm_start = 0; 1132 pvma.vm_start = 0;
1191 pvma.vm_pgoff = idx; 1133 pvma.vm_pgoff = idx;
1192 pvma.vm_ops = NULL; 1134 pvma.vm_ops = NULL;
1193 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); 1135 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
1194 page = alloc_page_vma(gfp, &pvma, 0); 1136
1195 mpol_free(pvma.vm_policy); 1137 /*
1196 return page; 1138 * alloc_page_vma() will drop the shared policy reference
1139 */
1140 return alloc_page_vma(gfp, &pvma, 0);
1197} 1141}
1198#else /* !CONFIG_NUMA */ 1142#else /* !CONFIG_NUMA */
1199#ifdef CONFIG_TMPFS 1143#ifdef CONFIG_TMPFS
1200static inline int shmem_parse_mpol(char *value, int *policy, 1144static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p)
1201 nodemask_t *policy_nodes)
1202{
1203 return 1;
1204}
1205
1206static inline void shmem_show_mpol(struct seq_file *seq, int policy,
1207 const nodemask_t policy_nodes)
1208{ 1145{
1209} 1146}
1210#endif /* CONFIG_TMPFS */ 1147#endif /* CONFIG_TMPFS */
@@ -1222,6 +1159,13 @@ static inline struct page *shmem_alloc_page(gfp_t gfp,
1222} 1159}
1223#endif /* CONFIG_NUMA */ 1160#endif /* CONFIG_NUMA */
1224 1161
1162#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
1163static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1164{
1165 return NULL;
1166}
1167#endif
1168
1225/* 1169/*
1226 * shmem_getpage - either get the page from swap or allocate a new one 1170 * shmem_getpage - either get the page from swap or allocate a new one
1227 * 1171 *
@@ -1576,8 +1520,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1576 case S_IFREG: 1520 case S_IFREG:
1577 inode->i_op = &shmem_inode_operations; 1521 inode->i_op = &shmem_inode_operations;
1578 inode->i_fop = &shmem_file_operations; 1522 inode->i_fop = &shmem_file_operations;
1579 mpol_shared_policy_init(&info->policy, sbinfo->policy, 1523 mpol_shared_policy_init(&info->policy,
1580 &sbinfo->policy_nodes); 1524 shmem_get_sbmpol(sbinfo));
1581 break; 1525 break;
1582 case S_IFDIR: 1526 case S_IFDIR:
1583 inc_nlink(inode); 1527 inc_nlink(inode);
@@ -1591,8 +1535,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1591 * Must not load anything in the rbtree, 1535 * Must not load anything in the rbtree,
1592 * mpol_free_shared_policy will not be called. 1536 * mpol_free_shared_policy will not be called.
1593 */ 1537 */
1594 mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 1538 mpol_shared_policy_init(&info->policy, NULL);
1595 NULL);
1596 break; 1539 break;
1597 } 1540 }
1598 } else 1541 } else
@@ -2207,8 +2150,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2207 if (*rest) 2150 if (*rest)
2208 goto bad_val; 2151 goto bad_val;
2209 } else if (!strcmp(this_char,"mpol")) { 2152 } else if (!strcmp(this_char,"mpol")) {
2210 if (shmem_parse_mpol(value, &sbinfo->policy, 2153 if (mpol_parse_str(value, &sbinfo->mpol, 1))
2211 &sbinfo->policy_nodes))
2212 goto bad_val; 2154 goto bad_val;
2213 } else { 2155 } else {
2214 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 2156 printk(KERN_ERR "tmpfs: Bad mount option %s\n",
@@ -2259,8 +2201,9 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2259 sbinfo->free_blocks = config.max_blocks - blocks; 2201 sbinfo->free_blocks = config.max_blocks - blocks;
2260 sbinfo->max_inodes = config.max_inodes; 2202 sbinfo->max_inodes = config.max_inodes;
2261 sbinfo->free_inodes = config.max_inodes - inodes; 2203 sbinfo->free_inodes = config.max_inodes - inodes;
2262 sbinfo->policy = config.policy; 2204
2263 sbinfo->policy_nodes = config.policy_nodes; 2205 mpol_put(sbinfo->mpol);
2206 sbinfo->mpol = config.mpol; /* transfers initial ref */
2264out: 2207out:
2265 spin_unlock(&sbinfo->stat_lock); 2208 spin_unlock(&sbinfo->stat_lock);
2266 return error; 2209 return error;
@@ -2281,7 +2224,7 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
2281 seq_printf(seq, ",uid=%u", sbinfo->uid); 2224 seq_printf(seq, ",uid=%u", sbinfo->uid);
2282 if (sbinfo->gid != 0) 2225 if (sbinfo->gid != 0)
2283 seq_printf(seq, ",gid=%u", sbinfo->gid); 2226 seq_printf(seq, ",gid=%u", sbinfo->gid);
2284 shmem_show_mpol(seq, sbinfo->policy, sbinfo->policy_nodes); 2227 shmem_show_mpol(seq, sbinfo->mpol);
2285 return 0; 2228 return 0;
2286} 2229}
2287#endif /* CONFIG_TMPFS */ 2230#endif /* CONFIG_TMPFS */
@@ -2311,8 +2254,7 @@ static int shmem_fill_super(struct super_block *sb,
2311 sbinfo->mode = S_IRWXUGO | S_ISVTX; 2254 sbinfo->mode = S_IRWXUGO | S_ISVTX;
2312 sbinfo->uid = current->fsuid; 2255 sbinfo->uid = current->fsuid;
2313 sbinfo->gid = current->fsgid; 2256 sbinfo->gid = current->fsgid;
2314 sbinfo->policy = MPOL_DEFAULT; 2257 sbinfo->mpol = NULL;
2315 sbinfo->policy_nodes = node_states[N_HIGH_MEMORY];
2316 sb->s_fs_info = sbinfo; 2258 sb->s_fs_info = sbinfo;
2317 2259
2318#ifdef CONFIG_TMPFS 2260#ifdef CONFIG_TMPFS
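
The shmem hunks above replace tmpfs's (policy, policy_nodes) pair with a
single reference-counted struct mempolicy hung off the superblock. A minimal
sketch of the resulting parse-and-transfer flow, using only the calls visible
in the hunks (mpol_parse_str, mpol_put, the sbinfo->mpol field); the helper
names themselves are hypothetical:

	/* Parse the "mpol=" mount option into a temporary config. */
	static int parse_mpol_option(char *value, struct shmem_sb_info *config)
	{
		/* Non-zero return means the string did not parse; on success
		 * the parsed policy (with its initial reference) is stored
		 * in config->mpol. */
		if (mpol_parse_str(value, &config->mpol, 1))
			return -EINVAL;
		return 0;
	}

	/* On remount, drop the superblock's old policy (mpol_put() is
	 * presumably NULL-safe here, since shmem_fill_super() starts with
	 * sbinfo->mpol = NULL) and take over the freshly parsed one. */
	static void apply_remount_mpol(struct shmem_sb_info *sbinfo,
				       struct shmem_sb_info *config)
	{
		mpol_put(sbinfo->mpol);
		sbinfo->mpol = config->mpol;	/* transfers the initial ref */
	}
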
diff --git a/mm/slab.c b/mm/slab.c
index 03927cb5ec9e..39d20f8a0791 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -139,10 +139,6 @@
139#define BYTES_PER_WORD sizeof(void *) 139#define BYTES_PER_WORD sizeof(void *)
140#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) 140#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
141 141
142#ifndef cache_line_size
143#define cache_line_size() L1_CACHE_BYTES
144#endif
145
146#ifndef ARCH_KMALLOC_MINALIGN 142#ifndef ARCH_KMALLOC_MINALIGN
147/* 143/*
148 * Enforce a minimum alignment for the kmalloc caches. 144 * Enforce a minimum alignment for the kmalloc caches.
@@ -3242,15 +3238,16 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3242{ 3238{
3243 struct zonelist *zonelist; 3239 struct zonelist *zonelist;
3244 gfp_t local_flags; 3240 gfp_t local_flags;
3245 struct zone **z; 3241 struct zoneref *z;
3242 struct zone *zone;
3243 enum zone_type high_zoneidx = gfp_zone(flags);
3246 void *obj = NULL; 3244 void *obj = NULL;
3247 int nid; 3245 int nid;
3248 3246
3249 if (flags & __GFP_THISNODE) 3247 if (flags & __GFP_THISNODE)
3250 return NULL; 3248 return NULL;
3251 3249
3252 zonelist = &NODE_DATA(slab_node(current->mempolicy)) 3250 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3253 ->node_zonelists[gfp_zone(flags)];
3254 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3251 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3255 3252
3256retry: 3253retry:
@@ -3258,10 +3255,10 @@ retry:
3258 * Look through allowed nodes for objects available 3255 * Look through allowed nodes for objects available
3259 * from existing per node queues. 3256 * from existing per node queues.
3260 */ 3257 */
3261 for (z = zonelist->zones; *z && !obj; z++) { 3258 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
3262 nid = zone_to_nid(*z); 3259 nid = zone_to_nid(zone);
3263 3260
3264 if (cpuset_zone_allowed_hardwall(*z, flags) && 3261 if (cpuset_zone_allowed_hardwall(zone, flags) &&
3265 cache->nodelists[nid] && 3262 cache->nodelists[nid] &&
3266 cache->nodelists[nid]->free_objects) 3263 cache->nodelists[nid]->free_objects)
3267 obj = ____cache_alloc_node(cache, 3264 obj = ____cache_alloc_node(cache,
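
This hunk, and the slub and vmscan hunks below, all switch from open-coded
walks of zonelist->zones[] to the for_each_zone_zonelist() iterator filtered
by gfp_zone(). A sketch of the pattern in isolation, assuming the per-node
zonelist layout this series moves to; the helper itself is hypothetical and
mirrors the walk fallback_alloc() now does:

	/* Find the first node on this node's zonelist that still has free
	 * objects for @cache. Returns a node id, or -1 if none. */
	static int find_node_with_free_objects(struct kmem_cache *cache,
					       gfp_t flags)
	{
		struct zonelist *zonelist;
		struct zoneref *z;
		struct zone *zone;
		enum zone_type high_zoneidx = gfp_zone(flags);

		/* node_zonelist() picks this node's zonelist for @flags. */
		zonelist = node_zonelist(numa_node_id(), flags);

		/* Visit only zones at or below the highest zone type the
		 * gfp flags allow. */
		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
			int nid = zone_to_nid(zone);

			if (cache->nodelists[nid] &&
			    cache->nodelists[nid]->free_objects)
				return nid;
		}
		return -1;
	}
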
diff --git a/mm/slub.c b/mm/slub.c
index d821ce6fff39..992ecd4f0d39 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -186,11 +186,6 @@ static inline void ClearSlabDebug(struct page *page)
186#define __OBJECT_POISON 0x80000000 /* Poison object */ 186#define __OBJECT_POISON 0x80000000 /* Poison object */
187#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ 187#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */
188 188
189/* Not all arches define cache_line_size */
190#ifndef cache_line_size
191#define cache_line_size() L1_CACHE_BYTES
192#endif
193
194static int kmem_size = sizeof(struct kmem_cache); 189static int kmem_size = sizeof(struct kmem_cache);
195 190
196#ifdef CONFIG_SMP 191#ifdef CONFIG_SMP
@@ -1330,7 +1325,9 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1330{ 1325{
1331#ifdef CONFIG_NUMA 1326#ifdef CONFIG_NUMA
1332 struct zonelist *zonelist; 1327 struct zonelist *zonelist;
1333 struct zone **z; 1328 struct zoneref *z;
1329 struct zone *zone;
1330 enum zone_type high_zoneidx = gfp_zone(flags);
1334 struct page *page; 1331 struct page *page;
1335 1332
1336 /* 1333 /*
@@ -1355,14 +1352,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1355 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1352 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1356 return NULL; 1353 return NULL;
1357 1354
1358 zonelist = &NODE_DATA( 1355 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1359 slab_node(current->mempolicy))->node_zonelists[gfp_zone(flags)]; 1356 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1360 for (z = zonelist->zones; *z; z++) {
1361 struct kmem_cache_node *n; 1357 struct kmem_cache_node *n;
1362 1358
1363 n = get_node(s, zone_to_nid(*z)); 1359 n = get_node(s, zone_to_nid(zone));
1364 1360
1365 if (n && cpuset_zone_allowed_hardwall(*z, flags) && 1361 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1366 n->nr_partial > MIN_PARTIAL) { 1362 n->nr_partial > MIN_PARTIAL) {
1367 page = get_partial_node(n); 1363 page = get_partial_node(n);
1368 if (page) 1364 if (page)
diff --git a/mm/sparse.c b/mm/sparse.c
index 98d6b39c3472..dff71f173ae9 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -8,6 +8,7 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/spinlock.h> 9#include <linux/spinlock.h>
10#include <linux/vmalloc.h> 10#include <linux/vmalloc.h>
11#include "internal.h"
11#include <asm/dma.h> 12#include <asm/dma.h>
12#include <asm/pgalloc.h> 13#include <asm/pgalloc.h>
13#include <asm/pgtable.h> 14#include <asm/pgtable.h>
@@ -208,12 +209,12 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p
208} 209}
209 210
210/* 211/*
211 * We need this if we ever free the mem_maps. While not implemented yet, 212 * Decode mem_map from the coded memmap
212 * this function is included for parity with its sibling.
213 */ 213 */
214static __attribute((unused))
215struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum) 214struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
216{ 215{
216 /* mask off the extra low bits of information */
217 coded_mem_map &= SECTION_MAP_MASK;
217 return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); 218 return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
218} 219}
219 220
@@ -232,7 +233,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms,
232 return 1; 233 return 1;
233} 234}
234 235
235static unsigned long usemap_size(void) 236unsigned long usemap_size(void)
236{ 237{
237 unsigned long size_bytes; 238 unsigned long size_bytes;
238 size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; 239 size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8;
@@ -249,11 +250,22 @@ static unsigned long *__kmalloc_section_usemap(void)
249 250
250static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) 251static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
251{ 252{
252 unsigned long *usemap; 253 unsigned long *usemap, section_nr;
253 struct mem_section *ms = __nr_to_section(pnum); 254 struct mem_section *ms = __nr_to_section(pnum);
254 int nid = sparse_early_nid(ms); 255 int nid = sparse_early_nid(ms);
256 struct pglist_data *pgdat = NODE_DATA(nid);
255 257
256 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); 258 /*
259 * A usemap page can't be freed until every section that uses it has
260 * been freed, and the same is true of the page that holds the pgdat.
261 * If section A holds the pgdat and section B holds usemaps for other
262 * sections (including A), neither section can be removed, because
263 * each depends on the other.
264 * To avoid that circular dependency, collect all usemaps on the
265 * section that holds the pgdat.
266 */
267 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
268 usemap = alloc_bootmem_section(usemap_size(), section_nr);
257 if (usemap) 269 if (usemap)
258 return usemap; 270 return usemap;
259 271
@@ -273,8 +285,8 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
273 if (map) 285 if (map)
274 return map; 286 return map;
275 287
276 map = alloc_bootmem_node(NODE_DATA(nid), 288 map = alloc_bootmem_pages_node(NODE_DATA(nid),
277 sizeof(struct page) * PAGES_PER_SECTION); 289 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
278 return map; 290 return map;
279} 291}
280#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 292#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
@@ -295,6 +307,9 @@ struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
295 return NULL; 307 return NULL;
296} 308}
297 309
310void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
311{
312}
298/* 313/*
299 * Allocate the accumulated non-linear sections, allocate a mem_map 314 * Allocate the accumulated non-linear sections, allocate a mem_map
300 * for each and record the physical to section mapping. 315 * for each and record the physical to section mapping.
@@ -304,22 +319,50 @@ void __init sparse_init(void)
304 unsigned long pnum; 319 unsigned long pnum;
305 struct page *map; 320 struct page *map;
306 unsigned long *usemap; 321 unsigned long *usemap;
322 unsigned long **usemap_map;
323 int size;
324
325 /*
326 * The mem_map is allocated as a big page (2M on 64-bit x86), while a
327 * usemap is far smaller than a page (about 24 bytes). Allocating 2M
328 * (2M-aligned) and 24 bytes alternately pushes every following 2M
329 * allocation to the next 2M boundary, so on a big system memory ends
330 * up full of holes. Instead, try to allocate the 2M mem_map pages
331 * contiguously by doing all the usemap allocations first.
332 *
333 * powerpc needs to call sparse_init_one_section right after each
334 * sparse_early_mem_map_alloc, so allocate usemap_map first.
335 */
336 size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
337 usemap_map = alloc_bootmem(size);
338 if (!usemap_map)
339 panic("can not allocate usemap_map\n");
307 340
308 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 341 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
309 if (!present_section_nr(pnum)) 342 if (!present_section_nr(pnum))
310 continue; 343 continue;
344 usemap_map[pnum] = sparse_early_usemap_alloc(pnum);
345 }
311 346
312 map = sparse_early_mem_map_alloc(pnum); 347 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
313 if (!map) 348 if (!present_section_nr(pnum))
314 continue; 349 continue;
315 350
316 usemap = sparse_early_usemap_alloc(pnum); 351 usemap = usemap_map[pnum];
317 if (!usemap) 352 if (!usemap)
318 continue; 353 continue;
319 354
355 map = sparse_early_mem_map_alloc(pnum);
356 if (!map)
357 continue;
358
320 sparse_init_one_section(__nr_to_section(pnum), pnum, map, 359 sparse_init_one_section(__nr_to_section(pnum), pnum, map,
321 usemap); 360 usemap);
322 } 361 }
362
363 vmemmap_populate_print_last();
364
365 free_bootmem(__pa(usemap_map), size);
323} 366}
324 367
325#ifdef CONFIG_MEMORY_HOTPLUG 368#ifdef CONFIG_MEMORY_HOTPLUG
@@ -334,6 +377,9 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
334{ 377{
335 return; /* XXX: Not implemented yet */ 378 return; /* XXX: Not implemented yet */
336} 379}
380static void free_map_bootmem(struct page *page, unsigned long nr_pages)
381{
382}
337#else 383#else
338static struct page *__kmalloc_section_memmap(unsigned long nr_pages) 384static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
339{ 385{
@@ -371,8 +417,69 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
371 free_pages((unsigned long)memmap, 417 free_pages((unsigned long)memmap,
372 get_order(sizeof(struct page) * nr_pages)); 418 get_order(sizeof(struct page) * nr_pages));
373} 419}
420
421static void free_map_bootmem(struct page *page, unsigned long nr_pages)
422{
423 unsigned long maps_section_nr, removing_section_nr, i;
424 int magic;
425
426 for (i = 0; i < nr_pages; i++, page++) {
427 magic = atomic_read(&page->_mapcount);
428
429 BUG_ON(magic == NODE_INFO);
430
431 maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
432 removing_section_nr = page->private;
433
434 /*
435 * When this function is called, the section being removed has
436 * already been logically offlined, so all of its pages are
437 * isolated from the page allocator. If that section's memmap is
438 * placed on the section itself, it must not be freed here:
439 * otherwise the page allocator could hand the memmap out again
440 * just before the memory is physically removed.
441 */
442 if (maps_section_nr != removing_section_nr)
443 put_page_bootmem(page);
444 }
445}
374#endif /* CONFIG_SPARSEMEM_VMEMMAP */ 446#endif /* CONFIG_SPARSEMEM_VMEMMAP */
375 447
448static void free_section_usemap(struct page *memmap, unsigned long *usemap)
449{
450 struct page *usemap_page;
451 unsigned long nr_pages;
452
453 if (!usemap)
454 return;
455
456 usemap_page = virt_to_page(usemap);
457 /*
458 * Check to see if allocation came from hot-plug-add
459 */
460 if (PageSlab(usemap_page)) {
461 kfree(usemap);
462 if (memmap)
463 __kfree_section_memmap(memmap, PAGES_PER_SECTION);
464 return;
465 }
466
467 /*
468 * The usemap came from bootmem. At boot it was packed with the other
469 * usemaps on the section that holds the pgdat, so just leave it as is.
470 */
471
472 if (memmap) {
473 struct page *memmap_page;
474 memmap_page = virt_to_page(memmap);
475
476 nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
477 >> PAGE_SHIFT;
478
479 free_map_bootmem(memmap_page, nr_pages);
480 }
481}
482
376/* 483/*
377 * returns the number of sections whose mem_maps were properly 484 * returns the number of sections whose mem_maps were properly
378 * set. If this is <=0, then that means that the passed-in 485 * set. If this is <=0, then that means that the passed-in
@@ -425,4 +532,20 @@ out:
425 } 532 }
426 return ret; 533 return ret;
427} 534}
535
536void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
537{
538 struct page *memmap = NULL;
539 unsigned long *usemap = NULL;
540
541 if (ms->section_mem_map) {
542 usemap = ms->pageblock_flags;
543 memmap = sparse_decode_mem_map(ms->section_mem_map,
544 __section_nr(ms));
545 ms->section_mem_map = 0;
546 ms->pageblock_flags = NULL;
547 }
548
549 free_section_usemap(memmap, usemap);
550}
428#endif 551#endif
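
sparse_remove_one_section() above unhooks a section and frees its mem_map
and usemap (bootmem-backed usemaps are deliberately left in place). The
expected caller is the hot-remove path in mm/memory_hotplug.c, which is also
part of this diff but not shown here; a simplified, hypothetical caller
would look roughly like this:

	/* Hypothetical: tear down one memory section during hot-remove.
	 * The real path also unregisters the section from sysfs and
	 * shrinks the zone/node spans, which is omitted here. */
	static int remove_one_section(struct zone *zone, unsigned long section_nr)
	{
		struct mem_section *ms = __nr_to_section(section_nr);

		if (!valid_section(ms))
			return -EINVAL;

		sparse_remove_one_section(zone, ms);
		return 0;
	}
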
diff --git a/mm/swap.c b/mm/swap.c
index aa1139ccf3a7..91e194445a5e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -132,34 +132,21 @@ static void pagevec_move_tail(struct pagevec *pvec)
132 * Writeback is about to end against a page which has been marked for immediate 132 * Writeback is about to end against a page which has been marked for immediate
133 * reclaim. If it still appears to be reclaimable, move it to the tail of the 133 * reclaim. If it still appears to be reclaimable, move it to the tail of the
134 * inactive list. 134 * inactive list.
135 *
136 * Returns zero if it cleared PG_writeback.
137 */ 135 */
138int rotate_reclaimable_page(struct page *page) 136void rotate_reclaimable_page(struct page *page)
139{ 137{
140 struct pagevec *pvec; 138 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
141 unsigned long flags; 139 PageLRU(page)) {
142 140 struct pagevec *pvec;
143 if (PageLocked(page)) 141 unsigned long flags;
144 return 1;
145 if (PageDirty(page))
146 return 1;
147 if (PageActive(page))
148 return 1;
149 if (!PageLRU(page))
150 return 1;
151
152 page_cache_get(page);
153 local_irq_save(flags);
154 pvec = &__get_cpu_var(lru_rotate_pvecs);
155 if (!pagevec_add(pvec, page))
156 pagevec_move_tail(pvec);
157 local_irq_restore(flags);
158
159 if (!test_clear_page_writeback(page))
160 BUG();
161 142
162 return 0; 143 page_cache_get(page);
144 local_irq_save(flags);
145 pvec = &__get_cpu_var(lru_rotate_pvecs);
146 if (!pagevec_add(pvec, page))
147 pagevec_move_tail(pvec);
148 local_irq_restore(flags);
149 }
163} 150}
164 151
165/* 152/*
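
With the change above, rotate_reclaimable_page() no longer reports success
and no longer clears PG_writeback itself, so the writeback completion path
has to do both the rotation and the test_clear_page_writeback() call. A
sketch of what that caller could look like; end_page_writeback() lives in
mm/filemap.c, which this diff also touches, and its exact body there may
differ:

	void end_page_writeback(struct page *page)
	{
		/* Pages marked for immediate reclaim get moved to the tail
		 * of the inactive list before PG_writeback is cleared. */
		if (TestClearPageReclaim(page))
			rotate_reclaimable_page(page);

		if (!test_clear_page_writeback(page))
			BUG();

		smp_mb__after_clear_bit();
		wake_up_page(page, PG_writeback);
	}
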
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2da149cfc9ac..67051be7083a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1582,6 +1582,14 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1582 error = -EINVAL; 1582 error = -EINVAL;
1583 goto bad_swap; 1583 goto bad_swap;
1584 case 2: 1584 case 2:
1585 /* swap partition endianness hack... */
1586 if (swab32(swap_header->info.version) == 1) {
1587 swab32s(&swap_header->info.version);
1588 swab32s(&swap_header->info.last_page);
1589 swab32s(&swap_header->info.nr_badpages);
1590 for (i = 0; i < swap_header->info.nr_badpages; i++)
1591 swab32s(&swap_header->info.badpages[i]);
1592 }
1585 /* Check the swap header's sub-version and the size of 1593 /* Check the swap header's sub-version and the size of
1586 the swap file and bad block lists */ 1594 the swap file and bad block lists */
1587 if (swap_header->info.version != 1) { 1595 if (swap_header->info.version != 1) {
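
The swab block above handles a v1 swap header created on a machine of the
opposite endianness: if the version field only reads as 1 after byte
swapping, every 32-bit field of the header (including the bad-page list) is
swapped in place before the normal checks run. A standalone userspace
illustration of the detection idea (the struct is only a stand-in for the
kernel's swap_header info fields):

	#include <stdint.h>
	#include <stdio.h>

	struct swap_info_v1 {		/* stand-in, not the kernel struct */
		uint32_t version;
		uint32_t last_page;
		uint32_t nr_badpages;
	};

	static uint32_t swab32(uint32_t x)
	{
		return (x >> 24) | ((x >> 8) & 0x0000ff00) |
		       ((x << 8) & 0x00ff0000) | (x << 24);
	}

	int main(void)
	{
		/* Pretend the header was written by an opposite-endian host. */
		struct swap_info_v1 hdr = { swab32(1), swab32(1024), swab32(0) };

		if (swab32(hdr.version) == 1) {	/* only true when byte-swapped */
			hdr.version = swab32(hdr.version);
			hdr.last_page = swab32(hdr.last_page);
			hdr.nr_badpages = swab32(hdr.nr_badpages);
		}
		printf("version=%u last_page=%u nr_badpages=%u\n",
		       hdr.version, hdr.last_page, hdr.nr_badpages);
		return 0;
	}
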
diff --git a/mm/truncate.c b/mm/truncate.c
index 7d20ce41ecf5..b8961cb63414 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -391,6 +391,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
391 pgoff_t next; 391 pgoff_t next;
392 int i; 392 int i;
393 int ret = 0; 393 int ret = 0;
394 int ret2 = 0;
394 int did_range_unmap = 0; 395 int did_range_unmap = 0;
395 int wrapped = 0; 396 int wrapped = 0;
396 397
@@ -438,9 +439,13 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
438 } 439 }
439 } 440 }
440 BUG_ON(page_mapped(page)); 441 BUG_ON(page_mapped(page));
441 ret = do_launder_page(mapping, page); 442 ret2 = do_launder_page(mapping, page);
442 if (ret == 0 && !invalidate_complete_page2(mapping, page)) 443 if (ret2 == 0) {
443 ret = -EIO; 444 if (!invalidate_complete_page2(mapping, page))
445 ret2 = -EIO;
446 }
447 if (ret2 < 0)
448 ret = ret2;
444 unlock_page(page); 449 unlock_page(page);
445 } 450 }
446 pagevec_release(&pvec); 451 pagevec_release(&pvec);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ecf91f8034bf..e33e0ae69ad1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -14,8 +14,9 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17 17#include <linux/seq_file.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/kallsyms.h>
19 20
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include <asm/tlbflush.h> 22#include <asm/tlbflush.h>
@@ -25,7 +26,7 @@ DEFINE_RWLOCK(vmlist_lock);
25struct vm_struct *vmlist; 26struct vm_struct *vmlist;
26 27
27static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, 28static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
28 int node); 29 int node, void *caller);
29 30
30static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) 31static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
31{ 32{
@@ -204,9 +205,9 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
204} 205}
205EXPORT_SYMBOL(vmalloc_to_pfn); 206EXPORT_SYMBOL(vmalloc_to_pfn);
206 207
207static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, 208static struct vm_struct *
208 unsigned long start, unsigned long end, 209__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
209 int node, gfp_t gfp_mask) 210 unsigned long end, int node, gfp_t gfp_mask, void *caller)
210{ 211{
211 struct vm_struct **p, *tmp, *area; 212 struct vm_struct **p, *tmp, *area;
212 unsigned long align = 1; 213 unsigned long align = 1;
@@ -269,6 +270,7 @@ found:
269 area->pages = NULL; 270 area->pages = NULL;
270 area->nr_pages = 0; 271 area->nr_pages = 0;
271 area->phys_addr = 0; 272 area->phys_addr = 0;
273 area->caller = caller;
272 write_unlock(&vmlist_lock); 274 write_unlock(&vmlist_lock);
273 275
274 return area; 276 return area;
@@ -284,7 +286,8 @@ out:
284struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 286struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
285 unsigned long start, unsigned long end) 287 unsigned long start, unsigned long end)
286{ 288{
287 return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL); 289 return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL,
290 __builtin_return_address(0));
288} 291}
289EXPORT_SYMBOL_GPL(__get_vm_area); 292EXPORT_SYMBOL_GPL(__get_vm_area);
290 293
@@ -299,14 +302,22 @@ EXPORT_SYMBOL_GPL(__get_vm_area);
299 */ 302 */
300struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 303struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
301{ 304{
302 return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); 305 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
306 -1, GFP_KERNEL, __builtin_return_address(0));
307}
308
309struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
310 void *caller)
311{
312 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
313 -1, GFP_KERNEL, caller);
303} 314}
304 315
305struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, 316struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
306 int node, gfp_t gfp_mask) 317 int node, gfp_t gfp_mask)
307{ 318{
308 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, 319 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node,
309 gfp_mask); 320 gfp_mask, __builtin_return_address(0));
310} 321}
311 322
312/* Caller must hold vmlist_lock */ 323/* Caller must hold vmlist_lock */
@@ -455,9 +466,11 @@ void *vmap(struct page **pages, unsigned int count,
455 if (count > num_physpages) 466 if (count > num_physpages)
456 return NULL; 467 return NULL;
457 468
458 area = get_vm_area((count << PAGE_SHIFT), flags); 469 area = get_vm_area_caller((count << PAGE_SHIFT), flags,
470 __builtin_return_address(0));
459 if (!area) 471 if (!area)
460 return NULL; 472 return NULL;
473
461 if (map_vm_area(area, prot, &pages)) { 474 if (map_vm_area(area, prot, &pages)) {
462 vunmap(area->addr); 475 vunmap(area->addr);
463 return NULL; 476 return NULL;
@@ -468,7 +481,7 @@ void *vmap(struct page **pages, unsigned int count,
468EXPORT_SYMBOL(vmap); 481EXPORT_SYMBOL(vmap);
469 482
470static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 483static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
471 pgprot_t prot, int node) 484 pgprot_t prot, int node, void *caller)
472{ 485{
473 struct page **pages; 486 struct page **pages;
474 unsigned int nr_pages, array_size, i; 487 unsigned int nr_pages, array_size, i;
@@ -480,7 +493,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
480 /* Please note that the recursion is strictly bounded. */ 493 /* Please note that the recursion is strictly bounded. */
481 if (array_size > PAGE_SIZE) { 494 if (array_size > PAGE_SIZE) {
482 pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, 495 pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO,
483 PAGE_KERNEL, node); 496 PAGE_KERNEL, node, caller);
484 area->flags |= VM_VPAGES; 497 area->flags |= VM_VPAGES;
485 } else { 498 } else {
486 pages = kmalloc_node(array_size, 499 pages = kmalloc_node(array_size,
@@ -488,6 +501,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
488 node); 501 node);
489 } 502 }
490 area->pages = pages; 503 area->pages = pages;
504 area->caller = caller;
491 if (!area->pages) { 505 if (!area->pages) {
492 remove_vm_area(area->addr); 506 remove_vm_area(area->addr);
493 kfree(area); 507 kfree(area);
@@ -521,7 +535,8 @@ fail:
521 535
522void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) 536void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
523{ 537{
524 return __vmalloc_area_node(area, gfp_mask, prot, -1); 538 return __vmalloc_area_node(area, gfp_mask, prot, -1,
539 __builtin_return_address(0));
525} 540}
526 541
527/** 542/**
@@ -536,7 +551,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
536 * kernel virtual space, using a pagetable protection of @prot. 551 * kernel virtual space, using a pagetable protection of @prot.
537 */ 552 */
538static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, 553static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
539 int node) 554 int node, void *caller)
540{ 555{
541 struct vm_struct *area; 556 struct vm_struct *area;
542 557
@@ -544,16 +559,19 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
544 if (!size || (size >> PAGE_SHIFT) > num_physpages) 559 if (!size || (size >> PAGE_SHIFT) > num_physpages)
545 return NULL; 560 return NULL;
546 561
547 area = get_vm_area_node(size, VM_ALLOC, node, gfp_mask); 562 area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END,
563 node, gfp_mask, caller);
564
548 if (!area) 565 if (!area)
549 return NULL; 566 return NULL;
550 567
551 return __vmalloc_area_node(area, gfp_mask, prot, node); 568 return __vmalloc_area_node(area, gfp_mask, prot, node, caller);
552} 569}
553 570
554void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 571void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
555{ 572{
556 return __vmalloc_node(size, gfp_mask, prot, -1); 573 return __vmalloc_node(size, gfp_mask, prot, -1,
574 __builtin_return_address(0));
557} 575}
558EXPORT_SYMBOL(__vmalloc); 576EXPORT_SYMBOL(__vmalloc);
559 577
@@ -568,7 +586,8 @@ EXPORT_SYMBOL(__vmalloc);
568 */ 586 */
569void *vmalloc(unsigned long size) 587void *vmalloc(unsigned long size)
570{ 588{
571 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); 589 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
590 -1, __builtin_return_address(0));
572} 591}
573EXPORT_SYMBOL(vmalloc); 592EXPORT_SYMBOL(vmalloc);
574 593
@@ -608,7 +627,8 @@ EXPORT_SYMBOL(vmalloc_user);
608 */ 627 */
609void *vmalloc_node(unsigned long size, int node) 628void *vmalloc_node(unsigned long size, int node)
610{ 629{
611 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); 630 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
631 node, __builtin_return_address(0));
612} 632}
613EXPORT_SYMBOL(vmalloc_node); 633EXPORT_SYMBOL(vmalloc_node);
614 634
@@ -843,7 +863,8 @@ struct vm_struct *alloc_vm_area(size_t size)
843{ 863{
844 struct vm_struct *area; 864 struct vm_struct *area;
845 865
846 area = get_vm_area(size, VM_IOREMAP); 866 area = get_vm_area_caller(size, VM_IOREMAP,
867 __builtin_return_address(0));
847 if (area == NULL) 868 if (area == NULL)
848 return NULL; 869 return NULL;
849 870
@@ -873,3 +894,85 @@ void free_vm_area(struct vm_struct *area)
873 kfree(area); 894 kfree(area);
874} 895}
875EXPORT_SYMBOL_GPL(free_vm_area); 896EXPORT_SYMBOL_GPL(free_vm_area);
897
898
899#ifdef CONFIG_PROC_FS
900static void *s_start(struct seq_file *m, loff_t *pos)
901{
902 loff_t n = *pos;
903 struct vm_struct *v;
904
905 read_lock(&vmlist_lock);
906 v = vmlist;
907 while (n > 0 && v) {
908 n--;
909 v = v->next;
910 }
911 if (!n)
912 return v;
913
914 return NULL;
915
916}
917
918static void *s_next(struct seq_file *m, void *p, loff_t *pos)
919{
920 struct vm_struct *v = p;
921
922 ++*pos;
923 return v->next;
924}
925
926static void s_stop(struct seq_file *m, void *p)
927{
928 read_unlock(&vmlist_lock);
929}
930
931static int s_show(struct seq_file *m, void *p)
932{
933 struct vm_struct *v = p;
934
935 seq_printf(m, "0x%p-0x%p %7ld",
936 v->addr, v->addr + v->size, v->size);
937
938 if (v->caller) {
939 char buff[2 * KSYM_NAME_LEN];
940
941 seq_putc(m, ' ');
942 sprint_symbol(buff, (unsigned long)v->caller);
943 seq_puts(m, buff);
944 }
945
946 if (v->nr_pages)
947 seq_printf(m, " pages=%d", v->nr_pages);
948
949 if (v->phys_addr)
950 seq_printf(m, " phys=%lx", v->phys_addr);
951
952 if (v->flags & VM_IOREMAP)
953 seq_printf(m, " ioremap");
954
955 if (v->flags & VM_ALLOC)
956 seq_printf(m, " vmalloc");
957
958 if (v->flags & VM_MAP)
959 seq_printf(m, " vmap");
960
961 if (v->flags & VM_USERMAP)
962 seq_printf(m, " user");
963
964 if (v->flags & VM_VPAGES)
965 seq_printf(m, " vpages");
966
967 seq_putc(m, '\n');
968 return 0;
969}
970
971const struct seq_operations vmalloc_op = {
972 .start = s_start,
973 .next = s_next,
974 .stop = s_stop,
975 .show = s_show,
976};
977#endif
978
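
The vmalloc changes thread a caller address through every allocation path
and record it in the vm_struct, and the seq_file operations above print it
next to each mapping (along with size, page count and flags), so vmalloc
usage can be attributed per call site once the seq_file is wired up to a
procfs entry (the file name is not part of this hunk). A wrapper that wants
its own caller, rather than itself, to show up would pass the address down
explicitly; the wrapper below is hypothetical:

	/* Record the wrapper's caller, not the wrapper, as the owner of
	 * the new vm area. */
	static struct vm_struct *my_map_region(unsigned long size)
	{
		return get_vm_area_caller(size, VM_IOREMAP,
					  __builtin_return_address(0));
	}
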
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f80a5b7c057f..eceac9f9032f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1246,17 +1246,16 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1246 * If a zone is deemed to be full of pinned pages then just give it a light 1246 * If a zone is deemed to be full of pinned pages then just give it a light
1247 * scan then give up on it. 1247 * scan then give up on it.
1248 */ 1248 */
1249static unsigned long shrink_zones(int priority, struct zone **zones, 1249static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1250 struct scan_control *sc) 1250 struct scan_control *sc)
1251{ 1251{
1252 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1252 unsigned long nr_reclaimed = 0; 1253 unsigned long nr_reclaimed = 0;
1253 int i; 1254 struct zoneref *z;
1254 1255 struct zone *zone;
1255 1256
1256 sc->all_unreclaimable = 1; 1257 sc->all_unreclaimable = 1;
1257 for (i = 0; zones[i] != NULL; i++) { 1258 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1258 struct zone *zone = zones[i];
1259
1260 if (!populated_zone(zone)) 1259 if (!populated_zone(zone))
1261 continue; 1260 continue;
1262 /* 1261 /*
@@ -1301,8 +1300,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
1301 * holds filesystem locks which prevent writeout this might not work, and the 1300 * holds filesystem locks which prevent writeout this might not work, and the
1302 * allocation attempt will fail. 1301 * allocation attempt will fail.
1303 */ 1302 */
1304static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, 1303static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1305 struct scan_control *sc) 1304 struct scan_control *sc)
1306{ 1305{
1307 int priority; 1306 int priority;
1308 int ret = 0; 1307 int ret = 0;
@@ -1310,7 +1309,9 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
1310 unsigned long nr_reclaimed = 0; 1309 unsigned long nr_reclaimed = 0;
1311 struct reclaim_state *reclaim_state = current->reclaim_state; 1310 struct reclaim_state *reclaim_state = current->reclaim_state;
1312 unsigned long lru_pages = 0; 1311 unsigned long lru_pages = 0;
1313 int i; 1312 struct zoneref *z;
1313 struct zone *zone;
1314 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1314 1315
1315 if (scan_global_lru(sc)) 1316 if (scan_global_lru(sc))
1316 count_vm_event(ALLOCSTALL); 1317 count_vm_event(ALLOCSTALL);
@@ -1318,8 +1319,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
1318 * mem_cgroup will not do shrink_slab. 1319 * mem_cgroup will not do shrink_slab.
1319 */ 1320 */
1320 if (scan_global_lru(sc)) { 1321 if (scan_global_lru(sc)) {
1321 for (i = 0; zones[i] != NULL; i++) { 1322 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1322 struct zone *zone = zones[i];
1323 1323
1324 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1324 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1325 continue; 1325 continue;
@@ -1333,13 +1333,13 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
1333 sc->nr_scanned = 0; 1333 sc->nr_scanned = 0;
1334 if (!priority) 1334 if (!priority)
1335 disable_swap_token(); 1335 disable_swap_token();
1336 nr_reclaimed += shrink_zones(priority, zones, sc); 1336 nr_reclaimed += shrink_zones(priority, zonelist, sc);
1337 /* 1337 /*
1338 * Don't shrink slabs when reclaiming memory from 1338 * Don't shrink slabs when reclaiming memory from
1339 * over limit cgroups 1339 * over limit cgroups
1340 */ 1340 */
1341 if (scan_global_lru(sc)) { 1341 if (scan_global_lru(sc)) {
1342 shrink_slab(sc->nr_scanned, gfp_mask, lru_pages); 1342 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
1343 if (reclaim_state) { 1343 if (reclaim_state) {
1344 nr_reclaimed += reclaim_state->reclaimed_slab; 1344 nr_reclaimed += reclaim_state->reclaimed_slab;
1345 reclaim_state->reclaimed_slab = 0; 1345 reclaim_state->reclaimed_slab = 0;
@@ -1383,8 +1383,7 @@ out:
1383 priority = 0; 1383 priority = 0;
1384 1384
1385 if (scan_global_lru(sc)) { 1385 if (scan_global_lru(sc)) {
1386 for (i = 0; zones[i] != NULL; i++) { 1386 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1387 struct zone *zone = zones[i];
1388 1387
1389 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1388 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1390 continue; 1389 continue;
@@ -1397,7 +1396,8 @@ out:
1397 return ret; 1396 return ret;
1398} 1397}
1399 1398
1400unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) 1399unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1400 gfp_t gfp_mask)
1401{ 1401{
1402 struct scan_control sc = { 1402 struct scan_control sc = {
1403 .gfp_mask = gfp_mask, 1403 .gfp_mask = gfp_mask,
@@ -1410,7 +1410,7 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
1410 .isolate_pages = isolate_pages_global, 1410 .isolate_pages = isolate_pages_global,
1411 }; 1411 };
1412 1412
1413 return do_try_to_free_pages(zones, gfp_mask, &sc); 1413 return do_try_to_free_pages(zonelist, &sc);
1414} 1414}
1415 1415
1416#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1416#ifdef CONFIG_CGROUP_MEM_RES_CTLR
@@ -1419,7 +1419,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1419 gfp_t gfp_mask) 1419 gfp_t gfp_mask)
1420{ 1420{
1421 struct scan_control sc = { 1421 struct scan_control sc = {
1422 .gfp_mask = gfp_mask,
1423 .may_writepage = !laptop_mode, 1422 .may_writepage = !laptop_mode,
1424 .may_swap = 1, 1423 .may_swap = 1,
1425 .swap_cluster_max = SWAP_CLUSTER_MAX, 1424 .swap_cluster_max = SWAP_CLUSTER_MAX,
@@ -1428,13 +1427,12 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1428 .mem_cgroup = mem_cont, 1427 .mem_cgroup = mem_cont,
1429 .isolate_pages = mem_cgroup_isolate_pages, 1428 .isolate_pages = mem_cgroup_isolate_pages,
1430 }; 1429 };
1431 struct zone **zones; 1430 struct zonelist *zonelist;
1432 int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE);
1433 1431
1434 zones = NODE_DATA(numa_node_id())->node_zonelists[target_zone].zones; 1432 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1435 if (do_try_to_free_pages(zones, sc.gfp_mask, &sc)) 1433 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1436 return 1; 1434 zonelist = NODE_DATA(numa_node_id())->node_zonelists;
1437 return 0; 1435 return do_try_to_free_pages(zonelist, &sc);
1438} 1436}
1439#endif 1437#endif
1440 1438
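
try_to_free_pages() now takes the zonelist itself instead of a bare zone
array, and do_try_to_free_pages() reads gfp_mask from the scan_control. A
minimal sketch of a direct-reclaim call under the new signature (the helper
is hypothetical; the real caller sits in the page allocator's slow path):

	static unsigned long reclaim_for_allocation(gfp_t gfp_mask, int order)
	{
		struct zonelist *zonelist;

		/* Reclaim against the local node's zonelist for these flags. */
		zonelist = node_zonelist(numa_node_id(), gfp_mask);
		return try_to_free_pages(zonelist, order, gfp_mask);
	}
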
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7c7286e9506d..ec6035eda933 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -322,6 +322,7 @@ void refresh_cpu_vm_stats(int cpu)
322 p->expire = 3; 322 p->expire = 3;
323#endif 323#endif
324 } 324 }
325 cond_resched();
325#ifdef CONFIG_NUMA 326#ifdef CONFIG_NUMA
326 /* 327 /*
327 * Deal with draining the remote pageset of this 328 * Deal with draining the remote pageset of this
@@ -364,13 +365,13 @@ void refresh_cpu_vm_stats(int cpu)
364 * 365 *
365 * Must be called with interrupts disabled. 366 * Must be called with interrupts disabled.
366 */ 367 */
367void zone_statistics(struct zonelist *zonelist, struct zone *z) 368void zone_statistics(struct zone *preferred_zone, struct zone *z)
368{ 369{
369 if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) { 370 if (z->zone_pgdat == preferred_zone->zone_pgdat) {
370 __inc_zone_state(z, NUMA_HIT); 371 __inc_zone_state(z, NUMA_HIT);
371 } else { 372 } else {
372 __inc_zone_state(z, NUMA_MISS); 373 __inc_zone_state(z, NUMA_MISS);
373 __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); 374 __inc_zone_state(preferred_zone, NUMA_FOREIGN);
374 } 375 }
375 if (z->node == numa_node_id()) 376 if (z->node == numa_node_id())
376 __inc_zone_state(z, NUMA_LOCAL); 377 __inc_zone_state(z, NUMA_LOCAL);
@@ -645,6 +646,10 @@ static const char * const vmstat_text[] = {
645 "allocstall", 646 "allocstall",
646 647
647 "pgrotated", 648 "pgrotated",
649#ifdef CONFIG_HUGETLB_PAGE
650 "htlb_buddy_alloc_success",
651 "htlb_buddy_alloc_fail",
652#endif
648#endif 653#endif
649}; 654};
650 655