Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            18
-rw-r--r--  mm/Makefile            3
-rw-r--r--  mm/backing-dev.c      47
-rw-r--r--  mm/bounce.c            6
-rw-r--r--  mm/filemap.c         781
-rw-r--r--  mm/filemap.h         103
-rw-r--r--  mm/filemap_xip.c      17
-rw-r--r--  mm/fremap.c           26
-rw-r--r--  mm/hugetlb.c         398
-rw-r--r--  mm/internal.h         10
-rw-r--r--  mm/memory.c          161
-rw-r--r--  mm/memory_hotplug.c  312
-rw-r--r--  mm/mempolicy.c        60
-rw-r--r--  mm/migrate.c           6
-rw-r--r--  mm/mmap.c              3
-rw-r--r--  mm/mprotect.c          1
-rw-r--r--  mm/nommu.c             1
-rw-r--r--  mm/oom_kill.c        116
-rw-r--r--  mm/page-writeback.c  310
-rw-r--r--  mm/page_alloc.c      754
-rw-r--r--  mm/page_isolation.c  138
-rw-r--r--  mm/readahead.c        94
-rw-r--r--  mm/rmap.c              5
-rw-r--r--  mm/shmem.c            82
-rw-r--r--  mm/slab.c             35
-rw-r--r--  mm/slob.c             13
-rw-r--r--  mm/slub.c            520
-rw-r--r--  mm/sparse-vmemmap.c  148
-rw-r--r--  mm/sparse.c          105
-rw-r--r--  mm/swap.c            111
-rw-r--r--  mm/swap_state.c        5
-rw-r--r--  mm/tiny-shmem.c       19
-rw-r--r--  mm/truncate.c          3
-rw-r--r--  mm/util.c              6
-rw-r--r--  mm/vmalloc.c           5
-rw-r--r--  mm/vmscan.c           99
-rw-r--r--  mm/vmstat.c          305
37 files changed, 3561 insertions, 1265 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index e24d348083c3..b1f03b0eb7f1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -112,6 +112,19 @@ config SPARSEMEM_EXTREME
112 def_bool y 112 def_bool y
113 depends on SPARSEMEM && !SPARSEMEM_STATIC 113 depends on SPARSEMEM && !SPARSEMEM_STATIC
114 114
115#
116# SPARSEMEM_VMEMMAP uses a virtually mapped mem_map to optimise pfn_to_page
117# and page_to_pfn. The most efficient option where kernel virtual space is
118# not under pressure.
119#
120config SPARSEMEM_VMEMMAP_ENABLE
121 def_bool n
122
123config SPARSEMEM_VMEMMAP
124 bool
125 depends on SPARSEMEM
126 default y if (SPARSEMEM_VMEMMAP_ENABLE)
127
115# eventually, we can have this option just 'select SPARSEMEM' 128# eventually, we can have this option just 'select SPARSEMEM'
116config MEMORY_HOTPLUG 129config MEMORY_HOTPLUG
117 bool "Allow for memory hot-add" 130 bool "Allow for memory hot-add"
@@ -126,6 +139,11 @@ config MEMORY_HOTPLUG_SPARSE
126 def_bool y 139 def_bool y
127 depends on SPARSEMEM && MEMORY_HOTPLUG 140 depends on SPARSEMEM && MEMORY_HOTPLUG
128 141
142config MEMORY_HOTREMOVE
143 bool "Allow for memory hot remove"
144 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
145 depends on MIGRATION
146
129# Heavily threaded applications may benefit from splitting the mm-wide 147# Heavily threaded applications may benefit from splitting the mm-wide
130# page_table_lock, so that faults on different parts of the user address 148# page_table_lock, so that faults on different parts of the user address
131# space can be handled with less contention: split it at this NR_CPUS. 149# space can be handled with less contention: split it at this NR_CPUS.
diff --git a/mm/Makefile b/mm/Makefile
index 245e33ab00c4..5c0b0ea7572d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,13 +11,14 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 page_alloc.o page-writeback.o pdflush.o \ 11 page_alloc.o page-writeback.o pdflush.o \
12 readahead.o swap.o truncate.o vmscan.o \ 12 readahead.o swap.o truncate.o vmscan.o \
13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
14 $(mmu-y) 14 page_isolation.o $(mmu-y)
15 15
16obj-$(CONFIG_BOUNCE) += bounce.o 16obj-$(CONFIG_BOUNCE) += bounce.o
17obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 17obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
18obj-$(CONFIG_HUGETLBFS) += hugetlb.o 18obj-$(CONFIG_HUGETLBFS) += hugetlb.o
19obj-$(CONFIG_NUMA) += mempolicy.o 19obj-$(CONFIG_NUMA) += mempolicy.o
20obj-$(CONFIG_SPARSEMEM) += sparse.o 20obj-$(CONFIG_SPARSEMEM) += sparse.o
21obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
21obj-$(CONFIG_SHMEM) += shmem.o 22obj-$(CONFIG_SHMEM) += shmem.o
22obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o 23obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
23obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 24obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f50a2811f9dc..b0ceb29da4c7 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -5,6 +5,41 @@
5#include <linux/sched.h> 5#include <linux/sched.h>
6#include <linux/module.h> 6#include <linux/module.h>
7 7
8int bdi_init(struct backing_dev_info *bdi)
9{
10 int i, j;
11 int err;
12
13 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
14 err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
15 if (err)
16 goto err;
17 }
18
19 bdi->dirty_exceeded = 0;
20 err = prop_local_init_percpu(&bdi->completions);
21
22 if (err) {
23err:
24 for (j = 0; j < i; j++)
25 percpu_counter_destroy(&bdi->bdi_stat[i]);
26 }
27
28 return err;
29}
30EXPORT_SYMBOL(bdi_init);
31
32void bdi_destroy(struct backing_dev_info *bdi)
33{
34 int i;
35
36 for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
37 percpu_counter_destroy(&bdi->bdi_stat[i]);
38
39 prop_local_destroy_percpu(&bdi->completions);
40}
41EXPORT_SYMBOL(bdi_destroy);
42
8static wait_queue_head_t congestion_wqh[2] = { 43static wait_queue_head_t congestion_wqh[2] = {
9 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), 44 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
10 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) 45 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
@@ -55,15 +90,3 @@ long congestion_wait(int rw, long timeout)
55} 90}
56EXPORT_SYMBOL(congestion_wait); 91EXPORT_SYMBOL(congestion_wait);
57 92
58/**
59 * congestion_end - wake up sleepers on a congested backing_dev_info
60 * @rw: READ or WRITE
61 */
62void congestion_end(int rw)
63{
64 wait_queue_head_t *wqh = &congestion_wqh[rw];
65
66 if (waitqueue_active(wqh))
67 wake_up(wqh);
68}
69EXPORT_SYMBOL(congestion_end);
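
As a rough illustration of the bdi_init()/bdi_destroy() pair added in the hunk above, the sketch below shows how a block driver might wrap them around its backing_dev_info. Only the two bdi helpers come from this patch; the mydev structure and function names are hypothetical.

#include <linux/backing-dev.h>

struct mydev {				/* hypothetical driver private data */
	struct backing_dev_info bdi;
};

static int mydev_setup(struct mydev *dev)
{
	int err;

	/* set up the per-cpu bdi_stat counters and completion state */
	err = bdi_init(&dev->bdi);
	if (err)
		return err;
	/* ... register queues, sysfs entries, etc. ... */
	return 0;
}

static void mydev_teardown(struct mydev *dev)
{
	/* tear down the counters once no more writeback stats can arrive */
	bdi_destroy(&dev->bdi);
}
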
diff --git a/mm/bounce.c b/mm/bounce.c
index 3b549bf31f7d..b6d2d0f1019b 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -265,6 +265,12 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
265 mempool_t *pool; 265 mempool_t *pool;
266 266
267 /* 267 /*
268 * Data-less bio, nothing to bounce
269 */
270 if (bio_empty_barrier(*bio_orig))
271 return;
272
273 /*
268 * for non-isa bounce case, just check if the bounce pfn is equal 274 * for non-isa bounce case, just check if the bounce pfn is equal
269 * to or bigger than the highest pfn in the system -- in that case, 275 * to or bigger than the highest pfn in the system -- in that case,
270 * don't waste time iterating over bio segments 276 * don't waste time iterating over bio segments
diff --git a/mm/filemap.c b/mm/filemap.c
index 15c8413ee929..79f24a969cb4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -30,7 +30,7 @@
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/cpuset.h> 32#include <linux/cpuset.h>
33#include "filemap.h" 33#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
34#include "internal.h" 34#include "internal.h"
35 35
36/* 36/*
@@ -63,6 +63,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
63 * ->private_lock (__free_pte->__set_page_dirty_buffers) 63 * ->private_lock (__free_pte->__set_page_dirty_buffers)
64 * ->swap_lock (exclusive_swap_page, others) 64 * ->swap_lock (exclusive_swap_page, others)
65 * ->mapping->tree_lock 65 * ->mapping->tree_lock
66 * ->zone.lock
66 * 67 *
67 * ->i_mutex 68 * ->i_mutex
68 * ->i_mmap_lock (truncate->unmap_mapping_range) 69 * ->i_mmap_lock (truncate->unmap_mapping_range)
@@ -593,7 +594,7 @@ void fastcall __lock_page_nosync(struct page *page)
593 * Is there a pagecache struct page at the given (mapping, offset) tuple? 594 * Is there a pagecache struct page at the given (mapping, offset) tuple?
594 * If yes, increment its refcount and return it; if no, return NULL. 595 * If yes, increment its refcount and return it; if no, return NULL.
595 */ 596 */
596struct page * find_get_page(struct address_space *mapping, unsigned long offset) 597struct page * find_get_page(struct address_space *mapping, pgoff_t offset)
597{ 598{
598 struct page *page; 599 struct page *page;
599 600
@@ -617,30 +618,31 @@ EXPORT_SYMBOL(find_get_page);
617 * Returns zero if the page was not present. find_lock_page() may sleep. 618 * Returns zero if the page was not present. find_lock_page() may sleep.
618 */ 619 */
619struct page *find_lock_page(struct address_space *mapping, 620struct page *find_lock_page(struct address_space *mapping,
620 unsigned long offset) 621 pgoff_t offset)
621{ 622{
622 struct page *page; 623 struct page *page;
623 624
624 read_lock_irq(&mapping->tree_lock);
625repeat: 625repeat:
626 read_lock_irq(&mapping->tree_lock);
626 page = radix_tree_lookup(&mapping->page_tree, offset); 627 page = radix_tree_lookup(&mapping->page_tree, offset);
627 if (page) { 628 if (page) {
628 page_cache_get(page); 629 page_cache_get(page);
629 if (TestSetPageLocked(page)) { 630 if (TestSetPageLocked(page)) {
630 read_unlock_irq(&mapping->tree_lock); 631 read_unlock_irq(&mapping->tree_lock);
631 __lock_page(page); 632 __lock_page(page);
632 read_lock_irq(&mapping->tree_lock);
633 633
634 /* Has the page been truncated while we slept? */ 634 /* Has the page been truncated while we slept? */
635 if (unlikely(page->mapping != mapping || 635 if (unlikely(page->mapping != mapping)) {
636 page->index != offset)) {
637 unlock_page(page); 636 unlock_page(page);
638 page_cache_release(page); 637 page_cache_release(page);
639 goto repeat; 638 goto repeat;
640 } 639 }
640 VM_BUG_ON(page->index != offset);
641 goto out;
641 } 642 }
642 } 643 }
643 read_unlock_irq(&mapping->tree_lock); 644 read_unlock_irq(&mapping->tree_lock);
645out:
644 return page; 646 return page;
645} 647}
646EXPORT_SYMBOL(find_lock_page); 648EXPORT_SYMBOL(find_lock_page);
@@ -663,29 +665,24 @@ EXPORT_SYMBOL(find_lock_page);
663 * memory exhaustion. 665 * memory exhaustion.
664 */ 666 */
665struct page *find_or_create_page(struct address_space *mapping, 667struct page *find_or_create_page(struct address_space *mapping,
666 unsigned long index, gfp_t gfp_mask) 668 pgoff_t index, gfp_t gfp_mask)
667{ 669{
668 struct page *page, *cached_page = NULL; 670 struct page *page;
669 int err; 671 int err;
670repeat: 672repeat:
671 page = find_lock_page(mapping, index); 673 page = find_lock_page(mapping, index);
672 if (!page) { 674 if (!page) {
673 if (!cached_page) { 675 page = __page_cache_alloc(gfp_mask);
674 cached_page = 676 if (!page)
675 __page_cache_alloc(gfp_mask); 677 return NULL;
676 if (!cached_page) 678 err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
677 return NULL; 679 if (unlikely(err)) {
680 page_cache_release(page);
681 page = NULL;
682 if (err == -EEXIST)
683 goto repeat;
678 } 684 }
679 err = add_to_page_cache_lru(cached_page, mapping,
680 index, gfp_mask);
681 if (!err) {
682 page = cached_page;
683 cached_page = NULL;
684 } else if (err == -EEXIST)
685 goto repeat;
686 } 685 }
687 if (cached_page)
688 page_cache_release(cached_page);
689 return page; 686 return page;
690} 687}
691EXPORT_SYMBOL(find_or_create_page); 688EXPORT_SYMBOL(find_or_create_page);
@@ -797,7 +794,7 @@ EXPORT_SYMBOL(find_get_pages_tag);
797 * and deadlock against the caller's locked page. 794 * and deadlock against the caller's locked page.
798 */ 795 */
799struct page * 796struct page *
800grab_cache_page_nowait(struct address_space *mapping, unsigned long index) 797grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
801{ 798{
802 struct page *page = find_get_page(mapping, index); 799 struct page *page = find_get_page(mapping, index);
803 800
@@ -859,34 +856,29 @@ static void shrink_readahead_size_eio(struct file *filp,
859 * It may be NULL. 856 * It may be NULL.
860 */ 857 */
861void do_generic_mapping_read(struct address_space *mapping, 858void do_generic_mapping_read(struct address_space *mapping,
862 struct file_ra_state *_ra, 859 struct file_ra_state *ra,
863 struct file *filp, 860 struct file *filp,
864 loff_t *ppos, 861 loff_t *ppos,
865 read_descriptor_t *desc, 862 read_descriptor_t *desc,
866 read_actor_t actor) 863 read_actor_t actor)
867{ 864{
868 struct inode *inode = mapping->host; 865 struct inode *inode = mapping->host;
869 unsigned long index; 866 pgoff_t index;
870 unsigned long offset; 867 pgoff_t last_index;
871 unsigned long last_index; 868 pgoff_t prev_index;
872 unsigned long next_index; 869 unsigned long offset; /* offset into pagecache page */
873 unsigned long prev_index;
874 unsigned int prev_offset; 870 unsigned int prev_offset;
875 struct page *cached_page;
876 int error; 871 int error;
877 struct file_ra_state ra = *_ra;
878 872
879 cached_page = NULL;
880 index = *ppos >> PAGE_CACHE_SHIFT; 873 index = *ppos >> PAGE_CACHE_SHIFT;
881 next_index = index; 874 prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
882 prev_index = ra.prev_index; 875 prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
883 prev_offset = ra.prev_offset;
884 last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 876 last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
885 offset = *ppos & ~PAGE_CACHE_MASK; 877 offset = *ppos & ~PAGE_CACHE_MASK;
886 878
887 for (;;) { 879 for (;;) {
888 struct page *page; 880 struct page *page;
889 unsigned long end_index; 881 pgoff_t end_index;
890 loff_t isize; 882 loff_t isize;
891 unsigned long nr, ret; 883 unsigned long nr, ret;
892 884
@@ -895,7 +887,7 @@ find_page:
895 page = find_get_page(mapping, index); 887 page = find_get_page(mapping, index);
896 if (!page) { 888 if (!page) {
897 page_cache_sync_readahead(mapping, 889 page_cache_sync_readahead(mapping,
898 &ra, filp, 890 ra, filp,
899 index, last_index - index); 891 index, last_index - index);
900 page = find_get_page(mapping, index); 892 page = find_get_page(mapping, index);
901 if (unlikely(page == NULL)) 893 if (unlikely(page == NULL))
@@ -903,7 +895,7 @@ find_page:
903 } 895 }
904 if (PageReadahead(page)) { 896 if (PageReadahead(page)) {
905 page_cache_async_readahead(mapping, 897 page_cache_async_readahead(mapping,
906 &ra, filp, page, 898 ra, filp, page,
907 index, last_index - index); 899 index, last_index - index);
908 } 900 }
909 if (!PageUptodate(page)) 901 if (!PageUptodate(page))
@@ -966,7 +958,6 @@ page_ok:
966 index += offset >> PAGE_CACHE_SHIFT; 958 index += offset >> PAGE_CACHE_SHIFT;
967 offset &= ~PAGE_CACHE_MASK; 959 offset &= ~PAGE_CACHE_MASK;
968 prev_offset = offset; 960 prev_offset = offset;
969 ra.prev_offset = offset;
970 961
971 page_cache_release(page); 962 page_cache_release(page);
972 if (ret == nr && desc->count) 963 if (ret == nr && desc->count)
@@ -1015,7 +1006,7 @@ readpage:
1015 } 1006 }
1016 unlock_page(page); 1007 unlock_page(page);
1017 error = -EIO; 1008 error = -EIO;
1018 shrink_readahead_size_eio(filp, &ra); 1009 shrink_readahead_size_eio(filp, ra);
1019 goto readpage_error; 1010 goto readpage_error;
1020 } 1011 }
1021 unlock_page(page); 1012 unlock_page(page);
@@ -1034,33 +1025,29 @@ no_cached_page:
1034 * Ok, it wasn't cached, so we need to create a new 1025 * Ok, it wasn't cached, so we need to create a new
1035 * page.. 1026 * page..
1036 */ 1027 */
1037 if (!cached_page) { 1028 page = page_cache_alloc_cold(mapping);
1038 cached_page = page_cache_alloc_cold(mapping); 1029 if (!page) {
1039 if (!cached_page) { 1030 desc->error = -ENOMEM;
1040 desc->error = -ENOMEM; 1031 goto out;
1041 goto out;
1042 }
1043 } 1032 }
1044 error = add_to_page_cache_lru(cached_page, mapping, 1033 error = add_to_page_cache_lru(page, mapping,
1045 index, GFP_KERNEL); 1034 index, GFP_KERNEL);
1046 if (error) { 1035 if (error) {
1036 page_cache_release(page);
1047 if (error == -EEXIST) 1037 if (error == -EEXIST)
1048 goto find_page; 1038 goto find_page;
1049 desc->error = error; 1039 desc->error = error;
1050 goto out; 1040 goto out;
1051 } 1041 }
1052 page = cached_page;
1053 cached_page = NULL;
1054 goto readpage; 1042 goto readpage;
1055 } 1043 }
1056 1044
1057out: 1045out:
1058 *_ra = ra; 1046 ra->prev_pos = prev_index;
1059 _ra->prev_index = prev_index; 1047 ra->prev_pos <<= PAGE_CACHE_SHIFT;
1048 ra->prev_pos |= prev_offset;
1060 1049
1061 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; 1050 *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
1062 if (cached_page)
1063 page_cache_release(cached_page);
1064 if (filp) 1051 if (filp)
1065 file_accessed(filp); 1052 file_accessed(filp);
1066} 1053}
@@ -1220,7 +1207,7 @@ EXPORT_SYMBOL(generic_file_aio_read);
1220 1207
1221static ssize_t 1208static ssize_t
1222do_readahead(struct address_space *mapping, struct file *filp, 1209do_readahead(struct address_space *mapping, struct file *filp,
1223 unsigned long index, unsigned long nr) 1210 pgoff_t index, unsigned long nr)
1224{ 1211{
1225 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) 1212 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1226 return -EINVAL; 1213 return -EINVAL;
@@ -1240,8 +1227,8 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1240 if (file) { 1227 if (file) {
1241 if (file->f_mode & FMODE_READ) { 1228 if (file->f_mode & FMODE_READ) {
1242 struct address_space *mapping = file->f_mapping; 1229 struct address_space *mapping = file->f_mapping;
1243 unsigned long start = offset >> PAGE_CACHE_SHIFT; 1230 pgoff_t start = offset >> PAGE_CACHE_SHIFT;
1244 unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT; 1231 pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
1245 unsigned long len = end - start + 1; 1232 unsigned long len = end - start + 1;
1246 ret = do_readahead(mapping, file, start, len); 1233 ret = do_readahead(mapping, file, start, len);
1247 } 1234 }
@@ -1251,7 +1238,6 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1251} 1238}
1252 1239
1253#ifdef CONFIG_MMU 1240#ifdef CONFIG_MMU
1254static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1255/** 1241/**
1256 * page_cache_read - adds requested page to the page cache if not already there 1242 * page_cache_read - adds requested page to the page cache if not already there
1257 * @file: file to read 1243 * @file: file to read
@@ -1260,7 +1246,7 @@ static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1260 * This adds the requested page to the page cache if it isn't already there, 1246 * This adds the requested page to the page cache if it isn't already there,
1261 * and schedules an I/O to read in its contents from disk. 1247 * and schedules an I/O to read in its contents from disk.
1262 */ 1248 */
1263static int fastcall page_cache_read(struct file * file, unsigned long offset) 1249static int fastcall page_cache_read(struct file * file, pgoff_t offset)
1264{ 1250{
1265 struct address_space *mapping = file->f_mapping; 1251 struct address_space *mapping = file->f_mapping;
1266 struct page *page; 1252 struct page *page;
@@ -1349,7 +1335,7 @@ retry_find:
1349 * Do we miss much more than hit in this file? If so, 1335 * Do we miss much more than hit in this file? If so,
1350 * stop bothering with read-ahead. It will only hurt. 1336 * stop bothering with read-ahead. It will only hurt.
1351 */ 1337 */
1352 if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS) 1338 if (ra->mmap_miss > MMAP_LOTSAMISS)
1353 goto no_cached_page; 1339 goto no_cached_page;
1354 1340
1355 /* 1341 /*
@@ -1375,7 +1361,7 @@ retry_find:
1375 } 1361 }
1376 1362
1377 if (!did_readaround) 1363 if (!did_readaround)
1378 ra->mmap_hit++; 1364 ra->mmap_miss--;
1379 1365
1380 /* 1366 /*
1381 * We have a locked page in the page cache, now we need to check 1367 * We have a locked page in the page cache, now we need to check
@@ -1396,7 +1382,7 @@ retry_find:
1396 * Found the page and have a reference on it. 1382 * Found the page and have a reference on it.
1397 */ 1383 */
1398 mark_page_accessed(page); 1384 mark_page_accessed(page);
1399 ra->prev_index = page->index; 1385 ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
1400 vmf->page = page; 1386 vmf->page = page;
1401 return ret | VM_FAULT_LOCKED; 1387 return ret | VM_FAULT_LOCKED;
1402 1388
@@ -1501,39 +1487,32 @@ EXPORT_SYMBOL(generic_file_mmap);
1501EXPORT_SYMBOL(generic_file_readonly_mmap); 1487EXPORT_SYMBOL(generic_file_readonly_mmap);
1502 1488
1503static struct page *__read_cache_page(struct address_space *mapping, 1489static struct page *__read_cache_page(struct address_space *mapping,
1504 unsigned long index, 1490 pgoff_t index,
1505 int (*filler)(void *,struct page*), 1491 int (*filler)(void *,struct page*),
1506 void *data) 1492 void *data)
1507{ 1493{
1508 struct page *page, *cached_page = NULL; 1494 struct page *page;
1509 int err; 1495 int err;
1510repeat: 1496repeat:
1511 page = find_get_page(mapping, index); 1497 page = find_get_page(mapping, index);
1512 if (!page) { 1498 if (!page) {
1513 if (!cached_page) { 1499 page = page_cache_alloc_cold(mapping);
1514 cached_page = page_cache_alloc_cold(mapping); 1500 if (!page)
1515 if (!cached_page) 1501 return ERR_PTR(-ENOMEM);
1516 return ERR_PTR(-ENOMEM); 1502 err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
1517 } 1503 if (unlikely(err)) {
1518 err = add_to_page_cache_lru(cached_page, mapping, 1504 page_cache_release(page);
1519 index, GFP_KERNEL); 1505 if (err == -EEXIST)
1520 if (err == -EEXIST) 1506 goto repeat;
1521 goto repeat;
1522 if (err < 0) {
1523 /* Presumably ENOMEM for radix tree node */ 1507 /* Presumably ENOMEM for radix tree node */
1524 page_cache_release(cached_page);
1525 return ERR_PTR(err); 1508 return ERR_PTR(err);
1526 } 1509 }
1527 page = cached_page;
1528 cached_page = NULL;
1529 err = filler(data, page); 1510 err = filler(data, page);
1530 if (err < 0) { 1511 if (err < 0) {
1531 page_cache_release(page); 1512 page_cache_release(page);
1532 page = ERR_PTR(err); 1513 page = ERR_PTR(err);
1533 } 1514 }
1534 } 1515 }
1535 if (cached_page)
1536 page_cache_release(cached_page);
1537 return page; 1516 return page;
1538} 1517}
1539 1518
@@ -1542,7 +1521,7 @@ repeat:
1542 * after submitting it to the filler. 1521 * after submitting it to the filler.
1543 */ 1522 */
1544struct page *read_cache_page_async(struct address_space *mapping, 1523struct page *read_cache_page_async(struct address_space *mapping,
1545 unsigned long index, 1524 pgoff_t index,
1546 int (*filler)(void *,struct page*), 1525 int (*filler)(void *,struct page*),
1547 void *data) 1526 void *data)
1548{ 1527{
@@ -1590,7 +1569,7 @@ EXPORT_SYMBOL(read_cache_page_async);
1590 * If the page does not get brought uptodate, return -EIO. 1569 * If the page does not get brought uptodate, return -EIO.
1591 */ 1570 */
1592struct page *read_cache_page(struct address_space *mapping, 1571struct page *read_cache_page(struct address_space *mapping,
1593 unsigned long index, 1572 pgoff_t index,
1594 int (*filler)(void *,struct page*), 1573 int (*filler)(void *,struct page*),
1595 void *data) 1574 void *data)
1596{ 1575{
@@ -1610,40 +1589,6 @@ struct page *read_cache_page(struct address_space *mapping,
1610EXPORT_SYMBOL(read_cache_page); 1589EXPORT_SYMBOL(read_cache_page);
1611 1590
1612/* 1591/*
1613 * If the page was newly created, increment its refcount and add it to the
1614 * caller's lru-buffering pagevec. This function is specifically for
1615 * generic_file_write().
1616 */
1617static inline struct page *
1618__grab_cache_page(struct address_space *mapping, unsigned long index,
1619 struct page **cached_page, struct pagevec *lru_pvec)
1620{
1621 int err;
1622 struct page *page;
1623repeat:
1624 page = find_lock_page(mapping, index);
1625 if (!page) {
1626 if (!*cached_page) {
1627 *cached_page = page_cache_alloc(mapping);
1628 if (!*cached_page)
1629 return NULL;
1630 }
1631 err = add_to_page_cache(*cached_page, mapping,
1632 index, GFP_KERNEL);
1633 if (err == -EEXIST)
1634 goto repeat;
1635 if (err == 0) {
1636 page = *cached_page;
1637 page_cache_get(page);
1638 if (!pagevec_add(lru_pvec, page))
1639 __pagevec_lru_add(lru_pvec);
1640 *cached_page = NULL;
1641 }
1642 }
1643 return page;
1644}
1645
1646/*
1647 * The logic we want is 1592 * The logic we want is
1648 * 1593 *
1649 * if suid or (sgid and xgrp) 1594 * if suid or (sgid and xgrp)
@@ -1682,17 +1627,22 @@ int __remove_suid(struct dentry *dentry, int kill)
1682 1627
1683int remove_suid(struct dentry *dentry) 1628int remove_suid(struct dentry *dentry)
1684{ 1629{
1685 int kill = should_remove_suid(dentry); 1630 int killsuid = should_remove_suid(dentry);
1631 int killpriv = security_inode_need_killpriv(dentry);
1632 int error = 0;
1686 1633
1687 if (unlikely(kill)) 1634 if (killpriv < 0)
1688 return __remove_suid(dentry, kill); 1635 return killpriv;
1636 if (killpriv)
1637 error = security_inode_killpriv(dentry);
1638 if (!error && killsuid)
1639 error = __remove_suid(dentry, killsuid);
1689 1640
1690 return 0; 1641 return error;
1691} 1642}
1692EXPORT_SYMBOL(remove_suid); 1643EXPORT_SYMBOL(remove_suid);
1693 1644
1694size_t 1645static size_t __iovec_copy_from_user_inatomic(char *vaddr,
1695__filemap_copy_from_user_iovec_inatomic(char *vaddr,
1696 const struct iovec *iov, size_t base, size_t bytes) 1646 const struct iovec *iov, size_t base, size_t bytes)
1697{ 1647{
1698 size_t copied = 0, left = 0; 1648 size_t copied = 0, left = 0;
@@ -1715,6 +1665,124 @@ __filemap_copy_from_user_iovec_inatomic(char *vaddr,
1715} 1665}
1716 1666
1717/* 1667/*
1668 * Copy as much as we can into the page and return the number of bytes which
1669 * were sucessfully copied. If a fault is encountered then return the number of
1670 * bytes which were copied.
1671 */
1672size_t iov_iter_copy_from_user_atomic(struct page *page,
1673 struct iov_iter *i, unsigned long offset, size_t bytes)
1674{
1675 char *kaddr;
1676 size_t copied;
1677
1678 BUG_ON(!in_atomic());
1679 kaddr = kmap_atomic(page, KM_USER0);
1680 if (likely(i->nr_segs == 1)) {
1681 int left;
1682 char __user *buf = i->iov->iov_base + i->iov_offset;
1683 left = __copy_from_user_inatomic_nocache(kaddr + offset,
1684 buf, bytes);
1685 copied = bytes - left;
1686 } else {
1687 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1688 i->iov, i->iov_offset, bytes);
1689 }
1690 kunmap_atomic(kaddr, KM_USER0);
1691
1692 return copied;
1693}
1694EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
1695
1696/*
1697 * This has the same sideeffects and return value as
1698 * iov_iter_copy_from_user_atomic().
1699 * The difference is that it attempts to resolve faults.
1700 * Page must not be locked.
1701 */
1702size_t iov_iter_copy_from_user(struct page *page,
1703 struct iov_iter *i, unsigned long offset, size_t bytes)
1704{
1705 char *kaddr;
1706 size_t copied;
1707
1708 kaddr = kmap(page);
1709 if (likely(i->nr_segs == 1)) {
1710 int left;
1711 char __user *buf = i->iov->iov_base + i->iov_offset;
1712 left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
1713 copied = bytes - left;
1714 } else {
1715 copied = __iovec_copy_from_user_inatomic(kaddr + offset,
1716 i->iov, i->iov_offset, bytes);
1717 }
1718 kunmap(page);
1719 return copied;
1720}
1721EXPORT_SYMBOL(iov_iter_copy_from_user);
1722
1723static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes)
1724{
1725 if (likely(i->nr_segs == 1)) {
1726 i->iov_offset += bytes;
1727 } else {
1728 const struct iovec *iov = i->iov;
1729 size_t base = i->iov_offset;
1730
1731 while (bytes) {
1732 int copy = min(bytes, iov->iov_len - base);
1733
1734 bytes -= copy;
1735 base += copy;
1736 if (iov->iov_len == base) {
1737 iov++;
1738 base = 0;
1739 }
1740 }
1741 i->iov = iov;
1742 i->iov_offset = base;
1743 }
1744}
1745
1746void iov_iter_advance(struct iov_iter *i, size_t bytes)
1747{
1748 BUG_ON(i->count < bytes);
1749
1750 __iov_iter_advance_iov(i, bytes);
1751 i->count -= bytes;
1752}
1753EXPORT_SYMBOL(iov_iter_advance);
1754
1755/*
1756 * Fault in the first iovec of the given iov_iter, to a maximum length
1757 * of bytes. Returns 0 on success, or non-zero if the memory could not be
1758 * accessed (ie. because it is an invalid address).
1759 *
1760 * writev-intensive code may want this to prefault several iovecs -- that
1761 * would be possible (callers must not rely on the fact that _only_ the
1762 * first iovec will be faulted with the current implementation).
1763 */
1764int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
1765{
1766 char __user *buf = i->iov->iov_base + i->iov_offset;
1767 bytes = min(bytes, i->iov->iov_len - i->iov_offset);
1768 return fault_in_pages_readable(buf, bytes);
1769}
1770EXPORT_SYMBOL(iov_iter_fault_in_readable);
1771
1772/*
1773 * Return the count of just the current iov_iter segment.
1774 */
1775size_t iov_iter_single_seg_count(struct iov_iter *i)
1776{
1777 const struct iovec *iov = i->iov;
1778 if (i->nr_segs == 1)
1779 return i->count;
1780 else
1781 return min(i->count, iov->iov_len - i->iov_offset);
1782}
1783EXPORT_SYMBOL(iov_iter_single_seg_count);
1784
1785/*
1718 * Performs necessary checks before doing a write 1786 * Performs necessary checks before doing a write
1719 * 1787 *
1720 * Can adjust writing position or amount of bytes to write. 1788 * Can adjust writing position or amount of bytes to write.
@@ -1796,6 +1864,91 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
1796} 1864}
1797EXPORT_SYMBOL(generic_write_checks); 1865EXPORT_SYMBOL(generic_write_checks);
1798 1866
1867int pagecache_write_begin(struct file *file, struct address_space *mapping,
1868 loff_t pos, unsigned len, unsigned flags,
1869 struct page **pagep, void **fsdata)
1870{
1871 const struct address_space_operations *aops = mapping->a_ops;
1872
1873 if (aops->write_begin) {
1874 return aops->write_begin(file, mapping, pos, len, flags,
1875 pagep, fsdata);
1876 } else {
1877 int ret;
1878 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1879 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
1880 struct inode *inode = mapping->host;
1881 struct page *page;
1882again:
1883 page = __grab_cache_page(mapping, index);
1884 *pagep = page;
1885 if (!page)
1886 return -ENOMEM;
1887
1888 if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
1889 /*
1890 * There is no way to resolve a short write situation
1891 * for a !Uptodate page (except by double copying in
1892 * the caller done by generic_perform_write_2copy).
1893 *
1894 * Instead, we have to bring it uptodate here.
1895 */
1896 ret = aops->readpage(file, page);
1897 page_cache_release(page);
1898 if (ret) {
1899 if (ret == AOP_TRUNCATED_PAGE)
1900 goto again;
1901 return ret;
1902 }
1903 goto again;
1904 }
1905
1906 ret = aops->prepare_write(file, page, offset, offset+len);
1907 if (ret) {
1908 unlock_page(page);
1909 page_cache_release(page);
1910 if (pos + len > inode->i_size)
1911 vmtruncate(inode, inode->i_size);
1912 }
1913 return ret;
1914 }
1915}
1916EXPORT_SYMBOL(pagecache_write_begin);
1917
1918int pagecache_write_end(struct file *file, struct address_space *mapping,
1919 loff_t pos, unsigned len, unsigned copied,
1920 struct page *page, void *fsdata)
1921{
1922 const struct address_space_operations *aops = mapping->a_ops;
1923 int ret;
1924
1925 if (aops->write_end) {
1926 mark_page_accessed(page);
1927 ret = aops->write_end(file, mapping, pos, len, copied,
1928 page, fsdata);
1929 } else {
1930 unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
1931 struct inode *inode = mapping->host;
1932
1933 flush_dcache_page(page);
1934 ret = aops->commit_write(file, page, offset, offset+len);
1935 unlock_page(page);
1936 mark_page_accessed(page);
1937 page_cache_release(page);
1938
1939 if (ret < 0) {
1940 if (pos + len > inode->i_size)
1941 vmtruncate(inode, inode->i_size);
1942 } else if (ret > 0)
1943 ret = min_t(size_t, copied, ret);
1944 else
1945 ret = copied;
1946 }
1947
1948 return ret;
1949}
1950EXPORT_SYMBOL(pagecache_write_end);
1951
1799ssize_t 1952ssize_t
1800generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, 1953generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1801 unsigned long *nr_segs, loff_t pos, loff_t *ppos, 1954 unsigned long *nr_segs, loff_t pos, loff_t *ppos,
@@ -1835,151 +1988,314 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1835} 1988}
1836EXPORT_SYMBOL(generic_file_direct_write); 1989EXPORT_SYMBOL(generic_file_direct_write);
1837 1990
1838ssize_t 1991/*
1839generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, 1992 * Find or create a page at the given pagecache position. Return the locked
1840 unsigned long nr_segs, loff_t pos, loff_t *ppos, 1993 * page. This function is specifically for buffered writes.
1841 size_t count, ssize_t written) 1994 */
1995struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
1842{ 1996{
1843 struct file *file = iocb->ki_filp; 1997 int status;
1844 struct address_space * mapping = file->f_mapping; 1998 struct page *page;
1845 const struct address_space_operations *a_ops = mapping->a_ops; 1999repeat:
1846 struct inode *inode = mapping->host; 2000 page = find_lock_page(mapping, index);
1847 long status = 0; 2001 if (likely(page))
1848 struct page *page; 2002 return page;
1849 struct page *cached_page = NULL;
1850 size_t bytes;
1851 struct pagevec lru_pvec;
1852 const struct iovec *cur_iov = iov; /* current iovec */
1853 size_t iov_base = 0; /* offset in the current iovec */
1854 char __user *buf;
1855
1856 pagevec_init(&lru_pvec, 0);
1857 2003
1858 /* 2004 page = page_cache_alloc(mapping);
1859 * handle partial DIO write. Adjust cur_iov if needed. 2005 if (!page)
1860 */ 2006 return NULL;
1861 if (likely(nr_segs == 1)) 2007 status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
1862 buf = iov->iov_base + written; 2008 if (unlikely(status)) {
1863 else { 2009 page_cache_release(page);
1864 filemap_set_next_iovec(&cur_iov, &iov_base, written); 2010 if (status == -EEXIST)
1865 buf = cur_iov->iov_base + iov_base; 2011 goto repeat;
2012 return NULL;
1866 } 2013 }
2014 return page;
2015}
2016EXPORT_SYMBOL(__grab_cache_page);
2017
2018static ssize_t generic_perform_write_2copy(struct file *file,
2019 struct iov_iter *i, loff_t pos)
2020{
2021 struct address_space *mapping = file->f_mapping;
2022 const struct address_space_operations *a_ops = mapping->a_ops;
2023 struct inode *inode = mapping->host;
2024 long status = 0;
2025 ssize_t written = 0;
1867 2026
1868 do { 2027 do {
1869 unsigned long index; 2028 struct page *src_page;
1870 unsigned long offset; 2029 struct page *page;
1871 size_t copied; 2030 pgoff_t index; /* Pagecache index for current page */
2031 unsigned long offset; /* Offset into pagecache page */
2032 unsigned long bytes; /* Bytes to write to page */
2033 size_t copied; /* Bytes copied from user */
1872 2034
1873 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ 2035 offset = (pos & (PAGE_CACHE_SIZE - 1));
1874 index = pos >> PAGE_CACHE_SHIFT; 2036 index = pos >> PAGE_CACHE_SHIFT;
1875 bytes = PAGE_CACHE_SIZE - offset; 2037 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2038 iov_iter_count(i));
1876 2039
1877 /* Limit the size of the copy to the caller's write size */ 2040 /*
1878 bytes = min(bytes, count); 2041 * a non-NULL src_page indicates that we're doing the
1879 2042 * copy via get_user_pages and kmap.
1880 /* We only need to worry about prefaulting when writes are from
1881 * user-space. NFSd uses vfs_writev with several non-aligned
1882 * segments in the vector, and limiting to one segment a time is
1883 * a noticeable performance for re-write
1884 */ 2043 */
1885 if (!segment_eq(get_fs(), KERNEL_DS)) { 2044 src_page = NULL;
1886 /*
1887 * Limit the size of the copy to that of the current
1888 * segment, because fault_in_pages_readable() doesn't
1889 * know how to walk segments.
1890 */
1891 bytes = min(bytes, cur_iov->iov_len - iov_base);
1892 2045
1893 /* 2046 /*
1894 * Bring in the user page that we will copy from 2047 * Bring in the user page that we will copy from _first_.
1895 * _first_. Otherwise there's a nasty deadlock on 2048 * Otherwise there's a nasty deadlock on copying from the
1896 * copying from the same page as we're writing to, 2049 * same page as we're writing to, without it being marked
1897 * without it being marked up-to-date. 2050 * up-to-date.
1898 */ 2051 *
1899 fault_in_pages_readable(buf, bytes); 2052 * Not only is this an optimisation, but it is also required
2053 * to check that the address is actually valid, when atomic
2054 * usercopies are used, below.
2055 */
2056 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2057 status = -EFAULT;
2058 break;
1900 } 2059 }
1901 page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); 2060
2061 page = __grab_cache_page(mapping, index);
1902 if (!page) { 2062 if (!page) {
1903 status = -ENOMEM; 2063 status = -ENOMEM;
1904 break; 2064 break;
1905 } 2065 }
1906 2066
1907 if (unlikely(bytes == 0)) { 2067 /*
1908 status = 0; 2068 * non-uptodate pages cannot cope with short copies, and we
1909 copied = 0; 2069 * cannot take a pagefault with the destination page locked.
1910 goto zero_length_segment; 2070 * So pin the source page to copy it.
1911 } 2071 */
2072 if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) {
2073 unlock_page(page);
1912 2074
1913 status = a_ops->prepare_write(file, page, offset, offset+bytes); 2075 src_page = alloc_page(GFP_KERNEL);
1914 if (unlikely(status)) { 2076 if (!src_page) {
1915 loff_t isize = i_size_read(inode); 2077 page_cache_release(page);
2078 status = -ENOMEM;
2079 break;
2080 }
1916 2081
1917 if (status != AOP_TRUNCATED_PAGE) 2082 /*
2083 * Cannot get_user_pages with a page locked for the
2084 * same reason as we can't take a page fault with a
2085 * page locked (as explained below).
2086 */
2087 copied = iov_iter_copy_from_user(src_page, i,
2088 offset, bytes);
2089 if (unlikely(copied == 0)) {
2090 status = -EFAULT;
2091 page_cache_release(page);
2092 page_cache_release(src_page);
2093 break;
2094 }
2095 bytes = copied;
2096
2097 lock_page(page);
2098 /*
2099 * Can't handle the page going uptodate here, because
2100 * that means we would use non-atomic usercopies, which
2101 * zero out the tail of the page, which can cause
2102 * zeroes to become transiently visible. We could just
2103 * use a non-zeroing copy, but the APIs aren't too
2104 * consistent.
2105 */
2106 if (unlikely(!page->mapping || PageUptodate(page))) {
1918 unlock_page(page); 2107 unlock_page(page);
1919 page_cache_release(page); 2108 page_cache_release(page);
1920 if (status == AOP_TRUNCATED_PAGE) 2109 page_cache_release(src_page);
1921 continue; 2110 continue;
2111 }
2112 }
2113
2114 status = a_ops->prepare_write(file, page, offset, offset+bytes);
2115 if (unlikely(status))
2116 goto fs_write_aop_error;
2117
2118 if (!src_page) {
1922 /* 2119 /*
1923 * prepare_write() may have instantiated a few blocks 2120 * Must not enter the pagefault handler here, because
1924 * outside i_size. Trim these off again. 2121 * we hold the page lock, so we might recursively
2122 * deadlock on the same lock, or get an ABBA deadlock
2123 * against a different lock, or against the mmap_sem
2124 * (which nests outside the page lock). So increment
2125 * preempt count, and use _atomic usercopies.
2126 *
2127 * The page is uptodate so we are OK to encounter a
2128 * short copy: if unmodified parts of the page are
2129 * marked dirty and written out to disk, it doesn't
2130 * really matter.
1925 */ 2131 */
1926 if (pos + bytes > isize) 2132 pagefault_disable();
1927 vmtruncate(inode, isize); 2133 copied = iov_iter_copy_from_user_atomic(page, i,
1928 break; 2134 offset, bytes);
2135 pagefault_enable();
2136 } else {
2137 void *src, *dst;
2138 src = kmap_atomic(src_page, KM_USER0);
2139 dst = kmap_atomic(page, KM_USER1);
2140 memcpy(dst + offset, src + offset, bytes);
2141 kunmap_atomic(dst, KM_USER1);
2142 kunmap_atomic(src, KM_USER0);
2143 copied = bytes;
1929 } 2144 }
1930 if (likely(nr_segs == 1))
1931 copied = filemap_copy_from_user(page, offset,
1932 buf, bytes);
1933 else
1934 copied = filemap_copy_from_user_iovec(page, offset,
1935 cur_iov, iov_base, bytes);
1936 flush_dcache_page(page); 2145 flush_dcache_page(page);
2146
1937 status = a_ops->commit_write(file, page, offset, offset+bytes); 2147 status = a_ops->commit_write(file, page, offset, offset+bytes);
1938 if (status == AOP_TRUNCATED_PAGE) { 2148 if (unlikely(status < 0))
1939 page_cache_release(page); 2149 goto fs_write_aop_error;
1940 continue; 2150 if (unlikely(status > 0)) /* filesystem did partial write */
1941 } 2151 copied = min_t(size_t, copied, status);
1942zero_length_segment: 2152
1943 if (likely(copied >= 0)) {
1944 if (!status)
1945 status = copied;
1946
1947 if (status >= 0) {
1948 written += status;
1949 count -= status;
1950 pos += status;
1951 buf += status;
1952 if (unlikely(nr_segs > 1)) {
1953 filemap_set_next_iovec(&cur_iov,
1954 &iov_base, status);
1955 if (count)
1956 buf = cur_iov->iov_base +
1957 iov_base;
1958 } else {
1959 iov_base += status;
1960 }
1961 }
1962 }
1963 if (unlikely(copied != bytes))
1964 if (status >= 0)
1965 status = -EFAULT;
1966 unlock_page(page); 2153 unlock_page(page);
1967 mark_page_accessed(page); 2154 mark_page_accessed(page);
1968 page_cache_release(page); 2155 page_cache_release(page);
1969 if (status < 0) 2156 if (src_page)
1970 break; 2157 page_cache_release(src_page);
2158
2159 iov_iter_advance(i, copied);
2160 pos += copied;
2161 written += copied;
2162
1971 balance_dirty_pages_ratelimited(mapping); 2163 balance_dirty_pages_ratelimited(mapping);
1972 cond_resched(); 2164 cond_resched();
1973 } while (count); 2165 continue;
1974 *ppos = pos;
1975 2166
1976 if (cached_page) 2167fs_write_aop_error:
1977 page_cache_release(cached_page); 2168 unlock_page(page);
2169 page_cache_release(page);
2170 if (src_page)
2171 page_cache_release(src_page);
2172
2173 /*
2174 * prepare_write() may have instantiated a few blocks
2175 * outside i_size. Trim these off again. Don't need
2176 * i_size_read because we hold i_mutex.
2177 */
2178 if (pos + bytes > inode->i_size)
2179 vmtruncate(inode, inode->i_size);
2180 break;
2181 } while (iov_iter_count(i));
2182
2183 return written ? written : status;
2184}
2185
2186static ssize_t generic_perform_write(struct file *file,
2187 struct iov_iter *i, loff_t pos)
2188{
2189 struct address_space *mapping = file->f_mapping;
2190 const struct address_space_operations *a_ops = mapping->a_ops;
2191 long status = 0;
2192 ssize_t written = 0;
2193 unsigned int flags = 0;
1978 2194
1979 /* 2195 /*
1980 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC 2196 * Copies from kernel address space cannot fail (NFSD is a big user).
1981 */ 2197 */
2198 if (segment_eq(get_fs(), KERNEL_DS))
2199 flags |= AOP_FLAG_UNINTERRUPTIBLE;
2200
2201 do {
2202 struct page *page;
2203 pgoff_t index; /* Pagecache index for current page */
2204 unsigned long offset; /* Offset into pagecache page */
2205 unsigned long bytes; /* Bytes to write to page */
2206 size_t copied; /* Bytes copied from user */
2207 void *fsdata;
2208
2209 offset = (pos & (PAGE_CACHE_SIZE - 1));
2210 index = pos >> PAGE_CACHE_SHIFT;
2211 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2212 iov_iter_count(i));
2213
2214again:
2215
2216 /*
2217 * Bring in the user page that we will copy from _first_.
2218 * Otherwise there's a nasty deadlock on copying from the
2219 * same page as we're writing to, without it being marked
2220 * up-to-date.
2221 *
2222 * Not only is this an optimisation, but it is also required
2223 * to check that the address is actually valid, when atomic
2224 * usercopies are used, below.
2225 */
2226 if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
2227 status = -EFAULT;
2228 break;
2229 }
2230
2231 status = a_ops->write_begin(file, mapping, pos, bytes, flags,
2232 &page, &fsdata);
2233 if (unlikely(status))
2234 break;
2235
2236 pagefault_disable();
2237 copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
2238 pagefault_enable();
2239 flush_dcache_page(page);
2240
2241 status = a_ops->write_end(file, mapping, pos, bytes, copied,
2242 page, fsdata);
2243 if (unlikely(status < 0))
2244 break;
2245 copied = status;
2246
2247 cond_resched();
2248
2249 if (unlikely(copied == 0)) {
2250 /*
2251 * If we were unable to copy any data at all, we must
2252 * fall back to a single segment length write.
2253 *
2254 * If we didn't fallback here, we could livelock
2255 * because not all segments in the iov can be copied at
2256 * once without a pagefault.
2257 */
2258 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
2259 iov_iter_single_seg_count(i));
2260 goto again;
2261 }
2262 iov_iter_advance(i, copied);
2263 pos += copied;
2264 written += copied;
2265
2266 balance_dirty_pages_ratelimited(mapping);
2267
2268 } while (iov_iter_count(i));
2269
2270 return written ? written : status;
2271}
2272
2273ssize_t
2274generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2275 unsigned long nr_segs, loff_t pos, loff_t *ppos,
2276 size_t count, ssize_t written)
2277{
2278 struct file *file = iocb->ki_filp;
2279 struct address_space *mapping = file->f_mapping;
2280 const struct address_space_operations *a_ops = mapping->a_ops;
2281 struct inode *inode = mapping->host;
2282 ssize_t status;
2283 struct iov_iter i;
2284
2285 iov_iter_init(&i, iov, nr_segs, count, written);
2286 if (a_ops->write_begin)
2287 status = generic_perform_write(file, &i, pos);
2288 else
2289 status = generic_perform_write_2copy(file, &i, pos);
2290
1982 if (likely(status >= 0)) { 2291 if (likely(status >= 0)) {
2292 written += status;
2293 *ppos = pos + status;
2294
2295 /*
2296 * For now, when the user asks for O_SYNC, we'll actually give
2297 * O_DSYNC
2298 */
1983 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2299 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
1984 if (!a_ops->writepage || !is_sync_kiocb(iocb)) 2300 if (!a_ops->writepage || !is_sync_kiocb(iocb))
1985 status = generic_osync_inode(inode, mapping, 2301 status = generic_osync_inode(inode, mapping,
@@ -1995,7 +2311,6 @@ zero_length_segment:
1995 if (unlikely(file->f_flags & O_DIRECT) && written) 2311 if (unlikely(file->f_flags & O_DIRECT) && written)
1996 status = filemap_write_and_wait(mapping); 2312 status = filemap_write_and_wait(mapping);
1997 2313
1998 pagevec_lru_add(&lru_pvec);
1999 return written ? written : status; 2314 return written ? written : status;
2000} 2315}
2001EXPORT_SYMBOL(generic_file_buffered_write); 2316EXPORT_SYMBOL(generic_file_buffered_write);
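
To make the new buffered-write path above easier to follow, here is a minimal sketch of one iteration of generic_perform_write(): fault the source iovec in before any page lock is taken, copy with pagefaults disabled while the pagecache page is locked, and retry with a single-segment length if nothing could be copied. The iov_iter helpers and the write_begin/write_end calls are the ones introduced in this patch; write_one_chunk() itself is illustrative only and not part of the patch.

static ssize_t write_one_chunk(struct file *file, struct iov_iter *i,
			       loff_t pos)
{
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	unsigned long offset = pos & (PAGE_CACHE_SIZE - 1);
	size_t bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
			     iov_iter_count(i));
	struct page *page;
	void *fsdata;
	size_t copied;
	int status;

	/* fault the user buffer in before any page lock is held */
	if (iov_iter_fault_in_readable(i, bytes))
		return -EFAULT;

	status = a_ops->write_begin(file, mapping, pos, bytes, 0,
				    &page, &fsdata);
	if (status)
		return status;

	/* the page is locked now, so no pagefaults may be taken */
	pagefault_disable();
	copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
	pagefault_enable();
	flush_dcache_page(page);

	status = a_ops->write_end(file, mapping, pos, bytes, copied,
				  page, fsdata);
	if (status < 0)
		return status;

	/* status == 0 means the caller should retry with a shorter,
	 * single-segment length (iov_iter_single_seg_count()) to
	 * avoid livelocking on an unfaultable segment */
	iov_iter_advance(i, status);
	return status;
}
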
diff --git a/mm/filemap.h b/mm/filemap.h
deleted file mode 100644
index c2bff04c84ed..000000000000
--- a/mm/filemap.h
+++ /dev/null
@@ -1,103 +0,0 @@
1/*
2 * linux/mm/filemap.h
3 *
4 * Copyright (C) 1994-1999 Linus Torvalds
5 */
6
7#ifndef __FILEMAP_H
8#define __FILEMAP_H
9
10#include <linux/types.h>
11#include <linux/fs.h>
12#include <linux/mm.h>
13#include <linux/highmem.h>
14#include <linux/uio.h>
15#include <linux/uaccess.h>
16
17size_t
18__filemap_copy_from_user_iovec_inatomic(char *vaddr,
19 const struct iovec *iov,
20 size_t base,
21 size_t bytes);
22
23/*
24 * Copy as much as we can into the page and return the number of bytes which
25 * were sucessfully copied. If a fault is encountered then clear the page
26 * out to (offset+bytes) and return the number of bytes which were copied.
27 *
28 * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache
29 * to *NOT* zero any tail of the buffer that it failed to copy. If it does,
30 * and if the following non-atomic copy succeeds, then there is a small window
31 * where the target page contains neither the data before the write, nor the
32 * data after the write (it contains zero). A read at this time will see
33 * data that is inconsistent with any ordering of the read and the write.
34 * (This has been detected in practice).
35 */
36static inline size_t
37filemap_copy_from_user(struct page *page, unsigned long offset,
38 const char __user *buf, unsigned bytes)
39{
40 char *kaddr;
41 int left;
42
43 kaddr = kmap_atomic(page, KM_USER0);
44 left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
45 kunmap_atomic(kaddr, KM_USER0);
46
47 if (left != 0) {
48 /* Do it the slow way */
49 kaddr = kmap(page);
50 left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
51 kunmap(page);
52 }
53 return bytes - left;
54}
55
56/*
57 * This has the same sideeffects and return value as filemap_copy_from_user().
58 * The difference is that on a fault we need to memset the remainder of the
59 * page (out to offset+bytes), to emulate filemap_copy_from_user()'s
60 * single-segment behaviour.
61 */
62static inline size_t
63filemap_copy_from_user_iovec(struct page *page, unsigned long offset,
64 const struct iovec *iov, size_t base, size_t bytes)
65{
66 char *kaddr;
67 size_t copied;
68
69 kaddr = kmap_atomic(page, KM_USER0);
70 copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
71 base, bytes);
72 kunmap_atomic(kaddr, KM_USER0);
73 if (copied != bytes) {
74 kaddr = kmap(page);
75 copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov,
76 base, bytes);
77 if (bytes - copied)
78 memset(kaddr + offset + copied, 0, bytes - copied);
79 kunmap(page);
80 }
81 return copied;
82}
83
84static inline void
85filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
86{
87 const struct iovec *iov = *iovp;
88 size_t base = *basep;
89
90 do {
91 int copy = min(bytes, iov->iov_len - base);
92
93 bytes -= copy;
94 base += copy;
95 if (iov->iov_len == base) {
96 iov++;
97 base = 0;
98 }
99 } while (bytes);
100 *iovp = iov;
101 *basep = base;
102}
103#endif
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 53ee6a299635..32132f3cd641 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -15,7 +15,6 @@
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <asm/tlbflush.h> 17#include <asm/tlbflush.h>
18#include "filemap.h"
19 18
20/* 19/*
21 * We do use our own empty page to avoid interference with other users 20 * We do use our own empty page to avoid interference with other users
@@ -288,6 +287,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
288 unsigned long index; 287 unsigned long index;
289 unsigned long offset; 288 unsigned long offset;
290 size_t copied; 289 size_t copied;
290 char *kaddr;
291 291
292 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ 292 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
293 index = pos >> PAGE_CACHE_SHIFT; 293 index = pos >> PAGE_CACHE_SHIFT;
@@ -295,14 +295,6 @@ __xip_file_write(struct file *filp, const char __user *buf,
295 if (bytes > count) 295 if (bytes > count)
296 bytes = count; 296 bytes = count;
297 297
298 /*
299 * Bring in the user page that we will copy from _first_.
300 * Otherwise there's a nasty deadlock on copying from the
301 * same page as we're writing to, without it being marked
302 * up-to-date.
303 */
304 fault_in_pages_readable(buf, bytes);
305
306 page = a_ops->get_xip_page(mapping, 298 page = a_ops->get_xip_page(mapping,
307 index*(PAGE_SIZE/512), 0); 299 index*(PAGE_SIZE/512), 0);
308 if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) { 300 if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) {
@@ -319,8 +311,13 @@ __xip_file_write(struct file *filp, const char __user *buf,
319 break; 311 break;
320 } 312 }
321 313
322 copied = filemap_copy_from_user(page, offset, buf, bytes); 314 fault_in_pages_readable(buf, bytes);
315 kaddr = kmap_atomic(page, KM_USER0);
316 copied = bytes -
317 __copy_from_user_inatomic_nocache(kaddr, buf, bytes);
318 kunmap_atomic(kaddr, KM_USER0);
323 flush_dcache_page(page); 319 flush_dcache_page(page);
320
324 if (likely(copied > 0)) { 321 if (likely(copied > 0)) {
325 status = copied; 322 status = copied;
326 323
diff --git a/mm/fremap.c b/mm/fremap.c
index 95bcb5641c72..14bd3bf7826e 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * started by Ingo Molnar, Copyright (C) 2002, 2003 6 * started by Ingo Molnar, Copyright (C) 2002, 2003
7 */ 7 */
8 8#include <linux/backing-dev.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/swap.h> 10#include <linux/swap.h>
11#include <linux/file.h> 11#include <linux/file.h>
@@ -97,26 +97,28 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
97 97
98} 98}
99 99
100/*** 100/**
101 * sys_remap_file_pages - remap arbitrary pages of a shared backing store 101 * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
102 * file within an existing vma.
103 * @start: start of the remapped virtual memory range 102 * @start: start of the remapped virtual memory range
104 * @size: size of the remapped virtual memory range 103 * @size: size of the remapped virtual memory range
105 * @prot: new protection bits of the range 104 * @prot: new protection bits of the range (see NOTE)
106 * @pgoff: to be mapped page of the backing store file 105 * @pgoff: to-be-mapped page of the backing store file
107 * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO. 106 * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
108 * 107 *
109 * this syscall works purely via pagetables, so it's the most efficient 108 * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma
109 * (shared backing store file).
110 *
111 * This syscall works purely via pagetables, so it's the most efficient
110 * way to map the same (large) file into a given virtual window. Unlike 112 * way to map the same (large) file into a given virtual window. Unlike
111 * mmap()/mremap() it does not create any new vmas. The new mappings are 113 * mmap()/mremap() it does not create any new vmas. The new mappings are
112 * also safe across swapout. 114 * also safe across swapout.
113 * 115 *
114 * NOTE: the 'prot' parameter right now is ignored, and the vma's default 116 * NOTE: the 'prot' parameter right now is ignored (but must be zero),
115 * protection is used. Arbitrary protections might be implemented in the 117 * and the vma's default protection is used. Arbitrary protections
116 * future. 118 * might be implemented in the future.
117 */ 119 */
118asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, 120asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
119 unsigned long __prot, unsigned long pgoff, unsigned long flags) 121 unsigned long prot, unsigned long pgoff, unsigned long flags)
120{ 122{
121 struct mm_struct *mm = current->mm; 123 struct mm_struct *mm = current->mm;
122 struct address_space *mapping; 124 struct address_space *mapping;
@@ -125,7 +127,7 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
125 int err = -EINVAL; 127 int err = -EINVAL;
126 int has_write_lock = 0; 128 int has_write_lock = 0;
127 129
128 if (__prot) 130 if (prot)
129 return err; 131 return err;
130 /* 132 /*
131 * Sanitize the syscall parameters: 133 * Sanitize the syscall parameters:
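
The kerneldoc rewrite above spells out the user-visible contract of sys_remap_file_pages(): it only rearranges pages inside an existing shared file mapping, and the prot argument must be zero. A hypothetical userspace illustration of that contract follows; the file name and sizes are invented, and the file is assumed to be large enough to contain the target page.

#define _GNU_SOURCE
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	long pg = sysconf(_SC_PAGESIZE);
	int fd = open("/tmp/datafile", O_RDWR);	/* hypothetical file */
	char *win;

	if (fd < 0)
		return 1;

	/* the window must already be a shared file mapping */
	win = mmap(NULL, 4 * pg, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (win == MAP_FAILED)
		return 1;

	/* rewire the second page of the window to file page 10;
	 * prot must be 0, the mapping's own protection is used */
	if (remap_file_pages(win + pg, pg, 0, 10, 0) != 0)
		return 1;

	return 0;
}
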
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eab8c428cc93..ae2959bb59cb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -23,12 +23,16 @@
23 23
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; 25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
26static unsigned long surplus_huge_pages;
26unsigned long max_huge_pages; 27unsigned long max_huge_pages;
27static struct list_head hugepage_freelists[MAX_NUMNODES]; 28static struct list_head hugepage_freelists[MAX_NUMNODES];
28static unsigned int nr_huge_pages_node[MAX_NUMNODES]; 29static unsigned int nr_huge_pages_node[MAX_NUMNODES];
29static unsigned int free_huge_pages_node[MAX_NUMNODES]; 30static unsigned int free_huge_pages_node[MAX_NUMNODES];
31static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
30static gfp_t htlb_alloc_mask = GFP_HIGHUSER; 32static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
31unsigned long hugepages_treat_as_movable; 33unsigned long hugepages_treat_as_movable;
34int hugetlb_dynamic_pool;
35static int hugetlb_next_nid;
32 36
33/* 37/*
34 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 38 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
@@ -85,6 +89,8 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
85 list_del(&page->lru); 89 list_del(&page->lru);
86 free_huge_pages--; 90 free_huge_pages--;
87 free_huge_pages_node[nid]--; 91 free_huge_pages_node[nid]--;
92 if (vma && vma->vm_flags & VM_MAYSHARE)
93 resv_huge_pages--;
88 break; 94 break;
89 } 95 }
90 } 96 }
@@ -92,58 +98,269 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
92 return page; 98 return page;
93} 99}
94 100
101static void update_and_free_page(struct page *page)
102{
103 int i;
104 nr_huge_pages--;
105 nr_huge_pages_node[page_to_nid(page)]--;
106 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
107 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
108 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
109 1 << PG_private | 1<< PG_writeback);
110 }
111 set_compound_page_dtor(page, NULL);
112 set_page_refcounted(page);
113 __free_pages(page, HUGETLB_PAGE_ORDER);
114}
115
95static void free_huge_page(struct page *page) 116static void free_huge_page(struct page *page)
96{ 117{
97 BUG_ON(page_count(page)); 118 int nid = page_to_nid(page);
98 119
120 BUG_ON(page_count(page));
99 INIT_LIST_HEAD(&page->lru); 121 INIT_LIST_HEAD(&page->lru);
100 122
101 spin_lock(&hugetlb_lock); 123 spin_lock(&hugetlb_lock);
102 enqueue_huge_page(page); 124 if (surplus_huge_pages_node[nid]) {
125 update_and_free_page(page);
126 surplus_huge_pages--;
127 surplus_huge_pages_node[nid]--;
128 } else {
129 enqueue_huge_page(page);
130 }
103 spin_unlock(&hugetlb_lock); 131 spin_unlock(&hugetlb_lock);
104} 132}
105 133
106static int alloc_fresh_huge_page(void) 134/*
135 * Increment or decrement surplus_huge_pages. Keep node-specific counters
136 * balanced by operating on them in a round-robin fashion.
137 * Returns 1 if an adjustment was made.
138 */
139static int adjust_pool_surplus(int delta)
107{ 140{
108 static int prev_nid; 141 static int prev_nid;
109 struct page *page; 142 int nid = prev_nid;
110 int nid; 143 int ret = 0;
144
145 VM_BUG_ON(delta != -1 && delta != 1);
146 do {
147 nid = next_node(nid, node_online_map);
148 if (nid == MAX_NUMNODES)
149 nid = first_node(node_online_map);
150
151 /* To shrink on this node, there must be a surplus page */
152 if (delta < 0 && !surplus_huge_pages_node[nid])
153 continue;
154 /* Surplus cannot exceed the total number of pages */
155 if (delta > 0 && surplus_huge_pages_node[nid] >=
156 nr_huge_pages_node[nid])
157 continue;
158
159 surplus_huge_pages += delta;
160 surplus_huge_pages_node[nid] += delta;
161 ret = 1;
162 break;
163 } while (nid != prev_nid);
111 164
112 /*
113 * Copy static prev_nid to local nid, work on that, then copy it
114 * back to prev_nid afterwards: otherwise there's a window in which
115 * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node.
116 * But we don't need to use a spin_lock here: it really doesn't
117 * matter if occasionally a racer chooses the same nid as we do.
118 */
119 nid = next_node(prev_nid, node_online_map);
120 if (nid == MAX_NUMNODES)
121 nid = first_node(node_online_map);
122 prev_nid = nid; 165 prev_nid = nid;
166 return ret;
167}
168
169static struct page *alloc_fresh_huge_page_node(int nid)
170{
171 struct page *page;
123 172
124 page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, 173 page = alloc_pages_node(nid,
174 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
175 HUGETLB_PAGE_ORDER);
176 if (page) {
177 set_compound_page_dtor(page, free_huge_page);
178 spin_lock(&hugetlb_lock);
179 nr_huge_pages++;
180 nr_huge_pages_node[nid]++;
181 spin_unlock(&hugetlb_lock);
182 put_page(page); /* free it into the hugepage allocator */
183 }
184
185 return page;
186}
187
188static int alloc_fresh_huge_page(void)
189{
190 struct page *page;
191 int start_nid;
192 int next_nid;
193 int ret = 0;
194
195 start_nid = hugetlb_next_nid;
196
197 do {
198 page = alloc_fresh_huge_page_node(hugetlb_next_nid);
199 if (page)
200 ret = 1;
201 /*
202 * Use a helper variable to find the next node and then
203 * copy it back to hugetlb_next_nid afterwards:
204 * otherwise there's a window in which a racer might
205 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
206 * But we don't need to use a spin_lock here: it really
207 * doesn't matter if occasionally a racer chooses the
208 * same nid as we do. Move nid forward in the mask even
209 * if we just successfully allocated a hugepage so that
210 * the next caller gets hugepages on the next node.
211 */
212 next_nid = next_node(hugetlb_next_nid, node_online_map);
213 if (next_nid == MAX_NUMNODES)
214 next_nid = first_node(node_online_map);
215 hugetlb_next_nid = next_nid;
216 } while (!page && hugetlb_next_nid != start_nid);
217
218 return ret;
219}
220
221static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
222 unsigned long address)
223{
224 struct page *page;
225
226 /* Check if the dynamic pool is enabled */
227 if (!hugetlb_dynamic_pool)
228 return NULL;
229
230 page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
125 HUGETLB_PAGE_ORDER); 231 HUGETLB_PAGE_ORDER);
126 if (page) { 232 if (page) {
127 set_compound_page_dtor(page, free_huge_page); 233 set_compound_page_dtor(page, free_huge_page);
128 spin_lock(&hugetlb_lock); 234 spin_lock(&hugetlb_lock);
129 nr_huge_pages++; 235 nr_huge_pages++;
130 nr_huge_pages_node[page_to_nid(page)]++; 236 nr_huge_pages_node[page_to_nid(page)]++;
237 surplus_huge_pages++;
238 surplus_huge_pages_node[page_to_nid(page)]++;
131 spin_unlock(&hugetlb_lock); 239 spin_unlock(&hugetlb_lock);
132 put_page(page); /* free it into the hugepage allocator */
133 return 1;
134 } 240 }
135 return 0; 241
242 return page;
243}
244
245/*
 246 * Increase the hugetlb pool such that it can accommodate a reservation
247 * of size 'delta'.
248 */
249static int gather_surplus_pages(int delta)
250{
251 struct list_head surplus_list;
252 struct page *page, *tmp;
253 int ret, i;
254 int needed, allocated;
255
256 needed = (resv_huge_pages + delta) - free_huge_pages;
257 if (needed <= 0)
258 return 0;
259
260 allocated = 0;
261 INIT_LIST_HEAD(&surplus_list);
262
263 ret = -ENOMEM;
264retry:
265 spin_unlock(&hugetlb_lock);
266 for (i = 0; i < needed; i++) {
267 page = alloc_buddy_huge_page(NULL, 0);
268 if (!page) {
269 /*
270 * We were not able to allocate enough pages to
271 * satisfy the entire reservation so we free what
272 * we've allocated so far.
273 */
274 spin_lock(&hugetlb_lock);
275 needed = 0;
276 goto free;
277 }
278
279 list_add(&page->lru, &surplus_list);
280 }
281 allocated += needed;
282
283 /*
284 * After retaking hugetlb_lock, we need to recalculate 'needed'
285 * because either resv_huge_pages or free_huge_pages may have changed.
286 */
287 spin_lock(&hugetlb_lock);
288 needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
289 if (needed > 0)
290 goto retry;
291
292 /*
293 * The surplus_list now contains _at_least_ the number of extra pages
 294 * needed to accommodate the reservation. Add the appropriate number
295 * of pages to the hugetlb pool and free the extras back to the buddy
296 * allocator.
297 */
298 needed += allocated;
299 ret = 0;
300free:
301 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
302 list_del(&page->lru);
303 if ((--needed) >= 0)
304 enqueue_huge_page(page);
305 else {
306 /*
307 * Decrement the refcount and free the page using its
308 * destructor. This must be done with hugetlb_lock
309 * unlocked which is safe because free_huge_page takes
310 * hugetlb_lock before deciding how to free the page.
311 */
312 spin_unlock(&hugetlb_lock);
313 put_page(page);
314 spin_lock(&hugetlb_lock);
315 }
316 }
317
318 return ret;
319}
320
321/*
322 * When releasing a hugetlb pool reservation, any surplus pages that were
323 * allocated to satisfy the reservation must be explicitly freed if they were
324 * never used.
325 */
326void return_unused_surplus_pages(unsigned long unused_resv_pages)
327{
328 static int nid = -1;
329 struct page *page;
330 unsigned long nr_pages;
331
332 nr_pages = min(unused_resv_pages, surplus_huge_pages);
333
334 while (nr_pages) {
335 nid = next_node(nid, node_online_map);
336 if (nid == MAX_NUMNODES)
337 nid = first_node(node_online_map);
338
339 if (!surplus_huge_pages_node[nid])
340 continue;
341
342 if (!list_empty(&hugepage_freelists[nid])) {
343 page = list_entry(hugepage_freelists[nid].next,
344 struct page, lru);
345 list_del(&page->lru);
346 update_and_free_page(page);
347 free_huge_pages--;
348 free_huge_pages_node[nid]--;
349 surplus_huge_pages--;
350 surplus_huge_pages_node[nid]--;
351 nr_pages--;
352 }
353 }
136} 354}
137 355
138static struct page *alloc_huge_page(struct vm_area_struct *vma, 356static struct page *alloc_huge_page(struct vm_area_struct *vma,
139 unsigned long addr) 357 unsigned long addr)
140{ 358{
141 struct page *page; 359 struct page *page = NULL;
360 int use_reserved_page = vma->vm_flags & VM_MAYSHARE;
142 361
143 spin_lock(&hugetlb_lock); 362 spin_lock(&hugetlb_lock);
144 if (vma->vm_flags & VM_MAYSHARE) 363 if (!use_reserved_page && (free_huge_pages <= resv_huge_pages))
145 resv_huge_pages--;
146 else if (free_huge_pages <= resv_huge_pages)
147 goto fail; 364 goto fail;
148 365
149 page = dequeue_huge_page(vma, addr); 366 page = dequeue_huge_page(vma, addr);
@@ -155,10 +372,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
155 return page; 372 return page;
156 373
157fail: 374fail:
158 if (vma->vm_flags & VM_MAYSHARE)
159 resv_huge_pages++;
160 spin_unlock(&hugetlb_lock); 375 spin_unlock(&hugetlb_lock);
161 return NULL; 376
377 /*
378 * Private mappings do not use reserved huge pages so the allocation
379 * may have failed due to an undersized hugetlb pool. Try to grab a
380 * surplus huge page from the buddy allocator.
381 */
382 if (!use_reserved_page)
383 page = alloc_buddy_huge_page(vma, addr);
384
385 return page;
162} 386}
163 387
164static int __init hugetlb_init(void) 388static int __init hugetlb_init(void)
@@ -171,6 +395,8 @@ static int __init hugetlb_init(void)
171 for (i = 0; i < MAX_NUMNODES; ++i) 395 for (i = 0; i < MAX_NUMNODES; ++i)
172 INIT_LIST_HEAD(&hugepage_freelists[i]); 396 INIT_LIST_HEAD(&hugepage_freelists[i]);
173 397
398 hugetlb_next_nid = first_node(node_online_map);
399
174 for (i = 0; i < max_huge_pages; ++i) { 400 for (i = 0; i < max_huge_pages; ++i) {
175 if (!alloc_fresh_huge_page()) 401 if (!alloc_fresh_huge_page())
176 break; 402 break;
@@ -201,21 +427,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
201} 427}
202 428
203#ifdef CONFIG_SYSCTL 429#ifdef CONFIG_SYSCTL
204static void update_and_free_page(struct page *page)
205{
206 int i;
207 nr_huge_pages--;
208 nr_huge_pages_node[page_to_nid(page)]--;
209 for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
210 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
211 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
212 1 << PG_private | 1<< PG_writeback);
213 }
214 set_compound_page_dtor(page, NULL);
215 set_page_refcounted(page);
216 __free_pages(page, HUGETLB_PAGE_ORDER);
217}
218
219#ifdef CONFIG_HIGHMEM 430#ifdef CONFIG_HIGHMEM
220static void try_to_free_low(unsigned long count) 431static void try_to_free_low(unsigned long count)
221{ 432{
@@ -224,14 +435,14 @@ static void try_to_free_low(unsigned long count)
224 for (i = 0; i < MAX_NUMNODES; ++i) { 435 for (i = 0; i < MAX_NUMNODES; ++i) {
225 struct page *page, *next; 436 struct page *page, *next;
226 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { 437 list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
438 if (count >= nr_huge_pages)
439 return;
227 if (PageHighMem(page)) 440 if (PageHighMem(page))
228 continue; 441 continue;
229 list_del(&page->lru); 442 list_del(&page->lru);
230 update_and_free_page(page); 443 update_and_free_page(page);
231 free_huge_pages--; 444 free_huge_pages--;
232 free_huge_pages_node[page_to_nid(page)]--; 445 free_huge_pages_node[page_to_nid(page)]--;
233 if (count >= nr_huge_pages)
234 return;
235 } 446 }
236 } 447 }
237} 448}
@@ -241,26 +452,61 @@ static inline void try_to_free_low(unsigned long count)
241} 452}
242#endif 453#endif
243 454
455#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
244static unsigned long set_max_huge_pages(unsigned long count) 456static unsigned long set_max_huge_pages(unsigned long count)
245{ 457{
246 while (count > nr_huge_pages) { 458 unsigned long min_count, ret;
247 if (!alloc_fresh_huge_page())
248 return nr_huge_pages;
249 }
250 if (count >= nr_huge_pages)
251 return nr_huge_pages;
252 459
460 /*
461 * Increase the pool size
462 * First take pages out of surplus state. Then make up the
463 * remaining difference by allocating fresh huge pages.
464 */
253 spin_lock(&hugetlb_lock); 465 spin_lock(&hugetlb_lock);
254 count = max(count, resv_huge_pages); 466 while (surplus_huge_pages && count > persistent_huge_pages) {
255 try_to_free_low(count); 467 if (!adjust_pool_surplus(-1))
256 while (count < nr_huge_pages) { 468 break;
469 }
470
471 while (count > persistent_huge_pages) {
472 int ret;
473 /*
474 * If this allocation races such that we no longer need the
475 * page, free_huge_page will handle it by freeing the page
476 * and reducing the surplus.
477 */
478 spin_unlock(&hugetlb_lock);
479 ret = alloc_fresh_huge_page();
480 spin_lock(&hugetlb_lock);
481 if (!ret)
482 goto out;
483
484 }
485
486 /*
487 * Decrease the pool size
488 * First return free pages to the buddy allocator (being careful
489 * to keep enough around to satisfy reservations). Then place
490 * pages into surplus state as needed so the pool will shrink
491 * to the desired size as pages become free.
492 */
493 min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
494 min_count = max(count, min_count);
495 try_to_free_low(min_count);
496 while (min_count < persistent_huge_pages) {
257 struct page *page = dequeue_huge_page(NULL, 0); 497 struct page *page = dequeue_huge_page(NULL, 0);
258 if (!page) 498 if (!page)
259 break; 499 break;
260 update_and_free_page(page); 500 update_and_free_page(page);
261 } 501 }
502 while (count < persistent_huge_pages) {
503 if (!adjust_pool_surplus(1))
504 break;
505 }
506out:
507 ret = persistent_huge_pages;
262 spin_unlock(&hugetlb_lock); 508 spin_unlock(&hugetlb_lock);
263 return nr_huge_pages; 509 return ret;
264} 510}
265 511
266int hugetlb_sysctl_handler(struct ctl_table *table, int write, 512int hugetlb_sysctl_handler(struct ctl_table *table, int write,
@@ -292,10 +538,12 @@ int hugetlb_report_meminfo(char *buf)
292 "HugePages_Total: %5lu\n" 538 "HugePages_Total: %5lu\n"
293 "HugePages_Free: %5lu\n" 539 "HugePages_Free: %5lu\n"
294 "HugePages_Rsvd: %5lu\n" 540 "HugePages_Rsvd: %5lu\n"
541 "HugePages_Surp: %5lu\n"
295 "Hugepagesize: %5lu kB\n", 542 "Hugepagesize: %5lu kB\n",
296 nr_huge_pages, 543 nr_huge_pages,
297 free_huge_pages, 544 free_huge_pages,
298 resv_huge_pages, 545 resv_huge_pages,
546 surplus_huge_pages,
299 HPAGE_SIZE/1024); 547 HPAGE_SIZE/1024);
300} 548}
301 549
@@ -355,7 +603,6 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
355 entry = pte_mkwrite(pte_mkdirty(*ptep)); 603 entry = pte_mkwrite(pte_mkdirty(*ptep));
356 if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { 604 if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
357 update_mmu_cache(vma, address, entry); 605 update_mmu_cache(vma, address, entry);
358 lazy_mmu_prot_update(entry);
359 } 606 }
360} 607}
361 608
@@ -708,7 +955,6 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
708 pte = huge_ptep_get_and_clear(mm, address, ptep); 955 pte = huge_ptep_get_and_clear(mm, address, ptep);
709 pte = pte_mkhuge(pte_modify(pte, newprot)); 956 pte = pte_mkhuge(pte_modify(pte, newprot));
710 set_huge_pte_at(mm, address, ptep, pte); 957 set_huge_pte_at(mm, address, ptep, pte);
711 lazy_mmu_prot_update(pte);
712 } 958 }
713 } 959 }
714 spin_unlock(&mm->page_table_lock); 960 spin_unlock(&mm->page_table_lock);
@@ -843,21 +1089,6 @@ static int hugetlb_acct_memory(long delta)
843 int ret = -ENOMEM; 1089 int ret = -ENOMEM;
844 1090
845 spin_lock(&hugetlb_lock); 1091 spin_lock(&hugetlb_lock);
846 if ((delta + resv_huge_pages) <= free_huge_pages) {
847 resv_huge_pages += delta;
848 ret = 0;
849 }
850 spin_unlock(&hugetlb_lock);
851 return ret;
852}
853
854int hugetlb_reserve_pages(struct inode *inode, long from, long to)
855{
856 long ret, chg;
857
858 chg = region_chg(&inode->i_mapping->private_list, from, to);
859 if (chg < 0)
860 return chg;
861 /* 1092 /*
862 * When cpuset is configured, it breaks the strict hugetlb page 1093 * When cpuset is configured, it breaks the strict hugetlb page
863 * reservation as the accounting is done on a global variable. Such 1094 * reservation as the accounting is done on a global variable. Such
@@ -875,8 +1106,31 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to)
875 * a best attempt and hopefully to minimize the impact of changing 1106 * a best attempt and hopefully to minimize the impact of changing
876 * semantics that cpuset has. 1107 * semantics that cpuset has.
877 */ 1108 */
878 if (chg > cpuset_mems_nr(free_huge_pages_node)) 1109 if (delta > 0) {
879 return -ENOMEM; 1110 if (gather_surplus_pages(delta) < 0)
1111 goto out;
1112
1113 if (delta > cpuset_mems_nr(free_huge_pages_node))
1114 goto out;
1115 }
1116
1117 ret = 0;
1118 resv_huge_pages += delta;
1119 if (delta < 0)
1120 return_unused_surplus_pages((unsigned long) -delta);
1121
1122out:
1123 spin_unlock(&hugetlb_lock);
1124 return ret;
1125}
1126
1127int hugetlb_reserve_pages(struct inode *inode, long from, long to)
1128{
1129 long ret, chg;
1130
1131 chg = region_chg(&inode->i_mapping->private_list, from, to);
1132 if (chg < 0)
1133 return chg;
880 1134
881 ret = hugetlb_acct_memory(chg); 1135 ret = hugetlb_acct_memory(chg);
882 if (ret < 0) 1136 if (ret < 0)
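As a side note on the set_max_huge_pages() rework above, the resize decisions reduce to two derived quantities: persistent_huge_pages (total minus surplus) and the floor below which the pool must not shrink while reservations are outstanding. A small standalone sketch of just that arithmetic, with invented counter values (the names mirror the kernel counters, but this is an illustration, not the kernel code):

#include <stdio.h>

/* Counters as in mm/hugetlb.c; the values are invented for the example. */
static unsigned long nr_huge_pages = 20;
static unsigned long surplus_huge_pages = 4;
static unsigned long free_huge_pages = 6;
static unsigned long resv_huge_pages = 3;

#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)

int main(void)
{
	unsigned long count = 10;	/* requested persistent pool size */

	/*
	 * When shrinking, keep enough pages to cover everything in use
	 * (nr - free) plus the outstanding reservations.
	 */
	unsigned long min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;

	if (min_count < count)
		min_count = count;

	printf("persistent pages: %lu, shrink floor: %lu\n",
	       (unsigned long)persistent_huge_pages, min_count);
	return 0;
}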
diff --git a/mm/internal.h b/mm/internal.h
index a3110c02aea7..953f941ea867 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -37,4 +37,14 @@ static inline void __put_page(struct page *page)
37extern void fastcall __init __free_pages_bootmem(struct page *page, 37extern void fastcall __init __free_pages_bootmem(struct page *page,
38 unsigned int order); 38 unsigned int order);
39 39
40/*
41 * function for dealing with page's order in buddy system.
42 * zone->lock is already acquired when we use these.
43 * So, we don't need atomic page->flags operations here.
44 */
45static inline unsigned long page_order(struct page *page)
46{
47 VM_BUG_ON(!PageBuddy(page));
48 return page_private(page);
49}
40#endif 50#endif
diff --git a/mm/memory.c b/mm/memory.c
index f82b359b2745..bd16dcaeefb8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -966,7 +966,7 @@ no_page_table:
966 * has touched so far, we don't want to allocate page tables. 966 * has touched so far, we don't want to allocate page tables.
967 */ 967 */
968 if (flags & FOLL_ANON) { 968 if (flags & FOLL_ANON) {
969 page = ZERO_PAGE(address); 969 page = ZERO_PAGE(0);
970 if (flags & FOLL_GET) 970 if (flags & FOLL_GET)
971 get_page(page); 971 get_page(page);
972 BUG_ON(flags & FOLL_WRITE); 972 BUG_ON(flags & FOLL_WRITE);
@@ -1111,95 +1111,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1111} 1111}
1112EXPORT_SYMBOL(get_user_pages); 1112EXPORT_SYMBOL(get_user_pages);
1113 1113
1114static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1115 unsigned long addr, unsigned long end, pgprot_t prot)
1116{
1117 pte_t *pte;
1118 spinlock_t *ptl;
1119 int err = 0;
1120
1121 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
1122 if (!pte)
1123 return -EAGAIN;
1124 arch_enter_lazy_mmu_mode();
1125 do {
1126 struct page *page = ZERO_PAGE(addr);
1127 pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
1128
1129 if (unlikely(!pte_none(*pte))) {
1130 err = -EEXIST;
1131 pte++;
1132 break;
1133 }
1134 page_cache_get(page);
1135 page_add_file_rmap(page);
1136 inc_mm_counter(mm, file_rss);
1137 set_pte_at(mm, addr, pte, zero_pte);
1138 } while (pte++, addr += PAGE_SIZE, addr != end);
1139 arch_leave_lazy_mmu_mode();
1140 pte_unmap_unlock(pte - 1, ptl);
1141 return err;
1142}
1143
1144static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud,
1145 unsigned long addr, unsigned long end, pgprot_t prot)
1146{
1147 pmd_t *pmd;
1148 unsigned long next;
1149 int err;
1150
1151 pmd = pmd_alloc(mm, pud, addr);
1152 if (!pmd)
1153 return -EAGAIN;
1154 do {
1155 next = pmd_addr_end(addr, end);
1156 err = zeromap_pte_range(mm, pmd, addr, next, prot);
1157 if (err)
1158 break;
1159 } while (pmd++, addr = next, addr != end);
1160 return err;
1161}
1162
1163static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd,
1164 unsigned long addr, unsigned long end, pgprot_t prot)
1165{
1166 pud_t *pud;
1167 unsigned long next;
1168 int err;
1169
1170 pud = pud_alloc(mm, pgd, addr);
1171 if (!pud)
1172 return -EAGAIN;
1173 do {
1174 next = pud_addr_end(addr, end);
1175 err = zeromap_pmd_range(mm, pud, addr, next, prot);
1176 if (err)
1177 break;
1178 } while (pud++, addr = next, addr != end);
1179 return err;
1180}
1181
1182int zeromap_page_range(struct vm_area_struct *vma,
1183 unsigned long addr, unsigned long size, pgprot_t prot)
1184{
1185 pgd_t *pgd;
1186 unsigned long next;
1187 unsigned long end = addr + size;
1188 struct mm_struct *mm = vma->vm_mm;
1189 int err;
1190
1191 BUG_ON(addr >= end);
1192 pgd = pgd_offset(mm, addr);
1193 flush_cache_range(vma, addr, end);
1194 do {
1195 next = pgd_addr_end(addr, end);
1196 err = zeromap_pud_range(mm, pgd, addr, next, prot);
1197 if (err)
1198 break;
1199 } while (pgd++, addr = next, addr != end);
1200 return err;
1201}
1202
1203pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) 1114pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
1204{ 1115{
1205 pgd_t * pgd = pgd_offset(mm, addr); 1116 pgd_t * pgd = pgd_offset(mm, addr);
@@ -1700,10 +1611,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1700 flush_cache_page(vma, address, pte_pfn(orig_pte)); 1611 flush_cache_page(vma, address, pte_pfn(orig_pte));
1701 entry = pte_mkyoung(orig_pte); 1612 entry = pte_mkyoung(orig_pte);
1702 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1613 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1703 if (ptep_set_access_flags(vma, address, page_table, entry,1)) { 1614 if (ptep_set_access_flags(vma, address, page_table, entry,1))
1704 update_mmu_cache(vma, address, entry); 1615 update_mmu_cache(vma, address, entry);
1705 lazy_mmu_prot_update(entry);
1706 }
1707 ret |= VM_FAULT_WRITE; 1616 ret |= VM_FAULT_WRITE;
1708 goto unlock; 1617 goto unlock;
1709 } 1618 }
@@ -1717,16 +1626,11 @@ gotten:
1717 1626
1718 if (unlikely(anon_vma_prepare(vma))) 1627 if (unlikely(anon_vma_prepare(vma)))
1719 goto oom; 1628 goto oom;
1720 if (old_page == ZERO_PAGE(address)) { 1629 VM_BUG_ON(old_page == ZERO_PAGE(0));
1721 new_page = alloc_zeroed_user_highpage_movable(vma, address); 1630 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1722 if (!new_page) 1631 if (!new_page)
1723 goto oom; 1632 goto oom;
1724 } else { 1633 cow_user_page(new_page, old_page, address, vma);
1725 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1726 if (!new_page)
1727 goto oom;
1728 cow_user_page(new_page, old_page, address, vma);
1729 }
1730 1634
1731 /* 1635 /*
1732 * Re-check the pte - we dropped the lock 1636 * Re-check the pte - we dropped the lock
@@ -1744,7 +1648,6 @@ gotten:
1744 flush_cache_page(vma, address, pte_pfn(orig_pte)); 1648 flush_cache_page(vma, address, pte_pfn(orig_pte));
1745 entry = mk_pte(new_page, vma->vm_page_prot); 1649 entry = mk_pte(new_page, vma->vm_page_prot);
1746 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1650 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1747 lazy_mmu_prot_update(entry);
1748 /* 1651 /*
1749 * Clear the pte entry and flush it first, before updating the 1652 * Clear the pte entry and flush it first, before updating the
1750 * pte with the new entry. This will avoid a race condition 1653 * pte with the new entry. This will avoid a race condition
@@ -2252,44 +2155,28 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2252 spinlock_t *ptl; 2155 spinlock_t *ptl;
2253 pte_t entry; 2156 pte_t entry;
2254 2157
2255 if (write_access) { 2158 /* Allocate our own private page. */
2256 /* Allocate our own private page. */ 2159 pte_unmap(page_table);
2257 pte_unmap(page_table);
2258
2259 if (unlikely(anon_vma_prepare(vma)))
2260 goto oom;
2261 page = alloc_zeroed_user_highpage_movable(vma, address);
2262 if (!page)
2263 goto oom;
2264
2265 entry = mk_pte(page, vma->vm_page_prot);
2266 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2267 2160
2268 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2161 if (unlikely(anon_vma_prepare(vma)))
2269 if (!pte_none(*page_table)) 2162 goto oom;
2270 goto release; 2163 page = alloc_zeroed_user_highpage_movable(vma, address);
2271 inc_mm_counter(mm, anon_rss); 2164 if (!page)
2272 lru_cache_add_active(page); 2165 goto oom;
2273 page_add_new_anon_rmap(page, vma, address);
2274 } else {
2275 /* Map the ZERO_PAGE - vm_page_prot is readonly */
2276 page = ZERO_PAGE(address);
2277 page_cache_get(page);
2278 entry = mk_pte(page, vma->vm_page_prot);
2279 2166
2280 ptl = pte_lockptr(mm, pmd); 2167 entry = mk_pte(page, vma->vm_page_prot);
2281 spin_lock(ptl); 2168 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2282 if (!pte_none(*page_table))
2283 goto release;
2284 inc_mm_counter(mm, file_rss);
2285 page_add_file_rmap(page);
2286 }
2287 2169
2170 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2171 if (!pte_none(*page_table))
2172 goto release;
2173 inc_mm_counter(mm, anon_rss);
2174 lru_cache_add_active(page);
2175 page_add_new_anon_rmap(page, vma, address);
2288 set_pte_at(mm, address, page_table, entry); 2176 set_pte_at(mm, address, page_table, entry);
2289 2177
2290 /* No need to invalidate - it was non-present before */ 2178 /* No need to invalidate - it was non-present before */
2291 update_mmu_cache(vma, address, entry); 2179 update_mmu_cache(vma, address, entry);
2292 lazy_mmu_prot_update(entry);
2293unlock: 2180unlock:
2294 pte_unmap_unlock(page_table, ptl); 2181 pte_unmap_unlock(page_table, ptl);
2295 return 0; 2182 return 0;
@@ -2442,7 +2329,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2442 2329
2443 /* no need to invalidate: a not-present page won't be cached */ 2330 /* no need to invalidate: a not-present page won't be cached */
2444 update_mmu_cache(vma, address, entry); 2331 update_mmu_cache(vma, address, entry);
2445 lazy_mmu_prot_update(entry);
2446 } else { 2332 } else {
2447 if (anon) 2333 if (anon)
2448 page_cache_release(page); 2334 page_cache_release(page);
@@ -2470,7 +2356,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2470 int write_access, pte_t orig_pte) 2356 int write_access, pte_t orig_pte)
2471{ 2357{
2472 pgoff_t pgoff = (((address & PAGE_MASK) 2358 pgoff_t pgoff = (((address & PAGE_MASK)
2473 - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff; 2359 - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2474 unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); 2360 unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
2475 2361
2476 pte_unmap(page_table); 2362 pte_unmap(page_table);
@@ -2614,7 +2500,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2614 entry = pte_mkyoung(entry); 2500 entry = pte_mkyoung(entry);
2615 if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { 2501 if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
2616 update_mmu_cache(vma, address, entry); 2502 update_mmu_cache(vma, address, entry);
2617 lazy_mmu_prot_update(entry);
2618 } else { 2503 } else {
2619 /* 2504 /*
2620 * This is needed only for protection faults but the arch code 2505 * This is needed only for protection faults but the arch code
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index df9d554bea30..091b9c6c2529 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -23,6 +23,9 @@
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/ioport.h> 24#include <linux/ioport.h>
25#include <linux/cpuset.h> 25#include <linux/cpuset.h>
26#include <linux/delay.h>
27#include <linux/migrate.h>
28#include <linux/page-isolation.h>
26 29
27#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
28 31
@@ -161,14 +164,27 @@ static void grow_pgdat_span(struct pglist_data *pgdat,
161 pgdat->node_start_pfn; 164 pgdat->node_start_pfn;
162} 165}
163 166
164int online_pages(unsigned long pfn, unsigned long nr_pages) 167static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
168 void *arg)
165{ 169{
166 unsigned long i; 170 unsigned long i;
171 unsigned long onlined_pages = *(unsigned long *)arg;
172 struct page *page;
173 if (PageReserved(pfn_to_page(start_pfn)))
174 for (i = 0; i < nr_pages; i++) {
175 page = pfn_to_page(start_pfn + i);
176 online_page(page);
177 onlined_pages++;
178 }
179 *(unsigned long *)arg = onlined_pages;
180 return 0;
181}
182
183
184int online_pages(unsigned long pfn, unsigned long nr_pages)
185{
167 unsigned long flags; 186 unsigned long flags;
168 unsigned long onlined_pages = 0; 187 unsigned long onlined_pages = 0;
169 struct resource res;
170 u64 section_end;
171 unsigned long start_pfn;
172 struct zone *zone; 188 struct zone *zone;
173 int need_zonelists_rebuild = 0; 189 int need_zonelists_rebuild = 0;
174 190
@@ -191,32 +207,16 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
191 if (!populated_zone(zone)) 207 if (!populated_zone(zone))
192 need_zonelists_rebuild = 1; 208 need_zonelists_rebuild = 1;
193 209
194 res.start = (u64)pfn << PAGE_SHIFT; 210 walk_memory_resource(pfn, nr_pages, &onlined_pages,
195 res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1; 211 online_pages_range);
196 res.flags = IORESOURCE_MEM; /* we just need system ram */
197 section_end = res.end;
198
199 while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
200 start_pfn = (unsigned long)(res.start >> PAGE_SHIFT);
201 nr_pages = (unsigned long)
202 ((res.end + 1 - res.start) >> PAGE_SHIFT);
203
204 if (PageReserved(pfn_to_page(start_pfn))) {
205 /* this region's page is not onlined now */
206 for (i = 0; i < nr_pages; i++) {
207 struct page *page = pfn_to_page(start_pfn + i);
208 online_page(page);
209 onlined_pages++;
210 }
211 }
212
213 res.start = res.end + 1;
214 res.end = section_end;
215 }
216 zone->present_pages += onlined_pages; 212 zone->present_pages += onlined_pages;
217 zone->zone_pgdat->node_present_pages += onlined_pages; 213 zone->zone_pgdat->node_present_pages += onlined_pages;
218 214
219 setup_per_zone_pages_min(); 215 setup_per_zone_pages_min();
216 if (onlined_pages) {
217 kswapd_run(zone_to_nid(zone));
218 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
219 }
220 220
221 if (need_zonelists_rebuild) 221 if (need_zonelists_rebuild)
222 build_all_zonelists(); 222 build_all_zonelists();
@@ -271,9 +271,6 @@ int add_memory(int nid, u64 start, u64 size)
271 if (!pgdat) 271 if (!pgdat)
272 return -ENOMEM; 272 return -ENOMEM;
273 new_pgdat = 1; 273 new_pgdat = 1;
274 ret = kswapd_run(nid);
275 if (ret)
276 goto error;
277 } 274 }
278 275
279 /* call arch's memory hotadd */ 276 /* call arch's memory hotadd */
@@ -308,3 +305,260 @@ error:
308 return ret; 305 return ret;
309} 306}
310EXPORT_SYMBOL_GPL(add_memory); 307EXPORT_SYMBOL_GPL(add_memory);
308
309#ifdef CONFIG_MEMORY_HOTREMOVE
310/*
 311 * Confirm that all pages in the range [start, end) belong to the same zone.
312 */
313static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
314{
315 unsigned long pfn;
316 struct zone *zone = NULL;
317 struct page *page;
318 int i;
319 for (pfn = start_pfn;
320 pfn < end_pfn;
321 pfn += MAX_ORDER_NR_PAGES) {
322 i = 0;
323 /* This is just a CONFIG_HOLES_IN_ZONE check.*/
324 while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
325 i++;
326 if (i == MAX_ORDER_NR_PAGES)
327 continue;
328 page = pfn_to_page(pfn + i);
329 if (zone && page_zone(page) != zone)
330 return 0;
331 zone = page_zone(page);
332 }
333 return 1;
334}
335
336/*
337 * Scanning pfn is much easier than scanning lru list.
 338 * Scan pfns from start to end and find the first LRU page.
339 */
340int scan_lru_pages(unsigned long start, unsigned long end)
341{
342 unsigned long pfn;
343 struct page *page;
344 for (pfn = start; pfn < end; pfn++) {
345 if (pfn_valid(pfn)) {
346 page = pfn_to_page(pfn);
347 if (PageLRU(page))
348 return pfn;
349 }
350 }
351 return 0;
352}
353
354static struct page *
355hotremove_migrate_alloc(struct page *page,
356 unsigned long private,
357 int **x)
358{
359 /* This should be improoooooved!! */
360 return alloc_page(GFP_HIGHUSER_PAGECACHE);
361}
362
363
364#define NR_OFFLINE_AT_ONCE_PAGES (256)
365static int
366do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
367{
368 unsigned long pfn;
369 struct page *page;
370 int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
371 int not_managed = 0;
372 int ret = 0;
373 LIST_HEAD(source);
374
375 for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
376 if (!pfn_valid(pfn))
377 continue;
378 page = pfn_to_page(pfn);
379 if (!page_count(page))
380 continue;
381 /*
382 * We can skip free pages. And we can only deal with pages on
383 * LRU.
384 */
385 ret = isolate_lru_page(page, &source);
386 if (!ret) { /* Success */
387 move_pages--;
388 } else {
 389 /* Because we don't have the big zone->lock, we should
390 check this again here. */
391 if (page_count(page))
392 not_managed++;
393#ifdef CONFIG_DEBUG_VM
394 printk(KERN_INFO "removing from LRU failed"
395 " %lx/%d/%lx\n",
396 pfn, page_count(page), page->flags);
397#endif
398 }
399 }
400 ret = -EBUSY;
401 if (not_managed) {
402 if (!list_empty(&source))
403 putback_lru_pages(&source);
404 goto out;
405 }
406 ret = 0;
407 if (list_empty(&source))
408 goto out;
409 /* this function returns # of failed pages */
410 ret = migrate_pages(&source, hotremove_migrate_alloc, 0);
411
412out:
413 return ret;
414}
415
416/*
417 * remove from free_area[] and mark all as Reserved.
418 */
419static int
420offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
421 void *data)
422{
423 __offline_isolated_pages(start, start + nr_pages);
424 return 0;
425}
426
427static void
428offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
429{
430 walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL,
431 offline_isolated_pages_cb);
432}
433
434/*
 435 * Check that all pages in the range, recorded as a memory resource, are isolated.
436 */
437static int
438check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
439 void *data)
440{
441 int ret;
442 long offlined = *(long *)data;
443 ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
444 offlined = nr_pages;
445 if (!ret)
446 *(long *)data += offlined;
447 return ret;
448}
449
450static long
451check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
452{
453 long offlined = 0;
454 int ret;
455
456 ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined,
457 check_pages_isolated_cb);
458 if (ret < 0)
459 offlined = (long)ret;
460 return offlined;
461}
462
463extern void drain_all_local_pages(void);
464
465int offline_pages(unsigned long start_pfn,
466 unsigned long end_pfn, unsigned long timeout)
467{
468 unsigned long pfn, nr_pages, expire;
469 long offlined_pages;
470 int ret, drain, retry_max;
471 struct zone *zone;
472
473 BUG_ON(start_pfn >= end_pfn);
474 /* at least, alignment against pageblock is necessary */
475 if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
476 return -EINVAL;
477 if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
478 return -EINVAL;
479 /* This makes hotplug much easier...and readable.
 480 we assume this for now. */
481 if (!test_pages_in_a_zone(start_pfn, end_pfn))
482 return -EINVAL;
483 /* set above range as isolated */
484 ret = start_isolate_page_range(start_pfn, end_pfn);
485 if (ret)
486 return ret;
487 nr_pages = end_pfn - start_pfn;
488 pfn = start_pfn;
489 expire = jiffies + timeout;
490 drain = 0;
491 retry_max = 5;
492repeat:
493 /* start memory hot removal */
494 ret = -EAGAIN;
495 if (time_after(jiffies, expire))
496 goto failed_removal;
497 ret = -EINTR;
498 if (signal_pending(current))
499 goto failed_removal;
500 ret = 0;
501 if (drain) {
502 lru_add_drain_all();
503 flush_scheduled_work();
504 cond_resched();
505 drain_all_local_pages();
506 }
507
508 pfn = scan_lru_pages(start_pfn, end_pfn);
509 if (pfn) { /* We have page on LRU */
510 ret = do_migrate_range(pfn, end_pfn);
511 if (!ret) {
512 drain = 1;
513 goto repeat;
514 } else {
515 if (ret < 0)
516 if (--retry_max == 0)
517 goto failed_removal;
518 yield();
519 drain = 1;
520 goto repeat;
521 }
522 }
 523 /* drain all zones' lru pagevecs; this is asynchronous... */
524 lru_add_drain_all();
525 flush_scheduled_work();
526 yield();
 527 /* drain pcp pages; this is synchronous. */
528 drain_all_local_pages();
529 /* check again */
530 offlined_pages = check_pages_isolated(start_pfn, end_pfn);
531 if (offlined_pages < 0) {
532 ret = -EBUSY;
533 goto failed_removal;
534 }
535 printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
 536 /* Ok, all of our target is isolated.
537 We cannot do rollback at this point. */
538 offline_isolated_pages(start_pfn, end_pfn);
539 /* reset pagetype flags */
540 start_isolate_page_range(start_pfn, end_pfn);
541 /* removal success */
542 zone = page_zone(pfn_to_page(start_pfn));
543 zone->present_pages -= offlined_pages;
544 zone->zone_pgdat->node_present_pages -= offlined_pages;
545 totalram_pages -= offlined_pages;
546 num_physpages -= offlined_pages;
547 vm_total_pages = nr_free_pagecache_pages();
548 writeback_set_ratelimit();
549 return 0;
550
551failed_removal:
552 printk(KERN_INFO "memory offlining %lx to %lx failed\n",
553 start_pfn, end_pfn);
554 /* pushback to free area */
555 undo_isolate_page_range(start_pfn, end_pfn);
556 return ret;
557}
558#else
559int remove_memory(u64 start, u64 size)
560{
561 return -EINVAL;
562}
563EXPORT_SYMBOL_GPL(remove_memory);
564#endif /* CONFIG_MEMORY_HOTREMOVE */
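Stepping back from the hunks above, offline_pages() boils down to: isolate the pfn range, keep migrating whatever is still on the LRU (draining pagevecs and per-cpu pages between passes), then verify the range is fully isolated before pulling it out of the allocator. A stripped-down sketch of that control flow with the kernel primitives replaced by stand-in stubs (the helpers here are hypothetical, not the real API):

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for isolate/migrate/drain; a toy counter plays the LRU. */
static int lru_pages_left = 3;

static int  isolate_range(void)        { return 0; }
static void undo_isolation(void)       { }
static bool lru_empty(void)            { return lru_pages_left == 0; }
static int  migrate_one_pass(void)     { if (lru_pages_left) lru_pages_left--; return 0; }
static void drain_lru_and_pcp(void)    { /* lru_add_drain_all() + drain_all_local_pages() */ }
static bool range_fully_isolated(void) { return lru_empty(); }

int main(void)
{
	int retry_max = 5;

	if (isolate_range())
		return 1;

	/* Migrate until nothing movable remains in the range. */
	while (!lru_empty()) {
		if (migrate_one_pass() && --retry_max == 0) {
			undo_isolation();
			puts("offline failed, range pushed back to free area");
			return 1;
		}
		drain_lru_and_pcp();
	}

	if (!range_fully_isolated()) {
		undo_isolation();
		return 1;
	}

	/* Safe to remove: adjust zone->present_pages, totalram_pages, ... */
	puts("range offlined");
	return 0;
}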
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3d6ac9505d07..568152ae6caf 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -72,7 +72,6 @@
72#include <linux/hugetlb.h> 72#include <linux/hugetlb.h>
73#include <linux/kernel.h> 73#include <linux/kernel.h>
74#include <linux/sched.h> 74#include <linux/sched.h>
75#include <linux/mm.h>
76#include <linux/nodemask.h> 75#include <linux/nodemask.h>
77#include <linux/cpuset.h> 76#include <linux/cpuset.h>
78#include <linux/gfp.h> 77#include <linux/gfp.h>
@@ -82,13 +81,13 @@
82#include <linux/interrupt.h> 81#include <linux/interrupt.h>
83#include <linux/init.h> 82#include <linux/init.h>
84#include <linux/compat.h> 83#include <linux/compat.h>
85#include <linux/mempolicy.h>
86#include <linux/swap.h> 84#include <linux/swap.h>
87#include <linux/seq_file.h> 85#include <linux/seq_file.h>
88#include <linux/proc_fs.h> 86#include <linux/proc_fs.h>
89#include <linux/migrate.h> 87#include <linux/migrate.h>
90#include <linux/rmap.h> 88#include <linux/rmap.h>
91#include <linux/security.h> 89#include <linux/security.h>
90#include <linux/syscalls.h>
92 91
93#include <asm/tlbflush.h> 92#include <asm/tlbflush.h>
94#include <asm/uaccess.h> 93#include <asm/uaccess.h>
@@ -110,6 +109,9 @@ struct mempolicy default_policy = {
110 .policy = MPOL_DEFAULT, 109 .policy = MPOL_DEFAULT,
111}; 110};
112 111
112static void mpol_rebind_policy(struct mempolicy *pol,
113 const nodemask_t *newmask);
114
113/* Do sanity checking on a policy */ 115/* Do sanity checking on a policy */
114static int mpol_check_policy(int mode, nodemask_t *nodes) 116static int mpol_check_policy(int mode, nodemask_t *nodes)
115{ 117{
@@ -128,7 +130,7 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
128 return -EINVAL; 130 return -EINVAL;
129 break; 131 break;
130 } 132 }
131 return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; 133 return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL;
132} 134}
133 135
134/* Generate a custom zonelist for the BIND policy. */ 136/* Generate a custom zonelist for the BIND policy. */
@@ -185,7 +187,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
185 switch (mode) { 187 switch (mode) {
186 case MPOL_INTERLEAVE: 188 case MPOL_INTERLEAVE:
187 policy->v.nodes = *nodes; 189 policy->v.nodes = *nodes;
188 if (nodes_weight(*nodes) == 0) { 190 nodes_and(policy->v.nodes, policy->v.nodes,
191 node_states[N_HIGH_MEMORY]);
192 if (nodes_weight(policy->v.nodes) == 0) {
189 kmem_cache_free(policy_cache, policy); 193 kmem_cache_free(policy_cache, policy);
190 return ERR_PTR(-EINVAL); 194 return ERR_PTR(-EINVAL);
191 } 195 }
@@ -459,7 +463,7 @@ static void mpol_set_task_struct_flag(void)
459} 463}
460 464
461/* Set the process memory policy */ 465/* Set the process memory policy */
462long do_set_mempolicy(int mode, nodemask_t *nodes) 466static long do_set_mempolicy(int mode, nodemask_t *nodes)
463{ 467{
464 struct mempolicy *new; 468 struct mempolicy *new;
465 469
@@ -494,9 +498,9 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
494 *nodes = p->v.nodes; 498 *nodes = p->v.nodes;
495 break; 499 break;
496 case MPOL_PREFERRED: 500 case MPOL_PREFERRED:
497 /* or use current node instead of online map? */ 501 /* or use current node instead of memory_map? */
498 if (p->v.preferred_node < 0) 502 if (p->v.preferred_node < 0)
499 *nodes = node_online_map; 503 *nodes = node_states[N_HIGH_MEMORY];
500 else 504 else
501 node_set(p->v.preferred_node, *nodes); 505 node_set(p->v.preferred_node, *nodes);
502 break; 506 break;
@@ -519,8 +523,8 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
519} 523}
520 524
521/* Retrieve NUMA policy */ 525/* Retrieve NUMA policy */
522long do_get_mempolicy(int *policy, nodemask_t *nmask, 526static long do_get_mempolicy(int *policy, nodemask_t *nmask,
523 unsigned long addr, unsigned long flags) 527 unsigned long addr, unsigned long flags)
524{ 528{
525 int err; 529 int err;
526 struct mm_struct *mm = current->mm; 530 struct mm_struct *mm = current->mm;
@@ -528,8 +532,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
528 struct mempolicy *pol = current->mempolicy; 532 struct mempolicy *pol = current->mempolicy;
529 533
530 cpuset_update_task_memory_state(); 534 cpuset_update_task_memory_state();
531 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) 535 if (flags &
536 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
532 return -EINVAL; 537 return -EINVAL;
538
539 if (flags & MPOL_F_MEMS_ALLOWED) {
540 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
541 return -EINVAL;
542 *policy = 0; /* just so it's initialized */
543 *nmask = cpuset_current_mems_allowed;
544 return 0;
545 }
546
533 if (flags & MPOL_F_ADDR) { 547 if (flags & MPOL_F_ADDR) {
534 down_read(&mm->mmap_sem); 548 down_read(&mm->mmap_sem);
535 vma = find_vma_intersection(mm, addr, addr+1); 549 vma = find_vma_intersection(mm, addr, addr+1);
@@ -601,7 +615,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x
601 * Migrate pages from one node to a target node. 615 * Migrate pages from one node to a target node.
602 * Returns error or the number of pages not migrated. 616 * Returns error or the number of pages not migrated.
603 */ 617 */
604int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) 618static int migrate_to_node(struct mm_struct *mm, int source, int dest,
619 int flags)
605{ 620{
606 nodemask_t nmask; 621 nodemask_t nmask;
607 LIST_HEAD(pagelist); 622 LIST_HEAD(pagelist);
@@ -732,8 +747,9 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
732} 747}
733#endif 748#endif
734 749
735long do_mbind(unsigned long start, unsigned long len, 750static long do_mbind(unsigned long start, unsigned long len,
736 unsigned long mode, nodemask_t *nmask, unsigned long flags) 751 unsigned long mode, nodemask_t *nmask,
752 unsigned long flags)
737{ 753{
738 struct vm_area_struct *vma; 754 struct vm_area_struct *vma;
739 struct mm_struct *mm = current->mm; 755 struct mm_struct *mm = current->mm;
@@ -955,7 +971,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
955 goto out; 971 goto out;
956 } 972 }
957 973
958 if (!nodes_subset(new, node_online_map)) { 974 if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
959 err = -EINVAL; 975 err = -EINVAL;
960 goto out; 976 goto out;
961 } 977 }
@@ -978,7 +994,8 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
978 unsigned long maxnode, 994 unsigned long maxnode,
979 unsigned long addr, unsigned long flags) 995 unsigned long addr, unsigned long flags)
980{ 996{
981 int err, pval; 997 int err;
998 int uninitialized_var(pval);
982 nodemask_t nodes; 999 nodemask_t nodes;
983 1000
984 if (nmask != NULL && maxnode < MAX_NUMNODES) 1001 if (nmask != NULL && maxnode < MAX_NUMNODES)
@@ -1527,8 +1544,8 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1527 kmem_cache_free(sn_cache, n); 1544 kmem_cache_free(sn_cache, n);
1528} 1545}
1529 1546
1530struct sp_node * 1547static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1531sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) 1548 struct mempolicy *pol)
1532{ 1549{
1533 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); 1550 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1534 1551
@@ -1677,7 +1694,7 @@ void __init numa_policy_init(void)
1677 * fall back to the largest node if they're all smaller. 1694 * fall back to the largest node if they're all smaller.
1678 */ 1695 */
1679 nodes_clear(interleave_nodes); 1696 nodes_clear(interleave_nodes);
1680 for_each_online_node(nid) { 1697 for_each_node_state(nid, N_HIGH_MEMORY) {
1681 unsigned long total_pages = node_present_pages(nid); 1698 unsigned long total_pages = node_present_pages(nid);
1682 1699
1683 /* Preserve the largest node */ 1700 /* Preserve the largest node */
@@ -1706,7 +1723,8 @@ void numa_default_policy(void)
1706} 1723}
1707 1724
1708/* Migrate a policy to a different set of nodes */ 1725/* Migrate a policy to a different set of nodes */
1709void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) 1726static void mpol_rebind_policy(struct mempolicy *pol,
1727 const nodemask_t *newmask)
1710{ 1728{
1711 nodemask_t *mpolmask; 1729 nodemask_t *mpolmask;
1712 nodemask_t tmp; 1730 nodemask_t tmp;
@@ -1963,7 +1981,7 @@ int show_numa_map(struct seq_file *m, void *v)
1963 seq_printf(m, " huge"); 1981 seq_printf(m, " huge");
1964 } else { 1982 } else {
1965 check_pgd_range(vma, vma->vm_start, vma->vm_end, 1983 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1966 &node_online_map, MPOL_MF_STATS, md); 1984 &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
1967 } 1985 }
1968 1986
1969 if (!md->pages) 1987 if (!md->pages)
@@ -1990,7 +2008,7 @@ int show_numa_map(struct seq_file *m, void *v)
1990 if (md->writeback) 2008 if (md->writeback)
1991 seq_printf(m," writeback=%lu", md->writeback); 2009 seq_printf(m," writeback=%lu", md->writeback);
1992 2010
1993 for_each_online_node(n) 2011 for_each_node_state(n, N_HIGH_MEMORY)
1994 if (md->node[n]) 2012 if (md->node[n])
1995 seq_printf(m, " N%d=%lu", n, md->node[n]); 2013 seq_printf(m, " N%d=%lu", n, md->node[n]);
1996out: 2014out:
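One userspace-visible consequence of the mempolicy hunks above is the new MPOL_F_MEMS_ALLOWED query mode for get_mempolicy(), which reports the caller's allowed node mask instead of its policy and must not be combined with MPOL_F_NODE or MPOL_F_ADDR. A hedged usage sketch (assumes a numaif.h recent enough to define the flag; link against libnuma):

#include <numaif.h>
#include <stdio.h>

int main(void)
{
	unsigned long nodemask[16] = { 0 };
	unsigned long maxnode = sizeof(nodemask) * 8;	/* mask size in bits */
	int mode = 0;

	/* Ask for the allowed-node mask rather than the current policy. */
	if (get_mempolicy(&mode, nodemask, maxnode, NULL, MPOL_F_MEMS_ALLOWED))
		return 1;

	printf("first word of allowed-node mask: 0x%lx\n", nodemask[0]);
	return 0;
}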
diff --git a/mm/migrate.c b/mm/migrate.c
index e2fdbce1874b..06d0877a66ef 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -171,6 +171,7 @@ static void remove_migration_pte(struct vm_area_struct *vma,
171 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 171 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
172 if (is_write_migration_entry(entry)) 172 if (is_write_migration_entry(entry))
173 pte = pte_mkwrite(pte); 173 pte = pte_mkwrite(pte);
174 flush_cache_page(vma, addr, pte_pfn(pte));
174 set_pte_at(mm, addr, ptep, pte); 175 set_pte_at(mm, addr, ptep, pte);
175 176
176 if (PageAnon(new)) 177 if (PageAnon(new))
@@ -180,7 +181,6 @@ static void remove_migration_pte(struct vm_area_struct *vma,
180 181
181 /* No need to invalidate - it was non-present before */ 182 /* No need to invalidate - it was non-present before */
182 update_mmu_cache(vma, addr, pte); 183 update_mmu_cache(vma, addr, pte);
183 lazy_mmu_prot_update(pte);
184 184
185out: 185out:
186 pte_unmap_unlock(ptep, ptl); 186 pte_unmap_unlock(ptep, ptl);
@@ -972,7 +972,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
972 * array. Return various errors if the user did something wrong. 972 * array. Return various errors if the user did something wrong.
973 */ 973 */
974 for (i = 0; i < nr_pages; i++) { 974 for (i = 0; i < nr_pages; i++) {
975 const void *p; 975 const void __user *p;
976 976
977 err = -EFAULT; 977 err = -EFAULT;
978 if (get_user(p, pages + i)) 978 if (get_user(p, pages + i))
@@ -986,7 +986,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
986 goto out; 986 goto out;
987 987
988 err = -ENODEV; 988 err = -ENODEV;
989 if (!node_online(node)) 989 if (!node_state(node, N_HIGH_MEMORY))
990 goto out; 990 goto out;
991 991
992 err = -EACCES; 992 err = -EACCES;
diff --git a/mm/mmap.c b/mm/mmap.c
index 0d40e66c841b..4275e81e25ba 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/backing-dev.h>
10#include <linux/mm.h> 11#include <linux/mm.h>
11#include <linux/shm.h> 12#include <linux/shm.h>
12#include <linux/mman.h> 13#include <linux/mman.h>
@@ -180,8 +181,6 @@ error:
180 return -ENOMEM; 181 return -ENOMEM;
181} 182}
182 183
183EXPORT_SYMBOL(__vm_enough_memory);
184
185/* 184/*
186 * Requires inode->i_mapping->i_mmap_lock 185 * Requires inode->i_mapping->i_mmap_lock
187 */ 186 */
diff --git a/mm/mprotect.c b/mm/mprotect.c
index e8346c30abec..1d4d69790e59 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -53,7 +53,6 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
53 if (dirty_accountable && pte_dirty(ptent)) 53 if (dirty_accountable && pte_dirty(ptent))
54 ptent = pte_mkwrite(ptent); 54 ptent = pte_mkwrite(ptent);
55 set_pte_at(mm, addr, pte, ptent); 55 set_pte_at(mm, addr, pte, ptent);
56 lazy_mmu_prot_update(ptent);
57#ifdef CONFIG_MIGRATION 56#ifdef CONFIG_MIGRATION
58 } else if (!pte_file(oldpte)) { 57 } else if (!pte_file(oldpte)) {
59 swp_entry_t entry = pte_to_swp_entry(oldpte); 58 swp_entry_t entry = pte_to_swp_entry(oldpte);
diff --git a/mm/nommu.c b/mm/nommu.c
index 8ed0cb43118a..42fb84e9e815 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -44,7 +44,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
44int heap_stack_gap = 0; 44int heap_stack_gap = 0;
45 45
46EXPORT_SYMBOL(mem_map); 46EXPORT_SYMBOL(mem_map);
47EXPORT_SYMBOL(__vm_enough_memory);
48EXPORT_SYMBOL(num_physpages); 47EXPORT_SYMBOL(num_physpages);
49 48
50/* list of shareable VMAs */ 49/* list of shareable VMAs */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f9b82ad5047f..a64decb5b13f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -27,6 +27,8 @@
27#include <linux/notifier.h> 27#include <linux/notifier.h>
28 28
29int sysctl_panic_on_oom; 29int sysctl_panic_on_oom;
30int sysctl_oom_kill_allocating_task;
31static DEFINE_SPINLOCK(zone_scan_mutex);
30/* #define DEBUG */ 32/* #define DEBUG */
31 33
32/** 34/**
@@ -141,7 +143,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
141 * because p may have allocated or otherwise mapped memory on 143 * because p may have allocated or otherwise mapped memory on
142 * this node before. However it will be less likely. 144 * this node before. However it will be less likely.
143 */ 145 */
144 if (!cpuset_excl_nodes_overlap(p)) 146 if (!cpuset_mems_allowed_intersects(current, p))
145 points /= 8; 147 points /= 8;
146 148
147 /* 149 /*
@@ -164,27 +166,14 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
164} 166}
165 167
166/* 168/*
167 * Types of limitations to the nodes from which allocations may occur
168 */
169#define CONSTRAINT_NONE 1
170#define CONSTRAINT_MEMORY_POLICY 2
171#define CONSTRAINT_CPUSET 3
172
173/*
174 * Determine the type of allocation constraint. 169 * Determine the type of allocation constraint.
175 */ 170 */
176static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) 171static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
172 gfp_t gfp_mask)
177{ 173{
178#ifdef CONFIG_NUMA 174#ifdef CONFIG_NUMA
179 struct zone **z; 175 struct zone **z;
180 nodemask_t nodes; 176 nodemask_t nodes = node_states[N_HIGH_MEMORY];
181 int node;
182
183 nodes_clear(nodes);
184 /* node has memory ? */
185 for_each_online_node(node)
186 if (NODE_DATA(node)->node_present_pages)
187 node_set(node, nodes);
188 177
189 for (z = zonelist->zones; *z; z++) 178 for (z = zonelist->zones; *z; z++)
190 if (cpuset_zone_allowed_softwall(*z, gfp_mask)) 179 if (cpuset_zone_allowed_softwall(*z, gfp_mask))
@@ -344,12 +333,20 @@ static int oom_kill_task(struct task_struct *p)
344 return 0; 333 return 0;
345} 334}
346 335
347static int oom_kill_process(struct task_struct *p, unsigned long points, 336static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
348 const char *message) 337 unsigned long points, const char *message)
349{ 338{
350 struct task_struct *c; 339 struct task_struct *c;
351 struct list_head *tsk; 340 struct list_head *tsk;
352 341
342 if (printk_ratelimit()) {
343 printk(KERN_WARNING "%s invoked oom-killer: "
344 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
345 current->comm, gfp_mask, order, current->oomkilladj);
346 dump_stack();
347 show_mem();
348 }
349
353 /* 350 /*
354 * If the task is already exiting, don't alarm the sysadmin or kill 351 * If the task is already exiting, don't alarm the sysadmin or kill
355 * its children or threads, just set TIF_MEMDIE so it can die quickly 352 * its children or threads, just set TIF_MEMDIE so it can die quickly
@@ -387,6 +384,57 @@ int unregister_oom_notifier(struct notifier_block *nb)
387} 384}
388EXPORT_SYMBOL_GPL(unregister_oom_notifier); 385EXPORT_SYMBOL_GPL(unregister_oom_notifier);
389 386
387/*
388 * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero
389 * if a parallel OOM killing is already taking place that includes a zone in
390 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
391 */
392int try_set_zone_oom(struct zonelist *zonelist)
393{
394 struct zone **z;
395 int ret = 1;
396
397 z = zonelist->zones;
398
399 spin_lock(&zone_scan_mutex);
400 do {
401 if (zone_is_oom_locked(*z)) {
402 ret = 0;
403 goto out;
404 }
405 } while (*(++z) != NULL);
406
407 /*
408 * Lock each zone in the zonelist under zone_scan_mutex so a parallel
409 * invocation of try_set_zone_oom() doesn't succeed when it shouldn't.
410 */
411 z = zonelist->zones;
412 do {
413 zone_set_flag(*z, ZONE_OOM_LOCKED);
414 } while (*(++z) != NULL);
415out:
416 spin_unlock(&zone_scan_mutex);
417 return ret;
418}
419
420/*
421 * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
422 * allocation attempts with zonelists containing them may now recall the OOM
423 * killer, if necessary.
424 */
425void clear_zonelist_oom(struct zonelist *zonelist)
426{
427 struct zone **z;
428
429 z = zonelist->zones;
430
431 spin_lock(&zone_scan_mutex);
432 do {
433 zone_clear_flag(*z, ZONE_OOM_LOCKED);
434 } while (*(++z) != NULL);
435 spin_unlock(&zone_scan_mutex);
436}
437
390/** 438/**
391 * out_of_memory - kill the "best" process when we run out of memory 439 * out_of_memory - kill the "best" process when we run out of memory
392 * 440 *
@@ -400,21 +448,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
400 struct task_struct *p; 448 struct task_struct *p;
401 unsigned long points = 0; 449 unsigned long points = 0;
402 unsigned long freed = 0; 450 unsigned long freed = 0;
403 int constraint; 451 enum oom_constraint constraint;
404 452
405 blocking_notifier_call_chain(&oom_notify_list, 0, &freed); 453 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
406 if (freed > 0) 454 if (freed > 0)
407 /* Got some memory back in the last second. */ 455 /* Got some memory back in the last second. */
408 return; 456 return;
409 457
410 if (printk_ratelimit()) {
411 printk(KERN_WARNING "%s invoked oom-killer: "
412 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
413 current->comm, gfp_mask, order, current->oomkilladj);
414 dump_stack();
415 show_mem();
416 }
417
418 if (sysctl_panic_on_oom == 2) 458 if (sysctl_panic_on_oom == 2)
419 panic("out of memory. Compulsory panic_on_oom is selected.\n"); 459 panic("out of memory. Compulsory panic_on_oom is selected.\n");
420 460
@@ -423,23 +463,24 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
423 * NUMA) that may require different handling. 463 * NUMA) that may require different handling.
424 */ 464 */
425 constraint = constrained_alloc(zonelist, gfp_mask); 465 constraint = constrained_alloc(zonelist, gfp_mask);
426 cpuset_lock();
427 read_lock(&tasklist_lock); 466 read_lock(&tasklist_lock);
428 467
429 switch (constraint) { 468 switch (constraint) {
430 case CONSTRAINT_MEMORY_POLICY: 469 case CONSTRAINT_MEMORY_POLICY:
431 oom_kill_process(current, points, 470 oom_kill_process(current, gfp_mask, order, points,
432 "No available memory (MPOL_BIND)"); 471 "No available memory (MPOL_BIND)");
433 break; 472 break;
434 473
435 case CONSTRAINT_CPUSET:
436 oom_kill_process(current, points,
437 "No available memory in cpuset");
438 break;
439
440 case CONSTRAINT_NONE: 474 case CONSTRAINT_NONE:
441 if (sysctl_panic_on_oom) 475 if (sysctl_panic_on_oom)
442 panic("out of memory. panic_on_oom is selected\n"); 476 panic("out of memory. panic_on_oom is selected\n");
477 /* Fall-through */
478 case CONSTRAINT_CPUSET:
479 if (sysctl_oom_kill_allocating_task) {
480 oom_kill_process(current, gfp_mask, order, points,
481 "Out of memory (oom_kill_allocating_task)");
482 break;
483 }
443retry: 484retry:
444 /* 485 /*
445 * Rambo mode: Shoot down a process and hope it solves whatever 486 * Rambo mode: Shoot down a process and hope it solves whatever
@@ -453,11 +494,11 @@ retry:
453 /* Found nothing?!?! Either we hang forever, or we panic. */ 494 /* Found nothing?!?! Either we hang forever, or we panic. */
454 if (!p) { 495 if (!p) {
455 read_unlock(&tasklist_lock); 496 read_unlock(&tasklist_lock);
456 cpuset_unlock();
457 panic("Out of memory and no killable processes...\n"); 497 panic("Out of memory and no killable processes...\n");
458 } 498 }
459 499
 460 if (oom_kill_process(p, points, "Out of memory")) 500 if (oom_kill_process(p, gfp_mask, order, points,
 501 "Out of memory"))
461 goto retry; 502 goto retry;
462 503
463 break; 504 break;
@@ -465,7 +506,6 @@ retry:
465 506
466out: 507out:
467 read_unlock(&tasklist_lock); 508 read_unlock(&tasklist_lock);
468 cpuset_unlock();
469 509
470 /* 510 /*
471 * Give "p" a good chance of killing itself before we 511 * Give "p" a good chance of killing itself before we
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 44720363374c..7845462064f4 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2,6 +2,7 @@
2 * mm/page-writeback.c 2 * mm/page-writeback.c
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds. 4 * Copyright (C) 2002, Linus Torvalds.
5 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 * 6 *
6 * Contains functions related to writing back dirty pages at the 7 * Contains functions related to writing back dirty pages at the
7 * address_space level. 8 * address_space level.
@@ -36,7 +37,7 @@
36 37
37/* 38/*
38 * The maximum number of pages to writeout in a single bdflush/kupdate 39 * The maximum number of pages to writeout in a single bdflush/kupdate
39 * operation. We do this so we don't hold I_LOCK against an inode for 40 * operation. We do this so we don't hold I_SYNC against an inode for
40 * enormous amounts of time, which would block a userspace task which has 41 * enormous amounts of time, which would block a userspace task which has
41 * been forced to throttle against that inode. Also, the code reevaluates 42 * been forced to throttle against that inode. Also, the code reevaluates
42 * the dirty each time it has written this many pages. 43 * the dirty each time it has written this many pages.
@@ -49,8 +50,6 @@
49 */ 50 */
50static long ratelimit_pages = 32; 51static long ratelimit_pages = 32;
51 52
52static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
53
54/* 53/*
55 * When balance_dirty_pages decides that the caller needs to perform some 54 * When balance_dirty_pages decides that the caller needs to perform some
56 * non-background writeback, this is how many pages it will attempt to write. 55 * non-background writeback, this is how many pages it will attempt to write.
@@ -103,6 +102,141 @@ EXPORT_SYMBOL(laptop_mode);
103static void background_writeout(unsigned long _min_pages); 102static void background_writeout(unsigned long _min_pages);
104 103
105/* 104/*
105 * Scale the writeback cache size proportional to the relative writeout speeds.
106 *
107 * We do this by keeping a floating proportion between BDIs, based on page
108 * writeback completions [end_page_writeback()]. Those devices that write out
109 * pages fastest will get the larger share, while the slower will get a smaller
110 * share.
111 *
112 * We use page writeout completions because we are interested in getting rid of
113 * dirty pages. Having them written out is the primary goal.
114 *
115 * We introduce a concept of time, a period over which we measure these events,
116 * because demand can/will vary over time. The length of this period itself is
117 * measured in page writeback completions.
118 *
119 */
120static struct prop_descriptor vm_completions;
121static struct prop_descriptor vm_dirties;
122
123static unsigned long determine_dirtyable_memory(void);
124
125/*
126 * couple the period to the dirty_ratio:
127 *
128 * period/2 ~ roundup_pow_of_two(dirty limit)
129 */
130static int calc_period_shift(void)
131{
132 unsigned long dirty_total;
133
134 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
135 return 2 + ilog2(dirty_total - 1);
136}
137
138/*
139 * update the period when the dirty ratio changes.
140 */
141int dirty_ratio_handler(struct ctl_table *table, int write,
142 struct file *filp, void __user *buffer, size_t *lenp,
143 loff_t *ppos)
144{
145 int old_ratio = vm_dirty_ratio;
146 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
147 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
148 int shift = calc_period_shift();
149 prop_change_shift(&vm_completions, shift);
150 prop_change_shift(&vm_dirties, shift);
151 }
152 return ret;
153}
154
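calc_period_shift() above couples the length of the floating-proportion period to the dirty limit: the period is 2^(2 + ilog2(dirty_total - 1)), so half a period is roughly the dirty limit rounded up to a power of two. A small standalone check of that arithmetic (ilog2 written out by hand; the page count is an arbitrary example):

#include <stdio.h>

/* floor(log2(x)) for x > 0, like the kernel's ilog2() on a runtime value */
static int ilog2_sketch(unsigned long x)
{
        int n = -1;
        while (x) {
                x >>= 1;
                n++;
        }
        return n;
}

int main(void)
{
        unsigned long dirty_total = 100000;     /* example dirty limit, in pages */
        int shift = 2 + ilog2_sketch(dirty_total - 1);
        unsigned long period = 1UL << shift;

        /* period/2 ~ roundup_pow_of_two(dirty_total) */
        printf("dirty_total=%lu shift=%d period=%lu period/2=%lu\n",
               dirty_total, shift, period, period / 2);
        return 0;
}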
155/*
156 * Increment the BDI's writeout completion count and the global writeout
157 * completion count. Called from test_clear_page_writeback().
158 */
159static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
160{
161 __prop_inc_percpu(&vm_completions, &bdi->completions);
162}
163
164static inline void task_dirty_inc(struct task_struct *tsk)
165{
166 prop_inc_single(&vm_dirties, &tsk->dirties);
167}
168
169/*
170 * Obtain an accurate fraction of the BDI's portion.
171 */
172static void bdi_writeout_fraction(struct backing_dev_info *bdi,
173 long *numerator, long *denominator)
174{
175 if (bdi_cap_writeback_dirty(bdi)) {
176 prop_fraction_percpu(&vm_completions, &bdi->completions,
177 numerator, denominator);
178 } else {
179 *numerator = 0;
180 *denominator = 1;
181 }
182}
183
184/*
185 * Clip the earned share of dirty pages to that which is actually available.
186 * This avoids exceeding the total dirty_limit when the floating averages
187 * fluctuate too quickly.
188 */
189static void
190clip_bdi_dirty_limit(struct backing_dev_info *bdi, long dirty, long *pbdi_dirty)
191{
192 long avail_dirty;
193
194 avail_dirty = dirty -
195 (global_page_state(NR_FILE_DIRTY) +
196 global_page_state(NR_WRITEBACK) +
197 global_page_state(NR_UNSTABLE_NFS));
198
199 if (avail_dirty < 0)
200 avail_dirty = 0;
201
202 avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
203 bdi_stat(bdi, BDI_WRITEBACK);
204
205 *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
206}
207
208static inline void task_dirties_fraction(struct task_struct *tsk,
209 long *numerator, long *denominator)
210{
211 prop_fraction_single(&vm_dirties, &tsk->dirties,
212 numerator, denominator);
213}
214
215/*
216 * scale the dirty limit
217 *
218 * task specific dirty limit:
219 *
220 * dirty -= (dirty/8) * p_{t}
221 */
222void task_dirty_limit(struct task_struct *tsk, long *pdirty)
223{
224 long numerator, denominator;
225 long dirty = *pdirty;
226 u64 inv = dirty >> 3;
227
228 task_dirties_fraction(tsk, &numerator, &denominator);
229 inv *= numerator;
230 do_div(inv, denominator);
231
232 dirty -= inv;
233 if (dirty < *pdirty/2)
234 dirty = *pdirty/2;
235
236 *pdirty = dirty;
237}
238
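task_dirty_limit() above shaves up to dirty/8 off the per-device limit in proportion to how much of the recent dirtying this task did (p_t = numerator/denominator), and never lets the result drop below half the original limit. A minimal arithmetic sketch with made-up numbers:

#include <stdio.h>

/* dirty -= (dirty/8) * numerator/denominator, clamped to at least dirty/2 */
static long task_dirty_limit_sketch(long dirty, long numerator, long denominator)
{
        long limit = dirty;
        long inv = dirty >> 3;          /* dirty/8 */

        inv = inv * numerator / denominator;
        limit -= inv;
        if (limit < dirty / 2)
                limit = dirty / 2;
        return limit;
}

int main(void)
{
        /* a task responsible for half of the recent dirtying */
        printf("%ld\n", task_dirty_limit_sketch(8000, 1, 2));  /* 7500 */
        /* a task responsible for all of it */
        printf("%ld\n", task_dirty_limit_sketch(8000, 1, 1));  /* 7000 */
        return 0;
}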
239/*
106 * Work out the current dirty-memory clamping and background writeout 240 * Work out the current dirty-memory clamping and background writeout
107 * thresholds. 241 * thresholds.
108 * 242 *
@@ -126,7 +260,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
126 int node; 260 int node;
127 unsigned long x = 0; 261 unsigned long x = 0;
128 262
129 for_each_online_node(node) { 263 for_each_node_state(node, N_HIGH_MEMORY) {
130 struct zone *z = 264 struct zone *z =
131 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; 265 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
132 266
@@ -158,8 +292,8 @@ static unsigned long determine_dirtyable_memory(void)
158} 292}
159 293
160static void 294static void
161get_dirty_limits(long *pbackground, long *pdirty, 295get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
162 struct address_space *mapping) 296 struct backing_dev_info *bdi)
163{ 297{
164 int background_ratio; /* Percentages */ 298 int background_ratio; /* Percentages */
165 int dirty_ratio; 299 int dirty_ratio;
@@ -193,6 +327,23 @@ get_dirty_limits(long *pbackground, long *pdirty,
193 } 327 }
194 *pbackground = background; 328 *pbackground = background;
195 *pdirty = dirty; 329 *pdirty = dirty;
330
331 if (bdi) {
332 u64 bdi_dirty = dirty;
333 long numerator, denominator;
334
335 /*
336 * Calculate this BDI's share of the dirty ratio.
337 */
338 bdi_writeout_fraction(bdi, &numerator, &denominator);
339
340 bdi_dirty *= numerator;
341 do_div(bdi_dirty, denominator);
342
343 *pbdi_dirty = bdi_dirty;
344 clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
345 task_dirty_limit(current, pbdi_dirty);
346 }
196} 347}
197 348
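With the hunk above, get_dirty_limits() also hands back a per-device threshold: the global dirty limit is multiplied by this BDI's share of recent writeout completions (the numerator/denominator pair from the floating proportion), then clipped and scaled per task. A rough sketch of just the share computation, with invented numbers standing in for the prop_fraction_percpu() result:

#include <stdio.h>

/* bdi_thresh = global dirty threshold * (this BDI's completions / all completions) */
static unsigned long bdi_share_sketch(unsigned long dirty_thresh,
                                      long numerator, long denominator)
{
        unsigned long long bdi_dirty = dirty_thresh;

        bdi_dirty *= numerator;
        bdi_dirty /= denominator;       /* do_div() in the kernel */
        return (unsigned long)bdi_dirty;
}

int main(void)
{
        unsigned long dirty_thresh = 10000;     /* pages, example global limit */

        /* a fast disk doing ~70% of recent writeback vs a slow USB stick */
        printf("fast bdi: %lu pages\n", bdi_share_sketch(dirty_thresh, 7, 10));
        printf("slow bdi: %lu pages\n", bdi_share_sketch(dirty_thresh, 3, 10));
        return 0;
}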
198/* 349/*
@@ -204,9 +355,11 @@ get_dirty_limits(long *pbackground, long *pdirty,
204 */ 355 */
205static void balance_dirty_pages(struct address_space *mapping) 356static void balance_dirty_pages(struct address_space *mapping)
206{ 357{
207 long nr_reclaimable; 358 long bdi_nr_reclaimable;
359 long bdi_nr_writeback;
208 long background_thresh; 360 long background_thresh;
209 long dirty_thresh; 361 long dirty_thresh;
362 long bdi_thresh;
210 unsigned long pages_written = 0; 363 unsigned long pages_written = 0;
211 unsigned long write_chunk = sync_writeback_pages(); 364 unsigned long write_chunk = sync_writeback_pages();
212 365
@@ -221,15 +374,15 @@ static void balance_dirty_pages(struct address_space *mapping)
221 .range_cyclic = 1, 374 .range_cyclic = 1,
222 }; 375 };
223 376
224 get_dirty_limits(&background_thresh, &dirty_thresh, mapping); 377 get_dirty_limits(&background_thresh, &dirty_thresh,
225 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 378 &bdi_thresh, bdi);
226 global_page_state(NR_UNSTABLE_NFS); 379 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
227 if (nr_reclaimable + global_page_state(NR_WRITEBACK) <= 380 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
228 dirty_thresh) 381 if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
229 break; 382 break;
230 383
231 if (!dirty_exceeded) 384 if (!bdi->dirty_exceeded)
232 dirty_exceeded = 1; 385 bdi->dirty_exceeded = 1;
233 386
234 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 387 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
235 * Unstable writes are a feature of certain networked 388 * Unstable writes are a feature of certain networked
@@ -237,26 +390,42 @@ static void balance_dirty_pages(struct address_space *mapping)
237 * written to the server's write cache, but has not yet 390 * written to the server's write cache, but has not yet
238 * been flushed to permanent storage. 391 * been flushed to permanent storage.
239 */ 392 */
240 if (nr_reclaimable) { 393 if (bdi_nr_reclaimable) {
241 writeback_inodes(&wbc); 394 writeback_inodes(&wbc);
242 get_dirty_limits(&background_thresh,
243 &dirty_thresh, mapping);
244 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
245 global_page_state(NR_UNSTABLE_NFS);
246 if (nr_reclaimable +
247 global_page_state(NR_WRITEBACK)
248 <= dirty_thresh)
249 break;
250 pages_written += write_chunk - wbc.nr_to_write; 395 pages_written += write_chunk - wbc.nr_to_write;
251 if (pages_written >= write_chunk) 396 get_dirty_limits(&background_thresh, &dirty_thresh,
252 break; /* We've done our duty */ 397 &bdi_thresh, bdi);
253 } 398 }
399
400 /*
401 * In order to avoid the stacked BDI deadlock we need
402 * to ensure we accurately count the 'dirty' pages when
403 * the threshold is low.
404 *
405 * Otherwise it would be possible to get thresh+n pages
406 * reported dirty, even though there are thresh-m pages
407 * actually dirty; with m+n sitting in the percpu
408 * deltas.
409 */
410 if (bdi_thresh < 2*bdi_stat_error(bdi)) {
411 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
412 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
413 } else if (bdi_nr_reclaimable) {
414 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
415 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
416 }
417
418 if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
419 break;
420 if (pages_written >= write_chunk)
421 break; /* We've done our duty */
422
254 congestion_wait(WRITE, HZ/10); 423 congestion_wait(WRITE, HZ/10);
255 } 424 }
256 425
257 if (nr_reclaimable + global_page_state(NR_WRITEBACK) 426 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
258 <= dirty_thresh && dirty_exceeded) 427 bdi->dirty_exceeded)
259 dirty_exceeded = 0; 428 bdi->dirty_exceeded = 0;
260 429
261 if (writeback_in_progress(bdi)) 430 if (writeback_in_progress(bdi))
262 return; /* pdflush is already working this queue */ 431 return; /* pdflush is already working this queue */
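The bdi_stat() counters are per-cpu: each CPU batches small deltas locally and only folds them into the global value occasionally, so a cheap read can be off by roughly (number of CPUs x batch size), which the kernel calls bdi_stat_error(). The hunk above therefore switches to the expensive exact sum (bdi_stat_sum()) whenever bdi_thresh is within twice that error, so a small threshold cannot be met or missed purely because of stale per-cpu deltas. A toy model of the idea (arrays stand in for the per-cpu counters; names and sizes are illustrative):

#include <stdio.h>

#define NR_CPUS 4
#define BATCH   8       /* max delta a CPU keeps locally before folding it in */

struct pcpu_counter {
        long global;            /* cheap, possibly stale, value */
        long delta[NR_CPUS];    /* per-cpu contributions not yet folded in */
};

static long read_fast(struct pcpu_counter *c)   /* like bdi_stat() */
{
        return c->global;
}

static long read_exact(struct pcpu_counter *c)  /* like bdi_stat_sum() */
{
        long sum = c->global;
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                sum += c->delta[cpu];
        return sum;
}

int main(void)
{
        struct pcpu_counter dirty = { .global = 10, .delta = { 7, 7, 6, 5 } };
        long error = NR_CPUS * BATCH;   /* worst-case drift of the fast read */
        long thresh = 40;

        long nr = (thresh < 2 * error) ? read_exact(&dirty) : read_fast(&dirty);
        printf("thresh=%ld error=%ld counted=%ld over=%s\n",
               thresh, error, nr, nr > thresh ? "yes" : "no");
        return 0;
}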
@@ -270,7 +439,9 @@ static void balance_dirty_pages(struct address_space *mapping)
270 * background_thresh, to keep the amount of dirty memory low. 439 * background_thresh, to keep the amount of dirty memory low.
271 */ 440 */
272 if ((laptop_mode && pages_written) || 441 if ((laptop_mode && pages_written) ||
273 (!laptop_mode && (nr_reclaimable > background_thresh))) 442 (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
443 + global_page_state(NR_UNSTABLE_NFS)
444 > background_thresh)))
274 pdflush_operation(background_writeout, 0); 445 pdflush_operation(background_writeout, 0);
275} 446}
276 447
@@ -306,7 +477,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
306 unsigned long *p; 477 unsigned long *p;
307 478
308 ratelimit = ratelimit_pages; 479 ratelimit = ratelimit_pages;
309 if (dirty_exceeded) 480 if (mapping->backing_dev_info->dirty_exceeded)
310 ratelimit = 8; 481 ratelimit = 8;
311 482
312 /* 483 /*
@@ -331,18 +502,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
331 long background_thresh; 502 long background_thresh;
332 long dirty_thresh; 503 long dirty_thresh;
333 504
334 if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO)) {
335 /*
336 * The caller might hold locks which can prevent IO completion
337 * or progress in the filesystem. So we cannot just sit here
338 * waiting for IO to complete.
339 */
340 congestion_wait(WRITE, HZ/10);
341 return;
342 }
343
344 for ( ; ; ) { 505 for ( ; ; ) {
345 get_dirty_limits(&background_thresh, &dirty_thresh, NULL); 506 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
346 507
347 /* 508 /*
348 * Boost the allowable dirty threshold a bit for page 509 * Boost the allowable dirty threshold a bit for page
@@ -354,6 +515,14 @@ void throttle_vm_writeout(gfp_t gfp_mask)
354 global_page_state(NR_WRITEBACK) <= dirty_thresh) 515 global_page_state(NR_WRITEBACK) <= dirty_thresh)
355 break; 516 break;
356 congestion_wait(WRITE, HZ/10); 517 congestion_wait(WRITE, HZ/10);
518
519 /*
520 * The caller might hold locks which can prevent IO completion
521 * or progress in the filesystem. So we cannot just sit here
522 * waiting for IO to complete.
523 */
524 if ((gfp_mask & (__GFP_FS|__GFP_IO)) != (__GFP_FS|__GFP_IO))
525 break;
357 } 526 }
358} 527}
359 528
@@ -377,11 +546,12 @@ static void background_writeout(unsigned long _min_pages)
377 long background_thresh; 546 long background_thresh;
378 long dirty_thresh; 547 long dirty_thresh;
379 548
380 get_dirty_limits(&background_thresh, &dirty_thresh, NULL); 549 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
381 if (global_page_state(NR_FILE_DIRTY) + 550 if (global_page_state(NR_FILE_DIRTY) +
382 global_page_state(NR_UNSTABLE_NFS) < background_thresh 551 global_page_state(NR_UNSTABLE_NFS) < background_thresh
383 && min_pages <= 0) 552 && min_pages <= 0)
384 break; 553 break;
554 wbc.more_io = 0;
385 wbc.encountered_congestion = 0; 555 wbc.encountered_congestion = 0;
386 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 556 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
387 wbc.pages_skipped = 0; 557 wbc.pages_skipped = 0;
@@ -389,8 +559,9 @@ static void background_writeout(unsigned long _min_pages)
389 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; 559 min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
390 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { 560 if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
391 /* Wrote less than expected */ 561 /* Wrote less than expected */
392 congestion_wait(WRITE, HZ/10); 562 if (wbc.encountered_congestion || wbc.more_io)
393 if (!wbc.encountered_congestion) 563 congestion_wait(WRITE, HZ/10);
564 else
394 break; 565 break;
395 } 566 }
396 } 567 }
@@ -455,11 +626,12 @@ static void wb_kupdate(unsigned long arg)
455 global_page_state(NR_UNSTABLE_NFS) + 626 global_page_state(NR_UNSTABLE_NFS) +
456 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 627 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
457 while (nr_to_write > 0) { 628 while (nr_to_write > 0) {
629 wbc.more_io = 0;
458 wbc.encountered_congestion = 0; 630 wbc.encountered_congestion = 0;
459 wbc.nr_to_write = MAX_WRITEBACK_PAGES; 631 wbc.nr_to_write = MAX_WRITEBACK_PAGES;
460 writeback_inodes(&wbc); 632 writeback_inodes(&wbc);
461 if (wbc.nr_to_write > 0) { 633 if (wbc.nr_to_write > 0) {
462 if (wbc.encountered_congestion) 634 if (wbc.encountered_congestion || wbc.more_io)
463 congestion_wait(WRITE, HZ/10); 635 congestion_wait(WRITE, HZ/10);
464 else 636 else
465 break; /* All the old data is written */ 637 break; /* All the old data is written */
@@ -580,9 +752,15 @@ static struct notifier_block __cpuinitdata ratelimit_nb = {
580 */ 752 */
581void __init page_writeback_init(void) 753void __init page_writeback_init(void)
582{ 754{
755 int shift;
756
583 mod_timer(&wb_timer, jiffies + dirty_writeback_interval); 757 mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
584 writeback_set_ratelimit(); 758 writeback_set_ratelimit();
585 register_cpu_notifier(&ratelimit_nb); 759 register_cpu_notifier(&ratelimit_nb);
760
761 shift = calc_period_shift();
762 prop_descriptor_init(&vm_completions, shift);
763 prop_descriptor_init(&vm_dirties, shift);
586} 764}
587 765
588/** 766/**
@@ -672,8 +850,10 @@ retry:
672 850
673 ret = (*writepage)(page, wbc, data); 851 ret = (*writepage)(page, wbc, data);
674 852
675 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) 853 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
676 unlock_page(page); 854 unlock_page(page);
855 ret = 0;
856 }
677 if (ret || (--(wbc->nr_to_write) <= 0)) 857 if (ret || (--(wbc->nr_to_write) <= 0))
678 done = 1; 858 done = 1;
679 if (wbc->nonblocking && bdi_write_congested(bdi)) { 859 if (wbc->nonblocking && bdi_write_congested(bdi)) {
@@ -827,6 +1007,8 @@ int __set_page_dirty_nobuffers(struct page *page)
827 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); 1007 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
828 if (mapping_cap_account_dirty(mapping)) { 1008 if (mapping_cap_account_dirty(mapping)) {
829 __inc_zone_page_state(page, NR_FILE_DIRTY); 1009 __inc_zone_page_state(page, NR_FILE_DIRTY);
1010 __inc_bdi_stat(mapping->backing_dev_info,
1011 BDI_RECLAIMABLE);
830 task_io_account_write(PAGE_CACHE_SIZE); 1012 task_io_account_write(PAGE_CACHE_SIZE);
831 } 1013 }
832 radix_tree_tag_set(&mapping->page_tree, 1014 radix_tree_tag_set(&mapping->page_tree,
@@ -859,7 +1041,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage);
859 * If the mapping doesn't provide a set_page_dirty a_op, then 1041 * If the mapping doesn't provide a set_page_dirty a_op, then
860 * just fall through and assume that it wants buffer_heads. 1042 * just fall through and assume that it wants buffer_heads.
861 */ 1043 */
862int fastcall set_page_dirty(struct page *page) 1044static int __set_page_dirty(struct page *page)
863{ 1045{
864 struct address_space *mapping = page_mapping(page); 1046 struct address_space *mapping = page_mapping(page);
865 1047
@@ -877,6 +1059,14 @@ int fastcall set_page_dirty(struct page *page)
877 } 1059 }
878 return 0; 1060 return 0;
879} 1061}
1062
1063int fastcall set_page_dirty(struct page *page)
1064{
1065 int ret = __set_page_dirty(page);
1066 if (ret)
1067 task_dirty_inc(current);
1068 return ret;
1069}
880EXPORT_SYMBOL(set_page_dirty); 1070EXPORT_SYMBOL(set_page_dirty);
881 1071
882/* 1072/*
@@ -961,6 +1151,8 @@ int clear_page_dirty_for_io(struct page *page)
961 */ 1151 */
962 if (TestClearPageDirty(page)) { 1152 if (TestClearPageDirty(page)) {
963 dec_zone_page_state(page, NR_FILE_DIRTY); 1153 dec_zone_page_state(page, NR_FILE_DIRTY);
1154 dec_bdi_stat(mapping->backing_dev_info,
1155 BDI_RECLAIMABLE);
964 return 1; 1156 return 1;
965 } 1157 }
966 return 0; 1158 return 0;
@@ -975,14 +1167,20 @@ int test_clear_page_writeback(struct page *page)
975 int ret; 1167 int ret;
976 1168
977 if (mapping) { 1169 if (mapping) {
1170 struct backing_dev_info *bdi = mapping->backing_dev_info;
978 unsigned long flags; 1171 unsigned long flags;
979 1172
980 write_lock_irqsave(&mapping->tree_lock, flags); 1173 write_lock_irqsave(&mapping->tree_lock, flags);
981 ret = TestClearPageWriteback(page); 1174 ret = TestClearPageWriteback(page);
982 if (ret) 1175 if (ret) {
983 radix_tree_tag_clear(&mapping->page_tree, 1176 radix_tree_tag_clear(&mapping->page_tree,
984 page_index(page), 1177 page_index(page),
985 PAGECACHE_TAG_WRITEBACK); 1178 PAGECACHE_TAG_WRITEBACK);
1179 if (bdi_cap_writeback_dirty(bdi)) {
1180 __dec_bdi_stat(bdi, BDI_WRITEBACK);
1181 __bdi_writeout_inc(bdi);
1182 }
1183 }
986 write_unlock_irqrestore(&mapping->tree_lock, flags); 1184 write_unlock_irqrestore(&mapping->tree_lock, flags);
987 } else { 1185 } else {
988 ret = TestClearPageWriteback(page); 1186 ret = TestClearPageWriteback(page);
@@ -998,14 +1196,18 @@ int test_set_page_writeback(struct page *page)
998 int ret; 1196 int ret;
999 1197
1000 if (mapping) { 1198 if (mapping) {
1199 struct backing_dev_info *bdi = mapping->backing_dev_info;
1001 unsigned long flags; 1200 unsigned long flags;
1002 1201
1003 write_lock_irqsave(&mapping->tree_lock, flags); 1202 write_lock_irqsave(&mapping->tree_lock, flags);
1004 ret = TestSetPageWriteback(page); 1203 ret = TestSetPageWriteback(page);
1005 if (!ret) 1204 if (!ret) {
1006 radix_tree_tag_set(&mapping->page_tree, 1205 radix_tree_tag_set(&mapping->page_tree,
1007 page_index(page), 1206 page_index(page),
1008 PAGECACHE_TAG_WRITEBACK); 1207 PAGECACHE_TAG_WRITEBACK);
1208 if (bdi_cap_writeback_dirty(bdi))
1209 __inc_bdi_stat(bdi, BDI_WRITEBACK);
1210 }
1009 if (!PageDirty(page)) 1211 if (!PageDirty(page))
1010 radix_tree_tag_clear(&mapping->page_tree, 1212 radix_tree_tag_clear(&mapping->page_tree,
1011 page_index(page), 1213 page_index(page),
@@ -1022,17 +1224,15 @@ int test_set_page_writeback(struct page *page)
1022EXPORT_SYMBOL(test_set_page_writeback); 1224EXPORT_SYMBOL(test_set_page_writeback);
1023 1225
1024/* 1226/*
1025 * Return true if any of the pages in the mapping are marged with the 1227 * Return true if any of the pages in the mapping are marked with the
1026 * passed tag. 1228 * passed tag.
1027 */ 1229 */
1028int mapping_tagged(struct address_space *mapping, int tag) 1230int mapping_tagged(struct address_space *mapping, int tag)
1029{ 1231{
1030 unsigned long flags;
1031 int ret; 1232 int ret;
1032 1233 rcu_read_lock();
1033 read_lock_irqsave(&mapping->tree_lock, flags);
1034 ret = radix_tree_tagged(&mapping->page_tree, tag); 1234 ret = radix_tree_tagged(&mapping->page_tree, tag);
1035 read_unlock_irqrestore(&mapping->tree_lock, flags); 1235 rcu_read_unlock();
1036 return ret; 1236 return ret;
1037} 1237}
1038EXPORT_SYMBOL(mapping_tagged); 1238EXPORT_SYMBOL(mapping_tagged);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1a8c59571cb7..43f757fcf30f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -27,6 +27,7 @@
27#include <linux/pagevec.h> 27#include <linux/pagevec.h>
28#include <linux/blkdev.h> 28#include <linux/blkdev.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/oom.h>
30#include <linux/notifier.h> 31#include <linux/notifier.h>
31#include <linux/topology.h> 32#include <linux/topology.h>
32#include <linux/sysctl.h> 33#include <linux/sysctl.h>
@@ -41,24 +42,37 @@
41#include <linux/pfn.h> 42#include <linux/pfn.h>
42#include <linux/backing-dev.h> 43#include <linux/backing-dev.h>
43#include <linux/fault-inject.h> 44#include <linux/fault-inject.h>
45#include <linux/page-isolation.h>
44 46
45#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
46#include <asm/div64.h> 48#include <asm/div64.h>
47#include "internal.h" 49#include "internal.h"
48 50
49/* 51/*
50 * MCD - HACK: Find somewhere to initialize this EARLY, or make this 52 * Array of node states.
51 * initializer cleaner
52 */ 53 */
53nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; 54nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
54EXPORT_SYMBOL(node_online_map); 55 [N_POSSIBLE] = NODE_MASK_ALL,
55nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; 56 [N_ONLINE] = { { [0] = 1UL } },
56EXPORT_SYMBOL(node_possible_map); 57#ifndef CONFIG_NUMA
58 [N_NORMAL_MEMORY] = { { [0] = 1UL } },
59#ifdef CONFIG_HIGHMEM
60 [N_HIGH_MEMORY] = { { [0] = 1UL } },
61#endif
62 [N_CPU] = { { [0] = 1UL } },
63#endif /* NUMA */
64};
65EXPORT_SYMBOL(node_states);
66
57unsigned long totalram_pages __read_mostly; 67unsigned long totalram_pages __read_mostly;
58unsigned long totalreserve_pages __read_mostly; 68unsigned long totalreserve_pages __read_mostly;
59long nr_swap_pages; 69long nr_swap_pages;
60int percpu_pagelist_fraction; 70int percpu_pagelist_fraction;
61 71
72#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
73int pageblock_order __read_mostly;
74#endif
75
62static void __free_pages_ok(struct page *page, unsigned int order); 76static void __free_pages_ok(struct page *page, unsigned int order);
63 77
64/* 78/*
@@ -137,7 +151,7 @@ static unsigned long __meminitdata dma_reserve;
137 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; 151 static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
138#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ 152#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
139 unsigned long __initdata required_kernelcore; 153 unsigned long __initdata required_kernelcore;
140 unsigned long __initdata required_movablecore; 154 static unsigned long __initdata required_movablecore;
141 unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; 155 unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
142 156
143 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ 157 /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
@@ -150,6 +164,14 @@ int nr_node_ids __read_mostly = MAX_NUMNODES;
150EXPORT_SYMBOL(nr_node_ids); 164EXPORT_SYMBOL(nr_node_ids);
151#endif 165#endif
152 166
167int page_group_by_mobility_disabled __read_mostly;
168
169static void set_pageblock_migratetype(struct page *page, int migratetype)
170{
171 set_pageblock_flags_group(page, (unsigned long)migratetype,
172 PB_migrate, PB_migrate_end);
173}
174
153#ifdef CONFIG_DEBUG_VM 175#ifdef CONFIG_DEBUG_VM
154static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 176static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
155{ 177{
@@ -293,16 +315,6 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
293 clear_highpage(page + i); 315 clear_highpage(page + i);
294} 316}
295 317
296/*
297 * function for dealing with page's order in buddy system.
298 * zone->lock is already acquired when we use these.
299 * So, we don't need atomic page->flags operations here.
300 */
301static inline unsigned long page_order(struct page *page)
302{
303 return page_private(page);
304}
305
306static inline void set_page_order(struct page *page, int order) 318static inline void set_page_order(struct page *page, int order)
307{ 319{
308 set_page_private(page, order); 320 set_page_private(page, order);
@@ -404,6 +416,7 @@ static inline void __free_one_page(struct page *page,
404{ 416{
405 unsigned long page_idx; 417 unsigned long page_idx;
406 int order_size = 1 << order; 418 int order_size = 1 << order;
419 int migratetype = get_pageblock_migratetype(page);
407 420
408 if (unlikely(PageCompound(page))) 421 if (unlikely(PageCompound(page)))
409 destroy_compound_page(page, order); 422 destroy_compound_page(page, order);
@@ -416,7 +429,6 @@ static inline void __free_one_page(struct page *page,
416 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); 429 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
417 while (order < MAX_ORDER-1) { 430 while (order < MAX_ORDER-1) {
418 unsigned long combined_idx; 431 unsigned long combined_idx;
419 struct free_area *area;
420 struct page *buddy; 432 struct page *buddy;
421 433
422 buddy = __page_find_buddy(page, page_idx, order); 434 buddy = __page_find_buddy(page, page_idx, order);
@@ -424,8 +436,7 @@ static inline void __free_one_page(struct page *page,
424 break; /* Move the buddy up one level. */ 436 break; /* Move the buddy up one level. */
425 437
426 list_del(&buddy->lru); 438 list_del(&buddy->lru);
427 area = zone->free_area + order; 439 zone->free_area[order].nr_free--;
428 area->nr_free--;
429 rmv_page_order(buddy); 440 rmv_page_order(buddy);
430 combined_idx = __find_combined_index(page_idx, order); 441 combined_idx = __find_combined_index(page_idx, order);
431 page = page + (combined_idx - page_idx); 442 page = page + (combined_idx - page_idx);
@@ -433,7 +444,8 @@ static inline void __free_one_page(struct page *page,
433 order++; 444 order++;
434 } 445 }
435 set_page_order(page, order); 446 set_page_order(page, order);
436 list_add(&page->lru, &zone->free_area[order].free_list); 447 list_add(&page->lru,
448 &zone->free_area[order].free_list[migratetype]);
437 zone->free_area[order].nr_free++; 449 zone->free_area[order].nr_free++;
438} 450}
439 451
@@ -478,7 +490,7 @@ static void free_pages_bulk(struct zone *zone, int count,
478 struct list_head *list, int order) 490 struct list_head *list, int order)
479{ 491{
480 spin_lock(&zone->lock); 492 spin_lock(&zone->lock);
481 zone->all_unreclaimable = 0; 493 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
482 zone->pages_scanned = 0; 494 zone->pages_scanned = 0;
483 while (count--) { 495 while (count--) {
484 struct page *page; 496 struct page *page;
@@ -495,7 +507,7 @@ static void free_pages_bulk(struct zone *zone, int count,
495static void free_one_page(struct zone *zone, struct page *page, int order) 507static void free_one_page(struct zone *zone, struct page *page, int order)
496{ 508{
497 spin_lock(&zone->lock); 509 spin_lock(&zone->lock);
498 zone->all_unreclaimable = 0; 510 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
499 zone->pages_scanned = 0; 511 zone->pages_scanned = 0;
500 __free_one_page(page, zone, order); 512 __free_one_page(page, zone, order);
501 spin_unlock(&zone->lock); 513 spin_unlock(&zone->lock);
@@ -567,7 +579,8 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
567 * -- wli 579 * -- wli
568 */ 580 */
569static inline void expand(struct zone *zone, struct page *page, 581static inline void expand(struct zone *zone, struct page *page,
570 int low, int high, struct free_area *area) 582 int low, int high, struct free_area *area,
583 int migratetype)
571{ 584{
572 unsigned long size = 1 << high; 585 unsigned long size = 1 << high;
573 586
@@ -576,7 +589,7 @@ static inline void expand(struct zone *zone, struct page *page,
576 high--; 589 high--;
577 size >>= 1; 590 size >>= 1;
578 VM_BUG_ON(bad_range(zone, &page[size])); 591 VM_BUG_ON(bad_range(zone, &page[size]));
579 list_add(&page[size].lru, &area->free_list); 592 list_add(&page[size].lru, &area->free_list[migratetype]);
580 area->nr_free++; 593 area->nr_free++;
581 set_page_order(&page[size], high); 594 set_page_order(&page[size], high);
582 } 595 }
@@ -628,49 +641,235 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
628 return 0; 641 return 0;
629} 642}
630 643
631/* 644/*
632 * Do the hard work of removing an element from the buddy allocator. 645 * Go through the free lists for the given migratetype and remove
633 * Call me with the zone->lock already held. 646 * the smallest available page from the freelists
634 */ 647 */
635static struct page *__rmqueue(struct zone *zone, unsigned int order) 648static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
649 int migratetype)
636{ 650{
637 struct free_area * area;
638 unsigned int current_order; 651 unsigned int current_order;
652 struct free_area * area;
639 struct page *page; 653 struct page *page;
640 654
655 /* Find a page of the appropriate size in the preferred list */
641 for (current_order = order; current_order < MAX_ORDER; ++current_order) { 656 for (current_order = order; current_order < MAX_ORDER; ++current_order) {
642 area = zone->free_area + current_order; 657 area = &(zone->free_area[current_order]);
643 if (list_empty(&area->free_list)) 658 if (list_empty(&area->free_list[migratetype]))
644 continue; 659 continue;
645 660
646 page = list_entry(area->free_list.next, struct page, lru); 661 page = list_entry(area->free_list[migratetype].next,
662 struct page, lru);
647 list_del(&page->lru); 663 list_del(&page->lru);
648 rmv_page_order(page); 664 rmv_page_order(page);
649 area->nr_free--; 665 area->nr_free--;
650 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); 666 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
651 expand(zone, page, order, current_order, area); 667 expand(zone, page, order, current_order, area, migratetype);
652 return page; 668 return page;
653 } 669 }
654 670
655 return NULL; 671 return NULL;
656} 672}
657 673
674
675/*
676 * This array describes the order lists are fallen back to when
677 * the free lists for the desirable migrate type are depleted
678 */
679static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = {
680 [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
681 [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE },
682 [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
683 [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */
684};
685
686/*
687 * Move the free pages in a range to the free lists of the requested type.
688 * Note that start_page and end_pages are not aligned on a pageblock
689 * boundary. If alignment is required, use move_freepages_block()
690 */
691int move_freepages(struct zone *zone,
692 struct page *start_page, struct page *end_page,
693 int migratetype)
694{
695 struct page *page;
696 unsigned long order;
697 int pages_moved = 0;
698
699#ifndef CONFIG_HOLES_IN_ZONE
700 /*
701 * page_zone is not safe to call in this context when
702 * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
703 * anyway as we check zone boundaries in move_freepages_block().
704 * Remove at a later date when no bug reports exist related to
705 * grouping pages by mobility
706 */
707 BUG_ON(page_zone(start_page) != page_zone(end_page));
708#endif
709
710 for (page = start_page; page <= end_page;) {
711 if (!pfn_valid_within(page_to_pfn(page))) {
712 page++;
713 continue;
714 }
715
716 if (!PageBuddy(page)) {
717 page++;
718 continue;
719 }
720
721 order = page_order(page);
722 list_del(&page->lru);
723 list_add(&page->lru,
724 &zone->free_area[order].free_list[migratetype]);
725 page += 1 << order;
726 pages_moved += 1 << order;
727 }
728
729 return pages_moved;
730}
731
732int move_freepages_block(struct zone *zone, struct page *page, int migratetype)
733{
734 unsigned long start_pfn, end_pfn;
735 struct page *start_page, *end_page;
736
737 start_pfn = page_to_pfn(page);
738 start_pfn = start_pfn & ~(pageblock_nr_pages-1);
739 start_page = pfn_to_page(start_pfn);
740 end_page = start_page + pageblock_nr_pages - 1;
741 end_pfn = start_pfn + pageblock_nr_pages - 1;
742
743 /* Do not cross zone boundaries */
744 if (start_pfn < zone->zone_start_pfn)
745 start_page = page;
746 if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages)
747 return 0;
748
749 return move_freepages(zone, start_page, end_page, migratetype);
750}
751
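move_freepages_block() above rounds an arbitrary page down to the start of its pageblock and clamps the block to the zone before handing the range to move_freepages(). The alignment arithmetic is just masking the pfn; a short sketch (pageblock_order 10 is only an example value):

#include <stdio.h>

#define PAGEBLOCK_ORDER    10
#define PAGEBLOCK_NR_PAGES (1UL << PAGEBLOCK_ORDER)

int main(void)
{
        unsigned long pfn = 1234567;    /* any page frame number */
        unsigned long start_pfn = pfn & ~(PAGEBLOCK_NR_PAGES - 1);
        unsigned long end_pfn = start_pfn + PAGEBLOCK_NR_PAGES - 1;

        /* all pages in [start_pfn, end_pfn] share one migratetype */
        printf("pfn %lu lives in pageblock [%lu, %lu]\n", pfn, start_pfn, end_pfn);
        return 0;
}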
752/* Return the page with the lowest PFN in the list */
753static struct page *min_page(struct list_head *list)
754{
755 unsigned long min_pfn = -1UL;
 756 struct page *min_page = NULL, *page;
757
758 list_for_each_entry(page, list, lru) {
759 unsigned long pfn = page_to_pfn(page);
760 if (pfn < min_pfn) {
761 min_pfn = pfn;
762 min_page = page;
763 }
764 }
765
766 return min_page;
767}
768
769/* Remove an element from the buddy allocator from the fallback list */
770static struct page *__rmqueue_fallback(struct zone *zone, int order,
771 int start_migratetype)
772{
773 struct free_area * area;
774 int current_order;
775 struct page *page;
776 int migratetype, i;
777
778 /* Find the largest possible block of pages in the other list */
779 for (current_order = MAX_ORDER-1; current_order >= order;
780 --current_order) {
781 for (i = 0; i < MIGRATE_TYPES - 1; i++) {
782 migratetype = fallbacks[start_migratetype][i];
783
784 /* MIGRATE_RESERVE handled later if necessary */
785 if (migratetype == MIGRATE_RESERVE)
786 continue;
787
788 area = &(zone->free_area[current_order]);
789 if (list_empty(&area->free_list[migratetype]))
790 continue;
791
792 /* Bias kernel allocations towards low pfns */
793 page = list_entry(area->free_list[migratetype].next,
794 struct page, lru);
795 if (unlikely(start_migratetype != MIGRATE_MOVABLE))
796 page = min_page(&area->free_list[migratetype]);
797 area->nr_free--;
798
799 /*
800 * If breaking a large block of pages, move all free
801 * pages to the preferred allocation list. If falling
802 * back for a reclaimable kernel allocation, be more
 803 * aggressive about taking ownership of free pages
804 */
805 if (unlikely(current_order >= (pageblock_order >> 1)) ||
806 start_migratetype == MIGRATE_RECLAIMABLE) {
807 unsigned long pages;
808 pages = move_freepages_block(zone, page,
809 start_migratetype);
810
811 /* Claim the whole block if over half of it is free */
812 if (pages >= (1 << (pageblock_order-1)))
813 set_pageblock_migratetype(page,
814 start_migratetype);
815
816 migratetype = start_migratetype;
817 }
818
819 /* Remove the page from the freelists */
820 list_del(&page->lru);
821 rmv_page_order(page);
822 __mod_zone_page_state(zone, NR_FREE_PAGES,
823 -(1UL << order));
824
825 if (current_order == pageblock_order)
826 set_pageblock_migratetype(page,
827 start_migratetype);
828
829 expand(zone, page, order, current_order, area, migratetype);
830 return page;
831 }
832 }
833
834 /* Use MIGRATE_RESERVE rather than fail an allocation */
835 return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
836}
837
838/*
839 * Do the hard work of removing an element from the buddy allocator.
840 * Call me with the zone->lock already held.
841 */
842static struct page *__rmqueue(struct zone *zone, unsigned int order,
843 int migratetype)
844{
845 struct page *page;
846
847 page = __rmqueue_smallest(zone, order, migratetype);
848
849 if (unlikely(!page))
850 page = __rmqueue_fallback(zone, order, migratetype);
851
852 return page;
853}
854
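__rmqueue_fallback() above walks the fallbacks[] table when the preferred migratetype's free lists are empty, starting from the largest order so that stealing one big block (and possibly claiming the whole pageblock) keeps the types from mixing at fine granularity. A reduced sketch of that search over a table of free counts (the real code walks free lists and moves pages; the sizes below are made up):

#include <stdio.h>

enum { UNMOVABLE, RECLAIMABLE, MOVABLE, RESERVE, NR_TYPES };
#define MAX_ORDER 11

/* same shape as the kernel's fallbacks[][] table */
static const int fallbacks[NR_TYPES][NR_TYPES - 1] = {
        [UNMOVABLE]   = { RECLAIMABLE, MOVABLE, RESERVE },
        [RECLAIMABLE] = { UNMOVABLE,   MOVABLE, RESERVE },
        [MOVABLE]     = { RECLAIMABLE, UNMOVABLE, RESERVE },
        [RESERVE]     = { RESERVE, RESERVE, RESERVE },
};

/* free_count[order][type] stands in for list_empty() checks on free_list[type] */
static int find_fallback(int free_count[MAX_ORDER][NR_TYPES],
                         int order, int start_type, int *out_order)
{
        for (int cur = MAX_ORDER - 1; cur >= order; cur--) {
                for (int i = 0; i < NR_TYPES - 1; i++) {
                        int type = fallbacks[start_type][i];
                        if (type == RESERVE)
                                continue;       /* MIGRATE_RESERVE handled last */
                        if (free_count[cur][type] > 0) {
                                *out_order = cur;
                                return type;
                        }
                }
        }
        return -1;      /* caller would fall back to MIGRATE_RESERVE */
}

int main(void)
{
        int free_count[MAX_ORDER][NR_TYPES] = { 0 };
        int found_order;

        free_count[9][MOVABLE] = 3;     /* only some large movable blocks left */
        int type = find_fallback(free_count, 2, UNMOVABLE, &found_order);
        printf("steal type %d at order %d\n", type, found_order);
        return 0;
}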
658/* 855/*
659 * Obtain a specified number of elements from the buddy allocator, all under 856 * Obtain a specified number of elements from the buddy allocator, all under
660 * a single hold of the lock, for efficiency. Add them to the supplied list. 857 * a single hold of the lock, for efficiency. Add them to the supplied list.
661 * Returns the number of new pages which were placed at *list. 858 * Returns the number of new pages which were placed at *list.
662 */ 859 */
663static int rmqueue_bulk(struct zone *zone, unsigned int order, 860static int rmqueue_bulk(struct zone *zone, unsigned int order,
664 unsigned long count, struct list_head *list) 861 unsigned long count, struct list_head *list,
862 int migratetype)
665{ 863{
666 int i; 864 int i;
667 865
668 spin_lock(&zone->lock); 866 spin_lock(&zone->lock);
669 for (i = 0; i < count; ++i) { 867 for (i = 0; i < count; ++i) {
670 struct page *page = __rmqueue(zone, order); 868 struct page *page = __rmqueue(zone, order, migratetype);
671 if (unlikely(page == NULL)) 869 if (unlikely(page == NULL))
672 break; 870 break;
673 list_add_tail(&page->lru, list); 871 list_add(&page->lru, list);
872 set_page_private(page, migratetype);
674 } 873 }
675 spin_unlock(&zone->lock); 874 spin_unlock(&zone->lock);
676 return i; 875 return i;
@@ -732,7 +931,7 @@ void mark_free_pages(struct zone *zone)
732{ 931{
733 unsigned long pfn, max_zone_pfn; 932 unsigned long pfn, max_zone_pfn;
734 unsigned long flags; 933 unsigned long flags;
735 int order; 934 int order, t;
736 struct list_head *curr; 935 struct list_head *curr;
737 936
738 if (!zone->spanned_pages) 937 if (!zone->spanned_pages)
@@ -749,17 +948,18 @@ void mark_free_pages(struct zone *zone)
749 swsusp_unset_page_free(page); 948 swsusp_unset_page_free(page);
750 } 949 }
751 950
752 for (order = MAX_ORDER - 1; order >= 0; --order) 951 for_each_migratetype_order(order, t) {
753 list_for_each(curr, &zone->free_area[order].free_list) { 952 list_for_each(curr, &zone->free_area[order].free_list[t]) {
754 unsigned long i; 953 unsigned long i;
755 954
756 pfn = page_to_pfn(list_entry(curr, struct page, lru)); 955 pfn = page_to_pfn(list_entry(curr, struct page, lru));
757 for (i = 0; i < (1UL << order); i++) 956 for (i = 0; i < (1UL << order); i++)
758 swsusp_set_page_free(pfn_to_page(pfn + i)); 957 swsusp_set_page_free(pfn_to_page(pfn + i));
759 } 958 }
760 959 }
761 spin_unlock_irqrestore(&zone->lock, flags); 960 spin_unlock_irqrestore(&zone->lock, flags);
762} 961}
962#endif /* CONFIG_PM */
763 963
764/* 964/*
765 * Spill all of this CPU's per-cpu pages back into the buddy allocator. 965 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
@@ -772,7 +972,25 @@ void drain_local_pages(void)
772 __drain_pages(smp_processor_id()); 972 __drain_pages(smp_processor_id());
773 local_irq_restore(flags); 973 local_irq_restore(flags);
774} 974}
775#endif /* CONFIG_HIBERNATION */ 975
976void smp_drain_local_pages(void *arg)
977{
978 drain_local_pages();
979}
980
981/*
982 * Spill all the per-cpu pages from all CPUs back into the buddy allocator
983 */
984void drain_all_local_pages(void)
985{
986 unsigned long flags;
987
988 local_irq_save(flags);
989 __drain_pages(smp_processor_id());
990 local_irq_restore(flags);
991
992 smp_call_function(smp_drain_local_pages, NULL, 0, 1);
993}
776 994
777/* 995/*
778 * Free a 0-order page 996 * Free a 0-order page
@@ -797,6 +1015,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
797 local_irq_save(flags); 1015 local_irq_save(flags);
798 __count_vm_event(PGFREE); 1016 __count_vm_event(PGFREE);
799 list_add(&page->lru, &pcp->list); 1017 list_add(&page->lru, &pcp->list);
1018 set_page_private(page, get_pageblock_migratetype(page));
800 pcp->count++; 1019 pcp->count++;
801 if (pcp->count >= pcp->high) { 1020 if (pcp->count >= pcp->high) {
802 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1021 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -846,6 +1065,7 @@ static struct page *buffered_rmqueue(struct zonelist *zonelist,
846 struct page *page; 1065 struct page *page;
847 int cold = !!(gfp_flags & __GFP_COLD); 1066 int cold = !!(gfp_flags & __GFP_COLD);
848 int cpu; 1067 int cpu;
1068 int migratetype = allocflags_to_migratetype(gfp_flags);
849 1069
850again: 1070again:
851 cpu = get_cpu(); 1071 cpu = get_cpu();
@@ -856,16 +1076,28 @@ again:
856 local_irq_save(flags); 1076 local_irq_save(flags);
857 if (!pcp->count) { 1077 if (!pcp->count) {
858 pcp->count = rmqueue_bulk(zone, 0, 1078 pcp->count = rmqueue_bulk(zone, 0,
859 pcp->batch, &pcp->list); 1079 pcp->batch, &pcp->list, migratetype);
860 if (unlikely(!pcp->count)) 1080 if (unlikely(!pcp->count))
861 goto failed; 1081 goto failed;
862 } 1082 }
863 page = list_entry(pcp->list.next, struct page, lru); 1083
1084 /* Find a page of the appropriate migrate type */
1085 list_for_each_entry(page, &pcp->list, lru)
1086 if (page_private(page) == migratetype)
1087 break;
1088
1089 /* Allocate more to the pcp list if necessary */
1090 if (unlikely(&page->lru == &pcp->list)) {
1091 pcp->count += rmqueue_bulk(zone, 0,
1092 pcp->batch, &pcp->list, migratetype);
1093 page = list_entry(pcp->list.next, struct page, lru);
1094 }
1095
864 list_del(&page->lru); 1096 list_del(&page->lru);
865 pcp->count--; 1097 pcp->count--;
866 } else { 1098 } else {
867 spin_lock_irqsave(&zone->lock, flags); 1099 spin_lock_irqsave(&zone->lock, flags);
868 page = __rmqueue(zone, order); 1100 page = __rmqueue(zone, order, migratetype);
869 spin_unlock(&zone->lock); 1101 spin_unlock(&zone->lock);
870 if (!page) 1102 if (!page)
871 goto failed; 1103 goto failed;
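buffered_rmqueue() now tags each page on the per-cpu list with its migratetype (stashed in page->private by rmqueue_bulk()) and searches the list for a match before refilling it. A reduced sketch of that scan over a plain singly linked list (no kernel list_head here; the structure and names are purely illustrative):

#include <stdio.h>
#include <stddef.h>

struct fake_page {
        int migratetype;        /* what the kernel keeps in page_private() */
        struct fake_page *next;
};

/* Return the first page of the wanted type, or NULL to signal "refill the pcp list". */
static struct fake_page *pick_page(struct fake_page *pcp_list, int migratetype)
{
        for (struct fake_page *p = pcp_list; p != NULL; p = p->next)
                if (p->migratetype == migratetype)
                        return p;
        return NULL;
}

int main(void)
{
        struct fake_page c = { 2, NULL }, b = { 0, &c }, a = { 1, &b };

        struct fake_page *p = pick_page(&a, 2);
        printf("found type %d\n", p ? p->migratetype : -1);
        printf("type 3: %s\n", pick_page(&a, 3) ? "hit" : "miss, refill pcp list");
        return 0;
}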
@@ -1032,7 +1264,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
1032 * 1264 *
1033 * If the zonelist cache is present in the passed in zonelist, then 1265 * If the zonelist cache is present in the passed in zonelist, then
1034 * returns a pointer to the allowed node mask (either the current 1266 * returns a pointer to the allowed node mask (either the current
1035 * tasks mems_allowed, or node_online_map.) 1267 * tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
1036 * 1268 *
1037 * If the zonelist cache is not available for this zonelist, does 1269 * If the zonelist cache is not available for this zonelist, does
1038 * nothing and returns NULL. 1270 * nothing and returns NULL.
@@ -1061,7 +1293,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1061 1293
1062 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? 1294 allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
1063 &cpuset_current_mems_allowed : 1295 &cpuset_current_mems_allowed :
1064 &node_online_map; 1296 &node_states[N_HIGH_MEMORY];
1065 return allowednodes; 1297 return allowednodes;
1066} 1298}
1067 1299
@@ -1183,9 +1415,6 @@ zonelist_scan:
1183 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1415 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1184 continue; 1416 continue;
1185 zone = *z; 1417 zone = *z;
1186 if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
1187 zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
1188 break;
1189 if ((alloc_flags & ALLOC_CPUSET) && 1418 if ((alloc_flags & ALLOC_CPUSET) &&
1190 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1419 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1191 goto try_next_zone; 1420 goto try_next_zone;
@@ -1254,7 +1483,10 @@ restart:
1254 z = zonelist->zones; /* the list of zones suitable for gfp_mask */ 1483 z = zonelist->zones; /* the list of zones suitable for gfp_mask */
1255 1484
1256 if (unlikely(*z == NULL)) { 1485 if (unlikely(*z == NULL)) {
1257 /* Should this ever happen?? */ 1486 /*
1487 * Happens if we have an empty zonelist as a result of
1488 * GFP_THISNODE being used on a memoryless node
1489 */
1258 return NULL; 1490 return NULL;
1259 } 1491 }
1260 1492
@@ -1346,12 +1578,20 @@ nofail_alloc:
1346 1578
1347 cond_resched(); 1579 cond_resched();
1348 1580
1581 if (order != 0)
1582 drain_all_local_pages();
1583
1349 if (likely(did_some_progress)) { 1584 if (likely(did_some_progress)) {
1350 page = get_page_from_freelist(gfp_mask, order, 1585 page = get_page_from_freelist(gfp_mask, order,
1351 zonelist, alloc_flags); 1586 zonelist, alloc_flags);
1352 if (page) 1587 if (page)
1353 goto got_pg; 1588 goto got_pg;
1354 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 1589 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1590 if (!try_set_zone_oom(zonelist)) {
1591 schedule_timeout_uninterruptible(1);
1592 goto restart;
1593 }
1594
1355 /* 1595 /*
1356 * Go through the zonelist yet one more time, keep 1596 * Go through the zonelist yet one more time, keep
1357 * very high watermark here, this is only to catch 1597 * very high watermark here, this is only to catch
@@ -1360,14 +1600,19 @@ nofail_alloc:
1360 */ 1600 */
1361 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1601 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
1362 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); 1602 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1363 if (page) 1603 if (page) {
1604 clear_zonelist_oom(zonelist);
1364 goto got_pg; 1605 goto got_pg;
1606 }
1365 1607
1366 /* The OOM killer will not help higher order allocs so fail */ 1608 /* The OOM killer will not help higher order allocs so fail */
1367 if (order > PAGE_ALLOC_COSTLY_ORDER) 1609 if (order > PAGE_ALLOC_COSTLY_ORDER) {
1610 clear_zonelist_oom(zonelist);
1368 goto nopage; 1611 goto nopage;
1612 }
1369 1613
1370 out_of_memory(zonelist, gfp_mask, order); 1614 out_of_memory(zonelist, gfp_mask, order);
1615 clear_zonelist_oom(zonelist);
1371 goto restart; 1616 goto restart;
1372 } 1617 }
1373 1618
@@ -1616,7 +1861,7 @@ void show_free_areas(void)
1616 K(zone_page_state(zone, NR_INACTIVE)), 1861 K(zone_page_state(zone, NR_INACTIVE)),
1617 K(zone->present_pages), 1862 K(zone->present_pages),
1618 zone->pages_scanned, 1863 zone->pages_scanned,
1619 (zone->all_unreclaimable ? "yes" : "no") 1864 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
1620 ); 1865 );
1621 printk("lowmem_reserve[]:"); 1866 printk("lowmem_reserve[]:");
1622 for (i = 0; i < MAX_NR_ZONES; i++) 1867 for (i = 0; i < MAX_NR_ZONES; i++)
@@ -1794,7 +2039,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
1794 return node; 2039 return node;
1795 } 2040 }
1796 2041
1797 for_each_online_node(n) { 2042 for_each_node_state(n, N_HIGH_MEMORY) {
1798 cpumask_t tmp; 2043 cpumask_t tmp;
1799 2044
1800 /* Don't want a node to appear more than once */ 2045 /* Don't want a node to appear more than once */
@@ -1850,6 +2095,22 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
1850} 2095}
1851 2096
1852/* 2097/*
2098 * Build gfp_thisnode zonelists
2099 */
2100static void build_thisnode_zonelists(pg_data_t *pgdat)
2101{
2102 enum zone_type i;
2103 int j;
2104 struct zonelist *zonelist;
2105
2106 for (i = 0; i < MAX_NR_ZONES; i++) {
2107 zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i;
2108 j = build_zonelists_node(pgdat, zonelist, 0, i);
2109 zonelist->zones[j] = NULL;
2110 }
2111}
2112
2113/*
1853 * Build zonelists ordered by zone and nodes within zones. 2114 * Build zonelists ordered by zone and nodes within zones.
1854 * This results in conserving DMA zone[s] until all Normal memory is 2115 * This results in conserving DMA zone[s] until all Normal memory is
1855 * exhausted, but results in overflowing to remote node while memory 2116 * exhausted, but results in overflowing to remote node while memory
@@ -1915,7 +2176,8 @@ static int default_zonelist_order(void)
1915 * If there is a node whose DMA/DMA32 memory is very big area on 2176 * If there is a node whose DMA/DMA32 memory is very big area on
1916 * local memory, NODE_ORDER may be suitable. 2177 * local memory, NODE_ORDER may be suitable.
1917 */ 2178 */
1918 average_size = total_size / (num_online_nodes() + 1); 2179 average_size = total_size /
2180 (nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
1919 for_each_online_node(nid) { 2181 for_each_online_node(nid) {
1920 low_kmem_size = 0; 2182 low_kmem_size = 0;
1921 total_size = 0; 2183 total_size = 0;
@@ -1953,7 +2215,7 @@ static void build_zonelists(pg_data_t *pgdat)
1953 int order = current_zonelist_order; 2215 int order = current_zonelist_order;
1954 2216
1955 /* initialize zonelists */ 2217 /* initialize zonelists */
1956 for (i = 0; i < MAX_NR_ZONES; i++) { 2218 for (i = 0; i < MAX_ZONELISTS; i++) {
1957 zonelist = pgdat->node_zonelists + i; 2219 zonelist = pgdat->node_zonelists + i;
1958 zonelist->zones[0] = NULL; 2220 zonelist->zones[0] = NULL;
1959 } 2221 }
@@ -1998,6 +2260,8 @@ static void build_zonelists(pg_data_t *pgdat)
1998 /* calculate node order -- i.e., DMA last! */ 2260 /* calculate node order -- i.e., DMA last! */
1999 build_zonelists_in_zone_order(pgdat, j); 2261 build_zonelists_in_zone_order(pgdat, j);
2000 } 2262 }
2263
2264 build_thisnode_zonelists(pgdat);
2001} 2265}
2002 2266
2003/* Construct the zonelist performance cache - see further mmzone.h */ 2267/* Construct the zonelist performance cache - see further mmzone.h */
@@ -2078,8 +2342,10 @@ static int __build_all_zonelists(void *dummy)
2078 int nid; 2342 int nid;
2079 2343
2080 for_each_online_node(nid) { 2344 for_each_online_node(nid) {
2081 build_zonelists(NODE_DATA(nid)); 2345 pg_data_t *pgdat = NODE_DATA(nid);
2082 build_zonelist_cache(NODE_DATA(nid)); 2346
2347 build_zonelists(pgdat);
2348 build_zonelist_cache(pgdat);
2083 } 2349 }
2084 return 0; 2350 return 0;
2085} 2351}
@@ -2098,9 +2364,23 @@ void build_all_zonelists(void)
2098 /* cpuset refresh routine should be here */ 2364 /* cpuset refresh routine should be here */
2099 } 2365 }
2100 vm_total_pages = nr_free_pagecache_pages(); 2366 vm_total_pages = nr_free_pagecache_pages();
2101 printk("Built %i zonelists in %s order. Total pages: %ld\n", 2367 /*
2368 * Disable grouping by mobility if the number of pages in the
2369 * system is too low to allow the mechanism to work. It would be
2370 * more accurate, but expensive to check per-zone. This check is
2371 * made on memory-hotadd so a system can start with mobility
2372 * disabled and enable it later
2373 */
2374 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
2375 page_group_by_mobility_disabled = 1;
2376 else
2377 page_group_by_mobility_disabled = 0;
2378
2379 printk("Built %i zonelists in %s order, mobility grouping %s. "
2380 "Total pages: %ld\n",
2102 num_online_nodes(), 2381 num_online_nodes(),
2103 zonelist_order_name[current_zonelist_order], 2382 zonelist_order_name[current_zonelist_order],
2383 page_group_by_mobility_disabled ? "off" : "on",
2104 vm_total_pages); 2384 vm_total_pages);
2105#ifdef CONFIG_NUMA 2385#ifdef CONFIG_NUMA
2106 printk("Policy zone: %s\n", zone_names[policy_zone]); 2386 printk("Policy zone: %s\n", zone_names[policy_zone]);
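The new check in build_all_zonelists() turns page grouping by mobility off when there is less than one pageblock's worth of memory per migratetype, since the mechanism cannot usefully segregate anything smaller. Rough numbers, assuming a 512-page pageblock (2MB with 4K pages) and five migratetypes; both values depend on the configuration:

#include <stdio.h>

int main(void)
{
        unsigned long pageblock_nr_pages = 512; /* e.g. hugepage-sized blocks on x86 */
        unsigned long migrate_types = 5;        /* unmovable, reclaimable, movable,
                                                   reserve, isolate (assumed count) */
        unsigned long cutoff = pageblock_nr_pages * migrate_types;

        /* systems with fewer managed pages than this run with grouping disabled */
        printf("cutoff = %lu pages (%lu KB with 4K pages)\n", cutoff, cutoff * 4);
        return 0;
}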
@@ -2176,6 +2456,61 @@ static inline unsigned long wait_table_bits(unsigned long size)
2176#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) 2456#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
2177 2457
2178/* 2458/*
2459 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
2460 * of blocks reserved is based on zone->pages_min. The memory within the
2461 * reserve will tend to store contiguous free pages. Setting min_free_kbytes
2462 * higher will lead to a bigger reserve which will get freed as contiguous
2463 * blocks as reclaim kicks in
2464 */
2465static void setup_zone_migrate_reserve(struct zone *zone)
2466{
2467 unsigned long start_pfn, pfn, end_pfn;
2468 struct page *page;
2469 unsigned long reserve, block_migratetype;
2470
2471 /* Get the start pfn, end pfn and the number of blocks to reserve */
2472 start_pfn = zone->zone_start_pfn;
2473 end_pfn = start_pfn + zone->spanned_pages;
2474 reserve = roundup(zone->pages_min, pageblock_nr_pages) >>
2475 pageblock_order;
2476
2477 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
2478 if (!pfn_valid(pfn))
2479 continue;
2480 page = pfn_to_page(pfn);
2481
2482 /* Blocks with reserved pages will never free, skip them. */
2483 if (PageReserved(page))
2484 continue;
2485
2486 block_migratetype = get_pageblock_migratetype(page);
2487
2488 /* If this block is reserved, account for it */
2489 if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) {
2490 reserve--;
2491 continue;
2492 }
2493
2494 /* Suitable for reserving if this block is movable */
2495 if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) {
2496 set_pageblock_migratetype(page, MIGRATE_RESERVE);
2497 move_freepages_block(zone, page, MIGRATE_RESERVE);
2498 reserve--;
2499 continue;
2500 }
2501
2502 /*
 2503 * If the reserve is met and this is a previously reserved block,
2504 * take it back
2505 */
2506 if (block_migratetype == MIGRATE_RESERVE) {
2507 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2508 move_freepages_block(zone, page, MIGRATE_MOVABLE);
2509 }
2510 }
2511}
2512
2513/*
2179 * Initially all pages are reserved - free ones are freed 2514 * Initially all pages are reserved - free ones are freed
2180 * up by free_all_bootmem() once the early boot process is 2515 * up by free_all_bootmem() once the early boot process is
2181 * done. Non-atomic initialization, single-pass. 2516 * done. Non-atomic initialization, single-pass.
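setup_zone_migrate_reserve() above sizes the MIGRATE_RESERVE area from zone->pages_min: the watermark is rounded up to whole pageblocks and converted to a block count, so raising min_free_kbytes directly grows the number of reserved blocks. A quick arithmetic sketch (the watermark and block size are example values):

#include <stdio.h>

#define PAGEBLOCK_ORDER    10
#define PAGEBLOCK_NR_PAGES (1UL << PAGEBLOCK_ORDER)

/* round x up to a multiple of y, like the kernel's roundup() */
#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
        unsigned long pages_min = 1440; /* example zone watermark, in pages */
        unsigned long reserve = ROUNDUP(pages_min, PAGEBLOCK_NR_PAGES)
                                        >> PAGEBLOCK_ORDER;

        printf("pages_min=%lu -> %lu reserved pageblock(s)\n", pages_min, reserve);
        return 0;
}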
@@ -2204,6 +2539,19 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2204 init_page_count(page); 2539 init_page_count(page);
2205 reset_page_mapcount(page); 2540 reset_page_mapcount(page);
2206 SetPageReserved(page); 2541 SetPageReserved(page);
2542
2543 /*
2544 * Mark the block movable so that blocks are reserved for
2545 * movable at startup. This will force kernel allocations
2546 * to reserve their blocks rather than leaking throughout
2547 * the address space during boot when many long-lived
2548 * kernel allocations are made. Later some blocks near
2549 * the start are marked MIGRATE_RESERVE by
2550 * setup_zone_migrate_reserve()
2551 */
2552 if ((pfn & (pageblock_nr_pages-1)))
2553 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
2554
2207 INIT_LIST_HEAD(&page->lru); 2555 INIT_LIST_HEAD(&page->lru);
2208#ifdef WANT_PAGE_VIRTUAL 2556#ifdef WANT_PAGE_VIRTUAL
2209 /* The shift won't overflow because ZONE_NORMAL is below 4G. */ 2557 /* The shift won't overflow because ZONE_NORMAL is below 4G. */
@@ -2216,9 +2564,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2216static void __meminit zone_init_free_lists(struct pglist_data *pgdat, 2564static void __meminit zone_init_free_lists(struct pglist_data *pgdat,
2217 struct zone *zone, unsigned long size) 2565 struct zone *zone, unsigned long size)
2218{ 2566{
2219 int order; 2567 int order, t;
2220 for (order = 0; order < MAX_ORDER ; order++) { 2568 for_each_migratetype_order(order, t) {
2221 INIT_LIST_HEAD(&zone->free_area[order].free_list); 2569 INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
2222 zone->free_area[order].nr_free = 0; 2570 zone->free_area[order].nr_free = 0;
2223 } 2571 }
2224} 2572}
@@ -2324,6 +2672,9 @@ static struct per_cpu_pageset boot_pageset[NR_CPUS];
2324static int __cpuinit process_zones(int cpu) 2672static int __cpuinit process_zones(int cpu)
2325{ 2673{
2326 struct zone *zone, *dzone; 2674 struct zone *zone, *dzone;
2675 int node = cpu_to_node(cpu);
2676
2677 node_set_state(node, N_CPU); /* this node has a cpu */
2327 2678
2328 for_each_zone(zone) { 2679 for_each_zone(zone) {
2329 2680
@@ -2331,7 +2682,7 @@ static int __cpuinit process_zones(int cpu)
2331 continue; 2682 continue;
2332 2683
2333 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 2684 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
2334 GFP_KERNEL, cpu_to_node(cpu)); 2685 GFP_KERNEL, node);
2335 if (!zone_pcp(zone, cpu)) 2686 if (!zone_pcp(zone, cpu))
2336 goto bad; 2687 goto bad;
2337 2688
@@ -2444,7 +2795,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
2444 * To use this new node's memory, further consideration will be 2795 * To use this new node's memory, further consideration will be
2445 * necessary. 2796 * necessary.
2446 */ 2797 */
2447 zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); 2798 zone->wait_table = vmalloc(alloc_size);
2448 } 2799 }
2449 if (!zone->wait_table) 2800 if (!zone->wait_table)
2450 return -ENOMEM; 2801 return -ENOMEM;
@@ -2680,10 +3031,8 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
2680 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); 3031 *end_pfn = max(*end_pfn, early_node_map[i].end_pfn);
2681 } 3032 }
2682 3033
2683 if (*start_pfn == -1UL) { 3034 if (*start_pfn == -1UL)
2684 printk(KERN_WARNING "Node %u active with no memory\n", nid);
2685 *start_pfn = 0; 3035 *start_pfn = 0;
2686 }
2687 3036
2688 /* Push the node boundaries out if requested */ 3037 /* Push the node boundaries out if requested */
2689 account_node_boundary(nid, start_pfn, end_pfn); 3038 account_node_boundary(nid, start_pfn, end_pfn);
@@ -2901,6 +3250,62 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
2901 realtotalpages); 3250 realtotalpages);
2902} 3251}
2903 3252
3253#ifndef CONFIG_SPARSEMEM
3254/*
3255 * Calculate the size of the zone->pageblock_flags bitmap, rounded up to an
3256 * unsigned long. Start by making sure zonesize is a multiple of
3257 * pageblock_nr_pages by rounding up, then use one NR_PAGEBLOCK_BITS worth of
3258 * bits per pageblock, round what is now in bits up to the nearest long in
3259 * bits, and finally return the result in bytes.
3260 */
3261static unsigned long __init usemap_size(unsigned long zonesize)
3262{
3263 unsigned long usemapsize;
3264
3265 usemapsize = roundup(zonesize, pageblock_nr_pages);
3266 usemapsize = usemapsize >> pageblock_order;
3267 usemapsize *= NR_PAGEBLOCK_BITS;
3268 usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
3269
3270 return usemapsize / 8;
3271}
3272
3273static void __init setup_usemap(struct pglist_data *pgdat,
3274 struct zone *zone, unsigned long zonesize)
3275{
3276 unsigned long usemapsize = usemap_size(zonesize);
3277 zone->pageblock_flags = NULL;
3278 if (usemapsize) {
3279 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
3280 memset(zone->pageblock_flags, 0, usemapsize);
3281 }
3282}
3283#else
3284static void inline setup_usemap(struct pglist_data *pgdat,
3285 struct zone *zone, unsigned long zonesize) {}
3286#endif /* CONFIG_SPARSEMEM */
3287
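usemap_size() boils down to a handful of integer operations. A runnable userspace sketch of the same steps, assuming pageblock_order 9 and three bits per pageblock; the real bit count comes from pageblock-flags.h, so treat these constants as assumptions.

#include <stdio.h>

#define PAGEBLOCK_ORDER    9UL			/* assumed */
#define PAGEBLOCK_NR_PAGES (1UL << PAGEBLOCK_ORDER)
#define NR_PAGEBLOCK_BITS  3UL			/* assumed: enough for the migrate types */

#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y))

/* Mirrors usemap_size() above: pageblocks -> bits -> whole longs -> bytes. */
static unsigned long usemap_size(unsigned long zonesize)
{
	unsigned long usemapsize;

	usemapsize = roundup(zonesize, PAGEBLOCK_NR_PAGES);
	usemapsize >>= PAGEBLOCK_ORDER;			/* number of pageblocks */
	usemapsize *= NR_PAGEBLOCK_BITS;		/* bits needed */
	usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
	return usemapsize / 8;				/* bytes */
}

int main(void)
{
	unsigned long zonesize = 262144;		/* a 1 GiB zone of 4 KiB pages */

	printf("zone of %lu pages -> %lu byte usemap\n",
	       zonesize, usemap_size(zonesize));
	return 0;
}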
3288#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
3289/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
3290static inline void __init set_pageblock_order(unsigned int order)
3291{
3292 /* Check that pageblock_nr_pages has not already been setup */
3293 if (pageblock_order)
3294 return;
3295
3296 /*
3297 * Assume the largest contiguous order of interest is a huge page.
3298 * This value may be variable depending on boot parameters on IA64
3299 */
3300 pageblock_order = order;
3301}
3302#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
3303
3304/* Defined this way to avoid accidentally referencing HUGETLB_PAGE_ORDER */
3305#define set_pageblock_order(x) do {} while (0)
3306
3307#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
3308
2904/* 3309/*
2905 * Set up the zone data structures: 3310 * Set up the zone data structures:
2906 * - mark all pages reserved 3311 * - mark all pages reserved
@@ -2977,10 +3382,12 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
2977 zone->nr_scan_active = 0; 3382 zone->nr_scan_active = 0;
2978 zone->nr_scan_inactive = 0; 3383 zone->nr_scan_inactive = 0;
2979 zap_zone_vm_stats(zone); 3384 zap_zone_vm_stats(zone);
2980 atomic_set(&zone->reclaim_in_progress, 0); 3385 zone->flags = 0;
2981 if (!size) 3386 if (!size)
2982 continue; 3387 continue;
2983 3388
3389 set_pageblock_order(HUGETLB_PAGE_ORDER);
3390 setup_usemap(pgdat, zone, size);
2984 ret = init_currently_empty_zone(zone, zone_start_pfn, 3391 ret = init_currently_empty_zone(zone, zone_start_pfn,
2985 size, MEMMAP_EARLY); 3392 size, MEMMAP_EARLY);
2986 BUG_ON(ret); 3393 BUG_ON(ret);
@@ -3234,16 +3641,24 @@ unsigned long __init find_max_pfn_with_active_regions(void)
3234 return max_pfn; 3641 return max_pfn;
3235} 3642}
3236 3643
3237unsigned long __init early_calculate_totalpages(void) 3644/*
3645 * early_calculate_totalpages()
3646 * Sum pages in active regions for movable zone.
3647 * Populate N_HIGH_MEMORY for calculating usable_nodes.
3648 */
3649static unsigned long __init early_calculate_totalpages(void)
3238{ 3650{
3239 int i; 3651 int i;
3240 unsigned long totalpages = 0; 3652 unsigned long totalpages = 0;
3241 3653
3242 for (i = 0; i < nr_nodemap_entries; i++) 3654 for (i = 0; i < nr_nodemap_entries; i++) {
3243 totalpages += early_node_map[i].end_pfn - 3655 unsigned long pages = early_node_map[i].end_pfn -
3244 early_node_map[i].start_pfn; 3656 early_node_map[i].start_pfn;
3245 3657 totalpages += pages;
3246 return totalpages; 3658 if (pages)
3659 node_set_state(early_node_map[i].nid, N_HIGH_MEMORY);
3660 }
3661 return totalpages;
3247} 3662}
3248 3663
3249/* 3664/*
@@ -3257,7 +3672,8 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3257 int i, nid; 3672 int i, nid;
3258 unsigned long usable_startpfn; 3673 unsigned long usable_startpfn;
3259 unsigned long kernelcore_node, kernelcore_remaining; 3674 unsigned long kernelcore_node, kernelcore_remaining;
3260 int usable_nodes = num_online_nodes(); 3675 unsigned long totalpages = early_calculate_totalpages();
3676 int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
3261 3677
3262 /* 3678 /*
3263 * If movablecore was specified, calculate what size of 3679 * If movablecore was specified, calculate what size of
@@ -3268,7 +3684,6 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3268 * what movablecore would have allowed. 3684 * what movablecore would have allowed.
3269 */ 3685 */
3270 if (required_movablecore) { 3686 if (required_movablecore) {
3271 unsigned long totalpages = early_calculate_totalpages();
3272 unsigned long corepages; 3687 unsigned long corepages;
3273 3688
3274 /* 3689 /*
@@ -3293,7 +3708,7 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
3293restart: 3708restart:
3294 /* Spread kernelcore memory as evenly as possible throughout nodes */ 3709 /* Spread kernelcore memory as evenly as possible throughout nodes */
3295 kernelcore_node = required_kernelcore / usable_nodes; 3710 kernelcore_node = required_kernelcore / usable_nodes;
3296 for_each_online_node(nid) { 3711 for_each_node_state(nid, N_HIGH_MEMORY) {
3297 /* 3712 /*
3298 * Recalculate kernelcore_node if the division per node 3713 * Recalculate kernelcore_node if the division per node
3299 * now exceeds what is necessary to satisfy the requested 3714 * now exceeds what is necessary to satisfy the requested
@@ -3385,6 +3800,20 @@ restart:
3385 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); 3800 roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
3386} 3801}
3387 3802
3803/* Any regular memory on that node ? */
3804static void check_for_regular_memory(pg_data_t *pgdat)
3805{
3806#ifdef CONFIG_HIGHMEM
3807 enum zone_type zone_type;
3808
3809 for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
3810 struct zone *zone = &pgdat->node_zones[zone_type];
3811 if (zone->present_pages)
3812 node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
3813 }
3814#endif
3815}
3816
3388/** 3817/**
3389 * free_area_init_nodes - Initialise all pg_data_t and zone data 3818 * free_area_init_nodes - Initialise all pg_data_t and zone data
3390 * @max_zone_pfn: an array of max PFNs for each zone 3819 * @max_zone_pfn: an array of max PFNs for each zone
@@ -3459,6 +3888,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
3459 pg_data_t *pgdat = NODE_DATA(nid); 3888 pg_data_t *pgdat = NODE_DATA(nid);
3460 free_area_init_node(nid, pgdat, NULL, 3889 free_area_init_node(nid, pgdat, NULL,
3461 find_min_pfn_for_node(nid), NULL); 3890 find_min_pfn_for_node(nid), NULL);
3891
3892 /* Any memory on that node */
3893 if (pgdat->node_present_pages)
3894 node_set_state(nid, N_HIGH_MEMORY);
3895 check_for_regular_memory(pgdat);
3462 } 3896 }
3463} 3897}
3464 3898
@@ -3673,6 +4107,7 @@ void setup_per_zone_pages_min(void)
3673 4107
3674 zone->pages_low = zone->pages_min + (tmp >> 2); 4108 zone->pages_low = zone->pages_min + (tmp >> 2);
3675 zone->pages_high = zone->pages_min + (tmp >> 1); 4109 zone->pages_high = zone->pages_min + (tmp >> 1);
4110 setup_zone_migrate_reserve(zone);
3676 spin_unlock_irqrestore(&zone->lru_lock, flags); 4111 spin_unlock_irqrestore(&zone->lru_lock, flags);
3677 } 4112 }
3678 4113
@@ -3934,4 +4369,169 @@ EXPORT_SYMBOL(pfn_to_page);
3934EXPORT_SYMBOL(page_to_pfn); 4369EXPORT_SYMBOL(page_to_pfn);
3935#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 4370#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
3936 4371
4372/* Return a pointer to the bitmap storing bits affecting a block of pages */
4373static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
4374 unsigned long pfn)
4375{
4376#ifdef CONFIG_SPARSEMEM
4377 return __pfn_to_section(pfn)->pageblock_flags;
4378#else
4379 return zone->pageblock_flags;
4380#endif /* CONFIG_SPARSEMEM */
4381}
4382
4383static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
4384{
4385#ifdef CONFIG_SPARSEMEM
4386 pfn &= (PAGES_PER_SECTION-1);
4387 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
4388#else
4389 pfn = pfn - zone->zone_start_pfn;
4390 return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
4391#endif /* CONFIG_SPARSEMEM */
4392}
4393
4394/**
4395 * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages
4396 * @page: The page within the block of interest
4397 * @start_bitidx: The first bit of interest to retrieve
4398 * @end_bitidx: The last bit of interest
4399 * returns pageblock_bits flags
4400 */
4401unsigned long get_pageblock_flags_group(struct page *page,
4402 int start_bitidx, int end_bitidx)
4403{
4404 struct zone *zone;
4405 unsigned long *bitmap;
4406 unsigned long pfn, bitidx;
4407 unsigned long flags = 0;
4408 unsigned long value = 1;
4409
4410 zone = page_zone(page);
4411 pfn = page_to_pfn(page);
4412 bitmap = get_pageblock_bitmap(zone, pfn);
4413 bitidx = pfn_to_bitidx(zone, pfn);
4414
4415 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
4416 if (test_bit(bitidx + start_bitidx, bitmap))
4417 flags |= value;
4418
4419 return flags;
4420}
4421
4422/**
4423 * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages
4424 * @page: The page within the block of interest
4425 * @start_bitidx: The first bit of interest
4426 * @end_bitidx: The last bit of interest
4427 * @flags: The flags to set
4428 */
4429void set_pageblock_flags_group(struct page *page, unsigned long flags,
4430 int start_bitidx, int end_bitidx)
4431{
4432 struct zone *zone;
4433 unsigned long *bitmap;
4434 unsigned long pfn, bitidx;
4435 unsigned long value = 1;
4436
4437 zone = page_zone(page);
4438 pfn = page_to_pfn(page);
4439 bitmap = get_pageblock_bitmap(zone, pfn);
4440 bitidx = pfn_to_bitidx(zone, pfn);
4441
4442 for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1)
4443 if (flags & value)
4444 __set_bit(bitidx + start_bitidx, bitmap);
4445 else
4446 __clear_bit(bitidx + start_bitidx, bitmap);
4447}
4448
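Both helpers above treat the usemap as NR_PAGEBLOCK_BITS consecutive bits per pageblock, indexed from the start of the zone (or section). A userspace sketch of the same indexing and bit walk, using a local array in place of zone->pageblock_flags; pageblock_order and the bit count are assumptions.

#include <stdio.h>

#define PAGEBLOCK_ORDER    9UL
#define NR_PAGEBLOCK_BITS  3
#define BITS_PER_LONG      ((int)(8 * sizeof(unsigned long)))

static unsigned long pageblock_flags[64];	/* stand-in for zone->pageblock_flags */
static unsigned long zone_start_pfn;		/* stand-in for zone->zone_start_pfn */

/* Mirrors the non-SPARSEMEM pfn_to_bitidx(): each block owns
 * NR_PAGEBLOCK_BITS consecutive bits, counted from the zone start. */
static unsigned long pfn_to_bitidx(unsigned long pfn)
{
	pfn -= zone_start_pfn;
	return (pfn >> PAGEBLOCK_ORDER) * NR_PAGEBLOCK_BITS;
}

/* Same walk as set_pageblock_flags_group(): one bit per loop iteration. */
static void set_flags(unsigned long pfn, unsigned long flags, int start, int end)
{
	unsigned long bitidx = pfn_to_bitidx(pfn);
	unsigned long value = 1;

	for (; start <= end; start++, value <<= 1) {
		unsigned long bit = bitidx + start;

		if (flags & value)
			pageblock_flags[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
		else
			pageblock_flags[bit / BITS_PER_LONG] &= ~(1UL << (bit % BITS_PER_LONG));
	}
}

/* Same walk as get_pageblock_flags_group(). */
static unsigned long get_flags(unsigned long pfn, int start, int end)
{
	unsigned long bitidx = pfn_to_bitidx(pfn);
	unsigned long flags = 0, value = 1;

	for (; start <= end; start++, value <<= 1) {
		unsigned long bit = bitidx + start;

		if (pageblock_flags[bit / BITS_PER_LONG] & (1UL << (bit % BITS_PER_LONG)))
			flags |= value;
	}
	return flags;
}

int main(void)
{
	unsigned long pfn = 3 * (1UL << PAGEBLOCK_ORDER) + 17;	/* any pfn in pageblock 3 */

	set_flags(pfn, 2, 0, NR_PAGEBLOCK_BITS - 1);	/* store an arbitrary 3-bit value */
	printf("value read back for pageblock 3: %lu\n",
	       get_flags(pfn, 0, NR_PAGEBLOCK_BITS - 1));
	return 0;
}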
4449/*
4450 * This is designed as a helper function; please see page_isolation.c too.
4451 * It sets/clears a pageblock's type to MIGRATE_ISOLATE.
4452 * The page allocator never allocates memory from an ISOLATE block.
4453 */
4454
4455int set_migratetype_isolate(struct page *page)
4456{
4457 struct zone *zone;
4458 unsigned long flags;
4459 int ret = -EBUSY;
4460
4461 zone = page_zone(page);
4462 spin_lock_irqsave(&zone->lock, flags);
4463 /*
4464 * In future, more migrate types will be able to be isolation target.
4465 */
4466 if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
4467 goto out;
4468 set_pageblock_migratetype(page, MIGRATE_ISOLATE);
4469 move_freepages_block(zone, page, MIGRATE_ISOLATE);
4470 ret = 0;
4471out:
4472 spin_unlock_irqrestore(&zone->lock, flags);
4473 if (!ret)
4474 drain_all_local_pages();
4475 return ret;
4476}
3937 4477
4478void unset_migratetype_isolate(struct page *page)
4479{
4480 struct zone *zone;
4481 unsigned long flags;
4482 zone = page_zone(page);
4483 spin_lock_irqsave(&zone->lock, flags);
4484 if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
4485 goto out;
4486 set_pageblock_migratetype(page, MIGRATE_MOVABLE);
4487 move_freepages_block(zone, page, MIGRATE_MOVABLE);
4488out:
4489 spin_unlock_irqrestore(&zone->lock, flags);
4490}
4491
4492#ifdef CONFIG_MEMORY_HOTREMOVE
4493/*
4494 * All pages in the range must be isolated before calling this.
4495 */
4496void
4497__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
4498{
4499 struct page *page;
4500 struct zone *zone;
4501 int order, i;
4502 unsigned long pfn;
4503 unsigned long flags;
4504 /* find the first valid pfn */
4505 for (pfn = start_pfn; pfn < end_pfn; pfn++)
4506 if (pfn_valid(pfn))
4507 break;
4508 if (pfn == end_pfn)
4509 return;
4510 zone = page_zone(pfn_to_page(pfn));
4511 spin_lock_irqsave(&zone->lock, flags);
4512 pfn = start_pfn;
4513 while (pfn < end_pfn) {
4514 if (!pfn_valid(pfn)) {
4515 pfn++;
4516 continue;
4517 }
4518 page = pfn_to_page(pfn);
4519 BUG_ON(page_count(page));
4520 BUG_ON(!PageBuddy(page));
4521 order = page_order(page);
4522#ifdef CONFIG_DEBUG_VM
4523 printk(KERN_INFO "remove from free list %lx %d %lx\n",
4524 pfn, 1 << order, end_pfn);
4525#endif
4526 list_del(&page->lru);
4527 rmv_page_order(page);
4528 zone->free_area[order].nr_free--;
4529 __mod_zone_page_state(zone, NR_FREE_PAGES,
4530 - (1UL << order));
4531 for (i = 0; i < (1 << order); i++)
4532 SetPageReserved((page+i));
4533 pfn += (1 << order);
4534 }
4535 spin_unlock_irqrestore(&zone->lock, flags);
4536}
4537#endif
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
new file mode 100644
index 000000000000..8f92a29695cc
--- /dev/null
+++ b/mm/page_isolation.c
@@ -0,0 +1,138 @@
1/*
2 * linux/mm/page_isolation.c
3 */
4
5#include <stddef.h>
6#include <linux/mm.h>
7#include <linux/page-isolation.h>
8#include <linux/pageblock-flags.h>
9#include "internal.h"
10
11static inline struct page *
12__first_valid_page(unsigned long pfn, unsigned long nr_pages)
13{
14 int i;
15 for (i = 0; i < nr_pages; i++)
16 if (pfn_valid_within(pfn + i))
17 break;
18 if (unlikely(i == nr_pages))
19 return NULL;
20 return pfn_to_page(pfn + i);
21}
22
23/*
24 * start_isolate_page_range() -- make the page-allocation-type of a range of
25 * pages MIGRATE_ISOLATE.
26 * @start_pfn: The lower PFN of the range to be isolated.
27 * @end_pfn: The upper PFN of the range to be isolated.
28 *
29 * Making the page-allocation-type MIGRATE_ISOLATE means that free pages in
30 * the range will never be allocated. Any free pages, and any pages freed
31 * there in the future, will not be handed out again.
32 *
33 * start_pfn/end_pfn must be aligned to pageblock_order.
34 * Returns 0 on success and -EBUSY if any part of range cannot be isolated.
35 */
36int
37start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
38{
39 unsigned long pfn;
40 unsigned long undo_pfn;
41 struct page *page;
42
43 BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
44 BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
45
46 for (pfn = start_pfn;
47 pfn < end_pfn;
48 pfn += pageblock_nr_pages) {
49 page = __first_valid_page(pfn, pageblock_nr_pages);
50 if (page && set_migratetype_isolate(page)) {
51 undo_pfn = pfn;
52 goto undo;
53 }
54 }
55 return 0;
56undo:
57 for (pfn = start_pfn;
58 pfn <= undo_pfn;
59 pfn += pageblock_nr_pages)
60 unset_migratetype_isolate(pfn_to_page(pfn));
61
62 return -EBUSY;
63}
64
65/*
66 * Make isolated pages available again.
67 */
68int
69undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn)
70{
71 unsigned long pfn;
72 struct page *page;
73 BUG_ON((start_pfn) & (pageblock_nr_pages - 1));
74 BUG_ON((end_pfn) & (pageblock_nr_pages - 1));
75 for (pfn = start_pfn;
76 pfn < end_pfn;
77 pfn += pageblock_nr_pages) {
78 page = __first_valid_page(pfn, pageblock_nr_pages);
79 if (!page || get_pageblock_flags(page) != MIGRATE_ISOLATE)
80 continue;
81 unset_migratetype_isolate(page);
82 }
83 return 0;
84}
85/*
86 * Test whether all pages in the range are free (that is, isolated).
87 * All pages in [start_pfn...end_pfn) must be in the same zone.
88 * zone->lock must be held before calling this.
89 *
90 * Returns 1 if all pages in the range are isolated, 0 otherwise.
91 */
92static int
93__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
94{
95 struct page *page;
96
97 while (pfn < end_pfn) {
98 if (!pfn_valid_within(pfn)) {
99 pfn++;
100 continue;
101 }
102 page = pfn_to_page(pfn);
103 if (PageBuddy(page))
104 pfn += 1 << page_order(page);
105 else if (page_count(page) == 0 &&
106 page_private(page) == MIGRATE_ISOLATE)
107 pfn += 1;
108 else
109 break;
110 }
111 if (pfn < end_pfn)
112 return 0;
113 return 1;
114}
115
116int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
117{
118 unsigned long pfn;
119 struct page *page;
120
121 pfn = start_pfn;
122 /*
123 * Note: pageblock_nr_pages may differ from MAX_ORDER_NR_PAGES, so a chunk
124 * of free pages is not necessarily aligned to pageblock_nr_pages.
125 * Because of that, check the pageblock migratetype first.
126 */
127 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
128 page = __first_valid_page(pfn, pageblock_nr_pages);
129 if (page && get_pageblock_flags(page) != MIGRATE_ISOLATE)
130 break;
131 }
132 if (pfn < end_pfn)
133 return -EBUSY;
134 /* Check all pages are free or Marked as ISOLATED */
135 if (__test_page_isolated_in_pageblock(start_pfn, end_pfn))
136 return 0;
137 return -EBUSY;
138}
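These helpers, together with set_migratetype_isolate()/unset_migratetype_isolate() and __offline_isolated_pages() added to page_alloc.c above, are meant to be driven in a fixed order by the memory hot-remove path. A hedged sketch of that expected calling sequence follows; it is not code from this patch, the migration of in-use pages is elided, and the header choices are assumptions.

#include <linux/page-isolation.h>
#include <linux/memory_hotplug.h>	/* assumed home of __offline_isolated_pages() */

static int offline_range(unsigned long start_pfn, unsigned long end_pfn)
{
	int ret;

	/* 1. Stop the allocator from handing out pages in the range. */
	ret = start_isolate_page_range(start_pfn, end_pfn);
	if (ret)
		return ret;

	/* 2. ... migrate any in-use pages out of [start_pfn, end_pfn) ... */

	/* 3. Verify every page is now free and marked MIGRATE_ISOLATE. */
	ret = test_pages_isolated(start_pfn, end_pfn);
	if (ret) {
		undo_isolate_page_range(start_pfn, end_pfn);
		return ret;
	}

	/* 4. Pull the now-free pages off the buddy lists for good. */
	__offline_isolated_pages(start_pfn, end_pfn);
	return 0;
}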
diff --git a/mm/readahead.c b/mm/readahead.c
index be20c9d699d3..c9c50ca1ec38 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -22,16 +22,8 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
22} 22}
23EXPORT_SYMBOL(default_unplug_io_fn); 23EXPORT_SYMBOL(default_unplug_io_fn);
24 24
25/*
26 * Convienent macros for min/max read-ahead pages.
27 * Note that MAX_RA_PAGES is rounded down, while MIN_RA_PAGES is rounded up.
28 * The latter is necessary for systems with large page size(i.e. 64k).
29 */
30#define MAX_RA_PAGES (VM_MAX_READAHEAD*1024 / PAGE_CACHE_SIZE)
31#define MIN_RA_PAGES DIV_ROUND_UP(VM_MIN_READAHEAD*1024, PAGE_CACHE_SIZE)
32
33struct backing_dev_info default_backing_dev_info = { 25struct backing_dev_info default_backing_dev_info = {
34 .ra_pages = MAX_RA_PAGES, 26 .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
35 .state = 0, 27 .state = 0,
36 .capabilities = BDI_CAP_MAP_COPY, 28 .capabilities = BDI_CAP_MAP_COPY,
37 .unplug_io_fn = default_unplug_io_fn, 29 .unplug_io_fn = default_unplug_io_fn,
@@ -46,7 +38,7 @@ void
46file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) 38file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
47{ 39{
48 ra->ra_pages = mapping->backing_dev_info->ra_pages; 40 ra->ra_pages = mapping->backing_dev_info->ra_pages;
49 ra->prev_index = -1; 41 ra->prev_pos = -1;
50} 42}
51EXPORT_SYMBOL_GPL(file_ra_state_init); 43EXPORT_SYMBOL_GPL(file_ra_state_init);
52 44
@@ -66,28 +58,25 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
66 int (*filler)(void *, struct page *), void *data) 58 int (*filler)(void *, struct page *), void *data)
67{ 59{
68 struct page *page; 60 struct page *page;
69 struct pagevec lru_pvec;
70 int ret = 0; 61 int ret = 0;
71 62
72 pagevec_init(&lru_pvec, 0);
73
74 while (!list_empty(pages)) { 63 while (!list_empty(pages)) {
75 page = list_to_page(pages); 64 page = list_to_page(pages);
76 list_del(&page->lru); 65 list_del(&page->lru);
77 if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { 66 if (add_to_page_cache_lru(page, mapping,
67 page->index, GFP_KERNEL)) {
78 page_cache_release(page); 68 page_cache_release(page);
79 continue; 69 continue;
80 } 70 }
71 page_cache_release(page);
72
81 ret = filler(data, page); 73 ret = filler(data, page);
82 if (!pagevec_add(&lru_pvec, page)) 74 if (unlikely(ret)) {
83 __pagevec_lru_add(&lru_pvec);
84 if (ret) {
85 put_pages_list(pages); 75 put_pages_list(pages);
86 break; 76 break;
87 } 77 }
88 task_io_account_read(PAGE_CACHE_SIZE); 78 task_io_account_read(PAGE_CACHE_SIZE);
89 } 79 }
90 pagevec_lru_add(&lru_pvec);
91 return ret; 80 return ret;
92} 81}
93 82
@@ -97,7 +86,6 @@ static int read_pages(struct address_space *mapping, struct file *filp,
97 struct list_head *pages, unsigned nr_pages) 86 struct list_head *pages, unsigned nr_pages)
98{ 87{
99 unsigned page_idx; 88 unsigned page_idx;
100 struct pagevec lru_pvec;
101 int ret; 89 int ret;
102 90
103 if (mapping->a_ops->readpages) { 91 if (mapping->a_ops->readpages) {
@@ -107,19 +95,15 @@ static int read_pages(struct address_space *mapping, struct file *filp,
107 goto out; 95 goto out;
108 } 96 }
109 97
110 pagevec_init(&lru_pvec, 0);
111 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 98 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
112 struct page *page = list_to_page(pages); 99 struct page *page = list_to_page(pages);
113 list_del(&page->lru); 100 list_del(&page->lru);
114 if (!add_to_page_cache(page, mapping, 101 if (!add_to_page_cache_lru(page, mapping,
115 page->index, GFP_KERNEL)) { 102 page->index, GFP_KERNEL)) {
116 mapping->a_ops->readpage(filp, page); 103 mapping->a_ops->readpage(filp, page);
117 if (!pagevec_add(&lru_pvec, page)) 104 }
118 __pagevec_lru_add(&lru_pvec); 105 page_cache_release(page);
119 } else
120 page_cache_release(page);
121 } 106 }
122 pagevec_lru_add(&lru_pvec);
123 ret = 0; 107 ret = 0;
124out: 108out:
125 return ret; 109 return ret;
@@ -157,20 +141,19 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
157 /* 141 /*
158 * Preallocate as many pages as we will need. 142 * Preallocate as many pages as we will need.
159 */ 143 */
160 read_lock_irq(&mapping->tree_lock);
161 for (page_idx = 0; page_idx < nr_to_read; page_idx++) { 144 for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
162 pgoff_t page_offset = offset + page_idx; 145 pgoff_t page_offset = offset + page_idx;
163 146
164 if (page_offset > end_index) 147 if (page_offset > end_index)
165 break; 148 break;
166 149
150 rcu_read_lock();
167 page = radix_tree_lookup(&mapping->page_tree, page_offset); 151 page = radix_tree_lookup(&mapping->page_tree, page_offset);
152 rcu_read_unlock();
168 if (page) 153 if (page)
169 continue; 154 continue;
170 155
171 read_unlock_irq(&mapping->tree_lock);
172 page = page_cache_alloc_cold(mapping); 156 page = page_cache_alloc_cold(mapping);
173 read_lock_irq(&mapping->tree_lock);
174 if (!page) 157 if (!page)
175 break; 158 break;
176 page->index = page_offset; 159 page->index = page_offset;
@@ -179,7 +162,6 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
179 SetPageReadahead(page); 162 SetPageReadahead(page);
180 ret++; 163 ret++;
181 } 164 }
182 read_unlock_irq(&mapping->tree_lock);
183 165
184 /* 166 /*
185 * Now start the IO. We ignore I/O errors - if the page is not 167 * Now start the IO. We ignore I/O errors - if the page is not
@@ -251,6 +233,12 @@ unsigned long max_sane_readahead(unsigned long nr)
251 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); 233 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
252} 234}
253 235
236static int __init readahead_init(void)
237{
238 return bdi_init(&default_backing_dev_info);
239}
240subsys_initcall(readahead_init);
241
254/* 242/*
255 * Submit IO for the read-ahead request in file_ra_state. 243 * Submit IO for the read-ahead request in file_ra_state.
256 */ 244 */
@@ -327,7 +315,7 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
327 * indicator. The flag won't be set on already cached pages, to avoid the 315 * indicator. The flag won't be set on already cached pages, to avoid the
328 * readahead-for-nothing fuss, saving pointless page cache lookups. 316 * readahead-for-nothing fuss, saving pointless page cache lookups.
329 * 317 *
330 * prev_index tracks the last visited page in the _previous_ read request. 318 * prev_pos tracks the last visited byte in the _previous_ read request.
331 * It should be maintained by the caller, and will be used for detecting 319 * It should be maintained by the caller, and will be used for detecting
332 * small random reads. Note that the readahead algorithm checks loosely 320 * small random reads. Note that the readahead algorithm checks loosely
333 * for sequential patterns. Hence interleaved reads might be served as 321 * for sequential patterns. Hence interleaved reads might be served as
@@ -351,11 +339,9 @@ ondemand_readahead(struct address_space *mapping,
351 bool hit_readahead_marker, pgoff_t offset, 339 bool hit_readahead_marker, pgoff_t offset,
352 unsigned long req_size) 340 unsigned long req_size)
353{ 341{
354 unsigned long max; /* max readahead pages */ 342 int max = ra->ra_pages; /* max readahead pages */
355 int sequential; 343 pgoff_t prev_offset;
356 344 int sequential;
357 max = ra->ra_pages;
358 sequential = (offset - ra->prev_index <= 1UL) || (req_size > max);
359 345
360 /* 346 /*
361 * It's the expected callback offset, assume sequential access. 347 * It's the expected callback offset, assume sequential access.
@@ -369,6 +355,9 @@ ondemand_readahead(struct address_space *mapping,
369 goto readit; 355 goto readit;
370 } 356 }
371 357
358 prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT;
359 sequential = offset - prev_offset <= 1UL || req_size > max;
360
372 /* 361 /*
373 * Standalone, small read. 362 * Standalone, small read.
374 * Read as is, and do not pollute the readahead state. 363 * Read as is, and do not pollute the readahead state.
@@ -379,6 +368,29 @@ ondemand_readahead(struct address_space *mapping,
379 } 368 }
380 369
381 /* 370 /*
371 * Hit a marked page without valid readahead state.
372 * E.g. interleaved reads.
373 * Query the pagecache for the async_size, which normally equals the
374 * readahead size. Ramp it up and use it as the new readahead size.
375 */
376 if (hit_readahead_marker) {
377 pgoff_t start;
378
379 read_lock_irq(&mapping->tree_lock);
380 start = radix_tree_next_hole(&mapping->page_tree, offset, max+1);
381 read_unlock_irq(&mapping->tree_lock);
382
383 if (!start || start - offset > max)
384 return 0;
385
386 ra->start = start;
387 ra->size = start - offset; /* old async_size */
388 ra->size = get_next_ra_size(ra, max);
389 ra->async_size = ra->size;
390 goto readit;
391 }
392
393 /*
382 * It may be one of 394 * It may be one of
383 * - first read on start of file 395 * - first read on start of file
384 * - sequential cache miss 396 * - sequential cache miss
@@ -389,16 +401,6 @@ ondemand_readahead(struct address_space *mapping,
389 ra->size = get_init_ra_size(req_size, max); 401 ra->size = get_init_ra_size(req_size, max);
390 ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; 402 ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
391 403
392 /*
393 * Hit on a marked page without valid readahead state.
394 * E.g. interleaved reads.
395 * Not knowing its readahead pos/size, bet on the minimal possible one.
396 */
397 if (hit_readahead_marker) {
398 ra->start++;
399 ra->size = get_next_ra_size(ra, max);
400 }
401
402readit: 404readit:
403 return ra_submit(ra, mapping, filp); 405 return ra_submit(ra, mapping, filp);
404} 406}
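The new interleaved-read branch sizes the window from the distance to the next pagecache hole and then ramps it. A userspace sketch of that arithmetic; the doubling ramp is a stand-in for get_next_ra_size(), whose exact growth policy is not shown in this hunk, and the !start early return is omitted.

#include <stdio.h>

struct file_ra_state {			/* only the fields used here */
	unsigned long start;
	unsigned long size;
	unsigned long async_size;
};

/* Stand-in for get_next_ra_size(): double the window, capped at max. */
static unsigned long next_ra_size(unsigned long cur, unsigned long max)
{
	unsigned long newsize = 2 * cur;

	return newsize < max ? newsize : max;
}

int main(void)
{
	struct file_ra_state ra = { 0, 0, 0 };
	unsigned long max = 32;		/* ra->ra_pages */
	unsigned long offset = 100;	/* page index that hit the PG_readahead marker */
	unsigned long start = 108;	/* first hole found after offset */

	if (start - offset > max) {
		puts("hole too far away: leave readahead state untouched");
		return 0;
	}

	ra.start = start;
	ra.size = start - offset;		/* the old async_size */
	ra.size = next_ra_size(ra.size, max);	/* ramp it up */
	ra.async_size = ra.size;

	printf("new window: start=%lu size=%lu async_size=%lu\n",
	       ra.start, ra.size, ra.async_size);
	return 0;
}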
diff --git a/mm/rmap.c b/mm/rmap.c
index 41ac39749ef4..8990f909492f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -36,6 +36,7 @@
36 * mapping->tree_lock (widely used, in set_page_dirty, 36 * mapping->tree_lock (widely used, in set_page_dirty,
37 * in arch-dependent flush_dcache_mmap_lock, 37 * in arch-dependent flush_dcache_mmap_lock,
38 * within inode_lock in __sync_single_inode) 38 * within inode_lock in __sync_single_inode)
39 * zone->lock (within radix tree node alloc)
39 */ 40 */
40 41
41#include <linux/mm.h> 42#include <linux/mm.h>
@@ -137,8 +138,7 @@ void anon_vma_unlink(struct vm_area_struct *vma)
137 anon_vma_free(anon_vma); 138 anon_vma_free(anon_vma);
138} 139}
139 140
140static void anon_vma_ctor(void *data, struct kmem_cache *cachep, 141static void anon_vma_ctor(struct kmem_cache *cachep, void *data)
141 unsigned long flags)
142{ 142{
143 struct anon_vma *anon_vma = data; 143 struct anon_vma *anon_vma = data;
144 144
@@ -436,7 +436,6 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma)
436 entry = pte_wrprotect(entry); 436 entry = pte_wrprotect(entry);
437 entry = pte_mkclean(entry); 437 entry = pte_mkclean(entry);
438 set_pte_at(mm, address, pte, entry); 438 set_pte_at(mm, address, pte, entry);
439 lazy_mmu_prot_update(entry);
440 ret = 1; 439 ret = 1;
441 } 440 }
442 441
diff --git a/mm/shmem.c b/mm/shmem.c
index fcd19d323f9f..289dbb0a6fd6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -49,7 +49,6 @@
49#include <linux/ctype.h> 49#include <linux/ctype.h>
50#include <linux/migrate.h> 50#include <linux/migrate.h>
51#include <linux/highmem.h> 51#include <linux/highmem.h>
52#include <linux/backing-dev.h>
53 52
54#include <asm/uaccess.h> 53#include <asm/uaccess.h>
55#include <asm/div64.h> 54#include <asm/div64.h>
@@ -96,9 +95,9 @@ static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
96 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: 95 * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
97 * might be reconsidered if it ever diverges from PAGE_SIZE. 96 * might be reconsidered if it ever diverges from PAGE_SIZE.
98 * 97 *
99 * __GFP_MOVABLE is masked out as swap vectors cannot move 98 * Mobility flags are masked out as swap vectors cannot move
100 */ 99 */
101 return alloc_pages((gfp_mask & ~__GFP_MOVABLE) | __GFP_ZERO, 100 return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
102 PAGE_CACHE_SHIFT-PAGE_SHIFT); 101 PAGE_CACHE_SHIFT-PAGE_SHIFT);
103} 102}
104 103
@@ -972,7 +971,7 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_
972 *nodelist++ = '\0'; 971 *nodelist++ = '\0';
973 if (nodelist_parse(nodelist, *policy_nodes)) 972 if (nodelist_parse(nodelist, *policy_nodes))
974 goto out; 973 goto out;
975 if (!nodes_subset(*policy_nodes, node_online_map)) 974 if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY]))
976 goto out; 975 goto out;
977 } 976 }
978 if (!strcmp(value, "default")) { 977 if (!strcmp(value, "default")) {
@@ -997,9 +996,11 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_
997 err = 0; 996 err = 0;
998 } else if (!strcmp(value, "interleave")) { 997 } else if (!strcmp(value, "interleave")) {
999 *policy = MPOL_INTERLEAVE; 998 *policy = MPOL_INTERLEAVE;
1000 /* Default to nodes online if no nodelist */ 999 /*
1000 * Default to online nodes with memory if no nodelist
1001 */
1001 if (!nodelist) 1002 if (!nodelist)
1002 *policy_nodes = node_online_map; 1003 *policy_nodes = node_states[N_HIGH_MEMORY];
1003 err = 0; 1004 err = 0;
1004 } 1005 }
1005out: 1006out:
@@ -1025,8 +1026,8 @@ static struct page *shmem_swapin_async(struct shared_policy *p,
1025 return page; 1026 return page;
1026} 1027}
1027 1028
1028struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry, 1029static struct page *shmem_swapin(struct shmem_inode_info *info,
1029 unsigned long idx) 1030 swp_entry_t entry, unsigned long idx)
1030{ 1031{
1031 struct shared_policy *p = &info->policy; 1032 struct shared_policy *p = &info->policy;
1032 int i, num; 1033 int i, num;
@@ -1061,7 +1062,8 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info,
1061 return page; 1062 return page;
1062} 1063}
1063#else 1064#else
1064static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) 1065static inline int shmem_parse_mpol(char *value, int *policy,
1066 nodemask_t *policy_nodes)
1065{ 1067{
1066 return 1; 1068 return 1;
1067} 1069}
@@ -1109,7 +1111,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
1109 * Normally, filepage is NULL on entry, and either found 1111 * Normally, filepage is NULL on entry, and either found
1110 * uptodate immediately, or allocated and zeroed, or read 1112 * uptodate immediately, or allocated and zeroed, or read
1111 * in under swappage, which is then assigned to filepage. 1113 * in under swappage, which is then assigned to filepage.
1112 * But shmem_readpage and shmem_prepare_write pass in a locked 1114 * But shmem_readpage and shmem_write_begin pass in a locked
1113 * filepage, which may be found not uptodate by other callers 1115 * filepage, which may be found not uptodate by other callers
1114 * too, and may need to be copied from the swappage read in. 1116 * too, and may need to be copied from the swappage read in.
1115 */ 1117 */
@@ -1327,14 +1329,14 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1327} 1329}
1328 1330
1329#ifdef CONFIG_NUMA 1331#ifdef CONFIG_NUMA
1330int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) 1332static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
1331{ 1333{
1332 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1334 struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1333 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); 1335 return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
1334} 1336}
1335 1337
1336struct mempolicy * 1338static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
1337shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) 1339 unsigned long addr)
1338{ 1340{
1339 struct inode *i = vma->vm_file->f_path.dentry->d_inode; 1341 struct inode *i = vma->vm_file->f_path.dentry->d_inode;
1340 unsigned long idx; 1342 unsigned long idx;
@@ -1446,7 +1448,7 @@ static const struct inode_operations shmem_symlink_inode_operations;
1446static const struct inode_operations shmem_symlink_inline_operations; 1448static const struct inode_operations shmem_symlink_inline_operations;
1447 1449
1448/* 1450/*
1449 * Normally tmpfs avoids the use of shmem_readpage and shmem_prepare_write; 1451 * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
1450 * but providing them allows a tmpfs file to be used for splice, sendfile, and 1452 * but providing them allows a tmpfs file to be used for splice, sendfile, and
1451 * below the loop driver, in the generic fashion that many filesystems support. 1453 * below the loop driver, in the generic fashion that many filesystems support.
1452 */ 1454 */
@@ -1459,10 +1461,30 @@ static int shmem_readpage(struct file *file, struct page *page)
1459} 1461}
1460 1462
1461static int 1463static int
1462shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) 1464shmem_write_begin(struct file *file, struct address_space *mapping,
1465 loff_t pos, unsigned len, unsigned flags,
1466 struct page **pagep, void **fsdata)
1463{ 1467{
1464 struct inode *inode = page->mapping->host; 1468 struct inode *inode = mapping->host;
1465 return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL); 1469 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1470 *pagep = NULL;
1471 return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
1472}
1473
1474static int
1475shmem_write_end(struct file *file, struct address_space *mapping,
1476 loff_t pos, unsigned len, unsigned copied,
1477 struct page *page, void *fsdata)
1478{
1479 struct inode *inode = mapping->host;
1480
1481 set_page_dirty(page);
1482 page_cache_release(page);
1483
1484 if (pos+copied > inode->i_size)
1485 i_size_write(inode, pos+copied);
1486
1487 return copied;
1466} 1488}
1467 1489
1468static ssize_t 1490static ssize_t
@@ -2219,7 +2241,7 @@ static int shmem_fill_super(struct super_block *sb,
2219 unsigned long blocks = 0; 2241 unsigned long blocks = 0;
2220 unsigned long inodes = 0; 2242 unsigned long inodes = 0;
2221 int policy = MPOL_DEFAULT; 2243 int policy = MPOL_DEFAULT;
2222 nodemask_t policy_nodes = node_online_map; 2244 nodemask_t policy_nodes = node_states[N_HIGH_MEMORY];
2223 2245
2224#ifdef CONFIG_TMPFS 2246#ifdef CONFIG_TMPFS
2225 /* 2247 /*
@@ -2306,8 +2328,7 @@ static void shmem_destroy_inode(struct inode *inode)
2306 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2328 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2307} 2329}
2308 2330
2309static void init_once(void *foo, struct kmem_cache *cachep, 2331static void init_once(struct kmem_cache *cachep, void *foo)
2310 unsigned long flags)
2311{ 2332{
2312 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2333 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2313 2334
@@ -2322,9 +2343,7 @@ static int init_inodecache(void)
2322{ 2343{
2323 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", 2344 shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
2324 sizeof(struct shmem_inode_info), 2345 sizeof(struct shmem_inode_info),
2325 0, 0, init_once); 2346 0, SLAB_PANIC, init_once);
2326 if (shmem_inode_cachep == NULL)
2327 return -ENOMEM;
2328 return 0; 2347 return 0;
2329} 2348}
2330 2349
@@ -2338,8 +2357,8 @@ static const struct address_space_operations shmem_aops = {
2338 .set_page_dirty = __set_page_dirty_no_writeback, 2357 .set_page_dirty = __set_page_dirty_no_writeback,
2339#ifdef CONFIG_TMPFS 2358#ifdef CONFIG_TMPFS
2340 .readpage = shmem_readpage, 2359 .readpage = shmem_readpage,
2341 .prepare_write = shmem_prepare_write, 2360 .write_begin = shmem_write_begin,
2342 .commit_write = simple_commit_write, 2361 .write_end = shmem_write_end,
2343#endif 2362#endif
2344 .migratepage = migrate_page, 2363 .migratepage = migrate_page,
2345}; 2364};
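For context, the ->write_begin/->write_end pair that replaces prepare_write/commit_write above is driven by the generic write path roughly as sketched below. This is illustrative only, not code from the patch: write_one_chunk() is a made-up name, a plain memcpy from a kernel buffer stands in for the user-space copy, and short-copy handling is omitted.

#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/string.h>

/* Returns bytes written on success or a negative errno. */
static int write_one_chunk(struct file *file, struct address_space *mapping,
			   loff_t pos, const void *buf, unsigned len)
{
	struct page *page;
	void *fsdata;
	void *kaddr;
	int status;

	status = mapping->a_ops->write_begin(file, mapping, pos, len,
					     0, &page, &fsdata);
	if (status)
		return status;

	kaddr = kmap_atomic(page, KM_USER0);
	memcpy(kaddr + (pos & (PAGE_CACHE_SIZE - 1)), buf, len);
	kunmap_atomic(kaddr, KM_USER0);

	/* shmem_write_end() dirties the page, drops it and updates i_size. */
	return mapping->a_ops->write_end(file, mapping, pos, len, len,
					 page, fsdata);
}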
@@ -2442,6 +2461,10 @@ static int __init init_tmpfs(void)
2442{ 2461{
2443 int error; 2462 int error;
2444 2463
2464 error = bdi_init(&shmem_backing_dev_info);
2465 if (error)
2466 goto out4;
2467
2445 error = init_inodecache(); 2468 error = init_inodecache();
2446 if (error) 2469 if (error)
2447 goto out3; 2470 goto out3;
@@ -2466,6 +2489,8 @@ out1:
2466out2: 2489out2:
2467 destroy_inodecache(); 2490 destroy_inodecache();
2468out3: 2491out3:
2492 bdi_destroy(&shmem_backing_dev_info);
2493out4:
2469 shm_mnt = ERR_PTR(error); 2494 shm_mnt = ERR_PTR(error);
2470 return error; 2495 return error;
2471} 2496}
@@ -2518,11 +2543,8 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2518 d_instantiate(dentry, inode); 2543 d_instantiate(dentry, inode);
2519 inode->i_size = size; 2544 inode->i_size = size;
2520 inode->i_nlink = 0; /* It is unlinked */ 2545 inode->i_nlink = 0; /* It is unlinked */
2521 file->f_path.mnt = mntget(shm_mnt); 2546 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
2522 file->f_path.dentry = dentry; 2547 &shmem_file_operations);
2523 file->f_mapping = inode->i_mapping;
2524 file->f_op = &shmem_file_operations;
2525 file->f_mode = FMODE_WRITE | FMODE_READ;
2526 return file; 2548 return file;
2527 2549
2528close_file: 2550close_file:
diff --git a/mm/slab.c b/mm/slab.c
index 6f6abef83a1a..3ce9bc024d67 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -267,11 +267,10 @@ struct array_cache {
267 unsigned int batchcount; 267 unsigned int batchcount;
268 unsigned int touched; 268 unsigned int touched;
269 spinlock_t lock; 269 spinlock_t lock;
270 void *entry[0]; /* 270 void *entry[]; /*
271 * Must have this definition in here for the proper 271 * Must have this definition in here for the proper
272 * alignment of array_cache. Also simplifies accessing 272 * alignment of array_cache. Also simplifies accessing
273 * the entries. 273 * the entries.
274 * [0] is for gcc 2.95. It should really be [].
275 */ 274 */
276}; 275};
277 276
@@ -408,7 +407,7 @@ struct kmem_cache {
408 unsigned int dflags; /* dynamic flags */ 407 unsigned int dflags; /* dynamic flags */
409 408
410 /* constructor func */ 409 /* constructor func */
411 void (*ctor) (void *, struct kmem_cache *, unsigned long); 410 void (*ctor)(struct kmem_cache *, void *);
412 411
413/* 5) cache creation/removal */ 412/* 5) cache creation/removal */
414 const char *name; 413 const char *name;
@@ -1568,7 +1567,7 @@ void __init kmem_cache_init(void)
1568 /* Replace the static kmem_list3 structures for the boot cpu */ 1567 /* Replace the static kmem_list3 structures for the boot cpu */
1569 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); 1568 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node);
1570 1569
1571 for_each_online_node(nid) { 1570 for_each_node_state(nid, N_NORMAL_MEMORY) {
1572 init_list(malloc_sizes[INDEX_AC].cs_cachep, 1571 init_list(malloc_sizes[INDEX_AC].cs_cachep,
1573 &initkmem_list3[SIZE_AC + nid], nid); 1572 &initkmem_list3[SIZE_AC + nid], nid);
1574 1573
@@ -1643,6 +1642,8 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1643#endif 1642#endif
1644 1643
1645 flags |= cachep->gfpflags; 1644 flags |= cachep->gfpflags;
1645 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1646 flags |= __GFP_RECLAIMABLE;
1646 1647
1647 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1648 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1648 if (!page) 1649 if (!page)
@@ -1944,7 +1945,7 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index)
1944{ 1945{
1945 int node; 1946 int node;
1946 1947
1947 for_each_online_node(node) { 1948 for_each_node_state(node, N_NORMAL_MEMORY) {
1948 cachep->nodelists[node] = &initkmem_list3[index + node]; 1949 cachep->nodelists[node] = &initkmem_list3[index + node];
1949 cachep->nodelists[node]->next_reap = jiffies + 1950 cachep->nodelists[node]->next_reap = jiffies +
1950 REAPTIMEOUT_LIST3 + 1951 REAPTIMEOUT_LIST3 +
@@ -2075,7 +2076,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2075 g_cpucache_up = PARTIAL_L3; 2076 g_cpucache_up = PARTIAL_L3;
2076 } else { 2077 } else {
2077 int node; 2078 int node;
2078 for_each_online_node(node) { 2079 for_each_node_state(node, N_NORMAL_MEMORY) {
2079 cachep->nodelists[node] = 2080 cachep->nodelists[node] =
2080 kmalloc_node(sizeof(struct kmem_list3), 2081 kmalloc_node(sizeof(struct kmem_list3),
2081 GFP_KERNEL, node); 2082 GFP_KERNEL, node);
@@ -2127,7 +2128,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
2127struct kmem_cache * 2128struct kmem_cache *
2128kmem_cache_create (const char *name, size_t size, size_t align, 2129kmem_cache_create (const char *name, size_t size, size_t align,
2129 unsigned long flags, 2130 unsigned long flags,
2130 void (*ctor)(void*, struct kmem_cache *, unsigned long)) 2131 void (*ctor)(struct kmem_cache *, void *))
2131{ 2132{
2132 size_t left_over, slab_size, ralign; 2133 size_t left_over, slab_size, ralign;
2133 struct kmem_cache *cachep = NULL, *pc; 2134 struct kmem_cache *cachep = NULL, *pc;
@@ -2634,8 +2635,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2634 * They must also be threaded. 2635 * They must also be threaded.
2635 */ 2636 */
2636 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2637 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2637 cachep->ctor(objp + obj_offset(cachep), cachep, 2638 cachep->ctor(cachep, objp + obj_offset(cachep));
2638 0);
2639 2639
2640 if (cachep->flags & SLAB_RED_ZONE) { 2640 if (cachep->flags & SLAB_RED_ZONE) {
2641 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2641 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
@@ -2651,7 +2651,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2651 cachep->buffer_size / PAGE_SIZE, 0); 2651 cachep->buffer_size / PAGE_SIZE, 0);
2652#else 2652#else
2653 if (cachep->ctor) 2653 if (cachep->ctor)
2654 cachep->ctor(objp, cachep, 0); 2654 cachep->ctor(cachep, objp);
2655#endif 2655#endif
2656 slab_bufctl(slabp)[i] = i + 1; 2656 slab_bufctl(slabp)[i] = i + 1;
2657 } 2657 }
@@ -2746,9 +2746,9 @@ static int cache_grow(struct kmem_cache *cachep,
2746 * Be lazy and only check for valid flags here, keeping it out of the 2746 * Be lazy and only check for valid flags here, keeping it out of the
2747 * critical path in kmem_cache_alloc(). 2747 * critical path in kmem_cache_alloc().
2748 */ 2748 */
2749 BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); 2749 BUG_ON(flags & GFP_SLAB_BUG_MASK);
2750 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
2750 2751
2751 local_flags = (flags & GFP_LEVEL_MASK);
2752 /* Take the l3 list lock to change the colour_next on this node */ 2752 /* Take the l3 list lock to change the colour_next on this node */
2753 check_irq_off(); 2753 check_irq_off();
2754 l3 = cachep->nodelists[nodeid]; 2754 l3 = cachep->nodelists[nodeid];
@@ -2785,7 +2785,7 @@ static int cache_grow(struct kmem_cache *cachep,
2785 2785
2786 /* Get slab management. */ 2786 /* Get slab management. */
2787 slabp = alloc_slabmgmt(cachep, objp, offset, 2787 slabp = alloc_slabmgmt(cachep, objp, offset,
2788 local_flags & ~GFP_THISNODE, nodeid); 2788 local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
2789 if (!slabp) 2789 if (!slabp)
2790 goto opps1; 2790 goto opps1;
2791 2791
@@ -3076,7 +3076,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
3076#endif 3076#endif
3077 objp += obj_offset(cachep); 3077 objp += obj_offset(cachep);
3078 if (cachep->ctor && cachep->flags & SLAB_POISON) 3078 if (cachep->ctor && cachep->flags & SLAB_POISON)
3079 cachep->ctor(objp, cachep, 0); 3079 cachep->ctor(cachep, objp);
3080#if ARCH_SLAB_MINALIGN 3080#if ARCH_SLAB_MINALIGN
3081 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { 3081 if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
3082 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", 3082 printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
@@ -3225,7 +3225,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3225 3225
3226 zonelist = &NODE_DATA(slab_node(current->mempolicy)) 3226 zonelist = &NODE_DATA(slab_node(current->mempolicy))
3227 ->node_zonelists[gfp_zone(flags)]; 3227 ->node_zonelists[gfp_zone(flags)];
3228 local_flags = (flags & GFP_LEVEL_MASK); 3228 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3229 3229
3230retry: 3230retry:
3231 /* 3231 /*
@@ -3792,7 +3792,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3792 struct array_cache *new_shared; 3792 struct array_cache *new_shared;
3793 struct array_cache **new_alien = NULL; 3793 struct array_cache **new_alien = NULL;
3794 3794
3795 for_each_online_node(node) { 3795 for_each_node_state(node, N_NORMAL_MEMORY) {
3796 3796
3797 if (use_alien_caches) { 3797 if (use_alien_caches) {
3798 new_alien = alloc_alien_cache(node, cachep->limit); 3798 new_alien = alloc_alien_cache(node, cachep->limit);
@@ -4446,7 +4446,8 @@ const struct seq_operations slabstats_op = {
4446 */ 4446 */
4447size_t ksize(const void *objp) 4447size_t ksize(const void *objp)
4448{ 4448{
4449 if (unlikely(ZERO_OR_NULL_PTR(objp))) 4449 BUG_ON(!objp);
4450 if (unlikely(objp == ZERO_SIZE_PTR))
4450 return 0; 4451 return 0;
4451 4452
4452 return obj_size(virt_to_cache(objp)); 4453 return obj_size(virt_to_cache(objp));
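ksize() now treats a NULL argument as a bug and only maps the zero-size cookie to 0. A userspace sketch of that convention; the definitions of ZERO_SIZE_PTR and ZERO_OR_NULL_PTR below follow the usual <linux/slab.h> ones of the era and should be read as assumptions.

#include <stdio.h>

/* Assumed definitions mirroring <linux/slab.h>: kmalloc(0) returns a small
 * non-NULL cookie so callers can tell a zero-size allocation from failure. */
#define ZERO_SIZE_PTR ((void *)16)
#define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= (unsigned long)ZERO_SIZE_PTR)

int main(void)
{
	void *zero = ZERO_SIZE_PTR;	/* what kmalloc(0) hands back */
	void *fail = NULL;		/* what a failed allocation hands back */

	printf("ZERO_OR_NULL_PTR(NULL)          = %d\n", ZERO_OR_NULL_PTR(fail));
	printf("ZERO_OR_NULL_PTR(ZERO_SIZE_PTR) = %d\n", ZERO_OR_NULL_PTR(zero));

	/* After this patch: ksize(NULL) is a BUG, ksize(ZERO_SIZE_PTR) is 0. */
	return 0;
}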
diff --git a/mm/slob.c b/mm/slob.c
index ec33fcdc852e..5bc2ceb692ec 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -360,7 +360,7 @@ static void slob_free(void *block, int size)
360 slobidx_t units; 360 slobidx_t units;
361 unsigned long flags; 361 unsigned long flags;
362 362
363 if (ZERO_OR_NULL_PTR(block)) 363 if (unlikely(ZERO_OR_NULL_PTR(block)))
364 return; 364 return;
365 BUG_ON(!size); 365 BUG_ON(!size);
366 366
@@ -466,7 +466,7 @@ void kfree(const void *block)
466{ 466{
467 struct slob_page *sp; 467 struct slob_page *sp;
468 468
469 if (ZERO_OR_NULL_PTR(block)) 469 if (unlikely(ZERO_OR_NULL_PTR(block)))
470 return; 470 return;
471 471
472 sp = (struct slob_page *)virt_to_page(block); 472 sp = (struct slob_page *)virt_to_page(block);
@@ -484,7 +484,8 @@ size_t ksize(const void *block)
484{ 484{
485 struct slob_page *sp; 485 struct slob_page *sp;
486 486
487 if (ZERO_OR_NULL_PTR(block)) 487 BUG_ON(!block);
488 if (unlikely(block == ZERO_SIZE_PTR))
488 return 0; 489 return 0;
489 490
490 sp = (struct slob_page *)virt_to_page(block); 491 sp = (struct slob_page *)virt_to_page(block);
@@ -498,12 +499,12 @@ struct kmem_cache {
498 unsigned int size, align; 499 unsigned int size, align;
499 unsigned long flags; 500 unsigned long flags;
500 const char *name; 501 const char *name;
501 void (*ctor)(void *, struct kmem_cache *, unsigned long); 502 void (*ctor)(struct kmem_cache *, void *);
502}; 503};
503 504
504struct kmem_cache *kmem_cache_create(const char *name, size_t size, 505struct kmem_cache *kmem_cache_create(const char *name, size_t size,
505 size_t align, unsigned long flags, 506 size_t align, unsigned long flags,
506 void (*ctor)(void*, struct kmem_cache *, unsigned long)) 507 void (*ctor)(struct kmem_cache *, void *))
507{ 508{
508 struct kmem_cache *c; 509 struct kmem_cache *c;
509 510
@@ -547,7 +548,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
547 b = slob_new_page(flags, get_order(c->size), node); 548 b = slob_new_page(flags, get_order(c->size), node);
548 549
549 if (c->ctor) 550 if (c->ctor)
550 c->ctor(b, c, 0); 551 c->ctor(c, b);
551 552
552 return b; 553 return b;
553} 554}
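Across slab, slob and slub the constructor callback now receives the cache first and the object second, and the old flags argument is gone. A minimal kernel-style sketch of a cache registered against the new prototype; the cache and structure names are illustrative.

#include <linux/init.h>
#include <linux/slab.h>

struct foo {
	int refcount;
	void *buf;
};

/* New-style constructor: (cache, object) rather than (object, cache, flags). */
static void foo_ctor(struct kmem_cache *cachep, void *obj)
{
	struct foo *f = obj;

	f->refcount = 0;
	f->buf = NULL;
}

static struct kmem_cache *foo_cachep;

static int __init foo_cache_init(void)
{
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_PANIC, foo_ctor);
	return 0;
}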
diff --git a/mm/slub.c b/mm/slub.c
index addb20a6d67d..e29a42988c78 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -90,7 +90,7 @@
90 * One use of this flag is to mark slabs that are 90 * One use of this flag is to mark slabs that are
91 * used for allocations. Then such a slab becomes a cpu 91 * used for allocations. Then such a slab becomes a cpu
92 * slab. The cpu slab may be equipped with an additional 92 * slab. The cpu slab may be equipped with an additional
93 * lockless_freelist that allows lockless access to 93 * freelist that allows lockless access to
94 * free objects in addition to the regular freelist 94 * free objects in addition to the regular freelist
95 * that requires the slab lock. 95 * that requires the slab lock.
96 * 96 *
@@ -140,11 +140,6 @@ static inline void ClearSlabDebug(struct page *page)
140/* 140/*
141 * Issues still to be resolved: 141 * Issues still to be resolved:
142 * 142 *
143 * - The per cpu array is updated for each new slab and and is a remote
144 * cacheline for most nodes. This could become a bouncing cacheline given
145 * enough frequent updates. There are 16 pointers in a cacheline, so at
146 * max 16 cpus could compete for the cacheline which may be okay.
147 *
148 * - Support PAGE_ALLOC_DEBUG. Should be easy to do. 143 * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
149 * 144 *
150 * - Variable sizing of the per node arrays 145 * - Variable sizing of the per node arrays
@@ -205,11 +200,6 @@ static inline void ClearSlabDebug(struct page *page)
205#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) 200#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
206#endif 201#endif
207 202
208/*
209 * The page->inuse field is 16 bit thus we have this limitation
210 */
211#define MAX_OBJECTS_PER_SLAB 65535
212
213/* Internal SLUB flags */ 203/* Internal SLUB flags */
214#define __OBJECT_POISON 0x80000000 /* Poison object */ 204#define __OBJECT_POISON 0x80000000 /* Poison object */
215#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ 205#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */
@@ -277,6 +267,15 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
277#endif 267#endif
278} 268}
279 269
270static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
271{
272#ifdef CONFIG_SMP
273 return s->cpu_slab[cpu];
274#else
275 return &s->cpu_slab;
276#endif
277}
278
280static inline int check_valid_pointer(struct kmem_cache *s, 279static inline int check_valid_pointer(struct kmem_cache *s,
281 struct page *page, const void *object) 280 struct page *page, const void *object)
282{ 281{
@@ -729,11 +728,6 @@ static int check_slab(struct kmem_cache *s, struct page *page)
729 slab_err(s, page, "Not a valid slab page"); 728 slab_err(s, page, "Not a valid slab page");
730 return 0; 729 return 0;
731 } 730 }
732 if (page->offset * sizeof(void *) != s->offset) {
733 slab_err(s, page, "Corrupted offset %lu",
734 (unsigned long)(page->offset * sizeof(void *)));
735 return 0;
736 }
737 if (page->inuse > s->objects) { 731 if (page->inuse > s->objects) {
738 slab_err(s, page, "inuse %u > max %u", 732 slab_err(s, page, "inuse %u > max %u",
739 s->name, page->inuse, s->objects); 733 s->name, page->inuse, s->objects);
@@ -872,8 +866,6 @@ bad:
872 slab_fix(s, "Marking all objects used"); 866 slab_fix(s, "Marking all objects used");
873 page->inuse = s->objects; 867 page->inuse = s->objects;
874 page->freelist = NULL; 868 page->freelist = NULL;
875 /* Fix up fields that may be corrupted */
876 page->offset = s->offset / sizeof(void *);
877 } 869 }
878 return 0; 870 return 0;
879} 871}
@@ -988,7 +980,7 @@ __setup("slub_debug", setup_slub_debug);
988 980
989static unsigned long kmem_cache_flags(unsigned long objsize, 981static unsigned long kmem_cache_flags(unsigned long objsize,
990 unsigned long flags, const char *name, 982 unsigned long flags, const char *name,
991 void (*ctor)(void *, struct kmem_cache *, unsigned long)) 983 void (*ctor)(struct kmem_cache *, void *))
992{ 984{
993 /* 985 /*
994 * The page->offset field is only 16 bit wide. This is an offset 986 * The page->offset field is only 16 bit wide. This is an offset
@@ -1035,7 +1027,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1035static inline void add_full(struct kmem_cache_node *n, struct page *page) {} 1027static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
1036static inline unsigned long kmem_cache_flags(unsigned long objsize, 1028static inline unsigned long kmem_cache_flags(unsigned long objsize,
1037 unsigned long flags, const char *name, 1029 unsigned long flags, const char *name,
1038 void (*ctor)(void *, struct kmem_cache *, unsigned long)) 1030 void (*ctor)(struct kmem_cache *, void *))
1039{ 1031{
1040 return flags; 1032 return flags;
1041} 1033}
@@ -1055,6 +1047,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1055 if (s->flags & SLAB_CACHE_DMA) 1047 if (s->flags & SLAB_CACHE_DMA)
1056 flags |= SLUB_DMA; 1048 flags |= SLUB_DMA;
1057 1049
1050 if (s->flags & SLAB_RECLAIM_ACCOUNT)
1051 flags |= __GFP_RECLAIMABLE;
1052
1058 if (node == -1) 1053 if (node == -1)
1059 page = alloc_pages(flags, s->order); 1054 page = alloc_pages(flags, s->order);
1060 else 1055 else
@@ -1076,7 +1071,7 @@ static void setup_object(struct kmem_cache *s, struct page *page,
1076{ 1071{
1077 setup_object_debug(s, page, object); 1072 setup_object_debug(s, page, object);
1078 if (unlikely(s->ctor)) 1073 if (unlikely(s->ctor))
1079 s->ctor(object, s, 0); 1074 s->ctor(s, object);
1080} 1075}
1081 1076
1082static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) 1077static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1088,19 +1083,16 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1088 void *last; 1083 void *last;
1089 void *p; 1084 void *p;
1090 1085
1091 BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); 1086 BUG_ON(flags & GFP_SLAB_BUG_MASK);
1092
1093 if (flags & __GFP_WAIT)
1094 local_irq_enable();
1095 1087
1096 page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); 1088 page = allocate_slab(s,
1089 flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
1097 if (!page) 1090 if (!page)
1098 goto out; 1091 goto out;
1099 1092
1100 n = get_node(s, page_to_nid(page)); 1093 n = get_node(s, page_to_nid(page));
1101 if (n) 1094 if (n)
1102 atomic_long_inc(&n->nr_slabs); 1095 atomic_long_inc(&n->nr_slabs);
1103 page->offset = s->offset / sizeof(void *);
1104 page->slab = s; 1096 page->slab = s;
1105 page->flags |= 1 << PG_slab; 1097 page->flags |= 1 << PG_slab;
1106 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | 1098 if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
@@ -1123,11 +1115,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1123 set_freepointer(s, last, NULL); 1115 set_freepointer(s, last, NULL);
1124 1116
1125 page->freelist = start; 1117 page->freelist = start;
1126 page->lockless_freelist = NULL;
1127 page->inuse = 0; 1118 page->inuse = 0;
1128out: 1119out:
1129 if (flags & __GFP_WAIT)
1130 local_irq_disable();
1131 return page; 1120 return page;
1132} 1121}
1133 1122
@@ -1149,7 +1138,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1149 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1138 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1150 - pages); 1139 - pages);
1151 1140
1152 page->mapping = NULL;
1153 __free_pages(page, s->order); 1141 __free_pages(page, s->order);
1154} 1142}
1155 1143
@@ -1383,33 +1371,34 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page)
1383/* 1371/*
1384 * Remove the cpu slab 1372 * Remove the cpu slab
1385 */ 1373 */
1386static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) 1374static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1387{ 1375{
1376 struct page *page = c->page;
1388 /* 1377 /*
1389 * Merge cpu freelist into freelist. Typically we get here 1378 * Merge cpu freelist into freelist. Typically we get here
1390 * because both freelists are empty. So this is unlikely 1379 * because both freelists are empty. So this is unlikely
1391 * to occur. 1380 * to occur.
1392 */ 1381 */
1393 while (unlikely(page->lockless_freelist)) { 1382 while (unlikely(c->freelist)) {
1394 void **object; 1383 void **object;
1395 1384
1396 /* Retrieve object from cpu_freelist */ 1385 /* Retrieve object from cpu_freelist */
1397 object = page->lockless_freelist; 1386 object = c->freelist;
1398 page->lockless_freelist = page->lockless_freelist[page->offset]; 1387 c->freelist = c->freelist[c->offset];
1399 1388
1400 /* And put onto the regular freelist */ 1389 /* And put onto the regular freelist */
1401 object[page->offset] = page->freelist; 1390 object[c->offset] = page->freelist;
1402 page->freelist = object; 1391 page->freelist = object;
1403 page->inuse--; 1392 page->inuse--;
1404 } 1393 }
1405 s->cpu_slab[cpu] = NULL; 1394 c->page = NULL;
1406 unfreeze_slab(s, page); 1395 unfreeze_slab(s, page);
1407} 1396}
1408 1397
1409static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu) 1398static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1410{ 1399{
1411 slab_lock(page); 1400 slab_lock(c->page);
1412 deactivate_slab(s, page, cpu); 1401 deactivate_slab(s, c);
1413} 1402}
1414 1403
1415/* 1404/*
@@ -1418,18 +1407,17 @@ static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
1418 */ 1407 */
1419static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) 1408static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
1420{ 1409{
1421 struct page *page = s->cpu_slab[cpu]; 1410 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
1422 1411
1423 if (likely(page)) 1412 if (likely(c && c->page))
1424 flush_slab(s, page, cpu); 1413 flush_slab(s, c);
1425} 1414}
1426 1415
1427static void flush_cpu_slab(void *d) 1416static void flush_cpu_slab(void *d)
1428{ 1417{
1429 struct kmem_cache *s = d; 1418 struct kmem_cache *s = d;
1430 int cpu = smp_processor_id();
1431 1419
1432 __flush_cpu_slab(s, cpu); 1420 __flush_cpu_slab(s, smp_processor_id());
1433} 1421}
1434 1422
1435static void flush_all(struct kmem_cache *s) 1423static void flush_all(struct kmem_cache *s)
@@ -1446,6 +1434,19 @@ static void flush_all(struct kmem_cache *s)
1446} 1434}
1447 1435
1448/* 1436/*
1437 * Check if the objects in a per cpu structure fit numa
1438 * locality expectations.
1439 */
1440static inline int node_match(struct kmem_cache_cpu *c, int node)
1441{
1442#ifdef CONFIG_NUMA
1443 if (node != -1 && c->node != node)
1444 return 0;
1445#endif
1446 return 1;
1447}
1448
1449/*
1449 * Slow path. The lockless freelist is empty or we need to perform 1450 * Slow path. The lockless freelist is empty or we need to perform
1450 * debugging duties. 1451 * debugging duties.
1451 * 1452 *
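
node_match() treats node == -1 as "any node will do"; only an explicit node request can force the allocator to give up its cache-hot cpu slab. A small standalone illustration of that rule (not kernel code):

/* Standalone illustration of the node_match() rule. */
#include <stdio.h>

static int node_match(int cpu_slab_node, int requested_node)
{
	if (requested_node != -1 && cpu_slab_node != requested_node)
		return 0;	/* wrong node: deactivate, refill from elsewhere */
	return 1;		/* acceptable: keep using the hot cpu slab */
}

int main(void)
{
	printf("%d\n", node_match(0, -1));	/* 1: caller does not care */
	printf("%d\n", node_match(0,  0));	/* 1: already local */
	printf("%d\n", node_match(0,  1));	/* 0: must refill from node 1 */
	return 0;
}
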
@@ -1463,45 +1464,53 @@ static void flush_all(struct kmem_cache *s)
1463 * we need to allocate a new slab. This is slowest path since we may sleep. 1464 * we need to allocate a new slab. This is slowest path since we may sleep.
1464 */ 1465 */
1465static void *__slab_alloc(struct kmem_cache *s, 1466static void *__slab_alloc(struct kmem_cache *s,
1466 gfp_t gfpflags, int node, void *addr, struct page *page) 1467 gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
1467{ 1468{
1468 void **object; 1469 void **object;
1469 int cpu = smp_processor_id(); 1470 struct page *new;
1470 1471
1471 if (!page) 1472 if (!c->page)
1472 goto new_slab; 1473 goto new_slab;
1473 1474
1474 slab_lock(page); 1475 slab_lock(c->page);
1475 if (unlikely(node != -1 && page_to_nid(page) != node)) 1476 if (unlikely(!node_match(c, node)))
1476 goto another_slab; 1477 goto another_slab;
1477load_freelist: 1478load_freelist:
1478 object = page->freelist; 1479 object = c->page->freelist;
1479 if (unlikely(!object)) 1480 if (unlikely(!object))
1480 goto another_slab; 1481 goto another_slab;
1481 if (unlikely(SlabDebug(page))) 1482 if (unlikely(SlabDebug(c->page)))
1482 goto debug; 1483 goto debug;
1483 1484
1484 object = page->freelist; 1485 object = c->page->freelist;
1485 page->lockless_freelist = object[page->offset]; 1486 c->freelist = object[c->offset];
1486 page->inuse = s->objects; 1487 c->page->inuse = s->objects;
1487 page->freelist = NULL; 1488 c->page->freelist = NULL;
1488 slab_unlock(page); 1489 c->node = page_to_nid(c->page);
1490 slab_unlock(c->page);
1489 return object; 1491 return object;
1490 1492
1491another_slab: 1493another_slab:
1492 deactivate_slab(s, page, cpu); 1494 deactivate_slab(s, c);
1493 1495
1494new_slab: 1496new_slab:
1495 page = get_partial(s, gfpflags, node); 1497 new = get_partial(s, gfpflags, node);
1496 if (page) { 1498 if (new) {
1497 s->cpu_slab[cpu] = page; 1499 c->page = new;
1498 goto load_freelist; 1500 goto load_freelist;
1499 } 1501 }
1500 1502
1501 page = new_slab(s, gfpflags, node); 1503 if (gfpflags & __GFP_WAIT)
1502 if (page) { 1504 local_irq_enable();
1503 cpu = smp_processor_id(); 1505
1504 if (s->cpu_slab[cpu]) { 1506 new = new_slab(s, gfpflags, node);
1507
1508 if (gfpflags & __GFP_WAIT)
1509 local_irq_disable();
1510
1511 if (new) {
1512 c = get_cpu_slab(s, smp_processor_id());
1513 if (c->page) {
1505 /* 1514 /*
1506 * Someone else populated the cpu_slab while we 1515 * Someone else populated the cpu_slab while we
1507 * enabled interrupts, or we have gotten scheduled 1516 * enabled interrupts, or we have gotten scheduled
@@ -1509,34 +1518,33 @@ new_slab:
1509 * requested node even if __GFP_THISNODE was 1518 * requested node even if __GFP_THISNODE was
1510 * specified. So we need to recheck. 1519 * specified. So we need to recheck.
1511 */ 1520 */
1512 if (node == -1 || 1521 if (node_match(c, node)) {
1513 page_to_nid(s->cpu_slab[cpu]) == node) {
1514 /* 1522 /*
1515 * Current cpuslab is acceptable and we 1523 * Current cpuslab is acceptable and we
1516 * want the current one since it's cache hot 1524
1516 * want the current one since it's cache hot 1524
1517 */ 1525 */
1518 discard_slab(s, page); 1526 discard_slab(s, new);
1519 page = s->cpu_slab[cpu]; 1527 slab_lock(c->page);
1520 slab_lock(page);
1521 goto load_freelist; 1528 goto load_freelist;
1522 } 1529 }
1523 /* New slab does not fit our expectations */ 1530 /* New slab does not fit our expectations */
1524 flush_slab(s, s->cpu_slab[cpu], cpu); 1531 flush_slab(s, c);
1525 } 1532 }
1526 slab_lock(page); 1533 slab_lock(new);
1527 SetSlabFrozen(page); 1534 SetSlabFrozen(new);
1528 s->cpu_slab[cpu] = page; 1535 c->page = new;
1529 goto load_freelist; 1536 goto load_freelist;
1530 } 1537 }
1531 return NULL; 1538 return NULL;
1532debug: 1539debug:
1533 object = page->freelist; 1540 object = c->page->freelist;
1534 if (!alloc_debug_processing(s, page, object, addr)) 1541 if (!alloc_debug_processing(s, c->page, object, addr))
1535 goto another_slab; 1542 goto another_slab;
1536 1543
1537 page->inuse++; 1544 c->page->inuse++;
1538 page->freelist = object[page->offset]; 1545 c->page->freelist = object[c->offset];
1539 slab_unlock(page); 1546 c->node = -1;
1547 slab_unlock(c->page);
1540 return object; 1548 return object;
1541} 1549}
1542 1550
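
The load_freelist step above is the heart of the refill: the whole remaining freelist of the page is handed to the per-cpu structure in one go, the page is marked fully in use, and the slab lock is only held for that hand-over. A standalone model of the hand-over, using an invented two-word object layout and free-pointer offset:

/* Standalone model of the load_freelist hand-over; illustrative only. */
#include <stdio.h>
#include <stddef.h>

#define OBJECTS 4

struct page_state {
	void *freelist;      /* shared list, normally guarded by the slab lock */
	int inuse;
};

struct cpu_state {
	void *freelist;      /* lockless per-CPU list */
	unsigned int offset; /* word index of the free pointer in an object */
};

int main(void)
{
	void *objs[OBJECTS][2];         /* word 0: payload, word 1: free pointer */
	struct page_state page = { NULL, 0 };
	struct cpu_state c = { NULL, 1 };

	/* Build the page freelist: obj0 -> obj1 -> obj2 -> obj3 -> NULL. */
	for (int i = 0; i < OBJECTS; i++)
		objs[i][c.offset] = (i + 1 < OBJECTS) ? objs[i + 1] : NULL;
	page.freelist = objs[0];

	/* load_freelist: the first object goes to the caller, the rest of the
	 * chain becomes the per-CPU freelist, and the page looks full. */
	void **object = page.freelist;
	c.freelist = object[c.offset];
	page.inuse = OBJECTS;
	page.freelist = NULL;

	printf("allocated %p, next lockless object %p, page inuse %d\n",
	       (void *)object, c.freelist, page.inuse);
	return 0;
}
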
@@ -1553,25 +1561,24 @@ debug:
1553static void __always_inline *slab_alloc(struct kmem_cache *s, 1561static void __always_inline *slab_alloc(struct kmem_cache *s,
1554 gfp_t gfpflags, int node, void *addr) 1562 gfp_t gfpflags, int node, void *addr)
1555{ 1563{
1556 struct page *page;
1557 void **object; 1564 void **object;
1558 unsigned long flags; 1565 unsigned long flags;
1566 struct kmem_cache_cpu *c;
1559 1567
1560 local_irq_save(flags); 1568 local_irq_save(flags);
1561 page = s->cpu_slab[smp_processor_id()]; 1569 c = get_cpu_slab(s, smp_processor_id());
1562 if (unlikely(!page || !page->lockless_freelist || 1570 if (unlikely(!c->freelist || !node_match(c, node)))
1563 (node != -1 && page_to_nid(page) != node)))
1564 1571
1565 object = __slab_alloc(s, gfpflags, node, addr, page); 1572 object = __slab_alloc(s, gfpflags, node, addr, c);
1566 1573
1567 else { 1574 else {
1568 object = page->lockless_freelist; 1575 object = c->freelist;
1569 page->lockless_freelist = object[page->offset]; 1576 c->freelist = object[c->offset];
1570 } 1577 }
1571 local_irq_restore(flags); 1578 local_irq_restore(flags);
1572 1579
1573 if (unlikely((gfpflags & __GFP_ZERO) && object)) 1580 if (unlikely((gfpflags & __GFP_ZERO) && object))
1574 memset(object, 0, s->objsize); 1581 memset(object, 0, c->objsize);
1575 1582
1576 return object; 1583 return object;
1577} 1584}
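
With the refill handled in __slab_alloc(), the fast path above is just a pop from the per-cpu freelist under local_irq_save(): follow the free pointer stored inside the object at word index c->offset. A standalone sketch of that pop (layout and offset are illustrative, not kernel code):

/* Standalone sketch of the lockless fast-path pop. */
#include <stdio.h>

static void *fastpath_pop(void **freelist, unsigned int offset)
{
	void **object = *freelist;          /* head of the per-CPU list */
	if (!object)
		return NULL;                /* empty: caller takes the slow path */
	*freelist = object[offset];         /* new head = stored free pointer */
	return object;
}

int main(void)
{
	void *a[2], *b[2];
	a[0] = b;  b[0] = NULL;             /* chain: a -> b -> NULL */
	void *list = a;

	printf("%p\n", fastpath_pop(&list, 0));   /* returns a */
	printf("%p\n", fastpath_pop(&list, 0));   /* returns b */
	printf("%p\n", fastpath_pop(&list, 0));   /* NULL: slow path needed */
	return 0;
}
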
@@ -1599,7 +1606,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node);
1599 * handling required then we can return immediately. 1606 * handling required then we can return immediately.
1600 */ 1607 */
1601static void __slab_free(struct kmem_cache *s, struct page *page, 1608static void __slab_free(struct kmem_cache *s, struct page *page,
1602 void *x, void *addr) 1609 void *x, void *addr, unsigned int offset)
1603{ 1610{
1604 void *prior; 1611 void *prior;
1605 void **object = (void *)x; 1612 void **object = (void *)x;
@@ -1609,7 +1616,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
1609 if (unlikely(SlabDebug(page))) 1616 if (unlikely(SlabDebug(page)))
1610 goto debug; 1617 goto debug;
1611checks_ok: 1618checks_ok:
1612 prior = object[page->offset] = page->freelist; 1619 prior = object[offset] = page->freelist;
1613 page->freelist = object; 1620 page->freelist = object;
1614 page->inuse--; 1621 page->inuse--;
1615 1622
@@ -1664,15 +1671,16 @@ static void __always_inline slab_free(struct kmem_cache *s,
1664{ 1671{
1665 void **object = (void *)x; 1672 void **object = (void *)x;
1666 unsigned long flags; 1673 unsigned long flags;
1674 struct kmem_cache_cpu *c;
1667 1675
1668 local_irq_save(flags); 1676 local_irq_save(flags);
1669 debug_check_no_locks_freed(object, s->objsize); 1677 debug_check_no_locks_freed(object, s->objsize);
1670 if (likely(page == s->cpu_slab[smp_processor_id()] && 1678 c = get_cpu_slab(s, smp_processor_id());
1671 !SlabDebug(page))) { 1679 if (likely(page == c->page && c->node >= 0)) {
1672 object[page->offset] = page->lockless_freelist; 1680 object[c->offset] = c->freelist;
1673 page->lockless_freelist = object; 1681 c->freelist = object;
1674 } else 1682 } else
1675 __slab_free(s, page, x, addr); 1683 __slab_free(s, page, x, addr, c->offset);
1676 1684
1677 local_irq_restore(flags); 1685 local_irq_restore(flags);
1678} 1686}
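
The free fast path is the mirror image: when the object belongs to the current cpu slab it is pushed onto c->freelist without touching the page, and the c->node >= 0 test doubles as the "no debugging" check because the debug path in __slab_alloc() sets c->node to -1. A standalone sketch of the push (illustrative only):

/* Standalone sketch of the fast-path free: push onto the per-CPU list. */
#include <stdio.h>

static void fastpath_push(void **freelist, void *obj, unsigned int offset)
{
	void **object = obj;
	object[offset] = *freelist;   /* object now points at the old head */
	*freelist = object;           /* and becomes the new head */
}

int main(void)
{
	void *a[1], *b[1];
	void *list = NULL;

	fastpath_push(&list, a, 0);
	fastpath_push(&list, b, 0);   /* list is now b -> a -> NULL */
	printf("head %p, a at %p, b at %p\n", list, (void *)a, (void *)b);
	return 0;
}
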
@@ -1759,14 +1767,6 @@ static inline int slab_order(int size, int min_objects,
1759 int rem; 1767 int rem;
1760 int min_order = slub_min_order; 1768 int min_order = slub_min_order;
1761 1769
1762 /*
1763 * If we would create too many object per slab then reduce
1764 * the slab order even if it goes below slub_min_order.
1765 */
1766 while (min_order > 0 &&
1767 (PAGE_SIZE << min_order) >= MAX_OBJECTS_PER_SLAB * size)
1768 min_order--;
1769
1770 for (order = max(min_order, 1770 for (order = max(min_order,
1771 fls(min_objects * size - 1) - PAGE_SHIFT); 1771 fls(min_objects * size - 1) - PAGE_SHIFT);
1772 order <= max_order; order++) { 1772 order <= max_order; order++) {
@@ -1781,9 +1781,6 @@ static inline int slab_order(int size, int min_objects,
1781 if (rem <= slab_size / fract_leftover) 1781 if (rem <= slab_size / fract_leftover)
1782 break; 1782 break;
1783 1783
1784 /* If the next size is too high then exit now */
1785 if (slab_size * 2 >= MAX_OBJECTS_PER_SLAB * size)
1786 break;
1787 } 1784 }
1788 1785
1789 return order; 1786 return order;
@@ -1858,6 +1855,16 @@ static unsigned long calculate_alignment(unsigned long flags,
1858 return ALIGN(align, sizeof(void *)); 1855 return ALIGN(align, sizeof(void *));
1859} 1856}
1860 1857
1858static void init_kmem_cache_cpu(struct kmem_cache *s,
1859 struct kmem_cache_cpu *c)
1860{
1861 c->page = NULL;
1862 c->freelist = NULL;
1863 c->node = 0;
1864 c->offset = s->offset / sizeof(void *);
1865 c->objsize = s->objsize;
1866}
1867
1861static void init_kmem_cache_node(struct kmem_cache_node *n) 1868static void init_kmem_cache_node(struct kmem_cache_node *n)
1862{ 1869{
1863 n->nr_partial = 0; 1870 n->nr_partial = 0;
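
init_kmem_cache_cpu() caches two values so the fast paths never have to dereference struct kmem_cache: the object size used for the __GFP_ZERO memset, and the free-pointer location converted from a byte offset (s->offset) into an index into pointer-sized words. A worked example of that conversion, using an assumed byte offset:

/* Worked example of the offset conversion in init_kmem_cache_cpu(). */
#include <stdio.h>

int main(void)
{
	unsigned long byte_offset = 16;                 /* assumed s->offset */
	unsigned int word_index = byte_offset / sizeof(void *);

	/* On a 64-bit build this prints 2: object[2] holds the free pointer. */
	printf("free pointer lives at word index %u\n", word_index);
	return 0;
}
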
@@ -1869,6 +1876,131 @@ static void init_kmem_cache_node(struct kmem_cache_node *n)
1869#endif 1876#endif
1870} 1877}
1871 1878
1879#ifdef CONFIG_SMP
1880/*
1881 * Per cpu array for per cpu structures.
1882 *
1883 * The per cpu array places all kmem_cache_cpu structures from one processor
1884 * close together meaning that it becomes possible that multiple per cpu
1885 * structures are contained in one cacheline. This may be particularly
1886 * beneficial for the kmalloc caches.
1887 *
1888 * A desktop system typically has around 60-80 slabs. With 100 here we are
1889 * likely able to get per cpu structures for all caches from the array defined
1890 * here. We must be able to cover all kmalloc caches during bootstrap.
1891 *
1892 * If the per cpu array is exhausted then fall back to kmalloc
1893 * of individual cachelines. No sharing is possible then.
1894 */
1895#define NR_KMEM_CACHE_CPU 100
1896
1897static DEFINE_PER_CPU(struct kmem_cache_cpu,
1898 kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
1899
1900static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
1901static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE;
1902
1903static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
1904 int cpu, gfp_t flags)
1905{
1906 struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu);
1907
1908 if (c)
1909 per_cpu(kmem_cache_cpu_free, cpu) =
1910 (void *)c->freelist;
1911 else {
1912 /* Table overflow: So allocate ourselves */
1913 c = kmalloc_node(
1914 ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()),
1915 flags, cpu_to_node(cpu));
1916 if (!c)
1917 return NULL;
1918 }
1919
1920 init_kmem_cache_cpu(s, c);
1921 return c;
1922}
1923
1924static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
1925{
1926 if (c < per_cpu(kmem_cache_cpu, cpu) ||
1927 c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
1928 kfree(c);
1929 return;
1930 }
1931 c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
1932 per_cpu(kmem_cache_cpu_free, cpu) = c;
1933}
1934
1935static void free_kmem_cache_cpus(struct kmem_cache *s)
1936{
1937 int cpu;
1938
1939 for_each_online_cpu(cpu) {
1940 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
1941
1942 if (c) {
1943 s->cpu_slab[cpu] = NULL;
1944 free_kmem_cache_cpu(c, cpu);
1945 }
1946 }
1947}
1948
1949static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
1950{
1951 int cpu;
1952
1953 for_each_online_cpu(cpu) {
1954 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
1955
1956 if (c)
1957 continue;
1958
1959 c = alloc_kmem_cache_cpu(s, cpu, flags);
1960 if (!c) {
1961 free_kmem_cache_cpus(s);
1962 return 0;
1963 }
1964 s->cpu_slab[cpu] = c;
1965 }
1966 return 1;
1967}
1968
1969/*
1970 * Initialize the per cpu array.
1971 */
1972static void init_alloc_cpu_cpu(int cpu)
1973{
1974 int i;
1975
1976 if (cpu_isset(cpu, kmem_cach_cpu_free_init_once))
1977 return;
1978
1979 for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
1980 free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
1981
1982 cpu_set(cpu, kmem_cach_cpu_free_init_once);
1983}
1984
1985static void __init init_alloc_cpu(void)
1986{
1987 int cpu;
1988
1989 for_each_online_cpu(cpu)
1990 init_alloc_cpu_cpu(cpu);
1991 }
1992
1993#else
1994static inline void free_kmem_cache_cpus(struct kmem_cache *s) {}
1995static inline void init_alloc_cpu(void) {}
1996
1997static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
1998{
1999 init_kmem_cache_cpu(s, &s->cpu_slab);
2000 return 1;
2001}
2002#endif
2003
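
The bootstrap pool above threads its spare kmem_cache_cpu entries into a per-cpu free list by reusing their freelist field, falls back to kmalloc_node() once the reserved entries run out, and tells the two kinds apart on free with an address-range check. A standalone model of that reservation scheme; the pool size and field names are illustrative, not the kernel's:

/* Standalone model of a static pool threaded through one of its own fields,
 * with malloc() as the overflow fallback. */
#include <stdio.h>
#include <stdlib.h>

#define POOL_SIZE 4                       /* NR_KMEM_CACHE_CPU is 100 */

struct entry { void *freelist; int data; };

static struct entry pool[POOL_SIZE];
static struct entry *pool_free;           /* head of the threaded free list */

static void pool_init(void)
{
	for (int i = POOL_SIZE - 1; i >= 0; i--) {
		pool[i].freelist = pool_free; /* reuse the field as a link */
		pool_free = &pool[i];
	}
}

static struct entry *entry_alloc(void)
{
	struct entry *e = pool_free;
	if (e)
		pool_free = e->freelist;      /* pop from the reserved pool */
	else
		e = malloc(sizeof(*e));       /* pool exhausted: fall back */
	return e;
}

static void entry_free(struct entry *e)
{
	if (e < pool || e >= pool + POOL_SIZE) {
		free(e);                      /* came from the heap */
		return;
	}
	e->freelist = pool_free;              /* push back onto the pool */
	pool_free = e;
}

int main(void)
{
	pool_init();
	struct entry *a = entry_alloc();
	printf("from pool: %d\n", a >= pool && a < pool + POOL_SIZE);
	entry_free(a);
	return 0;
}
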
1872#ifdef CONFIG_NUMA 2004#ifdef CONFIG_NUMA
1873/* 2005/*
1874 * No kmalloc_node yet so do it by hand. We know that this is the first 2006 * No kmalloc_node yet so do it by hand. We know that this is the first
@@ -1876,10 +2008,11 @@ static void init_kmem_cache_node(struct kmem_cache_node *n)
1876 * possible. 2008 * possible.
1877 * 2009 *
1878 * Note that this function only works on the kmalloc_node_cache 2010 * Note that this function only works on the kmalloc_node_cache
1879 * when allocating for the kmalloc_node_cache. 2011 * when allocating for the kmalloc_node_cache. This is used for bootstrapping
2012 * memory on a fresh node that has no slab structures yet.
1880 */ 2013 */
1881static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags, 2014static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags,
1882 int node) 2015 int node)
1883{ 2016{
1884 struct page *page; 2017 struct page *page;
1885 struct kmem_cache_node *n; 2018 struct kmem_cache_node *n;
@@ -1908,12 +2041,6 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag
1908 init_kmem_cache_node(n); 2041 init_kmem_cache_node(n);
1909 atomic_long_inc(&n->nr_slabs); 2042 atomic_long_inc(&n->nr_slabs);
1910 add_partial(n, page); 2043 add_partial(n, page);
1911
1912 /*
1913 * new_slab() disables interupts. If we do not reenable interrupts here
1914 * then bootup would continue with interrupts disabled.
1915 */
1916 local_irq_enable();
1917 return n; 2044 return n;
1918} 2045}
1919 2046
@@ -1921,7 +2048,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
1921{ 2048{
1922 int node; 2049 int node;
1923 2050
1924 for_each_online_node(node) { 2051 for_each_node_state(node, N_NORMAL_MEMORY) {
1925 struct kmem_cache_node *n = s->node[node]; 2052 struct kmem_cache_node *n = s->node[node];
1926 if (n && n != &s->local_node) 2053 if (n && n != &s->local_node)
1927 kmem_cache_free(kmalloc_caches, n); 2054 kmem_cache_free(kmalloc_caches, n);
@@ -1939,7 +2066,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
1939 else 2066 else
1940 local_node = 0; 2067 local_node = 0;
1941 2068
1942 for_each_online_node(node) { 2069 for_each_node_state(node, N_NORMAL_MEMORY) {
1943 struct kmem_cache_node *n; 2070 struct kmem_cache_node *n;
1944 2071
1945 if (local_node == node) 2072 if (local_node == node)
@@ -2077,21 +2204,14 @@ static int calculate_sizes(struct kmem_cache *s)
2077 */ 2204 */
2078 s->objects = (PAGE_SIZE << s->order) / size; 2205 s->objects = (PAGE_SIZE << s->order) / size;
2079 2206
2080 /* 2207 return !!s->objects;
2081 * Verify that the number of objects is within permitted limits.
2082 * The page->inuse field is only 16 bit wide! So we cannot have
2083 * more than 64k objects per slab.
2084 */
2085 if (!s->objects || s->objects > MAX_OBJECTS_PER_SLAB)
2086 return 0;
2087 return 1;
2088 2208
2089} 2209}
2090 2210
2091static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, 2211static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2092 const char *name, size_t size, 2212 const char *name, size_t size,
2093 size_t align, unsigned long flags, 2213 size_t align, unsigned long flags,
2094 void (*ctor)(void *, struct kmem_cache *, unsigned long)) 2214 void (*ctor)(struct kmem_cache *, void *))
2095{ 2215{
2096 memset(s, 0, kmem_size); 2216 memset(s, 0, kmem_size);
2097 s->name = name; 2217 s->name = name;
@@ -2107,9 +2227,12 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
2107#ifdef CONFIG_NUMA 2227#ifdef CONFIG_NUMA
2108 s->defrag_ratio = 100; 2228 s->defrag_ratio = 100;
2109#endif 2229#endif
2230 if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
2231 goto error;
2110 2232
2111 if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) 2233 if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
2112 return 1; 2234 return 1;
2235 free_kmem_cache_nodes(s);
2113error: 2236error:
2114 if (flags & SLAB_PANIC) 2237 if (flags & SLAB_PANIC)
2115 panic("Cannot create slab %s size=%lu realsize=%u " 2238 panic("Cannot create slab %s size=%lu realsize=%u "
@@ -2192,7 +2315,8 @@ static inline int kmem_cache_close(struct kmem_cache *s)
2192 flush_all(s); 2315 flush_all(s);
2193 2316
2194 /* Attempt to free all objects */ 2317 /* Attempt to free all objects */
2195 for_each_online_node(node) { 2318 free_kmem_cache_cpus(s);
2319 for_each_node_state(node, N_NORMAL_MEMORY) {
2196 struct kmem_cache_node *n = get_node(s, node); 2320 struct kmem_cache_node *n = get_node(s, node);
2197 2321
2198 n->nr_partial -= free_list(s, n, &n->partial); 2322 n->nr_partial -= free_list(s, n, &n->partial);
@@ -2227,11 +2351,11 @@ EXPORT_SYMBOL(kmem_cache_destroy);
2227 * Kmalloc subsystem 2351 * Kmalloc subsystem
2228 *******************************************************************/ 2352 *******************************************************************/
2229 2353
2230struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; 2354struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned;
2231EXPORT_SYMBOL(kmalloc_caches); 2355EXPORT_SYMBOL(kmalloc_caches);
2232 2356
2233#ifdef CONFIG_ZONE_DMA 2357#ifdef CONFIG_ZONE_DMA
2234static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1]; 2358static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT];
2235#endif 2359#endif
2236 2360
2237static int __init setup_slub_min_order(char *str) 2361static int __init setup_slub_min_order(char *str)
@@ -2397,12 +2521,8 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2397 return ZERO_SIZE_PTR; 2521 return ZERO_SIZE_PTR;
2398 2522
2399 index = size_index[(size - 1) / 8]; 2523 index = size_index[(size - 1) / 8];
2400 } else { 2524 } else
2401 if (size > KMALLOC_MAX_SIZE)
2402 return NULL;
2403
2404 index = fls(size - 1); 2525 index = fls(size - 1);
2405 }
2406 2526
2407#ifdef CONFIG_ZONE_DMA 2527#ifdef CONFIG_ZONE_DMA
2408 if (unlikely((flags & SLUB_DMA))) 2528 if (unlikely((flags & SLUB_DMA)))
@@ -2414,9 +2534,15 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2414 2534
2415void *__kmalloc(size_t size, gfp_t flags) 2535void *__kmalloc(size_t size, gfp_t flags)
2416{ 2536{
2417 struct kmem_cache *s = get_slab(size, flags); 2537 struct kmem_cache *s;
2418 2538
2419 if (ZERO_OR_NULL_PTR(s)) 2539 if (unlikely(size > PAGE_SIZE / 2))
2540 return (void *)__get_free_pages(flags | __GFP_COMP,
2541 get_order(size));
2542
2543 s = get_slab(size, flags);
2544
2545 if (unlikely(ZERO_OR_NULL_PTR(s)))
2420 return s; 2546 return s;
2421 2547
2422 return slab_alloc(s, flags, -1, __builtin_return_address(0)); 2548 return slab_alloc(s, flags, -1, __builtin_return_address(0));
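
Two related changes meet here: get_slab() no longer caps sizes itself because kmalloc_caches[] now only covers power-of-two buckets below PAGE_SIZE, and __kmalloc() routes anything above PAGE_SIZE/2 straight to the page allocator as a compound page (kfree() detects that case via PageSlab(), see below). A worked example of the bucket selection used for the larger sizes that go through fls(size - 1); smaller sizes use the size_index table kept above. The fls helper below is a plain C stand-in:

/* Worked example of the fls(size - 1) kmalloc bucket selection. */
#include <stdio.h>

static int fls_long(unsigned long x)   /* index of the highest set bit, 1-based */
{
	int r = 0;
	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

int main(void)
{
	unsigned long sizes[] = { 256, 300, 1024, 2048, 2049 };
	for (int i = 0; i < 5; i++)
		printf("size %4lu -> kmalloc-%lu (index %d)\n", sizes[i],
		       1UL << fls_long(sizes[i] - 1), fls_long(sizes[i] - 1));
	return 0;
}
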
@@ -2426,9 +2552,15 @@ EXPORT_SYMBOL(__kmalloc);
2426#ifdef CONFIG_NUMA 2552#ifdef CONFIG_NUMA
2427void *__kmalloc_node(size_t size, gfp_t flags, int node) 2553void *__kmalloc_node(size_t size, gfp_t flags, int node)
2428{ 2554{
2429 struct kmem_cache *s = get_slab(size, flags); 2555 struct kmem_cache *s;
2430 2556
2431 if (ZERO_OR_NULL_PTR(s)) 2557 if (unlikely(size > PAGE_SIZE / 2))
2558 return (void *)__get_free_pages(flags | __GFP_COMP,
2559 get_order(size));
2560
2561 s = get_slab(size, flags);
2562
2563 if (unlikely(ZERO_OR_NULL_PTR(s)))
2432 return s; 2564 return s;
2433 2565
2434 return slab_alloc(s, flags, node, __builtin_return_address(0)); 2566 return slab_alloc(s, flags, node, __builtin_return_address(0));
@@ -2441,7 +2573,8 @@ size_t ksize(const void *object)
2441 struct page *page; 2573 struct page *page;
2442 struct kmem_cache *s; 2574 struct kmem_cache *s;
2443 2575
2444 if (ZERO_OR_NULL_PTR(object)) 2576 BUG_ON(!object);
2577 if (unlikely(object == ZERO_SIZE_PTR))
2445 return 0; 2578 return 0;
2446 2579
2447 page = get_object_page(object); 2580 page = get_object_page(object);
@@ -2473,22 +2606,17 @@ EXPORT_SYMBOL(ksize);
2473 2606
2474void kfree(const void *x) 2607void kfree(const void *x)
2475{ 2608{
2476 struct kmem_cache *s;
2477 struct page *page; 2609 struct page *page;
2478 2610
2479 /* 2611 if (unlikely(ZERO_OR_NULL_PTR(x)))
2480 * This has to be an unsigned comparison. According to Linus
2481 * some gcc version treat a pointer as a signed entity. Then
2482 * this comparison would be true for all "negative" pointers
2483 * (which would cover the whole upper half of the address space).
2484 */
2485 if (ZERO_OR_NULL_PTR(x))
2486 return; 2612 return;
2487 2613
2488 page = virt_to_head_page(x); 2614 page = virt_to_head_page(x);
2489 s = page->slab; 2615 if (unlikely(!PageSlab(page))) {
2490 2616 put_page(page);
2491 slab_free(s, page, (void *)x, __builtin_return_address(0)); 2617 return;
2618 }
2619 slab_free(page->slab, page, (void *)x, __builtin_return_address(0));
2492} 2620}
2493EXPORT_SYMBOL(kfree); 2621EXPORT_SYMBOL(kfree);
2494 2622
@@ -2517,7 +2645,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
2517 return -ENOMEM; 2645 return -ENOMEM;
2518 2646
2519 flush_all(s); 2647 flush_all(s);
2520 for_each_online_node(node) { 2648 for_each_node_state(node, N_NORMAL_MEMORY) {
2521 n = get_node(s, node); 2649 n = get_node(s, node);
2522 2650
2523 if (!n->nr_partial) 2651 if (!n->nr_partial)
@@ -2575,6 +2703,8 @@ void __init kmem_cache_init(void)
2575 int i; 2703 int i;
2576 int caches = 0; 2704 int caches = 0;
2577 2705
2706 init_alloc_cpu();
2707
2578#ifdef CONFIG_NUMA 2708#ifdef CONFIG_NUMA
2579 /* 2709 /*
2580 * Must first have the slab cache available for the allocations of the 2710 * Must first have the slab cache available for the allocations of the
@@ -2602,7 +2732,7 @@ void __init kmem_cache_init(void)
2602 caches++; 2732 caches++;
2603 } 2733 }
2604 2734
2605 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { 2735 for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) {
2606 create_kmalloc_cache(&kmalloc_caches[i], 2736 create_kmalloc_cache(&kmalloc_caches[i],
2607 "kmalloc", 1 << i, GFP_KERNEL); 2737 "kmalloc", 1 << i, GFP_KERNEL);
2608 caches++; 2738 caches++;
@@ -2629,16 +2759,18 @@ void __init kmem_cache_init(void)
2629 slab_state = UP; 2759 slab_state = UP;
2630 2760
2631 /* Provide the correct kmalloc names now that the caches are up */ 2761 /* Provide the correct kmalloc names now that the caches are up */
2632 for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) 2762 for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++)
2633 kmalloc_caches[i]. name = 2763 kmalloc_caches[i]. name =
2634 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); 2764 kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
2635 2765
2636#ifdef CONFIG_SMP 2766#ifdef CONFIG_SMP
2637 register_cpu_notifier(&slab_notifier); 2767 register_cpu_notifier(&slab_notifier);
2768 kmem_size = offsetof(struct kmem_cache, cpu_slab) +
2769 nr_cpu_ids * sizeof(struct kmem_cache_cpu *);
2770#else
2771 kmem_size = sizeof(struct kmem_cache);
2638#endif 2772#endif
2639 2773
2640 kmem_size = offsetof(struct kmem_cache, cpu_slab) +
2641 nr_cpu_ids * sizeof(struct page *);
2642 2774
2643 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," 2775 printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
2644 " CPUs=%d, Nodes=%d\n", 2776 " CPUs=%d, Nodes=%d\n",
@@ -2669,7 +2801,7 @@ static int slab_unmergeable(struct kmem_cache *s)
2669 2801
2670static struct kmem_cache *find_mergeable(size_t size, 2802static struct kmem_cache *find_mergeable(size_t size,
2671 size_t align, unsigned long flags, const char *name, 2803 size_t align, unsigned long flags, const char *name,
2672 void (*ctor)(void *, struct kmem_cache *, unsigned long)) 2804 void (*ctor)(struct kmem_cache *, void *))
2673{ 2805{
2674 struct kmem_cache *s; 2806 struct kmem_cache *s;
2675 2807
@@ -2710,19 +2842,28 @@ static struct kmem_cache *find_mergeable(size_t size,
2710 2842
2711struct kmem_cache *kmem_cache_create(const char *name, size_t size, 2843struct kmem_cache *kmem_cache_create(const char *name, size_t size,
2712 size_t align, unsigned long flags, 2844 size_t align, unsigned long flags,
2713 void (*ctor)(void *, struct kmem_cache *, unsigned long)) 2845 void (*ctor)(struct kmem_cache *, void *))
2714{ 2846{
2715 struct kmem_cache *s; 2847 struct kmem_cache *s;
2716 2848
2717 down_write(&slub_lock); 2849 down_write(&slub_lock);
2718 s = find_mergeable(size, align, flags, name, ctor); 2850 s = find_mergeable(size, align, flags, name, ctor);
2719 if (s) { 2851 if (s) {
2852 int cpu;
2853
2720 s->refcount++; 2854 s->refcount++;
2721 /* 2855 /*
2722 * Adjust the object sizes so that we clear 2856 * Adjust the object sizes so that we clear
2723 * the complete object on kzalloc. 2857 * the complete object on kzalloc.
2724 */ 2858 */
2725 s->objsize = max(s->objsize, (int)size); 2859 s->objsize = max(s->objsize, (int)size);
2860
2861 /*
2862 * And then we need to update the object size in the
2863 * per cpu structures
2864 */
2865 for_each_online_cpu(cpu)
2866 get_cpu_slab(s, cpu)->objsize = s->objsize;
2726 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 2867 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
2727 up_write(&slub_lock); 2868 up_write(&slub_lock);
2728 if (sysfs_slab_alias(s, name)) 2869 if (sysfs_slab_alias(s, name))
@@ -2765,15 +2906,29 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
2765 unsigned long flags; 2906 unsigned long flags;
2766 2907
2767 switch (action) { 2908 switch (action) {
2909 case CPU_UP_PREPARE:
2910 case CPU_UP_PREPARE_FROZEN:
2911 init_alloc_cpu_cpu(cpu);
2912 down_read(&slub_lock);
2913 list_for_each_entry(s, &slab_caches, list)
2914 s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu,
2915 GFP_KERNEL);
2916 up_read(&slub_lock);
2917 break;
2918
2768 case CPU_UP_CANCELED: 2919 case CPU_UP_CANCELED:
2769 case CPU_UP_CANCELED_FROZEN: 2920 case CPU_UP_CANCELED_FROZEN:
2770 case CPU_DEAD: 2921 case CPU_DEAD:
2771 case CPU_DEAD_FROZEN: 2922 case CPU_DEAD_FROZEN:
2772 down_read(&slub_lock); 2923 down_read(&slub_lock);
2773 list_for_each_entry(s, &slab_caches, list) { 2924 list_for_each_entry(s, &slab_caches, list) {
2925 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
2926
2774 local_irq_save(flags); 2927 local_irq_save(flags);
2775 __flush_cpu_slab(s, cpu); 2928 __flush_cpu_slab(s, cpu);
2776 local_irq_restore(flags); 2929 local_irq_restore(flags);
2930 free_kmem_cache_cpu(c, cpu);
2931 s->cpu_slab[cpu] = NULL;
2777 } 2932 }
2778 up_read(&slub_lock); 2933 up_read(&slub_lock);
2779 break; 2934 break;
@@ -2790,9 +2945,14 @@ static struct notifier_block __cpuinitdata slab_notifier =
2790 2945
2791void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) 2946void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
2792{ 2947{
2793 struct kmem_cache *s = get_slab(size, gfpflags); 2948 struct kmem_cache *s;
2949
2950 if (unlikely(size > PAGE_SIZE / 2))
2951 return (void *)__get_free_pages(gfpflags | __GFP_COMP,
2952 get_order(size));
2953 s = get_slab(size, gfpflags);
2794 2954
2795 if (ZERO_OR_NULL_PTR(s)) 2955 if (unlikely(ZERO_OR_NULL_PTR(s)))
2796 return s; 2956 return s;
2797 2957
2798 return slab_alloc(s, gfpflags, -1, caller); 2958 return slab_alloc(s, gfpflags, -1, caller);
@@ -2801,9 +2961,14 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
2801void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 2961void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
2802 int node, void *caller) 2962 int node, void *caller)
2803{ 2963{
2804 struct kmem_cache *s = get_slab(size, gfpflags); 2964 struct kmem_cache *s;
2965
2966 if (unlikely(size > PAGE_SIZE / 2))
2967 return (void *)__get_free_pages(gfpflags | __GFP_COMP,
2968 get_order(size));
2969 s = get_slab(size, gfpflags);
2805 2970
2806 if (ZERO_OR_NULL_PTR(s)) 2971 if (unlikely(ZERO_OR_NULL_PTR(s)))
2807 return s; 2972 return s;
2808 2973
2809 return slab_alloc(s, gfpflags, node, caller); 2974 return slab_alloc(s, gfpflags, node, caller);
@@ -2902,7 +3067,7 @@ static long validate_slab_cache(struct kmem_cache *s)
2902 return -ENOMEM; 3067 return -ENOMEM;
2903 3068
2904 flush_all(s); 3069 flush_all(s);
2905 for_each_online_node(node) { 3070 for_each_node_state(node, N_NORMAL_MEMORY) {
2906 struct kmem_cache_node *n = get_node(s, node); 3071 struct kmem_cache_node *n = get_node(s, node);
2907 3072
2908 count += validate_slab_node(s, n, map); 3073 count += validate_slab_node(s, n, map);
@@ -3116,13 +3281,13 @@ static int list_locations(struct kmem_cache *s, char *buf,
3116 int node; 3281 int node;
3117 3282
3118 if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), 3283 if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
3119 GFP_KERNEL)) 3284 GFP_TEMPORARY))
3120 return sprintf(buf, "Out of memory\n"); 3285 return sprintf(buf, "Out of memory\n");
3121 3286
3122 /* Push back cpu slabs */ 3287 /* Push back cpu slabs */
3123 flush_all(s); 3288 flush_all(s);
3124 3289
3125 for_each_online_node(node) { 3290 for_each_node_state(node, N_NORMAL_MEMORY) {
3126 struct kmem_cache_node *n = get_node(s, node); 3291 struct kmem_cache_node *n = get_node(s, node);
3127 unsigned long flags; 3292 unsigned long flags;
3128 struct page *page; 3293 struct page *page;
@@ -3230,11 +3395,18 @@ static unsigned long slab_objects(struct kmem_cache *s,
3230 per_cpu = nodes + nr_node_ids; 3395 per_cpu = nodes + nr_node_ids;
3231 3396
3232 for_each_possible_cpu(cpu) { 3397 for_each_possible_cpu(cpu) {
3233 struct page *page = s->cpu_slab[cpu]; 3398 struct page *page;
3234 int node; 3399 int node;
3400 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3235 3401
3402 if (!c)
3403 continue;
3404
3405 page = c->page;
3406 node = c->node;
3407 if (node < 0)
3408 continue;
3236 if (page) { 3409 if (page) {
3237 node = page_to_nid(page);
3238 if (flags & SO_CPU) { 3410 if (flags & SO_CPU) {
3239 int x = 0; 3411 int x = 0;
3240 3412
@@ -3249,7 +3421,7 @@ static unsigned long slab_objects(struct kmem_cache *s,
3249 } 3421 }
3250 } 3422 }
3251 3423
3252 for_each_online_node(node) { 3424 for_each_node_state(node, N_NORMAL_MEMORY) {
3253 struct kmem_cache_node *n = get_node(s, node); 3425 struct kmem_cache_node *n = get_node(s, node);
3254 3426
3255 if (flags & SO_PARTIAL) { 3427 if (flags & SO_PARTIAL) {
@@ -3277,7 +3449,7 @@ static unsigned long slab_objects(struct kmem_cache *s,
3277 3449
3278 x = sprintf(buf, "%lu", total); 3450 x = sprintf(buf, "%lu", total);
3279#ifdef CONFIG_NUMA 3451#ifdef CONFIG_NUMA
3280 for_each_online_node(node) 3452 for_each_node_state(node, N_NORMAL_MEMORY)
3281 if (nodes[node]) 3453 if (nodes[node])
3282 x += sprintf(buf + x, " N%d=%lu", 3454 x += sprintf(buf + x, " N%d=%lu",
3283 node, nodes[node]); 3455 node, nodes[node]);
@@ -3291,13 +3463,19 @@ static int any_slab_objects(struct kmem_cache *s)
3291 int node; 3463 int node;
3292 int cpu; 3464 int cpu;
3293 3465
3294 for_each_possible_cpu(cpu) 3466 for_each_possible_cpu(cpu) {
3295 if (s->cpu_slab[cpu]) 3467 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
3468
3469 if (c && c->page)
3296 return 1; 3470 return 1;
3471 }
3297 3472
3298 for_each_node(node) { 3473 for_each_online_node(node) {
3299 struct kmem_cache_node *n = get_node(s, node); 3474 struct kmem_cache_node *n = get_node(s, node);
3300 3475
3476 if (!n)
3477 continue;
3478
3301 if (n->nr_partial || atomic_long_read(&n->nr_slabs)) 3479 if (n->nr_partial || atomic_long_read(&n->nr_slabs))
3302 return 1; 3480 return 1;
3303 } 3481 }
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
new file mode 100644
index 000000000000..d3b718b0c20a
--- /dev/null
+++ b/mm/sparse-vmemmap.c
@@ -0,0 +1,148 @@
1/*
2 * Virtual Memory Map support
3 *
4 * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>.
5 *
6 * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
7 * virt_to_page, page_address() to be implemented as a base offset
8 * calculation without memory access.
9 *
10 * However, virtual mappings need a page table and TLBs. Many Linux
11 * architectures already map their physical space using 1-1 mappings
12 * via TLBs. For those arches the virtual memory map is essentially
12 * via TLBs. For those arches the virtual memory map is essentially
13 * for free if we use the same page size as the 1-1 mappings. In that
14 * case the overhead consists of a few additional pages that are
15 * allocated to create a view of memory for vmemmap.
16 *
17 * The architecture is expected to provide a vmemmap_populate() function
18 * to instantiate the mapping.
19 */
20#include <linux/mm.h>
21#include <linux/mmzone.h>
22#include <linux/bootmem.h>
23#include <linux/highmem.h>
24#include <linux/module.h>
25#include <linux/spinlock.h>
26#include <linux/vmalloc.h>
27#include <asm/dma.h>
28#include <asm/pgalloc.h>
29#include <asm/pgtable.h>
30
31/*
32 * Allocate a block of memory to be used to back the virtual memory map
33 * or to back the page tables that are used to create the mapping.
34 * Uses the main allocators if they are available, else bootmem.
35 */
36void * __meminit vmemmap_alloc_block(unsigned long size, int node)
37{
38 /* If the main allocator is up use that, fallback to bootmem. */
39 if (slab_is_available()) {
40 struct page *page = alloc_pages_node(node,
41 GFP_KERNEL | __GFP_ZERO, get_order(size));
42 if (page)
43 return page_address(page);
44 return NULL;
45 } else
46 return __alloc_bootmem_node(NODE_DATA(node), size, size,
47 __pa(MAX_DMA_ADDRESS));
48}
49
50void __meminit vmemmap_verify(pte_t *pte, int node,
51 unsigned long start, unsigned long end)
52{
53 unsigned long pfn = pte_pfn(*pte);
54 int actual_node = early_pfn_to_nid(pfn);
55
56 if (actual_node != node)
57 printk(KERN_WARNING "[%lx-%lx] potential offnode "
58 "page_structs\n", start, end - 1);
59}
60
61pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
62{
63 pte_t *pte = pte_offset_kernel(pmd, addr);
64 if (pte_none(*pte)) {
65 pte_t entry;
66 void *p = vmemmap_alloc_block(PAGE_SIZE, node);
67 if (!p)
68 return 0;
69 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
70 set_pte_at(&init_mm, addr, pte, entry);
71 }
72 return pte;
73}
74
75pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
76{
77 pmd_t *pmd = pmd_offset(pud, addr);
78 if (pmd_none(*pmd)) {
79 void *p = vmemmap_alloc_block(PAGE_SIZE, node);
80 if (!p)
81 return 0;
82 pmd_populate_kernel(&init_mm, pmd, p);
83 }
84 return pmd;
85}
86
87pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node)
88{
89 pud_t *pud = pud_offset(pgd, addr);
90 if (pud_none(*pud)) {
91 void *p = vmemmap_alloc_block(PAGE_SIZE, node);
92 if (!p)
93 return 0;
94 pud_populate(&init_mm, pud, p);
95 }
96 return pud;
97}
98
99pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
100{
101 pgd_t *pgd = pgd_offset_k(addr);
102 if (pgd_none(*pgd)) {
103 void *p = vmemmap_alloc_block(PAGE_SIZE, node);
104 if (!p)
105 return 0;
106 pgd_populate(&init_mm, pgd, p);
107 }
108 return pgd;
109}
110
111int __meminit vmemmap_populate_basepages(struct page *start_page,
112 unsigned long size, int node)
113{
114 unsigned long addr = (unsigned long)start_page;
115 unsigned long end = (unsigned long)(start_page + size);
116 pgd_t *pgd;
117 pud_t *pud;
118 pmd_t *pmd;
119 pte_t *pte;
120
121 for (; addr < end; addr += PAGE_SIZE) {
122 pgd = vmemmap_pgd_populate(addr, node);
123 if (!pgd)
124 return -ENOMEM;
125 pud = vmemmap_pud_populate(pgd, addr, node);
126 if (!pud)
127 return -ENOMEM;
128 pmd = vmemmap_pmd_populate(pud, addr, node);
129 if (!pmd)
130 return -ENOMEM;
131 pte = vmemmap_pte_populate(pmd, addr, node);
132 if (!pte)
133 return -ENOMEM;
134 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
135 }
136
137 return 0;
138}
139
140struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid)
141{
142 struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION);
143 int error = vmemmap_populate(map, PAGES_PER_SECTION, nid);
144 if (error)
145 return NULL;
146
147 return map;
148}
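
The payoff of this new file is that, once every present section's struct page array sits at a fixed virtual offset, pfn_to_page() and page_to_pfn() degenerate into pointer arithmetic, and the pgd/pud/pmd/pte walk above only spends pages on the parts of the map that are actually populated. A minimal userspace sketch of the base-plus-offset idea; the base symbol and struct layout are made up, real kernels use an architecture-defined region filled in by vmemmap_populate():

/* Sketch: a virtually mapped mem_map turns pfn<->page into arithmetic. */
#include <stdio.h>

struct page { unsigned long flags; /* ... */ };

/* Hypothetical base of the virtual memory map (arch-specific in reality). */
static struct page *vmemmap_base;

static struct page *pfn_to_page(unsigned long pfn)
{
	return vmemmap_base + pfn;               /* base + offset, no lookup */
}

static unsigned long page_to_pfn(struct page *page)
{
	return (unsigned long)(page - vmemmap_base);
}

int main(void)
{
	static struct page fake_map[8];          /* stands in for the mapping */
	vmemmap_base = fake_map;

	struct page *p = pfn_to_page(5);
	printf("pfn 5 -> page %p -> pfn %lu\n", (void *)p, page_to_pfn(p));
	return 0;
}
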
diff --git a/mm/sparse.c b/mm/sparse.c
index 239f5a720d38..08fb14f5eea3 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -9,6 +9,8 @@
9#include <linux/spinlock.h> 9#include <linux/spinlock.h>
10#include <linux/vmalloc.h> 10#include <linux/vmalloc.h>
11#include <asm/dma.h> 11#include <asm/dma.h>
12#include <asm/pgalloc.h>
13#include <asm/pgtable.h>
12 14
13/* 15/*
14 * Permanent SPARSEMEM data: 16 * Permanent SPARSEMEM data:
@@ -106,7 +108,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid)
106 108
107/* 109/*
108 * Although written for the SPARSEMEM_EXTREME case, this happens 110 * Although written for the SPARSEMEM_EXTREME case, this happens
109 * to also work for the flat array case becase 111 * to also work for the flat array case because
110 * NR_SECTION_ROOTS==NR_MEM_SECTIONS. 112 * NR_SECTION_ROOTS==NR_MEM_SECTIONS.
111 */ 113 */
112int __section_nr(struct mem_section* ms) 114int __section_nr(struct mem_section* ms)
@@ -176,7 +178,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn,
176 if (nid != early_pfn_to_nid(pfn)) 178 if (nid != early_pfn_to_nid(pfn))
177 continue; 179 continue;
178 180
179 if (pfn_valid(pfn)) 181 if (pfn_present(pfn))
180 nr_pages += PAGES_PER_SECTION; 182 nr_pages += PAGES_PER_SECTION;
181 } 183 }
182 184
@@ -204,13 +206,16 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn
204} 206}
205 207
206static int __meminit sparse_init_one_section(struct mem_section *ms, 208static int __meminit sparse_init_one_section(struct mem_section *ms,
207 unsigned long pnum, struct page *mem_map) 209 unsigned long pnum, struct page *mem_map,
210 unsigned long *pageblock_bitmap)
208{ 211{
209 if (!valid_section(ms)) 212 if (!present_section(ms))
210 return -EINVAL; 213 return -EINVAL;
211 214
212 ms->section_mem_map &= ~SECTION_MAP_MASK; 215 ms->section_mem_map &= ~SECTION_MAP_MASK;
213 ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); 216 ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
217 SECTION_HAS_MEM_MAP;
218 ms->pageblock_flags = pageblock_bitmap;
214 219
215 return 1; 220 return 1;
216} 221}
@@ -221,12 +226,43 @@ void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size)
221 return NULL; 226 return NULL;
222} 227}
223 228
224static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) 229static unsigned long usemap_size(void)
225{ 230{
226 struct page *map; 231 unsigned long size_bytes;
232 size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8;
233 size_bytes = roundup(size_bytes, sizeof(unsigned long));
234 return size_bytes;
235}
236
237#ifdef CONFIG_MEMORY_HOTPLUG
238static unsigned long *__kmalloc_section_usemap(void)
239{
240 return kmalloc(usemap_size(), GFP_KERNEL);
241}
242#endif /* CONFIG_MEMORY_HOTPLUG */
243
244static unsigned long *sparse_early_usemap_alloc(unsigned long pnum)
245{
246 unsigned long *usemap;
227 struct mem_section *ms = __nr_to_section(pnum); 247 struct mem_section *ms = __nr_to_section(pnum);
228 int nid = sparse_early_nid(ms); 248 int nid = sparse_early_nid(ms);
229 249
250 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size());
251 if (usemap)
252 return usemap;
253
254 /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */
255 nid = 0;
256
257 printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__);
258 return NULL;
259}
260
261#ifndef CONFIG_SPARSEMEM_VMEMMAP
262struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
263{
264 struct page *map;
265
230 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); 266 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
231 if (map) 267 if (map)
232 return map; 268 return map;
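
Alongside the mem_map, each present section now carries a usemap holding its pageblock flags; usemap_size() rounds the flag bits up to whole bytes and then to whole unsigned longs so the bitmap can be handled word at a time. A worked example of that sizing, with an assumed bit count since the real SECTION_BLOCKFLAGS_BITS depends on the architecture's section and pageblock sizes:

/* Worked example of the two-step rounding in usemap_size(). */
#include <stdio.h>

#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned long bits = 100;                       /* assumed example */
	unsigned long bytes = ROUNDUP(bits, 8) / 8;     /* 13 bytes */
	unsigned long size = ROUNDUP(bytes, sizeof(unsigned long));

	/* Prints "100 bits -> 13 bytes -> 16 bytes allocated" on 64-bit. */
	printf("%lu bits -> %lu bytes -> %lu bytes allocated\n",
	       bits, bytes, size);
	return 0;
}
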
@@ -238,10 +274,22 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
238 274
239 map = alloc_bootmem_node(NODE_DATA(nid), 275 map = alloc_bootmem_node(NODE_DATA(nid),
240 sizeof(struct page) * PAGES_PER_SECTION); 276 sizeof(struct page) * PAGES_PER_SECTION);
277 return map;
278}
279#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
280
281struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
282{
283 struct page *map;
284 struct mem_section *ms = __nr_to_section(pnum);
285 int nid = sparse_early_nid(ms);
286
287 map = sparse_mem_map_populate(pnum, nid);
241 if (map) 288 if (map)
242 return map; 289 return map;
243 290
244 printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); 291 printk(KERN_ERR "%s: sparsemem memory map backing failed "
292 "some memory will not be available.\n", __FUNCTION__);
245 ms->section_mem_map = 0; 293 ms->section_mem_map = 0;
246 return NULL; 294 return NULL;
247} 295}
@@ -254,19 +302,38 @@ void __init sparse_init(void)
254{ 302{
255 unsigned long pnum; 303 unsigned long pnum;
256 struct page *map; 304 struct page *map;
305 unsigned long *usemap;
257 306
258 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 307 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
259 if (!valid_section_nr(pnum)) 308 if (!present_section_nr(pnum))
260 continue; 309 continue;
261 310
262 map = sparse_early_mem_map_alloc(pnum); 311 map = sparse_early_mem_map_alloc(pnum);
263 if (!map) 312 if (!map)
264 continue; 313 continue;
265 sparse_init_one_section(__nr_to_section(pnum), pnum, map); 314
315 usemap = sparse_early_usemap_alloc(pnum);
316 if (!usemap)
317 continue;
318
319 sparse_init_one_section(__nr_to_section(pnum), pnum, map,
320 usemap);
266 } 321 }
267} 322}
268 323
269#ifdef CONFIG_MEMORY_HOTPLUG 324#ifdef CONFIG_MEMORY_HOTPLUG
325#ifdef CONFIG_SPARSEMEM_VMEMMAP
326static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
327 unsigned long nr_pages)
328{
329 /* This will make the necessary allocations eventually. */
330 return sparse_mem_map_populate(pnum, nid);
331}
332static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
333{
334 return; /* XXX: Not implemented yet */
335}
336#else
270static struct page *__kmalloc_section_memmap(unsigned long nr_pages) 337static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
271{ 338{
272 struct page *page, *ret; 339 struct page *page, *ret;
@@ -289,6 +356,12 @@ got_map_ptr:
289 return ret; 356 return ret;
290} 357}
291 358
359static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
360 unsigned long nr_pages)
361{
362 return __kmalloc_section_memmap(nr_pages);
363}
364
292static int vaddr_in_vmalloc_area(void *addr) 365static int vaddr_in_vmalloc_area(void *addr)
293{ 366{
294 if (addr >= (void *)VMALLOC_START && 367 if (addr >= (void *)VMALLOC_START &&
@@ -305,6 +378,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
305 free_pages((unsigned long)memmap, 378 free_pages((unsigned long)memmap,
306 get_order(sizeof(struct page) * nr_pages)); 379 get_order(sizeof(struct page) * nr_pages));
307} 380}
381#endif /* CONFIG_SPARSEMEM_VMEMMAP */
308 382
309/* 383/*
310 * returns the number of sections whose mem_maps were properly 384 * returns the number of sections whose mem_maps were properly
@@ -318,6 +392,7 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
318 struct pglist_data *pgdat = zone->zone_pgdat; 392 struct pglist_data *pgdat = zone->zone_pgdat;
319 struct mem_section *ms; 393 struct mem_section *ms;
320 struct page *memmap; 394 struct page *memmap;
395 unsigned long *usemap;
321 unsigned long flags; 396 unsigned long flags;
322 int ret; 397 int ret;
323 398
@@ -326,7 +401,8 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
326 * plus, it does a kmalloc 401 * plus, it does a kmalloc
327 */ 402 */
328 sparse_index_init(section_nr, pgdat->node_id); 403 sparse_index_init(section_nr, pgdat->node_id);
329 memmap = __kmalloc_section_memmap(nr_pages); 404 memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages);
405 usemap = __kmalloc_section_usemap();
330 406
331 pgdat_resize_lock(pgdat, &flags); 407 pgdat_resize_lock(pgdat, &flags);
332 408
@@ -335,9 +411,14 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
335 ret = -EEXIST; 411 ret = -EEXIST;
336 goto out; 412 goto out;
337 } 413 }
414
415 if (!usemap) {
416 ret = -ENOMEM;
417 goto out;
418 }
338 ms->section_mem_map |= SECTION_MARKED_PRESENT; 419 ms->section_mem_map |= SECTION_MARKED_PRESENT;
339 420
340 ret = sparse_init_one_section(ms, section_nr, memmap); 421 ret = sparse_init_one_section(ms, section_nr, memmap, usemap);
341 422
342out: 423out:
343 pgdat_resize_unlock(pgdat, &flags); 424 pgdat_resize_unlock(pgdat, &flags);
diff --git a/mm/swap.c b/mm/swap.c
index d3cb966fe992..a65eff8a517a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -24,16 +24,19 @@
24#include <linux/module.h> 24#include <linux/module.h>
25#include <linux/mm_inline.h> 25#include <linux/mm_inline.h>
26#include <linux/buffer_head.h> /* for try_to_release_page() */ 26#include <linux/buffer_head.h> /* for try_to_release_page() */
27#include <linux/module.h>
28#include <linux/percpu_counter.h> 27#include <linux/percpu_counter.h>
29#include <linux/percpu.h> 28#include <linux/percpu.h>
30#include <linux/cpu.h> 29#include <linux/cpu.h>
31#include <linux/notifier.h> 30#include <linux/notifier.h>
32#include <linux/init.h> 31#include <linux/backing-dev.h>
33 32
34/* How many pages do we try to swap or page in/out together? */ 33/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 34int page_cluster;
36 35
36static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
37static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
38static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, };
39
37/* 40/*
38 * This path almost never happens for VM activity - pages are normally 41 * This path almost never happens for VM activity - pages are normally
39 * freed via pagevecs. But it gets used by networking. 42 * freed via pagevecs. But it gets used by networking.
@@ -94,23 +97,47 @@ void put_pages_list(struct list_head *pages)
94EXPORT_SYMBOL(put_pages_list); 97EXPORT_SYMBOL(put_pages_list);
95 98
96/* 99/*
100 * pagevec_move_tail() must be called with IRQ disabled.
101 * Otherwise this may cause nasty races.
102 */
103static void pagevec_move_tail(struct pagevec *pvec)
104{
105 int i;
106 int pgmoved = 0;
107 struct zone *zone = NULL;
108
109 for (i = 0; i < pagevec_count(pvec); i++) {
110 struct page *page = pvec->pages[i];
111 struct zone *pagezone = page_zone(page);
112
113 if (pagezone != zone) {
114 if (zone)
115 spin_unlock(&zone->lru_lock);
116 zone = pagezone;
117 spin_lock(&zone->lru_lock);
118 }
119 if (PageLRU(page) && !PageActive(page)) {
120 list_move_tail(&page->lru, &zone->inactive_list);
121 pgmoved++;
122 }
123 }
124 if (zone)
125 spin_unlock(&zone->lru_lock);
126 __count_vm_events(PGROTATED, pgmoved);
127 release_pages(pvec->pages, pvec->nr, pvec->cold);
128 pagevec_reinit(pvec);
129}
130
131/*
97 * Writeback is about to end against a page which has been marked for immediate 132 * Writeback is about to end against a page which has been marked for immediate
98 * reclaim. If it still appears to be reclaimable, move it to the tail of the 133 * reclaim. If it still appears to be reclaimable, move it to the tail of the
99 * inactive list. The page still has PageWriteback set, which will pin it. 134 * inactive list.
100 *
101 * We don't expect many pages to come through here, so don't bother batching
102 * things up.
103 *
104 * To avoid placing the page at the tail of the LRU while PG_writeback is still
105 * set, this function will clear PG_writeback before performing the page
106 * motion. Do that inside the lru lock because once PG_writeback is cleared
107 * we may not touch the page.
108 * 135 *
109 * Returns zero if it cleared PG_writeback. 136 * Returns zero if it cleared PG_writeback.
110 */ 137 */
111int rotate_reclaimable_page(struct page *page) 138int rotate_reclaimable_page(struct page *page)
112{ 139{
113 struct zone *zone; 140 struct pagevec *pvec;
114 unsigned long flags; 141 unsigned long flags;
115 142
116 if (PageLocked(page)) 143 if (PageLocked(page))
@@ -122,15 +149,16 @@ int rotate_reclaimable_page(struct page *page)
122 if (!PageLRU(page)) 149 if (!PageLRU(page))
123 return 1; 150 return 1;
124 151
125 zone = page_zone(page); 152 page_cache_get(page);
126 spin_lock_irqsave(&zone->lru_lock, flags); 153 local_irq_save(flags);
127 if (PageLRU(page) && !PageActive(page)) { 154 pvec = &__get_cpu_var(lru_rotate_pvecs);
128 list_move_tail(&page->lru, &zone->inactive_list); 155 if (!pagevec_add(pvec, page))
129 __count_vm_event(PGROTATED); 156 pagevec_move_tail(pvec);
130 } 157 local_irq_restore(flags);
158
131 if (!test_clear_page_writeback(page)) 159 if (!test_clear_page_writeback(page))
132 BUG(); 160 BUG();
133 spin_unlock_irqrestore(&zone->lru_lock, flags); 161
134 return 0; 162 return 0;
135} 163}
136 164
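
rotate_reclaimable_page() used to take zone->lru_lock for every single page; it now parks the page in a per-cpu pagevec, and pagevec_move_tail() takes the lock once per full vector (or once per drain), which is why the caller grabs an extra page reference first. A standalone model of that batch-and-flush pattern, with a toy counter standing in for lru_lock acquisitions and an illustrative vector size:

/* Standalone model of per-CPU pagevec batching. */
#include <stdio.h>

#define PAGEVEC_SIZE 14            /* illustrative, sized like a pagevec */

struct pagevec { int nr; int pages[PAGEVEC_SIZE]; };

static int lock_acquisitions;

static void pagevec_move_tail(struct pagevec *pvec)
{
	lock_acquisitions++;           /* one lock round trip per batch */
	/* ...move pvec->pages to the tail of the inactive list here... */
	pvec->nr = 0;
}

/* Returns the space left, i.e. 0 when the vector just became full. */
static int pagevec_add(struct pagevec *pvec, int page)
{
	pvec->pages[pvec->nr++] = page;
	return PAGEVEC_SIZE - pvec->nr;
}

int main(void)
{
	struct pagevec pvec = { 0 };

	for (int page = 0; page < 100; page++)
		if (!pagevec_add(&pvec, page))
			pagevec_move_tail(&pvec);

	if (pvec.nr)                   /* drain_cpu_pagevecs() equivalent */
		pagevec_move_tail(&pvec);

	printf("100 pages, %d lock acquisitions\n", lock_acquisitions);
	return 0;
}
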
@@ -174,9 +202,6 @@ EXPORT_SYMBOL(mark_page_accessed);
174 * lru_cache_add: add a page to the page lists 202 * lru_cache_add: add a page to the page lists
175 * @page: the page to add 203 * @page: the page to add
176 */ 204 */
177static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
178static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
179
180void fastcall lru_cache_add(struct page *page) 205void fastcall lru_cache_add(struct page *page)
181{ 206{
182 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); 207 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
@@ -197,21 +222,37 @@ void fastcall lru_cache_add_active(struct page *page)
197 put_cpu_var(lru_add_active_pvecs); 222 put_cpu_var(lru_add_active_pvecs);
198} 223}
199 224
200static void __lru_add_drain(int cpu) 225/*
226 * Drain pages out of the cpu's pagevecs.
227 * Either "cpu" is the current CPU, and preemption has already been
228 * disabled; or "cpu" is being hot-unplugged, and is already dead.
229 */
230static void drain_cpu_pagevecs(int cpu)
201{ 231{
202 struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); 232 struct pagevec *pvec;
203 233
204 /* CPU is dead, so no locking needed. */ 234 pvec = &per_cpu(lru_add_pvecs, cpu);
205 if (pagevec_count(pvec)) 235 if (pagevec_count(pvec))
206 __pagevec_lru_add(pvec); 236 __pagevec_lru_add(pvec);
237
207 pvec = &per_cpu(lru_add_active_pvecs, cpu); 238 pvec = &per_cpu(lru_add_active_pvecs, cpu);
208 if (pagevec_count(pvec)) 239 if (pagevec_count(pvec))
209 __pagevec_lru_add_active(pvec); 240 __pagevec_lru_add_active(pvec);
241
242 pvec = &per_cpu(lru_rotate_pvecs, cpu);
243 if (pagevec_count(pvec)) {
244 unsigned long flags;
245
246 /* No harm done if a racing interrupt already did this */
247 local_irq_save(flags);
248 pagevec_move_tail(pvec);
249 local_irq_restore(flags);
250 }
210} 251}
211 252
212void lru_add_drain(void) 253void lru_add_drain(void)
213{ 254{
214 __lru_add_drain(get_cpu()); 255 drain_cpu_pagevecs(get_cpu());
215 put_cpu(); 256 put_cpu();
216} 257}
217 258
@@ -258,6 +299,7 @@ void release_pages(struct page **pages, int nr, int cold)
258 int i; 299 int i;
259 struct pagevec pages_to_free; 300 struct pagevec pages_to_free;
260 struct zone *zone = NULL; 301 struct zone *zone = NULL;
302 unsigned long uninitialized_var(flags);
261 303
262 pagevec_init(&pages_to_free, cold); 304 pagevec_init(&pages_to_free, cold);
263 for (i = 0; i < nr; i++) { 305 for (i = 0; i < nr; i++) {
@@ -265,7 +307,7 @@ void release_pages(struct page **pages, int nr, int cold)
265 307
266 if (unlikely(PageCompound(page))) { 308 if (unlikely(PageCompound(page))) {
267 if (zone) { 309 if (zone) {
268 spin_unlock_irq(&zone->lru_lock); 310 spin_unlock_irqrestore(&zone->lru_lock, flags);
269 zone = NULL; 311 zone = NULL;
270 } 312 }
271 put_compound_page(page); 313 put_compound_page(page);
@@ -279,9 +321,10 @@ void release_pages(struct page **pages, int nr, int cold)
279 struct zone *pagezone = page_zone(page); 321 struct zone *pagezone = page_zone(page);
280 if (pagezone != zone) { 322 if (pagezone != zone) {
281 if (zone) 323 if (zone)
282 spin_unlock_irq(&zone->lru_lock); 324 spin_unlock_irqrestore(&zone->lru_lock,
325 flags);
283 zone = pagezone; 326 zone = pagezone;
284 spin_lock_irq(&zone->lru_lock); 327 spin_lock_irqsave(&zone->lru_lock, flags);
285 } 328 }
286 VM_BUG_ON(!PageLRU(page)); 329 VM_BUG_ON(!PageLRU(page));
287 __ClearPageLRU(page); 330 __ClearPageLRU(page);
@@ -290,7 +333,7 @@ void release_pages(struct page **pages, int nr, int cold)
290 333
291 if (!pagevec_add(&pages_to_free, page)) { 334 if (!pagevec_add(&pages_to_free, page)) {
292 if (zone) { 335 if (zone) {
293 spin_unlock_irq(&zone->lru_lock); 336 spin_unlock_irqrestore(&zone->lru_lock, flags);
294 zone = NULL; 337 zone = NULL;
295 } 338 }
296 __pagevec_free(&pages_to_free); 339 __pagevec_free(&pages_to_free);
@@ -298,7 +341,7 @@ void release_pages(struct page **pages, int nr, int cold)
298 } 341 }
299 } 342 }
300 if (zone) 343 if (zone)
301 spin_unlock_irq(&zone->lru_lock); 344 spin_unlock_irqrestore(&zone->lru_lock, flags);
302 345
303 pagevec_free(&pages_to_free); 346 pagevec_free(&pages_to_free);
304} 347}
@@ -491,7 +534,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
491 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 534 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
492 atomic_add(*committed, &vm_committed_space); 535 atomic_add(*committed, &vm_committed_space);
493 *committed = 0; 536 *committed = 0;
494 __lru_add_drain((long)hcpu); 537 drain_cpu_pagevecs((long)hcpu);
495 } 538 }
496 return NOTIFY_OK; 539 return NOTIFY_OK;
497} 540}
@@ -505,6 +548,10 @@ void __init swap_setup(void)
505{ 548{
506 unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); 549 unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);
507 550
551#ifdef CONFIG_SWAP
552 bdi_init(swapper_space.backing_dev_info);
553#endif
554
508 /* Use a smaller cluster for small-memory machines */ 555 /* Use a smaller cluster for small-memory machines */
509 if (megs < 16) 556 if (megs < 16)
510 page_cluster = 2; 557 page_cluster = 2;
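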
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 67daecb6031a..b52635601dfe 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -74,6 +74,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
74{ 74{
75 int error; 75 int error;
76 76
77 BUG_ON(!PageLocked(page));
77 BUG_ON(PageSwapCache(page)); 78 BUG_ON(PageSwapCache(page));
78 BUG_ON(PagePrivate(page)); 79 BUG_ON(PagePrivate(page));
79 error = radix_tree_preload(gfp_mask); 80 error = radix_tree_preload(gfp_mask);
@@ -83,7 +84,6 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
83 entry.val, page); 84 entry.val, page);
84 if (!error) { 85 if (!error) {
85 page_cache_get(page); 86 page_cache_get(page);
86 SetPageLocked(page);
87 SetPageSwapCache(page); 87 SetPageSwapCache(page);
88 set_page_private(page, entry.val); 88 set_page_private(page, entry.val);
89 total_swapcache_pages++; 89 total_swapcache_pages++;
@@ -99,15 +99,18 @@ static int add_to_swap_cache(struct page *page, swp_entry_t entry)
99{ 99{
100 int error; 100 int error;
101 101
102 BUG_ON(PageLocked(page));
102 if (!swap_duplicate(entry)) { 103 if (!swap_duplicate(entry)) {
103 INC_CACHE_INFO(noent_race); 104 INC_CACHE_INFO(noent_race);
104 return -ENOENT; 105 return -ENOENT;
105 } 106 }
107 SetPageLocked(page);
106 error = __add_to_swap_cache(page, entry, GFP_KERNEL); 108 error = __add_to_swap_cache(page, entry, GFP_KERNEL);
107 /* 109 /*
108 * Anon pages are already on the LRU, we don't run lru_cache_add here. 110 * Anon pages are already on the LRU, we don't run lru_cache_add here.
109 */ 111 */
110 if (error) { 112 if (error) {
113 ClearPageLocked(page);
111 swap_free(entry); 114 swap_free(entry);
112 if (error == -EEXIST) 115 if (error == -EEXIST)
113 INC_CACHE_INFO(exist_race); 116 INC_CACHE_INFO(exist_race);
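The swap-cache hunks move the page-lock handling up one level: __add_to_swap_cache() now insists on an already-locked page, while add_to_swap_cache() locks the (previously unlocked) page itself and unlocks it again if the insertion fails. A hedged summary of the resulting convention, with a made-up caller for illustration:

	/*
	 * __add_to_swap_cache(page, entry, gfp)  caller must already hold PG_locked
	 * add_to_swap_cache(page, entry)         page must be unlocked; locked on
	 *                                        success, unlocked again on failure
	 */
	static int example_add_fresh_page(swp_entry_t entry)
	{
		struct page *page = alloc_page(GFP_HIGHUSER);
		int err;

		if (!page)
			return -ENOMEM;
		err = add_to_swap_cache(page, entry);
		if (err) {			/* -ENOENT, -EEXIST or -ENOMEM */
			page_cache_release(page);
			return err;
		}
		/* page is now locked and sits in the swap cache */
		return 0;
	}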
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index 8803471593fd..d436a9c82db7 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -66,24 +66,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
66 if (!dentry) 66 if (!dentry)
67 goto put_memory; 67 goto put_memory;
68 68
69 error = -ENFILE;
70 file = get_empty_filp();
71 if (!file)
72 goto put_dentry;
73
74 error = -ENOSPC; 69 error = -ENOSPC;
75 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); 70 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
76 if (!inode) 71 if (!inode)
77 goto close_file; 72 goto put_dentry;
78 73
79 d_instantiate(dentry, inode); 74 d_instantiate(dentry, inode);
80 inode->i_nlink = 0; /* It is unlinked */ 75 error = -ENFILE;
76 file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
77 &ramfs_file_operations);
78 if (!file)
79 goto put_dentry;
81 80
82 file->f_path.mnt = mntget(shm_mnt); 81 inode->i_nlink = 0; /* It is unlinked */
83 file->f_path.dentry = dentry;
84 file->f_mapping = inode->i_mapping;
85 file->f_op = &ramfs_file_operations;
86 file->f_mode = FMODE_WRITE | FMODE_READ;
87 82
88 /* notify everyone as to the change of file size */ 83 /* notify everyone as to the change of file size */
89 error = do_truncate(dentry, size, 0, file); 84 error = do_truncate(dentry, size, 0, file);
diff --git a/mm/truncate.c b/mm/truncate.c
index 5cdfbc1a59fd..cadc15653dde 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -8,6 +8,7 @@
8 */ 8 */
9 9
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/backing-dev.h>
11#include <linux/mm.h> 12#include <linux/mm.h>
12#include <linux/swap.h> 13#include <linux/swap.h>
13#include <linux/module.h> 14#include <linux/module.h>
@@ -72,6 +73,8 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
72 struct address_space *mapping = page->mapping; 73 struct address_space *mapping = page->mapping;
73 if (mapping && mapping_cap_account_dirty(mapping)) { 74 if (mapping && mapping_cap_account_dirty(mapping)) {
74 dec_zone_page_state(page, NR_FILE_DIRTY); 75 dec_zone_page_state(page, NR_FILE_DIRTY);
76 dec_bdi_stat(mapping->backing_dev_info,
77 BDI_RECLAIMABLE);
75 if (account_size) 78 if (account_size)
76 task_io_account_cancelled_write(account_size); 79 task_io_account_cancelled_write(account_size);
77 } 80 }
diff --git a/mm/util.c b/mm/util.c
index bf340d806868..5f64026cbb4d 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -81,14 +81,16 @@ EXPORT_SYMBOL(kmemdup);
81void *krealloc(const void *p, size_t new_size, gfp_t flags) 81void *krealloc(const void *p, size_t new_size, gfp_t flags)
82{ 82{
83 void *ret; 83 void *ret;
84 size_t ks; 84 size_t ks = 0;
85 85
86 if (unlikely(!new_size)) { 86 if (unlikely(!new_size)) {
87 kfree(p); 87 kfree(p);
88 return ZERO_SIZE_PTR; 88 return ZERO_SIZE_PTR;
89 } 89 }
90 90
91 ks = ksize(p); 91 if (p)
92 ks = ksize(p);
93
92 if (ks >= new_size) 94 if (ks >= new_size)
93 return (void *)p; 95 return (void *)p;
94 96
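krealloc() used to call ksize() unconditionally, which is not safe for a NULL pointer; with ks initialised to 0 and ksize() only consulted for a non-NULL p, a NULL argument simply falls through to a fresh allocation. A short usage sketch of the resulting behaviour (illustrative; the remainder of krealloc(), not shown in this hunk, is assumed to allocate new_size bytes and copy the old contents):

	void *buf;

	buf = krealloc(NULL, 64, GFP_KERNEL);	/* ks == 0: behaves like kmalloc(64) */
	buf = krealloc(buf, 128, GFP_KERNEL);	/* grows, old contents preserved     */
	buf = krealloc(buf, 0, GFP_KERNEL);	/* frees, returns ZERO_SIZE_PTR      */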
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 3cee76a8c9f0..2e01af365848 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -190,7 +190,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl
190 if (unlikely(!size)) 190 if (unlikely(!size))
191 return NULL; 191 return NULL;
192 192
193 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node); 193 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
194
194 if (unlikely(!area)) 195 if (unlikely(!area))
195 return NULL; 196 return NULL;
196 197
@@ -439,7 +440,7 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
439 area->flags |= VM_VPAGES; 440 area->flags |= VM_VPAGES;
440 } else { 441 } else {
441 pages = kmalloc_node(array_size, 442 pages = kmalloc_node(array_size,
442 (gfp_mask & GFP_LEVEL_MASK) | __GFP_ZERO, 443 (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO,
443 node); 444 node);
444 } 445 }
445 area->pages = pages; 446 area->pages = pages;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a6e65d024995..e1471385d001 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -932,6 +932,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
932 long mapped_ratio; 932 long mapped_ratio;
933 long distress; 933 long distress;
934 long swap_tendency; 934 long swap_tendency;
935 long imbalance;
935 936
936 if (zone_is_near_oom(zone)) 937 if (zone_is_near_oom(zone))
937 goto force_reclaim_mapped; 938 goto force_reclaim_mapped;
@@ -967,6 +968,46 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
967 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; 968 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
968 969
969 /* 970 /*
 971 * If there's a huge imbalance between active and inactive
 972 * (think active 100 times larger than inactive) we should
 973 * become more permissive, or the system will take too much
 974 * CPU before it starts swapping under memory pressure.
 975 * Distress is about avoiding early OOM; this is about
 976 * keeping swappiness graceful despite it being set to low
 977 * values.
 978 *
 979 * Avoid division by zero with nr_inactive+1; the maximum
 980 * resulting value is vm_total_pages.
981 */
982 imbalance = zone_page_state(zone, NR_ACTIVE);
983 imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
984
985 /*
986 * Reduce the effect of imbalance if swappiness is low,
 987 * this means that for a very low swappiness the imbalance
 988 * must be much higher than 100 for this logic to make
 989 * a difference.
990 *
991 * Max temporary value is vm_total_pages*100.
992 */
993 imbalance *= (vm_swappiness + 1);
994 imbalance /= 100;
995
996 /*
 997 * If not much of the RAM is mapped, make the imbalance
 998 * less relevant; refilling the inactive list with mapped
 999 * pages is only a high priority in the presence of a high
 1000 * ratio of mapped pages.
1001 *
1002 * Max temporary value is vm_total_pages*100.
1003 */
1004 imbalance *= mapped_ratio;
1005 imbalance /= 100;
1006
1007 /* apply imbalance feedback to swap_tendency */
1008 swap_tendency += imbalance;
1009
1010 /*
970 * Now use this metric to decide whether to start moving mapped 1011 * Now use this metric to decide whether to start moving mapped
971 * memory onto the inactive list. 1012 * memory onto the inactive list.
972 */ 1013 */
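To make the effect of the new term concrete, here is the arithmetic for one made-up set of numbers, using the integer divisions exactly as written above:

	/*
	 * Worked example (illustrative figures only):
	 *   NR_ACTIVE = 1,000,000   NR_INACTIVE = 9,999
	 *   vm_swappiness = 60      mapped_ratio = 50
	 *
	 *   imbalance = 1000000 / (9999 + 1)  = 100
	 *   imbalance = 100 * (60 + 1) / 100  = 61
	 *   imbalance = 61 * 50 / 100         = 30
	 *   swap_tendency += 30
	 *
	 * so a 100:1 active/inactive imbalance on a half-mapped machine adds
	 * 30 points towards the swap_tendency cutoff that is applied just
	 * below this hunk to decide whether mapped pages get reclaimed.
	 */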
@@ -1067,8 +1108,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1067 unsigned long nr_to_scan; 1108 unsigned long nr_to_scan;
1068 unsigned long nr_reclaimed = 0; 1109 unsigned long nr_reclaimed = 0;
1069 1110
1070 atomic_inc(&zone->reclaim_in_progress);
1071
1072 /* 1111 /*
1073 * Add one to `nr_to_scan' just to make sure that the kernel will 1112 * Add one to `nr_to_scan' just to make sure that the kernel will
1074 * slowly sift through the active list. 1113 * slowly sift through the active list.
@@ -1107,8 +1146,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1107 } 1146 }
1108 1147
1109 throttle_vm_writeout(sc->gfp_mask); 1148 throttle_vm_writeout(sc->gfp_mask);
1110
1111 atomic_dec(&zone->reclaim_in_progress);
1112 return nr_reclaimed; 1149 return nr_reclaimed;
1113} 1150}
1114 1151
@@ -1146,7 +1183,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
1146 1183
1147 note_zone_scanning_priority(zone, priority); 1184 note_zone_scanning_priority(zone, priority);
1148 1185
1149 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1186 if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY)
1150 continue; /* Let kswapd poll it */ 1187 continue; /* Let kswapd poll it */
1151 1188
1152 sc->all_unreclaimable = 0; 1189 sc->all_unreclaimable = 0;
@@ -1327,7 +1364,8 @@ loop_again:
1327 if (!populated_zone(zone)) 1364 if (!populated_zone(zone))
1328 continue; 1365 continue;
1329 1366
1330 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1367 if (zone_is_all_unreclaimable(zone) &&
1368 priority != DEF_PRIORITY)
1331 continue; 1369 continue;
1332 1370
1333 if (!zone_watermark_ok(zone, order, zone->pages_high, 1371 if (!zone_watermark_ok(zone, order, zone->pages_high,
@@ -1362,7 +1400,8 @@ loop_again:
1362 if (!populated_zone(zone)) 1400 if (!populated_zone(zone))
1363 continue; 1401 continue;
1364 1402
1365 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1403 if (zone_is_all_unreclaimable(zone) &&
1404 priority != DEF_PRIORITY)
1366 continue; 1405 continue;
1367 1406
1368 if (!zone_watermark_ok(zone, order, zone->pages_high, 1407 if (!zone_watermark_ok(zone, order, zone->pages_high,
@@ -1371,18 +1410,25 @@ loop_again:
1371 temp_priority[i] = priority; 1410 temp_priority[i] = priority;
1372 sc.nr_scanned = 0; 1411 sc.nr_scanned = 0;
1373 note_zone_scanning_priority(zone, priority); 1412 note_zone_scanning_priority(zone, priority);
1374 nr_reclaimed += shrink_zone(priority, zone, &sc); 1413 /*
1414 * We put equal pressure on every zone, unless one
1415 * zone has way too many pages free already.
1416 */
1417 if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
1418 end_zone, 0))
1419 nr_reclaimed += shrink_zone(priority, zone, &sc);
1375 reclaim_state->reclaimed_slab = 0; 1420 reclaim_state->reclaimed_slab = 0;
1376 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1421 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1377 lru_pages); 1422 lru_pages);
1378 nr_reclaimed += reclaim_state->reclaimed_slab; 1423 nr_reclaimed += reclaim_state->reclaimed_slab;
1379 total_scanned += sc.nr_scanned; 1424 total_scanned += sc.nr_scanned;
1380 if (zone->all_unreclaimable) 1425 if (zone_is_all_unreclaimable(zone))
1381 continue; 1426 continue;
1382 if (nr_slab == 0 && zone->pages_scanned >= 1427 if (nr_slab == 0 && zone->pages_scanned >=
1383 (zone_page_state(zone, NR_ACTIVE) 1428 (zone_page_state(zone, NR_ACTIVE)
1384 + zone_page_state(zone, NR_INACTIVE)) * 6) 1429 + zone_page_state(zone, NR_INACTIVE)) * 6)
1385 zone->all_unreclaimable = 1; 1430 zone_set_flag(zone,
1431 ZONE_ALL_UNRECLAIMABLE);
1386 /* 1432 /*
1387 * If we've done a decent amount of scanning and 1433 * If we've done a decent amount of scanning and
1388 * the reclaim ratio is low, start doing writepage 1434 * the reclaim ratio is low, start doing writepage
@@ -1548,7 +1594,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1548 if (!populated_zone(zone)) 1594 if (!populated_zone(zone))
1549 continue; 1595 continue;
1550 1596
1551 if (zone->all_unreclaimable && prio != DEF_PRIORITY) 1597 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
1552 continue; 1598 continue;
1553 1599
1554 /* For pass = 0 we don't shrink the active list */ 1600 /* For pass = 0 we don't shrink the active list */
@@ -1688,9 +1734,11 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
1688{ 1734{
1689 pg_data_t *pgdat; 1735 pg_data_t *pgdat;
1690 cpumask_t mask; 1736 cpumask_t mask;
1737 int nid;
1691 1738
1692 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { 1739 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
1693 for_each_online_pgdat(pgdat) { 1740 for_each_node_state(nid, N_HIGH_MEMORY) {
1741 pgdat = NODE_DATA(nid);
1694 mask = node_to_cpumask(pgdat->node_id); 1742 mask = node_to_cpumask(pgdat->node_id);
1695 if (any_online_cpu(mask) != NR_CPUS) 1743 if (any_online_cpu(mask) != NR_CPUS)
1696 /* One of our CPUs online: restore mask */ 1744 /* One of our CPUs online: restore mask */
@@ -1727,7 +1775,7 @@ static int __init kswapd_init(void)
1727 int nid; 1775 int nid;
1728 1776
1729 swap_setup(); 1777 swap_setup();
1730 for_each_online_node(nid) 1778 for_each_node_state(nid, N_HIGH_MEMORY)
1731 kswapd_run(nid); 1779 kswapd_run(nid);
1732 hotcpu_notifier(cpu_callback, 0); 1780 hotcpu_notifier(cpu_callback, 0);
1733 return 0; 1781 return 0;
@@ -1847,8 +1895,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1847 1895
1848int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1896int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1849{ 1897{
1850 cpumask_t mask;
1851 int node_id; 1898 int node_id;
1899 int ret;
1852 1900
1853 /* 1901 /*
1854 * Zone reclaim reclaims unmapped file backed pages and 1902 * Zone reclaim reclaims unmapped file backed pages and
@@ -1866,15 +1914,13 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1866 <= zone->min_slab_pages) 1914 <= zone->min_slab_pages)
1867 return 0; 1915 return 0;
1868 1916
1917 if (zone_is_all_unreclaimable(zone))
1918 return 0;
1919
1869 /* 1920 /*
1870 * Avoid concurrent zone reclaims, do not reclaim in a zone that does 1921 * Do not scan if the allocation should not be delayed.
1871 * not have reclaimable pages and if we should not delay the allocation
1872 * then do not scan.
1873 */ 1922 */
1874 if (!(gfp_mask & __GFP_WAIT) || 1923 if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
1875 zone->all_unreclaimable ||
1876 atomic_read(&zone->reclaim_in_progress) > 0 ||
1877 (current->flags & PF_MEMALLOC))
1878 return 0; 1924 return 0;
1879 1925
1880 /* 1926 /*
@@ -1884,9 +1930,14 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1884 * as wide as possible. 1930 * as wide as possible.
1885 */ 1931 */
1886 node_id = zone_to_nid(zone); 1932 node_id = zone_to_nid(zone);
1887 mask = node_to_cpumask(node_id); 1933 if (node_state(node_id, N_CPU) && node_id != numa_node_id())
1888 if (!cpus_empty(mask) && node_id != numa_node_id()) 1934 return 0;
1935
1936 if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
1889 return 0; 1937 return 0;
1890 return __zone_reclaim(zone, gfp_mask, order); 1938 ret = __zone_reclaim(zone, gfp_mask, order);
1939 zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
1940
1941 return ret;
1891} 1942}
1892#endif 1943#endif
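zone_reclaim() now serialises concurrent reclaimers through a zone flag rather than the removed reclaim_in_progress atomic, and the all_unreclaimable state has likewise become a flag. The helpers (zone_set_flag(), zone_test_and_set_flag(), zone_clear_flag(), zone_is_all_unreclaimable()) are defined outside mm/ and so do not appear in this diffstat; a plausible sketch, assuming they are thin wrappers around the generic bitops on a new zone->flags word:

	typedef enum {
		ZONE_ALL_UNRECLAIMABLE,		/* was zone->all_unreclaimable   */
		ZONE_RECLAIM_LOCKED,		/* was zone->reclaim_in_progress */
	} zone_flags_t;

	static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
	{
		set_bit(flag, &zone->flags);
	}

	static inline int zone_test_and_set_flag(struct zone *zone, zone_flags_t flag)
	{
		return test_and_set_bit(flag, &zone->flags);
	}

	static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag)
	{
		clear_bit(flag, &zone->flags);
	}

	static inline int zone_is_all_unreclaimable(struct zone *zone)
	{
		return test_bit(ZONE_ALL_UNRECLAIMABLE, &zone->flags);
	}

The test-and-set form makes the exclusion explicit in zone_reclaim() itself, which is why zone_clear_flag(ZONE_RECLAIM_LOCKED) appears only after __zone_reclaim() has returned.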
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c64d169537bf..4651bf153f35 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -353,23 +353,6 @@ void refresh_cpu_vm_stats(int cpu)
353 } 353 }
354} 354}
355 355
356static void __refresh_cpu_vm_stats(void *dummy)
357{
358 refresh_cpu_vm_stats(smp_processor_id());
359}
360
361/*
362 * Consolidate all counters.
363 *
364 * Note that the result is less inaccurate but still inaccurate
365 * if concurrent processes are allowed to run.
366 */
367void refresh_vm_stats(void)
368{
369 on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1);
370}
371EXPORT_SYMBOL(refresh_vm_stats);
372
373#endif 356#endif
374 357
375#ifdef CONFIG_NUMA 358#ifdef CONFIG_NUMA
@@ -398,6 +381,13 @@ void zone_statistics(struct zonelist *zonelist, struct zone *z)
398 381
399#include <linux/seq_file.h> 382#include <linux/seq_file.h>
400 383
384static char * const migratetype_names[MIGRATE_TYPES] = {
385 "Unmovable",
386 "Reclaimable",
387 "Movable",
388 "Reserve",
389};
390
401static void *frag_start(struct seq_file *m, loff_t *pos) 391static void *frag_start(struct seq_file *m, loff_t *pos)
402{ 392{
403 pg_data_t *pgdat; 393 pg_data_t *pgdat;
@@ -422,28 +412,144 @@ static void frag_stop(struct seq_file *m, void *arg)
422{ 412{
423} 413}
424 414
425/* 415/* Walk all the zones in a node and print using a callback */
426 * This walks the free areas for each zone. 416static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
427 */ 417 void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
428static int frag_show(struct seq_file *m, void *arg)
429{ 418{
430 pg_data_t *pgdat = (pg_data_t *)arg;
431 struct zone *zone; 419 struct zone *zone;
432 struct zone *node_zones = pgdat->node_zones; 420 struct zone *node_zones = pgdat->node_zones;
433 unsigned long flags; 421 unsigned long flags;
434 int order;
435 422
436 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 423 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
437 if (!populated_zone(zone)) 424 if (!populated_zone(zone))
438 continue; 425 continue;
439 426
440 spin_lock_irqsave(&zone->lock, flags); 427 spin_lock_irqsave(&zone->lock, flags);
441 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); 428 print(m, pgdat, zone);
442 for (order = 0; order < MAX_ORDER; ++order)
443 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
444 spin_unlock_irqrestore(&zone->lock, flags); 429 spin_unlock_irqrestore(&zone->lock, flags);
430 }
431}
432
433static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
434 struct zone *zone)
435{
436 int order;
437
438 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
439 for (order = 0; order < MAX_ORDER; ++order)
440 seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
441 seq_putc(m, '\n');
442}
443
444/*
445 * This walks the free areas for each zone.
446 */
447static int frag_show(struct seq_file *m, void *arg)
448{
449 pg_data_t *pgdat = (pg_data_t *)arg;
450 walk_zones_in_node(m, pgdat, frag_show_print);
451 return 0;
452}
453
454static void pagetypeinfo_showfree_print(struct seq_file *m,
455 pg_data_t *pgdat, struct zone *zone)
456{
457 int order, mtype;
458
459 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) {
460 seq_printf(m, "Node %4d, zone %8s, type %12s ",
461 pgdat->node_id,
462 zone->name,
463 migratetype_names[mtype]);
464 for (order = 0; order < MAX_ORDER; ++order) {
465 unsigned long freecount = 0;
466 struct free_area *area;
467 struct list_head *curr;
468
469 area = &(zone->free_area[order]);
470
471 list_for_each(curr, &area->free_list[mtype])
472 freecount++;
473 seq_printf(m, "%6lu ", freecount);
474 }
445 seq_putc(m, '\n'); 475 seq_putc(m, '\n');
446 } 476 }
477}
478
 479/* Print out the free pages at each order for each migratetype */
480static int pagetypeinfo_showfree(struct seq_file *m, void *arg)
481{
482 int order;
483 pg_data_t *pgdat = (pg_data_t *)arg;
484
485 /* Print header */
486 seq_printf(m, "%-43s ", "Free pages count per migrate type at order");
487 for (order = 0; order < MAX_ORDER; ++order)
488 seq_printf(m, "%6d ", order);
489 seq_putc(m, '\n');
490
491 walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print);
492
493 return 0;
494}
495
496static void pagetypeinfo_showblockcount_print(struct seq_file *m,
497 pg_data_t *pgdat, struct zone *zone)
498{
499 int mtype;
500 unsigned long pfn;
501 unsigned long start_pfn = zone->zone_start_pfn;
502 unsigned long end_pfn = start_pfn + zone->spanned_pages;
503 unsigned long count[MIGRATE_TYPES] = { 0, };
504
505 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
506 struct page *page;
507
508 if (!pfn_valid(pfn))
509 continue;
510
511 page = pfn_to_page(pfn);
512 mtype = get_pageblock_migratetype(page);
513
514 count[mtype]++;
515 }
516
517 /* Print counts */
518 seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
519 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
520 seq_printf(m, "%12lu ", count[mtype]);
521 seq_putc(m, '\n');
522}
523
 524/* Print out the number of pageblocks for each migratetype */
525static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg)
526{
527 int mtype;
528 pg_data_t *pgdat = (pg_data_t *)arg;
529
530 seq_printf(m, "\n%-23s", "Number of blocks type ");
531 for (mtype = 0; mtype < MIGRATE_TYPES; mtype++)
532 seq_printf(m, "%12s ", migratetype_names[mtype]);
533 seq_putc(m, '\n');
534 walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print);
535
536 return 0;
537}
538
539/*
540 * This prints out statistics in relation to grouping pages by mobility.
541 * It is expensive to collect so do not constantly read the file.
542 */
543static int pagetypeinfo_show(struct seq_file *m, void *arg)
544{
545 pg_data_t *pgdat = (pg_data_t *)arg;
546
547 seq_printf(m, "Page block order: %d\n", pageblock_order);
548 seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages);
549 seq_putc(m, '\n');
550 pagetypeinfo_showfree(m, pgdat);
551 pagetypeinfo_showblockcount(m, pgdat);
552
447 return 0; 553 return 0;
448} 554}
449 555
@@ -454,6 +560,13 @@ const struct seq_operations fragmentation_op = {
454 .show = frag_show, 560 .show = frag_show,
455}; 561};
456 562
563const struct seq_operations pagetypeinfo_op = {
564 .start = frag_start,
565 .next = frag_next,
566 .stop = frag_stop,
567 .show = pagetypeinfo_show,
568};
569
457#ifdef CONFIG_ZONE_DMA 570#ifdef CONFIG_ZONE_DMA
458#define TEXT_FOR_DMA(xx) xx "_dma", 571#define TEXT_FOR_DMA(xx) xx "_dma",
459#else 572#else
@@ -532,84 +645,78 @@ static const char * const vmstat_text[] = {
532#endif 645#endif
533}; 646};
534 647
535/* 648static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
536 * Output information about zones in @pgdat. 649 struct zone *zone)
537 */
538static int zoneinfo_show(struct seq_file *m, void *arg)
539{ 650{
540 pg_data_t *pgdat = arg; 651 int i;
541 struct zone *zone; 652 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
542 struct zone *node_zones = pgdat->node_zones; 653 seq_printf(m,
543 unsigned long flags; 654 "\n pages free %lu"
544 655 "\n min %lu"
545 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { 656 "\n low %lu"
546 int i; 657 "\n high %lu"
547 658 "\n scanned %lu (a: %lu i: %lu)"
548 if (!populated_zone(zone)) 659 "\n spanned %lu"
549 continue; 660 "\n present %lu",
550 661 zone_page_state(zone, NR_FREE_PAGES),
551 spin_lock_irqsave(&zone->lock, flags); 662 zone->pages_min,
552 seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); 663 zone->pages_low,
553 seq_printf(m, 664 zone->pages_high,
554 "\n pages free %lu" 665 zone->pages_scanned,
555 "\n min %lu" 666 zone->nr_scan_active, zone->nr_scan_inactive,
556 "\n low %lu" 667 zone->spanned_pages,
557 "\n high %lu" 668 zone->present_pages);
558 "\n scanned %lu (a: %lu i: %lu)"
559 "\n spanned %lu"
560 "\n present %lu",
561 zone_page_state(zone, NR_FREE_PAGES),
562 zone->pages_min,
563 zone->pages_low,
564 zone->pages_high,
565 zone->pages_scanned,
566 zone->nr_scan_active, zone->nr_scan_inactive,
567 zone->spanned_pages,
568 zone->present_pages);
569 669
570 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 670 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
571 seq_printf(m, "\n %-12s %lu", vmstat_text[i], 671 seq_printf(m, "\n %-12s %lu", vmstat_text[i],
572 zone_page_state(zone, i)); 672 zone_page_state(zone, i));
573 673
574 seq_printf(m, 674 seq_printf(m,
575 "\n protection: (%lu", 675 "\n protection: (%lu",
576 zone->lowmem_reserve[0]); 676 zone->lowmem_reserve[0]);
577 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) 677 for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
578 seq_printf(m, ", %lu", zone->lowmem_reserve[i]); 678 seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
579 seq_printf(m, 679 seq_printf(m,
580 ")" 680 ")"
581 "\n pagesets"); 681 "\n pagesets");
582 for_each_online_cpu(i) { 682 for_each_online_cpu(i) {
583 struct per_cpu_pageset *pageset; 683 struct per_cpu_pageset *pageset;
584 int j; 684 int j;
585 685
586 pageset = zone_pcp(zone, i); 686 pageset = zone_pcp(zone, i);
587 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { 687 for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
588 seq_printf(m, 688 seq_printf(m,
589 "\n cpu: %i pcp: %i" 689 "\n cpu: %i pcp: %i"
590 "\n count: %i" 690 "\n count: %i"
591 "\n high: %i" 691 "\n high: %i"
592 "\n batch: %i", 692 "\n batch: %i",
593 i, j, 693 i, j,
594 pageset->pcp[j].count, 694 pageset->pcp[j].count,
595 pageset->pcp[j].high, 695 pageset->pcp[j].high,
596 pageset->pcp[j].batch); 696 pageset->pcp[j].batch);
597 } 697 }
598#ifdef CONFIG_SMP 698#ifdef CONFIG_SMP
599 seq_printf(m, "\n vm stats threshold: %d", 699 seq_printf(m, "\n vm stats threshold: %d",
600 pageset->stat_threshold); 700 pageset->stat_threshold);
601#endif 701#endif
602 }
603 seq_printf(m,
604 "\n all_unreclaimable: %u"
605 "\n prev_priority: %i"
606 "\n start_pfn: %lu",
607 zone->all_unreclaimable,
608 zone->prev_priority,
609 zone->zone_start_pfn);
610 spin_unlock_irqrestore(&zone->lock, flags);
611 seq_putc(m, '\n');
612 } 702 }
703 seq_printf(m,
704 "\n all_unreclaimable: %u"
705 "\n prev_priority: %i"
706 "\n start_pfn: %lu",
707 zone_is_all_unreclaimable(zone),
708 zone->prev_priority,
709 zone->zone_start_pfn);
710 seq_putc(m, '\n');
711}
712
713/*
714 * Output information about zones in @pgdat.
715 */
716static int zoneinfo_show(struct seq_file *m, void *arg)
717{
718 pg_data_t *pgdat = (pg_data_t *)arg;
719 walk_zones_in_node(m, pgdat, zoneinfo_show_print);
613 return 0; 720 return 0;
614} 721}
615 722
@@ -741,7 +848,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
741static struct notifier_block __cpuinitdata vmstat_notifier = 848static struct notifier_block __cpuinitdata vmstat_notifier =
742 { &vmstat_cpuup_callback, NULL, 0 }; 849 { &vmstat_cpuup_callback, NULL, 0 };
743 850
744int __init setup_vmstat(void) 851static int __init setup_vmstat(void)
745{ 852{
746 int cpu; 853 int cpu;
747 854