Diffstat (limited to 'mm')
-rw-r--r--   mm/Kconfig           |  18
-rw-r--r--   mm/Makefile          |   3
-rw-r--r--   mm/bounce.c          |   6
-rw-r--r--   mm/filemap.c         | 766
-rw-r--r--   mm/filemap.h         | 103
-rw-r--r--   mm/filemap_xip.c     |  17
-rw-r--r--   mm/hugetlb.c         | 398
-rw-r--r--   mm/internal.h        |  10
-rw-r--r--   mm/memory.c          | 161
-rw-r--r--   mm/memory_hotplug.c  | 312
-rw-r--r--   mm/mempolicy.c       |  60
-rw-r--r--   mm/migrate.c         |   4
-rw-r--r--   mm/mprotect.c        |   1
-rw-r--r--   mm/oom_kill.c        |   9
-rw-r--r--   mm/page-writeback.c  |  10
-rw-r--r--   mm/page_alloc.c      | 731
-rw-r--r--   mm/page_isolation.c  | 138
-rw-r--r--   mm/readahead.c       |  88
-rw-r--r--   mm/rmap.c            |   1
-rw-r--r--   mm/shmem.c           |  62
-rw-r--r--   mm/slab.c            |  21
-rw-r--r--   mm/slob.c            |   7
-rw-r--r--   mm/slub.c            | 490
-rw-r--r--   mm/sparse-vmemmap.c  | 148
-rw-r--r--   mm/sparse.c          | 105
-rw-r--r--   mm/swap.c            | 106
-rw-r--r--   mm/swap_state.c      |   5
-rw-r--r--   mm/util.c            |   6
-rw-r--r--   mm/vmalloc.c         |   5
-rw-r--r--   mm/vmscan.c          |  59
-rw-r--r--   mm/vmstat.c          | 305
31 files changed, 3077 insertions(+), 1078 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index a7609cbcb00d..1cc6cada2bbf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
| @@ -112,6 +112,19 @@ config SPARSEMEM_EXTREME | |||
| 112 | def_bool y | 112 | def_bool y |
| 113 | depends on SPARSEMEM && !SPARSEMEM_STATIC | 113 | depends on SPARSEMEM && !SPARSEMEM_STATIC |
| 114 | 114 | ||
| 115 | # | ||
| 116 | # SPARSEMEM_VMEMMAP uses a virtually mapped mem_map to optimise pfn_to_page | ||
| 117 | # and page_to_pfn. The most efficient option where kernel virtual space is | ||
| 118 | # not under pressure. | ||
| 119 | # | ||
| 120 | config SPARSEMEM_VMEMMAP_ENABLE | ||
| 121 | def_bool n | ||
| 122 | |||
| 123 | config SPARSEMEM_VMEMMAP | ||
| 124 | bool | ||
| 125 | depends on SPARSEMEM | ||
| 126 | default y if (SPARSEMEM_VMEMMAP_ENABLE) | ||
| 127 | |||
| 115 | # eventually, we can have this option just 'select SPARSEMEM' | 128 | # eventually, we can have this option just 'select SPARSEMEM' |
| 116 | config MEMORY_HOTPLUG | 129 | config MEMORY_HOTPLUG |
| 117 | bool "Allow for memory hot-add" | 130 | bool "Allow for memory hot-add" |
| @@ -126,6 +139,11 @@ config MEMORY_HOTPLUG_SPARSE | |||
| 126 | def_bool y | 139 | def_bool y |
| 127 | depends on SPARSEMEM && MEMORY_HOTPLUG | 140 | depends on SPARSEMEM && MEMORY_HOTPLUG |
| 128 | 141 | ||
| 142 | config MEMORY_HOTREMOVE | ||
| 143 | bool "Allow for memory hot remove" | ||
| 144 | depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE | ||
| 145 | depends on MIGRATION | ||
| 146 | |||
| 129 | # Heavily threaded applications may benefit from splitting the mm-wide | 147 | # Heavily threaded applications may benefit from splitting the mm-wide |
| 130 | # page_table_lock, so that faults on different parts of the user address | 148 | # page_table_lock, so that faults on different parts of the user address |
| 131 | # space can be handled with less contention: split it at this NR_CPUS. | 149 | # space can be handled with less contention: split it at this NR_CPUS. |
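For readers coming to the new option cold: with SPARSEMEM_VMEMMAP the whole mem_map lives in one contiguous virtual mapping, so pfn_to_page()/page_to_pfn() reduce to pointer arithmetic instead of a sparse-section lookup. A minimal sketch of the idea, assuming an arch-chosen virtual base; the macro names follow the usual per-arch convention and the base address is purely illustrative, not part of this diff:

/*
 * Illustrative only -- the real definitions live in the per-arch
 * headers and in include/asm-generic/memory_model.h.
 */
#define VMEMMAP_START	0xffffe20000000000UL	/* arch-chosen virtual base */
#define vmemmap		((struct page *)VMEMMAP_START)

/* With a virtually mapped mem_map, the translation is plain array math: */
#define __pfn_to_page(pfn)	(vmemmap + (pfn))
#define __page_to_pfn(page)	((unsigned long)((page) - vmemmap))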
diff --git a/mm/Makefile b/mm/Makefile
index 245e33ab00c4..5c0b0ea7572d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
| @@ -11,13 +11,14 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | |||
| 11 | page_alloc.o page-writeback.o pdflush.o \ | 11 | page_alloc.o page-writeback.o pdflush.o \ |
| 12 | readahead.o swap.o truncate.o vmscan.o \ | 12 | readahead.o swap.o truncate.o vmscan.o \ |
| 13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ | 13 | prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ |
| 14 | $(mmu-y) | 14 | page_isolation.o $(mmu-y) |
| 15 | 15 | ||
| 16 | obj-$(CONFIG_BOUNCE) += bounce.o | 16 | obj-$(CONFIG_BOUNCE) += bounce.o |
| 17 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o | 17 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o |
| 18 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 18 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
| 19 | obj-$(CONFIG_NUMA) += mempolicy.o | 19 | obj-$(CONFIG_NUMA) += mempolicy.o |
| 20 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 20 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
| 21 | obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o | ||
| 21 | obj-$(CONFIG_SHMEM) += shmem.o | 22 | obj-$(CONFIG_SHMEM) += shmem.o |
| 22 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o | 23 | obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o |
| 23 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o | 24 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o |
diff --git a/mm/bounce.c b/mm/bounce.c
index 3b549bf31f7d..b6d2d0f1019b 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
| @@ -265,6 +265,12 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) | |||
| 265 | mempool_t *pool; | 265 | mempool_t *pool; |
| 266 | 266 | ||
| 267 | /* | 267 | /* |
| 268 | * Data-less bio, nothing to bounce | ||
| 269 | */ | ||
| 270 | if (bio_empty_barrier(*bio_orig)) | ||
| 271 | return; | ||
| 272 | |||
| 273 | /* | ||
| 268 | * for non-isa bounce case, just check if the bounce pfn is equal | 274 | * for non-isa bounce case, just check if the bounce pfn is equal |
| 269 | * to or bigger than the highest pfn in the system -- in that case, | 275 | * to or bigger than the highest pfn in the system -- in that case, |
| 270 | * don't waste time iterating over bio segments | 276 | * don't waste time iterating over bio segments |
diff --git a/mm/filemap.c b/mm/filemap.c
index 15c8413ee929..c6049e947cd9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
| @@ -30,7 +30,7 @@ | |||
| 30 | #include <linux/security.h> | 30 | #include <linux/security.h> |
| 31 | #include <linux/syscalls.h> | 31 | #include <linux/syscalls.h> |
| 32 | #include <linux/cpuset.h> | 32 | #include <linux/cpuset.h> |
| 33 | #include "filemap.h" | 33 | #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ |
| 34 | #include "internal.h" | 34 | #include "internal.h" |
| 35 | 35 | ||
| 36 | /* | 36 | /* |
| @@ -593,7 +593,7 @@ void fastcall __lock_page_nosync(struct page *page) | |||
| 593 | * Is there a pagecache struct page at the given (mapping, offset) tuple? | 593 | * Is there a pagecache struct page at the given (mapping, offset) tuple? |
| 594 | * If yes, increment its refcount and return it; if no, return NULL. | 594 | * If yes, increment its refcount and return it; if no, return NULL. |
| 595 | */ | 595 | */ |
| 596 | struct page * find_get_page(struct address_space *mapping, unsigned long offset) | 596 | struct page * find_get_page(struct address_space *mapping, pgoff_t offset) |
| 597 | { | 597 | { |
| 598 | struct page *page; | 598 | struct page *page; |
| 599 | 599 | ||
| @@ -617,30 +617,31 @@ EXPORT_SYMBOL(find_get_page); | |||
| 617 | * Returns zero if the page was not present. find_lock_page() may sleep. | 617 | * Returns zero if the page was not present. find_lock_page() may sleep. |
| 618 | */ | 618 | */ |
| 619 | struct page *find_lock_page(struct address_space *mapping, | 619 | struct page *find_lock_page(struct address_space *mapping, |
| 620 | unsigned long offset) | 620 | pgoff_t offset) |
| 621 | { | 621 | { |
| 622 | struct page *page; | 622 | struct page *page; |
| 623 | 623 | ||
| 624 | read_lock_irq(&mapping->tree_lock); | ||
| 625 | repeat: | 624 | repeat: |
| 625 | read_lock_irq(&mapping->tree_lock); | ||
| 626 | page = radix_tree_lookup(&mapping->page_tree, offset); | 626 | page = radix_tree_lookup(&mapping->page_tree, offset); |
| 627 | if (page) { | 627 | if (page) { |
| 628 | page_cache_get(page); | 628 | page_cache_get(page); |
| 629 | if (TestSetPageLocked(page)) { | 629 | if (TestSetPageLocked(page)) { |
| 630 | read_unlock_irq(&mapping->tree_lock); | 630 | read_unlock_irq(&mapping->tree_lock); |
| 631 | __lock_page(page); | 631 | __lock_page(page); |
| 632 | read_lock_irq(&mapping->tree_lock); | ||
| 633 | 632 | ||
| 634 | /* Has the page been truncated while we slept? */ | 633 | /* Has the page been truncated while we slept? */ |
| 635 | if (unlikely(page->mapping != mapping || | 634 | if (unlikely(page->mapping != mapping)) { |
| 636 | page->index != offset)) { | ||
| 637 | unlock_page(page); | 635 | unlock_page(page); |
| 638 | page_cache_release(page); | 636 | page_cache_release(page); |
| 639 | goto repeat; | 637 | goto repeat; |
| 640 | } | 638 | } |
| 639 | VM_BUG_ON(page->index != offset); | ||
| 640 | goto out; | ||
| 641 | } | 641 | } |
| 642 | } | 642 | } |
| 643 | read_unlock_irq(&mapping->tree_lock); | 643 | read_unlock_irq(&mapping->tree_lock); |
| 644 | out: | ||
| 644 | return page; | 645 | return page; |
| 645 | } | 646 | } |
| 646 | EXPORT_SYMBOL(find_lock_page); | 647 | EXPORT_SYMBOL(find_lock_page); |
| @@ -663,29 +664,24 @@ EXPORT_SYMBOL(find_lock_page); | |||
| 663 | * memory exhaustion. | 664 | * memory exhaustion. |
| 664 | */ | 665 | */ |
| 665 | struct page *find_or_create_page(struct address_space *mapping, | 666 | struct page *find_or_create_page(struct address_space *mapping, |
| 666 | unsigned long index, gfp_t gfp_mask) | 667 | pgoff_t index, gfp_t gfp_mask) |
| 667 | { | 668 | { |
| 668 | struct page *page, *cached_page = NULL; | 669 | struct page *page; |
| 669 | int err; | 670 | int err; |
| 670 | repeat: | 671 | repeat: |
| 671 | page = find_lock_page(mapping, index); | 672 | page = find_lock_page(mapping, index); |
| 672 | if (!page) { | 673 | if (!page) { |
| 673 | if (!cached_page) { | 674 | page = __page_cache_alloc(gfp_mask); |
| 674 | cached_page = | 675 | if (!page) |
| 675 | __page_cache_alloc(gfp_mask); | 676 | return NULL; |
| 676 | if (!cached_page) | 677 | err = add_to_page_cache_lru(page, mapping, index, gfp_mask); |
| 677 | return NULL; | 678 | if (unlikely(err)) { |
| 679 | page_cache_release(page); | ||
| 680 | page = NULL; | ||
| 681 | if (err == -EEXIST) | ||
| 682 | goto repeat; | ||
| 678 | } | 683 | } |
| 679 | err = add_to_page_cache_lru(cached_page, mapping, | ||
| 680 | index, gfp_mask); | ||
| 681 | if (!err) { | ||
| 682 | page = cached_page; | ||
| 683 | cached_page = NULL; | ||
| 684 | } else if (err == -EEXIST) | ||
| 685 | goto repeat; | ||
| 686 | } | 684 | } |
| 687 | if (cached_page) | ||
| 688 | page_cache_release(cached_page); | ||
| 689 | return page; | 685 | return page; |
| 690 | } | 686 | } |
| 691 | EXPORT_SYMBOL(find_or_create_page); | 687 | EXPORT_SYMBOL(find_or_create_page); |
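As a usage note, the rewritten helper keeps its external contract: it returns a locked, referenced page, or NULL on allocation failure, and retries internally if another task instantiates the same index first (the -EEXIST case). A hedged sketch of a typical caller; fill_one_page is a hypothetical name, not a function in this diff:

/* Assumes <linux/mm.h> and <linux/pagemap.h>. */
static int fill_one_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

	page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
	if (!page)
		return -ENOMEM;

	/* page is locked and holds an extra reference here; fill it in */

	unlock_page(page);
	page_cache_release(page);
	return 0;
}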
| @@ -797,7 +793,7 @@ EXPORT_SYMBOL(find_get_pages_tag); | |||
| 797 | * and deadlock against the caller's locked page. | 793 | * and deadlock against the caller's locked page. |
| 798 | */ | 794 | */ |
| 799 | struct page * | 795 | struct page * |
| 800 | grab_cache_page_nowait(struct address_space *mapping, unsigned long index) | 796 | grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) |
| 801 | { | 797 | { |
| 802 | struct page *page = find_get_page(mapping, index); | 798 | struct page *page = find_get_page(mapping, index); |
| 803 | 799 | ||
| @@ -859,34 +855,29 @@ static void shrink_readahead_size_eio(struct file *filp, | |||
| 859 | * It may be NULL. | 855 | * It may be NULL. |
| 860 | */ | 856 | */ |
| 861 | void do_generic_mapping_read(struct address_space *mapping, | 857 | void do_generic_mapping_read(struct address_space *mapping, |
| 862 | struct file_ra_state *_ra, | 858 | struct file_ra_state *ra, |
| 863 | struct file *filp, | 859 | struct file *filp, |
| 864 | loff_t *ppos, | 860 | loff_t *ppos, |
| 865 | read_descriptor_t *desc, | 861 | read_descriptor_t *desc, |
| 866 | read_actor_t actor) | 862 | read_actor_t actor) |
| 867 | { | 863 | { |
| 868 | struct inode *inode = mapping->host; | 864 | struct inode *inode = mapping->host; |
| 869 | unsigned long index; | 865 | pgoff_t index; |
| 870 | unsigned long offset; | 866 | pgoff_t last_index; |
| 871 | unsigned long last_index; | 867 | pgoff_t prev_index; |
| 872 | unsigned long next_index; | 868 | unsigned long offset; /* offset into pagecache page */ |
| 873 | unsigned long prev_index; | ||
| 874 | unsigned int prev_offset; | 869 | unsigned int prev_offset; |
| 875 | struct page *cached_page; | ||
| 876 | int error; | 870 | int error; |
| 877 | struct file_ra_state ra = *_ra; | ||
| 878 | 871 | ||
| 879 | cached_page = NULL; | ||
| 880 | index = *ppos >> PAGE_CACHE_SHIFT; | 872 | index = *ppos >> PAGE_CACHE_SHIFT; |
| 881 | next_index = index; | 873 | prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT; |
| 882 | prev_index = ra.prev_index; | 874 | prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); |
| 883 | prev_offset = ra.prev_offset; | ||
| 884 | last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; | 875 | last_index = (*ppos + desc->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; |
| 885 | offset = *ppos & ~PAGE_CACHE_MASK; | 876 | offset = *ppos & ~PAGE_CACHE_MASK; |
| 886 | 877 | ||
| 887 | for (;;) { | 878 | for (;;) { |
| 888 | struct page *page; | 879 | struct page *page; |
| 889 | unsigned long end_index; | 880 | pgoff_t end_index; |
| 890 | loff_t isize; | 881 | loff_t isize; |
| 891 | unsigned long nr, ret; | 882 | unsigned long nr, ret; |
| 892 | 883 | ||
| @@ -895,7 +886,7 @@ find_page: | |||
| 895 | page = find_get_page(mapping, index); | 886 | page = find_get_page(mapping, index); |
| 896 | if (!page) { | 887 | if (!page) { |
| 897 | page_cache_sync_readahead(mapping, | 888 | page_cache_sync_readahead(mapping, |
| 898 | &ra, filp, | 889 | ra, filp, |
| 899 | index, last_index - index); | 890 | index, last_index - index); |
| 900 | page = find_get_page(mapping, index); | 891 | page = find_get_page(mapping, index); |
| 901 | if (unlikely(page == NULL)) | 892 | if (unlikely(page == NULL)) |
| @@ -903,7 +894,7 @@ find_page: | |||
| 903 | } | 894 | } |
| 904 | if (PageReadahead(page)) { | 895 | if (PageReadahead(page)) { |
| 905 | page_cache_async_readahead(mapping, | 896 | page_cache_async_readahead(mapping, |
| 906 | &ra, filp, page, | 897 | ra, filp, page, |
| 907 | index, last_index - index); | 898 | index, last_index - index); |
| 908 | } | 899 | } |
| 909 | if (!PageUptodate(page)) | 900 | if (!PageUptodate(page)) |
| @@ -966,7 +957,6 @@ page_ok: | |||
| 966 | index += offset >> PAGE_CACHE_SHIFT; | 957 | index += offset >> PAGE_CACHE_SHIFT; |
| 967 | offset &= ~PAGE_CACHE_MASK; | 958 | offset &= ~PAGE_CACHE_MASK; |
| 968 | prev_offset = offset; | 959 | prev_offset = offset; |
| 969 | ra.prev_offset = offset; | ||
| 970 | 960 | ||
| 971 | page_cache_release(page); | 961 | page_cache_release(page); |
| 972 | if (ret == nr && desc->count) | 962 | if (ret == nr && desc->count) |
| @@ -1015,7 +1005,7 @@ readpage: | |||
| 1015 | } | 1005 | } |
| 1016 | unlock_page(page); | 1006 | unlock_page(page); |
| 1017 | error = -EIO; | 1007 | error = -EIO; |
| 1018 | shrink_readahead_size_eio(filp, &ra); | 1008 | shrink_readahead_size_eio(filp, ra); |
| 1019 | goto readpage_error; | 1009 | goto readpage_error; |
| 1020 | } | 1010 | } |
| 1021 | unlock_page(page); | 1011 | unlock_page(page); |
| @@ -1034,33 +1024,29 @@ no_cached_page: | |||
| 1034 | * Ok, it wasn't cached, so we need to create a new | 1024 | * Ok, it wasn't cached, so we need to create a new |
| 1035 | * page.. | 1025 | * page.. |
| 1036 | */ | 1026 | */ |
| 1037 | if (!cached_page) { | 1027 | page = page_cache_alloc_cold(mapping); |
| 1038 | cached_page = page_cache_alloc_cold(mapping); | 1028 | if (!page) { |
| 1039 | if (!cached_page) { | 1029 | desc->error = -ENOMEM; |
| 1040 | desc->error = -ENOMEM; | 1030 | goto out; |
| 1041 | goto out; | ||
| 1042 | } | ||
| 1043 | } | 1031 | } |
| 1044 | error = add_to_page_cache_lru(cached_page, mapping, | 1032 | error = add_to_page_cache_lru(page, mapping, |
| 1045 | index, GFP_KERNEL); | 1033 | index, GFP_KERNEL); |
| 1046 | if (error) { | 1034 | if (error) { |
| 1035 | page_cache_release(page); | ||
| 1047 | if (error == -EEXIST) | 1036 | if (error == -EEXIST) |
| 1048 | goto find_page; | 1037 | goto find_page; |
| 1049 | desc->error = error; | 1038 | desc->error = error; |
| 1050 | goto out; | 1039 | goto out; |
| 1051 | } | 1040 | } |
| 1052 | page = cached_page; | ||
| 1053 | cached_page = NULL; | ||
| 1054 | goto readpage; | 1041 | goto readpage; |
| 1055 | } | 1042 | } |
| 1056 | 1043 | ||
| 1057 | out: | 1044 | out: |
| 1058 | *_ra = ra; | 1045 | ra->prev_pos = prev_index; |
| 1059 | _ra->prev_index = prev_index; | 1046 | ra->prev_pos <<= PAGE_CACHE_SHIFT; |
| 1047 | ra->prev_pos |= prev_offset; | ||
| 1060 | 1048 | ||
| 1061 | *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; | 1049 | *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; |
| 1062 | if (cached_page) | ||
| 1063 | page_cache_release(cached_page); | ||
| 1064 | if (filp) | 1050 | if (filp) |
| 1065 | file_accessed(filp); | 1051 | file_accessed(filp); |
| 1066 | } | 1052 | } |
| @@ -1220,7 +1206,7 @@ EXPORT_SYMBOL(generic_file_aio_read); | |||
| 1220 | 1206 | ||
| 1221 | static ssize_t | 1207 | static ssize_t |
| 1222 | do_readahead(struct address_space *mapping, struct file *filp, | 1208 | do_readahead(struct address_space *mapping, struct file *filp, |
| 1223 | unsigned long index, unsigned long nr) | 1209 | pgoff_t index, unsigned long nr) |
| 1224 | { | 1210 | { |
| 1225 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) | 1211 | if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) |
| 1226 | return -EINVAL; | 1212 | return -EINVAL; |
| @@ -1240,8 +1226,8 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) | |||
| 1240 | if (file) { | 1226 | if (file) { |
| 1241 | if (file->f_mode & FMODE_READ) { | 1227 | if (file->f_mode & FMODE_READ) { |
| 1242 | struct address_space *mapping = file->f_mapping; | 1228 | struct address_space *mapping = file->f_mapping; |
| 1243 | unsigned long start = offset >> PAGE_CACHE_SHIFT; | 1229 | pgoff_t start = offset >> PAGE_CACHE_SHIFT; |
| 1244 | unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT; | 1230 | pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT; |
| 1245 | unsigned long len = end - start + 1; | 1231 | unsigned long len = end - start + 1; |
| 1246 | ret = do_readahead(mapping, file, start, len); | 1232 | ret = do_readahead(mapping, file, start, len); |
| 1247 | } | 1233 | } |
| @@ -1251,7 +1237,6 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) | |||
| 1251 | } | 1237 | } |
| 1252 | 1238 | ||
| 1253 | #ifdef CONFIG_MMU | 1239 | #ifdef CONFIG_MMU |
| 1254 | static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); | ||
| 1255 | /** | 1240 | /** |
| 1256 | * page_cache_read - adds requested page to the page cache if not already there | 1241 | * page_cache_read - adds requested page to the page cache if not already there |
| 1257 | * @file: file to read | 1242 | * @file: file to read |
| @@ -1260,7 +1245,7 @@ static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); | |||
| 1260 | * This adds the requested page to the page cache if it isn't already there, | 1245 | * This adds the requested page to the page cache if it isn't already there, |
| 1261 | * and schedules an I/O to read in its contents from disk. | 1246 | * and schedules an I/O to read in its contents from disk. |
| 1262 | */ | 1247 | */ |
| 1263 | static int fastcall page_cache_read(struct file * file, unsigned long offset) | 1248 | static int fastcall page_cache_read(struct file * file, pgoff_t offset) |
| 1264 | { | 1249 | { |
| 1265 | struct address_space *mapping = file->f_mapping; | 1250 | struct address_space *mapping = file->f_mapping; |
| 1266 | struct page *page; | 1251 | struct page *page; |
| @@ -1349,7 +1334,7 @@ retry_find: | |||
| 1349 | * Do we miss much more than hit in this file? If so, | 1334 | * Do we miss much more than hit in this file? If so, |
| 1350 | * stop bothering with read-ahead. It will only hurt. | 1335 | * stop bothering with read-ahead. It will only hurt. |
| 1351 | */ | 1336 | */ |
| 1352 | if (ra->mmap_miss > ra->mmap_hit + MMAP_LOTSAMISS) | 1337 | if (ra->mmap_miss > MMAP_LOTSAMISS) |
| 1353 | goto no_cached_page; | 1338 | goto no_cached_page; |
| 1354 | 1339 | ||
| 1355 | /* | 1340 | /* |
| @@ -1375,7 +1360,7 @@ retry_find: | |||
| 1375 | } | 1360 | } |
| 1376 | 1361 | ||
| 1377 | if (!did_readaround) | 1362 | if (!did_readaround) |
| 1378 | ra->mmap_hit++; | 1363 | ra->mmap_miss--; |
| 1379 | 1364 | ||
| 1380 | /* | 1365 | /* |
| 1381 | * We have a locked page in the page cache, now we need to check | 1366 | * We have a locked page in the page cache, now we need to check |
| @@ -1396,7 +1381,7 @@ retry_find: | |||
| 1396 | * Found the page and have a reference on it. | 1381 | * Found the page and have a reference on it. |
| 1397 | */ | 1382 | */ |
| 1398 | mark_page_accessed(page); | 1383 | mark_page_accessed(page); |
| 1399 | ra->prev_index = page->index; | 1384 | ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; |
| 1400 | vmf->page = page; | 1385 | vmf->page = page; |
| 1401 | return ret | VM_FAULT_LOCKED; | 1386 | return ret | VM_FAULT_LOCKED; |
| 1402 | 1387 | ||
| @@ -1501,39 +1486,32 @@ EXPORT_SYMBOL(generic_file_mmap); | |||
| 1501 | EXPORT_SYMBOL(generic_file_readonly_mmap); | 1486 | EXPORT_SYMBOL(generic_file_readonly_mmap); |
| 1502 | 1487 | ||
| 1503 | static struct page *__read_cache_page(struct address_space *mapping, | 1488 | static struct page *__read_cache_page(struct address_space *mapping, |
| 1504 | unsigned long index, | 1489 | pgoff_t index, |
| 1505 | int (*filler)(void *,struct page*), | 1490 | int (*filler)(void *,struct page*), |
| 1506 | void *data) | 1491 | void *data) |
| 1507 | { | 1492 | { |
| 1508 | struct page *page, *cached_page = NULL; | 1493 | struct page *page; |
| 1509 | int err; | 1494 | int err; |
| 1510 | repeat: | 1495 | repeat: |
| 1511 | page = find_get_page(mapping, index); | 1496 | page = find_get_page(mapping, index); |
| 1512 | if (!page) { | 1497 | if (!page) { |
| 1513 | if (!cached_page) { | 1498 | page = page_cache_alloc_cold(mapping); |
| 1514 | cached_page = page_cache_alloc_cold(mapping); | 1499 | if (!page) |
| 1515 | if (!cached_page) | 1500 | return ERR_PTR(-ENOMEM); |
| 1516 | return ERR_PTR(-ENOMEM); | 1501 | err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); |
| 1517 | } | 1502 | if (unlikely(err)) { |
| 1518 | err = add_to_page_cache_lru(cached_page, mapping, | 1503 | page_cache_release(page); |
| 1519 | index, GFP_KERNEL); | 1504 | if (err == -EEXIST) |
| 1520 | if (err == -EEXIST) | 1505 | goto repeat; |
| 1521 | goto repeat; | ||
| 1522 | if (err < 0) { | ||
| 1523 | /* Presumably ENOMEM for radix tree node */ | 1506 | /* Presumably ENOMEM for radix tree node */ |
| 1524 | page_cache_release(cached_page); | ||
| 1525 | return ERR_PTR(err); | 1507 | return ERR_PTR(err); |
| 1526 | } | 1508 | } |
| 1527 | page = cached_page; | ||
| 1528 | cached_page = NULL; | ||
| 1529 | err = filler(data, page); | 1509 | err = filler(data, page); |
| 1530 | if (err < 0) { | 1510 | if (err < 0) { |
| 1531 | page_cache_release(page); | 1511 | page_cache_release(page); |
| 1532 | page = ERR_PTR(err); | 1512 | page = ERR_PTR(err); |
| 1533 | } | 1513 | } |
| 1534 | } | 1514 | } |
| 1535 | if (cached_page) | ||
| 1536 | page_cache_release(cached_page); | ||
| 1537 | return page; | 1515 | return page; |
| 1538 | } | 1516 | } |
| 1539 | 1517 | ||
| @@ -1542,7 +1520,7 @@ repeat: | |||
| 1542 | * after submitting it to the filler. | 1520 | * after submitting it to the filler. |
| 1543 | */ | 1521 | */ |
| 1544 | struct page *read_cache_page_async(struct address_space *mapping, | 1522 | struct page *read_cache_page_async(struct address_space *mapping, |
| 1545 | unsigned long index, | 1523 | pgoff_t index, |
| 1546 | int (*filler)(void *,struct page*), | 1524 | int (*filler)(void *,struct page*), |
| 1547 | void *data) | 1525 | void *data) |
| 1548 | { | 1526 | { |
| @@ -1590,7 +1568,7 @@ EXPORT_SYMBOL(read_cache_page_async); | |||
| 1590 | * If the page does not get brought uptodate, return -EIO. | 1568 | * If the page does not get brought uptodate, return -EIO. |
| 1591 | */ | 1569 | */ |
| 1592 | struct page *read_cache_page(struct address_space *mapping, | 1570 | struct page *read_cache_page(struct address_space *mapping, |
| 1593 | unsigned long index, | 1571 | pgoff_t index, |
| 1594 | int (*filler)(void *,struct page*), | 1572 | int (*filler)(void *,struct page*), |
| 1595 | void *data) | 1573 | void *data) |
| 1596 | { | 1574 | { |
| @@ -1610,40 +1588,6 @@ struct page *read_cache_page(struct address_space *mapping, | |||
| 1610 | EXPORT_SYMBOL(read_cache_page); | 1588 | EXPORT_SYMBOL(read_cache_page); |
| 1611 | 1589 | ||
| 1612 | /* | 1590 | /* |
| 1613 | * If the page was newly created, increment its refcount and add it to the | ||
| 1614 | * caller's lru-buffering pagevec. This function is specifically for | ||
| 1615 | * generic_file_write(). | ||
| 1616 | */ | ||
| 1617 | static inline struct page * | ||
| 1618 | __grab_cache_page(struct address_space *mapping, unsigned long index, | ||
| 1619 | struct page **cached_page, struct pagevec *lru_pvec) | ||
| 1620 | { | ||
| 1621 | int err; | ||
| 1622 | struct page *page; | ||
| 1623 | repeat: | ||
| 1624 | page = find_lock_page(mapping, index); | ||
| 1625 | if (!page) { | ||
| 1626 | if (!*cached_page) { | ||
| 1627 | *cached_page = page_cache_alloc(mapping); | ||
| 1628 | if (!*cached_page) | ||
| 1629 | return NULL; | ||
| 1630 | } | ||
| 1631 | err = add_to_page_cache(*cached_page, mapping, | ||
| 1632 | index, GFP_KERNEL); | ||
| 1633 | if (err == -EEXIST) | ||
| 1634 | goto repeat; | ||
| 1635 | if (err == 0) { | ||
| 1636 | page = *cached_page; | ||
| 1637 | page_cache_get(page); | ||
| 1638 | if (!pagevec_add(lru_pvec, page)) | ||
| 1639 | __pagevec_lru_add(lru_pvec); | ||
| 1640 | *cached_page = NULL; | ||
| 1641 | } | ||
| 1642 | } | ||
| 1643 | return page; | ||
| 1644 | } | ||
| 1645 | |||
| 1646 | /* | ||
| 1647 | * The logic we want is | 1591 | * The logic we want is |
| 1648 | * | 1592 | * |
| 1649 | * if suid or (sgid and xgrp) | 1593 | * if suid or (sgid and xgrp) |
| @@ -1691,8 +1635,7 @@ int remove_suid(struct dentry *dentry) | |||
| 1691 | } | 1635 | } |
| 1692 | EXPORT_SYMBOL(remove_suid); | 1636 | EXPORT_SYMBOL(remove_suid); |
| 1693 | 1637 | ||
| 1694 | size_t | 1638 | static size_t __iovec_copy_from_user_inatomic(char *vaddr, |
| 1695 | __filemap_copy_from_user_iovec_inatomic(char *vaddr, | ||
| 1696 | const struct iovec *iov, size_t base, size_t bytes) | 1639 | const struct iovec *iov, size_t base, size_t bytes) |
| 1697 | { | 1640 | { |
| 1698 | size_t copied = 0, left = 0; | 1641 | size_t copied = 0, left = 0; |
| @@ -1715,6 +1658,124 @@ __filemap_copy_from_user_iovec_inatomic(char *vaddr, | |||
| 1715 | } | 1658 | } |
| 1716 | 1659 | ||
| 1717 | /* | 1660 | /* |
| 1661 | * Copy as much as we can into the page and return the number of bytes which | ||
| 1662 | * were sucessfully copied. If a fault is encountered then return the number of | ||
| 1663 | * bytes which were copied. | ||
| 1664 | */ | ||
| 1665 | size_t iov_iter_copy_from_user_atomic(struct page *page, | ||
| 1666 | struct iov_iter *i, unsigned long offset, size_t bytes) | ||
| 1667 | { | ||
| 1668 | char *kaddr; | ||
| 1669 | size_t copied; | ||
| 1670 | |||
| 1671 | BUG_ON(!in_atomic()); | ||
| 1672 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 1673 | if (likely(i->nr_segs == 1)) { | ||
| 1674 | int left; | ||
| 1675 | char __user *buf = i->iov->iov_base + i->iov_offset; | ||
| 1676 | left = __copy_from_user_inatomic_nocache(kaddr + offset, | ||
| 1677 | buf, bytes); | ||
| 1678 | copied = bytes - left; | ||
| 1679 | } else { | ||
| 1680 | copied = __iovec_copy_from_user_inatomic(kaddr + offset, | ||
| 1681 | i->iov, i->iov_offset, bytes); | ||
| 1682 | } | ||
| 1683 | kunmap_atomic(kaddr, KM_USER0); | ||
| 1684 | |||
| 1685 | return copied; | ||
| 1686 | } | ||
| 1687 | EXPORT_SYMBOL(iov_iter_copy_from_user_atomic); | ||
| 1688 | |||
| 1689 | /* | ||
| 1690 | * This has the same sideeffects and return value as | ||
| 1691 | * iov_iter_copy_from_user_atomic(). | ||
| 1692 | * The difference is that it attempts to resolve faults. | ||
| 1693 | * Page must not be locked. | ||
| 1694 | */ | ||
| 1695 | size_t iov_iter_copy_from_user(struct page *page, | ||
| 1696 | struct iov_iter *i, unsigned long offset, size_t bytes) | ||
| 1697 | { | ||
| 1698 | char *kaddr; | ||
| 1699 | size_t copied; | ||
| 1700 | |||
| 1701 | kaddr = kmap(page); | ||
| 1702 | if (likely(i->nr_segs == 1)) { | ||
| 1703 | int left; | ||
| 1704 | char __user *buf = i->iov->iov_base + i->iov_offset; | ||
| 1705 | left = __copy_from_user_nocache(kaddr + offset, buf, bytes); | ||
| 1706 | copied = bytes - left; | ||
| 1707 | } else { | ||
| 1708 | copied = __iovec_copy_from_user_inatomic(kaddr + offset, | ||
| 1709 | i->iov, i->iov_offset, bytes); | ||
| 1710 | } | ||
| 1711 | kunmap(page); | ||
| 1712 | return copied; | ||
| 1713 | } | ||
| 1714 | EXPORT_SYMBOL(iov_iter_copy_from_user); | ||
| 1715 | |||
| 1716 | static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes) | ||
| 1717 | { | ||
| 1718 | if (likely(i->nr_segs == 1)) { | ||
| 1719 | i->iov_offset += bytes; | ||
| 1720 | } else { | ||
| 1721 | const struct iovec *iov = i->iov; | ||
| 1722 | size_t base = i->iov_offset; | ||
| 1723 | |||
| 1724 | while (bytes) { | ||
| 1725 | int copy = min(bytes, iov->iov_len - base); | ||
| 1726 | |||
| 1727 | bytes -= copy; | ||
| 1728 | base += copy; | ||
| 1729 | if (iov->iov_len == base) { | ||
| 1730 | iov++; | ||
| 1731 | base = 0; | ||
| 1732 | } | ||
| 1733 | } | ||
| 1734 | i->iov = iov; | ||
| 1735 | i->iov_offset = base; | ||
| 1736 | } | ||
| 1737 | } | ||
| 1738 | |||
| 1739 | void iov_iter_advance(struct iov_iter *i, size_t bytes) | ||
| 1740 | { | ||
| 1741 | BUG_ON(i->count < bytes); | ||
| 1742 | |||
| 1743 | __iov_iter_advance_iov(i, bytes); | ||
| 1744 | i->count -= bytes; | ||
| 1745 | } | ||
| 1746 | EXPORT_SYMBOL(iov_iter_advance); | ||
| 1747 | |||
| 1748 | /* | ||
| 1749 | * Fault in the first iovec of the given iov_iter, to a maximum length | ||
| 1750 | * of bytes. Returns 0 on success, or non-zero if the memory could not be | ||
| 1751 | * accessed (ie. because it is an invalid address). | ||
| 1752 | * | ||
| 1753 | * writev-intensive code may want this to prefault several iovecs -- that | ||
| 1754 | * would be possible (callers must not rely on the fact that _only_ the | ||
| 1755 | * first iovec will be faulted with the current implementation). | ||
| 1756 | */ | ||
| 1757 | int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) | ||
| 1758 | { | ||
| 1759 | char __user *buf = i->iov->iov_base + i->iov_offset; | ||
| 1760 | bytes = min(bytes, i->iov->iov_len - i->iov_offset); | ||
| 1761 | return fault_in_pages_readable(buf, bytes); | ||
| 1762 | } | ||
| 1763 | EXPORT_SYMBOL(iov_iter_fault_in_readable); | ||
| 1764 | |||
| 1765 | /* | ||
| 1766 | * Return the count of just the current iov_iter segment. | ||
| 1767 | */ | ||
| 1768 | size_t iov_iter_single_seg_count(struct iov_iter *i) | ||
| 1769 | { | ||
| 1770 | const struct iovec *iov = i->iov; | ||
| 1771 | if (i->nr_segs == 1) | ||
| 1772 | return i->count; | ||
| 1773 | else | ||
| 1774 | return min(i->count, iov->iov_len - i->iov_offset); | ||
| 1775 | } | ||
| 1776 | EXPORT_SYMBOL(iov_iter_single_seg_count); | ||
| 1777 | |||
| 1778 | /* | ||
| 1718 | * Performs necessary checks before doing a write | 1779 | * Performs necessary checks before doing a write |
| 1719 | * | 1780 | * |
| 1720 | * Can adjust writing position or amount of bytes to write. | 1781 | * Can adjust writing position or amount of bytes to write. |
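The iteration semantics of the helpers added above are easiest to see on a concrete iterator. A small, hedged illustration of the segment walking; in real use iov_base points at userspace, so the stack buffers and casts below exist only to make the arithmetic visible:

static void iov_iter_example(void)
{
	char a[8], b[8];
	struct iovec iov[2] = {
		{ .iov_base = (void __user *)a, .iov_len = sizeof(a) },
		{ .iov_base = (void __user *)b, .iov_len = sizeof(b) },
	};
	struct iov_iter i;

	/* 16 bytes total, nothing consumed yet */
	iov_iter_init(&i, iov, 2, sizeof(a) + sizeof(b), 0);

	/* consumes all of a[] plus the first 4 bytes of b[] */
	iov_iter_advance(&i, 12);

	/*
	 * Now iov_iter_count(&i) == 4 and
	 * iov_iter_single_seg_count(&i) == 4.
	 */
}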
| @@ -1796,6 +1857,91 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i | |||
| 1796 | } | 1857 | } |
| 1797 | EXPORT_SYMBOL(generic_write_checks); | 1858 | EXPORT_SYMBOL(generic_write_checks); |
| 1798 | 1859 | ||
| 1860 | int pagecache_write_begin(struct file *file, struct address_space *mapping, | ||
| 1861 | loff_t pos, unsigned len, unsigned flags, | ||
| 1862 | struct page **pagep, void **fsdata) | ||
| 1863 | { | ||
| 1864 | const struct address_space_operations *aops = mapping->a_ops; | ||
| 1865 | |||
| 1866 | if (aops->write_begin) { | ||
| 1867 | return aops->write_begin(file, mapping, pos, len, flags, | ||
| 1868 | pagep, fsdata); | ||
| 1869 | } else { | ||
| 1870 | int ret; | ||
| 1871 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | ||
| 1872 | unsigned offset = pos & (PAGE_CACHE_SIZE - 1); | ||
| 1873 | struct inode *inode = mapping->host; | ||
| 1874 | struct page *page; | ||
| 1875 | again: | ||
| 1876 | page = __grab_cache_page(mapping, index); | ||
| 1877 | *pagep = page; | ||
| 1878 | if (!page) | ||
| 1879 | return -ENOMEM; | ||
| 1880 | |||
| 1881 | if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) { | ||
| 1882 | /* | ||
| 1883 | * There is no way to resolve a short write situation | ||
| 1884 | * for a !Uptodate page (except by double copying in | ||
| 1885 | * the caller done by generic_perform_write_2copy). | ||
| 1886 | * | ||
| 1887 | * Instead, we have to bring it uptodate here. | ||
| 1888 | */ | ||
| 1889 | ret = aops->readpage(file, page); | ||
| 1890 | page_cache_release(page); | ||
| 1891 | if (ret) { | ||
| 1892 | if (ret == AOP_TRUNCATED_PAGE) | ||
| 1893 | goto again; | ||
| 1894 | return ret; | ||
| 1895 | } | ||
| 1896 | goto again; | ||
| 1897 | } | ||
| 1898 | |||
| 1899 | ret = aops->prepare_write(file, page, offset, offset+len); | ||
| 1900 | if (ret) { | ||
| 1901 | unlock_page(page); | ||
| 1902 | page_cache_release(page); | ||
| 1903 | if (pos + len > inode->i_size) | ||
| 1904 | vmtruncate(inode, inode->i_size); | ||
| 1905 | } | ||
| 1906 | return ret; | ||
| 1907 | } | ||
| 1908 | } | ||
| 1909 | EXPORT_SYMBOL(pagecache_write_begin); | ||
| 1910 | |||
| 1911 | int pagecache_write_end(struct file *file, struct address_space *mapping, | ||
| 1912 | loff_t pos, unsigned len, unsigned copied, | ||
| 1913 | struct page *page, void *fsdata) | ||
| 1914 | { | ||
| 1915 | const struct address_space_operations *aops = mapping->a_ops; | ||
| 1916 | int ret; | ||
| 1917 | |||
| 1918 | if (aops->write_end) { | ||
| 1919 | mark_page_accessed(page); | ||
| 1920 | ret = aops->write_end(file, mapping, pos, len, copied, | ||
| 1921 | page, fsdata); | ||
| 1922 | } else { | ||
| 1923 | unsigned offset = pos & (PAGE_CACHE_SIZE - 1); | ||
| 1924 | struct inode *inode = mapping->host; | ||
| 1925 | |||
| 1926 | flush_dcache_page(page); | ||
| 1927 | ret = aops->commit_write(file, page, offset, offset+len); | ||
| 1928 | unlock_page(page); | ||
| 1929 | mark_page_accessed(page); | ||
| 1930 | page_cache_release(page); | ||
| 1931 | |||
| 1932 | if (ret < 0) { | ||
| 1933 | if (pos + len > inode->i_size) | ||
| 1934 | vmtruncate(inode, inode->i_size); | ||
| 1935 | } else if (ret > 0) | ||
| 1936 | ret = min_t(size_t, copied, ret); | ||
| 1937 | else | ||
| 1938 | ret = copied; | ||
| 1939 | } | ||
| 1940 | |||
| 1941 | return ret; | ||
| 1942 | } | ||
| 1943 | EXPORT_SYMBOL(pagecache_write_end); | ||
| 1944 | |||
| 1799 | ssize_t | 1945 | ssize_t |
| 1800 | generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | 1946 | generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, |
| 1801 | unsigned long *nr_segs, loff_t pos, loff_t *ppos, | 1947 | unsigned long *nr_segs, loff_t pos, loff_t *ppos, |
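The two wrappers above let callers outside the filesystem drive the begin/copy/end cycle without caring whether the filesystem implements write_begin/write_end or the older prepare_write/commit_write pair. A hedged sketch of such a caller, copying an in-kernel buffer that fits within a single pagecache page; write_kernel_buf is a hypothetical name, and real callers (the splice-to-file path, for instance) add their own locking and retry handling:

/* Assumes <linux/fs.h>, <linux/pagemap.h>, <linux/highmem.h>. */
static int write_kernel_buf(struct file *file, loff_t pos,
				const char *buf, unsigned len)
{
	struct address_space *mapping = file->f_mapping;
	struct page *page;
	void *fsdata;
	char *kaddr;
	int ret;

	ret = pagecache_write_begin(file, mapping, pos, len,
					AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
	if (ret)
		return ret;

	kaddr = kmap_atomic(page, KM_USER0);
	memcpy(kaddr + (pos & (PAGE_CACHE_SIZE - 1)), buf, len);
	kunmap_atomic(kaddr, KM_USER0);
	flush_dcache_page(page);

	ret = pagecache_write_end(file, mapping, pos, len, len, page, fsdata);
	return ret < 0 ? ret : 0;
}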
| @@ -1835,151 +1981,314 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 1835 | } | 1981 | } |
| 1836 | EXPORT_SYMBOL(generic_file_direct_write); | 1982 | EXPORT_SYMBOL(generic_file_direct_write); |
| 1837 | 1983 | ||
| 1838 | ssize_t | 1984 | /* |
| 1839 | generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | 1985 | * Find or create a page at the given pagecache position. Return the locked |
| 1840 | unsigned long nr_segs, loff_t pos, loff_t *ppos, | 1986 | * page. This function is specifically for buffered writes. |
| 1841 | size_t count, ssize_t written) | 1987 | */ |
| 1988 | struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) | ||
| 1842 | { | 1989 | { |
| 1843 | struct file *file = iocb->ki_filp; | 1990 | int status; |
| 1844 | struct address_space * mapping = file->f_mapping; | 1991 | struct page *page; |
| 1845 | const struct address_space_operations *a_ops = mapping->a_ops; | 1992 | repeat: |
| 1846 | struct inode *inode = mapping->host; | 1993 | page = find_lock_page(mapping, index); |
| 1847 | long status = 0; | 1994 | if (likely(page)) |
| 1848 | struct page *page; | 1995 | return page; |
| 1849 | struct page *cached_page = NULL; | ||
| 1850 | size_t bytes; | ||
| 1851 | struct pagevec lru_pvec; | ||
| 1852 | const struct iovec *cur_iov = iov; /* current iovec */ | ||
| 1853 | size_t iov_base = 0; /* offset in the current iovec */ | ||
| 1854 | char __user *buf; | ||
| 1855 | |||
| 1856 | pagevec_init(&lru_pvec, 0); | ||
| 1857 | 1996 | ||
| 1858 | /* | 1997 | page = page_cache_alloc(mapping); |
| 1859 | * handle partial DIO write. Adjust cur_iov if needed. | 1998 | if (!page) |
| 1860 | */ | 1999 | return NULL; |
| 1861 | if (likely(nr_segs == 1)) | 2000 | status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); |
| 1862 | buf = iov->iov_base + written; | 2001 | if (unlikely(status)) { |
| 1863 | else { | 2002 | page_cache_release(page); |
| 1864 | filemap_set_next_iovec(&cur_iov, &iov_base, written); | 2003 | if (status == -EEXIST) |
| 1865 | buf = cur_iov->iov_base + iov_base; | 2004 | goto repeat; |
| 2005 | return NULL; | ||
| 1866 | } | 2006 | } |
| 2007 | return page; | ||
| 2008 | } | ||
| 2009 | EXPORT_SYMBOL(__grab_cache_page); | ||
| 2010 | |||
| 2011 | static ssize_t generic_perform_write_2copy(struct file *file, | ||
| 2012 | struct iov_iter *i, loff_t pos) | ||
| 2013 | { | ||
| 2014 | struct address_space *mapping = file->f_mapping; | ||
| 2015 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
| 2016 | struct inode *inode = mapping->host; | ||
| 2017 | long status = 0; | ||
| 2018 | ssize_t written = 0; | ||
| 1867 | 2019 | ||
| 1868 | do { | 2020 | do { |
| 1869 | unsigned long index; | 2021 | struct page *src_page; |
| 1870 | unsigned long offset; | 2022 | struct page *page; |
| 1871 | size_t copied; | 2023 | pgoff_t index; /* Pagecache index for current page */ |
| 2024 | unsigned long offset; /* Offset into pagecache page */ | ||
| 2025 | unsigned long bytes; /* Bytes to write to page */ | ||
| 2026 | size_t copied; /* Bytes copied from user */ | ||
| 1872 | 2027 | ||
| 1873 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ | 2028 | offset = (pos & (PAGE_CACHE_SIZE - 1)); |
| 1874 | index = pos >> PAGE_CACHE_SHIFT; | 2029 | index = pos >> PAGE_CACHE_SHIFT; |
| 1875 | bytes = PAGE_CACHE_SIZE - offset; | 2030 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, |
| 1876 | 2031 | iov_iter_count(i)); | |
| 1877 | /* Limit the size of the copy to the caller's write size */ | ||
| 1878 | bytes = min(bytes, count); | ||
| 1879 | 2032 | ||
| 1880 | /* We only need to worry about prefaulting when writes are from | 2033 | /* |
| 1881 | * user-space. NFSd uses vfs_writev with several non-aligned | 2034 | * a non-NULL src_page indicates that we're doing the |
| 1882 | * segments in the vector, and limiting to one segment a time is | 2035 | * copy via get_user_pages and kmap. |
| 1883 | * a noticeable performance for re-write | ||
| 1884 | */ | 2036 | */ |
| 1885 | if (!segment_eq(get_fs(), KERNEL_DS)) { | 2037 | src_page = NULL; |
| 1886 | /* | ||
| 1887 | * Limit the size of the copy to that of the current | ||
| 1888 | * segment, because fault_in_pages_readable() doesn't | ||
| 1889 | * know how to walk segments. | ||
| 1890 | */ | ||
| 1891 | bytes = min(bytes, cur_iov->iov_len - iov_base); | ||
| 1892 | 2038 | ||
| 1893 | /* | 2039 | /* |
| 1894 | * Bring in the user page that we will copy from | 2040 | * Bring in the user page that we will copy from _first_. |
| 1895 | * _first_. Otherwise there's a nasty deadlock on | 2041 | * Otherwise there's a nasty deadlock on copying from the |
| 1896 | * copying from the same page as we're writing to, | 2042 | * same page as we're writing to, without it being marked |
| 1897 | * without it being marked up-to-date. | 2043 | * up-to-date. |
| 1898 | */ | 2044 | * |
| 1899 | fault_in_pages_readable(buf, bytes); | 2045 | * Not only is this an optimisation, but it is also required |
| 2046 | * to check that the address is actually valid, when atomic | ||
| 2047 | * usercopies are used, below. | ||
| 2048 | */ | ||
| 2049 | if (unlikely(iov_iter_fault_in_readable(i, bytes))) { | ||
| 2050 | status = -EFAULT; | ||
| 2051 | break; | ||
| 1900 | } | 2052 | } |
| 1901 | page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); | 2053 | |
| 2054 | page = __grab_cache_page(mapping, index); | ||
| 1902 | if (!page) { | 2055 | if (!page) { |
| 1903 | status = -ENOMEM; | 2056 | status = -ENOMEM; |
| 1904 | break; | 2057 | break; |
| 1905 | } | 2058 | } |
| 1906 | 2059 | ||
| 1907 | if (unlikely(bytes == 0)) { | 2060 | /* |
| 1908 | status = 0; | 2061 | * non-uptodate pages cannot cope with short copies, and we |
| 1909 | copied = 0; | 2062 | * cannot take a pagefault with the destination page locked. |
| 1910 | goto zero_length_segment; | 2063 | * So pin the source page to copy it. |
| 1911 | } | 2064 | */ |
| 2065 | if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) { | ||
| 2066 | unlock_page(page); | ||
| 1912 | 2067 | ||
| 1913 | status = a_ops->prepare_write(file, page, offset, offset+bytes); | 2068 | src_page = alloc_page(GFP_KERNEL); |
| 1914 | if (unlikely(status)) { | 2069 | if (!src_page) { |
| 1915 | loff_t isize = i_size_read(inode); | 2070 | page_cache_release(page); |
| 2071 | status = -ENOMEM; | ||
| 2072 | break; | ||
| 2073 | } | ||
| 2074 | |||
| 2075 | /* | ||
| 2076 | * Cannot get_user_pages with a page locked for the | ||
| 2077 | * same reason as we can't take a page fault with a | ||
| 2078 | * page locked (as explained below). | ||
| 2079 | */ | ||
| 2080 | copied = iov_iter_copy_from_user(src_page, i, | ||
| 2081 | offset, bytes); | ||
| 2082 | if (unlikely(copied == 0)) { | ||
| 2083 | status = -EFAULT; | ||
| 2084 | page_cache_release(page); | ||
| 2085 | page_cache_release(src_page); | ||
| 2086 | break; | ||
| 2087 | } | ||
| 2088 | bytes = copied; | ||
| 1916 | 2089 | ||
| 1917 | if (status != AOP_TRUNCATED_PAGE) | 2090 | lock_page(page); |
| 2091 | /* | ||
| 2092 | * Can't handle the page going uptodate here, because | ||
| 2093 | * that means we would use non-atomic usercopies, which | ||
| 2094 | * zero out the tail of the page, which can cause | ||
| 2095 | * zeroes to become transiently visible. We could just | ||
| 2096 | * use a non-zeroing copy, but the APIs aren't too | ||
| 2097 | * consistent. | ||
| 2098 | */ | ||
| 2099 | if (unlikely(!page->mapping || PageUptodate(page))) { | ||
| 1918 | unlock_page(page); | 2100 | unlock_page(page); |
| 1919 | page_cache_release(page); | 2101 | page_cache_release(page); |
| 1920 | if (status == AOP_TRUNCATED_PAGE) | 2102 | page_cache_release(src_page); |
| 1921 | continue; | 2103 | continue; |
| 2104 | } | ||
| 2105 | } | ||
| 2106 | |||
| 2107 | status = a_ops->prepare_write(file, page, offset, offset+bytes); | ||
| 2108 | if (unlikely(status)) | ||
| 2109 | goto fs_write_aop_error; | ||
| 2110 | |||
| 2111 | if (!src_page) { | ||
| 1922 | /* | 2112 | /* |
| 1923 | * prepare_write() may have instantiated a few blocks | 2113 | * Must not enter the pagefault handler here, because |
| 1924 | * outside i_size. Trim these off again. | 2114 | * we hold the page lock, so we might recursively |
| 2115 | * deadlock on the same lock, or get an ABBA deadlock | ||
| 2116 | * against a different lock, or against the mmap_sem | ||
| 2117 | * (which nests outside the page lock). So increment | ||
| 2118 | * preempt count, and use _atomic usercopies. | ||
| 2119 | * | ||
| 2120 | * The page is uptodate so we are OK to encounter a | ||
| 2121 | * short copy: if unmodified parts of the page are | ||
| 2122 | * marked dirty and written out to disk, it doesn't | ||
| 2123 | * really matter. | ||
| 1925 | */ | 2124 | */ |
| 1926 | if (pos + bytes > isize) | 2125 | pagefault_disable(); |
| 1927 | vmtruncate(inode, isize); | 2126 | copied = iov_iter_copy_from_user_atomic(page, i, |
| 1928 | break; | 2127 | offset, bytes); |
| 2128 | pagefault_enable(); | ||
| 2129 | } else { | ||
| 2130 | void *src, *dst; | ||
| 2131 | src = kmap_atomic(src_page, KM_USER0); | ||
| 2132 | dst = kmap_atomic(page, KM_USER1); | ||
| 2133 | memcpy(dst + offset, src + offset, bytes); | ||
| 2134 | kunmap_atomic(dst, KM_USER1); | ||
| 2135 | kunmap_atomic(src, KM_USER0); | ||
| 2136 | copied = bytes; | ||
| 1929 | } | 2137 | } |
| 1930 | if (likely(nr_segs == 1)) | ||
| 1931 | copied = filemap_copy_from_user(page, offset, | ||
| 1932 | buf, bytes); | ||
| 1933 | else | ||
| 1934 | copied = filemap_copy_from_user_iovec(page, offset, | ||
| 1935 | cur_iov, iov_base, bytes); | ||
| 1936 | flush_dcache_page(page); | 2138 | flush_dcache_page(page); |
| 2139 | |||
| 1937 | status = a_ops->commit_write(file, page, offset, offset+bytes); | 2140 | status = a_ops->commit_write(file, page, offset, offset+bytes); |
| 1938 | if (status == AOP_TRUNCATED_PAGE) { | 2141 | if (unlikely(status < 0)) |
| 1939 | page_cache_release(page); | 2142 | goto fs_write_aop_error; |
| 1940 | continue; | 2143 | if (unlikely(status > 0)) /* filesystem did partial write */ |
| 1941 | } | 2144 | copied = min_t(size_t, copied, status); |
| 1942 | zero_length_segment: | 2145 | |
| 1943 | if (likely(copied >= 0)) { | ||
| 1944 | if (!status) | ||
| 1945 | status = copied; | ||
| 1946 | |||
| 1947 | if (status >= 0) { | ||
| 1948 | written += status; | ||
| 1949 | count -= status; | ||
| 1950 | pos += status; | ||
| 1951 | buf += status; | ||
| 1952 | if (unlikely(nr_segs > 1)) { | ||
| 1953 | filemap_set_next_iovec(&cur_iov, | ||
| 1954 | &iov_base, status); | ||
| 1955 | if (count) | ||
| 1956 | buf = cur_iov->iov_base + | ||
| 1957 | iov_base; | ||
| 1958 | } else { | ||
| 1959 | iov_base += status; | ||
| 1960 | } | ||
| 1961 | } | ||
| 1962 | } | ||
| 1963 | if (unlikely(copied != bytes)) | ||
| 1964 | if (status >= 0) | ||
| 1965 | status = -EFAULT; | ||
| 1966 | unlock_page(page); | 2146 | unlock_page(page); |
| 1967 | mark_page_accessed(page); | 2147 | mark_page_accessed(page); |
| 1968 | page_cache_release(page); | 2148 | page_cache_release(page); |
| 1969 | if (status < 0) | 2149 | if (src_page) |
| 1970 | break; | 2150 | page_cache_release(src_page); |
| 2151 | |||
| 2152 | iov_iter_advance(i, copied); | ||
| 2153 | pos += copied; | ||
| 2154 | written += copied; | ||
| 2155 | |||
| 1971 | balance_dirty_pages_ratelimited(mapping); | 2156 | balance_dirty_pages_ratelimited(mapping); |
| 1972 | cond_resched(); | 2157 | cond_resched(); |
| 1973 | } while (count); | 2158 | continue; |
| 1974 | *ppos = pos; | ||
| 1975 | 2159 | ||
| 1976 | if (cached_page) | 2160 | fs_write_aop_error: |
| 1977 | page_cache_release(cached_page); | 2161 | unlock_page(page); |
| 2162 | page_cache_release(page); | ||
| 2163 | if (src_page) | ||
| 2164 | page_cache_release(src_page); | ||
| 2165 | |||
| 2166 | /* | ||
| 2167 | * prepare_write() may have instantiated a few blocks | ||
| 2168 | * outside i_size. Trim these off again. Don't need | ||
| 2169 | * i_size_read because we hold i_mutex. | ||
| 2170 | */ | ||
| 2171 | if (pos + bytes > inode->i_size) | ||
| 2172 | vmtruncate(inode, inode->i_size); | ||
| 2173 | break; | ||
| 2174 | } while (iov_iter_count(i)); | ||
| 2175 | |||
| 2176 | return written ? written : status; | ||
| 2177 | } | ||
| 2178 | |||
| 2179 | static ssize_t generic_perform_write(struct file *file, | ||
| 2180 | struct iov_iter *i, loff_t pos) | ||
| 2181 | { | ||
| 2182 | struct address_space *mapping = file->f_mapping; | ||
| 2183 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
| 2184 | long status = 0; | ||
| 2185 | ssize_t written = 0; | ||
| 2186 | unsigned int flags = 0; | ||
| 1978 | 2187 | ||
| 1979 | /* | 2188 | /* |
| 1980 | * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC | 2189 | * Copies from kernel address space cannot fail (NFSD is a big user). |
| 1981 | */ | 2190 | */ |
| 2191 | if (segment_eq(get_fs(), KERNEL_DS)) | ||
| 2192 | flags |= AOP_FLAG_UNINTERRUPTIBLE; | ||
| 2193 | |||
| 2194 | do { | ||
| 2195 | struct page *page; | ||
| 2196 | pgoff_t index; /* Pagecache index for current page */ | ||
| 2197 | unsigned long offset; /* Offset into pagecache page */ | ||
| 2198 | unsigned long bytes; /* Bytes to write to page */ | ||
| 2199 | size_t copied; /* Bytes copied from user */ | ||
| 2200 | void *fsdata; | ||
| 2201 | |||
| 2202 | offset = (pos & (PAGE_CACHE_SIZE - 1)); | ||
| 2203 | index = pos >> PAGE_CACHE_SHIFT; | ||
| 2204 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, | ||
| 2205 | iov_iter_count(i)); | ||
| 2206 | |||
| 2207 | again: | ||
| 2208 | |||
| 2209 | /* | ||
| 2210 | * Bring in the user page that we will copy from _first_. | ||
| 2211 | * Otherwise there's a nasty deadlock on copying from the | ||
| 2212 | * same page as we're writing to, without it being marked | ||
| 2213 | * up-to-date. | ||
| 2214 | * | ||
| 2215 | * Not only is this an optimisation, but it is also required | ||
| 2216 | * to check that the address is actually valid, when atomic | ||
| 2217 | * usercopies are used, below. | ||
| 2218 | */ | ||
| 2219 | if (unlikely(iov_iter_fault_in_readable(i, bytes))) { | ||
| 2220 | status = -EFAULT; | ||
| 2221 | break; | ||
| 2222 | } | ||
| 2223 | |||
| 2224 | status = a_ops->write_begin(file, mapping, pos, bytes, flags, | ||
| 2225 | &page, &fsdata); | ||
| 2226 | if (unlikely(status)) | ||
| 2227 | break; | ||
| 2228 | |||
| 2229 | pagefault_disable(); | ||
| 2230 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); | ||
| 2231 | pagefault_enable(); | ||
| 2232 | flush_dcache_page(page); | ||
| 2233 | |||
| 2234 | status = a_ops->write_end(file, mapping, pos, bytes, copied, | ||
| 2235 | page, fsdata); | ||
| 2236 | if (unlikely(status < 0)) | ||
| 2237 | break; | ||
| 2238 | copied = status; | ||
| 2239 | |||
| 2240 | cond_resched(); | ||
| 2241 | |||
| 2242 | if (unlikely(copied == 0)) { | ||
| 2243 | /* | ||
| 2244 | * If we were unable to copy any data at all, we must | ||
| 2245 | * fall back to a single segment length write. | ||
| 2246 | * | ||
| 2247 | * If we didn't fallback here, we could livelock | ||
| 2248 | * because not all segments in the iov can be copied at | ||
| 2249 | * once without a pagefault. | ||
| 2250 | */ | ||
| 2251 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, | ||
| 2252 | iov_iter_single_seg_count(i)); | ||
| 2253 | goto again; | ||
| 2254 | } | ||
| 2255 | iov_iter_advance(i, copied); | ||
| 2256 | pos += copied; | ||
| 2257 | written += copied; | ||
| 2258 | |||
| 2259 | balance_dirty_pages_ratelimited(mapping); | ||
| 2260 | |||
| 2261 | } while (iov_iter_count(i)); | ||
| 2262 | |||
| 2263 | return written ? written : status; | ||
| 2264 | } | ||
| 2265 | |||
| 2266 | ssize_t | ||
| 2267 | generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | ||
| 2268 | unsigned long nr_segs, loff_t pos, loff_t *ppos, | ||
| 2269 | size_t count, ssize_t written) | ||
| 2270 | { | ||
| 2271 | struct file *file = iocb->ki_filp; | ||
| 2272 | struct address_space *mapping = file->f_mapping; | ||
| 2273 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
| 2274 | struct inode *inode = mapping->host; | ||
| 2275 | ssize_t status; | ||
| 2276 | struct iov_iter i; | ||
| 2277 | |||
| 2278 | iov_iter_init(&i, iov, nr_segs, count, written); | ||
| 2279 | if (a_ops->write_begin) | ||
| 2280 | status = generic_perform_write(file, &i, pos); | ||
| 2281 | else | ||
| 2282 | status = generic_perform_write_2copy(file, &i, pos); | ||
| 2283 | |||
| 1982 | if (likely(status >= 0)) { | 2284 | if (likely(status >= 0)) { |
| 2285 | written += status; | ||
| 2286 | *ppos = pos + status; | ||
| 2287 | |||
| 2288 | /* | ||
| 2289 | * For now, when the user asks for O_SYNC, we'll actually give | ||
| 2290 | * O_DSYNC | ||
| 2291 | */ | ||
| 1983 | if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2292 | if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { |
| 1984 | if (!a_ops->writepage || !is_sync_kiocb(iocb)) | 2293 | if (!a_ops->writepage || !is_sync_kiocb(iocb)) |
| 1985 | status = generic_osync_inode(inode, mapping, | 2294 | status = generic_osync_inode(inode, mapping, |
| @@ -1995,7 +2304,6 @@ zero_length_segment: | |||
| 1995 | if (unlikely(file->f_flags & O_DIRECT) && written) | 2304 | if (unlikely(file->f_flags & O_DIRECT) && written) |
| 1996 | status = filemap_write_and_wait(mapping); | 2305 | status = filemap_write_and_wait(mapping); |
| 1997 | 2306 | ||
| 1998 | pagevec_lru_add(&lru_pvec); | ||
| 1999 | return written ? written : status; | 2307 | return written ? written : status; |
| 2000 | } | 2308 | } |
| 2001 | EXPORT_SYMBOL(generic_file_buffered_write); | 2309 | EXPORT_SYMBOL(generic_file_buffered_write); |
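Tying the filemap.c changes together: generic_perform_write() is used only when the filesystem provides ->write_begin/->write_end; otherwise generic_perform_write_2copy() keeps driving the legacy prepare_write/commit_write pair. As a rough sketch of what the new side of that interface can look like for a hypothetical filesystem whose pages need no block mapping, with invented simple_fs_ names and with partial-write and flag handling deliberately omitted:

/* Assumes <linux/fs.h>, <linux/pagemap.h>, <linux/highmem.h>. */
static int simple_fs_write_begin(struct file *file, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned flags,
				struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	struct page *page;

	page = __grab_cache_page(mapping, index);
	if (!page)
		return -ENOMEM;
	if (!PageUptodate(page)) {
		/* no backing store to read from, so just zero-fill */
		clear_highpage(page);
		flush_dcache_page(page);
		SetPageUptodate(page);
	}
	*pagep = page;
	return 0;
}

static int simple_fs_write_end(struct file *file, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned copied,
				struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;

	set_page_dirty(page);
	/* i_mutex is held by the caller, so a bare i_size update is safe */
	if (pos + copied > inode->i_size)
		i_size_write(inode, pos + copied);

	unlock_page(page);
	page_cache_release(page);
	return copied;
}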
diff --git a/mm/filemap.h b/mm/filemap.h
deleted file mode 100644
index c2bff04c84ed..000000000000
--- a/mm/filemap.h
+++ /dev/null
| @@ -1,103 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/filemap.h | ||
| 3 | * | ||
| 4 | * Copyright (C) 1994-1999 Linus Torvalds | ||
| 5 | */ | ||
| 6 | |||
| 7 | #ifndef __FILEMAP_H | ||
| 8 | #define __FILEMAP_H | ||
| 9 | |||
| 10 | #include <linux/types.h> | ||
| 11 | #include <linux/fs.h> | ||
| 12 | #include <linux/mm.h> | ||
| 13 | #include <linux/highmem.h> | ||
| 14 | #include <linux/uio.h> | ||
| 15 | #include <linux/uaccess.h> | ||
| 16 | |||
| 17 | size_t | ||
| 18 | __filemap_copy_from_user_iovec_inatomic(char *vaddr, | ||
| 19 | const struct iovec *iov, | ||
| 20 | size_t base, | ||
| 21 | size_t bytes); | ||
| 22 | |||
| 23 | /* | ||
| 24 | * Copy as much as we can into the page and return the number of bytes which | ||
| 25 | * were sucessfully copied. If a fault is encountered then clear the page | ||
| 26 | * out to (offset+bytes) and return the number of bytes which were copied. | ||
| 27 | * | ||
| 28 | * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache | ||
| 29 | * to *NOT* zero any tail of the buffer that it failed to copy. If it does, | ||
| 30 | * and if the following non-atomic copy succeeds, then there is a small window | ||
| 31 | * where the target page contains neither the data before the write, nor the | ||
| 32 | * data after the write (it contains zero). A read at this time will see | ||
| 33 | * data that is inconsistent with any ordering of the read and the write. | ||
| 34 | * (This has been detected in practice). | ||
| 35 | */ | ||
| 36 | static inline size_t | ||
| 37 | filemap_copy_from_user(struct page *page, unsigned long offset, | ||
| 38 | const char __user *buf, unsigned bytes) | ||
| 39 | { | ||
| 40 | char *kaddr; | ||
| 41 | int left; | ||
| 42 | |||
| 43 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 44 | left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); | ||
| 45 | kunmap_atomic(kaddr, KM_USER0); | ||
| 46 | |||
| 47 | if (left != 0) { | ||
| 48 | /* Do it the slow way */ | ||
| 49 | kaddr = kmap(page); | ||
| 50 | left = __copy_from_user_nocache(kaddr + offset, buf, bytes); | ||
| 51 | kunmap(page); | ||
| 52 | } | ||
| 53 | return bytes - left; | ||
| 54 | } | ||
| 55 | |||
| 56 | /* | ||
| 57 | * This has the same sideeffects and return value as filemap_copy_from_user(). | ||
| 58 | * The difference is that on a fault we need to memset the remainder of the | ||
| 59 | * page (out to offset+bytes), to emulate filemap_copy_from_user()'s | ||
| 60 | * single-segment behaviour. | ||
| 61 | */ | ||
| 62 | static inline size_t | ||
| 63 | filemap_copy_from_user_iovec(struct page *page, unsigned long offset, | ||
| 64 | const struct iovec *iov, size_t base, size_t bytes) | ||
| 65 | { | ||
| 66 | char *kaddr; | ||
| 67 | size_t copied; | ||
| 68 | |||
| 69 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 70 | copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, | ||
| 71 | base, bytes); | ||
| 72 | kunmap_atomic(kaddr, KM_USER0); | ||
| 73 | if (copied != bytes) { | ||
| 74 | kaddr = kmap(page); | ||
| 75 | copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, | ||
| 76 | base, bytes); | ||
| 77 | if (bytes - copied) | ||
| 78 | memset(kaddr + offset + copied, 0, bytes - copied); | ||
| 79 | kunmap(page); | ||
| 80 | } | ||
| 81 | return copied; | ||
| 82 | } | ||
| 83 | |||
| 84 | static inline void | ||
| 85 | filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) | ||
| 86 | { | ||
| 87 | const struct iovec *iov = *iovp; | ||
| 88 | size_t base = *basep; | ||
| 89 | |||
| 90 | do { | ||
| 91 | int copy = min(bytes, iov->iov_len - base); | ||
| 92 | |||
| 93 | bytes -= copy; | ||
| 94 | base += copy; | ||
| 95 | if (iov->iov_len == base) { | ||
| 96 | iov++; | ||
| 97 | base = 0; | ||
| 98 | } | ||
| 99 | } while (bytes); | ||
| 100 | *iovp = iov; | ||
| 101 | *basep = base; | ||
| 102 | } | ||
| 103 | #endif | ||
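The helpers deleted above follow two small patterns that recur in the replacement code in filemap.c: an atomic copy with a sleeping fallback, and an (iov, base) cursor advanced across an iovec array. The sketch below is a minimal userspace model of the second pattern only; it is not kernel code, and advance_iovec() is a made-up name, but the loop mirrors filemap_set_next_iovec().

/* Standalone model of the (iov, base) cursor advance; not kernel code. */
#include <stdio.h>
#include <stddef.h>
#include <sys/uio.h>

/* Advance the cursor by 'bytes' across the iovec array: consume the
 * remainder of the current segment first, then step to the next one.
 * Callers must not advance past the total length of the array.
 */
static void advance_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
{
	const struct iovec *iov = *iovp;
	size_t base = *basep;

	while (bytes) {
		size_t copy = iov->iov_len - base;

		if (copy > bytes)
			copy = bytes;
		bytes -= copy;
		base += copy;
		if (base == iov->iov_len) {
			iov++;
			base = 0;
		}
	}
	*iovp = iov;
	*basep = base;
}

int main(void)
{
	char a[10], b[20];
	struct iovec vec[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
	const struct iovec *cur = vec;
	size_t base = 0;

	advance_iovec(&cur, &base, 15);	/* eats all of a[] and 5 bytes of b[] */
	printf("segment %ld, offset %zu\n", (long)(cur - vec), base);
	return 0;
}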
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 53ee6a299635..32132f3cd641 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c | |||
| @@ -15,7 +15,6 @@ | |||
| 15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
| 16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
| 17 | #include <asm/tlbflush.h> | 17 | #include <asm/tlbflush.h> |
| 18 | #include "filemap.h" | ||
| 19 | 18 | ||
| 20 | /* | 19 | /* |
| 21 | * We do use our own empty page to avoid interference with other users | 20 | * We do use our own empty page to avoid interference with other users |
| @@ -288,6 +287,7 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
| 288 | unsigned long index; | 287 | unsigned long index; |
| 289 | unsigned long offset; | 288 | unsigned long offset; |
| 290 | size_t copied; | 289 | size_t copied; |
| 290 | char *kaddr; | ||
| 291 | 291 | ||
| 292 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ | 292 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ |
| 293 | index = pos >> PAGE_CACHE_SHIFT; | 293 | index = pos >> PAGE_CACHE_SHIFT; |
| @@ -295,14 +295,6 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
| 295 | if (bytes > count) | 295 | if (bytes > count) |
| 296 | bytes = count; | 296 | bytes = count; |
| 297 | 297 | ||
| 298 | /* | ||
| 299 | * Bring in the user page that we will copy from _first_. | ||
| 300 | * Otherwise there's a nasty deadlock on copying from the | ||
| 301 | * same page as we're writing to, without it being marked | ||
| 302 | * up-to-date. | ||
| 303 | */ | ||
| 304 | fault_in_pages_readable(buf, bytes); | ||
| 305 | |||
| 306 | page = a_ops->get_xip_page(mapping, | 298 | page = a_ops->get_xip_page(mapping, |
| 307 | index*(PAGE_SIZE/512), 0); | 299 | index*(PAGE_SIZE/512), 0); |
| 308 | if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) { | 300 | if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) { |
| @@ -319,8 +311,13 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
| 319 | break; | 311 | break; |
| 320 | } | 312 | } |
| 321 | 313 | ||
| 322 | copied = filemap_copy_from_user(page, offset, buf, bytes); | 314 | fault_in_pages_readable(buf, bytes); |
| 315 | kaddr = kmap_atomic(page, KM_USER0); | ||
| 316 | copied = bytes - | ||
| 317 | __copy_from_user_inatomic_nocache(kaddr, buf, bytes); | ||
| 318 | kunmap_atomic(kaddr, KM_USER0); | ||
| 323 | flush_dcache_page(page); | 319 | flush_dcache_page(page); |
| 320 | |||
| 324 | if (likely(copied > 0)) { | 321 | if (likely(copied > 0)) { |
| 325 | status = copied; | 322 | status = copied; |
| 326 | 323 | ||
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index eab8c428cc93..ae2959bb59cb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -23,12 +23,16 @@ | |||
| 23 | 23 | ||
| 24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
| 25 | static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; | 25 | static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; |
| 26 | static unsigned long surplus_huge_pages; | ||
| 26 | unsigned long max_huge_pages; | 27 | unsigned long max_huge_pages; |
| 27 | static struct list_head hugepage_freelists[MAX_NUMNODES]; | 28 | static struct list_head hugepage_freelists[MAX_NUMNODES]; |
| 28 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; | 29 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; |
| 29 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; | 30 | static unsigned int free_huge_pages_node[MAX_NUMNODES]; |
| 31 | static unsigned int surplus_huge_pages_node[MAX_NUMNODES]; | ||
| 30 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; | 32 | static gfp_t htlb_alloc_mask = GFP_HIGHUSER; |
| 31 | unsigned long hugepages_treat_as_movable; | 33 | unsigned long hugepages_treat_as_movable; |
| 34 | int hugetlb_dynamic_pool; | ||
| 35 | static int hugetlb_next_nid; | ||
| 32 | 36 | ||
| 33 | /* | 37 | /* |
| 34 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages | 38 | * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages |
| @@ -85,6 +89,8 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, | |||
| 85 | list_del(&page->lru); | 89 | list_del(&page->lru); |
| 86 | free_huge_pages--; | 90 | free_huge_pages--; |
| 87 | free_huge_pages_node[nid]--; | 91 | free_huge_pages_node[nid]--; |
| 92 | if (vma && vma->vm_flags & VM_MAYSHARE) | ||
| 93 | resv_huge_pages--; | ||
| 88 | break; | 94 | break; |
| 89 | } | 95 | } |
| 90 | } | 96 | } |
| @@ -92,58 +98,269 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, | |||
| 92 | return page; | 98 | return page; |
| 93 | } | 99 | } |
| 94 | 100 | ||
| 101 | static void update_and_free_page(struct page *page) | ||
| 102 | { | ||
| 103 | int i; | ||
| 104 | nr_huge_pages--; | ||
| 105 | nr_huge_pages_node[page_to_nid(page)]--; | ||
| 106 | for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { | ||
| 107 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | ||
| 108 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | ||
| 109 | 1 << PG_private | 1<< PG_writeback); | ||
| 110 | } | ||
| 111 | set_compound_page_dtor(page, NULL); | ||
| 112 | set_page_refcounted(page); | ||
| 113 | __free_pages(page, HUGETLB_PAGE_ORDER); | ||
| 114 | } | ||
| 115 | |||
| 95 | static void free_huge_page(struct page *page) | 116 | static void free_huge_page(struct page *page) |
| 96 | { | 117 | { |
| 97 | BUG_ON(page_count(page)); | 118 | int nid = page_to_nid(page); |
| 98 | 119 | ||
| 120 | BUG_ON(page_count(page)); | ||
| 99 | INIT_LIST_HEAD(&page->lru); | 121 | INIT_LIST_HEAD(&page->lru); |
| 100 | 122 | ||
| 101 | spin_lock(&hugetlb_lock); | 123 | spin_lock(&hugetlb_lock); |
| 102 | enqueue_huge_page(page); | 124 | if (surplus_huge_pages_node[nid]) { |
| 125 | update_and_free_page(page); | ||
| 126 | surplus_huge_pages--; | ||
| 127 | surplus_huge_pages_node[nid]--; | ||
| 128 | } else { | ||
| 129 | enqueue_huge_page(page); | ||
| 130 | } | ||
| 103 | spin_unlock(&hugetlb_lock); | 131 | spin_unlock(&hugetlb_lock); |
| 104 | } | 132 | } |
| 105 | 133 | ||
| 106 | static int alloc_fresh_huge_page(void) | 134 | /* |
| 135 | * Increment or decrement surplus_huge_pages. Keep node-specific counters | ||
| 136 | * balanced by operating on them in a round-robin fashion. | ||
| 137 | * Returns 1 if an adjustment was made. | ||
| 138 | */ | ||
| 139 | static int adjust_pool_surplus(int delta) | ||
| 107 | { | 140 | { |
| 108 | static int prev_nid; | 141 | static int prev_nid; |
| 109 | struct page *page; | 142 | int nid = prev_nid; |
| 110 | int nid; | 143 | int ret = 0; |
| 144 | |||
| 145 | VM_BUG_ON(delta != -1 && delta != 1); | ||
| 146 | do { | ||
| 147 | nid = next_node(nid, node_online_map); | ||
| 148 | if (nid == MAX_NUMNODES) | ||
| 149 | nid = first_node(node_online_map); | ||
| 150 | |||
| 151 | /* To shrink on this node, there must be a surplus page */ | ||
| 152 | if (delta < 0 && !surplus_huge_pages_node[nid]) | ||
| 153 | continue; | ||
| 154 | /* Surplus cannot exceed the total number of pages */ | ||
| 155 | if (delta > 0 && surplus_huge_pages_node[nid] >= | ||
| 156 | nr_huge_pages_node[nid]) | ||
| 157 | continue; | ||
| 158 | |||
| 159 | surplus_huge_pages += delta; | ||
| 160 | surplus_huge_pages_node[nid] += delta; | ||
| 161 | ret = 1; | ||
| 162 | break; | ||
| 163 | } while (nid != prev_nid); | ||
| 111 | 164 | ||
| 112 | /* | ||
| 113 | * Copy static prev_nid to local nid, work on that, then copy it | ||
| 114 | * back to prev_nid afterwards: otherwise there's a window in which | ||
| 115 | * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node. | ||
| 116 | * But we don't need to use a spin_lock here: it really doesn't | ||
| 117 | * matter if occasionally a racer chooses the same nid as we do. | ||
| 118 | */ | ||
| 119 | nid = next_node(prev_nid, node_online_map); | ||
| 120 | if (nid == MAX_NUMNODES) | ||
| 121 | nid = first_node(node_online_map); | ||
| 122 | prev_nid = nid; | 165 | prev_nid = nid; |
| 166 | return ret; | ||
| 167 | } | ||
| 168 | |||
| 169 | static struct page *alloc_fresh_huge_page_node(int nid) | ||
| 170 | { | ||
| 171 | struct page *page; | ||
| 123 | 172 | ||
| 124 | page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, | 173 | page = alloc_pages_node(nid, |
| 174 | htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN, | ||
| 175 | HUGETLB_PAGE_ORDER); | ||
| 176 | if (page) { | ||
| 177 | set_compound_page_dtor(page, free_huge_page); | ||
| 178 | spin_lock(&hugetlb_lock); | ||
| 179 | nr_huge_pages++; | ||
| 180 | nr_huge_pages_node[nid]++; | ||
| 181 | spin_unlock(&hugetlb_lock); | ||
| 182 | put_page(page); /* free it into the hugepage allocator */ | ||
| 183 | } | ||
| 184 | |||
| 185 | return page; | ||
| 186 | } | ||
| 187 | |||
| 188 | static int alloc_fresh_huge_page(void) | ||
| 189 | { | ||
| 190 | struct page *page; | ||
| 191 | int start_nid; | ||
| 192 | int next_nid; | ||
| 193 | int ret = 0; | ||
| 194 | |||
| 195 | start_nid = hugetlb_next_nid; | ||
| 196 | |||
| 197 | do { | ||
| 198 | page = alloc_fresh_huge_page_node(hugetlb_next_nid); | ||
| 199 | if (page) | ||
| 200 | ret = 1; | ||
| 201 | /* | ||
| 202 | * Use a helper variable to find the next node and then | ||
| 203 | * copy it back to hugetlb_next_nid afterwards: | ||
| 204 | * otherwise there's a window in which a racer might | ||
| 205 | * pass invalid nid MAX_NUMNODES to alloc_pages_node. | ||
| 206 | * But we don't need to use a spin_lock here: it really | ||
| 207 | * doesn't matter if occasionally a racer chooses the | ||
| 208 | * same nid as we do. Move nid forward in the mask even | ||
| 209 | * if we just successfully allocated a hugepage so that | ||
| 210 | * the next caller gets hugepages on the next node. | ||
| 211 | */ | ||
| 212 | next_nid = next_node(hugetlb_next_nid, node_online_map); | ||
| 213 | if (next_nid == MAX_NUMNODES) | ||
| 214 | next_nid = first_node(node_online_map); | ||
| 215 | hugetlb_next_nid = next_nid; | ||
| 216 | } while (!page && hugetlb_next_nid != start_nid); | ||
| 217 | |||
| 218 | return ret; | ||
| 219 | } | ||
| 220 | |||
| 221 | static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma, | ||
| 222 | unsigned long address) | ||
| 223 | { | ||
| 224 | struct page *page; | ||
| 225 | |||
| 226 | /* Check if the dynamic pool is enabled */ | ||
| 227 | if (!hugetlb_dynamic_pool) | ||
| 228 | return NULL; | ||
| 229 | |||
| 230 | page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, | ||
| 125 | HUGETLB_PAGE_ORDER); | 231 | HUGETLB_PAGE_ORDER); |
| 126 | if (page) { | 232 | if (page) { |
| 127 | set_compound_page_dtor(page, free_huge_page); | 233 | set_compound_page_dtor(page, free_huge_page); |
| 128 | spin_lock(&hugetlb_lock); | 234 | spin_lock(&hugetlb_lock); |
| 129 | nr_huge_pages++; | 235 | nr_huge_pages++; |
| 130 | nr_huge_pages_node[page_to_nid(page)]++; | 236 | nr_huge_pages_node[page_to_nid(page)]++; |
| 237 | surplus_huge_pages++; | ||
| 238 | surplus_huge_pages_node[page_to_nid(page)]++; | ||
| 131 | spin_unlock(&hugetlb_lock); | 239 | spin_unlock(&hugetlb_lock); |
| 132 | put_page(page); /* free it into the hugepage allocator */ | ||
| 133 | return 1; | ||
| 134 | } | 240 | } |
| 135 | return 0; | 241 | |
| 242 | return page; | ||
| 243 | } | ||
| 244 | |||
| 245 | /* | ||
| 246 | * Increase the hugetlb pool such that it can accommodate a reservation | ||
| 247 | * of size 'delta'. | ||
| 248 | */ | ||
| 249 | static int gather_surplus_pages(int delta) | ||
| 250 | { | ||
| 251 | struct list_head surplus_list; | ||
| 252 | struct page *page, *tmp; | ||
| 253 | int ret, i; | ||
| 254 | int needed, allocated; | ||
| 255 | |||
| 256 | needed = (resv_huge_pages + delta) - free_huge_pages; | ||
| 257 | if (needed <= 0) | ||
| 258 | return 0; | ||
| 259 | |||
| 260 | allocated = 0; | ||
| 261 | INIT_LIST_HEAD(&surplus_list); | ||
| 262 | |||
| 263 | ret = -ENOMEM; | ||
| 264 | retry: | ||
| 265 | spin_unlock(&hugetlb_lock); | ||
| 266 | for (i = 0; i < needed; i++) { | ||
| 267 | page = alloc_buddy_huge_page(NULL, 0); | ||
| 268 | if (!page) { | ||
| 269 | /* | ||
| 270 | * We were not able to allocate enough pages to | ||
| 271 | * satisfy the entire reservation so we free what | ||
| 272 | * we've allocated so far. | ||
| 273 | */ | ||
| 274 | spin_lock(&hugetlb_lock); | ||
| 275 | needed = 0; | ||
| 276 | goto free; | ||
| 277 | } | ||
| 278 | |||
| 279 | list_add(&page->lru, &surplus_list); | ||
| 280 | } | ||
| 281 | allocated += needed; | ||
| 282 | |||
| 283 | /* | ||
| 284 | * After retaking hugetlb_lock, we need to recalculate 'needed' | ||
| 285 | * because either resv_huge_pages or free_huge_pages may have changed. | ||
| 286 | */ | ||
| 287 | spin_lock(&hugetlb_lock); | ||
| 288 | needed = (resv_huge_pages + delta) - (free_huge_pages + allocated); | ||
| 289 | if (needed > 0) | ||
| 290 | goto retry; | ||
| 291 | |||
| 292 | /* | ||
| 293 | * The surplus_list now contains _at_least_ the number of extra pages | ||
| 294 | * needed to accommodate the reservation. Add the appropriate number | ||
| 295 | * of pages to the hugetlb pool and free the extras back to the buddy | ||
| 296 | * allocator. | ||
| 297 | */ | ||
| 298 | needed += allocated; | ||
| 299 | ret = 0; | ||
| 300 | free: | ||
| 301 | list_for_each_entry_safe(page, tmp, &surplus_list, lru) { | ||
| 302 | list_del(&page->lru); | ||
| 303 | if ((--needed) >= 0) | ||
| 304 | enqueue_huge_page(page); | ||
| 305 | else { | ||
| 306 | /* | ||
| 307 | * Decrement the refcount and free the page using its | ||
| 308 | * destructor. This must be done with hugetlb_lock | ||
| 309 | * unlocked which is safe because free_huge_page takes | ||
| 310 | * hugetlb_lock before deciding how to free the page. | ||
| 311 | */ | ||
| 312 | spin_unlock(&hugetlb_lock); | ||
| 313 | put_page(page); | ||
| 314 | spin_lock(&hugetlb_lock); | ||
| 315 | } | ||
| 316 | } | ||
| 317 | |||
| 318 | return ret; | ||
| 319 | } | ||
| 320 | |||
| 321 | /* | ||
| 322 | * When releasing a hugetlb pool reservation, any surplus pages that were | ||
| 323 | * allocated to satisfy the reservation must be explicitly freed if they were | ||
| 324 | * never used. | ||
| 325 | */ | ||
| 326 | void return_unused_surplus_pages(unsigned long unused_resv_pages) | ||
| 327 | { | ||
| 328 | static int nid = -1; | ||
| 329 | struct page *page; | ||
| 330 | unsigned long nr_pages; | ||
| 331 | |||
| 332 | nr_pages = min(unused_resv_pages, surplus_huge_pages); | ||
| 333 | |||
| 334 | while (nr_pages) { | ||
| 335 | nid = next_node(nid, node_online_map); | ||
| 336 | if (nid == MAX_NUMNODES) | ||
| 337 | nid = first_node(node_online_map); | ||
| 338 | |||
| 339 | if (!surplus_huge_pages_node[nid]) | ||
| 340 | continue; | ||
| 341 | |||
| 342 | if (!list_empty(&hugepage_freelists[nid])) { | ||
| 343 | page = list_entry(hugepage_freelists[nid].next, | ||
| 344 | struct page, lru); | ||
| 345 | list_del(&page->lru); | ||
| 346 | update_and_free_page(page); | ||
| 347 | free_huge_pages--; | ||
| 348 | free_huge_pages_node[nid]--; | ||
| 349 | surplus_huge_pages--; | ||
| 350 | surplus_huge_pages_node[nid]--; | ||
| 351 | nr_pages--; | ||
| 352 | } | ||
| 353 | } | ||
| 136 | } | 354 | } |
| 137 | 355 | ||
| 138 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 356 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
| 139 | unsigned long addr) | 357 | unsigned long addr) |
| 140 | { | 358 | { |
| 141 | struct page *page; | 359 | struct page *page = NULL; |
| 360 | int use_reserved_page = vma->vm_flags & VM_MAYSHARE; | ||
| 142 | 361 | ||
| 143 | spin_lock(&hugetlb_lock); | 362 | spin_lock(&hugetlb_lock); |
| 144 | if (vma->vm_flags & VM_MAYSHARE) | 363 | if (!use_reserved_page && (free_huge_pages <= resv_huge_pages)) |
| 145 | resv_huge_pages--; | ||
| 146 | else if (free_huge_pages <= resv_huge_pages) | ||
| 147 | goto fail; | 364 | goto fail; |
| 148 | 365 | ||
| 149 | page = dequeue_huge_page(vma, addr); | 366 | page = dequeue_huge_page(vma, addr); |
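Both adjust_pool_surplus() and alloc_fresh_huge_page() in the hunk above walk the online nodes round-robin, remembering where the previous walk stopped so successive allocations are spread across nodes. The following is a userspace model of that wrap-around walk; a fixed array stands in for node_online_map, and all names here are illustrative rather than kernel APIs.

/* Round-robin walk over "online nodes", modelled with a plain array. */
#include <stdio.h>

#define NR_NODES 4
static int node_online[NR_NODES] = { 1, 0, 1, 1 };	/* node 1 is offline */
static int next_nid;					/* like hugetlb_next_nid */

/* Return the next online node after 'nid', wrapping at the end. */
static int next_online_node(int nid)
{
	do {
		nid = (nid + 1) % NR_NODES;
	} while (!node_online[nid]);
	return nid;
}

/* Pick a node for this allocation, then advance the cursor so the
 * following caller is steered to a different node (as in
 * alloc_fresh_huge_page()).
 */
static int pick_node(void)
{
	int nid = next_nid;

	next_nid = next_online_node(nid);
	return nid;
}

int main(void)
{
	int i;

	next_nid = 0;	/* equivalent of first_node(node_online_map) */
	for (i = 0; i < 6; i++)
		printf("allocation %d goes to node %d\n", i, pick_node());
	return 0;
}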
| @@ -155,10 +372,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
| 155 | return page; | 372 | return page; |
| 156 | 373 | ||
| 157 | fail: | 374 | fail: |
| 158 | if (vma->vm_flags & VM_MAYSHARE) | ||
| 159 | resv_huge_pages++; | ||
| 160 | spin_unlock(&hugetlb_lock); | 375 | spin_unlock(&hugetlb_lock); |
| 161 | return NULL; | 376 | |
| 377 | /* | ||
| 378 | * Private mappings do not use reserved huge pages so the allocation | ||
| 379 | * may have failed due to an undersized hugetlb pool. Try to grab a | ||
| 380 | * surplus huge page from the buddy allocator. | ||
| 381 | */ | ||
| 382 | if (!use_reserved_page) | ||
| 383 | page = alloc_buddy_huge_page(vma, addr); | ||
| 384 | |||
| 385 | return page; | ||
| 162 | } | 386 | } |
| 163 | 387 | ||
| 164 | static int __init hugetlb_init(void) | 388 | static int __init hugetlb_init(void) |
| @@ -171,6 +395,8 @@ static int __init hugetlb_init(void) | |||
| 171 | for (i = 0; i < MAX_NUMNODES; ++i) | 395 | for (i = 0; i < MAX_NUMNODES; ++i) |
| 172 | INIT_LIST_HEAD(&hugepage_freelists[i]); | 396 | INIT_LIST_HEAD(&hugepage_freelists[i]); |
| 173 | 397 | ||
| 398 | hugetlb_next_nid = first_node(node_online_map); | ||
| 399 | |||
| 174 | for (i = 0; i < max_huge_pages; ++i) { | 400 | for (i = 0; i < max_huge_pages; ++i) { |
| 175 | if (!alloc_fresh_huge_page()) | 401 | if (!alloc_fresh_huge_page()) |
| 176 | break; | 402 | break; |
| @@ -201,21 +427,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array) | |||
| 201 | } | 427 | } |
| 202 | 428 | ||
| 203 | #ifdef CONFIG_SYSCTL | 429 | #ifdef CONFIG_SYSCTL |
| 204 | static void update_and_free_page(struct page *page) | ||
| 205 | { | ||
| 206 | int i; | ||
| 207 | nr_huge_pages--; | ||
| 208 | nr_huge_pages_node[page_to_nid(page)]--; | ||
| 209 | for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { | ||
| 210 | page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | | ||
| 211 | 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | | ||
| 212 | 1 << PG_private | 1<< PG_writeback); | ||
| 213 | } | ||
| 214 | set_compound_page_dtor(page, NULL); | ||
| 215 | set_page_refcounted(page); | ||
| 216 | __free_pages(page, HUGETLB_PAGE_ORDER); | ||
| 217 | } | ||
| 218 | |||
| 219 | #ifdef CONFIG_HIGHMEM | 430 | #ifdef CONFIG_HIGHMEM |
| 220 | static void try_to_free_low(unsigned long count) | 431 | static void try_to_free_low(unsigned long count) |
| 221 | { | 432 | { |
| @@ -224,14 +435,14 @@ static void try_to_free_low(unsigned long count) | |||
| 224 | for (i = 0; i < MAX_NUMNODES; ++i) { | 435 | for (i = 0; i < MAX_NUMNODES; ++i) { |
| 225 | struct page *page, *next; | 436 | struct page *page, *next; |
| 226 | list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { | 437 | list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) { |
| 438 | if (count >= nr_huge_pages) | ||
| 439 | return; | ||
| 227 | if (PageHighMem(page)) | 440 | if (PageHighMem(page)) |
| 228 | continue; | 441 | continue; |
| 229 | list_del(&page->lru); | 442 | list_del(&page->lru); |
| 230 | update_and_free_page(page); | 443 | update_and_free_page(page); |
| 231 | free_huge_pages--; | 444 | free_huge_pages--; |
| 232 | free_huge_pages_node[page_to_nid(page)]--; | 445 | free_huge_pages_node[page_to_nid(page)]--; |
| 233 | if (count >= nr_huge_pages) | ||
| 234 | return; | ||
| 235 | } | 446 | } |
| 236 | } | 447 | } |
| 237 | } | 448 | } |
| @@ -241,26 +452,61 @@ static inline void try_to_free_low(unsigned long count) | |||
| 241 | } | 452 | } |
| 242 | #endif | 453 | #endif |
| 243 | 454 | ||
| 455 | #define persistent_huge_pages (nr_huge_pages - surplus_huge_pages) | ||
| 244 | static unsigned long set_max_huge_pages(unsigned long count) | 456 | static unsigned long set_max_huge_pages(unsigned long count) |
| 245 | { | 457 | { |
| 246 | while (count > nr_huge_pages) { | 458 | unsigned long min_count, ret; |
| 247 | if (!alloc_fresh_huge_page()) | ||
| 248 | return nr_huge_pages; | ||
| 249 | } | ||
| 250 | if (count >= nr_huge_pages) | ||
| 251 | return nr_huge_pages; | ||
| 252 | 459 | ||
| 460 | /* | ||
| 461 | * Increase the pool size | ||
| 462 | * First take pages out of surplus state. Then make up the | ||
| 463 | * remaining difference by allocating fresh huge pages. | ||
| 464 | */ | ||
| 253 | spin_lock(&hugetlb_lock); | 465 | spin_lock(&hugetlb_lock); |
| 254 | count = max(count, resv_huge_pages); | 466 | while (surplus_huge_pages && count > persistent_huge_pages) { |
| 255 | try_to_free_low(count); | 467 | if (!adjust_pool_surplus(-1)) |
| 256 | while (count < nr_huge_pages) { | 468 | break; |
| 469 | } | ||
| 470 | |||
| 471 | while (count > persistent_huge_pages) { | ||
| 472 | int ret; | ||
| 473 | /* | ||
| 474 | * If this allocation races such that we no longer need the | ||
| 475 | * page, free_huge_page will handle it by freeing the page | ||
| 476 | * and reducing the surplus. | ||
| 477 | */ | ||
| 478 | spin_unlock(&hugetlb_lock); | ||
| 479 | ret = alloc_fresh_huge_page(); | ||
| 480 | spin_lock(&hugetlb_lock); | ||
| 481 | if (!ret) | ||
| 482 | goto out; | ||
| 483 | |||
| 484 | } | ||
| 485 | |||
| 486 | /* | ||
| 487 | * Decrease the pool size | ||
| 488 | * First return free pages to the buddy allocator (being careful | ||
| 489 | * to keep enough around to satisfy reservations). Then place | ||
| 490 | * pages into surplus state as needed so the pool will shrink | ||
| 491 | * to the desired size as pages become free. | ||
| 492 | */ | ||
| 493 | min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; | ||
| 494 | min_count = max(count, min_count); | ||
| 495 | try_to_free_low(min_count); | ||
| 496 | while (min_count < persistent_huge_pages) { | ||
| 257 | struct page *page = dequeue_huge_page(NULL, 0); | 497 | struct page *page = dequeue_huge_page(NULL, 0); |
| 258 | if (!page) | 498 | if (!page) |
| 259 | break; | 499 | break; |
| 260 | update_and_free_page(page); | 500 | update_and_free_page(page); |
| 261 | } | 501 | } |
| 502 | while (count < persistent_huge_pages) { | ||
| 503 | if (!adjust_pool_surplus(1)) | ||
| 504 | break; | ||
| 505 | } | ||
| 506 | out: | ||
| 507 | ret = persistent_huge_pages; | ||
| 262 | spin_unlock(&hugetlb_lock); | 508 | spin_unlock(&hugetlb_lock); |
| 263 | return nr_huge_pages; | 509 | return ret; |
| 264 | } | 510 | } |
| 265 | 511 | ||
| 266 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 512 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, |
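The rewritten set_max_huge_pages() above grows the pool by first converting surplus pages into persistent ones and only then allocating fresh huge pages, and shrinks it by freeing unused pages down to what reservations require before marking the remainder surplus. The toy counter model below reproduces just that ordering; the variable names mirror the kernel counters but nothing here is kernel code, and locking and allocation failures are ignored.

/* Toy counter model of the resize ordering in set_max_huge_pages(). */
#include <stdio.h>

static long nr_huge, surplus, free_huge, resv;

#define persistent() (nr_huge - surplus)

static void set_max(long count)
{
	long min_count;

	/* Grow: absorb surplus pages first, then "allocate" fresh ones. */
	while (surplus && count > persistent())
		surplus--;			/* adjust_pool_surplus(-1) */
	while (count > persistent()) {
		nr_huge++;			/* alloc_fresh_huge_page() */
		free_huge++;
	}

	/* Shrink: free unused pages, but never below what reservations
	 * and in-use pages require; mark the rest surplus so they go
	 * away as they become free.
	 */
	min_count = resv + nr_huge - free_huge;
	if (min_count < count)
		min_count = count;
	while (min_count < persistent() && free_huge) {
		nr_huge--;			/* update_and_free_page() */
		free_huge--;
	}
	while (count < persistent())
		surplus++;			/* adjust_pool_surplus(1) */
}

int main(void)
{
	nr_huge = 10; free_huge = 4; resv = 2; surplus = 1;
	set_max(6);
	printf("nr=%ld free=%ld surplus=%ld persistent=%ld\n",
	       nr_huge, free_huge, surplus, persistent());
	return 0;
}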
| @@ -292,10 +538,12 @@ int hugetlb_report_meminfo(char *buf) | |||
| 292 | "HugePages_Total: %5lu\n" | 538 | "HugePages_Total: %5lu\n" |
| 293 | "HugePages_Free: %5lu\n" | 539 | "HugePages_Free: %5lu\n" |
| 294 | "HugePages_Rsvd: %5lu\n" | 540 | "HugePages_Rsvd: %5lu\n" |
| 541 | "HugePages_Surp: %5lu\n" | ||
| 295 | "Hugepagesize: %5lu kB\n", | 542 | "Hugepagesize: %5lu kB\n", |
| 296 | nr_huge_pages, | 543 | nr_huge_pages, |
| 297 | free_huge_pages, | 544 | free_huge_pages, |
| 298 | resv_huge_pages, | 545 | resv_huge_pages, |
| 546 | surplus_huge_pages, | ||
| 299 | HPAGE_SIZE/1024); | 547 | HPAGE_SIZE/1024); |
| 300 | } | 548 | } |
| 301 | 549 | ||
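The hunk above adds a HugePages_Surp line to /proc/meminfo. A small userspace reader like the one below will show it on kernels carrying this patch; older kernels simply will not print that line, so the program does not depend on it being present.

/* Print the hugetlb counters from /proc/meminfo, including the new
 * HugePages_Surp field when the kernel provides it.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f) {
		perror("/proc/meminfo");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "HugePages_", 10) ||
		    !strncmp(line, "Hugepagesize", 12))
			fputs(line, stdout);
	fclose(f);
	return 0;
}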
| @@ -355,7 +603,6 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, | |||
| 355 | entry = pte_mkwrite(pte_mkdirty(*ptep)); | 603 | entry = pte_mkwrite(pte_mkdirty(*ptep)); |
| 356 | if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { | 604 | if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { |
| 357 | update_mmu_cache(vma, address, entry); | 605 | update_mmu_cache(vma, address, entry); |
| 358 | lazy_mmu_prot_update(entry); | ||
| 359 | } | 606 | } |
| 360 | } | 607 | } |
| 361 | 608 | ||
| @@ -708,7 +955,6 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
| 708 | pte = huge_ptep_get_and_clear(mm, address, ptep); | 955 | pte = huge_ptep_get_and_clear(mm, address, ptep); |
| 709 | pte = pte_mkhuge(pte_modify(pte, newprot)); | 956 | pte = pte_mkhuge(pte_modify(pte, newprot)); |
| 710 | set_huge_pte_at(mm, address, ptep, pte); | 957 | set_huge_pte_at(mm, address, ptep, pte); |
| 711 | lazy_mmu_prot_update(pte); | ||
| 712 | } | 958 | } |
| 713 | } | 959 | } |
| 714 | spin_unlock(&mm->page_table_lock); | 960 | spin_unlock(&mm->page_table_lock); |
| @@ -843,21 +1089,6 @@ static int hugetlb_acct_memory(long delta) | |||
| 843 | int ret = -ENOMEM; | 1089 | int ret = -ENOMEM; |
| 844 | 1090 | ||
| 845 | spin_lock(&hugetlb_lock); | 1091 | spin_lock(&hugetlb_lock); |
| 846 | if ((delta + resv_huge_pages) <= free_huge_pages) { | ||
| 847 | resv_huge_pages += delta; | ||
| 848 | ret = 0; | ||
| 849 | } | ||
| 850 | spin_unlock(&hugetlb_lock); | ||
| 851 | return ret; | ||
| 852 | } | ||
| 853 | |||
| 854 | int hugetlb_reserve_pages(struct inode *inode, long from, long to) | ||
| 855 | { | ||
| 856 | long ret, chg; | ||
| 857 | |||
| 858 | chg = region_chg(&inode->i_mapping->private_list, from, to); | ||
| 859 | if (chg < 0) | ||
| 860 | return chg; | ||
| 861 | /* | 1092 | /* |
| 862 | * When cpuset is configured, it breaks the strict hugetlb page | 1093 | * When cpuset is configured, it breaks the strict hugetlb page |
| 863 | * reservation as the accounting is done on a global variable. Such | 1094 | * reservation as the accounting is done on a global variable. Such |
| @@ -875,8 +1106,31 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to) | |||
| 875 | * a best attempt and hopefully to minimize the impact of changing | 1106 | * a best attempt and hopefully to minimize the impact of changing |
| 876 | * semantics that cpuset has. | 1107 | * semantics that cpuset has. |
| 877 | */ | 1108 | */ |
| 878 | if (chg > cpuset_mems_nr(free_huge_pages_node)) | 1109 | if (delta > 0) { |
| 879 | return -ENOMEM; | 1110 | if (gather_surplus_pages(delta) < 0) |
| 1111 | goto out; | ||
| 1112 | |||
| 1113 | if (delta > cpuset_mems_nr(free_huge_pages_node)) | ||
| 1114 | goto out; | ||
| 1115 | } | ||
| 1116 | |||
| 1117 | ret = 0; | ||
| 1118 | resv_huge_pages += delta; | ||
| 1119 | if (delta < 0) | ||
| 1120 | return_unused_surplus_pages((unsigned long) -delta); | ||
| 1121 | |||
| 1122 | out: | ||
| 1123 | spin_unlock(&hugetlb_lock); | ||
| 1124 | return ret; | ||
| 1125 | } | ||
| 1126 | |||
| 1127 | int hugetlb_reserve_pages(struct inode *inode, long from, long to) | ||
| 1128 | { | ||
| 1129 | long ret, chg; | ||
| 1130 | |||
| 1131 | chg = region_chg(&inode->i_mapping->private_list, from, to); | ||
| 1132 | if (chg < 0) | ||
| 1133 | return chg; | ||
| 880 | 1134 | ||
| 881 | ret = hugetlb_acct_memory(chg); | 1135 | ret = hugetlb_acct_memory(chg); |
| 882 | if (ret < 0) | 1136 | if (ret < 0) |
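The reservation path reworked above (hugetlb_reserve_pages -> hugetlb_acct_memory, now backed by gather_surplus_pages) runs at mmap() time for shared mappings, not at first fault. The example below exercises it from userspace by mapping a file on hugetlbfs; the /mnt/huge mount point and the 2 MB page size are assumptions for illustration and will differ per system.

/* Reserve huge pages by mmap()ing a file on hugetlbfs. */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

#define LENGTH (2UL * 1024 * 1024)	/* one x86 huge page; adjust per arch */

int main(void)
{
	int fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	p = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");	/* e.g. reservation failed: pool too small */
		return 1;
	}
	p[0] = 1;			/* touch: instantiates the huge page */
	munmap(p, LENGTH);
	close(fd);
	unlink("/mnt/huge/example");
	return 0;
}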
diff --git a/mm/internal.h b/mm/internal.h index a3110c02aea7..953f941ea867 100644 --- a/mm/internal.h +++ b/mm/internal.h | |||
| @@ -37,4 +37,14 @@ static inline void __put_page(struct page *page) | |||
| 37 | extern void fastcall __init __free_pages_bootmem(struct page *page, | 37 | extern void fastcall __init __free_pages_bootmem(struct page *page, |
| 38 | unsigned int order); | 38 | unsigned int order); |
| 39 | 39 | ||
| 40 | /* | ||
| 41 | * function for dealing with page's order in buddy system. | ||
| 42 | * zone->lock is already acquired when we use these. | ||
| 43 | * So, we don't need atomic page->flags operations here. | ||
| 44 | */ | ||
| 45 | static inline unsigned long page_order(struct page *page) | ||
| 46 | { | ||
| 47 | VM_BUG_ON(!PageBuddy(page)); | ||
| 48 | return page_private(page); | ||
| 49 | } | ||
| 40 | #endif | 50 | #endif |
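page_order() reads the buddy order that the page allocator stores in page_private for free pages; under zone->lock, callers such as the isolation code added elsewhere in this series use it to jump over a whole free block at once. The sketch below is a rough standalone model of that scan using plain arrays in place of struct page; the names are illustrative only.

/* Model of scanning a pfn range where free buddy blocks record their
 * order: a free block of order k lets the scan jump 2^k pages ahead.
 */
#include <stdio.h>

#define NPAGES 32

struct fake_page {
	int is_buddy;	/* stands in for PageBuddy() */
	int order;	/* stands in for page_private() / page_order() */
};

static struct fake_page pages[NPAGES];

static unsigned long count_in_use(unsigned long start, unsigned long end)
{
	unsigned long pfn = start, in_use = 0;

	while (pfn < end) {
		if (pages[pfn].is_buddy) {
			pfn += 1UL << pages[pfn].order;	/* skip free block */
		} else {
			in_use++;
			pfn++;
		}
	}
	return in_use;
}

int main(void)
{
	/* Mark pfns 8..15 as one free order-3 block. */
	pages[8].is_buddy = 1;
	pages[8].order = 3;
	printf("pages in use: %lu\n", count_in_use(0, NPAGES));
	return 0;
}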
diff --git a/mm/memory.c b/mm/memory.c index f82b359b2745..bd16dcaeefb8 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -966,7 +966,7 @@ no_page_table: | |||
| 966 | * has touched so far, we don't want to allocate page tables. | 966 | * has touched so far, we don't want to allocate page tables. |
| 967 | */ | 967 | */ |
| 968 | if (flags & FOLL_ANON) { | 968 | if (flags & FOLL_ANON) { |
| 969 | page = ZERO_PAGE(address); | 969 | page = ZERO_PAGE(0); |
| 970 | if (flags & FOLL_GET) | 970 | if (flags & FOLL_GET) |
| 971 | get_page(page); | 971 | get_page(page); |
| 972 | BUG_ON(flags & FOLL_WRITE); | 972 | BUG_ON(flags & FOLL_WRITE); |
| @@ -1111,95 +1111,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, | |||
| 1111 | } | 1111 | } |
| 1112 | EXPORT_SYMBOL(get_user_pages); | 1112 | EXPORT_SYMBOL(get_user_pages); |
| 1113 | 1113 | ||
| 1114 | static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, | ||
| 1115 | unsigned long addr, unsigned long end, pgprot_t prot) | ||
| 1116 | { | ||
| 1117 | pte_t *pte; | ||
| 1118 | spinlock_t *ptl; | ||
| 1119 | int err = 0; | ||
| 1120 | |||
| 1121 | pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); | ||
| 1122 | if (!pte) | ||
| 1123 | return -EAGAIN; | ||
| 1124 | arch_enter_lazy_mmu_mode(); | ||
| 1125 | do { | ||
| 1126 | struct page *page = ZERO_PAGE(addr); | ||
| 1127 | pte_t zero_pte = pte_wrprotect(mk_pte(page, prot)); | ||
| 1128 | |||
| 1129 | if (unlikely(!pte_none(*pte))) { | ||
| 1130 | err = -EEXIST; | ||
| 1131 | pte++; | ||
| 1132 | break; | ||
| 1133 | } | ||
| 1134 | page_cache_get(page); | ||
| 1135 | page_add_file_rmap(page); | ||
| 1136 | inc_mm_counter(mm, file_rss); | ||
| 1137 | set_pte_at(mm, addr, pte, zero_pte); | ||
| 1138 | } while (pte++, addr += PAGE_SIZE, addr != end); | ||
| 1139 | arch_leave_lazy_mmu_mode(); | ||
| 1140 | pte_unmap_unlock(pte - 1, ptl); | ||
| 1141 | return err; | ||
| 1142 | } | ||
| 1143 | |||
| 1144 | static inline int zeromap_pmd_range(struct mm_struct *mm, pud_t *pud, | ||
| 1145 | unsigned long addr, unsigned long end, pgprot_t prot) | ||
| 1146 | { | ||
| 1147 | pmd_t *pmd; | ||
| 1148 | unsigned long next; | ||
| 1149 | int err; | ||
| 1150 | |||
| 1151 | pmd = pmd_alloc(mm, pud, addr); | ||
| 1152 | if (!pmd) | ||
| 1153 | return -EAGAIN; | ||
| 1154 | do { | ||
| 1155 | next = pmd_addr_end(addr, end); | ||
| 1156 | err = zeromap_pte_range(mm, pmd, addr, next, prot); | ||
| 1157 | if (err) | ||
| 1158 | break; | ||
| 1159 | } while (pmd++, addr = next, addr != end); | ||
| 1160 | return err; | ||
| 1161 | } | ||
| 1162 | |||
| 1163 | static inline int zeromap_pud_range(struct mm_struct *mm, pgd_t *pgd, | ||
| 1164 | unsigned long addr, unsigned long end, pgprot_t prot) | ||
| 1165 | { | ||
| 1166 | pud_t *pud; | ||
| 1167 | unsigned long next; | ||
| 1168 | int err; | ||
| 1169 | |||
| 1170 | pud = pud_alloc(mm, pgd, addr); | ||
| 1171 | if (!pud) | ||
| 1172 | return -EAGAIN; | ||
| 1173 | do { | ||
| 1174 | next = pud_addr_end(addr, end); | ||
| 1175 | err = zeromap_pmd_range(mm, pud, addr, next, prot); | ||
| 1176 | if (err) | ||
| 1177 | break; | ||
| 1178 | } while (pud++, addr = next, addr != end); | ||
| 1179 | return err; | ||
| 1180 | } | ||
| 1181 | |||
| 1182 | int zeromap_page_range(struct vm_area_struct *vma, | ||
| 1183 | unsigned long addr, unsigned long size, pgprot_t prot) | ||
| 1184 | { | ||
| 1185 | pgd_t *pgd; | ||
| 1186 | unsigned long next; | ||
| 1187 | unsigned long end = addr + size; | ||
| 1188 | struct mm_struct *mm = vma->vm_mm; | ||
| 1189 | int err; | ||
| 1190 | |||
| 1191 | BUG_ON(addr >= end); | ||
| 1192 | pgd = pgd_offset(mm, addr); | ||
| 1193 | flush_cache_range(vma, addr, end); | ||
| 1194 | do { | ||
| 1195 | next = pgd_addr_end(addr, end); | ||
| 1196 | err = zeromap_pud_range(mm, pgd, addr, next, prot); | ||
| 1197 | if (err) | ||
| 1198 | break; | ||
| 1199 | } while (pgd++, addr = next, addr != end); | ||
| 1200 | return err; | ||
| 1201 | } | ||
| 1202 | |||
| 1203 | pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) | 1114 | pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) |
| 1204 | { | 1115 | { |
| 1205 | pgd_t * pgd = pgd_offset(mm, addr); | 1116 | pgd_t * pgd = pgd_offset(mm, addr); |
| @@ -1700,10 +1611,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 1700 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 1611 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
| 1701 | entry = pte_mkyoung(orig_pte); | 1612 | entry = pte_mkyoung(orig_pte); |
| 1702 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1613 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 1703 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) { | 1614 | if (ptep_set_access_flags(vma, address, page_table, entry,1)) |
| 1704 | update_mmu_cache(vma, address, entry); | 1615 | update_mmu_cache(vma, address, entry); |
| 1705 | lazy_mmu_prot_update(entry); | ||
| 1706 | } | ||
| 1707 | ret |= VM_FAULT_WRITE; | 1616 | ret |= VM_FAULT_WRITE; |
| 1708 | goto unlock; | 1617 | goto unlock; |
| 1709 | } | 1618 | } |
| @@ -1717,16 +1626,11 @@ gotten: | |||
| 1717 | 1626 | ||
| 1718 | if (unlikely(anon_vma_prepare(vma))) | 1627 | if (unlikely(anon_vma_prepare(vma))) |
| 1719 | goto oom; | 1628 | goto oom; |
| 1720 | if (old_page == ZERO_PAGE(address)) { | 1629 | VM_BUG_ON(old_page == ZERO_PAGE(0)); |
| 1721 | new_page = alloc_zeroed_user_highpage_movable(vma, address); | 1630 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); |
| 1722 | if (!new_page) | 1631 | if (!new_page) |
| 1723 | goto oom; | 1632 | goto oom; |
| 1724 | } else { | 1633 | cow_user_page(new_page, old_page, address, vma); |
| 1725 | new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); | ||
| 1726 | if (!new_page) | ||
| 1727 | goto oom; | ||
| 1728 | cow_user_page(new_page, old_page, address, vma); | ||
| 1729 | } | ||
| 1730 | 1634 | ||
| 1731 | /* | 1635 | /* |
| 1732 | * Re-check the pte - we dropped the lock | 1636 | * Re-check the pte - we dropped the lock |
| @@ -1744,7 +1648,6 @@ gotten: | |||
| 1744 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 1648 | flush_cache_page(vma, address, pte_pfn(orig_pte)); |
| 1745 | entry = mk_pte(new_page, vma->vm_page_prot); | 1649 | entry = mk_pte(new_page, vma->vm_page_prot); |
| 1746 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1650 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 1747 | lazy_mmu_prot_update(entry); | ||
| 1748 | /* | 1651 | /* |
| 1749 | * Clear the pte entry and flush it first, before updating the | 1652 | * Clear the pte entry and flush it first, before updating the |
| 1750 | * pte with the new entry. This will avoid a race condition | 1653 | * pte with the new entry. This will avoid a race condition |
| @@ -2252,44 +2155,28 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2252 | spinlock_t *ptl; | 2155 | spinlock_t *ptl; |
| 2253 | pte_t entry; | 2156 | pte_t entry; |
| 2254 | 2157 | ||
| 2255 | if (write_access) { | 2158 | /* Allocate our own private page. */ |
| 2256 | /* Allocate our own private page. */ | 2159 | pte_unmap(page_table); |
| 2257 | pte_unmap(page_table); | ||
| 2258 | |||
| 2259 | if (unlikely(anon_vma_prepare(vma))) | ||
| 2260 | goto oom; | ||
| 2261 | page = alloc_zeroed_user_highpage_movable(vma, address); | ||
| 2262 | if (!page) | ||
| 2263 | goto oom; | ||
| 2264 | |||
| 2265 | entry = mk_pte(page, vma->vm_page_prot); | ||
| 2266 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
| 2267 | 2160 | ||
| 2268 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2161 | if (unlikely(anon_vma_prepare(vma))) |
| 2269 | if (!pte_none(*page_table)) | 2162 | goto oom; |
| 2270 | goto release; | 2163 | page = alloc_zeroed_user_highpage_movable(vma, address); |
| 2271 | inc_mm_counter(mm, anon_rss); | 2164 | if (!page) |
| 2272 | lru_cache_add_active(page); | 2165 | goto oom; |
| 2273 | page_add_new_anon_rmap(page, vma, address); | ||
| 2274 | } else { | ||
| 2275 | /* Map the ZERO_PAGE - vm_page_prot is readonly */ | ||
| 2276 | page = ZERO_PAGE(address); | ||
| 2277 | page_cache_get(page); | ||
| 2278 | entry = mk_pte(page, vma->vm_page_prot); | ||
| 2279 | 2166 | ||
| 2280 | ptl = pte_lockptr(mm, pmd); | 2167 | entry = mk_pte(page, vma->vm_page_prot); |
| 2281 | spin_lock(ptl); | 2168 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); |
| 2282 | if (!pte_none(*page_table)) | ||
| 2283 | goto release; | ||
| 2284 | inc_mm_counter(mm, file_rss); | ||
| 2285 | page_add_file_rmap(page); | ||
| 2286 | } | ||
| 2287 | 2169 | ||
| 2170 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
| 2171 | if (!pte_none(*page_table)) | ||
| 2172 | goto release; | ||
| 2173 | inc_mm_counter(mm, anon_rss); | ||
| 2174 | lru_cache_add_active(page); | ||
| 2175 | page_add_new_anon_rmap(page, vma, address); | ||
| 2288 | set_pte_at(mm, address, page_table, entry); | 2176 | set_pte_at(mm, address, page_table, entry); |
| 2289 | 2177 | ||
| 2290 | /* No need to invalidate - it was non-present before */ | 2178 | /* No need to invalidate - it was non-present before */ |
| 2291 | update_mmu_cache(vma, address, entry); | 2179 | update_mmu_cache(vma, address, entry); |
| 2292 | lazy_mmu_prot_update(entry); | ||
| 2293 | unlock: | 2180 | unlock: |
| 2294 | pte_unmap_unlock(page_table, ptl); | 2181 | pte_unmap_unlock(page_table, ptl); |
| 2295 | return 0; | 2182 | return 0; |
| @@ -2442,7 +2329,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2442 | 2329 | ||
| 2443 | /* no need to invalidate: a not-present page won't be cached */ | 2330 | /* no need to invalidate: a not-present page won't be cached */ |
| 2444 | update_mmu_cache(vma, address, entry); | 2331 | update_mmu_cache(vma, address, entry); |
| 2445 | lazy_mmu_prot_update(entry); | ||
| 2446 | } else { | 2332 | } else { |
| 2447 | if (anon) | 2333 | if (anon) |
| 2448 | page_cache_release(page); | 2334 | page_cache_release(page); |
| @@ -2470,7 +2356,7 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 2470 | int write_access, pte_t orig_pte) | 2356 | int write_access, pte_t orig_pte) |
| 2471 | { | 2357 | { |
| 2472 | pgoff_t pgoff = (((address & PAGE_MASK) | 2358 | pgoff_t pgoff = (((address & PAGE_MASK) |
| 2473 | - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff; | 2359 | - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; |
| 2474 | unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); | 2360 | unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0); |
| 2475 | 2361 | ||
| 2476 | pte_unmap(page_table); | 2362 | pte_unmap(page_table); |
| @@ -2614,7 +2500,6 @@ static inline int handle_pte_fault(struct mm_struct *mm, | |||
| 2614 | entry = pte_mkyoung(entry); | 2500 | entry = pte_mkyoung(entry); |
| 2615 | if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { | 2501 | if (ptep_set_access_flags(vma, address, pte, entry, write_access)) { |
| 2616 | update_mmu_cache(vma, address, entry); | 2502 | update_mmu_cache(vma, address, entry); |
| 2617 | lazy_mmu_prot_update(entry); | ||
| 2618 | } else { | 2503 | } else { |
| 2619 | /* | 2504 | /* |
| 2620 | * This is needed only for protection faults but the arch code | 2505 | * This is needed only for protection faults but the arch code |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index df9d554bea30..091b9c6c2529 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
| @@ -23,6 +23,9 @@ | |||
| 23 | #include <linux/vmalloc.h> | 23 | #include <linux/vmalloc.h> |
| 24 | #include <linux/ioport.h> | 24 | #include <linux/ioport.h> |
| 25 | #include <linux/cpuset.h> | 25 | #include <linux/cpuset.h> |
| 26 | #include <linux/delay.h> | ||
| 27 | #include <linux/migrate.h> | ||
| 28 | #include <linux/page-isolation.h> | ||
| 26 | 29 | ||
| 27 | #include <asm/tlbflush.h> | 30 | #include <asm/tlbflush.h> |
| 28 | 31 | ||
| @@ -161,14 +164,27 @@ static void grow_pgdat_span(struct pglist_data *pgdat, | |||
| 161 | pgdat->node_start_pfn; | 164 | pgdat->node_start_pfn; |
| 162 | } | 165 | } |
| 163 | 166 | ||
| 164 | int online_pages(unsigned long pfn, unsigned long nr_pages) | 167 | static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, |
| 168 | void *arg) | ||
| 165 | { | 169 | { |
| 166 | unsigned long i; | 170 | unsigned long i; |
| 171 | unsigned long onlined_pages = *(unsigned long *)arg; | ||
| 172 | struct page *page; | ||
| 173 | if (PageReserved(pfn_to_page(start_pfn))) | ||
| 174 | for (i = 0; i < nr_pages; i++) { | ||
| 175 | page = pfn_to_page(start_pfn + i); | ||
| 176 | online_page(page); | ||
| 177 | onlined_pages++; | ||
| 178 | } | ||
| 179 | *(unsigned long *)arg = onlined_pages; | ||
| 180 | return 0; | ||
| 181 | } | ||
| 182 | |||
| 183 | |||
| 184 | int online_pages(unsigned long pfn, unsigned long nr_pages) | ||
| 185 | { | ||
| 167 | unsigned long flags; | 186 | unsigned long flags; |
| 168 | unsigned long onlined_pages = 0; | 187 | unsigned long onlined_pages = 0; |
| 169 | struct resource res; | ||
| 170 | u64 section_end; | ||
| 171 | unsigned long start_pfn; | ||
| 172 | struct zone *zone; | 188 | struct zone *zone; |
| 173 | int need_zonelists_rebuild = 0; | 189 | int need_zonelists_rebuild = 0; |
| 174 | 190 | ||
| @@ -191,32 +207,16 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
| 191 | if (!populated_zone(zone)) | 207 | if (!populated_zone(zone)) |
| 192 | need_zonelists_rebuild = 1; | 208 | need_zonelists_rebuild = 1; |
| 193 | 209 | ||
| 194 | res.start = (u64)pfn << PAGE_SHIFT; | 210 | walk_memory_resource(pfn, nr_pages, &onlined_pages, |
| 195 | res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1; | 211 | online_pages_range); |
| 196 | res.flags = IORESOURCE_MEM; /* we just need system ram */ | ||
| 197 | section_end = res.end; | ||
| 198 | |||
| 199 | while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { | ||
| 200 | start_pfn = (unsigned long)(res.start >> PAGE_SHIFT); | ||
| 201 | nr_pages = (unsigned long) | ||
| 202 | ((res.end + 1 - res.start) >> PAGE_SHIFT); | ||
| 203 | |||
| 204 | if (PageReserved(pfn_to_page(start_pfn))) { | ||
| 205 | /* this region's page is not onlined now */ | ||
| 206 | for (i = 0; i < nr_pages; i++) { | ||
| 207 | struct page *page = pfn_to_page(start_pfn + i); | ||
| 208 | online_page(page); | ||
| 209 | onlined_pages++; | ||
| 210 | } | ||
| 211 | } | ||
| 212 | |||
| 213 | res.start = res.end + 1; | ||
| 214 | res.end = section_end; | ||
| 215 | } | ||
| 216 | zone->present_pages += onlined_pages; | 212 | zone->present_pages += onlined_pages; |
| 217 | zone->zone_pgdat->node_present_pages += onlined_pages; | 213 | zone->zone_pgdat->node_present_pages += onlined_pages; |
| 218 | 214 | ||
| 219 | setup_per_zone_pages_min(); | 215 | setup_per_zone_pages_min(); |
| 216 | if (onlined_pages) { | ||
| 217 | kswapd_run(zone_to_nid(zone)); | ||
| 218 | node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); | ||
| 219 | } | ||
| 220 | 220 | ||
| 221 | if (need_zonelists_rebuild) | 221 | if (need_zonelists_rebuild) |
| 222 | build_all_zonelists(); | 222 | build_all_zonelists(); |
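online_pages() now hands a callback and an opaque counter to walk_memory_resource() instead of open-coding the resource walk. The pattern — iterate disjoint ranges and let the callback accumulate into a value passed through a void pointer — is easy to model in plain C; the range table and names below are made up for illustration.

/* Model of the "walk ranges, accumulate through a void *arg" pattern
 * used by online_pages()/walk_memory_resource().
 */
#include <stdio.h>

struct range { unsigned long start, nr; };

typedef int (*range_cb)(unsigned long start, unsigned long nr, void *arg);

static int walk_ranges(const struct range *r, int n, void *arg, range_cb fn)
{
	int i, ret;

	for (i = 0; i < n; i++) {
		ret = fn(r[i].start, r[i].nr, arg);
		if (ret)
			return ret;	/* stop on first error */
	}
	return 0;
}

/* Counterpart of online_pages_range(): bump the counter per page. */
static int count_pages(unsigned long start, unsigned long nr, void *arg)
{
	*(unsigned long *)arg += nr;
	return 0;
}

int main(void)
{
	struct range ram[] = { { 0x1000, 256 }, { 0x2000, 512 } };
	unsigned long onlined = 0;

	walk_ranges(ram, 2, &onlined, count_pages);
	printf("onlined %lu pages\n", onlined);
	return 0;
}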
| @@ -271,9 +271,6 @@ int add_memory(int nid, u64 start, u64 size) | |||
| 271 | if (!pgdat) | 271 | if (!pgdat) |
| 272 | return -ENOMEM; | 272 | return -ENOMEM; |
| 273 | new_pgdat = 1; | 273 | new_pgdat = 1; |
| 274 | ret = kswapd_run(nid); | ||
| 275 | if (ret) | ||
| 276 | goto error; | ||
| 277 | } | 274 | } |
| 278 | 275 | ||
| 279 | /* call arch's memory hotadd */ | 276 | /* call arch's memory hotadd */ |
| @@ -308,3 +305,260 @@ error: | |||
| 308 | return ret; | 305 | return ret; |
| 309 | } | 306 | } |
| 310 | EXPORT_SYMBOL_GPL(add_memory); | 307 | EXPORT_SYMBOL_GPL(add_memory); |
| 308 | |||
| 309 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
| 310 | /* | ||
| 311 | * Confirm that all pages in the range [start, end) belong to the same zone. | ||
| 312 | */ | ||
| 313 | static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) | ||
| 314 | { | ||
| 315 | unsigned long pfn; | ||
| 316 | struct zone *zone = NULL; | ||
| 317 | struct page *page; | ||
| 318 | int i; | ||
| 319 | for (pfn = start_pfn; | ||
| 320 | pfn < end_pfn; | ||
| 321 | pfn += MAX_ORDER_NR_PAGES) { | ||
| 322 | i = 0; | ||
| 323 | /* This is just a CONFIG_HOLES_IN_ZONE check.*/ | ||
| 324 | while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i)) | ||
| 325 | i++; | ||
| 326 | if (i == MAX_ORDER_NR_PAGES) | ||
| 327 | continue; | ||
| 328 | page = pfn_to_page(pfn + i); | ||
| 329 | if (zone && page_zone(page) != zone) | ||
| 330 | return 0; | ||
| 331 | zone = page_zone(page); | ||
| 332 | } | ||
| 333 | return 1; | ||
| 334 | } | ||
| 335 | |||
| 336 | /* | ||
| 337 | * Scanning pfn is much easier than scanning lru list. | ||
| 338 | * Scan pfns from start to end and find the first LRU page. | ||
| 339 | */ | ||
| 340 | int scan_lru_pages(unsigned long start, unsigned long end) | ||
| 341 | { | ||
| 342 | unsigned long pfn; | ||
| 343 | struct page *page; | ||
| 344 | for (pfn = start; pfn < end; pfn++) { | ||
| 345 | if (pfn_valid(pfn)) { | ||
| 346 | page = pfn_to_page(pfn); | ||
| 347 | if (PageLRU(page)) | ||
| 348 | return pfn; | ||
| 349 | } | ||
| 350 | } | ||
| 351 | return 0; | ||
| 352 | } | ||
| 353 | |||
| 354 | static struct page * | ||
| 355 | hotremove_migrate_alloc(struct page *page, | ||
| 356 | unsigned long private, | ||
| 357 | int **x) | ||
| 358 | { | ||
| 359 | /* This should be improoooooved!! */ | ||
| 360 | return alloc_page(GFP_HIGHUSER_PAGECACHE); | ||
| 361 | } | ||
| 362 | |||
| 363 | |||
| 364 | #define NR_OFFLINE_AT_ONCE_PAGES (256) | ||
| 365 | static int | ||
| 366 | do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) | ||
| 367 | { | ||
| 368 | unsigned long pfn; | ||
| 369 | struct page *page; | ||
| 370 | int move_pages = NR_OFFLINE_AT_ONCE_PAGES; | ||
| 371 | int not_managed = 0; | ||
| 372 | int ret = 0; | ||
| 373 | LIST_HEAD(source); | ||
| 374 | |||
| 375 | for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) { | ||
| 376 | if (!pfn_valid(pfn)) | ||
| 377 | continue; | ||
| 378 | page = pfn_to_page(pfn); | ||
| 379 | if (!page_count(page)) | ||
| 380 | continue; | ||
| 381 | /* | ||
| 382 | * We can skip free pages. And we can only deal with pages on | ||
| 383 | * LRU. | ||
| 384 | */ | ||
| 385 | ret = isolate_lru_page(page, &source); | ||
| 386 | if (!ret) { /* Success */ | ||
| 387 | move_pages--; | ||
| 388 | } else { | ||
| 389 | /* Because we don't hold the big zone->lock, we should | ||
| 390 | check this again here. */ | ||
| 391 | if (page_count(page)) | ||
| 392 | not_managed++; | ||
| 393 | #ifdef CONFIG_DEBUG_VM | ||
| 394 | printk(KERN_INFO "removing from LRU failed" | ||
| 395 | " %lx/%d/%lx\n", | ||
| 396 | pfn, page_count(page), page->flags); | ||
| 397 | #endif | ||
| 398 | } | ||
| 399 | } | ||
| 400 | ret = -EBUSY; | ||
| 401 | if (not_managed) { | ||
| 402 | if (!list_empty(&source)) | ||
| 403 | putback_lru_pages(&source); | ||
| 404 | goto out; | ||
| 405 | } | ||
| 406 | ret = 0; | ||
| 407 | if (list_empty(&source)) | ||
| 408 | goto out; | ||
| 409 | /* this function returns # of failed pages */ | ||
| 410 | ret = migrate_pages(&source, hotremove_migrate_alloc, 0); | ||
| 411 | |||
| 412 | out: | ||
| 413 | return ret; | ||
| 414 | } | ||
| 415 | |||
| 416 | /* | ||
| 417 | * remove from free_area[] and mark all as Reserved. | ||
| 418 | */ | ||
| 419 | static int | ||
| 420 | offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, | ||
| 421 | void *data) | ||
| 422 | { | ||
| 423 | __offline_isolated_pages(start, start + nr_pages); | ||
| 424 | return 0; | ||
| 425 | } | ||
| 426 | |||
| 427 | static void | ||
| 428 | offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | ||
| 429 | { | ||
| 430 | walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL, | ||
| 431 | offline_isolated_pages_cb); | ||
| 432 | } | ||
| 433 | |||
| 434 | /* | ||
| 435 | * Check that all pages in the range, recorded as a memory resource, are isolated. | ||
| 436 | */ | ||
| 437 | static int | ||
| 438 | check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, | ||
| 439 | void *data) | ||
| 440 | { | ||
| 441 | int ret; | ||
| 442 | long offlined = *(long *)data; | ||
| 443 | ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); | ||
| 444 | offlined = nr_pages; | ||
| 445 | if (!ret) | ||
| 446 | *(long *)data += offlined; | ||
| 447 | return ret; | ||
| 448 | } | ||
| 449 | |||
| 450 | static long | ||
| 451 | check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | ||
| 452 | { | ||
| 453 | long offlined = 0; | ||
| 454 | int ret; | ||
| 455 | |||
| 456 | ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined, | ||
| 457 | check_pages_isolated_cb); | ||
| 458 | if (ret < 0) | ||
| 459 | offlined = (long)ret; | ||
| 460 | return offlined; | ||
| 461 | } | ||
| 462 | |||
| 463 | extern void drain_all_local_pages(void); | ||
| 464 | |||
| 465 | int offline_pages(unsigned long start_pfn, | ||
| 466 | unsigned long end_pfn, unsigned long timeout) | ||
| 467 | { | ||
| 468 | unsigned long pfn, nr_pages, expire; | ||
| 469 | long offlined_pages; | ||
| 470 | int ret, drain, retry_max; | ||
| 471 | struct zone *zone; | ||
| 472 | |||
| 473 | BUG_ON(start_pfn >= end_pfn); | ||
| 474 | /* at least, alignment against pageblock is necessary */ | ||
| 475 | if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) | ||
| 476 | return -EINVAL; | ||
| 477 | if (!IS_ALIGNED(end_pfn, pageblock_nr_pages)) | ||
| 478 | return -EINVAL; | ||
| 479 | /* This makes hotplug much easier... and readable. | ||
| 480 | We assume this for now. */ | ||
| 481 | if (!test_pages_in_a_zone(start_pfn, end_pfn)) | ||
| 482 | return -EINVAL; | ||
| 483 | /* set above range as isolated */ | ||
| 484 | ret = start_isolate_page_range(start_pfn, end_pfn); | ||
| 485 | if (ret) | ||
| 486 | return ret; | ||
| 487 | nr_pages = end_pfn - start_pfn; | ||
| 488 | pfn = start_pfn; | ||
| 489 | expire = jiffies + timeout; | ||
| 490 | drain = 0; | ||
| 491 | retry_max = 5; | ||
| 492 | repeat: | ||
| 493 | /* start memory hot removal */ | ||
| 494 | ret = -EAGAIN; | ||
| 495 | if (time_after(jiffies, expire)) | ||
| 496 | goto failed_removal; | ||
| 497 | ret = -EINTR; | ||
| 498 | if (signal_pending(current)) | ||
| 499 | goto failed_removal; | ||
| 500 | ret = 0; | ||
| 501 | if (drain) { | ||
| 502 | lru_add_drain_all(); | ||
| 503 | flush_scheduled_work(); | ||
| 504 | cond_resched(); | ||
| 505 | drain_all_local_pages(); | ||
| 506 | } | ||
| 507 | |||
| 508 | pfn = scan_lru_pages(start_pfn, end_pfn); | ||
| 509 | if (pfn) { /* We have page on LRU */ | ||
| 510 | ret = do_migrate_range(pfn, end_pfn); | ||
| 511 | if (!ret) { | ||
| 512 | drain = 1; | ||
| 513 | goto repeat; | ||
| 514 | } else { | ||
| 515 | if (ret < 0) | ||
| 516 | if (--retry_max == 0) | ||
| 517 | goto failed_removal; | ||
| 518 | yield(); | ||
| 519 | drain = 1; | ||
| 520 | goto repeat; | ||
| 521 | } | ||
| 522 | } | ||
| 523 | /* drain each zone's lru pagevecs; this is asynchronous... */ | ||
| 524 | lru_add_drain_all(); | ||
| 525 | flush_scheduled_work(); | ||
| 526 | yield(); | ||
| 527 | /* drain pcp pages; this is synchronous. */ | ||
| 528 | drain_all_local_pages(); | ||
| 529 | /* check again */ | ||
| 530 | offlined_pages = check_pages_isolated(start_pfn, end_pfn); | ||
| 531 | if (offlined_pages < 0) { | ||
| 532 | ret = -EBUSY; | ||
| 533 | goto failed_removal; | ||
| 534 | } | ||
| 535 | printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages); | ||
| 536 | /* Ok, all of our target range is isolated. | ||
| 537 | We cannot roll back at this point. */ | ||
| 538 | offline_isolated_pages(start_pfn, end_pfn); | ||
| 539 | /* reset pagetype flags */ | ||
| 540 | start_isolate_page_range(start_pfn, end_pfn); | ||
| 541 | /* removal success */ | ||
| 542 | zone = page_zone(pfn_to_page(start_pfn)); | ||
| 543 | zone->present_pages -= offlined_pages; | ||
| 544 | zone->zone_pgdat->node_present_pages -= offlined_pages; | ||
| 545 | totalram_pages -= offlined_pages; | ||
| 546 | num_physpages -= offlined_pages; | ||
| 547 | vm_total_pages = nr_free_pagecache_pages(); | ||
| 548 | writeback_set_ratelimit(); | ||
| 549 | return 0; | ||
| 550 | |||
| 551 | failed_removal: | ||
| 552 | printk(KERN_INFO "memory offlining %lx to %lx failed\n", | ||
| 553 | start_pfn, end_pfn); | ||
| 554 | /* pushback to free area */ | ||
| 555 | undo_isolate_page_range(start_pfn, end_pfn); | ||
| 556 | return ret; | ||
| 557 | } | ||
| 558 | #else | ||
| 559 | int remove_memory(u64 start, u64 size) | ||
| 560 | { | ||
| 561 | return -EINVAL; | ||
| 562 | } | ||
| 563 | EXPORT_SYMBOL_GPL(remove_memory); | ||
| 564 | #endif /* CONFIG_MEMORY_HOTREMOVE */ | ||
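offline_pages() above isolates the range and then loops: migrate whatever is still on the LRU, drain per-cpu and pagevec caches, and re-check, bailing out on timeout, pending signal, or repeated hard failures. The control-flow model below captures only that retry structure; migrate_some() and the counters are stand-ins, not real APIs, and the jiffies-based timeout is approximated with wall-clock time.

/* Control-flow model of the offline_pages() retry loop. */
#include <stdio.h>
#include <time.h>

static int busy_pages = 1000;

/* Pretend to migrate a batch; a negative return would mean failure. */
static int migrate_some(void)
{
	int batch = busy_pages < 256 ? busy_pages : 256;

	busy_pages -= batch;
	return 0;
}

static int offline(int timeout_secs)
{
	time_t expire = time(NULL) + timeout_secs;
	int retry_max = 5;

	while (busy_pages) {
		if (time(NULL) > expire)
			return -1;		/* -EAGAIN in the kernel */
		if (migrate_some() < 0 && --retry_max == 0)
			return -2;		/* too many hard failures */
		/* kernel: lru_add_drain_all(), drain_all_local_pages() */
	}
	return 0;				/* range is empty: offline it */
}

int main(void)
{
	printf("offline %s\n", offline(10) == 0 ? "succeeded" : "failed");
	return 0;
}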
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3d6ac9505d07..568152ae6caf 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
| @@ -72,7 +72,6 @@ | |||
| 72 | #include <linux/hugetlb.h> | 72 | #include <linux/hugetlb.h> |
| 73 | #include <linux/kernel.h> | 73 | #include <linux/kernel.h> |
| 74 | #include <linux/sched.h> | 74 | #include <linux/sched.h> |
| 75 | #include <linux/mm.h> | ||
| 76 | #include <linux/nodemask.h> | 75 | #include <linux/nodemask.h> |
| 77 | #include <linux/cpuset.h> | 76 | #include <linux/cpuset.h> |
| 78 | #include <linux/gfp.h> | 77 | #include <linux/gfp.h> |
| @@ -82,13 +81,13 @@ | |||
| 82 | #include <linux/interrupt.h> | 81 | #include <linux/interrupt.h> |
| 83 | #include <linux/init.h> | 82 | #include <linux/init.h> |
| 84 | #include <linux/compat.h> | 83 | #include <linux/compat.h> |
| 85 | #include <linux/mempolicy.h> | ||
| 86 | #include <linux/swap.h> | 84 | #include <linux/swap.h> |
| 87 | #include <linux/seq_file.h> | 85 | #include <linux/seq_file.h> |
| 88 | #include <linux/proc_fs.h> | 86 | #include <linux/proc_fs.h> |
| 89 | #include <linux/migrate.h> | 87 | #include <linux/migrate.h> |
| 90 | #include <linux/rmap.h> | 88 | #include <linux/rmap.h> |
| 91 | #include <linux/security.h> | 89 | #include <linux/security.h> |
| 90 | #include <linux/syscalls.h> | ||
| 92 | 91 | ||
| 93 | #include <asm/tlbflush.h> | 92 | #include <asm/tlbflush.h> |
| 94 | #include <asm/uaccess.h> | 93 | #include <asm/uaccess.h> |
| @@ -110,6 +109,9 @@ struct mempolicy default_policy = { | |||
| 110 | .policy = MPOL_DEFAULT, | 109 | .policy = MPOL_DEFAULT, |
| 111 | }; | 110 | }; |
| 112 | 111 | ||
| 112 | static void mpol_rebind_policy(struct mempolicy *pol, | ||
| 113 | const nodemask_t *newmask); | ||
| 114 | |||
| 113 | /* Do sanity checking on a policy */ | 115 | /* Do sanity checking on a policy */ |
| 114 | static int mpol_check_policy(int mode, nodemask_t *nodes) | 116 | static int mpol_check_policy(int mode, nodemask_t *nodes) |
| 115 | { | 117 | { |
| @@ -128,7 +130,7 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) | |||
| 128 | return -EINVAL; | 130 | return -EINVAL; |
| 129 | break; | 131 | break; |
| 130 | } | 132 | } |
| 131 | return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; | 133 | return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL; |
| 132 | } | 134 | } |
| 133 | 135 | ||
| 134 | /* Generate a custom zonelist for the BIND policy. */ | 136 | /* Generate a custom zonelist for the BIND policy. */ |
| @@ -185,7 +187,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) | |||
| 185 | switch (mode) { | 187 | switch (mode) { |
| 186 | case MPOL_INTERLEAVE: | 188 | case MPOL_INTERLEAVE: |
| 187 | policy->v.nodes = *nodes; | 189 | policy->v.nodes = *nodes; |
| 188 | if (nodes_weight(*nodes) == 0) { | 190 | nodes_and(policy->v.nodes, policy->v.nodes, |
| 191 | node_states[N_HIGH_MEMORY]); | ||
| 192 | if (nodes_weight(policy->v.nodes) == 0) { | ||
| 189 | kmem_cache_free(policy_cache, policy); | 193 | kmem_cache_free(policy_cache, policy); |
| 190 | return ERR_PTR(-EINVAL); | 194 | return ERR_PTR(-EINVAL); |
| 191 | } | 195 | } |
| @@ -459,7 +463,7 @@ static void mpol_set_task_struct_flag(void) | |||
| 459 | } | 463 | } |
| 460 | 464 | ||
| 461 | /* Set the process memory policy */ | 465 | /* Set the process memory policy */ |
| 462 | long do_set_mempolicy(int mode, nodemask_t *nodes) | 466 | static long do_set_mempolicy(int mode, nodemask_t *nodes) |
| 463 | { | 467 | { |
| 464 | struct mempolicy *new; | 468 | struct mempolicy *new; |
| 465 | 469 | ||
| @@ -494,9 +498,9 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) | |||
| 494 | *nodes = p->v.nodes; | 498 | *nodes = p->v.nodes; |
| 495 | break; | 499 | break; |
| 496 | case MPOL_PREFERRED: | 500 | case MPOL_PREFERRED: |
| 497 | /* or use current node instead of online map? */ | 501 | /* or use current node instead of memory_map? */ |
| 498 | if (p->v.preferred_node < 0) | 502 | if (p->v.preferred_node < 0) |
| 499 | *nodes = node_online_map; | 503 | *nodes = node_states[N_HIGH_MEMORY]; |
| 500 | else | 504 | else |
| 501 | node_set(p->v.preferred_node, *nodes); | 505 | node_set(p->v.preferred_node, *nodes); |
| 502 | break; | 506 | break; |
| @@ -519,8 +523,8 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) | |||
| 519 | } | 523 | } |
| 520 | 524 | ||
| 521 | /* Retrieve NUMA policy */ | 525 | /* Retrieve NUMA policy */ |
| 522 | long do_get_mempolicy(int *policy, nodemask_t *nmask, | 526 | static long do_get_mempolicy(int *policy, nodemask_t *nmask, |
| 523 | unsigned long addr, unsigned long flags) | 527 | unsigned long addr, unsigned long flags) |
| 524 | { | 528 | { |
| 525 | int err; | 529 | int err; |
| 526 | struct mm_struct *mm = current->mm; | 530 | struct mm_struct *mm = current->mm; |
| @@ -528,8 +532,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
| 528 | struct mempolicy *pol = current->mempolicy; | 532 | struct mempolicy *pol = current->mempolicy; |
| 529 | 533 | ||
| 530 | cpuset_update_task_memory_state(); | 534 | cpuset_update_task_memory_state(); |
| 531 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) | 535 | if (flags & |
| 536 | ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) | ||
| 532 | return -EINVAL; | 537 | return -EINVAL; |
| 538 | |||
| 539 | if (flags & MPOL_F_MEMS_ALLOWED) { | ||
| 540 | if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) | ||
| 541 | return -EINVAL; | ||
| 542 | *policy = 0; /* just so it's initialized */ | ||
| 543 | *nmask = cpuset_current_mems_allowed; | ||
| 544 | return 0; | ||
| 545 | } | ||
| 546 | |||
| 533 | if (flags & MPOL_F_ADDR) { | 547 | if (flags & MPOL_F_ADDR) { |
| 534 | down_read(&mm->mmap_sem); | 548 | down_read(&mm->mmap_sem); |
| 535 | vma = find_vma_intersection(mm, addr, addr+1); | 549 | vma = find_vma_intersection(mm, addr, addr+1); |
| @@ -601,7 +615,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x | |||
| 601 | * Migrate pages from one node to a target node. | 615 | * Migrate pages from one node to a target node. |
| 602 | * Returns error or the number of pages not migrated. | 616 | * Returns error or the number of pages not migrated. |
| 603 | */ | 617 | */ |
| 604 | int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) | 618 | static int migrate_to_node(struct mm_struct *mm, int source, int dest, |
| 619 | int flags) | ||
| 605 | { | 620 | { |
| 606 | nodemask_t nmask; | 621 | nodemask_t nmask; |
| 607 | LIST_HEAD(pagelist); | 622 | LIST_HEAD(pagelist); |
| @@ -732,8 +747,9 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * | |||
| 732 | } | 747 | } |
| 733 | #endif | 748 | #endif |
| 734 | 749 | ||
| 735 | long do_mbind(unsigned long start, unsigned long len, | 750 | static long do_mbind(unsigned long start, unsigned long len, |
| 736 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | 751 | unsigned long mode, nodemask_t *nmask, |
| 752 | unsigned long flags) | ||
| 737 | { | 753 | { |
| 738 | struct vm_area_struct *vma; | 754 | struct vm_area_struct *vma; |
| 739 | struct mm_struct *mm = current->mm; | 755 | struct mm_struct *mm = current->mm; |
| @@ -955,7 +971,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, | |||
| 955 | goto out; | 971 | goto out; |
| 956 | } | 972 | } |
| 957 | 973 | ||
| 958 | if (!nodes_subset(new, node_online_map)) { | 974 | if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) { |
| 959 | err = -EINVAL; | 975 | err = -EINVAL; |
| 960 | goto out; | 976 | goto out; |
| 961 | } | 977 | } |
| @@ -978,7 +994,8 @@ asmlinkage long sys_get_mempolicy(int __user *policy, | |||
| 978 | unsigned long maxnode, | 994 | unsigned long maxnode, |
| 979 | unsigned long addr, unsigned long flags) | 995 | unsigned long addr, unsigned long flags) |
| 980 | { | 996 | { |
| 981 | int err, pval; | 997 | int err; |
| 998 | int uninitialized_var(pval); | ||
| 982 | nodemask_t nodes; | 999 | nodemask_t nodes; |
| 983 | 1000 | ||
| 984 | if (nmask != NULL && maxnode < MAX_NUMNODES) | 1001 | if (nmask != NULL && maxnode < MAX_NUMNODES) |
| @@ -1527,8 +1544,8 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n) | |||
| 1527 | kmem_cache_free(sn_cache, n); | 1544 | kmem_cache_free(sn_cache, n); |
| 1528 | } | 1545 | } |
| 1529 | 1546 | ||
| 1530 | struct sp_node * | 1547 | static struct sp_node *sp_alloc(unsigned long start, unsigned long end, |
| 1531 | sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) | 1548 | struct mempolicy *pol) |
| 1532 | { | 1549 | { |
| 1533 | struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); | 1550 | struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); |
| 1534 | 1551 | ||
| @@ -1677,7 +1694,7 @@ void __init numa_policy_init(void) | |||
| 1677 | * fall back to the largest node if they're all smaller. | 1694 | * fall back to the largest node if they're all smaller. |
| 1678 | */ | 1695 | */ |
| 1679 | nodes_clear(interleave_nodes); | 1696 | nodes_clear(interleave_nodes); |
| 1680 | for_each_online_node(nid) { | 1697 | for_each_node_state(nid, N_HIGH_MEMORY) { |
| 1681 | unsigned long total_pages = node_present_pages(nid); | 1698 | unsigned long total_pages = node_present_pages(nid); |
| 1682 | 1699 | ||
| 1683 | /* Preserve the largest node */ | 1700 | /* Preserve the largest node */ |
| @@ -1706,7 +1723,8 @@ void numa_default_policy(void) | |||
| 1706 | } | 1723 | } |
| 1707 | 1724 | ||
| 1708 | /* Migrate a policy to a different set of nodes */ | 1725 | /* Migrate a policy to a different set of nodes */ |
| 1709 | void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) | 1726 | static void mpol_rebind_policy(struct mempolicy *pol, |
| 1727 | const nodemask_t *newmask) | ||
| 1710 | { | 1728 | { |
| 1711 | nodemask_t *mpolmask; | 1729 | nodemask_t *mpolmask; |
| 1712 | nodemask_t tmp; | 1730 | nodemask_t tmp; |
| @@ -1963,7 +1981,7 @@ int show_numa_map(struct seq_file *m, void *v) | |||
| 1963 | seq_printf(m, " huge"); | 1981 | seq_printf(m, " huge"); |
| 1964 | } else { | 1982 | } else { |
| 1965 | check_pgd_range(vma, vma->vm_start, vma->vm_end, | 1983 | check_pgd_range(vma, vma->vm_start, vma->vm_end, |
| 1966 | &node_online_map, MPOL_MF_STATS, md); | 1984 | &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md); |
| 1967 | } | 1985 | } |
| 1968 | 1986 | ||
| 1969 | if (!md->pages) | 1987 | if (!md->pages) |
| @@ -1990,7 +2008,7 @@ int show_numa_map(struct seq_file *m, void *v) | |||
| 1990 | if (md->writeback) | 2008 | if (md->writeback) |
| 1991 | seq_printf(m," writeback=%lu", md->writeback); | 2009 | seq_printf(m," writeback=%lu", md->writeback); |
| 1992 | 2010 | ||
| 1993 | for_each_online_node(n) | 2011 | for_each_node_state(n, N_HIGH_MEMORY) |
| 1994 | if (md->node[n]) | 2012 | if (md->node[n]) |
| 1995 | seq_printf(m, " N%d=%lu", n, md->node[n]); | 2013 | seq_printf(m, " N%d=%lu", n, md->node[n]); |
| 1996 | out: | 2014 | out: |
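
Note on the mempolicy.c hunks above: every check that previously consulted node_online_map now consults node_states[N_HIGH_MEMORY], and mpol_new() additionally clips an MPOL_INTERLEAVE mask to that state, so memoryless nodes can no longer end up in a policy's node set. A standalone sketch (plain bitmasks rather than the kernel's nodemask_t, written only to illustrate the clipping step added to mpol_new()):

    #include <stdio.h>

    int main(void)
    {
            unsigned long requested  = 0x0f; /* nodes 0-3 requested        */
            unsigned long has_memory = 0x05; /* nodes 0 and 2 have memory  */
            unsigned long effective  = requested & has_memory;

            if (!effective)
                    puts("rejected with -EINVAL: no node in the mask has memory");
            else
                    printf("interleave over mask 0x%lx\n", effective);
            return 0;
    }
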
diff --git a/mm/migrate.c b/mm/migrate.c index 07f22d4a431f..06d0877a66ef 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -171,6 +171,7 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
| 171 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 171 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
| 172 | if (is_write_migration_entry(entry)) | 172 | if (is_write_migration_entry(entry)) |
| 173 | pte = pte_mkwrite(pte); | 173 | pte = pte_mkwrite(pte); |
| 174 | flush_cache_page(vma, addr, pte_pfn(pte)); | ||
| 174 | set_pte_at(mm, addr, ptep, pte); | 175 | set_pte_at(mm, addr, ptep, pte); |
| 175 | 176 | ||
| 176 | if (PageAnon(new)) | 177 | if (PageAnon(new)) |
| @@ -180,7 +181,6 @@ static void remove_migration_pte(struct vm_area_struct *vma, | |||
| 180 | 181 | ||
| 181 | /* No need to invalidate - it was non-present before */ | 182 | /* No need to invalidate - it was non-present before */ |
| 182 | update_mmu_cache(vma, addr, pte); | 183 | update_mmu_cache(vma, addr, pte); |
| 183 | lazy_mmu_prot_update(pte); | ||
| 184 | 184 | ||
| 185 | out: | 185 | out: |
| 186 | pte_unmap_unlock(ptep, ptl); | 186 | pte_unmap_unlock(ptep, ptl); |
| @@ -986,7 +986,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, | |||
| 986 | goto out; | 986 | goto out; |
| 987 | 987 | ||
| 988 | err = -ENODEV; | 988 | err = -ENODEV; |
| 989 | if (!node_online(node)) | 989 | if (!node_state(node, N_HIGH_MEMORY)) |
| 990 | goto out; | 990 | goto out; |
| 991 | 991 | ||
| 992 | err = -EACCES; | 992 | err = -EACCES; |
diff --git a/mm/mprotect.c b/mm/mprotect.c index e8346c30abec..1d4d69790e59 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
| @@ -53,7 +53,6 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
| 53 | if (dirty_accountable && pte_dirty(ptent)) | 53 | if (dirty_accountable && pte_dirty(ptent)) |
| 54 | ptent = pte_mkwrite(ptent); | 54 | ptent = pte_mkwrite(ptent); |
| 55 | set_pte_at(mm, addr, pte, ptent); | 55 | set_pte_at(mm, addr, pte, ptent); |
| 56 | lazy_mmu_prot_update(ptent); | ||
| 57 | #ifdef CONFIG_MIGRATION | 56 | #ifdef CONFIG_MIGRATION |
| 58 | } else if (!pte_file(oldpte)) { | 57 | } else if (!pte_file(oldpte)) { |
| 59 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 58 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f9b82ad5047f..41b4e362221d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -177,14 +177,7 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) | |||
| 177 | { | 177 | { |
| 178 | #ifdef CONFIG_NUMA | 178 | #ifdef CONFIG_NUMA |
| 179 | struct zone **z; | 179 | struct zone **z; |
| 180 | nodemask_t nodes; | 180 | nodemask_t nodes = node_states[N_HIGH_MEMORY]; |
| 181 | int node; | ||
| 182 | |||
| 183 | nodes_clear(nodes); | ||
| 184 | /* node has memory ? */ | ||
| 185 | for_each_online_node(node) | ||
| 186 | if (NODE_DATA(node)->node_present_pages) | ||
| 187 | node_set(node, nodes); | ||
| 188 | 181 | ||
| 189 | for (z = zonelist->zones; *z; z++) | 182 | for (z = zonelist->zones; *z; z++) |
| 190 | if (cpuset_zone_allowed_softwall(*z, gfp_mask)) | 183 | if (cpuset_zone_allowed_softwall(*z, gfp_mask)) |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 44720363374c..d821321326e3 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -126,7 +126,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) | |||
| 126 | int node; | 126 | int node; |
| 127 | unsigned long x = 0; | 127 | unsigned long x = 0; |
| 128 | 128 | ||
| 129 | for_each_online_node(node) { | 129 | for_each_node_state(node, N_HIGH_MEMORY) { |
| 130 | struct zone *z = | 130 | struct zone *z = |
| 131 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; | 131 | &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; |
| 132 | 132 | ||
| @@ -1022,17 +1022,15 @@ int test_set_page_writeback(struct page *page) | |||
| 1022 | EXPORT_SYMBOL(test_set_page_writeback); | 1022 | EXPORT_SYMBOL(test_set_page_writeback); |
| 1023 | 1023 | ||
| 1024 | /* | 1024 | /* |
| 1025 | * Return true if any of the pages in the mapping are marged with the | 1025 | * Return true if any of the pages in the mapping are marked with the |
| 1026 | * passed tag. | 1026 | * passed tag. |
| 1027 | */ | 1027 | */ |
| 1028 | int mapping_tagged(struct address_space *mapping, int tag) | 1028 | int mapping_tagged(struct address_space *mapping, int tag) |
| 1029 | { | 1029 | { |
| 1030 | unsigned long flags; | ||
| 1031 | int ret; | 1030 | int ret; |
| 1032 | 1031 | rcu_read_lock(); | |
| 1033 | read_lock_irqsave(&mapping->tree_lock, flags); | ||
| 1034 | ret = radix_tree_tagged(&mapping->page_tree, tag); | 1032 | ret = radix_tree_tagged(&mapping->page_tree, tag); |
| 1035 | read_unlock_irqrestore(&mapping->tree_lock, flags); | 1033 | rcu_read_unlock(); |
| 1036 | return ret; | 1034 | return ret; |
| 1037 | } | 1035 | } |
| 1038 | EXPORT_SYMBOL(mapping_tagged); | 1036 | EXPORT_SYMBOL(mapping_tagged); |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1a8c59571cb7..d315e1127dc9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
| @@ -41,24 +41,37 @@ | |||
| 41 | #include <linux/pfn.h> | 41 | #include <linux/pfn.h> |
| 42 | #include <linux/backing-dev.h> | 42 | #include <linux/backing-dev.h> |
| 43 | #include <linux/fault-inject.h> | 43 | #include <linux/fault-inject.h> |
| 44 | #include <linux/page-isolation.h> | ||
| 44 | 45 | ||
| 45 | #include <asm/tlbflush.h> | 46 | #include <asm/tlbflush.h> |
| 46 | #include <asm/div64.h> | 47 | #include <asm/div64.h> |
| 47 | #include "internal.h" | 48 | #include "internal.h" |
| 48 | 49 | ||
| 49 | /* | 50 | /* |
| 50 | * MCD - HACK: Find somewhere to initialize this EARLY, or make this | 51 | * Array of node states. |
| 51 | * initializer cleaner | ||
| 52 | */ | 52 | */ |
| 53 | nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; | 53 | nodemask_t node_states[NR_NODE_STATES] __read_mostly = { |
| 54 | EXPORT_SYMBOL(node_online_map); | 54 | [N_POSSIBLE] = NODE_MASK_ALL, |
| 55 | nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; | 55 | [N_ONLINE] = { { [0] = 1UL } }, |
| 56 | EXPORT_SYMBOL(node_possible_map); | 56 | #ifndef CONFIG_NUMA |
| 57 | [N_NORMAL_MEMORY] = { { [0] = 1UL } }, | ||
| 58 | #ifdef CONFIG_HIGHMEM | ||
| 59 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, | ||
| 60 | #endif | ||
| 61 | [N_CPU] = { { [0] = 1UL } }, | ||
| 62 | #endif /* NUMA */ | ||
| 63 | }; | ||
| 64 | EXPORT_SYMBOL(node_states); | ||
| 65 | |||
| 57 | unsigned long totalram_pages __read_mostly; | 66 | unsigned long totalram_pages __read_mostly; |
| 58 | unsigned long totalreserve_pages __read_mostly; | 67 | unsigned long totalreserve_pages __read_mostly; |
| 59 | long nr_swap_pages; | 68 | long nr_swap_pages; |
| 60 | int percpu_pagelist_fraction; | 69 | int percpu_pagelist_fraction; |
| 61 | 70 | ||
| 71 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | ||
| 72 | int pageblock_order __read_mostly; | ||
| 73 | #endif | ||
| 74 | |||
| 62 | static void __free_pages_ok(struct page *page, unsigned int order); | 75 | static void __free_pages_ok(struct page *page, unsigned int order); |
| 63 | 76 | ||
| 64 | /* | 77 | /* |
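
The node_states[] array above replaces the old pair of exported nodemasks with one table indexed by node state (possible, online, has normal memory, has high memory, has a CPU); on non-NUMA builds every state statically contains node 0. The accessors are not part of this hunk, but from the way the rest of the diff uses them they are presumably thin wrappers of the following shape (an assumption, sketched here for orientation only):

    /* Hedged sketch: node_state()/node_set_state()/for_each_node_state()
     * assumed to be simple nodemask operations over the node_states[] table. */
    static inline int node_state(int node, enum node_states state)
    {
            return node_isset(node, node_states[state]);
    }

    static inline void node_set_state(int node, enum node_states state)
    {
            node_set(node, node_states[state]);
    }

    #define for_each_node_state(node, __state) \
            for_each_node_mask((node), node_states[__state])
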
| @@ -137,7 +150,7 @@ static unsigned long __meminitdata dma_reserve; | |||
| 137 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; | 150 | static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; |
| 138 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | 151 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ |
| 139 | unsigned long __initdata required_kernelcore; | 152 | unsigned long __initdata required_kernelcore; |
| 140 | unsigned long __initdata required_movablecore; | 153 | static unsigned long __initdata required_movablecore; |
| 141 | unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; | 154 | unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; |
| 142 | 155 | ||
| 143 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ | 156 | /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ |
| @@ -150,6 +163,14 @@ int nr_node_ids __read_mostly = MAX_NUMNODES; | |||
| 150 | EXPORT_SYMBOL(nr_node_ids); | 163 | EXPORT_SYMBOL(nr_node_ids); |
| 151 | #endif | 164 | #endif |
| 152 | 165 | ||
| 166 | int page_group_by_mobility_disabled __read_mostly; | ||
| 167 | |||
| 168 | static void set_pageblock_migratetype(struct page *page, int migratetype) | ||
| 169 | { | ||
| 170 | set_pageblock_flags_group(page, (unsigned long)migratetype, | ||
| 171 | PB_migrate, PB_migrate_end); | ||
| 172 | } | ||
| 173 | |||
| 153 | #ifdef CONFIG_DEBUG_VM | 174 | #ifdef CONFIG_DEBUG_VM |
| 154 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 175 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
| 155 | { | 176 | { |
| @@ -293,16 +314,6 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) | |||
| 293 | clear_highpage(page + i); | 314 | clear_highpage(page + i); |
| 294 | } | 315 | } |
| 295 | 316 | ||
| 296 | /* | ||
| 297 | * function for dealing with page's order in buddy system. | ||
| 298 | * zone->lock is already acquired when we use these. | ||
| 299 | * So, we don't need atomic page->flags operations here. | ||
| 300 | */ | ||
| 301 | static inline unsigned long page_order(struct page *page) | ||
| 302 | { | ||
| 303 | return page_private(page); | ||
| 304 | } | ||
| 305 | |||
| 306 | static inline void set_page_order(struct page *page, int order) | 317 | static inline void set_page_order(struct page *page, int order) |
| 307 | { | 318 | { |
| 308 | set_page_private(page, order); | 319 | set_page_private(page, order); |
| @@ -404,6 +415,7 @@ static inline void __free_one_page(struct page *page, | |||
| 404 | { | 415 | { |
| 405 | unsigned long page_idx; | 416 | unsigned long page_idx; |
| 406 | int order_size = 1 << order; | 417 | int order_size = 1 << order; |
| 418 | int migratetype = get_pageblock_migratetype(page); | ||
| 407 | 419 | ||
| 408 | if (unlikely(PageCompound(page))) | 420 | if (unlikely(PageCompound(page))) |
| 409 | destroy_compound_page(page, order); | 421 | destroy_compound_page(page, order); |
| @@ -416,7 +428,6 @@ static inline void __free_one_page(struct page *page, | |||
| 416 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); | 428 | __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); |
| 417 | while (order < MAX_ORDER-1) { | 429 | while (order < MAX_ORDER-1) { |
| 418 | unsigned long combined_idx; | 430 | unsigned long combined_idx; |
| 419 | struct free_area *area; | ||
| 420 | struct page *buddy; | 431 | struct page *buddy; |
| 421 | 432 | ||
| 422 | buddy = __page_find_buddy(page, page_idx, order); | 433 | buddy = __page_find_buddy(page, page_idx, order); |
| @@ -424,8 +435,7 @@ static inline void __free_one_page(struct page *page, | |||
| 424 | break; /* Move the buddy up one level. */ | 435 | break; /* Move the buddy up one level. */ |
| 425 | 436 | ||
| 426 | list_del(&buddy->lru); | 437 | list_del(&buddy->lru); |
| 427 | area = zone->free_area + order; | 438 | zone->free_area[order].nr_free--; |
| 428 | area->nr_free--; | ||
| 429 | rmv_page_order(buddy); | 439 | rmv_page_order(buddy); |
| 430 | combined_idx = __find_combined_index(page_idx, order); | 440 | combined_idx = __find_combined_index(page_idx, order); |
| 431 | page = page + (combined_idx - page_idx); | 441 | page = page + (combined_idx - page_idx); |
| @@ -433,7 +443,8 @@ static inline void __free_one_page(struct page *page, | |||
| 433 | order++; | 443 | order++; |
| 434 | } | 444 | } |
| 435 | set_page_order(page, order); | 445 | set_page_order(page, order); |
| 436 | list_add(&page->lru, &zone->free_area[order].free_list); | 446 | list_add(&page->lru, |
| 447 | &zone->free_area[order].free_list[migratetype]); | ||
| 437 | zone->free_area[order].nr_free++; | 448 | zone->free_area[order].nr_free++; |
| 438 | } | 449 | } |
| 439 | 450 | ||
| @@ -567,7 +578,8 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order) | |||
| 567 | * -- wli | 578 | * -- wli |
| 568 | */ | 579 | */ |
| 569 | static inline void expand(struct zone *zone, struct page *page, | 580 | static inline void expand(struct zone *zone, struct page *page, |
| 570 | int low, int high, struct free_area *area) | 581 | int low, int high, struct free_area *area, |
| 582 | int migratetype) | ||
| 571 | { | 583 | { |
| 572 | unsigned long size = 1 << high; | 584 | unsigned long size = 1 << high; |
| 573 | 585 | ||
| @@ -576,7 +588,7 @@ static inline void expand(struct zone *zone, struct page *page, | |||
| 576 | high--; | 588 | high--; |
| 577 | size >>= 1; | 589 | size >>= 1; |
| 578 | VM_BUG_ON(bad_range(zone, &page[size])); | 590 | VM_BUG_ON(bad_range(zone, &page[size])); |
| 579 | list_add(&page[size].lru, &area->free_list); | 591 | list_add(&page[size].lru, &area->free_list[migratetype]); |
| 580 | area->nr_free++; | 592 | area->nr_free++; |
| 581 | set_page_order(&page[size], high); | 593 | set_page_order(&page[size], high); |
| 582 | } | 594 | } |
| @@ -628,49 +640,235 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) | |||
| 628 | return 0; | 640 | return 0; |
| 629 | } | 641 | } |
| 630 | 642 | ||
| 631 | /* | 643 | /* |
| 632 | * Do the hard work of removing an element from the buddy allocator. | 644 | * Go through the free lists for the given migratetype and remove |
| 633 | * Call me with the zone->lock already held. | 645 | * the smallest available page from the freelists |
| 634 | */ | 646 | */ |
| 635 | static struct page *__rmqueue(struct zone *zone, unsigned int order) | 647 | static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, |
| 648 | int migratetype) | ||
| 636 | { | 649 | { |
| 637 | struct free_area * area; | ||
| 638 | unsigned int current_order; | 650 | unsigned int current_order; |
| 651 | struct free_area * area; | ||
| 639 | struct page *page; | 652 | struct page *page; |
| 640 | 653 | ||
| 654 | /* Find a page of the appropriate size in the preferred list */ | ||
| 641 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { | 655 | for (current_order = order; current_order < MAX_ORDER; ++current_order) { |
| 642 | area = zone->free_area + current_order; | 656 | area = &(zone->free_area[current_order]); |
| 643 | if (list_empty(&area->free_list)) | 657 | if (list_empty(&area->free_list[migratetype])) |
| 644 | continue; | 658 | continue; |
| 645 | 659 | ||
| 646 | page = list_entry(area->free_list.next, struct page, lru); | 660 | page = list_entry(area->free_list[migratetype].next, |
| 661 | struct page, lru); | ||
| 647 | list_del(&page->lru); | 662 | list_del(&page->lru); |
| 648 | rmv_page_order(page); | 663 | rmv_page_order(page); |
| 649 | area->nr_free--; | 664 | area->nr_free--; |
| 650 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); | 665 | __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); |
| 651 | expand(zone, page, order, current_order, area); | 666 | expand(zone, page, order, current_order, area, migratetype); |
| 652 | return page; | 667 | return page; |
| 653 | } | 668 | } |
| 654 | 669 | ||
| 655 | return NULL; | 670 | return NULL; |
| 656 | } | 671 | } |
| 657 | 672 | ||
| 673 | |||
| 674 | /* | ||
| 675 | * This array describes the order lists are fallen back to when | ||
| 676 | * the free lists for the desirable migrate type are depleted | ||
| 677 | */ | ||
| 678 | static int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES-1] = { | ||
| 679 | [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | ||
| 680 | [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RESERVE }, | ||
| 681 | [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE }, | ||
| 682 | [MIGRATE_RESERVE] = { MIGRATE_RESERVE, MIGRATE_RESERVE, MIGRATE_RESERVE }, /* Never used */ | ||
| 683 | }; | ||
| 684 | |||
| 685 | /* | ||
| 686 | * Move the free pages in a range to the free lists of the requested type. | ||
| 687 | * Note that start_page and end_page are not aligned on a pageblock | ||
| 688 | * boundary. If alignment is required, use move_freepages_block() | ||
| 689 | */ | ||
| 690 | int move_freepages(struct zone *zone, | ||
| 691 | struct page *start_page, struct page *end_page, | ||
| 692 | int migratetype) | ||
| 693 | { | ||
| 694 | struct page *page; | ||
| 695 | unsigned long order; | ||
| 696 | int pages_moved = 0; | ||
| 697 | |||
| 698 | #ifndef CONFIG_HOLES_IN_ZONE | ||
| 699 | /* | ||
| 700 | * page_zone is not safe to call in this context when | ||
| 701 | * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant | ||
| 702 | * anyway as we check zone boundaries in move_freepages_block(). | ||
| 703 | * Remove at a later date when no bug reports exist related to | ||
| 704 | * grouping pages by mobility | ||
| 705 | */ | ||
| 706 | BUG_ON(page_zone(start_page) != page_zone(end_page)); | ||
| 707 | #endif | ||
| 708 | |||
| 709 | for (page = start_page; page <= end_page;) { | ||
| 710 | if (!pfn_valid_within(page_to_pfn(page))) { | ||
| 711 | page++; | ||
| 712 | continue; | ||
| 713 | } | ||
| 714 | |||
| 715 | if (!PageBuddy(page)) { | ||
| 716 | page++; | ||
| 717 | continue; | ||
| 718 | } | ||
| 719 | |||
| 720 | order = page_order(page); | ||
| 721 | list_del(&page->lru); | ||
| 722 | list_add(&page->lru, | ||
| 723 | &zone->free_area[order].free_list[migratetype]); | ||
| 724 | page += 1 << order; | ||
| 725 | pages_moved += 1 << order; | ||
| 726 | } | ||
| 727 | |||
| 728 | return pages_moved; | ||
| 729 | } | ||
| 730 | |||
| 731 | int move_freepages_block(struct zone *zone, struct page *page, int migratetype) | ||
| 732 | { | ||
| 733 | unsigned long start_pfn, end_pfn; | ||
| 734 | struct page *start_page, *end_page; | ||
| 735 | |||
| 736 | start_pfn = page_to_pfn(page); | ||
| 737 | start_pfn = start_pfn & ~(pageblock_nr_pages-1); | ||
| 738 | start_page = pfn_to_page(start_pfn); | ||
| 739 | end_page = start_page + pageblock_nr_pages - 1; | ||
| 740 | end_pfn = start_pfn + pageblock_nr_pages - 1; | ||
| 741 | |||
| 742 | /* Do not cross zone boundaries */ | ||
| 743 | if (start_pfn < zone->zone_start_pfn) | ||
| 744 | start_page = page; | ||
| 745 | if (end_pfn >= zone->zone_start_pfn + zone->spanned_pages) | ||
| 746 | return 0; | ||
| 747 | |||
| 748 | return move_freepages(zone, start_page, end_page, migratetype); | ||
| 749 | } | ||
| 750 | |||
| 751 | /* Return the page with the lowest PFN in the list */ | ||
| 752 | static struct page *min_page(struct list_head *list) | ||
| 753 | { | ||
| 754 | unsigned long min_pfn = -1UL; | ||
| 755 | struct page *min_page = NULL, *page; | ||
| 756 | |||
| 757 | list_for_each_entry(page, list, lru) { | ||
| 758 | unsigned long pfn = page_to_pfn(page); | ||
| 759 | if (pfn < min_pfn) { | ||
| 760 | min_pfn = pfn; | ||
| 761 | min_page = page; | ||
| 762 | } | ||
| 763 | } | ||
| 764 | |||
| 765 | return min_page; | ||
| 766 | } | ||
| 767 | |||
| 768 | /* Remove an element from the buddy allocator from the fallback list */ | ||
| 769 | static struct page *__rmqueue_fallback(struct zone *zone, int order, | ||
| 770 | int start_migratetype) | ||
| 771 | { | ||
| 772 | struct free_area * area; | ||
| 773 | int current_order; | ||
| 774 | struct page *page; | ||
| 775 | int migratetype, i; | ||
| 776 | |||
| 777 | /* Find the largest possible block of pages in the other list */ | ||
| 778 | for (current_order = MAX_ORDER-1; current_order >= order; | ||
| 779 | --current_order) { | ||
| 780 | for (i = 0; i < MIGRATE_TYPES - 1; i++) { | ||
| 781 | migratetype = fallbacks[start_migratetype][i]; | ||
| 782 | |||
| 783 | /* MIGRATE_RESERVE handled later if necessary */ | ||
| 784 | if (migratetype == MIGRATE_RESERVE) | ||
| 785 | continue; | ||
| 786 | |||
| 787 | area = &(zone->free_area[current_order]); | ||
| 788 | if (list_empty(&area->free_list[migratetype])) | ||
| 789 | continue; | ||
| 790 | |||
| 791 | /* Bias kernel allocations towards low pfns */ | ||
| 792 | page = list_entry(area->free_list[migratetype].next, | ||
| 793 | struct page, lru); | ||
| 794 | if (unlikely(start_migratetype != MIGRATE_MOVABLE)) | ||
| 795 | page = min_page(&area->free_list[migratetype]); | ||
| 796 | area->nr_free--; | ||
| 797 | |||
| 798 | /* | ||
| 799 | * If breaking a large block of pages, move all free | ||
| 800 | * pages to the preferred allocation list. If falling | ||
| 801 | * back for a reclaimable kernel allocation, be more | ||
| 802 | * aggressive about taking ownership of free pages | ||
| 803 | */ | ||
| 804 | if (unlikely(current_order >= (pageblock_order >> 1)) || | ||
| 805 | start_migratetype == MIGRATE_RECLAIMABLE) { | ||
| 806 | unsigned long pages; | ||
| 807 | pages = move_freepages_block(zone, page, | ||
| 808 | start_migratetype); | ||
| 809 | |||
| 810 | /* Claim the whole block if over half of it is free */ | ||
| 811 | if (pages >= (1 << (pageblock_order-1))) | ||
| 812 | set_pageblock_migratetype(page, | ||
| 813 | start_migratetype); | ||
| 814 | |||
| 815 | migratetype = start_migratetype; | ||
| 816 | } | ||
| 817 | |||
| 818 | /* Remove the page from the freelists */ | ||
| 819 | list_del(&page->lru); | ||
| 820 | rmv_page_order(page); | ||
| 821 | __mod_zone_page_state(zone, NR_FREE_PAGES, | ||
| 822 | -(1UL << order)); | ||
| 823 | |||
| 824 | if (current_order == pageblock_order) | ||
| 825 | set_pageblock_migratetype(page, | ||
| 826 | start_migratetype); | ||
| 827 | |||
| 828 | expand(zone, page, order, current_order, area, migratetype); | ||
| 829 | return page; | ||
| 830 | } | ||
| 831 | } | ||
| 832 | |||
| 833 | /* Use MIGRATE_RESERVE rather than fail an allocation */ | ||
| 834 | return __rmqueue_smallest(zone, order, MIGRATE_RESERVE); | ||
| 835 | } | ||
| 836 | |||
| 837 | /* | ||
| 838 | * Do the hard work of removing an element from the buddy allocator. | ||
| 839 | * Call me with the zone->lock already held. | ||
| 840 | */ | ||
| 841 | static struct page *__rmqueue(struct zone *zone, unsigned int order, | ||
| 842 | int migratetype) | ||
| 843 | { | ||
| 844 | struct page *page; | ||
| 845 | |||
| 846 | page = __rmqueue_smallest(zone, order, migratetype); | ||
| 847 | |||
| 848 | if (unlikely(!page)) | ||
| 849 | page = __rmqueue_fallback(zone, order, migratetype); | ||
| 850 | |||
| 851 | return page; | ||
| 852 | } | ||
| 853 | |||
| 658 | /* | 854 | /* |
| 659 | * Obtain a specified number of elements from the buddy allocator, all under | 855 | * Obtain a specified number of elements from the buddy allocator, all under |
| 660 | * a single hold of the lock, for efficiency. Add them to the supplied list. | 856 | * a single hold of the lock, for efficiency. Add them to the supplied list. |
| 661 | * Returns the number of new pages which were placed at *list. | 857 | * Returns the number of new pages which were placed at *list. |
| 662 | */ | 858 | */ |
| 663 | static int rmqueue_bulk(struct zone *zone, unsigned int order, | 859 | static int rmqueue_bulk(struct zone *zone, unsigned int order, |
| 664 | unsigned long count, struct list_head *list) | 860 | unsigned long count, struct list_head *list, |
| 861 | int migratetype) | ||
| 665 | { | 862 | { |
| 666 | int i; | 863 | int i; |
| 667 | 864 | ||
| 668 | spin_lock(&zone->lock); | 865 | spin_lock(&zone->lock); |
| 669 | for (i = 0; i < count; ++i) { | 866 | for (i = 0; i < count; ++i) { |
| 670 | struct page *page = __rmqueue(zone, order); | 867 | struct page *page = __rmqueue(zone, order, migratetype); |
| 671 | if (unlikely(page == NULL)) | 868 | if (unlikely(page == NULL)) |
| 672 | break; | 869 | break; |
| 673 | list_add_tail(&page->lru, list); | 870 | list_add(&page->lru, list); |
| 871 | set_page_private(page, migratetype); | ||
| 674 | } | 872 | } |
| 675 | spin_unlock(&zone->lock); | 873 | spin_unlock(&zone->lock); |
| 676 | return i; | 874 | return i; |
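
The core of the hunk above is __rmqueue_fallback(): when the free lists of the requested migratetype are empty, it scans from the largest order downwards and, per order, walks the fallbacks[] row for that type, only resorting to MIGRATE_RESERVE via the final __rmqueue_smallest() call once every fallback fails. A standalone program (not kernel code) that simply prints the order in which types would be tried for one starting type:

    #include <stdio.h>

    enum { MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,
           MIGRATE_RESERVE, MIGRATE_TYPES };

    static const int fallbacks[MIGRATE_TYPES][MIGRATE_TYPES - 1] = {
            [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_RESERVE },
            [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_RESERVE },
            [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_RESERVE },
            [MIGRATE_RESERVE]     = { MIGRATE_RESERVE,     MIGRATE_RESERVE,   MIGRATE_RESERVE },
    };

    int main(void)
    {
            int start = MIGRATE_UNMOVABLE;

            /* MIGRATE_RESERVE entries are skipped by __rmqueue_fallback() and
             * only reached through the trailing __rmqueue_smallest() call. */
            for (int i = 0; i < MIGRATE_TYPES - 1; i++)
                    printf("fallback %d: migratetype %d\n", i, fallbacks[start][i]);
            return 0;
    }
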
| @@ -732,7 +930,7 @@ void mark_free_pages(struct zone *zone) | |||
| 732 | { | 930 | { |
| 733 | unsigned long pfn, max_zone_pfn; | 931 | unsigned long pfn, max_zone_pfn; |
| 734 | unsigned long flags; | 932 | unsigned long flags; |
| 735 | int order; | 933 | int order, t; |
| 736 | struct list_head *curr; | 934 | struct list_head *curr; |
| 737 | 935 | ||
| 738 | if (!zone->spanned_pages) | 936 | if (!zone->spanned_pages) |
| @@ -749,17 +947,18 @@ void mark_free_pages(struct zone *zone) | |||
| 749 | swsusp_unset_page_free(page); | 947 | swsusp_unset_page_free(page); |
| 750 | } | 948 | } |
| 751 | 949 | ||
| 752 | for (order = MAX_ORDER - 1; order >= 0; --order) | 950 | for_each_migratetype_order(order, t) { |
| 753 | list_for_each(curr, &zone->free_area[order].free_list) { | 951 | list_for_each(curr, &zone->free_area[order].free_list[t]) { |
| 754 | unsigned long i; | 952 | unsigned long i; |
| 755 | 953 | ||
| 756 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); | 954 | pfn = page_to_pfn(list_entry(curr, struct page, lru)); |
| 757 | for (i = 0; i < (1UL << order); i++) | 955 | for (i = 0; i < (1UL << order); i++) |
| 758 | swsusp_set_page_free(pfn_to_page(pfn + i)); | 956 | swsusp_set_page_free(pfn_to_page(pfn + i)); |
| 759 | } | 957 | } |
| 760 | 958 | } | |
| 761 | spin_unlock_irqrestore(&zone->lock, flags); | 959 | spin_unlock_irqrestore(&zone->lock, flags); |
| 762 | } | 960 | } |
| 961 | #endif /* CONFIG_PM */ | ||
| 763 | 962 | ||
| 764 | /* | 963 | /* |
| 765 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. | 964 | * Spill all of this CPU's per-cpu pages back into the buddy allocator. |
| @@ -772,7 +971,25 @@ void drain_local_pages(void) | |||
| 772 | __drain_pages(smp_processor_id()); | 971 | __drain_pages(smp_processor_id()); |
| 773 | local_irq_restore(flags); | 972 | local_irq_restore(flags); |
| 774 | } | 973 | } |
| 775 | #endif /* CONFIG_HIBERNATION */ | 974 | |
| 975 | void smp_drain_local_pages(void *arg) | ||
| 976 | { | ||
| 977 | drain_local_pages(); | ||
| 978 | } | ||
| 979 | |||
| 980 | /* | ||
| 981 | * Spill all the per-cpu pages from all CPUs back into the buddy allocator | ||
| 982 | */ | ||
| 983 | void drain_all_local_pages(void) | ||
| 984 | { | ||
| 985 | unsigned long flags; | ||
| 986 | |||
| 987 | local_irq_save(flags); | ||
| 988 | __drain_pages(smp_processor_id()); | ||
| 989 | local_irq_restore(flags); | ||
| 990 | |||
| 991 | smp_call_function(smp_drain_local_pages, NULL, 0, 1); | ||
| 992 | } | ||
| 776 | 993 | ||
| 777 | /* | 994 | /* |
| 778 | * Free a 0-order page | 995 | * Free a 0-order page |
| @@ -797,6 +1014,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) | |||
| 797 | local_irq_save(flags); | 1014 | local_irq_save(flags); |
| 798 | __count_vm_event(PGFREE); | 1015 | __count_vm_event(PGFREE); |
| 799 | list_add(&page->lru, &pcp->list); | 1016 | list_add(&page->lru, &pcp->list); |
| 1017 | set_page_private(page, get_pageblock_migratetype(page)); | ||
| 800 | pcp->count++; | 1018 | pcp->count++; |
| 801 | if (pcp->count >= pcp->high) { | 1019 | if (pcp->count >= pcp->high) { |
| 802 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | 1020 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); |
| @@ -846,6 +1064,7 @@ static struct page *buffered_rmqueue(struct zonelist *zonelist, | |||
| 846 | struct page *page; | 1064 | struct page *page; |
| 847 | int cold = !!(gfp_flags & __GFP_COLD); | 1065 | int cold = !!(gfp_flags & __GFP_COLD); |
| 848 | int cpu; | 1066 | int cpu; |
| 1067 | int migratetype = allocflags_to_migratetype(gfp_flags); | ||
| 849 | 1068 | ||
| 850 | again: | 1069 | again: |
| 851 | cpu = get_cpu(); | 1070 | cpu = get_cpu(); |
| @@ -856,16 +1075,28 @@ again: | |||
| 856 | local_irq_save(flags); | 1075 | local_irq_save(flags); |
| 857 | if (!pcp->count) { | 1076 | if (!pcp->count) { |
| 858 | pcp->count = rmqueue_bulk(zone, 0, | 1077 | pcp->count = rmqueue_bulk(zone, 0, |
| 859 | pcp->batch, &pcp->list); | 1078 | pcp->batch, &pcp->list, migratetype); |
| 860 | if (unlikely(!pcp->count)) | 1079 | if (unlikely(!pcp->count)) |
| 861 | goto failed; | 1080 | goto failed; |
| 862 | } | 1081 | } |
| 863 | page = list_entry(pcp->list.next, struct page, lru); | 1082 | |
| 1083 | /* Find a page of the appropriate migrate type */ | ||
| 1084 | list_for_each_entry(page, &pcp->list, lru) | ||
| 1085 | if (page_private(page) == migratetype) | ||
| 1086 | break; | ||
| 1087 | |||
| 1088 | /* Allocate more to the pcp list if necessary */ | ||
| 1089 | if (unlikely(&page->lru == &pcp->list)) { | ||
| 1090 | pcp->count += rmqueue_bulk(zone, 0, | ||
| 1091 | pcp->batch, &pcp->list, migratetype); | ||
| 1092 | page = list_entry(pcp->list.next, struct page, lru); | ||
| 1093 | } | ||
| 1094 | |||
| 864 | list_del(&page->lru); | 1095 | list_del(&page->lru); |
| 865 | pcp->count--; | 1096 | pcp->count--; |
| 866 | } else { | 1097 | } else { |
| 867 | spin_lock_irqsave(&zone->lock, flags); | 1098 | spin_lock_irqsave(&zone->lock, flags); |
| 868 | page = __rmqueue(zone, order); | 1099 | page = __rmqueue(zone, order, migratetype); |
| 869 | spin_unlock(&zone->lock); | 1100 | spin_unlock(&zone->lock); |
| 870 | if (!page) | 1101 | if (!page) |
| 871 | goto failed; | 1102 | goto failed; |
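
In the buffered_rmqueue() hunk above, a 0-order request no longer takes the head of the per-cpu list blindly: each page carries its migratetype in page_private (stored when the list was filled), the list is scanned for a matching type, and rmqueue_bulk() refills it if nothing matches. A toy model (ordinary arrays, not kernel lists) of that search-then-refill decision:

    #include <stdio.h>

    #define NPCP 4

    int main(void)
    {
            /* migratetype cached per entry, as page_private would be */
            int pcp_type[NPCP] = { 2, 2, 0, 2 };   /* mostly MOVABLE(2), one UNMOVABLE(0) */
            int want = 0, found = -1;

            for (int i = 0; i < NPCP; i++)          /* scan for a matching type */
                    if (pcp_type[i] == want) {
                            found = i;
                            break;
                    }

            if (found < 0)
                    puts("no match: refill the pcp list (rmqueue_bulk) and retry");
            else
                    printf("take pcp entry %d of type %d\n", found, want);
            return 0;
    }
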
| @@ -1032,7 +1263,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
| 1032 | * | 1263 | * |
| 1033 | * If the zonelist cache is present in the passed in zonelist, then | 1264 | * If the zonelist cache is present in the passed in zonelist, then |
| 1034 | * returns a pointer to the allowed node mask (either the current | 1265 | * returns a pointer to the allowed node mask (either the current |
| 1035 | * tasks mems_allowed, or node_online_map.) | 1266 | * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) |
| 1036 | * | 1267 | * |
| 1037 | * If the zonelist cache is not available for this zonelist, does | 1268 | * If the zonelist cache is not available for this zonelist, does |
| 1038 | * nothing and returns NULL. | 1269 | * nothing and returns NULL. |
| @@ -1061,7 +1292,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | |||
| 1061 | 1292 | ||
| 1062 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | 1293 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? |
| 1063 | &cpuset_current_mems_allowed : | 1294 | &cpuset_current_mems_allowed : |
| 1064 | &node_online_map; | 1295 | &node_states[N_HIGH_MEMORY]; |
| 1065 | return allowednodes; | 1296 | return allowednodes; |
| 1066 | } | 1297 | } |
| 1067 | 1298 | ||
| @@ -1183,9 +1414,6 @@ zonelist_scan: | |||
| 1183 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1414 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
| 1184 | continue; | 1415 | continue; |
| 1185 | zone = *z; | 1416 | zone = *z; |
| 1186 | if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) && | ||
| 1187 | zone->zone_pgdat != zonelist->zones[0]->zone_pgdat)) | ||
| 1188 | break; | ||
| 1189 | if ((alloc_flags & ALLOC_CPUSET) && | 1417 | if ((alloc_flags & ALLOC_CPUSET) && |
| 1190 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1418 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
| 1191 | goto try_next_zone; | 1419 | goto try_next_zone; |
| @@ -1254,7 +1482,10 @@ restart: | |||
| 1254 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ | 1482 | z = zonelist->zones; /* the list of zones suitable for gfp_mask */ |
| 1255 | 1483 | ||
| 1256 | if (unlikely(*z == NULL)) { | 1484 | if (unlikely(*z == NULL)) { |
| 1257 | /* Should this ever happen?? */ | 1485 | /* |
| 1486 | * Happens if we have an empty zonelist as a result of | ||
| 1487 | * GFP_THISNODE being used on a memoryless node | ||
| 1488 | */ | ||
| 1258 | return NULL; | 1489 | return NULL; |
| 1259 | } | 1490 | } |
| 1260 | 1491 | ||
| @@ -1346,6 +1577,9 @@ nofail_alloc: | |||
| 1346 | 1577 | ||
| 1347 | cond_resched(); | 1578 | cond_resched(); |
| 1348 | 1579 | ||
| 1580 | if (order != 0) | ||
| 1581 | drain_all_local_pages(); | ||
| 1582 | |||
| 1349 | if (likely(did_some_progress)) { | 1583 | if (likely(did_some_progress)) { |
| 1350 | page = get_page_from_freelist(gfp_mask, order, | 1584 | page = get_page_from_freelist(gfp_mask, order, |
| 1351 | zonelist, alloc_flags); | 1585 | zonelist, alloc_flags); |
| @@ -1794,7 +2028,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
| 1794 | return node; | 2028 | return node; |
| 1795 | } | 2029 | } |
| 1796 | 2030 | ||
| 1797 | for_each_online_node(n) { | 2031 | for_each_node_state(n, N_HIGH_MEMORY) { |
| 1798 | cpumask_t tmp; | 2032 | cpumask_t tmp; |
| 1799 | 2033 | ||
| 1800 | /* Don't want a node to appear more than once */ | 2034 | /* Don't want a node to appear more than once */ |
| @@ -1850,6 +2084,22 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) | |||
| 1850 | } | 2084 | } |
| 1851 | 2085 | ||
| 1852 | /* | 2086 | /* |
| 2087 | * Build gfp_thisnode zonelists | ||
| 2088 | */ | ||
| 2089 | static void build_thisnode_zonelists(pg_data_t *pgdat) | ||
| 2090 | { | ||
| 2091 | enum zone_type i; | ||
| 2092 | int j; | ||
| 2093 | struct zonelist *zonelist; | ||
| 2094 | |||
| 2095 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
| 2096 | zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i; | ||
| 2097 | j = build_zonelists_node(pgdat, zonelist, 0, i); | ||
| 2098 | zonelist->zones[j] = NULL; | ||
| 2099 | } | ||
| 2100 | } | ||
| 2101 | |||
| 2102 | /* | ||
| 1853 | * Build zonelists ordered by zone and nodes within zones. | 2103 | * Build zonelists ordered by zone and nodes within zones. |
| 1854 | * This results in conserving DMA zone[s] until all Normal memory is | 2104 | * This results in conserving DMA zone[s] until all Normal memory is |
| 1855 | * exhausted, but results in overflowing to remote node while memory | 2105 | * exhausted, but results in overflowing to remote node while memory |
| @@ -1915,7 +2165,8 @@ static int default_zonelist_order(void) | |||
| 1915 | * If there is a node whose DMA/DMA32 memory is very big area on | 2165 | * If there is a node whose DMA/DMA32 memory is very big area on |
| 1916 | * local memory, NODE_ORDER may be suitable. | 2166 | * local memory, NODE_ORDER may be suitable. |
| 1917 | */ | 2167 | */ |
| 1918 | average_size = total_size / (num_online_nodes() + 1); | 2168 | average_size = total_size / |
| 2169 | (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); | ||
| 1919 | for_each_online_node(nid) { | 2170 | for_each_online_node(nid) { |
| 1920 | low_kmem_size = 0; | 2171 | low_kmem_size = 0; |
| 1921 | total_size = 0; | 2172 | total_size = 0; |
| @@ -1953,7 +2204,7 @@ static void build_zonelists(pg_data_t *pgdat) | |||
| 1953 | int order = current_zonelist_order; | 2204 | int order = current_zonelist_order; |
| 1954 | 2205 | ||
| 1955 | /* initialize zonelists */ | 2206 | /* initialize zonelists */ |
| 1956 | for (i = 0; i < MAX_NR_ZONES; i++) { | 2207 | for (i = 0; i < MAX_ZONELISTS; i++) { |
| 1957 | zonelist = pgdat->node_zonelists + i; | 2208 | zonelist = pgdat->node_zonelists + i; |
| 1958 | zonelist->zones[0] = NULL; | 2209 | zonelist->zones[0] = NULL; |
| 1959 | } | 2210 | } |
| @@ -1998,6 +2249,8 @@ static void build_zonelists(pg_data_t *pgdat) | |||
| 1998 | /* calculate node order -- i.e., DMA last! */ | 2249 | /* calculate node order -- i.e., DMA last! */ |
| 1999 | build_zonelists_in_zone_order(pgdat, j); | 2250 | build_zonelists_in_zone_order(pgdat, j); |
| 2000 | } | 2251 | } |
| 2252 | |||
| 2253 | build_thisnode_zonelists(pgdat); | ||
| 2001 | } | 2254 | } |
| 2002 | 2255 | ||
| 2003 | /* Construct the zonelist performance cache - see further mmzone.h */ | 2256 | /* Construct the zonelist performance cache - see further mmzone.h */ |
| @@ -2078,8 +2331,10 @@ static int __build_all_zonelists(void *dummy) | |||
| 2078 | int nid; | 2331 | int nid; |
| 2079 | 2332 | ||
| 2080 | for_each_online_node(nid) { | 2333 | for_each_online_node(nid) { |
| 2081 | build_zonelists(NODE_DATA(nid)); | 2334 | pg_data_t *pgdat = NODE_DATA(nid); |
| 2082 | build_zonelist_cache(NODE_DATA(nid)); | 2335 | |
| 2336 | build_zonelists(pgdat); | ||
| 2337 | build_zonelist_cache(pgdat); | ||
| 2083 | } | 2338 | } |
| 2084 | return 0; | 2339 | return 0; |
| 2085 | } | 2340 | } |
| @@ -2098,9 +2353,23 @@ void build_all_zonelists(void) | |||
| 2098 | /* cpuset refresh routine should be here */ | 2353 | /* cpuset refresh routine should be here */ |
| 2099 | } | 2354 | } |
| 2100 | vm_total_pages = nr_free_pagecache_pages(); | 2355 | vm_total_pages = nr_free_pagecache_pages(); |
| 2101 | printk("Built %i zonelists in %s order. Total pages: %ld\n", | 2356 | /* |
| 2357 | * Disable grouping by mobility if the number of pages in the | ||
| 2358 | * system is too low to allow the mechanism to work. It would be | ||
| 2359 | * more accurate, but expensive to check per-zone. This check is | ||
| 2360 | * made on memory-hotadd so a system can start with mobility | ||
| 2361 | * disabled and enable it later | ||
| 2362 | */ | ||
| 2363 | if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) | ||
| 2364 | page_group_by_mobility_disabled = 1; | ||
| 2365 | else | ||
| 2366 | page_group_by_mobility_disabled = 0; | ||
| 2367 | |||
| 2368 | printk("Built %i zonelists in %s order, mobility grouping %s. " | ||
| 2369 | "Total pages: %ld\n", | ||
| 2102 | num_online_nodes(), | 2370 | num_online_nodes(), |
| 2103 | zonelist_order_name[current_zonelist_order], | 2371 | zonelist_order_name[current_zonelist_order], |
| 2372 | page_group_by_mobility_disabled ? "off" : "on", | ||
| 2104 | vm_total_pages); | 2373 | vm_total_pages); |
| 2105 | #ifdef CONFIG_NUMA | 2374 | #ifdef CONFIG_NUMA |
| 2106 | printk("Policy zone: %s\n", zone_names[policy_zone]); | 2375 | printk("Policy zone: %s\n", zone_names[policy_zone]); |
| @@ -2176,6 +2445,61 @@ static inline unsigned long wait_table_bits(unsigned long size) | |||
| 2176 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) | 2445 | #define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) |
| 2177 | 2446 | ||
| 2178 | /* | 2447 | /* |
| 2448 | * Mark a number of pageblocks as MIGRATE_RESERVE. The number | ||
| 2449 | * of blocks reserved is based on zone->pages_min. The memory within the | ||
| 2450 | * reserve will tend to store contiguous free pages. Setting min_free_kbytes | ||
| 2451 | * higher will lead to a bigger reserve which will get freed as contiguous | ||
| 2452 | * blocks as reclaim kicks in | ||
| 2453 | */ | ||
| 2454 | static void setup_zone_migrate_reserve(struct zone *zone) | ||
| 2455 | { | ||
| 2456 | unsigned long start_pfn, pfn, end_pfn; | ||
| 2457 | struct page *page; | ||
| 2458 | unsigned long reserve, block_migratetype; | ||
| 2459 | |||
| 2460 | /* Get the start pfn, end pfn and the number of blocks to reserve */ | ||
| 2461 | start_pfn = zone->zone_start_pfn; | ||
| 2462 | end_pfn = start_pfn + zone->spanned_pages; | ||
| 2463 | reserve = roundup(zone->pages_min, pageblock_nr_pages) >> | ||
| 2464 | pageblock_order; | ||
| 2465 | |||
| 2466 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | ||
| 2467 | if (!pfn_valid(pfn)) | ||
| 2468 | continue; | ||
| 2469 | page = pfn_to_page(pfn); | ||
| 2470 | |||
| 2471 | /* Blocks with reserved pages will never free, skip them. */ | ||
| 2472 | if (PageReserved(page)) | ||
| 2473 | continue; | ||
| 2474 | |||
| 2475 | block_migratetype = get_pageblock_migratetype(page); | ||
| 2476 | |||
| 2477 | /* If this block is reserved, account for it */ | ||
| 2478 | if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { | ||
| 2479 | reserve--; | ||
| 2480 | continue; | ||
| 2481 | } | ||
| 2482 | |||
| 2483 | /* Suitable for reserving if this block is movable */ | ||
| 2484 | if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { | ||
| 2485 | set_pageblock_migratetype(page, MIGRATE_RESERVE); | ||
| 2486 | move_freepages_block(zone, page, MIGRATE_RESERVE); | ||
| 2487 | reserve--; | ||
| 2488 | continue; | ||
| 2489 | } | ||
| 2490 | |||
| 2491 | /* | ||
| 2492 | * If the reserve is met and this is a previous reserved block, | ||
| 2493 | * take it back | ||
| 2494 | */ | ||
| 2495 | if (block_migratetype == MIGRATE_RESERVE) { | ||
| 2496 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
| 2497 | move_freepages_block(zone, page, MIGRATE_MOVABLE); | ||
| 2498 | } | ||
| 2499 | } | ||
| 2500 | } | ||
| 2501 | |||
| 2502 | /* | ||
| 2179 | * Initially all pages are reserved - free ones are freed | 2503 | * Initially all pages are reserved - free ones are freed |
| 2180 | * up by free_all_bootmem() once the early boot process is | 2504 | * up by free_all_bootmem() once the early boot process is |
| 2181 | * done. Non-atomic initialization, single-pass. | 2505 | * done. Non-atomic initialization, single-pass. |
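
setup_zone_migrate_reserve() above sizes the reserve from zone->pages_min rounded up to whole pageblocks, then walks the zone converting movable blocks to MIGRATE_RESERVE until that many are reserved, and converting any surplus reserve blocks back to movable. A worked example of the sizing step, with assumed values not taken from the patch:

    #include <stdio.h>

    int main(void)
    {
            /* Assumed: 4K pages, pageblock_order = 9, so 512 pages per block,
             * and a zone whose pages_min watermark is 1024 pages. */
            unsigned long pageblock_order = 9;
            unsigned long pageblock_nr_pages = 1UL << pageblock_order;
            unsigned long pages_min = 1024;

            unsigned long reserve = ((pages_min + pageblock_nr_pages - 1) /
                                     pageblock_nr_pages * pageblock_nr_pages)
                                    >> pageblock_order;

            printf("MIGRATE_RESERVE pageblocks: %lu\n", reserve);   /* prints 2 */
            return 0;
    }
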
| @@ -2204,6 +2528,19 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
| 2204 | init_page_count(page); | 2528 | init_page_count(page); |
| 2205 | reset_page_mapcount(page); | 2529 | reset_page_mapcount(page); |
| 2206 | SetPageReserved(page); | 2530 | SetPageReserved(page); |
| 2531 | |||
| 2532 | /* | ||
| 2533 | * Mark the block movable so that blocks are reserved for | ||
| 2534 | * movable at startup. This will force kernel allocations | ||
| 2535 | * to reserve their blocks rather than leaking throughout | ||
| 2536 | * the address space during boot when many long-lived | ||
| 2537 | * kernel allocations are made. Later some blocks near | ||
| 2538 | * the start are marked MIGRATE_RESERVE by | ||
| 2539 | * setup_zone_migrate_reserve() | ||
| 2540 | */ | ||
| 2541 | if ((pfn & (pageblock_nr_pages-1))) | ||
| 2542 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
| 2543 | |||
| 2207 | INIT_LIST_HEAD(&page->lru); | 2544 | INIT_LIST_HEAD(&page->lru); |
| 2208 | #ifdef WANT_PAGE_VIRTUAL | 2545 | #ifdef WANT_PAGE_VIRTUAL |
| 2209 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ | 2546 | /* The shift won't overflow because ZONE_NORMAL is below 4G. */ |
| @@ -2216,9 +2553,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
| 2216 | static void __meminit zone_init_free_lists(struct pglist_data *pgdat, | 2553 | static void __meminit zone_init_free_lists(struct pglist_data *pgdat, |
| 2217 | struct zone *zone, unsigned long size) | 2554 | struct zone *zone, unsigned long size) |
| 2218 | { | 2555 | { |
| 2219 | int order; | 2556 | int order, t; |
| 2220 | for (order = 0; order < MAX_ORDER ; order++) { | 2557 | for_each_migratetype_order(order, t) { |
| 2221 | INIT_LIST_HEAD(&zone->free_area[order].free_list); | 2558 | INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); |
| 2222 | zone->free_area[order].nr_free = 0; | 2559 | zone->free_area[order].nr_free = 0; |
| 2223 | } | 2560 | } |
| 2224 | } | 2561 | } |
| @@ -2324,6 +2661,9 @@ static struct per_cpu_pageset boot_pageset[NR_CPUS]; | |||
| 2324 | static int __cpuinit process_zones(int cpu) | 2661 | static int __cpuinit process_zones(int cpu) |
| 2325 | { | 2662 | { |
| 2326 | struct zone *zone, *dzone; | 2663 | struct zone *zone, *dzone; |
| 2664 | int node = cpu_to_node(cpu); | ||
| 2665 | |||
| 2666 | node_set_state(node, N_CPU); /* this node has a cpu */ | ||
| 2327 | 2667 | ||
| 2328 | for_each_zone(zone) { | 2668 | for_each_zone(zone) { |
| 2329 | 2669 | ||
| @@ -2331,7 +2671,7 @@ static int __cpuinit process_zones(int cpu) | |||
| 2331 | continue; | 2671 | continue; |
| 2332 | 2672 | ||
| 2333 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), | 2673 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), |
| 2334 | GFP_KERNEL, cpu_to_node(cpu)); | 2674 | GFP_KERNEL, node); |
| 2335 | if (!zone_pcp(zone, cpu)) | 2675 | if (!zone_pcp(zone, cpu)) |
| 2336 | goto bad; | 2676 | goto bad; |
| 2337 | 2677 | ||
| @@ -2444,7 +2784,7 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | |||
| 2444 | * To use this new node's memory, further consideration will be | 2784 | * To use this new node's memory, further consideration will be |
| 2445 | * necessary. | 2785 | * necessary. |
| 2446 | */ | 2786 | */ |
| 2447 | zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); | 2787 | zone->wait_table = vmalloc(alloc_size); |
| 2448 | } | 2788 | } |
| 2449 | if (!zone->wait_table) | 2789 | if (!zone->wait_table) |
| 2450 | return -ENOMEM; | 2790 | return -ENOMEM; |
| @@ -2680,10 +3020,8 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, | |||
| 2680 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); | 3020 | *end_pfn = max(*end_pfn, early_node_map[i].end_pfn); |
| 2681 | } | 3021 | } |
| 2682 | 3022 | ||
| 2683 | if (*start_pfn == -1UL) { | 3023 | if (*start_pfn == -1UL) |
| 2684 | printk(KERN_WARNING "Node %u active with no memory\n", nid); | ||
| 2685 | *start_pfn = 0; | 3024 | *start_pfn = 0; |
| 2686 | } | ||
| 2687 | 3025 | ||
| 2688 | /* Push the node boundaries out if requested */ | 3026 | /* Push the node boundaries out if requested */ |
| 2689 | account_node_boundary(nid, start_pfn, end_pfn); | 3027 | account_node_boundary(nid, start_pfn, end_pfn); |
| @@ -2901,6 +3239,62 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, | |||
| 2901 | realtotalpages); | 3239 | realtotalpages); |
| 2902 | } | 3240 | } |
| 2903 | 3241 | ||
| 3242 | #ifndef CONFIG_SPARSEMEM | ||
| 3243 | /* | ||
| 3244 | * Calculate the size of the zone->blockflags rounded to an unsigned long | ||
| 3245 | * Start by making sure zonesize is a multiple of pageblock_order by rounding | ||
| 3246 | * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally | ||
| 3247 | * round what is now in bits to nearest long in bits, then return it in | ||
| 3248 | * bytes. | ||
| 3249 | */ | ||
| 3250 | static unsigned long __init usemap_size(unsigned long zonesize) | ||
| 3251 | { | ||
| 3252 | unsigned long usemapsize; | ||
| 3253 | |||
| 3254 | usemapsize = roundup(zonesize, pageblock_nr_pages); | ||
| 3255 | usemapsize = usemapsize >> pageblock_order; | ||
| 3256 | usemapsize *= NR_PAGEBLOCK_BITS; | ||
| 3257 | usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); | ||
| 3258 | |||
| 3259 | return usemapsize / 8; | ||
| 3260 | } | ||
| 3261 | |||
| 3262 | static void __init setup_usemap(struct pglist_data *pgdat, | ||
| 3263 | struct zone *zone, unsigned long zonesize) | ||
| 3264 | { | ||
| 3265 | unsigned long usemapsize = usemap_size(zonesize); | ||
| 3266 | zone->pageblock_flags = NULL; | ||
| 3267 | if (usemapsize) { | ||
| 3268 | zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); | ||
| 3269 | memset(zone->pageblock_flags, 0, usemapsize); | ||
| 3270 | } | ||
| 3271 | } | ||
| 3272 | #else | ||
| 3273 | static void inline setup_usemap(struct pglist_data *pgdat, | ||
| 3274 | struct zone *zone, unsigned long zonesize) {} | ||
| 3275 | #endif /* CONFIG_SPARSEMEM */ | ||
| 3276 | |||
| 3277 | #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE | ||
| 3278 | /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ | ||
| 3279 | static inline void __init set_pageblock_order(unsigned int order) | ||
| 3280 | { | ||
| 3281 | /* Check that pageblock_nr_pages has not already been setup */ | ||
| 3282 | if (pageblock_order) | ||
| 3283 | return; | ||
| 3284 | |||
| 3285 | /* | ||
| 3286 | * Assume the largest contiguous order of interest is a huge page. | ||
| 3287 | * This value may be variable depending on boot parameters on IA64 | ||
| 3288 | */ | ||
| 3289 | pageblock_order = order; | ||
| 3290 | } | ||
| 3291 | #else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | ||
| 3292 | |||
| 3293 | /* Defined this way to avoid accidentally referencing HUGETLB_PAGE_ORDER */ | ||
| 3294 | #define set_pageblock_order(x) do {} while (0) | ||
| 3295 | |||
| 3296 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | ||
| 3297 | |||
| 2904 | /* | 3298 | /* |
| 2905 | * Set up the zone data structures: | 3299 | * Set up the zone data structures: |
| 2906 | * - mark all pages reserved | 3300 | * - mark all pages reserved |
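
usemap_size() above works purely in units: round the zone up to whole pageblocks, spend NR_PAGEBLOCK_BITS per block, round the bit count up to a whole unsigned long, and return bytes. The same arithmetic as a standalone program, with assumed example values (NR_PAGEBLOCK_BITS taken as 4, which this hunk does not itself define):

    #include <stdio.h>

    int main(void)
    {
            /* Assumed: a 1GiB zone of 4K pages, pageblock_order = 9. */
            unsigned long zonesize = 262144;            /* pages             */
            unsigned long pageblock_nr_pages = 512;
            unsigned long pageblock_order = 9;
            unsigned long nr_pageblock_bits = 4;        /* assumed           */
            unsigned long bits_per_long = 8 * sizeof(unsigned long);

            unsigned long usemapsize =
                    (zonesize + pageblock_nr_pages - 1) & ~(pageblock_nr_pages - 1);
            usemapsize >>= pageblock_order;             /* 512 pageblocks    */
            usemapsize *= nr_pageblock_bits;            /* 2048 bits         */
            usemapsize = (usemapsize + bits_per_long - 1) & ~(bits_per_long - 1);

            printf("usemap bytes: %lu\n", usemapsize / 8);   /* prints 256 */
            return 0;
    }
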
| @@ -2981,6 +3375,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
| 2981 | if (!size) | 3375 | if (!size) |
| 2982 | continue; | 3376 | continue; |
| 2983 | 3377 | ||
| 3378 | set_pageblock_order(HUGETLB_PAGE_ORDER); | ||
| 3379 | setup_usemap(pgdat, zone, size); | ||
| 2984 | ret = init_currently_empty_zone(zone, zone_start_pfn, | 3380 | ret = init_currently_empty_zone(zone, zone_start_pfn, |
| 2985 | size, MEMMAP_EARLY); | 3381 | size, MEMMAP_EARLY); |
| 2986 | BUG_ON(ret); | 3382 | BUG_ON(ret); |
| @@ -3234,16 +3630,24 @@ unsigned long __init find_max_pfn_with_active_regions(void) | |||
| 3234 | return max_pfn; | 3630 | return max_pfn; |
| 3235 | } | 3631 | } |
| 3236 | 3632 | ||
| 3237 | unsigned long __init early_calculate_totalpages(void) | 3633 | /* |
| 3634 | * early_calculate_totalpages() | ||
| 3635 | * Sum pages in active regions for movable zone. | ||
| 3636 | * Populate N_HIGH_MEMORY for calculating usable_nodes. | ||
| 3637 | */ | ||
| 3638 | static unsigned long __init early_calculate_totalpages(void) | ||
| 3238 | { | 3639 | { |
| 3239 | int i; | 3640 | int i; |
| 3240 | unsigned long totalpages = 0; | 3641 | unsigned long totalpages = 0; |
| 3241 | 3642 | ||
| 3242 | for (i = 0; i < nr_nodemap_entries; i++) | 3643 | for (i = 0; i < nr_nodemap_entries; i++) { |
| 3243 | totalpages += early_node_map[i].end_pfn - | 3644 | unsigned long pages = early_node_map[i].end_pfn - |
| 3244 | early_node_map[i].start_pfn; | 3645 | early_node_map[i].start_pfn; |
| 3245 | 3646 | totalpages += pages; | |
| 3246 | return totalpages; | 3647 | if (pages) |
| 3648 | node_set_state(early_node_map[i].nid, N_HIGH_MEMORY); | ||
| 3649 | } | ||
| 3650 | return totalpages; | ||
| 3247 | } | 3651 | } |
| 3248 | 3652 | ||
| 3249 | /* | 3653 | /* |
| @@ -3257,7 +3661,8 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
| 3257 | int i, nid; | 3661 | int i, nid; |
| 3258 | unsigned long usable_startpfn; | 3662 | unsigned long usable_startpfn; |
| 3259 | unsigned long kernelcore_node, kernelcore_remaining; | 3663 | unsigned long kernelcore_node, kernelcore_remaining; |
| 3260 | int usable_nodes = num_online_nodes(); | 3664 | unsigned long totalpages = early_calculate_totalpages(); |
| 3665 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | ||
| 3261 | 3666 | ||
| 3262 | /* | 3667 | /* |
| 3263 | * If movablecore was specified, calculate what size of | 3668 | * If movablecore was specified, calculate what size of |
| @@ -3268,7 +3673,6 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
| 3268 | * what movablecore would have allowed. | 3673 | * what movablecore would have allowed. |
| 3269 | */ | 3674 | */ |
| 3270 | if (required_movablecore) { | 3675 | if (required_movablecore) { |
| 3271 | unsigned long totalpages = early_calculate_totalpages(); | ||
| 3272 | unsigned long corepages; | 3676 | unsigned long corepages; |
| 3273 | 3677 | ||
| 3274 | /* | 3678 | /* |
| @@ -3293,7 +3697,7 @@ void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) | |||
| 3293 | restart: | 3697 | restart: |
| 3294 | /* Spread kernelcore memory as evenly as possible throughout nodes */ | 3698 | /* Spread kernelcore memory as evenly as possible throughout nodes */ |
| 3295 | kernelcore_node = required_kernelcore / usable_nodes; | 3699 | kernelcore_node = required_kernelcore / usable_nodes; |
| 3296 | for_each_online_node(nid) { | 3700 | for_each_node_state(nid, N_HIGH_MEMORY) { |
| 3297 | /* | 3701 | /* |
| 3298 | * Recalculate kernelcore_node if the division per node | 3702 | * Recalculate kernelcore_node if the division per node |
| 3299 | * now exceeds what is necessary to satisfy the requested | 3703 | * now exceeds what is necessary to satisfy the requested |
| @@ -3385,6 +3789,20 @@ restart: | |||
| 3385 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); | 3789 | roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); |
| 3386 | } | 3790 | } |
| 3387 | 3791 | ||
| 3792 | /* Any regular memory on that node? */ | ||
| 3793 | static void check_for_regular_memory(pg_data_t *pgdat) | ||
| 3794 | { | ||
| 3795 | #ifdef CONFIG_HIGHMEM | ||
| 3796 | enum zone_type zone_type; | ||
| 3797 | |||
| 3798 | for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { | ||
| 3799 | struct zone *zone = &pgdat->node_zones[zone_type]; | ||
| 3800 | if (zone->present_pages) | ||
| 3801 | node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); | ||
| 3802 | } | ||
| 3803 | #endif | ||
| 3804 | } | ||
| 3805 | |||
| 3388 | /** | 3806 | /** |
| 3389 | * free_area_init_nodes - Initialise all pg_data_t and zone data | 3807 | * free_area_init_nodes - Initialise all pg_data_t and zone data |
| 3390 | * @max_zone_pfn: an array of max PFNs for each zone | 3808 | * @max_zone_pfn: an array of max PFNs for each zone |
| @@ -3459,6 +3877,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
| 3459 | pg_data_t *pgdat = NODE_DATA(nid); | 3877 | pg_data_t *pgdat = NODE_DATA(nid); |
| 3460 | free_area_init_node(nid, pgdat, NULL, | 3878 | free_area_init_node(nid, pgdat, NULL, |
| 3461 | find_min_pfn_for_node(nid), NULL); | 3879 | find_min_pfn_for_node(nid), NULL); |
| 3880 | |||
| 3881 | /* Any memory on that node? */ | ||
| 3882 | if (pgdat->node_present_pages) | ||
| 3883 | node_set_state(nid, N_HIGH_MEMORY); | ||
| 3884 | check_for_regular_memory(pgdat); | ||
| 3462 | } | 3885 | } |
| 3463 | } | 3886 | } |
| 3464 | 3887 | ||
| @@ -3673,6 +4096,7 @@ void setup_per_zone_pages_min(void) | |||
| 3673 | 4096 | ||
| 3674 | zone->pages_low = zone->pages_min + (tmp >> 2); | 4097 | zone->pages_low = zone->pages_min + (tmp >> 2); |
| 3675 | zone->pages_high = zone->pages_min + (tmp >> 1); | 4098 | zone->pages_high = zone->pages_min + (tmp >> 1); |
| 4099 | setup_zone_migrate_reserve(zone); | ||
| 3676 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 4100 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| 3677 | } | 4101 | } |
| 3678 | 4102 | ||
| @@ -3934,4 +4358,169 @@ EXPORT_SYMBOL(pfn_to_page); | |||
| 3934 | EXPORT_SYMBOL(page_to_pfn); | 4358 | EXPORT_SYMBOL(page_to_pfn); |
| 3935 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ | 4359 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ |
| 3936 | 4360 | ||
| 4361 | /* Return a pointer to the bitmap storing bits affecting a block of pages */ | ||
| 4362 | static inline unsigned long *get_pageblock_bitmap(struct zone *zone, | ||
| 4363 | unsigned long pfn) | ||
| 4364 | { | ||
| 4365 | #ifdef CONFIG_SPARSEMEM | ||
| 4366 | return __pfn_to_section(pfn)->pageblock_flags; | ||
| 4367 | #else | ||
| 4368 | return zone->pageblock_flags; | ||
| 4369 | #endif /* CONFIG_SPARSEMEM */ | ||
| 4370 | } | ||
| 4371 | |||
| 4372 | static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) | ||
| 4373 | { | ||
| 4374 | #ifdef CONFIG_SPARSEMEM | ||
| 4375 | pfn &= (PAGES_PER_SECTION-1); | ||
| 4376 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | ||
| 4377 | #else | ||
| 4378 | pfn = pfn - zone->zone_start_pfn; | ||
| 4379 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | ||
| 4380 | #endif /* CONFIG_SPARSEMEM */ | ||
| 4381 | } | ||
| 4382 | |||
| 4383 | /** | ||
| 4384 | * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages | ||
| 4385 | * @page: The page within the block of interest | ||
| 4386 | * @start_bitidx: The first bit of interest to retrieve | ||
| 4387 | * @end_bitidx: The last bit of interest | ||
| 4388 | * returns pageblock_bits flags | ||
| 4389 | */ | ||
| 4390 | unsigned long get_pageblock_flags_group(struct page *page, | ||
| 4391 | int start_bitidx, int end_bitidx) | ||
| 4392 | { | ||
| 4393 | struct zone *zone; | ||
| 4394 | unsigned long *bitmap; | ||
| 4395 | unsigned long pfn, bitidx; | ||
| 4396 | unsigned long flags = 0; | ||
| 4397 | unsigned long value = 1; | ||
| 4398 | |||
| 4399 | zone = page_zone(page); | ||
| 4400 | pfn = page_to_pfn(page); | ||
| 4401 | bitmap = get_pageblock_bitmap(zone, pfn); | ||
| 4402 | bitidx = pfn_to_bitidx(zone, pfn); | ||
| 4403 | |||
| 4404 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | ||
| 4405 | if (test_bit(bitidx + start_bitidx, bitmap)) | ||
| 4406 | flags |= value; | ||
| 4407 | |||
| 4408 | return flags; | ||
| 4409 | } | ||
| 3937 | 4410 | ||
| 4411 | /** | ||
| 4412 | * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages | ||
| 4413 | * @page: The page within the block of interest | ||
| 4414 | * @start_bitidx: The first bit of interest | ||
| 4415 | * @end_bitidx: The last bit of interest | ||
| 4416 | * @flags: The flags to set | ||
| 4417 | */ | ||
| 4418 | void set_pageblock_flags_group(struct page *page, unsigned long flags, | ||
| 4419 | int start_bitidx, int end_bitidx) | ||
| 4420 | { | ||
| 4421 | struct zone *zone; | ||
| 4422 | unsigned long *bitmap; | ||
| 4423 | unsigned long pfn, bitidx; | ||
| 4424 | unsigned long value = 1; | ||
| 4425 | |||
| 4426 | zone = page_zone(page); | ||
| 4427 | pfn = page_to_pfn(page); | ||
| 4428 | bitmap = get_pageblock_bitmap(zone, pfn); | ||
| 4429 | bitidx = pfn_to_bitidx(zone, pfn); | ||
| 4430 | |||
| 4431 | for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) | ||
| 4432 | if (flags & value) | ||
| 4433 | __set_bit(bitidx + start_bitidx, bitmap); | ||
| 4434 | else | ||
| 4435 | __clear_bit(bitidx + start_bitidx, bitmap); | ||
| 4436 | } | ||
| 4437 | |||
| 4438 | /* | ||
| 4439 | * This is designed as a sub-function; please see page_isolation.c also. | ||
| 4440 | * Set/clear a page block's type to be ISOLATE. | ||
| 4441 | * The page allocator never allocates memory from an ISOLATE block. | ||
| 4442 | */ | ||
| 4443 | |||
| 4444 | int set_migratetype_isolate(struct page *page) | ||
| 4445 | { | ||
| 4446 | struct zone *zone; | ||
| 4447 | unsigned long flags; | ||
| 4448 | int ret = -EBUSY; | ||
| 4449 | |||
| 4450 | zone = page_zone(page); | ||
| 4451 | spin_lock_irqsave(&zone->lock, flags); | ||
| 4452 | /* | ||
| 4453 | * In the future, more migrate types will be able to be isolation targets. | ||
| 4454 | */ | ||
| 4455 | if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) | ||
| 4456 | goto out; | ||
| 4457 | set_pageblock_migratetype(page, MIGRATE_ISOLATE); | ||
| 4458 | move_freepages_block(zone, page, MIGRATE_ISOLATE); | ||
| 4459 | ret = 0; | ||
| 4460 | out: | ||
| 4461 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 4462 | if (!ret) | ||
| 4463 | drain_all_local_pages(); | ||
| 4464 | return ret; | ||
| 4465 | } | ||
| 4466 | |||
| 4467 | void unset_migratetype_isolate(struct page *page) | ||
| 4468 | { | ||
| 4469 | struct zone *zone; | ||
| 4470 | unsigned long flags; | ||
| 4471 | zone = page_zone(page); | ||
| 4472 | spin_lock_irqsave(&zone->lock, flags); | ||
| 4473 | if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) | ||
| 4474 | goto out; | ||
| 4475 | set_pageblock_migratetype(page, MIGRATE_MOVABLE); | ||
| 4476 | move_freepages_block(zone, page, MIGRATE_MOVABLE); | ||
| 4477 | out: | ||
| 4478 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 4479 | } | ||
| 4480 | |||
| 4481 | #ifdef CONFIG_MEMORY_HOTREMOVE | ||
| 4482 | /* | ||
| 4483 | * All pages in the range must be isolated before calling this. | ||
| 4484 | */ | ||
| 4485 | void | ||
| 4486 | __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) | ||
| 4487 | { | ||
| 4488 | struct page *page; | ||
| 4489 | struct zone *zone; | ||
| 4490 | int order, i; | ||
| 4491 | unsigned long pfn; | ||
| 4492 | unsigned long flags; | ||
| 4493 | /* find the first valid pfn */ | ||
| 4494 | for (pfn = start_pfn; pfn < end_pfn; pfn++) | ||
| 4495 | if (pfn_valid(pfn)) | ||
| 4496 | break; | ||
| 4497 | if (pfn == end_pfn) | ||
| 4498 | return; | ||
| 4499 | zone = page_zone(pfn_to_page(pfn)); | ||
| 4500 | spin_lock_irqsave(&zone->lock, flags); | ||
| 4501 | pfn = start_pfn; | ||
| 4502 | while (pfn < end_pfn) { | ||
| 4503 | if (!pfn_valid(pfn)) { | ||
| 4504 | pfn++; | ||
| 4505 | continue; | ||
| 4506 | } | ||
| 4507 | page = pfn_to_page(pfn); | ||
| 4508 | BUG_ON(page_count(page)); | ||
| 4509 | BUG_ON(!PageBuddy(page)); | ||
| 4510 | order = page_order(page); | ||
| 4511 | #ifdef CONFIG_DEBUG_VM | ||
| 4512 | printk(KERN_INFO "remove from free list %lx %d %lx\n", | ||
| 4513 | pfn, 1 << order, end_pfn); | ||
| 4514 | #endif | ||
| 4515 | list_del(&page->lru); | ||
| 4516 | rmv_page_order(page); | ||
| 4517 | zone->free_area[order].nr_free--; | ||
| 4518 | __mod_zone_page_state(zone, NR_FREE_PAGES, | ||
| 4519 | - (1UL << order)); | ||
| 4520 | for (i = 0; i < (1 << order); i++) | ||
| 4521 | SetPageReserved((page+i)); | ||
| 4522 | pfn += (1 << order); | ||
| 4523 | } | ||
| 4524 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 4525 | } | ||
| 4526 | #endif | ||
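For context, a minimal sketch of how the bit-group accessors above are presumably wrapped into the migratetype helpers used throughout this patch (get_pageblock_migratetype / set_pageblock_migratetype); the real wrappers live in the pageblock-flags and mmzone headers rather than in this file, and PB_migrate/PB_migrate_end are assumed here to name the migrate-type bit range:

/* Sketch only -- not part of the patch above. */
static inline int sketch_get_pageblock_migratetype(struct page *page)
{
	/* read the migrate-type bit group for this 2^pageblock_order block */
	return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end);
}

static inline void sketch_set_pageblock_migratetype(struct page *page, int migratetype)
{
	set_pageblock_flags_group(page, (unsigned long)migratetype,
				  PB_migrate, PB_migrate_end);
}

Storing the type once per 2^pageblock_order block, rather than per page, is what keeps the usemap calculated earlier so small.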
diff --git a/mm/page_isolation.c b/mm/page_isolation.c new file mode 100644 index 000000000000..8f92a29695cc --- /dev/null +++ b/mm/page_isolation.c | |||
| @@ -0,0 +1,138 @@ | |||
| 1 | /* | ||
| 2 | * linux/mm/page_isolation.c | ||
| 3 | */ | ||
| 4 | |||
| 5 | #include <stddef.h> | ||
| 6 | #include <linux/mm.h> | ||
| 7 | #include <linux/page-isolation.h> | ||
| 8 | #include <linux/pageblock-flags.h> | ||
| 9 | #include "internal.h" | ||
| 10 | |||
| 11 | static inline struct page * | ||
| 12 | __first_valid_page(unsigned long pfn, unsigned long nr_pages) | ||
| 13 | { | ||
| 14 | int i; | ||
| 15 | for (i = 0; i < nr_pages; i++) | ||
| 16 | if (pfn_valid_within(pfn + i)) | ||
| 17 | break; | ||
| 18 | if (unlikely(i == nr_pages)) | ||
| 19 | return NULL; | ||
| 20 | return pfn_to_page(pfn + i); | ||
| 21 | } | ||
| 22 | |||
| 23 | /* | ||
| 24 | * start_isolate_page_range() -- set the page-allocation type of a range | ||
| 25 | * of pages to MIGRATE_ISOLATE. | ||
| 26 | * @start_pfn: The lower PFN of the range to be isolated. | ||
| 27 | * @end_pfn: The upper PFN of the range to be isolated. | ||
| 28 | * | ||
| 29 | * Setting the page-allocation type to MIGRATE_ISOLATE means that free pages | ||
| 30 | * in the range will never be handed out: pages that are free now, and pages | ||
| 31 | * freed in the future, will not be allocated again. | ||
| 32 | * | ||
| 33 | * start_pfn/end_pfn must be aligned to pageblock_order. | ||
| 34 | * Returns 0 on success and -EBUSY if any part of the range cannot be isolated. | ||
| 35 | */ | ||
| 36 | int | ||
| 37 | start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) | ||
| 38 | { | ||
| 39 | unsigned long pfn; | ||
| 40 | unsigned long undo_pfn; | ||
| 41 | struct page *page; | ||
| 42 | |||
| 43 | BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); | ||
| 44 | BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); | ||
| 45 | |||
| 46 | for (pfn = start_pfn; | ||
| 47 | pfn < end_pfn; | ||
| 48 | pfn += pageblock_nr_pages) { | ||
| 49 | page = __first_valid_page(pfn, pageblock_nr_pages); | ||
| 50 | if (page && set_migratetype_isolate(page)) { | ||
| 51 | undo_pfn = pfn; | ||
| 52 | goto undo; | ||
| 53 | } | ||
| 54 | } | ||
| 55 | return 0; | ||
| 56 | undo: | ||
| 57 | for (pfn = start_pfn; | ||
| 58 | pfn <= undo_pfn; | ||
| 59 | pfn += pageblock_nr_pages) | ||
| 60 | unset_migratetype_isolate(pfn_to_page(pfn)); | ||
| 61 | |||
| 62 | return -EBUSY; | ||
| 63 | } | ||
| 64 | |||
| 65 | /* | ||
| 66 | * Make isolated pages available again. | ||
| 67 | */ | ||
| 68 | int | ||
| 69 | undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) | ||
| 70 | { | ||
| 71 | unsigned long pfn; | ||
| 72 | struct page *page; | ||
| 73 | BUG_ON((start_pfn) & (pageblock_nr_pages - 1)); | ||
| 74 | BUG_ON((end_pfn) & (pageblock_nr_pages - 1)); | ||
| 75 | for (pfn = start_pfn; | ||
| 76 | pfn < end_pfn; | ||
| 77 | pfn += pageblock_nr_pages) { | ||
| 78 | page = __first_valid_page(pfn, pageblock_nr_pages); | ||
| 79 | if (!page || get_pageblock_flags(page) != MIGRATE_ISOLATE) | ||
| 80 | continue; | ||
| 81 | unset_migratetype_isolate(page); | ||
| 82 | } | ||
| 83 | return 0; | ||
| 84 | } | ||
| 85 | /* | ||
| 86 | * Test whether all pages in the range are free (i.e. isolated). | ||
| 87 | * All pages in [start_pfn...end_pfn) must be in the same zone. | ||
| 88 | * zone->lock must be held before calling this. | ||
| 89 | * | ||
| 90 | * Returns 1 if all pages in the range are isolated, 0 otherwise. | ||
| 91 | */ | ||
| 92 | static int | ||
| 93 | __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) | ||
| 94 | { | ||
| 95 | struct page *page; | ||
| 96 | |||
| 97 | while (pfn < end_pfn) { | ||
| 98 | if (!pfn_valid_within(pfn)) { | ||
| 99 | pfn++; | ||
| 100 | continue; | ||
| 101 | } | ||
| 102 | page = pfn_to_page(pfn); | ||
| 103 | if (PageBuddy(page)) | ||
| 104 | pfn += 1 << page_order(page); | ||
| 105 | else if (page_count(page) == 0 && | ||
| 106 | page_private(page) == MIGRATE_ISOLATE) | ||
| 107 | pfn += 1; | ||
| 108 | else | ||
| 109 | break; | ||
| 110 | } | ||
| 111 | if (pfn < end_pfn) | ||
| 112 | return 0; | ||
| 113 | return 1; | ||
| 114 | } | ||
| 115 | |||
| 116 | int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | ||
| 117 | { | ||
| 118 | unsigned long pfn; | ||
| 119 | struct page *page; | ||
| 120 | |||
| 121 | pfn = start_pfn; | ||
| 122 | /* | ||
| 123 | * Note: pageblock_nr_pages may differ from a MAX_ORDER chunk, so a run of | ||
| 124 | * free pages is not necessarily aligned to pageblock_nr_pages. | ||
| 125 | * We therefore check the pagetype first. | ||
| 126 | */ | ||
| 127 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | ||
| 128 | page = __first_valid_page(pfn, pageblock_nr_pages); | ||
| 129 | if (page && get_pageblock_flags(page) != MIGRATE_ISOLATE) | ||
| 130 | break; | ||
| 131 | } | ||
| 132 | if (pfn < end_pfn) | ||
| 133 | return -EBUSY; | ||
| 134 | /* Check that all pages are free or marked as ISOLATE */ | ||
| 135 | if (__test_page_isolated_in_pageblock(start_pfn, end_pfn)) | ||
| 136 | return 0; | ||
| 137 | return -EBUSY; | ||
| 138 | } | ||
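Taken together with __offline_isolated_pages() in page_alloc.c above, the intended hot-remove sequence is roughly the following sketch; it is simplified, the real caller lives in the memory-hotplug code (not shown here), and error handling is omitted:

/* start_pfn/end_pfn must be pageblock-aligned, as the BUG_ON()s above require */
if (start_isolate_page_range(start_pfn, end_pfn))
	return -EBUSY;		/* some pageblock was not MIGRATE_MOVABLE */

/* ... migrate or free every in-use page in [start_pfn, end_pfn) ... */

if (test_pages_isolated(start_pfn, end_pfn) == 0)
	__offline_isolated_pages(start_pfn, end_pfn);	/* pull them off the buddy lists */
else
	undo_isolate_page_range(start_pfn, end_pfn);	/* give the range back */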
diff --git a/mm/readahead.c b/mm/readahead.c index be20c9d699d3..229788884010 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
| @@ -22,16 +22,8 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) | |||
| 22 | } | 22 | } |
| 23 | EXPORT_SYMBOL(default_unplug_io_fn); | 23 | EXPORT_SYMBOL(default_unplug_io_fn); |
| 24 | 24 | ||
| 25 | /* | ||
| 26 | * Convienent macros for min/max read-ahead pages. | ||
| 27 | * Note that MAX_RA_PAGES is rounded down, while MIN_RA_PAGES is rounded up. | ||
| 28 | * The latter is necessary for systems with large page size(i.e. 64k). | ||
| 29 | */ | ||
| 30 | #define MAX_RA_PAGES (VM_MAX_READAHEAD*1024 / PAGE_CACHE_SIZE) | ||
| 31 | #define MIN_RA_PAGES DIV_ROUND_UP(VM_MIN_READAHEAD*1024, PAGE_CACHE_SIZE) | ||
| 32 | |||
| 33 | struct backing_dev_info default_backing_dev_info = { | 25 | struct backing_dev_info default_backing_dev_info = { |
| 34 | .ra_pages = MAX_RA_PAGES, | 26 | .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, |
| 35 | .state = 0, | 27 | .state = 0, |
| 36 | .capabilities = BDI_CAP_MAP_COPY, | 28 | .capabilities = BDI_CAP_MAP_COPY, |
| 37 | .unplug_io_fn = default_unplug_io_fn, | 29 | .unplug_io_fn = default_unplug_io_fn, |
| @@ -46,7 +38,7 @@ void | |||
| 46 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) | 38 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping) |
| 47 | { | 39 | { |
| 48 | ra->ra_pages = mapping->backing_dev_info->ra_pages; | 40 | ra->ra_pages = mapping->backing_dev_info->ra_pages; |
| 49 | ra->prev_index = -1; | 41 | ra->prev_pos = -1; |
| 50 | } | 42 | } |
| 51 | EXPORT_SYMBOL_GPL(file_ra_state_init); | 43 | EXPORT_SYMBOL_GPL(file_ra_state_init); |
| 52 | 44 | ||
| @@ -66,28 +58,25 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages, | |||
| 66 | int (*filler)(void *, struct page *), void *data) | 58 | int (*filler)(void *, struct page *), void *data) |
| 67 | { | 59 | { |
| 68 | struct page *page; | 60 | struct page *page; |
| 69 | struct pagevec lru_pvec; | ||
| 70 | int ret = 0; | 61 | int ret = 0; |
| 71 | 62 | ||
| 72 | pagevec_init(&lru_pvec, 0); | ||
| 73 | |||
| 74 | while (!list_empty(pages)) { | 63 | while (!list_empty(pages)) { |
| 75 | page = list_to_page(pages); | 64 | page = list_to_page(pages); |
| 76 | list_del(&page->lru); | 65 | list_del(&page->lru); |
| 77 | if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { | 66 | if (add_to_page_cache_lru(page, mapping, |
| 67 | page->index, GFP_KERNEL)) { | ||
| 78 | page_cache_release(page); | 68 | page_cache_release(page); |
| 79 | continue; | 69 | continue; |
| 80 | } | 70 | } |
| 71 | page_cache_release(page); | ||
| 72 | |||
| 81 | ret = filler(data, page); | 73 | ret = filler(data, page); |
| 82 | if (!pagevec_add(&lru_pvec, page)) | 74 | if (unlikely(ret)) { |
| 83 | __pagevec_lru_add(&lru_pvec); | ||
| 84 | if (ret) { | ||
| 85 | put_pages_list(pages); | 75 | put_pages_list(pages); |
| 86 | break; | 76 | break; |
| 87 | } | 77 | } |
| 88 | task_io_account_read(PAGE_CACHE_SIZE); | 78 | task_io_account_read(PAGE_CACHE_SIZE); |
| 89 | } | 79 | } |
| 90 | pagevec_lru_add(&lru_pvec); | ||
| 91 | return ret; | 80 | return ret; |
| 92 | } | 81 | } |
| 93 | 82 | ||
| @@ -97,7 +86,6 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
| 97 | struct list_head *pages, unsigned nr_pages) | 86 | struct list_head *pages, unsigned nr_pages) |
| 98 | { | 87 | { |
| 99 | unsigned page_idx; | 88 | unsigned page_idx; |
| 100 | struct pagevec lru_pvec; | ||
| 101 | int ret; | 89 | int ret; |
| 102 | 90 | ||
| 103 | if (mapping->a_ops->readpages) { | 91 | if (mapping->a_ops->readpages) { |
| @@ -107,19 +95,15 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
| 107 | goto out; | 95 | goto out; |
| 108 | } | 96 | } |
| 109 | 97 | ||
| 110 | pagevec_init(&lru_pvec, 0); | ||
| 111 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { | 98 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { |
| 112 | struct page *page = list_to_page(pages); | 99 | struct page *page = list_to_page(pages); |
| 113 | list_del(&page->lru); | 100 | list_del(&page->lru); |
| 114 | if (!add_to_page_cache(page, mapping, | 101 | if (!add_to_page_cache_lru(page, mapping, |
| 115 | page->index, GFP_KERNEL)) { | 102 | page->index, GFP_KERNEL)) { |
| 116 | mapping->a_ops->readpage(filp, page); | 103 | mapping->a_ops->readpage(filp, page); |
| 117 | if (!pagevec_add(&lru_pvec, page)) | 104 | } |
| 118 | __pagevec_lru_add(&lru_pvec); | 105 | page_cache_release(page); |
| 119 | } else | ||
| 120 | page_cache_release(page); | ||
| 121 | } | 106 | } |
| 122 | pagevec_lru_add(&lru_pvec); | ||
| 123 | ret = 0; | 107 | ret = 0; |
| 124 | out: | 108 | out: |
| 125 | return ret; | 109 | return ret; |
| @@ -157,20 +141,19 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
| 157 | /* | 141 | /* |
| 158 | * Preallocate as many pages as we will need. | 142 | * Preallocate as many pages as we will need. |
| 159 | */ | 143 | */ |
| 160 | read_lock_irq(&mapping->tree_lock); | ||
| 161 | for (page_idx = 0; page_idx < nr_to_read; page_idx++) { | 144 | for (page_idx = 0; page_idx < nr_to_read; page_idx++) { |
| 162 | pgoff_t page_offset = offset + page_idx; | 145 | pgoff_t page_offset = offset + page_idx; |
| 163 | 146 | ||
| 164 | if (page_offset > end_index) | 147 | if (page_offset > end_index) |
| 165 | break; | 148 | break; |
| 166 | 149 | ||
| 150 | rcu_read_lock(); | ||
| 167 | page = radix_tree_lookup(&mapping->page_tree, page_offset); | 151 | page = radix_tree_lookup(&mapping->page_tree, page_offset); |
| 152 | rcu_read_unlock(); | ||
| 168 | if (page) | 153 | if (page) |
| 169 | continue; | 154 | continue; |
| 170 | 155 | ||
| 171 | read_unlock_irq(&mapping->tree_lock); | ||
| 172 | page = page_cache_alloc_cold(mapping); | 156 | page = page_cache_alloc_cold(mapping); |
| 173 | read_lock_irq(&mapping->tree_lock); | ||
| 174 | if (!page) | 157 | if (!page) |
| 175 | break; | 158 | break; |
| 176 | page->index = page_offset; | 159 | page->index = page_offset; |
| @@ -179,7 +162,6 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
| 179 | SetPageReadahead(page); | 162 | SetPageReadahead(page); |
| 180 | ret++; | 163 | ret++; |
| 181 | } | 164 | } |
| 182 | read_unlock_irq(&mapping->tree_lock); | ||
| 183 | 165 | ||
| 184 | /* | 166 | /* |
| 185 | * Now start the IO. We ignore I/O errors - if the page is not | 167 | * Now start the IO. We ignore I/O errors - if the page is not |
| @@ -327,7 +309,7 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, | |||
| 327 | * indicator. The flag won't be set on already cached pages, to avoid the | 309 | * indicator. The flag won't be set on already cached pages, to avoid the |
| 328 | * readahead-for-nothing fuss, saving pointless page cache lookups. | 310 | * readahead-for-nothing fuss, saving pointless page cache lookups. |
| 329 | * | 311 | * |
| 330 | * prev_index tracks the last visited page in the _previous_ read request. | 312 | * prev_pos tracks the last visited byte in the _previous_ read request. |
| 331 | * It should be maintained by the caller, and will be used for detecting | 313 | * It should be maintained by the caller, and will be used for detecting |
| 332 | * small random reads. Note that the readahead algorithm checks loosely | 314 | * small random reads. Note that the readahead algorithm checks loosely |
| 333 | * for sequential patterns. Hence interleaved reads might be served as | 315 | * for sequential patterns. Hence interleaved reads might be served as |
| @@ -351,11 +333,9 @@ ondemand_readahead(struct address_space *mapping, | |||
| 351 | bool hit_readahead_marker, pgoff_t offset, | 333 | bool hit_readahead_marker, pgoff_t offset, |
| 352 | unsigned long req_size) | 334 | unsigned long req_size) |
| 353 | { | 335 | { |
| 354 | unsigned long max; /* max readahead pages */ | 336 | int max = ra->ra_pages; /* max readahead pages */ |
| 355 | int sequential; | 337 | pgoff_t prev_offset; |
| 356 | 338 | int sequential; | |
| 357 | max = ra->ra_pages; | ||
| 358 | sequential = (offset - ra->prev_index <= 1UL) || (req_size > max); | ||
| 359 | 339 | ||
| 360 | /* | 340 | /* |
| 361 | * It's the expected callback offset, assume sequential access. | 341 | * It's the expected callback offset, assume sequential access. |
| @@ -369,6 +349,9 @@ ondemand_readahead(struct address_space *mapping, | |||
| 369 | goto readit; | 349 | goto readit; |
| 370 | } | 350 | } |
| 371 | 351 | ||
| 352 | prev_offset = ra->prev_pos >> PAGE_CACHE_SHIFT; | ||
| 353 | sequential = offset - prev_offset <= 1UL || req_size > max; | ||
| 354 | |||
| 372 | /* | 355 | /* |
| 373 | * Standalone, small read. | 356 | * Standalone, small read. |
| 374 | * Read as is, and do not pollute the readahead state. | 357 | * Read as is, and do not pollute the readahead state. |
| @@ -379,6 +362,29 @@ ondemand_readahead(struct address_space *mapping, | |||
| 379 | } | 362 | } |
| 380 | 363 | ||
| 381 | /* | 364 | /* |
| 365 | * Hit a marked page without valid readahead state. | ||
| 366 | * E.g. interleaved reads. | ||
| 367 | * Query the pagecache for async_size, which normally equals the | ||
| 368 | * readahead size. Ramp it up and use it as the new readahead size. | ||
| 369 | */ | ||
| 370 | if (hit_readahead_marker) { | ||
| 371 | pgoff_t start; | ||
| 372 | |||
| 373 | read_lock_irq(&mapping->tree_lock); | ||
| 374 | start = radix_tree_next_hole(&mapping->page_tree, offset, max+1); | ||
| 375 | read_unlock_irq(&mapping->tree_lock); | ||
| 376 | |||
| 377 | if (!start || start - offset > max) | ||
| 378 | return 0; | ||
| 379 | |||
| 380 | ra->start = start; | ||
| 381 | ra->size = start - offset; /* old async_size */ | ||
| 382 | ra->size = get_next_ra_size(ra, max); | ||
| 383 | ra->async_size = ra->size; | ||
| 384 | goto readit; | ||
| 385 | } | ||
| 386 | |||
| 387 | /* | ||
| 382 | * It may be one of | 388 | * It may be one of |
| 383 | * - first read on start of file | 389 | * - first read on start of file |
| 384 | * - sequential cache miss | 390 | * - sequential cache miss |
| @@ -389,16 +395,6 @@ ondemand_readahead(struct address_space *mapping, | |||
| 389 | ra->size = get_init_ra_size(req_size, max); | 395 | ra->size = get_init_ra_size(req_size, max); |
| 390 | ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; | 396 | ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; |
| 391 | 397 | ||
| 392 | /* | ||
| 393 | * Hit on a marked page without valid readahead state. | ||
| 394 | * E.g. interleaved reads. | ||
| 395 | * Not knowing its readahead pos/size, bet on the minimal possible one. | ||
| 396 | */ | ||
| 397 | if (hit_readahead_marker) { | ||
| 398 | ra->start++; | ||
| 399 | ra->size = get_next_ra_size(ra, max); | ||
| 400 | } | ||
| 401 | |||
| 402 | readit: | 398 | readit: |
| 403 | return ra_submit(ra, mapping, filp); | 399 | return ra_submit(ra, mapping, filp); |
| 404 | } | 400 | } |
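A short worked example of the new byte-granular sequential check (assuming PAGE_CACHE_SIZE is 4096; the numbers are only illustrative):

/* prev_pos records the last byte touched by the previous read */
prev_offset = 11000 >> PAGE_CACHE_SHIFT;	/* == 2, i.e. page 2 */

/* a read starting at page 3:  3 - 2 == 1 <= 1  ->  treated as sequential      */
/* a read starting at page 7:  7 - 2 == 5 >  1  ->  random, unless req_size    */
/*                             exceeds max, in which case it still ramps up    */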
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
| @@ -436,7 +436,6 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) | |||
| 436 | entry = pte_wrprotect(entry); | 436 | entry = pte_wrprotect(entry); |
| 437 | entry = pte_mkclean(entry); | 437 | entry = pte_mkclean(entry); |
| 438 | set_pte_at(mm, address, pte, entry); | 438 | set_pte_at(mm, address, pte, entry); |
| 439 | lazy_mmu_prot_update(entry); | ||
| 440 | ret = 1; | 439 | ret = 1; |
| 441 | } | 440 | } |
| 442 | 441 | ||
diff --git a/mm/shmem.c b/mm/shmem.c index fcd19d323f9f..8a82342a8595 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
| @@ -49,7 +49,6 @@ | |||
| 49 | #include <linux/ctype.h> | 49 | #include <linux/ctype.h> |
| 50 | #include <linux/migrate.h> | 50 | #include <linux/migrate.h> |
| 51 | #include <linux/highmem.h> | 51 | #include <linux/highmem.h> |
| 52 | #include <linux/backing-dev.h> | ||
| 53 | 52 | ||
| 54 | #include <asm/uaccess.h> | 53 | #include <asm/uaccess.h> |
| 55 | #include <asm/div64.h> | 54 | #include <asm/div64.h> |
| @@ -96,9 +95,9 @@ static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) | |||
| 96 | * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: | 95 | * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: |
| 97 | * might be reconsidered if it ever diverges from PAGE_SIZE. | 96 | * might be reconsidered if it ever diverges from PAGE_SIZE. |
| 98 | * | 97 | * |
| 99 | * __GFP_MOVABLE is masked out as swap vectors cannot move | 98 | * Mobility flags are masked out as swap vectors cannot move |
| 100 | */ | 99 | */ |
| 101 | return alloc_pages((gfp_mask & ~__GFP_MOVABLE) | __GFP_ZERO, | 100 | return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO, |
| 102 | PAGE_CACHE_SHIFT-PAGE_SHIFT); | 101 | PAGE_CACHE_SHIFT-PAGE_SHIFT); |
| 103 | } | 102 | } |
| 104 | 103 | ||
| @@ -972,7 +971,7 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_ | |||
| 972 | *nodelist++ = '\0'; | 971 | *nodelist++ = '\0'; |
| 973 | if (nodelist_parse(nodelist, *policy_nodes)) | 972 | if (nodelist_parse(nodelist, *policy_nodes)) |
| 974 | goto out; | 973 | goto out; |
| 975 | if (!nodes_subset(*policy_nodes, node_online_map)) | 974 | if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY])) |
| 976 | goto out; | 975 | goto out; |
| 977 | } | 976 | } |
| 978 | if (!strcmp(value, "default")) { | 977 | if (!strcmp(value, "default")) { |
| @@ -997,9 +996,11 @@ static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_ | |||
| 997 | err = 0; | 996 | err = 0; |
| 998 | } else if (!strcmp(value, "interleave")) { | 997 | } else if (!strcmp(value, "interleave")) { |
| 999 | *policy = MPOL_INTERLEAVE; | 998 | *policy = MPOL_INTERLEAVE; |
| 1000 | /* Default to nodes online if no nodelist */ | 999 | /* |
| 1000 | * Default to online nodes with memory if no nodelist | ||
| 1001 | */ | ||
| 1001 | if (!nodelist) | 1002 | if (!nodelist) |
| 1002 | *policy_nodes = node_online_map; | 1003 | *policy_nodes = node_states[N_HIGH_MEMORY]; |
| 1003 | err = 0; | 1004 | err = 0; |
| 1004 | } | 1005 | } |
| 1005 | out: | 1006 | out: |
| @@ -1025,8 +1026,8 @@ static struct page *shmem_swapin_async(struct shared_policy *p, | |||
| 1025 | return page; | 1026 | return page; |
| 1026 | } | 1027 | } |
| 1027 | 1028 | ||
| 1028 | struct page *shmem_swapin(struct shmem_inode_info *info, swp_entry_t entry, | 1029 | static struct page *shmem_swapin(struct shmem_inode_info *info, |
| 1029 | unsigned long idx) | 1030 | swp_entry_t entry, unsigned long idx) |
| 1030 | { | 1031 | { |
| 1031 | struct shared_policy *p = &info->policy; | 1032 | struct shared_policy *p = &info->policy; |
| 1032 | int i, num; | 1033 | int i, num; |
| @@ -1061,7 +1062,8 @@ shmem_alloc_page(gfp_t gfp, struct shmem_inode_info *info, | |||
| 1061 | return page; | 1062 | return page; |
| 1062 | } | 1063 | } |
| 1063 | #else | 1064 | #else |
| 1064 | static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) | 1065 | static inline int shmem_parse_mpol(char *value, int *policy, |
| 1066 | nodemask_t *policy_nodes) | ||
| 1065 | { | 1067 | { |
| 1066 | return 1; | 1068 | return 1; |
| 1067 | } | 1069 | } |
| @@ -1109,7 +1111,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, | |||
| 1109 | * Normally, filepage is NULL on entry, and either found | 1111 | * Normally, filepage is NULL on entry, and either found |
| 1110 | * uptodate immediately, or allocated and zeroed, or read | 1112 | * uptodate immediately, or allocated and zeroed, or read |
| 1111 | * in under swappage, which is then assigned to filepage. | 1113 | * in under swappage, which is then assigned to filepage. |
| 1112 | * But shmem_readpage and shmem_prepare_write pass in a locked | 1114 | * But shmem_readpage and shmem_write_begin pass in a locked |
| 1113 | * filepage, which may be found not uptodate by other callers | 1115 | * filepage, which may be found not uptodate by other callers |
| 1114 | * too, and may need to be copied from the swappage read in. | 1116 | * too, and may need to be copied from the swappage read in. |
| 1115 | */ | 1117 | */ |
| @@ -1327,14 +1329,14 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 1327 | } | 1329 | } |
| 1328 | 1330 | ||
| 1329 | #ifdef CONFIG_NUMA | 1331 | #ifdef CONFIG_NUMA |
| 1330 | int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) | 1332 | static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new) |
| 1331 | { | 1333 | { |
| 1332 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; | 1334 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; |
| 1333 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); | 1335 | return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new); |
| 1334 | } | 1336 | } |
| 1335 | 1337 | ||
| 1336 | struct mempolicy * | 1338 | static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, |
| 1337 | shmem_get_policy(struct vm_area_struct *vma, unsigned long addr) | 1339 | unsigned long addr) |
| 1338 | { | 1340 | { |
| 1339 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; | 1341 | struct inode *i = vma->vm_file->f_path.dentry->d_inode; |
| 1340 | unsigned long idx; | 1342 | unsigned long idx; |
| @@ -1446,7 +1448,7 @@ static const struct inode_operations shmem_symlink_inode_operations; | |||
| 1446 | static const struct inode_operations shmem_symlink_inline_operations; | 1448 | static const struct inode_operations shmem_symlink_inline_operations; |
| 1447 | 1449 | ||
| 1448 | /* | 1450 | /* |
| 1449 | * Normally tmpfs avoids the use of shmem_readpage and shmem_prepare_write; | 1451 | * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin; |
| 1450 | * but providing them allows a tmpfs file to be used for splice, sendfile, and | 1452 | * but providing them allows a tmpfs file to be used for splice, sendfile, and |
| 1451 | * below the loop driver, in the generic fashion that many filesystems support. | 1453 | * below the loop driver, in the generic fashion that many filesystems support. |
| 1452 | */ | 1454 | */ |
| @@ -1459,10 +1461,30 @@ static int shmem_readpage(struct file *file, struct page *page) | |||
| 1459 | } | 1461 | } |
| 1460 | 1462 | ||
| 1461 | static int | 1463 | static int |
| 1462 | shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) | 1464 | shmem_write_begin(struct file *file, struct address_space *mapping, |
| 1465 | loff_t pos, unsigned len, unsigned flags, | ||
| 1466 | struct page **pagep, void **fsdata) | ||
| 1463 | { | 1467 | { |
| 1464 | struct inode *inode = page->mapping->host; | 1468 | struct inode *inode = mapping->host; |
| 1465 | return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL); | 1469 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
| 1470 | *pagep = NULL; | ||
| 1471 | return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); | ||
| 1472 | } | ||
| 1473 | |||
| 1474 | static int | ||
| 1475 | shmem_write_end(struct file *file, struct address_space *mapping, | ||
| 1476 | loff_t pos, unsigned len, unsigned copied, | ||
| 1477 | struct page *page, void *fsdata) | ||
| 1478 | { | ||
| 1479 | struct inode *inode = mapping->host; | ||
| 1480 | |||
| 1481 | set_page_dirty(page); | ||
| 1482 | page_cache_release(page); | ||
| 1483 | |||
| 1484 | if (pos+copied > inode->i_size) | ||
| 1485 | i_size_write(inode, pos+copied); | ||
| 1486 | |||
| 1487 | return copied; | ||
| 1466 | } | 1488 | } |
| 1467 | 1489 | ||
| 1468 | static ssize_t | 1490 | static ssize_t |
| @@ -2219,7 +2241,7 @@ static int shmem_fill_super(struct super_block *sb, | |||
| 2219 | unsigned long blocks = 0; | 2241 | unsigned long blocks = 0; |
| 2220 | unsigned long inodes = 0; | 2242 | unsigned long inodes = 0; |
| 2221 | int policy = MPOL_DEFAULT; | 2243 | int policy = MPOL_DEFAULT; |
| 2222 | nodemask_t policy_nodes = node_online_map; | 2244 | nodemask_t policy_nodes = node_states[N_HIGH_MEMORY]; |
| 2223 | 2245 | ||
| 2224 | #ifdef CONFIG_TMPFS | 2246 | #ifdef CONFIG_TMPFS |
| 2225 | /* | 2247 | /* |
| @@ -2338,8 +2360,8 @@ static const struct address_space_operations shmem_aops = { | |||
| 2338 | .set_page_dirty = __set_page_dirty_no_writeback, | 2360 | .set_page_dirty = __set_page_dirty_no_writeback, |
| 2339 | #ifdef CONFIG_TMPFS | 2361 | #ifdef CONFIG_TMPFS |
| 2340 | .readpage = shmem_readpage, | 2362 | .readpage = shmem_readpage, |
| 2341 | .prepare_write = shmem_prepare_write, | 2363 | .write_begin = shmem_write_begin, |
| 2342 | .commit_write = simple_commit_write, | 2364 | .write_end = shmem_write_end, |
| 2343 | #endif | 2365 | #endif |
| 2344 | .migratepage = migrate_page, | 2366 | .migratepage = migrate_page, |
| 2345 | }; | 2367 | }; |
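For reference, a sketch of how a generic buffered-write path is expected to drive the new write_begin/write_end pair; this is simplified, error handling is omitted, and the surrounding variables (file, mapping, pos, len) are assumed from the caller rather than shown:

struct page *page;
void *fsdata;
int status;
unsigned copied;

status = mapping->a_ops->write_begin(file, mapping, pos, len,
				     0 /* flags */, &page, &fsdata);
if (status)
	return status;

/* ... copy len bytes of user data into page at (pos & ~PAGE_CACHE_MASK) ... */
copied = len;

status = mapping->a_ops->write_end(file, mapping, pos, len, copied,
				   page, fsdata);
/* for shmem, write_end dirties the page, releases it and updates i_size */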
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
| @@ -1568,7 +1568,7 @@ void __init kmem_cache_init(void) | |||
| 1568 | /* Replace the static kmem_list3 structures for the boot cpu */ | 1568 | /* Replace the static kmem_list3 structures for the boot cpu */ |
| 1569 | init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); | 1569 | init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], node); |
| 1570 | 1570 | ||
| 1571 | for_each_online_node(nid) { | 1571 | for_each_node_state(nid, N_NORMAL_MEMORY) { |
| 1572 | init_list(malloc_sizes[INDEX_AC].cs_cachep, | 1572 | init_list(malloc_sizes[INDEX_AC].cs_cachep, |
| 1573 | &initkmem_list3[SIZE_AC + nid], nid); | 1573 | &initkmem_list3[SIZE_AC + nid], nid); |
| 1574 | 1574 | ||
| @@ -1643,6 +1643,8 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
| 1643 | #endif | 1643 | #endif |
| 1644 | 1644 | ||
| 1645 | flags |= cachep->gfpflags; | 1645 | flags |= cachep->gfpflags; |
| 1646 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | ||
| 1647 | flags |= __GFP_RECLAIMABLE; | ||
| 1646 | 1648 | ||
| 1647 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | 1649 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); |
| 1648 | if (!page) | 1650 | if (!page) |
| @@ -1944,7 +1946,7 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index) | |||
| 1944 | { | 1946 | { |
| 1945 | int node; | 1947 | int node; |
| 1946 | 1948 | ||
| 1947 | for_each_online_node(node) { | 1949 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 1948 | cachep->nodelists[node] = &initkmem_list3[index + node]; | 1950 | cachep->nodelists[node] = &initkmem_list3[index + node]; |
| 1949 | cachep->nodelists[node]->next_reap = jiffies + | 1951 | cachep->nodelists[node]->next_reap = jiffies + |
| 1950 | REAPTIMEOUT_LIST3 + | 1952 | REAPTIMEOUT_LIST3 + |
| @@ -2075,7 +2077,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) | |||
| 2075 | g_cpucache_up = PARTIAL_L3; | 2077 | g_cpucache_up = PARTIAL_L3; |
| 2076 | } else { | 2078 | } else { |
| 2077 | int node; | 2079 | int node; |
| 2078 | for_each_online_node(node) { | 2080 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 2079 | cachep->nodelists[node] = | 2081 | cachep->nodelists[node] = |
| 2080 | kmalloc_node(sizeof(struct kmem_list3), | 2082 | kmalloc_node(sizeof(struct kmem_list3), |
| 2081 | GFP_KERNEL, node); | 2083 | GFP_KERNEL, node); |
| @@ -2746,9 +2748,9 @@ static int cache_grow(struct kmem_cache *cachep, | |||
| 2746 | * Be lazy and only check for valid flags here, keeping it out of the | 2748 | * Be lazy and only check for valid flags here, keeping it out of the |
| 2747 | * critical path in kmem_cache_alloc(). | 2749 | * critical path in kmem_cache_alloc(). |
| 2748 | */ | 2750 | */ |
| 2749 | BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); | 2751 | BUG_ON(flags & GFP_SLAB_BUG_MASK); |
| 2752 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); | ||
| 2750 | 2753 | ||
| 2751 | local_flags = (flags & GFP_LEVEL_MASK); | ||
| 2752 | /* Take the l3 list lock to change the colour_next on this node */ | 2754 | /* Take the l3 list lock to change the colour_next on this node */ |
| 2753 | check_irq_off(); | 2755 | check_irq_off(); |
| 2754 | l3 = cachep->nodelists[nodeid]; | 2756 | l3 = cachep->nodelists[nodeid]; |
| @@ -2785,7 +2787,7 @@ static int cache_grow(struct kmem_cache *cachep, | |||
| 2785 | 2787 | ||
| 2786 | /* Get slab management. */ | 2788 | /* Get slab management. */ |
| 2787 | slabp = alloc_slabmgmt(cachep, objp, offset, | 2789 | slabp = alloc_slabmgmt(cachep, objp, offset, |
| 2788 | local_flags & ~GFP_THISNODE, nodeid); | 2790 | local_flags & ~GFP_CONSTRAINT_MASK, nodeid); |
| 2789 | if (!slabp) | 2791 | if (!slabp) |
| 2790 | goto opps1; | 2792 | goto opps1; |
| 2791 | 2793 | ||
| @@ -3225,7 +3227,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) | |||
| 3225 | 3227 | ||
| 3226 | zonelist = &NODE_DATA(slab_node(current->mempolicy)) | 3228 | zonelist = &NODE_DATA(slab_node(current->mempolicy)) |
| 3227 | ->node_zonelists[gfp_zone(flags)]; | 3229 | ->node_zonelists[gfp_zone(flags)]; |
| 3228 | local_flags = (flags & GFP_LEVEL_MASK); | 3230 | local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); |
| 3229 | 3231 | ||
| 3230 | retry: | 3232 | retry: |
| 3231 | /* | 3233 | /* |
| @@ -3792,7 +3794,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep) | |||
| 3792 | struct array_cache *new_shared; | 3794 | struct array_cache *new_shared; |
| 3793 | struct array_cache **new_alien = NULL; | 3795 | struct array_cache **new_alien = NULL; |
| 3794 | 3796 | ||
| 3795 | for_each_online_node(node) { | 3797 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 3796 | 3798 | ||
| 3797 | if (use_alien_caches) { | 3799 | if (use_alien_caches) { |
| 3798 | new_alien = alloc_alien_cache(node, cachep->limit); | 3800 | new_alien = alloc_alien_cache(node, cachep->limit); |
| @@ -4446,7 +4448,8 @@ const struct seq_operations slabstats_op = { | |||
| 4446 | */ | 4448 | */ |
| 4447 | size_t ksize(const void *objp) | 4449 | size_t ksize(const void *objp) |
| 4448 | { | 4450 | { |
| 4449 | if (unlikely(ZERO_OR_NULL_PTR(objp))) | 4451 | BUG_ON(!objp); |
| 4452 | if (unlikely(objp == ZERO_SIZE_PTR)) | ||
| 4450 | return 0; | 4453 | return 0; |
| 4451 | 4454 | ||
| 4452 | return obj_size(virt_to_cache(objp)); | 4455 | return obj_size(virt_to_cache(objp)); |
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
| @@ -360,7 +360,7 @@ static void slob_free(void *block, int size) | |||
| 360 | slobidx_t units; | 360 | slobidx_t units; |
| 361 | unsigned long flags; | 361 | unsigned long flags; |
| 362 | 362 | ||
| 363 | if (ZERO_OR_NULL_PTR(block)) | 363 | if (unlikely(ZERO_OR_NULL_PTR(block))) |
| 364 | return; | 364 | return; |
| 365 | BUG_ON(!size); | 365 | BUG_ON(!size); |
| 366 | 366 | ||
| @@ -466,7 +466,7 @@ void kfree(const void *block) | |||
| 466 | { | 466 | { |
| 467 | struct slob_page *sp; | 467 | struct slob_page *sp; |
| 468 | 468 | ||
| 469 | if (ZERO_OR_NULL_PTR(block)) | 469 | if (unlikely(ZERO_OR_NULL_PTR(block))) |
| 470 | return; | 470 | return; |
| 471 | 471 | ||
| 472 | sp = (struct slob_page *)virt_to_page(block); | 472 | sp = (struct slob_page *)virt_to_page(block); |
| @@ -484,7 +484,8 @@ size_t ksize(const void *block) | |||
| 484 | { | 484 | { |
| 485 | struct slob_page *sp; | 485 | struct slob_page *sp; |
| 486 | 486 | ||
| 487 | if (ZERO_OR_NULL_PTR(block)) | 487 | BUG_ON(!block); |
| 488 | if (unlikely(block == ZERO_SIZE_PTR)) | ||
| 488 | return 0; | 489 | return 0; |
| 489 | 490 | ||
| 490 | sp = (struct slob_page *)virt_to_page(block); | 491 | sp = (struct slob_page *)virt_to_page(block); |
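The ksize()/kfree() changes above (in both slab and slob) share one convention; a small sketch of the resulting semantics, assuming kmalloc(0) returns ZERO_SIZE_PTR as the ZERO_OR_NULL_PTR() checks suggest:

void *p = kmalloc(0, GFP_KERNEL);	/* assumed to yield ZERO_SIZE_PTR, not NULL */
size_t n = ksize(p);			/* 0: ZERO_SIZE_PTR backs no storage        */
kfree(p);				/* ZERO_OR_NULL_PTR() makes this a no-op     */

ksize(NULL);				/* now hits BUG_ON(): passing NULL to ksize()
					   is treated as a caller error */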
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
| @@ -90,7 +90,7 @@ | |||
| 90 | * One use of this flag is to mark slabs that are | 90 | * One use of this flag is to mark slabs that are |
| 91 | * used for allocations. Then such a slab becomes a cpu | 91 | * used for allocations. Then such a slab becomes a cpu |
| 92 | * slab. The cpu slab may be equipped with an additional | 92 | * slab. The cpu slab may be equipped with an additional |
| 93 | * lockless_freelist that allows lockless access to | 93 | * freelist that allows lockless access to |
| 94 | * free objects in addition to the regular freelist | 94 | * free objects in addition to the regular freelist |
| 95 | * that requires the slab lock. | 95 | * that requires the slab lock. |
| 96 | * | 96 | * |
| @@ -140,11 +140,6 @@ static inline void ClearSlabDebug(struct page *page) | |||
| 140 | /* | 140 | /* |
| 141 | * Issues still to be resolved: | 141 | * Issues still to be resolved: |
| 142 | * | 142 | * |
| 143 | * - The per cpu array is updated for each new slab and and is a remote | ||
| 144 | * cacheline for most nodes. This could become a bouncing cacheline given | ||
| 145 | * enough frequent updates. There are 16 pointers in a cacheline, so at | ||
| 146 | * max 16 cpus could compete for the cacheline which may be okay. | ||
| 147 | * | ||
| 148 | * - Support PAGE_ALLOC_DEBUG. Should be easy to do. | 143 | * - Support PAGE_ALLOC_DEBUG. Should be easy to do. |
| 149 | * | 144 | * |
| 150 | * - Variable sizing of the per node arrays | 145 | * - Variable sizing of the per node arrays |
| @@ -205,11 +200,6 @@ static inline void ClearSlabDebug(struct page *page) | |||
| 205 | #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) | 200 | #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) |
| 206 | #endif | 201 | #endif |
| 207 | 202 | ||
| 208 | /* | ||
| 209 | * The page->inuse field is 16 bit thus we have this limitation | ||
| 210 | */ | ||
| 211 | #define MAX_OBJECTS_PER_SLAB 65535 | ||
| 212 | |||
| 213 | /* Internal SLUB flags */ | 203 | /* Internal SLUB flags */ |
| 214 | #define __OBJECT_POISON 0x80000000 /* Poison object */ | 204 | #define __OBJECT_POISON 0x80000000 /* Poison object */ |
| 215 | #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ | 205 | #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ |
| @@ -277,6 +267,15 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | |||
| 277 | #endif | 267 | #endif |
| 278 | } | 268 | } |
| 279 | 269 | ||
| 270 | static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) | ||
| 271 | { | ||
| 272 | #ifdef CONFIG_SMP | ||
| 273 | return s->cpu_slab[cpu]; | ||
| 274 | #else | ||
| 275 | return &s->cpu_slab; | ||
| 276 | #endif | ||
| 277 | } | ||
| 278 | |||
| 280 | static inline int check_valid_pointer(struct kmem_cache *s, | 279 | static inline int check_valid_pointer(struct kmem_cache *s, |
| 281 | struct page *page, const void *object) | 280 | struct page *page, const void *object) |
| 282 | { | 281 | { |
| @@ -729,11 +728,6 @@ static int check_slab(struct kmem_cache *s, struct page *page) | |||
| 729 | slab_err(s, page, "Not a valid slab page"); | 728 | slab_err(s, page, "Not a valid slab page"); |
| 730 | return 0; | 729 | return 0; |
| 731 | } | 730 | } |
| 732 | if (page->offset * sizeof(void *) != s->offset) { | ||
| 733 | slab_err(s, page, "Corrupted offset %lu", | ||
| 734 | (unsigned long)(page->offset * sizeof(void *))); | ||
| 735 | return 0; | ||
| 736 | } | ||
| 737 | if (page->inuse > s->objects) { | 731 | if (page->inuse > s->objects) { |
| 738 | slab_err(s, page, "inuse %u > max %u", | 732 | slab_err(s, page, "inuse %u > max %u", |
| 739 | s->name, page->inuse, s->objects); | 733 | s->name, page->inuse, s->objects); |
| @@ -872,8 +866,6 @@ bad: | |||
| 872 | slab_fix(s, "Marking all objects used"); | 866 | slab_fix(s, "Marking all objects used"); |
| 873 | page->inuse = s->objects; | 867 | page->inuse = s->objects; |
| 874 | page->freelist = NULL; | 868 | page->freelist = NULL; |
| 875 | /* Fix up fields that may be corrupted */ | ||
| 876 | page->offset = s->offset / sizeof(void *); | ||
| 877 | } | 869 | } |
| 878 | return 0; | 870 | return 0; |
| 879 | } | 871 | } |
| @@ -1055,6 +1047,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
| 1055 | if (s->flags & SLAB_CACHE_DMA) | 1047 | if (s->flags & SLAB_CACHE_DMA) |
| 1056 | flags |= SLUB_DMA; | 1048 | flags |= SLUB_DMA; |
| 1057 | 1049 | ||
| 1050 | if (s->flags & SLAB_RECLAIM_ACCOUNT) | ||
| 1051 | flags |= __GFP_RECLAIMABLE; | ||
| 1052 | |||
| 1058 | if (node == -1) | 1053 | if (node == -1) |
| 1059 | page = alloc_pages(flags, s->order); | 1054 | page = alloc_pages(flags, s->order); |
| 1060 | else | 1055 | else |
| @@ -1088,19 +1083,19 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
| 1088 | void *last; | 1083 | void *last; |
| 1089 | void *p; | 1084 | void *p; |
| 1090 | 1085 | ||
| 1091 | BUG_ON(flags & ~(GFP_DMA | __GFP_ZERO | GFP_LEVEL_MASK)); | 1086 | BUG_ON(flags & GFP_SLAB_BUG_MASK); |
| 1092 | 1087 | ||
| 1093 | if (flags & __GFP_WAIT) | 1088 | if (flags & __GFP_WAIT) |
| 1094 | local_irq_enable(); | 1089 | local_irq_enable(); |
| 1095 | 1090 | ||
| 1096 | page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); | 1091 | page = allocate_slab(s, |
| 1092 | flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); | ||
| 1097 | if (!page) | 1093 | if (!page) |
| 1098 | goto out; | 1094 | goto out; |
| 1099 | 1095 | ||
| 1100 | n = get_node(s, page_to_nid(page)); | 1096 | n = get_node(s, page_to_nid(page)); |
| 1101 | if (n) | 1097 | if (n) |
| 1102 | atomic_long_inc(&n->nr_slabs); | 1098 | atomic_long_inc(&n->nr_slabs); |
| 1103 | page->offset = s->offset / sizeof(void *); | ||
| 1104 | page->slab = s; | 1099 | page->slab = s; |
| 1105 | page->flags |= 1 << PG_slab; | 1100 | page->flags |= 1 << PG_slab; |
| 1106 | if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | | 1101 | if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | |
| @@ -1123,7 +1118,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
| 1123 | set_freepointer(s, last, NULL); | 1118 | set_freepointer(s, last, NULL); |
| 1124 | 1119 | ||
| 1125 | page->freelist = start; | 1120 | page->freelist = start; |
| 1126 | page->lockless_freelist = NULL; | ||
| 1127 | page->inuse = 0; | 1121 | page->inuse = 0; |
| 1128 | out: | 1122 | out: |
| 1129 | if (flags & __GFP_WAIT) | 1123 | if (flags & __GFP_WAIT) |
| @@ -1149,7 +1143,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
| 1149 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, | 1143 | NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, |
| 1150 | - pages); | 1144 | - pages); |
| 1151 | 1145 | ||
| 1152 | page->mapping = NULL; | ||
| 1153 | __free_pages(page, s->order); | 1146 | __free_pages(page, s->order); |
| 1154 | } | 1147 | } |
| 1155 | 1148 | ||
| @@ -1383,33 +1376,34 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page) | |||
| 1383 | /* | 1376 | /* |
| 1384 | * Remove the cpu slab | 1377 | * Remove the cpu slab |
| 1385 | */ | 1378 | */ |
| 1386 | static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) | 1379 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
| 1387 | { | 1380 | { |
| 1381 | struct page *page = c->page; | ||
| 1388 | /* | 1382 | /* |
| 1389 | * Merge cpu freelist into freelist. Typically we get here | 1383 | * Merge cpu freelist into freelist. Typically we get here |
| 1390 | * because both freelists are empty. So this is unlikely | 1384 | * because both freelists are empty. So this is unlikely |
| 1391 | * to occur. | 1385 | * to occur. |
| 1392 | */ | 1386 | */ |
| 1393 | while (unlikely(page->lockless_freelist)) { | 1387 | while (unlikely(c->freelist)) { |
| 1394 | void **object; | 1388 | void **object; |
| 1395 | 1389 | ||
| 1396 | /* Retrieve object from cpu_freelist */ | 1390 | /* Retrieve object from cpu_freelist */ |
| 1397 | object = page->lockless_freelist; | 1391 | object = c->freelist; |
| 1398 | page->lockless_freelist = page->lockless_freelist[page->offset]; | 1392 | c->freelist = c->freelist[c->offset]; |
| 1399 | 1393 | ||
| 1400 | /* And put onto the regular freelist */ | 1394 | /* And put onto the regular freelist */ |
| 1401 | object[page->offset] = page->freelist; | 1395 | object[c->offset] = page->freelist; |
| 1402 | page->freelist = object; | 1396 | page->freelist = object; |
| 1403 | page->inuse--; | 1397 | page->inuse--; |
| 1404 | } | 1398 | } |
| 1405 | s->cpu_slab[cpu] = NULL; | 1399 | c->page = NULL; |
| 1406 | unfreeze_slab(s, page); | 1400 | unfreeze_slab(s, page); |
| 1407 | } | 1401 | } |
| 1408 | 1402 | ||
| 1409 | static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu) | 1403 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
| 1410 | { | 1404 | { |
| 1411 | slab_lock(page); | 1405 | slab_lock(c->page); |
| 1412 | deactivate_slab(s, page, cpu); | 1406 | deactivate_slab(s, c); |
| 1413 | } | 1407 | } |
| 1414 | 1408 | ||
| 1415 | /* | 1409 | /* |
| @@ -1418,18 +1412,17 @@ static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu) | |||
| 1418 | */ | 1412 | */ |
| 1419 | static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) | 1413 | static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) |
| 1420 | { | 1414 | { |
| 1421 | struct page *page = s->cpu_slab[cpu]; | 1415 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); |
| 1422 | 1416 | ||
| 1423 | if (likely(page)) | 1417 | if (likely(c && c->page)) |
| 1424 | flush_slab(s, page, cpu); | 1418 | flush_slab(s, c); |
| 1425 | } | 1419 | } |
| 1426 | 1420 | ||
| 1427 | static void flush_cpu_slab(void *d) | 1421 | static void flush_cpu_slab(void *d) |
| 1428 | { | 1422 | { |
| 1429 | struct kmem_cache *s = d; | 1423 | struct kmem_cache *s = d; |
| 1430 | int cpu = smp_processor_id(); | ||
| 1431 | 1424 | ||
| 1432 | __flush_cpu_slab(s, cpu); | 1425 | __flush_cpu_slab(s, smp_processor_id()); |
| 1433 | } | 1426 | } |
| 1434 | 1427 | ||
| 1435 | static void flush_all(struct kmem_cache *s) | 1428 | static void flush_all(struct kmem_cache *s) |
| @@ -1446,6 +1439,19 @@ static void flush_all(struct kmem_cache *s) | |||
| 1446 | } | 1439 | } |
| 1447 | 1440 | ||
| 1448 | /* | 1441 | /* |
| 1442 | * Check if the objects in a per cpu structure fit numa | ||
| 1443 | * locality expectations. | ||
| 1444 | */ | ||
| 1445 | static inline int node_match(struct kmem_cache_cpu *c, int node) | ||
| 1446 | { | ||
| 1447 | #ifdef CONFIG_NUMA | ||
| 1448 | if (node != -1 && c->node != node) | ||
| 1449 | return 0; | ||
| 1450 | #endif | ||
| 1451 | return 1; | ||
| 1452 | } | ||
| 1453 | |||
| 1454 | /* | ||
| 1449 | * Slow path. The lockless freelist is empty or we need to perform | 1455 | * Slow path. The lockless freelist is empty or we need to perform |
| 1450 | * debugging duties. | 1456 | * debugging duties. |
| 1451 | * | 1457 | * |
| @@ -1463,45 +1469,46 @@ static void flush_all(struct kmem_cache *s) | |||
| 1463 | * we need to allocate a new slab. This is slowest path since we may sleep. | 1469 | * we need to allocate a new slab. This is slowest path since we may sleep. |
| 1464 | */ | 1470 | */ |
| 1465 | static void *__slab_alloc(struct kmem_cache *s, | 1471 | static void *__slab_alloc(struct kmem_cache *s, |
| 1466 | gfp_t gfpflags, int node, void *addr, struct page *page) | 1472 | gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c) |
| 1467 | { | 1473 | { |
| 1468 | void **object; | 1474 | void **object; |
| 1469 | int cpu = smp_processor_id(); | 1475 | struct page *new; |
| 1470 | 1476 | ||
| 1471 | if (!page) | 1477 | if (!c->page) |
| 1472 | goto new_slab; | 1478 | goto new_slab; |
| 1473 | 1479 | ||
| 1474 | slab_lock(page); | 1480 | slab_lock(c->page); |
| 1475 | if (unlikely(node != -1 && page_to_nid(page) != node)) | 1481 | if (unlikely(!node_match(c, node))) |
| 1476 | goto another_slab; | 1482 | goto another_slab; |
| 1477 | load_freelist: | 1483 | load_freelist: |
| 1478 | object = page->freelist; | 1484 | object = c->page->freelist; |
| 1479 | if (unlikely(!object)) | 1485 | if (unlikely(!object)) |
| 1480 | goto another_slab; | 1486 | goto another_slab; |
| 1481 | if (unlikely(SlabDebug(page))) | 1487 | if (unlikely(SlabDebug(c->page))) |
| 1482 | goto debug; | 1488 | goto debug; |
| 1483 | 1489 | ||
| 1484 | object = page->freelist; | 1490 | object = c->page->freelist; |
| 1485 | page->lockless_freelist = object[page->offset]; | 1491 | c->freelist = object[c->offset]; |
| 1486 | page->inuse = s->objects; | 1492 | c->page->inuse = s->objects; |
| 1487 | page->freelist = NULL; | 1493 | c->page->freelist = NULL; |
| 1488 | slab_unlock(page); | 1494 | c->node = page_to_nid(c->page); |
| 1495 | slab_unlock(c->page); | ||
| 1489 | return object; | 1496 | return object; |
| 1490 | 1497 | ||
| 1491 | another_slab: | 1498 | another_slab: |
| 1492 | deactivate_slab(s, page, cpu); | 1499 | deactivate_slab(s, c); |
| 1493 | 1500 | ||
| 1494 | new_slab: | 1501 | new_slab: |
| 1495 | page = get_partial(s, gfpflags, node); | 1502 | new = get_partial(s, gfpflags, node); |
| 1496 | if (page) { | 1503 | if (new) { |
| 1497 | s->cpu_slab[cpu] = page; | 1504 | c->page = new; |
| 1498 | goto load_freelist; | 1505 | goto load_freelist; |
| 1499 | } | 1506 | } |
| 1500 | 1507 | ||
| 1501 | page = new_slab(s, gfpflags, node); | 1508 | new = new_slab(s, gfpflags, node); |
| 1502 | if (page) { | 1509 | if (new) { |
| 1503 | cpu = smp_processor_id(); | 1510 | c = get_cpu_slab(s, smp_processor_id()); |
| 1504 | if (s->cpu_slab[cpu]) { | 1511 | if (c->page) { |
| 1505 | /* | 1512 | /* |
| 1506 | * Someone else populated the cpu_slab while we | 1513 | * Someone else populated the cpu_slab while we |
| 1507 | * enabled interrupts, or we have gotten scheduled | 1514 | * enabled interrupts, or we have gotten scheduled |
| @@ -1509,34 +1516,33 @@ new_slab: | |||
| 1509 | * requested node even if __GFP_THISNODE was | 1516 | * requested node even if __GFP_THISNODE was |
| 1510 | * specified. So we need to recheck. | 1517 | * specified. So we need to recheck. |
| 1511 | */ | 1518 | */ |
| 1512 | if (node == -1 || | 1519 | if (node_match(c, node)) { |
| 1513 | page_to_nid(s->cpu_slab[cpu]) == node) { | ||
| 1514 | /* | 1520 | /* |
| 1515 | * Current cpuslab is acceptable and we | 1521 | * Current cpuslab is acceptable and we |
| 1516 | * want the current one since it's cache hot | 1522 | * want the current one since it's cache hot |
| 1517 | */ | 1523 | */ |
| 1518 | discard_slab(s, page); | 1524 | discard_slab(s, new); |
| 1519 | page = s->cpu_slab[cpu]; | 1525 | slab_lock(c->page); |
| 1520 | slab_lock(page); | ||
| 1521 | goto load_freelist; | 1526 | goto load_freelist; |
| 1522 | } | 1527 | } |
| 1523 | /* New slab does not fit our expectations */ | 1528 | /* New slab does not fit our expectations */ |
| 1524 | flush_slab(s, s->cpu_slab[cpu], cpu); | 1529 | flush_slab(s, c); |
| 1525 | } | 1530 | } |
| 1526 | slab_lock(page); | 1531 | slab_lock(new); |
| 1527 | SetSlabFrozen(page); | 1532 | SetSlabFrozen(new); |
| 1528 | s->cpu_slab[cpu] = page; | 1533 | c->page = new; |
| 1529 | goto load_freelist; | 1534 | goto load_freelist; |
| 1530 | } | 1535 | } |
| 1531 | return NULL; | 1536 | return NULL; |
| 1532 | debug: | 1537 | debug: |
| 1533 | object = page->freelist; | 1538 | object = c->page->freelist; |
| 1534 | if (!alloc_debug_processing(s, page, object, addr)) | 1539 | if (!alloc_debug_processing(s, c->page, object, addr)) |
| 1535 | goto another_slab; | 1540 | goto another_slab; |
| 1536 | 1541 | ||
| 1537 | page->inuse++; | 1542 | c->page->inuse++; |
| 1538 | page->freelist = object[page->offset]; | 1543 | c->page->freelist = object[c->offset]; |
| 1539 | slab_unlock(page); | 1544 | c->node = -1; |
| 1545 | slab_unlock(c->page); | ||
| 1540 | return object; | 1546 | return object; |
| 1541 | } | 1547 | } |
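One detail of the debug path above is easy to miss, so here is my reading of it (a descriptive note, not patch text): storing -1 in c->node is what keeps debug-enabled slabs off the lockless free fast path, because slab_free() below only takes that path when c->node >= 0.

    /*
     * Interplay between the two paths (paraphrased, not literal patch text):
     *
     *   __slab_alloc() debug path:   c->node = -1;
     *   slab_free() fast-path test:  page == c->page && c->node >= 0
     *
     * A debug-tracked cpu slab therefore never satisfies the lockless free
     * path and always drops into __slab_free(), which runs its consistency
     * checks under slab_lock().
     */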
| 1542 | 1548 | ||
| @@ -1553,25 +1559,24 @@ debug: | |||
| 1553 | static void __always_inline *slab_alloc(struct kmem_cache *s, | 1559 | static void __always_inline *slab_alloc(struct kmem_cache *s, |
| 1554 | gfp_t gfpflags, int node, void *addr) | 1560 | gfp_t gfpflags, int node, void *addr) |
| 1555 | { | 1561 | { |
| 1556 | struct page *page; | ||
| 1557 | void **object; | 1562 | void **object; |
| 1558 | unsigned long flags; | 1563 | unsigned long flags; |
| 1564 | struct kmem_cache_cpu *c; | ||
| 1559 | 1565 | ||
| 1560 | local_irq_save(flags); | 1566 | local_irq_save(flags); |
| 1561 | page = s->cpu_slab[smp_processor_id()]; | 1567 | c = get_cpu_slab(s, smp_processor_id()); |
| 1562 | if (unlikely(!page || !page->lockless_freelist || | 1568 | if (unlikely(!c->freelist || !node_match(c, node))) |
| 1563 | (node != -1 && page_to_nid(page) != node))) | ||
| 1564 | 1569 | ||
| 1565 | object = __slab_alloc(s, gfpflags, node, addr, page); | 1570 | object = __slab_alloc(s, gfpflags, node, addr, c); |
| 1566 | 1571 | ||
| 1567 | else { | 1572 | else { |
| 1568 | object = page->lockless_freelist; | 1573 | object = c->freelist; |
| 1569 | page->lockless_freelist = object[page->offset]; | 1574 | c->freelist = object[c->offset]; |
| 1570 | } | 1575 | } |
| 1571 | local_irq_restore(flags); | 1576 | local_irq_restore(flags); |
| 1572 | 1577 | ||
| 1573 | if (unlikely((gfpflags & __GFP_ZERO) && object)) | 1578 | if (unlikely((gfpflags & __GFP_ZERO) && object)) |
| 1574 | memset(object, 0, s->objsize); | 1579 | memset(object, 0, c->objsize); |
| 1575 | 1580 | ||
| 1576 | return object; | 1581 | return object; |
| 1577 | } | 1582 | } |
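For readers new to SLUB's lockless fast path, a short sketch of how the freelist is threaded through the free objects themselves (illustrative only; the offset is in words, as set up by init_kmem_cache_cpu() below):

    /*
     * c->freelist points at the first free object; each free object stores
     * the pointer to the next free object at word index c->offset inside
     * its own memory, which is what object[c->offset] reads:
     */
    void **object = c->freelist;        /* head of the per-cpu freelist */
    void *next = object[c->offset];     /* link embedded in the object  */
    c->freelist = next;                 /* pop: object is handed out    */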
| @@ -1599,7 +1604,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); | |||
| 1599 | * handling required then we can return immediately. | 1604 | * handling required then we can return immediately. |
| 1600 | */ | 1605 | */ |
| 1601 | static void __slab_free(struct kmem_cache *s, struct page *page, | 1606 | static void __slab_free(struct kmem_cache *s, struct page *page, |
| 1602 | void *x, void *addr) | 1607 | void *x, void *addr, unsigned int offset) |
| 1603 | { | 1608 | { |
| 1604 | void *prior; | 1609 | void *prior; |
| 1605 | void **object = (void *)x; | 1610 | void **object = (void *)x; |
| @@ -1609,7 +1614,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
| 1609 | if (unlikely(SlabDebug(page))) | 1614 | if (unlikely(SlabDebug(page))) |
| 1610 | goto debug; | 1615 | goto debug; |
| 1611 | checks_ok: | 1616 | checks_ok: |
| 1612 | prior = object[page->offset] = page->freelist; | 1617 | prior = object[offset] = page->freelist; |
| 1613 | page->freelist = object; | 1618 | page->freelist = object; |
| 1614 | page->inuse--; | 1619 | page->inuse--; |
| 1615 | 1620 | ||
| @@ -1664,15 +1669,16 @@ static void __always_inline slab_free(struct kmem_cache *s, | |||
| 1664 | { | 1669 | { |
| 1665 | void **object = (void *)x; | 1670 | void **object = (void *)x; |
| 1666 | unsigned long flags; | 1671 | unsigned long flags; |
| 1672 | struct kmem_cache_cpu *c; | ||
| 1667 | 1673 | ||
| 1668 | local_irq_save(flags); | 1674 | local_irq_save(flags); |
| 1669 | debug_check_no_locks_freed(object, s->objsize); | 1675 | debug_check_no_locks_freed(object, s->objsize); |
| 1670 | if (likely(page == s->cpu_slab[smp_processor_id()] && | 1676 | c = get_cpu_slab(s, smp_processor_id()); |
| 1671 | !SlabDebug(page))) { | 1677 | if (likely(page == c->page && c->node >= 0)) { |
| 1672 | object[page->offset] = page->lockless_freelist; | 1678 | object[c->offset] = c->freelist; |
| 1673 | page->lockless_freelist = object; | 1679 | c->freelist = object; |
| 1674 | } else | 1680 | } else |
| 1675 | __slab_free(s, page, x, addr); | 1681 | __slab_free(s, page, x, addr, c->offset); |
| 1676 | 1682 | ||
| 1677 | local_irq_restore(flags); | 1683 | local_irq_restore(flags); |
| 1678 | } | 1684 | } |
| @@ -1759,14 +1765,6 @@ static inline int slab_order(int size, int min_objects, | |||
| 1759 | int rem; | 1765 | int rem; |
| 1760 | int min_order = slub_min_order; | 1766 | int min_order = slub_min_order; |
| 1761 | 1767 | ||
| 1762 | /* | ||
| 1763 | * If we would create too many object per slab then reduce | ||
| 1764 | * the slab order even if it goes below slub_min_order. | ||
| 1765 | */ | ||
| 1766 | while (min_order > 0 && | ||
| 1767 | (PAGE_SIZE << min_order) >= MAX_OBJECTS_PER_SLAB * size) | ||
| 1768 | min_order--; | ||
| 1769 | |||
| 1770 | for (order = max(min_order, | 1768 | for (order = max(min_order, |
| 1771 | fls(min_objects * size - 1) - PAGE_SHIFT); | 1769 | fls(min_objects * size - 1) - PAGE_SHIFT); |
| 1772 | order <= max_order; order++) { | 1770 | order <= max_order; order++) { |
| @@ -1781,9 +1779,6 @@ static inline int slab_order(int size, int min_objects, | |||
| 1781 | if (rem <= slab_size / fract_leftover) | 1779 | if (rem <= slab_size / fract_leftover) |
| 1782 | break; | 1780 | break; |
| 1783 | 1781 | ||
| 1784 | /* If the next size is too high then exit now */ | ||
| 1785 | if (slab_size * 2 >= MAX_OBJECTS_PER_SLAB * size) | ||
| 1786 | break; | ||
| 1787 | } | 1782 | } |
| 1788 | 1783 | ||
| 1789 | return order; | 1784 | return order; |
| @@ -1858,6 +1853,16 @@ static unsigned long calculate_alignment(unsigned long flags, | |||
| 1858 | return ALIGN(align, sizeof(void *)); | 1853 | return ALIGN(align, sizeof(void *)); |
| 1859 | } | 1854 | } |
| 1860 | 1855 | ||
| 1856 | static void init_kmem_cache_cpu(struct kmem_cache *s, | ||
| 1857 | struct kmem_cache_cpu *c) | ||
| 1858 | { | ||
| 1859 | c->page = NULL; | ||
| 1860 | c->freelist = NULL; | ||
| 1861 | c->node = 0; | ||
| 1862 | c->offset = s->offset / sizeof(void *); | ||
| 1863 | c->objsize = s->objsize; | ||
| 1864 | } | ||
| 1865 | |||
| 1861 | static void init_kmem_cache_node(struct kmem_cache_node *n) | 1866 | static void init_kmem_cache_node(struct kmem_cache_node *n) |
| 1862 | { | 1867 | { |
| 1863 | n->nr_partial = 0; | 1868 | n->nr_partial = 0; |
| @@ -1869,6 +1874,131 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) | |||
| 1869 | #endif | 1874 | #endif |
| 1870 | } | 1875 | } |
| 1871 | 1876 | ||
| 1877 | #ifdef CONFIG_SMP | ||
| 1878 | /* | ||
| 1879 | * Per cpu array for per cpu structures. | ||
| 1880 | * | ||
| 1881 | * The per cpu array places all kmem_cache_cpu structures from one processor | ||
| 1882 | * close together meaning that it becomes possible that multiple per cpu | ||
| 1883 | * structures are contained in one cacheline. This may be particularly | ||
| 1884 | * beneficial for the kmalloc caches. | ||
| 1885 | * | ||
| 1886 | * A desktop system typically has around 60-80 slabs. With 100 here we are | ||
| 1887 | * likely able to get per cpu structures for all caches from the array defined | ||
| 1888 | * here. We must be able to cover all kmalloc caches during bootstrap. | ||
| 1889 | * | ||
| 1890 | * If the per cpu array is exhausted then fall back to kmalloc | ||
| 1891 | * of individual cachelines. No sharing is possible then. | ||
| 1892 | */ | ||
| 1893 | #define NR_KMEM_CACHE_CPU 100 | ||
| 1894 | |||
| 1895 | static DEFINE_PER_CPU(struct kmem_cache_cpu, | ||
| 1896 | kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; | ||
| 1897 | |||
| 1898 | static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); | ||
| 1899 | static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE; | ||
| 1900 | |||
| 1901 | static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, | ||
| 1902 | int cpu, gfp_t flags) | ||
| 1903 | { | ||
| 1904 | struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); | ||
| 1905 | |||
| 1906 | if (c) | ||
| 1907 | per_cpu(kmem_cache_cpu_free, cpu) = | ||
| 1908 | (void *)c->freelist; | ||
| 1909 | else { | ||
| 1910 | /* Table overflow: So allocate ourselves */ | ||
| 1911 | c = kmalloc_node( | ||
| 1912 | ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), | ||
| 1913 | flags, cpu_to_node(cpu)); | ||
| 1914 | if (!c) | ||
| 1915 | return NULL; | ||
| 1916 | } | ||
| 1917 | |||
| 1918 | init_kmem_cache_cpu(s, c); | ||
| 1919 | return c; | ||
| 1920 | } | ||
| 1921 | |||
| 1922 | static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) | ||
| 1923 | { | ||
| 1924 | if (c < per_cpu(kmem_cache_cpu, cpu) || | ||
| 1925 | c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { | ||
| 1926 | kfree(c); | ||
| 1927 | return; | ||
| 1928 | } | ||
| 1929 | c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); | ||
| 1930 | per_cpu(kmem_cache_cpu_free, cpu) = c; | ||
| 1931 | } | ||
| 1932 | |||
| 1933 | static void free_kmem_cache_cpus(struct kmem_cache *s) | ||
| 1934 | { | ||
| 1935 | int cpu; | ||
| 1936 | |||
| 1937 | for_each_online_cpu(cpu) { | ||
| 1938 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
| 1939 | |||
| 1940 | if (c) { | ||
| 1941 | s->cpu_slab[cpu] = NULL; | ||
| 1942 | free_kmem_cache_cpu(c, cpu); | ||
| 1943 | } | ||
| 1944 | } | ||
| 1945 | } | ||
| 1946 | |||
| 1947 | static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) | ||
| 1948 | { | ||
| 1949 | int cpu; | ||
| 1950 | |||
| 1951 | for_each_online_cpu(cpu) { | ||
| 1952 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
| 1953 | |||
| 1954 | if (c) | ||
| 1955 | continue; | ||
| 1956 | |||
| 1957 | c = alloc_kmem_cache_cpu(s, cpu, flags); | ||
| 1958 | if (!c) { | ||
| 1959 | free_kmem_cache_cpus(s); | ||
| 1960 | return 0; | ||
| 1961 | } | ||
| 1962 | s->cpu_slab[cpu] = c; | ||
| 1963 | } | ||
| 1964 | return 1; | ||
| 1965 | } | ||
| 1966 | |||
| 1967 | /* | ||
| 1968 | * Initialize the per cpu array. | ||
| 1969 | */ | ||
| 1970 | static void init_alloc_cpu_cpu(int cpu) | ||
| 1971 | { | ||
| 1972 | int i; | ||
| 1973 | |||
| 1974 | if (cpu_isset(cpu, kmem_cach_cpu_free_init_once)) | ||
| 1975 | return; | ||
| 1976 | |||
| 1977 | for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) | ||
| 1978 | free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); | ||
| 1979 | |||
| 1980 | cpu_set(cpu, kmem_cach_cpu_free_init_once); | ||
| 1981 | } | ||
| 1982 | |||
| 1983 | static void __init init_alloc_cpu(void) | ||
| 1984 | { | ||
| 1985 | int cpu; | ||
| 1986 | |||
| 1987 | for_each_online_cpu(cpu) | ||
| 1988 | init_alloc_cpu_cpu(cpu); | ||
| 1989 | } | ||
| 1990 | |||
| 1991 | #else | ||
| 1992 | static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} | ||
| 1993 | static inline void init_alloc_cpu(void) {} | ||
| 1994 | |||
| 1995 | static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) | ||
| 1996 | { | ||
| 1997 | init_kmem_cache_cpu(s, &s->cpu_slab); | ||
| 1998 | return 1; | ||
| 1999 | } | ||
| 2000 | #endif | ||
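The hunks above use get_cpu_slab() without showing it; it is defined elsewhere in this patch. My presumed shape of the accessor, for orientation only (the UP case matches the embedded s->cpu_slab used in the #else branch above):

    static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
    {
    #ifdef CONFIG_SMP
            return s->cpu_slab[cpu];    /* per-cpu array of pointers  */
    #else
            return &s->cpu_slab;        /* single embedded structure  */
    #endif
    }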
| 2001 | |||
| 1872 | #ifdef CONFIG_NUMA | 2002 | #ifdef CONFIG_NUMA |
| 1873 | /* | 2003 | /* |
| 1874 | * No kmalloc_node yet so do it by hand. We know that this is the first | 2004 | * No kmalloc_node yet so do it by hand. We know that this is the first |
| @@ -1876,10 +2006,11 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) | |||
| 1876 | * possible. | 2006 | * possible. |
| 1877 | * | 2007 | * |
| 1878 | * Note that this function only works on the kmalloc_node_cache | 2008 | * Note that this function only works on the kmalloc_node_cache |
| 1879 | * when allocating for the kmalloc_node_cache. | 2009 | * when allocating for the kmalloc_node_cache. This is used for bootstrapping |
| 2010 | * memory on a fresh node that has no slab structures yet. | ||
| 1880 | */ | 2011 | */ |
| 1881 | static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags, | 2012 | static struct kmem_cache_node *early_kmem_cache_node_alloc(gfp_t gfpflags, |
| 1882 | int node) | 2013 | int node) |
| 1883 | { | 2014 | { |
| 1884 | struct page *page; | 2015 | struct page *page; |
| 1885 | struct kmem_cache_node *n; | 2016 | struct kmem_cache_node *n; |
| @@ -1921,7 +2052,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) | |||
| 1921 | { | 2052 | { |
| 1922 | int node; | 2053 | int node; |
| 1923 | 2054 | ||
| 1924 | for_each_online_node(node) { | 2055 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 1925 | struct kmem_cache_node *n = s->node[node]; | 2056 | struct kmem_cache_node *n = s->node[node]; |
| 1926 | if (n && n != &s->local_node) | 2057 | if (n && n != &s->local_node) |
| 1927 | kmem_cache_free(kmalloc_caches, n); | 2058 | kmem_cache_free(kmalloc_caches, n); |
| @@ -1939,7 +2070,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) | |||
| 1939 | else | 2070 | else |
| 1940 | local_node = 0; | 2071 | local_node = 0; |
| 1941 | 2072 | ||
| 1942 | for_each_online_node(node) { | 2073 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 1943 | struct kmem_cache_node *n; | 2074 | struct kmem_cache_node *n; |
| 1944 | 2075 | ||
| 1945 | if (local_node == node) | 2076 | if (local_node == node) |
| @@ -2077,14 +2208,7 @@ static int calculate_sizes(struct kmem_cache *s) | |||
| 2077 | */ | 2208 | */ |
| 2078 | s->objects = (PAGE_SIZE << s->order) / size; | 2209 | s->objects = (PAGE_SIZE << s->order) / size; |
| 2079 | 2210 | ||
| 2080 | /* | 2211 | return !!s->objects; |
| 2081 | * Verify that the number of objects is within permitted limits. | ||
| 2082 | * The page->inuse field is only 16 bit wide! So we cannot have | ||
| 2083 | * more than 64k objects per slab. | ||
| 2084 | */ | ||
| 2085 | if (!s->objects || s->objects > MAX_OBJECTS_PER_SLAB) | ||
| 2086 | return 0; | ||
| 2087 | return 1; | ||
| 2088 | 2212 | ||
| 2089 | } | 2213 | } |
| 2090 | 2214 | ||
| @@ -2107,9 +2231,12 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | |||
| 2107 | #ifdef CONFIG_NUMA | 2231 | #ifdef CONFIG_NUMA |
| 2108 | s->defrag_ratio = 100; | 2232 | s->defrag_ratio = 100; |
| 2109 | #endif | 2233 | #endif |
| 2234 | if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) | ||
| 2235 | goto error; | ||
| 2110 | 2236 | ||
| 2111 | if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) | 2237 | if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) |
| 2112 | return 1; | 2238 | return 1; |
| 2239 | free_kmem_cache_nodes(s); | ||
| 2113 | error: | 2240 | error: |
| 2114 | if (flags & SLAB_PANIC) | 2241 | if (flags & SLAB_PANIC) |
| 2115 | panic("Cannot create slab %s size=%lu realsize=%u " | 2242 | panic("Cannot create slab %s size=%lu realsize=%u " |
| @@ -2192,7 +2319,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) | |||
| 2192 | flush_all(s); | 2319 | flush_all(s); |
| 2193 | 2320 | ||
| 2194 | /* Attempt to free all objects */ | 2321 | /* Attempt to free all objects */ |
| 2195 | for_each_online_node(node) { | 2322 | free_kmem_cache_cpus(s); |
| 2323 | for_each_node_state(node, N_NORMAL_MEMORY) { | ||
| 2196 | struct kmem_cache_node *n = get_node(s, node); | 2324 | struct kmem_cache_node *n = get_node(s, node); |
| 2197 | 2325 | ||
| 2198 | n->nr_partial -= free_list(s, n, &n->partial); | 2326 | n->nr_partial -= free_list(s, n, &n->partial); |
| @@ -2227,11 +2355,11 @@ EXPORT_SYMBOL(kmem_cache_destroy); | |||
| 2227 | * Kmalloc subsystem | 2355 | * Kmalloc subsystem |
| 2228 | *******************************************************************/ | 2356 | *******************************************************************/ |
| 2229 | 2357 | ||
| 2230 | struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; | 2358 | struct kmem_cache kmalloc_caches[PAGE_SHIFT] __cacheline_aligned; |
| 2231 | EXPORT_SYMBOL(kmalloc_caches); | 2359 | EXPORT_SYMBOL(kmalloc_caches); |
| 2232 | 2360 | ||
| 2233 | #ifdef CONFIG_ZONE_DMA | 2361 | #ifdef CONFIG_ZONE_DMA |
| 2234 | static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1]; | 2362 | static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT]; |
| 2235 | #endif | 2363 | #endif |
| 2236 | 2364 | ||
| 2237 | static int __init setup_slub_min_order(char *str) | 2365 | static int __init setup_slub_min_order(char *str) |
| @@ -2397,12 +2525,8 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) | |||
| 2397 | return ZERO_SIZE_PTR; | 2525 | return ZERO_SIZE_PTR; |
| 2398 | 2526 | ||
| 2399 | index = size_index[(size - 1) / 8]; | 2527 | index = size_index[(size - 1) / 8]; |
| 2400 | } else { | 2528 | } else |
| 2401 | if (size > KMALLOC_MAX_SIZE) | ||
| 2402 | return NULL; | ||
| 2403 | |||
| 2404 | index = fls(size - 1); | 2529 | index = fls(size - 1); |
| 2405 | } | ||
| 2406 | 2530 | ||
| 2407 | #ifdef CONFIG_ZONE_DMA | 2531 | #ifdef CONFIG_ZONE_DMA |
| 2408 | if (unlikely((flags & SLUB_DMA))) | 2532 | if (unlikely((flags & SLUB_DMA))) |
| @@ -2414,9 +2538,15 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) | |||
| 2414 | 2538 | ||
| 2415 | void *__kmalloc(size_t size, gfp_t flags) | 2539 | void *__kmalloc(size_t size, gfp_t flags) |
| 2416 | { | 2540 | { |
| 2417 | struct kmem_cache *s = get_slab(size, flags); | 2541 | struct kmem_cache *s; |
| 2542 | |||
| 2543 | if (unlikely(size > PAGE_SIZE / 2)) | ||
| 2544 | return (void *)__get_free_pages(flags | __GFP_COMP, | ||
| 2545 | get_order(size)); | ||
| 2418 | 2546 | ||
| 2419 | if (ZERO_OR_NULL_PTR(s)) | 2547 | s = get_slab(size, flags); |
| 2548 | |||
| 2549 | if (unlikely(ZERO_OR_NULL_PTR(s))) | ||
| 2420 | return s; | 2550 | return s; |
| 2421 | 2551 | ||
| 2422 | return slab_alloc(s, flags, -1, __builtin_return_address(0)); | 2552 | return slab_alloc(s, flags, -1, __builtin_return_address(0)); |
| @@ -2426,9 +2556,15 @@ EXPORT_SYMBOL(__kmalloc); | |||
| 2426 | #ifdef CONFIG_NUMA | 2556 | #ifdef CONFIG_NUMA |
| 2427 | void *__kmalloc_node(size_t size, gfp_t flags, int node) | 2557 | void *__kmalloc_node(size_t size, gfp_t flags, int node) |
| 2428 | { | 2558 | { |
| 2429 | struct kmem_cache *s = get_slab(size, flags); | 2559 | struct kmem_cache *s; |
| 2430 | 2560 | ||
| 2431 | if (ZERO_OR_NULL_PTR(s)) | 2561 | if (unlikely(size > PAGE_SIZE / 2)) |
| 2562 | return (void *)__get_free_pages(flags | __GFP_COMP, | ||
| 2563 | get_order(size)); | ||
| 2564 | |||
| 2565 | s = get_slab(size, flags); | ||
| 2566 | |||
| 2567 | if (unlikely(ZERO_OR_NULL_PTR(s))) | ||
| 2432 | return s; | 2568 | return s; |
| 2433 | 2569 | ||
| 2434 | return slab_alloc(s, flags, node, __builtin_return_address(0)); | 2570 | return slab_alloc(s, flags, node, __builtin_return_address(0)); |
| @@ -2441,7 +2577,8 @@ size_t ksize(const void *object) | |||
| 2441 | struct page *page; | 2577 | struct page *page; |
| 2442 | struct kmem_cache *s; | 2578 | struct kmem_cache *s; |
| 2443 | 2579 | ||
| 2444 | if (ZERO_OR_NULL_PTR(object)) | 2580 | BUG_ON(!object); |
| 2581 | if (unlikely(object == ZERO_SIZE_PTR)) | ||
| 2445 | return 0; | 2582 | return 0; |
| 2446 | 2583 | ||
| 2447 | page = get_object_page(object); | 2584 | page = get_object_page(object); |
| @@ -2473,22 +2610,17 @@ EXPORT_SYMBOL(ksize); | |||
| 2473 | 2610 | ||
| 2474 | void kfree(const void *x) | 2611 | void kfree(const void *x) |
| 2475 | { | 2612 | { |
| 2476 | struct kmem_cache *s; | ||
| 2477 | struct page *page; | 2613 | struct page *page; |
| 2478 | 2614 | ||
| 2479 | /* | 2615 | if (unlikely(ZERO_OR_NULL_PTR(x))) |
| 2480 | * This has to be an unsigned comparison. According to Linus | ||
| 2481 | * some gcc version treat a pointer as a signed entity. Then | ||
| 2482 | * this comparison would be true for all "negative" pointers | ||
| 2483 | * (which would cover the whole upper half of the address space). | ||
| 2484 | */ | ||
| 2485 | if (ZERO_OR_NULL_PTR(x)) | ||
| 2486 | return; | 2616 | return; |
| 2487 | 2617 | ||
| 2488 | page = virt_to_head_page(x); | 2618 | page = virt_to_head_page(x); |
| 2489 | s = page->slab; | 2619 | if (unlikely(!PageSlab(page))) { |
| 2490 | 2620 | put_page(page); | |
| 2491 | slab_free(s, page, (void *)x, __builtin_return_address(0)); | 2621 | return; |
| 2622 | } | ||
| 2623 | slab_free(page->slab, page, (void *)x, __builtin_return_address(0)); | ||
| 2492 | } | 2624 | } |
| 2493 | EXPORT_SYMBOL(kfree); | 2625 | EXPORT_SYMBOL(kfree); |
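A small usage illustration of my own (assuming PAGE_SIZE == 4096) of why kfree() now checks PageSlab: allocations above PAGE_SIZE/2 bypass the slab caches entirely and come straight from the page allocator as compound pages, so the free path must tell the two cases apart:

    void *small = kmalloc(1000, GFP_KERNEL); /* served by a kmalloc slab cache          */
    void *large = kmalloc(8192, GFP_KERNEL); /* > PAGE_SIZE/2: __get_free_pages(),
                                                __GFP_COMP, order 1                     */

    kfree(small);   /* PageSlab(page) set: slab_free()                   */
    kfree(large);   /* not a slab page: put_page() on the compound head  */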
| 2494 | 2626 | ||
| @@ -2517,7 +2649,7 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
| 2517 | return -ENOMEM; | 2649 | return -ENOMEM; |
| 2518 | 2650 | ||
| 2519 | flush_all(s); | 2651 | flush_all(s); |
| 2520 | for_each_online_node(node) { | 2652 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 2521 | n = get_node(s, node); | 2653 | n = get_node(s, node); |
| 2522 | 2654 | ||
| 2523 | if (!n->nr_partial) | 2655 | if (!n->nr_partial) |
| @@ -2575,6 +2707,8 @@ void __init kmem_cache_init(void) | |||
| 2575 | int i; | 2707 | int i; |
| 2576 | int caches = 0; | 2708 | int caches = 0; |
| 2577 | 2709 | ||
| 2710 | init_alloc_cpu(); | ||
| 2711 | |||
| 2578 | #ifdef CONFIG_NUMA | 2712 | #ifdef CONFIG_NUMA |
| 2579 | /* | 2713 | /* |
| 2580 | * Must first have the slab cache available for the allocations of the | 2714 | * Must first have the slab cache available for the allocations of the |
| @@ -2602,7 +2736,7 @@ void __init kmem_cache_init(void) | |||
| 2602 | caches++; | 2736 | caches++; |
| 2603 | } | 2737 | } |
| 2604 | 2738 | ||
| 2605 | for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { | 2739 | for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) { |
| 2606 | create_kmalloc_cache(&kmalloc_caches[i], | 2740 | create_kmalloc_cache(&kmalloc_caches[i], |
| 2607 | "kmalloc", 1 << i, GFP_KERNEL); | 2741 | "kmalloc", 1 << i, GFP_KERNEL); |
| 2608 | caches++; | 2742 | caches++; |
| @@ -2629,16 +2763,18 @@ void __init kmem_cache_init(void) | |||
| 2629 | slab_state = UP; | 2763 | slab_state = UP; |
| 2630 | 2764 | ||
| 2631 | /* Provide the correct kmalloc names now that the caches are up */ | 2765 | /* Provide the correct kmalloc names now that the caches are up */ |
| 2632 | for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) | 2766 | for (i = KMALLOC_SHIFT_LOW; i < PAGE_SHIFT; i++) |
| 2633 | kmalloc_caches[i]. name = | 2767 | kmalloc_caches[i]. name = |
| 2634 | kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); | 2768 | kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); |
| 2635 | 2769 | ||
| 2636 | #ifdef CONFIG_SMP | 2770 | #ifdef CONFIG_SMP |
| 2637 | register_cpu_notifier(&slab_notifier); | 2771 | register_cpu_notifier(&slab_notifier); |
| 2772 | kmem_size = offsetof(struct kmem_cache, cpu_slab) + | ||
| 2773 | nr_cpu_ids * sizeof(struct kmem_cache_cpu *); | ||
| 2774 | #else | ||
| 2775 | kmem_size = sizeof(struct kmem_cache); | ||
| 2638 | #endif | 2776 | #endif |
| 2639 | 2777 | ||
| 2640 | kmem_size = offsetof(struct kmem_cache, cpu_slab) + | ||
| 2641 | nr_cpu_ids * sizeof(struct page *); | ||
| 2642 | 2778 | ||
| 2643 | printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," | 2779 | printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," |
| 2644 | " CPUs=%d, Nodes=%d\n", | 2780 | " CPUs=%d, Nodes=%d\n", |
| @@ -2717,12 +2853,21 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
| 2717 | down_write(&slub_lock); | 2853 | down_write(&slub_lock); |
| 2718 | s = find_mergeable(size, align, flags, name, ctor); | 2854 | s = find_mergeable(size, align, flags, name, ctor); |
| 2719 | if (s) { | 2855 | if (s) { |
| 2856 | int cpu; | ||
| 2857 | |||
| 2720 | s->refcount++; | 2858 | s->refcount++; |
| 2721 | /* | 2859 | /* |
| 2722 | * Adjust the object sizes so that we clear | 2860 | * Adjust the object sizes so that we clear |
| 2723 | * the complete object on kzalloc. | 2861 | * the complete object on kzalloc. |
| 2724 | */ | 2862 | */ |
| 2725 | s->objsize = max(s->objsize, (int)size); | 2863 | s->objsize = max(s->objsize, (int)size); |
| 2864 | |||
| 2865 | /* | ||
| 2866 | * And then we need to update the object size in the | ||
| 2867 | * per cpu structures | ||
| 2868 | */ | ||
| 2869 | for_each_online_cpu(cpu) | ||
| 2870 | get_cpu_slab(s, cpu)->objsize = s->objsize; | ||
| 2726 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); | 2871 | s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); |
| 2727 | up_write(&slub_lock); | 2872 | up_write(&slub_lock); |
| 2728 | if (sysfs_slab_alias(s, name)) | 2873 | if (sysfs_slab_alias(s, name)) |
| @@ -2765,15 +2910,29 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | |||
| 2765 | unsigned long flags; | 2910 | unsigned long flags; |
| 2766 | 2911 | ||
| 2767 | switch (action) { | 2912 | switch (action) { |
| 2913 | case CPU_UP_PREPARE: | ||
| 2914 | case CPU_UP_PREPARE_FROZEN: | ||
| 2915 | init_alloc_cpu_cpu(cpu); | ||
| 2916 | down_read(&slub_lock); | ||
| 2917 | list_for_each_entry(s, &slab_caches, list) | ||
| 2918 | s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, | ||
| 2919 | GFP_KERNEL); | ||
| 2920 | up_read(&slub_lock); | ||
| 2921 | break; | ||
| 2922 | |||
| 2768 | case CPU_UP_CANCELED: | 2923 | case CPU_UP_CANCELED: |
| 2769 | case CPU_UP_CANCELED_FROZEN: | 2924 | case CPU_UP_CANCELED_FROZEN: |
| 2770 | case CPU_DEAD: | 2925 | case CPU_DEAD: |
| 2771 | case CPU_DEAD_FROZEN: | 2926 | case CPU_DEAD_FROZEN: |
| 2772 | down_read(&slub_lock); | 2927 | down_read(&slub_lock); |
| 2773 | list_for_each_entry(s, &slab_caches, list) { | 2928 | list_for_each_entry(s, &slab_caches, list) { |
| 2929 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
| 2930 | |||
| 2774 | local_irq_save(flags); | 2931 | local_irq_save(flags); |
| 2775 | __flush_cpu_slab(s, cpu); | 2932 | __flush_cpu_slab(s, cpu); |
| 2776 | local_irq_restore(flags); | 2933 | local_irq_restore(flags); |
| 2934 | free_kmem_cache_cpu(c, cpu); | ||
| 2935 | s->cpu_slab[cpu] = NULL; | ||
| 2777 | } | 2936 | } |
| 2778 | up_read(&slub_lock); | 2937 | up_read(&slub_lock); |
| 2779 | break; | 2938 | break; |
| @@ -2790,9 +2949,14 @@ static struct notifier_block __cpuinitdata slab_notifier = | |||
| 2790 | 2949 | ||
| 2791 | void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) | 2950 | void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) |
| 2792 | { | 2951 | { |
| 2793 | struct kmem_cache *s = get_slab(size, gfpflags); | 2952 | struct kmem_cache *s; |
| 2953 | |||
| 2954 | if (unlikely(size > PAGE_SIZE / 2)) | ||
| 2955 | return (void *)__get_free_pages(gfpflags | __GFP_COMP, | ||
| 2956 | get_order(size)); | ||
| 2957 | s = get_slab(size, gfpflags); | ||
| 2794 | 2958 | ||
| 2795 | if (ZERO_OR_NULL_PTR(s)) | 2959 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
| 2796 | return s; | 2960 | return s; |
| 2797 | 2961 | ||
| 2798 | return slab_alloc(s, gfpflags, -1, caller); | 2962 | return slab_alloc(s, gfpflags, -1, caller); |
| @@ -2801,9 +2965,14 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) | |||
| 2801 | void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, | 2965 | void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, |
| 2802 | int node, void *caller) | 2966 | int node, void *caller) |
| 2803 | { | 2967 | { |
| 2804 | struct kmem_cache *s = get_slab(size, gfpflags); | 2968 | struct kmem_cache *s; |
| 2969 | |||
| 2970 | if (unlikely(size > PAGE_SIZE / 2)) | ||
| 2971 | return (void *)__get_free_pages(gfpflags | __GFP_COMP, | ||
| 2972 | get_order(size)); | ||
| 2973 | s = get_slab(size, gfpflags); | ||
| 2805 | 2974 | ||
| 2806 | if (ZERO_OR_NULL_PTR(s)) | 2975 | if (unlikely(ZERO_OR_NULL_PTR(s))) |
| 2807 | return s; | 2976 | return s; |
| 2808 | 2977 | ||
| 2809 | return slab_alloc(s, gfpflags, node, caller); | 2978 | return slab_alloc(s, gfpflags, node, caller); |
| @@ -2902,7 +3071,7 @@ static long validate_slab_cache(struct kmem_cache *s) | |||
| 2902 | return -ENOMEM; | 3071 | return -ENOMEM; |
| 2903 | 3072 | ||
| 2904 | flush_all(s); | 3073 | flush_all(s); |
| 2905 | for_each_online_node(node) { | 3074 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 2906 | struct kmem_cache_node *n = get_node(s, node); | 3075 | struct kmem_cache_node *n = get_node(s, node); |
| 2907 | 3076 | ||
| 2908 | count += validate_slab_node(s, n, map); | 3077 | count += validate_slab_node(s, n, map); |
| @@ -3116,13 +3285,13 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
| 3116 | int node; | 3285 | int node; |
| 3117 | 3286 | ||
| 3118 | if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), | 3287 | if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), |
| 3119 | GFP_KERNEL)) | 3288 | GFP_TEMPORARY)) |
| 3120 | return sprintf(buf, "Out of memory\n"); | 3289 | return sprintf(buf, "Out of memory\n"); |
| 3121 | 3290 | ||
| 3122 | /* Push back cpu slabs */ | 3291 | /* Push back cpu slabs */ |
| 3123 | flush_all(s); | 3292 | flush_all(s); |
| 3124 | 3293 | ||
| 3125 | for_each_online_node(node) { | 3294 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 3126 | struct kmem_cache_node *n = get_node(s, node); | 3295 | struct kmem_cache_node *n = get_node(s, node); |
| 3127 | unsigned long flags; | 3296 | unsigned long flags; |
| 3128 | struct page *page; | 3297 | struct page *page; |
| @@ -3230,11 +3399,18 @@ static unsigned long slab_objects(struct kmem_cache *s, | |||
| 3230 | per_cpu = nodes + nr_node_ids; | 3399 | per_cpu = nodes + nr_node_ids; |
| 3231 | 3400 | ||
| 3232 | for_each_possible_cpu(cpu) { | 3401 | for_each_possible_cpu(cpu) { |
| 3233 | struct page *page = s->cpu_slab[cpu]; | 3402 | struct page *page; |
| 3234 | int node; | 3403 | int node; |
| 3404 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); | ||
| 3235 | 3405 | ||
| 3406 | if (!c) | ||
| 3407 | continue; | ||
| 3408 | |||
| 3409 | page = c->page; | ||
| 3410 | node = c->node; | ||
| 3411 | if (node < 0) | ||
| 3412 | continue; | ||
| 3236 | if (page) { | 3413 | if (page) { |
| 3237 | node = page_to_nid(page); | ||
| 3238 | if (flags & SO_CPU) { | 3414 | if (flags & SO_CPU) { |
| 3239 | int x = 0; | 3415 | int x = 0; |
| 3240 | 3416 | ||
| @@ -3249,7 +3425,7 @@ static unsigned long slab_objects(struct kmem_cache *s, | |||
| 3249 | } | 3425 | } |
| 3250 | } | 3426 | } |
| 3251 | 3427 | ||
| 3252 | for_each_online_node(node) { | 3428 | for_each_node_state(node, N_NORMAL_MEMORY) { |
| 3253 | struct kmem_cache_node *n = get_node(s, node); | 3429 | struct kmem_cache_node *n = get_node(s, node); |
| 3254 | 3430 | ||
| 3255 | if (flags & SO_PARTIAL) { | 3431 | if (flags & SO_PARTIAL) { |
| @@ -3277,7 +3453,7 @@ static unsigned long slab_objects(struct kmem_cache *s, | |||
| 3277 | 3453 | ||
| 3278 | x = sprintf(buf, "%lu", total); | 3454 | x = sprintf(buf, "%lu", total); |
| 3279 | #ifdef CONFIG_NUMA | 3455 | #ifdef CONFIG_NUMA |
| 3280 | for_each_online_node(node) | 3456 | for_each_node_state(node, N_NORMAL_MEMORY) |
| 3281 | if (nodes[node]) | 3457 | if (nodes[node]) |
| 3282 | x += sprintf(buf + x, " N%d=%lu", | 3458 | x += sprintf(buf + x, " N%d=%lu", |
| 3283 | node, nodes[node]); | 3459 | node, nodes[node]); |
| @@ -3291,13 +3467,19 @@ static int any_slab_objects(struct kmem_cache *s) | |||
| 3291 | int node; | 3467 | int node; |
| 3292 | int cpu; | 3468 | int cpu; |
| 3293 | 3469 | ||
| 3294 | for_each_possible_cpu(cpu) | 3470 | for_each_possible_cpu(cpu) { |
| 3295 | if (s->cpu_slab[cpu]) | 3471 | struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); |
| 3472 | |||
| 3473 | if (c && c->page) | ||
| 3296 | return 1; | 3474 | return 1; |
| 3475 | } | ||
| 3297 | 3476 | ||
| 3298 | for_each_node(node) { | 3477 | for_each_online_node(node) { |
| 3299 | struct kmem_cache_node *n = get_node(s, node); | 3478 | struct kmem_cache_node *n = get_node(s, node); |
| 3300 | 3479 | ||
| 3480 | if (!n) | ||
| 3481 | continue; | ||
| 3482 | |||
| 3301 | if (n->nr_partial || atomic_long_read(&n->nr_slabs)) | 3483 | if (n->nr_partial || atomic_long_read(&n->nr_slabs)) |
| 3302 | return 1; | 3484 | return 1; |
| 3303 | } | 3485 | } |
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c new file mode 100644 index 000000000000..d3b718b0c20a --- /dev/null +++ b/mm/sparse-vmemmap.c | |||
| @@ -0,0 +1,148 @@ | |||
| 1 | /* | ||
| 2 | * Virtual Memory Map support | ||
| 3 | * | ||
| 4 | * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>. | ||
| 5 | * | ||
| 6 | * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn, | ||
| 7 | * virt_to_page, page_address() to be implemented as a base offset | ||
| 8 | * calculation without memory access. | ||
| 9 | * | ||
| 10 | * However, virtual mappings need a page table and TLBs. Many Linux | ||
| 11 | * architectures already map their physical space using 1-1 mappings | ||
| 12 | * via TLBs. For those arches the virtual memory map is essentially | ||
| 13 | * for free if we use the same page size as the 1-1 mappings. In that | ||
| 14 | * case the overhead consists of a few additional pages that are | ||
| 15 | * allocated to create a view of memory for vmemmap. | ||
| 16 | * | ||
| 17 | * The architecture is expected to provide a vmemmap_populate() function | ||
| 18 | * to instantiate the mapping. | ||
| 19 | */ | ||
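As a concrete (hypothetical, arch-dependent) picture of the "base offset calculation" mentioned above: with a virtually contiguous mem_map based at some VMEMMAP_START, the pfn/page conversions reduce to pointer arithmetic, roughly:

    /* Sketch only; the real macros live in the architecture headers. */
    #define vmemmap                 ((struct page *)VMEMMAP_START)
    #define __pfn_to_page(pfn)      (vmemmap + (pfn))
    #define __page_to_pfn(page)     ((unsigned long)((page) - vmemmap))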
| 20 | #include <linux/mm.h> | ||
| 21 | #include <linux/mmzone.h> | ||
| 22 | #include <linux/bootmem.h> | ||
| 23 | #include <linux/highmem.h> | ||
| 24 | #include <linux/module.h> | ||
| 25 | #include <linux/spinlock.h> | ||
| 26 | #include <linux/vmalloc.h> | ||
| 27 | #include <asm/dma.h> | ||
| 28 | #include <asm/pgalloc.h> | ||
| 29 | #include <asm/pgtable.h> | ||
| 30 | |||
| 31 | /* | ||
| 32 | * Allocate a block of memory to be used to back the virtual memory map | ||
| 33 | * or to back the page tables that are used to create the mapping. | ||
| 34 | * Uses the main allocators if they are available, else bootmem. | ||
| 35 | */ | ||
| 36 | void * __meminit vmemmap_alloc_block(unsigned long size, int node) | ||
| 37 | { | ||
| 38 | /* If the main allocator is up use that, fallback to bootmem. */ | ||
| 39 | if (slab_is_available()) { | ||
| 40 | struct page *page = alloc_pages_node(node, | ||
| 41 | GFP_KERNEL | __GFP_ZERO, get_order(size)); | ||
| 42 | if (page) | ||
| 43 | return page_address(page); | ||
| 44 | return NULL; | ||
| 45 | } else | ||
| 46 | return __alloc_bootmem_node(NODE_DATA(node), size, size, | ||
| 47 | __pa(MAX_DMA_ADDRESS)); | ||
| 48 | } | ||
| 49 | |||
| 50 | void __meminit vmemmap_verify(pte_t *pte, int node, | ||
| 51 | unsigned long start, unsigned long end) | ||
| 52 | { | ||
| 53 | unsigned long pfn = pte_pfn(*pte); | ||
| 54 | int actual_node = early_pfn_to_nid(pfn); | ||
| 55 | |||
| 56 | if (actual_node != node) | ||
| 57 | printk(KERN_WARNING "[%lx-%lx] potential offnode " | ||
| 58 | "page_structs\n", start, end - 1); | ||
| 59 | } | ||
| 60 | |||
| 61 | pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) | ||
| 62 | { | ||
| 63 | pte_t *pte = pte_offset_kernel(pmd, addr); | ||
| 64 | if (pte_none(*pte)) { | ||
| 65 | pte_t entry; | ||
| 66 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | ||
| 67 | if (!p) | ||
| 68 | return NULL; | ||
| 69 | entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); | ||
| 70 | set_pte_at(&init_mm, addr, pte, entry); | ||
| 71 | } | ||
| 72 | return pte; | ||
| 73 | } | ||
| 74 | |||
| 75 | pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) | ||
| 76 | { | ||
| 77 | pmd_t *pmd = pmd_offset(pud, addr); | ||
| 78 | if (pmd_none(*pmd)) { | ||
| 79 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | ||
| 80 | if (!p) | ||
| 81 | return NULL; | ||
| 82 | pmd_populate_kernel(&init_mm, pmd, p); | ||
| 83 | } | ||
| 84 | return pmd; | ||
| 85 | } | ||
| 86 | |||
| 87 | pud_t * __meminit vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node) | ||
| 88 | { | ||
| 89 | pud_t *pud = pud_offset(pgd, addr); | ||
| 90 | if (pud_none(*pud)) { | ||
| 91 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | ||
| 92 | if (!p) | ||
| 93 | return NULL; | ||
| 94 | pud_populate(&init_mm, pud, p); | ||
| 95 | } | ||
| 96 | return pud; | ||
| 97 | } | ||
| 98 | |||
| 99 | pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) | ||
| 100 | { | ||
| 101 | pgd_t *pgd = pgd_offset_k(addr); | ||
| 102 | if (pgd_none(*pgd)) { | ||
| 103 | void *p = vmemmap_alloc_block(PAGE_SIZE, node); | ||
| 104 | if (!p) | ||
| 105 | return NULL; | ||
| 106 | pgd_populate(&init_mm, pgd, p); | ||
| 107 | } | ||
| 108 | return pgd; | ||
| 109 | } | ||
| 110 | |||
| 111 | int __meminit vmemmap_populate_basepages(struct page *start_page, | ||
| 112 | unsigned long size, int node) | ||
| 113 | { | ||
| 114 | unsigned long addr = (unsigned long)start_page; | ||
| 115 | unsigned long end = (unsigned long)(start_page + size); | ||
| 116 | pgd_t *pgd; | ||
| 117 | pud_t *pud; | ||
| 118 | pmd_t *pmd; | ||
| 119 | pte_t *pte; | ||
| 120 | |||
| 121 | for (; addr < end; addr += PAGE_SIZE) { | ||
| 122 | pgd = vmemmap_pgd_populate(addr, node); | ||
| 123 | if (!pgd) | ||
| 124 | return -ENOMEM; | ||
| 125 | pud = vmemmap_pud_populate(pgd, addr, node); | ||
| 126 | if (!pud) | ||
| 127 | return -ENOMEM; | ||
| 128 | pmd = vmemmap_pmd_populate(pud, addr, node); | ||
| 129 | if (!pmd) | ||
| 130 | return -ENOMEM; | ||
| 131 | pte = vmemmap_pte_populate(pmd, addr, node); | ||
| 132 | if (!pte) | ||
| 133 | return -ENOMEM; | ||
| 134 | vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); | ||
| 135 | } | ||
| 136 | |||
| 137 | return 0; | ||
| 138 | } | ||
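The header comment says the architecture must supply vmemmap_populate(); an arch content with base-page mappings could presumably implement it as a thin wrapper around the helper above. A hypothetical example (not from this patch):

    int __meminit vmemmap_populate(struct page *start_page,
                                    unsigned long nr_pages, int node)
    {
            /* Map the section's mem_map with ordinary base pages. */
            return vmemmap_populate_basepages(start_page, nr_pages, node);
    }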
| 139 | |||
| 140 | struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) | ||
| 141 | { | ||
| 142 | struct page *map = pfn_to_page(pnum * PAGES_PER_SECTION); | ||
| 143 | int error = vmemmap_populate(map, PAGES_PER_SECTION, nid); | ||
| 144 | if (error) | ||
| 145 | return NULL; | ||
| 146 | |||
| 147 | return map; | ||
| 148 | } | ||
diff --git a/mm/sparse.c b/mm/sparse.c index 239f5a720d38..08fb14f5eea3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
| @@ -9,6 +9,8 @@ | |||
| 9 | #include <linux/spinlock.h> | 9 | #include <linux/spinlock.h> |
| 10 | #include <linux/vmalloc.h> | 10 | #include <linux/vmalloc.h> |
| 11 | #include <asm/dma.h> | 11 | #include <asm/dma.h> |
| 12 | #include <asm/pgalloc.h> | ||
| 13 | #include <asm/pgtable.h> | ||
| 12 | 14 | ||
| 13 | /* | 15 | /* |
| 14 | * Permanent SPARSEMEM data: | 16 | * Permanent SPARSEMEM data: |
| @@ -106,7 +108,7 @@ static inline int sparse_index_init(unsigned long section_nr, int nid) | |||
| 106 | 108 | ||
| 107 | /* | 109 | /* |
| 108 | * Although written for the SPARSEMEM_EXTREME case, this happens | 110 | * Although written for the SPARSEMEM_EXTREME case, this happens |
| 109 | * to also work for the flat array case becase | 111 | * to also work for the flat array case because |
| 110 | * NR_SECTION_ROOTS==NR_MEM_SECTIONS. | 112 | * NR_SECTION_ROOTS==NR_MEM_SECTIONS. |
| 111 | */ | 113 | */ |
| 112 | int __section_nr(struct mem_section* ms) | 114 | int __section_nr(struct mem_section* ms) |
| @@ -176,7 +178,7 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, | |||
| 176 | if (nid != early_pfn_to_nid(pfn)) | 178 | if (nid != early_pfn_to_nid(pfn)) |
| 177 | continue; | 179 | continue; |
| 178 | 180 | ||
| 179 | if (pfn_valid(pfn)) | 181 | if (pfn_present(pfn)) |
| 180 | nr_pages += PAGES_PER_SECTION; | 182 | nr_pages += PAGES_PER_SECTION; |
| 181 | } | 183 | } |
| 182 | 184 | ||
| @@ -204,13 +206,16 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn | |||
| 204 | } | 206 | } |
| 205 | 207 | ||
| 206 | static int __meminit sparse_init_one_section(struct mem_section *ms, | 208 | static int __meminit sparse_init_one_section(struct mem_section *ms, |
| 207 | unsigned long pnum, struct page *mem_map) | 209 | unsigned long pnum, struct page *mem_map, |
| 210 | unsigned long *pageblock_bitmap) | ||
| 208 | { | 211 | { |
| 209 | if (!valid_section(ms)) | 212 | if (!present_section(ms)) |
| 210 | return -EINVAL; | 213 | return -EINVAL; |
| 211 | 214 | ||
| 212 | ms->section_mem_map &= ~SECTION_MAP_MASK; | 215 | ms->section_mem_map &= ~SECTION_MAP_MASK; |
| 213 | ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); | 216 | ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) | |
| 217 | SECTION_HAS_MEM_MAP; | ||
| 218 | ms->pageblock_flags = pageblock_bitmap; | ||
| 214 | 219 | ||
| 215 | return 1; | 220 | return 1; |
| 216 | } | 221 | } |
| @@ -221,12 +226,43 @@ void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) | |||
| 221 | return NULL; | 226 | return NULL; |
| 222 | } | 227 | } |
| 223 | 228 | ||
| 224 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | 229 | static unsigned long usemap_size(void) |
| 225 | { | 230 | { |
| 226 | struct page *map; | 231 | unsigned long size_bytes; |
| 232 | size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; | ||
| 233 | size_bytes = roundup(size_bytes, sizeof(unsigned long)); | ||
| 234 | return size_bytes; | ||
| 235 | } | ||
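To get a feel for the numbers (SECTION_BLOCKFLAGS_BITS is architecture and config dependent, so the value below is only an assumption):

    /*
     * Assuming SECTION_BLOCKFLAGS_BITS == 256:
     *   roundup(256, 8) / 8                  = 32 bytes
     *   roundup(32, sizeof(unsigned long))   = 32 bytes on 32- and 64-bit
     * i.e. one small usemap per present memory section.
     */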
| 236 | |||
| 237 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
| 238 | static unsigned long *__kmalloc_section_usemap(void) | ||
| 239 | { | ||
| 240 | return kmalloc(usemap_size(), GFP_KERNEL); | ||
| 241 | } | ||
| 242 | #endif /* CONFIG_MEMORY_HOTPLUG */ | ||
| 243 | |||
| 244 | static unsigned long *sparse_early_usemap_alloc(unsigned long pnum) | ||
| 245 | { | ||
| 246 | unsigned long *usemap; | ||
| 227 | struct mem_section *ms = __nr_to_section(pnum); | 247 | struct mem_section *ms = __nr_to_section(pnum); |
| 228 | int nid = sparse_early_nid(ms); | 248 | int nid = sparse_early_nid(ms); |
| 229 | 249 | ||
| 250 | usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); | ||
| 251 | if (usemap) | ||
| 252 | return usemap; | ||
| 253 | |||
| 254 | /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ | ||
| 255 | nid = 0; | ||
| 256 | |||
| 257 | printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); | ||
| 258 | return NULL; | ||
| 259 | } | ||
| 260 | |||
| 261 | #ifndef CONFIG_SPARSEMEM_VMEMMAP | ||
| 262 | struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) | ||
| 263 | { | ||
| 264 | struct page *map; | ||
| 265 | |||
| 230 | map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); | 266 | map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); |
| 231 | if (map) | 267 | if (map) |
| 232 | return map; | 268 | return map; |
| @@ -238,10 +274,22 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | |||
| 238 | 274 | ||
| 239 | map = alloc_bootmem_node(NODE_DATA(nid), | 275 | map = alloc_bootmem_node(NODE_DATA(nid), |
| 240 | sizeof(struct page) * PAGES_PER_SECTION); | 276 | sizeof(struct page) * PAGES_PER_SECTION); |
| 277 | return map; | ||
| 278 | } | ||
| 279 | #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ | ||
| 280 | |||
| 281 | struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | ||
| 282 | { | ||
| 283 | struct page *map; | ||
| 284 | struct mem_section *ms = __nr_to_section(pnum); | ||
| 285 | int nid = sparse_early_nid(ms); | ||
| 286 | |||
| 287 | map = sparse_mem_map_populate(pnum, nid); | ||
| 241 | if (map) | 288 | if (map) |
| 242 | return map; | 289 | return map; |
| 243 | 290 | ||
| 244 | printk(KERN_WARNING "%s: allocation failed\n", __FUNCTION__); | 291 | printk(KERN_ERR "%s: sparsemem memory map backing failed, " |
| 292 | "some memory will not be available.\n", __FUNCTION__); | ||
| 245 | ms->section_mem_map = 0; | 293 | ms->section_mem_map = 0; |
| 246 | return NULL; | 294 | return NULL; |
| 247 | } | 295 | } |
| @@ -254,19 +302,38 @@ void __init sparse_init(void) | |||
| 254 | { | 302 | { |
| 255 | unsigned long pnum; | 303 | unsigned long pnum; |
| 256 | struct page *map; | 304 | struct page *map; |
| 305 | unsigned long *usemap; | ||
| 257 | 306 | ||
| 258 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { | 307 | for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { |
| 259 | if (!valid_section_nr(pnum)) | 308 | if (!present_section_nr(pnum)) |
| 260 | continue; | 309 | continue; |
| 261 | 310 | ||
| 262 | map = sparse_early_mem_map_alloc(pnum); | 311 | map = sparse_early_mem_map_alloc(pnum); |
| 263 | if (!map) | 312 | if (!map) |
| 264 | continue; | 313 | continue; |
| 265 | sparse_init_one_section(__nr_to_section(pnum), pnum, map); | 314 | |
| 315 | usemap = sparse_early_usemap_alloc(pnum); | ||
| 316 | if (!usemap) | ||
| 317 | continue; | ||
| 318 | |||
| 319 | sparse_init_one_section(__nr_to_section(pnum), pnum, map, | ||
| 320 | usemap); | ||
| 266 | } | 321 | } |
| 267 | } | 322 | } |
| 268 | 323 | ||
| 269 | #ifdef CONFIG_MEMORY_HOTPLUG | 324 | #ifdef CONFIG_MEMORY_HOTPLUG |
| 325 | #ifdef CONFIG_SPARSEMEM_VMEMMAP | ||
| 326 | static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, | ||
| 327 | unsigned long nr_pages) | ||
| 328 | { | ||
| 329 | /* This will make the necessary allocations eventually. */ | ||
| 330 | return sparse_mem_map_populate(pnum, nid); | ||
| 331 | } | ||
| 332 | static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | ||
| 333 | { | ||
| 334 | return; /* XXX: Not implemented yet */ | ||
| 335 | } | ||
| 336 | #else | ||
| 270 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) | 337 | static struct page *__kmalloc_section_memmap(unsigned long nr_pages) |
| 271 | { | 338 | { |
| 272 | struct page *page, *ret; | 339 | struct page *page, *ret; |
| @@ -289,6 +356,12 @@ got_map_ptr: | |||
| 289 | return ret; | 356 | return ret; |
| 290 | } | 357 | } |
| 291 | 358 | ||
| 359 | static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, | ||
| 360 | unsigned long nr_pages) | ||
| 361 | { | ||
| 362 | return __kmalloc_section_memmap(nr_pages); | ||
| 363 | } | ||
| 364 | |||
| 292 | static int vaddr_in_vmalloc_area(void *addr) | 365 | static int vaddr_in_vmalloc_area(void *addr) |
| 293 | { | 366 | { |
| 294 | if (addr >= (void *)VMALLOC_START && | 367 | if (addr >= (void *)VMALLOC_START && |
| @@ -305,6 +378,7 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) | |||
| 305 | free_pages((unsigned long)memmap, | 378 | free_pages((unsigned long)memmap, |
| 306 | get_order(sizeof(struct page) * nr_pages)); | 379 | get_order(sizeof(struct page) * nr_pages)); |
| 307 | } | 380 | } |
| 381 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ | ||
| 308 | 382 | ||
| 309 | /* | 383 | /* |
| 310 | * returns the number of sections whose mem_maps were properly | 384 | * returns the number of sections whose mem_maps were properly |
| @@ -318,6 +392,7 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
| 318 | struct pglist_data *pgdat = zone->zone_pgdat; | 392 | struct pglist_data *pgdat = zone->zone_pgdat; |
| 319 | struct mem_section *ms; | 393 | struct mem_section *ms; |
| 320 | struct page *memmap; | 394 | struct page *memmap; |
| 395 | unsigned long *usemap; | ||
| 321 | unsigned long flags; | 396 | unsigned long flags; |
| 322 | int ret; | 397 | int ret; |
| 323 | 398 | ||
| @@ -326,7 +401,8 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
| 326 | * plus, it does a kmalloc | 401 | * plus, it does a kmalloc |
| 327 | */ | 402 | */ |
| 328 | sparse_index_init(section_nr, pgdat->node_id); | 403 | sparse_index_init(section_nr, pgdat->node_id); |
| 329 | memmap = __kmalloc_section_memmap(nr_pages); | 404 | memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages); |
| 405 | usemap = __kmalloc_section_usemap(); | ||
| 330 | 406 | ||
| 331 | pgdat_resize_lock(pgdat, &flags); | 407 | pgdat_resize_lock(pgdat, &flags); |
| 332 | 408 | ||
| @@ -335,9 +411,14 @@ int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | |||
| 335 | ret = -EEXIST; | 411 | ret = -EEXIST; |
| 336 | goto out; | 412 | goto out; |
| 337 | } | 413 | } |
| 414 | |||
| 415 | if (!usemap) { | ||
| 416 | ret = -ENOMEM; | ||
| 417 | goto out; | ||
| 418 | } | ||
| 338 | ms->section_mem_map |= SECTION_MARKED_PRESENT; | 419 | ms->section_mem_map |= SECTION_MARKED_PRESENT; |
| 339 | 420 | ||
| 340 | ret = sparse_init_one_section(ms, section_nr, memmap); | 421 | ret = sparse_init_one_section(ms, section_nr, memmap, usemap); |
| 341 | 422 | ||
| 342 | out: | 423 | out: |
| 343 | pgdat_resize_unlock(pgdat, &flags); | 424 | pgdat_resize_unlock(pgdat, &flags); |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
| @@ -24,16 +24,18 @@ | |||
| 24 | #include <linux/module.h> | 24 | #include <linux/module.h> |
| 25 | #include <linux/mm_inline.h> | 25 | #include <linux/mm_inline.h> |
| 26 | #include <linux/buffer_head.h> /* for try_to_release_page() */ | 26 | #include <linux/buffer_head.h> /* for try_to_release_page() */ |
| 27 | #include <linux/module.h> | ||
| 28 | #include <linux/percpu_counter.h> | 27 | #include <linux/percpu_counter.h> |
| 29 | #include <linux/percpu.h> | 28 | #include <linux/percpu.h> |
| 30 | #include <linux/cpu.h> | 29 | #include <linux/cpu.h> |
| 31 | #include <linux/notifier.h> | 30 | #include <linux/notifier.h> |
| 32 | #include <linux/init.h> | ||
| 33 | 31 | ||
| 34 | /* How many pages do we try to swap or page in/out together? */ | 32 | /* How many pages do we try to swap or page in/out together? */ |
| 35 | int page_cluster; | 33 | int page_cluster; |
| 36 | 34 | ||
| 35 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; | ||
| 36 | static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; | ||
| 37 | static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, }; | ||
| 38 | |||
| 37 | /* | 39 | /* |
| 38 | * This path almost never happens for VM activity - pages are normally | 40 | * This path almost never happens for VM activity - pages are normally |
| 39 | * freed via pagevecs. But it gets used by networking. | 41 | * freed via pagevecs. But it gets used by networking. |
| @@ -94,23 +96,47 @@ void put_pages_list(struct list_head *pages) | |||
| 94 | EXPORT_SYMBOL(put_pages_list); | 96 | EXPORT_SYMBOL(put_pages_list); |
| 95 | 97 | ||
| 96 | /* | 98 | /* |
| 99 | * pagevec_move_tail() must be called with IRQ disabled. | ||
| 100 | * Otherwise this may cause nasty races. | ||
| 101 | */ | ||
| 102 | static void pagevec_move_tail(struct pagevec *pvec) | ||
| 103 | { | ||
| 104 | int i; | ||
| 105 | int pgmoved = 0; | ||
| 106 | struct zone *zone = NULL; | ||
| 107 | |||
| 108 | for (i = 0; i < pagevec_count(pvec); i++) { | ||
| 109 | struct page *page = pvec->pages[i]; | ||
| 110 | struct zone *pagezone = page_zone(page); | ||
| 111 | |||
| 112 | if (pagezone != zone) { | ||
| 113 | if (zone) | ||
| 114 | spin_unlock(&zone->lru_lock); | ||
| 115 | zone = pagezone; | ||
| 116 | spin_lock(&zone->lru_lock); | ||
| 117 | } | ||
| 118 | if (PageLRU(page) && !PageActive(page)) { | ||
| 119 | list_move_tail(&page->lru, &zone->inactive_list); | ||
| 120 | pgmoved++; | ||
| 121 | } | ||
| 122 | } | ||
| 123 | if (zone) | ||
| 124 | spin_unlock(&zone->lru_lock); | ||
| 125 | __count_vm_events(PGROTATED, pgmoved); | ||
| 126 | release_pages(pvec->pages, pvec->nr, pvec->cold); | ||
| 127 | pagevec_reinit(pvec); | ||
| 128 | } | ||
| 129 | |||
| 130 | /* | ||
| 97 | * Writeback is about to end against a page which has been marked for immediate | 131 | * Writeback is about to end against a page which has been marked for immediate |
| 98 | * reclaim. If it still appears to be reclaimable, move it to the tail of the | 132 | * reclaim. If it still appears to be reclaimable, move it to the tail of the |
| 99 | * inactive list. The page still has PageWriteback set, which will pin it. | 133 | * inactive list. |
| 100 | * | ||
| 101 | * We don't expect many pages to come through here, so don't bother batching | ||
| 102 | * things up. | ||
| 103 | * | ||
| 104 | * To avoid placing the page at the tail of the LRU while PG_writeback is still | ||
| 105 | * set, this function will clear PG_writeback before performing the page | ||
| 106 | * motion. Do that inside the lru lock because once PG_writeback is cleared | ||
| 107 | * we may not touch the page. | ||
| 108 | * | 134 | * |
| 109 | * Returns zero if it cleared PG_writeback. | 135 | * Returns zero if it cleared PG_writeback. |
| 110 | */ | 136 | */ |
| 111 | int rotate_reclaimable_page(struct page *page) | 137 | int rotate_reclaimable_page(struct page *page) |
| 112 | { | 138 | { |
| 113 | struct zone *zone; | 139 | struct pagevec *pvec; |
| 114 | unsigned long flags; | 140 | unsigned long flags; |
| 115 | 141 | ||
| 116 | if (PageLocked(page)) | 142 | if (PageLocked(page)) |
| @@ -122,15 +148,16 @@ int rotate_reclaimable_page(struct page *page) | |||
| 122 | if (!PageLRU(page)) | 148 | if (!PageLRU(page)) |
| 123 | return 1; | 149 | return 1; |
| 124 | 150 | ||
| 125 | zone = page_zone(page); | 151 | page_cache_get(page); |
| 126 | spin_lock_irqsave(&zone->lru_lock, flags); | 152 | local_irq_save(flags); |
| 127 | if (PageLRU(page) && !PageActive(page)) { | 153 | pvec = &__get_cpu_var(lru_rotate_pvecs); |
| 128 | list_move_tail(&page->lru, &zone->inactive_list); | 154 | if (!pagevec_add(pvec, page)) |
| 129 | __count_vm_event(PGROTATED); | 155 | pagevec_move_tail(pvec); |
| 130 | } | 156 | local_irq_restore(flags); |
| 157 | |||
| 131 | if (!test_clear_page_writeback(page)) | 158 | if (!test_clear_page_writeback(page)) |
| 132 | BUG(); | 159 | BUG(); |
| 133 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 160 | |
| 134 | return 0; | 161 | return 0; |
| 135 | } | 162 | } |
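A rough sketch of what the batching above buys (my own estimate; PAGEVEC_SIZE is assumed to be 14, its usual value in pagevec.h):

    /*
     * Old behaviour: every rotated page took zone->lru_lock once.
     *     N rotations  ->  N spin_lock_irqsave(&zone->lru_lock) round trips
     *
     * New behaviour: pages are parked in the per-cpu lru_rotate_pvecs and
     * moved in batches by pagevec_move_tail(), which takes the lock once
     * per run of same-zone pages in the batch.
     *     N rotations  ->  roughly N / 14 lock round trips in the best case
     */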
| 136 | 163 | ||
| @@ -174,9 +201,6 @@ EXPORT_SYMBOL(mark_page_accessed); | |||
| 174 | * lru_cache_add: add a page to the page lists | 201 | * lru_cache_add: add a page to the page lists |
| 175 | * @page: the page to add | 202 | * @page: the page to add |
| 176 | */ | 203 | */ |
| 177 | static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; | ||
| 178 | static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; | ||
| 179 | |||
| 180 | void fastcall lru_cache_add(struct page *page) | 204 | void fastcall lru_cache_add(struct page *page) |
| 181 | { | 205 | { |
| 182 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); | 206 | struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); |
| @@ -197,21 +221,37 @@ void fastcall lru_cache_add_active(struct page *page) | |||
| 197 | put_cpu_var(lru_add_active_pvecs); | 221 | put_cpu_var(lru_add_active_pvecs); |
| 198 | } | 222 | } |
| 199 | 223 | ||
| 200 | static void __lru_add_drain(int cpu) | 224 | /* |
| 225 | * Drain pages out of the cpu's pagevecs. | ||
| 226 | * Either "cpu" is the current CPU, and preemption has already been | ||
| 227 | * disabled; or "cpu" is being hot-unplugged, and is already dead. | ||
| 228 | */ | ||
| 229 | static void drain_cpu_pagevecs(int cpu) | ||
| 201 | { | 230 | { |
| 202 | struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu); | 231 | struct pagevec *pvec; |
| 203 | 232 | ||
| 204 | /* CPU is dead, so no locking needed. */ | 233 | pvec = &per_cpu(lru_add_pvecs, cpu); |
| 205 | if (pagevec_count(pvec)) | 234 | if (pagevec_count(pvec)) |
| 206 | __pagevec_lru_add(pvec); | 235 | __pagevec_lru_add(pvec); |
| 236 | |||
| 207 | pvec = &per_cpu(lru_add_active_pvecs, cpu); | 237 | pvec = &per_cpu(lru_add_active_pvecs, cpu); |
| 208 | if (pagevec_count(pvec)) | 238 | if (pagevec_count(pvec)) |
| 209 | __pagevec_lru_add_active(pvec); | 239 | __pagevec_lru_add_active(pvec); |
| 240 | |||
| 241 | pvec = &per_cpu(lru_rotate_pvecs, cpu); | ||
| 242 | if (pagevec_count(pvec)) { | ||
| 243 | unsigned long flags; | ||
| 244 | |||
| 245 | /* No harm done if a racing interrupt already did this */ | ||
| 246 | local_irq_save(flags); | ||
| 247 | pagevec_move_tail(pvec); | ||
| 248 | local_irq_restore(flags); | ||
| 249 | } | ||
| 210 | } | 250 | } |
| 211 | 251 | ||
| 212 | void lru_add_drain(void) | 252 | void lru_add_drain(void) |
| 213 | { | 253 | { |
| 214 | __lru_add_drain(get_cpu()); | 254 | drain_cpu_pagevecs(get_cpu()); |
| 215 | put_cpu(); | 255 | put_cpu(); |
| 216 | } | 256 | } |
| 217 | 257 | ||
| @@ -258,6 +298,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
| 258 | int i; | 298 | int i; |
| 259 | struct pagevec pages_to_free; | 299 | struct pagevec pages_to_free; |
| 260 | struct zone *zone = NULL; | 300 | struct zone *zone = NULL; |
| 301 | unsigned long uninitialized_var(flags); | ||
| 261 | 302 | ||
| 262 | pagevec_init(&pages_to_free, cold); | 303 | pagevec_init(&pages_to_free, cold); |
| 263 | for (i = 0; i < nr; i++) { | 304 | for (i = 0; i < nr; i++) { |
| @@ -265,7 +306,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
| 265 | 306 | ||
| 266 | if (unlikely(PageCompound(page))) { | 307 | if (unlikely(PageCompound(page))) { |
| 267 | if (zone) { | 308 | if (zone) { |
| 268 | spin_unlock_irq(&zone->lru_lock); | 309 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| 269 | zone = NULL; | 310 | zone = NULL; |
| 270 | } | 311 | } |
| 271 | put_compound_page(page); | 312 | put_compound_page(page); |
| @@ -279,9 +320,10 @@ void release_pages(struct page **pages, int nr, int cold) | |||
| 279 | struct zone *pagezone = page_zone(page); | 320 | struct zone *pagezone = page_zone(page); |
| 280 | if (pagezone != zone) { | 321 | if (pagezone != zone) { |
| 281 | if (zone) | 322 | if (zone) |
| 282 | spin_unlock_irq(&zone->lru_lock); | 323 | spin_unlock_irqrestore(&zone->lru_lock, |
| 324 | flags); | ||
| 283 | zone = pagezone; | 325 | zone = pagezone; |
| 284 | spin_lock_irq(&zone->lru_lock); | 326 | spin_lock_irqsave(&zone->lru_lock, flags); |
| 285 | } | 327 | } |
| 286 | VM_BUG_ON(!PageLRU(page)); | 328 | VM_BUG_ON(!PageLRU(page)); |
| 287 | __ClearPageLRU(page); | 329 | __ClearPageLRU(page); |
| @@ -290,7 +332,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
| 290 | 332 | ||
| 291 | if (!pagevec_add(&pages_to_free, page)) { | 333 | if (!pagevec_add(&pages_to_free, page)) { |
| 292 | if (zone) { | 334 | if (zone) { |
| 293 | spin_unlock_irq(&zone->lru_lock); | 335 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| 294 | zone = NULL; | 336 | zone = NULL; |
| 295 | } | 337 | } |
| 296 | __pagevec_free(&pages_to_free); | 338 | __pagevec_free(&pages_to_free); |
| @@ -298,7 +340,7 @@ void release_pages(struct page **pages, int nr, int cold) | |||
| 298 | } | 340 | } |
| 299 | } | 341 | } |
| 300 | if (zone) | 342 | if (zone) |
| 301 | spin_unlock_irq(&zone->lru_lock); | 343 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| 302 | 344 | ||
| 303 | pagevec_free(&pages_to_free); | 345 | pagevec_free(&pages_to_free); |
| 304 | } | 346 | } |
| @@ -491,7 +533,7 @@ static int cpu_swap_callback(struct notifier_block *nfb, | |||
| 491 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { | 533 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { |
| 492 | atomic_add(*committed, &vm_committed_space); | 534 | atomic_add(*committed, &vm_committed_space); |
| 493 | *committed = 0; | 535 | *committed = 0; |
| 494 | __lru_add_drain((long)hcpu); | 536 | drain_cpu_pagevecs((long)hcpu); |
| 495 | } | 537 | } |
| 496 | return NOTIFY_OK; | 538 | return NOTIFY_OK; |
| 497 | } | 539 | } |
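The mm/swap.c hunks above replace the per-page zone->lru_lock acquisition in rotate_reclaimable_page() with a per-CPU pagevec (lru_rotate_pvecs) that pagevec_move_tail() flushes in one go, and drain_cpu_pagevecs() now drains it alongside the lru_add pagevecs. As a rough illustration of that batching idea, here is a minimal userspace sketch; the buffer size, struct and function names are invented for the example and are not the kernel's API.

```c
/* Simplified model of batched LRU rotation: gather pages per CPU and take
 * the (expensive) list lock once per batch instead of once per page.
 * All names here are illustrative; see pagevec_move_tail() above for the
 * real kernel implementation. */
#include <stdio.h>

#define BATCH 14                 /* pagevec-like batch size (illustrative) */

struct batch {
    int nr;
    int pages[BATCH];            /* stand-ins for struct page pointers */
};

static void move_tail_locked(int page)
{
    /* In the kernel this is list_move_tail() under zone->lru_lock. */
    printf("rotate page %d to tail of inactive list\n", page);
}

/* Flush the whole batch under a single (conceptual) lock acquisition. */
static void batch_move_tail(struct batch *b)
{
    /* lock(); */
    for (int i = 0; i < b->nr; i++)
        move_tail_locked(b->pages[i]);
    /* unlock(); */
    b->nr = 0;
}

/* Analogue of rotate_reclaimable_page(): queue, flush only when full. */
static void rotate_page(struct batch *b, int page)
{
    b->pages[b->nr++] = page;
    if (b->nr == BATCH)
        batch_move_tail(b);
}

int main(void)
{
    struct batch b = { 0 };

    for (int page = 0; page < 20; page++)
        rotate_page(&b, page);
    batch_move_tail(&b);         /* drain leftovers, like drain_cpu_pagevecs() */
    return 0;
}
```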
diff --git a/mm/swap_state.c b/mm/swap_state.c index 67daecb6031a..b52635601dfe 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
| @@ -74,6 +74,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, | |||
| 74 | { | 74 | { |
| 75 | int error; | 75 | int error; |
| 76 | 76 | ||
| 77 | BUG_ON(!PageLocked(page)); | ||
| 77 | BUG_ON(PageSwapCache(page)); | 78 | BUG_ON(PageSwapCache(page)); |
| 78 | BUG_ON(PagePrivate(page)); | 79 | BUG_ON(PagePrivate(page)); |
| 79 | error = radix_tree_preload(gfp_mask); | 80 | error = radix_tree_preload(gfp_mask); |
| @@ -83,7 +84,6 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, | |||
| 83 | entry.val, page); | 84 | entry.val, page); |
| 84 | if (!error) { | 85 | if (!error) { |
| 85 | page_cache_get(page); | 86 | page_cache_get(page); |
| 86 | SetPageLocked(page); | ||
| 87 | SetPageSwapCache(page); | 87 | SetPageSwapCache(page); |
| 88 | set_page_private(page, entry.val); | 88 | set_page_private(page, entry.val); |
| 89 | total_swapcache_pages++; | 89 | total_swapcache_pages++; |
| @@ -99,15 +99,18 @@ static int add_to_swap_cache(struct page *page, swp_entry_t entry) | |||
| 99 | { | 99 | { |
| 100 | int error; | 100 | int error; |
| 101 | 101 | ||
| 102 | BUG_ON(PageLocked(page)); | ||
| 102 | if (!swap_duplicate(entry)) { | 103 | if (!swap_duplicate(entry)) { |
| 103 | INC_CACHE_INFO(noent_race); | 104 | INC_CACHE_INFO(noent_race); |
| 104 | return -ENOENT; | 105 | return -ENOENT; |
| 105 | } | 106 | } |
| 107 | SetPageLocked(page); | ||
| 106 | error = __add_to_swap_cache(page, entry, GFP_KERNEL); | 108 | error = __add_to_swap_cache(page, entry, GFP_KERNEL); |
| 107 | /* | 109 | /* |
| 108 | * Anon pages are already on the LRU, we don't run lru_cache_add here. | 110 | * Anon pages are already on the LRU, we don't run lru_cache_add here. |
| 109 | */ | 111 | */ |
| 110 | if (error) { | 112 | if (error) { |
| 113 | ClearPageLocked(page); | ||
| 111 | swap_free(entry); | 114 | swap_free(entry); |
| 112 | if (error == -EEXIST) | 115 | if (error == -EEXIST) |
| 113 | INC_CACHE_INFO(exist_race); | 116 | INC_CACHE_INFO(exist_race); |
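The mm/swap_state.c hunk moves page locking out of __add_to_swap_cache(): add_to_swap_cache() now sets the page lock before the radix-tree insert, clears it again if the insert fails, and __add_to_swap_cache() merely asserts PageLocked(). A minimal sketch of that lock-before-insert shape, with placeholder names rather than the real page-flag helpers, is below.

```c
/* Sketch of the lock-before-insert pattern adopted above: the caller owns
 * the page lock across the cache insertion and releases it on failure.
 * try_insert() and the flag field are placeholders, not kernel APIs. */
#include <assert.h>
#include <errno.h>
#include <stdbool.h>

struct fake_page { bool locked; };

/* Models __add_to_swap_cache(): insertion requires the page to be locked. */
static int try_insert(struct fake_page *page)
{
    assert(page->locked);        /* BUG_ON(!PageLocked(page)) equivalent */
    return -EEXIST;              /* pretend someone else inserted it first */
}

static int add_to_cache(struct fake_page *page)
{
    page->locked = true;         /* SetPageLocked() before the attempt */
    int error = try_insert(page);
    if (error)
        page->locked = false;    /* ClearPageLocked() on failure */
    return error;
}

int main(void)
{
    struct fake_page page = { false };
    return add_to_cache(&page) == -EEXIST ? 0 : 1;
}
```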
diff --git a/mm/util.c b/mm/util.c --- a/mm/util.c +++ b/mm/util.c | |||
| @@ -81,14 +81,16 @@ EXPORT_SYMBOL(kmemdup); | |||
| 81 | void *krealloc(const void *p, size_t new_size, gfp_t flags) | 81 | void *krealloc(const void *p, size_t new_size, gfp_t flags) |
| 82 | { | 82 | { |
| 83 | void *ret; | 83 | void *ret; |
| 84 | size_t ks; | 84 | size_t ks = 0; |
| 85 | 85 | ||
| 86 | if (unlikely(!new_size)) { | 86 | if (unlikely(!new_size)) { |
| 87 | kfree(p); | 87 | kfree(p); |
| 88 | return ZERO_SIZE_PTR; | 88 | return ZERO_SIZE_PTR; |
| 89 | } | 89 | } |
| 90 | 90 | ||
| 91 | ks = ksize(p); | 91 | if (p) |
| 92 | ks = ksize(p); | ||
| 93 | |||
| 92 | if (ks >= new_size) | 94 | if (ks >= new_size) |
| 93 | return (void *)p; | 95 | return (void *)p; |
| 94 | 96 | ||
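The krealloc() change initialises ks to 0 and calls ksize() only when p is non-NULL, so krealloc(NULL, size, flags) degenerates into a fresh allocation instead of dereferencing NULL. A userspace analogue of that control flow is sketched below; my_ksize() and the explicit old_size parameter are stand-ins for the kernel's slab bookkeeping.

```c
/* Userspace analogue of the krealloc() fix: treat a NULL source as a
 * zero-sized allocation so the "reuse or grow" logic still works.
 * my_ksize() is a stand-in; the real ksize() reports the slab object size. */
#include <stdlib.h>
#include <string.h>

static size_t my_ksize(const void *p, size_t tracked_size)
{
    return p ? tracked_size : 0;
}

static void *my_krealloc(void *p, size_t old_size, size_t new_size)
{
    size_t ks = 0;

    if (!new_size) {
        free(p);
        return NULL;             /* the kernel returns ZERO_SIZE_PTR here */
    }

    if (p)                       /* the added NULL check */
        ks = my_ksize(p, old_size);

    if (ks >= new_size)          /* existing object is already big enough */
        return p;

    void *ret = malloc(new_size);
    if (ret && p) {
        memcpy(ret, p, ks);
        free(p);
    }
    return ret;
}

int main(void)
{
    char *buf = my_krealloc(NULL, 0, 16);   /* no NULL dereference */
    buf = my_krealloc(buf, 16, 8);          /* reuses the same object */
    free(buf);
    return 0;
}
```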
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3cee76a8c9f0..2e01af365848 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
| @@ -190,7 +190,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long fl | |||
| 190 | if (unlikely(!size)) | 190 | if (unlikely(!size)) |
| 191 | return NULL; | 191 | return NULL; |
| 192 | 192 | ||
| 193 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_LEVEL_MASK, node); | 193 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); |
| 194 | |||
| 194 | if (unlikely(!area)) | 195 | if (unlikely(!area)) |
| 195 | return NULL; | 196 | return NULL; |
| 196 | 197 | ||
| @@ -439,7 +440,7 @@ void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | |||
| 439 | area->flags |= VM_VPAGES; | 440 | area->flags |= VM_VPAGES; |
| 440 | } else { | 441 | } else { |
| 441 | pages = kmalloc_node(array_size, | 442 | pages = kmalloc_node(array_size, |
| 442 | (gfp_mask & GFP_LEVEL_MASK) | __GFP_ZERO, | 443 | (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO, |
| 443 | node); | 444 | node); |
| 444 | } | 445 | } |
| 445 | area->pages = pages; | 446 | area->pages = pages; |
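The mm/vmalloc.c hunks switch the internal kmalloc_node() calls from GFP_LEVEL_MASK to GFP_RECLAIM_MASK, i.e. only the reclaim-behaviour bits of the caller's gfp flags are propagated to the bookkeeping allocations. The fragment below illustrates that kind of flag filtering in the abstract; the flag values and mask are invented for the example and do not reflect the real gfp bit layout.

```c
/* Illustration of filtering caller flags down to an allowed subset before
 * passing them to an internal allocation, as the vmalloc hunks do with
 * GFP_RECLAIM_MASK.  Flag values are invented for this example. */
#include <stdio.h>

#define FLAG_WAIT      0x01u     /* may sleep */
#define FLAG_IO        0x02u     /* may start I/O */
#define FLAG_FS        0x04u     /* may call into the filesystem */
#define FLAG_HIGHMEM   0x10u     /* placement hint: meaningless internally */
#define FLAG_ZERO      0x20u     /* zero the allocation */

/* Only the reclaim-behaviour bits are propagated to internal allocations. */
#define RECLAIM_MASK   (FLAG_WAIT | FLAG_IO | FLAG_FS)

static unsigned int internal_alloc_flags(unsigned int caller_flags)
{
    return (caller_flags & RECLAIM_MASK) | FLAG_ZERO;
}

int main(void)
{
    unsigned int caller = FLAG_WAIT | FLAG_IO | FLAG_HIGHMEM;

    printf("caller=%#x internal=%#x\n", caller, internal_alloc_flags(caller));
    return 0;
}
```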
diff --git a/mm/vmscan.c b/mm/vmscan.c index a6e65d024995..bbd194630c5b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -932,6 +932,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 932 | long mapped_ratio; | 932 | long mapped_ratio; |
| 933 | long distress; | 933 | long distress; |
| 934 | long swap_tendency; | 934 | long swap_tendency; |
| 935 | long imbalance; | ||
| 935 | 936 | ||
| 936 | if (zone_is_near_oom(zone)) | 937 | if (zone_is_near_oom(zone)) |
| 937 | goto force_reclaim_mapped; | 938 | goto force_reclaim_mapped; |
| @@ -967,6 +968,46 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
| 967 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; | 968 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; |
| 968 | 969 | ||
| 969 | /* | 970 | /* |
| 971 | * If there's a huge imbalance between active and inactive | ||
| 972 | * (think active 100 times larger than inactive) we should | ||
| 973 | * become more permissive, or the system will take too much | ||
| 974 | * CPU before it starts swapping during memory pressure. | ||
| 975 | * Distress is about avoiding early-oom; this is about | ||
| 976 | * keeping swappiness graceful even when it is set to | ||
| 977 | * low values. | ||
| 978 | * | ||
| 979 | * Avoid div by zero with nr_inactive+1, and max resulting | ||
| 980 | * value is vm_total_pages. | ||
| 981 | */ | ||
| 982 | imbalance = zone_page_state(zone, NR_ACTIVE); | ||
| 983 | imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; | ||
| 984 | |||
| 985 | /* | ||
| 986 | * Reduce the effect of imbalance if swappiness is low, | ||
| 987 | * this means for a swappiness very low, the imbalance | ||
| 988 | * must be much higher than 100 for this logic to make | ||
| 989 | * the difference. | ||
| 990 | * | ||
| 991 | * Max temporary value is vm_total_pages*100. | ||
| 992 | */ | ||
| 993 | imbalance *= (vm_swappiness + 1); | ||
| 994 | imbalance /= 100; | ||
| 995 | |||
| 996 | /* | ||
| 997 | * If not much of the RAM is mapped, make the imbalance | ||
| 998 | * less relevant; refilling the inactive list with mapped | ||
| 999 | * pages is only a high priority when the ratio of mapped | ||
| 1000 | * pages is high. | ||
| 1001 | * | ||
| 1002 | * Max temporary value is vm_total_pages*100. | ||
| 1003 | */ | ||
| 1004 | imbalance *= mapped_ratio; | ||
| 1005 | imbalance /= 100; | ||
| 1006 | |||
| 1007 | /* apply imbalance feedback to swap_tendency */ | ||
| 1008 | swap_tendency += imbalance; | ||
| 1009 | |||
| 1010 | /* | ||
| 970 | * Now use this metric to decide whether to start moving mapped | 1011 | * Now use this metric to decide whether to start moving mapped |
| 971 | * memory onto the inactive list. | 1012 | * memory onto the inactive list. |
| 972 | */ | 1013 | */ |
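The new block in shrink_active_list() adds an imbalance term to swap_tendency: the active/inactive ratio, damped first by swappiness and then by mapped_ratio, with integer divisions in that order. The standalone function below reproduces that arithmetic so the effect is easy to follow; the sample inputs in main() are made up for illustration.

```c
/* Standalone rendering of the swap_tendency calculation above, including
 * the new imbalance feedback.  Inputs are sample values, not measurements. */
#include <stdio.h>

static long swap_tendency(long mapped_ratio, long distress, long swappiness,
                          long nr_active, long nr_inactive)
{
    long tendency = mapped_ratio / 2 + distress + swappiness;

    /* active/inactive ratio; the +1 avoids a division by zero */
    long imbalance = nr_active / (nr_inactive + 1);

    imbalance *= swappiness + 1;     /* damp when swappiness is low */
    imbalance /= 100;

    imbalance *= mapped_ratio;       /* damp when little RAM is mapped */
    imbalance /= 100;

    return tendency + imbalance;
}

int main(void)
{
    /* active list ~200x the inactive list, 80% mapped, swappiness 10:
     * 80/2 + 0 + 10 = 50, imbalance term = 16, so this prints 66 */
    printf("%ld\n", swap_tendency(80, 0, 10, 200000, 1000));
    return 0;
}
```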
| @@ -1371,7 +1412,13 @@ loop_again: | |||
| 1371 | temp_priority[i] = priority; | 1412 | temp_priority[i] = priority; |
| 1372 | sc.nr_scanned = 0; | 1413 | sc.nr_scanned = 0; |
| 1373 | note_zone_scanning_priority(zone, priority); | 1414 | note_zone_scanning_priority(zone, priority); |
| 1374 | nr_reclaimed += shrink_zone(priority, zone, &sc); | 1415 | /* |
| 1416 | * We put equal pressure on every zone, unless one | ||
| 1417 | * zone has way too many pages free already. | ||
| 1418 | */ | ||
| 1419 | if (!zone_watermark_ok(zone, order, 8*zone->pages_high, | ||
| 1420 | end_zone, 0)) | ||
| 1421 | nr_reclaimed += shrink_zone(priority, zone, &sc); | ||
| 1375 | reclaim_state->reclaimed_slab = 0; | 1422 | reclaim_state->reclaimed_slab = 0; |
| 1376 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, | 1423 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, |
| 1377 | lru_pages); | 1424 | lru_pages); |
| @@ -1688,9 +1735,11 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
| 1688 | { | 1735 | { |
| 1689 | pg_data_t *pgdat; | 1736 | pg_data_t *pgdat; |
| 1690 | cpumask_t mask; | 1737 | cpumask_t mask; |
| 1738 | int nid; | ||
| 1691 | 1739 | ||
| 1692 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { | 1740 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { |
| 1693 | for_each_online_pgdat(pgdat) { | 1741 | for_each_node_state(nid, N_HIGH_MEMORY) { |
| 1742 | pgdat = NODE_DATA(nid); | ||
| 1694 | mask = node_to_cpumask(pgdat->node_id); | 1743 | mask = node_to_cpumask(pgdat->node_id); |
| 1695 | if (any_online_cpu(mask) != NR_CPUS) | 1744 | if (any_online_cpu(mask) != NR_CPUS) |
| 1696 | /* One of our CPUs online: restore mask */ | 1745 | /* One of our CPUs online: restore mask */ |
| @@ -1727,7 +1776,7 @@ static int __init kswapd_init(void) | |||
| 1727 | int nid; | 1776 | int nid; |
| 1728 | 1777 | ||
| 1729 | swap_setup(); | 1778 | swap_setup(); |
| 1730 | for_each_online_node(nid) | 1779 | for_each_node_state(nid, N_HIGH_MEMORY) |
| 1731 | kswapd_run(nid); | 1780 | kswapd_run(nid); |
| 1732 | hotcpu_notifier(cpu_callback, 0); | 1781 | hotcpu_notifier(cpu_callback, 0); |
| 1733 | return 0; | 1782 | return 0; |
| @@ -1847,7 +1896,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 1847 | 1896 | ||
| 1848 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | 1897 | int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) |
| 1849 | { | 1898 | { |
| 1850 | cpumask_t mask; | ||
| 1851 | int node_id; | 1899 | int node_id; |
| 1852 | 1900 | ||
| 1853 | /* | 1901 | /* |
| @@ -1884,8 +1932,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
| 1884 | * as wide as possible. | 1932 | * as wide as possible. |
| 1885 | */ | 1933 | */ |
| 1886 | node_id = zone_to_nid(zone); | 1934 | node_id = zone_to_nid(zone); |
| 1887 | mask = node_to_cpumask(node_id); | 1935 | if (node_state(node_id, N_CPU) && node_id != numa_node_id()) |
| 1888 | if (!cpus_empty(mask) && node_id != numa_node_id()) | ||
| 1889 | return 0; | 1936 | return 0; |
| 1890 | return __zone_reclaim(zone, gfp_mask, order); | 1937 | return __zone_reclaim(zone, gfp_mask, order); |
| 1891 | } | 1938 | } |
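The kswapd and zone_reclaim() hunks stop scanning every online node (or building a cpumask by hand) and instead test node state bits: kswapd is started only on N_HIGH_MEMORY nodes, and zone_reclaim() bails out early for remote nodes that have CPUs (N_CPU). The toy bitmask below mimics that node-state filtering; the state names and helpers are illustrative stand-ins, not the kernel's nodemask API.

```c
/* Toy model of node-state filtering: keep one bitmask per node state and
 * iterate only the nodes whose bit is set, as for_each_node_state() does.
 * State names and helpers here are illustrative, not kernel definitions. */
#include <stdio.h>

#define MAX_NODES 8

enum toy_node_state { TOY_N_ONLINE, TOY_N_HIGH_MEMORY, TOY_N_CPU, TOY_NR_STATES };

static unsigned int node_states[TOY_NR_STATES];

static int node_state(int nid, enum toy_node_state state)
{
    return (node_states[state] >> nid) & 1u;
}

static void set_node_state(int nid, enum toy_node_state state)
{
    node_states[state] |= 1u << nid;
}

int main(void)
{
    /* node 0: memory + CPUs, node 1: memoryless, node 2: memory, no CPUs */
    set_node_state(0, TOY_N_HIGH_MEMORY); set_node_state(0, TOY_N_CPU);
    set_node_state(1, TOY_N_CPU);
    set_node_state(2, TOY_N_HIGH_MEMORY);

    /* start a "kswapd" only on nodes that actually have memory */
    for (int nid = 0; nid < MAX_NODES; nid++)
        if (node_state(nid, TOY_N_HIGH_MEMORY))
            printf("start kswapd on node %d\n", nid);

    /* zone_reclaim()-style check: skip remote nodes that have CPUs */
    int nid = 2, this_node = 0;
    if (node_state(nid, TOY_N_CPU) && nid != this_node)
        printf("skip reclaim on remote node %d\n", nid);
    else
        printf("reclaim locally on node %d\n", nid);
    return 0;
}
```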
diff --git a/mm/vmstat.c b/mm/vmstat.c index c64d169537bf..3b5e9043e7db 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
| @@ -353,23 +353,6 @@ void refresh_cpu_vm_stats(int cpu) | |||
| 353 | } | 353 | } |
| 354 | } | 354 | } |
| 355 | 355 | ||
| 356 | static void __refresh_cpu_vm_stats(void *dummy) | ||
| 357 | { | ||
| 358 | refresh_cpu_vm_stats(smp_processor_id()); | ||
| 359 | } | ||
| 360 | |||
| 361 | /* | ||
| 362 | * Consolidate all counters. | ||
| 363 | * | ||
| 364 | * Note that the result is less inaccurate but still inaccurate | ||
| 365 | * if concurrent processes are allowed to run. | ||
| 366 | */ | ||
| 367 | void refresh_vm_stats(void) | ||
| 368 | { | ||
| 369 | on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1); | ||
| 370 | } | ||
| 371 | EXPORT_SYMBOL(refresh_vm_stats); | ||
| 372 | |||
| 373 | #endif | 356 | #endif |
| 374 | 357 | ||
| 375 | #ifdef CONFIG_NUMA | 358 | #ifdef CONFIG_NUMA |
| @@ -398,6 +381,13 @@ void zone_statistics(struct zonelist *zonelist, struct zone *z) | |||
| 398 | 381 | ||
| 399 | #include <linux/seq_file.h> | 382 | #include <linux/seq_file.h> |
| 400 | 383 | ||
| 384 | static char * const migratetype_names[MIGRATE_TYPES] = { | ||
| 385 | "Unmovable", | ||
| 386 | "Reclaimable", | ||
| 387 | "Movable", | ||
| 388 | "Reserve", | ||
| 389 | }; | ||
| 390 | |||
| 401 | static void *frag_start(struct seq_file *m, loff_t *pos) | 391 | static void *frag_start(struct seq_file *m, loff_t *pos) |
| 402 | { | 392 | { |
| 403 | pg_data_t *pgdat; | 393 | pg_data_t *pgdat; |
| @@ -422,28 +412,144 @@ static void frag_stop(struct seq_file *m, void *arg) | |||
| 422 | { | 412 | { |
| 423 | } | 413 | } |
| 424 | 414 | ||
| 425 | /* | 415 | /* Walk all the zones in a node and print using a callback */ |
| 426 | * This walks the free areas for each zone. | 416 | static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, |
| 427 | */ | 417 | void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) |
| 428 | static int frag_show(struct seq_file *m, void *arg) | ||
| 429 | { | 418 | { |
| 430 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 431 | struct zone *zone; | 419 | struct zone *zone; |
| 432 | struct zone *node_zones = pgdat->node_zones; | 420 | struct zone *node_zones = pgdat->node_zones; |
| 433 | unsigned long flags; | 421 | unsigned long flags; |
| 434 | int order; | ||
| 435 | 422 | ||
| 436 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | 423 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { |
| 437 | if (!populated_zone(zone)) | 424 | if (!populated_zone(zone)) |
| 438 | continue; | 425 | continue; |
| 439 | 426 | ||
| 440 | spin_lock_irqsave(&zone->lock, flags); | 427 | spin_lock_irqsave(&zone->lock, flags); |
| 441 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | 428 | print(m, pgdat, zone); |
| 442 | for (order = 0; order < MAX_ORDER; ++order) | ||
| 443 | seq_printf(m, "%6lu ", zone->free_area[order].nr_free); | ||
| 444 | spin_unlock_irqrestore(&zone->lock, flags); | 429 | spin_unlock_irqrestore(&zone->lock, flags); |
| 430 | } | ||
| 431 | } | ||
| 432 | |||
| 433 | static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, | ||
| 434 | struct zone *zone) | ||
| 435 | { | ||
| 436 | int order; | ||
| 437 | |||
| 438 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
| 439 | for (order = 0; order < MAX_ORDER; ++order) | ||
| 440 | seq_printf(m, "%6lu ", zone->free_area[order].nr_free); | ||
| 441 | seq_putc(m, '\n'); | ||
| 442 | } | ||
| 443 | |||
| 444 | /* | ||
| 445 | * This walks the free areas for each zone. | ||
| 446 | */ | ||
| 447 | static int frag_show(struct seq_file *m, void *arg) | ||
| 448 | { | ||
| 449 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 450 | walk_zones_in_node(m, pgdat, frag_show_print); | ||
| 451 | return 0; | ||
| 452 | } | ||
| 453 | |||
| 454 | static void pagetypeinfo_showfree_print(struct seq_file *m, | ||
| 455 | pg_data_t *pgdat, struct zone *zone) | ||
| 456 | { | ||
| 457 | int order, mtype; | ||
| 458 | |||
| 459 | for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) { | ||
| 460 | seq_printf(m, "Node %4d, zone %8s, type %12s ", | ||
| 461 | pgdat->node_id, | ||
| 462 | zone->name, | ||
| 463 | migratetype_names[mtype]); | ||
| 464 | for (order = 0; order < MAX_ORDER; ++order) { | ||
| 465 | unsigned long freecount = 0; | ||
| 466 | struct free_area *area; | ||
| 467 | struct list_head *curr; | ||
| 468 | |||
| 469 | area = &(zone->free_area[order]); | ||
| 470 | |||
| 471 | list_for_each(curr, &area->free_list[mtype]) | ||
| 472 | freecount++; | ||
| 473 | seq_printf(m, "%6lu ", freecount); | ||
| 474 | } | ||
| 445 | seq_putc(m, '\n'); | 475 | seq_putc(m, '\n'); |
| 446 | } | 476 | } |
| 477 | } | ||
| 478 | |||
| 479 | /* Print out the free pages at each order for each migratetype */ | ||
| 480 | static int pagetypeinfo_showfree(struct seq_file *m, void *arg) | ||
| 481 | { | ||
| 482 | int order; | ||
| 483 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 484 | |||
| 485 | /* Print header */ | ||
| 486 | seq_printf(m, "%-43s ", "Free pages count per migrate type at order"); | ||
| 487 | for (order = 0; order < MAX_ORDER; ++order) | ||
| 488 | seq_printf(m, "%6d ", order); | ||
| 489 | seq_putc(m, '\n'); | ||
| 490 | |||
| 491 | walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print); | ||
| 492 | |||
| 493 | return 0; | ||
| 494 | } | ||
| 495 | |||
| 496 | static void pagetypeinfo_showblockcount_print(struct seq_file *m, | ||
| 497 | pg_data_t *pgdat, struct zone *zone) | ||
| 498 | { | ||
| 499 | int mtype; | ||
| 500 | unsigned long pfn; | ||
| 501 | unsigned long start_pfn = zone->zone_start_pfn; | ||
| 502 | unsigned long end_pfn = start_pfn + zone->spanned_pages; | ||
| 503 | unsigned long count[MIGRATE_TYPES] = { 0, }; | ||
| 504 | |||
| 505 | for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { | ||
| 506 | struct page *page; | ||
| 507 | |||
| 508 | if (!pfn_valid(pfn)) | ||
| 509 | continue; | ||
| 510 | |||
| 511 | page = pfn_to_page(pfn); | ||
| 512 | mtype = get_pageblock_migratetype(page); | ||
| 513 | |||
| 514 | count[mtype]++; | ||
| 515 | } | ||
| 516 | |||
| 517 | /* Print counts */ | ||
| 518 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
| 519 | for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) | ||
| 520 | seq_printf(m, "%12lu ", count[mtype]); | ||
| 521 | seq_putc(m, '\n'); | ||
| 522 | } | ||
| 523 | |||
| 524 | /* Print out the number of pageblocks for each migratetype */ | ||
| 525 | static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) | ||
| 526 | { | ||
| 527 | int mtype; | ||
| 528 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 529 | |||
| 530 | seq_printf(m, "\n%-23s", "Number of blocks type "); | ||
| 531 | for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) | ||
| 532 | seq_printf(m, "%12s ", migratetype_names[mtype]); | ||
| 533 | seq_putc(m, '\n'); | ||
| 534 | walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print); | ||
| 535 | |||
| 536 | return 0; | ||
| 537 | } | ||
| 538 | |||
| 539 | /* | ||
| 540 | * This prints out statistics in relation to grouping pages by mobility. | ||
| 541 | * It is expensive to collect so do not constantly read the file. | ||
| 542 | */ | ||
| 543 | static int pagetypeinfo_show(struct seq_file *m, void *arg) | ||
| 544 | { | ||
| 545 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 546 | |||
| 547 | seq_printf(m, "Page block order: %d\n", pageblock_order); | ||
| 548 | seq_printf(m, "Pages per block: %lu\n", pageblock_nr_pages); | ||
| 549 | seq_putc(m, '\n'); | ||
| 550 | pagetypeinfo_showfree(m, pgdat); | ||
| 551 | pagetypeinfo_showblockcount(m, pgdat); | ||
| 552 | |||
| 447 | return 0; | 553 | return 0; |
| 448 | } | 554 | } |
| 449 | 555 | ||
| @@ -454,6 +560,13 @@ const struct seq_operations fragmentation_op = { | |||
| 454 | .show = frag_show, | 560 | .show = frag_show, |
| 455 | }; | 561 | }; |
| 456 | 562 | ||
| 563 | const struct seq_operations pagetypeinfo_op = { | ||
| 564 | .start = frag_start, | ||
| 565 | .next = frag_next, | ||
| 566 | .stop = frag_stop, | ||
| 567 | .show = pagetypeinfo_show, | ||
| 568 | }; | ||
| 569 | |||
| 457 | #ifdef CONFIG_ZONE_DMA | 570 | #ifdef CONFIG_ZONE_DMA |
| 458 | #define TEXT_FOR_DMA(xx) xx "_dma", | 571 | #define TEXT_FOR_DMA(xx) xx "_dma", |
| 459 | #else | 572 | #else |
| @@ -532,84 +645,78 @@ static const char * const vmstat_text[] = { | |||
| 532 | #endif | 645 | #endif |
| 533 | }; | 646 | }; |
| 534 | 647 | ||
| 535 | /* | 648 | static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, |
| 536 | * Output information about zones in @pgdat. | 649 | struct zone *zone) |
| 537 | */ | ||
| 538 | static int zoneinfo_show(struct seq_file *m, void *arg) | ||
| 539 | { | 650 | { |
| 540 | pg_data_t *pgdat = arg; | 651 | int i; |
| 541 | struct zone *zone; | 652 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); |
| 542 | struct zone *node_zones = pgdat->node_zones; | 653 | seq_printf(m, |
| 543 | unsigned long flags; | 654 | "\n pages free %lu" |
| 544 | 655 | "\n min %lu" | |
| 545 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { | 656 | "\n low %lu" |
| 546 | int i; | 657 | "\n high %lu" |
| 547 | 658 | "\n scanned %lu (a: %lu i: %lu)" | |
| 548 | if (!populated_zone(zone)) | 659 | "\n spanned %lu" |
| 549 | continue; | 660 | "\n present %lu", |
| 550 | 661 | zone_page_state(zone, NR_FREE_PAGES), | |
| 551 | spin_lock_irqsave(&zone->lock, flags); | 662 | zone->pages_min, |
| 552 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); | 663 | zone->pages_low, |
| 553 | seq_printf(m, | 664 | zone->pages_high, |
| 554 | "\n pages free %lu" | 665 | zone->pages_scanned, |
| 555 | "\n min %lu" | 666 | zone->nr_scan_active, zone->nr_scan_inactive, |
| 556 | "\n low %lu" | 667 | zone->spanned_pages, |
| 557 | "\n high %lu" | 668 | zone->present_pages); |
| 558 | "\n scanned %lu (a: %lu i: %lu)" | ||
| 559 | "\n spanned %lu" | ||
| 560 | "\n present %lu", | ||
| 561 | zone_page_state(zone, NR_FREE_PAGES), | ||
| 562 | zone->pages_min, | ||
| 563 | zone->pages_low, | ||
| 564 | zone->pages_high, | ||
| 565 | zone->pages_scanned, | ||
| 566 | zone->nr_scan_active, zone->nr_scan_inactive, | ||
| 567 | zone->spanned_pages, | ||
| 568 | zone->present_pages); | ||
| 569 | 669 | ||
| 570 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 670 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
| 571 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], | 671 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], |
| 572 | zone_page_state(zone, i)); | 672 | zone_page_state(zone, i)); |
| 573 | 673 | ||
| 574 | seq_printf(m, | 674 | seq_printf(m, |
| 575 | "\n protection: (%lu", | 675 | "\n protection: (%lu", |
| 576 | zone->lowmem_reserve[0]); | 676 | zone->lowmem_reserve[0]); |
| 577 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) | 677 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) |
| 578 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); | 678 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); |
| 579 | seq_printf(m, | 679 | seq_printf(m, |
| 580 | ")" | 680 | ")" |
| 581 | "\n pagesets"); | 681 | "\n pagesets"); |
| 582 | for_each_online_cpu(i) { | 682 | for_each_online_cpu(i) { |
| 583 | struct per_cpu_pageset *pageset; | 683 | struct per_cpu_pageset *pageset; |
| 584 | int j; | 684 | int j; |
| 585 | 685 | ||
| 586 | pageset = zone_pcp(zone, i); | 686 | pageset = zone_pcp(zone, i); |
| 587 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | 687 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { |
| 588 | seq_printf(m, | 688 | seq_printf(m, |
| 589 | "\n cpu: %i pcp: %i" | 689 | "\n cpu: %i pcp: %i" |
| 590 | "\n count: %i" | 690 | "\n count: %i" |
| 591 | "\n high: %i" | 691 | "\n high: %i" |
| 592 | "\n batch: %i", | 692 | "\n batch: %i", |
| 593 | i, j, | 693 | i, j, |
| 594 | pageset->pcp[j].count, | 694 | pageset->pcp[j].count, |
| 595 | pageset->pcp[j].high, | 695 | pageset->pcp[j].high, |
| 596 | pageset->pcp[j].batch); | 696 | pageset->pcp[j].batch); |
| 597 | } | 697 | } |
| 598 | #ifdef CONFIG_SMP | 698 | #ifdef CONFIG_SMP |
| 599 | seq_printf(m, "\n vm stats threshold: %d", | 699 | seq_printf(m, "\n vm stats threshold: %d", |
| 600 | pageset->stat_threshold); | 700 | pageset->stat_threshold); |
| 601 | #endif | 701 | #endif |
| 602 | } | ||
| 603 | seq_printf(m, | ||
| 604 | "\n all_unreclaimable: %u" | ||
| 605 | "\n prev_priority: %i" | ||
| 606 | "\n start_pfn: %lu", | ||
| 607 | zone->all_unreclaimable, | ||
| 608 | zone->prev_priority, | ||
| 609 | zone->zone_start_pfn); | ||
| 610 | spin_unlock_irqrestore(&zone->lock, flags); | ||
| 611 | seq_putc(m, '\n'); | ||
| 612 | } | 702 | } |
| 703 | seq_printf(m, | ||
| 704 | "\n all_unreclaimable: %u" | ||
| 705 | "\n prev_priority: %i" | ||
| 706 | "\n start_pfn: %lu", | ||
| 707 | zone->all_unreclaimable, | ||
| 708 | zone->prev_priority, | ||
| 709 | zone->zone_start_pfn); | ||
| 710 | seq_putc(m, '\n'); | ||
| 711 | } | ||
| 712 | |||
| 713 | /* | ||
| 714 | * Output information about zones in @pgdat. | ||
| 715 | */ | ||
| 716 | static int zoneinfo_show(struct seq_file *m, void *arg) | ||
| 717 | { | ||
| 718 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
| 719 | walk_zones_in_node(m, pgdat, zoneinfo_show_print); | ||
| 613 | return 0; | 720 | return 0; |
| 614 | } | 721 | } |
| 615 | 722 | ||
| @@ -741,7 +848,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
| 741 | static struct notifier_block __cpuinitdata vmstat_notifier = | 848 | static struct notifier_block __cpuinitdata vmstat_notifier = |
| 742 | { &vmstat_cpuup_callback, NULL, 0 }; | 849 | { &vmstat_cpuup_callback, NULL, 0 }; |
| 743 | 850 | ||
| 744 | int __init setup_vmstat(void) | 851 | static int __init setup_vmstat(void) |
| 745 | { | 852 | { |
| 746 | int cpu; | 853 | int cpu; |
| 747 | 854 | ||
