| Field | Value | Date |
|---|---|---|
| author | Steven Whitehouse <swhiteho@redhat.com> | 2006-07-03 10:25:08 -0400 |
| committer | Steven Whitehouse <swhiteho@redhat.com> | 2006-07-03 10:25:08 -0400 |
| commit | 0a1340c185734a57fbf4775927966ad4a1347b02 | |
| tree | d9ed8f0dd809a7c542a3356601125ea5b5aaa804 /mm | |
| parent | af18ddb8864b096e3ed4732e2d4b21c956dcfe3a | |
| parent | 29454dde27d8e340bb1987bad9aa504af7081eba | |
Merge rsync://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts:
include/linux/kernel.h
Diffstat (limited to 'mm')
| Mode | File | Lines changed |
|---|---|---|
| -rw-r--r-- | mm/Kconfig | 13 |
| -rw-r--r-- | mm/Makefile | 2 |
| -rw-r--r-- | mm/filemap.c | 257 |
| -rw-r--r-- | mm/filemap.h | 36 |
| -rw-r--r-- | mm/filemap_xip.c | 2 |
| -rw-r--r-- | mm/fremap.c | 9 |
| -rw-r--r-- | mm/highmem.c | 6 |
| -rw-r--r-- | mm/hugetlb.c | 282 |
| -rw-r--r-- | mm/memory.c | 133 |
| -rw-r--r-- | mm/memory_hotplug.c | 154 |
| -rw-r--r-- | mm/mempolicy.c | 54 |
| -rw-r--r-- | mm/migrate.c | 1076 |
| -rw-r--r-- | mm/mmap.c | 14 |
| -rw-r--r-- | mm/mmzone.c | 1 |
| -rw-r--r-- | mm/mprotect.c | 37 |
| -rw-r--r-- | mm/msync.c | 3 |
| -rw-r--r-- | mm/nommu.c | 2 |
| -rw-r--r-- | mm/oom_kill.c | 9 |
| -rw-r--r-- | mm/page-writeback.c | 100 |
| -rw-r--r-- | mm/page_alloc.c | 670 |
| -rw-r--r-- | mm/page_io.c | 4 |
| -rw-r--r-- | mm/pdflush.c | 18 |
| -rw-r--r-- | mm/readahead.c | 20 |
| -rw-r--r-- | mm/rmap.c | 121 |
| -rw-r--r-- | mm/shmem.c | 34 |
| -rw-r--r-- | mm/slab.c | 392 |
| -rw-r--r-- | mm/slob.c | 1 |
| -rw-r--r-- | mm/sparse.c | 25 |
| -rw-r--r-- | mm/swap.c | 49 |
| -rw-r--r-- | mm/swap_state.c | 6 |
| -rw-r--r-- | mm/swapfile.c | 44 |
| -rw-r--r-- | mm/tiny-shmem.c | 4 |
| -rw-r--r-- | mm/truncate.c | 22 |
| -rw-r--r-- | mm/vmalloc.c | 122 |
| -rw-r--r-- | mm/vmscan.c | 341 |
| -rw-r--r-- | mm/vmstat.c | 614 |
36 files changed, 2879 insertions, 1798 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 332f5c29b53a..8f5b45615f7b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -115,7 +115,8 @@ config SPARSEMEM_EXTREME | |||
115 | # eventually, we can have this option just 'select SPARSEMEM' | 115 | # eventually, we can have this option just 'select SPARSEMEM' |
116 | config MEMORY_HOTPLUG | 116 | config MEMORY_HOTPLUG |
117 | bool "Allow for memory hot-add" | 117 | bool "Allow for memory hot-add" |
118 | depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND | 118 | depends on SPARSEMEM && HOTPLUG && !SOFTWARE_SUSPEND && ARCH_ENABLE_MEMORY_HOTPLUG |
119 | depends on (IA64 || X86 || PPC64) | ||
119 | 120 | ||
120 | comment "Memory hotplug is currently incompatible with Software Suspend" | 121 | comment "Memory hotplug is currently incompatible with Software Suspend" |
121 | depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND | 122 | depends on SPARSEMEM && HOTPLUG && SOFTWARE_SUSPEND |
@@ -138,10 +139,16 @@ config SPLIT_PTLOCK_CPUS | |||
138 | # | 139 | # |
139 | config MIGRATION | 140 | config MIGRATION |
140 | bool "Page migration" | 141 | bool "Page migration" |
141 | def_bool y if NUMA | 142 | def_bool y |
142 | depends on SWAP && NUMA | 143 | depends on NUMA |
143 | help | 144 | help |
144 | Allows the migration of the physical location of pages of processes | 145 | Allows the migration of the physical location of pages of processes |
145 | while the virtual addresses are not changed. This is useful for | 146 | while the virtual addresses are not changed. This is useful for |
146 | example on NUMA systems to put pages nearer to the processors accessing | 147 | example on NUMA systems to put pages nearer to the processors accessing |
147 | the page. | 148 | the page. |
149 | |||
150 | config RESOURCES_64BIT | ||
151 | bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL) | ||
152 | default 64BIT | ||
153 | help | ||
154 | This option allows memory and IO resources to be 64 bit. | ||
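The MIGRATION help text above describes moving the physical backing of pages while their virtual addresses stay fixed. As a rough userspace illustration of the feature on a NUMA machine — assuming a current libnuma that provides numa_migrate_pages(), which wraps the migrate_pages() system call that this config option backs — a process's pages can be shifted between nodes like this:

```c
/*
 * Illustrative sketch only: move every page of a process from NUMA
 * node 0 to node 1.  Build with -lnuma; needs NUMA and page migration
 * support in the running kernel.
 */
#include <numa.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	struct bitmask *from, *to;
	int pid = (argc > 1) ? atoi(argv[1]) : 0;   /* 0 means "the calling process" */

	if (numa_available() < 0) {
		fprintf(stderr, "no NUMA support on this system\n");
		return 1;
	}

	from = numa_parse_nodestring("0");
	to   = numa_parse_nodestring("1");

	/* Virtual addresses in the target process are left unchanged. */
	if (numa_migrate_pages(pid, from, to) < 0)
		perror("numa_migrate_pages");

	numa_free_nodemask(from);
	numa_free_nodemask(to);
	return 0;
}
```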
diff --git a/mm/Makefile b/mm/Makefile
index 0b8f73f2ed16..9dd824c11eeb 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | |||
10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ |
11 | page_alloc.o page-writeback.o pdflush.o \ | 11 | page_alloc.o page-writeback.o pdflush.o \ |
12 | readahead.o swap.o truncate.o vmscan.o \ | 12 | readahead.o swap.o truncate.o vmscan.o \ |
13 | prio_tree.o util.o mmzone.o $(mmu-y) | 13 | prio_tree.o util.o mmzone.o vmstat.o $(mmu-y) |
14 | 14 | ||
15 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o | 15 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o |
16 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 16 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
diff --git a/mm/filemap.c b/mm/filemap.c
index a02a0b2c986b..b9c91ab7f0f8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -9,11 +9,11 @@ | |||
9 | * most "normal" filesystems (but you don't /have/ to use this: | 9 | * most "normal" filesystems (but you don't /have/ to use this: |
10 | * the NFS filesystem used to do this differently, for example) | 10 | * the NFS filesystem used to do this differently, for example) |
11 | */ | 11 | */ |
12 | #include <linux/config.h> | ||
13 | #include <linux/module.h> | 12 | #include <linux/module.h> |
14 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
15 | #include <linux/compiler.h> | 14 | #include <linux/compiler.h> |
16 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
16 | #include <linux/uaccess.h> | ||
17 | #include <linux/aio.h> | 17 | #include <linux/aio.h> |
18 | #include <linux/capability.h> | 18 | #include <linux/capability.h> |
19 | #include <linux/kernel_stat.h> | 19 | #include <linux/kernel_stat.h> |
@@ -38,7 +38,6 @@ | |||
38 | */ | 38 | */ |
39 | #include <linux/buffer_head.h> /* for generic_osync_inode */ | 39 | #include <linux/buffer_head.h> /* for generic_osync_inode */ |
40 | 40 | ||
41 | #include <asm/uaccess.h> | ||
42 | #include <asm/mman.h> | 41 | #include <asm/mman.h> |
43 | 42 | ||
44 | static ssize_t | 43 | static ssize_t |
@@ -120,7 +119,7 @@ void __remove_from_page_cache(struct page *page) | |||
120 | radix_tree_delete(&mapping->page_tree, page->index); | 119 | radix_tree_delete(&mapping->page_tree, page->index); |
121 | page->mapping = NULL; | 120 | page->mapping = NULL; |
122 | mapping->nrpages--; | 121 | mapping->nrpages--; |
123 | pagecache_acct(-1); | 122 | __dec_zone_page_state(page, NR_FILE_PAGES); |
124 | } | 123 | } |
125 | 124 | ||
126 | void remove_from_page_cache(struct page *page) | 125 | void remove_from_page_cache(struct page *page) |
@@ -171,15 +170,17 @@ static int sync_page(void *word) | |||
171 | } | 170 | } |
172 | 171 | ||
173 | /** | 172 | /** |
174 | * filemap_fdatawrite_range - start writeback against all of a mapping's | 173 | * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range |
175 | * dirty pages that lie within the byte offsets <start, end> | ||
176 | * @mapping: address space structure to write | 174 | * @mapping: address space structure to write |
177 | * @start: offset in bytes where the range starts | 175 | * @start: offset in bytes where the range starts |
178 | * @end: offset in bytes where the range ends (inclusive) | 176 | * @end: offset in bytes where the range ends (inclusive) |
179 | * @sync_mode: enable synchronous operation | 177 | * @sync_mode: enable synchronous operation |
180 | * | 178 | * |
179 | * Start writeback against all of a mapping's dirty pages that lie | ||
180 | * within the byte offsets <start, end> inclusive. | ||
181 | * | ||
181 | * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as | 182 | * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as |
182 | * opposed to a regular memory * cleansing writeback. The difference between | 183 | * opposed to a regular memory cleansing writeback. The difference between |
183 | * these two operations is that if a dirty page/buffer is encountered, it must | 184 | * these two operations is that if a dirty page/buffer is encountered, it must |
184 | * be waited upon, and not just skipped over. | 185 | * be waited upon, and not just skipped over. |
185 | */ | 186 | */ |
@@ -190,8 +191,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | |||
190 | struct writeback_control wbc = { | 191 | struct writeback_control wbc = { |
191 | .sync_mode = sync_mode, | 192 | .sync_mode = sync_mode, |
192 | .nr_to_write = mapping->nrpages * 2, | 193 | .nr_to_write = mapping->nrpages * 2, |
193 | .start = start, | 194 | .range_start = start, |
194 | .end = end, | 195 | .range_end = end, |
195 | }; | 196 | }; |
196 | 197 | ||
197 | if (!mapping_cap_writeback_dirty(mapping)) | 198 | if (!mapping_cap_writeback_dirty(mapping)) |
@@ -204,7 +205,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | |||
204 | static inline int __filemap_fdatawrite(struct address_space *mapping, | 205 | static inline int __filemap_fdatawrite(struct address_space *mapping, |
205 | int sync_mode) | 206 | int sync_mode) |
206 | { | 207 | { |
207 | return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode); | 208 | return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode); |
208 | } | 209 | } |
209 | 210 | ||
210 | int filemap_fdatawrite(struct address_space *mapping) | 211 | int filemap_fdatawrite(struct address_space *mapping) |
@@ -219,7 +220,10 @@ static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | |||
219 | return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); | 220 | return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); |
220 | } | 221 | } |
221 | 222 | ||
222 | /* | 223 | /** |
224 | * filemap_flush - mostly a non-blocking flush | ||
225 | * @mapping: target address_space | ||
226 | * | ||
223 | * This is a mostly non-blocking flush. Not suitable for data-integrity | 227 | * This is a mostly non-blocking flush. Not suitable for data-integrity |
224 | * purposes - I/O may not be started against all dirty pages. | 228 | * purposes - I/O may not be started against all dirty pages. |
225 | */ | 229 | */ |
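The comment block above distinguishes a best-effort flush from a data-integrity writeback, and the two exported entry points touched by this patch map directly onto that split. A minimal sketch of a caller choosing between them (the wrapper function is illustrative, not part of the patch):

```c
#include <linux/fs.h>

/* Illustrative only: pick a writeback flavour for an address_space. */
static int example_writeback(struct address_space *mapping, int for_integrity)
{
	if (!for_integrity)
		/* WB_SYNC_NONE: start I/O where convenient, pages may be skipped */
		return filemap_flush(mapping);

	/* WB_SYNC_ALL: write every dirty page and wait for the I/O to finish */
	return filemap_write_and_wait(mapping);
}
```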
@@ -229,7 +233,12 @@ int filemap_flush(struct address_space *mapping) | |||
229 | } | 233 | } |
230 | EXPORT_SYMBOL(filemap_flush); | 234 | EXPORT_SYMBOL(filemap_flush); |
231 | 235 | ||
232 | /* | 236 | /** |
237 | * wait_on_page_writeback_range - wait for writeback to complete | ||
238 | * @mapping: target address_space | ||
239 | * @start: beginning page index | ||
240 | * @end: ending page index | ||
241 | * | ||
233 | * Wait for writeback to complete against pages indexed by start->end | 242 | * Wait for writeback to complete against pages indexed by start->end |
234 | * inclusive | 243 | * inclusive |
235 | */ | 244 | */ |
@@ -276,7 +285,13 @@ int wait_on_page_writeback_range(struct address_space *mapping, | |||
276 | return ret; | 285 | return ret; |
277 | } | 286 | } |
278 | 287 | ||
279 | /* | 288 | /** |
289 | * sync_page_range - write and wait on all pages in the passed range | ||
290 | * @inode: target inode | ||
291 | * @mapping: target address_space | ||
292 | * @pos: beginning offset in pages to write | ||
293 | * @count: number of bytes to write | ||
294 | * | ||
280 | * Write and wait upon all the pages in the passed range. This is a "data | 295 | * Write and wait upon all the pages in the passed range. This is a "data |
281 | * integrity" operation. It waits upon in-flight writeout before starting and | 296 | * integrity" operation. It waits upon in-flight writeout before starting and |
282 | * waiting upon new writeout. If there was an IO error, return it. | 297 | * waiting upon new writeout. If there was an IO error, return it. |
@@ -305,7 +320,13 @@ int sync_page_range(struct inode *inode, struct address_space *mapping, | |||
305 | } | 320 | } |
306 | EXPORT_SYMBOL(sync_page_range); | 321 | EXPORT_SYMBOL(sync_page_range); |
307 | 322 | ||
308 | /* | 323 | /** |
324 | * sync_page_range_nolock | ||
325 | * @inode: target inode | ||
326 | * @mapping: target address_space | ||
327 | * @pos: beginning offset in pages to write | ||
328 | * @count: number of bytes to write | ||
329 | * | ||
309 | * Note: Holding i_mutex across sync_page_range_nolock is not a good idea | 330 | * Note: Holding i_mutex across sync_page_range_nolock is not a good idea |
310 | * as it forces O_SYNC writers to different parts of the same file | 331 | * as it forces O_SYNC writers to different parts of the same file |
311 | * to be serialised right until io completion. | 332 | * to be serialised right until io completion. |
@@ -329,10 +350,11 @@ int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, | |||
329 | EXPORT_SYMBOL(sync_page_range_nolock); | 350 | EXPORT_SYMBOL(sync_page_range_nolock); |
330 | 351 | ||
331 | /** | 352 | /** |
332 | * filemap_fdatawait - walk the list of under-writeback pages of the given | 353 | * filemap_fdatawait - wait for all under-writeback pages to complete |
333 | * address space and wait for all of them. | ||
334 | * | ||
335 | * @mapping: address space structure to wait for | 354 | * @mapping: address space structure to wait for |
355 | * | ||
356 | * Walk the list of under-writeback pages of the given address space | ||
357 | * and wait for all of them. | ||
336 | */ | 358 | */ |
337 | int filemap_fdatawait(struct address_space *mapping) | 359 | int filemap_fdatawait(struct address_space *mapping) |
338 | { | 360 | { |
@@ -368,7 +390,12 @@ int filemap_write_and_wait(struct address_space *mapping) | |||
368 | } | 390 | } |
369 | EXPORT_SYMBOL(filemap_write_and_wait); | 391 | EXPORT_SYMBOL(filemap_write_and_wait); |
370 | 392 | ||
371 | /* | 393 | /** |
394 | * filemap_write_and_wait_range - write out & wait on a file range | ||
395 | * @mapping: the address_space for the pages | ||
396 | * @lstart: offset in bytes where the range starts | ||
397 | * @lend: offset in bytes where the range ends (inclusive) | ||
398 | * | ||
372 | * Write out and wait upon file offsets lstart->lend, inclusive. | 399 | * Write out and wait upon file offsets lstart->lend, inclusive. |
373 | * | 400 | * |
374 | * Note that `lend' is inclusive (describes the last byte to be written) so | 401 | * Note that `lend' is inclusive (describes the last byte to be written) so |
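As the kernel-doc added above spells out, @lend names the last byte of the range rather than an exclusive end, so a caller flushing `count` bytes starting at `pos` passes `pos + count - 1`. A minimal sketch of that calling pattern (the helper name is an illustration, assuming filemap_write_and_wait_range() is visible to the caller):

```c
#include <linux/fs.h>

/* Illustrative helper: write out and wait on one byte range of a file. */
static int example_sync_byte_range(struct address_space *mapping,
				   loff_t pos, size_t count)
{
	if (count == 0)
		return 0;
	/* lend is inclusive: the last byte to be written, hence the "- 1" */
	return filemap_write_and_wait_range(mapping, pos,
					    pos + (loff_t)count - 1);
}
```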
@@ -394,8 +421,14 @@ int filemap_write_and_wait_range(struct address_space *mapping, | |||
394 | return err; | 421 | return err; |
395 | } | 422 | } |
396 | 423 | ||
397 | /* | 424 | /** |
398 | * This function is used to add newly allocated pagecache pages: | 425 | * add_to_page_cache - add newly allocated pagecache pages |
426 | * @page: page to add | ||
427 | * @mapping: the page's address_space | ||
428 | * @offset: page index | ||
429 | * @gfp_mask: page allocation mode | ||
430 | * | ||
431 | * This function is used to add newly allocated pagecache pages; | ||
399 | * the page is new, so we can just run SetPageLocked() against it. | 432 | * the page is new, so we can just run SetPageLocked() against it. |
400 | * The other page state flags were set by rmqueue(). | 433 | * The other page state flags were set by rmqueue(). |
401 | * | 434 | * |
@@ -415,14 +448,13 @@ int add_to_page_cache(struct page *page, struct address_space *mapping, | |||
415 | page->mapping = mapping; | 448 | page->mapping = mapping; |
416 | page->index = offset; | 449 | page->index = offset; |
417 | mapping->nrpages++; | 450 | mapping->nrpages++; |
418 | pagecache_acct(1); | 451 | __inc_zone_page_state(page, NR_FILE_PAGES); |
419 | } | 452 | } |
420 | write_unlock_irq(&mapping->tree_lock); | 453 | write_unlock_irq(&mapping->tree_lock); |
421 | radix_tree_preload_end(); | 454 | radix_tree_preload_end(); |
422 | } | 455 | } |
423 | return error; | 456 | return error; |
424 | } | 457 | } |
425 | |||
426 | EXPORT_SYMBOL(add_to_page_cache); | 458 | EXPORT_SYMBOL(add_to_page_cache); |
427 | 459 | ||
428 | int add_to_page_cache_lru(struct page *page, struct address_space *mapping, | 460 | int add_to_page_cache_lru(struct page *page, struct address_space *mapping, |
@@ -489,8 +521,7 @@ void fastcall wait_on_page_bit(struct page *page, int bit_nr) | |||
489 | EXPORT_SYMBOL(wait_on_page_bit); | 521 | EXPORT_SYMBOL(wait_on_page_bit); |
490 | 522 | ||
491 | /** | 523 | /** |
492 | * unlock_page() - unlock a locked page | 524 | * unlock_page - unlock a locked page |
493 | * | ||
494 | * @page: the page | 525 | * @page: the page |
495 | * | 526 | * |
496 | * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). | 527 | * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). |
@@ -513,8 +544,9 @@ void fastcall unlock_page(struct page *page) | |||
513 | } | 544 | } |
514 | EXPORT_SYMBOL(unlock_page); | 545 | EXPORT_SYMBOL(unlock_page); |
515 | 546 | ||
516 | /* | 547 | /** |
517 | * End writeback against a page. | 548 | * end_page_writeback - end writeback against a page |
549 | * @page: the page | ||
518 | */ | 550 | */ |
519 | void end_page_writeback(struct page *page) | 551 | void end_page_writeback(struct page *page) |
520 | { | 552 | { |
@@ -527,10 +559,11 @@ void end_page_writeback(struct page *page) | |||
527 | } | 559 | } |
528 | EXPORT_SYMBOL(end_page_writeback); | 560 | EXPORT_SYMBOL(end_page_writeback); |
529 | 561 | ||
530 | /* | 562 | /** |
531 | * Get a lock on the page, assuming we need to sleep to get it. | 563 | * __lock_page - get a lock on the page, assuming we need to sleep to get it |
564 | * @page: the page to lock | ||
532 | * | 565 | * |
533 | * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some | 566 | * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some |
534 | * random driver's requestfn sets TASK_RUNNING, we could busywait. However | 567 | * random driver's requestfn sets TASK_RUNNING, we could busywait. However |
535 | * chances are that on the second loop, the block layer's plug list is empty, | 568 | * chances are that on the second loop, the block layer's plug list is empty, |
536 | * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. | 569 | * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. |
@@ -544,8 +577,12 @@ void fastcall __lock_page(struct page *page) | |||
544 | } | 577 | } |
545 | EXPORT_SYMBOL(__lock_page); | 578 | EXPORT_SYMBOL(__lock_page); |
546 | 579 | ||
547 | /* | 580 | /** |
548 | * a rather lightweight function, finding and getting a reference to a | 581 | * find_get_page - find and get a page reference |
582 | * @mapping: the address_space to search | ||
583 | * @offset: the page index | ||
584 | * | ||
585 | * A rather lightweight function, finding and getting a reference to a | ||
549 | * hashed page atomically. | 586 | * hashed page atomically. |
550 | */ | 587 | */ |
551 | struct page * find_get_page(struct address_space *mapping, unsigned long offset) | 588 | struct page * find_get_page(struct address_space *mapping, unsigned long offset) |
@@ -559,11 +596,14 @@ struct page * find_get_page(struct address_space *mapping, unsigned long offset) | |||
559 | read_unlock_irq(&mapping->tree_lock); | 596 | read_unlock_irq(&mapping->tree_lock); |
560 | return page; | 597 | return page; |
561 | } | 598 | } |
562 | |||
563 | EXPORT_SYMBOL(find_get_page); | 599 | EXPORT_SYMBOL(find_get_page); |
564 | 600 | ||
565 | /* | 601 | /** |
566 | * Same as above, but trylock it instead of incrementing the count. | 602 | * find_trylock_page - find and lock a page |
603 | * @mapping: the address_space to search | ||
604 | * @offset: the page index | ||
605 | * | ||
606 | * Same as find_get_page(), but trylock it instead of incrementing the count. | ||
567 | */ | 607 | */ |
568 | struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) | 608 | struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) |
569 | { | 609 | { |
@@ -576,12 +616,10 @@ struct page *find_trylock_page(struct address_space *mapping, unsigned long offs | |||
576 | read_unlock_irq(&mapping->tree_lock); | 616 | read_unlock_irq(&mapping->tree_lock); |
577 | return page; | 617 | return page; |
578 | } | 618 | } |
579 | |||
580 | EXPORT_SYMBOL(find_trylock_page); | 619 | EXPORT_SYMBOL(find_trylock_page); |
581 | 620 | ||
582 | /** | 621 | /** |
583 | * find_lock_page - locate, pin and lock a pagecache page | 622 | * find_lock_page - locate, pin and lock a pagecache page |
584 | * | ||
585 | * @mapping: the address_space to search | 623 | * @mapping: the address_space to search |
586 | * @offset: the page index | 624 | * @offset: the page index |
587 | * | 625 | * |
@@ -617,12 +655,10 @@ repeat: | |||
617 | read_unlock_irq(&mapping->tree_lock); | 655 | read_unlock_irq(&mapping->tree_lock); |
618 | return page; | 656 | return page; |
619 | } | 657 | } |
620 | |||
621 | EXPORT_SYMBOL(find_lock_page); | 658 | EXPORT_SYMBOL(find_lock_page); |
622 | 659 | ||
623 | /** | 660 | /** |
624 | * find_or_create_page - locate or add a pagecache page | 661 | * find_or_create_page - locate or add a pagecache page |
625 | * | ||
626 | * @mapping: the page's address_space | 662 | * @mapping: the page's address_space |
627 | * @index: the page's index into the mapping | 663 | * @index: the page's index into the mapping |
628 | * @gfp_mask: page allocation mode | 664 | * @gfp_mask: page allocation mode |
@@ -663,7 +699,6 @@ repeat: | |||
663 | page_cache_release(cached_page); | 699 | page_cache_release(cached_page); |
664 | return page; | 700 | return page; |
665 | } | 701 | } |
666 | |||
667 | EXPORT_SYMBOL(find_or_create_page); | 702 | EXPORT_SYMBOL(find_or_create_page); |
668 | 703 | ||
669 | /** | 704 | /** |
@@ -729,9 +764,16 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | |||
729 | return i; | 764 | return i; |
730 | } | 765 | } |
731 | 766 | ||
732 | /* | 767 | /** |
768 | * find_get_pages_tag - find and return pages that match @tag | ||
769 | * @mapping: the address_space to search | ||
770 | * @index: the starting page index | ||
771 | * @tag: the tag index | ||
772 | * @nr_pages: the maximum number of pages | ||
773 | * @pages: where the resulting pages are placed | ||
774 | * | ||
733 | * Like find_get_pages, except we only return pages which are tagged with | 775 | * Like find_get_pages, except we only return pages which are tagged with |
734 | * `tag'. We update *index to index the next page for the traversal. | 776 | * @tag. We update @index to index the next page for the traversal. |
735 | */ | 777 | */ |
736 | unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, | 778 | unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, |
737 | int tag, unsigned int nr_pages, struct page **pages) | 779 | int tag, unsigned int nr_pages, struct page **pages) |
@@ -750,7 +792,11 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, | |||
750 | return ret; | 792 | return ret; |
751 | } | 793 | } |
752 | 794 | ||
753 | /* | 795 | /** |
796 | * grab_cache_page_nowait - returns locked page at given index in given cache | ||
797 | * @mapping: target address_space | ||
798 | * @index: the page index | ||
799 | * | ||
754 | * Same as grab_cache_page, but do not wait if the page is unavailable. | 800 | * Same as grab_cache_page, but do not wait if the page is unavailable. |
755 | * This is intended for speculative data generators, where the data can | 801 | * This is intended for speculative data generators, where the data can |
756 | * be regenerated if the page couldn't be grabbed. This routine should | 802 | * be regenerated if the page couldn't be grabbed. This routine should |
@@ -779,19 +825,51 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index) | |||
779 | } | 825 | } |
780 | return page; | 826 | return page; |
781 | } | 827 | } |
782 | |||
783 | EXPORT_SYMBOL(grab_cache_page_nowait); | 828 | EXPORT_SYMBOL(grab_cache_page_nowait); |
784 | 829 | ||
785 | /* | 830 | /* |
831 | * CD/DVDs are error prone. When a medium error occurs, the driver may fail | ||
832 | * a _large_ part of the i/o request. Imagine the worst scenario: | ||
833 | * | ||
834 | * ---R__________________________________________B__________ | ||
835 | * ^ reading here ^ bad block(assume 4k) | ||
836 | * | ||
837 | * read(R) => miss => readahead(R...B) => media error => frustrating retries | ||
838 | * => failing the whole request => read(R) => read(R+1) => | ||
839 | * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) => | ||
840 | * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) => | ||
841 | * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ...... | ||
842 | * | ||
843 | * It is going insane. Fix it by quickly scaling down the readahead size. | ||
844 | */ | ||
845 | static void shrink_readahead_size_eio(struct file *filp, | ||
846 | struct file_ra_state *ra) | ||
847 | { | ||
848 | if (!ra->ra_pages) | ||
849 | return; | ||
850 | |||
851 | ra->ra_pages /= 4; | ||
852 | printk(KERN_WARNING "Reducing readahead size to %luK\n", | ||
853 | ra->ra_pages << (PAGE_CACHE_SHIFT - 10)); | ||
854 | } | ||
855 | |||
856 | /** | ||
857 | * do_generic_mapping_read - generic file read routine | ||
858 | * @mapping: address_space to be read | ||
859 | * @_ra: file's readahead state | ||
860 | * @filp: the file to read | ||
861 | * @ppos: current file position | ||
862 | * @desc: read_descriptor | ||
863 | * @actor: read method | ||
864 | * | ||
786 | * This is a generic file read routine, and uses the | 865 | * This is a generic file read routine, and uses the |
787 | * mapping->a_ops->readpage() function for the actual low-level | 866 | * mapping->a_ops->readpage() function for the actual low-level stuff. |
788 | * stuff. | ||
789 | * | 867 | * |
790 | * This is really ugly. But the goto's actually try to clarify some | 868 | * This is really ugly. But the goto's actually try to clarify some |
791 | * of the logic when it comes to error handling etc. | 869 | * of the logic when it comes to error handling etc. |
792 | * | 870 | * |
793 | * Note the struct file* is only passed for the use of readpage. It may be | 871 | * Note the struct file* is only passed for the use of readpage. |
794 | * NULL. | 872 | * It may be NULL. |
795 | */ | 873 | */ |
796 | void do_generic_mapping_read(struct address_space *mapping, | 874 | void do_generic_mapping_read(struct address_space *mapping, |
797 | struct file_ra_state *_ra, | 875 | struct file_ra_state *_ra, |
@@ -932,6 +1010,7 @@ readpage: | |||
932 | } | 1010 | } |
933 | unlock_page(page); | 1011 | unlock_page(page); |
934 | error = -EIO; | 1012 | error = -EIO; |
1013 | shrink_readahead_size_eio(filp, &ra); | ||
935 | goto readpage_error; | 1014 | goto readpage_error; |
936 | } | 1015 | } |
937 | unlock_page(page); | 1016 | unlock_page(page); |
@@ -1004,7 +1083,6 @@ out: | |||
1004 | if (filp) | 1083 | if (filp) |
1005 | file_accessed(filp); | 1084 | file_accessed(filp); |
1006 | } | 1085 | } |
1007 | |||
1008 | EXPORT_SYMBOL(do_generic_mapping_read); | 1086 | EXPORT_SYMBOL(do_generic_mapping_read); |
1009 | 1087 | ||
1010 | int file_read_actor(read_descriptor_t *desc, struct page *page, | 1088 | int file_read_actor(read_descriptor_t *desc, struct page *page, |
@@ -1046,7 +1124,13 @@ success: | |||
1046 | } | 1124 | } |
1047 | EXPORT_SYMBOL_GPL(file_read_actor); | 1125 | EXPORT_SYMBOL_GPL(file_read_actor); |
1048 | 1126 | ||
1049 | /* | 1127 | /** |
1128 | * __generic_file_aio_read - generic filesystem read routine | ||
1129 | * @iocb: kernel I/O control block | ||
1130 | * @iov: io vector request | ||
1131 | * @nr_segs: number of segments in the iovec | ||
1132 | * @ppos: current file position | ||
1133 | * | ||
1050 | * This is the "read()" routine for all filesystems | 1134 | * This is the "read()" routine for all filesystems |
1051 | * that can use the page cache directly. | 1135 | * that can use the page cache directly. |
1052 | */ | 1136 | */ |
@@ -1125,7 +1209,6 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1125 | out: | 1209 | out: |
1126 | return retval; | 1210 | return retval; |
1127 | } | 1211 | } |
1128 | |||
1129 | EXPORT_SYMBOL(__generic_file_aio_read); | 1212 | EXPORT_SYMBOL(__generic_file_aio_read); |
1130 | 1213 | ||
1131 | ssize_t | 1214 | ssize_t |
@@ -1136,7 +1219,6 @@ generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t | |||
1136 | BUG_ON(iocb->ki_pos != pos); | 1219 | BUG_ON(iocb->ki_pos != pos); |
1137 | return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); | 1220 | return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); |
1138 | } | 1221 | } |
1139 | |||
1140 | EXPORT_SYMBOL(generic_file_aio_read); | 1222 | EXPORT_SYMBOL(generic_file_aio_read); |
1141 | 1223 | ||
1142 | ssize_t | 1224 | ssize_t |
@@ -1152,7 +1234,6 @@ generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppo | |||
1152 | ret = wait_on_sync_kiocb(&kiocb); | 1234 | ret = wait_on_sync_kiocb(&kiocb); |
1153 | return ret; | 1235 | return ret; |
1154 | } | 1236 | } |
1155 | |||
1156 | EXPORT_SYMBOL(generic_file_read); | 1237 | EXPORT_SYMBOL(generic_file_read); |
1157 | 1238 | ||
1158 | int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) | 1239 | int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) |
@@ -1193,7 +1274,6 @@ ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos, | |||
1193 | return desc.written; | 1274 | return desc.written; |
1194 | return desc.error; | 1275 | return desc.error; |
1195 | } | 1276 | } |
1196 | |||
1197 | EXPORT_SYMBOL(generic_file_sendfile); | 1277 | EXPORT_SYMBOL(generic_file_sendfile); |
1198 | 1278 | ||
1199 | static ssize_t | 1279 | static ssize_t |
@@ -1229,11 +1309,15 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) | |||
1229 | } | 1309 | } |
1230 | 1310 | ||
1231 | #ifdef CONFIG_MMU | 1311 | #ifdef CONFIG_MMU |
1232 | /* | 1312 | static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); |
1313 | /** | ||
1314 | * page_cache_read - adds requested page to the page cache if not already there | ||
1315 | * @file: file to read | ||
1316 | * @offset: page index | ||
1317 | * | ||
1233 | * This adds the requested page to the page cache if it isn't already there, | 1318 | * This adds the requested page to the page cache if it isn't already there, |
1234 | * and schedules an I/O to read in its contents from disk. | 1319 | * and schedules an I/O to read in its contents from disk. |
1235 | */ | 1320 | */ |
1236 | static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); | ||
1237 | static int fastcall page_cache_read(struct file * file, unsigned long offset) | 1321 | static int fastcall page_cache_read(struct file * file, unsigned long offset) |
1238 | { | 1322 | { |
1239 | struct address_space *mapping = file->f_mapping; | 1323 | struct address_space *mapping = file->f_mapping; |
@@ -1260,7 +1344,12 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset) | |||
1260 | 1344 | ||
1261 | #define MMAP_LOTSAMISS (100) | 1345 | #define MMAP_LOTSAMISS (100) |
1262 | 1346 | ||
1263 | /* | 1347 | /** |
1348 | * filemap_nopage - read in file data for page fault handling | ||
1349 | * @area: the applicable vm_area | ||
1350 | * @address: target address to read in | ||
1351 | * @type: returned with VM_FAULT_{MINOR,MAJOR} if not %NULL | ||
1352 | * | ||
1264 | * filemap_nopage() is invoked via the vma operations vector for a | 1353 | * filemap_nopage() is invoked via the vma operations vector for a |
1265 | * mapped memory region to read in file data during a page fault. | 1354 | * mapped memory region to read in file data during a page fault. |
1266 | * | 1355 | * |
@@ -1327,7 +1416,7 @@ retry_find: | |||
1327 | */ | 1416 | */ |
1328 | if (!did_readaround) { | 1417 | if (!did_readaround) { |
1329 | majmin = VM_FAULT_MAJOR; | 1418 | majmin = VM_FAULT_MAJOR; |
1330 | inc_page_state(pgmajfault); | 1419 | count_vm_event(PGMAJFAULT); |
1331 | } | 1420 | } |
1332 | did_readaround = 1; | 1421 | did_readaround = 1; |
1333 | ra_pages = max_sane_readahead(file->f_ra.ra_pages); | 1422 | ra_pages = max_sane_readahead(file->f_ra.ra_pages); |
@@ -1398,7 +1487,7 @@ no_cached_page: | |||
1398 | page_not_uptodate: | 1487 | page_not_uptodate: |
1399 | if (!did_readaround) { | 1488 | if (!did_readaround) { |
1400 | majmin = VM_FAULT_MAJOR; | 1489 | majmin = VM_FAULT_MAJOR; |
1401 | inc_page_state(pgmajfault); | 1490 | count_vm_event(PGMAJFAULT); |
1402 | } | 1491 | } |
1403 | lock_page(page); | 1492 | lock_page(page); |
1404 | 1493 | ||
@@ -1460,10 +1549,10 @@ page_not_uptodate: | |||
1460 | * Things didn't work out. Return zero to tell the | 1549 | * Things didn't work out. Return zero to tell the |
1461 | * mm layer so, possibly freeing the page cache page first. | 1550 | * mm layer so, possibly freeing the page cache page first. |
1462 | */ | 1551 | */ |
1552 | shrink_readahead_size_eio(file, ra); | ||
1463 | page_cache_release(page); | 1553 | page_cache_release(page); |
1464 | return NULL; | 1554 | return NULL; |
1465 | } | 1555 | } |
1466 | |||
1467 | EXPORT_SYMBOL(filemap_nopage); | 1556 | EXPORT_SYMBOL(filemap_nopage); |
1468 | 1557 | ||
1469 | static struct page * filemap_getpage(struct file *file, unsigned long pgoff, | 1558 | static struct page * filemap_getpage(struct file *file, unsigned long pgoff, |
@@ -1717,7 +1806,13 @@ repeat: | |||
1717 | return page; | 1806 | return page; |
1718 | } | 1807 | } |
1719 | 1808 | ||
1720 | /* | 1809 | /** |
1810 | * read_cache_page - read into page cache, fill it if needed | ||
1811 | * @mapping: the page's address_space | ||
1812 | * @index: the page index | ||
1813 | * @filler: function to perform the read | ||
1814 | * @data: destination for read data | ||
1815 | * | ||
1721 | * Read into the page cache. If a page already exists, | 1816 | * Read into the page cache. If a page already exists, |
1722 | * and PageUptodate() is not set, try to fill the page. | 1817 | * and PageUptodate() is not set, try to fill the page. |
1723 | */ | 1818 | */ |
@@ -1755,7 +1850,6 @@ retry: | |||
1755 | out: | 1850 | out: |
1756 | return page; | 1851 | return page; |
1757 | } | 1852 | } |
1758 | |||
1759 | EXPORT_SYMBOL(read_cache_page); | 1853 | EXPORT_SYMBOL(read_cache_page); |
1760 | 1854 | ||
1761 | /* | 1855 | /* |
@@ -1826,7 +1920,7 @@ int remove_suid(struct dentry *dentry) | |||
1826 | EXPORT_SYMBOL(remove_suid); | 1920 | EXPORT_SYMBOL(remove_suid); |
1827 | 1921 | ||
1828 | size_t | 1922 | size_t |
1829 | __filemap_copy_from_user_iovec(char *vaddr, | 1923 | __filemap_copy_from_user_iovec_inatomic(char *vaddr, |
1830 | const struct iovec *iov, size_t base, size_t bytes) | 1924 | const struct iovec *iov, size_t base, size_t bytes) |
1831 | { | 1925 | { |
1832 | size_t copied = 0, left = 0; | 1926 | size_t copied = 0, left = 0; |
@@ -1836,18 +1930,14 @@ __filemap_copy_from_user_iovec(char *vaddr, | |||
1836 | int copy = min(bytes, iov->iov_len - base); | 1930 | int copy = min(bytes, iov->iov_len - base); |
1837 | 1931 | ||
1838 | base = 0; | 1932 | base = 0; |
1839 | left = __copy_from_user_inatomic(vaddr, buf, copy); | 1933 | left = __copy_from_user_inatomic_nocache(vaddr, buf, copy); |
1840 | copied += copy; | 1934 | copied += copy; |
1841 | bytes -= copy; | 1935 | bytes -= copy; |
1842 | vaddr += copy; | 1936 | vaddr += copy; |
1843 | iov++; | 1937 | iov++; |
1844 | 1938 | ||
1845 | if (unlikely(left)) { | 1939 | if (unlikely(left)) |
1846 | /* zero the rest of the target like __copy_from_user */ | ||
1847 | if (bytes) | ||
1848 | memset(vaddr, 0, bytes); | ||
1849 | break; | 1940 | break; |
1850 | } | ||
1851 | } | 1941 | } |
1852 | return copied - left; | 1942 | return copied - left; |
1853 | } | 1943 | } |
@@ -1855,7 +1945,7 @@ __filemap_copy_from_user_iovec(char *vaddr, | |||
1855 | /* | 1945 | /* |
1856 | * Performs necessary checks before doing a write | 1946 | * Performs necessary checks before doing a write |
1857 | * | 1947 | * |
1858 | * Can adjust writing position aor amount of bytes to write. | 1948 | * Can adjust writing position or amount of bytes to write. |
1859 | * Returns appropriate error code that caller should return or | 1949 | * Returns appropriate error code that caller should return or |
1860 | * zero in case that write should be allowed. | 1950 | * zero in case that write should be allowed. |
1861 | */ | 1951 | */ |
@@ -1979,7 +2069,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
1979 | { | 2069 | { |
1980 | struct file *file = iocb->ki_filp; | 2070 | struct file *file = iocb->ki_filp; |
1981 | struct address_space * mapping = file->f_mapping; | 2071 | struct address_space * mapping = file->f_mapping; |
1982 | struct address_space_operations *a_ops = mapping->a_ops; | 2072 | const struct address_space_operations *a_ops = mapping->a_ops; |
1983 | struct inode *inode = mapping->host; | 2073 | struct inode *inode = mapping->host; |
1984 | long status = 0; | 2074 | long status = 0; |
1985 | struct page *page; | 2075 | struct page *page; |
@@ -2005,14 +2095,21 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2005 | do { | 2095 | do { |
2006 | unsigned long index; | 2096 | unsigned long index; |
2007 | unsigned long offset; | 2097 | unsigned long offset; |
2008 | unsigned long maxlen; | ||
2009 | size_t copied; | 2098 | size_t copied; |
2010 | 2099 | ||
2011 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ | 2100 | offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ |
2012 | index = pos >> PAGE_CACHE_SHIFT; | 2101 | index = pos >> PAGE_CACHE_SHIFT; |
2013 | bytes = PAGE_CACHE_SIZE - offset; | 2102 | bytes = PAGE_CACHE_SIZE - offset; |
2014 | if (bytes > count) | 2103 | |
2015 | bytes = count; | 2104 | /* Limit the size of the copy to the caller's write size */ |
2105 | bytes = min(bytes, count); | ||
2106 | |||
2107 | /* | ||
2108 | * Limit the size of the copy to that of the current segment, | ||
2109 | * because fault_in_pages_readable() doesn't know how to walk | ||
2110 | * segments. | ||
2111 | */ | ||
2112 | bytes = min(bytes, cur_iov->iov_len - iov_base); | ||
2016 | 2113 | ||
2017 | /* | 2114 | /* |
2018 | * Bring in the user page that we will copy from _first_. | 2115 | * Bring in the user page that we will copy from _first_. |
@@ -2020,10 +2117,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2020 | * same page as we're writing to, without it being marked | 2117 | * same page as we're writing to, without it being marked |
2021 | * up-to-date. | 2118 | * up-to-date. |
2022 | */ | 2119 | */ |
2023 | maxlen = cur_iov->iov_len - iov_base; | 2120 | fault_in_pages_readable(buf, bytes); |
2024 | if (maxlen > bytes) | ||
2025 | maxlen = bytes; | ||
2026 | fault_in_pages_readable(buf, maxlen); | ||
2027 | 2121 | ||
2028 | page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); | 2122 | page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); |
2029 | if (!page) { | 2123 | if (!page) { |
@@ -2031,6 +2125,12 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2031 | break; | 2125 | break; |
2032 | } | 2126 | } |
2033 | 2127 | ||
2128 | if (unlikely(bytes == 0)) { | ||
2129 | status = 0; | ||
2130 | copied = 0; | ||
2131 | goto zero_length_segment; | ||
2132 | } | ||
2133 | |||
2034 | status = a_ops->prepare_write(file, page, offset, offset+bytes); | 2134 | status = a_ops->prepare_write(file, page, offset, offset+bytes); |
2035 | if (unlikely(status)) { | 2135 | if (unlikely(status)) { |
2036 | loff_t isize = i_size_read(inode); | 2136 | loff_t isize = i_size_read(inode); |
@@ -2060,7 +2160,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2060 | page_cache_release(page); | 2160 | page_cache_release(page); |
2061 | continue; | 2161 | continue; |
2062 | } | 2162 | } |
2063 | if (likely(copied > 0)) { | 2163 | zero_length_segment: |
2164 | if (likely(copied >= 0)) { | ||
2064 | if (!status) | 2165 | if (!status) |
2065 | status = copied; | 2166 | status = copied; |
2066 | 2167 | ||
@@ -2125,7 +2226,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
2125 | unsigned long nr_segs, loff_t *ppos) | 2226 | unsigned long nr_segs, loff_t *ppos) |
2126 | { | 2227 | { |
2127 | struct file *file = iocb->ki_filp; | 2228 | struct file *file = iocb->ki_filp; |
2128 | struct address_space * mapping = file->f_mapping; | 2229 | const struct address_space * mapping = file->f_mapping; |
2129 | size_t ocount; /* original count */ | 2230 | size_t ocount; /* original count */ |
2130 | size_t count; /* after file limit checks */ | 2231 | size_t count; /* after file limit checks */ |
2131 | struct inode *inode = mapping->host; | 2232 | struct inode *inode = mapping->host; |
diff --git a/mm/filemap.h b/mm/filemap.h
index 13793ba0ce17..3f2a343c6015 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
@@ -13,18 +13,26 @@ | |||
13 | #include <linux/highmem.h> | 13 | #include <linux/highmem.h> |
14 | #include <linux/uio.h> | 14 | #include <linux/uio.h> |
15 | #include <linux/config.h> | 15 | #include <linux/config.h> |
16 | #include <asm/uaccess.h> | 16 | #include <linux/uaccess.h> |
17 | 17 | ||
18 | size_t | 18 | size_t |
19 | __filemap_copy_from_user_iovec(char *vaddr, | 19 | __filemap_copy_from_user_iovec_inatomic(char *vaddr, |
20 | const struct iovec *iov, | 20 | const struct iovec *iov, |
21 | size_t base, | 21 | size_t base, |
22 | size_t bytes); | 22 | size_t bytes); |
23 | 23 | ||
24 | /* | 24 | /* |
25 | * Copy as much as we can into the page and return the number of bytes which | 25 | * Copy as much as we can into the page and return the number of bytes which |
26 | * were sucessfully copied. If a fault is encountered then clear the page | 26 | * were sucessfully copied. If a fault is encountered then clear the page |
27 | * out to (offset+bytes) and return the number of bytes which were copied. | 27 | * out to (offset+bytes) and return the number of bytes which were copied. |
28 | * | ||
29 | * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache | ||
30 | * to *NOT* zero any tail of the buffer that it failed to copy. If it does, | ||
31 | * and if the following non-atomic copy succeeds, then there is a small window | ||
32 | * where the target page contains neither the data before the write, nor the | ||
33 | * data after the write (it contains zero). A read at this time will see | ||
34 | * data that is inconsistent with any ordering of the read and the write. | ||
35 | * (This has been detected in practice). | ||
28 | */ | 36 | */ |
29 | static inline size_t | 37 | static inline size_t |
30 | filemap_copy_from_user(struct page *page, unsigned long offset, | 38 | filemap_copy_from_user(struct page *page, unsigned long offset, |
@@ -34,13 +42,13 @@ filemap_copy_from_user(struct page *page, unsigned long offset, | |||
34 | int left; | 42 | int left; |
35 | 43 | ||
36 | kaddr = kmap_atomic(page, KM_USER0); | 44 | kaddr = kmap_atomic(page, KM_USER0); |
37 | left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); | 45 | left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); |
38 | kunmap_atomic(kaddr, KM_USER0); | 46 | kunmap_atomic(kaddr, KM_USER0); |
39 | 47 | ||
40 | if (left != 0) { | 48 | if (left != 0) { |
41 | /* Do it the slow way */ | 49 | /* Do it the slow way */ |
42 | kaddr = kmap(page); | 50 | kaddr = kmap(page); |
43 | left = __copy_from_user(kaddr + offset, buf, bytes); | 51 | left = __copy_from_user_nocache(kaddr + offset, buf, bytes); |
44 | kunmap(page); | 52 | kunmap(page); |
45 | } | 53 | } |
46 | return bytes - left; | 54 | return bytes - left; |
@@ -60,13 +68,15 @@ filemap_copy_from_user_iovec(struct page *page, unsigned long offset, | |||
60 | size_t copied; | 68 | size_t copied; |
61 | 69 | ||
62 | kaddr = kmap_atomic(page, KM_USER0); | 70 | kaddr = kmap_atomic(page, KM_USER0); |
63 | copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, | 71 | copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, |
64 | base, bytes); | 72 | base, bytes); |
65 | kunmap_atomic(kaddr, KM_USER0); | 73 | kunmap_atomic(kaddr, KM_USER0); |
66 | if (copied != bytes) { | 74 | if (copied != bytes) { |
67 | kaddr = kmap(page); | 75 | kaddr = kmap(page); |
68 | copied = __filemap_copy_from_user_iovec(kaddr + offset, iov, | 76 | copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, |
69 | base, bytes); | 77 | base, bytes); |
78 | if (bytes - copied) | ||
79 | memset(kaddr + offset + copied, 0, bytes - copied); | ||
70 | kunmap(page); | 80 | kunmap(page); |
71 | } | 81 | } |
72 | return copied; | 82 | return copied; |
@@ -78,7 +88,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) | |||
78 | const struct iovec *iov = *iovp; | 88 | const struct iovec *iov = *iovp; |
79 | size_t base = *basep; | 89 | size_t base = *basep; |
80 | 90 | ||
81 | while (bytes) { | 91 | do { |
82 | int copy = min(bytes, iov->iov_len - base); | 92 | int copy = min(bytes, iov->iov_len - base); |
83 | 93 | ||
84 | bytes -= copy; | 94 | bytes -= copy; |
@@ -87,7 +97,7 @@ filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) | |||
87 | iov++; | 97 | iov++; |
88 | base = 0; | 98 | base = 0; |
89 | } | 99 | } |
90 | } | 100 | } while (bytes); |
91 | *iovp = iov; | 101 | *iovp = iov; |
92 | *basep = base; | 102 | *basep = base; |
93 | } | 103 | } |
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b960ac8e5918..b4fd0d7c9bfb 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -273,7 +273,7 @@ __xip_file_write(struct file *filp, const char __user *buf, | |||
273 | size_t count, loff_t pos, loff_t *ppos) | 273 | size_t count, loff_t pos, loff_t *ppos) |
274 | { | 274 | { |
275 | struct address_space * mapping = filp->f_mapping; | 275 | struct address_space * mapping = filp->f_mapping; |
276 | struct address_space_operations *a_ops = mapping->a_ops; | 276 | const struct address_space_operations *a_ops = mapping->a_ops; |
277 | struct inode *inode = mapping->host; | 277 | struct inode *inode = mapping->host; |
278 | long status = 0; | 278 | long status = 0; |
279 | struct page *page; | 279 | struct page *page; |
diff --git a/mm/fremap.c b/mm/fremap.c
index 9f381e58bf44..21b7d0cbc98c 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -83,6 +83,7 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
83 | page_add_file_rmap(page); | 83 | page_add_file_rmap(page); |
84 | pte_val = *pte; | 84 | pte_val = *pte; |
85 | update_mmu_cache(vma, addr, pte_val); | 85 | update_mmu_cache(vma, addr, pte_val); |
86 | lazy_mmu_prot_update(pte_val); | ||
86 | err = 0; | 87 | err = 0; |
87 | unlock: | 88 | unlock: |
88 | pte_unmap_unlock(pte, ptl); | 89 | pte_unmap_unlock(pte, ptl); |
@@ -114,7 +115,13 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, | |||
114 | 115 | ||
115 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); | 116 | set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); |
116 | pte_val = *pte; | 117 | pte_val = *pte; |
117 | update_mmu_cache(vma, addr, pte_val); | 118 | /* |
119 | * We don't need to run update_mmu_cache() here because the "file pte" | ||
120 | * being installed by install_file_pte() is not a real pte - it's a | ||
121 | * non-present entry (like a swap entry), noting what file offset should | ||
122 | * be mapped there when there's a fault (in a non-linear vma where | ||
123 | * that's not obvious). | ||
124 | */ | ||
118 | pte_unmap_unlock(pte, ptl); | 125 | pte_unmap_unlock(pte, ptl); |
119 | err = 0; | 126 | err = 0; |
120 | out: | 127 | out: |
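The comment added above refers to non-linear vmas; these are created from userspace with remap_file_pages(2), and install_file_pte() is the kernel path that records the rearranged file offsets as non-present "file ptes". A rough userspace sketch of the feature (the file name and sizes are placeholders, and error handling is omitted for brevity):

```c
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	int fd = open("datafile", O_RDWR);	/* placeholder: at least 4 pages long */

	/* A shared mapping of the first four pages of the file. */
	char *p = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);

	/*
	 * Rearrange the mapping without changing virtual addresses: the
	 * first page of the window now shows file page 3.  Until it is
	 * faulted in, that offset lives in a non-present "file pte".
	 */
	remap_file_pages(p, psz, 0, 3, 0);

	munmap(p, 4 * psz);
	close(fd);
	return 0;
}
```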
diff --git a/mm/highmem.c b/mm/highmem.c
index 9b274fdf9d08..9b2a5403c447 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -315,8 +315,8 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) | |||
315 | if (bvec->bv_page == org_vec->bv_page) | 315 | if (bvec->bv_page == org_vec->bv_page) |
316 | continue; | 316 | continue; |
317 | 317 | ||
318 | mempool_free(bvec->bv_page, pool); | 318 | dec_zone_page_state(bvec->bv_page, NR_BOUNCE); |
319 | dec_page_state(nr_bounce); | 319 | mempool_free(bvec->bv_page, pool); |
320 | } | 320 | } |
321 | 321 | ||
322 | bio_endio(bio_orig, bio_orig->bi_size, err); | 322 | bio_endio(bio_orig, bio_orig->bi_size, err); |
@@ -397,7 +397,7 @@ static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig, | |||
397 | to->bv_page = mempool_alloc(pool, q->bounce_gfp); | 397 | to->bv_page = mempool_alloc(pool, q->bounce_gfp); |
398 | to->bv_len = from->bv_len; | 398 | to->bv_len = from->bv_len; |
399 | to->bv_offset = from->bv_offset; | 399 | to->bv_offset = from->bv_offset; |
400 | inc_page_state(nr_bounce); | 400 | inc_zone_page_state(to->bv_page, NR_BOUNCE); |
401 | 401 | ||
402 | if (rw == WRITE) { | 402 | if (rw == WRITE) { |
403 | char *vto, *vfrom; | 403 | char *vto, *vfrom; |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 832f676ca038..df499973255f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@ | |||
22 | #include "internal.h" | 22 | #include "internal.h" |
23 | 23 | ||
24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
25 | static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; | 25 | static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; |
26 | unsigned long max_huge_pages; | 26 | unsigned long max_huge_pages; |
27 | static struct list_head hugepage_freelists[MAX_NUMNODES]; | 27 | static struct list_head hugepage_freelists[MAX_NUMNODES]; |
28 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; | 28 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; |
@@ -123,39 +123,13 @@ static int alloc_fresh_huge_page(void) | |||
123 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 123 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
124 | unsigned long addr) | 124 | unsigned long addr) |
125 | { | 125 | { |
126 | struct inode *inode = vma->vm_file->f_dentry->d_inode; | ||
127 | struct page *page; | 126 | struct page *page; |
128 | int use_reserve = 0; | ||
129 | unsigned long idx; | ||
130 | 127 | ||
131 | spin_lock(&hugetlb_lock); | 128 | spin_lock(&hugetlb_lock); |
132 | 129 | if (vma->vm_flags & VM_MAYSHARE) | |
133 | if (vma->vm_flags & VM_MAYSHARE) { | 130 | resv_huge_pages--; |
134 | 131 | else if (free_huge_pages <= resv_huge_pages) | |
135 | /* idx = radix tree index, i.e. offset into file in | 132 | goto fail; |
136 | * HPAGE_SIZE units */ | ||
137 | idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) | ||
138 | + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | ||
139 | |||
140 | /* The hugetlbfs specific inode info stores the number | ||
141 | * of "guaranteed available" (huge) pages. That is, | ||
142 | * the first 'prereserved_hpages' pages of the inode | ||
143 | * are either already instantiated, or have been | ||
144 | * pre-reserved (by hugetlb_reserve_for_inode()). Here | ||
145 | * we're in the process of instantiating the page, so | ||
146 | * we use this to determine whether to draw from the | ||
147 | * pre-reserved pool or the truly free pool. */ | ||
148 | if (idx < HUGETLBFS_I(inode)->prereserved_hpages) | ||
149 | use_reserve = 1; | ||
150 | } | ||
151 | |||
152 | if (!use_reserve) { | ||
153 | if (free_huge_pages <= reserved_huge_pages) | ||
154 | goto fail; | ||
155 | } else { | ||
156 | BUG_ON(reserved_huge_pages == 0); | ||
157 | reserved_huge_pages--; | ||
158 | } | ||
159 | 133 | ||
160 | page = dequeue_huge_page(vma, addr); | 134 | page = dequeue_huge_page(vma, addr); |
161 | if (!page) | 135 | if (!page) |
@@ -165,96 +139,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
165 | set_page_refcounted(page); | 139 | set_page_refcounted(page); |
166 | return page; | 140 | return page; |
167 | 141 | ||
168 | fail: | 142 | fail: |
169 | WARN_ON(use_reserve); /* reserved allocations shouldn't fail */ | ||
170 | spin_unlock(&hugetlb_lock); | 143 | spin_unlock(&hugetlb_lock); |
171 | return NULL; | 144 | return NULL; |
172 | } | 145 | } |
173 | 146 | ||
174 | /* hugetlb_extend_reservation() | ||
175 | * | ||
176 | * Ensure that at least 'atleast' hugepages are, and will remain, | ||
177 | * available to instantiate the first 'atleast' pages of the given | ||
178 | * inode. If the inode doesn't already have this many pages reserved | ||
179 | * or instantiated, set aside some hugepages in the reserved pool to | ||
180 | * satisfy later faults (or fail now if there aren't enough, rather | ||
181 | * than getting the SIGBUS later). | ||
182 | */ | ||
183 | int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, | ||
184 | unsigned long atleast) | ||
185 | { | ||
186 | struct inode *inode = &info->vfs_inode; | ||
187 | unsigned long change_in_reserve = 0; | ||
188 | int ret = 0; | ||
189 | |||
190 | spin_lock(&hugetlb_lock); | ||
191 | read_lock_irq(&inode->i_mapping->tree_lock); | ||
192 | |||
193 | if (info->prereserved_hpages >= atleast) | ||
194 | goto out; | ||
195 | |||
196 | /* Because we always call this on shared mappings, none of the | ||
197 | * pages beyond info->prereserved_hpages can have been | ||
198 | * instantiated, so we need to reserve all of them now. */ | ||
199 | change_in_reserve = atleast - info->prereserved_hpages; | ||
200 | |||
201 | if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) { | ||
202 | ret = -ENOMEM; | ||
203 | goto out; | ||
204 | } | ||
205 | |||
206 | reserved_huge_pages += change_in_reserve; | ||
207 | info->prereserved_hpages = atleast; | ||
208 | |||
209 | out: | ||
210 | read_unlock_irq(&inode->i_mapping->tree_lock); | ||
211 | spin_unlock(&hugetlb_lock); | ||
212 | |||
213 | return ret; | ||
214 | } | ||
215 | |||
216 | /* hugetlb_truncate_reservation() | ||
217 | * | ||
218 | * This returns pages reserved for the given inode to the general free | ||
219 | * hugepage pool. If the inode has any pages prereserved, but not | ||
220 | * instantiated, beyond offset (atmost << HPAGE_SIZE), then release | ||
221 | * them. | ||
222 | */ | ||
223 | void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, | ||
224 | unsigned long atmost) | ||
225 | { | ||
226 | struct inode *inode = &info->vfs_inode; | ||
227 | struct address_space *mapping = inode->i_mapping; | ||
228 | unsigned long idx; | ||
229 | unsigned long change_in_reserve = 0; | ||
230 | struct page *page; | ||
231 | |||
232 | spin_lock(&hugetlb_lock); | ||
233 | read_lock_irq(&inode->i_mapping->tree_lock); | ||
234 | |||
235 | if (info->prereserved_hpages <= atmost) | ||
236 | goto out; | ||
237 | |||
238 | /* Count pages which were reserved, but not instantiated, and | ||
239 | * which we can now release. */ | ||
240 | for (idx = atmost; idx < info->prereserved_hpages; idx++) { | ||
241 | page = radix_tree_lookup(&mapping->page_tree, idx); | ||
242 | if (!page) | ||
243 | /* Pages which are already instantiated can't | ||
244 | * be unreserved (and in fact have already | ||
245 | * been removed from the reserved pool) */ | ||
246 | change_in_reserve++; | ||
247 | } | ||
248 | |||
249 | BUG_ON(reserved_huge_pages < change_in_reserve); | ||
250 | reserved_huge_pages -= change_in_reserve; | ||
251 | info->prereserved_hpages = atmost; | ||
252 | |||
253 | out: | ||
254 | read_unlock_irq(&inode->i_mapping->tree_lock); | ||
255 | spin_unlock(&hugetlb_lock); | ||
256 | } | ||
257 | |||
258 | static int __init hugetlb_init(void) | 147 | static int __init hugetlb_init(void) |
259 | { | 148 | { |
260 | unsigned long i; | 149 | unsigned long i; |
@@ -334,7 +223,7 @@ static unsigned long set_max_huge_pages(unsigned long count) | |||
334 | return nr_huge_pages; | 223 | return nr_huge_pages; |
335 | 224 | ||
336 | spin_lock(&hugetlb_lock); | 225 | spin_lock(&hugetlb_lock); |
337 | count = max(count, reserved_huge_pages); | 226 | count = max(count, resv_huge_pages); |
338 | try_to_free_low(count); | 227 | try_to_free_low(count); |
339 | while (count < nr_huge_pages) { | 228 | while (count < nr_huge_pages) { |
340 | struct page *page = dequeue_huge_page(NULL, 0); | 229 | struct page *page = dequeue_huge_page(NULL, 0); |
@@ -361,11 +250,11 @@ int hugetlb_report_meminfo(char *buf) | |||
361 | return sprintf(buf, | 250 | return sprintf(buf, |
362 | "HugePages_Total: %5lu\n" | 251 | "HugePages_Total: %5lu\n" |
363 | "HugePages_Free: %5lu\n" | 252 | "HugePages_Free: %5lu\n" |
364 | "HugePages_Rsvd: %5lu\n" | 253 | "HugePages_Rsvd: %5lu\n" |
365 | "Hugepagesize: %5lu kB\n", | 254 | "Hugepagesize: %5lu kB\n", |
366 | nr_huge_pages, | 255 | nr_huge_pages, |
367 | free_huge_pages, | 256 | free_huge_pages, |
368 | reserved_huge_pages, | 257 | resv_huge_pages, |
369 | HPAGE_SIZE/1024); | 258 | HPAGE_SIZE/1024); |
370 | } | 259 | } |
371 | 260 | ||
@@ -754,3 +643,156 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
754 | flush_tlb_range(vma, start, end); | 643 | flush_tlb_range(vma, start, end); |
755 | } | 644 | } |
756 | 645 | ||
646 | struct file_region { | ||
647 | struct list_head link; | ||
648 | long from; | ||
649 | long to; | ||
650 | }; | ||
651 | |||
652 | static long region_add(struct list_head *head, long f, long t) | ||
653 | { | ||
654 | struct file_region *rg, *nrg, *trg; | ||
655 | |||
656 | /* Locate the region we are either in or before. */ | ||
657 | list_for_each_entry(rg, head, link) | ||
658 | if (f <= rg->to) | ||
659 | break; | ||
660 | |||
661 | /* Round our left edge to the current segment if it encloses us. */ | ||
662 | if (f > rg->from) | ||
663 | f = rg->from; | ||
664 | |||
665 | /* Check for and consume any regions we now overlap with. */ | ||
666 | nrg = rg; | ||
667 | list_for_each_entry_safe(rg, trg, rg->link.prev, link) { | ||
668 | if (&rg->link == head) | ||
669 | break; | ||
670 | if (rg->from > t) | ||
671 | break; | ||
672 | |||
673 | /* If this area reaches higher then extend our area to | ||
674 | * include it completely. If this is not the first area | ||
675 | * which we intend to reuse, free it. */ | ||
676 | if (rg->to > t) | ||
677 | t = rg->to; | ||
678 | if (rg != nrg) { | ||
679 | list_del(&rg->link); | ||
680 | kfree(rg); | ||
681 | } | ||
682 | } | ||
683 | nrg->from = f; | ||
684 | nrg->to = t; | ||
685 | return 0; | ||
686 | } | ||
687 | |||
688 | static long region_chg(struct list_head *head, long f, long t) | ||
689 | { | ||
690 | struct file_region *rg, *nrg; | ||
691 | long chg = 0; | ||
692 | |||
693 | /* Locate the region we are before or in. */ | ||
694 | list_for_each_entry(rg, head, link) | ||
695 | if (f <= rg->to) | ||
696 | break; | ||
697 | |||
698 | /* If we are below the current region then a new region is required. | ||
699 | * Subtle: allocate a new region at the position, but make it zero | ||
700 | * size so that we can guarantee to record the reservation. */ | ||
701 | if (&rg->link == head || t < rg->from) { | ||
702 | nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); | ||
703 | if (nrg == 0) | ||
704 | return -ENOMEM; | ||
705 | nrg->from = f; | ||
706 | nrg->to = f; | ||
707 | INIT_LIST_HEAD(&nrg->link); | ||
708 | list_add(&nrg->link, rg->link.prev); | ||
709 | |||
710 | return t - f; | ||
711 | } | ||
712 | |||
713 | /* Round our left edge to the current segment if it encloses us. */ | ||
714 | if (f > rg->from) | ||
715 | f = rg->from; | ||
716 | chg = t - f; | ||
717 | |||
718 | /* Check for and consume any regions we now overlap with. */ | ||
719 | list_for_each_entry(rg, rg->link.prev, link) { | ||
720 | if (&rg->link == head) | ||
721 | break; | ||
722 | if (rg->from > t) | ||
723 | return chg; | ||
724 | |||
725 | /* We overlap with this area; if it extends further than | ||
726 | * us then we must extend ourselves. Account for its | ||
727 | * existing reservation. */ | ||
728 | if (rg->to > t) { | ||
729 | chg += rg->to - t; | ||
730 | t = rg->to; | ||
731 | } | ||
732 | chg -= rg->to - rg->from; | ||
733 | } | ||
734 | return chg; | ||
735 | } | ||
736 | |||
737 | static long region_truncate(struct list_head *head, long end) | ||
738 | { | ||
739 | struct file_region *rg, *trg; | ||
740 | long chg = 0; | ||
741 | |||
742 | /* Locate the region we are either in or before. */ | ||
743 | list_for_each_entry(rg, head, link) | ||
744 | if (end <= rg->to) | ||
745 | break; | ||
746 | if (&rg->link == head) | ||
747 | return 0; | ||
748 | |||
749 | /* If we are in the middle of a region then adjust it. */ | ||
750 | if (end > rg->from) { | ||
751 | chg = rg->to - end; | ||
752 | rg->to = end; | ||
753 | rg = list_entry(rg->link.next, typeof(*rg), link); | ||
754 | } | ||
755 | |||
756 | /* Drop any remaining regions. */ | ||
757 | list_for_each_entry_safe(rg, trg, rg->link.prev, link) { | ||
758 | if (&rg->link == head) | ||
759 | break; | ||
760 | chg += rg->to - rg->from; | ||
761 | list_del(&rg->link); | ||
762 | kfree(rg); | ||
763 | } | ||
764 | return chg; | ||
765 | } | ||
766 | |||
767 | static int hugetlb_acct_memory(long delta) | ||
768 | { | ||
769 | int ret = -ENOMEM; | ||
770 | |||
771 | spin_lock(&hugetlb_lock); | ||
772 | if ((delta + resv_huge_pages) <= free_huge_pages) { | ||
773 | resv_huge_pages += delta; | ||
774 | ret = 0; | ||
775 | } | ||
776 | spin_unlock(&hugetlb_lock); | ||
777 | return ret; | ||
778 | } | ||
779 | |||
780 | int hugetlb_reserve_pages(struct inode *inode, long from, long to) | ||
781 | { | ||
782 | long ret, chg; | ||
783 | |||
784 | chg = region_chg(&inode->i_mapping->private_list, from, to); | ||
785 | if (chg < 0) | ||
786 | return chg; | ||
787 | ret = hugetlb_acct_memory(chg); | ||
788 | if (ret < 0) | ||
789 | return ret; | ||
790 | region_add(&inode->i_mapping->private_list, from, to); | ||
791 | return 0; | ||
792 | } | ||
793 | |||
794 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | ||
795 | { | ||
796 | long chg = region_truncate(&inode->i_mapping->private_list, offset); | ||
797 | hugetlb_acct_memory(freed - chg); | ||
798 | } | ||
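The file_region list added above tracks hugepage reservations as half-open [from, to) spans hung off inode->i_mapping->private_list: region_chg() computes how much new reservation a range would need before anything is committed, and region_add() records it once the pages have been accounted. A rough user-space sketch of the same charge/commit bookkeeping, with hypothetical names and none of the kernel list helpers, might look like this:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical model of the kernel's file_region list: a sorted list of
 * non-overlapping half-open [from, to) spans. */
struct region {
	struct region *next;
	long from, to;
};

/* How many pages in [f, t) are not yet covered (what region_chg() charges). */
static long region_charge(const struct region *head, long f, long t)
{
	const struct region *rg;
	long chg = t - f;

	for (rg = head; rg; rg = rg->next) {
		long lo = rg->from > f ? rg->from : f;
		long hi = rg->to < t ? rg->to : t;

		if (hi > lo)
			chg -= hi - lo;	/* already reserved, don't charge again */
	}
	return chg;
}

/* Record [f, t) as reserved (what region_add() commits), merging overlaps. */
static struct region *region_commit(struct region *head, long f, long t)
{
	struct region **pp = &head;
	struct region *rg;

	while ((rg = *pp) && rg->to < f)
		pp = &rg->next;			/* skip regions entirely below [f, t) */

	while ((rg = *pp) && rg->from <= t) {	/* absorb all overlapping regions */
		if (rg->from < f)
			f = rg->from;
		if (rg->to > t)
			t = rg->to;
		*pp = rg->next;
		free(rg);
	}

	rg = malloc(sizeof(*rg));
	if (!rg)
		abort();
	rg->from = f;
	rg->to = t;
	rg->next = *pp;
	*pp = rg;
	return head;
}

int main(void)
{
	struct region *head = NULL;

	head = region_commit(head, 0, 4);		/* reserve pages 0..3          */
	printf("%ld\n", region_charge(head, 2, 8));	/* 4: only pages 4..7 are new  */
	head = region_commit(head, 2, 8);
	printf("%ld\n", region_charge(head, 0, 8));	/* 0: everything is reserved   */
	return 0;
}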
diff --git a/mm/memory.c b/mm/memory.c index 0ec7bc644271..7e2a4b1580e3 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
@@ -126,7 +126,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) | |||
126 | pmd_clear(pmd); | 126 | pmd_clear(pmd); |
127 | pte_lock_deinit(page); | 127 | pte_lock_deinit(page); |
128 | pte_free_tlb(tlb, page); | 128 | pte_free_tlb(tlb, page); |
129 | dec_page_state(nr_page_table_pages); | 129 | dec_zone_page_state(page, NR_PAGETABLE); |
130 | tlb->mm->nr_ptes--; | 130 | tlb->mm->nr_ptes--; |
131 | } | 131 | } |
132 | 132 | ||
@@ -311,7 +311,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) | |||
311 | pte_free(new); | 311 | pte_free(new); |
312 | } else { | 312 | } else { |
313 | mm->nr_ptes++; | 313 | mm->nr_ptes++; |
314 | inc_page_state(nr_page_table_pages); | 314 | inc_zone_page_state(new, NR_PAGETABLE); |
315 | pmd_populate(mm, pmd, new); | 315 | pmd_populate(mm, pmd, new); |
316 | } | 316 | } |
317 | spin_unlock(&mm->page_table_lock); | 317 | spin_unlock(&mm->page_table_lock); |
@@ -434,7 +434,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
434 | /* pte contains position in swap or file, so copy. */ | 434 | /* pte contains position in swap or file, so copy. */ |
435 | if (unlikely(!pte_present(pte))) { | 435 | if (unlikely(!pte_present(pte))) { |
436 | if (!pte_file(pte)) { | 436 | if (!pte_file(pte)) { |
437 | swap_duplicate(pte_to_swp_entry(pte)); | 437 | swp_entry_t entry = pte_to_swp_entry(pte); |
438 | |||
439 | swap_duplicate(entry); | ||
438 | /* make sure dst_mm is on swapoff's mmlist. */ | 440 | /* make sure dst_mm is on swapoff's mmlist. */ |
439 | if (unlikely(list_empty(&dst_mm->mmlist))) { | 441 | if (unlikely(list_empty(&dst_mm->mmlist))) { |
440 | spin_lock(&mmlist_lock); | 442 | spin_lock(&mmlist_lock); |
@@ -443,6 +445,16 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
443 | &src_mm->mmlist); | 445 | &src_mm->mmlist); |
444 | spin_unlock(&mmlist_lock); | 446 | spin_unlock(&mmlist_lock); |
445 | } | 447 | } |
448 | if (is_write_migration_entry(entry) && | ||
449 | is_cow_mapping(vm_flags)) { | ||
450 | /* | ||
451 | * COW mappings require pages in both parent | ||
452 | * and child to be set to read. | ||
453 | */ | ||
454 | make_migration_entry_read(&entry); | ||
455 | pte = swp_entry_to_pte(entry); | ||
456 | set_pte_at(src_mm, addr, src_pte, pte); | ||
457 | } | ||
446 | } | 458 | } |
447 | goto out_set_pte; | 459 | goto out_set_pte; |
448 | } | 460 | } |
@@ -1445,25 +1457,60 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1445 | { | 1457 | { |
1446 | struct page *old_page, *new_page; | 1458 | struct page *old_page, *new_page; |
1447 | pte_t entry; | 1459 | pte_t entry; |
1448 | int ret = VM_FAULT_MINOR; | 1460 | int reuse, ret = VM_FAULT_MINOR; |
1449 | 1461 | ||
1450 | old_page = vm_normal_page(vma, address, orig_pte); | 1462 | old_page = vm_normal_page(vma, address, orig_pte); |
1451 | if (!old_page) | 1463 | if (!old_page) |
1452 | goto gotten; | 1464 | goto gotten; |
1453 | 1465 | ||
1454 | if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { | 1466 | if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) == |
1455 | int reuse = can_share_swap_page(old_page); | 1467 | (VM_SHARED|VM_WRITE))) { |
1456 | unlock_page(old_page); | 1468 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) { |
1457 | if (reuse) { | 1469 | /* |
1458 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | 1470 | * Notify the address space that the page is about to |
1459 | entry = pte_mkyoung(orig_pte); | 1471 | * become writable so that it can prohibit this or wait |
1460 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | 1472 | * for the page to get into an appropriate state. |
1461 | ptep_set_access_flags(vma, address, page_table, entry, 1); | 1473 | * |
1462 | update_mmu_cache(vma, address, entry); | 1474 | * We do this without the lock held, so that it can |
1463 | lazy_mmu_prot_update(entry); | 1475 | * sleep if it needs to. |
1464 | ret |= VM_FAULT_WRITE; | 1476 | */ |
1465 | goto unlock; | 1477 | page_cache_get(old_page); |
1478 | pte_unmap_unlock(page_table, ptl); | ||
1479 | |||
1480 | if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) | ||
1481 | goto unwritable_page; | ||
1482 | |||
1483 | page_cache_release(old_page); | ||
1484 | |||
1485 | /* | ||
1486 | * Since we dropped the lock we need to revalidate | ||
1487 | * the PTE as someone else may have changed it. If | ||
1488 | * they did, we just return, as we can count on the | ||
1489 | * MMU to tell us if they didn't also make it writable. | ||
1490 | */ | ||
1491 | page_table = pte_offset_map_lock(mm, pmd, address, | ||
1492 | &ptl); | ||
1493 | if (!pte_same(*page_table, orig_pte)) | ||
1494 | goto unlock; | ||
1466 | } | 1495 | } |
1496 | |||
1497 | reuse = 1; | ||
1498 | } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { | ||
1499 | reuse = can_share_swap_page(old_page); | ||
1500 | unlock_page(old_page); | ||
1501 | } else { | ||
1502 | reuse = 0; | ||
1503 | } | ||
1504 | |||
1505 | if (reuse) { | ||
1506 | flush_cache_page(vma, address, pte_pfn(orig_pte)); | ||
1507 | entry = pte_mkyoung(orig_pte); | ||
1508 | entry = maybe_mkwrite(pte_mkdirty(entry), vma); | ||
1509 | ptep_set_access_flags(vma, address, page_table, entry, 1); | ||
1510 | update_mmu_cache(vma, address, entry); | ||
1511 | lazy_mmu_prot_update(entry); | ||
1512 | ret |= VM_FAULT_WRITE; | ||
1513 | goto unlock; | ||
1467 | } | 1514 | } |
1468 | 1515 | ||
1469 | /* | 1516 | /* |
@@ -1523,6 +1570,10 @@ oom: | |||
1523 | if (old_page) | 1570 | if (old_page) |
1524 | page_cache_release(old_page); | 1571 | page_cache_release(old_page); |
1525 | return VM_FAULT_OOM; | 1572 | return VM_FAULT_OOM; |
1573 | |||
1574 | unwritable_page: | ||
1575 | page_cache_release(old_page); | ||
1576 | return VM_FAULT_SIGBUS; | ||
1526 | } | 1577 | } |
1527 | 1578 | ||
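The do_wp_page() change above (and the matching do_no_page() change later in this file) introduces an optional vm_ops->page_mkwrite() hook: it is called without the page-table lock held before a page in a shared, writable mapping actually becomes writable, and a negative return value turns the fault into VM_FAULT_SIGBUS. A driver-side handler might look roughly like the sketch below; the example_* names and the reservation helper are assumptions, not part of this patch:

/* Sketch of a ->page_mkwrite() implementation for a shared mapping.
 * The hook may sleep; returning a negative value refuses the write. */
static int example_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
	/* e.g. reserve backing store for this file offset (assumed helper) */
	if (!example_reserve_backing_space(vma->vm_file, page->index))
		return -ENOSPC;		/* do_wp_page() only checks for < 0 */
	return 0;
}

static struct vm_operations_struct example_vm_ops = {
	.nopage		= example_nopage,	/* assumed existing fault handler */
	.page_mkwrite	= example_page_mkwrite,
};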
1528 | /* | 1579 | /* |
@@ -1879,7 +1930,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1879 | goto out; | 1930 | goto out; |
1880 | 1931 | ||
1881 | entry = pte_to_swp_entry(orig_pte); | 1932 | entry = pte_to_swp_entry(orig_pte); |
1882 | again: | 1933 | if (is_migration_entry(entry)) { |
1934 | migration_entry_wait(mm, pmd, address); | ||
1935 | goto out; | ||
1936 | } | ||
1883 | page = lookup_swap_cache(entry); | 1937 | page = lookup_swap_cache(entry); |
1884 | if (!page) { | 1938 | if (!page) { |
1885 | swapin_readahead(entry, address, vma); | 1939 | swapin_readahead(entry, address, vma); |
@@ -1897,18 +1951,12 @@ again: | |||
1897 | 1951 | ||
1898 | /* Had to read the page from swap area: Major fault */ | 1952 | /* Had to read the page from swap area: Major fault */ |
1899 | ret = VM_FAULT_MAJOR; | 1953 | ret = VM_FAULT_MAJOR; |
1900 | inc_page_state(pgmajfault); | 1954 | count_vm_event(PGMAJFAULT); |
1901 | grab_swap_token(); | 1955 | grab_swap_token(); |
1902 | } | 1956 | } |
1903 | 1957 | ||
1904 | mark_page_accessed(page); | 1958 | mark_page_accessed(page); |
1905 | lock_page(page); | 1959 | lock_page(page); |
1906 | if (!PageSwapCache(page)) { | ||
1907 | /* Page migration has occured */ | ||
1908 | unlock_page(page); | ||
1909 | page_cache_release(page); | ||
1910 | goto again; | ||
1911 | } | ||
1912 | 1960 | ||
1913 | /* | 1961 | /* |
1914 | * Back out if somebody else already faulted in this pte. | 1962 | * Back out if somebody else already faulted in this pte. |
@@ -2074,18 +2122,31 @@ retry: | |||
2074 | /* | 2122 | /* |
2075 | * Should we do an early C-O-W break? | 2123 | * Should we do an early C-O-W break? |
2076 | */ | 2124 | */ |
2077 | if (write_access && !(vma->vm_flags & VM_SHARED)) { | 2125 | if (write_access) { |
2078 | struct page *page; | 2126 | if (!(vma->vm_flags & VM_SHARED)) { |
2127 | struct page *page; | ||
2079 | 2128 | ||
2080 | if (unlikely(anon_vma_prepare(vma))) | 2129 | if (unlikely(anon_vma_prepare(vma))) |
2081 | goto oom; | 2130 | goto oom; |
2082 | page = alloc_page_vma(GFP_HIGHUSER, vma, address); | 2131 | page = alloc_page_vma(GFP_HIGHUSER, vma, address); |
2083 | if (!page) | 2132 | if (!page) |
2084 | goto oom; | 2133 | goto oom; |
2085 | copy_user_highpage(page, new_page, address); | 2134 | copy_user_highpage(page, new_page, address); |
2086 | page_cache_release(new_page); | 2135 | page_cache_release(new_page); |
2087 | new_page = page; | 2136 | new_page = page; |
2088 | anon = 1; | 2137 | anon = 1; |
2138 | |||
2139 | } else { | ||
2140 | /* if the page will be shareable, see if the backing | ||
2141 | * address space wants to know that the page is about | ||
2142 | * to become writable */ | ||
2143 | if (vma->vm_ops->page_mkwrite && | ||
2144 | vma->vm_ops->page_mkwrite(vma, new_page) < 0 | ||
2145 | ) { | ||
2146 | page_cache_release(new_page); | ||
2147 | return VM_FAULT_SIGBUS; | ||
2148 | } | ||
2149 | } | ||
2089 | } | 2150 | } |
2090 | 2151 | ||
2091 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); | 2152 | page_table = pte_offset_map_lock(mm, pmd, address, &ptl); |
@@ -2263,7 +2324,7 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2263 | 2324 | ||
2264 | __set_current_state(TASK_RUNNING); | 2325 | __set_current_state(TASK_RUNNING); |
2265 | 2326 | ||
2266 | inc_page_state(pgfault); | 2327 | count_vm_event(PGFAULT); |
2267 | 2328 | ||
2268 | if (unlikely(is_vm_hugetlb_page(vma))) | 2329 | if (unlikely(is_vm_hugetlb_page(vma))) |
2269 | return hugetlb_fault(mm, vma, address, write_access); | 2330 | return hugetlb_fault(mm, vma, address, write_access); |
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 70df5c0d957e..01c9fb97c619 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -4,7 +4,6 @@ | |||
4 | * Copyright (C) | 4 | * Copyright (C) |
5 | */ | 5 | */ |
6 | 6 | ||
7 | #include <linux/config.h> | ||
8 | #include <linux/stddef.h> | 7 | #include <linux/stddef.h> |
9 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
10 | #include <linux/swap.h> | 9 | #include <linux/swap.h> |
@@ -21,12 +20,13 @@ | |||
21 | #include <linux/memory_hotplug.h> | 20 | #include <linux/memory_hotplug.h> |
22 | #include <linux/highmem.h> | 21 | #include <linux/highmem.h> |
23 | #include <linux/vmalloc.h> | 22 | #include <linux/vmalloc.h> |
23 | #include <linux/ioport.h> | ||
24 | 24 | ||
25 | #include <asm/tlbflush.h> | 25 | #include <asm/tlbflush.h> |
26 | 26 | ||
27 | extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, | 27 | extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, |
28 | unsigned long size); | 28 | unsigned long size); |
29 | static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) | 29 | static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) |
30 | { | 30 | { |
31 | struct pglist_data *pgdat = zone->zone_pgdat; | 31 | struct pglist_data *pgdat = zone->zone_pgdat; |
32 | int nr_pages = PAGES_PER_SECTION; | 32 | int nr_pages = PAGES_PER_SECTION; |
@@ -34,8 +34,15 @@ static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
34 | int zone_type; | 34 | int zone_type; |
35 | 35 | ||
36 | zone_type = zone - pgdat->node_zones; | 36 | zone_type = zone - pgdat->node_zones; |
37 | if (!populated_zone(zone)) { | ||
38 | int ret = 0; | ||
39 | ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages); | ||
40 | if (ret < 0) | ||
41 | return ret; | ||
42 | } | ||
37 | memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); | 43 | memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); |
38 | zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); | 44 | zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); |
45 | return 0; | ||
39 | } | 46 | } |
40 | 47 | ||
41 | extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, | 48 | extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, |
@@ -50,7 +57,11 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn) | |||
50 | if (ret < 0) | 57 | if (ret < 0) |
51 | return ret; | 58 | return ret; |
52 | 59 | ||
53 | __add_zone(zone, phys_start_pfn); | 60 | ret = __add_zone(zone, phys_start_pfn); |
61 | |||
62 | if (ret < 0) | ||
63 | return ret; | ||
64 | |||
54 | return register_new_memory(__pfn_to_section(phys_start_pfn)); | 65 | return register_new_memory(__pfn_to_section(phys_start_pfn)); |
55 | } | 66 | } |
56 | 67 | ||
@@ -115,7 +126,11 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
115 | unsigned long i; | 126 | unsigned long i; |
116 | unsigned long flags; | 127 | unsigned long flags; |
117 | unsigned long onlined_pages = 0; | 128 | unsigned long onlined_pages = 0; |
129 | struct resource res; | ||
130 | u64 section_end; | ||
131 | unsigned long start_pfn; | ||
118 | struct zone *zone; | 132 | struct zone *zone; |
133 | int need_zonelists_rebuild = 0; | ||
119 | 134 | ||
120 | /* | 135 | /* |
121 | * This doesn't need a lock to do pfn_to_page(). | 136 | * This doesn't need a lock to do pfn_to_page(). |
@@ -128,15 +143,140 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) | |||
128 | grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages); | 143 | grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages); |
129 | pgdat_resize_unlock(zone->zone_pgdat, &flags); | 144 | pgdat_resize_unlock(zone->zone_pgdat, &flags); |
130 | 145 | ||
131 | for (i = 0; i < nr_pages; i++) { | 146 | /* |
132 | struct page *page = pfn_to_page(pfn + i); | 147 | * If this zone is not yet populated it is not in the zonelists, so |
133 | online_page(page); | 148 | * the page allocator ignores it. The zonelists must therefore be |
134 | onlined_pages++; | 149 | * rebuilt after these pages come online. |
150 | */ | ||
151 | if (!populated_zone(zone)) | ||
152 | need_zonelists_rebuild = 1; | ||
153 | |||
154 | res.start = (u64)pfn << PAGE_SHIFT; | ||
155 | res.end = res.start + ((u64)nr_pages << PAGE_SHIFT) - 1; | ||
156 | res.flags = IORESOURCE_MEM; /* we just need system ram */ | ||
157 | section_end = res.end; | ||
158 | |||
159 | while (find_next_system_ram(&res) >= 0) { | ||
160 | start_pfn = (unsigned long)(res.start >> PAGE_SHIFT); | ||
161 | nr_pages = (unsigned long) | ||
162 | ((res.end + 1 - res.start) >> PAGE_SHIFT); | ||
163 | |||
164 | if (PageReserved(pfn_to_page(start_pfn))) { | ||
165 | /* this region's pages are not online yet */ | ||
166 | for (i = 0; i < nr_pages; i++) { | ||
167 | struct page *page = pfn_to_page(start_pfn + i); | ||
168 | online_page(page); | ||
169 | onlined_pages++; | ||
170 | } | ||
171 | } | ||
172 | |||
173 | res.start = res.end + 1; | ||
174 | res.end = section_end; | ||
135 | } | 175 | } |
136 | zone->present_pages += onlined_pages; | 176 | zone->present_pages += onlined_pages; |
137 | zone->zone_pgdat->node_present_pages += onlined_pages; | 177 | zone->zone_pgdat->node_present_pages += onlined_pages; |
138 | 178 | ||
139 | setup_per_zone_pages_min(); | 179 | setup_per_zone_pages_min(); |
140 | 180 | ||
181 | if (need_zonelists_rebuild) | ||
182 | build_all_zonelists(); | ||
183 | vm_total_pages = nr_free_pagecache_pages(); | ||
141 | return 0; | 184 | return 0; |
142 | } | 185 | } |
186 | |||
187 | static pg_data_t *hotadd_new_pgdat(int nid, u64 start) | ||
188 | { | ||
189 | struct pglist_data *pgdat; | ||
190 | unsigned long zones_size[MAX_NR_ZONES] = {0}; | ||
191 | unsigned long zholes_size[MAX_NR_ZONES] = {0}; | ||
192 | unsigned long start_pfn = start >> PAGE_SHIFT; | ||
193 | |||
194 | pgdat = arch_alloc_nodedata(nid); | ||
195 | if (!pgdat) | ||
196 | return NULL; | ||
197 | |||
198 | arch_refresh_nodedata(nid, pgdat); | ||
199 | |||
200 | /* we can use NODE_DATA(nid) from here */ | ||
201 | |||
202 | /* init node's zones as empty zones, we don't have any present pages.*/ | ||
203 | free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size); | ||
204 | |||
205 | return pgdat; | ||
206 | } | ||
207 | |||
208 | static void rollback_node_hotadd(int nid, pg_data_t *pgdat) | ||
209 | { | ||
210 | arch_refresh_nodedata(nid, NULL); | ||
211 | arch_free_nodedata(pgdat); | ||
212 | return; | ||
213 | } | ||
214 | |||
215 | /* add this memory to iomem resource */ | ||
216 | static void register_memory_resource(u64 start, u64 size) | ||
217 | { | ||
218 | struct resource *res; | ||
219 | |||
220 | res = kzalloc(sizeof(struct resource), GFP_KERNEL); | ||
221 | BUG_ON(!res); | ||
222 | |||
223 | res->name = "System RAM"; | ||
224 | res->start = start; | ||
225 | res->end = start + size - 1; | ||
226 | res->flags = IORESOURCE_MEM; | ||
227 | if (request_resource(&iomem_resource, res) < 0) { | ||
228 | printk("System RAM resource %llx - %llx cannot be added\n", | ||
229 | (unsigned long long)res->start, (unsigned long long)res->end); | ||
230 | kfree(res); | ||
231 | } | ||
232 | } | ||
233 | |||
234 | |||
235 | |||
236 | int add_memory(int nid, u64 start, u64 size) | ||
237 | { | ||
238 | pg_data_t *pgdat = NULL; | ||
239 | int new_pgdat = 0; | ||
240 | int ret; | ||
241 | |||
242 | if (!node_online(nid)) { | ||
243 | pgdat = hotadd_new_pgdat(nid, start); | ||
244 | if (!pgdat) | ||
245 | return -ENOMEM; | ||
246 | new_pgdat = 1; | ||
247 | ret = kswapd_run(nid); | ||
248 | if (ret) | ||
249 | goto error; | ||
250 | } | ||
251 | |||
252 | /* call arch's memory hotadd */ | ||
253 | ret = arch_add_memory(nid, start, size); | ||
254 | |||
255 | if (ret < 0) | ||
256 | goto error; | ||
257 | |||
258 | /* we online node here. we can't roll back from here. */ | ||
259 | node_set_online(nid); | ||
260 | |||
261 | if (new_pgdat) { | ||
262 | ret = register_one_node(nid); | ||
263 | /* | ||
264 | * If the sysfs file for the new node cannot be created, CPUs on | ||
265 | * that node cannot be hot-added. There is no rollback path at | ||
266 | * this point, so catch the failure with BUG_ON() for now. | ||
267 | */ | ||
268 | BUG_ON(ret); | ||
269 | } | ||
270 | |||
271 | /* register this memory as resource */ | ||
272 | register_memory_resource(start, size); | ||
273 | |||
274 | return ret; | ||
275 | error: | ||
276 | /* rollback pgdat allocation and others */ | ||
277 | if (new_pgdat) | ||
278 | rollback_node_hotadd(nid, pgdat); | ||
279 | |||
280 | return ret; | ||
281 | } | ||
282 | EXPORT_SYMBOL_GPL(add_memory); | ||
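add_memory() is the new arch-independent entry point for memory hot-add; a platform driver (for instance an ACPI memory-device driver) would call it roughly as sketched below and then leave the new sections offline until they are onlined through the sysfs entries created by register_new_memory(). Only add_memory(nid, start, size) comes from this patch; the example_* name is an assumption:

/* Rough caller sketch: hot-add a discovered physical range. */
static int example_hotadd_range(int nid, u64 start, u64 size)
{
	int ret;

	ret = add_memory(nid, start, size);	/* allocates a pgdat if needed */
	if (ret)
		printk(KERN_ERR "example: hot-add of %llx-%llx failed (%d)\n",
		       (unsigned long long)start,
		       (unsigned long long)(start + size - 1), ret);
	return ret;
}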
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8778f58880c4..e07e27e846a2 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -87,6 +87,8 @@ | |||
87 | #include <linux/seq_file.h> | 87 | #include <linux/seq_file.h> |
88 | #include <linux/proc_fs.h> | 88 | #include <linux/proc_fs.h> |
89 | #include <linux/migrate.h> | 89 | #include <linux/migrate.h> |
90 | #include <linux/rmap.h> | ||
91 | #include <linux/security.h> | ||
90 | 92 | ||
91 | #include <asm/tlbflush.h> | 93 | #include <asm/tlbflush.h> |
92 | #include <asm/uaccess.h> | 94 | #include <asm/uaccess.h> |
@@ -587,6 +589,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
587 | isolate_lru_page(page, pagelist); | 589 | isolate_lru_page(page, pagelist); |
588 | } | 590 | } |
589 | 591 | ||
592 | static struct page *new_node_page(struct page *page, unsigned long node, int **x) | ||
593 | { | ||
594 | return alloc_pages_node(node, GFP_HIGHUSER, 0); | ||
595 | } | ||
596 | |||
590 | /* | 597 | /* |
591 | * Migrate pages from one node to a target node. | 598 | * Migrate pages from one node to a target node. |
592 | * Returns error or the number of pages not migrated. | 599 | * Returns error or the number of pages not migrated. |
@@ -603,11 +610,9 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) | |||
603 | check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, | 610 | check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, |
604 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 611 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); |
605 | 612 | ||
606 | if (!list_empty(&pagelist)) { | 613 | if (!list_empty(&pagelist)) |
607 | err = migrate_pages_to(&pagelist, NULL, dest); | 614 | err = migrate_pages(&pagelist, new_node_page, dest); |
608 | if (!list_empty(&pagelist)) | 615 | |
609 | putback_lru_pages(&pagelist); | ||
610 | } | ||
611 | return err; | 616 | return err; |
612 | } | 617 | } |
613 | 618 | ||
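migrate_pages() now takes a new_page_t allocation callback and an opaque private value instead of a list of preallocated target pages; new_node_page() above is the simplest possible callback. A caller that also wants per-page status back could pass something along these lines, where the example_* names and the bookkeeping struct are assumptions and only the callback signature is taken from this patch:

struct example_target {			/* assumed caller-side bookkeeping */
	int node;			/* destination node */
	int status;			/* written by unmap_and_move() via *result */
};

static struct page *example_new_page(struct page *page, unsigned long private,
				     int **result)
{
	struct example_target *t = (struct example_target *)private;

	*result = &t->status;		/* optional: collect per-page outcome */
	return alloc_pages_node(t->node, GFP_HIGHUSER, 0);
}

/* usage (assumed caller):
 *	err = migrate_pages(&pagelist, example_new_page, (unsigned long)&target);
 */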
@@ -627,6 +632,10 @@ int do_migrate_pages(struct mm_struct *mm, | |||
627 | 632 | ||
628 | down_read(&mm->mmap_sem); | 633 | down_read(&mm->mmap_sem); |
629 | 634 | ||
635 | err = migrate_vmas(mm, from_nodes, to_nodes, flags); | ||
636 | if (err) | ||
637 | goto out; | ||
638 | |||
630 | /* | 639 | /* |
631 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' | 640 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' |
632 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' | 641 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' |
@@ -686,7 +695,7 @@ int do_migrate_pages(struct mm_struct *mm, | |||
686 | if (err < 0) | 695 | if (err < 0) |
687 | break; | 696 | break; |
688 | } | 697 | } |
689 | 698 | out: | |
690 | up_read(&mm->mmap_sem); | 699 | up_read(&mm->mmap_sem); |
691 | if (err < 0) | 700 | if (err < 0) |
692 | return err; | 701 | return err; |
@@ -694,6 +703,12 @@ int do_migrate_pages(struct mm_struct *mm, | |||
694 | 703 | ||
695 | } | 704 | } |
696 | 705 | ||
706 | static struct page *new_vma_page(struct page *page, unsigned long private, int **x) | ||
707 | { | ||
708 | struct vm_area_struct *vma = (struct vm_area_struct *)private; | ||
709 | |||
710 | return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma)); | ||
711 | } | ||
697 | #else | 712 | #else |
698 | 713 | ||
699 | static void migrate_page_add(struct page *page, struct list_head *pagelist, | 714 | static void migrate_page_add(struct page *page, struct list_head *pagelist, |
@@ -706,6 +721,11 @@ int do_migrate_pages(struct mm_struct *mm, | |||
706 | { | 721 | { |
707 | return -ENOSYS; | 722 | return -ENOSYS; |
708 | } | 723 | } |
724 | |||
725 | static struct page *new_vma_page(struct page *page, unsigned long private) | ||
726 | { | ||
727 | return NULL; | ||
728 | } | ||
709 | #endif | 729 | #endif |
710 | 730 | ||
711 | long do_mbind(unsigned long start, unsigned long len, | 731 | long do_mbind(unsigned long start, unsigned long len, |
@@ -767,15 +787,13 @@ long do_mbind(unsigned long start, unsigned long len, | |||
767 | err = mbind_range(vma, start, end, new); | 787 | err = mbind_range(vma, start, end, new); |
768 | 788 | ||
769 | if (!list_empty(&pagelist)) | 789 | if (!list_empty(&pagelist)) |
770 | nr_failed = migrate_pages_to(&pagelist, vma, -1); | 790 | nr_failed = migrate_pages(&pagelist, new_vma_page, |
791 | (unsigned long)vma); | ||
771 | 792 | ||
772 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 793 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) |
773 | err = -EIO; | 794 | err = -EIO; |
774 | } | 795 | } |
775 | 796 | ||
776 | if (!list_empty(&pagelist)) | ||
777 | putback_lru_pages(&pagelist); | ||
778 | |||
779 | up_write(&mm->mmap_sem); | 797 | up_write(&mm->mmap_sem); |
780 | mpol_free(new); | 798 | mpol_free(new); |
781 | return err; | 799 | return err; |
@@ -929,6 +947,10 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, | |||
929 | goto out; | 947 | goto out; |
930 | } | 948 | } |
931 | 949 | ||
950 | err = security_task_movememory(task); | ||
951 | if (err) | ||
952 | goto out; | ||
953 | |||
932 | err = do_migrate_pages(mm, &old, &new, | 954 | err = do_migrate_pages(mm, &old, &new, |
933 | capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); | 955 | capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); |
934 | out: | 956 | out: |
@@ -1187,10 +1209,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, | |||
1187 | 1209 | ||
1188 | zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); | 1210 | zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); |
1189 | page = __alloc_pages(gfp, order, zl); | 1211 | page = __alloc_pages(gfp, order, zl); |
1190 | if (page && page_zone(page) == zl->zones[0]) { | 1212 | if (page && page_zone(page) == zl->zones[0]) |
1191 | zone_pcp(zl->zones[0],get_cpu())->interleave_hit++; | 1213 | inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); |
1192 | put_cpu(); | ||
1193 | } | ||
1194 | return page; | 1214 | return page; |
1195 | } | 1215 | } |
1196 | 1216 | ||
@@ -1799,7 +1819,7 @@ static inline void check_huge_range(struct vm_area_struct *vma, | |||
1799 | 1819 | ||
1800 | int show_numa_map(struct seq_file *m, void *v) | 1820 | int show_numa_map(struct seq_file *m, void *v) |
1801 | { | 1821 | { |
1802 | struct task_struct *task = m->private; | 1822 | struct proc_maps_private *priv = m->private; |
1803 | struct vm_area_struct *vma = v; | 1823 | struct vm_area_struct *vma = v; |
1804 | struct numa_maps *md; | 1824 | struct numa_maps *md; |
1805 | struct file *file = vma->vm_file; | 1825 | struct file *file = vma->vm_file; |
@@ -1815,7 +1835,7 @@ int show_numa_map(struct seq_file *m, void *v) | |||
1815 | return 0; | 1835 | return 0; |
1816 | 1836 | ||
1817 | mpol_to_str(buffer, sizeof(buffer), | 1837 | mpol_to_str(buffer, sizeof(buffer), |
1818 | get_vma_policy(task, vma, vma->vm_start)); | 1838 | get_vma_policy(priv->task, vma, vma->vm_start)); |
1819 | 1839 | ||
1820 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); | 1840 | seq_printf(m, "%08lx %s", vma->vm_start, buffer); |
1821 | 1841 | ||
@@ -1869,7 +1889,7 @@ out: | |||
1869 | kfree(md); | 1889 | kfree(md); |
1870 | 1890 | ||
1871 | if (m->count < m->size) | 1891 | if (m->count < m->size) |
1872 | m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; | 1892 | m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; |
1873 | return 0; | 1893 | return 0; |
1874 | } | 1894 | } |
1875 | 1895 | ||
diff --git a/mm/migrate.c b/mm/migrate.c index 1c25040693d2..3f1e0c2c942c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include <linux/migrate.h> | 15 | #include <linux/migrate.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/swap.h> | 17 | #include <linux/swap.h> |
18 | #include <linux/swapops.h> | ||
18 | #include <linux/pagemap.h> | 19 | #include <linux/pagemap.h> |
19 | #include <linux/buffer_head.h> | 20 | #include <linux/buffer_head.h> |
20 | #include <linux/mm_inline.h> | 21 | #include <linux/mm_inline.h> |
@@ -23,13 +24,13 @@ | |||
23 | #include <linux/topology.h> | 24 | #include <linux/topology.h> |
24 | #include <linux/cpu.h> | 25 | #include <linux/cpu.h> |
25 | #include <linux/cpuset.h> | 26 | #include <linux/cpuset.h> |
26 | #include <linux/swapops.h> | 27 | #include <linux/writeback.h> |
28 | #include <linux/mempolicy.h> | ||
29 | #include <linux/vmalloc.h> | ||
30 | #include <linux/security.h> | ||
27 | 31 | ||
28 | #include "internal.h" | 32 | #include "internal.h" |
29 | 33 | ||
30 | /* The maximum number of pages to take off the LRU for migration */ | ||
31 | #define MIGRATE_CHUNK_SIZE 256 | ||
32 | |||
33 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) | 34 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
34 | 35 | ||
35 | /* | 36 | /* |
@@ -64,16 +65,11 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist) | |||
64 | } | 65 | } |
65 | 66 | ||
66 | /* | 67 | /* |
67 | * migrate_prep() needs to be called after we have compiled the list of pages | 68 | * migrate_prep() needs to be called before we start compiling a list of pages |
68 | * to be migrated using isolate_lru_page() but before we begin a series of calls | 69 | * to be migrated using isolate_lru_page(). |
69 | * to migrate_pages(). | ||
70 | */ | 70 | */ |
71 | int migrate_prep(void) | 71 | int migrate_prep(void) |
72 | { | 72 | { |
73 | /* Must have swap device for migration */ | ||
74 | if (nr_swap_pages <= 0) | ||
75 | return -ENODEV; | ||
76 | |||
77 | /* | 73 | /* |
78 | * Clear the LRU lists so pages can be isolated. | 74 | * Clear the LRU lists so pages can be isolated. |
79 | * Note that pages may be moved off the LRU after we have | 75 | * Note that pages may be moved off the LRU after we have |
@@ -87,7 +83,6 @@ int migrate_prep(void) | |||
87 | 83 | ||
88 | static inline void move_to_lru(struct page *page) | 84 | static inline void move_to_lru(struct page *page) |
89 | { | 85 | { |
90 | list_del(&page->lru); | ||
91 | if (PageActive(page)) { | 86 | if (PageActive(page)) { |
92 | /* | 87 | /* |
93 | * lru_cache_add_active checks that | 88 | * lru_cache_add_active checks that |
@@ -113,113 +108,200 @@ int putback_lru_pages(struct list_head *l) | |||
113 | int count = 0; | 108 | int count = 0; |
114 | 109 | ||
115 | list_for_each_entry_safe(page, page2, l, lru) { | 110 | list_for_each_entry_safe(page, page2, l, lru) { |
111 | list_del(&page->lru); | ||
116 | move_to_lru(page); | 112 | move_to_lru(page); |
117 | count++; | 113 | count++; |
118 | } | 114 | } |
119 | return count; | 115 | return count; |
120 | } | 116 | } |
121 | 117 | ||
122 | /* | 118 | static inline int is_swap_pte(pte_t pte) |
123 | * Non migratable page | ||
124 | */ | ||
125 | int fail_migrate_page(struct page *newpage, struct page *page) | ||
126 | { | 119 | { |
127 | return -EIO; | 120 | return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); |
128 | } | 121 | } |
129 | EXPORT_SYMBOL(fail_migrate_page); | ||
130 | 122 | ||
131 | /* | 123 | /* |
132 | * swapout a single page | 124 | * Restore a potential migration pte to a working pte entry |
133 | * page is locked upon entry, unlocked on exit | ||
134 | */ | 125 | */ |
135 | static int swap_page(struct page *page) | 126 | static void remove_migration_pte(struct vm_area_struct *vma, |
127 | struct page *old, struct page *new) | ||
136 | { | 128 | { |
137 | struct address_space *mapping = page_mapping(page); | 129 | struct mm_struct *mm = vma->vm_mm; |
130 | swp_entry_t entry; | ||
131 | pgd_t *pgd; | ||
132 | pud_t *pud; | ||
133 | pmd_t *pmd; | ||
134 | pte_t *ptep, pte; | ||
135 | spinlock_t *ptl; | ||
136 | unsigned long addr = page_address_in_vma(new, vma); | ||
137 | |||
138 | if (addr == -EFAULT) | ||
139 | return; | ||
140 | |||
141 | pgd = pgd_offset(mm, addr); | ||
142 | if (!pgd_present(*pgd)) | ||
143 | return; | ||
144 | |||
145 | pud = pud_offset(pgd, addr); | ||
146 | if (!pud_present(*pud)) | ||
147 | return; | ||
148 | |||
149 | pmd = pmd_offset(pud, addr); | ||
150 | if (!pmd_present(*pmd)) | ||
151 | return; | ||
152 | |||
153 | ptep = pte_offset_map(pmd, addr); | ||
154 | |||
155 | if (!is_swap_pte(*ptep)) { | ||
156 | pte_unmap(ptep); | ||
157 | return; | ||
158 | } | ||
138 | 159 | ||
139 | if (page_mapped(page) && mapping) | 160 | ptl = pte_lockptr(mm, pmd); |
140 | if (try_to_unmap(page, 1) != SWAP_SUCCESS) | 161 | spin_lock(ptl); |
141 | goto unlock_retry; | 162 | pte = *ptep; |
163 | if (!is_swap_pte(pte)) | ||
164 | goto out; | ||
142 | 165 | ||
143 | if (PageDirty(page)) { | 166 | entry = pte_to_swp_entry(pte); |
144 | /* Page is dirty, try to write it out here */ | ||
145 | switch(pageout(page, mapping)) { | ||
146 | case PAGE_KEEP: | ||
147 | case PAGE_ACTIVATE: | ||
148 | goto unlock_retry; | ||
149 | 167 | ||
150 | case PAGE_SUCCESS: | 168 | if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) |
151 | goto retry; | 169 | goto out; |
152 | 170 | ||
153 | case PAGE_CLEAN: | 171 | get_page(new); |
154 | ; /* try to free the page below */ | 172 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
155 | } | 173 | if (is_write_migration_entry(entry)) |
156 | } | 174 | pte = pte_mkwrite(pte); |
175 | set_pte_at(mm, addr, ptep, pte); | ||
157 | 176 | ||
158 | if (PagePrivate(page)) { | 177 | if (PageAnon(new)) |
159 | if (!try_to_release_page(page, GFP_KERNEL) || | 178 | page_add_anon_rmap(new, vma, addr); |
160 | (!mapping && page_count(page) == 1)) | 179 | else |
161 | goto unlock_retry; | 180 | page_add_file_rmap(new); |
162 | } | ||
163 | 181 | ||
164 | if (remove_mapping(mapping, page)) { | 182 | /* No need to invalidate - it was non-present before */ |
165 | /* Success */ | 183 | update_mmu_cache(vma, addr, pte); |
166 | unlock_page(page); | 184 | lazy_mmu_prot_update(pte); |
167 | return 0; | ||
168 | } | ||
169 | 185 | ||
170 | unlock_retry: | 186 | out: |
171 | unlock_page(page); | 187 | pte_unmap_unlock(ptep, ptl); |
188 | } | ||
172 | 189 | ||
173 | retry: | 190 | /* |
174 | return -EAGAIN; | 191 | * Note that remove_file_migration_ptes will only work on regular mappings, |
192 | * Nonlinear mappings do not use migration entries. | ||
193 | */ | ||
194 | static void remove_file_migration_ptes(struct page *old, struct page *new) | ||
195 | { | ||
196 | struct vm_area_struct *vma; | ||
197 | struct address_space *mapping = page_mapping(new); | ||
198 | struct prio_tree_iter iter; | ||
199 | pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | ||
200 | |||
201 | if (!mapping) | ||
202 | return; | ||
203 | |||
204 | spin_lock(&mapping->i_mmap_lock); | ||
205 | |||
206 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) | ||
207 | remove_migration_pte(vma, old, new); | ||
208 | |||
209 | spin_unlock(&mapping->i_mmap_lock); | ||
175 | } | 210 | } |
176 | 211 | ||
177 | /* | 212 | /* |
178 | * Remove references for a page and establish the new page with the correct | 213 | * Must hold mmap_sem lock on at least one of the vmas containing |
179 | * basic settings to be able to stop accesses to the page. | 214 | * the page so that the anon_vma cannot vanish. |
180 | */ | 215 | */ |
181 | int migrate_page_remove_references(struct page *newpage, | 216 | static void remove_anon_migration_ptes(struct page *old, struct page *new) |
182 | struct page *page, int nr_refs) | ||
183 | { | 217 | { |
184 | struct address_space *mapping = page_mapping(page); | 218 | struct anon_vma *anon_vma; |
185 | struct page **radix_pointer; | 219 | struct vm_area_struct *vma; |
220 | unsigned long mapping; | ||
186 | 221 | ||
187 | /* | 222 | mapping = (unsigned long)new->mapping; |
188 | * Avoid doing any of the following work if the page count | ||
189 | * indicates that the page is in use or truncate has removed | ||
190 | * the page. | ||
191 | */ | ||
192 | if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) | ||
193 | return -EAGAIN; | ||
194 | 223 | ||
195 | /* | 224 | if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) |
196 | * Establish swap ptes for anonymous pages or destroy pte | 225 | return; |
197 | * maps for files. | ||
198 | * | ||
199 | * In order to reestablish file backed mappings the fault handlers | ||
200 | * will take the radix tree_lock which may then be used to stop | ||
201 | * processses from accessing this page until the new page is ready. | ||
202 | * | ||
203 | * A process accessing via a swap pte (an anonymous page) will take a | ||
204 | * page_lock on the old page which will block the process until the | ||
205 | * migration attempt is complete. At that time the PageSwapCache bit | ||
206 | * will be examined. If the page was migrated then the PageSwapCache | ||
207 | * bit will be clear and the operation to retrieve the page will be | ||
208 | * retried which will find the new page in the radix tree. Then a new | ||
209 | * direct mapping may be generated based on the radix tree contents. | ||
210 | * | ||
211 | * If the page was not migrated then the PageSwapCache bit | ||
212 | * is still set and the operation may continue. | ||
213 | */ | ||
214 | if (try_to_unmap(page, 1) == SWAP_FAIL) | ||
215 | /* A vma has VM_LOCKED set -> permanent failure */ | ||
216 | return -EPERM; | ||
217 | 226 | ||
218 | /* | 227 | /* |
219 | * Give up if we were unable to remove all mappings. | 228 | * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. |
220 | */ | 229 | */ |
221 | if (page_mapcount(page)) | 230 | anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); |
222 | return -EAGAIN; | 231 | spin_lock(&anon_vma->lock); |
232 | |||
233 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) | ||
234 | remove_migration_pte(vma, old, new); | ||
235 | |||
236 | spin_unlock(&anon_vma->lock); | ||
237 | } | ||
238 | |||
239 | /* | ||
240 | * Get rid of all migration entries and replace them by | ||
241 | * references to the indicated page. | ||
242 | */ | ||
243 | static void remove_migration_ptes(struct page *old, struct page *new) | ||
244 | { | ||
245 | if (PageAnon(new)) | ||
246 | remove_anon_migration_ptes(old, new); | ||
247 | else | ||
248 | remove_file_migration_ptes(old, new); | ||
249 | } | ||
250 | |||
251 | /* | ||
252 | * Something used the pte of a page under migration. We need to | ||
253 | * get to the page and wait until migration is finished. | ||
254 | * When we return from this function the fault will be retried. | ||
255 | * | ||
256 | * This function is called from do_swap_page(). | ||
257 | */ | ||
258 | void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, | ||
259 | unsigned long address) | ||
260 | { | ||
261 | pte_t *ptep, pte; | ||
262 | spinlock_t *ptl; | ||
263 | swp_entry_t entry; | ||
264 | struct page *page; | ||
265 | |||
266 | ptep = pte_offset_map_lock(mm, pmd, address, &ptl); | ||
267 | pte = *ptep; | ||
268 | if (!is_swap_pte(pte)) | ||
269 | goto out; | ||
270 | |||
271 | entry = pte_to_swp_entry(pte); | ||
272 | if (!is_migration_entry(entry)) | ||
273 | goto out; | ||
274 | |||
275 | page = migration_entry_to_page(entry); | ||
276 | |||
277 | get_page(page); | ||
278 | pte_unmap_unlock(ptep, ptl); | ||
279 | wait_on_page_locked(page); | ||
280 | put_page(page); | ||
281 | return; | ||
282 | out: | ||
283 | pte_unmap_unlock(ptep, ptl); | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * Replace the page in the mapping. | ||
288 | * | ||
289 | * The number of remaining references must be: | ||
290 | * 1 for anonymous pages without a mapping | ||
291 | * 2 for pages with a mapping | ||
292 | * 3 for pages with a mapping and PagePrivate set. | ||
293 | */ | ||
294 | static int migrate_page_move_mapping(struct address_space *mapping, | ||
295 | struct page *newpage, struct page *page) | ||
296 | { | ||
297 | struct page **radix_pointer; | ||
298 | |||
299 | if (!mapping) { | ||
300 | /* Anonymous page */ | ||
301 | if (page_count(page) != 1) | ||
302 | return -EAGAIN; | ||
303 | return 0; | ||
304 | } | ||
223 | 305 | ||
224 | write_lock_irq(&mapping->tree_lock); | 306 | write_lock_irq(&mapping->tree_lock); |
225 | 307 | ||
@@ -227,7 +309,7 @@ int migrate_page_remove_references(struct page *newpage, | |||
227 | &mapping->page_tree, | 309 | &mapping->page_tree, |
228 | page_index(page)); | 310 | page_index(page)); |
229 | 311 | ||
230 | if (!page_mapping(page) || page_count(page) != nr_refs || | 312 | if (page_count(page) != 2 + !!PagePrivate(page) || |
231 | *radix_pointer != page) { | 313 | *radix_pointer != page) { |
232 | write_unlock_irq(&mapping->tree_lock); | 314 | write_unlock_irq(&mapping->tree_lock); |
233 | return -EAGAIN; | 315 | return -EAGAIN; |
@@ -235,19 +317,14 @@ int migrate_page_remove_references(struct page *newpage, | |||
235 | 317 | ||
236 | /* | 318 | /* |
237 | * Now we know that no one else is looking at the page. | 319 | * Now we know that no one else is looking at the page. |
238 | * | ||
239 | * Certain minimal information about a page must be available | ||
240 | * in order for other subsystems to properly handle the page if they | ||
241 | * find it through the radix tree update before we are finished | ||
242 | * copying the page. | ||
243 | */ | 320 | */ |
244 | get_page(newpage); | 321 | get_page(newpage); |
245 | newpage->index = page->index; | 322 | #ifdef CONFIG_SWAP |
246 | newpage->mapping = page->mapping; | ||
247 | if (PageSwapCache(page)) { | 323 | if (PageSwapCache(page)) { |
248 | SetPageSwapCache(newpage); | 324 | SetPageSwapCache(newpage); |
249 | set_page_private(newpage, page_private(page)); | 325 | set_page_private(newpage, page_private(page)); |
250 | } | 326 | } |
327 | #endif | ||
251 | 328 | ||
252 | *radix_pointer = newpage; | 329 | *radix_pointer = newpage; |
253 | __put_page(page); | 330 | __put_page(page); |
@@ -255,12 +332,11 @@ int migrate_page_remove_references(struct page *newpage, | |||
255 | 332 | ||
256 | return 0; | 333 | return 0; |
257 | } | 334 | } |
258 | EXPORT_SYMBOL(migrate_page_remove_references); | ||
259 | 335 | ||
260 | /* | 336 | /* |
261 | * Copy the page to its new location | 337 | * Copy the page to its new location |
262 | */ | 338 | */ |
263 | void migrate_page_copy(struct page *newpage, struct page *page) | 339 | static void migrate_page_copy(struct page *newpage, struct page *page) |
264 | { | 340 | { |
265 | copy_highpage(newpage, page); | 341 | copy_highpage(newpage, page); |
266 | 342 | ||
@@ -282,7 +358,9 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
282 | set_page_dirty(newpage); | 358 | set_page_dirty(newpage); |
283 | } | 359 | } |
284 | 360 | ||
361 | #ifdef CONFIG_SWAP | ||
285 | ClearPageSwapCache(page); | 362 | ClearPageSwapCache(page); |
363 | #endif | ||
286 | ClearPageActive(page); | 364 | ClearPageActive(page); |
287 | ClearPagePrivate(page); | 365 | ClearPagePrivate(page); |
288 | set_page_private(page, 0); | 366 | set_page_private(page, 0); |
@@ -295,7 +373,18 @@ void migrate_page_copy(struct page *newpage, struct page *page) | |||
295 | if (PageWriteback(newpage)) | 373 | if (PageWriteback(newpage)) |
296 | end_page_writeback(newpage); | 374 | end_page_writeback(newpage); |
297 | } | 375 | } |
298 | EXPORT_SYMBOL(migrate_page_copy); | 376 | |
377 | /************************************************************ | ||
378 | * Migration functions | ||
379 | ***********************************************************/ | ||
380 | |||
381 | /* Always fail migration. Used for mappings that are not movable */ | ||
382 | int fail_migrate_page(struct address_space *mapping, | ||
383 | struct page *newpage, struct page *page) | ||
384 | { | ||
385 | return -EIO; | ||
386 | } | ||
387 | EXPORT_SYMBOL(fail_migrate_page); | ||
299 | 388 | ||
300 | /* | 389 | /* |
301 | * Common logic to directly migrate a single page suitable for | 390 | * Common logic to directly migrate a single page suitable for |
@@ -303,51 +392,284 @@ EXPORT_SYMBOL(migrate_page_copy); | |||
303 | * | 392 | * |
304 | * Pages are locked upon entry and exit. | 393 | * Pages are locked upon entry and exit. |
305 | */ | 394 | */ |
306 | int migrate_page(struct page *newpage, struct page *page) | 395 | int migrate_page(struct address_space *mapping, |
396 | struct page *newpage, struct page *page) | ||
307 | { | 397 | { |
308 | int rc; | 398 | int rc; |
309 | 399 | ||
310 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ | 400 | BUG_ON(PageWriteback(page)); /* Writeback must be complete */ |
311 | 401 | ||
312 | rc = migrate_page_remove_references(newpage, page, 2); | 402 | rc = migrate_page_move_mapping(mapping, newpage, page); |
403 | |||
404 | if (rc) | ||
405 | return rc; | ||
406 | |||
407 | migrate_page_copy(newpage, page); | ||
408 | return 0; | ||
409 | } | ||
410 | EXPORT_SYMBOL(migrate_page); | ||
411 | |||
412 | /* | ||
413 | * Migration function for pages with buffers. This function can only be used | ||
414 | * if the underlying filesystem guarantees that no other references to "page" | ||
415 | * exist. | ||
416 | */ | ||
417 | int buffer_migrate_page(struct address_space *mapping, | ||
418 | struct page *newpage, struct page *page) | ||
419 | { | ||
420 | struct buffer_head *bh, *head; | ||
421 | int rc; | ||
422 | |||
423 | if (!page_has_buffers(page)) | ||
424 | return migrate_page(mapping, newpage, page); | ||
425 | |||
426 | head = page_buffers(page); | ||
427 | |||
428 | rc = migrate_page_move_mapping(mapping, newpage, page); | ||
313 | 429 | ||
314 | if (rc) | 430 | if (rc) |
315 | return rc; | 431 | return rc; |
316 | 432 | ||
433 | bh = head; | ||
434 | do { | ||
435 | get_bh(bh); | ||
436 | lock_buffer(bh); | ||
437 | bh = bh->b_this_page; | ||
438 | |||
439 | } while (bh != head); | ||
440 | |||
441 | ClearPagePrivate(page); | ||
442 | set_page_private(newpage, page_private(page)); | ||
443 | set_page_private(page, 0); | ||
444 | put_page(page); | ||
445 | get_page(newpage); | ||
446 | |||
447 | bh = head; | ||
448 | do { | ||
449 | set_bh_page(bh, newpage, bh_offset(bh)); | ||
450 | bh = bh->b_this_page; | ||
451 | |||
452 | } while (bh != head); | ||
453 | |||
454 | SetPagePrivate(newpage); | ||
455 | |||
317 | migrate_page_copy(newpage, page); | 456 | migrate_page_copy(newpage, page); |
318 | 457 | ||
458 | bh = head; | ||
459 | do { | ||
460 | unlock_buffer(bh); | ||
461 | put_bh(bh); | ||
462 | bh = bh->b_this_page; | ||
463 | |||
464 | } while (bh != head); | ||
465 | |||
466 | return 0; | ||
467 | } | ||
468 | EXPORT_SYMBOL(buffer_migrate_page); | ||
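Filesystems opt into this path through the ->migratepage() hook in their address_space_operations; move_to_new_page() further down calls it when present and otherwise uses fallback_migrate_page(). A block-based filesystem would typically wire it up roughly as follows. example_aops and the read/write methods are assumptions; buffer_migrate_page() and migrate_page() are the helpers exported above:

/* Sketch: exporting a migration method from a filesystem's aops. */
static struct address_space_operations example_aops = {
	.readpage	= example_readpage,	/* assumed */
	.writepage	= example_writepage,	/* assumed */
	/* pages backed by buffer_heads can use the generic helper */
	.migratepage	= buffer_migrate_page,
	/* a filesystem with no private page state could instead use:
	 *	.migratepage	= migrate_page,
	 */
};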
469 | |||
470 | /* | ||
471 | * Writeback a page to clean the dirty state | ||
472 | */ | ||
473 | static int writeout(struct address_space *mapping, struct page *page) | ||
474 | { | ||
475 | struct writeback_control wbc = { | ||
476 | .sync_mode = WB_SYNC_NONE, | ||
477 | .nr_to_write = 1, | ||
478 | .range_start = 0, | ||
479 | .range_end = LLONG_MAX, | ||
480 | .nonblocking = 1, | ||
481 | .for_reclaim = 1 | ||
482 | }; | ||
483 | int rc; | ||
484 | |||
485 | if (!mapping->a_ops->writepage) | ||
486 | /* No write method for the address space */ | ||
487 | return -EINVAL; | ||
488 | |||
489 | if (!clear_page_dirty_for_io(page)) | ||
490 | /* Someone else already triggered a write */ | ||
491 | return -EAGAIN; | ||
492 | |||
319 | /* | 493 | /* |
320 | * Remove auxiliary swap entries and replace | 494 | * A dirty page may imply that the underlying filesystem has |
321 | * them with real ptes. | 495 | * the page on some queue. So the page must be clean for |
322 | * | 496 | * migration. Writeout may mean we loose the lock and the |
323 | * Note that a real pte entry will allow processes that are not | 497 | * page state is no longer what we checked for earlier. |
324 | * waiting on the page lock to use the new page via the page tables | 498 | * At this point we know that the migration attempt cannot |
325 | * before the new page is unlocked. | 499 | * be successful. |
326 | */ | 500 | */ |
327 | remove_from_swap(newpage); | 501 | remove_migration_ptes(page, page); |
328 | return 0; | 502 | |
503 | rc = mapping->a_ops->writepage(page, &wbc); | ||
504 | if (rc < 0) | ||
505 | /* I/O Error writing */ | ||
506 | return -EIO; | ||
507 | |||
508 | if (rc != AOP_WRITEPAGE_ACTIVATE) | ||
509 | /* unlocked. Relock */ | ||
510 | lock_page(page); | ||
511 | |||
512 | return -EAGAIN; | ||
513 | } | ||
514 | |||
515 | /* | ||
516 | * Default handling if a filesystem does not provide a migration function. | ||
517 | */ | ||
518 | static int fallback_migrate_page(struct address_space *mapping, | ||
519 | struct page *newpage, struct page *page) | ||
520 | { | ||
521 | if (PageDirty(page)) | ||
522 | return writeout(mapping, page); | ||
523 | |||
524 | /* | ||
525 | * Buffers may be managed in a filesystem specific way. | ||
526 | * We must have no buffers or drop them. | ||
527 | */ | ||
528 | if (page_has_buffers(page) && | ||
529 | !try_to_release_page(page, GFP_KERNEL)) | ||
530 | return -EAGAIN; | ||
531 | |||
532 | return migrate_page(mapping, newpage, page); | ||
533 | } | ||
534 | |||
535 | /* | ||
536 | * Move a page to a newly allocated page | ||
537 | * The page is locked and all ptes have been successfully removed. | ||
538 | * | ||
539 | * The new page will have replaced the old page if this function | ||
540 | * is successful. | ||
541 | */ | ||
542 | static int move_to_new_page(struct page *newpage, struct page *page) | ||
543 | { | ||
544 | struct address_space *mapping; | ||
545 | int rc; | ||
546 | |||
547 | /* | ||
548 | * Block others from accessing the page when we get around to | ||
549 | * establishing additional references. We are the only one | ||
550 | * holding a reference to the new page at this point. | ||
551 | */ | ||
552 | if (TestSetPageLocked(newpage)) | ||
553 | BUG(); | ||
554 | |||
555 | /* Prepare mapping for the new page.*/ | ||
556 | newpage->index = page->index; | ||
557 | newpage->mapping = page->mapping; | ||
558 | |||
559 | mapping = page_mapping(page); | ||
560 | if (!mapping) | ||
561 | rc = migrate_page(mapping, newpage, page); | ||
562 | else if (mapping->a_ops->migratepage) | ||
563 | /* | ||
564 | * Most pages have a mapping and most filesystems | ||
565 | * should provide a migration function. Anonymous | ||
566 | * pages are part of swap space which also has its | ||
567 | * own migration function. This is the most common | ||
568 | * path for page migration. | ||
569 | */ | ||
570 | rc = mapping->a_ops->migratepage(mapping, | ||
571 | newpage, page); | ||
572 | else | ||
573 | rc = fallback_migrate_page(mapping, newpage, page); | ||
574 | |||
575 | if (!rc) | ||
576 | remove_migration_ptes(page, newpage); | ||
577 | else | ||
578 | newpage->mapping = NULL; | ||
579 | |||
580 | unlock_page(newpage); | ||
581 | |||
582 | return rc; | ||
583 | } | ||
584 | |||
585 | /* | ||
586 | * Obtain the lock on page, remove all ptes and migrate the page | ||
587 | * to the newly allocated page in newpage. | ||
588 | */ | ||
589 | static int unmap_and_move(new_page_t get_new_page, unsigned long private, | ||
590 | struct page *page, int force) | ||
591 | { | ||
592 | int rc = 0; | ||
593 | int *result = NULL; | ||
594 | struct page *newpage = get_new_page(page, private, &result); | ||
595 | |||
596 | if (!newpage) | ||
597 | return -ENOMEM; | ||
598 | |||
599 | if (page_count(page) == 1) | ||
600 | /* page was freed from under us. So we are done. */ | ||
601 | goto move_newpage; | ||
602 | |||
603 | rc = -EAGAIN; | ||
604 | if (TestSetPageLocked(page)) { | ||
605 | if (!force) | ||
606 | goto move_newpage; | ||
607 | lock_page(page); | ||
608 | } | ||
609 | |||
610 | if (PageWriteback(page)) { | ||
611 | if (!force) | ||
612 | goto unlock; | ||
613 | wait_on_page_writeback(page); | ||
614 | } | ||
615 | |||
616 | /* | ||
617 | * Establish migration ptes or remove ptes | ||
618 | */ | ||
619 | try_to_unmap(page, 1); | ||
620 | if (!page_mapped(page)) | ||
621 | rc = move_to_new_page(newpage, page); | ||
622 | |||
623 | if (rc) | ||
624 | remove_migration_ptes(page, page); | ||
625 | |||
626 | unlock: | ||
627 | unlock_page(page); | ||
628 | |||
629 | if (rc != -EAGAIN) { | ||
630 | /* | ||
631 | * A page that has been migrated has all references | ||
632 | * removed and will be freed. A page that has not been | ||
633 | * migrated will have kept its references and be | ||
634 | * restored. | ||
635 | */ | ||
636 | list_del(&page->lru); | ||
637 | move_to_lru(page); | ||
638 | } | ||
639 | |||
640 | move_newpage: | ||
641 | /* | ||
642 | * Move the new page to the LRU. If migration was not successful | ||
643 | * then this will free the page. | ||
644 | */ | ||
645 | move_to_lru(newpage); | ||
646 | if (result) { | ||
647 | if (rc) | ||
648 | *result = rc; | ||
649 | else | ||
650 | *result = page_to_nid(newpage); | ||
651 | } | ||
652 | return rc; | ||
329 | } | 653 | } |
330 | EXPORT_SYMBOL(migrate_page); | ||
331 | 654 | ||
332 | /* | 655 | /* |
333 | * migrate_pages | 656 | * migrate_pages |
334 | * | 657 | * |
335 | * Two lists are passed to this function. The first list | 658 | * The function takes one list of pages to migrate and an allocation |
336 | * contains the pages isolated from the LRU to be migrated. | 659 | * callback which, given a page to be migrated and the private data, |
337 | * The second list contains new pages that the pages isolated | 660 | * picks the target of the move and allocates the new page. |
338 | * can be moved to. If the second list is NULL then all | ||
339 | * pages are swapped out. | ||
340 | * | 661 | * |
341 | * The function returns after 10 attempts or if no pages | 662 | * The function returns after 10 attempts or if no pages |
342 | * are movable anymore because to has become empty | 663 | * are movable anymore because to has become empty |
343 | * or no retryable pages exist anymore. | 664 | * or no retryable pages exist anymore. All pages will be |
665 | * returned to the LRU or freed. | ||
344 | * | 666 | * |
345 | * Return: Number of pages not migrated when "to" ran empty. | 667 | * Return: Number of pages not migrated or error code. |
346 | */ | 668 | */ |
347 | int migrate_pages(struct list_head *from, struct list_head *to, | 669 | int migrate_pages(struct list_head *from, |
348 | struct list_head *moved, struct list_head *failed) | 670 | new_page_t get_new_page, unsigned long private) |
349 | { | 671 | { |
350 | int retry; | 672 | int retry = 1; |
351 | int nr_failed = 0; | 673 | int nr_failed = 0; |
352 | int pass = 0; | 674 | int pass = 0; |
353 | struct page *page; | 675 | struct page *page; |
@@ -358,305 +680,317 @@ int migrate_pages(struct list_head *from, struct list_head *to, | |||
358 | if (!swapwrite) | 680 | if (!swapwrite) |
359 | current->flags |= PF_SWAPWRITE; | 681 | current->flags |= PF_SWAPWRITE; |
360 | 682 | ||
361 | redo: | 683 | for(pass = 0; pass < 10 && retry; pass++) { |
362 | retry = 0; | 684 | retry = 0; |
685 | |||
686 | list_for_each_entry_safe(page, page2, from, lru) { | ||
687 | cond_resched(); | ||
688 | |||
689 | rc = unmap_and_move(get_new_page, private, | ||
690 | page, pass > 2); | ||
691 | |||
692 | switch(rc) { | ||
693 | case -ENOMEM: | ||
694 | goto out; | ||
695 | case -EAGAIN: | ||
696 | retry++; | ||
697 | break; | ||
698 | case 0: | ||
699 | break; | ||
700 | default: | ||
701 | /* Permanent failure */ | ||
702 | nr_failed++; | ||
703 | break; | ||
704 | } | ||
705 | } | ||
706 | } | ||
707 | rc = 0; | ||
708 | out: | ||
709 | if (!swapwrite) | ||
710 | current->flags &= ~PF_SWAPWRITE; | ||
711 | |||
712 | putback_lru_pages(from); | ||
713 | |||
714 | if (rc) | ||
715 | return rc; | ||
363 | 716 | ||
364 | list_for_each_entry_safe(page, page2, from, lru) { | 717 | return nr_failed + retry; |
365 | struct page *newpage = NULL; | 718 | } |
366 | struct address_space *mapping; | ||
367 | 719 | ||
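A sketch of how a caller might drive the reworked migrate_pages() interface: pages are first isolated onto a private list, then an allocator matching the new_page_t callback shape (as in new_page_node() below) decides where each replacement page comes from. The names and the fixed target node are purely illustrative:

/* allocate each replacement page on the node passed in "private" */
static struct page *new_page_on_node(struct page *page, unsigned long private,
				     int **result)
{
	return alloc_pages_node((int)private, GFP_HIGHUSER, 0);
}

static int move_list_to_node(struct list_head *pagelist, int target_node)
{
	/* pages on *pagelist must already be isolated from the LRU;
	 * migrate_pages() returns the number not migrated (or an errno)
	 * and puts every page back on the LRU or frees it */
	return migrate_pages(pagelist, new_page_on_node,
			     (unsigned long)target_node);
}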
368 | cond_resched(); | 720 | #ifdef CONFIG_NUMA |
721 | /* | ||
722 | * Move a list of individual pages | ||
723 | */ | ||
724 | struct page_to_node { | ||
725 | unsigned long addr; | ||
726 | struct page *page; | ||
727 | int node; | ||
728 | int status; | ||
729 | }; | ||
369 | 730 | ||
370 | rc = 0; | 731 | static struct page *new_page_node(struct page *p, unsigned long private, |
371 | if (page_count(page) == 1) | 732 | int **result) |
372 | /* page was freed from under us. So we are done. */ | 733 | { |
373 | goto next; | 734 | struct page_to_node *pm = (struct page_to_node *)private; |
374 | 735 | ||
375 | if (to && list_empty(to)) | 736 | while (pm->node != MAX_NUMNODES && pm->page != p) |
376 | break; | 737 | pm++; |
377 | 738 | ||
378 | /* | 739 | if (pm->node == MAX_NUMNODES) |
379 | * Skip locked pages during the first two passes to give the | 740 | return NULL; |
380 | * functions holding the lock time to release the page. Later we | ||
381 | * use lock_page() to have a higher chance of acquiring the | ||
382 | * lock. | ||
383 | */ | ||
384 | rc = -EAGAIN; | ||
385 | if (pass > 2) | ||
386 | lock_page(page); | ||
387 | else | ||
388 | if (TestSetPageLocked(page)) | ||
389 | goto next; | ||
390 | 741 | ||
391 | /* | 742 | *result = &pm->status; |
392 | * Only wait on writeback if we have already done a pass where | ||
393 | * we may have triggered writeouts for lots of pages. | ||
394 | */ | ||
395 | if (pass > 0) { | ||
396 | wait_on_page_writeback(page); | ||
397 | } else { | ||
398 | if (PageWriteback(page)) | ||
399 | goto unlock_page; | ||
400 | } | ||
401 | 743 | ||
402 | /* | 744 | return alloc_pages_node(pm->node, GFP_HIGHUSER, 0); |
403 | * Anonymous pages must have swap cache references otherwise | 745 | } |
404 | * the information contained in the page maps cannot be | ||
405 | * preserved. | ||
406 | */ | ||
407 | if (PageAnon(page) && !PageSwapCache(page)) { | ||
408 | if (!add_to_swap(page, GFP_KERNEL)) { | ||
409 | rc = -ENOMEM; | ||
410 | goto unlock_page; | ||
411 | } | ||
412 | } | ||
413 | 746 | ||
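The page_to_node array that new_page_node() walks is built by the caller with one entry per page and a terminating entry whose node is MAX_NUMNODES, as sys_move_pages() does further down. A small sketch of that convention (addresses invented for illustration):

	struct page_to_node pm[3];

	pm[0].addr = 0x400000;	pm[0].node = 1;	/* move this page to node 1 */
	pm[1].addr = 0x401000;	pm[1].node = 0;	/* and this one to node 0 */
	pm[2].node = MAX_NUMNODES;		/* end marker, no page */

	/* do_move_pages() below fills in .page, isolates each page and
	 * records the per-page outcome in .status */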
414 | if (!to) { | 747 | /* |
415 | rc = swap_page(page); | 748 | * Move a set of pages as indicated in the pm array. The addr |
416 | goto next; | 749 | * field must be set to the virtual address of the page to be moved |
417 | } | 750 | * and the node number must contain a valid target node. |
751 | */ | ||
752 | static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, | ||
753 | int migrate_all) | ||
754 | { | ||
755 | int err; | ||
756 | struct page_to_node *pp; | ||
757 | LIST_HEAD(pagelist); | ||
758 | |||
759 | down_read(&mm->mmap_sem); | ||
418 | 760 | ||
419 | newpage = lru_to_page(to); | 761 | /* |
420 | lock_page(newpage); | 762 | * Build a list of pages to migrate |
763 | */ | ||
764 | migrate_prep(); | ||
765 | for (pp = pm; pp->node != MAX_NUMNODES; pp++) { | ||
766 | struct vm_area_struct *vma; | ||
767 | struct page *page; | ||
421 | 768 | ||
422 | /* | 769 | /* |
423 | * Pages are properly locked and writeback is complete. | 770 | * A valid page pointer that will not match any of the |
424 | * Try to migrate the page. | 771 | * pages that will be moved. |
425 | */ | 772 | */ |
426 | mapping = page_mapping(page); | 773 | pp->page = ZERO_PAGE(0); |
427 | if (!mapping) | ||
428 | goto unlock_both; | ||
429 | 774 | ||
430 | if (mapping->a_ops->migratepage) { | 775 | err = -EFAULT; |
431 | /* | 776 | vma = find_vma(mm, pp->addr); |
432 | * Most pages have a mapping and most filesystems | 777 | if (!vma) |
433 | * should provide a migration function. Anonymous | 778 | goto set_status; |
434 | * pages are part of swap space which also has its | ||
435 | * own migration function. This is the most common | ||
436 | * path for page migration. | ||
437 | */ | ||
438 | rc = mapping->a_ops->migratepage(newpage, page); | ||
439 | goto unlock_both; | ||
440 | } | ||
441 | |||
442 | /* Make sure the dirty bit is up to date */ | ||
443 | if (try_to_unmap(page, 1) == SWAP_FAIL) { | ||
444 | rc = -EPERM; | ||
445 | goto unlock_both; | ||
446 | } | ||
447 | 779 | ||
448 | if (page_mapcount(page)) { | 780 | page = follow_page(vma, pp->addr, FOLL_GET); |
449 | rc = -EAGAIN; | 781 | err = -ENOENT; |
450 | goto unlock_both; | 782 | if (!page) |
451 | } | 783 | goto set_status; |
452 | 784 | ||
453 | /* | 785 | if (PageReserved(page)) /* Check for zero page */ |
454 | * Default handling if a filesystem does not provide | 786 | goto put_and_set; |
455 | * a migration function. We can only migrate clean | ||
456 | * pages so try to write out any dirty pages first. | ||
457 | */ | ||
458 | if (PageDirty(page)) { | ||
459 | switch (pageout(page, mapping)) { | ||
460 | case PAGE_KEEP: | ||
461 | case PAGE_ACTIVATE: | ||
462 | goto unlock_both; | ||
463 | |||
464 | case PAGE_SUCCESS: | ||
465 | unlock_page(newpage); | ||
466 | goto next; | ||
467 | |||
468 | case PAGE_CLEAN: | ||
469 | ; /* try to migrate the page below */ | ||
470 | } | ||
471 | } | ||
472 | 787 | ||
473 | /* | 788 | pp->page = page; |
474 | * Buffers are managed in a filesystem specific way. | 789 | err = page_to_nid(page); |
475 | * We must have no buffers or drop them. | ||
476 | */ | ||
477 | if (!page_has_buffers(page) || | ||
478 | try_to_release_page(page, GFP_KERNEL)) { | ||
479 | rc = migrate_page(newpage, page); | ||
480 | goto unlock_both; | ||
481 | } | ||
482 | 790 | ||
483 | /* | 791 | if (err == pp->node) |
484 | * On early passes with mapped pages simply | ||
485 | * retry. There may be a lock held for some | ||
486 | * buffers that may go away. Later | ||
487 | * swap them out. | ||
488 | */ | ||
489 | if (pass > 4) { | ||
490 | /* | 792 | /* |
491 | * Persistently unable to drop buffers..... As a | 793 | * Node already in the right place |
492 | * measure of last resort we fall back to | ||
493 | * swap_page(). | ||
494 | */ | 794 | */ |
495 | unlock_page(newpage); | 795 | goto put_and_set; |
496 | newpage = NULL; | ||
497 | rc = swap_page(page); | ||
498 | goto next; | ||
499 | } | ||
500 | 796 | ||
501 | unlock_both: | 797 | err = -EACCES; |
502 | unlock_page(newpage); | 798 | if (page_mapcount(page) > 1 && |
503 | 799 | !migrate_all) | |
504 | unlock_page: | 800 | goto put_and_set; |
505 | unlock_page(page); | 801 | |
506 | 802 | err = isolate_lru_page(page, &pagelist); | |
507 | next: | 803 | put_and_set: |
508 | if (rc == -EAGAIN) { | 804 | /* |
509 | retry++; | 805 | * Either remove the duplicate refcount from |
510 | } else if (rc) { | 806 | * isolate_lru_page() or drop the page ref if it was |
511 | /* Permanent failure */ | 807 | * not isolated. |
512 | list_move(&page->lru, failed); | 808 | */ |
513 | nr_failed++; | 809 | put_page(page); |
514 | } else { | 810 | set_status: |
515 | if (newpage) { | 811 | pp->status = err; |
516 | /* Successful migration. Return page to LRU */ | ||
517 | move_to_lru(newpage); | ||
518 | } | ||
519 | list_move(&page->lru, moved); | ||
520 | } | ||
521 | } | 812 | } |
522 | if (retry && pass++ < 10) | ||
523 | goto redo; | ||
524 | 813 | ||
525 | if (!swapwrite) | 814 | if (!list_empty(&pagelist)) |
526 | current->flags &= ~PF_SWAPWRITE; | 815 | err = migrate_pages(&pagelist, new_page_node, |
816 | (unsigned long)pm); | ||
817 | else | ||
818 | err = -ENOENT; | ||
527 | 819 | ||
528 | return nr_failed + retry; | 820 | up_read(&mm->mmap_sem); |
821 | return err; | ||
529 | } | 822 | } |
530 | 823 | ||
531 | /* | 824 | /* |
532 | * Migration function for pages with buffers. This function can only be used | 825 | * Determine the nodes of a list of pages. The addr in the pm array |
533 | * if the underlying filesystem guarantees that no other references to "page" | 826 | * must have been set to the virtual address of which we want to determine |
534 | * exist. | 827 | * the node number. |
535 | */ | 828 | */ |
536 | int buffer_migrate_page(struct page *newpage, struct page *page) | 829 | static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) |
537 | { | 830 | { |
538 | struct address_space *mapping = page->mapping; | 831 | down_read(&mm->mmap_sem); |
539 | struct buffer_head *bh, *head; | 832 | |
540 | int rc; | 833 | for ( ; pm->node != MAX_NUMNODES; pm++) { |
834 | struct vm_area_struct *vma; | ||
835 | struct page *page; | ||
836 | int err; | ||
837 | |||
838 | err = -EFAULT; | ||
839 | vma = find_vma(mm, pm->addr); | ||
840 | if (!vma) | ||
841 | goto set_status; | ||
842 | |||
843 | page = follow_page(vma, pm->addr, 0); | ||
844 | err = -ENOENT; | ||
845 | /* Use PageReserved to check for zero page */ | ||
846 | if (!page || PageReserved(page)) | ||
847 | goto set_status; | ||
848 | |||
849 | err = page_to_nid(page); | ||
850 | set_status: | ||
851 | pm->status = err; | ||
852 | } | ||
541 | 853 | ||
542 | if (!mapping) | 854 | up_read(&mm->mmap_sem); |
543 | return -EAGAIN; | 855 | return 0; |
856 | } | ||
544 | 857 | ||
545 | if (!page_has_buffers(page)) | 858 | /* |
546 | return migrate_page(newpage, page); | 859 | * Move a list of pages in the address space of the currently executing |
860 | * process. | ||
861 | */ | ||
862 | asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, | ||
863 | const void __user * __user *pages, | ||
864 | const int __user *nodes, | ||
865 | int __user *status, int flags) | ||
866 | { | ||
867 | int err = 0; | ||
868 | int i; | ||
869 | struct task_struct *task; | ||
870 | nodemask_t task_nodes; | ||
871 | struct mm_struct *mm; | ||
872 | struct page_to_node *pm = NULL; | ||
547 | 873 | ||
548 | head = page_buffers(page); | 874 | /* Check flags */ |
875 | if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) | ||
876 | return -EINVAL; | ||
549 | 877 | ||
550 | rc = migrate_page_remove_references(newpage, page, 3); | 878 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) |
879 | return -EPERM; | ||
551 | 880 | ||
552 | if (rc) | 881 | /* Find the mm_struct */ |
553 | return rc; | 882 | read_lock(&tasklist_lock); |
883 | task = pid ? find_task_by_pid(pid) : current; | ||
884 | if (!task) { | ||
885 | read_unlock(&tasklist_lock); | ||
886 | return -ESRCH; | ||
887 | } | ||
888 | mm = get_task_mm(task); | ||
889 | read_unlock(&tasklist_lock); | ||
554 | 890 | ||
555 | bh = head; | 891 | if (!mm) |
556 | do { | 892 | return -EINVAL; |
557 | get_bh(bh); | ||
558 | lock_buffer(bh); | ||
559 | bh = bh->b_this_page; | ||
560 | 893 | ||
561 | } while (bh != head); | 894 | /* |
895 | * Check if this process has the right to modify the specified | ||
896 | * process. The right exists if the process has administrative | ||
897 | * capabilities, superuser privileges or the same | ||
898 | * userid as the target process. | ||
899 | */ | ||
900 | if ((current->euid != task->suid) && (current->euid != task->uid) && | ||
901 | (current->uid != task->suid) && (current->uid != task->uid) && | ||
902 | !capable(CAP_SYS_NICE)) { | ||
903 | err = -EPERM; | ||
904 | goto out2; | ||
905 | } | ||
562 | 906 | ||
563 | ClearPagePrivate(page); | 907 | err = security_task_movememory(task); |
564 | set_page_private(newpage, page_private(page)); | 908 | if (err) |
565 | set_page_private(page, 0); | 909 | goto out2; |
566 | put_page(page); | ||
567 | get_page(newpage); | ||
568 | 910 | ||
569 | bh = head; | ||
570 | do { | ||
571 | set_bh_page(bh, newpage, bh_offset(bh)); | ||
572 | bh = bh->b_this_page; | ||
573 | 911 | ||
574 | } while (bh != head); | 912 | task_nodes = cpuset_mems_allowed(task); |
575 | 913 | ||
576 | SetPagePrivate(newpage); | 914 | /* Limit nr_pages so that the multiplication may not overflow */ |
915 | if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { | ||
916 | err = -E2BIG; | ||
917 | goto out2; | ||
918 | } | ||
577 | 919 | ||
578 | migrate_page_copy(newpage, page); | 920 | pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); |
921 | if (!pm) { | ||
922 | err = -ENOMEM; | ||
923 | goto out2; | ||
924 | } | ||
579 | 925 | ||
580 | bh = head; | 926 | /* |
581 | do { | 927 | * Get parameters from user space and initialize the pm |
582 | unlock_buffer(bh); | 928 | * array. Return various errors if the user did something wrong. |
583 | put_bh(bh); | 929 | */ |
584 | bh = bh->b_this_page; | 930 | for (i = 0; i < nr_pages; i++) { |
931 | const void *p; | ||
585 | 932 | ||
586 | } while (bh != head); | 933 | err = -EFAULT; |
934 | if (get_user(p, pages + i)) | ||
935 | goto out; | ||
587 | 936 | ||
588 | return 0; | 937 | pm[i].addr = (unsigned long)p; |
589 | } | 938 | if (nodes) { |
590 | EXPORT_SYMBOL(buffer_migrate_page); | 939 | int node; |
591 | 940 | ||
592 | /* | 941 | if (get_user(node, nodes + i)) |
593 | * Migrate the list 'pagelist' of pages to a certain destination. | 942 | goto out; |
594 | * | ||
595 | * Specify destination with either non-NULL vma or dest_node >= 0 | ||
596 | * Return the number of pages not migrated or error code | ||
597 | */ | ||
598 | int migrate_pages_to(struct list_head *pagelist, | ||
599 | struct vm_area_struct *vma, int dest) | ||
600 | { | ||
601 | LIST_HEAD(newlist); | ||
602 | LIST_HEAD(moved); | ||
603 | LIST_HEAD(failed); | ||
604 | int err = 0; | ||
605 | unsigned long offset = 0; | ||
606 | int nr_pages; | ||
607 | struct page *page; | ||
608 | struct list_head *p; | ||
609 | 943 | ||
610 | redo: | 944 | err = -ENODEV; |
611 | nr_pages = 0; | 945 | if (!node_online(node)) |
612 | list_for_each(p, pagelist) { | 946 | goto out; |
613 | if (vma) { | ||
614 | /* | ||
615 | * The address passed to alloc_page_vma is used to | ||
616 | * generate the proper interleave behavior. We fake | ||
617 | * the address here by an increasing offset in order | ||
618 | * to get the proper distribution of pages. | ||
619 | * | ||
620 | * No decision has been made as to which page | ||
621 | * a certain old page is moved to so we cannot | ||
622 | * specify the correct address. | ||
623 | */ | ||
624 | page = alloc_page_vma(GFP_HIGHUSER, vma, | ||
625 | offset + vma->vm_start); | ||
626 | offset += PAGE_SIZE; | ||
627 | } | ||
628 | else | ||
629 | page = alloc_pages_node(dest, GFP_HIGHUSER, 0); | ||
630 | 947 | ||
631 | if (!page) { | 948 | err = -EACCES; |
632 | err = -ENOMEM; | 949 | if (!node_isset(node, task_nodes)) |
633 | goto out; | 950 | goto out; |
951 | |||
952 | pm[i].node = node; | ||
634 | } | 953 | } |
635 | list_add_tail(&page->lru, &newlist); | ||
636 | nr_pages++; | ||
637 | if (nr_pages > MIGRATE_CHUNK_SIZE) | ||
638 | break; | ||
639 | } | 954 | } |
640 | err = migrate_pages(pagelist, &newlist, &moved, &failed); | 955 | /* End marker */ |
956 | pm[nr_pages].node = MAX_NUMNODES; | ||
957 | |||
958 | if (nodes) | ||
959 | err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL); | ||
960 | else | ||
961 | err = do_pages_stat(mm, pm); | ||
641 | 962 | ||
642 | putback_lru_pages(&moved); /* Call release pages instead ?? */ | 963 | if (err >= 0) |
964 | /* Return status information */ | ||
965 | for (i = 0; i < nr_pages; i++) | ||
966 | if (put_user(pm[i].status, status + i)) | ||
967 | err = -EFAULT; | ||
643 | 968 | ||
644 | if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) | ||
645 | goto redo; | ||
646 | out: | 969 | out: |
647 | /* Return leftover allocated pages */ | 970 | vfree(pm); |
648 | while (!list_empty(&newlist)) { | 971 | out2: |
649 | page = list_entry(newlist.next, struct page, lru); | 972 | mmput(mm); |
650 | list_del(&page->lru); | 973 | return err; |
651 | __free_page(page); | 974 | } |
652 | } | 975 | #endif |
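From user space the new syscall can be exercised directly through syscall(2); a hedged sketch, assuming the architecture already wires up __NR_move_pages and that a header defining MPOL_MF_MOVE (for example numaif.h from libnuma) is available:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <numaif.h>		/* MPOL_MF_MOVE */

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *buf = malloc(2 * psz);
	void *pages[2] = { buf, buf + psz };
	int nodes[2] = { 1, 1 };	/* requested target nodes */
	int status[2] = { -1, -1 };

	buf[0] = buf[psz] = 1;		/* make sure the pages exist */

	/* pid 0 selects the calling process, as in sys_move_pages() above */
	if (syscall(__NR_move_pages, 0, 2UL, pages, nodes, status,
		    MPOL_MF_MOVE) < 0)
		perror("move_pages");

	/* status[i] ends up holding the node each page landed on, or a
	 * negative error; passing nodes == NULL instead only queries
	 * placement via do_pages_stat() */
	printf("page 0: %d, page 1: %d\n", status[0], status[1]);
	return 0;
}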
653 | list_splice(&failed, pagelist); | 976 | |
654 | if (err < 0) | 977 | /* |
655 | return err; | 978 | * Call migration functions in the vma_ops that may prepare |
656 | 979 | * memory in a vm for migration. migration functions may perform | |
657 | /* Calculate number of leftover pages */ | 980 | * the migration for vmas that do not have an underlying page struct. |
658 | nr_pages = 0; | 981 | */ |
659 | list_for_each(p, pagelist) | 982 | int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, |
660 | nr_pages++; | 983 | const nodemask_t *from, unsigned long flags) |
661 | return nr_pages; | 984 | { |
985 | struct vm_area_struct *vma; | ||
986 | int err = 0; | ||
987 | |||
988 | for(vma = mm->mmap; vma->vm_next && !err; vma = vma->vm_next) { | ||
989 | if (vma->vm_ops && vma->vm_ops->migrate) { | ||
990 | err = vma->vm_ops->migrate(vma, to, from, flags); | ||
991 | if (err) | ||
992 | break; | ||
993 | } | ||
994 | } | ||
995 | return err; | ||
662 | } | 996 | } |
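migrate_vmas() lets a driver whose mappings have no backing page structs take part in migration through a vm_ops hook. A sketch of what registering such a hook might look like, with the signature taken from the call above (the example names and the empty body are invented):

static int example_vma_migrate(struct vm_area_struct *vma,
			       const nodemask_t *to, const nodemask_t *from,
			       unsigned long flags)
{
	/* driver-specific work: rebind or re-register the underlying
	 * memory so that it is served from the nodes in "to" */
	return 0;
}

static struct vm_operations_struct example_vm_ops = {
	.migrate = example_vma_migrate,
	/* .open/.close/.nopage as the driver already defines them */
};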
diff --git a/mm/mmap.c b/mm/mmap.c --- a/mm/mmap.c +++ b/mm/mmap.c | |||
@@ -96,7 +96,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) | |||
96 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 96 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { |
97 | unsigned long n; | 97 | unsigned long n; |
98 | 98 | ||
99 | free = get_page_cache_size(); | 99 | free = global_page_state(NR_FILE_PAGES); |
100 | free += nr_swap_pages; | 100 | free += nr_swap_pages; |
101 | 101 | ||
102 | /* | 102 | /* |
@@ -1065,7 +1065,8 @@ munmap_back: | |||
1065 | vma->vm_start = addr; | 1065 | vma->vm_start = addr; |
1066 | vma->vm_end = addr + len; | 1066 | vma->vm_end = addr + len; |
1067 | vma->vm_flags = vm_flags; | 1067 | vma->vm_flags = vm_flags; |
1068 | vma->vm_page_prot = protection_map[vm_flags & 0x0f]; | 1068 | vma->vm_page_prot = protection_map[vm_flags & |
1069 | (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; | ||
1069 | vma->vm_pgoff = pgoff; | 1070 | vma->vm_pgoff = pgoff; |
1070 | 1071 | ||
1071 | if (file) { | 1072 | if (file) { |
@@ -1089,6 +1090,12 @@ munmap_back: | |||
1089 | goto free_vma; | 1090 | goto free_vma; |
1090 | } | 1091 | } |
1091 | 1092 | ||
1093 | /* Don't make the VMA automatically writable if it's shared, but the | ||
1094 | * backer wishes to know when pages are first written to */ | ||
1095 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) | ||
1096 | vma->vm_page_prot = | ||
1097 | protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)]; | ||
1098 | |||
1092 | /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform | 1099 | /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform |
1093 | * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) | 1100 | * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) |
1094 | * that memory reservation must be checked; but that reservation | 1101 | * that memory reservation must be checked; but that reservation |
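Both protection_map changes in this file only matter to mappings whose backer defines page_mkwrite: leaving VM_SHARED out of the lookup keeps the default pte protection read-only, so the first store faults and the backer gets notified. A sketch of the hook a filesystem might register, assuming a page_mkwrite(vma, page) prototype (names and body are illustrative, not from this patch):

static int example_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
	/* runs on the first write fault to a page of this shared mapping;
	 * a filesystem would typically reserve blocks or set up dirty
	 * accounting for the page here */
	return 0;	/* a non-negative return lets the write proceed */
}

static struct vm_operations_struct example_file_vm_ops = {
	.nopage		= filemap_nopage,
	.page_mkwrite	= example_page_mkwrite,
};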
@@ -1921,7 +1928,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) | |||
1921 | vma->vm_end = addr + len; | 1928 | vma->vm_end = addr + len; |
1922 | vma->vm_pgoff = pgoff; | 1929 | vma->vm_pgoff = pgoff; |
1923 | vma->vm_flags = flags; | 1930 | vma->vm_flags = flags; |
1924 | vma->vm_page_prot = protection_map[flags & 0x0f]; | 1931 | vma->vm_page_prot = protection_map[flags & |
1932 | (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]; | ||
1925 | vma_link(mm, vma, prev, rb_link, rb_parent); | 1933 | vma_link(mm, vma, prev, rb_link, rb_parent); |
1926 | out: | 1934 | out: |
1927 | mm->total_vm += len >> PAGE_SHIFT; | 1935 | mm->total_vm += len >> PAGE_SHIFT; |
diff --git a/mm/mmzone.c b/mm/mmzone.c index b022370e612e..0959ee1a4795 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c | |||
@@ -5,7 +5,6 @@ | |||
5 | */ | 5 | */ |
6 | 6 | ||
7 | 7 | ||
8 | #include <linux/config.h> | ||
9 | #include <linux/stddef.h> | 8 | #include <linux/stddef.h> |
10 | #include <linux/mmzone.h> | 9 | #include <linux/mmzone.h> |
11 | #include <linux/module.h> | 10 | #include <linux/module.h> |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 4c14d4289b61..638edabaff71 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -19,7 +19,8 @@ | |||
19 | #include <linux/mempolicy.h> | 19 | #include <linux/mempolicy.h> |
20 | #include <linux/personality.h> | 20 | #include <linux/personality.h> |
21 | #include <linux/syscalls.h> | 21 | #include <linux/syscalls.h> |
22 | 22 | #include <linux/swap.h> | |
23 | #include <linux/swapops.h> | ||
23 | #include <asm/uaccess.h> | 24 | #include <asm/uaccess.h> |
24 | #include <asm/pgtable.h> | 25 | #include <asm/pgtable.h> |
25 | #include <asm/cacheflush.h> | 26 | #include <asm/cacheflush.h> |
@@ -28,12 +29,13 @@ | |||
28 | static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | 29 | static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, |
29 | unsigned long addr, unsigned long end, pgprot_t newprot) | 30 | unsigned long addr, unsigned long end, pgprot_t newprot) |
30 | { | 31 | { |
31 | pte_t *pte; | 32 | pte_t *pte, oldpte; |
32 | spinlock_t *ptl; | 33 | spinlock_t *ptl; |
33 | 34 | ||
34 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); | 35 | pte = pte_offset_map_lock(mm, pmd, addr, &ptl); |
35 | do { | 36 | do { |
36 | if (pte_present(*pte)) { | 37 | oldpte = *pte; |
38 | if (pte_present(oldpte)) { | ||
37 | pte_t ptent; | 39 | pte_t ptent; |
38 | 40 | ||
39 | /* Avoid an SMP race with hardware updated dirty/clean | 41 | /* Avoid an SMP race with hardware updated dirty/clean |
@@ -43,7 +45,22 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, | |||
43 | ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); | 45 | ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); |
44 | set_pte_at(mm, addr, pte, ptent); | 46 | set_pte_at(mm, addr, pte, ptent); |
45 | lazy_mmu_prot_update(ptent); | 47 | lazy_mmu_prot_update(ptent); |
48 | #ifdef CONFIG_MIGRATION | ||
49 | } else if (!pte_file(oldpte)) { | ||
50 | swp_entry_t entry = pte_to_swp_entry(oldpte); | ||
51 | |||
52 | if (is_write_migration_entry(entry)) { | ||
53 | /* | ||
54 | * A protection check is difficult so | ||
55 | * just be safe and disable write | ||
56 | */ | ||
57 | make_migration_entry_read(&entry); | ||
58 | set_pte_at(mm, addr, pte, | ||
59 | swp_entry_to_pte(entry)); | ||
60 | } | ||
61 | #endif | ||
46 | } | 62 | } |
63 | |||
47 | } while (pte++, addr += PAGE_SIZE, addr != end); | 64 | } while (pte++, addr += PAGE_SIZE, addr != end); |
48 | pte_unmap_unlock(pte - 1, ptl); | 65 | pte_unmap_unlock(pte - 1, ptl); |
49 | } | 66 | } |
@@ -106,6 +123,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
106 | unsigned long oldflags = vma->vm_flags; | 123 | unsigned long oldflags = vma->vm_flags; |
107 | long nrpages = (end - start) >> PAGE_SHIFT; | 124 | long nrpages = (end - start) >> PAGE_SHIFT; |
108 | unsigned long charged = 0; | 125 | unsigned long charged = 0; |
126 | unsigned int mask; | ||
109 | pgprot_t newprot; | 127 | pgprot_t newprot; |
110 | pgoff_t pgoff; | 128 | pgoff_t pgoff; |
111 | int error; | 129 | int error; |
@@ -132,8 +150,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
132 | } | 150 | } |
133 | } | 151 | } |
134 | 152 | ||
135 | newprot = protection_map[newflags & 0xf]; | ||
136 | |||
137 | /* | 153 | /* |
138 | * First try to merge with previous and/or next vma. | 154 | * First try to merge with previous and/or next vma. |
139 | */ | 155 | */ |
@@ -160,6 +176,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, | |||
160 | } | 176 | } |
161 | 177 | ||
162 | success: | 178 | success: |
179 | /* Don't make the VMA automatically writable if it's shared, but the | ||
180 | * backer wishes to know when pages are first written to */ | ||
181 | mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED; | ||
182 | if (vma->vm_ops && vma->vm_ops->page_mkwrite) | ||
183 | mask &= ~VM_SHARED; | ||
184 | |||
185 | newprot = protection_map[newflags & mask]; | ||
186 | |||
163 | /* | 187 | /* |
164 | * vm_flags and vm_page_prot are protected by the mmap_sem | 188 | * vm_flags and vm_page_prot are protected by the mmap_sem |
165 | * held in write mode. | 189 | * held in write mode. |
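To make the masking concrete (flag values as defined in mm.h at the time: VM_READ = 0x1, VM_WRITE = 0x2, VM_EXEC = 0x4, VM_SHARED = 0x8): a shared read/write mapping indexes protection_map[0xb] when there is no page_mkwrite handler, but only protection_map[0x3] when there is one. That is the private read/write entry, which on most architectures is not hardware-writable, so the first store faults and the handler can run before the pte is made writable.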
@@ -205,8 +229,7 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot) | |||
205 | /* | 229 | /* |
206 | * Does the application expect PROT_READ to imply PROT_EXEC: | 230 | * Does the application expect PROT_READ to imply PROT_EXEC: |
207 | */ | 231 | */ |
208 | if (unlikely((prot & PROT_READ) && | 232 | if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC)) |
209 | (current->personality & READ_IMPLIES_EXEC))) | ||
210 | prot |= PROT_EXEC; | 233 | prot |= PROT_EXEC; |
211 | 234 | ||
212 | vm_flags = calc_vm_prot_bits(prot); | 235 | vm_flags = calc_vm_prot_bits(prot); |
diff --git a/mm/msync.c b/mm/msync.c index bc6c95376366..d083544df21b 100644 --- a/mm/msync.c +++ b/mm/msync.c | |||
@@ -170,8 +170,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | |||
170 | * just ignore them, but return -ENOMEM at the end. | 170 | * just ignore them, but return -ENOMEM at the end. |
171 | */ | 171 | */ |
172 | down_read(¤t->mm->mmap_sem); | 172 | down_read(¤t->mm->mmap_sem); |
173 | if (flags & MS_SYNC) | ||
174 | current->flags |= PF_SYNCWRITE; | ||
175 | vma = find_vma(current->mm, start); | 173 | vma = find_vma(current->mm, start); |
176 | if (!vma) { | 174 | if (!vma) { |
177 | error = -ENOMEM; | 175 | error = -ENOMEM; |
@@ -228,7 +226,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) | |||
228 | } | 226 | } |
229 | } while (vma && !done); | 227 | } while (vma && !done); |
230 | out_unlock: | 228 | out_unlock: |
231 | current->flags &= ~PF_SYNCWRITE; | ||
232 | up_read(¤t->mm->mmap_sem); | 229 | up_read(¤t->mm->mmap_sem); |
233 | out: | 230 | out: |
234 | return error; | 231 | return error; |
diff --git a/mm/nommu.c b/mm/nommu.c index 029fadac0fb5..5151c44a8257 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -1122,7 +1122,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin) | |||
1122 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { | 1122 | if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { |
1123 | unsigned long n; | 1123 | unsigned long n; |
1124 | 1124 | ||
1125 | free = get_page_cache_size(); | 1125 | free = global_page_state(NR_FILE_PAGES); |
1126 | free += nr_swap_pages; | 1126 | free += nr_swap_pages; |
1127 | 1127 | ||
1128 | /* | 1128 | /* |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 042e6436c3ee..d46ed0f1dc06 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -22,10 +22,11 @@ | |||
22 | #include <linux/jiffies.h> | 22 | #include <linux/jiffies.h> |
23 | #include <linux/cpuset.h> | 23 | #include <linux/cpuset.h> |
24 | 24 | ||
25 | int sysctl_panic_on_oom; | ||
25 | /* #define DEBUG */ | 26 | /* #define DEBUG */ |
26 | 27 | ||
27 | /** | 28 | /** |
28 | * oom_badness - calculate a numeric value for how bad this task has been | 29 | * badness - calculate a numeric value for how bad this task has been |
29 | * @p: task struct of which task we should calculate | 30 | * @p: task struct of which task we should calculate |
30 | * @uptime: current uptime in seconds | 31 | * @uptime: current uptime in seconds |
31 | * | 32 | * |
@@ -200,7 +201,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) | |||
200 | continue; | 201 | continue; |
201 | 202 | ||
202 | /* | 203 | /* |
203 | * This is in the process of releasing memory so for wait it | 204 | * This is in the process of releasing memory so wait for it |
204 | * to finish before killing some other task by mistake. | 205 | * to finish before killing some other task by mistake. |
205 | */ | 206 | */ |
206 | releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || | 207 | releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || |
@@ -306,7 +307,7 @@ static int oom_kill_process(struct task_struct *p, unsigned long points, | |||
306 | } | 307 | } |
307 | 308 | ||
308 | /** | 309 | /** |
309 | * oom_kill - kill the "best" process when we run out of memory | 310 | * out_of_memory - kill the "best" process when we run out of memory |
310 | * | 311 | * |
311 | * If we run out of memory, we have the choice between either | 312 | * If we run out of memory, we have the choice between either |
312 | * killing a random task (bad), letting the system crash (worse) | 313 | * killing a random task (bad), letting the system crash (worse) |
@@ -344,6 +345,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) | |||
344 | break; | 345 | break; |
345 | 346 | ||
346 | case CONSTRAINT_NONE: | 347 | case CONSTRAINT_NONE: |
348 | if (sysctl_panic_on_oom) | ||
349 | panic("out of memory. panic_on_oom is selected\n"); | ||
347 | retry: | 350 | retry: |
348 | /* | 351 | /* |
349 | * Rambo mode: Shoot down a process and hope it solves whatever | 352 | * Rambo mode: Shoot down a process and hope it solves whatever |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 75d7f48b79bb..e630188ccc40 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -99,22 +99,6 @@ EXPORT_SYMBOL(laptop_mode); | |||
99 | 99 | ||
100 | static void background_writeout(unsigned long _min_pages); | 100 | static void background_writeout(unsigned long _min_pages); |
101 | 101 | ||
102 | struct writeback_state | ||
103 | { | ||
104 | unsigned long nr_dirty; | ||
105 | unsigned long nr_unstable; | ||
106 | unsigned long nr_mapped; | ||
107 | unsigned long nr_writeback; | ||
108 | }; | ||
109 | |||
110 | static void get_writeback_state(struct writeback_state *wbs) | ||
111 | { | ||
112 | wbs->nr_dirty = read_page_state(nr_dirty); | ||
113 | wbs->nr_unstable = read_page_state(nr_unstable); | ||
114 | wbs->nr_mapped = read_page_state(nr_mapped); | ||
115 | wbs->nr_writeback = read_page_state(nr_writeback); | ||
116 | } | ||
117 | |||
118 | /* | 102 | /* |
119 | * Work out the current dirty-memory clamping and background writeout | 103 | * Work out the current dirty-memory clamping and background writeout |
120 | * thresholds. | 104 | * thresholds. |
@@ -133,8 +117,8 @@ static void get_writeback_state(struct writeback_state *wbs) | |||
133 | * clamping level. | 117 | * clamping level. |
134 | */ | 118 | */ |
135 | static void | 119 | static void |
136 | get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, | 120 | get_dirty_limits(long *pbackground, long *pdirty, |
137 | struct address_space *mapping) | 121 | struct address_space *mapping) |
138 | { | 122 | { |
139 | int background_ratio; /* Percentages */ | 123 | int background_ratio; /* Percentages */ |
140 | int dirty_ratio; | 124 | int dirty_ratio; |
@@ -144,8 +128,6 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, | |||
144 | unsigned long available_memory = total_pages; | 128 | unsigned long available_memory = total_pages; |
145 | struct task_struct *tsk; | 129 | struct task_struct *tsk; |
146 | 130 | ||
147 | get_writeback_state(wbs); | ||
148 | |||
149 | #ifdef CONFIG_HIGHMEM | 131 | #ifdef CONFIG_HIGHMEM |
150 | /* | 132 | /* |
151 | * If this mapping can only allocate from low memory, | 133 | * If this mapping can only allocate from low memory, |
@@ -156,7 +138,9 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, | |||
156 | #endif | 138 | #endif |
157 | 139 | ||
158 | 140 | ||
159 | unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages; | 141 | unmapped_ratio = 100 - ((global_page_state(NR_FILE_MAPPED) + |
142 | global_page_state(NR_ANON_PAGES)) * 100) / | ||
143 | total_pages; | ||
160 | 144 | ||
161 | dirty_ratio = vm_dirty_ratio; | 145 | dirty_ratio = vm_dirty_ratio; |
162 | if (dirty_ratio > unmapped_ratio / 2) | 146 | if (dirty_ratio > unmapped_ratio / 2) |
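A quick worked example of the new expression (numbers invented): with total_pages = 1,000,000 and NR_FILE_MAPPED + NR_ANON_PAGES summing to 600,000, unmapped_ratio is 100 - 60 = 40, so the clamp that follows caps the effective dirty_ratio at 40 / 2 = 20 even if vm_dirty_ratio is configured higher.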
@@ -189,7 +173,6 @@ get_dirty_limits(struct writeback_state *wbs, long *pbackground, long *pdirty, | |||
189 | */ | 173 | */ |
190 | static void balance_dirty_pages(struct address_space *mapping) | 174 | static void balance_dirty_pages(struct address_space *mapping) |
191 | { | 175 | { |
192 | struct writeback_state wbs; | ||
193 | long nr_reclaimable; | 176 | long nr_reclaimable; |
194 | long background_thresh; | 177 | long background_thresh; |
195 | long dirty_thresh; | 178 | long dirty_thresh; |
@@ -204,13 +187,15 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
204 | .sync_mode = WB_SYNC_NONE, | 187 | .sync_mode = WB_SYNC_NONE, |
205 | .older_than_this = NULL, | 188 | .older_than_this = NULL, |
206 | .nr_to_write = write_chunk, | 189 | .nr_to_write = write_chunk, |
190 | .range_cyclic = 1, | ||
207 | }; | 191 | }; |
208 | 192 | ||
209 | get_dirty_limits(&wbs, &background_thresh, | 193 | get_dirty_limits(&background_thresh, &dirty_thresh, mapping); |
210 | &dirty_thresh, mapping); | 194 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + |
211 | nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; | 195 | global_page_state(NR_UNSTABLE_NFS); |
212 | if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) | 196 | if (nr_reclaimable + global_page_state(NR_WRITEBACK) <= |
213 | break; | 197 | dirty_thresh) |
198 | break; | ||
214 | 199 | ||
215 | if (!dirty_exceeded) | 200 | if (!dirty_exceeded) |
216 | dirty_exceeded = 1; | 201 | dirty_exceeded = 1; |
@@ -223,11 +208,14 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
223 | */ | 208 | */ |
224 | if (nr_reclaimable) { | 209 | if (nr_reclaimable) { |
225 | writeback_inodes(&wbc); | 210 | writeback_inodes(&wbc); |
226 | get_dirty_limits(&wbs, &background_thresh, | 211 | get_dirty_limits(&background_thresh, |
227 | &dirty_thresh, mapping); | 212 | &dirty_thresh, mapping); |
228 | nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable; | 213 | nr_reclaimable = global_page_state(NR_FILE_DIRTY) + |
229 | if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) | 214 | global_page_state(NR_UNSTABLE_NFS); |
230 | break; | 215 | if (nr_reclaimable + |
216 | global_page_state(NR_WRITEBACK) | ||
217 | <= dirty_thresh) | ||
218 | break; | ||
231 | pages_written += write_chunk - wbc.nr_to_write; | 219 | pages_written += write_chunk - wbc.nr_to_write; |
232 | if (pages_written >= write_chunk) | 220 | if (pages_written >= write_chunk) |
233 | break; /* We've done our duty */ | 221 | break; /* We've done our duty */ |
@@ -235,8 +223,9 @@ static void balance_dirty_pages(struct address_space *mapping) | |||
235 | blk_congestion_wait(WRITE, HZ/10); | 223 | blk_congestion_wait(WRITE, HZ/10); |
236 | } | 224 | } |
237 | 225 | ||
238 | if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded) | 226 | if (nr_reclaimable + global_page_state(NR_WRITEBACK) |
239 | dirty_exceeded = 0; | 227 | <= dirty_thresh && dirty_exceeded) |
228 | dirty_exceeded = 0; | ||
240 | 229 | ||
241 | if (writeback_in_progress(bdi)) | 230 | if (writeback_in_progress(bdi)) |
242 | return; /* pdflush is already working this queue */ | 231 | return; /* pdflush is already working this queue */ |
@@ -298,12 +287,11 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); | |||
298 | 287 | ||
299 | void throttle_vm_writeout(void) | 288 | void throttle_vm_writeout(void) |
300 | { | 289 | { |
301 | struct writeback_state wbs; | ||
302 | long background_thresh; | 290 | long background_thresh; |
303 | long dirty_thresh; | 291 | long dirty_thresh; |
304 | 292 | ||
305 | for ( ; ; ) { | 293 | for ( ; ; ) { |
306 | get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); | 294 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL); |
307 | 295 | ||
308 | /* | 296 | /* |
309 | * Boost the allowable dirty threshold a bit for page | 297 | * Boost the allowable dirty threshold a bit for page |
@@ -311,8 +299,9 @@ void throttle_vm_writeout(void) | |||
311 | */ | 299 | */ |
312 | dirty_thresh += dirty_thresh / 10; /* wheeee... */ | 300 | dirty_thresh += dirty_thresh / 10; /* wheeee... */ |
313 | 301 | ||
314 | if (wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh) | 302 | if (global_page_state(NR_UNSTABLE_NFS) + |
315 | break; | 303 | global_page_state(NR_WRITEBACK) <= dirty_thresh) |
304 | break; | ||
316 | blk_congestion_wait(WRITE, HZ/10); | 305 | blk_congestion_wait(WRITE, HZ/10); |
317 | } | 306 | } |
318 | } | 307 | } |
@@ -331,15 +320,16 @@ static void background_writeout(unsigned long _min_pages) | |||
331 | .older_than_this = NULL, | 320 | .older_than_this = NULL, |
332 | .nr_to_write = 0, | 321 | .nr_to_write = 0, |
333 | .nonblocking = 1, | 322 | .nonblocking = 1, |
323 | .range_cyclic = 1, | ||
334 | }; | 324 | }; |
335 | 325 | ||
336 | for ( ; ; ) { | 326 | for ( ; ; ) { |
337 | struct writeback_state wbs; | ||
338 | long background_thresh; | 327 | long background_thresh; |
339 | long dirty_thresh; | 328 | long dirty_thresh; |
340 | 329 | ||
341 | get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL); | 330 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL); |
342 | if (wbs.nr_dirty + wbs.nr_unstable < background_thresh | 331 | if (global_page_state(NR_FILE_DIRTY) + |
332 | global_page_state(NR_UNSTABLE_NFS) < background_thresh | ||
343 | && min_pages <= 0) | 333 | && min_pages <= 0) |
344 | break; | 334 | break; |
345 | wbc.encountered_congestion = 0; | 335 | wbc.encountered_congestion = 0; |
@@ -363,12 +353,9 @@ static void background_writeout(unsigned long _min_pages) | |||
363 | */ | 353 | */ |
364 | int wakeup_pdflush(long nr_pages) | 354 | int wakeup_pdflush(long nr_pages) |
365 | { | 355 | { |
366 | if (nr_pages == 0) { | 356 | if (nr_pages == 0) |
367 | struct writeback_state wbs; | 357 | nr_pages = global_page_state(NR_FILE_DIRTY) + |
368 | 358 | global_page_state(NR_UNSTABLE_NFS); | |
369 | get_writeback_state(&wbs); | ||
370 | nr_pages = wbs.nr_dirty + wbs.nr_unstable; | ||
371 | } | ||
372 | return pdflush_operation(background_writeout, nr_pages); | 359 | return pdflush_operation(background_writeout, nr_pages); |
373 | } | 360 | } |
374 | 361 | ||
@@ -399,7 +386,6 @@ static void wb_kupdate(unsigned long arg) | |||
399 | unsigned long start_jif; | 386 | unsigned long start_jif; |
400 | unsigned long next_jif; | 387 | unsigned long next_jif; |
401 | long nr_to_write; | 388 | long nr_to_write; |
402 | struct writeback_state wbs; | ||
403 | struct writeback_control wbc = { | 389 | struct writeback_control wbc = { |
404 | .bdi = NULL, | 390 | .bdi = NULL, |
405 | .sync_mode = WB_SYNC_NONE, | 391 | .sync_mode = WB_SYNC_NONE, |
@@ -407,15 +393,16 @@ static void wb_kupdate(unsigned long arg) | |||
407 | .nr_to_write = 0, | 393 | .nr_to_write = 0, |
408 | .nonblocking = 1, | 394 | .nonblocking = 1, |
409 | .for_kupdate = 1, | 395 | .for_kupdate = 1, |
396 | .range_cyclic = 1, | ||
410 | }; | 397 | }; |
411 | 398 | ||
412 | sync_supers(); | 399 | sync_supers(); |
413 | 400 | ||
414 | get_writeback_state(&wbs); | ||
415 | oldest_jif = jiffies - dirty_expire_interval; | 401 | oldest_jif = jiffies - dirty_expire_interval; |
416 | start_jif = jiffies; | 402 | start_jif = jiffies; |
417 | next_jif = start_jif + dirty_writeback_interval; | 403 | next_jif = start_jif + dirty_writeback_interval; |
418 | nr_to_write = wbs.nr_dirty + wbs.nr_unstable + | 404 | nr_to_write = global_page_state(NR_FILE_DIRTY) + |
405 | global_page_state(NR_UNSTABLE_NFS) + | ||
419 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | 406 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); |
420 | while (nr_to_write > 0) { | 407 | while (nr_to_write > 0) { |
421 | wbc.encountered_congestion = 0; | 408 | wbc.encountered_congestion = 0; |
@@ -513,14 +500,14 @@ static void set_ratelimit(void) | |||
513 | ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; | 500 | ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE; |
514 | } | 501 | } |
515 | 502 | ||
516 | static int | 503 | static int __cpuinit |
517 | ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) | 504 | ratelimit_handler(struct notifier_block *self, unsigned long u, void *v) |
518 | { | 505 | { |
519 | set_ratelimit(); | 506 | set_ratelimit(); |
520 | return 0; | 507 | return 0; |
521 | } | 508 | } |
522 | 509 | ||
523 | static struct notifier_block ratelimit_nb = { | 510 | static struct notifier_block __cpuinitdata ratelimit_nb = { |
524 | .notifier_call = ratelimit_handler, | 511 | .notifier_call = ratelimit_handler, |
525 | .next = NULL, | 512 | .next = NULL, |
526 | }; | 513 | }; |
@@ -637,7 +624,8 @@ int __set_page_dirty_nobuffers(struct page *page) | |||
637 | if (mapping2) { /* Race with truncate? */ | 624 | if (mapping2) { /* Race with truncate? */ |
638 | BUG_ON(mapping2 != mapping); | 625 | BUG_ON(mapping2 != mapping); |
639 | if (mapping_cap_account_dirty(mapping)) | 626 | if (mapping_cap_account_dirty(mapping)) |
640 | inc_page_state(nr_dirty); | 627 | __inc_zone_page_state(page, |
628 | NR_FILE_DIRTY); | ||
641 | radix_tree_tag_set(&mapping->page_tree, | 629 | radix_tree_tag_set(&mapping->page_tree, |
642 | page_index(page), PAGECACHE_TAG_DIRTY); | 630 | page_index(page), PAGECACHE_TAG_DIRTY); |
643 | } | 631 | } |
@@ -724,9 +712,9 @@ int test_clear_page_dirty(struct page *page) | |||
724 | radix_tree_tag_clear(&mapping->page_tree, | 712 | radix_tree_tag_clear(&mapping->page_tree, |
725 | page_index(page), | 713 | page_index(page), |
726 | PAGECACHE_TAG_DIRTY); | 714 | PAGECACHE_TAG_DIRTY); |
727 | write_unlock_irqrestore(&mapping->tree_lock, flags); | ||
728 | if (mapping_cap_account_dirty(mapping)) | 715 | if (mapping_cap_account_dirty(mapping)) |
729 | dec_page_state(nr_dirty); | 716 | __dec_zone_page_state(page, NR_FILE_DIRTY); |
717 | write_unlock_irqrestore(&mapping->tree_lock, flags); | ||
730 | return 1; | 718 | return 1; |
731 | } | 719 | } |
732 | write_unlock_irqrestore(&mapping->tree_lock, flags); | 720 | write_unlock_irqrestore(&mapping->tree_lock, flags); |
@@ -757,7 +745,7 @@ int clear_page_dirty_for_io(struct page *page) | |||
757 | if (mapping) { | 745 | if (mapping) { |
758 | if (TestClearPageDirty(page)) { | 746 | if (TestClearPageDirty(page)) { |
759 | if (mapping_cap_account_dirty(mapping)) | 747 | if (mapping_cap_account_dirty(mapping)) |
760 | dec_page_state(nr_dirty); | 748 | dec_zone_page_state(page, NR_FILE_DIRTY); |
761 | return 1; | 749 | return 1; |
762 | } | 750 | } |
763 | return 0; | 751 | return 0; |
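The conversions in this file all follow the same pattern: the old global page_state fields become per-zone counters. A minimal sketch of the calls as they are used here, assuming the vmstat interface of this tree (the double-underscore forms expect the caller to have interrupts already disabled, as under tree_lock above; the plain forms are the irq-safe variants):

	unsigned long nr_dirty;

	__inc_zone_page_state(page, NR_FILE_DIRTY);	/* irqs already off */
	dec_zone_page_state(page, NR_FILE_DIRTY);	/* disables irqs itself */

	/* system-wide totals replace read_page_state()/get_writeback_state() */
	nr_dirty = global_page_state(NR_FILE_DIRTY) +
		   global_page_state(NR_UNSTABLE_NFS);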
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 253a450c400d..3e792a583f3b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -14,7 +14,6 @@ | |||
14 | * (lots of bits borrowed from Ingo Molnar & Andrew Morton) | 14 | * (lots of bits borrowed from Ingo Molnar & Andrew Morton) |
15 | */ | 15 | */ |
16 | 16 | ||
17 | #include <linux/config.h> | ||
18 | #include <linux/stddef.h> | 17 | #include <linux/stddef.h> |
19 | #include <linux/mm.h> | 18 | #include <linux/mm.h> |
20 | #include <linux/swap.h> | 19 | #include <linux/swap.h> |
@@ -37,6 +36,7 @@ | |||
37 | #include <linux/nodemask.h> | 36 | #include <linux/nodemask.h> |
38 | #include <linux/vmalloc.h> | 37 | #include <linux/vmalloc.h> |
39 | #include <linux/mempolicy.h> | 38 | #include <linux/mempolicy.h> |
39 | #include <linux/stop_machine.h> | ||
40 | 40 | ||
41 | #include <asm/tlbflush.h> | 41 | #include <asm/tlbflush.h> |
42 | #include <asm/div64.h> | 42 | #include <asm/div64.h> |
@@ -83,8 +83,8 @@ EXPORT_SYMBOL(zone_table); | |||
83 | static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; | 83 | static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; |
84 | int min_free_kbytes = 1024; | 84 | int min_free_kbytes = 1024; |
85 | 85 | ||
86 | unsigned long __initdata nr_kernel_pages; | 86 | unsigned long __meminitdata nr_kernel_pages; |
87 | unsigned long __initdata nr_all_pages; | 87 | unsigned long __meminitdata nr_all_pages; |
88 | 88 | ||
89 | #ifdef CONFIG_DEBUG_VM | 89 | #ifdef CONFIG_DEBUG_VM |
90 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 90 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
@@ -265,7 +265,7 @@ static inline void rmv_page_order(struct page *page) | |||
265 | * satisfies the following equation: | 265 | * satisfies the following equation: |
266 | * P = B & ~(1 << O) | 266 | * P = B & ~(1 << O) |
267 | * | 267 | * |
268 | * Assumption: *_mem_map is contigious at least up to MAX_ORDER | 268 | * Assumption: *_mem_map is contiguous at least up to MAX_ORDER |
269 | */ | 269 | */ |
270 | static inline struct page * | 270 | static inline struct page * |
271 | __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) | 271 | __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) |
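Worked example of the relation quoted above: for a buddy at index B = 12 (binary 1100) at order O = 2, the combined page starts at P = 12 & ~(1 << 2) = 8; its order-2 buddy at index 8 maps to the same P, which is what lets __free_one_page() coalesce the pair.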
@@ -286,22 +286,27 @@ __find_combined_index(unsigned long page_idx, unsigned int order) | |||
286 | * we can do coalesce a page and its buddy if | 286 | * we can do coalesce a page and its buddy if |
287 | * (a) the buddy is not in a hole && | 287 | * (a) the buddy is not in a hole && |
288 | * (b) the buddy is in the buddy system && | 288 | * (b) the buddy is in the buddy system && |
289 | * (c) a page and its buddy have the same order. | 289 | * (c) a page and its buddy have the same order && |
290 | * (d) a page and its buddy are in the same zone. | ||
290 | * | 291 | * |
291 | * For recording whether a page is in the buddy system, we use PG_buddy. | 292 | * For recording whether a page is in the buddy system, we use PG_buddy. |
292 | * Setting, clearing, and testing PG_buddy is serialized by zone->lock. | 293 | * Setting, clearing, and testing PG_buddy is serialized by zone->lock. |
293 | * | 294 | * |
294 | * For recording page's order, we use page_private(page). | 295 | * For recording page's order, we use page_private(page). |
295 | */ | 296 | */ |
296 | static inline int page_is_buddy(struct page *page, int order) | 297 | static inline int page_is_buddy(struct page *page, struct page *buddy, |
298 | int order) | ||
297 | { | 299 | { |
298 | #ifdef CONFIG_HOLES_IN_ZONE | 300 | #ifdef CONFIG_HOLES_IN_ZONE |
299 | if (!pfn_valid(page_to_pfn(page))) | 301 | if (!pfn_valid(page_to_pfn(buddy))) |
300 | return 0; | 302 | return 0; |
301 | #endif | 303 | #endif |
302 | 304 | ||
303 | if (PageBuddy(page) && page_order(page) == order) { | 305 | if (page_zone_id(page) != page_zone_id(buddy)) |
304 | BUG_ON(page_count(page) != 0); | 306 | return 0; |
307 | |||
308 | if (PageBuddy(buddy) && page_order(buddy) == order) { | ||
309 | BUG_ON(page_count(buddy) != 0); | ||
305 | return 1; | 310 | return 1; |
306 | } | 311 | } |
307 | return 0; | 312 | return 0; |
@@ -352,7 +357,7 @@ static inline void __free_one_page(struct page *page, | |||
352 | struct page *buddy; | 357 | struct page *buddy; |
353 | 358 | ||
354 | buddy = __page_find_buddy(page, page_idx, order); | 359 | buddy = __page_find_buddy(page, page_idx, order); |
355 | if (!page_is_buddy(buddy, order)) | 360 | if (!page_is_buddy(page, buddy, order)) |
356 | break; /* Move the buddy up one level. */ | 361 | break; /* Move the buddy up one level. */ |
357 | 362 | ||
358 | list_del(&buddy->lru); | 363 | list_del(&buddy->lru); |
@@ -440,8 +445,8 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
440 | 445 | ||
441 | arch_free_page(page, order); | 446 | arch_free_page(page, order); |
442 | if (!PageHighMem(page)) | 447 | if (!PageHighMem(page)) |
443 | mutex_debug_check_no_locks_freed(page_address(page), | 448 | debug_check_no_locks_freed(page_address(page), |
444 | PAGE_SIZE<<order); | 449 | PAGE_SIZE<<order); |
445 | 450 | ||
446 | for (i = 0 ; i < (1 << order) ; ++i) | 451 | for (i = 0 ; i < (1 << order) ; ++i) |
447 | reserved += free_pages_check(page + i); | 452 | reserved += free_pages_check(page + i); |
@@ -450,7 +455,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
450 | 455 | ||
451 | kernel_map_pages(page, 1 << order, 0); | 456 | kernel_map_pages(page, 1 << order, 0); |
452 | local_irq_save(flags); | 457 | local_irq_save(flags); |
453 | __mod_page_state(pgfree, 1 << order); | 458 | __count_vm_events(PGFREE, 1 << order); |
454 | free_one_page(page_zone(page), page, order); | 459 | free_one_page(page_zone(page), page, order); |
455 | local_irq_restore(flags); | 460 | local_irq_restore(flags); |
456 | } | 461 | } |
@@ -703,27 +708,6 @@ void drain_local_pages(void) | |||
703 | } | 708 | } |
704 | #endif /* CONFIG_PM */ | 709 | #endif /* CONFIG_PM */ |
705 | 710 | ||
706 | static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu) | ||
707 | { | ||
708 | #ifdef CONFIG_NUMA | ||
709 | pg_data_t *pg = z->zone_pgdat; | ||
710 | pg_data_t *orig = zonelist->zones[0]->zone_pgdat; | ||
711 | struct per_cpu_pageset *p; | ||
712 | |||
713 | p = zone_pcp(z, cpu); | ||
714 | if (pg == orig) { | ||
715 | p->numa_hit++; | ||
716 | } else { | ||
717 | p->numa_miss++; | ||
718 | zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; | ||
719 | } | ||
720 | if (pg == NODE_DATA(numa_node_id())) | ||
721 | p->local_node++; | ||
722 | else | ||
723 | p->other_node++; | ||
724 | #endif | ||
725 | } | ||
726 | |||
727 | /* | 711 | /* |
728 | * Free a 0-order page | 712 | * Free a 0-order page |
729 | */ | 713 | */ |
@@ -744,7 +728,7 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) | |||
744 | 728 | ||
745 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; | 729 | pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; |
746 | local_irq_save(flags); | 730 | local_irq_save(flags); |
747 | __inc_page_state(pgfree); | 731 | __count_vm_event(PGFREE); |
748 | list_add(&page->lru, &pcp->list); | 732 | list_add(&page->lru, &pcp->list); |
749 | pcp->count++; | 733 | pcp->count++; |
750 | if (pcp->count >= pcp->high) { | 734 | if (pcp->count >= pcp->high) { |
@@ -820,8 +804,8 @@ again: | |||
820 | goto failed; | 804 | goto failed; |
821 | } | 805 | } |
822 | 806 | ||
823 | __mod_page_state_zone(zone, pgalloc, 1 << order); | 807 | __count_zone_vm_events(PGALLOC, zone, 1 << order); |
824 | zone_statistics(zonelist, zone, cpu); | 808 | zone_statistics(zonelist, zone); |
825 | local_irq_restore(flags); | 809 | local_irq_restore(flags); |
826 | put_cpu(); | 810 | put_cpu(); |
827 | 811 | ||
@@ -951,8 +935,7 @@ restart: | |||
951 | goto got_pg; | 935 | goto got_pg; |
952 | 936 | ||
953 | do { | 937 | do { |
954 | if (cpuset_zone_allowed(*z, gfp_mask|__GFP_HARDWALL)) | 938 | wakeup_kswapd(*z, order); |
955 | wakeup_kswapd(*z, order); | ||
956 | } while (*(++z)); | 939 | } while (*(++z)); |
957 | 940 | ||
958 | /* | 941 | /* |
@@ -1226,141 +1209,6 @@ static void show_node(struct zone *zone) | |||
1226 | #define show_node(zone) do { } while (0) | 1209 | #define show_node(zone) do { } while (0) |
1227 | #endif | 1210 | #endif |
1228 | 1211 | ||
1229 | /* | ||
1230 | * Accumulate the page_state information across all CPUs. | ||
1231 | * The result is unavoidably approximate - it can change | ||
1232 | * during and after execution of this function. | ||
1233 | */ | ||
1234 | static DEFINE_PER_CPU(struct page_state, page_states) = {0}; | ||
1235 | |||
1236 | atomic_t nr_pagecache = ATOMIC_INIT(0); | ||
1237 | EXPORT_SYMBOL(nr_pagecache); | ||
1238 | #ifdef CONFIG_SMP | ||
1239 | DEFINE_PER_CPU(long, nr_pagecache_local) = 0; | ||
1240 | #endif | ||
1241 | |||
1242 | static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) | ||
1243 | { | ||
1244 | unsigned cpu; | ||
1245 | |||
1246 | memset(ret, 0, nr * sizeof(unsigned long)); | ||
1247 | cpus_and(*cpumask, *cpumask, cpu_online_map); | ||
1248 | |||
1249 | for_each_cpu_mask(cpu, *cpumask) { | ||
1250 | unsigned long *in; | ||
1251 | unsigned long *out; | ||
1252 | unsigned off; | ||
1253 | unsigned next_cpu; | ||
1254 | |||
1255 | in = (unsigned long *)&per_cpu(page_states, cpu); | ||
1256 | |||
1257 | next_cpu = next_cpu(cpu, *cpumask); | ||
1258 | if (likely(next_cpu < NR_CPUS)) | ||
1259 | prefetch(&per_cpu(page_states, next_cpu)); | ||
1260 | |||
1261 | out = (unsigned long *)ret; | ||
1262 | for (off = 0; off < nr; off++) | ||
1263 | *out++ += *in++; | ||
1264 | } | ||
1265 | } | ||
1266 | |||
1267 | void get_page_state_node(struct page_state *ret, int node) | ||
1268 | { | ||
1269 | int nr; | ||
1270 | cpumask_t mask = node_to_cpumask(node); | ||
1271 | |||
1272 | nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); | ||
1273 | nr /= sizeof(unsigned long); | ||
1274 | |||
1275 | __get_page_state(ret, nr+1, &mask); | ||
1276 | } | ||
1277 | |||
1278 | void get_page_state(struct page_state *ret) | ||
1279 | { | ||
1280 | int nr; | ||
1281 | cpumask_t mask = CPU_MASK_ALL; | ||
1282 | |||
1283 | nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); | ||
1284 | nr /= sizeof(unsigned long); | ||
1285 | |||
1286 | __get_page_state(ret, nr + 1, &mask); | ||
1287 | } | ||
1288 | |||
1289 | void get_full_page_state(struct page_state *ret) | ||
1290 | { | ||
1291 | cpumask_t mask = CPU_MASK_ALL; | ||
1292 | |||
1293 | __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); | ||
1294 | } | ||
1295 | |||
1296 | unsigned long read_page_state_offset(unsigned long offset) | ||
1297 | { | ||
1298 | unsigned long ret = 0; | ||
1299 | int cpu; | ||
1300 | |||
1301 | for_each_online_cpu(cpu) { | ||
1302 | unsigned long in; | ||
1303 | |||
1304 | in = (unsigned long)&per_cpu(page_states, cpu) + offset; | ||
1305 | ret += *((unsigned long *)in); | ||
1306 | } | ||
1307 | return ret; | ||
1308 | } | ||
1309 | |||
1310 | void __mod_page_state_offset(unsigned long offset, unsigned long delta) | ||
1311 | { | ||
1312 | void *ptr; | ||
1313 | |||
1314 | ptr = &__get_cpu_var(page_states); | ||
1315 | *(unsigned long *)(ptr + offset) += delta; | ||
1316 | } | ||
1317 | EXPORT_SYMBOL(__mod_page_state_offset); | ||
1318 | |||
1319 | void mod_page_state_offset(unsigned long offset, unsigned long delta) | ||
1320 | { | ||
1321 | unsigned long flags; | ||
1322 | void *ptr; | ||
1323 | |||
1324 | local_irq_save(flags); | ||
1325 | ptr = &__get_cpu_var(page_states); | ||
1326 | *(unsigned long *)(ptr + offset) += delta; | ||
1327 | local_irq_restore(flags); | ||
1328 | } | ||
1329 | EXPORT_SYMBOL(mod_page_state_offset); | ||
1330 | |||
1331 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, | ||
1332 | unsigned long *free, struct pglist_data *pgdat) | ||
1333 | { | ||
1334 | struct zone *zones = pgdat->node_zones; | ||
1335 | int i; | ||
1336 | |||
1337 | *active = 0; | ||
1338 | *inactive = 0; | ||
1339 | *free = 0; | ||
1340 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
1341 | *active += zones[i].nr_active; | ||
1342 | *inactive += zones[i].nr_inactive; | ||
1343 | *free += zones[i].free_pages; | ||
1344 | } | ||
1345 | } | ||
1346 | |||
1347 | void get_zone_counts(unsigned long *active, | ||
1348 | unsigned long *inactive, unsigned long *free) | ||
1349 | { | ||
1350 | struct pglist_data *pgdat; | ||
1351 | |||
1352 | *active = 0; | ||
1353 | *inactive = 0; | ||
1354 | *free = 0; | ||
1355 | for_each_online_pgdat(pgdat) { | ||
1356 | unsigned long l, m, n; | ||
1357 | __get_zone_counts(&l, &m, &n, pgdat); | ||
1358 | *active += l; | ||
1359 | *inactive += m; | ||
1360 | *free += n; | ||
1361 | } | ||
1362 | } | ||
1363 | |||
1364 | void si_meminfo(struct sysinfo *val) | 1212 | void si_meminfo(struct sysinfo *val) |
1365 | { | 1213 | { |
1366 | val->totalram = totalram_pages; | 1214 | val->totalram = totalram_pages; |
@@ -1401,7 +1249,6 @@ void si_meminfo_node(struct sysinfo *val, int nid) | |||
1401 | */ | 1249 | */ |
1402 | void show_free_areas(void) | 1250 | void show_free_areas(void) |
1403 | { | 1251 | { |
1404 | struct page_state ps; | ||
1405 | int cpu, temperature; | 1252 | int cpu, temperature; |
1406 | unsigned long active; | 1253 | unsigned long active; |
1407 | unsigned long inactive; | 1254 | unsigned long inactive; |
@@ -1433,7 +1280,6 @@ void show_free_areas(void) | |||
1433 | } | 1280 | } |
1434 | } | 1281 | } |
1435 | 1282 | ||
1436 | get_page_state(&ps); | ||
1437 | get_zone_counts(&active, &inactive, &free); | 1283 | get_zone_counts(&active, &inactive, &free); |
1438 | 1284 | ||
1439 | printk("Free pages: %11ukB (%ukB HighMem)\n", | 1285 | printk("Free pages: %11ukB (%ukB HighMem)\n", |
@@ -1444,13 +1290,13 @@ void show_free_areas(void) | |||
1444 | "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", | 1290 | "unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n", |
1445 | active, | 1291 | active, |
1446 | inactive, | 1292 | inactive, |
1447 | ps.nr_dirty, | 1293 | global_page_state(NR_FILE_DIRTY), |
1448 | ps.nr_writeback, | 1294 | global_page_state(NR_WRITEBACK), |
1449 | ps.nr_unstable, | 1295 | global_page_state(NR_UNSTABLE_NFS), |
1450 | nr_free_pages(), | 1296 | nr_free_pages(), |
1451 | ps.nr_slab, | 1297 | global_page_state(NR_SLAB), |
1452 | ps.nr_mapped, | 1298 | global_page_state(NR_FILE_MAPPED), |
1453 | ps.nr_page_table_pages); | 1299 | global_page_state(NR_PAGETABLE)); |
1454 | 1300 | ||
1455 | for_each_zone(zone) { | 1301 | for_each_zone(zone) { |
1456 | int i; | 1302 | int i; |
@@ -1485,7 +1331,7 @@ void show_free_areas(void) | |||
1485 | } | 1331 | } |
1486 | 1332 | ||
1487 | for_each_zone(zone) { | 1333 | for_each_zone(zone) { |
1488 | unsigned long nr, flags, order, total = 0; | 1334 | unsigned long nr[MAX_ORDER], flags, order, total = 0; |
1489 | 1335 | ||
1490 | show_node(zone); | 1336 | show_node(zone); |
1491 | printk("%s: ", zone->name); | 1337 | printk("%s: ", zone->name); |
@@ -1496,11 +1342,12 @@ void show_free_areas(void) | |||
1496 | 1342 | ||
1497 | spin_lock_irqsave(&zone->lock, flags); | 1343 | spin_lock_irqsave(&zone->lock, flags); |
1498 | for (order = 0; order < MAX_ORDER; order++) { | 1344 | for (order = 0; order < MAX_ORDER; order++) { |
1499 | nr = zone->free_area[order].nr_free; | 1345 | nr[order] = zone->free_area[order].nr_free; |
1500 | total += nr << order; | 1346 | total += nr[order] << order; |
1501 | printk("%lu*%lukB ", nr, K(1UL) << order); | ||
1502 | } | 1347 | } |
1503 | spin_unlock_irqrestore(&zone->lock, flags); | 1348 | spin_unlock_irqrestore(&zone->lock, flags); |
1349 | for (order = 0; order < MAX_ORDER; order++) | ||
1350 | printk("%lu*%lukB ", nr[order], K(1UL) << order); | ||
1504 | printk("= %lukB\n", K(total)); | 1351 | printk("= %lukB\n", K(total)); |
1505 | } | 1352 | } |
1506 | 1353 | ||
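The rework above snapshots the per-order free counts into nr[] under zone->lock and defers the printk() calls until the lock is dropped, since console output with interrupts disabled can hold the zone lock for a long time. The total follows the usual buddy arithmetic of nr[order] << order pages per order; a hedged example with made-up counts:

	/* Assumed counts for a zone with 4 KiB pages; not from any real system. */
	unsigned long nr[3] = { 5, 3, 2 };	/* free blocks at order 0, 1, 2 */
	unsigned long order, total = 0;

	for (order = 0; order < 3; order++)
		total += nr[order] << order;	/* 5*1 + 3*2 + 2*4 = 19 pages */
	/* K(total) would then report 19 * 4 kB = 76 kB free in this zone. */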
@@ -1512,7 +1359,7 @@ void show_free_areas(void) | |||
1512 | * | 1359 | * |
1513 | * Add all populated zones of a node to the zonelist. | 1360 | * Add all populated zones of a node to the zonelist. |
1514 | */ | 1361 | */ |
1515 | static int __init build_zonelists_node(pg_data_t *pgdat, | 1362 | static int __meminit build_zonelists_node(pg_data_t *pgdat, |
1516 | struct zonelist *zonelist, int nr_zones, int zone_type) | 1363 | struct zonelist *zonelist, int nr_zones, int zone_type) |
1517 | { | 1364 | { |
1518 | struct zone *zone; | 1365 | struct zone *zone; |
@@ -1548,7 +1395,7 @@ static inline int highest_zone(int zone_bits) | |||
1548 | 1395 | ||
1549 | #ifdef CONFIG_NUMA | 1396 | #ifdef CONFIG_NUMA |
1550 | #define MAX_NODE_LOAD (num_online_nodes()) | 1397 | #define MAX_NODE_LOAD (num_online_nodes()) |
1551 | static int __initdata node_load[MAX_NUMNODES]; | 1398 | static int __meminitdata node_load[MAX_NUMNODES]; |
1552 | /** | 1399 | /** |
1553 | * find_next_best_node - find the next node that should appear in a given node's fallback list | 1400 | * find_next_best_node - find the next node that should appear in a given node's fallback list |
1554 | * @node: node whose fallback list we're appending | 1401 | * @node: node whose fallback list we're appending |
@@ -1563,7 +1410,7 @@ static int __initdata node_load[MAX_NUMNODES]; | |||
1563 | * on them otherwise. | 1410 | * on them otherwise. |
1564 | * It returns -1 if no node is found. | 1411 | * It returns -1 if no node is found. |
1565 | */ | 1412 | */ |
1566 | static int __init find_next_best_node(int node, nodemask_t *used_node_mask) | 1413 | static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) |
1567 | { | 1414 | { |
1568 | int n, val; | 1415 | int n, val; |
1569 | int min_val = INT_MAX; | 1416 | int min_val = INT_MAX; |
@@ -1609,7 +1456,7 @@ static int __init find_next_best_node(int node, nodemask_t *used_node_mask) | |||
1609 | return best_node; | 1456 | return best_node; |
1610 | } | 1457 | } |
1611 | 1458 | ||
1612 | static void __init build_zonelists(pg_data_t *pgdat) | 1459 | static void __meminit build_zonelists(pg_data_t *pgdat) |
1613 | { | 1460 | { |
1614 | int i, j, k, node, local_node; | 1461 | int i, j, k, node, local_node; |
1615 | int prev_node, load; | 1462 | int prev_node, load; |
@@ -1661,7 +1508,7 @@ static void __init build_zonelists(pg_data_t *pgdat) | |||
1661 | 1508 | ||
1662 | #else /* CONFIG_NUMA */ | 1509 | #else /* CONFIG_NUMA */ |
1663 | 1510 | ||
1664 | static void __init build_zonelists(pg_data_t *pgdat) | 1511 | static void __meminit build_zonelists(pg_data_t *pgdat) |
1665 | { | 1512 | { |
1666 | int i, j, k, node, local_node; | 1513 | int i, j, k, node, local_node; |
1667 | 1514 | ||
@@ -1699,14 +1546,29 @@ static void __init build_zonelists(pg_data_t *pgdat) | |||
1699 | 1546 | ||
1700 | #endif /* CONFIG_NUMA */ | 1547 | #endif /* CONFIG_NUMA */ |
1701 | 1548 | ||
1702 | void __init build_all_zonelists(void) | 1549 | /* return value is int just for stop_machine_run() */ |
1550 | static int __meminit __build_all_zonelists(void *dummy) | ||
1703 | { | 1551 | { |
1704 | int i; | 1552 | int nid; |
1553 | for_each_online_node(nid) | ||
1554 | build_zonelists(NODE_DATA(nid)); | ||
1555 | return 0; | ||
1556 | } | ||
1705 | 1557 | ||
1706 | for_each_online_node(i) | 1558 | void __meminit build_all_zonelists(void) |
1707 | build_zonelists(NODE_DATA(i)); | 1559 | { |
1708 | printk("Built %i zonelists\n", num_online_nodes()); | 1560 | if (system_state == SYSTEM_BOOTING) { |
1709 | cpuset_init_current_mems_allowed(); | 1561 | __build_all_zonelists(0); |
1562 | cpuset_init_current_mems_allowed(); | ||
1563 | } else { | ||
1564 | /* we have to stop all cpus to guarantee there is no user | ||
1565 | of zonelist */ | ||
1566 | stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); | ||
1567 | /* cpuset refresh routine should be here */ | ||
1568 | } | ||
1569 | vm_total_pages = nr_free_pagecache_pages(); | ||
1570 | printk("Built %i zonelists. Total pages: %ld\n", | ||
1571 | num_online_nodes(), vm_total_pages); | ||
1710 | } | 1572 | } |
1711 | 1573 | ||
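Once the system is past boot, any CPU may be walking the zonelists at any time, which is why the hot-add path above rebuilds them inside stop_machine_run(), parking every other CPU for the duration. A hedged sketch of the same pattern for an arbitrary read-mostly table (rebuild_my_table() and my_table_refresh() are invented names, not allocator code):

	static int rebuild_my_table(void *unused)
	{
		/* rewrite the read-mostly structure here */
		return 0;	/* int return only to satisfy stop_machine_run() */
	}

	static void my_table_refresh(void)
	{
		if (system_state == SYSTEM_BOOTING)
			rebuild_my_table(NULL);		/* no other users exist yet */
		else
			stop_machine_run(rebuild_my_table, NULL, NR_CPUS);
	}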
1712 | /* | 1574 | /* |
@@ -1722,7 +1584,8 @@ void __init build_all_zonelists(void) | |||
1722 | */ | 1584 | */ |
1723 | #define PAGES_PER_WAITQUEUE 256 | 1585 | #define PAGES_PER_WAITQUEUE 256 |
1724 | 1586 | ||
1725 | static inline unsigned long wait_table_size(unsigned long pages) | 1587 | #ifndef CONFIG_MEMORY_HOTPLUG |
1588 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) | ||
1726 | { | 1589 | { |
1727 | unsigned long size = 1; | 1590 | unsigned long size = 1; |
1728 | 1591 | ||
@@ -1740,6 +1603,29 @@ static inline unsigned long wait_table_size(unsigned long pages) | |||
1740 | 1603 | ||
1741 | return max(size, 4UL); | 1604 | return max(size, 4UL); |
1742 | } | 1605 | } |
1606 | #else | ||
1607 | /* | ||
1608 | * A zone's size might be changed by hot-add, so it is not possible to determine | ||
1609 | * a suitable size for its wait_table. So we use the maximum size now. | ||
1610 | * | ||
1611 | * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: | ||
1612 | * | ||
1613 | * i386 (preemption config) : 4096 x 16 = 64Kbyte. | ||
1614 | * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. | ||
1615 | * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. | ||
1616 | * | ||
1617 | * The maximum entries are prepared when a zone's memory is (512K + 256) pages | ||
1618 | * or more by the traditional way. (See above). It equals: | ||
1619 | * | ||
1620 | * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. | ||
1621 | * ia64(16K page size) : = ( 8G + 4M)byte. | ||
1622 | * powerpc (64K page size) : = (32G +16M)byte. | ||
1623 | */ | ||
1624 | static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) | ||
1625 | { | ||
1626 | return 4096UL; | ||
1627 | } | ||
1628 | #endif | ||
1743 | 1629 | ||
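Because a hot-addable zone's final size is unknown, the hunk above simply sizes the hash for the worst case (4096 entries); the non-hotplug helper instead rounds pages / PAGES_PER_WAITQUEUE up to a power of two and clamps it to the range [4, 4096]. A worked example under assumed numbers:

	/*
	 * Assumed: a 1 GiB zone of 4 KiB pages = 262144 pages.
	 *   262144 / PAGES_PER_WAITQUEUE (256) = 1024
	 *   rounded up to a power of two        = 1024
	 *   clamped to [4, 4096]                = 1024 entries
	 *   wait_table_bits(1024)               = 10 bits of hash
	 * With CONFIG_MEMORY_HOTPLUG the same zone gets 4096 entries regardless,
	 * costing roughly 4096 * sizeof(wait_queue_head_t) of memory up front.
	 */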
1744 | /* | 1630 | /* |
1745 | * This is an integer logarithm so that shifts can be used later | 1631 | * This is an integer logarithm so that shifts can be used later |
@@ -1964,7 +1850,7 @@ static inline void free_zone_pagesets(int cpu) | |||
1964 | } | 1850 | } |
1965 | } | 1851 | } |
1966 | 1852 | ||
1967 | static int pageset_cpuup_callback(struct notifier_block *nfb, | 1853 | static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, |
1968 | unsigned long action, | 1854 | unsigned long action, |
1969 | void *hcpu) | 1855 | void *hcpu) |
1970 | { | 1856 | { |
@@ -1986,7 +1872,7 @@ static int pageset_cpuup_callback(struct notifier_block *nfb, | |||
1986 | return ret; | 1872 | return ret; |
1987 | } | 1873 | } |
1988 | 1874 | ||
1989 | static struct notifier_block pageset_notifier = | 1875 | static struct notifier_block __cpuinitdata pageset_notifier = |
1990 | { &pageset_cpuup_callback, NULL, 0 }; | 1876 | { &pageset_cpuup_callback, NULL, 0 }; |
1991 | 1877 | ||
1992 | void __init setup_per_cpu_pageset(void) | 1878 | void __init setup_per_cpu_pageset(void) |
@@ -2005,23 +1891,46 @@ void __init setup_per_cpu_pageset(void) | |||
2005 | #endif | 1891 | #endif |
2006 | 1892 | ||
2007 | static __meminit | 1893 | static __meminit |
2008 | void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 1894 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
2009 | { | 1895 | { |
2010 | int i; | 1896 | int i; |
2011 | struct pglist_data *pgdat = zone->zone_pgdat; | 1897 | struct pglist_data *pgdat = zone->zone_pgdat; |
1898 | size_t alloc_size; | ||
2012 | 1899 | ||
2013 | /* | 1900 | /* |
2014 | * The per-page waitqueue mechanism uses hashed waitqueues | 1901 | * The per-page waitqueue mechanism uses hashed waitqueues |
2015 | * per zone. | 1902 | * per zone. |
2016 | */ | 1903 | */ |
2017 | zone->wait_table_size = wait_table_size(zone_size_pages); | 1904 | zone->wait_table_hash_nr_entries = |
2018 | zone->wait_table_bits = wait_table_bits(zone->wait_table_size); | 1905 | wait_table_hash_nr_entries(zone_size_pages); |
2019 | zone->wait_table = (wait_queue_head_t *) | 1906 | zone->wait_table_bits = |
2020 | alloc_bootmem_node(pgdat, zone->wait_table_size | 1907 | wait_table_bits(zone->wait_table_hash_nr_entries); |
2021 | * sizeof(wait_queue_head_t)); | 1908 | alloc_size = zone->wait_table_hash_nr_entries |
1909 | * sizeof(wait_queue_head_t); | ||
1910 | |||
1911 | if (system_state == SYSTEM_BOOTING) { | ||
1912 | zone->wait_table = (wait_queue_head_t *) | ||
1913 | alloc_bootmem_node(pgdat, alloc_size); | ||
1914 | } else { | ||
1915 | /* | ||
1916 | * This case means that a zone whose size was 0 gets new memory | ||
1917 | * via memory hot-add. | ||
1918 | * But it may be the case that a new node was hot-added. In | ||
1919 | * this case vmalloc() will not be able to use this new node's | ||
1920 | * memory - this wait_table must be initialized to use this new | ||
1921 | * node itself as well. | ||
1922 | * To use this new node's memory, further consideration will be | ||
1923 | * necessary. | ||
1924 | */ | ||
1925 | zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); | ||
1926 | } | ||
1927 | if (!zone->wait_table) | ||
1928 | return -ENOMEM; | ||
2022 | 1929 | ||
2023 | for(i = 0; i < zone->wait_table_size; ++i) | 1930 | for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) |
2024 | init_waitqueue_head(zone->wait_table + i); | 1931 | init_waitqueue_head(zone->wait_table + i); |
1932 | |||
1933 | return 0; | ||
2025 | } | 1934 | } |
2026 | 1935 | ||
2027 | static __meminit void zone_pcp_init(struct zone *zone) | 1936 | static __meminit void zone_pcp_init(struct zone *zone) |
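For context on how the table initialised above is consumed: a sleeping page waiter hashes its struct page pointer into one of the wait_table_hash_nr_entries heads, roughly what the mm/filemap.c lookup does (sketch, not quoted verbatim; hash_ptr() comes from linux/hash.h):

	/* Map a page to its hashed wait queue head in the owning zone. */
	static wait_queue_head_t *page_waitqueue(struct page *page)
	{
		const struct zone *zone = page_zone(page);

		return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)];
	}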
@@ -2043,12 +1952,15 @@ static __meminit void zone_pcp_init(struct zone *zone) | |||
2043 | zone->name, zone->present_pages, batch); | 1952 | zone->name, zone->present_pages, batch); |
2044 | } | 1953 | } |
2045 | 1954 | ||
2046 | static __meminit void init_currently_empty_zone(struct zone *zone, | 1955 | __meminit int init_currently_empty_zone(struct zone *zone, |
2047 | unsigned long zone_start_pfn, unsigned long size) | 1956 | unsigned long zone_start_pfn, |
1957 | unsigned long size) | ||
2048 | { | 1958 | { |
2049 | struct pglist_data *pgdat = zone->zone_pgdat; | 1959 | struct pglist_data *pgdat = zone->zone_pgdat; |
2050 | 1960 | int ret; | |
2051 | zone_wait_table_init(zone, size); | 1961 | ret = zone_wait_table_init(zone, size); |
1962 | if (ret) | ||
1963 | return ret; | ||
2052 | pgdat->nr_zones = zone_idx(zone) + 1; | 1964 | pgdat->nr_zones = zone_idx(zone) + 1; |
2053 | 1965 | ||
2054 | zone->zone_start_pfn = zone_start_pfn; | 1966 | zone->zone_start_pfn = zone_start_pfn; |
@@ -2056,6 +1968,8 @@ static __meminit void init_currently_empty_zone(struct zone *zone, | |||
2056 | memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); | 1968 | memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); |
2057 | 1969 | ||
2058 | zone_init_free_lists(pgdat, zone, zone->spanned_pages); | 1970 | zone_init_free_lists(pgdat, zone, zone->spanned_pages); |
1971 | |||
1972 | return 0; | ||
2059 | } | 1973 | } |
2060 | 1974 | ||
2061 | /* | 1975 | /* |
@@ -2064,12 +1978,13 @@ static __meminit void init_currently_empty_zone(struct zone *zone, | |||
2064 | * - mark all memory queues empty | 1978 | * - mark all memory queues empty |
2065 | * - clear the memory bitmaps | 1979 | * - clear the memory bitmaps |
2066 | */ | 1980 | */ |
2067 | static void __init free_area_init_core(struct pglist_data *pgdat, | 1981 | static void __meminit free_area_init_core(struct pglist_data *pgdat, |
2068 | unsigned long *zones_size, unsigned long *zholes_size) | 1982 | unsigned long *zones_size, unsigned long *zholes_size) |
2069 | { | 1983 | { |
2070 | unsigned long j; | 1984 | unsigned long j; |
2071 | int nid = pgdat->node_id; | 1985 | int nid = pgdat->node_id; |
2072 | unsigned long zone_start_pfn = pgdat->node_start_pfn; | 1986 | unsigned long zone_start_pfn = pgdat->node_start_pfn; |
1987 | int ret; | ||
2073 | 1988 | ||
2074 | pgdat_resize_init(pgdat); | 1989 | pgdat_resize_init(pgdat); |
2075 | pgdat->nr_zones = 0; | 1990 | pgdat->nr_zones = 0; |
@@ -2106,12 +2021,14 @@ static void __init free_area_init_core(struct pglist_data *pgdat, | |||
2106 | zone->nr_scan_inactive = 0; | 2021 | zone->nr_scan_inactive = 0; |
2107 | zone->nr_active = 0; | 2022 | zone->nr_active = 0; |
2108 | zone->nr_inactive = 0; | 2023 | zone->nr_inactive = 0; |
2024 | zap_zone_vm_stats(zone); | ||
2109 | atomic_set(&zone->reclaim_in_progress, 0); | 2025 | atomic_set(&zone->reclaim_in_progress, 0); |
2110 | if (!size) | 2026 | if (!size) |
2111 | continue; | 2027 | continue; |
2112 | 2028 | ||
2113 | zonetable_add(zone, nid, j, zone_start_pfn, size); | 2029 | zonetable_add(zone, nid, j, zone_start_pfn, size); |
2114 | init_currently_empty_zone(zone, zone_start_pfn, size); | 2030 | ret = init_currently_empty_zone(zone, zone_start_pfn, size); |
2031 | BUG_ON(ret); | ||
2115 | zone_start_pfn += size; | 2032 | zone_start_pfn += size; |
2116 | } | 2033 | } |
2117 | } | 2034 | } |
@@ -2152,7 +2069,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) | |||
2152 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ | 2069 | #endif /* CONFIG_FLAT_NODE_MEM_MAP */ |
2153 | } | 2070 | } |
2154 | 2071 | ||
2155 | void __init free_area_init_node(int nid, struct pglist_data *pgdat, | 2072 | void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, |
2156 | unsigned long *zones_size, unsigned long node_start_pfn, | 2073 | unsigned long *zones_size, unsigned long node_start_pfn, |
2157 | unsigned long *zholes_size) | 2074 | unsigned long *zholes_size) |
2158 | { | 2075 | { |
@@ -2178,307 +2095,18 @@ void __init free_area_init(unsigned long *zones_size) | |||
2178 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); | 2095 | __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); |
2179 | } | 2096 | } |
2180 | 2097 | ||
2181 | #ifdef CONFIG_PROC_FS | ||
2182 | |||
2183 | #include <linux/seq_file.h> | ||
2184 | |||
2185 | static void *frag_start(struct seq_file *m, loff_t *pos) | ||
2186 | { | ||
2187 | pg_data_t *pgdat; | ||
2188 | loff_t node = *pos; | ||
2189 | for (pgdat = first_online_pgdat(); | ||
2190 | pgdat && node; | ||
2191 | pgdat = next_online_pgdat(pgdat)) | ||
2192 | --node; | ||
2193 | |||
2194 | return pgdat; | ||
2195 | } | ||
2196 | |||
2197 | static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) | ||
2198 | { | ||
2199 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
2200 | |||
2201 | (*pos)++; | ||
2202 | return next_online_pgdat(pgdat); | ||
2203 | } | ||
2204 | |||
2205 | static void frag_stop(struct seq_file *m, void *arg) | ||
2206 | { | ||
2207 | } | ||
2208 | |||
2209 | /* | ||
2210 | * This walks the free areas for each zone. | ||
2211 | */ | ||
2212 | static int frag_show(struct seq_file *m, void *arg) | ||
2213 | { | ||
2214 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
2215 | struct zone *zone; | ||
2216 | struct zone *node_zones = pgdat->node_zones; | ||
2217 | unsigned long flags; | ||
2218 | int order; | ||
2219 | |||
2220 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
2221 | if (!populated_zone(zone)) | ||
2222 | continue; | ||
2223 | |||
2224 | spin_lock_irqsave(&zone->lock, flags); | ||
2225 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
2226 | for (order = 0; order < MAX_ORDER; ++order) | ||
2227 | seq_printf(m, "%6lu ", zone->free_area[order].nr_free); | ||
2228 | spin_unlock_irqrestore(&zone->lock, flags); | ||
2229 | seq_putc(m, '\n'); | ||
2230 | } | ||
2231 | return 0; | ||
2232 | } | ||
2233 | |||
2234 | struct seq_operations fragmentation_op = { | ||
2235 | .start = frag_start, | ||
2236 | .next = frag_next, | ||
2237 | .stop = frag_stop, | ||
2238 | .show = frag_show, | ||
2239 | }; | ||
2240 | |||
2241 | /* | ||
2242 | * Output information about zones in @pgdat. | ||
2243 | */ | ||
2244 | static int zoneinfo_show(struct seq_file *m, void *arg) | ||
2245 | { | ||
2246 | pg_data_t *pgdat = arg; | ||
2247 | struct zone *zone; | ||
2248 | struct zone *node_zones = pgdat->node_zones; | ||
2249 | unsigned long flags; | ||
2250 | |||
2251 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { | ||
2252 | int i; | ||
2253 | |||
2254 | if (!populated_zone(zone)) | ||
2255 | continue; | ||
2256 | |||
2257 | spin_lock_irqsave(&zone->lock, flags); | ||
2258 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); | ||
2259 | seq_printf(m, | ||
2260 | "\n pages free %lu" | ||
2261 | "\n min %lu" | ||
2262 | "\n low %lu" | ||
2263 | "\n high %lu" | ||
2264 | "\n active %lu" | ||
2265 | "\n inactive %lu" | ||
2266 | "\n scanned %lu (a: %lu i: %lu)" | ||
2267 | "\n spanned %lu" | ||
2268 | "\n present %lu", | ||
2269 | zone->free_pages, | ||
2270 | zone->pages_min, | ||
2271 | zone->pages_low, | ||
2272 | zone->pages_high, | ||
2273 | zone->nr_active, | ||
2274 | zone->nr_inactive, | ||
2275 | zone->pages_scanned, | ||
2276 | zone->nr_scan_active, zone->nr_scan_inactive, | ||
2277 | zone->spanned_pages, | ||
2278 | zone->present_pages); | ||
2279 | seq_printf(m, | ||
2280 | "\n protection: (%lu", | ||
2281 | zone->lowmem_reserve[0]); | ||
2282 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) | ||
2283 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); | ||
2284 | seq_printf(m, | ||
2285 | ")" | ||
2286 | "\n pagesets"); | ||
2287 | for_each_online_cpu(i) { | ||
2288 | struct per_cpu_pageset *pageset; | ||
2289 | int j; | ||
2290 | |||
2291 | pageset = zone_pcp(zone, i); | ||
2292 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | ||
2293 | if (pageset->pcp[j].count) | ||
2294 | break; | ||
2295 | } | ||
2296 | if (j == ARRAY_SIZE(pageset->pcp)) | ||
2297 | continue; | ||
2298 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | ||
2299 | seq_printf(m, | ||
2300 | "\n cpu: %i pcp: %i" | ||
2301 | "\n count: %i" | ||
2302 | "\n high: %i" | ||
2303 | "\n batch: %i", | ||
2304 | i, j, | ||
2305 | pageset->pcp[j].count, | ||
2306 | pageset->pcp[j].high, | ||
2307 | pageset->pcp[j].batch); | ||
2308 | } | ||
2309 | #ifdef CONFIG_NUMA | ||
2310 | seq_printf(m, | ||
2311 | "\n numa_hit: %lu" | ||
2312 | "\n numa_miss: %lu" | ||
2313 | "\n numa_foreign: %lu" | ||
2314 | "\n interleave_hit: %lu" | ||
2315 | "\n local_node: %lu" | ||
2316 | "\n other_node: %lu", | ||
2317 | pageset->numa_hit, | ||
2318 | pageset->numa_miss, | ||
2319 | pageset->numa_foreign, | ||
2320 | pageset->interleave_hit, | ||
2321 | pageset->local_node, | ||
2322 | pageset->other_node); | ||
2323 | #endif | ||
2324 | } | ||
2325 | seq_printf(m, | ||
2326 | "\n all_unreclaimable: %u" | ||
2327 | "\n prev_priority: %i" | ||
2328 | "\n temp_priority: %i" | ||
2329 | "\n start_pfn: %lu", | ||
2330 | zone->all_unreclaimable, | ||
2331 | zone->prev_priority, | ||
2332 | zone->temp_priority, | ||
2333 | zone->zone_start_pfn); | ||
2334 | spin_unlock_irqrestore(&zone->lock, flags); | ||
2335 | seq_putc(m, '\n'); | ||
2336 | } | ||
2337 | return 0; | ||
2338 | } | ||
2339 | |||
2340 | struct seq_operations zoneinfo_op = { | ||
2341 | .start = frag_start, /* iterate over all zones. The same as in | ||
2342 | * fragmentation. */ | ||
2343 | .next = frag_next, | ||
2344 | .stop = frag_stop, | ||
2345 | .show = zoneinfo_show, | ||
2346 | }; | ||
2347 | |||
2348 | static char *vmstat_text[] = { | ||
2349 | "nr_dirty", | ||
2350 | "nr_writeback", | ||
2351 | "nr_unstable", | ||
2352 | "nr_page_table_pages", | ||
2353 | "nr_mapped", | ||
2354 | "nr_slab", | ||
2355 | |||
2356 | "pgpgin", | ||
2357 | "pgpgout", | ||
2358 | "pswpin", | ||
2359 | "pswpout", | ||
2360 | |||
2361 | "pgalloc_high", | ||
2362 | "pgalloc_normal", | ||
2363 | "pgalloc_dma32", | ||
2364 | "pgalloc_dma", | ||
2365 | |||
2366 | "pgfree", | ||
2367 | "pgactivate", | ||
2368 | "pgdeactivate", | ||
2369 | |||
2370 | "pgfault", | ||
2371 | "pgmajfault", | ||
2372 | |||
2373 | "pgrefill_high", | ||
2374 | "pgrefill_normal", | ||
2375 | "pgrefill_dma32", | ||
2376 | "pgrefill_dma", | ||
2377 | |||
2378 | "pgsteal_high", | ||
2379 | "pgsteal_normal", | ||
2380 | "pgsteal_dma32", | ||
2381 | "pgsteal_dma", | ||
2382 | |||
2383 | "pgscan_kswapd_high", | ||
2384 | "pgscan_kswapd_normal", | ||
2385 | "pgscan_kswapd_dma32", | ||
2386 | "pgscan_kswapd_dma", | ||
2387 | |||
2388 | "pgscan_direct_high", | ||
2389 | "pgscan_direct_normal", | ||
2390 | "pgscan_direct_dma32", | ||
2391 | "pgscan_direct_dma", | ||
2392 | |||
2393 | "pginodesteal", | ||
2394 | "slabs_scanned", | ||
2395 | "kswapd_steal", | ||
2396 | "kswapd_inodesteal", | ||
2397 | "pageoutrun", | ||
2398 | "allocstall", | ||
2399 | |||
2400 | "pgrotated", | ||
2401 | "nr_bounce", | ||
2402 | }; | ||
2403 | |||
2404 | static void *vmstat_start(struct seq_file *m, loff_t *pos) | ||
2405 | { | ||
2406 | struct page_state *ps; | ||
2407 | |||
2408 | if (*pos >= ARRAY_SIZE(vmstat_text)) | ||
2409 | return NULL; | ||
2410 | |||
2411 | ps = kmalloc(sizeof(*ps), GFP_KERNEL); | ||
2412 | m->private = ps; | ||
2413 | if (!ps) | ||
2414 | return ERR_PTR(-ENOMEM); | ||
2415 | get_full_page_state(ps); | ||
2416 | ps->pgpgin /= 2; /* sectors -> kbytes */ | ||
2417 | ps->pgpgout /= 2; | ||
2418 | return (unsigned long *)ps + *pos; | ||
2419 | } | ||
2420 | |||
2421 | static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) | ||
2422 | { | ||
2423 | (*pos)++; | ||
2424 | if (*pos >= ARRAY_SIZE(vmstat_text)) | ||
2425 | return NULL; | ||
2426 | return (unsigned long *)m->private + *pos; | ||
2427 | } | ||
2428 | |||
2429 | static int vmstat_show(struct seq_file *m, void *arg) | ||
2430 | { | ||
2431 | unsigned long *l = arg; | ||
2432 | unsigned long off = l - (unsigned long *)m->private; | ||
2433 | |||
2434 | seq_printf(m, "%s %lu\n", vmstat_text[off], *l); | ||
2435 | return 0; | ||
2436 | } | ||
2437 | |||
2438 | static void vmstat_stop(struct seq_file *m, void *arg) | ||
2439 | { | ||
2440 | kfree(m->private); | ||
2441 | m->private = NULL; | ||
2442 | } | ||
2443 | |||
2444 | struct seq_operations vmstat_op = { | ||
2445 | .start = vmstat_start, | ||
2446 | .next = vmstat_next, | ||
2447 | .stop = vmstat_stop, | ||
2448 | .show = vmstat_show, | ||
2449 | }; | ||
2450 | |||
2451 | #endif /* CONFIG_PROC_FS */ | ||
2452 | |||
2453 | #ifdef CONFIG_HOTPLUG_CPU | 2098 | #ifdef CONFIG_HOTPLUG_CPU |
2454 | static int page_alloc_cpu_notify(struct notifier_block *self, | 2099 | static int page_alloc_cpu_notify(struct notifier_block *self, |
2455 | unsigned long action, void *hcpu) | 2100 | unsigned long action, void *hcpu) |
2456 | { | 2101 | { |
2457 | int cpu = (unsigned long)hcpu; | 2102 | int cpu = (unsigned long)hcpu; |
2458 | long *count; | ||
2459 | unsigned long *src, *dest; | ||
2460 | 2103 | ||
2461 | if (action == CPU_DEAD) { | 2104 | if (action == CPU_DEAD) { |
2462 | int i; | ||
2463 | |||
2464 | /* Drain local pagecache count. */ | ||
2465 | count = &per_cpu(nr_pagecache_local, cpu); | ||
2466 | atomic_add(*count, &nr_pagecache); | ||
2467 | *count = 0; | ||
2468 | local_irq_disable(); | 2105 | local_irq_disable(); |
2469 | __drain_pages(cpu); | 2106 | __drain_pages(cpu); |
2470 | 2107 | vm_events_fold_cpu(cpu); | |
2471 | /* Add dead cpu's page_states to our own. */ | ||
2472 | dest = (unsigned long *)&__get_cpu_var(page_states); | ||
2473 | src = (unsigned long *)&per_cpu(page_states, cpu); | ||
2474 | |||
2475 | for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long); | ||
2476 | i++) { | ||
2477 | dest[i] += src[i]; | ||
2478 | src[i] = 0; | ||
2479 | } | ||
2480 | |||
2481 | local_irq_enable(); | 2108 | local_irq_enable(); |
2109 | refresh_cpu_vm_stats(cpu); | ||
2482 | } | 2110 | } |
2483 | return NOTIFY_OK; | 2111 | return NOTIFY_OK; |
2484 | } | 2112 | } |
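The open-coded folding of the dead CPU's page_states is replaced above by calls into the new vmstat machinery (vm_events_fold_cpu() and refresh_cpu_vm_stats()), but the underlying pattern is unchanged. A hedged sketch with invented names (my_counters, my_global, NR_MY_COUNTERS), not the real vmstat interfaces:

	#define NR_MY_COUNTERS 4	/* assumed size, for the sketch only */
	static DEFINE_PER_CPU(unsigned long[NR_MY_COUNTERS], my_counters);
	static atomic_long_t my_global[NR_MY_COUNTERS];

	static int my_cpu_notify(struct notifier_block *nb,
				 unsigned long action, void *hcpu)
	{
		int cpu = (unsigned long)hcpu;
		int i;

		if (action == CPU_DEAD) {
			unsigned long *dead = per_cpu(my_counters, cpu);

			for (i = 0; i < NR_MY_COUNTERS; i++) {
				atomic_long_add(dead[i], &my_global[i]);
				dead[i] = 0;	/* the dead CPU's slice is now global */
			}
		}
		return NOTIFY_OK;
	}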
@@ -2804,42 +2432,14 @@ void *__init alloc_large_system_hash(const char *tablename, | |||
2804 | } | 2432 | } |
2805 | 2433 | ||
2806 | #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE | 2434 | #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE |
2807 | /* | ||
2808 | * pfn <-> page translation. out-of-line version. | ||
2809 | * (see asm-generic/memory_model.h) | ||
2810 | */ | ||
2811 | #if defined(CONFIG_FLATMEM) | ||
2812 | struct page *pfn_to_page(unsigned long pfn) | ||
2813 | { | ||
2814 | return mem_map + (pfn - ARCH_PFN_OFFSET); | ||
2815 | } | ||
2816 | unsigned long page_to_pfn(struct page *page) | ||
2817 | { | ||
2818 | return (page - mem_map) + ARCH_PFN_OFFSET; | ||
2819 | } | ||
2820 | #elif defined(CONFIG_DISCONTIGMEM) | ||
2821 | struct page *pfn_to_page(unsigned long pfn) | ||
2822 | { | ||
2823 | int nid = arch_pfn_to_nid(pfn); | ||
2824 | return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid); | ||
2825 | } | ||
2826 | unsigned long page_to_pfn(struct page *page) | ||
2827 | { | ||
2828 | struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); | ||
2829 | return (page - pgdat->node_mem_map) + pgdat->node_start_pfn; | ||
2830 | } | ||
2831 | #elif defined(CONFIG_SPARSEMEM) | ||
2832 | struct page *pfn_to_page(unsigned long pfn) | 2435 | struct page *pfn_to_page(unsigned long pfn) |
2833 | { | 2436 | { |
2834 | return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn; | 2437 | return __pfn_to_page(pfn); |
2835 | } | 2438 | } |
2836 | |||
2837 | unsigned long page_to_pfn(struct page *page) | 2439 | unsigned long page_to_pfn(struct page *page) |
2838 | { | 2440 | { |
2839 | long section_id = page_to_section(page); | 2441 | return __page_to_pfn(page); |
2840 | return page - __section_mem_map_addr(__nr_to_section(section_id)); | ||
2841 | } | 2442 | } |
2842 | #endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */ | ||
2843 | EXPORT_SYMBOL(pfn_to_page); | 2443 | EXPORT_SYMBOL(pfn_to_page); |
2844 | EXPORT_SYMBOL(page_to_pfn); | 2444 | EXPORT_SYMBOL(page_to_pfn); |
2845 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ | 2445 | #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ |
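The out-of-line helpers above now just wrap the generic __pfn_to_page()/__page_to_pfn() macros from asm-generic/memory_model.h, so the per-model arithmetic lives in one place. For the FLATMEM case the mapping reduces to the mem_map arithmetic visible in the removed lines:

	/* FLATMEM only; the other memory models index through nodes or sections.
	 *	page = mem_map + (pfn - ARCH_PFN_OFFSET);
	 *	pfn  = (page - mem_map) + ARCH_PFN_OFFSET;
	 */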
diff --git a/mm/page_io.c b/mm/page_io.c index bb2b0d53889c..88029948d00a 100644 --- a/mm/page_io.c +++ b/mm/page_io.c | |||
@@ -101,7 +101,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) | |||
101 | } | 101 | } |
102 | if (wbc->sync_mode == WB_SYNC_ALL) | 102 | if (wbc->sync_mode == WB_SYNC_ALL) |
103 | rw |= (1 << BIO_RW_SYNC); | 103 | rw |= (1 << BIO_RW_SYNC); |
104 | inc_page_state(pswpout); | 104 | count_vm_event(PSWPOUT); |
105 | set_page_writeback(page); | 105 | set_page_writeback(page); |
106 | unlock_page(page); | 106 | unlock_page(page); |
107 | submit_bio(rw, bio); | 107 | submit_bio(rw, bio); |
@@ -123,7 +123,7 @@ int swap_readpage(struct file *file, struct page *page) | |||
123 | ret = -ENOMEM; | 123 | ret = -ENOMEM; |
124 | goto out; | 124 | goto out; |
125 | } | 125 | } |
126 | inc_page_state(pswpin); | 126 | count_vm_event(PSWPIN); |
127 | submit_bio(READ, bio); | 127 | submit_bio(READ, bio); |
128 | out: | 128 | out: |
129 | return ret; | 129 | return ret; |
diff --git a/mm/pdflush.c b/mm/pdflush.c index c4b6d0afd736..b02102feeb4b 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c | |||
@@ -104,21 +104,20 @@ static int __pdflush(struct pdflush_work *my_work) | |||
104 | list_move(&my_work->list, &pdflush_list); | 104 | list_move(&my_work->list, &pdflush_list); |
105 | my_work->when_i_went_to_sleep = jiffies; | 105 | my_work->when_i_went_to_sleep = jiffies; |
106 | spin_unlock_irq(&pdflush_lock); | 106 | spin_unlock_irq(&pdflush_lock); |
107 | |||
108 | schedule(); | 107 | schedule(); |
109 | if (try_to_freeze()) { | 108 | try_to_freeze(); |
110 | spin_lock_irq(&pdflush_lock); | ||
111 | continue; | ||
112 | } | ||
113 | |||
114 | spin_lock_irq(&pdflush_lock); | 109 | spin_lock_irq(&pdflush_lock); |
115 | if (!list_empty(&my_work->list)) { | 110 | if (!list_empty(&my_work->list)) { |
116 | printk("pdflush: bogus wakeup!\n"); | 111 | /* |
112 | * Someone woke us up, but without removing our control | ||
113 | * structure from the global list. swsusp will do this | ||
114 | * in try_to_freeze()->refrigerator(). Handle it. | ||
115 | */ | ||
117 | my_work->fn = NULL; | 116 | my_work->fn = NULL; |
118 | continue; | 117 | continue; |
119 | } | 118 | } |
120 | if (my_work->fn == NULL) { | 119 | if (my_work->fn == NULL) { |
121 | printk("pdflush: NULL work function\n"); | 120 | printk("pdflush: bogus wakeup\n"); |
122 | continue; | 121 | continue; |
123 | } | 122 | } |
124 | spin_unlock_irq(&pdflush_lock); | 123 | spin_unlock_irq(&pdflush_lock); |
@@ -202,8 +201,7 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) | |||
202 | unsigned long flags; | 201 | unsigned long flags; |
203 | int ret = 0; | 202 | int ret = 0; |
204 | 203 | ||
205 | if (fn == NULL) | 204 | BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */ |
206 | BUG(); /* Hard to diagnose if it's deferred */ | ||
207 | 205 | ||
208 | spin_lock_irqsave(&pdflush_lock, flags); | 206 | spin_lock_irqsave(&pdflush_lock, flags); |
209 | if (list_empty(&pdflush_list)) { | 207 | if (list_empty(&pdflush_list)) { |
diff --git a/mm/readahead.c b/mm/readahead.c index ba7db816f4c8..1ba736ac0367 100644 --- a/mm/readahead.c +++ b/mm/readahead.c | |||
@@ -119,8 +119,7 @@ static inline unsigned long get_next_ra_size(struct file_ra_state *ra) | |||
119 | #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) | 119 | #define list_to_page(head) (list_entry((head)->prev, struct page, lru)) |
120 | 120 | ||
121 | /** | 121 | /** |
122 | * read_cache_pages - populate an address space with some pages, and | 122 | * read_cache_pages - populate an address space with some pages & start reads against them |
123 | * start reads against them. | ||
124 | * @mapping: the address_space | 123 | * @mapping: the address_space |
125 | * @pages: The address of a list_head which contains the target pages. These | 124 | * @pages: The address of a list_head which contains the target pages. These |
126 | * pages have their ->index populated and are otherwise uninitialised. | 125 | * pages have their ->index populated and are otherwise uninitialised. |
@@ -183,14 +182,11 @@ static int read_pages(struct address_space *mapping, struct file *filp, | |||
183 | list_del(&page->lru); | 182 | list_del(&page->lru); |
184 | if (!add_to_page_cache(page, mapping, | 183 | if (!add_to_page_cache(page, mapping, |
185 | page->index, GFP_KERNEL)) { | 184 | page->index, GFP_KERNEL)) { |
186 | ret = mapping->a_ops->readpage(filp, page); | 185 | mapping->a_ops->readpage(filp, page); |
187 | if (ret != AOP_TRUNCATED_PAGE) { | 186 | if (!pagevec_add(&lru_pvec, page)) |
188 | if (!pagevec_add(&lru_pvec, page)) | 187 | __pagevec_lru_add(&lru_pvec); |
189 | __pagevec_lru_add(&lru_pvec); | 188 | } else |
190 | continue; | 189 | page_cache_release(page); |
191 | } /* else fall through to release */ | ||
192 | } | ||
193 | page_cache_release(page); | ||
194 | } | 190 | } |
195 | pagevec_lru_add(&lru_pvec); | 191 | pagevec_lru_add(&lru_pvec); |
196 | ret = 0; | 192 | ret = 0; |
@@ -395,8 +391,8 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp, | |||
395 | * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block' | 391 | * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block' |
396 | * is set wait till the read completes. Otherwise attempt to read without | 392 | * is set wait till the read completes. Otherwise attempt to read without |
397 | * blocking. | 393 | * blocking. |
398 | * Returns 1 meaning 'success' if read is succesfull without switching off | 394 | * Returns 1 meaning 'success' if read is successful without switching off |
399 | * readhaead mode. Otherwise return failure. | 395 | * readahead mode. Otherwise return failure. |
400 | */ | 396 | */ |
401 | static int | 397 | static int |
402 | blockable_page_cache_readahead(struct address_space *mapping, struct file *filp, | 398 | blockable_page_cache_readahead(struct address_space *mapping, struct file *filp, |
diff --git a/mm/rmap.c b/mm/rmap.c --- a/mm/rmap.c +++ b/mm/rmap.c | |||
@@ -103,7 +103,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) | |||
103 | spin_lock(&mm->page_table_lock); | 103 | spin_lock(&mm->page_table_lock); |
104 | if (likely(!vma->anon_vma)) { | 104 | if (likely(!vma->anon_vma)) { |
105 | vma->anon_vma = anon_vma; | 105 | vma->anon_vma = anon_vma; |
106 | list_add(&vma->anon_vma_node, &anon_vma->head); | 106 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); |
107 | allocated = NULL; | 107 | allocated = NULL; |
108 | } | 108 | } |
109 | spin_unlock(&mm->page_table_lock); | 109 | spin_unlock(&mm->page_table_lock); |
@@ -127,7 +127,7 @@ void __anon_vma_link(struct vm_area_struct *vma) | |||
127 | struct anon_vma *anon_vma = vma->anon_vma; | 127 | struct anon_vma *anon_vma = vma->anon_vma; |
128 | 128 | ||
129 | if (anon_vma) { | 129 | if (anon_vma) { |
130 | list_add(&vma->anon_vma_node, &anon_vma->head); | 130 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); |
131 | validate_anon_vma(vma); | 131 | validate_anon_vma(vma); |
132 | } | 132 | } |
133 | } | 133 | } |
@@ -138,7 +138,7 @@ void anon_vma_link(struct vm_area_struct *vma) | |||
138 | 138 | ||
139 | if (anon_vma) { | 139 | if (anon_vma) { |
140 | spin_lock(&anon_vma->lock); | 140 | spin_lock(&anon_vma->lock); |
141 | list_add(&vma->anon_vma_node, &anon_vma->head); | 141 | list_add_tail(&vma->anon_vma_node, &anon_vma->head); |
142 | validate_anon_vma(vma); | 142 | validate_anon_vma(vma); |
143 | spin_unlock(&anon_vma->lock); | 143 | spin_unlock(&anon_vma->lock); |
144 | } | 144 | } |
@@ -205,44 +205,6 @@ out: | |||
205 | return anon_vma; | 205 | return anon_vma; |
206 | } | 206 | } |
207 | 207 | ||
208 | #ifdef CONFIG_MIGRATION | ||
209 | /* | ||
210 | * Remove an anonymous page from swap replacing the swap pte's | ||
211 | * through real pte's pointing to valid pages and then releasing | ||
212 | * the page from the swap cache. | ||
213 | * | ||
214 | * Must hold page lock on page and mmap_sem of one vma that contains | ||
215 | * the page. | ||
216 | */ | ||
217 | void remove_from_swap(struct page *page) | ||
218 | { | ||
219 | struct anon_vma *anon_vma; | ||
220 | struct vm_area_struct *vma; | ||
221 | unsigned long mapping; | ||
222 | |||
223 | if (!PageSwapCache(page)) | ||
224 | return; | ||
225 | |||
226 | mapping = (unsigned long)page->mapping; | ||
227 | |||
228 | if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) | ||
229 | return; | ||
230 | |||
231 | /* | ||
232 | * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. | ||
233 | */ | ||
234 | anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); | ||
235 | spin_lock(&anon_vma->lock); | ||
236 | |||
237 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) | ||
238 | remove_vma_swap(vma, page); | ||
239 | |||
240 | spin_unlock(&anon_vma->lock); | ||
241 | delete_from_swap_cache(page); | ||
242 | } | ||
243 | EXPORT_SYMBOL(remove_from_swap); | ||
244 | #endif | ||
245 | |||
246 | /* | 208 | /* |
247 | * At what user virtual address is page expected in vma? | 209 | * At what user virtual address is page expected in vma? |
248 | */ | 210 | */ |
@@ -493,7 +455,7 @@ static void __page_set_anon_rmap(struct page *page, | |||
493 | * nr_mapped state can be updated without turning off | 455 | * nr_mapped state can be updated without turning off |
494 | * interrupts because it is not modified via interrupt. | 456 | * interrupts because it is not modified via interrupt. |
495 | */ | 457 | */ |
496 | __inc_page_state(nr_mapped); | 458 | __inc_zone_page_state(page, NR_ANON_PAGES); |
497 | } | 459 | } |
498 | 460 | ||
499 | /** | 461 | /** |
@@ -537,7 +499,7 @@ void page_add_new_anon_rmap(struct page *page, | |||
537 | void page_add_file_rmap(struct page *page) | 499 | void page_add_file_rmap(struct page *page) |
538 | { | 500 | { |
539 | if (atomic_inc_and_test(&page->_mapcount)) | 501 | if (atomic_inc_and_test(&page->_mapcount)) |
540 | __inc_page_state(nr_mapped); | 502 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
541 | } | 503 | } |
542 | 504 | ||
543 | /** | 505 | /** |
@@ -569,7 +531,8 @@ void page_remove_rmap(struct page *page) | |||
569 | */ | 531 | */ |
570 | if (page_test_and_clear_dirty(page)) | 532 | if (page_test_and_clear_dirty(page)) |
571 | set_page_dirty(page); | 533 | set_page_dirty(page); |
572 | __dec_page_state(nr_mapped); | 534 | __dec_zone_page_state(page, |
535 | PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); | ||
573 | } | 536 | } |
574 | } | 537 | } |
575 | 538 | ||
@@ -578,7 +541,7 @@ void page_remove_rmap(struct page *page) | |||
578 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. | 541 | * repeatedly from either try_to_unmap_anon or try_to_unmap_file. |
579 | */ | 542 | */ |
580 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | 543 | static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, |
581 | int ignore_refs) | 544 | int migration) |
582 | { | 545 | { |
583 | struct mm_struct *mm = vma->vm_mm; | 546 | struct mm_struct *mm = vma->vm_mm; |
584 | unsigned long address; | 547 | unsigned long address; |
@@ -600,9 +563,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
600 | * If it's recently referenced (perhaps page_referenced | 563 | * If it's recently referenced (perhaps page_referenced |
601 | * skipped over this mm) then we should reactivate it. | 564 | * skipped over this mm) then we should reactivate it. |
602 | */ | 565 | */ |
603 | if ((vma->vm_flags & VM_LOCKED) || | 566 | if (!migration && ((vma->vm_flags & VM_LOCKED) || |
604 | (ptep_clear_flush_young(vma, address, pte) | 567 | (ptep_clear_flush_young(vma, address, pte)))) { |
605 | && !ignore_refs)) { | ||
606 | ret = SWAP_FAIL; | 568 | ret = SWAP_FAIL; |
607 | goto out_unmap; | 569 | goto out_unmap; |
608 | } | 570 | } |
@@ -620,24 +582,45 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, | |||
620 | 582 | ||
621 | if (PageAnon(page)) { | 583 | if (PageAnon(page)) { |
622 | swp_entry_t entry = { .val = page_private(page) }; | 584 | swp_entry_t entry = { .val = page_private(page) }; |
623 | /* | 585 | |
624 | * Store the swap location in the pte. | 586 | if (PageSwapCache(page)) { |
625 | * See handle_pte_fault() ... | 587 | /* |
626 | */ | 588 | * Store the swap location in the pte. |
627 | BUG_ON(!PageSwapCache(page)); | 589 | * See handle_pte_fault() ... |
628 | swap_duplicate(entry); | 590 | */ |
629 | if (list_empty(&mm->mmlist)) { | 591 | swap_duplicate(entry); |
630 | spin_lock(&mmlist_lock); | 592 | if (list_empty(&mm->mmlist)) { |
631 | if (list_empty(&mm->mmlist)) | 593 | spin_lock(&mmlist_lock); |
632 | list_add(&mm->mmlist, &init_mm.mmlist); | 594 | if (list_empty(&mm->mmlist)) |
633 | spin_unlock(&mmlist_lock); | 595 | list_add(&mm->mmlist, &init_mm.mmlist); |
596 | spin_unlock(&mmlist_lock); | ||
597 | } | ||
598 | dec_mm_counter(mm, anon_rss); | ||
599 | #ifdef CONFIG_MIGRATION | ||
600 | } else { | ||
601 | /* | ||
602 | * Store the pfn of the page in a special migration | ||
603 | * pte. do_swap_page() will wait until the migration | ||
604 | * pte is removed and then restart fault handling. | ||
605 | */ | ||
606 | BUG_ON(!migration); | ||
607 | entry = make_migration_entry(page, pte_write(pteval)); | ||
608 | #endif | ||
634 | } | 609 | } |
635 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | 610 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); |
636 | BUG_ON(pte_file(*pte)); | 611 | BUG_ON(pte_file(*pte)); |
637 | dec_mm_counter(mm, anon_rss); | ||
638 | } else | 612 | } else |
613 | #ifdef CONFIG_MIGRATION | ||
614 | if (migration) { | ||
615 | /* Establish migration entry for a file page */ | ||
616 | swp_entry_t entry; | ||
617 | entry = make_migration_entry(page, pte_write(pteval)); | ||
618 | set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); | ||
619 | } else | ||
620 | #endif | ||
639 | dec_mm_counter(mm, file_rss); | 621 | dec_mm_counter(mm, file_rss); |
640 | 622 | ||
623 | |||
641 | page_remove_rmap(page); | 624 | page_remove_rmap(page); |
642 | page_cache_release(page); | 625 | page_cache_release(page); |
643 | 626 | ||
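The new branch above parks a migration entry in the pte instead of a swap entry. On the consumer side, a fault that hits such an entry waits for the migration to finish and then retries, roughly as the fault path does via the swapops helpers (sketch, not the literal mm/memory.c code; entry, orig_pte, mm, pmd and address come from the fault context):

	entry = pte_to_swp_entry(orig_pte);
	if (is_migration_entry(entry)) {
		migration_entry_wait(mm, pmd, address);	/* sleep until the pte is restored */
		goto out;				/* then let the fault be retried */
	}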
@@ -736,7 +719,7 @@ static void try_to_unmap_cluster(unsigned long cursor, | |||
736 | pte_unmap_unlock(pte - 1, ptl); | 719 | pte_unmap_unlock(pte - 1, ptl); |
737 | } | 720 | } |
738 | 721 | ||
739 | static int try_to_unmap_anon(struct page *page, int ignore_refs) | 722 | static int try_to_unmap_anon(struct page *page, int migration) |
740 | { | 723 | { |
741 | struct anon_vma *anon_vma; | 724 | struct anon_vma *anon_vma; |
742 | struct vm_area_struct *vma; | 725 | struct vm_area_struct *vma; |
@@ -747,7 +730,7 @@ static int try_to_unmap_anon(struct page *page, int ignore_refs) | |||
747 | return ret; | 730 | return ret; |
748 | 731 | ||
749 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { | 732 | list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { |
750 | ret = try_to_unmap_one(page, vma, ignore_refs); | 733 | ret = try_to_unmap_one(page, vma, migration); |
751 | if (ret == SWAP_FAIL || !page_mapped(page)) | 734 | if (ret == SWAP_FAIL || !page_mapped(page)) |
752 | break; | 735 | break; |
753 | } | 736 | } |
@@ -764,7 +747,7 @@ static int try_to_unmap_anon(struct page *page, int ignore_refs) | |||
764 | * | 747 | * |
765 | * This function is only called from try_to_unmap for object-based pages. | 748 | * This function is only called from try_to_unmap for object-based pages. |
766 | */ | 749 | */ |
767 | static int try_to_unmap_file(struct page *page, int ignore_refs) | 750 | static int try_to_unmap_file(struct page *page, int migration) |
768 | { | 751 | { |
769 | struct address_space *mapping = page->mapping; | 752 | struct address_space *mapping = page->mapping; |
770 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 753 | pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
@@ -778,7 +761,7 @@ static int try_to_unmap_file(struct page *page, int ignore_refs) | |||
778 | 761 | ||
779 | spin_lock(&mapping->i_mmap_lock); | 762 | spin_lock(&mapping->i_mmap_lock); |
780 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 763 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { |
781 | ret = try_to_unmap_one(page, vma, ignore_refs); | 764 | ret = try_to_unmap_one(page, vma, migration); |
782 | if (ret == SWAP_FAIL || !page_mapped(page)) | 765 | if (ret == SWAP_FAIL || !page_mapped(page)) |
783 | goto out; | 766 | goto out; |
784 | } | 767 | } |
@@ -788,7 +771,7 @@ static int try_to_unmap_file(struct page *page, int ignore_refs) | |||
788 | 771 | ||
789 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 772 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
790 | shared.vm_set.list) { | 773 | shared.vm_set.list) { |
791 | if (vma->vm_flags & VM_LOCKED) | 774 | if ((vma->vm_flags & VM_LOCKED) && !migration) |
792 | continue; | 775 | continue; |
793 | cursor = (unsigned long) vma->vm_private_data; | 776 | cursor = (unsigned long) vma->vm_private_data; |
794 | if (cursor > max_nl_cursor) | 777 | if (cursor > max_nl_cursor) |
@@ -822,7 +805,7 @@ static int try_to_unmap_file(struct page *page, int ignore_refs) | |||
822 | do { | 805 | do { |
823 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, | 806 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, |
824 | shared.vm_set.list) { | 807 | shared.vm_set.list) { |
825 | if (vma->vm_flags & VM_LOCKED) | 808 | if ((vma->vm_flags & VM_LOCKED) && !migration) |
826 | continue; | 809 | continue; |
827 | cursor = (unsigned long) vma->vm_private_data; | 810 | cursor = (unsigned long) vma->vm_private_data; |
828 | while ( cursor < max_nl_cursor && | 811 | while ( cursor < max_nl_cursor && |
@@ -863,16 +846,16 @@ out: | |||
863 | * SWAP_AGAIN - we missed a mapping, try again later | 846 | * SWAP_AGAIN - we missed a mapping, try again later |
864 | * SWAP_FAIL - the page is unswappable | 847 | * SWAP_FAIL - the page is unswappable |
865 | */ | 848 | */ |
866 | int try_to_unmap(struct page *page, int ignore_refs) | 849 | int try_to_unmap(struct page *page, int migration) |
867 | { | 850 | { |
868 | int ret; | 851 | int ret; |
869 | 852 | ||
870 | BUG_ON(!PageLocked(page)); | 853 | BUG_ON(!PageLocked(page)); |
871 | 854 | ||
872 | if (PageAnon(page)) | 855 | if (PageAnon(page)) |
873 | ret = try_to_unmap_anon(page, ignore_refs); | 856 | ret = try_to_unmap_anon(page, migration); |
874 | else | 857 | else |
875 | ret = try_to_unmap_file(page, ignore_refs); | 858 | ret = try_to_unmap_file(page, migration); |
876 | 859 | ||
877 | if (!page_mapped(page)) | 860 | if (!page_mapped(page)) |
878 | ret = SWAP_SUCCESS; | 861 | ret = SWAP_SUCCESS; |
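The comment above documents the try_to_unmap() return codes; a reclaim-style caller typically branches on them as in this hedged sketch (the labels are placeholders, not the literal mm/vmscan.c code):

	if (page_mapped(page) && page->mapping) {
		switch (try_to_unmap(page, 0)) {	/* 0: ordinary reclaim, not migration */
		case SWAP_FAIL:
			goto activate_locked;		/* keep the page; it is busy or mlocked */
		case SWAP_AGAIN:
			goto keep_locked;		/* a mapping was missed, retry later */
		case SWAP_SUCCESS:
			break;				/* fully unmapped, try to free it below */
		}
	}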
diff --git a/mm/shmem.c b/mm/shmem.c index 1e43c8a865ba..db21c51531ca 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -23,10 +23,8 @@ | |||
23 | * which makes it a completely usable filesystem. | 23 | * which makes it a completely usable filesystem. |
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include <linux/config.h> | ||
27 | #include <linux/module.h> | 26 | #include <linux/module.h> |
28 | #include <linux/init.h> | 27 | #include <linux/init.h> |
29 | #include <linux/devfs_fs_kernel.h> | ||
30 | #include <linux/fs.h> | 28 | #include <linux/fs.h> |
31 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
32 | #include <linux/mman.h> | 30 | #include <linux/mman.h> |
@@ -174,7 +172,7 @@ static inline void shmem_unacct_blocks(unsigned long flags, long pages) | |||
174 | } | 172 | } |
175 | 173 | ||
176 | static struct super_operations shmem_ops; | 174 | static struct super_operations shmem_ops; |
177 | static struct address_space_operations shmem_aops; | 175 | static const struct address_space_operations shmem_aops; |
178 | static struct file_operations shmem_file_operations; | 176 | static struct file_operations shmem_file_operations; |
179 | static struct inode_operations shmem_inode_operations; | 177 | static struct inode_operations shmem_inode_operations; |
180 | static struct inode_operations shmem_dir_inode_operations; | 178 | static struct inode_operations shmem_dir_inode_operations; |
@@ -1046,12 +1044,12 @@ repeat: | |||
1046 | swappage = lookup_swap_cache(swap); | 1044 | swappage = lookup_swap_cache(swap); |
1047 | if (!swappage) { | 1045 | if (!swappage) { |
1048 | shmem_swp_unmap(entry); | 1046 | shmem_swp_unmap(entry); |
1049 | spin_unlock(&info->lock); | ||
1050 | /* here we actually do the io */ | 1047 | /* here we actually do the io */ |
1051 | if (type && *type == VM_FAULT_MINOR) { | 1048 | if (type && *type == VM_FAULT_MINOR) { |
1052 | inc_page_state(pgmajfault); | 1049 | __count_vm_event(PGMAJFAULT); |
1053 | *type = VM_FAULT_MAJOR; | 1050 | *type = VM_FAULT_MAJOR; |
1054 | } | 1051 | } |
1052 | spin_unlock(&info->lock); | ||
1055 | swappage = shmem_swapin(info, swap, idx); | 1053 | swappage = shmem_swapin(info, swap, idx); |
1056 | if (!swappage) { | 1054 | if (!swappage) { |
1057 | spin_lock(&info->lock); | 1055 | spin_lock(&info->lock); |
@@ -1081,14 +1079,6 @@ repeat: | |||
1081 | page_cache_release(swappage); | 1079 | page_cache_release(swappage); |
1082 | goto repeat; | 1080 | goto repeat; |
1083 | } | 1081 | } |
1084 | if (!PageSwapCache(swappage)) { | ||
1085 | /* Page migration has occured */ | ||
1086 | shmem_swp_unmap(entry); | ||
1087 | spin_unlock(&info->lock); | ||
1088 | unlock_page(swappage); | ||
1089 | page_cache_release(swappage); | ||
1090 | goto repeat; | ||
1091 | } | ||
1092 | if (PageWriteback(swappage)) { | 1082 | if (PageWriteback(swappage)) { |
1093 | shmem_swp_unmap(entry); | 1083 | shmem_swp_unmap(entry); |
1094 | spin_unlock(&info->lock); | 1084 | spin_unlock(&info->lock); |
@@ -1654,9 +1644,9 @@ static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos, | |||
1654 | return desc.error; | 1644 | return desc.error; |
1655 | } | 1645 | } |
1656 | 1646 | ||
1657 | static int shmem_statfs(struct super_block *sb, struct kstatfs *buf) | 1647 | static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) |
1658 | { | 1648 | { |
1659 | struct shmem_sb_info *sbinfo = SHMEM_SB(sb); | 1649 | struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); |
1660 | 1650 | ||
1661 | buf->f_type = TMPFS_MAGIC; | 1651 | buf->f_type = TMPFS_MAGIC; |
1662 | buf->f_bsize = PAGE_CACHE_SIZE; | 1652 | buf->f_bsize = PAGE_CACHE_SIZE; |
@@ -2170,7 +2160,7 @@ static void destroy_inodecache(void) | |||
2170 | printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n"); | 2160 | printk(KERN_INFO "shmem_inode_cache: not all structures were freed\n"); |
2171 | } | 2161 | } |
2172 | 2162 | ||
2173 | static struct address_space_operations shmem_aops = { | 2163 | static const struct address_space_operations shmem_aops = { |
2174 | .writepage = shmem_writepage, | 2164 | .writepage = shmem_writepage, |
2175 | .set_page_dirty = __set_page_dirty_nobuffers, | 2165 | .set_page_dirty = __set_page_dirty_nobuffers, |
2176 | #ifdef CONFIG_TMPFS | 2166 | #ifdef CONFIG_TMPFS |
@@ -2233,10 +2223,10 @@ static struct vm_operations_struct shmem_vm_ops = { | |||
2233 | }; | 2223 | }; |
2234 | 2224 | ||
2235 | 2225 | ||
2236 | static struct super_block *shmem_get_sb(struct file_system_type *fs_type, | 2226 | static int shmem_get_sb(struct file_system_type *fs_type, |
2237 | int flags, const char *dev_name, void *data) | 2227 | int flags, const char *dev_name, void *data, struct vfsmount *mnt) |
2238 | { | 2228 | { |
2239 | return get_sb_nodev(fs_type, flags, data, shmem_fill_super); | 2229 | return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); |
2240 | } | 2230 | } |
2241 | 2231 | ||
2242 | static struct file_system_type tmpfs_fs_type = { | 2232 | static struct file_system_type tmpfs_fs_type = { |
@@ -2260,10 +2250,8 @@ static int __init init_tmpfs(void) | |||
2260 | printk(KERN_ERR "Could not register tmpfs\n"); | 2250 | printk(KERN_ERR "Could not register tmpfs\n"); |
2261 | goto out2; | 2251 | goto out2; |
2262 | } | 2252 | } |
2263 | #ifdef CONFIG_TMPFS | 2253 | |
2264 | devfs_mk_dir("shm"); | 2254 | shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER, |
2265 | #endif | ||
2266 | shm_mnt = do_kern_mount(tmpfs_fs_type.name, MS_NOUSER, | ||
2267 | tmpfs_fs_type.name, NULL); | 2255 | tmpfs_fs_type.name, NULL); |
2268 | if (IS_ERR(shm_mnt)) { | 2256 | if (IS_ERR(shm_mnt)) { |
2269 | error = PTR_ERR(shm_mnt); | 2257 | error = PTR_ERR(shm_mnt); |
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -89,6 +89,7 @@ | |||
89 | #include <linux/config.h> | 89 | #include <linux/config.h> |
90 | #include <linux/slab.h> | 90 | #include <linux/slab.h> |
91 | #include <linux/mm.h> | 91 | #include <linux/mm.h> |
92 | #include <linux/poison.h> | ||
92 | #include <linux/swap.h> | 93 | #include <linux/swap.h> |
93 | #include <linux/cache.h> | 94 | #include <linux/cache.h> |
94 | #include <linux/interrupt.h> | 95 | #include <linux/interrupt.h> |
@@ -106,6 +107,7 @@ | |||
106 | #include <linux/nodemask.h> | 107 | #include <linux/nodemask.h> |
107 | #include <linux/mempolicy.h> | 108 | #include <linux/mempolicy.h> |
108 | #include <linux/mutex.h> | 109 | #include <linux/mutex.h> |
110 | #include <linux/rtmutex.h> | ||
109 | 111 | ||
110 | #include <asm/uaccess.h> | 112 | #include <asm/uaccess.h> |
111 | #include <asm/cacheflush.h> | 113 | #include <asm/cacheflush.h> |
@@ -307,6 +309,13 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; | |||
307 | #define SIZE_AC 1 | 309 | #define SIZE_AC 1 |
308 | #define SIZE_L3 (1 + MAX_NUMNODES) | 310 | #define SIZE_L3 (1 + MAX_NUMNODES) |
309 | 311 | ||
312 | static int drain_freelist(struct kmem_cache *cache, | ||
313 | struct kmem_list3 *l3, int tofree); | ||
314 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, | ||
315 | int node); | ||
316 | static void enable_cpucache(struct kmem_cache *cachep); | ||
317 | static void cache_reap(void *unused); | ||
318 | |||
310 | /* | 319 | /* |
311 | * This function must be completely optimized away if a constant is passed to | 320 | * This function must be completely optimized away if a constant is passed to |
312 | * it. Mostly the same as what is in linux/slab.h except it returns an index. | 321 | * it. Mostly the same as what is in linux/slab.h except it returns an index. |
@@ -331,6 +340,8 @@ static __always_inline int index_of(const size_t size) | |||
331 | return 0; | 340 | return 0; |
332 | } | 341 | } |
333 | 342 | ||
343 | static int slab_early_init = 1; | ||
344 | |||
334 | #define INDEX_AC index_of(sizeof(struct arraycache_init)) | 345 | #define INDEX_AC index_of(sizeof(struct arraycache_init)) |
335 | #define INDEX_L3 index_of(sizeof(struct kmem_list3)) | 346 | #define INDEX_L3 index_of(sizeof(struct kmem_list3)) |
336 | 347 | ||
@@ -452,7 +463,7 @@ struct kmem_cache { | |||
452 | #define STATS_DEC_ACTIVE(x) ((x)->num_active--) | 463 | #define STATS_DEC_ACTIVE(x) ((x)->num_active--) |
453 | #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) | 464 | #define STATS_INC_ALLOCED(x) ((x)->num_allocations++) |
454 | #define STATS_INC_GROWN(x) ((x)->grown++) | 465 | #define STATS_INC_GROWN(x) ((x)->grown++) |
455 | #define STATS_INC_REAPED(x) ((x)->reaped++) | 466 | #define STATS_ADD_REAPED(x,y) ((x)->reaped += (y)) |
456 | #define STATS_SET_HIGH(x) \ | 467 | #define STATS_SET_HIGH(x) \ |
457 | do { \ | 468 | do { \ |
458 | if ((x)->num_active > (x)->high_mark) \ | 469 | if ((x)->num_active > (x)->high_mark) \ |
@@ -476,7 +487,7 @@ struct kmem_cache { | |||
476 | #define STATS_DEC_ACTIVE(x) do { } while (0) | 487 | #define STATS_DEC_ACTIVE(x) do { } while (0) |
477 | #define STATS_INC_ALLOCED(x) do { } while (0) | 488 | #define STATS_INC_ALLOCED(x) do { } while (0) |
478 | #define STATS_INC_GROWN(x) do { } while (0) | 489 | #define STATS_INC_GROWN(x) do { } while (0) |
479 | #define STATS_INC_REAPED(x) do { } while (0) | 490 | #define STATS_ADD_REAPED(x,y) do { } while (0) |
480 | #define STATS_SET_HIGH(x) do { } while (0) | 491 | #define STATS_SET_HIGH(x) do { } while (0) |
481 | #define STATS_INC_ERR(x) do { } while (0) | 492 | #define STATS_INC_ERR(x) do { } while (0) |
482 | #define STATS_INC_NODEALLOCS(x) do { } while (0) | 493 | #define STATS_INC_NODEALLOCS(x) do { } while (0) |
@@ -490,17 +501,6 @@ struct kmem_cache { | |||
490 | #endif | 501 | #endif |
491 | 502 | ||
492 | #if DEBUG | 503 | #if DEBUG |
493 | /* | ||
494 | * Magic nums for obj red zoning. | ||
495 | * Placed in the first word before and the first word after an obj. | ||
496 | */ | ||
497 | #define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ | ||
498 | #define RED_ACTIVE 0x170FC2A5UL /* when obj is active */ | ||
499 | |||
500 | /* ...and for poisoning */ | ||
501 | #define POISON_INUSE 0x5a /* for use-uninitialised poisoning */ | ||
502 | #define POISON_FREE 0x6b /* for use-after-free poisoning */ | ||
503 | #define POISON_END 0xa5 /* end-byte of poisoning */ | ||
504 | 504 | ||
505 | /* | 505 | /* |
506 | * memory layout of objects: | 506 | * memory layout of objects: |
@@ -592,6 +592,7 @@ static inline struct kmem_cache *page_get_cache(struct page *page) | |||
592 | { | 592 | { |
593 | if (unlikely(PageCompound(page))) | 593 | if (unlikely(PageCompound(page))) |
594 | page = (struct page *)page_private(page); | 594 | page = (struct page *)page_private(page); |
595 | BUG_ON(!PageSlab(page)); | ||
595 | return (struct kmem_cache *)page->lru.next; | 596 | return (struct kmem_cache *)page->lru.next; |
596 | } | 597 | } |
597 | 598 | ||
@@ -604,6 +605,7 @@ static inline struct slab *page_get_slab(struct page *page) | |||
604 | { | 605 | { |
605 | if (unlikely(PageCompound(page))) | 606 | if (unlikely(PageCompound(page))) |
606 | page = (struct page *)page_private(page); | 607 | page = (struct page *)page_private(page); |
608 | BUG_ON(!PageSlab(page)); | ||
607 | return (struct slab *)page->lru.prev; | 609 | return (struct slab *)page->lru.prev; |
608 | } | 610 | } |
609 | 611 | ||
@@ -705,12 +707,6 @@ int slab_is_available(void) | |||
705 | 707 | ||
706 | static DEFINE_PER_CPU(struct work_struct, reap_work); | 708 | static DEFINE_PER_CPU(struct work_struct, reap_work); |
707 | 709 | ||
708 | static void free_block(struct kmem_cache *cachep, void **objpp, int len, | ||
709 | int node); | ||
710 | static void enable_cpucache(struct kmem_cache *cachep); | ||
711 | static void cache_reap(void *unused); | ||
712 | static int __node_shrink(struct kmem_cache *cachep, int node); | ||
713 | |||
714 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) | 710 | static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) |
715 | { | 711 | { |
716 | return cachep->array[smp_processor_id()]; | 712 | return cachep->array[smp_processor_id()]; |
@@ -1024,6 +1020,40 @@ static void drain_alien_cache(struct kmem_cache *cachep, | |||
1024 | } | 1020 | } |
1025 | } | 1021 | } |
1026 | } | 1022 | } |
1023 | |||
1024 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | ||
1025 | { | ||
1026 | struct slab *slabp = virt_to_slab(objp); | ||
1027 | int nodeid = slabp->nodeid; | ||
1028 | struct kmem_list3 *l3; | ||
1029 | struct array_cache *alien = NULL; | ||
1030 | |||
1031 | /* | ||
1032 | * Make sure we are not freeing an object from another node to the array | ||
1033 | * cache on this cpu. | ||
1034 | */ | ||
1035 | if (likely(slabp->nodeid == numa_node_id())) | ||
1036 | return 0; | ||
1037 | |||
1038 | l3 = cachep->nodelists[numa_node_id()]; | ||
1039 | STATS_INC_NODEFREES(cachep); | ||
1040 | if (l3->alien && l3->alien[nodeid]) { | ||
1041 | alien = l3->alien[nodeid]; | ||
1042 | spin_lock(&alien->lock); | ||
1043 | if (unlikely(alien->avail == alien->limit)) { | ||
1044 | STATS_INC_ACOVERFLOW(cachep); | ||
1045 | __drain_alien_cache(cachep, alien, nodeid); | ||
1046 | } | ||
1047 | alien->entry[alien->avail++] = objp; | ||
1048 | spin_unlock(&alien->lock); | ||
1049 | } else { | ||
1050 | spin_lock(&(cachep->nodelists[nodeid])->list_lock); | ||
1051 | free_block(cachep, &objp, 1, nodeid); | ||
1052 | spin_unlock(&(cachep->nodelists[nodeid])->list_lock); | ||
1053 | } | ||
1054 | return 1; | ||
1055 | } | ||
1056 | |||
1027 | #else | 1057 | #else |
1028 | 1058 | ||
1029 | #define drain_alien_cache(cachep, alien) do { } while (0) | 1059 | #define drain_alien_cache(cachep, alien) do { } while (0) |
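For readers less familiar with the NUMA slab path, the cache_free_alien() helper consolidated above decides between three outcomes: a local fast-path free, batching the object into a per-node "alien" array that is flushed when it fills, or a direct locked free on the remote node when no alien cache exists. The stand-alone sketch below models only that control flow; the structure, the four-entry limit, and the printouts are illustrative stand-ins, not the kernel's types.

```c
/*
 * Simplified, user-space model of the cache_free_alien() decision above.
 * The alien_cache structure, its size, and the printf "drains" are
 * stand-ins; only the control flow mirrors the kernel code.
 */
#include <stdio.h>

#define ALIEN_LIMIT 4                   /* illustrative batch size */

struct alien_cache {
	int avail;                      /* objects currently batched */
	void *entry[ALIEN_LIMIT];       /* remote objects awaiting a flush */
};

/* Flush batched objects back to their home node (stubbed out here). */
static void drain_alien(struct alien_cache *alien, int nodeid)
{
	printf("draining %d object(s) to node %d\n", alien->avail, nodeid);
	alien->avail = 0;
}

/*
 * Returns 0 when the object is local (caller keeps the fast path),
 * 1 when the free was handled as a cross-node free.
 */
static int model_cache_free_alien(void *obj, int obj_node, int this_node,
				  struct alien_cache *alien)
{
	if (obj_node == this_node)
		return 0;                       /* local: fast path */

	if (alien) {
		if (alien->avail == ALIEN_LIMIT)
			drain_alien(alien, obj_node);   /* batch is full */
		alien->entry[alien->avail++] = obj;     /* batch the free */
	} else {
		/* no alien cache set up: free directly on the remote node */
		printf("direct remote free of %p on node %d\n", obj, obj_node);
	}
	return 1;
}

int main(void)
{
	struct alien_cache alien = { 0 };
	int dummy[6];

	for (int i = 0; i < 6; i++)
		model_cache_free_alien(&dummy[i], 1 /* obj node */,
				       0 /* this node */, &alien);
	return 0;
}
```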
@@ -1038,9 +1068,14 @@ static inline void free_alien_cache(struct array_cache **ac_ptr) | |||
1038 | { | 1068 | { |
1039 | } | 1069 | } |
1040 | 1070 | ||
1071 | static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | ||
1072 | { | ||
1073 | return 0; | ||
1074 | } | ||
1075 | |||
1041 | #endif | 1076 | #endif |
1042 | 1077 | ||
1043 | static int cpuup_callback(struct notifier_block *nfb, | 1078 | static int __devinit cpuup_callback(struct notifier_block *nfb, |
1044 | unsigned long action, void *hcpu) | 1079 | unsigned long action, void *hcpu) |
1045 | { | 1080 | { |
1046 | long cpu = (long)hcpu; | 1081 | long cpu = (long)hcpu; |
@@ -1207,10 +1242,7 @@ free_array_cache: | |||
1207 | l3 = cachep->nodelists[node]; | 1242 | l3 = cachep->nodelists[node]; |
1208 | if (!l3) | 1243 | if (!l3) |
1209 | continue; | 1244 | continue; |
1210 | spin_lock_irq(&l3->list_lock); | 1245 | drain_freelist(cachep, l3, l3->free_objects); |
1211 | /* free slabs belonging to this node */ | ||
1212 | __node_shrink(cachep, node); | ||
1213 | spin_unlock_irq(&l3->list_lock); | ||
1214 | } | 1246 | } |
1215 | mutex_unlock(&cache_chain_mutex); | 1247 | mutex_unlock(&cache_chain_mutex); |
1216 | break; | 1248 | break; |
@@ -1222,7 +1254,9 @@ bad: | |||
1222 | return NOTIFY_BAD; | 1254 | return NOTIFY_BAD; |
1223 | } | 1255 | } |
1224 | 1256 | ||
1225 | static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; | 1257 | static struct notifier_block __cpuinitdata cpucache_notifier = { |
1258 | &cpuup_callback, NULL, 0 | ||
1259 | }; | ||
1226 | 1260 | ||
1227 | /* | 1261 | /* |
1228 | * swap the static kmem_list3 with kmalloced memory | 1262 | * swap the static kmem_list3 with kmalloced memory |
@@ -1335,6 +1369,8 @@ void __init kmem_cache_init(void) | |||
1335 | NULL, NULL); | 1369 | NULL, NULL); |
1336 | } | 1370 | } |
1337 | 1371 | ||
1372 | slab_early_init = 0; | ||
1373 | |||
1338 | while (sizes->cs_size != ULONG_MAX) { | 1374 | while (sizes->cs_size != ULONG_MAX) { |
1339 | /* | 1375 | /* |
1340 | * For performance, all the general caches are L1 aligned. | 1376 | * For performance, all the general caches are L1 aligned. |
@@ -1450,31 +1486,29 @@ __initcall(cpucache_init); | |||
1450 | static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | 1486 | static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) |
1451 | { | 1487 | { |
1452 | struct page *page; | 1488 | struct page *page; |
1453 | void *addr; | 1489 | int nr_pages; |
1454 | int i; | 1490 | int i; |
1455 | 1491 | ||
1456 | flags |= cachep->gfpflags; | ||
1457 | #ifndef CONFIG_MMU | 1492 | #ifndef CONFIG_MMU |
1458 | /* nommu uses slab's for process anonymous memory allocations, so | 1493 | /* |
1459 | * requires __GFP_COMP to properly refcount higher order allocations" | 1494 | * Nommu uses slabs for process anonymous memory allocations, and thus |
1495 | * requires __GFP_COMP to properly refcount higher order allocations | ||
1460 | */ | 1496 | */ |
1461 | page = alloc_pages_node(nodeid, (flags | __GFP_COMP), cachep->gfporder); | 1497 | flags |= __GFP_COMP; |
1462 | #else | ||
1463 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | ||
1464 | #endif | 1498 | #endif |
1499 | flags |= cachep->gfpflags; | ||
1500 | |||
1501 | page = alloc_pages_node(nodeid, flags, cachep->gfporder); | ||
1465 | if (!page) | 1502 | if (!page) |
1466 | return NULL; | 1503 | return NULL; |
1467 | addr = page_address(page); | ||
1468 | 1504 | ||
1469 | i = (1 << cachep->gfporder); | 1505 | nr_pages = (1 << cachep->gfporder); |
1470 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1506 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1471 | atomic_add(i, &slab_reclaim_pages); | 1507 | atomic_add(nr_pages, &slab_reclaim_pages); |
1472 | add_page_state(nr_slab, i); | 1508 | add_zone_page_state(page_zone(page), NR_SLAB, nr_pages); |
1473 | while (i--) { | 1509 | for (i = 0; i < nr_pages; i++) |
1474 | __SetPageSlab(page); | 1510 | __SetPageSlab(page + i); |
1475 | page++; | 1511 | return page_address(page); |
1476 | } | ||
1477 | return addr; | ||
1478 | } | 1512 | } |
1479 | 1513 | ||
1480 | /* | 1514 | /* |
@@ -1486,12 +1520,12 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
1486 | struct page *page = virt_to_page(addr); | 1520 | struct page *page = virt_to_page(addr); |
1487 | const unsigned long nr_freed = i; | 1521 | const unsigned long nr_freed = i; |
1488 | 1522 | ||
1523 | sub_zone_page_state(page_zone(page), NR_SLAB, nr_freed); | ||
1489 | while (i--) { | 1524 | while (i--) { |
1490 | BUG_ON(!PageSlab(page)); | 1525 | BUG_ON(!PageSlab(page)); |
1491 | __ClearPageSlab(page); | 1526 | __ClearPageSlab(page); |
1492 | page++; | 1527 | page++; |
1493 | } | 1528 | } |
1494 | sub_page_state(nr_slab, nr_freed); | ||
1495 | if (current->reclaim_state) | 1529 | if (current->reclaim_state) |
1496 | current->reclaim_state->reclaimed_slab += nr_freed; | 1530 | current->reclaim_state->reclaimed_slab += nr_freed; |
1497 | free_pages((unsigned long)addr, cachep->gfporder); | 1531 | free_pages((unsigned long)addr, cachep->gfporder); |
@@ -1913,8 +1947,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1913 | void (*dtor)(void*, struct kmem_cache *, unsigned long)) | 1947 | void (*dtor)(void*, struct kmem_cache *, unsigned long)) |
1914 | { | 1948 | { |
1915 | size_t left_over, slab_size, ralign; | 1949 | size_t left_over, slab_size, ralign; |
1916 | struct kmem_cache *cachep = NULL; | 1950 | struct kmem_cache *cachep = NULL, *pc; |
1917 | struct list_head *p; | ||
1918 | 1951 | ||
1919 | /* | 1952 | /* |
1920 | * Sanity checks... these are all serious usage bugs. | 1953 | * Sanity checks... these are all serious usage bugs. |
@@ -1934,8 +1967,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1934 | 1967 | ||
1935 | mutex_lock(&cache_chain_mutex); | 1968 | mutex_lock(&cache_chain_mutex); |
1936 | 1969 | ||
1937 | list_for_each(p, &cache_chain) { | 1970 | list_for_each_entry(pc, &cache_chain, next) { |
1938 | struct kmem_cache *pc = list_entry(p, struct kmem_cache, next); | ||
1939 | mm_segment_t old_fs = get_fs(); | 1971 | mm_segment_t old_fs = get_fs(); |
1940 | char tmp; | 1972 | char tmp; |
1941 | int res; | 1973 | int res; |
@@ -2069,8 +2101,12 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2069 | #endif | 2101 | #endif |
2070 | #endif | 2102 | #endif |
2071 | 2103 | ||
2072 | /* Determine if the slab management is 'on' or 'off' slab. */ | 2104 | /* |
2073 | if (size >= (PAGE_SIZE >> 3)) | 2105 | * Determine if the slab management is 'on' or 'off' slab. |
2106 | * (bootstrapping cannot cope with offslab caches so don't do | ||
2107 | * it too early on.) | ||
2108 | */ | ||
2109 | if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) | ||
2074 | /* | 2110 | /* |
2075 | * Size is large, assume best to place the slab management obj | 2111 | * Size is large, assume best to place the slab management obj |
2076 | * off-slab (should allow better packing of objs). | 2112 | * off-slab (should allow better packing of objs). |
@@ -2210,32 +2246,45 @@ static void drain_cpu_caches(struct kmem_cache *cachep) | |||
2210 | } | 2246 | } |
2211 | } | 2247 | } |
2212 | 2248 | ||
2213 | static int __node_shrink(struct kmem_cache *cachep, int node) | 2249 | /* |
2250 | * Remove slabs from the list of free slabs. | ||
2251 | * Specify the number of slabs to drain in tofree. | ||
2252 | * | ||
2253 | * Returns the actual number of slabs released. | ||
2254 | */ | ||
2255 | static int drain_freelist(struct kmem_cache *cache, | ||
2256 | struct kmem_list3 *l3, int tofree) | ||
2214 | { | 2257 | { |
2258 | struct list_head *p; | ||
2259 | int nr_freed; | ||
2215 | struct slab *slabp; | 2260 | struct slab *slabp; |
2216 | struct kmem_list3 *l3 = cachep->nodelists[node]; | ||
2217 | int ret; | ||
2218 | 2261 | ||
2219 | for (;;) { | 2262 | nr_freed = 0; |
2220 | struct list_head *p; | 2263 | while (nr_freed < tofree && !list_empty(&l3->slabs_free)) { |
2221 | 2264 | ||
2265 | spin_lock_irq(&l3->list_lock); | ||
2222 | p = l3->slabs_free.prev; | 2266 | p = l3->slabs_free.prev; |
2223 | if (p == &l3->slabs_free) | 2267 | if (p == &l3->slabs_free) { |
2224 | break; | 2268 | spin_unlock_irq(&l3->list_lock); |
2269 | goto out; | ||
2270 | } | ||
2225 | 2271 | ||
2226 | slabp = list_entry(l3->slabs_free.prev, struct slab, list); | 2272 | slabp = list_entry(p, struct slab, list); |
2227 | #if DEBUG | 2273 | #if DEBUG |
2228 | BUG_ON(slabp->inuse); | 2274 | BUG_ON(slabp->inuse); |
2229 | #endif | 2275 | #endif |
2230 | list_del(&slabp->list); | 2276 | list_del(&slabp->list); |
2231 | 2277 | /* | |
2232 | l3->free_objects -= cachep->num; | 2278 | * Safe to drop the lock. The slab is no longer linked |
2279 | * to the cache. | ||
2280 | */ | ||
2281 | l3->free_objects -= cache->num; | ||
2233 | spin_unlock_irq(&l3->list_lock); | 2282 | spin_unlock_irq(&l3->list_lock); |
2234 | slab_destroy(cachep, slabp); | 2283 | slab_destroy(cache, slabp); |
2235 | spin_lock_irq(&l3->list_lock); | 2284 | nr_freed++; |
2236 | } | 2285 | } |
2237 | ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial); | 2286 | out: |
2238 | return ret; | 2287 | return nr_freed; |
2239 | } | 2288 | } |
2240 | 2289 | ||
2241 | static int __cache_shrink(struct kmem_cache *cachep) | 2290 | static int __cache_shrink(struct kmem_cache *cachep) |
@@ -2248,11 +2297,13 @@ static int __cache_shrink(struct kmem_cache *cachep) | |||
2248 | check_irq_on(); | 2297 | check_irq_on(); |
2249 | for_each_online_node(i) { | 2298 | for_each_online_node(i) { |
2250 | l3 = cachep->nodelists[i]; | 2299 | l3 = cachep->nodelists[i]; |
2251 | if (l3) { | 2300 | if (!l3) |
2252 | spin_lock_irq(&l3->list_lock); | 2301 | continue; |
2253 | ret += __node_shrink(cachep, i); | 2302 | |
2254 | spin_unlock_irq(&l3->list_lock); | 2303 | drain_freelist(cachep, l3, l3->free_objects); |
2255 | } | 2304 | |
2305 | ret += !list_empty(&l3->slabs_full) || | ||
2306 | !list_empty(&l3->slabs_partial); | ||
2256 | } | 2307 | } |
2257 | return (ret ? 1 : 0); | 2308 | return (ret ? 1 : 0); |
2258 | } | 2309 | } |
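The drain_freelist() rework above, and its new caller in __cache_shrink(), follow one pattern: hold the list lock only long enough to unlink a single free slab, destroy that slab with the lock dropped, and count how many slabs were actually released. A rough user-space model of that loop, with a plain singly linked list and a pthread mutex standing in for the kernel structures (build with cc -pthread):

```c
/* User-space sketch of the drain_freelist() locking pattern. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_slab {
	struct fake_slab *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct fake_slab *slabs_free;    /* stand-in for l3->slabs_free */

static int drain_freelist_model(int tofree)
{
	int nr_freed = 0;

	while (nr_freed < tofree) {
		struct fake_slab *slabp;

		pthread_mutex_lock(&list_lock);
		slabp = slabs_free;
		if (!slabp) {                   /* list emptied under us */
			pthread_mutex_unlock(&list_lock);
			break;
		}
		slabs_free = slabp->next;       /* unlink while locked */
		pthread_mutex_unlock(&list_lock);

		free(slabp);                    /* "destroy" without the lock */
		nr_freed++;
	}
	return nr_freed;                        /* slabs actually released */
}

int main(void)
{
	for (int i = 0; i < 5; i++) {
		struct fake_slab *s = malloc(sizeof(*s));
		s->next = slabs_free;
		slabs_free = s;
	}
	printf("freed %d slab(s)\n", drain_freelist_model(3));  /* 3 */
	printf("freed %d more\n", drain_freelist_model(10));    /* 2 */
	return 0;
}
```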
@@ -2460,23 +2511,28 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, | |||
2460 | slabp->inuse--; | 2511 | slabp->inuse--; |
2461 | } | 2512 | } |
2462 | 2513 | ||
2463 | static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, | 2514 | /* |
2464 | void *objp) | 2515 | * Map pages beginning at addr to the given cache and slab. This is required |
2516 | * for the slab allocator to be able to lookup the cache and slab of a | ||
2517 | * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. | ||
2518 | */ | ||
2519 | static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, | ||
2520 | void *addr) | ||
2465 | { | 2521 | { |
2466 | int i; | 2522 | int nr_pages; |
2467 | struct page *page; | 2523 | struct page *page; |
2468 | 2524 | ||
2469 | /* Nasty!!!!!! I hope this is OK. */ | 2525 | page = virt_to_page(addr); |
2470 | page = virt_to_page(objp); | ||
2471 | 2526 | ||
2472 | i = 1; | 2527 | nr_pages = 1; |
2473 | if (likely(!PageCompound(page))) | 2528 | if (likely(!PageCompound(page))) |
2474 | i <<= cachep->gfporder; | 2529 | nr_pages <<= cache->gfporder; |
2530 | |||
2475 | do { | 2531 | do { |
2476 | page_set_cache(page, cachep); | 2532 | page_set_cache(page, cache); |
2477 | page_set_slab(page, slabp); | 2533 | page_set_slab(page, slab); |
2478 | page++; | 2534 | page++; |
2479 | } while (--i); | 2535 | } while (--nr_pages); |
2480 | } | 2536 | } |
2481 | 2537 | ||
2482 | /* | 2538 | /* |
@@ -2548,7 +2604,7 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
2548 | goto opps1; | 2604 | goto opps1; |
2549 | 2605 | ||
2550 | slabp->nodeid = nodeid; | 2606 | slabp->nodeid = nodeid; |
2551 | set_slab_attr(cachep, slabp, objp); | 2607 | slab_map_pages(cachep, slabp, objp); |
2552 | 2608 | ||
2553 | cache_init_objs(cachep, slabp, ctor_flags); | 2609 | cache_init_objs(cachep, slabp, ctor_flags); |
2554 | 2610 | ||
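The renamed slab_map_pages() above, together with the page_get_cache()/page_get_slab() helpers earlier in the file, forms a reverse mapping: every backing page records which cache and slab it belongs to, so a later kfree() or ksize() can recover both from nothing more than the object's address. The toy model below shows the idea with a flat array standing in for struct page; the shift and array size are made up.

```c
/* Toy model of slab_map_pages() and page_get_cache(). */
#include <stdio.h>

#define MODEL_PAGE_SHIFT 12             /* 4K "pages", illustrative */
#define MODEL_NR_PAGES   16

struct model_page {
	void *cache;                    /* stands in for page->lru.next */
	void *slab;                     /* stands in for page->lru.prev */
};

static struct model_page pages[MODEL_NR_PAGES];

/* Record the owning cache/slab in every page backing the allocation. */
static void model_map_pages(void *cache, void *slab,
			    unsigned long addr, int order)
{
	unsigned long pfn = addr >> MODEL_PAGE_SHIFT;
	int nr_pages = 1 << order;

	for (int i = 0; i < nr_pages; i++) {
		pages[pfn + i].cache = cache;
		pages[pfn + i].slab = slab;
	}
}

/* Any address inside the allocation maps back to the same cache. */
static void *model_get_cache(unsigned long obj_addr)
{
	return pages[obj_addr >> MODEL_PAGE_SHIFT].cache;
}

int main(void)
{
	int my_cache, my_slab;                          /* dummy markers */
	unsigned long base = 4ul << MODEL_PAGE_SHIFT;   /* "pages" 4..7 */

	model_map_pages(&my_cache, &my_slab, base, 2);  /* order-2: 4 pages */

	printf("object in page 7 finds its cache: %d\n",
	       model_get_cache(base + 3 * (1ul << MODEL_PAGE_SHIFT) + 100)
	       == (void *)&my_cache);
	return 0;
}
```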
@@ -2596,6 +2652,28 @@ static void kfree_debugcheck(const void *objp) | |||
2596 | } | 2652 | } |
2597 | } | 2653 | } |
2598 | 2654 | ||
2655 | static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) | ||
2656 | { | ||
2657 | unsigned long redzone1, redzone2; | ||
2658 | |||
2659 | redzone1 = *dbg_redzone1(cache, obj); | ||
2660 | redzone2 = *dbg_redzone2(cache, obj); | ||
2661 | |||
2662 | /* | ||
2663 | * Redzone is ok. | ||
2664 | */ | ||
2665 | if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE) | ||
2666 | return; | ||
2667 | |||
2668 | if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE) | ||
2669 | slab_error(cache, "double free detected"); | ||
2670 | else | ||
2671 | slab_error(cache, "memory outside object was overwritten"); | ||
2672 | |||
2673 | printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n", | ||
2674 | obj, redzone1, redzone2); | ||
2675 | } | ||
2676 | |||
2599 | static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | 2677 | static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, |
2600 | void *caller) | 2678 | void *caller) |
2601 | { | 2679 | { |
@@ -2607,27 +2685,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
2607 | kfree_debugcheck(objp); | 2685 | kfree_debugcheck(objp); |
2608 | page = virt_to_page(objp); | 2686 | page = virt_to_page(objp); |
2609 | 2687 | ||
2610 | if (page_get_cache(page) != cachep) { | ||
2611 | printk(KERN_ERR "mismatch in kmem_cache_free: expected " | ||
2612 | "cache %p, got %p\n", | ||
2613 | page_get_cache(page), cachep); | ||
2614 | printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); | ||
2615 | printk(KERN_ERR "%p is %s.\n", page_get_cache(page), | ||
2616 | page_get_cache(page)->name); | ||
2617 | WARN_ON(1); | ||
2618 | } | ||
2619 | slabp = page_get_slab(page); | 2688 | slabp = page_get_slab(page); |
2620 | 2689 | ||
2621 | if (cachep->flags & SLAB_RED_ZONE) { | 2690 | if (cachep->flags & SLAB_RED_ZONE) { |
2622 | if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || | 2691 | verify_redzone_free(cachep, objp); |
2623 | *dbg_redzone2(cachep, objp) != RED_ACTIVE) { | ||
2624 | slab_error(cachep, "double free, or memory outside" | ||
2625 | " object was overwritten"); | ||
2626 | printk(KERN_ERR "%p: redzone 1:0x%lx, " | ||
2627 | "redzone 2:0x%lx.\n", | ||
2628 | objp, *dbg_redzone1(cachep, objp), | ||
2629 | *dbg_redzone2(cachep, objp)); | ||
2630 | } | ||
2631 | *dbg_redzone1(cachep, objp) = RED_INACTIVE; | 2692 | *dbg_redzone1(cachep, objp) = RED_INACTIVE; |
2632 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; | 2693 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; |
2633 | } | 2694 | } |
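The verify_redzone_free() helper pulled out above also sharpens the diagnostic: both guard words already RED_INACTIVE means a double free, anything else that is not RED_ACTIVE/RED_ACTIVE means the memory outside the object was overwritten. Using the magic values documented in the comment block this patch relocates to linux/poison.h, the check can be rehearsed in isolation:

```c
/* Stand-alone rehearsal of the redzone-free check. */
#include <stdio.h>

#define RED_INACTIVE 0x5A2CF071UL       /* object is free */
#define RED_ACTIVE   0x170FC2A5UL       /* object is live */

static void check_redzones(unsigned long z1, unsigned long z2)
{
	if (z1 == RED_ACTIVE && z2 == RED_ACTIVE)
		return;                                 /* healthy free */
	if (z1 == RED_INACTIVE && z2 == RED_INACTIVE)
		printf("double free detected\n");
	else
		printf("memory outside object was overwritten\n");
}

int main(void)
{
	check_redzones(RED_ACTIVE, RED_ACTIVE);         /* normal free: quiet */
	check_redzones(RED_INACTIVE, RED_INACTIVE);     /* freed twice */
	check_redzones(RED_ACTIVE, 0x41414141UL);       /* overflow past object */
	return 0;
}
```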
@@ -3087,41 +3148,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp) | |||
3087 | check_irq_off(); | 3148 | check_irq_off(); |
3088 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); | 3149 | objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); |
3089 | 3150 | ||
3090 | /* Make sure we are not freeing a object from another | 3151 | if (cache_free_alien(cachep, objp)) |
3091 | * node to the array cache on this cpu. | 3152 | return; |
3092 | */ | 3153 | |
3093 | #ifdef CONFIG_NUMA | ||
3094 | { | ||
3095 | struct slab *slabp; | ||
3096 | slabp = virt_to_slab(objp); | ||
3097 | if (unlikely(slabp->nodeid != numa_node_id())) { | ||
3098 | struct array_cache *alien = NULL; | ||
3099 | int nodeid = slabp->nodeid; | ||
3100 | struct kmem_list3 *l3; | ||
3101 | |||
3102 | l3 = cachep->nodelists[numa_node_id()]; | ||
3103 | STATS_INC_NODEFREES(cachep); | ||
3104 | if (l3->alien && l3->alien[nodeid]) { | ||
3105 | alien = l3->alien[nodeid]; | ||
3106 | spin_lock(&alien->lock); | ||
3107 | if (unlikely(alien->avail == alien->limit)) { | ||
3108 | STATS_INC_ACOVERFLOW(cachep); | ||
3109 | __drain_alien_cache(cachep, | ||
3110 | alien, nodeid); | ||
3111 | } | ||
3112 | alien->entry[alien->avail++] = objp; | ||
3113 | spin_unlock(&alien->lock); | ||
3114 | } else { | ||
3115 | spin_lock(&(cachep->nodelists[nodeid])-> | ||
3116 | list_lock); | ||
3117 | free_block(cachep, &objp, 1, nodeid); | ||
3118 | spin_unlock(&(cachep->nodelists[nodeid])-> | ||
3119 | list_lock); | ||
3120 | } | ||
3121 | return; | ||
3122 | } | ||
3123 | } | ||
3124 | #endif | ||
3125 | if (likely(ac->avail < ac->limit)) { | 3154 | if (likely(ac->avail < ac->limit)) { |
3126 | STATS_INC_FREEHIT(cachep); | 3155 | STATS_INC_FREEHIT(cachep); |
3127 | ac->entry[ac->avail++] = objp; | 3156 | ac->entry[ac->avail++] = objp; |
@@ -3254,26 +3283,10 @@ EXPORT_SYMBOL(kmalloc_node); | |||
3254 | #endif | 3283 | #endif |
3255 | 3284 | ||
3256 | /** | 3285 | /** |
3257 | * kmalloc - allocate memory | 3286 | * __do_kmalloc - allocate memory |
3258 | * @size: how many bytes of memory are required. | 3287 | * @size: how many bytes of memory are required. |
3259 | * @flags: the type of memory to allocate. | 3288 | * @flags: the type of memory to allocate (see kmalloc). |
3260 | * @caller: function caller for debug tracking of the caller | 3289 | * @caller: function caller for debug tracking of the caller |
3261 | * | ||
3262 | * kmalloc is the normal method of allocating memory | ||
3263 | * in the kernel. | ||
3264 | * | ||
3265 | * The @flags argument may be one of: | ||
3266 | * | ||
3267 | * %GFP_USER - Allocate memory on behalf of user. May sleep. | ||
3268 | * | ||
3269 | * %GFP_KERNEL - Allocate normal kernel ram. May sleep. | ||
3270 | * | ||
3271 | * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers. | ||
3272 | * | ||
3273 | * Additionally, the %GFP_DMA flag may be set to indicate the memory | ||
3274 | * must be suitable for DMA. This can mean different things on different | ||
3275 | * platforms. For example, on i386, it means that the memory must come | ||
3276 | * from the first 16MB. | ||
3277 | */ | 3290 | */ |
3278 | static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, | 3291 | static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, |
3279 | void *caller) | 3292 | void *caller) |
@@ -3371,6 +3384,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp) | |||
3371 | { | 3384 | { |
3372 | unsigned long flags; | 3385 | unsigned long flags; |
3373 | 3386 | ||
3387 | BUG_ON(virt_to_cache(objp) != cachep); | ||
3388 | |||
3374 | local_irq_save(flags); | 3389 | local_irq_save(flags); |
3375 | __cache_free(cachep, objp); | 3390 | __cache_free(cachep, objp); |
3376 | local_irq_restore(flags); | 3391 | local_irq_restore(flags); |
@@ -3396,7 +3411,7 @@ void kfree(const void *objp) | |||
3396 | local_irq_save(flags); | 3411 | local_irq_save(flags); |
3397 | kfree_debugcheck(objp); | 3412 | kfree_debugcheck(objp); |
3398 | c = virt_to_cache(objp); | 3413 | c = virt_to_cache(objp); |
3399 | mutex_debug_check_no_locks_freed(objp, obj_size(c)); | 3414 | debug_check_no_locks_freed(objp, obj_size(c)); |
3400 | __cache_free(c, (void *)objp); | 3415 | __cache_free(c, (void *)objp); |
3401 | local_irq_restore(flags); | 3416 | local_irq_restore(flags); |
3402 | } | 3417 | } |
@@ -3680,7 +3695,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, | |||
3680 | */ | 3695 | */ |
3681 | static void cache_reap(void *unused) | 3696 | static void cache_reap(void *unused) |
3682 | { | 3697 | { |
3683 | struct list_head *walk; | 3698 | struct kmem_cache *searchp; |
3684 | struct kmem_list3 *l3; | 3699 | struct kmem_list3 *l3; |
3685 | int node = numa_node_id(); | 3700 | int node = numa_node_id(); |
3686 | 3701 | ||
@@ -3691,13 +3706,7 @@ static void cache_reap(void *unused) | |||
3691 | return; | 3706 | return; |
3692 | } | 3707 | } |
3693 | 3708 | ||
3694 | list_for_each(walk, &cache_chain) { | 3709 | list_for_each_entry(searchp, &cache_chain, next) { |
3695 | struct kmem_cache *searchp; | ||
3696 | struct list_head *p; | ||
3697 | int tofree; | ||
3698 | struct slab *slabp; | ||
3699 | |||
3700 | searchp = list_entry(walk, struct kmem_cache, next); | ||
3701 | check_irq_on(); | 3710 | check_irq_on(); |
3702 | 3711 | ||
3703 | /* | 3712 | /* |
@@ -3722,47 +3731,22 @@ static void cache_reap(void *unused) | |||
3722 | 3731 | ||
3723 | drain_array(searchp, l3, l3->shared, 0, node); | 3732 | drain_array(searchp, l3, l3->shared, 0, node); |
3724 | 3733 | ||
3725 | if (l3->free_touched) { | 3734 | if (l3->free_touched) |
3726 | l3->free_touched = 0; | 3735 | l3->free_touched = 0; |
3727 | goto next; | 3736 | else { |
3728 | } | 3737 | int freed; |
3729 | 3738 | ||
3730 | tofree = (l3->free_limit + 5 * searchp->num - 1) / | 3739 | freed = drain_freelist(searchp, l3, (l3->free_limit + |
3731 | (5 * searchp->num); | 3740 | 5 * searchp->num - 1) / (5 * searchp->num)); |
3732 | do { | 3741 | STATS_ADD_REAPED(searchp, freed); |
3733 | /* | 3742 | } |
3734 | * Do not lock if there are no free blocks. | ||
3735 | */ | ||
3736 | if (list_empty(&l3->slabs_free)) | ||
3737 | break; | ||
3738 | |||
3739 | spin_lock_irq(&l3->list_lock); | ||
3740 | p = l3->slabs_free.next; | ||
3741 | if (p == &(l3->slabs_free)) { | ||
3742 | spin_unlock_irq(&l3->list_lock); | ||
3743 | break; | ||
3744 | } | ||
3745 | |||
3746 | slabp = list_entry(p, struct slab, list); | ||
3747 | BUG_ON(slabp->inuse); | ||
3748 | list_del(&slabp->list); | ||
3749 | STATS_INC_REAPED(searchp); | ||
3750 | |||
3751 | /* | ||
3752 | * Safe to drop the lock. The slab is no longer linked | ||
3753 | * to the cache. searchp cannot disappear, we hold | ||
3754 | * cache_chain_lock | ||
3755 | */ | ||
3756 | l3->free_objects -= searchp->num; | ||
3757 | spin_unlock_irq(&l3->list_lock); | ||
3758 | slab_destroy(searchp, slabp); | ||
3759 | } while (--tofree > 0); | ||
3760 | next: | 3743 | next: |
3761 | cond_resched(); | 3744 | cond_resched(); |
3762 | } | 3745 | } |
3763 | check_irq_on(); | 3746 | check_irq_on(); |
3764 | mutex_unlock(&cache_chain_mutex); | 3747 | mutex_unlock(&cache_chain_mutex); |
3765 | next_reap_node(); | 3748 | next_reap_node(); |
3749 | refresh_cpu_vm_stats(smp_processor_id()); | ||
3766 | /* Set up the next iteration */ | 3750 | /* Set up the next iteration */ |
3767 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); | 3751 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); |
3768 | } | 3752 | } |
@@ -3825,7 +3809,6 @@ static void s_stop(struct seq_file *m, void *p) | |||
3825 | static int s_show(struct seq_file *m, void *p) | 3809 | static int s_show(struct seq_file *m, void *p) |
3826 | { | 3810 | { |
3827 | struct kmem_cache *cachep = p; | 3811 | struct kmem_cache *cachep = p; |
3828 | struct list_head *q; | ||
3829 | struct slab *slabp; | 3812 | struct slab *slabp; |
3830 | unsigned long active_objs; | 3813 | unsigned long active_objs; |
3831 | unsigned long num_objs; | 3814 | unsigned long num_objs; |
@@ -3846,15 +3829,13 @@ static int s_show(struct seq_file *m, void *p) | |||
3846 | check_irq_on(); | 3829 | check_irq_on(); |
3847 | spin_lock_irq(&l3->list_lock); | 3830 | spin_lock_irq(&l3->list_lock); |
3848 | 3831 | ||
3849 | list_for_each(q, &l3->slabs_full) { | 3832 | list_for_each_entry(slabp, &l3->slabs_full, list) { |
3850 | slabp = list_entry(q, struct slab, list); | ||
3851 | if (slabp->inuse != cachep->num && !error) | 3833 | if (slabp->inuse != cachep->num && !error) |
3852 | error = "slabs_full accounting error"; | 3834 | error = "slabs_full accounting error"; |
3853 | active_objs += cachep->num; | 3835 | active_objs += cachep->num; |
3854 | active_slabs++; | 3836 | active_slabs++; |
3855 | } | 3837 | } |
3856 | list_for_each(q, &l3->slabs_partial) { | 3838 | list_for_each_entry(slabp, &l3->slabs_partial, list) { |
3857 | slabp = list_entry(q, struct slab, list); | ||
3858 | if (slabp->inuse == cachep->num && !error) | 3839 | if (slabp->inuse == cachep->num && !error) |
3859 | error = "slabs_partial inuse accounting error"; | 3840 | error = "slabs_partial inuse accounting error"; |
3860 | if (!slabp->inuse && !error) | 3841 | if (!slabp->inuse && !error) |
@@ -3862,8 +3843,7 @@ static int s_show(struct seq_file *m, void *p) | |||
3862 | active_objs += slabp->inuse; | 3843 | active_objs += slabp->inuse; |
3863 | active_slabs++; | 3844 | active_slabs++; |
3864 | } | 3845 | } |
3865 | list_for_each(q, &l3->slabs_free) { | 3846 | list_for_each_entry(slabp, &l3->slabs_free, list) { |
3866 | slabp = list_entry(q, struct slab, list); | ||
3867 | if (slabp->inuse && !error) | 3847 | if (slabp->inuse && !error) |
3868 | error = "slabs_free/inuse accounting error"; | 3848 | error = "slabs_free/inuse accounting error"; |
3869 | num_slabs++; | 3849 | num_slabs++; |
@@ -3956,7 +3936,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
3956 | { | 3936 | { |
3957 | char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; | 3937 | char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; |
3958 | int limit, batchcount, shared, res; | 3938 | int limit, batchcount, shared, res; |
3959 | struct list_head *p; | 3939 | struct kmem_cache *cachep; |
3960 | 3940 | ||
3961 | if (count > MAX_SLABINFO_WRITE) | 3941 | if (count > MAX_SLABINFO_WRITE) |
3962 | return -EINVAL; | 3942 | return -EINVAL; |
@@ -3975,10 +3955,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer, | |||
3975 | /* Find the cache in the chain of caches. */ | 3955 | /* Find the cache in the chain of caches. */ |
3976 | mutex_lock(&cache_chain_mutex); | 3956 | mutex_lock(&cache_chain_mutex); |
3977 | res = -EINVAL; | 3957 | res = -EINVAL; |
3978 | list_for_each(p, &cache_chain) { | 3958 | list_for_each_entry(cachep, &cache_chain, next) { |
3979 | struct kmem_cache *cachep; | ||
3980 | |||
3981 | cachep = list_entry(p, struct kmem_cache, next); | ||
3982 | if (!strcmp(cachep->name, kbuf)) { | 3959 | if (!strcmp(cachep->name, kbuf)) { |
3983 | if (limit < 1 || batchcount < 1 || | 3960 | if (limit < 1 || batchcount < 1 || |
3984 | batchcount > limit || shared < 0) { | 3961 | batchcount > limit || shared < 0) { |
@@ -4080,7 +4057,6 @@ static void show_symbol(struct seq_file *m, unsigned long address) | |||
4080 | static int leaks_show(struct seq_file *m, void *p) | 4057 | static int leaks_show(struct seq_file *m, void *p) |
4081 | { | 4058 | { |
4082 | struct kmem_cache *cachep = p; | 4059 | struct kmem_cache *cachep = p; |
4083 | struct list_head *q; | ||
4084 | struct slab *slabp; | 4060 | struct slab *slabp; |
4085 | struct kmem_list3 *l3; | 4061 | struct kmem_list3 *l3; |
4086 | const char *name; | 4062 | const char *name; |
@@ -4105,14 +4081,10 @@ static int leaks_show(struct seq_file *m, void *p) | |||
4105 | check_irq_on(); | 4081 | check_irq_on(); |
4106 | spin_lock_irq(&l3->list_lock); | 4082 | spin_lock_irq(&l3->list_lock); |
4107 | 4083 | ||
4108 | list_for_each(q, &l3->slabs_full) { | 4084 | list_for_each_entry(slabp, &l3->slabs_full, list) |
4109 | slabp = list_entry(q, struct slab, list); | ||
4110 | handle_slab(n, cachep, slabp); | 4085 | handle_slab(n, cachep, slabp); |
4111 | } | 4086 | list_for_each_entry(slabp, &l3->slabs_partial, list) |
4112 | list_for_each(q, &l3->slabs_partial) { | ||
4113 | slabp = list_entry(q, struct slab, list); | ||
4114 | handle_slab(n, cachep, slabp); | 4087 | handle_slab(n, cachep, slabp); |
4115 | } | ||
4116 | spin_unlock_irq(&l3->list_lock); | 4088 | spin_unlock_irq(&l3->list_lock); |
4117 | } | 4089 | } |
4118 | name = cachep->name; | 4090 | name = cachep->name; |
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
@@ -29,7 +29,6 @@ | |||
29 | * essentially no allocation space overhead. | 29 | * essentially no allocation space overhead. |
30 | */ | 30 | */ |
31 | 31 | ||
32 | #include <linux/config.h> | ||
33 | #include <linux/slab.h> | 32 | #include <linux/slab.h> |
34 | #include <linux/mm.h> | 33 | #include <linux/mm.h> |
35 | #include <linux/cache.h> | 34 | #include <linux/cache.h> |
diff --git a/mm/sparse.c b/mm/sparse.c index 100040c0dfb6..86c52ab80878 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -1,7 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | * sparse memory mappings. | 2 | * sparse memory mappings. |
3 | */ | 3 | */ |
4 | #include <linux/config.h> | ||
5 | #include <linux/mm.h> | 4 | #include <linux/mm.h> |
6 | #include <linux/mmzone.h> | 5 | #include <linux/mmzone.h> |
7 | #include <linux/bootmem.h> | 6 | #include <linux/bootmem.h> |
@@ -45,7 +44,7 @@ static struct mem_section *sparse_index_alloc(int nid) | |||
45 | 44 | ||
46 | static int sparse_index_init(unsigned long section_nr, int nid) | 45 | static int sparse_index_init(unsigned long section_nr, int nid) |
47 | { | 46 | { |
48 | static spinlock_t index_init_lock = SPIN_LOCK_UNLOCKED; | 47 | static DEFINE_SPINLOCK(index_init_lock); |
49 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); | 48 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); |
50 | struct mem_section *section; | 49 | struct mem_section *section; |
51 | int ret = 0; | 50 | int ret = 0; |
@@ -99,6 +98,22 @@ int __section_nr(struct mem_section* ms) | |||
99 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); | 98 | return (root_nr * SECTIONS_PER_ROOT) + (ms - root); |
100 | } | 99 | } |
101 | 100 | ||
101 | /* | ||
102 | * During early boot, before section_mem_map is used for an actual | ||
103 | * mem_map, we use section_mem_map to store the section's NUMA | ||
104 | * node. This keeps us from having to use another data structure. The | ||
105 | * node information is cleared just before we store the real mem_map. | ||
106 | */ | ||
107 | static inline unsigned long sparse_encode_early_nid(int nid) | ||
108 | { | ||
109 | return (nid << SECTION_NID_SHIFT); | ||
110 | } | ||
111 | |||
112 | static inline int sparse_early_nid(struct mem_section *section) | ||
113 | { | ||
114 | return (section->section_mem_map >> SECTION_NID_SHIFT); | ||
115 | } | ||
116 | |||
102 | /* Record a memory area against a node. */ | 117 | /* Record a memory area against a node. */ |
103 | void memory_present(int nid, unsigned long start, unsigned long end) | 118 | void memory_present(int nid, unsigned long start, unsigned long end) |
104 | { | 119 | { |
@@ -113,7 +128,8 @@ void memory_present(int nid, unsigned long start, unsigned long end) | |||
113 | 128 | ||
114 | ms = __nr_to_section(section); | 129 | ms = __nr_to_section(section); |
115 | if (!ms->section_mem_map) | 130 | if (!ms->section_mem_map) |
116 | ms->section_mem_map = SECTION_MARKED_PRESENT; | 131 | ms->section_mem_map = sparse_encode_early_nid(nid) | |
132 | SECTION_MARKED_PRESENT; | ||
117 | } | 133 | } |
118 | } | 134 | } |
119 | 135 | ||
@@ -164,6 +180,7 @@ static int sparse_init_one_section(struct mem_section *ms, | |||
164 | if (!valid_section(ms)) | 180 | if (!valid_section(ms)) |
165 | return -EINVAL; | 181 | return -EINVAL; |
166 | 182 | ||
183 | ms->section_mem_map &= ~SECTION_MAP_MASK; | ||
167 | ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); | 184 | ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); |
168 | 185 | ||
169 | return 1; | 186 | return 1; |
@@ -172,8 +189,8 @@ static int sparse_init_one_section(struct mem_section *ms, | |||
172 | static struct page *sparse_early_mem_map_alloc(unsigned long pnum) | 189 | static struct page *sparse_early_mem_map_alloc(unsigned long pnum) |
173 | { | 190 | { |
174 | struct page *map; | 191 | struct page *map; |
175 | int nid = early_pfn_to_nid(section_nr_to_pfn(pnum)); | ||
176 | struct mem_section *ms = __nr_to_section(pnum); | 192 | struct mem_section *ms = __nr_to_section(pnum); |
193 | int nid = sparse_early_nid(ms); | ||
177 | 194 | ||
178 | map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); | 195 | map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); |
179 | if (map) | 196 | if (map) |
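The sparse-memory hunks above add a small bootstrapping trick: until sparse_init_one_section() installs a real mem_map, the section's NUMA node id is stashed in the upper bits of section_mem_map, read back by sparse_early_mem_map_alloc(), and cleared just before the map pointer is stored. The sketch below walks through that lifecycle; the shift, the present bit, and the mask are placeholder values, not the kernel's SECTION_* definitions.

```c
/* Sketch of the early-nid encode/clear lifecycle. */
#include <stdio.h>

#define MODEL_NID_SHIFT      2          /* placeholder for SECTION_NID_SHIFT */
#define MODEL_MARKED_PRESENT 1ul        /* placeholder "present" flag bit */
#define MODEL_MAP_MASK       (~MODEL_MARKED_PRESENT)    /* non-flag bits */

static unsigned long encode_early_nid(int nid)
{
	return (unsigned long)nid << MODEL_NID_SHIFT;
}

static int early_nid(unsigned long section_mem_map)
{
	return (int)(section_mem_map >> MODEL_NID_SHIFT);
}

int main(void)
{
	unsigned long section_mem_map;

	/* memory_present(): remember the node and mark the section present */
	section_mem_map = encode_early_nid(3) | MODEL_MARKED_PRESENT;
	printf("early nid = %d\n", early_nid(section_mem_map));        /* 3 */

	/* sparse_init_one_section(): drop the stashed nid, keep the flag, */
	/* then store the real mem_map encoding in its place              */
	section_mem_map &= ~MODEL_MAP_MASK;
	section_mem_map |= 0xdeadb000ul;        /* pretend mem_map pointer */
	printf("still marked present: %lu\n",
	       section_mem_map & MODEL_MARKED_PRESENT);                 /* 1 */
	return 0;
}
```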
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -86,9 +86,8 @@ int rotate_reclaimable_page(struct page *page) | |||
86 | zone = page_zone(page); | 86 | zone = page_zone(page); |
87 | spin_lock_irqsave(&zone->lru_lock, flags); | 87 | spin_lock_irqsave(&zone->lru_lock, flags); |
88 | if (PageLRU(page) && !PageActive(page)) { | 88 | if (PageLRU(page) && !PageActive(page)) { |
89 | list_del(&page->lru); | 89 | list_move_tail(&page->lru, &zone->inactive_list); |
90 | list_add_tail(&page->lru, &zone->inactive_list); | 90 | __count_vm_event(PGROTATED); |
91 | inc_page_state(pgrotated); | ||
92 | } | 91 | } |
93 | if (!test_clear_page_writeback(page)) | 92 | if (!test_clear_page_writeback(page)) |
94 | BUG(); | 93 | BUG(); |
@@ -108,7 +107,7 @@ void fastcall activate_page(struct page *page) | |||
108 | del_page_from_inactive_list(zone, page); | 107 | del_page_from_inactive_list(zone, page); |
109 | SetPageActive(page); | 108 | SetPageActive(page); |
110 | add_page_to_active_list(zone, page); | 109 | add_page_to_active_list(zone, page); |
111 | inc_page_state(pgactivate); | 110 | __count_vm_event(PGACTIVATE); |
112 | } | 111 | } |
113 | spin_unlock_irq(&zone->lru_lock); | 112 | spin_unlock_irq(&zone->lru_lock); |
114 | } | 113 | } |
@@ -480,48 +479,6 @@ static int cpu_swap_callback(struct notifier_block *nfb, | |||
480 | #endif /* CONFIG_HOTPLUG_CPU */ | 479 | #endif /* CONFIG_HOTPLUG_CPU */ |
481 | #endif /* CONFIG_SMP */ | 480 | #endif /* CONFIG_SMP */ |
482 | 481 | ||
483 | #ifdef CONFIG_SMP | ||
484 | void percpu_counter_mod(struct percpu_counter *fbc, long amount) | ||
485 | { | ||
486 | long count; | ||
487 | long *pcount; | ||
488 | int cpu = get_cpu(); | ||
489 | |||
490 | pcount = per_cpu_ptr(fbc->counters, cpu); | ||
491 | count = *pcount + amount; | ||
492 | if (count >= FBC_BATCH || count <= -FBC_BATCH) { | ||
493 | spin_lock(&fbc->lock); | ||
494 | fbc->count += count; | ||
495 | *pcount = 0; | ||
496 | spin_unlock(&fbc->lock); | ||
497 | } else { | ||
498 | *pcount = count; | ||
499 | } | ||
500 | put_cpu(); | ||
501 | } | ||
502 | EXPORT_SYMBOL(percpu_counter_mod); | ||
503 | |||
504 | /* | ||
505 | * Add up all the per-cpu counts, return the result. This is a more accurate | ||
506 | * but much slower version of percpu_counter_read_positive() | ||
507 | */ | ||
508 | long percpu_counter_sum(struct percpu_counter *fbc) | ||
509 | { | ||
510 | long ret; | ||
511 | int cpu; | ||
512 | |||
513 | spin_lock(&fbc->lock); | ||
514 | ret = fbc->count; | ||
515 | for_each_possible_cpu(cpu) { | ||
516 | long *pcount = per_cpu_ptr(fbc->counters, cpu); | ||
517 | ret += *pcount; | ||
518 | } | ||
519 | spin_unlock(&fbc->lock); | ||
520 | return ret < 0 ? 0 : ret; | ||
521 | } | ||
522 | EXPORT_SYMBOL(percpu_counter_sum); | ||
523 | #endif | ||
524 | |||
525 | /* | 482 | /* |
526 | * Perform any setup for the swap system | 483 | * Perform any setup for the swap system |
527 | */ | 484 | */ |
diff --git a/mm/swap_state.c b/mm/swap_state.c index e0e1583f32c2..fccbd9bba77b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -24,7 +24,7 @@ | |||
24 | * vmscan's shrink_list, to make sync_page look nicer, and to allow | 24 | * vmscan's shrink_list, to make sync_page look nicer, and to allow |
25 | * future use of radix_tree tags in the swap cache. | 25 | * future use of radix_tree tags in the swap cache. |
26 | */ | 26 | */ |
27 | static struct address_space_operations swap_aops = { | 27 | static const struct address_space_operations swap_aops = { |
28 | .writepage = swap_writepage, | 28 | .writepage = swap_writepage, |
29 | .sync_page = block_sync_page, | 29 | .sync_page = block_sync_page, |
30 | .set_page_dirty = __set_page_dirty_nobuffers, | 30 | .set_page_dirty = __set_page_dirty_nobuffers, |
@@ -87,7 +87,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry, | |||
87 | SetPageSwapCache(page); | 87 | SetPageSwapCache(page); |
88 | set_page_private(page, entry.val); | 88 | set_page_private(page, entry.val); |
89 | total_swapcache_pages++; | 89 | total_swapcache_pages++; |
90 | pagecache_acct(1); | 90 | __inc_zone_page_state(page, NR_FILE_PAGES); |
91 | } | 91 | } |
92 | write_unlock_irq(&swapper_space.tree_lock); | 92 | write_unlock_irq(&swapper_space.tree_lock); |
93 | radix_tree_preload_end(); | 93 | radix_tree_preload_end(); |
@@ -132,7 +132,7 @@ void __delete_from_swap_cache(struct page *page) | |||
132 | set_page_private(page, 0); | 132 | set_page_private(page, 0); |
133 | ClearPageSwapCache(page); | 133 | ClearPageSwapCache(page); |
134 | total_swapcache_pages--; | 134 | total_swapcache_pages--; |
135 | pagecache_acct(-1); | 135 | __dec_zone_page_state(page, NR_FILE_PAGES); |
136 | INC_CACHE_INFO(del_total); | 136 | INC_CACHE_INFO(del_total); |
137 | } | 137 | } |
138 | 138 | ||
diff --git a/mm/swapfile.c b/mm/swapfile.c index e5fd5385f0cc..e70d6c6d6fee 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -5,7 +5,6 @@ | |||
5 | * Swap reorganised 29.12.95, Stephen Tweedie | 5 | * Swap reorganised 29.12.95, Stephen Tweedie |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/config.h> | ||
9 | #include <linux/mm.h> | 8 | #include <linux/mm.h> |
10 | #include <linux/hugetlb.h> | 9 | #include <linux/hugetlb.h> |
11 | #include <linux/mman.h> | 10 | #include <linux/mman.h> |
@@ -395,6 +394,9 @@ void free_swap_and_cache(swp_entry_t entry) | |||
395 | struct swap_info_struct * p; | 394 | struct swap_info_struct * p; |
396 | struct page *page = NULL; | 395 | struct page *page = NULL; |
397 | 396 | ||
397 | if (is_migration_entry(entry)) | ||
398 | return; | ||
399 | |||
398 | p = swap_info_get(entry); | 400 | p = swap_info_get(entry); |
399 | if (p) { | 401 | if (p) { |
400 | if (swap_entry_free(p, swp_offset(entry)) == 1) { | 402 | if (swap_entry_free(p, swp_offset(entry)) == 1) { |
@@ -615,15 +617,6 @@ static int unuse_mm(struct mm_struct *mm, | |||
615 | return 0; | 617 | return 0; |
616 | } | 618 | } |
617 | 619 | ||
618 | #ifdef CONFIG_MIGRATION | ||
619 | int remove_vma_swap(struct vm_area_struct *vma, struct page *page) | ||
620 | { | ||
621 | swp_entry_t entry = { .val = page_private(page) }; | ||
622 | |||
623 | return unuse_vma(vma, entry, page); | ||
624 | } | ||
625 | #endif | ||
626 | |||
627 | /* | 620 | /* |
628 | * Scan swap_map from current position to next entry still in use. | 621 | * Scan swap_map from current position to next entry still in use. |
629 | * Recycle to start on reaching the end, returning 0 when empty. | 622 | * Recycle to start on reaching the end, returning 0 when empty. |
@@ -716,7 +709,6 @@ static int try_to_unuse(unsigned int type) | |||
716 | */ | 709 | */ |
717 | swap_map = &si->swap_map[i]; | 710 | swap_map = &si->swap_map[i]; |
718 | entry = swp_entry(type, i); | 711 | entry = swp_entry(type, i); |
719 | again: | ||
720 | page = read_swap_cache_async(entry, NULL, 0); | 712 | page = read_swap_cache_async(entry, NULL, 0); |
721 | if (!page) { | 713 | if (!page) { |
722 | /* | 714 | /* |
@@ -751,12 +743,6 @@ again: | |||
751 | wait_on_page_locked(page); | 743 | wait_on_page_locked(page); |
752 | wait_on_page_writeback(page); | 744 | wait_on_page_writeback(page); |
753 | lock_page(page); | 745 | lock_page(page); |
754 | if (!PageSwapCache(page)) { | ||
755 | /* Page migration has occured */ | ||
756 | unlock_page(page); | ||
757 | page_cache_release(page); | ||
758 | goto again; | ||
759 | } | ||
760 | wait_on_page_writeback(page); | 746 | wait_on_page_writeback(page); |
761 | 747 | ||
762 | /* | 748 | /* |
@@ -785,10 +771,8 @@ again: | |||
785 | while (*swap_map > 1 && !retval && | 771 | while (*swap_map > 1 && !retval && |
786 | (p = p->next) != &start_mm->mmlist) { | 772 | (p = p->next) != &start_mm->mmlist) { |
787 | mm = list_entry(p, struct mm_struct, mmlist); | 773 | mm = list_entry(p, struct mm_struct, mmlist); |
788 | if (atomic_inc_return(&mm->mm_users) == 1) { | 774 | if (!atomic_inc_not_zero(&mm->mm_users)) |
789 | atomic_dec(&mm->mm_users); | ||
790 | continue; | 775 | continue; |
791 | } | ||
792 | spin_unlock(&mmlist_lock); | 776 | spin_unlock(&mmlist_lock); |
793 | mmput(prev_mm); | 777 | mmput(prev_mm); |
794 | prev_mm = mm; | 778 | prev_mm = mm; |
@@ -1407,19 +1391,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1407 | if (!(p->flags & SWP_USED)) | 1391 | if (!(p->flags & SWP_USED)) |
1408 | break; | 1392 | break; |
1409 | error = -EPERM; | 1393 | error = -EPERM; |
1410 | /* | 1394 | if (type >= MAX_SWAPFILES) { |
1411 | * Test if adding another swap device is possible. There are | ||
1412 | * two limiting factors: 1) the number of bits for the swap | ||
1413 | * type swp_entry_t definition and 2) the number of bits for | ||
1414 | * the swap type in the swap ptes as defined by the different | ||
1415 | * architectures. To honor both limitations a swap entry | ||
1416 | * with swap offset 0 and swap type ~0UL is created, encoded | ||
1417 | * to a swap pte, decoded to a swp_entry_t again and finally | ||
1418 | * the swap type part is extracted. This will mask all bits | ||
1419 | * from the initial ~0UL that can't be encoded in either the | ||
1420 | * swp_entry_t or the architecture definition of a swap pte. | ||
1421 | */ | ||
1422 | if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) { | ||
1423 | spin_unlock(&swap_lock); | 1395 | spin_unlock(&swap_lock); |
1424 | goto out; | 1396 | goto out; |
1425 | } | 1397 | } |
@@ -1504,8 +1476,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1504 | error = -EINVAL; | 1476 | error = -EINVAL; |
1505 | goto bad_swap; | 1477 | goto bad_swap; |
1506 | } | 1478 | } |
1507 | page = read_cache_page(mapping, 0, | 1479 | page = read_mapping_page(mapping, 0, swap_file); |
1508 | (filler_t *)mapping->a_ops->readpage, swap_file); | ||
1509 | if (IS_ERR(page)) { | 1480 | if (IS_ERR(page)) { |
1510 | error = PTR_ERR(page); | 1481 | error = PTR_ERR(page); |
1511 | goto bad_swap; | 1482 | goto bad_swap; |
@@ -1709,6 +1680,9 @@ int swap_duplicate(swp_entry_t entry) | |||
1709 | unsigned long offset, type; | 1680 | unsigned long offset, type; |
1710 | int result = 0; | 1681 | int result = 0; |
1711 | 1682 | ||
1683 | if (is_migration_entry(entry)) | ||
1684 | return 1; | ||
1685 | |||
1712 | type = swp_type(entry); | 1686 | type = swp_type(entry); |
1713 | if (type >= nr_swapfiles) | 1687 | if (type >= nr_swapfiles) |
1714 | goto bad_file; | 1688 | goto bad_file; |
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c index f9d6a9cc91c4..5f2cbf0f153c 100644 --- a/mm/tiny-shmem.c +++ b/mm/tiny-shmem.c | |||
@@ -12,7 +12,6 @@ | |||
12 | 12 | ||
13 | #include <linux/fs.h> | 13 | #include <linux/fs.h> |
14 | #include <linux/init.h> | 14 | #include <linux/init.h> |
15 | #include <linux/devfs_fs_kernel.h> | ||
16 | #include <linux/vfs.h> | 15 | #include <linux/vfs.h> |
17 | #include <linux/mount.h> | 16 | #include <linux/mount.h> |
18 | #include <linux/file.h> | 17 | #include <linux/file.h> |
@@ -33,9 +32,6 @@ static int __init init_tmpfs(void) | |||
33 | { | 32 | { |
34 | BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); | 33 | BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); |
35 | 34 | ||
36 | #ifdef CONFIG_TMPFS | ||
37 | devfs_mk_dir("shm"); | ||
38 | #endif | ||
39 | shm_mnt = kern_mount(&tmpfs_fs_type); | 35 | shm_mnt = kern_mount(&tmpfs_fs_type); |
40 | BUG_ON(IS_ERR(shm_mnt)); | 36 | BUG_ON(IS_ERR(shm_mnt)); |
41 | 37 | ||
diff --git a/mm/truncate.c b/mm/truncate.c index 6cb3fff25f67..cf1b015df4a7 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -230,14 +230,24 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
230 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 230 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { |
231 | for (i = 0; i < pagevec_count(&pvec); i++) { | 231 | for (i = 0; i < pagevec_count(&pvec); i++) { |
232 | struct page *page = pvec.pages[i]; | 232 | struct page *page = pvec.pages[i]; |
233 | pgoff_t index; | ||
234 | int lock_failed; | ||
233 | 235 | ||
234 | if (TestSetPageLocked(page)) { | 236 | lock_failed = TestSetPageLocked(page); |
235 | next++; | 237 | |
236 | continue; | 238 | /* |
237 | } | 239 | * We really shouldn't be looking at the ->index of an |
238 | if (page->index > next) | 240 | * unlocked page. But we're not allowed to lock these |
239 | next = page->index; | 241 | * pages. So we rely upon nobody altering the ->index |
242 | * of this (pinned-by-us) page. | ||
243 | */ | ||
244 | index = page->index; | ||
245 | if (index > next) | ||
246 | next = index; | ||
240 | next++; | 247 | next++; |
248 | if (lock_failed) | ||
249 | continue; | ||
250 | |||
241 | if (PageDirty(page) || PageWriteback(page)) | 251 | if (PageDirty(page) || PageWriteback(page)) |
242 | goto unlock; | 252 | goto unlock; |
243 | if (page_mapped(page)) | 253 | if (page_mapped(page)) |
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c0504f1e34eb..35f8553f893a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -257,6 +257,19 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int | |||
257 | } | 257 | } |
258 | 258 | ||
259 | /* Caller must hold vmlist_lock */ | 259 | /* Caller must hold vmlist_lock */ |
260 | static struct vm_struct *__find_vm_area(void *addr) | ||
261 | { | ||
262 | struct vm_struct *tmp; | ||
263 | |||
264 | for (tmp = vmlist; tmp != NULL; tmp = tmp->next) { | ||
265 | if (tmp->addr == addr) | ||
266 | break; | ||
267 | } | ||
268 | |||
269 | return tmp; | ||
270 | } | ||
271 | |||
272 | /* Caller must hold vmlist_lock */ | ||
260 | struct vm_struct *__remove_vm_area(void *addr) | 273 | struct vm_struct *__remove_vm_area(void *addr) |
261 | { | 274 | { |
262 | struct vm_struct **p, *tmp; | 275 | struct vm_struct **p, *tmp; |
@@ -498,11 +511,33 @@ EXPORT_SYMBOL(__vmalloc); | |||
498 | */ | 511 | */ |
499 | void *vmalloc(unsigned long size) | 512 | void *vmalloc(unsigned long size) |
500 | { | 513 | { |
501 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); | 514 | return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); |
502 | } | 515 | } |
503 | EXPORT_SYMBOL(vmalloc); | 516 | EXPORT_SYMBOL(vmalloc); |
504 | 517 | ||
505 | /** | 518 | /** |
519 | * vmalloc_user - allocate virtually contiguous memory which has | ||
520 | * been zeroed so it can be mapped to userspace without | ||
521 | * leaking data. | ||
522 | * | ||
523 | * @size: allocation size | ||
524 | */ | ||
525 | void *vmalloc_user(unsigned long size) | ||
526 | { | ||
527 | struct vm_struct *area; | ||
528 | void *ret; | ||
529 | |||
530 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); | ||
531 | write_lock(&vmlist_lock); | ||
532 | area = __find_vm_area(ret); | ||
533 | area->flags |= VM_USERMAP; | ||
534 | write_unlock(&vmlist_lock); | ||
535 | |||
536 | return ret; | ||
537 | } | ||
538 | EXPORT_SYMBOL(vmalloc_user); | ||
539 | |||
540 | /** | ||
506 | * vmalloc_node - allocate memory on a specific node | 541 | * vmalloc_node - allocate memory on a specific node |
507 | * | 542 | * |
508 | * @size: allocation size | 543 | * @size: allocation size |
@@ -516,7 +551,7 @@ EXPORT_SYMBOL(vmalloc); | |||
516 | */ | 551 | */ |
517 | void *vmalloc_node(unsigned long size, int node) | 552 | void *vmalloc_node(unsigned long size, int node) |
518 | { | 553 | { |
519 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); | 554 | return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); |
520 | } | 555 | } |
521 | EXPORT_SYMBOL(vmalloc_node); | 556 | EXPORT_SYMBOL(vmalloc_node); |
522 | 557 | ||
@@ -556,6 +591,28 @@ void *vmalloc_32(unsigned long size) | |||
556 | } | 591 | } |
557 | EXPORT_SYMBOL(vmalloc_32); | 592 | EXPORT_SYMBOL(vmalloc_32); |
558 | 593 | ||
594 | /** | ||
595 | * vmalloc_32_user - allocate virtually contiguous memory (32bit | ||
596 | * addressable) which is zeroed so it can be | ||
597 | * mapped to userspace without leaking data. | ||
598 | * | ||
599 | * @size: allocation size | ||
600 | */ | ||
601 | void *vmalloc_32_user(unsigned long size) | ||
602 | { | ||
603 | struct vm_struct *area; | ||
604 | void *ret; | ||
605 | |||
606 | ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL); | ||
607 | write_lock(&vmlist_lock); | ||
608 | area = __find_vm_area(ret); | ||
609 | area->flags |= VM_USERMAP; | ||
610 | write_unlock(&vmlist_lock); | ||
611 | |||
612 | return ret; | ||
613 | } | ||
614 | EXPORT_SYMBOL(vmalloc_32_user); | ||
615 | |||
559 | long vread(char *buf, char *addr, unsigned long count) | 616 | long vread(char *buf, char *addr, unsigned long count) |
560 | { | 617 | { |
561 | struct vm_struct *tmp; | 618 | struct vm_struct *tmp; |
@@ -630,3 +687,64 @@ finished: | |||
630 | read_unlock(&vmlist_lock); | 687 | read_unlock(&vmlist_lock); |
631 | return buf - buf_start; | 688 | return buf - buf_start; |
632 | } | 689 | } |
690 | |||
691 | /** | ||
692 | * remap_vmalloc_range - map vmalloc pages to userspace | ||
693 | * | ||
694 | * @vma: vma to cover (map full range of vma) | ||
695 | * @addr: vmalloc memory | ||
696 | * @pgoff: number of pages into addr before first page to map | ||
697 | * @returns: 0 for success, -Exxx on failure | ||
698 | * | ||
699 | * This function checks that addr is a valid vmalloc'ed area, and | ||
700 | * that it is big enough to cover the vma. Will return failure if | ||
701 | * those criteria aren't met. | ||
702 | * | ||
703 | * Similar to remap_pfn_range (see mm/memory.c) | ||
704 | */ | ||
705 | int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | ||
706 | unsigned long pgoff) | ||
707 | { | ||
708 | struct vm_struct *area; | ||
709 | unsigned long uaddr = vma->vm_start; | ||
710 | unsigned long usize = vma->vm_end - vma->vm_start; | ||
711 | int ret; | ||
712 | |||
713 | if ((PAGE_SIZE-1) & (unsigned long)addr) | ||
714 | return -EINVAL; | ||
715 | |||
716 | read_lock(&vmlist_lock); | ||
717 | area = __find_vm_area(addr); | ||
718 | if (!area) | ||
719 | goto out_einval_locked; | ||
720 | |||
721 | if (!(area->flags & VM_USERMAP)) | ||
722 | goto out_einval_locked; | ||
723 | |||
724 | if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) | ||
725 | goto out_einval_locked; | ||
726 | read_unlock(&vmlist_lock); | ||
727 | |||
728 | addr += pgoff << PAGE_SHIFT; | ||
729 | do { | ||
730 | struct page *page = vmalloc_to_page(addr); | ||
731 | ret = vm_insert_page(vma, uaddr, page); | ||
732 | if (ret) | ||
733 | return ret; | ||
734 | |||
735 | uaddr += PAGE_SIZE; | ||
736 | addr += PAGE_SIZE; | ||
737 | usize -= PAGE_SIZE; | ||
738 | } while (usize > 0); | ||
739 | |||
740 | /* Prevent "things" like memory migration? VM_flags need a cleanup... */ | ||
741 | vma->vm_flags |= VM_RESERVED; | ||
742 | |||
743 | return ret; | ||
744 | |||
745 | out_einval_locked: | ||
746 | read_unlock(&vmlist_lock); | ||
747 | return -EINVAL; | ||
748 | } | ||
749 | EXPORT_SYMBOL(remap_vmalloc_range); | ||
750 | |||
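Taken together, the new vmalloc_user()/vmalloc_32_user() allocators (which zero the memory and tag the area VM_USERMAP) and remap_vmalloc_range() (which refuses any area not so tagged) are aimed at drivers that want to hand a vmalloc'ed buffer to user space from their mmap handler. Below is a kernel-context sketch of that pairing for a hypothetical driver; the names, the buffer size, and the surrounding file_operations wiring are invented for illustration and the snippet only compiles inside a kernel tree.

```c
#include <linux/errno.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

#define MYDRV_BUF_SIZE (64 * 1024)      /* illustrative size */

static void *mydrv_buf;                 /* allocated at init time */

static int mydrv_alloc_buffer(void)
{
	/* zeroed, and flagged VM_USERMAP so it may be mapped to user space */
	mydrv_buf = vmalloc_user(MYDRV_BUF_SIZE);
	return mydrv_buf ? 0 : -ENOMEM;
}

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	/* fails with -EINVAL unless the area came from vmalloc_*_user() */
	return remap_vmalloc_range(vma, mydrv_buf, vma->vm_pgoff);
}

static void mydrv_free_buffer(void)
{
	vfree(mydrv_buf);
}
```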
diff --git a/mm/vmscan.c b/mm/vmscan.c index 440a733fe2e9..ff2ebe9458a3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include <linux/notifier.h> | 34 | #include <linux/notifier.h> |
35 | #include <linux/rwsem.h> | 35 | #include <linux/rwsem.h> |
36 | #include <linux/delay.h> | 36 | #include <linux/delay.h> |
37 | #include <linux/kthread.h> | ||
37 | 38 | ||
38 | #include <asm/tlbflush.h> | 39 | #include <asm/tlbflush.h> |
39 | #include <asm/div64.h> | 40 | #include <asm/div64.h> |
@@ -46,8 +47,6 @@ struct scan_control { | |||
46 | /* Incremented by the number of inactive pages that were scanned */ | 47 | /* Incremented by the number of inactive pages that were scanned */ |
47 | unsigned long nr_scanned; | 48 | unsigned long nr_scanned; |
48 | 49 | ||
49 | unsigned long nr_mapped; /* From page_state */ | ||
50 | |||
51 | /* This context's GFP mask */ | 50 | /* This context's GFP mask */ |
52 | gfp_t gfp_mask; | 51 | gfp_t gfp_mask; |
53 | 52 | ||
@@ -61,6 +60,8 @@ struct scan_control { | |||
61 | * In this context, it doesn't matter that we scan the | 60 | * In this context, it doesn't matter that we scan the |
62 | * whole list at once. */ | 61 | * whole list at once. */ |
63 | int swap_cluster_max; | 62 | int swap_cluster_max; |
63 | |||
64 | int swappiness; | ||
64 | }; | 65 | }; |
65 | 66 | ||
66 | /* | 67 | /* |
@@ -108,7 +109,7 @@ struct shrinker { | |||
108 | * From 0 .. 100. Higher means more swappy. | 109 | * From 0 .. 100. Higher means more swappy. |
109 | */ | 110 | */ |
110 | int vm_swappiness = 60; | 111 | int vm_swappiness = 60; |
111 | static long total_memory; | 112 | long vm_total_pages; /* The total number of pages which the VM controls */ |
112 | 113 | ||
113 | static LIST_HEAD(shrinker_list); | 114 | static LIST_HEAD(shrinker_list); |
114 | static DECLARE_RWSEM(shrinker_rwsem); | 115 | static DECLARE_RWSEM(shrinker_rwsem); |
@@ -214,7 +215,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, | |||
214 | break; | 215 | break; |
215 | if (shrink_ret < nr_before) | 216 | if (shrink_ret < nr_before) |
216 | ret += nr_before - shrink_ret; | 217 | ret += nr_before - shrink_ret; |
217 | mod_page_state(slabs_scanned, this_scan); | 218 | count_vm_events(SLABS_SCANNED, this_scan); |
218 | total_scan -= this_scan; | 219 | total_scan -= this_scan; |
219 | 220 | ||
220 | cond_resched(); | 221 | cond_resched(); |
@@ -288,11 +289,23 @@ static void handle_write_error(struct address_space *mapping, | |||
288 | unlock_page(page); | 289 | unlock_page(page); |
289 | } | 290 | } |
290 | 291 | ||
292 | /* possible outcome of pageout() */ | ||
293 | typedef enum { | ||
294 | /* failed to write page out, page is locked */ | ||
295 | PAGE_KEEP, | ||
296 | /* move page to the active list, page is locked */ | ||
297 | PAGE_ACTIVATE, | ||
298 | /* page has been sent to the disk successfully, page is unlocked */ | ||
299 | PAGE_SUCCESS, | ||
300 | /* page is clean and locked */ | ||
301 | PAGE_CLEAN, | ||
302 | } pageout_t; | ||
303 | |||
291 | /* | 304 | /* |
292 | * pageout is called by shrink_page_list() for each dirty page. | 305 | * pageout is called by shrink_page_list() for each dirty page. |
293 | * Calls ->writepage(). | 306 | * Calls ->writepage(). |
294 | */ | 307 | */ |
295 | pageout_t pageout(struct page *page, struct address_space *mapping) | 308 | static pageout_t pageout(struct page *page, struct address_space *mapping) |
296 | { | 309 | { |
297 | /* | 310 | /* |
298 | * If the page is dirty, only perform writeback if that write | 311 | * If the page is dirty, only perform writeback if that write |
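Note: each pageout_t value introduced above tells the caller what to do next and, importantly, whether the page is still locked. The fragment below is only an illustration of how a reclaim loop can dispatch on the result; the goto labels are placeholders for whatever the caller's "keep page", "reactivate page" and "try to free" paths are called, and this is not a quote of shrink_page_list().

	/* Illustrative only: reacting to pageout() in a reclaim path. */
	switch (pageout(page, mapping)) {
	case PAGE_KEEP:
		/* write failed, page still locked: keep it as-is */
		goto keep_locked;
	case PAGE_ACTIVATE:
		/* not worth writing now, page still locked: reactivate it */
		goto activate_locked;
	case PAGE_SUCCESS:
		/* writeback was started and the page was unlocked for us */
		goto keep;
	case PAGE_CLEAN:
		/* page is clean and locked: fall through and try to free it */
		break;
	}
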
@@ -337,6 +350,8 @@ pageout_t pageout(struct page *page, struct address_space *mapping) | |||
337 | struct writeback_control wbc = { | 350 | struct writeback_control wbc = { |
338 | .sync_mode = WB_SYNC_NONE, | 351 | .sync_mode = WB_SYNC_NONE, |
339 | .nr_to_write = SWAP_CLUSTER_MAX, | 352 | .nr_to_write = SWAP_CLUSTER_MAX, |
353 | .range_start = 0, | ||
354 | .range_end = LLONG_MAX, | ||
340 | .nonblocking = 1, | 355 | .nonblocking = 1, |
341 | .for_reclaim = 1, | 356 | .for_reclaim = 1, |
342 | }; | 357 | }; |
@@ -554,7 +569,7 @@ keep: | |||
554 | list_splice(&ret_pages, page_list); | 569 | list_splice(&ret_pages, page_list); |
555 | if (pagevec_count(&freed_pvec)) | 570 | if (pagevec_count(&freed_pvec)) |
556 | __pagevec_release_nonlru(&freed_pvec); | 571 | __pagevec_release_nonlru(&freed_pvec); |
557 | mod_page_state(pgactivate, pgactivate); | 572 | count_vm_events(PGACTIVATE, pgactivate); |
558 | return nr_reclaimed; | 573 | return nr_reclaimed; |
559 | } | 574 | } |
560 | 575 | ||
@@ -644,11 +659,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, | |||
644 | nr_reclaimed += nr_freed; | 659 | nr_reclaimed += nr_freed; |
645 | local_irq_disable(); | 660 | local_irq_disable(); |
646 | if (current_is_kswapd()) { | 661 | if (current_is_kswapd()) { |
647 | __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); | 662 | __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); |
648 | __mod_page_state(kswapd_steal, nr_freed); | 663 | __count_vm_events(KSWAPD_STEAL, nr_freed); |
649 | } else | 664 | } else |
650 | __mod_page_state_zone(zone, pgscan_direct, nr_scan); | 665 | __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); |
651 | __mod_page_state_zone(zone, pgsteal, nr_freed); | 666 | __count_vm_events(PGACTIVATE, nr_freed); |
652 | 667 | ||
653 | if (nr_taken == 0) | 668 | if (nr_taken == 0) |
654 | goto done; | 669 | goto done; |
@@ -727,7 +742,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
727 | * how much memory | 742 | * how much memory |
728 | * is mapped. | 743 | * is mapped. |
729 | */ | 744 | */ |
730 | mapped_ratio = (sc->nr_mapped * 100) / total_memory; | 745 | mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + |
746 | global_page_state(NR_ANON_PAGES)) * 100) / | ||
747 | vm_total_pages; | ||
731 | 748 | ||
732 | /* | 749 | /* |
733 | * Now decide how much we really want to unmap some pages. The | 750 | * Now decide how much we really want to unmap some pages. The |
@@ -741,7 +758,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
741 | * A 100% value of vm_swappiness overrides this algorithm | 758 | * A 100% value of vm_swappiness overrides this algorithm |
742 | * altogether. | 759 | * altogether. |
743 | */ | 760 | */ |
744 | swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; | 761 | swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; |
745 | 762 | ||
746 | /* | 763 | /* |
747 | * Now use this metric to decide whether to start moving mapped | 764 | * Now use this metric to decide whether to start moving mapped |
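Note: the heuristic above now derives mapped_ratio from the zoned NR_FILE_MAPPED and NR_ANON_PAGES counters (instead of the removed sc->nr_mapped) and adds the per-scan_control swappiness; mapped pages are only reclaimed once swap_tendency reaches 100. A small standalone arithmetic sketch with made-up numbers:

	#include <stdio.h>

	/* Illustrative numbers only; in the kernel these come from the zoned
	 * VM counters, the zone's distress level and sc->swappiness. */
	int main(void)
	{
		long mapped = 300000, anon = 200000, total_pages = 1000000;
		int distress = 0;             /* low when reclaim is going well */
		int swappiness = 60;          /* default vm_swappiness */

		long mapped_ratio = (mapped + anon) * 100 / total_pages;   /* 50 */
		long swap_tendency = mapped_ratio / 2 + distress + swappiness;

		/* 25 + 0 + 60 = 85 < 100: mapped pages are left alone.
		 * With swappiness = 100 the threshold is always reached. */
		printf("swap_tendency = %ld, reclaim_mapped = %d\n",
		       swap_tendency, swap_tendency >= 100);
		return 0;
	}
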
@@ -824,11 +841,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, | |||
824 | } | 841 | } |
825 | } | 842 | } |
826 | zone->nr_active += pgmoved; | 843 | zone->nr_active += pgmoved; |
827 | spin_unlock(&zone->lru_lock); | ||
828 | 844 | ||
829 | __mod_page_state_zone(zone, pgrefill, pgscanned); | 845 | __count_zone_vm_events(PGREFILL, zone, pgscanned); |
830 | __mod_page_state(pgdeactivate, pgdeactivate); | 846 | __count_vm_events(PGDEACTIVATE, pgdeactivate); |
831 | local_irq_enable(); | 847 | spin_unlock_irq(&zone->lru_lock); |
832 | 848 | ||
833 | pagevec_release(&pvec); | 849 | pagevec_release(&pvec); |
834 | } | 850 | } |
@@ -957,9 +973,10 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
957 | .may_writepage = !laptop_mode, | 973 | .may_writepage = !laptop_mode, |
958 | .swap_cluster_max = SWAP_CLUSTER_MAX, | 974 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
959 | .may_swap = 1, | 975 | .may_swap = 1, |
976 | .swappiness = vm_swappiness, | ||
960 | }; | 977 | }; |
961 | 978 | ||
962 | inc_page_state(allocstall); | 979 | count_vm_event(ALLOCSTALL); |
963 | 980 | ||
964 | for (i = 0; zones[i] != NULL; i++) { | 981 | for (i = 0; zones[i] != NULL; i++) { |
965 | struct zone *zone = zones[i]; | 982 | struct zone *zone = zones[i]; |
@@ -972,7 +989,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) | |||
972 | } | 989 | } |
973 | 990 | ||
974 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { | 991 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
975 | sc.nr_mapped = read_page_state(nr_mapped); | ||
976 | sc.nr_scanned = 0; | 992 | sc.nr_scanned = 0; |
977 | if (!priority) | 993 | if (!priority) |
978 | disable_swap_token(); | 994 | disable_swap_token(); |
@@ -1021,10 +1037,6 @@ out: | |||
1021 | * For kswapd, balance_pgdat() will work across all this node's zones until | 1037 | * For kswapd, balance_pgdat() will work across all this node's zones until |
1022 | * they are all at pages_high. | 1038 | * they are all at pages_high. |
1023 | * | 1039 | * |
1024 | * If `nr_pages' is non-zero then it is the number of pages which are to be | ||
1025 | * reclaimed, regardless of the zone occupancies. This is a software suspend | ||
1026 | * special. | ||
1027 | * | ||
1028 | * Returns the number of pages which were actually freed. | 1040 | * Returns the number of pages which were actually freed. |
1029 | * | 1041 | * |
1030 | * There is special handling here for zones which are full of pinned pages. | 1042 | * There is special handling here for zones which are full of pinned pages. |
@@ -1042,10 +1054,8 @@ out: | |||
1042 | * the page allocator fallback scheme to ensure that aging of pages is balanced | 1054 | * the page allocator fallback scheme to ensure that aging of pages is balanced |
1043 | * across the zones. | 1055 | * across the zones. |
1044 | */ | 1056 | */ |
1045 | static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, | 1057 | static unsigned long balance_pgdat(pg_data_t *pgdat, int order) |
1046 | int order) | ||
1047 | { | 1058 | { |
1048 | unsigned long to_free = nr_pages; | ||
1049 | int all_zones_ok; | 1059 | int all_zones_ok; |
1050 | int priority; | 1060 | int priority; |
1051 | int i; | 1061 | int i; |
@@ -1055,16 +1065,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, | |||
1055 | struct scan_control sc = { | 1065 | struct scan_control sc = { |
1056 | .gfp_mask = GFP_KERNEL, | 1066 | .gfp_mask = GFP_KERNEL, |
1057 | .may_swap = 1, | 1067 | .may_swap = 1, |
1058 | .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX, | 1068 | .swap_cluster_max = SWAP_CLUSTER_MAX, |
1069 | .swappiness = vm_swappiness, | ||
1059 | }; | 1070 | }; |
1060 | 1071 | ||
1061 | loop_again: | 1072 | loop_again: |
1062 | total_scanned = 0; | 1073 | total_scanned = 0; |
1063 | nr_reclaimed = 0; | 1074 | nr_reclaimed = 0; |
1064 | sc.may_writepage = !laptop_mode; | 1075 | sc.may_writepage = !laptop_mode; |
1065 | sc.nr_mapped = read_page_state(nr_mapped); | 1076 | count_vm_event(PAGEOUTRUN); |
1066 | |||
1067 | inc_page_state(pageoutrun); | ||
1068 | 1077 | ||
1069 | for (i = 0; i < pgdat->nr_zones; i++) { | 1078 | for (i = 0; i < pgdat->nr_zones; i++) { |
1070 | struct zone *zone = pgdat->node_zones + i; | 1079 | struct zone *zone = pgdat->node_zones + i; |
@@ -1082,31 +1091,26 @@ loop_again: | |||
1082 | 1091 | ||
1083 | all_zones_ok = 1; | 1092 | all_zones_ok = 1; |
1084 | 1093 | ||
1085 | if (nr_pages == 0) { | 1094 | /* |
1086 | /* | 1095 | * Scan in the highmem->dma direction for the highest |
1087 | * Scan in the highmem->dma direction for the highest | 1096 | * zone which needs scanning |
1088 | * zone which needs scanning | 1097 | */ |
1089 | */ | 1098 | for (i = pgdat->nr_zones - 1; i >= 0; i--) { |
1090 | for (i = pgdat->nr_zones - 1; i >= 0; i--) { | 1099 | struct zone *zone = pgdat->node_zones + i; |
1091 | struct zone *zone = pgdat->node_zones + i; | ||
1092 | 1100 | ||
1093 | if (!populated_zone(zone)) | 1101 | if (!populated_zone(zone)) |
1094 | continue; | 1102 | continue; |
1095 | 1103 | ||
1096 | if (zone->all_unreclaimable && | 1104 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
1097 | priority != DEF_PRIORITY) | 1105 | continue; |
1098 | continue; | ||
1099 | 1106 | ||
1100 | if (!zone_watermark_ok(zone, order, | 1107 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
1101 | zone->pages_high, 0, 0)) { | 1108 | 0, 0)) { |
1102 | end_zone = i; | 1109 | end_zone = i; |
1103 | goto scan; | 1110 | goto scan; |
1104 | } | ||
1105 | } | 1111 | } |
1106 | goto out; | ||
1107 | } else { | ||
1108 | end_zone = pgdat->nr_zones - 1; | ||
1109 | } | 1112 | } |
1113 | goto out; | ||
1110 | scan: | 1114 | scan: |
1111 | for (i = 0; i <= end_zone; i++) { | 1115 | for (i = 0; i <= end_zone; i++) { |
1112 | struct zone *zone = pgdat->node_zones + i; | 1116 | struct zone *zone = pgdat->node_zones + i; |
@@ -1133,11 +1137,9 @@ scan: | |||
1133 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) | 1137 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
1134 | continue; | 1138 | continue; |
1135 | 1139 | ||
1136 | if (nr_pages == 0) { /* Not software suspend */ | 1140 | if (!zone_watermark_ok(zone, order, zone->pages_high, |
1137 | if (!zone_watermark_ok(zone, order, | 1141 | end_zone, 0)) |
1138 | zone->pages_high, end_zone, 0)) | 1142 | all_zones_ok = 0; |
1139 | all_zones_ok = 0; | ||
1140 | } | ||
1141 | zone->temp_priority = priority; | 1143 | zone->temp_priority = priority; |
1142 | if (zone->prev_priority > priority) | 1144 | if (zone->prev_priority > priority) |
1143 | zone->prev_priority = priority; | 1145 | zone->prev_priority = priority; |
@@ -1162,8 +1164,6 @@ scan: | |||
1162 | total_scanned > nr_reclaimed + nr_reclaimed / 2) | 1164 | total_scanned > nr_reclaimed + nr_reclaimed / 2) |
1163 | sc.may_writepage = 1; | 1165 | sc.may_writepage = 1; |
1164 | } | 1166 | } |
1165 | if (nr_pages && to_free > nr_reclaimed) | ||
1166 | continue; /* swsusp: need to do more work */ | ||
1167 | if (all_zones_ok) | 1167 | if (all_zones_ok) |
1168 | break; /* kswapd: all done */ | 1168 | break; /* kswapd: all done */ |
1169 | /* | 1169 | /* |
@@ -1179,7 +1179,7 @@ scan: | |||
1179 | * matches the direct reclaim path behaviour in terms of impact | 1179 | * matches the direct reclaim path behaviour in terms of impact |
1180 | * on zone->*_priority. | 1180 | * on zone->*_priority. |
1181 | */ | 1181 | */ |
1182 | if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages) | 1182 | if (nr_reclaimed >= SWAP_CLUSTER_MAX) |
1183 | break; | 1183 | break; |
1184 | } | 1184 | } |
1185 | out: | 1185 | out: |
@@ -1220,7 +1220,6 @@ static int kswapd(void *p) | |||
1220 | }; | 1220 | }; |
1221 | cpumask_t cpumask; | 1221 | cpumask_t cpumask; |
1222 | 1222 | ||
1223 | daemonize("kswapd%d", pgdat->node_id); | ||
1224 | cpumask = node_to_cpumask(pgdat->node_id); | 1223 | cpumask = node_to_cpumask(pgdat->node_id); |
1225 | if (!cpus_empty(cpumask)) | 1224 | if (!cpus_empty(cpumask)) |
1226 | set_cpus_allowed(tsk, cpumask); | 1225 | set_cpus_allowed(tsk, cpumask); |
@@ -1261,7 +1260,7 @@ static int kswapd(void *p) | |||
1261 | } | 1260 | } |
1262 | finish_wait(&pgdat->kswapd_wait, &wait); | 1261 | finish_wait(&pgdat->kswapd_wait, &wait); |
1263 | 1262 | ||
1264 | balance_pgdat(pgdat, 0, order); | 1263 | balance_pgdat(pgdat, order); |
1265 | } | 1264 | } |
1266 | return 0; | 1265 | return 0; |
1267 | } | 1266 | } |
@@ -1290,35 +1289,152 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
1290 | 1289 | ||
1291 | #ifdef CONFIG_PM | 1290 | #ifdef CONFIG_PM |
1292 | /* | 1291 | /* |
1293 | * Try to free `nr_pages' of memory, system-wide. Returns the number of freed | 1292 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages |
1294 | * pages. | 1293 | * from LRU lists system-wide, for given pass and priority, and returns the |
1294 | * number of reclaimed pages | ||
1295 | * | ||
1296 | * For pass > 3 we also try to shrink the LRU lists that contain a few pages | ||
1297 | */ | ||
1298 | static unsigned long shrink_all_zones(unsigned long nr_pages, int pass, | ||
1299 | int prio, struct scan_control *sc) | ||
1300 | { | ||
1301 | struct zone *zone; | ||
1302 | unsigned long nr_to_scan, ret = 0; | ||
1303 | |||
1304 | for_each_zone(zone) { | ||
1305 | |||
1306 | if (!populated_zone(zone)) | ||
1307 | continue; | ||
1308 | |||
1309 | if (zone->all_unreclaimable && prio != DEF_PRIORITY) | ||
1310 | continue; | ||
1311 | |||
1312 | /* For pass = 0 we don't shrink the active list */ | ||
1313 | if (pass > 0) { | ||
1314 | zone->nr_scan_active += (zone->nr_active >> prio) + 1; | ||
1315 | if (zone->nr_scan_active >= nr_pages || pass > 3) { | ||
1316 | zone->nr_scan_active = 0; | ||
1317 | nr_to_scan = min(nr_pages, zone->nr_active); | ||
1318 | shrink_active_list(nr_to_scan, zone, sc); | ||
1319 | } | ||
1320 | } | ||
1321 | |||
1322 | zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1; | ||
1323 | if (zone->nr_scan_inactive >= nr_pages || pass > 3) { | ||
1324 | zone->nr_scan_inactive = 0; | ||
1325 | nr_to_scan = min(nr_pages, zone->nr_inactive); | ||
1326 | ret += shrink_inactive_list(nr_to_scan, zone, sc); | ||
1327 | if (ret >= nr_pages) | ||
1328 | return ret; | ||
1329 | } | ||
1330 | } | ||
1331 | |||
1332 | return ret; | ||
1333 | } | ||
1334 | |||
1335 | /* | ||
1336 | * Try to free `nr_pages' of memory, system-wide, and return the number of | ||
1337 | * freed pages. | ||
1338 | * | ||
1339 | * Rather than trying to age LRUs the aim is to preserve the overall | ||
1340 | * LRU order by reclaiming preferentially | ||
1341 | * inactive > active > active referenced > active mapped | ||
1295 | */ | 1342 | */ |
1296 | unsigned long shrink_all_memory(unsigned long nr_pages) | 1343 | unsigned long shrink_all_memory(unsigned long nr_pages) |
1297 | { | 1344 | { |
1298 | pg_data_t *pgdat; | 1345 | unsigned long lru_pages, nr_slab; |
1299 | unsigned long nr_to_free = nr_pages; | ||
1300 | unsigned long ret = 0; | 1346 | unsigned long ret = 0; |
1301 | unsigned retry = 2; | 1347 | int pass; |
1302 | struct reclaim_state reclaim_state = { | 1348 | struct reclaim_state reclaim_state; |
1303 | .reclaimed_slab = 0, | 1349 | struct zone *zone; |
1350 | struct scan_control sc = { | ||
1351 | .gfp_mask = GFP_KERNEL, | ||
1352 | .may_swap = 0, | ||
1353 | .swap_cluster_max = nr_pages, | ||
1354 | .may_writepage = 1, | ||
1355 | .swappiness = vm_swappiness, | ||
1304 | }; | 1356 | }; |
1305 | 1357 | ||
1306 | current->reclaim_state = &reclaim_state; | 1358 | current->reclaim_state = &reclaim_state; |
1307 | repeat: | 1359 | |
1308 | for_each_online_pgdat(pgdat) { | 1360 | lru_pages = 0; |
1309 | unsigned long freed; | 1361 | for_each_zone(zone) |
1310 | 1362 | lru_pages += zone->nr_active + zone->nr_inactive; | |
1311 | freed = balance_pgdat(pgdat, nr_to_free, 0); | 1363 | |
1312 | ret += freed; | 1364 | nr_slab = global_page_state(NR_SLAB); |
1313 | nr_to_free -= freed; | 1365 | /* If slab caches are huge, it's better to hit them first */ |
1314 | if ((long)nr_to_free <= 0) | 1366 | while (nr_slab >= lru_pages) { |
1367 | reclaim_state.reclaimed_slab = 0; | ||
1368 | shrink_slab(nr_pages, sc.gfp_mask, lru_pages); | ||
1369 | if (!reclaim_state.reclaimed_slab) | ||
1315 | break; | 1370 | break; |
1371 | |||
1372 | ret += reclaim_state.reclaimed_slab; | ||
1373 | if (ret >= nr_pages) | ||
1374 | goto out; | ||
1375 | |||
1376 | nr_slab -= reclaim_state.reclaimed_slab; | ||
1316 | } | 1377 | } |
1317 | if (retry-- && ret < nr_pages) { | 1378 | |
1318 | blk_congestion_wait(WRITE, HZ/5); | 1379 | /* |
1319 | goto repeat; | 1380 | * We try to shrink LRUs in 5 passes: |
1381 | * 0 = Reclaim from inactive_list only | ||
1382 | * 1 = Reclaim from active list but don't reclaim mapped | ||
1383 | * 2 = 2nd pass of type 1 | ||
1384 | * 3 = Reclaim mapped (normal reclaim) | ||
1385 | * 4 = 2nd pass of type 3 | ||
1386 | */ | ||
1387 | for (pass = 0; pass < 5; pass++) { | ||
1388 | int prio; | ||
1389 | |||
1390 | /* Needed for shrinking slab caches later on */ | ||
1391 | if (!lru_pages) | ||
1392 | for_each_zone(zone) { | ||
1393 | lru_pages += zone->nr_active; | ||
1394 | lru_pages += zone->nr_inactive; | ||
1395 | } | ||
1396 | |||
1397 | /* Force reclaiming mapped pages in the passes #3 and #4 */ | ||
1398 | if (pass > 2) { | ||
1399 | sc.may_swap = 1; | ||
1400 | sc.swappiness = 100; | ||
1401 | } | ||
1402 | |||
1403 | for (prio = DEF_PRIORITY; prio >= 0; prio--) { | ||
1404 | unsigned long nr_to_scan = nr_pages - ret; | ||
1405 | |||
1406 | sc.nr_scanned = 0; | ||
1407 | ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); | ||
1408 | if (ret >= nr_pages) | ||
1409 | goto out; | ||
1410 | |||
1411 | reclaim_state.reclaimed_slab = 0; | ||
1412 | shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages); | ||
1413 | ret += reclaim_state.reclaimed_slab; | ||
1414 | if (ret >= nr_pages) | ||
1415 | goto out; | ||
1416 | |||
1417 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) | ||
1418 | blk_congestion_wait(WRITE, HZ / 10); | ||
1419 | } | ||
1420 | |||
1421 | lru_pages = 0; | ||
1320 | } | 1422 | } |
1423 | |||
1424 | /* | ||
1425 | * If ret = 0, we could not shrink LRUs, but there may be something | ||
1426 | * in slab caches | ||
1427 | */ | ||
1428 | if (!ret) | ||
1429 | do { | ||
1430 | reclaim_state.reclaimed_slab = 0; | ||
1431 | shrink_slab(nr_pages, sc.gfp_mask, lru_pages); | ||
1432 | ret += reclaim_state.reclaimed_slab; | ||
1433 | } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); | ||
1434 | |||
1435 | out: | ||
1321 | current->reclaim_state = NULL; | 1436 | current->reclaim_state = NULL; |
1437 | |||
1322 | return ret; | 1438 | return ret; |
1323 | } | 1439 | } |
1324 | #endif | 1440 | #endif |
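Note: the rewritten shrink_all_memory() above no longer funnels through balance_pgdat(); it first knocks down oversized slab caches, then makes five passes over the LRU lists, each pass walking priorities from DEF_PRIORITY down to 0 and shrinking slab in step with LRU scanning, with mapped pages only reclaimed in passes 3 and 4. The standalone sketch below shows just that control-flow shape; reclaim_lru() and reclaim_slab() are hypothetical stand-ins for shrink_all_zones() and shrink_slab().

	#include <stdio.h>

	#define DEF_PRIORITY 12

	/* Hypothetical stand-ins for shrink_all_zones() and shrink_slab(). */
	static unsigned long reclaim_lru(unsigned long want, int pass, int prio)
	{
		return want / (prio + 2);
	}

	static unsigned long reclaim_slab(void)
	{
		return 16;
	}

	int main(void)
	{
		unsigned long nr_pages = 1024, ret = 0;
		int pass, prio;

		for (pass = 0; pass < 5 && ret < nr_pages; pass++) {
			/* Passes 0-2 leave mapped pages alone; passes 3-4 force
			 * them out too (sc.may_swap = 1, sc.swappiness = 100). */
			for (prio = DEF_PRIORITY; prio >= 0 && ret < nr_pages; prio--) {
				ret += reclaim_lru(nr_pages - ret, pass, prio);
				ret += reclaim_slab();
			}
		}
		printf("reclaimed %lu of %lu pages\n", ret, nr_pages);
		return 0;
	}
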
@@ -1328,7 +1444,7 @@ repeat: | |||
1328 | not required for correctness. So if the last cpu in a node goes | 1444 | not required for correctness. So if the last cpu in a node goes |
1329 | away, we get changed to run anywhere: as the first one comes back, | 1445 | away, we get changed to run anywhere: as the first one comes back, |
1330 | restore their cpu bindings. */ | 1446 | restore their cpu bindings. */ |
1331 | static int cpu_callback(struct notifier_block *nfb, | 1447 | static int __devinit cpu_callback(struct notifier_block *nfb, |
1332 | unsigned long action, void *hcpu) | 1448 | unsigned long action, void *hcpu) |
1333 | { | 1449 | { |
1334 | pg_data_t *pgdat; | 1450 | pg_data_t *pgdat; |
@@ -1346,21 +1462,35 @@ static int cpu_callback(struct notifier_block *nfb, | |||
1346 | } | 1462 | } |
1347 | #endif /* CONFIG_HOTPLUG_CPU */ | 1463 | #endif /* CONFIG_HOTPLUG_CPU */ |
1348 | 1464 | ||
1465 | /* | ||
1466 | * This kswapd start function will be called by init and node-hot-add. | ||
1467 | * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added. | ||
1468 | */ | ||
1469 | int kswapd_run(int nid) | ||
1470 | { | ||
1471 | pg_data_t *pgdat = NODE_DATA(nid); | ||
1472 | int ret = 0; | ||
1473 | |||
1474 | if (pgdat->kswapd) | ||
1475 | return 0; | ||
1476 | |||
1477 | pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid); | ||
1478 | if (IS_ERR(pgdat->kswapd)) { | ||
1479 | /* failure at boot is fatal */ | ||
1480 | BUG_ON(system_state == SYSTEM_BOOTING); | ||
1481 | printk("Failed to start kswapd on node %d\n",nid); | ||
1482 | ret = -1; | ||
1483 | } | ||
1484 | return ret; | ||
1485 | } | ||
1486 | |||
1349 | static int __init kswapd_init(void) | 1487 | static int __init kswapd_init(void) |
1350 | { | 1488 | { |
1351 | pg_data_t *pgdat; | 1489 | int nid; |
1352 | 1490 | ||
1353 | swap_setup(); | 1491 | swap_setup(); |
1354 | for_each_online_pgdat(pgdat) { | 1492 | for_each_online_node(nid) |
1355 | pid_t pid; | 1493 | kswapd_run(nid); |
1356 | |||
1357 | pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); | ||
1358 | BUG_ON(pid < 0); | ||
1359 | read_lock(&tasklist_lock); | ||
1360 | pgdat->kswapd = find_task_by_pid(pid); | ||
1361 | read_unlock(&tasklist_lock); | ||
1362 | } | ||
1363 | total_memory = nr_free_pagecache_pages(); | ||
1364 | hotcpu_notifier(cpu_callback, 0); | 1494 | hotcpu_notifier(cpu_callback, 0); |
1365 | return 0; | 1495 | return 0; |
1366 | } | 1496 | } |
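Note: kswapd is now started via the kthread API (kthread_run() in kswapd_run(), callable from both boot and node hot-add) instead of kernel_thread() plus daemonize() and a tasklist walk to look up the pid. The sketch below shows the general kthread_run()/kthread_should_stop() pattern for a hypothetical worker thread; it is not the kswapd code itself.

	#include <linux/kthread.h>
	#include <linux/err.h>
	#include <linux/delay.h>

	static struct task_struct *my_task;    /* hypothetical example thread */

	static int my_thread_fn(void *data)
	{
		while (!kthread_should_stop()) {
			/* ... periodic work ... */
			msleep(1000);
		}
		return 0;
	}

	static int __init my_start(void)
	{
		my_task = kthread_run(my_thread_fn, NULL, "mythread/%d", 0);
		if (IS_ERR(my_task))
			return PTR_ERR(my_task);
		return 0;
	}
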
@@ -1387,11 +1517,6 @@ int zone_reclaim_mode __read_mostly; | |||
1387 | #define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ | 1517 | #define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ |
1388 | 1518 | ||
1389 | /* | 1519 | /* |
1390 | * Mininum time between zone reclaim scans | ||
1391 | */ | ||
1392 | int zone_reclaim_interval __read_mostly = 30*HZ; | ||
1393 | |||
1394 | /* | ||
1395 | * Priority for ZONE_RECLAIM. This determines the fraction of pages | 1520 | * Priority for ZONE_RECLAIM. This determines the fraction of pages |
1396 | * of a node considered for each zone_reclaim. 4 scans 1/16th of | 1521 | * of a node considered for each zone_reclaim. 4 scans 1/16th of |
1397 | * a zone. | 1522 | * a zone. |
@@ -1412,10 +1537,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1412 | struct scan_control sc = { | 1537 | struct scan_control sc = { |
1413 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), | 1538 | .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), |
1414 | .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), | 1539 | .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), |
1415 | .nr_mapped = read_page_state(nr_mapped), | ||
1416 | .swap_cluster_max = max_t(unsigned long, nr_pages, | 1540 | .swap_cluster_max = max_t(unsigned long, nr_pages, |
1417 | SWAP_CLUSTER_MAX), | 1541 | SWAP_CLUSTER_MAX), |
1418 | .gfp_mask = gfp_mask, | 1542 | .gfp_mask = gfp_mask, |
1543 | .swappiness = vm_swappiness, | ||
1419 | }; | 1544 | }; |
1420 | 1545 | ||
1421 | disable_swap_token(); | 1546 | disable_swap_token(); |
@@ -1456,16 +1581,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1456 | 1581 | ||
1457 | p->reclaim_state = NULL; | 1582 | p->reclaim_state = NULL; |
1458 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 1583 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); |
1459 | |||
1460 | if (nr_reclaimed == 0) { | ||
1461 | /* | ||
1462 | * We were unable to reclaim enough pages to stay on node. We | ||
1463 | * now allow off node accesses for a certain time period before | ||
1464 | * trying again to reclaim pages from the local zone. | ||
1465 | */ | ||
1466 | zone->last_unsuccessful_zone_reclaim = jiffies; | ||
1467 | } | ||
1468 | |||
1469 | return nr_reclaimed >= nr_pages; | 1584 | return nr_reclaimed >= nr_pages; |
1470 | } | 1585 | } |
1471 | 1586 | ||
@@ -1475,13 +1590,17 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) | |||
1475 | int node_id; | 1590 | int node_id; |
1476 | 1591 | ||
1477 | /* | 1592 | /* |
1478 | * Do not reclaim if there was a recent unsuccessful attempt at zone | 1593 | * Do not reclaim if there are not enough reclaimable pages in this |
1479 | * reclaim. In that case we let allocations go off node for the | 1594 | * zone that would satify this allocations. |
1480 | * zone_reclaim_interval. Otherwise we would scan for each off-node | 1595 | * |
1481 | * page allocation. | 1596 | * All unmapped pagecache pages are reclaimable. |
1597 | * | ||
1598 | * Both counters may be temporarily off a bit so we use | ||
1599 | * SWAP_CLUSTER_MAX as the boundary. It may also be good to | ||
1600 | * leave a few frequently used unmapped pagecache pages around. | ||
1482 | */ | 1601 | */ |
1483 | if (time_before(jiffies, | 1602 | if (zone_page_state(zone, NR_FILE_PAGES) - |
1484 | zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) | 1603 | zone_page_state(zone, NR_FILE_MAPPED) < SWAP_CLUSTER_MAX) |
1485 | return 0; | 1604 | return 0; |
1486 | 1605 | ||
1487 | /* | 1606 | /* |
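Note: the time-based back-off (zone_reclaim_interval, removed above) is replaced by a direct estimate of what zone_reclaim() could actually free: unmapped pagecache, approximated as NR_FILE_PAGES minus NR_FILE_MAPPED, with SWAP_CLUSTER_MAX as slack for per-CPU counter drift. A worked example with made-up counter values:

	#include <stdio.h>

	#define SWAP_CLUSTER_MAX 32

	int main(void)
	{
		/* Hypothetical per-zone counters, in pages. */
		long nr_file_pages  = 5000;   /* all pagecache in the zone      */
		long nr_file_mapped = 4990;   /* pagecache mapped into ptes     */

		long unmapped = nr_file_pages - nr_file_mapped;   /* 10 */

		/* 10 < 32: zone_reclaim() gives up immediately and the
		 * allocation falls back to another zone/node instead. */
		printf("unmapped=%ld -> %s\n", unmapped,
		       unmapped < SWAP_CLUSTER_MAX ? "skip zone_reclaim" : "reclaim");
		return 0;
	}
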
diff --git a/mm/vmstat.c b/mm/vmstat.c new file mode 100644 index 000000000000..73b83d67bab6 --- /dev/null +++ b/mm/vmstat.c | |||
@@ -0,0 +1,614 @@ | |||
1 | /* | ||
2 | * linux/mm/vmstat.c | ||
3 | * | ||
4 | * Manages VM statistics | ||
5 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds | ||
6 | * | ||
7 | * zoned VM statistics | ||
8 | * Copyright (C) 2006 Silicon Graphics, Inc., | ||
9 | * Christoph Lameter <christoph@lameter.com> | ||
10 | */ | ||
11 | |||
12 | #include <linux/config.h> | ||
13 | #include <linux/mm.h> | ||
14 | #include <linux/module.h> | ||
15 | |||
16 | void __get_zone_counts(unsigned long *active, unsigned long *inactive, | ||
17 | unsigned long *free, struct pglist_data *pgdat) | ||
18 | { | ||
19 | struct zone *zones = pgdat->node_zones; | ||
20 | int i; | ||
21 | |||
22 | *active = 0; | ||
23 | *inactive = 0; | ||
24 | *free = 0; | ||
25 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
26 | *active += zones[i].nr_active; | ||
27 | *inactive += zones[i].nr_inactive; | ||
28 | *free += zones[i].free_pages; | ||
29 | } | ||
30 | } | ||
31 | |||
32 | void get_zone_counts(unsigned long *active, | ||
33 | unsigned long *inactive, unsigned long *free) | ||
34 | { | ||
35 | struct pglist_data *pgdat; | ||
36 | |||
37 | *active = 0; | ||
38 | *inactive = 0; | ||
39 | *free = 0; | ||
40 | for_each_online_pgdat(pgdat) { | ||
41 | unsigned long l, m, n; | ||
42 | __get_zone_counts(&l, &m, &n, pgdat); | ||
43 | *active += l; | ||
44 | *inactive += m; | ||
45 | *free += n; | ||
46 | } | ||
47 | } | ||
48 | |||
49 | #ifdef CONFIG_VM_EVENT_COUNTERS | ||
50 | DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; | ||
51 | EXPORT_PER_CPU_SYMBOL(vm_event_states); | ||
52 | |||
53 | static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) | ||
54 | { | ||
55 | int cpu = 0; | ||
56 | int i; | ||
57 | |||
58 | memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); | ||
59 | |||
60 | cpu = first_cpu(*cpumask); | ||
61 | while (cpu < NR_CPUS) { | ||
62 | struct vm_event_state *this = &per_cpu(vm_event_states, cpu); | ||
63 | |||
64 | cpu = next_cpu(cpu, *cpumask); | ||
65 | |||
66 | if (cpu < NR_CPUS) | ||
67 | prefetch(&per_cpu(vm_event_states, cpu)); | ||
68 | |||
69 | |||
70 | for (i = 0; i < NR_VM_EVENT_ITEMS; i++) | ||
71 | ret[i] += this->event[i]; | ||
72 | } | ||
73 | } | ||
74 | |||
75 | /* | ||
76 | * Accumulate the vm event counters across all CPUs. | ||
77 | * The result is unavoidably approximate - it can change | ||
78 | * during and after execution of this function. | ||
79 | */ | ||
80 | void all_vm_events(unsigned long *ret) | ||
81 | { | ||
82 | sum_vm_events(ret, &cpu_online_map); | ||
83 | } | ||
84 | |||
85 | #ifdef CONFIG_HOTPLUG | ||
86 | /* | ||
87 | * Fold the foreign cpu events into our own. | ||
88 | * | ||
89 | * This is adding to the events on one processor | ||
90 | * but keeps the global counts constant. | ||
91 | */ | ||
92 | void vm_events_fold_cpu(int cpu) | ||
93 | { | ||
94 | struct vm_event_state *fold_state = &per_cpu(vm_event_states, cpu); | ||
95 | int i; | ||
96 | |||
97 | for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { | ||
98 | count_vm_events(i, fold_state->event[i]); | ||
99 | fold_state->event[i] = 0; | ||
100 | } | ||
101 | } | ||
102 | #endif /* CONFIG_HOTPLUG */ | ||
103 | |||
104 | #endif /* CONFIG_VM_EVENT_COUNTERS */ | ||
105 | |||
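Note: sum_vm_events() above walks the online-CPU mask, prefetching the next CPU's vm_event_states while accumulating the current one, and vm_events_fold_cpu() moves a dead CPU's counts onto the running CPU so the global sums stay constant across hotplug. A minimal standalone sketch of the summation idea, with plain arrays standing in for per-CPU data:

	#include <stdio.h>
	#include <string.h>

	#define NR_CPUS   4
	#define NR_EVENTS 3

	/* Stand-in for the per-CPU vm_event_states. */
	static unsigned long events[NR_CPUS][NR_EVENTS] = {
		{ 5, 1, 0 }, { 7, 0, 2 }, { 3, 4, 1 }, { 0, 0, 0 },
	};

	static void sum_events(unsigned long *ret)
	{
		int cpu, i;

		memset(ret, 0, NR_EVENTS * sizeof(*ret));
		for (cpu = 0; cpu < NR_CPUS; cpu++)
			for (i = 0; i < NR_EVENTS; i++)
				ret[i] += events[cpu][i];
	}

	int main(void)
	{
		unsigned long total[NR_EVENTS];

		sum_events(total);
		printf("%lu %lu %lu\n", total[0], total[1], total[2]);
		return 0;
	}
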
106 | /* | ||
107 | * Manage combined zone based / global counters | ||
108 | * | ||
109 | * vm_stat contains the global counters | ||
110 | */ | ||
111 | atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; | ||
112 | EXPORT_SYMBOL(vm_stat); | ||
113 | |||
114 | #ifdef CONFIG_SMP | ||
115 | |||
116 | #define STAT_THRESHOLD 32 | ||
117 | |||
118 | /* | ||
119 | * Determine pointer to currently valid differential byte given a zone and | ||
120 | * the item number. | ||
121 | * | ||
122 | * Preemption must be off | ||
123 | */ | ||
124 | static inline s8 *diff_pointer(struct zone *zone, enum zone_stat_item item) | ||
125 | { | ||
126 | return &zone_pcp(zone, smp_processor_id())->vm_stat_diff[item]; | ||
127 | } | ||
128 | |||
129 | /* | ||
130 | * For use when we know that interrupts are disabled. | ||
131 | */ | ||
132 | void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | ||
133 | int delta) | ||
134 | { | ||
135 | s8 *p; | ||
136 | long x; | ||
137 | |||
138 | p = diff_pointer(zone, item); | ||
139 | x = delta + *p; | ||
140 | |||
141 | if (unlikely(x > STAT_THRESHOLD || x < -STAT_THRESHOLD)) { | ||
142 | zone_page_state_add(x, zone, item); | ||
143 | x = 0; | ||
144 | } | ||
145 | |||
146 | *p = x; | ||
147 | } | ||
148 | EXPORT_SYMBOL(__mod_zone_page_state); | ||
149 | |||
150 | /* | ||
151 | * For an unknown interrupt state | ||
152 | */ | ||
153 | void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, | ||
154 | int delta) | ||
155 | { | ||
156 | unsigned long flags; | ||
157 | |||
158 | local_irq_save(flags); | ||
159 | __mod_zone_page_state(zone, item, delta); | ||
160 | local_irq_restore(flags); | ||
161 | } | ||
162 | EXPORT_SYMBOL(mod_zone_page_state); | ||
163 | |||
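Note: the zoned counters keep a small signed per-CPU delta (vm_stat_diff) next to each counter and only fold it into the global atomic once it crosses STAT_THRESHOLD, so the common case is a cheap per-CPU byte update with interrupts (or preemption) off. Below is a standalone sketch of that batching scheme, with a plain long standing in for the atomic vm_stat[] entry; it is an illustration, not the kernel code.

	#include <stdio.h>

	#define STAT_THRESHOLD 32

	static long global_counter;    /* stands in for the atomic vm_stat[] entry */
	static signed char cpu_diff;   /* stands in for the per-CPU vm_stat_diff[] */

	static void mod_counter(int delta)
	{
		long x = cpu_diff + delta;

		if (x > STAT_THRESHOLD || x < -STAT_THRESHOLD) {
			global_counter += x;   /* fold the batched delta */
			x = 0;
		}
		cpu_diff = x;
	}

	int main(void)
	{
		int i;

		for (i = 0; i < 100; i++)
			mod_counter(1);
		/* The global value alone lags slightly; exact = global + pending. */
		printf("global=%ld pending=%d exact=%ld\n",
		       global_counter, cpu_diff, global_counter + cpu_diff);
		return 0;
	}
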
164 | /* | ||
165 | * Optimized increment and decrement functions. | ||
166 | * | ||
167 | * These are only for a single page and therefore can take a struct page * | ||
168 | * argument instead of struct zone *. This allows the inclusion of the code | ||
169 | * generated for page_zone(page) into the optimized functions. | ||
170 | * | ||
171 | * No overflow check is necessary and therefore the differential can be | ||
172 | * incremented or decremented in place which may allow the compilers to | ||
173 | * generate better code. | ||
174 | * | ||
175 | * The increment or decrement is known and therefore one boundary check can | ||
176 | * be omitted. | ||
177 | * | ||
178 | * Some processors have inc/dec instructions that are atomic vs an interrupt. | ||
179 | * However, the code must first determine the differential location in a zone | ||
180 | * based on the processor number and then inc/dec the counter. There is no | ||
181 | * guarantee without disabling preemption that the processor will not change | ||
182 | * in between and therefore the atomicity vs. interrupt cannot be exploited | ||
183 | * in a useful way here. | ||
184 | */ | ||
185 | static void __inc_zone_state(struct zone *zone, enum zone_stat_item item) | ||
186 | { | ||
187 | s8 *p = diff_pointer(zone, item); | ||
188 | |||
189 | (*p)++; | ||
190 | |||
191 | if (unlikely(*p > STAT_THRESHOLD)) { | ||
192 | zone_page_state_add(*p, zone, item); | ||
193 | *p = 0; | ||
194 | } | ||
195 | } | ||
196 | |||
197 | void __inc_zone_page_state(struct page *page, enum zone_stat_item item) | ||
198 | { | ||
199 | __inc_zone_state(page_zone(page), item); | ||
200 | } | ||
201 | EXPORT_SYMBOL(__inc_zone_page_state); | ||
202 | |||
203 | void __dec_zone_page_state(struct page *page, enum zone_stat_item item) | ||
204 | { | ||
205 | struct zone *zone = page_zone(page); | ||
206 | s8 *p = diff_pointer(zone, item); | ||
207 | |||
208 | (*p)--; | ||
209 | |||
210 | if (unlikely(*p < -STAT_THRESHOLD)) { | ||
211 | zone_page_state_add(*p, zone, item); | ||
212 | *p = 0; | ||
213 | } | ||
214 | } | ||
215 | EXPORT_SYMBOL(__dec_zone_page_state); | ||
216 | |||
217 | void inc_zone_state(struct zone *zone, enum zone_stat_item item) | ||
218 | { | ||
219 | unsigned long flags; | ||
220 | |||
221 | local_irq_save(flags); | ||
222 | __inc_zone_state(zone, item); | ||
223 | local_irq_restore(flags); | ||
224 | } | ||
225 | |||
226 | void inc_zone_page_state(struct page *page, enum zone_stat_item item) | ||
227 | { | ||
228 | unsigned long flags; | ||
229 | struct zone *zone; | ||
230 | |||
231 | zone = page_zone(page); | ||
232 | local_irq_save(flags); | ||
233 | __inc_zone_state(zone, item); | ||
234 | local_irq_restore(flags); | ||
235 | } | ||
236 | EXPORT_SYMBOL(inc_zone_page_state); | ||
237 | |||
238 | void dec_zone_page_state(struct page *page, enum zone_stat_item item) | ||
239 | { | ||
240 | unsigned long flags; | ||
241 | struct zone *zone; | ||
242 | s8 *p; | ||
243 | |||
244 | zone = page_zone(page); | ||
245 | local_irq_save(flags); | ||
246 | p = diff_pointer(zone, item); | ||
247 | |||
248 | (*p)--; | ||
249 | |||
250 | if (unlikely(*p < -STAT_THRESHOLD)) { | ||
251 | zone_page_state_add(*p, zone, item); | ||
252 | *p = 0; | ||
253 | } | ||
254 | local_irq_restore(flags); | ||
255 | } | ||
256 | EXPORT_SYMBOL(dec_zone_page_state); | ||
257 | |||
258 | /* | ||
259 | * Update the zone counters for one cpu. | ||
260 | */ | ||
261 | void refresh_cpu_vm_stats(int cpu) | ||
262 | { | ||
263 | struct zone *zone; | ||
264 | int i; | ||
265 | unsigned long flags; | ||
266 | |||
267 | for_each_zone(zone) { | ||
268 | struct per_cpu_pageset *pcp; | ||
269 | |||
270 | pcp = zone_pcp(zone, cpu); | ||
271 | |||
272 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | ||
273 | if (pcp->vm_stat_diff[i]) { | ||
274 | local_irq_save(flags); | ||
275 | zone_page_state_add(pcp->vm_stat_diff[i], | ||
276 | zone, i); | ||
277 | pcp->vm_stat_diff[i] = 0; | ||
278 | local_irq_restore(flags); | ||
279 | } | ||
280 | } | ||
281 | } | ||
282 | |||
283 | static void __refresh_cpu_vm_stats(void *dummy) | ||
284 | { | ||
285 | refresh_cpu_vm_stats(smp_processor_id()); | ||
286 | } | ||
287 | |||
288 | /* | ||
289 | * Consolidate all counters. | ||
290 | * | ||
291 | * Note that the result is less inaccurate but still inaccurate | ||
292 | * if concurrent processes are allowed to run. | ||
293 | */ | ||
294 | void refresh_vm_stats(void) | ||
295 | { | ||
296 | on_each_cpu(__refresh_cpu_vm_stats, NULL, 0, 1); | ||
297 | } | ||
298 | EXPORT_SYMBOL(refresh_vm_stats); | ||
299 | |||
300 | #endif | ||
301 | |||
302 | #ifdef CONFIG_NUMA | ||
303 | /* | ||
304 | * zonelist = the list of zones passed to the allocator | ||
305 | * z = the zone from which the allocation occurred. | ||
306 | * | ||
307 | * Must be called with interrupts disabled. | ||
308 | */ | ||
309 | void zone_statistics(struct zonelist *zonelist, struct zone *z) | ||
310 | { | ||
311 | if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) { | ||
312 | __inc_zone_state(z, NUMA_HIT); | ||
313 | } else { | ||
314 | __inc_zone_state(z, NUMA_MISS); | ||
315 | __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); | ||
316 | } | ||
317 | if (z->zone_pgdat == NODE_DATA(numa_node_id())) | ||
318 | __inc_zone_state(z, NUMA_LOCAL); | ||
319 | else | ||
320 | __inc_zone_state(z, NUMA_OTHER); | ||
321 | } | ||
322 | #endif | ||
323 | |||
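Note: zone_statistics() above classifies every allocation twice: NUMA_HIT vs NUMA_MISS (with NUMA_FOREIGN charged to the preferred zone) depending on whether the page came from the node the zonelist preferred, and NUMA_LOCAL vs NUMA_OTHER depending on whether it came from the node the allocating CPU sits on. A tiny standalone sketch of that classification, with node numbers as stand-ins for the pgdat comparisons:

	#include <stdio.h>

	/* Illustrative classification mirroring zone_statistics() above. */
	static void classify(int preferred_node, int alloc_node, int cpu_node)
	{
		printf("pref=%d alloc=%d cpu=%d: %s, %s\n",
		       preferred_node, alloc_node, cpu_node,
		       alloc_node == preferred_node ? "NUMA_HIT"
						    : "NUMA_MISS (+NUMA_FOREIGN on pref)",
		       alloc_node == cpu_node ? "NUMA_LOCAL" : "NUMA_OTHER");
	}

	int main(void)
	{
		classify(0, 0, 0);   /* hit, local            */
		classify(0, 1, 0);   /* miss + foreign, other */
		classify(1, 1, 0);   /* hit, other            */
		return 0;
	}
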
324 | #ifdef CONFIG_PROC_FS | ||
325 | |||
326 | #include <linux/seq_file.h> | ||
327 | |||
328 | static void *frag_start(struct seq_file *m, loff_t *pos) | ||
329 | { | ||
330 | pg_data_t *pgdat; | ||
331 | loff_t node = *pos; | ||
332 | for (pgdat = first_online_pgdat(); | ||
333 | pgdat && node; | ||
334 | pgdat = next_online_pgdat(pgdat)) | ||
335 | --node; | ||
336 | |||
337 | return pgdat; | ||
338 | } | ||
339 | |||
340 | static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) | ||
341 | { | ||
342 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
343 | |||
344 | (*pos)++; | ||
345 | return next_online_pgdat(pgdat); | ||
346 | } | ||
347 | |||
348 | static void frag_stop(struct seq_file *m, void *arg) | ||
349 | { | ||
350 | } | ||
351 | |||
352 | /* | ||
353 | * This walks the free areas for each zone. | ||
354 | */ | ||
355 | static int frag_show(struct seq_file *m, void *arg) | ||
356 | { | ||
357 | pg_data_t *pgdat = (pg_data_t *)arg; | ||
358 | struct zone *zone; | ||
359 | struct zone *node_zones = pgdat->node_zones; | ||
360 | unsigned long flags; | ||
361 | int order; | ||
362 | |||
363 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { | ||
364 | if (!populated_zone(zone)) | ||
365 | continue; | ||
366 | |||
367 | spin_lock_irqsave(&zone->lock, flags); | ||
368 | seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); | ||
369 | for (order = 0; order < MAX_ORDER; ++order) | ||
370 | seq_printf(m, "%6lu ", zone->free_area[order].nr_free); | ||
371 | spin_unlock_irqrestore(&zone->lock, flags); | ||
372 | seq_putc(m, '\n'); | ||
373 | } | ||
374 | return 0; | ||
375 | } | ||
376 | |||
377 | struct seq_operations fragmentation_op = { | ||
378 | .start = frag_start, | ||
379 | .next = frag_next, | ||
380 | .stop = frag_stop, | ||
381 | .show = frag_show, | ||
382 | }; | ||
383 | |||
384 | static char *vmstat_text[] = { | ||
385 | /* Zoned VM counters */ | ||
386 | "nr_anon_pages", | ||
387 | "nr_mapped", | ||
388 | "nr_file_pages", | ||
389 | "nr_slab", | ||
390 | "nr_page_table_pages", | ||
391 | "nr_dirty", | ||
392 | "nr_writeback", | ||
393 | "nr_unstable", | ||
394 | "nr_bounce", | ||
395 | |||
396 | #ifdef CONFIG_NUMA | ||
397 | "numa_hit", | ||
398 | "numa_miss", | ||
399 | "numa_foreign", | ||
400 | "numa_interleave", | ||
401 | "numa_local", | ||
402 | "numa_other", | ||
403 | #endif | ||
404 | |||
405 | #ifdef CONFIG_VM_EVENT_COUNTERS | ||
406 | "pgpgin", | ||
407 | "pgpgout", | ||
408 | "pswpin", | ||
409 | "pswpout", | ||
410 | |||
411 | "pgalloc_dma", | ||
412 | "pgalloc_dma32", | ||
413 | "pgalloc_normal", | ||
414 | "pgalloc_high", | ||
415 | |||
416 | "pgfree", | ||
417 | "pgactivate", | ||
418 | "pgdeactivate", | ||
419 | |||
420 | "pgfault", | ||
421 | "pgmajfault", | ||
422 | |||
423 | "pgrefill_dma", | ||
424 | "pgrefill_dma32", | ||
425 | "pgrefill_normal", | ||
426 | "pgrefill_high", | ||
427 | |||
428 | "pgsteal_dma", | ||
429 | "pgsteal_dma32", | ||
430 | "pgsteal_normal", | ||
431 | "pgsteal_high", | ||
432 | |||
433 | "pgscan_kswapd_dma", | ||
434 | "pgscan_kswapd_dma32", | ||
435 | "pgscan_kswapd_normal", | ||
436 | "pgscan_kswapd_high", | ||
437 | |||
438 | "pgscan_direct_dma", | ||
439 | "pgscan_direct_dma32", | ||
440 | "pgscan_direct_normal", | ||
441 | "pgscan_direct_high", | ||
442 | |||
443 | "pginodesteal", | ||
444 | "slabs_scanned", | ||
445 | "kswapd_steal", | ||
446 | "kswapd_inodesteal", | ||
447 | "pageoutrun", | ||
448 | "allocstall", | ||
449 | |||
450 | "pgrotated", | ||
451 | #endif | ||
452 | }; | ||
453 | |||
454 | /* | ||
455 | * Output information about zones in @pgdat. | ||
456 | */ | ||
457 | static int zoneinfo_show(struct seq_file *m, void *arg) | ||
458 | { | ||
459 | pg_data_t *pgdat = arg; | ||
460 | struct zone *zone; | ||
461 | struct zone *node_zones = pgdat->node_zones; | ||
462 | unsigned long flags; | ||
463 | |||
464 | for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { | ||
465 | int i; | ||
466 | |||
467 | if (!populated_zone(zone)) | ||
468 | continue; | ||
469 | |||
470 | spin_lock_irqsave(&zone->lock, flags); | ||
471 | seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); | ||
472 | seq_printf(m, | ||
473 | "\n pages free %lu" | ||
474 | "\n min %lu" | ||
475 | "\n low %lu" | ||
476 | "\n high %lu" | ||
477 | "\n active %lu" | ||
478 | "\n inactive %lu" | ||
479 | "\n scanned %lu (a: %lu i: %lu)" | ||
480 | "\n spanned %lu" | ||
481 | "\n present %lu", | ||
482 | zone->free_pages, | ||
483 | zone->pages_min, | ||
484 | zone->pages_low, | ||
485 | zone->pages_high, | ||
486 | zone->nr_active, | ||
487 | zone->nr_inactive, | ||
488 | zone->pages_scanned, | ||
489 | zone->nr_scan_active, zone->nr_scan_inactive, | ||
490 | zone->spanned_pages, | ||
491 | zone->present_pages); | ||
492 | |||
493 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | ||
494 | seq_printf(m, "\n %-12s %lu", vmstat_text[i], | ||
495 | zone_page_state(zone, i)); | ||
496 | |||
497 | seq_printf(m, | ||
498 | "\n protection: (%lu", | ||
499 | zone->lowmem_reserve[0]); | ||
500 | for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) | ||
501 | seq_printf(m, ", %lu", zone->lowmem_reserve[i]); | ||
502 | seq_printf(m, | ||
503 | ")" | ||
504 | "\n pagesets"); | ||
505 | for_each_online_cpu(i) { | ||
506 | struct per_cpu_pageset *pageset; | ||
507 | int j; | ||
508 | |||
509 | pageset = zone_pcp(zone, i); | ||
510 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | ||
511 | if (pageset->pcp[j].count) | ||
512 | break; | ||
513 | } | ||
514 | if (j == ARRAY_SIZE(pageset->pcp)) | ||
515 | continue; | ||
516 | for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { | ||
517 | seq_printf(m, | ||
518 | "\n cpu: %i pcp: %i" | ||
519 | "\n count: %i" | ||
520 | "\n high: %i" | ||
521 | "\n batch: %i", | ||
522 | i, j, | ||
523 | pageset->pcp[j].count, | ||
524 | pageset->pcp[j].high, | ||
525 | pageset->pcp[j].batch); | ||
526 | } | ||
527 | } | ||
528 | seq_printf(m, | ||
529 | "\n all_unreclaimable: %u" | ||
530 | "\n prev_priority: %i" | ||
531 | "\n temp_priority: %i" | ||
532 | "\n start_pfn: %lu", | ||
533 | zone->all_unreclaimable, | ||
534 | zone->prev_priority, | ||
535 | zone->temp_priority, | ||
536 | zone->zone_start_pfn); | ||
537 | spin_unlock_irqrestore(&zone->lock, flags); | ||
538 | seq_putc(m, '\n'); | ||
539 | } | ||
540 | return 0; | ||
541 | } | ||
542 | |||
543 | struct seq_operations zoneinfo_op = { | ||
544 | .start = frag_start, /* iterate over all zones. The same as in | ||
545 | * fragmentation. */ | ||
546 | .next = frag_next, | ||
547 | .stop = frag_stop, | ||
548 | .show = zoneinfo_show, | ||
549 | }; | ||
550 | |||
551 | static void *vmstat_start(struct seq_file *m, loff_t *pos) | ||
552 | { | ||
553 | unsigned long *v; | ||
554 | #ifdef CONFIG_VM_EVENT_COUNTERS | ||
555 | unsigned long *e; | ||
556 | #endif | ||
557 | int i; | ||
558 | |||
559 | if (*pos >= ARRAY_SIZE(vmstat_text)) | ||
560 | return NULL; | ||
561 | |||
562 | #ifdef CONFIG_VM_EVENT_COUNTERS | ||
563 | v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) | ||
564 | + sizeof(struct vm_event_state), GFP_KERNEL); | ||
565 | #else | ||
566 | v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long), | ||
567 | GFP_KERNEL); | ||
568 | #endif | ||
569 | m->private = v; | ||
570 | if (!v) | ||
571 | return ERR_PTR(-ENOMEM); | ||
572 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | ||
573 | v[i] = global_page_state(i); | ||
574 | #ifdef CONFIG_VM_EVENT_COUNTERS | ||
575 | e = v + NR_VM_ZONE_STAT_ITEMS; | ||
576 | all_vm_events(e); | ||
577 | e[PGPGIN] /= 2; /* sectors -> kbytes */ | ||
578 | e[PGPGOUT] /= 2; | ||
579 | #endif | ||
580 | return v + *pos; | ||
581 | } | ||
582 | |||
583 | static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) | ||
584 | { | ||
585 | (*pos)++; | ||
586 | if (*pos >= ARRAY_SIZE(vmstat_text)) | ||
587 | return NULL; | ||
588 | return (unsigned long *)m->private + *pos; | ||
589 | } | ||
590 | |||
591 | static int vmstat_show(struct seq_file *m, void *arg) | ||
592 | { | ||
593 | unsigned long *l = arg; | ||
594 | unsigned long off = l - (unsigned long *)m->private; | ||
595 | |||
596 | seq_printf(m, "%s %lu\n", vmstat_text[off], *l); | ||
597 | return 0; | ||
598 | } | ||
599 | |||
600 | static void vmstat_stop(struct seq_file *m, void *arg) | ||
601 | { | ||
602 | kfree(m->private); | ||
603 | m->private = NULL; | ||
604 | } | ||
605 | |||
606 | struct seq_operations vmstat_op = { | ||
607 | .start = vmstat_start, | ||
608 | .next = vmstat_next, | ||
609 | .stop = vmstat_stop, | ||
610 | .show = vmstat_show, | ||
611 | }; | ||
612 | |||
613 | #endif /* CONFIG_PROC_FS */ | ||
614 | |||
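Note: the vmstat_op seq_file above is what backs /proc/vmstat: vmstat_start() snapshots the zoned counters (and, with CONFIG_VM_EVENT_COUNTERS, the accumulated events), and vmstat_show() prints one "name value" line per entry of vmstat_text[]. A trivial userspace reader, just to show the output shape, might look like this sketch:

	#include <stdio.h>

	/* Print every "name value" line emitted by vmstat_show(). */
	int main(void)
	{
		char name[64];
		unsigned long value;
		FILE *f = fopen("/proc/vmstat", "r");

		if (!f) {
			perror("/proc/vmstat");
			return 1;
		}
		while (fscanf(f, "%63s %lu", name, &value) == 2)
			printf("%-20s %lu\n", name, value);
		fclose(f);
		return 0;
	}
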