path: root/mm
author     Trond Myklebust <Trond.Myklebust@netapp.com>  2006-06-24 08:41:41 -0400
committer  Trond Myklebust <Trond.Myklebust@netapp.com>  2006-06-24 13:07:53 -0400
commit     816724e65c72a90a44fbad0ef0b59b186c85fa90 (patch)
tree       421fa29aedff988e392f92780637553e275d37a0 /mm
parent     70ac4385a13f78bc478f26d317511893741b05bd (diff)
parent     d384ea691fe4ea8c2dd5b9b8d9042eb181776f18 (diff)
Merge branch 'master' of /home/trondmy/kernel/linux-2.6/
Conflicts:

	fs/nfs/inode.c
	fs/super.c

Fix conflicts between patch 'NFS: Split fs/nfs/inode.c' and patch
'VFS: Permit filesystem to override root dentry on mount'
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |    4
-rw-r--r--  mm/filemap.c        |  183
-rw-r--r--  mm/filemap.h        |    6
-rw-r--r--  mm/fremap.c         |    9
-rw-r--r--  mm/hugetlb.c        |  282
-rw-r--r--  mm/memory.c         |  125
-rw-r--r--  mm/memory_hotplug.c |   27
-rw-r--r--  mm/mempolicy.c      |   36
-rw-r--r--  mm/migrate.c        | 1058
-rw-r--r--  mm/mmap.c           |   12
-rw-r--r--  mm/mprotect.c       |   37
-rw-r--r--  mm/msync.c          |    3
-rw-r--r--  mm/oom_kill.c       |    9
-rw-r--r--  mm/page-writeback.c |    3
-rw-r--r--  mm/page_alloc.c     |  184
-rw-r--r--  mm/pdflush.c        |    3
-rw-r--r--  mm/rmap.c           |  107
-rw-r--r--  mm/shmem.c          |   18
-rw-r--r--  mm/slab.c           |  249
-rw-r--r--  mm/sparse.c         |   22
-rw-r--r--  mm/swap.c           |   42
-rw-r--r--  mm/swapfile.c       |   43
-rw-r--r--  mm/truncate.c       |   22
-rw-r--r--  mm/vmalloc.c        |  122
-rw-r--r--  mm/vmscan.c         |  240
25 files changed, 1816 insertions, 1030 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 332f5c29b53a..66e65ab39426 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -138,8 +138,8 @@ config SPLIT_PTLOCK_CPUS
138# 138#
139config MIGRATION 139config MIGRATION
140 bool "Page migration" 140 bool "Page migration"
141 def_bool y if NUMA 141 def_bool y
142 depends on SWAP && NUMA 142 depends on NUMA
143 help 143 help
144 Allows the migration of the physical location of pages of processes 144 Allows the migration of the physical location of pages of processes
145 while the virtual addresses are not changed. This is useful for 145 while the virtual addresses are not changed. This is useful for
diff --git a/mm/filemap.c b/mm/filemap.c
index fd57442186cb..807a463fd5ed 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -14,6 +14,7 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/compiler.h> 15#include <linux/compiler.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/uaccess.h>
17#include <linux/aio.h> 18#include <linux/aio.h>
18#include <linux/capability.h> 19#include <linux/capability.h>
19#include <linux/kernel_stat.h> 20#include <linux/kernel_stat.h>
@@ -38,7 +39,6 @@
38 */ 39 */
39#include <linux/buffer_head.h> /* for generic_osync_inode */ 40#include <linux/buffer_head.h> /* for generic_osync_inode */
40 41
41#include <asm/uaccess.h>
42#include <asm/mman.h> 42#include <asm/mman.h>
43 43
44static ssize_t 44static ssize_t
@@ -171,15 +171,17 @@ static int sync_page(void *word)
171} 171}
172 172
173/** 173/**
174 * filemap_fdatawrite_range - start writeback against all of a mapping's 174 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
175 * dirty pages that lie within the byte offsets <start, end>
176 * @mapping: address space structure to write 175 * @mapping: address space structure to write
177 * @start: offset in bytes where the range starts 176 * @start: offset in bytes where the range starts
178 * @end: offset in bytes where the range ends (inclusive) 177 * @end: offset in bytes where the range ends (inclusive)
179 * @sync_mode: enable synchronous operation 178 * @sync_mode: enable synchronous operation
180 * 179 *
180 * Start writeback against all of a mapping's dirty pages that lie
181 * within the byte offsets <start, end> inclusive.
182 *
181 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as 183 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
182 * opposed to a regular memory * cleansing writeback. The difference between 184 * opposed to a regular memory cleansing writeback. The difference between
183 * these two operations is that if a dirty page/buffer is encountered, it must 185 * these two operations is that if a dirty page/buffer is encountered, it must
184 * be waited upon, and not just skipped over. 186 * be waited upon, and not just skipped over.
185 */ 187 */
@@ -190,8 +192,8 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
190 struct writeback_control wbc = { 192 struct writeback_control wbc = {
191 .sync_mode = sync_mode, 193 .sync_mode = sync_mode,
192 .nr_to_write = mapping->nrpages * 2, 194 .nr_to_write = mapping->nrpages * 2,
193 .start = start, 195 .range_start = start,
194 .end = end, 196 .range_end = end,
195 }; 197 };
196 198
197 if (!mapping_cap_writeback_dirty(mapping)) 199 if (!mapping_cap_writeback_dirty(mapping))
@@ -204,7 +206,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
204static inline int __filemap_fdatawrite(struct address_space *mapping, 206static inline int __filemap_fdatawrite(struct address_space *mapping,
205 int sync_mode) 207 int sync_mode)
206{ 208{
207 return __filemap_fdatawrite_range(mapping, 0, 0, sync_mode); 209 return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
208} 210}
209 211
210int filemap_fdatawrite(struct address_space *mapping) 212int filemap_fdatawrite(struct address_space *mapping)
@@ -219,7 +221,10 @@ static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
219 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); 221 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
220} 222}
221 223
222/* 224/**
225 * filemap_flush - mostly a non-blocking flush
226 * @mapping: target address_space
227 *
223 * This is a mostly non-blocking flush. Not suitable for data-integrity 228 * This is a mostly non-blocking flush. Not suitable for data-integrity
224 * purposes - I/O may not be started against all dirty pages. 229 * purposes - I/O may not be started against all dirty pages.
225 */ 230 */
@@ -229,7 +234,12 @@ int filemap_flush(struct address_space *mapping)
229} 234}
230EXPORT_SYMBOL(filemap_flush); 235EXPORT_SYMBOL(filemap_flush);
231 236
232/* 237/**
238 * wait_on_page_writeback_range - wait for writeback to complete
239 * @mapping: target address_space
240 * @start: beginning page index
241 * @end: ending page index
242 *
233 * Wait for writeback to complete against pages indexed by start->end 243 * Wait for writeback to complete against pages indexed by start->end
234 * inclusive 244 * inclusive
235 */ 245 */
@@ -276,7 +286,13 @@ int wait_on_page_writeback_range(struct address_space *mapping,
276 return ret; 286 return ret;
277} 287}
278 288
279/* 289/**
290 * sync_page_range - write and wait on all pages in the passed range
291 * @inode: target inode
292 * @mapping: target address_space
293 * @pos: beginning offset in pages to write
294 * @count: number of bytes to write
295 *
280 * Write and wait upon all the pages in the passed range. This is a "data 296 * Write and wait upon all the pages in the passed range. This is a "data
281 * integrity" operation. It waits upon in-flight writeout before starting and 297 * integrity" operation. It waits upon in-flight writeout before starting and
282 * waiting upon new writeout. If there was an IO error, return it. 298 * waiting upon new writeout. If there was an IO error, return it.
@@ -305,7 +321,13 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
305} 321}
306EXPORT_SYMBOL(sync_page_range); 322EXPORT_SYMBOL(sync_page_range);
307 323
308/* 324/**
325 * sync_page_range_nolock
326 * @inode: target inode
327 * @mapping: target address_space
328 * @pos: beginning offset in pages to write
329 * @count: number of bytes to write
330 *
309 * Note: Holding i_mutex across sync_page_range_nolock is not a good idea 331 * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
310 * as it forces O_SYNC writers to different parts of the same file 332 * as it forces O_SYNC writers to different parts of the same file
311 * to be serialised right until io completion. 333 * to be serialised right until io completion.
@@ -329,10 +351,11 @@ int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
329EXPORT_SYMBOL(sync_page_range_nolock); 351EXPORT_SYMBOL(sync_page_range_nolock);
330 352
331/** 353/**
332 * filemap_fdatawait - walk the list of under-writeback pages of the given 354 * filemap_fdatawait - wait for all under-writeback pages to complete
333 * address space and wait for all of them.
334 *
335 * @mapping: address space structure to wait for 355 * @mapping: address space structure to wait for
356 *
357 * Walk the list of under-writeback pages of the given address space
358 * and wait for all of them.
336 */ 359 */
337int filemap_fdatawait(struct address_space *mapping) 360int filemap_fdatawait(struct address_space *mapping)
338{ 361{
@@ -368,7 +391,12 @@ int filemap_write_and_wait(struct address_space *mapping)
368} 391}
369EXPORT_SYMBOL(filemap_write_and_wait); 392EXPORT_SYMBOL(filemap_write_and_wait);
370 393
371/* 394/**
395 * filemap_write_and_wait_range - write out & wait on a file range
396 * @mapping: the address_space for the pages
397 * @lstart: offset in bytes where the range starts
398 * @lend: offset in bytes where the range ends (inclusive)
399 *
372 * Write out and wait upon file offsets lstart->lend, inclusive. 400 * Write out and wait upon file offsets lstart->lend, inclusive.
373 * 401 *
374 * Note that `lend' is inclusive (describes the last byte to be written) so 402 * Note that `lend' is inclusive (describes the last byte to be written) so
@@ -394,8 +422,14 @@ int filemap_write_and_wait_range(struct address_space *mapping,
394 return err; 422 return err;
395} 423}
396 424
397/* 425/**
398 * This function is used to add newly allocated pagecache pages: 426 * add_to_page_cache - add newly allocated pagecache pages
427 * @page: page to add
428 * @mapping: the page's address_space
429 * @offset: page index
430 * @gfp_mask: page allocation mode
431 *
432 * This function is used to add newly allocated pagecache pages;
399 * the page is new, so we can just run SetPageLocked() against it. 433 * the page is new, so we can just run SetPageLocked() against it.
400 * The other page state flags were set by rmqueue(). 434 * The other page state flags were set by rmqueue().
401 * 435 *
@@ -422,7 +456,6 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
422 } 456 }
423 return error; 457 return error;
424} 458}
425
426EXPORT_SYMBOL(add_to_page_cache); 459EXPORT_SYMBOL(add_to_page_cache);
427 460
428int add_to_page_cache_lru(struct page *page, struct address_space *mapping, 461int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
@@ -489,8 +522,7 @@ void fastcall wait_on_page_bit(struct page *page, int bit_nr)
489EXPORT_SYMBOL(wait_on_page_bit); 522EXPORT_SYMBOL(wait_on_page_bit);
490 523
491/** 524/**
492 * unlock_page() - unlock a locked page 525 * unlock_page - unlock a locked page
493 *
494 * @page: the page 526 * @page: the page
495 * 527 *
496 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked(). 528 * Unlocks the page and wakes up sleepers in ___wait_on_page_locked().
@@ -513,8 +545,9 @@ void fastcall unlock_page(struct page *page)
513} 545}
514EXPORT_SYMBOL(unlock_page); 546EXPORT_SYMBOL(unlock_page);
515 547
516/* 548/**
517 * End writeback against a page. 549 * end_page_writeback - end writeback against a page
550 * @page: the page
518 */ 551 */
519void end_page_writeback(struct page *page) 552void end_page_writeback(struct page *page)
520{ 553{
@@ -527,10 +560,11 @@ void end_page_writeback(struct page *page)
527} 560}
528EXPORT_SYMBOL(end_page_writeback); 561EXPORT_SYMBOL(end_page_writeback);
529 562
530/* 563/**
531 * Get a lock on the page, assuming we need to sleep to get it. 564 * __lock_page - get a lock on the page, assuming we need to sleep to get it
565 * @page: the page to lock
532 * 566 *
533 * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some 567 * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some
534 * random driver's requestfn sets TASK_RUNNING, we could busywait. However 568 * random driver's requestfn sets TASK_RUNNING, we could busywait. However
535 * chances are that on the second loop, the block layer's plug list is empty, 569 * chances are that on the second loop, the block layer's plug list is empty,
536 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. 570 * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
@@ -544,8 +578,12 @@ void fastcall __lock_page(struct page *page)
544} 578}
545EXPORT_SYMBOL(__lock_page); 579EXPORT_SYMBOL(__lock_page);
546 580
547/* 581/**
548 * a rather lightweight function, finding and getting a reference to a 582 * find_get_page - find and get a page reference
583 * @mapping: the address_space to search
584 * @offset: the page index
585 *
586 * A rather lightweight function, finding and getting a reference to a
549 * hashed page atomically. 587 * hashed page atomically.
550 */ 588 */
551struct page * find_get_page(struct address_space *mapping, unsigned long offset) 589struct page * find_get_page(struct address_space *mapping, unsigned long offset)
@@ -559,11 +597,14 @@ struct page * find_get_page(struct address_space *mapping, unsigned long offset)
559 read_unlock_irq(&mapping->tree_lock); 597 read_unlock_irq(&mapping->tree_lock);
560 return page; 598 return page;
561} 599}
562
563EXPORT_SYMBOL(find_get_page); 600EXPORT_SYMBOL(find_get_page);
564 601
565/* 602/**
566 * Same as above, but trylock it instead of incrementing the count. 603 * find_trylock_page - find and lock a page
604 * @mapping: the address_space to search
605 * @offset: the page index
606 *
607 * Same as find_get_page(), but trylock it instead of incrementing the count.
567 */ 608 */
568struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) 609struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
569{ 610{
@@ -576,12 +617,10 @@ struct page *find_trylock_page(struct address_space *mapping, unsigned long offs
576 read_unlock_irq(&mapping->tree_lock); 617 read_unlock_irq(&mapping->tree_lock);
577 return page; 618 return page;
578} 619}
579
580EXPORT_SYMBOL(find_trylock_page); 620EXPORT_SYMBOL(find_trylock_page);
581 621
582/** 622/**
583 * find_lock_page - locate, pin and lock a pagecache page 623 * find_lock_page - locate, pin and lock a pagecache page
584 *
585 * @mapping: the address_space to search 624 * @mapping: the address_space to search
586 * @offset: the page index 625 * @offset: the page index
587 * 626 *
@@ -617,12 +656,10 @@ repeat:
617 read_unlock_irq(&mapping->tree_lock); 656 read_unlock_irq(&mapping->tree_lock);
618 return page; 657 return page;
619} 658}
620
621EXPORT_SYMBOL(find_lock_page); 659EXPORT_SYMBOL(find_lock_page);
622 660
623/** 661/**
624 * find_or_create_page - locate or add a pagecache page 662 * find_or_create_page - locate or add a pagecache page
625 *
626 * @mapping: the page's address_space 663 * @mapping: the page's address_space
627 * @index: the page's index into the mapping 664 * @index: the page's index into the mapping
628 * @gfp_mask: page allocation mode 665 * @gfp_mask: page allocation mode
@@ -663,7 +700,6 @@ repeat:
663 page_cache_release(cached_page); 700 page_cache_release(cached_page);
664 return page; 701 return page;
665} 702}
666
667EXPORT_SYMBOL(find_or_create_page); 703EXPORT_SYMBOL(find_or_create_page);
668 704
669/** 705/**
@@ -729,9 +765,16 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
729 return i; 765 return i;
730} 766}
731 767
732/* 768/**
769 * find_get_pages_tag - find and return pages that match @tag
770 * @mapping: the address_space to search
771 * @index: the starting page index
772 * @tag: the tag index
773 * @nr_pages: the maximum number of pages
774 * @pages: where the resulting pages are placed
775 *
733 * Like find_get_pages, except we only return pages which are tagged with 776 * Like find_get_pages, except we only return pages which are tagged with
734 * `tag'. We update *index to index the next page for the traversal. 777 * @tag. We update @index to index the next page for the traversal.
735 */ 778 */
736unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, 779unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
737 int tag, unsigned int nr_pages, struct page **pages) 780 int tag, unsigned int nr_pages, struct page **pages)
@@ -750,7 +793,11 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
750 return ret; 793 return ret;
751} 794}
752 795
753/* 796/**
797 * grab_cache_page_nowait - returns locked page at given index in given cache
798 * @mapping: target address_space
799 * @index: the page index
800 *
754 * Same as grab_cache_page, but do not wait if the page is unavailable. 801 * Same as grab_cache_page, but do not wait if the page is unavailable.
755 * This is intended for speculative data generators, where the data can 802 * This is intended for speculative data generators, where the data can
756 * be regenerated if the page couldn't be grabbed. This routine should 803 * be regenerated if the page couldn't be grabbed. This routine should
@@ -779,19 +826,25 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
779 } 826 }
780 return page; 827 return page;
781} 828}
782
783EXPORT_SYMBOL(grab_cache_page_nowait); 829EXPORT_SYMBOL(grab_cache_page_nowait);
784 830
785/* 831/**
832 * do_generic_mapping_read - generic file read routine
833 * @mapping: address_space to be read
834 * @_ra: file's readahead state
835 * @filp: the file to read
836 * @ppos: current file position
837 * @desc: read_descriptor
838 * @actor: read method
839 *
786 * This is a generic file read routine, and uses the 840 * This is a generic file read routine, and uses the
787 * mapping->a_ops->readpage() function for the actual low-level 841 * mapping->a_ops->readpage() function for the actual low-level stuff.
788 * stuff.
789 * 842 *
790 * This is really ugly. But the goto's actually try to clarify some 843 * This is really ugly. But the goto's actually try to clarify some
791 * of the logic when it comes to error handling etc. 844 * of the logic when it comes to error handling etc.
792 * 845 *
793 * Note the struct file* is only passed for the use of readpage. It may be 846 * Note the struct file* is only passed for the use of readpage.
794 * NULL. 847 * It may be NULL.
795 */ 848 */
796void do_generic_mapping_read(struct address_space *mapping, 849void do_generic_mapping_read(struct address_space *mapping,
797 struct file_ra_state *_ra, 850 struct file_ra_state *_ra,
@@ -1004,7 +1057,6 @@ out:
1004 if (filp) 1057 if (filp)
1005 file_accessed(filp); 1058 file_accessed(filp);
1006} 1059}
1007
1008EXPORT_SYMBOL(do_generic_mapping_read); 1060EXPORT_SYMBOL(do_generic_mapping_read);
1009 1061
1010int file_read_actor(read_descriptor_t *desc, struct page *page, 1062int file_read_actor(read_descriptor_t *desc, struct page *page,
@@ -1045,7 +1097,13 @@ success:
1045 return size; 1097 return size;
1046} 1098}
1047 1099
1048/* 1100/**
1101 * __generic_file_aio_read - generic filesystem read routine
1102 * @iocb: kernel I/O control block
1103 * @iov: io vector request
1104 * @nr_segs: number of segments in the iovec
1105 * @ppos: current file position
1106 *
1049 * This is the "read()" routine for all filesystems 1107 * This is the "read()" routine for all filesystems
1050 * that can use the page cache directly. 1108 * that can use the page cache directly.
1051 */ 1109 */
@@ -1124,7 +1182,6 @@ __generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1124out: 1182out:
1125 return retval; 1183 return retval;
1126} 1184}
1127
1128EXPORT_SYMBOL(__generic_file_aio_read); 1185EXPORT_SYMBOL(__generic_file_aio_read);
1129 1186
1130ssize_t 1187ssize_t
@@ -1135,7 +1192,6 @@ generic_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t
1135 BUG_ON(iocb->ki_pos != pos); 1192 BUG_ON(iocb->ki_pos != pos);
1136 return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos); 1193 return __generic_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
1137} 1194}
1138
1139EXPORT_SYMBOL(generic_file_aio_read); 1195EXPORT_SYMBOL(generic_file_aio_read);
1140 1196
1141ssize_t 1197ssize_t
@@ -1151,7 +1207,6 @@ generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppo
1151 ret = wait_on_sync_kiocb(&kiocb); 1207 ret = wait_on_sync_kiocb(&kiocb);
1152 return ret; 1208 return ret;
1153} 1209}
1154
1155EXPORT_SYMBOL(generic_file_read); 1210EXPORT_SYMBOL(generic_file_read);
1156 1211
1157int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size) 1212int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
@@ -1192,7 +1247,6 @@ ssize_t generic_file_sendfile(struct file *in_file, loff_t *ppos,
1192 return desc.written; 1247 return desc.written;
1193 return desc.error; 1248 return desc.error;
1194} 1249}
1195
1196EXPORT_SYMBOL(generic_file_sendfile); 1250EXPORT_SYMBOL(generic_file_sendfile);
1197 1251
1198static ssize_t 1252static ssize_t
@@ -1228,11 +1282,15 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1228} 1282}
1229 1283
1230#ifdef CONFIG_MMU 1284#ifdef CONFIG_MMU
1231/* 1285static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1286/**
1287 * page_cache_read - adds requested page to the page cache if not already there
1288 * @file: file to read
1289 * @offset: page index
1290 *
1232 * This adds the requested page to the page cache if it isn't already there, 1291 * This adds the requested page to the page cache if it isn't already there,
1233 * and schedules an I/O to read in its contents from disk. 1292 * and schedules an I/O to read in its contents from disk.
1234 */ 1293 */
1235static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
1236static int fastcall page_cache_read(struct file * file, unsigned long offset) 1294static int fastcall page_cache_read(struct file * file, unsigned long offset)
1237{ 1295{
1238 struct address_space *mapping = file->f_mapping; 1296 struct address_space *mapping = file->f_mapping;
@@ -1259,7 +1317,12 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
1259 1317
1260#define MMAP_LOTSAMISS (100) 1318#define MMAP_LOTSAMISS (100)
1261 1319
1262/* 1320/**
1321 * filemap_nopage - read in file data for page fault handling
1322 * @area: the applicable vm_area
1323 * @address: target address to read in
1324 * @type: returned with VM_FAULT_{MINOR,MAJOR} if not %NULL
1325 *
1263 * filemap_nopage() is invoked via the vma operations vector for a 1326 * filemap_nopage() is invoked via the vma operations vector for a
1264 * mapped memory region to read in file data during a page fault. 1327 * mapped memory region to read in file data during a page fault.
1265 * 1328 *
@@ -1462,7 +1525,6 @@ page_not_uptodate:
1462 page_cache_release(page); 1525 page_cache_release(page);
1463 return NULL; 1526 return NULL;
1464} 1527}
1465
1466EXPORT_SYMBOL(filemap_nopage); 1528EXPORT_SYMBOL(filemap_nopage);
1467 1529
1468static struct page * filemap_getpage(struct file *file, unsigned long pgoff, 1530static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
@@ -1716,7 +1778,13 @@ repeat:
1716 return page; 1778 return page;
1717} 1779}
1718 1780
1719/* 1781/**
1782 * read_cache_page - read into page cache, fill it if needed
1783 * @mapping: the page's address_space
1784 * @index: the page index
1785 * @filler: function to perform the read
1786 * @data: destination for read data
1787 *
1720 * Read into the page cache. If a page already exists, 1788 * Read into the page cache. If a page already exists,
1721 * and PageUptodate() is not set, try to fill the page. 1789 * and PageUptodate() is not set, try to fill the page.
1722 */ 1790 */
@@ -1754,7 +1822,6 @@ retry:
1754 out: 1822 out:
1755 return page; 1823 return page;
1756} 1824}
1757
1758EXPORT_SYMBOL(read_cache_page); 1825EXPORT_SYMBOL(read_cache_page);
1759 1826
1760/* 1827/*
@@ -1835,7 +1902,7 @@ __filemap_copy_from_user_iovec(char *vaddr,
1835 int copy = min(bytes, iov->iov_len - base); 1902 int copy = min(bytes, iov->iov_len - base);
1836 1903
1837 base = 0; 1904 base = 0;
1838 left = __copy_from_user_inatomic(vaddr, buf, copy); 1905 left = __copy_from_user_inatomic_nocache(vaddr, buf, copy);
1839 copied += copy; 1906 copied += copy;
1840 bytes -= copy; 1907 bytes -= copy;
1841 vaddr += copy; 1908 vaddr += copy;
@@ -1854,7 +1921,7 @@ __filemap_copy_from_user_iovec(char *vaddr,
1854/* 1921/*
1855 * Performs necessary checks before doing a write 1922 * Performs necessary checks before doing a write
1856 * 1923 *
1857 * Can adjust writing position aor amount of bytes to write. 1924 * Can adjust writing position or amount of bytes to write.
1858 * Returns appropriate error code that caller should return or 1925 * Returns appropriate error code that caller should return or
1859 * zero in case that write should be allowed. 1926 * zero in case that write should be allowed.
1860 */ 1927 */
diff --git a/mm/filemap.h b/mm/filemap.h
index 13793ba0ce17..5683cde22055 100644
--- a/mm/filemap.h
+++ b/mm/filemap.h
@@ -13,7 +13,7 @@
13#include <linux/highmem.h> 13#include <linux/highmem.h>
14#include <linux/uio.h> 14#include <linux/uio.h>
15#include <linux/config.h> 15#include <linux/config.h>
16#include <asm/uaccess.h> 16#include <linux/uaccess.h>
17 17
18size_t 18size_t
19__filemap_copy_from_user_iovec(char *vaddr, 19__filemap_copy_from_user_iovec(char *vaddr,
@@ -34,13 +34,13 @@ filemap_copy_from_user(struct page *page, unsigned long offset,
34 int left; 34 int left;
35 35
36 kaddr = kmap_atomic(page, KM_USER0); 36 kaddr = kmap_atomic(page, KM_USER0);
37 left = __copy_from_user_inatomic(kaddr + offset, buf, bytes); 37 left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes);
38 kunmap_atomic(kaddr, KM_USER0); 38 kunmap_atomic(kaddr, KM_USER0);
39 39
40 if (left != 0) { 40 if (left != 0) {
41 /* Do it the slow way */ 41 /* Do it the slow way */
42 kaddr = kmap(page); 42 kaddr = kmap(page);
43 left = __copy_from_user(kaddr + offset, buf, bytes); 43 left = __copy_from_user_nocache(kaddr + offset, buf, bytes);
44 kunmap(page); 44 kunmap(page);
45 } 45 }
46 return bytes - left; 46 return bytes - left;
diff --git a/mm/fremap.c b/mm/fremap.c
index 9f381e58bf44..21b7d0cbc98c 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -83,6 +83,7 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
83 page_add_file_rmap(page); 83 page_add_file_rmap(page);
84 pte_val = *pte; 84 pte_val = *pte;
85 update_mmu_cache(vma, addr, pte_val); 85 update_mmu_cache(vma, addr, pte_val);
86 lazy_mmu_prot_update(pte_val);
86 err = 0; 87 err = 0;
87unlock: 88unlock:
88 pte_unmap_unlock(pte, ptl); 89 pte_unmap_unlock(pte, ptl);
@@ -114,7 +115,13 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
114 115
115 set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); 116 set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
116 pte_val = *pte; 117 pte_val = *pte;
117 update_mmu_cache(vma, addr, pte_val); 118 /*
119 * We don't need to run update_mmu_cache() here because the "file pte"
120 * being installed by install_file_pte() is not a real pte - it's a
121 * non-present entry (like a swap entry), noting what file offset should
122 * be mapped there when there's a fault (in a non-linear vma where
123 * that's not obvious).
124 */
118 pte_unmap_unlock(pte, ptl); 125 pte_unmap_unlock(pte, ptl);
119 err = 0; 126 err = 0;
120out: 127out:
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 832f676ca038..df499973255f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@
22#include "internal.h" 22#include "internal.h"
23 23
24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
25static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; 25static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
26unsigned long max_huge_pages; 26unsigned long max_huge_pages;
27static struct list_head hugepage_freelists[MAX_NUMNODES]; 27static struct list_head hugepage_freelists[MAX_NUMNODES];
28static unsigned int nr_huge_pages_node[MAX_NUMNODES]; 28static unsigned int nr_huge_pages_node[MAX_NUMNODES];
@@ -123,39 +123,13 @@ static int alloc_fresh_huge_page(void)
123static struct page *alloc_huge_page(struct vm_area_struct *vma, 123static struct page *alloc_huge_page(struct vm_area_struct *vma,
124 unsigned long addr) 124 unsigned long addr)
125{ 125{
126 struct inode *inode = vma->vm_file->f_dentry->d_inode;
127 struct page *page; 126 struct page *page;
128 int use_reserve = 0;
129 unsigned long idx;
130 127
131 spin_lock(&hugetlb_lock); 128 spin_lock(&hugetlb_lock);
132 129 if (vma->vm_flags & VM_MAYSHARE)
133 if (vma->vm_flags & VM_MAYSHARE) { 130 resv_huge_pages--;
134 131 else if (free_huge_pages <= resv_huge_pages)
135 /* idx = radix tree index, i.e. offset into file in 132 goto fail;
136 * HPAGE_SIZE units */
137 idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
138 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
139
140 /* The hugetlbfs specific inode info stores the number
141 * of "guaranteed available" (huge) pages. That is,
142 * the first 'prereserved_hpages' pages of the inode
143 * are either already instantiated, or have been
144 * pre-reserved (by hugetlb_reserve_for_inode()). Here
145 * we're in the process of instantiating the page, so
146 * we use this to determine whether to draw from the
147 * pre-reserved pool or the truly free pool. */
148 if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
149 use_reserve = 1;
150 }
151
152 if (!use_reserve) {
153 if (free_huge_pages <= reserved_huge_pages)
154 goto fail;
155 } else {
156 BUG_ON(reserved_huge_pages == 0);
157 reserved_huge_pages--;
158 }
159 133
160 page = dequeue_huge_page(vma, addr); 134 page = dequeue_huge_page(vma, addr);
161 if (!page) 135 if (!page)
@@ -165,96 +139,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
165 set_page_refcounted(page); 139 set_page_refcounted(page);
166 return page; 140 return page;
167 141
168 fail: 142fail:
169 WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
170 spin_unlock(&hugetlb_lock); 143 spin_unlock(&hugetlb_lock);
171 return NULL; 144 return NULL;
172} 145}
173 146
174/* hugetlb_extend_reservation()
175 *
176 * Ensure that at least 'atleast' hugepages are, and will remain,
177 * available to instantiate the first 'atleast' pages of the given
178 * inode. If the inode doesn't already have this many pages reserved
179 * or instantiated, set aside some hugepages in the reserved pool to
180 * satisfy later faults (or fail now if there aren't enough, rather
181 * than getting the SIGBUS later).
182 */
183int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
184 unsigned long atleast)
185{
186 struct inode *inode = &info->vfs_inode;
187 unsigned long change_in_reserve = 0;
188 int ret = 0;
189
190 spin_lock(&hugetlb_lock);
191 read_lock_irq(&inode->i_mapping->tree_lock);
192
193 if (info->prereserved_hpages >= atleast)
194 goto out;
195
196 /* Because we always call this on shared mappings, none of the
197 * pages beyond info->prereserved_hpages can have been
198 * instantiated, so we need to reserve all of them now. */
199 change_in_reserve = atleast - info->prereserved_hpages;
200
201 if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
202 ret = -ENOMEM;
203 goto out;
204 }
205
206 reserved_huge_pages += change_in_reserve;
207 info->prereserved_hpages = atleast;
208
209 out:
210 read_unlock_irq(&inode->i_mapping->tree_lock);
211 spin_unlock(&hugetlb_lock);
212
213 return ret;
214}
215
216/* hugetlb_truncate_reservation()
217 *
218 * This returns pages reserved for the given inode to the general free
219 * hugepage pool. If the inode has any pages prereserved, but not
220 * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
221 * them.
222 */
223void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
224 unsigned long atmost)
225{
226 struct inode *inode = &info->vfs_inode;
227 struct address_space *mapping = inode->i_mapping;
228 unsigned long idx;
229 unsigned long change_in_reserve = 0;
230 struct page *page;
231
232 spin_lock(&hugetlb_lock);
233 read_lock_irq(&inode->i_mapping->tree_lock);
234
235 if (info->prereserved_hpages <= atmost)
236 goto out;
237
238 /* Count pages which were reserved, but not instantiated, and
239 * which we can now release. */
240 for (idx = atmost; idx < info->prereserved_hpages; idx++) {
241 page = radix_tree_lookup(&mapping->page_tree, idx);
242 if (!page)
243 /* Pages which are already instantiated can't
244 * be unreserved (and in fact have already
245 * been removed from the reserved pool) */
246 change_in_reserve++;
247 }
248
249 BUG_ON(reserved_huge_pages < change_in_reserve);
250 reserved_huge_pages -= change_in_reserve;
251 info->prereserved_hpages = atmost;
252
253 out:
254 read_unlock_irq(&inode->i_mapping->tree_lock);
255 spin_unlock(&hugetlb_lock);
256}
257
258static int __init hugetlb_init(void) 147static int __init hugetlb_init(void)
259{ 148{
260 unsigned long i; 149 unsigned long i;
@@ -334,7 +223,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
334 return nr_huge_pages; 223 return nr_huge_pages;
335 224
336 spin_lock(&hugetlb_lock); 225 spin_lock(&hugetlb_lock);
337 count = max(count, reserved_huge_pages); 226 count = max(count, resv_huge_pages);
338 try_to_free_low(count); 227 try_to_free_low(count);
339 while (count < nr_huge_pages) { 228 while (count < nr_huge_pages) {
340 struct page *page = dequeue_huge_page(NULL, 0); 229 struct page *page = dequeue_huge_page(NULL, 0);
@@ -361,11 +250,11 @@ int hugetlb_report_meminfo(char *buf)
361 return sprintf(buf, 250 return sprintf(buf,
362 "HugePages_Total: %5lu\n" 251 "HugePages_Total: %5lu\n"
363 "HugePages_Free: %5lu\n" 252 "HugePages_Free: %5lu\n"
364 "HugePages_Rsvd: %5lu\n" 253 "HugePages_Rsvd: %5lu\n"
365 "Hugepagesize: %5lu kB\n", 254 "Hugepagesize: %5lu kB\n",
366 nr_huge_pages, 255 nr_huge_pages,
367 free_huge_pages, 256 free_huge_pages,
368 reserved_huge_pages, 257 resv_huge_pages,
369 HPAGE_SIZE/1024); 258 HPAGE_SIZE/1024);
370} 259}
371 260
@@ -754,3 +643,156 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
754 flush_tlb_range(vma, start, end); 643 flush_tlb_range(vma, start, end);
755} 644}
756 645
646struct file_region {
647 struct list_head link;
648 long from;
649 long to;
650};
651
652static long region_add(struct list_head *head, long f, long t)
653{
654 struct file_region *rg, *nrg, *trg;
655
656 /* Locate the region we are either in or before. */
657 list_for_each_entry(rg, head, link)
658 if (f <= rg->to)
659 break;
660
661 /* Round our left edge to the current segment if it encloses us. */
662 if (f > rg->from)
663 f = rg->from;
664
665 /* Check for and consume any regions we now overlap with. */
666 nrg = rg;
667 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
668 if (&rg->link == head)
669 break;
670 if (rg->from > t)
671 break;
672
673 /* If this area reaches higher then extend our area to
674 * include it completely. If this is not the first area
675 * which we intend to reuse, free it. */
676 if (rg->to > t)
677 t = rg->to;
678 if (rg != nrg) {
679 list_del(&rg->link);
680 kfree(rg);
681 }
682 }
683 nrg->from = f;
684 nrg->to = t;
685 return 0;
686}
687
688static long region_chg(struct list_head *head, long f, long t)
689{
690 struct file_region *rg, *nrg;
691 long chg = 0;
692
693 /* Locate the region we are before or in. */
694 list_for_each_entry(rg, head, link)
695 if (f <= rg->to)
696 break;
697
698 /* If we are below the current region then a new region is required.
699 * Subtle, allocate a new region at the position but make it zero
700 * size such that we can guarentee to record the reservation. */
701 if (&rg->link == head || t < rg->from) {
702 nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
703 if (nrg == 0)
704 return -ENOMEM;
705 nrg->from = f;
706 nrg->to = f;
707 INIT_LIST_HEAD(&nrg->link);
708 list_add(&nrg->link, rg->link.prev);
709
710 return t - f;
711 }
712
713 /* Round our left edge to the current segment if it encloses us. */
714 if (f > rg->from)
715 f = rg->from;
716 chg = t - f;
717
718 /* Check for and consume any regions we now overlap with. */
719 list_for_each_entry(rg, rg->link.prev, link) {
720 if (&rg->link == head)
721 break;
722 if (rg->from > t)
723 return chg;
724
725 /* We overlap with this area, if it extends futher than
726 * us then we must extend ourselves. Account for its
727 * existing reservation. */
728 if (rg->to > t) {
729 chg += rg->to - t;
730 t = rg->to;
731 }
732 chg -= rg->to - rg->from;
733 }
734 return chg;
735}
736
737static long region_truncate(struct list_head *head, long end)
738{
739 struct file_region *rg, *trg;
740 long chg = 0;
741
742 /* Locate the region we are either in or before. */
743 list_for_each_entry(rg, head, link)
744 if (end <= rg->to)
745 break;
746 if (&rg->link == head)
747 return 0;
748
749 /* If we are in the middle of a region then adjust it. */
750 if (end > rg->from) {
751 chg = rg->to - end;
752 rg->to = end;
753 rg = list_entry(rg->link.next, typeof(*rg), link);
754 }
755
756 /* Drop any remaining regions. */
757 list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
758 if (&rg->link == head)
759 break;
760 chg += rg->to - rg->from;
761 list_del(&rg->link);
762 kfree(rg);
763 }
764 return chg;
765}
766
767static int hugetlb_acct_memory(long delta)
768{
769 int ret = -ENOMEM;
770
771 spin_lock(&hugetlb_lock);
772 if ((delta + resv_huge_pages) <= free_huge_pages) {
773 resv_huge_pages += delta;
774 ret = 0;
775 }
776 spin_unlock(&hugetlb_lock);
777 return ret;
778}
779
780int hugetlb_reserve_pages(struct inode *inode, long from, long to)
781{
782 long ret, chg;
783
784 chg = region_chg(&inode->i_mapping->private_list, from, to);
785 if (chg < 0)
786 return chg;
787 ret = hugetlb_acct_memory(chg);
788 if (ret < 0)
789 return ret;
790 region_add(&inode->i_mapping->private_list, from, to);
791 return 0;
792}
793
794void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
795{
796 long chg = region_truncate(&inode->i_mapping->private_list, offset);
797 hugetlb_acct_memory(freed - chg);
798}
diff --git a/mm/memory.c b/mm/memory.c
index 0ec7bc644271..247b5c312b9b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -434,7 +434,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
434 /* pte contains position in swap or file, so copy. */ 434 /* pte contains position in swap or file, so copy. */
435 if (unlikely(!pte_present(pte))) { 435 if (unlikely(!pte_present(pte))) {
436 if (!pte_file(pte)) { 436 if (!pte_file(pte)) {
437 swap_duplicate(pte_to_swp_entry(pte)); 437 swp_entry_t entry = pte_to_swp_entry(pte);
438
439 swap_duplicate(entry);
438 /* make sure dst_mm is on swapoff's mmlist. */ 440 /* make sure dst_mm is on swapoff's mmlist. */
439 if (unlikely(list_empty(&dst_mm->mmlist))) { 441 if (unlikely(list_empty(&dst_mm->mmlist))) {
440 spin_lock(&mmlist_lock); 442 spin_lock(&mmlist_lock);
@@ -443,6 +445,16 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
443 &src_mm->mmlist); 445 &src_mm->mmlist);
444 spin_unlock(&mmlist_lock); 446 spin_unlock(&mmlist_lock);
445 } 447 }
448 if (is_write_migration_entry(entry) &&
449 is_cow_mapping(vm_flags)) {
450 /*
451 * COW mappings require pages in both parent
452 * and child to be set to read.
453 */
454 make_migration_entry_read(&entry);
455 pte = swp_entry_to_pte(entry);
456 set_pte_at(src_mm, addr, src_pte, pte);
457 }
446 } 458 }
447 goto out_set_pte; 459 goto out_set_pte;
448 } 460 }
@@ -1445,25 +1457,60 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1445{ 1457{
1446 struct page *old_page, *new_page; 1458 struct page *old_page, *new_page;
1447 pte_t entry; 1459 pte_t entry;
1448 int ret = VM_FAULT_MINOR; 1460 int reuse, ret = VM_FAULT_MINOR;
1449 1461
1450 old_page = vm_normal_page(vma, address, orig_pte); 1462 old_page = vm_normal_page(vma, address, orig_pte);
1451 if (!old_page) 1463 if (!old_page)
1452 goto gotten; 1464 goto gotten;
1453 1465
1454 if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { 1466 if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
1455 int reuse = can_share_swap_page(old_page); 1467 (VM_SHARED|VM_WRITE))) {
1456 unlock_page(old_page); 1468 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
1457 if (reuse) { 1469 /*
1458 flush_cache_page(vma, address, pte_pfn(orig_pte)); 1470 * Notify the address space that the page is about to
1459 entry = pte_mkyoung(orig_pte); 1471 * become writable so that it can prohibit this or wait
1460 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1472 * for the page to get into an appropriate state.
1461 ptep_set_access_flags(vma, address, page_table, entry, 1); 1473 *
1462 update_mmu_cache(vma, address, entry); 1474 * We do this without the lock held, so that it can
1463 lazy_mmu_prot_update(entry); 1475 * sleep if it needs to.
1464 ret |= VM_FAULT_WRITE; 1476 */
1465 goto unlock; 1477 page_cache_get(old_page);
1478 pte_unmap_unlock(page_table, ptl);
1479
1480 if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
1481 goto unwritable_page;
1482
1483 page_cache_release(old_page);
1484
1485 /*
1486 * Since we dropped the lock we need to revalidate
1487 * the PTE as someone else may have changed it. If
1488 * they did, we just return, as we can count on the
1489 * MMU to tell us if they didn't also make it writable.
1490 */
1491 page_table = pte_offset_map_lock(mm, pmd, address,
1492 &ptl);
1493 if (!pte_same(*page_table, orig_pte))
1494 goto unlock;
1466 } 1495 }
1496
1497 reuse = 1;
1498 } else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
1499 reuse = can_share_swap_page(old_page);
1500 unlock_page(old_page);
1501 } else {
1502 reuse = 0;
1503 }
1504
1505 if (reuse) {
1506 flush_cache_page(vma, address, pte_pfn(orig_pte));
1507 entry = pte_mkyoung(orig_pte);
1508 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1509 ptep_set_access_flags(vma, address, page_table, entry, 1);
1510 update_mmu_cache(vma, address, entry);
1511 lazy_mmu_prot_update(entry);
1512 ret |= VM_FAULT_WRITE;
1513 goto unlock;
1467 } 1514 }
1468 1515
1469 /* 1516 /*
@@ -1523,6 +1570,10 @@ oom:
1523 if (old_page) 1570 if (old_page)
1524 page_cache_release(old_page); 1571 page_cache_release(old_page);
1525 return VM_FAULT_OOM; 1572 return VM_FAULT_OOM;
1573
1574unwritable_page:
1575 page_cache_release(old_page);
1576 return VM_FAULT_SIGBUS;
1526} 1577}
1527 1578
1528/* 1579/*
@@ -1879,7 +1930,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
1879 goto out; 1930 goto out;
1880 1931
1881 entry = pte_to_swp_entry(orig_pte); 1932 entry = pte_to_swp_entry(orig_pte);
1882again: 1933 if (is_migration_entry(entry)) {
1934 migration_entry_wait(mm, pmd, address);
1935 goto out;
1936 }
1883 page = lookup_swap_cache(entry); 1937 page = lookup_swap_cache(entry);
1884 if (!page) { 1938 if (!page) {
1885 swapin_readahead(entry, address, vma); 1939 swapin_readahead(entry, address, vma);
@@ -1903,12 +1957,6 @@ again:
1903 1957
1904 mark_page_accessed(page); 1958 mark_page_accessed(page);
1905 lock_page(page); 1959 lock_page(page);
1906 if (!PageSwapCache(page)) {
1907 /* Page migration has occured */
1908 unlock_page(page);
1909 page_cache_release(page);
1910 goto again;
1911 }
1912 1960
1913 /* 1961 /*
1914 * Back out if somebody else already faulted in this pte. 1962 * Back out if somebody else already faulted in this pte.
@@ -2074,18 +2122,31 @@ retry:
2074 /* 2122 /*
2075 * Should we do an early C-O-W break? 2123 * Should we do an early C-O-W break?
2076 */ 2124 */
2077 if (write_access && !(vma->vm_flags & VM_SHARED)) { 2125 if (write_access) {
2078 struct page *page; 2126 if (!(vma->vm_flags & VM_SHARED)) {
2127 struct page *page;
2079 2128
2080 if (unlikely(anon_vma_prepare(vma))) 2129 if (unlikely(anon_vma_prepare(vma)))
2081 goto oom; 2130 goto oom;
2082 page = alloc_page_vma(GFP_HIGHUSER, vma, address); 2131 page = alloc_page_vma(GFP_HIGHUSER, vma, address);
2083 if (!page) 2132 if (!page)
2084 goto oom; 2133 goto oom;
2085 copy_user_highpage(page, new_page, address); 2134 copy_user_highpage(page, new_page, address);
2086 page_cache_release(new_page); 2135 page_cache_release(new_page);
2087 new_page = page; 2136 new_page = page;
2088 anon = 1; 2137 anon = 1;
2138
2139 } else {
2140 /* if the page will be shareable, see if the backing
2141 * address space wants to know that the page is about
2142 * to become writable */
2143 if (vma->vm_ops->page_mkwrite &&
2144 vma->vm_ops->page_mkwrite(vma, new_page) < 0
2145 ) {
2146 page_cache_release(new_page);
2147 return VM_FAULT_SIGBUS;
2148 }
2149 }
2089 } 2150 }
2090 2151
2091 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2152 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 70df5c0d957e..841a077d5aeb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -26,7 +26,7 @@
26 26
27extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, 27extern void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
28 unsigned long size); 28 unsigned long size);
29static void __add_zone(struct zone *zone, unsigned long phys_start_pfn) 29static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
30{ 30{
31 struct pglist_data *pgdat = zone->zone_pgdat; 31 struct pglist_data *pgdat = zone->zone_pgdat;
32 int nr_pages = PAGES_PER_SECTION; 32 int nr_pages = PAGES_PER_SECTION;
@@ -34,8 +34,15 @@ static void __add_zone(struct zone *zone, unsigned long phys_start_pfn)
34 int zone_type; 34 int zone_type;
35 35
36 zone_type = zone - pgdat->node_zones; 36 zone_type = zone - pgdat->node_zones;
37 if (!populated_zone(zone)) {
38 int ret = 0;
39 ret = init_currently_empty_zone(zone, phys_start_pfn, nr_pages);
40 if (ret < 0)
41 return ret;
42 }
37 memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn); 43 memmap_init_zone(nr_pages, nid, zone_type, phys_start_pfn);
38 zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages); 44 zonetable_add(zone, nid, zone_type, phys_start_pfn, nr_pages);
45 return 0;
39} 46}
40 47
41extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, 48extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
@@ -50,7 +57,11 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
50 if (ret < 0) 57 if (ret < 0)
51 return ret; 58 return ret;
52 59
53 __add_zone(zone, phys_start_pfn); 60 ret = __add_zone(zone, phys_start_pfn);
61
62 if (ret < 0)
63 return ret;
64
54 return register_new_memory(__pfn_to_section(phys_start_pfn)); 65 return register_new_memory(__pfn_to_section(phys_start_pfn));
55} 66}
56 67
@@ -116,6 +127,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
116 unsigned long flags; 127 unsigned long flags;
117 unsigned long onlined_pages = 0; 128 unsigned long onlined_pages = 0;
118 struct zone *zone; 129 struct zone *zone;
130 int need_zonelists_rebuild = 0;
119 131
120 /* 132 /*
121 * This doesn't need a lock to do pfn_to_page(). 133 * This doesn't need a lock to do pfn_to_page().
@@ -128,6 +140,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
128 grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages); 140 grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
129 pgdat_resize_unlock(zone->zone_pgdat, &flags); 141 pgdat_resize_unlock(zone->zone_pgdat, &flags);
130 142
143 /*
144 * If this zone is not populated, then it is not in zonelist.
145 * This means the page allocator ignores this zone.
146 * So, zonelist must be updated after online.
147 */
148 if (!populated_zone(zone))
149 need_zonelists_rebuild = 1;
150
131 for (i = 0; i < nr_pages; i++) { 151 for (i = 0; i < nr_pages; i++) {
132 struct page *page = pfn_to_page(pfn + i); 152 struct page *page = pfn_to_page(pfn + i);
133 online_page(page); 153 online_page(page);
@@ -138,5 +158,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
138 158
139 setup_per_zone_pages_min(); 159 setup_per_zone_pages_min();
140 160
161 if (need_zonelists_rebuild)
162 build_all_zonelists();
163 vm_total_pages = nr_free_pagecache_pages();
141 return 0; 164 return 0;
142} 165}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8778f58880c4..ec4a1a950df9 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -87,6 +87,8 @@
87#include <linux/seq_file.h> 87#include <linux/seq_file.h>
88#include <linux/proc_fs.h> 88#include <linux/proc_fs.h>
89#include <linux/migrate.h> 89#include <linux/migrate.h>
90#include <linux/rmap.h>
91#include <linux/security.h>
90 92
91#include <asm/tlbflush.h> 93#include <asm/tlbflush.h>
92#include <asm/uaccess.h> 94#include <asm/uaccess.h>
@@ -587,6 +589,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
587 isolate_lru_page(page, pagelist); 589 isolate_lru_page(page, pagelist);
588} 590}
589 591
592static struct page *new_node_page(struct page *page, unsigned long node, int **x)
593{
594 return alloc_pages_node(node, GFP_HIGHUSER, 0);
595}
596
590/* 597/*
591 * Migrate pages from one node to a target node. 598 * Migrate pages from one node to a target node.
592 * Returns error or the number of pages not migrated. 599 * Returns error or the number of pages not migrated.
@@ -603,11 +610,9 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
603 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, 610 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
604 flags | MPOL_MF_DISCONTIG_OK, &pagelist); 611 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
605 612
606 if (!list_empty(&pagelist)) { 613 if (!list_empty(&pagelist))
607 err = migrate_pages_to(&pagelist, NULL, dest); 614 err = migrate_pages(&pagelist, new_node_page, dest);
608 if (!list_empty(&pagelist)) 615
609 putback_lru_pages(&pagelist);
610 }
611 return err; 616 return err;
612} 617}
613 618
@@ -694,6 +699,12 @@ int do_migrate_pages(struct mm_struct *mm,
694 699
695} 700}
696 701
702static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
703{
704 struct vm_area_struct *vma = (struct vm_area_struct *)private;
705
706 return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
707}
697#else 708#else
698 709
699static void migrate_page_add(struct page *page, struct list_head *pagelist, 710static void migrate_page_add(struct page *page, struct list_head *pagelist,
@@ -706,6 +717,11 @@ int do_migrate_pages(struct mm_struct *mm,
706{ 717{
707 return -ENOSYS; 718 return -ENOSYS;
708} 719}
720
721static struct page *new_vma_page(struct page *page, unsigned long private)
722{
723 return NULL;
724}
709#endif 725#endif
710 726
711long do_mbind(unsigned long start, unsigned long len, 727long do_mbind(unsigned long start, unsigned long len,
@@ -767,15 +783,13 @@ long do_mbind(unsigned long start, unsigned long len,
767 err = mbind_range(vma, start, end, new); 783 err = mbind_range(vma, start, end, new);
768 784
769 if (!list_empty(&pagelist)) 785 if (!list_empty(&pagelist))
770 nr_failed = migrate_pages_to(&pagelist, vma, -1); 786 nr_failed = migrate_pages(&pagelist, new_vma_page,
787 (unsigned long)vma);
771 788
772 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 789 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
773 err = -EIO; 790 err = -EIO;
774 } 791 }
775 792
776 if (!list_empty(&pagelist))
777 putback_lru_pages(&pagelist);
778
779 up_write(&mm->mmap_sem); 793 up_write(&mm->mmap_sem);
780 mpol_free(new); 794 mpol_free(new);
781 return err; 795 return err;
@@ -929,6 +943,10 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
929 goto out; 943 goto out;
930 } 944 }
931 945
946 err = security_task_movememory(task);
947 if (err)
948 goto out;
949
932 err = do_migrate_pages(mm, &old, &new, 950 err = do_migrate_pages(mm, &old, &new,
933 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 951 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
934out: 952out:
diff --git a/mm/migrate.c b/mm/migrate.c
index 1c25040693d2..1c2a71aa05cd 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -15,6 +15,7 @@
15#include <linux/migrate.h> 15#include <linux/migrate.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/swap.h> 17#include <linux/swap.h>
18#include <linux/swapops.h>
18#include <linux/pagemap.h> 19#include <linux/pagemap.h>
19#include <linux/buffer_head.h> 20#include <linux/buffer_head.h>
20#include <linux/mm_inline.h> 21#include <linux/mm_inline.h>
@@ -23,13 +24,13 @@
23#include <linux/topology.h> 24#include <linux/topology.h>
24#include <linux/cpu.h> 25#include <linux/cpu.h>
25#include <linux/cpuset.h> 26#include <linux/cpuset.h>
26#include <linux/swapops.h> 27#include <linux/writeback.h>
28#include <linux/mempolicy.h>
29#include <linux/vmalloc.h>
30#include <linux/security.h>
27 31
28#include "internal.h" 32#include "internal.h"
29 33
30/* The maximum number of pages to take off the LRU for migration */
31#define MIGRATE_CHUNK_SIZE 256
32
33#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 34#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
34 35
35/* 36/*
@@ -64,16 +65,11 @@ int isolate_lru_page(struct page *page, struct list_head *pagelist)
64} 65}
65 66
66/* 67/*
67 * migrate_prep() needs to be called after we have compiled the list of pages 68 * migrate_prep() needs to be called before we start compiling a list of pages
68 * to be migrated using isolate_lru_page() but before we begin a series of calls 69 * to be migrated using isolate_lru_page().
69 * to migrate_pages().
70 */ 70 */
71int migrate_prep(void) 71int migrate_prep(void)
72{ 72{
73 /* Must have swap device for migration */
74 if (nr_swap_pages <= 0)
75 return -ENODEV;
76
77 /* 73 /*
78 * Clear the LRU lists so pages can be isolated. 74 * Clear the LRU lists so pages can be isolated.
79 * Note that pages may be moved off the LRU after we have 75 * Note that pages may be moved off the LRU after we have
@@ -87,7 +83,6 @@ int migrate_prep(void)
87 83
88static inline void move_to_lru(struct page *page) 84static inline void move_to_lru(struct page *page)
89{ 85{
90 list_del(&page->lru);
91 if (PageActive(page)) { 86 if (PageActive(page)) {
92 /* 87 /*
93 * lru_cache_add_active checks that 88 * lru_cache_add_active checks that
@@ -113,113 +108,200 @@ int putback_lru_pages(struct list_head *l)
113 int count = 0; 108 int count = 0;
114 109
115 list_for_each_entry_safe(page, page2, l, lru) { 110 list_for_each_entry_safe(page, page2, l, lru) {
111 list_del(&page->lru);
116 move_to_lru(page); 112 move_to_lru(page);
117 count++; 113 count++;
118 } 114 }
119 return count; 115 return count;
120} 116}
121 117
122/* 118static inline int is_swap_pte(pte_t pte)
123 * Non migratable page
124 */
125int fail_migrate_page(struct page *newpage, struct page *page)
126{ 119{
127 return -EIO; 120 return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
128} 121}
129EXPORT_SYMBOL(fail_migrate_page);
130 122
131/* 123/*
132 * swapout a single page 124 * Restore a potential migration pte to a working pte entry
133 * page is locked upon entry, unlocked on exit
134 */ 125 */
135static int swap_page(struct page *page) 126static void remove_migration_pte(struct vm_area_struct *vma,
127 struct page *old, struct page *new)
136{ 128{
137 struct address_space *mapping = page_mapping(page); 129 struct mm_struct *mm = vma->vm_mm;
130 swp_entry_t entry;
131 pgd_t *pgd;
132 pud_t *pud;
133 pmd_t *pmd;
134 pte_t *ptep, pte;
135 spinlock_t *ptl;
136 unsigned long addr = page_address_in_vma(new, vma);
137
138 if (addr == -EFAULT)
139 return;
140
141 pgd = pgd_offset(mm, addr);
142 if (!pgd_present(*pgd))
143 return;
144
145 pud = pud_offset(pgd, addr);
146 if (!pud_present(*pud))
147 return;
148
149 pmd = pmd_offset(pud, addr);
150 if (!pmd_present(*pmd))
151 return;
152
153 ptep = pte_offset_map(pmd, addr);
154
155 if (!is_swap_pte(*ptep)) {
156 pte_unmap(ptep);
157 return;
158 }
138 159
139 if (page_mapped(page) && mapping) 160 ptl = pte_lockptr(mm, pmd);
140 if (try_to_unmap(page, 1) != SWAP_SUCCESS) 161 spin_lock(ptl);
141 goto unlock_retry; 162 pte = *ptep;
163 if (!is_swap_pte(pte))
164 goto out;
142 165
143 if (PageDirty(page)) { 166 entry = pte_to_swp_entry(pte);
144 /* Page is dirty, try to write it out here */
145 switch(pageout(page, mapping)) {
146 case PAGE_KEEP:
147 case PAGE_ACTIVATE:
148 goto unlock_retry;
149 167
150 case PAGE_SUCCESS: 168 if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
151 goto retry; 169 goto out;
152 170
153 case PAGE_CLEAN: 171 get_page(new);
154 ; /* try to free the page below */ 172 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
155 } 173 if (is_write_migration_entry(entry))
156 } 174 pte = pte_mkwrite(pte);
175 set_pte_at(mm, addr, ptep, pte);
157 176
158 if (PagePrivate(page)) { 177 if (PageAnon(new))
159 if (!try_to_release_page(page, GFP_KERNEL) || 178 page_add_anon_rmap(new, vma, addr);
160 (!mapping && page_count(page) == 1)) 179 else
161 goto unlock_retry; 180 page_add_file_rmap(new);
162 }
163 181
164 if (remove_mapping(mapping, page)) { 182 /* No need to invalidate - it was non-present before */
165 /* Success */ 183 update_mmu_cache(vma, addr, pte);
166 unlock_page(page); 184 lazy_mmu_prot_update(pte);
167 return 0;
168 }
169 185
170unlock_retry: 186out:
171 unlock_page(page); 187 pte_unmap_unlock(ptep, ptl);
188}
172 189
173retry: 190/*
174 return -EAGAIN; 191 * Note that remove_file_migration_ptes will only work on regular mappings;
192 * Nonlinear mappings do not use migration entries.
193 */
194static void remove_file_migration_ptes(struct page *old, struct page *new)
195{
196 struct vm_area_struct *vma;
197 struct address_space *mapping = page_mapping(new);
198 struct prio_tree_iter iter;
199 pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
200
201 if (!mapping)
202 return;
203
204 spin_lock(&mapping->i_mmap_lock);
205
206 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
207 remove_migration_pte(vma, old, new);
208
209 spin_unlock(&mapping->i_mmap_lock);
175} 210}
176 211
177/* 212/*
178 * Remove references for a page and establish the new page with the correct 213 * Must hold mmap_sem lock on at least one of the vmas containing
179 * basic settings to be able to stop accesses to the page. 214 * the page so that the anon_vma cannot vanish.
180 */ 215 */
181int migrate_page_remove_references(struct page *newpage, 216static void remove_anon_migration_ptes(struct page *old, struct page *new)
182 struct page *page, int nr_refs)
183{ 217{
184 struct address_space *mapping = page_mapping(page); 218 struct anon_vma *anon_vma;
185 struct page **radix_pointer; 219 struct vm_area_struct *vma;
220 unsigned long mapping;
186 221
187 /* 222 mapping = (unsigned long)new->mapping;
188 * Avoid doing any of the following work if the page count
189 * indicates that the page is in use or truncate has removed
190 * the page.
191 */
192 if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
193 return -EAGAIN;
194 223
195 /* 224 if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
196 * Establish swap ptes for anonymous pages or destroy pte 225 return;
197 * maps for files.
198 *
199 * In order to reestablish file backed mappings the fault handlers
200 * will take the radix tree_lock which may then be used to stop
201 * processes from accessing this page until the new page is ready.
202 *
203 * A process accessing via a swap pte (an anonymous page) will take a
204 * page_lock on the old page which will block the process until the
205 * migration attempt is complete. At that time the PageSwapCache bit
206 * will be examined. If the page was migrated then the PageSwapCache
207 * bit will be clear and the operation to retrieve the page will be
208 * retried which will find the new page in the radix tree. Then a new
209 * direct mapping may be generated based on the radix tree contents.
210 *
211 * If the page was not migrated then the PageSwapCache bit
212 * is still set and the operation may continue.
213 */
214 if (try_to_unmap(page, 1) == SWAP_FAIL)
215 /* A vma has VM_LOCKED set -> permanent failure */
216 return -EPERM;
217 226
218 /* 227 /*
219 * Give up if we were unable to remove all mappings. 228 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
220 */ 229 */
221 if (page_mapcount(page)) 230 anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
222 return -EAGAIN; 231 spin_lock(&anon_vma->lock);
232
233 list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
234 remove_migration_pte(vma, old, new);
235
236 spin_unlock(&anon_vma->lock);
237}
238
239/*
240 * Get rid of all migration entries and replace them by
241 * references to the indicated page.
242 */
243static void remove_migration_ptes(struct page *old, struct page *new)
244{
245 if (PageAnon(new))
246 remove_anon_migration_ptes(old, new);
247 else
248 remove_file_migration_ptes(old, new);
249}
250
251/*
252 * Something used the pte of a page under migration. We need to
253 * get to the page and wait until migration is finished.
254 * When we return from this function the fault will be retried.
255 *
256 * This function is called from do_swap_page().
257 */
258void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
259 unsigned long address)
260{
261 pte_t *ptep, pte;
262 spinlock_t *ptl;
263 swp_entry_t entry;
264 struct page *page;
265
266 ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
267 pte = *ptep;
268 if (!is_swap_pte(pte))
269 goto out;
270
271 entry = pte_to_swp_entry(pte);
272 if (!is_migration_entry(entry))
273 goto out;
274
275 page = migration_entry_to_page(entry);
276
277 get_page(page);
278 pte_unmap_unlock(ptep, ptl);
279 wait_on_page_locked(page);
280 put_page(page);
281 return;
282out:
283 pte_unmap_unlock(ptep, ptl);
284}
285
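Both halves above lean on the same trick: remove_migration_pte() can rebuild a working pte, and migration_entry_wait() can recognize one in the fault path, because a migration entry is a non-present pte that still records which page is being moved and whether the mapping was writable. A minimal user-space model of that encoding follows; the macro and helper names are illustrative, not the kernel's swapops.h API, and the only assumption is that the page pointer is at least 4-byte aligned.

/*
 * Standalone model (user space): a "migration entry" packs the page
 * pointer and the remembered write permission into one non-present
 * pte-sized word.  Names are illustrative; the real helpers live in
 * include/linux/swapops.h.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define ENTRY_PRESENT	0x1UL	/* clear => pte looks non-present */
#define ENTRY_WRITE	0x2UL	/* remembered write permission    */

static uintptr_t make_migration_entry(void *page, int writable)
{
	/* low bits are free because the pointer is at least 4-aligned */
	return (uintptr_t)page | (writable ? ENTRY_WRITE : 0);
}

static void *entry_to_page(uintptr_t e)	{ return (void *)(e & ~3UL); }
static int entry_is_write(uintptr_t e)	{ return !!(e & ENTRY_WRITE); }

int main(void)
{
	long fake_page;		/* stands in for a struct page */
	uintptr_t pte = make_migration_entry(&fake_page, 1);

	assert(!(pte & ENTRY_PRESENT));			    /* fault path triggers   */
	assert(entry_to_page(pte) == (void *)&fake_page);   /* ...finds the old page */
	assert(entry_is_write(pte));			    /* ...and the write bit  */
	printf("entry -> page %p, writable %d\n",
	       entry_to_page(pte), entry_is_write(pte));
	return 0;
}

In the kernel, the faulting thread then does what migration_entry_wait() shows above: take a reference, drop the pte lock, and sleep on the old page's lock until the migration attempt completes and the page is unlocked.
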
286/*
287 * Replace the page in the mapping.
288 *
289 * The number of remaining references must be:
290 * 1 for anonymous pages without a mapping
291 * 2 for pages with a mapping
292 * 3 for pages with a mapping and PagePrivate set.
293 */
294static int migrate_page_move_mapping(struct address_space *mapping,
295 struct page *newpage, struct page *page)
296{
297 struct page **radix_pointer;
298
299 if (!mapping) {
300 /* Anonymous page */
301 if (page_count(page) != 1)
302 return -EAGAIN;
303 return 0;
304 }
223 305
224 write_lock_irq(&mapping->tree_lock); 306 write_lock_irq(&mapping->tree_lock);
225 307
@@ -227,7 +309,7 @@ int migrate_page_remove_references(struct page *newpage,
227 &mapping->page_tree, 309 &mapping->page_tree,
228 page_index(page)); 310 page_index(page));
229 311
230 if (!page_mapping(page) || page_count(page) != nr_refs || 312 if (page_count(page) != 2 + !!PagePrivate(page) ||
231 *radix_pointer != page) { 313 *radix_pointer != page) {
232 write_unlock_irq(&mapping->tree_lock); 314 write_unlock_irq(&mapping->tree_lock);
233 return -EAGAIN; 315 return -EAGAIN;
@@ -235,19 +317,14 @@ int migrate_page_remove_references(struct page *newpage,
235 317
236 /* 318 /*
237 * Now we know that no one else is looking at the page. 319 * Now we know that no one else is looking at the page.
238 *
239 * Certain minimal information about a page must be available
240 * in order for other subsystems to properly handle the page if they
241 * find it through the radix tree update before we are finished
242 * copying the page.
243 */ 320 */
244 get_page(newpage); 321 get_page(newpage);
245 newpage->index = page->index; 322#ifdef CONFIG_SWAP
246 newpage->mapping = page->mapping;
247 if (PageSwapCache(page)) { 323 if (PageSwapCache(page)) {
248 SetPageSwapCache(newpage); 324 SetPageSwapCache(newpage);
249 set_page_private(newpage, page_private(page)); 325 set_page_private(newpage, page_private(page));
250 } 326 }
327#endif
251 328
252 *radix_pointer = newpage; 329 *radix_pointer = newpage;
253 __put_page(page); 330 __put_page(page);
@@ -255,12 +332,11 @@ int migrate_page_remove_references(struct page *newpage,
255 332
256 return 0; 333 return 0;
257} 334}
258EXPORT_SYMBOL(migrate_page_remove_references);
259 335
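The reference counts spelled out in the comment above collapse into one expression; a small illustrative helper, where has_mapping and has_private are hypothetical stand-ins for page_mapping() and PagePrivate():

/* Illustrative only: the page_count() value migrate_page_move_mapping()
 * requires before it will switch the radix tree slot to the new page. */
static int expected_migration_refs(int has_mapping, int has_private)
{
	if (!has_mapping)
		return 1;		/* isolation reference only         */
	return 2 + !!has_private;	/* + page cache, + buffer head refs */
}

That is exactly the 2 + !!PagePrivate(page) and page_count(page) != 1 tests in the hunk above; any extra reference means someone else is still using the page and the move is retried with -EAGAIN.
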
260/* 336/*
261 * Copy the page to its new location 337 * Copy the page to its new location
262 */ 338 */
263void migrate_page_copy(struct page *newpage, struct page *page) 339static void migrate_page_copy(struct page *newpage, struct page *page)
264{ 340{
265 copy_highpage(newpage, page); 341 copy_highpage(newpage, page);
266 342
@@ -282,7 +358,9 @@ void migrate_page_copy(struct page *newpage, struct page *page)
282 set_page_dirty(newpage); 358 set_page_dirty(newpage);
283 } 359 }
284 360
361#ifdef CONFIG_SWAP
285 ClearPageSwapCache(page); 362 ClearPageSwapCache(page);
363#endif
286 ClearPageActive(page); 364 ClearPageActive(page);
287 ClearPagePrivate(page); 365 ClearPagePrivate(page);
288 set_page_private(page, 0); 366 set_page_private(page, 0);
@@ -295,7 +373,18 @@ void migrate_page_copy(struct page *newpage, struct page *page)
295 if (PageWriteback(newpage)) 373 if (PageWriteback(newpage))
296 end_page_writeback(newpage); 374 end_page_writeback(newpage);
297} 375}
298EXPORT_SYMBOL(migrate_page_copy); 376
377/************************************************************
378 * Migration functions
379 ***********************************************************/
380
381/* Always fail migration. Used for mappings that are not movable */
382int fail_migrate_page(struct address_space *mapping,
383 struct page *newpage, struct page *page)
384{
385 return -EIO;
386}
387EXPORT_SYMBOL(fail_migrate_page);
299 388
300/* 389/*
301 * Common logic to directly migrate a single page suitable for 390 * Common logic to directly migrate a single page suitable for
@@ -303,51 +392,286 @@ EXPORT_SYMBOL(migrate_page_copy);
303 * 392 *
304 * Pages are locked upon entry and exit. 393 * Pages are locked upon entry and exit.
305 */ 394 */
306int migrate_page(struct page *newpage, struct page *page) 395int migrate_page(struct address_space *mapping,
396 struct page *newpage, struct page *page)
307{ 397{
308 int rc; 398 int rc;
309 399
310 BUG_ON(PageWriteback(page)); /* Writeback must be complete */ 400 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
311 401
312 rc = migrate_page_remove_references(newpage, page, 2); 402 rc = migrate_page_move_mapping(mapping, newpage, page);
403
404 if (rc)
405 return rc;
406
407 migrate_page_copy(newpage, page);
408 return 0;
409}
410EXPORT_SYMBOL(migrate_page);
411
412/*
413 * Migration function for pages with buffers. This function can only be used
414 * if the underlying filesystem guarantees that no other references to "page"
415 * exist.
416 */
417int buffer_migrate_page(struct address_space *mapping,
418 struct page *newpage, struct page *page)
419{
420 struct buffer_head *bh, *head;
421 int rc;
422
423 if (!page_has_buffers(page))
424 return migrate_page(mapping, newpage, page);
425
426 head = page_buffers(page);
427
428 rc = migrate_page_move_mapping(mapping, newpage, page);
313 429
314 if (rc) 430 if (rc)
315 return rc; 431 return rc;
316 432
433 bh = head;
434 do {
435 get_bh(bh);
436 lock_buffer(bh);
437 bh = bh->b_this_page;
438
439 } while (bh != head);
440
441 ClearPagePrivate(page);
442 set_page_private(newpage, page_private(page));
443 set_page_private(page, 0);
444 put_page(page);
445 get_page(newpage);
446
447 bh = head;
448 do {
449 set_bh_page(bh, newpage, bh_offset(bh));
450 bh = bh->b_this_page;
451
452 } while (bh != head);
453
454 SetPagePrivate(newpage);
455
317 migrate_page_copy(newpage, page); 456 migrate_page_copy(newpage, page);
318 457
458 bh = head;
459 do {
460 unlock_buffer(bh);
461 put_bh(bh);
462 bh = bh->b_this_page;
463
464 } while (bh != head);
465
466 return 0;
467}
468EXPORT_SYMBOL(buffer_migrate_page);
469
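A filesystem whose pages carry buffer_heads opts in by pointing the migratepage member of its address_space_operations at the helper exported above. A hedged kernel-context sketch, not standalone code: examplefs is hypothetical, the remaining methods are omitted, and the header that declares buffer_migrate_page() depends on the kernel version.

#include <linux/fs.h>	/* struct address_space_operations */

/* examplefs is hypothetical; readpage, writepage, etc. omitted. */
static struct address_space_operations examplefs_aops = {
	.migratepage	= buffer_migrate_page,
};

Mappings that must never be migrated can point the same member at fail_migrate_page() instead, and mappings that define no method at all fall through to fallback_migrate_page() further down.
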
470/*
471 * Writeback a page to clean the dirty state
472 */
473static int writeout(struct address_space *mapping, struct page *page)
474{
475 struct writeback_control wbc = {
476 .sync_mode = WB_SYNC_NONE,
477 .nr_to_write = 1,
478 .range_start = 0,
479 .range_end = LLONG_MAX,
480 .nonblocking = 1,
481 .for_reclaim = 1
482 };
483 int rc;
484
485 if (!mapping->a_ops->writepage)
486 /* No write method for the address space */
487 return -EINVAL;
488
489 if (!clear_page_dirty_for_io(page))
490 /* Someone else already triggered a write */
491 return -EAGAIN;
492
319 /* 493 /*
320 * Remove auxiliary swap entries and replace 494 * A dirty page may imply that the underlying filesystem has
321 * them with real ptes. 495 * the page on some queue. So the page must be clean for
322 * 496 * migration. Writeout may mean we loose the lock and the
323 * Note that a real pte entry will allow processes that are not 497 * page state is no longer what we checked for earlier.
324 * waiting on the page lock to use the new page via the page tables 498 * At this point we know that the migration attempt cannot
325 * before the new page is unlocked. 499 * be successful.
326 */ 500 */
327 remove_from_swap(newpage); 501 remove_migration_ptes(page, page);
328 return 0; 502
503 rc = mapping->a_ops->writepage(page, &wbc);
504 if (rc < 0)
505 /* I/O Error writing */
506 return -EIO;
507
508 if (rc != AOP_WRITEPAGE_ACTIVATE)
509 /* unlocked. Relock */
510 lock_page(page);
511
512 return -EAGAIN;
513}
514
515/*
516 * Default handling if a filesystem does not provide a migration function.
517 */
518static int fallback_migrate_page(struct address_space *mapping,
519 struct page *newpage, struct page *page)
520{
521 if (PageDirty(page))
522 return writeout(mapping, page);
523
524 /*
525 * Buffers may be managed in a filesystem specific way.
526 * We must have no buffers or drop them.
527 */
528 if (page_has_buffers(page) &&
529 !try_to_release_page(page, GFP_KERNEL))
530 return -EAGAIN;
531
532 return migrate_page(mapping, newpage, page);
533}
534
535/*
536 * Move a page to a newly allocated page
537 * The page is locked and all ptes have been successfully removed.
538 *
539 * The new page will have replaced the old page if this function
540 * is successful.
541 */
542static int move_to_new_page(struct page *newpage, struct page *page)
543{
544 struct address_space *mapping;
545 int rc;
546
547 /*
548 * Block others from accessing the page when we get around to
549 * establishing additional references. We are the only one
550 * holding a reference to the new page at this point.
551 */
552 if (TestSetPageLocked(newpage))
553 BUG();
554
555 /* Prepare mapping for the new page.*/
556 newpage->index = page->index;
557 newpage->mapping = page->mapping;
558
559 mapping = page_mapping(page);
560 if (!mapping)
561 rc = migrate_page(mapping, newpage, page);
562 else if (mapping->a_ops->migratepage)
563 /*
564 * Most pages have a mapping and most filesystems
565 * should provide a migration function. Anonymous
566 * pages are part of swap space which also has its
567 * own migration function. This is the most common
568 * path for page migration.
569 */
570 rc = mapping->a_ops->migratepage(mapping,
571 newpage, page);
572 else
573 rc = fallback_migrate_page(mapping, newpage, page);
574
575 if (!rc)
576 remove_migration_ptes(page, newpage);
577 else
578 newpage->mapping = NULL;
579
580 unlock_page(newpage);
581
582 return rc;
583}
584
585/*
586 * Obtain the lock on page, remove all ptes and migrate the page
587 * to the newly allocated page in newpage.
588 */
589static int unmap_and_move(new_page_t get_new_page, unsigned long private,
590 struct page *page, int force)
591{
592 int rc = 0;
593 int *result = NULL;
594 struct page *newpage = get_new_page(page, private, &result);
595
596 if (!newpage)
597 return -ENOMEM;
598
599 if (page_count(page) == 1)
600 /* page was freed from under us. So we are done. */
601 goto move_newpage;
602
603 rc = -EAGAIN;
604 if (TestSetPageLocked(page)) {
605 if (!force)
606 goto move_newpage;
607 lock_page(page);
608 }
609
610 if (PageWriteback(page)) {
611 if (!force)
612 goto unlock;
613 wait_on_page_writeback(page);
614 }
615
616 /*
617 * Establish migration ptes or remove ptes
618 */
619 if (try_to_unmap(page, 1) != SWAP_FAIL) {
620 if (!page_mapped(page))
621 rc = move_to_new_page(newpage, page);
622 } else
623 /* A vma has VM_LOCKED set -> permanent failure */
624 rc = -EPERM;
625
626 if (rc)
627 remove_migration_ptes(page, page);
628unlock:
629 unlock_page(page);
630
631 if (rc != -EAGAIN) {
632 /*
633 * A page that has been migrated has all references
634 * removed and will be freed. A page that has not been
635 * migrated will have kept its references and be
636 * restored.
637 */
638 list_del(&page->lru);
639 move_to_lru(page);
640 }
641
642move_newpage:
643 /*
644 * Move the new page to the LRU. If migration was not successful
645 * then this will free the page.
646 */
647 move_to_lru(newpage);
648 if (result) {
649 if (rc)
650 *result = rc;
651 else
652 *result = page_to_nid(newpage);
653 }
654 return rc;
329} 655}
330EXPORT_SYMBOL(migrate_page);
331 656
332/* 657/*
333 * migrate_pages 658 * migrate_pages
334 * 659 *
335 * Two lists are passed to this function. The first list 660 * The function takes one list of pages to migrate and a function
336 * contains the pages isolated from the LRU to be migrated. 661 * that determines from the page to be migrated and the private data
337 * The second list contains new pages that the pages isolated 662 * the target of the move and allocates the page.
338 * can be moved to. If the second list is NULL then all
339 * pages are swapped out.
340 * 663 *
341 * The function returns after 10 attempts or if no pages 664 * The function returns after 10 attempts or if no pages
342 * are movable anymore because to has become empty 665 * are movable anymore because to has become empty
343 * or no retryable pages exist anymore. 666 * or no retryable pages exist anymore. All pages will be
667 * returned to the LRU or freed.
344 * 668 *
345 * Return: Number of pages not migrated when "to" ran empty. 669 * Return: Number of pages not migrated or error code.
346 */ 670 */
347int migrate_pages(struct list_head *from, struct list_head *to, 671int migrate_pages(struct list_head *from,
348 struct list_head *moved, struct list_head *failed) 672 new_page_t get_new_page, unsigned long private)
349{ 673{
350 int retry; 674 int retry = 1;
351 int nr_failed = 0; 675 int nr_failed = 0;
352 int pass = 0; 676 int pass = 0;
353 struct page *page; 677 struct page *page;
@@ -358,305 +682,297 @@ int migrate_pages(struct list_head *from, struct list_head *to,
358 if (!swapwrite) 682 if (!swapwrite)
359 current->flags |= PF_SWAPWRITE; 683 current->flags |= PF_SWAPWRITE;
360 684
361redo: 685 for(pass = 0; pass < 10 && retry; pass++) {
362 retry = 0; 686 retry = 0;
687
688 list_for_each_entry_safe(page, page2, from, lru) {
689 cond_resched();
690
691 rc = unmap_and_move(get_new_page, private,
692 page, pass > 2);
693
694 switch(rc) {
695 case -ENOMEM:
696 goto out;
697 case -EAGAIN:
698 retry++;
699 break;
700 case 0:
701 break;
702 default:
703 /* Permanent failure */
704 nr_failed++;
705 break;
706 }
707 }
708 }
709 rc = 0;
710out:
711 if (!swapwrite)
712 current->flags &= ~PF_SWAPWRITE;
363 713
364 list_for_each_entry_safe(page, page2, from, lru) { 714 putback_lru_pages(from);
365 struct page *newpage = NULL;
366 struct address_space *mapping;
367 715
368 cond_resched(); 716 if (rc)
717 return rc;
369 718
370 rc = 0; 719 return nr_failed + retry;
371 if (page_count(page) == 1) 720}
372 /* page was freed from under us. So we are done. */
373 goto next;
374 721
375 if (to && list_empty(to)) 722#ifdef CONFIG_NUMA
376 break; 723/*
724 * Move a list of individual pages
725 */
726struct page_to_node {
727 unsigned long addr;
728 struct page *page;
729 int node;
730 int status;
731};
377 732
378 /* 733static struct page *new_page_node(struct page *p, unsigned long private,
379 * Skip locked pages during the first two passes to give the 734 int **result)
380 * functions holding the lock time to release the page. Later we 735{
381 * use lock_page() to have a higher chance of acquiring the 736 struct page_to_node *pm = (struct page_to_node *)private;
382 * lock.
383 */
384 rc = -EAGAIN;
385 if (pass > 2)
386 lock_page(page);
387 else
388 if (TestSetPageLocked(page))
389 goto next;
390 737
391 /* 738 while (pm->node != MAX_NUMNODES && pm->page != p)
392 * Only wait on writeback if we have already done a pass where 739 pm++;
393 * we may have triggered writeouts for lots of pages.
394 */
395 if (pass > 0) {
396 wait_on_page_writeback(page);
397 } else {
398 if (PageWriteback(page))
399 goto unlock_page;
400 }
401 740
402 /* 741 if (pm->node == MAX_NUMNODES)
403 * Anonymous pages must have swap cache references otherwise 742 return NULL;
404 * the information contained in the page maps cannot be
405 * preserved.
406 */
407 if (PageAnon(page) && !PageSwapCache(page)) {
408 if (!add_to_swap(page, GFP_KERNEL)) {
409 rc = -ENOMEM;
410 goto unlock_page;
411 }
412 }
413 743
414 if (!to) { 744 *result = &pm->status;
415 rc = swap_page(page);
416 goto next;
417 }
418 745
419 newpage = lru_to_page(to); 746 return alloc_pages_node(pm->node, GFP_HIGHUSER, 0);
420 lock_page(newpage); 747}
421 748
422 /* 749/*
423 * Pages are properly locked and writeback is complete. 750 * Move a set of pages as indicated in the pm array. The addr
424 * Try to migrate the page. 751 * field must be set to the virtual address of the page to be moved
425 */ 752 * and the node number must contain a valid target node.
426 mapping = page_mapping(page); 753 */
427 if (!mapping) 754static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
428 goto unlock_both; 755 int migrate_all)
756{
757 int err;
758 struct page_to_node *pp;
759 LIST_HEAD(pagelist);
429 760
430 if (mapping->a_ops->migratepage) { 761 down_read(&mm->mmap_sem);
431 /*
432 * Most pages have a mapping and most filesystems
433 * should provide a migration function. Anonymous
434 * pages are part of swap space which also has its
435 * own migration function. This is the most common
436 * path for page migration.
437 */
438 rc = mapping->a_ops->migratepage(newpage, page);
439 goto unlock_both;
440 }
441
442 /* Make sure the dirty bit is up to date */
443 if (try_to_unmap(page, 1) == SWAP_FAIL) {
444 rc = -EPERM;
445 goto unlock_both;
446 }
447 762
448 if (page_mapcount(page)) { 763 /*
449 rc = -EAGAIN; 764 * Build a list of pages to migrate
450 goto unlock_both; 765 */
451 } 766 migrate_prep();
767 for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
768 struct vm_area_struct *vma;
769 struct page *page;
452 770
453 /* 771 /*
454 * Default handling if a filesystem does not provide 772 * A valid page pointer that will not match any of the
455 * a migration function. We can only migrate clean 773 * pages that will be moved.
456 * pages so try to write out any dirty pages first.
457 */ 774 */
458 if (PageDirty(page)) { 775 pp->page = ZERO_PAGE(0);
459 switch (pageout(page, mapping)) {
460 case PAGE_KEEP:
461 case PAGE_ACTIVATE:
462 goto unlock_both;
463
464 case PAGE_SUCCESS:
465 unlock_page(newpage);
466 goto next;
467
468 case PAGE_CLEAN:
469 ; /* try to migrate the page below */
470 }
471 }
472 776
473 /* 777 err = -EFAULT;
474 * Buffers are managed in a filesystem specific way. 778 vma = find_vma(mm, pp->addr);
475 * We must have no buffers or drop them. 779 if (!vma)
476 */ 780 goto set_status;
477 if (!page_has_buffers(page) ||
478 try_to_release_page(page, GFP_KERNEL)) {
479 rc = migrate_page(newpage, page);
480 goto unlock_both;
481 }
482 781
483 /* 782 page = follow_page(vma, pp->addr, FOLL_GET);
484 * On early passes with mapped pages simply 783 err = -ENOENT;
485 * retry. There may be a lock held for some 784 if (!page)
486 * buffers that may go away. Later 785 goto set_status;
487 * swap them out. 786
488 */ 787 if (PageReserved(page)) /* Check for zero page */
489 if (pass > 4) { 788 goto put_and_set;
789
790 pp->page = page;
791 err = page_to_nid(page);
792
793 if (err == pp->node)
490 /* 794 /*
491 * Persistently unable to drop buffers..... As a 795 * Node already in the right place
492 * measure of last resort we fall back to
493 * swap_page().
494 */ 796 */
495 unlock_page(newpage); 797 goto put_and_set;
496 newpage = NULL;
497 rc = swap_page(page);
498 goto next;
499 }
500 798
501unlock_both: 799 err = -EACCES;
502 unlock_page(newpage); 800 if (page_mapcount(page) > 1 &&
503 801 !migrate_all)
504unlock_page: 802 goto put_and_set;
505 unlock_page(page); 803
506 804 err = isolate_lru_page(page, &pagelist);
507next: 805put_and_set:
508 if (rc == -EAGAIN) { 806 /*
509 retry++; 807 * Either remove the duplicate refcount from
510 } else if (rc) { 808 * isolate_lru_page() or drop the page ref if it was
511 /* Permanent failure */ 809 * not isolated.
512 list_move(&page->lru, failed); 810 */
513 nr_failed++; 811 put_page(page);
514 } else { 812set_status:
515 if (newpage) { 813 pp->status = err;
516 /* Successful migration. Return page to LRU */
517 move_to_lru(newpage);
518 }
519 list_move(&page->lru, moved);
520 }
521 } 814 }
522 if (retry && pass++ < 10)
523 goto redo;
524 815
525 if (!swapwrite) 816 if (!list_empty(&pagelist))
526 current->flags &= ~PF_SWAPWRITE; 817 err = migrate_pages(&pagelist, new_page_node,
818 (unsigned long)pm);
819 else
820 err = -ENOENT;
527 821
528 return nr_failed + retry; 822 up_read(&mm->mmap_sem);
823 return err;
529} 824}
530 825
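Neither do_move_pages() nor new_page_node() is handed an element count; the pm[] array ends with an entry whose node is MAX_NUMNODES and the callback scans up to that marker. A standalone model of the convention (the MAX_NUMNODES value and the trimmed struct are demo assumptions):

/* Model of the end-marker convention used by the pm[] array above:
 * the list is terminated by an entry whose node is MAX_NUMNODES, so
 * callers can walk it without a separate length argument. */
#include <stdio.h>

#define MAX_NUMNODES 64		/* placeholder value for the demo */

struct page_to_node {
	unsigned long addr;
	int node;
	int status;
};

static int node_for_addr(const struct page_to_node *pm, unsigned long addr)
{
	while (pm->node != MAX_NUMNODES && pm->addr != addr)
		pm++;
	return pm->node == MAX_NUMNODES ? -1 : pm->node;
}

int main(void)
{
	struct page_to_node pm[] = {
		{ 0x1000, 0, 0 },
		{ 0x2000, 1, 0 },
		{ 0, MAX_NUMNODES, 0 },		/* end marker */
	};

	printf("0x2000 -> node %d\n", node_for_addr(pm, 0x2000));
	printf("0x3000 -> node %d\n", node_for_addr(pm, 0x3000));
	return 0;
}

sys_move_pages() below writes that end marker with pm[nr_pages].node = MAX_NUMNODES before handing the array over.
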
531/* 826/*
532 * Migration function for pages with buffers. This function can only be used 827 * Determine the nodes of a list of pages. The addr in the pm array
533 * if the underlying filesystem guarantees that no other references to "page" 828 * must have been set to the virtual address of which we want to determine
534 * exist. 829 * the node number.
535 */ 830 */
536int buffer_migrate_page(struct page *newpage, struct page *page) 831static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
537{ 832{
538 struct address_space *mapping = page->mapping; 833 down_read(&mm->mmap_sem);
539 struct buffer_head *bh, *head; 834
540 int rc; 835 for ( ; pm->node != MAX_NUMNODES; pm++) {
836 struct vm_area_struct *vma;
837 struct page *page;
838 int err;
839
840 err = -EFAULT;
841 vma = find_vma(mm, pm->addr);
842 if (!vma)
843 goto set_status;
844
845 page = follow_page(vma, pm->addr, 0);
846 err = -ENOENT;
847 /* Use PageReserved to check for zero page */
848 if (!page || PageReserved(page))
849 goto set_status;
850
851 err = page_to_nid(page);
852set_status:
853 pm->status = err;
854 }
541 855
542 if (!mapping) 856 up_read(&mm->mmap_sem);
543 return -EAGAIN; 857 return 0;
858}
544 859
545 if (!page_has_buffers(page)) 860/*
546 return migrate_page(newpage, page); 861 * Move a list of pages in the address space of the currently executing
862 * process.
863 */
864asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
865 const void __user * __user *pages,
866 const int __user *nodes,
867 int __user *status, int flags)
868{
869 int err = 0;
870 int i;
871 struct task_struct *task;
872 nodemask_t task_nodes;
873 struct mm_struct *mm;
874 struct page_to_node *pm = NULL;
547 875
548 head = page_buffers(page); 876 /* Check flags */
877 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
878 return -EINVAL;
549 879
550 rc = migrate_page_remove_references(newpage, page, 3); 880 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
881 return -EPERM;
551 882
552 if (rc) 883 /* Find the mm_struct */
553 return rc; 884 read_lock(&tasklist_lock);
885 task = pid ? find_task_by_pid(pid) : current;
886 if (!task) {
887 read_unlock(&tasklist_lock);
888 return -ESRCH;
889 }
890 mm = get_task_mm(task);
891 read_unlock(&tasklist_lock);
554 892
555 bh = head; 893 if (!mm)
556 do { 894 return -EINVAL;
557 get_bh(bh);
558 lock_buffer(bh);
559 bh = bh->b_this_page;
560 895
561 } while (bh != head); 896 /*
897 * Check if this process has the right to modify the specified
898 * process. The right exists if the process has administrative
899 * capabilities, superuser privileges or the same
900 * userid as the target process.
901 */
902 if ((current->euid != task->suid) && (current->euid != task->uid) &&
903 (current->uid != task->suid) && (current->uid != task->uid) &&
904 !capable(CAP_SYS_NICE)) {
905 err = -EPERM;
906 goto out2;
907 }
562 908
563 ClearPagePrivate(page); 909 err = security_task_movememory(task);
564 set_page_private(newpage, page_private(page)); 910 if (err)
565 set_page_private(page, 0); 911 goto out2;
566 put_page(page);
567 get_page(newpage);
568 912
569 bh = head;
570 do {
571 set_bh_page(bh, newpage, bh_offset(bh));
572 bh = bh->b_this_page;
573 913
574 } while (bh != head); 914 task_nodes = cpuset_mems_allowed(task);
575 915
576 SetPagePrivate(newpage); 916 /* Limit nr_pages so that the multiplication may not overflow */
917 if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
918 err = -E2BIG;
919 goto out2;
920 }
577 921
578 migrate_page_copy(newpage, page); 922 pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
923 if (!pm) {
924 err = -ENOMEM;
925 goto out2;
926 }
579 927
580 bh = head; 928 /*
581 do { 929 * Get parameters from user space and initialize the pm
582 unlock_buffer(bh); 930 * array. Return various errors if the user did something wrong.
583 put_bh(bh); 931 */
584 bh = bh->b_this_page; 932 for (i = 0; i < nr_pages; i++) {
933 const void *p;
585 934
586 } while (bh != head); 935 err = -EFAULT;
936 if (get_user(p, pages + i))
937 goto out;
587 938
588 return 0; 939 pm[i].addr = (unsigned long)p;
589} 940 if (nodes) {
590EXPORT_SYMBOL(buffer_migrate_page); 941 int node;
591 942
592/* 943 if (get_user(node, nodes + i))
593 * Migrate the list 'pagelist' of pages to a certain destination. 944 goto out;
594 *
595 * Specify destination with either non-NULL vma or dest_node >= 0
596 * Return the number of pages not migrated or error code
597 */
598int migrate_pages_to(struct list_head *pagelist,
599 struct vm_area_struct *vma, int dest)
600{
601 LIST_HEAD(newlist);
602 LIST_HEAD(moved);
603 LIST_HEAD(failed);
604 int err = 0;
605 unsigned long offset = 0;
606 int nr_pages;
607 struct page *page;
608 struct list_head *p;
609 945
610redo: 946 err = -ENODEV;
611 nr_pages = 0; 947 if (!node_online(node))
612 list_for_each(p, pagelist) { 948 goto out;
613 if (vma) {
614 /*
615 * The address passed to alloc_page_vma is used to
616 * generate the proper interleave behavior. We fake
617 * the address here by an increasing offset in order
618 * to get the proper distribution of pages.
619 *
620 * No decision has been made as to which page
621 * a certain old page is moved to so we cannot
622 * specify the correct address.
623 */
624 page = alloc_page_vma(GFP_HIGHUSER, vma,
625 offset + vma->vm_start);
626 offset += PAGE_SIZE;
627 }
628 else
629 page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
630 949
631 if (!page) { 950 err = -EACCES;
632 err = -ENOMEM; 951 if (!node_isset(node, task_nodes))
633 goto out; 952 goto out;
953
954 pm[i].node = node;
634 } 955 }
635 list_add_tail(&page->lru, &newlist);
636 nr_pages++;
637 if (nr_pages > MIGRATE_CHUNK_SIZE)
638 break;
639 } 956 }
640 err = migrate_pages(pagelist, &newlist, &moved, &failed); 957 /* End marker */
958 pm[nr_pages].node = MAX_NUMNODES;
959
960 if (nodes)
961 err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
962 else
963 err = do_pages_stat(mm, pm);
641 964
642 putback_lru_pages(&moved); /* Call release pages instead ?? */ 965 if (err >= 0)
966 /* Return status information */
967 for (i = 0; i < nr_pages; i++)
968 if (put_user(pm[i].status, status + i))
969 err = -EFAULT;
643 970
644 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
645 goto redo;
646out: 971out:
647 /* Return leftover allocated pages */ 972 vfree(pm);
648 while (!list_empty(&newlist)) { 973out2:
649 page = list_entry(newlist.next, struct page, lru); 974 mmput(mm);
650 list_del(&page->lru); 975 return err;
651 __free_page(page);
652 }
653 list_splice(&failed, pagelist);
654 if (err < 0)
655 return err;
656
657 /* Calculate number of leftover pages */
658 nr_pages = 0;
659 list_for_each(p, pagelist)
660 nr_pages++;
661 return nr_pages;
662} 976}
977#endif
978
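From user space the new syscall is normally reached through the libnuma wrapper move_pages(2). A hedged sketch, assuming libnuma's numaif.h is installed, node 0 is online and the program is linked with -lnuma: passing a nodes array moves the pages, passing NULL merely reports their current node, matching do_move_pages() versus do_pages_stat() above.

#include <numaif.h>		/* move_pages(), MPOL_MF_MOVE */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	long psize = sysconf(_SC_PAGESIZE);
	void *pages[1];
	int nodes[1] = { 0 };		/* assumption: node 0 exists */
	int status[1];
	void *buf;
	long rc;

	if (posix_memalign(&buf, psize, psize))
		return 1;
	memset(buf, 1, psize);		/* fault the page in first */
	pages[0] = buf;

	/* pid 0 means the calling process; ask for a move to node 0 */
	rc = move_pages(0, 1, pages, nodes, status, MPOL_MF_MOVE);
	if (rc < 0)
		perror("move_pages");
	printf("move: rc=%ld status[0]=%d\n", rc, status[0]);

	/* nodes == NULL: only report which node the page sits on */
	rc = move_pages(0, 1, pages, NULL, status, 0);
	if (rc < 0)
		perror("move_pages (query)");
	printf("query: page is on node %d\n", status[0]);

	free(buf);
	return 0;
}
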
diff --git a/mm/mmap.c b/mm/mmap.c
index e6ee12344b13..6446c6134b04 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1065,7 +1065,8 @@ munmap_back:
1065 vma->vm_start = addr; 1065 vma->vm_start = addr;
1066 vma->vm_end = addr + len; 1066 vma->vm_end = addr + len;
1067 vma->vm_flags = vm_flags; 1067 vma->vm_flags = vm_flags;
1068 vma->vm_page_prot = protection_map[vm_flags & 0x0f]; 1068 vma->vm_page_prot = protection_map[vm_flags &
1069 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
1069 vma->vm_pgoff = pgoff; 1070 vma->vm_pgoff = pgoff;
1070 1071
1071 if (file) { 1072 if (file) {
@@ -1089,6 +1090,12 @@ munmap_back:
1089 goto free_vma; 1090 goto free_vma;
1090 } 1091 }
1091 1092
1093 /* Don't make the VMA automatically writable if it's shared, but the
1094 * backer wishes to know when pages are first written to */
1095 if (vma->vm_ops && vma->vm_ops->page_mkwrite)
1096 vma->vm_page_prot =
1097 protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
1098
1092 /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform 1099 /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
1093 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap) 1100 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
1094 * that memory reservation must be checked; but that reservation 1101 * that memory reservation must be checked; but that reservation
@@ -1921,7 +1928,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
1921 vma->vm_end = addr + len; 1928 vma->vm_end = addr + len;
1922 vma->vm_pgoff = pgoff; 1929 vma->vm_pgoff = pgoff;
1923 vma->vm_flags = flags; 1930 vma->vm_flags = flags;
1924 vma->vm_page_prot = protection_map[flags & 0x0f]; 1931 vma->vm_page_prot = protection_map[flags &
1932 (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
1925 vma_link(mm, vma, prev, rb_link, rb_parent); 1933 vma_link(mm, vma, prev, rb_link, rb_parent);
1926out: 1934out:
1927 mm->total_vm += len >> PAGE_SHIFT; 1935 mm->total_vm += len >> PAGE_SHIFT;
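
Spelling the index mask as VM_READ|VM_WRITE|VM_EXEC|VM_SHARED is a readability change only: those four flags occupy the low four bits, so the mask is still the old 0x0f. The page_mkwrite branch above then drops VM_SHARED from the index, so a shared writable mapping starts with the private (non-writable) protection and the first store faults, which is how the backer gets its notification. A standalone model; the flag values mirror include/linux/mm.h and the strings are only maps-style labels for the sixteen slots, not the kernel's pgprot_t contents.

/*
 * Standalone model of protection_map[] indexing.  Flag values mirror
 * include/linux/mm.h; the strings are /proc/<pid>/maps-style labels
 * for the demo, not the kernel's pgprot_t contents.
 */
#include <stdio.h>

#define VM_READ	  0x00000001UL
#define VM_WRITE  0x00000002UL
#define VM_EXEC	  0x00000004UL
#define VM_SHARED 0x00000008UL

static const char *protection_map[16] = {
	/* indexed by vm_flags & 0xf: READ=bit0, WRITE=bit1, EXEC=bit2, SHARED=bit3 */
	"---p", "r--p", "-w-p", "rw-p", "--xp", "r-xp", "-wxp", "rwxp",
	"---s", "r--s", "-w-s", "rw-s", "--xs", "r-xs", "-wxs", "rwxs",
};

int main(void)
{
	unsigned long vm_flags = VM_READ | VM_WRITE | VM_SHARED;
	unsigned long mask = VM_READ | VM_WRITE | VM_EXEC | VM_SHARED;

	printf("named mask = %#lx (same as the old 0x0f)\n", mask);
	printf("shared rw mapping          -> %s\n",
	       protection_map[vm_flags & mask]);
	/* page_mkwrite case: drop VM_SHARED so the first write faults */
	printf("same mapping with mkwrite  -> %s\n",
	       protection_map[vm_flags & (mask & ~VM_SHARED)]);
	return 0;
}
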
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 4c14d4289b61..638edabaff71 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -19,7 +19,8 @@
19#include <linux/mempolicy.h> 19#include <linux/mempolicy.h>
20#include <linux/personality.h> 20#include <linux/personality.h>
21#include <linux/syscalls.h> 21#include <linux/syscalls.h>
22 22#include <linux/swap.h>
23#include <linux/swapops.h>
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24#include <asm/pgtable.h> 25#include <asm/pgtable.h>
25#include <asm/cacheflush.h> 26#include <asm/cacheflush.h>
@@ -28,12 +29,13 @@
28static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, 29static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
29 unsigned long addr, unsigned long end, pgprot_t newprot) 30 unsigned long addr, unsigned long end, pgprot_t newprot)
30{ 31{
31 pte_t *pte; 32 pte_t *pte, oldpte;
32 spinlock_t *ptl; 33 spinlock_t *ptl;
33 34
34 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 35 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
35 do { 36 do {
36 if (pte_present(*pte)) { 37 oldpte = *pte;
38 if (pte_present(oldpte)) {
37 pte_t ptent; 39 pte_t ptent;
38 40
39 /* Avoid an SMP race with hardware updated dirty/clean 41 /* Avoid an SMP race with hardware updated dirty/clean
@@ -43,7 +45,22 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
43 ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); 45 ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot);
44 set_pte_at(mm, addr, pte, ptent); 46 set_pte_at(mm, addr, pte, ptent);
45 lazy_mmu_prot_update(ptent); 47 lazy_mmu_prot_update(ptent);
48#ifdef CONFIG_MIGRATION
49 } else if (!pte_file(oldpte)) {
50 swp_entry_t entry = pte_to_swp_entry(oldpte);
51
52 if (is_write_migration_entry(entry)) {
53 /*
54 * A protection check is difficult so
55 * just be safe and disable write
56 */
57 make_migration_entry_read(&entry);
58 set_pte_at(mm, addr, pte,
59 swp_entry_to_pte(entry));
60 }
61#endif
46 } 62 }
63
47 } while (pte++, addr += PAGE_SIZE, addr != end); 64 } while (pte++, addr += PAGE_SIZE, addr != end);
48 pte_unmap_unlock(pte - 1, ptl); 65 pte_unmap_unlock(pte - 1, ptl);
49} 66}
@@ -106,6 +123,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
106 unsigned long oldflags = vma->vm_flags; 123 unsigned long oldflags = vma->vm_flags;
107 long nrpages = (end - start) >> PAGE_SHIFT; 124 long nrpages = (end - start) >> PAGE_SHIFT;
108 unsigned long charged = 0; 125 unsigned long charged = 0;
126 unsigned int mask;
109 pgprot_t newprot; 127 pgprot_t newprot;
110 pgoff_t pgoff; 128 pgoff_t pgoff;
111 int error; 129 int error;
@@ -132,8 +150,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
132 } 150 }
133 } 151 }
134 152
135 newprot = protection_map[newflags & 0xf];
136
137 /* 153 /*
138 * First try to merge with previous and/or next vma. 154 * First try to merge with previous and/or next vma.
139 */ 155 */
@@ -160,6 +176,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
160 } 176 }
161 177
162success: 178success:
179 /* Don't make the VMA automatically writable if it's shared, but the
180 * backer wishes to know when pages are first written to */
181 mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED;
182 if (vma->vm_ops && vma->vm_ops->page_mkwrite)
183 mask &= ~VM_SHARED;
184
185 newprot = protection_map[newflags & mask];
186
163 /* 187 /*
164 * vm_flags and vm_page_prot are protected by the mmap_sem 188 * vm_flags and vm_page_prot are protected by the mmap_sem
165 * held in write mode. 189 * held in write mode.
@@ -205,8 +229,7 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
205 /* 229 /*
206 * Does the application expect PROT_READ to imply PROT_EXEC: 230 * Does the application expect PROT_READ to imply PROT_EXEC:
207 */ 231 */
208 if (unlikely((prot & PROT_READ) && 232 if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
209 (current->personality & READ_IMPLIES_EXEC)))
210 prot |= PROT_EXEC; 233 prot |= PROT_EXEC;
211 234
212 vm_flags = calc_vm_prot_bits(prot); 235 vm_flags = calc_vm_prot_bits(prot);
diff --git a/mm/msync.c b/mm/msync.c
index bc6c95376366..d083544df21b 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -170,8 +170,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
170 * just ignore them, but return -ENOMEM at the end. 170 * just ignore them, but return -ENOMEM at the end.
171 */ 171 */
172 down_read(&current->mm->mmap_sem); 172 down_read(&current->mm->mmap_sem);
173 if (flags & MS_SYNC)
174 current->flags |= PF_SYNCWRITE;
175 vma = find_vma(current->mm, start); 173 vma = find_vma(current->mm, start);
176 if (!vma) { 174 if (!vma) {
177 error = -ENOMEM; 175 error = -ENOMEM;
@@ -228,7 +226,6 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
228 } 226 }
229 } while (vma && !done); 227 } while (vma && !done);
230out_unlock: 228out_unlock:
231 current->flags &= ~PF_SYNCWRITE;
232 up_read(&current->mm->mmap_sem); 229 up_read(&current->mm->mmap_sem);
233out: 230out:
234 return error; 231 return error;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 042e6436c3ee..d46ed0f1dc06 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -22,10 +22,11 @@
22#include <linux/jiffies.h> 22#include <linux/jiffies.h>
23#include <linux/cpuset.h> 23#include <linux/cpuset.h>
24 24
25int sysctl_panic_on_oom;
25/* #define DEBUG */ 26/* #define DEBUG */
26 27
27/** 28/**
28 * oom_badness - calculate a numeric value for how bad this task has been 29 * badness - calculate a numeric value for how bad this task has been
29 * @p: task struct of which task we should calculate 30 * @p: task struct of which task we should calculate
30 * @uptime: current uptime in seconds 31 * @uptime: current uptime in seconds
31 * 32 *
@@ -200,7 +201,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints)
200 continue; 201 continue;
201 202
202 /* 203 /*
203 * This is in the process of releasing memory so for wait it 204 * This is in the process of releasing memory so wait for it
204 * to finish before killing some other task by mistake. 205 * to finish before killing some other task by mistake.
205 */ 206 */
206 releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || 207 releasing = test_tsk_thread_flag(p, TIF_MEMDIE) ||
@@ -306,7 +307,7 @@ static int oom_kill_process(struct task_struct *p, unsigned long points,
306} 307}
307 308
308/** 309/**
309 * oom_kill - kill the "best" process when we run out of memory 310 * out_of_memory - kill the "best" process when we run out of memory
310 * 311 *
311 * If we run out of memory, we have the choice between either 312 * If we run out of memory, we have the choice between either
312 * killing a random task (bad), letting the system crash (worse) 313 * killing a random task (bad), letting the system crash (worse)
@@ -344,6 +345,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
344 break; 345 break;
345 346
346 case CONSTRAINT_NONE: 347 case CONSTRAINT_NONE:
348 if (sysctl_panic_on_oom)
349 panic("out of memory. panic_on_oom is selected\n");
347retry: 350retry:
348 /* 351 /*
349 * Rambo mode: Shoot down a process and hope it solves whatever 352 * Rambo mode: Shoot down a process and hope it solves whatever
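
The new sysctl_panic_on_oom flag surfaces as the vm.panic_on_oom sysctl; a hedged user-space sketch of enabling it through the conventional /proc/sys path (path assumed from the usual sysctl layout, needs root):

/* Toggle the new knob from user space; assumes the conventional
 * /proc/sys/vm/panic_on_oom file for the sysctl backing
 * sysctl_panic_on_oom. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/vm/panic_on_oom", "w");

	if (!f) {
		perror("panic_on_oom");
		return 1;
	}
	fputs("1\n", f);	/* 0 = kill one task (default), 1 = panic */
	fclose(f);
	return 0;
}

Left at its default of 0 the kernel keeps killing a single task; set to 1, a CONSTRAINT_NONE OOM panics instead, as the hunk above shows.
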
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 75d7f48b79bb..8ccf6f1b1473 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -204,6 +204,7 @@ static void balance_dirty_pages(struct address_space *mapping)
204 .sync_mode = WB_SYNC_NONE, 204 .sync_mode = WB_SYNC_NONE,
205 .older_than_this = NULL, 205 .older_than_this = NULL,
206 .nr_to_write = write_chunk, 206 .nr_to_write = write_chunk,
207 .range_cyclic = 1,
207 }; 208 };
208 209
209 get_dirty_limits(&wbs, &background_thresh, 210 get_dirty_limits(&wbs, &background_thresh,
@@ -331,6 +332,7 @@ static void background_writeout(unsigned long _min_pages)
331 .older_than_this = NULL, 332 .older_than_this = NULL,
332 .nr_to_write = 0, 333 .nr_to_write = 0,
333 .nonblocking = 1, 334 .nonblocking = 1,
335 .range_cyclic = 1,
334 }; 336 };
335 337
336 for ( ; ; ) { 338 for ( ; ; ) {
@@ -407,6 +409,7 @@ static void wb_kupdate(unsigned long arg)
407 .nr_to_write = 0, 409 .nr_to_write = 0,
408 .nonblocking = 1, 410 .nonblocking = 1,
409 .for_kupdate = 1, 411 .for_kupdate = 1,
412 .range_cyclic = 1,
410 }; 413 };
411 414
412 sync_supers(); 415 sync_supers();
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 253a450c400d..423db0db7c02 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -37,6 +37,7 @@
37#include <linux/nodemask.h> 37#include <linux/nodemask.h>
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/mempolicy.h> 39#include <linux/mempolicy.h>
40#include <linux/stop_machine.h>
40 41
41#include <asm/tlbflush.h> 42#include <asm/tlbflush.h>
42#include <asm/div64.h> 43#include <asm/div64.h>
@@ -83,8 +84,8 @@ EXPORT_SYMBOL(zone_table);
83static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; 84static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
84int min_free_kbytes = 1024; 85int min_free_kbytes = 1024;
85 86
86unsigned long __initdata nr_kernel_pages; 87unsigned long __meminitdata nr_kernel_pages;
87unsigned long __initdata nr_all_pages; 88unsigned long __meminitdata nr_all_pages;
88 89
89#ifdef CONFIG_DEBUG_VM 90#ifdef CONFIG_DEBUG_VM
90static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 91static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
@@ -286,22 +287,27 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
286 * we can do coalesce a page and its buddy if 287 * we can do coalesce a page and its buddy if
287 * (a) the buddy is not in a hole && 288 * (a) the buddy is not in a hole &&
288 * (b) the buddy is in the buddy system && 289 * (b) the buddy is in the buddy system &&
289 * (c) a page and its buddy have the same order. 290 * (c) a page and its buddy have the same order &&
291 * (d) a page and its buddy are in the same zone.
290 * 292 *
291 * For recording whether a page is in the buddy system, we use PG_buddy. 293 * For recording whether a page is in the buddy system, we use PG_buddy.
292 * Setting, clearing, and testing PG_buddy is serialized by zone->lock. 294 * Setting, clearing, and testing PG_buddy is serialized by zone->lock.
293 * 295 *
294 * For recording page's order, we use page_private(page). 296 * For recording page's order, we use page_private(page).
295 */ 297 */
296static inline int page_is_buddy(struct page *page, int order) 298static inline int page_is_buddy(struct page *page, struct page *buddy,
299 int order)
297{ 300{
298#ifdef CONFIG_HOLES_IN_ZONE 301#ifdef CONFIG_HOLES_IN_ZONE
299 if (!pfn_valid(page_to_pfn(page))) 302 if (!pfn_valid(page_to_pfn(buddy)))
300 return 0; 303 return 0;
301#endif 304#endif
302 305
303 if (PageBuddy(page) && page_order(page) == order) { 306 if (page_zone_id(page) != page_zone_id(buddy))
304 BUG_ON(page_count(page) != 0); 307 return 0;
308
309 if (PageBuddy(buddy) && page_order(buddy) == order) {
310 BUG_ON(page_count(buddy) != 0);
305 return 1; 311 return 1;
306 } 312 }
307 return 0; 313 return 0;
@@ -352,7 +358,7 @@ static inline void __free_one_page(struct page *page,
352 struct page *buddy; 358 struct page *buddy;
353 359
354 buddy = __page_find_buddy(page, page_idx, order); 360 buddy = __page_find_buddy(page, page_idx, order);
355 if (!page_is_buddy(buddy, order)) 361 if (!page_is_buddy(page, buddy, order))
356 break; /* Move the buddy up one level. */ 362 break; /* Move the buddy up one level. */
357 363
358 list_del(&buddy->lru); 364 list_del(&buddy->lru);
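
page_is_buddy() is only the eligibility test; the candidate buddy itself comes from plain arithmetic on the zone-relative page index. A standalone demo of that arithmetic, mirroring the XOR/AND used by __page_find_buddy() and __find_combined_index(); the page_zone_id() comparison added above is what keeps a page and its arithmetic buddy from being merged across a zone boundary.

/* Standalone demo of buddy-index arithmetic for a zone-relative
 * page index; mirrors __page_find_buddy()/__find_combined_index(). */
#include <stdio.h>

static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);
}

static unsigned long combined_index(unsigned long page_idx, unsigned int order)
{
	return page_idx & ~(1UL << order);
}

int main(void)
{
	unsigned long idx = 12;		/* arbitrary zone-relative index */
	unsigned int order;

	for (order = 0; order < 4; order++)
		printf("order %u: buddy of %lu is %lu, merge to %lu\n",
		       order, idx, buddy_index(idx, order),
		       combined_index(idx, order));
	return 0;
}
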
@@ -1485,7 +1491,7 @@ void show_free_areas(void)
1485 } 1491 }
1486 1492
1487 for_each_zone(zone) { 1493 for_each_zone(zone) {
1488 unsigned long nr, flags, order, total = 0; 1494 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1489 1495
1490 show_node(zone); 1496 show_node(zone);
1491 printk("%s: ", zone->name); 1497 printk("%s: ", zone->name);
@@ -1496,11 +1502,12 @@ void show_free_areas(void)
1496 1502
1497 spin_lock_irqsave(&zone->lock, flags); 1503 spin_lock_irqsave(&zone->lock, flags);
1498 for (order = 0; order < MAX_ORDER; order++) { 1504 for (order = 0; order < MAX_ORDER; order++) {
1499 nr = zone->free_area[order].nr_free; 1505 nr[order] = zone->free_area[order].nr_free;
1500 total += nr << order; 1506 total += nr[order] << order;
1501 printk("%lu*%lukB ", nr, K(1UL) << order);
1502 } 1507 }
1503 spin_unlock_irqrestore(&zone->lock, flags); 1508 spin_unlock_irqrestore(&zone->lock, flags);
1509 for (order = 0; order < MAX_ORDER; order++)
1510 printk("%lu*%lukB ", nr[order], K(1UL) << order);
1504 printk("= %lukB\n", K(total)); 1511 printk("= %lukB\n", K(total));
1505 } 1512 }
1506 1513
@@ -1512,7 +1519,7 @@ void show_free_areas(void)
1512 * 1519 *
1513 * Add all populated zones of a node to the zonelist. 1520 * Add all populated zones of a node to the zonelist.
1514 */ 1521 */
1515static int __init build_zonelists_node(pg_data_t *pgdat, 1522static int __meminit build_zonelists_node(pg_data_t *pgdat,
1516 struct zonelist *zonelist, int nr_zones, int zone_type) 1523 struct zonelist *zonelist, int nr_zones, int zone_type)
1517{ 1524{
1518 struct zone *zone; 1525 struct zone *zone;
@@ -1548,7 +1555,7 @@ static inline int highest_zone(int zone_bits)
1548 1555
1549#ifdef CONFIG_NUMA 1556#ifdef CONFIG_NUMA
1550#define MAX_NODE_LOAD (num_online_nodes()) 1557#define MAX_NODE_LOAD (num_online_nodes())
1551static int __initdata node_load[MAX_NUMNODES]; 1558static int __meminitdata node_load[MAX_NUMNODES];
1552/** 1559/**
1553 * find_next_best_node - find the next node that should appear in a given node's fallback list 1560 * find_next_best_node - find the next node that should appear in a given node's fallback list
1554 * @node: node whose fallback list we're appending 1561 * @node: node whose fallback list we're appending
@@ -1563,7 +1570,7 @@ static int __initdata node_load[MAX_NUMNODES];
1563 * on them otherwise. 1570 * on them otherwise.
1564 * It returns -1 if no node is found. 1571 * It returns -1 if no node is found.
1565 */ 1572 */
1566static int __init find_next_best_node(int node, nodemask_t *used_node_mask) 1573static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
1567{ 1574{
1568 int n, val; 1575 int n, val;
1569 int min_val = INT_MAX; 1576 int min_val = INT_MAX;
@@ -1609,7 +1616,7 @@ static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
1609 return best_node; 1616 return best_node;
1610} 1617}
1611 1618
1612static void __init build_zonelists(pg_data_t *pgdat) 1619static void __meminit build_zonelists(pg_data_t *pgdat)
1613{ 1620{
1614 int i, j, k, node, local_node; 1621 int i, j, k, node, local_node;
1615 int prev_node, load; 1622 int prev_node, load;
@@ -1661,7 +1668,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
1661 1668
1662#else /* CONFIG_NUMA */ 1669#else /* CONFIG_NUMA */
1663 1670
1664static void __init build_zonelists(pg_data_t *pgdat) 1671static void __meminit build_zonelists(pg_data_t *pgdat)
1665{ 1672{
1666 int i, j, k, node, local_node; 1673 int i, j, k, node, local_node;
1667 1674
@@ -1699,14 +1706,29 @@ static void __init build_zonelists(pg_data_t *pgdat)
1699 1706
1700#endif /* CONFIG_NUMA */ 1707#endif /* CONFIG_NUMA */
1701 1708
1702void __init build_all_zonelists(void) 1709/* return value is int just for stop_machine_run() */
1710static int __meminit __build_all_zonelists(void *dummy)
1703{ 1711{
1704 int i; 1712 int nid;
1713 for_each_online_node(nid)
1714 build_zonelists(NODE_DATA(nid));
1715 return 0;
1716}
1705 1717
1706 for_each_online_node(i) 1718void __meminit build_all_zonelists(void)
1707 build_zonelists(NODE_DATA(i)); 1719{
1708 printk("Built %i zonelists\n", num_online_nodes()); 1720 if (system_state == SYSTEM_BOOTING) {
1709 cpuset_init_current_mems_allowed(); 1721 __build_all_zonelists(0);
1722 cpuset_init_current_mems_allowed();
1723 } else {
1724 /* we have to stop all cpus to guarantee there is no user
1725 of zonelist */
1726 stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
1727 /* cpuset refresh routine should be here */
1728 }
1729 vm_total_pages = nr_free_pagecache_pages();
1730 printk("Built %i zonelists. Total pages: %ld\n",
1731 num_online_nodes(), vm_total_pages);
1710} 1732}
1711 1733
1712/* 1734/*
@@ -1722,7 +1744,8 @@ void __init build_all_zonelists(void)
1722 */ 1744 */
1723#define PAGES_PER_WAITQUEUE 256 1745#define PAGES_PER_WAITQUEUE 256
1724 1746
1725static inline unsigned long wait_table_size(unsigned long pages) 1747#ifndef CONFIG_MEMORY_HOTPLUG
1748static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1726{ 1749{
1727 unsigned long size = 1; 1750 unsigned long size = 1;
1728 1751
@@ -1740,6 +1763,29 @@ static inline unsigned long wait_table_size(unsigned long pages)
1740 1763
1741 return max(size, 4UL); 1764 return max(size, 4UL);
1742} 1765}
1766#else
1767/*
1768 * A zone's size might be changed by hot-add, so it is not possible to determine
1769 * a suitable size for its wait_table. So we use the maximum size now.
1770 *
1771 * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie:
1772 *
1773 * i386 (preemption config) : 4096 x 16 = 64Kbyte.
1774 * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
1775 * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte.
1776 *
1777 * The maximum entries are prepared when a zone's memory is (512K + 256) pages
1778 * or more when sized the traditional way (see above). It equals:
1779 *
1780 * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte.
1781 * ia64(16K page size) : = ( 8G + 4M)byte.
1782 * powerpc (64K page size) : = (32G +16M)byte.
1783 */
1784static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
1785{
1786 return 4096UL;
1787}
1788#endif
1743 1789
1744/* 1790/*
1745 * This is an integer logarithm so that shifts can be used later 1791 * This is an integer logarithm so that shifts can be used later
@@ -2005,23 +2051,46 @@ void __init setup_per_cpu_pageset(void)
2005#endif 2051#endif
2006 2052
2007static __meminit 2053static __meminit
2008void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 2054int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
2009{ 2055{
2010 int i; 2056 int i;
2011 struct pglist_data *pgdat = zone->zone_pgdat; 2057 struct pglist_data *pgdat = zone->zone_pgdat;
2058 size_t alloc_size;
2012 2059
2013 /* 2060 /*
2014 * The per-page waitqueue mechanism uses hashed waitqueues 2061 * The per-page waitqueue mechanism uses hashed waitqueues
2015 * per zone. 2062 * per zone.
2016 */ 2063 */
2017 zone->wait_table_size = wait_table_size(zone_size_pages); 2064 zone->wait_table_hash_nr_entries =
2018 zone->wait_table_bits = wait_table_bits(zone->wait_table_size); 2065 wait_table_hash_nr_entries(zone_size_pages);
2019 zone->wait_table = (wait_queue_head_t *) 2066 zone->wait_table_bits =
2020 alloc_bootmem_node(pgdat, zone->wait_table_size 2067 wait_table_bits(zone->wait_table_hash_nr_entries);
2021 * sizeof(wait_queue_head_t)); 2068 alloc_size = zone->wait_table_hash_nr_entries
2069 * sizeof(wait_queue_head_t);
2070
2071 if (system_state == SYSTEM_BOOTING) {
2072 zone->wait_table = (wait_queue_head_t *)
2073 alloc_bootmem_node(pgdat, alloc_size);
2074 } else {
2075 /*
2076 * This case means that a zone whose size was 0 gets new memory
2077 * via memory hot-add.
2078 * But it may be the case that a new node was hot-added. In
2079 * this case vmalloc() will not be able to use this new node's
2080 * memory - this wait_table must be initialized to use this new
2081 * node itself as well.
2082 * To use this new node's memory, further consideration will be
2083 * necessary.
2084 */
2085 zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
2086 }
2087 if (!zone->wait_table)
2088 return -ENOMEM;
2022 2089
2023 for(i = 0; i < zone->wait_table_size; ++i) 2090 for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
2024 init_waitqueue_head(zone->wait_table + i); 2091 init_waitqueue_head(zone->wait_table + i);
2092
2093 return 0;
2025} 2094}
2026 2095
2027static __meminit void zone_pcp_init(struct zone *zone) 2096static __meminit void zone_pcp_init(struct zone *zone)
@@ -2043,12 +2112,15 @@ static __meminit void zone_pcp_init(struct zone *zone)
2043 zone->name, zone->present_pages, batch); 2112 zone->name, zone->present_pages, batch);
2044} 2113}
2045 2114
2046static __meminit void init_currently_empty_zone(struct zone *zone, 2115__meminit int init_currently_empty_zone(struct zone *zone,
2047 unsigned long zone_start_pfn, unsigned long size) 2116 unsigned long zone_start_pfn,
2117 unsigned long size)
2048{ 2118{
2049 struct pglist_data *pgdat = zone->zone_pgdat; 2119 struct pglist_data *pgdat = zone->zone_pgdat;
2050 2120 int ret;
2051 zone_wait_table_init(zone, size); 2121 ret = zone_wait_table_init(zone, size);
2122 if (ret)
2123 return ret;
2052 pgdat->nr_zones = zone_idx(zone) + 1; 2124 pgdat->nr_zones = zone_idx(zone) + 1;
2053 2125
2054 zone->zone_start_pfn = zone_start_pfn; 2126 zone->zone_start_pfn = zone_start_pfn;
@@ -2056,6 +2128,8 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
2056 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); 2128 memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
2057 2129
2058 zone_init_free_lists(pgdat, zone, zone->spanned_pages); 2130 zone_init_free_lists(pgdat, zone, zone->spanned_pages);
2131
2132 return 0;
2059} 2133}
2060 2134
2061/* 2135/*
@@ -2064,12 +2138,13 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
2064 * - mark all memory queues empty 2138 * - mark all memory queues empty
2065 * - clear the memory bitmaps 2139 * - clear the memory bitmaps
2066 */ 2140 */
2067static void __init free_area_init_core(struct pglist_data *pgdat, 2141static void __meminit free_area_init_core(struct pglist_data *pgdat,
2068 unsigned long *zones_size, unsigned long *zholes_size) 2142 unsigned long *zones_size, unsigned long *zholes_size)
2069{ 2143{
2070 unsigned long j; 2144 unsigned long j;
2071 int nid = pgdat->node_id; 2145 int nid = pgdat->node_id;
2072 unsigned long zone_start_pfn = pgdat->node_start_pfn; 2146 unsigned long zone_start_pfn = pgdat->node_start_pfn;
2147 int ret;
2073 2148
2074 pgdat_resize_init(pgdat); 2149 pgdat_resize_init(pgdat);
2075 pgdat->nr_zones = 0; 2150 pgdat->nr_zones = 0;
@@ -2111,7 +2186,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
2111 continue; 2186 continue;
2112 2187
2113 zonetable_add(zone, nid, j, zone_start_pfn, size); 2188 zonetable_add(zone, nid, j, zone_start_pfn, size);
2114 init_currently_empty_zone(zone, zone_start_pfn, size); 2189 ret = init_currently_empty_zone(zone, zone_start_pfn, size);
2190 BUG_ON(ret);
2115 zone_start_pfn += size; 2191 zone_start_pfn += size;
2116 } 2192 }
2117} 2193}
@@ -2152,7 +2228,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat)
2152#endif /* CONFIG_FLAT_NODE_MEM_MAP */ 2228#endif /* CONFIG_FLAT_NODE_MEM_MAP */
2153} 2229}
2154 2230
2155void __init free_area_init_node(int nid, struct pglist_data *pgdat, 2231void __meminit free_area_init_node(int nid, struct pglist_data *pgdat,
2156 unsigned long *zones_size, unsigned long node_start_pfn, 2232 unsigned long *zones_size, unsigned long node_start_pfn,
2157 unsigned long *zholes_size) 2233 unsigned long *zholes_size)
2158{ 2234{
@@ -2804,42 +2880,14 @@ void *__init alloc_large_system_hash(const char *tablename,
2804} 2880}
2805 2881
2806#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE 2882#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
2807/*
2808 * pfn <-> page translation. out-of-line version.
2809 * (see asm-generic/memory_model.h)
2810 */
2811#if defined(CONFIG_FLATMEM)
2812struct page *pfn_to_page(unsigned long pfn) 2883struct page *pfn_to_page(unsigned long pfn)
2813{ 2884{
2814 return mem_map + (pfn - ARCH_PFN_OFFSET); 2885 return __pfn_to_page(pfn);
2815} 2886}
2816unsigned long page_to_pfn(struct page *page) 2887unsigned long page_to_pfn(struct page *page)
2817{ 2888{
2818 return (page - mem_map) + ARCH_PFN_OFFSET; 2889 return __page_to_pfn(page);
2819}
2820#elif defined(CONFIG_DISCONTIGMEM)
2821struct page *pfn_to_page(unsigned long pfn)
2822{
2823 int nid = arch_pfn_to_nid(pfn);
2824 return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid);
2825}
2826unsigned long page_to_pfn(struct page *page)
2827{
2828 struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
2829 return (page - pgdat->node_mem_map) + pgdat->node_start_pfn;
2830}
2831#elif defined(CONFIG_SPARSEMEM)
2832struct page *pfn_to_page(unsigned long pfn)
2833{
2834 return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn;
2835}
2836
2837unsigned long page_to_pfn(struct page *page)
2838{
2839 long section_id = page_to_section(page);
2840 return page - __section_mem_map_addr(__nr_to_section(section_id));
2841} 2890}
2842#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */
2843EXPORT_SYMBOL(pfn_to_page); 2891EXPORT_SYMBOL(pfn_to_page);
2844EXPORT_SYMBOL(page_to_pfn); 2892EXPORT_SYMBOL(page_to_pfn);
2845#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ 2893#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
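
The page_alloc.c hunks above turn zone_wait_table_init() and init_currently_empty_zone() into int-returning functions so that a failed wait-table allocation (vmalloc'ed in the hotplug case shown at the top of this section) surfaces as -ENOMEM instead of being ignored; the boot-time caller in free_area_init_core() still treats failure as fatal via BUG_ON(ret). A minimal sketch of how a memory-hotplug style caller could propagate the error instead -- hotadd_new_zone() is a hypothetical name, not part of this patch:

/*
 * Illustrative only: shows why the int return matters.  Error handling
 * beyond propagation is elided.
 */
static int __meminit hotadd_new_zone(struct zone *zone,
				     unsigned long start_pfn,
				     unsigned long nr_pages)
{
	int ret;

	ret = init_currently_empty_zone(zone, start_pfn, nr_pages);
	if (ret)
		return ret;	/* zone_wait_table_init() could not allocate */

	/* further hotplug setup would follow here */
	return 0;
}
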
diff --git a/mm/pdflush.c b/mm/pdflush.c
index c4b6d0afd736..df7e50b8f70c 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -202,8 +202,7 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
202 unsigned long flags; 202 unsigned long flags;
203 int ret = 0; 203 int ret = 0;
204 204
205 if (fn == NULL) 205 BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */
206 BUG(); /* Hard to diagnose if it's deferred */
207 206
208 spin_lock_irqsave(&pdflush_lock, flags); 207 spin_lock_irqsave(&pdflush_lock, flags);
209 if (list_empty(&pdflush_list)) { 208 if (list_empty(&pdflush_list)) {
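
The pdflush.c hunk is a pure idiom cleanup: an open-coded NULL check followed by BUG() collapses into a single BUG_ON(), keeping the check and its comment on one line. The same transformation in isolation (generic sketch, not specific to pdflush):

	/* before */
	if (ptr == NULL)
		BUG();

	/* after */
	BUG_ON(ptr == NULL);
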
diff --git a/mm/rmap.c b/mm/rmap.c
index 1963e269314d..882a85826bb2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -103,7 +103,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
103 spin_lock(&mm->page_table_lock); 103 spin_lock(&mm->page_table_lock);
104 if (likely(!vma->anon_vma)) { 104 if (likely(!vma->anon_vma)) {
105 vma->anon_vma = anon_vma; 105 vma->anon_vma = anon_vma;
106 list_add(&vma->anon_vma_node, &anon_vma->head); 106 list_add_tail(&vma->anon_vma_node, &anon_vma->head);
107 allocated = NULL; 107 allocated = NULL;
108 } 108 }
109 spin_unlock(&mm->page_table_lock); 109 spin_unlock(&mm->page_table_lock);
@@ -127,7 +127,7 @@ void __anon_vma_link(struct vm_area_struct *vma)
127 struct anon_vma *anon_vma = vma->anon_vma; 127 struct anon_vma *anon_vma = vma->anon_vma;
128 128
129 if (anon_vma) { 129 if (anon_vma) {
130 list_add(&vma->anon_vma_node, &anon_vma->head); 130 list_add_tail(&vma->anon_vma_node, &anon_vma->head);
131 validate_anon_vma(vma); 131 validate_anon_vma(vma);
132 } 132 }
133} 133}
@@ -138,7 +138,7 @@ void anon_vma_link(struct vm_area_struct *vma)
138 138
139 if (anon_vma) { 139 if (anon_vma) {
140 spin_lock(&anon_vma->lock); 140 spin_lock(&anon_vma->lock);
141 list_add(&vma->anon_vma_node, &anon_vma->head); 141 list_add_tail(&vma->anon_vma_node, &anon_vma->head);
142 validate_anon_vma(vma); 142 validate_anon_vma(vma);
143 spin_unlock(&anon_vma->lock); 143 spin_unlock(&anon_vma->lock);
144 } 144 }
@@ -205,44 +205,6 @@ out:
205 return anon_vma; 205 return anon_vma;
206} 206}
207 207
208#ifdef CONFIG_MIGRATION
209/*
210 * Remove an anonymous page from swap replacing the swap pte's
211 * through real pte's pointing to valid pages and then releasing
212 * the page from the swap cache.
213 *
214 * Must hold page lock on page and mmap_sem of one vma that contains
215 * the page.
216 */
217void remove_from_swap(struct page *page)
218{
219 struct anon_vma *anon_vma;
220 struct vm_area_struct *vma;
221 unsigned long mapping;
222
223 if (!PageSwapCache(page))
224 return;
225
226 mapping = (unsigned long)page->mapping;
227
228 if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
229 return;
230
231 /*
232 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
233 */
234 anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
235 spin_lock(&anon_vma->lock);
236
237 list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
238 remove_vma_swap(vma, page);
239
240 spin_unlock(&anon_vma->lock);
241 delete_from_swap_cache(page);
242}
243EXPORT_SYMBOL(remove_from_swap);
244#endif
245
246/* 208/*
247 * At what user virtual address is page expected in vma? 209 * At what user virtual address is page expected in vma?
248 */ 210 */
@@ -578,7 +540,7 @@ void page_remove_rmap(struct page *page)
578 * repeatedly from either try_to_unmap_anon or try_to_unmap_file. 540 * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
579 */ 541 */
580static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, 542static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
581 int ignore_refs) 543 int migration)
582{ 544{
583 struct mm_struct *mm = vma->vm_mm; 545 struct mm_struct *mm = vma->vm_mm;
584 unsigned long address; 546 unsigned long address;
@@ -602,7 +564,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
602 */ 564 */
603 if ((vma->vm_flags & VM_LOCKED) || 565 if ((vma->vm_flags & VM_LOCKED) ||
604 (ptep_clear_flush_young(vma, address, pte) 566 (ptep_clear_flush_young(vma, address, pte)
605 && !ignore_refs)) { 567 && !migration)) {
606 ret = SWAP_FAIL; 568 ret = SWAP_FAIL;
607 goto out_unmap; 569 goto out_unmap;
608 } 570 }
@@ -620,24 +582,45 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
620 582
621 if (PageAnon(page)) { 583 if (PageAnon(page)) {
622 swp_entry_t entry = { .val = page_private(page) }; 584 swp_entry_t entry = { .val = page_private(page) };
623 /* 585
624 * Store the swap location in the pte. 586 if (PageSwapCache(page)) {
625 * See handle_pte_fault() ... 587 /*
626 */ 588 * Store the swap location in the pte.
627 BUG_ON(!PageSwapCache(page)); 589 * See handle_pte_fault() ...
628 swap_duplicate(entry); 590 */
629 if (list_empty(&mm->mmlist)) { 591 swap_duplicate(entry);
630 spin_lock(&mmlist_lock); 592 if (list_empty(&mm->mmlist)) {
631 if (list_empty(&mm->mmlist)) 593 spin_lock(&mmlist_lock);
632 list_add(&mm->mmlist, &init_mm.mmlist); 594 if (list_empty(&mm->mmlist))
633 spin_unlock(&mmlist_lock); 595 list_add(&mm->mmlist, &init_mm.mmlist);
596 spin_unlock(&mmlist_lock);
597 }
598 dec_mm_counter(mm, anon_rss);
599#ifdef CONFIG_MIGRATION
600 } else {
601 /*
602 * Store the pfn of the page in a special migration
603 * pte. do_swap_page() will wait until the migration
604 * pte is removed and then restart fault handling.
605 */
606 BUG_ON(!migration);
607 entry = make_migration_entry(page, pte_write(pteval));
608#endif
634 } 609 }
635 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 610 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
636 BUG_ON(pte_file(*pte)); 611 BUG_ON(pte_file(*pte));
637 dec_mm_counter(mm, anon_rss);
638 } else 612 } else
613#ifdef CONFIG_MIGRATION
614 if (migration) {
615 /* Establish migration entry for a file page */
616 swp_entry_t entry;
617 entry = make_migration_entry(page, pte_write(pteval));
618 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
619 } else
620#endif
639 dec_mm_counter(mm, file_rss); 621 dec_mm_counter(mm, file_rss);
640 622
623
641 page_remove_rmap(page); 624 page_remove_rmap(page);
642 page_cache_release(page); 625 page_cache_release(page);
643 626
@@ -736,7 +719,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
736 pte_unmap_unlock(pte - 1, ptl); 719 pte_unmap_unlock(pte - 1, ptl);
737} 720}
738 721
739static int try_to_unmap_anon(struct page *page, int ignore_refs) 722static int try_to_unmap_anon(struct page *page, int migration)
740{ 723{
741 struct anon_vma *anon_vma; 724 struct anon_vma *anon_vma;
742 struct vm_area_struct *vma; 725 struct vm_area_struct *vma;
@@ -747,7 +730,7 @@ static int try_to_unmap_anon(struct page *page, int ignore_refs)
747 return ret; 730 return ret;
748 731
749 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 732 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
750 ret = try_to_unmap_one(page, vma, ignore_refs); 733 ret = try_to_unmap_one(page, vma, migration);
751 if (ret == SWAP_FAIL || !page_mapped(page)) 734 if (ret == SWAP_FAIL || !page_mapped(page))
752 break; 735 break;
753 } 736 }
@@ -764,7 +747,7 @@ static int try_to_unmap_anon(struct page *page, int ignore_refs)
764 * 747 *
765 * This function is only called from try_to_unmap for object-based pages. 748 * This function is only called from try_to_unmap for object-based pages.
766 */ 749 */
767static int try_to_unmap_file(struct page *page, int ignore_refs) 750static int try_to_unmap_file(struct page *page, int migration)
768{ 751{
769 struct address_space *mapping = page->mapping; 752 struct address_space *mapping = page->mapping;
770 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 753 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -778,7 +761,7 @@ static int try_to_unmap_file(struct page *page, int ignore_refs)
778 761
779 spin_lock(&mapping->i_mmap_lock); 762 spin_lock(&mapping->i_mmap_lock);
780 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 763 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
781 ret = try_to_unmap_one(page, vma, ignore_refs); 764 ret = try_to_unmap_one(page, vma, migration);
782 if (ret == SWAP_FAIL || !page_mapped(page)) 765 if (ret == SWAP_FAIL || !page_mapped(page))
783 goto out; 766 goto out;
784 } 767 }
@@ -863,16 +846,16 @@ out:
863 * SWAP_AGAIN - we missed a mapping, try again later 846 * SWAP_AGAIN - we missed a mapping, try again later
864 * SWAP_FAIL - the page is unswappable 847 * SWAP_FAIL - the page is unswappable
865 */ 848 */
866int try_to_unmap(struct page *page, int ignore_refs) 849int try_to_unmap(struct page *page, int migration)
867{ 850{
868 int ret; 851 int ret;
869 852
870 BUG_ON(!PageLocked(page)); 853 BUG_ON(!PageLocked(page));
871 854
872 if (PageAnon(page)) 855 if (PageAnon(page))
873 ret = try_to_unmap_anon(page, ignore_refs); 856 ret = try_to_unmap_anon(page, migration);
874 else 857 else
875 ret = try_to_unmap_file(page, ignore_refs); 858 ret = try_to_unmap_file(page, migration);
876 859
877 if (!page_mapped(page)) 860 if (!page_mapped(page))
878 ret = SWAP_SUCCESS; 861 ret = SWAP_SUCCESS;
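
Taken together, the rmap.c hunks rename try_to_unmap()'s flag from 'ignore_refs' to 'migration' and, when it is set, replace the cleared pte with a special migration entry (built by make_migration_entry()) for both anonymous and file-backed pages; a thread faulting on such a pte waits in do_swap_page() until the entry is removed. A minimal sketch of the calling convention from a migration-style caller -- unmap_for_migration() is a hypothetical helper, the real users live in mm/migrate.c:

#include <linux/mm.h>
#include <linux/rmap.h>

static int unmap_for_migration(struct page *page)
{
	int ret;

	BUG_ON(!PageLocked(page));	/* try_to_unmap() requires the page lock */

	/* second argument == 1: migration mode, leave migration ptes behind */
	ret = try_to_unmap(page, 1);
	if (ret != SWAP_SUCCESS)
		return -EAGAIN;		/* some mapping could not be torn down */

	/* the caller would now copy the page and remove the migration ptes */
	return 0;
}
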
diff --git a/mm/shmem.c b/mm/shmem.c
index 797eef3805ce..38bc3334f263 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1081,14 +1081,6 @@ repeat:
1081 page_cache_release(swappage); 1081 page_cache_release(swappage);
1082 goto repeat; 1082 goto repeat;
1083 } 1083 }
1084 if (!PageSwapCache(swappage)) {
1085 /* Page migration has occured */
1086 shmem_swp_unmap(entry);
1087 spin_unlock(&info->lock);
1088 unlock_page(swappage);
1089 page_cache_release(swappage);
1090 goto repeat;
1091 }
1092 if (PageWriteback(swappage)) { 1084 if (PageWriteback(swappage)) {
1093 shmem_swp_unmap(entry); 1085 shmem_swp_unmap(entry);
1094 spin_unlock(&info->lock); 1086 spin_unlock(&info->lock);
@@ -1654,9 +1646,9 @@ static ssize_t shmem_file_sendfile(struct file *in_file, loff_t *ppos,
1654 return desc.error; 1646 return desc.error;
1655} 1647}
1656 1648
1657static int shmem_statfs(struct super_block *sb, struct kstatfs *buf) 1649static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1658{ 1650{
1659 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1651 struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
1660 1652
1661 buf->f_type = TMPFS_MAGIC; 1653 buf->f_type = TMPFS_MAGIC;
1662 buf->f_bsize = PAGE_CACHE_SIZE; 1654 buf->f_bsize = PAGE_CACHE_SIZE;
@@ -2233,10 +2225,10 @@ static struct vm_operations_struct shmem_vm_ops = {
2233}; 2225};
2234 2226
2235 2227
2236static struct super_block *shmem_get_sb(struct file_system_type *fs_type, 2228static int shmem_get_sb(struct file_system_type *fs_type,
2237 int flags, const char *dev_name, void *data) 2229 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
2238{ 2230{
2239 return get_sb_nodev(fs_type, flags, data, shmem_fill_super); 2231 return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt);
2240} 2232}
2241 2233
2242static struct file_system_type tmpfs_fs_type = { 2234static struct file_system_type tmpfs_fs_type = {
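
The shmem.c changes track two VFS interface updates merged alongside this: ->statfs() now takes a struct dentry rather than a super_block, and ->get_sb() returns an int and hands the result back through a struct vfsmount supplied by the caller. Any simple in-memory filesystem follows the same shape; a sketch for a hypothetical "foofs" (all names, the magic number, and foofs_fill_super() are placeholders, not real kernel symbols):

static int foofs_fill_super(struct super_block *sb, void *data, int silent);

static int foofs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct super_block *sb = dentry->d_sb;	/* sb now reached via the dentry */

	buf->f_type = 0x464f4f;			/* made-up magic for the example */
	buf->f_bsize = sb->s_blocksize;
	return 0;
}

static int foofs_get_sb(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
	/* returns 0 or -errno; the root dentry is attached to @mnt */
	return get_sb_nodev(fs_type, flags, data, foofs_fill_super, mnt);
}
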
diff --git a/mm/slab.c b/mm/slab.c
index f1b644eb39d8..98ac20bc0de9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -331,6 +331,8 @@ static __always_inline int index_of(const size_t size)
331 return 0; 331 return 0;
332} 332}
333 333
334static int slab_early_init = 1;
335
334#define INDEX_AC index_of(sizeof(struct arraycache_init)) 336#define INDEX_AC index_of(sizeof(struct arraycache_init))
335#define INDEX_L3 index_of(sizeof(struct kmem_list3)) 337#define INDEX_L3 index_of(sizeof(struct kmem_list3))
336 338
@@ -592,6 +594,7 @@ static inline struct kmem_cache *page_get_cache(struct page *page)
592{ 594{
593 if (unlikely(PageCompound(page))) 595 if (unlikely(PageCompound(page)))
594 page = (struct page *)page_private(page); 596 page = (struct page *)page_private(page);
597 BUG_ON(!PageSlab(page));
595 return (struct kmem_cache *)page->lru.next; 598 return (struct kmem_cache *)page->lru.next;
596} 599}
597 600
@@ -604,6 +607,7 @@ static inline struct slab *page_get_slab(struct page *page)
604{ 607{
605 if (unlikely(PageCompound(page))) 608 if (unlikely(PageCompound(page)))
606 page = (struct page *)page_private(page); 609 page = (struct page *)page_private(page);
610 BUG_ON(!PageSlab(page));
607 return (struct slab *)page->lru.prev; 611 return (struct slab *)page->lru.prev;
608} 612}
609 613
@@ -1024,6 +1028,40 @@ static void drain_alien_cache(struct kmem_cache *cachep,
1024 } 1028 }
1025 } 1029 }
1026} 1030}
1031
1032static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1033{
1034 struct slab *slabp = virt_to_slab(objp);
1035 int nodeid = slabp->nodeid;
1036 struct kmem_list3 *l3;
1037 struct array_cache *alien = NULL;
1038
1039 /*
1040 * Make sure we are not freeing an object from another node to the array
1041 * cache on this cpu.
1042 */
1043 if (likely(slabp->nodeid == numa_node_id()))
1044 return 0;
1045
1046 l3 = cachep->nodelists[numa_node_id()];
1047 STATS_INC_NODEFREES(cachep);
1048 if (l3->alien && l3->alien[nodeid]) {
1049 alien = l3->alien[nodeid];
1050 spin_lock(&alien->lock);
1051 if (unlikely(alien->avail == alien->limit)) {
1052 STATS_INC_ACOVERFLOW(cachep);
1053 __drain_alien_cache(cachep, alien, nodeid);
1054 }
1055 alien->entry[alien->avail++] = objp;
1056 spin_unlock(&alien->lock);
1057 } else {
1058 spin_lock(&(cachep->nodelists[nodeid])->list_lock);
1059 free_block(cachep, &objp, 1, nodeid);
1060 spin_unlock(&(cachep->nodelists[nodeid])->list_lock);
1061 }
1062 return 1;
1063}
1064
1027#else 1065#else
1028 1066
1029#define drain_alien_cache(cachep, alien) do { } while (0) 1067#define drain_alien_cache(cachep, alien) do { } while (0)
@@ -1038,6 +1076,11 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
1038{ 1076{
1039} 1077}
1040 1078
1079static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1080{
1081 return 0;
1082}
1083
1041#endif 1084#endif
1042 1085
1043static int cpuup_callback(struct notifier_block *nfb, 1086static int cpuup_callback(struct notifier_block *nfb,
@@ -1335,6 +1378,8 @@ void __init kmem_cache_init(void)
1335 NULL, NULL); 1378 NULL, NULL);
1336 } 1379 }
1337 1380
1381 slab_early_init = 0;
1382
1338 while (sizes->cs_size != ULONG_MAX) { 1383 while (sizes->cs_size != ULONG_MAX) {
1339 /* 1384 /*
1340 * For performance, all the general caches are L1 aligned. 1385 * For performance, all the general caches are L1 aligned.
@@ -1450,31 +1495,29 @@ __initcall(cpucache_init);
1450static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) 1495static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1451{ 1496{
1452 struct page *page; 1497 struct page *page;
1453 void *addr; 1498 int nr_pages;
1454 int i; 1499 int i;
1455 1500
1456 flags |= cachep->gfpflags;
1457#ifndef CONFIG_MMU 1501#ifndef CONFIG_MMU
1458 /* nommu uses slab's for process anonymous memory allocations, so 1502 /*
1459 * requires __GFP_COMP to properly refcount higher order allocations"	1503 * Nommu uses slabs for process anonymous memory allocations, and thus
1504 * requires __GFP_COMP to properly refcount higher order allocations
1460 */ 1505 */
1461 page = alloc_pages_node(nodeid, (flags | __GFP_COMP), cachep->gfporder); 1506 flags |= __GFP_COMP;
1462#else
1463 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1464#endif 1507#endif
1508 flags |= cachep->gfpflags;
1509
1510 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1465 if (!page) 1511 if (!page)
1466 return NULL; 1512 return NULL;
1467 addr = page_address(page);
1468 1513
1469 i = (1 << cachep->gfporder); 1514 nr_pages = (1 << cachep->gfporder);
1470 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1515 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1471 atomic_add(i, &slab_reclaim_pages); 1516 atomic_add(nr_pages, &slab_reclaim_pages);
1472 add_page_state(nr_slab, i); 1517 add_page_state(nr_slab, nr_pages);
1473 while (i--) { 1518 for (i = 0; i < nr_pages; i++)
1474 __SetPageSlab(page); 1519 __SetPageSlab(page + i);
1475 page++; 1520 return page_address(page);
1476 }
1477 return addr;
1478} 1521}
1479 1522
1480/* 1523/*
@@ -1913,8 +1956,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1913 void (*dtor)(void*, struct kmem_cache *, unsigned long)) 1956 void (*dtor)(void*, struct kmem_cache *, unsigned long))
1914{ 1957{
1915 size_t left_over, slab_size, ralign; 1958 size_t left_over, slab_size, ralign;
1916 struct kmem_cache *cachep = NULL; 1959 struct kmem_cache *cachep = NULL, *pc;
1917 struct list_head *p;
1918 1960
1919 /* 1961 /*
1920 * Sanity checks... these are all serious usage bugs. 1962 * Sanity checks... these are all serious usage bugs.
@@ -1934,8 +1976,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1934 1976
1935 mutex_lock(&cache_chain_mutex); 1977 mutex_lock(&cache_chain_mutex);
1936 1978
1937 list_for_each(p, &cache_chain) { 1979 list_for_each_entry(pc, &cache_chain, next) {
1938 struct kmem_cache *pc = list_entry(p, struct kmem_cache, next);
1939 mm_segment_t old_fs = get_fs(); 1980 mm_segment_t old_fs = get_fs();
1940 char tmp; 1981 char tmp;
1941 int res; 1982 int res;
@@ -2069,8 +2110,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2069#endif 2110#endif
2070#endif 2111#endif
2071 2112
2072 /* Determine if the slab management is 'on' or 'off' slab. */ 2113 /*
2073 if (size >= (PAGE_SIZE >> 3)) 2114 * Determine if the slab management is 'on' or 'off' slab.
2115 * (bootstrapping cannot cope with offslab caches so don't do
2116 * it too early on.)
2117 */
2118 if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init)
2074 /* 2119 /*
2075 * Size is large, assume best to place the slab management obj 2120 * Size is large, assume best to place the slab management obj
2076 * off-slab (should allow better packing of objs). 2121 * off-slab (should allow better packing of objs).
@@ -2460,23 +2505,28 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2460 slabp->inuse--; 2505 slabp->inuse--;
2461} 2506}
2462 2507
2463static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, 2508/*
2464 void *objp) 2509 * Map pages beginning at addr to the given cache and slab. This is required
2510 * for the slab allocator to be able to lookup the cache and slab of a
2511 * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging.
2512 */
2513static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
2514 void *addr)
2465{ 2515{
2466 int i; 2516 int nr_pages;
2467 struct page *page; 2517 struct page *page;
2468 2518
2469 /* Nasty!!!!!! I hope this is OK. */ 2519 page = virt_to_page(addr);
2470 page = virt_to_page(objp);
2471 2520
2472 i = 1; 2521 nr_pages = 1;
2473 if (likely(!PageCompound(page))) 2522 if (likely(!PageCompound(page)))
2474 i <<= cachep->gfporder; 2523 nr_pages <<= cache->gfporder;
2524
2475 do { 2525 do {
2476 page_set_cache(page, cachep); 2526 page_set_cache(page, cache);
2477 page_set_slab(page, slabp); 2527 page_set_slab(page, slab);
2478 page++; 2528 page++;
2479 } while (--i); 2529 } while (--nr_pages);
2480} 2530}
2481 2531
2482/* 2532/*
@@ -2548,7 +2598,7 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2548 goto opps1; 2598 goto opps1;
2549 2599
2550 slabp->nodeid = nodeid; 2600 slabp->nodeid = nodeid;
2551 set_slab_attr(cachep, slabp, objp); 2601 slab_map_pages(cachep, slabp, objp);
2552 2602
2553 cache_init_objs(cachep, slabp, ctor_flags); 2603 cache_init_objs(cachep, slabp, ctor_flags);
2554 2604
@@ -2596,6 +2646,28 @@ static void kfree_debugcheck(const void *objp)
2596 } 2646 }
2597} 2647}
2598 2648
2649static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
2650{
2651 unsigned long redzone1, redzone2;
2652
2653 redzone1 = *dbg_redzone1(cache, obj);
2654 redzone2 = *dbg_redzone2(cache, obj);
2655
2656 /*
2657 * Redzone is ok.
2658 */
2659 if (redzone1 == RED_ACTIVE && redzone2 == RED_ACTIVE)
2660 return;
2661
2662 if (redzone1 == RED_INACTIVE && redzone2 == RED_INACTIVE)
2663 slab_error(cache, "double free detected");
2664 else
2665 slab_error(cache, "memory outside object was overwritten");
2666
2667 printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n",
2668 obj, redzone1, redzone2);
2669}
2670
2599static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, 2671static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2600 void *caller) 2672 void *caller)
2601{ 2673{
@@ -2607,27 +2679,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2607 kfree_debugcheck(objp); 2679 kfree_debugcheck(objp);
2608 page = virt_to_page(objp); 2680 page = virt_to_page(objp);
2609 2681
2610 if (page_get_cache(page) != cachep) {
2611 printk(KERN_ERR "mismatch in kmem_cache_free: expected "
2612 "cache %p, got %p\n",
2613 page_get_cache(page), cachep);
2614 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
2615 printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
2616 page_get_cache(page)->name);
2617 WARN_ON(1);
2618 }
2619 slabp = page_get_slab(page); 2682 slabp = page_get_slab(page);
2620 2683
2621 if (cachep->flags & SLAB_RED_ZONE) { 2684 if (cachep->flags & SLAB_RED_ZONE) {
2622 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || 2685 verify_redzone_free(cachep, objp);
2623 *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
2624 slab_error(cachep, "double free, or memory outside"
2625 " object was overwritten");
2626 printk(KERN_ERR "%p: redzone 1:0x%lx, "
2627 "redzone 2:0x%lx.\n",
2628 objp, *dbg_redzone1(cachep, objp),
2629 *dbg_redzone2(cachep, objp));
2630 }
2631 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2686 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2632 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2687 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2633 } 2688 }
@@ -3087,41 +3142,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3087 check_irq_off(); 3142 check_irq_off();
3088 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 3143 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
3089 3144
3090 /* Make sure we are not freeing a object from another 3145 if (cache_free_alien(cachep, objp))
3091 * node to the array cache on this cpu. 3146 return;
3092 */ 3147
3093#ifdef CONFIG_NUMA
3094 {
3095 struct slab *slabp;
3096 slabp = virt_to_slab(objp);
3097 if (unlikely(slabp->nodeid != numa_node_id())) {
3098 struct array_cache *alien = NULL;
3099 int nodeid = slabp->nodeid;
3100 struct kmem_list3 *l3;
3101
3102 l3 = cachep->nodelists[numa_node_id()];
3103 STATS_INC_NODEFREES(cachep);
3104 if (l3->alien && l3->alien[nodeid]) {
3105 alien = l3->alien[nodeid];
3106 spin_lock(&alien->lock);
3107 if (unlikely(alien->avail == alien->limit)) {
3108 STATS_INC_ACOVERFLOW(cachep);
3109 __drain_alien_cache(cachep,
3110 alien, nodeid);
3111 }
3112 alien->entry[alien->avail++] = objp;
3113 spin_unlock(&alien->lock);
3114 } else {
3115 spin_lock(&(cachep->nodelists[nodeid])->
3116 list_lock);
3117 free_block(cachep, &objp, 1, nodeid);
3118 spin_unlock(&(cachep->nodelists[nodeid])->
3119 list_lock);
3120 }
3121 return;
3122 }
3123 }
3124#endif
3125 if (likely(ac->avail < ac->limit)) { 3148 if (likely(ac->avail < ac->limit)) {
3126 STATS_INC_FREEHIT(cachep); 3149 STATS_INC_FREEHIT(cachep);
3127 ac->entry[ac->avail++] = objp; 3150 ac->entry[ac->avail++] = objp;
@@ -3254,26 +3277,10 @@ EXPORT_SYMBOL(kmalloc_node);
3254#endif 3277#endif
3255 3278
3256/** 3279/**
3257 * kmalloc - allocate memory 3280 * __do_kmalloc - allocate memory
3258 * @size: how many bytes of memory are required. 3281 * @size: how many bytes of memory are required.
3259 * @flags: the type of memory to allocate. 3282 * @flags: the type of memory to allocate (see kmalloc).
3260 * @caller: function caller for debug tracking of the caller 3283 * @caller: function caller for debug tracking of the caller
3261 *
3262 * kmalloc is the normal method of allocating memory
3263 * in the kernel.
3264 *
3265 * The @flags argument may be one of:
3266 *
3267 * %GFP_USER - Allocate memory on behalf of user. May sleep.
3268 *
3269 * %GFP_KERNEL - Allocate normal kernel ram. May sleep.
3270 *
3271 * %GFP_ATOMIC - Allocation will not sleep. Use inside interrupt handlers.
3272 *
3273 * Additionally, the %GFP_DMA flag may be set to indicate the memory
3274 * must be suitable for DMA. This can mean different things on different
3275 * platforms. For example, on i386, it means that the memory must come
3276 * from the first 16MB.
3277 */ 3284 */
3278static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, 3285static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3279 void *caller) 3286 void *caller)
@@ -3371,6 +3378,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3371{ 3378{
3372 unsigned long flags; 3379 unsigned long flags;
3373 3380
3381 BUG_ON(virt_to_cache(objp) != cachep);
3382
3374 local_irq_save(flags); 3383 local_irq_save(flags);
3375 __cache_free(cachep, objp); 3384 __cache_free(cachep, objp);
3376 local_irq_restore(flags); 3385 local_irq_restore(flags);
@@ -3680,7 +3689,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
3680 */ 3689 */
3681static void cache_reap(void *unused) 3690static void cache_reap(void *unused)
3682{ 3691{
3683 struct list_head *walk; 3692 struct kmem_cache *searchp;
3684 struct kmem_list3 *l3; 3693 struct kmem_list3 *l3;
3685 int node = numa_node_id(); 3694 int node = numa_node_id();
3686 3695
@@ -3691,13 +3700,11 @@ static void cache_reap(void *unused)
3691 return; 3700 return;
3692 } 3701 }
3693 3702
3694 list_for_each(walk, &cache_chain) { 3703 list_for_each_entry(searchp, &cache_chain, next) {
3695 struct kmem_cache *searchp;
3696 struct list_head *p; 3704 struct list_head *p;
3697 int tofree; 3705 int tofree;
3698 struct slab *slabp; 3706 struct slab *slabp;
3699 3707
3700 searchp = list_entry(walk, struct kmem_cache, next);
3701 check_irq_on(); 3708 check_irq_on();
3702 3709
3703 /* 3710 /*
@@ -3825,7 +3832,6 @@ static void s_stop(struct seq_file *m, void *p)
3825static int s_show(struct seq_file *m, void *p) 3832static int s_show(struct seq_file *m, void *p)
3826{ 3833{
3827 struct kmem_cache *cachep = p; 3834 struct kmem_cache *cachep = p;
3828 struct list_head *q;
3829 struct slab *slabp; 3835 struct slab *slabp;
3830 unsigned long active_objs; 3836 unsigned long active_objs;
3831 unsigned long num_objs; 3837 unsigned long num_objs;
@@ -3846,15 +3852,13 @@ static int s_show(struct seq_file *m, void *p)
3846 check_irq_on(); 3852 check_irq_on();
3847 spin_lock_irq(&l3->list_lock); 3853 spin_lock_irq(&l3->list_lock);
3848 3854
3849 list_for_each(q, &l3->slabs_full) { 3855 list_for_each_entry(slabp, &l3->slabs_full, list) {
3850 slabp = list_entry(q, struct slab, list);
3851 if (slabp->inuse != cachep->num && !error) 3856 if (slabp->inuse != cachep->num && !error)
3852 error = "slabs_full accounting error"; 3857 error = "slabs_full accounting error";
3853 active_objs += cachep->num; 3858 active_objs += cachep->num;
3854 active_slabs++; 3859 active_slabs++;
3855 } 3860 }
3856 list_for_each(q, &l3->slabs_partial) { 3861 list_for_each_entry(slabp, &l3->slabs_partial, list) {
3857 slabp = list_entry(q, struct slab, list);
3858 if (slabp->inuse == cachep->num && !error) 3862 if (slabp->inuse == cachep->num && !error)
3859 error = "slabs_partial inuse accounting error"; 3863 error = "slabs_partial inuse accounting error";
3860 if (!slabp->inuse && !error) 3864 if (!slabp->inuse && !error)
@@ -3862,8 +3866,7 @@ static int s_show(struct seq_file *m, void *p)
3862 active_objs += slabp->inuse; 3866 active_objs += slabp->inuse;
3863 active_slabs++; 3867 active_slabs++;
3864 } 3868 }
3865 list_for_each(q, &l3->slabs_free) { 3869 list_for_each_entry(slabp, &l3->slabs_free, list) {
3866 slabp = list_entry(q, struct slab, list);
3867 if (slabp->inuse && !error) 3870 if (slabp->inuse && !error)
3868 error = "slabs_free/inuse accounting error"; 3871 error = "slabs_free/inuse accounting error";
3869 num_slabs++; 3872 num_slabs++;
@@ -3956,7 +3959,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3956{ 3959{
3957 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; 3960 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
3958 int limit, batchcount, shared, res; 3961 int limit, batchcount, shared, res;
3959 struct list_head *p; 3962 struct kmem_cache *cachep;
3960 3963
3961 if (count > MAX_SLABINFO_WRITE) 3964 if (count > MAX_SLABINFO_WRITE)
3962 return -EINVAL; 3965 return -EINVAL;
@@ -3975,10 +3978,7 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3975 /* Find the cache in the chain of caches. */ 3978 /* Find the cache in the chain of caches. */
3976 mutex_lock(&cache_chain_mutex); 3979 mutex_lock(&cache_chain_mutex);
3977 res = -EINVAL; 3980 res = -EINVAL;
3978 list_for_each(p, &cache_chain) { 3981 list_for_each_entry(cachep, &cache_chain, next) {
3979 struct kmem_cache *cachep;
3980
3981 cachep = list_entry(p, struct kmem_cache, next);
3982 if (!strcmp(cachep->name, kbuf)) { 3982 if (!strcmp(cachep->name, kbuf)) {
3983 if (limit < 1 || batchcount < 1 || 3983 if (limit < 1 || batchcount < 1 ||
3984 batchcount > limit || shared < 0) { 3984 batchcount > limit || shared < 0) {
@@ -4080,7 +4080,6 @@ static void show_symbol(struct seq_file *m, unsigned long address)
4080static int leaks_show(struct seq_file *m, void *p) 4080static int leaks_show(struct seq_file *m, void *p)
4081{ 4081{
4082 struct kmem_cache *cachep = p; 4082 struct kmem_cache *cachep = p;
4083 struct list_head *q;
4084 struct slab *slabp; 4083 struct slab *slabp;
4085 struct kmem_list3 *l3; 4084 struct kmem_list3 *l3;
4086 const char *name; 4085 const char *name;
@@ -4105,14 +4104,10 @@ static int leaks_show(struct seq_file *m, void *p)
4105 check_irq_on(); 4104 check_irq_on();
4106 spin_lock_irq(&l3->list_lock); 4105 spin_lock_irq(&l3->list_lock);
4107 4106
4108 list_for_each(q, &l3->slabs_full) { 4107 list_for_each_entry(slabp, &l3->slabs_full, list)
4109 slabp = list_entry(q, struct slab, list);
4110 handle_slab(n, cachep, slabp); 4108 handle_slab(n, cachep, slabp);
4111 } 4109 list_for_each_entry(slabp, &l3->slabs_partial, list)
4112 list_for_each(q, &l3->slabs_partial) {
4113 slabp = list_entry(q, struct slab, list);
4114 handle_slab(n, cachep, slabp); 4110 handle_slab(n, cachep, slabp);
4115 }
4116 spin_unlock_irq(&l3->list_lock); 4111 spin_unlock_irq(&l3->list_lock);
4117 } 4112 }
4118 name = cachep->name; 4113 name = cachep->name;
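
Much of the slab.c churn in the later hunks is a mechanical conversion from list_for_each() plus list_entry() to list_for_each_entry(), dropping the temporary struct list_head pointer and the explicit container lookup. The idiom in isolation, using a made-up item type rather than struct kmem_cache or struct slab:

#include <linux/list.h>

struct item {
	int value;
	struct list_head list;		/* linkage within some list */
};

static int sum_items(struct list_head *head)
{
	struct item *it;
	int sum = 0;

	/* iterate over the containing structures directly */
	list_for_each_entry(it, head, list)
		sum += it->value;

	return sum;
}
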
diff --git a/mm/sparse.c b/mm/sparse.c
index 100040c0dfb6..e0a3fe48aa37 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -99,6 +99,22 @@ int __section_nr(struct mem_section* ms)
99 return (root_nr * SECTIONS_PER_ROOT) + (ms - root); 99 return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
100} 100}
101 101
102/*
103 * During early boot, before section_mem_map is used for an actual
104 * mem_map, we use section_mem_map to store the section's NUMA
105 * node. This keeps us from having to use another data structure. The
106 * node information is cleared just before we store the real mem_map.
107 */
108static inline unsigned long sparse_encode_early_nid(int nid)
109{
110 return (nid << SECTION_NID_SHIFT);
111}
112
113static inline int sparse_early_nid(struct mem_section *section)
114{
115 return (section->section_mem_map >> SECTION_NID_SHIFT);
116}
117
102/* Record a memory area against a node. */ 118/* Record a memory area against a node. */
103void memory_present(int nid, unsigned long start, unsigned long end) 119void memory_present(int nid, unsigned long start, unsigned long end)
104{ 120{
@@ -113,7 +129,8 @@ void memory_present(int nid, unsigned long start, unsigned long end)
113 129
114 ms = __nr_to_section(section); 130 ms = __nr_to_section(section);
115 if (!ms->section_mem_map) 131 if (!ms->section_mem_map)
116 ms->section_mem_map = SECTION_MARKED_PRESENT; 132 ms->section_mem_map = sparse_encode_early_nid(nid) |
133 SECTION_MARKED_PRESENT;
117 } 134 }
118} 135}
119 136
@@ -164,6 +181,7 @@ static int sparse_init_one_section(struct mem_section *ms,
164 if (!valid_section(ms)) 181 if (!valid_section(ms))
165 return -EINVAL; 182 return -EINVAL;
166 183
184 ms->section_mem_map &= ~SECTION_MAP_MASK;
167 ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum); 185 ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum);
168 186
169 return 1; 187 return 1;
@@ -172,8 +190,8 @@ static int sparse_init_one_section(struct mem_section *ms,
172static struct page *sparse_early_mem_map_alloc(unsigned long pnum) 190static struct page *sparse_early_mem_map_alloc(unsigned long pnum)
173{ 191{
174 struct page *map; 192 struct page *map;
175 int nid = early_pfn_to_nid(section_nr_to_pfn(pnum));
176 struct mem_section *ms = __nr_to_section(pnum); 193 struct mem_section *ms = __nr_to_section(pnum);
194 int nid = sparse_early_nid(ms);
177 195
178 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); 196 map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION);
179 if (map) 197 if (map)
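
The sparse.c change parks the section's NUMA node in the otherwise-unused section_mem_map word during early boot, so sparse_early_mem_map_alloc() no longer needs early_pfn_to_nid(). A compact round-trip of the encoding, written out as a sketch (the values of SECTION_NID_SHIFT and SECTION_MAP_MASK come from mmzone.h and are not shown here):

/*
 * memory_present():           section_mem_map  = (nid << SECTION_NID_SHIFT)
 *                                                | SECTION_MARKED_PRESENT;
 * sparse_early_nid():         nid = section_mem_map >> SECTION_NID_SHIFT;
 * sparse_init_one_section():  section_mem_map &= ~SECTION_MAP_MASK;
 *                             (discards the stored nid before the encoded
 *                              mem_map pointer is or'ed in)
 */
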
diff --git a/mm/swap.c b/mm/swap.c
index 88895c249bc9..03ae2076f92f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -480,48 +480,6 @@ static int cpu_swap_callback(struct notifier_block *nfb,
480#endif /* CONFIG_HOTPLUG_CPU */ 480#endif /* CONFIG_HOTPLUG_CPU */
481#endif /* CONFIG_SMP */ 481#endif /* CONFIG_SMP */
482 482
483#ifdef CONFIG_SMP
484void percpu_counter_mod(struct percpu_counter *fbc, long amount)
485{
486 long count;
487 long *pcount;
488 int cpu = get_cpu();
489
490 pcount = per_cpu_ptr(fbc->counters, cpu);
491 count = *pcount + amount;
492 if (count >= FBC_BATCH || count <= -FBC_BATCH) {
493 spin_lock(&fbc->lock);
494 fbc->count += count;
495 *pcount = 0;
496 spin_unlock(&fbc->lock);
497 } else {
498 *pcount = count;
499 }
500 put_cpu();
501}
502EXPORT_SYMBOL(percpu_counter_mod);
503
504/*
505 * Add up all the per-cpu counts, return the result. This is a more accurate
506 * but much slower version of percpu_counter_read_positive()
507 */
508long percpu_counter_sum(struct percpu_counter *fbc)
509{
510 long ret;
511 int cpu;
512
513 spin_lock(&fbc->lock);
514 ret = fbc->count;
515 for_each_possible_cpu(cpu) {
516 long *pcount = per_cpu_ptr(fbc->counters, cpu);
517 ret += *pcount;
518 }
519 spin_unlock(&fbc->lock);
520 return ret < 0 ? 0 : ret;
521}
522EXPORT_SYMBOL(percpu_counter_sum);
523#endif
524
525/* 483/*
526 * Perform any setup for the swap system 484 * Perform any setup for the swap system
527 */ 485 */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e5fd5385f0cc..cc367f7e75d8 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -395,6 +395,9 @@ void free_swap_and_cache(swp_entry_t entry)
395 struct swap_info_struct * p; 395 struct swap_info_struct * p;
396 struct page *page = NULL; 396 struct page *page = NULL;
397 397
398 if (is_migration_entry(entry))
399 return;
400
398 p = swap_info_get(entry); 401 p = swap_info_get(entry);
399 if (p) { 402 if (p) {
400 if (swap_entry_free(p, swp_offset(entry)) == 1) { 403 if (swap_entry_free(p, swp_offset(entry)) == 1) {
@@ -615,15 +618,6 @@ static int unuse_mm(struct mm_struct *mm,
615 return 0; 618 return 0;
616} 619}
617 620
618#ifdef CONFIG_MIGRATION
619int remove_vma_swap(struct vm_area_struct *vma, struct page *page)
620{
621 swp_entry_t entry = { .val = page_private(page) };
622
623 return unuse_vma(vma, entry, page);
624}
625#endif
626
627/* 621/*
628 * Scan swap_map from current position to next entry still in use. 622 * Scan swap_map from current position to next entry still in use.
629 * Recycle to start on reaching the end, returning 0 when empty. 623 * Recycle to start on reaching the end, returning 0 when empty.
@@ -716,7 +710,6 @@ static int try_to_unuse(unsigned int type)
716 */ 710 */
717 swap_map = &si->swap_map[i]; 711 swap_map = &si->swap_map[i];
718 entry = swp_entry(type, i); 712 entry = swp_entry(type, i);
719again:
720 page = read_swap_cache_async(entry, NULL, 0); 713 page = read_swap_cache_async(entry, NULL, 0);
721 if (!page) { 714 if (!page) {
722 /* 715 /*
@@ -751,12 +744,6 @@ again:
751 wait_on_page_locked(page); 744 wait_on_page_locked(page);
752 wait_on_page_writeback(page); 745 wait_on_page_writeback(page);
753 lock_page(page); 746 lock_page(page);
754 if (!PageSwapCache(page)) {
755 /* Page migration has occured */
756 unlock_page(page);
757 page_cache_release(page);
758 goto again;
759 }
760 wait_on_page_writeback(page); 747 wait_on_page_writeback(page);
761 748
762 /* 749 /*
@@ -785,10 +772,8 @@ again:
785 while (*swap_map > 1 && !retval && 772 while (*swap_map > 1 && !retval &&
786 (p = p->next) != &start_mm->mmlist) { 773 (p = p->next) != &start_mm->mmlist) {
787 mm = list_entry(p, struct mm_struct, mmlist); 774 mm = list_entry(p, struct mm_struct, mmlist);
788 if (atomic_inc_return(&mm->mm_users) == 1) { 775 if (!atomic_inc_not_zero(&mm->mm_users))
789 atomic_dec(&mm->mm_users);
790 continue; 776 continue;
791 }
792 spin_unlock(&mmlist_lock); 777 spin_unlock(&mmlist_lock);
793 mmput(prev_mm); 778 mmput(prev_mm);
794 prev_mm = mm; 779 prev_mm = mm;
@@ -1407,19 +1392,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1407 if (!(p->flags & SWP_USED)) 1392 if (!(p->flags & SWP_USED))
1408 break; 1393 break;
1409 error = -EPERM; 1394 error = -EPERM;
1410 /* 1395 if (type >= MAX_SWAPFILES) {
1411 * Test if adding another swap device is possible. There are
1412 * two limiting factors: 1) the number of bits for the swap
1413 * type swp_entry_t definition and 2) the number of bits for
1414 * the swap type in the swap ptes as defined by the different
1415 * architectures. To honor both limitations a swap entry
1416 * with swap offset 0 and swap type ~0UL is created, encoded
1417 * to a swap pte, decoded to a swp_entry_t again and finally
1418 * the swap type part is extracted. This will mask all bits
1419 * from the initial ~0UL that can't be encoded in either the
1420 * swp_entry_t or the architecture definition of a swap pte.
1421 */
1422 if (type > swp_type(pte_to_swp_entry(swp_entry_to_pte(swp_entry(~0UL,0))))) {
1423 spin_unlock(&swap_lock); 1396 spin_unlock(&swap_lock);
1424 goto out; 1397 goto out;
1425 } 1398 }
@@ -1504,8 +1477,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1504 error = -EINVAL; 1477 error = -EINVAL;
1505 goto bad_swap; 1478 goto bad_swap;
1506 } 1479 }
1507 page = read_cache_page(mapping, 0, 1480 page = read_mapping_page(mapping, 0, swap_file);
1508 (filler_t *)mapping->a_ops->readpage, swap_file);
1509 if (IS_ERR(page)) { 1481 if (IS_ERR(page)) {
1510 error = PTR_ERR(page); 1482 error = PTR_ERR(page);
1511 goto bad_swap; 1483 goto bad_swap;
@@ -1709,6 +1681,9 @@ int swap_duplicate(swp_entry_t entry)
1709 unsigned long offset, type; 1681 unsigned long offset, type;
1710 int result = 0; 1682 int result = 0;
1711 1683
1684 if (is_migration_entry(entry))
1685 return 1;
1686
1712 type = swp_type(entry); 1687 type = swp_type(entry);
1713 if (type >= nr_swapfiles) 1688 if (type >= nr_swapfiles)
1714 goto bad_file; 1689 goto bad_file;
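
Besides skipping migration entries in free_swap_and_cache() and swap_duplicate(), the swapfile.c hunks replace the "increment, then undo if we resurrected a dying mm" dance on mm->mm_users with atomic_inc_not_zero(), which takes the reference only if the count is still non-zero. The pattern in isolation -- struct foo and its 'users' field stand in for any atomic_t reference count, and headers are elided:

struct foo {
	atomic_t users;
};

/* old style: speculative increment followed by a compensating decrement */
static struct foo *foo_get_old(struct foo *obj)
{
	if (atomic_inc_return(&obj->users) == 1) {
		atomic_dec(&obj->users);	/* raced with the final put */
		return NULL;
	}
	return obj;
}

/* new style: take a reference only if at least one is already held */
static struct foo *foo_get_new(struct foo *obj)
{
	if (!atomic_inc_not_zero(&obj->users))
		return NULL;
	return obj;
}
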
diff --git a/mm/truncate.c b/mm/truncate.c
index 6cb3fff25f67..cf1b015df4a7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -230,14 +230,24 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
230 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 230 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
231 for (i = 0; i < pagevec_count(&pvec); i++) { 231 for (i = 0; i < pagevec_count(&pvec); i++) {
232 struct page *page = pvec.pages[i]; 232 struct page *page = pvec.pages[i];
233 pgoff_t index;
234 int lock_failed;
233 235
234 if (TestSetPageLocked(page)) { 236 lock_failed = TestSetPageLocked(page);
235 next++; 237
236 continue; 238 /*
237 } 239 * We really shouldn't be looking at the ->index of an
238 if (page->index > next) 240 * unlocked page. But we're not allowed to lock these
239 next = page->index; 241 * pages. So we rely upon nobody altering the ->index
242 * of this (pinned-by-us) page.
243 */
244 index = page->index;
245 if (index > next)
246 next = index;
240 next++; 247 next++;
248 if (lock_failed)
249 continue;
250
241 if (PageDirty(page) || PageWriteback(page)) 251 if (PageDirty(page) || PageWriteback(page))
242 goto unlock; 252 goto unlock;
243 if (page_mapped(page)) 253 if (page_mapped(page))
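
The truncate.c change makes invalidate_mapping_pages() advance 'next' from a snapshot of page->index taken while the page is only pinned, before it knows whether the non-blocking trylock succeeded; previously a page whose lock could not be taken contributed nothing, which could leave the cursor behind. Condensed, the new per-page ordering is (control-flow sketch only, matching the hunk above):

	lock_failed = TestSetPageLocked(page);	/* non-blocking trylock */

	index = page->index;	/* pinned, so nobody should change ->index */
	if (index > next)
		next = index;
	next++;

	if (lock_failed)
		continue;	/* skip the work, but the cursor still advanced */
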
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c0504f1e34eb..35f8553f893a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -257,6 +257,19 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int
257} 257}
258 258
259/* Caller must hold vmlist_lock */ 259/* Caller must hold vmlist_lock */
260static struct vm_struct *__find_vm_area(void *addr)
261{
262 struct vm_struct *tmp;
263
264 for (tmp = vmlist; tmp != NULL; tmp = tmp->next) {
265 if (tmp->addr == addr)
266 break;
267 }
268
269 return tmp;
270}
271
272/* Caller must hold vmlist_lock */
260struct vm_struct *__remove_vm_area(void *addr) 273struct vm_struct *__remove_vm_area(void *addr)
261{ 274{
262 struct vm_struct **p, *tmp; 275 struct vm_struct **p, *tmp;
@@ -498,11 +511,33 @@ EXPORT_SYMBOL(__vmalloc);
498 */ 511 */
499void *vmalloc(unsigned long size) 512void *vmalloc(unsigned long size)
500{ 513{
501 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); 514 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
502} 515}
503EXPORT_SYMBOL(vmalloc); 516EXPORT_SYMBOL(vmalloc);
504 517
505/** 518/**
519 * vmalloc_user - allocate virtually contiguous memory which has
520 * been zeroed so it can be mapped to userspace without
521 * leaking data.
522 *
523 * @size: allocation size
524 */
525void *vmalloc_user(unsigned long size)
526{
527 struct vm_struct *area;
528 void *ret;
529
530 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
531 write_lock(&vmlist_lock);
532 area = __find_vm_area(ret);
533 area->flags |= VM_USERMAP;
534 write_unlock(&vmlist_lock);
535
536 return ret;
537}
538EXPORT_SYMBOL(vmalloc_user);
539
540/**
506 * vmalloc_node - allocate memory on a specific node 541 * vmalloc_node - allocate memory on a specific node
507 * 542 *
508 * @size: allocation size 543 * @size: allocation size
@@ -516,7 +551,7 @@ EXPORT_SYMBOL(vmalloc);
516 */ 551 */
517void *vmalloc_node(unsigned long size, int node) 552void *vmalloc_node(unsigned long size, int node)
518{ 553{
519 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); 554 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node);
520} 555}
521EXPORT_SYMBOL(vmalloc_node); 556EXPORT_SYMBOL(vmalloc_node);
522 557
@@ -556,6 +591,28 @@ void *vmalloc_32(unsigned long size)
556} 591}
557EXPORT_SYMBOL(vmalloc_32); 592EXPORT_SYMBOL(vmalloc_32);
558 593
594/**
595 * vmalloc_32_user - allocate virtually contiguous memory (32bit
596 * addressable) which is zeroed so it can be
597 * mapped to userspace without leaking data.
598 *
599 * @size: allocation size
600 */
601void *vmalloc_32_user(unsigned long size)
602{
603 struct vm_struct *area;
604 void *ret;
605
606 ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
607 write_lock(&vmlist_lock);
608 area = __find_vm_area(ret);
609 area->flags |= VM_USERMAP;
610 write_unlock(&vmlist_lock);
611
612 return ret;
613}
614EXPORT_SYMBOL(vmalloc_32_user);
615
559long vread(char *buf, char *addr, unsigned long count) 616long vread(char *buf, char *addr, unsigned long count)
560{ 617{
561 struct vm_struct *tmp; 618 struct vm_struct *tmp;
@@ -630,3 +687,64 @@ finished:
630 read_unlock(&vmlist_lock); 687 read_unlock(&vmlist_lock);
631 return buf - buf_start; 688 return buf - buf_start;
632} 689}
690
691/**
692 * remap_vmalloc_range - map vmalloc pages to userspace
693 *
694 * @vma: vma to cover (map full range of vma)
695 * @addr: vmalloc memory
696 * @pgoff: number of pages into addr before first page to map
697 * @returns: 0 for success, -Exxx on failure
698 *
699 * This function checks that addr is a valid vmalloc'ed area, and
700 * that it is big enough to cover the vma. Will return failure if
701 * that criteria isn't met.
702 * those criteria aren't met.
703 * Similar to remap_pfn_range (see mm/memory.c)
704 */
705int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
706 unsigned long pgoff)
707{
708 struct vm_struct *area;
709 unsigned long uaddr = vma->vm_start;
710 unsigned long usize = vma->vm_end - vma->vm_start;
711 int ret;
712
713 if ((PAGE_SIZE-1) & (unsigned long)addr)
714 return -EINVAL;
715
716 read_lock(&vmlist_lock);
717 area = __find_vm_area(addr);
718 if (!area)
719 goto out_einval_locked;
720
721 if (!(area->flags & VM_USERMAP))
722 goto out_einval_locked;
723
724 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
725 goto out_einval_locked;
726 read_unlock(&vmlist_lock);
727
728 addr += pgoff << PAGE_SHIFT;
729 do {
730 struct page *page = vmalloc_to_page(addr);
731 ret = vm_insert_page(vma, uaddr, page);
732 if (ret)
733 return ret;
734
735 uaddr += PAGE_SIZE;
736 addr += PAGE_SIZE;
737 usize -= PAGE_SIZE;
738 } while (usize > 0);
739
740 /* Prevent "things" like memory migration? VM_flags need a cleanup... */
741 vma->vm_flags |= VM_RESERVED;
742
743 return ret;
744
745out_einval_locked:
746 read_unlock(&vmlist_lock);
747 return -EINVAL;
748}
749EXPORT_SYMBOL(remap_vmalloc_range);
750
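
The new vmalloc_user()/vmalloc_32_user() helpers return zeroed memory whose vm_struct carries VM_USERMAP, and remap_vmalloc_range() refuses to map anything not flagged that way, so only deliberately exported buffers can reach userspace. A minimal driver-side sketch of the intended use -- example_mmap() and dev_buf are illustrative names, and allocation/teardown are elided:

#include <linux/vmalloc.h>
#include <linux/fs.h>
#include <linux/mm.h>

static void *dev_buf;	/* assumed: dev_buf = vmalloc_user(BUF_SIZE); */

/* .mmap handler of a hypothetical character device */
static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	/*
	 * Maps the whole vma onto dev_buf starting at page offset 0.
	 * Fails with -EINVAL unless dev_buf was obtained from
	 * vmalloc_user()/vmalloc_32_user() and is large enough.
	 */
	return remap_vmalloc_range(vma, dev_buf, 0);
}
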
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 440a733fe2e9..72babac71dea 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -61,6 +61,8 @@ struct scan_control {
61 * In this context, it doesn't matter that we scan the 61 * In this context, it doesn't matter that we scan the
62 * whole list at once. */ 62 * whole list at once. */
63 int swap_cluster_max; 63 int swap_cluster_max;
64
65 int swappiness;
64}; 66};
65 67
66/* 68/*
@@ -108,7 +110,7 @@ struct shrinker {
108 * From 0 .. 100. Higher means more swappy. 110 * From 0 .. 100. Higher means more swappy.
109 */ 111 */
110int vm_swappiness = 60; 112int vm_swappiness = 60;
111static long total_memory; 113long vm_total_pages; /* The total number of pages which the VM controls */
112 114
113static LIST_HEAD(shrinker_list); 115static LIST_HEAD(shrinker_list);
114static DECLARE_RWSEM(shrinker_rwsem); 116static DECLARE_RWSEM(shrinker_rwsem);
@@ -288,11 +290,23 @@ static void handle_write_error(struct address_space *mapping,
288 unlock_page(page); 290 unlock_page(page);
289} 291}
290 292
293/* possible outcome of pageout() */
294typedef enum {
295 /* failed to write page out, page is locked */
296 PAGE_KEEP,
297 /* move page to the active list, page is locked */
298 PAGE_ACTIVATE,
299 /* page has been sent to the disk successfully, page is unlocked */
300 PAGE_SUCCESS,
301 /* page is clean and locked */
302 PAGE_CLEAN,
303} pageout_t;
304
291/* 305/*
292 * pageout is called by shrink_page_list() for each dirty page. 306 * pageout is called by shrink_page_list() for each dirty page.
293 * Calls ->writepage(). 307 * Calls ->writepage().
294 */ 308 */
295pageout_t pageout(struct page *page, struct address_space *mapping) 309static pageout_t pageout(struct page *page, struct address_space *mapping)
296{ 310{
297 /* 311 /*
298 * If the page is dirty, only perform writeback if that write 312 * If the page is dirty, only perform writeback if that write
@@ -337,6 +351,8 @@ pageout_t pageout(struct page *page, struct address_space *mapping)
337 struct writeback_control wbc = { 351 struct writeback_control wbc = {
338 .sync_mode = WB_SYNC_NONE, 352 .sync_mode = WB_SYNC_NONE,
339 .nr_to_write = SWAP_CLUSTER_MAX, 353 .nr_to_write = SWAP_CLUSTER_MAX,
354 .range_start = 0,
355 .range_end = LLONG_MAX,
340 .nonblocking = 1, 356 .nonblocking = 1,
341 .for_reclaim = 1, 357 .for_reclaim = 1,
342 }; 358 };
@@ -727,7 +743,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
727 * how much memory 743 * how much memory
728 * is mapped. 744 * is mapped.
729 */ 745 */
730 mapped_ratio = (sc->nr_mapped * 100) / total_memory; 746 mapped_ratio = (sc->nr_mapped * 100) / vm_total_pages;
731 747
732 /* 748 /*
733 * Now decide how much we really want to unmap some pages. The 749 * Now decide how much we really want to unmap some pages. The
@@ -741,7 +757,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
741 * A 100% value of vm_swappiness overrides this algorithm 757 * A 100% value of vm_swappiness overrides this algorithm
742 * altogether. 758 * altogether.
743 */ 759 */
744 swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; 760 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
745 761
746 /* 762 /*
747 * Now use this metric to decide whether to start moving mapped 763 * Now use this metric to decide whether to start moving mapped
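
With swappiness carried in struct scan_control, the decision point above becomes swap_tendency = mapped_ratio / 2 + distress + sc->swappiness, so a caller can force mapped-page reclaim per reclaim attempt (shrink_all_memory() below sets sc.swappiness = 100 in its later passes) without touching the global sysctl. As a rough worked example with the default value of 60: if mapped pages account for half of memory (mapped_ratio = 50) and reclaim is under no pressure (distress = 0), then swap_tendency = 25 + 0 + 60 = 85, which stays below the usual trigger of 100 and leaves mapped pages alone; with swappiness = 100 the same state yields 125 and mapped pages become eligible for reclaim.
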
@@ -957,6 +973,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
957 .may_writepage = !laptop_mode, 973 .may_writepage = !laptop_mode,
958 .swap_cluster_max = SWAP_CLUSTER_MAX, 974 .swap_cluster_max = SWAP_CLUSTER_MAX,
959 .may_swap = 1, 975 .may_swap = 1,
976 .swappiness = vm_swappiness,
960 }; 977 };
961 978
962 inc_page_state(allocstall); 979 inc_page_state(allocstall);
@@ -1021,10 +1038,6 @@ out:
1021 * For kswapd, balance_pgdat() will work across all this node's zones until 1038 * For kswapd, balance_pgdat() will work across all this node's zones until
1022 * they are all at pages_high. 1039 * they are all at pages_high.
1023 * 1040 *
1024 * If `nr_pages' is non-zero then it is the number of pages which are to be
1025 * reclaimed, regardless of the zone occupancies. This is a software suspend
1026 * special.
1027 *
1028 * Returns the number of pages which were actually freed. 1041 * Returns the number of pages which were actually freed.
1029 * 1042 *
1030 * There is special handling here for zones which are full of pinned pages. 1043 * There is special handling here for zones which are full of pinned pages.
@@ -1042,10 +1055,8 @@ out:
1042 * the page allocator fallback scheme to ensure that aging of pages is balanced 1055 * the page allocator fallback scheme to ensure that aging of pages is balanced
1043 * across the zones. 1056 * across the zones.
1044 */ 1057 */
1045static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, 1058static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1046 int order)
1047{ 1059{
1048 unsigned long to_free = nr_pages;
1049 int all_zones_ok; 1060 int all_zones_ok;
1050 int priority; 1061 int priority;
1051 int i; 1062 int i;
@@ -1055,7 +1066,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
1055 struct scan_control sc = { 1066 struct scan_control sc = {
1056 .gfp_mask = GFP_KERNEL, 1067 .gfp_mask = GFP_KERNEL,
1057 .may_swap = 1, 1068 .may_swap = 1,
1058 .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX, 1069 .swap_cluster_max = SWAP_CLUSTER_MAX,
1070 .swappiness = vm_swappiness,
1059 }; 1071 };
1060 1072
1061loop_again: 1073loop_again:
@@ -1082,31 +1094,26 @@ loop_again:
1082 1094
1083 all_zones_ok = 1; 1095 all_zones_ok = 1;
1084 1096
1085 if (nr_pages == 0) { 1097 /*
1086 /* 1098 * Scan in the highmem->dma direction for the highest
1087 * Scan in the highmem->dma direction for the highest 1099 * zone which needs scanning
1088 * zone which needs scanning 1100 */
1089 */ 1101 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
1090 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 1102 struct zone *zone = pgdat->node_zones + i;
1091 struct zone *zone = pgdat->node_zones + i;
1092 1103
1093 if (!populated_zone(zone)) 1104 if (!populated_zone(zone))
1094 continue; 1105 continue;
1095 1106
1096 if (zone->all_unreclaimable && 1107 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1097 priority != DEF_PRIORITY) 1108 continue;
1098 continue;
1099 1109
1100 if (!zone_watermark_ok(zone, order, 1110 if (!zone_watermark_ok(zone, order, zone->pages_high,
1101 zone->pages_high, 0, 0)) { 1111 0, 0)) {
1102 end_zone = i; 1112 end_zone = i;
1103 goto scan; 1113 goto scan;
1104 }
1105 } 1114 }
1106 goto out;
1107 } else {
1108 end_zone = pgdat->nr_zones - 1;
1109 } 1115 }
1116 goto out;
1110scan: 1117scan:
1111 for (i = 0; i <= end_zone; i++) { 1118 for (i = 0; i <= end_zone; i++) {
1112 struct zone *zone = pgdat->node_zones + i; 1119 struct zone *zone = pgdat->node_zones + i;
@@ -1133,11 +1140,9 @@ scan:
1133 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1140 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1134 continue; 1141 continue;
1135 1142
1136 if (nr_pages == 0) { /* Not software suspend */ 1143 if (!zone_watermark_ok(zone, order, zone->pages_high,
1137 if (!zone_watermark_ok(zone, order, 1144 end_zone, 0))
1138 zone->pages_high, end_zone, 0)) 1145 all_zones_ok = 0;
1139 all_zones_ok = 0;
1140 }
1141 zone->temp_priority = priority; 1146 zone->temp_priority = priority;
1142 if (zone->prev_priority > priority) 1147 if (zone->prev_priority > priority)
1143 zone->prev_priority = priority; 1148 zone->prev_priority = priority;
@@ -1162,8 +1167,6 @@ scan:
1162 total_scanned > nr_reclaimed + nr_reclaimed / 2) 1167 total_scanned > nr_reclaimed + nr_reclaimed / 2)
1163 sc.may_writepage = 1; 1168 sc.may_writepage = 1;
1164 } 1169 }
1165 if (nr_pages && to_free > nr_reclaimed)
1166 continue; /* swsusp: need to do more work */
1167 if (all_zones_ok) 1170 if (all_zones_ok)
1168 break; /* kswapd: all done */ 1171 break; /* kswapd: all done */
1169 /* 1172 /*
@@ -1179,7 +1182,7 @@ scan:
1179 * matches the direct reclaim path behaviour in terms of impact 1182 * matches the direct reclaim path behaviour in terms of impact
1180 * on zone->*_priority. 1183 * on zone->*_priority.
1181 */ 1184 */
1182 if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages) 1185 if (nr_reclaimed >= SWAP_CLUSTER_MAX)
1183 break; 1186 break;
1184 } 1187 }
1185out: 1188out:
@@ -1261,7 +1264,7 @@ static int kswapd(void *p)
1261 } 1264 }
1262 finish_wait(&pgdat->kswapd_wait, &wait); 1265 finish_wait(&pgdat->kswapd_wait, &wait);
1263 1266
1264 balance_pgdat(pgdat, 0, order); 1267 balance_pgdat(pgdat, order);
1265 } 1268 }
1266 return 0; 1269 return 0;
1267} 1270}
@@ -1290,35 +1293,154 @@ void wakeup_kswapd(struct zone *zone, int order)
1290 1293
1291#ifdef CONFIG_PM 1294#ifdef CONFIG_PM
1292/* 1295/*
1293 * Try to free `nr_pages' of memory, system-wide. Returns the number of freed 1296 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
1294 * pages. 1297 * from LRU lists system-wide, for given pass and priority, and returns the
1298 * number of reclaimed pages
1299 *
1300 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
1301 */
1302static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
1303 int prio, struct scan_control *sc)
1304{
1305 struct zone *zone;
1306 unsigned long nr_to_scan, ret = 0;
1307
1308 for_each_zone(zone) {
1309
1310 if (!populated_zone(zone))
1311 continue;
1312
1313 if (zone->all_unreclaimable && prio != DEF_PRIORITY)
1314 continue;
1315
1316 /* For pass = 0 we don't shrink the active list */
1317 if (pass > 0) {
1318 zone->nr_scan_active += (zone->nr_active >> prio) + 1;
1319 if (zone->nr_scan_active >= nr_pages || pass > 3) {
1320 zone->nr_scan_active = 0;
1321 nr_to_scan = min(nr_pages, zone->nr_active);
1322 shrink_active_list(nr_to_scan, zone, sc);
1323 }
1324 }
1325
1326 zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
1327 if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
1328 zone->nr_scan_inactive = 0;
1329 nr_to_scan = min(nr_pages, zone->nr_inactive);
1330 ret += shrink_inactive_list(nr_to_scan, zone, sc);
1331 if (ret >= nr_pages)
1332 return ret;
1333 }
1334 }
1335
1336 return ret;
1337}
1338
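
Editor's note: the new helper batches scanning per zone — each call adds roughly nr_inactive >> prio (and likewise for the active list) to a per-zone counter and only shrinks the list once that counter covers the whole request, so small zones are not hammered at high priorities; on the late passes (pass > 3) it scans regardless. A stand-alone sketch of that accumulation for the inactive side, with mock zone fields and a stub shrink routine rather than the kernel's LRU lists:

        #include <stdio.h>

        /* Mock zone: just the inactive-list bookkeeping used by the sketch. */
        struct mock_zone {
                unsigned long nr_inactive;
                unsigned long nr_scan_inactive;
        };

        /* Stub for shrink_inactive_list(): pretend everything asked for is freed. */
        static unsigned long shrink_inactive_stub(unsigned long nr_to_scan,
                                                  struct mock_zone *zone)
        {
                if (nr_to_scan > zone->nr_inactive)
                        nr_to_scan = zone->nr_inactive;
                zone->nr_inactive -= nr_to_scan;
                return nr_to_scan;
        }

        /*
         * One shrink_all_zones()-style step for a single zone: accumulate a
         * scan quota of (nr_inactive >> prio) + 1 and only scan once the quota
         * covers the whole request (or unconditionally on the late passes).
         */
        static unsigned long shrink_zone_step(struct mock_zone *zone,
                                              unsigned long nr_pages,
                                              int prio, int pass)
        {
                unsigned long nr_to_scan;

                zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
                if (zone->nr_scan_inactive < nr_pages && pass <= 3)
                        return 0;               /* not enough quota yet */

                zone->nr_scan_inactive = 0;
                nr_to_scan = nr_pages < zone->nr_inactive ?
                             nr_pages : zone->nr_inactive;
                return shrink_inactive_stub(nr_to_scan, zone);
        }

        int main(void)
        {
                struct mock_zone zone = { .nr_inactive = 10000, .nr_scan_inactive = 0 };
                unsigned long freed = 0;
                int prio;

                /* Walk priorities the way shrink_all_memory() does, pass 0. */
                for (prio = 12; prio >= 0 && freed < 1000; prio--)
                        freed += shrink_zone_step(&zone, 1000, prio, 0);

                printf("freed %lu pages, %lu still inactive\n",
                       freed, zone.nr_inactive);
                return 0;
        }
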
1339/*
1340 * Try to free `nr_pages' of memory, system-wide, and return the number of
1341 * freed pages.
1342 *
1343 * Rather than trying to age LRUs the aim is to preserve the overall
1344 * LRU order by reclaiming preferentially
1345 * inactive > active > active referenced > active mapped
1295 */ 1346 */
1296unsigned long shrink_all_memory(unsigned long nr_pages) 1347unsigned long shrink_all_memory(unsigned long nr_pages)
1297{ 1348{
1298 pg_data_t *pgdat; 1349 unsigned long lru_pages, nr_slab;
1299 unsigned long nr_to_free = nr_pages;
1300 unsigned long ret = 0; 1350 unsigned long ret = 0;
1301 unsigned retry = 2; 1351 int pass;
1302 struct reclaim_state reclaim_state = { 1352 struct reclaim_state reclaim_state;
1303 .reclaimed_slab = 0, 1353 struct zone *zone;
1354 struct scan_control sc = {
1355 .gfp_mask = GFP_KERNEL,
1356 .may_swap = 0,
1357 .swap_cluster_max = nr_pages,
1358 .may_writepage = 1,
1359 .swappiness = vm_swappiness,
1304 }; 1360 };
1305 1361
1306 current->reclaim_state = &reclaim_state; 1362 current->reclaim_state = &reclaim_state;
1307repeat:
1308 for_each_online_pgdat(pgdat) {
1309 unsigned long freed;
1310 1363
1311 freed = balance_pgdat(pgdat, nr_to_free, 0); 1364 lru_pages = 0;
1312 ret += freed; 1365 for_each_zone(zone)
1313 nr_to_free -= freed; 1366 lru_pages += zone->nr_active + zone->nr_inactive;
1314 if ((long)nr_to_free <= 0) 1367
1368 nr_slab = read_page_state(nr_slab);
1369 /* If slab caches are huge, it's better to hit them first */
1370 while (nr_slab >= lru_pages) {
1371 reclaim_state.reclaimed_slab = 0;
1372 shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
1373 if (!reclaim_state.reclaimed_slab)
1315 break; 1374 break;
1375
1376 ret += reclaim_state.reclaimed_slab;
1377 if (ret >= nr_pages)
1378 goto out;
1379
1380 nr_slab -= reclaim_state.reclaimed_slab;
1316 } 1381 }
1317 if (retry-- && ret < nr_pages) { 1382
1318 blk_congestion_wait(WRITE, HZ/5); 1383 /*
1319 goto repeat; 1384 * We try to shrink LRUs in 5 passes:
1385 * 0 = Reclaim from inactive_list only
1386 * 1 = Reclaim from active list but don't reclaim mapped
1387 * 2 = 2nd pass of type 1
1388 * 3 = Reclaim mapped (normal reclaim)
1389 * 4 = 2nd pass of type 3
1390 */
1391 for (pass = 0; pass < 5; pass++) {
1392 int prio;
1393
1394 /* Needed for shrinking slab caches later on */
1395 if (!lru_pages)
1396 for_each_zone(zone) {
1397 lru_pages += zone->nr_active;
1398 lru_pages += zone->nr_inactive;
1399 }
1400
1401 /* Force reclaiming mapped pages in the passes #3 and #4 */
1402 if (pass > 2) {
1403 sc.may_swap = 1;
1404 sc.swappiness = 100;
1405 }
1406
1407 for (prio = DEF_PRIORITY; prio >= 0; prio--) {
1408 unsigned long nr_to_scan = nr_pages - ret;
1409
1410 sc.nr_mapped = read_page_state(nr_mapped);
1411 sc.nr_scanned = 0;
1412
1413 ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
1414 if (ret >= nr_pages)
1415 goto out;
1416
1417 reclaim_state.reclaimed_slab = 0;
1418 shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
1419 ret += reclaim_state.reclaimed_slab;
1420 if (ret >= nr_pages)
1421 goto out;
1422
1423 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
1424 blk_congestion_wait(WRITE, HZ / 10);
1425 }
1426
1427 lru_pages = 0;
1320 } 1428 }
1429
1430 /*
1431 * If ret = 0, we could not shrink LRUs, but there may be something
1432 * in slab caches
1433 */
1434 if (!ret)
1435 do {
1436 reclaim_state.reclaimed_slab = 0;
1437 shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
1438 ret += reclaim_state.reclaimed_slab;
1439 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
1440
1441out:
1321 current->reclaim_state = NULL; 1442 current->reclaim_state = NULL;
1443
1322 return ret; 1444 return ret;
1323} 1445}
1324#endif 1446#endif
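
Editor's note: taken together, the rewritten shrink_all_memory() frees memory for suspend in five increasingly aggressive passes (inactive only, then active-but-unmapped twice, then mapped twice), walking the usual priority ladder inside each pass and topping up from slab caches along the way; the initial slab-heavy loop and the final slab-only fallback are left out of the sketch below for brevity. The code is a hedged, user-space illustration of that orchestration, with stub reclaim routines in place of shrink_all_zones() and shrink_slab():

        #include <stdio.h>
        #include <stdbool.h>

        #define DEF_PRIORITY 12

        /* Knobs the passes flip, mirroring the scan_control fields in the patch. */
        struct mock_scan_control {
                bool may_swap;          /* reclaim mapped pages? */
                int swappiness;
        };

        /* Stub for shrink_all_zones(): reclaim more as priority drops and once
         * mapped pages are fair game; the pass number itself is unused here. */
        static unsigned long shrink_zones_stub(unsigned long nr_to_scan, int prio,
                                               int pass, struct mock_scan_control *sc)
        {
                unsigned long got = (DEF_PRIORITY - prio) * (sc->may_swap ? 8 : 2);

                (void)pass;
                return got < nr_to_scan ? got : nr_to_scan;
        }

        /* Stub for shrink_slab(): pretend a fixed trickle of slab pages per call. */
        static unsigned long shrink_slab_stub(void)
        {
                return 5;
        }

        static unsigned long shrink_all_memory_sketch(unsigned long nr_pages)
        {
                struct mock_scan_control sc = { .may_swap = false, .swappiness = 60 };
                unsigned long ret = 0;
                int pass, prio;

                /*
                 * The patch runs five passes: 0 touches only the inactive list,
                 * 1-2 add the active list but skip mapped pages, 3-4 reclaim
                 * mapped pages too.  This sketch only models the pass 3-4
                 * switch to more aggressive settings.
                 */
                for (pass = 0; pass < 5; pass++) {
                        if (pass > 2) {
                                sc.may_swap = true;
                                sc.swappiness = 100;
                        }
                        for (prio = DEF_PRIORITY; prio >= 0; prio--) {
                                ret += shrink_zones_stub(nr_pages - ret, prio,
                                                         pass, &sc);
                                if (ret >= nr_pages)
                                        return ret;
                                ret += shrink_slab_stub();
                                if (ret >= nr_pages)
                                        return ret;
                        }
                }
                return ret;
        }

        int main(void)
        {
                unsigned long want = 400;
                unsigned long got = shrink_all_memory_sketch(want);

                printf("asked for %lu pages, sketch reclaimed %lu\n", want, got);
                return 0;
        }
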
@@ -1360,7 +1482,6 @@ static int __init kswapd_init(void)
1360 pgdat->kswapd = find_task_by_pid(pid); 1482 pgdat->kswapd = find_task_by_pid(pid);
1361 read_unlock(&tasklist_lock); 1483 read_unlock(&tasklist_lock);
1362 } 1484 }
1363 total_memory = nr_free_pagecache_pages();
1364 hotcpu_notifier(cpu_callback, 0); 1485 hotcpu_notifier(cpu_callback, 0);
1365 return 0; 1486 return 0;
1366} 1487}
@@ -1416,6 +1537,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1416 .swap_cluster_max = max_t(unsigned long, nr_pages, 1537 .swap_cluster_max = max_t(unsigned long, nr_pages,
1417 SWAP_CLUSTER_MAX), 1538 SWAP_CLUSTER_MAX),
1418 .gfp_mask = gfp_mask, 1539 .gfp_mask = gfp_mask,
1540 .swappiness = vm_swappiness,
1419 }; 1541 };
1420 1542
1421 disable_swap_token(); 1543 disable_swap_token();