author    Dave Kleikamp <shaggy@austin.ibm.com>    2006-01-24 15:34:47 -0500
committer Dave Kleikamp <shaggy@austin.ibm.com>    2006-01-24 15:34:47 -0500
commit    0a0fc0ddbe732779366ab6b1b879f62195e65967 (patch)
tree      7b42490a676cf39ae0691b6859ecf7fd410f229b /mm
parent    4d5dbd0945d9e0833dd7964a3d6ee33157f7cc7a (diff)
parent    3ee68c4af3fd7228c1be63254b9f884614f9ebb2 (diff)
Merge with /home/shaggy/git/linus-clean/
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |   15
-rw-r--r--  mm/Makefile           |    6
-rw-r--r--  mm/bootmem.c          |   60
-rw-r--r--  mm/fadvise.c          |    5
-rw-r--r--  mm/filemap.c          |  162
-rw-r--r--  mm/filemap_xip.c      |    8
-rw-r--r--  mm/fremap.c           |   52
-rw-r--r--  mm/hugetlb.c          |  200
-rw-r--r--  mm/internal.h         |   21
-rw-r--r--  mm/madvise.c          |   37
-rw-r--r--  mm/memory.c           |  438
-rw-r--r--  mm/memory_hotplug.c   |    3
-rw-r--r--  mm/mempolicy.c        |  708
-rw-r--r--  mm/mlock.c            |    1
-rw-r--r--  mm/mmap.c             |   16
-rw-r--r--  mm/mprotect.c         |    8
-rw-r--r--  mm/mremap.c           |    3
-rw-r--r--  mm/msync.c            |   14
-rw-r--r--  mm/nommu.c            |    9
-rw-r--r--  mm/oom_kill.c         |    8
-rw-r--r--  mm/page-writeback.c   |   18
-rw-r--r--  mm/page_alloc.c       |  788
-rw-r--r--  mm/pdflush.c          |    2
-rw-r--r--  mm/readahead.c        |   15
-rw-r--r--  mm/rmap.c             |  134
-rw-r--r--  mm/shmem.c            |   81
-rw-r--r--  mm/slab.c             | 1247
-rw-r--r--  mm/slob.c             |  385
-rw-r--r--  mm/sparse.c           |    4
-rw-r--r--  mm/swap.c             |   58
-rw-r--r--  mm/swap_state.c       |    8
-rw-r--r--  mm/swapfile.c         |   60
-rw-r--r--  mm/thrash.c           |   10
-rw-r--r--  mm/tiny-shmem.c       |   29
-rw-r--r--  mm/truncate.c         |   51
-rw-r--r--  mm/util.c             |   39
-rw-r--r--  mm/vmscan.c           |  485
37 files changed, 3479 insertions, 1709 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index ae9ce6b73e..a9cb80ae64 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -11,7 +11,7 @@ choice
11 11
12config FLATMEM_MANUAL 12config FLATMEM_MANUAL
13 bool "Flat Memory" 13 bool "Flat Memory"
14 depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE 14 depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
15 help 15 help
16 This option allows you to change some of the ways that 16 This option allows you to change some of the ways that
17 Linux manages its memory internally. Most users will 17 Linux manages its memory internally. Most users will
@@ -125,12 +125,17 @@ comment "Memory hotplug is currently incompatible with Software Suspend"
125# space can be handled with less contention: split it at this NR_CPUS. 125# space can be handled with less contention: split it at this NR_CPUS.
126# Default to 4 for wider testing, though 8 might be more appropriate. 126# Default to 4 for wider testing, though 8 might be more appropriate.
127# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. 127# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
128# PA-RISC's debug spinlock_t is too large for the 32-bit struct page. 128# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
129# ARM26 and SPARC32 and PPC64 may use one page for multiple page tables.
130# 129#
131config SPLIT_PTLOCK_CPUS 130config SPLIT_PTLOCK_CPUS
132 int 131 int
133 default "4096" if ARM && !CPU_CACHE_VIPT 132 default "4096" if ARM && !CPU_CACHE_VIPT
134 default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT 133 default "4096" if PARISC && !PA20
135 default "4096" if ARM26 || SPARC32 || PPC64
136 default "4" 134 default "4"
135
136#
137# support for page migration
138#
139config MIGRATION
140 def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
141 depends on SWAP
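
The SPLIT_PTLOCK_CPUS knob in the Kconfig hunk above decides when page-table locking switches from the single mm-wide page_table_lock to one spinlock per page-table page. A rough sketch of how such a threshold is typically consumed follows; the helper name and the per-page lock field are assumptions for illustration, not the kernel's actual definitions.

/* hypothetical sketch, not the kernel's real macros */
#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
/* each page-table page carries its own spinlock (assumed field name) */
#define example_pte_lockptr(mm, pmd)    (&pmd_page(*(pmd))->ptl)
#else
/* all page tables in the mm share one lock */
#define example_pte_lockptr(mm, pmd)    (&(mm)->page_table_lock)
#endif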
diff --git a/mm/Makefile b/mm/Makefile
index 2fa6d2ca9f..9aa03fa1dc 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,8 +9,8 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 page_alloc.o page-writeback.o pdflush.o \ 11 page_alloc.o page-writeback.o pdflush.o \
12 readahead.o slab.o swap.o truncate.o vmscan.o \ 12 readahead.o swap.o truncate.o vmscan.o \
13 prio_tree.o $(mmu-y) 13 prio_tree.o util.o $(mmu-y)
14 14
15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
16obj-$(CONFIG_HUGETLBFS) += hugetlb.o 16obj-$(CONFIG_HUGETLBFS) += hugetlb.o
@@ -18,5 +18,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o
18obj-$(CONFIG_SPARSEMEM) += sparse.o 18obj-$(CONFIG_SPARSEMEM) += sparse.o
19obj-$(CONFIG_SHMEM) += shmem.o 19obj-$(CONFIG_SHMEM) += shmem.o
20obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 20obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
21obj-$(CONFIG_SLOB) += slob.o
22obj-$(CONFIG_SLAB) += slab.o
21obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 23obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
22obj-$(CONFIG_FS_XIP) += filemap_xip.o 24obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index e8c567177d..35c32290f7 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -204,6 +204,8 @@ restart_scan:
204 unsigned long j; 204 unsigned long j;
205 i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i); 205 i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i);
206 i = ALIGN(i, incr); 206 i = ALIGN(i, incr);
207 if (i >= eidx)
208 break;
207 if (test_bit(i, bdata->node_bootmem_map)) 209 if (test_bit(i, bdata->node_bootmem_map))
208 continue; 210 continue;
209 for (j = i + 1; j < i + areasize; ++j) { 211 for (j = i + 1; j < i + areasize; ++j) {
@@ -294,20 +296,12 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
294 unsigned long v = ~map[i / BITS_PER_LONG]; 296 unsigned long v = ~map[i / BITS_PER_LONG];
295 297
296 if (gofast && v == ~0UL) { 298 if (gofast && v == ~0UL) {
297 int j, order; 299 int order;
298 300
299 page = pfn_to_page(pfn); 301 page = pfn_to_page(pfn);
300 count += BITS_PER_LONG; 302 count += BITS_PER_LONG;
301 __ClearPageReserved(page);
302 order = ffs(BITS_PER_LONG) - 1; 303 order = ffs(BITS_PER_LONG) - 1;
303 set_page_refs(page, order); 304 __free_pages_bootmem(page, order);
304 for (j = 1; j < BITS_PER_LONG; j++) {
305 if (j + 16 < BITS_PER_LONG)
306 prefetchw(page + j + 16);
307 __ClearPageReserved(page + j);
308 set_page_count(page + j, 0);
309 }
310 __free_pages(page, order);
311 i += BITS_PER_LONG; 305 i += BITS_PER_LONG;
312 page += BITS_PER_LONG; 306 page += BITS_PER_LONG;
313 } else if (v) { 307 } else if (v) {
@@ -317,9 +311,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
317 for (m = 1; m && i < idx; m<<=1, page++, i++) { 311 for (m = 1; m && i < idx; m<<=1, page++, i++) {
318 if (v & m) { 312 if (v & m) {
319 count++; 313 count++;
320 __ClearPageReserved(page); 314 __free_pages_bootmem(page, 0);
321 set_page_refs(page, 0);
322 __free_page(page);
323 } 315 }
324 } 316 }
325 } else { 317 } else {
@@ -337,9 +329,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
337 count = 0; 329 count = 0;
338 for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { 330 for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
339 count++; 331 count++;
340 __ClearPageReserved(page); 332 __free_pages_bootmem(page, 0);
341 set_page_count(page, 1);
342 __free_page(page);
343 } 333 }
344 total += count; 334 total += count;
345 bdata->node_bootmem_map = NULL; 335 bdata->node_bootmem_map = NULL;
@@ -391,15 +381,14 @@ unsigned long __init free_all_bootmem (void)
391 return(free_all_bootmem_core(NODE_DATA(0))); 381 return(free_all_bootmem_core(NODE_DATA(0)));
392} 382}
393 383
394void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal, 384void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
395 unsigned long limit)
396{ 385{
397 pg_data_t *pgdat = pgdat_list; 386 pg_data_t *pgdat = pgdat_list;
398 void *ptr; 387 void *ptr;
399 388
400 for_each_pgdat(pgdat) 389 for_each_pgdat(pgdat)
401 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, 390 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
402 align, goal, limit))) 391 align, goal, 0)))
403 return(ptr); 392 return(ptr);
404 393
405 /* 394 /*
@@ -411,15 +400,40 @@ void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, un
411} 400}
412 401
413 402
414void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align, 403void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align,
415 unsigned long goal, unsigned long limit) 404 unsigned long goal)
416{ 405{
417 void *ptr; 406 void *ptr;
418 407
419 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit); 408 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
420 if (ptr) 409 if (ptr)
421 return (ptr); 410 return (ptr);
422 411
423 return __alloc_bootmem_limit(size, align, goal, limit); 412 return __alloc_bootmem(size, align, goal);
413}
414
415#define LOW32LIMIT 0xffffffff
416
417void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
418{
419 pg_data_t *pgdat = pgdat_list;
420 void *ptr;
421
422 for_each_pgdat(pgdat)
423 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
424 align, goal, LOW32LIMIT)))
425 return(ptr);
426
427 /*
428 * Whoops, we cannot satisfy the allocation request.
429 */
430 printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
431 panic("Out of low memory");
432 return NULL;
424} 433}
425 434
435void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
436 unsigned long align, unsigned long goal)
437{
438 return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT);
439}
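
The new __alloc_bootmem_low()/__alloc_bootmem_low_node() variants above cap allocations at LOW32LIMIT (4GB). A minimal, hedged usage sketch with a made-up wrapper name:

/* hedged sketch: an early-boot allocation that must sit below 4GB,
 * e.g. a buffer for a device limited to 32-bit DMA addressing */
void * __init example_alloc_dma32_buffer(unsigned long bytes)
{
        /* goal 0 = no preferred address; panics instead of returning NULL */
        return __alloc_bootmem_low(bytes, PAGE_SIZE, 0);
}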
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 5f19e87bc5..d257c89e77 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -37,6 +37,11 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
37 if (!file) 37 if (!file)
38 return -EBADF; 38 return -EBADF;
39 39
40 if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) {
41 ret = -ESPIPE;
42 goto out;
43 }
44
40 mapping = file->f_mapping; 45 mapping = file->f_mapping;
41 if (!mapping || len < 0) { 46 if (!mapping || len < 0) {
42 ret = -EINVAL; 47 ret = -EINVAL;
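
The fadvise hunk above rejects advice on FIFOs with -ESPIPE, matching POSIX. A small userspace check of that behaviour (assumes a kernel with this change; note that posix_fadvise() reports the error as its return value rather than through errno):

#define _POSIX_C_SOURCE 200112L
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        int pfd[2];
        int err;

        if (pipe(pfd))
                return 1;
        /* With the check above, advising a pipe now yields ESPIPE. */
        err = posix_fadvise(pfd[0], 0, 0, POSIX_FADV_DONTNEED);
        printf("posix_fadvise on a pipe: %s\n", err ? strerror(err) : "no error");
        close(pfd[0]);
        close(pfd[1]);
        return 0;
}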
diff --git a/mm/filemap.c b/mm/filemap.c
index 5d6e4c2000..44da3d4769 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -15,6 +15,7 @@
15#include <linux/compiler.h> 15#include <linux/compiler.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/aio.h> 17#include <linux/aio.h>
18#include <linux/capability.h>
18#include <linux/kernel_stat.h> 19#include <linux/kernel_stat.h>
19#include <linux/mm.h> 20#include <linux/mm.h>
20#include <linux/swap.h> 21#include <linux/swap.h>
@@ -61,7 +62,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
61 * ->swap_lock (exclusive_swap_page, others) 62 * ->swap_lock (exclusive_swap_page, others)
62 * ->mapping->tree_lock 63 * ->mapping->tree_lock
63 * 64 *
64 * ->i_sem 65 * ->i_mutex
65 * ->i_mmap_lock (truncate->unmap_mapping_range) 66 * ->i_mmap_lock (truncate->unmap_mapping_range)
66 * 67 *
67 * ->mmap_sem 68 * ->mmap_sem
@@ -73,9 +74,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
73 * ->lock_page (access_process_vm) 74 * ->lock_page (access_process_vm)
74 * 75 *
75 * ->mmap_sem 76 * ->mmap_sem
76 * ->i_sem (msync) 77 * ->i_mutex (msync)
77 * 78 *
78 * ->i_sem 79 * ->i_mutex
79 * ->i_alloc_sem (various) 80 * ->i_alloc_sem (various)
80 * 81 *
81 * ->inode_lock 82 * ->inode_lock
@@ -93,6 +94,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
93 * ->private_lock (try_to_unmap_one) 94 * ->private_lock (try_to_unmap_one)
94 * ->tree_lock (try_to_unmap_one) 95 * ->tree_lock (try_to_unmap_one)
95 * ->zone.lru_lock (follow_page->mark_page_accessed) 96 * ->zone.lru_lock (follow_page->mark_page_accessed)
97 * ->zone.lru_lock (check_pte_range->isolate_lru_page)
96 * ->private_lock (page_remove_rmap->set_page_dirty) 98 * ->private_lock (page_remove_rmap->set_page_dirty)
97 * ->tree_lock (page_remove_rmap->set_page_dirty) 99 * ->tree_lock (page_remove_rmap->set_page_dirty)
98 * ->inode_lock (page_remove_rmap->set_page_dirty) 100 * ->inode_lock (page_remove_rmap->set_page_dirty)
@@ -134,7 +136,7 @@ static int sync_page(void *word)
134 struct address_space *mapping; 136 struct address_space *mapping;
135 struct page *page; 137 struct page *page;
136 138
137 page = container_of((page_flags_t *)word, struct page, flags); 139 page = container_of((unsigned long *)word, struct page, flags);
138 140
139 /* 141 /*
140 * page_mapping() is being called without PG_locked held. 142 * page_mapping() is being called without PG_locked held.
@@ -276,11 +278,11 @@ static int wait_on_page_writeback_range(struct address_space *mapping,
276 * integrity" operation. It waits upon in-flight writeout before starting and 278 * integrity" operation. It waits upon in-flight writeout before starting and
277 * waiting upon new writeout. If there was an IO error, return it. 279 * waiting upon new writeout. If there was an IO error, return it.
278 * 280 *
279 * We need to re-take i_sem during the generic_osync_inode list walk because 281 * We need to re-take i_mutex during the generic_osync_inode list walk because
280 * it is otherwise livelockable. 282 * it is otherwise livelockable.
281 */ 283 */
282int sync_page_range(struct inode *inode, struct address_space *mapping, 284int sync_page_range(struct inode *inode, struct address_space *mapping,
283 loff_t pos, size_t count) 285 loff_t pos, loff_t count)
284{ 286{
285 pgoff_t start = pos >> PAGE_CACHE_SHIFT; 287 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
286 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; 288 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -290,9 +292,9 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
290 return 0; 292 return 0;
291 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); 293 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
292 if (ret == 0) { 294 if (ret == 0) {
293 down(&inode->i_sem); 295 mutex_lock(&inode->i_mutex);
294 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); 296 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
295 up(&inode->i_sem); 297 mutex_unlock(&inode->i_mutex);
296 } 298 }
297 if (ret == 0) 299 if (ret == 0)
298 ret = wait_on_page_writeback_range(mapping, start, end); 300 ret = wait_on_page_writeback_range(mapping, start, end);
@@ -301,13 +303,12 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
301EXPORT_SYMBOL(sync_page_range); 303EXPORT_SYMBOL(sync_page_range);
302 304
303/* 305/*
304 * Note: Holding i_sem across sync_page_range_nolock is not a good idea 306 * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
305 * as it forces O_SYNC writers to different parts of the same file 307 * as it forces O_SYNC writers to different parts of the same file
306 * to be serialised right until io completion. 308 * to be serialised right until io completion.
307 */ 309 */
308static int sync_page_range_nolock(struct inode *inode, 310int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
309 struct address_space *mapping, 311 loff_t pos, loff_t count)
310 loff_t pos, size_t count)
311{ 312{
312 pgoff_t start = pos >> PAGE_CACHE_SHIFT; 313 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
313 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; 314 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -322,6 +323,7 @@ static int sync_page_range_nolock(struct inode *inode,
322 ret = wait_on_page_writeback_range(mapping, start, end); 323 ret = wait_on_page_writeback_range(mapping, start, end);
323 return ret; 324 return ret;
324} 325}
326EXPORT_SYMBOL(sync_page_range_nolock);
325 327
326/** 328/**
327 * filemap_fdatawait - walk the list of under-writeback pages of the given 329 * filemap_fdatawait - walk the list of under-writeback pages of the given
@@ -343,30 +345,44 @@ EXPORT_SYMBOL(filemap_fdatawait);
343 345
344int filemap_write_and_wait(struct address_space *mapping) 346int filemap_write_and_wait(struct address_space *mapping)
345{ 347{
346 int retval = 0; 348 int err = 0;
347 349
348 if (mapping->nrpages) { 350 if (mapping->nrpages) {
349 retval = filemap_fdatawrite(mapping); 351 err = filemap_fdatawrite(mapping);
350 if (retval == 0) 352 /*
351 retval = filemap_fdatawait(mapping); 353 * Even if the above returned error, the pages may be
354 * written partially (e.g. -ENOSPC), so we wait for it.
355 * But the -EIO is special case, it may indicate the worst
356 * thing (e.g. bug) happened, so we avoid waiting for it.
357 */
358 if (err != -EIO) {
359 int err2 = filemap_fdatawait(mapping);
360 if (!err)
361 err = err2;
362 }
352 } 363 }
353 return retval; 364 return err;
354} 365}
366EXPORT_SYMBOL(filemap_write_and_wait);
355 367
356int filemap_write_and_wait_range(struct address_space *mapping, 368int filemap_write_and_wait_range(struct address_space *mapping,
357 loff_t lstart, loff_t lend) 369 loff_t lstart, loff_t lend)
358{ 370{
359 int retval = 0; 371 int err = 0;
360 372
361 if (mapping->nrpages) { 373 if (mapping->nrpages) {
362 retval = __filemap_fdatawrite_range(mapping, lstart, lend, 374 err = __filemap_fdatawrite_range(mapping, lstart, lend,
363 WB_SYNC_ALL); 375 WB_SYNC_ALL);
364 if (retval == 0) 376 /* See comment of filemap_write_and_wait() */
365 retval = wait_on_page_writeback_range(mapping, 377 if (err != -EIO) {
366 lstart >> PAGE_CACHE_SHIFT, 378 int err2 = wait_on_page_writeback_range(mapping,
367 lend >> PAGE_CACHE_SHIFT); 379 lstart >> PAGE_CACHE_SHIFT,
380 lend >> PAGE_CACHE_SHIFT);
381 if (!err)
382 err = err2;
383 }
368 } 384 }
369 return retval; 385 return err;
370} 386}
371 387
372/* 388/*
@@ -555,11 +571,12 @@ repeat:
555 page_cache_get(page); 571 page_cache_get(page);
556 if (TestSetPageLocked(page)) { 572 if (TestSetPageLocked(page)) {
557 read_unlock_irq(&mapping->tree_lock); 573 read_unlock_irq(&mapping->tree_lock);
558 lock_page(page); 574 __lock_page(page);
559 read_lock_irq(&mapping->tree_lock); 575 read_lock_irq(&mapping->tree_lock);
560 576
561 /* Has the page been truncated while we slept? */ 577 /* Has the page been truncated while we slept? */
562 if (page->mapping != mapping || page->index != offset) { 578 if (unlikely(page->mapping != mapping ||
579 page->index != offset)) {
563 unlock_page(page); 580 unlock_page(page);
564 page_cache_release(page); 581 page_cache_release(page);
565 goto repeat; 582 goto repeat;
@@ -831,8 +848,13 @@ readpage:
831 /* Start the actual read. The read will unlock the page. */ 848 /* Start the actual read. The read will unlock the page. */
832 error = mapping->a_ops->readpage(filp, page); 849 error = mapping->a_ops->readpage(filp, page);
833 850
834 if (unlikely(error)) 851 if (unlikely(error)) {
852 if (error == AOP_TRUNCATED_PAGE) {
853 page_cache_release(page);
854 goto find_page;
855 }
835 goto readpage_error; 856 goto readpage_error;
857 }
836 858
837 if (!PageUptodate(page)) { 859 if (!PageUptodate(page)) {
838 lock_page(page); 860 lock_page(page);
@@ -1152,26 +1174,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
1152{ 1174{
1153 struct address_space *mapping = file->f_mapping; 1175 struct address_space *mapping = file->f_mapping;
1154 struct page *page; 1176 struct page *page;
1155 int error; 1177 int ret;
1156 1178
1157 page = page_cache_alloc_cold(mapping); 1179 do {
1158 if (!page) 1180 page = page_cache_alloc_cold(mapping);
1159 return -ENOMEM; 1181 if (!page)
1182 return -ENOMEM;
1183
1184 ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1185 if (ret == 0)
1186 ret = mapping->a_ops->readpage(file, page);
1187 else if (ret == -EEXIST)
1188 ret = 0; /* losing race to add is OK */
1160 1189
1161 error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1162 if (!error) {
1163 error = mapping->a_ops->readpage(file, page);
1164 page_cache_release(page); 1190 page_cache_release(page);
1165 return error;
1166 }
1167 1191
1168 /* 1192 } while (ret == AOP_TRUNCATED_PAGE);
1169 * We arrive here in the unlikely event that someone 1193
1170 * raced with us and added our page to the cache first 1194 return ret;
1171 * or we are out of memory for radix-tree nodes.
1172 */
1173 page_cache_release(page);
1174 return error == -EEXIST ? 0 : error;
1175} 1195}
1176 1196
1177#define MMAP_LOTSAMISS (100) 1197#define MMAP_LOTSAMISS (100)
@@ -1331,10 +1351,14 @@ page_not_uptodate:
1331 goto success; 1351 goto success;
1332 } 1352 }
1333 1353
1334 if (!mapping->a_ops->readpage(file, page)) { 1354 error = mapping->a_ops->readpage(file, page);
1355 if (!error) {
1335 wait_on_page_locked(page); 1356 wait_on_page_locked(page);
1336 if (PageUptodate(page)) 1357 if (PageUptodate(page))
1337 goto success; 1358 goto success;
1359 } else if (error == AOP_TRUNCATED_PAGE) {
1360 page_cache_release(page);
1361 goto retry_find;
1338 } 1362 }
1339 1363
1340 /* 1364 /*
@@ -1358,10 +1382,14 @@ page_not_uptodate:
1358 goto success; 1382 goto success;
1359 } 1383 }
1360 ClearPageError(page); 1384 ClearPageError(page);
1361 if (!mapping->a_ops->readpage(file, page)) { 1385 error = mapping->a_ops->readpage(file, page);
1386 if (!error) {
1362 wait_on_page_locked(page); 1387 wait_on_page_locked(page);
1363 if (PageUptodate(page)) 1388 if (PageUptodate(page))
1364 goto success; 1389 goto success;
1390 } else if (error == AOP_TRUNCATED_PAGE) {
1391 page_cache_release(page);
1392 goto retry_find;
1365 } 1393 }
1366 1394
1367 /* 1395 /*
@@ -1444,10 +1472,14 @@ page_not_uptodate:
1444 goto success; 1472 goto success;
1445 } 1473 }
1446 1474
1447 if (!mapping->a_ops->readpage(file, page)) { 1475 error = mapping->a_ops->readpage(file, page);
1476 if (!error) {
1448 wait_on_page_locked(page); 1477 wait_on_page_locked(page);
1449 if (PageUptodate(page)) 1478 if (PageUptodate(page))
1450 goto success; 1479 goto success;
1480 } else if (error == AOP_TRUNCATED_PAGE) {
1481 page_cache_release(page);
1482 goto retry_find;
1451 } 1483 }
1452 1484
1453 /* 1485 /*
@@ -1470,10 +1502,14 @@ page_not_uptodate:
1470 } 1502 }
1471 1503
1472 ClearPageError(page); 1504 ClearPageError(page);
1473 if (!mapping->a_ops->readpage(file, page)) { 1505 error = mapping->a_ops->readpage(file, page);
1506 if (!error) {
1474 wait_on_page_locked(page); 1507 wait_on_page_locked(page);
1475 if (PageUptodate(page)) 1508 if (PageUptodate(page))
1476 goto success; 1509 goto success;
1510 } else if (error == AOP_TRUNCATED_PAGE) {
1511 page_cache_release(page);
1512 goto retry_find;
1477 } 1513 }
1478 1514
1479 /* 1515 /*
@@ -1858,7 +1894,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1858 /* 1894 /*
1859 * Sync the fs metadata but not the minor inode changes and 1895 * Sync the fs metadata but not the minor inode changes and
1860 * of course not the data as we did direct DMA for the IO. 1896 * of course not the data as we did direct DMA for the IO.
1861 * i_sem is held, which protects generic_osync_inode() from 1897 * i_mutex is held, which protects generic_osync_inode() from
1862 * livelocking. 1898 * livelocking.
1863 */ 1899 */
1864 if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 1900 if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
@@ -1934,12 +1970,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1934 status = a_ops->prepare_write(file, page, offset, offset+bytes); 1970 status = a_ops->prepare_write(file, page, offset, offset+bytes);
1935 if (unlikely(status)) { 1971 if (unlikely(status)) {
1936 loff_t isize = i_size_read(inode); 1972 loff_t isize = i_size_read(inode);
1973
1974 if (status != AOP_TRUNCATED_PAGE)
1975 unlock_page(page);
1976 page_cache_release(page);
1977 if (status == AOP_TRUNCATED_PAGE)
1978 continue;
1937 /* 1979 /*
1938 * prepare_write() may have instantiated a few blocks 1980 * prepare_write() may have instantiated a few blocks
1939 * outside i_size. Trim these off again. 1981 * outside i_size. Trim these off again.
1940 */ 1982 */
1941 unlock_page(page);
1942 page_cache_release(page);
1943 if (pos + bytes > isize) 1983 if (pos + bytes > isize)
1944 vmtruncate(inode, isize); 1984 vmtruncate(inode, isize);
1945 break; 1985 break;
@@ -1952,6 +1992,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1952 cur_iov, iov_base, bytes); 1992 cur_iov, iov_base, bytes);
1953 flush_dcache_page(page); 1993 flush_dcache_page(page);
1954 status = a_ops->commit_write(file, page, offset, offset+bytes); 1994 status = a_ops->commit_write(file, page, offset, offset+bytes);
1995 if (status == AOP_TRUNCATED_PAGE) {
1996 page_cache_release(page);
1997 continue;
1998 }
1955 if (likely(copied > 0)) { 1999 if (likely(copied > 0)) {
1956 if (!status) 2000 if (!status)
1957 status = copied; 2001 status = copied;
@@ -2066,7 +2110,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2066 if (err) 2110 if (err)
2067 goto out; 2111 goto out;
2068 2112
2069 inode_update_time(inode, 1); 2113 file_update_time(file);
2070 2114
2071 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 2115 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2072 if (unlikely(file->f_flags & O_DIRECT)) { 2116 if (unlikely(file->f_flags & O_DIRECT)) {
@@ -2153,10 +2197,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
2153 2197
2154 BUG_ON(iocb->ki_pos != pos); 2198 BUG_ON(iocb->ki_pos != pos);
2155 2199
2156 down(&inode->i_sem); 2200 mutex_lock(&inode->i_mutex);
2157 ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, 2201 ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
2158 &iocb->ki_pos); 2202 &iocb->ki_pos);
2159 up(&inode->i_sem); 2203 mutex_unlock(&inode->i_mutex);
2160 2204
2161 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2205 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2162 ssize_t err; 2206 ssize_t err;
@@ -2178,9 +2222,9 @@ ssize_t generic_file_write(struct file *file, const char __user *buf,
2178 struct iovec local_iov = { .iov_base = (void __user *)buf, 2222 struct iovec local_iov = { .iov_base = (void __user *)buf,
2179 .iov_len = count }; 2223 .iov_len = count };
2180 2224
2181 down(&inode->i_sem); 2225 mutex_lock(&inode->i_mutex);
2182 ret = __generic_file_write_nolock(file, &local_iov, 1, ppos); 2226 ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
2183 up(&inode->i_sem); 2227 mutex_unlock(&inode->i_mutex);
2184 2228
2185 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2229 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2186 ssize_t err; 2230 ssize_t err;
@@ -2214,9 +2258,9 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
2214 struct inode *inode = mapping->host; 2258 struct inode *inode = mapping->host;
2215 ssize_t ret; 2259 ssize_t ret;
2216 2260
2217 down(&inode->i_sem); 2261 mutex_lock(&inode->i_mutex);
2218 ret = __generic_file_write_nolock(file, iov, nr_segs, ppos); 2262 ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
2219 up(&inode->i_sem); 2263 mutex_unlock(&inode->i_mutex);
2220 2264
2221 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2265 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2222 int err; 2266 int err;
@@ -2230,7 +2274,7 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
2230EXPORT_SYMBOL(generic_file_writev); 2274EXPORT_SYMBOL(generic_file_writev);
2231 2275
2232/* 2276/*
2233 * Called under i_sem for writes to S_ISREG files. Returns -EIO if something 2277 * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
2234 * went wrong during pagecache shootdown. 2278 * went wrong during pagecache shootdown.
2235 */ 2279 */
2236static ssize_t 2280static ssize_t
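
Several hunks above teach filemap.c callers to handle the new AOP_TRUNCATED_PAGE return from ->readpage/->prepare_write/->commit_write: drop the page and retry the lookup. A hedged caller-side sketch of that retry loop, using a made-up helper name (real callers also test PageUptodate, omitted here):

/* hedged sketch of the AOP_TRUNCATED_PAGE protocol, not kernel code */
static int example_read_index(struct file *filp, struct address_space *mapping,
                              pgoff_t index)
{
        struct page *page;
        int err;

again:
        page = find_or_create_page(mapping, index, GFP_KERNEL);
        if (!page)
                return -ENOMEM;
        err = mapping->a_ops->readpage(filp, page);     /* unlocks the page */
        page_cache_release(page);
        if (err == AOP_TRUNCATED_PAGE)
                goto again;     /* ->readpage dropped the lock and asked us to retry */
        return err;
}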
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 9cf687e4a2..b960ac8e59 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -338,7 +338,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
338 *ppos = pos; 338 *ppos = pos;
339 /* 339 /*
340 * No need to use i_size_read() here, the i_size 340 * No need to use i_size_read() here, the i_size
341 * cannot change under us because we hold i_sem. 341 * cannot change under us because we hold i_mutex.
342 */ 342 */
343 if (pos > inode->i_size) { 343 if (pos > inode->i_size) {
344 i_size_write(inode, pos); 344 i_size_write(inode, pos);
@@ -358,7 +358,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
358 loff_t pos; 358 loff_t pos;
359 ssize_t ret; 359 ssize_t ret;
360 360
361 down(&inode->i_sem); 361 mutex_lock(&inode->i_mutex);
362 362
363 if (!access_ok(VERIFY_READ, buf, len)) { 363 if (!access_ok(VERIFY_READ, buf, len)) {
364 ret=-EFAULT; 364 ret=-EFAULT;
@@ -383,14 +383,14 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
383 if (ret) 383 if (ret)
384 goto out_backing; 384 goto out_backing;
385 385
386 inode_update_time(inode, 1); 386 file_update_time(filp);
387 387
388 ret = __xip_file_write (filp, buf, count, pos, ppos); 388 ret = __xip_file_write (filp, buf, count, pos, ppos);
389 389
390 out_backing: 390 out_backing:
391 current->backing_dev_info = NULL; 391 current->backing_dev_info = NULL;
392 out_up: 392 out_up:
393 up(&inode->i_sem); 393 mutex_unlock(&inode->i_mutex);
394 return ret; 394 return ret;
395} 395}
396EXPORT_SYMBOL_GPL(xip_file_write); 396EXPORT_SYMBOL_GPL(xip_file_write);
diff --git a/mm/fremap.c b/mm/fremap.c
index d862be3bc3..9f381e58bf 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -27,24 +27,20 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
27 struct page *page = NULL; 27 struct page *page = NULL;
28 28
29 if (pte_present(pte)) { 29 if (pte_present(pte)) {
30 unsigned long pfn = pte_pfn(pte); 30 flush_cache_page(vma, addr, pte_pfn(pte));
31 flush_cache_page(vma, addr, pfn);
32 pte = ptep_clear_flush(vma, addr, ptep); 31 pte = ptep_clear_flush(vma, addr, ptep);
33 if (unlikely(!pfn_valid(pfn))) { 32 page = vm_normal_page(vma, addr, pte);
34 print_bad_pte(vma, pte, addr); 33 if (page) {
35 goto out; 34 if (pte_dirty(pte))
35 set_page_dirty(page);
36 page_remove_rmap(page);
37 page_cache_release(page);
36 } 38 }
37 page = pfn_to_page(pfn);
38 if (pte_dirty(pte))
39 set_page_dirty(page);
40 page_remove_rmap(page);
41 page_cache_release(page);
42 } else { 39 } else {
43 if (!pte_file(pte)) 40 if (!pte_file(pte))
44 free_swap_and_cache(pte_to_swp_entry(pte)); 41 free_swap_and_cache(pte_to_swp_entry(pte));
45 pte_clear(mm, addr, ptep); 42 pte_clear(mm, addr, ptep);
46 } 43 }
47out:
48 return !!page; 44 return !!page;
49} 45}
50 46
@@ -59,22 +55,10 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
59 pgoff_t size; 55 pgoff_t size;
60 int err = -ENOMEM; 56 int err = -ENOMEM;
61 pte_t *pte; 57 pte_t *pte;
62 pmd_t *pmd;
63 pud_t *pud;
64 pgd_t *pgd;
65 pte_t pte_val; 58 pte_t pte_val;
66 spinlock_t *ptl; 59 spinlock_t *ptl;
67 60
68 BUG_ON(vma->vm_flags & VM_RESERVED); 61 pte = get_locked_pte(mm, addr, &ptl);
69
70 pgd = pgd_offset(mm, addr);
71 pud = pud_alloc(mm, pgd, addr);
72 if (!pud)
73 goto out;
74 pmd = pmd_alloc(mm, pud, addr);
75 if (!pmd)
76 goto out;
77 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
78 if (!pte) 62 if (!pte)
79 goto out; 63 goto out;
80 64
@@ -116,22 +100,10 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
116{ 100{
117 int err = -ENOMEM; 101 int err = -ENOMEM;
118 pte_t *pte; 102 pte_t *pte;
119 pmd_t *pmd;
120 pud_t *pud;
121 pgd_t *pgd;
122 pte_t pte_val; 103 pte_t pte_val;
123 spinlock_t *ptl; 104 spinlock_t *ptl;
124 105
125 BUG_ON(vma->vm_flags & VM_RESERVED); 106 pte = get_locked_pte(mm, addr, &ptl);
126
127 pgd = pgd_offset(mm, addr);
128 pud = pud_alloc(mm, pgd, addr);
129 if (!pud)
130 goto out;
131 pmd = pmd_alloc(mm, pud, addr);
132 if (!pmd)
133 goto out;
134 pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
135 if (!pte) 107 if (!pte)
136 goto out; 108 goto out;
137 109
@@ -204,12 +176,10 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
204 * Make sure the vma is shared, that it supports prefaulting, 176 * Make sure the vma is shared, that it supports prefaulting,
205 * and that the remapped range is valid and fully within 177 * and that the remapped range is valid and fully within
206 * the single existing vma. vm_private_data is used as a 178 * the single existing vma. vm_private_data is used as a
207 * swapout cursor in a VM_NONLINEAR vma (unless VM_RESERVED 179 * swapout cursor in a VM_NONLINEAR vma.
208 * or VM_LOCKED, but VM_LOCKED could be revoked later on).
209 */ 180 */
210 if (vma && (vma->vm_flags & VM_SHARED) && 181 if (vma && (vma->vm_flags & VM_SHARED) &&
211 (!vma->vm_private_data || 182 (!vma->vm_private_data || (vma->vm_flags & VM_NONLINEAR)) &&
212 (vma->vm_flags & (VM_NONLINEAR|VM_RESERVED))) &&
213 vma->vm_ops && vma->vm_ops->populate && 183 vma->vm_ops && vma->vm_ops->populate &&
214 end > start && start >= vma->vm_start && 184 end > start && start >= vma->vm_start &&
215 end <= vma->vm_end) { 185 end <= vma->vm_end) {
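
Both fremap hunks replace the open-coded pgd/pud/pmd walk with get_locked_pte(). A minimal sketch of the resulting calling pattern, wrapped in a hypothetical function:

/* hedged sketch of the get_locked_pte() calling convention */
static int example_touch_pte(struct mm_struct *mm, unsigned long addr)
{
        spinlock_t *ptl;
        pte_t *pte = get_locked_pte(mm, addr, &ptl); /* allocates pud/pmd/pte as needed */

        if (!pte)
                return -ENOMEM;
        /* ... inspect or install a pte here, under the pte lock ... */
        pte_unmap_unlock(pte, ptl);
        return 0;
}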
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 728e9bda12..b21d78c941 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,9 @@
11#include <linux/highmem.h> 11#include <linux/highmem.h>
12#include <linux/nodemask.h> 12#include <linux/nodemask.h>
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/mempolicy.h>
15#include <linux/cpuset.h>
16
14#include <asm/page.h> 17#include <asm/page.h>
15#include <asm/pgtable.h> 18#include <asm/pgtable.h>
16 19
@@ -22,6 +25,10 @@ unsigned long max_huge_pages;
22static struct list_head hugepage_freelists[MAX_NUMNODES]; 25static struct list_head hugepage_freelists[MAX_NUMNODES];
23static unsigned int nr_huge_pages_node[MAX_NUMNODES]; 26static unsigned int nr_huge_pages_node[MAX_NUMNODES];
24static unsigned int free_huge_pages_node[MAX_NUMNODES]; 27static unsigned int free_huge_pages_node[MAX_NUMNODES];
28
29/*
30 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
31 */
25static DEFINE_SPINLOCK(hugetlb_lock); 32static DEFINE_SPINLOCK(hugetlb_lock);
26 33
27static void enqueue_huge_page(struct page *page) 34static void enqueue_huge_page(struct page *page)
@@ -32,18 +39,22 @@ static void enqueue_huge_page(struct page *page)
32 free_huge_pages_node[nid]++; 39 free_huge_pages_node[nid]++;
33} 40}
34 41
35static struct page *dequeue_huge_page(void) 42static struct page *dequeue_huge_page(struct vm_area_struct *vma,
43 unsigned long address)
36{ 44{
37 int nid = numa_node_id(); 45 int nid = numa_node_id();
38 struct page *page = NULL; 46 struct page *page = NULL;
47 struct zonelist *zonelist = huge_zonelist(vma, address);
48 struct zone **z;
39 49
40 if (list_empty(&hugepage_freelists[nid])) { 50 for (z = zonelist->zones; *z; z++) {
41 for (nid = 0; nid < MAX_NUMNODES; ++nid) 51 nid = (*z)->zone_pgdat->node_id;
42 if (!list_empty(&hugepage_freelists[nid])) 52 if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
43 break; 53 !list_empty(&hugepage_freelists[nid]))
54 break;
44 } 55 }
45 if (nid >= 0 && nid < MAX_NUMNODES && 56
46 !list_empty(&hugepage_freelists[nid])) { 57 if (*z) {
47 page = list_entry(hugepage_freelists[nid].next, 58 page = list_entry(hugepage_freelists[nid].next,
48 struct page, lru); 59 struct page, lru);
49 list_del(&page->lru); 60 list_del(&page->lru);
@@ -61,8 +72,10 @@ static struct page *alloc_fresh_huge_page(void)
61 HUGETLB_PAGE_ORDER); 72 HUGETLB_PAGE_ORDER);
62 nid = (nid + 1) % num_online_nodes(); 73 nid = (nid + 1) % num_online_nodes();
63 if (page) { 74 if (page) {
75 spin_lock(&hugetlb_lock);
64 nr_huge_pages++; 76 nr_huge_pages++;
65 nr_huge_pages_node[page_to_nid(page)]++; 77 nr_huge_pages_node[page_to_nid(page)]++;
78 spin_unlock(&hugetlb_lock);
66 } 79 }
67 return page; 80 return page;
68} 81}
@@ -79,13 +92,13 @@ void free_huge_page(struct page *page)
79 spin_unlock(&hugetlb_lock); 92 spin_unlock(&hugetlb_lock);
80} 93}
81 94
82struct page *alloc_huge_page(void) 95struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
83{ 96{
84 struct page *page; 97 struct page *page;
85 int i; 98 int i;
86 99
87 spin_lock(&hugetlb_lock); 100 spin_lock(&hugetlb_lock);
88 page = dequeue_huge_page(); 101 page = dequeue_huge_page(vma, addr);
89 if (!page) { 102 if (!page) {
90 spin_unlock(&hugetlb_lock); 103 spin_unlock(&hugetlb_lock);
91 return NULL; 104 return NULL;
@@ -188,7 +201,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
188 spin_lock(&hugetlb_lock); 201 spin_lock(&hugetlb_lock);
189 try_to_free_low(count); 202 try_to_free_low(count);
190 while (count < nr_huge_pages) { 203 while (count < nr_huge_pages) {
191 struct page *page = dequeue_huge_page(); 204 struct page *page = dequeue_huge_page(NULL, 0);
192 if (!page) 205 if (!page)
193 break; 206 break;
194 update_and_free_page(page); 207 update_and_free_page(page);
@@ -255,11 +268,12 @@ struct vm_operations_struct hugetlb_vm_ops = {
255 .nopage = hugetlb_nopage, 268 .nopage = hugetlb_nopage,
256}; 269};
257 270
258static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) 271static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
272 int writable)
259{ 273{
260 pte_t entry; 274 pte_t entry;
261 275
262 if (vma->vm_flags & VM_WRITE) { 276 if (writable) {
263 entry = 277 entry =
264 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); 278 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
265 } else { 279 } else {
@@ -271,12 +285,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
271 return entry; 285 return entry;
272} 286}
273 287
288static void set_huge_ptep_writable(struct vm_area_struct *vma,
289 unsigned long address, pte_t *ptep)
290{
291 pte_t entry;
292
293 entry = pte_mkwrite(pte_mkdirty(*ptep));
294 ptep_set_access_flags(vma, address, ptep, entry, 1);
295 update_mmu_cache(vma, address, entry);
296 lazy_mmu_prot_update(entry);
297}
298
299
274int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 300int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
275 struct vm_area_struct *vma) 301 struct vm_area_struct *vma)
276{ 302{
277 pte_t *src_pte, *dst_pte, entry; 303 pte_t *src_pte, *dst_pte, entry;
278 struct page *ptepage; 304 struct page *ptepage;
279 unsigned long addr; 305 unsigned long addr;
306 int cow;
307
308 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
280 309
281 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 310 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
282 src_pte = huge_pte_offset(src, addr); 311 src_pte = huge_pte_offset(src, addr);
@@ -288,6 +317,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
288 spin_lock(&dst->page_table_lock); 317 spin_lock(&dst->page_table_lock);
289 spin_lock(&src->page_table_lock); 318 spin_lock(&src->page_table_lock);
290 if (!pte_none(*src_pte)) { 319 if (!pte_none(*src_pte)) {
320 if (cow)
321 ptep_set_wrprotect(src, addr, src_pte);
291 entry = *src_pte; 322 entry = *src_pte;
292 ptepage = pte_page(entry); 323 ptepage = pte_page(entry);
293 get_page(ptepage); 324 get_page(ptepage);
@@ -339,57 +370,63 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
339 flush_tlb_range(vma, start, end); 370 flush_tlb_range(vma, start, end);
340} 371}
341 372
342static struct page *find_lock_huge_page(struct address_space *mapping, 373static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
343 unsigned long idx) 374 unsigned long address, pte_t *ptep, pte_t pte)
344{ 375{
345 struct page *page; 376 struct page *old_page, *new_page;
346 int err; 377 int i, avoidcopy;
347 struct inode *inode = mapping->host;
348 unsigned long size;
349 378
350retry: 379 old_page = pte_page(pte);
351 page = find_lock_page(mapping, idx);
352 if (page)
353 goto out;
354 380
355 /* Check to make sure the mapping hasn't been truncated */ 381 /* If no-one else is actually using this page, avoid the copy
356 size = i_size_read(inode) >> HPAGE_SHIFT; 382 * and just make the page writable */
357 if (idx >= size) 383 avoidcopy = (page_count(old_page) == 1);
358 goto out; 384 if (avoidcopy) {
385 set_huge_ptep_writable(vma, address, ptep);
386 return VM_FAULT_MINOR;
387 }
359 388
360 if (hugetlb_get_quota(mapping)) 389 page_cache_get(old_page);
361 goto out; 390 new_page = alloc_huge_page(vma, address);
362 page = alloc_huge_page(); 391
363 if (!page) { 392 if (!new_page) {
364 hugetlb_put_quota(mapping); 393 page_cache_release(old_page);
365 goto out; 394
395 /* Logically this is OOM, not a SIGBUS, but an OOM
396 * could cause the kernel to go killing other
397 * processes which won't help the hugepage situation
398 * at all (?) */
399 return VM_FAULT_SIGBUS;
366 } 400 }
367 401
368 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 402 spin_unlock(&mm->page_table_lock);
369 if (err) { 403 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
370 put_page(page); 404 copy_user_highpage(new_page + i, old_page + i,
371 hugetlb_put_quota(mapping); 405 address + i*PAGE_SIZE);
372 if (err == -EEXIST) 406 spin_lock(&mm->page_table_lock);
373 goto retry; 407
374 page = NULL; 408 ptep = huge_pte_offset(mm, address & HPAGE_MASK);
409 if (likely(pte_same(*ptep, pte))) {
410 /* Break COW */
411 set_huge_pte_at(mm, address, ptep,
412 make_huge_pte(vma, new_page, 1));
413 /* Make the old page be freed below */
414 new_page = old_page;
375 } 415 }
376out: 416 page_cache_release(new_page);
377 return page; 417 page_cache_release(old_page);
418 return VM_FAULT_MINOR;
378} 419}
379 420
380int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 421int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
381 unsigned long address, int write_access) 422 unsigned long address, pte_t *ptep, int write_access)
382{ 423{
383 int ret = VM_FAULT_SIGBUS; 424 int ret = VM_FAULT_SIGBUS;
384 unsigned long idx; 425 unsigned long idx;
385 unsigned long size; 426 unsigned long size;
386 pte_t *pte;
387 struct page *page; 427 struct page *page;
388 struct address_space *mapping; 428 struct address_space *mapping;
389 429 pte_t new_pte;
390 pte = huge_pte_alloc(mm, address);
391 if (!pte)
392 goto out;
393 430
394 mapping = vma->vm_file->f_mapping; 431 mapping = vma->vm_file->f_mapping;
395 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 432 idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -399,9 +436,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
399 * Use page lock to guard against racing truncation 436 * Use page lock to guard against racing truncation
400 * before we get page_table_lock. 437 * before we get page_table_lock.
401 */ 438 */
402 page = find_lock_huge_page(mapping, idx); 439retry:
403 if (!page) 440 page = find_lock_page(mapping, idx);
404 goto out; 441 if (!page) {
442 if (hugetlb_get_quota(mapping))
443 goto out;
444 page = alloc_huge_page(vma, address);
445 if (!page) {
446 hugetlb_put_quota(mapping);
447 goto out;
448 }
449
450 if (vma->vm_flags & VM_SHARED) {
451 int err;
452
453 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
454 if (err) {
455 put_page(page);
456 hugetlb_put_quota(mapping);
457 if (err == -EEXIST)
458 goto retry;
459 goto out;
460 }
461 } else
462 lock_page(page);
463 }
405 464
406 spin_lock(&mm->page_table_lock); 465 spin_lock(&mm->page_table_lock);
407 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 466 size = i_size_read(mapping->host) >> HPAGE_SHIFT;
@@ -409,11 +468,19 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
409 goto backout; 468 goto backout;
410 469
411 ret = VM_FAULT_MINOR; 470 ret = VM_FAULT_MINOR;
412 if (!pte_none(*pte)) 471 if (!pte_none(*ptep))
413 goto backout; 472 goto backout;
414 473
415 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); 474 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
416 set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page)); 475 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
476 && (vma->vm_flags & VM_SHARED)));
477 set_huge_pte_at(mm, address, ptep, new_pte);
478
479 if (write_access && !(vma->vm_flags & VM_SHARED)) {
480 /* Optimization, do the COW without a second fault */
481 ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
482 }
483
417 spin_unlock(&mm->page_table_lock); 484 spin_unlock(&mm->page_table_lock);
418 unlock_page(page); 485 unlock_page(page);
419out: 486out:
@@ -427,6 +494,33 @@ backout:
427 goto out; 494 goto out;
428} 495}
429 496
497int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
498 unsigned long address, int write_access)
499{
500 pte_t *ptep;
501 pte_t entry;
502 int ret;
503
504 ptep = huge_pte_alloc(mm, address);
505 if (!ptep)
506 return VM_FAULT_OOM;
507
508 entry = *ptep;
509 if (pte_none(entry))
510 return hugetlb_no_page(mm, vma, address, ptep, write_access);
511
512 ret = VM_FAULT_MINOR;
513
514 spin_lock(&mm->page_table_lock);
515 /* Check for a racing update before calling hugetlb_cow */
516 if (likely(pte_same(entry, *ptep)))
517 if (write_access && !pte_write(entry))
518 ret = hugetlb_cow(mm, vma, address, ptep, entry);
519 spin_unlock(&mm->page_table_lock);
520
521 return ret;
522}
523
430int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 524int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
431 struct page **pages, struct vm_area_struct **vmas, 525 struct page **pages, struct vm_area_struct **vmas,
432 unsigned long *position, int *length, int i) 526 unsigned long *position, int *length, int i)
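
The hugetlb changes above add copy-on-write for private huge-page mappings (hugetlb_cow() plus the write-protect on fork). A hedged userspace demonstration; the /mnt/huge mount point and the 2MB huge-page size are assumptions:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE (2UL * 1024 * 1024)       /* assumed huge-page size */

int main(void)
{
        /* path is an assumption: any file on a mounted hugetlbfs will do */
        int fd = open("/mnt/huge/cowtest", O_CREAT | O_RDWR, 0600);
        char *p;

        if (fd < 0)
                return 1;
        p = mmap(NULL, HPAGE, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
        if (p == MAP_FAILED)
                return 1;
        /* write fault: with the COW support merged here, the process gets
         * its own private copy of the huge page */
        p[0] = 'x';
        printf("private hugepage write ok: %c\n", p[0]);
        munmap(p, HPAGE);
        close(fd);
        return 0;
}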
diff --git a/mm/internal.h b/mm/internal.h
index 6bf134e8fb..17256bb2f4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -9,5 +9,22 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12/* page_alloc.c */ 12static inline void set_page_refs(struct page *page, int order)
13extern void set_page_refs(struct page *page, int order); 13{
14#ifdef CONFIG_MMU
15 set_page_count(page, 1);
16#else
17 int i;
18
19 /*
20 * We need to reference all the pages for this order, otherwise if
21 * anyone accesses one of the pages with (get/put) it will be freed.
22 * - eg: access_process_vm()
23 */
24 for (i = 0; i < (1 << order); i++)
25 set_page_count(page + i, 1);
26#endif /* CONFIG_MMU */
27}
28
29extern void fastcall __init __free_pages_bootmem(struct page *page,
30 unsigned int order);
diff --git a/mm/madvise.c b/mm/madvise.c
index 17aaf3e164..ae0ae3ea29 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
126 unsigned long start, unsigned long end) 126 unsigned long start, unsigned long end)
127{ 127{
128 *prev = vma; 128 *prev = vma;
129 if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED)) 129 if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
130 return -EINVAL; 130 return -EINVAL;
131 131
132 if (unlikely(vma->vm_flags & VM_NONLINEAR)) { 132 if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
@@ -140,6 +140,36 @@ static long madvise_dontneed(struct vm_area_struct * vma,
140 return 0; 140 return 0;
141} 141}
142 142
143/*
144 * Application wants to free up the pages and associated backing store.
145 * This is effectively punching a hole into the middle of a file.
146 *
147 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
148 * Other filesystems return -ENOSYS.
149 */
150static long madvise_remove(struct vm_area_struct *vma,
151 unsigned long start, unsigned long end)
152{
153 struct address_space *mapping;
154 loff_t offset, endoff;
155
156 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
157 return -EINVAL;
158
159 if (!vma->vm_file || !vma->vm_file->f_mapping
160 || !vma->vm_file->f_mapping->host) {
161 return -EINVAL;
162 }
163
164 mapping = vma->vm_file->f_mapping;
165
166 offset = (loff_t)(start - vma->vm_start)
167 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
168 endoff = (loff_t)(end - vma->vm_start - 1)
169 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
170 return vmtruncate_range(mapping->host, offset, endoff);
171}
172
143static long 173static long
144madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, 174madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
145 unsigned long start, unsigned long end, int behavior) 175 unsigned long start, unsigned long end, int behavior)
@@ -152,6 +182,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
152 case MADV_RANDOM: 182 case MADV_RANDOM:
153 error = madvise_behavior(vma, prev, start, end, behavior); 183 error = madvise_behavior(vma, prev, start, end, behavior);
154 break; 184 break;
185 case MADV_REMOVE:
186 error = madvise_remove(vma, start, end);
187 break;
155 188
156 case MADV_WILLNEED: 189 case MADV_WILLNEED:
157 error = madvise_willneed(vma, prev, start, end); 190 error = madvise_willneed(vma, prev, start, end);
@@ -190,6 +223,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
190 * some pages ahead. 223 * some pages ahead.
191 * MADV_DONTNEED - the application is finished with the given range, 224 * MADV_DONTNEED - the application is finished with the given range,
192 * so the kernel can free resources associated with it. 225 * so the kernel can free resources associated with it.
226 * MADV_REMOVE - the application wants to free up the given range of
227 * pages and associated backing store.
193 * 228 *
194 * return values: 229 * return values:
195 * zero - success 230 * zero - success
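
madvise_remove() above wires up MADV_REMOVE, which punches a hole in the backing store of a shared mapping (only shmfs/tmpfs at this point, other filesystems get -ENOSYS). A small userspace example; the /dev/shm path is an assumption:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        size_t len = 4 * 4096;
        int fd = open("/dev/shm/madv_remove_demo", O_CREAT | O_RDWR, 0600);
        char *p;

        if (fd < 0 || ftruncate(fd, len))
                return 1;
        p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;
        memset(p, 'x', len);                      /* populate the backing pages */
        if (madvise(p + 4096, 4096, MADV_REMOVE)) /* punch a one-page hole */
                perror("madvise(MADV_REMOVE)");
        printf("byte after hole punch: %d\n", p[4096]); /* hole reads back as 0 */
        munmap(p, len);
        close(fd);
        unlink("/dev/shm/madv_remove_demo");
        return 0;
}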
diff --git a/mm/memory.c b/mm/memory.c
index 0f60baf6f6..7a11ddd506 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -333,9 +333,9 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
333} 333}
334 334
335/* 335/*
336 * This function is called to print an error when a pte in a 336 * This function is called to print an error when a bad pte
337 * !VM_RESERVED region is found pointing to an invalid pfn (which 337 * is found. For example, we might have a PFN-mapped pte in
338 * is an error. 338 * a region that doesn't allow it.
339 * 339 *
340 * The calling function must still handle the error. 340 * The calling function must still handle the error.
341 */ 341 */
@@ -349,6 +349,66 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
349 dump_stack(); 349 dump_stack();
350} 350}
351 351
352static inline int is_cow_mapping(unsigned int flags)
353{
354 return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
355}
356
357/*
358 * This function gets the "struct page" associated with a pte.
359 *
360 * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
361 * will have each page table entry just pointing to a raw page frame
362 * number, and as far as the VM layer is concerned, those do not have
363 * pages associated with them - even if the PFN might point to memory
364 * that otherwise is perfectly fine and has a "struct page".
365 *
366 * The way we recognize those mappings is through the rules set up
367 * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
368 * and the vm_pgoff will point to the first PFN mapped: thus every
369 * page that is a raw mapping will always honor the rule
370 *
371 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
372 *
373 * and if that isn't true, the page has been COW'ed (in which case it
374 * _does_ have a "struct page" associated with it even if it is in a
375 * VM_PFNMAP range).
376 */
377struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
378{
379 unsigned long pfn = pte_pfn(pte);
380
381 if (vma->vm_flags & VM_PFNMAP) {
382 unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
383 if (pfn == vma->vm_pgoff + off)
384 return NULL;
385 if (!is_cow_mapping(vma->vm_flags))
386 return NULL;
387 }
388
389 /*
390 * Add some anal sanity checks for now. Eventually,
391 * we should just do "return pfn_to_page(pfn)", but
392 * in the meantime we check that we get a valid pfn,
393 * and that the resulting page looks ok.
394 *
395 * Remove this test eventually!
396 */
397 if (unlikely(!pfn_valid(pfn))) {
398 print_bad_pte(vma, pte, addr);
399 return NULL;
400 }
401
402 /*
403 * NOTE! We still have PageReserved() pages in the page
404 * tables.
405 *
406 * The PAGE_ZERO() pages and various VDSO mappings can
407 * cause them to exist.
408 */
409 return pfn_to_page(pfn);
410}
411
352/* 412/*
353 * copy one vm_area from one task to the other. Assumes the page tables 413 * copy one vm_area from one task to the other. Assumes the page tables
354 * already present in the new task to be cleared in the whole range 414 * already present in the new task to be cleared in the whole range
@@ -363,7 +423,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
363 unsigned long vm_flags = vma->vm_flags; 423 unsigned long vm_flags = vma->vm_flags;
364 pte_t pte = *src_pte; 424 pte_t pte = *src_pte;
365 struct page *page; 425 struct page *page;
366 unsigned long pfn;
367 426
368 /* pte contains position in swap or file, so copy. */ 427 /* pte contains position in swap or file, so copy. */
369 if (unlikely(!pte_present(pte))) { 428 if (unlikely(!pte_present(pte))) {
@@ -381,28 +440,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
381 goto out_set_pte; 440 goto out_set_pte;
382 } 441 }
383 442
384 /* If the region is VM_RESERVED, the mapping is not
385 * mapped via rmap - duplicate the pte as is.
386 */
387 if (vm_flags & VM_RESERVED)
388 goto out_set_pte;
389
390 pfn = pte_pfn(pte);
391 /* If the pte points outside of valid memory but
392 * the region is not VM_RESERVED, we have a problem.
393 */
394 if (unlikely(!pfn_valid(pfn))) {
395 print_bad_pte(vma, pte, addr);
396 goto out_set_pte; /* try to do something sane */
397 }
398
399 page = pfn_to_page(pfn);
400
401 /* 443 /*
402 * If it's a COW mapping, write protect it both 444 * If it's a COW mapping, write protect it both
403 * in the parent and the child 445 * in the parent and the child
404 */ 446 */
405 if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) { 447 if (is_cow_mapping(vm_flags)) {
406 ptep_set_wrprotect(src_mm, addr, src_pte); 448 ptep_set_wrprotect(src_mm, addr, src_pte);
407 pte = *src_pte; 449 pte = *src_pte;
408 } 450 }
@@ -414,9 +456,13 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
414 if (vm_flags & VM_SHARED) 456 if (vm_flags & VM_SHARED)
415 pte = pte_mkclean(pte); 457 pte = pte_mkclean(pte);
416 pte = pte_mkold(pte); 458 pte = pte_mkold(pte);
417 get_page(page); 459
418 page_dup_rmap(page); 460 page = vm_normal_page(vma, addr, pte);
419 rss[!!PageAnon(page)]++; 461 if (page) {
462 get_page(page);
463 page_dup_rmap(page);
464 rss[!!PageAnon(page)]++;
465 }
420 466
421out_set_pte: 467out_set_pte:
422 set_pte_at(dst_mm, addr, dst_pte, pte); 468 set_pte_at(dst_mm, addr, dst_pte, pte);
@@ -528,7 +574,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
528 * readonly mappings. The tradeoff is that copy_page_range is more 574 * readonly mappings. The tradeoff is that copy_page_range is more
529 * efficient than faulting. 575 * efficient than faulting.
530 */ 576 */
531 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_RESERVED))) { 577 if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
532 if (!vma->anon_vma) 578 if (!vma->anon_vma)
533 return 0; 579 return 0;
534 } 580 }
@@ -549,10 +595,10 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
549 return 0; 595 return 0;
550} 596}
551 597
552static void zap_pte_range(struct mmu_gather *tlb, 598static unsigned long zap_pte_range(struct mmu_gather *tlb,
553 struct vm_area_struct *vma, pmd_t *pmd, 599 struct vm_area_struct *vma, pmd_t *pmd,
554 unsigned long addr, unsigned long end, 600 unsigned long addr, unsigned long end,
555 struct zap_details *details) 601 long *zap_work, struct zap_details *details)
556{ 602{
557 struct mm_struct *mm = tlb->mm; 603 struct mm_struct *mm = tlb->mm;
558 pte_t *pte; 604 pte_t *pte;
@@ -563,17 +609,16 @@ static void zap_pte_range(struct mmu_gather *tlb,
563 pte = pte_offset_map_lock(mm, pmd, addr, &ptl); 609 pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
564 do { 610 do {
565 pte_t ptent = *pte; 611 pte_t ptent = *pte;
566 if (pte_none(ptent)) 612 if (pte_none(ptent)) {
613 (*zap_work)--;
567 continue; 614 continue;
615 }
568 if (pte_present(ptent)) { 616 if (pte_present(ptent)) {
569 struct page *page = NULL; 617 struct page *page;
570 if (!(vma->vm_flags & VM_RESERVED)) { 618
571 unsigned long pfn = pte_pfn(ptent); 619 (*zap_work) -= PAGE_SIZE;
572 if (unlikely(!pfn_valid(pfn))) 620
573 print_bad_pte(vma, ptent, addr); 621 page = vm_normal_page(vma, addr, ptent);
574 else
575 page = pfn_to_page(pfn);
576 }
577 if (unlikely(details) && page) { 622 if (unlikely(details) && page) {
578 /* 623 /*
579 * unmap_shared_mapping_pages() wants to 624 * unmap_shared_mapping_pages() wants to
@@ -624,16 +669,18 @@ static void zap_pte_range(struct mmu_gather *tlb,
624 if (!pte_file(ptent)) 669 if (!pte_file(ptent))
625 free_swap_and_cache(pte_to_swp_entry(ptent)); 670 free_swap_and_cache(pte_to_swp_entry(ptent));
626 pte_clear_full(mm, addr, pte, tlb->fullmm); 671 pte_clear_full(mm, addr, pte, tlb->fullmm);
627 } while (pte++, addr += PAGE_SIZE, addr != end); 672 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
628 673
629 add_mm_rss(mm, file_rss, anon_rss); 674 add_mm_rss(mm, file_rss, anon_rss);
630 pte_unmap_unlock(pte - 1, ptl); 675 pte_unmap_unlock(pte - 1, ptl);
676
677 return addr;
631} 678}
632 679
633static inline void zap_pmd_range(struct mmu_gather *tlb, 680static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
634 struct vm_area_struct *vma, pud_t *pud, 681 struct vm_area_struct *vma, pud_t *pud,
635 unsigned long addr, unsigned long end, 682 unsigned long addr, unsigned long end,
636 struct zap_details *details) 683 long *zap_work, struct zap_details *details)
637{ 684{
638 pmd_t *pmd; 685 pmd_t *pmd;
639 unsigned long next; 686 unsigned long next;
@@ -641,16 +688,21 @@ static inline void zap_pmd_range(struct mmu_gather *tlb,
641 pmd = pmd_offset(pud, addr); 688 pmd = pmd_offset(pud, addr);
642 do { 689 do {
643 next = pmd_addr_end(addr, end); 690 next = pmd_addr_end(addr, end);
644 if (pmd_none_or_clear_bad(pmd)) 691 if (pmd_none_or_clear_bad(pmd)) {
692 (*zap_work)--;
645 continue; 693 continue;
646 zap_pte_range(tlb, vma, pmd, addr, next, details); 694 }
647 } while (pmd++, addr = next, addr != end); 695 next = zap_pte_range(tlb, vma, pmd, addr, next,
696 zap_work, details);
697 } while (pmd++, addr = next, (addr != end && *zap_work > 0));
698
699 return addr;
648} 700}
649 701
650static inline void zap_pud_range(struct mmu_gather *tlb, 702static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
651 struct vm_area_struct *vma, pgd_t *pgd, 703 struct vm_area_struct *vma, pgd_t *pgd,
652 unsigned long addr, unsigned long end, 704 unsigned long addr, unsigned long end,
653 struct zap_details *details) 705 long *zap_work, struct zap_details *details)
654{ 706{
655 pud_t *pud; 707 pud_t *pud;
656 unsigned long next; 708 unsigned long next;
@@ -658,15 +710,21 @@ static inline void zap_pud_range(struct mmu_gather *tlb,
658 pud = pud_offset(pgd, addr); 710 pud = pud_offset(pgd, addr);
659 do { 711 do {
660 next = pud_addr_end(addr, end); 712 next = pud_addr_end(addr, end);
661 if (pud_none_or_clear_bad(pud)) 713 if (pud_none_or_clear_bad(pud)) {
714 (*zap_work)--;
662 continue; 715 continue;
663 zap_pmd_range(tlb, vma, pud, addr, next, details); 716 }
664 } while (pud++, addr = next, addr != end); 717 next = zap_pmd_range(tlb, vma, pud, addr, next,
718 zap_work, details);
719 } while (pud++, addr = next, (addr != end && *zap_work > 0));
720
721 return addr;
665} 722}
666 723
667static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, 724static unsigned long unmap_page_range(struct mmu_gather *tlb,
725 struct vm_area_struct *vma,
668 unsigned long addr, unsigned long end, 726 unsigned long addr, unsigned long end,
669 struct zap_details *details) 727 long *zap_work, struct zap_details *details)
670{ 728{
671 pgd_t *pgd; 729 pgd_t *pgd;
672 unsigned long next; 730 unsigned long next;
@@ -679,11 +737,16 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
679 pgd = pgd_offset(vma->vm_mm, addr); 737 pgd = pgd_offset(vma->vm_mm, addr);
680 do { 738 do {
681 next = pgd_addr_end(addr, end); 739 next = pgd_addr_end(addr, end);
682 if (pgd_none_or_clear_bad(pgd)) 740 if (pgd_none_or_clear_bad(pgd)) {
741 (*zap_work)--;
683 continue; 742 continue;
684 zap_pud_range(tlb, vma, pgd, addr, next, details); 743 }
685 } while (pgd++, addr = next, addr != end); 744 next = zap_pud_range(tlb, vma, pgd, addr, next,
745 zap_work, details);
746 } while (pgd++, addr = next, (addr != end && *zap_work > 0));
686 tlb_end_vma(tlb, vma); 747 tlb_end_vma(tlb, vma);
748
749 return addr;
687} 750}
688 751
689#ifdef CONFIG_PREEMPT 752#ifdef CONFIG_PREEMPT
@@ -724,7 +787,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
724 unsigned long end_addr, unsigned long *nr_accounted, 787 unsigned long end_addr, unsigned long *nr_accounted,
725 struct zap_details *details) 788 struct zap_details *details)
726{ 789{
727 unsigned long zap_bytes = ZAP_BLOCK_SIZE; 790 long zap_work = ZAP_BLOCK_SIZE;
728 unsigned long tlb_start = 0; /* For tlb_finish_mmu */ 791 unsigned long tlb_start = 0; /* For tlb_finish_mmu */
729 int tlb_start_valid = 0; 792 int tlb_start_valid = 0;
730 unsigned long start = start_addr; 793 unsigned long start = start_addr;
@@ -745,27 +808,25 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
745 *nr_accounted += (end - start) >> PAGE_SHIFT; 808 *nr_accounted += (end - start) >> PAGE_SHIFT;
746 809
747 while (start != end) { 810 while (start != end) {
748 unsigned long block;
749
750 if (!tlb_start_valid) { 811 if (!tlb_start_valid) {
751 tlb_start = start; 812 tlb_start = start;
752 tlb_start_valid = 1; 813 tlb_start_valid = 1;
753 } 814 }
754 815
755 if (is_vm_hugetlb_page(vma)) { 816 if (unlikely(is_vm_hugetlb_page(vma))) {
756 block = end - start;
757 unmap_hugepage_range(vma, start, end); 817 unmap_hugepage_range(vma, start, end);
758 } else { 818 zap_work -= (end - start) /
759 block = min(zap_bytes, end - start); 819 (HPAGE_SIZE / PAGE_SIZE);
760 unmap_page_range(*tlbp, vma, start, 820 start = end;
761 start + block, details); 821 } else
822 start = unmap_page_range(*tlbp, vma,
823 start, end, &zap_work, details);
824
825 if (zap_work > 0) {
826 BUG_ON(start != end);
827 break;
762 } 828 }
763 829
764 start += block;
765 zap_bytes -= block;
766 if ((long)zap_bytes > 0)
767 continue;
768
769 tlb_finish_mmu(*tlbp, tlb_start, start); 830 tlb_finish_mmu(*tlbp, tlb_start, start);
770 831
771 if (need_resched() || 832 if (need_resched() ||
@@ -779,7 +840,7 @@ unsigned long unmap_vmas(struct mmu_gather **tlbp,
779 840
780 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); 841 *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm);
781 tlb_start_valid = 0; 842 tlb_start_valid = 0;
782 zap_bytes = ZAP_BLOCK_SIZE; 843 zap_work = ZAP_BLOCK_SIZE;
783 } 844 }
784 } 845 }
785out: 846out:
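The old zap_bytes byte counter becomes a signed zap_work budget: a present pte is charged PAGE_SIZE, an empty slot or skipped page table only 1, so sparse address ranges no longer hold off the TLB flush and reschedule point for a full block's worth of latency. A minimal sketch of that accounting pattern, with the actual teardown elided and the function name invented:

static unsigned long zap_budget_sketch(pte_t *pte, unsigned long addr,
                                       unsigned long end, long *zap_work)
{
        do {
                if (pte_none(*pte))
                        (*zap_work)--;          /* cheap: nothing to tear down */
                else
                        *zap_work -= PAGE_SIZE; /* expensive: a real page */
                /* ... actual pte teardown elided ... */
        } while (pte++, addr += PAGE_SIZE, addr != end && *zap_work > 0);

        return addr;    /* caller resumes from here once the budget refills */
}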
@@ -813,7 +874,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
813/* 874/*
814 * Do a quick page-table lookup for a single page. 875 * Do a quick page-table lookup for a single page.
815 */ 876 */
816struct page *follow_page(struct mm_struct *mm, unsigned long address, 877struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
817 unsigned int flags) 878 unsigned int flags)
818{ 879{
819 pgd_t *pgd; 880 pgd_t *pgd;
@@ -821,8 +882,8 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address,
821 pmd_t *pmd; 882 pmd_t *pmd;
822 pte_t *ptep, pte; 883 pte_t *ptep, pte;
823 spinlock_t *ptl; 884 spinlock_t *ptl;
824 unsigned long pfn;
825 struct page *page; 885 struct page *page;
886 struct mm_struct *mm = vma->vm_mm;
826 887
827 page = follow_huge_addr(mm, address, flags & FOLL_WRITE); 888 page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
828 if (!IS_ERR(page)) { 889 if (!IS_ERR(page)) {
@@ -858,11 +919,10 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address,
858 goto unlock; 919 goto unlock;
859 if ((flags & FOLL_WRITE) && !pte_write(pte)) 920 if ((flags & FOLL_WRITE) && !pte_write(pte))
860 goto unlock; 921 goto unlock;
861 pfn = pte_pfn(pte); 922 page = vm_normal_page(vma, address, pte);
862 if (!pfn_valid(pfn)) 923 if (unlikely(!page))
863 goto unlock; 924 goto unlock;
864 925
865 page = pfn_to_page(pfn);
866 if (flags & FOLL_GET) 926 if (flags & FOLL_GET)
867 get_page(page); 927 get_page(page);
868 if (flags & FOLL_TOUCH) { 928 if (flags & FOLL_TOUCH) {
@@ -935,8 +995,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
935 return i ? : -EFAULT; 995 return i ? : -EFAULT;
936 } 996 }
937 if (pages) { 997 if (pages) {
938 pages[i] = pte_page(*pte); 998 struct page *page = vm_normal_page(gate_vma, start, *pte);
939 get_page(pages[i]); 999 pages[i] = page;
1000 if (page)
1001 get_page(page);
940 } 1002 }
941 pte_unmap(pte); 1003 pte_unmap(pte);
942 if (vmas) 1004 if (vmas)
@@ -947,7 +1009,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
947 continue; 1009 continue;
948 } 1010 }
949 1011
950 if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED)) 1012 if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
951 || !(vm_flags & vma->vm_flags)) 1013 || !(vm_flags & vma->vm_flags))
952 return i ? : -EFAULT; 1014 return i ? : -EFAULT;
953 1015
@@ -971,7 +1033,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
971 foll_flags |= FOLL_WRITE; 1033 foll_flags |= FOLL_WRITE;
972 1034
973 cond_resched(); 1035 cond_resched();
974 while (!(page = follow_page(mm, start, foll_flags))) { 1036 while (!(page = follow_page(vma, start, foll_flags))) {
975 int ret; 1037 int ret;
976 ret = __handle_mm_fault(mm, vma, start, 1038 ret = __handle_mm_fault(mm, vma, start,
977 foll_flags & FOLL_WRITE); 1039 foll_flags & FOLL_WRITE);
@@ -1091,6 +1153,86 @@ int zeromap_page_range(struct vm_area_struct *vma,
1091 return err; 1153 return err;
1092} 1154}
1093 1155
1156pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
1157{
1158 pgd_t * pgd = pgd_offset(mm, addr);
1159 pud_t * pud = pud_alloc(mm, pgd, addr);
1160 if (pud) {
1161 pmd_t * pmd = pmd_alloc(mm, pud, addr);
1162 if (pmd)
1163 return pte_alloc_map_lock(mm, pmd, addr, ptl);
1164 }
1165 return NULL;
1166}
1167
1168/*
1169 * This is the old fallback for page remapping.
1170 *
1171 * For historical reasons, it only allows reserved pages. Only
1172 * old drivers should use this, and they needed to mark their
1173 * pages reserved for the old functions anyway.
1174 */
1175static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
1176{
1177 int retval;
1178 pte_t *pte;
1179 spinlock_t *ptl;
1180
1181 retval = -EINVAL;
1182 if (PageAnon(page))
1183 goto out;
1184 retval = -ENOMEM;
1185 flush_dcache_page(page);
1186 pte = get_locked_pte(mm, addr, &ptl);
1187 if (!pte)
1188 goto out;
1189 retval = -EBUSY;
1190 if (!pte_none(*pte))
1191 goto out_unlock;
1192
1193 /* Ok, finally just insert the thing.. */
1194 get_page(page);
1195 inc_mm_counter(mm, file_rss);
1196 page_add_file_rmap(page);
1197 set_pte_at(mm, addr, pte, mk_pte(page, prot));
1198
1199 retval = 0;
1200out_unlock:
1201 pte_unmap_unlock(pte, ptl);
1202out:
1203 return retval;
1204}
1205
1206/*
1207 * This allows drivers to insert individual pages they've allocated
1208 * into a user vma.
1209 *
1210 * The page has to be a nice clean _individual_ kernel allocation.
1211 * If you allocate a compound page, you need to have marked it as
1212 * such (__GFP_COMP), or manually just split the page up yourself
1213 * (which is mainly an issue of doing "set_page_count(page, 1)" for
1214 * each sub-page, and then freeing them one by one when you free
1215 * them rather than freeing it as a compound page).
1216 *
1217 * NOTE! Traditionally this was done with "remap_pfn_range()" which
1218 * took an arbitrary page protection parameter. This doesn't allow
1219 * that. Your vma protection will have to be set up correctly, which
1220 * means that if you want a shared writable mapping, you'd better
1221 * ask for a shared writable mapping!
1222 *
1223 * The page does not need to be reserved.
1224 */
1225int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
1226{
1227 if (addr < vma->vm_start || addr >= vma->vm_end)
1228 return -EFAULT;
1229 if (!page_count(page))
1230 return -EINVAL;
1231 vma->vm_flags |= VM_INSERTPAGE;
1232 return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
1233}
1234EXPORT_SYMBOL(vm_insert_page);
1235
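A hypothetical driver using the new call; only vm_insert_page() itself comes from this patch, the mydrv_* names and the single preallocated page are invented for illustration. The point of the interface is that the page no longer has to be marked reserved before mapping it into userspace:

static struct page *mydrv_page; /* allocated elsewhere with alloc_page() */

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
        if (vma->vm_end - vma->vm_start != PAGE_SIZE)
                return -EINVAL;

        /* no SetPageReserved()/remap_pfn_range() dance needed any more */
        return vm_insert_page(vma, vma->vm_start, mydrv_page);
}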
1094/* 1236/*
1095 * maps a range of physical memory into the requested pages. the old 1237 * maps a range of physical memory into the requested pages. the old
1096 * mappings are removed. any references to nonexistent pages results 1238 * mappings are removed. any references to nonexistent pages results
@@ -1170,10 +1312,26 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1170 * rest of the world about it: 1312 * rest of the world about it:
1171 * VM_IO tells people not to look at these pages 1313 * VM_IO tells people not to look at these pages
1172 * (accesses can have side effects). 1314 * (accesses can have side effects).
1173 * VM_RESERVED tells the core MM not to "manage" these pages 1315 * VM_RESERVED is specified all over the place, because
1174 * (e.g. refcount, mapcount, try to swap them out). 1316 * in 2.4 it kept swapout's vma scan off this vma; but
1317 * in 2.6 the LRU scan won't even find its pages, so this
1318 * flag means no more than count its pages in reserved_vm,
1319 * and omit it from core dump, even when VM_IO turned off.
1320 * VM_PFNMAP tells the core MM that the base pages are just
1321 * raw PFN mappings, and do not have a "struct page" associated
1322 * with them.
1323 *
1324 * There's a horrible special case to handle copy-on-write
1325 * behaviour that some programs depend on. We mark the "original"
1326 * un-COW'ed pages by matching them up with "vma->vm_pgoff".
1175 */ 1327 */
1176 vma->vm_flags |= VM_IO | VM_RESERVED; 1328 if (is_cow_mapping(vma->vm_flags)) {
1329 if (addr != vma->vm_start || end != vma->vm_end)
1330 return -EINVAL;
1331 vma->vm_pgoff = pfn;
1332 }
1333
1334 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1177 1335
1178 BUG_ON(addr >= end); 1336 BUG_ON(addr >= end);
1179 pfn -= addr >> PAGE_SHIFT; 1337 pfn -= addr >> PAGE_SHIFT;
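With vm_pgoff recording the base pfn of a COW-able PFN mapping, the rest of the mm can distinguish an original linearly-mapped pte from a COWed anonymous copy without consulting any struct page. The real test lives in vm_normal_page() earlier in this patch; the helper below is only an assumption-laden sketch of the idea, with an invented name:

static inline int pte_matches_linear_pfnmap(struct vm_area_struct *vma,
                                            unsigned long addr, pte_t pte)
{
        unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;

        /* true: still the driver's raw pfn; false: probably a COWed copy */
        return (vma->vm_flags & VM_PFNMAP) &&
                pte_pfn(pte) == vma->vm_pgoff + off;
}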
@@ -1228,6 +1386,33 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
1228 return pte; 1386 return pte;
1229} 1387}
1230 1388
1389static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
1390{
1391 /*
1392 * If the source page was a PFN mapping, we don't have
1393 * a "struct page" for it. We do a best-effort copy by
1394 * just copying from the original user address. If that
1395 * fails, we just zero-fill it. Live with it.
1396 */
1397 if (unlikely(!src)) {
1398 void *kaddr = kmap_atomic(dst, KM_USER0);
1399 void __user *uaddr = (void __user *)(va & PAGE_MASK);
1400
1401 /*
1402 * This really shouldn't fail, because the page is there
1403 * in the page tables. But it might just be unreadable,
1404 * in which case we just give up and fill the result with
1405 * zeroes.
1406 */
1407 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
1408 memset(kaddr, 0, PAGE_SIZE);
1409 kunmap_atomic(kaddr, KM_USER0);
1410 return;
1411
1412 }
1413 copy_user_highpage(dst, src, va);
1414}
1415
1231/* 1416/*
1232 * This routine handles present pages, when users try to write 1417 * This routine handles present pages, when users try to write
1233 * to a shared page. It is done by copying the page to a new address 1418 * to a shared page. It is done by copying the page to a new address
@@ -1251,27 +1436,18 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1251 spinlock_t *ptl, pte_t orig_pte) 1436 spinlock_t *ptl, pte_t orig_pte)
1252{ 1437{
1253 struct page *old_page, *new_page; 1438 struct page *old_page, *new_page;
1254 unsigned long pfn = pte_pfn(orig_pte);
1255 pte_t entry; 1439 pte_t entry;
1256 int ret = VM_FAULT_MINOR; 1440 int ret = VM_FAULT_MINOR;
1257 1441
1258 BUG_ON(vma->vm_flags & VM_RESERVED); 1442 old_page = vm_normal_page(vma, address, orig_pte);
1259 1443 if (!old_page)
1260 if (unlikely(!pfn_valid(pfn))) { 1444 goto gotten;
1261 /*
1262 * Page table corrupted: show pte and kill process.
1263 */
1264 print_bad_pte(vma, orig_pte, address);
1265 ret = VM_FAULT_OOM;
1266 goto unlock;
1267 }
1268 old_page = pfn_to_page(pfn);
1269 1445
1270 if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { 1446 if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
1271 int reuse = can_share_swap_page(old_page); 1447 int reuse = can_share_swap_page(old_page);
1272 unlock_page(old_page); 1448 unlock_page(old_page);
1273 if (reuse) { 1449 if (reuse) {
1274 flush_cache_page(vma, address, pfn); 1450 flush_cache_page(vma, address, pte_pfn(orig_pte));
1275 entry = pte_mkyoung(orig_pte); 1451 entry = pte_mkyoung(orig_pte);
1276 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1452 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1277 ptep_set_access_flags(vma, address, page_table, entry, 1); 1453 ptep_set_access_flags(vma, address, page_table, entry, 1);
@@ -1286,6 +1462,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1286 * Ok, we need to copy. Oh, well.. 1462 * Ok, we need to copy. Oh, well..
1287 */ 1463 */
1288 page_cache_get(old_page); 1464 page_cache_get(old_page);
1465gotten:
1289 pte_unmap_unlock(page_table, ptl); 1466 pte_unmap_unlock(page_table, ptl);
1290 1467
1291 if (unlikely(anon_vma_prepare(vma))) 1468 if (unlikely(anon_vma_prepare(vma)))
@@ -1298,7 +1475,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1298 new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); 1475 new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
1299 if (!new_page) 1476 if (!new_page)
1300 goto oom; 1477 goto oom;
1301 copy_user_highpage(new_page, old_page, address); 1478 cow_user_page(new_page, old_page, address);
1302 } 1479 }
1303 1480
1304 /* 1481 /*
@@ -1306,31 +1483,37 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1306 */ 1483 */
1307 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 1484 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
1308 if (likely(pte_same(*page_table, orig_pte))) { 1485 if (likely(pte_same(*page_table, orig_pte))) {
1309 page_remove_rmap(old_page); 1486 if (old_page) {
1310 if (!PageAnon(old_page)) { 1487 page_remove_rmap(old_page);
1488 if (!PageAnon(old_page)) {
1489 dec_mm_counter(mm, file_rss);
1490 inc_mm_counter(mm, anon_rss);
1491 }
1492 } else
1311 inc_mm_counter(mm, anon_rss); 1493 inc_mm_counter(mm, anon_rss);
1312 dec_mm_counter(mm, file_rss); 1494 flush_cache_page(vma, address, pte_pfn(orig_pte));
1313 }
1314 flush_cache_page(vma, address, pfn);
1315 entry = mk_pte(new_page, vma->vm_page_prot); 1495 entry = mk_pte(new_page, vma->vm_page_prot);
1316 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1496 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
1317 ptep_establish(vma, address, page_table, entry); 1497 ptep_establish(vma, address, page_table, entry);
1318 update_mmu_cache(vma, address, entry); 1498 update_mmu_cache(vma, address, entry);
1319 lazy_mmu_prot_update(entry); 1499 lazy_mmu_prot_update(entry);
1320 lru_cache_add_active(new_page); 1500 lru_cache_add_active(new_page);
1321 page_add_anon_rmap(new_page, vma, address); 1501 page_add_new_anon_rmap(new_page, vma, address);
1322 1502
1323 /* Free the old page.. */ 1503 /* Free the old page.. */
1324 new_page = old_page; 1504 new_page = old_page;
1325 ret |= VM_FAULT_WRITE; 1505 ret |= VM_FAULT_WRITE;
1326 } 1506 }
1327 page_cache_release(new_page); 1507 if (new_page)
1328 page_cache_release(old_page); 1508 page_cache_release(new_page);
1509 if (old_page)
1510 page_cache_release(old_page);
1329unlock: 1511unlock:
1330 pte_unmap_unlock(page_table, ptl); 1512 pte_unmap_unlock(page_table, ptl);
1331 return ret; 1513 return ret;
1332oom: 1514oom:
1333 page_cache_release(old_page); 1515 if (old_page)
1516 page_cache_release(old_page);
1334 return VM_FAULT_OOM; 1517 return VM_FAULT_OOM;
1335} 1518}
1336 1519
@@ -1587,9 +1770,32 @@ out_big:
1587out_busy: 1770out_busy:
1588 return -ETXTBSY; 1771 return -ETXTBSY;
1589} 1772}
1590
1591EXPORT_SYMBOL(vmtruncate); 1773EXPORT_SYMBOL(vmtruncate);
1592 1774
1775int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
1776{
1777 struct address_space *mapping = inode->i_mapping;
1778
1779 /*
1780 * If the underlying filesystem is not going to provide
1781 * a way to truncate a range of blocks (punch a hole) -
1782 * we should return failure right now.
1783 */
1784 if (!inode->i_op || !inode->i_op->truncate_range)
1785 return -ENOSYS;
1786
1787 mutex_lock(&inode->i_mutex);
1788 down_write(&inode->i_alloc_sem);
1789 unmap_mapping_range(mapping, offset, (end - offset), 1);
1790 truncate_inode_pages_range(mapping, offset, end);
1791 inode->i_op->truncate_range(inode, offset, end);
1792 up_write(&inode->i_alloc_sem);
1793 mutex_unlock(&inode->i_mutex);
1794
1795 return 0;
1796}
1797EXPORT_SYMBOL(vmtruncate_range);
1798
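vmtruncate_range() only clears the page cache and page tables over the hole; releasing the underlying blocks is delegated to the new ->truncate_range inode operation, which is called with i_mutex and i_alloc_sem held. A hypothetical filesystem would wire it up roughly as below; the myfs_* names are placeholders, with shmem/tmpfs the likely first real user:

/* Free the backing blocks covering [start, end]; filesystem-specific. */
static void myfs_truncate_range(struct inode *inode, loff_t start, loff_t end)
{
        /* punch the hole in the on-disk or swap-backed data here */
}

static struct inode_operations myfs_file_inode_operations = {
        .truncate_range = myfs_truncate_range,
};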
1593/* 1799/*
1594 * Primitive swap readahead code. We simply read an aligned block of 1800 * Primitive swap readahead code. We simply read an aligned block of
1595 * (1 << page_cluster) entries in the swap area. This method is chosen 1801 * (1 << page_cluster) entries in the swap area. This method is chosen
@@ -1771,8 +1977,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
1771 goto release; 1977 goto release;
1772 inc_mm_counter(mm, anon_rss); 1978 inc_mm_counter(mm, anon_rss);
1773 lru_cache_add_active(page); 1979 lru_cache_add_active(page);
1774 SetPageReferenced(page); 1980 page_add_new_anon_rmap(page, vma, address);
1775 page_add_anon_rmap(page, vma, address);
1776 } else { 1981 } else {
1777 /* Map the ZERO_PAGE - vm_page_prot is readonly */ 1982 /* Map the ZERO_PAGE - vm_page_prot is readonly */
1778 page = ZERO_PAGE(address); 1983 page = ZERO_PAGE(address);
@@ -1828,6 +2033,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
1828 int anon = 0; 2033 int anon = 0;
1829 2034
1830 pte_unmap(page_table); 2035 pte_unmap(page_table);
2036 BUG_ON(vma->vm_flags & VM_PFNMAP);
1831 2037
1832 if (vma->vm_file) { 2038 if (vma->vm_file) {
1833 mapping = vma->vm_file->f_mapping; 2039 mapping = vma->vm_file->f_mapping;
@@ -1902,8 +2108,8 @@ retry:
1902 if (anon) { 2108 if (anon) {
1903 inc_mm_counter(mm, anon_rss); 2109 inc_mm_counter(mm, anon_rss);
1904 lru_cache_add_active(new_page); 2110 lru_cache_add_active(new_page);
1905 page_add_anon_rmap(new_page, vma, address); 2111 page_add_new_anon_rmap(new_page, vma, address);
1906 } else if (!(vma->vm_flags & VM_RESERVED)) { 2112 } else {
1907 inc_mm_counter(mm, file_rss); 2113 inc_mm_counter(mm, file_rss);
1908 page_add_file_rmap(new_page); 2114 page_add_file_rmap(new_page);
1909 } 2115 }
@@ -2061,6 +2267,8 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2061 return handle_pte_fault(mm, vma, address, pte, pmd, write_access); 2267 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
2062} 2268}
2063 2269
2270EXPORT_SYMBOL_GPL(__handle_mm_fault);
2271
2064#ifndef __PAGETABLE_PUD_FOLDED 2272#ifndef __PAGETABLE_PUD_FOLDED
2065/* 2273/*
2066 * Allocate page upper directory. 2274 * Allocate page upper directory.
@@ -2080,6 +2288,12 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2080 spin_unlock(&mm->page_table_lock); 2288 spin_unlock(&mm->page_table_lock);
2081 return 0; 2289 return 0;
2082} 2290}
2291#else
2292/* Workaround for gcc 2.96 */
2293int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
2294{
2295 return 0;
2296}
2083#endif /* __PAGETABLE_PUD_FOLDED */ 2297#endif /* __PAGETABLE_PUD_FOLDED */
2084 2298
2085#ifndef __PAGETABLE_PMD_FOLDED 2299#ifndef __PAGETABLE_PMD_FOLDED
@@ -2108,6 +2322,12 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2108 spin_unlock(&mm->page_table_lock); 2322 spin_unlock(&mm->page_table_lock);
2109 return 0; 2323 return 0;
2110} 2324}
2325#else
2326/* Workaround for gcc 2.96 */
2327int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
2328{
2329 return 0;
2330}
2111#endif /* __PAGETABLE_PMD_FOLDED */ 2331#endif /* __PAGETABLE_PMD_FOLDED */
2112 2332
2113int make_pages_present(unsigned long addr, unsigned long end) 2333int make_pages_present(unsigned long addr, unsigned long end)
@@ -2182,7 +2402,7 @@ static int __init gate_vma_init(void)
2182 gate_vma.vm_start = FIXADDR_USER_START; 2402 gate_vma.vm_start = FIXADDR_USER_START;
2183 gate_vma.vm_end = FIXADDR_USER_END; 2403 gate_vma.vm_end = FIXADDR_USER_END;
2184 gate_vma.vm_page_prot = PAGE_READONLY; 2404 gate_vma.vm_page_prot = PAGE_READONLY;
2185 gate_vma.vm_flags = VM_RESERVED; 2405 gate_vma.vm_flags = 0;
2186 return 0; 2406 return 0;
2187} 2407}
2188__initcall(gate_vma_init); 2408__initcall(gate_vma_init);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 431a64f021..a918f77f02 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -42,7 +42,6 @@ extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
42 int nr_pages); 42 int nr_pages);
43static int __add_section(struct zone *zone, unsigned long phys_start_pfn) 43static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
44{ 44{
45 struct pglist_data *pgdat = zone->zone_pgdat;
46 int nr_pages = PAGES_PER_SECTION; 45 int nr_pages = PAGES_PER_SECTION;
47 int ret; 46 int ret;
48 47
@@ -104,7 +103,7 @@ static void grow_pgdat_span(struct pglist_data *pgdat,
104 pgdat->node_start_pfn = start_pfn; 103 pgdat->node_start_pfn = start_pfn;
105 104
106 if (end_pfn > old_pgdat_end_pfn) 105 if (end_pfn > old_pgdat_end_pfn)
107 pgdat->node_spanned_pages = end_pfn - pgdat->node_spanned_pages; 106 pgdat->node_spanned_pages = end_pfn - pgdat->node_start_pfn;
108} 107}
109 108
110int online_pages(unsigned long pfn, unsigned long nr_pages) 109int online_pages(unsigned long pfn, unsigned long nr_pages)
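The one-line change above fixes the arithmetic used when growing a node's span. With illustrative numbers: a node starting at pfn 0x10000 and currently spanning 0x8000 pages (so ending at 0x18000) that is grown to end_pfn 0x20000 must record

        node_spanned_pages = end_pfn - node_start_pfn = 0x20000 - 0x10000 = 0x10000 pages,

whereas the old expression end_pfn - node_spanned_pages = 0x20000 - 0x8000 = 0x18000 would have inflated the span.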
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5abc57c2b8..73790188b0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -83,9 +83,18 @@
83#include <linux/init.h> 83#include <linux/init.h>
84#include <linux/compat.h> 84#include <linux/compat.h>
85#include <linux/mempolicy.h> 85#include <linux/mempolicy.h>
86#include <linux/swap.h>
87#include <linux/seq_file.h>
88#include <linux/proc_fs.h>
89
86#include <asm/tlbflush.h> 90#include <asm/tlbflush.h>
87#include <asm/uaccess.h> 91#include <asm/uaccess.h>
88 92
93/* Internal flags */
94#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
95#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
96#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
97
89static kmem_cache_t *policy_cache; 98static kmem_cache_t *policy_cache;
90static kmem_cache_t *sn_cache; 99static kmem_cache_t *sn_cache;
91 100
@@ -93,7 +102,7 @@ static kmem_cache_t *sn_cache;
93 102
94/* Highest zone. An specific allocation for a zone below that is not 103/* Highest zone. An specific allocation for a zone below that is not
95 policied. */ 104 policied. */
96static int policy_zone; 105int policy_zone = ZONE_DMA;
97 106
98struct mempolicy default_policy = { 107struct mempolicy default_policy = {
99 .refcnt = ATOMIC_INIT(1), /* never free it */ 108 .refcnt = ATOMIC_INIT(1), /* never free it */
@@ -131,17 +140,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
131 if (!zl) 140 if (!zl)
132 return NULL; 141 return NULL;
133 num = 0; 142 num = 0;
134 for_each_node_mask(nd, *nodes) { 143 for_each_node_mask(nd, *nodes)
135 int k; 144 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
136 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
137 struct zone *z = &NODE_DATA(nd)->node_zones[k];
138 if (!z->present_pages)
139 continue;
140 zl->zones[num++] = z;
141 if (k > policy_zone)
142 policy_zone = k;
143 }
144 }
145 zl->zones[num] = NULL; 145 zl->zones[num] = NULL;
146 return zl; 146 return zl;
147} 147}
@@ -161,6 +161,10 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
161 switch (mode) { 161 switch (mode) {
162 case MPOL_INTERLEAVE: 162 case MPOL_INTERLEAVE:
163 policy->v.nodes = *nodes; 163 policy->v.nodes = *nodes;
164 if (nodes_weight(*nodes) == 0) {
165 kmem_cache_free(policy_cache, policy);
166 return ERR_PTR(-EINVAL);
167 }
164 break; 168 break;
165 case MPOL_PREFERRED: 169 case MPOL_PREFERRED:
166 policy->v.preferred_node = first_node(*nodes); 170 policy->v.preferred_node = first_node(*nodes);
@@ -176,12 +180,19 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
176 break; 180 break;
177 } 181 }
178 policy->policy = mode; 182 policy->policy = mode;
183 policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
179 return policy; 184 return policy;
180} 185}
181 186
182/* Ensure all existing pages follow the policy. */ 187static void gather_stats(struct page *, void *);
188static void migrate_page_add(struct page *page, struct list_head *pagelist,
189 unsigned long flags);
190
191/* Scan through pages checking if pages follow certain conditions. */
183static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 192static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
184 unsigned long addr, unsigned long end, nodemask_t *nodes) 193 unsigned long addr, unsigned long end,
194 const nodemask_t *nodes, unsigned long flags,
195 void *private)
185{ 196{
186 pte_t *orig_pte; 197 pte_t *orig_pte;
187 pte_t *pte; 198 pte_t *pte;
@@ -189,18 +200,36 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
189 200
190 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 201 orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
191 do { 202 do {
192 unsigned long pfn; 203 struct page *page;
193 unsigned int nid; 204 unsigned int nid;
194 205
195 if (!pte_present(*pte)) 206 if (!pte_present(*pte))
196 continue; 207 continue;
197 pfn = pte_pfn(*pte); 208 page = vm_normal_page(vma, addr, *pte);
198 if (!pfn_valid(pfn)) { 209 if (!page)
199 print_bad_pte(vma, *pte, addr);
200 continue; 210 continue;
201 } 211 /*
202 nid = pfn_to_nid(pfn); 212 * The check for PageReserved here is important to avoid
203 if (!node_isset(nid, *nodes)) 213 * handling zero pages and other pages that may have been
214 * marked special by the system.
215 *
216 * If PageReserved were not checked here then, e.g.,
217 * the location of the zero page could have an influence
218 * on MPOL_MF_STRICT, zero pages would be counted for
219 * the per node stats, and there would be useless attempts
220 * to put zero pages on the migration list.
221 */
222 if (PageReserved(page))
223 continue;
224 nid = page_to_nid(page);
225 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
226 continue;
227
228 if (flags & MPOL_MF_STATS)
229 gather_stats(page, private);
230 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
231 migrate_page_add(page, private, flags);
232 else
204 break; 233 break;
205 } while (pte++, addr += PAGE_SIZE, addr != end); 234 } while (pte++, addr += PAGE_SIZE, addr != end);
206 pte_unmap_unlock(orig_pte, ptl); 235 pte_unmap_unlock(orig_pte, ptl);
@@ -208,7 +237,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
208} 237}
209 238
210static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, 239static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
211 unsigned long addr, unsigned long end, nodemask_t *nodes) 240 unsigned long addr, unsigned long end,
241 const nodemask_t *nodes, unsigned long flags,
242 void *private)
212{ 243{
213 pmd_t *pmd; 244 pmd_t *pmd;
214 unsigned long next; 245 unsigned long next;
@@ -218,14 +249,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
218 next = pmd_addr_end(addr, end); 249 next = pmd_addr_end(addr, end);
219 if (pmd_none_or_clear_bad(pmd)) 250 if (pmd_none_or_clear_bad(pmd))
220 continue; 251 continue;
221 if (check_pte_range(vma, pmd, addr, next, nodes)) 252 if (check_pte_range(vma, pmd, addr, next, nodes,
253 flags, private))
222 return -EIO; 254 return -EIO;
223 } while (pmd++, addr = next, addr != end); 255 } while (pmd++, addr = next, addr != end);
224 return 0; 256 return 0;
225} 257}
226 258
227static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 259static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
228 unsigned long addr, unsigned long end, nodemask_t *nodes) 260 unsigned long addr, unsigned long end,
261 const nodemask_t *nodes, unsigned long flags,
262 void *private)
229{ 263{
230 pud_t *pud; 264 pud_t *pud;
231 unsigned long next; 265 unsigned long next;
@@ -235,14 +269,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
235 next = pud_addr_end(addr, end); 269 next = pud_addr_end(addr, end);
236 if (pud_none_or_clear_bad(pud)) 270 if (pud_none_or_clear_bad(pud))
237 continue; 271 continue;
238 if (check_pmd_range(vma, pud, addr, next, nodes)) 272 if (check_pmd_range(vma, pud, addr, next, nodes,
273 flags, private))
239 return -EIO; 274 return -EIO;
240 } while (pud++, addr = next, addr != end); 275 } while (pud++, addr = next, addr != end);
241 return 0; 276 return 0;
242} 277}
243 278
244static inline int check_pgd_range(struct vm_area_struct *vma, 279static inline int check_pgd_range(struct vm_area_struct *vma,
245 unsigned long addr, unsigned long end, nodemask_t *nodes) 280 unsigned long addr, unsigned long end,
281 const nodemask_t *nodes, unsigned long flags,
282 void *private)
246{ 283{
247 pgd_t *pgd; 284 pgd_t *pgd;
248 unsigned long next; 285 unsigned long next;
@@ -252,38 +289,61 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
252 next = pgd_addr_end(addr, end); 289 next = pgd_addr_end(addr, end);
253 if (pgd_none_or_clear_bad(pgd)) 290 if (pgd_none_or_clear_bad(pgd))
254 continue; 291 continue;
255 if (check_pud_range(vma, pgd, addr, next, nodes)) 292 if (check_pud_range(vma, pgd, addr, next, nodes,
293 flags, private))
256 return -EIO; 294 return -EIO;
257 } while (pgd++, addr = next, addr != end); 295 } while (pgd++, addr = next, addr != end);
258 return 0; 296 return 0;
259} 297}
260 298
261/* Step 1: check the range */ 299/* Check if a vma is migratable */
300static inline int vma_migratable(struct vm_area_struct *vma)
301{
302 if (vma->vm_flags & (
303 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
304 return 0;
305 return 1;
306}
307
308/*
309 * Check if all pages in a range are on a set of nodes.
310 * If pagelist != NULL then isolate pages from the LRU and
311 * put them on the pagelist.
312 */
262static struct vm_area_struct * 313static struct vm_area_struct *
263check_range(struct mm_struct *mm, unsigned long start, unsigned long end, 314check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
264 nodemask_t *nodes, unsigned long flags) 315 const nodemask_t *nodes, unsigned long flags, void *private)
265{ 316{
266 int err; 317 int err;
267 struct vm_area_struct *first, *vma, *prev; 318 struct vm_area_struct *first, *vma, *prev;
268 319
320 /* Clear the LRU lists so pages can be isolated */
321 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
322 lru_add_drain_all();
323
269 first = find_vma(mm, start); 324 first = find_vma(mm, start);
270 if (!first) 325 if (!first)
271 return ERR_PTR(-EFAULT); 326 return ERR_PTR(-EFAULT);
272 if (first->vm_flags & VM_RESERVED)
273 return ERR_PTR(-EACCES);
274 prev = NULL; 327 prev = NULL;
275 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 328 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
276 if (!vma->vm_next && vma->vm_end < end) 329 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
277 return ERR_PTR(-EFAULT); 330 if (!vma->vm_next && vma->vm_end < end)
278 if (prev && prev->vm_end < vma->vm_start) 331 return ERR_PTR(-EFAULT);
279 return ERR_PTR(-EFAULT); 332 if (prev && prev->vm_end < vma->vm_start)
280 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { 333 return ERR_PTR(-EFAULT);
334 }
335 if (!is_vm_hugetlb_page(vma) &&
336 ((flags & MPOL_MF_STRICT) ||
337 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
338 vma_migratable(vma)))) {
281 unsigned long endvma = vma->vm_end; 339 unsigned long endvma = vma->vm_end;
340
282 if (endvma > end) 341 if (endvma > end)
283 endvma = end; 342 endvma = end;
284 if (vma->vm_start > start) 343 if (vma->vm_start > start)
285 start = vma->vm_start; 344 start = vma->vm_start;
286 err = check_pgd_range(vma, start, endvma, nodes); 345 err = check_pgd_range(vma, start, endvma, nodes,
346 flags, private);
287 if (err) { 347 if (err) {
288 first = ERR_PTR(err); 348 first = ERR_PTR(err);
289 break; 349 break;
@@ -342,51 +402,10 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
342 if (!nodes) 402 if (!nodes)
343 return 0; 403 return 0;
344 404
345 /* Update current mems_allowed */ 405 cpuset_update_task_memory_state();
346 cpuset_update_current_mems_allowed(); 406 if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
347 /* Ignore nodes not set in current->mems_allowed */
348 cpuset_restrict_to_mems_allowed(nodes->bits);
349 return mpol_check_policy(mode, nodes);
350}
351
352long do_mbind(unsigned long start, unsigned long len,
353 unsigned long mode, nodemask_t *nmask, unsigned long flags)
354{
355 struct vm_area_struct *vma;
356 struct mm_struct *mm = current->mm;
357 struct mempolicy *new;
358 unsigned long end;
359 int err;
360
361 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
362 return -EINVAL;
363 if (start & ~PAGE_MASK)
364 return -EINVAL;
365 if (mode == MPOL_DEFAULT)
366 flags &= ~MPOL_MF_STRICT;
367 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
368 end = start + len;
369 if (end < start)
370 return -EINVAL;
371 if (end == start)
372 return 0;
373 if (mpol_check_policy(mode, nmask))
374 return -EINVAL; 407 return -EINVAL;
375 new = mpol_new(mode, nmask); 408 return mpol_check_policy(mode, nodes);
376 if (IS_ERR(new))
377 return PTR_ERR(new);
378
379 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
380 mode,nodes_addr(nodes)[0]);
381
382 down_write(&mm->mmap_sem);
383 vma = check_range(mm, start, end, nmask, flags);
384 err = PTR_ERR(vma);
385 if (!IS_ERR(vma))
386 err = mbind_range(vma, start, end, new);
387 up_write(&mm->mmap_sem);
388 mpol_free(new);
389 return err;
390} 409}
391 410
392/* Set the process memory policy */ 411/* Set the process memory policy */
@@ -457,7 +476,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
457 struct vm_area_struct *vma = NULL; 476 struct vm_area_struct *vma = NULL;
458 struct mempolicy *pol = current->mempolicy; 477 struct mempolicy *pol = current->mempolicy;
459 478
460 cpuset_update_current_mems_allowed(); 479 cpuset_update_task_memory_state();
461 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) 480 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
462 return -EINVAL; 481 return -EINVAL;
463 if (flags & MPOL_F_ADDR) { 482 if (flags & MPOL_F_ADDR) {
@@ -509,11 +528,141 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
509} 528}
510 529
511/* 530/*
531 * page migration
532 */
533
534static void migrate_page_add(struct page *page, struct list_head *pagelist,
535 unsigned long flags)
536{
537 /*
538 * Avoid migrating a page that is shared with others.
539 */
540 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
541 if (isolate_lru_page(page))
542 list_add(&page->lru, pagelist);
543 }
544}
545
546static int swap_pages(struct list_head *pagelist)
547{
548 LIST_HEAD(moved);
549 LIST_HEAD(failed);
550 int n;
551
552 n = migrate_pages(pagelist, NULL, &moved, &failed);
553 putback_lru_pages(&failed);
554 putback_lru_pages(&moved);
555
556 return n;
557}
558
559/*
560 * For now migrate_pages simply swaps out the pages from nodes that are in
561 * the source set but not in the target set. In the future, we would
562 * want a function that moves pages between the two nodesets in such
563 * a way as to preserve the physical layout as much as possible.
564 *
565 * Returns the number of pages that could not be moved.
566 */
567int do_migrate_pages(struct mm_struct *mm,
568 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
569{
570 LIST_HEAD(pagelist);
571 int count = 0;
572 nodemask_t nodes;
573
574 nodes_andnot(nodes, *from_nodes, *to_nodes);
575
576 down_read(&mm->mmap_sem);
577 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
578 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
579
580 if (!list_empty(&pagelist)) {
581 count = swap_pages(&pagelist);
582 putback_lru_pages(&pagelist);
583 }
584
585 up_read(&mm->mmap_sem);
586 return count;
587}
588
589long do_mbind(unsigned long start, unsigned long len,
590 unsigned long mode, nodemask_t *nmask, unsigned long flags)
591{
592 struct vm_area_struct *vma;
593 struct mm_struct *mm = current->mm;
594 struct mempolicy *new;
595 unsigned long end;
596 int err;
597 LIST_HEAD(pagelist);
598
599 if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
600 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
601 || mode > MPOL_MAX)
602 return -EINVAL;
603 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
604 return -EPERM;
605
606 if (start & ~PAGE_MASK)
607 return -EINVAL;
608
609 if (mode == MPOL_DEFAULT)
610 flags &= ~MPOL_MF_STRICT;
611
612 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
613 end = start + len;
614
615 if (end < start)
616 return -EINVAL;
617 if (end == start)
618 return 0;
619
620 if (mpol_check_policy(mode, nmask))
621 return -EINVAL;
622
623 new = mpol_new(mode, nmask);
624 if (IS_ERR(new))
625 return PTR_ERR(new);
626
627 /*
628 * If we are using the default policy then operation
629 * on discontinuous address spaces is okay after all
630 */
631 if (!new)
632 flags |= MPOL_MF_DISCONTIG_OK;
633
634 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
635 mode,nodes_addr(nodes)[0]);
636
637 down_write(&mm->mmap_sem);
638 vma = check_range(mm, start, end, nmask,
639 flags | MPOL_MF_INVERT, &pagelist);
640
641 err = PTR_ERR(vma);
642 if (!IS_ERR(vma)) {
643 int nr_failed = 0;
644
645 err = mbind_range(vma, start, end, new);
646 if (!list_empty(&pagelist))
647 nr_failed = swap_pages(&pagelist);
648
649 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
650 err = -EIO;
651 }
652 if (!list_empty(&pagelist))
653 putback_lru_pages(&pagelist);
654
655 up_write(&mm->mmap_sem);
656 mpol_free(new);
657 return err;
658}
659
660/*
512 * User space interface with variable sized bitmaps for nodelists. 661 * User space interface with variable sized bitmaps for nodelists.
513 */ 662 */
514 663
515/* Copy a node mask from user space. */ 664/* Copy a node mask from user space. */
516static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, 665static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
517 unsigned long maxnode) 666 unsigned long maxnode)
518{ 667{
519 unsigned long k; 668 unsigned long k;
@@ -602,6 +751,65 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
602 return do_set_mempolicy(mode, &nodes); 751 return do_set_mempolicy(mode, &nodes);
603} 752}
604 753
754asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
755 const unsigned long __user *old_nodes,
756 const unsigned long __user *new_nodes)
757{
758 struct mm_struct *mm;
759 struct task_struct *task;
760 nodemask_t old;
761 nodemask_t new;
762 nodemask_t task_nodes;
763 int err;
764
765 err = get_nodes(&old, old_nodes, maxnode);
766 if (err)
767 return err;
768
769 err = get_nodes(&new, new_nodes, maxnode);
770 if (err)
771 return err;
772
773 /* Find the mm_struct */
774 read_lock(&tasklist_lock);
775 task = pid ? find_task_by_pid(pid) : current;
776 if (!task) {
777 read_unlock(&tasklist_lock);
778 return -ESRCH;
779 }
780 mm = get_task_mm(task);
781 read_unlock(&tasklist_lock);
782
783 if (!mm)
784 return -EINVAL;
785
786 /*
787 * Check if this process has the right to modify the specified
788 * process. The right exists if the process has administrative
789 * capabilities, superuser privileges or the same
790 * userid as the target process.
791 */
792 if ((current->euid != task->suid) && (current->euid != task->uid) &&
793 (current->uid != task->suid) && (current->uid != task->uid) &&
794 !capable(CAP_SYS_ADMIN)) {
795 err = -EPERM;
796 goto out;
797 }
798
799 task_nodes = cpuset_mems_allowed(task);
800 /* Is the user allowed to access the target nodes? */
801 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
802 err = -EPERM;
803 goto out;
804 }
805
806 err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
807out:
808 mmput(mm);
809 return err;
810}
811
812
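For completeness, a sketch of how userspace could reach the new syscall before any libnuma wrapper exists. Everything here is illustrative: __NR_migrate_pages must come from the architecture's headers, the node numbers are made up, and the maxnode value is an assumption about how many bits of the masks the kernel should read.

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
        unsigned long old_nodes = 1UL << 0;     /* migrate away from node 0 */
        unsigned long new_nodes = 1UL << 1;     /* ... onto node 1 */
        long ret;

        ret = syscall(__NR_migrate_pages, getpid(),
                      8 * sizeof(unsigned long) + 1,    /* bits in the masks */
                      &old_nodes, &new_nodes);
        if (ret < 0)
                perror("migrate_pages");
        else
                printf("pages that could not be moved: %ld\n", ret);
        return 0;
}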
605/* Retrieve NUMA policy */ 813/* Retrieve NUMA policy */
606asmlinkage long sys_get_mempolicy(int __user *policy, 814asmlinkage long sys_get_mempolicy(int __user *policy,
607 unsigned long __user *nmask, 815 unsigned long __user *nmask,
@@ -708,8 +916,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
708#endif 916#endif
709 917
710/* Return effective policy for a VMA */ 918/* Return effective policy for a VMA */
711struct mempolicy * 919static struct mempolicy * get_vma_policy(struct task_struct *task,
712get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) 920 struct vm_area_struct *vma, unsigned long addr)
713{ 921{
714 struct mempolicy *pol = task->mempolicy; 922 struct mempolicy *pol = task->mempolicy;
715 923
@@ -768,6 +976,33 @@ static unsigned interleave_nodes(struct mempolicy *policy)
768 return nid; 976 return nid;
769} 977}
770 978
979/*
980 * Depending on the memory policy provide a node from which to allocate the
981 * next slab entry.
982 */
983unsigned slab_node(struct mempolicy *policy)
984{
985 switch (policy->policy) {
986 case MPOL_INTERLEAVE:
987 return interleave_nodes(policy);
988
989 case MPOL_BIND:
990 /*
991 * Follow bind policy behavior and start allocation at the
992 * first node.
993 */
994 return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
995
996 case MPOL_PREFERRED:
997 if (policy->v.preferred_node >= 0)
998 return policy->v.preferred_node;
999 /* Fall through */
1000
1001 default:
1002 return numa_node_id();
1003 }
1004}
1005
771/* Do static interleaving for a VMA with known offset. */ 1006/* Do static interleaving for a VMA with known offset. */
772static unsigned offset_il_node(struct mempolicy *pol, 1007static unsigned offset_il_node(struct mempolicy *pol,
773 struct vm_area_struct *vma, unsigned long off) 1008 struct vm_area_struct *vma, unsigned long off)
@@ -785,6 +1020,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
785 return nid; 1020 return nid;
786} 1021}
787 1022
1023/* Determine a node number for interleave */
1024static inline unsigned interleave_nid(struct mempolicy *pol,
1025 struct vm_area_struct *vma, unsigned long addr, int shift)
1026{
1027 if (vma) {
1028 unsigned long off;
1029
1030 off = vma->vm_pgoff;
1031 off += (addr - vma->vm_start) >> shift;
1032 return offset_il_node(pol, vma, off);
1033 } else
1034 return interleave_nodes(pol);
1035}
1036
1037/* Return a zonelist suitable for a huge page allocation. */
1038struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1039{
1040 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1041
1042 if (pol->policy == MPOL_INTERLEAVE) {
1043 unsigned nid;
1044
1045 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1046 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1047 }
1048 return zonelist_policy(GFP_HIGHUSER, pol);
1049}
1050
788/* Allocate a page in interleaved policy. 1051/* Allocate a page in interleaved policy.
789 Own path because it needs to do special accounting. */ 1052 Own path because it needs to do special accounting. */
790static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 1053static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -829,19 +1092,12 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
829{ 1092{
830 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1093 struct mempolicy *pol = get_vma_policy(current, vma, addr);
831 1094
832 cpuset_update_current_mems_allowed(); 1095 cpuset_update_task_memory_state();
833 1096
834 if (unlikely(pol->policy == MPOL_INTERLEAVE)) { 1097 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
835 unsigned nid; 1098 unsigned nid;
836 if (vma) { 1099
837 unsigned long off; 1100 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
838 off = vma->vm_pgoff;
839 off += (addr - vma->vm_start) >> PAGE_SHIFT;
840 nid = offset_il_node(pol, vma, off);
841 } else {
842 /* fall back to process interleaving */
843 nid = interleave_nodes(pol);
844 }
845 return alloc_page_interleave(gfp, 0, nid); 1101 return alloc_page_interleave(gfp, 0, nid);
846 } 1102 }
847 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); 1103 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
@@ -862,7 +1118,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
862 * interrupt context and apply the current process NUMA policy. 1118 * interrupt context and apply the current process NUMA policy.
863 * Returns NULL when no page can be allocated. 1119 * Returns NULL when no page can be allocated.
864 * 1120 *
865 * Don't call cpuset_update_current_mems_allowed() unless 1121 * Don't call cpuset_update_task_memory_state() unless
866 * 1) it's ok to take cpuset_sem (can WAIT), and 1122 * 1) it's ok to take cpuset_sem (can WAIT), and
867 * 2) allocating for current task (not interrupt). 1123 * 2) allocating for current task (not interrupt).
868 */ 1124 */
@@ -871,7 +1127,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
871 struct mempolicy *pol = current->mempolicy; 1127 struct mempolicy *pol = current->mempolicy;
872 1128
873 if ((gfp & __GFP_WAIT) && !in_interrupt()) 1129 if ((gfp & __GFP_WAIT) && !in_interrupt())
874 cpuset_update_current_mems_allowed(); 1130 cpuset_update_task_memory_state();
875 if (!pol || in_interrupt()) 1131 if (!pol || in_interrupt())
876 pol = &default_policy; 1132 pol = &default_policy;
877 if (pol->policy == MPOL_INTERLEAVE) 1133 if (pol->policy == MPOL_INTERLEAVE)
@@ -880,6 +1136,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
880} 1136}
881EXPORT_SYMBOL(alloc_pages_current); 1137EXPORT_SYMBOL(alloc_pages_current);
882 1138
1139/*
1140 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1141 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1142 * with the mems_allowed returned by cpuset_mems_allowed(). This
1143 * keeps mempolicies cpuset relative after its cpuset moves. See
1144 * further kernel/cpuset.c update_nodemask().
1145 */
1146void *cpuset_being_rebound;
1147
883/* Slow path of a mempolicy copy */ 1148/* Slow path of a mempolicy copy */
884struct mempolicy *__mpol_copy(struct mempolicy *old) 1149struct mempolicy *__mpol_copy(struct mempolicy *old)
885{ 1150{
@@ -887,6 +1152,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
887 1152
888 if (!new) 1153 if (!new)
889 return ERR_PTR(-ENOMEM); 1154 return ERR_PTR(-ENOMEM);
1155 if (current_cpuset_is_being_rebound()) {
1156 nodemask_t mems = cpuset_mems_allowed(current);
1157 mpol_rebind_policy(old, &mems);
1158 }
890 *new = *old; 1159 *new = *old;
891 atomic_set(&new->refcnt, 1); 1160 atomic_set(&new->refcnt, 1);
892 if (new->policy == MPOL_BIND) { 1161 if (new->policy == MPOL_BIND) {
@@ -940,54 +1209,6 @@ void __mpol_free(struct mempolicy *p)
940} 1209}
941 1210
942/* 1211/*
943 * Hugetlb policy. Same as above, just works with node numbers instead of
944 * zonelists.
945 */
946
947/* Find first node suitable for an allocation */
948int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
949{
950 struct mempolicy *pol = get_vma_policy(current, vma, addr);
951
952 switch (pol->policy) {
953 case MPOL_DEFAULT:
954 return numa_node_id();
955 case MPOL_BIND:
956 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
957 case MPOL_INTERLEAVE:
958 return interleave_nodes(pol);
959 case MPOL_PREFERRED:
960 return pol->v.preferred_node >= 0 ?
961 pol->v.preferred_node : numa_node_id();
962 }
963 BUG();
964 return 0;
965}
966
967/* Find secondary valid nodes for an allocation */
968int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
969{
970 struct mempolicy *pol = get_vma_policy(current, vma, addr);
971
972 switch (pol->policy) {
973 case MPOL_PREFERRED:
974 case MPOL_DEFAULT:
975 case MPOL_INTERLEAVE:
976 return 1;
977 case MPOL_BIND: {
978 struct zone **z;
979 for (z = pol->v.zonelist->zones; *z; z++)
980 if ((*z)->zone_pgdat->node_id == nid)
981 return 1;
982 return 0;
983 }
984 default:
985 BUG();
986 return 0;
987 }
988}
989
990/*
991 * Shared memory backing store policy support. 1212 * Shared memory backing store policy support.
992 * 1213 *
993 * Remember policies even when nobody has shared memory mapped. 1214 * Remember policies even when nobody has shared memory mapped.
@@ -1141,6 +1362,30 @@ restart:
1141 return 0; 1362 return 0;
1142} 1363}
1143 1364
1365void mpol_shared_policy_init(struct shared_policy *info, int policy,
1366 nodemask_t *policy_nodes)
1367{
1368 info->root = RB_ROOT;
1369 spin_lock_init(&info->lock);
1370
1371 if (policy != MPOL_DEFAULT) {
1372 struct mempolicy *newpol;
1373
1374 /* Falls back to MPOL_DEFAULT on any error */
1375 newpol = mpol_new(policy, policy_nodes);
1376 if (!IS_ERR(newpol)) {
1377 /* Create pseudo-vma that contains just the policy */
1378 struct vm_area_struct pvma;
1379
1380 memset(&pvma, 0, sizeof(struct vm_area_struct));
1381 /* Policy covers entire file */
1382 pvma.vm_end = TASK_SIZE;
1383 mpol_set_shared_policy(info, &pvma, newpol);
1384 mpol_free(newpol);
1385 }
1386 }
1387}
1388
1144int mpol_set_shared_policy(struct shared_policy *info, 1389int mpol_set_shared_policy(struct shared_policy *info,
1145 struct vm_area_struct *vma, struct mempolicy *npol) 1390 struct vm_area_struct *vma, struct mempolicy *npol)
1146{ 1391{
@@ -1209,25 +1454,31 @@ void numa_default_policy(void)
1209} 1454}
1210 1455
1211/* Migrate a policy to a different set of nodes */ 1456/* Migrate a policy to a different set of nodes */
1212static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, 1457void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1213 const nodemask_t *new)
1214{ 1458{
1459 nodemask_t *mpolmask;
1215 nodemask_t tmp; 1460 nodemask_t tmp;
1216 1461
1217 if (!pol) 1462 if (!pol)
1218 return; 1463 return;
1464 mpolmask = &pol->cpuset_mems_allowed;
1465 if (nodes_equal(*mpolmask, *newmask))
1466 return;
1219 1467
1220 switch (pol->policy) { 1468 switch (pol->policy) {
1221 case MPOL_DEFAULT: 1469 case MPOL_DEFAULT:
1222 break; 1470 break;
1223 case MPOL_INTERLEAVE: 1471 case MPOL_INTERLEAVE:
1224 nodes_remap(tmp, pol->v.nodes, *old, *new); 1472 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1225 pol->v.nodes = tmp; 1473 pol->v.nodes = tmp;
1226 current->il_next = node_remap(current->il_next, *old, *new); 1474 *mpolmask = *newmask;
1475 current->il_next = node_remap(current->il_next,
1476 *mpolmask, *newmask);
1227 break; 1477 break;
1228 case MPOL_PREFERRED: 1478 case MPOL_PREFERRED:
1229 pol->v.preferred_node = node_remap(pol->v.preferred_node, 1479 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1230 *old, *new); 1480 *mpolmask, *newmask);
1481 *mpolmask = *newmask;
1231 break; 1482 break;
1232 case MPOL_BIND: { 1483 case MPOL_BIND: {
1233 nodemask_t nodes; 1484 nodemask_t nodes;
@@ -1237,7 +1488,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1237 nodes_clear(nodes); 1488 nodes_clear(nodes);
1238 for (z = pol->v.zonelist->zones; *z; z++) 1489 for (z = pol->v.zonelist->zones; *z; z++)
1239 node_set((*z)->zone_pgdat->node_id, nodes); 1490 node_set((*z)->zone_pgdat->node_id, nodes);
1240 nodes_remap(tmp, nodes, *old, *new); 1491 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1241 nodes = tmp; 1492 nodes = tmp;
1242 1493
1243 zonelist = bind_zonelist(&nodes); 1494 zonelist = bind_zonelist(&nodes);
@@ -1252,6 +1503,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1252 kfree(pol->v.zonelist); 1503 kfree(pol->v.zonelist);
1253 pol->v.zonelist = zonelist; 1504 pol->v.zonelist = zonelist;
1254 } 1505 }
1506 *mpolmask = *newmask;
1255 break; 1507 break;
1256 } 1508 }
1257 default: 1509 default:
@@ -1261,12 +1513,156 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1261} 1513}
1262 1514
1263/* 1515/*
1264 * Someone moved this task to different nodes. Fixup mempolicies. 1516 * Wrapper for mpol_rebind_policy() that just requires task
1517 * pointer, and updates task mempolicy.
1518 */
1519
1520void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1521{
1522 mpol_rebind_policy(tsk->mempolicy, new);
1523}
1524
1525/*
1526 * Rebind each vma in mm to new nodemask.
1265 * 1527 *
1266 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well, 1528 * Call holding a reference to mm. Takes mm->mmap_sem during call.
1267 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1268 */ 1529 */
1269void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) 1530
1531void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1270{ 1532{
1271 rebind_policy(current->mempolicy, old, new); 1533 struct vm_area_struct *vma;
1534
1535 down_write(&mm->mmap_sem);
1536 for (vma = mm->mmap; vma; vma = vma->vm_next)
1537 mpol_rebind_policy(vma->vm_policy, new);
1538 up_write(&mm->mmap_sem);
1272} 1539}
1540
1541/*
1542 * Display pages allocated per node and memory policy via /proc.
1543 */
1544
1545static const char *policy_types[] = { "default", "prefer", "bind",
1546 "interleave" };
1547
1548/*
1549 * Convert a mempolicy into a string.
1550 * Returns the number of characters in buffer (if positive)
1551 * or an error (negative)
1552 */
1553static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1554{
1555 char *p = buffer;
1556 int l;
1557 nodemask_t nodes;
1558 int mode = pol ? pol->policy : MPOL_DEFAULT;
1559
1560 switch (mode) {
1561 case MPOL_DEFAULT:
1562 nodes_clear(nodes);
1563 break;
1564
1565 case MPOL_PREFERRED:
1566 nodes_clear(nodes);
1567 node_set(pol->v.preferred_node, nodes);
1568 break;
1569
1570 case MPOL_BIND:
1571 get_zonemask(pol, &nodes);
1572 break;
1573
1574 case MPOL_INTERLEAVE:
1575 nodes = pol->v.nodes;
1576 break;
1577
1578 default:
1579 BUG();
1580 return -EFAULT;
1581 }
1582
1583 l = strlen(policy_types[mode]);
1584 if (buffer + maxlen < p + l + 1)
1585 return -ENOSPC;
1586
1587 strcpy(p, policy_types[mode]);
1588 p += l;
1589
1590 if (!nodes_empty(nodes)) {
1591 if (buffer + maxlen < p + 2)
1592 return -ENOSPC;
1593 *p++ = '=';
1594 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1595 }
1596 return p - buffer;
1597}
1598
1599struct numa_maps {
1600 unsigned long pages;
1601 unsigned long anon;
1602 unsigned long mapped;
1603 unsigned long mapcount_max;
1604 unsigned long node[MAX_NUMNODES];
1605};
1606
1607static void gather_stats(struct page *page, void *private)
1608{
1609 struct numa_maps *md = private;
1610 int count = page_mapcount(page);
1611
1612 if (count)
1613 md->mapped++;
1614
1615 if (count > md->mapcount_max)
1616 md->mapcount_max = count;
1617
1618 md->pages++;
1619
1620 if (PageAnon(page))
1621 md->anon++;
1622
1623 md->node[page_to_nid(page)]++;
1624 cond_resched();
1625}
1626
1627int show_numa_map(struct seq_file *m, void *v)
1628{
1629 struct task_struct *task = m->private;
1630 struct vm_area_struct *vma = v;
1631 struct numa_maps *md;
1632 int n;
1633 char buffer[50];
1634
1635 if (!vma->vm_mm)
1636 return 0;
1637
1638 md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1639 if (!md)
1640 return 0;
1641
1642 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1643 &node_online_map, MPOL_MF_STATS, md);
1644
1645 if (md->pages) {
1646 mpol_to_str(buffer, sizeof(buffer),
1647 get_vma_policy(task, vma, vma->vm_start));
1648
1649 seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
1650 vma->vm_start, buffer, md->pages,
1651 md->mapped, md->mapcount_max);
1652
1653 if (md->anon)
1654 seq_printf(m," anon=%lu",md->anon);
1655
1656 for_each_online_node(n)
1657 if (md->node[n])
1658 seq_printf(m, " N%d=%lu", n, md->node[n]);
1659
1660 seq_putc(m, '\n');
1661 }
1662 kfree(md);
1663
1664 if (m->count < m->size)
1665 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1666 return 0;
1667}
1668
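
The mpol_to_str() helper added above produces strings such as "interleave=0-3" for the /proc numa_maps output. A minimal userspace sketch of the same formatting logic, assuming a plain unsigned long in place of nodemask_t and a simple comma-separated node list where the kernel's nodelist_scnprintf() would compress ranges:

#include <stdio.h>

static const char * const policy_types[] =
        { "default", "prefer", "bind", "interleave" };

/* Build "policy=n1,n2,..."; returns the length, or -1 if buf is too small. */
static int policy_to_str(char *buf, size_t maxlen, int mode, unsigned long nodes)
{
        size_t off = snprintf(buf, maxlen, "%s", policy_types[mode]);
        int bit, first = 1;

        for (bit = 0; nodes && off < maxlen; bit++, nodes >>= 1) {
                if (!(nodes & 1UL))
                        continue;
                off += snprintf(buf + off, maxlen - off, "%c%d",
                                first ? '=' : ',', bit);
                first = 0;
        }
        return off < maxlen ? (int)off : -1;    /* -ENOSPC in the kernel version */
}

int main(void)
{
        char buf[64];

        if (policy_to_str(buf, sizeof(buf), 3, 0x0bUL) > 0)
                printf("%s\n", buf);            /* prints: interleave=0,1,3 */
        return 0;
}
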
diff --git a/mm/mlock.c b/mm/mlock.c
index 4ae3a46ff7..b90c59573a 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -5,6 +5,7 @@
5 * (C) Copyright 2002 Christoph Hellwig 5 * (C) Copyright 2002 Christoph Hellwig
6 */ 6 */
7 7
8#include <linux/capability.h>
8#include <linux/mman.h> 9#include <linux/mman.h>
9#include <linux/mm.h> 10#include <linux/mm.h>
10#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
diff --git a/mm/mmap.c b/mm/mmap.c
index 6c997b1596..47556d2b3e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -13,6 +13,7 @@
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/swap.h> 14#include <linux/swap.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/capability.h>
16#include <linux/init.h> 17#include <linux/init.h>
17#include <linux/file.h> 18#include <linux/file.h>
18#include <linux/fs.h> 19#include <linux/fs.h>
@@ -611,7 +612,7 @@ again: remove_next = 1 + (end > next->vm_end);
611 * If the vma has a ->close operation then the driver probably needs to release 612 * If the vma has a ->close operation then the driver probably needs to release
612 * per-vma resources, so we don't attempt to merge those. 613 * per-vma resources, so we don't attempt to merge those.
613 */ 614 */
614#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED) 615#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
615 616
616static inline int is_mergeable_vma(struct vm_area_struct *vma, 617static inline int is_mergeable_vma(struct vm_area_struct *vma,
617 struct file *file, unsigned long vm_flags) 618 struct file *file, unsigned long vm_flags)
@@ -1076,17 +1077,6 @@ munmap_back:
1076 error = file->f_op->mmap(file, vma); 1077 error = file->f_op->mmap(file, vma);
1077 if (error) 1078 if (error)
1078 goto unmap_and_free_vma; 1079 goto unmap_and_free_vma;
1079 if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
1080 == (VM_WRITE | VM_RESERVED)) {
1081 printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
1082 "PROT_WRITE mmap of VM_RESERVED memory, which "
1083 "is deprecated. Please report this to "
1084 "linux-kernel@vger.kernel.org\n",current->comm);
1085 if (vma->vm_ops && vma->vm_ops->close)
1086 vma->vm_ops->close(vma);
1087 error = -EACCES;
1088 goto unmap_and_free_vma;
1089 }
1090 } else if (vm_flags & VM_SHARED) { 1080 } else if (vm_flags & VM_SHARED) {
1091 error = shmem_zero_setup(vma); 1081 error = shmem_zero_setup(vma);
1092 if (error) 1082 if (error)
@@ -1501,7 +1491,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
1501 * PA-RISC uses this for its stack; IA64 for its Register Backing Store. 1491 * PA-RISC uses this for its stack; IA64 for its Register Backing Store.
1502 * vma is the last one with address > vma->vm_end. Have to extend vma. 1492 * vma is the last one with address > vma->vm_end. Have to extend vma.
1503 */ 1493 */
1504#ifdef CONFIG_STACK_GROWSUP 1494#ifndef CONFIG_IA64
1505static inline 1495static inline
1506#endif 1496#endif
1507int expand_upwards(struct vm_area_struct *vma, unsigned long address) 1497int expand_upwards(struct vm_area_struct *vma, unsigned long address)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 17a2b52b75..653b8571c1 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -124,14 +124,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
124 * a MAP_NORESERVE private mapping to writable will now reserve. 124 * a MAP_NORESERVE private mapping to writable will now reserve.
125 */ 125 */
126 if (newflags & VM_WRITE) { 126 if (newflags & VM_WRITE) {
127 if (oldflags & VM_RESERVED) {
128 BUG_ON(oldflags & VM_WRITE);
129 printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
130 "PROT_WRITE mprotect of VM_RESERVED memory, "
131 "which is deprecated. Please report this to "
132 "linux-kernel@vger.kernel.org\n",current->comm);
133 return -EACCES;
134 }
135 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { 127 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
136 charged = nrpages; 128 charged = nrpages;
137 if (security_vm_enough_memory(charged)) 129 if (security_vm_enough_memory(charged))
diff --git a/mm/mremap.c b/mm/mremap.c
index b535438c36..1903bdf65e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -13,6 +13,7 @@
13#include <linux/shm.h> 13#include <linux/shm.h>
14#include <linux/mman.h> 14#include <linux/mman.h>
15#include <linux/swap.h> 15#include <linux/swap.h>
16#include <linux/capability.h>
16#include <linux/fs.h> 17#include <linux/fs.h>
17#include <linux/highmem.h> 18#include <linux/highmem.h>
18#include <linux/security.h> 19#include <linux/security.h>
@@ -323,7 +324,7 @@ unsigned long do_mremap(unsigned long addr,
323 /* We can't remap across vm area boundaries */ 324 /* We can't remap across vm area boundaries */
324 if (old_len > vma->vm_end - addr) 325 if (old_len > vma->vm_end - addr)
325 goto out; 326 goto out;
326 if (vma->vm_flags & VM_DONTEXPAND) { 327 if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) {
327 if (new_len > old_len) 328 if (new_len > old_len)
328 goto out; 329 goto out;
329 } 330 }
diff --git a/mm/msync.c b/mm/msync.c
index 0e040e9c39..3563a56e1a 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -27,7 +27,6 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
27again: 27again:
28 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 28 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
29 do { 29 do {
30 unsigned long pfn;
31 struct page *page; 30 struct page *page;
32 31
33 if (progress >= 64) { 32 if (progress >= 64) {
@@ -40,13 +39,9 @@ again:
40 continue; 39 continue;
41 if (!pte_maybe_dirty(*pte)) 40 if (!pte_maybe_dirty(*pte))
42 continue; 41 continue;
43 pfn = pte_pfn(*pte); 42 page = vm_normal_page(vma, addr, *pte);
44 if (unlikely(!pfn_valid(pfn))) { 43 if (!page)
45 print_bad_pte(vma, *pte, addr);
46 continue; 44 continue;
47 }
48 page = pfn_to_page(pfn);
49
50 if (ptep_clear_flush_dirty(vma, addr, pte) || 45 if (ptep_clear_flush_dirty(vma, addr, pte) ||
51 page_test_and_clear_dirty(page)) 46 page_test_and_clear_dirty(page))
52 set_page_dirty(page); 47 set_page_dirty(page);
@@ -97,9 +92,8 @@ static void msync_page_range(struct vm_area_struct *vma,
97 /* For hugepages we can't go walking the page table normally, 92 /* For hugepages we can't go walking the page table normally,
98 * but that's ok, hugetlbfs is memory based, so we don't need 93 * but that's ok, hugetlbfs is memory based, so we don't need
99 * to do anything more on an msync(). 94 * to do anything more on an msync().
100 * Can't do anything with VM_RESERVED regions either.
101 */ 95 */
102 if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED)) 96 if (vma->vm_flags & VM_HUGETLB)
103 return; 97 return;
104 98
105 BUG_ON(addr >= end); 99 BUG_ON(addr >= end);
@@ -143,7 +137,7 @@ static int msync_interval(struct vm_area_struct *vma,
143 ret = filemap_fdatawrite(mapping); 137 ret = filemap_fdatawrite(mapping);
144 if (file->f_op && file->f_op->fsync) { 138 if (file->f_op && file->f_op->fsync) {
145 /* 139 /*
146 * We don't take i_sem here because mmap_sem 140 * We don't take i_mutex here because mmap_sem
147 * is already held. 141 * is already held.
148 */ 142 */
149 err = file->f_op->fsync(file,file->f_dentry,1); 143 err = file->f_op->fsync(file,file->f_dentry,1);
diff --git a/mm/nommu.c b/mm/nommu.c
index 6deb6ab3d6..c10262d682 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1045,7 +1045,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1045 1045
1046EXPORT_SYMBOL(find_vma); 1046EXPORT_SYMBOL(find_vma);
1047 1047
1048struct page *follow_page(struct mm_struct *mm, unsigned long address, 1048struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1049 unsigned int foll_flags) 1049 unsigned int foll_flags)
1050{ 1050{
1051 return NULL; 1051 return NULL;
@@ -1177,3 +1177,10 @@ int in_gate_area_no_task(unsigned long addr)
1177{ 1177{
1178 return 0; 1178 return 0;
1179} 1179}
1180
1181struct page *filemap_nopage(struct vm_area_struct *area,
1182 unsigned long address, int *type)
1183{
1184 BUG();
1185 return NULL;
1186}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d348b90359..14bd4ec795 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -274,6 +274,7 @@ void out_of_memory(gfp_t gfp_mask, int order)
274 show_mem(); 274 show_mem();
275 } 275 }
276 276
277 cpuset_lock();
277 read_lock(&tasklist_lock); 278 read_lock(&tasklist_lock);
278retry: 279retry:
279 p = select_bad_process(); 280 p = select_bad_process();
@@ -284,6 +285,7 @@ retry:
284 /* Found nothing?!?! Either we hang forever, or we panic. */ 285 /* Found nothing?!?! Either we hang forever, or we panic. */
285 if (!p) { 286 if (!p) {
286 read_unlock(&tasklist_lock); 287 read_unlock(&tasklist_lock);
288 cpuset_unlock();
287 panic("Out of memory and no killable processes...\n"); 289 panic("Out of memory and no killable processes...\n");
288 } 290 }
289 291
@@ -293,12 +295,14 @@ retry:
293 295
294 out: 296 out:
295 read_unlock(&tasklist_lock); 297 read_unlock(&tasklist_lock);
298 cpuset_unlock();
296 if (mm) 299 if (mm)
297 mmput(mm); 300 mmput(mm);
298 301
299 /* 302 /*
300 * Give "p" a good chance of killing itself before we 303 * Give "p" a good chance of killing itself before we
301 * retry to allocate memory. 304 * retry to allocate memory unless "p" is current
302 */ 305 */
303 schedule_timeout_interruptible(1); 306 if (!test_thread_flag(TIF_MEMDIE))
307 schedule_timeout_interruptible(1);
304} 308}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 74138c9a22..945559fb63 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -46,7 +46,7 @@
46static long ratelimit_pages = 32; 46static long ratelimit_pages = 32;
47 47
48static long total_pages; /* The total number of pages in the machine. */ 48static long total_pages; /* The total number of pages in the machine. */
49static int dirty_exceeded; /* Dirty mem may be over limit */ 49static int dirty_exceeded __cacheline_aligned_in_smp; /* Dirty mem may be over limit */
50 50
51/* 51/*
52 * When balance_dirty_pages decides that the caller needs to perform some 52 * When balance_dirty_pages decides that the caller needs to perform some
@@ -212,7 +212,8 @@ static void balance_dirty_pages(struct address_space *mapping)
212 if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) 212 if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
213 break; 213 break;
214 214
215 dirty_exceeded = 1; 215 if (!dirty_exceeded)
216 dirty_exceeded = 1;
216 217
217 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. 218 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
218 * Unstable writes are a feature of certain networked 219 * Unstable writes are a feature of certain networked
@@ -234,7 +235,7 @@ static void balance_dirty_pages(struct address_space *mapping)
234 blk_congestion_wait(WRITE, HZ/10); 235 blk_congestion_wait(WRITE, HZ/10);
235 } 236 }
236 237
237 if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh) 238 if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh && dirty_exceeded)
238 dirty_exceeded = 0; 239 dirty_exceeded = 0;
239 240
240 if (writeback_in_progress(bdi)) 241 if (writeback_in_progress(bdi))
@@ -550,11 +551,17 @@ void __init page_writeback_init(void)
550 551
551int do_writepages(struct address_space *mapping, struct writeback_control *wbc) 552int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
552{ 553{
554 int ret;
555
553 if (wbc->nr_to_write <= 0) 556 if (wbc->nr_to_write <= 0)
554 return 0; 557 return 0;
558 wbc->for_writepages = 1;
555 if (mapping->a_ops->writepages) 559 if (mapping->a_ops->writepages)
556 return mapping->a_ops->writepages(mapping, wbc); 560 ret = mapping->a_ops->writepages(mapping, wbc);
557 return generic_writepages(mapping, wbc); 561 else
562 ret = generic_writepages(mapping, wbc);
563 wbc->for_writepages = 0;
564 return ret;
558} 565}
559 566
560/** 567/**
@@ -750,6 +757,7 @@ int clear_page_dirty_for_io(struct page *page)
750 } 757 }
751 return TestClearPageDirty(page); 758 return TestClearPageDirty(page);
752} 759}
760EXPORT_SYMBOL(clear_page_dirty_for_io);
753 761
754int test_clear_page_writeback(struct page *page) 762int test_clear_page_writeback(struct page *page)
755{ 763{
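
The do_writepages() change above brackets the dispatch with wbc->for_writepages so a ->writepages implementation can tell a writeback pass from a single writepage call. A stand-alone sketch of that shape (the struct and callback here are stand-ins, not the kernel types):

#include <stdio.h>

struct writeback_control {
        long nr_to_write;
        int for_writepages;
};

typedef int (*writepages_fn)(struct writeback_control *wbc);

static int generic_writepages_stub(struct writeback_control *wbc)
{
        printf("generic path, for_writepages=%d\n", wbc->for_writepages);
        return 0;
}

/* Same control flow as the new do_writepages(): flag, dispatch, unflag. */
static int do_writepages_sketch(struct writeback_control *wbc, writepages_fn op)
{
        int ret;

        if (wbc->nr_to_write <= 0)
                return 0;
        wbc->for_writepages = 1;
        ret = op ? op(wbc) : generic_writepages_stub(wbc);
        wbc->for_writepages = 0;
        return ret;
}

int main(void)
{
        struct writeback_control wbc = { .nr_to_write = 16, .for_writepages = 0 };

        return do_writepages_sketch(&wbc, NULL);
}
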
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 987225bdd6..df54e2fc8e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -36,6 +36,7 @@
36#include <linux/memory_hotplug.h> 36#include <linux/memory_hotplug.h>
37#include <linux/nodemask.h> 37#include <linux/nodemask.h>
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/mempolicy.h>
39 40
40#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
41#include "internal.h" 42#include "internal.h"
@@ -52,6 +53,9 @@ struct pglist_data *pgdat_list __read_mostly;
52unsigned long totalram_pages __read_mostly; 53unsigned long totalram_pages __read_mostly;
53unsigned long totalhigh_pages __read_mostly; 54unsigned long totalhigh_pages __read_mostly;
54long nr_swap_pages; 55long nr_swap_pages;
56int percpu_pagelist_fraction;
57
58static void fastcall free_hot_cold_page(struct page *page, int cold);
55 59
56/* 60/*
57 * results with 256, 32 in the lowmem_reserve sysctl: 61 * results with 256, 32 in the lowmem_reserve sysctl:
@@ -60,8 +64,11 @@ long nr_swap_pages;
60 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 64 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
61 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 65 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
62 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA 66 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
67 *
68 * TBD: should special case ZONE_DMA32 machines here - in those we normally
69 * don't need any ZONE_NORMAL reservation
63 */ 70 */
64int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; 71int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
65 72
66EXPORT_SYMBOL(totalram_pages); 73EXPORT_SYMBOL(totalram_pages);
67 74
@@ -72,12 +79,13 @@ EXPORT_SYMBOL(totalram_pages);
72struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; 79struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
73EXPORT_SYMBOL(zone_table); 80EXPORT_SYMBOL(zone_table);
74 81
75static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; 82static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
76int min_free_kbytes = 1024; 83int min_free_kbytes = 1024;
77 84
78unsigned long __initdata nr_kernel_pages; 85unsigned long __initdata nr_kernel_pages;
79unsigned long __initdata nr_all_pages; 86unsigned long __initdata nr_all_pages;
80 87
88#ifdef CONFIG_DEBUG_VM
81static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 89static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
82{ 90{
83 int ret = 0; 91 int ret = 0;
@@ -119,16 +127,23 @@ static int bad_range(struct zone *zone, struct page *page)
119 return 0; 127 return 0;
120} 128}
121 129
122static void bad_page(const char *function, struct page *page) 130#else
131static inline int bad_range(struct zone *zone, struct page *page)
132{
133 return 0;
134}
135#endif
136
137static void bad_page(struct page *page)
123{ 138{
124 printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", 139 printk(KERN_EMERG "Bad page state in process '%s'\n"
125 function, current->comm, page); 140 KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
126 printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 141 KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
127 (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags, 142 KERN_EMERG "Backtrace:\n",
128 page->mapping, page_mapcount(page), page_count(page)); 143 current->comm, page, (int)(2*sizeof(unsigned long)),
129 printk(KERN_EMERG "Backtrace:\n"); 144 (unsigned long)page->flags, page->mapping,
145 page_mapcount(page), page_count(page));
130 dump_stack(); 146 dump_stack();
131 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
132 page->flags &= ~(1 << PG_lru | 147 page->flags &= ~(1 << PG_lru |
133 1 << PG_private | 148 1 << PG_private |
134 1 << PG_locked | 149 1 << PG_locked |
@@ -137,18 +152,13 @@ static void bad_page(const char *function, struct page *page)
137 1 << PG_reclaim | 152 1 << PG_reclaim |
138 1 << PG_slab | 153 1 << PG_slab |
139 1 << PG_swapcache | 154 1 << PG_swapcache |
140 1 << PG_writeback | 155 1 << PG_writeback );
141 1 << PG_reserved );
142 set_page_count(page, 0); 156 set_page_count(page, 0);
143 reset_page_mapcount(page); 157 reset_page_mapcount(page);
144 page->mapping = NULL; 158 page->mapping = NULL;
145 add_taint(TAINT_BAD_PAGE); 159 add_taint(TAINT_BAD_PAGE);
146} 160}
147 161
148#ifndef CONFIG_HUGETLB_PAGE
149#define prep_compound_page(page, order) do { } while (0)
150#define destroy_compound_page(page, order) do { } while (0)
151#else
152/* 162/*
153 * Higher-order pages are called "compound pages". They are structured thusly: 163 * Higher-order pages are called "compound pages". They are structured thusly:
154 * 164 *
@@ -186,23 +196,18 @@ static void destroy_compound_page(struct page *page, unsigned long order)
186 int i; 196 int i;
187 int nr_pages = 1 << order; 197 int nr_pages = 1 << order;
188 198
189 if (!PageCompound(page)) 199 if (unlikely(page[1].index != order))
190 return; 200 bad_page(page);
191
192 if (page[1].index != order)
193 bad_page(__FUNCTION__, page);
194 201
195 for (i = 0; i < nr_pages; i++) { 202 for (i = 0; i < nr_pages; i++) {
196 struct page *p = page + i; 203 struct page *p = page + i;
197 204
198 if (!PageCompound(p)) 205 if (unlikely(!PageCompound(p) |
199 bad_page(__FUNCTION__, page); 206 (page_private(p) != (unsigned long)page)))
200 if (page_private(p) != (unsigned long)page) 207 bad_page(page);
201 bad_page(__FUNCTION__, page);
202 ClearPageCompound(p); 208 ClearPageCompound(p);
203 } 209 }
204} 210}
205#endif /* CONFIG_HUGETLB_PAGE */
206 211
207/* 212/*
208 * function for dealing with page's order in buddy system. 213 * function for dealing with page's order in buddy system.
@@ -258,14 +263,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
258/* 263/*
259 * This function checks whether a page is free && is the buddy 264 * This function checks whether a page is free && is the buddy
260 * we can do coalesce a page and its buddy if 265 * we can do coalesce a page and its buddy if
261 * (a) the buddy is free && 266 * (a) the buddy is not in a hole &&
262 * (b) the buddy is on the buddy system && 267 * (b) the buddy is free &&
263 * (c) a page and its buddy have the same order. 268 * (c) the buddy is on the buddy system &&
269 * (d) a page and its buddy have the same order.
264 * for recording page's order, we use page_private(page) and PG_private. 270 * for recording page's order, we use page_private(page) and PG_private.
265 * 271 *
266 */ 272 */
267static inline int page_is_buddy(struct page *page, int order) 273static inline int page_is_buddy(struct page *page, int order)
268{ 274{
275#ifdef CONFIG_HOLES_IN_ZONE
276 if (!pfn_valid(page_to_pfn(page)))
277 return 0;
278#endif
279
269 if (PagePrivate(page) && 280 if (PagePrivate(page) &&
270 (page_order(page) == order) && 281 (page_order(page) == order) &&
271 page_count(page) == 0) 282 page_count(page) == 0)
@@ -297,13 +308,13 @@ static inline int page_is_buddy(struct page *page, int order)
297 * -- wli 308 * -- wli
298 */ 309 */
299 310
300static inline void __free_pages_bulk (struct page *page, 311static inline void __free_one_page(struct page *page,
301 struct zone *zone, unsigned int order) 312 struct zone *zone, unsigned int order)
302{ 313{
303 unsigned long page_idx; 314 unsigned long page_idx;
304 int order_size = 1 << order; 315 int order_size = 1 << order;
305 316
306 if (unlikely(order)) 317 if (unlikely(PageCompound(page)))
307 destroy_compound_page(page, order); 318 destroy_compound_page(page, order);
308 319
309 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 320 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
@@ -317,17 +328,15 @@ static inline void __free_pages_bulk (struct page *page,
317 struct free_area *area; 328 struct free_area *area;
318 struct page *buddy; 329 struct page *buddy;
319 330
320 combined_idx = __find_combined_index(page_idx, order);
321 buddy = __page_find_buddy(page, page_idx, order); 331 buddy = __page_find_buddy(page, page_idx, order);
322
323 if (bad_range(zone, buddy))
324 break;
325 if (!page_is_buddy(buddy, order)) 332 if (!page_is_buddy(buddy, order))
326 break; /* Move the buddy up one level. */ 333 break; /* Move the buddy up one level. */
334
327 list_del(&buddy->lru); 335 list_del(&buddy->lru);
328 area = zone->free_area + order; 336 area = zone->free_area + order;
329 area->nr_free--; 337 area->nr_free--;
330 rmv_page_order(buddy); 338 rmv_page_order(buddy);
339 combined_idx = __find_combined_index(page_idx, order);
331 page = page + (combined_idx - page_idx); 340 page = page + (combined_idx - page_idx);
332 page_idx = combined_idx; 341 page_idx = combined_idx;
333 order++; 342 order++;
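
The coalescing loop above leans on two index identities hidden in __page_find_buddy() and __find_combined_index(): at order o, a block's buddy differs from it only in bit o of the page index, and the merged order o+1 block starts at the lower of the two. A small stand-alone illustration of that arithmetic:

#include <stdio.h>

/* index of the buddy of the block starting at page_idx, at a given order */
static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
        return page_idx ^ (1UL << order);
}

/* start of the order+1 block formed by merging a block with its buddy */
static unsigned long combined_index(unsigned long page_idx, unsigned int order)
{
        return page_idx & ~(1UL << order);
}

int main(void)
{
        /* order-2 block at page 12: its buddy starts at 8, the merged block at 8 */
        printf("buddy=%lu combined=%lu\n",
               buddy_index(12, 2), combined_index(12, 2));
        return 0;
}
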
@@ -337,11 +346,11 @@ static inline void __free_pages_bulk (struct page *page,
337 zone->free_area[order].nr_free++; 346 zone->free_area[order].nr_free++;
338} 347}
339 348
340static inline void free_pages_check(const char *function, struct page *page) 349static inline int free_pages_check(struct page *page)
341{ 350{
342 if ( page_mapcount(page) || 351 if (unlikely(page_mapcount(page) |
343 page->mapping != NULL || 352 (page->mapping != NULL) |
344 page_count(page) != 0 || 353 (page_count(page) != 0) |
345 (page->flags & ( 354 (page->flags & (
346 1 << PG_lru | 355 1 << PG_lru |
347 1 << PG_private | 356 1 << PG_private |
@@ -351,10 +360,16 @@ static inline void free_pages_check(const char *function, struct page *page)
351 1 << PG_slab | 360 1 << PG_slab |
352 1 << PG_swapcache | 361 1 << PG_swapcache |
353 1 << PG_writeback | 362 1 << PG_writeback |
354 1 << PG_reserved ))) 363 1 << PG_reserved ))))
355 bad_page(function, page); 364 bad_page(page);
356 if (PageDirty(page)) 365 if (PageDirty(page))
357 __ClearPageDirty(page); 366 __ClearPageDirty(page);
367 /*
368 * For now, we report if PG_reserved was found set, but do not
369 * clear it, and do not free the page. But we shall soon need
370 * to do more, for when the ZERO_PAGE count wraps negative.
371 */
372 return PageReserved(page);
358} 373}
359 374
360/* 375/*
@@ -368,48 +383,90 @@ static inline void free_pages_check(const char *function, struct page *page)
368 * And clear the zone's pages_scanned counter, to hold off the "all pages are 383 * And clear the zone's pages_scanned counter, to hold off the "all pages are
369 * pinned" detection logic. 384 * pinned" detection logic.
370 */ 385 */
371static int 386static void free_pages_bulk(struct zone *zone, int count,
372free_pages_bulk(struct zone *zone, int count, 387 struct list_head *list, int order)
373 struct list_head *list, unsigned int order)
374{ 388{
375 unsigned long flags; 389 spin_lock(&zone->lock);
376 struct page *page = NULL;
377 int ret = 0;
378
379 spin_lock_irqsave(&zone->lock, flags);
380 zone->all_unreclaimable = 0; 390 zone->all_unreclaimable = 0;
381 zone->pages_scanned = 0; 391 zone->pages_scanned = 0;
382 while (!list_empty(list) && count--) { 392 while (count--) {
393 struct page *page;
394
395 BUG_ON(list_empty(list));
383 page = list_entry(list->prev, struct page, lru); 396 page = list_entry(list->prev, struct page, lru);
384 /* have to delete it as __free_pages_bulk list manipulates */ 397 /* have to delete it as __free_one_page list manipulates */
385 list_del(&page->lru); 398 list_del(&page->lru);
386 __free_pages_bulk(page, zone, order); 399 __free_one_page(page, zone, order);
387 ret++;
388 } 400 }
389 spin_unlock_irqrestore(&zone->lock, flags); 401 spin_unlock(&zone->lock);
390 return ret;
391} 402}
392 403
393void __free_pages_ok(struct page *page, unsigned int order) 404static void free_one_page(struct zone *zone, struct page *page, int order)
394{ 405{
395 LIST_HEAD(list); 406 LIST_HEAD(list);
407 list_add(&page->lru, &list);
408 free_pages_bulk(zone, 1, &list, order);
409}
410
411static void __free_pages_ok(struct page *page, unsigned int order)
412{
413 unsigned long flags;
396 int i; 414 int i;
415 int reserved = 0;
397 416
398 arch_free_page(page, order); 417 arch_free_page(page, order);
399 418 if (!PageHighMem(page))
400 mod_page_state(pgfree, 1 << order); 419 mutex_debug_check_no_locks_freed(page_address(page),
420 PAGE_SIZE<<order);
401 421
402#ifndef CONFIG_MMU 422#ifndef CONFIG_MMU
403 if (order > 0) 423 for (i = 1 ; i < (1 << order) ; ++i)
404 for (i = 1 ; i < (1 << order) ; ++i) 424 __put_page(page + i);
405 __put_page(page + i);
406#endif 425#endif
407 426
408 for (i = 0 ; i < (1 << order) ; ++i) 427 for (i = 0 ; i < (1 << order) ; ++i)
409 free_pages_check(__FUNCTION__, page + i); 428 reserved += free_pages_check(page + i);
410 list_add(&page->lru, &list); 429 if (reserved)
411 kernel_map_pages(page, 1<<order, 0); 430 return;
412 free_pages_bulk(page_zone(page), 1, &list, order); 431
432 kernel_map_pages(page, 1 << order, 0);
433 local_irq_save(flags);
434 __mod_page_state(pgfree, 1 << order);
435 free_one_page(page_zone(page), page, order);
436 local_irq_restore(flags);
437}
438
439/*
440 * permit the bootmem allocator to evade page validation on high-order frees
441 */
442void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
443{
444 if (order == 0) {
445 __ClearPageReserved(page);
446 set_page_count(page, 0);
447
448 free_hot_cold_page(page, 0);
449 } else {
450 LIST_HEAD(list);
451 int loop;
452
453 for (loop = 0; loop < BITS_PER_LONG; loop++) {
454 struct page *p = &page[loop];
455
456 if (loop + 16 < BITS_PER_LONG)
457 prefetchw(p + 16);
458 __ClearPageReserved(p);
459 set_page_count(p, 0);
460 }
461
462 arch_free_page(page, order);
463
464 mod_page_state(pgfree, 1 << order);
465
466 list_add(&page->lru, &list);
467 kernel_map_pages(page, 1 << order, 0);
468 free_pages_bulk(page_zone(page), 1, &list, order);
469 }
413} 470}
414 471
415 472
@@ -427,8 +484,7 @@ void __free_pages_ok(struct page *page, unsigned int order)
427 * 484 *
428 * -- wli 485 * -- wli
429 */ 486 */
430static inline struct page * 487static inline void expand(struct zone *zone, struct page *page,
431expand(struct zone *zone, struct page *page,
432 int low, int high, struct free_area *area) 488 int low, int high, struct free_area *area)
433{ 489{
434 unsigned long size = 1 << high; 490 unsigned long size = 1 << high;
@@ -442,34 +498,16 @@ expand(struct zone *zone, struct page *page,
442 area->nr_free++; 498 area->nr_free++;
443 set_page_order(&page[size], high); 499 set_page_order(&page[size], high);
444 } 500 }
445 return page;
446}
447
448void set_page_refs(struct page *page, int order)
449{
450#ifdef CONFIG_MMU
451 set_page_count(page, 1);
452#else
453 int i;
454
455 /*
456 * We need to reference all the pages for this order, otherwise if
457 * anyone accesses one of the pages with (get/put) it will be freed.
458 * - eg: access_process_vm()
459 */
460 for (i = 0; i < (1 << order); i++)
461 set_page_count(page + i, 1);
462#endif /* CONFIG_MMU */
463} 501}
464 502
465/* 503/*
466 * This page is about to be returned from the page allocator 504 * This page is about to be returned from the page allocator
467 */ 505 */
468static void prep_new_page(struct page *page, int order) 506static int prep_new_page(struct page *page, int order)
469{ 507{
470 if ( page_mapcount(page) || 508 if (unlikely(page_mapcount(page) |
471 page->mapping != NULL || 509 (page->mapping != NULL) |
472 page_count(page) != 0 || 510 (page_count(page) != 0) |
473 (page->flags & ( 511 (page->flags & (
474 1 << PG_lru | 512 1 << PG_lru |
475 1 << PG_private | 513 1 << PG_private |
@@ -480,8 +518,15 @@ static void prep_new_page(struct page *page, int order)
480 1 << PG_slab | 518 1 << PG_slab |
481 1 << PG_swapcache | 519 1 << PG_swapcache |
482 1 << PG_writeback | 520 1 << PG_writeback |
483 1 << PG_reserved ))) 521 1 << PG_reserved ))))
484 bad_page(__FUNCTION__, page); 522 bad_page(page);
523
524 /*
525 * For now, we report if PG_reserved was found set, but do not
526 * clear it, and do not allocate the page: as a safety net.
527 */
528 if (PageReserved(page))
529 return 1;
485 530
486 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 531 page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
487 1 << PG_referenced | 1 << PG_arch_1 | 532 1 << PG_referenced | 1 << PG_arch_1 |
@@ -489,6 +534,7 @@ static void prep_new_page(struct page *page, int order)
489 set_page_private(page, 0); 534 set_page_private(page, 0);
490 set_page_refs(page, order); 535 set_page_refs(page, order);
491 kernel_map_pages(page, 1 << order, 1); 536 kernel_map_pages(page, 1 << order, 1);
537 return 0;
492} 538}
493 539
494/* 540/*
@@ -511,7 +557,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
511 rmv_page_order(page); 557 rmv_page_order(page);
512 area->nr_free--; 558 area->nr_free--;
513 zone->free_pages -= 1UL << order; 559 zone->free_pages -= 1UL << order;
514 return expand(zone, page, order, current_order, area); 560 expand(zone, page, order, current_order, area);
561 return page;
515 } 562 }
516 563
517 return NULL; 564 return NULL;
@@ -525,21 +572,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
525static int rmqueue_bulk(struct zone *zone, unsigned int order, 572static int rmqueue_bulk(struct zone *zone, unsigned int order,
526 unsigned long count, struct list_head *list) 573 unsigned long count, struct list_head *list)
527{ 574{
528 unsigned long flags;
529 int i; 575 int i;
530 int allocated = 0;
531 struct page *page;
532 576
533 spin_lock_irqsave(&zone->lock, flags); 577 spin_lock(&zone->lock);
534 for (i = 0; i < count; ++i) { 578 for (i = 0; i < count; ++i) {
535 page = __rmqueue(zone, order); 579 struct page *page = __rmqueue(zone, order);
536 if (page == NULL) 580 if (unlikely(page == NULL))
537 break; 581 break;
538 allocated++;
539 list_add_tail(&page->lru, list); 582 list_add_tail(&page->lru, list);
540 } 583 }
541 spin_unlock_irqrestore(&zone->lock, flags); 584 spin_unlock(&zone->lock);
542 return allocated; 585 return i;
543} 586}
544 587
545#ifdef CONFIG_NUMA 588#ifdef CONFIG_NUMA
@@ -558,14 +601,13 @@ void drain_remote_pages(void)
558 if (zone->zone_pgdat->node_id == numa_node_id()) 601 if (zone->zone_pgdat->node_id == numa_node_id())
559 continue; 602 continue;
560 603
561 pset = zone->pageset[smp_processor_id()]; 604 pset = zone_pcp(zone, smp_processor_id());
562 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 605 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
563 struct per_cpu_pages *pcp; 606 struct per_cpu_pages *pcp;
564 607
565 pcp = &pset->pcp[i]; 608 pcp = &pset->pcp[i];
566 if (pcp->count) 609 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
567 pcp->count -= free_pages_bulk(zone, pcp->count, 610 pcp->count = 0;
568 &pcp->list, 0);
569 } 611 }
570 } 612 }
571 local_irq_restore(flags); 613 local_irq_restore(flags);
@@ -575,6 +617,7 @@ void drain_remote_pages(void)
575#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 617#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
576static void __drain_pages(unsigned int cpu) 618static void __drain_pages(unsigned int cpu)
577{ 619{
620 unsigned long flags;
578 struct zone *zone; 621 struct zone *zone;
579 int i; 622 int i;
580 623
@@ -586,8 +629,10 @@ static void __drain_pages(unsigned int cpu)
586 struct per_cpu_pages *pcp; 629 struct per_cpu_pages *pcp;
587 630
588 pcp = &pset->pcp[i]; 631 pcp = &pset->pcp[i];
589 pcp->count -= free_pages_bulk(zone, pcp->count, 632 local_irq_save(flags);
590 &pcp->list, 0); 633 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
634 pcp->count = 0;
635 local_irq_restore(flags);
591 } 636 }
592 } 637 }
593} 638}
@@ -633,18 +678,14 @@ void drain_local_pages(void)
633} 678}
634#endif /* CONFIG_PM */ 679#endif /* CONFIG_PM */
635 680
636static void zone_statistics(struct zonelist *zonelist, struct zone *z) 681static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
637{ 682{
638#ifdef CONFIG_NUMA 683#ifdef CONFIG_NUMA
639 unsigned long flags;
640 int cpu;
641 pg_data_t *pg = z->zone_pgdat; 684 pg_data_t *pg = z->zone_pgdat;
642 pg_data_t *orig = zonelist->zones[0]->zone_pgdat; 685 pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
643 struct per_cpu_pageset *p; 686 struct per_cpu_pageset *p;
644 687
645 local_irq_save(flags); 688 p = zone_pcp(z, cpu);
646 cpu = smp_processor_id();
647 p = zone_pcp(z,cpu);
648 if (pg == orig) { 689 if (pg == orig) {
649 p->numa_hit++; 690 p->numa_hit++;
650 } else { 691 } else {
@@ -655,14 +696,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
655 p->local_node++; 696 p->local_node++;
656 else 697 else
657 p->other_node++; 698 p->other_node++;
658 local_irq_restore(flags);
659#endif 699#endif
660} 700}
661 701
662/* 702/*
663 * Free a 0-order page 703 * Free a 0-order page
664 */ 704 */
665static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
666static void fastcall free_hot_cold_page(struct page *page, int cold) 705static void fastcall free_hot_cold_page(struct page *page, int cold)
667{ 706{
668 struct zone *zone = page_zone(page); 707 struct zone *zone = page_zone(page);
@@ -671,17 +710,22 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
671 710
672 arch_free_page(page, 0); 711 arch_free_page(page, 0);
673 712
674 kernel_map_pages(page, 1, 0);
675 inc_page_state(pgfree);
676 if (PageAnon(page)) 713 if (PageAnon(page))
677 page->mapping = NULL; 714 page->mapping = NULL;
678 free_pages_check(__FUNCTION__, page); 715 if (free_pages_check(page))
716 return;
717
718 kernel_map_pages(page, 1, 0);
719
679 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 720 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
680 local_irq_save(flags); 721 local_irq_save(flags);
722 __inc_page_state(pgfree);
681 list_add(&page->lru, &pcp->list); 723 list_add(&page->lru, &pcp->list);
682 pcp->count++; 724 pcp->count++;
683 if (pcp->count >= pcp->high) 725 if (pcp->count >= pcp->high) {
684 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 726 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
727 pcp->count -= pcp->batch;
728 }
685 local_irq_restore(flags); 729 local_irq_restore(flags);
686 put_cpu(); 730 put_cpu();
687} 731}
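
free_hot_cold_page() above and the reworked buffered_rmqueue() in the next hunk form a pair: order-0 frees push onto a per-cpu list and spill pcp->batch pages back to the buddy lists once pcp->high is reached, while order-0 allocations pop from that list and refill it with one batched rmqueue_bulk() call when it runs dry. A toy single-CPU model of that batching (ints stand in for pages, a small array for the list):

#include <stdio.h>

#define PCP_HIGH  8
#define PCP_BATCH 4

static int pcp[PCP_HIGH];               /* per-cpu cache of free "pages" */
static int pcp_count;
static int buddy_free = 64;             /* pages left in the buddy lists */

static void free_page_toy(int page)
{
        pcp[pcp_count++] = page;
        if (pcp_count >= PCP_HIGH) {    /* spill a batch back to the buddy */
                pcp_count -= PCP_BATCH;
                buddy_free += PCP_BATCH;
        }
}

static int alloc_page_toy(void)
{
        int i, n;

        if (!pcp_count) {               /* refill with one batched request */
                n = buddy_free < PCP_BATCH ? buddy_free : PCP_BATCH;
                for (i = 0; i < n; i++)
                        pcp[pcp_count++] = 1000 + --buddy_free;
                if (!pcp_count)
                        return -1;      /* the "failed:" path */
        }
        return pcp[--pcp_count];
}

int main(void)
{
        int a = alloc_page_toy();
        int b = alloc_page_toy();

        printf("%d %d\n", a, b);
        free_page_toy(42);
        printf("%d\n", alloc_page_toy());       /* 42 comes straight back */
        return 0;
}
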
@@ -710,64 +754,82 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
710 * we cheat by calling it from here, in the order > 0 path. Saves a branch 754 * we cheat by calling it from here, in the order > 0 path. Saves a branch
711 * or two. 755 * or two.
712 */ 756 */
713static struct page * 757static struct page *buffered_rmqueue(struct zonelist *zonelist,
714buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) 758 struct zone *zone, int order, gfp_t gfp_flags)
715{ 759{
716 unsigned long flags; 760 unsigned long flags;
717 struct page *page = NULL; 761 struct page *page;
718 int cold = !!(gfp_flags & __GFP_COLD); 762 int cold = !!(gfp_flags & __GFP_COLD);
763 int cpu;
719 764
720 if (order == 0) { 765again:
766 cpu = get_cpu();
767 if (likely(order == 0)) {
721 struct per_cpu_pages *pcp; 768 struct per_cpu_pages *pcp;
722 769
723 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 770 pcp = &zone_pcp(zone, cpu)->pcp[cold];
724 local_irq_save(flags); 771 local_irq_save(flags);
725 if (pcp->count <= pcp->low) 772 if (!pcp->count) {
726 pcp->count += rmqueue_bulk(zone, 0, 773 pcp->count += rmqueue_bulk(zone, 0,
727 pcp->batch, &pcp->list); 774 pcp->batch, &pcp->list);
728 if (pcp->count) { 775 if (unlikely(!pcp->count))
729 page = list_entry(pcp->list.next, struct page, lru); 776 goto failed;
730 list_del(&page->lru);
731 pcp->count--;
732 } 777 }
733 local_irq_restore(flags); 778 page = list_entry(pcp->list.next, struct page, lru);
734 put_cpu(); 779 list_del(&page->lru);
735 } 780 pcp->count--;
736 781 } else {
737 if (page == NULL) {
738 spin_lock_irqsave(&zone->lock, flags); 782 spin_lock_irqsave(&zone->lock, flags);
739 page = __rmqueue(zone, order); 783 page = __rmqueue(zone, order);
740 spin_unlock_irqrestore(&zone->lock, flags); 784 spin_unlock(&zone->lock);
785 if (!page)
786 goto failed;
741 } 787 }
742 788
743 if (page != NULL) { 789 __mod_page_state_zone(zone, pgalloc, 1 << order);
744 BUG_ON(bad_range(zone, page)); 790 zone_statistics(zonelist, zone, cpu);
745 mod_page_state_zone(zone, pgalloc, 1 << order); 791 local_irq_restore(flags);
746 prep_new_page(page, order); 792 put_cpu();
793
794 BUG_ON(bad_range(zone, page));
795 if (prep_new_page(page, order))
796 goto again;
747 797
748 if (gfp_flags & __GFP_ZERO) 798 if (gfp_flags & __GFP_ZERO)
749 prep_zero_page(page, order, gfp_flags); 799 prep_zero_page(page, order, gfp_flags);
750 800
751 if (order && (gfp_flags & __GFP_COMP)) 801 if (order && (gfp_flags & __GFP_COMP))
752 prep_compound_page(page, order); 802 prep_compound_page(page, order);
753 }
754 return page; 803 return page;
804
805failed:
806 local_irq_restore(flags);
807 put_cpu();
808 return NULL;
755} 809}
756 810
811#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
812#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */
813#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */
814#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
815#define ALLOC_HARDER 0x10 /* try to alloc harder */
816#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
817#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
818
757/* 819/*
758 * Return 1 if free pages are above 'mark'. This takes into account the order 820 * Return 1 if free pages are above 'mark'. This takes into account the order
759 * of the allocation. 821 * of the allocation.
760 */ 822 */
761int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 823int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
762 int classzone_idx, int can_try_harder, gfp_t gfp_high) 824 int classzone_idx, int alloc_flags)
763{ 825{
764 /* free_pages may go negative - that's OK */ 826 /* free_pages may go negative - that's OK */
765 long min = mark, free_pages = z->free_pages - (1 << order) + 1; 827 long min = mark, free_pages = z->free_pages - (1 << order) + 1;
766 int o; 828 int o;
767 829
768 if (gfp_high) 830 if (alloc_flags & ALLOC_HIGH)
769 min -= min / 2; 831 min -= min / 2;
770 if (can_try_harder) 832 if (alloc_flags & ALLOC_HARDER)
771 min -= min / 4; 833 min -= min / 4;
772 834
773 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 835 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
@@ -785,14 +847,48 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
785 return 1; 847 return 1;
786} 848}
787 849
788static inline int 850/*
789should_reclaim_zone(struct zone *z, gfp_t gfp_mask) 851 * get_page_from_freelist goes through the zonelist trying to allocate
852 * a page.
853 */
854static struct page *
855get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
856 struct zonelist *zonelist, int alloc_flags)
790{ 857{
791 if (!z->reclaim_pages) 858 struct zone **z = zonelist->zones;
792 return 0; 859 struct page *page = NULL;
793 if (gfp_mask & __GFP_NORECLAIM) 860 int classzone_idx = zone_idx(*z);
794 return 0; 861
795 return 1; 862 /*
863 * Go through the zonelist once, looking for a zone with enough free.
864 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
865 */
866 do {
867 if ((alloc_flags & ALLOC_CPUSET) &&
868 !cpuset_zone_allowed(*z, gfp_mask))
869 continue;
870
871 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
872 unsigned long mark;
873 if (alloc_flags & ALLOC_WMARK_MIN)
874 mark = (*z)->pages_min;
875 else if (alloc_flags & ALLOC_WMARK_LOW)
876 mark = (*z)->pages_low;
877 else
878 mark = (*z)->pages_high;
879 if (!zone_watermark_ok(*z, order, mark,
880 classzone_idx, alloc_flags))
881 if (!zone_reclaim_mode ||
882 !zone_reclaim(*z, gfp_mask, order))
883 continue;
884 }
885
886 page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
887 if (page) {
888 break;
889 }
890 } while (*(++z) != NULL);
891 return page;
796} 892}
797 893
798/* 894/*
@@ -803,105 +899,76 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
803 struct zonelist *zonelist) 899 struct zonelist *zonelist)
804{ 900{
805 const gfp_t wait = gfp_mask & __GFP_WAIT; 901 const gfp_t wait = gfp_mask & __GFP_WAIT;
806 struct zone **zones, *z; 902 struct zone **z;
807 struct page *page; 903 struct page *page;
808 struct reclaim_state reclaim_state; 904 struct reclaim_state reclaim_state;
809 struct task_struct *p = current; 905 struct task_struct *p = current;
810 int i;
811 int classzone_idx;
812 int do_retry; 906 int do_retry;
813 int can_try_harder; 907 int alloc_flags;
814 int did_some_progress; 908 int did_some_progress;
815 909
816 might_sleep_if(wait); 910 might_sleep_if(wait);
817 911
818 /* 912restart:
819 * The caller may dip into page reserves a bit more if the caller 913 z = zonelist->zones; /* the list of zones suitable for gfp_mask */
820 * cannot run direct reclaim, or is the caller has realtime scheduling
821 * policy
822 */
823 can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
824
825 zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
826 914
827 if (unlikely(zones[0] == NULL)) { 915 if (unlikely(*z == NULL)) {
828 /* Should this ever happen?? */ 916 /* Should this ever happen?? */
829 return NULL; 917 return NULL;
830 } 918 }
831 919
832 classzone_idx = zone_idx(zones[0]); 920 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
921 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
922 if (page)
923 goto got_pg;
924
925 do {
926 wakeup_kswapd(*z, order);
927 } while (*(++z));
833 928
834restart:
835 /* 929 /*
836 * Go through the zonelist once, looking for a zone with enough free. 930 * OK, we're below the kswapd watermark and have kicked background
837 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 931 * reclaim. Now things get more complex, so set up alloc_flags according
932 * to how we want to proceed.
933 *
934 * The caller may dip into page reserves a bit more if the caller
935 * cannot run direct reclaim, or if the caller has realtime scheduling
936 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
937 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
838 */ 938 */
839 for (i = 0; (z = zones[i]) != NULL; i++) { 939 alloc_flags = ALLOC_WMARK_MIN;
840 int do_reclaim = should_reclaim_zone(z, gfp_mask); 940 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
841 941 alloc_flags |= ALLOC_HARDER;
842 if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) 942 if (gfp_mask & __GFP_HIGH)
843 continue; 943 alloc_flags |= ALLOC_HIGH;
844 944 alloc_flags |= ALLOC_CPUSET;
845 /*
846 * If the zone is to attempt early page reclaim then this loop
847 * will try to reclaim pages and check the watermark a second
848 * time before giving up and falling back to the next zone.
849 */
850zone_reclaim_retry:
851 if (!zone_watermark_ok(z, order, z->pages_low,
852 classzone_idx, 0, 0)) {
853 if (!do_reclaim)
854 continue;
855 else {
856 zone_reclaim(z, gfp_mask, order);
857 /* Only try reclaim once */
858 do_reclaim = 0;
859 goto zone_reclaim_retry;
860 }
861 }
862
863 page = buffered_rmqueue(z, order, gfp_mask);
864 if (page)
865 goto got_pg;
866 }
867
868 for (i = 0; (z = zones[i]) != NULL; i++)
869 wakeup_kswapd(z, order);
870 945
871 /* 946 /*
872 * Go through the zonelist again. Let __GFP_HIGH and allocations 947 * Go through the zonelist again. Let __GFP_HIGH and allocations
873 * coming from realtime tasks to go deeper into reserves 948 * coming from realtime tasks go deeper into reserves.
874 * 949 *
875 * This is the last chance, in general, before the goto nopage. 950 * This is the last chance, in general, before the goto nopage.
876 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 951 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
877 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 952 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
878 */ 953 */
879 for (i = 0; (z = zones[i]) != NULL; i++) { 954 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
880 if (!zone_watermark_ok(z, order, z->pages_min, 955 if (page)
881 classzone_idx, can_try_harder, 956 goto got_pg;
882 gfp_mask & __GFP_HIGH))
883 continue;
884
885 if (wait && !cpuset_zone_allowed(z, gfp_mask))
886 continue;
887
888 page = buffered_rmqueue(z, order, gfp_mask);
889 if (page)
890 goto got_pg;
891 }
892 957
893 /* This allocation should allow future memory freeing. */ 958 /* This allocation should allow future memory freeing. */
894 959
895 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 960 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
896 && !in_interrupt()) { 961 && !in_interrupt()) {
897 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 962 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
963nofail_alloc:
898 /* go through the zonelist yet again, ignoring mins */ 964 /* go through the zonelist yet again, ignoring mins */
899 for (i = 0; (z = zones[i]) != NULL; i++) { 965 page = get_page_from_freelist(gfp_mask, order,
900 if (!cpuset_zone_allowed(z, gfp_mask)) 966 zonelist, ALLOC_NO_WATERMARKS);
901 continue; 967 if (page)
902 page = buffered_rmqueue(z, order, gfp_mask); 968 goto got_pg;
903 if (page) 969 if (gfp_mask & __GFP_NOFAIL) {
904 goto got_pg; 970 blk_congestion_wait(WRITE, HZ/50);
971 goto nofail_alloc;
905 } 972 }
906 } 973 }
907 goto nopage; 974 goto nopage;
@@ -915,11 +982,12 @@ rebalance:
915 cond_resched(); 982 cond_resched();
916 983
917 /* We now go into synchronous reclaim */ 984 /* We now go into synchronous reclaim */
985 cpuset_memory_pressure_bump();
918 p->flags |= PF_MEMALLOC; 986 p->flags |= PF_MEMALLOC;
919 reclaim_state.reclaimed_slab = 0; 987 reclaim_state.reclaimed_slab = 0;
920 p->reclaim_state = &reclaim_state; 988 p->reclaim_state = &reclaim_state;
921 989
922 did_some_progress = try_to_free_pages(zones, gfp_mask); 990 did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
923 991
924 p->reclaim_state = NULL; 992 p->reclaim_state = NULL;
925 p->flags &= ~PF_MEMALLOC; 993 p->flags &= ~PF_MEMALLOC;
@@ -927,19 +995,10 @@ rebalance:
927 cond_resched(); 995 cond_resched();
928 996
929 if (likely(did_some_progress)) { 997 if (likely(did_some_progress)) {
930 for (i = 0; (z = zones[i]) != NULL; i++) { 998 page = get_page_from_freelist(gfp_mask, order,
931 if (!zone_watermark_ok(z, order, z->pages_min, 999 zonelist, alloc_flags);
932 classzone_idx, can_try_harder, 1000 if (page)
933 gfp_mask & __GFP_HIGH)) 1001 goto got_pg;
934 continue;
935
936 if (!cpuset_zone_allowed(z, gfp_mask))
937 continue;
938
939 page = buffered_rmqueue(z, order, gfp_mask);
940 if (page)
941 goto got_pg;
942 }
943 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 1002 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
944 /* 1003 /*
945 * Go through the zonelist yet one more time, keep 1004 * Go through the zonelist yet one more time, keep
@@ -947,18 +1006,10 @@ rebalance:
947 * a parallel oom killing, we must fail if we're still 1006 * a parallel oom killing, we must fail if we're still
948 * under heavy pressure. 1007 * under heavy pressure.
949 */ 1008 */
950 for (i = 0; (z = zones[i]) != NULL; i++) { 1009 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
951 if (!zone_watermark_ok(z, order, z->pages_high, 1010 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
952 classzone_idx, 0, 0)) 1011 if (page)
953 continue; 1012 goto got_pg;
954
955 if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
956 continue;
957
958 page = buffered_rmqueue(z, order, gfp_mask);
959 if (page)
960 goto got_pg;
961 }
962 1013
963 out_of_memory(gfp_mask, order); 1014 out_of_memory(gfp_mask, order);
964 goto restart; 1015 goto restart;
@@ -991,9 +1042,7 @@ nopage:
991 dump_stack(); 1042 dump_stack();
992 show_mem(); 1043 show_mem();
993 } 1044 }
994 return NULL;
995got_pg: 1045got_pg:
996 zone_statistics(zonelist, z);
997 return page; 1046 return page;
998} 1047}
999 1048
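
Every allocation path above now funnels its watermark decision through zone_watermark_ok(), whose arithmetic is easy to miss across the hunks: ALLOC_HIGH halves the mark, ALLOC_HARDER shaves another quarter off, and the per-order loop (elided between the hunks) keeps discounting blocks below the requested order while halving the mark at each step. A stand-alone sketch with the zone fields passed in as plain parameters:

#include <stdio.h>
#include <stdbool.h>

#define ALLOC_HARDER 0x10               /* mirrors the new alloc_flags */
#define ALLOC_HIGH   0x20

static bool watermark_ok(long free_pages, const long nr_free_at[],
                         int order, long mark, long lowmem_reserve, int flags)
{
        long min = mark;
        int o;

        free_pages -= (1L << order) - 1;        /* the request itself */
        if (flags & ALLOC_HIGH)
                min -= min / 2;
        if (flags & ALLOC_HARDER)
                min -= min / 4;

        if (free_pages <= min + lowmem_reserve)
                return false;
        for (o = 0; o < order; o++) {
                /* at order o+1, free blocks of order o no longer help ... */
                free_pages -= nr_free_at[o] << o;
                min >>= 1;                      /* ... but less is required */
                if (free_pages <= min)
                        return false;
        }
        return true;
}

int main(void)
{
        /* free blocks per order: 100 at order 0, 10 at order 1, 4 at order 2 */
        const long nr_free[] = { 100, 10, 4 };
        long total = 100 + 10 * 2 + 4 * 4;      /* 136 free pages in the zone */

        printf("normal request: %d\n", watermark_ok(total, nr_free, 2, 128, 0, 0));
        printf("atomic request: %d\n", watermark_ok(total, nr_free, 2, 128, 0,
                                                    ALLOC_HIGH | ALLOC_HARDER));
        return 0;
}

With the same 136 free pages, the plain order-2 request fails at the order-0 step while the ALLOC_HIGH|ALLOC_HARDER one gets through, which is the extra depth into reserves that the __alloc_pages() comments above describe.
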
@@ -1160,7 +1209,7 @@ EXPORT_SYMBOL(nr_pagecache);
1160DEFINE_PER_CPU(long, nr_pagecache_local) = 0; 1209DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1161#endif 1210#endif
1162 1211
1163void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) 1212static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1164{ 1213{
1165 int cpu = 0; 1214 int cpu = 0;
1166 1215
@@ -1213,7 +1262,7 @@ void get_full_page_state(struct page_state *ret)
1213 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); 1262 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
1214} 1263}
1215 1264
1216unsigned long __read_page_state(unsigned long offset) 1265unsigned long read_page_state_offset(unsigned long offset)
1217{ 1266{
1218 unsigned long ret = 0; 1267 unsigned long ret = 0;
1219 int cpu; 1268 int cpu;
@@ -1227,18 +1276,26 @@ unsigned long __read_page_state(unsigned long offset)
1227 return ret; 1276 return ret;
1228} 1277}
1229 1278
1230void __mod_page_state(unsigned long offset, unsigned long delta) 1279void __mod_page_state_offset(unsigned long offset, unsigned long delta)
1280{
1281 void *ptr;
1282
1283 ptr = &__get_cpu_var(page_states);
1284 *(unsigned long *)(ptr + offset) += delta;
1285}
1286EXPORT_SYMBOL(__mod_page_state_offset);
1287
1288void mod_page_state_offset(unsigned long offset, unsigned long delta)
1231{ 1289{
1232 unsigned long flags; 1290 unsigned long flags;
1233 void* ptr; 1291 void *ptr;
1234 1292
1235 local_irq_save(flags); 1293 local_irq_save(flags);
1236 ptr = &__get_cpu_var(page_states); 1294 ptr = &__get_cpu_var(page_states);
1237 *(unsigned long*)(ptr + offset) += delta; 1295 *(unsigned long *)(ptr + offset) += delta;
1238 local_irq_restore(flags); 1296 local_irq_restore(flags);
1239} 1297}
1240 1298EXPORT_SYMBOL(mod_page_state_offset);
1241EXPORT_SYMBOL(__mod_page_state);
1242 1299
1243void __get_zone_counts(unsigned long *active, unsigned long *inactive, 1300void __get_zone_counts(unsigned long *active, unsigned long *inactive,
1244 unsigned long *free, struct pglist_data *pgdat) 1301 unsigned long *free, struct pglist_data *pgdat)
@@ -1324,7 +1381,7 @@ void show_free_areas(void)
1324 show_node(zone); 1381 show_node(zone);
1325 printk("%s per-cpu:", zone->name); 1382 printk("%s per-cpu:", zone->name);
1326 1383
1327 if (!zone->present_pages) { 1384 if (!populated_zone(zone)) {
1328 printk(" empty\n"); 1385 printk(" empty\n");
1329 continue; 1386 continue;
1330 } else 1387 } else
@@ -1336,10 +1393,9 @@ void show_free_areas(void)
1336 pageset = zone_pcp(zone, cpu); 1393 pageset = zone_pcp(zone, cpu);
1337 1394
1338 for (temperature = 0; temperature < 2; temperature++) 1395 for (temperature = 0; temperature < 2; temperature++)
1339 printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", 1396 printk("cpu %d %s: high %d, batch %d used:%d\n",
1340 cpu, 1397 cpu,
1341 temperature ? "cold" : "hot", 1398 temperature ? "cold" : "hot",
1342 pageset->pcp[temperature].low,
1343 pageset->pcp[temperature].high, 1399 pageset->pcp[temperature].high,
1344 pageset->pcp[temperature].batch, 1400 pageset->pcp[temperature].batch,
1345 pageset->pcp[temperature].count); 1401 pageset->pcp[temperature].count);
@@ -1402,7 +1458,7 @@ void show_free_areas(void)
1402 1458
1403 show_node(zone); 1459 show_node(zone);
1404 printk("%s: ", zone->name); 1460 printk("%s: ", zone->name);
1405 if (!zone->present_pages) { 1461 if (!populated_zone(zone)) {
1406 printk("empty\n"); 1462 printk("empty\n");
1407 continue; 1463 continue;
1408 } 1464 }
@@ -1422,32 +1478,29 @@ void show_free_areas(void)
1422 1478
1423/* 1479/*
1424 * Builds allocation fallback zone lists. 1480 * Builds allocation fallback zone lists.
1481 *
1482 * Add all populated zones of a node to the zonelist.
1425 */ 1483 */
1426static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) 1484static int __init build_zonelists_node(pg_data_t *pgdat,
1427{ 1485 struct zonelist *zonelist, int nr_zones, int zone_type)
1428 switch (k) { 1486{
1429 struct zone *zone; 1487 struct zone *zone;
1430 default: 1488
1431 BUG(); 1489 BUG_ON(zone_type > ZONE_HIGHMEM);
1432 case ZONE_HIGHMEM: 1490
1433 zone = pgdat->node_zones + ZONE_HIGHMEM; 1491 do {
1434 if (zone->present_pages) { 1492 zone = pgdat->node_zones + zone_type;
1493 if (populated_zone(zone)) {
1435#ifndef CONFIG_HIGHMEM 1494#ifndef CONFIG_HIGHMEM
1436 BUG(); 1495 BUG_ON(zone_type > ZONE_NORMAL);
1437#endif 1496#endif
1438 zonelist->zones[j++] = zone; 1497 zonelist->zones[nr_zones++] = zone;
1498 check_highest_zone(zone_type);
1439 } 1499 }
1440 case ZONE_NORMAL: 1500 zone_type--;
1441 zone = pgdat->node_zones + ZONE_NORMAL;
1442 if (zone->present_pages)
1443 zonelist->zones[j++] = zone;
1444 case ZONE_DMA:
1445 zone = pgdat->node_zones + ZONE_DMA;
1446 if (zone->present_pages)
1447 zonelist->zones[j++] = zone;
1448 }
1449 1501
1450 return j; 1502 } while (zone_type >= 0);
1503 return nr_zones;
1451} 1504}
1452 1505
1453static inline int highest_zone(int zone_bits) 1506static inline int highest_zone(int zone_bits)
@@ -1455,6 +1508,8 @@ static inline int highest_zone(int zone_bits)
1455 int res = ZONE_NORMAL; 1508 int res = ZONE_NORMAL;
1456 if (zone_bits & (__force int)__GFP_HIGHMEM) 1509 if (zone_bits & (__force int)__GFP_HIGHMEM)
1457 res = ZONE_HIGHMEM; 1510 res = ZONE_HIGHMEM;
1511 if (zone_bits & (__force int)__GFP_DMA32)
1512 res = ZONE_DMA32;
1458 if (zone_bits & (__force int)__GFP_DMA) 1513 if (zone_bits & (__force int)__GFP_DMA)
1459 res = ZONE_DMA; 1514 res = ZONE_DMA;
1460 return res; 1515 return res;
@@ -1542,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat)
1542 prev_node = local_node; 1597 prev_node = local_node;
1543 nodes_clear(used_mask); 1598 nodes_clear(used_mask);
1544 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 1599 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
1600 int distance = node_distance(local_node, node);
1601
1602 /*
1603 * If another node is sufficiently far away then it is better
1604 * to reclaim pages in a zone before going off node.
1605 */
1606 if (distance > RECLAIM_DISTANCE)
1607 zone_reclaim_mode = 1;
1608
1545 /* 1609 /*
1546 * We don't want to pressure a particular node. 1610 * We don't want to pressure a particular node.
1547 * So adding penalty to the first node in same 1611 * So adding penalty to the first node in same
1548 * distance group to make it round-robin. 1612 * distance group to make it round-robin.
1549 */ 1613 */
1550 if (node_distance(local_node, node) != 1614
1551 node_distance(local_node, prev_node)) 1615 if (distance != node_distance(local_node, prev_node))
1552 node_load[node] += load; 1616 node_load[node] += load;
1553 prev_node = node; 1617 prev_node = node;
1554 load--; 1618 load--;
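The new distance check in the hunk above turns on zone_reclaim_mode as soon as any other node sits farther away than RECLAIM_DISTANCE, so a zone is reclaimed locally before allocations spill to a distant node. A toy model of that decision; the distance table and the RECLAIM_DISTANCE value below are invented for the example.

#include <stdio.h>

#define RECLAIM_DISTANCE 20              /* assumed threshold, for illustration only */

static int node_distance(int a, int b)   /* toy SLIT-style distance table */
{
	static const int d[2][2] = { {10, 30}, {30, 10} };
	return d[a][b];
}

int main(void)
{
	int zone_reclaim_mode = 0, local_node = 0, node;

	for (node = 0; node < 2; node++)
		if (node_distance(local_node, node) > RECLAIM_DISTANCE)
			zone_reclaim_mode = 1;   /* prefer local reclaim over remote allocation */

	printf("zone_reclaim_mode = %d\n", zone_reclaim_mode);   /* prints 1 here */
	return 0;
}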
@@ -1682,18 +1746,16 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1682 * up by free_all_bootmem() once the early boot process is 1746 * up by free_all_bootmem() once the early boot process is
1683 * done. Non-atomic initialization, single-pass. 1747 * done. Non-atomic initialization, single-pass.
1684 */ 1748 */
1685void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1749void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1686 unsigned long start_pfn) 1750 unsigned long start_pfn)
1687{ 1751{
1688 struct page *page; 1752 struct page *page;
1689 unsigned long end_pfn = start_pfn + size; 1753 unsigned long end_pfn = start_pfn + size;
1690 unsigned long pfn; 1754 unsigned long pfn;
1691 1755
1692 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { 1756 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1693 if (!early_pfn_valid(pfn)) 1757 if (!early_pfn_valid(pfn))
1694 continue; 1758 continue;
1695 if (!early_pfn_in_nid(pfn, nid))
1696 continue;
1697 page = pfn_to_page(pfn); 1759 page = pfn_to_page(pfn);
1698 set_page_links(page, zone, nid, pfn); 1760 set_page_links(page, zone, nid, pfn);
1699 set_page_count(page, 1); 1761 set_page_count(page, 1);
@@ -1737,7 +1799,7 @@ void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
1737 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1799 memmap_init_zone((size), (nid), (zone), (start_pfn))
1738#endif 1800#endif
1739 1801
1740static int __devinit zone_batchsize(struct zone *zone) 1802static int __meminit zone_batchsize(struct zone *zone)
1741{ 1803{
1742 int batch; 1804 int batch;
1743 1805
@@ -1755,16 +1817,16 @@ static int __devinit zone_batchsize(struct zone *zone)
1755 batch = 1; 1817 batch = 1;
1756 1818
1757 /* 1819 /*
1758 * We will be trying to allcoate bigger chunks of contiguous 1820 * Clamp the batch to a 2^n - 1 value. Having a power
1759 * memory of the order of fls(batch). This should result in 1821 * of 2 value was found to be more likely to have
1760 * better cache coloring. 1822 * suboptimal cache aliasing properties in some cases.
1761 * 1823 *
1762 * A sanity check also to ensure that batch is still in limits. 1824 * For example if 2 tasks are alternately allocating
1825 * batches of pages, one task can end up with a lot
1826 * of pages of one half of the possible page colors
1827 * and the other with pages of the other colors.
1763 */ 1828 */
1764 batch = (1 << fls(batch + batch/2)); 1829 batch = (1 << (fls(batch + batch/2)-1)) - 1;
1765
1766 if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
1767 batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2);
1768 1830
1769 return batch; 1831 return batch;
1770} 1832}
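A small worked example of the new clamp in zone_batchsize(): the candidate batch is rounded to a power of two and then reduced by one, so the per-cpu batch ends up at 2^n - 1 and two CPUs allocating in lockstep no longer split the page colours evenly between them. fls() is reimplemented here so the sketch builds on its own; the input values are arbitrary.

#include <stdio.h>

static int fls(unsigned int x)           /* highest set bit, 1-based, as in the kernel */
{
	int r = 0;
	while (x) { r++; x >>= 1; }
	return r;
}

int main(void)
{
	unsigned int batch;

	for (batch = 4; batch <= 32; batch *= 2) {
		unsigned int clamped = (1u << (fls(batch + batch / 2) - 1)) - 1;
		printf("batch %2u -> %2u\n", batch, clamped);
	}
	/* prints 4 -> 3, 8 -> 7, 16 -> 15, 32 -> 31 */
	return 0;
}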
@@ -1777,19 +1839,35 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1777 1839
1778 pcp = &p->pcp[0]; /* hot */ 1840 pcp = &p->pcp[0]; /* hot */
1779 pcp->count = 0; 1841 pcp->count = 0;
1780 pcp->low = 0;
1781 pcp->high = 6 * batch; 1842 pcp->high = 6 * batch;
1782 pcp->batch = max(1UL, 1 * batch); 1843 pcp->batch = max(1UL, 1 * batch);
1783 INIT_LIST_HEAD(&pcp->list); 1844 INIT_LIST_HEAD(&pcp->list);
1784 1845
1785 pcp = &p->pcp[1]; /* cold*/ 1846 pcp = &p->pcp[1]; /* cold*/
1786 pcp->count = 0; 1847 pcp->count = 0;
1787 pcp->low = 0;
1788 pcp->high = 2 * batch; 1848 pcp->high = 2 * batch;
1789 pcp->batch = max(1UL, batch/2); 1849 pcp->batch = max(1UL, batch/2);
1790 INIT_LIST_HEAD(&pcp->list); 1850 INIT_LIST_HEAD(&pcp->list);
1791} 1851}
1792 1852
1853/*
1854 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
1855 * to the value high for the pageset p.
1856 */
1857
1858static void setup_pagelist_highmark(struct per_cpu_pageset *p,
1859 unsigned long high)
1860{
1861 struct per_cpu_pages *pcp;
1862
1863 pcp = &p->pcp[0]; /* hot list */
1864 pcp->high = high;
1865 pcp->batch = max(1UL, high/4);
1866 if ((high/4) > (PAGE_SHIFT * 8))
1867 pcp->batch = PAGE_SHIFT * 8;
1868}
1869
1870
1793#ifdef CONFIG_NUMA 1871#ifdef CONFIG_NUMA
1794/* 1872/*
1795 * Boot pageset table. One per cpu which is going to be used for all 1873 * Boot pageset table. One per cpu which is going to be used for all
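For the default sizing in setup_pageset() above: with pcp->low gone, the hot list now drains when it reaches 6 * batch pages and the cold list at 2 * batch, with the cold list refilling in half-sized batches. A standalone sketch with a toy pageset struct; the field names mirror the hunk, everything else is invented.

#include <stdio.h>

struct toy_pcp { unsigned long count, high, batch; };

static void toy_setup_pageset(struct toy_pcp pcp[2], unsigned long batch)
{
	pcp[0].count = 0;                                /* hot */
	pcp[0].high  = 6 * batch;
	pcp[0].batch = batch > 1 ? batch : 1;
	pcp[1].count = 0;                                /* cold */
	pcp[1].high  = 2 * batch;
	pcp[1].batch = batch / 2 > 1 ? batch / 2 : 1;
}

int main(void)
{
	struct toy_pcp pcp[2];

	toy_setup_pageset(pcp, 31);   /* e.g. the 2^n - 1 batch from the earlier sketch */
	printf("hot: high %lu batch %lu, cold: high %lu batch %lu\n",
	       pcp[0].high, pcp[0].batch, pcp[1].high, pcp[1].batch);
	return 0;
}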
@@ -1815,18 +1893,22 @@ static struct per_cpu_pageset
1815 * Dynamically allocate memory for the 1893 * Dynamically allocate memory for the
1816 * per cpu pageset array in struct zone. 1894 * per cpu pageset array in struct zone.
1817 */ 1895 */
1818static int __devinit process_zones(int cpu) 1896static int __meminit process_zones(int cpu)
1819{ 1897{
1820 struct zone *zone, *dzone; 1898 struct zone *zone, *dzone;
1821 1899
1822 for_each_zone(zone) { 1900 for_each_zone(zone) {
1823 1901
1824 zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), 1902 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
1825 GFP_KERNEL, cpu_to_node(cpu)); 1903 GFP_KERNEL, cpu_to_node(cpu));
1826 if (!zone->pageset[cpu]) 1904 if (!zone_pcp(zone, cpu))
1827 goto bad; 1905 goto bad;
1828 1906
1829 setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); 1907 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
1908
1909 if (percpu_pagelist_fraction)
1910 setup_pagelist_highmark(zone_pcp(zone, cpu),
1911 (zone->present_pages / percpu_pagelist_fraction));
1830 } 1912 }
1831 1913
1832 return 0; 1914 return 0;
@@ -1834,15 +1916,14 @@ bad:
1834 for_each_zone(dzone) { 1916 for_each_zone(dzone) {
1835 if (dzone == zone) 1917 if (dzone == zone)
1836 break; 1918 break;
1837 kfree(dzone->pageset[cpu]); 1919 kfree(zone_pcp(dzone, cpu));
1838 dzone->pageset[cpu] = NULL; 1920 zone_pcp(dzone, cpu) = NULL;
1839 } 1921 }
1840 return -ENOMEM; 1922 return -ENOMEM;
1841} 1923}
1842 1924
1843static inline void free_zone_pagesets(int cpu) 1925static inline void free_zone_pagesets(int cpu)
1844{ 1926{
1845#ifdef CONFIG_NUMA
1846 struct zone *zone; 1927 struct zone *zone;
1847 1928
1848 for_each_zone(zone) { 1929 for_each_zone(zone) {
@@ -1851,10 +1932,9 @@ static inline void free_zone_pagesets(int cpu)
1851 zone_pcp(zone, cpu) = NULL; 1932 zone_pcp(zone, cpu) = NULL;
1852 kfree(pset); 1933 kfree(pset);
1853 } 1934 }
1854#endif
1855} 1935}
1856 1936
1857static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, 1937static int __meminit pageset_cpuup_callback(struct notifier_block *nfb,
1858 unsigned long action, 1938 unsigned long action,
1859 void *hcpu) 1939 void *hcpu)
1860{ 1940{
@@ -1866,11 +1946,10 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
1866 if (process_zones(cpu)) 1946 if (process_zones(cpu))
1867 ret = NOTIFY_BAD; 1947 ret = NOTIFY_BAD;
1868 break; 1948 break;
1869#ifdef CONFIG_HOTPLUG_CPU 1949 case CPU_UP_CANCELED:
1870 case CPU_DEAD: 1950 case CPU_DEAD:
1871 free_zone_pagesets(cpu); 1951 free_zone_pagesets(cpu);
1872 break; 1952 break;
1873#endif
1874 default: 1953 default:
1875 break; 1954 break;
1876 } 1955 }
@@ -1880,7 +1959,7 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
1880static struct notifier_block pageset_notifier = 1959static struct notifier_block pageset_notifier =
1881 { &pageset_cpuup_callback, NULL, 0 }; 1960 { &pageset_cpuup_callback, NULL, 0 };
1882 1961
1883void __init setup_per_cpu_pageset() 1962void __init setup_per_cpu_pageset(void)
1884{ 1963{
1885 int err; 1964 int err;
1886 1965
@@ -1895,7 +1974,7 @@ void __init setup_per_cpu_pageset()
1895 1974
1896#endif 1975#endif
1897 1976
1898static __devinit 1977static __meminit
1899void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 1978void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
1900{ 1979{
1901 int i; 1980 int i;
@@ -1915,7 +1994,7 @@ void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
1915 init_waitqueue_head(zone->wait_table + i); 1994 init_waitqueue_head(zone->wait_table + i);
1916} 1995}
1917 1996
1918static __devinit void zone_pcp_init(struct zone *zone) 1997static __meminit void zone_pcp_init(struct zone *zone)
1919{ 1998{
1920 int cpu; 1999 int cpu;
1921 unsigned long batch = zone_batchsize(zone); 2000 unsigned long batch = zone_batchsize(zone);
@@ -1923,7 +2002,7 @@ static __devinit void zone_pcp_init(struct zone *zone)
1923 for (cpu = 0; cpu < NR_CPUS; cpu++) { 2002 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1924#ifdef CONFIG_NUMA 2003#ifdef CONFIG_NUMA
1925 /* Early boot. Slab allocator not functional yet */ 2004 /* Early boot. Slab allocator not functional yet */
1926 zone->pageset[cpu] = &boot_pageset[cpu]; 2005 zone_pcp(zone, cpu) = &boot_pageset[cpu];
1927 setup_pageset(&boot_pageset[cpu],0); 2006 setup_pageset(&boot_pageset[cpu],0);
1928#else 2007#else
1929 setup_pageset(zone_pcp(zone,cpu), batch); 2008 setup_pageset(zone_pcp(zone,cpu), batch);
@@ -1933,7 +2012,7 @@ static __devinit void zone_pcp_init(struct zone *zone)
1933 zone->name, zone->present_pages, batch); 2012 zone->name, zone->present_pages, batch);
1934} 2013}
1935 2014
1936static __devinit void init_currently_empty_zone(struct zone *zone, 2015static __meminit void init_currently_empty_zone(struct zone *zone,
1937 unsigned long zone_start_pfn, unsigned long size) 2016 unsigned long zone_start_pfn, unsigned long size)
1938{ 2017{
1939 struct pglist_data *pgdat = zone->zone_pgdat; 2018 struct pglist_data *pgdat = zone->zone_pgdat;
@@ -1975,7 +2054,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1975 if (zholes_size) 2054 if (zholes_size)
1976 realsize -= zholes_size[j]; 2055 realsize -= zholes_size[j];
1977 2056
1978 if (j == ZONE_DMA || j == ZONE_NORMAL) 2057 if (j < ZONE_HIGHMEM)
1979 nr_kernel_pages += realsize; 2058 nr_kernel_pages += realsize;
1980 nr_all_pages += realsize; 2059 nr_all_pages += realsize;
1981 2060
@@ -2100,7 +2179,7 @@ static int frag_show(struct seq_file *m, void *arg)
2100 int order; 2179 int order;
2101 2180
2102 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 2181 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2103 if (!zone->present_pages) 2182 if (!populated_zone(zone))
2104 continue; 2183 continue;
2105 2184
2106 spin_lock_irqsave(&zone->lock, flags); 2185 spin_lock_irqsave(&zone->lock, flags);
@@ -2133,7 +2212,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2133 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { 2212 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
2134 int i; 2213 int i;
2135 2214
2136 if (!zone->present_pages) 2215 if (!populated_zone(zone))
2137 continue; 2216 continue;
2138 2217
2139 spin_lock_irqsave(&zone->lock, flags); 2218 spin_lock_irqsave(&zone->lock, flags);
@@ -2166,7 +2245,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2166 seq_printf(m, 2245 seq_printf(m,
2167 ")" 2246 ")"
2168 "\n pagesets"); 2247 "\n pagesets");
2169 for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { 2248 for_each_online_cpu(i) {
2170 struct per_cpu_pageset *pageset; 2249 struct per_cpu_pageset *pageset;
2171 int j; 2250 int j;
2172 2251
@@ -2181,12 +2260,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2181 seq_printf(m, 2260 seq_printf(m,
2182 "\n cpu: %i pcp: %i" 2261 "\n cpu: %i pcp: %i"
2183 "\n count: %i" 2262 "\n count: %i"
2184 "\n low: %i"
2185 "\n high: %i" 2263 "\n high: %i"
2186 "\n batch: %i", 2264 "\n batch: %i",
2187 i, j, 2265 i, j,
2188 pageset->pcp[j].count, 2266 pageset->pcp[j].count,
2189 pageset->pcp[j].low,
2190 pageset->pcp[j].high, 2267 pageset->pcp[j].high,
2191 pageset->pcp[j].batch); 2268 pageset->pcp[j].batch);
2192 } 2269 }
@@ -2241,32 +2318,40 @@ static char *vmstat_text[] = {
2241 "pgpgout", 2318 "pgpgout",
2242 "pswpin", 2319 "pswpin",
2243 "pswpout", 2320 "pswpout",
2244 "pgalloc_high",
2245 2321
2322 "pgalloc_high",
2246 "pgalloc_normal", 2323 "pgalloc_normal",
2324 "pgalloc_dma32",
2247 "pgalloc_dma", 2325 "pgalloc_dma",
2326
2248 "pgfree", 2327 "pgfree",
2249 "pgactivate", 2328 "pgactivate",
2250 "pgdeactivate", 2329 "pgdeactivate",
2251 2330
2252 "pgfault", 2331 "pgfault",
2253 "pgmajfault", 2332 "pgmajfault",
2333
2254 "pgrefill_high", 2334 "pgrefill_high",
2255 "pgrefill_normal", 2335 "pgrefill_normal",
2336 "pgrefill_dma32",
2256 "pgrefill_dma", 2337 "pgrefill_dma",
2257 2338
2258 "pgsteal_high", 2339 "pgsteal_high",
2259 "pgsteal_normal", 2340 "pgsteal_normal",
2341 "pgsteal_dma32",
2260 "pgsteal_dma", 2342 "pgsteal_dma",
2343
2261 "pgscan_kswapd_high", 2344 "pgscan_kswapd_high",
2262 "pgscan_kswapd_normal", 2345 "pgscan_kswapd_normal",
2263 2346 "pgscan_kswapd_dma32",
2264 "pgscan_kswapd_dma", 2347 "pgscan_kswapd_dma",
2348
2265 "pgscan_direct_high", 2349 "pgscan_direct_high",
2266 "pgscan_direct_normal", 2350 "pgscan_direct_normal",
2351 "pgscan_direct_dma32",
2267 "pgscan_direct_dma", 2352 "pgscan_direct_dma",
2268 "pginodesteal",
2269 2353
2354 "pginodesteal",
2270 "slabs_scanned", 2355 "slabs_scanned",
2271 "kswapd_steal", 2356 "kswapd_steal",
2272 "kswapd_inodesteal", 2357 "kswapd_inodesteal",
@@ -2417,13 +2502,18 @@ void setup_per_zone_pages_min(void)
2417 } 2502 }
2418 2503
2419 for_each_zone(zone) { 2504 for_each_zone(zone) {
2505 unsigned long tmp;
2420 spin_lock_irqsave(&zone->lru_lock, flags); 2506 spin_lock_irqsave(&zone->lru_lock, flags);
2507 tmp = (pages_min * zone->present_pages) / lowmem_pages;
2421 if (is_highmem(zone)) { 2508 if (is_highmem(zone)) {
2422 /* 2509 /*
2423 * Often, highmem doesn't need to reserve any pages. 2510 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
2424 * But the pages_min/low/high values are also used for 2511 * need highmem pages, so cap pages_min to a small
2425 * batching up page reclaim activity so we need a 2512 * value here.
2426 * decent value here. 2513 *
2514 * The (pages_high-pages_low) and (pages_low-pages_min)
2515 * deltas controls asynch page reclaim, and so should
2515 * deltas control asynch page reclaim, and so should
2516 * not be capped for highmem.
2427 */ 2517 */
2428 int min_pages; 2518 int min_pages;
2429 2519
@@ -2434,19 +2524,15 @@ void setup_per_zone_pages_min(void)
2434 min_pages = 128; 2524 min_pages = 128;
2435 zone->pages_min = min_pages; 2525 zone->pages_min = min_pages;
2436 } else { 2526 } else {
2437 /* if it's a lowmem zone, reserve a number of pages 2527 /*
2528 * If it's a lowmem zone, reserve a number of pages
2438 * proportionate to the zone's size. 2529 * proportionate to the zone's size.
2439 */ 2530 */
2440 zone->pages_min = (pages_min * zone->present_pages) / 2531 zone->pages_min = tmp;
2441 lowmem_pages;
2442 } 2532 }
2443 2533
2444 /* 2534 zone->pages_low = zone->pages_min + tmp / 4;
2445 * When interpreting these watermarks, just keep in mind that: 2535 zone->pages_high = zone->pages_min + tmp / 2;
2446 * zone->pages_min == (zone->pages_min * 4) / 4;
2447 */
2448 zone->pages_low = (zone->pages_min * 5) / 4;
2449 zone->pages_high = (zone->pages_min * 6) / 4;
2450 spin_unlock_irqrestore(&zone->lru_lock, flags); 2536 spin_unlock_irqrestore(&zone->lru_lock, flags);
2451 } 2537 }
2452} 2538}
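An arithmetic sketch of the new watermark spacing above, with made-up numbers: tmp is the zone's proportional share of pages_min, and the low and high marks now sit tmp/4 and tmp/2 above pages_min. For highmem only pages_min itself is capped; the deltas that drive background reclaim are not.

#include <stdio.h>

int main(void)
{
	unsigned long pages_min    = 1024;      /* assumed global minimum, in pages */
	unsigned long lowmem_pages = 262144;    /* toy total of lowmem pages */
	unsigned long present      = 131072;    /* toy zone size */

	unsigned long tmp       = pages_min * present / lowmem_pages;  /* = 512 */
	unsigned long zone_min  = tmp;
	unsigned long zone_low  = zone_min + tmp / 4;                   /* = 640 */
	unsigned long zone_high = zone_min + tmp / 2;                   /* = 768 */

	printf("min %lu low %lu high %lu\n", zone_min, zone_low, zone_high);
	return 0;
}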
@@ -2522,6 +2608,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
2522 return 0; 2608 return 0;
2523} 2609}
2524 2610
2611/*
2612 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
2613 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
2614 * can have before it gets flushed back to buddy allocator.
2614 * can have before it gets flushed back to the buddy allocator.
2615 */
2616
2617int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
2618 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2619{
2620 struct zone *zone;
2621 unsigned int cpu;
2622 int ret;
2623
2624 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2625 if (!write || (ret == -EINVAL))
2626 return ret;
2627 for_each_zone(zone) {
2628 for_each_online_cpu(cpu) {
2629 unsigned long high;
2630 high = zone->present_pages / percpu_pagelist_fraction;
2631 setup_pagelist_highmark(zone_pcp(zone, cpu), high);
2632 }
2633 }
2634 return 0;
2635}
2636
2525__initdata int hashdist = HASHDIST_DEFAULT; 2637__initdata int hashdist = HASHDIST_DEFAULT;
2526 2638
2527#ifdef CONFIG_NUMA 2639#ifdef CONFIG_NUMA
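Rough model of what writing a value through the new percpu_pagelist_fraction handler does for one zone: the hot pcp high mark becomes present_pages / fraction, and setup_pagelist_highmark() then derives the batch as high / 4 capped at PAGE_SHIFT * 8. The zone size and fractions below are invented; with a zone this large the cap is what binds.

#include <stdio.h>

#define PAGE_SHIFT 12                     /* assumed 4K pages, so the cap is 96 */

int main(void)
{
	unsigned long present_pages = 262144;   /* toy zone: 1 GiB of 4K pages */
	unsigned long fraction;

	for (fraction = 8; fraction <= 64; fraction *= 2) {
		unsigned long high  = present_pages / fraction;
		unsigned long batch = high / 4 > PAGE_SHIFT * 8 ? PAGE_SHIFT * 8 : high / 4;
		printf("fraction %2lu -> pcp high %6lu pages, batch %lu\n",
		       fraction, high, batch ? batch : 1);
	}
	return 0;
}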
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 52822c98c4..c4b6d0afd7 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -90,7 +90,7 @@ struct pdflush_work {
90 90
91static int __pdflush(struct pdflush_work *my_work) 91static int __pdflush(struct pdflush_work *my_work)
92{ 92{
93 current->flags |= PF_FLUSHER; 93 current->flags |= PF_FLUSHER | PF_SWAPWRITE;
94 my_work->fn = NULL; 94 my_work->fn = NULL;
95 my_work->who = current; 95 my_work->who = current;
96 INIT_LIST_HEAD(&my_work->list); 96 INIT_LIST_HEAD(&my_work->list);
diff --git a/mm/readahead.c b/mm/readahead.c
index 72e7adbb87..8d6eeaaa62 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
158{ 158{
159 unsigned page_idx; 159 unsigned page_idx;
160 struct pagevec lru_pvec; 160 struct pagevec lru_pvec;
161 int ret = 0; 161 int ret;
162 162
163 if (mapping->a_ops->readpages) { 163 if (mapping->a_ops->readpages) {
164 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); 164 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
@@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp,
171 list_del(&page->lru); 171 list_del(&page->lru);
172 if (!add_to_page_cache(page, mapping, 172 if (!add_to_page_cache(page, mapping,
173 page->index, GFP_KERNEL)) { 173 page->index, GFP_KERNEL)) {
174 mapping->a_ops->readpage(filp, page); 174 ret = mapping->a_ops->readpage(filp, page);
175 if (!pagevec_add(&lru_pvec, page)) 175 if (ret != AOP_TRUNCATED_PAGE) {
176 __pagevec_lru_add(&lru_pvec); 176 if (!pagevec_add(&lru_pvec, page))
177 } else { 177 __pagevec_lru_add(&lru_pvec);
178 page_cache_release(page); 178 continue;
179 } /* else fall through to release */
179 } 180 }
181 page_cache_release(page);
180 } 182 }
181 pagevec_lru_add(&lru_pvec); 183 pagevec_lru_add(&lru_pvec);
184 ret = 0;
182out: 185out:
183 return ret; 186 return ret;
184} 187}
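A minimal sketch of the new error path in read_pages() above: when ->readpage() reports AOP_TRUNCATED_PAGE the page is released instead of being queued on the LRU pagevec, and the overall return value is reset to 0 afterwards. Types and helpers below are placeholders, not the kernel's.

#include <stdio.h>

enum aop_ret { AOP_OK = 0, AOP_TRUNCATED_PAGE = 1 };

struct toy_page { int id; };

static enum aop_ret toy_readpage(struct toy_page *p)
{
	return p->id == 2 ? AOP_TRUNCATED_PAGE : AOP_OK;  /* pretend page 2 raced with truncate */
}

int main(void)
{
	struct toy_page pages[3] = { {1}, {2}, {3} };
	int i;

	for (i = 0; i < 3; i++) {
		if (toy_readpage(&pages[i]) != AOP_TRUNCATED_PAGE) {
			printf("page %d queued for LRU\n", pages[i].id);
			continue;
		}
		/* fall through to release, mirroring the new comment in the hunk */
		printf("page %d released (truncated under us)\n", pages[i].id);
	}
	return 0;
}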
diff --git a/mm/rmap.c b/mm/rmap.c
index 914d04b98b..d85a99d28c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -20,20 +20,20 @@
20/* 20/*
21 * Lock ordering in mm: 21 * Lock ordering in mm:
22 * 22 *
23 * inode->i_sem (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * inode->i_alloc_sem 24 * inode->i_alloc_sem
25 * 25 *
26 * When a page fault occurs in writing from user to file, down_read 26 * When a page fault occurs in writing from user to file, down_read
27 * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within 27 * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within
28 * down_read of mmap_sem; i_sem and down_write of mmap_sem are never 28 * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never
29 * taken together; in truncation, i_sem is taken outermost. 29 * taken together; in truncation, i_mutex is taken outermost.
30 * 30 *
31 * mm->mmap_sem 31 * mm->mmap_sem
32 * page->flags PG_locked (lock_page) 32 * page->flags PG_locked (lock_page)
33 * mapping->i_mmap_lock 33 * mapping->i_mmap_lock
34 * anon_vma->lock 34 * anon_vma->lock
35 * mm->page_table_lock or pte_lock 35 * mm->page_table_lock or pte_lock
36 * zone->lru_lock (in mark_page_accessed) 36 * zone->lru_lock (in mark_page_accessed, isolate_lru_page)
37 * swap_lock (in swap_duplicate, swap_info_get) 37 * swap_lock (in swap_duplicate, swap_info_get)
38 * mmlist_lock (in mmput, drain_mmlist and others) 38 * mmlist_lock (in mmput, drain_mmlist and others)
39 * mapping->private_lock (in __set_page_dirty_buffers) 39 * mapping->private_lock (in __set_page_dirty_buffers)
@@ -225,7 +225,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
225 225
226/* 226/*
227 * At what user virtual address is page expected in vma? checking that the 227 * At what user virtual address is page expected in vma? checking that the
228 * page matches the vma: currently only used by unuse_process, on anon pages. 228 * page matches the vma: currently only used on anon pages, by unuse_vma;
229 */ 229 */
230unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 230unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
231{ 231{
@@ -234,7 +234,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
234 (void *)page->mapping - PAGE_MAPPING_ANON) 234 (void *)page->mapping - PAGE_MAPPING_ANON)
235 return -EFAULT; 235 return -EFAULT;
236 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { 236 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
237 if (vma->vm_file->f_mapping != page->mapping) 237 if (!vma->vm_file ||
238 vma->vm_file->f_mapping != page->mapping)
238 return -EFAULT; 239 return -EFAULT;
239 } else 240 } else
240 return -EFAULT; 241 return -EFAULT;
@@ -289,7 +290,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
289 * repeatedly from either page_referenced_anon or page_referenced_file. 290 * repeatedly from either page_referenced_anon or page_referenced_file.
290 */ 291 */
291static int page_referenced_one(struct page *page, 292static int page_referenced_one(struct page *page,
292 struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token) 293 struct vm_area_struct *vma, unsigned int *mapcount)
293{ 294{
294 struct mm_struct *mm = vma->vm_mm; 295 struct mm_struct *mm = vma->vm_mm;
295 unsigned long address; 296 unsigned long address;
@@ -310,7 +311,7 @@ static int page_referenced_one(struct page *page,
310 311
311 /* Pretend the page is referenced if the task has the 312 /* Pretend the page is referenced if the task has the
312 swap token and is in the middle of a page fault. */ 313 swap token and is in the middle of a page fault. */
313 if (mm != current->mm && !ignore_token && has_swap_token(mm) && 314 if (mm != current->mm && has_swap_token(mm) &&
314 rwsem_is_locked(&mm->mmap_sem)) 315 rwsem_is_locked(&mm->mmap_sem))
315 referenced++; 316 referenced++;
316 317
@@ -320,7 +321,7 @@ out:
320 return referenced; 321 return referenced;
321} 322}
322 323
323static int page_referenced_anon(struct page *page, int ignore_token) 324static int page_referenced_anon(struct page *page)
324{ 325{
325 unsigned int mapcount; 326 unsigned int mapcount;
326 struct anon_vma *anon_vma; 327 struct anon_vma *anon_vma;
@@ -333,8 +334,7 @@ static int page_referenced_anon(struct page *page, int ignore_token)
333 334
334 mapcount = page_mapcount(page); 335 mapcount = page_mapcount(page);
335 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 336 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
336 referenced += page_referenced_one(page, vma, &mapcount, 337 referenced += page_referenced_one(page, vma, &mapcount);
337 ignore_token);
338 if (!mapcount) 338 if (!mapcount)
339 break; 339 break;
340 } 340 }
@@ -353,7 +353,7 @@ static int page_referenced_anon(struct page *page, int ignore_token)
353 * 353 *
354 * This function is only called from page_referenced for object-based pages. 354 * This function is only called from page_referenced for object-based pages.
355 */ 355 */
356static int page_referenced_file(struct page *page, int ignore_token) 356static int page_referenced_file(struct page *page)
357{ 357{
358 unsigned int mapcount; 358 unsigned int mapcount;
359 struct address_space *mapping = page->mapping; 359 struct address_space *mapping = page->mapping;
@@ -391,8 +391,7 @@ static int page_referenced_file(struct page *page, int ignore_token)
391 referenced++; 391 referenced++;
392 break; 392 break;
393 } 393 }
394 referenced += page_referenced_one(page, vma, &mapcount, 394 referenced += page_referenced_one(page, vma, &mapcount);
395 ignore_token);
396 if (!mapcount) 395 if (!mapcount)
397 break; 396 break;
398 } 397 }
@@ -409,13 +408,10 @@ static int page_referenced_file(struct page *page, int ignore_token)
409 * Quick test_and_clear_referenced for all mappings to a page, 408 * Quick test_and_clear_referenced for all mappings to a page,
410 * returns the number of ptes which referenced the page. 409 * returns the number of ptes which referenced the page.
411 */ 410 */
412int page_referenced(struct page *page, int is_locked, int ignore_token) 411int page_referenced(struct page *page, int is_locked)
413{ 412{
414 int referenced = 0; 413 int referenced = 0;
415 414
416 if (!swap_token_default_timeout)
417 ignore_token = 1;
418
419 if (page_test_and_clear_young(page)) 415 if (page_test_and_clear_young(page))
420 referenced++; 416 referenced++;
421 417
@@ -424,15 +420,14 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
424 420
425 if (page_mapped(page) && page->mapping) { 421 if (page_mapped(page) && page->mapping) {
426 if (PageAnon(page)) 422 if (PageAnon(page))
427 referenced += page_referenced_anon(page, ignore_token); 423 referenced += page_referenced_anon(page);
428 else if (is_locked) 424 else if (is_locked)
429 referenced += page_referenced_file(page, ignore_token); 425 referenced += page_referenced_file(page);
430 else if (TestSetPageLocked(page)) 426 else if (TestSetPageLocked(page))
431 referenced++; 427 referenced++;
432 else { 428 else {
433 if (page->mapping) 429 if (page->mapping)
434 referenced += page_referenced_file(page, 430 referenced += page_referenced_file(page);
435 ignore_token);
436 unlock_page(page); 431 unlock_page(page);
437 } 432 }
438 } 433 }
@@ -440,6 +435,30 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
440} 435}
441 436
442/** 437/**
438 * page_set_anon_rmap - setup new anonymous rmap
439 * @page: the page to add the mapping to
440 * @vma: the vm area in which the mapping is added
441 * @address: the user virtual address mapped
442 */
443static void __page_set_anon_rmap(struct page *page,
444 struct vm_area_struct *vma, unsigned long address)
445{
446 struct anon_vma *anon_vma = vma->anon_vma;
447
448 BUG_ON(!anon_vma);
449 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
450 page->mapping = (struct address_space *) anon_vma;
451
452 page->index = linear_page_index(vma, address);
453
454 /*
455 * nr_mapped state can be updated without turning off
456 * interrupts because it is not modified via interrupt.
457 */
458 __inc_page_state(nr_mapped);
459}
460
461/**
443 * page_add_anon_rmap - add pte mapping to an anonymous page 462 * page_add_anon_rmap - add pte mapping to an anonymous page
444 * @page: the page to add the mapping to 463 * @page: the page to add the mapping to
445 * @vma: the vm area in which the mapping is added 464 * @vma: the vm area in which the mapping is added
@@ -450,20 +469,27 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
450void page_add_anon_rmap(struct page *page, 469void page_add_anon_rmap(struct page *page,
451 struct vm_area_struct *vma, unsigned long address) 470 struct vm_area_struct *vma, unsigned long address)
452{ 471{
453 if (atomic_inc_and_test(&page->_mapcount)) { 472 if (atomic_inc_and_test(&page->_mapcount))
454 struct anon_vma *anon_vma = vma->anon_vma; 473 __page_set_anon_rmap(page, vma, address);
455
456 BUG_ON(!anon_vma);
457 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
458 page->mapping = (struct address_space *) anon_vma;
459
460 page->index = linear_page_index(vma, address);
461
462 inc_page_state(nr_mapped);
463 }
464 /* else checking page index and mapping is racy */ 474 /* else checking page index and mapping is racy */
465} 475}
466 476
477/*
478 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
479 * @page: the page to add the mapping to
480 * @vma: the vm area in which the mapping is added
481 * @address: the user virtual address mapped
482 *
483 * Same as page_add_anon_rmap but must only be called on *new* pages.
484 * This means the inc-and-test can be bypassed.
485 */
486void page_add_new_anon_rmap(struct page *page,
487 struct vm_area_struct *vma, unsigned long address)
488{
489 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
490 __page_set_anon_rmap(page, vma, address);
491}
492
467/** 493/**
468 * page_add_file_rmap - add pte mapping to a file page 494 * page_add_file_rmap - add pte mapping to a file page
469 * @page: the page to add the mapping to 495 * @page: the page to add the mapping to
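A toy model of why the new page_add_new_anon_rmap() above can skip the inc-and-test: _mapcount starts at -1, and for a page nobody else can see yet it is safe to set it straight to 0 (one mapping) without an atomic read-modify-write. The struct and the plain increment below stand in for the kernel's atomic helpers.

#include <stdio.h>

struct toy_page { int mapcount; /* kernel stores mapping count minus one */ };

static void add_anon_rmap(struct toy_page *p)
{
	if (++p->mapcount == 0)          /* atomic_inc_and_test() analogue */
		printf("first mapping: set up anon rmap\n");
}

static void add_new_anon_rmap(struct toy_page *p)
{
	p->mapcount = 0;                 /* brand-new page: no concurrent mappers */
	printf("first mapping (new page): set up anon rmap\n");
}

int main(void)
{
	struct toy_page existing = { .mapcount = -1 }, fresh = { .mapcount = -1 };

	add_anon_rmap(&existing);
	add_new_anon_rmap(&fresh);
	return 0;
}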
@@ -476,7 +502,7 @@ void page_add_file_rmap(struct page *page)
476 BUG_ON(!pfn_valid(page_to_pfn(page))); 502 BUG_ON(!pfn_valid(page_to_pfn(page)));
477 503
478 if (atomic_inc_and_test(&page->_mapcount)) 504 if (atomic_inc_and_test(&page->_mapcount))
479 inc_page_state(nr_mapped); 505 __inc_page_state(nr_mapped);
480} 506}
481 507
482/** 508/**
@@ -488,6 +514,13 @@ void page_add_file_rmap(struct page *page)
488void page_remove_rmap(struct page *page) 514void page_remove_rmap(struct page *page)
489{ 515{
490 if (atomic_add_negative(-1, &page->_mapcount)) { 516 if (atomic_add_negative(-1, &page->_mapcount)) {
517 if (page_mapcount(page) < 0) {
518 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
519 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
520 printk (KERN_EMERG " page->count = %x\n", page_count(page));
521 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
522 }
523
491 BUG_ON(page_mapcount(page) < 0); 524 BUG_ON(page_mapcount(page) < 0);
492 /* 525 /*
493 * It would be tidy to reset the PageAnon mapping here, 526 * It would be tidy to reset the PageAnon mapping here,
@@ -500,7 +533,7 @@ void page_remove_rmap(struct page *page)
500 */ 533 */
501 if (page_test_and_clear_dirty(page)) 534 if (page_test_and_clear_dirty(page))
502 set_page_dirty(page); 535 set_page_dirty(page);
503 dec_page_state(nr_mapped); 536 __dec_page_state(nr_mapped);
504 } 537 }
505} 538}
506 539
@@ -529,10 +562,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
529 * If the page is mlock()d, we cannot swap it out. 562 * If the page is mlock()d, we cannot swap it out.
530 * If it's recently referenced (perhaps page_referenced 563 * If it's recently referenced (perhaps page_referenced
531 * skipped over this mm) then we should reactivate it. 564 * skipped over this mm) then we should reactivate it.
532 *
533 * Pages belonging to VM_RESERVED regions should not happen here.
534 */ 565 */
535 if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || 566 if ((vma->vm_flags & VM_LOCKED) ||
536 ptep_clear_flush_young(vma, address, pte)) { 567 ptep_clear_flush_young(vma, address, pte)) {
537 ret = SWAP_FAIL; 568 ret = SWAP_FAIL;
538 goto out_unmap; 569 goto out_unmap;
@@ -613,7 +644,6 @@ static void try_to_unmap_cluster(unsigned long cursor,
613 struct page *page; 644 struct page *page;
614 unsigned long address; 645 unsigned long address;
615 unsigned long end; 646 unsigned long end;
616 unsigned long pfn;
617 647
618 address = (vma->vm_start + cursor) & CLUSTER_MASK; 648 address = (vma->vm_start + cursor) & CLUSTER_MASK;
619 end = address + CLUSTER_SIZE; 649 end = address + CLUSTER_SIZE;
@@ -642,21 +672,14 @@ static void try_to_unmap_cluster(unsigned long cursor,
642 for (; address < end; pte++, address += PAGE_SIZE) { 672 for (; address < end; pte++, address += PAGE_SIZE) {
643 if (!pte_present(*pte)) 673 if (!pte_present(*pte))
644 continue; 674 continue;
645 675 page = vm_normal_page(vma, address, *pte);
646 pfn = pte_pfn(*pte); 676 BUG_ON(!page || PageAnon(page));
647 if (unlikely(!pfn_valid(pfn))) {
648 print_bad_pte(vma, *pte, address);
649 continue;
650 }
651
652 page = pfn_to_page(pfn);
653 BUG_ON(PageAnon(page));
654 677
655 if (ptep_clear_flush_young(vma, address, pte)) 678 if (ptep_clear_flush_young(vma, address, pte))
656 continue; 679 continue;
657 680
658 /* Nuke the page table entry. */ 681 /* Nuke the page table entry. */
659 flush_cache_page(vma, address, pfn); 682 flush_cache_page(vma, address, pte_pfn(*pte));
660 pteval = ptep_clear_flush(vma, address, pte); 683 pteval = ptep_clear_flush(vma, address, pte);
661 684
662 /* If nonlinear, store the file page offset in the pte. */ 685 /* If nonlinear, store the file page offset in the pte. */
@@ -727,7 +750,7 @@ static int try_to_unmap_file(struct page *page)
727 750
728 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 751 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
729 shared.vm_set.list) { 752 shared.vm_set.list) {
730 if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) 753 if (vma->vm_flags & VM_LOCKED)
731 continue; 754 continue;
732 cursor = (unsigned long) vma->vm_private_data; 755 cursor = (unsigned long) vma->vm_private_data;
733 if (cursor > max_nl_cursor) 756 if (cursor > max_nl_cursor)
@@ -761,7 +784,7 @@ static int try_to_unmap_file(struct page *page)
761 do { 784 do {
762 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 785 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
763 shared.vm_set.list) { 786 shared.vm_set.list) {
764 if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) 787 if (vma->vm_flags & VM_LOCKED)
765 continue; 788 continue;
766 cursor = (unsigned long) vma->vm_private_data; 789 cursor = (unsigned long) vma->vm_private_data;
767 while ( cursor < max_nl_cursor && 790 while ( cursor < max_nl_cursor &&
@@ -783,11 +806,8 @@ static int try_to_unmap_file(struct page *page)
783 * in locked vmas). Reset cursor on all unreserved nonlinear 806 * in locked vmas). Reset cursor on all unreserved nonlinear
784 * vmas, now forgetting on which ones it had fallen behind. 807 * vmas, now forgetting on which ones it had fallen behind.
785 */ 808 */
786 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 809 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
787 shared.vm_set.list) { 810 vma->vm_private_data = NULL;
788 if (!(vma->vm_flags & VM_RESERVED))
789 vma->vm_private_data = NULL;
790 }
791out: 811out:
792 spin_unlock(&mapping->i_mmap_lock); 812 spin_unlock(&mapping->i_mmap_lock);
793 return ret; 813 return ret;
diff --git a/mm/shmem.c b/mm/shmem.c
index dc25565a61..ce501bce1c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -457,7 +457,7 @@ static void shmem_free_pages(struct list_head *next)
457 } while (next); 457 } while (next);
458} 458}
459 459
460static void shmem_truncate(struct inode *inode) 460static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
461{ 461{
462 struct shmem_inode_info *info = SHMEM_I(inode); 462 struct shmem_inode_info *info = SHMEM_I(inode);
463 unsigned long idx; 463 unsigned long idx;
@@ -475,18 +475,27 @@ static void shmem_truncate(struct inode *inode)
475 long nr_swaps_freed = 0; 475 long nr_swaps_freed = 0;
476 int offset; 476 int offset;
477 int freed; 477 int freed;
478 int punch_hole = 0;
478 479
479 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 480 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
480 idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 481 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
481 if (idx >= info->next_index) 482 if (idx >= info->next_index)
482 return; 483 return;
483 484
484 spin_lock(&info->lock); 485 spin_lock(&info->lock);
485 info->flags |= SHMEM_TRUNCATE; 486 info->flags |= SHMEM_TRUNCATE;
486 limit = info->next_index; 487 if (likely(end == (loff_t) -1)) {
487 info->next_index = idx; 488 limit = info->next_index;
489 info->next_index = idx;
490 } else {
491 limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
492 if (limit > info->next_index)
493 limit = info->next_index;
494 punch_hole = 1;
495 }
496
488 topdir = info->i_indirect; 497 topdir = info->i_indirect;
489 if (topdir && idx <= SHMEM_NR_DIRECT) { 498 if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
490 info->i_indirect = NULL; 499 info->i_indirect = NULL;
491 nr_pages_to_free++; 500 nr_pages_to_free++;
492 list_add(&topdir->lru, &pages_to_free); 501 list_add(&topdir->lru, &pages_to_free);
@@ -573,11 +582,12 @@ static void shmem_truncate(struct inode *inode)
573 set_page_private(subdir, page_private(subdir) - freed); 582 set_page_private(subdir, page_private(subdir) - freed);
574 if (offset) 583 if (offset)
575 spin_unlock(&info->lock); 584 spin_unlock(&info->lock);
576 BUG_ON(page_private(subdir) > offset); 585 if (!punch_hole)
586 BUG_ON(page_private(subdir) > offset);
577 } 587 }
578 if (offset) 588 if (offset)
579 offset = 0; 589 offset = 0;
580 else if (subdir) { 590 else if (subdir && !page_private(subdir)) {
581 dir[diroff] = NULL; 591 dir[diroff] = NULL;
582 nr_pages_to_free++; 592 nr_pages_to_free++;
583 list_add(&subdir->lru, &pages_to_free); 593 list_add(&subdir->lru, &pages_to_free);
@@ -594,7 +604,7 @@ done2:
594 * Also, though shmem_getpage checks i_size before adding to 604 * Also, though shmem_getpage checks i_size before adding to
595 * cache, no recheck after: so fix the narrow window there too. 605 * cache, no recheck after: so fix the narrow window there too.
596 */ 606 */
597 truncate_inode_pages(inode->i_mapping, inode->i_size); 607 truncate_inode_pages_range(inode->i_mapping, start, end);
598 } 608 }
599 609
600 spin_lock(&info->lock); 610 spin_lock(&info->lock);
@@ -614,6 +624,11 @@ done2:
614 } 624 }
615} 625}
616 626
627static void shmem_truncate(struct inode *inode)
628{
629 shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
630}
631
617static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) 632static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
618{ 633{
619 struct inode *inode = dentry->d_inode; 634 struct inode *inode = dentry->d_inode;
@@ -855,7 +870,7 @@ unlock:
855 swap_free(swap); 870 swap_free(swap);
856redirty: 871redirty:
857 set_page_dirty(page); 872 set_page_dirty(page);
858 return WRITEPAGE_ACTIVATE; /* Return with the page locked */ 873 return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */
859} 874}
860 875
861#ifdef CONFIG_NUMA 876#ifdef CONFIG_NUMA
@@ -1255,7 +1270,7 @@ out_nomem:
1255 return retval; 1270 return retval;
1256} 1271}
1257 1272
1258static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 1273int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1259{ 1274{
1260 file_accessed(file); 1275 file_accessed(file);
1261 vma->vm_ops = &shmem_vm_ops; 1276 vma->vm_ops = &shmem_vm_ops;
@@ -1301,7 +1316,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1301 case S_IFREG: 1316 case S_IFREG:
1302 inode->i_op = &shmem_inode_operations; 1317 inode->i_op = &shmem_inode_operations;
1303 inode->i_fop = &shmem_file_operations; 1318 inode->i_fop = &shmem_file_operations;
1304 mpol_shared_policy_init(&info->policy); 1319 mpol_shared_policy_init(&info->policy, sbinfo->policy,
1320 &sbinfo->policy_nodes);
1305 break; 1321 break;
1306 case S_IFDIR: 1322 case S_IFDIR:
1307 inode->i_nlink++; 1323 inode->i_nlink++;
@@ -1315,7 +1331,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1315 * Must not load anything in the rbtree, 1331 * Must not load anything in the rbtree,
1316 * mpol_free_shared_policy will not be called. 1332 * mpol_free_shared_policy will not be called.
1317 */ 1333 */
1318 mpol_shared_policy_init(&info->policy); 1334 mpol_shared_policy_init(&info->policy, MPOL_DEFAULT,
1335 NULL);
1319 break; 1336 break;
1320 } 1337 }
1321 } else if (sbinfo->max_inodes) { 1338 } else if (sbinfo->max_inodes) {
@@ -1355,7 +1372,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
1355 if (!access_ok(VERIFY_READ, buf, count)) 1372 if (!access_ok(VERIFY_READ, buf, count))
1356 return -EFAULT; 1373 return -EFAULT;
1357 1374
1358 down(&inode->i_sem); 1375 mutex_lock(&inode->i_mutex);
1359 1376
1360 pos = *ppos; 1377 pos = *ppos;
1361 written = 0; 1378 written = 0;
@@ -1440,7 +1457,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
1440 if (written) 1457 if (written)
1441 err = written; 1458 err = written;
1442out: 1459out:
1443 up(&inode->i_sem); 1460 mutex_unlock(&inode->i_mutex);
1444 return err; 1461 return err;
1445} 1462}
1446 1463
@@ -1476,7 +1493,7 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1476 1493
1477 /* 1494 /*
1478 * We must evaluate after, since reads (unlike writes) 1495 * We must evaluate after, since reads (unlike writes)
1479 * are called without i_sem protection against truncate 1496 * are called without i_mutex protection against truncate
1480 */ 1497 */
1481 nr = PAGE_CACHE_SIZE; 1498 nr = PAGE_CACHE_SIZE;
1482 i_size = i_size_read(inode); 1499 i_size = i_size_read(inode);
@@ -1828,7 +1845,9 @@ static struct inode_operations shmem_symlink_inode_operations = {
1828 .put_link = shmem_put_link, 1845 .put_link = shmem_put_link,
1829}; 1846};
1830 1847
1831static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes) 1848static int shmem_parse_options(char *options, int *mode, uid_t *uid,
1849 gid_t *gid, unsigned long *blocks, unsigned long *inodes,
1850 int *policy, nodemask_t *policy_nodes)
1832{ 1851{
1833 char *this_char, *value, *rest; 1852 char *this_char, *value, *rest;
1834 1853
@@ -1882,6 +1901,19 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid,
1882 *gid = simple_strtoul(value,&rest,0); 1901 *gid = simple_strtoul(value,&rest,0);
1883 if (*rest) 1902 if (*rest)
1884 goto bad_val; 1903 goto bad_val;
1904 } else if (!strcmp(this_char,"mpol")) {
1905 if (!strcmp(value,"default"))
1906 *policy = MPOL_DEFAULT;
1907 else if (!strcmp(value,"preferred"))
1908 *policy = MPOL_PREFERRED;
1909 else if (!strcmp(value,"bind"))
1910 *policy = MPOL_BIND;
1911 else if (!strcmp(value,"interleave"))
1912 *policy = MPOL_INTERLEAVE;
1913 else
1914 goto bad_val;
1915 } else if (!strcmp(this_char,"mpol_nodelist")) {
1916 nodelist_parse(value, *policy_nodes);
1885 } else { 1917 } else {
1886 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 1918 printk(KERN_ERR "tmpfs: Bad mount option %s\n",
1887 this_char); 1919 this_char);
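With the option parsing added above, a tmpfs NUMA policy can be requested at mount time; assuming the usual mount(8) syntax, something along the lines of "mount -t tmpfs -o mpol=interleave,mpol_nodelist=0-1 tmpfs /mnt" would ask for page interleaving across nodes 0 and 1. The option names here follow the strings matched in the hunk; the node range format is whatever nodelist_parse() accepts, and the exact command line is an assumption for illustration.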
@@ -1902,12 +1934,14 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
1902 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 1934 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1903 unsigned long max_blocks = sbinfo->max_blocks; 1935 unsigned long max_blocks = sbinfo->max_blocks;
1904 unsigned long max_inodes = sbinfo->max_inodes; 1936 unsigned long max_inodes = sbinfo->max_inodes;
1937 int policy = sbinfo->policy;
1938 nodemask_t policy_nodes = sbinfo->policy_nodes;
1905 unsigned long blocks; 1939 unsigned long blocks;
1906 unsigned long inodes; 1940 unsigned long inodes;
1907 int error = -EINVAL; 1941 int error = -EINVAL;
1908 1942
1909 if (shmem_parse_options(data, NULL, NULL, NULL, 1943 if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks,
1910 &max_blocks, &max_inodes)) 1944 &max_inodes, &policy, &policy_nodes))
1911 return error; 1945 return error;
1912 1946
1913 spin_lock(&sbinfo->stat_lock); 1947 spin_lock(&sbinfo->stat_lock);
@@ -1933,6 +1967,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
1933 sbinfo->free_blocks = max_blocks - blocks; 1967 sbinfo->free_blocks = max_blocks - blocks;
1934 sbinfo->max_inodes = max_inodes; 1968 sbinfo->max_inodes = max_inodes;
1935 sbinfo->free_inodes = max_inodes - inodes; 1969 sbinfo->free_inodes = max_inodes - inodes;
1970 sbinfo->policy = policy;
1971 sbinfo->policy_nodes = policy_nodes;
1936out: 1972out:
1937 spin_unlock(&sbinfo->stat_lock); 1973 spin_unlock(&sbinfo->stat_lock);
1938 return error; 1974 return error;
@@ -1957,6 +1993,8 @@ static int shmem_fill_super(struct super_block *sb,
1957 struct shmem_sb_info *sbinfo; 1993 struct shmem_sb_info *sbinfo;
1958 unsigned long blocks = 0; 1994 unsigned long blocks = 0;
1959 unsigned long inodes = 0; 1995 unsigned long inodes = 0;
1996 int policy = MPOL_DEFAULT;
1997 nodemask_t policy_nodes = node_online_map;
1960 1998
1961#ifdef CONFIG_TMPFS 1999#ifdef CONFIG_TMPFS
1962 /* 2000 /*
@@ -1969,8 +2007,8 @@ static int shmem_fill_super(struct super_block *sb,
1969 inodes = totalram_pages - totalhigh_pages; 2007 inodes = totalram_pages - totalhigh_pages;
1970 if (inodes > blocks) 2008 if (inodes > blocks)
1971 inodes = blocks; 2009 inodes = blocks;
1972 if (shmem_parse_options(data, &mode, &uid, &gid, 2010 if (shmem_parse_options(data, &mode, &uid, &gid, &blocks,
1973 &blocks, &inodes)) 2011 &inodes, &policy, &policy_nodes))
1974 return -EINVAL; 2012 return -EINVAL;
1975 } 2013 }
1976#else 2014#else
@@ -1988,6 +2026,8 @@ static int shmem_fill_super(struct super_block *sb,
1988 sbinfo->free_blocks = blocks; 2026 sbinfo->free_blocks = blocks;
1989 sbinfo->max_inodes = inodes; 2027 sbinfo->max_inodes = inodes;
1990 sbinfo->free_inodes = inodes; 2028 sbinfo->free_inodes = inodes;
2029 sbinfo->policy = policy;
2030 sbinfo->policy_nodes = policy_nodes;
1991 2031
1992 sb->s_fs_info = sbinfo; 2032 sb->s_fs_info = sbinfo;
1993 sb->s_maxbytes = SHMEM_MAX_BYTES; 2033 sb->s_maxbytes = SHMEM_MAX_BYTES;
@@ -2083,6 +2123,7 @@ static struct file_operations shmem_file_operations = {
2083static struct inode_operations shmem_inode_operations = { 2123static struct inode_operations shmem_inode_operations = {
2084 .truncate = shmem_truncate, 2124 .truncate = shmem_truncate,
2085 .setattr = shmem_notify_change, 2125 .setattr = shmem_notify_change,
2126 .truncate_range = shmem_truncate_range,
2086}; 2127};
2087 2128
2088static struct inode_operations shmem_dir_inode_operations = { 2129static struct inode_operations shmem_dir_inode_operations = {
diff --git a/mm/slab.c b/mm/slab.c
index e291f5e1af..6f8495e218 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -68,7 +68,7 @@
68 * Further notes from the original documentation: 68 * Further notes from the original documentation:
69 * 69 *
70 * 11 April '97. Started multi-threading - markhe 70 * 11 April '97. Started multi-threading - markhe
71 * The global cache-chain is protected by the semaphore 'cache_chain_sem'. 71 * The global cache-chain is protected by the mutex 'cache_chain_mutex'.
72 * The sem is only needed when accessing/extending the cache-chain, which 72 * The sem is only needed when accessing/extending the cache-chain, which
73 * can never happen inside an interrupt (kmem_cache_create(), 73 * can never happen inside an interrupt (kmem_cache_create(),
74 * kmem_cache_shrink() and kmem_cache_reap()). 74 * kmem_cache_shrink() and kmem_cache_reap()).
@@ -103,6 +103,8 @@
103#include <linux/rcupdate.h> 103#include <linux/rcupdate.h>
104#include <linux/string.h> 104#include <linux/string.h>
105#include <linux/nodemask.h> 105#include <linux/nodemask.h>
106#include <linux/mempolicy.h>
107#include <linux/mutex.h>
106 108
107#include <asm/uaccess.h> 109#include <asm/uaccess.h>
108#include <asm/cacheflush.h> 110#include <asm/cacheflush.h>
@@ -130,7 +132,6 @@
130#define FORCED_DEBUG 0 132#define FORCED_DEBUG 0
131#endif 133#endif
132 134
133
134/* Shouldn't this be in a header file somewhere? */ 135/* Shouldn't this be in a header file somewhere? */
135#define BYTES_PER_WORD sizeof(void *) 136#define BYTES_PER_WORD sizeof(void *)
136 137
@@ -217,12 +218,12 @@ static unsigned long offslab_limit;
217 * Slabs are chained into three list: fully used, partial, fully free slabs. 218 * Slabs are chained into three list: fully used, partial, fully free slabs.
218 */ 219 */
219struct slab { 220struct slab {
220 struct list_head list; 221 struct list_head list;
221 unsigned long colouroff; 222 unsigned long colouroff;
222 void *s_mem; /* including colour offset */ 223 void *s_mem; /* including colour offset */
223 unsigned int inuse; /* num of objs active in slab */ 224 unsigned int inuse; /* num of objs active in slab */
224 kmem_bufctl_t free; 225 kmem_bufctl_t free;
225 unsigned short nodeid; 226 unsigned short nodeid;
226}; 227};
227 228
228/* 229/*
@@ -242,9 +243,9 @@ struct slab {
242 * We assume struct slab_rcu can overlay struct slab when destroying. 243 * We assume struct slab_rcu can overlay struct slab when destroying.
243 */ 244 */
244struct slab_rcu { 245struct slab_rcu {
245 struct rcu_head head; 246 struct rcu_head head;
246 kmem_cache_t *cachep; 247 kmem_cache_t *cachep;
247 void *addr; 248 void *addr;
248}; 249};
249 250
250/* 251/*
@@ -279,23 +280,23 @@ struct array_cache {
279#define BOOT_CPUCACHE_ENTRIES 1 280#define BOOT_CPUCACHE_ENTRIES 1
280struct arraycache_init { 281struct arraycache_init {
281 struct array_cache cache; 282 struct array_cache cache;
282 void * entries[BOOT_CPUCACHE_ENTRIES]; 283 void *entries[BOOT_CPUCACHE_ENTRIES];
283}; 284};
284 285
285/* 286/*
286 * The slab lists for all objects. 287 * The slab lists for all objects.
287 */ 288 */
288struct kmem_list3 { 289struct kmem_list3 {
289 struct list_head slabs_partial; /* partial list first, better asm code */ 290 struct list_head slabs_partial; /* partial list first, better asm code */
290 struct list_head slabs_full; 291 struct list_head slabs_full;
291 struct list_head slabs_free; 292 struct list_head slabs_free;
292 unsigned long free_objects; 293 unsigned long free_objects;
293 unsigned long next_reap; 294 unsigned long next_reap;
294 int free_touched; 295 int free_touched;
295 unsigned int free_limit; 296 unsigned int free_limit;
296 spinlock_t list_lock; 297 spinlock_t list_lock;
297 struct array_cache *shared; /* shared per node */ 298 struct array_cache *shared; /* shared per node */
298 struct array_cache **alien; /* on other nodes */ 299 struct array_cache **alien; /* on other nodes */
299}; 300};
300 301
301/* 302/*
@@ -367,63 +368,63 @@ static inline void kmem_list3_init(struct kmem_list3 *parent)
367 * 368 *
368 * manages a cache. 369 * manages a cache.
369 */ 370 */
370 371
371struct kmem_cache { 372struct kmem_cache {
372/* 1) per-cpu data, touched during every alloc/free */ 373/* 1) per-cpu data, touched during every alloc/free */
373 struct array_cache *array[NR_CPUS]; 374 struct array_cache *array[NR_CPUS];
374 unsigned int batchcount; 375 unsigned int batchcount;
375 unsigned int limit; 376 unsigned int limit;
376 unsigned int shared; 377 unsigned int shared;
377 unsigned int objsize; 378 unsigned int objsize;
378/* 2) touched by every alloc & free from the backend */ 379/* 2) touched by every alloc & free from the backend */
379 struct kmem_list3 *nodelists[MAX_NUMNODES]; 380 struct kmem_list3 *nodelists[MAX_NUMNODES];
380 unsigned int flags; /* constant flags */ 381 unsigned int flags; /* constant flags */
381 unsigned int num; /* # of objs per slab */ 382 unsigned int num; /* # of objs per slab */
382 spinlock_t spinlock; 383 spinlock_t spinlock;
383 384
384/* 3) cache_grow/shrink */ 385/* 3) cache_grow/shrink */
385 /* order of pgs per slab (2^n) */ 386 /* order of pgs per slab (2^n) */
386 unsigned int gfporder; 387 unsigned int gfporder;
387 388
388 /* force GFP flags, e.g. GFP_DMA */ 389 /* force GFP flags, e.g. GFP_DMA */
389 gfp_t gfpflags; 390 gfp_t gfpflags;
390 391
391 size_t colour; /* cache colouring range */ 392 size_t colour; /* cache colouring range */
392 unsigned int colour_off; /* colour offset */ 393 unsigned int colour_off; /* colour offset */
393 unsigned int colour_next; /* cache colouring */ 394 unsigned int colour_next; /* cache colouring */
394 kmem_cache_t *slabp_cache; 395 kmem_cache_t *slabp_cache;
395 unsigned int slab_size; 396 unsigned int slab_size;
396 unsigned int dflags; /* dynamic flags */ 397 unsigned int dflags; /* dynamic flags */
397 398
398 /* constructor func */ 399 /* constructor func */
399 void (*ctor)(void *, kmem_cache_t *, unsigned long); 400 void (*ctor) (void *, kmem_cache_t *, unsigned long);
400 401
401 /* de-constructor func */ 402 /* de-constructor func */
402 void (*dtor)(void *, kmem_cache_t *, unsigned long); 403 void (*dtor) (void *, kmem_cache_t *, unsigned long);
403 404
404/* 4) cache creation/removal */ 405/* 4) cache creation/removal */
405 const char *name; 406 const char *name;
406 struct list_head next; 407 struct list_head next;
407 408
408/* 5) statistics */ 409/* 5) statistics */
409#if STATS 410#if STATS
410 unsigned long num_active; 411 unsigned long num_active;
411 unsigned long num_allocations; 412 unsigned long num_allocations;
412 unsigned long high_mark; 413 unsigned long high_mark;
413 unsigned long grown; 414 unsigned long grown;
414 unsigned long reaped; 415 unsigned long reaped;
415 unsigned long errors; 416 unsigned long errors;
416 unsigned long max_freeable; 417 unsigned long max_freeable;
417 unsigned long node_allocs; 418 unsigned long node_allocs;
418 unsigned long node_frees; 419 unsigned long node_frees;
419 atomic_t allochit; 420 atomic_t allochit;
420 atomic_t allocmiss; 421 atomic_t allocmiss;
421 atomic_t freehit; 422 atomic_t freehit;
422 atomic_t freemiss; 423 atomic_t freemiss;
423#endif 424#endif
424#if DEBUG 425#if DEBUG
425 int dbghead; 426 int dbghead;
426 int reallen; 427 int reallen;
427#endif 428#endif
428}; 429};
429 430
@@ -434,7 +435,7 @@ struct kmem_cache {
434/* Optimization question: fewer reaps means less 435/* Optimization question: fewer reaps means less
435 * probability for unnessary cpucache drain/refill cycles. 436 * probability for unnessary cpucache drain/refill cycles.
436 * 437 *
437 * OTHO the cpuarrays can contain lots of objects, 438 * OTOH the cpuarrays can contain lots of objects,
438 * which could lock up otherwise freeable slabs. 439 * which could lock up otherwise freeable slabs.
439 */ 440 */
440#define REAPTIMEOUT_CPUC (2*HZ) 441#define REAPTIMEOUT_CPUC (2*HZ)
@@ -523,14 +524,15 @@ static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
523{ 524{
524 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 525 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
525 if (cachep->flags & SLAB_STORE_USER) 526 if (cachep->flags & SLAB_STORE_USER)
526 return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD); 527 return (unsigned long *)(objp + cachep->objsize -
527 return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD); 528 2 * BYTES_PER_WORD);
529 return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD);
528} 530}
529 531
530static void **dbg_userword(kmem_cache_t *cachep, void *objp) 532static void **dbg_userword(kmem_cache_t *cachep, void *objp)
531{ 533{
532 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 534 BUG_ON(!(cachep->flags & SLAB_STORE_USER));
533 return (void**)(objp+cachep->objsize-BYTES_PER_WORD); 535 return (void **)(objp + cachep->objsize - BYTES_PER_WORD);
534} 536}
535 537
536#else 538#else
@@ -565,14 +567,29 @@ static void **dbg_userword(kmem_cache_t *cachep, void *objp)
565#define BREAK_GFP_ORDER_LO 0 567#define BREAK_GFP_ORDER_LO 0
566static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; 568static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
567 569
568/* Macros for storing/retrieving the cachep and or slab from the 570/* Functions for storing/retrieving the cachep and or slab from the
569 * global 'mem_map'. These are used to find the slab an obj belongs to. 571 * global 'mem_map'. These are used to find the slab an obj belongs to.
570 * With kfree(), these are used to find the cache which an obj belongs to. 572 * With kfree(), these are used to find the cache which an obj belongs to.
571 */ 573 */
572#define SET_PAGE_CACHE(pg,x) ((pg)->lru.next = (struct list_head *)(x)) 574static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
573#define GET_PAGE_CACHE(pg) ((kmem_cache_t *)(pg)->lru.next) 575{
574#define SET_PAGE_SLAB(pg,x) ((pg)->lru.prev = (struct list_head *)(x)) 576 page->lru.next = (struct list_head *)cache;
575#define GET_PAGE_SLAB(pg) ((struct slab *)(pg)->lru.prev) 577}
578
579static inline struct kmem_cache *page_get_cache(struct page *page)
580{
581 return (struct kmem_cache *)page->lru.next;
582}
583
584static inline void page_set_slab(struct page *page, struct slab *slab)
585{
586 page->lru.prev = (struct list_head *)slab;
587}
588
589static inline struct slab *page_get_slab(struct page *page)
590{
591 return (struct slab *)page->lru.prev;
592}
576 593
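An aside on the hunk above: the SET/GET_PAGE_* macros become typed inline helpers. What they encode is that a page owned by the slab allocator does not use its lru list_head, so its two pointers are reused to record which cache and which slab the page backs. A minimal sketch, not part of the patch, of how kfree() can then recover the owning cache from a bare pointer using the helpers just introduced:

	/* Illustrative only -- condensed from the kfree() path in this file. */
	static kmem_cache_t *owning_cache(const void *obj)
	{
		struct page *page = virt_to_page(obj);	/* page backing the object */

		return page_get_cache(page);	/* cachep stashed in page->lru.next */
	}
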
577/* These are the default caches for kmalloc. Custom caches can have other sizes. */ 594/* These are the default caches for kmalloc. Custom caches can have other sizes. */
578struct cache_sizes malloc_sizes[] = { 595struct cache_sizes malloc_sizes[] = {
@@ -592,31 +609,31 @@ struct cache_names {
592static struct cache_names __initdata cache_names[] = { 609static struct cache_names __initdata cache_names[] = {
593#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, 610#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
594#include <linux/kmalloc_sizes.h> 611#include <linux/kmalloc_sizes.h>
595 { NULL, } 612 {NULL,}
596#undef CACHE 613#undef CACHE
597}; 614};
598 615
599static struct arraycache_init initarray_cache __initdata = 616static struct arraycache_init initarray_cache __initdata =
600 { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 617 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
601static struct arraycache_init initarray_generic = 618static struct arraycache_init initarray_generic =
602 { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 619 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
603 620
604/* internal cache of cache description objs */ 621/* internal cache of cache description objs */
605static kmem_cache_t cache_cache = { 622static kmem_cache_t cache_cache = {
606 .batchcount = 1, 623 .batchcount = 1,
607 .limit = BOOT_CPUCACHE_ENTRIES, 624 .limit = BOOT_CPUCACHE_ENTRIES,
608 .shared = 1, 625 .shared = 1,
609 .objsize = sizeof(kmem_cache_t), 626 .objsize = sizeof(kmem_cache_t),
610 .flags = SLAB_NO_REAP, 627 .flags = SLAB_NO_REAP,
611 .spinlock = SPIN_LOCK_UNLOCKED, 628 .spinlock = SPIN_LOCK_UNLOCKED,
612 .name = "kmem_cache", 629 .name = "kmem_cache",
613#if DEBUG 630#if DEBUG
614 .reallen = sizeof(kmem_cache_t), 631 .reallen = sizeof(kmem_cache_t),
615#endif 632#endif
616}; 633};
617 634
618/* Guard access to the cache-chain. */ 635/* Guard access to the cache-chain. */
619static struct semaphore cache_chain_sem; 636static DEFINE_MUTEX(cache_chain_mutex);
620static struct list_head cache_chain; 637static struct list_head cache_chain;
621 638
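An aside on the locking change above: cache_chain_sem, a semaphore used only for mutual exclusion, becomes cache_chain_mutex, so the down()/up() pairs throughout this file turn into mutex_lock()/mutex_unlock(). The cache-chain walks keep the same shape; a minimal sketch, assuming cachep is a kmem_cache_t pointer:

	mutex_lock(&cache_chain_mutex);
	list_for_each_entry(cachep, &cache_chain, next)
		enable_cpucache(cachep);	/* or any other per-cache work */
	mutex_unlock(&cache_chain_mutex);
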
622/* 639/*
@@ -640,9 +657,9 @@ static enum {
640 657
641static DEFINE_PER_CPU(struct work_struct, reap_work); 658static DEFINE_PER_CPU(struct work_struct, reap_work);
642 659
643static void free_block(kmem_cache_t* cachep, void** objpp, int len, int node); 660static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node);
644static void enable_cpucache (kmem_cache_t *cachep); 661static void enable_cpucache(kmem_cache_t *cachep);
645static void cache_reap (void *unused); 662static void cache_reap(void *unused);
646static int __node_shrink(kmem_cache_t *cachep, int node); 663static int __node_shrink(kmem_cache_t *cachep, int node);
647 664
648static inline struct array_cache *ac_data(kmem_cache_t *cachep) 665static inline struct array_cache *ac_data(kmem_cache_t *cachep)
@@ -656,9 +673,9 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
656 673
657#if DEBUG 674#if DEBUG
658 /* This happens if someone tries to call 675 /* This happens if someone tries to call
659 * kmem_cache_create(), or __kmalloc(), before 676 * kmem_cache_create(), or __kmalloc(), before
660 * the generic caches are initialized. 677 * the generic caches are initialized.
661 */ 678 */
662 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); 679 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
663#endif 680#endif
664 while (size > csizep->cs_size) 681 while (size > csizep->cs_size)
@@ -682,10 +699,10 @@ EXPORT_SYMBOL(kmem_find_general_cachep);
682 699
683/* Cal the num objs, wastage, and bytes left over for a given slab size. */ 700/* Cal the num objs, wastage, and bytes left over for a given slab size. */
684static void cache_estimate(unsigned long gfporder, size_t size, size_t align, 701static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
685 int flags, size_t *left_over, unsigned int *num) 702 int flags, size_t *left_over, unsigned int *num)
686{ 703{
687 int i; 704 int i;
688 size_t wastage = PAGE_SIZE<<gfporder; 705 size_t wastage = PAGE_SIZE << gfporder;
689 size_t extra = 0; 706 size_t extra = 0;
690 size_t base = 0; 707 size_t base = 0;
691 708
@@ -694,7 +711,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
694 extra = sizeof(kmem_bufctl_t); 711 extra = sizeof(kmem_bufctl_t);
695 } 712 }
696 i = 0; 713 i = 0;
697 while (i*size + ALIGN(base+i*extra, align) <= wastage) 714 while (i * size + ALIGN(base + i * extra, align) <= wastage)
698 i++; 715 i++;
699 if (i > 0) 716 if (i > 0)
700 i--; 717 i--;
@@ -703,8 +720,8 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
703 i = SLAB_LIMIT; 720 i = SLAB_LIMIT;
704 721
705 *num = i; 722 *num = i;
706 wastage -= i*size; 723 wastage -= i * size;
707 wastage -= ALIGN(base+i*extra, align); 724 wastage -= ALIGN(base + i * extra, align);
708 *left_over = wastage; 725 *left_over = wastage;
709} 726}
710 727
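To make cache_estimate() concrete, here is a standalone model of its arithmetic; the sizes and alignment below are assumptions chosen for illustration, not values taken from the patch. With 4096-byte pages, 128-byte objects and on-slab management (a few bytes for struct slab plus one kmem_bufctl_t per object), the loop finds the largest object count whose payload plus aligned management area still fits in the slab:

	#include <stdio.h>

	#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

	int main(void)
	{
		unsigned long wastage = 4096;	/* PAGE_SIZE << gfporder, order 0 */
		unsigned long size = 128;	/* object size */
		unsigned long align = 32;	/* assumed alignment */
		unsigned long base = 24;	/* assumed sizeof(struct slab) */
		unsigned long extra = 4;	/* assumed sizeof(kmem_bufctl_t) */
		unsigned long i = 0;

		while (i * size + ALIGN(base + i * extra, align) <= wastage)
			i++;
		if (i > 0)
			i--;
		printf("objects per slab: %lu, left over: %lu\n",
		       i, wastage - i * size - ALIGN(base + i * extra, align));
		return 0;
	}

With these numbers the model prints 30 objects and 96 bytes left over; the leftover bytes later become colour space.
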
@@ -713,7 +730,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
713static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) 730static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
714{ 731{
715 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 732 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
716 function, cachep->name, msg); 733 function, cachep->name, msg);
717 dump_stack(); 734 dump_stack();
718} 735}
719 736
@@ -740,9 +757,9 @@ static void __devinit start_cpu_timer(int cpu)
740} 757}
741 758
742static struct array_cache *alloc_arraycache(int node, int entries, 759static struct array_cache *alloc_arraycache(int node, int entries,
743 int batchcount) 760 int batchcount)
744{ 761{
745 int memsize = sizeof(void*)*entries+sizeof(struct array_cache); 762 int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
746 struct array_cache *nc = NULL; 763 struct array_cache *nc = NULL;
747 764
748 nc = kmalloc_node(memsize, GFP_KERNEL, node); 765 nc = kmalloc_node(memsize, GFP_KERNEL, node);
@@ -757,10 +774,12 @@ static struct array_cache *alloc_arraycache(int node, int entries,
757} 774}
758 775
759#ifdef CONFIG_NUMA 776#ifdef CONFIG_NUMA
777static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int);
778
760static inline struct array_cache **alloc_alien_cache(int node, int limit) 779static inline struct array_cache **alloc_alien_cache(int node, int limit)
761{ 780{
762 struct array_cache **ac_ptr; 781 struct array_cache **ac_ptr;
763 int memsize = sizeof(void*)*MAX_NUMNODES; 782 int memsize = sizeof(void *) * MAX_NUMNODES;
764 int i; 783 int i;
765 784
766 if (limit > 1) 785 if (limit > 1)
@@ -774,7 +793,7 @@ static inline struct array_cache **alloc_alien_cache(int node, int limit)
774 } 793 }
775 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); 794 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
776 if (!ac_ptr[i]) { 795 if (!ac_ptr[i]) {
777 for (i--; i <=0; i--) 796 for (i--; i <= 0; i--)
778 kfree(ac_ptr[i]); 797 kfree(ac_ptr[i]);
779 kfree(ac_ptr); 798 kfree(ac_ptr);
780 return NULL; 799 return NULL;
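One observation on the error path just above: on both sides of the diff the rollback loop reads for (i--; i <= 0; i--), so the condition looks inverted for an unwind of the entries already allocated; only the spacing changed in this patch. A conventional unwind of a partially built array, offered as a hedged sketch rather than as what the kernel code does, would count down from the failing index:

	while (i-- > 0)
		kfree(ac_ptr[i]);	/* free everything allocated before the failure */
	kfree(ac_ptr);
	return NULL;
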
@@ -792,12 +811,13 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
792 return; 811 return;
793 812
794 for_each_node(i) 813 for_each_node(i)
795 kfree(ac_ptr[i]); 814 kfree(ac_ptr[i]);
796 815
797 kfree(ac_ptr); 816 kfree(ac_ptr);
798} 817}
799 818
800static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node) 819static inline void __drain_alien_cache(kmem_cache_t *cachep,
820 struct array_cache *ac, int node)
801{ 821{
802 struct kmem_list3 *rl3 = cachep->nodelists[node]; 822 struct kmem_list3 *rl3 = cachep->nodelists[node];
803 823
@@ -811,7 +831,7 @@ static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache
811 831
812static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) 832static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
813{ 833{
814 int i=0; 834 int i = 0;
815 struct array_cache *ac; 835 struct array_cache *ac;
816 unsigned long flags; 836 unsigned long flags;
817 837
@@ -831,18 +851,17 @@ static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
831#endif 851#endif
832 852
833static int __devinit cpuup_callback(struct notifier_block *nfb, 853static int __devinit cpuup_callback(struct notifier_block *nfb,
834 unsigned long action, void *hcpu) 854 unsigned long action, void *hcpu)
835{ 855{
836 long cpu = (long)hcpu; 856 long cpu = (long)hcpu;
837 kmem_cache_t* cachep; 857 kmem_cache_t *cachep;
838 struct kmem_list3 *l3 = NULL; 858 struct kmem_list3 *l3 = NULL;
839 int node = cpu_to_node(cpu); 859 int node = cpu_to_node(cpu);
840 int memsize = sizeof(struct kmem_list3); 860 int memsize = sizeof(struct kmem_list3);
841 struct array_cache *nc = NULL;
842 861
843 switch (action) { 862 switch (action) {
844 case CPU_UP_PREPARE: 863 case CPU_UP_PREPARE:
845 down(&cache_chain_sem); 864 mutex_lock(&cache_chain_mutex);
846 /* we need to do this right in the beginning since 865 /* we need to do this right in the beginning since
847 * alloc_arraycache's are going to use this list. 866 * alloc_arraycache's are going to use this list.
848 * kmalloc_node allows us to add the slab to the right 867 * kmalloc_node allows us to add the slab to the right
@@ -856,27 +875,29 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
856 */ 875 */
857 if (!cachep->nodelists[node]) { 876 if (!cachep->nodelists[node]) {
858 if (!(l3 = kmalloc_node(memsize, 877 if (!(l3 = kmalloc_node(memsize,
859 GFP_KERNEL, node))) 878 GFP_KERNEL, node)))
860 goto bad; 879 goto bad;
861 kmem_list3_init(l3); 880 kmem_list3_init(l3);
862 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 881 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
863 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 882 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
864 883
865 cachep->nodelists[node] = l3; 884 cachep->nodelists[node] = l3;
866 } 885 }
867 886
868 spin_lock_irq(&cachep->nodelists[node]->list_lock); 887 spin_lock_irq(&cachep->nodelists[node]->list_lock);
869 cachep->nodelists[node]->free_limit = 888 cachep->nodelists[node]->free_limit =
870 (1 + nr_cpus_node(node)) * 889 (1 + nr_cpus_node(node)) *
871 cachep->batchcount + cachep->num; 890 cachep->batchcount + cachep->num;
872 spin_unlock_irq(&cachep->nodelists[node]->list_lock); 891 spin_unlock_irq(&cachep->nodelists[node]->list_lock);
873 } 892 }
874 893
875 /* Now we can go ahead with allocating the shared array's 894 /* Now we can go ahead with allocating the shared array's
876 & array cache's */ 895 & array cache's */
877 list_for_each_entry(cachep, &cache_chain, next) { 896 list_for_each_entry(cachep, &cache_chain, next) {
897 struct array_cache *nc;
898
878 nc = alloc_arraycache(node, cachep->limit, 899 nc = alloc_arraycache(node, cachep->limit,
879 cachep->batchcount); 900 cachep->batchcount);
880 if (!nc) 901 if (!nc)
881 goto bad; 902 goto bad;
882 cachep->array[cpu] = nc; 903 cachep->array[cpu] = nc;
@@ -885,16 +906,17 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
885 BUG_ON(!l3); 906 BUG_ON(!l3);
886 if (!l3->shared) { 907 if (!l3->shared) {
887 if (!(nc = alloc_arraycache(node, 908 if (!(nc = alloc_arraycache(node,
888 cachep->shared*cachep->batchcount, 909 cachep->shared *
889 0xbaadf00d))) 910 cachep->batchcount,
890 goto bad; 911 0xbaadf00d)))
912 goto bad;
891 913
892 /* we are serialised from CPU_DEAD or 914 /* we are serialised from CPU_DEAD or
893 CPU_UP_CANCELLED by the cpucontrol lock */ 915 CPU_UP_CANCELLED by the cpucontrol lock */
894 l3->shared = nc; 916 l3->shared = nc;
895 } 917 }
896 } 918 }
897 up(&cache_chain_sem); 919 mutex_unlock(&cache_chain_mutex);
898 break; 920 break;
899 case CPU_ONLINE: 921 case CPU_ONLINE:
900 start_cpu_timer(cpu); 922 start_cpu_timer(cpu);
@@ -903,7 +925,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
903 case CPU_DEAD: 925 case CPU_DEAD:
904 /* fall thru */ 926 /* fall thru */
905 case CPU_UP_CANCELED: 927 case CPU_UP_CANCELED:
906 down(&cache_chain_sem); 928 mutex_lock(&cache_chain_mutex);
907 929
908 list_for_each_entry(cachep, &cache_chain, next) { 930 list_for_each_entry(cachep, &cache_chain, next) {
909 struct array_cache *nc; 931 struct array_cache *nc;
@@ -927,13 +949,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
927 free_block(cachep, nc->entry, nc->avail, node); 949 free_block(cachep, nc->entry, nc->avail, node);
928 950
929 if (!cpus_empty(mask)) { 951 if (!cpus_empty(mask)) {
930 spin_unlock(&l3->list_lock); 952 spin_unlock(&l3->list_lock);
931 goto unlock_cache; 953 goto unlock_cache;
932 } 954 }
933 955
934 if (l3->shared) { 956 if (l3->shared) {
935 free_block(cachep, l3->shared->entry, 957 free_block(cachep, l3->shared->entry,
936 l3->shared->avail, node); 958 l3->shared->avail, node);
937 kfree(l3->shared); 959 kfree(l3->shared);
938 l3->shared = NULL; 960 l3->shared = NULL;
939 } 961 }
@@ -951,17 +973,17 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
951 } else { 973 } else {
952 spin_unlock(&l3->list_lock); 974 spin_unlock(&l3->list_lock);
953 } 975 }
954unlock_cache: 976 unlock_cache:
955 spin_unlock_irq(&cachep->spinlock); 977 spin_unlock_irq(&cachep->spinlock);
956 kfree(nc); 978 kfree(nc);
957 } 979 }
958 up(&cache_chain_sem); 980 mutex_unlock(&cache_chain_mutex);
959 break; 981 break;
960#endif 982#endif
961 } 983 }
962 return NOTIFY_OK; 984 return NOTIFY_OK;
963bad: 985 bad:
964 up(&cache_chain_sem); 986 mutex_unlock(&cache_chain_mutex);
965 return NOTIFY_BAD; 987 return NOTIFY_BAD;
966} 988}
967 989
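For orientation on the long region above: cpuup_callback() is the slab allocator's CPU-hotplug notifier. CPU_UP_PREPARE builds the per-node kmem_list3 plus the per-CPU and shared array caches under cache_chain_mutex, CPU_ONLINE starts the periodic reaper, and CPU_DEAD/CPU_UP_CANCELED tear the state down again. A hedged skeleton of that shape, with the allocation and teardown bodies elided:

	static int example_cpu_callback(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
	{
		long cpu = (long)hcpu;

		switch (action) {
		case CPU_UP_PREPARE:
			/* allocate per-cpu and per-node state before the cpu runs */
			break;
		case CPU_ONLINE:
			start_cpu_timer(cpu);	/* kick the periodic cache_reap() */
			break;
		case CPU_DEAD:		/* under CONFIG_HOTPLUG_CPU in the real file */
		case CPU_UP_CANCELED:
			/* free what CPU_UP_PREPARE built for this cpu */
			break;
		}
		return NOTIFY_OK;
	}
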
@@ -970,8 +992,7 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
970/* 992/*
971 * swap the static kmem_list3 with kmalloced memory 993 * swap the static kmem_list3 with kmalloced memory
972 */ 994 */
973static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, 995static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid)
974 int nodeid)
975{ 996{
976 struct kmem_list3 *ptr; 997 struct kmem_list3 *ptr;
977 998
@@ -1030,7 +1051,6 @@ void __init kmem_cache_init(void)
1030 */ 1051 */
1031 1052
1032 /* 1) create the cache_cache */ 1053 /* 1) create the cache_cache */
1033 init_MUTEX(&cache_chain_sem);
1034 INIT_LIST_HEAD(&cache_chain); 1054 INIT_LIST_HEAD(&cache_chain);
1035 list_add(&cache_cache.next, &cache_chain); 1055 list_add(&cache_cache.next, &cache_chain);
1036 cache_cache.colour_off = cache_line_size(); 1056 cache_cache.colour_off = cache_line_size();
@@ -1040,14 +1060,14 @@ void __init kmem_cache_init(void)
1040 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); 1060 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
1041 1061
1042 cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, 1062 cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
1043 &left_over, &cache_cache.num); 1063 &left_over, &cache_cache.num);
1044 if (!cache_cache.num) 1064 if (!cache_cache.num)
1045 BUG(); 1065 BUG();
1046 1066
1047 cache_cache.colour = left_over/cache_cache.colour_off; 1067 cache_cache.colour = left_over / cache_cache.colour_off;
1048 cache_cache.colour_next = 0; 1068 cache_cache.colour_next = 0;
1049 cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) + 1069 cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1050 sizeof(struct slab), cache_line_size()); 1070 sizeof(struct slab), cache_line_size());
1051 1071
1052 /* 2+3) create the kmalloc caches */ 1072 /* 2+3) create the kmalloc caches */
1053 sizes = malloc_sizes; 1073 sizes = malloc_sizes;
@@ -1059,14 +1079,18 @@ void __init kmem_cache_init(void)
1059 */ 1079 */
1060 1080
1061 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1081 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1062 sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, 1082 sizes[INDEX_AC].cs_size,
1063 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1083 ARCH_KMALLOC_MINALIGN,
1084 (ARCH_KMALLOC_FLAGS |
1085 SLAB_PANIC), NULL, NULL);
1064 1086
1065 if (INDEX_AC != INDEX_L3) 1087 if (INDEX_AC != INDEX_L3)
1066 sizes[INDEX_L3].cs_cachep = 1088 sizes[INDEX_L3].cs_cachep =
1067 kmem_cache_create(names[INDEX_L3].name, 1089 kmem_cache_create(names[INDEX_L3].name,
1068 sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, 1090 sizes[INDEX_L3].cs_size,
1069 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1091 ARCH_KMALLOC_MINALIGN,
1092 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL,
1093 NULL);
1070 1094
1071 while (sizes->cs_size != ULONG_MAX) { 1095 while (sizes->cs_size != ULONG_MAX) {
1072 /* 1096 /*
@@ -1076,35 +1100,41 @@ void __init kmem_cache_init(void)
1076 * Note for systems short on memory removing the alignment will 1100 * Note for systems short on memory removing the alignment will
1077 * allow tighter packing of the smaller caches. 1101 * allow tighter packing of the smaller caches.
1078 */ 1102 */
1079 if(!sizes->cs_cachep) 1103 if (!sizes->cs_cachep)
1080 sizes->cs_cachep = kmem_cache_create(names->name, 1104 sizes->cs_cachep = kmem_cache_create(names->name,
1081 sizes->cs_size, ARCH_KMALLOC_MINALIGN, 1105 sizes->cs_size,
1082 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1106 ARCH_KMALLOC_MINALIGN,
1107 (ARCH_KMALLOC_FLAGS
1108 | SLAB_PANIC),
1109 NULL, NULL);
1083 1110
1084 /* Inc off-slab bufctl limit until the ceiling is hit. */ 1111 /* Inc off-slab bufctl limit until the ceiling is hit. */
1085 if (!(OFF_SLAB(sizes->cs_cachep))) { 1112 if (!(OFF_SLAB(sizes->cs_cachep))) {
1086 offslab_limit = sizes->cs_size-sizeof(struct slab); 1113 offslab_limit = sizes->cs_size - sizeof(struct slab);
1087 offslab_limit /= sizeof(kmem_bufctl_t); 1114 offslab_limit /= sizeof(kmem_bufctl_t);
1088 } 1115 }
1089 1116
1090 sizes->cs_dmacachep = kmem_cache_create(names->name_dma, 1117 sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1091 sizes->cs_size, ARCH_KMALLOC_MINALIGN, 1118 sizes->cs_size,
1092 (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC), 1119 ARCH_KMALLOC_MINALIGN,
1093 NULL, NULL); 1120 (ARCH_KMALLOC_FLAGS |
1121 SLAB_CACHE_DMA |
1122 SLAB_PANIC), NULL,
1123 NULL);
1094 1124
1095 sizes++; 1125 sizes++;
1096 names++; 1126 names++;
1097 } 1127 }
1098 /* 4) Replace the bootstrap head arrays */ 1128 /* 4) Replace the bootstrap head arrays */
1099 { 1129 {
1100 void * ptr; 1130 void *ptr;
1101 1131
1102 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1132 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1103 1133
1104 local_irq_disable(); 1134 local_irq_disable();
1105 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); 1135 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
1106 memcpy(ptr, ac_data(&cache_cache), 1136 memcpy(ptr, ac_data(&cache_cache),
1107 sizeof(struct arraycache_init)); 1137 sizeof(struct arraycache_init));
1108 cache_cache.array[smp_processor_id()] = ptr; 1138 cache_cache.array[smp_processor_id()] = ptr;
1109 local_irq_enable(); 1139 local_irq_enable();
1110 1140
@@ -1112,11 +1142,11 @@ void __init kmem_cache_init(void)
1112 1142
1113 local_irq_disable(); 1143 local_irq_disable();
1114 BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep) 1144 BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
1115 != &initarray_generic.cache); 1145 != &initarray_generic.cache);
1116 memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep), 1146 memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
1117 sizeof(struct arraycache_init)); 1147 sizeof(struct arraycache_init));
1118 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1148 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1119 ptr; 1149 ptr;
1120 local_irq_enable(); 1150 local_irq_enable();
1121 } 1151 }
1122 /* 5) Replace the bootstrap kmem_list3's */ 1152 /* 5) Replace the bootstrap kmem_list3's */
@@ -1124,16 +1154,16 @@ void __init kmem_cache_init(void)
1124 int node; 1154 int node;
1125 /* Replace the static kmem_list3 structures for the boot cpu */ 1155 /* Replace the static kmem_list3 structures for the boot cpu */
1126 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], 1156 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
1127 numa_node_id()); 1157 numa_node_id());
1128 1158
1129 for_each_online_node(node) { 1159 for_each_online_node(node) {
1130 init_list(malloc_sizes[INDEX_AC].cs_cachep, 1160 init_list(malloc_sizes[INDEX_AC].cs_cachep,
1131 &initkmem_list3[SIZE_AC+node], node); 1161 &initkmem_list3[SIZE_AC + node], node);
1132 1162
1133 if (INDEX_AC != INDEX_L3) { 1163 if (INDEX_AC != INDEX_L3) {
1134 init_list(malloc_sizes[INDEX_L3].cs_cachep, 1164 init_list(malloc_sizes[INDEX_L3].cs_cachep,
1135 &initkmem_list3[SIZE_L3+node], 1165 &initkmem_list3[SIZE_L3 + node],
1136 node); 1166 node);
1137 } 1167 }
1138 } 1168 }
1139 } 1169 }
@@ -1141,10 +1171,10 @@ void __init kmem_cache_init(void)
1141 /* 6) resize the head arrays to their final sizes */ 1171 /* 6) resize the head arrays to their final sizes */
1142 { 1172 {
1143 kmem_cache_t *cachep; 1173 kmem_cache_t *cachep;
1144 down(&cache_chain_sem); 1174 mutex_lock(&cache_chain_mutex);
1145 list_for_each_entry(cachep, &cache_chain, next) 1175 list_for_each_entry(cachep, &cache_chain, next)
1146 enable_cpucache(cachep); 1176 enable_cpucache(cachep);
1147 up(&cache_chain_sem); 1177 mutex_unlock(&cache_chain_mutex);
1148 } 1178 }
1149 1179
1150 /* Done! */ 1180 /* Done! */
@@ -1169,7 +1199,7 @@ static int __init cpucache_init(void)
1169 * pages to gfp. 1199 * pages to gfp.
1170 */ 1200 */
1171 for_each_online_cpu(cpu) 1201 for_each_online_cpu(cpu)
1172 start_cpu_timer(cpu); 1202 start_cpu_timer(cpu);
1173 1203
1174 return 0; 1204 return 0;
1175} 1205}
@@ -1190,11 +1220,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
1190 int i; 1220 int i;
1191 1221
1192 flags |= cachep->gfpflags; 1222 flags |= cachep->gfpflags;
1193 if (likely(nodeid == -1)) { 1223 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1194 page = alloc_pages(flags, cachep->gfporder);
1195 } else {
1196 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1197 }
1198 if (!page) 1224 if (!page)
1199 return NULL; 1225 return NULL;
1200 addr = page_address(page); 1226 addr = page_address(page);
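A note on the simplification above: the nodeid == -1 special case disappears because alloc_pages_node() is relied on to treat a negative node id as "no preference" and fall back to the local node; that fallback is an assumption of this note rather than something spelled out in the patch. Roughly:

	/* assumed behaviour inside alloc_pages_node() */
	if (nid < 0)
		nid = numa_node_id();	/* no preferred node: use the local one */
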
@@ -1215,7 +1241,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
1215 */ 1241 */
1216static void kmem_freepages(kmem_cache_t *cachep, void *addr) 1242static void kmem_freepages(kmem_cache_t *cachep, void *addr)
1217{ 1243{
1218 unsigned long i = (1<<cachep->gfporder); 1244 unsigned long i = (1 << cachep->gfporder);
1219 struct page *page = virt_to_page(addr); 1245 struct page *page = virt_to_page(addr);
1220 const unsigned long nr_freed = i; 1246 const unsigned long nr_freed = i;
1221 1247
@@ -1228,13 +1254,13 @@ static void kmem_freepages(kmem_cache_t *cachep, void *addr)
1228 if (current->reclaim_state) 1254 if (current->reclaim_state)
1229 current->reclaim_state->reclaimed_slab += nr_freed; 1255 current->reclaim_state->reclaimed_slab += nr_freed;
1230 free_pages((unsigned long)addr, cachep->gfporder); 1256 free_pages((unsigned long)addr, cachep->gfporder);
1231 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1257 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1232 atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages); 1258 atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
1233} 1259}
1234 1260
1235static void kmem_rcu_free(struct rcu_head *head) 1261static void kmem_rcu_free(struct rcu_head *head)
1236{ 1262{
1237 struct slab_rcu *slab_rcu = (struct slab_rcu *) head; 1263 struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1238 kmem_cache_t *cachep = slab_rcu->cachep; 1264 kmem_cache_t *cachep = slab_rcu->cachep;
1239 1265
1240 kmem_freepages(cachep, slab_rcu->addr); 1266 kmem_freepages(cachep, slab_rcu->addr);
@@ -1246,19 +1272,19 @@ static void kmem_rcu_free(struct rcu_head *head)
1246 1272
1247#ifdef CONFIG_DEBUG_PAGEALLOC 1273#ifdef CONFIG_DEBUG_PAGEALLOC
1248static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, 1274static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1249 unsigned long caller) 1275 unsigned long caller)
1250{ 1276{
1251 int size = obj_reallen(cachep); 1277 int size = obj_reallen(cachep);
1252 1278
1253 addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)]; 1279 addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)];
1254 1280
1255 if (size < 5*sizeof(unsigned long)) 1281 if (size < 5 * sizeof(unsigned long))
1256 return; 1282 return;
1257 1283
1258 *addr++=0x12345678; 1284 *addr++ = 0x12345678;
1259 *addr++=caller; 1285 *addr++ = caller;
1260 *addr++=smp_processor_id(); 1286 *addr++ = smp_processor_id();
1261 size -= 3*sizeof(unsigned long); 1287 size -= 3 * sizeof(unsigned long);
1262 { 1288 {
1263 unsigned long *sptr = &caller; 1289 unsigned long *sptr = &caller;
1264 unsigned long svalue; 1290 unsigned long svalue;
@@ -1266,7 +1292,7 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1266 while (!kstack_end(sptr)) { 1292 while (!kstack_end(sptr)) {
1267 svalue = *sptr++; 1293 svalue = *sptr++;
1268 if (kernel_text_address(svalue)) { 1294 if (kernel_text_address(svalue)) {
1269 *addr++=svalue; 1295 *addr++ = svalue;
1270 size -= sizeof(unsigned long); 1296 size -= sizeof(unsigned long);
1271 if (size <= sizeof(unsigned long)) 1297 if (size <= sizeof(unsigned long))
1272 break; 1298 break;
@@ -1274,25 +1300,25 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1274 } 1300 }
1275 1301
1276 } 1302 }
1277 *addr++=0x87654321; 1303 *addr++ = 0x87654321;
1278} 1304}
1279#endif 1305#endif
1280 1306
1281static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) 1307static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
1282{ 1308{
1283 int size = obj_reallen(cachep); 1309 int size = obj_reallen(cachep);
1284 addr = &((char*)addr)[obj_dbghead(cachep)]; 1310 addr = &((char *)addr)[obj_dbghead(cachep)];
1285 1311
1286 memset(addr, val, size); 1312 memset(addr, val, size);
1287 *(unsigned char *)(addr+size-1) = POISON_END; 1313 *(unsigned char *)(addr + size - 1) = POISON_END;
1288} 1314}
1289 1315
1290static void dump_line(char *data, int offset, int limit) 1316static void dump_line(char *data, int offset, int limit)
1291{ 1317{
1292 int i; 1318 int i;
1293 printk(KERN_ERR "%03x:", offset); 1319 printk(KERN_ERR "%03x:", offset);
1294 for (i=0;i<limit;i++) { 1320 for (i = 0; i < limit; i++) {
1295 printk(" %02x", (unsigned char)data[offset+i]); 1321 printk(" %02x", (unsigned char)data[offset + i]);
1296 } 1322 }
1297 printk("\n"); 1323 printk("\n");
1298} 1324}
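To visualise what poison_obj() and dump_line() above operate on: the usable bytes of a free object are filled with a repeating poison byte and the final byte gets a distinct end marker, so a later check can spot both stray writes and one-byte overruns. A standalone model, with the poison values treated as assumptions of the sketch:

	#include <stdio.h>
	#include <string.h>

	#define POISON_FREE	0x6b	/* assumed fill byte */
	#define POISON_END	0xa5	/* assumed end marker */

	int main(void)
	{
		unsigned char obj[16];
		unsigned int i;

		memset(obj, POISON_FREE, sizeof(obj));	/* poison_obj(): fill the object */
		obj[sizeof(obj) - 1] = POISON_END;	/* ...and mark its last byte */

		for (i = 0; i < sizeof(obj); i++)
			printf("%02x ", obj[i]);	/* 6b 6b ... 6b a5 */
		printf("\n");
		return 0;
	}
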
@@ -1307,24 +1333,24 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
1307 1333
1308 if (cachep->flags & SLAB_RED_ZONE) { 1334 if (cachep->flags & SLAB_RED_ZONE) {
1309 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", 1335 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1310 *dbg_redzone1(cachep, objp), 1336 *dbg_redzone1(cachep, objp),
1311 *dbg_redzone2(cachep, objp)); 1337 *dbg_redzone2(cachep, objp));
1312 } 1338 }
1313 1339
1314 if (cachep->flags & SLAB_STORE_USER) { 1340 if (cachep->flags & SLAB_STORE_USER) {
1315 printk(KERN_ERR "Last user: [<%p>]", 1341 printk(KERN_ERR "Last user: [<%p>]",
1316 *dbg_userword(cachep, objp)); 1342 *dbg_userword(cachep, objp));
1317 print_symbol("(%s)", 1343 print_symbol("(%s)",
1318 (unsigned long)*dbg_userword(cachep, objp)); 1344 (unsigned long)*dbg_userword(cachep, objp));
1319 printk("\n"); 1345 printk("\n");
1320 } 1346 }
1321 realobj = (char*)objp+obj_dbghead(cachep); 1347 realobj = (char *)objp + obj_dbghead(cachep);
1322 size = obj_reallen(cachep); 1348 size = obj_reallen(cachep);
1323 for (i=0; i<size && lines;i+=16, lines--) { 1349 for (i = 0; i < size && lines; i += 16, lines--) {
1324 int limit; 1350 int limit;
1325 limit = 16; 1351 limit = 16;
1326 if (i+limit > size) 1352 if (i + limit > size)
1327 limit = size-i; 1353 limit = size - i;
1328 dump_line(realobj, i, limit); 1354 dump_line(realobj, i, limit);
1329 } 1355 }
1330} 1356}
@@ -1335,27 +1361,28 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1335 int size, i; 1361 int size, i;
1336 int lines = 0; 1362 int lines = 0;
1337 1363
1338 realobj = (char*)objp+obj_dbghead(cachep); 1364 realobj = (char *)objp + obj_dbghead(cachep);
1339 size = obj_reallen(cachep); 1365 size = obj_reallen(cachep);
1340 1366
1341 for (i=0;i<size;i++) { 1367 for (i = 0; i < size; i++) {
1342 char exp = POISON_FREE; 1368 char exp = POISON_FREE;
1343 if (i == size-1) 1369 if (i == size - 1)
1344 exp = POISON_END; 1370 exp = POISON_END;
1345 if (realobj[i] != exp) { 1371 if (realobj[i] != exp) {
1346 int limit; 1372 int limit;
1347 /* Mismatch ! */ 1373 /* Mismatch ! */
1348 /* Print header */ 1374 /* Print header */
1349 if (lines == 0) { 1375 if (lines == 0) {
1350 printk(KERN_ERR "Slab corruption: start=%p, len=%d\n", 1376 printk(KERN_ERR
1351 realobj, size); 1377 "Slab corruption: start=%p, len=%d\n",
1378 realobj, size);
1352 print_objinfo(cachep, objp, 0); 1379 print_objinfo(cachep, objp, 0);
1353 } 1380 }
1354 /* Hexdump the affected line */ 1381 /* Hexdump the affected line */
1355 i = (i/16)*16; 1382 i = (i / 16) * 16;
1356 limit = 16; 1383 limit = 16;
1357 if (i+limit > size) 1384 if (i + limit > size)
1358 limit = size-i; 1385 limit = size - i;
1359 dump_line(realobj, i, limit); 1386 dump_line(realobj, i, limit);
1360 i += 16; 1387 i += 16;
1361 lines++; 1388 lines++;
@@ -1368,22 +1395,22 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1368 /* Print some data about the neighboring objects, if they 1395 /* Print some data about the neighboring objects, if they
1369 * exist: 1396 * exist:
1370 */ 1397 */
1371 struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp)); 1398 struct slab *slabp = page_get_slab(virt_to_page(objp));
1372 int objnr; 1399 int objnr;
1373 1400
1374 objnr = (objp-slabp->s_mem)/cachep->objsize; 1401 objnr = (objp - slabp->s_mem) / cachep->objsize;
1375 if (objnr) { 1402 if (objnr) {
1376 objp = slabp->s_mem+(objnr-1)*cachep->objsize; 1403 objp = slabp->s_mem + (objnr - 1) * cachep->objsize;
1377 realobj = (char*)objp+obj_dbghead(cachep); 1404 realobj = (char *)objp + obj_dbghead(cachep);
1378 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1405 printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1379 realobj, size); 1406 realobj, size);
1380 print_objinfo(cachep, objp, 2); 1407 print_objinfo(cachep, objp, 2);
1381 } 1408 }
1382 if (objnr+1 < cachep->num) { 1409 if (objnr + 1 < cachep->num) {
1383 objp = slabp->s_mem+(objnr+1)*cachep->objsize; 1410 objp = slabp->s_mem + (objnr + 1) * cachep->objsize;
1384 realobj = (char*)objp+obj_dbghead(cachep); 1411 realobj = (char *)objp + obj_dbghead(cachep);
1385 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1412 printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1386 realobj, size); 1413 realobj, size);
1387 print_objinfo(cachep, objp, 2); 1414 print_objinfo(cachep, objp, 2);
1388 } 1415 }
1389 } 1416 }
@@ -1394,7 +1421,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1394 * Before calling the slab must have been unlinked from the cache. 1421 * Before calling the slab must have been unlinked from the cache.
1395 * The cache-lock is not held/needed. 1422 * The cache-lock is not held/needed.
1396 */ 1423 */
1397static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) 1424static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp)
1398{ 1425{
1399 void *addr = slabp->s_mem - slabp->colouroff; 1426 void *addr = slabp->s_mem - slabp->colouroff;
1400 1427
@@ -1405,8 +1432,11 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1405 1432
1406 if (cachep->flags & SLAB_POISON) { 1433 if (cachep->flags & SLAB_POISON) {
1407#ifdef CONFIG_DEBUG_PAGEALLOC 1434#ifdef CONFIG_DEBUG_PAGEALLOC
1408 if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep)) 1435 if ((cachep->objsize % PAGE_SIZE) == 0
1409 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1); 1436 && OFF_SLAB(cachep))
1437 kernel_map_pages(virt_to_page(objp),
1438 cachep->objsize / PAGE_SIZE,
1439 1);
1410 else 1440 else
1411 check_poison_obj(cachep, objp); 1441 check_poison_obj(cachep, objp);
1412#else 1442#else
@@ -1416,20 +1446,20 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1416 if (cachep->flags & SLAB_RED_ZONE) { 1446 if (cachep->flags & SLAB_RED_ZONE) {
1417 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1447 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1418 slab_error(cachep, "start of a freed object " 1448 slab_error(cachep, "start of a freed object "
1419 "was overwritten"); 1449 "was overwritten");
1420 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 1450 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1421 slab_error(cachep, "end of a freed object " 1451 slab_error(cachep, "end of a freed object "
1422 "was overwritten"); 1452 "was overwritten");
1423 } 1453 }
1424 if (cachep->dtor && !(cachep->flags & SLAB_POISON)) 1454 if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1425 (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0); 1455 (cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0);
1426 } 1456 }
1427#else 1457#else
1428 if (cachep->dtor) { 1458 if (cachep->dtor) {
1429 int i; 1459 int i;
1430 for (i = 0; i < cachep->num; i++) { 1460 for (i = 0; i < cachep->num; i++) {
1431 void* objp = slabp->s_mem+cachep->objsize*i; 1461 void *objp = slabp->s_mem + cachep->objsize * i;
1432 (cachep->dtor)(objp, cachep, 0); 1462 (cachep->dtor) (objp, cachep, 0);
1433 } 1463 }
1434 } 1464 }
1435#endif 1465#endif
@@ -1437,7 +1467,7 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1437 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 1467 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1438 struct slab_rcu *slab_rcu; 1468 struct slab_rcu *slab_rcu;
1439 1469
1440 slab_rcu = (struct slab_rcu *) slabp; 1470 slab_rcu = (struct slab_rcu *)slabp;
1441 slab_rcu->cachep = cachep; 1471 slab_rcu->cachep = cachep;
1442 slab_rcu->addr = addr; 1472 slab_rcu->addr = addr;
1443 call_rcu(&slab_rcu->head, kmem_rcu_free); 1473 call_rcu(&slab_rcu->head, kmem_rcu_free);
@@ -1455,11 +1485,58 @@ static inline void set_up_list3s(kmem_cache_t *cachep, int index)
1455 int node; 1485 int node;
1456 1486
1457 for_each_online_node(node) { 1487 for_each_online_node(node) {
1458 cachep->nodelists[node] = &initkmem_list3[index+node]; 1488 cachep->nodelists[node] = &initkmem_list3[index + node];
1459 cachep->nodelists[node]->next_reap = jiffies + 1489 cachep->nodelists[node]->next_reap = jiffies +
1460 REAPTIMEOUT_LIST3 + 1490 REAPTIMEOUT_LIST3 +
1461 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 1491 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1492 }
1493}
1494
1495/**
1496 * calculate_slab_order - calculate size (page order) of slabs and the number
1497 * of objects per slab.
1498 *
1499 * This could be made much more intelligent. For now, try to avoid using
1500 * high order pages for slabs. When the gfp() functions are more friendly
1501 * towards high-order requests, this should be changed.
1502 */
1503static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size,
1504 size_t align, gfp_t flags)
1505{
1506 size_t left_over = 0;
1507
1508 for (;; cachep->gfporder++) {
1509 unsigned int num;
1510 size_t remainder;
1511
1512 if (cachep->gfporder > MAX_GFP_ORDER) {
1513 cachep->num = 0;
1514 break;
1515 }
1516
1517 cache_estimate(cachep->gfporder, size, align, flags,
1518 &remainder, &num);
1519 if (!num)
1520 continue;
1521 /* More than offslab_limit objects will cause problems */
1522 if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
1523 break;
1524
1525 cachep->num = num;
1526 left_over = remainder;
1527
1528 /*
1529 * Large number of objects is good, but very large slabs are
1530 * currently bad for the gfp()s.
1531 */
1532 if (cachep->gfporder >= slab_break_gfp_order)
1533 break;
1534
1535 if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
1536 /* Acceptable internal fragmentation */
1537 break;
1462 } 1538 }
1539 return left_over;
1463} 1540}
1464 1541
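A reading of the new calculate_slab_order() helper above: starting at the current cachep->gfporder it asks cache_estimate() how many objects fit at each order, keeps the best answer, and stops once the order reaches slab_break_gfp_order or the leftover falls to an eighth of the slab or less. A trimmed standalone model of those stop conditions; the constants and the toy estimate are assumptions, and the off-slab bufctl limit is ignored:

	#include <stdio.h>

	#define PAGE_BYTES	4096UL	/* model assumption */
	#define MAX_ORDER_MODEL	5	/* model assumption */
	#define BREAK_ORDER	1	/* stand-in for slab_break_gfp_order */

	/* Toy estimate: how many size-byte objects fit in 2^order pages. */
	static unsigned int estimate(int order, unsigned long size, unsigned long *left)
	{
		unsigned long bytes = PAGE_BYTES << order;
		unsigned int num = bytes / size;

		*left = bytes - num * size;
		return num;
	}

	int main(void)
	{
		unsigned long size = 3000;	/* an awkward object size */
		unsigned long left = 0;
		int order;

		for (order = 0; order <= MAX_ORDER_MODEL; order++) {
			if (!estimate(order, size, &left))
				continue;	/* object does not fit yet */
			if (order >= BREAK_ORDER)
				break;		/* avoid high-order pages */
			if (left * 8 <= (PAGE_BYTES << order))
				break;		/* fragmentation is acceptable */
		}
		printf("chose order %d, %lu bytes left over\n", order, left);
		return 0;
	}

With a 3000-byte object this picks order 1 with 2192 bytes left over, mirroring how the real helper trades slab size against internal fragmentation.
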
1465/** 1542/**
@@ -1508,16 +1585,15 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1508 * Sanity checks... these are all serious usage bugs. 1585 * Sanity checks... these are all serious usage bugs.
1509 */ 1586 */
1510 if ((!name) || 1587 if ((!name) ||
1511 in_interrupt() || 1588 in_interrupt() ||
1512 (size < BYTES_PER_WORD) || 1589 (size < BYTES_PER_WORD) ||
1513 (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) || 1590 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
1514 (dtor && !ctor)) { 1591 printk(KERN_ERR "%s: Early error in slab %s\n",
1515 printk(KERN_ERR "%s: Early error in slab %s\n", 1592 __FUNCTION__, name);
1516 __FUNCTION__, name); 1593 BUG();
1517 BUG(); 1594 }
1518 }
1519 1595
1520 down(&cache_chain_sem); 1596 mutex_lock(&cache_chain_mutex);
1521 1597
1522 list_for_each(p, &cache_chain) { 1598 list_for_each(p, &cache_chain) {
1523 kmem_cache_t *pc = list_entry(p, kmem_cache_t, next); 1599 kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
@@ -1535,11 +1611,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1535 set_fs(old_fs); 1611 set_fs(old_fs);
1536 if (res) { 1612 if (res) {
1537 printk("SLAB: cache with size %d has lost its name\n", 1613 printk("SLAB: cache with size %d has lost its name\n",
1538 pc->objsize); 1614 pc->objsize);
1539 continue; 1615 continue;
1540 } 1616 }
1541 1617
1542 if (!strcmp(pc->name,name)) { 1618 if (!strcmp(pc->name, name)) {
1543 printk("kmem_cache_create: duplicate cache %s\n", name); 1619 printk("kmem_cache_create: duplicate cache %s\n", name);
1544 dump_stack(); 1620 dump_stack();
1545 goto oops; 1621 goto oops;
@@ -1551,10 +1627,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1551 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { 1627 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
1552 /* No constructor, but inital state check requested */ 1628 /* No constructor, but inital state check requested */
1553 printk(KERN_ERR "%s: No con, but init state check " 1629 printk(KERN_ERR "%s: No con, but init state check "
1554 "requested - %s\n", __FUNCTION__, name); 1630 "requested - %s\n", __FUNCTION__, name);
1555 flags &= ~SLAB_DEBUG_INITIAL; 1631 flags &= ~SLAB_DEBUG_INITIAL;
1556 } 1632 }
1557
1558#if FORCED_DEBUG 1633#if FORCED_DEBUG
1559 /* 1634 /*
1560 * Enable redzoning and last user accounting, except for caches with 1635 * Enable redzoning and last user accounting, except for caches with
@@ -1562,8 +1637,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1562 * above the next power of two: caches with object sizes just above a 1637 * above the next power of two: caches with object sizes just above a
1563 * power of two have a significant amount of internal fragmentation. 1638 * power of two have a significant amount of internal fragmentation.
1564 */ 1639 */
1565 if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))) 1640 if ((size < 4096
1566 flags |= SLAB_RED_ZONE|SLAB_STORE_USER; 1641 || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
1642 flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
1567 if (!(flags & SLAB_DESTROY_BY_RCU)) 1643 if (!(flags & SLAB_DESTROY_BY_RCU))
1568 flags |= SLAB_POISON; 1644 flags |= SLAB_POISON;
1569#endif 1645#endif
@@ -1584,9 +1660,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1584 * unaligned accesses for some archs when redzoning is used, and makes 1660 * unaligned accesses for some archs when redzoning is used, and makes
1585 * sure any on-slab bufctl's are also correctly aligned. 1661 * sure any on-slab bufctl's are also correctly aligned.
1586 */ 1662 */
1587 if (size & (BYTES_PER_WORD-1)) { 1663 if (size & (BYTES_PER_WORD - 1)) {
1588 size += (BYTES_PER_WORD-1); 1664 size += (BYTES_PER_WORD - 1);
1589 size &= ~(BYTES_PER_WORD-1); 1665 size &= ~(BYTES_PER_WORD - 1);
1590 } 1666 }
1591 1667
1592 /* calculate out the final buffer alignment: */ 1668 /* calculate out the final buffer alignment: */
@@ -1597,7 +1673,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1597 * objects into one cacheline. 1673 * objects into one cacheline.
1598 */ 1674 */
1599 ralign = cache_line_size(); 1675 ralign = cache_line_size();
1600 while (size <= ralign/2) 1676 while (size <= ralign / 2)
1601 ralign /= 2; 1677 ralign /= 2;
1602 } else { 1678 } else {
1603 ralign = BYTES_PER_WORD; 1679 ralign = BYTES_PER_WORD;
@@ -1606,13 +1682,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1606 if (ralign < ARCH_SLAB_MINALIGN) { 1682 if (ralign < ARCH_SLAB_MINALIGN) {
1607 ralign = ARCH_SLAB_MINALIGN; 1683 ralign = ARCH_SLAB_MINALIGN;
1608 if (ralign > BYTES_PER_WORD) 1684 if (ralign > BYTES_PER_WORD)
1609 flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); 1685 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1610 } 1686 }
1611 /* 3) caller mandated alignment: disables debug if necessary */ 1687 /* 3) caller mandated alignment: disables debug if necessary */
1612 if (ralign < align) { 1688 if (ralign < align) {
1613 ralign = align; 1689 ralign = align;
1614 if (ralign > BYTES_PER_WORD) 1690 if (ralign > BYTES_PER_WORD)
1615 flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); 1691 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1616 } 1692 }
1617 /* 4) Store it. Note that the debug code below can reduce 1693 /* 4) Store it. Note that the debug code below can reduce
1618 * the alignment to BYTES_PER_WORD. 1694 * the alignment to BYTES_PER_WORD.
@@ -1634,7 +1710,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1634 1710
1635 /* add space for red zone words */ 1711 /* add space for red zone words */
1636 cachep->dbghead += BYTES_PER_WORD; 1712 cachep->dbghead += BYTES_PER_WORD;
1637 size += 2*BYTES_PER_WORD; 1713 size += 2 * BYTES_PER_WORD;
1638 } 1714 }
1639 if (flags & SLAB_STORE_USER) { 1715 if (flags & SLAB_STORE_USER) {
1640 /* user store requires word alignment and 1716 /* user store requires word alignment and
@@ -1645,7 +1721,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1645 size += BYTES_PER_WORD; 1721 size += BYTES_PER_WORD;
1646 } 1722 }
1647#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 1723#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
1648 if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { 1724 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
1725 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
1649 cachep->dbghead += PAGE_SIZE - size; 1726 cachep->dbghead += PAGE_SIZE - size;
1650 size = PAGE_SIZE; 1727 size = PAGE_SIZE;
1651 } 1728 }
@@ -1653,7 +1730,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1653#endif 1730#endif
1654 1731
1655 /* Determine if the slab management is 'on' or 'off' slab. */ 1732 /* Determine if the slab management is 'on' or 'off' slab. */
1656 if (size >= (PAGE_SIZE>>3)) 1733 if (size >= (PAGE_SIZE >> 3))
1657 /* 1734 /*
1658 * Size is large, assume best to place the slab management obj 1735 * Size is large, assume best to place the slab management obj
1659 * off-slab (should allow better packing of objs). 1736 * off-slab (should allow better packing of objs).
@@ -1670,47 +1747,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1670 */ 1747 */
1671 cachep->gfporder = 0; 1748 cachep->gfporder = 0;
1672 cache_estimate(cachep->gfporder, size, align, flags, 1749 cache_estimate(cachep->gfporder, size, align, flags,
1673 &left_over, &cachep->num); 1750 &left_over, &cachep->num);
1674 } else { 1751 } else
1675 /* 1752 left_over = calculate_slab_order(cachep, size, align, flags);
1676 * Calculate size (in pages) of slabs, and the num of objs per
1677 * slab. This could be made much more intelligent. For now,
1678 * try to avoid using high page-orders for slabs. When the
1679 * gfp() funcs are more friendly towards high-order requests,
1680 * this should be changed.
1681 */
1682 do {
1683 unsigned int break_flag = 0;
1684cal_wastage:
1685 cache_estimate(cachep->gfporder, size, align, flags,
1686 &left_over, &cachep->num);
1687 if (break_flag)
1688 break;
1689 if (cachep->gfporder >= MAX_GFP_ORDER)
1690 break;
1691 if (!cachep->num)
1692 goto next;
1693 if (flags & CFLGS_OFF_SLAB &&
1694 cachep->num > offslab_limit) {
1695 /* This num of objs will cause problems. */
1696 cachep->gfporder--;
1697 break_flag++;
1698 goto cal_wastage;
1699 }
1700
1701 /*
1702 * Large num of objs is good, but v. large slabs are
1703 * currently bad for the gfp()s.
1704 */
1705 if (cachep->gfporder >= slab_break_gfp_order)
1706 break;
1707
1708 if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))
1709 break; /* Acceptable internal fragmentation. */
1710next:
1711 cachep->gfporder++;
1712 } while (1);
1713 }
1714 1753
1715 if (!cachep->num) { 1754 if (!cachep->num) {
1716 printk("kmem_cache_create: couldn't create cache %s.\n", name); 1755 printk("kmem_cache_create: couldn't create cache %s.\n", name);
@@ -1718,8 +1757,8 @@ next:
1718 cachep = NULL; 1757 cachep = NULL;
1719 goto oops; 1758 goto oops;
1720 } 1759 }
1721 slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t) 1760 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
1722 + sizeof(struct slab), align); 1761 + sizeof(struct slab), align);
1723 1762
1724 /* 1763 /*
1725 * If the slab has been placed off-slab, and we have enough space then 1764 * If the slab has been placed off-slab, and we have enough space then
@@ -1732,14 +1771,15 @@ next:
1732 1771
1733 if (flags & CFLGS_OFF_SLAB) { 1772 if (flags & CFLGS_OFF_SLAB) {
1734 /* really off slab. No need for manual alignment */ 1773 /* really off slab. No need for manual alignment */
1735 slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab); 1774 slab_size =
1775 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
1736 } 1776 }
1737 1777
1738 cachep->colour_off = cache_line_size(); 1778 cachep->colour_off = cache_line_size();
1739 /* Offset must be a multiple of the alignment. */ 1779 /* Offset must be a multiple of the alignment. */
1740 if (cachep->colour_off < align) 1780 if (cachep->colour_off < align)
1741 cachep->colour_off = align; 1781 cachep->colour_off = align;
1742 cachep->colour = left_over/cachep->colour_off; 1782 cachep->colour = left_over / cachep->colour_off;
1743 cachep->slab_size = slab_size; 1783 cachep->slab_size = slab_size;
1744 cachep->flags = flags; 1784 cachep->flags = flags;
1745 cachep->gfpflags = 0; 1785 cachep->gfpflags = 0;
@@ -1766,7 +1806,7 @@ next:
1766 * the creation of further caches will BUG(). 1806 * the creation of further caches will BUG().
1767 */ 1807 */
1768 cachep->array[smp_processor_id()] = 1808 cachep->array[smp_processor_id()] =
1769 &initarray_generic.cache; 1809 &initarray_generic.cache;
1770 1810
1771 /* If the cache that's used by 1811 /* If the cache that's used by
1772 * kmalloc(sizeof(kmem_list3)) is the first cache, 1812 * kmalloc(sizeof(kmem_list3)) is the first cache,
@@ -1780,8 +1820,7 @@ next:
1780 g_cpucache_up = PARTIAL_AC; 1820 g_cpucache_up = PARTIAL_AC;
1781 } else { 1821 } else {
1782 cachep->array[smp_processor_id()] = 1822 cachep->array[smp_processor_id()] =
1783 kmalloc(sizeof(struct arraycache_init), 1823 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1784 GFP_KERNEL);
1785 1824
1786 if (g_cpucache_up == PARTIAL_AC) { 1825 if (g_cpucache_up == PARTIAL_AC) {
1787 set_up_list3s(cachep, SIZE_L3); 1826 set_up_list3s(cachep, SIZE_L3);
@@ -1791,16 +1830,18 @@ next:
1791 for_each_online_node(node) { 1830 for_each_online_node(node) {
1792 1831
1793 cachep->nodelists[node] = 1832 cachep->nodelists[node] =
1794 kmalloc_node(sizeof(struct kmem_list3), 1833 kmalloc_node(sizeof
1795 GFP_KERNEL, node); 1834 (struct kmem_list3),
1835 GFP_KERNEL, node);
1796 BUG_ON(!cachep->nodelists[node]); 1836 BUG_ON(!cachep->nodelists[node]);
1797 kmem_list3_init(cachep->nodelists[node]); 1837 kmem_list3_init(cachep->
1838 nodelists[node]);
1798 } 1839 }
1799 } 1840 }
1800 } 1841 }
1801 cachep->nodelists[numa_node_id()]->next_reap = 1842 cachep->nodelists[numa_node_id()]->next_reap =
1802 jiffies + REAPTIMEOUT_LIST3 + 1843 jiffies + REAPTIMEOUT_LIST3 +
1803 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 1844 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1804 1845
1805 BUG_ON(!ac_data(cachep)); 1846 BUG_ON(!ac_data(cachep));
1806 ac_data(cachep)->avail = 0; 1847 ac_data(cachep)->avail = 0;
@@ -1809,16 +1850,16 @@ next:
1809 ac_data(cachep)->touched = 0; 1850 ac_data(cachep)->touched = 0;
1810 cachep->batchcount = 1; 1851 cachep->batchcount = 1;
1811 cachep->limit = BOOT_CPUCACHE_ENTRIES; 1852 cachep->limit = BOOT_CPUCACHE_ENTRIES;
1812 } 1853 }
1813 1854
1814 /* cache setup completed, link it into the list */ 1855 /* cache setup completed, link it into the list */
1815 list_add(&cachep->next, &cache_chain); 1856 list_add(&cachep->next, &cache_chain);
1816 unlock_cpu_hotplug(); 1857 unlock_cpu_hotplug();
1817oops: 1858 oops:
1818 if (!cachep && (flags & SLAB_PANIC)) 1859 if (!cachep && (flags & SLAB_PANIC))
1819 panic("kmem_cache_create(): failed to create slab `%s'\n", 1860 panic("kmem_cache_create(): failed to create slab `%s'\n",
1820 name); 1861 name);
1821 up(&cache_chain_sem); 1862 mutex_unlock(&cache_chain_mutex);
1822 return cachep; 1863 return cachep;
1823} 1864}
1824EXPORT_SYMBOL(kmem_cache_create); 1865EXPORT_SYMBOL(kmem_cache_create);
@@ -1860,7 +1901,7 @@ static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
1860/* 1901/*
1861 * Waits for all CPUs to execute func(). 1902 * Waits for all CPUs to execute func().
1862 */ 1903 */
1863static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) 1904static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
1864{ 1905{
1865 check_irq_on(); 1906 check_irq_on();
1866 preempt_disable(); 1907 preempt_disable();
@@ -1875,12 +1916,12 @@ static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
1875 preempt_enable(); 1916 preempt_enable();
1876} 1917}
1877 1918
1878static void drain_array_locked(kmem_cache_t* cachep, 1919static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
1879 struct array_cache *ac, int force, int node); 1920 int force, int node);
1880 1921
1881static void do_drain(void *arg) 1922static void do_drain(void *arg)
1882{ 1923{
1883 kmem_cache_t *cachep = (kmem_cache_t*)arg; 1924 kmem_cache_t *cachep = (kmem_cache_t *) arg;
1884 struct array_cache *ac; 1925 struct array_cache *ac;
1885 int node = numa_node_id(); 1926 int node = numa_node_id();
1886 1927
@@ -1900,7 +1941,7 @@ static void drain_cpu_caches(kmem_cache_t *cachep)
1900 smp_call_function_all_cpus(do_drain, cachep); 1941 smp_call_function_all_cpus(do_drain, cachep);
1901 check_irq_on(); 1942 check_irq_on();
1902 spin_lock_irq(&cachep->spinlock); 1943 spin_lock_irq(&cachep->spinlock);
1903 for_each_online_node(node) { 1944 for_each_online_node(node) {
1904 l3 = cachep->nodelists[node]; 1945 l3 = cachep->nodelists[node];
1905 if (l3) { 1946 if (l3) {
1906 spin_lock(&l3->list_lock); 1947 spin_lock(&l3->list_lock);
@@ -1938,8 +1979,7 @@ static int __node_shrink(kmem_cache_t *cachep, int node)
1938 slab_destroy(cachep, slabp); 1979 slab_destroy(cachep, slabp);
1939 spin_lock_irq(&l3->list_lock); 1980 spin_lock_irq(&l3->list_lock);
1940 } 1981 }
1941 ret = !list_empty(&l3->slabs_full) || 1982 ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial);
1942 !list_empty(&l3->slabs_partial);
1943 return ret; 1983 return ret;
1944} 1984}
1945 1985
@@ -1995,7 +2035,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
1995 * The caller must guarantee that noone will allocate memory from the cache 2035 * The caller must guarantee that noone will allocate memory from the cache
1996 * during the kmem_cache_destroy(). 2036 * during the kmem_cache_destroy().
1997 */ 2037 */
1998int kmem_cache_destroy(kmem_cache_t * cachep) 2038int kmem_cache_destroy(kmem_cache_t *cachep)
1999{ 2039{
2000 int i; 2040 int i;
2001 struct kmem_list3 *l3; 2041 struct kmem_list3 *l3;
@@ -2007,18 +2047,18 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
2007 lock_cpu_hotplug(); 2047 lock_cpu_hotplug();
2008 2048
2009 /* Find the cache in the chain of caches. */ 2049 /* Find the cache in the chain of caches. */
2010 down(&cache_chain_sem); 2050 mutex_lock(&cache_chain_mutex);
2011 /* 2051 /*
2012 * the chain is never empty, cache_cache is never destroyed 2052 * the chain is never empty, cache_cache is never destroyed
2013 */ 2053 */
2014 list_del(&cachep->next); 2054 list_del(&cachep->next);
2015 up(&cache_chain_sem); 2055 mutex_unlock(&cache_chain_mutex);
2016 2056
2017 if (__cache_shrink(cachep)) { 2057 if (__cache_shrink(cachep)) {
2018 slab_error(cachep, "Can't free all objects"); 2058 slab_error(cachep, "Can't free all objects");
2019 down(&cache_chain_sem); 2059 mutex_lock(&cache_chain_mutex);
2020 list_add(&cachep->next,&cache_chain); 2060 list_add(&cachep->next, &cache_chain);
2021 up(&cache_chain_sem); 2061 mutex_unlock(&cache_chain_mutex);
2022 unlock_cpu_hotplug(); 2062 unlock_cpu_hotplug();
2023 return 1; 2063 return 1;
2024 } 2064 }
@@ -2027,7 +2067,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
2027 synchronize_rcu(); 2067 synchronize_rcu();
2028 2068
2029 for_each_online_cpu(i) 2069 for_each_online_cpu(i)
2030 kfree(cachep->array[i]); 2070 kfree(cachep->array[i]);
2031 2071
2032 /* NUMA: free the list3 structures */ 2072 /* NUMA: free the list3 structures */
2033 for_each_online_node(i) { 2073 for_each_online_node(i) {
@@ -2046,39 +2086,39 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
2046EXPORT_SYMBOL(kmem_cache_destroy); 2086EXPORT_SYMBOL(kmem_cache_destroy);
2047 2087
2048/* Get the memory for a slab management obj. */ 2088/* Get the memory for a slab management obj. */
2049static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp, 2089static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
2050 int colour_off, gfp_t local_flags) 2090 int colour_off, gfp_t local_flags)
2051{ 2091{
2052 struct slab *slabp; 2092 struct slab *slabp;
2053 2093
2054 if (OFF_SLAB(cachep)) { 2094 if (OFF_SLAB(cachep)) {
2055 /* Slab management obj is off-slab. */ 2095 /* Slab management obj is off-slab. */
2056 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); 2096 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
2057 if (!slabp) 2097 if (!slabp)
2058 return NULL; 2098 return NULL;
2059 } else { 2099 } else {
2060 slabp = objp+colour_off; 2100 slabp = objp + colour_off;
2061 colour_off += cachep->slab_size; 2101 colour_off += cachep->slab_size;
2062 } 2102 }
2063 slabp->inuse = 0; 2103 slabp->inuse = 0;
2064 slabp->colouroff = colour_off; 2104 slabp->colouroff = colour_off;
2065 slabp->s_mem = objp+colour_off; 2105 slabp->s_mem = objp + colour_off;
2066 2106
2067 return slabp; 2107 return slabp;
2068} 2108}
2069 2109
2070static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) 2110static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2071{ 2111{
2072 return (kmem_bufctl_t *)(slabp+1); 2112 return (kmem_bufctl_t *) (slabp + 1);
2073} 2113}
2074 2114
2075static void cache_init_objs(kmem_cache_t *cachep, 2115static void cache_init_objs(kmem_cache_t *cachep,
2076 struct slab *slabp, unsigned long ctor_flags) 2116 struct slab *slabp, unsigned long ctor_flags)
2077{ 2117{
2078 int i; 2118 int i;
2079 2119
2080 for (i = 0; i < cachep->num; i++) { 2120 for (i = 0; i < cachep->num; i++) {
2081 void *objp = slabp->s_mem+cachep->objsize*i; 2121 void *objp = slabp->s_mem + cachep->objsize * i;
2082#if DEBUG 2122#if DEBUG
2083 /* need to poison the objs? */ 2123 /* need to poison the objs? */
2084 if (cachep->flags & SLAB_POISON) 2124 if (cachep->flags & SLAB_POISON)
@@ -2096,25 +2136,28 @@ static void cache_init_objs(kmem_cache_t *cachep,
2096 * Otherwise, deadlock. They must also be threaded. 2136 * Otherwise, deadlock. They must also be threaded.
2097 */ 2137 */
2098 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2138 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2099 cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags); 2139 cachep->ctor(objp + obj_dbghead(cachep), cachep,
2140 ctor_flags);
2100 2141
2101 if (cachep->flags & SLAB_RED_ZONE) { 2142 if (cachep->flags & SLAB_RED_ZONE) {
2102 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2143 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2103 slab_error(cachep, "constructor overwrote the" 2144 slab_error(cachep, "constructor overwrote the"
2104 " end of an object"); 2145 " end of an object");
2105 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2146 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2106 slab_error(cachep, "constructor overwrote the" 2147 slab_error(cachep, "constructor overwrote the"
2107 " start of an object"); 2148 " start of an object");
2108 } 2149 }
2109 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2150 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)
2110 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); 2151 && cachep->flags & SLAB_POISON)
2152 kernel_map_pages(virt_to_page(objp),
2153 cachep->objsize / PAGE_SIZE, 0);
2111#else 2154#else
2112 if (cachep->ctor) 2155 if (cachep->ctor)
2113 cachep->ctor(objp, cachep, ctor_flags); 2156 cachep->ctor(objp, cachep, ctor_flags);
2114#endif 2157#endif
2115 slab_bufctl(slabp)[i] = i+1; 2158 slab_bufctl(slabp)[i] = i + 1;
2116 } 2159 }
2117 slab_bufctl(slabp)[i-1] = BUFCTL_END; 2160 slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2118 slabp->free = 0; 2161 slabp->free = 0;
2119} 2162}
2120 2163
@@ -2138,8 +2181,8 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
2138 i = 1 << cachep->gfporder; 2181 i = 1 << cachep->gfporder;
2139 page = virt_to_page(objp); 2182 page = virt_to_page(objp);
2140 do { 2183 do {
2141 SET_PAGE_CACHE(page, cachep); 2184 page_set_cache(page, cachep);
2142 SET_PAGE_SLAB(page, slabp); 2185 page_set_slab(page, slabp);
2143 page++; 2186 page++;
2144 } while (--i); 2187 } while (--i);
2145} 2188}
@@ -2150,17 +2193,17 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
2150 */ 2193 */
2151static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2194static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2152{ 2195{
2153 struct slab *slabp; 2196 struct slab *slabp;
2154 void *objp; 2197 void *objp;
2155 size_t offset; 2198 size_t offset;
2156 gfp_t local_flags; 2199 gfp_t local_flags;
2157 unsigned long ctor_flags; 2200 unsigned long ctor_flags;
2158 struct kmem_list3 *l3; 2201 struct kmem_list3 *l3;
2159 2202
2160 /* Be lazy and only check for valid flags here, 2203 /* Be lazy and only check for valid flags here,
2161 * keeping it out of the critical path in kmem_cache_alloc(). 2204 * keeping it out of the critical path in kmem_cache_alloc().
2162 */ 2205 */
2163 if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) 2206 if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW))
2164 BUG(); 2207 BUG();
2165 if (flags & SLAB_NO_GROW) 2208 if (flags & SLAB_NO_GROW)
2166 return 0; 2209 return 0;
@@ -2226,9 +2269,9 @@ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2226 l3->free_objects += cachep->num; 2269 l3->free_objects += cachep->num;
2227 spin_unlock(&l3->list_lock); 2270 spin_unlock(&l3->list_lock);
2228 return 1; 2271 return 1;
2229opps1: 2272 opps1:
2230 kmem_freepages(cachep, objp); 2273 kmem_freepages(cachep, objp);
2231failed: 2274 failed:
2232 if (local_flags & __GFP_WAIT) 2275 if (local_flags & __GFP_WAIT)
2233 local_irq_disable(); 2276 local_irq_disable();
2234 return 0; 2277 return 0;
@@ -2248,18 +2291,19 @@ static void kfree_debugcheck(const void *objp)
2248 2291
2249 if (!virt_addr_valid(objp)) { 2292 if (!virt_addr_valid(objp)) {
2250 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2293 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2251 (unsigned long)objp); 2294 (unsigned long)objp);
2252 BUG(); 2295 BUG();
2253 } 2296 }
2254 page = virt_to_page(objp); 2297 page = virt_to_page(objp);
2255 if (!PageSlab(page)) { 2298 if (!PageSlab(page)) {
2256 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp); 2299 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n",
2300 (unsigned long)objp);
2257 BUG(); 2301 BUG();
2258 } 2302 }
2259} 2303}
2260 2304
2261static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, 2305static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2262 void *caller) 2306 void *caller)
2263{ 2307{
2264 struct page *page; 2308 struct page *page;
2265 unsigned int objnr; 2309 unsigned int objnr;
@@ -2269,21 +2313,27 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2269 kfree_debugcheck(objp); 2313 kfree_debugcheck(objp);
2270 page = virt_to_page(objp); 2314 page = virt_to_page(objp);
2271 2315
2272 if (GET_PAGE_CACHE(page) != cachep) { 2316 if (page_get_cache(page) != cachep) {
2273 printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n", 2317 printk(KERN_ERR
2274 GET_PAGE_CACHE(page),cachep); 2318 "mismatch in kmem_cache_free: expected cache %p, got %p\n",
2319 page_get_cache(page), cachep);
2275 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); 2320 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
2276 printk(KERN_ERR "%p is %s.\n", GET_PAGE_CACHE(page), GET_PAGE_CACHE(page)->name); 2321 printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
2322 page_get_cache(page)->name);
2277 WARN_ON(1); 2323 WARN_ON(1);
2278 } 2324 }
2279 slabp = GET_PAGE_SLAB(page); 2325 slabp = page_get_slab(page);
2280 2326
2281 if (cachep->flags & SLAB_RED_ZONE) { 2327 if (cachep->flags & SLAB_RED_ZONE) {
2282 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { 2328 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE
2283 slab_error(cachep, "double free, or memory outside" 2329 || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
2284 " object was overwritten"); 2330 slab_error(cachep,
2285 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2331 "double free, or memory outside"
2286 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); 2332 " object was overwritten");
2333 printk(KERN_ERR
2334 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2335 objp, *dbg_redzone1(cachep, objp),
2336 *dbg_redzone2(cachep, objp));
2287 } 2337 }
2288 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2338 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2289 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2339 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
@@ -2291,30 +2341,31 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2291 if (cachep->flags & SLAB_STORE_USER) 2341 if (cachep->flags & SLAB_STORE_USER)
2292 *dbg_userword(cachep, objp) = caller; 2342 *dbg_userword(cachep, objp) = caller;
2293 2343
2294 objnr = (objp-slabp->s_mem)/cachep->objsize; 2344 objnr = (objp - slabp->s_mem) / cachep->objsize;
2295 2345
2296 BUG_ON(objnr >= cachep->num); 2346 BUG_ON(objnr >= cachep->num);
2297 BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize); 2347 BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize);
2298 2348
2299 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2349 if (cachep->flags & SLAB_DEBUG_INITIAL) {
2300 /* Need to call the slab's constructor so the 2350 /* Need to call the slab's constructor so the
2301 * caller can perform a verify of its state (debugging). 2351 * caller can perform a verify of its state (debugging).
2302 * Called without the cache-lock held. 2352 * Called without the cache-lock held.
2303 */ 2353 */
2304 cachep->ctor(objp+obj_dbghead(cachep), 2354 cachep->ctor(objp + obj_dbghead(cachep),
2305 cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); 2355 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2306 } 2356 }
2307 if (cachep->flags & SLAB_POISON && cachep->dtor) { 2357 if (cachep->flags & SLAB_POISON && cachep->dtor) {
2308 /* we want to cache poison the object, 2358 /* we want to cache poison the object,
2309 * call the destruction callback 2359 * call the destruction callback
2310 */ 2360 */
2311 cachep->dtor(objp+obj_dbghead(cachep), cachep, 0); 2361 cachep->dtor(objp + obj_dbghead(cachep), cachep, 0);
2312 } 2362 }
2313 if (cachep->flags & SLAB_POISON) { 2363 if (cachep->flags & SLAB_POISON) {
2314#ifdef CONFIG_DEBUG_PAGEALLOC 2364#ifdef CONFIG_DEBUG_PAGEALLOC
2315 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { 2365 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
2316 store_stackinfo(cachep, objp, (unsigned long)caller); 2366 store_stackinfo(cachep, objp, (unsigned long)caller);
2317 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); 2367 kernel_map_pages(virt_to_page(objp),
2368 cachep->objsize / PAGE_SIZE, 0);
2318 } else { 2369 } else {
2319 poison_obj(cachep, objp, POISON_FREE); 2370 poison_obj(cachep, objp, POISON_FREE);
2320 } 2371 }
@@ -2329,7 +2380,7 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
2329{ 2380{
2330 kmem_bufctl_t i; 2381 kmem_bufctl_t i;
2331 int entries = 0; 2382 int entries = 0;
2332 2383
2333 /* Check slab's freelist to see if this obj is there. */ 2384 /* Check slab's freelist to see if this obj is there. */
2334 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 2385 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2335 entries++; 2386 entries++;
@@ -2337,13 +2388,16 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
2337 goto bad; 2388 goto bad;
2338 } 2389 }
2339 if (entries != cachep->num - slabp->inuse) { 2390 if (entries != cachep->num - slabp->inuse) {
2340bad: 2391 bad:
2341 printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", 2392 printk(KERN_ERR
2342 cachep->name, cachep->num, slabp, slabp->inuse); 2393 "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2343 for (i=0;i<sizeof(slabp)+cachep->num*sizeof(kmem_bufctl_t);i++) { 2394 cachep->name, cachep->num, slabp, slabp->inuse);
2344 if ((i%16)==0) 2395 for (i = 0;
2396 i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t);
2397 i++) {
2398 if ((i % 16) == 0)
2345 printk("\n%03x:", i); 2399 printk("\n%03x:", i);
2346 printk(" %02x", ((unsigned char*)slabp)[i]); 2400 printk(" %02x", ((unsigned char *)slabp)[i]);
2347 } 2401 }
2348 printk("\n"); 2402 printk("\n");
2349 BUG(); 2403 BUG();
@@ -2363,7 +2417,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
2363 2417
2364 check_irq_off(); 2418 check_irq_off();
2365 ac = ac_data(cachep); 2419 ac = ac_data(cachep);
2366retry: 2420 retry:
2367 batchcount = ac->batchcount; 2421 batchcount = ac->batchcount;
2368 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2422 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2369 /* if there was little recent activity on this 2423 /* if there was little recent activity on this
@@ -2385,8 +2439,8 @@ retry:
2385 shared_array->avail -= batchcount; 2439 shared_array->avail -= batchcount;
2386 ac->avail = batchcount; 2440 ac->avail = batchcount;
2387 memcpy(ac->entry, 2441 memcpy(ac->entry,
2388 &(shared_array->entry[shared_array->avail]), 2442 &(shared_array->entry[shared_array->avail]),
2389 sizeof(void*)*batchcount); 2443 sizeof(void *) * batchcount);
2390 shared_array->touched = 1; 2444 shared_array->touched = 1;
2391 goto alloc_done; 2445 goto alloc_done;
2392 } 2446 }
@@ -2414,7 +2468,7 @@ retry:
2414 2468
2415 /* get obj pointer */ 2469 /* get obj pointer */
2416 ac->entry[ac->avail++] = slabp->s_mem + 2470 ac->entry[ac->avail++] = slabp->s_mem +
2417 slabp->free*cachep->objsize; 2471 slabp->free * cachep->objsize;
2418 2472
2419 slabp->inuse++; 2473 slabp->inuse++;
2420 next = slab_bufctl(slabp)[slabp->free]; 2474 next = slab_bufctl(slabp)[slabp->free];
@@ -2422,7 +2476,7 @@ retry:
2422 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2476 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2423 WARN_ON(numa_node_id() != slabp->nodeid); 2477 WARN_ON(numa_node_id() != slabp->nodeid);
2424#endif 2478#endif
2425 slabp->free = next; 2479 slabp->free = next;
2426 } 2480 }
2427 check_slabp(cachep, slabp); 2481 check_slabp(cachep, slabp);
2428 2482
@@ -2434,9 +2488,9 @@ retry:
2434 list_add(&slabp->list, &l3->slabs_partial); 2488 list_add(&slabp->list, &l3->slabs_partial);
2435 } 2489 }
2436 2490
2437must_grow: 2491 must_grow:
2438 l3->free_objects -= ac->avail; 2492 l3->free_objects -= ac->avail;
2439alloc_done: 2493 alloc_done:
2440 spin_unlock(&l3->list_lock); 2494 spin_unlock(&l3->list_lock);
2441 2495
2442 if (unlikely(!ac->avail)) { 2496 if (unlikely(!ac->avail)) {
@@ -2448,7 +2502,7 @@ alloc_done:
2448 if (!x && ac->avail == 0) // no objects in sight? abort 2502 if (!x && ac->avail == 0) // no objects in sight? abort
2449 return NULL; 2503 return NULL;
2450 2504
2451 if (!ac->avail) // objects refilled by interrupt? 2505 if (!ac->avail) // objects refilled by interrupt?
2452 goto retry; 2506 goto retry;
2453 } 2507 }
2454 ac->touched = 1; 2508 ac->touched = 1;
@@ -2465,16 +2519,16 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags)
2465} 2519}
2466 2520
2467#if DEBUG 2521#if DEBUG
2468static void * 2522static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags,
2469cache_alloc_debugcheck_after(kmem_cache_t *cachep, 2523 void *objp, void *caller)
2470 gfp_t flags, void *objp, void *caller)
2471{ 2524{
2472 if (!objp) 2525 if (!objp)
2473 return objp; 2526 return objp;
2474 if (cachep->flags & SLAB_POISON) { 2527 if (cachep->flags & SLAB_POISON) {
2475#ifdef CONFIG_DEBUG_PAGEALLOC 2528#ifdef CONFIG_DEBUG_PAGEALLOC
2476 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 2529 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2477 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1); 2530 kernel_map_pages(virt_to_page(objp),
2531 cachep->objsize / PAGE_SIZE, 1);
2478 else 2532 else
2479 check_poison_obj(cachep, objp); 2533 check_poison_obj(cachep, objp);
2480#else 2534#else
@@ -2486,24 +2540,28 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
2486 *dbg_userword(cachep, objp) = caller; 2540 *dbg_userword(cachep, objp) = caller;
2487 2541
2488 if (cachep->flags & SLAB_RED_ZONE) { 2542 if (cachep->flags & SLAB_RED_ZONE) {
2489 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2543 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE
2490 slab_error(cachep, "double free, or memory outside" 2544 || *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
2491 " object was overwritten"); 2545 slab_error(cachep,
2492 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2546 "double free, or memory outside"
2493 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); 2547 " object was overwritten");
2548 printk(KERN_ERR
2549 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2550 objp, *dbg_redzone1(cachep, objp),
2551 *dbg_redzone2(cachep, objp));
2494 } 2552 }
2495 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2553 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2496 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2554 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2497 } 2555 }
2498 objp += obj_dbghead(cachep); 2556 objp += obj_dbghead(cachep);
2499 if (cachep->ctor && cachep->flags & SLAB_POISON) { 2557 if (cachep->ctor && cachep->flags & SLAB_POISON) {
2500 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2558 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2501 2559
2502 if (!(flags & __GFP_WAIT)) 2560 if (!(flags & __GFP_WAIT))
2503 ctor_flags |= SLAB_CTOR_ATOMIC; 2561 ctor_flags |= SLAB_CTOR_ATOMIC;
2504 2562
2505 cachep->ctor(objp, cachep, ctor_flags); 2563 cachep->ctor(objp, cachep, ctor_flags);
2506 } 2564 }
2507 return objp; 2565 return objp;
2508} 2566}
2509#else 2567#else
@@ -2512,9 +2570,18 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
2512 2570
2513static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2571static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2514{ 2572{
2515 void* objp; 2573 void *objp;
2516 struct array_cache *ac; 2574 struct array_cache *ac;
2517 2575
2576#ifdef CONFIG_NUMA
2577 if (unlikely(current->mempolicy && !in_interrupt())) {
2578 int nid = slab_node(current->mempolicy);
2579
2580 if (nid != numa_node_id())
2581 return __cache_alloc_node(cachep, flags, nid);
2582 }
2583#endif
2584
2518 check_irq_off(); 2585 check_irq_off();
2519 ac = ac_data(cachep); 2586 ac = ac_data(cachep);
2520 if (likely(ac->avail)) { 2587 if (likely(ac->avail)) {
@@ -2531,7 +2598,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2531static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2598static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2532{ 2599{
2533 unsigned long save_flags; 2600 unsigned long save_flags;
2534 void* objp; 2601 void *objp;
2535 2602
2536 cache_alloc_debugcheck_before(cachep, flags); 2603 cache_alloc_debugcheck_before(cachep, flags);
2537 2604
@@ -2539,7 +2606,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2539 objp = ____cache_alloc(cachep, flags); 2606 objp = ____cache_alloc(cachep, flags);
2540 local_irq_restore(save_flags); 2607 local_irq_restore(save_flags);
2541 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 2608 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
2542 __builtin_return_address(0)); 2609 __builtin_return_address(0));
2543 prefetchw(objp); 2610 prefetchw(objp);
2544 return objp; 2611 return objp;
2545} 2612}
@@ -2551,74 +2618,75 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2551static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2618static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2552{ 2619{
2553 struct list_head *entry; 2620 struct list_head *entry;
2554 struct slab *slabp; 2621 struct slab *slabp;
2555 struct kmem_list3 *l3; 2622 struct kmem_list3 *l3;
2556 void *obj; 2623 void *obj;
2557 kmem_bufctl_t next; 2624 kmem_bufctl_t next;
2558 int x; 2625 int x;
2559 2626
2560 l3 = cachep->nodelists[nodeid]; 2627 l3 = cachep->nodelists[nodeid];
2561 BUG_ON(!l3); 2628 BUG_ON(!l3);
2562 2629
2563retry: 2630 retry:
2564 spin_lock(&l3->list_lock); 2631 spin_lock(&l3->list_lock);
2565 entry = l3->slabs_partial.next; 2632 entry = l3->slabs_partial.next;
2566 if (entry == &l3->slabs_partial) { 2633 if (entry == &l3->slabs_partial) {
2567 l3->free_touched = 1; 2634 l3->free_touched = 1;
2568 entry = l3->slabs_free.next; 2635 entry = l3->slabs_free.next;
2569 if (entry == &l3->slabs_free) 2636 if (entry == &l3->slabs_free)
2570 goto must_grow; 2637 goto must_grow;
2571 } 2638 }
2572 2639
2573 slabp = list_entry(entry, struct slab, list); 2640 slabp = list_entry(entry, struct slab, list);
2574 check_spinlock_acquired_node(cachep, nodeid); 2641 check_spinlock_acquired_node(cachep, nodeid);
2575 check_slabp(cachep, slabp); 2642 check_slabp(cachep, slabp);
2576 2643
2577 STATS_INC_NODEALLOCS(cachep); 2644 STATS_INC_NODEALLOCS(cachep);
2578 STATS_INC_ACTIVE(cachep); 2645 STATS_INC_ACTIVE(cachep);
2579 STATS_SET_HIGH(cachep); 2646 STATS_SET_HIGH(cachep);
2580 2647
2581 BUG_ON(slabp->inuse == cachep->num); 2648 BUG_ON(slabp->inuse == cachep->num);
2582 2649
2583 /* get obj pointer */ 2650 /* get obj pointer */
2584 obj = slabp->s_mem + slabp->free*cachep->objsize; 2651 obj = slabp->s_mem + slabp->free * cachep->objsize;
2585 slabp->inuse++; 2652 slabp->inuse++;
2586 next = slab_bufctl(slabp)[slabp->free]; 2653 next = slab_bufctl(slabp)[slabp->free];
2587#if DEBUG 2654#if DEBUG
2588 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2655 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2589#endif 2656#endif
2590 slabp->free = next; 2657 slabp->free = next;
2591 check_slabp(cachep, slabp); 2658 check_slabp(cachep, slabp);
2592 l3->free_objects--; 2659 l3->free_objects--;
2593 /* move slabp to correct slabp list: */ 2660 /* move slabp to correct slabp list: */
2594 list_del(&slabp->list); 2661 list_del(&slabp->list);
2595 2662
2596 if (slabp->free == BUFCTL_END) { 2663 if (slabp->free == BUFCTL_END) {
2597 list_add(&slabp->list, &l3->slabs_full); 2664 list_add(&slabp->list, &l3->slabs_full);
2598 } else { 2665 } else {
2599 list_add(&slabp->list, &l3->slabs_partial); 2666 list_add(&slabp->list, &l3->slabs_partial);
2600 } 2667 }
2601 2668
2602 spin_unlock(&l3->list_lock); 2669 spin_unlock(&l3->list_lock);
2603 goto done; 2670 goto done;
2604 2671
2605must_grow: 2672 must_grow:
2606 spin_unlock(&l3->list_lock); 2673 spin_unlock(&l3->list_lock);
2607 x = cache_grow(cachep, flags, nodeid); 2674 x = cache_grow(cachep, flags, nodeid);
2608 2675
2609 if (!x) 2676 if (!x)
2610 return NULL; 2677 return NULL;
2611 2678
2612 goto retry; 2679 goto retry;
2613done: 2680 done:
2614 return obj; 2681 return obj;
2615} 2682}
2616#endif 2683#endif
2617 2684
2618/* 2685/*
2619 * Caller needs to acquire correct kmem_list's list_lock 2686 * Caller needs to acquire correct kmem_list's list_lock
2620 */ 2687 */
2621static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int node) 2688static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects,
2689 int node)
2622{ 2690{
2623 int i; 2691 int i;
2624 struct kmem_list3 *l3; 2692 struct kmem_list3 *l3;
@@ -2628,7 +2696,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n
2628 struct slab *slabp; 2696 struct slab *slabp;
2629 unsigned int objnr; 2697 unsigned int objnr;
2630 2698
2631 slabp = GET_PAGE_SLAB(virt_to_page(objp)); 2699 slabp = page_get_slab(virt_to_page(objp));
2632 l3 = cachep->nodelists[node]; 2700 l3 = cachep->nodelists[node];
2633 list_del(&slabp->list); 2701 list_del(&slabp->list);
2634 objnr = (objp - slabp->s_mem) / cachep->objsize; 2702 objnr = (objp - slabp->s_mem) / cachep->objsize;
@@ -2641,7 +2709,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n
2641 2709
2642 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2710 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2643 printk(KERN_ERR "slab: double free detected in cache " 2711 printk(KERN_ERR "slab: double free detected in cache "
2644 "'%s', objp %p\n", cachep->name, objp); 2712 "'%s', objp %p\n", cachep->name, objp);
2645 BUG(); 2713 BUG();
2646 } 2714 }
2647#endif 2715#endif
@@ -2685,20 +2753,19 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
2685 spin_lock(&l3->list_lock); 2753 spin_lock(&l3->list_lock);
2686 if (l3->shared) { 2754 if (l3->shared) {
2687 struct array_cache *shared_array = l3->shared; 2755 struct array_cache *shared_array = l3->shared;
2688 int max = shared_array->limit-shared_array->avail; 2756 int max = shared_array->limit - shared_array->avail;
2689 if (max) { 2757 if (max) {
2690 if (batchcount > max) 2758 if (batchcount > max)
2691 batchcount = max; 2759 batchcount = max;
2692 memcpy(&(shared_array->entry[shared_array->avail]), 2760 memcpy(&(shared_array->entry[shared_array->avail]),
2693 ac->entry, 2761 ac->entry, sizeof(void *) * batchcount);
2694 sizeof(void*)*batchcount);
2695 shared_array->avail += batchcount; 2762 shared_array->avail += batchcount;
2696 goto free_done; 2763 goto free_done;
2697 } 2764 }
2698 } 2765 }
2699 2766
2700 free_block(cachep, ac->entry, batchcount, node); 2767 free_block(cachep, ac->entry, batchcount, node);
2701free_done: 2768 free_done:
2702#if STATS 2769#if STATS
2703 { 2770 {
2704 int i = 0; 2771 int i = 0;
@@ -2720,10 +2787,9 @@ free_done:
2720 spin_unlock(&l3->list_lock); 2787 spin_unlock(&l3->list_lock);
2721 ac->avail -= batchcount; 2788 ac->avail -= batchcount;
2722 memmove(ac->entry, &(ac->entry[batchcount]), 2789 memmove(ac->entry, &(ac->entry[batchcount]),
2723 sizeof(void*)*ac->avail); 2790 sizeof(void *) * ac->avail);
2724} 2791}
2725 2792
2726
2727/* 2793/*
2728 * __cache_free 2794 * __cache_free
2729 * Release an obj back to its cache. If the obj has a constructed 2795 * Release an obj back to its cache. If the obj has a constructed
@@ -2744,11 +2810,12 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2744#ifdef CONFIG_NUMA 2810#ifdef CONFIG_NUMA
2745 { 2811 {
2746 struct slab *slabp; 2812 struct slab *slabp;
2747 slabp = GET_PAGE_SLAB(virt_to_page(objp)); 2813 slabp = page_get_slab(virt_to_page(objp));
2748 if (unlikely(slabp->nodeid != numa_node_id())) { 2814 if (unlikely(slabp->nodeid != numa_node_id())) {
2749 struct array_cache *alien = NULL; 2815 struct array_cache *alien = NULL;
2750 int nodeid = slabp->nodeid; 2816 int nodeid = slabp->nodeid;
2751 struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()]; 2817 struct kmem_list3 *l3 =
2818 cachep->nodelists[numa_node_id()];
2752 2819
2753 STATS_INC_NODEFREES(cachep); 2820 STATS_INC_NODEFREES(cachep);
2754 if (l3->alien && l3->alien[nodeid]) { 2821 if (l3->alien && l3->alien[nodeid]) {
@@ -2756,15 +2823,15 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2756 spin_lock(&alien->lock); 2823 spin_lock(&alien->lock);
2757 if (unlikely(alien->avail == alien->limit)) 2824 if (unlikely(alien->avail == alien->limit))
2758 __drain_alien_cache(cachep, 2825 __drain_alien_cache(cachep,
2759 alien, nodeid); 2826 alien, nodeid);
2760 alien->entry[alien->avail++] = objp; 2827 alien->entry[alien->avail++] = objp;
2761 spin_unlock(&alien->lock); 2828 spin_unlock(&alien->lock);
2762 } else { 2829 } else {
2763 spin_lock(&(cachep->nodelists[nodeid])-> 2830 spin_lock(&(cachep->nodelists[nodeid])->
2764 list_lock); 2831 list_lock);
2765 free_block(cachep, &objp, 1, nodeid); 2832 free_block(cachep, &objp, 1, nodeid);
2766 spin_unlock(&(cachep->nodelists[nodeid])-> 2833 spin_unlock(&(cachep->nodelists[nodeid])->
2767 list_lock); 2834 list_lock);
2768 } 2835 }
2769 return; 2836 return;
2770 } 2837 }
@@ -2811,9 +2878,9 @@ EXPORT_SYMBOL(kmem_cache_alloc);
2811 */ 2878 */
2812int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) 2879int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2813{ 2880{
2814 unsigned long addr = (unsigned long) ptr; 2881 unsigned long addr = (unsigned long)ptr;
2815 unsigned long min_addr = PAGE_OFFSET; 2882 unsigned long min_addr = PAGE_OFFSET;
2816 unsigned long align_mask = BYTES_PER_WORD-1; 2883 unsigned long align_mask = BYTES_PER_WORD - 1;
2817 unsigned long size = cachep->objsize; 2884 unsigned long size = cachep->objsize;
2818 struct page *page; 2885 struct page *page;
2819 2886
@@ -2830,10 +2897,10 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2830 page = virt_to_page(ptr); 2897 page = virt_to_page(ptr);
2831 if (unlikely(!PageSlab(page))) 2898 if (unlikely(!PageSlab(page)))
2832 goto out; 2899 goto out;
2833 if (unlikely(GET_PAGE_CACHE(page) != cachep)) 2900 if (unlikely(page_get_cache(page) != cachep))
2834 goto out; 2901 goto out;
2835 return 1; 2902 return 1;
2836out: 2903 out:
2837 return 0; 2904 return 0;
2838} 2905}
2839 2906
@@ -2860,8 +2927,10 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2860 2927
2861 if (unlikely(!cachep->nodelists[nodeid])) { 2928 if (unlikely(!cachep->nodelists[nodeid])) {
2862 /* Fall back to __cache_alloc if we run into trouble */ 2929 /* Fall back to __cache_alloc if we run into trouble */
2863 printk(KERN_WARNING "slab: not allocating in inactive node %d for cache %s\n", nodeid, cachep->name); 2930 printk(KERN_WARNING
2864 return __cache_alloc(cachep,flags); 2931 "slab: not allocating in inactive node %d for cache %s\n",
2932 nodeid, cachep->name);
2933 return __cache_alloc(cachep, flags);
2865 } 2934 }
2866 2935
2867 cache_alloc_debugcheck_before(cachep, flags); 2936 cache_alloc_debugcheck_before(cachep, flags);
@@ -2871,7 +2940,9 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2871 else 2940 else
2872 ptr = __cache_alloc_node(cachep, flags, nodeid); 2941 ptr = __cache_alloc_node(cachep, flags, nodeid);
2873 local_irq_restore(save_flags); 2942 local_irq_restore(save_flags);
2874 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0)); 2943 ptr =
2944 cache_alloc_debugcheck_after(cachep, flags, ptr,
2945 __builtin_return_address(0));
2875 2946
2876 return ptr; 2947 return ptr;
2877} 2948}
@@ -2933,12 +3004,11 @@ EXPORT_SYMBOL(__kmalloc);
2933 * Objects should be dereferenced using the per_cpu_ptr macro only. 3004 * Objects should be dereferenced using the per_cpu_ptr macro only.
2934 * 3005 *
2935 * @size: how many bytes of memory are required. 3006 * @size: how many bytes of memory are required.
2936 * @align: the alignment, which can't be greater than SMP_CACHE_BYTES.
2937 */ 3007 */
2938void *__alloc_percpu(size_t size, size_t align) 3008void *__alloc_percpu(size_t size)
2939{ 3009{
2940 int i; 3010 int i;
2941 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); 3011 struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
2942 3012
2943 if (!pdata) 3013 if (!pdata)
2944 return NULL; 3014 return NULL;
@@ -2962,9 +3032,9 @@ void *__alloc_percpu(size_t size, size_t align)
2962 } 3032 }
2963 3033
2964 /* Catch derefs w/o wrappers */ 3034 /* Catch derefs w/o wrappers */
2965 return (void *) (~(unsigned long) pdata); 3035 return (void *)(~(unsigned long)pdata);
2966 3036
2967unwind_oom: 3037 unwind_oom:
2968 while (--i >= 0) { 3038 while (--i >= 0) {
2969 if (!cpu_possible(i)) 3039 if (!cpu_possible(i))
2970 continue; 3040 continue;
@@ -2995,20 +3065,6 @@ void kmem_cache_free(kmem_cache_t *cachep, void *objp)
2995EXPORT_SYMBOL(kmem_cache_free); 3065EXPORT_SYMBOL(kmem_cache_free);
2996 3066
2997/** 3067/**
2998 * kzalloc - allocate memory. The memory is set to zero.
2999 * @size: how many bytes of memory are required.
3000 * @flags: the type of memory to allocate.
3001 */
3002void *kzalloc(size_t size, gfp_t flags)
3003{
3004 void *ret = kmalloc(size, flags);
3005 if (ret)
3006 memset(ret, 0, size);
3007 return ret;
3008}
3009EXPORT_SYMBOL(kzalloc);
3010
3011/**
3012 * kfree - free previously allocated memory 3068 * kfree - free previously allocated memory
3013 * @objp: pointer returned by kmalloc. 3069 * @objp: pointer returned by kmalloc.
3014 * 3070 *
@@ -3026,8 +3082,9 @@ void kfree(const void *objp)
3026 return; 3082 return;
3027 local_irq_save(flags); 3083 local_irq_save(flags);
3028 kfree_debugcheck(objp); 3084 kfree_debugcheck(objp);
3029 c = GET_PAGE_CACHE(virt_to_page(objp)); 3085 c = page_get_cache(virt_to_page(objp));
3030 __cache_free(c, (void*)objp); 3086 mutex_debug_check_no_locks_freed(objp, obj_reallen(c));
3087 __cache_free(c, (void *)objp);
3031 local_irq_restore(flags); 3088 local_irq_restore(flags);
3032} 3089}
3033EXPORT_SYMBOL(kfree); 3090EXPORT_SYMBOL(kfree);
@@ -3040,17 +3097,16 @@ EXPORT_SYMBOL(kfree);
3040 * Don't free memory not originally allocated by alloc_percpu() 3097 * Don't free memory not originally allocated by alloc_percpu()
3041 * The complemented objp is to check for that. 3098 * The complemented objp is to check for that.
3042 */ 3099 */
3043void 3100void free_percpu(const void *objp)
3044free_percpu(const void *objp)
3045{ 3101{
3046 int i; 3102 int i;
3047 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); 3103 struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
3048 3104
3049 /* 3105 /*
3050 * We allocate for all cpus so we cannot use for online cpu here. 3106 * We allocate for all cpus so we cannot use for online cpu here.
3051 */ 3107 */
3052 for_each_cpu(i) 3108 for_each_cpu(i)
3053 kfree(p->ptrs[i]); 3109 kfree(p->ptrs[i]);
3054 kfree(p); 3110 kfree(p);
3055} 3111}
3056EXPORT_SYMBOL(free_percpu); 3112EXPORT_SYMBOL(free_percpu);
@@ -3084,44 +3140,44 @@ static int alloc_kmemlist(kmem_cache_t *cachep)
3084 if (!(new_alien = alloc_alien_cache(node, cachep->limit))) 3140 if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
3085 goto fail; 3141 goto fail;
3086#endif 3142#endif
3087 if (!(new = alloc_arraycache(node, (cachep->shared* 3143 if (!(new = alloc_arraycache(node, (cachep->shared *
3088 cachep->batchcount), 0xbaadf00d))) 3144 cachep->batchcount),
3145 0xbaadf00d)))
3089 goto fail; 3146 goto fail;
3090 if ((l3 = cachep->nodelists[node])) { 3147 if ((l3 = cachep->nodelists[node])) {
3091 3148
3092 spin_lock_irq(&l3->list_lock); 3149 spin_lock_irq(&l3->list_lock);
3093 3150
3094 if ((nc = cachep->nodelists[node]->shared)) 3151 if ((nc = cachep->nodelists[node]->shared))
3095 free_block(cachep, nc->entry, 3152 free_block(cachep, nc->entry, nc->avail, node);
3096 nc->avail, node);
3097 3153
3098 l3->shared = new; 3154 l3->shared = new;
3099 if (!cachep->nodelists[node]->alien) { 3155 if (!cachep->nodelists[node]->alien) {
3100 l3->alien = new_alien; 3156 l3->alien = new_alien;
3101 new_alien = NULL; 3157 new_alien = NULL;
3102 } 3158 }
3103 l3->free_limit = (1 + nr_cpus_node(node))* 3159 l3->free_limit = (1 + nr_cpus_node(node)) *
3104 cachep->batchcount + cachep->num; 3160 cachep->batchcount + cachep->num;
3105 spin_unlock_irq(&l3->list_lock); 3161 spin_unlock_irq(&l3->list_lock);
3106 kfree(nc); 3162 kfree(nc);
3107 free_alien_cache(new_alien); 3163 free_alien_cache(new_alien);
3108 continue; 3164 continue;
3109 } 3165 }
3110 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), 3166 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
3111 GFP_KERNEL, node))) 3167 GFP_KERNEL, node)))
3112 goto fail; 3168 goto fail;
3113 3169
3114 kmem_list3_init(l3); 3170 kmem_list3_init(l3);
3115 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3171 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3116 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 3172 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3117 l3->shared = new; 3173 l3->shared = new;
3118 l3->alien = new_alien; 3174 l3->alien = new_alien;
3119 l3->free_limit = (1 + nr_cpus_node(node))* 3175 l3->free_limit = (1 + nr_cpus_node(node)) *
3120 cachep->batchcount + cachep->num; 3176 cachep->batchcount + cachep->num;
3121 cachep->nodelists[node] = l3; 3177 cachep->nodelists[node] = l3;
3122 } 3178 }
3123 return err; 3179 return err;
3124fail: 3180 fail:
3125 err = -ENOMEM; 3181 err = -ENOMEM;
3126 return err; 3182 return err;
3127} 3183}
@@ -3143,18 +3199,19 @@ static void do_ccupdate_local(void *info)
3143 new->new[smp_processor_id()] = old; 3199 new->new[smp_processor_id()] = old;
3144} 3200}
3145 3201
3146
3147static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, 3202static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
3148 int shared) 3203 int shared)
3149{ 3204{
3150 struct ccupdate_struct new; 3205 struct ccupdate_struct new;
3151 int i, err; 3206 int i, err;
3152 3207
3153 memset(&new.new,0,sizeof(new.new)); 3208 memset(&new.new, 0, sizeof(new.new));
3154 for_each_online_cpu(i) { 3209 for_each_online_cpu(i) {
3155 new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount); 3210 new.new[i] =
3211 alloc_arraycache(cpu_to_node(i), limit, batchcount);
3156 if (!new.new[i]) { 3212 if (!new.new[i]) {
3157 for (i--; i >= 0; i--) kfree(new.new[i]); 3213 for (i--; i >= 0; i--)
3214 kfree(new.new[i]);
3158 return -ENOMEM; 3215 return -ENOMEM;
3159 } 3216 }
3160 } 3217 }
@@ -3182,13 +3239,12 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
3182 err = alloc_kmemlist(cachep); 3239 err = alloc_kmemlist(cachep);
3183 if (err) { 3240 if (err) {
3184 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", 3241 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
3185 cachep->name, -err); 3242 cachep->name, -err);
3186 BUG(); 3243 BUG();
3187 } 3244 }
3188 return 0; 3245 return 0;
3189} 3246}
3190 3247
3191
3192static void enable_cpucache(kmem_cache_t *cachep) 3248static void enable_cpucache(kmem_cache_t *cachep)
3193{ 3249{
3194 int err; 3250 int err;
@@ -3235,14 +3291,14 @@ static void enable_cpucache(kmem_cache_t *cachep)
3235 if (limit > 32) 3291 if (limit > 32)
3236 limit = 32; 3292 limit = 32;
3237#endif 3293#endif
3238 err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared); 3294 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
3239 if (err) 3295 if (err)
3240 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3296 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3241 cachep->name, -err); 3297 cachep->name, -err);
3242} 3298}
3243 3299
3244static void drain_array_locked(kmem_cache_t *cachep, 3300static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
3245 struct array_cache *ac, int force, int node) 3301 int force, int node)
3246{ 3302{
3247 int tofree; 3303 int tofree;
3248 3304
@@ -3250,14 +3306,14 @@ static void drain_array_locked(kmem_cache_t *cachep,
3250 if (ac->touched && !force) { 3306 if (ac->touched && !force) {
3251 ac->touched = 0; 3307 ac->touched = 0;
3252 } else if (ac->avail) { 3308 } else if (ac->avail) {
3253 tofree = force ? ac->avail : (ac->limit+4)/5; 3309 tofree = force ? ac->avail : (ac->limit + 4) / 5;
3254 if (tofree > ac->avail) { 3310 if (tofree > ac->avail) {
3255 tofree = (ac->avail+1)/2; 3311 tofree = (ac->avail + 1) / 2;
3256 } 3312 }
3257 free_block(cachep, ac->entry, tofree, node); 3313 free_block(cachep, ac->entry, tofree, node);
3258 ac->avail -= tofree; 3314 ac->avail -= tofree;
3259 memmove(ac->entry, &(ac->entry[tofree]), 3315 memmove(ac->entry, &(ac->entry[tofree]),
3260 sizeof(void*)*ac->avail); 3316 sizeof(void *) * ac->avail);
3261 } 3317 }
3262} 3318}
3263 3319
@@ -3270,7 +3326,7 @@ static void drain_array_locked(kmem_cache_t *cachep,
3270 * - clear the per-cpu caches for this CPU. 3326 * - clear the per-cpu caches for this CPU.
3271 * - return freeable pages to the main free memory pool. 3327 * - return freeable pages to the main free memory pool.
3272 * 3328 *
3273 * If we cannot acquire the cache chain semaphore then just give up - we'll 3329 * If we cannot acquire the cache chain mutex then just give up - we'll
3274 * try again on the next iteration. 3330 * try again on the next iteration.
3275 */ 3331 */
3276static void cache_reap(void *unused) 3332static void cache_reap(void *unused)
@@ -3278,15 +3334,16 @@ static void cache_reap(void *unused)
3278 struct list_head *walk; 3334 struct list_head *walk;
3279 struct kmem_list3 *l3; 3335 struct kmem_list3 *l3;
3280 3336
3281 if (down_trylock(&cache_chain_sem)) { 3337 if (!mutex_trylock(&cache_chain_mutex)) {
3282 /* Give up. Setup the next iteration. */ 3338 /* Give up. Setup the next iteration. */
3283 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3339 schedule_delayed_work(&__get_cpu_var(reap_work),
3340 REAPTIMEOUT_CPUC);
3284 return; 3341 return;
3285 } 3342 }
3286 3343
3287 list_for_each(walk, &cache_chain) { 3344 list_for_each(walk, &cache_chain) {
3288 kmem_cache_t *searchp; 3345 kmem_cache_t *searchp;
3289 struct list_head* p; 3346 struct list_head *p;
3290 int tofree; 3347 int tofree;
3291 struct slab *slabp; 3348 struct slab *slabp;
3292 3349
@@ -3303,7 +3360,7 @@ static void cache_reap(void *unused)
3303 spin_lock_irq(&l3->list_lock); 3360 spin_lock_irq(&l3->list_lock);
3304 3361
3305 drain_array_locked(searchp, ac_data(searchp), 0, 3362 drain_array_locked(searchp, ac_data(searchp), 0,
3306 numa_node_id()); 3363 numa_node_id());
3307 3364
3308 if (time_after(l3->next_reap, jiffies)) 3365 if (time_after(l3->next_reap, jiffies))
3309 goto next_unlock; 3366 goto next_unlock;
@@ -3312,14 +3369,16 @@ static void cache_reap(void *unused)
3312 3369
3313 if (l3->shared) 3370 if (l3->shared)
3314 drain_array_locked(searchp, l3->shared, 0, 3371 drain_array_locked(searchp, l3->shared, 0,
3315 numa_node_id()); 3372 numa_node_id());
3316 3373
3317 if (l3->free_touched) { 3374 if (l3->free_touched) {
3318 l3->free_touched = 0; 3375 l3->free_touched = 0;
3319 goto next_unlock; 3376 goto next_unlock;
3320 } 3377 }
3321 3378
3322 tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num); 3379 tofree =
3380 (l3->free_limit + 5 * searchp->num -
3381 1) / (5 * searchp->num);
3323 do { 3382 do {
3324 p = l3->slabs_free.next; 3383 p = l3->slabs_free.next;
3325 if (p == &(l3->slabs_free)) 3384 if (p == &(l3->slabs_free))
@@ -3339,14 +3398,14 @@ static void cache_reap(void *unused)
3339 spin_unlock_irq(&l3->list_lock); 3398 spin_unlock_irq(&l3->list_lock);
3340 slab_destroy(searchp, slabp); 3399 slab_destroy(searchp, slabp);
3341 spin_lock_irq(&l3->list_lock); 3400 spin_lock_irq(&l3->list_lock);
3342 } while(--tofree > 0); 3401 } while (--tofree > 0);
3343next_unlock: 3402 next_unlock:
3344 spin_unlock_irq(&l3->list_lock); 3403 spin_unlock_irq(&l3->list_lock);
3345next: 3404 next:
3346 cond_resched(); 3405 cond_resched();
3347 } 3406 }
3348 check_irq_on(); 3407 check_irq_on();
3349 up(&cache_chain_sem); 3408 mutex_unlock(&cache_chain_mutex);
3350 drain_remote_pages(); 3409 drain_remote_pages();
3351 /* Setup the next iteration */ 3410 /* Setup the next iteration */
3352 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3411 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
@@ -3354,32 +3413,37 @@ next:
3354 3413
3355#ifdef CONFIG_PROC_FS 3414#ifdef CONFIG_PROC_FS
3356 3415
3357static void *s_start(struct seq_file *m, loff_t *pos) 3416static void print_slabinfo_header(struct seq_file *m)
3358{ 3417{
3359 loff_t n = *pos; 3418 /*
3360 struct list_head *p; 3419 * Output format version, so at least we can change it
3361 3420 * without _too_ many complaints.
3362 down(&cache_chain_sem); 3421 */
3363 if (!n) {
3364 /*
3365 * Output format version, so at least we can change it
3366 * without _too_ many complaints.
3367 */
3368#if STATS 3422#if STATS
3369 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); 3423 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
3370#else 3424#else
3371 seq_puts(m, "slabinfo - version: 2.1\n"); 3425 seq_puts(m, "slabinfo - version: 2.1\n");
3372#endif 3426#endif
3373 seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); 3427 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
3374 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 3428 "<objperslab> <pagesperslab>");
3375 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 3429 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
3430 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
3376#if STATS 3431#if STATS
3377 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>" 3432 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
3378 " <error> <maxfreeable> <nodeallocs> <remotefrees>"); 3433 "<error> <maxfreeable> <nodeallocs> <remotefrees>");
3379 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 3434 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
3380#endif 3435#endif
3381 seq_putc(m, '\n'); 3436 seq_putc(m, '\n');
3382 } 3437}
3438
3439static void *s_start(struct seq_file *m, loff_t *pos)
3440{
3441 loff_t n = *pos;
3442 struct list_head *p;
3443
3444 mutex_lock(&cache_chain_mutex);
3445 if (!n)
3446 print_slabinfo_header(m);
3383 p = cache_chain.next; 3447 p = cache_chain.next;
3384 while (n--) { 3448 while (n--) {
3385 p = p->next; 3449 p = p->next;
@@ -3394,23 +3458,23 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3394 kmem_cache_t *cachep = p; 3458 kmem_cache_t *cachep = p;
3395 ++*pos; 3459 ++*pos;
3396 return cachep->next.next == &cache_chain ? NULL 3460 return cachep->next.next == &cache_chain ? NULL
3397 : list_entry(cachep->next.next, kmem_cache_t, next); 3461 : list_entry(cachep->next.next, kmem_cache_t, next);
3398} 3462}
3399 3463
3400static void s_stop(struct seq_file *m, void *p) 3464static void s_stop(struct seq_file *m, void *p)
3401{ 3465{
3402 up(&cache_chain_sem); 3466 mutex_unlock(&cache_chain_mutex);
3403} 3467}
3404 3468
3405static int s_show(struct seq_file *m, void *p) 3469static int s_show(struct seq_file *m, void *p)
3406{ 3470{
3407 kmem_cache_t *cachep = p; 3471 kmem_cache_t *cachep = p;
3408 struct list_head *q; 3472 struct list_head *q;
3409 struct slab *slabp; 3473 struct slab *slabp;
3410 unsigned long active_objs; 3474 unsigned long active_objs;
3411 unsigned long num_objs; 3475 unsigned long num_objs;
3412 unsigned long active_slabs = 0; 3476 unsigned long active_slabs = 0;
3413 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 3477 unsigned long num_slabs, free_objects = 0, shared_avail = 0;
3414 const char *name; 3478 const char *name;
3415 char *error = NULL; 3479 char *error = NULL;
3416 int node; 3480 int node;
@@ -3427,14 +3491,14 @@ static int s_show(struct seq_file *m, void *p)
3427 3491
3428 spin_lock(&l3->list_lock); 3492 spin_lock(&l3->list_lock);
3429 3493
3430 list_for_each(q,&l3->slabs_full) { 3494 list_for_each(q, &l3->slabs_full) {
3431 slabp = list_entry(q, struct slab, list); 3495 slabp = list_entry(q, struct slab, list);
3432 if (slabp->inuse != cachep->num && !error) 3496 if (slabp->inuse != cachep->num && !error)
3433 error = "slabs_full accounting error"; 3497 error = "slabs_full accounting error";
3434 active_objs += cachep->num; 3498 active_objs += cachep->num;
3435 active_slabs++; 3499 active_slabs++;
3436 } 3500 }
3437 list_for_each(q,&l3->slabs_partial) { 3501 list_for_each(q, &l3->slabs_partial) {
3438 slabp = list_entry(q, struct slab, list); 3502 slabp = list_entry(q, struct slab, list);
3439 if (slabp->inuse == cachep->num && !error) 3503 if (slabp->inuse == cachep->num && !error)
3440 error = "slabs_partial inuse accounting error"; 3504 error = "slabs_partial inuse accounting error";
@@ -3443,7 +3507,7 @@ static int s_show(struct seq_file *m, void *p)
3443 active_objs += slabp->inuse; 3507 active_objs += slabp->inuse;
3444 active_slabs++; 3508 active_slabs++;
3445 } 3509 }
3446 list_for_each(q,&l3->slabs_free) { 3510 list_for_each(q, &l3->slabs_free) {
3447 slabp = list_entry(q, struct slab, list); 3511 slabp = list_entry(q, struct slab, list);
3448 if (slabp->inuse && !error) 3512 if (slabp->inuse && !error)
3449 error = "slabs_free/inuse accounting error"; 3513 error = "slabs_free/inuse accounting error";
@@ -3454,25 +3518,24 @@ static int s_show(struct seq_file *m, void *p)
3454 3518
3455 spin_unlock(&l3->list_lock); 3519 spin_unlock(&l3->list_lock);
3456 } 3520 }
3457 num_slabs+=active_slabs; 3521 num_slabs += active_slabs;
3458 num_objs = num_slabs*cachep->num; 3522 num_objs = num_slabs * cachep->num;
3459 if (num_objs - active_objs != free_objects && !error) 3523 if (num_objs - active_objs != free_objects && !error)
3460 error = "free_objects accounting error"; 3524 error = "free_objects accounting error";
3461 3525
3462 name = cachep->name; 3526 name = cachep->name;
3463 if (error) 3527 if (error)
3464 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 3528 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
3465 3529
3466 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 3530 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
3467 name, active_objs, num_objs, cachep->objsize, 3531 name, active_objs, num_objs, cachep->objsize,
3468 cachep->num, (1<<cachep->gfporder)); 3532 cachep->num, (1 << cachep->gfporder));
3469 seq_printf(m, " : tunables %4u %4u %4u", 3533 seq_printf(m, " : tunables %4u %4u %4u",
3470 cachep->limit, cachep->batchcount, 3534 cachep->limit, cachep->batchcount, cachep->shared);
3471 cachep->shared);
3472 seq_printf(m, " : slabdata %6lu %6lu %6lu", 3535 seq_printf(m, " : slabdata %6lu %6lu %6lu",
3473 active_slabs, num_slabs, shared_avail); 3536 active_slabs, num_slabs, shared_avail);
3474#if STATS 3537#if STATS
3475 { /* list3 stats */ 3538 { /* list3 stats */
3476 unsigned long high = cachep->high_mark; 3539 unsigned long high = cachep->high_mark;
3477 unsigned long allocs = cachep->num_allocations; 3540 unsigned long allocs = cachep->num_allocations;
3478 unsigned long grown = cachep->grown; 3541 unsigned long grown = cachep->grown;
@@ -3483,9 +3546,7 @@ static int s_show(struct seq_file *m, void *p)
3483 unsigned long node_frees = cachep->node_frees; 3546 unsigned long node_frees = cachep->node_frees;
3484 3547
3485 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 3548 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
3486 %4lu %4lu %4lu %4lu", 3549 %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees);
3487 allocs, high, grown, reaped, errors,
3488 max_freeable, node_allocs, node_frees);
3489 } 3550 }
3490 /* cpu stats */ 3551 /* cpu stats */
3491 { 3552 {
@@ -3495,7 +3556,7 @@ static int s_show(struct seq_file *m, void *p)
3495 unsigned long freemiss = atomic_read(&cachep->freemiss); 3556 unsigned long freemiss = atomic_read(&cachep->freemiss);
3496 3557
3497 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 3558 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
3498 allochit, allocmiss, freehit, freemiss); 3559 allochit, allocmiss, freehit, freemiss);
3499 } 3560 }
3500#endif 3561#endif
3501 seq_putc(m, '\n'); 3562 seq_putc(m, '\n');
@@ -3518,10 +3579,10 @@ static int s_show(struct seq_file *m, void *p)
3518 */ 3579 */
3519 3580
3520struct seq_operations slabinfo_op = { 3581struct seq_operations slabinfo_op = {
3521 .start = s_start, 3582 .start = s_start,
3522 .next = s_next, 3583 .next = s_next,
3523 .stop = s_stop, 3584 .stop = s_stop,
3524 .show = s_show, 3585 .show = s_show,
3525}; 3586};
3526 3587
3527#define MAX_SLABINFO_WRITE 128 3588#define MAX_SLABINFO_WRITE 128
@@ -3532,18 +3593,18 @@ struct seq_operations slabinfo_op = {
3532 * @count: data length 3593 * @count: data length
3533 * @ppos: unused 3594 * @ppos: unused
3534 */ 3595 */
3535ssize_t slabinfo_write(struct file *file, const char __user *buffer, 3596ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3536 size_t count, loff_t *ppos) 3597 size_t count, loff_t *ppos)
3537{ 3598{
3538 char kbuf[MAX_SLABINFO_WRITE+1], *tmp; 3599 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
3539 int limit, batchcount, shared, res; 3600 int limit, batchcount, shared, res;
3540 struct list_head *p; 3601 struct list_head *p;
3541 3602
3542 if (count > MAX_SLABINFO_WRITE) 3603 if (count > MAX_SLABINFO_WRITE)
3543 return -EINVAL; 3604 return -EINVAL;
3544 if (copy_from_user(&kbuf, buffer, count)) 3605 if (copy_from_user(&kbuf, buffer, count))
3545 return -EFAULT; 3606 return -EFAULT;
3546 kbuf[MAX_SLABINFO_WRITE] = '\0'; 3607 kbuf[MAX_SLABINFO_WRITE] = '\0';
3547 3608
3548 tmp = strchr(kbuf, ' '); 3609 tmp = strchr(kbuf, ' ');
3549 if (!tmp) 3610 if (!tmp)
@@ -3554,25 +3615,24 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
3554 return -EINVAL; 3615 return -EINVAL;
3555 3616
3556 /* Find the cache in the chain of caches. */ 3617 /* Find the cache in the chain of caches. */
3557 down(&cache_chain_sem); 3618 mutex_lock(&cache_chain_mutex);
3558 res = -EINVAL; 3619 res = -EINVAL;
3559 list_for_each(p,&cache_chain) { 3620 list_for_each(p, &cache_chain) {
3560 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); 3621 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next);
3561 3622
3562 if (!strcmp(cachep->name, kbuf)) { 3623 if (!strcmp(cachep->name, kbuf)) {
3563 if (limit < 1 || 3624 if (limit < 1 ||
3564 batchcount < 1 || 3625 batchcount < 1 ||
3565 batchcount > limit || 3626 batchcount > limit || shared < 0) {
3566 shared < 0) {
3567 res = 0; 3627 res = 0;
3568 } else { 3628 } else {
3569 res = do_tune_cpucache(cachep, limit, 3629 res = do_tune_cpucache(cachep, limit,
3570 batchcount, shared); 3630 batchcount, shared);
3571 } 3631 }
3572 break; 3632 break;
3573 } 3633 }
3574 } 3634 }
3575 up(&cache_chain_sem); 3635 mutex_unlock(&cache_chain_mutex);
3576 if (res >= 0) 3636 if (res >= 0)
3577 res = count; 3637 res = count;
3578 return res; 3638 return res;
@@ -3596,28 +3656,5 @@ unsigned int ksize(const void *objp)
3596 if (unlikely(objp == NULL)) 3656 if (unlikely(objp == NULL))
3597 return 0; 3657 return 0;
3598 3658
3599 return obj_reallen(GET_PAGE_CACHE(virt_to_page(objp))); 3659 return obj_reallen(page_get_cache(virt_to_page(objp)));
3600}
3601
3602
3603/*
3604 * kstrdup - allocate space for and copy an existing string
3605 *
3606 * @s: the string to duplicate
3607 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
3608 */
3609char *kstrdup(const char *s, gfp_t gfp)
3610{
3611 size_t len;
3612 char *buf;
3613
3614 if (!s)
3615 return NULL;
3616
3617 len = strlen(s) + 1;
3618 buf = kmalloc(len, gfp);
3619 if (buf)
3620 memcpy(buf, s, len);
3621 return buf;
3622} 3660}
3623EXPORT_SYMBOL(kstrdup);
diff --git a/mm/slob.c b/mm/slob.c
new file mode 100644
index 0000000000..1c240c4b71
--- /dev/null
+++ b/mm/slob.c
@@ -0,0 +1,385 @@
1/*
2 * SLOB Allocator: Simple List Of Blocks
3 *
4 * Matt Mackall <mpm@selenic.com> 12/30/03
5 *
6 * How SLOB works:
7 *
8 * The core of SLOB is a traditional K&R style heap allocator, with
9 * support for returning aligned objects. The granularity of this
10 * allocator is 8 bytes on x86, though it's perhaps possible to reduce
11 * this to 4 if it's deemed worth the effort. The slob heap is a
12 * singly-linked list of pages from __get_free_page, grown on demand
13 * and allocation from the heap is currently first-fit.
14 *
15 * Above this is an implementation of kmalloc/kfree. Blocks returned
 16 * from kmalloc are 8-byte aligned and prepended with an 8-byte header.
17 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
18 * __get_free_pages directly so that it can return page-aligned blocks
19 * and keeps a linked list of such pages and their orders. These
20 * objects are detected in kfree() by their page alignment.
21 *
22 * SLAB is emulated on top of SLOB by simply calling constructors and
23 * destructors for every SLAB allocation. Objects are returned with
24 * the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is
25 * set, in which case the low-level allocator will fragment blocks to
26 * create the proper alignment. Again, objects of page-size or greater
27 * are allocated by calling __get_free_pages. As SLAB objects know
28 * their size, no separate size bookkeeping is necessary and there is
29 * essentially no allocation space overhead.
30 */
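
The block comment above is the only design description SLOB gets, so a compact user-space sketch of the same first-fit free-list idea follows. It is not the kernel code: the toy_* names, the fixed compile-time arena, and the omission of locking, alignment handling and coalescing are simplifications made purely for illustration.

/*
 * Minimal user-space sketch of a first-fit, singly-linked free list,
 * in the spirit of the comment above.  Assumptions: toy_* names and
 * the fixed arena are invented here; no locking, no page growth.
 */
#include <stddef.h>
#include <stdio.h>

struct blk {				/* free-block header, like slob_t    */
	size_t units;			/* block size in header-sized units  */
	struct blk *next;		/* next free block (circular list)   */
};

#define UNIT		sizeof(struct blk)
#define UNITS(sz)	(((sz) + UNIT - 1) / UNIT)

static struct blk arena[1024];		/* the whole heap: 1024 units        */
static struct blk head = { 0, &head };	/* dummy list head, never handed out */

static void toy_init(void)
{
	arena[0].units = 1024;
	arena[0].next = &head;
	head.next = arena;
}

static void *toy_alloc(size_t size)
{
	size_t units = UNITS(size) + 1;		/* +1 unit for the header */
	struct blk *prev = &head, *cur;

	/* first fit: take the first free block that is large enough */
	for (cur = prev->next; ; prev = cur, cur = cur->next) {
		if (cur->units >= units) {
			if (cur->units == units) {
				prev->next = cur->next;	/* exact fit: unlink  */
			} else {
				cur->units -= units;	/* split: keep head,  */
				cur += cur->units;	/* hand out the tail  */
				cur->units = units;
			}
			return cur + 1;			/* skip the header    */
		}
		if (cur == &head)			/* wrapped: no room   */
			return NULL;
	}
}

static void toy_free(void *p)
{
	struct blk *b = (struct blk *)p - 1;	/* recover the header        */

	b->next = head.next;			/* push back, no coalescing  */
	head.next = b;
}

int main(void)
{
	toy_init();
	char *s = toy_alloc(100);

	printf("allocated 100 bytes at %p\n", (void *)s);
	toy_free(s);
	return 0;
}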
31
32#include <linux/config.h>
33#include <linux/slab.h>
34#include <linux/mm.h>
35#include <linux/cache.h>
36#include <linux/init.h>
37#include <linux/module.h>
38#include <linux/timer.h>
39
40struct slob_block {
41 int units;
42 struct slob_block *next;
43};
44typedef struct slob_block slob_t;
45
46#define SLOB_UNIT sizeof(slob_t)
47#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
48#define SLOB_ALIGN L1_CACHE_BYTES
49
50struct bigblock {
51 int order;
52 void *pages;
53 struct bigblock *next;
54};
55typedef struct bigblock bigblock_t;
56
57static slob_t arena = { .next = &arena, .units = 1 };
58static slob_t *slobfree = &arena;
59static bigblock_t *bigblocks;
60static DEFINE_SPINLOCK(slob_lock);
61static DEFINE_SPINLOCK(block_lock);
62
63static void slob_free(void *b, int size);
64
65static void *slob_alloc(size_t size, gfp_t gfp, int align)
66{
67 slob_t *prev, *cur, *aligned = 0;
68 int delta = 0, units = SLOB_UNITS(size);
69 unsigned long flags;
70
71 spin_lock_irqsave(&slob_lock, flags);
72 prev = slobfree;
73 for (cur = prev->next; ; prev = cur, cur = cur->next) {
74 if (align) {
75 aligned = (slob_t *)ALIGN((unsigned long)cur, align);
76 delta = aligned - cur;
77 }
78 if (cur->units >= units + delta) { /* room enough? */
79 if (delta) { /* need to fragment head to align? */
80 aligned->units = cur->units - delta;
81 aligned->next = cur->next;
82 cur->next = aligned;
83 cur->units = delta;
84 prev = cur;
85 cur = aligned;
86 }
87
88 if (cur->units == units) /* exact fit? */
89 prev->next = cur->next; /* unlink */
90 else { /* fragment */
91 prev->next = cur + units;
92 prev->next->units = cur->units - units;
93 prev->next->next = cur->next;
94 cur->units = units;
95 }
96
97 slobfree = prev;
98 spin_unlock_irqrestore(&slob_lock, flags);
99 return cur;
100 }
101 if (cur == slobfree) {
102 spin_unlock_irqrestore(&slob_lock, flags);
103
104 if (size == PAGE_SIZE) /* trying to shrink arena? */
105 return 0;
106
107 cur = (slob_t *)__get_free_page(gfp);
108 if (!cur)
109 return 0;
110
111 slob_free(cur, PAGE_SIZE);
112 spin_lock_irqsave(&slob_lock, flags);
113 cur = slobfree;
114 }
115 }
116}
117
118static void slob_free(void *block, int size)
119{
120 slob_t *cur, *b = (slob_t *)block;
121 unsigned long flags;
122
123 if (!block)
124 return;
125
126 if (size)
127 b->units = SLOB_UNITS(size);
128
129 /* Find reinsertion point */
130 spin_lock_irqsave(&slob_lock, flags);
131 for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next)
132 if (cur >= cur->next && (b > cur || b < cur->next))
133 break;
134
135 if (b + b->units == cur->next) {
136 b->units += cur->next->units;
137 b->next = cur->next->next;
138 } else
139 b->next = cur->next;
140
141 if (cur + cur->units == b) {
142 cur->units += b->units;
143 cur->next = b->next;
144 } else
145 cur->next = b;
146
147 slobfree = cur;
148
149 spin_unlock_irqrestore(&slob_lock, flags);
150}
151
152static int FASTCALL(find_order(int size));
153static int fastcall find_order(int size)
154{
155 int order = 0;
156 for ( ; size > 4096 ; size >>=1)
157 order++;
158 return order;
159}
160
161void *kmalloc(size_t size, gfp_t gfp)
162{
163 slob_t *m;
164 bigblock_t *bb;
165 unsigned long flags;
166
167 if (size < PAGE_SIZE - SLOB_UNIT) {
168 m = slob_alloc(size + SLOB_UNIT, gfp, 0);
169 return m ? (void *)(m + 1) : 0;
170 }
171
172 bb = slob_alloc(sizeof(bigblock_t), gfp, 0);
173 if (!bb)
174 return 0;
175
176 bb->order = find_order(size);
177 bb->pages = (void *)__get_free_pages(gfp, bb->order);
178
179 if (bb->pages) {
180 spin_lock_irqsave(&block_lock, flags);
181 bb->next = bigblocks;
182 bigblocks = bb;
183 spin_unlock_irqrestore(&block_lock, flags);
184 return bb->pages;
185 }
186
187 slob_free(bb, sizeof(bigblock_t));
188 return 0;
189}
190
191EXPORT_SYMBOL(kmalloc);
192
193void kfree(const void *block)
194{
195 bigblock_t *bb, **last = &bigblocks;
196 unsigned long flags;
197
198 if (!block)
199 return;
200
201 if (!((unsigned long)block & (PAGE_SIZE-1))) {
202 /* might be on the big block list */
203 spin_lock_irqsave(&block_lock, flags);
204 for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) {
205 if (bb->pages == block) {
206 *last = bb->next;
207 spin_unlock_irqrestore(&block_lock, flags);
208 free_pages((unsigned long)block, bb->order);
209 slob_free(bb, sizeof(bigblock_t));
210 return;
211 }
212 }
213 spin_unlock_irqrestore(&block_lock, flags);
214 }
215
216 slob_free((slob_t *)block - 1, 0);
217 return;
218}
219
220EXPORT_SYMBOL(kfree);
221
222unsigned int ksize(const void *block)
223{
224 bigblock_t *bb;
225 unsigned long flags;
226
227 if (!block)
228 return 0;
229
230 if (!((unsigned long)block & (PAGE_SIZE-1))) {
231 spin_lock_irqsave(&block_lock, flags);
232 for (bb = bigblocks; bb; bb = bb->next)
233 if (bb->pages == block) {
 234 				spin_unlock_irqrestore(&block_lock, flags);
235 return PAGE_SIZE << bb->order;
236 }
237 spin_unlock_irqrestore(&block_lock, flags);
238 }
239
240 return ((slob_t *)block - 1)->units * SLOB_UNIT;
241}
242
243struct kmem_cache {
244 unsigned int size, align;
245 const char *name;
246 void (*ctor)(void *, struct kmem_cache *, unsigned long);
247 void (*dtor)(void *, struct kmem_cache *, unsigned long);
248};
249
250struct kmem_cache *kmem_cache_create(const char *name, size_t size,
251 size_t align, unsigned long flags,
252 void (*ctor)(void*, struct kmem_cache *, unsigned long),
253 void (*dtor)(void*, struct kmem_cache *, unsigned long))
254{
255 struct kmem_cache *c;
256
257 c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
258
259 if (c) {
260 c->name = name;
261 c->size = size;
262 c->ctor = ctor;
263 c->dtor = dtor;
264 /* ignore alignment unless it's forced */
265 c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
266 if (c->align < align)
267 c->align = align;
268 }
269
270 return c;
271}
272EXPORT_SYMBOL(kmem_cache_create);
273
274int kmem_cache_destroy(struct kmem_cache *c)
275{
276 slob_free(c, sizeof(struct kmem_cache));
277 return 0;
278}
279EXPORT_SYMBOL(kmem_cache_destroy);
280
281void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
282{
283 void *b;
284
285 if (c->size < PAGE_SIZE)
286 b = slob_alloc(c->size, flags, c->align);
287 else
288 b = (void *)__get_free_pages(flags, find_order(c->size));
289
 290 	if (b && c->ctor)
291 c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR);
292
293 return b;
294}
295EXPORT_SYMBOL(kmem_cache_alloc);
296
297void kmem_cache_free(struct kmem_cache *c, void *b)
298{
299 if (c->dtor)
300 c->dtor(b, c, 0);
301
302 if (c->size < PAGE_SIZE)
303 slob_free(b, c->size);
304 else
305 free_pages((unsigned long)b, find_order(c->size));
306}
307EXPORT_SYMBOL(kmem_cache_free);
308
309unsigned int kmem_cache_size(struct kmem_cache *c)
310{
311 return c->size;
312}
313EXPORT_SYMBOL(kmem_cache_size);
314
315const char *kmem_cache_name(struct kmem_cache *c)
316{
317 return c->name;
318}
319EXPORT_SYMBOL(kmem_cache_name);
320
321static struct timer_list slob_timer = TIMER_INITIALIZER(
322 (void (*)(unsigned long))kmem_cache_init, 0, 0);
323
324void kmem_cache_init(void)
325{
326 void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
327
328 if (p)
329 free_page((unsigned long)p);
330
331 mod_timer(&slob_timer, jiffies + HZ);
332}
333
334atomic_t slab_reclaim_pages = ATOMIC_INIT(0);
335EXPORT_SYMBOL(slab_reclaim_pages);
336
337#ifdef CONFIG_SMP
338
339void *__alloc_percpu(size_t size, size_t align)
340{
341 int i;
342 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
343
344 if (!pdata)
345 return NULL;
346
347 for (i = 0; i < NR_CPUS; i++) {
348 if (!cpu_possible(i))
349 continue;
350 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
351 if (!pdata->ptrs[i])
352 goto unwind_oom;
353 memset(pdata->ptrs[i], 0, size);
354 }
355
356 /* Catch derefs w/o wrappers */
357 return (void *) (~(unsigned long) pdata);
358
359unwind_oom:
360 while (--i >= 0) {
361 if (!cpu_possible(i))
362 continue;
363 kfree(pdata->ptrs[i]);
364 }
365 kfree(pdata);
366 return NULL;
367}
368EXPORT_SYMBOL(__alloc_percpu);
369
370void
371free_percpu(const void *objp)
372{
373 int i;
374 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
375
376 for (i = 0; i < NR_CPUS; i++) {
377 if (!cpu_possible(i))
378 continue;
379 kfree(p->ptrs[i]);
380 }
381 kfree(p);
382}
383EXPORT_SYMBOL(free_percpu);
384
385#endif
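
The header comment at the top of slob.c describes what the code above implements: small requests are carved from a first-fit free list in SLOB_UNIT granules, kmalloc() hides one extra unit in front of the payload so that ksize() can recover the size, and anything near page size is handed to __get_free_pages() via the bigblock list. Below is a minimal user-space sketch of that unit arithmetic only; the toy_* names are hypothetical stand-ins, malloc() replaces the slob arena, and nothing here is kernel code.

/* Illustrative only: a user-space sketch of the unit arithmetic used by
 * the SLOB code above.  It mirrors SLOB_UNIT/SLOB_UNITS and the one-unit
 * header that kmalloc() prepends so ksize() can find the size again;
 * malloc() replaces the slob arena, so this is not kernel code. */
#include <stdio.h>
#include <stdlib.h>

struct slob_block {
	int units;
	struct slob_block *next;
};
typedef struct slob_block slob_t;

#define SLOB_UNIT	sizeof(slob_t)
#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1) / SLOB_UNIT)

/* Mimic kmalloc()'s small-object path: one extra unit stores the size. */
static void *toy_kmalloc(size_t size)
{
	slob_t *m = malloc(SLOB_UNITS(size + SLOB_UNIT) * SLOB_UNIT);

	if (!m)
		return NULL;
	m->units = SLOB_UNITS(size + SLOB_UNIT);
	return m + 1;			/* caller only sees the payload */
}

/* Mimic ksize(): step back one unit and read the stored unit count. */
static size_t toy_ksize(const void *block)
{
	return ((const slob_t *)block - 1)->units * SLOB_UNIT;
}

int main(void)
{
	void *p = toy_kmalloc(100);

	printf("SLOB_UNIT = %zu bytes\n", SLOB_UNIT);
	printf("SLOB_UNITS(100) = %zu units (%zu bytes)\n",
	       SLOB_UNITS(100), SLOB_UNITS(100) * SLOB_UNIT);
	if (p) {
		printf("toy_kmalloc(100): toy_ksize() reports %zu bytes\n",
		       toy_ksize(p));
		free((slob_t *)p - 1);
	}
	return 0;
}

As in the real allocator, the stored unit count includes the header itself, so the reported size is a little larger than the requested one.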
diff --git a/mm/sparse.c b/mm/sparse.c
index 72079b538e..0a51f36ba3 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -18,10 +18,10 @@
18 */ 18 */
19#ifdef CONFIG_SPARSEMEM_EXTREME 19#ifdef CONFIG_SPARSEMEM_EXTREME
20struct mem_section *mem_section[NR_SECTION_ROOTS] 20struct mem_section *mem_section[NR_SECTION_ROOTS]
21 ____cacheline_maxaligned_in_smp; 21 ____cacheline_internodealigned_in_smp;
22#else 22#else
23struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] 23struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
24 ____cacheline_maxaligned_in_smp; 24 ____cacheline_internodealigned_in_smp;
25#endif 25#endif
26EXPORT_SYMBOL(mem_section); 26EXPORT_SYMBOL(mem_section);
27 27
diff --git a/mm/swap.c b/mm/swap.c
index d09cf7f03e..bc2442a7b0 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -34,8 +34,6 @@
34/* How many pages do we try to swap or page in/out together? */ 34/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 35int page_cluster;
36 36
37#ifdef CONFIG_HUGETLB_PAGE
38
39void put_page(struct page *page) 37void put_page(struct page *page)
40{ 38{
41 if (unlikely(PageCompound(page))) { 39 if (unlikely(PageCompound(page))) {
@@ -52,7 +50,6 @@ void put_page(struct page *page)
52 __page_cache_release(page); 50 __page_cache_release(page);
53} 51}
54EXPORT_SYMBOL(put_page); 52EXPORT_SYMBOL(put_page);
55#endif
56 53
57/* 54/*
58 * Writeback is about to end against a page which has been marked for immediate 55 * Writeback is about to end against a page which has been marked for immediate
@@ -159,18 +156,50 @@ void fastcall lru_cache_add_active(struct page *page)
159 put_cpu_var(lru_add_active_pvecs); 156 put_cpu_var(lru_add_active_pvecs);
160} 157}
161 158
162void lru_add_drain(void) 159static void __lru_add_drain(int cpu)
163{ 160{
164 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); 161 struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
165 162
163 /* CPU is dead, so no locking needed. */
166 if (pagevec_count(pvec)) 164 if (pagevec_count(pvec))
167 __pagevec_lru_add(pvec); 165 __pagevec_lru_add(pvec);
168 pvec = &__get_cpu_var(lru_add_active_pvecs); 166 pvec = &per_cpu(lru_add_active_pvecs, cpu);
169 if (pagevec_count(pvec)) 167 if (pagevec_count(pvec))
170 __pagevec_lru_add_active(pvec); 168 __pagevec_lru_add_active(pvec);
171 put_cpu_var(lru_add_pvecs);
172} 169}
173 170
171void lru_add_drain(void)
172{
173 __lru_add_drain(get_cpu());
174 put_cpu();
175}
176
177#ifdef CONFIG_NUMA
178static void lru_add_drain_per_cpu(void *dummy)
179{
180 lru_add_drain();
181}
182
183/*
184 * Returns 0 for success
185 */
186int lru_add_drain_all(void)
187{
188 return schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
189}
190
191#else
192
193/*
194 * Returns 0 for success
195 */
196int lru_add_drain_all(void)
197{
198 lru_add_drain();
199 return 0;
200}
201#endif
202
174/* 203/*
175 * This path almost never happens for VM activity - pages are normally 204 * This path almost never happens for VM activity - pages are normally
176 * freed via pagevecs. But it gets used by networking. 205 * freed via pagevecs. But it gets used by networking.
@@ -381,6 +410,8 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
381 return pagevec_count(pvec); 410 return pagevec_count(pvec);
382} 411}
383 412
413EXPORT_SYMBOL(pagevec_lookup);
414
384unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, 415unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
385 pgoff_t *index, int tag, unsigned nr_pages) 416 pgoff_t *index, int tag, unsigned nr_pages)
386{ 417{
@@ -415,17 +446,6 @@ void vm_acct_memory(long pages)
415} 446}
416 447
417#ifdef CONFIG_HOTPLUG_CPU 448#ifdef CONFIG_HOTPLUG_CPU
418static void lru_drain_cache(unsigned int cpu)
419{
420 struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
421
422 /* CPU is dead, so no locking needed. */
423 if (pagevec_count(pvec))
424 __pagevec_lru_add(pvec);
425 pvec = &per_cpu(lru_add_active_pvecs, cpu);
426 if (pagevec_count(pvec))
427 __pagevec_lru_add_active(pvec);
428}
429 449
430/* Drop the CPU's cached committed space back into the central pool. */ 450/* Drop the CPU's cached committed space back into the central pool. */
431static int cpu_swap_callback(struct notifier_block *nfb, 451static int cpu_swap_callback(struct notifier_block *nfb,
@@ -438,7 +458,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
438 if (action == CPU_DEAD) { 458 if (action == CPU_DEAD) {
439 atomic_add(*committed, &vm_committed_space); 459 atomic_add(*committed, &vm_committed_space);
440 *committed = 0; 460 *committed = 0;
441 lru_drain_cache((long)hcpu); 461 __lru_add_drain((long)hcpu);
442 } 462 }
443 return NOTIFY_OK; 463 return NOTIFY_OK;
444} 464}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0df9a57b1d..7b09ac503f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/pagevec.h>
17 18
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
19 20
@@ -140,7 +141,7 @@ void __delete_from_swap_cache(struct page *page)
140 * Allocate swap space for the page and add the page to the 141 * Allocate swap space for the page and add the page to the
141 * swap cache. Caller needs to hold the page lock. 142 * swap cache. Caller needs to hold the page lock.
142 */ 143 */
143int add_to_swap(struct page * page) 144int add_to_swap(struct page * page, gfp_t gfp_mask)
144{ 145{
145 swp_entry_t entry; 146 swp_entry_t entry;
146 int err; 147 int err;
@@ -165,7 +166,7 @@ int add_to_swap(struct page * page)
165 * Add it to the swap cache and mark it dirty 166 * Add it to the swap cache and mark it dirty
166 */ 167 */
167 err = __add_to_swap_cache(page, entry, 168 err = __add_to_swap_cache(page, entry,
168 GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN); 169 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
169 170
170 switch (err) { 171 switch (err) {
171 case 0: /* Success */ 172 case 0: /* Success */
@@ -272,12 +273,11 @@ void free_page_and_swap_cache(struct page *page)
272 */ 273 */
273void free_pages_and_swap_cache(struct page **pages, int nr) 274void free_pages_and_swap_cache(struct page **pages, int nr)
274{ 275{
275 int chunk = 16;
276 struct page **pagep = pages; 276 struct page **pagep = pages;
277 277
278 lru_add_drain(); 278 lru_add_drain();
279 while (nr) { 279 while (nr) {
280 int todo = min(chunk, nr); 280 int todo = min(nr, PAGEVEC_SIZE);
281 int i; 281 int i;
282 282
283 for (i = 0; i < todo; i++) 283 for (i = 0; i < todo; i++)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index edafeace30..f1e69c30d2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -25,6 +25,8 @@
25#include <linux/rmap.h> 25#include <linux/rmap.h>
26#include <linux/security.h> 26#include <linux/security.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/mutex.h>
29#include <linux/capability.h>
28#include <linux/syscalls.h> 30#include <linux/syscalls.h>
29 31
30#include <asm/pgtable.h> 32#include <asm/pgtable.h>
@@ -45,12 +47,12 @@ struct swap_list_t swap_list = {-1, -1};
45 47
46struct swap_info_struct swap_info[MAX_SWAPFILES]; 48struct swap_info_struct swap_info[MAX_SWAPFILES];
47 49
48static DECLARE_MUTEX(swapon_sem); 50static DEFINE_MUTEX(swapon_mutex);
49 51
50/* 52/*
51 * We need this because the bdev->unplug_fn can sleep and we cannot 53 * We need this because the bdev->unplug_fn can sleep and we cannot
52 * hold swap_lock while calling the unplug_fn. And swap_lock 54 * hold swap_lock while calling the unplug_fn. And swap_lock
53 * cannot be turned into a semaphore. 55 * cannot be turned into a mutex.
54 */ 56 */
55static DECLARE_RWSEM(swap_unplug_sem); 57static DECLARE_RWSEM(swap_unplug_sem);
56 58
@@ -211,6 +213,26 @@ noswap:
211 return (swp_entry_t) {0}; 213 return (swp_entry_t) {0};
212} 214}
213 215
216swp_entry_t get_swap_page_of_type(int type)
217{
218 struct swap_info_struct *si;
219 pgoff_t offset;
220
221 spin_lock(&swap_lock);
222 si = swap_info + type;
223 if (si->flags & SWP_WRITEOK) {
224 nr_swap_pages--;
225 offset = scan_swap_map(si);
226 if (offset) {
227 spin_unlock(&swap_lock);
228 return swp_entry(type, offset);
229 }
230 nr_swap_pages++;
231 }
232 spin_unlock(&swap_lock);
233 return (swp_entry_t) {0};
234}
235
214static struct swap_info_struct * swap_info_get(swp_entry_t entry) 236static struct swap_info_struct * swap_info_get(swp_entry_t entry)
215{ 237{
216 struct swap_info_struct * p; 238 struct swap_info_struct * p;
@@ -1140,7 +1162,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1140 up_write(&swap_unplug_sem); 1162 up_write(&swap_unplug_sem);
1141 1163
1142 destroy_swap_extents(p); 1164 destroy_swap_extents(p);
1143 down(&swapon_sem); 1165 mutex_lock(&swapon_mutex);
1144 spin_lock(&swap_lock); 1166 spin_lock(&swap_lock);
1145 drain_mmlist(); 1167 drain_mmlist();
1146 1168
@@ -1159,7 +1181,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1159 p->swap_map = NULL; 1181 p->swap_map = NULL;
1160 p->flags = 0; 1182 p->flags = 0;
1161 spin_unlock(&swap_lock); 1183 spin_unlock(&swap_lock);
1162 up(&swapon_sem); 1184 mutex_unlock(&swapon_mutex);
1163 vfree(swap_map); 1185 vfree(swap_map);
1164 inode = mapping->host; 1186 inode = mapping->host;
1165 if (S_ISBLK(inode->i_mode)) { 1187 if (S_ISBLK(inode->i_mode)) {
@@ -1167,9 +1189,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1167 set_blocksize(bdev, p->old_block_size); 1189 set_blocksize(bdev, p->old_block_size);
1168 bd_release(bdev); 1190 bd_release(bdev);
1169 } else { 1191 } else {
1170 down(&inode->i_sem); 1192 mutex_lock(&inode->i_mutex);
1171 inode->i_flags &= ~S_SWAPFILE; 1193 inode->i_flags &= ~S_SWAPFILE;
1172 up(&inode->i_sem); 1194 mutex_unlock(&inode->i_mutex);
1173 } 1195 }
1174 filp_close(swap_file, NULL); 1196 filp_close(swap_file, NULL);
1175 err = 0; 1197 err = 0;
@@ -1188,7 +1210,7 @@ static void *swap_start(struct seq_file *swap, loff_t *pos)
1188 int i; 1210 int i;
1189 loff_t l = *pos; 1211 loff_t l = *pos;
1190 1212
1191 down(&swapon_sem); 1213 mutex_lock(&swapon_mutex);
1192 1214
1193 for (i = 0; i < nr_swapfiles; i++, ptr++) { 1215 for (i = 0; i < nr_swapfiles; i++, ptr++) {
1194 if (!(ptr->flags & SWP_USED) || !ptr->swap_map) 1216 if (!(ptr->flags & SWP_USED) || !ptr->swap_map)
@@ -1217,7 +1239,7 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
1217 1239
1218static void swap_stop(struct seq_file *swap, void *v) 1240static void swap_stop(struct seq_file *swap, void *v)
1219{ 1241{
1220 up(&swapon_sem); 1242 mutex_unlock(&swapon_mutex);
1221} 1243}
1222 1244
1223static int swap_show(struct seq_file *swap, void *v) 1245static int swap_show(struct seq_file *swap, void *v)
@@ -1386,7 +1408,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1386 p->bdev = bdev; 1408 p->bdev = bdev;
1387 } else if (S_ISREG(inode->i_mode)) { 1409 } else if (S_ISREG(inode->i_mode)) {
1388 p->bdev = inode->i_sb->s_bdev; 1410 p->bdev = inode->i_sb->s_bdev;
1389 down(&inode->i_sem); 1411 mutex_lock(&inode->i_mutex);
1390 did_down = 1; 1412 did_down = 1;
1391 if (IS_SWAPFILE(inode)) { 1413 if (IS_SWAPFILE(inode)) {
1392 error = -EBUSY; 1414 error = -EBUSY;
@@ -1422,7 +1444,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1422 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) 1444 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
1423 swap_header_version = 2; 1445 swap_header_version = 2;
1424 else { 1446 else {
1425 printk("Unable to find swap-space signature\n"); 1447 printk(KERN_ERR "Unable to find swap-space signature\n");
1426 error = -EINVAL; 1448 error = -EINVAL;
1427 goto bad_swap; 1449 goto bad_swap;
1428 } 1450 }
@@ -1473,7 +1495,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1473 goto bad_swap; 1495 goto bad_swap;
1474 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1496 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1475 goto bad_swap; 1497 goto bad_swap;
1476 1498
1477 /* OK, set up the swap map and apply the bad block list */ 1499 /* OK, set up the swap map and apply the bad block list */
1478 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { 1500 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
1479 error = -ENOMEM; 1501 error = -ENOMEM;
@@ -1482,17 +1504,17 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1482 1504
1483 error = 0; 1505 error = 0;
1484 memset(p->swap_map, 0, maxpages * sizeof(short)); 1506 memset(p->swap_map, 0, maxpages * sizeof(short));
1485 for (i=0; i<swap_header->info.nr_badpages; i++) { 1507 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1486 int page = swap_header->info.badpages[i]; 1508 int page_nr = swap_header->info.badpages[i];
1487 if (page <= 0 || page >= swap_header->info.last_page) 1509 if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
1488 error = -EINVAL; 1510 error = -EINVAL;
1489 else 1511 else
1490 p->swap_map[page] = SWAP_MAP_BAD; 1512 p->swap_map[page_nr] = SWAP_MAP_BAD;
1491 } 1513 }
1492 nr_good_pages = swap_header->info.last_page - 1514 nr_good_pages = swap_header->info.last_page -
1493 swap_header->info.nr_badpages - 1515 swap_header->info.nr_badpages -
1494 1 /* header page */; 1516 1 /* header page */;
1495 if (error) 1517 if (error)
1496 goto bad_swap; 1518 goto bad_swap;
1497 } 1519 }
1498 1520
@@ -1519,7 +1541,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1519 goto bad_swap; 1541 goto bad_swap;
1520 } 1542 }
1521 1543
1522 down(&swapon_sem); 1544 mutex_lock(&swapon_mutex);
1523 spin_lock(&swap_lock); 1545 spin_lock(&swap_lock);
1524 p->flags = SWP_ACTIVE; 1546 p->flags = SWP_ACTIVE;
1525 nr_swap_pages += nr_good_pages; 1547 nr_swap_pages += nr_good_pages;
@@ -1545,7 +1567,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1545 swap_info[prev].next = p - swap_info; 1567 swap_info[prev].next = p - swap_info;
1546 } 1568 }
1547 spin_unlock(&swap_lock); 1569 spin_unlock(&swap_lock);
1548 up(&swapon_sem); 1570 mutex_unlock(&swapon_mutex);
1549 error = 0; 1571 error = 0;
1550 goto out; 1572 goto out;
1551bad_swap: 1573bad_swap:
@@ -1576,7 +1598,7 @@ out:
1576 if (did_down) { 1598 if (did_down) {
1577 if (!error) 1599 if (!error)
1578 inode->i_flags |= S_SWAPFILE; 1600 inode->i_flags |= S_SWAPFILE;
1579 up(&inode->i_sem); 1601 mutex_unlock(&inode->i_mutex);
1580 } 1602 }
1581 return error; 1603 return error;
1582} 1604}
diff --git a/mm/thrash.c b/mm/thrash.c
index eff3c18c33..f4c560b4a2 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -57,14 +57,17 @@ void grab_swap_token(void)
57 /* We have the token. Let others know we still need it. */ 57 /* We have the token. Let others know we still need it. */
58 if (has_swap_token(current->mm)) { 58 if (has_swap_token(current->mm)) {
59 current->mm->recent_pagein = 1; 59 current->mm->recent_pagein = 1;
60 if (unlikely(!swap_token_default_timeout))
61 disable_swap_token();
60 return; 62 return;
61 } 63 }
62 64
63 if (time_after(jiffies, swap_token_check)) { 65 if (time_after(jiffies, swap_token_check)) {
64 66
65 /* Can't get swapout protection if we exceed our RSS limit. */ 67 if (!swap_token_default_timeout) {
66 // if (current->mm->rss > current->mm->rlimit_rss) 68 swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
67 // return; 69 return;
70 }
68 71
69 /* ... or if we recently held the token. */ 72 /* ... or if we recently held the token. */
70 if (time_before(jiffies, current->mm->swap_token_time)) 73 if (time_before(jiffies, current->mm->swap_token_time))
@@ -95,6 +98,7 @@ void __put_swap_token(struct mm_struct *mm)
95{ 98{
96 spin_lock(&swap_token_lock); 99 spin_lock(&swap_token_lock);
97 if (likely(mm == swap_token_mm)) { 100 if (likely(mm == swap_token_mm)) {
101 mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL;
98 swap_token_mm = &init_mm; 102 swap_token_mm = &init_mm;
99 swap_token_check = jiffies; 103 swap_token_check = jiffies;
100 } 104 }
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index b58abcf44e..f9d6a9cc91 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -81,13 +81,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
81 goto close_file; 81 goto close_file;
82 82
83 d_instantiate(dentry, inode); 83 d_instantiate(dentry, inode);
84 inode->i_size = size;
85 inode->i_nlink = 0; /* It is unlinked */ 84 inode->i_nlink = 0; /* It is unlinked */
85
86 file->f_vfsmnt = mntget(shm_mnt); 86 file->f_vfsmnt = mntget(shm_mnt);
87 file->f_dentry = dentry; 87 file->f_dentry = dentry;
88 file->f_mapping = inode->i_mapping; 88 file->f_mapping = inode->i_mapping;
89 file->f_op = &ramfs_file_operations; 89 file->f_op = &ramfs_file_operations;
90 file->f_mode = FMODE_WRITE | FMODE_READ; 90 file->f_mode = FMODE_WRITE | FMODE_READ;
91
92 /* notify everyone as to the change of file size */
93 error = do_truncate(dentry, size, 0, file);
94 if (error < 0)
95 goto close_file;
96
91 return file; 97 return file;
92 98
93close_file: 99close_file:
@@ -123,3 +129,24 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
123{ 129{
124 return 0; 130 return 0;
125} 131}
132
133int shmem_mmap(struct file *file, struct vm_area_struct *vma)
134{
135 file_accessed(file);
136#ifndef CONFIG_MMU
137 return ramfs_nommu_mmap(file, vma);
138#else
139 return 0;
140#endif
141}
142
143#ifndef CONFIG_MMU
144unsigned long shmem_get_unmapped_area(struct file *file,
145 unsigned long addr,
146 unsigned long len,
147 unsigned long pgoff,
148 unsigned long flags)
149{
150 return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
151}
152#endif
diff --git a/mm/truncate.c b/mm/truncate.c
index 29c18f68dc..6cb3fff25f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -82,12 +82,15 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
82} 82}
83 83
84/** 84/**
85 * truncate_inode_pages - truncate *all* the pages from an offset 85 * truncate_inode_pages_range - truncate range of pages specified by start and
86 * end byte offsets
86 * @mapping: mapping to truncate 87 * @mapping: mapping to truncate
87 * @lstart: offset from which to truncate 88 * @lstart: offset from which to truncate
89 * @lend: offset to which to truncate
88 * 90 *
89 * Truncate the page cache at a set offset, removing the pages that are beyond 91 * Truncate the page cache, removing the pages that are between
90 * that offset (and zeroing out partial pages). 92 * the specified offsets (and zeroing out the partial page
93 * if lstart is not page aligned).
91 * 94 *
92 * Truncate takes two passes - the first pass is nonblocking. It will not 95 * Truncate takes two passes - the first pass is nonblocking. It will not
93 * block on page locks and it will not block on writeback. The second pass 96 * block on page locks and it will not block on writeback. The second pass
@@ -101,12 +104,12 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
101 * We pass down the cache-hot hint to the page freeing code. Even if the 104 * We pass down the cache-hot hint to the page freeing code. Even if the
102 * mapping is large, it is probably the case that the final pages are the most 105 * mapping is large, it is probably the case that the final pages are the most
103 * recently touched, and freeing happens in ascending file offset order. 106 * recently touched, and freeing happens in ascending file offset order.
104 *
105 * Called under (and serialised by) inode->i_sem.
106 */ 107 */
107void truncate_inode_pages(struct address_space *mapping, loff_t lstart) 108void truncate_inode_pages_range(struct address_space *mapping,
109 loff_t lstart, loff_t lend)
108{ 110{
109 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 111 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
112 pgoff_t end;
110 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 113 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
111 struct pagevec pvec; 114 struct pagevec pvec;
112 pgoff_t next; 115 pgoff_t next;
@@ -115,13 +118,22 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
115 if (mapping->nrpages == 0) 118 if (mapping->nrpages == 0)
116 return; 119 return;
117 120
121 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
122 end = (lend >> PAGE_CACHE_SHIFT);
123
118 pagevec_init(&pvec, 0); 124 pagevec_init(&pvec, 0);
119 next = start; 125 next = start;
120 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 126 while (next <= end &&
127 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
121 for (i = 0; i < pagevec_count(&pvec); i++) { 128 for (i = 0; i < pagevec_count(&pvec); i++) {
122 struct page *page = pvec.pages[i]; 129 struct page *page = pvec.pages[i];
123 pgoff_t page_index = page->index; 130 pgoff_t page_index = page->index;
124 131
132 if (page_index > end) {
133 next = page_index;
134 break;
135 }
136
125 if (page_index > next) 137 if (page_index > next)
126 next = page_index; 138 next = page_index;
127 next++; 139 next++;
@@ -157,9 +169,15 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
157 next = start; 169 next = start;
158 continue; 170 continue;
159 } 171 }
172 if (pvec.pages[0]->index > end) {
173 pagevec_release(&pvec);
174 break;
175 }
160 for (i = 0; i < pagevec_count(&pvec); i++) { 176 for (i = 0; i < pagevec_count(&pvec); i++) {
161 struct page *page = pvec.pages[i]; 177 struct page *page = pvec.pages[i];
162 178
179 if (page->index > end)
180 break;
163 lock_page(page); 181 lock_page(page);
164 wait_on_page_writeback(page); 182 wait_on_page_writeback(page);
165 if (page->index > next) 183 if (page->index > next)
@@ -171,7 +189,19 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
171 pagevec_release(&pvec); 189 pagevec_release(&pvec);
172 } 190 }
173} 191}
192EXPORT_SYMBOL(truncate_inode_pages_range);
174 193
194/**
195 * truncate_inode_pages - truncate *all* the pages from an offset
196 * @mapping: mapping to truncate
197 * @lstart: offset from which to truncate
198 *
199 * Called under (and serialised by) inode->i_mutex.
200 */
201void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
202{
203 truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
204}
175EXPORT_SYMBOL(truncate_inode_pages); 205EXPORT_SYMBOL(truncate_inode_pages);
176 206
177/** 207/**
@@ -219,7 +249,6 @@ unlock:
219 break; 249 break;
220 } 250 }
221 pagevec_release(&pvec); 251 pagevec_release(&pvec);
222 cond_resched();
223 } 252 }
224 return ret; 253 return ret;
225} 254}
@@ -282,8 +311,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
282 * Zap the rest of the file in one hit. 311 * Zap the rest of the file in one hit.
283 */ 312 */
284 unmap_mapping_range(mapping, 313 unmap_mapping_range(mapping,
285 page_index << PAGE_CACHE_SHIFT, 314 (loff_t)page_index<<PAGE_CACHE_SHIFT,
286 (end - page_index + 1) 315 (loff_t)(end - page_index + 1)
287 << PAGE_CACHE_SHIFT, 316 << PAGE_CACHE_SHIFT,
288 0); 317 0);
289 did_range_unmap = 1; 318 did_range_unmap = 1;
@@ -292,7 +321,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
292 * Just zap this page 321 * Just zap this page
293 */ 322 */
294 unmap_mapping_range(mapping, 323 unmap_mapping_range(mapping,
295 page_index << PAGE_CACHE_SHIFT, 324 (loff_t)page_index<<PAGE_CACHE_SHIFT,
296 PAGE_CACHE_SIZE, 0); 325 PAGE_CACHE_SIZE, 0);
297 } 326 }
298 } 327 }
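
The last two hunks above replace page_index << PAGE_CACHE_SHIFT with (loff_t)page_index << PAGE_CACHE_SHIFT. Since page_index is a pgoff_t (an unsigned long), the un-cast shift is done in 32 bits on a 32-bit kernel, and the byte offset passed to unmap_mapping_range() wraps for files past 4 GiB; widening first preserves the full offset. A small stand-alone illustration follows, with uint32_t standing in for a 32-bit pgoff_t, uint64_t for loff_t, and a 4 KiB page size assumed:

/* Illustrative only: why the hunks above cast page_index to loff_t before
 * shifting.  uint32_t stands in for a 32-bit pgoff_t/unsigned long,
 * uint64_t for loff_t, and a 4 KiB page (shift of 12) is assumed. */
#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define PAGE_CACHE_SHIFT 12		/* assume 4 KiB pages */

int main(void)
{
	uint32_t page_index = 0x00150000;	/* ~5.25 GiB into the file */

	/* Un-cast: the shift happens in 32 bits and the high bits are lost. */
	uint64_t wrapped = page_index << PAGE_CACHE_SHIFT;

	/* Widened first, as the patch does with (loff_t)page_index. */
	uint64_t correct = (uint64_t)page_index << PAGE_CACHE_SHIFT;

	printf("without cast: 0x%" PRIx64 "\n", wrapped);
	printf("with cast:    0x%" PRIx64 "\n", correct);
	return 0;
}

On 64-bit configurations unsigned long is already 64 bits wide, so the cast changes nothing there and the patch can apply it unconditionally.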
diff --git a/mm/util.c b/mm/util.c
new file mode 100644
index 0000000000..5f4bb59da6
--- /dev/null
+++ b/mm/util.c
@@ -0,0 +1,39 @@
1#include <linux/slab.h>
2#include <linux/string.h>
3#include <linux/module.h>
4
5/**
6 * kzalloc - allocate memory. The memory is set to zero.
7 * @size: how many bytes of memory are required.
8 * @flags: the type of memory to allocate.
9 */
10void *kzalloc(size_t size, gfp_t flags)
11{
12 void *ret = kmalloc(size, flags);
13 if (ret)
14 memset(ret, 0, size);
15 return ret;
16}
17EXPORT_SYMBOL(kzalloc);
18
19/*
20 * kstrdup - allocate space for and copy an existing string
21 *
22 * @s: the string to duplicate
23 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
24 */
25char *kstrdup(const char *s, gfp_t gfp)
26{
27 size_t len;
28 char *buf;
29
30 if (!s)
31 return NULL;
32
33 len = strlen(s) + 1;
34 buf = kmalloc(len, gfp);
35 if (buf)
36 memcpy(buf, s, len);
37 return buf;
38}
39EXPORT_SYMBOL(kstrdup);
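
mm/util.c above adds two small helpers: kzalloc(), which returns zeroed memory or NULL, and kstrdup(), which duplicates a string including its terminating NUL and tolerates a NULL input. The following user-space sketch mirrors those contracts; the toy_* names are hypothetical, malloc() stands in for kmalloc(), and the gfp flags are dropped.

/* Illustrative only: user-space stand-ins for the helpers added above.
 * malloc() replaces kmalloc() and the gfp flags are dropped; the mirrored
 * behaviour is "zeroed buffer or NULL" and "NULL-safe strdup". */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void *toy_kzalloc(size_t size)
{
	void *ret = malloc(size);

	if (ret)
		memset(ret, 0, size);	/* same post-condition as kzalloc() */
	return ret;
}

static char *toy_kstrdup(const char *s)
{
	size_t len;
	char *buf;

	if (!s)
		return NULL;		/* kstrdup() accepts a NULL source */
	len = strlen(s) + 1;		/* copy the trailing '\0' as well */
	buf = malloc(len);
	if (buf)
		memcpy(buf, s, len);
	return buf;
}

int main(void)
{
	int *counters = toy_kzalloc(4 * sizeof(int));
	char *name = toy_kstrdup("swapfile");

	if (counters && name)
		printf("counters[3]=%d name=\"%s\"\n", counters[3], name);
	free(counters);
	free(name);
	return 0;
}

The point of both helpers is to let callers elsewhere in the tree drop open-coded kmalloc()+memset() and strlen()+kmalloc()+memcpy() sequences.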
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 135bf8ca96..2e34b61a70 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -63,9 +63,6 @@ struct scan_control {
63 63
64 unsigned long nr_mapped; /* From page_state */ 64 unsigned long nr_mapped; /* From page_state */
65 65
66 /* How many pages shrink_cache() should reclaim */
67 int nr_to_reclaim;
68
69 /* Ask shrink_caches, or shrink_zone to scan at this priority */ 66 /* Ask shrink_caches, or shrink_zone to scan at this priority */
70 unsigned int priority; 67 unsigned int priority;
71 68
@@ -186,8 +183,7 @@ EXPORT_SYMBOL(remove_shrinker);
186 * 183 *
187 * Returns the number of slab objects which we shrunk. 184 * Returns the number of slab objects which we shrunk.
188 */ 185 */
189static int shrink_slab(unsigned long scanned, gfp_t gfp_mask, 186int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages)
190 unsigned long lru_pages)
191{ 187{
192 struct shrinker *shrinker; 188 struct shrinker *shrinker;
193 int ret = 0; 189 int ret = 0;
@@ -201,13 +197,25 @@ static int shrink_slab(unsigned long scanned, gfp_t gfp_mask,
201 list_for_each_entry(shrinker, &shrinker_list, list) { 197 list_for_each_entry(shrinker, &shrinker_list, list) {
202 unsigned long long delta; 198 unsigned long long delta;
203 unsigned long total_scan; 199 unsigned long total_scan;
200 unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
204 201
205 delta = (4 * scanned) / shrinker->seeks; 202 delta = (4 * scanned) / shrinker->seeks;
206 delta *= (*shrinker->shrinker)(0, gfp_mask); 203 delta *= max_pass;
207 do_div(delta, lru_pages + 1); 204 do_div(delta, lru_pages + 1);
208 shrinker->nr += delta; 205 shrinker->nr += delta;
209 if (shrinker->nr < 0) 206 if (shrinker->nr < 0) {
210 shrinker->nr = LONG_MAX; /* It wrapped! */ 207 printk(KERN_ERR "%s: nr=%ld\n",
208 __FUNCTION__, shrinker->nr);
209 shrinker->nr = max_pass;
210 }
211
212 /*
213 * Avoid risking looping forever due to too large nr value:
214 * never try to free more than twice the estimate number of
215 * freeable entries.
216 */
217 if (shrinker->nr > max_pass * 2)
218 shrinker->nr = max_pass * 2;
211 219
212 total_scan = shrinker->nr; 220 total_scan = shrinker->nr;
213 shrinker->nr = 0; 221 shrinker->nr = 0;
@@ -263,9 +271,7 @@ static inline int is_page_cache_freeable(struct page *page)
263 271
264static int may_write_to_queue(struct backing_dev_info *bdi) 272static int may_write_to_queue(struct backing_dev_info *bdi)
265{ 273{
266 if (current_is_kswapd()) 274 if (current->flags & PF_SWAPWRITE)
267 return 1;
268 if (current_is_pdflush()) /* This is unlikely, but why not... */
269 return 1; 275 return 1;
270 if (!bdi_write_congested(bdi)) 276 if (!bdi_write_congested(bdi))
271 return 1; 277 return 1;
@@ -355,7 +361,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
355 res = mapping->a_ops->writepage(page, &wbc); 361 res = mapping->a_ops->writepage(page, &wbc);
356 if (res < 0) 362 if (res < 0)
357 handle_write_error(mapping, page, res); 363 handle_write_error(mapping, page, res);
358 if (res == WRITEPAGE_ACTIVATE) { 364 if (res == AOP_WRITEPAGE_ACTIVATE) {
359 ClearPageReclaim(page); 365 ClearPageReclaim(page);
360 return PAGE_ACTIVATE; 366 return PAGE_ACTIVATE;
361 } 367 }
@@ -370,6 +376,43 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
370 return PAGE_CLEAN; 376 return PAGE_CLEAN;
371} 377}
372 378
379static int remove_mapping(struct address_space *mapping, struct page *page)
380{
381 if (!mapping)
382 return 0; /* truncate got there first */
383
384 write_lock_irq(&mapping->tree_lock);
385
386 /*
387 * The non-racy check for busy page. It is critical to check
388 * PageDirty _after_ making sure that the page is freeable and
389 * not in use by anybody. (pagecache + us == 2)
390 */
391 if (unlikely(page_count(page) != 2))
392 goto cannot_free;
393 smp_rmb();
394 if (unlikely(PageDirty(page)))
395 goto cannot_free;
396
397 if (PageSwapCache(page)) {
398 swp_entry_t swap = { .val = page_private(page) };
399 __delete_from_swap_cache(page);
400 write_unlock_irq(&mapping->tree_lock);
401 swap_free(swap);
402 __put_page(page); /* The pagecache ref */
403 return 1;
404 }
405
406 __remove_from_page_cache(page);
407 write_unlock_irq(&mapping->tree_lock);
408 __put_page(page);
409 return 1;
410
411cannot_free:
412 write_unlock_irq(&mapping->tree_lock);
413 return 0;
414}
415
373/* 416/*
374 * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed 417 * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
375 */ 418 */
@@ -407,7 +450,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
407 if (PageWriteback(page)) 450 if (PageWriteback(page))
408 goto keep_locked; 451 goto keep_locked;
409 452
410 referenced = page_referenced(page, 1, sc->priority <= 0); 453 referenced = page_referenced(page, 1);
411 /* In active use or really unfreeable? Activate it. */ 454 /* In active use or really unfreeable? Activate it. */
412 if (referenced && page_mapping_inuse(page)) 455 if (referenced && page_mapping_inuse(page))
413 goto activate_locked; 456 goto activate_locked;
@@ -420,7 +463,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
420 if (PageAnon(page) && !PageSwapCache(page)) { 463 if (PageAnon(page) && !PageSwapCache(page)) {
421 if (!sc->may_swap) 464 if (!sc->may_swap)
422 goto keep_locked; 465 goto keep_locked;
423 if (!add_to_swap(page)) 466 if (!add_to_swap(page, GFP_ATOMIC))
424 goto activate_locked; 467 goto activate_locked;
425 } 468 }
426#endif /* CONFIG_SWAP */ 469#endif /* CONFIG_SWAP */
@@ -503,36 +546,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
503 goto free_it; 546 goto free_it;
504 } 547 }
505 548
506 if (!mapping) 549 if (!remove_mapping(mapping, page))
507 goto keep_locked; /* truncate got there first */ 550 goto keep_locked;
508
509 write_lock_irq(&mapping->tree_lock);
510
511 /*
512 * The non-racy check for busy page. It is critical to check
513 * PageDirty _after_ making sure that the page is freeable and
514 * not in use by anybody. (pagecache + us == 2)
515 */
516 if (unlikely(page_count(page) != 2))
517 goto cannot_free;
518 smp_rmb();
519 if (unlikely(PageDirty(page)))
520 goto cannot_free;
521
522#ifdef CONFIG_SWAP
523 if (PageSwapCache(page)) {
524 swp_entry_t swap = { .val = page_private(page) };
525 __delete_from_swap_cache(page);
526 write_unlock_irq(&mapping->tree_lock);
527 swap_free(swap);
528 __put_page(page); /* The pagecache ref */
529 goto free_it;
530 }
531#endif /* CONFIG_SWAP */
532
533 __remove_from_page_cache(page);
534 write_unlock_irq(&mapping->tree_lock);
535 __put_page(page);
536 551
537free_it: 552free_it:
538 unlock_page(page); 553 unlock_page(page);
@@ -541,10 +556,6 @@ free_it:
541 __pagevec_release_nonlru(&freed_pvec); 556 __pagevec_release_nonlru(&freed_pvec);
542 continue; 557 continue;
543 558
544cannot_free:
545 write_unlock_irq(&mapping->tree_lock);
546 goto keep_locked;
547
548activate_locked: 559activate_locked:
549 SetPageActive(page); 560 SetPageActive(page);
550 pgactivate++; 561 pgactivate++;
@@ -562,6 +573,228 @@ keep:
562 return reclaimed; 573 return reclaimed;
563} 574}
564 575
576#ifdef CONFIG_MIGRATION
577static inline void move_to_lru(struct page *page)
578{
579 list_del(&page->lru);
580 if (PageActive(page)) {
581 /*
582 * lru_cache_add_active checks that
583 * the PG_active bit is off.
584 */
585 ClearPageActive(page);
586 lru_cache_add_active(page);
587 } else {
588 lru_cache_add(page);
589 }
590 put_page(page);
591}
592
593/*
594 * Add isolated pages on the list back to the LRU.
595 *
596 * returns the number of pages put back.
597 */
598int putback_lru_pages(struct list_head *l)
599{
600 struct page *page;
601 struct page *page2;
602 int count = 0;
603
604 list_for_each_entry_safe(page, page2, l, lru) {
605 move_to_lru(page);
606 count++;
607 }
608 return count;
609}
610
611/*
612 * swapout a single page
613 * page is locked upon entry, unlocked on exit
614 */
615static int swap_page(struct page *page)
616{
617 struct address_space *mapping = page_mapping(page);
618
619 if (page_mapped(page) && mapping)
620 if (try_to_unmap(page) != SWAP_SUCCESS)
621 goto unlock_retry;
622
623 if (PageDirty(page)) {
624 /* Page is dirty, try to write it out here */
625 switch(pageout(page, mapping)) {
626 case PAGE_KEEP:
627 case PAGE_ACTIVATE:
628 goto unlock_retry;
629
630 case PAGE_SUCCESS:
631 goto retry;
632
633 case PAGE_CLEAN:
634 ; /* try to free the page below */
635 }
636 }
637
638 if (PagePrivate(page)) {
639 if (!try_to_release_page(page, GFP_KERNEL) ||
640 (!mapping && page_count(page) == 1))
641 goto unlock_retry;
642 }
643
644 if (remove_mapping(mapping, page)) {
645 /* Success */
646 unlock_page(page);
647 return 0;
648 }
649
650unlock_retry:
651 unlock_page(page);
652
653retry:
654 return -EAGAIN;
655}
656/*
657 * migrate_pages
658 *
659 * Two lists are passed to this function. The first list
660 * contains the pages isolated from the LRU to be migrated.
661 * The second list contains new pages that the pages isolated
662 * can be moved to. If the second list is NULL then all
663 * pages are swapped out.
664 *
665 * The function returns after 10 attempts or if no pages
 666 * are movable anymore because the "to" list has become empty
667 * or no retryable pages exist anymore.
668 *
669 * SIMPLIFIED VERSION: This implementation of migrate_pages
670 * is only swapping out pages and never touches the second
671 * list. The direct migration patchset
672 * extends this function to avoid the use of swap.
673 *
674 * Return: Number of pages not migrated when "to" ran empty.
675 */
676int migrate_pages(struct list_head *from, struct list_head *to,
677 struct list_head *moved, struct list_head *failed)
678{
679 int retry;
680 int nr_failed = 0;
681 int pass = 0;
682 struct page *page;
683 struct page *page2;
684 int swapwrite = current->flags & PF_SWAPWRITE;
685 int rc;
686
687 if (!swapwrite)
688 current->flags |= PF_SWAPWRITE;
689
690redo:
691 retry = 0;
692
693 list_for_each_entry_safe(page, page2, from, lru) {
694 cond_resched();
695
696 rc = 0;
697 if (page_count(page) == 1)
698 /* page was freed from under us. So we are done. */
699 goto next;
700
701 /*
702 * Skip locked pages during the first two passes to give the
703 * functions holding the lock time to release the page. Later we
704 * use lock_page() to have a higher chance of acquiring the
705 * lock.
706 */
707 rc = -EAGAIN;
708 if (pass > 2)
709 lock_page(page);
710 else
711 if (TestSetPageLocked(page))
712 goto next;
713
714 /*
715 * Only wait on writeback if we have already done a pass where
 716 * we may have triggered writeouts for lots of pages.
717 */
718 if (pass > 0) {
719 wait_on_page_writeback(page);
720 } else {
721 if (PageWriteback(page))
722 goto unlock_page;
723 }
724
725 /*
726 * Anonymous pages must have swap cache references otherwise
727 * the information contained in the page maps cannot be
728 * preserved.
729 */
730 if (PageAnon(page) && !PageSwapCache(page)) {
731 if (!add_to_swap(page, GFP_KERNEL)) {
732 rc = -ENOMEM;
733 goto unlock_page;
734 }
735 }
736
737 /*
738 * Page is properly locked and writeback is complete.
739 * Try to migrate the page.
740 */
741 rc = swap_page(page);
742 goto next;
743
744unlock_page:
745 unlock_page(page);
746
747next:
748 if (rc == -EAGAIN) {
749 retry++;
750 } else if (rc) {
751 /* Permanent failure */
752 list_move(&page->lru, failed);
753 nr_failed++;
754 } else {
755 /* Success */
756 list_move(&page->lru, moved);
757 }
758 }
759 if (retry && pass++ < 10)
760 goto redo;
761
762 if (!swapwrite)
763 current->flags &= ~PF_SWAPWRITE;
764
765 return nr_failed + retry;
766}
767
768/*
769 * Isolate one page from the LRU lists and put it on the
770 * indicated list with elevated refcount.
771 *
772 * Result:
773 * 0 = page not on LRU list
774 * 1 = page removed from LRU list and added to the specified list.
775 */
776int isolate_lru_page(struct page *page)
777{
778 int ret = 0;
779
780 if (PageLRU(page)) {
781 struct zone *zone = page_zone(page);
782 spin_lock_irq(&zone->lru_lock);
783 if (TestClearPageLRU(page)) {
784 ret = 1;
785 get_page(page);
786 if (PageActive(page))
787 del_page_from_active_list(zone, page);
788 else
789 del_page_from_inactive_list(zone, page);
790 }
791 spin_unlock_irq(&zone->lru_lock);
792 }
793
794 return ret;
795}
796#endif
797
565/* 798/*
566 * zone->lru_lock is heavily contended. Some of the functions that 799 * zone->lru_lock is heavily contended. Some of the functions that
567 * shrink the lists perform better by taking out a batch of pages 800 * shrink the lists perform better by taking out a batch of pages
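
The migrate_pages() comment above describes a bounded retry protocol: walk the isolated pages, count -EAGAIN results (for example locked or writeback pages) as retryable, move permanent failures and successes to separate lists, and repeat for at most ten passes. The sketch below is a user-space model of that control flow only; an array stands in for the page lists and handle_item() fakes the three outcomes, so everything here is hypothetical scaffolding rather than kernel code.

/* Illustrative only: a user-space model of the retry skeleton used by
 * migrate_pages() above.  Array slots stand in for pages; handle_item()
 * fakes the three outcomes (0 = migrated, -EAGAIN = retry next pass,
 * anything else = permanent failure). */
#include <stdio.h>

#define NITEMS		6
#define TOY_EAGAIN	11	/* numeric value of EAGAIN on Linux */

enum state { TODO, MOVED, FAILED };

/* Pretend odd-numbered items are locked on the first pass and item 4
 * can never be migrated. */
static int handle_item(int idx, int pass)
{
	if (idx == 4)
		return -1;
	if ((idx & 1) && pass == 0)
		return -TOY_EAGAIN;
	return 0;
}

int main(void)
{
	enum state items[NITEMS] = { TODO };
	int pass = 0, retry;

	do {
		retry = 0;
		for (int i = 0; i < NITEMS; i++) {
			int rc;

			if (items[i] != TODO)
				continue;
			rc = handle_item(i, pass);
			if (rc == -TOY_EAGAIN)
				retry++;		/* stays on the source list */
			else if (rc)
				items[i] = FAILED;	/* like list_move(..., failed) */
			else
				items[i] = MOVED;	/* like list_move(..., moved) */
		}
	} while (retry && ++pass < 10);

	for (int i = 0; i < NITEMS; i++)
		printf("item %d: %s\n", i,
		       items[i] == MOVED ? "moved" :
		       items[i] == FAILED ? "failed" : "still pending");
	return 0;
}

The real function additionally toggles PF_SWAPWRITE around the loop and switches from TestSetPageLocked() to lock_page() after a couple of passes, but the success/retry/fail bookkeeping has the same shape.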
@@ -641,17 +874,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
641 goto done; 874 goto done;
642 875
643 max_scan -= nr_scan; 876 max_scan -= nr_scan;
644 if (current_is_kswapd())
645 mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
646 else
647 mod_page_state_zone(zone, pgscan_direct, nr_scan);
648 nr_freed = shrink_list(&page_list, sc); 877 nr_freed = shrink_list(&page_list, sc);
649 if (current_is_kswapd())
650 mod_page_state(kswapd_steal, nr_freed);
651 mod_page_state_zone(zone, pgsteal, nr_freed);
652 sc->nr_to_reclaim -= nr_freed;
653 878
654 spin_lock_irq(&zone->lru_lock); 879 local_irq_disable();
880 if (current_is_kswapd()) {
881 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
882 __mod_page_state(kswapd_steal, nr_freed);
883 } else
884 __mod_page_state_zone(zone, pgscan_direct, nr_scan);
885 __mod_page_state_zone(zone, pgsteal, nr_freed);
886
887 spin_lock(&zone->lru_lock);
655 /* 888 /*
656 * Put back any unfreeable pages. 889 * Put back any unfreeable pages.
657 */ 890 */
@@ -756,7 +989,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
756 if (page_mapped(page)) { 989 if (page_mapped(page)) {
757 if (!reclaim_mapped || 990 if (!reclaim_mapped ||
758 (total_swap_pages == 0 && PageAnon(page)) || 991 (total_swap_pages == 0 && PageAnon(page)) ||
759 page_referenced(page, 0, sc->priority <= 0)) { 992 page_referenced(page, 0)) {
760 list_add(&page->lru, &l_active); 993 list_add(&page->lru, &l_active);
761 continue; 994 continue;
762 } 995 }
@@ -813,11 +1046,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
813 } 1046 }
814 } 1047 }
815 zone->nr_active += pgmoved; 1048 zone->nr_active += pgmoved;
816 spin_unlock_irq(&zone->lru_lock); 1049 spin_unlock(&zone->lru_lock);
817 pagevec_release(&pvec); 1050
1051 __mod_page_state_zone(zone, pgrefill, pgscanned);
1052 __mod_page_state(pgdeactivate, pgdeactivate);
1053 local_irq_enable();
818 1054
819 mod_page_state_zone(zone, pgrefill, pgscanned); 1055 pagevec_release(&pvec);
820 mod_page_state(pgdeactivate, pgdeactivate);
821} 1056}
822 1057
823/* 1058/*
@@ -849,8 +1084,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
849 else 1084 else
850 nr_inactive = 0; 1085 nr_inactive = 0;
851 1086
852 sc->nr_to_reclaim = sc->swap_cluster_max;
853
854 while (nr_active || nr_inactive) { 1087 while (nr_active || nr_inactive) {
855 if (nr_active) { 1088 if (nr_active) {
856 sc->nr_to_scan = min(nr_active, 1089 sc->nr_to_scan = min(nr_active,
@@ -864,8 +1097,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
864 (unsigned long)sc->swap_cluster_max); 1097 (unsigned long)sc->swap_cluster_max);
865 nr_inactive -= sc->nr_to_scan; 1098 nr_inactive -= sc->nr_to_scan;
866 shrink_cache(zone, sc); 1099 shrink_cache(zone, sc);
867 if (sc->nr_to_reclaim <= 0)
868 break;
869 } 1100 }
870 } 1101 }
871 1102
@@ -898,7 +1129,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
898 for (i = 0; zones[i] != NULL; i++) { 1129 for (i = 0; zones[i] != NULL; i++) {
899 struct zone *zone = zones[i]; 1130 struct zone *zone = zones[i];
900 1131
901 if (zone->present_pages == 0) 1132 if (!populated_zone(zone))
902 continue; 1133 continue;
903 1134
904 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1135 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
@@ -960,6 +1191,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
960 sc.nr_reclaimed = 0; 1191 sc.nr_reclaimed = 0;
961 sc.priority = priority; 1192 sc.priority = priority;
962 sc.swap_cluster_max = SWAP_CLUSTER_MAX; 1193 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1194 if (!priority)
1195 disable_swap_token();
963 shrink_caches(zones, &sc); 1196 shrink_caches(zones, &sc);
964 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); 1197 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
965 if (reclaim_state) { 1198 if (reclaim_state) {
@@ -1056,6 +1289,10 @@ loop_again:
1056 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 1289 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
1057 unsigned long lru_pages = 0; 1290 unsigned long lru_pages = 0;
1058 1291
1292 /* The swap token gets in the way of swapout... */
1293 if (!priority)
1294 disable_swap_token();
1295
1059 all_zones_ok = 1; 1296 all_zones_ok = 1;
1060 1297
1061 if (nr_pages == 0) { 1298 if (nr_pages == 0) {
@@ -1066,7 +1303,7 @@ loop_again:
1066 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 1303 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
1067 struct zone *zone = pgdat->node_zones + i; 1304 struct zone *zone = pgdat->node_zones + i;
1068 1305
1069 if (zone->present_pages == 0) 1306 if (!populated_zone(zone))
1070 continue; 1307 continue;
1071 1308
1072 if (zone->all_unreclaimable && 1309 if (zone->all_unreclaimable &&
@@ -1074,7 +1311,7 @@ loop_again:
1074 continue; 1311 continue;
1075 1312
1076 if (!zone_watermark_ok(zone, order, 1313 if (!zone_watermark_ok(zone, order,
1077 zone->pages_high, 0, 0, 0)) { 1314 zone->pages_high, 0, 0)) {
1078 end_zone = i; 1315 end_zone = i;
1079 goto scan; 1316 goto scan;
1080 } 1317 }
@@ -1103,7 +1340,7 @@ scan:
1103 struct zone *zone = pgdat->node_zones + i; 1340 struct zone *zone = pgdat->node_zones + i;
1104 int nr_slab; 1341 int nr_slab;
1105 1342
1106 if (zone->present_pages == 0) 1343 if (!populated_zone(zone))
1107 continue; 1344 continue;
1108 1345
1109 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1346 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
@@ -1111,7 +1348,7 @@ scan:
1111 1348
1112 if (nr_pages == 0) { /* Not software suspend */ 1349 if (nr_pages == 0) { /* Not software suspend */
1113 if (!zone_watermark_ok(zone, order, 1350 if (!zone_watermark_ok(zone, order,
1114 zone->pages_high, end_zone, 0, 0)) 1351 zone->pages_high, end_zone, 0))
1115 all_zones_ok = 0; 1352 all_zones_ok = 0;
1116 } 1353 }
1117 zone->temp_priority = priority; 1354 zone->temp_priority = priority;
@@ -1220,7 +1457,7 @@ static int kswapd(void *p)
1220 * us from recursively trying to free more memory as we're 1457 * us from recursively trying to free more memory as we're
1221 * trying to free the first piece of memory in the first place). 1458 * trying to free the first piece of memory in the first place).
1222 */ 1459 */
1223 tsk->flags |= PF_MEMALLOC|PF_KSWAPD; 1460 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
1224 1461
1225 order = 0; 1462 order = 0;
1226 for ( ; ; ) { 1463 for ( ; ; ) {
@@ -1255,11 +1492,11 @@ void wakeup_kswapd(struct zone *zone, int order)
1255{ 1492{
1256 pg_data_t *pgdat; 1493 pg_data_t *pgdat;
1257 1494
1258 if (zone->present_pages == 0) 1495 if (!populated_zone(zone))
1259 return; 1496 return;
1260 1497
1261 pgdat = zone->zone_pgdat; 1498 pgdat = zone->zone_pgdat;
1262 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0)) 1499 if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
1263 return; 1500 return;
1264 if (pgdat->kswapd_max_order < order) 1501 if (pgdat->kswapd_max_order < order)
1265 pgdat->kswapd_max_order = order; 1502 pgdat->kswapd_max_order = order;
@@ -1336,74 +1573,70 @@ static int __init kswapd_init(void)
1336 1573
1337module_init(kswapd_init) 1574module_init(kswapd_init)
1338 1575
1576#ifdef CONFIG_NUMA
1577/*
1578 * Zone reclaim mode
1579 *
1580 * If non-zero call zone_reclaim when the number of free pages falls below
1581 * the watermarks.
1582 *
1583 * In the future we may add flags to the mode. However, the page allocator
1584 * should only have to check that zone_reclaim_mode != 0 before calling
1585 * zone_reclaim().
1586 */
1587int zone_reclaim_mode __read_mostly;
1339 1588
1340/* 1589/*
 1590 * Minimum time between zone reclaim scans
1591 */
1592#define ZONE_RECLAIM_INTERVAL HZ/2
1593/*
1341 * Try to free up some pages from this zone through reclaim. 1594 * Try to free up some pages from this zone through reclaim.
1342 */ 1595 */
1343int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1596int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1344{ 1597{
1345 struct scan_control sc;
1346 int nr_pages = 1 << order; 1598 int nr_pages = 1 << order;
1347 int total_reclaimed = 0; 1599 struct task_struct *p = current;
1600 struct reclaim_state reclaim_state;
1601 struct scan_control sc = {
1602 .gfp_mask = gfp_mask,
1603 .may_writepage = 0,
1604 .may_swap = 0,
1605 .nr_mapped = read_page_state(nr_mapped),
1606 .nr_scanned = 0,
1607 .nr_reclaimed = 0,
1608 .priority = 0
1609 };
1348 1610
1349 /* The reclaim may sleep, so don't do it if sleep isn't allowed */ 1611 if (!(gfp_mask & __GFP_WAIT) ||
1350 if (!(gfp_mask & __GFP_WAIT)) 1612 zone->zone_pgdat->node_id != numa_node_id() ||
1351 return 0; 1613 zone->all_unreclaimable ||
1352 if (zone->all_unreclaimable) 1614 atomic_read(&zone->reclaim_in_progress) > 0)
1353 return 0; 1615 return 0;
1354 1616
1355 sc.gfp_mask = gfp_mask; 1617 if (time_before(jiffies,
1356 sc.may_writepage = 0; 1618 zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
1357 sc.may_swap = 0; 1619 return 0;
1358 sc.nr_mapped = read_page_state(nr_mapped); 1620
1359 sc.nr_scanned = 0; 1621 disable_swap_token();
1360 sc.nr_reclaimed = 0;
1361 /* scan at the highest priority */
1362 sc.priority = 0;
1363 1622
1364 if (nr_pages > SWAP_CLUSTER_MAX) 1623 if (nr_pages > SWAP_CLUSTER_MAX)
1365 sc.swap_cluster_max = nr_pages; 1624 sc.swap_cluster_max = nr_pages;
1366 else 1625 else
1367 sc.swap_cluster_max = SWAP_CLUSTER_MAX; 1626 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1368 1627
1369 /* Don't reclaim the zone if there are other reclaimers active */ 1628 cond_resched();
1370 if (atomic_read(&zone->reclaim_in_progress) > 0) 1629 p->flags |= PF_MEMALLOC;
1371 goto out; 1630 reclaim_state.reclaimed_slab = 0;
1372 1631 p->reclaim_state = &reclaim_state;
1373 shrink_zone(zone, &sc); 1632 shrink_zone(zone, &sc);
1374 total_reclaimed = sc.nr_reclaimed; 1633 p->reclaim_state = NULL;
1375 1634 current->flags &= ~PF_MEMALLOC;
1376 out:
1377 return total_reclaimed;
1378}
1379
1380asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
1381 unsigned int state)
1382{
1383 struct zone *z;
1384 int i;
1385
1386 if (!capable(CAP_SYS_ADMIN))
1387 return -EACCES;
1388 1635
1389 if (node >= MAX_NUMNODES || !node_online(node)) 1636 if (sc.nr_reclaimed == 0)
1390 return -EINVAL; 1637 zone->last_unsuccessful_zone_reclaim = jiffies;
1391 1638
1392 /* This will break if we ever add more zones */ 1639 return sc.nr_reclaimed > nr_pages;
1393 if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
1394 return -EINVAL;
1395
1396 for (i = 0; i < MAX_NR_ZONES; i++) {
1397 if (!(zone & 1<<i))
1398 continue;
1399
1400 z = &NODE_DATA(node)->node_zones[i];
1401
1402 if (state)
1403 z->reclaim_pages = 1;
1404 else
1405 z->reclaim_pages = 0;
1406 }
1407
1408 return 0;
1409} 1640}
1641#endif
1642