Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig               9
-rw-r--r--  mm/Makefile              6
-rw-r--r--  mm/bootmem.c            58
-rw-r--r--  mm/fadvise.c             5
-rw-r--r--  mm/filemap.c           159
-rw-r--r--  mm/filemap_xip.c         8
-rw-r--r--  mm/hugetlb.c           194
-rw-r--r--  mm/internal.h           21
-rw-r--r--  mm/madvise.c            35
-rw-r--r--  mm/memory.c             34
-rw-r--r--  mm/memory_hotplug.c      1
-rw-r--r--  mm/mempolicy.c         669
-rw-r--r--  mm/mlock.c               1
-rw-r--r--  mm/mmap.c                1
-rw-r--r--  mm/mremap.c              1
-rw-r--r--  mm/msync.c               2
-rw-r--r--  mm/nommu.c               7
-rw-r--r--  mm/oom_kill.c            5
-rw-r--r--  mm/page-writeback.c     10
-rw-r--r--  mm/page_alloc.c        472
-rw-r--r--  mm/pdflush.c             2
-rw-r--r--  mm/readahead.c          15
-rw-r--r--  mm/rmap.c               72
-rw-r--r--  mm/shmem.c              42
-rw-r--r--  mm/slab.c             1140
-rw-r--r--  mm/slob.c              385
-rw-r--r--  mm/sparse.c              4
-rw-r--r--  mm/swap.c               29
-rw-r--r--  mm/swap_state.c          8
-rw-r--r--  mm/swapfile.c           43
-rw-r--r--  mm/tiny-shmem.c         29
-rw-r--r--  mm/truncate.c           45
-rw-r--r--  mm/util.c               39
-rw-r--r--  mm/vmscan.c            468
34 files changed, 2722 insertions, 1297 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 21eb51d4da8f..a9cb80ae6409 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -11,7 +11,7 @@ choice
11 11
12config FLATMEM_MANUAL 12config FLATMEM_MANUAL
13 bool "Flat Memory" 13 bool "Flat Memory"
14 depends on !ARCH_DISCONTIGMEM_ENABLE || ARCH_FLATMEM_ENABLE 14 depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
15 help 15 help
16 This option allows you to change some of the ways that 16 This option allows you to change some of the ways that
17 Linux manages its memory internally. Most users will 17 Linux manages its memory internally. Most users will
@@ -132,3 +132,10 @@ config SPLIT_PTLOCK_CPUS
132 default "4096" if ARM && !CPU_CACHE_VIPT 132 default "4096" if ARM && !CPU_CACHE_VIPT
133 default "4096" if PARISC && !PA20 133 default "4096" if PARISC && !PA20
134 default "4" 134 default "4"
135
136#
137# support for page migration
138#
139config MIGRATION
140 def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
141 depends on SWAP
diff --git a/mm/Makefile b/mm/Makefile
index 2fa6d2ca9f28..9aa03fa1dcc3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,8 +9,8 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
9 9
10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ 10obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
11 page_alloc.o page-writeback.o pdflush.o \ 11 page_alloc.o page-writeback.o pdflush.o \
12 readahead.o slab.o swap.o truncate.o vmscan.o \ 12 readahead.o swap.o truncate.o vmscan.o \
13 prio_tree.o $(mmu-y) 13 prio_tree.o util.o $(mmu-y)
14 14
15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 15obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o
16obj-$(CONFIG_HUGETLBFS) += hugetlb.o 16obj-$(CONFIG_HUGETLBFS) += hugetlb.o
@@ -18,5 +18,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o
18obj-$(CONFIG_SPARSEMEM) += sparse.o 18obj-$(CONFIG_SPARSEMEM) += sparse.o
19obj-$(CONFIG_SHMEM) += shmem.o 19obj-$(CONFIG_SHMEM) += shmem.o
20obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o 20obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
21obj-$(CONFIG_SLOB) += slob.o
22obj-$(CONFIG_SLAB) += slab.o
21obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 23obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
22obj-$(CONFIG_FS_XIP) += filemap_xip.o 24obj-$(CONFIG_FS_XIP) += filemap_xip.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 16b9465eb4eb..35c32290f717 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -296,20 +296,12 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
296 unsigned long v = ~map[i / BITS_PER_LONG]; 296 unsigned long v = ~map[i / BITS_PER_LONG];
297 297
298 if (gofast && v == ~0UL) { 298 if (gofast && v == ~0UL) {
299 int j, order; 299 int order;
300 300
301 page = pfn_to_page(pfn); 301 page = pfn_to_page(pfn);
302 count += BITS_PER_LONG; 302 count += BITS_PER_LONG;
303 __ClearPageReserved(page);
304 order = ffs(BITS_PER_LONG) - 1; 303 order = ffs(BITS_PER_LONG) - 1;
305 set_page_refs(page, order); 304 __free_pages_bootmem(page, order);
306 for (j = 1; j < BITS_PER_LONG; j++) {
307 if (j + 16 < BITS_PER_LONG)
308 prefetchw(page + j + 16);
309 __ClearPageReserved(page + j);
310 set_page_count(page + j, 0);
311 }
312 __free_pages(page, order);
313 i += BITS_PER_LONG; 305 i += BITS_PER_LONG;
314 page += BITS_PER_LONG; 306 page += BITS_PER_LONG;
315 } else if (v) { 307 } else if (v) {
@@ -319,9 +311,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
319 for (m = 1; m && i < idx; m<<=1, page++, i++) { 311 for (m = 1; m && i < idx; m<<=1, page++, i++) {
320 if (v & m) { 312 if (v & m) {
321 count++; 313 count++;
322 __ClearPageReserved(page); 314 __free_pages_bootmem(page, 0);
323 set_page_refs(page, 0);
324 __free_page(page);
325 } 315 }
326 } 316 }
327 } else { 317 } else {
@@ -339,9 +329,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
339 count = 0; 329 count = 0;
340 for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) { 330 for (i = 0; i < ((bdata->node_low_pfn-(bdata->node_boot_start >> PAGE_SHIFT))/8 + PAGE_SIZE-1)/PAGE_SIZE; i++,page++) {
341 count++; 331 count++;
342 __ClearPageReserved(page); 332 __free_pages_bootmem(page, 0);
343 set_page_count(page, 1);
344 __free_page(page);
345 } 333 }
346 total += count; 334 total += count;
347 bdata->node_bootmem_map = NULL; 335 bdata->node_bootmem_map = NULL;
@@ -393,15 +381,14 @@ unsigned long __init free_all_bootmem (void)
393 return(free_all_bootmem_core(NODE_DATA(0))); 381 return(free_all_bootmem_core(NODE_DATA(0)));
394} 382}
395 383
396void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, unsigned long goal, 384void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
397 unsigned long limit)
398{ 385{
399 pg_data_t *pgdat = pgdat_list; 386 pg_data_t *pgdat = pgdat_list;
400 void *ptr; 387 void *ptr;
401 388
402 for_each_pgdat(pgdat) 389 for_each_pgdat(pgdat)
403 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size, 390 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
404 align, goal, limit))) 391 align, goal, 0)))
405 return(ptr); 392 return(ptr);
406 393
407 /* 394 /*
@@ -413,15 +400,40 @@ void * __init __alloc_bootmem_limit (unsigned long size, unsigned long align, un
413} 400}
414 401
415 402
416void * __init __alloc_bootmem_node_limit (pg_data_t *pgdat, unsigned long size, unsigned long align, 403void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align,
417 unsigned long goal, unsigned long limit) 404 unsigned long goal)
418{ 405{
419 void *ptr; 406 void *ptr;
420 407
421 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, limit); 408 ptr = __alloc_bootmem_core(pgdat->bdata, size, align, goal, 0);
422 if (ptr) 409 if (ptr)
423 return (ptr); 410 return (ptr);
424 411
425 return __alloc_bootmem_limit(size, align, goal, limit); 412 return __alloc_bootmem(size, align, goal);
413}
414
415#define LOW32LIMIT 0xffffffff
416
417void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
418{
419 pg_data_t *pgdat = pgdat_list;
420 void *ptr;
421
422 for_each_pgdat(pgdat)
423 if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
424 align, goal, LOW32LIMIT)))
425 return(ptr);
426
427 /*
428 * Whoops, we cannot satisfy the allocation request.
429 */
430 printk(KERN_ALERT "low bootmem alloc of %lu bytes failed!\n", size);
431 panic("Out of low memory");
432 return NULL;
426} 433}
427 434
435void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
436 unsigned long align, unsigned long goal)
437{
438 return __alloc_bootmem_core(pgdat->bdata, size, align, goal, LOW32LIMIT);
439}
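
The bootmem rework above funnels all boot-time page freeing through __free_pages_bootmem() and splits the allocator into a plain variant plus a low-memory variant capped at LOW32LIMIT (4GB). As a hedged illustration of the new low-memory entry point (the caller name, the SMP_CACHE_BYTES alignment, and the goal of 0 are assumptions, not taken from this diff):

/* Illustrative sketch only: boot-time allocation of a buffer that must live
 * below 4GB, via the __alloc_bootmem_low() variant added above. */
void * __init example_alloc_dma32_buffer(unsigned long size)
{
	/* Internally this walks every node and passes LOW32LIMIT (0xffffffff)
	 * as the limit to __alloc_bootmem_core(), so the returned memory is
	 * guaranteed to be addressable with 32 bits. */
	return __alloc_bootmem_low(size, SMP_CACHE_BYTES, 0);
}
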
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 5f19e87bc5af..d257c89e7704 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -37,6 +37,11 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
37 if (!file) 37 if (!file)
38 return -EBADF; 38 return -EBADF;
39 39
40 if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) {
41 ret = -ESPIPE;
42 goto out;
43 }
44
40 mapping = file->f_mapping; 45 mapping = file->f_mapping;
41 if (!mapping || len < 0) { 46 if (!mapping || len < 0) {
42 ret = -EINVAL; 47 ret = -EINVAL;
diff --git a/mm/filemap.c b/mm/filemap.c
index 33a28bfde158..a965b6b35f26 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -15,6 +15,7 @@
15#include <linux/compiler.h> 15#include <linux/compiler.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/aio.h> 17#include <linux/aio.h>
18#include <linux/capability.h>
18#include <linux/kernel_stat.h> 19#include <linux/kernel_stat.h>
19#include <linux/mm.h> 20#include <linux/mm.h>
20#include <linux/swap.h> 21#include <linux/swap.h>
@@ -61,7 +62,7 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
61 * ->swap_lock (exclusive_swap_page, others) 62 * ->swap_lock (exclusive_swap_page, others)
62 * ->mapping->tree_lock 63 * ->mapping->tree_lock
63 * 64 *
64 * ->i_sem 65 * ->i_mutex
65 * ->i_mmap_lock (truncate->unmap_mapping_range) 66 * ->i_mmap_lock (truncate->unmap_mapping_range)
66 * 67 *
67 * ->mmap_sem 68 * ->mmap_sem
@@ -73,9 +74,9 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
73 * ->lock_page (access_process_vm) 74 * ->lock_page (access_process_vm)
74 * 75 *
75 * ->mmap_sem 76 * ->mmap_sem
76 * ->i_sem (msync) 77 * ->i_mutex (msync)
77 * 78 *
78 * ->i_sem 79 * ->i_mutex
79 * ->i_alloc_sem (various) 80 * ->i_alloc_sem (various)
80 * 81 *
81 * ->inode_lock 82 * ->inode_lock
@@ -276,11 +277,11 @@ static int wait_on_page_writeback_range(struct address_space *mapping,
276 * integrity" operation. It waits upon in-flight writeout before starting and 277 * integrity" operation. It waits upon in-flight writeout before starting and
277 * waiting upon new writeout. If there was an IO error, return it. 278 * waiting upon new writeout. If there was an IO error, return it.
278 * 279 *
279 * We need to re-take i_sem during the generic_osync_inode list walk because 280 * We need to re-take i_mutex during the generic_osync_inode list walk because
280 * it is otherwise livelockable. 281 * it is otherwise livelockable.
281 */ 282 */
282int sync_page_range(struct inode *inode, struct address_space *mapping, 283int sync_page_range(struct inode *inode, struct address_space *mapping,
283 loff_t pos, size_t count) 284 loff_t pos, loff_t count)
284{ 285{
285 pgoff_t start = pos >> PAGE_CACHE_SHIFT; 286 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
286 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; 287 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -290,9 +291,9 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
290 return 0; 291 return 0;
291 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); 292 ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
292 if (ret == 0) { 293 if (ret == 0) {
293 down(&inode->i_sem); 294 mutex_lock(&inode->i_mutex);
294 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); 295 ret = generic_osync_inode(inode, mapping, OSYNC_METADATA);
295 up(&inode->i_sem); 296 mutex_unlock(&inode->i_mutex);
296 } 297 }
297 if (ret == 0) 298 if (ret == 0)
298 ret = wait_on_page_writeback_range(mapping, start, end); 299 ret = wait_on_page_writeback_range(mapping, start, end);
@@ -301,13 +302,12 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
301EXPORT_SYMBOL(sync_page_range); 302EXPORT_SYMBOL(sync_page_range);
302 303
303/* 304/*
304 * Note: Holding i_sem across sync_page_range_nolock is not a good idea 305 * Note: Holding i_mutex across sync_page_range_nolock is not a good idea
305 * as it forces O_SYNC writers to different parts of the same file 306 * as it forces O_SYNC writers to different parts of the same file
306 * to be serialised right until io completion. 307 * to be serialised right until io completion.
307 */ 308 */
308static int sync_page_range_nolock(struct inode *inode, 309int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
309 struct address_space *mapping, 310 loff_t pos, loff_t count)
310 loff_t pos, size_t count)
311{ 311{
312 pgoff_t start = pos >> PAGE_CACHE_SHIFT; 312 pgoff_t start = pos >> PAGE_CACHE_SHIFT;
313 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; 313 pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
@@ -322,6 +322,7 @@ static int sync_page_range_nolock(struct inode *inode,
322 ret = wait_on_page_writeback_range(mapping, start, end); 322 ret = wait_on_page_writeback_range(mapping, start, end);
323 return ret; 323 return ret;
324} 324}
325EXPORT_SYMBOL(sync_page_range_nolock);
325 326
326/** 327/**
327 * filemap_fdatawait - walk the list of under-writeback pages of the given 328 * filemap_fdatawait - walk the list of under-writeback pages of the given
@@ -343,30 +344,44 @@ EXPORT_SYMBOL(filemap_fdatawait);
343 344
344int filemap_write_and_wait(struct address_space *mapping) 345int filemap_write_and_wait(struct address_space *mapping)
345{ 346{
346 int retval = 0; 347 int err = 0;
347 348
348 if (mapping->nrpages) { 349 if (mapping->nrpages) {
349 retval = filemap_fdatawrite(mapping); 350 err = filemap_fdatawrite(mapping);
350 if (retval == 0) 351 /*
351 retval = filemap_fdatawait(mapping); 352 * Even if the above returned error, the pages may be
353 * written partially (e.g. -ENOSPC), so we wait for it.
354 * But the -EIO is special case, it may indicate the worst
355 * thing (e.g. bug) happened, so we avoid waiting for it.
356 */
357 if (err != -EIO) {
358 int err2 = filemap_fdatawait(mapping);
359 if (!err)
360 err = err2;
361 }
352 } 362 }
353 return retval; 363 return err;
354} 364}
365EXPORT_SYMBOL(filemap_write_and_wait);
355 366
356int filemap_write_and_wait_range(struct address_space *mapping, 367int filemap_write_and_wait_range(struct address_space *mapping,
357 loff_t lstart, loff_t lend) 368 loff_t lstart, loff_t lend)
358{ 369{
359 int retval = 0; 370 int err = 0;
360 371
361 if (mapping->nrpages) { 372 if (mapping->nrpages) {
362 retval = __filemap_fdatawrite_range(mapping, lstart, lend, 373 err = __filemap_fdatawrite_range(mapping, lstart, lend,
363 WB_SYNC_ALL); 374 WB_SYNC_ALL);
364 if (retval == 0) 375 /* See comment of filemap_write_and_wait() */
365 retval = wait_on_page_writeback_range(mapping, 376 if (err != -EIO) {
366 lstart >> PAGE_CACHE_SHIFT, 377 int err2 = wait_on_page_writeback_range(mapping,
367 lend >> PAGE_CACHE_SHIFT); 378 lstart >> PAGE_CACHE_SHIFT,
379 lend >> PAGE_CACHE_SHIFT);
380 if (!err)
381 err = err2;
382 }
368 } 383 }
369 return retval; 384 return err;
370} 385}
371 386
372/* 387/*
@@ -555,11 +570,12 @@ repeat:
555 page_cache_get(page); 570 page_cache_get(page);
556 if (TestSetPageLocked(page)) { 571 if (TestSetPageLocked(page)) {
557 read_unlock_irq(&mapping->tree_lock); 572 read_unlock_irq(&mapping->tree_lock);
558 lock_page(page); 573 __lock_page(page);
559 read_lock_irq(&mapping->tree_lock); 574 read_lock_irq(&mapping->tree_lock);
560 575
561 /* Has the page been truncated while we slept? */ 576 /* Has the page been truncated while we slept? */
562 if (page->mapping != mapping || page->index != offset) { 577 if (unlikely(page->mapping != mapping ||
578 page->index != offset)) {
563 unlock_page(page); 579 unlock_page(page);
564 page_cache_release(page); 580 page_cache_release(page);
565 goto repeat; 581 goto repeat;
@@ -831,8 +847,13 @@ readpage:
831 /* Start the actual read. The read will unlock the page. */ 847 /* Start the actual read. The read will unlock the page. */
832 error = mapping->a_ops->readpage(filp, page); 848 error = mapping->a_ops->readpage(filp, page);
833 849
834 if (unlikely(error)) 850 if (unlikely(error)) {
851 if (error == AOP_TRUNCATED_PAGE) {
852 page_cache_release(page);
853 goto find_page;
854 }
835 goto readpage_error; 855 goto readpage_error;
856 }
836 857
837 if (!PageUptodate(page)) { 858 if (!PageUptodate(page)) {
838 lock_page(page); 859 lock_page(page);
@@ -1152,26 +1173,24 @@ static int fastcall page_cache_read(struct file * file, unsigned long offset)
1152{ 1173{
1153 struct address_space *mapping = file->f_mapping; 1174 struct address_space *mapping = file->f_mapping;
1154 struct page *page; 1175 struct page *page;
1155 int error; 1176 int ret;
1156 1177
1157 page = page_cache_alloc_cold(mapping); 1178 do {
1158 if (!page) 1179 page = page_cache_alloc_cold(mapping);
1159 return -ENOMEM; 1180 if (!page)
1181 return -ENOMEM;
1182
1183 ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1184 if (ret == 0)
1185 ret = mapping->a_ops->readpage(file, page);
1186 else if (ret == -EEXIST)
1187 ret = 0; /* losing race to add is OK */
1160 1188
1161 error = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
1162 if (!error) {
1163 error = mapping->a_ops->readpage(file, page);
1164 page_cache_release(page); 1189 page_cache_release(page);
1165 return error;
1166 }
1167 1190
1168 /* 1191 } while (ret == AOP_TRUNCATED_PAGE);
1169 * We arrive here in the unlikely event that someone 1192
1170 * raced with us and added our page to the cache first 1193 return ret;
1171 * or we are out of memory for radix-tree nodes.
1172 */
1173 page_cache_release(page);
1174 return error == -EEXIST ? 0 : error;
1175} 1194}
1176 1195
1177#define MMAP_LOTSAMISS (100) 1196#define MMAP_LOTSAMISS (100)
@@ -1331,10 +1350,14 @@ page_not_uptodate:
1331 goto success; 1350 goto success;
1332 } 1351 }
1333 1352
1334 if (!mapping->a_ops->readpage(file, page)) { 1353 error = mapping->a_ops->readpage(file, page);
1354 if (!error) {
1335 wait_on_page_locked(page); 1355 wait_on_page_locked(page);
1336 if (PageUptodate(page)) 1356 if (PageUptodate(page))
1337 goto success; 1357 goto success;
1358 } else if (error == AOP_TRUNCATED_PAGE) {
1359 page_cache_release(page);
1360 goto retry_find;
1338 } 1361 }
1339 1362
1340 /* 1363 /*
@@ -1358,10 +1381,14 @@ page_not_uptodate:
1358 goto success; 1381 goto success;
1359 } 1382 }
1360 ClearPageError(page); 1383 ClearPageError(page);
1361 if (!mapping->a_ops->readpage(file, page)) { 1384 error = mapping->a_ops->readpage(file, page);
1385 if (!error) {
1362 wait_on_page_locked(page); 1386 wait_on_page_locked(page);
1363 if (PageUptodate(page)) 1387 if (PageUptodate(page))
1364 goto success; 1388 goto success;
1389 } else if (error == AOP_TRUNCATED_PAGE) {
1390 page_cache_release(page);
1391 goto retry_find;
1365 } 1392 }
1366 1393
1367 /* 1394 /*
@@ -1444,10 +1471,14 @@ page_not_uptodate:
1444 goto success; 1471 goto success;
1445 } 1472 }
1446 1473
1447 if (!mapping->a_ops->readpage(file, page)) { 1474 error = mapping->a_ops->readpage(file, page);
1475 if (!error) {
1448 wait_on_page_locked(page); 1476 wait_on_page_locked(page);
1449 if (PageUptodate(page)) 1477 if (PageUptodate(page))
1450 goto success; 1478 goto success;
1479 } else if (error == AOP_TRUNCATED_PAGE) {
1480 page_cache_release(page);
1481 goto retry_find;
1451 } 1482 }
1452 1483
1453 /* 1484 /*
@@ -1470,10 +1501,14 @@ page_not_uptodate:
1470 } 1501 }
1471 1502
1472 ClearPageError(page); 1503 ClearPageError(page);
1473 if (!mapping->a_ops->readpage(file, page)) { 1504 error = mapping->a_ops->readpage(file, page);
1505 if (!error) {
1474 wait_on_page_locked(page); 1506 wait_on_page_locked(page);
1475 if (PageUptodate(page)) 1507 if (PageUptodate(page))
1476 goto success; 1508 goto success;
1509 } else if (error == AOP_TRUNCATED_PAGE) {
1510 page_cache_release(page);
1511 goto retry_find;
1477 } 1512 }
1478 1513
1479 /* 1514 /*
@@ -1858,7 +1893,7 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
1858 /* 1893 /*
1859 * Sync the fs metadata but not the minor inode changes and 1894 * Sync the fs metadata but not the minor inode changes and
1860 * of course not the data as we did direct DMA for the IO. 1895 * of course not the data as we did direct DMA for the IO.
1861 * i_sem is held, which protects generic_osync_inode() from 1896 * i_mutex is held, which protects generic_osync_inode() from
1862 * livelocking. 1897 * livelocking.
1863 */ 1898 */
1864 if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 1899 if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
@@ -1934,12 +1969,16 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1934 status = a_ops->prepare_write(file, page, offset, offset+bytes); 1969 status = a_ops->prepare_write(file, page, offset, offset+bytes);
1935 if (unlikely(status)) { 1970 if (unlikely(status)) {
1936 loff_t isize = i_size_read(inode); 1971 loff_t isize = i_size_read(inode);
1972
1973 if (status != AOP_TRUNCATED_PAGE)
1974 unlock_page(page);
1975 page_cache_release(page);
1976 if (status == AOP_TRUNCATED_PAGE)
1977 continue;
1937 /* 1978 /*
1938 * prepare_write() may have instantiated a few blocks 1979 * prepare_write() may have instantiated a few blocks
1939 * outside i_size. Trim these off again. 1980 * outside i_size. Trim these off again.
1940 */ 1981 */
1941 unlock_page(page);
1942 page_cache_release(page);
1943 if (pos + bytes > isize) 1982 if (pos + bytes > isize)
1944 vmtruncate(inode, isize); 1983 vmtruncate(inode, isize);
1945 break; 1984 break;
@@ -1952,6 +1991,10 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
1952 cur_iov, iov_base, bytes); 1991 cur_iov, iov_base, bytes);
1953 flush_dcache_page(page); 1992 flush_dcache_page(page);
1954 status = a_ops->commit_write(file, page, offset, offset+bytes); 1993 status = a_ops->commit_write(file, page, offset, offset+bytes);
1994 if (status == AOP_TRUNCATED_PAGE) {
1995 page_cache_release(page);
1996 continue;
1997 }
1955 if (likely(copied > 0)) { 1998 if (likely(copied > 0)) {
1956 if (!status) 1999 if (!status)
1957 status = copied; 2000 status = copied;
@@ -2066,7 +2109,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
2066 if (err) 2109 if (err)
2067 goto out; 2110 goto out;
2068 2111
2069 inode_update_time(inode, 1); 2112 file_update_time(file);
2070 2113
2071 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ 2114 /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
2072 if (unlikely(file->f_flags & O_DIRECT)) { 2115 if (unlikely(file->f_flags & O_DIRECT)) {
@@ -2153,10 +2196,10 @@ ssize_t generic_file_aio_write(struct kiocb *iocb, const char __user *buf,
2153 2196
2154 BUG_ON(iocb->ki_pos != pos); 2197 BUG_ON(iocb->ki_pos != pos);
2155 2198
2156 down(&inode->i_sem); 2199 mutex_lock(&inode->i_mutex);
2157 ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1, 2200 ret = __generic_file_aio_write_nolock(iocb, &local_iov, 1,
2158 &iocb->ki_pos); 2201 &iocb->ki_pos);
2159 up(&inode->i_sem); 2202 mutex_unlock(&inode->i_mutex);
2160 2203
2161 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2204 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2162 ssize_t err; 2205 ssize_t err;
@@ -2178,9 +2221,9 @@ ssize_t generic_file_write(struct file *file, const char __user *buf,
2178 struct iovec local_iov = { .iov_base = (void __user *)buf, 2221 struct iovec local_iov = { .iov_base = (void __user *)buf,
2179 .iov_len = count }; 2222 .iov_len = count };
2180 2223
2181 down(&inode->i_sem); 2224 mutex_lock(&inode->i_mutex);
2182 ret = __generic_file_write_nolock(file, &local_iov, 1, ppos); 2225 ret = __generic_file_write_nolock(file, &local_iov, 1, ppos);
2183 up(&inode->i_sem); 2226 mutex_unlock(&inode->i_mutex);
2184 2227
2185 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2228 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2186 ssize_t err; 2229 ssize_t err;
@@ -2214,9 +2257,9 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
2214 struct inode *inode = mapping->host; 2257 struct inode *inode = mapping->host;
2215 ssize_t ret; 2258 ssize_t ret;
2216 2259
2217 down(&inode->i_sem); 2260 mutex_lock(&inode->i_mutex);
2218 ret = __generic_file_write_nolock(file, iov, nr_segs, ppos); 2261 ret = __generic_file_write_nolock(file, iov, nr_segs, ppos);
2219 up(&inode->i_sem); 2262 mutex_unlock(&inode->i_mutex);
2220 2263
2221 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { 2264 if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
2222 int err; 2265 int err;
@@ -2230,7 +2273,7 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
2230EXPORT_SYMBOL(generic_file_writev); 2273EXPORT_SYMBOL(generic_file_writev);
2231 2274
2232/* 2275/*
2233 * Called under i_sem for writes to S_ISREG files. Returns -EIO if something 2276 * Called under i_mutex for writes to S_ISREG files. Returns -EIO if something
2234 * went wrong during pagecache shootdown. 2277 * went wrong during pagecache shootdown.
2235 */ 2278 */
2236static ssize_t 2279static ssize_t
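
Several paths above (page_cache_read(), the filemap_nopage() retry labels, generic_file_buffered_write()) now treat AOP_TRUNCATED_PAGE as "the page went away under the address_space operation, drop the reference and retry". The other half of that contract is the filesystem side; a hedged sketch of how a ->readpage() implementation might use it (the example filesystem and its locking helpers are hypothetical, not part of this diff):

/* Hypothetical filesystem: shows the AOP_TRUNCATED_PAGE convention that the
 * retry loops above rely on.  If the page was truncated while the filesystem
 * blocked on its own locks, it unlocks the page and returns
 * AOP_TRUNCATED_PAGE so the caller releases its reference and retries. */
static int examplefs_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	int ret;

	ret = examplefs_lock_inode(inode);	/* may drop/retake the page lock */
	if (ret) {
		unlock_page(page);
		return ret;
	}

	if (!page->mapping) {			/* truncated while we blocked */
		examplefs_unlock_inode(inode);
		unlock_page(page);
		return AOP_TRUNCATED_PAGE;
	}

	ret = examplefs_fill_page(inode, page);	/* does the actual I/O */
	examplefs_unlock_inode(inode);
	return ret;
}
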
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 9cf687e4a29a..b960ac8e5918 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -338,7 +338,7 @@ __xip_file_write(struct file *filp, const char __user *buf,
338 *ppos = pos; 338 *ppos = pos;
339 /* 339 /*
340 * No need to use i_size_read() here, the i_size 340 * No need to use i_size_read() here, the i_size
341 * cannot change under us because we hold i_sem. 341 * cannot change under us because we hold i_mutex.
342 */ 342 */
343 if (pos > inode->i_size) { 343 if (pos > inode->i_size) {
344 i_size_write(inode, pos); 344 i_size_write(inode, pos);
@@ -358,7 +358,7 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
358 loff_t pos; 358 loff_t pos;
359 ssize_t ret; 359 ssize_t ret;
360 360
361 down(&inode->i_sem); 361 mutex_lock(&inode->i_mutex);
362 362
363 if (!access_ok(VERIFY_READ, buf, len)) { 363 if (!access_ok(VERIFY_READ, buf, len)) {
364 ret=-EFAULT; 364 ret=-EFAULT;
@@ -383,14 +383,14 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
383 if (ret) 383 if (ret)
384 goto out_backing; 384 goto out_backing;
385 385
386 inode_update_time(inode, 1); 386 file_update_time(filp);
387 387
388 ret = __xip_file_write (filp, buf, count, pos, ppos); 388 ret = __xip_file_write (filp, buf, count, pos, ppos);
389 389
390 out_backing: 390 out_backing:
391 current->backing_dev_info = NULL; 391 current->backing_dev_info = NULL;
392 out_up: 392 out_up:
393 up(&inode->i_sem); 393 mutex_unlock(&inode->i_mutex);
394 return ret; 394 return ret;
395} 395}
396EXPORT_SYMBOL_GPL(xip_file_write); 396EXPORT_SYMBOL_GPL(xip_file_write);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3e52df7c471b..b21d78c941b5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,9 @@
11#include <linux/highmem.h> 11#include <linux/highmem.h>
12#include <linux/nodemask.h> 12#include <linux/nodemask.h>
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/mempolicy.h>
15#include <linux/cpuset.h>
16
14#include <asm/page.h> 17#include <asm/page.h>
15#include <asm/pgtable.h> 18#include <asm/pgtable.h>
16 19
@@ -36,18 +39,22 @@ static void enqueue_huge_page(struct page *page)
36 free_huge_pages_node[nid]++; 39 free_huge_pages_node[nid]++;
37} 40}
38 41
39static struct page *dequeue_huge_page(void) 42static struct page *dequeue_huge_page(struct vm_area_struct *vma,
43 unsigned long address)
40{ 44{
41 int nid = numa_node_id(); 45 int nid = numa_node_id();
42 struct page *page = NULL; 46 struct page *page = NULL;
47 struct zonelist *zonelist = huge_zonelist(vma, address);
48 struct zone **z;
43 49
44 if (list_empty(&hugepage_freelists[nid])) { 50 for (z = zonelist->zones; *z; z++) {
45 for (nid = 0; nid < MAX_NUMNODES; ++nid) 51 nid = (*z)->zone_pgdat->node_id;
46 if (!list_empty(&hugepage_freelists[nid])) 52 if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
47 break; 53 !list_empty(&hugepage_freelists[nid]))
54 break;
48 } 55 }
49 if (nid >= 0 && nid < MAX_NUMNODES && 56
50 !list_empty(&hugepage_freelists[nid])) { 57 if (*z) {
51 page = list_entry(hugepage_freelists[nid].next, 58 page = list_entry(hugepage_freelists[nid].next,
52 struct page, lru); 59 struct page, lru);
53 list_del(&page->lru); 60 list_del(&page->lru);
@@ -85,13 +92,13 @@ void free_huge_page(struct page *page)
85 spin_unlock(&hugetlb_lock); 92 spin_unlock(&hugetlb_lock);
86} 93}
87 94
88struct page *alloc_huge_page(void) 95struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
89{ 96{
90 struct page *page; 97 struct page *page;
91 int i; 98 int i;
92 99
93 spin_lock(&hugetlb_lock); 100 spin_lock(&hugetlb_lock);
94 page = dequeue_huge_page(); 101 page = dequeue_huge_page(vma, addr);
95 if (!page) { 102 if (!page) {
96 spin_unlock(&hugetlb_lock); 103 spin_unlock(&hugetlb_lock);
97 return NULL; 104 return NULL;
@@ -194,7 +201,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
194 spin_lock(&hugetlb_lock); 201 spin_lock(&hugetlb_lock);
195 try_to_free_low(count); 202 try_to_free_low(count);
196 while (count < nr_huge_pages) { 203 while (count < nr_huge_pages) {
197 struct page *page = dequeue_huge_page(); 204 struct page *page = dequeue_huge_page(NULL, 0);
198 if (!page) 205 if (!page)
199 break; 206 break;
200 update_and_free_page(page); 207 update_and_free_page(page);
@@ -261,11 +268,12 @@ struct vm_operations_struct hugetlb_vm_ops = {
261 .nopage = hugetlb_nopage, 268 .nopage = hugetlb_nopage,
262}; 269};
263 270
264static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page) 271static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
272 int writable)
265{ 273{
266 pte_t entry; 274 pte_t entry;
267 275
268 if (vma->vm_flags & VM_WRITE) { 276 if (writable) {
269 entry = 277 entry =
270 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); 278 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
271 } else { 279 } else {
@@ -277,12 +285,27 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page)
277 return entry; 285 return entry;
278} 286}
279 287
288static void set_huge_ptep_writable(struct vm_area_struct *vma,
289 unsigned long address, pte_t *ptep)
290{
291 pte_t entry;
292
293 entry = pte_mkwrite(pte_mkdirty(*ptep));
294 ptep_set_access_flags(vma, address, ptep, entry, 1);
295 update_mmu_cache(vma, address, entry);
296 lazy_mmu_prot_update(entry);
297}
298
299
280int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, 300int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
281 struct vm_area_struct *vma) 301 struct vm_area_struct *vma)
282{ 302{
283 pte_t *src_pte, *dst_pte, entry; 303 pte_t *src_pte, *dst_pte, entry;
284 struct page *ptepage; 304 struct page *ptepage;
285 unsigned long addr; 305 unsigned long addr;
306 int cow;
307
308 cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
286 309
287 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) { 310 for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
288 src_pte = huge_pte_offset(src, addr); 311 src_pte = huge_pte_offset(src, addr);
@@ -294,6 +317,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
294 spin_lock(&dst->page_table_lock); 317 spin_lock(&dst->page_table_lock);
295 spin_lock(&src->page_table_lock); 318 spin_lock(&src->page_table_lock);
296 if (!pte_none(*src_pte)) { 319 if (!pte_none(*src_pte)) {
320 if (cow)
321 ptep_set_wrprotect(src, addr, src_pte);
297 entry = *src_pte; 322 entry = *src_pte;
298 ptepage = pte_page(entry); 323 ptepage = pte_page(entry);
299 get_page(ptepage); 324 get_page(ptepage);
@@ -345,57 +370,63 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
345 flush_tlb_range(vma, start, end); 370 flush_tlb_range(vma, start, end);
346} 371}
347 372
348static struct page *find_lock_huge_page(struct address_space *mapping, 373static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
349 unsigned long idx) 374 unsigned long address, pte_t *ptep, pte_t pte)
350{ 375{
351 struct page *page; 376 struct page *old_page, *new_page;
352 int err; 377 int i, avoidcopy;
353 struct inode *inode = mapping->host;
354 unsigned long size;
355 378
356retry: 379 old_page = pte_page(pte);
357 page = find_lock_page(mapping, idx);
358 if (page)
359 goto out;
360 380
361 /* Check to make sure the mapping hasn't been truncated */ 381 /* If no-one else is actually using this page, avoid the copy
362 size = i_size_read(inode) >> HPAGE_SHIFT; 382 * and just make the page writable */
363 if (idx >= size) 383 avoidcopy = (page_count(old_page) == 1);
364 goto out; 384 if (avoidcopy) {
385 set_huge_ptep_writable(vma, address, ptep);
386 return VM_FAULT_MINOR;
387 }
365 388
366 if (hugetlb_get_quota(mapping)) 389 page_cache_get(old_page);
367 goto out; 390 new_page = alloc_huge_page(vma, address);
368 page = alloc_huge_page(); 391
369 if (!page) { 392 if (!new_page) {
370 hugetlb_put_quota(mapping); 393 page_cache_release(old_page);
371 goto out; 394
395 /* Logically this is OOM, not a SIGBUS, but an OOM
396 * could cause the kernel to go killing other
397 * processes which won't help the hugepage situation
398 * at all (?) */
399 return VM_FAULT_SIGBUS;
372 } 400 }
373 401
374 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); 402 spin_unlock(&mm->page_table_lock);
375 if (err) { 403 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
376 put_page(page); 404 copy_user_highpage(new_page + i, old_page + i,
377 hugetlb_put_quota(mapping); 405 address + i*PAGE_SIZE);
378 if (err == -EEXIST) 406 spin_lock(&mm->page_table_lock);
379 goto retry; 407
380 page = NULL; 408 ptep = huge_pte_offset(mm, address & HPAGE_MASK);
409 if (likely(pte_same(*ptep, pte))) {
410 /* Break COW */
411 set_huge_pte_at(mm, address, ptep,
412 make_huge_pte(vma, new_page, 1));
413 /* Make the old page be freed below */
414 new_page = old_page;
381 } 415 }
382out: 416 page_cache_release(new_page);
383 return page; 417 page_cache_release(old_page);
418 return VM_FAULT_MINOR;
384} 419}
385 420
386int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 421int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
387 unsigned long address, int write_access) 422 unsigned long address, pte_t *ptep, int write_access)
388{ 423{
389 int ret = VM_FAULT_SIGBUS; 424 int ret = VM_FAULT_SIGBUS;
390 unsigned long idx; 425 unsigned long idx;
391 unsigned long size; 426 unsigned long size;
392 pte_t *pte;
393 struct page *page; 427 struct page *page;
394 struct address_space *mapping; 428 struct address_space *mapping;
395 429 pte_t new_pte;
396 pte = huge_pte_alloc(mm, address);
397 if (!pte)
398 goto out;
399 430
400 mapping = vma->vm_file->f_mapping; 431 mapping = vma->vm_file->f_mapping;
401 idx = ((address - vma->vm_start) >> HPAGE_SHIFT) 432 idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -405,9 +436,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
405 * Use page lock to guard against racing truncation 436 * Use page lock to guard against racing truncation
406 * before we get page_table_lock. 437 * before we get page_table_lock.
407 */ 438 */
408 page = find_lock_huge_page(mapping, idx); 439retry:
409 if (!page) 440 page = find_lock_page(mapping, idx);
410 goto out; 441 if (!page) {
442 if (hugetlb_get_quota(mapping))
443 goto out;
444 page = alloc_huge_page(vma, address);
445 if (!page) {
446 hugetlb_put_quota(mapping);
447 goto out;
448 }
449
450 if (vma->vm_flags & VM_SHARED) {
451 int err;
452
453 err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
454 if (err) {
455 put_page(page);
456 hugetlb_put_quota(mapping);
457 if (err == -EEXIST)
458 goto retry;
459 goto out;
460 }
461 } else
462 lock_page(page);
463 }
411 464
412 spin_lock(&mm->page_table_lock); 465 spin_lock(&mm->page_table_lock);
413 size = i_size_read(mapping->host) >> HPAGE_SHIFT; 466 size = i_size_read(mapping->host) >> HPAGE_SHIFT;
@@ -415,11 +468,19 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
415 goto backout; 468 goto backout;
416 469
417 ret = VM_FAULT_MINOR; 470 ret = VM_FAULT_MINOR;
418 if (!pte_none(*pte)) 471 if (!pte_none(*ptep))
419 goto backout; 472 goto backout;
420 473
421 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE); 474 add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
422 set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page)); 475 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
476 && (vma->vm_flags & VM_SHARED)));
477 set_huge_pte_at(mm, address, ptep, new_pte);
478
479 if (write_access && !(vma->vm_flags & VM_SHARED)) {
480 /* Optimization, do the COW without a second fault */
481 ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
482 }
483
423 spin_unlock(&mm->page_table_lock); 484 spin_unlock(&mm->page_table_lock);
424 unlock_page(page); 485 unlock_page(page);
425out: 486out:
@@ -433,6 +494,33 @@ backout:
433 goto out; 494 goto out;
434} 495}
435 496
497int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
498 unsigned long address, int write_access)
499{
500 pte_t *ptep;
501 pte_t entry;
502 int ret;
503
504 ptep = huge_pte_alloc(mm, address);
505 if (!ptep)
506 return VM_FAULT_OOM;
507
508 entry = *ptep;
509 if (pte_none(entry))
510 return hugetlb_no_page(mm, vma, address, ptep, write_access);
511
512 ret = VM_FAULT_MINOR;
513
514 spin_lock(&mm->page_table_lock);
515 /* Check for a racing update before calling hugetlb_cow */
516 if (likely(pte_same(entry, *ptep)))
517 if (write_access && !pte_write(entry))
518 ret = hugetlb_cow(mm, vma, address, ptep, entry);
519 spin_unlock(&mm->page_table_lock);
520
521 return ret;
522}
523
436int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, 524int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
437 struct page **pages, struct vm_area_struct **vmas, 525 struct page **pages, struct vm_area_struct **vmas,
438 unsigned long *position, int *length, int i) 526 unsigned long *position, int *length, int i)
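
The hugetlb changes above add copy-on-write handling: hugetlb_cow(), write-protected copies in copy_hugetlb_page_range(), and the write_access fast path in hugetlb_no_page(). That is what makes writable MAP_PRIVATE hugetlb mappings behave like ordinary anonymous memory. A hedged userspace sketch, assuming a hugetlbfs mount at /mnt/huge and 2MB huge pages (both assumptions):

/* Illustrative only: private hugetlb mapping whose write fault is served by
 * the hugetlb COW path added above, so the write never reaches the file. */
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumed huge page size */

int main(void)
{
	int fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0)
		return 1;

	p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	p[0] = 'x';	/* write fault: hugetlb_cow() breaks COW on the huge page */

	munmap(p, HPAGE_SIZE);
	close(fd);
	unlink("/mnt/huge/example");
	return 0;
}
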
diff --git a/mm/internal.h b/mm/internal.h
index 6bf134e8fb3d..17256bb2f4ef 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -9,5 +9,22 @@
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11 11
12/* page_alloc.c */ 12static inline void set_page_refs(struct page *page, int order)
13extern void set_page_refs(struct page *page, int order); 13{
14#ifdef CONFIG_MMU
15 set_page_count(page, 1);
16#else
17 int i;
18
19 /*
20 * We need to reference all the pages for this order, otherwise if
21 * anyone accesses one of the pages with (get/put) it will be freed.
22 * - eg: access_process_vm()
23 */
24 for (i = 0; i < (1 << order); i++)
25 set_page_count(page + i, 1);
26#endif /* CONFIG_MMU */
27}
28
29extern void fastcall __init __free_pages_bootmem(struct page *page,
30 unsigned int order);
diff --git a/mm/madvise.c b/mm/madvise.c
index 2b7cf0400a21..ae0ae3ea299a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -140,6 +140,36 @@ static long madvise_dontneed(struct vm_area_struct * vma,
140 return 0; 140 return 0;
141} 141}
142 142
143/*
144 * Application wants to free up the pages and associated backing store.
145 * This is effectively punching a hole into the middle of a file.
146 *
147 * NOTE: Currently, only shmfs/tmpfs is supported for this operation.
148 * Other filesystems return -ENOSYS.
149 */
150static long madvise_remove(struct vm_area_struct *vma,
151 unsigned long start, unsigned long end)
152{
153 struct address_space *mapping;
154 loff_t offset, endoff;
155
156 if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
157 return -EINVAL;
158
159 if (!vma->vm_file || !vma->vm_file->f_mapping
160 || !vma->vm_file->f_mapping->host) {
161 return -EINVAL;
162 }
163
164 mapping = vma->vm_file->f_mapping;
165
166 offset = (loff_t)(start - vma->vm_start)
167 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
168 endoff = (loff_t)(end - vma->vm_start - 1)
169 + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
170 return vmtruncate_range(mapping->host, offset, endoff);
171}
172
143static long 173static long
144madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, 174madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
145 unsigned long start, unsigned long end, int behavior) 175 unsigned long start, unsigned long end, int behavior)
@@ -152,6 +182,9 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
152 case MADV_RANDOM: 182 case MADV_RANDOM:
153 error = madvise_behavior(vma, prev, start, end, behavior); 183 error = madvise_behavior(vma, prev, start, end, behavior);
154 break; 184 break;
185 case MADV_REMOVE:
186 error = madvise_remove(vma, start, end);
187 break;
155 188
156 case MADV_WILLNEED: 189 case MADV_WILLNEED:
157 error = madvise_willneed(vma, prev, start, end); 190 error = madvise_willneed(vma, prev, start, end);
@@ -190,6 +223,8 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
190 * some pages ahead. 223 * some pages ahead.
191 * MADV_DONTNEED - the application is finished with the given range, 224 * MADV_DONTNEED - the application is finished with the given range,
192 * so the kernel can free resources associated with it. 225 * so the kernel can free resources associated with it.
226 * MADV_REMOVE - the application wants to free up the given range of
227 * pages and associated backing store.
193 * 228 *
194 * return values: 229 * return values:
195 * zero - success 230 * zero - success
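
A hedged userspace sketch of the new MADV_REMOVE advice wired up above: punch a hole in a tmpfs-backed shared mapping so both the pages and their backing store are released. The shared-memory object name, sizes, and offsets are assumptions; on older C libraries shm_open() needs -lrt at link time, and per the note above filesystems without ->truncate_range return -ENOSYS.

/* Illustrative only: MADV_REMOVE on a tmpfs (shm) backed mapping. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 16 * 4096;
	int fd = shm_open("/madv_remove_demo", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0 || ftruncate(fd, len) != 0)
		return 1;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	memset(p, 0xaa, len);			/* populate the pages */

	/* free pages *and* backing store for the middle 8 pages */
	if (madvise(p + 4 * 4096, 8 * 4096, MADV_REMOVE) != 0)
		perror("madvise(MADV_REMOVE)");

	munmap(p, len);
	close(fd);
	shm_unlink("/madv_remove_demo");
	return 0;
}
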
diff --git a/mm/memory.c b/mm/memory.c
index d8dde07a3656..7a11ddd5060f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1498,7 +1498,7 @@ gotten:
1498 update_mmu_cache(vma, address, entry); 1498 update_mmu_cache(vma, address, entry);
1499 lazy_mmu_prot_update(entry); 1499 lazy_mmu_prot_update(entry);
1500 lru_cache_add_active(new_page); 1500 lru_cache_add_active(new_page);
1501 page_add_anon_rmap(new_page, vma, address); 1501 page_add_new_anon_rmap(new_page, vma, address);
1502 1502
1503 /* Free the old page.. */ 1503 /* Free the old page.. */
1504 new_page = old_page; 1504 new_page = old_page;
@@ -1770,9 +1770,32 @@ out_big:
1770out_busy: 1770out_busy:
1771 return -ETXTBSY; 1771 return -ETXTBSY;
1772} 1772}
1773
1774EXPORT_SYMBOL(vmtruncate); 1773EXPORT_SYMBOL(vmtruncate);
1775 1774
1775int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
1776{
1777 struct address_space *mapping = inode->i_mapping;
1778
1779 /*
1780 * If the underlying filesystem is not going to provide
1781 * a way to truncate a range of blocks (punch a hole) -
1782 * we should return failure right now.
1783 */
1784 if (!inode->i_op || !inode->i_op->truncate_range)
1785 return -ENOSYS;
1786
1787 mutex_lock(&inode->i_mutex);
1788 down_write(&inode->i_alloc_sem);
1789 unmap_mapping_range(mapping, offset, (end - offset), 1);
1790 truncate_inode_pages_range(mapping, offset, end);
1791 inode->i_op->truncate_range(inode, offset, end);
1792 up_write(&inode->i_alloc_sem);
1793 mutex_unlock(&inode->i_mutex);
1794
1795 return 0;
1796}
1797EXPORT_SYMBOL(vmtruncate_range);
1798
1776/* 1799/*
1777 * Primitive swap readahead code. We simply read an aligned block of 1800 * Primitive swap readahead code. We simply read an aligned block of
1778 * (1 << page_cluster) entries in the swap area. This method is chosen 1801 * (1 << page_cluster) entries in the swap area. This method is chosen
@@ -1954,8 +1977,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
1954 goto release; 1977 goto release;
1955 inc_mm_counter(mm, anon_rss); 1978 inc_mm_counter(mm, anon_rss);
1956 lru_cache_add_active(page); 1979 lru_cache_add_active(page);
1957 SetPageReferenced(page); 1980 page_add_new_anon_rmap(page, vma, address);
1958 page_add_anon_rmap(page, vma, address);
1959 } else { 1981 } else {
1960 /* Map the ZERO_PAGE - vm_page_prot is readonly */ 1982 /* Map the ZERO_PAGE - vm_page_prot is readonly */
1961 page = ZERO_PAGE(address); 1983 page = ZERO_PAGE(address);
@@ -2086,7 +2108,7 @@ retry:
2086 if (anon) { 2108 if (anon) {
2087 inc_mm_counter(mm, anon_rss); 2109 inc_mm_counter(mm, anon_rss);
2088 lru_cache_add_active(new_page); 2110 lru_cache_add_active(new_page);
2089 page_add_anon_rmap(new_page, vma, address); 2111 page_add_new_anon_rmap(new_page, vma, address);
2090 } else { 2112 } else {
2091 inc_mm_counter(mm, file_rss); 2113 inc_mm_counter(mm, file_rss);
2092 page_add_file_rmap(new_page); 2114 page_add_file_rmap(new_page);
@@ -2245,6 +2267,8 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2245 return handle_pte_fault(mm, vma, address, pte, pmd, write_access); 2267 return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
2246} 2268}
2247 2269
2270EXPORT_SYMBOL_GPL(__handle_mm_fault);
2271
2248#ifndef __PAGETABLE_PUD_FOLDED 2272#ifndef __PAGETABLE_PUD_FOLDED
2249/* 2273/*
2250 * Allocate page upper directory. 2274 * Allocate page upper directory.
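
vmtruncate_range() above only succeeds when the filesystem provides an i_op->truncate_range() hook; per the madvise.c note, only shmem/tmpfs does so in this series. A hedged sketch of what such a hook looks like from the filesystem side ("examplefs" and its helpers are hypothetical, and the void return type is inferred from the call site above):

/* Hypothetical filesystem wiring for the ->truncate_range() hook used by
 * vmtruncate_range().  By the time it runs, the caller has already unmapped
 * and truncated the page cache for [start, end]; the hook only has to drop
 * the backing blocks. */
static void examplefs_truncate_range(struct inode *inode,
				     loff_t start, loff_t end)
{
	examplefs_punch_hole(inode, start, end);
}

static struct inode_operations examplefs_inode_ops = {
	.truncate	= examplefs_truncate,
	.truncate_range	= examplefs_truncate_range,
};
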
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f6d4af8af8a8..a918f77f02f3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -42,7 +42,6 @@ extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
42 int nr_pages); 42 int nr_pages);
43static int __add_section(struct zone *zone, unsigned long phys_start_pfn) 43static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
44{ 44{
45 struct pglist_data *pgdat = zone->zone_pgdat;
46 int nr_pages = PAGES_PER_SECTION; 45 int nr_pages = PAGES_PER_SECTION;
47 int ret; 46 int ret;
48 47
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index bec88c81244e..b62cab575a84 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -83,9 +83,18 @@
83#include <linux/init.h> 83#include <linux/init.h>
84#include <linux/compat.h> 84#include <linux/compat.h>
85#include <linux/mempolicy.h> 85#include <linux/mempolicy.h>
86#include <linux/swap.h>
87#include <linux/seq_file.h>
88#include <linux/proc_fs.h>
89
86#include <asm/tlbflush.h> 90#include <asm/tlbflush.h>
87#include <asm/uaccess.h> 91#include <asm/uaccess.h>
88 92
93/* Internal flags */
94#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
95#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
96#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
97
89static kmem_cache_t *policy_cache; 98static kmem_cache_t *policy_cache;
90static kmem_cache_t *sn_cache; 99static kmem_cache_t *sn_cache;
91 100
@@ -93,7 +102,7 @@ static kmem_cache_t *sn_cache;
93 102
94/* Highest zone. An specific allocation for a zone below that is not 103/* Highest zone. An specific allocation for a zone below that is not
95 policied. */ 104 policied. */
96static int policy_zone; 105int policy_zone = ZONE_DMA;
97 106
98struct mempolicy default_policy = { 107struct mempolicy default_policy = {
99 .refcnt = ATOMIC_INIT(1), /* never free it */ 108 .refcnt = ATOMIC_INIT(1), /* never free it */
@@ -131,17 +140,8 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
131 if (!zl) 140 if (!zl)
132 return NULL; 141 return NULL;
133 num = 0; 142 num = 0;
134 for_each_node_mask(nd, *nodes) { 143 for_each_node_mask(nd, *nodes)
135 int k; 144 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
136 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
137 struct zone *z = &NODE_DATA(nd)->node_zones[k];
138 if (!z->present_pages)
139 continue;
140 zl->zones[num++] = z;
141 if (k > policy_zone)
142 policy_zone = k;
143 }
144 }
145 zl->zones[num] = NULL; 145 zl->zones[num] = NULL;
146 return zl; 146 return zl;
147} 147}
@@ -161,6 +161,10 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
161 switch (mode) { 161 switch (mode) {
162 case MPOL_INTERLEAVE: 162 case MPOL_INTERLEAVE:
163 policy->v.nodes = *nodes; 163 policy->v.nodes = *nodes;
164 if (nodes_weight(*nodes) == 0) {
165 kmem_cache_free(policy_cache, policy);
166 return ERR_PTR(-EINVAL);
167 }
164 break; 168 break;
165 case MPOL_PREFERRED: 169 case MPOL_PREFERRED:
166 policy->v.preferred_node = first_node(*nodes); 170 policy->v.preferred_node = first_node(*nodes);
@@ -176,12 +180,19 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
176 break; 180 break;
177 } 181 }
178 policy->policy = mode; 182 policy->policy = mode;
183 policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
179 return policy; 184 return policy;
180} 185}
181 186
182/* Ensure all existing pages follow the policy. */ 187static void gather_stats(struct page *, void *);
188static void migrate_page_add(struct vm_area_struct *vma,
189 struct page *page, struct list_head *pagelist, unsigned long flags);
190
191/* Scan through pages checking if pages follow certain conditions. */
183static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 192static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
184 unsigned long addr, unsigned long end, nodemask_t *nodes) 193 unsigned long addr, unsigned long end,
194 const nodemask_t *nodes, unsigned long flags,
195 void *private)
185{ 196{
186 pte_t *orig_pte; 197 pte_t *orig_pte;
187 pte_t *pte; 198 pte_t *pte;
@@ -197,8 +208,20 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
197 page = vm_normal_page(vma, addr, *pte); 208 page = vm_normal_page(vma, addr, *pte);
198 if (!page) 209 if (!page)
199 continue; 210 continue;
211 if (PageReserved(page))
212 continue;
200 nid = page_to_nid(page); 213 nid = page_to_nid(page);
201 if (!node_isset(nid, *nodes)) 214 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
215 continue;
216
217 if (flags & MPOL_MF_STATS)
218 gather_stats(page, private);
219 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
220 spin_unlock(ptl);
221 migrate_page_add(vma, page, private, flags);
222 spin_lock(ptl);
223 }
224 else
202 break; 225 break;
203 } while (pte++, addr += PAGE_SIZE, addr != end); 226 } while (pte++, addr += PAGE_SIZE, addr != end);
204 pte_unmap_unlock(orig_pte, ptl); 227 pte_unmap_unlock(orig_pte, ptl);
@@ -206,7 +229,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
206} 229}
207 230
208static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, 231static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
209 unsigned long addr, unsigned long end, nodemask_t *nodes) 232 unsigned long addr, unsigned long end,
233 const nodemask_t *nodes, unsigned long flags,
234 void *private)
210{ 235{
211 pmd_t *pmd; 236 pmd_t *pmd;
212 unsigned long next; 237 unsigned long next;
@@ -216,14 +241,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
216 next = pmd_addr_end(addr, end); 241 next = pmd_addr_end(addr, end);
217 if (pmd_none_or_clear_bad(pmd)) 242 if (pmd_none_or_clear_bad(pmd))
218 continue; 243 continue;
219 if (check_pte_range(vma, pmd, addr, next, nodes)) 244 if (check_pte_range(vma, pmd, addr, next, nodes,
245 flags, private))
220 return -EIO; 246 return -EIO;
221 } while (pmd++, addr = next, addr != end); 247 } while (pmd++, addr = next, addr != end);
222 return 0; 248 return 0;
223} 249}
224 250
225static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 251static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
226 unsigned long addr, unsigned long end, nodemask_t *nodes) 252 unsigned long addr, unsigned long end,
253 const nodemask_t *nodes, unsigned long flags,
254 void *private)
227{ 255{
228 pud_t *pud; 256 pud_t *pud;
229 unsigned long next; 257 unsigned long next;
@@ -233,14 +261,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
233 next = pud_addr_end(addr, end); 261 next = pud_addr_end(addr, end);
234 if (pud_none_or_clear_bad(pud)) 262 if (pud_none_or_clear_bad(pud))
235 continue; 263 continue;
236 if (check_pmd_range(vma, pud, addr, next, nodes)) 264 if (check_pmd_range(vma, pud, addr, next, nodes,
265 flags, private))
237 return -EIO; 266 return -EIO;
238 } while (pud++, addr = next, addr != end); 267 } while (pud++, addr = next, addr != end);
239 return 0; 268 return 0;
240} 269}
241 270
242static inline int check_pgd_range(struct vm_area_struct *vma, 271static inline int check_pgd_range(struct vm_area_struct *vma,
243 unsigned long addr, unsigned long end, nodemask_t *nodes) 272 unsigned long addr, unsigned long end,
273 const nodemask_t *nodes, unsigned long flags,
274 void *private)
244{ 275{
245 pgd_t *pgd; 276 pgd_t *pgd;
246 unsigned long next; 277 unsigned long next;
@@ -250,16 +281,30 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
250 next = pgd_addr_end(addr, end); 281 next = pgd_addr_end(addr, end);
251 if (pgd_none_or_clear_bad(pgd)) 282 if (pgd_none_or_clear_bad(pgd))
252 continue; 283 continue;
253 if (check_pud_range(vma, pgd, addr, next, nodes)) 284 if (check_pud_range(vma, pgd, addr, next, nodes,
285 flags, private))
254 return -EIO; 286 return -EIO;
255 } while (pgd++, addr = next, addr != end); 287 } while (pgd++, addr = next, addr != end);
256 return 0; 288 return 0;
257} 289}
258 290
259/* Step 1: check the range */ 291/* Check if a vma is migratable */
292static inline int vma_migratable(struct vm_area_struct *vma)
293{
294 if (vma->vm_flags & (
295 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
296 return 0;
297 return 1;
298}
299
300/*
301 * Check if all pages in a range are on a set of nodes.
302 * If pagelist != NULL then isolate pages from the LRU and
303 * put them on the pagelist.
304 */
260static struct vm_area_struct * 305static struct vm_area_struct *
261check_range(struct mm_struct *mm, unsigned long start, unsigned long end, 306check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
262 nodemask_t *nodes, unsigned long flags) 307 const nodemask_t *nodes, unsigned long flags, void *private)
263{ 308{
264 int err; 309 int err;
265 struct vm_area_struct *first, *vma, *prev; 310 struct vm_area_struct *first, *vma, *prev;
@@ -269,17 +314,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
269 return ERR_PTR(-EFAULT); 314 return ERR_PTR(-EFAULT);
270 prev = NULL; 315 prev = NULL;
271 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { 316 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
272 if (!vma->vm_next && vma->vm_end < end) 317 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
273 return ERR_PTR(-EFAULT); 318 if (!vma->vm_next && vma->vm_end < end)
274 if (prev && prev->vm_end < vma->vm_start) 319 return ERR_PTR(-EFAULT);
275 return ERR_PTR(-EFAULT); 320 if (prev && prev->vm_end < vma->vm_start)
276 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { 321 return ERR_PTR(-EFAULT);
322 }
323 if (!is_vm_hugetlb_page(vma) &&
324 ((flags & MPOL_MF_STRICT) ||
325 ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
326 vma_migratable(vma)))) {
277 unsigned long endvma = vma->vm_end; 327 unsigned long endvma = vma->vm_end;
328
278 if (endvma > end) 329 if (endvma > end)
279 endvma = end; 330 endvma = end;
280 if (vma->vm_start > start) 331 if (vma->vm_start > start)
281 start = vma->vm_start; 332 start = vma->vm_start;
282 err = check_pgd_range(vma, start, endvma, nodes); 333 err = check_pgd_range(vma, start, endvma, nodes,
334 flags, private);
283 if (err) { 335 if (err) {
284 first = ERR_PTR(err); 336 first = ERR_PTR(err);
285 break; 337 break;
@@ -338,51 +390,10 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
338 if (!nodes) 390 if (!nodes)
339 return 0; 391 return 0;
340 392
341 /* Update current mems_allowed */ 393 cpuset_update_task_memory_state();
342 cpuset_update_current_mems_allowed(); 394 if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
343 /* Ignore nodes not set in current->mems_allowed */
344 cpuset_restrict_to_mems_allowed(nodes->bits);
345 return mpol_check_policy(mode, nodes);
346}
347
348long do_mbind(unsigned long start, unsigned long len,
349 unsigned long mode, nodemask_t *nmask, unsigned long flags)
350{
351 struct vm_area_struct *vma;
352 struct mm_struct *mm = current->mm;
353 struct mempolicy *new;
354 unsigned long end;
355 int err;
356
357 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
358 return -EINVAL;
359 if (start & ~PAGE_MASK)
360 return -EINVAL;
361 if (mode == MPOL_DEFAULT)
362 flags &= ~MPOL_MF_STRICT;
363 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
364 end = start + len;
365 if (end < start)
366 return -EINVAL;
367 if (end == start)
368 return 0;
369 if (mpol_check_policy(mode, nmask))
370 return -EINVAL; 395 return -EINVAL;
371 new = mpol_new(mode, nmask); 396 return mpol_check_policy(mode, nodes);
372 if (IS_ERR(new))
373 return PTR_ERR(new);
374
375 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
376 mode,nodes_addr(nodes)[0]);
377
378 down_write(&mm->mmap_sem);
379 vma = check_range(mm, start, end, nmask, flags);
380 err = PTR_ERR(vma);
381 if (!IS_ERR(vma))
382 err = mbind_range(vma, start, end, new);
383 up_write(&mm->mmap_sem);
384 mpol_free(new);
385 return err;
386} 397}
387 398
388/* Set the process memory policy */ 399/* Set the process memory policy */
@@ -453,7 +464,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
453 struct vm_area_struct *vma = NULL; 464 struct vm_area_struct *vma = NULL;
454 struct mempolicy *pol = current->mempolicy; 465 struct mempolicy *pol = current->mempolicy;
455 466
456 cpuset_update_current_mems_allowed(); 467 cpuset_update_task_memory_state();
457 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) 468 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
458 return -EINVAL; 469 return -EINVAL;
459 if (flags & MPOL_F_ADDR) { 470 if (flags & MPOL_F_ADDR) {
@@ -505,11 +516,177 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
505} 516}
506 517
507/* 518/*
519 * page migration
520 */
521
522/* Check if we are the only process mapping the page in question */
523static inline int single_mm_mapping(struct mm_struct *mm,
524 struct address_space *mapping)
525{
526 struct vm_area_struct *vma;
527 struct prio_tree_iter iter;
528 int rc = 1;
529
530 spin_lock(&mapping->i_mmap_lock);
531 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
532 if (mm != vma->vm_mm) {
533 rc = 0;
534 goto out;
535 }
536 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
537 if (mm != vma->vm_mm) {
538 rc = 0;
539 goto out;
540 }
541out:
542 spin_unlock(&mapping->i_mmap_lock);
543 return rc;
544}
545
546/*
547 * Add a page to be migrated to the pagelist
548 */
549static void migrate_page_add(struct vm_area_struct *vma,
550 struct page *page, struct list_head *pagelist, unsigned long flags)
551{
552 /*
553 * Avoid migrating a page that is shared by others and not writable.
554 */
555 if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
556 mapping_writably_mapped(page->mapping) ||
557 single_mm_mapping(vma->vm_mm, page->mapping)) {
558 int rc = isolate_lru_page(page);
559
560 if (rc == 1)
561 list_add(&page->lru, pagelist);
562 /*
563 * If the isolate attempt was not successful then we just
564 * encountered an unswappable page. Something must be wrong.
565 */
566 WARN_ON(rc == 0);
567 }
568}
569
570static int swap_pages(struct list_head *pagelist)
571{
572 LIST_HEAD(moved);
573 LIST_HEAD(failed);
574 int n;
575
576 n = migrate_pages(pagelist, NULL, &moved, &failed);
577 putback_lru_pages(&failed);
578 putback_lru_pages(&moved);
579
580 return n;
581}
582
583/*
584 * For now migrate_pages simply swaps out the pages from nodes that are in
585 * the source set but not in the target set. In the future, we would
586 * want a function that moves pages between the two nodesets in such
587 * a way as to preserve the physical layout as much as possible.
588 *
589 * Returns the number of pages that could not be moved.
590 */
591int do_migrate_pages(struct mm_struct *mm,
592 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
593{
594 LIST_HEAD(pagelist);
595 int count = 0;
596 nodemask_t nodes;
597
598 nodes_andnot(nodes, *from_nodes, *to_nodes);
599
600 down_read(&mm->mmap_sem);
601 check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
602 flags | MPOL_MF_DISCONTIG_OK, &pagelist);
603
604 if (!list_empty(&pagelist)) {
605 count = swap_pages(&pagelist);
606 putback_lru_pages(&pagelist);
607 }
608
609 up_read(&mm->mmap_sem);
610 return count;
611}
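A minimal user-space sketch of the nodes_andnot() step above (illustrative only, not kernel code and not how nodemask_t is implemented): only nodes that are in the source set but not in the destination set end up in the mask handed to check_range(), so nodes common to both sets keep their pages.

#include <stdio.h>

int main(void)
{
	unsigned long from  = (1UL << 0) | (1UL << 1);	/* old_nodes = {0,1} */
	unsigned long to    = (1UL << 1) | (1UL << 2);	/* new_nodes = {1,2} */
	unsigned long drain = from & ~to;		/* {0}: only node 0 is drained */

	printf("drain mask: %#lx\n", drain);		/* prints 0x1 */
	return 0;
}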
612
613long do_mbind(unsigned long start, unsigned long len,
614 unsigned long mode, nodemask_t *nmask, unsigned long flags)
615{
616 struct vm_area_struct *vma;
617 struct mm_struct *mm = current->mm;
618 struct mempolicy *new;
619 unsigned long end;
620 int err;
621 LIST_HEAD(pagelist);
622
623 if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
624 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
625 || mode > MPOL_MAX)
626 return -EINVAL;
627 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
628 return -EPERM;
629
630 if (start & ~PAGE_MASK)
631 return -EINVAL;
632
633 if (mode == MPOL_DEFAULT)
634 flags &= ~MPOL_MF_STRICT;
635
636 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
637 end = start + len;
638
639 if (end < start)
640 return -EINVAL;
641 if (end == start)
642 return 0;
643
644 if (mpol_check_policy(mode, nmask))
645 return -EINVAL;
646
647 new = mpol_new(mode, nmask);
648 if (IS_ERR(new))
649 return PTR_ERR(new);
650
651 /*
652 * If we are using the default policy then operating
653 * on discontinuous address spaces is okay after all
654 */
655 if (!new)
656 flags |= MPOL_MF_DISCONTIG_OK;
657
658 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
659 mode,nodes_addr(nodes)[0]);
660
661 down_write(&mm->mmap_sem);
662 vma = check_range(mm, start, end, nmask,
663 flags | MPOL_MF_INVERT, &pagelist);
664
665 err = PTR_ERR(vma);
666 if (!IS_ERR(vma)) {
667 int nr_failed = 0;
668
669 err = mbind_range(vma, start, end, new);
670 if (!list_empty(&pagelist))
671 nr_failed = swap_pages(&pagelist);
672
673 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
674 err = -EIO;
675 }
676 if (!list_empty(&pagelist))
677 putback_lru_pages(&pagelist);
678
679 up_write(&mm->mmap_sem);
680 mpol_free(new);
681 return err;
682}
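The new MPOL_MF_MOVE / MPOL_MF_MOVE_ALL flags are reachable from user space through mbind(2). A hedged sketch, assuming libnuma's <numaif.h> wrapper is available (link with -lnuma); the node number and mapping size are arbitrary example values:

#include <numaif.h>
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	size_t len = 4UL << 20;				/* 4 MiB, arbitrary */
	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	unsigned long nodemask = 1UL << 0;		/* bind to node 0 */

	if (buf == MAP_FAILED)
		return 1;
	/* MPOL_MF_MOVE asks the kernel to migrate pages already faulted in;
	 * with MPOL_MF_STRICT, pages that could not be moved turn into -EIO. */
	if (mbind(buf, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
		  MPOL_MF_MOVE | MPOL_MF_STRICT))
		perror("mbind");
	return 0;
}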
683
684/*
508 * User space interface with variable sized bitmaps for nodelists. 685 * User space interface with variable sized bitmaps for nodelists.
509 */ 686 */
510 687
511/* Copy a node mask from user space. */ 688/* Copy a node mask from user space. */
512static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, 689static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
513 unsigned long maxnode) 690 unsigned long maxnode)
514{ 691{
515 unsigned long k; 692 unsigned long k;
@@ -598,6 +775,65 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
598 return do_set_mempolicy(mode, &nodes); 775 return do_set_mempolicy(mode, &nodes);
599} 776}
600 777
778asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
779 const unsigned long __user *old_nodes,
780 const unsigned long __user *new_nodes)
781{
782 struct mm_struct *mm;
783 struct task_struct *task;
784 nodemask_t old;
785 nodemask_t new;
786 nodemask_t task_nodes;
787 int err;
788
789 err = get_nodes(&old, old_nodes, maxnode);
790 if (err)
791 return err;
792
793 err = get_nodes(&new, new_nodes, maxnode);
794 if (err)
795 return err;
796
797 /* Find the mm_struct */
798 read_lock(&tasklist_lock);
799 task = pid ? find_task_by_pid(pid) : current;
800 if (!task) {
801 read_unlock(&tasklist_lock);
802 return -ESRCH;
803 }
804 mm = get_task_mm(task);
805 read_unlock(&tasklist_lock);
806
807 if (!mm)
808 return -EINVAL;
809
810 /*
811 * Check if this process has the right to modify the specified
812 * process. The right exists if the process has administrative
813 * capabilities, superuser privileges or the same
814 * userid as the target process.
815 */
816 if ((current->euid != task->suid) && (current->euid != task->uid) &&
817 (current->uid != task->suid) && (current->uid != task->uid) &&
818 !capable(CAP_SYS_ADMIN)) {
819 err = -EPERM;
820 goto out;
821 }
822
823 task_nodes = cpuset_mems_allowed(task);
824 /* Is the user allowed to access the target nodes? */
825 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
826 err = -EPERM;
827 goto out;
828 }
829
830 err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
831out:
832 mmput(mm);
833 return err;
834}
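A hedged user-space sketch of the new syscall, assuming libnuma's <numaif.h> declares the migrate_pages() wrapper (link with -lnuma). Passing pid 0 targets the calling process, matching the "pid ? find_task_by_pid(pid) : current" logic above, and the return value is the number of pages that could not be moved:

#include <numaif.h>
#include <stdio.h>

int main(void)
{
	unsigned long old_nodes = 1UL << 0;	/* move pages off node 0 */
	unsigned long new_nodes = 1UL << 1;	/* ... onto node 1 */
	long left;

	left = migrate_pages(0 /* self */, sizeof(old_nodes) * 8,
			     &old_nodes, &new_nodes);
	if (left < 0)
		perror("migrate_pages");
	else
		printf("%ld page(s) could not be moved\n", left);
	return 0;
}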
835
836
601/* Retrieve NUMA policy */ 837/* Retrieve NUMA policy */
602asmlinkage long sys_get_mempolicy(int __user *policy, 838asmlinkage long sys_get_mempolicy(int __user *policy,
603 unsigned long __user *nmask, 839 unsigned long __user *nmask,
@@ -704,8 +940,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
704#endif 940#endif
705 941
706/* Return effective policy for a VMA */ 942/* Return effective policy for a VMA */
707struct mempolicy * 943static struct mempolicy * get_vma_policy(struct task_struct *task,
708get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) 944 struct vm_area_struct *vma, unsigned long addr)
709{ 945{
710 struct mempolicy *pol = task->mempolicy; 946 struct mempolicy *pol = task->mempolicy;
711 947
@@ -781,6 +1017,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
781 return nid; 1017 return nid;
782} 1018}
783 1019
1020/* Determine a node number for interleave */
1021static inline unsigned interleave_nid(struct mempolicy *pol,
1022 struct vm_area_struct *vma, unsigned long addr, int shift)
1023{
1024 if (vma) {
1025 unsigned long off;
1026
1027 off = vma->vm_pgoff;
1028 off += (addr - vma->vm_start) >> shift;
1029 return offset_il_node(pol, vma, off);
1030 } else
1031 return interleave_nodes(pol);
1032}
1033
1034/* Return a zonelist suitable for a huge page allocation. */
1035struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1036{
1037 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1038
1039 if (pol->policy == MPOL_INTERLEAVE) {
1040 unsigned nid;
1041
1042 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1043 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1044 }
1045 return zonelist_policy(GFP_HIGHUSER, pol);
1046}
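A user-space model of the interleave arithmetic behind interleave_nid() above (illustrative only; offset_il_node() itself is not part of this hunk, so treat the exact lookup as an assumption): the page's offset within the VMA, in units of 1 << shift and biased by vm_pgoff, selects the (off % nr_nodes)-th set node of the policy mask, so consecutive pages cycle round-robin over the allowed nodes.

#include <stdio.h>

/* Pick the (off % popcount(mask))-th set bit of a non-empty node mask. */
static int pick_interleave_node(unsigned long mask, unsigned long off)
{
	int target = off % __builtin_popcountl(mask);
	int nid;

	for (nid = 0; ; nid++)
		if (((mask >> nid) & 1) && target-- == 0)
			return nid;
}

int main(void)
{
	unsigned long mask = 0xFUL;		/* interleave over nodes 0-3 */
	unsigned long off;

	for (off = 0; off < 6; off++)
		printf("offset %lu -> node %d\n", off,
		       pick_interleave_node(mask, off));
	return 0;
}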
1047
784/* Allocate a page in interleaved policy. 1048/* Allocate a page in interleaved policy.
785 Own path because it needs to do special accounting. */ 1049 Own path because it needs to do special accounting. */
786static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 1050static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -825,19 +1089,12 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
825{ 1089{
826 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1090 struct mempolicy *pol = get_vma_policy(current, vma, addr);
827 1091
828 cpuset_update_current_mems_allowed(); 1092 cpuset_update_task_memory_state();
829 1093
830 if (unlikely(pol->policy == MPOL_INTERLEAVE)) { 1094 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
831 unsigned nid; 1095 unsigned nid;
832 if (vma) { 1096
833 unsigned long off; 1097 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
834 off = vma->vm_pgoff;
835 off += (addr - vma->vm_start) >> PAGE_SHIFT;
836 nid = offset_il_node(pol, vma, off);
837 } else {
838 /* fall back to process interleaving */
839 nid = interleave_nodes(pol);
840 }
841 return alloc_page_interleave(gfp, 0, nid); 1098 return alloc_page_interleave(gfp, 0, nid);
842 } 1099 }
843 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); 1100 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
@@ -858,7 +1115,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
858 * interrupt context and apply the current process NUMA policy. 1115 * interrupt context and apply the current process NUMA policy.
859 * Returns NULL when no page can be allocated. 1116 * Returns NULL when no page can be allocated.
860 * 1117 *
861 * Don't call cpuset_update_current_mems_allowed() unless 1118 * Don't call cpuset_update_task_memory_state() unless
862 * 1) it's ok to take cpuset_sem (can WAIT), and 1119 * 1) it's ok to take cpuset_sem (can WAIT), and
863 * 2) allocating for current task (not interrupt). 1120 * 2) allocating for current task (not interrupt).
864 */ 1121 */
@@ -867,7 +1124,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
867 struct mempolicy *pol = current->mempolicy; 1124 struct mempolicy *pol = current->mempolicy;
868 1125
869 if ((gfp & __GFP_WAIT) && !in_interrupt()) 1126 if ((gfp & __GFP_WAIT) && !in_interrupt())
870 cpuset_update_current_mems_allowed(); 1127 cpuset_update_task_memory_state();
871 if (!pol || in_interrupt()) 1128 if (!pol || in_interrupt())
872 pol = &default_policy; 1129 pol = &default_policy;
873 if (pol->policy == MPOL_INTERLEAVE) 1130 if (pol->policy == MPOL_INTERLEAVE)
@@ -876,6 +1133,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
876} 1133}
877EXPORT_SYMBOL(alloc_pages_current); 1134EXPORT_SYMBOL(alloc_pages_current);
878 1135
1136/*
1137 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1138 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
1139 * with the mems_allowed returned by cpuset_mems_allowed(). This
1140 * keeps mempolicies cpuset-relative after the task's cpuset moves. See
1141 * further kernel/cpuset.c update_nodemask().
1142 */
1143void *cpuset_being_rebound;
1144
879/* Slow path of a mempolicy copy */ 1145/* Slow path of a mempolicy copy */
880struct mempolicy *__mpol_copy(struct mempolicy *old) 1146struct mempolicy *__mpol_copy(struct mempolicy *old)
881{ 1147{
@@ -883,6 +1149,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
883 1149
884 if (!new) 1150 if (!new)
885 return ERR_PTR(-ENOMEM); 1151 return ERR_PTR(-ENOMEM);
1152 if (current_cpuset_is_being_rebound()) {
1153 nodemask_t mems = cpuset_mems_allowed(current);
1154 mpol_rebind_policy(old, &mems);
1155 }
886 *new = *old; 1156 *new = *old;
887 atomic_set(&new->refcnt, 1); 1157 atomic_set(&new->refcnt, 1);
888 if (new->policy == MPOL_BIND) { 1158 if (new->policy == MPOL_BIND) {
@@ -936,54 +1206,6 @@ void __mpol_free(struct mempolicy *p)
936} 1206}
937 1207
938/* 1208/*
939 * Hugetlb policy. Same as above, just works with node numbers instead of
940 * zonelists.
941 */
942
943/* Find first node suitable for an allocation */
944int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
945{
946 struct mempolicy *pol = get_vma_policy(current, vma, addr);
947
948 switch (pol->policy) {
949 case MPOL_DEFAULT:
950 return numa_node_id();
951 case MPOL_BIND:
952 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
953 case MPOL_INTERLEAVE:
954 return interleave_nodes(pol);
955 case MPOL_PREFERRED:
956 return pol->v.preferred_node >= 0 ?
957 pol->v.preferred_node : numa_node_id();
958 }
959 BUG();
960 return 0;
961}
962
963/* Find secondary valid nodes for an allocation */
964int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
965{
966 struct mempolicy *pol = get_vma_policy(current, vma, addr);
967
968 switch (pol->policy) {
969 case MPOL_PREFERRED:
970 case MPOL_DEFAULT:
971 case MPOL_INTERLEAVE:
972 return 1;
973 case MPOL_BIND: {
974 struct zone **z;
975 for (z = pol->v.zonelist->zones; *z; z++)
976 if ((*z)->zone_pgdat->node_id == nid)
977 return 1;
978 return 0;
979 }
980 default:
981 BUG();
982 return 0;
983 }
984}
985
986/*
987 * Shared memory backing store policy support. 1209 * Shared memory backing store policy support.
988 * 1210 *
989 * Remember policies even when nobody has shared memory mapped. 1211 * Remember policies even when nobody has shared memory mapped.
@@ -1205,25 +1427,31 @@ void numa_default_policy(void)
1205} 1427}
1206 1428
1207/* Migrate a policy to a different set of nodes */ 1429/* Migrate a policy to a different set of nodes */
1208static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, 1430void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1209 const nodemask_t *new)
1210{ 1431{
1432 nodemask_t *mpolmask;
1211 nodemask_t tmp; 1433 nodemask_t tmp;
1212 1434
1213 if (!pol) 1435 if (!pol)
1214 return; 1436 return;
1437 mpolmask = &pol->cpuset_mems_allowed;
1438 if (nodes_equal(*mpolmask, *newmask))
1439 return;
1215 1440
1216 switch (pol->policy) { 1441 switch (pol->policy) {
1217 case MPOL_DEFAULT: 1442 case MPOL_DEFAULT:
1218 break; 1443 break;
1219 case MPOL_INTERLEAVE: 1444 case MPOL_INTERLEAVE:
1220 nodes_remap(tmp, pol->v.nodes, *old, *new); 1445 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1221 pol->v.nodes = tmp; 1446 pol->v.nodes = tmp;
1222 current->il_next = node_remap(current->il_next, *old, *new); 1447 *mpolmask = *newmask;
1448 current->il_next = node_remap(current->il_next,
1449 *mpolmask, *newmask);
1223 break; 1450 break;
1224 case MPOL_PREFERRED: 1451 case MPOL_PREFERRED:
1225 pol->v.preferred_node = node_remap(pol->v.preferred_node, 1452 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1226 *old, *new); 1453 *mpolmask, *newmask);
1454 *mpolmask = *newmask;
1227 break; 1455 break;
1228 case MPOL_BIND: { 1456 case MPOL_BIND: {
1229 nodemask_t nodes; 1457 nodemask_t nodes;
@@ -1233,7 +1461,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1233 nodes_clear(nodes); 1461 nodes_clear(nodes);
1234 for (z = pol->v.zonelist->zones; *z; z++) 1462 for (z = pol->v.zonelist->zones; *z; z++)
1235 node_set((*z)->zone_pgdat->node_id, nodes); 1463 node_set((*z)->zone_pgdat->node_id, nodes);
1236 nodes_remap(tmp, nodes, *old, *new); 1464 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1237 nodes = tmp; 1465 nodes = tmp;
1238 1466
1239 zonelist = bind_zonelist(&nodes); 1467 zonelist = bind_zonelist(&nodes);
@@ -1248,6 +1476,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1248 kfree(pol->v.zonelist); 1476 kfree(pol->v.zonelist);
1249 pol->v.zonelist = zonelist; 1477 pol->v.zonelist = zonelist;
1250 } 1478 }
1479 *mpolmask = *newmask;
1251 break; 1480 break;
1252 } 1481 }
1253 default: 1482 default:
@@ -1257,12 +1486,156 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
1257} 1486}
1258 1487
1259/* 1488/*
1260 * Someone moved this task to different nodes. Fixup mempolicies. 1489 * Wrapper for mpol_rebind_policy() that just requires task
1490 * pointer, and updates task mempolicy.
1491 */
1492
1493void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1494{
1495 mpol_rebind_policy(tsk->mempolicy, new);
1496}
1497
1498/*
1499 * Rebind each vma in mm to new nodemask.
1261 * 1500 *
1262 * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well, 1501 * Call holding a reference to mm. Takes mm->mmap_sem during call.
1263 * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
1264 */ 1502 */
1265void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) 1503
1504void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1505{
1506 struct vm_area_struct *vma;
1507
1508 down_write(&mm->mmap_sem);
1509 for (vma = mm->mmap; vma; vma = vma->vm_next)
1510 mpol_rebind_policy(vma->vm_policy, new);
1511 up_write(&mm->mmap_sem);
1512}
1513
1514/*
1515 * Display pages allocated per node and memory policy via /proc.
1516 */
1517
1518static const char *policy_types[] = { "default", "prefer", "bind",
1519 "interleave" };
1520
1521/*
1522 * Convert a mempolicy into a string.
1523 * Returns the number of characters in buffer (if positive)
1524 * or an error (negative)
1525 */
1526static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1527{
1528 char *p = buffer;
1529 int l;
1530 nodemask_t nodes;
1531 int mode = pol ? pol->policy : MPOL_DEFAULT;
1532
1533 switch (mode) {
1534 case MPOL_DEFAULT:
1535 nodes_clear(nodes);
1536 break;
1537
1538 case MPOL_PREFERRED:
1539 nodes_clear(nodes);
1540 node_set(pol->v.preferred_node, nodes);
1541 break;
1542
1543 case MPOL_BIND:
1544 get_zonemask(pol, &nodes);
1545 break;
1546
1547 case MPOL_INTERLEAVE:
1548 nodes = pol->v.nodes;
1549 break;
1550
1551 default:
1552 BUG();
1553 return -EFAULT;
1554 }
1555
1556 l = strlen(policy_types[mode]);
1557 if (buffer + maxlen < p + l + 1)
1558 return -ENOSPC;
1559
1560 strcpy(p, policy_types[mode]);
1561 p += l;
1562
1563 if (!nodes_empty(nodes)) {
1564 if (buffer + maxlen < p + 2)
1565 return -ENOSPC;
1566 *p++ = '=';
1567 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1568 }
1569 return p - buffer;
1570}
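For reference, the strings this produces are the policy name from policy_types[], optionally followed by '=' and the node list as formatted by nodelist_scnprintf(); the node values below are made-up examples:

	default
	prefer=1
	bind=0-1
	interleave=0-3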
1571
1572struct numa_maps {
1573 unsigned long pages;
1574 unsigned long anon;
1575 unsigned long mapped;
1576 unsigned long mapcount_max;
1577 unsigned long node[MAX_NUMNODES];
1578};
1579
1580static void gather_stats(struct page *page, void *private)
1266{ 1581{
1267 rebind_policy(current->mempolicy, old, new); 1582 struct numa_maps *md = private;
1583 int count = page_mapcount(page);
1584
1585 if (count)
1586 md->mapped++;
1587
1588 if (count > md->mapcount_max)
1589 md->mapcount_max = count;
1590
1591 md->pages++;
1592
1593 if (PageAnon(page))
1594 md->anon++;
1595
1596 md->node[page_to_nid(page)]++;
1597 cond_resched();
1598}
1599
1600int show_numa_map(struct seq_file *m, void *v)
1601{
1602 struct task_struct *task = m->private;
1603 struct vm_area_struct *vma = v;
1604 struct numa_maps *md;
1605 int n;
1606 char buffer[50];
1607
1608 if (!vma->vm_mm)
1609 return 0;
1610
1611 md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1612 if (!md)
1613 return 0;
1614
1615 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1616 &node_online_map, MPOL_MF_STATS, md);
1617
1618 if (md->pages) {
1619 mpol_to_str(buffer, sizeof(buffer),
1620 get_vma_policy(task, vma, vma->vm_start));
1621
1622 seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
1623 vma->vm_start, buffer, md->pages,
1624 md->mapped, md->mapcount_max);
1625
1626 if (md->anon)
1627 seq_printf(m," anon=%lu",md->anon);
1628
1629 for_each_online_node(n)
1630 if (md->node[n])
1631 seq_printf(m, " N%d=%lu", n, md->node[n]);
1632
1633 seq_putc(m, '\n');
1634 }
1635 kfree(md);
1636
1637 if (m->count < m->size)
1638 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1639 return 0;
1268} 1640}
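Putting gather_stats() and mpol_to_str() together, each line show_numa_map() emits to /proc/<pid>/numa_maps looks like the following (addresses and counts are made-up example values; the anon= and N<node>= fields only appear when non-zero):

	2aaaaac00000 interleave=0-1 pages=512 mapped=512 maxref=1 anon=512 N0=256 N1=256
	2aaaab000000 default pages=16 mapped=16 maxref=3 N0=16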
1641
diff --git a/mm/mlock.c b/mm/mlock.c
index 4ae3a46ff768..b90c59573abf 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -5,6 +5,7 @@
5 * (C) Copyright 2002 Christoph Hellwig 5 * (C) Copyright 2002 Christoph Hellwig
6 */ 6 */
7 7
8#include <linux/capability.h>
8#include <linux/mman.h> 9#include <linux/mman.h>
9#include <linux/mm.h> 10#include <linux/mm.h>
10#include <linux/mempolicy.h> 11#include <linux/mempolicy.h>
diff --git a/mm/mmap.c b/mm/mmap.c
index 64ba4dbcb7de..47556d2b3e90 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -13,6 +13,7 @@
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/swap.h> 14#include <linux/swap.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/capability.h>
16#include <linux/init.h> 17#include <linux/init.h>
17#include <linux/file.h> 18#include <linux/file.h>
18#include <linux/fs.h> 19#include <linux/fs.h>
diff --git a/mm/mremap.c b/mm/mremap.c
index ddaeee9a0b69..1903bdf65e42 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -13,6 +13,7 @@
13#include <linux/shm.h> 13#include <linux/shm.h>
14#include <linux/mman.h> 14#include <linux/mman.h>
15#include <linux/swap.h> 15#include <linux/swap.h>
16#include <linux/capability.h>
16#include <linux/fs.h> 17#include <linux/fs.h>
17#include <linux/highmem.h> 18#include <linux/highmem.h>
18#include <linux/security.h> 19#include <linux/security.h>
diff --git a/mm/msync.c b/mm/msync.c
index 1b5b6f662dcf..3563a56e1a51 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -137,7 +137,7 @@ static int msync_interval(struct vm_area_struct *vma,
137 ret = filemap_fdatawrite(mapping); 137 ret = filemap_fdatawrite(mapping);
138 if (file->f_op && file->f_op->fsync) { 138 if (file->f_op && file->f_op->fsync) {
139 /* 139 /*
140 * We don't take i_sem here because mmap_sem 140 * We don't take i_mutex here because mmap_sem
141 * is already held. 141 * is already held.
142 */ 142 */
143 err = file->f_op->fsync(file,file->f_dentry,1); 143 err = file->f_op->fsync(file,file->f_dentry,1);
diff --git a/mm/nommu.c b/mm/nommu.c
index c1196812876b..c10262d68232 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1177,3 +1177,10 @@ int in_gate_area_no_task(unsigned long addr)
1177{ 1177{
1178 return 0; 1178 return 0;
1179} 1179}
1180
1181struct page *filemap_nopage(struct vm_area_struct *area,
1182 unsigned long address, int *type)
1183{
1184 BUG();
1185 return NULL;
1186}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d348b9035955..4748b906aff2 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -298,7 +298,8 @@ retry:
298 298
299 /* 299 /*
300 * Give "p" a good chance of killing itself before we 300 * Give "p" a good chance of killing itself before we
301 * retry to allocate memory. 301 * retry to allocate memory, unless "p" is current.
302 */ 302 */
303 schedule_timeout_interruptible(1); 303 if (!test_thread_flag(TIF_MEMDIE))
304 schedule_timeout_interruptible(1);
304} 305}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0166ea15c9ee..5240e426c1f7 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -550,11 +550,17 @@ void __init page_writeback_init(void)
550 550
551int do_writepages(struct address_space *mapping, struct writeback_control *wbc) 551int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
552{ 552{
553 int ret;
554
553 if (wbc->nr_to_write <= 0) 555 if (wbc->nr_to_write <= 0)
554 return 0; 556 return 0;
557 wbc->for_writepages = 1;
555 if (mapping->a_ops->writepages) 558 if (mapping->a_ops->writepages)
556 return mapping->a_ops->writepages(mapping, wbc); 559 ret = mapping->a_ops->writepages(mapping, wbc);
557 return generic_writepages(mapping, wbc); 560 else
561 ret = generic_writepages(mapping, wbc);
562 wbc->for_writepages = 0;
563 return ret;
558} 564}
559 565
560/** 566/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fe14a8c87fc2..8c960b469593 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -36,6 +36,7 @@
36#include <linux/memory_hotplug.h> 36#include <linux/memory_hotplug.h>
37#include <linux/nodemask.h> 37#include <linux/nodemask.h>
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/mempolicy.h>
39 40
40#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
41#include "internal.h" 42#include "internal.h"
@@ -52,6 +53,9 @@ struct pglist_data *pgdat_list __read_mostly;
52unsigned long totalram_pages __read_mostly; 53unsigned long totalram_pages __read_mostly;
53unsigned long totalhigh_pages __read_mostly; 54unsigned long totalhigh_pages __read_mostly;
54long nr_swap_pages; 55long nr_swap_pages;
56int percpu_pagelist_fraction;
57
58static void fastcall free_hot_cold_page(struct page *page, int cold);
55 59
56/* 60/*
57 * results with 256, 32 in the lowmem_reserve sysctl: 61 * results with 256, 32 in the lowmem_reserve sysctl:
@@ -81,6 +85,7 @@ int min_free_kbytes = 1024;
81unsigned long __initdata nr_kernel_pages; 85unsigned long __initdata nr_kernel_pages;
82unsigned long __initdata nr_all_pages; 86unsigned long __initdata nr_all_pages;
83 87
88#ifdef CONFIG_DEBUG_VM
84static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 89static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
85{ 90{
86 int ret = 0; 91 int ret = 0;
@@ -122,16 +127,23 @@ static int bad_range(struct zone *zone, struct page *page)
122 return 0; 127 return 0;
123} 128}
124 129
125static void bad_page(const char *function, struct page *page) 130#else
131static inline int bad_range(struct zone *zone, struct page *page)
126{ 132{
127 printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", 133 return 0;
128 function, current->comm, page); 134}
129 printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 135#endif
130 (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, 136
131 page->mapping, page_mapcount(page), page_count(page)); 137static void bad_page(struct page *page)
132 printk(KERN_EMERG "Backtrace:\n"); 138{
139 printk(KERN_EMERG "Bad page state in process '%s'\n"
140 KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
141 KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
142 KERN_EMERG "Backtrace:\n",
143 current->comm, page, (int)(2*sizeof(unsigned long)),
144 (unsigned long)page->flags, page->mapping,
145 page_mapcount(page), page_count(page));
133 dump_stack(); 146 dump_stack();
134 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
135 page->flags &= ~(1 << PG_lru | 147 page->flags &= ~(1 << PG_lru |
136 1 << PG_private | 148 1 << PG_private |
137 1 << PG_locked | 149 1 << PG_locked |
@@ -184,19 +196,15 @@ static void destroy_compound_page(struct page *page, unsigned long order)
184 int i; 196 int i;
185 int nr_pages = 1 << order; 197 int nr_pages = 1 << order;
186 198
187 if (!PageCompound(page)) 199 if (unlikely(page[1].index != order))
188 return; 200 bad_page(page);
189
190 if (page[1].index != order)
191 bad_page(__FUNCTION__, page);
192 201
193 for (i = 0; i < nr_pages; i++) { 202 for (i = 0; i < nr_pages; i++) {
194 struct page *p = page + i; 203 struct page *p = page + i;
195 204
196 if (!PageCompound(p)) 205 if (unlikely(!PageCompound(p) |
197 bad_page(__FUNCTION__, page); 206 (page_private(p) != (unsigned long)page)))
198 if (page_private(p) != (unsigned long)page) 207 bad_page(page);
199 bad_page(__FUNCTION__, page);
200 ClearPageCompound(p); 208 ClearPageCompound(p);
201 } 209 }
202} 210}
@@ -255,14 +263,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
255/* 263/*
256 * This function checks whether a page is free && is the buddy 264 * This function checks whether a page is free && is the buddy
257 * we can coalesce a page and its buddy if 265 * we can coalesce a page and its buddy if
258 * (a) the buddy is free && 266 * (a) the buddy is not in a hole &&
259 * (b) the buddy is on the buddy system && 267 * (b) the buddy is free &&
260 * (c) a page and its buddy have the same order. 268 * (c) the buddy is on the buddy system &&
269 * (d) a page and its buddy have the same order.
261 * for recording page's order, we use page_private(page) and PG_private. 270 * for recording page's order, we use page_private(page) and PG_private.
262 * 271 *
263 */ 272 */
264static inline int page_is_buddy(struct page *page, int order) 273static inline int page_is_buddy(struct page *page, int order)
265{ 274{
275#ifdef CONFIG_HOLES_IN_ZONE
276 if (!pfn_valid(page_to_pfn(page)))
277 return 0;
278#endif
279
266 if (PagePrivate(page) && 280 if (PagePrivate(page) &&
267 (page_order(page) == order) && 281 (page_order(page) == order) &&
268 page_count(page) == 0) 282 page_count(page) == 0)
@@ -294,13 +308,13 @@ static inline int page_is_buddy(struct page *page, int order)
294 * -- wli 308 * -- wli
295 */ 309 */
296 310
297static inline void __free_pages_bulk (struct page *page, 311static inline void __free_one_page(struct page *page,
298 struct zone *zone, unsigned int order) 312 struct zone *zone, unsigned int order)
299{ 313{
300 unsigned long page_idx; 314 unsigned long page_idx;
301 int order_size = 1 << order; 315 int order_size = 1 << order;
302 316
303 if (unlikely(order)) 317 if (unlikely(PageCompound(page)))
304 destroy_compound_page(page, order); 318 destroy_compound_page(page, order);
305 319
306 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 320 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
@@ -314,17 +328,15 @@ static inline void __free_pages_bulk (struct page *page,
314 struct free_area *area; 328 struct free_area *area;
315 struct page *buddy; 329 struct page *buddy;
316 330
317 combined_idx = __find_combined_index(page_idx, order);
318 buddy = __page_find_buddy(page, page_idx, order); 331 buddy = __page_find_buddy(page, page_idx, order);
319
320 if (bad_range(zone, buddy))
321 break;
322 if (!page_is_buddy(buddy, order)) 332 if (!page_is_buddy(buddy, order))
323 break; /* Move the buddy up one level. */ 333 break; /* Move the buddy up one level. */
334
324 list_del(&buddy->lru); 335 list_del(&buddy->lru);
325 area = zone->free_area + order; 336 area = zone->free_area + order;
326 area->nr_free--; 337 area->nr_free--;
327 rmv_page_order(buddy); 338 rmv_page_order(buddy);
339 combined_idx = __find_combined_index(page_idx, order);
328 page = page + (combined_idx - page_idx); 340 page = page + (combined_idx - page_idx);
329 page_idx = combined_idx; 341 page_idx = combined_idx;
330 order++; 342 order++;
@@ -334,11 +346,11 @@ static inline void __free_pages_bulk (struct page *page,
334 zone->free_area[order].nr_free++; 346 zone->free_area[order].nr_free++;
335} 347}
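A worked example of the index arithmetic the coalescing loop above relies on, assuming the usual XOR-based __page_find_buddy() / __find_combined_index() helpers (they are not part of this hunk, so take the exact formulas as an assumption): the buddy of a block differs only in bit 'order', and the merged block starts at the index with that bit cleared.

#include <stdio.h>

int main(void)
{
	unsigned long page_idx = 12;			/* 0b1100 */
	unsigned int order = 2;				/* a 4-page block */

	unsigned long buddy_idx    = page_idx ^ (1UL << order);  /* 12 ^ 4  = 8 */
	unsigned long combined_idx = page_idx & ~(1UL << order); /* 12 & ~4 = 8 */

	printf("buddy=%lu, merged block starts at %lu\n", buddy_idx, combined_idx);
	return 0;
}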
336 348
337static inline int free_pages_check(const char *function, struct page *page) 349static inline int free_pages_check(struct page *page)
338{ 350{
339 if ( page_mapcount(page) || 351 if (unlikely(page_mapcount(page) |
340 page->mapping != NULL || 352 (page->mapping != NULL) |
341 page_count(page) != 0 || 353 (page_count(page) != 0) |
342 (page->flags & ( 354 (page->flags & (
343 1 << PG_lru | 355 1 << PG_lru |
344 1 << PG_private | 356 1 << PG_private |
@@ -348,8 +360,8 @@ static inline int free_pages_check(const char *function, struct page *page)
348 1 << PG_slab | 360 1 << PG_slab |
349 1 << PG_swapcache | 361 1 << PG_swapcache |
350 1 << PG_writeback | 362 1 << PG_writeback |
351 1 << PG_reserved ))) 363 1 << PG_reserved ))))
352 bad_page(function, page); 364 bad_page(page);
353 if (PageDirty(page)) 365 if (PageDirty(page))
354 __ClearPageDirty(page); 366 __ClearPageDirty(page);
355 /* 367 /*
@@ -371,51 +383,90 @@ static inline int free_pages_check(const char *function, struct page *page)
371 * And clear the zone's pages_scanned counter, to hold off the "all pages are 383 * And clear the zone's pages_scanned counter, to hold off the "all pages are
372 * pinned" detection logic. 384 * pinned" detection logic.
373 */ 385 */
374static int 386static void free_pages_bulk(struct zone *zone, int count,
375free_pages_bulk(struct zone *zone, int count, 387 struct list_head *list, int order)
376 struct list_head *list, unsigned int order)
377{ 388{
378 unsigned long flags; 389 spin_lock(&zone->lock);
379 struct page *page = NULL;
380 int ret = 0;
381
382 spin_lock_irqsave(&zone->lock, flags);
383 zone->all_unreclaimable = 0; 390 zone->all_unreclaimable = 0;
384 zone->pages_scanned = 0; 391 zone->pages_scanned = 0;
385 while (!list_empty(list) && count--) { 392 while (count--) {
393 struct page *page;
394
395 BUG_ON(list_empty(list));
386 page = list_entry(list->prev, struct page, lru); 396 page = list_entry(list->prev, struct page, lru);
387 /* have to delete it as __free_pages_bulk list manipulates */ 397 /* have to delete it as __free_one_page list manipulates */
388 list_del(&page->lru); 398 list_del(&page->lru);
389 __free_pages_bulk(page, zone, order); 399 __free_one_page(page, zone, order);
390 ret++;
391 } 400 }
392 spin_unlock_irqrestore(&zone->lock, flags); 401 spin_unlock(&zone->lock);
393 return ret;
394} 402}
395 403
396void __free_pages_ok(struct page *page, unsigned int order) 404static void free_one_page(struct zone *zone, struct page *page, int order)
397{ 405{
398 LIST_HEAD(list); 406 LIST_HEAD(list);
407 list_add(&page->lru, &list);
408 free_pages_bulk(zone, 1, &list, order);
409}
410
411static void __free_pages_ok(struct page *page, unsigned int order)
412{
413 unsigned long flags;
399 int i; 414 int i;
400 int reserved = 0; 415 int reserved = 0;
401 416
402 arch_free_page(page, order); 417 arch_free_page(page, order);
418 if (!PageHighMem(page))
419 mutex_debug_check_no_locks_freed(page_address(page),
420 PAGE_SIZE<<order);
403 421
404#ifndef CONFIG_MMU 422#ifndef CONFIG_MMU
405 if (order > 0) 423 for (i = 1 ; i < (1 << order) ; ++i)
406 for (i = 1 ; i < (1 << order) ; ++i) 424 __put_page(page + i);
407 __put_page(page + i);
408#endif 425#endif
409 426
410 for (i = 0 ; i < (1 << order) ; ++i) 427 for (i = 0 ; i < (1 << order) ; ++i)
411 reserved += free_pages_check(__FUNCTION__, page + i); 428 reserved += free_pages_check(page + i);
412 if (reserved) 429 if (reserved)
413 return; 430 return;
414 431
415 list_add(&page->lru, &list); 432 kernel_map_pages(page, 1 << order, 0);
416 mod_page_state(pgfree, 1 << order); 433 local_irq_save(flags);
417 kernel_map_pages(page, 1<<order, 0); 434 __mod_page_state(pgfree, 1 << order);
418 free_pages_bulk(page_zone(page), 1, &list, order); 435 free_one_page(page_zone(page), page, order);
436 local_irq_restore(flags);
437}
438
439/*
440 * permit the bootmem allocator to evade page validation on high-order frees
441 */
442void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
443{
444 if (order == 0) {
445 __ClearPageReserved(page);
446 set_page_count(page, 0);
447
448 free_hot_cold_page(page, 0);
449 } else {
450 LIST_HEAD(list);
451 int loop;
452
453 for (loop = 0; loop < BITS_PER_LONG; loop++) {
454 struct page *p = &page[loop];
455
456 if (loop + 16 < BITS_PER_LONG)
457 prefetchw(p + 16);
458 __ClearPageReserved(p);
459 set_page_count(p, 0);
460 }
461
462 arch_free_page(page, order);
463
464 mod_page_state(pgfree, 1 << order);
465
466 list_add(&page->lru, &list);
467 kernel_map_pages(page, 1 << order, 0);
468 free_pages_bulk(page_zone(page), 1, &list, order);
469 }
419} 470}
420 471
421 472
@@ -433,8 +484,7 @@ void __free_pages_ok(struct page *page, unsigned int order)
433 * 484 *
434 * -- wli 485 * -- wli
435 */ 486 */
436static inline struct page * 487static inline void expand(struct zone *zone, struct page *page,
437expand(struct zone *zone, struct page *page,
438 int low, int high, struct free_area *area) 488 int low, int high, struct free_area *area)
439{ 489{
440 unsigned long size = 1 << high; 490 unsigned long size = 1 << high;
@@ -448,24 +498,6 @@ expand(struct zone *zone, struct page *page,
448 area->nr_free++; 498 area->nr_free++;
449 set_page_order(&page[size], high); 499 set_page_order(&page[size], high);
450 } 500 }
451 return page;
452}
453
454void set_page_refs(struct page *page, int order)
455{
456#ifdef CONFIG_MMU
457 set_page_count(page, 1);
458#else
459 int i;
460
461 /*
462 * We need to reference all the pages for this order, otherwise if
463 * anyone accesses one of the pages with (get/put) it will be freed.
464 * - eg: access_process_vm()
465 */
466 for (i = 0; i < (1 << order); i++)
467 set_page_count(page + i, 1);
468#endif /* CONFIG_MMU */
469} 501}
470 502
471/* 503/*
@@ -473,9 +505,9 @@ void set_page_refs(struct page *page, int order)
473 */ 505 */
474static int prep_new_page(struct page *page, int order) 506static int prep_new_page(struct page *page, int order)
475{ 507{
476 if ( page_mapcount(page) || 508 if (unlikely(page_mapcount(page) |
477 page->mapping != NULL || 509 (page->mapping != NULL) |
478 page_count(page) != 0 || 510 (page_count(page) != 0) |
479 (page->flags & ( 511 (page->flags & (
480 1 << PG_lru | 512 1 << PG_lru |
481 1 << PG_private | 513 1 << PG_private |
@@ -486,8 +518,8 @@ static int prep_new_page(struct page *page, int order)
486 1 << PG_slab | 518 1 << PG_slab |
487 1 << PG_swapcache | 519 1 << PG_swapcache |
488 1 << PG_writeback | 520 1 << PG_writeback |
489 1 << PG_reserved ))) 521 1 << PG_reserved ))))
490 bad_page(__FUNCTION__, page); 522 bad_page(page);
491 523
492 /* 524 /*
493 * For now, we report if PG_reserved was found set, but do not 525 * For now, we report if PG_reserved was found set, but do not
@@ -525,7 +557,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
525 rmv_page_order(page); 557 rmv_page_order(page);
526 area->nr_free--; 558 area->nr_free--;
527 zone->free_pages -= 1UL << order; 559 zone->free_pages -= 1UL << order;
528 return expand(zone, page, order, current_order, area); 560 expand(zone, page, order, current_order, area);
561 return page;
529 } 562 }
530 563
531 return NULL; 564 return NULL;
@@ -539,21 +572,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
539static int rmqueue_bulk(struct zone *zone, unsigned int order, 572static int rmqueue_bulk(struct zone *zone, unsigned int order,
540 unsigned long count, struct list_head *list) 573 unsigned long count, struct list_head *list)
541{ 574{
542 unsigned long flags;
543 int i; 575 int i;
544 int allocated = 0;
545 struct page *page;
546 576
547 spin_lock_irqsave(&zone->lock, flags); 577 spin_lock(&zone->lock);
548 for (i = 0; i < count; ++i) { 578 for (i = 0; i < count; ++i) {
549 page = __rmqueue(zone, order); 579 struct page *page = __rmqueue(zone, order);
550 if (page == NULL) 580 if (unlikely(page == NULL))
551 break; 581 break;
552 allocated++;
553 list_add_tail(&page->lru, list); 582 list_add_tail(&page->lru, list);
554 } 583 }
555 spin_unlock_irqrestore(&zone->lock, flags); 584 spin_unlock(&zone->lock);
556 return allocated; 585 return i;
557} 586}
558 587
559#ifdef CONFIG_NUMA 588#ifdef CONFIG_NUMA
@@ -572,14 +601,13 @@ void drain_remote_pages(void)
572 if (zone->zone_pgdat->node_id == numa_node_id()) 601 if (zone->zone_pgdat->node_id == numa_node_id())
573 continue; 602 continue;
574 603
575 pset = zone->pageset[smp_processor_id()]; 604 pset = zone_pcp(zone, smp_processor_id());
576 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 605 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
577 struct per_cpu_pages *pcp; 606 struct per_cpu_pages *pcp;
578 607
579 pcp = &pset->pcp[i]; 608 pcp = &pset->pcp[i];
580 if (pcp->count) 609 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
581 pcp->count -= free_pages_bulk(zone, pcp->count, 610 pcp->count = 0;
582 &pcp->list, 0);
583 } 611 }
584 } 612 }
585 local_irq_restore(flags); 613 local_irq_restore(flags);
@@ -589,6 +617,7 @@ void drain_remote_pages(void)
589#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 617#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
590static void __drain_pages(unsigned int cpu) 618static void __drain_pages(unsigned int cpu)
591{ 619{
620 unsigned long flags;
592 struct zone *zone; 621 struct zone *zone;
593 int i; 622 int i;
594 623
@@ -600,8 +629,10 @@ static void __drain_pages(unsigned int cpu)
600 struct per_cpu_pages *pcp; 629 struct per_cpu_pages *pcp;
601 630
602 pcp = &pset->pcp[i]; 631 pcp = &pset->pcp[i];
603 pcp->count -= free_pages_bulk(zone, pcp->count, 632 local_irq_save(flags);
604 &pcp->list, 0); 633 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
634 pcp->count = 0;
635 local_irq_restore(flags);
605 } 636 }
606 } 637 }
607} 638}
@@ -647,18 +678,14 @@ void drain_local_pages(void)
647} 678}
648#endif /* CONFIG_PM */ 679#endif /* CONFIG_PM */
649 680
650static void zone_statistics(struct zonelist *zonelist, struct zone *z) 681static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
651{ 682{
652#ifdef CONFIG_NUMA 683#ifdef CONFIG_NUMA
653 unsigned long flags;
654 int cpu;
655 pg_data_t *pg = z->zone_pgdat; 684 pg_data_t *pg = z->zone_pgdat;
656 pg_data_t *orig = zonelist->zones[0]->zone_pgdat; 685 pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
657 struct per_cpu_pageset *p; 686 struct per_cpu_pageset *p;
658 687
659 local_irq_save(flags); 688 p = zone_pcp(z, cpu);
660 cpu = smp_processor_id();
661 p = zone_pcp(z,cpu);
662 if (pg == orig) { 689 if (pg == orig) {
663 p->numa_hit++; 690 p->numa_hit++;
664 } else { 691 } else {
@@ -669,14 +696,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
669 p->local_node++; 696 p->local_node++;
670 else 697 else
671 p->other_node++; 698 p->other_node++;
672 local_irq_restore(flags);
673#endif 699#endif
674} 700}
675 701
676/* 702/*
677 * Free a 0-order page 703 * Free a 0-order page
678 */ 704 */
679static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
680static void fastcall free_hot_cold_page(struct page *page, int cold) 705static void fastcall free_hot_cold_page(struct page *page, int cold)
681{ 706{
682 struct zone *zone = page_zone(page); 707 struct zone *zone = page_zone(page);
@@ -687,18 +712,20 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
687 712
688 if (PageAnon(page)) 713 if (PageAnon(page))
689 page->mapping = NULL; 714 page->mapping = NULL;
690 if (free_pages_check(__FUNCTION__, page)) 715 if (free_pages_check(page))
691 return; 716 return;
692 717
693 inc_page_state(pgfree);
694 kernel_map_pages(page, 1, 0); 718 kernel_map_pages(page, 1, 0);
695 719
696 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 720 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
697 local_irq_save(flags); 721 local_irq_save(flags);
722 __inc_page_state(pgfree);
698 list_add(&page->lru, &pcp->list); 723 list_add(&page->lru, &pcp->list);
699 pcp->count++; 724 pcp->count++;
700 if (pcp->count >= pcp->high) 725 if (pcp->count >= pcp->high) {
701 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 726 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
727 pcp->count -= pcp->batch;
728 }
702 local_irq_restore(flags); 729 local_irq_restore(flags);
703 put_cpu(); 730 put_cpu();
704} 731}
@@ -727,49 +754,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
727 * we cheat by calling it from here, in the order > 0 path. Saves a branch 754 * we cheat by calling it from here, in the order > 0 path. Saves a branch
728 * or two. 755 * or two.
729 */ 756 */
730static struct page * 757static struct page *buffered_rmqueue(struct zonelist *zonelist,
731buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) 758 struct zone *zone, int order, gfp_t gfp_flags)
732{ 759{
733 unsigned long flags; 760 unsigned long flags;
734 struct page *page; 761 struct page *page;
735 int cold = !!(gfp_flags & __GFP_COLD); 762 int cold = !!(gfp_flags & __GFP_COLD);
763 int cpu;
736 764
737again: 765again:
738 if (order == 0) { 766 cpu = get_cpu();
767 if (likely(order == 0)) {
739 struct per_cpu_pages *pcp; 768 struct per_cpu_pages *pcp;
740 769
741 page = NULL; 770 pcp = &zone_pcp(zone, cpu)->pcp[cold];
742 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
743 local_irq_save(flags); 771 local_irq_save(flags);
744 if (pcp->count <= pcp->low) 772 if (!pcp->count) {
745 pcp->count += rmqueue_bulk(zone, 0, 773 pcp->count += rmqueue_bulk(zone, 0,
746 pcp->batch, &pcp->list); 774 pcp->batch, &pcp->list);
747 if (pcp->count) { 775 if (unlikely(!pcp->count))
748 page = list_entry(pcp->list.next, struct page, lru); 776 goto failed;
749 list_del(&page->lru);
750 pcp->count--;
751 } 777 }
752 local_irq_restore(flags); 778 page = list_entry(pcp->list.next, struct page, lru);
753 put_cpu(); 779 list_del(&page->lru);
780 pcp->count--;
754 } else { 781 } else {
755 spin_lock_irqsave(&zone->lock, flags); 782 spin_lock_irqsave(&zone->lock, flags);
756 page = __rmqueue(zone, order); 783 page = __rmqueue(zone, order);
757 spin_unlock_irqrestore(&zone->lock, flags); 784 spin_unlock(&zone->lock);
785 if (!page)
786 goto failed;
758 } 787 }
759 788
760 if (page != NULL) { 789 __mod_page_state_zone(zone, pgalloc, 1 << order);
761 BUG_ON(bad_range(zone, page)); 790 zone_statistics(zonelist, zone, cpu);
762 mod_page_state_zone(zone, pgalloc, 1 << order); 791 local_irq_restore(flags);
763 if (prep_new_page(page, order)) 792 put_cpu();
764 goto again;
765 793
766 if (gfp_flags & __GFP_ZERO) 794 BUG_ON(bad_range(zone, page));
767 prep_zero_page(page, order, gfp_flags); 795 if (prep_new_page(page, order))
796 goto again;
768 797
769 if (order && (gfp_flags & __GFP_COMP)) 798 if (gfp_flags & __GFP_ZERO)
770 prep_compound_page(page, order); 799 prep_zero_page(page, order, gfp_flags);
771 } 800
801 if (order && (gfp_flags & __GFP_COMP))
802 prep_compound_page(page, order);
772 return page; 803 return page;
804
805failed:
806 local_irq_restore(flags);
807 put_cpu();
808 return NULL;
773} 809}
774 810
775#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 811#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
@@ -845,9 +881,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
845 continue; 881 continue;
846 } 882 }
847 883
848 page = buffered_rmqueue(*z, order, gfp_mask); 884 page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
849 if (page) { 885 if (page) {
850 zone_statistics(zonelist, *z);
851 break; 886 break;
852 } 887 }
853 } while (*(++z) != NULL); 888 } while (*(++z) != NULL);
@@ -896,15 +931,15 @@ restart:
896 * 931 *
897 * The caller may dip into page reserves a bit more if the caller 932 * The caller may dip into page reserves a bit more if the caller
898 * cannot run direct reclaim, or if the caller has realtime scheduling 933 * cannot run direct reclaim, or if the caller has realtime scheduling
899 * policy. 934 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
935 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
900 */ 936 */
901 alloc_flags = ALLOC_WMARK_MIN; 937 alloc_flags = ALLOC_WMARK_MIN;
902 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) 938 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
903 alloc_flags |= ALLOC_HARDER; 939 alloc_flags |= ALLOC_HARDER;
904 if (gfp_mask & __GFP_HIGH) 940 if (gfp_mask & __GFP_HIGH)
905 alloc_flags |= ALLOC_HIGH; 941 alloc_flags |= ALLOC_HIGH;
906 if (wait) 942 alloc_flags |= ALLOC_CPUSET;
907 alloc_flags |= ALLOC_CPUSET;
908 943
909 /* 944 /*
910 * Go through the zonelist again. Let __GFP_HIGH and allocations 945 * Go through the zonelist again. Let __GFP_HIGH and allocations
@@ -926,7 +961,7 @@ restart:
926nofail_alloc: 961nofail_alloc:
927 /* go through the zonelist yet again, ignoring mins */ 962 /* go through the zonelist yet again, ignoring mins */
928 page = get_page_from_freelist(gfp_mask, order, 963 page = get_page_from_freelist(gfp_mask, order,
929 zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET); 964 zonelist, ALLOC_NO_WATERMARKS);
930 if (page) 965 if (page)
931 goto got_pg; 966 goto got_pg;
932 if (gfp_mask & __GFP_NOFAIL) { 967 if (gfp_mask & __GFP_NOFAIL) {
@@ -945,6 +980,7 @@ rebalance:
945 cond_resched(); 980 cond_resched();
946 981
947 /* We now go into synchronous reclaim */ 982 /* We now go into synchronous reclaim */
983 cpuset_memory_pressure_bump();
948 p->flags |= PF_MEMALLOC; 984 p->flags |= PF_MEMALLOC;
949 reclaim_state.reclaimed_slab = 0; 985 reclaim_state.reclaimed_slab = 0;
950 p->reclaim_state = &reclaim_state; 986 p->reclaim_state = &reclaim_state;
@@ -1171,7 +1207,7 @@ EXPORT_SYMBOL(nr_pagecache);
1171DEFINE_PER_CPU(long, nr_pagecache_local) = 0; 1207DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1172#endif 1208#endif
1173 1209
1174void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) 1210static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1175{ 1211{
1176 int cpu = 0; 1212 int cpu = 0;
1177 1213
@@ -1224,7 +1260,7 @@ void get_full_page_state(struct page_state *ret)
1224 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); 1260 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
1225} 1261}
1226 1262
1227unsigned long __read_page_state(unsigned long offset) 1263unsigned long read_page_state_offset(unsigned long offset)
1228{ 1264{
1229 unsigned long ret = 0; 1265 unsigned long ret = 0;
1230 int cpu; 1266 int cpu;
@@ -1238,18 +1274,26 @@ unsigned long __read_page_state(unsigned long offset)
1238 return ret; 1274 return ret;
1239} 1275}
1240 1276
1241void __mod_page_state(unsigned long offset, unsigned long delta) 1277void __mod_page_state_offset(unsigned long offset, unsigned long delta)
1278{
1279 void *ptr;
1280
1281 ptr = &__get_cpu_var(page_states);
1282 *(unsigned long *)(ptr + offset) += delta;
1283}
1284EXPORT_SYMBOL(__mod_page_state_offset);
1285
1286void mod_page_state_offset(unsigned long offset, unsigned long delta)
1242{ 1287{
1243 unsigned long flags; 1288 unsigned long flags;
1244 void* ptr; 1289 void *ptr;
1245 1290
1246 local_irq_save(flags); 1291 local_irq_save(flags);
1247 ptr = &__get_cpu_var(page_states); 1292 ptr = &__get_cpu_var(page_states);
1248 *(unsigned long*)(ptr + offset) += delta; 1293 *(unsigned long *)(ptr + offset) += delta;
1249 local_irq_restore(flags); 1294 local_irq_restore(flags);
1250} 1295}
1251 1296EXPORT_SYMBOL(mod_page_state_offset);
1252EXPORT_SYMBOL(__mod_page_state);
1253 1297
1254void __get_zone_counts(unsigned long *active, unsigned long *inactive, 1298void __get_zone_counts(unsigned long *active, unsigned long *inactive,
1255 unsigned long *free, struct pglist_data *pgdat) 1299 unsigned long *free, struct pglist_data *pgdat)
@@ -1335,7 +1379,7 @@ void show_free_areas(void)
1335 show_node(zone); 1379 show_node(zone);
1336 printk("%s per-cpu:", zone->name); 1380 printk("%s per-cpu:", zone->name);
1337 1381
1338 if (!zone->present_pages) { 1382 if (!populated_zone(zone)) {
1339 printk(" empty\n"); 1383 printk(" empty\n");
1340 continue; 1384 continue;
1341 } else 1385 } else
@@ -1347,10 +1391,9 @@ void show_free_areas(void)
1347 pageset = zone_pcp(zone, cpu); 1391 pageset = zone_pcp(zone, cpu);
1348 1392
1349 for (temperature = 0; temperature < 2; temperature++) 1393 for (temperature = 0; temperature < 2; temperature++)
1350 printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", 1394 printk("cpu %d %s: high %d, batch %d used:%d\n",
1351 cpu, 1395 cpu,
1352 temperature ? "cold" : "hot", 1396 temperature ? "cold" : "hot",
1353 pageset->pcp[temperature].low,
1354 pageset->pcp[temperature].high, 1397 pageset->pcp[temperature].high,
1355 pageset->pcp[temperature].batch, 1398 pageset->pcp[temperature].batch,
1356 pageset->pcp[temperature].count); 1399 pageset->pcp[temperature].count);
@@ -1413,7 +1456,7 @@ void show_free_areas(void)
1413 1456
1414 show_node(zone); 1457 show_node(zone);
1415 printk("%s: ", zone->name); 1458 printk("%s: ", zone->name);
1416 if (!zone->present_pages) { 1459 if (!populated_zone(zone)) {
1417 printk("empty\n"); 1460 printk("empty\n");
1418 continue; 1461 continue;
1419 } 1462 }
@@ -1433,36 +1476,29 @@ void show_free_areas(void)
1433 1476
1434/* 1477/*
1435 * Builds allocation fallback zone lists. 1478 * Builds allocation fallback zone lists.
1479 *
1480 * Add all populated zones of a node to the zonelist.
1436 */ 1481 */
1437static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) 1482static int __init build_zonelists_node(pg_data_t *pgdat,
1438{ 1483 struct zonelist *zonelist, int nr_zones, int zone_type)
1439 switch (k) { 1484{
1440 struct zone *zone; 1485 struct zone *zone;
1441 default: 1486
1442 BUG(); 1487 BUG_ON(zone_type > ZONE_HIGHMEM);
1443 case ZONE_HIGHMEM: 1488
1444 zone = pgdat->node_zones + ZONE_HIGHMEM; 1489 do {
1445 if (zone->present_pages) { 1490 zone = pgdat->node_zones + zone_type;
1491 if (populated_zone(zone)) {
1446#ifndef CONFIG_HIGHMEM 1492#ifndef CONFIG_HIGHMEM
1447 BUG(); 1493 BUG_ON(zone_type > ZONE_NORMAL);
1448#endif 1494#endif
1449 zonelist->zones[j++] = zone; 1495 zonelist->zones[nr_zones++] = zone;
1496 check_highest_zone(zone_type);
1450 } 1497 }
1451 case ZONE_NORMAL: 1498 zone_type--;
1452 zone = pgdat->node_zones + ZONE_NORMAL;
1453 if (zone->present_pages)
1454 zonelist->zones[j++] = zone;
1455 case ZONE_DMA32:
1456 zone = pgdat->node_zones + ZONE_DMA32;
1457 if (zone->present_pages)
1458 zonelist->zones[j++] = zone;
1459 case ZONE_DMA:
1460 zone = pgdat->node_zones + ZONE_DMA;
1461 if (zone->present_pages)
1462 zonelist->zones[j++] = zone;
1463 }
1464 1499
1465 return j; 1500 } while (zone_type >= 0);
1501 return nr_zones;
1466} 1502}
1467 1503
1468static inline int highest_zone(int zone_bits) 1504static inline int highest_zone(int zone_bits)
@@ -1706,11 +1742,9 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1706 unsigned long end_pfn = start_pfn + size; 1742 unsigned long end_pfn = start_pfn + size;
1707 unsigned long pfn; 1743 unsigned long pfn;
1708 1744
1709 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { 1745 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1710 if (!early_pfn_valid(pfn)) 1746 if (!early_pfn_valid(pfn))
1711 continue; 1747 continue;
1712 if (!early_pfn_in_nid(pfn, nid))
1713 continue;
1714 page = pfn_to_page(pfn); 1748 page = pfn_to_page(pfn);
1715 set_page_links(page, zone, nid, pfn); 1749 set_page_links(page, zone, nid, pfn);
1716 set_page_count(page, 1); 1750 set_page_count(page, 1);
@@ -1794,19 +1828,35 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1794 1828
1795 pcp = &p->pcp[0]; /* hot */ 1829 pcp = &p->pcp[0]; /* hot */
1796 pcp->count = 0; 1830 pcp->count = 0;
1797 pcp->low = 0;
1798 pcp->high = 6 * batch; 1831 pcp->high = 6 * batch;
1799 pcp->batch = max(1UL, 1 * batch); 1832 pcp->batch = max(1UL, 1 * batch);
1800 INIT_LIST_HEAD(&pcp->list); 1833 INIT_LIST_HEAD(&pcp->list);
1801 1834
1802 pcp = &p->pcp[1]; /* cold*/ 1835 pcp = &p->pcp[1]; /* cold*/
1803 pcp->count = 0; 1836 pcp->count = 0;
1804 pcp->low = 0;
1805 pcp->high = 2 * batch; 1837 pcp->high = 2 * batch;
1806 pcp->batch = max(1UL, batch/2); 1838 pcp->batch = max(1UL, batch/2);
1807 INIT_LIST_HEAD(&pcp->list); 1839 INIT_LIST_HEAD(&pcp->list);
1808} 1840}
1809 1841
1842/*
1843 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
1844 * to the value high for the pageset p.
1845 */
1846
1847static void setup_pagelist_highmark(struct per_cpu_pageset *p,
1848 unsigned long high)
1849{
1850 struct per_cpu_pages *pcp;
1851
1852 pcp = &p->pcp[0]; /* hot list */
1853 pcp->high = high;
1854 pcp->batch = max(1UL, high/4);
1855 if ((high/4) > (PAGE_SHIFT * 8))
1856 pcp->batch = PAGE_SHIFT * 8;
1857}
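A worked example of the clamping above, assuming PAGE_SHIFT == 12 (4 KiB pages) and the percpu_pagelist_fraction sysctl set to 8 on a zone of 262144 pages (1 GiB): high becomes 32768, and since high/4 = 8192 exceeds PAGE_SHIFT * 8 = 96, the batch is clamped to 96.

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	unsigned long present_pages = 262144;		/* 1 GiB of 4 KiB pages */
	unsigned long fraction = 8;			/* percpu_pagelist_fraction */
	unsigned long high  = present_pages / fraction;	/* 32768 */
	unsigned long batch = high / 4 ? high / 4 : 1;	/* max(1, high/4) */

	if (high / 4 > PAGE_SHIFT * 8)
		batch = PAGE_SHIFT * 8;			/* clamped to 96 */

	printf("high=%lu batch=%lu\n", high, batch);
	return 0;
}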
1858
1859
1810#ifdef CONFIG_NUMA 1860#ifdef CONFIG_NUMA
1811/* 1861/*
1812 * Boot pageset table. One per cpu which is going to be used for all 1862 * Boot pageset table. One per cpu which is going to be used for all
@@ -1838,12 +1888,16 @@ static int __devinit process_zones(int cpu)
1838 1888
1839 for_each_zone(zone) { 1889 for_each_zone(zone) {
1840 1890
1841 zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), 1891 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
1842 GFP_KERNEL, cpu_to_node(cpu)); 1892 GFP_KERNEL, cpu_to_node(cpu));
1843 if (!zone->pageset[cpu]) 1893 if (!zone_pcp(zone, cpu))
1844 goto bad; 1894 goto bad;
1845 1895
1846 setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); 1896 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
1897
1898 if (percpu_pagelist_fraction)
1899 setup_pagelist_highmark(zone_pcp(zone, cpu),
1900 (zone->present_pages / percpu_pagelist_fraction));
1847 } 1901 }
1848 1902
1849 return 0; 1903 return 0;
@@ -1851,15 +1905,14 @@ bad:
1851 for_each_zone(dzone) { 1905 for_each_zone(dzone) {
1852 if (dzone == zone) 1906 if (dzone == zone)
1853 break; 1907 break;
1854 kfree(dzone->pageset[cpu]); 1908 kfree(zone_pcp(dzone, cpu));
1855 dzone->pageset[cpu] = NULL; 1909 zone_pcp(dzone, cpu) = NULL;
1856 } 1910 }
1857 return -ENOMEM; 1911 return -ENOMEM;
1858} 1912}
1859 1913
1860static inline void free_zone_pagesets(int cpu) 1914static inline void free_zone_pagesets(int cpu)
1861{ 1915{
1862#ifdef CONFIG_NUMA
1863 struct zone *zone; 1916 struct zone *zone;
1864 1917
1865 for_each_zone(zone) { 1918 for_each_zone(zone) {
@@ -1868,7 +1921,6 @@ static inline void free_zone_pagesets(int cpu)
1868 zone_pcp(zone, cpu) = NULL; 1921 zone_pcp(zone, cpu) = NULL;
1869 kfree(pset); 1922 kfree(pset);
1870 } 1923 }
1871#endif
1872} 1924}
1873 1925
1874static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, 1926static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
@@ -1939,7 +1991,7 @@ static __devinit void zone_pcp_init(struct zone *zone)
1939 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1991 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1940#ifdef CONFIG_NUMA 1992#ifdef CONFIG_NUMA
1941 /* Early boot. Slab allocator not functional yet */ 1993 /* Early boot. Slab allocator not functional yet */
1942 zone->pageset[cpu] = &boot_pageset[cpu]; 1994 zone_pcp(zone, cpu) = &boot_pageset[cpu];
1943 setup_pageset(&boot_pageset[cpu],0); 1995 setup_pageset(&boot_pageset[cpu],0);
1944#else 1996#else
1945 setup_pageset(zone_pcp(zone,cpu), batch); 1997 setup_pageset(zone_pcp(zone,cpu), batch);
@@ -2116,7 +2168,7 @@ static int frag_show(struct seq_file *m, void *arg)
2116 int order; 2168 int order;
2117 2169
2118 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 2170 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2119 if (!zone->present_pages) 2171 if (!populated_zone(zone))
2120 continue; 2172 continue;
2121 2173
2122 spin_lock_irqsave(&zone->lock, flags); 2174 spin_lock_irqsave(&zone->lock, flags);
@@ -2149,7 +2201,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2149 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { 2201 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
2150 int i; 2202 int i;
2151 2203
2152 if (!zone->present_pages) 2204 if (!populated_zone(zone))
2153 continue; 2205 continue;
2154 2206
2155 spin_lock_irqsave(&zone->lock, flags); 2207 spin_lock_irqsave(&zone->lock, flags);
@@ -2182,7 +2234,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2182 seq_printf(m, 2234 seq_printf(m,
2183 ")" 2235 ")"
2184 "\n pagesets"); 2236 "\n pagesets");
2185 for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { 2237 for_each_online_cpu(i) {
2186 struct per_cpu_pageset *pageset; 2238 struct per_cpu_pageset *pageset;
2187 int j; 2239 int j;
2188 2240
@@ -2197,12 +2249,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2197 seq_printf(m, 2249 seq_printf(m,
2198 "\n cpu: %i pcp: %i" 2250 "\n cpu: %i pcp: %i"
2199 "\n count: %i" 2251 "\n count: %i"
2200 "\n low: %i"
2201 "\n high: %i" 2252 "\n high: %i"
2202 "\n batch: %i", 2253 "\n batch: %i",
2203 i, j, 2254 i, j,
2204 pageset->pcp[j].count, 2255 pageset->pcp[j].count,
2205 pageset->pcp[j].low,
2206 pageset->pcp[j].high, 2256 pageset->pcp[j].high,
2207 pageset->pcp[j].batch); 2257 pageset->pcp[j].batch);
2208 } 2258 }
@@ -2257,32 +2307,40 @@ static char *vmstat_text[] = {
2257 "pgpgout", 2307 "pgpgout",
2258 "pswpin", 2308 "pswpin",
2259 "pswpout", 2309 "pswpout",
2260 "pgalloc_high",
2261 2310
2311 "pgalloc_high",
2262 "pgalloc_normal", 2312 "pgalloc_normal",
2313 "pgalloc_dma32",
2263 "pgalloc_dma", 2314 "pgalloc_dma",
2315
2264 "pgfree", 2316 "pgfree",
2265 "pgactivate", 2317 "pgactivate",
2266 "pgdeactivate", 2318 "pgdeactivate",
2267 2319
2268 "pgfault", 2320 "pgfault",
2269 "pgmajfault", 2321 "pgmajfault",
2322
2270 "pgrefill_high", 2323 "pgrefill_high",
2271 "pgrefill_normal", 2324 "pgrefill_normal",
2325 "pgrefill_dma32",
2272 "pgrefill_dma", 2326 "pgrefill_dma",
2273 2327
2274 "pgsteal_high", 2328 "pgsteal_high",
2275 "pgsteal_normal", 2329 "pgsteal_normal",
2330 "pgsteal_dma32",
2276 "pgsteal_dma", 2331 "pgsteal_dma",
2332
2277 "pgscan_kswapd_high", 2333 "pgscan_kswapd_high",
2278 "pgscan_kswapd_normal", 2334 "pgscan_kswapd_normal",
2279 2335 "pgscan_kswapd_dma32",
2280 "pgscan_kswapd_dma", 2336 "pgscan_kswapd_dma",
2337
2281 "pgscan_direct_high", 2338 "pgscan_direct_high",
2282 "pgscan_direct_normal", 2339 "pgscan_direct_normal",
2340 "pgscan_direct_dma32",
2283 "pgscan_direct_dma", 2341 "pgscan_direct_dma",
2284 "pginodesteal",
2285 2342
2343 "pginodesteal",
2286 "slabs_scanned", 2344 "slabs_scanned",
2287 "kswapd_steal", 2345 "kswapd_steal",
2288 "kswapd_inodesteal", 2346 "kswapd_inodesteal",
@@ -2539,6 +2597,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
2539 return 0; 2597 return 0;
2540} 2598}
2541 2599
2600/*
2601 * percpu_pagelist_fraction - changes pcp->high for each zone on each cpu.
2602 * It is the fraction of the total pages in each zone that a hot per-cpu
2603 * pagelist may hold before it is flushed back to the buddy allocator.
2604 */
2605
2606int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
2607 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2608{
2609 struct zone *zone;
2610 unsigned int cpu;
2611 int ret;
2612
2613 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2614 if (!write || (ret == -EINVAL))
2615 return ret;
2616 for_each_zone(zone) {
2617 for_each_online_cpu(cpu) {
2618 unsigned long high;
2619 high = zone->present_pages / percpu_pagelist_fraction;
2620 setup_pagelist_highmark(zone_pcp(zone, cpu), high);
2621 }
2622 }
2623 return 0;
2624}
2625
2542__initdata int hashdist = HASHDIST_DEFAULT; 2626__initdata int hashdist = HASHDIST_DEFAULT;
2543 2627
2544#ifdef CONFIG_NUMA 2628#ifdef CONFIG_NUMA
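
The handler above recomputes every zone's hot high mark whenever the tunable is written. Assuming it is exposed as /proc/sys/vm/percpu_pagelist_fraction (the sysctl table entry itself is not part of this hunk), a trivial program to set it could look like this:

    #include <stdio.h>

    /* Assumed path; adjust if the knob is registered elsewhere. */
    #define PCP_FRACTION "/proc/sys/vm/percpu_pagelist_fraction"

    int main(int argc, char **argv)
    {
        const char *val = (argc > 1) ? argv[1] : "8";
        FILE *f = fopen(PCP_FRACTION, "w");

        if (!f) {
            perror(PCP_FRACTION);
            return 1;
        }
        /* Writing N asks for pcp->high = zone->present_pages / N on every
         * online CPU, applied immediately by the handler above. */
        fprintf(f, "%s\n", val);
        return fclose(f) ? 1 : 0;
    }

Writing 8, for example, lets each hot per-cpu list grow to one eighth of its zone before pages are handed back.
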
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 52822c98c489..c4b6d0afd736 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -90,7 +90,7 @@ struct pdflush_work {
90 90
91static int __pdflush(struct pdflush_work *my_work) 91static int __pdflush(struct pdflush_work *my_work)
92{ 92{
93 current->flags |= PF_FLUSHER; 93 current->flags |= PF_FLUSHER | PF_SWAPWRITE;
94 my_work->fn = NULL; 94 my_work->fn = NULL;
95 my_work->who = current; 95 my_work->who = current;
96 INIT_LIST_HEAD(&my_work->list); 96 INIT_LIST_HEAD(&my_work->list);
diff --git a/mm/readahead.c b/mm/readahead.c
index 72e7adbb87c7..8d6eeaaa6296 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -158,7 +158,7 @@ static int read_pages(struct address_space *mapping, struct file *filp,
158{ 158{
159 unsigned page_idx; 159 unsigned page_idx;
160 struct pagevec lru_pvec; 160 struct pagevec lru_pvec;
161 int ret = 0; 161 int ret;
162 162
163 if (mapping->a_ops->readpages) { 163 if (mapping->a_ops->readpages) {
164 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages); 164 ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
@@ -171,14 +171,17 @@ static int read_pages(struct address_space *mapping, struct file *filp,
171 list_del(&page->lru); 171 list_del(&page->lru);
172 if (!add_to_page_cache(page, mapping, 172 if (!add_to_page_cache(page, mapping,
173 page->index, GFP_KERNEL)) { 173 page->index, GFP_KERNEL)) {
174 mapping->a_ops->readpage(filp, page); 174 ret = mapping->a_ops->readpage(filp, page);
175 if (!pagevec_add(&lru_pvec, page)) 175 if (ret != AOP_TRUNCATED_PAGE) {
176 __pagevec_lru_add(&lru_pvec); 176 if (!pagevec_add(&lru_pvec, page))
177 } else { 177 __pagevec_lru_add(&lru_pvec);
178 page_cache_release(page); 178 continue;
179 } /* else fall through to release */
179 } 180 }
181 page_cache_release(page);
180 } 182 }
181 pagevec_lru_add(&lru_pvec); 183 pagevec_lru_add(&lru_pvec);
184 ret = 0;
182out: 185out:
183 return ret; 186 return ret;
184} 187}
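
The interesting part of the read_pages() change is the control flow: when ->readpage() reports AOP_TRUNCATED_PAGE the page is no longer queued on the LRU pagevec but falls through to page_cache_release(). A toy user-space sketch of that shape (the constant's value and the readpage stub are invented for the demo):

    #include <stdio.h>

    #define AOP_TRUNCATED_PAGE 0x80001           /* local stand-in; only its role matters */

    /* Toy ->readpage(): pretend every third page was truncated under us. */
    static int toy_readpage(int idx)
    {
        return (idx % 3 == 2) ? AOP_TRUNCATED_PAGE : 0;
    }

    int main(void)
    {
        int idx, on_lru = 0, released = 0;

        for (idx = 0; idx < 9; idx++) {
            if (toy_readpage(idx) != AOP_TRUNCATED_PAGE) {
                on_lru++;                        /* pagevec_add()/__pagevec_lru_add() path */
                continue;
            }
            released++;                          /* fall through: drop our reference */
        }
        printf("%d pages on LRU, %d released after truncation\n", on_lru, released);
        return 0;
    }
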
diff --git a/mm/rmap.c b/mm/rmap.c
index f853c6def159..dfbb89f99a15 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -20,13 +20,13 @@
20/* 20/*
21 * Lock ordering in mm: 21 * Lock ordering in mm:
22 * 22 *
23 * inode->i_sem (while writing or truncating, not reading or faulting) 23 * inode->i_mutex (while writing or truncating, not reading or faulting)
24 * inode->i_alloc_sem 24 * inode->i_alloc_sem
25 * 25 *
26 * When a page fault occurs in writing from user to file, down_read 26 * When a page fault occurs in writing from user to file, down_read
27 * of mmap_sem nests within i_sem; in sys_msync, i_sem nests within 27 * of mmap_sem nests within i_mutex; in sys_msync, i_mutex nests within
28 * down_read of mmap_sem; i_sem and down_write of mmap_sem are never 28 * down_read of mmap_sem; i_mutex and down_write of mmap_sem are never
29 * taken together; in truncation, i_sem is taken outermost. 29 * taken together; in truncation, i_mutex is taken outermost.
30 * 30 *
31 * mm->mmap_sem 31 * mm->mmap_sem
32 * page->flags PG_locked (lock_page) 32 * page->flags PG_locked (lock_page)
@@ -435,6 +435,30 @@ int page_referenced(struct page *page, int is_locked)
435} 435}
436 436
437/** 437/**
 438 * __page_set_anon_rmap - set up new anonymous rmap
439 * @page: the page to add the mapping to
440 * @vma: the vm area in which the mapping is added
441 * @address: the user virtual address mapped
442 */
443static void __page_set_anon_rmap(struct page *page,
444 struct vm_area_struct *vma, unsigned long address)
445{
446 struct anon_vma *anon_vma = vma->anon_vma;
447
448 BUG_ON(!anon_vma);
449 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
450 page->mapping = (struct address_space *) anon_vma;
451
452 page->index = linear_page_index(vma, address);
453
454 /*
455 * nr_mapped state can be updated without turning off
456 * interrupts because it is not modified via interrupt.
457 */
458 __inc_page_state(nr_mapped);
459}
460
461/**
438 * page_add_anon_rmap - add pte mapping to an anonymous page 462 * page_add_anon_rmap - add pte mapping to an anonymous page
439 * @page: the page to add the mapping to 463 * @page: the page to add the mapping to
440 * @vma: the vm area in which the mapping is added 464 * @vma: the vm area in which the mapping is added
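
__page_set_anon_rmap() stores the anon_vma pointer in page->mapping with PAGE_MAPPING_ANON folded into the low bit, which is how PageAnon() later distinguishes anonymous pages from file-backed ones. A user-space sketch of that pointer-tagging trick (the struct and the bit value 0x1 are local stand-ins):

    #include <stdio.h>
    #include <stdint.h>

    #define MAPPING_ANON 0x1                     /* stand-in for PAGE_MAPPING_ANON */

    struct toy_anon_vma { int dummy; };

    int main(void)
    {
        static struct toy_anon_vma av;           /* word aligned, so bit 0 is free */
        void *mapping = (char *)&av + MAPPING_ANON;   /* tag, as the hunk does */

        if ((uintptr_t)mapping & MAPPING_ANON) { /* PageAnon()-style test */
            struct toy_anon_vma *back = (struct toy_anon_vma *)
                ((uintptr_t)mapping & ~(uintptr_t)MAPPING_ANON);
            printf("anon mapping, anon_vma recovered: %s\n", back == &av ? "yes" : "no");
        }
        return 0;
    }

The trick only works because an anon_vma is never allocated at an odd address, leaving bit 0 of the pointer unused.
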
@@ -445,20 +469,27 @@ int page_referenced(struct page *page, int is_locked)
445void page_add_anon_rmap(struct page *page, 469void page_add_anon_rmap(struct page *page,
446 struct vm_area_struct *vma, unsigned long address) 470 struct vm_area_struct *vma, unsigned long address)
447{ 471{
448 if (atomic_inc_and_test(&page->_mapcount)) { 472 if (atomic_inc_and_test(&page->_mapcount))
449 struct anon_vma *anon_vma = vma->anon_vma; 473 __page_set_anon_rmap(page, vma, address);
450
451 BUG_ON(!anon_vma);
452 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
453 page->mapping = (struct address_space *) anon_vma;
454
455 page->index = linear_page_index(vma, address);
456
457 inc_page_state(nr_mapped);
458 }
459 /* else checking page index and mapping is racy */ 474 /* else checking page index and mapping is racy */
460} 475}
461 476
477/**
478 * page_add_new_anon_rmap - add pte mapping to a new anonymous page
479 * @page: the page to add the mapping to
480 * @vma: the vm area in which the mapping is added
481 * @address: the user virtual address mapped
482 *
483 * Same as page_add_anon_rmap but must only be called on *new* pages.
484 * This means the inc-and-test can be bypassed.
485 */
486void page_add_new_anon_rmap(struct page *page,
487 struct vm_area_struct *vma, unsigned long address)
488{
489 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */
490 __page_set_anon_rmap(page, vma, address);
491}
492
462/** 493/**
463 * page_add_file_rmap - add pte mapping to a file page 494 * page_add_file_rmap - add pte mapping to a file page
464 * @page: the page to add the mapping to 495 * @page: the page to add the mapping to
@@ -471,7 +502,7 @@ void page_add_file_rmap(struct page *page)
471 BUG_ON(!pfn_valid(page_to_pfn(page))); 502 BUG_ON(!pfn_valid(page_to_pfn(page)));
472 503
473 if (atomic_inc_and_test(&page->_mapcount)) 504 if (atomic_inc_and_test(&page->_mapcount))
474 inc_page_state(nr_mapped); 505 __inc_page_state(nr_mapped);
475} 506}
476 507
477/** 508/**
@@ -483,6 +514,13 @@ void page_add_file_rmap(struct page *page)
483void page_remove_rmap(struct page *page) 514void page_remove_rmap(struct page *page)
484{ 515{
485 if (atomic_add_negative(-1, &page->_mapcount)) { 516 if (atomic_add_negative(-1, &page->_mapcount)) {
517 if (page_mapcount(page) < 0) {
518 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
519 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
520 printk (KERN_EMERG " page->count = %x\n", page_count(page));
521 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
522 }
523
486 BUG_ON(page_mapcount(page) < 0); 524 BUG_ON(page_mapcount(page) < 0);
487 /* 525 /*
488 * It would be tidy to reset the PageAnon mapping here, 526 * It would be tidy to reset the PageAnon mapping here,
@@ -495,7 +533,7 @@ void page_remove_rmap(struct page *page)
495 */ 533 */
496 if (page_test_and_clear_dirty(page)) 534 if (page_test_and_clear_dirty(page))
497 set_page_dirty(page); 535 set_page_dirty(page);
498 dec_page_state(nr_mapped); 536 __dec_page_state(nr_mapped);
499 } 537 }
500} 538}
501 539
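
All of the rmap changes above revolve around _mapcount, which starts at -1 for an unmapped page: page_add_anon_rmap() relies on atomic_inc_and_test() firing on the -1 to 0 transition, the new page_add_new_anon_rmap() simply sets it to 0 because a brand new page cannot be mapped concurrently, and page_remove_rmap() uses atomic_add_negative() to catch the final 0 to -1 transition. A single-threaded sketch of that counter protocol (plain ints stand in for atomic_t):

    #include <stdio.h>

    /* Single-threaded stand-ins for the atomic ops the rmap code uses. */
    static int inc_and_test(int *v)        { return ++(*v) == 0; }
    static int add_negative(int a, int *v) { return (*v += a) < 0; }

    int main(void)
    {
        int mapcount = -1;                       /* _mapcount of an unmapped page */
        int nr_mapped = 0;

        if (inc_and_test(&mapcount))             /* first mapping: -1 -> 0 */
            nr_mapped++;
        if (inc_and_test(&mapcount))             /* second mapping: no accounting */
            nr_mapped++;

        /* page_add_new_anon_rmap() would instead start from mapcount = 0,
         * skipping the inc-and-test entirely. */

        add_negative(-1, &mapcount);             /* 1 -> 0, still mapped somewhere */
        if (add_negative(-1, &mapcount))         /* 0 -> -1, last unmap */
            nr_mapped--;

        printf("mapcount=%d nr_mapped=%d\n", mapcount, nr_mapped);
        return 0;
    }
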
diff --git a/mm/shmem.c b/mm/shmem.c
index dc25565a61e9..343b3c0937e5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -457,7 +457,7 @@ static void shmem_free_pages(struct list_head *next)
457 } while (next); 457 } while (next);
458} 458}
459 459
460static void shmem_truncate(struct inode *inode) 460static void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
461{ 461{
462 struct shmem_inode_info *info = SHMEM_I(inode); 462 struct shmem_inode_info *info = SHMEM_I(inode);
463 unsigned long idx; 463 unsigned long idx;
@@ -475,18 +475,27 @@ static void shmem_truncate(struct inode *inode)
475 long nr_swaps_freed = 0; 475 long nr_swaps_freed = 0;
476 int offset; 476 int offset;
477 int freed; 477 int freed;
478 int punch_hole = 0;
478 479
479 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 480 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
480 idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 481 idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
481 if (idx >= info->next_index) 482 if (idx >= info->next_index)
482 return; 483 return;
483 484
484 spin_lock(&info->lock); 485 spin_lock(&info->lock);
485 info->flags |= SHMEM_TRUNCATE; 486 info->flags |= SHMEM_TRUNCATE;
486 limit = info->next_index; 487 if (likely(end == (loff_t) -1)) {
487 info->next_index = idx; 488 limit = info->next_index;
489 info->next_index = idx;
490 } else {
491 limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
492 if (limit > info->next_index)
493 limit = info->next_index;
494 punch_hole = 1;
495 }
496
488 topdir = info->i_indirect; 497 topdir = info->i_indirect;
489 if (topdir && idx <= SHMEM_NR_DIRECT) { 498 if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
490 info->i_indirect = NULL; 499 info->i_indirect = NULL;
491 nr_pages_to_free++; 500 nr_pages_to_free++;
492 list_add(&topdir->lru, &pages_to_free); 501 list_add(&topdir->lru, &pages_to_free);
@@ -573,11 +582,12 @@ static void shmem_truncate(struct inode *inode)
573 set_page_private(subdir, page_private(subdir) - freed); 582 set_page_private(subdir, page_private(subdir) - freed);
574 if (offset) 583 if (offset)
575 spin_unlock(&info->lock); 584 spin_unlock(&info->lock);
576 BUG_ON(page_private(subdir) > offset); 585 if (!punch_hole)
586 BUG_ON(page_private(subdir) > offset);
577 } 587 }
578 if (offset) 588 if (offset)
579 offset = 0; 589 offset = 0;
580 else if (subdir) { 590 else if (subdir && !page_private(subdir)) {
581 dir[diroff] = NULL; 591 dir[diroff] = NULL;
582 nr_pages_to_free++; 592 nr_pages_to_free++;
583 list_add(&subdir->lru, &pages_to_free); 593 list_add(&subdir->lru, &pages_to_free);
@@ -594,7 +604,7 @@ done2:
594 * Also, though shmem_getpage checks i_size before adding to 604 * Also, though shmem_getpage checks i_size before adding to
595 * cache, no recheck after: so fix the narrow window there too. 605 * cache, no recheck after: so fix the narrow window there too.
596 */ 606 */
597 truncate_inode_pages(inode->i_mapping, inode->i_size); 607 truncate_inode_pages_range(inode->i_mapping, start, end);
598 } 608 }
599 609
600 spin_lock(&info->lock); 610 spin_lock(&info->lock);
@@ -614,6 +624,11 @@ done2:
614 } 624 }
615} 625}
616 626
627static void shmem_truncate(struct inode *inode)
628{
629 shmem_truncate_range(inode, inode->i_size, (loff_t)-1);
630}
631
617static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) 632static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
618{ 633{
619 struct inode *inode = dentry->d_inode; 634 struct inode *inode = dentry->d_inode;
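
shmem_truncate_range() converts a byte range into page-cache indices, rounding start up so a partially covered first page is preserved and deriving an upper limit from end; the plain truncate path becomes the special case end == (loff_t)-1, where the limit simply stays at info->next_index. A small sketch of that rounding (PAGE_CACHE_SHIFT is assumed to be 12, i.e. 4 KiB pages):

    #include <stdio.h>

    #define PAGE_CACHE_SHIFT 12                  /* assumption: 4 KiB page cache pages */
    #define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

    int main(void)
    {
        long long start = 6000, end = 20000;     /* example byte range */
        unsigned long idx, limit;

        /* Round 'start' up and derive an upper bound from 'end' exactly as the
         * hunk does; the real code then clamps 'limit' to info->next_index. */
        idx   = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        limit = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

        printf("bytes [%lld, %lld] -> idx=%lu limit=%lu\n", start, end, idx, limit);
        return 0;
    }
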
@@ -855,7 +870,7 @@ unlock:
855 swap_free(swap); 870 swap_free(swap);
856redirty: 871redirty:
857 set_page_dirty(page); 872 set_page_dirty(page);
858 return WRITEPAGE_ACTIVATE; /* Return with the page locked */ 873 return AOP_WRITEPAGE_ACTIVATE; /* Return with the page locked */
859} 874}
860 875
861#ifdef CONFIG_NUMA 876#ifdef CONFIG_NUMA
@@ -1255,7 +1270,7 @@ out_nomem:
1255 return retval; 1270 return retval;
1256} 1271}
1257 1272
1258static int shmem_mmap(struct file *file, struct vm_area_struct *vma) 1273int shmem_mmap(struct file *file, struct vm_area_struct *vma)
1259{ 1274{
1260 file_accessed(file); 1275 file_accessed(file);
1261 vma->vm_ops = &shmem_vm_ops; 1276 vma->vm_ops = &shmem_vm_ops;
@@ -1355,7 +1370,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
1355 if (!access_ok(VERIFY_READ, buf, count)) 1370 if (!access_ok(VERIFY_READ, buf, count))
1356 return -EFAULT; 1371 return -EFAULT;
1357 1372
1358 down(&inode->i_sem); 1373 mutex_lock(&inode->i_mutex);
1359 1374
1360 pos = *ppos; 1375 pos = *ppos;
1361 written = 0; 1376 written = 0;
@@ -1440,7 +1455,7 @@ shmem_file_write(struct file *file, const char __user *buf, size_t count, loff_t
1440 if (written) 1455 if (written)
1441 err = written; 1456 err = written;
1442out: 1457out:
1443 up(&inode->i_sem); 1458 mutex_unlock(&inode->i_mutex);
1444 return err; 1459 return err;
1445} 1460}
1446 1461
@@ -1476,7 +1491,7 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
1476 1491
1477 /* 1492 /*
1478 * We must evaluate after, since reads (unlike writes) 1493 * We must evaluate after, since reads (unlike writes)
1479 * are called without i_sem protection against truncate 1494 * are called without i_mutex protection against truncate
1480 */ 1495 */
1481 nr = PAGE_CACHE_SIZE; 1496 nr = PAGE_CACHE_SIZE;
1482 i_size = i_size_read(inode); 1497 i_size = i_size_read(inode);
@@ -2083,6 +2098,7 @@ static struct file_operations shmem_file_operations = {
2083static struct inode_operations shmem_inode_operations = { 2098static struct inode_operations shmem_inode_operations = {
2084 .truncate = shmem_truncate, 2099 .truncate = shmem_truncate,
2085 .setattr = shmem_notify_change, 2100 .setattr = shmem_notify_change,
2101 .truncate_range = shmem_truncate_range,
2086}; 2102};
2087 2103
2088static struct inode_operations shmem_dir_inode_operations = { 2104static struct inode_operations shmem_dir_inode_operations = {
diff --git a/mm/slab.c b/mm/slab.c
index e5ec26e0c460..9374293a3012 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -130,7 +130,6 @@
130#define FORCED_DEBUG 0 130#define FORCED_DEBUG 0
131#endif 131#endif
132 132
133
134/* Shouldn't this be in a header file somewhere? */ 133/* Shouldn't this be in a header file somewhere? */
135#define BYTES_PER_WORD sizeof(void *) 134#define BYTES_PER_WORD sizeof(void *)
136 135
@@ -217,12 +216,12 @@ static unsigned long offslab_limit;
217 * Slabs are chained into three list: fully used, partial, fully free slabs. 216 * Slabs are chained into three list: fully used, partial, fully free slabs.
218 */ 217 */
219struct slab { 218struct slab {
220 struct list_head list; 219 struct list_head list;
221 unsigned long colouroff; 220 unsigned long colouroff;
222 void *s_mem; /* including colour offset */ 221 void *s_mem; /* including colour offset */
223 unsigned int inuse; /* num of objs active in slab */ 222 unsigned int inuse; /* num of objs active in slab */
224 kmem_bufctl_t free; 223 kmem_bufctl_t free;
225 unsigned short nodeid; 224 unsigned short nodeid;
226}; 225};
227 226
228/* 227/*
@@ -242,9 +241,9 @@ struct slab {
242 * We assume struct slab_rcu can overlay struct slab when destroying. 241 * We assume struct slab_rcu can overlay struct slab when destroying.
243 */ 242 */
244struct slab_rcu { 243struct slab_rcu {
245 struct rcu_head head; 244 struct rcu_head head;
246 kmem_cache_t *cachep; 245 kmem_cache_t *cachep;
247 void *addr; 246 void *addr;
248}; 247};
249 248
250/* 249/*
@@ -279,23 +278,23 @@ struct array_cache {
279#define BOOT_CPUCACHE_ENTRIES 1 278#define BOOT_CPUCACHE_ENTRIES 1
280struct arraycache_init { 279struct arraycache_init {
281 struct array_cache cache; 280 struct array_cache cache;
282 void * entries[BOOT_CPUCACHE_ENTRIES]; 281 void *entries[BOOT_CPUCACHE_ENTRIES];
283}; 282};
284 283
285/* 284/*
286 * The slab lists for all objects. 285 * The slab lists for all objects.
287 */ 286 */
288struct kmem_list3 { 287struct kmem_list3 {
289 struct list_head slabs_partial; /* partial list first, better asm code */ 288 struct list_head slabs_partial; /* partial list first, better asm code */
290 struct list_head slabs_full; 289 struct list_head slabs_full;
291 struct list_head slabs_free; 290 struct list_head slabs_free;
292 unsigned long free_objects; 291 unsigned long free_objects;
293 unsigned long next_reap; 292 unsigned long next_reap;
294 int free_touched; 293 int free_touched;
295 unsigned int free_limit; 294 unsigned int free_limit;
296 spinlock_t list_lock; 295 spinlock_t list_lock;
297 struct array_cache *shared; /* shared per node */ 296 struct array_cache *shared; /* shared per node */
298 struct array_cache **alien; /* on other nodes */ 297 struct array_cache **alien; /* on other nodes */
299}; 298};
300 299
301/* 300/*
@@ -367,63 +366,63 @@ static inline void kmem_list3_init(struct kmem_list3 *parent)
367 * 366 *
368 * manages a cache. 367 * manages a cache.
369 */ 368 */
370 369
371struct kmem_cache { 370struct kmem_cache {
372/* 1) per-cpu data, touched during every alloc/free */ 371/* 1) per-cpu data, touched during every alloc/free */
373 struct array_cache *array[NR_CPUS]; 372 struct array_cache *array[NR_CPUS];
374 unsigned int batchcount; 373 unsigned int batchcount;
375 unsigned int limit; 374 unsigned int limit;
376 unsigned int shared; 375 unsigned int shared;
377 unsigned int objsize; 376 unsigned int objsize;
378/* 2) touched by every alloc & free from the backend */ 377/* 2) touched by every alloc & free from the backend */
379 struct kmem_list3 *nodelists[MAX_NUMNODES]; 378 struct kmem_list3 *nodelists[MAX_NUMNODES];
380 unsigned int flags; /* constant flags */ 379 unsigned int flags; /* constant flags */
381 unsigned int num; /* # of objs per slab */ 380 unsigned int num; /* # of objs per slab */
382 spinlock_t spinlock; 381 spinlock_t spinlock;
383 382
384/* 3) cache_grow/shrink */ 383/* 3) cache_grow/shrink */
385 /* order of pgs per slab (2^n) */ 384 /* order of pgs per slab (2^n) */
386 unsigned int gfporder; 385 unsigned int gfporder;
387 386
388 /* force GFP flags, e.g. GFP_DMA */ 387 /* force GFP flags, e.g. GFP_DMA */
389 gfp_t gfpflags; 388 gfp_t gfpflags;
390 389
391 size_t colour; /* cache colouring range */ 390 size_t colour; /* cache colouring range */
392 unsigned int colour_off; /* colour offset */ 391 unsigned int colour_off; /* colour offset */
393 unsigned int colour_next; /* cache colouring */ 392 unsigned int colour_next; /* cache colouring */
394 kmem_cache_t *slabp_cache; 393 kmem_cache_t *slabp_cache;
395 unsigned int slab_size; 394 unsigned int slab_size;
396 unsigned int dflags; /* dynamic flags */ 395 unsigned int dflags; /* dynamic flags */
397 396
398 /* constructor func */ 397 /* constructor func */
399 void (*ctor)(void *, kmem_cache_t *, unsigned long); 398 void (*ctor) (void *, kmem_cache_t *, unsigned long);
400 399
401 /* de-constructor func */ 400 /* de-constructor func */
402 void (*dtor)(void *, kmem_cache_t *, unsigned long); 401 void (*dtor) (void *, kmem_cache_t *, unsigned long);
403 402
404/* 4) cache creation/removal */ 403/* 4) cache creation/removal */
405 const char *name; 404 const char *name;
406 struct list_head next; 405 struct list_head next;
407 406
408/* 5) statistics */ 407/* 5) statistics */
409#if STATS 408#if STATS
410 unsigned long num_active; 409 unsigned long num_active;
411 unsigned long num_allocations; 410 unsigned long num_allocations;
412 unsigned long high_mark; 411 unsigned long high_mark;
413 unsigned long grown; 412 unsigned long grown;
414 unsigned long reaped; 413 unsigned long reaped;
415 unsigned long errors; 414 unsigned long errors;
416 unsigned long max_freeable; 415 unsigned long max_freeable;
417 unsigned long node_allocs; 416 unsigned long node_allocs;
418 unsigned long node_frees; 417 unsigned long node_frees;
419 atomic_t allochit; 418 atomic_t allochit;
420 atomic_t allocmiss; 419 atomic_t allocmiss;
421 atomic_t freehit; 420 atomic_t freehit;
422 atomic_t freemiss; 421 atomic_t freemiss;
423#endif 422#endif
424#if DEBUG 423#if DEBUG
425 int dbghead; 424 int dbghead;
426 int reallen; 425 int reallen;
427#endif 426#endif
428}; 427};
429 428
@@ -523,14 +522,15 @@ static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
523{ 522{
524 BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); 523 BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
525 if (cachep->flags & SLAB_STORE_USER) 524 if (cachep->flags & SLAB_STORE_USER)
526 return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD); 525 return (unsigned long *)(objp + cachep->objsize -
527 return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD); 526 2 * BYTES_PER_WORD);
527 return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD);
528} 528}
529 529
530static void **dbg_userword(kmem_cache_t *cachep, void *objp) 530static void **dbg_userword(kmem_cache_t *cachep, void *objp)
531{ 531{
532 BUG_ON(!(cachep->flags & SLAB_STORE_USER)); 532 BUG_ON(!(cachep->flags & SLAB_STORE_USER));
533 return (void**)(objp+cachep->objsize-BYTES_PER_WORD); 533 return (void **)(objp + cachep->objsize - BYTES_PER_WORD);
534} 534}
535 535
536#else 536#else
@@ -607,31 +607,31 @@ struct cache_names {
607static struct cache_names __initdata cache_names[] = { 607static struct cache_names __initdata cache_names[] = {
608#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, 608#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
609#include <linux/kmalloc_sizes.h> 609#include <linux/kmalloc_sizes.h>
610 { NULL, } 610 {NULL,}
611#undef CACHE 611#undef CACHE
612}; 612};
613 613
614static struct arraycache_init initarray_cache __initdata = 614static struct arraycache_init initarray_cache __initdata =
615 { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 615 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
616static struct arraycache_init initarray_generic = 616static struct arraycache_init initarray_generic =
617 { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; 617 { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
618 618
619/* internal cache of cache description objs */ 619/* internal cache of cache description objs */
620static kmem_cache_t cache_cache = { 620static kmem_cache_t cache_cache = {
621 .batchcount = 1, 621 .batchcount = 1,
622 .limit = BOOT_CPUCACHE_ENTRIES, 622 .limit = BOOT_CPUCACHE_ENTRIES,
623 .shared = 1, 623 .shared = 1,
624 .objsize = sizeof(kmem_cache_t), 624 .objsize = sizeof(kmem_cache_t),
625 .flags = SLAB_NO_REAP, 625 .flags = SLAB_NO_REAP,
626 .spinlock = SPIN_LOCK_UNLOCKED, 626 .spinlock = SPIN_LOCK_UNLOCKED,
627 .name = "kmem_cache", 627 .name = "kmem_cache",
628#if DEBUG 628#if DEBUG
629 .reallen = sizeof(kmem_cache_t), 629 .reallen = sizeof(kmem_cache_t),
630#endif 630#endif
631}; 631};
632 632
633/* Guard access to the cache-chain. */ 633/* Guard access to the cache-chain. */
634static struct semaphore cache_chain_sem; 634static struct semaphore cache_chain_sem;
635static struct list_head cache_chain; 635static struct list_head cache_chain;
636 636
637/* 637/*
@@ -655,9 +655,9 @@ static enum {
655 655
656static DEFINE_PER_CPU(struct work_struct, reap_work); 656static DEFINE_PER_CPU(struct work_struct, reap_work);
657 657
658static void free_block(kmem_cache_t* cachep, void** objpp, int len, int node); 658static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node);
659static void enable_cpucache (kmem_cache_t *cachep); 659static void enable_cpucache(kmem_cache_t *cachep);
660static void cache_reap (void *unused); 660static void cache_reap(void *unused);
661static int __node_shrink(kmem_cache_t *cachep, int node); 661static int __node_shrink(kmem_cache_t *cachep, int node);
662 662
663static inline struct array_cache *ac_data(kmem_cache_t *cachep) 663static inline struct array_cache *ac_data(kmem_cache_t *cachep)
@@ -671,9 +671,9 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
671 671
672#if DEBUG 672#if DEBUG
673 /* This happens if someone tries to call 673 /* This happens if someone tries to call
674 * kmem_cache_create(), or __kmalloc(), before 674 * kmem_cache_create(), or __kmalloc(), before
675 * the generic caches are initialized. 675 * the generic caches are initialized.
676 */ 676 */
677 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); 677 BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
678#endif 678#endif
679 while (size > csizep->cs_size) 679 while (size > csizep->cs_size)
@@ -697,10 +697,10 @@ EXPORT_SYMBOL(kmem_find_general_cachep);
697 697
698/* Cal the num objs, wastage, and bytes left over for a given slab size. */ 698/* Cal the num objs, wastage, and bytes left over for a given slab size. */
699static void cache_estimate(unsigned long gfporder, size_t size, size_t align, 699static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
700 int flags, size_t *left_over, unsigned int *num) 700 int flags, size_t *left_over, unsigned int *num)
701{ 701{
702 int i; 702 int i;
703 size_t wastage = PAGE_SIZE<<gfporder; 703 size_t wastage = PAGE_SIZE << gfporder;
704 size_t extra = 0; 704 size_t extra = 0;
705 size_t base = 0; 705 size_t base = 0;
706 706
@@ -709,7 +709,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
709 extra = sizeof(kmem_bufctl_t); 709 extra = sizeof(kmem_bufctl_t);
710 } 710 }
711 i = 0; 711 i = 0;
712 while (i*size + ALIGN(base+i*extra, align) <= wastage) 712 while (i * size + ALIGN(base + i * extra, align) <= wastage)
713 i++; 713 i++;
714 if (i > 0) 714 if (i > 0)
715 i--; 715 i--;
@@ -718,8 +718,8 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
718 i = SLAB_LIMIT; 718 i = SLAB_LIMIT;
719 719
720 *num = i; 720 *num = i;
721 wastage -= i*size; 721 wastage -= i * size;
722 wastage -= ALIGN(base+i*extra, align); 722 wastage -= ALIGN(base + i * extra, align);
723 *left_over = wastage; 723 *left_over = wastage;
724} 724}
725 725
@@ -728,7 +728,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align,
728static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) 728static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
729{ 729{
730 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 730 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
731 function, cachep->name, msg); 731 function, cachep->name, msg);
732 dump_stack(); 732 dump_stack();
733} 733}
734 734
@@ -755,9 +755,9 @@ static void __devinit start_cpu_timer(int cpu)
755} 755}
756 756
757static struct array_cache *alloc_arraycache(int node, int entries, 757static struct array_cache *alloc_arraycache(int node, int entries,
758 int batchcount) 758 int batchcount)
759{ 759{
760 int memsize = sizeof(void*)*entries+sizeof(struct array_cache); 760 int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
761 struct array_cache *nc = NULL; 761 struct array_cache *nc = NULL;
762 762
763 nc = kmalloc_node(memsize, GFP_KERNEL, node); 763 nc = kmalloc_node(memsize, GFP_KERNEL, node);
@@ -775,7 +775,7 @@ static struct array_cache *alloc_arraycache(int node, int entries,
775static inline struct array_cache **alloc_alien_cache(int node, int limit) 775static inline struct array_cache **alloc_alien_cache(int node, int limit)
776{ 776{
777 struct array_cache **ac_ptr; 777 struct array_cache **ac_ptr;
778 int memsize = sizeof(void*)*MAX_NUMNODES; 778 int memsize = sizeof(void *) * MAX_NUMNODES;
779 int i; 779 int i;
780 780
781 if (limit > 1) 781 if (limit > 1)
@@ -789,7 +789,7 @@ static inline struct array_cache **alloc_alien_cache(int node, int limit)
789 } 789 }
790 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); 790 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
791 if (!ac_ptr[i]) { 791 if (!ac_ptr[i]) {
792 for (i--; i <=0; i--) 792 for (i--; i <= 0; i--)
793 kfree(ac_ptr[i]); 793 kfree(ac_ptr[i]);
794 kfree(ac_ptr); 794 kfree(ac_ptr);
795 return NULL; 795 return NULL;
@@ -807,12 +807,13 @@ static inline void free_alien_cache(struct array_cache **ac_ptr)
807 return; 807 return;
808 808
809 for_each_node(i) 809 for_each_node(i)
810 kfree(ac_ptr[i]); 810 kfree(ac_ptr[i]);
811 811
812 kfree(ac_ptr); 812 kfree(ac_ptr);
813} 813}
814 814
815static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node) 815static inline void __drain_alien_cache(kmem_cache_t *cachep,
816 struct array_cache *ac, int node)
816{ 817{
817 struct kmem_list3 *rl3 = cachep->nodelists[node]; 818 struct kmem_list3 *rl3 = cachep->nodelists[node];
818 819
@@ -826,7 +827,7 @@ static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache
826 827
827static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) 828static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
828{ 829{
829 int i=0; 830 int i = 0;
830 struct array_cache *ac; 831 struct array_cache *ac;
831 unsigned long flags; 832 unsigned long flags;
832 833
@@ -846,14 +847,13 @@ static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
846#endif 847#endif
847 848
848static int __devinit cpuup_callback(struct notifier_block *nfb, 849static int __devinit cpuup_callback(struct notifier_block *nfb,
849 unsigned long action, void *hcpu) 850 unsigned long action, void *hcpu)
850{ 851{
851 long cpu = (long)hcpu; 852 long cpu = (long)hcpu;
852 kmem_cache_t* cachep; 853 kmem_cache_t *cachep;
853 struct kmem_list3 *l3 = NULL; 854 struct kmem_list3 *l3 = NULL;
854 int node = cpu_to_node(cpu); 855 int node = cpu_to_node(cpu);
855 int memsize = sizeof(struct kmem_list3); 856 int memsize = sizeof(struct kmem_list3);
856 struct array_cache *nc = NULL;
857 857
858 switch (action) { 858 switch (action) {
859 case CPU_UP_PREPARE: 859 case CPU_UP_PREPARE:
@@ -871,27 +871,29 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
871 */ 871 */
872 if (!cachep->nodelists[node]) { 872 if (!cachep->nodelists[node]) {
873 if (!(l3 = kmalloc_node(memsize, 873 if (!(l3 = kmalloc_node(memsize,
874 GFP_KERNEL, node))) 874 GFP_KERNEL, node)))
875 goto bad; 875 goto bad;
876 kmem_list3_init(l3); 876 kmem_list3_init(l3);
877 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 877 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
878 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 878 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
879 879
880 cachep->nodelists[node] = l3; 880 cachep->nodelists[node] = l3;
881 } 881 }
882 882
883 spin_lock_irq(&cachep->nodelists[node]->list_lock); 883 spin_lock_irq(&cachep->nodelists[node]->list_lock);
884 cachep->nodelists[node]->free_limit = 884 cachep->nodelists[node]->free_limit =
885 (1 + nr_cpus_node(node)) * 885 (1 + nr_cpus_node(node)) *
886 cachep->batchcount + cachep->num; 886 cachep->batchcount + cachep->num;
887 spin_unlock_irq(&cachep->nodelists[node]->list_lock); 887 spin_unlock_irq(&cachep->nodelists[node]->list_lock);
888 } 888 }
889 889
890 /* Now we can go ahead with allocating the shared array's 890 /* Now we can go ahead with allocating the shared array's
891 & array cache's */ 891 & array cache's */
892 list_for_each_entry(cachep, &cache_chain, next) { 892 list_for_each_entry(cachep, &cache_chain, next) {
893 struct array_cache *nc;
894
893 nc = alloc_arraycache(node, cachep->limit, 895 nc = alloc_arraycache(node, cachep->limit,
894 cachep->batchcount); 896 cachep->batchcount);
895 if (!nc) 897 if (!nc)
896 goto bad; 898 goto bad;
897 cachep->array[cpu] = nc; 899 cachep->array[cpu] = nc;
@@ -900,12 +902,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
900 BUG_ON(!l3); 902 BUG_ON(!l3);
901 if (!l3->shared) { 903 if (!l3->shared) {
902 if (!(nc = alloc_arraycache(node, 904 if (!(nc = alloc_arraycache(node,
903 cachep->shared*cachep->batchcount, 905 cachep->shared *
904 0xbaadf00d))) 906 cachep->batchcount,
905 goto bad; 907 0xbaadf00d)))
908 goto bad;
906 909
907 /* we are serialised from CPU_DEAD or 910 /* we are serialised from CPU_DEAD or
908 CPU_UP_CANCELLED by the cpucontrol lock */ 911 CPU_UP_CANCELLED by the cpucontrol lock */
909 l3->shared = nc; 912 l3->shared = nc;
910 } 913 }
911 } 914 }
@@ -942,13 +945,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
942 free_block(cachep, nc->entry, nc->avail, node); 945 free_block(cachep, nc->entry, nc->avail, node);
943 946
944 if (!cpus_empty(mask)) { 947 if (!cpus_empty(mask)) {
945 spin_unlock(&l3->list_lock); 948 spin_unlock(&l3->list_lock);
946 goto unlock_cache; 949 goto unlock_cache;
947 } 950 }
948 951
949 if (l3->shared) { 952 if (l3->shared) {
950 free_block(cachep, l3->shared->entry, 953 free_block(cachep, l3->shared->entry,
951 l3->shared->avail, node); 954 l3->shared->avail, node);
952 kfree(l3->shared); 955 kfree(l3->shared);
953 l3->shared = NULL; 956 l3->shared = NULL;
954 } 957 }
@@ -966,7 +969,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
966 } else { 969 } else {
967 spin_unlock(&l3->list_lock); 970 spin_unlock(&l3->list_lock);
968 } 971 }
969unlock_cache: 972 unlock_cache:
970 spin_unlock_irq(&cachep->spinlock); 973 spin_unlock_irq(&cachep->spinlock);
971 kfree(nc); 974 kfree(nc);
972 } 975 }
@@ -975,7 +978,7 @@ unlock_cache:
975#endif 978#endif
976 } 979 }
977 return NOTIFY_OK; 980 return NOTIFY_OK;
978bad: 981 bad:
979 up(&cache_chain_sem); 982 up(&cache_chain_sem);
980 return NOTIFY_BAD; 983 return NOTIFY_BAD;
981} 984}
@@ -985,8 +988,7 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
985/* 988/*
986 * swap the static kmem_list3 with kmalloced memory 989 * swap the static kmem_list3 with kmalloced memory
987 */ 990 */
988static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, 991static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid)
989 int nodeid)
990{ 992{
991 struct kmem_list3 *ptr; 993 struct kmem_list3 *ptr;
992 994
@@ -1055,14 +1057,14 @@ void __init kmem_cache_init(void)
1055 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); 1057 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
1056 1058
1057 cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, 1059 cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
1058 &left_over, &cache_cache.num); 1060 &left_over, &cache_cache.num);
1059 if (!cache_cache.num) 1061 if (!cache_cache.num)
1060 BUG(); 1062 BUG();
1061 1063
1062 cache_cache.colour = left_over/cache_cache.colour_off; 1064 cache_cache.colour = left_over / cache_cache.colour_off;
1063 cache_cache.colour_next = 0; 1065 cache_cache.colour_next = 0;
1064 cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) + 1066 cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) +
1065 sizeof(struct slab), cache_line_size()); 1067 sizeof(struct slab), cache_line_size());
1066 1068
1067 /* 2+3) create the kmalloc caches */ 1069 /* 2+3) create the kmalloc caches */
1068 sizes = malloc_sizes; 1070 sizes = malloc_sizes;
@@ -1074,14 +1076,18 @@ void __init kmem_cache_init(void)
1074 */ 1076 */
1075 1077
1076 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1078 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1077 sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, 1079 sizes[INDEX_AC].cs_size,
1078 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1080 ARCH_KMALLOC_MINALIGN,
1081 (ARCH_KMALLOC_FLAGS |
1082 SLAB_PANIC), NULL, NULL);
1079 1083
1080 if (INDEX_AC != INDEX_L3) 1084 if (INDEX_AC != INDEX_L3)
1081 sizes[INDEX_L3].cs_cachep = 1085 sizes[INDEX_L3].cs_cachep =
1082 kmem_cache_create(names[INDEX_L3].name, 1086 kmem_cache_create(names[INDEX_L3].name,
1083 sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, 1087 sizes[INDEX_L3].cs_size,
1084 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1088 ARCH_KMALLOC_MINALIGN,
1089 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL,
1090 NULL);
1085 1091
1086 while (sizes->cs_size != ULONG_MAX) { 1092 while (sizes->cs_size != ULONG_MAX) {
1087 /* 1093 /*
@@ -1091,35 +1097,41 @@ void __init kmem_cache_init(void)
1091 * Note for systems short on memory removing the alignment will 1097 * Note for systems short on memory removing the alignment will
1092 * allow tighter packing of the smaller caches. 1098 * allow tighter packing of the smaller caches.
1093 */ 1099 */
1094 if(!sizes->cs_cachep) 1100 if (!sizes->cs_cachep)
1095 sizes->cs_cachep = kmem_cache_create(names->name, 1101 sizes->cs_cachep = kmem_cache_create(names->name,
1096 sizes->cs_size, ARCH_KMALLOC_MINALIGN, 1102 sizes->cs_size,
1097 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1103 ARCH_KMALLOC_MINALIGN,
1104 (ARCH_KMALLOC_FLAGS
1105 | SLAB_PANIC),
1106 NULL, NULL);
1098 1107
1099 /* Inc off-slab bufctl limit until the ceiling is hit. */ 1108 /* Inc off-slab bufctl limit until the ceiling is hit. */
1100 if (!(OFF_SLAB(sizes->cs_cachep))) { 1109 if (!(OFF_SLAB(sizes->cs_cachep))) {
1101 offslab_limit = sizes->cs_size-sizeof(struct slab); 1110 offslab_limit = sizes->cs_size - sizeof(struct slab);
1102 offslab_limit /= sizeof(kmem_bufctl_t); 1111 offslab_limit /= sizeof(kmem_bufctl_t);
1103 } 1112 }
1104 1113
1105 sizes->cs_dmacachep = kmem_cache_create(names->name_dma, 1114 sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1106 sizes->cs_size, ARCH_KMALLOC_MINALIGN, 1115 sizes->cs_size,
1107 (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC), 1116 ARCH_KMALLOC_MINALIGN,
1108 NULL, NULL); 1117 (ARCH_KMALLOC_FLAGS |
1118 SLAB_CACHE_DMA |
1119 SLAB_PANIC), NULL,
1120 NULL);
1109 1121
1110 sizes++; 1122 sizes++;
1111 names++; 1123 names++;
1112 } 1124 }
1113 /* 4) Replace the bootstrap head arrays */ 1125 /* 4) Replace the bootstrap head arrays */
1114 { 1126 {
1115 void * ptr; 1127 void *ptr;
1116 1128
1117 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1129 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1118 1130
1119 local_irq_disable(); 1131 local_irq_disable();
1120 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); 1132 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
1121 memcpy(ptr, ac_data(&cache_cache), 1133 memcpy(ptr, ac_data(&cache_cache),
1122 sizeof(struct arraycache_init)); 1134 sizeof(struct arraycache_init));
1123 cache_cache.array[smp_processor_id()] = ptr; 1135 cache_cache.array[smp_processor_id()] = ptr;
1124 local_irq_enable(); 1136 local_irq_enable();
1125 1137
@@ -1127,11 +1139,11 @@ void __init kmem_cache_init(void)
1127 1139
1128 local_irq_disable(); 1140 local_irq_disable();
1129 BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep) 1141 BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
1130 != &initarray_generic.cache); 1142 != &initarray_generic.cache);
1131 memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep), 1143 memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
1132 sizeof(struct arraycache_init)); 1144 sizeof(struct arraycache_init));
1133 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = 1145 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1134 ptr; 1146 ptr;
1135 local_irq_enable(); 1147 local_irq_enable();
1136 } 1148 }
1137 /* 5) Replace the bootstrap kmem_list3's */ 1149 /* 5) Replace the bootstrap kmem_list3's */
@@ -1139,16 +1151,16 @@ void __init kmem_cache_init(void)
1139 int node; 1151 int node;
1140 /* Replace the static kmem_list3 structures for the boot cpu */ 1152 /* Replace the static kmem_list3 structures for the boot cpu */
1141 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], 1153 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
1142 numa_node_id()); 1154 numa_node_id());
1143 1155
1144 for_each_online_node(node) { 1156 for_each_online_node(node) {
1145 init_list(malloc_sizes[INDEX_AC].cs_cachep, 1157 init_list(malloc_sizes[INDEX_AC].cs_cachep,
1146 &initkmem_list3[SIZE_AC+node], node); 1158 &initkmem_list3[SIZE_AC + node], node);
1147 1159
1148 if (INDEX_AC != INDEX_L3) { 1160 if (INDEX_AC != INDEX_L3) {
1149 init_list(malloc_sizes[INDEX_L3].cs_cachep, 1161 init_list(malloc_sizes[INDEX_L3].cs_cachep,
1150 &initkmem_list3[SIZE_L3+node], 1162 &initkmem_list3[SIZE_L3 + node],
1151 node); 1163 node);
1152 } 1164 }
1153 } 1165 }
1154 } 1166 }
@@ -1158,7 +1170,7 @@ void __init kmem_cache_init(void)
1158 kmem_cache_t *cachep; 1170 kmem_cache_t *cachep;
1159 down(&cache_chain_sem); 1171 down(&cache_chain_sem);
1160 list_for_each_entry(cachep, &cache_chain, next) 1172 list_for_each_entry(cachep, &cache_chain, next)
1161 enable_cpucache(cachep); 1173 enable_cpucache(cachep);
1162 up(&cache_chain_sem); 1174 up(&cache_chain_sem);
1163 } 1175 }
1164 1176
@@ -1184,7 +1196,7 @@ static int __init cpucache_init(void)
1184 * pages to gfp. 1196 * pages to gfp.
1185 */ 1197 */
1186 for_each_online_cpu(cpu) 1198 for_each_online_cpu(cpu)
1187 start_cpu_timer(cpu); 1199 start_cpu_timer(cpu);
1188 1200
1189 return 0; 1201 return 0;
1190} 1202}
@@ -1226,7 +1238,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
1226 */ 1238 */
1227static void kmem_freepages(kmem_cache_t *cachep, void *addr) 1239static void kmem_freepages(kmem_cache_t *cachep, void *addr)
1228{ 1240{
1229 unsigned long i = (1<<cachep->gfporder); 1241 unsigned long i = (1 << cachep->gfporder);
1230 struct page *page = virt_to_page(addr); 1242 struct page *page = virt_to_page(addr);
1231 const unsigned long nr_freed = i; 1243 const unsigned long nr_freed = i;
1232 1244
@@ -1239,13 +1251,13 @@ static void kmem_freepages(kmem_cache_t *cachep, void *addr)
1239 if (current->reclaim_state) 1251 if (current->reclaim_state)
1240 current->reclaim_state->reclaimed_slab += nr_freed; 1252 current->reclaim_state->reclaimed_slab += nr_freed;
1241 free_pages((unsigned long)addr, cachep->gfporder); 1253 free_pages((unsigned long)addr, cachep->gfporder);
1242 if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 1254 if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
1243 atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages); 1255 atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages);
1244} 1256}
1245 1257
1246static void kmem_rcu_free(struct rcu_head *head) 1258static void kmem_rcu_free(struct rcu_head *head)
1247{ 1259{
1248 struct slab_rcu *slab_rcu = (struct slab_rcu *) head; 1260 struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
1249 kmem_cache_t *cachep = slab_rcu->cachep; 1261 kmem_cache_t *cachep = slab_rcu->cachep;
1250 1262
1251 kmem_freepages(cachep, slab_rcu->addr); 1263 kmem_freepages(cachep, slab_rcu->addr);
@@ -1257,19 +1269,19 @@ static void kmem_rcu_free(struct rcu_head *head)
1257 1269
1258#ifdef CONFIG_DEBUG_PAGEALLOC 1270#ifdef CONFIG_DEBUG_PAGEALLOC
1259static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, 1271static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1260 unsigned long caller) 1272 unsigned long caller)
1261{ 1273{
1262 int size = obj_reallen(cachep); 1274 int size = obj_reallen(cachep);
1263 1275
1264 addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)]; 1276 addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)];
1265 1277
1266 if (size < 5*sizeof(unsigned long)) 1278 if (size < 5 * sizeof(unsigned long))
1267 return; 1279 return;
1268 1280
1269 *addr++=0x12345678; 1281 *addr++ = 0x12345678;
1270 *addr++=caller; 1282 *addr++ = caller;
1271 *addr++=smp_processor_id(); 1283 *addr++ = smp_processor_id();
1272 size -= 3*sizeof(unsigned long); 1284 size -= 3 * sizeof(unsigned long);
1273 { 1285 {
1274 unsigned long *sptr = &caller; 1286 unsigned long *sptr = &caller;
1275 unsigned long svalue; 1287 unsigned long svalue;
@@ -1277,7 +1289,7 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1277 while (!kstack_end(sptr)) { 1289 while (!kstack_end(sptr)) {
1278 svalue = *sptr++; 1290 svalue = *sptr++;
1279 if (kernel_text_address(svalue)) { 1291 if (kernel_text_address(svalue)) {
1280 *addr++=svalue; 1292 *addr++ = svalue;
1281 size -= sizeof(unsigned long); 1293 size -= sizeof(unsigned long);
1282 if (size <= sizeof(unsigned long)) 1294 if (size <= sizeof(unsigned long))
1283 break; 1295 break;
@@ -1285,25 +1297,25 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr,
1285 } 1297 }
1286 1298
1287 } 1299 }
1288 *addr++=0x87654321; 1300 *addr++ = 0x87654321;
1289} 1301}
1290#endif 1302#endif
1291 1303
1292static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) 1304static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
1293{ 1305{
1294 int size = obj_reallen(cachep); 1306 int size = obj_reallen(cachep);
1295 addr = &((char*)addr)[obj_dbghead(cachep)]; 1307 addr = &((char *)addr)[obj_dbghead(cachep)];
1296 1308
1297 memset(addr, val, size); 1309 memset(addr, val, size);
1298 *(unsigned char *)(addr+size-1) = POISON_END; 1310 *(unsigned char *)(addr + size - 1) = POISON_END;
1299} 1311}
1300 1312
1301static void dump_line(char *data, int offset, int limit) 1313static void dump_line(char *data, int offset, int limit)
1302{ 1314{
1303 int i; 1315 int i;
1304 printk(KERN_ERR "%03x:", offset); 1316 printk(KERN_ERR "%03x:", offset);
1305 for (i=0;i<limit;i++) { 1317 for (i = 0; i < limit; i++) {
1306 printk(" %02x", (unsigned char)data[offset+i]); 1318 printk(" %02x", (unsigned char)data[offset + i]);
1307 } 1319 }
1308 printk("\n"); 1320 printk("\n");
1309} 1321}
@@ -1318,24 +1330,24 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
1318 1330
1319 if (cachep->flags & SLAB_RED_ZONE) { 1331 if (cachep->flags & SLAB_RED_ZONE) {
1320 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", 1332 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1321 *dbg_redzone1(cachep, objp), 1333 *dbg_redzone1(cachep, objp),
1322 *dbg_redzone2(cachep, objp)); 1334 *dbg_redzone2(cachep, objp));
1323 } 1335 }
1324 1336
1325 if (cachep->flags & SLAB_STORE_USER) { 1337 if (cachep->flags & SLAB_STORE_USER) {
1326 printk(KERN_ERR "Last user: [<%p>]", 1338 printk(KERN_ERR "Last user: [<%p>]",
1327 *dbg_userword(cachep, objp)); 1339 *dbg_userword(cachep, objp));
1328 print_symbol("(%s)", 1340 print_symbol("(%s)",
1329 (unsigned long)*dbg_userword(cachep, objp)); 1341 (unsigned long)*dbg_userword(cachep, objp));
1330 printk("\n"); 1342 printk("\n");
1331 } 1343 }
1332 realobj = (char*)objp+obj_dbghead(cachep); 1344 realobj = (char *)objp + obj_dbghead(cachep);
1333 size = obj_reallen(cachep); 1345 size = obj_reallen(cachep);
1334 for (i=0; i<size && lines;i+=16, lines--) { 1346 for (i = 0; i < size && lines; i += 16, lines--) {
1335 int limit; 1347 int limit;
1336 limit = 16; 1348 limit = 16;
1337 if (i+limit > size) 1349 if (i + limit > size)
1338 limit = size-i; 1350 limit = size - i;
1339 dump_line(realobj, i, limit); 1351 dump_line(realobj, i, limit);
1340 } 1352 }
1341} 1353}
@@ -1346,27 +1358,28 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1346 int size, i; 1358 int size, i;
1347 int lines = 0; 1359 int lines = 0;
1348 1360
1349 realobj = (char*)objp+obj_dbghead(cachep); 1361 realobj = (char *)objp + obj_dbghead(cachep);
1350 size = obj_reallen(cachep); 1362 size = obj_reallen(cachep);
1351 1363
1352 for (i=0;i<size;i++) { 1364 for (i = 0; i < size; i++) {
1353 char exp = POISON_FREE; 1365 char exp = POISON_FREE;
1354 if (i == size-1) 1366 if (i == size - 1)
1355 exp = POISON_END; 1367 exp = POISON_END;
1356 if (realobj[i] != exp) { 1368 if (realobj[i] != exp) {
1357 int limit; 1369 int limit;
1358 /* Mismatch ! */ 1370 /* Mismatch ! */
1359 /* Print header */ 1371 /* Print header */
1360 if (lines == 0) { 1372 if (lines == 0) {
1361 printk(KERN_ERR "Slab corruption: start=%p, len=%d\n", 1373 printk(KERN_ERR
1362 realobj, size); 1374 "Slab corruption: start=%p, len=%d\n",
1375 realobj, size);
1363 print_objinfo(cachep, objp, 0); 1376 print_objinfo(cachep, objp, 0);
1364 } 1377 }
1365 /* Hexdump the affected line */ 1378 /* Hexdump the affected line */
1366 i = (i/16)*16; 1379 i = (i / 16) * 16;
1367 limit = 16; 1380 limit = 16;
1368 if (i+limit > size) 1381 if (i + limit > size)
1369 limit = size-i; 1382 limit = size - i;
1370 dump_line(realobj, i, limit); 1383 dump_line(realobj, i, limit);
1371 i += 16; 1384 i += 16;
1372 lines++; 1385 lines++;
@@ -1382,19 +1395,19 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1382 struct slab *slabp = page_get_slab(virt_to_page(objp)); 1395 struct slab *slabp = page_get_slab(virt_to_page(objp));
1383 int objnr; 1396 int objnr;
1384 1397
1385 objnr = (objp-slabp->s_mem)/cachep->objsize; 1398 objnr = (objp - slabp->s_mem) / cachep->objsize;
1386 if (objnr) { 1399 if (objnr) {
1387 objp = slabp->s_mem+(objnr-1)*cachep->objsize; 1400 objp = slabp->s_mem + (objnr - 1) * cachep->objsize;
1388 realobj = (char*)objp+obj_dbghead(cachep); 1401 realobj = (char *)objp + obj_dbghead(cachep);
1389 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1402 printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1390 realobj, size); 1403 realobj, size);
1391 print_objinfo(cachep, objp, 2); 1404 print_objinfo(cachep, objp, 2);
1392 } 1405 }
1393 if (objnr+1 < cachep->num) { 1406 if (objnr + 1 < cachep->num) {
1394 objp = slabp->s_mem+(objnr+1)*cachep->objsize; 1407 objp = slabp->s_mem + (objnr + 1) * cachep->objsize;
1395 realobj = (char*)objp+obj_dbghead(cachep); 1408 realobj = (char *)objp + obj_dbghead(cachep);
1396 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1409 printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1397 realobj, size); 1410 realobj, size);
1398 print_objinfo(cachep, objp, 2); 1411 print_objinfo(cachep, objp, 2);
1399 } 1412 }
1400 } 1413 }
@@ -1405,7 +1418,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1405 * Before calling the slab must have been unlinked from the cache. 1418 * Before calling the slab must have been unlinked from the cache.
1406 * The cache-lock is not held/needed. 1419 * The cache-lock is not held/needed.
1407 */ 1420 */
1408static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) 1421static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp)
1409{ 1422{
1410 void *addr = slabp->s_mem - slabp->colouroff; 1423 void *addr = slabp->s_mem - slabp->colouroff;
1411 1424
@@ -1416,8 +1429,11 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1416 1429
1417 if (cachep->flags & SLAB_POISON) { 1430 if (cachep->flags & SLAB_POISON) {
1418#ifdef CONFIG_DEBUG_PAGEALLOC 1431#ifdef CONFIG_DEBUG_PAGEALLOC
1419 if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep)) 1432 if ((cachep->objsize % PAGE_SIZE) == 0
1420 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1); 1433 && OFF_SLAB(cachep))
1434 kernel_map_pages(virt_to_page(objp),
1435 cachep->objsize / PAGE_SIZE,
1436 1);
1421 else 1437 else
1422 check_poison_obj(cachep, objp); 1438 check_poison_obj(cachep, objp);
1423#else 1439#else
@@ -1427,20 +1443,20 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1427 if (cachep->flags & SLAB_RED_ZONE) { 1443 if (cachep->flags & SLAB_RED_ZONE) {
1428 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 1444 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1429 slab_error(cachep, "start of a freed object " 1445 slab_error(cachep, "start of a freed object "
1430 "was overwritten"); 1446 "was overwritten");
1431 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 1447 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1432 slab_error(cachep, "end of a freed object " 1448 slab_error(cachep, "end of a freed object "
1433 "was overwritten"); 1449 "was overwritten");
1434 } 1450 }
1435 if (cachep->dtor && !(cachep->flags & SLAB_POISON)) 1451 if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1436 (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0); 1452 (cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0);
1437 } 1453 }
1438#else 1454#else
1439 if (cachep->dtor) { 1455 if (cachep->dtor) {
1440 int i; 1456 int i;
1441 for (i = 0; i < cachep->num; i++) { 1457 for (i = 0; i < cachep->num; i++) {
1442 void* objp = slabp->s_mem+cachep->objsize*i; 1458 void *objp = slabp->s_mem + cachep->objsize * i;
1443 (cachep->dtor)(objp, cachep, 0); 1459 (cachep->dtor) (objp, cachep, 0);
1444 } 1460 }
1445 } 1461 }
1446#endif 1462#endif
@@ -1448,7 +1464,7 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1448 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { 1464 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1449 struct slab_rcu *slab_rcu; 1465 struct slab_rcu *slab_rcu;
1450 1466
1451 slab_rcu = (struct slab_rcu *) slabp; 1467 slab_rcu = (struct slab_rcu *)slabp;
1452 slab_rcu->cachep = cachep; 1468 slab_rcu->cachep = cachep;
1453 slab_rcu->addr = addr; 1469 slab_rcu->addr = addr;
1454 call_rcu(&slab_rcu->head, kmem_rcu_free); 1470 call_rcu(&slab_rcu->head, kmem_rcu_free);
@@ -1466,11 +1482,58 @@ static inline void set_up_list3s(kmem_cache_t *cachep, int index)
1466 int node; 1482 int node;
1467 1483
1468 for_each_online_node(node) { 1484 for_each_online_node(node) {
1469 cachep->nodelists[node] = &initkmem_list3[index+node]; 1485 cachep->nodelists[node] = &initkmem_list3[index + node];
1470 cachep->nodelists[node]->next_reap = jiffies + 1486 cachep->nodelists[node]->next_reap = jiffies +
1471 REAPTIMEOUT_LIST3 + 1487 REAPTIMEOUT_LIST3 +
1472 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 1488 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1489 }
1490}
1491
1492/**
1493 * calculate_slab_order - calculate size (page order) of slabs and the number
1494 * of objects per slab.
1495 *
1496 * This could be made much more intelligent. For now, try to avoid using
1497 * high order pages for slabs. When the gfp() functions are more friendly
1498 * towards high-order requests, this should be changed.
1499 */
1500static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size,
1501 size_t align, gfp_t flags)
1502{
1503 size_t left_over = 0;
1504
1505 for (;; cachep->gfporder++) {
1506 unsigned int num;
1507 size_t remainder;
1508
1509 if (cachep->gfporder > MAX_GFP_ORDER) {
1510 cachep->num = 0;
1511 break;
1512 }
1513
1514 cache_estimate(cachep->gfporder, size, align, flags,
1515 &remainder, &num);
1516 if (!num)
1517 continue;
1518 /* More than offslab_limit objects will cause problems */
1519 if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit)
1520 break;
1521
1522 cachep->num = num;
1523 left_over = remainder;
1524
1525 /*
1526 * Large number of objects is good, but very large slabs are
1527 * currently bad for the gfp()s.
1528 */
1529 if (cachep->gfporder >= slab_break_gfp_order)
1530 break;
1531
1532 if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder))
1533 /* Acceptable internal fragmentation */
1534 break;
1473 } 1535 }
1536 return left_over;
1474} 1537}
1475 1538
1476/** 1539/**
@@ -1519,14 +1582,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1519 * Sanity checks... these are all serious usage bugs. 1582 * Sanity checks... these are all serious usage bugs.
1520 */ 1583 */
1521 if ((!name) || 1584 if ((!name) ||
1522 in_interrupt() || 1585 in_interrupt() ||
1523 (size < BYTES_PER_WORD) || 1586 (size < BYTES_PER_WORD) ||
1524 (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) || 1587 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
1525 (dtor && !ctor)) { 1588 printk(KERN_ERR "%s: Early error in slab %s\n",
1526 printk(KERN_ERR "%s: Early error in slab %s\n", 1589 __FUNCTION__, name);
1527 __FUNCTION__, name); 1590 BUG();
1528 BUG(); 1591 }
1529 }
1530 1592
1531 down(&cache_chain_sem); 1593 down(&cache_chain_sem);
1532 1594
@@ -1546,11 +1608,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1546 set_fs(old_fs); 1608 set_fs(old_fs);
1547 if (res) { 1609 if (res) {
1548 printk("SLAB: cache with size %d has lost its name\n", 1610 printk("SLAB: cache with size %d has lost its name\n",
1549 pc->objsize); 1611 pc->objsize);
1550 continue; 1612 continue;
1551 } 1613 }
1552 1614
1553 if (!strcmp(pc->name,name)) { 1615 if (!strcmp(pc->name, name)) {
1554 printk("kmem_cache_create: duplicate cache %s\n", name); 1616 printk("kmem_cache_create: duplicate cache %s\n", name);
1555 dump_stack(); 1617 dump_stack();
1556 goto oops; 1618 goto oops;
@@ -1562,10 +1624,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1562 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { 1624 if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
1563 /* No constructor, but inital state check requested */ 1625 /* No constructor, but inital state check requested */
1564 printk(KERN_ERR "%s: No con, but init state check " 1626 printk(KERN_ERR "%s: No con, but init state check "
1565 "requested - %s\n", __FUNCTION__, name); 1627 "requested - %s\n", __FUNCTION__, name);
1566 flags &= ~SLAB_DEBUG_INITIAL; 1628 flags &= ~SLAB_DEBUG_INITIAL;
1567 } 1629 }
1568
1569#if FORCED_DEBUG 1630#if FORCED_DEBUG
1570 /* 1631 /*
1571 * Enable redzoning and last user accounting, except for caches with 1632 * Enable redzoning and last user accounting, except for caches with
@@ -1573,8 +1634,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1573 * above the next power of two: caches with object sizes just above a 1634 * above the next power of two: caches with object sizes just above a
1574 * power of two have a significant amount of internal fragmentation. 1635 * power of two have a significant amount of internal fragmentation.
1575 */ 1636 */
1576 if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))) 1637 if ((size < 4096
1577 flags |= SLAB_RED_ZONE|SLAB_STORE_USER; 1638 || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
1639 flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
1578 if (!(flags & SLAB_DESTROY_BY_RCU)) 1640 if (!(flags & SLAB_DESTROY_BY_RCU))
1579 flags |= SLAB_POISON; 1641 flags |= SLAB_POISON;
1580#endif 1642#endif
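The FORCED_DEBUG condition reflowed above decides whether the extra debug words (two redzones plus the last-user pointer) are worth their cost: they are added for any object smaller than 4096 bytes, and for larger objects only when fls(size - 1) is unchanged by the padding, i.e. when the padded size stays under the same power-of-two boundary. A small userspace illustration, using a local fls() built on a GCC builtin and assuming BYTES_PER_WORD is sizeof(void *) as in slab.c:

#include <stdio.h>
#include <stddef.h>

#define BYTES_PER_WORD	sizeof(void *)	/* as in slab.c */

/* Index (1-based) of the most-significant set bit, 0 for x == 0; a local
 * stand-in for the kernel's fls(), built on a GCC builtin. */
static int fls(unsigned int x)
{
	return x ? 32 - __builtin_clz(x) : 0;
}

/* Would FORCED_DEBUG add redzones and last-user tracking for this size? */
static int wants_debug(size_t size)
{
	return size < 4096 ||
	       fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD);
}

int main(void)
{
	size_t sizes[] = { 4096, 4100, 8192, 8200 };
	size_t i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		printf("size %5zu: debug padding %s\n", sizes[i],
		       wants_debug(sizes[i]) ? "added" : "skipped");
	return 0;
}

So an object of exactly 4096 or 8192 bytes skips the padding (it would spill past the power-of-two boundary), while sizes just above those boundaries keep it.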
@@ -1595,9 +1657,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1595 * unaligned accesses for some archs when redzoning is used, and makes 1657 * unaligned accesses for some archs when redzoning is used, and makes
1596 * sure any on-slab bufctl's are also correctly aligned. 1658 * sure any on-slab bufctl's are also correctly aligned.
1597 */ 1659 */
1598 if (size & (BYTES_PER_WORD-1)) { 1660 if (size & (BYTES_PER_WORD - 1)) {
1599 size += (BYTES_PER_WORD-1); 1661 size += (BYTES_PER_WORD - 1);
1600 size &= ~(BYTES_PER_WORD-1); 1662 size &= ~(BYTES_PER_WORD - 1);
1601 } 1663 }
1602 1664
1603 /* calculate out the final buffer alignment: */ 1665 /* calculate out the final buffer alignment: */
@@ -1608,7 +1670,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1608 * objects into one cacheline. 1670 * objects into one cacheline.
1609 */ 1671 */
1610 ralign = cache_line_size(); 1672 ralign = cache_line_size();
1611 while (size <= ralign/2) 1673 while (size <= ralign / 2)
1612 ralign /= 2; 1674 ralign /= 2;
1613 } else { 1675 } else {
1614 ralign = BYTES_PER_WORD; 1676 ralign = BYTES_PER_WORD;
@@ -1617,13 +1679,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1617 if (ralign < ARCH_SLAB_MINALIGN) { 1679 if (ralign < ARCH_SLAB_MINALIGN) {
1618 ralign = ARCH_SLAB_MINALIGN; 1680 ralign = ARCH_SLAB_MINALIGN;
1619 if (ralign > BYTES_PER_WORD) 1681 if (ralign > BYTES_PER_WORD)
1620 flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); 1682 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1621 } 1683 }
1622 /* 3) caller mandated alignment: disables debug if necessary */ 1684 /* 3) caller mandated alignment: disables debug if necessary */
1623 if (ralign < align) { 1685 if (ralign < align) {
1624 ralign = align; 1686 ralign = align;
1625 if (ralign > BYTES_PER_WORD) 1687 if (ralign > BYTES_PER_WORD)
1626 flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); 1688 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1627 } 1689 }
1628 /* 4) Store it. Note that the debug code below can reduce 1690 /* 4) Store it. Note that the debug code below can reduce
1629 * the alignment to BYTES_PER_WORD. 1691 * the alignment to BYTES_PER_WORD.
@@ -1645,7 +1707,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1645 1707
1646 /* add space for red zone words */ 1708 /* add space for red zone words */
1647 cachep->dbghead += BYTES_PER_WORD; 1709 cachep->dbghead += BYTES_PER_WORD;
1648 size += 2*BYTES_PER_WORD; 1710 size += 2 * BYTES_PER_WORD;
1649 } 1711 }
1650 if (flags & SLAB_STORE_USER) { 1712 if (flags & SLAB_STORE_USER) {
1651 /* user store requires word alignment and 1713 /* user store requires word alignment and
@@ -1656,7 +1718,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1656 size += BYTES_PER_WORD; 1718 size += BYTES_PER_WORD;
1657 } 1719 }
1658#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 1720#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
1659 if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { 1721 if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
1722 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
1660 cachep->dbghead += PAGE_SIZE - size; 1723 cachep->dbghead += PAGE_SIZE - size;
1661 size = PAGE_SIZE; 1724 size = PAGE_SIZE;
1662 } 1725 }
@@ -1664,7 +1727,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1664#endif 1727#endif
1665 1728
1666 /* Determine if the slab management is 'on' or 'off' slab. */ 1729 /* Determine if the slab management is 'on' or 'off' slab. */
1667 if (size >= (PAGE_SIZE>>3)) 1730 if (size >= (PAGE_SIZE >> 3))
1668 /* 1731 /*
1669 * Size is large, assume best to place the slab management obj 1732 * Size is large, assume best to place the slab management obj
1670 * off-slab (should allow better packing of objs). 1733 * off-slab (should allow better packing of objs).
@@ -1681,47 +1744,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1681 */ 1744 */
1682 cachep->gfporder = 0; 1745 cachep->gfporder = 0;
1683 cache_estimate(cachep->gfporder, size, align, flags, 1746 cache_estimate(cachep->gfporder, size, align, flags,
1684 &left_over, &cachep->num); 1747 &left_over, &cachep->num);
1685 } else { 1748 } else
1686 /* 1749 left_over = calculate_slab_order(cachep, size, align, flags);
1687 * Calculate size (in pages) of slabs, and the num of objs per
1688 * slab. This could be made much more intelligent. For now,
1689 * try to avoid using high page-orders for slabs. When the
1690 * gfp() funcs are more friendly towards high-order requests,
1691 * this should be changed.
1692 */
1693 do {
1694 unsigned int break_flag = 0;
1695cal_wastage:
1696 cache_estimate(cachep->gfporder, size, align, flags,
1697 &left_over, &cachep->num);
1698 if (break_flag)
1699 break;
1700 if (cachep->gfporder >= MAX_GFP_ORDER)
1701 break;
1702 if (!cachep->num)
1703 goto next;
1704 if (flags & CFLGS_OFF_SLAB &&
1705 cachep->num > offslab_limit) {
1706 /* This num of objs will cause problems. */
1707 cachep->gfporder--;
1708 break_flag++;
1709 goto cal_wastage;
1710 }
1711
1712 /*
1713 * Large num of objs is good, but v. large slabs are
1714 * currently bad for the gfp()s.
1715 */
1716 if (cachep->gfporder >= slab_break_gfp_order)
1717 break;
1718
1719 if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))
1720 break; /* Acceptable internal fragmentation. */
1721next:
1722 cachep->gfporder++;
1723 } while (1);
1724 }
1725 1750
1726 if (!cachep->num) { 1751 if (!cachep->num) {
1727 printk("kmem_cache_create: couldn't create cache %s.\n", name); 1752 printk("kmem_cache_create: couldn't create cache %s.\n", name);
@@ -1729,8 +1754,8 @@ next:
1729 cachep = NULL; 1754 cachep = NULL;
1730 goto oops; 1755 goto oops;
1731 } 1756 }
1732 slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t) 1757 slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
1733 + sizeof(struct slab), align); 1758 + sizeof(struct slab), align);
1734 1759
1735 /* 1760 /*
1736 * If the slab has been placed off-slab, and we have enough space then 1761 * If the slab has been placed off-slab, and we have enough space then
@@ -1743,14 +1768,15 @@ next:
1743 1768
1744 if (flags & CFLGS_OFF_SLAB) { 1769 if (flags & CFLGS_OFF_SLAB) {
1745 /* really off slab. No need for manual alignment */ 1770 /* really off slab. No need for manual alignment */
1746 slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab); 1771 slab_size =
1772 cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
1747 } 1773 }
1748 1774
1749 cachep->colour_off = cache_line_size(); 1775 cachep->colour_off = cache_line_size();
1750 /* Offset must be a multiple of the alignment. */ 1776 /* Offset must be a multiple of the alignment. */
1751 if (cachep->colour_off < align) 1777 if (cachep->colour_off < align)
1752 cachep->colour_off = align; 1778 cachep->colour_off = align;
1753 cachep->colour = left_over/cachep->colour_off; 1779 cachep->colour = left_over / cachep->colour_off;
1754 cachep->slab_size = slab_size; 1780 cachep->slab_size = slab_size;
1755 cachep->flags = flags; 1781 cachep->flags = flags;
1756 cachep->gfpflags = 0; 1782 cachep->gfpflags = 0;
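cachep->colour computed above is simply how many distinct cache-colour offsets fit into the leftover space of one slab: left_over / colour_off, where colour_off is at least a cache line and at least the requested alignment. Successive slabs then start their objects at 0, colour_off, 2 * colour_off, and so on, wrapping around, so objects from different slabs do not all compete for the same cache lines. A toy illustration with made-up numbers:

#include <stdio.h>

/* Slab colouring with made-up numbers: each new slab shifts its first object
 * by one more colour_off, cycling once the leftover space is used up. */
int main(void)
{
	unsigned int colour_off  = 64;		/* >= cache line size and >= align */
	unsigned int left_over   = 384;		/* unused bytes per slab */
	unsigned int colour      = left_over / colour_off;	/* 6 distinct offsets */
	unsigned int colour_next = 0;
	int slab;

	for (slab = 0; slab < 8; slab++) {
		unsigned int offset = colour_next * colour_off;

		printf("slab %d: first object at byte offset %u\n", slab, offset);
		if (++colour_next >= colour)
			colour_next = 0;	/* wrap, as cache_grow() does */
	}
	return 0;
}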
@@ -1777,7 +1803,7 @@ next:
1777 * the creation of further caches will BUG(). 1803 * the creation of further caches will BUG().
1778 */ 1804 */
1779 cachep->array[smp_processor_id()] = 1805 cachep->array[smp_processor_id()] =
1780 &initarray_generic.cache; 1806 &initarray_generic.cache;
1781 1807
1782 /* If the cache that's used by 1808 /* If the cache that's used by
1783 * kmalloc(sizeof(kmem_list3)) is the first cache, 1809 * kmalloc(sizeof(kmem_list3)) is the first cache,
@@ -1791,8 +1817,7 @@ next:
1791 g_cpucache_up = PARTIAL_AC; 1817 g_cpucache_up = PARTIAL_AC;
1792 } else { 1818 } else {
1793 cachep->array[smp_processor_id()] = 1819 cachep->array[smp_processor_id()] =
1794 kmalloc(sizeof(struct arraycache_init), 1820 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1795 GFP_KERNEL);
1796 1821
1797 if (g_cpucache_up == PARTIAL_AC) { 1822 if (g_cpucache_up == PARTIAL_AC) {
1798 set_up_list3s(cachep, SIZE_L3); 1823 set_up_list3s(cachep, SIZE_L3);
@@ -1802,16 +1827,18 @@ next:
1802 for_each_online_node(node) { 1827 for_each_online_node(node) {
1803 1828
1804 cachep->nodelists[node] = 1829 cachep->nodelists[node] =
1805 kmalloc_node(sizeof(struct kmem_list3), 1830 kmalloc_node(sizeof
1806 GFP_KERNEL, node); 1831 (struct kmem_list3),
1832 GFP_KERNEL, node);
1807 BUG_ON(!cachep->nodelists[node]); 1833 BUG_ON(!cachep->nodelists[node]);
1808 kmem_list3_init(cachep->nodelists[node]); 1834 kmem_list3_init(cachep->
1835 nodelists[node]);
1809 } 1836 }
1810 } 1837 }
1811 } 1838 }
1812 cachep->nodelists[numa_node_id()]->next_reap = 1839 cachep->nodelists[numa_node_id()]->next_reap =
1813 jiffies + REAPTIMEOUT_LIST3 + 1840 jiffies + REAPTIMEOUT_LIST3 +
1814 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 1841 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1815 1842
1816 BUG_ON(!ac_data(cachep)); 1843 BUG_ON(!ac_data(cachep));
1817 ac_data(cachep)->avail = 0; 1844 ac_data(cachep)->avail = 0;
@@ -1820,15 +1847,15 @@ next:
1820 ac_data(cachep)->touched = 0; 1847 ac_data(cachep)->touched = 0;
1821 cachep->batchcount = 1; 1848 cachep->batchcount = 1;
1822 cachep->limit = BOOT_CPUCACHE_ENTRIES; 1849 cachep->limit = BOOT_CPUCACHE_ENTRIES;
1823 } 1850 }
1824 1851
1825 /* cache setup completed, link it into the list */ 1852 /* cache setup completed, link it into the list */
1826 list_add(&cachep->next, &cache_chain); 1853 list_add(&cachep->next, &cache_chain);
1827 unlock_cpu_hotplug(); 1854 unlock_cpu_hotplug();
1828oops: 1855 oops:
1829 if (!cachep && (flags & SLAB_PANIC)) 1856 if (!cachep && (flags & SLAB_PANIC))
1830 panic("kmem_cache_create(): failed to create slab `%s'\n", 1857 panic("kmem_cache_create(): failed to create slab `%s'\n",
1831 name); 1858 name);
1832 up(&cache_chain_sem); 1859 up(&cache_chain_sem);
1833 return cachep; 1860 return cachep;
1834} 1861}
@@ -1871,7 +1898,7 @@ static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
1871/* 1898/*
1872 * Waits for all CPUs to execute func(). 1899 * Waits for all CPUs to execute func().
1873 */ 1900 */
1874static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) 1901static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
1875{ 1902{
1876 check_irq_on(); 1903 check_irq_on();
1877 preempt_disable(); 1904 preempt_disable();
@@ -1886,12 +1913,12 @@ static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
1886 preempt_enable(); 1913 preempt_enable();
1887} 1914}
1888 1915
1889static void drain_array_locked(kmem_cache_t* cachep, 1916static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
1890 struct array_cache *ac, int force, int node); 1917 int force, int node);
1891 1918
1892static void do_drain(void *arg) 1919static void do_drain(void *arg)
1893{ 1920{
1894 kmem_cache_t *cachep = (kmem_cache_t*)arg; 1921 kmem_cache_t *cachep = (kmem_cache_t *) arg;
1895 struct array_cache *ac; 1922 struct array_cache *ac;
1896 int node = numa_node_id(); 1923 int node = numa_node_id();
1897 1924
@@ -1911,7 +1938,7 @@ static void drain_cpu_caches(kmem_cache_t *cachep)
1911 smp_call_function_all_cpus(do_drain, cachep); 1938 smp_call_function_all_cpus(do_drain, cachep);
1912 check_irq_on(); 1939 check_irq_on();
1913 spin_lock_irq(&cachep->spinlock); 1940 spin_lock_irq(&cachep->spinlock);
1914 for_each_online_node(node) { 1941 for_each_online_node(node) {
1915 l3 = cachep->nodelists[node]; 1942 l3 = cachep->nodelists[node];
1916 if (l3) { 1943 if (l3) {
1917 spin_lock(&l3->list_lock); 1944 spin_lock(&l3->list_lock);
@@ -1949,8 +1976,7 @@ static int __node_shrink(kmem_cache_t *cachep, int node)
1949 slab_destroy(cachep, slabp); 1976 slab_destroy(cachep, slabp);
1950 spin_lock_irq(&l3->list_lock); 1977 spin_lock_irq(&l3->list_lock);
1951 } 1978 }
1952 ret = !list_empty(&l3->slabs_full) || 1979 ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial);
1953 !list_empty(&l3->slabs_partial);
1954 return ret; 1980 return ret;
1955} 1981}
1956 1982
@@ -2006,7 +2032,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
2006 * The caller must guarantee that noone will allocate memory from the cache 2032 * The caller must guarantee that noone will allocate memory from the cache
2007 * during the kmem_cache_destroy(). 2033 * during the kmem_cache_destroy().
2008 */ 2034 */
2009int kmem_cache_destroy(kmem_cache_t * cachep) 2035int kmem_cache_destroy(kmem_cache_t *cachep)
2010{ 2036{
2011 int i; 2037 int i;
2012 struct kmem_list3 *l3; 2038 struct kmem_list3 *l3;
@@ -2028,7 +2054,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
2028 if (__cache_shrink(cachep)) { 2054 if (__cache_shrink(cachep)) {
2029 slab_error(cachep, "Can't free all objects"); 2055 slab_error(cachep, "Can't free all objects");
2030 down(&cache_chain_sem); 2056 down(&cache_chain_sem);
2031 list_add(&cachep->next,&cache_chain); 2057 list_add(&cachep->next, &cache_chain);
2032 up(&cache_chain_sem); 2058 up(&cache_chain_sem);
2033 unlock_cpu_hotplug(); 2059 unlock_cpu_hotplug();
2034 return 1; 2060 return 1;
@@ -2038,7 +2064,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
2038 synchronize_rcu(); 2064 synchronize_rcu();
2039 2065
2040 for_each_online_cpu(i) 2066 for_each_online_cpu(i)
2041 kfree(cachep->array[i]); 2067 kfree(cachep->array[i]);
2042 2068
2043 /* NUMA: free the list3 structures */ 2069 /* NUMA: free the list3 structures */
2044 for_each_online_node(i) { 2070 for_each_online_node(i) {
@@ -2057,39 +2083,39 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
2057EXPORT_SYMBOL(kmem_cache_destroy); 2083EXPORT_SYMBOL(kmem_cache_destroy);
2058 2084
2059/* Get the memory for a slab management obj. */ 2085/* Get the memory for a slab management obj. */
2060static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp, 2086static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
2061 int colour_off, gfp_t local_flags) 2087 int colour_off, gfp_t local_flags)
2062{ 2088{
2063 struct slab *slabp; 2089 struct slab *slabp;
2064 2090
2065 if (OFF_SLAB(cachep)) { 2091 if (OFF_SLAB(cachep)) {
2066 /* Slab management obj is off-slab. */ 2092 /* Slab management obj is off-slab. */
2067 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); 2093 slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
2068 if (!slabp) 2094 if (!slabp)
2069 return NULL; 2095 return NULL;
2070 } else { 2096 } else {
2071 slabp = objp+colour_off; 2097 slabp = objp + colour_off;
2072 colour_off += cachep->slab_size; 2098 colour_off += cachep->slab_size;
2073 } 2099 }
2074 slabp->inuse = 0; 2100 slabp->inuse = 0;
2075 slabp->colouroff = colour_off; 2101 slabp->colouroff = colour_off;
2076 slabp->s_mem = objp+colour_off; 2102 slabp->s_mem = objp + colour_off;
2077 2103
2078 return slabp; 2104 return slabp;
2079} 2105}
2080 2106
2081static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) 2107static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
2082{ 2108{
2083 return (kmem_bufctl_t *)(slabp+1); 2109 return (kmem_bufctl_t *) (slabp + 1);
2084} 2110}
2085 2111
2086static void cache_init_objs(kmem_cache_t *cachep, 2112static void cache_init_objs(kmem_cache_t *cachep,
2087 struct slab *slabp, unsigned long ctor_flags) 2113 struct slab *slabp, unsigned long ctor_flags)
2088{ 2114{
2089 int i; 2115 int i;
2090 2116
2091 for (i = 0; i < cachep->num; i++) { 2117 for (i = 0; i < cachep->num; i++) {
2092 void *objp = slabp->s_mem+cachep->objsize*i; 2118 void *objp = slabp->s_mem + cachep->objsize * i;
2093#if DEBUG 2119#if DEBUG
2094 /* need to poison the objs? */ 2120 /* need to poison the objs? */
2095 if (cachep->flags & SLAB_POISON) 2121 if (cachep->flags & SLAB_POISON)
@@ -2107,25 +2133,28 @@ static void cache_init_objs(kmem_cache_t *cachep,
2107 * Otherwise, deadlock. They must also be threaded. 2133 * Otherwise, deadlock. They must also be threaded.
2108 */ 2134 */
2109 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2135 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2110 cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags); 2136 cachep->ctor(objp + obj_dbghead(cachep), cachep,
2137 ctor_flags);
2111 2138
2112 if (cachep->flags & SLAB_RED_ZONE) { 2139 if (cachep->flags & SLAB_RED_ZONE) {
2113 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) 2140 if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
2114 slab_error(cachep, "constructor overwrote the" 2141 slab_error(cachep, "constructor overwrote the"
2115 " end of an object"); 2142 " end of an object");
2116 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) 2143 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
2117 slab_error(cachep, "constructor overwrote the" 2144 slab_error(cachep, "constructor overwrote the"
2118 " start of an object"); 2145 " start of an object");
2119 } 2146 }
2120 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) 2147 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)
2121 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); 2148 && cachep->flags & SLAB_POISON)
2149 kernel_map_pages(virt_to_page(objp),
2150 cachep->objsize / PAGE_SIZE, 0);
2122#else 2151#else
2123 if (cachep->ctor) 2152 if (cachep->ctor)
2124 cachep->ctor(objp, cachep, ctor_flags); 2153 cachep->ctor(objp, cachep, ctor_flags);
2125#endif 2154#endif
2126 slab_bufctl(slabp)[i] = i+1; 2155 slab_bufctl(slabp)[i] = i + 1;
2127 } 2156 }
2128 slab_bufctl(slabp)[i-1] = BUFCTL_END; 2157 slab_bufctl(slabp)[i - 1] = BUFCTL_END;
2129 slabp->free = 0; 2158 slabp->free = 0;
2130} 2159}
2131 2160
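cache_init_objs() above also threads the per-slab free list: slab_bufctl(slabp) is an index array stored directly behind the slab descriptor, entry i holds the index of the next free object, the final entry is BUFCTL_END, and slabp->free points at the head. Allocation pops the head; the free path pushes an index back. A self-contained model of that bookkeeping (the array size and sentinel value are illustrative):

#include <stdio.h>

#define NUM_OBJS	4
#define BUFCTL_END	0xffffffffU	/* end-of-list sentinel, in the same spirit */

static unsigned int bufctl[NUM_OBJS];	/* bufctl[i]: index of the next free object */
static unsigned int free_head;		/* plays the role of slabp->free */

static void init_objs(void)
{
	unsigned int i;

	for (i = 0; i < NUM_OBJS; i++)
		bufctl[i] = i + 1;
	bufctl[NUM_OBJS - 1] = BUFCTL_END;
	free_head = 0;
}

static unsigned int alloc_obj(void)
{
	unsigned int i = free_head;

	if (i == BUFCTL_END)
		return BUFCTL_END;	/* slab is full */
	free_head = bufctl[i];		/* pop the head of the list */
	return i;
}

static void free_obj(unsigned int i)
{
	bufctl[i] = free_head;		/* push the index back onto the head */
	free_head = i;
}

int main(void)
{
	unsigned int a, b;

	init_objs();
	a = alloc_obj();
	b = alloc_obj();
	printf("allocated objects %u and %u\n", a, b);
	free_obj(a);
	printf("after freeing %u, the next allocation returns %u\n",
	       a, alloc_obj());
	return 0;
}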
@@ -2161,17 +2190,17 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
2161 */ 2190 */
2162static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2191static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2163{ 2192{
2164 struct slab *slabp; 2193 struct slab *slabp;
2165 void *objp; 2194 void *objp;
2166 size_t offset; 2195 size_t offset;
2167 gfp_t local_flags; 2196 gfp_t local_flags;
2168 unsigned long ctor_flags; 2197 unsigned long ctor_flags;
2169 struct kmem_list3 *l3; 2198 struct kmem_list3 *l3;
2170 2199
2171 /* Be lazy and only check for valid flags here, 2200 /* Be lazy and only check for valid flags here,
2172 * keeping it out of the critical path in kmem_cache_alloc(). 2201 * keeping it out of the critical path in kmem_cache_alloc().
2173 */ 2202 */
2174 if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) 2203 if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW))
2175 BUG(); 2204 BUG();
2176 if (flags & SLAB_NO_GROW) 2205 if (flags & SLAB_NO_GROW)
2177 return 0; 2206 return 0;
@@ -2237,9 +2266,9 @@ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2237 l3->free_objects += cachep->num; 2266 l3->free_objects += cachep->num;
2238 spin_unlock(&l3->list_lock); 2267 spin_unlock(&l3->list_lock);
2239 return 1; 2268 return 1;
2240opps1: 2269 opps1:
2241 kmem_freepages(cachep, objp); 2270 kmem_freepages(cachep, objp);
2242failed: 2271 failed:
2243 if (local_flags & __GFP_WAIT) 2272 if (local_flags & __GFP_WAIT)
2244 local_irq_disable(); 2273 local_irq_disable();
2245 return 0; 2274 return 0;
@@ -2259,18 +2288,19 @@ static void kfree_debugcheck(const void *objp)
2259 2288
2260 if (!virt_addr_valid(objp)) { 2289 if (!virt_addr_valid(objp)) {
2261 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", 2290 printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
2262 (unsigned long)objp); 2291 (unsigned long)objp);
2263 BUG(); 2292 BUG();
2264 } 2293 }
2265 page = virt_to_page(objp); 2294 page = virt_to_page(objp);
2266 if (!PageSlab(page)) { 2295 if (!PageSlab(page)) {
2267 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp); 2296 printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n",
2297 (unsigned long)objp);
2268 BUG(); 2298 BUG();
2269 } 2299 }
2270} 2300}
2271 2301
2272static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, 2302static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2273 void *caller) 2303 void *caller)
2274{ 2304{
2275 struct page *page; 2305 struct page *page;
2276 unsigned int objnr; 2306 unsigned int objnr;
@@ -2281,20 +2311,26 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2281 page = virt_to_page(objp); 2311 page = virt_to_page(objp);
2282 2312
2283 if (page_get_cache(page) != cachep) { 2313 if (page_get_cache(page) != cachep) {
2284 printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n", 2314 printk(KERN_ERR
2285 page_get_cache(page),cachep); 2315 "mismatch in kmem_cache_free: expected cache %p, got %p\n",
2316 page_get_cache(page), cachep);
2286 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); 2317 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
2287 printk(KERN_ERR "%p is %s.\n", page_get_cache(page), page_get_cache(page)->name); 2318 printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
2319 page_get_cache(page)->name);
2288 WARN_ON(1); 2320 WARN_ON(1);
2289 } 2321 }
2290 slabp = page_get_slab(page); 2322 slabp = page_get_slab(page);
2291 2323
2292 if (cachep->flags & SLAB_RED_ZONE) { 2324 if (cachep->flags & SLAB_RED_ZONE) {
2293 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { 2325 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE
2294 slab_error(cachep, "double free, or memory outside" 2326 || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
2295 " object was overwritten"); 2327 slab_error(cachep,
2296 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2328 "double free, or memory outside"
2297 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); 2329 " object was overwritten");
2330 printk(KERN_ERR
2331 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2332 objp, *dbg_redzone1(cachep, objp),
2333 *dbg_redzone2(cachep, objp));
2298 } 2334 }
2299 *dbg_redzone1(cachep, objp) = RED_INACTIVE; 2335 *dbg_redzone1(cachep, objp) = RED_INACTIVE;
2300 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2336 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
@@ -2302,30 +2338,31 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp,
2302 if (cachep->flags & SLAB_STORE_USER) 2338 if (cachep->flags & SLAB_STORE_USER)
2303 *dbg_userword(cachep, objp) = caller; 2339 *dbg_userword(cachep, objp) = caller;
2304 2340
2305 objnr = (objp-slabp->s_mem)/cachep->objsize; 2341 objnr = (objp - slabp->s_mem) / cachep->objsize;
2306 2342
2307 BUG_ON(objnr >= cachep->num); 2343 BUG_ON(objnr >= cachep->num);
2308 BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize); 2344 BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize);
2309 2345
2310 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2346 if (cachep->flags & SLAB_DEBUG_INITIAL) {
2311 /* Need to call the slab's constructor so the 2347 /* Need to call the slab's constructor so the
2312 * caller can perform a verify of its state (debugging). 2348 * caller can perform a verify of its state (debugging).
2313 * Called without the cache-lock held. 2349 * Called without the cache-lock held.
2314 */ 2350 */
2315 cachep->ctor(objp+obj_dbghead(cachep), 2351 cachep->ctor(objp + obj_dbghead(cachep),
2316 cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); 2352 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
2317 } 2353 }
2318 if (cachep->flags & SLAB_POISON && cachep->dtor) { 2354 if (cachep->flags & SLAB_POISON && cachep->dtor) {
2319 /* we want to cache poison the object, 2355 /* we want to cache poison the object,
2320 * call the destruction callback 2356 * call the destruction callback
2321 */ 2357 */
2322 cachep->dtor(objp+obj_dbghead(cachep), cachep, 0); 2358 cachep->dtor(objp + obj_dbghead(cachep), cachep, 0);
2323 } 2359 }
2324 if (cachep->flags & SLAB_POISON) { 2360 if (cachep->flags & SLAB_POISON) {
2325#ifdef CONFIG_DEBUG_PAGEALLOC 2361#ifdef CONFIG_DEBUG_PAGEALLOC
2326 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { 2362 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
2327 store_stackinfo(cachep, objp, (unsigned long)caller); 2363 store_stackinfo(cachep, objp, (unsigned long)caller);
2328 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); 2364 kernel_map_pages(virt_to_page(objp),
2365 cachep->objsize / PAGE_SIZE, 0);
2329 } else { 2366 } else {
2330 poison_obj(cachep, objp, POISON_FREE); 2367 poison_obj(cachep, objp, POISON_FREE);
2331 } 2368 }
@@ -2340,7 +2377,7 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
2340{ 2377{
2341 kmem_bufctl_t i; 2378 kmem_bufctl_t i;
2342 int entries = 0; 2379 int entries = 0;
2343 2380
2344 /* Check slab's freelist to see if this obj is there. */ 2381 /* Check slab's freelist to see if this obj is there. */
2345 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 2382 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
2346 entries++; 2383 entries++;
@@ -2348,13 +2385,16 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
2348 goto bad; 2385 goto bad;
2349 } 2386 }
2350 if (entries != cachep->num - slabp->inuse) { 2387 if (entries != cachep->num - slabp->inuse) {
2351bad: 2388 bad:
2352 printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", 2389 printk(KERN_ERR
2353 cachep->name, cachep->num, slabp, slabp->inuse); 2390 "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2354 for (i=0;i<sizeof(slabp)+cachep->num*sizeof(kmem_bufctl_t);i++) { 2391 cachep->name, cachep->num, slabp, slabp->inuse);
2355 if ((i%16)==0) 2392 for (i = 0;
2393 i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t);
2394 i++) {
2395 if ((i % 16) == 0)
2356 printk("\n%03x:", i); 2396 printk("\n%03x:", i);
2357 printk(" %02x", ((unsigned char*)slabp)[i]); 2397 printk(" %02x", ((unsigned char *)slabp)[i]);
2358 } 2398 }
2359 printk("\n"); 2399 printk("\n");
2360 BUG(); 2400 BUG();
@@ -2374,7 +2414,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
2374 2414
2375 check_irq_off(); 2415 check_irq_off();
2376 ac = ac_data(cachep); 2416 ac = ac_data(cachep);
2377retry: 2417 retry:
2378 batchcount = ac->batchcount; 2418 batchcount = ac->batchcount;
2379 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2419 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2380 /* if there was little recent activity on this 2420 /* if there was little recent activity on this
@@ -2396,8 +2436,8 @@ retry:
2396 shared_array->avail -= batchcount; 2436 shared_array->avail -= batchcount;
2397 ac->avail = batchcount; 2437 ac->avail = batchcount;
2398 memcpy(ac->entry, 2438 memcpy(ac->entry,
2399 &(shared_array->entry[shared_array->avail]), 2439 &(shared_array->entry[shared_array->avail]),
2400 sizeof(void*)*batchcount); 2440 sizeof(void *) * batchcount);
2401 shared_array->touched = 1; 2441 shared_array->touched = 1;
2402 goto alloc_done; 2442 goto alloc_done;
2403 } 2443 }
@@ -2425,7 +2465,7 @@ retry:
2425 2465
2426 /* get obj pointer */ 2466 /* get obj pointer */
2427 ac->entry[ac->avail++] = slabp->s_mem + 2467 ac->entry[ac->avail++] = slabp->s_mem +
2428 slabp->free*cachep->objsize; 2468 slabp->free * cachep->objsize;
2429 2469
2430 slabp->inuse++; 2470 slabp->inuse++;
2431 next = slab_bufctl(slabp)[slabp->free]; 2471 next = slab_bufctl(slabp)[slabp->free];
@@ -2433,7 +2473,7 @@ retry:
2433 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2473 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2434 WARN_ON(numa_node_id() != slabp->nodeid); 2474 WARN_ON(numa_node_id() != slabp->nodeid);
2435#endif 2475#endif
2436 slabp->free = next; 2476 slabp->free = next;
2437 } 2477 }
2438 check_slabp(cachep, slabp); 2478 check_slabp(cachep, slabp);
2439 2479
@@ -2445,9 +2485,9 @@ retry:
2445 list_add(&slabp->list, &l3->slabs_partial); 2485 list_add(&slabp->list, &l3->slabs_partial);
2446 } 2486 }
2447 2487
2448must_grow: 2488 must_grow:
2449 l3->free_objects -= ac->avail; 2489 l3->free_objects -= ac->avail;
2450alloc_done: 2490 alloc_done:
2451 spin_unlock(&l3->list_lock); 2491 spin_unlock(&l3->list_lock);
2452 2492
2453 if (unlikely(!ac->avail)) { 2493 if (unlikely(!ac->avail)) {
@@ -2459,7 +2499,7 @@ alloc_done:
2459 if (!x && ac->avail == 0) // no objects in sight? abort 2499 if (!x && ac->avail == 0) // no objects in sight? abort
2460 return NULL; 2500 return NULL;
2461 2501
2462 if (!ac->avail) // objects refilled by interrupt? 2502 if (!ac->avail) // objects refilled by interrupt?
2463 goto retry; 2503 goto retry;
2464 } 2504 }
2465 ac->touched = 1; 2505 ac->touched = 1;
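For context on the refill path reindented above: the per-CPU array cache behaves like a small LIFO stack of object pointers, the hot allocation path simply pops the newest entry, and cache_alloc_refill() only runs when that stack is empty, pulling a whole batch from the shared array or the slab lists before retrying. A toy userspace model of that behaviour; refill() here just fabricates object numbers instead of touching slab lists, and the sizes are made up:

#include <stdio.h>

#define AC_LIMIT 8

/* Toy per-CPU array cache: a small LIFO stack of object "pointers". */
struct array_cache {
	int avail;
	int limit;
	int entry[AC_LIMIT];
};

/* Stand-in for cache_alloc_refill(): fabricates a batch of objects instead
 * of pulling them from the shared array or the slab lists. */
static int refill(struct array_cache *ac, int batchcount)
{
	static int next_obj = 100;
	int i;

	for (i = 0; i < batchcount && ac->avail < ac->limit; i++)
		ac->entry[ac->avail++] = next_obj++;
	return ac->avail;
}

static int cache_alloc(struct array_cache *ac, int batchcount)
{
	if (ac->avail == 0 && !refill(ac, batchcount))
		return -1;			/* nothing in sight: fail */
	return ac->entry[--ac->avail];		/* hot path: pop the newest entry */
}

int main(void)
{
	struct array_cache ac = { .avail = 0, .limit = AC_LIMIT };
	int i;

	for (i = 0; i < 5; i++) {
		int obj = cache_alloc(&ac, 4);

		printf("allocated object %d (avail now %d)\n", obj, ac.avail);
	}
	return 0;
}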
@@ -2476,16 +2516,16 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags)
2476} 2516}
2477 2517
2478#if DEBUG 2518#if DEBUG
2479static void * 2519static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags,
2480cache_alloc_debugcheck_after(kmem_cache_t *cachep, 2520 void *objp, void *caller)
2481 gfp_t flags, void *objp, void *caller)
2482{ 2521{
2483 if (!objp) 2522 if (!objp)
2484 return objp; 2523 return objp;
2485 if (cachep->flags & SLAB_POISON) { 2524 if (cachep->flags & SLAB_POISON) {
2486#ifdef CONFIG_DEBUG_PAGEALLOC 2525#ifdef CONFIG_DEBUG_PAGEALLOC
2487 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) 2526 if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep))
2488 kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1); 2527 kernel_map_pages(virt_to_page(objp),
2528 cachep->objsize / PAGE_SIZE, 1);
2489 else 2529 else
2490 check_poison_obj(cachep, objp); 2530 check_poison_obj(cachep, objp);
2491#else 2531#else
@@ -2497,24 +2537,28 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
2497 *dbg_userword(cachep, objp) = caller; 2537 *dbg_userword(cachep, objp) = caller;
2498 2538
2499 if (cachep->flags & SLAB_RED_ZONE) { 2539 if (cachep->flags & SLAB_RED_ZONE) {
2500 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2540 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE
2501 slab_error(cachep, "double free, or memory outside" 2541 || *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
2502 " object was overwritten"); 2542 slab_error(cachep,
2503 printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2543 "double free, or memory outside"
2504 objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); 2544 " object was overwritten");
2545 printk(KERN_ERR
2546 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2547 objp, *dbg_redzone1(cachep, objp),
2548 *dbg_redzone2(cachep, objp));
2505 } 2549 }
2506 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2550 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2507 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2551 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2508 } 2552 }
2509 objp += obj_dbghead(cachep); 2553 objp += obj_dbghead(cachep);
2510 if (cachep->ctor && cachep->flags & SLAB_POISON) { 2554 if (cachep->ctor && cachep->flags & SLAB_POISON) {
2511 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2555 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
2512 2556
2513 if (!(flags & __GFP_WAIT)) 2557 if (!(flags & __GFP_WAIT))
2514 ctor_flags |= SLAB_CTOR_ATOMIC; 2558 ctor_flags |= SLAB_CTOR_ATOMIC;
2515 2559
2516 cachep->ctor(objp, cachep, ctor_flags); 2560 cachep->ctor(objp, cachep, ctor_flags);
2517 } 2561 }
2518 return objp; 2562 return objp;
2519} 2563}
2520#else 2564#else
@@ -2523,7 +2567,7 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
2523 2567
2524static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2568static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2525{ 2569{
2526 void* objp; 2570 void *objp;
2527 struct array_cache *ac; 2571 struct array_cache *ac;
2528 2572
2529 check_irq_off(); 2573 check_irq_off();
@@ -2542,7 +2586,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2542static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) 2586static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2543{ 2587{
2544 unsigned long save_flags; 2588 unsigned long save_flags;
2545 void* objp; 2589 void *objp;
2546 2590
2547 cache_alloc_debugcheck_before(cachep, flags); 2591 cache_alloc_debugcheck_before(cachep, flags);
2548 2592
@@ -2550,7 +2594,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2550 objp = ____cache_alloc(cachep, flags); 2594 objp = ____cache_alloc(cachep, flags);
2551 local_irq_restore(save_flags); 2595 local_irq_restore(save_flags);
2552 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 2596 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
2553 __builtin_return_address(0)); 2597 __builtin_return_address(0));
2554 prefetchw(objp); 2598 prefetchw(objp);
2555 return objp; 2599 return objp;
2556} 2600}
@@ -2562,74 +2606,75 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2562static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) 2606static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2563{ 2607{
2564 struct list_head *entry; 2608 struct list_head *entry;
2565 struct slab *slabp; 2609 struct slab *slabp;
2566 struct kmem_list3 *l3; 2610 struct kmem_list3 *l3;
2567 void *obj; 2611 void *obj;
2568 kmem_bufctl_t next; 2612 kmem_bufctl_t next;
2569 int x; 2613 int x;
2570 2614
2571 l3 = cachep->nodelists[nodeid]; 2615 l3 = cachep->nodelists[nodeid];
2572 BUG_ON(!l3); 2616 BUG_ON(!l3);
2573 2617
2574retry: 2618 retry:
2575 spin_lock(&l3->list_lock); 2619 spin_lock(&l3->list_lock);
2576 entry = l3->slabs_partial.next; 2620 entry = l3->slabs_partial.next;
2577 if (entry == &l3->slabs_partial) { 2621 if (entry == &l3->slabs_partial) {
2578 l3->free_touched = 1; 2622 l3->free_touched = 1;
2579 entry = l3->slabs_free.next; 2623 entry = l3->slabs_free.next;
2580 if (entry == &l3->slabs_free) 2624 if (entry == &l3->slabs_free)
2581 goto must_grow; 2625 goto must_grow;
2582 } 2626 }
2583 2627
2584 slabp = list_entry(entry, struct slab, list); 2628 slabp = list_entry(entry, struct slab, list);
2585 check_spinlock_acquired_node(cachep, nodeid); 2629 check_spinlock_acquired_node(cachep, nodeid);
2586 check_slabp(cachep, slabp); 2630 check_slabp(cachep, slabp);
2587 2631
2588 STATS_INC_NODEALLOCS(cachep); 2632 STATS_INC_NODEALLOCS(cachep);
2589 STATS_INC_ACTIVE(cachep); 2633 STATS_INC_ACTIVE(cachep);
2590 STATS_SET_HIGH(cachep); 2634 STATS_SET_HIGH(cachep);
2591 2635
2592 BUG_ON(slabp->inuse == cachep->num); 2636 BUG_ON(slabp->inuse == cachep->num);
2593 2637
2594 /* get obj pointer */ 2638 /* get obj pointer */
2595 obj = slabp->s_mem + slabp->free*cachep->objsize; 2639 obj = slabp->s_mem + slabp->free * cachep->objsize;
2596 slabp->inuse++; 2640 slabp->inuse++;
2597 next = slab_bufctl(slabp)[slabp->free]; 2641 next = slab_bufctl(slabp)[slabp->free];
2598#if DEBUG 2642#if DEBUG
2599 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; 2643 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2600#endif 2644#endif
2601 slabp->free = next; 2645 slabp->free = next;
2602 check_slabp(cachep, slabp); 2646 check_slabp(cachep, slabp);
2603 l3->free_objects--; 2647 l3->free_objects--;
2604 /* move slabp to correct slabp list: */ 2648 /* move slabp to correct slabp list: */
2605 list_del(&slabp->list); 2649 list_del(&slabp->list);
2606 2650
2607 if (slabp->free == BUFCTL_END) { 2651 if (slabp->free == BUFCTL_END) {
2608 list_add(&slabp->list, &l3->slabs_full); 2652 list_add(&slabp->list, &l3->slabs_full);
2609 } else { 2653 } else {
2610 list_add(&slabp->list, &l3->slabs_partial); 2654 list_add(&slabp->list, &l3->slabs_partial);
2611 } 2655 }
2612 2656
2613 spin_unlock(&l3->list_lock); 2657 spin_unlock(&l3->list_lock);
2614 goto done; 2658 goto done;
2615 2659
2616must_grow: 2660 must_grow:
2617 spin_unlock(&l3->list_lock); 2661 spin_unlock(&l3->list_lock);
2618 x = cache_grow(cachep, flags, nodeid); 2662 x = cache_grow(cachep, flags, nodeid);
2619 2663
2620 if (!x) 2664 if (!x)
2621 return NULL; 2665 return NULL;
2622 2666
2623 goto retry; 2667 goto retry;
2624done: 2668 done:
2625 return obj; 2669 return obj;
2626} 2670}
2627#endif 2671#endif
2628 2672
2629/* 2673/*
2630 * Caller needs to acquire correct kmem_list's list_lock 2674 * Caller needs to acquire correct kmem_list's list_lock
2631 */ 2675 */
2632static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int node) 2676static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects,
2677 int node)
2633{ 2678{
2634 int i; 2679 int i;
2635 struct kmem_list3 *l3; 2680 struct kmem_list3 *l3;
@@ -2652,7 +2697,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n
2652 2697
2653 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2698 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2654 printk(KERN_ERR "slab: double free detected in cache " 2699 printk(KERN_ERR "slab: double free detected in cache "
2655 "'%s', objp %p\n", cachep->name, objp); 2700 "'%s', objp %p\n", cachep->name, objp);
2656 BUG(); 2701 BUG();
2657 } 2702 }
2658#endif 2703#endif
@@ -2696,20 +2741,19 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
2696 spin_lock(&l3->list_lock); 2741 spin_lock(&l3->list_lock);
2697 if (l3->shared) { 2742 if (l3->shared) {
2698 struct array_cache *shared_array = l3->shared; 2743 struct array_cache *shared_array = l3->shared;
2699 int max = shared_array->limit-shared_array->avail; 2744 int max = shared_array->limit - shared_array->avail;
2700 if (max) { 2745 if (max) {
2701 if (batchcount > max) 2746 if (batchcount > max)
2702 batchcount = max; 2747 batchcount = max;
2703 memcpy(&(shared_array->entry[shared_array->avail]), 2748 memcpy(&(shared_array->entry[shared_array->avail]),
2704 ac->entry, 2749 ac->entry, sizeof(void *) * batchcount);
2705 sizeof(void*)*batchcount);
2706 shared_array->avail += batchcount; 2750 shared_array->avail += batchcount;
2707 goto free_done; 2751 goto free_done;
2708 } 2752 }
2709 } 2753 }
2710 2754
2711 free_block(cachep, ac->entry, batchcount, node); 2755 free_block(cachep, ac->entry, batchcount, node);
2712free_done: 2756 free_done:
2713#if STATS 2757#if STATS
2714 { 2758 {
2715 int i = 0; 2759 int i = 0;
@@ -2731,10 +2775,9 @@ free_done:
2731 spin_unlock(&l3->list_lock); 2775 spin_unlock(&l3->list_lock);
2732 ac->avail -= batchcount; 2776 ac->avail -= batchcount;
2733 memmove(ac->entry, &(ac->entry[batchcount]), 2777 memmove(ac->entry, &(ac->entry[batchcount]),
2734 sizeof(void*)*ac->avail); 2778 sizeof(void *) * ac->avail);
2735} 2779}
2736 2780
2737
2738/* 2781/*
2739 * __cache_free 2782 * __cache_free
2740 * Release an obj back to its cache. If the obj has a constructed 2783 * Release an obj back to its cache. If the obj has a constructed
@@ -2759,7 +2802,8 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2759 if (unlikely(slabp->nodeid != numa_node_id())) { 2802 if (unlikely(slabp->nodeid != numa_node_id())) {
2760 struct array_cache *alien = NULL; 2803 struct array_cache *alien = NULL;
2761 int nodeid = slabp->nodeid; 2804 int nodeid = slabp->nodeid;
2762 struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()]; 2805 struct kmem_list3 *l3 =
2806 cachep->nodelists[numa_node_id()];
2763 2807
2764 STATS_INC_NODEFREES(cachep); 2808 STATS_INC_NODEFREES(cachep);
2765 if (l3->alien && l3->alien[nodeid]) { 2809 if (l3->alien && l3->alien[nodeid]) {
@@ -2767,15 +2811,15 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2767 spin_lock(&alien->lock); 2811 spin_lock(&alien->lock);
2768 if (unlikely(alien->avail == alien->limit)) 2812 if (unlikely(alien->avail == alien->limit))
2769 __drain_alien_cache(cachep, 2813 __drain_alien_cache(cachep,
2770 alien, nodeid); 2814 alien, nodeid);
2771 alien->entry[alien->avail++] = objp; 2815 alien->entry[alien->avail++] = objp;
2772 spin_unlock(&alien->lock); 2816 spin_unlock(&alien->lock);
2773 } else { 2817 } else {
2774 spin_lock(&(cachep->nodelists[nodeid])-> 2818 spin_lock(&(cachep->nodelists[nodeid])->
2775 list_lock); 2819 list_lock);
2776 free_block(cachep, &objp, 1, nodeid); 2820 free_block(cachep, &objp, 1, nodeid);
2777 spin_unlock(&(cachep->nodelists[nodeid])-> 2821 spin_unlock(&(cachep->nodelists[nodeid])->
2778 list_lock); 2822 list_lock);
2779 } 2823 }
2780 return; 2824 return;
2781 } 2825 }
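The NUMA branch reindented above is the "alien cache" path: when an object being freed belongs to a slab on another node, it is parked in a small per-remote-node array and pushed back to the owning node's lists in batches (or freed directly under the remote list_lock if no alien array exists), so the remote lock is not taken on every single free. A toy model of that batching, with made-up node counts and capacities:

#include <stdio.h>

#define NODES		2
#define ALIEN_CAP	4

/* alien[local][remote]: objects freed on 'local' that belong to 'remote'. */
struct alien_cache {
	int avail;
	int entry[ALIEN_CAP];
};

static struct alien_cache alien[NODES][NODES];

static void flush_alien(int local, int remote)
{
	struct alien_cache *ac = &alien[local][remote];

	/* The real code calls free_block() here under the remote list_lock. */
	printf("node %d: flushing %d objects back to node %d\n",
	       local, ac->avail, remote);
	ac->avail = 0;
}

static void toy_cache_free(int local, int obj, int owner)
{
	struct alien_cache *ac;

	if (owner == local) {
		printf("node %d: object %d freed locally\n", local, obj);
		return;
	}
	ac = &alien[local][owner];
	if (ac->avail == ALIEN_CAP)
		flush_alien(local, owner);	/* batch the expensive remote free */
	ac->entry[ac->avail++] = obj;
}

int main(void)
{
	int obj;

	for (obj = 0; obj < 10; obj++)
		toy_cache_free(0, obj, obj % 2);	/* odd objects live on node 1 */
	return 0;
}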
@@ -2822,9 +2866,9 @@ EXPORT_SYMBOL(kmem_cache_alloc);
2822 */ 2866 */
2823int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) 2867int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2824{ 2868{
2825 unsigned long addr = (unsigned long) ptr; 2869 unsigned long addr = (unsigned long)ptr;
2826 unsigned long min_addr = PAGE_OFFSET; 2870 unsigned long min_addr = PAGE_OFFSET;
2827 unsigned long align_mask = BYTES_PER_WORD-1; 2871 unsigned long align_mask = BYTES_PER_WORD - 1;
2828 unsigned long size = cachep->objsize; 2872 unsigned long size = cachep->objsize;
2829 struct page *page; 2873 struct page *page;
2830 2874
@@ -2844,7 +2888,7 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2844 if (unlikely(page_get_cache(page) != cachep)) 2888 if (unlikely(page_get_cache(page) != cachep))
2845 goto out; 2889 goto out;
2846 return 1; 2890 return 1;
2847out: 2891 out:
2848 return 0; 2892 return 0;
2849} 2893}
2850 2894
@@ -2871,8 +2915,10 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2871 2915
2872 if (unlikely(!cachep->nodelists[nodeid])) { 2916 if (unlikely(!cachep->nodelists[nodeid])) {
2873 /* Fall back to __cache_alloc if we run into trouble */ 2917 /* Fall back to __cache_alloc if we run into trouble */
2874 printk(KERN_WARNING "slab: not allocating in inactive node %d for cache %s\n", nodeid, cachep->name); 2918 printk(KERN_WARNING
2875 return __cache_alloc(cachep,flags); 2919 "slab: not allocating in inactive node %d for cache %s\n",
2920 nodeid, cachep->name);
2921 return __cache_alloc(cachep, flags);
2876 } 2922 }
2877 2923
2878 cache_alloc_debugcheck_before(cachep, flags); 2924 cache_alloc_debugcheck_before(cachep, flags);
@@ -2882,7 +2928,9 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
2882 else 2928 else
2883 ptr = __cache_alloc_node(cachep, flags, nodeid); 2929 ptr = __cache_alloc_node(cachep, flags, nodeid);
2884 local_irq_restore(save_flags); 2930 local_irq_restore(save_flags);
2885 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0)); 2931 ptr =
2932 cache_alloc_debugcheck_after(cachep, flags, ptr,
2933 __builtin_return_address(0));
2886 2934
2887 return ptr; 2935 return ptr;
2888} 2936}
@@ -2944,12 +2992,11 @@ EXPORT_SYMBOL(__kmalloc);
2944 * Objects should be dereferenced using the per_cpu_ptr macro only. 2992 * Objects should be dereferenced using the per_cpu_ptr macro only.
2945 * 2993 *
2946 * @size: how many bytes of memory are required. 2994 * @size: how many bytes of memory are required.
2947 * @align: the alignment, which can't be greater than SMP_CACHE_BYTES.
2948 */ 2995 */
2949void *__alloc_percpu(size_t size, size_t align) 2996void *__alloc_percpu(size_t size)
2950{ 2997{
2951 int i; 2998 int i;
2952 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); 2999 struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL);
2953 3000
2954 if (!pdata) 3001 if (!pdata)
2955 return NULL; 3002 return NULL;
@@ -2973,9 +3020,9 @@ void *__alloc_percpu(size_t size, size_t align)
2973 } 3020 }
2974 3021
2975 /* Catch derefs w/o wrappers */ 3022 /* Catch derefs w/o wrappers */
2976 return (void *) (~(unsigned long) pdata); 3023 return (void *)(~(unsigned long)pdata);
2977 3024
2978unwind_oom: 3025 unwind_oom:
2979 while (--i >= 0) { 3026 while (--i >= 0) {
2980 if (!cpu_possible(i)) 3027 if (!cpu_possible(i))
2981 continue; 3028 continue;
@@ -3006,20 +3053,6 @@ void kmem_cache_free(kmem_cache_t *cachep, void *objp)
3006EXPORT_SYMBOL(kmem_cache_free); 3053EXPORT_SYMBOL(kmem_cache_free);
3007 3054
3008/** 3055/**
3009 * kzalloc - allocate memory. The memory is set to zero.
3010 * @size: how many bytes of memory are required.
3011 * @flags: the type of memory to allocate.
3012 */
3013void *kzalloc(size_t size, gfp_t flags)
3014{
3015 void *ret = kmalloc(size, flags);
3016 if (ret)
3017 memset(ret, 0, size);
3018 return ret;
3019}
3020EXPORT_SYMBOL(kzalloc);
3021
3022/**
3023 * kfree - free previously allocated memory 3056 * kfree - free previously allocated memory
3024 * @objp: pointer returned by kmalloc. 3057 * @objp: pointer returned by kmalloc.
3025 * 3058 *
@@ -3038,7 +3071,8 @@ void kfree(const void *objp)
3038 local_irq_save(flags); 3071 local_irq_save(flags);
3039 kfree_debugcheck(objp); 3072 kfree_debugcheck(objp);
3040 c = page_get_cache(virt_to_page(objp)); 3073 c = page_get_cache(virt_to_page(objp));
3041 __cache_free(c, (void*)objp); 3074 mutex_debug_check_no_locks_freed(objp, obj_reallen(c));
3075 __cache_free(c, (void *)objp);
3042 local_irq_restore(flags); 3076 local_irq_restore(flags);
3043} 3077}
3044EXPORT_SYMBOL(kfree); 3078EXPORT_SYMBOL(kfree);
@@ -3051,17 +3085,16 @@ EXPORT_SYMBOL(kfree);
3051 * Don't free memory not originally allocated by alloc_percpu() 3085 * Don't free memory not originally allocated by alloc_percpu()
3052 * The complemented objp is to check for that. 3086 * The complemented objp is to check for that.
3053 */ 3087 */
3054void 3088void free_percpu(const void *objp)
3055free_percpu(const void *objp)
3056{ 3089{
3057 int i; 3090 int i;
3058 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); 3091 struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp);
3059 3092
3060 /* 3093 /*
3061 * We allocate for all cpus so we cannot use for online cpu here. 3094 * We allocate for all cpus so we cannot use for online cpu here.
3062 */ 3095 */
3063 for_each_cpu(i) 3096 for_each_cpu(i)
3064 kfree(p->ptrs[i]); 3097 kfree(p->ptrs[i]);
3065 kfree(p); 3098 kfree(p);
3066} 3099}
3067EXPORT_SYMBOL(free_percpu); 3100EXPORT_SYMBOL(free_percpu);
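__alloc_percpu() and free_percpu() above rely on a small trick: the handle returned to callers is the bitwise complement of the real percpu_data pointer, so dereferencing the handle without the per_cpu_ptr() wrapper faults immediately, and both the accessor and free_percpu() recover the table by complementing again. A userspace sketch of the idea; the *_sketch names are mine, NCPUS is made up, and error handling is trimmed:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define NCPUS 4

struct percpu_data {
	void *ptrs[NCPUS];
};

/* Hand back the complemented pointer so a raw dereference of the handle
 * blows up instead of silently reading the pointer table.
 * (Per-CPU allocation failures are ignored here for brevity.) */
static void *alloc_percpu_sketch(size_t size)
{
	struct percpu_data *pdata = calloc(1, sizeof(*pdata));
	int cpu;

	if (!pdata)
		return NULL;
	for (cpu = 0; cpu < NCPUS; cpu++)
		pdata->ptrs[cpu] = calloc(1, size);
	return (void *)(~(uintptr_t)pdata);
}

/* The accessor undoes the complement and indexes the per-CPU slot. */
static void *per_cpu_ptr_sketch(void *handle, int cpu)
{
	struct percpu_data *pdata = (struct percpu_data *)(~(uintptr_t)handle);

	return pdata->ptrs[cpu];
}

static void free_percpu_sketch(void *handle)
{
	struct percpu_data *pdata = (struct percpu_data *)(~(uintptr_t)handle);
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		free(pdata->ptrs[cpu]);
	free(pdata);
}

int main(void)
{
	void *counters = alloc_percpu_sketch(sizeof(long));

	if (!counters)
		return 1;
	*(long *)per_cpu_ptr_sketch(counters, 2) = 42;
	printf("cpu 2 counter: %ld\n", *(long *)per_cpu_ptr_sketch(counters, 2));
	free_percpu_sketch(counters);
	return 0;
}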
@@ -3095,44 +3128,44 @@ static int alloc_kmemlist(kmem_cache_t *cachep)
3095 if (!(new_alien = alloc_alien_cache(node, cachep->limit))) 3128 if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
3096 goto fail; 3129 goto fail;
3097#endif 3130#endif
3098 if (!(new = alloc_arraycache(node, (cachep->shared* 3131 if (!(new = alloc_arraycache(node, (cachep->shared *
3099 cachep->batchcount), 0xbaadf00d))) 3132 cachep->batchcount),
3133 0xbaadf00d)))
3100 goto fail; 3134 goto fail;
3101 if ((l3 = cachep->nodelists[node])) { 3135 if ((l3 = cachep->nodelists[node])) {
3102 3136
3103 spin_lock_irq(&l3->list_lock); 3137 spin_lock_irq(&l3->list_lock);
3104 3138
3105 if ((nc = cachep->nodelists[node]->shared)) 3139 if ((nc = cachep->nodelists[node]->shared))
3106 free_block(cachep, nc->entry, 3140 free_block(cachep, nc->entry, nc->avail, node);
3107 nc->avail, node);
3108 3141
3109 l3->shared = new; 3142 l3->shared = new;
3110 if (!cachep->nodelists[node]->alien) { 3143 if (!cachep->nodelists[node]->alien) {
3111 l3->alien = new_alien; 3144 l3->alien = new_alien;
3112 new_alien = NULL; 3145 new_alien = NULL;
3113 } 3146 }
3114 l3->free_limit = (1 + nr_cpus_node(node))* 3147 l3->free_limit = (1 + nr_cpus_node(node)) *
3115 cachep->batchcount + cachep->num; 3148 cachep->batchcount + cachep->num;
3116 spin_unlock_irq(&l3->list_lock); 3149 spin_unlock_irq(&l3->list_lock);
3117 kfree(nc); 3150 kfree(nc);
3118 free_alien_cache(new_alien); 3151 free_alien_cache(new_alien);
3119 continue; 3152 continue;
3120 } 3153 }
3121 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), 3154 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
3122 GFP_KERNEL, node))) 3155 GFP_KERNEL, node)))
3123 goto fail; 3156 goto fail;
3124 3157
3125 kmem_list3_init(l3); 3158 kmem_list3_init(l3);
3126 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3159 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3127 ((unsigned long)cachep)%REAPTIMEOUT_LIST3; 3160 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3128 l3->shared = new; 3161 l3->shared = new;
3129 l3->alien = new_alien; 3162 l3->alien = new_alien;
3130 l3->free_limit = (1 + nr_cpus_node(node))* 3163 l3->free_limit = (1 + nr_cpus_node(node)) *
3131 cachep->batchcount + cachep->num; 3164 cachep->batchcount + cachep->num;
3132 cachep->nodelists[node] = l3; 3165 cachep->nodelists[node] = l3;
3133 } 3166 }
3134 return err; 3167 return err;
3135fail: 3168 fail:
3136 err = -ENOMEM; 3169 err = -ENOMEM;
3137 return err; 3170 return err;
3138} 3171}
@@ -3154,18 +3187,19 @@ static void do_ccupdate_local(void *info)
3154 new->new[smp_processor_id()] = old; 3187 new->new[smp_processor_id()] = old;
3155} 3188}
3156 3189
3157
3158static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, 3190static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
3159 int shared) 3191 int shared)
3160{ 3192{
3161 struct ccupdate_struct new; 3193 struct ccupdate_struct new;
3162 int i, err; 3194 int i, err;
3163 3195
3164 memset(&new.new,0,sizeof(new.new)); 3196 memset(&new.new, 0, sizeof(new.new));
3165 for_each_online_cpu(i) { 3197 for_each_online_cpu(i) {
3166 new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount); 3198 new.new[i] =
3199 alloc_arraycache(cpu_to_node(i), limit, batchcount);
3167 if (!new.new[i]) { 3200 if (!new.new[i]) {
3168 for (i--; i >= 0; i--) kfree(new.new[i]); 3201 for (i--; i >= 0; i--)
3202 kfree(new.new[i]);
3169 return -ENOMEM; 3203 return -ENOMEM;
3170 } 3204 }
3171 } 3205 }
@@ -3193,13 +3227,12 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
3193 err = alloc_kmemlist(cachep); 3227 err = alloc_kmemlist(cachep);
3194 if (err) { 3228 if (err) {
3195 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", 3229 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
3196 cachep->name, -err); 3230 cachep->name, -err);
3197 BUG(); 3231 BUG();
3198 } 3232 }
3199 return 0; 3233 return 0;
3200} 3234}
3201 3235
3202
3203static void enable_cpucache(kmem_cache_t *cachep) 3236static void enable_cpucache(kmem_cache_t *cachep)
3204{ 3237{
3205 int err; 3238 int err;
@@ -3246,14 +3279,14 @@ static void enable_cpucache(kmem_cache_t *cachep)
3246 if (limit > 32) 3279 if (limit > 32)
3247 limit = 32; 3280 limit = 32;
3248#endif 3281#endif
3249 err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared); 3282 err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
3250 if (err) 3283 if (err)
3251 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", 3284 printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
3252 cachep->name, -err); 3285 cachep->name, -err);
3253} 3286}
3254 3287
3255static void drain_array_locked(kmem_cache_t *cachep, 3288static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac,
3256 struct array_cache *ac, int force, int node) 3289 int force, int node)
3257{ 3290{
3258 int tofree; 3291 int tofree;
3259 3292
@@ -3261,14 +3294,14 @@ static void drain_array_locked(kmem_cache_t *cachep,
3261 if (ac->touched && !force) { 3294 if (ac->touched && !force) {
3262 ac->touched = 0; 3295 ac->touched = 0;
3263 } else if (ac->avail) { 3296 } else if (ac->avail) {
3264 tofree = force ? ac->avail : (ac->limit+4)/5; 3297 tofree = force ? ac->avail : (ac->limit + 4) / 5;
3265 if (tofree > ac->avail) { 3298 if (tofree > ac->avail) {
3266 tofree = (ac->avail+1)/2; 3299 tofree = (ac->avail + 1) / 2;
3267 } 3300 }
3268 free_block(cachep, ac->entry, tofree, node); 3301 free_block(cachep, ac->entry, tofree, node);
3269 ac->avail -= tofree; 3302 ac->avail -= tofree;
3270 memmove(ac->entry, &(ac->entry[tofree]), 3303 memmove(ac->entry, &(ac->entry[tofree]),
3271 sizeof(void*)*ac->avail); 3304 sizeof(void *) * ac->avail);
3272 } 3305 }
3273} 3306}
3274 3307
@@ -3291,13 +3324,14 @@ static void cache_reap(void *unused)
3291 3324
3292 if (down_trylock(&cache_chain_sem)) { 3325 if (down_trylock(&cache_chain_sem)) {
3293 /* Give up. Setup the next iteration. */ 3326 /* Give up. Setup the next iteration. */
3294 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3327 schedule_delayed_work(&__get_cpu_var(reap_work),
3328 REAPTIMEOUT_CPUC);
3295 return; 3329 return;
3296 } 3330 }
3297 3331
3298 list_for_each(walk, &cache_chain) { 3332 list_for_each(walk, &cache_chain) {
3299 kmem_cache_t *searchp; 3333 kmem_cache_t *searchp;
3300 struct list_head* p; 3334 struct list_head *p;
3301 int tofree; 3335 int tofree;
3302 struct slab *slabp; 3336 struct slab *slabp;
3303 3337
@@ -3314,7 +3348,7 @@ static void cache_reap(void *unused)
3314 spin_lock_irq(&l3->list_lock); 3348 spin_lock_irq(&l3->list_lock);
3315 3349
3316 drain_array_locked(searchp, ac_data(searchp), 0, 3350 drain_array_locked(searchp, ac_data(searchp), 0,
3317 numa_node_id()); 3351 numa_node_id());
3318 3352
3319 if (time_after(l3->next_reap, jiffies)) 3353 if (time_after(l3->next_reap, jiffies))
3320 goto next_unlock; 3354 goto next_unlock;
@@ -3323,14 +3357,16 @@ static void cache_reap(void *unused)
3323 3357
3324 if (l3->shared) 3358 if (l3->shared)
3325 drain_array_locked(searchp, l3->shared, 0, 3359 drain_array_locked(searchp, l3->shared, 0,
3326 numa_node_id()); 3360 numa_node_id());
3327 3361
3328 if (l3->free_touched) { 3362 if (l3->free_touched) {
3329 l3->free_touched = 0; 3363 l3->free_touched = 0;
3330 goto next_unlock; 3364 goto next_unlock;
3331 } 3365 }
3332 3366
3333 tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num); 3367 tofree =
3368 (l3->free_limit + 5 * searchp->num -
3369 1) / (5 * searchp->num);
3334 do { 3370 do {
3335 p = l3->slabs_free.next; 3371 p = l3->slabs_free.next;
3336 if (p == &(l3->slabs_free)) 3372 if (p == &(l3->slabs_free))
@@ -3350,10 +3386,10 @@ static void cache_reap(void *unused)
3350 spin_unlock_irq(&l3->list_lock); 3386 spin_unlock_irq(&l3->list_lock);
3351 slab_destroy(searchp, slabp); 3387 slab_destroy(searchp, slabp);
3352 spin_lock_irq(&l3->list_lock); 3388 spin_lock_irq(&l3->list_lock);
3353 } while(--tofree > 0); 3389 } while (--tofree > 0);
3354next_unlock: 3390 next_unlock:
3355 spin_unlock_irq(&l3->list_lock); 3391 spin_unlock_irq(&l3->list_lock);
3356next: 3392 next:
3357 cond_resched(); 3393 cond_resched();
3358 } 3394 }
3359 check_irq_on(); 3395 check_irq_on();
@@ -3365,32 +3401,37 @@ next:
3365 3401
3366#ifdef CONFIG_PROC_FS 3402#ifdef CONFIG_PROC_FS
3367 3403
3368static void *s_start(struct seq_file *m, loff_t *pos) 3404static void print_slabinfo_header(struct seq_file *m)
3369{ 3405{
3370 loff_t n = *pos; 3406 /*
3371 struct list_head *p; 3407 * Output format version, so at least we can change it
3372 3408 * without _too_ many complaints.
3373 down(&cache_chain_sem); 3409 */
3374 if (!n) {
3375 /*
3376 * Output format version, so at least we can change it
3377 * without _too_ many complaints.
3378 */
3379#if STATS 3410#if STATS
3380 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); 3411 seq_puts(m, "slabinfo - version: 2.1 (statistics)\n");
3381#else 3412#else
3382 seq_puts(m, "slabinfo - version: 2.1\n"); 3413 seq_puts(m, "slabinfo - version: 2.1\n");
3383#endif 3414#endif
3384 seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); 3415 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
3385 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); 3416 "<objperslab> <pagesperslab>");
3386 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 3417 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
3418 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
3387#if STATS 3419#if STATS
3388 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>" 3420 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> "
3389 " <error> <maxfreeable> <nodeallocs> <remotefrees>"); 3421 "<error> <maxfreeable> <nodeallocs> <remotefrees>");
3390 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 3422 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
3391#endif 3423#endif
3392 seq_putc(m, '\n'); 3424 seq_putc(m, '\n');
3393 } 3425}
3426
3427static void *s_start(struct seq_file *m, loff_t *pos)
3428{
3429 loff_t n = *pos;
3430 struct list_head *p;
3431
3432 down(&cache_chain_sem);
3433 if (!n)
3434 print_slabinfo_header(m);
3394 p = cache_chain.next; 3435 p = cache_chain.next;
3395 while (n--) { 3436 while (n--) {
3396 p = p->next; 3437 p = p->next;
@@ -3405,7 +3446,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3405 kmem_cache_t *cachep = p; 3446 kmem_cache_t *cachep = p;
3406 ++*pos; 3447 ++*pos;
3407 return cachep->next.next == &cache_chain ? NULL 3448 return cachep->next.next == &cache_chain ? NULL
3408 : list_entry(cachep->next.next, kmem_cache_t, next); 3449 : list_entry(cachep->next.next, kmem_cache_t, next);
3409} 3450}
3410 3451
3411static void s_stop(struct seq_file *m, void *p) 3452static void s_stop(struct seq_file *m, void *p)
@@ -3417,11 +3458,11 @@ static int s_show(struct seq_file *m, void *p)
3417{ 3458{
3418 kmem_cache_t *cachep = p; 3459 kmem_cache_t *cachep = p;
3419 struct list_head *q; 3460 struct list_head *q;
3420 struct slab *slabp; 3461 struct slab *slabp;
3421 unsigned long active_objs; 3462 unsigned long active_objs;
3422 unsigned long num_objs; 3463 unsigned long num_objs;
3423 unsigned long active_slabs = 0; 3464 unsigned long active_slabs = 0;
3424 unsigned long num_slabs, free_objects = 0, shared_avail = 0; 3465 unsigned long num_slabs, free_objects = 0, shared_avail = 0;
3425 const char *name; 3466 const char *name;
3426 char *error = NULL; 3467 char *error = NULL;
3427 int node; 3468 int node;
@@ -3438,14 +3479,14 @@ static int s_show(struct seq_file *m, void *p)
3438 3479
3439 spin_lock(&l3->list_lock); 3480 spin_lock(&l3->list_lock);
3440 3481
3441 list_for_each(q,&l3->slabs_full) { 3482 list_for_each(q, &l3->slabs_full) {
3442 slabp = list_entry(q, struct slab, list); 3483 slabp = list_entry(q, struct slab, list);
3443 if (slabp->inuse != cachep->num && !error) 3484 if (slabp->inuse != cachep->num && !error)
3444 error = "slabs_full accounting error"; 3485 error = "slabs_full accounting error";
3445 active_objs += cachep->num; 3486 active_objs += cachep->num;
3446 active_slabs++; 3487 active_slabs++;
3447 } 3488 }
3448 list_for_each(q,&l3->slabs_partial) { 3489 list_for_each(q, &l3->slabs_partial) {
3449 slabp = list_entry(q, struct slab, list); 3490 slabp = list_entry(q, struct slab, list);
3450 if (slabp->inuse == cachep->num && !error) 3491 if (slabp->inuse == cachep->num && !error)
3451 error = "slabs_partial inuse accounting error"; 3492 error = "slabs_partial inuse accounting error";
@@ -3454,7 +3495,7 @@ static int s_show(struct seq_file *m, void *p)
3454 active_objs += slabp->inuse; 3495 active_objs += slabp->inuse;
3455 active_slabs++; 3496 active_slabs++;
3456 } 3497 }
3457 list_for_each(q,&l3->slabs_free) { 3498 list_for_each(q, &l3->slabs_free) {
3458 slabp = list_entry(q, struct slab, list); 3499 slabp = list_entry(q, struct slab, list);
3459 if (slabp->inuse && !error) 3500 if (slabp->inuse && !error)
3460 error = "slabs_free/inuse accounting error"; 3501 error = "slabs_free/inuse accounting error";
@@ -3465,25 +3506,24 @@ static int s_show(struct seq_file *m, void *p)
3465 3506
3466 spin_unlock(&l3->list_lock); 3507 spin_unlock(&l3->list_lock);
3467 } 3508 }
3468 num_slabs+=active_slabs; 3509 num_slabs += active_slabs;
3469 num_objs = num_slabs*cachep->num; 3510 num_objs = num_slabs * cachep->num;
3470 if (num_objs - active_objs != free_objects && !error) 3511 if (num_objs - active_objs != free_objects && !error)
3471 error = "free_objects accounting error"; 3512 error = "free_objects accounting error";
3472 3513
3473 name = cachep->name; 3514 name = cachep->name;
3474 if (error) 3515 if (error)
3475 printk(KERN_ERR "slab: cache %s error: %s\n", name, error); 3516 printk(KERN_ERR "slab: cache %s error: %s\n", name, error);
3476 3517
3477 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", 3518 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
3478 name, active_objs, num_objs, cachep->objsize, 3519 name, active_objs, num_objs, cachep->objsize,
3479 cachep->num, (1<<cachep->gfporder)); 3520 cachep->num, (1 << cachep->gfporder));
3480 seq_printf(m, " : tunables %4u %4u %4u", 3521 seq_printf(m, " : tunables %4u %4u %4u",
3481 cachep->limit, cachep->batchcount, 3522 cachep->limit, cachep->batchcount, cachep->shared);
3482 cachep->shared);
3483 seq_printf(m, " : slabdata %6lu %6lu %6lu", 3523 seq_printf(m, " : slabdata %6lu %6lu %6lu",
3484 active_slabs, num_slabs, shared_avail); 3524 active_slabs, num_slabs, shared_avail);
3485#if STATS 3525#if STATS
3486 { /* list3 stats */ 3526 { /* list3 stats */
3487 unsigned long high = cachep->high_mark; 3527 unsigned long high = cachep->high_mark;
3488 unsigned long allocs = cachep->num_allocations; 3528 unsigned long allocs = cachep->num_allocations;
3489 unsigned long grown = cachep->grown; 3529 unsigned long grown = cachep->grown;
@@ -3494,9 +3534,7 @@ static int s_show(struct seq_file *m, void *p)
3494 unsigned long node_frees = cachep->node_frees; 3534 unsigned long node_frees = cachep->node_frees;
3495 3535
3496 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 3536 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
3497 %4lu %4lu %4lu %4lu", 3537 %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees);
3498 allocs, high, grown, reaped, errors,
3499 max_freeable, node_allocs, node_frees);
3500 } 3538 }
3501 /* cpu stats */ 3539 /* cpu stats */
3502 { 3540 {
@@ -3506,7 +3544,7 @@ static int s_show(struct seq_file *m, void *p)
3506 unsigned long freemiss = atomic_read(&cachep->freemiss); 3544 unsigned long freemiss = atomic_read(&cachep->freemiss);
3507 3545
3508 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", 3546 seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu",
3509 allochit, allocmiss, freehit, freemiss); 3547 allochit, allocmiss, freehit, freemiss);
3510 } 3548 }
3511#endif 3549#endif
3512 seq_putc(m, '\n'); 3550 seq_putc(m, '\n');
@@ -3529,10 +3567,10 @@ static int s_show(struct seq_file *m, void *p)
3529 */ 3567 */
3530 3568
3531struct seq_operations slabinfo_op = { 3569struct seq_operations slabinfo_op = {
3532 .start = s_start, 3570 .start = s_start,
3533 .next = s_next, 3571 .next = s_next,
3534 .stop = s_stop, 3572 .stop = s_stop,
3535 .show = s_show, 3573 .show = s_show,
3536}; 3574};
3537 3575
3538#define MAX_SLABINFO_WRITE 128 3576#define MAX_SLABINFO_WRITE 128
@@ -3543,18 +3581,18 @@ struct seq_operations slabinfo_op = {
3543 * @count: data length 3581 * @count: data length
3544 * @ppos: unused 3582 * @ppos: unused
3545 */ 3583 */
3546ssize_t slabinfo_write(struct file *file, const char __user *buffer, 3584ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3547 size_t count, loff_t *ppos) 3585 size_t count, loff_t *ppos)
3548{ 3586{
3549 char kbuf[MAX_SLABINFO_WRITE+1], *tmp; 3587 char kbuf[MAX_SLABINFO_WRITE + 1], *tmp;
3550 int limit, batchcount, shared, res; 3588 int limit, batchcount, shared, res;
3551 struct list_head *p; 3589 struct list_head *p;
3552 3590
3553 if (count > MAX_SLABINFO_WRITE) 3591 if (count > MAX_SLABINFO_WRITE)
3554 return -EINVAL; 3592 return -EINVAL;
3555 if (copy_from_user(&kbuf, buffer, count)) 3593 if (copy_from_user(&kbuf, buffer, count))
3556 return -EFAULT; 3594 return -EFAULT;
3557 kbuf[MAX_SLABINFO_WRITE] = '\0'; 3595 kbuf[MAX_SLABINFO_WRITE] = '\0';
3558 3596
3559 tmp = strchr(kbuf, ' '); 3597 tmp = strchr(kbuf, ' ');
3560 if (!tmp) 3598 if (!tmp)
@@ -3567,18 +3605,17 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
3567 /* Find the cache in the chain of caches. */ 3605 /* Find the cache in the chain of caches. */
3568 down(&cache_chain_sem); 3606 down(&cache_chain_sem);
3569 res = -EINVAL; 3607 res = -EINVAL;
3570 list_for_each(p,&cache_chain) { 3608 list_for_each(p, &cache_chain) {
3571 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); 3609 kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next);
3572 3610
3573 if (!strcmp(cachep->name, kbuf)) { 3611 if (!strcmp(cachep->name, kbuf)) {
3574 if (limit < 1 || 3612 if (limit < 1 ||
3575 batchcount < 1 || 3613 batchcount < 1 ||
3576 batchcount > limit || 3614 batchcount > limit || shared < 0) {
3577 shared < 0) {
3578 res = 0; 3615 res = 0;
3579 } else { 3616 } else {
3580 res = do_tune_cpucache(cachep, limit, 3617 res = do_tune_cpucache(cachep, limit,
3581 batchcount, shared); 3618 batchcount, shared);
3582 } 3619 }
3583 break; 3620 break;
3584 } 3621 }
@@ -3609,26 +3646,3 @@ unsigned int ksize(const void *objp)
3609 3646
3610 return obj_reallen(page_get_cache(virt_to_page(objp))); 3647 return obj_reallen(page_get_cache(virt_to_page(objp)));
3611} 3648}
3612
3613
3614/*
3615 * kstrdup - allocate space for and copy an existing string
3616 *
3617 * @s: the string to duplicate
3618 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
3619 */
3620char *kstrdup(const char *s, gfp_t gfp)
3621{
3622 size_t len;
3623 char *buf;
3624
3625 if (!s)
3626 return NULL;
3627
3628 len = strlen(s) + 1;
3629 buf = kmalloc(len, gfp);
3630 if (buf)
3631 memcpy(buf, s, len);
3632 return buf;
3633}
3634EXPORT_SYMBOL(kstrdup);
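
The slabinfo_write() hunk above splits a "cache-name limit batchcount shared" line at the first space, reads the three integers, and only calls do_tune_cpucache() when they pass its range checks. A minimal userspace sketch of that parsing, assuming nothing beyond standard C (parse_slabinfo_line and struct tunables are illustrative names, not kernel symbols):

#include <stdio.h>
#include <string.h>

struct tunables {
	char name[64];
	int limit, batchcount, shared;
};

/*
 * Parse a "cache-name limit batchcount shared" line the way
 * slabinfo_write() does: split at the first space, then sscanf the
 * three integers.  Returns 0 on success, -1 on malformed input or
 * out-of-range tunables.
 */
static int parse_slabinfo_line(const char *line, struct tunables *t)
{
	char buf[128];
	char *tmp;

	if (strlen(line) >= sizeof(buf))
		return -1;
	strcpy(buf, line);

	tmp = strchr(buf, ' ');		/* split name from the numbers */
	if (!tmp)
		return -1;
	*tmp = '\0';

	if (sscanf(tmp + 1, "%d %d %d",
		   &t->limit, &t->batchcount, &t->shared) != 3)
		return -1;

	/* The same range checks that gate the call to do_tune_cpucache(). */
	if (t->limit < 1 || t->batchcount < 1 ||
	    t->batchcount > t->limit || t->shared < 0)
		return -1;

	strncpy(t->name, buf, sizeof(t->name) - 1);
	t->name[sizeof(t->name) - 1] = '\0';
	return 0;
}

int main(void)
{
	struct tunables t;

	if (parse_slabinfo_line("dentry_cache 120 60 8", &t) == 0)
		printf("%s: limit=%d batchcount=%d shared=%d\n",
		       t.name, t.limit, t.batchcount, t.shared);
	return 0;
}

Note that the kernel silently ignores out-of-range tunables (it reports success without tuning); the sketch simply rejects them.
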
diff --git a/mm/slob.c b/mm/slob.c
new file mode 100644
index 000000000000..1c240c4b71d9
--- /dev/null
+++ b/mm/slob.c
@@ -0,0 +1,385 @@
1/*
2 * SLOB Allocator: Simple List Of Blocks
3 *
4 * Matt Mackall <mpm@selenic.com> 12/30/03
5 *
6 * How SLOB works:
7 *
8 * The core of SLOB is a traditional K&R style heap allocator, with
9 * support for returning aligned objects. The granularity of this
10 * allocator is 8 bytes on x86, though it's perhaps possible to reduce
11 * this to 4 if it's deemed worth the effort. The slob heap is a
12 * singly-linked list of pages from __get_free_page, grown on demand
13 * and allocation from the heap is currently first-fit.
14 *
15 * Above this is an implementation of kmalloc/kfree. Blocks returned
16 * from kmalloc are 8-byte aligned and prepended with an 8-byte header.
17 * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
18 * __get_free_pages directly so that it can return page-aligned blocks
19 * and keeps a linked list of such pages and their orders. These
20 * objects are detected in kfree() by their page alignment.
21 *
22 * SLAB is emulated on top of SLOB by simply calling constructors and
23 * destructors for every SLAB allocation. Objects are returned with
24 * the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is
25 * set, in which case the low-level allocator will fragment blocks to
26 * create the proper alignment. Again, objects of page-size or greater
27 * are allocated by calling __get_free_pages. As SLAB objects know
28 * their size, no separate size bookkeeping is necessary and there is
29 * essentially no allocation space overhead.
30 */
31
32#include <linux/config.h>
33#include <linux/slab.h>
34#include <linux/mm.h>
35#include <linux/cache.h>
36#include <linux/init.h>
37#include <linux/module.h>
38#include <linux/timer.h>
39
40struct slob_block {
41 int units;
42 struct slob_block *next;
43};
44typedef struct slob_block slob_t;
45
46#define SLOB_UNIT sizeof(slob_t)
47#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT)
48#define SLOB_ALIGN L1_CACHE_BYTES
49
50struct bigblock {
51 int order;
52 void *pages;
53 struct bigblock *next;
54};
55typedef struct bigblock bigblock_t;
56
57static slob_t arena = { .next = &arena, .units = 1 };
58static slob_t *slobfree = &arena;
59static bigblock_t *bigblocks;
60static DEFINE_SPINLOCK(slob_lock);
61static DEFINE_SPINLOCK(block_lock);
62
63static void slob_free(void *b, int size);
64
65static void *slob_alloc(size_t size, gfp_t gfp, int align)
66{
67 slob_t *prev, *cur, *aligned = 0;
68 int delta = 0, units = SLOB_UNITS(size);
69 unsigned long flags;
70
71 spin_lock_irqsave(&slob_lock, flags);
72 prev = slobfree;
73 for (cur = prev->next; ; prev = cur, cur = cur->next) {
74 if (align) {
75 aligned = (slob_t *)ALIGN((unsigned long)cur, align);
76 delta = aligned - cur;
77 }
78 if (cur->units >= units + delta) { /* room enough? */
79 if (delta) { /* need to fragment head to align? */
80 aligned->units = cur->units - delta;
81 aligned->next = cur->next;
82 cur->next = aligned;
83 cur->units = delta;
84 prev = cur;
85 cur = aligned;
86 }
87
88 if (cur->units == units) /* exact fit? */
89 prev->next = cur->next; /* unlink */
90 else { /* fragment */
91 prev->next = cur + units;
92 prev->next->units = cur->units - units;
93 prev->next->next = cur->next;
94 cur->units = units;
95 }
96
97 slobfree = prev;
98 spin_unlock_irqrestore(&slob_lock, flags);
99 return cur;
100 }
101 if (cur == slobfree) {
102 spin_unlock_irqrestore(&slob_lock, flags);
103
104 if (size == PAGE_SIZE) /* trying to shrink arena? */
105 return 0;
106
107 cur = (slob_t *)__get_free_page(gfp);
108 if (!cur)
109 return 0;
110
111 slob_free(cur, PAGE_SIZE);
112 spin_lock_irqsave(&slob_lock, flags);
113 cur = slobfree;
114 }
115 }
116}
117
118static void slob_free(void *block, int size)
119{
120 slob_t *cur, *b = (slob_t *)block;
121 unsigned long flags;
122
123 if (!block)
124 return;
125
126 if (size)
127 b->units = SLOB_UNITS(size);
128
129 /* Find reinsertion point */
130 spin_lock_irqsave(&slob_lock, flags);
131 for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next)
132 if (cur >= cur->next && (b > cur || b < cur->next))
133 break;
134
135 if (b + b->units == cur->next) {
136 b->units += cur->next->units;
137 b->next = cur->next->next;
138 } else
139 b->next = cur->next;
140
141 if (cur + cur->units == b) {
142 cur->units += b->units;
143 cur->next = b->next;
144 } else
145 cur->next = b;
146
147 slobfree = cur;
148
149 spin_unlock_irqrestore(&slob_lock, flags);
150}
151
152static int FASTCALL(find_order(int size));
153static int fastcall find_order(int size)
154{
155 int order = 0;
156 for ( ; size > 4096 ; size >>=1)
157 order++;
158 return order;
159}
160
161void *kmalloc(size_t size, gfp_t gfp)
162{
163 slob_t *m;
164 bigblock_t *bb;
165 unsigned long flags;
166
167 if (size < PAGE_SIZE - SLOB_UNIT) {
168 m = slob_alloc(size + SLOB_UNIT, gfp, 0);
169 return m ? (void *)(m + 1) : 0;
170 }
171
172 bb = slob_alloc(sizeof(bigblock_t), gfp, 0);
173 if (!bb)
174 return 0;
175
176 bb->order = find_order(size);
177 bb->pages = (void *)__get_free_pages(gfp, bb->order);
178
179 if (bb->pages) {
180 spin_lock_irqsave(&block_lock, flags);
181 bb->next = bigblocks;
182 bigblocks = bb;
183 spin_unlock_irqrestore(&block_lock, flags);
184 return bb->pages;
185 }
186
187 slob_free(bb, sizeof(bigblock_t));
188 return 0;
189}
190
191EXPORT_SYMBOL(kmalloc);
192
193void kfree(const void *block)
194{
195 bigblock_t *bb, **last = &bigblocks;
196 unsigned long flags;
197
198 if (!block)
199 return;
200
201 if (!((unsigned long)block & (PAGE_SIZE-1))) {
202 /* might be on the big block list */
203 spin_lock_irqsave(&block_lock, flags);
204 for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) {
205 if (bb->pages == block) {
206 *last = bb->next;
207 spin_unlock_irqrestore(&block_lock, flags);
208 free_pages((unsigned long)block, bb->order);
209 slob_free(bb, sizeof(bigblock_t));
210 return;
211 }
212 }
213 spin_unlock_irqrestore(&block_lock, flags);
214 }
215
216 slob_free((slob_t *)block - 1, 0);
217 return;
218}
219
220EXPORT_SYMBOL(kfree);
221
222unsigned int ksize(const void *block)
223{
224 bigblock_t *bb;
225 unsigned long flags;
226
227 if (!block)
228 return 0;
229
230 if (!((unsigned long)block & (PAGE_SIZE-1))) {
231 spin_lock_irqsave(&block_lock, flags);
232 for (bb = bigblocks; bb; bb = bb->next)
233 if (bb->pages == block) {
234 spin_unlock_irqrestore(&slob_lock, flags);
235 return PAGE_SIZE << bb->order;
236 }
237 spin_unlock_irqrestore(&block_lock, flags);
238 }
239
240 return ((slob_t *)block - 1)->units * SLOB_UNIT;
241}
242
243struct kmem_cache {
244 unsigned int size, align;
245 const char *name;
246 void (*ctor)(void *, struct kmem_cache *, unsigned long);
247 void (*dtor)(void *, struct kmem_cache *, unsigned long);
248};
249
250struct kmem_cache *kmem_cache_create(const char *name, size_t size,
251 size_t align, unsigned long flags,
252 void (*ctor)(void*, struct kmem_cache *, unsigned long),
253 void (*dtor)(void*, struct kmem_cache *, unsigned long))
254{
255 struct kmem_cache *c;
256
257 c = slob_alloc(sizeof(struct kmem_cache), flags, 0);
258
259 if (c) {
260 c->name = name;
261 c->size = size;
262 c->ctor = ctor;
263 c->dtor = dtor;
264 /* ignore alignment unless it's forced */
265 c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ? SLOB_ALIGN : 0;
266 if (c->align < align)
267 c->align = align;
268 }
269
270 return c;
271}
272EXPORT_SYMBOL(kmem_cache_create);
273
274int kmem_cache_destroy(struct kmem_cache *c)
275{
276 slob_free(c, sizeof(struct kmem_cache));
277 return 0;
278}
279EXPORT_SYMBOL(kmem_cache_destroy);
280
281void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
282{
283 void *b;
284
285 if (c->size < PAGE_SIZE)
286 b = slob_alloc(c->size, flags, c->align);
287 else
288 b = (void *)__get_free_pages(flags, find_order(c->size));
289
290 if (c->ctor)
291 c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR);
292
293 return b;
294}
295EXPORT_SYMBOL(kmem_cache_alloc);
296
297void kmem_cache_free(struct kmem_cache *c, void *b)
298{
299 if (c->dtor)
300 c->dtor(b, c, 0);
301
302 if (c->size < PAGE_SIZE)
303 slob_free(b, c->size);
304 else
305 free_pages((unsigned long)b, find_order(c->size));
306}
307EXPORT_SYMBOL(kmem_cache_free);
308
309unsigned int kmem_cache_size(struct kmem_cache *c)
310{
311 return c->size;
312}
313EXPORT_SYMBOL(kmem_cache_size);
314
315const char *kmem_cache_name(struct kmem_cache *c)
316{
317 return c->name;
318}
319EXPORT_SYMBOL(kmem_cache_name);
320
321static struct timer_list slob_timer = TIMER_INITIALIZER(
322 (void (*)(unsigned long))kmem_cache_init, 0, 0);
323
324void kmem_cache_init(void)
325{
326 void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1);
327
328 if (p)
329 free_page((unsigned long)p);
330
331 mod_timer(&slob_timer, jiffies + HZ);
332}
333
334atomic_t slab_reclaim_pages = ATOMIC_INIT(0);
335EXPORT_SYMBOL(slab_reclaim_pages);
336
337#ifdef CONFIG_SMP
338
339void *__alloc_percpu(size_t size, size_t align)
340{
341 int i;
342 struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
343
344 if (!pdata)
345 return NULL;
346
347 for (i = 0; i < NR_CPUS; i++) {
348 if (!cpu_possible(i))
349 continue;
350 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
351 if (!pdata->ptrs[i])
352 goto unwind_oom;
353 memset(pdata->ptrs[i], 0, size);
354 }
355
356 /* Catch derefs w/o wrappers */
357 return (void *) (~(unsigned long) pdata);
358
359unwind_oom:
360 while (--i >= 0) {
361 if (!cpu_possible(i))
362 continue;
363 kfree(pdata->ptrs[i]);
364 }
365 kfree(pdata);
366 return NULL;
367}
368EXPORT_SYMBOL(__alloc_percpu);
369
370void
371free_percpu(const void *objp)
372{
373 int i;
374 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
375
376 for (i = 0; i < NR_CPUS; i++) {
377 if (!cpu_possible(i))
378 continue;
379 kfree(p->ptrs[i]);
380 }
381 kfree(p);
382}
383EXPORT_SYMBOL(free_percpu);
384
385#endif
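
The header comment of the new allocator describes a K&R-style first-fit heap with 8-byte granularity. As a rough userspace illustration of the two cases slob_alloc() handles on a candidate free block (exact fit: unlink it; otherwise: split it and keep the tail on the free list), here is a toy allocator over a static arena. It uses a NULL-terminated list instead of SLOB's circular one, never grows, and omits the free/coalesce path, so it is a sketch of the allocation side only, not the kernel code.

#include <stdio.h>
#include <stddef.h>

struct block {
	size_t units;		/* size of this free block, in units */
	struct block *next;	/* next free block */
};

#define UNIT		sizeof(struct block)
#define UNITS(size)	(((size) + UNIT - 1) / UNIT)	/* round up, like SLOB_UNITS */
#define ARENA_UNITS	1024

static struct block arena[ARENA_UNITS];
static struct block freelist = { 0, NULL };	/* dummy list head */

static void heap_init(void)
{
	arena[0].units = ARENA_UNITS;
	arena[0].next = NULL;
	freelist.next = &arena[0];
}

/* First-fit allocation: walk the free list, split or unlink a block. */
static void *toy_alloc(size_t size)
{
	size_t units = UNITS(size) + 1;		/* +1 unit for the header */
	struct block *prev = &freelist, *cur;

	for (cur = prev->next; cur; prev = cur, cur = cur->next) {
		if (cur->units < units)
			continue;
		if (cur->units == units) {	/* exact fit: unlink */
			prev->next = cur->next;
		} else {			/* fragment: keep the tail free */
			struct block *tail = cur + units;
			tail->units = cur->units - units;
			tail->next = cur->next;
			prev->next = tail;
			cur->units = units;
		}
		return cur + 1;			/* payload follows the header */
	}
	return NULL;				/* no block big enough */
}

int main(void)
{
	heap_init();
	void *a = toy_alloc(100);
	void *b = toy_alloc(200);
	printf("a=%p b=%p (each prefixed by a %zu-byte header)\n", a, b, UNIT);
	return 0;
}

slob_free() completes the picture by merging a returned block with adjacent free neighbours, which is what keeps the arena from decaying into single units.
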
diff --git a/mm/sparse.c b/mm/sparse.c
index 72079b538e2d..0a51f36ba3a1 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -18,10 +18,10 @@
18 */ 18 */
19#ifdef CONFIG_SPARSEMEM_EXTREME 19#ifdef CONFIG_SPARSEMEM_EXTREME
20struct mem_section *mem_section[NR_SECTION_ROOTS] 20struct mem_section *mem_section[NR_SECTION_ROOTS]
21 ____cacheline_maxaligned_in_smp; 21 ____cacheline_internodealigned_in_smp;
22#else 22#else
23struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] 23struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
24 ____cacheline_maxaligned_in_smp; 24 ____cacheline_internodealigned_in_smp;
25#endif 25#endif
26EXPORT_SYMBOL(mem_section); 26EXPORT_SYMBOL(mem_section);
27 27
diff --git a/mm/swap.c b/mm/swap.c
index 73d351439ef6..cbb48e721ab9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -156,16 +156,22 @@ void fastcall lru_cache_add_active(struct page *page)
156 put_cpu_var(lru_add_active_pvecs); 156 put_cpu_var(lru_add_active_pvecs);
157} 157}
158 158
159void lru_add_drain(void) 159static void __lru_add_drain(int cpu)
160{ 160{
161 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); 161 struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
162 162
163 /* CPU is dead, so no locking needed. */
163 if (pagevec_count(pvec)) 164 if (pagevec_count(pvec))
164 __pagevec_lru_add(pvec); 165 __pagevec_lru_add(pvec);
165 pvec = &__get_cpu_var(lru_add_active_pvecs); 166 pvec = &per_cpu(lru_add_active_pvecs, cpu);
166 if (pagevec_count(pvec)) 167 if (pagevec_count(pvec))
167 __pagevec_lru_add_active(pvec); 168 __pagevec_lru_add_active(pvec);
168 put_cpu_var(lru_add_pvecs); 169}
170
171void lru_add_drain(void)
172{
173 __lru_add_drain(get_cpu());
174 put_cpu();
169} 175}
170 176
171/* 177/*
@@ -378,6 +384,8 @@ unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
378 return pagevec_count(pvec); 384 return pagevec_count(pvec);
379} 385}
380 386
387EXPORT_SYMBOL(pagevec_lookup);
388
381unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, 389unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
382 pgoff_t *index, int tag, unsigned nr_pages) 390 pgoff_t *index, int tag, unsigned nr_pages)
383{ 391{
@@ -412,17 +420,6 @@ void vm_acct_memory(long pages)
412} 420}
413 421
414#ifdef CONFIG_HOTPLUG_CPU 422#ifdef CONFIG_HOTPLUG_CPU
415static void lru_drain_cache(unsigned int cpu)
416{
417 struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
418
419 /* CPU is dead, so no locking needed. */
420 if (pagevec_count(pvec))
421 __pagevec_lru_add(pvec);
422 pvec = &per_cpu(lru_add_active_pvecs, cpu);
423 if (pagevec_count(pvec))
424 __pagevec_lru_add_active(pvec);
425}
426 423
427/* Drop the CPU's cached committed space back into the central pool. */ 424/* Drop the CPU's cached committed space back into the central pool. */
428static int cpu_swap_callback(struct notifier_block *nfb, 425static int cpu_swap_callback(struct notifier_block *nfb,
@@ -435,7 +432,7 @@ static int cpu_swap_callback(struct notifier_block *nfb,
435 if (action == CPU_DEAD) { 432 if (action == CPU_DEAD) {
436 atomic_add(*committed, &vm_committed_space); 433 atomic_add(*committed, &vm_committed_space);
437 *committed = 0; 434 *committed = 0;
438 lru_drain_cache((long)hcpu); 435 __lru_add_drain((long)hcpu);
439 } 436 }
440 return NOTIFY_OK; 437 return NOTIFY_OK;
441} 438}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0df9a57b1de8..7b09ac503fec 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,6 +14,7 @@
14#include <linux/pagemap.h> 14#include <linux/pagemap.h>
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/pagevec.h>
17 18
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
19 20
@@ -140,7 +141,7 @@ void __delete_from_swap_cache(struct page *page)
140 * Allocate swap space for the page and add the page to the 141 * Allocate swap space for the page and add the page to the
141 * swap cache. Caller needs to hold the page lock. 142 * swap cache. Caller needs to hold the page lock.
142 */ 143 */
143int add_to_swap(struct page * page) 144int add_to_swap(struct page * page, gfp_t gfp_mask)
144{ 145{
145 swp_entry_t entry; 146 swp_entry_t entry;
146 int err; 147 int err;
@@ -165,7 +166,7 @@ int add_to_swap(struct page * page)
165 * Add it to the swap cache and mark it dirty 166 * Add it to the swap cache and mark it dirty
166 */ 167 */
167 err = __add_to_swap_cache(page, entry, 168 err = __add_to_swap_cache(page, entry,
168 GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN); 169 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN);
169 170
170 switch (err) { 171 switch (err) {
171 case 0: /* Success */ 172 case 0: /* Success */
@@ -272,12 +273,11 @@ void free_page_and_swap_cache(struct page *page)
272 */ 273 */
273void free_pages_and_swap_cache(struct page **pages, int nr) 274void free_pages_and_swap_cache(struct page **pages, int nr)
274{ 275{
275 int chunk = 16;
276 struct page **pagep = pages; 276 struct page **pagep = pages;
277 277
278 lru_add_drain(); 278 lru_add_drain();
279 while (nr) { 279 while (nr) {
280 int todo = min(chunk, nr); 280 int todo = min(nr, PAGEVEC_SIZE);
281 int i; 281 int i;
282 282
283 for (i = 0; i < todo; i++) 283 for (i = 0; i < todo; i++)
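
The free_pages_and_swap_cache() change above only replaces the hard-coded batch of 16 with PAGEVEC_SIZE; the loop itself is the usual fixed-size batching pattern, sketched standalone below (release_batch() stands in for release_pages(), and PAGEVEC_SIZE is assumed to be 14, the pagevec capacity in this kernel):

#include <stdio.h>

#define PAGEVEC_SIZE 14
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* Stand-in for release_pages(): just report the batch. */
static void release_batch(int *items, int count)
{
	printf("releasing %d items starting at %d\n", count, items[0]);
}

static void release_all(int *items, int nr)
{
	int *p = items;

	while (nr) {
		int todo = MIN(nr, PAGEVEC_SIZE);

		release_batch(p, todo);
		p += todo;
		nr -= todo;
	}
}

int main(void)
{
	int items[40];
	int i;

	for (i = 0; i < 40; i++)
		items[i] = i;
	release_all(items, 40);		/* batches of 14 + 14 + 12 */
	return 0;
}
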
diff --git a/mm/swapfile.c b/mm/swapfile.c
index edafeace301f..957fef43fa60 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -25,6 +25,7 @@
25#include <linux/rmap.h> 25#include <linux/rmap.h>
26#include <linux/security.h> 26#include <linux/security.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/capability.h>
28#include <linux/syscalls.h> 29#include <linux/syscalls.h>
29 30
30#include <asm/pgtable.h> 31#include <asm/pgtable.h>
@@ -211,6 +212,26 @@ noswap:
211 return (swp_entry_t) {0}; 212 return (swp_entry_t) {0};
212} 213}
213 214
215swp_entry_t get_swap_page_of_type(int type)
216{
217 struct swap_info_struct *si;
218 pgoff_t offset;
219
220 spin_lock(&swap_lock);
221 si = swap_info + type;
222 if (si->flags & SWP_WRITEOK) {
223 nr_swap_pages--;
224 offset = scan_swap_map(si);
225 if (offset) {
226 spin_unlock(&swap_lock);
227 return swp_entry(type, offset);
228 }
229 nr_swap_pages++;
230 }
231 spin_unlock(&swap_lock);
232 return (swp_entry_t) {0};
233}
234
214static struct swap_info_struct * swap_info_get(swp_entry_t entry) 235static struct swap_info_struct * swap_info_get(swp_entry_t entry)
215{ 236{
216 struct swap_info_struct * p; 237 struct swap_info_struct * p;
@@ -1167,9 +1188,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1167 set_blocksize(bdev, p->old_block_size); 1188 set_blocksize(bdev, p->old_block_size);
1168 bd_release(bdev); 1189 bd_release(bdev);
1169 } else { 1190 } else {
1170 down(&inode->i_sem); 1191 mutex_lock(&inode->i_mutex);
1171 inode->i_flags &= ~S_SWAPFILE; 1192 inode->i_flags &= ~S_SWAPFILE;
1172 up(&inode->i_sem); 1193 mutex_unlock(&inode->i_mutex);
1173 } 1194 }
1174 filp_close(swap_file, NULL); 1195 filp_close(swap_file, NULL);
1175 err = 0; 1196 err = 0;
@@ -1386,7 +1407,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1386 p->bdev = bdev; 1407 p->bdev = bdev;
1387 } else if (S_ISREG(inode->i_mode)) { 1408 } else if (S_ISREG(inode->i_mode)) {
1388 p->bdev = inode->i_sb->s_bdev; 1409 p->bdev = inode->i_sb->s_bdev;
1389 down(&inode->i_sem); 1410 mutex_lock(&inode->i_mutex);
1390 did_down = 1; 1411 did_down = 1;
1391 if (IS_SWAPFILE(inode)) { 1412 if (IS_SWAPFILE(inode)) {
1392 error = -EBUSY; 1413 error = -EBUSY;
@@ -1422,7 +1443,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1422 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) 1443 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
1423 swap_header_version = 2; 1444 swap_header_version = 2;
1424 else { 1445 else {
1425 printk("Unable to find swap-space signature\n"); 1446 printk(KERN_ERR "Unable to find swap-space signature\n");
1426 error = -EINVAL; 1447 error = -EINVAL;
1427 goto bad_swap; 1448 goto bad_swap;
1428 } 1449 }
@@ -1473,7 +1494,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1473 goto bad_swap; 1494 goto bad_swap;
1474 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1495 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1475 goto bad_swap; 1496 goto bad_swap;
1476 1497
1477 /* OK, set up the swap map and apply the bad block list */ 1498 /* OK, set up the swap map and apply the bad block list */
1478 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { 1499 if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) {
1479 error = -ENOMEM; 1500 error = -ENOMEM;
@@ -1482,17 +1503,17 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1482 1503
1483 error = 0; 1504 error = 0;
1484 memset(p->swap_map, 0, maxpages * sizeof(short)); 1505 memset(p->swap_map, 0, maxpages * sizeof(short));
1485 for (i=0; i<swap_header->info.nr_badpages; i++) { 1506 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1486 int page = swap_header->info.badpages[i]; 1507 int page_nr = swap_header->info.badpages[i];
1487 if (page <= 0 || page >= swap_header->info.last_page) 1508 if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
1488 error = -EINVAL; 1509 error = -EINVAL;
1489 else 1510 else
1490 p->swap_map[page] = SWAP_MAP_BAD; 1511 p->swap_map[page_nr] = SWAP_MAP_BAD;
1491 } 1512 }
1492 nr_good_pages = swap_header->info.last_page - 1513 nr_good_pages = swap_header->info.last_page -
1493 swap_header->info.nr_badpages - 1514 swap_header->info.nr_badpages -
1494 1 /* header page */; 1515 1 /* header page */;
1495 if (error) 1516 if (error)
1496 goto bad_swap; 1517 goto bad_swap;
1497 } 1518 }
1498 1519
@@ -1576,7 +1597,7 @@ out:
1576 if (did_down) { 1597 if (did_down) {
1577 if (!error) 1598 if (!error)
1578 inode->i_flags |= S_SWAPFILE; 1599 inode->i_flags |= S_SWAPFILE;
1579 up(&inode->i_sem); 1600 mutex_unlock(&inode->i_mutex);
1580 } 1601 }
1581 return error; 1602 return error;
1582} 1603}
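
In the sys_swapon() hunk above the loop variable becomes page_nr but the arithmetic is unchanged: every listed bad page must fall strictly between the header page and last_page, and the usable count is everything minus the bad pages and the header. A standalone check of that arithmetic (struct swap_hdr_info and setup_swap_map are illustrative names mirroring only the fields used here):

#include <stdio.h>

#define SWAP_MAP_BAD 0x8000

/* Illustrative mirror of the swap header fields sys_swapon() reads. */
struct swap_hdr_info {
	int last_page;		/* pages in the swap area; valid offsets are 1..last_page-1 */
	int nr_badpages;
	int badpages[8];
};

/*
 * Mark the listed bad pages and return the number of usable pages,
 * or -1 if a bad-page index is out of range (the kernel sets -EINVAL).
 */
static long setup_swap_map(const struct swap_hdr_info *info,
			   unsigned short *swap_map)
{
	int i;

	for (i = 0; i < info->nr_badpages; i++) {
		int page_nr = info->badpages[i];

		/* page 0 holds the header and is never a data page */
		if (page_nr <= 0 || page_nr >= info->last_page)
			return -1;
		swap_map[page_nr] = SWAP_MAP_BAD;
	}

	return (long)info->last_page - info->nr_badpages - 1 /* header page */;
}

int main(void)
{
	struct swap_hdr_info info = {
		.last_page = 1024, .nr_badpages = 2, .badpages = { 17, 513 },
	};
	static unsigned short swap_map[1024];

	printf("good pages: %ld\n", setup_swap_map(&info, swap_map));	/* 1021 */
	return 0;
}
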
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index b58abcf44ed6..f9d6a9cc91c4 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -81,13 +81,19 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
81 goto close_file; 81 goto close_file;
82 82
83 d_instantiate(dentry, inode); 83 d_instantiate(dentry, inode);
84 inode->i_size = size;
85 inode->i_nlink = 0; /* It is unlinked */ 84 inode->i_nlink = 0; /* It is unlinked */
85
86 file->f_vfsmnt = mntget(shm_mnt); 86 file->f_vfsmnt = mntget(shm_mnt);
87 file->f_dentry = dentry; 87 file->f_dentry = dentry;
88 file->f_mapping = inode->i_mapping; 88 file->f_mapping = inode->i_mapping;
89 file->f_op = &ramfs_file_operations; 89 file->f_op = &ramfs_file_operations;
90 file->f_mode = FMODE_WRITE | FMODE_READ; 90 file->f_mode = FMODE_WRITE | FMODE_READ;
91
92 /* notify everyone as to the change of file size */
93 error = do_truncate(dentry, size, 0, file);
94 if (error < 0)
95 goto close_file;
96
91 return file; 97 return file;
92 98
93close_file: 99close_file:
@@ -123,3 +129,24 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
123{ 129{
124 return 0; 130 return 0;
125} 131}
132
133int shmem_mmap(struct file *file, struct vm_area_struct *vma)
134{
135 file_accessed(file);
136#ifndef CONFIG_MMU
137 return ramfs_nommu_mmap(file, vma);
138#else
139 return 0;
140#endif
141}
142
143#ifndef CONFIG_MMU
144unsigned long shmem_get_unmapped_area(struct file *file,
145 unsigned long addr,
146 unsigned long len,
147 unsigned long pgoff,
148 unsigned long flags)
149{
150 return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
151}
152#endif
diff --git a/mm/truncate.c b/mm/truncate.c
index 9173ab500604..6cb3fff25f67 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -82,12 +82,15 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
82} 82}
83 83
84/** 84/**
85 * truncate_inode_pages - truncate *all* the pages from an offset 85 * truncate_inode_pages_range - truncate range of pages specified by start and
86 * end byte offsets
86 * @mapping: mapping to truncate 87 * @mapping: mapping to truncate
87 * @lstart: offset from which to truncate 88 * @lstart: offset from which to truncate
89 * @lend: offset to which to truncate
88 * 90 *
89 * Truncate the page cache at a set offset, removing the pages that are beyond 91 * Truncate the page cache, removing the pages that are between
90 * that offset (and zeroing out partial pages). 92 * specified offsets (and zeroing out partial page
93 * (if lstart is not page aligned)).
91 * 94 *
92 * Truncate takes two passes - the first pass is nonblocking. It will not 95 * Truncate takes two passes - the first pass is nonblocking. It will not
93 * block on page locks and it will not block on writeback. The second pass 96 * block on page locks and it will not block on writeback. The second pass
@@ -101,12 +104,12 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
101 * We pass down the cache-hot hint to the page freeing code. Even if the 104 * We pass down the cache-hot hint to the page freeing code. Even if the
102 * mapping is large, it is probably the case that the final pages are the most 105 * mapping is large, it is probably the case that the final pages are the most
103 * recently touched, and freeing happens in ascending file offset order. 106 * recently touched, and freeing happens in ascending file offset order.
104 *
105 * Called under (and serialised by) inode->i_sem.
106 */ 107 */
107void truncate_inode_pages(struct address_space *mapping, loff_t lstart) 108void truncate_inode_pages_range(struct address_space *mapping,
109 loff_t lstart, loff_t lend)
108{ 110{
109 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; 111 const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
112 pgoff_t end;
110 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); 113 const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
111 struct pagevec pvec; 114 struct pagevec pvec;
112 pgoff_t next; 115 pgoff_t next;
@@ -115,13 +118,22 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
115 if (mapping->nrpages == 0) 118 if (mapping->nrpages == 0)
116 return; 119 return;
117 120
121 BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
122 end = (lend >> PAGE_CACHE_SHIFT);
123
118 pagevec_init(&pvec, 0); 124 pagevec_init(&pvec, 0);
119 next = start; 125 next = start;
120 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { 126 while (next <= end &&
127 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
121 for (i = 0; i < pagevec_count(&pvec); i++) { 128 for (i = 0; i < pagevec_count(&pvec); i++) {
122 struct page *page = pvec.pages[i]; 129 struct page *page = pvec.pages[i];
123 pgoff_t page_index = page->index; 130 pgoff_t page_index = page->index;
124 131
132 if (page_index > end) {
133 next = page_index;
134 break;
135 }
136
125 if (page_index > next) 137 if (page_index > next)
126 next = page_index; 138 next = page_index;
127 next++; 139 next++;
@@ -157,9 +169,15 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
157 next = start; 169 next = start;
158 continue; 170 continue;
159 } 171 }
172 if (pvec.pages[0]->index > end) {
173 pagevec_release(&pvec);
174 break;
175 }
160 for (i = 0; i < pagevec_count(&pvec); i++) { 176 for (i = 0; i < pagevec_count(&pvec); i++) {
161 struct page *page = pvec.pages[i]; 177 struct page *page = pvec.pages[i];
162 178
179 if (page->index > end)
180 break;
163 lock_page(page); 181 lock_page(page);
164 wait_on_page_writeback(page); 182 wait_on_page_writeback(page);
165 if (page->index > next) 183 if (page->index > next)
@@ -171,7 +189,19 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
171 pagevec_release(&pvec); 189 pagevec_release(&pvec);
172 } 190 }
173} 191}
192EXPORT_SYMBOL(truncate_inode_pages_range);
174 193
194/**
195 * truncate_inode_pages - truncate *all* the pages from an offset
196 * @mapping: mapping to truncate
197 * @lstart: offset from which to truncate
198 *
199 * Called under (and serialised by) inode->i_mutex.
200 */
201void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
202{
203 truncate_inode_pages_range(mapping, lstart, (loff_t)-1);
204}
175EXPORT_SYMBOL(truncate_inode_pages); 205EXPORT_SYMBOL(truncate_inode_pages);
176 206
177/** 207/**
@@ -219,7 +249,6 @@ unlock:
219 break; 249 break;
220 } 250 }
221 pagevec_release(&pvec); 251 pagevec_release(&pvec);
222 cond_resched();
223 } 252 }
224 return ret; 253 return ret;
225} 254}
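
truncate_inode_pages_range() converts byte offsets into page indices the same way the old function did, plus an inclusive end index; the BUG_ON requires lend to be the last byte of a page so that lend >> PAGE_CACHE_SHIFT lands on the final page to remove. The arithmetic, worked through standalone under the assumption of 4 KiB pages:

#include <stdio.h>
#include <assert.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)	/* 4096 */

static void show_range(unsigned long long lstart, unsigned long long lend)
{
	unsigned long long start, partial, end;

	/* first whole page at or after lstart */
	start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	/* bytes kept at the start of the partially-truncated page (0 if aligned) */
	partial = lstart & (PAGE_CACHE_SIZE - 1);

	/* lend must be the last byte of a page (e.g. 20479, or ~0 for "all") */
	assert((lend & (PAGE_CACHE_SIZE - 1)) == PAGE_CACHE_SIZE - 1);
	end = lend >> PAGE_CACHE_SHIFT;		/* inclusive last page index */

	printf("lstart=%llu lend=%llu -> start=%llu partial=%llu end=%llu\n",
	       lstart, lend, start, partial, end);
}

int main(void)
{
	show_range(0, ~0ULL);		/* truncate everything: start=0, partial=0 */
	show_range(6000, 20479);	/* pages 2..4 removed; page 1 zeroed from byte 1904 */
	return 0;
}
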
diff --git a/mm/util.c b/mm/util.c
new file mode 100644
index 000000000000..5f4bb59da63c
--- /dev/null
+++ b/mm/util.c
@@ -0,0 +1,39 @@
1#include <linux/slab.h>
2#include <linux/string.h>
3#include <linux/module.h>
4
5/**
6 * kzalloc - allocate memory. The memory is set to zero.
7 * @size: how many bytes of memory are required.
8 * @flags: the type of memory to allocate.
9 */
10void *kzalloc(size_t size, gfp_t flags)
11{
12 void *ret = kmalloc(size, flags);
13 if (ret)
14 memset(ret, 0, size);
15 return ret;
16}
17EXPORT_SYMBOL(kzalloc);
18
19/*
20 * kstrdup - allocate space for and copy an existing string
21 *
22 * @s: the string to duplicate
23 * @gfp: the GFP mask used in the kmalloc() call when allocating memory
24 */
25char *kstrdup(const char *s, gfp_t gfp)
26{
27 size_t len;
28 char *buf;
29
30 if (!s)
31 return NULL;
32
33 len = strlen(s) + 1;
34 buf = kmalloc(len, gfp);
35 if (buf)
36 memcpy(buf, s, len);
37 return buf;
38}
39EXPORT_SYMBOL(kstrdup);
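
The new mm/util.c collects two helpers callers used to open-code (kstrdup also moves here from slab.c, as shown earlier). Their behaviour is easy to mirror in userspace with malloc(); xzalloc and xstrdup below are illustrative names, not kernel symbols:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Userspace analogue of kzalloc(): allocate and zero-fill. */
static void *xzalloc(size_t size)
{
	void *ret = malloc(size);
	if (ret)
		memset(ret, 0, size);
	return ret;
}

/* Userspace analogue of kstrdup(): NULL-safe string duplication. */
static char *xstrdup(const char *s)
{
	size_t len;
	char *buf;

	if (!s)
		return NULL;

	len = strlen(s) + 1;		/* include the terminating NUL */
	buf = malloc(len);
	if (buf)
		memcpy(buf, s, len);
	return buf;
}

int main(void)
{
	int *zeroed = xzalloc(4 * sizeof(int));
	char *copy = xstrdup("swapfile");

	if (!zeroed || !copy)
		return 1;
	printf("%d %d %s\n", zeroed[0], zeroed[3], copy);
	free(zeroed);
	free(copy);
	return 0;
}
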
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b0cd81c32de6..bf903b2d198f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -63,9 +63,6 @@ struct scan_control {
63 63
64 unsigned long nr_mapped; /* From page_state */ 64 unsigned long nr_mapped; /* From page_state */
65 65
66 /* How many pages shrink_cache() should reclaim */
67 int nr_to_reclaim;
68
69 /* Ask shrink_caches, or shrink_zone to scan at this priority */ 66 /* Ask shrink_caches, or shrink_zone to scan at this priority */
70 unsigned int priority; 67 unsigned int priority;
71 68
@@ -74,9 +71,6 @@ struct scan_control {
74 71
75 int may_writepage; 72 int may_writepage;
76 73
77 /* Can pages be swapped as part of reclaim? */
78 int may_swap;
79
80 /* This context's SWAP_CLUSTER_MAX. If freeing memory for 74 /* This context's SWAP_CLUSTER_MAX. If freeing memory for
81 * suspend, we effectively ignore SWAP_CLUSTER_MAX. 75 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
82 * In this context, it doesn't matter that we scan the 76 * In this context, it doesn't matter that we scan the
@@ -186,8 +180,7 @@ EXPORT_SYMBOL(remove_shrinker);
186 * 180 *
187 * Returns the number of slab objects which we shrunk. 181 * Returns the number of slab objects which we shrunk.
188 */ 182 */
189static int shrink_slab(unsigned long scanned, gfp_t gfp_mask, 183int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages)
190 unsigned long lru_pages)
191{ 184{
192 struct shrinker *shrinker; 185 struct shrinker *shrinker;
193 int ret = 0; 186 int ret = 0;
@@ -275,9 +268,7 @@ static inline int is_page_cache_freeable(struct page *page)
275 268
276static int may_write_to_queue(struct backing_dev_info *bdi) 269static int may_write_to_queue(struct backing_dev_info *bdi)
277{ 270{
278 if (current_is_kswapd()) 271 if (current->flags & PF_SWAPWRITE)
279 return 1;
280 if (current_is_pdflush()) /* This is unlikely, but why not... */
281 return 1; 272 return 1;
282 if (!bdi_write_congested(bdi)) 273 if (!bdi_write_congested(bdi))
283 return 1; 274 return 1;
@@ -367,7 +358,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
367 res = mapping->a_ops->writepage(page, &wbc); 358 res = mapping->a_ops->writepage(page, &wbc);
368 if (res < 0) 359 if (res < 0)
369 handle_write_error(mapping, page, res); 360 handle_write_error(mapping, page, res);
370 if (res == WRITEPAGE_ACTIVATE) { 361 if (res == AOP_WRITEPAGE_ACTIVATE) {
371 ClearPageReclaim(page); 362 ClearPageReclaim(page);
372 return PAGE_ACTIVATE; 363 return PAGE_ACTIVATE;
373 } 364 }
@@ -382,6 +373,43 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
382 return PAGE_CLEAN; 373 return PAGE_CLEAN;
383} 374}
384 375
376static int remove_mapping(struct address_space *mapping, struct page *page)
377{
378 if (!mapping)
379 return 0; /* truncate got there first */
380
381 write_lock_irq(&mapping->tree_lock);
382
383 /*
384 * The non-racy check for busy page. It is critical to check
385 * PageDirty _after_ making sure that the page is freeable and
386 * not in use by anybody. (pagecache + us == 2)
387 */
388 if (unlikely(page_count(page) != 2))
389 goto cannot_free;
390 smp_rmb();
391 if (unlikely(PageDirty(page)))
392 goto cannot_free;
393
394 if (PageSwapCache(page)) {
395 swp_entry_t swap = { .val = page_private(page) };
396 __delete_from_swap_cache(page);
397 write_unlock_irq(&mapping->tree_lock);
398 swap_free(swap);
399 __put_page(page); /* The pagecache ref */
400 return 1;
401 }
402
403 __remove_from_page_cache(page);
404 write_unlock_irq(&mapping->tree_lock);
405 __put_page(page);
406 return 1;
407
408cannot_free:
409 write_unlock_irq(&mapping->tree_lock);
410 return 0;
411}
412
385/* 413/*
386 * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed 414 * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
387 */ 415 */
@@ -430,9 +458,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
430 * Try to allocate it some swap space here. 458 * Try to allocate it some swap space here.
431 */ 459 */
432 if (PageAnon(page) && !PageSwapCache(page)) { 460 if (PageAnon(page) && !PageSwapCache(page)) {
433 if (!sc->may_swap) 461 if (!add_to_swap(page, GFP_ATOMIC))
434 goto keep_locked;
435 if (!add_to_swap(page))
436 goto activate_locked; 462 goto activate_locked;
437 } 463 }
438#endif /* CONFIG_SWAP */ 464#endif /* CONFIG_SWAP */
@@ -515,36 +541,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
515 goto free_it; 541 goto free_it;
516 } 542 }
517 543
518 if (!mapping) 544 if (!remove_mapping(mapping, page))
519 goto keep_locked; /* truncate got there first */ 545 goto keep_locked;
520
521 write_lock_irq(&mapping->tree_lock);
522
523 /*
524 * The non-racy check for busy page. It is critical to check
525 * PageDirty _after_ making sure that the page is freeable and
526 * not in use by anybody. (pagecache + us == 2)
527 */
528 if (unlikely(page_count(page) != 2))
529 goto cannot_free;
530 smp_rmb();
531 if (unlikely(PageDirty(page)))
532 goto cannot_free;
533
534#ifdef CONFIG_SWAP
535 if (PageSwapCache(page)) {
536 swp_entry_t swap = { .val = page_private(page) };
537 __delete_from_swap_cache(page);
538 write_unlock_irq(&mapping->tree_lock);
539 swap_free(swap);
540 __put_page(page); /* The pagecache ref */
541 goto free_it;
542 }
543#endif /* CONFIG_SWAP */
544
545 __remove_from_page_cache(page);
546 write_unlock_irq(&mapping->tree_lock);
547 __put_page(page);
548 546
549free_it: 547free_it:
550 unlock_page(page); 548 unlock_page(page);
@@ -553,10 +551,6 @@ free_it:
553 __pagevec_release_nonlru(&freed_pvec); 551 __pagevec_release_nonlru(&freed_pvec);
554 continue; 552 continue;
555 553
556cannot_free:
557 write_unlock_irq(&mapping->tree_lock);
558 goto keep_locked;
559
560activate_locked: 554activate_locked:
561 SetPageActive(page); 555 SetPageActive(page);
562 pgactivate++; 556 pgactivate++;
@@ -574,6 +568,241 @@ keep:
574 return reclaimed; 568 return reclaimed;
575} 569}
576 570
571#ifdef CONFIG_MIGRATION
572static inline void move_to_lru(struct page *page)
573{
574 list_del(&page->lru);
575 if (PageActive(page)) {
576 /*
577 * lru_cache_add_active checks that
578 * the PG_active bit is off.
579 */
580 ClearPageActive(page);
581 lru_cache_add_active(page);
582 } else {
583 lru_cache_add(page);
584 }
585 put_page(page);
586}
587
588/*
589 * Add isolated pages on the list back to the LRU
590 *
591 * returns the number of pages put back.
592 */
593int putback_lru_pages(struct list_head *l)
594{
595 struct page *page;
596 struct page *page2;
597 int count = 0;
598
599 list_for_each_entry_safe(page, page2, l, lru) {
600 move_to_lru(page);
601 count++;
602 }
603 return count;
604}
605
606/*
607 * swapout a single page
608 * page is locked upon entry, unlocked on exit
609 */
610static int swap_page(struct page *page)
611{
612 struct address_space *mapping = page_mapping(page);
613
614 if (page_mapped(page) && mapping)
615 if (try_to_unmap(page) != SWAP_SUCCESS)
616 goto unlock_retry;
617
618 if (PageDirty(page)) {
619 /* Page is dirty, try to write it out here */
620 switch(pageout(page, mapping)) {
621 case PAGE_KEEP:
622 case PAGE_ACTIVATE:
623 goto unlock_retry;
624
625 case PAGE_SUCCESS:
626 goto retry;
627
628 case PAGE_CLEAN:
629 ; /* try to free the page below */
630 }
631 }
632
633 if (PagePrivate(page)) {
634 if (!try_to_release_page(page, GFP_KERNEL) ||
635 (!mapping && page_count(page) == 1))
636 goto unlock_retry;
637 }
638
639 if (remove_mapping(mapping, page)) {
640 /* Success */
641 unlock_page(page);
642 return 0;
643 }
644
645unlock_retry:
646 unlock_page(page);
647
648retry:
649 return -EAGAIN;
650}
651/*
652 * migrate_pages
653 *
654 * Two lists are passed to this function. The first list
655 * contains the pages isolated from the LRU to be migrated.
656 * The second list contains new pages that the isolated pages
657 * can be moved to. If the second list is NULL then all
658 * pages are swapped out.
659 *
660 * The function returns after 10 attempts or if no pages
661 * are movable anymore because the list has become empty
662 * or no retryable pages exist anymore.
663 *
664 * SIMPLIFIED VERSION: This implementation of migrate_pages
665 * is only swapping out pages and never touches the second
666 * list. The direct migration patchset
667 * extends this function to avoid the use of swap.
668 *
669 * Return: Number of pages not migrated when "to" ran empty.
670 */
671int migrate_pages(struct list_head *from, struct list_head *to,
672 struct list_head *moved, struct list_head *failed)
673{
674 int retry;
675 int nr_failed = 0;
676 int pass = 0;
677 struct page *page;
678 struct page *page2;
679 int swapwrite = current->flags & PF_SWAPWRITE;
680 int rc;
681
682 if (!swapwrite)
683 current->flags |= PF_SWAPWRITE;
684
685redo:
686 retry = 0;
687
688 list_for_each_entry_safe(page, page2, from, lru) {
689 cond_resched();
690
691 rc = 0;
692 if (page_count(page) == 1)
693 /* page was freed from under us. So we are done. */
694 goto next;
695
696 /*
697 * Skip locked pages during the first three passes to give the
698 * functions holding the lock time to release the page. Later we
699 * use lock_page() to have a higher chance of acquiring the
700 * lock.
701 */
702 rc = -EAGAIN;
703 if (pass > 2)
704 lock_page(page);
705 else
706 if (TestSetPageLocked(page))
707 goto next;
708
709 /*
710 * Only wait on writeback if we have already done a pass where
711 * we may have triggered writeouts for lots of pages.
712 */
713 if (pass > 0) {
714 wait_on_page_writeback(page);
715 } else {
716 if (PageWriteback(page))
717 goto unlock_page;
718 }
719
720 /*
721 * Anonymous pages must have swap cache references otherwise
722 * the information contained in the page maps cannot be
723 * preserved.
724 */
725 if (PageAnon(page) && !PageSwapCache(page)) {
726 if (!add_to_swap(page, GFP_KERNEL)) {
727 rc = -ENOMEM;
728 goto unlock_page;
729 }
730 }
731
732 /*
733 * Page is properly locked and writeback is complete.
734 * Try to migrate the page.
735 */
736 rc = swap_page(page);
737 goto next;
738
739unlock_page:
740 unlock_page(page);
741
742next:
743 if (rc == -EAGAIN) {
744 retry++;
745 } else if (rc) {
746 /* Permanent failure */
747 list_move(&page->lru, failed);
748 nr_failed++;
749 } else {
750 /* Success */
751 list_move(&page->lru, moved);
752 }
753 }
754 if (retry && pass++ < 10)
755 goto redo;
756
757 if (!swapwrite)
758 current->flags &= ~PF_SWAPWRITE;
759
760 return nr_failed + retry;
761}
762
763static void lru_add_drain_per_cpu(void *dummy)
764{
765 lru_add_drain();
766}
767
768/*
769 * Isolate one page from the LRU lists and put it on the
770 * indicated list. Do necessary cache draining if the
771 * page is not on the LRU lists yet.
772 *
773 * Result:
774 * 0 = page not on LRU list
775 * 1 = page removed from LRU list and added to the specified list.
776 * -ENOENT = page is being freed elsewhere.
777 */
778int isolate_lru_page(struct page *page)
779{
780 int rc = 0;
781 struct zone *zone = page_zone(page);
782
783redo:
784 spin_lock_irq(&zone->lru_lock);
785 rc = __isolate_lru_page(page);
786 if (rc == 1) {
787 if (PageActive(page))
788 del_page_from_active_list(zone, page);
789 else
790 del_page_from_inactive_list(zone, page);
791 }
792 spin_unlock_irq(&zone->lru_lock);
793 if (rc == 0) {
794 /*
795 * Maybe this page is still waiting for a cpu to drain it
796 * from one of the lru lists?
797 */
798 rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL);
799 if (rc == 0 && PageLRU(page))
800 goto redo;
801 }
802 return rc;
803}
804#endif
805
577/* 806/*
578 * zone->lru_lock is heavily contended. Some of the functions that 807 * zone->lru_lock is heavily contended. Some of the functions that
579 * shrink the lists perform better by taking out a batch of pages 808 * shrink the lists perform better by taking out a batch of pages
@@ -602,20 +831,18 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
602 page = lru_to_page(src); 831 page = lru_to_page(src);
603 prefetchw_prev_lru_page(page, src, flags); 832 prefetchw_prev_lru_page(page, src, flags);
604 833
605 if (!TestClearPageLRU(page)) 834 switch (__isolate_lru_page(page)) {
606 BUG(); 835 case 1:
607 list_del(&page->lru); 836 /* Succeeded to isolate page */
608 if (get_page_testone(page)) { 837 list_move(&page->lru, dst);
609 /*
610 * It is being freed elsewhere
611 */
612 __put_page(page);
613 SetPageLRU(page);
614 list_add(&page->lru, src);
615 continue;
616 } else {
617 list_add(&page->lru, dst);
618 nr_taken++; 838 nr_taken++;
839 break;
840 case -ENOENT:
841 /* Not possible to isolate */
842 list_move(&page->lru, src);
843 break;
844 default:
845 BUG();
619 } 846 }
620 } 847 }
621 848
@@ -653,17 +880,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
653 goto done; 880 goto done;
654 881
655 max_scan -= nr_scan; 882 max_scan -= nr_scan;
656 if (current_is_kswapd())
657 mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
658 else
659 mod_page_state_zone(zone, pgscan_direct, nr_scan);
660 nr_freed = shrink_list(&page_list, sc); 883 nr_freed = shrink_list(&page_list, sc);
661 if (current_is_kswapd())
662 mod_page_state(kswapd_steal, nr_freed);
663 mod_page_state_zone(zone, pgsteal, nr_freed);
664 sc->nr_to_reclaim -= nr_freed;
665 884
666 spin_lock_irq(&zone->lru_lock); 885 local_irq_disable();
886 if (current_is_kswapd()) {
887 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
888 __mod_page_state(kswapd_steal, nr_freed);
889 } else
890 __mod_page_state_zone(zone, pgscan_direct, nr_scan);
891 __mod_page_state_zone(zone, pgsteal, nr_freed);
892
893 spin_lock(&zone->lru_lock);
667 /* 894 /*
668 * Put back any unfreeable pages. 895 * Put back any unfreeable pages.
669 */ 896 */
@@ -825,11 +1052,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
825 } 1052 }
826 } 1053 }
827 zone->nr_active += pgmoved; 1054 zone->nr_active += pgmoved;
828 spin_unlock_irq(&zone->lru_lock); 1055 spin_unlock(&zone->lru_lock);
829 pagevec_release(&pvec);
830 1056
831 mod_page_state_zone(zone, pgrefill, pgscanned); 1057 __mod_page_state_zone(zone, pgrefill, pgscanned);
832 mod_page_state(pgdeactivate, pgdeactivate); 1058 __mod_page_state(pgdeactivate, pgdeactivate);
1059 local_irq_enable();
1060
1061 pagevec_release(&pvec);
833} 1062}
834 1063
835/* 1064/*
@@ -861,8 +1090,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
861 else 1090 else
862 nr_inactive = 0; 1091 nr_inactive = 0;
863 1092
864 sc->nr_to_reclaim = sc->swap_cluster_max;
865
866 while (nr_active || nr_inactive) { 1093 while (nr_active || nr_inactive) {
867 if (nr_active) { 1094 if (nr_active) {
868 sc->nr_to_scan = min(nr_active, 1095 sc->nr_to_scan = min(nr_active,
@@ -876,8 +1103,6 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
876 (unsigned long)sc->swap_cluster_max); 1103 (unsigned long)sc->swap_cluster_max);
877 nr_inactive -= sc->nr_to_scan; 1104 nr_inactive -= sc->nr_to_scan;
878 shrink_cache(zone, sc); 1105 shrink_cache(zone, sc);
879 if (sc->nr_to_reclaim <= 0)
880 break;
881 } 1106 }
882 } 1107 }
883 1108
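
With sc->nr_to_reclaim gone, shrink_zone() now works both counts down to zero in swap_cluster_max-sized chunks instead of bailing out early once enough pages were reclaimed. A runnable sketch of that loop shape, with made-up counts:

#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long nr_active = 70, nr_inactive = 45;

	while (nr_active || nr_inactive) {
		if (nr_active) {
			unsigned long chunk = min_ul(nr_active, SWAP_CLUSTER_MAX);
			nr_active -= chunk;
			printf("refill_inactive_zone: scan %lu active\n", chunk);
		}
		if (nr_inactive) {
			unsigned long chunk = min_ul(nr_inactive, SWAP_CLUSTER_MAX);
			nr_inactive -= chunk;
			printf("shrink_cache: scan %lu inactive\n", chunk);
		}
	}
	return 0;
}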
@@ -910,7 +1135,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
910 for (i = 0; zones[i] != NULL; i++) { 1135 for (i = 0; zones[i] != NULL; i++) {
911 struct zone *zone = zones[i]; 1136 struct zone *zone = zones[i];
912 1137
913 if (zone->present_pages == 0) 1138 if (!populated_zone(zone))
914 continue; 1139 continue;
915 1140
916 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 1141 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
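
This hunk, like the later ones in balance_pgdat() and wakeup_kswapd(), replaces the open-coded zone->present_pages == 0 test with the populated_zone() helper. A minimal sketch of such an accessor over a stand-in struct (not the kernel's struct zone):

#include <stdbool.h>
#include <stdio.h>

struct zone_stub { unsigned long present_pages; };

/* Accessor wrapping the raw field test, in the spirit of populated_zone(). */
static inline bool populated_zone(const struct zone_stub *zone)
{
	return zone->present_pages != 0;
}

int main(void)
{
	struct zone_stub empty = { 0 }, normal = { 4096 };

	if (!populated_zone(&empty))
		puts("skipping empty zone");
	if (populated_zone(&normal))
		puts("scanning populated zone");
	return 0;
}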
@@ -952,7 +1177,6 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
952 1177
953 sc.gfp_mask = gfp_mask; 1178 sc.gfp_mask = gfp_mask;
954 sc.may_writepage = 0; 1179 sc.may_writepage = 0;
955 sc.may_swap = 1;
956 1180
957 inc_page_state(allocstall); 1181 inc_page_state(allocstall);
958 1182
@@ -1055,7 +1279,6 @@ loop_again:
1055 total_reclaimed = 0; 1279 total_reclaimed = 0;
1056 sc.gfp_mask = GFP_KERNEL; 1280 sc.gfp_mask = GFP_KERNEL;
1057 sc.may_writepage = 0; 1281 sc.may_writepage = 0;
1058 sc.may_swap = 1;
1059 sc.nr_mapped = read_page_state(nr_mapped); 1282 sc.nr_mapped = read_page_state(nr_mapped);
1060 1283
1061 inc_page_state(pageoutrun); 1284 inc_page_state(pageoutrun);
@@ -1084,7 +1307,7 @@ loop_again:
1084 for (i = pgdat->nr_zones - 1; i >= 0; i--) { 1307 for (i = pgdat->nr_zones - 1; i >= 0; i--) {
1085 struct zone *zone = pgdat->node_zones + i; 1308 struct zone *zone = pgdat->node_zones + i;
1086 1309
1087 if (zone->present_pages == 0) 1310 if (!populated_zone(zone))
1088 continue; 1311 continue;
1089 1312
1090 if (zone->all_unreclaimable && 1313 if (zone->all_unreclaimable &&
@@ -1121,7 +1344,7 @@ scan:
1121 struct zone *zone = pgdat->node_zones + i; 1344 struct zone *zone = pgdat->node_zones + i;
1122 int nr_slab; 1345 int nr_slab;
1123 1346
1124 if (zone->present_pages == 0) 1347 if (!populated_zone(zone))
1125 continue; 1348 continue;
1126 1349
1127 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1350 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
@@ -1238,7 +1461,7 @@ static int kswapd(void *p)
1238 * us from recursively trying to free more memory as we're 1461 * us from recursively trying to free more memory as we're
1239 * trying to free the first piece of memory in the first place). 1462 * trying to free the first piece of memory in the first place).
1240 */ 1463 */
1241 tsk->flags |= PF_MEMALLOC|PF_KSWAPD; 1464 tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
1242 1465
1243 order = 0; 1466 order = 0;
1244 for ( ; ; ) { 1467 for ( ; ; ) {
@@ -1273,7 +1496,7 @@ void wakeup_kswapd(struct zone *zone, int order)
1273{ 1496{
1274 pg_data_t *pgdat; 1497 pg_data_t *pgdat;
1275 1498
1276 if (zone->present_pages == 0) 1499 if (!populated_zone(zone))
1277 return; 1500 return;
1278 1501
1279 pgdat = zone->zone_pgdat; 1502 pgdat = zone->zone_pgdat;
@@ -1353,76 +1576,3 @@ static int __init kswapd_init(void)
1353} 1576}
1354 1577
1355module_init(kswapd_init) 1578module_init(kswapd_init)
1356
1357
1358/*
1359 * Try to free up some pages from this zone through reclaim.
1360 */
1361int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1362{
1363 struct scan_control sc;
1364 int nr_pages = 1 << order;
1365 int total_reclaimed = 0;
1366
1367 /* The reclaim may sleep, so don't do it if sleep isn't allowed */
1368 if (!(gfp_mask & __GFP_WAIT))
1369 return 0;
1370 if (zone->all_unreclaimable)
1371 return 0;
1372
1373 sc.gfp_mask = gfp_mask;
1374 sc.may_writepage = 0;
1375 sc.may_swap = 0;
1376 sc.nr_mapped = read_page_state(nr_mapped);
1377 sc.nr_scanned = 0;
1378 sc.nr_reclaimed = 0;
1379 /* scan at the highest priority */
1380 sc.priority = 0;
1381 disable_swap_token();
1382
1383 if (nr_pages > SWAP_CLUSTER_MAX)
1384 sc.swap_cluster_max = nr_pages;
1385 else
1386 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1387
1388 /* Don't reclaim the zone if there are other reclaimers active */
1389 if (atomic_read(&zone->reclaim_in_progress) > 0)
1390 goto out;
1391
1392 shrink_zone(zone, &sc);
1393 total_reclaimed = sc.nr_reclaimed;
1394
1395 out:
1396 return total_reclaimed;
1397}
1398
1399asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
1400 unsigned int state)
1401{
1402 struct zone *z;
1403 int i;
1404
1405 if (!capable(CAP_SYS_ADMIN))
1406 return -EACCES;
1407
1408 if (node >= MAX_NUMNODES || !node_online(node))
1409 return -EINVAL;
1410
1411 /* This will break if we ever add more zones */
1412 if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
1413 return -EINVAL;
1414
1415 for (i = 0; i < MAX_NR_ZONES; i++) {
1416 if (!(zone & 1<<i))
1417 continue;
1418
1419 z = &NODE_DATA(node)->node_zones[i];
1420
1421 if (state)
1422 z->reclaim_pages = 1;
1423 else
1424 z->reclaim_pages = 0;
1425 }
1426
1427 return 0;
1428}
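
The removed zone_reclaim() above guarded against piling up reclaimers by checking zone->reclaim_in_progress before scanning. A small userspace sketch of that guard, using C11 atomics as a stand-in for the kernel's atomic_t; the counter handling here is simplified, since in the kernel shrink_zone() maintains it.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int reclaim_in_progress;

static int zone_reclaim_mock(void)
{
	int reclaimed = 0;

	/* Bail out if another reclaimer is already active on this zone. */
	if (atomic_load(&reclaim_in_progress) > 0)
		return 0;

	atomic_fetch_add(&reclaim_in_progress, 1);
	reclaimed = 32;			/* pretend shrink_zone() freed pages */
	atomic_fetch_sub(&reclaim_in_progress, 1);

	return reclaimed;
}

int main(void)
{
	printf("reclaimed %d pages\n", zone_reclaim_mock());
	return 0;
}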