29 files changed, 2603 insertions, 1587 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a9cb80ae6409..332f5c29b53a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,5 +137,11 @@ config SPLIT_PTLOCK_CPUS
 # support for page migration
 #
 config MIGRATION
-        def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
+        bool "Page migration"
-        depends on SWAP
+        def_bool y if NUMA
+        depends on SWAP && NUMA
+        help
+          Allows the migration of the physical location of pages of processes
+          while the virtual addresses are not changed. This is useful for
+          example on NUMA systems to put pages nearer to the processors accessing
+          the page.
diff --git a/mm/Makefile b/mm/Makefile
index 9aa03fa1dcc3..0b8f73f2ed16 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 obj-y                   := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
                           page_alloc.o page-writeback.o pdflush.o \
                           readahead.o swap.o truncate.o vmscan.o \
-                           prio_tree.o util.o $(mmu-y)
+                           prio_tree.o util.o mmzone.o $(mmu-y)
 obj-$(CONFIG_SWAP)      += page_io.o swap_state.o swapfile.o thrash.o
 obj-$(CONFIG_HUGETLBFS) += hugetlb.o
@@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
+obj-$(CONFIG_MIGRATION) += migrate.o
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 35c32290f717..d3e3bd2ffcea 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -33,6 +33,7 @@ EXPORT_SYMBOL(max_pfn);		/* This is exported so
                                 * dma_get_required_mask(), which uses
                                 * it, can be an inline function */
+static LIST_HEAD(bdata_list);
 #ifdef CONFIG_CRASH_DUMP
 /*
 * If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -52,6 +53,27 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages)
        return mapsize;
 }
+/*
+ * link_bootmem - add @bdata to bdata_list, keeping the list sorted
+ * @bdata: bootmem descriptor to insert; its node_boot_start is the sort key
+ *
+ * The list is kept in ascending node_boot_start order.  The new entry is
+ * placed directly in front of the first existing entry whose start address
+ * is higher; if there is no such entry (including the empty-list case, where
+ * the loop body never runs) it is appended at the tail.
+ *
+ * Marked __init because the only caller visible in this patch is
+ * init_bootmem_core(), which is itself __init.
+ */
+static void __init link_bootmem(bootmem_data_t *bdata)
+{
+        bootmem_data_t *ent;
+        list_for_each_entry(ent, &bdata_list, list) {
+                if (bdata->node_boot_start < ent->node_boot_start) {
+                        /* list_add_tail() on ent inserts just before ent */
+                        list_add_tail(&bdata->list, &ent->list);
+                        return;
+                }
+        }
+        list_add_tail(&bdata->list, &bdata_list);
+}
 /*
 * Called once to set up the allocator itself.
@@ -62,13 +84,11 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
        bootmem_data_t *bdata = pgdat->bdata;
        unsigned long mapsize = ((end - start)+7)/8;
-        pgdat->pgdat_next = pgdat_list;
-        pgdat_list = pgdat;
        mapsize = ALIGN(mapsize, sizeof(long));
        bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
        bdata->node_boot_start = (start << PAGE_SHIFT);
        bdata->node_low_pfn = end;
+        link_bootmem(bdata);
        /*
         * Initially all pages are reserved - setup_arch() has to
@@ -152,7 +172,7 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
 *
 * NOTE:  This function is _not_ reentrant.
 */
-static void * __init
+void * __init
 __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
              unsigned long align, unsigned long goal, unsigned long limit)
 {
@@ -383,12 +403,11 @@ unsigned long __init free_all_bootmem (void)
 void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
 {
-        pg_data_t *pgdat = pgdat_list;
+        bootmem_data_t *bdata;
        void *ptr;
-        for_each_pgdat(pgdat)
+        list_for_each_entry(bdata, &bdata_list, list)
-                if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+                if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0)))
-                                                 align, goal, 0)))
                        return(ptr);
        /*
@@ -416,11 +435,11 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigne
 void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
 {
-        pg_data_t *pgdat = pgdat_list;
+        bootmem_data_t *bdata;
        void *ptr;
-        for_each_pgdat(pgdat)
+        list_for_each_entry(bdata, &bdata_list, list)
-                if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+                if ((ptr = __alloc_bootmem_core(bdata, size,
                                                 align, goal, LOW32LIMIT)))
                        return(ptr);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index d257c89e7704..907c39257ca0 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -15,6 +15,7 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/fadvise.h>
+#include <linux/writeback.h>
 #include <linux/syscalls.h>
 #include <asm/unistd.h>
@@ -22,13 +23,36 @@
 /*
 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
 * deactivate the pages and clear PG_Referenced.
+ *
+ * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
+ * offsets `offset' and `offset+len' inclusive.  Any pages which are currently
+ * under writeout are skipped, whether or not they are dirty.
+ *
+ * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file
+ * offsets `offset' and `offset+len'.
+ *
+ * By combining these two operations the application may do several things:
+ *
+ * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk.
+ *
+ * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently
+ * dirty pages at the disk.
+ *
+ * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push
+ * all of the currently dirty pages at the disk, wait until they have been
+ * written.
+ *
+ * It should be noted that none of these operations write out the file's
+ * metadata.  So unless the application is strictly performing overwrites of
+ * already-instantiated disk blocks, there are no guarantees here that the data
+ * will be available after a crash.
 */
 asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 {
        struct file *file = fget(fd);
        struct address_space *mapping;
        struct backing_dev_info *bdi;
-        loff_t endbyte;
+        loff_t endbyte;                 /* inclusive */
        pgoff_t start_index;
        pgoff_t end_index;
        unsigned long nrpages;
@@ -56,6 +80,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
        endbyte = offset + len;
        if (!len || endbyte < len)
                endbyte = -1;
+        else
+                endbyte--;              /* inclusive */
        bdi = mapping->backing_dev_info;
@@ -78,7 +104,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
                /* First and last PARTIAL page! */
                start_index = offset >> PAGE_CACHE_SHIFT;
-                end_index = (endbyte-1) >> PAGE_CACHE_SHIFT;
+                end_index = endbyte >> PAGE_CACHE_SHIFT;
                /* Careful about overflow on the "+1" */
                nrpages = end_index - start_index + 1;
@@ -96,11 +122,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
                        filemap_flush(mapping);
                /* First and last FULL page! */
-                start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
+                start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
                end_index = (endbyte >> PAGE_CACHE_SHIFT);
-                if (end_index > start_index)
+                if (end_index >= start_index)
-                        invalidate_mapping_pages(mapping, start_index, end_index-1);
+                        invalidate_mapping_pages(mapping, start_index,
+                                                end_index);
+                break;
+        case LINUX_FADV_ASYNC_WRITE:
+                ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
+                                                WB_SYNC_NONE);
+                break;
+        case LINUX_FADV_WRITE_WAIT:
+                ret = wait_on_page_writeback_range(mapping,
+                                        offset >> PAGE_CACHE_SHIFT,
+                                        endbyte >> PAGE_CACHE_SHIFT);
                break;
        default:
                ret = -EINVAL;
diff --git a/mm/filemap.c b/mm/filemap.c
index 7624c26fcea6..1120338a5d0f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,7 +29,10 @@
 #include <linux/blkdev.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/cpuset.h>
 #include "filemap.h"
+#include "internal.h"
 /*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
@@ -172,7 +175,7 @@ static int sync_page(void *word)
 * dirty pages that lie within the byte offsets <start, end>
 * @mapping:    address space structure to write
 * @start:      offset in bytes where the range starts
- * @end:        offset in bytes where the range ends
+ * @end:        offset in bytes where the range ends (inclusive)
 * @sync_mode:  enable synchronous operation
 *
 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
@@ -180,8 +183,8 @@ static int sync_page(void *word)
 * these two operations is that if a dirty page/buffer is encountered, it must
 * be waited upon, and not just skipped over.
 */
-static int __filemap_fdatawrite_range(struct address_space *mapping,
+int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
-        loff_t start, loff_t end, int sync_mode)
+                                loff_t end, int sync_mode)
 {
        int ret;
        struct writeback_control wbc = {
@@ -210,8 +213,8 @@ int filemap_fdatawrite(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_fdatawrite);
-static int filemap_fdatawrite_range(struct address_space *mapping,
+static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
-        loff_t start, loff_t end)
+                                loff_t end)
 {
        return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
 }
@@ -230,7 +233,7 @@ EXPORT_SYMBOL(filemap_flush);
 * Wait for writeback to complete against pages indexed by start->end
 * inclusive
 */
-static int wait_on_page_writeback_range(struct address_space *mapping,
+int wait_on_page_writeback_range(struct address_space *mapping,
                                pgoff_t start, pgoff_t end)
 {
        struct pagevec pvec;
@@ -365,6 +368,12 @@ int filemap_write_and_wait(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_write_and_wait);
+/*
+ * Write out and wait upon file offsets lstart->lend, inclusive.
+ *
+ * Note that `lend' is inclusive (describes the last byte to be written) so
+ * that this function can be used to write to the very end-of-file (lend = -1).
+ */
 int filemap_write_and_wait_range(struct address_space *mapping,
                                 loff_t lstart, loff_t lend)
 {
@@ -425,6 +434,28 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
        return ret;
 }
+#ifdef CONFIG_NUMA
+/*
+ * Allocate a single (order-0) page for mapping @x using the mapping's gfp
+ * mask.  If cpuset_do_page_mem_spread() is true the page is requested from
+ * the node returned by cpuset_mem_spread_node(); otherwise the plain
+ * alloc_pages() path is taken.
+ */
+struct page *page_cache_alloc(struct address_space *x)
+{
+        int nid;
+        if (!cpuset_do_page_mem_spread())
+                return alloc_pages(mapping_gfp_mask(x), 0);
+        nid = cpuset_mem_spread_node();
+        return alloc_pages_node(nid, mapping_gfp_mask(x), 0);
+}
+EXPORT_SYMBOL(page_cache_alloc);
+/*
+ * Same as page_cache_alloc(), but __GFP_COLD is OR-ed into the mapping's
+ * gfp mask on both the cpuset-spread and the ordinary allocation path.
+ */
+struct page *page_cache_alloc_cold(struct address_space *x)
+{
+        int nid;
+        if (!cpuset_do_page_mem_spread())
+                return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
+        nid = cpuset_mem_spread_node();
+        return alloc_pages_node(nid, mapping_gfp_mask(x)|__GFP_COLD, 0);
+}
+EXPORT_SYMBOL(page_cache_alloc_cold);
+#endif
 /*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
diff --git a/mm/highmem.c b/mm/highmem.c
index ce2e7e8bbfa7..55885f64af40 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,18 +26,14 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
+#include <linux/blktrace_api.h>
 #include <asm/tlbflush.h>
 static mempool_t *page_pool, *isa_page_pool;
-static void *page_pool_alloc_isa(gfp_t gfp_mask, void *data)
+static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
 {
-        return alloc_page(gfp_mask | GFP_DMA);
+        return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
-}
-static void page_pool_free(void *page, void *data)
-{
-        __free_page(page);
 }
 /*
@@ -50,11 +46,6 @@ static void page_pool_free(void *page, void *data)
 */
 #ifdef CONFIG_HIGHMEM
-static void *page_pool_alloc(gfp_t gfp_mask, void *data)
-{
-        return alloc_page(gfp_mask);
-}
 static int pkmap_count[LAST_PKMAP];
 static unsigned int last_pkmap_nr;
 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
@@ -228,7 +219,7 @@ static __init int init_emergency_pool(void)
        if (!i.totalhigh)
                return 0;
-        page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL);
+        page_pool = mempool_create_page_pool(POOL_SIZE, 0);
        if (!page_pool)
                BUG();
        printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
@@ -271,7 +262,8 @@ int init_emergency_isa_pool(void)
        if (isa_page_pool)
                return 0;
-        isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc_isa, page_pool_free, NULL);
+        isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
+                                       mempool_free_pages, (void *) 0);
        if (!isa_page_pool)
                BUG();
@@ -336,7 +328,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
        bio_put(bio);
 }
-static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done,int err)
+static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
 {
        if (bio->bi_size)
                return 1;
@@ -383,7 +375,7 @@ static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int
 }
 static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
-                        mempool_t *pool)
+                               mempool_t *pool)
 {
        struct page *page;
        struct bio *bio = NULL;
@@ -483,6 +475,8 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
                pool = isa_page_pool;
        }
+        blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
        /*
         * slow path
         */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 508707704d2c..ebad6bbb3501 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -13,24 +13,48 @@
 #include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 #include <linux/cpuset.h>
+#include <linux/mutex.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <linux/hugetlb.h>
+#include "internal.h"
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages;
+static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
 /*
 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
 */
 static DEFINE_SPINLOCK(hugetlb_lock);
+/*
+ * Zero all HPAGE_SIZE/PAGE_SIZE subpages of the huge page starting at @page.
+ *
+ * Subpage i is cleared with user address @addr + i*PAGE_SIZE, the same
+ * convention copy_huge_page() uses.  Passing the identical @addr for every
+ * subpage would hand clear_user_highpage() the wrong virtual address for all
+ * but the first one.
+ * NOTE(review): the caller passes the faulting address as-is; confirm whether
+ * it should be masked with HPAGE_MASK first.
+ *
+ * May sleep: might_sleep() is asserted and cond_resched() is called before
+ * each subpage.
+ */
+static void clear_huge_page(struct page *page, unsigned long addr)
+{
+        int i;
+        might_sleep();
+        for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
+                cond_resched();
+                clear_user_highpage(page + i, addr + i*PAGE_SIZE);
+        }
+}
+/*
+ * Copy the huge page at @src to @dst one PAGE_SIZE subpage at a time.
+ * @addr is the user address passed for the first subpage; it advances by
+ * PAGE_SIZE for each following subpage.  May sleep (might_sleep() is
+ * asserted, cond_resched() is called before every subpage).
+ */
+static void copy_huge_page(struct page *dst, struct page *src,
+                           unsigned long addr)
+{
+        int idx = 0;
+        might_sleep();
+        while (idx < HPAGE_SIZE/PAGE_SIZE) {
+                cond_resched();
+                copy_user_highpage(dst + idx, src + idx, addr);
+                addr += PAGE_SIZE;
+                idx++;
+        }
+}
 static void enqueue_huge_page(struct page *page)
 {
        int nid = page_to_nid(page);
@@ -64,57 +88,176 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
        return page;
 }
-static struct page *alloc_fresh_huge_page(void)
+/*
+ * Hand a huge page back via enqueue_huge_page().  alloc_fresh_huge_page()
+ * installs this function in page[1].lru.next as the page's destructor.
+ */
+static void free_huge_page(struct page *page)
+{
+        /* The page must not have any references left at this point. */
+        BUG_ON(page_count(page));
+        /* Reset the list linkage before the page is queued again. */
+        INIT_LIST_HEAD(&page->lru);
+        /* enqueue_huge_page() is called with hugetlb_lock held. */
+        spin_lock(&hugetlb_lock);
+        enqueue_huge_page(page);
+        spin_unlock(&hugetlb_lock);
+}
+static int alloc_fresh_huge_page(void)
 {
        static int nid = 0;
        struct page *page;
        page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
                                        HUGETLB_PAGE_ORDER);
-        nid = (nid + 1) % num_online_nodes();
+        nid = next_node(nid, node_online_map);
+        if (nid == MAX_NUMNODES)
+                nid = first_node(node_online_map);
        if (page) {
+                page[1].lru.next = (void *)free_huge_page;      /* dtor */
                spin_lock(&hugetlb_lock);
                nr_huge_pages++;
                nr_huge_pages_node[page_to_nid(page)]++;
                spin_unlock(&hugetlb_lock);
+                put_page(page); /* free it into the hugepage allocator */
+                return 1;
        }
-        return page;
+        return 0;
 }
-void free_huge_page(struct page *page)
+static struct page *alloc_huge_page(struct vm_area_struct *vma,
+                                    unsigned long addr)
 {
-        BUG_ON(page_count(page));
+        struct inode *inode = vma->vm_file->f_dentry->d_inode;
+        struct page *page;
+        int use_reserve = 0;
+        unsigned long idx;
-        INIT_LIST_HEAD(&page->lru);
+        spin_lock(&hugetlb_lock);
-        page[1].lru.next = NULL;                        /* reset dtor */
+        if (vma->vm_flags & VM_MAYSHARE) {
+                /* idx = radix tree index, i.e. offset into file in
+                 * HPAGE_SIZE units */
+                idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+                        + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+                /* The hugetlbfs specific inode info stores the number
+                 * of "guaranteed available" (huge) pages.  That is,
+                 * the first 'prereserved_hpages' pages of the inode
+                 * are either already instantiated, or have been
+                 * pre-reserved (by hugetlb_reserve_for_inode()). Here
+                 * we're in the process of instantiating the page, so
+                 * we use this to determine whether to draw from the
+                 * pre-reserved pool or the truly free pool. */
+                if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
+                        use_reserve = 1;
+        }
+        if (!use_reserve) {
+                if (free_huge_pages <= reserved_huge_pages)
+                        goto fail;
+        } else {
+                BUG_ON(reserved_huge_pages == 0);
+                reserved_huge_pages--;
+        }
+        page = dequeue_huge_page(vma, addr);
+        if (!page)
+                goto fail;
+        spin_unlock(&hugetlb_lock);
+        set_page_refcounted(page);
+        return page;
+ fail:
+        WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
+        spin_unlock(&hugetlb_lock);
+        return NULL;
+}
+/* hugetlb_extend_reservation()
+ *
+ * Ensure that at least 'atleast' hugepages are, and will remain,
+ * available to instantiate the first 'atleast' pages of the given
+ * inode.  If the inode doesn't already have this many pages reserved
+ * or instantiated, set aside some hugepages in the reserved pool to
+ * satisfy later faults (or fail now if there aren't enough, rather
+ * than getting the SIGBUS later).
+ */
+int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
+                               unsigned long atleast)
+{
+        struct inode *inode = &info->vfs_inode;
+        unsigned long change_in_reserve = 0;
+        int ret = 0;
        spin_lock(&hugetlb_lock);
-        enqueue_huge_page(page);
+        read_lock_irq(&inode->i_mapping->tree_lock);
+        if (info->prereserved_hpages >= atleast)
+                goto out;
+        /* Because we always call this on shared mappings, none of the
+         * pages beyond info->prereserved_hpages can have been
+         * instantiated, so we need to reserve all of them now. */
+        change_in_reserve = atleast - info->prereserved_hpages;
+        if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
+                ret = -ENOMEM;
+                goto out;
+        }
+        reserved_huge_pages += change_in_reserve;
+        info->prereserved_hpages = atleast;
+ out:
+        read_unlock_irq(&inode->i_mapping->tree_lock);
        spin_unlock(&hugetlb_lock);
+        return ret;
 }
-struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
+/* hugetlb_truncate_reservation()
+ *
+ * This returns pages reserved for the given inode to the general free
+ * hugepage pool.  If the inode has any pages prereserved, but not
+ * instantiated, beyond offset (atmost << HPAGE_SHIFT), then release
+ * them.
+ */
+void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
+                                  unsigned long atmost)
 {
+        struct inode *inode = &info->vfs_inode;
+        struct address_space *mapping = inode->i_mapping;
+        unsigned long idx;
+        unsigned long change_in_reserve = 0;
        struct page *page;
-        int i;
        spin_lock(&hugetlb_lock);
-        page = dequeue_huge_page(vma, addr);
+        read_lock_irq(&inode->i_mapping->tree_lock);
-        if (!page) {
-                spin_unlock(&hugetlb_lock);
+        if (info->prereserved_hpages <= atmost)
-                return NULL;
+                goto out;
+        /* Count pages which were reserved, but not instantiated, and
+         * which we can now release. */
+        for (idx = atmost; idx < info->prereserved_hpages; idx++) {
+                page = radix_tree_lookup(&mapping->page_tree, idx);
+                if (!page)
+                        /* Pages which are already instantiated can't
+                         * be unreserved (and in fact have already
+                         * been removed from the reserved pool) */
+                        change_in_reserve++;
        }
+        BUG_ON(reserved_huge_pages < change_in_reserve);
+        reserved_huge_pages -= change_in_reserve;
+        info->prereserved_hpages = atmost;
+ out:
+        read_unlock_irq(&inode->i_mapping->tree_lock);
        spin_unlock(&hugetlb_lock);
-        set_page_count(page, 1);
-        page[1].lru.next = (void *)free_huge_page;      /* set dtor */
-        for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
-                clear_user_highpage(&page[i], addr);
-        return page;
 }
 static int __init hugetlb_init(void)
 {
        unsigned long i;
-        struct page *page;
        if (HPAGE_SHIFT == 0)
                return 0;
@@ -123,12 +266,8 @@ static int __init hugetlb_init(void)
                INIT_LIST_HEAD(&hugepage_freelists[i]);
        for (i = 0; i < max_huge_pages; ++i) {
-                page = alloc_fresh_huge_page();
+                if (!alloc_fresh_huge_page())
-                if (!page)
                        break;
-                spin_lock(&hugetlb_lock);
-                enqueue_huge_page(page);
-                spin_unlock(&hugetlb_lock);
        }
        max_huge_pages = free_huge_pages = nr_huge_pages = i;
        printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
@@ -154,9 +293,9 @@ static void update_and_free_page(struct page *page)
                page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
                                1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
                                1 << PG_private | 1<< PG_writeback);
-                set_page_count(&page[i], 0);
        }
-        set_page_count(page, 1);
+        page[1].lru.next = NULL;
+        set_page_refcounted(page);
        __free_pages(page, HUGETLB_PAGE_ORDER);
 }
@@ -188,12 +327,8 @@ static inline void try_to_free_low(unsigned long count)
 static unsigned long set_max_huge_pages(unsigned long count)
 {
        while (count > nr_huge_pages) {
-                struct page *page = alloc_fresh_huge_page();
+                if (!alloc_fresh_huge_page())
-                if (!page)
                        return nr_huge_pages;
-                spin_lock(&hugetlb_lock);
-                enqueue_huge_page(page);
-                spin_unlock(&hugetlb_lock);
        }
        if (count >= nr_huge_pages)
                return nr_huge_pages;
@@ -225,9 +360,11 @@ int hugetlb_report_meminfo(char *buf)
        return sprintf(buf,
                        "HugePages_Total: %5lu\n"
                        "HugePages_Free:  %5lu\n"
+                        "HugePages_Rsvd:  %5lu\n"
                        "Hugepagesize:    %5lu kB\n",
                        nr_huge_pages,
                        free_huge_pages,
+                        reserved_huge_pages,
                        HPAGE_SIZE/1024);
 }
@@ -240,11 +377,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
                nid, free_huge_pages_node[nid]);
 }
-int is_hugepage_mem_enough(size_t size)
-{
-        return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
-}
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {
@@ -374,7 +506,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pte_t *ptep, pte_t pte)
 {
        struct page *old_page, *new_page;
-        int i, avoidcopy;
+        int avoidcopy;
        old_page = pte_page(pte);
@@ -395,9 +527,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
        }
        spin_unlock(&mm->page_table_lock);
-        for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
+        copy_huge_page(new_page, old_page, address);
-                copy_user_highpage(new_page + i, old_page + i,
-                                   address + i*PAGE_SIZE);
        spin_lock(&mm->page_table_lock);
        ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -442,6 +572,7 @@ retry:
                        ret = VM_FAULT_OOM;
                        goto out;
                }
+                clear_huge_page(page, address);
                if (vma->vm_flags & VM_SHARED) {
                        int err;
@@ -496,14 +627,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        pte_t *ptep;
        pte_t entry;
        int ret;
+        static DEFINE_MUTEX(hugetlb_instantiation_mutex);
        ptep = huge_pte_alloc(mm, address);
        if (!ptep)
                return VM_FAULT_OOM;
+        /*
+         * Serialize hugepage allocation and instantiation, so that we don't
+         * get spurious allocation failures if two CPUs race to instantiate
+         * the same page in the page cache.
+         */
+        mutex_lock(&hugetlb_instantiation_mutex);
        entry = *ptep;
-        if (pte_none(entry))
+        if (pte_none(entry)) {
-                return hugetlb_no_page(mm, vma, address, ptep, write_access);
+                ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
+                mutex_unlock(&hugetlb_instantiation_mutex);
+                return ret;
+        }
        ret = VM_FAULT_MINOR;
@@ -513,6 +654,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                if (write_access && !pte_write(entry))
                        ret = hugetlb_cow(mm, vma, address, ptep, entry);
        spin_unlock(&mm->page_table_lock);
+        mutex_unlock(&hugetlb_instantiation_mutex);
        return ret;
 }
@@ -521,10 +663,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        struct page **pages, struct vm_area_struct **vmas,
                        unsigned long *position, int *length, int i)
 {
-        unsigned long vpfn, vaddr = *position;
+        unsigned long pfn_offset;
+        unsigned long vaddr = *position;
        int remainder = *length;
-        vpfn = vaddr/PAGE_SIZE;
        spin_lock(&mm->page_table_lock);
        while (vaddr < vma->vm_end && remainder) {
                pte_t *pte;
@@ -552,19 +694,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        break;
                }
-                if (pages) {
+                pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
-                        page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+                page = pte_page(*pte);
-                        get_page(page);
+same_page:
-                        pages[i] = page;
+                get_page(page);
-                }
+                if (pages)
+                        pages[i] = page + pfn_offset;
                if (vmas)
                        vmas[i] = vma;
                vaddr += PAGE_SIZE;
-                ++vpfn;
+                ++pfn_offset;
                --remainder;
                ++i;
+                if (vaddr < vma->vm_end && remainder &&
+                                pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
+                        /*
+                         * We use pfn_offset to avoid touching the pageframes
+                         * of this compound page.
+                         */
+                        goto same_page;
+                }
        }
        spin_unlock(&mm->page_table_lock);
        *length = remainder;
@@ -572,3 +723,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
        return i;
 }
+/*
+ * Apply @newprot to every populated huge PTE of @vma in [address, end).
+ *
+ * The range is walked in HPAGE_SIZE steps under mm->page_table_lock.  Slots
+ * with no page table entry or with an empty (pte_none) entry are skipped;
+ * every other entry is cleared, rebuilt with the new protection and written
+ * back.  Caches for the range are flushed before the walk and the TLB for
+ * the range is flushed after it.  @address must be below @end.
+ */
+void hugetlb_change_protection(struct vm_area_struct *vma,
+                unsigned long address, unsigned long end, pgprot_t newprot)
+{
+        struct mm_struct *mm = vma->vm_mm;
+        unsigned long cur;
+        BUG_ON(address >= end);
+        flush_cache_range(vma, address, end);
+        spin_lock(&mm->page_table_lock);
+        for (cur = address; cur < end; cur += HPAGE_SIZE) {
+                pte_t *ptep = huge_pte_offset(mm, cur);
+                pte_t pte;
+                if (!ptep || pte_none(*ptep))
+                        continue;
+                pte = huge_ptep_get_and_clear(mm, cur, ptep);
+                pte = pte_mkhuge(pte_modify(pte, newprot));
+                set_huge_pte_at(mm, cur, ptep, pte);
+                lazy_mmu_prot_update(pte);
+        }
+        spin_unlock(&mm->page_table_lock);
+        flush_tlb_range(vma, address, end);
+}
diff --git a/mm/internal.h b/mm/internal.h
index 17256bb2f4ef..d20e3cc4aef0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -8,23 +8,33 @@
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
+#ifndef __MM_INTERNAL_H
+#define __MM_INTERNAL_H
-static inline void set_page_refs(struct page *page, int order)
+#include <linux/mm.h>
+static inline void set_page_count(struct page *page, int v)
+{
+        atomic_set(&page->_count, v);
+}
+/*
+ * Turn a non-refcounted page (->_count == 0) into refcounted with
+ * a count of one.
+ */
+static inline void set_page_refcounted(struct page *page)
 {
-#ifdef CONFIG_MMU
+        BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
+        BUG_ON(atomic_read(&page->_count));
        set_page_count(page, 1);
-#else
+}
-        int i;
-        /*
+static inline void __put_page(struct page *page)
-         * We need to reference all the pages for this order, otherwise if
+{
-         * anyone accesses one of the pages with (get/put) it will be freed.
+        atomic_dec(&page->_count);
-         * - eg: access_process_vm()
-         */
-        for (i = 0; i < (1 << order); i++)
-                set_page_count(page + i, 1);
-#endif /* CONFIG_MMU */
 }
 extern void fastcall __init __free_pages_bootmem(struct page *page,
                                                unsigned int order);
+#endif
diff --git a/mm/memory.c b/mm/memory.c
index 85e80a57db29..8d8f52569f32 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
                anon_vma_unlink(vma);
                unlink_file_vma(vma);
-                if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
+                if (is_vm_hugetlb_page(vma)) {
                        hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
                                floor, next? next->vm_start: ceiling);
                } else {
@@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
                         * Optimization: gather nearby vmas into one call down
                         */
                        while (next && next->vm_start <= vma->vm_end + PMD_SIZE
-                          && !is_hugepage_only_range(vma->vm_mm, next->vm_start,
+                               && !is_vm_hugetlb_page(next)) {
-                                                        HPAGE_SIZE)) {
                                vma = next;
                                next = vma->vm_next;
                                anon_vma_unlink(vma);
@@ -388,7 +387,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
 {
        unsigned long pfn = pte_pfn(pte);
-        if (vma->vm_flags & VM_PFNMAP) {
+        if (unlikely(vma->vm_flags & VM_PFNMAP)) {
                unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
                if (pfn == vma->vm_pgoff + off)
                        return NULL;
@@ -401,8 +400,6 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
         * we should just do "return pfn_to_page(pfn)", but
         * in the meantime we check that we get a valid pfn,
         * and that the resulting page looks ok.
-         *
-         * Remove this test eventually!
         */
        if (unlikely(!pfn_valid(pfn))) {
                print_bad_pte(vma, pte, addr);
@@ -1074,6 +1071,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                        }
                        if (pages) {
                                pages[i] = page;
+                                flush_anon_page(page, start);
                                flush_dcache_page(page);
                        }
                        if (vmas)
@@ -1221,9 +1220,7 @@ out:
 * The page has to be a nice clean _individual_ kernel allocation.
 * If you allocate a compound page, you need to have marked it as
 * such (__GFP_COMP), or manually just split the page up yourself
- * (which is mainly an issue of doing "set_page_count(page, 1)" for
+ * (see split_page()).
- * each sub-page, and then freeing them one by one when you free
- * them rather than freeing it as a compound page).
 *
 * NOTE! Traditionally this was done with "remap_pfn_range()" which
 * took an arbitrary page protection parameter. This doesn't allow
@@ -2357,10 +2354,8 @@ int make_pages_present(unsigned long addr, unsigned long end)
        if (!vma)
                return -1;
        write = (vma->vm_flags & VM_WRITE) != 0;
-        if (addr >= end)
+        BUG_ON(addr >= end);
-                BUG();
+        BUG_ON(end > vma->vm_end);
-        if (end > vma->vm_end)
-                BUG();
        len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
        ret = get_user_pages(current, current->mm, addr,
                        len, write, 0, NULL, NULL);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b21869a39f0b..dec8249e972d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -86,6 +86,7 @@
 #include <linux/swap.h>
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
+#include <linux/migrate.h>
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
@@ -95,11 +96,8 @@
 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
-/* The number of pages to migrate per call to migrate_pages() */
+static struct kmem_cache *policy_cache;
-#define MIGRATE_CHUNK_SIZE 256
+static struct kmem_cache *sn_cache;
-static kmem_cache_t *policy_cache;
-static kmem_cache_t *sn_cache;
 #define PDprintk(fmt...)
@@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
        struct vm_area_struct *first, *vma, *prev;
        if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-                /* Must have swap device for migration */
-                if (nr_swap_pages <= 0)
-                        return ERR_PTR(-ENODEV);
-                /*
+                err = migrate_prep();
-                 * Clear the LRU lists so pages can be isolated.
+                if (err)
-                 * Note that pages may be moved off the LRU after we have
+                        return ERR_PTR(err);
-                 * drained them. Those pages will fail to migrate like other
-                 * pages that may be busy.
-                 */
-                lru_add_drain_all();
        }
        first = find_vma(mm, start);
@@ -431,6 +422,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
        return mpol_check_policy(mode, nodes);
 }
+/*
+ * Update task->flags PF_MEMPOLICY bit: set iff non-default
+ * mempolicy.  Allows more rapid checking of this (combined perhaps
+ * with other PF_* flag bits) on memory allocation hot code paths.
+ *
+ * If called from outside this file, the task 'p' should -only- be
+ * a newly forked child not yet visible on the task list, because
+ * manipulating the task flags of a visible task is not safe.
+ *
+ * The above limitation is why this routine has the funny name
+ * mpol_fix_fork_child_flag().
+ *
+ * It is also safe to call this with a task pointer of current,
+ * which the static wrapper mpol_set_task_struct_flag() does,
+ * for use within this file.
+ */
+void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+        if (p->mempolicy)
+                p->flags |= PF_MEMPOLICY;
+        else
+                p->flags &= ~PF_MEMPOLICY;
+}
+static void mpol_set_task_struct_flag(void)
+{
+        mpol_fix_fork_child_flag(current);
+}
 /* Set the process memory policy */
 long do_set_mempolicy(int mode, nodemask_t *nodes)
 {
@@ -443,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes)
                return PTR_ERR(new);
        mpol_free(current->mempolicy);
        current->mempolicy = new;
+        mpol_set_task_struct_flag();
        if (new && new->policy == MPOL_INTERLEAVE)
                current->il_next = first_node(new->v.nodes);
        return 0;
@@ -550,92 +573,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
        return err;
 }
+#ifdef CONFIG_MIGRATION
 /*
 * page migration
 */
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags)
 {
        /*
         * Avoid migrating a page that is shared with others.
         */
-        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
+        if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
-                if (isolate_lru_page(page))
+                isolate_lru_page(page, pagelist);
-                        list_add_tail(&page->lru, pagelist);
-        }
-}
-/*
- * Migrate the list 'pagelist' of pages to a certain destination.
- *
- * Specify destination with either non-NULL vma or dest_node >= 0
- * Return the number of pages not migrated or error code
- */
-static int migrate_pages_to(struct list_head *pagelist,
-                        struct vm_area_struct *vma, int dest)
-{
-        LIST_HEAD(newlist);
-        LIST_HEAD(moved);
-        LIST_HEAD(failed);
-        int err = 0;
-        unsigned long offset = 0;
-        int nr_pages;
-        struct page *page;
-        struct list_head *p;
-redo:
-        nr_pages = 0;
-        list_for_each(p, pagelist) {
-                if (vma) {
-                        /*
-                         * The address passed to alloc_page_vma is used to
-                         * generate the proper interleave behavior. We fake
-                         * the address here by an increasing offset in order
-                         * to get the proper distribution of pages.
-                         *
-                         * No decision has been made as to which page
-                         * a certain old page is moved to so we cannot
-                         * specify the correct address.
-                         */
-                        page = alloc_page_vma(GFP_HIGHUSER, vma,
-                                        offset + vma->vm_start);
-                        offset += PAGE_SIZE;
-                }
-                else
-                        page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
-                if (!page) {
-                        err = -ENOMEM;
-                        goto out;
-                }
-                list_add_tail(&page->lru, &newlist);
-                nr_pages++;
-                if (nr_pages > MIGRATE_CHUNK_SIZE)
-                        break;
-        }
-        err = migrate_pages(pagelist, &newlist, &moved, &failed);
-        putback_lru_pages(&moved);      /* Call release pages instead ?? */
-        if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
-                goto redo;
-out:
-        /* Return leftover allocated pages */
-        while (!list_empty(&newlist)) {
-                page = list_entry(newlist.next, struct page, lru);
-                list_del(&page->lru);
-                __free_page(page);
-        }
-        list_splice(&failed, pagelist);
-        if (err < 0)
-                return err;
-        /* Calculate number of leftover pages */
-        nr_pages = 0;
-        list_for_each(p, pagelist)
-                nr_pages++;
-        return nr_pages;
 }
 /*
@@ -742,8 +691,23 @@ int do_migrate_pages(struct mm_struct *mm,
        if (err < 0)
                return err;
        return busy;
 }
+#else
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+                                unsigned long flags)
+{
+}
+int do_migrate_pages(struct mm_struct *mm,
+        const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+        return -ENOSYS;
+}
+#endif
 long do_mbind(unsigned long start, unsigned long len,
                unsigned long mode, nodemask_t *nmask, unsigned long flags)
 {
@@ -808,6 +772,7 @@ long do_mbind(unsigned long start, unsigned long len,
                if (!err && nr_failed && (flags & MPOL_MF_STRICT))
                        err = -EIO;
        }
        if (!list_empty(&pagelist))
                putback_lru_pages(&pagelist);
@@ -947,7 +912,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
        /*
         * Check if this process has the right to modify the specified
         * process. The right exists if the process has administrative
-         * capabilities, superuser priviledges or the same
+         * capabilities, superuser privileges or the same
         * userid as the target process.
         */
        if ((current->euid != task->suid) && (current->euid != task->uid) &&
diff --git a/mm/mempool.c b/mm/mempool.c
index 1a99b80480d3..fe6e05289cc5 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -183,8 +183,8 @@ EXPORT_SYMBOL(mempool_resize);
 */
 void mempool_destroy(mempool_t *pool)
 {
-        if (pool->curr_nr != pool->min_nr)
+        /* Check for outstanding elements */
-                BUG();          /* There were outstanding elements */
+        BUG_ON(pool->curr_nr != pool->min_nr);
        free_pool(pool);
 }
 EXPORT_SYMBOL(mempool_destroy);
@@ -278,14 +278,56 @@ EXPORT_SYMBOL(mempool_free);
 */
 void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
 {
-        kmem_cache_t *mem = (kmem_cache_t *) pool_data;
+        struct kmem_cache *mem = pool_data;
        return kmem_cache_alloc(mem, gfp_mask);
 }
 EXPORT_SYMBOL(mempool_alloc_slab);
 void mempool_free_slab(void *element, void *pool_data)
 {
-        kmem_cache_t *mem = (kmem_cache_t *) pool_data;
+        struct kmem_cache *mem = pool_data;
        kmem_cache_free(mem, element);
 }
 EXPORT_SYMBOL(mempool_free_slab);
+/*
+ * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory
+ * specified by pool_data
+ */
+void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
+{
+        size_t size = (size_t)(long)pool_data;
+        return kmalloc(size, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_kmalloc);
+void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
+{
+        size_t size = (size_t) pool_data;
+        return kzalloc(size, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_kzalloc);
+void mempool_kfree(void *element, void *pool_data)
+{
+        kfree(element);
+}
+EXPORT_SYMBOL(mempool_kfree);
+/*
+ * A simple mempool-backed page allocator that allocates pages
+ * of the order specified by pool_data.
+ */
+void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data)
+{
+        int order = (int)(long)pool_data;
+        return alloc_pages(gfp_mask, order);
+}
+EXPORT_SYMBOL(mempool_alloc_pages);
+void mempool_free_pages(void *element, void *pool_data)
+{
+        int order = (int)(long)pool_data;
+        __free_pages(element, order);
+}
+EXPORT_SYMBOL(mempool_free_pages);
diff --git a/mm/migrate.c b/mm/migrate.c
new file mode 100644
index 000000000000..09f6e4aa87fc
--- /dev/null
+++ b/mm/migrate.c
@@ -0,0 +1,655 @@
+/*
+ * Memory Migration functionality - linux/mm/migrate.c
+ *
+ * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
+ *
+ * Page migration was first developed in the context of the memory hotplug
+ * project. The main authors of the migration code are:
+ *
+ * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
+ * Hirokazu Takahashi <taka@valinux.co.jp>
+ * Dave Hansen <haveblue@us.ibm.com>
+ * Christoph Lameter <clameter@sgi.com>
+ */
+#include <linux/migrate.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>  /* for try_to_release_page(),
+                                        buffer_heads_over_limit */
+#include <linux/mm_inline.h>
+#include <linux/pagevec.h>
+#include <linux/rmap.h>
+#include <linux/topology.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/swapops.h>
+#include "internal.h"
+#include "internal.h"
+/* The maximum number of pages to take off the LRU for migration */
+#define MIGRATE_CHUNK_SIZE 256
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+/*
+ * Isolate one page from the LRU lists. If successful put it onto
+ * the indicated list with elevated page count.
+ *
+ * Result:
+ *  -EBUSY: page not on LRU list
+ *  0: page removed from LRU list and added to the specified list.
+ */
+int isolate_lru_page(struct page *page, struct list_head *pagelist)
+{
+        int ret = -EBUSY;
+        if (PageLRU(page)) {
+                struct zone *zone = page_zone(page);
+                spin_lock_irq(&zone->lru_lock);
+                if (PageLRU(page)) {
+                        ret = 0;
+                        get_page(page);
+                        ClearPageLRU(page);
+                        if (PageActive(page))
+                                del_page_from_active_list(zone, page);
+                        else
+                                del_page_from_inactive_list(zone, page);
+                        list_add_tail(&page->lru, pagelist);
+                }
+                spin_unlock_irq(&zone->lru_lock);
+        }
+        return ret;
+}
+/*
+ * migrate_prep() needs to be called before we start compiling the list of
+ * pages to be migrated using isolate_lru_page(): it checks that swap is
+ * available and calls lru_add_drain_all() so that pages can be isolated.
+ */
+int migrate_prep(void)
+{
+        /* Must have swap device for migration */
+        if (nr_swap_pages <= 0)
+                return -ENODEV;
+        /*
+         * Clear the LRU lists so pages can be isolated.
+         * Note that pages may be moved off the LRU after we have
+         * drained them. Those pages will fail to migrate like other
+         * pages that may be busy.
+         */
+        lru_add_drain_all();
+        return 0;
+}
+/*
+ * Remove a page from the list it is currently linked on via page->lru and
+ * add it back to the LRU: to the active LRU if PG_active is set, otherwise
+ * through lru_cache_add(). Finally drops one page reference with
+ * put_page().
+ */
+static inline void move_to_lru(struct page *page)
+{
+        list_del(&page->lru);
+        if (PageActive(page)) {
+                /*
+                 * lru_cache_add_active checks that
+                 * the PG_active bit is off.
+                 */
+                ClearPageActive(page);
+                lru_cache_add_active(page);
+        } else {
+                lru_cache_add(page);
+        }
+        put_page(page);
+}
+/*
+ * Add isolated pages on the list back to the LRU.
+ *
+ * returns the number of pages put back.
+ */
+int putback_lru_pages(struct list_head *l)
+{
+        struct page *page;
+        struct page *page2;
+        int count = 0;
+        list_for_each_entry_safe(page, page2, l, lru) {
+                move_to_lru(page);
+                count++;
+        }
+        return count;
+}
+/*
+ * Non migratable page
+ */
+int fail_migrate_page(struct page *newpage, struct page *page)
+{
+        return -EIO;
+}
+EXPORT_SYMBOL(fail_migrate_page);
+/*
+ * swapout a single page
+ * page is locked upon entry, unlocked on exit
+ *
+ * Steps, in order: unmap the page if it is mapped and has a mapping, write
+ * it out with pageout() if it is dirty, release private buffers if any,
+ * and finally detach it from its mapping with remove_mapping().
+ *
+ * Returns 0 if the page was removed from its mapping, -EAGAIN otherwise
+ * (the caller may retry on a later pass).
+ *
+ * NOTE(review): on PAGE_SUCCESS this function returns without calling
+ * unlock_page() itself; presumably pageout() has handed the page lock over
+ * to the writeback path -- confirm against pageout().
+ *
+ * This helper is file-local (static), so it is deliberately not exported
+ * with EXPORT_SYMBOL().
+ */
+static int swap_page(struct page *page)
+{
+        struct address_space *mapping = page_mapping(page);
+        if (page_mapped(page) && mapping)
+                if (try_to_unmap(page, 1) != SWAP_SUCCESS)
+                        goto unlock_retry;
+        if (PageDirty(page)) {
+                /* Page is dirty, try to write it out here */
+                switch(pageout(page, mapping)) {
+                case PAGE_KEEP:
+                case PAGE_ACTIVATE:
+                        goto unlock_retry;
+                case PAGE_SUCCESS:
+                        goto retry;
+                case PAGE_CLEAN:
+                        ; /* try to free the page below */
+                }
+        }
+        if (PagePrivate(page)) {
+                if (!try_to_release_page(page, GFP_KERNEL) ||
+                    (!mapping && page_count(page) == 1))
+                        goto unlock_retry;
+        }
+        if (remove_mapping(mapping, page)) {
+                /* Success */
+                unlock_page(page);
+                return 0;
+        }
+unlock_retry:
+        unlock_page(page);
+retry:
+        return -EAGAIN;
+}
+/*
+ * Remove references for a page and establish the new page with the correct
+ * basic settings to be able to stop accesses to the page.
+ *
+ * nr_refs is the number of page references the caller expects to exist in
+ * addition to one per mapping (migrate_page() passes 2,
+ * buffer_migrate_page() passes 3).
+ *
+ * Returns:
+ *  0:       newpage has replaced page in the mapping's radix tree.
+ *  -EAGAIN: the page is in use, has no mapping or changed under us;
+ *           the caller may retry.
+ *  -EPERM:  try_to_unmap() reported SWAP_FAIL; permanent failure.
+ */
+int migrate_page_remove_references(struct page *newpage,
+                                struct page *page, int nr_refs)
+{
+        struct address_space *mapping = page_mapping(page);
+        struct page **radix_pointer;
+        /*
+         * Avoid doing any of the following work if the page count
+         * indicates that the page is in use or truncate has removed
+         * the page.
+         */
+        if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
+                return -EAGAIN;
+        /*
+         * Establish swap ptes for anonymous pages or destroy pte
+         * maps for files.
+         *
+         * In order to reestablish file backed mappings the fault handlers
+         * will take the radix tree_lock which may then be used to stop
+         * processes from accessing this page until the new page is ready.
+         *
+         * A process accessing via a swap pte (an anonymous page) will take a
+         * page_lock on the old page which will block the process until the
+         * migration attempt is complete. At that time the PageSwapCache bit
+         * will be examined. If the page was migrated then the PageSwapCache
+         * bit will be clear and the operation to retrieve the page will be
+         * retried which will find the new page in the radix tree. Then a new
+         * direct mapping may be generated based on the radix tree contents.
+         *
+         * If the page was not migrated then the PageSwapCache bit
+         * is still set and the operation may continue.
+         */
+        if (try_to_unmap(page, 1) == SWAP_FAIL)
+                /* A vma has VM_LOCKED set -> permanent failure */
+                return -EPERM;
+        /*
+         * Give up if we were unable to remove all mappings.
+         */
+        if (page_mapcount(page))
+                return -EAGAIN;
+        write_lock_irq(&mapping->tree_lock);
+        radix_pointer = (struct page **)radix_tree_lookup_slot(
+                                                &mapping->page_tree,
+                                                page_index(page));
+        /*
+         * Recheck under the tree lock. A missing slot, an unexpected
+         * reference or a replaced page are the same kind of condition as
+         * the unlocked check above, so report -EAGAIN as well; a positive
+         * return value would be counted as a permanent failure by
+         * migrate_pages().
+         */
+        if (!radix_pointer || !page_mapping(page) ||
+                        page_count(page) != nr_refs ||
+                        *radix_pointer != page) {
+                write_unlock_irq(&mapping->tree_lock);
+                return -EAGAIN;
+        }
+        /*
+         * Now we know that no one else is looking at the page.
+         *
+         * Certain minimal information about a page must be available
+         * in order for other subsystems to properly handle the page if they
+         * find it through the radix tree update before we are finished
+         * copying the page.
+         */
+        get_page(newpage);
+        newpage->index = page->index;
+        newpage->mapping = page->mapping;
+        if (PageSwapCache(page)) {
+                SetPageSwapCache(newpage);
+                set_page_private(newpage, page_private(page));
+        }
+        /* Switch the radix tree slot over and drop the tree's old reference */
+        *radix_pointer = newpage;
+        __put_page(page);
+        write_unlock_irq(&mapping->tree_lock);
+        return 0;
+}
+EXPORT_SYMBOL(migrate_page_remove_references);
+/*
+ * Copy the page to its new location
+ */
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+        copy_highpage(newpage, page);
+        if (PageError(page))
+                SetPageError(newpage);
+        if (PageReferenced(page))
+                SetPageReferenced(newpage);
+        if (PageUptodate(page))
+                SetPageUptodate(newpage);
+        if (PageActive(page))
+                SetPageActive(newpage);
+        if (PageChecked(page))
+                SetPageChecked(newpage);
+        if (PageMappedToDisk(page))
+                SetPageMappedToDisk(newpage);
+        if (PageDirty(page)) {
+                clear_page_dirty_for_io(page);
+                set_page_dirty(newpage);
+        }
+        ClearPageSwapCache(page);
+        ClearPageActive(page);
+        ClearPagePrivate(page);
+        set_page_private(page, 0);
+        page->mapping = NULL;
+        /*
+         * If any waiters have accumulated on the new page then
+         * wake them up.
+         */
+        if (PageWriteback(newpage))
+                end_page_writeback(newpage);
+}
+EXPORT_SYMBOL(migrate_page_copy);
+/*
+ * Common logic to directly migrate a single page suitable for
+ * pages that do not use PagePrivate.
+ *
+ * Pages are locked upon entry and exit.
+ */
+int migrate_page(struct page *newpage, struct page *page)
+{
+        int rc;
+        BUG_ON(PageWriteback(page));    /* Writeback must be complete */
+        rc = migrate_page_remove_references(newpage, page, 2);
+        if (rc)
+                return rc;
+        migrate_page_copy(newpage, page);
+        /*
+         * Remove auxiliary swap entries and replace
+         * them with real ptes.
+         *
+         * Note that a real pte entry will allow processes that are not
+         * waiting on the page lock to use the new page via the page tables
+         * before the new page is unlocked.
+         */
+        remove_from_swap(newpage);
+        return 0;
+}
+EXPORT_SYMBOL(migrate_page);
+/*
+ * migrate_pages
+ *
+ * Two lists are passed to this function. The first list
+ * contains the pages isolated from the LRU to be migrated.
+ * The second list contains new pages that the pages isolated
+ * can be moved to. If the second list is NULL then all
+ * pages are swapped out.
+ *
+ * The function returns after 10 attempts or if no pages
+ * are movable anymore because to has become empty
+ * or no retryable pages exist anymore.
+ *
+ * Return: Number of pages not migrated when "to" ran empty.
+ */
+int migrate_pages(struct list_head *from, struct list_head *to,
+                  struct list_head *moved, struct list_head *failed)
+{
+        int retry;
+        int nr_failed = 0;
+        int pass = 0;
+        struct page *page;
+        struct page *page2;
+        int swapwrite = current->flags & PF_SWAPWRITE;
+        int rc;
+        if (!swapwrite)
+                current->flags |= PF_SWAPWRITE;
+redo:
+        retry = 0;
+        list_for_each_entry_safe(page, page2, from, lru) {
+                struct page *newpage = NULL;
+                struct address_space *mapping;
+                cond_resched();
+                rc = 0;
+                if (page_count(page) == 1)
+                        /* page was freed from under us. So we are done. */
+                        goto next;
+                if (to && list_empty(to))
+                        break;
+                /*
+                 * Skip locked pages during the first two passes to give the
+                 * functions holding the lock time to release the page. Later we
+                 * use lock_page() to have a higher chance of acquiring the
+                 * lock.
+                 */
+                rc = -EAGAIN;
+                if (pass > 2)
+                        lock_page(page);
+                else
+                        if (TestSetPageLocked(page))
+                                goto next;
+                /*
+                 * Only wait on writeback if we have already done a pass where
+                 * we may have triggered writeouts for lots of pages.
+                 */
+                if (pass > 0) {
+                        wait_on_page_writeback(page);
+                } else {
+                        if (PageWriteback(page))
+                                goto unlock_page;
+                }
+                /*
+                 * Anonymous pages must have swap cache references otherwise
+                 * the information contained in the page maps cannot be
+                 * preserved.
+                 */
+                if (PageAnon(page) && !PageSwapCache(page)) {
+                        if (!add_to_swap(page, GFP_KERNEL)) {
+                                rc = -ENOMEM;
+                                goto unlock_page;
+                        }
+                }
+                if (!to) {
+                        rc = swap_page(page);
+                        goto next;
+                }
+                newpage = lru_to_page(to);
+                lock_page(newpage);
+                /*
+                 * Pages are properly locked and writeback is complete.
+                 * Try to migrate the page.
+                 */
+                mapping = page_mapping(page);
+                if (!mapping)
+                        goto unlock_both;
+                if (mapping->a_ops->migratepage) {
+                        /*
+                         * Most pages have a mapping and most filesystems
+                         * should provide a migration function. Anonymous
+                         * pages are part of swap space which also has its
+                         * own migration function. This is the most common
+                         * path for page migration.
+                         */
+                        rc = mapping->a_ops->migratepage(newpage, page);
+                        goto unlock_both;
+                }
+                /*
+                 * Default handling if a filesystem does not provide
+                 * a migration function. We can only migrate clean
+                 * pages so try to write out any dirty pages first.
+                 */
+                if (PageDirty(page)) {
+                        switch (pageout(page, mapping)) {
+                        case PAGE_KEEP:
+                        case PAGE_ACTIVATE:
+                                goto unlock_both;
+                        case PAGE_SUCCESS:
+                                unlock_page(newpage);
+                                goto next;
+                        case PAGE_CLEAN:
+                                ; /* try to migrate the page below */
+                        }
+                }
+                /*
+                 * Buffers are managed in a filesystem specific way.
+                 * We must have no buffers or drop them.
+                 */
+                if (!page_has_buffers(page) ||
+                    try_to_release_page(page, GFP_KERNEL)) {
+                        rc = migrate_page(newpage, page);
+                        goto unlock_both;
+                }
+                /*
+                 * On early passes with mapped pages simply
+                 * retry. There may be a lock held for some
+                 * buffers that may go away. Later
+                 * swap them out.
+                 */
+                if (pass > 4) {
+                        /*
+                         * Persistently unable to drop buffers..... As a
+                         * measure of last resort we fall back to
+                         * swap_page().
+                         */
+                        unlock_page(newpage);
+                        newpage = NULL;
+                        rc = swap_page(page);
+                        goto next;
+                }
+unlock_both:
+                unlock_page(newpage);
+unlock_page:
+                unlock_page(page);
+next:
+                if (rc == -EAGAIN) {
+                        retry++;
+                } else if (rc) {
+                        /* Permanent failure */
+                        list_move(&page->lru, failed);
+                        nr_failed++;
+                } else {
+                        if (newpage) {
+                                /* Successful migration. Return page to LRU */
+                                move_to_lru(newpage);
+                        }
+                        list_move(&page->lru, moved);
+                }
+        }
+        if (retry && pass++ < 10)
+                goto redo;
+        if (!swapwrite)
+                current->flags &= ~PF_SWAPWRITE;
+        return nr_failed + retry;
+}
+/*
+ * Migration function for pages with buffers. This function can only be used
+ * if the underlying filesystem guarantees that no other references to "page"
+ * exist.
+ */
+int buffer_migrate_page(struct page *newpage, struct page *page)
+{
+        struct address_space *mapping = page->mapping;
+        struct buffer_head *bh, *head;
+        int rc;
+        if (!mapping)
+                return -EAGAIN;
+        if (!page_has_buffers(page))
+                return migrate_page(newpage, page);
+        head = page_buffers(page);
+        rc = migrate_page_remove_references(newpage, page, 3);
+        if (rc)
+                return rc;
+        bh = head;
+        do {
+                get_bh(bh);
+                lock_buffer(bh);
+                bh = bh->b_this_page;
+        } while (bh != head);
+        ClearPagePrivate(page);
+        set_page_private(newpage, page_private(page));
+        set_page_private(page, 0);
+        put_page(page);
+        get_page(newpage);
+        bh = head;
+        do {
+                set_bh_page(bh, newpage, bh_offset(bh));
+                bh = bh->b_this_page;
+        } while (bh != head);
+        SetPagePrivate(newpage);
+        migrate_page_copy(newpage, page);
+        bh = head;
+        do {
+                unlock_buffer(bh);
+                put_bh(bh);
+                bh = bh->b_this_page;
+        } while (bh != head);
+        return 0;
+}
+EXPORT_SYMBOL(buffer_migrate_page);
+/*
+ * Migrate the list 'pagelist' of pages to a certain destination.
+ *
+ * Specify destination with either non-NULL vma or dest_node >= 0
+ * Return the number of pages not migrated or error code
+ */
+int migrate_pages_to(struct list_head *pagelist,
+                        struct vm_area_struct *vma, int dest)
+{
+        LIST_HEAD(newlist);
+        LIST_HEAD(moved);
+        LIST_HEAD(failed);
+        int err = 0;
+        unsigned long offset = 0;
+        int nr_pages;
+        struct page *page;
+        struct list_head *p;
+redo:
+        nr_pages = 0;
+        list_for_each(p, pagelist) {
+                if (vma) {
+                        /*
+                         * The address passed to alloc_page_vma is used to
+                         * generate the proper interleave behavior. We fake
+                         * the address here by an increasing offset in order
+                         * to get the proper distribution of pages.
+                         *
+                         * No decision has been made as to which page
+                         * a certain old page is moved to so we cannot
+                         * specify the correct address.
+                         */
+                        page = alloc_page_vma(GFP_HIGHUSER, vma,
+                                        offset + vma->vm_start);
+                        offset += PAGE_SIZE;
+                }
+                else
+                        page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
+                if (!page) {
+                        err = -ENOMEM;
+                        goto out;
+                }
+                list_add_tail(&page->lru, &newlist);
+                nr_pages++;
+                if (nr_pages > MIGRATE_CHUNK_SIZE)
+                        break;
+        }
+        err = migrate_pages(pagelist, &newlist, &moved, &failed);
+        putback_lru_pages(&moved);      /* Call release pages instead ?? */
+        if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
+                goto redo;
+out:
+        /* Return leftover allocated pages */
+        while (!list_empty(&newlist)) {
+                page = list_entry(newlist.next, struct page, lru);
+                list_del(&page->lru);
+                __free_page(page);
+        }
+        list_splice(&failed, pagelist);
+        if (err < 0)
+                return err;
+        /* Calculate number of leftover pages */
+        nr_pages = 0;
+        list_for_each(p, pagelist)
+                nr_pages++;
+        return nr_pages;
+}
diff --git a/mm/mmap.c b/mm/mmap.c
index 47556d2b3e90..4f5b5709136a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -612,7 +612,7 @@ again:			remove_next = 1 + (end > next->vm_end);
 * If the vma has a ->close operation then the driver probably needs to release
 * per-vma resources, so we don't attempt to merge those.
 */
-#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
+#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
                        struct file *file, unsigned long vm_flags)
@@ -845,14 +845,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
        const unsigned long stack_flags
                = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
-#ifdef CONFIG_HUGETLB
-        if (flags & VM_HUGETLB) {
-                if (!(flags & VM_DONTCOPY))
-                        mm->shared_vm += pages;
-                return;
-        }
-#endif /* CONFIG_HUGETLB */
        if (file) {
                mm->shared_vm += pages;
                if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
@@ -1048,12 +1040,11 @@ munmap_back:
         * specific mapper. the address has already been validated, but
         * not unmapped, but the maps are removed from the list.
         */
-        vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+        vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
        if (!vma) {
                error = -ENOMEM;
                goto unacct_error;
        }
-        memset(vma, 0, sizeof(*vma));
        vma->vm_mm = mm;
        vma->vm_start = addr;
@@ -1904,12 +1895,11 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
        /*
         * create a vma struct for an anonymous mapping
         */
-        vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+        vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
        if (!vma) {
                vm_unacct_memory(len >> PAGE_SHIFT);
                return -ENOMEM;
        }
-        memset(vma, 0, sizeof(*vma));
        vma->vm_mm = mm;
        vma->vm_start = addr;
diff --git a/mm/mmzone.c b/mm/mmzone.c
new file mode 100644
index 000000000000..b022370e612e
--- /dev/null
+++ b/mm/mmzone.c
@@ -0,0 +1,50 @@
+/*
+ * linux/mm/mmzone.c
+ *
+ * management codes for pgdats and zones.
+ */
+#include <linux/config.h>
+#include <linux/stddef.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+/*
+ * first_online_pgdat - return NODE_DATA() for first_online_node.
+ *
+ * Starting point for stepping through pgdats with next_online_pgdat().
+ */
+struct pglist_data *first_online_pgdat(void)
+{
+        return NODE_DATA(first_online_node);
+}
+EXPORT_SYMBOL(first_online_pgdat);
+/*
+ * next_online_pgdat - return the pgdat that follows @pgdat.
+ *
+ * Feeds @pgdat->node_id to next_online_node().  A result of MAX_NUMNODES
+ * is treated as the end of the walk and yields NULL; any other node id is
+ * translated with NODE_DATA().
+ */
+struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
+{
+        int nid = next_online_node(pgdat->node_id);
+        if (nid == MAX_NUMNODES)
+                return NULL;
+        return NODE_DATA(nid);
+}
+EXPORT_SYMBOL(next_online_pgdat);
+/*
+ * next_zone - helper magic for for_each_zone()
+ *
+ * Steps to the following entry of the current pgdat's node_zones[] array.
+ * Once @zone is the last entry (index MAX_NR_ZONES - 1) it continues with
+ * node_zones[0] of next_online_pgdat(), and returns NULL when that call
+ * reports no further pgdat.
+ */
+struct zone *next_zone(struct zone *zone)
+{
+        pg_data_t *pgdat = zone->zone_pgdat;
+        if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
+                zone++;
+        else {
+                pgdat = next_online_pgdat(pgdat);
+                if (pgdat)
+                        zone = pgdat->node_zones;
+                else
+                        zone = NULL;
+        }
+        return zone;
+}
+EXPORT_SYMBOL(next_zone);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 653b8571c1ed..4c14d4289b61 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -124,7 +124,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
         * a MAP_NORESERVE private mapping to writable will now reserve.
         */
        if (newflags & VM_WRITE) {
-                if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
+                if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
                        charged = nrpages;
                        if (security_vm_enough_memory(charged))
                                return -ENOMEM;
@@ -166,7 +166,10 @@ success:
         */
        vma->vm_flags = newflags;
        vma->vm_page_prot = newprot;
-        change_protection(vma, start, end, newprot);
+        if (is_vm_hugetlb_page(vma))
+                hugetlb_change_protection(vma, start, end, newprot);
+        else
+                change_protection(vma, start, end, newprot);
        vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
        vm_stat_account(mm, newflags, vma->vm_file, nrpages);
        return 0;
@@ -240,11 +243,6 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
                /* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
-                if (is_vm_hugetlb_page(vma)) {
-                        error = -EACCES;
-                        goto out;
-                }
                newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
                /* newflags >> 4 shift VM_MAY% in place of VM_% */
diff --git a/mm/msync.c b/mm/msync.c
index 3563a56e1a51..bc6c95376366 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -9,20 +9,24 @@
 */
 #include <linux/slab.h>
 #include <linux/pagemap.h>
+#include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/hugetlb.h>
+#include <linux/writeback.h>
+#include <linux/file.h>
 #include <linux/syscalls.h>
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
-static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr, unsigned long end)
 {
        pte_t *pte;
        spinlock_t *ptl;
        int progress = 0;
+        unsigned long ret = 0;
 again:
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -43,58 +47,64 @@ again:
                if (!page)
                        continue;
                if (ptep_clear_flush_dirty(vma, addr, pte) ||
-                    page_test_and_clear_dirty(page))
+                                page_test_and_clear_dirty(page))
-                        set_page_dirty(page);
+                        ret += set_page_dirty(page);
                progress += 3;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap_unlock(pte - 1, ptl);
        cond_resched();
        if (addr != end)
                goto again;
+        return ret;
 }
+/*
+ * Walk the pmd entries under @pud that cover [addr, end).  Entries for
+ * which pmd_none_or_clear_bad() is true are skipped; for the rest the
+ * value returned by msync_pte_range() is accumulated and handed back.
+ */
-static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
-                                unsigned long addr, unsigned long end)
+                        pud_t *pud, unsigned long addr, unsigned long end)
 {
        pmd_t *pmd;
        unsigned long next;
+        unsigned long ret = 0;
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
-                msync_pte_range(vma, pmd, addr, next);
+                ret += msync_pte_range(vma, pmd, addr, next);
        } while (pmd++, addr = next, addr != end);
+        return ret;
 }
+/*
+ * Walk the pud entries under @pgd that cover [addr, end).  Entries for
+ * which pud_none_or_clear_bad() is true are skipped; for the rest the
+ * value returned by msync_pmd_range() is accumulated and handed back.
+ */
-static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
-                                unsigned long addr, unsigned long end)
+                        pgd_t *pgd, unsigned long addr, unsigned long end)
 {
        pud_t *pud;
        unsigned long next;
+        unsigned long ret = 0;
        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
-                msync_pmd_range(vma, pud, addr, next);
+                ret += msync_pmd_range(vma, pud, addr, next);
        } while (pud++, addr = next, addr != end);
+        return ret;
 }
-static void msync_page_range(struct vm_area_struct *vma,
+static unsigned long msync_page_range(struct vm_area_struct *vma,
                                unsigned long addr, unsigned long end)
 {
        pgd_t *pgd;
        unsigned long next;
+        unsigned long ret = 0;
        /* For hugepages we can't go walking the page table normally,
         * but that's ok, hugetlbfs is memory based, so we don't need
         * to do anything more on an msync().
         */
        if (vma->vm_flags & VM_HUGETLB)
-                return;
+                return 0;
        BUG_ON(addr >= end);
        pgd = pgd_offset(vma->vm_mm, addr);
@@ -103,8 +113,9 @@ static void msync_page_range(struct vm_area_struct *vma,
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
-                msync_pud_range(vma, pgd, addr, next);
+                ret += msync_pud_range(vma, pgd, addr, next);
        } while (pgd++, addr = next, addr != end);
+        return ret;
 }
 /*
@@ -115,53 +126,31 @@ static void msync_page_range(struct vm_area_struct *vma,
 * write out the dirty pages and wait on the writeout and check the result.
 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
 * async writeout immediately.
- * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to
+ * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
 * applications.
 */
-static int msync_interval(struct vm_area_struct *vma,
+static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
-                        unsigned long addr, unsigned long end, int flags)
+                        unsigned long end, int flags,
+                        unsigned long *nr_pages_dirtied)
 {
+        /*
+         * Returns -EBUSY for MS_INVALIDATE on a VM_LOCKED vma, 0 otherwise.
+         * No writeout is started here: for a shared, file-backed vma the
+         * range is walked with msync_page_range() and its return value is
+         * stored in *nr_pages_dirtied.  For any other vma
+         * *nr_pages_dirtied is left untouched, so the caller must
+         * initialise it.
+         */
-        int ret = 0;
        struct file *file = vma->vm_file;
        if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
                return -EBUSY;
-        if (file && (vma->vm_flags & VM_SHARED)) {
+        if (file && (vma->vm_flags & VM_SHARED))
-                msync_page_range(vma, addr, end);
+                *nr_pages_dirtied = msync_page_range(vma, addr, end);
+        return 0;
-                if (flags & MS_SYNC) {
-                        struct address_space *mapping = file->f_mapping;
-                        int err;
-                        ret = filemap_fdatawrite(mapping);
-                        if (file->f_op && file->f_op->fsync) {
-                                /*
-                                 * We don't take i_mutex here because mmap_sem
-                                 * is already held.
-                                 */
-                                err = file->f_op->fsync(file,file->f_dentry,1);
-                                if (err && !ret)
-                                        ret = err;
-                        }
-                        err = filemap_fdatawait(mapping);
-                        if (!ret)
-                                ret = err;
-                }
-        }
-        return ret;
 }
 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 {
        unsigned long end;
        struct vm_area_struct *vma;
-        int unmapped_error, error = -EINVAL;
+        int unmapped_error = 0;
+        int error = -EINVAL;
-        if (flags & MS_SYNC)
+        int done = 0;
-                current->flags |= PF_SYNCWRITE;
-        down_read(&current->mm->mmap_sem);
        if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
                goto out;
        if (start & ~PAGE_MASK)
@@ -180,13 +169,18 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
         * If the interval [start,end) covers some unmapped address ranges,
         * just ignore them, but return -ENOMEM at the end.
         */
+        down_read(&current->mm->mmap_sem);
+        if (flags & MS_SYNC)
+                current->flags |= PF_SYNCWRITE;
        vma = find_vma(current->mm, start);
-        unmapped_error = 0;
+        if (!vma) {
-        for (;;) {
-                /* Still start < end. */
                error = -ENOMEM;
-                if (!vma)
+                goto out_unlock;
-                        goto out;
+        }
+        do {
+                unsigned long nr_pages_dirtied = 0;
+                struct file *file;
                /* Here start < vma->vm_end. */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
@@ -195,22 +189,47 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
                /* Here vma->vm_start <= start < vma->vm_end. */
                if (end <= vma->vm_end) {
                        if (start < end) {
-                                error = msync_interval(vma, start, end, flags);
+                                error = msync_interval(vma, start, end, flags,
+                                                        &nr_pages_dirtied);
                                if (error)
-                                        goto out;
+                                        goto out_unlock;
                        }
                        error = unmapped_error;
-                        goto out;
+                        done = 1;
+                } else {
+                        /* Here vma->vm_start <= start < vma->vm_end < end. */
+                        error = msync_interval(vma, start, vma->vm_end, flags,
+                                                &nr_pages_dirtied);
+                        if (error)
+                                goto out_unlock;
                }
-                /* Here vma->vm_start <= start < vma->vm_end < end. */
+                file = vma->vm_file;
-                error = msync_interval(vma, start, vma->vm_end, flags);
-                if (error)
-                        goto out;
                start = vma->vm_end;
-                vma = vma->vm_next;
+                if ((flags & MS_ASYNC) && file && nr_pages_dirtied) {
-        }
+                        get_file(file);
-out:
+                        up_read(&current->mm->mmap_sem);
-        up_read(&current->mm->mmap_sem);
+                        balance_dirty_pages_ratelimited_nr(file->f_mapping,
+                                                        nr_pages_dirtied);
+                        fput(file);
+                        down_read(&current->mm->mmap_sem);
+                        vma = find_vma(current->mm, start);
+                } else if ((flags & MS_SYNC) && file &&
+                                (vma->vm_flags & VM_SHARED)) {
+                        get_file(file);
+                        up_read(&current->mm->mmap_sem);
+                        error = do_fsync(file, 0);
+                        fput(file);
+                        down_read(&current->mm->mmap_sem);
+                        if (error)
+                                goto out_unlock;
+                        vma = find_vma(current->mm, start);
+                } else {
+                        vma = vma->vm_next;
+                }
+        } while (vma && !done);
+out_unlock:
        current->flags &= ~PF_SYNCWRITE;
+        up_read(&current->mm->mmap_sem);
+out:
        return error;
 }
diff --git a/mm/nommu.c b/mm/nommu.c
index 4951f4786f28..db45efac17cc 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -159,7 +159,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
        /*
         * kmalloc doesn't like __GFP_HIGHMEM for some reason
         */
-        return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM);
+        return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
 }
 struct page * vmalloc_to_page(void *addr)
@@ -623,7 +623,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
         * - note that this may not return a page-aligned address if the object
         *   we're allocating is smaller than a page
         */
-        base = kmalloc(len, GFP_KERNEL);
+        base = kmalloc(len, GFP_KERNEL|__GFP_COMP);
        if (!base)
                goto enomem;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 945559fb63d2..893d7677579e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -75,12 +75,12 @@ int vm_dirty_ratio = 40;
 * The interval between `kupdate'-style writebacks, in centiseconds
 * (hundredths of a second)
 */
-int dirty_writeback_centisecs = 5 * 100;
+int dirty_writeback_interval = 5 * HZ;
 /*
 * The longest number of centiseconds for which data is allowed to remain dirty
 */
-int dirty_expire_centisecs = 30 * 100;
+int dirty_expire_interval = 30 * HZ;
 /*
 * Flag that makes the machine dump writes/reads and block dirtyings.
@@ -88,7 +88,8 @@ int dirty_expire_centisecs = 30 * 100;
 int block_dump;
 /*
- * Flag that puts the machine in "laptop mode".
+ * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
+ * a full sync is triggered after this time elapses without any disk activity.
 */
 int laptop_mode;
@@ -255,8 +256,9 @@ static void balance_dirty_pages(struct address_space *mapping)
 }
 /**
- * balance_dirty_pages_ratelimited - balance dirty memory state
+ * balance_dirty_pages_ratelimited_nr - balance dirty memory state
 * @mapping: address_space which was dirtied
+ * @nr_pages: number of pages which the caller has just dirtied
 *
 * Processes which are dirtying memory should call in here once for each page
 * which was newly dirtied.  The function will periodically check the system's
@@ -267,10 +269,12 @@ static void balance_dirty_pages(struct address_space *mapping)
 * limit we decrease the ratelimiting by a lot, to prevent individual processes
 * from overshooting the limit by (ratelimit_pages) each.
 */
-void balance_dirty_pages_ratelimited(struct address_space *mapping)
+void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
+                                        unsigned long nr_pages_dirtied)
 {
-        static DEFINE_PER_CPU(int, ratelimits) = 0;
+        static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
-        long ratelimit;
+        unsigned long ratelimit;
+        unsigned long *p;
        ratelimit = ratelimit_pages;
        if (dirty_exceeded)
@@ -280,15 +284,18 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
         * Check the rate limiting. Also, we do not want to throttle real-time
         * tasks in balance_dirty_pages(). Period.
         */
-        if (get_cpu_var(ratelimits)++ >= ratelimit) {
+        preempt_disable();
-                __get_cpu_var(ratelimits) = 0;
+        p =  &__get_cpu_var(ratelimits);
-                put_cpu_var(ratelimits);
+        *p += nr_pages_dirtied;
+        if (unlikely(*p >= ratelimit)) {
+                *p = 0;
+                preempt_enable();
                balance_dirty_pages(mapping);
                return;
        }
-        put_cpu_var(ratelimits);
+        preempt_enable();
 }
-EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
+EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
 void throttle_vm_writeout(void)
 {
@@ -380,8 +387,8 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
- * Try to run once per dirty_writeback_centisecs.  But if a writeback event
+ * Try to run once per dirty_writeback_interval.  But if a writeback event
- * takes longer than a dirty_writeback_centisecs interval, then leave a
+ * takes longer than dirty_writeback_interval, then leave a
 * one-second gap.
 *
 * older_than_this takes precedence over nr_to_write.  So we'll only write back
@@ -406,9 +413,9 @@ static void wb_kupdate(unsigned long arg)
        sync_supers();
        get_writeback_state(&wbs);
-        oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
+        oldest_jif = jiffies - dirty_expire_interval;
        start_jif = jiffies;
-        next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
+        next_jif = start_jif + dirty_writeback_interval;
        nr_to_write = wbs.nr_dirty + wbs.nr_unstable +
                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
        while (nr_to_write > 0) {
@@ -425,7 +432,7 @@ static void wb_kupdate(unsigned long arg)
        }
        if (time_before(next_jif, jiffies + HZ))
                next_jif = jiffies + HZ;
-        if (dirty_writeback_centisecs)
+        if (dirty_writeback_interval)
                mod_timer(&wb_timer, next_jif);
 }
@@ -435,11 +442,11 @@ static void wb_kupdate(unsigned long arg)
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
                struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
+        /*
+         * After the proc helper has run, a non-zero dirty_writeback_interval
+         * re-arms wb_timer that many jiffies from now; zero deletes the timer.
+         *
+         * NOTE(review): the return value of proc_dointvec_userhz_jiffies() is
+         * not checked and this handler always returns 0 - confirm that
+         * dropping an error from the helper is intended.
+         */
-        proc_dointvec(table, write, file, buffer, length, ppos);
+        proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
-        if (dirty_writeback_centisecs) {
+        if (dirty_writeback_interval) {
                mod_timer(&wb_timer,
-                        jiffies + (dirty_writeback_centisecs * HZ) / 100);
+                        jiffies + dirty_writeback_interval);
-        } else {
+                } else {
                del_timer(&wb_timer);
        }
        return 0;
@@ -468,7 +475,7 @@ static void laptop_timer_fn(unsigned long unused)
 */
 void laptop_io_completion(void)
 {
+        /* laptop_mode is added to jiffies unscaled (no "* HZ" any more). */
-        mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ);
+        mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
 }
 /*
@@ -544,7 +551,7 @@ void __init page_writeback_init(void)
                if (vm_dirty_ratio <= 0)
                        vm_dirty_ratio = 1;
        }
-        mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100);
+        mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
        set_ratelimit();
        register_cpu_notifier(&ratelimit_nb);
 }
@@ -621,8 +628,6 @@ EXPORT_SYMBOL(write_one_page);
 */
 int __set_page_dirty_nobuffers(struct page *page)
 {
-        int ret = 0;
        if (!TestSetPageDirty(page)) {
                struct address_space *mapping = page_mapping(page);
                struct address_space *mapping2;
@@ -644,8 +649,9 @@ int __set_page_dirty_nobuffers(struct page *page)
                                                        I_DIRTY_PAGES);
                        }
                }
+                return 1;
        }
-        return ret;
+        return 0;
 }
 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
@@ -675,8 +681,10 @@ int fastcall set_page_dirty(struct page *page)
                        return (*spd)(page);
                return __set_page_dirty_buffers(page);
        }
-        if (!PageDirty(page))
+        if (!PageDirty(page)) {
-                SetPageDirty(page);
+                if (!TestSetPageDirty(page))
+                        return 1;
+        }
        return 0;
 }
 EXPORT_SYMBOL(set_page_dirty);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 234bd4895d14..dc523a1f270d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -49,13 +49,11 @@ nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
 EXPORT_SYMBOL(node_online_map);
 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
 EXPORT_SYMBOL(node_possible_map);
-struct pglist_data *pgdat_list __read_mostly;
 unsigned long totalram_pages __read_mostly;
 unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
 int percpu_pagelist_fraction;
-static void fastcall free_hot_cold_page(struct page *page, int cold);
 static void __free_pages_ok(struct page *page, unsigned int order);
 /*
@@ -190,7 +188,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
        for (i = 0; i < nr_pages; i++) {
                struct page *p = page + i;
-                SetPageCompound(p);
+                __SetPageCompound(p);
                set_page_private(p, (unsigned long)page);
        }
 }
@@ -209,10 +207,24 @@ static void destroy_compound_page(struct page *page, unsigned long order)
                if (unlikely(!PageCompound(p) |
                                (page_private(p) != (unsigned long)page)))
                        bad_page(page);
-                ClearPageCompound(p);
+                __ClearPageCompound(p);
        }
 }
+/*
+ * prep_zero_page - zero the 1 << @order pages starting at @page.
+ *
+ * Each page is cleared with clear_highpage().  @gfp_flags is only used for
+ * the sanity checks below.
+ */
+static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
+{
+        int i;
+        /* __GFP_HIGHMEM without __GFP_WAIT is rejected outright. */
+        BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+        /*
+         * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
+         * and __GFP_HIGHMEM from hard or soft interrupt context.
+         */
+        BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
+        for (i = 0; i < (1 << order); i++)
+                clear_highpage(page + i);
+}
 /*
 * function for dealing with page's order in buddy system.
 * zone->lock is already acquired when we use these.
@@ -423,11 +435,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
                mutex_debug_check_no_locks_freed(page_address(page),
                                                 PAGE_SIZE<<order);
-#ifndef CONFIG_MMU
-        for (i = 1 ; i < (1 << order) ; ++i)
-                __put_page(page + i);
-#endif
        for (i = 0 ; i < (1 << order) ; ++i)
                reserved += free_pages_check(page + i);
        if (reserved)
@@ -448,28 +455,23 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
        if (order == 0) {
                __ClearPageReserved(page);
                set_page_count(page, 0);
+                set_page_refcounted(page);
-                free_hot_cold_page(page, 0);
+                __free_page(page);
        } else {
-                LIST_HEAD(list);
                int loop;
+                prefetchw(page);
                for (loop = 0; loop < BITS_PER_LONG; loop++) {
                        struct page *p = &page[loop];
-                        if (loop + 16 < BITS_PER_LONG)
+                        if (loop + 1 < BITS_PER_LONG)
-                                prefetchw(p + 16);
+                                prefetchw(p + 1);
                        __ClearPageReserved(p);
                        set_page_count(p, 0);
                }
-                arch_free_page(page, order);
+                set_page_refcounted(page);
+                __free_pages(page, order);
-                mod_page_state(pgfree, 1 << order);
-                list_add(&page->lru, &list);
-                kernel_map_pages(page, 1 << order, 0);
-                free_pages_bulk(page_zone(page), 1, &list, order);
        }
 }
@@ -507,7 +509,7 @@ static inline void expand(struct zone *zone, struct page *page,
 /*
 * This page is about to be returned from the page allocator
 */
-static int prep_new_page(struct page *page, int order)
+static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 {
        if (unlikely(page_mapcount(page) |
                (page->mapping != NULL)  |
@@ -536,8 +538,15 @@ static int prep_new_page(struct page *page, int order)
                        1 << PG_referenced | 1 << PG_arch_1 |
                        1 << PG_checked | 1 << PG_mappedtodisk);
        set_page_private(page, 0);
-        set_page_refs(page, order);
+        set_page_refcounted(page);
        kernel_map_pages(page, 1 << order, 1);
+        if (gfp_flags & __GFP_ZERO)
+                prep_zero_page(page, order, gfp_flags);
+        if (order && (gfp_flags & __GFP_COMP))
+                prep_compound_page(page, order);
        return 0;
 }
@@ -593,13 +602,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 /*
 * Called from the slab reaper to drain pagesets on a particular node that
 * belong to the currently executing processor.
+ * Note that this function must be called with the thread pinned to
+ * a single processor.
 */
 void drain_node_pages(int nodeid)
 {
        int i, z;
        unsigned long flags;
-        local_irq_save(flags);
        for (z = 0; z < MAX_NR_ZONES; z++) {
                struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
                struct per_cpu_pageset *pset;
@@ -609,11 +619,14 @@ void drain_node_pages(int nodeid)
                        struct per_cpu_pages *pcp;
                        pcp = &pset->pcp[i];
-                        free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+                        if (pcp->count) {
-                        pcp->count = 0;
+                                local_irq_save(flags);
+                                free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+                                pcp->count = 0;
+                                local_irq_restore(flags);
+                        }
                }
        }
-        local_irq_restore(flags);
 }
 #endif
@@ -743,13 +756,22 @@ void fastcall free_cold_page(struct page *page)
        free_hot_cold_page(page, 1);
 }
-static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
+/*
+ * split_page takes a non-compound higher-order page, and splits it into
+ * n (1<<order) sub-pages: page[0..n-1]
+ * Each sub-page must be freed individually.
+ *
+ * The first page must already have a non-zero page_count(); only
+ * sub-pages 1..n-1 are passed to set_page_refcounted() here.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+void split_page(struct page *page, unsigned int order)
 {
        int i;
-        BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+        BUG_ON(PageCompound(page));
-        for(i = 0; i < (1 << order); i++)
+        BUG_ON(!page_count(page));
-                clear_highpage(page + i);
+        for (i = 1; i < (1 << order); i++)
+                set_page_refcounted(page + i);
 }
 /*
@@ -795,14 +817,8 @@ again:
        put_cpu();
        BUG_ON(bad_range(zone, page));
-        if (prep_new_page(page, order))
+        if (prep_new_page(page, order, gfp_flags))
                goto again;
-        if (gfp_flags & __GFP_ZERO)
-                prep_zero_page(page, order, gfp_flags);
-        if (order && (gfp_flags & __GFP_COMP))
-                prep_compound_page(page, order);
        return page;
 failed:
@@ -926,7 +942,8 @@ restart:
                goto got_pg;
        do {
-                wakeup_kswapd(*z, order);
+                if (cpuset_zone_allowed(*z, gfp_mask))
+                        wakeup_kswapd(*z, order);
        } while (*(++z));
        /*
@@ -1183,7 +1200,7 @@ unsigned int nr_free_highpages (void)
        pg_data_t *pgdat;
        unsigned int pages = 0;
-        for_each_pgdat(pgdat)
+        for_each_online_pgdat(pgdat)
                pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
        return pages;
@@ -1214,24 +1231,22 @@ DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
 static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
 {
-        int cpu = 0;
+        unsigned cpu;
        memset(ret, 0, nr * sizeof(unsigned long));
        cpus_and(*cpumask, *cpumask, cpu_online_map);
-        cpu = first_cpu(*cpumask);
+        for_each_cpu_mask(cpu, *cpumask) {
-        while (cpu < NR_CPUS) {
+                unsigned long *in;
-                unsigned long *in, *out, off;
+                unsigned long *out;
+                unsigned off;
-                if (!cpu_isset(cpu, *cpumask))
+                unsigned next_cpu;
-                        continue;
                in = (unsigned long *)&per_cpu(page_states, cpu);
-                cpu = next_cpu(cpu, *cpumask);
+                next_cpu = next_cpu(cpu, *cpumask);
+                if (likely(next_cpu < NR_CPUS))
-                if (likely(cpu < NR_CPUS))
+                        prefetch(&per_cpu(page_states, next_cpu));
-                        prefetch(&per_cpu(page_states, cpu));
                out = (unsigned long *)ret;
                for (off = 0; off < nr; off++)
@@ -1327,7 +1342,7 @@ void get_zone_counts(unsigned long *active,
        *active = 0;
        *inactive = 0;
        *free = 0;
-        for_each_pgdat(pgdat) {
+        for_each_online_pgdat(pgdat) {
                unsigned long l, m, n;
                __get_zone_counts(&l, &m, &n, pgdat);
                *active += l;
@@ -1764,7 +1779,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                        continue;
                page = pfn_to_page(pfn);
                set_page_links(page, zone, nid, pfn);
-                set_page_count(page, 1);
+                init_page_count(page);
                reset_page_mapcount(page);
                SetPageReserved(page);
                INIT_LIST_HEAD(&page->lru);
@@ -2013,8 +2028,9 @@ static __meminit void zone_pcp_init(struct zone *zone)
                setup_pageset(zone_pcp(zone,cpu), batch);
 #endif
        }
-        printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
+        if (zone->present_pages)
-                zone->name, zone->present_pages, batch);
+                printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
+                        zone->name, zone->present_pages, batch);
 }
 static __meminit void init_currently_empty_zone(struct zone *zone,
@@ -2025,7 +2041,6 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
        zone_wait_table_init(zone, size);
        pgdat->nr_zones = zone_idx(zone) + 1;
-        zone->zone_mem_map = pfn_to_page(zone_start_pfn);
        zone->zone_start_pfn = zone_start_pfn;
        memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
@@ -2153,8 +2168,9 @@ static void *frag_start(struct seq_file *m, loff_t *pos)
 {
        pg_data_t *pgdat;
        loff_t node = *pos;
+        for (pgdat = first_online_pgdat();
-        for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
+             pgdat && node;
+             pgdat = next_online_pgdat(pgdat))
                --node;
        return pgdat;
@@ -2165,7 +2181,7 @@ static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
        pg_data_t *pgdat = (pg_data_t *)arg;
        (*pos)++;
-        return pgdat->pgdat_next;
+        return next_online_pgdat(pgdat);
 }
 static void frag_stop(struct seq_file *m, void *arg)
@@ -2466,7 +2482,7 @@ static void setup_per_zone_lowmem_reserve(void)
        struct pglist_data *pgdat;
        int j, idx;
-        for_each_pgdat(pgdat) {
+        for_each_online_pgdat(pgdat) {
                for (j = 0; j < MAX_NR_ZONES; j++) {
                        struct zone *zone = pgdat->node_zones + j;
                        unsigned long present_pages = zone->present_pages;
@@ -2685,8 +2701,7 @@ void *__init alloc_large_system_hash(const char *tablename,
                else
                        numentries <<= (PAGE_SHIFT - scale);
        }
-        /* rounded up to nearest power of 2 in size */
+        numentries = roundup_pow_of_two(numentries);
-        numentries = 1UL << (long_log2(numentries) + 1);
        /* limit allocation size to 1/16 total memory by default */
        if (max == 0) {
@@ -2729,3 +2744,44 @@ void *__init alloc_large_system_hash(const char *tablename,
        return table;
 }
+#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
+/*
+ * pfn <-> page translation. out-of-line version.
+ * (see asm-generic/memory_model.h)
+ */
+#if defined(CONFIG_FLATMEM)
+struct page *pfn_to_page(unsigned long pfn)
+{
+        return mem_map + (pfn - ARCH_PFN_OFFSET);
+}
+unsigned long page_to_pfn(struct page *page)
+{
+        return (page - mem_map) + ARCH_PFN_OFFSET;
+}
+#elif defined(CONFIG_DISCONTIGMEM)
+struct page *pfn_to_page(unsigned long pfn)
+{
+        int nid = arch_pfn_to_nid(pfn);
+        return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid);
+}
+unsigned long page_to_pfn(struct page *page)
+{
+        struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+        return (page - pgdat->node_mem_map) + pgdat->node_start_pfn;
+}
+#elif defined(CONFIG_SPARSEMEM)
+struct page *pfn_to_page(unsigned long pfn)
+{
+        return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn;
+}
+unsigned long page_to_pfn(struct page *page)
+{
+        long section_id = page_to_section(page);
+        return page - __section_mem_map_addr(__nr_to_section(section_id));
+}
+#endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */
+EXPORT_SYMBOL(pfn_to_page);
+EXPORT_SYMBOL(page_to_pfn);
+#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
diff --git a/mm/readahead.c b/mm/readahead.c
index 9f0b98227b41..ba7db816f4c8 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -53,13 +53,24 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra)
        return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 }
+static inline void reset_ahead_window(struct file_ra_state *ra)
+{
+        /*
+         * ... but preserve ahead_start + ahead_size value,
+         * see 'recheck:' label in page_cache_readahead().
+         * Note: We never use ->ahead_size as rvalue without
+         * checking ->ahead_start != 0 first.
+         */
+        ra->ahead_size += ra->ahead_start;
+        ra->ahead_start = 0;
+}
 static inline void ra_off(struct file_ra_state *ra)
 {
        ra->start = 0;
        ra->flags = 0;
        ra->size = 0;
-        ra->ahead_start = 0;
+        reset_ahead_window(ra);
-        ra->ahead_size = 0;
        return;
 }
@@ -73,10 +84,10 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
 {
        unsigned long newsize = roundup_pow_of_two(size);
-        if (newsize <= max / 64)
+        if (newsize <= max / 32)
-                newsize = newsize * newsize;
+                newsize = newsize * 4;
        else if (newsize <= max / 4)
-                newsize = max / 4;
+                newsize = newsize * 2;
        else
                newsize = max;
        return newsize;
@@ -427,8 +438,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp,
                 * congestion.  The ahead window will any way be closed
                 * in case we failed due to excessive page cache hits.
                 */
-                ra->ahead_start = 0;
+                reset_ahead_window(ra);
-                ra->ahead_size = 0;
        }
        return ret;
@@ -521,11 +531,11 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
         * If we get here we are doing sequential IO and this was not the first
         * occurence (ie we have an existing window)
         */
        if (ra->ahead_start == 0) {      /* no ahead window yet */
                if (!make_ahead_window(mapping, filp, ra, 0))
-                        goto out;
+                        goto recheck;
        }
        /*
         * Already have an ahead window, check if we crossed into it.
         * If so, shift windows and issue a new ahead window.
@@ -537,11 +547,16 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
                ra->start = ra->ahead_start;
                ra->size = ra->ahead_size;
                make_ahead_window(mapping, filp, ra, 0);
+recheck:
+                /* prev_page shouldn't overrun the ahead window */
+                ra->prev_page = min(ra->prev_page,
+                        ra->ahead_start + ra->ahead_size - 1);
        }
 out:
        return ra->prev_page + 1;
 }
+EXPORT_SYMBOL_GPL(page_cache_readahead);
 /*
 * handle_ra_miss() is called when it is known that a page which should have
diff --git a/mm/rmap.c b/mm/rmap.c
index 67f0e20b101f..1963e269314d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,13 +56,11 @@
 #include <asm/tlbflush.h>
-//#define RMAP_DEBUG /* can be enabled only for debugging */
+struct kmem_cache *anon_vma_cachep;
-kmem_cache_t *anon_vma_cachep;
 static inline void validate_anon_vma(struct vm_area_struct *find_vma)
 {
-#ifdef RMAP_DEBUG
+#ifdef CONFIG_DEBUG_VM
        struct anon_vma *anon_vma = find_vma->anon_vma;
        struct vm_area_struct *vma;
        unsigned int mapcount = 0;
@@ -166,7 +164,8 @@ void anon_vma_unlink(struct vm_area_struct *vma)
                anon_vma_free(anon_vma);
 }
-static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
+static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
+                          unsigned long flags)
 {
        if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
                                                SLAB_CTOR_CONSTRUCTOR) {
@@ -550,13 +549,14 @@ void page_add_file_rmap(struct page *page)
 void page_remove_rmap(struct page *page)
 {
        if (atomic_add_negative(-1, &page->_mapcount)) {
-                if (page_mapcount(page) < 0) {
+#ifdef CONFIG_DEBUG_VM
+                if (unlikely(page_mapcount(page) < 0)) {
                        printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
                        printk (KERN_EMERG "  page->flags = %lx\n", page->flags);
                        printk (KERN_EMERG "  page->count = %x\n", page_count(page));
                        printk (KERN_EMERG "  page->mapping = %p\n", page->mapping);
                }
+#endif
                BUG_ON(page_mapcount(page) < 0);
                /*
                 * It would be tidy to reset the PageAnon mapping here,
diff --git a/mm/shmem.c b/mm/shmem.c
index 7c455fbaff7b..37eaf42ed2c6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -875,7 +875,7 @@ redirty:
 }
 #ifdef CONFIG_NUMA
-static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
 {
        char *nodelist = strchr(value, ':');
        int err = 1;
@@ -2119,7 +2119,7 @@ failed:
        return err;
 }
-static kmem_cache_t *shmem_inode_cachep;
+static struct kmem_cache *shmem_inode_cachep;
 static struct inode *shmem_alloc_inode(struct super_block *sb)
 {
@@ -2139,7 +2139,8 @@ static void shmem_destroy_inode(struct inode *inode)
        kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }
-static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+static void init_once(void *foo, struct kmem_cache *cachep,
+                      unsigned long flags)
 {
        struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
diff --git a/mm/slab.c b/mm/slab.c
index d0bd7f07ab04..4cbf8bb13557 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -50,7 +50,7 @@
 * The head array is strictly LIFO and should improve the cache hit rates.
 * On SMP, it additionally reduces the spinlock operations.
 *
- * The c_cpuarray may not be read with enabled local interrupts - 
+ * The c_cpuarray may not be read with enabled local interrupts -
 * it's changed with a smp_call_function().
 *
 * SMP synchronization:
@@ -94,6 +94,7 @@
 #include        <linux/interrupt.h>
 #include        <linux/init.h>
 #include        <linux/compiler.h>
+#include        <linux/cpuset.h>
 #include        <linux/seq_file.h>
 #include        <linux/notifier.h>
 #include        <linux/kallsyms.h>
@@ -170,15 +171,15 @@
 #if DEBUG
 # define CREATE_MASK    (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
                         SLAB_POISON | SLAB_HWCACHE_ALIGN | \
-                         SLAB_NO_REAP | SLAB_CACHE_DMA | \
+                         SLAB_CACHE_DMA | \
                         SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
                         SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
-                         SLAB_DESTROY_BY_RCU)
+                         SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
 #else
-# define CREATE_MASK    (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
+# define CREATE_MASK    (SLAB_HWCACHE_ALIGN | \
                         SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
                         SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
-                         SLAB_DESTROY_BY_RCU)
+                         SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
 #endif
 /*
@@ -203,7 +204,8 @@
 typedef unsigned int kmem_bufctl_t;
 #define BUFCTL_END      (((kmem_bufctl_t)(~0U))-0)
 #define BUFCTL_FREE     (((kmem_bufctl_t)(~0U))-1)
-#define SLAB_LIMIT      (((kmem_bufctl_t)(~0U))-2)
+#define BUFCTL_ACTIVE   (((kmem_bufctl_t)(~0U))-2)
+#define SLAB_LIMIT      (((kmem_bufctl_t)(~0U))-3)
 /* Max number of objs-per-slab for caches which use off-slab slabs.
 * Needed to avoid a possible looping condition in cache_grow().
@@ -266,16 +268,17 @@ struct array_cache {
        unsigned int batchcount;
        unsigned int touched;
        spinlock_t lock;
-        void *entry[0];         /*
+        void *entry[0]; /*
-                                 * Must have this definition in here for the proper
+                         * Must have this definition in here for the proper
-                                 * alignment of array_cache. Also simplifies accessing
+                         * alignment of array_cache. Also simplifies accessing
-                                 * the entries.
+                         * the entries.
-                                 * [0] is for gcc 2.95. It should really be [].
+                         * [0] is for gcc 2.95. It should really be [].
-                                 */
+                         */
 };
-/* bootstrap: The caches do not work without cpuarrays anymore,
+/*
- * but the cpuarrays are allocated from the generic caches...
+ * bootstrap: The caches do not work without cpuarrays anymore, but the
+ * cpuarrays are allocated from the generic caches...
 */
 #define BOOT_CPUCACHE_ENTRIES   1
 struct arraycache_init {
@@ -291,13 +294,13 @@ struct kmem_list3 {
        struct list_head slabs_full;
        struct list_head slabs_free;
        unsigned long free_objects;
-        unsigned long next_reap;
-        int free_touched;
        unsigned int free_limit;
        unsigned int colour_next;       /* Per-node cache coloring */
        spinlock_t list_lock;
        struct array_cache *shared;     /* shared per node */
        struct array_cache **alien;     /* on other nodes */
+        unsigned long next_reap;        /* updated without locking */
+        int free_touched;               /* updated without locking */
 };
 /*
@@ -310,10 +313,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
 #define SIZE_L3 (1 + MAX_NUMNODES)
 /*
- * This function must be completely optimized away if
+ * This function must be completely optimized away if a constant is passed to
- * a constant is passed to it. Mostly the same as
+ * it.  Mostly the same as what is in linux/slab.h except it returns an index.
- * what is in linux/slab.h except it returns an
- * index.
 */
 static __always_inline int index_of(const size_t size)
 {
@@ -351,14 +352,14 @@ static void kmem_list3_init(struct kmem_list3 *parent)
        parent->free_touched = 0;
 }
-#define MAKE_LIST(cachep, listp, slab, nodeid)  \
+#define MAKE_LIST(cachep, listp, slab, nodeid)                          \
-        do {    \
+        do {                                                            \
-                INIT_LIST_HEAD(listp);          \
+                INIT_LIST_HEAD(listp);                                  \
-                list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
+                list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
        } while (0)
-#define MAKE_ALL_LISTS(cachep, ptr, nodeid)                     \
+#define MAKE_ALL_LISTS(cachep, ptr, nodeid)                             \
-        do {                                    \
+        do {                                                            \
        MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);  \
        MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
        MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);  \
@@ -373,28 +374,30 @@ static void kmem_list3_init(struct kmem_list3 *parent)
 struct kmem_cache {
 /* 1) per-cpu data, touched during every alloc/free */
        struct array_cache *array[NR_CPUS];
+/* 2) Cache tunables. Protected by cache_chain_mutex */
        unsigned int batchcount;
        unsigned int limit;
        unsigned int shared;
        unsigned int buffer_size;
-/* 2) touched by every alloc & free from the backend */
+/* 3) touched by every alloc & free from the backend */
        struct kmem_list3 *nodelists[MAX_NUMNODES];
-        unsigned int flags;     /* constant flags */
-        unsigned int num;       /* # of objs per slab */
-        spinlock_t spinlock;
-/* 3) cache_grow/shrink */
+        unsigned int flags;             /* constant flags */
+        unsigned int num;               /* # of objs per slab */
+/* 4) cache_grow/shrink */
        /* order of pgs per slab (2^n) */
        unsigned int gfporder;
        /* force GFP flags, e.g. GFP_DMA */
        gfp_t gfpflags;
-        size_t colour;          /* cache colouring range */
+        size_t colour;                  /* cache colouring range */
        unsigned int colour_off;        /* colour offset */
        struct kmem_cache *slabp_cache;
        unsigned int slab_size;
-        unsigned int dflags;    /* dynamic flags */
+        unsigned int dflags;            /* dynamic flags */
        /* constructor func */
        void (*ctor) (void *, struct kmem_cache *, unsigned long);
@@ -402,11 +405,11 @@ struct kmem_cache {
        /* de-constructor func */
        void (*dtor) (void *, struct kmem_cache *, unsigned long);
-/* 4) cache creation/removal */
+/* 5) cache creation/removal */
        const char *name;
        struct list_head next;
-/* 5) statistics */
+/* 6) statistics */
 #if STATS
        unsigned long num_active;
        unsigned long num_allocations;
@@ -438,8 +441,9 @@ struct kmem_cache {
 #define OFF_SLAB(x)     ((x)->flags & CFLGS_OFF_SLAB)
 #define BATCHREFILL_LIMIT       16
-/* Optimization question: fewer reaps means less 
+/*
- * probability for unnessary cpucache drain/refill cycles.
+ * Optimization question: fewer reaps means less probability for unnecessary
+ * cpucache drain/refill cycles.
 *
 * OTOH the cpuarrays can contain lots of objects,
 * which could lock up otherwise freeable slabs.
@@ -453,17 +457,19 @@ struct kmem_cache {
 #define STATS_INC_ALLOCED(x)    ((x)->num_allocations++)
 #define STATS_INC_GROWN(x)      ((x)->grown++)
 #define STATS_INC_REAPED(x)     ((x)->reaped++)
-#define STATS_SET_HIGH(x)       do { if ((x)->num_active > (x)->high_mark) \
+#define STATS_SET_HIGH(x)                                               \
-                                        (x)->high_mark = (x)->num_active; \
+        do {                                                            \
-                                } while (0)
+                if ((x)->num_active > (x)->high_mark)                   \
+                        (x)->high_mark = (x)->num_active;               \
+        } while (0)
 #define STATS_INC_ERR(x)        ((x)->errors++)
 #define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
 #define STATS_INC_NODEFREES(x)  ((x)->node_frees++)
-#define STATS_SET_FREEABLE(x, i) \
+#define STATS_SET_FREEABLE(x, i)                                        \
-                                do { if ((x)->max_freeable < i) \
+        do {                                                            \
-                                        (x)->max_freeable = i; \
+                if ((x)->max_freeable < i)                              \
-                                } while (0)
+                        (x)->max_freeable = i;                          \
+        } while (0)
 #define STATS_INC_ALLOCHIT(x)   atomic_inc(&(x)->allochit)
 #define STATS_INC_ALLOCMISS(x)  atomic_inc(&(x)->allocmiss)
 #define STATS_INC_FREEHIT(x)    atomic_inc(&(x)->freehit)
@@ -478,9 +484,7 @@ struct kmem_cache {
 #define STATS_INC_ERR(x)        do { } while (0)
 #define STATS_INC_NODEALLOCS(x) do { } while (0)
 #define STATS_INC_NODEFREES(x)  do { } while (0)
-#define STATS_SET_FREEABLE(x, i) \
+#define STATS_SET_FREEABLE(x, i) do { } while (0)
-                                do { } while (0)
 #define STATS_INC_ALLOCHIT(x)   do { } while (0)
 #define STATS_INC_ALLOCMISS(x)  do { } while (0)
 #define STATS_INC_FREEHIT(x)    do { } while (0)
@@ -488,7 +492,8 @@ struct kmem_cache {
 #endif
 #if DEBUG
-/* Magic nums for obj red zoning.
+/*
+ * Magic nums for obj red zoning.
 * Placed in the first word before and the first word after an obj.
 */
 #define RED_INACTIVE    0x5A2CF071UL    /* when obj is inactive */
@@ -499,7 +504,8 @@ struct kmem_cache {
 #define POISON_FREE     0x6b    /* for use-after-free poisoning */
 #define POISON_END      0xa5    /* end-byte of poisoning */
-/* memory layout of objects:
+/*
+ * memory layout of objects:
 * 0            : objp
 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
 *              the end of an object is aligned with the end of the real
@@ -508,7 +514,8 @@ struct kmem_cache {
 *              redzone word.
 * cachep->obj_offset: The real object.
 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
- * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
+ * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
+ *                                      [BYTES_PER_WORD long]
 */
 static int obj_offset(struct kmem_cache *cachep)
 {
@@ -552,8 +559,8 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 #endif
 /*
- * Maximum size of an obj (in 2^order pages)
+ * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp
- * and absolute limit for the gfp order.
+ * order.
 */
 #if defined(CONFIG_LARGE_ALLOCS)
 #define MAX_OBJ_ORDER   13      /* up to 32Mb */
@@ -573,9 +580,10 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 #define BREAK_GFP_ORDER_LO      0
 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
-/* Functions for storing/retrieving the cachep and or slab from the
+/*
- * global 'mem_map'. These are used to find the slab an obj belongs to.
+ * Functions for storing/retrieving the cachep and/or slab from the page
- * With kfree(), these are used to find the cache which an obj belongs to.
+ * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
+ * these are used to find the cache which an obj belongs to.
 */
 static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
 {
@@ -584,6 +592,8 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
 static inline struct kmem_cache *page_get_cache(struct page *page)
 {
+        if (unlikely(PageCompound(page)))
+                page = (struct page *)page_private(page);
        return (struct kmem_cache *)page->lru.next;
 }
@@ -594,6 +604,8 @@ static inline void page_set_slab(struct page *page, struct slab *slab)
 static inline struct slab *page_get_slab(struct page *page)
 {
+        if (unlikely(PageCompound(page)))
+                page = (struct page *)page_private(page);
        return (struct slab *)page->lru.prev;
 }
@@ -609,7 +621,21 @@ static inline struct slab *virt_to_slab(const void *obj)
        return page_get_slab(page);
 }
-/* These are the default caches for kmalloc. Custom caches can have other sizes. */
+static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
+                                 unsigned int idx)
+{
+        return slab->s_mem + cache->buffer_size * idx;
+}
+static inline unsigned int obj_to_index(struct kmem_cache *cache,
+                                        struct slab *slab, void *obj)
+{
+        return (unsigned)(obj - slab->s_mem) / cache->buffer_size;
+}
+/*
+ * These are the default caches for kmalloc. Custom caches can have other sizes.
+ */
 struct cache_sizes malloc_sizes[] = {
 #define CACHE(x) { .cs_size = (x) },
 #include <linux/kmalloc_sizes.h>
@@ -642,8 +668,6 @@ static struct kmem_cache cache_cache = {
        .limit = BOOT_CPUCACHE_ENTRIES,
        .shared = 1,
        .buffer_size = sizeof(struct kmem_cache),
-        .flags = SLAB_NO_REAP,
-        .spinlock = SPIN_LOCK_UNLOCKED,
        .name = "kmem_cache",
 #if DEBUG
        .obj_size = sizeof(struct kmem_cache),
@@ -655,8 +679,8 @@ static DEFINE_MUTEX(cache_chain_mutex);
 static struct list_head cache_chain;
 /*
- * vm_enough_memory() looks at this to determine how many
+ * vm_enough_memory() looks at this to determine how many slab-allocated pages
- * slab-allocated pages are possibly freeable under pressure
+ * are possibly freeable under pressure
 *
 * SLAB_RECLAIM_ACCOUNT turns this on per-slab
 */
@@ -675,7 +699,8 @@ static enum {
 static DEFINE_PER_CPU(struct work_struct, reap_work);
-static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node);
+static void free_block(struct kmem_cache *cachep, void **objpp, int len,
+                        int node);
 static void enable_cpucache(struct kmem_cache *cachep);
 static void cache_reap(void *unused);
 static int __node_shrink(struct kmem_cache *cachep, int node);
@@ -685,7 +710,8 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
        return cachep->array[smp_processor_id()];
 }
-static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags)
+static inline struct kmem_cache *__find_general_cachep(size_t size,
+                                                        gfp_t gfpflags)
 {
        struct cache_sizes *csizep = malloc_sizes;
@@ -720,8 +746,9 @@ static size_t slab_mgmt_size(size_t nr_objs, size_t align)
        return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
 }
-/* Calculate the number of objects and left-over bytes for a given
+/*
-   buffer size. */
+ * Calculate the number of objects and left-over bytes for a given buffer size.
+ */
 static void cache_estimate(unsigned long gfporder, size_t buffer_size,
                           size_t align, int flags, size_t *left_over,
                           unsigned int *num)
@@ -782,7 +809,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
 #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
-static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg)
+static void __slab_error(const char *function, struct kmem_cache *cachep,
+                        char *msg)
 {
        printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
               function, cachep->name, msg);
@@ -804,7 +832,7 @@ static void init_reap_node(int cpu)
        node = next_node(cpu_to_node(cpu), node_online_map);
        if (node == MAX_NUMNODES)
-                node = 0;
+                node = first_node(node_online_map);
        __get_cpu_var(reap_node) = node;
 }
@@ -870,8 +898,33 @@ static struct array_cache *alloc_arraycache(int node, int entries,
        return nc;
 }
+/*
+ * Transfer objects in one arraycache to another.
+ * Locking must be handled by the caller.
+ *
+ * Return the number of entries transferred.
+ */
+static int transfer_objects(struct array_cache *to,
+                struct array_cache *from, unsigned int max)
+{
+        /* Figure out how many entries to transfer */
+        int nr = min(min(from->avail, max), to->limit - to->avail);
+        if (!nr)
+                return 0;
+        memcpy(to->entry + to->avail, from->entry + from->avail -nr,
+                        sizeof(void *) *nr);
+        from->avail -= nr;
+        to->avail += nr;
+        to->touched = 1;
+        return nr;
+}
 #ifdef CONFIG_NUMA
 static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
+static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 static struct array_cache **alloc_alien_cache(int node, int limit)
 {
@@ -906,10 +959,8 @@ static void free_alien_cache(struct array_cache **ac_ptr)
        if (!ac_ptr)
                return;
        for_each_node(i)
            kfree(ac_ptr[i]);
        kfree(ac_ptr);
 }
@@ -920,6 +971,13 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
        if (ac->avail) {
                spin_lock(&rl3->list_lock);
+                /*
+                 * Stuff objects into the remote nodes shared array first.
+                 * That way we could avoid the overhead of putting the objects
+                 * into the free lists and getting them back later.
+                 */
+                transfer_objects(rl3->shared, ac, ac->limit);
                free_block(cachep, ac->entry, ac->avail, node);
                ac->avail = 0;
                spin_unlock(&rl3->list_lock);
@@ -935,15 +993,16 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
        if (l3->alien) {
                struct array_cache *ac = l3->alien[node];
-                if (ac && ac->avail) {
-                        spin_lock_irq(&ac->lock);
+                if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
                        __drain_alien_cache(cachep, ac, node);
                        spin_unlock_irq(&ac->lock);
                }
        }
 }
-static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
+static void drain_alien_cache(struct kmem_cache *cachep,
+                                struct array_cache **alien)
 {
        int i = 0;
        struct array_cache *ac;
@@ -986,20 +1045,22 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
        switch (action) {
        case CPU_UP_PREPARE:
                mutex_lock(&cache_chain_mutex);
-                /* we need to do this right in the beginning since
+                /*
+                 * We need to do this right in the beginning since
                 * alloc_arraycache's are going to use this list.
                 * kmalloc_node allows us to add the slab to the right
                 * kmem_list3 and not this cpu's kmem_list3
                 */
                list_for_each_entry(cachep, &cache_chain, next) {
-                        /* setup the size64 kmemlist for cpu before we can
+                        /*
+                         * Set up the size64 kmemlist for cpu before we can
                         * begin anything. Make sure some other cpu on this
                         * node has not already allocated this
                         */
                        if (!cachep->nodelists[node]) {
-                                if (!(l3 = kmalloc_node(memsize,
+                                l3 = kmalloc_node(memsize, GFP_KERNEL, node);
-                                                        GFP_KERNEL, node)))
+                                if (!l3)
                                        goto bad;
                                kmem_list3_init(l3);
                                l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
@@ -1015,13 +1076,15 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
                        spin_lock_irq(&cachep->nodelists[node]->list_lock);
                        cachep->nodelists[node]->free_limit =
-                            (1 + nr_cpus_node(node)) *
+                                (1 + nr_cpus_node(node)) *
-                            cachep->batchcount + cachep->num;
+                                cachep->batchcount + cachep->num;
                        spin_unlock_irq(&cachep->nodelists[node]->list_lock);
                }
-                /* Now we can go ahead with allocating the shared array's
+                /*
-                   & array cache's */
+                 * Now we can go ahead with allocating the shared arrays and
+                 * array caches
+                 */
                list_for_each_entry(cachep, &cache_chain, next) {
                        struct array_cache *nc;
                        struct array_cache *shared;
@@ -1041,7 +1104,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
                        if (!alien)
                                goto bad;
                        cachep->array[cpu] = nc;
                        l3 = cachep->nodelists[node];
                        BUG_ON(!l3);
@@ -1061,7 +1123,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
                        }
 #endif
                        spin_unlock_irq(&l3->list_lock);
                        kfree(shared);
                        free_alien_cache(alien);
                }
@@ -1083,7 +1144,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
                /* fall thru */
        case CPU_UP_CANCELED:
                mutex_lock(&cache_chain_mutex);
                list_for_each_entry(cachep, &cache_chain, next) {
                        struct array_cache *nc;
                        struct array_cache *shared;
@@ -1150,7 +1210,7 @@ free_array_cache:
 #endif
        }
        return NOTIFY_OK;
-      bad:
+bad:
        mutex_unlock(&cache_chain_mutex);
        return NOTIFY_BAD;
 }
@@ -1160,7 +1220,8 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
 /*
 * swap the static kmem_list3 with kmalloced memory
 */
-static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid)
+static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
+                        int nodeid)
 {
        struct kmem_list3 *ptr;
@@ -1175,8 +1236,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int no
        local_irq_enable();
 }
-/* Initialisation.
+/*
- * Called after the gfp() functions have been enabled, and before smp_init().
+ * Initialisation.  Called after the page allocator has been initialised and
+ * before smp_init().
 */
 void __init kmem_cache_init(void)
 {
@@ -1201,9 +1263,9 @@ void __init kmem_cache_init(void)
        /* Bootstrap is tricky, because several objects are allocated
         * from caches that do not exist yet:
-         * 1) initialize the cache_cache cache: it contains the struct kmem_cache
+         * 1) initialize the cache_cache cache: it contains the struct
-         *    structures of all caches, except cache_cache itself: cache_cache
+         *    kmem_cache structures of all caches, except cache_cache itself:
-         *    is statically allocated.
+         *    cache_cache is statically allocated.
         *    Initially an __init data area is used for the head array and the
         *    kmem_list3 structures, it's replaced with a kmalloc allocated
         *    array at the end of the bootstrap.
@@ -1226,7 +1288,8 @@ void __init kmem_cache_init(void)
        cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
        cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
-        cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
+        cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
+                                        cache_line_size());
        for (order = 0; order < MAX_ORDER; order++) {
                cache_estimate(order, cache_cache.buffer_size,
@@ -1245,24 +1308,26 @@ void __init kmem_cache_init(void)
        sizes = malloc_sizes;
        names = cache_names;
-        /* Initialize the caches that provide memory for the array cache
+        /*
-         * and the kmem_list3 structures first.
+         * Initialize the caches that provide memory for the array cache and the
-         * Without this, further allocations will bug
+         * kmem_list3 structures first.  Without this, further allocations will
+         * bug.
         */
        sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
-                                                      sizes[INDEX_AC].cs_size,
+                                        sizes[INDEX_AC].cs_size,
-                                                      ARCH_KMALLOC_MINALIGN,
+                                        ARCH_KMALLOC_MINALIGN,
-                                                      (ARCH_KMALLOC_FLAGS |
+                                        ARCH_KMALLOC_FLAGS|SLAB_PANIC,
-                                                       SLAB_PANIC), NULL, NULL);
+                                        NULL, NULL);
-        if (INDEX_AC != INDEX_L3)
+        if (INDEX_AC != INDEX_L3) {
                sizes[INDEX_L3].cs_cachep =
-                    kmem_cache_create(names[INDEX_L3].name,
+                        kmem_cache_create(names[INDEX_L3].name,
-                                      sizes[INDEX_L3].cs_size,
+                                sizes[INDEX_L3].cs_size,
-                                      ARCH_KMALLOC_MINALIGN,
+                                ARCH_KMALLOC_MINALIGN,
-                                      (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL,
+                                ARCH_KMALLOC_FLAGS|SLAB_PANIC,
-                                      NULL);
+                                NULL, NULL);
+        }
        while (sizes->cs_size != ULONG_MAX) {
                /*
@@ -1272,13 +1337,13 @@ void __init kmem_cache_init(void)
                 * Note for systems short on memory removing the alignment will
                 * allow tighter packing of the smaller caches.
                 */
-                if (!sizes->cs_cachep)
+                if (!sizes->cs_cachep) {
                        sizes->cs_cachep = kmem_cache_create(names->name,
-                                                             sizes->cs_size,
+                                        sizes->cs_size,
-                                                             ARCH_KMALLOC_MINALIGN,
+                                        ARCH_KMALLOC_MINALIGN,
-                                                             (ARCH_KMALLOC_FLAGS
+                                        ARCH_KMALLOC_FLAGS|SLAB_PANIC,
-                                                              | SLAB_PANIC),
+                                        NULL, NULL);
-                                                             NULL, NULL);
+                }
                /* Inc off-slab bufctl limit until the ceiling is hit. */
                if (!(OFF_SLAB(sizes->cs_cachep))) {
@@ -1287,13 +1352,11 @@ void __init kmem_cache_init(void)
                }
                sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
-                                                        sizes->cs_size,
+                                        sizes->cs_size,
-                                                        ARCH_KMALLOC_MINALIGN,
+                                        ARCH_KMALLOC_MINALIGN,
-                                                        (ARCH_KMALLOC_FLAGS |
+                                        ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
-                                                         SLAB_CACHE_DMA |
+                                                SLAB_PANIC,
-                                                         SLAB_PANIC), NULL,
+                                        NULL, NULL);
-                                                        NULL);
                sizes++;
                names++;
        }
@@ -1345,20 +1408,22 @@ void __init kmem_cache_init(void)
                struct kmem_cache *cachep;
                mutex_lock(&cache_chain_mutex);
                list_for_each_entry(cachep, &cache_chain, next)
-                    enable_cpucache(cachep);
+                        enable_cpucache(cachep);
                mutex_unlock(&cache_chain_mutex);
        }
        /* Done! */
        g_cpucache_up = FULL;
-        /* Register a cpu startup notifier callback
+        /*
-         * that initializes cpu_cache_get for all new cpus
+         * Register a cpu startup notifier callback that initializes
+         * cpu_cache_get for all new cpus
         */
        register_cpu_notifier(&cpucache_notifier);
-        /* The reap timers are started later, with a module init call:
+        /*
-         * That part of the kernel is not yet operational.
+         * The reap timers are started later, with a module init call: That part
+         * of the kernel is not yet operational.
         */
 }
@@ -1366,16 +1431,13 @@ static int __init cpucache_init(void)
 {
        int cpu;
-        /* 
+        /*
-         * Register the timers that return unneeded
+         * Register the timers that return unneeded pages to the page allocator
-         * pages to gfp.
         */
        for_each_online_cpu(cpu)
-            start_cpu_timer(cpu);
+                start_cpu_timer(cpu);
        return 0;
 }
 __initcall(cpucache_init);
 /*
@@ -1402,7 +1464,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
                atomic_add(i, &slab_reclaim_pages);
        add_page_state(nr_slab, i);
        while (i--) {
-                SetPageSlab(page);
+                __SetPageSlab(page);
                page++;
        }
        return addr;
@@ -1418,8 +1480,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
        const unsigned long nr_freed = i;
        while (i--) {
-                if (!TestClearPageSlab(page))
+                BUG_ON(!PageSlab(page));
-                        BUG();
+                __ClearPageSlab(page);
                page++;
        }
        sub_page_state(nr_slab, nr_freed);
@@ -1489,9 +1551,8 @@ static void dump_line(char *data, int offset, int limit)
 {
        int i;
        printk(KERN_ERR "%03x:", offset);
-        for (i = 0; i < limit; i++) {
+        for (i = 0; i < limit; i++)
                printk(" %02x", (unsigned char)data[offset + i]);
-        }
        printk("\n");
 }
 #endif
@@ -1505,15 +1566,15 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
        if (cachep->flags & SLAB_RED_ZONE) {
                printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
-                       *dbg_redzone1(cachep, objp),
+                        *dbg_redzone1(cachep, objp),
-                       *dbg_redzone2(cachep, objp));
+                        *dbg_redzone2(cachep, objp));
        }
        if (cachep->flags & SLAB_STORE_USER) {
                printk(KERN_ERR "Last user: [<%p>]",
-                       *dbg_userword(cachep, objp));
+                        *dbg_userword(cachep, objp));
                print_symbol("(%s)",
-                             (unsigned long)*dbg_userword(cachep, objp));
+                                (unsigned long)*dbg_userword(cachep, objp));
                printk("\n");
        }
        realobj = (char *)objp + obj_offset(cachep);
@@ -1546,8 +1607,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
                        /* Print header */
                        if (lines == 0) {
                                printk(KERN_ERR
-                                       "Slab corruption: start=%p, len=%d\n",
+                                        "Slab corruption: start=%p, len=%d\n",
-                                       realobj, size);
+                                        realobj, size);
                                print_objinfo(cachep, objp, 0);
                        }
                        /* Hexdump the affected line */
@@ -1568,18 +1629,18 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
                 * exist:
                 */
                struct slab *slabp = virt_to_slab(objp);
-                int objnr;
+                unsigned int objnr;
-                objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
+                objnr = obj_to_index(cachep, slabp, objp);
                if (objnr) {
-                        objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size;
+                        objp = index_to_obj(cachep, slabp, objnr - 1);
                        realobj = (char *)objp + obj_offset(cachep);
                        printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
                               realobj, size);
                        print_objinfo(cachep, objp, 2);
                }
                if (objnr + 1 < cachep->num) {
-                        objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size;
+                        objp = index_to_obj(cachep, slabp, objnr + 1);
                        realobj = (char *)objp + obj_offset(cachep);
                        printk(KERN_ERR "Next obj: start=%p, len=%d\n",
                               realobj, size);
@@ -1591,22 +1652,25 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
 #if DEBUG
 /**
- * slab_destroy_objs - call the registered destructor for each object in
+ * slab_destroy_objs - destroy a slab and its objects
- *      a slab that is to be destroyed.
+ * @cachep: cache pointer being destroyed
+ * @slabp: slab pointer being destroyed
+ *
+ * Call the registered destructor for each object in a slab that is being
+ * destroyed.
 */
 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
 {
        int i;
        for (i = 0; i < cachep->num; i++) {
-                void *objp = slabp->s_mem + cachep->buffer_size * i;
+                void *objp = index_to_obj(cachep, slabp, i);
                if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
-                        if ((cachep->buffer_size % PAGE_SIZE) == 0
+                        if (cachep->buffer_size % PAGE_SIZE == 0 &&
-                            && OFF_SLAB(cachep))
+                                        OFF_SLAB(cachep))
                                kernel_map_pages(virt_to_page(objp),
-                                                 cachep->buffer_size / PAGE_SIZE,
+                                        cachep->buffer_size / PAGE_SIZE, 1);
-                                                 1);
                        else
                                check_poison_obj(cachep, objp);
 #else
@@ -1631,7 +1695,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
        if (cachep->dtor) {
                int i;
                for (i = 0; i < cachep->num; i++) {
-                        void *objp = slabp->s_mem + cachep->buffer_size * i;
+                        void *objp = index_to_obj(cachep, slabp, i);
                        (cachep->dtor) (objp, cachep, 0);
                }
        }
@@ -1639,9 +1703,13 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
 #endif
 /**
+ * slab_destroy - destroy and release all objects in a slab
+ * @cachep: cache pointer being destroyed
+ * @slabp: slab pointer being destroyed
+ *
 * Destroy all the objs in a slab, and release the mem back to the system.
- * Before calling the slab must have been unlinked from the cache.
+ * Before calling the slab must have been unlinked from the cache.  The
- * The cache-lock is not held/needed.
+ * cache-lock is not held/needed.
 */
 static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
 {
@@ -1662,8 +1730,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
        }
 }
-/* For setting up all the kmem_list3s for cache whose buffer_size is same
+/*
-   as size of kmem_list3. */
+ * For setting up all the kmem_list3s for a cache whose buffer_size is the
+ * same as the size of kmem_list3.
+ */
 static void set_up_list3s(struct kmem_cache *cachep, int index)
 {
        int node;
@@ -1689,13 +1759,13 @@ static void set_up_list3s(struct kmem_cache *cachep, int index)
 * high order pages for slabs.  When the gfp() functions are more friendly
 * towards high-order requests, this should be changed.
 */
-static inline size_t calculate_slab_order(struct kmem_cache *cachep,
+static size_t calculate_slab_order(struct kmem_cache *cachep,
                        size_t size, size_t align, unsigned long flags)
 {
        size_t left_over = 0;
        int gfporder;
-        for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) {
+        for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) {
                unsigned int num;
                size_t remainder;
@@ -1730,12 +1800,66 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
                /*
                 * Acceptable internal fragmentation?
                 */
-                if ((left_over * 8) <= (PAGE_SIZE << gfporder))
+                if (left_over * 8 <= (PAGE_SIZE << gfporder))
                        break;
        }
        return left_over;
 }
+/*
+ * setup_cpu_cache - give a newly created cache its array cache and node
+ * lists, depending on how far the slab bootstrap (g_cpucache_up) has got.
+ *
+ *  FULL:       hand the cache to enable_cpucache() and return.
+ *  NONE:       point this cpu's array cache at initarray_generic, call
+ *              set_up_list3s(cachep, SIZE_AC) and advance the state to
+ *              PARTIAL_L3 (when INDEX_AC == INDEX_L3) or PARTIAL_AC.
+ *  PARTIAL_AC: kmalloc this cpu's array cache, call
+ *              set_up_list3s(cachep, SIZE_L3) and advance to PARTIAL_L3.
+ *  otherwise:  kmalloc this cpu's array cache and one kmem_list3 for
+ *              every online node.
+ *
+ * In every case except FULL, the current node's next_reap, the fields of
+ * the array cache returned by cpu_cache_get() and the cache's own
+ * batchcount/limit are then set to their boot values (batchcount 1, limit
+ * BOOT_CPUCACHE_ENTRIES).
+ */
+static void setup_cpu_cache(struct kmem_cache *cachep)
+{
+        if (g_cpucache_up == FULL) {
+                enable_cpucache(cachep);
+                return;
+        }
+        if (g_cpucache_up == NONE) {
+                /*
+                 * Note: the first kmem_cache_create must create the cache
+                 * that's used by kmalloc(24), otherwise the creation of
+                 * further caches will BUG().
+                 */
+                cachep->array[smp_processor_id()] = &initarray_generic.cache;
+                /*
+                 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
+                 * the first cache, then we need to set up all its list3s,
+                 * otherwise the creation of further caches will BUG().
+                 */
+                set_up_list3s(cachep, SIZE_AC);
+                if (INDEX_AC == INDEX_L3)
+                        g_cpucache_up = PARTIAL_L3;
+                else
+                        g_cpucache_up = PARTIAL_AC;
+        } else {
+                /*
+                 * NOTE(review): the kmalloc() result is not checked here; the
+                 * old inline code had a BUG_ON(!cpu_cache_get(cachep)) before
+                 * the array cache fields were written.  Confirm that dropping
+                 * it is intended.
+                 */
+                cachep->array[smp_processor_id()] =
+                        kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+                if (g_cpucache_up == PARTIAL_AC) {
+                        set_up_list3s(cachep, SIZE_L3);
+                        g_cpucache_up = PARTIAL_L3;
+                } else {
+                        int node;
+                        for_each_online_node(node) {
+                                cachep->nodelists[node] =
+                                    kmalloc_node(sizeof(struct kmem_list3),
+                                                GFP_KERNEL, node);
+                                BUG_ON(!cachep->nodelists[node]);
+                                kmem_list3_init(cachep->nodelists[node]);
+                        }
+                }
+        }
+        /*
+         * The first reap time gets an offset derived from the cache's address
+         * (presumably so that caches do not all reap at the same moment).
+         */
+        cachep->nodelists[numa_node_id()]->next_reap =
+                        jiffies + REAPTIMEOUT_LIST3 +
+                        ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+        /* Start with an empty array cache and the boot-time limits. */
+        cpu_cache_get(cachep)->avail = 0;
+        cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
+        cpu_cache_get(cachep)->batchcount = 1;
+        cpu_cache_get(cachep)->touched = 0;
+        cachep->batchcount = 1;
+        cachep->limit = BOOT_CPUCACHE_ENTRIES;
+}
 /**
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -1751,9 +1875,8 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
 * and the @dtor is run before the pages are handed back.
 *
 * @name must be valid until the cache is destroyed. This implies that
- * the module calling this has to destroy the cache before getting 
+ * the module calling this has to destroy the cache before getting unloaded.
- * unloaded.
+ *
- * 
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
@@ -1762,16 +1885,14 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
- * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
- * memory pressure.
- *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
 struct kmem_cache *
 kmem_cache_create (const char *name, size_t size, size_t align,
-        unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long),
+        unsigned long flags,
+        void (*ctor)(void*, struct kmem_cache *, unsigned long),
        void (*dtor)(void*, struct kmem_cache *, unsigned long))
 {
        size_t left_over, slab_size, ralign;
@@ -1781,12 +1902,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
        /*
         * Sanity checks... these are all serious usage bugs.
         */
-        if ((!name) ||
+        if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
-            in_interrupt() ||
-            (size < BYTES_PER_WORD) ||
            (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
-                printk(KERN_ERR "%s: Early error in slab %s\n",
+                printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
-                       __FUNCTION__, name);
+                                name);
                BUG();
        }
@@ -1840,8 +1959,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
         * above the next power of two: caches with object sizes just above a
         * power of two have a significant amount of internal fragmentation.
         */
-        if ((size < 4096
+        if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
-             || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
                flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
        if (!(flags & SLAB_DESTROY_BY_RCU))
                flags |= SLAB_POISON;
@@ -1853,13 +1971,14 @@ kmem_cache_create (const char *name, size_t size, size_t align,
                BUG_ON(dtor);
        /*
-         * Always checks flags, a caller might be expecting debug
+         * Always check flags; a caller might be expecting debug support which
-         * support which isn't available.
+         * isn't available.
         */
        if (flags & ~CREATE_MASK)
                BUG();
-        /* Check that size is in terms of words.  This is needed to avoid
+        /*
+         * Check that size is in terms of words.  This is needed to avoid
         * unaligned accesses for some archs when redzoning is used, and makes
         * sure any on-slab bufctl's are also correctly aligned.
         */
@@ -1868,12 +1987,14 @@ kmem_cache_create (const char *name, size_t size, size_t align,
                size &= ~(BYTES_PER_WORD - 1);
        }
-        /* calculate out the final buffer alignment: */
+        /* calculate the final buffer alignment: */
        /* 1) arch recommendation: can be overridden for debug */
        if (flags & SLAB_HWCACHE_ALIGN) {
-                /* Default alignment: as specified by the arch code.
+                /*
-                 * Except if an object is really small, then squeeze multiple
+                 * Default alignment: as specified by the arch code.  Except if
-                 * objects into one cacheline.
+                 * an object is really small, then squeeze multiple objects into
+                 * one cacheline.
                 */
                ralign = cache_line_size();
                while (size <= ralign / 2)
@@ -1893,16 +2014,16 @@ kmem_cache_create (const char *name, size_t size, size_t align,
                if (ralign > BYTES_PER_WORD)
                        flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
        }
-        /* 4) Store it. Note that the debug code below can reduce
+        /*
+         * 4) Store it. Note that the debug code below can reduce
         *    the alignment to BYTES_PER_WORD.
         */
        align = ralign;
        /* Get cache's description obj. */
-        cachep = kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
+        cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL);
        if (!cachep)
                goto oops;
-        memset(cachep, 0, sizeof(struct kmem_cache));
 #if DEBUG
        cachep->obj_size = size;
@@ -1978,7 +2099,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
        cachep->gfpflags = 0;
        if (flags & SLAB_CACHE_DMA)
                cachep->gfpflags |= GFP_DMA;
-        spin_lock_init(&cachep->spinlock);
        cachep->buffer_size = size;
        if (flags & CFLGS_OFF_SLAB)
@@ -1988,64 +2108,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
        cachep->name = name;
-        if (g_cpucache_up == FULL) {
+        setup_cpu_cache(cachep);
-                enable_cpucache(cachep);
-        } else {
-                if (g_cpucache_up == NONE) {
-                        /* Note: the first kmem_cache_create must create
-                         * the cache that's used by kmalloc(24), otherwise
-                         * the creation of further caches will BUG().
-                         */
-                        cachep->array[smp_processor_id()] =
-                            &initarray_generic.cache;
-                        /* If the cache that's used by
-                         * kmalloc(sizeof(kmem_list3)) is the first cache,
-                         * then we need to set up all its list3s, otherwise
-                         * the creation of further caches will BUG().
-                         */
-                        set_up_list3s(cachep, SIZE_AC);
-                        if (INDEX_AC == INDEX_L3)
-                                g_cpucache_up = PARTIAL_L3;
-                        else
-                                g_cpucache_up = PARTIAL_AC;
-                } else {
-                        cachep->array[smp_processor_id()] =
-                            kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
-                        if (g_cpucache_up == PARTIAL_AC) {
-                                set_up_list3s(cachep, SIZE_L3);
-                                g_cpucache_up = PARTIAL_L3;
-                        } else {
-                                int node;
-                                for_each_online_node(node) {
-                                        cachep->nodelists[node] =
-                                            kmalloc_node(sizeof
-                                                         (struct kmem_list3),
-                                                         GFP_KERNEL, node);
-                                        BUG_ON(!cachep->nodelists[node]);
-                                        kmem_list3_init(cachep->
-                                                        nodelists[node]);
-                                }
-                        }
-                }
-                cachep->nodelists[numa_node_id()]->next_reap =
-                    jiffies + REAPTIMEOUT_LIST3 +
-                    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-                BUG_ON(!cpu_cache_get(cachep));
-                cpu_cache_get(cachep)->avail = 0;
-                cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
-                cpu_cache_get(cachep)->batchcount = 1;
-                cpu_cache_get(cachep)->touched = 0;
-                cachep->batchcount = 1;
-                cachep->limit = BOOT_CPUCACHE_ENTRIES;
-        }
        /* cache setup completed, link it into the list */
        list_add(&cachep->next, &cache_chain);
-      oops:
+oops:
        if (!cachep && (flags & SLAB_PANIC))
                panic("kmem_cache_create(): failed to create slab `%s'\n",
                      name);
@@ -2089,30 +2156,13 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
 #define check_spinlock_acquired_node(x, y) do { } while(0)
 #endif
-/*
+static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
- * Waits for all CPUs to execute func().
+                        struct array_cache *ac,
- */
+                        int force, int node);
-static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
-{
-        check_irq_on();
-        preempt_disable();
-        local_irq_disable();
-        func(arg);
-        local_irq_enable();
-        if (smp_call_function(func, arg, 1, 1))
-                BUG();
-        preempt_enable();
-}
-static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
-                                int force, int node);
 static void do_drain(void *arg)
 {
-        struct kmem_cache *cachep = (struct kmem_cache *) arg;
+        struct kmem_cache *cachep = arg;
        struct array_cache *ac;
        int node = numa_node_id();
@@ -2129,14 +2179,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
        struct kmem_list3 *l3;
        int node;
-        smp_call_function_all_cpus(do_drain, cachep);
+        on_each_cpu(do_drain, cachep, 1, 1);
        check_irq_on();
        for_each_online_node(node) {
                l3 = cachep->nodelists[node];
                if (l3) {
-                        spin_lock_irq(&l3->list_lock);
+                        drain_array(cachep, l3, l3->shared, 1, node);
-                        drain_array_locked(cachep, l3->shared, 1, node);
-                        spin_unlock_irq(&l3->list_lock);
                        if (l3->alien)
                                drain_alien_cache(cachep, l3->alien);
                }
@@ -2260,16 +2308,15 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
        /* NUMA: free the list3 structures */
        for_each_online_node(i) {
-                if ((l3 = cachep->nodelists[i])) {
+                l3 = cachep->nodelists[i];
+                if (l3) {
                        kfree(l3->shared);
                        free_alien_cache(l3->alien);
                        kfree(l3);
                }
        }
        kmem_cache_free(&cache_cache, cachep);
        unlock_cpu_hotplug();
        return 0;
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2292,7 +2339,6 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
        slabp->inuse = 0;
        slabp->colouroff = colour_off;
        slabp->s_mem = objp + colour_off;
        return slabp;
 }
@@ -2307,7 +2353,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
        int i;
        for (i = 0; i < cachep->num; i++) {
-                void *objp = slabp->s_mem + cachep->buffer_size * i;
+                void *objp = index_to_obj(cachep, slabp, i);
 #if DEBUG
                /* need to poison the objs? */
                if (cachep->flags & SLAB_POISON)
@@ -2320,9 +2366,9 @@ static void cache_init_objs(struct kmem_cache *cachep,
                        *dbg_redzone2(cachep, objp) = RED_INACTIVE;
                }
                /*
-                 * Constructors are not allowed to allocate memory from
+                 * Constructors are not allowed to allocate memory from the same
-                 * the same cache which they are a constructor for.
+                 * cache which they are a constructor for.  Otherwise, deadlock.
-                 * Otherwise, deadlock. They must also be threaded.
+                 * They must also be threaded.
                 */
                if (cachep->ctor && !(cachep->flags & SLAB_POISON))
                        cachep->ctor(objp + obj_offset(cachep), cachep,
@@ -2336,8 +2382,8 @@ static void cache_init_objs(struct kmem_cache *cachep,
                                slab_error(cachep, "constructor overwrote the"
                                           " start of an object");
                }
-                if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)
+                if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
-                    && cachep->flags & SLAB_POISON)
+                            OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
                        kernel_map_pages(virt_to_page(objp),
                                         cachep->buffer_size / PAGE_SIZE, 0);
 #else
@@ -2352,18 +2398,16 @@ static void cache_init_objs(struct kmem_cache *cachep,
+/*
+ * kmem_flagcheck - sanity-check the DMA bit of an allocation request.
+ * @cachep: cache the allocation is made from
+ * @flags: allocation flags passed by the caller
+ *
+ * BUG()s if SLAB_DMA is set in @flags while GFP_DMA is not set in
+ * cachep->gfpflags, or if SLAB_DMA is clear while GFP_DMA is set.
+ */
 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
 {
-        if (flags & SLAB_DMA) {
+        if (flags & SLAB_DMA)
-                if (!(cachep->gfpflags & GFP_DMA))
+                BUG_ON(!(cachep->gfpflags & GFP_DMA));
-                        BUG();
+        else
-        } else {
+                BUG_ON(cachep->gfpflags & GFP_DMA);
-                if (cachep->gfpflags & GFP_DMA)
-                        BUG();
-        }
 }
-static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid)
+static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
+                                int nodeid)
 {
-        void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size);
+        void *objp = index_to_obj(cachep, slabp, slabp->free);
        kmem_bufctl_t next;
        slabp->inuse++;
@@ -2377,18 +2421,18 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nod
        return objp;
 }
-static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp,
+static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
-                          int nodeid)
+                                void *objp, int nodeid)
 {
-        unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size;
+        unsigned int objnr = obj_to_index(cachep, slabp, objp);
 #if DEBUG
        /* Verify that the slab belongs to the intended node */
        WARN_ON(slabp->nodeid != nodeid);
-        if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
+        if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
                printk(KERN_ERR "slab: double free detected in cache "
-                       "'%s', objp %p\n", cachep->name, objp);
+                                "'%s', objp %p\n", cachep->name, objp);
                BUG();
        }
 #endif
@@ -2397,14 +2441,18 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob
        slabp->inuse--;
 }
-static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp)
+static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp,
+                        void *objp)
 {
        int i;
        struct page *page;
        /* Nasty!!!!!! I hope this is OK. */
-        i = 1 << cachep->gfporder;
        page = virt_to_page(objp);
+        i = 1;
+        if (likely(!PageCompound(page)))
+                i <<= cachep->gfporder;
        do {
                page_set_cache(page, cachep);
                page_set_slab(page, slabp);
@@ -2425,8 +2473,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
        unsigned long ctor_flags;
        struct kmem_list3 *l3;
-        /* Be lazy and only check for valid flags here,
+        /*
-         * keeping it out of the critical path in kmem_cache_alloc().
+         * Be lazy and only check for valid flags here, keeping it out of the
+         * critical path in kmem_cache_alloc().
         */
        if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW))
                BUG();
@@ -2467,14 +2516,17 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
         */
        kmem_flagcheck(cachep, flags);
-        /* Get mem for the objs.
+        /*
-         * Attempt to allocate a physical page from 'nodeid',
+         * Get mem for the objs.  Attempt to allocate a physical page from
+         * 'nodeid'.
         */
-        if (!(objp = kmem_getpages(cachep, flags, nodeid)))
+        objp = kmem_getpages(cachep, flags, nodeid);
+        if (!objp)
                goto failed;
        /* Get slab management. */
-        if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
+        slabp = alloc_slabmgmt(cachep, objp, offset, local_flags);
+        if (!slabp)
                goto opps1;
        slabp->nodeid = nodeid;
@@ -2493,9 +2545,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
        l3->free_objects += cachep->num;
        spin_unlock(&l3->list_lock);
        return 1;
-      opps1:
+opps1:
        kmem_freepages(cachep, objp);
-      failed:
+failed:
        if (local_flags & __GFP_WAIT)
                local_irq_disable();
        return 0;
@@ -2538,8 +2590,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
        page = virt_to_page(objp);
        if (page_get_cache(page) != cachep) {
-                printk(KERN_ERR
+                printk(KERN_ERR "mismatch in kmem_cache_free: expected "
-                       "mismatch in kmem_cache_free: expected cache %p, got %p\n",
+                                "cache %p, got %p\n",
                       page_get_cache(page), cachep);
                printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
                printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
@@ -2549,13 +2601,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
        slabp = page_get_slab(page);
        if (cachep->flags & SLAB_RED_ZONE) {
-                if (*dbg_redzone1(cachep, objp) != RED_ACTIVE
+                if (*dbg_redzone1(cachep, objp) != RED_ACTIVE ||
-                    || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
+                                *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
-                        slab_error(cachep,
+                        slab_error(cachep, "double free, or memory outside"
-                                   "double free, or memory outside"
+                                                " object was overwritten");
-                                   " object was overwritten");
+                        printk(KERN_ERR "%p: redzone 1:0x%lx, "
-                        printk(KERN_ERR
+                                        "redzone 2:0x%lx.\n",
-                               "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
                               objp, *dbg_redzone1(cachep, objp),
                               *dbg_redzone2(cachep, objp));
                }
@@ -2565,15 +2616,16 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
        if (cachep->flags & SLAB_STORE_USER)
                *dbg_userword(cachep, objp) = caller;
-        objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
+        objnr = obj_to_index(cachep, slabp, objp);
        BUG_ON(objnr >= cachep->num);
-        BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size);
+        BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
        if (cachep->flags & SLAB_DEBUG_INITIAL) {
-                /* Need to call the slab's constructor so the
+                /*
-                 * caller can perform a verify of its state (debugging).
+                 * Need to call the slab's constructor so the caller can
-                 * Called without the cache-lock held.
+                 * perform a verify of its state (debugging).  Called without
+                 * the cache-lock held.
                 */
                cachep->ctor(objp + obj_offset(cachep),
                             cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
@@ -2584,9 +2636,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
                 */
                cachep->dtor(objp + obj_offset(cachep), cachep, 0);
        }
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+        slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
+#endif
        if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
-                if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
+                if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
                        store_stackinfo(cachep, objp, (unsigned long)caller);
                        kernel_map_pages(virt_to_page(objp),
                                         cachep->buffer_size / PAGE_SIZE, 0);
@@ -2612,14 +2667,14 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
                        goto bad;
        }
        if (entries != cachep->num - slabp->inuse) {
-              bad:
+bad:
-                printk(KERN_ERR
+                printk(KERN_ERR "slab: Internal list corruption detected in "
-                       "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
+                                "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
-                       cachep->name, cachep->num, slabp, slabp->inuse);
+                        cachep->name, cachep->num, slabp, slabp->inuse);
                for (i = 0;
                     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
                     i++) {
-                        if ((i % 16) == 0)
+                        if (i % 16 == 0)
                                printk("\n%03x:", i);
                        printk(" %02x", ((unsigned char *)slabp)[i]);
                }
@@ -2641,12 +2696,13 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
        check_irq_off();
        ac = cpu_cache_get(cachep);
-      retry:
+retry:
        batchcount = ac->batchcount;
        if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
-                /* if there was little recent activity on this
+                /*
-                 * cache, then perform only a partial refill.
+                 * If there was little recent activity on this cache, then
-                 * Otherwise we could generate refill bouncing.
+                 * perform only a partial refill.  Otherwise we could generate
+                 * refill bouncing.
                 */
                batchcount = BATCHREFILL_LIMIT;
        }
@@ -2655,20 +2711,10 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
        BUG_ON(ac->avail > 0 || !l3);
        spin_lock(&l3->list_lock);
-        if (l3->shared) {
+        /* See if we can refill from the shared array */
-                struct array_cache *shared_array = l3->shared;
+        if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
-                if (shared_array->avail) {
+                goto alloc_done;
-                        if (batchcount > shared_array->avail)
-                                batchcount = shared_array->avail;
-                        shared_array->avail -= batchcount;
-                        ac->avail = batchcount;
-                        memcpy(ac->entry,
-                               &(shared_array->entry[shared_array->avail]),
-                               sizeof(void *) * batchcount);
-                        shared_array->touched = 1;
-                        goto alloc_done;
-                }
-        }
        while (batchcount > 0) {
                struct list_head *entry;
                struct slab *slabp;
@@ -2702,29 +2748,29 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
                        list_add(&slabp->list, &l3->slabs_partial);
        }
-      must_grow:
+must_grow:
        l3->free_objects -= ac->avail;
-      alloc_done:
+alloc_done:
        spin_unlock(&l3->list_lock);
        if (unlikely(!ac->avail)) {
                int x;
                x = cache_grow(cachep, flags, numa_node_id());
-                // cache_grow can reenable interrupts, then ac could change.
+                /* cache_grow can reenable interrupts, then ac could change. */
                ac = cpu_cache_get(cachep);
-                if (!x && ac->avail == 0)       // no objects in sight? abort
+                if (!x && ac->avail == 0)       /* no objects in sight? abort */
                        return NULL;
-                if (!ac->avail) // objects refilled by interrupt?
+                if (!ac->avail)         /* objects refilled by interrupt? */
                        goto retry;
        }
        ac->touched = 1;
        return ac->entry[--ac->avail];
 }
-static inline void
+static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
-cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags)
+                                                gfp_t flags)
 {
        might_sleep_if(flags & __GFP_WAIT);
 #if DEBUG
@@ -2733,8 +2779,8 @@ cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags)
 }
 #if DEBUG
-static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags,
+static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
-                                        void *objp, void *caller)
+                                gfp_t flags, void *objp, void *caller)
 {
        if (!objp)
                return objp;
@@ -2754,19 +2800,28 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags
                *dbg_userword(cachep, objp) = caller;
        if (cachep->flags & SLAB_RED_ZONE) {
-                if (*dbg_redzone1(cachep, objp) != RED_INACTIVE
+                if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
-                    || *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
+                                *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
-                        slab_error(cachep,
+                        slab_error(cachep, "double free, or memory outside"
-                                   "double free, or memory outside"
+                                                " object was overwritten");
-                                   " object was overwritten");
                        printk(KERN_ERR
-                               "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
+                                "%p: redzone 1:0x%lx, redzone 2:0x%lx\n",
-                               objp, *dbg_redzone1(cachep, objp),
+                                objp, *dbg_redzone1(cachep, objp),
-                               *dbg_redzone2(cachep, objp));
+                                *dbg_redzone2(cachep, objp));
                }
                *dbg_redzone1(cachep, objp) = RED_ACTIVE;
                *dbg_redzone2(cachep, objp) = RED_ACTIVE;
        }
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+        {
+                struct slab *slabp;
+                unsigned objnr;
+                slabp = page_get_slab(virt_to_page(objp));
+                objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
+                slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
+        }
+#endif
        objp += obj_offset(cachep);
        if (cachep->ctor && cachep->flags & SLAB_POISON) {
                unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
@@ -2788,11 +2843,10 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
        struct array_cache *ac;
 #ifdef CONFIG_NUMA
-        if (unlikely(current->mempolicy && !in_interrupt())) {
+        if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
-                int nid = slab_node(current->mempolicy);
+                objp = alternate_node_alloc(cachep, flags);
+                if (objp != NULL)
-                if (nid != numa_node_id())
+                        return objp;
-                        return __cache_alloc_node(cachep, flags, nid);
        }
 #endif
@@ -2809,8 +2863,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
        return objp;
 }
-static __always_inline void *
+static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
-__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
+                                                gfp_t flags, void *caller)
 {
        unsigned long save_flags;
        void *objp;
@@ -2828,9 +2882,32 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
 #ifdef CONFIG_NUMA
 /*
+ * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
+ *
+ * If we are in_interrupt, then process context, including cpusets and
+ * mempolicy, may not apply and should not be used for allocation policy.
+ */
+static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+        int nid_alloc, nid_here;
+        if (in_interrupt())
+                return NULL;
+        nid_alloc = nid_here = numa_node_id();
+        if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
+                nid_alloc = cpuset_mem_spread_node();
+        else if (current->mempolicy)
+                nid_alloc = slab_node(current->mempolicy);
+        if (nid_alloc != nid_here)
+                return __cache_alloc_node(cachep, flags, nid_alloc);
+        return NULL;
+}
+/*
 * A interface to enable slab creation on nodeid
 */
-static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+                                int nodeid)
 {
        struct list_head *entry;
        struct slab *slabp;
@@ -2841,7 +2918,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
        l3 = cachep->nodelists[nodeid];
        BUG_ON(!l3);
-      retry:
+retry:
        check_irq_off();
        spin_lock(&l3->list_lock);
        entry = l3->slabs_partial.next;
@@ -2868,16 +2945,15 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
        /* move slabp to correct slabp list: */
        list_del(&slabp->list);
-        if (slabp->free == BUFCTL_END) {
+        if (slabp->free == BUFCTL_END)
                list_add(&slabp->list, &l3->slabs_full);
-        } else {
+        else
                list_add(&slabp->list, &l3->slabs_partial);
-        }
        spin_unlock(&l3->list_lock);
        goto done;
-      must_grow:
+must_grow:
        spin_unlock(&l3->list_lock);
        x = cache_grow(cachep, flags, nodeid);
@@ -2885,7 +2961,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
                return NULL;
        goto retry;
-      done:
+done:
        return obj;
 }
 #endif
@@ -2958,7 +3034,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
        }
        free_block(cachep, ac->entry, batchcount, node);
-      free_done:
+free_done:
 #if STATS
        {
                int i = 0;
@@ -2979,16 +3055,12 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
 #endif
        spin_unlock(&l3->list_lock);
        ac->avail -= batchcount;
-        memmove(ac->entry, &(ac->entry[batchcount]),
+        memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
-                sizeof(void *) * ac->avail);
 }
 /*
- * __cache_free
+ * Release an obj back to its cache. If the obj has a constructed state, it must
- * Release an obj back to its cache. If the obj has a constructed
+ * be in this state _before_ it is released.  Called with disabled ints.
- * state, it must be in this state _before_ it is released.
- *
- * Called with disabled ints.
 */
 static inline void __cache_free(struct kmem_cache *cachep, void *objp)
 {
@@ -3007,9 +3079,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
                if (unlikely(slabp->nodeid != numa_node_id())) {
                        struct array_cache *alien = NULL;
                        int nodeid = slabp->nodeid;
-                        struct kmem_list3 *l3 =
+                        struct kmem_list3 *l3;
-                            cachep->nodelists[numa_node_id()];
+                        l3 = cachep->nodelists[numa_node_id()];
                        STATS_INC_NODEFREES(cachep);
                        if (l3->alien && l3->alien[nodeid]) {
                                alien = l3->alien[nodeid];
@@ -3056,6 +3128,23 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 EXPORT_SYMBOL(kmem_cache_alloc);
 /**
+ * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
+ * @cache: The cache to allocate from.
+ * @flags: See kmalloc().
+ *
+ * Allocate an object from this cache and set the allocated memory to zero.
+ * The flags are only relevant if the cache has no available objects.
+ */
+void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
+{
+        void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
+        if (ret)
+                memset(ret, 0, obj_size(cache));
+        return ret;
+}
+EXPORT_SYMBOL(kmem_cache_zalloc);
+/**
 * kmem_ptr_validate - check if an untrusted pointer might
 *      be a slab entry.
 * @cachep: the cache we're checking against
@@ -3093,7 +3182,7 @@ int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
        if (unlikely(page_get_cache(page) != cachep))
                goto out;
        return 1;
-      out:
+out:
        return 0;
 }
@@ -3119,7 +3208,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
        local_irq_save(save_flags);
        if (nodeid == -1 || nodeid == numa_node_id() ||
-            !cachep->nodelists[nodeid])
+                        !cachep->nodelists[nodeid])
                ptr = ____cache_alloc(cachep, flags);
        else
                ptr = __cache_alloc_node(cachep, flags, nodeid);
@@ -3148,6 +3237,7 @@ EXPORT_SYMBOL(kmalloc_node);
 * kmalloc - allocate memory
 * @size: how many bytes of memory are required.
 * @flags: the type of memory to allocate.
+ * @caller: address of the caller, recorded for debug tracking
 *
 * kmalloc is the normal method of allocating memory
 * in the kernel.
@@ -3181,22 +3271,23 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
        return __cache_alloc(cachep, flags, caller);
 }
-#ifndef CONFIG_DEBUG_SLAB
 void *__kmalloc(size_t size, gfp_t flags)
 {
+#ifndef CONFIG_DEBUG_SLAB
        return __do_kmalloc(size, flags, NULL);
+#else
+        return __do_kmalloc(size, flags, __builtin_return_address(0));
+#endif
 }
 EXPORT_SYMBOL(__kmalloc);
-#else
+#ifdef CONFIG_DEBUG_SLAB
 void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
 {
        return __do_kmalloc(size, flags, caller);
 }
 EXPORT_SYMBOL(__kmalloc_track_caller);
 #endif
 #ifdef CONFIG_SMP
@@ -3220,7 +3311,7 @@ void *__alloc_percpu(size_t size)
         * and we have no way of figuring out how to fix the array
         * that we have allocated then....
         */
-        for_each_cpu(i) {
+        for_each_possible_cpu(i) {
                int node = cpu_to_node(i);
                if (node_online(node))
@@ -3236,7 +3327,7 @@ void *__alloc_percpu(size_t size)
        /* Catch derefs w/o wrappers */
        return (void *)(~(unsigned long)pdata);
-      unwind_oom:
+unwind_oom:
        while (--i >= 0) {
                if (!cpu_possible(i))
                        continue;
@@ -3307,7 +3398,7 @@ void free_percpu(const void *objp)
        /*
         * We allocate for all cpus so we cannot use for online cpu here.
         */
-        for_each_cpu(i)
+        for_each_possible_cpu(i)
            kfree(p->ptrs[i]);
        kfree(p);
 }
@@ -3327,61 +3418,86 @@ const char *kmem_cache_name(struct kmem_cache *cachep)
 EXPORT_SYMBOL_GPL(kmem_cache_name);
 /*
- * This initializes kmem_list3 for all nodes.
+ * This initializes kmem_list3 or resizes various caches for all nodes.
 */
 static int alloc_kmemlist(struct kmem_cache *cachep)
 {
        int node;
        struct kmem_list3 *l3;
-        int err = 0;
+        struct array_cache *new_shared;
+        struct array_cache **new_alien;
        for_each_online_node(node) {
-                struct array_cache *nc = NULL, *new;
-                struct array_cache **new_alien = NULL;
+                new_alien = alloc_alien_cache(node, cachep->limit);
-#ifdef CONFIG_NUMA
+                if (!new_alien)
-                if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
                        goto fail;
-#endif
-                if (!(new = alloc_arraycache(node, (cachep->shared *
+                new_shared = alloc_arraycache(node,
-                                                    cachep->batchcount),
+                                cachep->shared*cachep->batchcount,
-                                             0xbaadf00d)))
+                                        0xbaadf00d);
+                if (!new_shared) {
+                        free_alien_cache(new_alien);
                        goto fail;
-                if ((l3 = cachep->nodelists[node])) {
+                }
+                l3 = cachep->nodelists[node];
+                if (l3) {
+                        struct array_cache *shared = l3->shared;
                        spin_lock_irq(&l3->list_lock);
-                        if ((nc = cachep->nodelists[node]->shared))
+                        if (shared)
-                                free_block(cachep, nc->entry, nc->avail, node);
+                                free_block(cachep, shared->entry,
+                                                shared->avail, node);
-                        l3->shared = new;
+                        l3->shared = new_shared;
-                        if (!cachep->nodelists[node]->alien) {
+                        if (!l3->alien) {
                                l3->alien = new_alien;
                                new_alien = NULL;
                        }
                        l3->free_limit = (1 + nr_cpus_node(node)) *
-                            cachep->batchcount + cachep->num;
+                                        cachep->batchcount + cachep->num;
                        spin_unlock_irq(&l3->list_lock);
-                        kfree(nc);
+                        kfree(shared);
                        free_alien_cache(new_alien);
                        continue;
                }
-                if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
+                l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
-                                        GFP_KERNEL, node)))
+                if (!l3) {
+                        free_alien_cache(new_alien);
+                        kfree(new_shared);
                        goto fail;
+                }
                kmem_list3_init(l3);
                l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
-                    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+                                ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-                l3->shared = new;
+                l3->shared = new_shared;
                l3->alien = new_alien;
                l3->free_limit = (1 + nr_cpus_node(node)) *
-                    cachep->batchcount + cachep->num;
+                                        cachep->batchcount + cachep->num;
                cachep->nodelists[node] = l3;
        }
-        return err;
+        return 0;
-      fail:
-        err = -ENOMEM;
+fail:
-        return err;
+        if (!cachep->next.next) {
+                /* Cache is not active yet. Roll back what we did */
+                node--;
+                while (node >= 0) {
+                        if (cachep->nodelists[node]) {
+                                l3 = cachep->nodelists[node];
+                                kfree(l3->shared);
+                                free_alien_cache(l3->alien);
+                                kfree(l3);
+                                cachep->nodelists[node] = NULL;
+                        }
+                        node--;
+                }
+        }
+        return -ENOMEM;
 }
 struct ccupdate_struct {
@@ -3391,7 +3507,7 @@ struct ccupdate_struct {
 static void do_ccupdate_local(void *info)
 {
-        struct ccupdate_struct *new = (struct ccupdate_struct *)info;
+        struct ccupdate_struct *new = info;
        struct array_cache *old;
        check_irq_off();
@@ -3401,16 +3517,17 @@ static void do_ccupdate_local(void *info)
        new->new[smp_processor_id()] = old;
 }
-static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount,
+/* Always called with the cache_chain_mutex held */
-                            int shared)
+static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
+                                int batchcount, int shared)
 {
        struct ccupdate_struct new;
        int i, err;
        memset(&new.new, 0, sizeof(new.new));
        for_each_online_cpu(i) {
-                new.new[i] =
+                new.new[i] = alloc_arraycache(cpu_to_node(i), limit,
-                    alloc_arraycache(cpu_to_node(i), limit, batchcount);
+                                                batchcount);
                if (!new.new[i]) {
                        for (i--; i >= 0; i--)
                                kfree(new.new[i]);
@@ -3419,14 +3536,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount
        }
        new.cachep = cachep;
-        smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
+        on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1);
        check_irq_on();
-        spin_lock(&cachep->spinlock);
        cachep->batchcount = batchcount;
        cachep->limit = limit;
        cachep->shared = shared;
-        spin_unlock(&cachep->spinlock);
        for_each_online_cpu(i) {
                struct array_cache *ccold = new.new[i];
@@ -3447,15 +3562,17 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount
        return 0;
 }
+/* Always called with the cache_chain_mutex held */
 static void enable_cpucache(struct kmem_cache *cachep)
 {
        int err;
        int limit, shared;
-        /* The head array serves three purposes:
+        /*
+         * The head array serves three purposes:
         * - create a LIFO ordering, i.e. return objects that are cache-warm
         * - reduce the number of spinlock operations.
-         * - reduce the number of linked list operations on the slab and 
+         * - reduce the number of linked list operations on the slab and
         *   bufctl chains: array operations are cheaper.
         * The numbers are guessed, we should auto-tune as described by
         * Bonwick.
@@ -3471,7 +3588,8 @@ static void enable_cpucache(struct kmem_cache *cachep)
        else
                limit = 120;
-        /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound
+        /*
+         * CPU bound tasks (e.g. network routing) can exhibit cpu bound
         * allocation behaviour: Most allocs on one cpu, most free operations
         * on another cpu. For these cases, an efficient object passing between
         * cpus is necessary. This is provided by a shared array. The array
@@ -3486,9 +3604,9 @@ static void enable_cpucache(struct kmem_cache *cachep)
 #endif
 #if DEBUG
-        /* With debugging enabled, large batchcount lead to excessively
+        /*
-         * long periods with disabled local interrupts. Limit the 
+         * With debugging enabled, large batchcounts lead to excessively long
-         * batchcount
+         * periods with disabled local interrupts. Limit the batchcount
         */
        if (limit > 32)
                limit = 32;
@@ -3499,23 +3617,32 @@ static void enable_cpucache(struct kmem_cache *cachep)
                       cachep->name, -err);
 }
-static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
+/*
-                                int force, int node)
+ * Drain an array if it contains any elements, taking the l3 lock only if
+ * necessary. Note that the l3 list_lock also protects the array_cache
+ * if drain_array() is used on the shared array.
+ */
+void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+                         struct array_cache *ac, int force, int node)
 {
        int tofree;
-        check_spinlock_acquired_node(cachep, node);
+        if (!ac || !ac->avail)
+                return;
        if (ac->touched && !force) {
                ac->touched = 0;
-        } else if (ac->avail) {
+        } else {
-                tofree = force ? ac->avail : (ac->limit + 4) / 5;
+                spin_lock_irq(&l3->list_lock);
-                if (tofree > ac->avail) {
+                if (ac->avail) {
-                        tofree = (ac->avail + 1) / 2;
+                        tofree = force ? ac->avail : (ac->limit + 4) / 5;
+                        if (tofree > ac->avail)
+                                tofree = (ac->avail + 1) / 2;
+                        free_block(cachep, ac->entry, tofree, node);
+                        ac->avail -= tofree;
+                        memmove(ac->entry, &(ac->entry[tofree]),
+                                sizeof(void *) * ac->avail);
                }
-                free_block(cachep, ac->entry, tofree, node);
+                spin_unlock_irq(&l3->list_lock);
-                ac->avail -= tofree;
-                memmove(ac->entry, &(ac->entry[tofree]),
-                        sizeof(void *) * ac->avail);
        }
 }
@@ -3528,13 +3655,14 @@ static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac
 * - clear the per-cpu caches for this CPU.
 * - return freeable pages to the main free memory pool.
 *
- * If we cannot acquire the cache chain mutex then just give up - we'll
+ * If we cannot acquire the cache chain mutex then just give up - we'll try
- * try again on the next iteration.
+ * again on the next iteration.
 */
 static void cache_reap(void *unused)
 {
        struct list_head *walk;
        struct kmem_list3 *l3;
+        int node = numa_node_id();
        if (!mutex_trylock(&cache_chain_mutex)) {
                /* Give up. Setup the next iteration. */
@@ -3550,65 +3678,72 @@ static void cache_reap(void *unused)
                struct slab *slabp;
                searchp = list_entry(walk, struct kmem_cache, next);
-                if (searchp->flags & SLAB_NO_REAP)
-                        goto next;
                check_irq_on();
-                l3 = searchp->nodelists[numa_node_id()];
+                /*
+                 * We only take the l3 lock if absolutely necessary and we
+                 * have established with reasonable certainty that
+                 * we can do some work if the lock was obtained.
+                 */
+                l3 = searchp->nodelists[node];
                reap_alien(searchp, l3);
-                spin_lock_irq(&l3->list_lock);
-                drain_array_locked(searchp, cpu_cache_get(searchp), 0,
+                drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
-                                   numa_node_id());
+                /*
+                 * These are racy checks but it does not matter
+                 * if we skip one check or scan twice.
+                 */
                if (time_after(l3->next_reap, jiffies))
-                        goto next_unlock;
+                        goto next;
                l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
-                if (l3->shared)
+                drain_array(searchp, l3, l3->shared, 0, node);
-                        drain_array_locked(searchp, l3->shared, 0,
-                                           numa_node_id());
                if (l3->free_touched) {
                        l3->free_touched = 0;
-                        goto next_unlock;
+                        goto next;
                }
-                tofree =
+                tofree = (l3->free_limit + 5 * searchp->num - 1) /
-                    (l3->free_limit + 5 * searchp->num -
+                                (5 * searchp->num);
-                     1) / (5 * searchp->num);
                do {
+                        /*
+                         * Do not lock if there are no free blocks.
+                         */
+                        if (list_empty(&l3->slabs_free))
+                                break;
+                        spin_lock_irq(&l3->list_lock);
                        p = l3->slabs_free.next;
-                        if (p == &(l3->slabs_free))
+                        if (p == &(l3->slabs_free)) {
+                                spin_unlock_irq(&l3->list_lock);
                                break;
+                        }
                        slabp = list_entry(p, struct slab, list);
                        BUG_ON(slabp->inuse);
                        list_del(&slabp->list);
                        STATS_INC_REAPED(searchp);
-                        /* Safe to drop the lock. The slab is no longer
+                        /*
-                         * linked to the cache.
+                         * Safe to drop the lock. The slab is no longer linked
-                         * searchp cannot disappear, we hold
+                         * to the cache. searchp cannot disappear, we hold
                         * cache_chain_lock
                         */
                        l3->free_objects -= searchp->num;
                        spin_unlock_irq(&l3->list_lock);
                        slab_destroy(searchp, slabp);
-                        spin_lock_irq(&l3->list_lock);
                } while (--tofree > 0);
-              next_unlock:
+next:
-                spin_unlock_irq(&l3->list_lock);
-              next:
                cond_resched();
        }
        check_irq_on();
        mutex_unlock(&cache_chain_mutex);
        next_reap_node();
-        /* Setup the next iteration */
+        /* Set up the next iteration */
        schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
 }
@@ -3658,8 +3793,8 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
 {
        struct kmem_cache *cachep = p;
        ++*pos;
-        return cachep->next.next == &cache_chain ? NULL
+        return cachep->next.next == &cache_chain ?
-            : list_entry(cachep->next.next, struct kmem_cache, next);
+                NULL : list_entry(cachep->next.next, struct kmem_cache, next);
 }
 static void s_stop(struct seq_file *m, void *p)
@@ -3681,7 +3816,6 @@ static int s_show(struct seq_file *m, void *p)
        int node;
        struct kmem_list3 *l3;
-        spin_lock(&cachep->spinlock);
        active_objs = 0;
        num_slabs = 0;
        for_each_online_node(node) {
@@ -3748,7 +3882,9 @@ static int s_show(struct seq_file *m, void *p)
                unsigned long node_frees = cachep->node_frees;
                seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
-                                %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees);
+                                %4lu %4lu %4lu %4lu", allocs, high, grown,
+                                reaped, errors, max_freeable, node_allocs,
+                                node_frees);
        }
        /* cpu stats */
        {
@@ -3762,7 +3898,6 @@ static int s_show(struct seq_file *m, void *p)
        }
 #endif
        seq_putc(m, '\n');
-        spin_unlock(&cachep->spinlock);
        return 0;
 }
@@ -3820,13 +3955,12 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
        mutex_lock(&cache_chain_mutex);
        res = -EINVAL;
        list_for_each(p, &cache_chain) {
-                struct kmem_cache *cachep = list_entry(p, struct kmem_cache,
+                struct kmem_cache *cachep;
-                                                       next);
+                cachep = list_entry(p, struct kmem_cache, next);
                if (!strcmp(cachep->name, kbuf)) {
-                        if (limit < 1 ||
+                        if (limit < 1 || batchcount < 1 ||
-                            batchcount < 1 ||
+                                        batchcount > limit || shared < 0) {
-                            batchcount > limit || shared < 0) {
                                res = 0;
                        } else {
                                res = do_tune_cpucache(cachep, limit,
@@ -3840,6 +3974,159 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
                res = count;
        return res;
 }
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+/*
+ * seq_file ->start for the leak report: take cache_chain_mutex, step *pos
+ * entries along cache_chain and return the cache found there, or NULL once
+ * the walk reaches the list head. The mutex is still held on both returns.
+ */
+static void *leaks_start(struct seq_file *m, loff_t *pos)
+{
+        struct list_head *cursor;
+        loff_t remaining;
+        mutex_lock(&cache_chain_mutex);
+        cursor = cache_chain.next;
+        for (remaining = *pos; remaining != 0; remaining--) {
+                cursor = cursor->next;
+                if (cursor == &cache_chain)
+                        return NULL;
+        }
+        return list_entry(cursor, struct kmem_cache, next);
+}
+/*
+ * Record one occurrence of caller address @v in the table @n.
+ *
+ * As used here, n[0] is the limit the entry count is compared against, n[1]
+ * is the number of entries, and the entries are (address, hits) pairs that
+ * start at n[2]. The binary search below relies on the pairs being in
+ * ascending address order, which the insertion at the end preserves.
+ *
+ * Returns 1 if @v is zero (ignored), was already present (its hit count is
+ * bumped) or was inserted. Returns 0 when bumping n[1] makes it equal to
+ * n[0]; nothing is inserted in that case.
+ */
+static inline int add_caller(unsigned long *n, unsigned long v)
+{
+        unsigned long *p;
+        int l;
+        if (!v)
+                return 1;
+        l = n[1];
+        p = n + 2;
+        /* Binary search: p is the start of the remaining window, l its length in pairs */
+        while (l) {
+                int i = l/2;
+                unsigned long *q = p + 2 * i;
+                if (*q == v) {
+                        q[1]++;
+                        return 1;
+                }
+                if (*q > v) {
+                        l = i;
+                } else {
+                        p = q + 2;
+                        l -= i + 1;
+                }
+        }
+        /* Not found: p is now the insertion point */
+        if (++n[1] == n[0])
+                return 0;
+        /*
+         * Shift every pair from p onwards up by one slot; the byte count is
+         * the distance from p to the end of the previously stored pairs.
+         */
+        memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
+        p[0] = v;
+        p[1] = 1;
+        return 1;
+}
+/*
+ * Feed the recorded caller of every object in slab @s whose bufctl entry is
+ * BUFCTL_ACTIVE into the table @n. Does nothing when n[0] == n[1] on entry
+ * and stops as soon as add_caller() returns 0.
+ */
+static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
+{
+        void *objp;
+        int idx;
+        if (n[0] == n[1])
+                return;
+        objp = s->s_mem;
+        for (idx = 0; idx < c->num; idx++, objp += c->buffer_size) {
+                if (slab_bufctl(s)[idx] == BUFCTL_ACTIVE &&
+                    !add_caller(n, (unsigned long)*dbg_userword(c, objp)))
+                        return;
+        }
+}
+/*
+ * Print @address to @m. With CONFIG_KALLSYMS, a successful lookup is shown
+ * as "symbol+offset/size", followed by " [module]" when a module name is
+ * returned; otherwise the raw pointer value is printed.
+ */
+static void show_symbol(struct seq_file *m, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+        char symbuf[KSYM_NAME_LEN+1];
+        unsigned long sym_off, sym_len;
+        const char *sym;
+        char *module;
+        sym = kallsyms_lookup(address, &sym_len, &sym_off, &module, symbuf);
+        if (sym != NULL) {
+                seq_printf(m, "%s+%#lx/%#lx", sym, sym_off, sym_len);
+                if (module != NULL)
+                        seq_printf(m, " [%s]", module);
+                return;
+        }
+#endif
+        seq_printf(m, "%p", (void *)address);
+}
+/*
+ * seq_file ->show for the leak report: print, for one cache, how many active
+ * objects each recorded caller address accounts for.
+ *
+ * m->private points at the caller table filled by handle_slab(). Caches
+ * without both SLAB_STORE_USER and SLAB_RED_ZONE produce no output. If the
+ * table fills up (n[0] == n[1]) it is replaced by a larger zeroed one and
+ * nothing is printed for this call.
+ *
+ * Returns 0, or -ENOMEM if the larger table cannot be allocated.
+ */
+static int leaks_show(struct seq_file *m, void *p)
+{
+        struct kmem_cache *cachep = p;
+        struct list_head *q;
+        struct slab *slabp;
+        struct kmem_list3 *l3;
+        const char *name;
+        unsigned long *n = m->private;
+        int node;
+        int i;
+        if (!(cachep->flags & SLAB_STORE_USER))
+                return 0;
+        if (!(cachep->flags & SLAB_RED_ZONE))
+                return 0;
+        /* OK, we can do it */
+        /* Start with an empty table for this cache */
+        n[1] = 0;
+        /* Collect callers from the full and partial slab lists of every node */
+        for_each_online_node(node) {
+                l3 = cachep->nodelists[node];
+                if (!l3)
+                        continue;
+                check_irq_on();
+                spin_lock_irq(&l3->list_lock);
+                list_for_each(q, &l3->slabs_full) {
+                        slabp = list_entry(q, struct slab, list);
+                        handle_slab(n, cachep, slabp);
+                }
+                list_for_each(q, &l3->slabs_partial) {
+                        slabp = list_entry(q, struct slab, list);
+                        handle_slab(n, cachep, slabp);
+                }
+                spin_unlock_irq(&l3->list_lock);
+        }
+        name = cachep->name;
+        if (n[0] == n[1]) {
+                /* Increase the buffer size */
+                /* The allocation is done with cache_chain_mutex dropped */
+                mutex_unlock(&cache_chain_mutex);
+                /* New table: n[0] * 4 words, its limit word is set to n[0] * 2 below */
+                m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
+                if (!m->private) {
+                        /* Too bad, we are really out */
+                        /* Keep the old table and retake the mutex before failing */
+                        m->private = n;
+                        mutex_lock(&cache_chain_mutex);
+                        return -ENOMEM;
+                }
+                *(unsigned long *)m->private = n[0] * 2;
+                kfree(n);
+                mutex_lock(&cache_chain_mutex);
+                /* Now make sure this entry will be retried */
+                m->count = m->size;
+                return 0;
+        }
+        /* One line per table entry: cache name, hit count, caller symbol */
+        for (i = 0; i < n[1]; i++) {
+                seq_printf(m, "%s: %lu ", name, n[2*i+3]);
+                show_symbol(m, n[2*i+2]);
+                seq_putc(m, '\n');
+        }
+        return 0;
+}
+/*
+ * seq_file operations for the slab leak report. Only ->start and ->show are
+ * specific to it; ->next and ->stop are the s_next()/s_stop() callbacks.
+ */
+struct seq_operations slabstats_op = {
+        .start = leaks_start,
+        .next = s_next,
+        .stop = s_stop,
+        .show = leaks_show,
+};
+#endif
 #endif
 /**
diff --git a/mm/slob.c b/mm/slob.c
index a1f42bdc0245..9bcc7e2cabfd 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -294,6 +294,16 @@ void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
 }
 EXPORT_SYMBOL(kmem_cache_alloc);
+/*
+ * Allocate an object from cache @c with kmem_cache_alloc() and clear its
+ * first c->size bytes. Returns NULL if the allocation fails.
+ */
+void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
+{
+        void *obj;
+        obj = kmem_cache_alloc(c, flags);
+        if (!obj)
+                return NULL;
+        memset(obj, 0, c->size);
+        return obj;
+}
+EXPORT_SYMBOL(kmem_cache_zalloc);
 void kmem_cache_free(struct kmem_cache *c, void *b)
 {
        if (c->dtor)
diff --git a/mm/swap.c b/mm/swap.c
index b524ea90bddb..88895c249bc9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -209,19 +209,18 @@ int lru_add_drain_all(void)
 */
 void fastcall __page_cache_release(struct page *page)
 {
-        unsigned long flags;
+        if (PageLRU(page)) {
-        struct zone *zone = page_zone(page);
+                unsigned long flags;
+                struct zone *zone = page_zone(page);
-        spin_lock_irqsave(&zone->lru_lock, flags);
+                spin_lock_irqsave(&zone->lru_lock, flags);
-        if (TestClearPageLRU(page))
+                BUG_ON(!PageLRU(page));
+                __ClearPageLRU(page);
                del_page_from_lru(zone, page);
-        if (page_count(page) != 0)
+                spin_unlock_irqrestore(&zone->lru_lock, flags);
-                page = NULL;
+        }
-        spin_unlock_irqrestore(&zone->lru_lock, flags);
+        free_hot_page(page);
-        if (page)
-                free_hot_page(page);
 }
 EXPORT_SYMBOL(__page_cache_release);
 /*
@@ -245,7 +244,6 @@ void release_pages(struct page **pages, int nr, int cold)
        pagevec_init(&pages_to_free, cold);
        for (i = 0; i < nr; i++) {
                struct page *page = pages[i];
-                struct zone *pagezone;
                if (unlikely(PageCompound(page))) {
                        if (zone) {
@@ -259,23 +257,27 @@ void release_pages(struct page **pages, int nr, int cold)
                if (!put_page_testzero(page))
                        continue;
-                pagezone = page_zone(page);
+                if (PageLRU(page)) {
-                if (pagezone != zone) {
+                        struct zone *pagezone = page_zone(page);
-                        if (zone)
+                        if (pagezone != zone) {
-                                spin_unlock_irq(&zone->lru_lock);
+                                if (zone)
-                        zone = pagezone;
+                                        spin_unlock_irq(&zone->lru_lock);
-                        spin_lock_irq(&zone->lru_lock);
+                                zone = pagezone;
-                }
+                                spin_lock_irq(&zone->lru_lock);
-                if (TestClearPageLRU(page))
+                        }
+                        BUG_ON(!PageLRU(page));
+                        __ClearPageLRU(page);
                        del_page_from_lru(zone, page);
-                if (page_count(page) == 0) {
+                }
-                        if (!pagevec_add(&pages_to_free, page)) {
+                if (!pagevec_add(&pages_to_free, page)) {
+                        if (zone) {
                                spin_unlock_irq(&zone->lru_lock);
-                                __pagevec_free(&pages_to_free);
+                                zone = NULL;
-                                pagevec_reinit(&pages_to_free);
-                                zone = NULL;    /* No lock is held */
                        }
-                }
+                        __pagevec_free(&pages_to_free);
+                        pagevec_reinit(&pages_to_free);
+                }
        }
        if (zone)
                spin_unlock_irq(&zone->lru_lock);
@@ -343,8 +345,8 @@ void __pagevec_lru_add(struct pagevec *pvec)
                        zone = pagezone;
                        spin_lock_irq(&zone->lru_lock);
                }
-                if (TestSetPageLRU(page))
+                BUG_ON(PageLRU(page));
-                        BUG();
+                SetPageLRU(page);
                add_page_to_inactive_list(zone, page);
        }
        if (zone)
@@ -370,10 +372,10 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
                        zone = pagezone;
                        spin_lock_irq(&zone->lru_lock);
                }
-                if (TestSetPageLRU(page))
+                BUG_ON(PageLRU(page));
-                        BUG();
+                SetPageLRU(page);
-                if (TestSetPageActive(page))
+                BUG_ON(PageActive(page));
-                        BUG();
+                SetPageActive(page);
                add_page_to_active_list(zone, page);
        }
        if (zone)
@@ -510,7 +512,7 @@ long percpu_counter_sum(struct percpu_counter *fbc)
        spin_lock(&fbc->lock);
        ret = fbc->count;
-        for_each_cpu(cpu) {
+        for_each_possible_cpu(cpu) {
                long *pcount = per_cpu_ptr(fbc->counters, cpu);
                ret += *pcount;
        }
diff --git a/mm/swap_state.c b/mm/swap_state.c
index db8a3d3e1636..d7af296833fc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -15,6 +15,7 @@
 #include <linux/buffer_head.h>
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
+#include <linux/migrate.h>
 #include <asm/pgtable.h>
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1f9cf0d073b8..39aa9d129612 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -45,7 +45,7 @@ static const char Unused_offset[] = "Unused swap offset entry ";
 struct swap_list_t swap_list = {-1, -1};
-struct swap_info_struct swap_info[MAX_SWAPFILES];
+static struct swap_info_struct swap_info[MAX_SWAPFILES];
 static DEFINE_MUTEX(swapon_mutex);
@@ -116,7 +116,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
                                last_in_cluster = offset + SWAPFILE_CLUSTER;
                        else if (offset == last_in_cluster) {
                                spin_lock(&swap_lock);
-                                si->cluster_next = offset-SWAPFILE_CLUSTER-1;
+                                si->cluster_next = offset-SWAPFILE_CLUSTER+1;
                                goto cluster;
                        }
                        if (unlikely(--latency_ration < 0)) {
@@ -417,6 +417,61 @@ void free_swap_and_cache(swp_entry_t entry)
        }
 }
+#ifdef CONFIG_SOFTWARE_SUSPEND
+/*
+ * Find the swap type that corresponds to given device (if any)
+ *
+ * Only swap areas with SWP_WRITEOK set are considered. If @device is zero
+ * the first such area is returned; otherwise the area must be backed by a
+ * block device whose major/minor numbers match @device. Returns the index
+ * into swap_info[] on success or -ENODEV if nothing matches.
+ *
+ * This is needed for software suspend and is done in such a way that inode
+ * aliasing is allowed.
+ */
+int swap_type_of(dev_t device)
+{
+        int i;
+        spin_lock(&swap_lock);
+        for (i = 0; i < nr_swapfiles; i++) {
+                struct inode *inode;
+                if (!(swap_info[i].flags & SWP_WRITEOK))
+                        continue;
+                if (!device) {
+                        spin_unlock(&swap_lock);
+                        return i;
+                }
+                /* Inspect entry i itself; swap_info->... would always be entry 0 */
+                inode = swap_info[i].swap_file->f_dentry->d_inode;
+                if (S_ISBLK(inode->i_mode) &&
+                    device == MKDEV(imajor(inode), iminor(inode))) {
+                        spin_unlock(&swap_lock);
+                        return i;
+                }
+        }
+        spin_unlock(&swap_lock);
+        return -ENODEV;
+}
+/*
+ * Report the size of swap area @type in pages: the total when @free is zero,
+ * or the total minus the pages in use when @free is non-zero. Returns 0 when
+ * @type is not below nr_swapfiles or the area does not have SWP_WRITEOK set.
+ *
+ * This is needed for software suspend
+ */
+unsigned int count_swap_pages(int type, int free)
+{
+        struct swap_info_struct *si;
+        unsigned int count = 0;
+        if (type >= nr_swapfiles)
+                return 0;
+        si = &swap_info[type];
+        spin_lock(&swap_lock);
+        if (si->flags & SWP_WRITEOK) {
+                count = si->pages;
+                if (free)
+                        count -= si->inuse_pages;
+        }
+        spin_unlock(&swap_lock);
+        return count;
+}
+#endif
 /*
 * No need to decide whether this PTE shares the swap entry with others,
 * just let do_wp_page work it out if a write is requested later - to
diff --git a/mm/util.c b/mm/util.c
index 5f4bb59da63c..7368479220b3 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,20 +1,22 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/module.h>
+#include <linux/err.h>
+#include <asm/uaccess.h>
 /**
- * kzalloc - allocate memory. The memory is set to zero.
+ * __kzalloc - allocate memory. The memory is set to zero.
 * @size: how many bytes of memory are required.
 * @flags: the type of memory to allocate.
 */
-void *kzalloc(size_t size, gfp_t flags)
+void *__kzalloc(size_t size, gfp_t flags)
 {
-        void *ret = kmalloc(size, flags);
+        void *ret = ____kmalloc(size, flags);
        if (ret)
                memset(ret, 0, size);
        return ret;
 }
-EXPORT_SYMBOL(kzalloc);
+EXPORT_SYMBOL(__kzalloc);
 /*
 * kstrdup - allocate space for and copy an existing string
@@ -31,9 +33,44 @@ char *kstrdup(const char *s, gfp_t gfp)
                return NULL;
        len = strlen(s) + 1;
-        buf = kmalloc(len, gfp);
+        buf = ____kmalloc(len, gfp);
        if (buf)
                memcpy(buf, s, len);
        return buf;
 }
 EXPORT_SYMBOL(kstrdup);
+/*
+ * strndup_user - duplicate an existing string from user space
+ *
+ * @s: The string to duplicate
+ * @n: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Returns the kmalloc()ed, NUL-terminated copy, or an ERR_PTR():
+ * -EFAULT if strnlen_user() reports a zero length or the copy faults,
+ * -EINVAL if the reported length exceeds @n, -ENOMEM if allocation fails.
+ */
+char *strndup_user(const char __user *s, long n)
+{
+        long len = strnlen_user(s, n);
+        char *copy;
+        if (len == 0)
+                return ERR_PTR(-EFAULT);
+        if (len > n)
+                return ERR_PTR(-EINVAL);
+        copy = kmalloc(len, GFP_KERNEL);
+        if (copy == NULL)
+                return ERR_PTR(-ENOMEM);
+        if (copy_from_user(copy, s, len) != 0) {
+                kfree(copy);
+                return ERR_PTR(-EFAULT);
+        }
+        copy[len - 1] = '\0';
+        return copy;
+}
+EXPORT_SYMBOL(strndup_user);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4fe7e3aa02e2..acdf001d6941 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -33,39 +33,21 @@
 #include <linux/cpuset.h>
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
+#include <linux/delay.h>
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
 #include <linux/swapops.h>
-/* possible outcome of pageout() */
+#include "internal.h"
-typedef enum {
-        /* failed to write page out, page is locked */
-        PAGE_KEEP,
-        /* move page to the active list, page is locked */
-        PAGE_ACTIVATE,
-        /* page has been sent to the disk successfully, page is unlocked */
-        PAGE_SUCCESS,
-        /* page is clean and locked */
-        PAGE_CLEAN,
-} pageout_t;
 struct scan_control {
-        /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
-        unsigned long nr_to_scan;
        /* Incremented by the number of inactive pages that were scanned */
        unsigned long nr_scanned;
-        /* Incremented by the number of pages reclaimed */
-        unsigned long nr_reclaimed;
        unsigned long nr_mapped;        /* From page_state */
-        /* Ask shrink_caches, or shrink_zone to scan at this priority */
-        unsigned int priority;
        /* This context's GFP mask */
        gfp_t gfp_mask;
@@ -183,10 +165,11 @@ EXPORT_SYMBOL(remove_shrinker);
 *
 * Returns the number of slab objects which we shrunk.
 */
-int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages)
+unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
+                        unsigned long lru_pages)
 {
        struct shrinker *shrinker;
-        int ret = 0;
+        unsigned long ret = 0;
        if (scanned == 0)
                scanned = SWAP_CLUSTER_MAX;
@@ -306,9 +289,10 @@ static void handle_write_error(struct address_space *mapping,
 }
 /*
- * pageout is called by shrink_list() for each dirty page. Calls ->writepage().
+ * pageout is called by shrink_page_list() for each dirty page.
+ * Calls ->writepage().
 */
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+pageout_t pageout(struct page *page, struct address_space *mapping)
 {
        /*
         * If the page is dirty, only perform writeback if that write
@@ -376,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
        return PAGE_CLEAN;
 }
-static int remove_mapping(struct address_space *mapping, struct page *page)
+int remove_mapping(struct address_space *mapping, struct page *page)
 {
        if (!mapping)
                return 0;               /* truncate got there first */
@@ -414,14 +398,15 @@ cannot_free:
 }
 /*
- * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
+ * shrink_page_list() returns the number of reclaimed pages
 */
-static int shrink_list(struct list_head *page_list, struct scan_control *sc)
+static unsigned long shrink_page_list(struct list_head *page_list,
+                                        struct scan_control *sc)
 {
        LIST_HEAD(ret_pages);
        struct pagevec freed_pvec;
        int pgactivate = 0;
-        int reclaimed = 0;
+        unsigned long nr_reclaimed = 0;
        cond_resched();
@@ -464,12 +449,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
                 * Anonymous process memory has backing store?
                 * Try to allocate it some swap space here.
                 */
-                if (PageAnon(page) && !PageSwapCache(page)) {
+                if (PageAnon(page) && !PageSwapCache(page))
-                        if (!sc->may_swap)
-                                goto keep_locked;
                        if (!add_to_swap(page, GFP_ATOMIC))
                                goto activate_locked;
-                }
 #endif /* CONFIG_SWAP */
                mapping = page_mapping(page);
@@ -481,12 +463,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
                 * processes. Try to unmap it here.
                 */
                if (page_mapped(page) && mapping) {
-                        /*
-                         * No unmapping if we do not swap
-                         */
-                        if (!sc->may_swap)
-                                goto keep_locked;
                        switch (try_to_unmap(page, 0)) {
                        case SWAP_FAIL:
                                goto activate_locked;
@@ -561,7 +537,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 free_it:
                unlock_page(page);
-                reclaimed++;
+                nr_reclaimed++;
                if (!pagevec_add(&freed_pvec, page))
                        __pagevec_release_nonlru(&freed_pvec);
                continue;
@@ -579,483 +555,8 @@ keep:
        if (pagevec_count(&freed_pvec))
                __pagevec_release_nonlru(&freed_pvec);
        mod_page_state(pgactivate, pgactivate);
-        sc->nr_reclaimed += reclaimed;
+        return nr_reclaimed;
-        return reclaimed;
-}
-#ifdef CONFIG_MIGRATION
-static inline void move_to_lru(struct page *page)
-{
-        list_del(&page->lru);
-        if (PageActive(page)) {
-                /*
-                 * lru_cache_add_active checks that
-                 * the PG_active bit is off.
-                 */
-                ClearPageActive(page);
-                lru_cache_add_active(page);
-        } else {
-                lru_cache_add(page);
-        }
-        put_page(page);
-}
-/*
- * Add isolated pages on the list back to the LRU.
- *
- * returns the number of pages put back.
- */
-int putback_lru_pages(struct list_head *l)
-{
-        struct page *page;
-        struct page *page2;
-        int count = 0;
-        list_for_each_entry_safe(page, page2, l, lru) {
-                move_to_lru(page);
-                count++;
-        }
-        return count;
-}
-/*
- * Non migratable page
- */
-int fail_migrate_page(struct page *newpage, struct page *page)
-{
-        return -EIO;
-}
-EXPORT_SYMBOL(fail_migrate_page);
-/*
- * swapout a single page
- * page is locked upon entry, unlocked on exit
- */
-static int swap_page(struct page *page)
-{
-        struct address_space *mapping = page_mapping(page);
-        if (page_mapped(page) && mapping)
-                if (try_to_unmap(page, 1) != SWAP_SUCCESS)
-                        goto unlock_retry;
-        if (PageDirty(page)) {
-                /* Page is dirty, try to write it out here */
-                switch(pageout(page, mapping)) {
-                case PAGE_KEEP:
-                case PAGE_ACTIVATE:
-                        goto unlock_retry;
-                case PAGE_SUCCESS:
-                        goto retry;
-                case PAGE_CLEAN:
-                        ; /* try to free the page below */
-                }
-        }
-        if (PagePrivate(page)) {
-                if (!try_to_release_page(page, GFP_KERNEL) ||
-                    (!mapping && page_count(page) == 1))
-                        goto unlock_retry;
-        }
-        if (remove_mapping(mapping, page)) {
-                /* Success */
-                unlock_page(page);
-                return 0;
-        }
-unlock_retry:
-        unlock_page(page);
-retry:
-        return -EAGAIN;
 }
-EXPORT_SYMBOL(swap_page);
-/*
- * Page migration was first developed in the context of the memory hotplug
- * project. The main authors of the migration code are:
- *
- * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
- * Hirokazu Takahashi <taka@valinux.co.jp>
- * Dave Hansen <haveblue@us.ibm.com>
- * Christoph Lameter <clameter@sgi.com>
- */
-/*
- * Remove references for a page and establish the new page with the correct
- * basic settings to be able to stop accesses to the page.
- */
-int migrate_page_remove_references(struct page *newpage,
-                                struct page *page, int nr_refs)
-{
-        struct address_space *mapping = page_mapping(page);
-        struct page **radix_pointer;
-        /*
-         * Avoid doing any of the following work if the page count
-         * indicates that the page is in use or truncate has removed
-         * the page.
-         */
-        if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
-                return -EAGAIN;
-        /*
-         * Establish swap ptes for anonymous pages or destroy pte
-         * maps for files.
-         *
-         * In order to reestablish file backed mappings the fault handlers
-         * will take the radix tree_lock which may then be used to stop
-         * processses from accessing this page until the new page is ready.
-         *
-         * A process accessing via a swap pte (an anonymous page) will take a
-         * page_lock on the old page which will block the process until the
-         * migration attempt is complete. At that time the PageSwapCache bit
-         * will be examined. If the page was migrated then the PageSwapCache
-         * bit will be clear and the operation to retrieve the page will be
-         * retried which will find the new page in the radix tree. Then a new
-         * direct mapping may be generated based on the radix tree contents.
-         *
-         * If the page was not migrated then the PageSwapCache bit
-         * is still set and the operation may continue.
-         */
-        if (try_to_unmap(page, 1) == SWAP_FAIL)
-                /* A vma has VM_LOCKED set -> Permanent failure */
-                return -EPERM;
-        /*
-         * Give up if we were unable to remove all mappings.
-         */
-        if (page_mapcount(page))
-                return -EAGAIN;
-        write_lock_irq(&mapping->tree_lock);
-        radix_pointer = (struct page **)radix_tree_lookup_slot(
-                                                &mapping->page_tree,
-                                                page_index(page));
-        if (!page_mapping(page) || page_count(page) != nr_refs ||
-                        *radix_pointer != page) {
-                write_unlock_irq(&mapping->tree_lock);
-                return -EAGAIN;
-        }
-        /*
-         * Now we know that no one else is looking at the page.
-         *
-         * Certain minimal information about a page must be available
-         * in order for other subsystems to properly handle the page if they
-         * find it through the radix tree update before we are finished
-         * copying the page.
-         */
-        get_page(newpage);
-        newpage->index = page->index;
-        newpage->mapping = page->mapping;
-        if (PageSwapCache(page)) {
-                SetPageSwapCache(newpage);
-                set_page_private(newpage, page_private(page));
-        }
-        *radix_pointer = newpage;
-        __put_page(page);
-        write_unlock_irq(&mapping->tree_lock);
-        return 0;
-}
-EXPORT_SYMBOL(migrate_page_remove_references);
-/*
- * Copy the page to its new location
- */
-void migrate_page_copy(struct page *newpage, struct page *page)
-{
-        copy_highpage(newpage, page);
-        if (PageError(page))
-                SetPageError(newpage);
-        if (PageReferenced(page))
-                SetPageReferenced(newpage);
-        if (PageUptodate(page))
-                SetPageUptodate(newpage);
-        if (PageActive(page))
-                SetPageActive(newpage);
-        if (PageChecked(page))
-                SetPageChecked(newpage);
-        if (PageMappedToDisk(page))
-                SetPageMappedToDisk(newpage);
-        if (PageDirty(page)) {
-                clear_page_dirty_for_io(page);
-                set_page_dirty(newpage);
-        }
-        ClearPageSwapCache(page);
-        ClearPageActive(page);
-        ClearPagePrivate(page);
-        set_page_private(page, 0);
-        page->mapping = NULL;
-        /*
-         * If any waiters have accumulated on the new page then
-         * wake them up.
-         */
-        if (PageWriteback(newpage))
-                end_page_writeback(newpage);
-}
-EXPORT_SYMBOL(migrate_page_copy);
-/*
- * Common logic to directly migrate a single page suitable for
- * pages that do not use PagePrivate.
- *
- * Pages are locked upon entry and exit.
- */
-int migrate_page(struct page *newpage, struct page *page)
-{
-        int rc;
-        BUG_ON(PageWriteback(page));    /* Writeback must be complete */
-        rc = migrate_page_remove_references(newpage, page, 2);
-        if (rc)
-                return rc;
-        migrate_page_copy(newpage, page);
-        /*
-         * Remove auxiliary swap entries and replace
-         * them with real ptes.
-         *
-         * Note that a real pte entry will allow processes that are not
-         * waiting on the page lock to use the new page via the page tables
-         * before the new page is unlocked.
-         */
-        remove_from_swap(newpage);
-        return 0;
-}
-EXPORT_SYMBOL(migrate_page);
-/*
- * migrate_pages
- *
- * Two lists are passed to this function. The first list
- * contains the pages isolated from the LRU to be migrated.
- * The second list contains new pages that the pages isolated
- * can be moved to. If the second list is NULL then all
- * pages are swapped out.
- *
- * The function returns after 10 attempts or if no pages
- * are movable anymore because to has become empty
- * or no retryable pages exist anymore.
- *
- * Return: Number of pages not migrated when "to" ran empty.
- */
-int migrate_pages(struct list_head *from, struct list_head *to,
-                  struct list_head *moved, struct list_head *failed)
-{
-        int retry;
-        int nr_failed = 0;
-        int pass = 0;
-        struct page *page;
-        struct page *page2;
-        int swapwrite = current->flags & PF_SWAPWRITE;
-        int rc;
-        if (!swapwrite)
-                current->flags |= PF_SWAPWRITE;
-redo:
-        retry = 0;
-        list_for_each_entry_safe(page, page2, from, lru) {
-                struct page *newpage = NULL;
-                struct address_space *mapping;
-                cond_resched();
-                rc = 0;
-                if (page_count(page) == 1)
-                        /* page was freed from under us. So we are done. */
-                        goto next;
-                if (to && list_empty(to))
-                        break;
-                /*
-                 * Skip locked pages during the first two passes to give the
-                 * functions holding the lock time to release the page. Later we
-                 * use lock_page() to have a higher chance of acquiring the
-                 * lock.
-                 */
-                rc = -EAGAIN;
-                if (pass > 2)
-                        lock_page(page);
-                else
-                        if (TestSetPageLocked(page))
-                                goto next;
-                /*
-                 * Only wait on writeback if we have already done a pass where
-                 * we we may have triggered writeouts for lots of pages.
-                 */
-                if (pass > 0) {
-                        wait_on_page_writeback(page);
-                } else {
-                        if (PageWriteback(page))
-                                goto unlock_page;
-                }
-                /*
-                 * Anonymous pages must have swap cache references otherwise
-                 * the information contained in the page maps cannot be
-                 * preserved.
-                 */
-                if (PageAnon(page) && !PageSwapCache(page)) {
-                        if (!add_to_swap(page, GFP_KERNEL)) {
-                                rc = -ENOMEM;
-                                goto unlock_page;
-                        }
-                }
-                if (!to) {
-                        rc = swap_page(page);
-                        goto next;
-                }
-                newpage = lru_to_page(to);
-                lock_page(newpage);
-                /*
-                 * Pages are properly locked and writeback is complete.
-                 * Try to migrate the page.
-                 */
-                mapping = page_mapping(page);
-                if (!mapping)
-                        goto unlock_both;
-                if (mapping->a_ops->migratepage) {
-                        /*
-                         * Most pages have a mapping and most filesystems
-                         * should provide a migration function. Anonymous
-                         * pages are part of swap space which also has its
-                         * own migration function. This is the most common
-                         * path for page migration.
-                         */
-                        rc = mapping->a_ops->migratepage(newpage, page);
-                        goto unlock_both;
-                }
-                /*
-                 * Default handling if a filesystem does not provide
-                 * a migration function. We can only migrate clean
-                 * pages so try to write out any dirty pages first.
-                 */
-                if (PageDirty(page)) {
-                        switch (pageout(page, mapping)) {
-                        case PAGE_KEEP:
-                        case PAGE_ACTIVATE:
-                                goto unlock_both;
-                        case PAGE_SUCCESS:
-                                unlock_page(newpage);
-                                goto next;
-                        case PAGE_CLEAN:
-                                ; /* try to migrate the page below */
-                        }
-                }
-                /*
-                 * Buffers are managed in a filesystem specific way.
-                 * We must have no buffers or drop them.
-                 */
-                if (!page_has_buffers(page) ||
-                    try_to_release_page(page, GFP_KERNEL)) {
-                        rc = migrate_page(newpage, page);
-                        goto unlock_both;
-                }
-                /*
-                 * On early passes with mapped pages simply
-                 * retry. There may be a lock held for some
-                 * buffers that may go away. Later
-                 * swap them out.
-                 */
-                if (pass > 4) {
-                        /*
-                         * Persistently unable to drop buffers..... As a
-                         * measure of last resort we fall back to
-                         * swap_page().
-                         */
-                        unlock_page(newpage);
-                        newpage = NULL;
-                        rc = swap_page(page);
-                        goto next;
-                }
-unlock_both:
-                unlock_page(newpage);
-unlock_page:
-                unlock_page(page);
-next:
-                if (rc == -EAGAIN) {
-                        retry++;
-                } else if (rc) {
-                        /* Permanent failure */
-                        list_move(&page->lru, failed);
-                        nr_failed++;
-                } else {
-                        if (newpage) {
-                                /* Successful migration. Return page to LRU */
-                                move_to_lru(newpage);
-                        }
-                        list_move(&page->lru, moved);
-                }
-        }
-        if (retry && pass++ < 10)
-                goto redo;
-        if (!swapwrite)
-                current->flags &= ~PF_SWAPWRITE;
-        return nr_failed + retry;
-}
-/*
- * Isolate one page from the LRU lists and put it on the
- * indicated list with elevated refcount.
- *
- * Result:
- *  0 = page not on LRU list
- *  1 = page removed from LRU list and added to the specified list.
- */
-int isolate_lru_page(struct page *page)
-{
-        int ret = 0;
-        if (PageLRU(page)) {
-                struct zone *zone = page_zone(page);
-                spin_lock_irq(&zone->lru_lock);
-                if (TestClearPageLRU(page)) {
-                        ret = 1;
-                        get_page(page);
-                        if (PageActive(page))
-                                del_page_from_active_list(zone, page);
-                        else
-                                del_page_from_inactive_list(zone, page);
-                }
-                spin_unlock_irq(&zone->lru_lock);
-        }
-        return ret;
-}
-#endif
 /*
 * zone->lru_lock is heavily contended.  Some of the functions that
@@ -1074,32 +575,35 @@ int isolate_lru_page(struct page *page)
 *
 * returns how many pages were moved onto *@dst.
 */
-static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
+static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
-                             struct list_head *dst, int *scanned)
+                struct list_head *src, struct list_head *dst,
+                unsigned long *scanned)
 {
-        int nr_taken = 0;
+        unsigned long nr_taken = 0;
        struct page *page;
-        int scan = 0;
+        unsigned long scan;
-        while (scan++ < nr_to_scan && !list_empty(src)) {
+        for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+                struct list_head *target;
                page = lru_to_page(src);
                prefetchw_prev_lru_page(page, src, flags);
-                if (!TestClearPageLRU(page))
+                BUG_ON(!PageLRU(page));
-                        BUG();
                list_del(&page->lru);
-                if (get_page_testone(page)) {
+                target = src;
+                if (likely(get_page_unless_zero(page))) {
                        /*
-                         * It is being freed elsewhere
+                         * Be careful not to clear PageLRU until after we're
+                         * sure the page is not being freed elsewhere -- the
+                         * page release code relies on it.
                         */
-                        __put_page(page);
+                        ClearPageLRU(page);
-                        SetPageLRU(page);
+                        target = dst;
-                        list_add(&page->lru, src);
-                        continue;
-                } else {
-                        list_add(&page->lru, dst);
                        nr_taken++;
-                }
+                } /* else it is being freed elsewhere */
+                list_add(&page->lru, target);
        }
        *scanned = scan;
@@ -1107,23 +611,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 }
 /*
- * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
+ * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
+ * of reclaimed pages.
 */
-static void shrink_cache(struct zone *zone, struct scan_control *sc)
+static unsigned long shrink_inactive_list(unsigned long max_scan,
+                                struct zone *zone, struct scan_control *sc)
 {
        LIST_HEAD(page_list);
        struct pagevec pvec;
-        int max_scan = sc->nr_to_scan;
+        unsigned long nr_scanned = 0;
+        unsigned long nr_reclaimed = 0;
        pagevec_init(&pvec, 1);
        lru_add_drain();
        spin_lock_irq(&zone->lru_lock);
-        while (max_scan > 0) {
+        do {
                struct page *page;
-                int nr_taken;
+                unsigned long nr_taken;
-                int nr_scan;
+                unsigned long nr_scan;
-                int nr_freed;
+                unsigned long nr_freed;
                nr_taken = isolate_lru_pages(sc->swap_cluster_max,
                                             &zone->inactive_list,
@@ -1132,12 +639,9 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
                zone->pages_scanned += nr_scan;
                spin_unlock_irq(&zone->lru_lock);
-                if (nr_taken == 0)
+                nr_scanned += nr_scan;
-                        goto done;
+                nr_freed = shrink_page_list(&page_list, sc);
+                nr_reclaimed += nr_freed;
-                max_scan -= nr_scan;
-                nr_freed = shrink_list(&page_list, sc);
                local_irq_disable();
                if (current_is_kswapd()) {
                        __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
@@ -1146,14 +650,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
                        __mod_page_state_zone(zone, pgscan_direct, nr_scan);
                __mod_page_state_zone(zone, pgsteal, nr_freed);
+                if (nr_taken == 0)
+                        goto done;
                spin_lock(&zone->lru_lock);
                /*
                 * Put back any unfreeable pages.
                 */
                while (!list_empty(&page_list)) {
                        page = lru_to_page(&page_list);
-                        if (TestSetPageLRU(page))
+                        BUG_ON(PageLRU(page));
-                                BUG();
+                        SetPageLRU(page);
                        list_del(&page->lru);
                        if (PageActive(page))
                                add_page_to_active_list(zone, page);
@@ -1165,10 +672,12 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
                                spin_lock_irq(&zone->lru_lock);
                        }
                }
-        }
+        } while (nr_scanned < max_scan);
-        spin_unlock_irq(&zone->lru_lock);
+        spin_unlock(&zone->lru_lock);
 done:
+        local_irq_enable();
        pagevec_release(&pvec);
+        return nr_reclaimed;
 }
 /*
@@ -1188,13 +697,12 @@ done:
 * The downside is that we have to touch page->_count against each page.
 * But we had to alter page->flags anyway.
 */
-static void
+static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
-refill_inactive_zone(struct zone *zone, struct scan_control *sc)
+                                struct scan_control *sc)
 {
-        int pgmoved;
+        unsigned long pgmoved;
        int pgdeactivate = 0;
-        int pgscanned;
+        unsigned long pgscanned;
-        int nr_pages = sc->nr_to_scan;
        LIST_HEAD(l_hold);      /* The pages which were snipped off */
        LIST_HEAD(l_inactive);  /* Pages to go onto the inactive_list */
        LIST_HEAD(l_active);    /* Pages to go onto the active_list */
@@ -1202,7 +710,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
        struct pagevec pvec;
        int reclaim_mapped = 0;
-        if (unlikely(sc->may_swap)) {
+        if (sc->may_swap) {
                long mapped_ratio;
                long distress;
                long swap_tendency;
@@ -1272,10 +780,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
        while (!list_empty(&l_inactive)) {
                page = lru_to_page(&l_inactive);
                prefetchw_prev_lru_page(page, &l_inactive, flags);
-                if (TestSetPageLRU(page))
+                BUG_ON(PageLRU(page));
-                        BUG();
+                SetPageLRU(page);
-                if (!TestClearPageActive(page))
+                BUG_ON(!PageActive(page));
-                        BUG();
+                ClearPageActive(page);
                list_move(&page->lru, &zone->inactive_list);
                pgmoved++;
                if (!pagevec_add(&pvec, page)) {
@@ -1301,8 +810,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
        while (!list_empty(&l_active)) {
                page = lru_to_page(&l_active);
                prefetchw_prev_lru_page(page, &l_active, flags);
-                if (TestSetPageLRU(page))
+                BUG_ON(PageLRU(page));
-                        BUG();
+                SetPageLRU(page);
                BUG_ON(!PageActive(page));
                list_move(&page->lru, &zone->active_list);
                pgmoved++;
@@ -1327,11 +836,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 /*
 * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
 */
-static void
+static unsigned long shrink_zone(int priority, struct zone *zone,
-shrink_zone(struct zone *zone, struct scan_control *sc)
+                                struct scan_control *sc)
 {
        unsigned long nr_active;
        unsigned long nr_inactive;
+        unsigned long nr_to_scan;
+        unsigned long nr_reclaimed = 0;
        atomic_inc(&zone->reclaim_in_progress);
@@ -1339,14 +850,14 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
         * Add one to `nr_to_scan' just to make sure that the kernel will
         * slowly sift through the active list.
         */
-        zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
+        zone->nr_scan_active += (zone->nr_active >> priority) + 1;
        nr_active = zone->nr_scan_active;
        if (nr_active >= sc->swap_cluster_max)
                zone->nr_scan_active = 0;
        else
                nr_active = 0;
-        zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
+        zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
        nr_inactive = zone->nr_scan_inactive;
        if (nr_inactive >= sc->swap_cluster_max)
                zone->nr_scan_inactive = 0;
@@ -1355,23 +866,25 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
        while (nr_active || nr_inactive) {
                if (nr_active) {
-                        sc->nr_to_scan = min(nr_active,
+                        nr_to_scan = min(nr_active,
                                        (unsigned long)sc->swap_cluster_max);
-                        nr_active -= sc->nr_to_scan;
+                        nr_active -= nr_to_scan;
-                        refill_inactive_zone(zone, sc);
+                        shrink_active_list(nr_to_scan, zone, sc);
                }
                if (nr_inactive) {
-                        sc->nr_to_scan = min(nr_inactive,
+                        nr_to_scan = min(nr_inactive,
                                        (unsigned long)sc->swap_cluster_max);
-                        nr_inactive -= sc->nr_to_scan;
+                        nr_inactive -= nr_to_scan;
-                        shrink_cache(zone, sc);
+                        nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
+                                                                sc);
                }
        }
        throttle_vm_writeout();
        atomic_dec(&zone->reclaim_in_progress);
+        return nr_reclaimed;
 }
 /*
@@ -1390,9 +903,10 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
 * If a zone is deemed to be full of pinned pages then just give it a light
 * scan then give up on it.
 */
-static void
+static unsigned long shrink_zones(int priority, struct zone **zones,
-shrink_caches(struct zone **zones, struct scan_control *sc)
+                                        struct scan_control *sc)
 {
+        unsigned long nr_reclaimed = 0;
        int i;
        for (i = 0; zones[i] != NULL; i++) {
@@ -1404,15 +918,16 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
                if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
                        continue;
-                zone->temp_priority = sc->priority;
+                zone->temp_priority = priority;
-                if (zone->prev_priority > sc->priority)
+                if (zone->prev_priority > priority)
-                        zone->prev_priority = sc->priority;
+                        zone->prev_priority = priority;
-                if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
+                if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                        continue;       /* Let kswapd poll it */
-                shrink_zone(zone, sc);
+                nr_reclaimed += shrink_zone(priority, zone, sc);
        }
+        return nr_reclaimed;
 }
 
 /*
@@ -1428,19 +943,21 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
 * holds filesystem locks which prevent writeout this might not work, and the
 * allocation attempt will fail.
 */
-int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 {
        int priority;
        int ret = 0;
-        int total_scanned = 0, total_reclaimed = 0;
+        unsigned long total_scanned = 0;
+        unsigned long nr_reclaimed = 0;
        struct reclaim_state *reclaim_state = current->reclaim_state;
-        struct scan_control sc;
        unsigned long lru_pages = 0;
        int i;
+        struct scan_control sc = {
-        sc.gfp_mask = gfp_mask;
+                .gfp_mask = gfp_mask,
-        sc.may_writepage = !laptop_mode;
+                .may_writepage = !laptop_mode,
-        sc.may_swap = 1;
+                .swap_cluster_max = SWAP_CLUSTER_MAX,
+                .may_swap = 1,
+        };
        inc_page_state(allocstall);
@@ -1457,20 +974,16 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
                sc.nr_mapped = read_page_state(nr_mapped);
                sc.nr_scanned = 0;
-                sc.nr_reclaimed = 0;
-                sc.priority = priority;
-                sc.swap_cluster_max = SWAP_CLUSTER_MAX;
                if (!priority)
                        disable_swap_token();
-                shrink_caches(zones, &sc);
+                nr_reclaimed += shrink_zones(priority, zones, &sc);
                shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
                if (reclaim_state) {
-                        sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+                        nr_reclaimed += reclaim_state->reclaimed_slab;
                        reclaim_state->reclaimed_slab = 0;
                }
                total_scanned += sc.nr_scanned;
-                total_reclaimed += sc.nr_reclaimed;
+                if (nr_reclaimed >= sc.swap_cluster_max) {
-                if (total_reclaimed >= sc.swap_cluster_max) {
                        ret = 1;
                        goto out;
                }
@@ -1482,7 +995,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
                 * that's undesirable in laptop mode, where we *want* lumpy
                 * writeout.  So in laptop mode, write out the whole world.
                 */
-                if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) {
+                if (total_scanned > sc.swap_cluster_max +
+                                        sc.swap_cluster_max / 2) {
                        wakeup_pdflush(laptop_mode ? 0 : total_scanned);
                        sc.may_writepage = 1;
                }
@@ -1528,22 +1042,26 @@ out:
 * the page allocator fallback scheme to ensure that aging of pages is balanced
 * across the zones.
 */
-static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
+                                int order)
 {
-        int to_free = nr_pages;
+        unsigned long to_free = nr_pages;
        int all_zones_ok;
        int priority;
        int i;
-        int total_scanned, total_reclaimed;
+        unsigned long total_scanned;
+        unsigned long nr_reclaimed;
        struct reclaim_state *reclaim_state = current->reclaim_state;
-        struct scan_control sc;
+        struct scan_control sc = {
+                .gfp_mask = GFP_KERNEL,
+                .may_swap = 1,
+                .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
+        };
 loop_again:
        total_scanned = 0;
-        total_reclaimed = 0;
+        nr_reclaimed = 0;
-        sc.gfp_mask = GFP_KERNEL;
+        sc.may_writepage = !laptop_mode;
-        sc.may_writepage = !laptop_mode;
-        sc.may_swap = 1;
        sc.nr_mapped = read_page_state(nr_mapped);
        inc_page_state(pageoutrun);
@@ -1624,15 +1142,11 @@ scan:
                        if (zone->prev_priority > priority)
                                zone->prev_priority = priority;
                        sc.nr_scanned = 0;
-                        sc.nr_reclaimed = 0;
+                        nr_reclaimed += shrink_zone(priority, zone, &sc);
-                        sc.priority = priority;
-                        sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
-                        shrink_zone(zone, &sc);
                        reclaim_state->reclaimed_slab = 0;
                        nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
                                                lru_pages);
-                        sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+                        nr_reclaimed += reclaim_state->reclaimed_slab;
-                        total_reclaimed += sc.nr_reclaimed;
                        total_scanned += sc.nr_scanned;
                        if (zone->all_unreclaimable)
                                continue;
@@ -1645,10 +1159,10 @@ scan:
                         * even in laptop mode
                         */
                        if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
-                            total_scanned > total_reclaimed+total_reclaimed/2)
+                            total_scanned > nr_reclaimed + nr_reclaimed / 2)
                                sc.may_writepage = 1;
                }
-                if (nr_pages && to_free > total_reclaimed)
+                if (nr_pages && to_free > nr_reclaimed)
                        continue;       /* swsusp: need to do more work */
                if (all_zones_ok)
                        break;          /* kswapd: all done */
@@ -1665,7 +1179,7 @@ scan:
                 * matches the direct reclaim path behaviour in terms of impact
                 * on zone->*_priority.
                 */
-                if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages))
+                if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
                        break;
        }
 out:
@@ -1679,7 +1193,7 @@ out:
                goto loop_again;
        }
-        return total_reclaimed;
+        return nr_reclaimed;
 }
 /*
@@ -1779,24 +1293,31 @@ void wakeup_kswapd(struct zone *zone, int order)
 * Try to free `nr_pages' of memory, system-wide.  Returns the number of freed
 * pages.
 */
-int shrink_all_memory(int nr_pages)
+unsigned long shrink_all_memory(unsigned long nr_pages)
 {
        pg_data_t *pgdat;
-        int nr_to_free = nr_pages;
+        unsigned long nr_to_free = nr_pages;
-        int ret = 0;
+        unsigned long ret = 0;
+        unsigned retry = 2;
        struct reclaim_state reclaim_state = {
                .reclaimed_slab = 0,
        };
        current->reclaim_state = &reclaim_state;
-        for_each_pgdat(pgdat) {
+repeat:
-                int freed;
+        for_each_online_pgdat(pgdat) {
+                unsigned long freed;
                freed = balance_pgdat(pgdat, nr_to_free, 0);
                ret += freed;
                nr_to_free -= freed;
-                if (nr_to_free <= 0)
+                if ((long)nr_to_free <= 0)
                        break;
        }
+        if (retry-- && ret < nr_pages) {
+                blk_congestion_wait(WRITE, HZ/5);
+                goto repeat;
+        }
        current->reclaim_state = NULL;
        return ret;
 }
@@ -1808,14 +1329,13 @@ int shrink_all_memory(int nr_pages)
   away, we get changed to run anywhere: as the first one comes back,
   restore their cpu bindings. */
 static int __devinit cpu_callback(struct notifier_block *nfb,
-                                  unsigned long action,
+                                  unsigned long action, void *hcpu)
-                                  void *hcpu)
 {
        pg_data_t *pgdat;
        cpumask_t mask;
        if (action == CPU_ONLINE) {
-                for_each_pgdat(pgdat) {
+                for_each_online_pgdat(pgdat) {
                        mask = node_to_cpumask(pgdat->node_id);
                        if (any_online_cpu(mask) != NR_CPUS)
                                /* One of our CPUs online: restore mask */
@@ -1829,10 +1349,17 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
 static int __init kswapd_init(void)
 {
        pg_data_t *pgdat;
        swap_setup();
-        for_each_pgdat(pgdat)
+        for_each_online_pgdat(pgdat) {
-                pgdat->kswapd
+                pid_t pid;
-                = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
+                pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
+                BUG_ON(pid < 0);
+                read_lock(&tasklist_lock);
+                pgdat->kswapd = find_task_by_pid(pid);
+                read_unlock(&tasklist_lock);
+        }
        total_memory = nr_free_pagecache_pages();
        hotcpu_notifier(cpu_callback, 0);
        return 0;
@@ -1874,46 +1401,24 @@ int zone_reclaim_interval __read_mostly = 30*HZ;
 /*
 * Try to free up some pages from this zone through reclaim.
 */
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 {
-        int nr_pages;
+        /* Minimum pages needed in order to stay on node */
+        const unsigned long nr_pages = 1 << order;
        struct task_struct *p = current;
        struct reclaim_state reclaim_state;
-        struct scan_control sc;
+        int priority;
-        cpumask_t mask;
+        unsigned long nr_reclaimed = 0;
-        int node_id;
+        struct scan_control sc = {
+                .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
-        if (time_before(jiffies,
+                .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
-                zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
+                .nr_mapped = read_page_state(nr_mapped),
-                        return 0;
+                .swap_cluster_max = max_t(unsigned long, nr_pages,
+                                        SWAP_CLUSTER_MAX),
-        if (!(gfp_mask & __GFP_WAIT) ||
+                .gfp_mask = gfp_mask,
-                zone->all_unreclaimable ||
+        };
-                atomic_read(&zone->reclaim_in_progress) > 0 ||
-                (p->flags & PF_MEMALLOC))
-                        return 0;
-        node_id = zone->zone_pgdat->node_id;
-        mask = node_to_cpumask(node_id);
-        if (!cpus_empty(mask) && node_id != numa_node_id())
-                return 0;
-        sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
-        sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
-        sc.nr_scanned = 0;
-        sc.nr_reclaimed = 0;
-        sc.priority = ZONE_RECLAIM_PRIORITY + 1;
-        sc.nr_mapped = read_page_state(nr_mapped);
-        sc.gfp_mask = gfp_mask;
        disable_swap_token();
-        nr_pages = 1 << order;
-        if (nr_pages > SWAP_CLUSTER_MAX)
-                sc.swap_cluster_max = nr_pages;
-        else
-                sc.swap_cluster_max = SWAP_CLUSTER_MAX;
        cond_resched();
        /*
         * We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -1928,17 +1433,20 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
         * Free memory by calling shrink zone with increasing priorities
         * until we have enough memory freed.
         */
+        priority = ZONE_RECLAIM_PRIORITY;
        do {
-                sc.priority--;
+                nr_reclaimed += shrink_zone(priority, zone, &sc);
-                shrink_zone(zone, &sc);
+                priority--;
+        } while (priority >= 0 && nr_reclaimed < nr_pages);
-        } while (sc.nr_reclaimed < nr_pages && sc.priority > 0);
+        if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
-        if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
                /*
-                 * shrink_slab does not currently allow us to determine
+                 * shrink_slab() does not currently allow us to determine how
-                 * how many pages were freed in the zone. So we just
+                 * many pages were freed in this zone. So we just shake the slab
-                 * shake the slab and then go offnode for a single allocation.
+                 * a bit and then go off node for this particular allocation
+                 * despite possibly having freed enough memory to allocate in
+                 * this zone.  If we freed local memory then the next
+                 * allocations will be local again.
                 *
                 * shrink_slab will free memory on all zones and may take
                 * a long time.
@@ -1949,10 +1457,54 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
        p->reclaim_state = NULL;
        current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
-        if (sc.nr_reclaimed == 0)
+        if (nr_reclaimed == 0) {
+                /*
+                 * We were unable to reclaim any pages from this zone.  We
+                 * now allow off node accesses for a certain time period before
+                 * trying again to reclaim pages from the local zone.
+                 */
                zone->last_unsuccessful_zone_reclaim = jiffies;
+        }
-        return sc.nr_reclaimed >= nr_pages;
+        return nr_reclaimed >= nr_pages;
 }
-#endif
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+        cpumask_t mask;
+        int node_id;
+        /*
+         * Do not reclaim if there was a recent unsuccessful attempt at zone
+         * reclaim.  In that case we let allocations go off node for the
+         * zone_reclaim_interval.  Otherwise we would scan for each off-node
+         * page allocation.
+         */
+        if (time_before(jiffies,
+                zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
+                        return 0;
+        /*
+         * Do not scan if the allocation must not be delayed, if the zone has
+         * no reclaimable pages, if a reclaim of this zone is already in
+         * progress, or if the current task has PF_MEMALLOC set.
+         */
+        if (!(gfp_mask & __GFP_WAIT) ||
+                zone->all_unreclaimable ||
+                atomic_read(&zone->reclaim_in_progress) > 0 ||
+                (current->flags & PF_MEMALLOC))
+                        return 0;
+        /*
+         * Only run zone reclaim on the local zone or on zones that do not
+         * have associated processors. This will favor the local processor
+         * over remote processors and spread off node memory allocations
+         * as wide as possible.
+         */
+        node_id = zone->zone_pgdat->node_id;
+        mask = node_to_cpumask(node_id);
+        if (!cpus_empty(mask) && node_id != numa_node_id())
+                return 0;
+        return __zone_reclaim(zone, gfp_mask, order);
+}
+#endif