Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          6
-rw-r--r--  mm/Makefile         2
-rw-r--r--  mm/filemap.c        2
-rw-r--r--  mm/hugetlb.c      286
-rw-r--r--  mm/internal.h      34
-rw-r--r--  mm/memory.c        21
-rw-r--r--  mm/mempolicy.c    117
-rw-r--r--  mm/mempool.c        4
-rw-r--r--  mm/migrate.c      655
-rw-r--r--  mm/mmap.c          10
-rw-r--r--  mm/mprotect.c      12
-rw-r--r--  mm/nommu.c          4
-rw-r--r--  mm/page_alloc.c   113
-rw-r--r--  mm/readahead.c     32
-rw-r--r--  mm/rmap.c          14
-rw-r--r--  mm/shmem.c          7
-rw-r--r--  mm/slab.c         890
-rw-r--r--  mm/swap.c          64
-rw-r--r--  mm/swap_state.c     1
-rw-r--r--  mm/swapfile.c       2
-rw-r--r--  mm/vmscan.c       882
21 files changed, 1773 insertions, 1385 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a9cb80ae64..bd80460360 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,5 +137,11 @@ config SPLIT_PTLOCK_CPUS
137# support for page migration 137# support for page migration
138# 138#
139config MIGRATION 139config MIGRATION
140 bool "Page migration"
140 def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM 141 def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
141 depends on SWAP 142 depends on SWAP
143 help
 144	  Allows the physical location of a process's pages to be changed
 145	  while their virtual addresses stay the same. This is useful, for
 146	  example, on NUMA systems to move pages closer to the processors
 147	  that access them.
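
From user space, the behaviour this option enables is driven through the mempolicy system calls; mbind(2) with MPOL_MF_MOVE asks the kernel to migrate pages that already sit on the wrong node. A minimal sketch, assuming libnuma's <numaif.h> wrapper (link with -lnuma); this is illustration only, not part of the patch:

    /* Bind an anonymous buffer to node 0 and ask the kernel to migrate
     * any of its pages that already live elsewhere. */
    #include <numaif.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            unsigned long nodemask = 1UL << 0;      /* node 0 */
            size_t len = 16 * 4096;
            void *buf = NULL;

            if (posix_memalign(&buf, 4096, len))    /* mbind() wants a page-aligned range */
                    return 1;
            if (mbind(buf, len, MPOL_BIND, &nodemask,
                      8 * sizeof(nodemask), MPOL_MF_MOVE) != 0)
                    perror("mbind");
            free(buf);
            return 0;
    }
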
diff --git a/mm/Makefile b/mm/Makefile
index 9aa03fa1dc..f10c753dce 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o
22obj-$(CONFIG_SLAB) += slab.o 22obj-$(CONFIG_SLAB) += slab.o
23obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o 23obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
24obj-$(CONFIG_FS_XIP) += filemap_xip.o 24obj-$(CONFIG_FS_XIP) += filemap_xip.o
25obj-$(CONFIG_MIGRATION) += migrate.o
26
diff --git a/mm/filemap.c b/mm/filemap.c
index 44da3d4769..e8f58f7dd7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -30,6 +30,8 @@
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include "filemap.h" 32#include "filemap.h"
33#include "internal.h"
34
33/* 35/*
34 * FIXME: remove all knowledge of the buffer layer from the core VM 36 * FIXME: remove all knowledge of the buffer layer from the core VM
35 */ 37 */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 508707704d..ebad6bbb35 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -13,24 +13,48 @@
13#include <linux/pagemap.h> 13#include <linux/pagemap.h>
14#include <linux/mempolicy.h> 14#include <linux/mempolicy.h>
15#include <linux/cpuset.h> 15#include <linux/cpuset.h>
16#include <linux/mutex.h>
16 17
17#include <asm/page.h> 18#include <asm/page.h>
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
19 20
20#include <linux/hugetlb.h> 21#include <linux/hugetlb.h>
22#include "internal.h"
21 23
22const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; 24const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
23static unsigned long nr_huge_pages, free_huge_pages; 25static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
24unsigned long max_huge_pages; 26unsigned long max_huge_pages;
25static struct list_head hugepage_freelists[MAX_NUMNODES]; 27static struct list_head hugepage_freelists[MAX_NUMNODES];
26static unsigned int nr_huge_pages_node[MAX_NUMNODES]; 28static unsigned int nr_huge_pages_node[MAX_NUMNODES];
27static unsigned int free_huge_pages_node[MAX_NUMNODES]; 29static unsigned int free_huge_pages_node[MAX_NUMNODES];
28
29/* 30/*
30 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages 31 * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
31 */ 32 */
32static DEFINE_SPINLOCK(hugetlb_lock); 33static DEFINE_SPINLOCK(hugetlb_lock);
33 34
35static void clear_huge_page(struct page *page, unsigned long addr)
36{
37 int i;
38
39 might_sleep();
40 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
41 cond_resched();
42 clear_user_highpage(page + i, addr);
43 }
44}
45
46static void copy_huge_page(struct page *dst, struct page *src,
47 unsigned long addr)
48{
49 int i;
50
51 might_sleep();
52 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
53 cond_resched();
54 copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
55 }
56}
57
34static void enqueue_huge_page(struct page *page) 58static void enqueue_huge_page(struct page *page)
35{ 59{
36 int nid = page_to_nid(page); 60 int nid = page_to_nid(page);
@@ -64,57 +88,176 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
64 return page; 88 return page;
65} 89}
66 90
67static struct page *alloc_fresh_huge_page(void) 91static void free_huge_page(struct page *page)
92{
93 BUG_ON(page_count(page));
94
95 INIT_LIST_HEAD(&page->lru);
96
97 spin_lock(&hugetlb_lock);
98 enqueue_huge_page(page);
99 spin_unlock(&hugetlb_lock);
100}
101
102static int alloc_fresh_huge_page(void)
68{ 103{
69 static int nid = 0; 104 static int nid = 0;
70 struct page *page; 105 struct page *page;
71 page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, 106 page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
72 HUGETLB_PAGE_ORDER); 107 HUGETLB_PAGE_ORDER);
73 nid = (nid + 1) % num_online_nodes(); 108 nid = next_node(nid, node_online_map);
109 if (nid == MAX_NUMNODES)
110 nid = first_node(node_online_map);
74 if (page) { 111 if (page) {
112 page[1].lru.next = (void *)free_huge_page; /* dtor */
75 spin_lock(&hugetlb_lock); 113 spin_lock(&hugetlb_lock);
76 nr_huge_pages++; 114 nr_huge_pages++;
77 nr_huge_pages_node[page_to_nid(page)]++; 115 nr_huge_pages_node[page_to_nid(page)]++;
78 spin_unlock(&hugetlb_lock); 116 spin_unlock(&hugetlb_lock);
117 put_page(page); /* free it into the hugepage allocator */
118 return 1;
79 } 119 }
80 return page; 120 return 0;
81} 121}
82 122
83void free_huge_page(struct page *page) 123static struct page *alloc_huge_page(struct vm_area_struct *vma,
124 unsigned long addr)
84{ 125{
85 BUG_ON(page_count(page)); 126 struct inode *inode = vma->vm_file->f_dentry->d_inode;
127 struct page *page;
128 int use_reserve = 0;
129 unsigned long idx;
86 130
87 INIT_LIST_HEAD(&page->lru); 131 spin_lock(&hugetlb_lock);
88 page[1].lru.next = NULL; /* reset dtor */ 132
133 if (vma->vm_flags & VM_MAYSHARE) {
134
135 /* idx = radix tree index, i.e. offset into file in
136 * HPAGE_SIZE units */
137 idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
138 + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
139
140 /* The hugetlbfs specific inode info stores the number
141 * of "guaranteed available" (huge) pages. That is,
142 * the first 'prereserved_hpages' pages of the inode
143 * are either already instantiated, or have been
144 * pre-reserved (by hugetlb_reserve_for_inode()). Here
145 * we're in the process of instantiating the page, so
146 * we use this to determine whether to draw from the
147 * pre-reserved pool or the truly free pool. */
148 if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
149 use_reserve = 1;
150 }
151
152 if (!use_reserve) {
153 if (free_huge_pages <= reserved_huge_pages)
154 goto fail;
155 } else {
156 BUG_ON(reserved_huge_pages == 0);
157 reserved_huge_pages--;
158 }
159
160 page = dequeue_huge_page(vma, addr);
161 if (!page)
162 goto fail;
163
164 spin_unlock(&hugetlb_lock);
165 set_page_refcounted(page);
166 return page;
167
168 fail:
169 WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
170 spin_unlock(&hugetlb_lock);
171 return NULL;
172}
173
174/* hugetlb_extend_reservation()
175 *
176 * Ensure that at least 'atleast' hugepages are, and will remain,
177 * available to instantiate the first 'atleast' pages of the given
178 * inode. If the inode doesn't already have this many pages reserved
179 * or instantiated, set aside some hugepages in the reserved pool to
180 * satisfy later faults (or fail now if there aren't enough, rather
181 * than getting the SIGBUS later).
182 */
183int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
184 unsigned long atleast)
185{
186 struct inode *inode = &info->vfs_inode;
187 unsigned long change_in_reserve = 0;
188 int ret = 0;
89 189
90 spin_lock(&hugetlb_lock); 190 spin_lock(&hugetlb_lock);
91 enqueue_huge_page(page); 191 read_lock_irq(&inode->i_mapping->tree_lock);
192
193 if (info->prereserved_hpages >= atleast)
194 goto out;
195
196 /* Because we always call this on shared mappings, none of the
197 * pages beyond info->prereserved_hpages can have been
198 * instantiated, so we need to reserve all of them now. */
199 change_in_reserve = atleast - info->prereserved_hpages;
200
201 if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
202 ret = -ENOMEM;
203 goto out;
204 }
205
206 reserved_huge_pages += change_in_reserve;
207 info->prereserved_hpages = atleast;
208
209 out:
210 read_unlock_irq(&inode->i_mapping->tree_lock);
92 spin_unlock(&hugetlb_lock); 211 spin_unlock(&hugetlb_lock);
212
213 return ret;
93} 214}
94 215
95struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) 216/* hugetlb_truncate_reservation()
217 *
218 * This returns pages reserved for the given inode to the general free
219 * hugepage pool. If the inode has any pages prereserved, but not
 220 * instantiated, beyond offset (atmost << HPAGE_SHIFT), then release
221 * them.
222 */
223void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
224 unsigned long atmost)
96{ 225{
226 struct inode *inode = &info->vfs_inode;
227 struct address_space *mapping = inode->i_mapping;
228 unsigned long idx;
229 unsigned long change_in_reserve = 0;
97 struct page *page; 230 struct page *page;
98 int i;
99 231
100 spin_lock(&hugetlb_lock); 232 spin_lock(&hugetlb_lock);
101 page = dequeue_huge_page(vma, addr); 233 read_lock_irq(&inode->i_mapping->tree_lock);
102 if (!page) { 234
103 spin_unlock(&hugetlb_lock); 235 if (info->prereserved_hpages <= atmost)
104 return NULL; 236 goto out;
237
238 /* Count pages which were reserved, but not instantiated, and
239 * which we can now release. */
240 for (idx = atmost; idx < info->prereserved_hpages; idx++) {
241 page = radix_tree_lookup(&mapping->page_tree, idx);
242 if (!page)
243 /* Pages which are already instantiated can't
244 * be unreserved (and in fact have already
245 * been removed from the reserved pool) */
246 change_in_reserve++;
105 } 247 }
248
249 BUG_ON(reserved_huge_pages < change_in_reserve);
250 reserved_huge_pages -= change_in_reserve;
251 info->prereserved_hpages = atmost;
252
253 out:
254 read_unlock_irq(&inode->i_mapping->tree_lock);
106 spin_unlock(&hugetlb_lock); 255 spin_unlock(&hugetlb_lock);
107 set_page_count(page, 1);
108 page[1].lru.next = (void *)free_huge_page; /* set dtor */
109 for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
110 clear_user_highpage(&page[i], addr);
111 return page;
112} 256}
113 257
114static int __init hugetlb_init(void) 258static int __init hugetlb_init(void)
115{ 259{
116 unsigned long i; 260 unsigned long i;
117 struct page *page;
118 261
119 if (HPAGE_SHIFT == 0) 262 if (HPAGE_SHIFT == 0)
120 return 0; 263 return 0;
@@ -123,12 +266,8 @@ static int __init hugetlb_init(void)
123 INIT_LIST_HEAD(&hugepage_freelists[i]); 266 INIT_LIST_HEAD(&hugepage_freelists[i]);
124 267
125 for (i = 0; i < max_huge_pages; ++i) { 268 for (i = 0; i < max_huge_pages; ++i) {
126 page = alloc_fresh_huge_page(); 269 if (!alloc_fresh_huge_page())
127 if (!page)
128 break; 270 break;
129 spin_lock(&hugetlb_lock);
130 enqueue_huge_page(page);
131 spin_unlock(&hugetlb_lock);
132 } 271 }
133 max_huge_pages = free_huge_pages = nr_huge_pages = i; 272 max_huge_pages = free_huge_pages = nr_huge_pages = i;
134 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages); 273 printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
@@ -154,9 +293,9 @@ static void update_and_free_page(struct page *page)
154 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 293 page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
155 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 294 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
156 1 << PG_private | 1<< PG_writeback); 295 1 << PG_private | 1<< PG_writeback);
157 set_page_count(&page[i], 0);
158 } 296 }
159 set_page_count(page, 1); 297 page[1].lru.next = NULL;
298 set_page_refcounted(page);
160 __free_pages(page, HUGETLB_PAGE_ORDER); 299 __free_pages(page, HUGETLB_PAGE_ORDER);
161} 300}
162 301
@@ -188,12 +327,8 @@ static inline void try_to_free_low(unsigned long count)
188static unsigned long set_max_huge_pages(unsigned long count) 327static unsigned long set_max_huge_pages(unsigned long count)
189{ 328{
190 while (count > nr_huge_pages) { 329 while (count > nr_huge_pages) {
191 struct page *page = alloc_fresh_huge_page(); 330 if (!alloc_fresh_huge_page())
192 if (!page)
193 return nr_huge_pages; 331 return nr_huge_pages;
194 spin_lock(&hugetlb_lock);
195 enqueue_huge_page(page);
196 spin_unlock(&hugetlb_lock);
197 } 332 }
198 if (count >= nr_huge_pages) 333 if (count >= nr_huge_pages)
199 return nr_huge_pages; 334 return nr_huge_pages;
@@ -225,9 +360,11 @@ int hugetlb_report_meminfo(char *buf)
225 return sprintf(buf, 360 return sprintf(buf,
226 "HugePages_Total: %5lu\n" 361 "HugePages_Total: %5lu\n"
227 "HugePages_Free: %5lu\n" 362 "HugePages_Free: %5lu\n"
363 "HugePages_Rsvd: %5lu\n"
228 "Hugepagesize: %5lu kB\n", 364 "Hugepagesize: %5lu kB\n",
229 nr_huge_pages, 365 nr_huge_pages,
230 free_huge_pages, 366 free_huge_pages,
367 reserved_huge_pages,
231 HPAGE_SIZE/1024); 368 HPAGE_SIZE/1024);
232} 369}
233 370
@@ -240,11 +377,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
240 nid, free_huge_pages_node[nid]); 377 nid, free_huge_pages_node[nid]);
241} 378}
242 379
243int is_hugepage_mem_enough(size_t size)
244{
245 return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
246}
247
248/* Return the number pages of memory we physically have, in PAGE_SIZE units. */ 380/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
249unsigned long hugetlb_total_pages(void) 381unsigned long hugetlb_total_pages(void)
250{ 382{
@@ -374,7 +506,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
374 unsigned long address, pte_t *ptep, pte_t pte) 506 unsigned long address, pte_t *ptep, pte_t pte)
375{ 507{
376 struct page *old_page, *new_page; 508 struct page *old_page, *new_page;
377 int i, avoidcopy; 509 int avoidcopy;
378 510
379 old_page = pte_page(pte); 511 old_page = pte_page(pte);
380 512
@@ -395,9 +527,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
395 } 527 }
396 528
397 spin_unlock(&mm->page_table_lock); 529 spin_unlock(&mm->page_table_lock);
398 for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) 530 copy_huge_page(new_page, old_page, address);
399 copy_user_highpage(new_page + i, old_page + i,
400 address + i*PAGE_SIZE);
401 spin_lock(&mm->page_table_lock); 531 spin_lock(&mm->page_table_lock);
402 532
403 ptep = huge_pte_offset(mm, address & HPAGE_MASK); 533 ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -442,6 +572,7 @@ retry:
442 ret = VM_FAULT_OOM; 572 ret = VM_FAULT_OOM;
443 goto out; 573 goto out;
444 } 574 }
575 clear_huge_page(page, address);
445 576
446 if (vma->vm_flags & VM_SHARED) { 577 if (vma->vm_flags & VM_SHARED) {
447 int err; 578 int err;
@@ -496,14 +627,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
496 pte_t *ptep; 627 pte_t *ptep;
497 pte_t entry; 628 pte_t entry;
498 int ret; 629 int ret;
630 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
499 631
500 ptep = huge_pte_alloc(mm, address); 632 ptep = huge_pte_alloc(mm, address);
501 if (!ptep) 633 if (!ptep)
502 return VM_FAULT_OOM; 634 return VM_FAULT_OOM;
503 635
636 /*
637 * Serialize hugepage allocation and instantiation, so that we don't
638 * get spurious allocation failures if two CPUs race to instantiate
639 * the same page in the page cache.
640 */
641 mutex_lock(&hugetlb_instantiation_mutex);
504 entry = *ptep; 642 entry = *ptep;
505 if (pte_none(entry)) 643 if (pte_none(entry)) {
506 return hugetlb_no_page(mm, vma, address, ptep, write_access); 644 ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
645 mutex_unlock(&hugetlb_instantiation_mutex);
646 return ret;
647 }
507 648
508 ret = VM_FAULT_MINOR; 649 ret = VM_FAULT_MINOR;
509 650
@@ -513,6 +654,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
513 if (write_access && !pte_write(entry)) 654 if (write_access && !pte_write(entry))
514 ret = hugetlb_cow(mm, vma, address, ptep, entry); 655 ret = hugetlb_cow(mm, vma, address, ptep, entry);
515 spin_unlock(&mm->page_table_lock); 656 spin_unlock(&mm->page_table_lock);
657 mutex_unlock(&hugetlb_instantiation_mutex);
516 658
517 return ret; 659 return ret;
518} 660}
@@ -521,10 +663,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
521 struct page **pages, struct vm_area_struct **vmas, 663 struct page **pages, struct vm_area_struct **vmas,
522 unsigned long *position, int *length, int i) 664 unsigned long *position, int *length, int i)
523{ 665{
524 unsigned long vpfn, vaddr = *position; 666 unsigned long pfn_offset;
667 unsigned long vaddr = *position;
525 int remainder = *length; 668 int remainder = *length;
526 669
527 vpfn = vaddr/PAGE_SIZE;
528 spin_lock(&mm->page_table_lock); 670 spin_lock(&mm->page_table_lock);
529 while (vaddr < vma->vm_end && remainder) { 671 while (vaddr < vma->vm_end && remainder) {
530 pte_t *pte; 672 pte_t *pte;
@@ -552,19 +694,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
552 break; 694 break;
553 } 695 }
554 696
555 if (pages) { 697 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
556 page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)]; 698 page = pte_page(*pte);
557 get_page(page); 699same_page:
558 pages[i] = page; 700 get_page(page);
559 } 701 if (pages)
702 pages[i] = page + pfn_offset;
560 703
561 if (vmas) 704 if (vmas)
562 vmas[i] = vma; 705 vmas[i] = vma;
563 706
564 vaddr += PAGE_SIZE; 707 vaddr += PAGE_SIZE;
565 ++vpfn; 708 ++pfn_offset;
566 --remainder; 709 --remainder;
567 ++i; 710 ++i;
711 if (vaddr < vma->vm_end && remainder &&
712 pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
713 /*
714 * We use pfn_offset to avoid touching the pageframes
715 * of this compound page.
716 */
717 goto same_page;
718 }
568 } 719 }
569 spin_unlock(&mm->page_table_lock); 720 spin_unlock(&mm->page_table_lock);
570 *length = remainder; 721 *length = remainder;
@@ -572,3 +723,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
572 723
573 return i; 724 return i;
574} 725}
726
727void hugetlb_change_protection(struct vm_area_struct *vma,
728 unsigned long address, unsigned long end, pgprot_t newprot)
729{
730 struct mm_struct *mm = vma->vm_mm;
731 unsigned long start = address;
732 pte_t *ptep;
733 pte_t pte;
734
735 BUG_ON(address >= end);
736 flush_cache_range(vma, address, end);
737
738 spin_lock(&mm->page_table_lock);
739 for (; address < end; address += HPAGE_SIZE) {
740 ptep = huge_pte_offset(mm, address);
741 if (!ptep)
742 continue;
743 if (!pte_none(*ptep)) {
744 pte = huge_ptep_get_and_clear(mm, address, ptep);
745 pte = pte_mkhuge(pte_modify(pte, newprot));
746 set_huge_pte_at(mm, address, ptep, pte);
747 lazy_mmu_prot_update(pte);
748 }
749 }
750 spin_unlock(&mm->page_table_lock);
751
752 flush_tlb_range(vma, start, end);
753}
754
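
Taken together, the hugetlb.c hunks above split the free hugepage pool into a truly free part and a reserved part that only faults on prereserved file offsets may consume. The following is a toy user-space model of that bookkeeping, not kernel code; the names and the pool size are invented:

    #include <stdio.h>

    static unsigned long free_hpages = 8;
    static unsigned long reserved_hpages;

    static int extend_reservation(unsigned long want)   /* cf. hugetlb_extend_reservation() */
    {
            if (reserved_hpages + want > free_hpages)
                    return -1;                           /* -ENOMEM in the kernel */
            reserved_hpages += want;
            return 0;
    }

    static int alloc_hpage(int use_reserve)              /* cf. alloc_huge_page() */
    {
            if (use_reserve) {
                    reserved_hpages--;                    /* fault on a prereserved index */
            } else if (free_hpages <= reserved_hpages) {
                    return -1;                            /* would eat someone's reservation */
            }
            free_hpages--;
            return 0;
    }

    int main(void)
    {
            extend_reservation(6);        /* e.g. mmap() of a six-page hugetlbfs file */
            printf("free=%lu reserved=%lu\n", free_hpages, reserved_hpages);
            alloc_hpage(1);               /* first fault draws from the reserve */
            alloc_hpage(0);               /* unreserved fault: 7 free > 5 reserved, allowed */
            printf("free=%lu reserved=%lu\n", free_hpages, reserved_hpages);
            return 0;
    }
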
diff --git a/mm/internal.h b/mm/internal.h
index 17256bb2f4..d20e3cc4ae 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -8,23 +8,33 @@
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 */ 10 */
11#ifndef __MM_INTERNAL_H
12#define __MM_INTERNAL_H
11 13
12static inline void set_page_refs(struct page *page, int order) 14#include <linux/mm.h>
15
16static inline void set_page_count(struct page *page, int v)
17{
18 atomic_set(&page->_count, v);
19}
20
21/*
22 * Turn a non-refcounted page (->_count == 0) into refcounted with
23 * a count of one.
24 */
25static inline void set_page_refcounted(struct page *page)
13{ 26{
14#ifdef CONFIG_MMU 27 BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
28 BUG_ON(atomic_read(&page->_count));
15 set_page_count(page, 1); 29 set_page_count(page, 1);
16#else 30}
17 int i;
18 31
19 /* 32static inline void __put_page(struct page *page)
20 * We need to reference all the pages for this order, otherwise if 33{
21 * anyone accesses one of the pages with (get/put) it will be freed. 34 atomic_dec(&page->_count);
22 * - eg: access_process_vm()
23 */
24 for (i = 0; i < (1 << order); i++)
25 set_page_count(page + i, 1);
26#endif /* CONFIG_MMU */
27} 35}
28 36
29extern void fastcall __init __free_pages_bootmem(struct page *page, 37extern void fastcall __init __free_pages_bootmem(struct page *page,
30 unsigned int order); 38 unsigned int order);
39
40#endif
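
The two helpers formalize a convention relied on throughout this patch: a page sitting in an allocator or on the hugepage free lists keeps _count at zero, and a single set_page_refcounted() hands out the first reference. A toy user-space model of that invariant (all names invented, C11 atomics standing in for the kernel's atomic_t):

    #include <assert.h>
    #include <stdatomic.h>
    #include <stdio.h>

    struct toy_page { atomic_int count; };

    /* cf. set_page_refcounted(): only legal on a page nobody references */
    static void toy_set_page_refcounted(struct toy_page *p)
    {
            assert(atomic_load(&p->count) == 0);
            atomic_store(&p->count, 1);
    }

    /* cf. __put_page(): drop a reference without freeing */
    static void toy_put_page(struct toy_page *p)
    {
            atomic_fetch_sub(&p->count, 1);
    }

    int main(void)
    {
            struct toy_page page = { 0 };

            toy_set_page_refcounted(&page);   /* allocator hands the page out */
            toy_put_page(&page);              /* last user drops it: back to 0 */
            printf("count=%d\n", atomic_load(&page.count));
            return 0;
    }
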
diff --git a/mm/memory.c b/mm/memory.c
index 85e80a57db..80c3fb370f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
277 anon_vma_unlink(vma); 277 anon_vma_unlink(vma);
278 unlink_file_vma(vma); 278 unlink_file_vma(vma);
279 279
280 if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { 280 if (is_vm_hugetlb_page(vma)) {
281 hugetlb_free_pgd_range(tlb, addr, vma->vm_end, 281 hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
282 floor, next? next->vm_start: ceiling); 282 floor, next? next->vm_start: ceiling);
283 } else { 283 } else {
@@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
285 * Optimization: gather nearby vmas into one call down 285 * Optimization: gather nearby vmas into one call down
286 */ 286 */
287 while (next && next->vm_start <= vma->vm_end + PMD_SIZE 287 while (next && next->vm_start <= vma->vm_end + PMD_SIZE
288 && !is_hugepage_only_range(vma->vm_mm, next->vm_start, 288 && !is_vm_hugetlb_page(next)) {
289 HPAGE_SIZE)) {
290 vma = next; 289 vma = next;
291 next = vma->vm_next; 290 next = vma->vm_next;
292 anon_vma_unlink(vma); 291 anon_vma_unlink(vma);
@@ -388,7 +387,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
388{ 387{
389 unsigned long pfn = pte_pfn(pte); 388 unsigned long pfn = pte_pfn(pte);
390 389
391 if (vma->vm_flags & VM_PFNMAP) { 390 if (unlikely(vma->vm_flags & VM_PFNMAP)) {
392 unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; 391 unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
393 if (pfn == vma->vm_pgoff + off) 392 if (pfn == vma->vm_pgoff + off)
394 return NULL; 393 return NULL;
@@ -396,18 +395,12 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
396 return NULL; 395 return NULL;
397 } 396 }
398 397
399 /* 398#ifdef CONFIG_DEBUG_VM
400 * Add some anal sanity checks for now. Eventually,
401 * we should just do "return pfn_to_page(pfn)", but
402 * in the meantime we check that we get a valid pfn,
403 * and that the resulting page looks ok.
404 *
405 * Remove this test eventually!
406 */
407 if (unlikely(!pfn_valid(pfn))) { 399 if (unlikely(!pfn_valid(pfn))) {
408 print_bad_pte(vma, pte, addr); 400 print_bad_pte(vma, pte, addr);
409 return NULL; 401 return NULL;
410 } 402 }
403#endif
411 404
412 /* 405 /*
413 * NOTE! We still have PageReserved() pages in the page 406 * NOTE! We still have PageReserved() pages in the page
@@ -1221,9 +1214,7 @@ out:
1221 * The page has to be a nice clean _individual_ kernel allocation. 1214 * The page has to be a nice clean _individual_ kernel allocation.
1222 * If you allocate a compound page, you need to have marked it as 1215 * If you allocate a compound page, you need to have marked it as
1223 * such (__GFP_COMP), or manually just split the page up yourself 1216 * such (__GFP_COMP), or manually just split the page up yourself
1224 * (which is mainly an issue of doing "set_page_count(page, 1)" for 1217 * (see split_page()).
1225 * each sub-page, and then freeing them one by one when you free
1226 * them rather than freeing it as a compound page).
1227 * 1218 *
1228 * NOTE! Traditionally this was done with "remap_pfn_range()" which 1219 * NOTE! Traditionally this was done with "remap_pfn_range()" which
1229 * took an arbitrary page protection parameter. This doesn't allow 1220 * took an arbitrary page protection parameter. This doesn't allow
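
The shortened comment above refers to split_page(), which this same patch introduces in mm/page_alloc.c further down. A hypothetical in-kernel user, sketched here as a trivial module and assuming the split_page() prototype is visible via <linux/mm.h> (whether the symbol is exported to modules is not shown by these hunks):

    #include <linux/module.h>
    #include <linux/init.h>
    #include <linux/mm.h>
    #include <linux/gfp.h>

    static int __init split_demo_init(void)
    {
            struct page *block = alloc_pages(GFP_KERNEL, 2); /* four contiguous pages */
            int i;

            if (!block)
                    return -ENOMEM;

            split_page(block, 2);           /* each sub-page now carries its own count of 1 */

            for (i = 0; i < 4; i++)
                    __free_page(block + i); /* legal only because of the split */
            return 0;
    }

    static void __exit split_demo_exit(void)
    {
    }

    module_init(split_demo_init);
    module_exit(split_demo_exit);
    MODULE_LICENSE("GPL");
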
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b21869a39f..e93cc740c2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -86,6 +86,7 @@
86#include <linux/swap.h> 86#include <linux/swap.h>
87#include <linux/seq_file.h> 87#include <linux/seq_file.h>
88#include <linux/proc_fs.h> 88#include <linux/proc_fs.h>
89#include <linux/migrate.h>
89 90
90#include <asm/tlbflush.h> 91#include <asm/tlbflush.h>
91#include <asm/uaccess.h> 92#include <asm/uaccess.h>
@@ -95,11 +96,8 @@
95#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 96#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
96#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ 97#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
97 98
98/* The number of pages to migrate per call to migrate_pages() */ 99static struct kmem_cache *policy_cache;
99#define MIGRATE_CHUNK_SIZE 256 100static struct kmem_cache *sn_cache;
100
101static kmem_cache_t *policy_cache;
102static kmem_cache_t *sn_cache;
103 101
104#define PDprintk(fmt...) 102#define PDprintk(fmt...)
105 103
@@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
331 struct vm_area_struct *first, *vma, *prev; 329 struct vm_area_struct *first, *vma, *prev;
332 330
333 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { 331 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
334 /* Must have swap device for migration */
335 if (nr_swap_pages <= 0)
336 return ERR_PTR(-ENODEV);
337 332
338 /* 333 err = migrate_prep();
339 * Clear the LRU lists so pages can be isolated. 334 if (err)
340 * Note that pages may be moved off the LRU after we have 335 return ERR_PTR(err);
341 * drained them. Those pages will fail to migrate like other
342 * pages that may be busy.
343 */
344 lru_add_drain_all();
345 } 336 }
346 337
347 first = find_vma(mm, start); 338 first = find_vma(mm, start);
@@ -550,92 +541,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
550 return err; 541 return err;
551} 542}
552 543
544#ifdef CONFIG_MIGRATION
553/* 545/*
554 * page migration 546 * page migration
555 */ 547 */
556
557static void migrate_page_add(struct page *page, struct list_head *pagelist, 548static void migrate_page_add(struct page *page, struct list_head *pagelist,
558 unsigned long flags) 549 unsigned long flags)
559{ 550{
560 /* 551 /*
561 * Avoid migrating a page that is shared with others. 552 * Avoid migrating a page that is shared with others.
562 */ 553 */
563 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { 554 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
564 if (isolate_lru_page(page)) 555 isolate_lru_page(page, pagelist);
565 list_add_tail(&page->lru, pagelist);
566 }
567}
568
569/*
570 * Migrate the list 'pagelist' of pages to a certain destination.
571 *
572 * Specify destination with either non-NULL vma or dest_node >= 0
573 * Return the number of pages not migrated or error code
574 */
575static int migrate_pages_to(struct list_head *pagelist,
576 struct vm_area_struct *vma, int dest)
577{
578 LIST_HEAD(newlist);
579 LIST_HEAD(moved);
580 LIST_HEAD(failed);
581 int err = 0;
582 unsigned long offset = 0;
583 int nr_pages;
584 struct page *page;
585 struct list_head *p;
586
587redo:
588 nr_pages = 0;
589 list_for_each(p, pagelist) {
590 if (vma) {
591 /*
592 * The address passed to alloc_page_vma is used to
593 * generate the proper interleave behavior. We fake
594 * the address here by an increasing offset in order
595 * to get the proper distribution of pages.
596 *
597 * No decision has been made as to which page
598 * a certain old page is moved to so we cannot
599 * specify the correct address.
600 */
601 page = alloc_page_vma(GFP_HIGHUSER, vma,
602 offset + vma->vm_start);
603 offset += PAGE_SIZE;
604 }
605 else
606 page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
607
608 if (!page) {
609 err = -ENOMEM;
610 goto out;
611 }
612 list_add_tail(&page->lru, &newlist);
613 nr_pages++;
614 if (nr_pages > MIGRATE_CHUNK_SIZE)
615 break;
616 }
617 err = migrate_pages(pagelist, &newlist, &moved, &failed);
618
619 putback_lru_pages(&moved); /* Call release pages instead ?? */
620
621 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
622 goto redo;
623out:
624 /* Return leftover allocated pages */
625 while (!list_empty(&newlist)) {
626 page = list_entry(newlist.next, struct page, lru);
627 list_del(&page->lru);
628 __free_page(page);
629 }
630 list_splice(&failed, pagelist);
631 if (err < 0)
632 return err;
633
634 /* Calculate number of leftover pages */
635 nr_pages = 0;
636 list_for_each(p, pagelist)
637 nr_pages++;
638 return nr_pages;
639} 556}
640 557
641/* 558/*
@@ -742,8 +659,23 @@ int do_migrate_pages(struct mm_struct *mm,
742 if (err < 0) 659 if (err < 0)
743 return err; 660 return err;
744 return busy; 661 return busy;
662
745} 663}
746 664
665#else
666
667static void migrate_page_add(struct page *page, struct list_head *pagelist,
668 unsigned long flags)
669{
670}
671
672int do_migrate_pages(struct mm_struct *mm,
673 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
674{
675 return -ENOSYS;
676}
677#endif
678
747long do_mbind(unsigned long start, unsigned long len, 679long do_mbind(unsigned long start, unsigned long len,
748 unsigned long mode, nodemask_t *nmask, unsigned long flags) 680 unsigned long mode, nodemask_t *nmask, unsigned long flags)
749{ 681{
@@ -808,6 +740,7 @@ long do_mbind(unsigned long start, unsigned long len,
808 if (!err && nr_failed && (flags & MPOL_MF_STRICT)) 740 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
809 err = -EIO; 741 err = -EIO;
810 } 742 }
743
811 if (!list_empty(&pagelist)) 744 if (!list_empty(&pagelist))
812 putback_lru_pages(&pagelist); 745 putback_lru_pages(&pagelist);
813 746
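
do_migrate_pages() above is the kernel backend of the migrate_pages(2) system call; with CONFIG_MIGRATION disabled the new stub simply reports -ENOSYS. A hedged user-space sketch of the call, assuming libnuma's <numaif.h> wrapper and made-up node numbers:

    #include <numaif.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
            unsigned long from = 1UL << 0;   /* move pages away from node 0 ... */
            unsigned long to   = 1UL << 1;   /* ... and onto node 1 */
            int pid = (argc > 1) ? atoi(argv[1]) : 0;  /* 0 means the calling process */
            long left = migrate_pages(pid, 8 * sizeof(unsigned long), &from, &to);

            if (left < 0)
                    perror("migrate_pages");
            else
                    printf("%ld pages could not be moved\n", left);
            return 0;
    }
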
diff --git a/mm/mempool.c b/mm/mempool.c
index 1a99b80480..f71893ed35 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -278,14 +278,14 @@ EXPORT_SYMBOL(mempool_free);
278 */ 278 */
279void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) 279void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
280{ 280{
281 kmem_cache_t *mem = (kmem_cache_t *) pool_data; 281 struct kmem_cache *mem = pool_data;
282 return kmem_cache_alloc(mem, gfp_mask); 282 return kmem_cache_alloc(mem, gfp_mask);
283} 283}
284EXPORT_SYMBOL(mempool_alloc_slab); 284EXPORT_SYMBOL(mempool_alloc_slab);
285 285
286void mempool_free_slab(void *element, void *pool_data) 286void mempool_free_slab(void *element, void *pool_data)
287{ 287{
288 kmem_cache_t *mem = (kmem_cache_t *) pool_data; 288 struct kmem_cache *mem = pool_data;
289 kmem_cache_free(mem, element); 289 kmem_cache_free(mem, element);
290} 290}
291EXPORT_SYMBOL(mempool_free_slab); 291EXPORT_SYMBOL(mempool_free_slab);
diff --git a/mm/migrate.c b/mm/migrate.c
new file mode 100644
index 0000000000..09f6e4aa87
--- /dev/null
+++ b/mm/migrate.c
@@ -0,0 +1,655 @@
1/*
2 * Memory Migration functionality - linux/mm/migration.c
3 *
4 * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
5 *
6 * Page migration was first developed in the context of the memory hotplug
7 * project. The main authors of the migration code are:
8 *
9 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
10 * Hirokazu Takahashi <taka@valinux.co.jp>
11 * Dave Hansen <haveblue@us.ibm.com>
12 * Christoph Lameter <clameter@sgi.com>
13 */
14
15#include <linux/migrate.h>
16#include <linux/module.h>
17#include <linux/swap.h>
18#include <linux/pagemap.h>
19#include <linux/buffer_head.h> /* for try_to_release_page(),
20 buffer_heads_over_limit */
21#include <linux/mm_inline.h>
22#include <linux/pagevec.h>
23#include <linux/rmap.h>
24#include <linux/topology.h>
25#include <linux/cpu.h>
26#include <linux/cpuset.h>
27#include <linux/swapops.h>
28
29#include "internal.h"
30
31#include "internal.h"
32
33/* The maximum number of pages to take off the LRU for migration */
34#define MIGRATE_CHUNK_SIZE 256
35
36#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
37
38/*
39 * Isolate one page from the LRU lists. If successful put it onto
40 * the indicated list with elevated page count.
41 *
42 * Result:
43 * -EBUSY: page not on LRU list
44 * 0: page removed from LRU list and added to the specified list.
45 */
46int isolate_lru_page(struct page *page, struct list_head *pagelist)
47{
48 int ret = -EBUSY;
49
50 if (PageLRU(page)) {
51 struct zone *zone = page_zone(page);
52
53 spin_lock_irq(&zone->lru_lock);
54 if (PageLRU(page)) {
55 ret = 0;
56 get_page(page);
57 ClearPageLRU(page);
58 if (PageActive(page))
59 del_page_from_active_list(zone, page);
60 else
61 del_page_from_inactive_list(zone, page);
62 list_add_tail(&page->lru, pagelist);
63 }
64 spin_unlock_irq(&zone->lru_lock);
65 }
66 return ret;
67}
68
69/*
70 * migrate_prep() needs to be called after we have compiled the list of pages
71 * to be migrated using isolate_lru_page() but before we begin a series of calls
72 * to migrate_pages().
73 */
74int migrate_prep(void)
75{
76 /* Must have swap device for migration */
77 if (nr_swap_pages <= 0)
78 return -ENODEV;
79
80 /*
81 * Clear the LRU lists so pages can be isolated.
82 * Note that pages may be moved off the LRU after we have
83 * drained them. Those pages will fail to migrate like other
84 * pages that may be busy.
85 */
86 lru_add_drain_all();
87
88 return 0;
89}
90
91static inline void move_to_lru(struct page *page)
92{
93 list_del(&page->lru);
94 if (PageActive(page)) {
95 /*
96 * lru_cache_add_active checks that
97 * the PG_active bit is off.
98 */
99 ClearPageActive(page);
100 lru_cache_add_active(page);
101 } else {
102 lru_cache_add(page);
103 }
104 put_page(page);
105}
106
107/*
108 * Add isolated pages on the list back to the LRU.
109 *
110 * returns the number of pages put back.
111 */
112int putback_lru_pages(struct list_head *l)
113{
114 struct page *page;
115 struct page *page2;
116 int count = 0;
117
118 list_for_each_entry_safe(page, page2, l, lru) {
119 move_to_lru(page);
120 count++;
121 }
122 return count;
123}
124
125/*
126 * Non migratable page
127 */
128int fail_migrate_page(struct page *newpage, struct page *page)
129{
130 return -EIO;
131}
132EXPORT_SYMBOL(fail_migrate_page);
133
134/*
135 * swapout a single page
136 * page is locked upon entry, unlocked on exit
137 */
138static int swap_page(struct page *page)
139{
140 struct address_space *mapping = page_mapping(page);
141
142 if (page_mapped(page) && mapping)
143 if (try_to_unmap(page, 1) != SWAP_SUCCESS)
144 goto unlock_retry;
145
146 if (PageDirty(page)) {
147 /* Page is dirty, try to write it out here */
148 switch(pageout(page, mapping)) {
149 case PAGE_KEEP:
150 case PAGE_ACTIVATE:
151 goto unlock_retry;
152
153 case PAGE_SUCCESS:
154 goto retry;
155
156 case PAGE_CLEAN:
157 ; /* try to free the page below */
158 }
159 }
160
161 if (PagePrivate(page)) {
162 if (!try_to_release_page(page, GFP_KERNEL) ||
163 (!mapping && page_count(page) == 1))
164 goto unlock_retry;
165 }
166
167 if (remove_mapping(mapping, page)) {
168 /* Success */
169 unlock_page(page);
170 return 0;
171 }
172
173unlock_retry:
174 unlock_page(page);
175
176retry:
177 return -EAGAIN;
178}
179EXPORT_SYMBOL(swap_page);
180
181/*
182 * Remove references for a page and establish the new page with the correct
183 * basic settings to be able to stop accesses to the page.
184 */
185int migrate_page_remove_references(struct page *newpage,
186 struct page *page, int nr_refs)
187{
188 struct address_space *mapping = page_mapping(page);
189 struct page **radix_pointer;
190
191 /*
192 * Avoid doing any of the following work if the page count
193 * indicates that the page is in use or truncate has removed
194 * the page.
195 */
196 if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
197 return -EAGAIN;
198
199 /*
200 * Establish swap ptes for anonymous pages or destroy pte
201 * maps for files.
202 *
203 * In order to reestablish file backed mappings the fault handlers
204 * will take the radix tree_lock which may then be used to stop
 205 * processes from accessing this page until the new page is ready.
206 *
207 * A process accessing via a swap pte (an anonymous page) will take a
208 * page_lock on the old page which will block the process until the
209 * migration attempt is complete. At that time the PageSwapCache bit
210 * will be examined. If the page was migrated then the PageSwapCache
211 * bit will be clear and the operation to retrieve the page will be
212 * retried which will find the new page in the radix tree. Then a new
213 * direct mapping may be generated based on the radix tree contents.
214 *
215 * If the page was not migrated then the PageSwapCache bit
216 * is still set and the operation may continue.
217 */
218 if (try_to_unmap(page, 1) == SWAP_FAIL)
219 /* A vma has VM_LOCKED set -> permanent failure */
220 return -EPERM;
221
222 /*
223 * Give up if we were unable to remove all mappings.
224 */
225 if (page_mapcount(page))
226 return -EAGAIN;
227
228 write_lock_irq(&mapping->tree_lock);
229
230 radix_pointer = (struct page **)radix_tree_lookup_slot(
231 &mapping->page_tree,
232 page_index(page));
233
234 if (!page_mapping(page) || page_count(page) != nr_refs ||
235 *radix_pointer != page) {
236 write_unlock_irq(&mapping->tree_lock);
237 return 1;
238 }
239
240 /*
241 * Now we know that no one else is looking at the page.
242 *
243 * Certain minimal information about a page must be available
244 * in order for other subsystems to properly handle the page if they
245 * find it through the radix tree update before we are finished
246 * copying the page.
247 */
248 get_page(newpage);
249 newpage->index = page->index;
250 newpage->mapping = page->mapping;
251 if (PageSwapCache(page)) {
252 SetPageSwapCache(newpage);
253 set_page_private(newpage, page_private(page));
254 }
255
256 *radix_pointer = newpage;
257 __put_page(page);
258 write_unlock_irq(&mapping->tree_lock);
259
260 return 0;
261}
262EXPORT_SYMBOL(migrate_page_remove_references);
263
264/*
265 * Copy the page to its new location
266 */
267void migrate_page_copy(struct page *newpage, struct page *page)
268{
269 copy_highpage(newpage, page);
270
271 if (PageError(page))
272 SetPageError(newpage);
273 if (PageReferenced(page))
274 SetPageReferenced(newpage);
275 if (PageUptodate(page))
276 SetPageUptodate(newpage);
277 if (PageActive(page))
278 SetPageActive(newpage);
279 if (PageChecked(page))
280 SetPageChecked(newpage);
281 if (PageMappedToDisk(page))
282 SetPageMappedToDisk(newpage);
283
284 if (PageDirty(page)) {
285 clear_page_dirty_for_io(page);
286 set_page_dirty(newpage);
287 }
288
289 ClearPageSwapCache(page);
290 ClearPageActive(page);
291 ClearPagePrivate(page);
292 set_page_private(page, 0);
293 page->mapping = NULL;
294
295 /*
296 * If any waiters have accumulated on the new page then
297 * wake them up.
298 */
299 if (PageWriteback(newpage))
300 end_page_writeback(newpage);
301}
302EXPORT_SYMBOL(migrate_page_copy);
303
304/*
305 * Common logic to directly migrate a single page suitable for
306 * pages that do not use PagePrivate.
307 *
308 * Pages are locked upon entry and exit.
309 */
310int migrate_page(struct page *newpage, struct page *page)
311{
312 int rc;
313
314 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
315
316 rc = migrate_page_remove_references(newpage, page, 2);
317
318 if (rc)
319 return rc;
320
321 migrate_page_copy(newpage, page);
322
323 /*
324 * Remove auxiliary swap entries and replace
325 * them with real ptes.
326 *
327 * Note that a real pte entry will allow processes that are not
328 * waiting on the page lock to use the new page via the page tables
329 * before the new page is unlocked.
330 */
331 remove_from_swap(newpage);
332 return 0;
333}
334EXPORT_SYMBOL(migrate_page);
335
336/*
337 * migrate_pages
338 *
339 * Two lists are passed to this function. The first list
340 * contains the pages isolated from the LRU to be migrated.
341 * The second list contains new pages that the pages isolated
342 * can be moved to. If the second list is NULL then all
343 * pages are swapped out.
344 *
 345 * The function returns after 10 attempts, or earlier if no pages
 346 * are movable any more because the 'to' list has become empty
 347 * or no retryable pages remain.
348 *
349 * Return: Number of pages not migrated when "to" ran empty.
350 */
351int migrate_pages(struct list_head *from, struct list_head *to,
352 struct list_head *moved, struct list_head *failed)
353{
354 int retry;
355 int nr_failed = 0;
356 int pass = 0;
357 struct page *page;
358 struct page *page2;
359 int swapwrite = current->flags & PF_SWAPWRITE;
360 int rc;
361
362 if (!swapwrite)
363 current->flags |= PF_SWAPWRITE;
364
365redo:
366 retry = 0;
367
368 list_for_each_entry_safe(page, page2, from, lru) {
369 struct page *newpage = NULL;
370 struct address_space *mapping;
371
372 cond_resched();
373
374 rc = 0;
375 if (page_count(page) == 1)
376 /* page was freed from under us. So we are done. */
377 goto next;
378
379 if (to && list_empty(to))
380 break;
381
382 /*
383 * Skip locked pages during the first two passes to give the
384 * functions holding the lock time to release the page. Later we
385 * use lock_page() to have a higher chance of acquiring the
386 * lock.
387 */
388 rc = -EAGAIN;
389 if (pass > 2)
390 lock_page(page);
391 else
392 if (TestSetPageLocked(page))
393 goto next;
394
395 /*
396 * Only wait on writeback if we have already done a pass where
 397 * we may have triggered writeouts for lots of pages.
398 */
399 if (pass > 0) {
400 wait_on_page_writeback(page);
401 } else {
402 if (PageWriteback(page))
403 goto unlock_page;
404 }
405
406 /*
407 * Anonymous pages must have swap cache references otherwise
408 * the information contained in the page maps cannot be
409 * preserved.
410 */
411 if (PageAnon(page) && !PageSwapCache(page)) {
412 if (!add_to_swap(page, GFP_KERNEL)) {
413 rc = -ENOMEM;
414 goto unlock_page;
415 }
416 }
417
418 if (!to) {
419 rc = swap_page(page);
420 goto next;
421 }
422
423 newpage = lru_to_page(to);
424 lock_page(newpage);
425
426 /*
427 * Pages are properly locked and writeback is complete.
428 * Try to migrate the page.
429 */
430 mapping = page_mapping(page);
431 if (!mapping)
432 goto unlock_both;
433
434 if (mapping->a_ops->migratepage) {
435 /*
436 * Most pages have a mapping and most filesystems
437 * should provide a migration function. Anonymous
438 * pages are part of swap space which also has its
439 * own migration function. This is the most common
440 * path for page migration.
441 */
442 rc = mapping->a_ops->migratepage(newpage, page);
443 goto unlock_both;
444 }
445
446 /*
447 * Default handling if a filesystem does not provide
448 * a migration function. We can only migrate clean
449 * pages so try to write out any dirty pages first.
450 */
451 if (PageDirty(page)) {
452 switch (pageout(page, mapping)) {
453 case PAGE_KEEP:
454 case PAGE_ACTIVATE:
455 goto unlock_both;
456
457 case PAGE_SUCCESS:
458 unlock_page(newpage);
459 goto next;
460
461 case PAGE_CLEAN:
462 ; /* try to migrate the page below */
463 }
464 }
465
466 /*
467 * Buffers are managed in a filesystem specific way.
468 * We must have no buffers or drop them.
469 */
470 if (!page_has_buffers(page) ||
471 try_to_release_page(page, GFP_KERNEL)) {
472 rc = migrate_page(newpage, page);
473 goto unlock_both;
474 }
475
476 /*
477 * On early passes with mapped pages simply
478 * retry. There may be a lock held for some
479 * buffers that may go away. Later
480 * swap them out.
481 */
482 if (pass > 4) {
483 /*
484 * Persistently unable to drop buffers..... As a
485 * measure of last resort we fall back to
486 * swap_page().
487 */
488 unlock_page(newpage);
489 newpage = NULL;
490 rc = swap_page(page);
491 goto next;
492 }
493
494unlock_both:
495 unlock_page(newpage);
496
497unlock_page:
498 unlock_page(page);
499
500next:
501 if (rc == -EAGAIN) {
502 retry++;
503 } else if (rc) {
504 /* Permanent failure */
505 list_move(&page->lru, failed);
506 nr_failed++;
507 } else {
508 if (newpage) {
509 /* Successful migration. Return page to LRU */
510 move_to_lru(newpage);
511 }
512 list_move(&page->lru, moved);
513 }
514 }
515 if (retry && pass++ < 10)
516 goto redo;
517
518 if (!swapwrite)
519 current->flags &= ~PF_SWAPWRITE;
520
521 return nr_failed + retry;
522}
523
524/*
525 * Migration function for pages with buffers. This function can only be used
526 * if the underlying filesystem guarantees that no other references to "page"
527 * exist.
528 */
529int buffer_migrate_page(struct page *newpage, struct page *page)
530{
531 struct address_space *mapping = page->mapping;
532 struct buffer_head *bh, *head;
533 int rc;
534
535 if (!mapping)
536 return -EAGAIN;
537
538 if (!page_has_buffers(page))
539 return migrate_page(newpage, page);
540
541 head = page_buffers(page);
542
543 rc = migrate_page_remove_references(newpage, page, 3);
544
545 if (rc)
546 return rc;
547
548 bh = head;
549 do {
550 get_bh(bh);
551 lock_buffer(bh);
552 bh = bh->b_this_page;
553
554 } while (bh != head);
555
556 ClearPagePrivate(page);
557 set_page_private(newpage, page_private(page));
558 set_page_private(page, 0);
559 put_page(page);
560 get_page(newpage);
561
562 bh = head;
563 do {
564 set_bh_page(bh, newpage, bh_offset(bh));
565 bh = bh->b_this_page;
566
567 } while (bh != head);
568
569 SetPagePrivate(newpage);
570
571 migrate_page_copy(newpage, page);
572
573 bh = head;
574 do {
575 unlock_buffer(bh);
576 put_bh(bh);
577 bh = bh->b_this_page;
578
579 } while (bh != head);
580
581 return 0;
582}
583EXPORT_SYMBOL(buffer_migrate_page);
584
585/*
586 * Migrate the list 'pagelist' of pages to a certain destination.
587 *
588 * Specify destination with either non-NULL vma or dest_node >= 0
589 * Return the number of pages not migrated or error code
590 */
591int migrate_pages_to(struct list_head *pagelist,
592 struct vm_area_struct *vma, int dest)
593{
594 LIST_HEAD(newlist);
595 LIST_HEAD(moved);
596 LIST_HEAD(failed);
597 int err = 0;
598 unsigned long offset = 0;
599 int nr_pages;
600 struct page *page;
601 struct list_head *p;
602
603redo:
604 nr_pages = 0;
605 list_for_each(p, pagelist) {
606 if (vma) {
607 /*
608 * The address passed to alloc_page_vma is used to
609 * generate the proper interleave behavior. We fake
610 * the address here by an increasing offset in order
611 * to get the proper distribution of pages.
612 *
613 * No decision has been made as to which page
614 * a certain old page is moved to so we cannot
615 * specify the correct address.
616 */
617 page = alloc_page_vma(GFP_HIGHUSER, vma,
618 offset + vma->vm_start);
619 offset += PAGE_SIZE;
620 }
621 else
622 page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
623
624 if (!page) {
625 err = -ENOMEM;
626 goto out;
627 }
628 list_add_tail(&page->lru, &newlist);
629 nr_pages++;
630 if (nr_pages > MIGRATE_CHUNK_SIZE)
631 break;
632 }
633 err = migrate_pages(pagelist, &newlist, &moved, &failed);
634
635 putback_lru_pages(&moved); /* Call release pages instead ?? */
636
637 if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
638 goto redo;
639out:
640 /* Return leftover allocated pages */
641 while (!list_empty(&newlist)) {
642 page = list_entry(newlist.next, struct page, lru);
643 list_del(&page->lru);
644 __free_page(page);
645 }
646 list_splice(&failed, pagelist);
647 if (err < 0)
648 return err;
649
650 /* Calculate number of leftover pages */
651 nr_pages = 0;
652 list_for_each(p, pagelist)
653 nr_pages++;
654 return nr_pages;
655}
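
The helpers exported above (migrate_page(), buffer_migrate_page(), fail_migrate_page()) are meant to be plugged into the address_space_operations hook that migrate_pages() consults. A sketch of how a filesystem might wire this up; the filesystem name is invented and the header carrying the prototypes is assumed to be <linux/migrate.h>:

    #include <linux/fs.h>
    #include <linux/migrate.h>

    /* ...readpage/writepage and the rest of the aops omitted... */
    static struct address_space_operations examplefs_aops = {
            /* Pages backed by buffer_heads can use the generic buffer walker;
             * a filesystem with no private page state would pick migrate_page(),
             * and one whose pages must never move would pick fail_migrate_page(). */
            .migratepage    = buffer_migrate_page,
    };
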
diff --git a/mm/mmap.c b/mm/mmap.c
index 47556d2b3e..0eb9894db6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -612,7 +612,7 @@ again: remove_next = 1 + (end > next->vm_end);
612 * If the vma has a ->close operation then the driver probably needs to release 612 * If the vma has a ->close operation then the driver probably needs to release
613 * per-vma resources, so we don't attempt to merge those. 613 * per-vma resources, so we don't attempt to merge those.
614 */ 614 */
615#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) 615#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
616 616
617static inline int is_mergeable_vma(struct vm_area_struct *vma, 617static inline int is_mergeable_vma(struct vm_area_struct *vma,
618 struct file *file, unsigned long vm_flags) 618 struct file *file, unsigned long vm_flags)
@@ -845,14 +845,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
845 const unsigned long stack_flags 845 const unsigned long stack_flags
846 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); 846 = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
847 847
848#ifdef CONFIG_HUGETLB
849 if (flags & VM_HUGETLB) {
850 if (!(flags & VM_DONTCOPY))
851 mm->shared_vm += pages;
852 return;
853 }
854#endif /* CONFIG_HUGETLB */
855
856 if (file) { 848 if (file) {
857 mm->shared_vm += pages; 849 mm->shared_vm += pages;
858 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) 850 if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 653b8571c1..4c14d4289b 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -124,7 +124,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
124 * a MAP_NORESERVE private mapping to writable will now reserve. 124 * a MAP_NORESERVE private mapping to writable will now reserve.
125 */ 125 */
126 if (newflags & VM_WRITE) { 126 if (newflags & VM_WRITE) {
127 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { 127 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
128 charged = nrpages; 128 charged = nrpages;
129 if (security_vm_enough_memory(charged)) 129 if (security_vm_enough_memory(charged))
130 return -ENOMEM; 130 return -ENOMEM;
@@ -166,7 +166,10 @@ success:
166 */ 166 */
167 vma->vm_flags = newflags; 167 vma->vm_flags = newflags;
168 vma->vm_page_prot = newprot; 168 vma->vm_page_prot = newprot;
169 change_protection(vma, start, end, newprot); 169 if (is_vm_hugetlb_page(vma))
170 hugetlb_change_protection(vma, start, end, newprot);
171 else
172 change_protection(vma, start, end, newprot);
170 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); 173 vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
171 vm_stat_account(mm, newflags, vma->vm_file, nrpages); 174 vm_stat_account(mm, newflags, vma->vm_file, nrpages);
172 return 0; 175 return 0;
@@ -240,11 +243,6 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
240 243
241 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ 244 /* Here we know that vma->vm_start <= nstart < vma->vm_end. */
242 245
243 if (is_vm_hugetlb_page(vma)) {
244 error = -EACCES;
245 goto out;
246 }
247
248 newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); 246 newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
249 247
250 /* newflags >> 4 shift VM_MAY% in place of VM_% */ 248 /* newflags >> 4 shift VM_MAY% in place of VM_% */
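
The practical effect of the mprotect.c change is that mprotect(2) on a hugetlbfs mapping, which used to fail with EACCES, now reaches the new hugetlb_change_protection() path. A user-space sketch; the mount point and the 2 MB huge page size are assumptions:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #define HPAGE (2UL * 1024 * 1024)   /* adjust to the system's huge page size */

    int main(void)
    {
            int fd = open("/mnt/huge/demo", O_CREAT | O_RDWR, 0600);
            char *p;

            if (fd < 0)
                    return 1;
            p = mmap(NULL, HPAGE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (p == MAP_FAILED)
                    return 1;
            p[0] = 1;                                   /* fault the huge page in */
            if (mprotect(p, HPAGE, PROT_READ) != 0)     /* now handled by hugetlb_change_protection() */
                    perror("mprotect");
            munmap(p, HPAGE);
            close(fd);
            return 0;
    }
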
diff --git a/mm/nommu.c b/mm/nommu.c
index 4951f4786f..db45efac17 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -159,7 +159,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
159 /* 159 /*
160 * kmalloc doesn't like __GFP_HIGHMEM for some reason 160 * kmalloc doesn't like __GFP_HIGHMEM for some reason
161 */ 161 */
162 return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM); 162 return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
163} 163}
164 164
165struct page * vmalloc_to_page(void *addr) 165struct page * vmalloc_to_page(void *addr)
@@ -623,7 +623,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
623 * - note that this may not return a page-aligned address if the object 623 * - note that this may not return a page-aligned address if the object
624 * we're allocating is smaller than a page 624 * we're allocating is smaller than a page
625 */ 625 */
626 base = kmalloc(len, GFP_KERNEL); 626 base = kmalloc(len, GFP_KERNEL|__GFP_COMP);
627 if (!base) 627 if (!base)
628 goto enomem; 628 goto enomem;
629 629
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 234bd4895d..b7f14a4799 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -55,7 +55,6 @@ unsigned long totalhigh_pages __read_mostly;
55long nr_swap_pages; 55long nr_swap_pages;
56int percpu_pagelist_fraction; 56int percpu_pagelist_fraction;
57 57
58static void fastcall free_hot_cold_page(struct page *page, int cold);
59static void __free_pages_ok(struct page *page, unsigned int order); 58static void __free_pages_ok(struct page *page, unsigned int order);
60 59
61/* 60/*
@@ -190,7 +189,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
190 for (i = 0; i < nr_pages; i++) { 189 for (i = 0; i < nr_pages; i++) {
191 struct page *p = page + i; 190 struct page *p = page + i;
192 191
193 SetPageCompound(p); 192 __SetPageCompound(p);
194 set_page_private(p, (unsigned long)page); 193 set_page_private(p, (unsigned long)page);
195 } 194 }
196} 195}
@@ -209,10 +208,24 @@ static void destroy_compound_page(struct page *page, unsigned long order)
209 if (unlikely(!PageCompound(p) | 208 if (unlikely(!PageCompound(p) |
210 (page_private(p) != (unsigned long)page))) 209 (page_private(p) != (unsigned long)page)))
211 bad_page(page); 210 bad_page(page);
212 ClearPageCompound(p); 211 __ClearPageCompound(p);
213 } 212 }
214} 213}
215 214
215static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
216{
217 int i;
218
219 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
220 /*
221 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
222 * and __GFP_HIGHMEM from hard or soft interrupt context.
223 */
224 BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
225 for (i = 0; i < (1 << order); i++)
226 clear_highpage(page + i);
227}
228
216/* 229/*
217 * function for dealing with page's order in buddy system. 230 * function for dealing with page's order in buddy system.
218 * zone->lock is already acquired when we use these. 231 * zone->lock is already acquired when we use these.
@@ -423,11 +436,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
423 mutex_debug_check_no_locks_freed(page_address(page), 436 mutex_debug_check_no_locks_freed(page_address(page),
424 PAGE_SIZE<<order); 437 PAGE_SIZE<<order);
425 438
426#ifndef CONFIG_MMU
427 for (i = 1 ; i < (1 << order) ; ++i)
428 __put_page(page + i);
429#endif
430
431 for (i = 0 ; i < (1 << order) ; ++i) 439 for (i = 0 ; i < (1 << order) ; ++i)
432 reserved += free_pages_check(page + i); 440 reserved += free_pages_check(page + i);
433 if (reserved) 441 if (reserved)
@@ -448,28 +456,23 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
448 if (order == 0) { 456 if (order == 0) {
449 __ClearPageReserved(page); 457 __ClearPageReserved(page);
450 set_page_count(page, 0); 458 set_page_count(page, 0);
451 459 set_page_refcounted(page);
452 free_hot_cold_page(page, 0); 460 __free_page(page);
453 } else { 461 } else {
454 LIST_HEAD(list);
455 int loop; 462 int loop;
456 463
464 prefetchw(page);
457 for (loop = 0; loop < BITS_PER_LONG; loop++) { 465 for (loop = 0; loop < BITS_PER_LONG; loop++) {
458 struct page *p = &page[loop]; 466 struct page *p = &page[loop];
459 467
460 if (loop + 16 < BITS_PER_LONG) 468 if (loop + 1 < BITS_PER_LONG)
461 prefetchw(p + 16); 469 prefetchw(p + 1);
462 __ClearPageReserved(p); 470 __ClearPageReserved(p);
463 set_page_count(p, 0); 471 set_page_count(p, 0);
464 } 472 }
465 473
466 arch_free_page(page, order); 474 set_page_refcounted(page);
467 475 __free_pages(page, order);
468 mod_page_state(pgfree, 1 << order);
469
470 list_add(&page->lru, &list);
471 kernel_map_pages(page, 1 << order, 0);
472 free_pages_bulk(page_zone(page), 1, &list, order);
473 } 476 }
474} 477}
475 478
@@ -507,7 +510,7 @@ static inline void expand(struct zone *zone, struct page *page,
507/* 510/*
508 * This page is about to be returned from the page allocator 511 * This page is about to be returned from the page allocator
509 */ 512 */
510static int prep_new_page(struct page *page, int order) 513static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
511{ 514{
512 if (unlikely(page_mapcount(page) | 515 if (unlikely(page_mapcount(page) |
513 (page->mapping != NULL) | 516 (page->mapping != NULL) |
@@ -536,8 +539,15 @@ static int prep_new_page(struct page *page, int order)
536 1 << PG_referenced | 1 << PG_arch_1 | 539 1 << PG_referenced | 1 << PG_arch_1 |
537 1 << PG_checked | 1 << PG_mappedtodisk); 540 1 << PG_checked | 1 << PG_mappedtodisk);
538 set_page_private(page, 0); 541 set_page_private(page, 0);
539 set_page_refs(page, order); 542 set_page_refcounted(page);
540 kernel_map_pages(page, 1 << order, 1); 543 kernel_map_pages(page, 1 << order, 1);
544
545 if (gfp_flags & __GFP_ZERO)
546 prep_zero_page(page, order, gfp_flags);
547
548 if (order && (gfp_flags & __GFP_COMP))
549 prep_compound_page(page, order);
550
541 return 0; 551 return 0;
542} 552}
543 553
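With zeroing and compound setup folded into prep_new_page(), a caller obtains both directly from the allocator. A minimal, illustrative caller (sketch only, not part of this diff):

        struct page *page;

        /* order-2 block, zeroed and prepared as a compound page */
        page = alloc_pages(GFP_KERNEL | __GFP_ZERO | __GFP_COMP, 2);
        if (page)
                __free_pages(page, 2);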
@@ -593,13 +603,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
593/* 603/*
594 * Called from the slab reaper to drain pagesets on a particular node that 604 * Called from the slab reaper to drain pagesets on a particular node that
595 * belong to the currently executing processor. 605 * belong to the currently executing processor.
606 * Note that this function must be called with the thread pinned to
607 * a single processor.
596 */ 608 */
597void drain_node_pages(int nodeid) 609void drain_node_pages(int nodeid)
598{ 610{
599 int i, z; 611 int i, z;
600 unsigned long flags; 612 unsigned long flags;
601 613
602 local_irq_save(flags);
603 for (z = 0; z < MAX_NR_ZONES; z++) { 614 for (z = 0; z < MAX_NR_ZONES; z++) {
604 struct zone *zone = NODE_DATA(nodeid)->node_zones + z; 615 struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
605 struct per_cpu_pageset *pset; 616 struct per_cpu_pageset *pset;
@@ -609,11 +620,14 @@ void drain_node_pages(int nodeid)
609 struct per_cpu_pages *pcp; 620 struct per_cpu_pages *pcp;
610 621
611 pcp = &pset->pcp[i]; 622 pcp = &pset->pcp[i];
612 free_pages_bulk(zone, pcp->count, &pcp->list, 0); 623 if (pcp->count) {
613 pcp->count = 0; 624 local_irq_save(flags);
625 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
626 pcp->count = 0;
627 local_irq_restore(flags);
628 }
614 } 629 }
615 } 630 }
616 local_irq_restore(flags);
617} 631}
618#endif 632#endif
619 633
@@ -743,13 +757,22 @@ void fastcall free_cold_page(struct page *page)
743 free_hot_cold_page(page, 1); 757 free_hot_cold_page(page, 1);
744} 758}
745 759
746static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 760/*
761 * split_page takes a non-compound higher-order page, and splits it into
762 * n (1<<order) sub-pages: page[0..n]
763 * Each sub-page must be freed individually.
764 *
765 * Note: this is probably too low level an operation for use in drivers.
766 * Please consult with lkml before using this in your driver.
767 */
768void split_page(struct page *page, unsigned int order)
747{ 769{
748 int i; 770 int i;
749 771
750 BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); 772 BUG_ON(PageCompound(page));
751 for(i = 0; i < (1 << order); i++) 773 BUG_ON(!page_count(page));
752 clear_highpage(page + i); 774 for (i = 1; i < (1 << order); i++)
775 set_page_refcounted(page + i);
753} 776}
754 777
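split_page() converts one non-compound order-n allocation into 1 << n independently refcounted order-0 pages, each of which must then be freed on its own. An illustrative use, subject to the lkml caveat in the comment above (sketch only, not part of this diff):

        struct page *page = alloc_pages(GFP_KERNEL, 2);
        int i;

        if (page) {
                split_page(page, 2);            /* pages 0..3 now stand alone */
                for (i = 0; i < 4; i++)
                        __free_page(page + i);
        }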
755/* 778/*
@@ -795,14 +818,8 @@ again:
795 put_cpu(); 818 put_cpu();
796 819
797 BUG_ON(bad_range(zone, page)); 820 BUG_ON(bad_range(zone, page));
798 if (prep_new_page(page, order)) 821 if (prep_new_page(page, order, gfp_flags))
799 goto again; 822 goto again;
800
801 if (gfp_flags & __GFP_ZERO)
802 prep_zero_page(page, order, gfp_flags);
803
804 if (order && (gfp_flags & __GFP_COMP))
805 prep_compound_page(page, order);
806 return page; 823 return page;
807 824
808failed: 825failed:
@@ -1214,24 +1231,22 @@ DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1214 1231
1215static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) 1232static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1216{ 1233{
1217 int cpu = 0; 1234 unsigned cpu;
1218 1235
1219 memset(ret, 0, nr * sizeof(unsigned long)); 1236 memset(ret, 0, nr * sizeof(unsigned long));
1220 cpus_and(*cpumask, *cpumask, cpu_online_map); 1237 cpus_and(*cpumask, *cpumask, cpu_online_map);
1221 1238
1222 cpu = first_cpu(*cpumask); 1239 for_each_cpu_mask(cpu, *cpumask) {
1223 while (cpu < NR_CPUS) { 1240 unsigned long *in;
1224 unsigned long *in, *out, off; 1241 unsigned long *out;
1225 1242 unsigned off;
1226 if (!cpu_isset(cpu, *cpumask)) 1243 unsigned next_cpu;
1227 continue;
1228 1244
1229 in = (unsigned long *)&per_cpu(page_states, cpu); 1245 in = (unsigned long *)&per_cpu(page_states, cpu);
1230 1246
1231 cpu = next_cpu(cpu, *cpumask); 1247 next_cpu = next_cpu(cpu, *cpumask);
1232 1248 if (likely(next_cpu < NR_CPUS))
1233 if (likely(cpu < NR_CPUS)) 1249 prefetch(&per_cpu(page_states, next_cpu));
1234 prefetch(&per_cpu(page_states, cpu));
1235 1250
1236 out = (unsigned long *)ret; 1251 out = (unsigned long *)ret;
1237 for (off = 0; off < nr; off++) 1252 for (off = 0; off < nr; off++)
@@ -1764,7 +1779,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1764 continue; 1779 continue;
1765 page = pfn_to_page(pfn); 1780 page = pfn_to_page(pfn);
1766 set_page_links(page, zone, nid, pfn); 1781 set_page_links(page, zone, nid, pfn);
1767 set_page_count(page, 1); 1782 init_page_count(page);
1768 reset_page_mapcount(page); 1783 reset_page_mapcount(page);
1769 SetPageReserved(page); 1784 SetPageReserved(page);
1770 INIT_LIST_HEAD(&page->lru); 1785 INIT_LIST_HEAD(&page->lru);
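A few hunks up, __get_page_state() drops the hand-rolled first_cpu()/next_cpu() walk in favour of for_each_cpu_mask(), prefetching the next CPU's page_states while the current one is summed. The same per-cpu accumulation pattern reduced to a sketch (nr_dirty is only an illustrative field choice):

        unsigned long total = 0;
        unsigned int cpu;
        cpumask_t mask = cpu_online_map;

        for_each_cpu_mask(cpu, mask)
                total += per_cpu(page_states, cpu).nr_dirty;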
diff --git a/mm/readahead.c b/mm/readahead.c
index 8d6eeaaa62..301b36c4a0 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -52,13 +52,24 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra)
52 return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; 52 return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
53} 53}
54 54
55static inline void reset_ahead_window(struct file_ra_state *ra)
56{
57 /*
58 * ... but preserve ahead_start + ahead_size value,
59 * see 'recheck:' label in page_cache_readahead().
60 * Note: We never use ->ahead_size as rvalue without
61 * checking ->ahead_start != 0 first.
62 */
63 ra->ahead_size += ra->ahead_start;
64 ra->ahead_start = 0;
65}
66
55static inline void ra_off(struct file_ra_state *ra) 67static inline void ra_off(struct file_ra_state *ra)
56{ 68{
57 ra->start = 0; 69 ra->start = 0;
58 ra->flags = 0; 70 ra->flags = 0;
59 ra->size = 0; 71 ra->size = 0;
60 ra->ahead_start = 0; 72 reset_ahead_window(ra);
61 ra->ahead_size = 0;
62 return; 73 return;
63} 74}
64 75
@@ -72,10 +83,10 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
72{ 83{
73 unsigned long newsize = roundup_pow_of_two(size); 84 unsigned long newsize = roundup_pow_of_two(size);
74 85
75 if (newsize <= max / 64) 86 if (newsize <= max / 32)
76 newsize = newsize * newsize; 87 newsize = newsize * 4;
77 else if (newsize <= max / 4) 88 else if (newsize <= max / 4)
78 newsize = max / 4; 89 newsize = newsize * 2;
79 else 90 else
80 newsize = max; 91 newsize = max;
81 return newsize; 92 return newsize;
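The retuned heuristic grows very small requests by 4x and mid-sized ones by 2x, instead of squaring the former and jumping the latter straight to max / 4. Worked through with an assumed max of 128 pages:

        /*
         * request  4 pages -> roundup  4,  4 <= 128/32 -> window  16 pages
         * request 16 pages -> roundup 16, 16 <= 128/4  -> window  32 pages
         * request 64 pages -> roundup 64, else branch  -> window 128 pages
         */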
@@ -426,8 +437,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp,
426 * congestion. The ahead window will any way be closed 437 * congestion. The ahead window will any way be closed
427 * in case we failed due to excessive page cache hits. 438 * in case we failed due to excessive page cache hits.
428 */ 439 */
429 ra->ahead_start = 0; 440 reset_ahead_window(ra);
430 ra->ahead_size = 0;
431 } 441 }
432 442
433 return ret; 443 return ret;
@@ -520,11 +530,11 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
520 * If we get here we are doing sequential IO and this was not the first 530 * If we get here we are doing sequential IO and this was not the first
521 * occurence (ie we have an existing window) 531 * occurence (ie we have an existing window)
522 */ 532 */
523
524 if (ra->ahead_start == 0) { /* no ahead window yet */ 533 if (ra->ahead_start == 0) { /* no ahead window yet */
525 if (!make_ahead_window(mapping, filp, ra, 0)) 534 if (!make_ahead_window(mapping, filp, ra, 0))
526 goto out; 535 goto recheck;
527 } 536 }
537
528 /* 538 /*
529 * Already have an ahead window, check if we crossed into it. 539 * Already have an ahead window, check if we crossed into it.
530 * If so, shift windows and issue a new ahead window. 540 * If so, shift windows and issue a new ahead window.
@@ -536,6 +546,10 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
536 ra->start = ra->ahead_start; 546 ra->start = ra->ahead_start;
537 ra->size = ra->ahead_size; 547 ra->size = ra->ahead_size;
538 make_ahead_window(mapping, filp, ra, 0); 548 make_ahead_window(mapping, filp, ra, 0);
549recheck:
550 /* prev_page shouldn't overrun the ahead window */
551 ra->prev_page = min(ra->prev_page,
552 ra->ahead_start + ra->ahead_size - 1);
539 } 553 }
540 554
541out: 555out:
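reset_ahead_window() keeps the sum ahead_start + ahead_size intact by folding the start into the size, so the clamp at the new recheck: label still has a meaningful upper bound even after the window has been torn down. A numeric sketch of that invariant (values are illustrative only):

        struct file_ra_state ra = { .ahead_start = 100, .ahead_size = 32 };

        /* window covers pages 100..131; clamp bound = 100 + 32 - 1 = 131 */
        reset_ahead_window(&ra);        /* ahead_start = 0, ahead_size = 132 */
        /* bound is still 0 + 132 - 1 = 131, so prev_page cannot overrun it */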
diff --git a/mm/rmap.c b/mm/rmap.c
index 67f0e20b10..1963e26931 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,13 +56,11 @@
56 56
57#include <asm/tlbflush.h> 57#include <asm/tlbflush.h>
58 58
59//#define RMAP_DEBUG /* can be enabled only for debugging */ 59struct kmem_cache *anon_vma_cachep;
60
61kmem_cache_t *anon_vma_cachep;
62 60
63static inline void validate_anon_vma(struct vm_area_struct *find_vma) 61static inline void validate_anon_vma(struct vm_area_struct *find_vma)
64{ 62{
65#ifdef RMAP_DEBUG 63#ifdef CONFIG_DEBUG_VM
66 struct anon_vma *anon_vma = find_vma->anon_vma; 64 struct anon_vma *anon_vma = find_vma->anon_vma;
67 struct vm_area_struct *vma; 65 struct vm_area_struct *vma;
68 unsigned int mapcount = 0; 66 unsigned int mapcount = 0;
@@ -166,7 +164,8 @@ void anon_vma_unlink(struct vm_area_struct *vma)
166 anon_vma_free(anon_vma); 164 anon_vma_free(anon_vma);
167} 165}
168 166
169static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags) 167static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
168 unsigned long flags)
170{ 169{
171 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == 170 if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
172 SLAB_CTOR_CONSTRUCTOR) { 171 SLAB_CTOR_CONSTRUCTOR) {
@@ -550,13 +549,14 @@ void page_add_file_rmap(struct page *page)
550void page_remove_rmap(struct page *page) 549void page_remove_rmap(struct page *page)
551{ 550{
552 if (atomic_add_negative(-1, &page->_mapcount)) { 551 if (atomic_add_negative(-1, &page->_mapcount)) {
553 if (page_mapcount(page) < 0) { 552#ifdef CONFIG_DEBUG_VM
553 if (unlikely(page_mapcount(page) < 0)) {
554 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); 554 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
555 printk (KERN_EMERG " page->flags = %lx\n", page->flags); 555 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
556 printk (KERN_EMERG " page->count = %x\n", page_count(page)); 556 printk (KERN_EMERG " page->count = %x\n", page_count(page));
557 printk (KERN_EMERG " page->mapping = %p\n", page->mapping); 557 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
558 } 558 }
559 559#endif
560 BUG_ON(page_mapcount(page) < 0); 560 BUG_ON(page_mapcount(page) < 0);
561 /* 561 /*
562 * It would be tidy to reset the PageAnon mapping here, 562 * It would be tidy to reset the PageAnon mapping here,
diff --git a/mm/shmem.c b/mm/shmem.c
index 7c455fbaff..37eaf42ed2 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -875,7 +875,7 @@ redirty:
875} 875}
876 876
877#ifdef CONFIG_NUMA 877#ifdef CONFIG_NUMA
878static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) 878static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
879{ 879{
880 char *nodelist = strchr(value, ':'); 880 char *nodelist = strchr(value, ':');
881 int err = 1; 881 int err = 1;
@@ -2119,7 +2119,7 @@ failed:
2119 return err; 2119 return err;
2120} 2120}
2121 2121
2122static kmem_cache_t *shmem_inode_cachep; 2122static struct kmem_cache *shmem_inode_cachep;
2123 2123
2124static struct inode *shmem_alloc_inode(struct super_block *sb) 2124static struct inode *shmem_alloc_inode(struct super_block *sb)
2125{ 2125{
@@ -2139,7 +2139,8 @@ static void shmem_destroy_inode(struct inode *inode)
2139 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); 2139 kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
2140} 2140}
2141 2141
2142static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags) 2142static void init_once(void *foo, struct kmem_cache *cachep,
2143 unsigned long flags)
2143{ 2144{
2144 struct shmem_inode_info *p = (struct shmem_inode_info *) foo; 2145 struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
2145 2146
diff --git a/mm/slab.c b/mm/slab.c
index d0bd7f07ab..1c8f5ee230 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -50,7 +50,7 @@
50 * The head array is strictly LIFO and should improve the cache hit rates. 50 * The head array is strictly LIFO and should improve the cache hit rates.
51 * On SMP, it additionally reduces the spinlock operations. 51 * On SMP, it additionally reduces the spinlock operations.
52 * 52 *
53 * The c_cpuarray may not be read with enabled local interrupts - 53 * The c_cpuarray may not be read with enabled local interrupts -
54 * it's changed with a smp_call_function(). 54 * it's changed with a smp_call_function().
55 * 55 *
56 * SMP synchronization: 56 * SMP synchronization:
@@ -170,12 +170,12 @@
170#if DEBUG 170#if DEBUG
171# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ 171# define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
172 SLAB_POISON | SLAB_HWCACHE_ALIGN | \ 172 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
173 SLAB_NO_REAP | SLAB_CACHE_DMA | \ 173 SLAB_CACHE_DMA | \
174 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ 174 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
175 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 175 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
176 SLAB_DESTROY_BY_RCU) 176 SLAB_DESTROY_BY_RCU)
177#else 177#else
178# define CREATE_MASK (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \ 178# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
179 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ 179 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
180 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 180 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
181 SLAB_DESTROY_BY_RCU) 181 SLAB_DESTROY_BY_RCU)
@@ -266,16 +266,17 @@ struct array_cache {
266 unsigned int batchcount; 266 unsigned int batchcount;
267 unsigned int touched; 267 unsigned int touched;
268 spinlock_t lock; 268 spinlock_t lock;
269 void *entry[0]; /* 269 void *entry[0]; /*
270 * Must have this definition in here for the proper 270 * Must have this definition in here for the proper
271 * alignment of array_cache. Also simplifies accessing 271 * alignment of array_cache. Also simplifies accessing
272 * the entries. 272 * the entries.
273 * [0] is for gcc 2.95. It should really be []. 273 * [0] is for gcc 2.95. It should really be [].
274 */ 274 */
275}; 275};
276 276
277/* bootstrap: The caches do not work without cpuarrays anymore, 277/*
278 * but the cpuarrays are allocated from the generic caches... 278 * bootstrap: The caches do not work without cpuarrays anymore, but the
279 * cpuarrays are allocated from the generic caches...
279 */ 280 */
280#define BOOT_CPUCACHE_ENTRIES 1 281#define BOOT_CPUCACHE_ENTRIES 1
281struct arraycache_init { 282struct arraycache_init {
@@ -291,13 +292,13 @@ struct kmem_list3 {
291 struct list_head slabs_full; 292 struct list_head slabs_full;
292 struct list_head slabs_free; 293 struct list_head slabs_free;
293 unsigned long free_objects; 294 unsigned long free_objects;
294 unsigned long next_reap;
295 int free_touched;
296 unsigned int free_limit; 295 unsigned int free_limit;
297 unsigned int colour_next; /* Per-node cache coloring */ 296 unsigned int colour_next; /* Per-node cache coloring */
298 spinlock_t list_lock; 297 spinlock_t list_lock;
299 struct array_cache *shared; /* shared per node */ 298 struct array_cache *shared; /* shared per node */
300 struct array_cache **alien; /* on other nodes */ 299 struct array_cache **alien; /* on other nodes */
300 unsigned long next_reap; /* updated without locking */
301 int free_touched; /* updated without locking */
301}; 302};
302 303
303/* 304/*
@@ -310,10 +311,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
310#define SIZE_L3 (1 + MAX_NUMNODES) 311#define SIZE_L3 (1 + MAX_NUMNODES)
311 312
312/* 313/*
313 * This function must be completely optimized away if 314 * This function must be completely optimized away if a constant is passed to
314 * a constant is passed to it. Mostly the same as 315 * it. Mostly the same as what is in linux/slab.h except it returns an index.
315 * what is in linux/slab.h except it returns an
316 * index.
317 */ 316 */
318static __always_inline int index_of(const size_t size) 317static __always_inline int index_of(const size_t size)
319{ 318{
@@ -351,14 +350,14 @@ static void kmem_list3_init(struct kmem_list3 *parent)
351 parent->free_touched = 0; 350 parent->free_touched = 0;
352} 351}
353 352
354#define MAKE_LIST(cachep, listp, slab, nodeid) \ 353#define MAKE_LIST(cachep, listp, slab, nodeid) \
355 do { \ 354 do { \
356 INIT_LIST_HEAD(listp); \ 355 INIT_LIST_HEAD(listp); \
357 list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ 356 list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
358 } while (0) 357 } while (0)
359 358
360#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ 359#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
361 do { \ 360 do { \
362 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \ 361 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
363 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \ 362 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
364 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ 363 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
@@ -373,28 +372,30 @@ static void kmem_list3_init(struct kmem_list3 *parent)
373struct kmem_cache { 372struct kmem_cache {
374/* 1) per-cpu data, touched during every alloc/free */ 373/* 1) per-cpu data, touched during every alloc/free */
375 struct array_cache *array[NR_CPUS]; 374 struct array_cache *array[NR_CPUS];
375/* 2) Cache tunables. Protected by cache_chain_mutex */
376 unsigned int batchcount; 376 unsigned int batchcount;
377 unsigned int limit; 377 unsigned int limit;
378 unsigned int shared; 378 unsigned int shared;
379
379 unsigned int buffer_size; 380 unsigned int buffer_size;
380/* 2) touched by every alloc & free from the backend */ 381/* 3) touched by every alloc & free from the backend */
381 struct kmem_list3 *nodelists[MAX_NUMNODES]; 382 struct kmem_list3 *nodelists[MAX_NUMNODES];
382 unsigned int flags; /* constant flags */
383 unsigned int num; /* # of objs per slab */
384 spinlock_t spinlock;
385 383
386/* 3) cache_grow/shrink */ 384 unsigned int flags; /* constant flags */
385 unsigned int num; /* # of objs per slab */
386
387/* 4) cache_grow/shrink */
387 /* order of pgs per slab (2^n) */ 388 /* order of pgs per slab (2^n) */
388 unsigned int gfporder; 389 unsigned int gfporder;
389 390
390 /* force GFP flags, e.g. GFP_DMA */ 391 /* force GFP flags, e.g. GFP_DMA */
391 gfp_t gfpflags; 392 gfp_t gfpflags;
392 393
393 size_t colour; /* cache colouring range */ 394 size_t colour; /* cache colouring range */
394 unsigned int colour_off; /* colour offset */ 395 unsigned int colour_off; /* colour offset */
395 struct kmem_cache *slabp_cache; 396 struct kmem_cache *slabp_cache;
396 unsigned int slab_size; 397 unsigned int slab_size;
397 unsigned int dflags; /* dynamic flags */ 398 unsigned int dflags; /* dynamic flags */
398 399
399 /* constructor func */ 400 /* constructor func */
400 void (*ctor) (void *, struct kmem_cache *, unsigned long); 401 void (*ctor) (void *, struct kmem_cache *, unsigned long);
@@ -402,11 +403,11 @@ struct kmem_cache {
402 /* de-constructor func */ 403 /* de-constructor func */
403 void (*dtor) (void *, struct kmem_cache *, unsigned long); 404 void (*dtor) (void *, struct kmem_cache *, unsigned long);
404 405
405/* 4) cache creation/removal */ 406/* 5) cache creation/removal */
406 const char *name; 407 const char *name;
407 struct list_head next; 408 struct list_head next;
408 409
409/* 5) statistics */ 410/* 6) statistics */
410#if STATS 411#if STATS
411 unsigned long num_active; 412 unsigned long num_active;
412 unsigned long num_allocations; 413 unsigned long num_allocations;
@@ -438,8 +439,9 @@ struct kmem_cache {
438#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) 439#define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB)
439 440
440#define BATCHREFILL_LIMIT 16 441#define BATCHREFILL_LIMIT 16
441/* Optimization question: fewer reaps means less 442/*
442 * probability for unnessary cpucache drain/refill cycles. 443 * Optimization question: fewer reaps means less probability for unnessary
444 * cpucache drain/refill cycles.
443 * 445 *
444 * OTOH the cpuarrays can contain lots of objects, 446 * OTOH the cpuarrays can contain lots of objects,
445 * which could lock up otherwise freeable slabs. 447 * which could lock up otherwise freeable slabs.
@@ -453,17 +455,19 @@ struct kmem_cache {
453#define STATS_INC_ALLOCED(x) ((x)->num_allocations++) 455#define STATS_INC_ALLOCED(x) ((x)->num_allocations++)
454#define STATS_INC_GROWN(x) ((x)->grown++) 456#define STATS_INC_GROWN(x) ((x)->grown++)
455#define STATS_INC_REAPED(x) ((x)->reaped++) 457#define STATS_INC_REAPED(x) ((x)->reaped++)
456#define STATS_SET_HIGH(x) do { if ((x)->num_active > (x)->high_mark) \ 458#define STATS_SET_HIGH(x) \
457 (x)->high_mark = (x)->num_active; \ 459 do { \
458 } while (0) 460 if ((x)->num_active > (x)->high_mark) \
461 (x)->high_mark = (x)->num_active; \
462 } while (0)
459#define STATS_INC_ERR(x) ((x)->errors++) 463#define STATS_INC_ERR(x) ((x)->errors++)
460#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) 464#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
461#define STATS_INC_NODEFREES(x) ((x)->node_frees++) 465#define STATS_INC_NODEFREES(x) ((x)->node_frees++)
462#define STATS_SET_FREEABLE(x, i) \ 466#define STATS_SET_FREEABLE(x, i) \
463 do { if ((x)->max_freeable < i) \ 467 do { \
464 (x)->max_freeable = i; \ 468 if ((x)->max_freeable < i) \
465 } while (0) 469 (x)->max_freeable = i; \
466 470 } while (0)
467#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit) 471#define STATS_INC_ALLOCHIT(x) atomic_inc(&(x)->allochit)
468#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss) 472#define STATS_INC_ALLOCMISS(x) atomic_inc(&(x)->allocmiss)
469#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit) 473#define STATS_INC_FREEHIT(x) atomic_inc(&(x)->freehit)
@@ -478,9 +482,7 @@ struct kmem_cache {
478#define STATS_INC_ERR(x) do { } while (0) 482#define STATS_INC_ERR(x) do { } while (0)
479#define STATS_INC_NODEALLOCS(x) do { } while (0) 483#define STATS_INC_NODEALLOCS(x) do { } while (0)
480#define STATS_INC_NODEFREES(x) do { } while (0) 484#define STATS_INC_NODEFREES(x) do { } while (0)
481#define STATS_SET_FREEABLE(x, i) \ 485#define STATS_SET_FREEABLE(x, i) do { } while (0)
482 do { } while (0)
483
484#define STATS_INC_ALLOCHIT(x) do { } while (0) 486#define STATS_INC_ALLOCHIT(x) do { } while (0)
485#define STATS_INC_ALLOCMISS(x) do { } while (0) 487#define STATS_INC_ALLOCMISS(x) do { } while (0)
486#define STATS_INC_FREEHIT(x) do { } while (0) 488#define STATS_INC_FREEHIT(x) do { } while (0)
@@ -488,7 +490,8 @@ struct kmem_cache {
488#endif 490#endif
489 491
490#if DEBUG 492#if DEBUG
491/* Magic nums for obj red zoning. 493/*
494 * Magic nums for obj red zoning.
492 * Placed in the first word before and the first word after an obj. 495 * Placed in the first word before and the first word after an obj.
493 */ 496 */
494#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */ 497#define RED_INACTIVE 0x5A2CF071UL /* when obj is inactive */
@@ -499,7 +502,8 @@ struct kmem_cache {
499#define POISON_FREE 0x6b /* for use-after-free poisoning */ 502#define POISON_FREE 0x6b /* for use-after-free poisoning */
500#define POISON_END 0xa5 /* end-byte of poisoning */ 503#define POISON_END 0xa5 /* end-byte of poisoning */
501 504
502/* memory layout of objects: 505/*
506 * memory layout of objects:
503 * 0 : objp 507 * 0 : objp
504 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that 508 * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
505 * the end of an object is aligned with the end of the real 509 * the end of an object is aligned with the end of the real
@@ -508,7 +512,8 @@ struct kmem_cache {
508 * redzone word. 512 * redzone word.
509 * cachep->obj_offset: The real object. 513 * cachep->obj_offset: The real object.
510 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long] 514 * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
511 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long] 515 * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
516 * [BYTES_PER_WORD long]
512 */ 517 */
513static int obj_offset(struct kmem_cache *cachep) 518static int obj_offset(struct kmem_cache *cachep)
514{ 519{
@@ -552,8 +557,8 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
552#endif 557#endif
553 558
554/* 559/*
555 * Maximum size of an obj (in 2^order pages) 560 * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp
556 * and absolute limit for the gfp order. 561 * order.
557 */ 562 */
558#if defined(CONFIG_LARGE_ALLOCS) 563#if defined(CONFIG_LARGE_ALLOCS)
559#define MAX_OBJ_ORDER 13 /* up to 32Mb */ 564#define MAX_OBJ_ORDER 13 /* up to 32Mb */
@@ -573,9 +578,10 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
573#define BREAK_GFP_ORDER_LO 0 578#define BREAK_GFP_ORDER_LO 0
574static int slab_break_gfp_order = BREAK_GFP_ORDER_LO; 579static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
575 580
576/* Functions for storing/retrieving the cachep and or slab from the 581/*
577 * global 'mem_map'. These are used to find the slab an obj belongs to. 582 * Functions for storing/retrieving the cachep and or slab from the page
578 * With kfree(), these are used to find the cache which an obj belongs to. 583 * allocator. These are used to find the slab an obj belongs to. With kfree(),
584 * these are used to find the cache which an obj belongs to.
579 */ 585 */
580static inline void page_set_cache(struct page *page, struct kmem_cache *cache) 586static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
581{ 587{
@@ -584,6 +590,8 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
584 590
585static inline struct kmem_cache *page_get_cache(struct page *page) 591static inline struct kmem_cache *page_get_cache(struct page *page)
586{ 592{
593 if (unlikely(PageCompound(page)))
594 page = (struct page *)page_private(page);
587 return (struct kmem_cache *)page->lru.next; 595 return (struct kmem_cache *)page->lru.next;
588} 596}
589 597
@@ -594,6 +602,8 @@ static inline void page_set_slab(struct page *page, struct slab *slab)
594 602
595static inline struct slab *page_get_slab(struct page *page) 603static inline struct slab *page_get_slab(struct page *page)
596{ 604{
605 if (unlikely(PageCompound(page)))
606 page = (struct page *)page_private(page);
597 return (struct slab *)page->lru.prev; 607 return (struct slab *)page->lru.prev;
598} 608}
599 609
@@ -609,7 +619,21 @@ static inline struct slab *virt_to_slab(const void *obj)
609 return page_get_slab(page); 619 return page_get_slab(page);
610} 620}
611 621
612/* These are the default caches for kmalloc. Custom caches can have other sizes. */ 622static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
623 unsigned int idx)
624{
625 return slab->s_mem + cache->buffer_size * idx;
626}
627
628static inline unsigned int obj_to_index(struct kmem_cache *cache,
629 struct slab *slab, void *obj)
630{
631 return (unsigned)(obj - slab->s_mem) / cache->buffer_size;
632}
633
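index_to_obj() and obj_to_index() replace the open-coded pointer arithmetic repeated in the debug paths further down (check_poison_obj(), slab_destroy_objs()). Their round-trip property, written out with assumed values:

        /* with slabp->s_mem as the base and cachep->buffer_size == 256: */
        void *objp = index_to_obj(cachep, slabp, 3);          /* s_mem + 3 * 256 */
        unsigned int idx = obj_to_index(cachep, slabp, objp); /* back to 3 */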
634/*
635 * These are the default caches for kmalloc. Custom caches can have other sizes.
636 */
613struct cache_sizes malloc_sizes[] = { 637struct cache_sizes malloc_sizes[] = {
614#define CACHE(x) { .cs_size = (x) }, 638#define CACHE(x) { .cs_size = (x) },
615#include <linux/kmalloc_sizes.h> 639#include <linux/kmalloc_sizes.h>
@@ -642,8 +666,6 @@ static struct kmem_cache cache_cache = {
642 .limit = BOOT_CPUCACHE_ENTRIES, 666 .limit = BOOT_CPUCACHE_ENTRIES,
643 .shared = 1, 667 .shared = 1,
644 .buffer_size = sizeof(struct kmem_cache), 668 .buffer_size = sizeof(struct kmem_cache),
645 .flags = SLAB_NO_REAP,
646 .spinlock = SPIN_LOCK_UNLOCKED,
647 .name = "kmem_cache", 669 .name = "kmem_cache",
648#if DEBUG 670#if DEBUG
649 .obj_size = sizeof(struct kmem_cache), 671 .obj_size = sizeof(struct kmem_cache),
@@ -655,8 +677,8 @@ static DEFINE_MUTEX(cache_chain_mutex);
655static struct list_head cache_chain; 677static struct list_head cache_chain;
656 678
657/* 679/*
658 * vm_enough_memory() looks at this to determine how many 680 * vm_enough_memory() looks at this to determine how many slab-allocated pages
659 * slab-allocated pages are possibly freeable under pressure 681 * are possibly freeable under pressure
660 * 682 *
661 * SLAB_RECLAIM_ACCOUNT turns this on per-slab 683 * SLAB_RECLAIM_ACCOUNT turns this on per-slab
662 */ 684 */
@@ -675,7 +697,8 @@ static enum {
675 697
676static DEFINE_PER_CPU(struct work_struct, reap_work); 698static DEFINE_PER_CPU(struct work_struct, reap_work);
677 699
678static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); 700static void free_block(struct kmem_cache *cachep, void **objpp, int len,
701 int node);
679static void enable_cpucache(struct kmem_cache *cachep); 702static void enable_cpucache(struct kmem_cache *cachep);
680static void cache_reap(void *unused); 703static void cache_reap(void *unused);
681static int __node_shrink(struct kmem_cache *cachep, int node); 704static int __node_shrink(struct kmem_cache *cachep, int node);
@@ -685,7 +708,8 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
685 return cachep->array[smp_processor_id()]; 708 return cachep->array[smp_processor_id()];
686} 709}
687 710
688static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags) 711static inline struct kmem_cache *__find_general_cachep(size_t size,
712 gfp_t gfpflags)
689{ 713{
690 struct cache_sizes *csizep = malloc_sizes; 714 struct cache_sizes *csizep = malloc_sizes;
691 715
@@ -720,8 +744,9 @@ static size_t slab_mgmt_size(size_t nr_objs, size_t align)
720 return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); 744 return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
721} 745}
722 746
723/* Calculate the number of objects and left-over bytes for a given 747/*
724 buffer size. */ 748 * Calculate the number of objects and left-over bytes for a given buffer size.
749 */
725static void cache_estimate(unsigned long gfporder, size_t buffer_size, 750static void cache_estimate(unsigned long gfporder, size_t buffer_size,
726 size_t align, int flags, size_t *left_over, 751 size_t align, int flags, size_t *left_over,
727 unsigned int *num) 752 unsigned int *num)
@@ -782,7 +807,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
782 807
783#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg) 808#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
784 809
785static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg) 810static void __slab_error(const char *function, struct kmem_cache *cachep,
811 char *msg)
786{ 812{
787 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", 813 printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
788 function, cachep->name, msg); 814 function, cachep->name, msg);
@@ -804,7 +830,7 @@ static void init_reap_node(int cpu)
804 830
805 node = next_node(cpu_to_node(cpu), node_online_map); 831 node = next_node(cpu_to_node(cpu), node_online_map);
806 if (node == MAX_NUMNODES) 832 if (node == MAX_NUMNODES)
807 node = 0; 833 node = first_node(node_online_map);
808 834
809 __get_cpu_var(reap_node) = node; 835 __get_cpu_var(reap_node) = node;
810} 836}
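Falling back to first_node(node_online_map) rather than a hard-coded node 0 matters on machines where node 0 is offline. An illustrative walk of the fixed wrap-around, assuming only nodes 1 and 2 are online:

        int node = next_node(2, node_online_map);       /* nothing after 2: MAX_NUMNODES */

        if (node == MAX_NUMNODES)
                node = first_node(node_online_map);     /* node 1, not the offline node 0 */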
@@ -906,10 +932,8 @@ static void free_alien_cache(struct array_cache **ac_ptr)
906 932
907 if (!ac_ptr) 933 if (!ac_ptr)
908 return; 934 return;
909
910 for_each_node(i) 935 for_each_node(i)
911 kfree(ac_ptr[i]); 936 kfree(ac_ptr[i]);
912
913 kfree(ac_ptr); 937 kfree(ac_ptr);
914} 938}
915 939
@@ -943,7 +967,8 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
943 } 967 }
944} 968}
945 969
946static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien) 970static void drain_alien_cache(struct kmem_cache *cachep,
971 struct array_cache **alien)
947{ 972{
948 int i = 0; 973 int i = 0;
949 struct array_cache *ac; 974 struct array_cache *ac;
@@ -986,20 +1011,22 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
986 switch (action) { 1011 switch (action) {
987 case CPU_UP_PREPARE: 1012 case CPU_UP_PREPARE:
988 mutex_lock(&cache_chain_mutex); 1013 mutex_lock(&cache_chain_mutex);
989 /* we need to do this right in the beginning since 1014 /*
1015 * We need to do this right in the beginning since
990 * alloc_arraycache's are going to use this list. 1016 * alloc_arraycache's are going to use this list.
991 * kmalloc_node allows us to add the slab to the right 1017 * kmalloc_node allows us to add the slab to the right
992 * kmem_list3 and not this cpu's kmem_list3 1018 * kmem_list3 and not this cpu's kmem_list3
993 */ 1019 */
994 1020
995 list_for_each_entry(cachep, &cache_chain, next) { 1021 list_for_each_entry(cachep, &cache_chain, next) {
996 /* setup the size64 kmemlist for cpu before we can 1022 /*
1023 * Set up the size64 kmemlist for cpu before we can
997 * begin anything. Make sure some other cpu on this 1024 * begin anything. Make sure some other cpu on this
998 * node has not already allocated this 1025 * node has not already allocated this
999 */ 1026 */
1000 if (!cachep->nodelists[node]) { 1027 if (!cachep->nodelists[node]) {
1001 if (!(l3 = kmalloc_node(memsize, 1028 l3 = kmalloc_node(memsize, GFP_KERNEL, node);
1002 GFP_KERNEL, node))) 1029 if (!l3)
1003 goto bad; 1030 goto bad;
1004 kmem_list3_init(l3); 1031 kmem_list3_init(l3);
1005 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 1032 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
@@ -1015,13 +1042,15 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
1015 1042
1016 spin_lock_irq(&cachep->nodelists[node]->list_lock); 1043 spin_lock_irq(&cachep->nodelists[node]->list_lock);
1017 cachep->nodelists[node]->free_limit = 1044 cachep->nodelists[node]->free_limit =
1018 (1 + nr_cpus_node(node)) * 1045 (1 + nr_cpus_node(node)) *
1019 cachep->batchcount + cachep->num; 1046 cachep->batchcount + cachep->num;
1020 spin_unlock_irq(&cachep->nodelists[node]->list_lock); 1047 spin_unlock_irq(&cachep->nodelists[node]->list_lock);
1021 } 1048 }
1022 1049
1023 /* Now we can go ahead with allocating the shared array's 1050 /*
1024 & array cache's */ 1051 * Now we can go ahead with allocating the shared arrays and
1052 * array caches
1053 */
1025 list_for_each_entry(cachep, &cache_chain, next) { 1054 list_for_each_entry(cachep, &cache_chain, next) {
1026 struct array_cache *nc; 1055 struct array_cache *nc;
1027 struct array_cache *shared; 1056 struct array_cache *shared;
@@ -1041,7 +1070,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
1041 if (!alien) 1070 if (!alien)
1042 goto bad; 1071 goto bad;
1043 cachep->array[cpu] = nc; 1072 cachep->array[cpu] = nc;
1044
1045 l3 = cachep->nodelists[node]; 1073 l3 = cachep->nodelists[node];
1046 BUG_ON(!l3); 1074 BUG_ON(!l3);
1047 1075
@@ -1061,7 +1089,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
1061 } 1089 }
1062#endif 1090#endif
1063 spin_unlock_irq(&l3->list_lock); 1091 spin_unlock_irq(&l3->list_lock);
1064
1065 kfree(shared); 1092 kfree(shared);
1066 free_alien_cache(alien); 1093 free_alien_cache(alien);
1067 } 1094 }
@@ -1083,7 +1110,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
1083 /* fall thru */ 1110 /* fall thru */
1084 case CPU_UP_CANCELED: 1111 case CPU_UP_CANCELED:
1085 mutex_lock(&cache_chain_mutex); 1112 mutex_lock(&cache_chain_mutex);
1086
1087 list_for_each_entry(cachep, &cache_chain, next) { 1113 list_for_each_entry(cachep, &cache_chain, next) {
1088 struct array_cache *nc; 1114 struct array_cache *nc;
1089 struct array_cache *shared; 1115 struct array_cache *shared;
@@ -1150,7 +1176,7 @@ free_array_cache:
1150#endif 1176#endif
1151 } 1177 }
1152 return NOTIFY_OK; 1178 return NOTIFY_OK;
1153 bad: 1179bad:
1154 mutex_unlock(&cache_chain_mutex); 1180 mutex_unlock(&cache_chain_mutex);
1155 return NOTIFY_BAD; 1181 return NOTIFY_BAD;
1156} 1182}
@@ -1160,7 +1186,8 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
1160/* 1186/*
1161 * swap the static kmem_list3 with kmalloced memory 1187 * swap the static kmem_list3 with kmalloced memory
1162 */ 1188 */
1163static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid) 1189static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
1190 int nodeid)
1164{ 1191{
1165 struct kmem_list3 *ptr; 1192 struct kmem_list3 *ptr;
1166 1193
@@ -1175,8 +1202,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int no
1175 local_irq_enable(); 1202 local_irq_enable();
1176} 1203}
1177 1204
1178/* Initialisation. 1205/*
1179 * Called after the gfp() functions have been enabled, and before smp_init(). 1206 * Initialisation. Called after the page allocator have been initialised and
1207 * before smp_init().
1180 */ 1208 */
1181void __init kmem_cache_init(void) 1209void __init kmem_cache_init(void)
1182{ 1210{
@@ -1201,9 +1229,9 @@ void __init kmem_cache_init(void)
1201 1229
1202 /* Bootstrap is tricky, because several objects are allocated 1230 /* Bootstrap is tricky, because several objects are allocated
1203 * from caches that do not exist yet: 1231 * from caches that do not exist yet:
1204 * 1) initialize the cache_cache cache: it contains the struct kmem_cache 1232 * 1) initialize the cache_cache cache: it contains the struct
1205 * structures of all caches, except cache_cache itself: cache_cache 1233 * kmem_cache structures of all caches, except cache_cache itself:
1206 * is statically allocated. 1234 * cache_cache is statically allocated.
1207 * Initially an __init data area is used for the head array and the 1235 * Initially an __init data area is used for the head array and the
1208 * kmem_list3 structures, it's replaced with a kmalloc allocated 1236 * kmem_list3 structures, it's replaced with a kmalloc allocated
1209 * array at the end of the bootstrap. 1237 * array at the end of the bootstrap.
@@ -1226,7 +1254,8 @@ void __init kmem_cache_init(void)
1226 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1254 cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1227 cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE]; 1255 cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
1228 1256
1229 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size()); 1257 cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
1258 cache_line_size());
1230 1259
1231 for (order = 0; order < MAX_ORDER; order++) { 1260 for (order = 0; order < MAX_ORDER; order++) {
1232 cache_estimate(order, cache_cache.buffer_size, 1261 cache_estimate(order, cache_cache.buffer_size,
@@ -1245,24 +1274,26 @@ void __init kmem_cache_init(void)
1245 sizes = malloc_sizes; 1274 sizes = malloc_sizes;
1246 names = cache_names; 1275 names = cache_names;
1247 1276
1248 /* Initialize the caches that provide memory for the array cache 1277 /*
1249 * and the kmem_list3 structures first. 1278 * Initialize the caches that provide memory for the array cache and the
1250 * Without this, further allocations will bug 1279 * kmem_list3 structures first. Without this, further allocations will
1280 * bug.
1251 */ 1281 */
1252 1282
1253 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, 1283 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1254 sizes[INDEX_AC].cs_size, 1284 sizes[INDEX_AC].cs_size,
1255 ARCH_KMALLOC_MINALIGN, 1285 ARCH_KMALLOC_MINALIGN,
1256 (ARCH_KMALLOC_FLAGS | 1286 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1257 SLAB_PANIC), NULL, NULL); 1287 NULL, NULL);
1258 1288
1259 if (INDEX_AC != INDEX_L3) 1289 if (INDEX_AC != INDEX_L3) {
1260 sizes[INDEX_L3].cs_cachep = 1290 sizes[INDEX_L3].cs_cachep =
1261 kmem_cache_create(names[INDEX_L3].name, 1291 kmem_cache_create(names[INDEX_L3].name,
1262 sizes[INDEX_L3].cs_size, 1292 sizes[INDEX_L3].cs_size,
1263 ARCH_KMALLOC_MINALIGN, 1293 ARCH_KMALLOC_MINALIGN,
1264 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, 1294 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1265 NULL); 1295 NULL, NULL);
1296 }
1266 1297
1267 while (sizes->cs_size != ULONG_MAX) { 1298 while (sizes->cs_size != ULONG_MAX) {
1268 /* 1299 /*
@@ -1272,13 +1303,13 @@ void __init kmem_cache_init(void)
1272 * Note for systems short on memory removing the alignment will 1303 * Note for systems short on memory removing the alignment will
1273 * allow tighter packing of the smaller caches. 1304 * allow tighter packing of the smaller caches.
1274 */ 1305 */
1275 if (!sizes->cs_cachep) 1306 if (!sizes->cs_cachep) {
1276 sizes->cs_cachep = kmem_cache_create(names->name, 1307 sizes->cs_cachep = kmem_cache_create(names->name,
1277 sizes->cs_size, 1308 sizes->cs_size,
1278 ARCH_KMALLOC_MINALIGN, 1309 ARCH_KMALLOC_MINALIGN,
1279 (ARCH_KMALLOC_FLAGS 1310 ARCH_KMALLOC_FLAGS|SLAB_PANIC,
1280 | SLAB_PANIC), 1311 NULL, NULL);
1281 NULL, NULL); 1312 }
1282 1313
1283 /* Inc off-slab bufctl limit until the ceiling is hit. */ 1314 /* Inc off-slab bufctl limit until the ceiling is hit. */
1284 if (!(OFF_SLAB(sizes->cs_cachep))) { 1315 if (!(OFF_SLAB(sizes->cs_cachep))) {
@@ -1287,13 +1318,11 @@ void __init kmem_cache_init(void)
1287 } 1318 }
1288 1319
1289 sizes->cs_dmacachep = kmem_cache_create(names->name_dma, 1320 sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
1290 sizes->cs_size, 1321 sizes->cs_size,
1291 ARCH_KMALLOC_MINALIGN, 1322 ARCH_KMALLOC_MINALIGN,
1292 (ARCH_KMALLOC_FLAGS | 1323 ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
1293 SLAB_CACHE_DMA | 1324 SLAB_PANIC,
1294 SLAB_PANIC), NULL, 1325 NULL, NULL);
1295 NULL);
1296
1297 sizes++; 1326 sizes++;
1298 names++; 1327 names++;
1299 } 1328 }
@@ -1345,20 +1374,22 @@ void __init kmem_cache_init(void)
1345 struct kmem_cache *cachep; 1374 struct kmem_cache *cachep;
1346 mutex_lock(&cache_chain_mutex); 1375 mutex_lock(&cache_chain_mutex);
1347 list_for_each_entry(cachep, &cache_chain, next) 1376 list_for_each_entry(cachep, &cache_chain, next)
1348 enable_cpucache(cachep); 1377 enable_cpucache(cachep);
1349 mutex_unlock(&cache_chain_mutex); 1378 mutex_unlock(&cache_chain_mutex);
1350 } 1379 }
1351 1380
1352 /* Done! */ 1381 /* Done! */
1353 g_cpucache_up = FULL; 1382 g_cpucache_up = FULL;
1354 1383
1355 /* Register a cpu startup notifier callback 1384 /*
1356 * that initializes cpu_cache_get for all new cpus 1385 * Register a cpu startup notifier callback that initializes
1386 * cpu_cache_get for all new cpus
1357 */ 1387 */
1358 register_cpu_notifier(&cpucache_notifier); 1388 register_cpu_notifier(&cpucache_notifier);
1359 1389
1360 /* The reap timers are started later, with a module init call: 1390 /*
1361 * That part of the kernel is not yet operational. 1391 * The reap timers are started later, with a module init call: That part
1392 * of the kernel is not yet operational.
1362 */ 1393 */
1363} 1394}
1364 1395
@@ -1366,16 +1397,13 @@ static int __init cpucache_init(void)
1366{ 1397{
1367 int cpu; 1398 int cpu;
1368 1399
1369 /* 1400 /*
1370 * Register the timers that return unneeded 1401 * Register the timers that return unneeded pages to the page allocator
1371 * pages to gfp.
1372 */ 1402 */
1373 for_each_online_cpu(cpu) 1403 for_each_online_cpu(cpu)
1374 start_cpu_timer(cpu); 1404 start_cpu_timer(cpu);
1375
1376 return 0; 1405 return 0;
1377} 1406}
1378
1379__initcall(cpucache_init); 1407__initcall(cpucache_init);
1380 1408
1381/* 1409/*
@@ -1402,7 +1430,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1402 atomic_add(i, &slab_reclaim_pages); 1430 atomic_add(i, &slab_reclaim_pages);
1403 add_page_state(nr_slab, i); 1431 add_page_state(nr_slab, i);
1404 while (i--) { 1432 while (i--) {
1405 SetPageSlab(page); 1433 __SetPageSlab(page);
1406 page++; 1434 page++;
1407 } 1435 }
1408 return addr; 1436 return addr;
@@ -1418,8 +1446,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
1418 const unsigned long nr_freed = i; 1446 const unsigned long nr_freed = i;
1419 1447
1420 while (i--) { 1448 while (i--) {
1421 if (!TestClearPageSlab(page)) 1449 BUG_ON(!PageSlab(page));
1422 BUG(); 1450 __ClearPageSlab(page);
1423 page++; 1451 page++;
1424 } 1452 }
1425 sub_page_state(nr_slab, nr_freed); 1453 sub_page_state(nr_slab, nr_freed);
@@ -1489,9 +1517,8 @@ static void dump_line(char *data, int offset, int limit)
1489{ 1517{
1490 int i; 1518 int i;
1491 printk(KERN_ERR "%03x:", offset); 1519 printk(KERN_ERR "%03x:", offset);
1492 for (i = 0; i < limit; i++) { 1520 for (i = 0; i < limit; i++)
1493 printk(" %02x", (unsigned char)data[offset + i]); 1521 printk(" %02x", (unsigned char)data[offset + i]);
1494 }
1495 printk("\n"); 1522 printk("\n");
1496} 1523}
1497#endif 1524#endif
@@ -1505,15 +1532,15 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
1505 1532
1506 if (cachep->flags & SLAB_RED_ZONE) { 1533 if (cachep->flags & SLAB_RED_ZONE) {
1507 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", 1534 printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
1508 *dbg_redzone1(cachep, objp), 1535 *dbg_redzone1(cachep, objp),
1509 *dbg_redzone2(cachep, objp)); 1536 *dbg_redzone2(cachep, objp));
1510 } 1537 }
1511 1538
1512 if (cachep->flags & SLAB_STORE_USER) { 1539 if (cachep->flags & SLAB_STORE_USER) {
1513 printk(KERN_ERR "Last user: [<%p>]", 1540 printk(KERN_ERR "Last user: [<%p>]",
1514 *dbg_userword(cachep, objp)); 1541 *dbg_userword(cachep, objp));
1515 print_symbol("(%s)", 1542 print_symbol("(%s)",
1516 (unsigned long)*dbg_userword(cachep, objp)); 1543 (unsigned long)*dbg_userword(cachep, objp));
1517 printk("\n"); 1544 printk("\n");
1518 } 1545 }
1519 realobj = (char *)objp + obj_offset(cachep); 1546 realobj = (char *)objp + obj_offset(cachep);
@@ -1546,8 +1573,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1546 /* Print header */ 1573 /* Print header */
1547 if (lines == 0) { 1574 if (lines == 0) {
1548 printk(KERN_ERR 1575 printk(KERN_ERR
1549 "Slab corruption: start=%p, len=%d\n", 1576 "Slab corruption: start=%p, len=%d\n",
1550 realobj, size); 1577 realobj, size);
1551 print_objinfo(cachep, objp, 0); 1578 print_objinfo(cachep, objp, 0);
1552 } 1579 }
1553 /* Hexdump the affected line */ 1580 /* Hexdump the affected line */
@@ -1568,18 +1595,18 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1568 * exist: 1595 * exist:
1569 */ 1596 */
1570 struct slab *slabp = virt_to_slab(objp); 1597 struct slab *slabp = virt_to_slab(objp);
1571 int objnr; 1598 unsigned int objnr;
1572 1599
1573 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 1600 objnr = obj_to_index(cachep, slabp, objp);
1574 if (objnr) { 1601 if (objnr) {
1575 objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size; 1602 objp = index_to_obj(cachep, slabp, objnr - 1);
1576 realobj = (char *)objp + obj_offset(cachep); 1603 realobj = (char *)objp + obj_offset(cachep);
1577 printk(KERN_ERR "Prev obj: start=%p, len=%d\n", 1604 printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1578 realobj, size); 1605 realobj, size);
1579 print_objinfo(cachep, objp, 2); 1606 print_objinfo(cachep, objp, 2);
1580 } 1607 }
1581 if (objnr + 1 < cachep->num) { 1608 if (objnr + 1 < cachep->num) {
1582 objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size; 1609 objp = index_to_obj(cachep, slabp, objnr + 1);
1583 realobj = (char *)objp + obj_offset(cachep); 1610 realobj = (char *)objp + obj_offset(cachep);
1584 printk(KERN_ERR "Next obj: start=%p, len=%d\n", 1611 printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1585 realobj, size); 1612 realobj, size);
@@ -1591,22 +1618,25 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
1591 1618
1592#if DEBUG 1619#if DEBUG
1593/** 1620/**
1594 * slab_destroy_objs - call the registered destructor for each object in 1621 * slab_destroy_objs - destroy a slab and its objects
1595 * a slab that is to be destroyed. 1622 * @cachep: cache pointer being destroyed
1623 * @slabp: slab pointer being destroyed
1624 *
1625 * Call the registered destructor for each object in a slab that is being
1626 * destroyed.
1596 */ 1627 */
1597static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) 1628static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1598{ 1629{
1599 int i; 1630 int i;
1600 for (i = 0; i < cachep->num; i++) { 1631 for (i = 0; i < cachep->num; i++) {
1601 void *objp = slabp->s_mem + cachep->buffer_size * i; 1632 void *objp = index_to_obj(cachep, slabp, i);
1602 1633
1603 if (cachep->flags & SLAB_POISON) { 1634 if (cachep->flags & SLAB_POISON) {
1604#ifdef CONFIG_DEBUG_PAGEALLOC 1635#ifdef CONFIG_DEBUG_PAGEALLOC
1605 if ((cachep->buffer_size % PAGE_SIZE) == 0 1636 if (cachep->buffer_size % PAGE_SIZE == 0 &&
1606 && OFF_SLAB(cachep)) 1637 OFF_SLAB(cachep))
1607 kernel_map_pages(virt_to_page(objp), 1638 kernel_map_pages(virt_to_page(objp),
1608 cachep->buffer_size / PAGE_SIZE, 1639 cachep->buffer_size / PAGE_SIZE, 1);
1609 1);
1610 else 1640 else
1611 check_poison_obj(cachep, objp); 1641 check_poison_obj(cachep, objp);
1612#else 1642#else
@@ -1631,7 +1661,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1631 if (cachep->dtor) { 1661 if (cachep->dtor) {
1632 int i; 1662 int i;
1633 for (i = 0; i < cachep->num; i++) { 1663 for (i = 0; i < cachep->num; i++) {
1634 void *objp = slabp->s_mem + cachep->buffer_size * i; 1664 void *objp = index_to_obj(cachep, slabp, i);
1635 (cachep->dtor) (objp, cachep, 0); 1665 (cachep->dtor) (objp, cachep, 0);
1636 } 1666 }
1637 } 1667 }
@@ -1639,9 +1669,13 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
1639#endif 1669#endif
1640 1670
1641/** 1671/**
1672 * slab_destroy - destroy and release all objects in a slab
1673 * @cachep: cache pointer being destroyed
1674 * @slabp: slab pointer being destroyed
1675 *
1642 * Destroy all the objs in a slab, and release the mem back to the system. 1676 * Destroy all the objs in a slab, and release the mem back to the system.
1643 * Before calling the slab must have been unlinked from the cache. 1677 * Before calling the slab must have been unlinked from the cache. The
1644 * The cache-lock is not held/needed. 1678 * cache-lock is not held/needed.
1645 */ 1679 */
1646static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) 1680static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1647{ 1681{
@@ -1662,8 +1696,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
1662 } 1696 }
1663} 1697}
1664 1698
1665/* For setting up all the kmem_list3s for cache whose buffer_size is same 1699/*
1666 as size of kmem_list3. */ 1700 * For setting up all the kmem_list3s for cache whose buffer_size is same as
1701 * size of kmem_list3.
1702 */
1667static void set_up_list3s(struct kmem_cache *cachep, int index) 1703static void set_up_list3s(struct kmem_cache *cachep, int index)
1668{ 1704{
1669 int node; 1705 int node;
@@ -1689,13 +1725,13 @@ static void set_up_list3s(struct kmem_cache *cachep, int index)
1689 * high order pages for slabs. When the gfp() functions are more friendly 1725 * high order pages for slabs. When the gfp() functions are more friendly
1690 * towards high-order requests, this should be changed. 1726 * towards high-order requests, this should be changed.
1691 */ 1727 */
1692static inline size_t calculate_slab_order(struct kmem_cache *cachep, 1728static size_t calculate_slab_order(struct kmem_cache *cachep,
1693 size_t size, size_t align, unsigned long flags) 1729 size_t size, size_t align, unsigned long flags)
1694{ 1730{
1695 size_t left_over = 0; 1731 size_t left_over = 0;
1696 int gfporder; 1732 int gfporder;
1697 1733
1698 for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) { 1734 for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) {
1699 unsigned int num; 1735 unsigned int num;
1700 size_t remainder; 1736 size_t remainder;
1701 1737
@@ -1730,12 +1766,66 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
1730 /* 1766 /*
1731 * Acceptable internal fragmentation? 1767 * Acceptable internal fragmentation?
1732 */ 1768 */
1733 if ((left_over * 8) <= (PAGE_SIZE << gfporder)) 1769 if (left_over * 8 <= (PAGE_SIZE << gfporder))
1734 break; 1770 break;
1735 } 1771 }
1736 return left_over; 1772 return left_over;
1737} 1773}
1738 1774
1775static void setup_cpu_cache(struct kmem_cache *cachep)
1776{
1777 if (g_cpucache_up == FULL) {
1778 enable_cpucache(cachep);
1779 return;
1780 }
1781 if (g_cpucache_up == NONE) {
1782 /*
1783 * Note: the first kmem_cache_create must create the cache
1784 * that's used by kmalloc(24), otherwise the creation of
1785 * further caches will BUG().
1786 */
1787 cachep->array[smp_processor_id()] = &initarray_generic.cache;
1788
1789 /*
1790 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
1791 * the first cache, then we need to set up all its list3s,
1792 * otherwise the creation of further caches will BUG().
1793 */
1794 set_up_list3s(cachep, SIZE_AC);
1795 if (INDEX_AC == INDEX_L3)
1796 g_cpucache_up = PARTIAL_L3;
1797 else
1798 g_cpucache_up = PARTIAL_AC;
1799 } else {
1800 cachep->array[smp_processor_id()] =
1801 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1802
1803 if (g_cpucache_up == PARTIAL_AC) {
1804 set_up_list3s(cachep, SIZE_L3);
1805 g_cpucache_up = PARTIAL_L3;
1806 } else {
1807 int node;
1808 for_each_online_node(node) {
1809 cachep->nodelists[node] =
1810 kmalloc_node(sizeof(struct kmem_list3),
1811 GFP_KERNEL, node);
1812 BUG_ON(!cachep->nodelists[node]);
1813 kmem_list3_init(cachep->nodelists[node]);
1814 }
1815 }
1816 }
1817 cachep->nodelists[numa_node_id()]->next_reap =
1818 jiffies + REAPTIMEOUT_LIST3 +
1819 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
1820
1821 cpu_cache_get(cachep)->avail = 0;
1822 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
1823 cpu_cache_get(cachep)->batchcount = 1;
1824 cpu_cache_get(cachep)->touched = 0;
1825 cachep->batchcount = 1;
1826 cachep->limit = BOOT_CPUCACHE_ENTRIES;
1827}
1828
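setup_cpu_cache() pulls the bootstrap special-casing out of kmem_cache_create(). Read as a small state machine over g_cpucache_up (an assumed summary of the code above, not an authoritative description):

        /*
         * NONE        first cache: static initarray_generic + static list3s
         * PARTIAL_AC  kmalloc() works for the array cache; list3s still static
         * PARTIAL_L3  kmalloc_node() works, so per-node kmem_list3 is allocated
         * FULL        enable_cpucache() tunes limits the normal way
         */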
1739/** 1829/**
1740 * kmem_cache_create - Create a cache. 1830 * kmem_cache_create - Create a cache.
1741 * @name: A string which is used in /proc/slabinfo to identify this cache. 1831 * @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -1751,9 +1841,8 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
1751 * and the @dtor is run before the pages are handed back. 1841 * and the @dtor is run before the pages are handed back.
1752 * 1842 *
1753 * @name must be valid until the cache is destroyed. This implies that 1843 * @name must be valid until the cache is destroyed. This implies that
1754 * the module calling this has to destroy the cache before getting 1844 * the module calling this has to destroy the cache before getting unloaded.
1755 * unloaded. 1845 *
1756 *
1757 * The flags are 1846 * The flags are
1758 * 1847 *
1759 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5) 1848 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
@@ -1762,16 +1851,14 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
1762 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check 1851 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
1763 * for buffer overruns. 1852 * for buffer overruns.
1764 * 1853 *
1765 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
1766 * memory pressure.
1767 *
1768 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware 1854 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
1769 * cacheline. This can be beneficial if you're counting cycles as closely 1855 * cacheline. This can be beneficial if you're counting cycles as closely
1770 * as davem. 1856 * as davem.
1771 */ 1857 */
1772struct kmem_cache * 1858struct kmem_cache *
1773kmem_cache_create (const char *name, size_t size, size_t align, 1859kmem_cache_create (const char *name, size_t size, size_t align,
1774 unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long), 1860 unsigned long flags,
1861 void (*ctor)(void*, struct kmem_cache *, unsigned long),
1775 void (*dtor)(void*, struct kmem_cache *, unsigned long)) 1862 void (*dtor)(void*, struct kmem_cache *, unsigned long))
1776{ 1863{
1777 size_t left_over, slab_size, ralign; 1864 size_t left_over, slab_size, ralign;
@@ -1781,12 +1868,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1781 /* 1868 /*
1782 * Sanity checks... these are all serious usage bugs. 1869 * Sanity checks... these are all serious usage bugs.
1783 */ 1870 */
1784 if ((!name) || 1871 if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
1785 in_interrupt() ||
1786 (size < BYTES_PER_WORD) ||
1787 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { 1872 (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
1788 printk(KERN_ERR "%s: Early error in slab %s\n", 1873 printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
1789 __FUNCTION__, name); 1874 name);
1790 BUG(); 1875 BUG();
1791 } 1876 }
1792 1877
@@ -1840,8 +1925,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1840 * above the next power of two: caches with object sizes just above a 1925 * above the next power of two: caches with object sizes just above a
1841 * power of two have a significant amount of internal fragmentation. 1926 * power of two have a significant amount of internal fragmentation.
1842 */ 1927 */
1843 if ((size < 4096 1928 if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
1844 || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
1845 flags |= SLAB_RED_ZONE | SLAB_STORE_USER; 1929 flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
1846 if (!(flags & SLAB_DESTROY_BY_RCU)) 1930 if (!(flags & SLAB_DESTROY_BY_RCU))
1847 flags |= SLAB_POISON; 1931 flags |= SLAB_POISON;
@@ -1853,13 +1937,14 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1853 BUG_ON(dtor); 1937 BUG_ON(dtor);
1854 1938
1855 /* 1939 /*
1856 * Always checks flags, a caller might be expecting debug 1940 * Always checks flags, a caller might be expecting debug support which
1857 * support which isn't available. 1941 * isn't available.
1858 */ 1942 */
1859 if (flags & ~CREATE_MASK) 1943 if (flags & ~CREATE_MASK)
1860 BUG(); 1944 BUG();
1861 1945
1862 /* Check that size is in terms of words. This is needed to avoid 1946 /*
1947 * Check that size is in terms of words. This is needed to avoid
1863 * unaligned accesses for some archs when redzoning is used, and makes 1948 * unaligned accesses for some archs when redzoning is used, and makes
1864 * sure any on-slab bufctl's are also correctly aligned. 1949 * sure any on-slab bufctl's are also correctly aligned.
1865 */ 1950 */
@@ -1868,12 +1953,14 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1868 size &= ~(BYTES_PER_WORD - 1); 1953 size &= ~(BYTES_PER_WORD - 1);
1869 } 1954 }
1870 1955
1871 /* calculate out the final buffer alignment: */ 1956 /* calculate the final buffer alignment: */
1957
1872 /* 1) arch recommendation: can be overridden for debug */ 1958 /* 1) arch recommendation: can be overridden for debug */
1873 if (flags & SLAB_HWCACHE_ALIGN) { 1959 if (flags & SLAB_HWCACHE_ALIGN) {
1874 /* Default alignment: as specified by the arch code. 1960 /*
1875 * Except if an object is really small, then squeeze multiple 1961 * Default alignment: as specified by the arch code. Except if
1876 * objects into one cacheline. 1962 * an object is really small, then squeeze multiple objects into
1963 * one cacheline.
1877 */ 1964 */
1878 ralign = cache_line_size(); 1965 ralign = cache_line_size();
1879 while (size <= ralign / 2) 1966 while (size <= ralign / 2)
@@ -1893,7 +1980,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1893 if (ralign > BYTES_PER_WORD) 1980 if (ralign > BYTES_PER_WORD)
1894 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); 1981 flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
1895 } 1982 }
1896 /* 4) Store it. Note that the debug code below can reduce 1983 /*
1984 * 4) Store it. Note that the debug code below can reduce
1897 * the alignment to BYTES_PER_WORD. 1985 * the alignment to BYTES_PER_WORD.
1898 */ 1986 */
1899 align = ralign; 1987 align = ralign;
@@ -1978,7 +2066,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1978 cachep->gfpflags = 0; 2066 cachep->gfpflags = 0;
1979 if (flags & SLAB_CACHE_DMA) 2067 if (flags & SLAB_CACHE_DMA)
1980 cachep->gfpflags |= GFP_DMA; 2068 cachep->gfpflags |= GFP_DMA;
1981 spin_lock_init(&cachep->spinlock);
1982 cachep->buffer_size = size; 2069 cachep->buffer_size = size;
1983 2070
1984 if (flags & CFLGS_OFF_SLAB) 2071 if (flags & CFLGS_OFF_SLAB)
@@ -1988,64 +2075,11 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1988 cachep->name = name; 2075 cachep->name = name;
1989 2076
1990 2077
1991 if (g_cpucache_up == FULL) { 2078 setup_cpu_cache(cachep);
1992 enable_cpucache(cachep);
1993 } else {
1994 if (g_cpucache_up == NONE) {
1995 /* Note: the first kmem_cache_create must create
1996 * the cache that's used by kmalloc(24), otherwise
1997 * the creation of further caches will BUG().
1998 */
1999 cachep->array[smp_processor_id()] =
2000 &initarray_generic.cache;
2001
2002 /* If the cache that's used by
2003 * kmalloc(sizeof(kmem_list3)) is the first cache,
2004 * then we need to set up all its list3s, otherwise
2005 * the creation of further caches will BUG().
2006 */
2007 set_up_list3s(cachep, SIZE_AC);
2008 if (INDEX_AC == INDEX_L3)
2009 g_cpucache_up = PARTIAL_L3;
2010 else
2011 g_cpucache_up = PARTIAL_AC;
2012 } else {
2013 cachep->array[smp_processor_id()] =
2014 kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
2015
2016 if (g_cpucache_up == PARTIAL_AC) {
2017 set_up_list3s(cachep, SIZE_L3);
2018 g_cpucache_up = PARTIAL_L3;
2019 } else {
2020 int node;
2021 for_each_online_node(node) {
2022
2023 cachep->nodelists[node] =
2024 kmalloc_node(sizeof
2025 (struct kmem_list3),
2026 GFP_KERNEL, node);
2027 BUG_ON(!cachep->nodelists[node]);
2028 kmem_list3_init(cachep->
2029 nodelists[node]);
2030 }
2031 }
2032 }
2033 cachep->nodelists[numa_node_id()]->next_reap =
2034 jiffies + REAPTIMEOUT_LIST3 +
2035 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
2036
2037 BUG_ON(!cpu_cache_get(cachep));
2038 cpu_cache_get(cachep)->avail = 0;
2039 cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
2040 cpu_cache_get(cachep)->batchcount = 1;
2041 cpu_cache_get(cachep)->touched = 0;
2042 cachep->batchcount = 1;
2043 cachep->limit = BOOT_CPUCACHE_ENTRIES;
2044 }
2045 2079
2046 /* cache setup completed, link it into the list */ 2080 /* cache setup completed, link it into the list */
2047 list_add(&cachep->next, &cache_chain); 2081 list_add(&cachep->next, &cache_chain);
2048 oops: 2082oops:
2049 if (!cachep && (flags & SLAB_PANIC)) 2083 if (!cachep && (flags & SLAB_PANIC))
2050 panic("kmem_cache_create(): failed to create slab `%s'\n", 2084 panic("kmem_cache_create(): failed to create slab `%s'\n",
2051 name); 2085 name);
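
For readers less familiar with this interface, here is a minimal sketch of how the API documented in this hunk is typically consumed. The structure, cache name and constructor below are illustrative only and are not part of this patch; the signature matches the one shown above, where a constructor without a destructor is allowed but a destructor without a constructor trips the sanity BUG().

#include <linux/slab.h>
#include <linux/init.h>
#include <linux/errno.h>

/* hypothetical object managed by a dedicated cache */
struct foo {
	int state;
};

static struct kmem_cache *foo_cachep;

/* runs for every object when a fresh slab is populated */
static void foo_ctor(void *obj, struct kmem_cache *cachep, unsigned long flags)
{
	struct foo *f = obj;

	if (flags & SLAB_CTOR_CONSTRUCTOR)
		f->state = 0;
}

static int __init foo_cache_init(void)
{
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_HWCACHE_ALIGN,
				       foo_ctor, NULL);
	if (!foo_cachep)
		return -ENOMEM;
	return 0;
}

Objects then come from kmem_cache_alloc(foo_cachep, GFP_KERNEL) and go back via kmem_cache_free(); the owning module must call kmem_cache_destroy(foo_cachep) before unloading, which is the @name lifetime rule restated in the comment above.
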
@@ -2089,30 +2123,13 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
2089#define check_spinlock_acquired_node(x, y) do { } while(0) 2123#define check_spinlock_acquired_node(x, y) do { } while(0)
2090#endif 2124#endif
2091 2125
2092/* 2126static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
2093 * Waits for all CPUs to execute func(). 2127 struct array_cache *ac,
2094 */ 2128 int force, int node);
2095static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
2096{
2097 check_irq_on();
2098 preempt_disable();
2099
2100 local_irq_disable();
2101 func(arg);
2102 local_irq_enable();
2103
2104 if (smp_call_function(func, arg, 1, 1))
2105 BUG();
2106
2107 preempt_enable();
2108}
2109
2110static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
2111 int force, int node);
2112 2129
2113static void do_drain(void *arg) 2130static void do_drain(void *arg)
2114{ 2131{
2115 struct kmem_cache *cachep = (struct kmem_cache *) arg; 2132 struct kmem_cache *cachep = arg;
2116 struct array_cache *ac; 2133 struct array_cache *ac;
2117 int node = numa_node_id(); 2134 int node = numa_node_id();
2118 2135
@@ -2129,14 +2146,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
2129 struct kmem_list3 *l3; 2146 struct kmem_list3 *l3;
2130 int node; 2147 int node;
2131 2148
2132 smp_call_function_all_cpus(do_drain, cachep); 2149 on_each_cpu(do_drain, cachep, 1, 1);
2133 check_irq_on(); 2150 check_irq_on();
2134 for_each_online_node(node) { 2151 for_each_online_node(node) {
2135 l3 = cachep->nodelists[node]; 2152 l3 = cachep->nodelists[node];
2136 if (l3) { 2153 if (l3) {
2137 spin_lock_irq(&l3->list_lock); 2154 drain_array(cachep, l3, l3->shared, 1, node);
2138 drain_array_locked(cachep, l3->shared, 1, node);
2139 spin_unlock_irq(&l3->list_lock);
2140 if (l3->alien) 2155 if (l3->alien)
2141 drain_alien_cache(cachep, l3->alien); 2156 drain_alien_cache(cachep, l3->alien);
2142 } 2157 }
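
The two hunks above drop slab's private smp_call_function_all_cpus() in favour of the stock on_each_cpu() helper, which provides the same behaviour: the callback runs on every online CPU, including the caller, with interrupts off around the call, and the trailing (1, 1) arguments request the retry-and-wait semantics the removed helper implemented by hand. A tiny self-contained sketch with made-up names:

#include <linux/smp.h>
#include <asm/atomic.h>

static atomic_t visits = ATOMIC_INIT(0);

static void count_me(void *unused)
{
	atomic_inc(&visits);		/* executed once per online CPU */
}

static void run_everywhere(void)
{
	/* same (retry = 1, wait = 1) arguments the slab code passes */
	on_each_cpu(count_me, NULL, 1, 1);
}
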
@@ -2260,16 +2275,15 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
2260 2275
2261 /* NUMA: free the list3 structures */ 2276 /* NUMA: free the list3 structures */
2262 for_each_online_node(i) { 2277 for_each_online_node(i) {
2263 if ((l3 = cachep->nodelists[i])) { 2278 l3 = cachep->nodelists[i];
2279 if (l3) {
2264 kfree(l3->shared); 2280 kfree(l3->shared);
2265 free_alien_cache(l3->alien); 2281 free_alien_cache(l3->alien);
2266 kfree(l3); 2282 kfree(l3);
2267 } 2283 }
2268 } 2284 }
2269 kmem_cache_free(&cache_cache, cachep); 2285 kmem_cache_free(&cache_cache, cachep);
2270
2271 unlock_cpu_hotplug(); 2286 unlock_cpu_hotplug();
2272
2273 return 0; 2287 return 0;
2274} 2288}
2275EXPORT_SYMBOL(kmem_cache_destroy); 2289EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2292,7 +2306,6 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
2292 slabp->inuse = 0; 2306 slabp->inuse = 0;
2293 slabp->colouroff = colour_off; 2307 slabp->colouroff = colour_off;
2294 slabp->s_mem = objp + colour_off; 2308 slabp->s_mem = objp + colour_off;
2295
2296 return slabp; 2309 return slabp;
2297} 2310}
2298 2311
@@ -2307,7 +2320,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
2307 int i; 2320 int i;
2308 2321
2309 for (i = 0; i < cachep->num; i++) { 2322 for (i = 0; i < cachep->num; i++) {
2310 void *objp = slabp->s_mem + cachep->buffer_size * i; 2323 void *objp = index_to_obj(cachep, slabp, i);
2311#if DEBUG 2324#if DEBUG
2312 /* need to poison the objs? */ 2325 /* need to poison the objs? */
2313 if (cachep->flags & SLAB_POISON) 2326 if (cachep->flags & SLAB_POISON)
@@ -2320,9 +2333,9 @@ static void cache_init_objs(struct kmem_cache *cachep,
2320 *dbg_redzone2(cachep, objp) = RED_INACTIVE; 2333 *dbg_redzone2(cachep, objp) = RED_INACTIVE;
2321 } 2334 }
2322 /* 2335 /*
2323 * Constructors are not allowed to allocate memory from 2336 * Constructors are not allowed to allocate memory from the same
2324 * the same cache which they are a constructor for. 2337 * cache which they are a constructor for. Otherwise, deadlock.
2325 * Otherwise, deadlock. They must also be threaded. 2338 * They must also be threaded.
2326 */ 2339 */
2327 if (cachep->ctor && !(cachep->flags & SLAB_POISON)) 2340 if (cachep->ctor && !(cachep->flags & SLAB_POISON))
2328 cachep->ctor(objp + obj_offset(cachep), cachep, 2341 cachep->ctor(objp + obj_offset(cachep), cachep,
@@ -2336,8 +2349,8 @@ static void cache_init_objs(struct kmem_cache *cachep,
2336 slab_error(cachep, "constructor overwrote the" 2349 slab_error(cachep, "constructor overwrote the"
2337 " start of an object"); 2350 " start of an object");
2338 } 2351 }
2339 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep) 2352 if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
2340 && cachep->flags & SLAB_POISON) 2353 OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
2341 kernel_map_pages(virt_to_page(objp), 2354 kernel_map_pages(virt_to_page(objp),
2342 cachep->buffer_size / PAGE_SIZE, 0); 2355 cachep->buffer_size / PAGE_SIZE, 0);
2343#else 2356#else
@@ -2352,18 +2365,16 @@ static void cache_init_objs(struct kmem_cache *cachep,
2352 2365
2353static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) 2366static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
2354{ 2367{
2355 if (flags & SLAB_DMA) { 2368 if (flags & SLAB_DMA)
2356 if (!(cachep->gfpflags & GFP_DMA)) 2369 BUG_ON(!(cachep->gfpflags & GFP_DMA));
2357 BUG(); 2370 else
2358 } else { 2371 BUG_ON(cachep->gfpflags & GFP_DMA);
2359 if (cachep->gfpflags & GFP_DMA)
2360 BUG();
2361 }
2362} 2372}
2363 2373
2364static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid) 2374static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
2375 int nodeid)
2365{ 2376{
2366 void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size); 2377 void *objp = index_to_obj(cachep, slabp, slabp->free);
2367 kmem_bufctl_t next; 2378 kmem_bufctl_t next;
2368 2379
2369 slabp->inuse++; 2380 slabp->inuse++;
@@ -2377,10 +2388,10 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nod
2377 return objp; 2388 return objp;
2378} 2389}
2379 2390
2380static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp, 2391static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2381 int nodeid) 2392 void *objp, int nodeid)
2382{ 2393{
2383 unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size; 2394 unsigned int objnr = obj_to_index(cachep, slabp, objp);
2384 2395
2385#if DEBUG 2396#if DEBUG
2386 /* Verify that the slab belongs to the intended node */ 2397 /* Verify that the slab belongs to the intended node */
@@ -2388,7 +2399,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob
2388 2399
2389 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2400 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2390 printk(KERN_ERR "slab: double free detected in cache " 2401 printk(KERN_ERR "slab: double free detected in cache "
2391 "'%s', objp %p\n", cachep->name, objp); 2402 "'%s', objp %p\n", cachep->name, objp);
2392 BUG(); 2403 BUG();
2393 } 2404 }
2394#endif 2405#endif
@@ -2397,14 +2408,18 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob
2397 slabp->inuse--; 2408 slabp->inuse--;
2398} 2409}
2399 2410
2400static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp) 2411static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp,
2412 void *objp)
2401{ 2413{
2402 int i; 2414 int i;
2403 struct page *page; 2415 struct page *page;
2404 2416
2405 /* Nasty!!!!!! I hope this is OK. */ 2417 /* Nasty!!!!!! I hope this is OK. */
2406 i = 1 << cachep->gfporder;
2407 page = virt_to_page(objp); 2418 page = virt_to_page(objp);
2419
2420 i = 1;
2421 if (likely(!PageCompound(page)))
2422 i <<= cachep->gfporder;
2408 do { 2423 do {
2409 page_set_cache(page, cachep); 2424 page_set_cache(page, cachep);
2410 page_set_slab(page, slabp); 2425 page_set_slab(page, slabp);
@@ -2425,8 +2440,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2425 unsigned long ctor_flags; 2440 unsigned long ctor_flags;
2426 struct kmem_list3 *l3; 2441 struct kmem_list3 *l3;
2427 2442
2428 /* Be lazy and only check for valid flags here, 2443 /*
2429 * keeping it out of the critical path in kmem_cache_alloc(). 2444 * Be lazy and only check for valid flags here, keeping it out of the
2445 * critical path in kmem_cache_alloc().
2430 */ 2446 */
2431 if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) 2447 if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW))
2432 BUG(); 2448 BUG();
@@ -2467,14 +2483,17 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2467 */ 2483 */
2468 kmem_flagcheck(cachep, flags); 2484 kmem_flagcheck(cachep, flags);
2469 2485
2470 /* Get mem for the objs. 2486 /*
2471 * Attempt to allocate a physical page from 'nodeid', 2487 * Get mem for the objs. Attempt to allocate a physical page from
2488 * 'nodeid'.
2472 */ 2489 */
2473 if (!(objp = kmem_getpages(cachep, flags, nodeid))) 2490 objp = kmem_getpages(cachep, flags, nodeid);
2491 if (!objp)
2474 goto failed; 2492 goto failed;
2475 2493
2476 /* Get slab management. */ 2494 /* Get slab management. */
2477 if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) 2495 slabp = alloc_slabmgmt(cachep, objp, offset, local_flags);
2496 if (!slabp)
2478 goto opps1; 2497 goto opps1;
2479 2498
2480 slabp->nodeid = nodeid; 2499 slabp->nodeid = nodeid;
@@ -2493,9 +2512,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
2493 l3->free_objects += cachep->num; 2512 l3->free_objects += cachep->num;
2494 spin_unlock(&l3->list_lock); 2513 spin_unlock(&l3->list_lock);
2495 return 1; 2514 return 1;
2496 opps1: 2515opps1:
2497 kmem_freepages(cachep, objp); 2516 kmem_freepages(cachep, objp);
2498 failed: 2517failed:
2499 if (local_flags & __GFP_WAIT) 2518 if (local_flags & __GFP_WAIT)
2500 local_irq_disable(); 2519 local_irq_disable();
2501 return 0; 2520 return 0;
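
Much of the churn in this hunk and the one before it is the same Documentation/CodingStyle conversion applied throughout the file: assignments are lifted out of if conditions. A generic before/after sketch, where get_buffer() is a placeholder rather than a slab function:

static void *get_buffer(void)
{
	return NULL;			/* placeholder body so the sketch stands alone */
}

/* before: assignment buried in the condition */
static void *old_style(void)
{
	void *p;

	if (!(p = get_buffer()))
		return NULL;
	return p;
}

/* after: the form this patch converts to */
static void *new_style(void)
{
	void *p;

	p = get_buffer();
	if (!p)
		return NULL;
	return p;
}
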
@@ -2538,8 +2557,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2538 page = virt_to_page(objp); 2557 page = virt_to_page(objp);
2539 2558
2540 if (page_get_cache(page) != cachep) { 2559 if (page_get_cache(page) != cachep) {
2541 printk(KERN_ERR 2560 printk(KERN_ERR "mismatch in kmem_cache_free: expected "
2542 "mismatch in kmem_cache_free: expected cache %p, got %p\n", 2561 "cache %p, got %p\n",
2543 page_get_cache(page), cachep); 2562 page_get_cache(page), cachep);
2544 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); 2563 printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
2545 printk(KERN_ERR "%p is %s.\n", page_get_cache(page), 2564 printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
@@ -2549,13 +2568,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2549 slabp = page_get_slab(page); 2568 slabp = page_get_slab(page);
2550 2569
2551 if (cachep->flags & SLAB_RED_ZONE) { 2570 if (cachep->flags & SLAB_RED_ZONE) {
2552 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE 2571 if (*dbg_redzone1(cachep, objp) != RED_ACTIVE ||
2553 || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { 2572 *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
2554 slab_error(cachep, 2573 slab_error(cachep, "double free, or memory outside"
2555 "double free, or memory outside" 2574 " object was overwritten");
2556 " object was overwritten"); 2575 printk(KERN_ERR "%p: redzone 1:0x%lx, "
2557 printk(KERN_ERR 2576 "redzone 2:0x%lx.\n",
2558 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
2559 objp, *dbg_redzone1(cachep, objp), 2577 objp, *dbg_redzone1(cachep, objp),
2560 *dbg_redzone2(cachep, objp)); 2578 *dbg_redzone2(cachep, objp));
2561 } 2579 }
@@ -2565,15 +2583,16 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2565 if (cachep->flags & SLAB_STORE_USER) 2583 if (cachep->flags & SLAB_STORE_USER)
2566 *dbg_userword(cachep, objp) = caller; 2584 *dbg_userword(cachep, objp) = caller;
2567 2585
2568 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size; 2586 objnr = obj_to_index(cachep, slabp, objp);
2569 2587
2570 BUG_ON(objnr >= cachep->num); 2588 BUG_ON(objnr >= cachep->num);
2571 BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size); 2589 BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
2572 2590
2573 if (cachep->flags & SLAB_DEBUG_INITIAL) { 2591 if (cachep->flags & SLAB_DEBUG_INITIAL) {
2574 /* Need to call the slab's constructor so the 2592 /*
2575 * caller can perform a verify of its state (debugging). 2593 * Need to call the slab's constructor so the caller can
2576 * Called without the cache-lock held. 2594 * perform a verify of its state (debugging). Called without
2595 * the cache-lock held.
2577 */ 2596 */
2578 cachep->ctor(objp + obj_offset(cachep), 2597 cachep->ctor(objp + obj_offset(cachep),
2579 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); 2598 cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
@@ -2586,7 +2605,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2586 } 2605 }
2587 if (cachep->flags & SLAB_POISON) { 2606 if (cachep->flags & SLAB_POISON) {
2588#ifdef CONFIG_DEBUG_PAGEALLOC 2607#ifdef CONFIG_DEBUG_PAGEALLOC
2589 if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { 2608 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
2590 store_stackinfo(cachep, objp, (unsigned long)caller); 2609 store_stackinfo(cachep, objp, (unsigned long)caller);
2591 kernel_map_pages(virt_to_page(objp), 2610 kernel_map_pages(virt_to_page(objp),
2592 cachep->buffer_size / PAGE_SIZE, 0); 2611 cachep->buffer_size / PAGE_SIZE, 0);
@@ -2612,14 +2631,14 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
2612 goto bad; 2631 goto bad;
2613 } 2632 }
2614 if (entries != cachep->num - slabp->inuse) { 2633 if (entries != cachep->num - slabp->inuse) {
2615 bad: 2634bad:
2616 printk(KERN_ERR 2635 printk(KERN_ERR "slab: Internal list corruption detected in "
2617 "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", 2636 "cache '%s'(%d), slabp %p(%d). Hexdump:\n",
2618 cachep->name, cachep->num, slabp, slabp->inuse); 2637 cachep->name, cachep->num, slabp, slabp->inuse);
2619 for (i = 0; 2638 for (i = 0;
2620 i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); 2639 i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
2621 i++) { 2640 i++) {
2622 if ((i % 16) == 0) 2641 if (i % 16 == 0)
2623 printk("\n%03x:", i); 2642 printk("\n%03x:", i);
2624 printk(" %02x", ((unsigned char *)slabp)[i]); 2643 printk(" %02x", ((unsigned char *)slabp)[i]);
2625 } 2644 }
@@ -2641,12 +2660,13 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2641 2660
2642 check_irq_off(); 2661 check_irq_off();
2643 ac = cpu_cache_get(cachep); 2662 ac = cpu_cache_get(cachep);
2644 retry: 2663retry:
2645 batchcount = ac->batchcount; 2664 batchcount = ac->batchcount;
2646 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { 2665 if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
2647 /* if there was little recent activity on this 2666 /*
2648 * cache, then perform only a partial refill. 2667 * If there was little recent activity on this cache, then
2649 * Otherwise we could generate refill bouncing. 2668 * perform only a partial refill. Otherwise we could generate
2669 * refill bouncing.
2650 */ 2670 */
2651 batchcount = BATCHREFILL_LIMIT; 2671 batchcount = BATCHREFILL_LIMIT;
2652 } 2672 }
@@ -2702,29 +2722,29 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
2702 list_add(&slabp->list, &l3->slabs_partial); 2722 list_add(&slabp->list, &l3->slabs_partial);
2703 } 2723 }
2704 2724
2705 must_grow: 2725must_grow:
2706 l3->free_objects -= ac->avail; 2726 l3->free_objects -= ac->avail;
2707 alloc_done: 2727alloc_done:
2708 spin_unlock(&l3->list_lock); 2728 spin_unlock(&l3->list_lock);
2709 2729
2710 if (unlikely(!ac->avail)) { 2730 if (unlikely(!ac->avail)) {
2711 int x; 2731 int x;
2712 x = cache_grow(cachep, flags, numa_node_id()); 2732 x = cache_grow(cachep, flags, numa_node_id());
2713 2733
2714 // cache_grow can reenable interrupts, then ac could change. 2734 /* cache_grow can reenable interrupts, then ac could change. */
2715 ac = cpu_cache_get(cachep); 2735 ac = cpu_cache_get(cachep);
2716 if (!x && ac->avail == 0) // no objects in sight? abort 2736 if (!x && ac->avail == 0) /* no objects in sight? abort */
2717 return NULL; 2737 return NULL;
2718 2738
2719 if (!ac->avail) // objects refilled by interrupt? 2739 if (!ac->avail) /* objects refilled by interrupt? */
2720 goto retry; 2740 goto retry;
2721 } 2741 }
2722 ac->touched = 1; 2742 ac->touched = 1;
2723 return ac->entry[--ac->avail]; 2743 return ac->entry[--ac->avail];
2724} 2744}
2725 2745
2726static inline void 2746static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
2727cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) 2747 gfp_t flags)
2728{ 2748{
2729 might_sleep_if(flags & __GFP_WAIT); 2749 might_sleep_if(flags & __GFP_WAIT);
2730#if DEBUG 2750#if DEBUG
@@ -2733,8 +2753,8 @@ cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags)
2733} 2753}
2734 2754
2735#if DEBUG 2755#if DEBUG
2736static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, 2756static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2737 void *objp, void *caller) 2757 gfp_t flags, void *objp, void *caller)
2738{ 2758{
2739 if (!objp) 2759 if (!objp)
2740 return objp; 2760 return objp;
@@ -2754,15 +2774,14 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags
2754 *dbg_userword(cachep, objp) = caller; 2774 *dbg_userword(cachep, objp) = caller;
2755 2775
2756 if (cachep->flags & SLAB_RED_ZONE) { 2776 if (cachep->flags & SLAB_RED_ZONE) {
2757 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE 2777 if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
2758 || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { 2778 *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
2759 slab_error(cachep, 2779 slab_error(cachep, "double free, or memory outside"
2760 "double free, or memory outside" 2780 " object was overwritten");
2761 " object was overwritten");
2762 printk(KERN_ERR 2781 printk(KERN_ERR
2763 "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", 2782 "%p: redzone 1:0x%lx, redzone 2:0x%lx\n",
2764 objp, *dbg_redzone1(cachep, objp), 2783 objp, *dbg_redzone1(cachep, objp),
2765 *dbg_redzone2(cachep, objp)); 2784 *dbg_redzone2(cachep, objp));
2766 } 2785 }
2767 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2786 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2768 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2787 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
@@ -2809,8 +2828,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2809 return objp; 2828 return objp;
2810} 2829}
2811 2830
2812static __always_inline void * 2831static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
2813__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller) 2832 gfp_t flags, void *caller)
2814{ 2833{
2815 unsigned long save_flags; 2834 unsigned long save_flags;
2816 void *objp; 2835 void *objp;
@@ -2830,7 +2849,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
2830/* 2849/*
2831 * A interface to enable slab creation on nodeid 2850 * A interface to enable slab creation on nodeid
2832 */ 2851 */
2833static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 2852static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
2853 int nodeid)
2834{ 2854{
2835 struct list_head *entry; 2855 struct list_head *entry;
2836 struct slab *slabp; 2856 struct slab *slabp;
@@ -2841,7 +2861,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
2841 l3 = cachep->nodelists[nodeid]; 2861 l3 = cachep->nodelists[nodeid];
2842 BUG_ON(!l3); 2862 BUG_ON(!l3);
2843 2863
2844 retry: 2864retry:
2845 check_irq_off(); 2865 check_irq_off();
2846 spin_lock(&l3->list_lock); 2866 spin_lock(&l3->list_lock);
2847 entry = l3->slabs_partial.next; 2867 entry = l3->slabs_partial.next;
@@ -2868,16 +2888,15 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
2868 /* move slabp to correct slabp list: */ 2888 /* move slabp to correct slabp list: */
2869 list_del(&slabp->list); 2889 list_del(&slabp->list);
2870 2890
2871 if (slabp->free == BUFCTL_END) { 2891 if (slabp->free == BUFCTL_END)
2872 list_add(&slabp->list, &l3->slabs_full); 2892 list_add(&slabp->list, &l3->slabs_full);
2873 } else { 2893 else
2874 list_add(&slabp->list, &l3->slabs_partial); 2894 list_add(&slabp->list, &l3->slabs_partial);
2875 }
2876 2895
2877 spin_unlock(&l3->list_lock); 2896 spin_unlock(&l3->list_lock);
2878 goto done; 2897 goto done;
2879 2898
2880 must_grow: 2899must_grow:
2881 spin_unlock(&l3->list_lock); 2900 spin_unlock(&l3->list_lock);
2882 x = cache_grow(cachep, flags, nodeid); 2901 x = cache_grow(cachep, flags, nodeid);
2883 2902
@@ -2885,7 +2904,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
2885 return NULL; 2904 return NULL;
2886 2905
2887 goto retry; 2906 goto retry;
2888 done: 2907done:
2889 return obj; 2908 return obj;
2890} 2909}
2891#endif 2910#endif
@@ -2958,7 +2977,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
2958 } 2977 }
2959 2978
2960 free_block(cachep, ac->entry, batchcount, node); 2979 free_block(cachep, ac->entry, batchcount, node);
2961 free_done: 2980free_done:
2962#if STATS 2981#if STATS
2963 { 2982 {
2964 int i = 0; 2983 int i = 0;
@@ -2979,16 +2998,12 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
2979#endif 2998#endif
2980 spin_unlock(&l3->list_lock); 2999 spin_unlock(&l3->list_lock);
2981 ac->avail -= batchcount; 3000 ac->avail -= batchcount;
2982 memmove(ac->entry, &(ac->entry[batchcount]), 3001 memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
2983 sizeof(void *) * ac->avail);
2984} 3002}
2985 3003
2986/* 3004/*
2987 * __cache_free 3005 * Release an obj back to its cache. If the obj has a constructed state, it must
2988 * Release an obj back to its cache. If the obj has a constructed 3006 * be in this state _before_ it is released. Called with disabled ints.
2989 * state, it must be in this state _before_ it is released.
2990 *
2991 * Called with disabled ints.
2992 */ 3007 */
2993static inline void __cache_free(struct kmem_cache *cachep, void *objp) 3008static inline void __cache_free(struct kmem_cache *cachep, void *objp)
2994{ 3009{
@@ -3007,9 +3022,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3007 if (unlikely(slabp->nodeid != numa_node_id())) { 3022 if (unlikely(slabp->nodeid != numa_node_id())) {
3008 struct array_cache *alien = NULL; 3023 struct array_cache *alien = NULL;
3009 int nodeid = slabp->nodeid; 3024 int nodeid = slabp->nodeid;
3010 struct kmem_list3 *l3 = 3025 struct kmem_list3 *l3;
3011 cachep->nodelists[numa_node_id()];
3012 3026
3027 l3 = cachep->nodelists[numa_node_id()];
3013 STATS_INC_NODEFREES(cachep); 3028 STATS_INC_NODEFREES(cachep);
3014 if (l3->alien && l3->alien[nodeid]) { 3029 if (l3->alien && l3->alien[nodeid]) {
3015 alien = l3->alien[nodeid]; 3030 alien = l3->alien[nodeid];
@@ -3093,7 +3108,7 @@ int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
3093 if (unlikely(page_get_cache(page) != cachep)) 3108 if (unlikely(page_get_cache(page) != cachep))
3094 goto out; 3109 goto out;
3095 return 1; 3110 return 1;
3096 out: 3111out:
3097 return 0; 3112 return 0;
3098} 3113}
3099 3114
@@ -3119,7 +3134,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3119 local_irq_save(save_flags); 3134 local_irq_save(save_flags);
3120 3135
3121 if (nodeid == -1 || nodeid == numa_node_id() || 3136 if (nodeid == -1 || nodeid == numa_node_id() ||
3122 !cachep->nodelists[nodeid]) 3137 !cachep->nodelists[nodeid])
3123 ptr = ____cache_alloc(cachep, flags); 3138 ptr = ____cache_alloc(cachep, flags);
3124 else 3139 else
3125 ptr = __cache_alloc_node(cachep, flags, nodeid); 3140 ptr = __cache_alloc_node(cachep, flags, nodeid);
@@ -3148,6 +3163,7 @@ EXPORT_SYMBOL(kmalloc_node);
3148 * kmalloc - allocate memory 3163 * kmalloc - allocate memory
3149 * @size: how many bytes of memory are required. 3164 * @size: how many bytes of memory are required.
3150 * @flags: the type of memory to allocate. 3165 * @flags: the type of memory to allocate.
3166 * @caller: function caller for debug tracking of the caller
3151 * 3167 *
3152 * kmalloc is the normal method of allocating memory 3168 * kmalloc is the normal method of allocating memory
3153 * in the kernel. 3169 * in the kernel.
@@ -3236,7 +3252,7 @@ void *__alloc_percpu(size_t size)
3236 /* Catch derefs w/o wrappers */ 3252 /* Catch derefs w/o wrappers */
3237 return (void *)(~(unsigned long)pdata); 3253 return (void *)(~(unsigned long)pdata);
3238 3254
3239 unwind_oom: 3255unwind_oom:
3240 while (--i >= 0) { 3256 while (--i >= 0) {
3241 if (!cpu_possible(i)) 3257 if (!cpu_possible(i))
3242 continue; 3258 continue;
@@ -3339,18 +3355,20 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3339 struct array_cache *nc = NULL, *new; 3355 struct array_cache *nc = NULL, *new;
3340 struct array_cache **new_alien = NULL; 3356 struct array_cache **new_alien = NULL;
3341#ifdef CONFIG_NUMA 3357#ifdef CONFIG_NUMA
3342 if (!(new_alien = alloc_alien_cache(node, cachep->limit))) 3358 new_alien = alloc_alien_cache(node, cachep->limit);
3359 if (!new_alien)
3343 goto fail; 3360 goto fail;
3344#endif 3361#endif
3345 if (!(new = alloc_arraycache(node, (cachep->shared * 3362 new = alloc_arraycache(node, cachep->shared*cachep->batchcount,
3346 cachep->batchcount), 3363 0xbaadf00d);
3347 0xbaadf00d))) 3364 if (!new)
3348 goto fail; 3365 goto fail;
3349 if ((l3 = cachep->nodelists[node])) { 3366 l3 = cachep->nodelists[node];
3350 3367 if (l3) {
3351 spin_lock_irq(&l3->list_lock); 3368 spin_lock_irq(&l3->list_lock);
3352 3369
3353 if ((nc = cachep->nodelists[node]->shared)) 3370 nc = cachep->nodelists[node]->shared;
3371 if (nc)
3354 free_block(cachep, nc->entry, nc->avail, node); 3372 free_block(cachep, nc->entry, nc->avail, node);
3355 3373
3356 l3->shared = new; 3374 l3->shared = new;
@@ -3359,27 +3377,27 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
3359 new_alien = NULL; 3377 new_alien = NULL;
3360 } 3378 }
3361 l3->free_limit = (1 + nr_cpus_node(node)) * 3379 l3->free_limit = (1 + nr_cpus_node(node)) *
3362 cachep->batchcount + cachep->num; 3380 cachep->batchcount + cachep->num;
3363 spin_unlock_irq(&l3->list_lock); 3381 spin_unlock_irq(&l3->list_lock);
3364 kfree(nc); 3382 kfree(nc);
3365 free_alien_cache(new_alien); 3383 free_alien_cache(new_alien);
3366 continue; 3384 continue;
3367 } 3385 }
3368 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), 3386 l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
3369 GFP_KERNEL, node))) 3387 if (!l3)
3370 goto fail; 3388 goto fail;
3371 3389
3372 kmem_list3_init(l3); 3390 kmem_list3_init(l3);
3373 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3391 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3374 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 3392 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3375 l3->shared = new; 3393 l3->shared = new;
3376 l3->alien = new_alien; 3394 l3->alien = new_alien;
3377 l3->free_limit = (1 + nr_cpus_node(node)) * 3395 l3->free_limit = (1 + nr_cpus_node(node)) *
3378 cachep->batchcount + cachep->num; 3396 cachep->batchcount + cachep->num;
3379 cachep->nodelists[node] = l3; 3397 cachep->nodelists[node] = l3;
3380 } 3398 }
3381 return err; 3399 return err;
3382 fail: 3400fail:
3383 err = -ENOMEM; 3401 err = -ENOMEM;
3384 return err; 3402 return err;
3385} 3403}
@@ -3391,7 +3409,7 @@ struct ccupdate_struct {
3391 3409
3392static void do_ccupdate_local(void *info) 3410static void do_ccupdate_local(void *info)
3393{ 3411{
3394 struct ccupdate_struct *new = (struct ccupdate_struct *)info; 3412 struct ccupdate_struct *new = info;
3395 struct array_cache *old; 3413 struct array_cache *old;
3396 3414
3397 check_irq_off(); 3415 check_irq_off();
@@ -3401,16 +3419,17 @@ static void do_ccupdate_local(void *info)
3401 new->new[smp_processor_id()] = old; 3419 new->new[smp_processor_id()] = old;
3402} 3420}
3403 3421
3404static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount, 3422/* Always called with the cache_chain_mutex held */
3405 int shared) 3423static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3424 int batchcount, int shared)
3406{ 3425{
3407 struct ccupdate_struct new; 3426 struct ccupdate_struct new;
3408 int i, err; 3427 int i, err;
3409 3428
3410 memset(&new.new, 0, sizeof(new.new)); 3429 memset(&new.new, 0, sizeof(new.new));
3411 for_each_online_cpu(i) { 3430 for_each_online_cpu(i) {
3412 new.new[i] = 3431 new.new[i] = alloc_arraycache(cpu_to_node(i), limit,
3413 alloc_arraycache(cpu_to_node(i), limit, batchcount); 3432 batchcount);
3414 if (!new.new[i]) { 3433 if (!new.new[i]) {
3415 for (i--; i >= 0; i--) 3434 for (i--; i >= 0; i--)
3416 kfree(new.new[i]); 3435 kfree(new.new[i]);
@@ -3419,14 +3438,12 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount
3419 } 3438 }
3420 new.cachep = cachep; 3439 new.cachep = cachep;
3421 3440
3422 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); 3441 on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1);
3423 3442
3424 check_irq_on(); 3443 check_irq_on();
3425 spin_lock(&cachep->spinlock);
3426 cachep->batchcount = batchcount; 3444 cachep->batchcount = batchcount;
3427 cachep->limit = limit; 3445 cachep->limit = limit;
3428 cachep->shared = shared; 3446 cachep->shared = shared;
3429 spin_unlock(&cachep->spinlock);
3430 3447
3431 for_each_online_cpu(i) { 3448 for_each_online_cpu(i) {
3432 struct array_cache *ccold = new.new[i]; 3449 struct array_cache *ccold = new.new[i];
@@ -3447,15 +3464,17 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount
3447 return 0; 3464 return 0;
3448} 3465}
3449 3466
3467/* Called with cache_chain_mutex held always */
3450static void enable_cpucache(struct kmem_cache *cachep) 3468static void enable_cpucache(struct kmem_cache *cachep)
3451{ 3469{
3452 int err; 3470 int err;
3453 int limit, shared; 3471 int limit, shared;
3454 3472
3455 /* The head array serves three purposes: 3473 /*
3474 * The head array serves three purposes:
3456 * - create a LIFO ordering, i.e. return objects that are cache-warm 3475 * - create a LIFO ordering, i.e. return objects that are cache-warm
3457 * - reduce the number of spinlock operations. 3476 * - reduce the number of spinlock operations.
3458 * - reduce the number of linked list operations on the slab and 3477 * - reduce the number of linked list operations on the slab and
3459 * bufctl chains: array operations are cheaper. 3478 * bufctl chains: array operations are cheaper.
3460 * The numbers are guessed, we should auto-tune as described by 3479 * The numbers are guessed, we should auto-tune as described by
3461 * Bonwick. 3480 * Bonwick.
@@ -3471,7 +3490,8 @@ static void enable_cpucache(struct kmem_cache *cachep)
3471 else 3490 else
3472 limit = 120; 3491 limit = 120;
3473 3492
3474 /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound 3493 /*
3494 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
3475 * allocation behaviour: Most allocs on one cpu, most free operations 3495 * allocation behaviour: Most allocs on one cpu, most free operations
3476 * on another cpu. For these cases, an efficient object passing between 3496 * on another cpu. For these cases, an efficient object passing between
3477 * cpus is necessary. This is provided by a shared array. The array 3497 * cpus is necessary. This is provided by a shared array. The array
@@ -3486,9 +3506,9 @@ static void enable_cpucache(struct kmem_cache *cachep)
3486#endif 3506#endif
3487 3507
3488#if DEBUG 3508#if DEBUG
3489 /* With debugging enabled, large batchcount lead to excessively 3509 /*
3490 * long periods with disabled local interrupts. Limit the 3510 * With debugging enabled, large batchcount lead to excessively long
3491 * batchcount 3511 * periods with disabled local interrupts. Limit the batchcount
3492 */ 3512 */
3493 if (limit > 32) 3513 if (limit > 32)
3494 limit = 32; 3514 limit = 32;
@@ -3499,23 +3519,32 @@ static void enable_cpucache(struct kmem_cache *cachep)
3499 cachep->name, -err); 3519 cachep->name, -err);
3500} 3520}
3501 3521
3502static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, 3522/*
3503 int force, int node) 3523 * Drain an array if it contains any elements taking the l3 lock only if
3524 * necessary. Note that the l3 listlock also protects the array_cache
3525 * if drain_array() is used on the shared array.
3526 */
3527void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
3528 struct array_cache *ac, int force, int node)
3504{ 3529{
3505 int tofree; 3530 int tofree;
3506 3531
3507 check_spinlock_acquired_node(cachep, node); 3532 if (!ac || !ac->avail)
3533 return;
3508 if (ac->touched && !force) { 3534 if (ac->touched && !force) {
3509 ac->touched = 0; 3535 ac->touched = 0;
3510 } else if (ac->avail) { 3536 } else {
3511 tofree = force ? ac->avail : (ac->limit + 4) / 5; 3537 spin_lock_irq(&l3->list_lock);
3512 if (tofree > ac->avail) { 3538 if (ac->avail) {
3513 tofree = (ac->avail + 1) / 2; 3539 tofree = force ? ac->avail : (ac->limit + 4) / 5;
3540 if (tofree > ac->avail)
3541 tofree = (ac->avail + 1) / 2;
3542 free_block(cachep, ac->entry, tofree, node);
3543 ac->avail -= tofree;
3544 memmove(ac->entry, &(ac->entry[tofree]),
3545 sizeof(void *) * ac->avail);
3514 } 3546 }
3515 free_block(cachep, ac->entry, tofree, node); 3547 spin_unlock_irq(&l3->list_lock);
3516 ac->avail -= tofree;
3517 memmove(ac->entry, &(ac->entry[tofree]),
3518 sizeof(void *) * ac->avail);
3519 } 3548 }
3520} 3549}
3521 3550
@@ -3528,13 +3557,14 @@ static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac
3528 * - clear the per-cpu caches for this CPU. 3557 * - clear the per-cpu caches for this CPU.
3529 * - return freeable pages to the main free memory pool. 3558 * - return freeable pages to the main free memory pool.
3530 * 3559 *
3531 * If we cannot acquire the cache chain mutex then just give up - we'll 3560 * If we cannot acquire the cache chain mutex then just give up - we'll try
3532 * try again on the next iteration. 3561 * again on the next iteration.
3533 */ 3562 */
3534static void cache_reap(void *unused) 3563static void cache_reap(void *unused)
3535{ 3564{
3536 struct list_head *walk; 3565 struct list_head *walk;
3537 struct kmem_list3 *l3; 3566 struct kmem_list3 *l3;
3567 int node = numa_node_id();
3538 3568
3539 if (!mutex_trylock(&cache_chain_mutex)) { 3569 if (!mutex_trylock(&cache_chain_mutex)) {
3540 /* Give up. Setup the next iteration. */ 3570 /* Give up. Setup the next iteration. */
@@ -3550,65 +3580,72 @@ static void cache_reap(void *unused)
3550 struct slab *slabp; 3580 struct slab *slabp;
3551 3581
3552 searchp = list_entry(walk, struct kmem_cache, next); 3582 searchp = list_entry(walk, struct kmem_cache, next);
3553
3554 if (searchp->flags & SLAB_NO_REAP)
3555 goto next;
3556
3557 check_irq_on(); 3583 check_irq_on();
3558 3584
3559 l3 = searchp->nodelists[numa_node_id()]; 3585 /*
3586 * We only take the l3 lock if absolutely necessary and we
3587 * have established with reasonable certainty that
3588 * we can do some work if the lock was obtained.
3589 */
3590 l3 = searchp->nodelists[node];
3591
3560 reap_alien(searchp, l3); 3592 reap_alien(searchp, l3);
3561 spin_lock_irq(&l3->list_lock);
3562 3593
3563 drain_array_locked(searchp, cpu_cache_get(searchp), 0, 3594 drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
3564 numa_node_id());
3565 3595
3596 /*
3597 * These are racy checks but it does not matter
3598 * if we skip one check or scan twice.
3599 */
3566 if (time_after(l3->next_reap, jiffies)) 3600 if (time_after(l3->next_reap, jiffies))
3567 goto next_unlock; 3601 goto next;
3568 3602
3569 l3->next_reap = jiffies + REAPTIMEOUT_LIST3; 3603 l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
3570 3604
3571 if (l3->shared) 3605 drain_array(searchp, l3, l3->shared, 0, node);
3572 drain_array_locked(searchp, l3->shared, 0,
3573 numa_node_id());
3574 3606
3575 if (l3->free_touched) { 3607 if (l3->free_touched) {
3576 l3->free_touched = 0; 3608 l3->free_touched = 0;
3577 goto next_unlock; 3609 goto next;
3578 } 3610 }
3579 3611
3580 tofree = 3612 tofree = (l3->free_limit + 5 * searchp->num - 1) /
3581 (l3->free_limit + 5 * searchp->num - 3613 (5 * searchp->num);
3582 1) / (5 * searchp->num);
3583 do { 3614 do {
3615 /*
3616 * Do not lock if there are no free blocks.
3617 */
3618 if (list_empty(&l3->slabs_free))
3619 break;
3620
3621 spin_lock_irq(&l3->list_lock);
3584 p = l3->slabs_free.next; 3622 p = l3->slabs_free.next;
3585 if (p == &(l3->slabs_free)) 3623 if (p == &(l3->slabs_free)) {
3624 spin_unlock_irq(&l3->list_lock);
3586 break; 3625 break;
3626 }
3587 3627
3588 slabp = list_entry(p, struct slab, list); 3628 slabp = list_entry(p, struct slab, list);
3589 BUG_ON(slabp->inuse); 3629 BUG_ON(slabp->inuse);
3590 list_del(&slabp->list); 3630 list_del(&slabp->list);
3591 STATS_INC_REAPED(searchp); 3631 STATS_INC_REAPED(searchp);
3592 3632
3593 /* Safe to drop the lock. The slab is no longer 3633 /*
3594 * linked to the cache. 3634 * Safe to drop the lock. The slab is no longer linked
3595 * searchp cannot disappear, we hold 3635 * to the cache. searchp cannot disappear, we hold
3596 * cache_chain_lock 3636 * cache_chain_lock
3597 */ 3637 */
3598 l3->free_objects -= searchp->num; 3638 l3->free_objects -= searchp->num;
3599 spin_unlock_irq(&l3->list_lock); 3639 spin_unlock_irq(&l3->list_lock);
3600 slab_destroy(searchp, slabp); 3640 slab_destroy(searchp, slabp);
3601 spin_lock_irq(&l3->list_lock);
3602 } while (--tofree > 0); 3641 } while (--tofree > 0);
3603 next_unlock: 3642next:
3604 spin_unlock_irq(&l3->list_lock);
3605 next:
3606 cond_resched(); 3643 cond_resched();
3607 } 3644 }
3608 check_irq_on(); 3645 check_irq_on();
3609 mutex_unlock(&cache_chain_mutex); 3646 mutex_unlock(&cache_chain_mutex);
3610 next_reap_node(); 3647 next_reap_node();
3611 /* Setup the next iteration */ 3648 /* Set up the next iteration */
3612 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); 3649 schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
3613} 3650}
3614 3651
@@ -3658,8 +3695,8 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
3658{ 3695{
3659 struct kmem_cache *cachep = p; 3696 struct kmem_cache *cachep = p;
3660 ++*pos; 3697 ++*pos;
3661 return cachep->next.next == &cache_chain ? NULL 3698 return cachep->next.next == &cache_chain ?
3662 : list_entry(cachep->next.next, struct kmem_cache, next); 3699 NULL : list_entry(cachep->next.next, struct kmem_cache, next);
3663} 3700}
3664 3701
3665static void s_stop(struct seq_file *m, void *p) 3702static void s_stop(struct seq_file *m, void *p)
@@ -3681,7 +3718,6 @@ static int s_show(struct seq_file *m, void *p)
3681 int node; 3718 int node;
3682 struct kmem_list3 *l3; 3719 struct kmem_list3 *l3;
3683 3720
3684 spin_lock(&cachep->spinlock);
3685 active_objs = 0; 3721 active_objs = 0;
3686 num_slabs = 0; 3722 num_slabs = 0;
3687 for_each_online_node(node) { 3723 for_each_online_node(node) {
@@ -3748,7 +3784,9 @@ static int s_show(struct seq_file *m, void *p)
3748 unsigned long node_frees = cachep->node_frees; 3784 unsigned long node_frees = cachep->node_frees;
3749 3785
3750 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ 3786 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
3751 %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees); 3787 %4lu %4lu %4lu %4lu", allocs, high, grown,
3788 reaped, errors, max_freeable, node_allocs,
3789 node_frees);
3752 } 3790 }
3753 /* cpu stats */ 3791 /* cpu stats */
3754 { 3792 {
@@ -3762,7 +3800,6 @@ static int s_show(struct seq_file *m, void *p)
3762 } 3800 }
3763#endif 3801#endif
3764 seq_putc(m, '\n'); 3802 seq_putc(m, '\n');
3765 spin_unlock(&cachep->spinlock);
3766 return 0; 3803 return 0;
3767} 3804}
3768 3805
@@ -3820,13 +3857,12 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3820 mutex_lock(&cache_chain_mutex); 3857 mutex_lock(&cache_chain_mutex);
3821 res = -EINVAL; 3858 res = -EINVAL;
3822 list_for_each(p, &cache_chain) { 3859 list_for_each(p, &cache_chain) {
3823 struct kmem_cache *cachep = list_entry(p, struct kmem_cache, 3860 struct kmem_cache *cachep;
3824 next);
3825 3861
3862 cachep = list_entry(p, struct kmem_cache, next);
3826 if (!strcmp(cachep->name, kbuf)) { 3863 if (!strcmp(cachep->name, kbuf)) {
3827 if (limit < 1 || 3864 if (limit < 1 || batchcount < 1 ||
3828 batchcount < 1 || 3865 batchcount > limit || shared < 0) {
3829 batchcount > limit || shared < 0) {
3830 res = 0; 3866 res = 0;
3831 } else { 3867 } else {
3832 res = do_tune_cpucache(cachep, limit, 3868 res = do_tune_cpucache(cachep, limit,
diff --git a/mm/swap.c b/mm/swap.c
index b524ea90bd..91b7e2026f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -209,19 +209,18 @@ int lru_add_drain_all(void)
209 */ 209 */
210void fastcall __page_cache_release(struct page *page) 210void fastcall __page_cache_release(struct page *page)
211{ 211{
212 unsigned long flags; 212 if (PageLRU(page)) {
213 struct zone *zone = page_zone(page); 213 unsigned long flags;
214 struct zone *zone = page_zone(page);
214 215
215 spin_lock_irqsave(&zone->lru_lock, flags); 216 spin_lock_irqsave(&zone->lru_lock, flags);
216 if (TestClearPageLRU(page)) 217 BUG_ON(!PageLRU(page));
218 __ClearPageLRU(page);
217 del_page_from_lru(zone, page); 219 del_page_from_lru(zone, page);
218 if (page_count(page) != 0) 220 spin_unlock_irqrestore(&zone->lru_lock, flags);
219 page = NULL; 221 }
220 spin_unlock_irqrestore(&zone->lru_lock, flags); 222 free_hot_page(page);
221 if (page)
222 free_hot_page(page);
223} 223}
224
225EXPORT_SYMBOL(__page_cache_release); 224EXPORT_SYMBOL(__page_cache_release);
226 225
227/* 226/*
@@ -245,7 +244,6 @@ void release_pages(struct page **pages, int nr, int cold)
245 pagevec_init(&pages_to_free, cold); 244 pagevec_init(&pages_to_free, cold);
246 for (i = 0; i < nr; i++) { 245 for (i = 0; i < nr; i++) {
247 struct page *page = pages[i]; 246 struct page *page = pages[i];
248 struct zone *pagezone;
249 247
250 if (unlikely(PageCompound(page))) { 248 if (unlikely(PageCompound(page))) {
251 if (zone) { 249 if (zone) {
@@ -259,23 +257,27 @@ void release_pages(struct page **pages, int nr, int cold)
259 if (!put_page_testzero(page)) 257 if (!put_page_testzero(page))
260 continue; 258 continue;
261 259
262 pagezone = page_zone(page); 260 if (PageLRU(page)) {
263 if (pagezone != zone) { 261 struct zone *pagezone = page_zone(page);
264 if (zone) 262 if (pagezone != zone) {
265 spin_unlock_irq(&zone->lru_lock); 263 if (zone)
266 zone = pagezone; 264 spin_unlock_irq(&zone->lru_lock);
267 spin_lock_irq(&zone->lru_lock); 265 zone = pagezone;
268 } 266 spin_lock_irq(&zone->lru_lock);
269 if (TestClearPageLRU(page)) 267 }
268 BUG_ON(!PageLRU(page));
269 __ClearPageLRU(page);
270 del_page_from_lru(zone, page); 270 del_page_from_lru(zone, page);
271 if (page_count(page) == 0) { 271 }
272 if (!pagevec_add(&pages_to_free, page)) { 272
273 if (!pagevec_add(&pages_to_free, page)) {
274 if (zone) {
273 spin_unlock_irq(&zone->lru_lock); 275 spin_unlock_irq(&zone->lru_lock);
274 __pagevec_free(&pages_to_free); 276 zone = NULL;
275 pagevec_reinit(&pages_to_free);
276 zone = NULL; /* No lock is held */
277 } 277 }
278 } 278 __pagevec_free(&pages_to_free);
279 pagevec_reinit(&pages_to_free);
280 }
279 } 281 }
280 if (zone) 282 if (zone)
281 spin_unlock_irq(&zone->lru_lock); 283 spin_unlock_irq(&zone->lru_lock);
@@ -343,8 +345,8 @@ void __pagevec_lru_add(struct pagevec *pvec)
343 zone = pagezone; 345 zone = pagezone;
344 spin_lock_irq(&zone->lru_lock); 346 spin_lock_irq(&zone->lru_lock);
345 } 347 }
346 if (TestSetPageLRU(page)) 348 BUG_ON(PageLRU(page));
347 BUG(); 349 SetPageLRU(page);
348 add_page_to_inactive_list(zone, page); 350 add_page_to_inactive_list(zone, page);
349 } 351 }
350 if (zone) 352 if (zone)
@@ -370,10 +372,10 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
370 zone = pagezone; 372 zone = pagezone;
371 spin_lock_irq(&zone->lru_lock); 373 spin_lock_irq(&zone->lru_lock);
372 } 374 }
373 if (TestSetPageLRU(page)) 375 BUG_ON(PageLRU(page));
374 BUG(); 376 SetPageLRU(page);
375 if (TestSetPageActive(page)) 377 BUG_ON(PageActive(page));
376 BUG(); 378 SetPageActive(page);
377 add_page_to_active_list(zone, page); 379 add_page_to_active_list(zone, page);
378 } 380 }
379 if (zone) 381 if (zone)
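
The common thread in the mm/swap.c hunks above: PG_lru is only ever set or cleared while zone->lru_lock is held, so the atomic TestSetPageLRU/TestClearPageLRU operations can become a BUG_ON() sanity check plus the cheaper non-atomic bit operations. A sketch of the resulting idiom, essentially mirroring the new __page_cache_release():

#include <linux/mm.h>
#include <linux/mm_inline.h>

static void take_page_off_lru(struct page *page)
{
	struct zone *zone = page_zone(page);
	unsigned long flags;

	spin_lock_irqsave(&zone->lru_lock, flags);
	BUG_ON(!PageLRU(page));		/* caller must know the page is on an LRU */
	__ClearPageLRU(page);		/* non-atomic is safe under lru_lock */
	del_page_from_lru(zone, page);
	spin_unlock_irqrestore(&zone->lru_lock, flags);
}
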
diff --git a/mm/swap_state.c b/mm/swap_state.c
index db8a3d3e16..d7af296833 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -15,6 +15,7 @@
15#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
16#include <linux/backing-dev.h> 16#include <linux/backing-dev.h>
17#include <linux/pagevec.h> 17#include <linux/pagevec.h>
18#include <linux/migrate.h>
18 19
19#include <asm/pgtable.h> 20#include <asm/pgtable.h>
20 21
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1f9cf0d073..365ed6ff18 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -116,7 +116,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
116 last_in_cluster = offset + SWAPFILE_CLUSTER; 116 last_in_cluster = offset + SWAPFILE_CLUSTER;
117 else if (offset == last_in_cluster) { 117 else if (offset == last_in_cluster) {
118 spin_lock(&swap_lock); 118 spin_lock(&swap_lock);
119 si->cluster_next = offset-SWAPFILE_CLUSTER-1; 119 si->cluster_next = offset-SWAPFILE_CLUSTER+1;
120 goto cluster; 120 goto cluster;
121 } 121 }
122 if (unlikely(--latency_ration < 0)) { 122 if (unlikely(--latency_ration < 0)) {
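
A worked example of the one-character swapfile fix, assuming the usual reading of this scan: last_in_cluster is bumped to "used slot + SWAPFILE_CLUSTER" whenever an allocated slot is seen, so reaching offset == last_in_cluster means the SWAPFILE_CLUSTER slots ending at offset are all free.

	SWAPFILE_CLUSTER = 256, last used slot = 500
	=> last_in_cluster = 500 + 256 = 756, free run = slots 501..756
	old: cluster_next = 756 - 256 - 1 = 499	(below the run; slot 500 is in use)
	new: cluster_next = 756 - 256 + 1 = 501	(first slot of the free run)
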
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4fe7e3aa02..fd572bbdc9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -33,39 +33,21 @@
33#include <linux/cpuset.h> 33#include <linux/cpuset.h>
34#include <linux/notifier.h> 34#include <linux/notifier.h>
35#include <linux/rwsem.h> 35#include <linux/rwsem.h>
36#include <linux/delay.h>
36 37
37#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
38#include <asm/div64.h> 39#include <asm/div64.h>
39 40
40#include <linux/swapops.h> 41#include <linux/swapops.h>
41 42
42/* possible outcome of pageout() */ 43#include "internal.h"
43typedef enum {
44 /* failed to write page out, page is locked */
45 PAGE_KEEP,
46 /* move page to the active list, page is locked */
47 PAGE_ACTIVATE,
48 /* page has been sent to the disk successfully, page is unlocked */
49 PAGE_SUCCESS,
50 /* page is clean and locked */
51 PAGE_CLEAN,
52} pageout_t;
53 44
54struct scan_control { 45struct scan_control {
55 /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
56 unsigned long nr_to_scan;
57
58 /* Incremented by the number of inactive pages that were scanned */ 46 /* Incremented by the number of inactive pages that were scanned */
59 unsigned long nr_scanned; 47 unsigned long nr_scanned;
60 48
61 /* Incremented by the number of pages reclaimed */
62 unsigned long nr_reclaimed;
63
64 unsigned long nr_mapped; /* From page_state */ 49 unsigned long nr_mapped; /* From page_state */
65 50
66 /* Ask shrink_caches, or shrink_zone to scan at this priority */
67 unsigned int priority;
68
69 /* This context's GFP mask */ 51 /* This context's GFP mask */
70 gfp_t gfp_mask; 52 gfp_t gfp_mask;
71 53
@@ -183,10 +165,11 @@ EXPORT_SYMBOL(remove_shrinker);
183 * 165 *
184 * Returns the number of slab objects which we shrunk. 166 * Returns the number of slab objects which we shrunk.
185 */ 167 */
186int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) 168unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
169 unsigned long lru_pages)
187{ 170{
188 struct shrinker *shrinker; 171 struct shrinker *shrinker;
189 int ret = 0; 172 unsigned long ret = 0;
190 173
191 if (scanned == 0) 174 if (scanned == 0)
192 scanned = SWAP_CLUSTER_MAX; 175 scanned = SWAP_CLUSTER_MAX;
@@ -306,9 +289,10 @@ static void handle_write_error(struct address_space *mapping,
306} 289}
307 290
308/* 291/*
309 * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). 292 * pageout is called by shrink_page_list() for each dirty page.
293 * Calls ->writepage().
310 */ 294 */
311static pageout_t pageout(struct page *page, struct address_space *mapping) 295pageout_t pageout(struct page *page, struct address_space *mapping)
312{ 296{
313 /* 297 /*
314 * If the page is dirty, only perform writeback if that write 298 * If the page is dirty, only perform writeback if that write
@@ -376,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
376 return PAGE_CLEAN; 360 return PAGE_CLEAN;
377} 361}
378 362
379static int remove_mapping(struct address_space *mapping, struct page *page) 363int remove_mapping(struct address_space *mapping, struct page *page)
380{ 364{
381 if (!mapping) 365 if (!mapping)
382 return 0; /* truncate got there first */ 366 return 0; /* truncate got there first */
@@ -414,14 +398,15 @@ cannot_free:
414} 398}
415 399
416/* 400/*
417 * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed 401 * shrink_page_list() returns the number of reclaimed pages
418 */ 402 */
419static int shrink_list(struct list_head *page_list, struct scan_control *sc) 403static unsigned long shrink_page_list(struct list_head *page_list,
404 struct scan_control *sc)
420{ 405{
421 LIST_HEAD(ret_pages); 406 LIST_HEAD(ret_pages);
422 struct pagevec freed_pvec; 407 struct pagevec freed_pvec;
423 int pgactivate = 0; 408 int pgactivate = 0;
424 int reclaimed = 0; 409 unsigned long nr_reclaimed = 0;
425 410
426 cond_resched(); 411 cond_resched();
427 412
@@ -464,12 +449,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
464 * Anonymous process memory has backing store? 449 * Anonymous process memory has backing store?
465 * Try to allocate it some swap space here. 450 * Try to allocate it some swap space here.
466 */ 451 */
467 if (PageAnon(page) && !PageSwapCache(page)) { 452 if (PageAnon(page) && !PageSwapCache(page))
468 if (!sc->may_swap)
469 goto keep_locked;
470 if (!add_to_swap(page, GFP_ATOMIC)) 453 if (!add_to_swap(page, GFP_ATOMIC))
471 goto activate_locked; 454 goto activate_locked;
472 }
473#endif /* CONFIG_SWAP */ 455#endif /* CONFIG_SWAP */
474 456
475 mapping = page_mapping(page); 457 mapping = page_mapping(page);
@@ -481,12 +463,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
481 * processes. Try to unmap it here. 463 * processes. Try to unmap it here.
482 */ 464 */
483 if (page_mapped(page) && mapping) { 465 if (page_mapped(page) && mapping) {
484 /*
485 * No unmapping if we do not swap
486 */
487 if (!sc->may_swap)
488 goto keep_locked;
489
490 switch (try_to_unmap(page, 0)) { 466 switch (try_to_unmap(page, 0)) {
491 case SWAP_FAIL: 467 case SWAP_FAIL:
492 goto activate_locked; 468 goto activate_locked;
@@ -561,7 +537,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
561 537
562free_it: 538free_it:
563 unlock_page(page); 539 unlock_page(page);
564 reclaimed++; 540 nr_reclaimed++;
565 if (!pagevec_add(&freed_pvec, page)) 541 if (!pagevec_add(&freed_pvec, page))
566 __pagevec_release_nonlru(&freed_pvec); 542 __pagevec_release_nonlru(&freed_pvec);
567 continue; 543 continue;
@@ -579,483 +555,8 @@ keep:
579 if (pagevec_count(&freed_pvec)) 555 if (pagevec_count(&freed_pvec))
580 __pagevec_release_nonlru(&freed_pvec); 556 __pagevec_release_nonlru(&freed_pvec);
581 mod_page_state(pgactivate, pgactivate); 557 mod_page_state(pgactivate, pgactivate);
582 sc->nr_reclaimed += reclaimed; 558 return nr_reclaimed;
583 return reclaimed;
584}
585
586#ifdef CONFIG_MIGRATION
587static inline void move_to_lru(struct page *page)
588{
589 list_del(&page->lru);
590 if (PageActive(page)) {
591 /*
592 * lru_cache_add_active checks that
593 * the PG_active bit is off.
594 */
595 ClearPageActive(page);
596 lru_cache_add_active(page);
597 } else {
598 lru_cache_add(page);
599 }
600 put_page(page);
601}
602
603/*
604 * Add isolated pages on the list back to the LRU.
605 *
606 * returns the number of pages put back.
607 */
608int putback_lru_pages(struct list_head *l)
609{
610 struct page *page;
611 struct page *page2;
612 int count = 0;
613
614 list_for_each_entry_safe(page, page2, l, lru) {
615 move_to_lru(page);
616 count++;
617 }
618 return count;
619}
620
621/*
622 * Non migratable page
623 */
624int fail_migrate_page(struct page *newpage, struct page *page)
625{
626 return -EIO;
627}
628EXPORT_SYMBOL(fail_migrate_page);
629
630/*
631 * swapout a single page
632 * page is locked upon entry, unlocked on exit
633 */
634static int swap_page(struct page *page)
635{
636 struct address_space *mapping = page_mapping(page);
637
638 if (page_mapped(page) && mapping)
639 if (try_to_unmap(page, 1) != SWAP_SUCCESS)
640 goto unlock_retry;
641
642 if (PageDirty(page)) {
643 /* Page is dirty, try to write it out here */
644 switch(pageout(page, mapping)) {
645 case PAGE_KEEP:
646 case PAGE_ACTIVATE:
647 goto unlock_retry;
648
649 case PAGE_SUCCESS:
650 goto retry;
651
652 case PAGE_CLEAN:
653 ; /* try to free the page below */
654 }
655 }
656
657 if (PagePrivate(page)) {
658 if (!try_to_release_page(page, GFP_KERNEL) ||
659 (!mapping && page_count(page) == 1))
660 goto unlock_retry;
661 }
662
663 if (remove_mapping(mapping, page)) {
664 /* Success */
665 unlock_page(page);
666 return 0;
667 }
668
669unlock_retry:
670 unlock_page(page);
671
672retry:
673 return -EAGAIN;
674}
675EXPORT_SYMBOL(swap_page);
676
677/*
678 * Page migration was first developed in the context of the memory hotplug
679 * project. The main authors of the migration code are:
680 *
681 * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
682 * Hirokazu Takahashi <taka@valinux.co.jp>
683 * Dave Hansen <haveblue@us.ibm.com>
684 * Christoph Lameter <clameter@sgi.com>
685 */
686
687/*
688 * Remove references for a page and establish the new page with the correct
689 * basic settings to be able to stop accesses to the page.
690 */
691int migrate_page_remove_references(struct page *newpage,
692 struct page *page, int nr_refs)
693{
694 struct address_space *mapping = page_mapping(page);
695 struct page **radix_pointer;
696
697 /*
698 * Avoid doing any of the following work if the page count
699 * indicates that the page is in use or truncate has removed
700 * the page.
701 */
702 if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
703 return -EAGAIN;
704
705 /*
706 * Establish swap ptes for anonymous pages or destroy pte
707 * maps for files.
708 *
709 * In order to reestablish file backed mappings the fault handlers
710 * will take the radix tree_lock which may then be used to stop
711 * processes from accessing this page until the new page is ready.
712 *
713 * A process accessing via a swap pte (an anonymous page) will take a
714 * page_lock on the old page which will block the process until the
715 * migration attempt is complete. At that time the PageSwapCache bit
716 * will be examined. If the page was migrated then the PageSwapCache
717 * bit will be clear and the operation to retrieve the page will be
718 * retried which will find the new page in the radix tree. Then a new
719 * direct mapping may be generated based on the radix tree contents.
720 *
721 * If the page was not migrated then the PageSwapCache bit
722 * is still set and the operation may continue.
723 */
724 if (try_to_unmap(page, 1) == SWAP_FAIL)
725 /* A vma has VM_LOCKED set -> Permanent failure */
726 return -EPERM;
727
728 /*
729 * Give up if we were unable to remove all mappings.
730 */
731 if (page_mapcount(page))
732 return -EAGAIN;
733
734 write_lock_irq(&mapping->tree_lock);
735
736 radix_pointer = (struct page **)radix_tree_lookup_slot(
737 &mapping->page_tree,
738 page_index(page));
739
740 if (!page_mapping(page) || page_count(page) != nr_refs ||
741 *radix_pointer != page) {
742 write_unlock_irq(&mapping->tree_lock);
743 return -EAGAIN;
744 }
745
746 /*
747 * Now we know that no one else is looking at the page.
748 *
749 * Certain minimal information about a page must be available
750 * in order for other subsystems to properly handle the page if they
751 * find it through the radix tree update before we are finished
752 * copying the page.
753 */
754 get_page(newpage);
755 newpage->index = page->index;
756 newpage->mapping = page->mapping;
757 if (PageSwapCache(page)) {
758 SetPageSwapCache(newpage);
759 set_page_private(newpage, page_private(page));
760 }
761
762 *radix_pointer = newpage;
763 __put_page(page);
764 write_unlock_irq(&mapping->tree_lock);
765
766 return 0;
767}
768EXPORT_SYMBOL(migrate_page_remove_references);
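
The function above only replaces the radix-tree slot after the reference count proves that nobody else can still reach the old page; otherwise it backs off with -EAGAIN. A tiny userspace model of that "check the expected count, then swap the slot" step, using a plain array in place of the radix tree (illustrative only, names invented):

    #include <stdio.h>

    struct fake_page { int count; int index; };

    /* replace slot[index] with newpage only if oldpage holds exactly the
     * expected number of references; otherwise report -EAGAIN (-11) */
    static int replace_slot(struct fake_page **slot, int index,
                            struct fake_page *oldpage, struct fake_page *newpage,
                            int expected_refs)
    {
        if (slot[index] != oldpage || oldpage->count != expected_refs)
            return -11;                /* someone still uses it: try again later */

        newpage->count++;              /* the slot now pins the new page */
        newpage->index = oldpage->index;
        slot[index] = newpage;
        oldpage->count--;              /* drop the slot's reference to the old page */
        return 0;
    }

    int main(void)
    {
        struct fake_page oldpage = { .count = 2, .index = 7 };  /* slot + caller */
        struct fake_page newpage = { .count = 1, .index = 0 };  /* caller only */
        struct fake_page *tree[16] = { [7] = &oldpage };

        printf("rc=%d, slot now holds %s\n",
               replace_slot(tree, 7, &oldpage, &newpage, 2),
               tree[7] == &newpage ? "newpage" : "oldpage");
        return 0;
    }
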
769
770/*
771 * Copy the page to its new location
772 */
773void migrate_page_copy(struct page *newpage, struct page *page)
774{
775 copy_highpage(newpage, page);
776
777 if (PageError(page))
778 SetPageError(newpage);
779 if (PageReferenced(page))
780 SetPageReferenced(newpage);
781 if (PageUptodate(page))
782 SetPageUptodate(newpage);
783 if (PageActive(page))
784 SetPageActive(newpage);
785 if (PageChecked(page))
786 SetPageChecked(newpage);
787 if (PageMappedToDisk(page))
788 SetPageMappedToDisk(newpage);
789
790 if (PageDirty(page)) {
791 clear_page_dirty_for_io(page);
792 set_page_dirty(newpage);
793 }
794
795 ClearPageSwapCache(page);
796 ClearPageActive(page);
797 ClearPagePrivate(page);
798 set_page_private(page, 0);
799 page->mapping = NULL;
800
801 /*
802 * If any waiters have accumulated on the new page then
803 * wake them up.
804 */
805 if (PageWriteback(newpage))
806 end_page_writeback(newpage);
807}
808EXPORT_SYMBOL(migrate_page_copy);
809
810/*
811 * Common logic to directly migrate a single page suitable for
812 * pages that do not use PagePrivate.
813 *
814 * Pages are locked upon entry and exit.
815 */
816int migrate_page(struct page *newpage, struct page *page)
817{
818 int rc;
819
820 BUG_ON(PageWriteback(page)); /* Writeback must be complete */
821
822 rc = migrate_page_remove_references(newpage, page, 2);
823
824 if (rc)
825 return rc;
826
827 migrate_page_copy(newpage, page);
828
829 /*
830 * Remove auxiliary swap entries and replace
831 * them with real ptes.
832 *
833 * Note that a real pte entry will allow processes that are not
834 * waiting on the page lock to use the new page via the page tables
835 * before the new page is unlocked.
836 */
837 remove_from_swap(newpage);
838 return 0;
839} 559}
840EXPORT_SYMBOL(migrate_page);
841
842/*
843 * migrate_pages
844 *
845 * Two lists are passed to this function. The first list
846 * contains the pages isolated from the LRU to be migrated.
847 * The second list contains new pages that the pages isolated
848 * can be moved to. If the second list is NULL then all
849 * pages are swapped out.
850 *
851 * The function returns after 10 attempts or if no pages
852 * are movable anymore because the "to" list has become empty
853 * or no retryable pages exist anymore.
854 *
855 * Return: Number of pages not migrated when "to" ran empty.
856 */
857int migrate_pages(struct list_head *from, struct list_head *to,
858 struct list_head *moved, struct list_head *failed)
859{
860 int retry;
861 int nr_failed = 0;
862 int pass = 0;
863 struct page *page;
864 struct page *page2;
865 int swapwrite = current->flags & PF_SWAPWRITE;
866 int rc;
867
868 if (!swapwrite)
869 current->flags |= PF_SWAPWRITE;
870
871redo:
872 retry = 0;
873
874 list_for_each_entry_safe(page, page2, from, lru) {
875 struct page *newpage = NULL;
876 struct address_space *mapping;
877
878 cond_resched();
879
880 rc = 0;
881 if (page_count(page) == 1)
882 /* page was freed from under us. So we are done. */
883 goto next;
884
885 if (to && list_empty(to))
886 break;
887
888 /*
889 * Skip locked pages during the first two passes to give the
890 * functions holding the lock time to release the page. Later we
891 * use lock_page() to have a higher chance of acquiring the
892 * lock.
893 */
894 rc = -EAGAIN;
895 if (pass > 2)
896 lock_page(page);
897 else
898 if (TestSetPageLocked(page))
899 goto next;
900
901 /*
902 * Only wait on writeback if we have already done a pass where
903 * we may have triggered writeouts for lots of pages.
904 */
905 if (pass > 0) {
906 wait_on_page_writeback(page);
907 } else {
908 if (PageWriteback(page))
909 goto unlock_page;
910 }
911
912 /*
913 * Anonymous pages must have swap cache references otherwise
914 * the information contained in the page maps cannot be
915 * preserved.
916 */
917 if (PageAnon(page) && !PageSwapCache(page)) {
918 if (!add_to_swap(page, GFP_KERNEL)) {
919 rc = -ENOMEM;
920 goto unlock_page;
921 }
922 }
923
924 if (!to) {
925 rc = swap_page(page);
926 goto next;
927 }
928
929 newpage = lru_to_page(to);
930 lock_page(newpage);
931
932 /*
933 * Pages are properly locked and writeback is complete.
934 * Try to migrate the page.
935 */
936 mapping = page_mapping(page);
937 if (!mapping)
938 goto unlock_both;
939
940 if (mapping->a_ops->migratepage) {
941 /*
942 * Most pages have a mapping and most filesystems
943 * should provide a migration function. Anonymous
944 * pages are part of swap space which also has its
945 * own migration function. This is the most common
946 * path for page migration.
947 */
948 rc = mapping->a_ops->migratepage(newpage, page);
949 goto unlock_both;
950 }
951
952 /*
953 * Default handling if a filesystem does not provide
954 * a migration function. We can only migrate clean
955 * pages so try to write out any dirty pages first.
956 */
957 if (PageDirty(page)) {
958 switch (pageout(page, mapping)) {
959 case PAGE_KEEP:
960 case PAGE_ACTIVATE:
961 goto unlock_both;
962
963 case PAGE_SUCCESS:
964 unlock_page(newpage);
965 goto next;
966
967 case PAGE_CLEAN:
968 ; /* try to migrate the page below */
969 }
970 }
971
972 /*
973 * Buffers are managed in a filesystem specific way.
974 * We must have no buffers or drop them.
975 */
976 if (!page_has_buffers(page) ||
977 try_to_release_page(page, GFP_KERNEL)) {
978 rc = migrate_page(newpage, page);
979 goto unlock_both;
980 }
981
982 /*
983 * On early passes with mapped pages simply
984 * retry. There may be a lock held for some
985 * buffers that may go away. Later
986 * swap them out.
987 */
988 if (pass > 4) {
989 /*
990 * Persistently unable to drop buffers..... As a
991 * measure of last resort we fall back to
992 * swap_page().
993 */
994 unlock_page(newpage);
995 newpage = NULL;
996 rc = swap_page(page);
997 goto next;
998 }
999
1000unlock_both:
1001 unlock_page(newpage);
1002
1003unlock_page:
1004 unlock_page(page);
1005
1006next:
1007 if (rc == -EAGAIN) {
1008 retry++;
1009 } else if (rc) {
1010 /* Permanent failure */
1011 list_move(&page->lru, failed);
1012 nr_failed++;
1013 } else {
1014 if (newpage) {
1015 /* Successful migration. Return page to LRU */
1016 move_to_lru(newpage);
1017 }
1018 list_move(&page->lru, moved);
1019 }
1020 }
1021 if (retry && pass++ < 10)
1022 goto redo;
1023
1024 if (!swapwrite)
1025 current->flags &= ~PF_SWAPWRITE;
1026
1027 return nr_failed + retry;
1028}
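
migrate_pages() above retries transient failures (-EAGAIN) for up to ten passes and grows more patient as the passes go by: it blocks on the page lock only after pass 2, waits on writeback only after pass 0, and falls back to swap_page() for stubborn buffers after pass 4. A self-contained userspace model of that pass structure, with invented page states:

    #include <stdio.h>

    #define NPAGES 4
    #define EAGAIN 11
    #define EIO    5

    /* pretend page i succeeds once the pass number reaches readiness[i];
     * a negative readiness marks a permanently unmigratable page */
    static int try_one(int readiness, int pass)
    {
        if (readiness < 0)
            return -EIO;                        /* permanent failure */
        return pass >= readiness ? 0 : -EAGAIN; /* transient failure until ready */
    }

    int main(void)
    {
        int readiness[NPAGES] = { 0, 3, 6, -1 };
        int done[NPAGES] = { 0 };
        int pass = 0, retry, nr_failed = 0;

        do {
            retry = 0;
            for (int i = 0; i < NPAGES; i++) {
                if (done[i])
                    continue;
                int rc = try_one(readiness[i], pass);
                if (rc == -EAGAIN)
                    retry++;                     /* try again on the next pass */
                else if (rc) {
                    done[i] = 1;                 /* permanent failure: give up */
                    nr_failed++;
                } else
                    done[i] = 1;                 /* migrated */
            }
        } while (retry && pass++ < 10);

        printf("failed=%d still-retrying=%d after pass %d\n",
               nr_failed, retry, pass);
        return 0;
    }
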
1029
1030/*
1031 * Isolate one page from the LRU lists and put it on the
1032 * indicated list with elevated refcount.
1033 *
1034 * Result:
1035 * 0 = page not on LRU list
1036 * 1 = page removed from LRU list and added to the specified list.
1037 */
1038int isolate_lru_page(struct page *page)
1039{
1040 int ret = 0;
1041
1042 if (PageLRU(page)) {
1043 struct zone *zone = page_zone(page);
1044 spin_lock_irq(&zone->lru_lock);
1045 if (TestClearPageLRU(page)) {
1046 ret = 1;
1047 get_page(page);
1048 if (PageActive(page))
1049 del_page_from_active_list(zone, page);
1050 else
1051 del_page_from_inactive_list(zone, page);
1052 }
1053 spin_unlock_irq(&zone->lru_lock);
1054 }
1055
1056 return ret;
1057}
1058#endif
1059 560
1060/* 561/*
1061 * zone->lru_lock is heavily contended. Some of the functions that 562 * zone->lru_lock is heavily contended. Some of the functions that
@@ -1074,32 +575,35 @@ int isolate_lru_page(struct page *page)
1074 * 575 *
1075 * returns how many pages were moved onto *@dst. 576 * returns how many pages were moved onto *@dst.
1076 */ 577 */
1077static int isolate_lru_pages(int nr_to_scan, struct list_head *src, 578static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1078 struct list_head *dst, int *scanned) 579 struct list_head *src, struct list_head *dst,
580 unsigned long *scanned)
1079{ 581{
1080 int nr_taken = 0; 582 unsigned long nr_taken = 0;
1081 struct page *page; 583 struct page *page;
1082 int scan = 0; 584 unsigned long scan;
1083 585
1084 while (scan++ < nr_to_scan && !list_empty(src)) { 586 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
587 struct list_head *target;
1085 page = lru_to_page(src); 588 page = lru_to_page(src);
1086 prefetchw_prev_lru_page(page, src, flags); 589 prefetchw_prev_lru_page(page, src, flags);
1087 590
1088 if (!TestClearPageLRU(page)) 591 BUG_ON(!PageLRU(page));
1089 BUG(); 592
1090 list_del(&page->lru); 593 list_del(&page->lru);
1091 if (get_page_testone(page)) { 594 target = src;
595 if (likely(get_page_unless_zero(page))) {
1092 /* 596 /*
1093 * It is being freed elsewhere 597 * Be careful not to clear PageLRU until after we're
598 * sure the page is not being freed elsewhere -- the
599 * page release code relies on it.
1094 */ 600 */
1095 __put_page(page); 601 ClearPageLRU(page);
1096 SetPageLRU(page); 602 target = dst;
1097 list_add(&page->lru, src);
1098 continue;
1099 } else {
1100 list_add(&page->lru, dst);
1101 nr_taken++; 603 nr_taken++;
1102 } 604 } /* else it is being freed elsewhere */
605
606 list_add(&page->lru, target);
1103 } 607 }
1104 608
1105 *scanned = scan; 609 *scanned = scan;
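
The rewritten isolate_lru_pages() only clears PageLRU after get_page_unless_zero() has succeeded, because the page-release path relies on the LRU bit while dropping the last reference. A small C11 sketch of the "take a reference only if the count is not already zero" primitive; this is a userspace model, not the kernel's implementation:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* increment *count unless it is already zero; says whether we got a ref */
    static bool get_ref_unless_zero(atomic_int *count)
    {
        int old = atomic_load(count);
        while (old != 0) {
            if (atomic_compare_exchange_weak(count, &old, old + 1))
                return true;        /* we now hold a reference */
            /* old was reloaded by the failed CAS; retry */
        }
        return false;               /* the page is being freed elsewhere */
    }

    int main(void)
    {
        atomic_int live = 2, dying = 0;

        printf("live:  %d (count now %d)\n",
               get_ref_unless_zero(&live), atomic_load(&live));
        printf("dying: %d (count now %d)\n",
               get_ref_unless_zero(&dying), atomic_load(&dying));
        return 0;
    }
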
@@ -1107,23 +611,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
1107} 611}
1108 612
1109/* 613/*
1110 * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed 614 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
615 * of reclaimed pages
1111 */ 616 */
1112static void shrink_cache(struct zone *zone, struct scan_control *sc) 617static unsigned long shrink_inactive_list(unsigned long max_scan,
618 struct zone *zone, struct scan_control *sc)
1113{ 619{
1114 LIST_HEAD(page_list); 620 LIST_HEAD(page_list);
1115 struct pagevec pvec; 621 struct pagevec pvec;
1116 int max_scan = sc->nr_to_scan; 622 unsigned long nr_scanned = 0;
623 unsigned long nr_reclaimed = 0;
1117 624
1118 pagevec_init(&pvec, 1); 625 pagevec_init(&pvec, 1);
1119 626
1120 lru_add_drain(); 627 lru_add_drain();
1121 spin_lock_irq(&zone->lru_lock); 628 spin_lock_irq(&zone->lru_lock);
1122 while (max_scan > 0) { 629 do {
1123 struct page *page; 630 struct page *page;
1124 int nr_taken; 631 unsigned long nr_taken;
1125 int nr_scan; 632 unsigned long nr_scan;
1126 int nr_freed; 633 unsigned long nr_freed;
1127 634
1128 nr_taken = isolate_lru_pages(sc->swap_cluster_max, 635 nr_taken = isolate_lru_pages(sc->swap_cluster_max,
1129 &zone->inactive_list, 636 &zone->inactive_list,
@@ -1132,12 +639,9 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
1132 zone->pages_scanned += nr_scan; 639 zone->pages_scanned += nr_scan;
1133 spin_unlock_irq(&zone->lru_lock); 640 spin_unlock_irq(&zone->lru_lock);
1134 641
1135 if (nr_taken == 0) 642 nr_scanned += nr_scan;
1136 goto done; 643 nr_freed = shrink_page_list(&page_list, sc);
1137 644 nr_reclaimed += nr_freed;
1138 max_scan -= nr_scan;
1139 nr_freed = shrink_list(&page_list, sc);
1140
1141 local_irq_disable(); 645 local_irq_disable();
1142 if (current_is_kswapd()) { 646 if (current_is_kswapd()) {
1143 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); 647 __mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
@@ -1146,14 +650,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
1146 __mod_page_state_zone(zone, pgscan_direct, nr_scan); 650 __mod_page_state_zone(zone, pgscan_direct, nr_scan);
1147 __mod_page_state_zone(zone, pgsteal, nr_freed); 651 __mod_page_state_zone(zone, pgsteal, nr_freed);
1148 652
653 if (nr_taken == 0)
654 goto done;
655
1149 spin_lock(&zone->lru_lock); 656 spin_lock(&zone->lru_lock);
1150 /* 657 /*
1151 * Put back any unfreeable pages. 658 * Put back any unfreeable pages.
1152 */ 659 */
1153 while (!list_empty(&page_list)) { 660 while (!list_empty(&page_list)) {
1154 page = lru_to_page(&page_list); 661 page = lru_to_page(&page_list);
1155 if (TestSetPageLRU(page)) 662 BUG_ON(PageLRU(page));
1156 BUG(); 663 SetPageLRU(page);
1157 list_del(&page->lru); 664 list_del(&page->lru);
1158 if (PageActive(page)) 665 if (PageActive(page))
1159 add_page_to_active_list(zone, page); 666 add_page_to_active_list(zone, page);
@@ -1165,10 +672,12 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
1165 spin_lock_irq(&zone->lru_lock); 672 spin_lock_irq(&zone->lru_lock);
1166 } 673 }
1167 } 674 }
1168 } 675 } while (nr_scanned < max_scan);
1169 spin_unlock_irq(&zone->lru_lock); 676 spin_unlock(&zone->lru_lock);
1170done: 677done:
678 local_irq_enable();
1171 pagevec_release(&pvec); 679 pagevec_release(&pvec);
680 return nr_reclaimed;
1172} 681}
1173 682
1174/* 683/*
@@ -1188,13 +697,12 @@ done:
1188 * The downside is that we have to touch page->_count against each page. 697 * The downside is that we have to touch page->_count against each page.
1189 * But we had to alter page->flags anyway. 698 * But we had to alter page->flags anyway.
1190 */ 699 */
1191static void 700static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1192refill_inactive_zone(struct zone *zone, struct scan_control *sc) 701 struct scan_control *sc)
1193{ 702{
1194 int pgmoved; 703 unsigned long pgmoved;
1195 int pgdeactivate = 0; 704 int pgdeactivate = 0;
1196 int pgscanned; 705 unsigned long pgscanned;
1197 int nr_pages = sc->nr_to_scan;
1198 LIST_HEAD(l_hold); /* The pages which were snipped off */ 706 LIST_HEAD(l_hold); /* The pages which were snipped off */
1199 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ 707 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
1200 LIST_HEAD(l_active); /* Pages to go onto the active_list */ 708 LIST_HEAD(l_active); /* Pages to go onto the active_list */
@@ -1202,7 +710,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1202 struct pagevec pvec; 710 struct pagevec pvec;
1203 int reclaim_mapped = 0; 711 int reclaim_mapped = 0;
1204 712
1205 if (unlikely(sc->may_swap)) { 713 if (sc->may_swap) {
1206 long mapped_ratio; 714 long mapped_ratio;
1207 long distress; 715 long distress;
1208 long swap_tendency; 716 long swap_tendency;
@@ -1272,10 +780,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1272 while (!list_empty(&l_inactive)) { 780 while (!list_empty(&l_inactive)) {
1273 page = lru_to_page(&l_inactive); 781 page = lru_to_page(&l_inactive);
1274 prefetchw_prev_lru_page(page, &l_inactive, flags); 782 prefetchw_prev_lru_page(page, &l_inactive, flags);
1275 if (TestSetPageLRU(page)) 783 BUG_ON(PageLRU(page));
1276 BUG(); 784 SetPageLRU(page);
1277 if (!TestClearPageActive(page)) 785 BUG_ON(!PageActive(page));
1278 BUG(); 786 ClearPageActive(page);
787
1279 list_move(&page->lru, &zone->inactive_list); 788 list_move(&page->lru, &zone->inactive_list);
1280 pgmoved++; 789 pgmoved++;
1281 if (!pagevec_add(&pvec, page)) { 790 if (!pagevec_add(&pvec, page)) {
@@ -1301,8 +810,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1301 while (!list_empty(&l_active)) { 810 while (!list_empty(&l_active)) {
1302 page = lru_to_page(&l_active); 811 page = lru_to_page(&l_active);
1303 prefetchw_prev_lru_page(page, &l_active, flags); 812 prefetchw_prev_lru_page(page, &l_active, flags);
1304 if (TestSetPageLRU(page)) 813 BUG_ON(PageLRU(page));
1305 BUG(); 814 SetPageLRU(page);
1306 BUG_ON(!PageActive(page)); 815 BUG_ON(!PageActive(page));
1307 list_move(&page->lru, &zone->active_list); 816 list_move(&page->lru, &zone->active_list);
1308 pgmoved++; 817 pgmoved++;
@@ -1327,11 +836,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
1327/* 836/*
1328 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 837 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1329 */ 838 */
1330static void 839static unsigned long shrink_zone(int priority, struct zone *zone,
1331shrink_zone(struct zone *zone, struct scan_control *sc) 840 struct scan_control *sc)
1332{ 841{
1333 unsigned long nr_active; 842 unsigned long nr_active;
1334 unsigned long nr_inactive; 843 unsigned long nr_inactive;
844 unsigned long nr_to_scan;
845 unsigned long nr_reclaimed = 0;
1335 846
1336 atomic_inc(&zone->reclaim_in_progress); 847 atomic_inc(&zone->reclaim_in_progress);
1337 848
@@ -1339,14 +850,14 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
1339 * Add one to `nr_to_scan' just to make sure that the kernel will 850 * Add one to `nr_to_scan' just to make sure that the kernel will
1340 * slowly sift through the active list. 851 * slowly sift through the active list.
1341 */ 852 */
1342 zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; 853 zone->nr_scan_active += (zone->nr_active >> priority) + 1;
1343 nr_active = zone->nr_scan_active; 854 nr_active = zone->nr_scan_active;
1344 if (nr_active >= sc->swap_cluster_max) 855 if (nr_active >= sc->swap_cluster_max)
1345 zone->nr_scan_active = 0; 856 zone->nr_scan_active = 0;
1346 else 857 else
1347 nr_active = 0; 858 nr_active = 0;
1348 859
1349 zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; 860 zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
1350 nr_inactive = zone->nr_scan_inactive; 861 nr_inactive = zone->nr_scan_inactive;
1351 if (nr_inactive >= sc->swap_cluster_max) 862 if (nr_inactive >= sc->swap_cluster_max)
1352 zone->nr_scan_inactive = 0; 863 zone->nr_scan_inactive = 0;
@@ -1355,23 +866,25 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
1355 866
1356 while (nr_active || nr_inactive) { 867 while (nr_active || nr_inactive) {
1357 if (nr_active) { 868 if (nr_active) {
1358 sc->nr_to_scan = min(nr_active, 869 nr_to_scan = min(nr_active,
1359 (unsigned long)sc->swap_cluster_max); 870 (unsigned long)sc->swap_cluster_max);
1360 nr_active -= sc->nr_to_scan; 871 nr_active -= nr_to_scan;
1361 refill_inactive_zone(zone, sc); 872 shrink_active_list(nr_to_scan, zone, sc);
1362 } 873 }
1363 874
1364 if (nr_inactive) { 875 if (nr_inactive) {
1365 sc->nr_to_scan = min(nr_inactive, 876 nr_to_scan = min(nr_inactive,
1366 (unsigned long)sc->swap_cluster_max); 877 (unsigned long)sc->swap_cluster_max);
1367 nr_inactive -= sc->nr_to_scan; 878 nr_inactive -= nr_to_scan;
1368 shrink_cache(zone, sc); 879 nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
880 sc);
1369 } 881 }
1370 } 882 }
1371 883
1372 throttle_vm_writeout(); 884 throttle_vm_writeout();
1373 885
1374 atomic_dec(&zone->reclaim_in_progress); 886 atomic_dec(&zone->reclaim_in_progress);
887 return nr_reclaimed;
1375} 888}
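
shrink_zone() above adds (zone->nr_active >> priority) + 1 to a per-zone scan counter on every call, defers the scan while that counter is below swap_cluster_max, and otherwise works it off in swap_cluster_max-sized batches. A worked userspace example of that arithmetic with toy numbers, showing how the scan rate ramps up as the priority drops:

    #include <stdio.h>

    int main(void)
    {
        unsigned long nr_active = 100000;     /* pages on the zone's active list */
        unsigned long swap_cluster_max = 32;
        unsigned long nr_scan_active = 0;

        for (int priority = 12; priority >= 0; priority--) {
            nr_scan_active += (nr_active >> priority) + 1;

            unsigned long todo = nr_scan_active;
            if (todo < swap_cluster_max)
                continue;                     /* too little: defer to a later call */
            nr_scan_active = 0;

            unsigned long scanned = 0;
            while (todo) {                    /* batch the work */
                unsigned long chunk = todo < swap_cluster_max ? todo
                                                              : swap_cluster_max;
                todo -= chunk;
                scanned += chunk;             /* shrink_active_list(chunk, ...) */
            }
            printf("priority %2d: scanned %lu active pages this call\n",
                   priority, scanned);
        }
        return 0;
    }
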
1376 889
1377/* 890/*
@@ -1390,9 +903,10 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
1390 * If a zone is deemed to be full of pinned pages then just give it a light 903 * If a zone is deemed to be full of pinned pages then just give it a light
1391 * scan then give up on it. 904 * scan then give up on it.
1392 */ 905 */
1393static void 906static unsigned long shrink_zones(int priority, struct zone **zones,
1394shrink_caches(struct zone **zones, struct scan_control *sc) 907 struct scan_control *sc)
1395{ 908{
909 unsigned long nr_reclaimed = 0;
1396 int i; 910 int i;
1397 911
1398 for (i = 0; zones[i] != NULL; i++) { 912 for (i = 0; zones[i] != NULL; i++) {
@@ -1404,15 +918,16 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
1404 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) 918 if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1405 continue; 919 continue;
1406 920
1407 zone->temp_priority = sc->priority; 921 zone->temp_priority = priority;
1408 if (zone->prev_priority > sc->priority) 922 if (zone->prev_priority > priority)
1409 zone->prev_priority = sc->priority; 923 zone->prev_priority = priority;
1410 924
1411 if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) 925 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1412 continue; /* Let kswapd poll it */ 926 continue; /* Let kswapd poll it */
1413 927
1414 shrink_zone(zone, sc); 928 nr_reclaimed += shrink_zone(priority, zone, sc);
1415 } 929 }
930 return nr_reclaimed;
1416} 931}
1417 932
1418/* 933/*
@@ -1428,19 +943,21 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
1428 * holds filesystem locks which prevent writeout this might not work, and the 943 * holds filesystem locks which prevent writeout this might not work, and the
1429 * allocation attempt will fail. 944 * allocation attempt will fail.
1430 */ 945 */
1431int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) 946unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1432{ 947{
1433 int priority; 948 int priority;
1434 int ret = 0; 949 int ret = 0;
1435 int total_scanned = 0, total_reclaimed = 0; 950 unsigned long total_scanned = 0;
951 unsigned long nr_reclaimed = 0;
1436 struct reclaim_state *reclaim_state = current->reclaim_state; 952 struct reclaim_state *reclaim_state = current->reclaim_state;
1437 struct scan_control sc;
1438 unsigned long lru_pages = 0; 953 unsigned long lru_pages = 0;
1439 int i; 954 int i;
1440 955 struct scan_control sc = {
1441 sc.gfp_mask = gfp_mask; 956 .gfp_mask = gfp_mask,
1442 sc.may_writepage = !laptop_mode; 957 .may_writepage = !laptop_mode,
1443 sc.may_swap = 1; 958 .swap_cluster_max = SWAP_CLUSTER_MAX,
959 .may_swap = 1,
960 };
1444 961
1445 inc_page_state(allocstall); 962 inc_page_state(allocstall);
1446 963
@@ -1457,20 +974,16 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1457 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 974 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1458 sc.nr_mapped = read_page_state(nr_mapped); 975 sc.nr_mapped = read_page_state(nr_mapped);
1459 sc.nr_scanned = 0; 976 sc.nr_scanned = 0;
1460 sc.nr_reclaimed = 0;
1461 sc.priority = priority;
1462 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1463 if (!priority) 977 if (!priority)
1464 disable_swap_token(); 978 disable_swap_token();
1465 shrink_caches(zones, &sc); 979 nr_reclaimed += shrink_zones(priority, zones, &sc);
1466 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); 980 shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
1467 if (reclaim_state) { 981 if (reclaim_state) {
1468 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 982 nr_reclaimed += reclaim_state->reclaimed_slab;
1469 reclaim_state->reclaimed_slab = 0; 983 reclaim_state->reclaimed_slab = 0;
1470 } 984 }
1471 total_scanned += sc.nr_scanned; 985 total_scanned += sc.nr_scanned;
1472 total_reclaimed += sc.nr_reclaimed; 986 if (nr_reclaimed >= sc.swap_cluster_max) {
1473 if (total_reclaimed >= sc.swap_cluster_max) {
1474 ret = 1; 987 ret = 1;
1475 goto out; 988 goto out;
1476 } 989 }
@@ -1482,7 +995,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
1482 * that's undesirable in laptop mode, where we *want* lumpy 995 * that's undesirable in laptop mode, where we *want* lumpy
1483 * writeout. So in laptop mode, write out the whole world. 996 * writeout. So in laptop mode, write out the whole world.
1484 */ 997 */
1485 if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { 998 if (total_scanned > sc.swap_cluster_max +
999 sc.swap_cluster_max / 2) {
1486 wakeup_pdflush(laptop_mode ? 0 : total_scanned); 1000 wakeup_pdflush(laptop_mode ? 0 : total_scanned);
1487 sc.may_writepage = 1; 1001 sc.may_writepage = 1;
1488 } 1002 }
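
try_to_free_pages() now keeps the reclaim total in a local, walks priorities from DEF_PRIORITY down to 0, stops as soon as swap_cluster_max pages have been freed, and arms background writeout once total_scanned exceeds one and a half times that target. A compact standalone model of that control flow; the scan and reclaim amounts per priority are made up:

    #include <stdio.h>

    #define DEF_PRIORITY     12
    #define SWAP_CLUSTER_MAX 32UL

    int main(void)
    {
        unsigned long nr_reclaimed = 0, total_scanned = 0;
        int may_writepage = 0;                          /* as in laptop mode */

        for (int priority = DEF_PRIORITY; priority >= 0; priority--) {
            unsigned long scanned = 4096 >> priority;   /* pretend scan result */
            unsigned long freed   = scanned / 8;        /* pretend reclaim result */

            total_scanned += scanned;
            nr_reclaimed  += freed;

            if (nr_reclaimed >= SWAP_CLUSTER_MAX) {
                printf("done at priority %d: reclaimed %lu\n",
                       priority, nr_reclaimed);
                break;
            }
            /* lots of scanning, little progress: start background writeout */
            if (total_scanned > SWAP_CLUSTER_MAX + SWAP_CLUSTER_MAX / 2)
                may_writepage = 1;
        }
        printf("scanned %lu, may_writepage=%d\n", total_scanned, may_writepage);
        return 0;
    }
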
@@ -1528,22 +1042,26 @@ out:
1528 * the page allocator fallback scheme to ensure that aging of pages is balanced 1042 * the page allocator fallback scheme to ensure that aging of pages is balanced
1529 * across the zones. 1043 * across the zones.
1530 */ 1044 */
1531static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) 1045static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
1046 int order)
1532{ 1047{
1533 int to_free = nr_pages; 1048 unsigned long to_free = nr_pages;
1534 int all_zones_ok; 1049 int all_zones_ok;
1535 int priority; 1050 int priority;
1536 int i; 1051 int i;
1537 int total_scanned, total_reclaimed; 1052 unsigned long total_scanned;
1053 unsigned long nr_reclaimed;
1538 struct reclaim_state *reclaim_state = current->reclaim_state; 1054 struct reclaim_state *reclaim_state = current->reclaim_state;
1539 struct scan_control sc; 1055 struct scan_control sc = {
1056 .gfp_mask = GFP_KERNEL,
1057 .may_swap = 1,
1058 .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
1059 };
1540 1060
1541loop_again: 1061loop_again:
1542 total_scanned = 0; 1062 total_scanned = 0;
1543 total_reclaimed = 0; 1063 nr_reclaimed = 0;
1544 sc.gfp_mask = GFP_KERNEL; 1064 sc.may_writepage = !laptop_mode,
1545 sc.may_writepage = !laptop_mode;
1546 sc.may_swap = 1;
1547 sc.nr_mapped = read_page_state(nr_mapped); 1065 sc.nr_mapped = read_page_state(nr_mapped);
1548 1066
1549 inc_page_state(pageoutrun); 1067 inc_page_state(pageoutrun);
@@ -1624,15 +1142,11 @@ scan:
1624 if (zone->prev_priority > priority) 1142 if (zone->prev_priority > priority)
1625 zone->prev_priority = priority; 1143 zone->prev_priority = priority;
1626 sc.nr_scanned = 0; 1144 sc.nr_scanned = 0;
1627 sc.nr_reclaimed = 0; 1145 nr_reclaimed += shrink_zone(priority, zone, &sc);
1628 sc.priority = priority;
1629 sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
1630 shrink_zone(zone, &sc);
1631 reclaim_state->reclaimed_slab = 0; 1146 reclaim_state->reclaimed_slab = 0;
1632 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1147 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1633 lru_pages); 1148 lru_pages);
1634 sc.nr_reclaimed += reclaim_state->reclaimed_slab; 1149 nr_reclaimed += reclaim_state->reclaimed_slab;
1635 total_reclaimed += sc.nr_reclaimed;
1636 total_scanned += sc.nr_scanned; 1150 total_scanned += sc.nr_scanned;
1637 if (zone->all_unreclaimable) 1151 if (zone->all_unreclaimable)
1638 continue; 1152 continue;
@@ -1645,10 +1159,10 @@ scan:
1645 * even in laptop mode 1159 * even in laptop mode
1646 */ 1160 */
1647 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 1161 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
1648 total_scanned > total_reclaimed+total_reclaimed/2) 1162 total_scanned > nr_reclaimed + nr_reclaimed / 2)
1649 sc.may_writepage = 1; 1163 sc.may_writepage = 1;
1650 } 1164 }
1651 if (nr_pages && to_free > total_reclaimed) 1165 if (nr_pages && to_free > nr_reclaimed)
1652 continue; /* swsusp: need to do more work */ 1166 continue; /* swsusp: need to do more work */
1653 if (all_zones_ok) 1167 if (all_zones_ok)
1654 break; /* kswapd: all done */ 1168 break; /* kswapd: all done */
@@ -1665,7 +1179,7 @@ scan:
1665 * matches the direct reclaim path behaviour in terms of impact 1179 * matches the direct reclaim path behaviour in terms of impact
1666 * on zone->*_priority. 1180 * on zone->*_priority.
1667 */ 1181 */
1668 if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages)) 1182 if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
1669 break; 1183 break;
1670 } 1184 }
1671out: 1185out:
@@ -1679,7 +1193,7 @@ out:
1679 goto loop_again; 1193 goto loop_again;
1680 } 1194 }
1681 1195
1682 return total_reclaimed; 1196 return nr_reclaimed;
1683} 1197}
1684 1198
1685/* 1199/*
@@ -1779,24 +1293,31 @@ void wakeup_kswapd(struct zone *zone, int order)
1779 * Try to free `nr_pages' of memory, system-wide. Returns the number of freed 1293 * Try to free `nr_pages' of memory, system-wide. Returns the number of freed
1780 * pages. 1294 * pages.
1781 */ 1295 */
1782int shrink_all_memory(int nr_pages) 1296unsigned long shrink_all_memory(unsigned long nr_pages)
1783{ 1297{
1784 pg_data_t *pgdat; 1298 pg_data_t *pgdat;
1785 int nr_to_free = nr_pages; 1299 unsigned long nr_to_free = nr_pages;
1786 int ret = 0; 1300 unsigned long ret = 0;
1301 unsigned retry = 2;
1787 struct reclaim_state reclaim_state = { 1302 struct reclaim_state reclaim_state = {
1788 .reclaimed_slab = 0, 1303 .reclaimed_slab = 0,
1789 }; 1304 };
1790 1305
1791 current->reclaim_state = &reclaim_state; 1306 current->reclaim_state = &reclaim_state;
1307repeat:
1792 for_each_pgdat(pgdat) { 1308 for_each_pgdat(pgdat) {
1793 int freed; 1309 unsigned long freed;
1310
1794 freed = balance_pgdat(pgdat, nr_to_free, 0); 1311 freed = balance_pgdat(pgdat, nr_to_free, 0);
1795 ret += freed; 1312 ret += freed;
1796 nr_to_free -= freed; 1313 nr_to_free -= freed;
1797 if (nr_to_free <= 0) 1314 if ((long)nr_to_free <= 0)
1798 break; 1315 break;
1799 } 1316 }
1317 if (retry-- && ret < nr_pages) {
1318 blk_congestion_wait(WRITE, HZ/5);
1319 goto repeat;
1320 }
1800 current->reclaim_state = NULL; 1321 current->reclaim_state = NULL;
1801 return ret; 1322 return ret;
1802} 1323}
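
nr_to_free is now an unsigned long, so after `nr_to_free -= freed` it wraps past zero whenever a pass frees more than was asked for; that is why the loop tests `(long)nr_to_free <= 0` instead of comparing the unsigned value. A two-line demonstration of the difference:

    #include <stdio.h>

    int main(void)
    {
        unsigned long nr_to_free = 10;
        unsigned long freed = 15;           /* a pass freed more than requested */

        nr_to_free -= freed;                /* wraps to a huge unsigned value */
        printf("unsigned test: %d\n", nr_to_free <= 0);        /* 0: would loop on */
        printf("signed   test: %d\n", (long)nr_to_free <= 0);  /* 1: stops correctly */
        return 0;
    }
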
@@ -1808,8 +1329,7 @@ int shrink_all_memory(int nr_pages)
1808 away, we get changed to run anywhere: as the first one comes back, 1329 away, we get changed to run anywhere: as the first one comes back,
1809 restore their cpu bindings. */ 1330 restore their cpu bindings. */
1810static int __devinit cpu_callback(struct notifier_block *nfb, 1331static int __devinit cpu_callback(struct notifier_block *nfb,
1811 unsigned long action, 1332 unsigned long action, void *hcpu)
1812 void *hcpu)
1813{ 1333{
1814 pg_data_t *pgdat; 1334 pg_data_t *pgdat;
1815 cpumask_t mask; 1335 cpumask_t mask;
@@ -1829,10 +1349,15 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
1829static int __init kswapd_init(void) 1349static int __init kswapd_init(void)
1830{ 1350{
1831 pg_data_t *pgdat; 1351 pg_data_t *pgdat;
1352
1832 swap_setup(); 1353 swap_setup();
1833 for_each_pgdat(pgdat) 1354 for_each_pgdat(pgdat) {
1834 pgdat->kswapd 1355 pid_t pid;
1835 = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); 1356
1357 pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
1358 BUG_ON(pid < 0);
1359 pgdat->kswapd = find_task_by_pid(pid);
1360 }
1836 total_memory = nr_free_pagecache_pages(); 1361 total_memory = nr_free_pagecache_pages();
1837 hotcpu_notifier(cpu_callback, 0); 1362 hotcpu_notifier(cpu_callback, 0);
1838 return 0; 1363 return 0;
@@ -1874,46 +1399,24 @@ int zone_reclaim_interval __read_mostly = 30*HZ;
1874/* 1399/*
1875 * Try to free up some pages from this zone through reclaim. 1400 * Try to free up some pages from this zone through reclaim.
1876 */ 1401 */
1877int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 1402static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1878{ 1403{
1879 int nr_pages; 1404 /* Minimum pages needed in order to stay on node */
1405 const unsigned long nr_pages = 1 << order;
1880 struct task_struct *p = current; 1406 struct task_struct *p = current;
1881 struct reclaim_state reclaim_state; 1407 struct reclaim_state reclaim_state;
1882 struct scan_control sc; 1408 int priority;
1883 cpumask_t mask; 1409 unsigned long nr_reclaimed = 0;
1884 int node_id; 1410 struct scan_control sc = {
1885 1411 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
1886 if (time_before(jiffies, 1412 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
1887 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) 1413 .nr_mapped = read_page_state(nr_mapped),
1888 return 0; 1414 .swap_cluster_max = max_t(unsigned long, nr_pages,
1889 1415 SWAP_CLUSTER_MAX),
1890 if (!(gfp_mask & __GFP_WAIT) || 1416 .gfp_mask = gfp_mask,
1891 zone->all_unreclaimable || 1417 };
1892 atomic_read(&zone->reclaim_in_progress) > 0 ||
1893 (p->flags & PF_MEMALLOC))
1894 return 0;
1895
1896 node_id = zone->zone_pgdat->node_id;
1897 mask = node_to_cpumask(node_id);
1898 if (!cpus_empty(mask) && node_id != numa_node_id())
1899 return 0;
1900
1901 sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
1902 sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
1903 sc.nr_scanned = 0;
1904 sc.nr_reclaimed = 0;
1905 sc.priority = ZONE_RECLAIM_PRIORITY + 1;
1906 sc.nr_mapped = read_page_state(nr_mapped);
1907 sc.gfp_mask = gfp_mask;
1908 1418
1909 disable_swap_token(); 1419 disable_swap_token();
1910
1911 nr_pages = 1 << order;
1912 if (nr_pages > SWAP_CLUSTER_MAX)
1913 sc.swap_cluster_max = nr_pages;
1914 else
1915 sc.swap_cluster_max = SWAP_CLUSTER_MAX;
1916
1917 cond_resched(); 1420 cond_resched();
1918 /* 1421 /*
1919 * We need to be able to allocate from the reserves for RECLAIM_SWAP 1422 * We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -1928,17 +1431,20 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1928 * Free memory by calling shrink zone with increasing priorities 1431 * Free memory by calling shrink zone with increasing priorities
1929 * until we have enough memory freed. 1432 * until we have enough memory freed.
1930 */ 1433 */
1434 priority = ZONE_RECLAIM_PRIORITY;
1931 do { 1435 do {
1932 sc.priority--; 1436 nr_reclaimed += shrink_zone(priority, zone, &sc);
1933 shrink_zone(zone, &sc); 1437 priority--;
1438 } while (priority >= 0 && nr_reclaimed < nr_pages);
1934 1439
1935 } while (sc.nr_reclaimed < nr_pages && sc.priority > 0); 1440 if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
1936
1937 if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
1938 /* 1441 /*
1939 * shrink_slab does not currently allow us to determine 1442 * shrink_slab() does not currently allow us to determine how
1940 * how many pages were freed in the zone. So we just 1443 * many pages were freed in this zone. So we just shake the slab
1941 * shake the slab and then go offnode for a single allocation. 1444 * a bit and then go off node for this particular allocation
1445 * despite possibly having freed enough memory to allocate in
1446 * this zone. If we freed local memory then the next
1447 * allocations will be local again.
1942 * 1448 *
1943 * shrink_slab will free memory on all zones and may take 1449 * shrink_slab will free memory on all zones and may take
1944 * a long time. 1450 * a long time.
@@ -1949,10 +1455,54 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1949 p->reclaim_state = NULL; 1455 p->reclaim_state = NULL;
1950 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 1456 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
1951 1457
1952 if (sc.nr_reclaimed == 0) 1458 if (nr_reclaimed == 0) {
1459 /*
1460 * We were unable to reclaim enough pages to stay on node. We
1461 * now allow off node accesses for a certain time period before
1462 * trying again to reclaim pages from the local zone.
1463 */
1953 zone->last_unsuccessful_zone_reclaim = jiffies; 1464 zone->last_unsuccessful_zone_reclaim = jiffies;
1465 }
1954 1466
1955 return sc.nr_reclaimed >= nr_pages; 1467 return nr_reclaimed >= nr_pages;
1956} 1468}
1957#endif
1958 1469
1470int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
1471{
1472 cpumask_t mask;
1473 int node_id;
1474
1475 /*
1476 * Do not reclaim if there was a recent unsuccessful attempt at zone
1477 * reclaim. In that case we let allocations go off node for the
1478 * zone_reclaim_interval. Otherwise we would scan for each off-node
1479 * page allocation.
1480 */
1481 if (time_before(jiffies,
1482 zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
1483 return 0;
1484
1485 /*
1486 * Avoid concurrent zone reclaims, do not reclaim in a zone that does
1487 * not have reclaimable pages and if we should not delay the allocation
1488 * then do not scan.
1489 */
1490 if (!(gfp_mask & __GFP_WAIT) ||
1491 zone->all_unreclaimable ||
1492 atomic_read(&zone->reclaim_in_progress) > 0 ||
1493 (current->flags & PF_MEMALLOC))
1494 return 0;
1495
1496 /*
1497 * Only run zone reclaim on the local zone or on zones that do not
1498 * have associated processors. This will favor the local processor
1499 * over remote processors and spread off node memory allocations
1500 * as wide as possible.
1501 */
1502 node_id = zone->zone_pgdat->node_id;
1503 mask = node_to_cpumask(node_id);
1504 if (!cpus_empty(mask) && node_id != numa_node_id())
1505 return 0;
1506 return __zone_reclaim(zone, gfp_mask, order);
1507}
1508#endif
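
The new zone_reclaim() wrapper performs only the cheap checks (recent-failure backoff, GFP flags, a reclaim already in flight, PF_MEMALLOC, node locality) and leaves the actual scanning to __zone_reclaim(). A userspace sketch of that gate-then-work split, with a wall-clock stand-in for the jiffies backoff; the names and return values below are invented for illustration:

    #include <stdio.h>
    #include <time.h>

    #define RECLAIM_INTERVAL 30         /* seconds: model of zone_reclaim_interval */

    static time_t last_failure;         /* model of last_unsuccessful_zone_reclaim */
    static int reclaim_in_progress;

    static int do_reclaim_work(void)    /* stands in for __zone_reclaim() */
    {
        return 0;                       /* pretend nothing could be reclaimed */
    }

    static int try_zone_reclaim(int can_wait, int is_local_node)
    {
        if (time(NULL) < last_failure + RECLAIM_INTERVAL)
            return 0;                   /* failed recently: go off node instead */
        if (!can_wait || reclaim_in_progress || !is_local_node)
            return 0;                   /* cheap bail-outs before any scanning */

        reclaim_in_progress = 1;
        int got_enough = do_reclaim_work();
        reclaim_in_progress = 0;

        if (!got_enough)
            last_failure = time(NULL);  /* arm the backoff */
        return got_enough;
    }

    int main(void)
    {
        printf("first attempt:  %d\n", try_zone_reclaim(1, 1)); /* scans, fails */
        printf("second attempt: %d\n", try_zone_reclaim(1, 1)); /* backoff hits */
        return 0;
    }
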