Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig       |    7
-rw-r--r--  mm/Makefile      |    6
-rw-r--r--  mm/fadvise.c     |    5
-rw-r--r--  mm/filemap.c     |   48
-rw-r--r--  mm/hugetlb.c     |    4
-rw-r--r--  mm/memory.c      |    2
-rw-r--r--  mm/mempolicy.c   |  561
-rw-r--r--  mm/oom_kill.c    |    5
-rw-r--r--  mm/page_alloc.c  |  129
-rw-r--r--  mm/pdflush.c     |    2
-rw-r--r--  mm/rmap.c        |    7
-rw-r--r--  mm/slab.c        | 1139
-rw-r--r--  mm/slob.c        |  385
-rw-r--r--  mm/sparse.c      |    4
-rw-r--r--  mm/swap_state.c  |    4
-rw-r--r--  mm/swapfile.c    |   12
-rw-r--r--  mm/truncate.c    |    1
-rw-r--r--  mm/util.c        |   39
-rw-r--r--  mm/vmscan.c      |  343
19 files changed, 1934 insertions, 769 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index b3db11f137e0..a9cb80ae6409 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -132,3 +132,10 @@ config SPLIT_PTLOCK_CPUS | |||
132 | default "4096" if ARM && !CPU_CACHE_VIPT | 132 | default "4096" if ARM && !CPU_CACHE_VIPT |
133 | default "4096" if PARISC && !PA20 | 133 | default "4096" if PARISC && !PA20 |
134 | default "4" | 134 | default "4" |
135 | |||
136 | # | ||
137 | # support for page migration | ||
138 | # | ||
139 | config MIGRATION | ||
140 | def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM | ||
141 | depends on SWAP | ||
diff --git a/mm/Makefile b/mm/Makefile
index 2fa6d2ca9f28..9aa03fa1dcc3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,8 +9,8 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ | |||
9 | 9 | ||
10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ | 10 | obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ |
11 | page_alloc.o page-writeback.o pdflush.o \ | 11 | page_alloc.o page-writeback.o pdflush.o \ |
12 | readahead.o slab.o swap.o truncate.o vmscan.o \ | 12 | readahead.o swap.o truncate.o vmscan.o \ |
13 | prio_tree.o $(mmu-y) | 13 | prio_tree.o util.o $(mmu-y) |
14 | 14 | ||
15 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o | 15 | obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o |
16 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o | 16 | obj-$(CONFIG_HUGETLBFS) += hugetlb.o |
@@ -18,5 +18,7 @@ obj-$(CONFIG_NUMA) += mempolicy.o | |||
18 | obj-$(CONFIG_SPARSEMEM) += sparse.o | 18 | obj-$(CONFIG_SPARSEMEM) += sparse.o |
19 | obj-$(CONFIG_SHMEM) += shmem.o | 19 | obj-$(CONFIG_SHMEM) += shmem.o |
20 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o | 20 | obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o |
21 | obj-$(CONFIG_SLOB) += slob.o | ||
22 | obj-$(CONFIG_SLAB) += slab.o | ||
21 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o | 23 | obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o |
22 | obj-$(CONFIG_FS_XIP) += filemap_xip.o | 24 | obj-$(CONFIG_FS_XIP) += filemap_xip.o |
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 5f19e87bc5af..d257c89e7704 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -37,6 +37,11 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) | |||
37 | if (!file) | 37 | if (!file) |
38 | return -EBADF; | 38 | return -EBADF; |
39 | 39 | ||
40 | if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) { | ||
41 | ret = -ESPIPE; | ||
42 | goto out; | ||
43 | } | ||
44 | |||
40 | mapping = file->f_mapping; | 45 | mapping = file->f_mapping; |
41 | if (!mapping || len < 0) { | 46 | if (!mapping || len < 0) { |
42 | ret = -EINVAL; | 47 | ret = -EINVAL; |
diff --git a/mm/filemap.c b/mm/filemap.c
index 4ef24a397684..478f4c74cc31 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -280,7 +280,7 @@ static int wait_on_page_writeback_range(struct address_space *mapping, | |||
280 | * it is otherwise livelockable. | 280 | * it is otherwise livelockable. |
281 | */ | 281 | */ |
282 | int sync_page_range(struct inode *inode, struct address_space *mapping, | 282 | int sync_page_range(struct inode *inode, struct address_space *mapping, |
283 | loff_t pos, size_t count) | 283 | loff_t pos, loff_t count) |
284 | { | 284 | { |
285 | pgoff_t start = pos >> PAGE_CACHE_SHIFT; | 285 | pgoff_t start = pos >> PAGE_CACHE_SHIFT; |
286 | pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; | 286 | pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; |
@@ -305,9 +305,8 @@ EXPORT_SYMBOL(sync_page_range); | |||
305 | * as it forces O_SYNC writers to different parts of the same file | 305 | * as it forces O_SYNC writers to different parts of the same file |
306 | * to be serialised right until io completion. | 306 | * to be serialised right until io completion. |
307 | */ | 307 | */ |
308 | static int sync_page_range_nolock(struct inode *inode, | 308 | int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, |
309 | struct address_space *mapping, | 309 | loff_t pos, loff_t count) |
310 | loff_t pos, size_t count) | ||
311 | { | 310 | { |
312 | pgoff_t start = pos >> PAGE_CACHE_SHIFT; | 311 | pgoff_t start = pos >> PAGE_CACHE_SHIFT; |
313 | pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; | 312 | pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; |
@@ -322,6 +321,7 @@ static int sync_page_range_nolock(struct inode *inode, | |||
322 | ret = wait_on_page_writeback_range(mapping, start, end); | 321 | ret = wait_on_page_writeback_range(mapping, start, end); |
323 | return ret; | 322 | return ret; |
324 | } | 323 | } |
324 | EXPORT_SYMBOL(sync_page_range_nolock); | ||
325 | 325 | ||
326 | /** | 326 | /** |
327 | * filemap_fdatawait - walk the list of under-writeback pages of the given | 327 | * filemap_fdatawait - walk the list of under-writeback pages of the given |
@@ -343,30 +343,44 @@ EXPORT_SYMBOL(filemap_fdatawait); | |||
343 | 343 | ||
344 | int filemap_write_and_wait(struct address_space *mapping) | 344 | int filemap_write_and_wait(struct address_space *mapping) |
345 | { | 345 | { |
346 | int retval = 0; | 346 | int err = 0; |
347 | 347 | ||
348 | if (mapping->nrpages) { | 348 | if (mapping->nrpages) { |
349 | retval = filemap_fdatawrite(mapping); | 349 | err = filemap_fdatawrite(mapping); |
350 | if (retval == 0) | 350 | /* |
351 | retval = filemap_fdatawait(mapping); | 351 | * Even if the above returned error, the pages may be |
352 | * written partially (e.g. -ENOSPC), so we wait for it. | ||
353 | * But the -EIO is special case, it may indicate the worst | ||
354 | * thing (e.g. bug) happened, so we avoid waiting for it. | ||
355 | */ | ||
356 | if (err != -EIO) { | ||
357 | int err2 = filemap_fdatawait(mapping); | ||
358 | if (!err) | ||
359 | err = err2; | ||
360 | } | ||
352 | } | 361 | } |
353 | return retval; | 362 | return err; |
354 | } | 363 | } |
364 | EXPORT_SYMBOL(filemap_write_and_wait); | ||
355 | 365 | ||
356 | int filemap_write_and_wait_range(struct address_space *mapping, | 366 | int filemap_write_and_wait_range(struct address_space *mapping, |
357 | loff_t lstart, loff_t lend) | 367 | loff_t lstart, loff_t lend) |
358 | { | 368 | { |
359 | int retval = 0; | 369 | int err = 0; |
360 | 370 | ||
361 | if (mapping->nrpages) { | 371 | if (mapping->nrpages) { |
362 | retval = __filemap_fdatawrite_range(mapping, lstart, lend, | 372 | err = __filemap_fdatawrite_range(mapping, lstart, lend, |
363 | WB_SYNC_ALL); | 373 | WB_SYNC_ALL); |
364 | if (retval == 0) | 374 | /* See comment of filemap_write_and_wait() */ |
365 | retval = wait_on_page_writeback_range(mapping, | 375 | if (err != -EIO) { |
366 | lstart >> PAGE_CACHE_SHIFT, | 376 | int err2 = wait_on_page_writeback_range(mapping, |
367 | lend >> PAGE_CACHE_SHIFT); | 377 | lstart >> PAGE_CACHE_SHIFT, |
378 | lend >> PAGE_CACHE_SHIFT); | ||
379 | if (!err) | ||
380 | err = err2; | ||
381 | } | ||
368 | } | 382 | } |
369 | return retval; | 383 | return err; |
370 | } | 384 | } |
371 | 385 | ||
372 | /* | 386 | /* |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f4c43d7980ba..b21d78c941b5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/nodemask.h> | 12 | #include <linux/nodemask.h> |
13 | #include <linux/pagemap.h> | 13 | #include <linux/pagemap.h> |
14 | #include <linux/mempolicy.h> | 14 | #include <linux/mempolicy.h> |
15 | #include <linux/cpuset.h> | ||
15 | 16 | ||
16 | #include <asm/page.h> | 17 | #include <asm/page.h> |
17 | #include <asm/pgtable.h> | 18 | #include <asm/pgtable.h> |
@@ -48,7 +49,8 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, | |||
48 | 49 | ||
49 | for (z = zonelist->zones; *z; z++) { | 50 | for (z = zonelist->zones; *z; z++) { |
50 | nid = (*z)->zone_pgdat->node_id; | 51 | nid = (*z)->zone_pgdat->node_id; |
51 | if (!list_empty(&hugepage_freelists[nid])) | 52 | if (cpuset_zone_allowed(*z, GFP_HIGHUSER) && |
53 | !list_empty(&hugepage_freelists[nid])) | ||
52 | break; | 54 | break; |
53 | } | 55 | } |
54 | 56 | ||
diff --git a/mm/memory.c b/mm/memory.c
index 7197f9bcd384..3944fec38012 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2267,6 +2267,8 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2267 | return handle_pte_fault(mm, vma, address, pte, pmd, write_access); | 2267 | return handle_pte_fault(mm, vma, address, pte, pmd, write_access); |
2268 | } | 2268 | } |
2269 | 2269 | ||
2270 | EXPORT_SYMBOL_GPL(__handle_mm_fault); | ||
2271 | |||
2270 | #ifndef __PAGETABLE_PUD_FOLDED | 2272 | #ifndef __PAGETABLE_PUD_FOLDED |
2271 | /* | 2273 | /* |
2272 | * Allocate page upper directory. | 2274 | * Allocate page upper directory. |
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0f1d2b8a952b..1850d0aef4ac 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -83,9 +83,18 @@ | |||
83 | #include <linux/init.h> | 83 | #include <linux/init.h> |
84 | #include <linux/compat.h> | 84 | #include <linux/compat.h> |
85 | #include <linux/mempolicy.h> | 85 | #include <linux/mempolicy.h> |
86 | #include <linux/swap.h> | ||
87 | #include <linux/seq_file.h> | ||
88 | #include <linux/proc_fs.h> | ||
89 | |||
86 | #include <asm/tlbflush.h> | 90 | #include <asm/tlbflush.h> |
87 | #include <asm/uaccess.h> | 91 | #include <asm/uaccess.h> |
88 | 92 | ||
93 | /* Internal flags */ | ||
94 | #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ | ||
95 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ | ||
96 | #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ | ||
97 | |||
89 | static kmem_cache_t *policy_cache; | 98 | static kmem_cache_t *policy_cache; |
90 | static kmem_cache_t *sn_cache; | 99 | static kmem_cache_t *sn_cache; |
91 | 100 | ||
@@ -171,12 +180,19 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) | |||
171 | break; | 180 | break; |
172 | } | 181 | } |
173 | policy->policy = mode; | 182 | policy->policy = mode; |
183 | policy->cpuset_mems_allowed = cpuset_mems_allowed(current); | ||
174 | return policy; | 184 | return policy; |
175 | } | 185 | } |
176 | 186 | ||
177 | /* Ensure all existing pages follow the policy. */ | 187 | static void gather_stats(struct page *, void *); |
188 | static void migrate_page_add(struct vm_area_struct *vma, | ||
189 | struct page *page, struct list_head *pagelist, unsigned long flags); | ||
190 | |||
191 | /* Scan through pages checking if pages follow certain conditions. */ | ||
178 | static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 192 | static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
179 | unsigned long addr, unsigned long end, nodemask_t *nodes) | 193 | unsigned long addr, unsigned long end, |
194 | const nodemask_t *nodes, unsigned long flags, | ||
195 | void *private) | ||
180 | { | 196 | { |
181 | pte_t *orig_pte; | 197 | pte_t *orig_pte; |
182 | pte_t *pte; | 198 | pte_t *pte; |
@@ -193,7 +209,17 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
193 | if (!page) | 209 | if (!page) |
194 | continue; | 210 | continue; |
195 | nid = page_to_nid(page); | 211 | nid = page_to_nid(page); |
196 | if (!node_isset(nid, *nodes)) | 212 | if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) |
213 | continue; | ||
214 | |||
215 | if (flags & MPOL_MF_STATS) | ||
216 | gather_stats(page, private); | ||
217 | else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { | ||
218 | spin_unlock(ptl); | ||
219 | migrate_page_add(vma, page, private, flags); | ||
220 | spin_lock(ptl); | ||
221 | } | ||
222 | else | ||
197 | break; | 223 | break; |
198 | } while (pte++, addr += PAGE_SIZE, addr != end); | 224 | } while (pte++, addr += PAGE_SIZE, addr != end); |
199 | pte_unmap_unlock(orig_pte, ptl); | 225 | pte_unmap_unlock(orig_pte, ptl); |
@@ -201,7 +227,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
201 | } | 227 | } |
202 | 228 | ||
203 | static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 229 | static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, |
204 | unsigned long addr, unsigned long end, nodemask_t *nodes) | 230 | unsigned long addr, unsigned long end, |
231 | const nodemask_t *nodes, unsigned long flags, | ||
232 | void *private) | ||
205 | { | 233 | { |
206 | pmd_t *pmd; | 234 | pmd_t *pmd; |
207 | unsigned long next; | 235 | unsigned long next; |
@@ -211,14 +239,17 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, | |||
211 | next = pmd_addr_end(addr, end); | 239 | next = pmd_addr_end(addr, end); |
212 | if (pmd_none_or_clear_bad(pmd)) | 240 | if (pmd_none_or_clear_bad(pmd)) |
213 | continue; | 241 | continue; |
214 | if (check_pte_range(vma, pmd, addr, next, nodes)) | 242 | if (check_pte_range(vma, pmd, addr, next, nodes, |
243 | flags, private)) | ||
215 | return -EIO; | 244 | return -EIO; |
216 | } while (pmd++, addr = next, addr != end); | 245 | } while (pmd++, addr = next, addr != end); |
217 | return 0; | 246 | return 0; |
218 | } | 247 | } |
219 | 248 | ||
220 | static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 249 | static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, |
221 | unsigned long addr, unsigned long end, nodemask_t *nodes) | 250 | unsigned long addr, unsigned long end, |
251 | const nodemask_t *nodes, unsigned long flags, | ||
252 | void *private) | ||
222 | { | 253 | { |
223 | pud_t *pud; | 254 | pud_t *pud; |
224 | unsigned long next; | 255 | unsigned long next; |
@@ -228,14 +259,17 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | |||
228 | next = pud_addr_end(addr, end); | 259 | next = pud_addr_end(addr, end); |
229 | if (pud_none_or_clear_bad(pud)) | 260 | if (pud_none_or_clear_bad(pud)) |
230 | continue; | 261 | continue; |
231 | if (check_pmd_range(vma, pud, addr, next, nodes)) | 262 | if (check_pmd_range(vma, pud, addr, next, nodes, |
263 | flags, private)) | ||
232 | return -EIO; | 264 | return -EIO; |
233 | } while (pud++, addr = next, addr != end); | 265 | } while (pud++, addr = next, addr != end); |
234 | return 0; | 266 | return 0; |
235 | } | 267 | } |
236 | 268 | ||
237 | static inline int check_pgd_range(struct vm_area_struct *vma, | 269 | static inline int check_pgd_range(struct vm_area_struct *vma, |
238 | unsigned long addr, unsigned long end, nodemask_t *nodes) | 270 | unsigned long addr, unsigned long end, |
271 | const nodemask_t *nodes, unsigned long flags, | ||
272 | void *private) | ||
239 | { | 273 | { |
240 | pgd_t *pgd; | 274 | pgd_t *pgd; |
241 | unsigned long next; | 275 | unsigned long next; |
@@ -245,16 +279,30 @@ static inline int check_pgd_range(struct vm_area_struct *vma, | |||
245 | next = pgd_addr_end(addr, end); | 279 | next = pgd_addr_end(addr, end); |
246 | if (pgd_none_or_clear_bad(pgd)) | 280 | if (pgd_none_or_clear_bad(pgd)) |
247 | continue; | 281 | continue; |
248 | if (check_pud_range(vma, pgd, addr, next, nodes)) | 282 | if (check_pud_range(vma, pgd, addr, next, nodes, |
283 | flags, private)) | ||
249 | return -EIO; | 284 | return -EIO; |
250 | } while (pgd++, addr = next, addr != end); | 285 | } while (pgd++, addr = next, addr != end); |
251 | return 0; | 286 | return 0; |
252 | } | 287 | } |
253 | 288 | ||
254 | /* Step 1: check the range */ | 289 | /* Check if a vma is migratable */ |
290 | static inline int vma_migratable(struct vm_area_struct *vma) | ||
291 | { | ||
292 | if (vma->vm_flags & ( | ||
293 | VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP)) | ||
294 | return 0; | ||
295 | return 1; | ||
296 | } | ||
297 | |||
298 | /* | ||
299 | * Check if all pages in a range are on a set of nodes. | ||
300 | * If pagelist != NULL then isolate pages from the LRU and | ||
301 | * put them on the pagelist. | ||
302 | */ | ||
255 | static struct vm_area_struct * | 303 | static struct vm_area_struct * |
256 | check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | 304 | check_range(struct mm_struct *mm, unsigned long start, unsigned long end, |
257 | nodemask_t *nodes, unsigned long flags) | 305 | const nodemask_t *nodes, unsigned long flags, void *private) |
258 | { | 306 | { |
259 | int err; | 307 | int err; |
260 | struct vm_area_struct *first, *vma, *prev; | 308 | struct vm_area_struct *first, *vma, *prev; |
@@ -264,17 +312,24 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, | |||
264 | return ERR_PTR(-EFAULT); | 312 | return ERR_PTR(-EFAULT); |
265 | prev = NULL; | 313 | prev = NULL; |
266 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { | 314 | for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { |
267 | if (!vma->vm_next && vma->vm_end < end) | 315 | if (!(flags & MPOL_MF_DISCONTIG_OK)) { |
268 | return ERR_PTR(-EFAULT); | 316 | if (!vma->vm_next && vma->vm_end < end) |
269 | if (prev && prev->vm_end < vma->vm_start) | 317 | return ERR_PTR(-EFAULT); |
270 | return ERR_PTR(-EFAULT); | 318 | if (prev && prev->vm_end < vma->vm_start) |
271 | if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) { | 319 | return ERR_PTR(-EFAULT); |
320 | } | ||
321 | if (!is_vm_hugetlb_page(vma) && | ||
322 | ((flags & MPOL_MF_STRICT) || | ||
323 | ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && | ||
324 | vma_migratable(vma)))) { | ||
272 | unsigned long endvma = vma->vm_end; | 325 | unsigned long endvma = vma->vm_end; |
326 | |||
273 | if (endvma > end) | 327 | if (endvma > end) |
274 | endvma = end; | 328 | endvma = end; |
275 | if (vma->vm_start > start) | 329 | if (vma->vm_start > start) |
276 | start = vma->vm_start; | 330 | start = vma->vm_start; |
277 | err = check_pgd_range(vma, start, endvma, nodes); | 331 | err = check_pgd_range(vma, start, endvma, nodes, |
332 | flags, private); | ||
278 | if (err) { | 333 | if (err) { |
279 | first = ERR_PTR(err); | 334 | first = ERR_PTR(err); |
280 | break; | 335 | break; |
@@ -333,51 +388,10 @@ static int contextualize_policy(int mode, nodemask_t *nodes) | |||
333 | if (!nodes) | 388 | if (!nodes) |
334 | return 0; | 389 | return 0; |
335 | 390 | ||
336 | /* Update current mems_allowed */ | 391 | cpuset_update_task_memory_state(); |
337 | cpuset_update_current_mems_allowed(); | 392 | if (!cpuset_nodes_subset_current_mems_allowed(*nodes)) |
338 | /* Ignore nodes not set in current->mems_allowed */ | ||
339 | cpuset_restrict_to_mems_allowed(nodes->bits); | ||
340 | return mpol_check_policy(mode, nodes); | ||
341 | } | ||
342 | |||
343 | long do_mbind(unsigned long start, unsigned long len, | ||
344 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | ||
345 | { | ||
346 | struct vm_area_struct *vma; | ||
347 | struct mm_struct *mm = current->mm; | ||
348 | struct mempolicy *new; | ||
349 | unsigned long end; | ||
350 | int err; | ||
351 | |||
352 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX) | ||
353 | return -EINVAL; | ||
354 | if (start & ~PAGE_MASK) | ||
355 | return -EINVAL; | ||
356 | if (mode == MPOL_DEFAULT) | ||
357 | flags &= ~MPOL_MF_STRICT; | ||
358 | len = (len + PAGE_SIZE - 1) & PAGE_MASK; | ||
359 | end = start + len; | ||
360 | if (end < start) | ||
361 | return -EINVAL; | 393 | return -EINVAL; |
362 | if (end == start) | 394 | return mpol_check_policy(mode, nodes); |
363 | return 0; | ||
364 | if (mpol_check_policy(mode, nmask)) | ||
365 | return -EINVAL; | ||
366 | new = mpol_new(mode, nmask); | ||
367 | if (IS_ERR(new)) | ||
368 | return PTR_ERR(new); | ||
369 | |||
370 | PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, | ||
371 | mode,nodes_addr(nodes)[0]); | ||
372 | |||
373 | down_write(&mm->mmap_sem); | ||
374 | vma = check_range(mm, start, end, nmask, flags); | ||
375 | err = PTR_ERR(vma); | ||
376 | if (!IS_ERR(vma)) | ||
377 | err = mbind_range(vma, start, end, new); | ||
378 | up_write(&mm->mmap_sem); | ||
379 | mpol_free(new); | ||
380 | return err; | ||
381 | } | 395 | } |
382 | 396 | ||
383 | /* Set the process memory policy */ | 397 | /* Set the process memory policy */ |
@@ -448,7 +462,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
448 | struct vm_area_struct *vma = NULL; | 462 | struct vm_area_struct *vma = NULL; |
449 | struct mempolicy *pol = current->mempolicy; | 463 | struct mempolicy *pol = current->mempolicy; |
450 | 464 | ||
451 | cpuset_update_current_mems_allowed(); | 465 | cpuset_update_task_memory_state(); |
452 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) | 466 | if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) |
453 | return -EINVAL; | 467 | return -EINVAL; |
454 | if (flags & MPOL_F_ADDR) { | 468 | if (flags & MPOL_F_ADDR) { |
@@ -500,11 +514,177 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
500 | } | 514 | } |
501 | 515 | ||
502 | /* | 516 | /* |
517 | * page migration | ||
518 | */ | ||
519 | |||
520 | /* Check if we are the only process mapping the page in question */ | ||
521 | static inline int single_mm_mapping(struct mm_struct *mm, | ||
522 | struct address_space *mapping) | ||
523 | { | ||
524 | struct vm_area_struct *vma; | ||
525 | struct prio_tree_iter iter; | ||
526 | int rc = 1; | ||
527 | |||
528 | spin_lock(&mapping->i_mmap_lock); | ||
529 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) | ||
530 | if (mm != vma->vm_mm) { | ||
531 | rc = 0; | ||
532 | goto out; | ||
533 | } | ||
534 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) | ||
535 | if (mm != vma->vm_mm) { | ||
536 | rc = 0; | ||
537 | goto out; | ||
538 | } | ||
539 | out: | ||
540 | spin_unlock(&mapping->i_mmap_lock); | ||
541 | return rc; | ||
542 | } | ||
543 | |||
544 | /* | ||
545 | * Add a page to be migrated to the pagelist | ||
546 | */ | ||
547 | static void migrate_page_add(struct vm_area_struct *vma, | ||
548 | struct page *page, struct list_head *pagelist, unsigned long flags) | ||
549 | { | ||
550 | /* | ||
551 | * Avoid migrating a page that is shared by others and not writable. | ||
552 | */ | ||
553 | if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || | ||
554 | mapping_writably_mapped(page->mapping) || | ||
555 | single_mm_mapping(vma->vm_mm, page->mapping)) { | ||
556 | int rc = isolate_lru_page(page); | ||
557 | |||
558 | if (rc == 1) | ||
559 | list_add(&page->lru, pagelist); | ||
560 | /* | ||
561 | * If the isolate attempt was not successful then we just | ||
562 | * encountered an unswappable page. Something must be wrong. | ||
563 | */ | ||
564 | WARN_ON(rc == 0); | ||
565 | } | ||
566 | } | ||
567 | |||
568 | static int swap_pages(struct list_head *pagelist) | ||
569 | { | ||
570 | LIST_HEAD(moved); | ||
571 | LIST_HEAD(failed); | ||
572 | int n; | ||
573 | |||
574 | n = migrate_pages(pagelist, NULL, &moved, &failed); | ||
575 | putback_lru_pages(&failed); | ||
576 | putback_lru_pages(&moved); | ||
577 | |||
578 | return n; | ||
579 | } | ||
580 | |||
581 | /* | ||
582 | * For now migrate_pages simply swaps out the pages from nodes that are in | ||
583 | * the source set but not in the target set. In the future, we would | ||
584 | * want a function that moves pages between the two nodesets in such | ||
585 | * a way as to preserve the physical layout as much as possible. | ||
586 | * | ||
587 | * Returns the number of pages that could not be moved. | ||
588 | */ | ||
589 | int do_migrate_pages(struct mm_struct *mm, | ||
590 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | ||
591 | { | ||
592 | LIST_HEAD(pagelist); | ||
593 | int count = 0; | ||
594 | nodemask_t nodes; | ||
595 | |||
596 | nodes_andnot(nodes, *from_nodes, *to_nodes); | ||
597 | |||
598 | down_read(&mm->mmap_sem); | ||
599 | check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, | ||
600 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | ||
601 | |||
602 | if (!list_empty(&pagelist)) { | ||
603 | count = swap_pages(&pagelist); | ||
604 | putback_lru_pages(&pagelist); | ||
605 | } | ||
606 | |||
607 | up_read(&mm->mmap_sem); | ||
608 | return count; | ||
609 | } | ||
610 | |||
611 | long do_mbind(unsigned long start, unsigned long len, | ||
612 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | ||
613 | { | ||
614 | struct vm_area_struct *vma; | ||
615 | struct mm_struct *mm = current->mm; | ||
616 | struct mempolicy *new; | ||
617 | unsigned long end; | ||
618 | int err; | ||
619 | LIST_HEAD(pagelist); | ||
620 | |||
621 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT | | ||
622 | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | ||
623 | || mode > MPOL_MAX) | ||
624 | return -EINVAL; | ||
625 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) | ||
626 | return -EPERM; | ||
627 | |||
628 | if (start & ~PAGE_MASK) | ||
629 | return -EINVAL; | ||
630 | |||
631 | if (mode == MPOL_DEFAULT) | ||
632 | flags &= ~MPOL_MF_STRICT; | ||
633 | |||
634 | len = (len + PAGE_SIZE - 1) & PAGE_MASK; | ||
635 | end = start + len; | ||
636 | |||
637 | if (end < start) | ||
638 | return -EINVAL; | ||
639 | if (end == start) | ||
640 | return 0; | ||
641 | |||
642 | if (mpol_check_policy(mode, nmask)) | ||
643 | return -EINVAL; | ||
644 | |||
645 | new = mpol_new(mode, nmask); | ||
646 | if (IS_ERR(new)) | ||
647 | return PTR_ERR(new); | ||
648 | |||
649 | /* | ||
650 | * If we are using the default policy then operation | ||
651 | * on discontinuous address spaces is okay after all | ||
652 | */ | ||
653 | if (!new) | ||
654 | flags |= MPOL_MF_DISCONTIG_OK; | ||
655 | |||
656 | PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, | ||
657 | mode,nodes_addr(nodes)[0]); | ||
658 | |||
659 | down_write(&mm->mmap_sem); | ||
660 | vma = check_range(mm, start, end, nmask, | ||
661 | flags | MPOL_MF_INVERT, &pagelist); | ||
662 | |||
663 | err = PTR_ERR(vma); | ||
664 | if (!IS_ERR(vma)) { | ||
665 | int nr_failed = 0; | ||
666 | |||
667 | err = mbind_range(vma, start, end, new); | ||
668 | if (!list_empty(&pagelist)) | ||
669 | nr_failed = swap_pages(&pagelist); | ||
670 | |||
671 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | ||
672 | err = -EIO; | ||
673 | } | ||
674 | if (!list_empty(&pagelist)) | ||
675 | putback_lru_pages(&pagelist); | ||
676 | |||
677 | up_write(&mm->mmap_sem); | ||
678 | mpol_free(new); | ||
679 | return err; | ||
680 | } | ||
681 | |||
682 | /* | ||
503 | * User space interface with variable sized bitmaps for nodelists. | 683 | * User space interface with variable sized bitmaps for nodelists. |
504 | */ | 684 | */ |
505 | 685 | ||
506 | /* Copy a node mask from user space. */ | 686 | /* Copy a node mask from user space. */ |
507 | static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, | 687 | static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, |
508 | unsigned long maxnode) | 688 | unsigned long maxnode) |
509 | { | 689 | { |
510 | unsigned long k; | 690 | unsigned long k; |
@@ -593,6 +773,65 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, | |||
593 | return do_set_mempolicy(mode, &nodes); | 773 | return do_set_mempolicy(mode, &nodes); |
594 | } | 774 | } |
595 | 775 | ||
776 | asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, | ||
777 | const unsigned long __user *old_nodes, | ||
778 | const unsigned long __user *new_nodes) | ||
779 | { | ||
780 | struct mm_struct *mm; | ||
781 | struct task_struct *task; | ||
782 | nodemask_t old; | ||
783 | nodemask_t new; | ||
784 | nodemask_t task_nodes; | ||
785 | int err; | ||
786 | |||
787 | err = get_nodes(&old, old_nodes, maxnode); | ||
788 | if (err) | ||
789 | return err; | ||
790 | |||
791 | err = get_nodes(&new, new_nodes, maxnode); | ||
792 | if (err) | ||
793 | return err; | ||
794 | |||
795 | /* Find the mm_struct */ | ||
796 | read_lock(&tasklist_lock); | ||
797 | task = pid ? find_task_by_pid(pid) : current; | ||
798 | if (!task) { | ||
799 | read_unlock(&tasklist_lock); | ||
800 | return -ESRCH; | ||
801 | } | ||
802 | mm = get_task_mm(task); | ||
803 | read_unlock(&tasklist_lock); | ||
804 | |||
805 | if (!mm) | ||
806 | return -EINVAL; | ||
807 | |||
808 | /* | ||
809 | * Check if this process has the right to modify the specified | ||
810 | * process. The right exists if the process has administrative | ||
811 | * capabilities, superuser privileges or the same | ||
812 | * userid as the target process. | ||
813 | */ | ||
814 | if ((current->euid != task->suid) && (current->euid != task->uid) && | ||
815 | (current->uid != task->suid) && (current->uid != task->uid) && | ||
816 | !capable(CAP_SYS_ADMIN)) { | ||
817 | err = -EPERM; | ||
818 | goto out; | ||
819 | } | ||
820 | |||
821 | task_nodes = cpuset_mems_allowed(task); | ||
822 | /* Is the user allowed to access the target nodes? */ | ||
823 | if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) { | ||
824 | err = -EPERM; | ||
825 | goto out; | ||
826 | } | ||
827 | |||
828 | err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE); | ||
829 | out: | ||
830 | mmput(mm); | ||
831 | return err; | ||
832 | } | ||
833 | |||
834 | |||
596 | /* Retrieve NUMA policy */ | 835 | /* Retrieve NUMA policy */ |
597 | asmlinkage long sys_get_mempolicy(int __user *policy, | 836 | asmlinkage long sys_get_mempolicy(int __user *policy, |
598 | unsigned long __user *nmask, | 837 | unsigned long __user *nmask, |
@@ -699,8 +938,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, | |||
699 | #endif | 938 | #endif |
700 | 939 | ||
701 | /* Return effective policy for a VMA */ | 940 | /* Return effective policy for a VMA */ |
702 | struct mempolicy * | 941 | static struct mempolicy * get_vma_policy(struct task_struct *task, |
703 | get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) | 942 | struct vm_area_struct *vma, unsigned long addr) |
704 | { | 943 | { |
705 | struct mempolicy *pol = task->mempolicy; | 944 | struct mempolicy *pol = task->mempolicy; |
706 | 945 | ||
@@ -848,7 +1087,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
848 | { | 1087 | { |
849 | struct mempolicy *pol = get_vma_policy(current, vma, addr); | 1088 | struct mempolicy *pol = get_vma_policy(current, vma, addr); |
850 | 1089 | ||
851 | cpuset_update_current_mems_allowed(); | 1090 | cpuset_update_task_memory_state(); |
852 | 1091 | ||
853 | if (unlikely(pol->policy == MPOL_INTERLEAVE)) { | 1092 | if (unlikely(pol->policy == MPOL_INTERLEAVE)) { |
854 | unsigned nid; | 1093 | unsigned nid; |
@@ -874,7 +1113,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) | |||
874 | * interrupt context and apply the current process NUMA policy. | 1113 | * interrupt context and apply the current process NUMA policy. |
875 | * Returns NULL when no page can be allocated. | 1114 | * Returns NULL when no page can be allocated. |
876 | * | 1115 | * |
877 | * Don't call cpuset_update_current_mems_allowed() unless | 1116 | * Don't call cpuset_update_task_memory_state() unless |
878 | * 1) it's ok to take cpuset_sem (can WAIT), and | 1117 | * 1) it's ok to take cpuset_sem (can WAIT), and |
879 | * 2) allocating for current task (not interrupt). | 1118 | * 2) allocating for current task (not interrupt). |
880 | */ | 1119 | */ |
@@ -883,7 +1122,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
883 | struct mempolicy *pol = current->mempolicy; | 1122 | struct mempolicy *pol = current->mempolicy; |
884 | 1123 | ||
885 | if ((gfp & __GFP_WAIT) && !in_interrupt()) | 1124 | if ((gfp & __GFP_WAIT) && !in_interrupt()) |
886 | cpuset_update_current_mems_allowed(); | 1125 | cpuset_update_task_memory_state(); |
887 | if (!pol || in_interrupt()) | 1126 | if (!pol || in_interrupt()) |
888 | pol = &default_policy; | 1127 | pol = &default_policy; |
889 | if (pol->policy == MPOL_INTERLEAVE) | 1128 | if (pol->policy == MPOL_INTERLEAVE) |
@@ -892,6 +1131,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) | |||
892 | } | 1131 | } |
893 | EXPORT_SYMBOL(alloc_pages_current); | 1132 | EXPORT_SYMBOL(alloc_pages_current); |
894 | 1133 | ||
1134 | /* | ||
1135 | * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it | ||
1136 | * rebinds the mempolicy its copying by calling mpol_rebind_policy() | ||
1137 | * with the mems_allowed returned by cpuset_mems_allowed(). This | ||
1138 | * keeps mempolicies cpuset relative after its cpuset moves. See | ||
1139 | * further kernel/cpuset.c update_nodemask(). | ||
1140 | */ | ||
1141 | void *cpuset_being_rebound; | ||
1142 | |||
895 | /* Slow path of a mempolicy copy */ | 1143 | /* Slow path of a mempolicy copy */ |
896 | struct mempolicy *__mpol_copy(struct mempolicy *old) | 1144 | struct mempolicy *__mpol_copy(struct mempolicy *old) |
897 | { | 1145 | { |
@@ -899,6 +1147,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) | |||
899 | 1147 | ||
900 | if (!new) | 1148 | if (!new) |
901 | return ERR_PTR(-ENOMEM); | 1149 | return ERR_PTR(-ENOMEM); |
1150 | if (current_cpuset_is_being_rebound()) { | ||
1151 | nodemask_t mems = cpuset_mems_allowed(current); | ||
1152 | mpol_rebind_policy(old, &mems); | ||
1153 | } | ||
902 | *new = *old; | 1154 | *new = *old; |
903 | atomic_set(&new->refcnt, 1); | 1155 | atomic_set(&new->refcnt, 1); |
904 | if (new->policy == MPOL_BIND) { | 1156 | if (new->policy == MPOL_BIND) { |
@@ -1173,25 +1425,31 @@ void numa_default_policy(void) | |||
1173 | } | 1425 | } |
1174 | 1426 | ||
1175 | /* Migrate a policy to a different set of nodes */ | 1427 | /* Migrate a policy to a different set of nodes */ |
1176 | static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, | 1428 | void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) |
1177 | const nodemask_t *new) | ||
1178 | { | 1429 | { |
1430 | nodemask_t *mpolmask; | ||
1179 | nodemask_t tmp; | 1431 | nodemask_t tmp; |
1180 | 1432 | ||
1181 | if (!pol) | 1433 | if (!pol) |
1182 | return; | 1434 | return; |
1435 | mpolmask = &pol->cpuset_mems_allowed; | ||
1436 | if (nodes_equal(*mpolmask, *newmask)) | ||
1437 | return; | ||
1183 | 1438 | ||
1184 | switch (pol->policy) { | 1439 | switch (pol->policy) { |
1185 | case MPOL_DEFAULT: | 1440 | case MPOL_DEFAULT: |
1186 | break; | 1441 | break; |
1187 | case MPOL_INTERLEAVE: | 1442 | case MPOL_INTERLEAVE: |
1188 | nodes_remap(tmp, pol->v.nodes, *old, *new); | 1443 | nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); |
1189 | pol->v.nodes = tmp; | 1444 | pol->v.nodes = tmp; |
1190 | current->il_next = node_remap(current->il_next, *old, *new); | 1445 | *mpolmask = *newmask; |
1446 | current->il_next = node_remap(current->il_next, | ||
1447 | *mpolmask, *newmask); | ||
1191 | break; | 1448 | break; |
1192 | case MPOL_PREFERRED: | 1449 | case MPOL_PREFERRED: |
1193 | pol->v.preferred_node = node_remap(pol->v.preferred_node, | 1450 | pol->v.preferred_node = node_remap(pol->v.preferred_node, |
1194 | *old, *new); | 1451 | *mpolmask, *newmask); |
1452 | *mpolmask = *newmask; | ||
1195 | break; | 1453 | break; |
1196 | case MPOL_BIND: { | 1454 | case MPOL_BIND: { |
1197 | nodemask_t nodes; | 1455 | nodemask_t nodes; |
@@ -1201,7 +1459,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, | |||
1201 | nodes_clear(nodes); | 1459 | nodes_clear(nodes); |
1202 | for (z = pol->v.zonelist->zones; *z; z++) | 1460 | for (z = pol->v.zonelist->zones; *z; z++) |
1203 | node_set((*z)->zone_pgdat->node_id, nodes); | 1461 | node_set((*z)->zone_pgdat->node_id, nodes); |
1204 | nodes_remap(tmp, nodes, *old, *new); | 1462 | nodes_remap(tmp, nodes, *mpolmask, *newmask); |
1205 | nodes = tmp; | 1463 | nodes = tmp; |
1206 | 1464 | ||
1207 | zonelist = bind_zonelist(&nodes); | 1465 | zonelist = bind_zonelist(&nodes); |
@@ -1216,6 +1474,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, | |||
1216 | kfree(pol->v.zonelist); | 1474 | kfree(pol->v.zonelist); |
1217 | pol->v.zonelist = zonelist; | 1475 | pol->v.zonelist = zonelist; |
1218 | } | 1476 | } |
1477 | *mpolmask = *newmask; | ||
1219 | break; | 1478 | break; |
1220 | } | 1479 | } |
1221 | default: | 1480 | default: |
@@ -1225,12 +1484,156 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, | |||
1225 | } | 1484 | } |
1226 | 1485 | ||
1227 | /* | 1486 | /* |
1228 | * Someone moved this task to different nodes. Fixup mempolicies. | 1487 | * Wrapper for mpol_rebind_policy() that just requires task |
1488 | * pointer, and updates task mempolicy. | ||
1489 | */ | ||
1490 | |||
1491 | void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) | ||
1492 | { | ||
1493 | mpol_rebind_policy(tsk->mempolicy, new); | ||
1494 | } | ||
1495 | |||
1496 | /* | ||
1497 | * Rebind each vma in mm to new nodemask. | ||
1229 | * | 1498 | * |
1230 | * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well, | 1499 | * Call holding a reference to mm. Takes mm->mmap_sem during call. |
1231 | * once we have a cpuset mechanism to mark which cpuset subtree is migrating. | ||
1232 | */ | 1500 | */ |
1233 | void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) | 1501 | |
1502 | void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) | ||
1234 | { | 1503 | { |
1235 | rebind_policy(current->mempolicy, old, new); | 1504 | struct vm_area_struct *vma; |
1505 | |||
1506 | down_write(&mm->mmap_sem); | ||
1507 | for (vma = mm->mmap; vma; vma = vma->vm_next) | ||
1508 | mpol_rebind_policy(vma->vm_policy, new); | ||
1509 | up_write(&mm->mmap_sem); | ||
1236 | } | 1510 | } |
1511 | |||
1512 | /* | ||
1513 | * Display pages allocated per node and memory policy via /proc. | ||
1514 | */ | ||
1515 | |||
1516 | static const char *policy_types[] = { "default", "prefer", "bind", | ||
1517 | "interleave" }; | ||
1518 | |||
1519 | /* | ||
1520 | * Convert a mempolicy into a string. | ||
1521 | * Returns the number of characters in buffer (if positive) | ||
1522 | * or an error (negative) | ||
1523 | */ | ||
1524 | static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) | ||
1525 | { | ||
1526 | char *p = buffer; | ||
1527 | int l; | ||
1528 | nodemask_t nodes; | ||
1529 | int mode = pol ? pol->policy : MPOL_DEFAULT; | ||
1530 | |||
1531 | switch (mode) { | ||
1532 | case MPOL_DEFAULT: | ||
1533 | nodes_clear(nodes); | ||
1534 | break; | ||
1535 | |||
1536 | case MPOL_PREFERRED: | ||
1537 | nodes_clear(nodes); | ||
1538 | node_set(pol->v.preferred_node, nodes); | ||
1539 | break; | ||
1540 | |||
1541 | case MPOL_BIND: | ||
1542 | get_zonemask(pol, &nodes); | ||
1543 | break; | ||
1544 | |||
1545 | case MPOL_INTERLEAVE: | ||
1546 | nodes = pol->v.nodes; | ||
1547 | break; | ||
1548 | |||
1549 | default: | ||
1550 | BUG(); | ||
1551 | return -EFAULT; | ||
1552 | } | ||
1553 | |||
1554 | l = strlen(policy_types[mode]); | ||
1555 | if (buffer + maxlen < p + l + 1) | ||
1556 | return -ENOSPC; | ||
1557 | |||
1558 | strcpy(p, policy_types[mode]); | ||
1559 | p += l; | ||
1560 | |||
1561 | if (!nodes_empty(nodes)) { | ||
1562 | if (buffer + maxlen < p + 2) | ||
1563 | return -ENOSPC; | ||
1564 | *p++ = '='; | ||
1565 | p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); | ||
1566 | } | ||
1567 | return p - buffer; | ||
1568 | } | ||
1569 | |||
1570 | struct numa_maps { | ||
1571 | unsigned long pages; | ||
1572 | unsigned long anon; | ||
1573 | unsigned long mapped; | ||
1574 | unsigned long mapcount_max; | ||
1575 | unsigned long node[MAX_NUMNODES]; | ||
1576 | }; | ||
1577 | |||
1578 | static void gather_stats(struct page *page, void *private) | ||
1579 | { | ||
1580 | struct numa_maps *md = private; | ||
1581 | int count = page_mapcount(page); | ||
1582 | |||
1583 | if (count) | ||
1584 | md->mapped++; | ||
1585 | |||
1586 | if (count > md->mapcount_max) | ||
1587 | md->mapcount_max = count; | ||
1588 | |||
1589 | md->pages++; | ||
1590 | |||
1591 | if (PageAnon(page)) | ||
1592 | md->anon++; | ||
1593 | |||
1594 | md->node[page_to_nid(page)]++; | ||
1595 | cond_resched(); | ||
1596 | } | ||
1597 | |||
1598 | int show_numa_map(struct seq_file *m, void *v) | ||
1599 | { | ||
1600 | struct task_struct *task = m->private; | ||
1601 | struct vm_area_struct *vma = v; | ||
1602 | struct numa_maps *md; | ||
1603 | int n; | ||
1604 | char buffer[50]; | ||
1605 | |||
1606 | if (!vma->vm_mm) | ||
1607 | return 0; | ||
1608 | |||
1609 | md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); | ||
1610 | if (!md) | ||
1611 | return 0; | ||
1612 | |||
1613 | check_pgd_range(vma, vma->vm_start, vma->vm_end, | ||
1614 | &node_online_map, MPOL_MF_STATS, md); | ||
1615 | |||
1616 | if (md->pages) { | ||
1617 | mpol_to_str(buffer, sizeof(buffer), | ||
1618 | get_vma_policy(task, vma, vma->vm_start)); | ||
1619 | |||
1620 | seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu", | ||
1621 | vma->vm_start, buffer, md->pages, | ||
1622 | md->mapped, md->mapcount_max); | ||
1623 | |||
1624 | if (md->anon) | ||
1625 | seq_printf(m," anon=%lu",md->anon); | ||
1626 | |||
1627 | for_each_online_node(n) | ||
1628 | if (md->node[n]) | ||
1629 | seq_printf(m, " N%d=%lu", n, md->node[n]); | ||
1630 | |||
1631 | seq_putc(m, '\n'); | ||
1632 | } | ||
1633 | kfree(md); | ||
1634 | |||
1635 | if (m->count < m->size) | ||
1636 | m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; | ||
1637 | return 0; | ||
1638 | } | ||
1639 | |||
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d348b9035955..4748b906aff2 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -298,7 +298,8 @@ retry: | |||
298 | 298 | ||
299 | /* | 299 | /* |
300 | * Give "p" a good chance of killing itself before we | 300 | * Give "p" a good chance of killing itself before we |
301 | * retry to allocate memory. | 301 | * retry to allocate memory unless "p" is current |
302 | */ | 302 | */ |
303 | schedule_timeout_interruptible(1); | 303 | if (!test_thread_flag(TIF_MEMDIE)) |
304 | schedule_timeout_interruptible(1); | ||
304 | } | 305 | } |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fd47494cb989..e0e84924171b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -53,6 +53,7 @@ struct pglist_data *pgdat_list __read_mostly; | |||
53 | unsigned long totalram_pages __read_mostly; | 53 | unsigned long totalram_pages __read_mostly; |
54 | unsigned long totalhigh_pages __read_mostly; | 54 | unsigned long totalhigh_pages __read_mostly; |
55 | long nr_swap_pages; | 55 | long nr_swap_pages; |
56 | int percpu_pagelist_fraction; | ||
56 | 57 | ||
57 | static void fastcall free_hot_cold_page(struct page *page, int cold); | 58 | static void fastcall free_hot_cold_page(struct page *page, int cold); |
58 | 59 | ||
@@ -307,7 +308,7 @@ static inline int page_is_buddy(struct page *page, int order) | |||
307 | * -- wli | 308 | * -- wli |
308 | */ | 309 | */ |
309 | 310 | ||
310 | static inline void __free_pages_bulk (struct page *page, | 311 | static inline void __free_one_page(struct page *page, |
311 | struct zone *zone, unsigned int order) | 312 | struct zone *zone, unsigned int order) |
312 | { | 313 | { |
313 | unsigned long page_idx; | 314 | unsigned long page_idx; |
@@ -382,40 +383,42 @@ static inline int free_pages_check(struct page *page) | |||
382 | * And clear the zone's pages_scanned counter, to hold off the "all pages are | 383 | * And clear the zone's pages_scanned counter, to hold off the "all pages are |
383 | * pinned" detection logic. | 384 | * pinned" detection logic. |
384 | */ | 385 | */ |
385 | static int | 386 | static void free_pages_bulk(struct zone *zone, int count, |
386 | free_pages_bulk(struct zone *zone, int count, | 387 | struct list_head *list, int order) |
387 | struct list_head *list, unsigned int order) | ||
388 | { | 388 | { |
389 | struct page *page = NULL; | ||
390 | int ret = 0; | ||
391 | |||
392 | spin_lock(&zone->lock); | 389 | spin_lock(&zone->lock); |
393 | zone->all_unreclaimable = 0; | 390 | zone->all_unreclaimable = 0; |
394 | zone->pages_scanned = 0; | 391 | zone->pages_scanned = 0; |
395 | while (!list_empty(list) && count--) { | 392 | while (count--) { |
393 | struct page *page; | ||
394 | |||
395 | BUG_ON(list_empty(list)); | ||
396 | page = list_entry(list->prev, struct page, lru); | 396 | page = list_entry(list->prev, struct page, lru); |
397 | /* have to delete it as __free_pages_bulk list manipulates */ | 397 | /* have to delete it as __free_one_page list manipulates */ |
398 | list_del(&page->lru); | 398 | list_del(&page->lru); |
399 | __free_pages_bulk(page, zone, order); | 399 | __free_one_page(page, zone, order); |
400 | ret++; | ||
401 | } | 400 | } |
402 | spin_unlock(&zone->lock); | 401 | spin_unlock(&zone->lock); |
403 | return ret; | ||
404 | } | 402 | } |
405 | 403 | ||
406 | void __free_pages_ok(struct page *page, unsigned int order) | 404 | static void free_one_page(struct zone *zone, struct page *page, int order) |
407 | { | 405 | { |
408 | unsigned long flags; | ||
409 | LIST_HEAD(list); | 406 | LIST_HEAD(list); |
407 | list_add(&page->lru, &list); | ||
408 | free_pages_bulk(zone, 1, &list, order); | ||
409 | } | ||
410 | |||
411 | static void __free_pages_ok(struct page *page, unsigned int order) | ||
412 | { | ||
413 | unsigned long flags; | ||
410 | int i; | 414 | int i; |
411 | int reserved = 0; | 415 | int reserved = 0; |
412 | 416 | ||
413 | arch_free_page(page, order); | 417 | arch_free_page(page, order); |
414 | 418 | ||
415 | #ifndef CONFIG_MMU | 419 | #ifndef CONFIG_MMU |
416 | if (order > 0) | 420 | for (i = 1 ; i < (1 << order) ; ++i) |
417 | for (i = 1 ; i < (1 << order) ; ++i) | 421 | __put_page(page + i); |
418 | __put_page(page + i); | ||
419 | #endif | 422 | #endif |
420 | 423 | ||
421 | for (i = 0 ; i < (1 << order) ; ++i) | 424 | for (i = 0 ; i < (1 << order) ; ++i) |
@@ -423,11 +426,10 @@ void __free_pages_ok(struct page *page, unsigned int order) | |||
423 | if (reserved) | 426 | if (reserved) |
424 | return; | 427 | return; |
425 | 428 | ||
426 | list_add(&page->lru, &list); | 429 | kernel_map_pages(page, 1 << order, 0); |
427 | kernel_map_pages(page, 1<<order, 0); | ||
428 | local_irq_save(flags); | 430 | local_irq_save(flags); |
429 | __mod_page_state(pgfree, 1 << order); | 431 | __mod_page_state(pgfree, 1 << order); |
430 | free_pages_bulk(page_zone(page), 1, &list, order); | 432 | free_one_page(page_zone(page), page, order); |
431 | local_irq_restore(flags); | 433 | local_irq_restore(flags); |
432 | } | 434 | } |
433 | 435 | ||
@@ -596,14 +598,13 @@ void drain_remote_pages(void) | |||
596 | if (zone->zone_pgdat->node_id == numa_node_id()) | 598 | if (zone->zone_pgdat->node_id == numa_node_id()) |
597 | continue; | 599 | continue; |
598 | 600 | ||
599 | pset = zone->pageset[smp_processor_id()]; | 601 | pset = zone_pcp(zone, smp_processor_id()); |
600 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | 602 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { |
601 | struct per_cpu_pages *pcp; | 603 | struct per_cpu_pages *pcp; |
602 | 604 | ||
603 | pcp = &pset->pcp[i]; | 605 | pcp = &pset->pcp[i]; |
604 | if (pcp->count) | 606 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); |
605 | pcp->count -= free_pages_bulk(zone, pcp->count, | 607 | pcp->count = 0; |
606 | &pcp->list, 0); | ||
607 | } | 608 | } |
608 | } | 609 | } |
609 | local_irq_restore(flags); | 610 | local_irq_restore(flags); |
@@ -626,8 +627,8 @@ static void __drain_pages(unsigned int cpu) | |||
626 | 627 | ||
627 | pcp = &pset->pcp[i]; | 628 | pcp = &pset->pcp[i]; |
628 | local_irq_save(flags); | 629 | local_irq_save(flags); |
629 | pcp->count -= free_pages_bulk(zone, pcp->count, | 630 | free_pages_bulk(zone, pcp->count, &pcp->list, 0); |
630 | &pcp->list, 0); | 631 | pcp->count = 0; |
631 | local_irq_restore(flags); | 632 | local_irq_restore(flags); |
632 | } | 633 | } |
633 | } | 634 | } |
@@ -718,8 +719,10 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) | |||
718 | __inc_page_state(pgfree); | 719 | __inc_page_state(pgfree); |
719 | list_add(&page->lru, &pcp->list); | 720 | list_add(&page->lru, &pcp->list); |
720 | pcp->count++; | 721 | pcp->count++; |
721 | if (pcp->count >= pcp->high) | 722 | if (pcp->count >= pcp->high) { |
722 | pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); | 723 | free_pages_bulk(zone, pcp->batch, &pcp->list, 0); |
724 | pcp->count -= pcp->batch; | ||
725 | } | ||
723 | local_irq_restore(flags); | 726 | local_irq_restore(flags); |
724 | put_cpu(); | 727 | put_cpu(); |
725 | } | 728 | } |
@@ -758,7 +761,7 @@ static struct page *buffered_rmqueue(struct zonelist *zonelist, | |||
758 | 761 | ||
759 | again: | 762 | again: |
760 | cpu = get_cpu(); | 763 | cpu = get_cpu(); |
761 | if (order == 0) { | 764 | if (likely(order == 0)) { |
762 | struct per_cpu_pages *pcp; | 765 | struct per_cpu_pages *pcp; |
763 | 766 | ||
764 | pcp = &zone_pcp(zone, cpu)->pcp[cold]; | 767 | pcp = &zone_pcp(zone, cpu)->pcp[cold]; |
@@ -973,6 +976,7 @@ rebalance: | |||
973 | cond_resched(); | 976 | cond_resched(); |
974 | 977 | ||
975 | /* We now go into synchronous reclaim */ | 978 | /* We now go into synchronous reclaim */ |
979 | cpuset_memory_pressure_bump(); | ||
976 | p->flags |= PF_MEMALLOC; | 980 | p->flags |= PF_MEMALLOC; |
977 | reclaim_state.reclaimed_slab = 0; | 981 | reclaim_state.reclaimed_slab = 0; |
978 | p->reclaim_state = &reclaim_state; | 982 | p->reclaim_state = &reclaim_state; |
@@ -1204,6 +1208,7 @@ static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) | |||
1204 | int cpu = 0; | 1208 | int cpu = 0; |
1205 | 1209 | ||
1206 | memset(ret, 0, sizeof(*ret)); | 1210 | memset(ret, 0, sizeof(*ret)); |
1211 | cpus_and(*cpumask, *cpumask, cpu_online_map); | ||
1207 | 1212 | ||
1208 | cpu = first_cpu(*cpumask); | 1213 | cpu = first_cpu(*cpumask); |
1209 | while (cpu < NR_CPUS) { | 1214 | while (cpu < NR_CPUS) { |
@@ -1256,7 +1261,7 @@ unsigned long read_page_state_offset(unsigned long offset) | |||
1256 | unsigned long ret = 0; | 1261 | unsigned long ret = 0; |
1257 | int cpu; | 1262 | int cpu; |
1258 | 1263 | ||
1259 | for_each_cpu(cpu) { | 1264 | for_each_online_cpu(cpu) { |
1260 | unsigned long in; | 1265 | unsigned long in; |
1261 | 1266 | ||
1262 | in = (unsigned long)&per_cpu(page_states, cpu) + offset; | 1267 | in = (unsigned long)&per_cpu(page_states, cpu) + offset; |
@@ -1830,6 +1835,24 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) | |||
1830 | INIT_LIST_HEAD(&pcp->list); | 1835 | INIT_LIST_HEAD(&pcp->list); |
1831 | } | 1836 | } |
1832 | 1837 | ||
1838 | /* | ||
1839 | * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist | ||
1840 | * to the value high for the pageset p. | ||
1841 | */ | ||
1842 | |||
1843 | static void setup_pagelist_highmark(struct per_cpu_pageset *p, | ||
1844 | unsigned long high) | ||
1845 | { | ||
1846 | struct per_cpu_pages *pcp; | ||
1847 | |||
1848 | pcp = &p->pcp[0]; /* hot list */ | ||
1849 | pcp->high = high; | ||
1850 | pcp->batch = max(1UL, high/4); | ||
1851 | if ((high/4) > (PAGE_SHIFT * 8)) | ||
1852 | pcp->batch = PAGE_SHIFT * 8; | ||
1853 | } | ||
1854 | |||
1855 | |||
1833 | #ifdef CONFIG_NUMA | 1856 | #ifdef CONFIG_NUMA |
1834 | /* | 1857 | /* |
1835 | * Boot pageset table. One per cpu which is going to be used for all | 1858 | * Boot pageset table. One per cpu which is going to be used for all |
@@ -1861,12 +1884,16 @@ static int __devinit process_zones(int cpu) | |||
1861 | 1884 | ||
1862 | for_each_zone(zone) { | 1885 | for_each_zone(zone) { |
1863 | 1886 | ||
1864 | zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), | 1887 | zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), |
1865 | GFP_KERNEL, cpu_to_node(cpu)); | 1888 | GFP_KERNEL, cpu_to_node(cpu)); |
1866 | if (!zone->pageset[cpu]) | 1889 | if (!zone_pcp(zone, cpu)) |
1867 | goto bad; | 1890 | goto bad; |
1868 | 1891 | ||
1869 | setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); | 1892 | setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); |
1893 | |||
1894 | if (percpu_pagelist_fraction) | ||
1895 | setup_pagelist_highmark(zone_pcp(zone, cpu), | ||
1896 | (zone->present_pages / percpu_pagelist_fraction)); | ||
1870 | } | 1897 | } |
1871 | 1898 | ||
1872 | return 0; | 1899 | return 0; |
@@ -1874,15 +1901,14 @@ bad: | |||
1874 | for_each_zone(dzone) { | 1901 | for_each_zone(dzone) { |
1875 | if (dzone == zone) | 1902 | if (dzone == zone) |
1876 | break; | 1903 | break; |
1877 | kfree(dzone->pageset[cpu]); | 1904 | kfree(zone_pcp(dzone, cpu)); |
1878 | dzone->pageset[cpu] = NULL; | 1905 | zone_pcp(dzone, cpu) = NULL; |
1879 | } | 1906 | } |
1880 | return -ENOMEM; | 1907 | return -ENOMEM; |
1881 | } | 1908 | } |
1882 | 1909 | ||
1883 | static inline void free_zone_pagesets(int cpu) | 1910 | static inline void free_zone_pagesets(int cpu) |
1884 | { | 1911 | { |
1885 | #ifdef CONFIG_NUMA | ||
1886 | struct zone *zone; | 1912 | struct zone *zone; |
1887 | 1913 | ||
1888 | for_each_zone(zone) { | 1914 | for_each_zone(zone) { |
@@ -1891,7 +1917,6 @@ static inline void free_zone_pagesets(int cpu) | |||
1891 | zone_pcp(zone, cpu) = NULL; | 1917 | zone_pcp(zone, cpu) = NULL; |
1892 | kfree(pset); | 1918 | kfree(pset); |
1893 | } | 1919 | } |
1894 | #endif | ||
1895 | } | 1920 | } |
1896 | 1921 | ||
1897 | static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, | 1922 | static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, |
@@ -1962,7 +1987,7 @@ static __devinit void zone_pcp_init(struct zone *zone) | |||
1962 | for (cpu = 0; cpu < NR_CPUS; cpu++) { | 1987 | for (cpu = 0; cpu < NR_CPUS; cpu++) { |
1963 | #ifdef CONFIG_NUMA | 1988 | #ifdef CONFIG_NUMA |
1964 | /* Early boot. Slab allocator not functional yet */ | 1989 | /* Early boot. Slab allocator not functional yet */ |
1965 | zone->pageset[cpu] = &boot_pageset[cpu]; | 1990 | zone_pcp(zone, cpu) = &boot_pageset[cpu]; |
1966 | setup_pageset(&boot_pageset[cpu],0); | 1991 | setup_pageset(&boot_pageset[cpu],0); |
1967 | #else | 1992 | #else |
1968 | setup_pageset(zone_pcp(zone,cpu), batch); | 1993 | setup_pageset(zone_pcp(zone,cpu), batch); |
@@ -2205,7 +2230,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) | |||
2205 | seq_printf(m, | 2230 | seq_printf(m, |
2206 | ")" | 2231 | ")" |
2207 | "\n pagesets"); | 2232 | "\n pagesets"); |
2208 | for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { | 2233 | for_each_online_cpu(i) { |
2209 | struct per_cpu_pageset *pageset; | 2234 | struct per_cpu_pageset *pageset; |
2210 | int j; | 2235 | int j; |
2211 | 2236 | ||
@@ -2568,6 +2593,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, | |||
2568 | return 0; | 2593 | return 0; |
2569 | } | 2594 | } |
2570 | 2595 | ||
2596 | /* | ||
2597 | * percpu_pagelist_fraction - changes the pcp->high for each zone on each | ||
2598 | * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist | ||
2599 | * can have before it gets flushed back to buddy allocator. | ||
2600 | */ | ||
2601 | |||
2602 | int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, | ||
2603 | struct file *file, void __user *buffer, size_t *length, loff_t *ppos) | ||
2604 | { | ||
2605 | struct zone *zone; | ||
2606 | unsigned int cpu; | ||
2607 | int ret; | ||
2608 | |||
2609 | ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); | ||
2610 | if (!write || (ret == -EINVAL)) | ||
2611 | return ret; | ||
2612 | for_each_zone(zone) { | ||
2613 | for_each_online_cpu(cpu) { | ||
2614 | unsigned long high; | ||
2615 | high = zone->present_pages / percpu_pagelist_fraction; | ||
2616 | setup_pagelist_highmark(zone_pcp(zone, cpu), high); | ||
2617 | } | ||
2618 | } | ||
2619 | return 0; | ||
2620 | } | ||
2621 | |||
2571 | __initdata int hashdist = HASHDIST_DEFAULT; | 2622 | __initdata int hashdist = HASHDIST_DEFAULT; |
2572 | 2623 | ||
2573 | #ifdef CONFIG_NUMA | 2624 | #ifdef CONFIG_NUMA |
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 52822c98c489..c4b6d0afd736 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -90,7 +90,7 @@ struct pdflush_work { | |||
90 | 90 | ||
91 | static int __pdflush(struct pdflush_work *my_work) | 91 | static int __pdflush(struct pdflush_work *my_work) |
92 | { | 92 | { |
93 | current->flags |= PF_FLUSHER; | 93 | current->flags |= PF_FLUSHER | PF_SWAPWRITE; |
94 | my_work->fn = NULL; | 94 | my_work->fn = NULL; |
95 | my_work->who = current; | 95 | my_work->who = current; |
96 | INIT_LIST_HEAD(&my_work->list); | 96 | INIT_LIST_HEAD(&my_work->list); |
@@ -514,6 +514,13 @@ void page_add_file_rmap(struct page *page) | |||
514 | void page_remove_rmap(struct page *page) | 514 | void page_remove_rmap(struct page *page) |
515 | { | 515 | { |
516 | if (atomic_add_negative(-1, &page->_mapcount)) { | 516 | if (atomic_add_negative(-1, &page->_mapcount)) { |
517 | if (page_mapcount(page) < 0) { | ||
518 | printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page)); | ||
519 | printk (KERN_EMERG " page->flags = %lx\n", page->flags); | ||
520 | printk (KERN_EMERG " page->count = %x\n", page_count(page)); | ||
521 | printk (KERN_EMERG " page->mapping = %p\n", page->mapping); | ||
522 | } | ||
523 | |||
517 | BUG_ON(page_mapcount(page) < 0); | 524 | BUG_ON(page_mapcount(page) < 0); |
518 | /* | 525 | /* |
519 | * It would be tidy to reset the PageAnon mapping here, | 526 | * It would be tidy to reset the PageAnon mapping here, |
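[Editor's note: the diagnostics added to page_remove_rmap() above fire when page_mapcount() goes negative, i.e. a page was unmapped more times than it was mapped. _mapcount is biased so that -1 means "no mappings", which is why atomic_add_negative(-1, ...) flags the last unmap. A userspace model of that detection, using C11 atomics in place of the kernel's atomic_t:]

#include <stdatomic.h>
#include <stdio.h>

/* Biased mapcount: -1 = unmapped, 0 = one mapping, and so on. */
static atomic_int mapcount = ATOMIC_VAR_INIT(-1);

/* Returns true if the counter is negative after adding delta. */
static int add_negative(atomic_int *v, int delta)
{
        return atomic_fetch_add(v, delta) + delta < 0;
}

static void page_add_rmap(void)
{
        atomic_fetch_add(&mapcount, 1);
}

static void page_remove_rmap(void)
{
        if (add_negative(&mapcount, -1)) {
                int mc = atomic_load(&mapcount);

                /* mc + 1 models page_mapcount(); below zero means imbalance. */
                if (mc < -1)
                        printf("Eeek! mapcount went negative (%d)\n", mc + 1);
                else
                        printf("last mapping removed\n");
        }
}

int main(void)
{
        page_add_rmap();
        page_remove_rmap();     /* balanced: "last mapping removed" */
        page_remove_rmap();     /* unbalanced: triggers the diagnostic */
        return 0;
}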
@@ -130,7 +130,6 @@ | |||
130 | #define FORCED_DEBUG 0 | 130 | #define FORCED_DEBUG 0 |
131 | #endif | 131 | #endif |
132 | 132 | ||
133 | |||
134 | /* Shouldn't this be in a header file somewhere? */ | 133 | /* Shouldn't this be in a header file somewhere? */ |
135 | #define BYTES_PER_WORD sizeof(void *) | 134 | #define BYTES_PER_WORD sizeof(void *) |
136 | 135 | ||
@@ -217,12 +216,12 @@ static unsigned long offslab_limit; | |||
217 | * Slabs are chained into three list: fully used, partial, fully free slabs. | 216 | * Slabs are chained into three list: fully used, partial, fully free slabs. |
218 | */ | 217 | */ |
219 | struct slab { | 218 | struct slab { |
220 | struct list_head list; | 219 | struct list_head list; |
221 | unsigned long colouroff; | 220 | unsigned long colouroff; |
222 | void *s_mem; /* including colour offset */ | 221 | void *s_mem; /* including colour offset */ |
223 | unsigned int inuse; /* num of objs active in slab */ | 222 | unsigned int inuse; /* num of objs active in slab */ |
224 | kmem_bufctl_t free; | 223 | kmem_bufctl_t free; |
225 | unsigned short nodeid; | 224 | unsigned short nodeid; |
226 | }; | 225 | }; |
227 | 226 | ||
228 | /* | 227 | /* |
@@ -242,9 +241,9 @@ struct slab { | |||
242 | * We assume struct slab_rcu can overlay struct slab when destroying. | 241 | * We assume struct slab_rcu can overlay struct slab when destroying. |
243 | */ | 242 | */ |
244 | struct slab_rcu { | 243 | struct slab_rcu { |
245 | struct rcu_head head; | 244 | struct rcu_head head; |
246 | kmem_cache_t *cachep; | 245 | kmem_cache_t *cachep; |
247 | void *addr; | 246 | void *addr; |
248 | }; | 247 | }; |
249 | 248 | ||
250 | /* | 249 | /* |
@@ -279,23 +278,23 @@ struct array_cache { | |||
279 | #define BOOT_CPUCACHE_ENTRIES 1 | 278 | #define BOOT_CPUCACHE_ENTRIES 1 |
280 | struct arraycache_init { | 279 | struct arraycache_init { |
281 | struct array_cache cache; | 280 | struct array_cache cache; |
282 | void * entries[BOOT_CPUCACHE_ENTRIES]; | 281 | void *entries[BOOT_CPUCACHE_ENTRIES]; |
283 | }; | 282 | }; |
284 | 283 | ||
285 | /* | 284 | /* |
286 | * The slab lists for all objects. | 285 | * The slab lists for all objects. |
287 | */ | 286 | */ |
288 | struct kmem_list3 { | 287 | struct kmem_list3 { |
289 | struct list_head slabs_partial; /* partial list first, better asm code */ | 288 | struct list_head slabs_partial; /* partial list first, better asm code */ |
290 | struct list_head slabs_full; | 289 | struct list_head slabs_full; |
291 | struct list_head slabs_free; | 290 | struct list_head slabs_free; |
292 | unsigned long free_objects; | 291 | unsigned long free_objects; |
293 | unsigned long next_reap; | 292 | unsigned long next_reap; |
294 | int free_touched; | 293 | int free_touched; |
295 | unsigned int free_limit; | 294 | unsigned int free_limit; |
296 | spinlock_t list_lock; | 295 | spinlock_t list_lock; |
297 | struct array_cache *shared; /* shared per node */ | 296 | struct array_cache *shared; /* shared per node */ |
298 | struct array_cache **alien; /* on other nodes */ | 297 | struct array_cache **alien; /* on other nodes */ |
299 | }; | 298 | }; |
300 | 299 | ||
301 | /* | 300 | /* |
@@ -367,63 +366,63 @@ static inline void kmem_list3_init(struct kmem_list3 *parent) | |||
367 | * | 366 | * |
368 | * manages a cache. | 367 | * manages a cache. |
369 | */ | 368 | */ |
370 | 369 | ||
371 | struct kmem_cache { | 370 | struct kmem_cache { |
372 | /* 1) per-cpu data, touched during every alloc/free */ | 371 | /* 1) per-cpu data, touched during every alloc/free */ |
373 | struct array_cache *array[NR_CPUS]; | 372 | struct array_cache *array[NR_CPUS]; |
374 | unsigned int batchcount; | 373 | unsigned int batchcount; |
375 | unsigned int limit; | 374 | unsigned int limit; |
376 | unsigned int shared; | 375 | unsigned int shared; |
377 | unsigned int objsize; | 376 | unsigned int objsize; |
378 | /* 2) touched by every alloc & free from the backend */ | 377 | /* 2) touched by every alloc & free from the backend */ |
379 | struct kmem_list3 *nodelists[MAX_NUMNODES]; | 378 | struct kmem_list3 *nodelists[MAX_NUMNODES]; |
380 | unsigned int flags; /* constant flags */ | 379 | unsigned int flags; /* constant flags */ |
381 | unsigned int num; /* # of objs per slab */ | 380 | unsigned int num; /* # of objs per slab */ |
382 | spinlock_t spinlock; | 381 | spinlock_t spinlock; |
383 | 382 | ||
384 | /* 3) cache_grow/shrink */ | 383 | /* 3) cache_grow/shrink */ |
385 | /* order of pgs per slab (2^n) */ | 384 | /* order of pgs per slab (2^n) */ |
386 | unsigned int gfporder; | 385 | unsigned int gfporder; |
387 | 386 | ||
388 | /* force GFP flags, e.g. GFP_DMA */ | 387 | /* force GFP flags, e.g. GFP_DMA */ |
389 | gfp_t gfpflags; | 388 | gfp_t gfpflags; |
390 | 389 | ||
391 | size_t colour; /* cache colouring range */ | 390 | size_t colour; /* cache colouring range */ |
392 | unsigned int colour_off; /* colour offset */ | 391 | unsigned int colour_off; /* colour offset */ |
393 | unsigned int colour_next; /* cache colouring */ | 392 | unsigned int colour_next; /* cache colouring */ |
394 | kmem_cache_t *slabp_cache; | 393 | kmem_cache_t *slabp_cache; |
395 | unsigned int slab_size; | 394 | unsigned int slab_size; |
396 | unsigned int dflags; /* dynamic flags */ | 395 | unsigned int dflags; /* dynamic flags */ |
397 | 396 | ||
398 | /* constructor func */ | 397 | /* constructor func */ |
399 | void (*ctor)(void *, kmem_cache_t *, unsigned long); | 398 | void (*ctor) (void *, kmem_cache_t *, unsigned long); |
400 | 399 | ||
401 | /* de-constructor func */ | 400 | /* de-constructor func */ |
402 | void (*dtor)(void *, kmem_cache_t *, unsigned long); | 401 | void (*dtor) (void *, kmem_cache_t *, unsigned long); |
403 | 402 | ||
404 | /* 4) cache creation/removal */ | 403 | /* 4) cache creation/removal */ |
405 | const char *name; | 404 | const char *name; |
406 | struct list_head next; | 405 | struct list_head next; |
407 | 406 | ||
408 | /* 5) statistics */ | 407 | /* 5) statistics */ |
409 | #if STATS | 408 | #if STATS |
410 | unsigned long num_active; | 409 | unsigned long num_active; |
411 | unsigned long num_allocations; | 410 | unsigned long num_allocations; |
412 | unsigned long high_mark; | 411 | unsigned long high_mark; |
413 | unsigned long grown; | 412 | unsigned long grown; |
414 | unsigned long reaped; | 413 | unsigned long reaped; |
415 | unsigned long errors; | 414 | unsigned long errors; |
416 | unsigned long max_freeable; | 415 | unsigned long max_freeable; |
417 | unsigned long node_allocs; | 416 | unsigned long node_allocs; |
418 | unsigned long node_frees; | 417 | unsigned long node_frees; |
419 | atomic_t allochit; | 418 | atomic_t allochit; |
420 | atomic_t allocmiss; | 419 | atomic_t allocmiss; |
421 | atomic_t freehit; | 420 | atomic_t freehit; |
422 | atomic_t freemiss; | 421 | atomic_t freemiss; |
423 | #endif | 422 | #endif |
424 | #if DEBUG | 423 | #if DEBUG |
425 | int dbghead; | 424 | int dbghead; |
426 | int reallen; | 425 | int reallen; |
427 | #endif | 426 | #endif |
428 | }; | 427 | }; |
429 | 428 | ||
@@ -523,14 +522,15 @@ static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp) | |||
523 | { | 522 | { |
524 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); | 523 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); |
525 | if (cachep->flags & SLAB_STORE_USER) | 524 | if (cachep->flags & SLAB_STORE_USER) |
526 | return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD); | 525 | return (unsigned long *)(objp + cachep->objsize - |
527 | return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD); | 526 | 2 * BYTES_PER_WORD); |
527 | return (unsigned long *)(objp + cachep->objsize - BYTES_PER_WORD); | ||
528 | } | 528 | } |
529 | 529 | ||
530 | static void **dbg_userword(kmem_cache_t *cachep, void *objp) | 530 | static void **dbg_userword(kmem_cache_t *cachep, void *objp) |
531 | { | 531 | { |
532 | BUG_ON(!(cachep->flags & SLAB_STORE_USER)); | 532 | BUG_ON(!(cachep->flags & SLAB_STORE_USER)); |
533 | return (void**)(objp+cachep->objsize-BYTES_PER_WORD); | 533 | return (void **)(objp + cachep->objsize - BYTES_PER_WORD); |
534 | } | 534 | } |
535 | 535 | ||
536 | #else | 536 | #else |
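[Editor's note on the debug accessors above: with SLAB_RED_ZONE and SLAB_STORE_USER enabled, the trailing red-zone word and the last-user word are appended inside the object, so dbg_redzone2()/dbg_userword() locate them purely by offset from the end of objsize. A small sketch of that offset arithmetic; the object size below is an arbitrary example.]

#include <stdio.h>

#define BYTES_PER_WORD ((unsigned long)sizeof(void *))

int main(void)
{
        unsigned long objsize = 128;    /* example: total size including debug words */
        int store_user = 1;             /* is SLAB_STORE_USER set? */

        /* The last word of the object holds the caller address when stored. */
        unsigned long userword_off = objsize - BYTES_PER_WORD;

        /* The second red zone sits just before it, or is itself the last word. */
        unsigned long redzone2_off = store_user ?
                objsize - 2 * BYTES_PER_WORD : objsize - BYTES_PER_WORD;

        printf("trailing red zone at offset %lu\n", redzone2_off);
        if (store_user)
                printf("last-user word at offset %lu\n", userword_off);
        return 0;
}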
@@ -607,31 +607,31 @@ struct cache_names { | |||
607 | static struct cache_names __initdata cache_names[] = { | 607 | static struct cache_names __initdata cache_names[] = { |
608 | #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, | 608 | #define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, |
609 | #include <linux/kmalloc_sizes.h> | 609 | #include <linux/kmalloc_sizes.h> |
610 | { NULL, } | 610 | {NULL,} |
611 | #undef CACHE | 611 | #undef CACHE |
612 | }; | 612 | }; |
613 | 613 | ||
614 | static struct arraycache_init initarray_cache __initdata = | 614 | static struct arraycache_init initarray_cache __initdata = |
615 | { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; | 615 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; |
616 | static struct arraycache_init initarray_generic = | 616 | static struct arraycache_init initarray_generic = |
617 | { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; | 617 | { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; |
618 | 618 | ||
619 | /* internal cache of cache description objs */ | 619 | /* internal cache of cache description objs */ |
620 | static kmem_cache_t cache_cache = { | 620 | static kmem_cache_t cache_cache = { |
621 | .batchcount = 1, | 621 | .batchcount = 1, |
622 | .limit = BOOT_CPUCACHE_ENTRIES, | 622 | .limit = BOOT_CPUCACHE_ENTRIES, |
623 | .shared = 1, | 623 | .shared = 1, |
624 | .objsize = sizeof(kmem_cache_t), | 624 | .objsize = sizeof(kmem_cache_t), |
625 | .flags = SLAB_NO_REAP, | 625 | .flags = SLAB_NO_REAP, |
626 | .spinlock = SPIN_LOCK_UNLOCKED, | 626 | .spinlock = SPIN_LOCK_UNLOCKED, |
627 | .name = "kmem_cache", | 627 | .name = "kmem_cache", |
628 | #if DEBUG | 628 | #if DEBUG |
629 | .reallen = sizeof(kmem_cache_t), | 629 | .reallen = sizeof(kmem_cache_t), |
630 | #endif | 630 | #endif |
631 | }; | 631 | }; |
632 | 632 | ||
633 | /* Guard access to the cache-chain. */ | 633 | /* Guard access to the cache-chain. */ |
634 | static struct semaphore cache_chain_sem; | 634 | static struct semaphore cache_chain_sem; |
635 | static struct list_head cache_chain; | 635 | static struct list_head cache_chain; |
636 | 636 | ||
637 | /* | 637 | /* |
@@ -655,9 +655,9 @@ static enum { | |||
655 | 655 | ||
656 | static DEFINE_PER_CPU(struct work_struct, reap_work); | 656 | static DEFINE_PER_CPU(struct work_struct, reap_work); |
657 | 657 | ||
658 | static void free_block(kmem_cache_t* cachep, void** objpp, int len, int node); | 658 | static void free_block(kmem_cache_t *cachep, void **objpp, int len, int node); |
659 | static void enable_cpucache (kmem_cache_t *cachep); | 659 | static void enable_cpucache(kmem_cache_t *cachep); |
660 | static void cache_reap (void *unused); | 660 | static void cache_reap(void *unused); |
661 | static int __node_shrink(kmem_cache_t *cachep, int node); | 661 | static int __node_shrink(kmem_cache_t *cachep, int node); |
662 | 662 | ||
663 | static inline struct array_cache *ac_data(kmem_cache_t *cachep) | 663 | static inline struct array_cache *ac_data(kmem_cache_t *cachep) |
@@ -671,9 +671,9 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags) | |||
671 | 671 | ||
672 | #if DEBUG | 672 | #if DEBUG |
673 | /* This happens if someone tries to call | 673 | /* This happens if someone tries to call |
674 | * kmem_cache_create(), or __kmalloc(), before | 674 | * kmem_cache_create(), or __kmalloc(), before |
675 | * the generic caches are initialized. | 675 | * the generic caches are initialized. |
676 | */ | 676 | */ |
677 | BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); | 677 | BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); |
678 | #endif | 678 | #endif |
679 | while (size > csizep->cs_size) | 679 | while (size > csizep->cs_size) |
@@ -697,10 +697,10 @@ EXPORT_SYMBOL(kmem_find_general_cachep); | |||
697 | 697 | ||
698 | /* Cal the num objs, wastage, and bytes left over for a given slab size. */ | 698 | /* Cal the num objs, wastage, and bytes left over for a given slab size. */ |
699 | static void cache_estimate(unsigned long gfporder, size_t size, size_t align, | 699 | static void cache_estimate(unsigned long gfporder, size_t size, size_t align, |
700 | int flags, size_t *left_over, unsigned int *num) | 700 | int flags, size_t *left_over, unsigned int *num) |
701 | { | 701 | { |
702 | int i; | 702 | int i; |
703 | size_t wastage = PAGE_SIZE<<gfporder; | 703 | size_t wastage = PAGE_SIZE << gfporder; |
704 | size_t extra = 0; | 704 | size_t extra = 0; |
705 | size_t base = 0; | 705 | size_t base = 0; |
706 | 706 | ||
@@ -709,7 +709,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align, | |||
709 | extra = sizeof(kmem_bufctl_t); | 709 | extra = sizeof(kmem_bufctl_t); |
710 | } | 710 | } |
711 | i = 0; | 711 | i = 0; |
712 | while (i*size + ALIGN(base+i*extra, align) <= wastage) | 712 | while (i * size + ALIGN(base + i * extra, align) <= wastage) |
713 | i++; | 713 | i++; |
714 | if (i > 0) | 714 | if (i > 0) |
715 | i--; | 715 | i--; |
@@ -718,8 +718,8 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align, | |||
718 | i = SLAB_LIMIT; | 718 | i = SLAB_LIMIT; |
719 | 719 | ||
720 | *num = i; | 720 | *num = i; |
721 | wastage -= i*size; | 721 | wastage -= i * size; |
722 | wastage -= ALIGN(base+i*extra, align); | 722 | wastage -= ALIGN(base + i * extra, align); |
723 | *left_over = wastage; | 723 | *left_over = wastage; |
724 | } | 724 | } |
725 | 725 | ||
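[Editor's note: cache_estimate(), reformatted above, packs as many objects as possible into a 2^gfporder page block while reserving room for on-slab management data (a struct slab plus one kmem_bufctl_t index per object), and reports the leftover bytes later used for cache colouring. A self-contained model of that calculation follows; the management-structure sizes are stand-ins, not the kernel's exact layout.]

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

static void cache_estimate(unsigned int order, unsigned long size,
                           unsigned long align, int on_slab_mgmt,
                           unsigned long *left_over, unsigned int *num)
{
        unsigned long wastage = PAGE_SIZE << order;
        unsigned long base = 0, extra = 0;
        unsigned int i = 0;

        if (on_slab_mgmt) {
                base = 40;      /* stand-in for sizeof(struct slab) */
                extra = 4;      /* stand-in for sizeof(kmem_bufctl_t) */
        }

        /* Count objects until the block overflows, then step back one. */
        while (i * size + ALIGN(base + i * extra, align) <= wastage)
                i++;
        if (i > 0)
                i--;

        *num = i;
        wastage -= i * size;
        wastage -= ALIGN(base + i * extra, align);
        *left_over = wastage;
}

int main(void)
{
        unsigned long left;
        unsigned int num;

        cache_estimate(0, 256, 64, 1, &left, &num);
        printf("order 0, 256-byte objects: %u per slab, %lu bytes left over\n",
               num, left);
        return 0;
}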
@@ -728,7 +728,7 @@ static void cache_estimate(unsigned long gfporder, size_t size, size_t align, | |||
728 | static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) | 728 | static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg) |
729 | { | 729 | { |
730 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", | 730 | printk(KERN_ERR "slab error in %s(): cache `%s': %s\n", |
731 | function, cachep->name, msg); | 731 | function, cachep->name, msg); |
732 | dump_stack(); | 732 | dump_stack(); |
733 | } | 733 | } |
734 | 734 | ||
@@ -755,9 +755,9 @@ static void __devinit start_cpu_timer(int cpu) | |||
755 | } | 755 | } |
756 | 756 | ||
757 | static struct array_cache *alloc_arraycache(int node, int entries, | 757 | static struct array_cache *alloc_arraycache(int node, int entries, |
758 | int batchcount) | 758 | int batchcount) |
759 | { | 759 | { |
760 | int memsize = sizeof(void*)*entries+sizeof(struct array_cache); | 760 | int memsize = sizeof(void *) * entries + sizeof(struct array_cache); |
761 | struct array_cache *nc = NULL; | 761 | struct array_cache *nc = NULL; |
762 | 762 | ||
763 | nc = kmalloc_node(memsize, GFP_KERNEL, node); | 763 | nc = kmalloc_node(memsize, GFP_KERNEL, node); |
@@ -775,7 +775,7 @@ static struct array_cache *alloc_arraycache(int node, int entries, | |||
775 | static inline struct array_cache **alloc_alien_cache(int node, int limit) | 775 | static inline struct array_cache **alloc_alien_cache(int node, int limit) |
776 | { | 776 | { |
777 | struct array_cache **ac_ptr; | 777 | struct array_cache **ac_ptr; |
778 | int memsize = sizeof(void*)*MAX_NUMNODES; | 778 | int memsize = sizeof(void *) * MAX_NUMNODES; |
779 | int i; | 779 | int i; |
780 | 780 | ||
781 | if (limit > 1) | 781 | if (limit > 1) |
@@ -789,7 +789,7 @@ static inline struct array_cache **alloc_alien_cache(int node, int limit) | |||
789 | } | 789 | } |
790 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); | 790 | ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d); |
791 | if (!ac_ptr[i]) { | 791 | if (!ac_ptr[i]) { |
792 | for (i--; i <=0; i--) | 792 | for (i--; i <= 0; i--) |
793 | kfree(ac_ptr[i]); | 793 | kfree(ac_ptr[i]); |
794 | kfree(ac_ptr); | 794 | kfree(ac_ptr); |
795 | return NULL; | 795 | return NULL; |
@@ -807,12 +807,13 @@ static inline void free_alien_cache(struct array_cache **ac_ptr) | |||
807 | return; | 807 | return; |
808 | 808 | ||
809 | for_each_node(i) | 809 | for_each_node(i) |
810 | kfree(ac_ptr[i]); | 810 | kfree(ac_ptr[i]); |
811 | 811 | ||
812 | kfree(ac_ptr); | 812 | kfree(ac_ptr); |
813 | } | 813 | } |
814 | 814 | ||
815 | static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node) | 815 | static inline void __drain_alien_cache(kmem_cache_t *cachep, |
816 | struct array_cache *ac, int node) | ||
816 | { | 817 | { |
817 | struct kmem_list3 *rl3 = cachep->nodelists[node]; | 818 | struct kmem_list3 *rl3 = cachep->nodelists[node]; |
818 | 819 | ||
@@ -826,7 +827,7 @@ static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache | |||
826 | 827 | ||
827 | static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) | 828 | static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) |
828 | { | 829 | { |
829 | int i=0; | 830 | int i = 0; |
830 | struct array_cache *ac; | 831 | struct array_cache *ac; |
831 | unsigned long flags; | 832 | unsigned long flags; |
832 | 833 | ||
@@ -846,14 +847,13 @@ static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3) | |||
846 | #endif | 847 | #endif |
847 | 848 | ||
848 | static int __devinit cpuup_callback(struct notifier_block *nfb, | 849 | static int __devinit cpuup_callback(struct notifier_block *nfb, |
849 | unsigned long action, void *hcpu) | 850 | unsigned long action, void *hcpu) |
850 | { | 851 | { |
851 | long cpu = (long)hcpu; | 852 | long cpu = (long)hcpu; |
852 | kmem_cache_t* cachep; | 853 | kmem_cache_t *cachep; |
853 | struct kmem_list3 *l3 = NULL; | 854 | struct kmem_list3 *l3 = NULL; |
854 | int node = cpu_to_node(cpu); | 855 | int node = cpu_to_node(cpu); |
855 | int memsize = sizeof(struct kmem_list3); | 856 | int memsize = sizeof(struct kmem_list3); |
856 | struct array_cache *nc = NULL; | ||
857 | 857 | ||
858 | switch (action) { | 858 | switch (action) { |
859 | case CPU_UP_PREPARE: | 859 | case CPU_UP_PREPARE: |
@@ -871,27 +871,29 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
871 | */ | 871 | */ |
872 | if (!cachep->nodelists[node]) { | 872 | if (!cachep->nodelists[node]) { |
873 | if (!(l3 = kmalloc_node(memsize, | 873 | if (!(l3 = kmalloc_node(memsize, |
874 | GFP_KERNEL, node))) | 874 | GFP_KERNEL, node))) |
875 | goto bad; | 875 | goto bad; |
876 | kmem_list3_init(l3); | 876 | kmem_list3_init(l3); |
877 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + | 877 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + |
878 | ((unsigned long)cachep)%REAPTIMEOUT_LIST3; | 878 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; |
879 | 879 | ||
880 | cachep->nodelists[node] = l3; | 880 | cachep->nodelists[node] = l3; |
881 | } | 881 | } |
882 | 882 | ||
883 | spin_lock_irq(&cachep->nodelists[node]->list_lock); | 883 | spin_lock_irq(&cachep->nodelists[node]->list_lock); |
884 | cachep->nodelists[node]->free_limit = | 884 | cachep->nodelists[node]->free_limit = |
885 | (1 + nr_cpus_node(node)) * | 885 | (1 + nr_cpus_node(node)) * |
886 | cachep->batchcount + cachep->num; | 886 | cachep->batchcount + cachep->num; |
887 | spin_unlock_irq(&cachep->nodelists[node]->list_lock); | 887 | spin_unlock_irq(&cachep->nodelists[node]->list_lock); |
888 | } | 888 | } |
889 | 889 | ||
890 | /* Now we can go ahead with allocating the shared array's | 890 | /* Now we can go ahead with allocating the shared array's |
891 | & array cache's */ | 891 | & array cache's */ |
892 | list_for_each_entry(cachep, &cache_chain, next) { | 892 | list_for_each_entry(cachep, &cache_chain, next) { |
893 | struct array_cache *nc; | ||
894 | |||
893 | nc = alloc_arraycache(node, cachep->limit, | 895 | nc = alloc_arraycache(node, cachep->limit, |
894 | cachep->batchcount); | 896 | cachep->batchcount); |
895 | if (!nc) | 897 | if (!nc) |
896 | goto bad; | 898 | goto bad; |
897 | cachep->array[cpu] = nc; | 899 | cachep->array[cpu] = nc; |
@@ -900,12 +902,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
900 | BUG_ON(!l3); | 902 | BUG_ON(!l3); |
901 | if (!l3->shared) { | 903 | if (!l3->shared) { |
902 | if (!(nc = alloc_arraycache(node, | 904 | if (!(nc = alloc_arraycache(node, |
903 | cachep->shared*cachep->batchcount, | 905 | cachep->shared * |
904 | 0xbaadf00d))) | 906 | cachep->batchcount, |
905 | goto bad; | 907 | 0xbaadf00d))) |
908 | goto bad; | ||
906 | 909 | ||
907 | /* we are serialised from CPU_DEAD or | 910 | /* we are serialised from CPU_DEAD or |
908 | CPU_UP_CANCELLED by the cpucontrol lock */ | 911 | CPU_UP_CANCELLED by the cpucontrol lock */ |
909 | l3->shared = nc; | 912 | l3->shared = nc; |
910 | } | 913 | } |
911 | } | 914 | } |
@@ -942,13 +945,13 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
942 | free_block(cachep, nc->entry, nc->avail, node); | 945 | free_block(cachep, nc->entry, nc->avail, node); |
943 | 946 | ||
944 | if (!cpus_empty(mask)) { | 947 | if (!cpus_empty(mask)) { |
945 | spin_unlock(&l3->list_lock); | 948 | spin_unlock(&l3->list_lock); |
946 | goto unlock_cache; | 949 | goto unlock_cache; |
947 | } | 950 | } |
948 | 951 | ||
949 | if (l3->shared) { | 952 | if (l3->shared) { |
950 | free_block(cachep, l3->shared->entry, | 953 | free_block(cachep, l3->shared->entry, |
951 | l3->shared->avail, node); | 954 | l3->shared->avail, node); |
952 | kfree(l3->shared); | 955 | kfree(l3->shared); |
953 | l3->shared = NULL; | 956 | l3->shared = NULL; |
954 | } | 957 | } |
@@ -966,7 +969,7 @@ static int __devinit cpuup_callback(struct notifier_block *nfb, | |||
966 | } else { | 969 | } else { |
967 | spin_unlock(&l3->list_lock); | 970 | spin_unlock(&l3->list_lock); |
968 | } | 971 | } |
969 | unlock_cache: | 972 | unlock_cache: |
970 | spin_unlock_irq(&cachep->spinlock); | 973 | spin_unlock_irq(&cachep->spinlock); |
971 | kfree(nc); | 974 | kfree(nc); |
972 | } | 975 | } |
@@ -975,7 +978,7 @@ unlock_cache: | |||
975 | #endif | 978 | #endif |
976 | } | 979 | } |
977 | return NOTIFY_OK; | 980 | return NOTIFY_OK; |
978 | bad: | 981 | bad: |
979 | up(&cache_chain_sem); | 982 | up(&cache_chain_sem); |
980 | return NOTIFY_BAD; | 983 | return NOTIFY_BAD; |
981 | } | 984 | } |
@@ -985,8 +988,7 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; | |||
985 | /* | 988 | /* |
986 | * swap the static kmem_list3 with kmalloced memory | 989 | * swap the static kmem_list3 with kmalloced memory |
987 | */ | 990 | */ |
988 | static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, | 991 | static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list, int nodeid) |
989 | int nodeid) | ||
990 | { | 992 | { |
991 | struct kmem_list3 *ptr; | 993 | struct kmem_list3 *ptr; |
992 | 994 | ||
@@ -1055,14 +1057,14 @@ void __init kmem_cache_init(void) | |||
1055 | cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); | 1057 | cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); |
1056 | 1058 | ||
1057 | cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, | 1059 | cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, |
1058 | &left_over, &cache_cache.num); | 1060 | &left_over, &cache_cache.num); |
1059 | if (!cache_cache.num) | 1061 | if (!cache_cache.num) |
1060 | BUG(); | 1062 | BUG(); |
1061 | 1063 | ||
1062 | cache_cache.colour = left_over/cache_cache.colour_off; | 1064 | cache_cache.colour = left_over / cache_cache.colour_off; |
1063 | cache_cache.colour_next = 0; | 1065 | cache_cache.colour_next = 0; |
1064 | cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) + | 1066 | cache_cache.slab_size = ALIGN(cache_cache.num * sizeof(kmem_bufctl_t) + |
1065 | sizeof(struct slab), cache_line_size()); | 1067 | sizeof(struct slab), cache_line_size()); |
1066 | 1068 | ||
1067 | /* 2+3) create the kmalloc caches */ | 1069 | /* 2+3) create the kmalloc caches */ |
1068 | sizes = malloc_sizes; | 1070 | sizes = malloc_sizes; |
@@ -1074,14 +1076,18 @@ void __init kmem_cache_init(void) | |||
1074 | */ | 1076 | */ |
1075 | 1077 | ||
1076 | sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, | 1078 | sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name, |
1077 | sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN, | 1079 | sizes[INDEX_AC].cs_size, |
1078 | (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); | 1080 | ARCH_KMALLOC_MINALIGN, |
1081 | (ARCH_KMALLOC_FLAGS | | ||
1082 | SLAB_PANIC), NULL, NULL); | ||
1079 | 1083 | ||
1080 | if (INDEX_AC != INDEX_L3) | 1084 | if (INDEX_AC != INDEX_L3) |
1081 | sizes[INDEX_L3].cs_cachep = | 1085 | sizes[INDEX_L3].cs_cachep = |
1082 | kmem_cache_create(names[INDEX_L3].name, | 1086 | kmem_cache_create(names[INDEX_L3].name, |
1083 | sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN, | 1087 | sizes[INDEX_L3].cs_size, |
1084 | (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); | 1088 | ARCH_KMALLOC_MINALIGN, |
1089 | (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, | ||
1090 | NULL); | ||
1085 | 1091 | ||
1086 | while (sizes->cs_size != ULONG_MAX) { | 1092 | while (sizes->cs_size != ULONG_MAX) { |
1087 | /* | 1093 | /* |
@@ -1091,35 +1097,41 @@ void __init kmem_cache_init(void) | |||
1091 | * Note for systems short on memory removing the alignment will | 1097 | * Note for systems short on memory removing the alignment will |
1092 | * allow tighter packing of the smaller caches. | 1098 | * allow tighter packing of the smaller caches. |
1093 | */ | 1099 | */ |
1094 | if(!sizes->cs_cachep) | 1100 | if (!sizes->cs_cachep) |
1095 | sizes->cs_cachep = kmem_cache_create(names->name, | 1101 | sizes->cs_cachep = kmem_cache_create(names->name, |
1096 | sizes->cs_size, ARCH_KMALLOC_MINALIGN, | 1102 | sizes->cs_size, |
1097 | (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); | 1103 | ARCH_KMALLOC_MINALIGN, |
1104 | (ARCH_KMALLOC_FLAGS | ||
1105 | | SLAB_PANIC), | ||
1106 | NULL, NULL); | ||
1098 | 1107 | ||
1099 | /* Inc off-slab bufctl limit until the ceiling is hit. */ | 1108 | /* Inc off-slab bufctl limit until the ceiling is hit. */ |
1100 | if (!(OFF_SLAB(sizes->cs_cachep))) { | 1109 | if (!(OFF_SLAB(sizes->cs_cachep))) { |
1101 | offslab_limit = sizes->cs_size-sizeof(struct slab); | 1110 | offslab_limit = sizes->cs_size - sizeof(struct slab); |
1102 | offslab_limit /= sizeof(kmem_bufctl_t); | 1111 | offslab_limit /= sizeof(kmem_bufctl_t); |
1103 | } | 1112 | } |
1104 | 1113 | ||
1105 | sizes->cs_dmacachep = kmem_cache_create(names->name_dma, | 1114 | sizes->cs_dmacachep = kmem_cache_create(names->name_dma, |
1106 | sizes->cs_size, ARCH_KMALLOC_MINALIGN, | 1115 | sizes->cs_size, |
1107 | (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC), | 1116 | ARCH_KMALLOC_MINALIGN, |
1108 | NULL, NULL); | 1117 | (ARCH_KMALLOC_FLAGS | |
1118 | SLAB_CACHE_DMA | | ||
1119 | SLAB_PANIC), NULL, | ||
1120 | NULL); | ||
1109 | 1121 | ||
1110 | sizes++; | 1122 | sizes++; |
1111 | names++; | 1123 | names++; |
1112 | } | 1124 | } |
1113 | /* 4) Replace the bootstrap head arrays */ | 1125 | /* 4) Replace the bootstrap head arrays */ |
1114 | { | 1126 | { |
1115 | void * ptr; | 1127 | void *ptr; |
1116 | 1128 | ||
1117 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); | 1129 | ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); |
1118 | 1130 | ||
1119 | local_irq_disable(); | 1131 | local_irq_disable(); |
1120 | BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); | 1132 | BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); |
1121 | memcpy(ptr, ac_data(&cache_cache), | 1133 | memcpy(ptr, ac_data(&cache_cache), |
1122 | sizeof(struct arraycache_init)); | 1134 | sizeof(struct arraycache_init)); |
1123 | cache_cache.array[smp_processor_id()] = ptr; | 1135 | cache_cache.array[smp_processor_id()] = ptr; |
1124 | local_irq_enable(); | 1136 | local_irq_enable(); |
1125 | 1137 | ||
@@ -1127,11 +1139,11 @@ void __init kmem_cache_init(void) | |||
1127 | 1139 | ||
1128 | local_irq_disable(); | 1140 | local_irq_disable(); |
1129 | BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep) | 1141 | BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep) |
1130 | != &initarray_generic.cache); | 1142 | != &initarray_generic.cache); |
1131 | memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep), | 1143 | memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep), |
1132 | sizeof(struct arraycache_init)); | 1144 | sizeof(struct arraycache_init)); |
1133 | malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = | 1145 | malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = |
1134 | ptr; | 1146 | ptr; |
1135 | local_irq_enable(); | 1147 | local_irq_enable(); |
1136 | } | 1148 | } |
1137 | /* 5) Replace the bootstrap kmem_list3's */ | 1149 | /* 5) Replace the bootstrap kmem_list3's */ |
@@ -1139,16 +1151,16 @@ void __init kmem_cache_init(void) | |||
1139 | int node; | 1151 | int node; |
1140 | /* Replace the static kmem_list3 structures for the boot cpu */ | 1152 | /* Replace the static kmem_list3 structures for the boot cpu */ |
1141 | init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], | 1153 | init_list(&cache_cache, &initkmem_list3[CACHE_CACHE], |
1142 | numa_node_id()); | 1154 | numa_node_id()); |
1143 | 1155 | ||
1144 | for_each_online_node(node) { | 1156 | for_each_online_node(node) { |
1145 | init_list(malloc_sizes[INDEX_AC].cs_cachep, | 1157 | init_list(malloc_sizes[INDEX_AC].cs_cachep, |
1146 | &initkmem_list3[SIZE_AC+node], node); | 1158 | &initkmem_list3[SIZE_AC + node], node); |
1147 | 1159 | ||
1148 | if (INDEX_AC != INDEX_L3) { | 1160 | if (INDEX_AC != INDEX_L3) { |
1149 | init_list(malloc_sizes[INDEX_L3].cs_cachep, | 1161 | init_list(malloc_sizes[INDEX_L3].cs_cachep, |
1150 | &initkmem_list3[SIZE_L3+node], | 1162 | &initkmem_list3[SIZE_L3 + node], |
1151 | node); | 1163 | node); |
1152 | } | 1164 | } |
1153 | } | 1165 | } |
1154 | } | 1166 | } |
@@ -1158,7 +1170,7 @@ void __init kmem_cache_init(void) | |||
1158 | kmem_cache_t *cachep; | 1170 | kmem_cache_t *cachep; |
1159 | down(&cache_chain_sem); | 1171 | down(&cache_chain_sem); |
1160 | list_for_each_entry(cachep, &cache_chain, next) | 1172 | list_for_each_entry(cachep, &cache_chain, next) |
1161 | enable_cpucache(cachep); | 1173 | enable_cpucache(cachep); |
1162 | up(&cache_chain_sem); | 1174 | up(&cache_chain_sem); |
1163 | } | 1175 | } |
1164 | 1176 | ||
@@ -1184,7 +1196,7 @@ static int __init cpucache_init(void) | |||
1184 | * pages to gfp. | 1196 | * pages to gfp. |
1185 | */ | 1197 | */ |
1186 | for_each_online_cpu(cpu) | 1198 | for_each_online_cpu(cpu) |
1187 | start_cpu_timer(cpu); | 1199 | start_cpu_timer(cpu); |
1188 | 1200 | ||
1189 | return 0; | 1201 | return 0; |
1190 | } | 1202 | } |
@@ -1226,7 +1238,7 @@ static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid) | |||
1226 | */ | 1238 | */ |
1227 | static void kmem_freepages(kmem_cache_t *cachep, void *addr) | 1239 | static void kmem_freepages(kmem_cache_t *cachep, void *addr) |
1228 | { | 1240 | { |
1229 | unsigned long i = (1<<cachep->gfporder); | 1241 | unsigned long i = (1 << cachep->gfporder); |
1230 | struct page *page = virt_to_page(addr); | 1242 | struct page *page = virt_to_page(addr); |
1231 | const unsigned long nr_freed = i; | 1243 | const unsigned long nr_freed = i; |
1232 | 1244 | ||
@@ -1239,13 +1251,13 @@ static void kmem_freepages(kmem_cache_t *cachep, void *addr) | |||
1239 | if (current->reclaim_state) | 1251 | if (current->reclaim_state) |
1240 | current->reclaim_state->reclaimed_slab += nr_freed; | 1252 | current->reclaim_state->reclaimed_slab += nr_freed; |
1241 | free_pages((unsigned long)addr, cachep->gfporder); | 1253 | free_pages((unsigned long)addr, cachep->gfporder); |
1242 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) | 1254 | if (cachep->flags & SLAB_RECLAIM_ACCOUNT) |
1243 | atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages); | 1255 | atomic_sub(1 << cachep->gfporder, &slab_reclaim_pages); |
1244 | } | 1256 | } |
1245 | 1257 | ||
1246 | static void kmem_rcu_free(struct rcu_head *head) | 1258 | static void kmem_rcu_free(struct rcu_head *head) |
1247 | { | 1259 | { |
1248 | struct slab_rcu *slab_rcu = (struct slab_rcu *) head; | 1260 | struct slab_rcu *slab_rcu = (struct slab_rcu *)head; |
1249 | kmem_cache_t *cachep = slab_rcu->cachep; | 1261 | kmem_cache_t *cachep = slab_rcu->cachep; |
1250 | 1262 | ||
1251 | kmem_freepages(cachep, slab_rcu->addr); | 1263 | kmem_freepages(cachep, slab_rcu->addr); |
@@ -1257,19 +1269,19 @@ static void kmem_rcu_free(struct rcu_head *head) | |||
1257 | 1269 | ||
1258 | #ifdef CONFIG_DEBUG_PAGEALLOC | 1270 | #ifdef CONFIG_DEBUG_PAGEALLOC |
1259 | static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, | 1271 | static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, |
1260 | unsigned long caller) | 1272 | unsigned long caller) |
1261 | { | 1273 | { |
1262 | int size = obj_reallen(cachep); | 1274 | int size = obj_reallen(cachep); |
1263 | 1275 | ||
1264 | addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)]; | 1276 | addr = (unsigned long *)&((char *)addr)[obj_dbghead(cachep)]; |
1265 | 1277 | ||
1266 | if (size < 5*sizeof(unsigned long)) | 1278 | if (size < 5 * sizeof(unsigned long)) |
1267 | return; | 1279 | return; |
1268 | 1280 | ||
1269 | *addr++=0x12345678; | 1281 | *addr++ = 0x12345678; |
1270 | *addr++=caller; | 1282 | *addr++ = caller; |
1271 | *addr++=smp_processor_id(); | 1283 | *addr++ = smp_processor_id(); |
1272 | size -= 3*sizeof(unsigned long); | 1284 | size -= 3 * sizeof(unsigned long); |
1273 | { | 1285 | { |
1274 | unsigned long *sptr = &caller; | 1286 | unsigned long *sptr = &caller; |
1275 | unsigned long svalue; | 1287 | unsigned long svalue; |
@@ -1277,7 +1289,7 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, | |||
1277 | while (!kstack_end(sptr)) { | 1289 | while (!kstack_end(sptr)) { |
1278 | svalue = *sptr++; | 1290 | svalue = *sptr++; |
1279 | if (kernel_text_address(svalue)) { | 1291 | if (kernel_text_address(svalue)) { |
1280 | *addr++=svalue; | 1292 | *addr++ = svalue; |
1281 | size -= sizeof(unsigned long); | 1293 | size -= sizeof(unsigned long); |
1282 | if (size <= sizeof(unsigned long)) | 1294 | if (size <= sizeof(unsigned long)) |
1283 | break; | 1295 | break; |
@@ -1285,25 +1297,25 @@ static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, | |||
1285 | } | 1297 | } |
1286 | 1298 | ||
1287 | } | 1299 | } |
1288 | *addr++=0x87654321; | 1300 | *addr++ = 0x87654321; |
1289 | } | 1301 | } |
1290 | #endif | 1302 | #endif |
1291 | 1303 | ||
1292 | static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) | 1304 | static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val) |
1293 | { | 1305 | { |
1294 | int size = obj_reallen(cachep); | 1306 | int size = obj_reallen(cachep); |
1295 | addr = &((char*)addr)[obj_dbghead(cachep)]; | 1307 | addr = &((char *)addr)[obj_dbghead(cachep)]; |
1296 | 1308 | ||
1297 | memset(addr, val, size); | 1309 | memset(addr, val, size); |
1298 | *(unsigned char *)(addr+size-1) = POISON_END; | 1310 | *(unsigned char *)(addr + size - 1) = POISON_END; |
1299 | } | 1311 | } |
1300 | 1312 | ||
1301 | static void dump_line(char *data, int offset, int limit) | 1313 | static void dump_line(char *data, int offset, int limit) |
1302 | { | 1314 | { |
1303 | int i; | 1315 | int i; |
1304 | printk(KERN_ERR "%03x:", offset); | 1316 | printk(KERN_ERR "%03x:", offset); |
1305 | for (i=0;i<limit;i++) { | 1317 | for (i = 0; i < limit; i++) { |
1306 | printk(" %02x", (unsigned char)data[offset+i]); | 1318 | printk(" %02x", (unsigned char)data[offset + i]); |
1307 | } | 1319 | } |
1308 | printk("\n"); | 1320 | printk("\n"); |
1309 | } | 1321 | } |
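[Editor's note: the poisoning helpers above fill a free object with POISON_FREE and terminate it with POISON_END; check_poison_obj() then scans for the first byte that no longer matches and hexdumps the surrounding 16-byte line. A minimal fill-and-verify sketch; the 0x6b/0xa5 values follow the kernel's convention but are hard-coded here.]

#include <stdio.h>
#include <string.h>

#define POISON_FREE 0x6b
#define POISON_END  0xa5

static void poison_obj(unsigned char *obj, int size)
{
        memset(obj, POISON_FREE, size);
        obj[size - 1] = POISON_END;     /* marker byte at the very end */
}

/* Returns the offset of the first corrupted byte, or -1 if the poison is intact. */
static int check_poison_obj(const unsigned char *obj, int size)
{
        for (int i = 0; i < size; i++) {
                unsigned char exp = (i == size - 1) ? POISON_END : POISON_FREE;

                if (obj[i] != exp)
                        return i;
        }
        return -1;
}

int main(void)
{
        unsigned char obj[64];
        int bad;

        poison_obj(obj, sizeof(obj));
        obj[20] = 0x00;                 /* simulate a use-after-free scribble */

        bad = check_poison_obj(obj, sizeof(obj));
        if (bad >= 0)
                printf("Slab corruption at offset %d (16-byte line starts at %d)\n",
                       bad, (bad / 16) * 16);
        return 0;
}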
@@ -1318,24 +1330,24 @@ static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines) | |||
1318 | 1330 | ||
1319 | if (cachep->flags & SLAB_RED_ZONE) { | 1331 | if (cachep->flags & SLAB_RED_ZONE) { |
1320 | printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", | 1332 | printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", |
1321 | *dbg_redzone1(cachep, objp), | 1333 | *dbg_redzone1(cachep, objp), |
1322 | *dbg_redzone2(cachep, objp)); | 1334 | *dbg_redzone2(cachep, objp)); |
1323 | } | 1335 | } |
1324 | 1336 | ||
1325 | if (cachep->flags & SLAB_STORE_USER) { | 1337 | if (cachep->flags & SLAB_STORE_USER) { |
1326 | printk(KERN_ERR "Last user: [<%p>]", | 1338 | printk(KERN_ERR "Last user: [<%p>]", |
1327 | *dbg_userword(cachep, objp)); | 1339 | *dbg_userword(cachep, objp)); |
1328 | print_symbol("(%s)", | 1340 | print_symbol("(%s)", |
1329 | (unsigned long)*dbg_userword(cachep, objp)); | 1341 | (unsigned long)*dbg_userword(cachep, objp)); |
1330 | printk("\n"); | 1342 | printk("\n"); |
1331 | } | 1343 | } |
1332 | realobj = (char*)objp+obj_dbghead(cachep); | 1344 | realobj = (char *)objp + obj_dbghead(cachep); |
1333 | size = obj_reallen(cachep); | 1345 | size = obj_reallen(cachep); |
1334 | for (i=0; i<size && lines;i+=16, lines--) { | 1346 | for (i = 0; i < size && lines; i += 16, lines--) { |
1335 | int limit; | 1347 | int limit; |
1336 | limit = 16; | 1348 | limit = 16; |
1337 | if (i+limit > size) | 1349 | if (i + limit > size) |
1338 | limit = size-i; | 1350 | limit = size - i; |
1339 | dump_line(realobj, i, limit); | 1351 | dump_line(realobj, i, limit); |
1340 | } | 1352 | } |
1341 | } | 1353 | } |
@@ -1346,27 +1358,28 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) | |||
1346 | int size, i; | 1358 | int size, i; |
1347 | int lines = 0; | 1359 | int lines = 0; |
1348 | 1360 | ||
1349 | realobj = (char*)objp+obj_dbghead(cachep); | 1361 | realobj = (char *)objp + obj_dbghead(cachep); |
1350 | size = obj_reallen(cachep); | 1362 | size = obj_reallen(cachep); |
1351 | 1363 | ||
1352 | for (i=0;i<size;i++) { | 1364 | for (i = 0; i < size; i++) { |
1353 | char exp = POISON_FREE; | 1365 | char exp = POISON_FREE; |
1354 | if (i == size-1) | 1366 | if (i == size - 1) |
1355 | exp = POISON_END; | 1367 | exp = POISON_END; |
1356 | if (realobj[i] != exp) { | 1368 | if (realobj[i] != exp) { |
1357 | int limit; | 1369 | int limit; |
1358 | /* Mismatch ! */ | 1370 | /* Mismatch ! */ |
1359 | /* Print header */ | 1371 | /* Print header */ |
1360 | if (lines == 0) { | 1372 | if (lines == 0) { |
1361 | printk(KERN_ERR "Slab corruption: start=%p, len=%d\n", | 1373 | printk(KERN_ERR |
1362 | realobj, size); | 1374 | "Slab corruption: start=%p, len=%d\n", |
1375 | realobj, size); | ||
1363 | print_objinfo(cachep, objp, 0); | 1376 | print_objinfo(cachep, objp, 0); |
1364 | } | 1377 | } |
1365 | /* Hexdump the affected line */ | 1378 | /* Hexdump the affected line */ |
1366 | i = (i/16)*16; | 1379 | i = (i / 16) * 16; |
1367 | limit = 16; | 1380 | limit = 16; |
1368 | if (i+limit > size) | 1381 | if (i + limit > size) |
1369 | limit = size-i; | 1382 | limit = size - i; |
1370 | dump_line(realobj, i, limit); | 1383 | dump_line(realobj, i, limit); |
1371 | i += 16; | 1384 | i += 16; |
1372 | lines++; | 1385 | lines++; |
@@ -1382,19 +1395,19 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) | |||
1382 | struct slab *slabp = page_get_slab(virt_to_page(objp)); | 1395 | struct slab *slabp = page_get_slab(virt_to_page(objp)); |
1383 | int objnr; | 1396 | int objnr; |
1384 | 1397 | ||
1385 | objnr = (objp-slabp->s_mem)/cachep->objsize; | 1398 | objnr = (objp - slabp->s_mem) / cachep->objsize; |
1386 | if (objnr) { | 1399 | if (objnr) { |
1387 | objp = slabp->s_mem+(objnr-1)*cachep->objsize; | 1400 | objp = slabp->s_mem + (objnr - 1) * cachep->objsize; |
1388 | realobj = (char*)objp+obj_dbghead(cachep); | 1401 | realobj = (char *)objp + obj_dbghead(cachep); |
1389 | printk(KERN_ERR "Prev obj: start=%p, len=%d\n", | 1402 | printk(KERN_ERR "Prev obj: start=%p, len=%d\n", |
1390 | realobj, size); | 1403 | realobj, size); |
1391 | print_objinfo(cachep, objp, 2); | 1404 | print_objinfo(cachep, objp, 2); |
1392 | } | 1405 | } |
1393 | if (objnr+1 < cachep->num) { | 1406 | if (objnr + 1 < cachep->num) { |
1394 | objp = slabp->s_mem+(objnr+1)*cachep->objsize; | 1407 | objp = slabp->s_mem + (objnr + 1) * cachep->objsize; |
1395 | realobj = (char*)objp+obj_dbghead(cachep); | 1408 | realobj = (char *)objp + obj_dbghead(cachep); |
1396 | printk(KERN_ERR "Next obj: start=%p, len=%d\n", | 1409 | printk(KERN_ERR "Next obj: start=%p, len=%d\n", |
1397 | realobj, size); | 1410 | realobj, size); |
1398 | print_objinfo(cachep, objp, 2); | 1411 | print_objinfo(cachep, objp, 2); |
1399 | } | 1412 | } |
1400 | } | 1413 | } |
@@ -1405,7 +1418,7 @@ static void check_poison_obj(kmem_cache_t *cachep, void *objp) | |||
1405 | * Before calling the slab must have been unlinked from the cache. | 1418 | * Before calling the slab must have been unlinked from the cache. |
1406 | * The cache-lock is not held/needed. | 1419 | * The cache-lock is not held/needed. |
1407 | */ | 1420 | */ |
1408 | static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) | 1421 | static void slab_destroy(kmem_cache_t *cachep, struct slab *slabp) |
1409 | { | 1422 | { |
1410 | void *addr = slabp->s_mem - slabp->colouroff; | 1423 | void *addr = slabp->s_mem - slabp->colouroff; |
1411 | 1424 | ||
@@ -1416,8 +1429,11 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) | |||
1416 | 1429 | ||
1417 | if (cachep->flags & SLAB_POISON) { | 1430 | if (cachep->flags & SLAB_POISON) { |
1418 | #ifdef CONFIG_DEBUG_PAGEALLOC | 1431 | #ifdef CONFIG_DEBUG_PAGEALLOC |
1419 | if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep)) | 1432 | if ((cachep->objsize % PAGE_SIZE) == 0 |
1420 | kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1); | 1433 | && OFF_SLAB(cachep)) |
1434 | kernel_map_pages(virt_to_page(objp), | ||
1435 | cachep->objsize / PAGE_SIZE, | ||
1436 | 1); | ||
1421 | else | 1437 | else |
1422 | check_poison_obj(cachep, objp); | 1438 | check_poison_obj(cachep, objp); |
1423 | #else | 1439 | #else |
@@ -1427,20 +1443,20 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) | |||
1427 | if (cachep->flags & SLAB_RED_ZONE) { | 1443 | if (cachep->flags & SLAB_RED_ZONE) { |
1428 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) | 1444 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) |
1429 | slab_error(cachep, "start of a freed object " | 1445 | slab_error(cachep, "start of a freed object " |
1430 | "was overwritten"); | 1446 | "was overwritten"); |
1431 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) | 1447 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) |
1432 | slab_error(cachep, "end of a freed object " | 1448 | slab_error(cachep, "end of a freed object " |
1433 | "was overwritten"); | 1449 | "was overwritten"); |
1434 | } | 1450 | } |
1435 | if (cachep->dtor && !(cachep->flags & SLAB_POISON)) | 1451 | if (cachep->dtor && !(cachep->flags & SLAB_POISON)) |
1436 | (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0); | 1452 | (cachep->dtor) (objp + obj_dbghead(cachep), cachep, 0); |
1437 | } | 1453 | } |
1438 | #else | 1454 | #else |
1439 | if (cachep->dtor) { | 1455 | if (cachep->dtor) { |
1440 | int i; | 1456 | int i; |
1441 | for (i = 0; i < cachep->num; i++) { | 1457 | for (i = 0; i < cachep->num; i++) { |
1442 | void* objp = slabp->s_mem+cachep->objsize*i; | 1458 | void *objp = slabp->s_mem + cachep->objsize * i; |
1443 | (cachep->dtor)(objp, cachep, 0); | 1459 | (cachep->dtor) (objp, cachep, 0); |
1444 | } | 1460 | } |
1445 | } | 1461 | } |
1446 | #endif | 1462 | #endif |
@@ -1448,7 +1464,7 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp) | |||
1448 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { | 1464 | if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { |
1449 | struct slab_rcu *slab_rcu; | 1465 | struct slab_rcu *slab_rcu; |
1450 | 1466 | ||
1451 | slab_rcu = (struct slab_rcu *) slabp; | 1467 | slab_rcu = (struct slab_rcu *)slabp; |
1452 | slab_rcu->cachep = cachep; | 1468 | slab_rcu->cachep = cachep; |
1453 | slab_rcu->addr = addr; | 1469 | slab_rcu->addr = addr; |
1454 | call_rcu(&slab_rcu->head, kmem_rcu_free); | 1470 | call_rcu(&slab_rcu->head, kmem_rcu_free); |
@@ -1466,11 +1482,58 @@ static inline void set_up_list3s(kmem_cache_t *cachep, int index) | |||
1466 | int node; | 1482 | int node; |
1467 | 1483 | ||
1468 | for_each_online_node(node) { | 1484 | for_each_online_node(node) { |
1469 | cachep->nodelists[node] = &initkmem_list3[index+node]; | 1485 | cachep->nodelists[node] = &initkmem_list3[index + node]; |
1470 | cachep->nodelists[node]->next_reap = jiffies + | 1486 | cachep->nodelists[node]->next_reap = jiffies + |
1471 | REAPTIMEOUT_LIST3 + | 1487 | REAPTIMEOUT_LIST3 + |
1472 | ((unsigned long)cachep)%REAPTIMEOUT_LIST3; | 1488 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; |
1489 | } | ||
1490 | } | ||
1491 | |||
1492 | /** | ||
1493 | * calculate_slab_order - calculate size (page order) of slabs and the number | ||
1494 | * of objects per slab. | ||
1495 | * | ||
1496 | * This could be made much more intelligent. For now, try to avoid using | ||
1497 | * high order pages for slabs. When the gfp() functions are more friendly | ||
1498 | * towards high-order requests, this should be changed. | ||
1499 | */ | ||
1500 | static inline size_t calculate_slab_order(kmem_cache_t *cachep, size_t size, | ||
1501 | size_t align, gfp_t flags) | ||
1502 | { | ||
1503 | size_t left_over = 0; | ||
1504 | |||
1505 | for (;; cachep->gfporder++) { | ||
1506 | unsigned int num; | ||
1507 | size_t remainder; | ||
1508 | |||
1509 | if (cachep->gfporder > MAX_GFP_ORDER) { | ||
1510 | cachep->num = 0; | ||
1511 | break; | ||
1512 | } | ||
1513 | |||
1514 | cache_estimate(cachep->gfporder, size, align, flags, | ||
1515 | &remainder, &num); | ||
1516 | if (!num) | ||
1517 | continue; | ||
1518 | /* More than offslab_limit objects will cause problems */ | ||
1519 | if (flags & CFLGS_OFF_SLAB && cachep->num > offslab_limit) | ||
1520 | break; | ||
1521 | |||
1522 | cachep->num = num; | ||
1523 | left_over = remainder; | ||
1524 | |||
1525 | /* | ||
1526 | * Large number of objects is good, but very large slabs are | ||
1527 | * currently bad for the gfp()s. | ||
1528 | */ | ||
1529 | if (cachep->gfporder >= slab_break_gfp_order) | ||
1530 | break; | ||
1531 | |||
1532 | if ((left_over * 8) <= (PAGE_SIZE << cachep->gfporder)) | ||
1533 | /* Acceptable internal fragmentation */ | ||
1534 | break; | ||
1473 | } | 1535 | } |
1536 | return left_over; | ||
1474 | } | 1537 | } |
1475 | 1538 | ||
1476 | /** | 1539 | /** |
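[Editor's note: calculate_slab_order(), introduced above, replaces the goto-based loop removed later in this patch: it raises gfporder until the slab holds at least one object and either internal fragmentation drops to 1/8 of the slab or a cap (MAX_GFP_ORDER, the off-slab bufctl limit, slab_break_gfp_order) is hit. A simplified userspace model of that search; the limits and object size are example values and per-object management overhead is ignored.]

#include <stdio.h>

#define PAGE_SIZE   4096UL
#define MAX_ORDER   5
#define BREAK_ORDER 1           /* prefer small orders, like slab_break_gfp_order */

static void estimate(unsigned int order, unsigned long size,
                     unsigned int *num, unsigned long *left)
{
        unsigned long bytes = PAGE_SIZE << order;

        *num = bytes / size;
        *left = bytes - *num * size;
}

int main(void)
{
        unsigned long size = 1100;      /* awkward object size to show the search */
        unsigned long left = 0;
        unsigned int num = 0, order;

        for (order = 0; order <= MAX_ORDER; order++) {
                estimate(order, size, &num, &left);
                if (!num)
                        continue;       /* object does not even fit yet */
                if (order >= BREAK_ORDER)
                        break;          /* very large slabs are bad for the page allocator */
                if (left * 8 <= (PAGE_SIZE << order))
                        break;          /* internal fragmentation acceptable */
        }
        printf("chose order %u: %u objects, %lu bytes wasted\n", order, num, left);
        return 0;
}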
@@ -1519,14 +1582,13 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1519 | * Sanity checks... these are all serious usage bugs. | 1582 | * Sanity checks... these are all serious usage bugs. |
1520 | */ | 1583 | */ |
1521 | if ((!name) || | 1584 | if ((!name) || |
1522 | in_interrupt() || | 1585 | in_interrupt() || |
1523 | (size < BYTES_PER_WORD) || | 1586 | (size < BYTES_PER_WORD) || |
1524 | (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) || | 1587 | (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { |
1525 | (dtor && !ctor)) { | 1588 | printk(KERN_ERR "%s: Early error in slab %s\n", |
1526 | printk(KERN_ERR "%s: Early error in slab %s\n", | 1589 | __FUNCTION__, name); |
1527 | __FUNCTION__, name); | 1590 | BUG(); |
1528 | BUG(); | 1591 | } |
1529 | } | ||
1530 | 1592 | ||
1531 | down(&cache_chain_sem); | 1593 | down(&cache_chain_sem); |
1532 | 1594 | ||
@@ -1546,11 +1608,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1546 | set_fs(old_fs); | 1608 | set_fs(old_fs); |
1547 | if (res) { | 1609 | if (res) { |
1548 | printk("SLAB: cache with size %d has lost its name\n", | 1610 | printk("SLAB: cache with size %d has lost its name\n", |
1549 | pc->objsize); | 1611 | pc->objsize); |
1550 | continue; | 1612 | continue; |
1551 | } | 1613 | } |
1552 | 1614 | ||
1553 | if (!strcmp(pc->name,name)) { | 1615 | if (!strcmp(pc->name, name)) { |
1554 | printk("kmem_cache_create: duplicate cache %s\n", name); | 1616 | printk("kmem_cache_create: duplicate cache %s\n", name); |
1555 | dump_stack(); | 1617 | dump_stack(); |
1556 | goto oops; | 1618 | goto oops; |
@@ -1562,10 +1624,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1562 | if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { | 1624 | if ((flags & SLAB_DEBUG_INITIAL) && !ctor) { |
1563 | /* No constructor, but inital state check requested */ | 1625 | /* No constructor, but inital state check requested */ |
1564 | printk(KERN_ERR "%s: No con, but init state check " | 1626 | printk(KERN_ERR "%s: No con, but init state check " |
1565 | "requested - %s\n", __FUNCTION__, name); | 1627 | "requested - %s\n", __FUNCTION__, name); |
1566 | flags &= ~SLAB_DEBUG_INITIAL; | 1628 | flags &= ~SLAB_DEBUG_INITIAL; |
1567 | } | 1629 | } |
1568 | |||
1569 | #if FORCED_DEBUG | 1630 | #if FORCED_DEBUG |
1570 | /* | 1631 | /* |
1571 | * Enable redzoning and last user accounting, except for caches with | 1632 | * Enable redzoning and last user accounting, except for caches with |
@@ -1573,8 +1634,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1573 | * above the next power of two: caches with object sizes just above a | 1634 | * above the next power of two: caches with object sizes just above a |
1574 | * power of two have a significant amount of internal fragmentation. | 1635 | * power of two have a significant amount of internal fragmentation. |
1575 | */ | 1636 | */ |
1576 | if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))) | 1637 | if ((size < 4096 |
1577 | flags |= SLAB_RED_ZONE|SLAB_STORE_USER; | 1638 | || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD))) |
1639 | flags |= SLAB_RED_ZONE | SLAB_STORE_USER; | ||
1578 | if (!(flags & SLAB_DESTROY_BY_RCU)) | 1640 | if (!(flags & SLAB_DESTROY_BY_RCU)) |
1579 | flags |= SLAB_POISON; | 1641 | flags |= SLAB_POISON; |
1580 | #endif | 1642 | #endif |
@@ -1595,9 +1657,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1595 | * unaligned accesses for some archs when redzoning is used, and makes | 1657 | * unaligned accesses for some archs when redzoning is used, and makes |
1596 | * sure any on-slab bufctl's are also correctly aligned. | 1658 | * sure any on-slab bufctl's are also correctly aligned. |
1597 | */ | 1659 | */ |
1598 | if (size & (BYTES_PER_WORD-1)) { | 1660 | if (size & (BYTES_PER_WORD - 1)) { |
1599 | size += (BYTES_PER_WORD-1); | 1661 | size += (BYTES_PER_WORD - 1); |
1600 | size &= ~(BYTES_PER_WORD-1); | 1662 | size &= ~(BYTES_PER_WORD - 1); |
1601 | } | 1663 | } |
1602 | 1664 | ||
1603 | /* calculate out the final buffer alignment: */ | 1665 | /* calculate out the final buffer alignment: */ |
@@ -1608,7 +1670,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1608 | * objects into one cacheline. | 1670 | * objects into one cacheline. |
1609 | */ | 1671 | */ |
1610 | ralign = cache_line_size(); | 1672 | ralign = cache_line_size(); |
1611 | while (size <= ralign/2) | 1673 | while (size <= ralign / 2) |
1612 | ralign /= 2; | 1674 | ralign /= 2; |
1613 | } else { | 1675 | } else { |
1614 | ralign = BYTES_PER_WORD; | 1676 | ralign = BYTES_PER_WORD; |
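[Editor's note: the loop touched above implements the SLAB_HWCACHE_ALIGN default — start from the cache-line size and halve the alignment while at least two objects would still fit in one line, so small objects do not each consume a full line. A tiny sketch, assuming a 64-byte line.]

#include <stdio.h>

int main(void)
{
        unsigned long cache_line = 64;  /* assumed L1 line size */

        for (unsigned long size = 8; size <= 128; size *= 2) {
                unsigned long ralign = cache_line;

                /* Halve while two or more objects fit in the current alignment. */
                while (size <= ralign / 2)
                        ralign /= 2;
                printf("objsize %3lu -> alignment %lu\n", size, ralign);
        }
        return 0;
}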
@@ -1617,13 +1679,13 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1617 | if (ralign < ARCH_SLAB_MINALIGN) { | 1679 | if (ralign < ARCH_SLAB_MINALIGN) { |
1618 | ralign = ARCH_SLAB_MINALIGN; | 1680 | ralign = ARCH_SLAB_MINALIGN; |
1619 | if (ralign > BYTES_PER_WORD) | 1681 | if (ralign > BYTES_PER_WORD) |
1620 | flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); | 1682 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); |
1621 | } | 1683 | } |
1622 | /* 3) caller mandated alignment: disables debug if necessary */ | 1684 | /* 3) caller mandated alignment: disables debug if necessary */ |
1623 | if (ralign < align) { | 1685 | if (ralign < align) { |
1624 | ralign = align; | 1686 | ralign = align; |
1625 | if (ralign > BYTES_PER_WORD) | 1687 | if (ralign > BYTES_PER_WORD) |
1626 | flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); | 1688 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); |
1627 | } | 1689 | } |
1628 | /* 4) Store it. Note that the debug code below can reduce | 1690 | /* 4) Store it. Note that the debug code below can reduce |
1629 | * the alignment to BYTES_PER_WORD. | 1691 | * the alignment to BYTES_PER_WORD. |
@@ -1645,7 +1707,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1645 | 1707 | ||
1646 | /* add space for red zone words */ | 1708 | /* add space for red zone words */ |
1647 | cachep->dbghead += BYTES_PER_WORD; | 1709 | cachep->dbghead += BYTES_PER_WORD; |
1648 | size += 2*BYTES_PER_WORD; | 1710 | size += 2 * BYTES_PER_WORD; |
1649 | } | 1711 | } |
1650 | if (flags & SLAB_STORE_USER) { | 1712 | if (flags & SLAB_STORE_USER) { |
1651 | /* user store requires word alignment and | 1713 | /* user store requires word alignment and |
@@ -1656,7 +1718,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1656 | size += BYTES_PER_WORD; | 1718 | size += BYTES_PER_WORD; |
1657 | } | 1719 | } |
1658 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) | 1720 | #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) |
1659 | if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { | 1721 | if (size >= malloc_sizes[INDEX_L3 + 1].cs_size |
1722 | && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { | ||
1660 | cachep->dbghead += PAGE_SIZE - size; | 1723 | cachep->dbghead += PAGE_SIZE - size; |
1661 | size = PAGE_SIZE; | 1724 | size = PAGE_SIZE; |
1662 | } | 1725 | } |
@@ -1664,7 +1727,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1664 | #endif | 1727 | #endif |
1665 | 1728 | ||
1666 | /* Determine if the slab management is 'on' or 'off' slab. */ | 1729 | /* Determine if the slab management is 'on' or 'off' slab. */ |
1667 | if (size >= (PAGE_SIZE>>3)) | 1730 | if (size >= (PAGE_SIZE >> 3)) |
1668 | /* | 1731 | /* |
1669 | * Size is large, assume best to place the slab management obj | 1732 | * Size is large, assume best to place the slab management obj |
1670 | * off-slab (should allow better packing of objs). | 1733 | * off-slab (should allow better packing of objs). |
@@ -1681,47 +1744,9 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
1681 | */ | 1744 | */ |
1682 | cachep->gfporder = 0; | 1745 | cachep->gfporder = 0; |
1683 | cache_estimate(cachep->gfporder, size, align, flags, | 1746 | cache_estimate(cachep->gfporder, size, align, flags, |
1684 | &left_over, &cachep->num); | 1747 | &left_over, &cachep->num); |
1685 | } else { | 1748 | } else |
1686 | /* | 1749 | left_over = calculate_slab_order(cachep, size, align, flags); |
1687 | * Calculate size (in pages) of slabs, and the num of objs per | ||
1688 | * slab. This could be made much more intelligent. For now, | ||
1689 | * try to avoid using high page-orders for slabs. When the | ||
1690 | * gfp() funcs are more friendly towards high-order requests, | ||
1691 | * this should be changed. | ||
1692 | */ | ||
1693 | do { | ||
1694 | unsigned int break_flag = 0; | ||
1695 | cal_wastage: | ||
1696 | cache_estimate(cachep->gfporder, size, align, flags, | ||
1697 | &left_over, &cachep->num); | ||
1698 | if (break_flag) | ||
1699 | break; | ||
1700 | if (cachep->gfporder >= MAX_GFP_ORDER) | ||
1701 | break; | ||
1702 | if (!cachep->num) | ||
1703 | goto next; | ||
1704 | if (flags & CFLGS_OFF_SLAB && | ||
1705 | cachep->num > offslab_limit) { | ||
1706 | /* This num of objs will cause problems. */ | ||
1707 | cachep->gfporder--; | ||
1708 | break_flag++; | ||
1709 | goto cal_wastage; | ||
1710 | } | ||
1711 | |||
1712 | /* | ||
1713 | * Large num of objs is good, but v. large slabs are | ||
1714 | * currently bad for the gfp()s. | ||
1715 | */ | ||
1716 | if (cachep->gfporder >= slab_break_gfp_order) | ||
1717 | break; | ||
1718 | |||
1719 | if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder)) | ||
1720 | break; /* Acceptable internal fragmentation. */ | ||
1721 | next: | ||
1722 | cachep->gfporder++; | ||
1723 | } while (1); | ||
1724 | } | ||
1725 | 1750 | ||
1726 | if (!cachep->num) { | 1751 | if (!cachep->num) { |
1727 | printk("kmem_cache_create: couldn't create cache %s.\n", name); | 1752 | printk("kmem_cache_create: couldn't create cache %s.\n", name); |
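The hunk above drops slab.c's open-coded search for a suitable slab page order in favour of a single call to calculate_slab_order(), a helper this patch adds earlier in the file (outside this excerpt). As a rough, stand-alone sketch of the heuristic the removed loop implemented: cache_estimate() is reduced to a naive stand-in here, MAX_GFP_ORDER and BREAK_GFP_ORDER are illustrative constants rather than the kernel's values, and the off-slab bufctl limit is omitted entirely.

        #include <stdio.h>
        #include <stddef.h>

        #define PAGE_SIZE       4096UL
        #define MAX_GFP_ORDER   5       /* illustrative cap, not the kernel constant */
        #define BREAK_GFP_ORDER 1       /* stand-in for slab_break_gfp_order */

        /* Simplified stand-in for cache_estimate(): pack as many objects as fit
         * into 2^order pages and report the leftover bytes. */
        static void estimate(unsigned int order, size_t size,
                             size_t *left_over, unsigned int *num)
        {
                size_t slab_bytes = PAGE_SIZE << order;

                *num = slab_bytes / size;
                *left_over = slab_bytes - *num * size;
        }

        /* Mimics the removed loop: raise the order until objects fit and the
         * internal fragmentation drops below 1/8th of the slab. */
        static unsigned int pick_order(size_t size, size_t *left_over,
                                       unsigned int *num)
        {
                unsigned int order;

                for (order = 0; order < MAX_GFP_ORDER; order++) {
                        estimate(order, size, left_over, num);
                        if (!*num)
                                continue;       /* object does not fit yet */
                        if (order >= BREAK_GFP_ORDER)
                                break;          /* avoid high-order pages */
                        if (*left_over * 8 <= (PAGE_SIZE << order))
                                break;          /* fragmentation acceptable */
                }
                return order;
        }

        int main(void)
        {
                size_t sizes[] = { 96, 700, 3000 };

                for (size_t i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
                        size_t left;
                        unsigned int num;
                        unsigned int order = pick_order(sizes[i], &left, &num);

                        printf("size %4zu -> order %u, %u objs, %zu bytes wasted\n",
                               sizes[i], order, num, left);
                }
                return 0;
        }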
@@ -1729,8 +1754,8 @@ next: | |||
1729 | cachep = NULL; | 1754 | cachep = NULL; |
1730 | goto oops; | 1755 | goto oops; |
1731 | } | 1756 | } |
1732 | slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t) | 1757 | slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) |
1733 | + sizeof(struct slab), align); | 1758 | + sizeof(struct slab), align); |
1734 | 1759 | ||
1735 | /* | 1760 | /* |
1736 | * If the slab has been placed off-slab, and we have enough space then | 1761 | * If the slab has been placed off-slab, and we have enough space then |
@@ -1743,14 +1768,15 @@ next: | |||
1743 | 1768 | ||
1744 | if (flags & CFLGS_OFF_SLAB) { | 1769 | if (flags & CFLGS_OFF_SLAB) { |
1745 | /* really off slab. No need for manual alignment */ | 1770 | /* really off slab. No need for manual alignment */ |
1746 | slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab); | 1771 | slab_size = |
1772 | cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); | ||
1747 | } | 1773 | } |
1748 | 1774 | ||
1749 | cachep->colour_off = cache_line_size(); | 1775 | cachep->colour_off = cache_line_size(); |
1750 | /* Offset must be a multiple of the alignment. */ | 1776 | /* Offset must be a multiple of the alignment. */ |
1751 | if (cachep->colour_off < align) | 1777 | if (cachep->colour_off < align) |
1752 | cachep->colour_off = align; | 1778 | cachep->colour_off = align; |
1753 | cachep->colour = left_over/cachep->colour_off; | 1779 | cachep->colour = left_over / cachep->colour_off; |
1754 | cachep->slab_size = slab_size; | 1780 | cachep->slab_size = slab_size; |
1755 | cachep->flags = flags; | 1781 | cachep->flags = flags; |
1756 | cachep->gfpflags = 0; | 1782 | cachep->gfpflags = 0; |
@@ -1777,7 +1803,7 @@ next: | |||
1777 | * the creation of further caches will BUG(). | 1803 | * the creation of further caches will BUG(). |
1778 | */ | 1804 | */ |
1779 | cachep->array[smp_processor_id()] = | 1805 | cachep->array[smp_processor_id()] = |
1780 | &initarray_generic.cache; | 1806 | &initarray_generic.cache; |
1781 | 1807 | ||
1782 | /* If the cache that's used by | 1808 | /* If the cache that's used by |
1783 | * kmalloc(sizeof(kmem_list3)) is the first cache, | 1809 | * kmalloc(sizeof(kmem_list3)) is the first cache, |
@@ -1791,8 +1817,7 @@ next: | |||
1791 | g_cpucache_up = PARTIAL_AC; | 1817 | g_cpucache_up = PARTIAL_AC; |
1792 | } else { | 1818 | } else { |
1793 | cachep->array[smp_processor_id()] = | 1819 | cachep->array[smp_processor_id()] = |
1794 | kmalloc(sizeof(struct arraycache_init), | 1820 | kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); |
1795 | GFP_KERNEL); | ||
1796 | 1821 | ||
1797 | if (g_cpucache_up == PARTIAL_AC) { | 1822 | if (g_cpucache_up == PARTIAL_AC) { |
1798 | set_up_list3s(cachep, SIZE_L3); | 1823 | set_up_list3s(cachep, SIZE_L3); |
@@ -1802,16 +1827,18 @@ next: | |||
1802 | for_each_online_node(node) { | 1827 | for_each_online_node(node) { |
1803 | 1828 | ||
1804 | cachep->nodelists[node] = | 1829 | cachep->nodelists[node] = |
1805 | kmalloc_node(sizeof(struct kmem_list3), | 1830 | kmalloc_node(sizeof |
1806 | GFP_KERNEL, node); | 1831 | (struct kmem_list3), |
1832 | GFP_KERNEL, node); | ||
1807 | BUG_ON(!cachep->nodelists[node]); | 1833 | BUG_ON(!cachep->nodelists[node]); |
1808 | kmem_list3_init(cachep->nodelists[node]); | 1834 | kmem_list3_init(cachep-> |
1835 | nodelists[node]); | ||
1809 | } | 1836 | } |
1810 | } | 1837 | } |
1811 | } | 1838 | } |
1812 | cachep->nodelists[numa_node_id()]->next_reap = | 1839 | cachep->nodelists[numa_node_id()]->next_reap = |
1813 | jiffies + REAPTIMEOUT_LIST3 + | 1840 | jiffies + REAPTIMEOUT_LIST3 + |
1814 | ((unsigned long)cachep)%REAPTIMEOUT_LIST3; | 1841 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; |
1815 | 1842 | ||
1816 | BUG_ON(!ac_data(cachep)); | 1843 | BUG_ON(!ac_data(cachep)); |
1817 | ac_data(cachep)->avail = 0; | 1844 | ac_data(cachep)->avail = 0; |
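One detail in the hunk above that is easy to read past: next_reap is set to jiffies plus REAPTIMEOUT_LIST3 plus the cache pointer taken modulo REAPTIMEOUT_LIST3. The per-cache offset appears to be there so that caches set up back to back do not all fall due for reaping in the same tick; each cache's deadline lands somewhere inside the reap window rather than at its start.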
@@ -1820,15 +1847,15 @@ next: | |||
1820 | ac_data(cachep)->touched = 0; | 1847 | ac_data(cachep)->touched = 0; |
1821 | cachep->batchcount = 1; | 1848 | cachep->batchcount = 1; |
1822 | cachep->limit = BOOT_CPUCACHE_ENTRIES; | 1849 | cachep->limit = BOOT_CPUCACHE_ENTRIES; |
1823 | } | 1850 | } |
1824 | 1851 | ||
1825 | /* cache setup completed, link it into the list */ | 1852 | /* cache setup completed, link it into the list */ |
1826 | list_add(&cachep->next, &cache_chain); | 1853 | list_add(&cachep->next, &cache_chain); |
1827 | unlock_cpu_hotplug(); | 1854 | unlock_cpu_hotplug(); |
1828 | oops: | 1855 | oops: |
1829 | if (!cachep && (flags & SLAB_PANIC)) | 1856 | if (!cachep && (flags & SLAB_PANIC)) |
1830 | panic("kmem_cache_create(): failed to create slab `%s'\n", | 1857 | panic("kmem_cache_create(): failed to create slab `%s'\n", |
1831 | name); | 1858 | name); |
1832 | up(&cache_chain_sem); | 1859 | up(&cache_chain_sem); |
1833 | return cachep; | 1860 | return cachep; |
1834 | } | 1861 | } |
@@ -1871,7 +1898,7 @@ static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node) | |||
1871 | /* | 1898 | /* |
1872 | * Waits for all CPUs to execute func(). | 1899 | * Waits for all CPUs to execute func(). |
1873 | */ | 1900 | */ |
1874 | static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) | 1901 | static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg) |
1875 | { | 1902 | { |
1876 | check_irq_on(); | 1903 | check_irq_on(); |
1877 | preempt_disable(); | 1904 | preempt_disable(); |
@@ -1886,12 +1913,12 @@ static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) | |||
1886 | preempt_enable(); | 1913 | preempt_enable(); |
1887 | } | 1914 | } |
1888 | 1915 | ||
1889 | static void drain_array_locked(kmem_cache_t* cachep, | 1916 | static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, |
1890 | struct array_cache *ac, int force, int node); | 1917 | int force, int node); |
1891 | 1918 | ||
1892 | static void do_drain(void *arg) | 1919 | static void do_drain(void *arg) |
1893 | { | 1920 | { |
1894 | kmem_cache_t *cachep = (kmem_cache_t*)arg; | 1921 | kmem_cache_t *cachep = (kmem_cache_t *) arg; |
1895 | struct array_cache *ac; | 1922 | struct array_cache *ac; |
1896 | int node = numa_node_id(); | 1923 | int node = numa_node_id(); |
1897 | 1924 | ||
@@ -1911,7 +1938,7 @@ static void drain_cpu_caches(kmem_cache_t *cachep) | |||
1911 | smp_call_function_all_cpus(do_drain, cachep); | 1938 | smp_call_function_all_cpus(do_drain, cachep); |
1912 | check_irq_on(); | 1939 | check_irq_on(); |
1913 | spin_lock_irq(&cachep->spinlock); | 1940 | spin_lock_irq(&cachep->spinlock); |
1914 | for_each_online_node(node) { | 1941 | for_each_online_node(node) { |
1915 | l3 = cachep->nodelists[node]; | 1942 | l3 = cachep->nodelists[node]; |
1916 | if (l3) { | 1943 | if (l3) { |
1917 | spin_lock(&l3->list_lock); | 1944 | spin_lock(&l3->list_lock); |
@@ -1949,8 +1976,7 @@ static int __node_shrink(kmem_cache_t *cachep, int node) | |||
1949 | slab_destroy(cachep, slabp); | 1976 | slab_destroy(cachep, slabp); |
1950 | spin_lock_irq(&l3->list_lock); | 1977 | spin_lock_irq(&l3->list_lock); |
1951 | } | 1978 | } |
1952 | ret = !list_empty(&l3->slabs_full) || | 1979 | ret = !list_empty(&l3->slabs_full) || !list_empty(&l3->slabs_partial); |
1953 | !list_empty(&l3->slabs_partial); | ||
1954 | return ret; | 1980 | return ret; |
1955 | } | 1981 | } |
1956 | 1982 | ||
@@ -2006,7 +2032,7 @@ EXPORT_SYMBOL(kmem_cache_shrink); | |||
2006 | * The caller must guarantee that noone will allocate memory from the cache | 2032 | * The caller must guarantee that noone will allocate memory from the cache |
2007 | * during the kmem_cache_destroy(). | 2033 | * during the kmem_cache_destroy(). |
2008 | */ | 2034 | */ |
2009 | int kmem_cache_destroy(kmem_cache_t * cachep) | 2035 | int kmem_cache_destroy(kmem_cache_t *cachep) |
2010 | { | 2036 | { |
2011 | int i; | 2037 | int i; |
2012 | struct kmem_list3 *l3; | 2038 | struct kmem_list3 *l3; |
@@ -2028,7 +2054,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep) | |||
2028 | if (__cache_shrink(cachep)) { | 2054 | if (__cache_shrink(cachep)) { |
2029 | slab_error(cachep, "Can't free all objects"); | 2055 | slab_error(cachep, "Can't free all objects"); |
2030 | down(&cache_chain_sem); | 2056 | down(&cache_chain_sem); |
2031 | list_add(&cachep->next,&cache_chain); | 2057 | list_add(&cachep->next, &cache_chain); |
2032 | up(&cache_chain_sem); | 2058 | up(&cache_chain_sem); |
2033 | unlock_cpu_hotplug(); | 2059 | unlock_cpu_hotplug(); |
2034 | return 1; | 2060 | return 1; |
@@ -2038,7 +2064,7 @@ int kmem_cache_destroy(kmem_cache_t * cachep) | |||
2038 | synchronize_rcu(); | 2064 | synchronize_rcu(); |
2039 | 2065 | ||
2040 | for_each_online_cpu(i) | 2066 | for_each_online_cpu(i) |
2041 | kfree(cachep->array[i]); | 2067 | kfree(cachep->array[i]); |
2042 | 2068 | ||
2043 | /* NUMA: free the list3 structures */ | 2069 | /* NUMA: free the list3 structures */ |
2044 | for_each_online_node(i) { | 2070 | for_each_online_node(i) { |
@@ -2057,39 +2083,39 @@ int kmem_cache_destroy(kmem_cache_t * cachep) | |||
2057 | EXPORT_SYMBOL(kmem_cache_destroy); | 2083 | EXPORT_SYMBOL(kmem_cache_destroy); |
2058 | 2084 | ||
2059 | /* Get the memory for a slab management obj. */ | 2085 | /* Get the memory for a slab management obj. */ |
2060 | static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp, | 2086 | static struct slab *alloc_slabmgmt(kmem_cache_t *cachep, void *objp, |
2061 | int colour_off, gfp_t local_flags) | 2087 | int colour_off, gfp_t local_flags) |
2062 | { | 2088 | { |
2063 | struct slab *slabp; | 2089 | struct slab *slabp; |
2064 | 2090 | ||
2065 | if (OFF_SLAB(cachep)) { | 2091 | if (OFF_SLAB(cachep)) { |
2066 | /* Slab management obj is off-slab. */ | 2092 | /* Slab management obj is off-slab. */ |
2067 | slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); | 2093 | slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags); |
2068 | if (!slabp) | 2094 | if (!slabp) |
2069 | return NULL; | 2095 | return NULL; |
2070 | } else { | 2096 | } else { |
2071 | slabp = objp+colour_off; | 2097 | slabp = objp + colour_off; |
2072 | colour_off += cachep->slab_size; | 2098 | colour_off += cachep->slab_size; |
2073 | } | 2099 | } |
2074 | slabp->inuse = 0; | 2100 | slabp->inuse = 0; |
2075 | slabp->colouroff = colour_off; | 2101 | slabp->colouroff = colour_off; |
2076 | slabp->s_mem = objp+colour_off; | 2102 | slabp->s_mem = objp + colour_off; |
2077 | 2103 | ||
2078 | return slabp; | 2104 | return slabp; |
2079 | } | 2105 | } |
2080 | 2106 | ||
2081 | static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) | 2107 | static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) |
2082 | { | 2108 | { |
2083 | return (kmem_bufctl_t *)(slabp+1); | 2109 | return (kmem_bufctl_t *) (slabp + 1); |
2084 | } | 2110 | } |
2085 | 2111 | ||
2086 | static void cache_init_objs(kmem_cache_t *cachep, | 2112 | static void cache_init_objs(kmem_cache_t *cachep, |
2087 | struct slab *slabp, unsigned long ctor_flags) | 2113 | struct slab *slabp, unsigned long ctor_flags) |
2088 | { | 2114 | { |
2089 | int i; | 2115 | int i; |
2090 | 2116 | ||
2091 | for (i = 0; i < cachep->num; i++) { | 2117 | for (i = 0; i < cachep->num; i++) { |
2092 | void *objp = slabp->s_mem+cachep->objsize*i; | 2118 | void *objp = slabp->s_mem + cachep->objsize * i; |
2093 | #if DEBUG | 2119 | #if DEBUG |
2094 | /* need to poison the objs? */ | 2120 | /* need to poison the objs? */ |
2095 | if (cachep->flags & SLAB_POISON) | 2121 | if (cachep->flags & SLAB_POISON) |
@@ -2107,25 +2133,28 @@ static void cache_init_objs(kmem_cache_t *cachep, | |||
2107 | * Otherwise, deadlock. They must also be threaded. | 2133 | * Otherwise, deadlock. They must also be threaded. |
2108 | */ | 2134 | */ |
2109 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) | 2135 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) |
2110 | cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags); | 2136 | cachep->ctor(objp + obj_dbghead(cachep), cachep, |
2137 | ctor_flags); | ||
2111 | 2138 | ||
2112 | if (cachep->flags & SLAB_RED_ZONE) { | 2139 | if (cachep->flags & SLAB_RED_ZONE) { |
2113 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) | 2140 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) |
2114 | slab_error(cachep, "constructor overwrote the" | 2141 | slab_error(cachep, "constructor overwrote the" |
2115 | " end of an object"); | 2142 | " end of an object"); |
2116 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) | 2143 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE) |
2117 | slab_error(cachep, "constructor overwrote the" | 2144 | slab_error(cachep, "constructor overwrote the" |
2118 | " start of an object"); | 2145 | " start of an object"); |
2119 | } | 2146 | } |
2120 | if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON) | 2147 | if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) |
2121 | kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); | 2148 | && cachep->flags & SLAB_POISON) |
2149 | kernel_map_pages(virt_to_page(objp), | ||
2150 | cachep->objsize / PAGE_SIZE, 0); | ||
2122 | #else | 2151 | #else |
2123 | if (cachep->ctor) | 2152 | if (cachep->ctor) |
2124 | cachep->ctor(objp, cachep, ctor_flags); | 2153 | cachep->ctor(objp, cachep, ctor_flags); |
2125 | #endif | 2154 | #endif |
2126 | slab_bufctl(slabp)[i] = i+1; | 2155 | slab_bufctl(slabp)[i] = i + 1; |
2127 | } | 2156 | } |
2128 | slab_bufctl(slabp)[i-1] = BUFCTL_END; | 2157 | slab_bufctl(slabp)[i - 1] = BUFCTL_END; |
2129 | slabp->free = 0; | 2158 | slabp->free = 0; |
2130 | } | 2159 | } |
2131 | 2160 | ||
@@ -2161,17 +2190,17 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp) | |||
2161 | */ | 2190 | */ |
2162 | static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) | 2191 | static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) |
2163 | { | 2192 | { |
2164 | struct slab *slabp; | 2193 | struct slab *slabp; |
2165 | void *objp; | 2194 | void *objp; |
2166 | size_t offset; | 2195 | size_t offset; |
2167 | gfp_t local_flags; | 2196 | gfp_t local_flags; |
2168 | unsigned long ctor_flags; | 2197 | unsigned long ctor_flags; |
2169 | struct kmem_list3 *l3; | 2198 | struct kmem_list3 *l3; |
2170 | 2199 | ||
2171 | /* Be lazy and only check for valid flags here, | 2200 | /* Be lazy and only check for valid flags here, |
2172 | * keeping it out of the critical path in kmem_cache_alloc(). | 2201 | * keeping it out of the critical path in kmem_cache_alloc(). |
2173 | */ | 2202 | */ |
2174 | if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW)) | 2203 | if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW)) |
2175 | BUG(); | 2204 | BUG(); |
2176 | if (flags & SLAB_NO_GROW) | 2205 | if (flags & SLAB_NO_GROW) |
2177 | return 0; | 2206 | return 0; |
@@ -2237,9 +2266,9 @@ static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid) | |||
2237 | l3->free_objects += cachep->num; | 2266 | l3->free_objects += cachep->num; |
2238 | spin_unlock(&l3->list_lock); | 2267 | spin_unlock(&l3->list_lock); |
2239 | return 1; | 2268 | return 1; |
2240 | opps1: | 2269 | opps1: |
2241 | kmem_freepages(cachep, objp); | 2270 | kmem_freepages(cachep, objp); |
2242 | failed: | 2271 | failed: |
2243 | if (local_flags & __GFP_WAIT) | 2272 | if (local_flags & __GFP_WAIT) |
2244 | local_irq_disable(); | 2273 | local_irq_disable(); |
2245 | return 0; | 2274 | return 0; |
@@ -2259,18 +2288,19 @@ static void kfree_debugcheck(const void *objp) | |||
2259 | 2288 | ||
2260 | if (!virt_addr_valid(objp)) { | 2289 | if (!virt_addr_valid(objp)) { |
2261 | printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", | 2290 | printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n", |
2262 | (unsigned long)objp); | 2291 | (unsigned long)objp); |
2263 | BUG(); | 2292 | BUG(); |
2264 | } | 2293 | } |
2265 | page = virt_to_page(objp); | 2294 | page = virt_to_page(objp); |
2266 | if (!PageSlab(page)) { | 2295 | if (!PageSlab(page)) { |
2267 | printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp); | 2296 | printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", |
2297 | (unsigned long)objp); | ||
2268 | BUG(); | 2298 | BUG(); |
2269 | } | 2299 | } |
2270 | } | 2300 | } |
2271 | 2301 | ||
2272 | static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, | 2302 | static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, |
2273 | void *caller) | 2303 | void *caller) |
2274 | { | 2304 | { |
2275 | struct page *page; | 2305 | struct page *page; |
2276 | unsigned int objnr; | 2306 | unsigned int objnr; |
@@ -2281,20 +2311,26 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, | |||
2281 | page = virt_to_page(objp); | 2311 | page = virt_to_page(objp); |
2282 | 2312 | ||
2283 | if (page_get_cache(page) != cachep) { | 2313 | if (page_get_cache(page) != cachep) { |
2284 | printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n", | 2314 | printk(KERN_ERR |
2285 | page_get_cache(page),cachep); | 2315 | "mismatch in kmem_cache_free: expected cache %p, got %p\n", |
2316 | page_get_cache(page), cachep); | ||
2286 | printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); | 2317 | printk(KERN_ERR "%p is %s.\n", cachep, cachep->name); |
2287 | printk(KERN_ERR "%p is %s.\n", page_get_cache(page), page_get_cache(page)->name); | 2318 | printk(KERN_ERR "%p is %s.\n", page_get_cache(page), |
2319 | page_get_cache(page)->name); | ||
2288 | WARN_ON(1); | 2320 | WARN_ON(1); |
2289 | } | 2321 | } |
2290 | slabp = page_get_slab(page); | 2322 | slabp = page_get_slab(page); |
2291 | 2323 | ||
2292 | if (cachep->flags & SLAB_RED_ZONE) { | 2324 | if (cachep->flags & SLAB_RED_ZONE) { |
2293 | if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { | 2325 | if (*dbg_redzone1(cachep, objp) != RED_ACTIVE |
2294 | slab_error(cachep, "double free, or memory outside" | 2326 | || *dbg_redzone2(cachep, objp) != RED_ACTIVE) { |
2295 | " object was overwritten"); | 2327 | slab_error(cachep, |
2296 | printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", | 2328 | "double free, or memory outside" |
2297 | objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); | 2329 | " object was overwritten"); |
2330 | printk(KERN_ERR | ||
2331 | "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", | ||
2332 | objp, *dbg_redzone1(cachep, objp), | ||
2333 | *dbg_redzone2(cachep, objp)); | ||
2298 | } | 2334 | } |
2299 | *dbg_redzone1(cachep, objp) = RED_INACTIVE; | 2335 | *dbg_redzone1(cachep, objp) = RED_INACTIVE; |
2300 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; | 2336 | *dbg_redzone2(cachep, objp) = RED_INACTIVE; |
@@ -2302,30 +2338,31 @@ static void *cache_free_debugcheck(kmem_cache_t *cachep, void *objp, | |||
2302 | if (cachep->flags & SLAB_STORE_USER) | 2338 | if (cachep->flags & SLAB_STORE_USER) |
2303 | *dbg_userword(cachep, objp) = caller; | 2339 | *dbg_userword(cachep, objp) = caller; |
2304 | 2340 | ||
2305 | objnr = (objp-slabp->s_mem)/cachep->objsize; | 2341 | objnr = (objp - slabp->s_mem) / cachep->objsize; |
2306 | 2342 | ||
2307 | BUG_ON(objnr >= cachep->num); | 2343 | BUG_ON(objnr >= cachep->num); |
2308 | BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize); | 2344 | BUG_ON(objp != slabp->s_mem + objnr * cachep->objsize); |
2309 | 2345 | ||
2310 | if (cachep->flags & SLAB_DEBUG_INITIAL) { | 2346 | if (cachep->flags & SLAB_DEBUG_INITIAL) { |
2311 | /* Need to call the slab's constructor so the | 2347 | /* Need to call the slab's constructor so the |
2312 | * caller can perform a verify of its state (debugging). | 2348 | * caller can perform a verify of its state (debugging). |
2313 | * Called without the cache-lock held. | 2349 | * Called without the cache-lock held. |
2314 | */ | 2350 | */ |
2315 | cachep->ctor(objp+obj_dbghead(cachep), | 2351 | cachep->ctor(objp + obj_dbghead(cachep), |
2316 | cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY); | 2352 | cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY); |
2317 | } | 2353 | } |
2318 | if (cachep->flags & SLAB_POISON && cachep->dtor) { | 2354 | if (cachep->flags & SLAB_POISON && cachep->dtor) { |
2319 | /* we want to cache poison the object, | 2355 | /* we want to cache poison the object, |
2320 | * call the destruction callback | 2356 | * call the destruction callback |
2321 | */ | 2357 | */ |
2322 | cachep->dtor(objp+obj_dbghead(cachep), cachep, 0); | 2358 | cachep->dtor(objp + obj_dbghead(cachep), cachep, 0); |
2323 | } | 2359 | } |
2324 | if (cachep->flags & SLAB_POISON) { | 2360 | if (cachep->flags & SLAB_POISON) { |
2325 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2361 | #ifdef CONFIG_DEBUG_PAGEALLOC |
2326 | if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { | 2362 | if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) { |
2327 | store_stackinfo(cachep, objp, (unsigned long)caller); | 2363 | store_stackinfo(cachep, objp, (unsigned long)caller); |
2328 | kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0); | 2364 | kernel_map_pages(virt_to_page(objp), |
2365 | cachep->objsize / PAGE_SIZE, 0); | ||
2329 | } else { | 2366 | } else { |
2330 | poison_obj(cachep, objp, POISON_FREE); | 2367 | poison_obj(cachep, objp, POISON_FREE); |
2331 | } | 2368 | } |
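The red-zone checks rewrapped in the hunks above follow a simple pattern: a guard word sits on each side of the object, is marked active on allocation and inactive on free, and any other value seen at free time means a double free or an overrun. A minimal user-space sketch of that idea; the RED_* values and helper names below are invented for illustration and are not slab.c's.

        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>

        #define RED_ACTIVE   0xA55AA55AUL    /* arbitrary guard patterns */
        #define RED_INACTIVE 0x5AA55AA5UL

        /* Layout: [redzone1][payload][redzone2] */
        static void *guarded_alloc(size_t size)
        {
                unsigned char *raw = malloc(size + 2 * sizeof(unsigned long));
                unsigned long red = RED_ACTIVE;

                if (!raw)
                        return NULL;
                memcpy(raw, &red, sizeof(red));                      /* redzone 1 */
                memcpy(raw + sizeof(red) + size, &red, sizeof(red)); /* redzone 2 */
                return raw + sizeof(red);
        }

        static void guarded_free(void *obj, size_t size)
        {
                unsigned char *raw = (unsigned char *)obj - sizeof(unsigned long);
                unsigned long r1, r2, inactive = RED_INACTIVE;

                memcpy(&r1, raw, sizeof(r1));
                memcpy(&r2, raw + sizeof(r1) + size, sizeof(r2));
                if (r1 != RED_ACTIVE || r2 != RED_ACTIVE)
                        fprintf(stderr, "double free, or memory outside"
                                " object was overwritten\n");
                /* mirror the RED_ACTIVE -> RED_INACTIVE transition on free */
                memcpy(raw, &inactive, sizeof(inactive));
                memcpy(raw + sizeof(r1) + size, &inactive, sizeof(inactive));
                free(raw);
        }

        int main(void)
        {
                char *p = guarded_alloc(16);

                if (!p)
                        return 1;
                memset(p, 0, 17);        /* one-byte overrun clobbers redzone 2 */
                guarded_free(p, 16);     /* prints the complaint above */
                return 0;
        }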
@@ -2340,7 +2377,7 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) | |||
2340 | { | 2377 | { |
2341 | kmem_bufctl_t i; | 2378 | kmem_bufctl_t i; |
2342 | int entries = 0; | 2379 | int entries = 0; |
2343 | 2380 | ||
2344 | /* Check slab's freelist to see if this obj is there. */ | 2381 | /* Check slab's freelist to see if this obj is there. */ |
2345 | for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { | 2382 | for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { |
2346 | entries++; | 2383 | entries++; |
@@ -2348,13 +2385,16 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp) | |||
2348 | goto bad; | 2385 | goto bad; |
2349 | } | 2386 | } |
2350 | if (entries != cachep->num - slabp->inuse) { | 2387 | if (entries != cachep->num - slabp->inuse) { |
2351 | bad: | 2388 | bad: |
2352 | printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", | 2389 | printk(KERN_ERR |
2353 | cachep->name, cachep->num, slabp, slabp->inuse); | 2390 | "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n", |
2354 | for (i=0;i<sizeof(slabp)+cachep->num*sizeof(kmem_bufctl_t);i++) { | 2391 | cachep->name, cachep->num, slabp, slabp->inuse); |
2355 | if ((i%16)==0) | 2392 | for (i = 0; |
2393 | i < sizeof(slabp) + cachep->num * sizeof(kmem_bufctl_t); | ||
2394 | i++) { | ||
2395 | if ((i % 16) == 0) | ||
2356 | printk("\n%03x:", i); | 2396 | printk("\n%03x:", i); |
2357 | printk(" %02x", ((unsigned char*)slabp)[i]); | 2397 | printk(" %02x", ((unsigned char *)slabp)[i]); |
2358 | } | 2398 | } |
2359 | printk("\n"); | 2399 | printk("\n"); |
2360 | BUG(); | 2400 | BUG(); |
@@ -2374,7 +2414,7 @@ static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags) | |||
2374 | 2414 | ||
2375 | check_irq_off(); | 2415 | check_irq_off(); |
2376 | ac = ac_data(cachep); | 2416 | ac = ac_data(cachep); |
2377 | retry: | 2417 | retry: |
2378 | batchcount = ac->batchcount; | 2418 | batchcount = ac->batchcount; |
2379 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { | 2419 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { |
2380 | /* if there was little recent activity on this | 2420 | /* if there was little recent activity on this |
@@ -2396,8 +2436,8 @@ retry: | |||
2396 | shared_array->avail -= batchcount; | 2436 | shared_array->avail -= batchcount; |
2397 | ac->avail = batchcount; | 2437 | ac->avail = batchcount; |
2398 | memcpy(ac->entry, | 2438 | memcpy(ac->entry, |
2399 | &(shared_array->entry[shared_array->avail]), | 2439 | &(shared_array->entry[shared_array->avail]), |
2400 | sizeof(void*)*batchcount); | 2440 | sizeof(void *) * batchcount); |
2401 | shared_array->touched = 1; | 2441 | shared_array->touched = 1; |
2402 | goto alloc_done; | 2442 | goto alloc_done; |
2403 | } | 2443 | } |
@@ -2425,7 +2465,7 @@ retry: | |||
2425 | 2465 | ||
2426 | /* get obj pointer */ | 2466 | /* get obj pointer */ |
2427 | ac->entry[ac->avail++] = slabp->s_mem + | 2467 | ac->entry[ac->avail++] = slabp->s_mem + |
2428 | slabp->free*cachep->objsize; | 2468 | slabp->free * cachep->objsize; |
2429 | 2469 | ||
2430 | slabp->inuse++; | 2470 | slabp->inuse++; |
2431 | next = slab_bufctl(slabp)[slabp->free]; | 2471 | next = slab_bufctl(slabp)[slabp->free]; |
@@ -2433,7 +2473,7 @@ retry: | |||
2433 | slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; | 2473 | slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; |
2434 | WARN_ON(numa_node_id() != slabp->nodeid); | 2474 | WARN_ON(numa_node_id() != slabp->nodeid); |
2435 | #endif | 2475 | #endif |
2436 | slabp->free = next; | 2476 | slabp->free = next; |
2437 | } | 2477 | } |
2438 | check_slabp(cachep, slabp); | 2478 | check_slabp(cachep, slabp); |
2439 | 2479 | ||
@@ -2445,9 +2485,9 @@ retry: | |||
2445 | list_add(&slabp->list, &l3->slabs_partial); | 2485 | list_add(&slabp->list, &l3->slabs_partial); |
2446 | } | 2486 | } |
2447 | 2487 | ||
2448 | must_grow: | 2488 | must_grow: |
2449 | l3->free_objects -= ac->avail; | 2489 | l3->free_objects -= ac->avail; |
2450 | alloc_done: | 2490 | alloc_done: |
2451 | spin_unlock(&l3->list_lock); | 2491 | spin_unlock(&l3->list_lock); |
2452 | 2492 | ||
2453 | if (unlikely(!ac->avail)) { | 2493 | if (unlikely(!ac->avail)) { |
@@ -2459,7 +2499,7 @@ alloc_done: | |||
2459 | if (!x && ac->avail == 0) // no objects in sight? abort | 2499 | if (!x && ac->avail == 0) // no objects in sight? abort |
2460 | return NULL; | 2500 | return NULL; |
2461 | 2501 | ||
2462 | if (!ac->avail) // objects refilled by interrupt? | 2502 | if (!ac->avail) // objects refilled by interrupt? |
2463 | goto retry; | 2503 | goto retry; |
2464 | } | 2504 | } |
2465 | ac->touched = 1; | 2505 | ac->touched = 1; |
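Most of the churn above is reformatting inside cache_alloc_refill(), whose shape is worth keeping in mind: grab a batch of object pointers from the per-node shared array if it has any, otherwise peel objects off the partial and free slab lists, and only then fall back to cache_grow(). A condensed user-space model of that batch-refill fast path; the structure layout mirrors the code above, but LIMIT, BATCHCOUNT and the object pool are invented for the sketch.

        #include <stdio.h>
        #include <string.h>

        #define LIMIT      8    /* per-CPU array capacity (illustrative) */
        #define BATCHCOUNT 4    /* objects moved per refill (illustrative) */

        struct array_cache {
                unsigned int avail;
                void *entry[LIMIT];
        };

        static int pool[6];                      /* stand-ins for slab objects */
        static struct array_cache cpu_cache;     /* "ac" in the code above */
        static struct array_cache shared = {     /* per-node shared array */
                .avail = 6,
                .entry = { &pool[0], &pool[1], &pool[2],
                           &pool[3], &pool[4], &pool[5] },
        };

        /* Mirrors the fast path: pull up to BATCHCOUNT pointers from the
         * shared array in one memcpy. */
        static int refill(struct array_cache *ac)
        {
                unsigned int batch = BATCHCOUNT;

                if (!shared.avail)
                        return 0;               /* would fall back to the slab lists */
                if (batch > shared.avail)
                        batch = shared.avail;
                shared.avail -= batch;
                memcpy(ac->entry, &shared.entry[shared.avail],
                       sizeof(void *) * batch);
                ac->avail = batch;
                return 1;
        }

        static void *cache_alloc(void)
        {
                if (!cpu_cache.avail && !refill(&cpu_cache))
                        return NULL;            /* cache_grow() would run here */
                return cpu_cache.entry[--cpu_cache.avail];
        }

        int main(void)
        {
                for (int i = 0; i < 7; i++)
                        printf("alloc -> %p\n", cache_alloc());
                return 0;
        }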
@@ -2476,16 +2516,16 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags) | |||
2476 | } | 2516 | } |
2477 | 2517 | ||
2478 | #if DEBUG | 2518 | #if DEBUG |
2479 | static void * | 2519 | static void *cache_alloc_debugcheck_after(kmem_cache_t *cachep, gfp_t flags, |
2480 | cache_alloc_debugcheck_after(kmem_cache_t *cachep, | 2520 | void *objp, void *caller) |
2481 | gfp_t flags, void *objp, void *caller) | ||
2482 | { | 2521 | { |
2483 | if (!objp) | 2522 | if (!objp) |
2484 | return objp; | 2523 | return objp; |
2485 | if (cachep->flags & SLAB_POISON) { | 2524 | if (cachep->flags & SLAB_POISON) { |
2486 | #ifdef CONFIG_DEBUG_PAGEALLOC | 2525 | #ifdef CONFIG_DEBUG_PAGEALLOC |
2487 | if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) | 2526 | if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) |
2488 | kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1); | 2527 | kernel_map_pages(virt_to_page(objp), |
2528 | cachep->objsize / PAGE_SIZE, 1); | ||
2489 | else | 2529 | else |
2490 | check_poison_obj(cachep, objp); | 2530 | check_poison_obj(cachep, objp); |
2491 | #else | 2531 | #else |
@@ -2497,24 +2537,28 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep, | |||
2497 | *dbg_userword(cachep, objp) = caller; | 2537 | *dbg_userword(cachep, objp) = caller; |
2498 | 2538 | ||
2499 | if (cachep->flags & SLAB_RED_ZONE) { | 2539 | if (cachep->flags & SLAB_RED_ZONE) { |
2500 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { | 2540 | if (*dbg_redzone1(cachep, objp) != RED_INACTIVE |
2501 | slab_error(cachep, "double free, or memory outside" | 2541 | || *dbg_redzone2(cachep, objp) != RED_INACTIVE) { |
2502 | " object was overwritten"); | 2542 | slab_error(cachep, |
2503 | printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", | 2543 | "double free, or memory outside" |
2504 | objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp)); | 2544 | " object was overwritten"); |
2545 | printk(KERN_ERR | ||
2546 | "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n", | ||
2547 | objp, *dbg_redzone1(cachep, objp), | ||
2548 | *dbg_redzone2(cachep, objp)); | ||
2505 | } | 2549 | } |
2506 | *dbg_redzone1(cachep, objp) = RED_ACTIVE; | 2550 | *dbg_redzone1(cachep, objp) = RED_ACTIVE; |
2507 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; | 2551 | *dbg_redzone2(cachep, objp) = RED_ACTIVE; |
2508 | } | 2552 | } |
2509 | objp += obj_dbghead(cachep); | 2553 | objp += obj_dbghead(cachep); |
2510 | if (cachep->ctor && cachep->flags & SLAB_POISON) { | 2554 | if (cachep->ctor && cachep->flags & SLAB_POISON) { |
2511 | unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; | 2555 | unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; |
2512 | 2556 | ||
2513 | if (!(flags & __GFP_WAIT)) | 2557 | if (!(flags & __GFP_WAIT)) |
2514 | ctor_flags |= SLAB_CTOR_ATOMIC; | 2558 | ctor_flags |= SLAB_CTOR_ATOMIC; |
2515 | 2559 | ||
2516 | cachep->ctor(objp, cachep, ctor_flags); | 2560 | cachep->ctor(objp, cachep, ctor_flags); |
2517 | } | 2561 | } |
2518 | return objp; | 2562 | return objp; |
2519 | } | 2563 | } |
2520 | #else | 2564 | #else |
@@ -2523,7 +2567,7 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep, | |||
2523 | 2567 | ||
2524 | static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) | 2568 | static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) |
2525 | { | 2569 | { |
2526 | void* objp; | 2570 | void *objp; |
2527 | struct array_cache *ac; | 2571 | struct array_cache *ac; |
2528 | 2572 | ||
2529 | check_irq_off(); | 2573 | check_irq_off(); |
@@ -2542,7 +2586,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags) | |||
2542 | static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) | 2586 | static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) |
2543 | { | 2587 | { |
2544 | unsigned long save_flags; | 2588 | unsigned long save_flags; |
2545 | void* objp; | 2589 | void *objp; |
2546 | 2590 | ||
2547 | cache_alloc_debugcheck_before(cachep, flags); | 2591 | cache_alloc_debugcheck_before(cachep, flags); |
2548 | 2592 | ||
@@ -2550,7 +2594,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) | |||
2550 | objp = ____cache_alloc(cachep, flags); | 2594 | objp = ____cache_alloc(cachep, flags); |
2551 | local_irq_restore(save_flags); | 2595 | local_irq_restore(save_flags); |
2552 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, | 2596 | objp = cache_alloc_debugcheck_after(cachep, flags, objp, |
2553 | __builtin_return_address(0)); | 2597 | __builtin_return_address(0)); |
2554 | prefetchw(objp); | 2598 | prefetchw(objp); |
2555 | return objp; | 2599 | return objp; |
2556 | } | 2600 | } |
@@ -2562,74 +2606,75 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags) | |||
2562 | static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) | 2606 | static void *__cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) |
2563 | { | 2607 | { |
2564 | struct list_head *entry; | 2608 | struct list_head *entry; |
2565 | struct slab *slabp; | 2609 | struct slab *slabp; |
2566 | struct kmem_list3 *l3; | 2610 | struct kmem_list3 *l3; |
2567 | void *obj; | 2611 | void *obj; |
2568 | kmem_bufctl_t next; | 2612 | kmem_bufctl_t next; |
2569 | int x; | 2613 | int x; |
2570 | 2614 | ||
2571 | l3 = cachep->nodelists[nodeid]; | 2615 | l3 = cachep->nodelists[nodeid]; |
2572 | BUG_ON(!l3); | 2616 | BUG_ON(!l3); |
2573 | 2617 | ||
2574 | retry: | 2618 | retry: |
2575 | spin_lock(&l3->list_lock); | 2619 | spin_lock(&l3->list_lock); |
2576 | entry = l3->slabs_partial.next; | 2620 | entry = l3->slabs_partial.next; |
2577 | if (entry == &l3->slabs_partial) { | 2621 | if (entry == &l3->slabs_partial) { |
2578 | l3->free_touched = 1; | 2622 | l3->free_touched = 1; |
2579 | entry = l3->slabs_free.next; | 2623 | entry = l3->slabs_free.next; |
2580 | if (entry == &l3->slabs_free) | 2624 | if (entry == &l3->slabs_free) |
2581 | goto must_grow; | 2625 | goto must_grow; |
2582 | } | 2626 | } |
2583 | 2627 | ||
2584 | slabp = list_entry(entry, struct slab, list); | 2628 | slabp = list_entry(entry, struct slab, list); |
2585 | check_spinlock_acquired_node(cachep, nodeid); | 2629 | check_spinlock_acquired_node(cachep, nodeid); |
2586 | check_slabp(cachep, slabp); | 2630 | check_slabp(cachep, slabp); |
2587 | 2631 | ||
2588 | STATS_INC_NODEALLOCS(cachep); | 2632 | STATS_INC_NODEALLOCS(cachep); |
2589 | STATS_INC_ACTIVE(cachep); | 2633 | STATS_INC_ACTIVE(cachep); |
2590 | STATS_SET_HIGH(cachep); | 2634 | STATS_SET_HIGH(cachep); |
2591 | 2635 | ||
2592 | BUG_ON(slabp->inuse == cachep->num); | 2636 | BUG_ON(slabp->inuse == cachep->num); |
2593 | 2637 | ||
2594 | /* get obj pointer */ | 2638 | /* get obj pointer */ |
2595 | obj = slabp->s_mem + slabp->free*cachep->objsize; | 2639 | obj = slabp->s_mem + slabp->free * cachep->objsize; |
2596 | slabp->inuse++; | 2640 | slabp->inuse++; |
2597 | next = slab_bufctl(slabp)[slabp->free]; | 2641 | next = slab_bufctl(slabp)[slabp->free]; |
2598 | #if DEBUG | 2642 | #if DEBUG |
2599 | slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; | 2643 | slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; |
2600 | #endif | 2644 | #endif |
2601 | slabp->free = next; | 2645 | slabp->free = next; |
2602 | check_slabp(cachep, slabp); | 2646 | check_slabp(cachep, slabp); |
2603 | l3->free_objects--; | 2647 | l3->free_objects--; |
2604 | /* move slabp to correct slabp list: */ | 2648 | /* move slabp to correct slabp list: */ |
2605 | list_del(&slabp->list); | 2649 | list_del(&slabp->list); |
2606 | 2650 | ||
2607 | if (slabp->free == BUFCTL_END) { | 2651 | if (slabp->free == BUFCTL_END) { |
2608 | list_add(&slabp->list, &l3->slabs_full); | 2652 | list_add(&slabp->list, &l3->slabs_full); |
2609 | } else { | 2653 | } else { |
2610 | list_add(&slabp->list, &l3->slabs_partial); | 2654 | list_add(&slabp->list, &l3->slabs_partial); |
2611 | } | 2655 | } |
2612 | 2656 | ||
2613 | spin_unlock(&l3->list_lock); | 2657 | spin_unlock(&l3->list_lock); |
2614 | goto done; | 2658 | goto done; |
2615 | 2659 | ||
2616 | must_grow: | 2660 | must_grow: |
2617 | spin_unlock(&l3->list_lock); | 2661 | spin_unlock(&l3->list_lock); |
2618 | x = cache_grow(cachep, flags, nodeid); | 2662 | x = cache_grow(cachep, flags, nodeid); |
2619 | 2663 | ||
2620 | if (!x) | 2664 | if (!x) |
2621 | return NULL; | 2665 | return NULL; |
2622 | 2666 | ||
2623 | goto retry; | 2667 | goto retry; |
2624 | done: | 2668 | done: |
2625 | return obj; | 2669 | return obj; |
2626 | } | 2670 | } |
2627 | #endif | 2671 | #endif |
2628 | 2672 | ||
2629 | /* | 2673 | /* |
2630 | * Caller needs to acquire correct kmem_list's list_lock | 2674 | * Caller needs to acquire correct kmem_list's list_lock |
2631 | */ | 2675 | */ |
2632 | static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int node) | 2676 | static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, |
2677 | int node) | ||
2633 | { | 2678 | { |
2634 | int i; | 2679 | int i; |
2635 | struct kmem_list3 *l3; | 2680 | struct kmem_list3 *l3; |
@@ -2652,7 +2697,7 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int n | |||
2652 | 2697 | ||
2653 | if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { | 2698 | if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { |
2654 | printk(KERN_ERR "slab: double free detected in cache " | 2699 | printk(KERN_ERR "slab: double free detected in cache " |
2655 | "'%s', objp %p\n", cachep->name, objp); | 2700 | "'%s', objp %p\n", cachep->name, objp); |
2656 | BUG(); | 2701 | BUG(); |
2657 | } | 2702 | } |
2658 | #endif | 2703 | #endif |
@@ -2696,20 +2741,19 @@ static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) | |||
2696 | spin_lock(&l3->list_lock); | 2741 | spin_lock(&l3->list_lock); |
2697 | if (l3->shared) { | 2742 | if (l3->shared) { |
2698 | struct array_cache *shared_array = l3->shared; | 2743 | struct array_cache *shared_array = l3->shared; |
2699 | int max = shared_array->limit-shared_array->avail; | 2744 | int max = shared_array->limit - shared_array->avail; |
2700 | if (max) { | 2745 | if (max) { |
2701 | if (batchcount > max) | 2746 | if (batchcount > max) |
2702 | batchcount = max; | 2747 | batchcount = max; |
2703 | memcpy(&(shared_array->entry[shared_array->avail]), | 2748 | memcpy(&(shared_array->entry[shared_array->avail]), |
2704 | ac->entry, | 2749 | ac->entry, sizeof(void *) * batchcount); |
2705 | sizeof(void*)*batchcount); | ||
2706 | shared_array->avail += batchcount; | 2750 | shared_array->avail += batchcount; |
2707 | goto free_done; | 2751 | goto free_done; |
2708 | } | 2752 | } |
2709 | } | 2753 | } |
2710 | 2754 | ||
2711 | free_block(cachep, ac->entry, batchcount, node); | 2755 | free_block(cachep, ac->entry, batchcount, node); |
2712 | free_done: | 2756 | free_done: |
2713 | #if STATS | 2757 | #if STATS |
2714 | { | 2758 | { |
2715 | int i = 0; | 2759 | int i = 0; |
@@ -2731,10 +2775,9 @@ free_done: | |||
2731 | spin_unlock(&l3->list_lock); | 2775 | spin_unlock(&l3->list_lock); |
2732 | ac->avail -= batchcount; | 2776 | ac->avail -= batchcount; |
2733 | memmove(ac->entry, &(ac->entry[batchcount]), | 2777 | memmove(ac->entry, &(ac->entry[batchcount]), |
2734 | sizeof(void*)*ac->avail); | 2778 | sizeof(void *) * ac->avail); |
2735 | } | 2779 | } |
2736 | 2780 | ||
2737 | |||
2738 | /* | 2781 | /* |
2739 | * __cache_free | 2782 | * __cache_free |
2740 | * Release an obj back to its cache. If the obj has a constructed | 2783 | * Release an obj back to its cache. If the obj has a constructed |
@@ -2759,7 +2802,8 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp) | |||
2759 | if (unlikely(slabp->nodeid != numa_node_id())) { | 2802 | if (unlikely(slabp->nodeid != numa_node_id())) { |
2760 | struct array_cache *alien = NULL; | 2803 | struct array_cache *alien = NULL; |
2761 | int nodeid = slabp->nodeid; | 2804 | int nodeid = slabp->nodeid; |
2762 | struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()]; | 2805 | struct kmem_list3 *l3 = |
2806 | cachep->nodelists[numa_node_id()]; | ||
2763 | 2807 | ||
2764 | STATS_INC_NODEFREES(cachep); | 2808 | STATS_INC_NODEFREES(cachep); |
2765 | if (l3->alien && l3->alien[nodeid]) { | 2809 | if (l3->alien && l3->alien[nodeid]) { |
@@ -2767,15 +2811,15 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp) | |||
2767 | spin_lock(&alien->lock); | 2811 | spin_lock(&alien->lock); |
2768 | if (unlikely(alien->avail == alien->limit)) | 2812 | if (unlikely(alien->avail == alien->limit)) |
2769 | __drain_alien_cache(cachep, | 2813 | __drain_alien_cache(cachep, |
2770 | alien, nodeid); | 2814 | alien, nodeid); |
2771 | alien->entry[alien->avail++] = objp; | 2815 | alien->entry[alien->avail++] = objp; |
2772 | spin_unlock(&alien->lock); | 2816 | spin_unlock(&alien->lock); |
2773 | } else { | 2817 | } else { |
2774 | spin_lock(&(cachep->nodelists[nodeid])-> | 2818 | spin_lock(&(cachep->nodelists[nodeid])-> |
2775 | list_lock); | 2819 | list_lock); |
2776 | free_block(cachep, &objp, 1, nodeid); | 2820 | free_block(cachep, &objp, 1, nodeid); |
2777 | spin_unlock(&(cachep->nodelists[nodeid])-> | 2821 | spin_unlock(&(cachep->nodelists[nodeid])-> |
2778 | list_lock); | 2822 | list_lock); |
2779 | } | 2823 | } |
2780 | return; | 2824 | return; |
2781 | } | 2825 | } |
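The two hunks above are pure re-indentation, but they cover the NUMA branch of __cache_free(): an object freed on a node other than its home node is parked in a per-node alien array and pushed back to the owning node in bulk once the array fills (or, with no alien array, freed straight onto the remote node's lists under that node's lock). A toy sketch of that queue-then-drain idea, with ALIEN_LIMIT and the helper names invented for illustration.

        #include <stdio.h>

        #define ALIEN_LIMIT 3   /* illustrative queue depth */

        struct alien_cache {
                unsigned int avail;
                void *entry[ALIEN_LIMIT];
        };

        /* Stand-in for free_block(): give objects back to their home node. */
        static void free_to_node(int node, void **objs, unsigned int nr)
        {
                printf("flushing %u object(s) to node %d\n", nr, node);
        }

        /* Mirrors the shape of the remote-free path: batch frees destined for
         * another node and drain the batch once the queue is full. */
        static void remote_free(struct alien_cache *alien, int home_node, void *obj)
        {
                if (alien->avail == ALIEN_LIMIT) {
                        free_to_node(home_node, alien->entry, alien->avail);
                        alien->avail = 0;
                }
                alien->entry[alien->avail++] = obj;
        }

        int main(void)
        {
                static int objs[5];
                struct alien_cache alien = { 0 };

                for (int i = 0; i < 5; i++)
                        remote_free(&alien, 1, &objs[i]);
                /* two objects are still queued, waiting for the next drain */
                printf("%u object(s) still queued\n", alien.avail);
                return 0;
        }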
@@ -2822,9 +2866,9 @@ EXPORT_SYMBOL(kmem_cache_alloc); | |||
2822 | */ | 2866 | */ |
2823 | int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) | 2867 | int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) |
2824 | { | 2868 | { |
2825 | unsigned long addr = (unsigned long) ptr; | 2869 | unsigned long addr = (unsigned long)ptr; |
2826 | unsigned long min_addr = PAGE_OFFSET; | 2870 | unsigned long min_addr = PAGE_OFFSET; |
2827 | unsigned long align_mask = BYTES_PER_WORD-1; | 2871 | unsigned long align_mask = BYTES_PER_WORD - 1; |
2828 | unsigned long size = cachep->objsize; | 2872 | unsigned long size = cachep->objsize; |
2829 | struct page *page; | 2873 | struct page *page; |
2830 | 2874 | ||
@@ -2844,7 +2888,7 @@ int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr) | |||
2844 | if (unlikely(page_get_cache(page) != cachep)) | 2888 | if (unlikely(page_get_cache(page) != cachep)) |
2845 | goto out; | 2889 | goto out; |
2846 | return 1; | 2890 | return 1; |
2847 | out: | 2891 | out: |
2848 | return 0; | 2892 | return 0; |
2849 | } | 2893 | } |
2850 | 2894 | ||
@@ -2871,8 +2915,10 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) | |||
2871 | 2915 | ||
2872 | if (unlikely(!cachep->nodelists[nodeid])) { | 2916 | if (unlikely(!cachep->nodelists[nodeid])) { |
2873 | /* Fall back to __cache_alloc if we run into trouble */ | 2917 | /* Fall back to __cache_alloc if we run into trouble */ |
2874 | printk(KERN_WARNING "slab: not allocating in inactive node %d for cache %s\n", nodeid, cachep->name); | 2918 | printk(KERN_WARNING |
2875 | return __cache_alloc(cachep,flags); | 2919 | "slab: not allocating in inactive node %d for cache %s\n", |
2920 | nodeid, cachep->name); | ||
2921 | return __cache_alloc(cachep, flags); | ||
2876 | } | 2922 | } |
2877 | 2923 | ||
2878 | cache_alloc_debugcheck_before(cachep, flags); | 2924 | cache_alloc_debugcheck_before(cachep, flags); |
@@ -2882,7 +2928,9 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid) | |||
2882 | else | 2928 | else |
2883 | ptr = __cache_alloc_node(cachep, flags, nodeid); | 2929 | ptr = __cache_alloc_node(cachep, flags, nodeid); |
2884 | local_irq_restore(save_flags); | 2930 | local_irq_restore(save_flags); |
2885 | ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0)); | 2931 | ptr = |
2932 | cache_alloc_debugcheck_after(cachep, flags, ptr, | ||
2933 | __builtin_return_address(0)); | ||
2886 | 2934 | ||
2887 | return ptr; | 2935 | return ptr; |
2888 | } | 2936 | } |
@@ -2944,12 +2992,11 @@ EXPORT_SYMBOL(__kmalloc); | |||
2944 | * Objects should be dereferenced using the per_cpu_ptr macro only. | 2992 | * Objects should be dereferenced using the per_cpu_ptr macro only. |
2945 | * | 2993 | * |
2946 | * @size: how many bytes of memory are required. | 2994 | * @size: how many bytes of memory are required. |
2947 | * @align: the alignment, which can't be greater than SMP_CACHE_BYTES. | ||
2948 | */ | 2995 | */ |
2949 | void *__alloc_percpu(size_t size, size_t align) | 2996 | void *__alloc_percpu(size_t size) |
2950 | { | 2997 | { |
2951 | int i; | 2998 | int i; |
2952 | struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); | 2999 | struct percpu_data *pdata = kmalloc(sizeof(*pdata), GFP_KERNEL); |
2953 | 3000 | ||
2954 | if (!pdata) | 3001 | if (!pdata) |
2955 | return NULL; | 3002 | return NULL; |
@@ -2973,9 +3020,9 @@ void *__alloc_percpu(size_t size, size_t align) | |||
2973 | } | 3020 | } |
2974 | 3021 | ||
2975 | /* Catch derefs w/o wrappers */ | 3022 | /* Catch derefs w/o wrappers */ |
2976 | return (void *) (~(unsigned long) pdata); | 3023 | return (void *)(~(unsigned long)pdata); |
2977 | 3024 | ||
2978 | unwind_oom: | 3025 | unwind_oom: |
2979 | while (--i >= 0) { | 3026 | while (--i >= 0) { |
2980 | if (!cpu_possible(i)) | 3027 | if (!cpu_possible(i)) |
2981 | continue; | 3028 | continue; |
@@ -3006,20 +3053,6 @@ void kmem_cache_free(kmem_cache_t *cachep, void *objp) | |||
3006 | EXPORT_SYMBOL(kmem_cache_free); | 3053 | EXPORT_SYMBOL(kmem_cache_free); |
3007 | 3054 | ||
3008 | /** | 3055 | /** |
3009 | * kzalloc - allocate memory. The memory is set to zero. | ||
3010 | * @size: how many bytes of memory are required. | ||
3011 | * @flags: the type of memory to allocate. | ||
3012 | */ | ||
3013 | void *kzalloc(size_t size, gfp_t flags) | ||
3014 | { | ||
3015 | void *ret = kmalloc(size, flags); | ||
3016 | if (ret) | ||
3017 | memset(ret, 0, size); | ||
3018 | return ret; | ||
3019 | } | ||
3020 | EXPORT_SYMBOL(kzalloc); | ||
3021 | |||
3022 | /** | ||
3023 | * kfree - free previously allocated memory | 3056 | * kfree - free previously allocated memory |
3024 | * @objp: pointer returned by kmalloc. | 3057 | * @objp: pointer returned by kmalloc. |
3025 | * | 3058 | * |
@@ -3038,7 +3071,7 @@ void kfree(const void *objp) | |||
3038 | local_irq_save(flags); | 3071 | local_irq_save(flags); |
3039 | kfree_debugcheck(objp); | 3072 | kfree_debugcheck(objp); |
3040 | c = page_get_cache(virt_to_page(objp)); | 3073 | c = page_get_cache(virt_to_page(objp)); |
3041 | __cache_free(c, (void*)objp); | 3074 | __cache_free(c, (void *)objp); |
3042 | local_irq_restore(flags); | 3075 | local_irq_restore(flags); |
3043 | } | 3076 | } |
3044 | EXPORT_SYMBOL(kfree); | 3077 | EXPORT_SYMBOL(kfree); |
@@ -3051,17 +3084,16 @@ EXPORT_SYMBOL(kfree); | |||
3051 | * Don't free memory not originally allocated by alloc_percpu() | 3084 | * Don't free memory not originally allocated by alloc_percpu() |
3052 | * The complemented objp is to check for that. | 3085 | * The complemented objp is to check for that. |
3053 | */ | 3086 | */ |
3054 | void | 3087 | void free_percpu(const void *objp) |
3055 | free_percpu(const void *objp) | ||
3056 | { | 3088 | { |
3057 | int i; | 3089 | int i; |
3058 | struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); | 3090 | struct percpu_data *p = (struct percpu_data *)(~(unsigned long)objp); |
3059 | 3091 | ||
3060 | /* | 3092 | /* |
3061 | * We allocate for all cpus so we cannot use for online cpu here. | 3093 | * We allocate for all cpus so we cannot use for online cpu here. |
3062 | */ | 3094 | */ |
3063 | for_each_cpu(i) | 3095 | for_each_cpu(i) |
3064 | kfree(p->ptrs[i]); | 3096 | kfree(p->ptrs[i]); |
3065 | kfree(p); | 3097 | kfree(p); |
3066 | } | 3098 | } |
3067 | EXPORT_SYMBOL(free_percpu); | 3099 | EXPORT_SYMBOL(free_percpu); |
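The __alloc_percpu()/free_percpu() hunks above preserve one deliberate quirk: the value handed back to callers is the bitwise complement of the real percpu_data pointer ("Catch derefs w/o wrappers"), so code that dereferences it without the per-cpu accessor is very likely to fault instead of silently reading the wrong data. A stand-alone illustration of the complement trick; NR_CPUS here and the _demo names are assumptions for the sketch, and it relies, as the kernel code does, on pointers fitting in an unsigned long.

        #include <stdio.h>
        #include <stdlib.h>

        #define NR_CPUS 4    /* illustrative */

        struct percpu_data {
                void *ptrs[NR_CPUS];
        };

        /* Allocate one object per "cpu" and return the complemented descriptor. */
        static void *alloc_percpu_demo(size_t size)
        {
                struct percpu_data *pdata = malloc(sizeof(*pdata));

                if (!pdata)
                        return NULL;
                for (int i = 0; i < NR_CPUS; i++) {
                        pdata->ptrs[i] = calloc(1, size);
                        if (!pdata->ptrs[i])
                                exit(1);        /* demo: skip proper unwinding */
                }
                /* Catch derefs w/o wrappers: hand out ~pdata, not pdata. */
                return (void *)(~(unsigned long)pdata);
        }

        /* The accessor undoes the complement before indexing by cpu. */
        #define per_cpu_ptr_demo(ptr, cpu) \
                (((struct percpu_data *)(~(unsigned long)(ptr)))->ptrs[(cpu)])

        static void free_percpu_demo(const void *objp)
        {
                struct percpu_data *p =
                        (struct percpu_data *)(~(unsigned long)objp);

                for (int i = 0; i < NR_CPUS; i++)
                        free(p->ptrs[i]);
                free(p);
        }

        int main(void)
        {
                void *counters = alloc_percpu_demo(sizeof(long));

                *(long *)per_cpu_ptr_demo(counters, 2) = 42;
                printf("cpu 2 counter = %ld\n",
                       *(long *)per_cpu_ptr_demo(counters, 2));
                free_percpu_demo(counters);
                return 0;
        }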
@@ -3095,44 +3127,44 @@ static int alloc_kmemlist(kmem_cache_t *cachep) | |||
3095 | if (!(new_alien = alloc_alien_cache(node, cachep->limit))) | 3127 | if (!(new_alien = alloc_alien_cache(node, cachep->limit))) |
3096 | goto fail; | 3128 | goto fail; |
3097 | #endif | 3129 | #endif |
3098 | if (!(new = alloc_arraycache(node, (cachep->shared* | 3130 | if (!(new = alloc_arraycache(node, (cachep->shared * |
3099 | cachep->batchcount), 0xbaadf00d))) | 3131 | cachep->batchcount), |
3132 | 0xbaadf00d))) | ||
3100 | goto fail; | 3133 | goto fail; |
3101 | if ((l3 = cachep->nodelists[node])) { | 3134 | if ((l3 = cachep->nodelists[node])) { |
3102 | 3135 | ||
3103 | spin_lock_irq(&l3->list_lock); | 3136 | spin_lock_irq(&l3->list_lock); |
3104 | 3137 | ||
3105 | if ((nc = cachep->nodelists[node]->shared)) | 3138 | if ((nc = cachep->nodelists[node]->shared)) |
3106 | free_block(cachep, nc->entry, | 3139 | free_block(cachep, nc->entry, nc->avail, node); |
3107 | nc->avail, node); | ||
3108 | 3140 | ||
3109 | l3->shared = new; | 3141 | l3->shared = new; |
3110 | if (!cachep->nodelists[node]->alien) { | 3142 | if (!cachep->nodelists[node]->alien) { |
3111 | l3->alien = new_alien; | 3143 | l3->alien = new_alien; |
3112 | new_alien = NULL; | 3144 | new_alien = NULL; |
3113 | } | 3145 | } |
3114 | l3->free_limit = (1 + nr_cpus_node(node))* | 3146 | l3->free_limit = (1 + nr_cpus_node(node)) * |
3115 | cachep->batchcount + cachep->num; | 3147 | cachep->batchcount + cachep->num; |
3116 | spin_unlock_irq(&l3->list_lock); | 3148 | spin_unlock_irq(&l3->list_lock); |
3117 | kfree(nc); | 3149 | kfree(nc); |
3118 | free_alien_cache(new_alien); | 3150 | free_alien_cache(new_alien); |
3119 | continue; | 3151 | continue; |
3120 | } | 3152 | } |
3121 | if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), | 3153 | if (!(l3 = kmalloc_node(sizeof(struct kmem_list3), |
3122 | GFP_KERNEL, node))) | 3154 | GFP_KERNEL, node))) |
3123 | goto fail; | 3155 | goto fail; |
3124 | 3156 | ||
3125 | kmem_list3_init(l3); | 3157 | kmem_list3_init(l3); |
3126 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + | 3158 | l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + |
3127 | ((unsigned long)cachep)%REAPTIMEOUT_LIST3; | 3159 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; |
3128 | l3->shared = new; | 3160 | l3->shared = new; |
3129 | l3->alien = new_alien; | 3161 | l3->alien = new_alien; |
3130 | l3->free_limit = (1 + nr_cpus_node(node))* | 3162 | l3->free_limit = (1 + nr_cpus_node(node)) * |
3131 | cachep->batchcount + cachep->num; | 3163 | cachep->batchcount + cachep->num; |
3132 | cachep->nodelists[node] = l3; | 3164 | cachep->nodelists[node] = l3; |
3133 | } | 3165 | } |
3134 | return err; | 3166 | return err; |
3135 | fail: | 3167 | fail: |
3136 | err = -ENOMEM; | 3168 | err = -ENOMEM; |
3137 | return err; | 3169 | return err; |
3138 | } | 3170 | } |
@@ -3154,18 +3186,19 @@ static void do_ccupdate_local(void *info) | |||
3154 | new->new[smp_processor_id()] = old; | 3186 | new->new[smp_processor_id()] = old; |
3155 | } | 3187 | } |
3156 | 3188 | ||
3157 | |||
3158 | static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, | 3189 | static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, |
3159 | int shared) | 3190 | int shared) |
3160 | { | 3191 | { |
3161 | struct ccupdate_struct new; | 3192 | struct ccupdate_struct new; |
3162 | int i, err; | 3193 | int i, err; |
3163 | 3194 | ||
3164 | memset(&new.new,0,sizeof(new.new)); | 3195 | memset(&new.new, 0, sizeof(new.new)); |
3165 | for_each_online_cpu(i) { | 3196 | for_each_online_cpu(i) { |
3166 | new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount); | 3197 | new.new[i] = |
3198 | alloc_arraycache(cpu_to_node(i), limit, batchcount); | ||
3167 | if (!new.new[i]) { | 3199 | if (!new.new[i]) { |
3168 | for (i--; i >= 0; i--) kfree(new.new[i]); | 3200 | for (i--; i >= 0; i--) |
3201 | kfree(new.new[i]); | ||
3169 | return -ENOMEM; | 3202 | return -ENOMEM; |
3170 | } | 3203 | } |
3171 | } | 3204 | } |
@@ -3193,13 +3226,12 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount, | |||
3193 | err = alloc_kmemlist(cachep); | 3226 | err = alloc_kmemlist(cachep); |
3194 | if (err) { | 3227 | if (err) { |
3195 | printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", | 3228 | printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n", |
3196 | cachep->name, -err); | 3229 | cachep->name, -err); |
3197 | BUG(); | 3230 | BUG(); |
3198 | } | 3231 | } |
3199 | return 0; | 3232 | return 0; |
3200 | } | 3233 | } |
3201 | 3234 | ||
3202 | |||
3203 | static void enable_cpucache(kmem_cache_t *cachep) | 3235 | static void enable_cpucache(kmem_cache_t *cachep) |
3204 | { | 3236 | { |
3205 | int err; | 3237 | int err; |
@@ -3246,14 +3278,14 @@ static void enable_cpucache(kmem_cache_t *cachep) | |||
3246 | if (limit > 32) | 3278 | if (limit > 32) |
3247 | limit = 32; | 3279 | limit = 32; |
3248 | #endif | 3280 | #endif |
3249 | err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared); | 3281 | err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared); |
3250 | if (err) | 3282 | if (err) |
3251 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", | 3283 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", |
3252 | cachep->name, -err); | 3284 | cachep->name, -err); |
3253 | } | 3285 | } |
3254 | 3286 | ||
3255 | static void drain_array_locked(kmem_cache_t *cachep, | 3287 | static void drain_array_locked(kmem_cache_t *cachep, struct array_cache *ac, |
3256 | struct array_cache *ac, int force, int node) | 3288 | int force, int node) |
3257 | { | 3289 | { |
3258 | int tofree; | 3290 | int tofree; |
3259 | 3291 | ||
@@ -3261,14 +3293,14 @@ static void drain_array_locked(kmem_cache_t *cachep, | |||
3261 | if (ac->touched && !force) { | 3293 | if (ac->touched && !force) { |
3262 | ac->touched = 0; | 3294 | ac->touched = 0; |
3263 | } else if (ac->avail) { | 3295 | } else if (ac->avail) { |
3264 | tofree = force ? ac->avail : (ac->limit+4)/5; | 3296 | tofree = force ? ac->avail : (ac->limit + 4) / 5; |
3265 | if (tofree > ac->avail) { | 3297 | if (tofree > ac->avail) { |
3266 | tofree = (ac->avail+1)/2; | 3298 | tofree = (ac->avail + 1) / 2; |
3267 | } | 3299 | } |
3268 | free_block(cachep, ac->entry, tofree, node); | 3300 | free_block(cachep, ac->entry, tofree, node); |
3269 | ac->avail -= tofree; | 3301 | ac->avail -= tofree; |
3270 | memmove(ac->entry, &(ac->entry[tofree]), | 3302 | memmove(ac->entry, &(ac->entry[tofree]), |
3271 | sizeof(void*)*ac->avail); | 3303 | sizeof(void *) * ac->avail); |
3272 | } | 3304 | } |
3273 | } | 3305 | } |
3274 | 3306 | ||
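For reference, the drain policy reindented above frees ac->avail entries when forced, and otherwise (limit + 4) / 5 of the array, capped at (avail + 1) / 2. With, say, limit = 120 and avail = 30 that is 24 objects per pass; with avail = 10 the cap kicks in and only 5 are released, so a lightly used array is trimmed gently rather than emptied.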
@@ -3291,13 +3323,14 @@ static void cache_reap(void *unused) | |||
3291 | 3323 | ||
3292 | if (down_trylock(&cache_chain_sem)) { | 3324 | if (down_trylock(&cache_chain_sem)) { |
3293 | /* Give up. Setup the next iteration. */ | 3325 | /* Give up. Setup the next iteration. */ |
3294 | schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC); | 3326 | schedule_delayed_work(&__get_cpu_var(reap_work), |
3327 | REAPTIMEOUT_CPUC); | ||
3295 | return; | 3328 | return; |
3296 | } | 3329 | } |
3297 | 3330 | ||
3298 | list_for_each(walk, &cache_chain) { | 3331 | list_for_each(walk, &cache_chain) { |
3299 | kmem_cache_t *searchp; | 3332 | kmem_cache_t *searchp; |
3300 | struct list_head* p; | 3333 | struct list_head *p; |
3301 | int tofree; | 3334 | int tofree; |
3302 | struct slab *slabp; | 3335 | struct slab *slabp; |
3303 | 3336 | ||
@@ -3314,7 +3347,7 @@ static void cache_reap(void *unused) | |||
3314 | spin_lock_irq(&l3->list_lock); | 3347 | spin_lock_irq(&l3->list_lock); |
3315 | 3348 | ||
3316 | drain_array_locked(searchp, ac_data(searchp), 0, | 3349 | drain_array_locked(searchp, ac_data(searchp), 0, |
3317 | numa_node_id()); | 3350 | numa_node_id()); |
3318 | 3351 | ||
3319 | if (time_after(l3->next_reap, jiffies)) | 3352 | if (time_after(l3->next_reap, jiffies)) |
3320 | goto next_unlock; | 3353 | goto next_unlock; |
@@ -3323,14 +3356,16 @@ static void cache_reap(void *unused) | |||
3323 | 3356 | ||
3324 | if (l3->shared) | 3357 | if (l3->shared) |
3325 | drain_array_locked(searchp, l3->shared, 0, | 3358 | drain_array_locked(searchp, l3->shared, 0, |
3326 | numa_node_id()); | 3359 | numa_node_id()); |
3327 | 3360 | ||
3328 | if (l3->free_touched) { | 3361 | if (l3->free_touched) { |
3329 | l3->free_touched = 0; | 3362 | l3->free_touched = 0; |
3330 | goto next_unlock; | 3363 | goto next_unlock; |
3331 | } | 3364 | } |
3332 | 3365 | ||
3333 | tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num); | 3366 | tofree = |
3367 | (l3->free_limit + 5 * searchp->num - | ||
3368 | 1) / (5 * searchp->num); | ||
3334 | do { | 3369 | do { |
3335 | p = l3->slabs_free.next; | 3370 | p = l3->slabs_free.next; |
3336 | if (p == &(l3->slabs_free)) | 3371 | if (p == &(l3->slabs_free)) |
@@ -3350,10 +3385,10 @@ static void cache_reap(void *unused) | |||
3350 | spin_unlock_irq(&l3->list_lock); | 3385 | spin_unlock_irq(&l3->list_lock); |
3351 | slab_destroy(searchp, slabp); | 3386 | slab_destroy(searchp, slabp); |
3352 | spin_lock_irq(&l3->list_lock); | 3387 | spin_lock_irq(&l3->list_lock); |
3353 | } while(--tofree > 0); | 3388 | } while (--tofree > 0); |
3354 | next_unlock: | 3389 | next_unlock: |
3355 | spin_unlock_irq(&l3->list_lock); | 3390 | spin_unlock_irq(&l3->list_lock); |
3356 | next: | 3391 | next: |
3357 | cond_resched(); | 3392 | cond_resched(); |
3358 | } | 3393 | } |
3359 | check_irq_on(); | 3394 | check_irq_on(); |
@@ -3365,32 +3400,37 @@ next: | |||
3365 | 3400 | ||
3366 | #ifdef CONFIG_PROC_FS | 3401 | #ifdef CONFIG_PROC_FS |
3367 | 3402 | ||
3368 | static void *s_start(struct seq_file *m, loff_t *pos) | 3403 | static void print_slabinfo_header(struct seq_file *m) |
3369 | { | 3404 | { |
3370 | loff_t n = *pos; | 3405 | /* |
3371 | struct list_head *p; | 3406 | * Output format version, so at least we can change it |
3372 | 3407 | * without _too_ many complaints. | |
3373 | down(&cache_chain_sem); | 3408 | */ |
3374 | if (!n) { | ||
3375 | /* | ||
3376 | * Output format version, so at least we can change it | ||
3377 | * without _too_ many complaints. | ||
3378 | */ | ||
3379 | #if STATS | 3409 | #if STATS |
3380 | seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); | 3410 | seq_puts(m, "slabinfo - version: 2.1 (statistics)\n"); |
3381 | #else | 3411 | #else |
3382 | seq_puts(m, "slabinfo - version: 2.1\n"); | 3412 | seq_puts(m, "slabinfo - version: 2.1\n"); |
3383 | #endif | 3413 | #endif |
3384 | seq_puts(m, "# name <active_objs> <num_objs> <objsize> <objperslab> <pagesperslab>"); | 3414 | seq_puts(m, "# name <active_objs> <num_objs> <objsize> " |
3385 | seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); | 3415 | "<objperslab> <pagesperslab>"); |
3386 | seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); | 3416 | seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>"); |
3417 | seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); | ||
3387 | #if STATS | 3418 | #if STATS |
3388 | seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>" | 3419 | seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped> " |
3389 | " <error> <maxfreeable> <nodeallocs> <remotefrees>"); | 3420 | "<error> <maxfreeable> <nodeallocs> <remotefrees>"); |
3390 | seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); | 3421 | seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); |
3391 | #endif | 3422 | #endif |
3392 | seq_putc(m, '\n'); | 3423 | seq_putc(m, '\n'); |
3393 | } | 3424 | } |
3425 | |||
3426 | static void *s_start(struct seq_file *m, loff_t *pos) | ||
3427 | { | ||
3428 | loff_t n = *pos; | ||
3429 | struct list_head *p; | ||
3430 | |||
3431 | down(&cache_chain_sem); | ||
3432 | if (!n) | ||
3433 | print_slabinfo_header(m); | ||
3394 | p = cache_chain.next; | 3434 | p = cache_chain.next; |
3395 | while (n--) { | 3435 | while (n--) { |
3396 | p = p->next; | 3436 | p = p->next; |
@@ -3405,7 +3445,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) | |||
3405 | kmem_cache_t *cachep = p; | 3445 | kmem_cache_t *cachep = p; |
3406 | ++*pos; | 3446 | ++*pos; |
3407 | return cachep->next.next == &cache_chain ? NULL | 3447 | return cachep->next.next == &cache_chain ? NULL |
3408 | : list_entry(cachep->next.next, kmem_cache_t, next); | 3448 | : list_entry(cachep->next.next, kmem_cache_t, next); |
3409 | } | 3449 | } |
3410 | 3450 | ||
3411 | static void s_stop(struct seq_file *m, void *p) | 3451 | static void s_stop(struct seq_file *m, void *p) |
@@ -3417,11 +3457,11 @@ static int s_show(struct seq_file *m, void *p) | |||
3417 | { | 3457 | { |
3418 | kmem_cache_t *cachep = p; | 3458 | kmem_cache_t *cachep = p; |
3419 | struct list_head *q; | 3459 | struct list_head *q; |
3420 | struct slab *slabp; | 3460 | struct slab *slabp; |
3421 | unsigned long active_objs; | 3461 | unsigned long active_objs; |
3422 | unsigned long num_objs; | 3462 | unsigned long num_objs; |
3423 | unsigned long active_slabs = 0; | 3463 | unsigned long active_slabs = 0; |
3424 | unsigned long num_slabs, free_objects = 0, shared_avail = 0; | 3464 | unsigned long num_slabs, free_objects = 0, shared_avail = 0; |
3425 | const char *name; | 3465 | const char *name; |
3426 | char *error = NULL; | 3466 | char *error = NULL; |
3427 | int node; | 3467 | int node; |
@@ -3438,14 +3478,14 @@ static int s_show(struct seq_file *m, void *p) | |||
3438 | 3478 | ||
3439 | spin_lock(&l3->list_lock); | 3479 | spin_lock(&l3->list_lock); |
3440 | 3480 | ||
3441 | list_for_each(q,&l3->slabs_full) { | 3481 | list_for_each(q, &l3->slabs_full) { |
3442 | slabp = list_entry(q, struct slab, list); | 3482 | slabp = list_entry(q, struct slab, list); |
3443 | if (slabp->inuse != cachep->num && !error) | 3483 | if (slabp->inuse != cachep->num && !error) |
3444 | error = "slabs_full accounting error"; | 3484 | error = "slabs_full accounting error"; |
3445 | active_objs += cachep->num; | 3485 | active_objs += cachep->num; |
3446 | active_slabs++; | 3486 | active_slabs++; |
3447 | } | 3487 | } |
3448 | list_for_each(q,&l3->slabs_partial) { | 3488 | list_for_each(q, &l3->slabs_partial) { |
3449 | slabp = list_entry(q, struct slab, list); | 3489 | slabp = list_entry(q, struct slab, list); |
3450 | if (slabp->inuse == cachep->num && !error) | 3490 | if (slabp->inuse == cachep->num && !error) |
3451 | error = "slabs_partial inuse accounting error"; | 3491 | error = "slabs_partial inuse accounting error"; |
@@ -3454,7 +3494,7 @@ static int s_show(struct seq_file *m, void *p) | |||
3454 | active_objs += slabp->inuse; | 3494 | active_objs += slabp->inuse; |
3455 | active_slabs++; | 3495 | active_slabs++; |
3456 | } | 3496 | } |
3457 | list_for_each(q,&l3->slabs_free) { | 3497 | list_for_each(q, &l3->slabs_free) { |
3458 | slabp = list_entry(q, struct slab, list); | 3498 | slabp = list_entry(q, struct slab, list); |
3459 | if (slabp->inuse && !error) | 3499 | if (slabp->inuse && !error) |
3460 | error = "slabs_free/inuse accounting error"; | 3500 | error = "slabs_free/inuse accounting error"; |
@@ -3465,25 +3505,24 @@ static int s_show(struct seq_file *m, void *p) | |||
3465 | 3505 | ||
3466 | spin_unlock(&l3->list_lock); | 3506 | spin_unlock(&l3->list_lock); |
3467 | } | 3507 | } |
3468 | num_slabs+=active_slabs; | 3508 | num_slabs += active_slabs; |
3469 | num_objs = num_slabs*cachep->num; | 3509 | num_objs = num_slabs * cachep->num; |
3470 | if (num_objs - active_objs != free_objects && !error) | 3510 | if (num_objs - active_objs != free_objects && !error) |
3471 | error = "free_objects accounting error"; | 3511 | error = "free_objects accounting error"; |
3472 | 3512 | ||
3473 | name = cachep->name; | 3513 | name = cachep->name; |
3474 | if (error) | 3514 | if (error) |
3475 | printk(KERN_ERR "slab: cache %s error: %s\n", name, error); | 3515 | printk(KERN_ERR "slab: cache %s error: %s\n", name, error); |
3476 | 3516 | ||
3477 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", | 3517 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", |
3478 | name, active_objs, num_objs, cachep->objsize, | 3518 | name, active_objs, num_objs, cachep->objsize, |
3479 | cachep->num, (1<<cachep->gfporder)); | 3519 | cachep->num, (1 << cachep->gfporder)); |
3480 | seq_printf(m, " : tunables %4u %4u %4u", | 3520 | seq_printf(m, " : tunables %4u %4u %4u", |
3481 | cachep->limit, cachep->batchcount, | 3521 | cachep->limit, cachep->batchcount, cachep->shared); |
3482 | cachep->shared); | ||
3483 | seq_printf(m, " : slabdata %6lu %6lu %6lu", | 3522 | seq_printf(m, " : slabdata %6lu %6lu %6lu", |
3484 | active_slabs, num_slabs, shared_avail); | 3523 | active_slabs, num_slabs, shared_avail); |
3485 | #if STATS | 3524 | #if STATS |
3486 | { /* list3 stats */ | 3525 | { /* list3 stats */ |
3487 | unsigned long high = cachep->high_mark; | 3526 | unsigned long high = cachep->high_mark; |
3488 | unsigned long allocs = cachep->num_allocations; | 3527 | unsigned long allocs = cachep->num_allocations; |
3489 | unsigned long grown = cachep->grown; | 3528 | unsigned long grown = cachep->grown; |
@@ -3494,9 +3533,7 @@ static int s_show(struct seq_file *m, void *p) | |||
3494 | unsigned long node_frees = cachep->node_frees; | 3533 | unsigned long node_frees = cachep->node_frees; |
3495 | 3534 | ||
3496 | seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ | 3535 | seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ |
3497 | %4lu %4lu %4lu %4lu", | 3536 | %4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees); |
3498 | allocs, high, grown, reaped, errors, | ||
3499 | max_freeable, node_allocs, node_frees); | ||
3500 | } | 3537 | } |
3501 | /* cpu stats */ | 3538 | /* cpu stats */ |
3502 | { | 3539 | { |
@@ -3506,7 +3543,7 @@ static int s_show(struct seq_file *m, void *p) | |||
3506 | unsigned long freemiss = atomic_read(&cachep->freemiss); | 3543 | unsigned long freemiss = atomic_read(&cachep->freemiss); |
3507 | 3544 | ||
3508 | seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", | 3545 | seq_printf(m, " : cpustat %6lu %6lu %6lu %6lu", |
3509 | allochit, allocmiss, freehit, freemiss); | 3546 | allochit, allocmiss, freehit, freemiss); |
3510 | } | 3547 | } |
3511 | #endif | 3548 | #endif |
3512 | seq_putc(m, '\n'); | 3549 | seq_putc(m, '\n'); |
@@ -3529,10 +3566,10 @@ static int s_show(struct seq_file *m, void *p) | |||
3529 | */ | 3566 | */ |
3530 | 3567 | ||
3531 | struct seq_operations slabinfo_op = { | 3568 | struct seq_operations slabinfo_op = { |
3532 | .start = s_start, | 3569 | .start = s_start, |
3533 | .next = s_next, | 3570 | .next = s_next, |
3534 | .stop = s_stop, | 3571 | .stop = s_stop, |
3535 | .show = s_show, | 3572 | .show = s_show, |
3536 | }; | 3573 | }; |
3537 | 3574 | ||
3538 | #define MAX_SLABINFO_WRITE 128 | 3575 | #define MAX_SLABINFO_WRITE 128 |
@@ -3543,18 +3580,18 @@ struct seq_operations slabinfo_op = { | |||
3543 | * @count: data length | 3580 | * @count: data length |
3544 | * @ppos: unused | 3581 | * @ppos: unused |
3545 | */ | 3582 | */ |
3546 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, | 3583 | ssize_t slabinfo_write(struct file *file, const char __user * buffer, |
3547 | size_t count, loff_t *ppos) | 3584 | size_t count, loff_t *ppos) |
3548 | { | 3585 | { |
3549 | char kbuf[MAX_SLABINFO_WRITE+1], *tmp; | 3586 | char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; |
3550 | int limit, batchcount, shared, res; | 3587 | int limit, batchcount, shared, res; |
3551 | struct list_head *p; | 3588 | struct list_head *p; |
3552 | 3589 | ||
3553 | if (count > MAX_SLABINFO_WRITE) | 3590 | if (count > MAX_SLABINFO_WRITE) |
3554 | return -EINVAL; | 3591 | return -EINVAL; |
3555 | if (copy_from_user(&kbuf, buffer, count)) | 3592 | if (copy_from_user(&kbuf, buffer, count)) |
3556 | return -EFAULT; | 3593 | return -EFAULT; |
3557 | kbuf[MAX_SLABINFO_WRITE] = '\0'; | 3594 | kbuf[MAX_SLABINFO_WRITE] = '\0'; |
3558 | 3595 | ||
3559 | tmp = strchr(kbuf, ' '); | 3596 | tmp = strchr(kbuf, ' '); |
3560 | if (!tmp) | 3597 | if (!tmp) |
@@ -3567,18 +3604,17 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, | |||
3567 | /* Find the cache in the chain of caches. */ | 3604 | /* Find the cache in the chain of caches. */ |
3568 | down(&cache_chain_sem); | 3605 | down(&cache_chain_sem); |
3569 | res = -EINVAL; | 3606 | res = -EINVAL; |
3570 | list_for_each(p,&cache_chain) { | 3607 | list_for_each(p, &cache_chain) { |
3571 | kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); | 3608 | kmem_cache_t *cachep = list_entry(p, kmem_cache_t, next); |
3572 | 3609 | ||
3573 | if (!strcmp(cachep->name, kbuf)) { | 3610 | if (!strcmp(cachep->name, kbuf)) { |
3574 | if (limit < 1 || | 3611 | if (limit < 1 || |
3575 | batchcount < 1 || | 3612 | batchcount < 1 || |
3576 | batchcount > limit || | 3613 | batchcount > limit || shared < 0) { |
3577 | shared < 0) { | ||
3578 | res = 0; | 3614 | res = 0; |
3579 | } else { | 3615 | } else { |
3580 | res = do_tune_cpucache(cachep, limit, | 3616 | res = do_tune_cpucache(cachep, limit, |
3581 | batchcount, shared); | 3617 | batchcount, shared); |
3582 | } | 3618 | } |
3583 | break; | 3619 | break; |
3584 | } | 3620 | } |
@@ -3609,26 +3645,3 @@ unsigned int ksize(const void *objp) | |||
3609 | 3645 | ||
3610 | return obj_reallen(page_get_cache(virt_to_page(objp))); | 3646 | return obj_reallen(page_get_cache(virt_to_page(objp))); |
3611 | } | 3647 | } |
3612 | |||
3613 | |||
3614 | /* | ||
3615 | * kstrdup - allocate space for and copy an existing string | ||
3616 | * | ||
3617 | * @s: the string to duplicate | ||
3618 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory | ||
3619 | */ | ||
3620 | char *kstrdup(const char *s, gfp_t gfp) | ||
3621 | { | ||
3622 | size_t len; | ||
3623 | char *buf; | ||
3624 | |||
3625 | if (!s) | ||
3626 | return NULL; | ||
3627 | |||
3628 | len = strlen(s) + 1; | ||
3629 | buf = kmalloc(len, gfp); | ||
3630 | if (buf) | ||
3631 | memcpy(buf, s, len); | ||
3632 | return buf; | ||
3633 | } | ||
3634 | EXPORT_SYMBOL(kstrdup); | ||
diff --git a/mm/slob.c b/mm/slob.c new file mode 100644 index 000000000000..1c240c4b71d9 --- /dev/null +++ b/mm/slob.c | |||
@@ -0,0 +1,385 @@ | |||
1 | /* | ||
2 | * SLOB Allocator: Simple List Of Blocks | ||
3 | * | ||
4 | * Matt Mackall <mpm@selenic.com> 12/30/03 | ||
5 | * | ||
6 | * How SLOB works: | ||
7 | * | ||
8 | * The core of SLOB is a traditional K&R style heap allocator, with | ||
9 | * support for returning aligned objects. The granularity of this | ||
10 | * allocator is 8 bytes on x86, though it's perhaps possible to reduce | ||
11 | * this to 4 if it's deemed worth the effort. The slob heap is a | ||
12 | * singly-linked list of pages from __get_free_page, grown on demand | ||
13 | * and allocation from the heap is currently first-fit. | ||
14 | * | ||
15 | * Above this is an implementation of kmalloc/kfree. Blocks returned | ||
16 | * from kmalloc are 8-byte aligned and prepended with an 8-byte header. | ||
17 | * If kmalloc is asked for objects of PAGE_SIZE or larger, it calls | ||
18 | * __get_free_pages directly so that it can return page-aligned blocks | ||
19 | * and keeps a linked list of such pages and their orders. These | ||
20 | * objects are detected in kfree() by their page alignment. | ||
21 | * | ||
22 | * SLAB is emulated on top of SLOB by simply calling constructors and | ||
23 | * destructors for every SLAB allocation. Objects are returned with | ||
24 | * the 8-byte alignment unless the SLAB_MUST_HWCACHE_ALIGN flag is | ||
25 | * set, in which case the low-level allocator will fragment blocks to | ||
26 | * create the proper alignment. Again, objects of page-size or greater | ||
27 | * are allocated by calling __get_free_pages. As SLAB objects know | ||
28 | * their size, no separate size bookkeeping is necessary and there is | ||
29 | * essentially no allocation space overhead. | ||
30 | */ | ||
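As the comment above describes, a small kmalloc() request is carved out of a first-fit free list in SLOB_UNIT-sized pieces, with one extra unit prepended as a size header that kfree() and ksize() later read back. A standalone userspace sketch of just that header bookkeeping; the struct mirrors slob_block below, but none of this is kernel code:

#include <stdio.h>
#include <stdlib.h>

/* mirrors struct slob_block: one unit both of granularity and of header */
struct unit {
        int units;
        struct unit *next;
};

#define UNIT            sizeof(struct unit)
#define UNITS(size)     (((size) + UNIT - 1) / UNIT)    /* round up to whole units */

int main(void)
{
        size_t req = 100;       /* an arbitrary small request */

        /* kmalloc() asks the heap for the payload plus one header unit */
        struct unit *block = malloc((UNITS(req) + 1) * UNIT);
        if (!block)
                return 1;
        block->units = UNITS(req) + 1;  /* header records the size in units */

        void *payload = block + 1;      /* what kmalloc() hands back */

        /* ksize() steps back one header and converts the unit count to bytes */
        size_t bytes = ((struct unit *)payload - 1)->units * UNIT;
        printf("requested %zu bytes, header says %zu bytes\n", req, bytes);

        free(block);
        return 0;
}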
31 | |||
32 | #include <linux/config.h> | ||
33 | #include <linux/slab.h> | ||
34 | #include <linux/mm.h> | ||
35 | #include <linux/cache.h> | ||
36 | #include <linux/init.h> | ||
37 | #include <linux/module.h> | ||
38 | #include <linux/timer.h> | ||
39 | |||
40 | struct slob_block { | ||
41 | int units; | ||
42 | struct slob_block *next; | ||
43 | }; | ||
44 | typedef struct slob_block slob_t; | ||
45 | |||
46 | #define SLOB_UNIT sizeof(slob_t) | ||
47 | #define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) | ||
48 | #define SLOB_ALIGN L1_CACHE_BYTES | ||
49 | |||
50 | struct bigblock { | ||
51 | int order; | ||
52 | void *pages; | ||
53 | struct bigblock *next; | ||
54 | }; | ||
55 | typedef struct bigblock bigblock_t; | ||
56 | |||
57 | static slob_t arena = { .next = &arena, .units = 1 }; | ||
58 | static slob_t *slobfree = &arena; | ||
59 | static bigblock_t *bigblocks; | ||
60 | static DEFINE_SPINLOCK(slob_lock); | ||
61 | static DEFINE_SPINLOCK(block_lock); | ||
62 | |||
63 | static void slob_free(void *b, int size); | ||
64 | |||
65 | static void *slob_alloc(size_t size, gfp_t gfp, int align) | ||
66 | { | ||
67 | slob_t *prev, *cur, *aligned = 0; | ||
68 | int delta = 0, units = SLOB_UNITS(size); | ||
69 | unsigned long flags; | ||
70 | |||
71 | spin_lock_irqsave(&slob_lock, flags); | ||
72 | prev = slobfree; | ||
73 | for (cur = prev->next; ; prev = cur, cur = cur->next) { | ||
74 | if (align) { | ||
75 | aligned = (slob_t *)ALIGN((unsigned long)cur, align); | ||
76 | delta = aligned - cur; | ||
77 | } | ||
78 | if (cur->units >= units + delta) { /* room enough? */ | ||
79 | if (delta) { /* need to fragment head to align? */ | ||
80 | aligned->units = cur->units - delta; | ||
81 | aligned->next = cur->next; | ||
82 | cur->next = aligned; | ||
83 | cur->units = delta; | ||
84 | prev = cur; | ||
85 | cur = aligned; | ||
86 | } | ||
87 | |||
88 | if (cur->units == units) /* exact fit? */ | ||
89 | prev->next = cur->next; /* unlink */ | ||
90 | else { /* fragment */ | ||
91 | prev->next = cur + units; | ||
92 | prev->next->units = cur->units - units; | ||
93 | prev->next->next = cur->next; | ||
94 | cur->units = units; | ||
95 | } | ||
96 | |||
97 | slobfree = prev; | ||
98 | spin_unlock_irqrestore(&slob_lock, flags); | ||
99 | return cur; | ||
100 | } | ||
101 | if (cur == slobfree) { | ||
102 | spin_unlock_irqrestore(&slob_lock, flags); | ||
103 | |||
104 | if (size == PAGE_SIZE) /* trying to shrink arena? */ | ||
105 | return 0; | ||
106 | |||
107 | cur = (slob_t *)__get_free_page(gfp); | ||
108 | if (!cur) | ||
109 | return 0; | ||
110 | |||
111 | slob_free(cur, PAGE_SIZE); | ||
112 | spin_lock_irqsave(&slob_lock, flags); | ||
113 | cur = slobfree; | ||
114 | } | ||
115 | } | ||
116 | } | ||
117 | |||
118 | static void slob_free(void *block, int size) | ||
119 | { | ||
120 | slob_t *cur, *b = (slob_t *)block; | ||
121 | unsigned long flags; | ||
122 | |||
123 | if (!block) | ||
124 | return; | ||
125 | |||
126 | if (size) | ||
127 | b->units = SLOB_UNITS(size); | ||
128 | |||
129 | /* Find reinsertion point */ | ||
130 | spin_lock_irqsave(&slob_lock, flags); | ||
131 | for (cur = slobfree; !(b > cur && b < cur->next); cur = cur->next) | ||
132 | if (cur >= cur->next && (b > cur || b < cur->next)) | ||
133 | break; | ||
134 | |||
135 | if (b + b->units == cur->next) { | ||
136 | b->units += cur->next->units; | ||
137 | b->next = cur->next->next; | ||
138 | } else | ||
139 | b->next = cur->next; | ||
140 | |||
141 | if (cur + cur->units == b) { | ||
142 | cur->units += b->units; | ||
143 | cur->next = b->next; | ||
144 | } else | ||
145 | cur->next = b; | ||
146 | |||
147 | slobfree = cur; | ||
148 | |||
149 | spin_unlock_irqrestore(&slob_lock, flags); | ||
150 | } | ||
151 | |||
152 | static int FASTCALL(find_order(int size)); | ||
153 | static int fastcall find_order(int size) | ||
154 | { | ||
155 | int order = 0; | ||
156 | for ( ; size > 4096 ; size >>=1) | ||
157 | order++; | ||
158 | return order; | ||
159 | } | ||
160 | |||
161 | void *kmalloc(size_t size, gfp_t gfp) | ||
162 | { | ||
163 | slob_t *m; | ||
164 | bigblock_t *bb; | ||
165 | unsigned long flags; | ||
166 | |||
167 | if (size < PAGE_SIZE - SLOB_UNIT) { | ||
168 | m = slob_alloc(size + SLOB_UNIT, gfp, 0); | ||
169 | return m ? (void *)(m + 1) : 0; | ||
170 | } | ||
171 | |||
172 | bb = slob_alloc(sizeof(bigblock_t), gfp, 0); | ||
173 | if (!bb) | ||
174 | return 0; | ||
175 | |||
176 | bb->order = find_order(size); | ||
177 | bb->pages = (void *)__get_free_pages(gfp, bb->order); | ||
178 | |||
179 | if (bb->pages) { | ||
180 | spin_lock_irqsave(&block_lock, flags); | ||
181 | bb->next = bigblocks; | ||
182 | bigblocks = bb; | ||
183 | spin_unlock_irqrestore(&block_lock, flags); | ||
184 | return bb->pages; | ||
185 | } | ||
186 | |||
187 | slob_free(bb, sizeof(bigblock_t)); | ||
188 | return 0; | ||
189 | } | ||
190 | |||
191 | EXPORT_SYMBOL(kmalloc); | ||
192 | |||
193 | void kfree(const void *block) | ||
194 | { | ||
195 | bigblock_t *bb, **last = &bigblocks; | ||
196 | unsigned long flags; | ||
197 | |||
198 | if (!block) | ||
199 | return; | ||
200 | |||
201 | if (!((unsigned long)block & (PAGE_SIZE-1))) { | ||
202 | /* might be on the big block list */ | ||
203 | spin_lock_irqsave(&block_lock, flags); | ||
204 | for (bb = bigblocks; bb; last = &bb->next, bb = bb->next) { | ||
205 | if (bb->pages == block) { | ||
206 | *last = bb->next; | ||
207 | spin_unlock_irqrestore(&block_lock, flags); | ||
208 | free_pages((unsigned long)block, bb->order); | ||
209 | slob_free(bb, sizeof(bigblock_t)); | ||
210 | return; | ||
211 | } | ||
212 | } | ||
213 | spin_unlock_irqrestore(&block_lock, flags); | ||
214 | } | ||
215 | |||
216 | slob_free((slob_t *)block - 1, 0); | ||
217 | return; | ||
218 | } | ||
219 | |||
220 | EXPORT_SYMBOL(kfree); | ||
221 | |||
222 | unsigned int ksize(const void *block) | ||
223 | { | ||
224 | bigblock_t *bb; | ||
225 | unsigned long flags; | ||
226 | |||
227 | if (!block) | ||
228 | return 0; | ||
229 | |||
230 | if (!((unsigned long)block & (PAGE_SIZE-1))) { | ||
231 | spin_lock_irqsave(&block_lock, flags); | ||
232 | for (bb = bigblocks; bb; bb = bb->next) | ||
233 | if (bb->pages == block) { | ||
234 | spin_unlock_irqrestore(&block_lock, flags); | ||
235 | return PAGE_SIZE << bb->order; | ||
236 | } | ||
237 | spin_unlock_irqrestore(&block_lock, flags); | ||
238 | } | ||
239 | |||
240 | return ((slob_t *)block - 1)->units * SLOB_UNIT; | ||
241 | } | ||
242 | |||
243 | struct kmem_cache { | ||
244 | unsigned int size, align; | ||
245 | const char *name; | ||
246 | void (*ctor)(void *, struct kmem_cache *, unsigned long); | ||
247 | void (*dtor)(void *, struct kmem_cache *, unsigned long); | ||
248 | }; | ||
249 | |||
250 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | ||
251 | size_t align, unsigned long flags, | ||
252 | void (*ctor)(void*, struct kmem_cache *, unsigned long), | ||
253 | void (*dtor)(void*, struct kmem_cache *, unsigned long)) | ||
254 | { | ||
255 | struct kmem_cache *c; | ||
256 | |||
257 | c = slob_alloc(sizeof(struct kmem_cache), flags, 0); | ||
258 | |||
259 | if (c) { | ||
260 | c->name = name; | ||
261 | c->size = size; | ||
262 | c->ctor = ctor; | ||
263 | c->dtor = dtor; | ||
264 | /* ignore alignment unless it's forced */ | ||
265 | c->align = (flags & SLAB_MUST_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; | ||
266 | if (c->align < align) | ||
267 | c->align = align; | ||
268 | } | ||
269 | |||
270 | return c; | ||
271 | } | ||
272 | EXPORT_SYMBOL(kmem_cache_create); | ||
273 | |||
274 | int kmem_cache_destroy(struct kmem_cache *c) | ||
275 | { | ||
276 | slob_free(c, sizeof(struct kmem_cache)); | ||
277 | return 0; | ||
278 | } | ||
279 | EXPORT_SYMBOL(kmem_cache_destroy); | ||
280 | |||
281 | void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) | ||
282 | { | ||
283 | void *b; | ||
284 | |||
285 | if (c->size < PAGE_SIZE) | ||
286 | b = slob_alloc(c->size, flags, c->align); | ||
287 | else | ||
288 | b = (void *)__get_free_pages(flags, find_order(c->size)); | ||
289 | |||
290 | if (c->ctor) | ||
291 | c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR); | ||
292 | |||
293 | return b; | ||
294 | } | ||
295 | EXPORT_SYMBOL(kmem_cache_alloc); | ||
296 | |||
297 | void kmem_cache_free(struct kmem_cache *c, void *b) | ||
298 | { | ||
299 | if (c->dtor) | ||
300 | c->dtor(b, c, 0); | ||
301 | |||
302 | if (c->size < PAGE_SIZE) | ||
303 | slob_free(b, c->size); | ||
304 | else | ||
305 | free_pages((unsigned long)b, find_order(c->size)); | ||
306 | } | ||
307 | EXPORT_SYMBOL(kmem_cache_free); | ||
308 | |||
309 | unsigned int kmem_cache_size(struct kmem_cache *c) | ||
310 | { | ||
311 | return c->size; | ||
312 | } | ||
313 | EXPORT_SYMBOL(kmem_cache_size); | ||
314 | |||
315 | const char *kmem_cache_name(struct kmem_cache *c) | ||
316 | { | ||
317 | return c->name; | ||
318 | } | ||
319 | EXPORT_SYMBOL(kmem_cache_name); | ||
320 | |||
321 | static struct timer_list slob_timer = TIMER_INITIALIZER( | ||
322 | (void (*)(unsigned long))kmem_cache_init, 0, 0); | ||
323 | |||
324 | void kmem_cache_init(void) | ||
325 | { | ||
326 | void *p = slob_alloc(PAGE_SIZE, 0, PAGE_SIZE-1); | ||
327 | |||
328 | if (p) | ||
329 | free_page((unsigned long)p); | ||
330 | |||
331 | mod_timer(&slob_timer, jiffies + HZ); | ||
332 | } | ||
333 | |||
334 | atomic_t slab_reclaim_pages = ATOMIC_INIT(0); | ||
335 | EXPORT_SYMBOL(slab_reclaim_pages); | ||
336 | |||
337 | #ifdef CONFIG_SMP | ||
338 | |||
339 | void *__alloc_percpu(size_t size, size_t align) | ||
340 | { | ||
341 | int i; | ||
342 | struct percpu_data *pdata = kmalloc(sizeof (*pdata), GFP_KERNEL); | ||
343 | |||
344 | if (!pdata) | ||
345 | return NULL; | ||
346 | |||
347 | for (i = 0; i < NR_CPUS; i++) { | ||
348 | if (!cpu_possible(i)) | ||
349 | continue; | ||
350 | pdata->ptrs[i] = kmalloc(size, GFP_KERNEL); | ||
351 | if (!pdata->ptrs[i]) | ||
352 | goto unwind_oom; | ||
353 | memset(pdata->ptrs[i], 0, size); | ||
354 | } | ||
355 | |||
356 | /* Catch derefs w/o wrappers */ | ||
357 | return (void *) (~(unsigned long) pdata); | ||
358 | |||
359 | unwind_oom: | ||
360 | while (--i >= 0) { | ||
361 | if (!cpu_possible(i)) | ||
362 | continue; | ||
363 | kfree(pdata->ptrs[i]); | ||
364 | } | ||
365 | kfree(pdata); | ||
366 | return NULL; | ||
367 | } | ||
368 | EXPORT_SYMBOL(__alloc_percpu); | ||
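The "~(unsigned long) pdata" returned by __alloc_percpu() above is deliberate pointer mangling: dereferencing the returned cookie directly faults, so callers are forced through the per-cpu accessor, which undoes the complement before indexing ptrs[]. A rough illustration of the accessor side of that trick; the real macro lives in include/linux/percpu.h and may differ in detail:

/* hypothetical demo macro, not the kernel's definition */
#define percpu_ptr_demo(cookie, cpu) \
        (((struct percpu_data *)(~(unsigned long)(cookie)))->ptrs[(cpu)])

/* __alloc_percpu() hands out ~pdata; complementing it again recovers the
 * struct percpu_data pointer, so only wrapped accesses ever see it. */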
369 | |||
370 | void | ||
371 | free_percpu(const void *objp) | ||
372 | { | ||
373 | int i; | ||
374 | struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); | ||
375 | |||
376 | for (i = 0; i < NR_CPUS; i++) { | ||
377 | if (!cpu_possible(i)) | ||
378 | continue; | ||
379 | kfree(p->ptrs[i]); | ||
380 | } | ||
381 | kfree(p); | ||
382 | } | ||
383 | EXPORT_SYMBOL(free_percpu); | ||
384 | |||
385 | #endif | ||
diff --git a/mm/sparse.c b/mm/sparse.c index 72079b538e2d..0a51f36ba3a1 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -18,10 +18,10 @@ | |||
18 | */ | 18 | */ |
19 | #ifdef CONFIG_SPARSEMEM_EXTREME | 19 | #ifdef CONFIG_SPARSEMEM_EXTREME |
20 | struct mem_section *mem_section[NR_SECTION_ROOTS] | 20 | struct mem_section *mem_section[NR_SECTION_ROOTS] |
21 | ____cacheline_maxaligned_in_smp; | 21 | ____cacheline_internodealigned_in_smp; |
22 | #else | 22 | #else |
23 | struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] | 23 | struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT] |
24 | ____cacheline_maxaligned_in_smp; | 24 | ____cacheline_internodealigned_in_smp; |
25 | #endif | 25 | #endif |
26 | EXPORT_SYMBOL(mem_section); | 26 | EXPORT_SYMBOL(mem_section); |
27 | 27 | ||
diff --git a/mm/swap_state.c b/mm/swap_state.c index fc2aecb70a95..7b09ac503fec 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c | |||
@@ -141,7 +141,7 @@ void __delete_from_swap_cache(struct page *page) | |||
141 | * Allocate swap space for the page and add the page to the | 141 | * Allocate swap space for the page and add the page to the |
142 | * swap cache. Caller needs to hold the page lock. | 142 | * swap cache. Caller needs to hold the page lock. |
143 | */ | 143 | */ |
144 | int add_to_swap(struct page * page) | 144 | int add_to_swap(struct page * page, gfp_t gfp_mask) |
145 | { | 145 | { |
146 | swp_entry_t entry; | 146 | swp_entry_t entry; |
147 | int err; | 147 | int err; |
@@ -166,7 +166,7 @@ int add_to_swap(struct page * page) | |||
166 | * Add it to the swap cache and mark it dirty | 166 | * Add it to the swap cache and mark it dirty |
167 | */ | 167 | */ |
168 | err = __add_to_swap_cache(page, entry, | 168 | err = __add_to_swap_cache(page, entry, |
169 | GFP_ATOMIC|__GFP_NOMEMALLOC|__GFP_NOWARN); | 169 | gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); |
170 | 170 | ||
171 | switch (err) { | 171 | switch (err) { |
172 | case 0: /* Success */ | 172 | case 0: /* Success */ |
diff --git a/mm/swapfile.c b/mm/swapfile.c index 6da4b28b896b..80f948a2028b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
@@ -1493,7 +1493,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1493 | goto bad_swap; | 1493 | goto bad_swap; |
1494 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) | 1494 | if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) |
1495 | goto bad_swap; | 1495 | goto bad_swap; |
1496 | 1496 | ||
1497 | /* OK, set up the swap map and apply the bad block list */ | 1497 | /* OK, set up the swap map and apply the bad block list */ |
1498 | if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { | 1498 | if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { |
1499 | error = -ENOMEM; | 1499 | error = -ENOMEM; |
@@ -1502,17 +1502,17 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) | |||
1502 | 1502 | ||
1503 | error = 0; | 1503 | error = 0; |
1504 | memset(p->swap_map, 0, maxpages * sizeof(short)); | 1504 | memset(p->swap_map, 0, maxpages * sizeof(short)); |
1505 | for (i=0; i<swap_header->info.nr_badpages; i++) { | 1505 | for (i = 0; i < swap_header->info.nr_badpages; i++) { |
1506 | int page = swap_header->info.badpages[i]; | 1506 | int page_nr = swap_header->info.badpages[i]; |
1507 | if (page <= 0 || page >= swap_header->info.last_page) | 1507 | if (page_nr <= 0 || page_nr >= swap_header->info.last_page) |
1508 | error = -EINVAL; | 1508 | error = -EINVAL; |
1509 | else | 1509 | else |
1510 | p->swap_map[page] = SWAP_MAP_BAD; | 1510 | p->swap_map[page_nr] = SWAP_MAP_BAD; |
1511 | } | 1511 | } |
1512 | nr_good_pages = swap_header->info.last_page - | 1512 | nr_good_pages = swap_header->info.last_page - |
1513 | swap_header->info.nr_badpages - | 1513 | swap_header->info.nr_badpages - |
1514 | 1 /* header page */; | 1514 | 1 /* header page */; |
1515 | if (error) | 1515 | if (error) |
1516 | goto bad_swap; | 1516 | goto bad_swap; |
1517 | } | 1517 | } |
1518 | 1518 | ||
diff --git a/mm/truncate.c b/mm/truncate.c index 7dee32745901..b1a463d0fe71 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -249,7 +249,6 @@ unlock: | |||
249 | break; | 249 | break; |
250 | } | 250 | } |
251 | pagevec_release(&pvec); | 251 | pagevec_release(&pvec); |
252 | cond_resched(); | ||
253 | } | 252 | } |
254 | return ret; | 253 | return ret; |
255 | } | 254 | } |
diff --git a/mm/util.c b/mm/util.c new file mode 100644 index 000000000000..5f4bb59da63c --- /dev/null +++ b/mm/util.c | |||
@@ -0,0 +1,39 @@ | |||
1 | #include <linux/slab.h> | ||
2 | #include <linux/string.h> | ||
3 | #include <linux/module.h> | ||
4 | |||
5 | /** | ||
6 | * kzalloc - allocate memory. The memory is set to zero. | ||
7 | * @size: how many bytes of memory are required. | ||
8 | * @flags: the type of memory to allocate. | ||
9 | */ | ||
10 | void *kzalloc(size_t size, gfp_t flags) | ||
11 | { | ||
12 | void *ret = kmalloc(size, flags); | ||
13 | if (ret) | ||
14 | memset(ret, 0, size); | ||
15 | return ret; | ||
16 | } | ||
17 | EXPORT_SYMBOL(kzalloc); | ||
18 | |||
19 | /* | ||
20 | * kstrdup - allocate space for and copy an existing string | ||
21 | * | ||
22 | * @s: the string to duplicate | ||
23 | * @gfp: the GFP mask used in the kmalloc() call when allocating memory | ||
24 | */ | ||
25 | char *kstrdup(const char *s, gfp_t gfp) | ||
26 | { | ||
27 | size_t len; | ||
28 | char *buf; | ||
29 | |||
30 | if (!s) | ||
31 | return NULL; | ||
32 | |||
33 | len = strlen(s) + 1; | ||
34 | buf = kmalloc(len, gfp); | ||
35 | if (buf) | ||
36 | memcpy(buf, s, len); | ||
37 | return buf; | ||
38 | } | ||
39 | EXPORT_SYMBOL(kstrdup); | ||
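kzalloc() and kstrdup() are thin wrappers around kmalloc(); a typical in-kernel usage pattern looks roughly like the sketch below, where the structure and function names are made up purely for illustration:

/* illustrative only: a made-up object built with the helpers above */
struct demo_ctx {
        char *name;
        int count;
};

static struct demo_ctx *demo_ctx_create(const char *name)
{
        struct demo_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

        if (!ctx)
                return NULL;
        ctx->name = kstrdup(name, GFP_KERNEL);  /* returns NULL for a NULL input */
        if (name && !ctx->name) {
                kfree(ctx);
                return NULL;
        }
        return ctx;     /* remaining fields already zeroed by kzalloc() */
}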
diff --git a/mm/vmscan.c b/mm/vmscan.c index be8235fb1939..bf903b2d198f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -180,8 +180,7 @@ EXPORT_SYMBOL(remove_shrinker); | |||
180 | * | 180 | * |
181 | * Returns the number of slab objects which we shrunk. | 181 | * Returns the number of slab objects which we shrunk. |
182 | */ | 182 | */ |
183 | static int shrink_slab(unsigned long scanned, gfp_t gfp_mask, | 183 | int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) |
184 | unsigned long lru_pages) | ||
185 | { | 184 | { |
186 | struct shrinker *shrinker; | 185 | struct shrinker *shrinker; |
187 | int ret = 0; | 186 | int ret = 0; |
@@ -269,9 +268,7 @@ static inline int is_page_cache_freeable(struct page *page) | |||
269 | 268 | ||
270 | static int may_write_to_queue(struct backing_dev_info *bdi) | 269 | static int may_write_to_queue(struct backing_dev_info *bdi) |
271 | { | 270 | { |
272 | if (current_is_kswapd()) | 271 | if (current->flags & PF_SWAPWRITE) |
273 | return 1; | ||
274 | if (current_is_pdflush()) /* This is unlikely, but why not... */ | ||
275 | return 1; | 272 | return 1; |
276 | if (!bdi_write_congested(bdi)) | 273 | if (!bdi_write_congested(bdi)) |
277 | return 1; | 274 | return 1; |
@@ -376,6 +373,43 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) | |||
376 | return PAGE_CLEAN; | 373 | return PAGE_CLEAN; |
377 | } | 374 | } |
378 | 375 | ||
376 | static int remove_mapping(struct address_space *mapping, struct page *page) | ||
377 | { | ||
378 | if (!mapping) | ||
379 | return 0; /* truncate got there first */ | ||
380 | |||
381 | write_lock_irq(&mapping->tree_lock); | ||
382 | |||
383 | /* | ||
384 | * The non-racy check for busy page. It is critical to check | ||
385 | * PageDirty _after_ making sure that the page is freeable and | ||
386 | * not in use by anybody. (pagecache + us == 2) | ||
387 | */ | ||
388 | if (unlikely(page_count(page) != 2)) | ||
389 | goto cannot_free; | ||
390 | smp_rmb(); | ||
391 | if (unlikely(PageDirty(page))) | ||
392 | goto cannot_free; | ||
393 | |||
394 | if (PageSwapCache(page)) { | ||
395 | swp_entry_t swap = { .val = page_private(page) }; | ||
396 | __delete_from_swap_cache(page); | ||
397 | write_unlock_irq(&mapping->tree_lock); | ||
398 | swap_free(swap); | ||
399 | __put_page(page); /* The pagecache ref */ | ||
400 | return 1; | ||
401 | } | ||
402 | |||
403 | __remove_from_page_cache(page); | ||
404 | write_unlock_irq(&mapping->tree_lock); | ||
405 | __put_page(page); | ||
406 | return 1; | ||
407 | |||
408 | cannot_free: | ||
409 | write_unlock_irq(&mapping->tree_lock); | ||
410 | return 0; | ||
411 | } | ||
412 | |||
379 | /* | 413 | /* |
380 | * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed | 414 | * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed |
381 | */ | 415 | */ |
@@ -424,7 +458,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
424 | * Try to allocate it some swap space here. | 458 | * Try to allocate it some swap space here. |
425 | */ | 459 | */ |
426 | if (PageAnon(page) && !PageSwapCache(page)) { | 460 | if (PageAnon(page) && !PageSwapCache(page)) { |
427 | if (!add_to_swap(page)) | 461 | if (!add_to_swap(page, GFP_ATOMIC)) |
428 | goto activate_locked; | 462 | goto activate_locked; |
429 | } | 463 | } |
430 | #endif /* CONFIG_SWAP */ | 464 | #endif /* CONFIG_SWAP */ |
@@ -507,36 +541,8 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) | |||
507 | goto free_it; | 541 | goto free_it; |
508 | } | 542 | } |
509 | 543 | ||
510 | if (!mapping) | 544 | if (!remove_mapping(mapping, page)) |
511 | goto keep_locked; /* truncate got there first */ | 545 | goto keep_locked; |
512 | |||
513 | write_lock_irq(&mapping->tree_lock); | ||
514 | |||
515 | /* | ||
516 | * The non-racy check for busy page. It is critical to check | ||
517 | * PageDirty _after_ making sure that the page is freeable and | ||
518 | * not in use by anybody. (pagecache + us == 2) | ||
519 | */ | ||
520 | if (unlikely(page_count(page) != 2)) | ||
521 | goto cannot_free; | ||
522 | smp_rmb(); | ||
523 | if (unlikely(PageDirty(page))) | ||
524 | goto cannot_free; | ||
525 | |||
526 | #ifdef CONFIG_SWAP | ||
527 | if (PageSwapCache(page)) { | ||
528 | swp_entry_t swap = { .val = page_private(page) }; | ||
529 | __delete_from_swap_cache(page); | ||
530 | write_unlock_irq(&mapping->tree_lock); | ||
531 | swap_free(swap); | ||
532 | __put_page(page); /* The pagecache ref */ | ||
533 | goto free_it; | ||
534 | } | ||
535 | #endif /* CONFIG_SWAP */ | ||
536 | |||
537 | __remove_from_page_cache(page); | ||
538 | write_unlock_irq(&mapping->tree_lock); | ||
539 | __put_page(page); | ||
540 | 546 | ||
541 | free_it: | 547 | free_it: |
542 | unlock_page(page); | 548 | unlock_page(page); |
@@ -545,10 +551,6 @@ free_it: | |||
545 | __pagevec_release_nonlru(&freed_pvec); | 551 | __pagevec_release_nonlru(&freed_pvec); |
546 | continue; | 552 | continue; |
547 | 553 | ||
548 | cannot_free: | ||
549 | write_unlock_irq(&mapping->tree_lock); | ||
550 | goto keep_locked; | ||
551 | |||
552 | activate_locked: | 554 | activate_locked: |
553 | SetPageActive(page); | 555 | SetPageActive(page); |
554 | pgactivate++; | 556 | pgactivate++; |
@@ -566,6 +568,241 @@ keep: | |||
566 | return reclaimed; | 568 | return reclaimed; |
567 | } | 569 | } |
568 | 570 | ||
571 | #ifdef CONFIG_MIGRATION | ||
572 | static inline void move_to_lru(struct page *page) | ||
573 | { | ||
574 | list_del(&page->lru); | ||
575 | if (PageActive(page)) { | ||
576 | /* | ||
577 | * lru_cache_add_active checks that | ||
578 | * the PG_active bit is off. | ||
579 | */ | ||
580 | ClearPageActive(page); | ||
581 | lru_cache_add_active(page); | ||
582 | } else { | ||
583 | lru_cache_add(page); | ||
584 | } | ||
585 | put_page(page); | ||
586 | } | ||
587 | |||
588 | /* | ||
589 | * Add isolated pages on the list back to the LRU | ||
590 | * | ||
591 | * returns the number of pages put back. | ||
592 | */ | ||
593 | int putback_lru_pages(struct list_head *l) | ||
594 | { | ||
595 | struct page *page; | ||
596 | struct page *page2; | ||
597 | int count = 0; | ||
598 | |||
599 | list_for_each_entry_safe(page, page2, l, lru) { | ||
600 | move_to_lru(page); | ||
601 | count++; | ||
602 | } | ||
603 | return count; | ||
604 | } | ||
605 | |||
606 | /* | ||
607 | * swapout a single page | ||
608 | * page is locked upon entry, unlocked on exit | ||
609 | */ | ||
610 | static int swap_page(struct page *page) | ||
611 | { | ||
612 | struct address_space *mapping = page_mapping(page); | ||
613 | |||
614 | if (page_mapped(page) && mapping) | ||
615 | if (try_to_unmap(page) != SWAP_SUCCESS) | ||
616 | goto unlock_retry; | ||
617 | |||
618 | if (PageDirty(page)) { | ||
619 | /* Page is dirty, try to write it out here */ | ||
620 | switch(pageout(page, mapping)) { | ||
621 | case PAGE_KEEP: | ||
622 | case PAGE_ACTIVATE: | ||
623 | goto unlock_retry; | ||
624 | |||
625 | case PAGE_SUCCESS: | ||
626 | goto retry; | ||
627 | |||
628 | case PAGE_CLEAN: | ||
629 | ; /* try to free the page below */ | ||
630 | } | ||
631 | } | ||
632 | |||
633 | if (PagePrivate(page)) { | ||
634 | if (!try_to_release_page(page, GFP_KERNEL) || | ||
635 | (!mapping && page_count(page) == 1)) | ||
636 | goto unlock_retry; | ||
637 | } | ||
638 | |||
639 | if (remove_mapping(mapping, page)) { | ||
640 | /* Success */ | ||
641 | unlock_page(page); | ||
642 | return 0; | ||
643 | } | ||
644 | |||
645 | unlock_retry: | ||
646 | unlock_page(page); | ||
647 | |||
648 | retry: | ||
649 | return -EAGAIN; | ||
650 | } | ||
651 | /* | ||
652 | * migrate_pages | ||
653 | * | ||
654 | * Two lists are passed to this function. The first list | ||
655 | * contains the pages isolated from the LRU to be migrated. | ||
656 | * The second list contains new pages to which the isolated | ||
657 | * pages can be moved. If the second list is NULL then all | ||
658 | * pages are swapped out. | ||
659 | * | ||
660 | * The function returns after 10 attempts or if no pages | ||
661 | * are movable anymore because the "to" list has become empty | ||
662 | * or no retryable pages exist anymore. | ||
663 | * | ||
664 | * SIMPLIFIED VERSION: This implementation of migrate_pages | ||
665 | * only swaps out pages and never touches the second | ||
666 | * list. The direct migration patchset | ||
667 | * extends this function to avoid the use of swap. | ||
668 | * | ||
669 | * Return: Number of pages not migrated when "to" ran empty. | ||
670 | */ | ||
671 | int migrate_pages(struct list_head *from, struct list_head *to, | ||
672 | struct list_head *moved, struct list_head *failed) | ||
673 | { | ||
674 | int retry; | ||
675 | int nr_failed = 0; | ||
676 | int pass = 0; | ||
677 | struct page *page; | ||
678 | struct page *page2; | ||
679 | int swapwrite = current->flags & PF_SWAPWRITE; | ||
680 | int rc; | ||
681 | |||
682 | if (!swapwrite) | ||
683 | current->flags |= PF_SWAPWRITE; | ||
684 | |||
685 | redo: | ||
686 | retry = 0; | ||
687 | |||
688 | list_for_each_entry_safe(page, page2, from, lru) { | ||
689 | cond_resched(); | ||
690 | |||
691 | rc = 0; | ||
692 | if (page_count(page) == 1) | ||
693 | /* page was freed from under us. So we are done. */ | ||
694 | goto next; | ||
695 | |||
696 | /* | ||
697 | * Skip locked pages during the first two passes to give the | ||
698 | * functions holding the lock time to release the page. Later we | ||
699 | * use lock_page() to have a higher chance of acquiring the | ||
700 | * lock. | ||
701 | */ | ||
702 | rc = -EAGAIN; | ||
703 | if (pass > 2) | ||
704 | lock_page(page); | ||
705 | else | ||
706 | if (TestSetPageLocked(page)) | ||
707 | goto next; | ||
708 | |||
709 | /* | ||
710 | * Only wait on writeback if we have already done a pass where | ||
711 | * we may have triggered writeouts for lots of pages. | ||
712 | */ | ||
713 | if (pass > 0) { | ||
714 | wait_on_page_writeback(page); | ||
715 | } else { | ||
716 | if (PageWriteback(page)) | ||
717 | goto unlock_page; | ||
718 | } | ||
719 | |||
720 | /* | ||
721 | * Anonymous pages must have swap cache references otherwise | ||
722 | * the information contained in the page maps cannot be | ||
723 | * preserved. | ||
724 | */ | ||
725 | if (PageAnon(page) && !PageSwapCache(page)) { | ||
726 | if (!add_to_swap(page, GFP_KERNEL)) { | ||
727 | rc = -ENOMEM; | ||
728 | goto unlock_page; | ||
729 | } | ||
730 | } | ||
731 | |||
732 | /* | ||
733 | * Page is properly locked and writeback is complete. | ||
734 | * Try to migrate the page. | ||
735 | */ | ||
736 | rc = swap_page(page); | ||
737 | goto next; | ||
738 | |||
739 | unlock_page: | ||
740 | unlock_page(page); | ||
741 | |||
742 | next: | ||
743 | if (rc == -EAGAIN) { | ||
744 | retry++; | ||
745 | } else if (rc) { | ||
746 | /* Permanent failure */ | ||
747 | list_move(&page->lru, failed); | ||
748 | nr_failed++; | ||
749 | } else { | ||
750 | /* Success */ | ||
751 | list_move(&page->lru, moved); | ||
752 | } | ||
753 | } | ||
754 | if (retry && pass++ < 10) | ||
755 | goto redo; | ||
756 | |||
757 | if (!swapwrite) | ||
758 | current->flags &= ~PF_SWAPWRITE; | ||
759 | |||
760 | return nr_failed + retry; | ||
761 | } | ||
762 | |||
763 | static void lru_add_drain_per_cpu(void *dummy) | ||
764 | { | ||
765 | lru_add_drain(); | ||
766 | } | ||
767 | |||
768 | /* | ||
769 | * Isolate one page from the LRU lists and put it on the | ||
770 | * indicated list. Do necessary cache draining if the | ||
771 | * page is not on the LRU lists yet. | ||
772 | * | ||
773 | * Result: | ||
774 | * 0 = page not on LRU list | ||
775 | * 1 = page removed from LRU list and added to the specified list. | ||
776 | * -ENOENT = page is being freed elsewhere. | ||
777 | */ | ||
778 | int isolate_lru_page(struct page *page) | ||
779 | { | ||
780 | int rc = 0; | ||
781 | struct zone *zone = page_zone(page); | ||
782 | |||
783 | redo: | ||
784 | spin_lock_irq(&zone->lru_lock); | ||
785 | rc = __isolate_lru_page(page); | ||
786 | if (rc == 1) { | ||
787 | if (PageActive(page)) | ||
788 | del_page_from_active_list(zone, page); | ||
789 | else | ||
790 | del_page_from_inactive_list(zone, page); | ||
791 | } | ||
792 | spin_unlock_irq(&zone->lru_lock); | ||
793 | if (rc == 0) { | ||
794 | /* | ||
795 | * Maybe this page is still waiting for a cpu to drain it | ||
796 | * from one of the lru lists? | ||
797 | */ | ||
798 | rc = schedule_on_each_cpu(lru_add_drain_per_cpu, NULL); | ||
799 | if (rc == 0 && PageLRU(page)) | ||
800 | goto redo; | ||
801 | } | ||
802 | return rc; | ||
803 | } | ||
804 | #endif | ||
805 | |||
569 | /* | 806 | /* |
570 | * zone->lru_lock is heavily contended. Some of the functions that | 807 | * zone->lru_lock is heavily contended. Some of the functions that |
571 | * shrink the lists perform better by taking out a batch of pages | 808 | * shrink the lists perform better by taking out a batch of pages |
@@ -594,20 +831,18 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, | |||
594 | page = lru_to_page(src); | 831 | page = lru_to_page(src); |
595 | prefetchw_prev_lru_page(page, src, flags); | 832 | prefetchw_prev_lru_page(page, src, flags); |
596 | 833 | ||
597 | if (!TestClearPageLRU(page)) | 834 | switch (__isolate_lru_page(page)) { |
598 | BUG(); | 835 | case 1: |
599 | list_del(&page->lru); | 836 | /* Succeeded to isolate page */ |
600 | if (get_page_testone(page)) { | 837 | list_move(&page->lru, dst); |
601 | /* | ||
602 | * It is being freed elsewhere | ||
603 | */ | ||
604 | __put_page(page); | ||
605 | SetPageLRU(page); | ||
606 | list_add(&page->lru, src); | ||
607 | continue; | ||
608 | } else { | ||
609 | list_add(&page->lru, dst); | ||
610 | nr_taken++; | 838 | nr_taken++; |
839 | break; | ||
840 | case -ENOENT: | ||
841 | /* Not possible to isolate */ | ||
842 | list_move(&page->lru, src); | ||
843 | break; | ||
844 | default: | ||
845 | BUG(); | ||
611 | } | 846 | } |
612 | } | 847 | } |
613 | 848 | ||
@@ -1226,7 +1461,7 @@ static int kswapd(void *p) | |||
1226 | * us from recursively trying to free more memory as we're | 1461 | * us from recursively trying to free more memory as we're |
1227 | * trying to free the first piece of memory in the first place). | 1462 | * trying to free the first piece of memory in the first place). |
1228 | */ | 1463 | */ |
1229 | tsk->flags |= PF_MEMALLOC|PF_KSWAPD; | 1464 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; |
1230 | 1465 | ||
1231 | order = 0; | 1466 | order = 0; |
1232 | for ( ; ; ) { | 1467 | for ( ; ; ) { |