Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |    4
-rw-r--r--  mm/bootmem.c         |    2
-rw-r--r--  mm/fadvise.c         |   46
-rw-r--r--  mm/filemap.c         |   41
-rw-r--r--  mm/memory.c          |    8
-rw-r--r--  mm/mempolicy.c       |   32
-rw-r--r--  mm/mmap.c            |    6
-rw-r--r--  mm/msync.c           |  139
-rw-r--r--  mm/page-writeback.c  |   64
-rw-r--r--  mm/page_alloc.c      |   11
-rw-r--r--  mm/slab.c            |  351
-rw-r--r--  mm/slob.c            |   10
-rw-r--r--  mm/util.c            |   47
-rw-r--r--  mm/vmscan.c          |    2
14 files changed, 595 insertions, 168 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index bd80460360db..332f5c29b53a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -138,8 +138,8 @@ config SPLIT_PTLOCK_CPUS
138# 138#
139config MIGRATION 139config MIGRATION
140 bool "Page migration" 140 bool "Page migration"
141 def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM 141 def_bool y if NUMA
142 depends on SWAP 142 depends on SWAP && NUMA
143 help 143 help
144 Allows the migration of the physical location of pages of processes 144 Allows the migration of the physical location of pages of processes
145 while the virtual addresses are not changed. This is useful for 145 while the virtual addresses are not changed. This is useful for
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 35c32290f717..b55bd39fc5dd 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -152,7 +152,7 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
152 * 152 *
153 * NOTE: This function is _not_ reentrant. 153 * NOTE: This function is _not_ reentrant.
154 */ 154 */
155static void * __init 155void * __init
156__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, 156__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
157 unsigned long align, unsigned long goal, unsigned long limit) 157 unsigned long align, unsigned long goal, unsigned long limit)
158{ 158{
diff --git a/mm/fadvise.c b/mm/fadvise.c
index d257c89e7704..907c39257ca0 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -15,6 +15,7 @@
15#include <linux/backing-dev.h> 15#include <linux/backing-dev.h>
16#include <linux/pagevec.h> 16#include <linux/pagevec.h>
17#include <linux/fadvise.h> 17#include <linux/fadvise.h>
18#include <linux/writeback.h>
18#include <linux/syscalls.h> 19#include <linux/syscalls.h>
19 20
20#include <asm/unistd.h> 21#include <asm/unistd.h>
@@ -22,13 +23,36 @@
22/* 23/*
23 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could 24 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
24 * deactivate the pages and clear PG_Referenced. 25 * deactivate the pages and clear PG_Referenced.
26 *
27 * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
28 * offsets `offset' and `offset+len' inclusive. Any pages which are currently
29 * under writeout are skipped, whether or not they are dirty.
30 *
31 * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file
32 * offsets `offset' and `offset+len'.
33 *
34 * By combining these two operations the application may do several things:
35 *
36 * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk.
37 *
38 * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently
39 * dirty pages at the disk.
40 *
41 * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push
42 * all of the currently dirty pages at the disk, wait until they have been
43 * written.
44 *
45 * It should be noted that none of these operations write out the file's
46 * metadata. So unless the application is strictly performing overwrites of
47 * already-instantiated disk blocks, there are no guarantees here that the data
48 * will be available after a crash.
25 */ 49 */
26asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) 50asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
27{ 51{
28 struct file *file = fget(fd); 52 struct file *file = fget(fd);
29 struct address_space *mapping; 53 struct address_space *mapping;
30 struct backing_dev_info *bdi; 54 struct backing_dev_info *bdi;
31 loff_t endbyte; 55 loff_t endbyte; /* inclusive */
32 pgoff_t start_index; 56 pgoff_t start_index;
33 pgoff_t end_index; 57 pgoff_t end_index;
34 unsigned long nrpages; 58 unsigned long nrpages;
@@ -56,6 +80,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
56 endbyte = offset + len; 80 endbyte = offset + len;
57 if (!len || endbyte < len) 81 if (!len || endbyte < len)
58 endbyte = -1; 82 endbyte = -1;
83 else
84 endbyte--; /* inclusive */
59 85
60 bdi = mapping->backing_dev_info; 86 bdi = mapping->backing_dev_info;
61 87
@@ -78,7 +104,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
78 104
79 /* First and last PARTIAL page! */ 105 /* First and last PARTIAL page! */
80 start_index = offset >> PAGE_CACHE_SHIFT; 106 start_index = offset >> PAGE_CACHE_SHIFT;
81 end_index = (endbyte-1) >> PAGE_CACHE_SHIFT; 107 end_index = endbyte >> PAGE_CACHE_SHIFT;
82 108
83 /* Careful about overflow on the "+1" */ 109 /* Careful about overflow on the "+1" */
84 nrpages = end_index - start_index + 1; 110 nrpages = end_index - start_index + 1;
@@ -96,11 +122,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
96 filemap_flush(mapping); 122 filemap_flush(mapping);
97 123
98 /* First and last FULL page! */ 124 /* First and last FULL page! */
99 start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; 125 start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
100 end_index = (endbyte >> PAGE_CACHE_SHIFT); 126 end_index = (endbyte >> PAGE_CACHE_SHIFT);
101 127
102 if (end_index > start_index) 128 if (end_index >= start_index)
103 invalidate_mapping_pages(mapping, start_index, end_index-1); 129 invalidate_mapping_pages(mapping, start_index,
130 end_index);
131 break;
132 case LINUX_FADV_ASYNC_WRITE:
133 ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
134 WB_SYNC_NONE);
135 break;
136 case LINUX_FADV_WRITE_WAIT:
137 ret = wait_on_page_writeback_range(mapping,
138 offset >> PAGE_CACHE_SHIFT,
139 endbyte >> PAGE_CACHE_SHIFT);
104 break; 140 break;
105 default: 141 default:
106 ret = -EINVAL; 142 ret = -EINVAL;
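The comment block added at the top of this file documents the flush semantics of the two new advice values. A minimal userspace sketch of the "push all currently dirty pages and wait" sequence it describes could look like the following. This is an illustration only: the helper name flush_file_range() is made up, it assumes the LINUX_FADV_* constants are visible to userspace through <linux/fadvise.h>, and it assumes the raw fadvise64_64 syscall is reachable via syscall(2) with a flat 64-bit argument layout, which in reality varies by architecture.

    #define _GNU_SOURCE
    #include <sys/types.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/fadvise.h>

    /*
     * Write out everything that is currently dirty in [offset, offset+len)
     * and wait for it.  Per the comment above, file metadata is NOT written,
     * so this is only crash-safe for overwrites of already-allocated blocks.
     */
    static int flush_file_range(int fd, loff_t offset, loff_t len)
    {
            /* Wait for any writeback that is already in flight. */
            if (syscall(__NR_fadvise64_64, fd, offset, len, LINUX_FADV_WRITE_WAIT))
                    return -1;
            /* Start writeback of everything that is dirty right now... */
            if (syscall(__NR_fadvise64_64, fd, offset, len, LINUX_FADV_ASYNC_WRITE))
                    return -1;
            /* ...and wait for that writeback to complete. */
            return syscall(__NR_fadvise64_64, fd, offset, len, LINUX_FADV_WRITE_WAIT);
    }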
diff --git a/mm/filemap.c b/mm/filemap.c
index e8f58f7dd7a5..3ef20739e725 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,6 +29,7 @@
29#include <linux/blkdev.h> 29#include <linux/blkdev.h>
30#include <linux/security.h> 30#include <linux/security.h>
31#include <linux/syscalls.h> 31#include <linux/syscalls.h>
32#include <linux/cpuset.h>
32#include "filemap.h" 33#include "filemap.h"
33#include "internal.h" 34#include "internal.h"
34 35
@@ -174,7 +175,7 @@ static int sync_page(void *word)
174 * dirty pages that lie within the byte offsets <start, end> 175 * dirty pages that lie within the byte offsets <start, end>
175 * @mapping: address space structure to write 176 * @mapping: address space structure to write
176 * @start: offset in bytes where the range starts 177 * @start: offset in bytes where the range starts
177 * @end: offset in bytes where the range ends 178 * @end: offset in bytes where the range ends (inclusive)
178 * @sync_mode: enable synchronous operation 179 * @sync_mode: enable synchronous operation
179 * 180 *
180 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as 181 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
@@ -182,8 +183,8 @@ static int sync_page(void *word)
182 * these two operations is that if a dirty page/buffer is encountered, it must 183 * these two operations is that if a dirty page/buffer is encountered, it must
183 * be waited upon, and not just skipped over. 184 * be waited upon, and not just skipped over.
184 */ 185 */
185static int __filemap_fdatawrite_range(struct address_space *mapping, 186int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
186 loff_t start, loff_t end, int sync_mode) 187 loff_t end, int sync_mode)
187{ 188{
188 int ret; 189 int ret;
189 struct writeback_control wbc = { 190 struct writeback_control wbc = {
@@ -212,8 +213,8 @@ int filemap_fdatawrite(struct address_space *mapping)
212} 213}
213EXPORT_SYMBOL(filemap_fdatawrite); 214EXPORT_SYMBOL(filemap_fdatawrite);
214 215
215static int filemap_fdatawrite_range(struct address_space *mapping, 216static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
216 loff_t start, loff_t end) 217 loff_t end)
217{ 218{
218 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); 219 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
219} 220}
@@ -232,7 +233,7 @@ EXPORT_SYMBOL(filemap_flush);
232 * Wait for writeback to complete against pages indexed by start->end 233 * Wait for writeback to complete against pages indexed by start->end
233 * inclusive 234 * inclusive
234 */ 235 */
235static int wait_on_page_writeback_range(struct address_space *mapping, 236int wait_on_page_writeback_range(struct address_space *mapping,
236 pgoff_t start, pgoff_t end) 237 pgoff_t start, pgoff_t end)
237{ 238{
238 struct pagevec pvec; 239 struct pagevec pvec;
@@ -367,6 +368,12 @@ int filemap_write_and_wait(struct address_space *mapping)
367} 368}
368EXPORT_SYMBOL(filemap_write_and_wait); 369EXPORT_SYMBOL(filemap_write_and_wait);
369 370
371/*
372 * Write out and wait upon file offsets lstart->lend, inclusive.
373 *
374 * Note that `lend' is inclusive (describes the last byte to be written) so
375 * that this function can be used to write to the very end-of-file (end = -1).
376 */
370int filemap_write_and_wait_range(struct address_space *mapping, 377int filemap_write_and_wait_range(struct address_space *mapping,
371 loff_t lstart, loff_t lend) 378 loff_t lstart, loff_t lend)
372{ 379{
@@ -427,6 +434,28 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
427 return ret; 434 return ret;
428} 435}
429 436
437#ifdef CONFIG_NUMA
438struct page *page_cache_alloc(struct address_space *x)
439{
440 if (cpuset_do_page_mem_spread()) {
441 int n = cpuset_mem_spread_node();
442 return alloc_pages_node(n, mapping_gfp_mask(x), 0);
443 }
444 return alloc_pages(mapping_gfp_mask(x), 0);
445}
446EXPORT_SYMBOL(page_cache_alloc);
447
448struct page *page_cache_alloc_cold(struct address_space *x)
449{
450 if (cpuset_do_page_mem_spread()) {
451 int n = cpuset_mem_spread_node();
452 return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0);
453 }
454 return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
455}
456EXPORT_SYMBOL(page_cache_alloc_cold);
457#endif
458
430/* 459/*
431 * In order to wait for pages to become available there must be 460 * In order to wait for pages to become available there must be
432 * waitqueues associated with pages. By using a hash table of 461 * waitqueues associated with pages. By using a hash table of
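The two allocators above are only compiled for CONFIG_NUMA; their declarations and the !NUMA variants live in include/linux/pagemap.h, which is outside this mm/-only diff. Presumably the non-NUMA case stays a trivial inline, since there is no spread-node decision to make. A sketch of that fallback, mirroring the non-spread branches shown above (not the actual header hunk):

    #ifndef CONFIG_NUMA
    static inline struct page *page_cache_alloc(struct address_space *x)
    {
            return alloc_pages(mapping_gfp_mask(x), 0);
    }

    static inline struct page *page_cache_alloc_cold(struct address_space *x)
    {
            return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
    }
    #endif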
diff --git a/mm/memory.c b/mm/memory.c
index 80c3fb370f91..e347e106ca3a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -395,12 +395,16 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
395 return NULL; 395 return NULL;
396 } 396 }
397 397
398#ifdef CONFIG_DEBUG_VM 398 /*
399 * Add some anal sanity checks for now. Eventually,
400 * we should just do "return pfn_to_page(pfn)", but
401 * in the meantime we check that we get a valid pfn,
402 * and that the resulting page looks ok.
403 */
399 if (unlikely(!pfn_valid(pfn))) { 404 if (unlikely(!pfn_valid(pfn))) {
400 print_bad_pte(vma, pte, addr); 405 print_bad_pte(vma, pte, addr);
401 return NULL; 406 return NULL;
402 } 407 }
403#endif
404 408
405 /* 409 /*
406 * NOTE! We still have PageReserved() pages in the page 410 * NOTE! We still have PageReserved() pages in the page
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e93cc740c22b..4f71cfd29c6f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -422,6 +422,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
422 return mpol_check_policy(mode, nodes); 422 return mpol_check_policy(mode, nodes);
423} 423}
424 424
425
426/*
427 * Update task->flags PF_MEMPOLICY bit: set iff non-default
428 * mempolicy. Allows more rapid checking of this (combined perhaps
429 * with other PF_* flag bits) on memory allocation hot code paths.
430 *
431 * If called from outside this file, the task 'p' should -only- be
432 * a newly forked child not yet visible on the task list, because
433 * manipulating the task flags of a visible task is not safe.
434 *
435 * The above limitation is why this routine has the funny name
436 * mpol_fix_fork_child_flag().
437 *
438 * It is also safe to call this with a task pointer of current,
439 * which the static wrapper mpol_set_task_struct_flag() does,
440 * for use within this file.
441 */
442
443void mpol_fix_fork_child_flag(struct task_struct *p)
444{
445 if (p->mempolicy)
446 p->flags |= PF_MEMPOLICY;
447 else
448 p->flags &= ~PF_MEMPOLICY;
449}
450
451static void mpol_set_task_struct_flag(void)
452{
453 mpol_fix_fork_child_flag(current);
454}
455
425/* Set the process memory policy */ 456/* Set the process memory policy */
426long do_set_mempolicy(int mode, nodemask_t *nodes) 457long do_set_mempolicy(int mode, nodemask_t *nodes)
427{ 458{
@@ -434,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes)
434 return PTR_ERR(new); 465 return PTR_ERR(new);
435 mpol_free(current->mempolicy); 466 mpol_free(current->mempolicy);
436 current->mempolicy = new; 467 current->mempolicy = new;
468 mpol_set_task_struct_flag();
437 if (new && new->policy == MPOL_INTERLEAVE) 469 if (new && new->policy == MPOL_INTERLEAVE)
438 current->il_next = first_node(new->v.nodes); 470 current->il_next = first_node(new->v.nodes);
439 return 0; 471 return 0;
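The comment above restricts out-of-file use of mpol_fix_fork_child_flag() to a newly forked child that is not yet visible on the task list; that caller sits in kernel/fork.c and is therefore not part of this mm/ diff. Presumably it is invoked right after the child's mempolicy has been duplicated, roughly like the fragment below. This is a sketch of the copy_process() call site under that assumption; the error label is invented.

    	/* In copy_process(), after the child's mempolicy is duplicated: */
    	p->mempolicy = mpol_copy(p->mempolicy);
    	if (IS_ERR(p->mempolicy)) {
    		retval = PTR_ERR(p->mempolicy);
    		p->mempolicy = NULL;
    		goto bad_fork_cleanup;
    	}
    	mpol_fix_fork_child_flag(p);	/* set or clear PF_MEMPOLICY for the child */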
diff --git a/mm/mmap.c b/mm/mmap.c
index 0eb9894db6de..4f5b5709136a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1040,12 +1040,11 @@ munmap_back:
1040 * specific mapper. the address has already been validated, but 1040 * specific mapper. the address has already been validated, but
1041 * not unmapped, but the maps are removed from the list. 1041 * not unmapped, but the maps are removed from the list.
1042 */ 1042 */
1043 vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 1043 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1044 if (!vma) { 1044 if (!vma) {
1045 error = -ENOMEM; 1045 error = -ENOMEM;
1046 goto unacct_error; 1046 goto unacct_error;
1047 } 1047 }
1048 memset(vma, 0, sizeof(*vma));
1049 1048
1050 vma->vm_mm = mm; 1049 vma->vm_mm = mm;
1051 vma->vm_start = addr; 1050 vma->vm_start = addr;
@@ -1896,12 +1895,11 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
1896 /* 1895 /*
1897 * create a vma struct for an anonymous mapping 1896 * create a vma struct for an anonymous mapping
1898 */ 1897 */
1899 vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); 1898 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1900 if (!vma) { 1899 if (!vma) {
1901 vm_unacct_memory(len >> PAGE_SHIFT); 1900 vm_unacct_memory(len >> PAGE_SHIFT);
1902 return -ENOMEM; 1901 return -ENOMEM;
1903 } 1902 }
1904 memset(vma, 0, sizeof(*vma));
1905 1903
1906 vma->vm_mm = mm; 1904 vma->vm_mm = mm;
1907 vma->vm_start = addr; 1905 vma->vm_start = addr;
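Both hunks above are the same mechanical transformation; for similar callers elsewhere the before/after pattern is simply the following, using the cache name from this file (kmem_cache_zalloc() itself is added in mm/slab.c and mm/slob.c later in this series):

    	/* Before: allocate, then clear by hand. */
    	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
    	if (vma)
    		memset(vma, 0, sizeof(*vma));

    	/* After: one call with identical semantics. */
    	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);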
diff --git a/mm/msync.c b/mm/msync.c
index 3563a56e1a51..bc6c95376366 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -9,20 +9,24 @@
9 */ 9 */
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/fs.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
13#include <linux/mman.h> 14#include <linux/mman.h>
14#include <linux/hugetlb.h> 15#include <linux/hugetlb.h>
16#include <linux/writeback.h>
17#include <linux/file.h>
15#include <linux/syscalls.h> 18#include <linux/syscalls.h>
16 19
17#include <asm/pgtable.h> 20#include <asm/pgtable.h>
18#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
19 22
20static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, 23static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
21 unsigned long addr, unsigned long end) 24 unsigned long addr, unsigned long end)
22{ 25{
23 pte_t *pte; 26 pte_t *pte;
24 spinlock_t *ptl; 27 spinlock_t *ptl;
25 int progress = 0; 28 int progress = 0;
29 unsigned long ret = 0;
26 30
27again: 31again:
28 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 32 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -43,58 +47,64 @@ again:
43 if (!page) 47 if (!page)
44 continue; 48 continue;
45 if (ptep_clear_flush_dirty(vma, addr, pte) || 49 if (ptep_clear_flush_dirty(vma, addr, pte) ||
46 page_test_and_clear_dirty(page)) 50 page_test_and_clear_dirty(page))
47 set_page_dirty(page); 51 ret += set_page_dirty(page);
48 progress += 3; 52 progress += 3;
49 } while (pte++, addr += PAGE_SIZE, addr != end); 53 } while (pte++, addr += PAGE_SIZE, addr != end);
50 pte_unmap_unlock(pte - 1, ptl); 54 pte_unmap_unlock(pte - 1, ptl);
51 cond_resched(); 55 cond_resched();
52 if (addr != end) 56 if (addr != end)
53 goto again; 57 goto again;
58 return ret;
54} 59}
55 60
56static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud, 61static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
57 unsigned long addr, unsigned long end) 62 pud_t *pud, unsigned long addr, unsigned long end)
58{ 63{
59 pmd_t *pmd; 64 pmd_t *pmd;
60 unsigned long next; 65 unsigned long next;
66 unsigned long ret = 0;
61 67
62 pmd = pmd_offset(pud, addr); 68 pmd = pmd_offset(pud, addr);
63 do { 69 do {
64 next = pmd_addr_end(addr, end); 70 next = pmd_addr_end(addr, end);
65 if (pmd_none_or_clear_bad(pmd)) 71 if (pmd_none_or_clear_bad(pmd))
66 continue; 72 continue;
67 msync_pte_range(vma, pmd, addr, next); 73 ret += msync_pte_range(vma, pmd, addr, next);
68 } while (pmd++, addr = next, addr != end); 74 } while (pmd++, addr = next, addr != end);
75 return ret;
69} 76}
70 77
71static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, 78static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
72 unsigned long addr, unsigned long end) 79 pgd_t *pgd, unsigned long addr, unsigned long end)
73{ 80{
74 pud_t *pud; 81 pud_t *pud;
75 unsigned long next; 82 unsigned long next;
83 unsigned long ret = 0;
76 84
77 pud = pud_offset(pgd, addr); 85 pud = pud_offset(pgd, addr);
78 do { 86 do {
79 next = pud_addr_end(addr, end); 87 next = pud_addr_end(addr, end);
80 if (pud_none_or_clear_bad(pud)) 88 if (pud_none_or_clear_bad(pud))
81 continue; 89 continue;
82 msync_pmd_range(vma, pud, addr, next); 90 ret += msync_pmd_range(vma, pud, addr, next);
83 } while (pud++, addr = next, addr != end); 91 } while (pud++, addr = next, addr != end);
92 return ret;
84} 93}
85 94
86static void msync_page_range(struct vm_area_struct *vma, 95static unsigned long msync_page_range(struct vm_area_struct *vma,
87 unsigned long addr, unsigned long end) 96 unsigned long addr, unsigned long end)
88{ 97{
89 pgd_t *pgd; 98 pgd_t *pgd;
90 unsigned long next; 99 unsigned long next;
100 unsigned long ret = 0;
91 101
92 /* For hugepages we can't go walking the page table normally, 102 /* For hugepages we can't go walking the page table normally,
93 * but that's ok, hugetlbfs is memory based, so we don't need 103 * but that's ok, hugetlbfs is memory based, so we don't need
94 * to do anything more on an msync(). 104 * to do anything more on an msync().
95 */ 105 */
96 if (vma->vm_flags & VM_HUGETLB) 106 if (vma->vm_flags & VM_HUGETLB)
97 return; 107 return 0;
98 108
99 BUG_ON(addr >= end); 109 BUG_ON(addr >= end);
100 pgd = pgd_offset(vma->vm_mm, addr); 110 pgd = pgd_offset(vma->vm_mm, addr);
@@ -103,8 +113,9 @@ static void msync_page_range(struct vm_area_struct *vma,
103 next = pgd_addr_end(addr, end); 113 next = pgd_addr_end(addr, end);
104 if (pgd_none_or_clear_bad(pgd)) 114 if (pgd_none_or_clear_bad(pgd))
105 continue; 115 continue;
106 msync_pud_range(vma, pgd, addr, next); 116 ret += msync_pud_range(vma, pgd, addr, next);
107 } while (pgd++, addr = next, addr != end); 117 } while (pgd++, addr = next, addr != end);
118 return ret;
108} 119}
109 120
110/* 121/*
@@ -115,53 +126,31 @@ static void msync_page_range(struct vm_area_struct *vma,
115 * write out the dirty pages and wait on the writeout and check the result. 126 * write out the dirty pages and wait on the writeout and check the result.
116 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start 127 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
117 * async writeout immediately. 128 * async writeout immediately.
118 * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to 129 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
119 * applications. 130 * applications.
120 */ 131 */
121static int msync_interval(struct vm_area_struct *vma, 132static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
122 unsigned long addr, unsigned long end, int flags) 133 unsigned long end, int flags,
134 unsigned long *nr_pages_dirtied)
123{ 135{
124 int ret = 0;
125 struct file *file = vma->vm_file; 136 struct file *file = vma->vm_file;
126 137
127 if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED)) 138 if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
128 return -EBUSY; 139 return -EBUSY;
129 140
130 if (file && (vma->vm_flags & VM_SHARED)) { 141 if (file && (vma->vm_flags & VM_SHARED))
131 msync_page_range(vma, addr, end); 142 *nr_pages_dirtied = msync_page_range(vma, addr, end);
132 143 return 0;
133 if (flags & MS_SYNC) {
134 struct address_space *mapping = file->f_mapping;
135 int err;
136
137 ret = filemap_fdatawrite(mapping);
138 if (file->f_op && file->f_op->fsync) {
139 /*
140 * We don't take i_mutex here because mmap_sem
141 * is already held.
142 */
143 err = file->f_op->fsync(file,file->f_dentry,1);
144 if (err && !ret)
145 ret = err;
146 }
147 err = filemap_fdatawait(mapping);
148 if (!ret)
149 ret = err;
150 }
151 }
152 return ret;
153} 144}
154 145
155asmlinkage long sys_msync(unsigned long start, size_t len, int flags) 146asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
156{ 147{
157 unsigned long end; 148 unsigned long end;
158 struct vm_area_struct *vma; 149 struct vm_area_struct *vma;
159 int unmapped_error, error = -EINVAL; 150 int unmapped_error = 0;
160 151 int error = -EINVAL;
161 if (flags & MS_SYNC) 152 int done = 0;
162 current->flags |= PF_SYNCWRITE;
163 153
164 down_read(&current->mm->mmap_sem);
165 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) 154 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
166 goto out; 155 goto out;
167 if (start & ~PAGE_MASK) 156 if (start & ~PAGE_MASK)
@@ -180,13 +169,18 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
180 * If the interval [start,end) covers some unmapped address ranges, 169 * If the interval [start,end) covers some unmapped address ranges,
181 * just ignore them, but return -ENOMEM at the end. 170 * just ignore them, but return -ENOMEM at the end.
182 */ 171 */
172 down_read(&current->mm->mmap_sem);
173 if (flags & MS_SYNC)
174 current->flags |= PF_SYNCWRITE;
183 vma = find_vma(current->mm, start); 175 vma = find_vma(current->mm, start);
184 unmapped_error = 0; 176 if (!vma) {
185 for (;;) {
186 /* Still start < end. */
187 error = -ENOMEM; 177 error = -ENOMEM;
188 if (!vma) 178 goto out_unlock;
189 goto out; 179 }
180 do {
181 unsigned long nr_pages_dirtied = 0;
182 struct file *file;
183
190 /* Here start < vma->vm_end. */ 184 /* Here start < vma->vm_end. */
191 if (start < vma->vm_start) { 185 if (start < vma->vm_start) {
192 unmapped_error = -ENOMEM; 186 unmapped_error = -ENOMEM;
@@ -195,22 +189,47 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
195 /* Here vma->vm_start <= start < vma->vm_end. */ 189 /* Here vma->vm_start <= start < vma->vm_end. */
196 if (end <= vma->vm_end) { 190 if (end <= vma->vm_end) {
197 if (start < end) { 191 if (start < end) {
198 error = msync_interval(vma, start, end, flags); 192 error = msync_interval(vma, start, end, flags,
193 &nr_pages_dirtied);
199 if (error) 194 if (error)
200 goto out; 195 goto out_unlock;
201 } 196 }
202 error = unmapped_error; 197 error = unmapped_error;
203 goto out; 198 done = 1;
199 } else {
200 /* Here vma->vm_start <= start < vma->vm_end < end. */
201 error = msync_interval(vma, start, vma->vm_end, flags,
202 &nr_pages_dirtied);
203 if (error)
204 goto out_unlock;
204 } 205 }
205 /* Here vma->vm_start <= start < vma->vm_end < end. */ 206 file = vma->vm_file;
206 error = msync_interval(vma, start, vma->vm_end, flags);
207 if (error)
208 goto out;
209 start = vma->vm_end; 207 start = vma->vm_end;
210 vma = vma->vm_next; 208 if ((flags & MS_ASYNC) && file && nr_pages_dirtied) {
211 } 209 get_file(file);
212out: 210 up_read(&current->mm->mmap_sem);
213 up_read(&current->mm->mmap_sem); 211 balance_dirty_pages_ratelimited_nr(file->f_mapping,
212 nr_pages_dirtied);
213 fput(file);
214 down_read(&current->mm->mmap_sem);
215 vma = find_vma(current->mm, start);
216 } else if ((flags & MS_SYNC) && file &&
217 (vma->vm_flags & VM_SHARED)) {
218 get_file(file);
219 up_read(&current->mm->mmap_sem);
220 error = do_fsync(file, 0);
221 fput(file);
222 down_read(&current->mm->mmap_sem);
223 if (error)
224 goto out_unlock;
225 vma = find_vma(current->mm, start);
226 } else {
227 vma = vma->vm_next;
228 }
229 } while (vma && !done);
230out_unlock:
214 current->flags &= ~PF_SYNCWRITE; 231 current->flags &= ~PF_SYNCWRITE;
232 up_read(&current->mm->mmap_sem);
233out:
215 return error; 234 return error;
216} 235}
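From userspace the visible change is in MS_ASYNC: as the comment above explains, it no longer starts I/O, it only propagates pte-dirty state into page-dirty state (throttling the caller via balance_dirty_pages_ratelimited_nr() when many pages were dirtied), while MS_SYNC now goes through do_fsync() for each shared file-backed VMA. A small illustration of the two call patterns using standard msync(2); the helper name sync_mapping() is made up:

    #include <sys/mman.h>
    #include <stdio.h>

    static void sync_mapping(void *addr, size_t len, int durable)
    {
            if (durable) {
                    /* Data is written and waited on; MS_SYNC now uses
                     * do_fsync() on the backing file. */
                    if (msync(addr, len, MS_SYNC) == -1)
                            perror("msync(MS_SYNC)");
            } else {
                    /* Only marks the pages dirty for the VM and throttles the
                     * caller; writeout is left to pdflush or to a later
                     * fsync()/fadvise() call. */
                    if (msync(addr, len, MS_ASYNC) == -1)
                            perror("msync(MS_ASYNC)");
            }
    }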
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 945559fb63d2..893d7677579e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -75,12 +75,12 @@ int vm_dirty_ratio = 40;
75 * The interval between `kupdate'-style writebacks, in centiseconds 75 * The interval between `kupdate'-style writebacks, in centiseconds
76 * (hundredths of a second) 76 * (hundredths of a second)
77 */ 77 */
78int dirty_writeback_centisecs = 5 * 100; 78int dirty_writeback_interval = 5 * HZ;
79 79
80/* 80/*
81 * The longest number of centiseconds for which data is allowed to remain dirty 81 * The longest number of centiseconds for which data is allowed to remain dirty
82 */ 82 */
83int dirty_expire_centisecs = 30 * 100; 83int dirty_expire_interval = 30 * HZ;
84 84
85/* 85/*
86 * Flag that makes the machine dump writes/reads and block dirtyings. 86 * Flag that makes the machine dump writes/reads and block dirtyings.
@@ -88,7 +88,8 @@ int dirty_expire_centisecs = 30 * 100;
88int block_dump; 88int block_dump;
89 89
90/* 90/*
91 * Flag that puts the machine in "laptop mode". 91 * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
92 * a full sync is triggered after this time elapses without any disk activity.
92 */ 93 */
93int laptop_mode; 94int laptop_mode;
94 95
@@ -255,8 +256,9 @@ static void balance_dirty_pages(struct address_space *mapping)
255} 256}
256 257
257/** 258/**
258 * balance_dirty_pages_ratelimited - balance dirty memory state 259 * balance_dirty_pages_ratelimited_nr - balance dirty memory state
259 * @mapping: address_space which was dirtied 260 * @mapping: address_space which was dirtied
261 * @nr_pages: number of pages which the caller has just dirtied
260 * 262 *
261 * Processes which are dirtying memory should call in here once for each page 263 * Processes which are dirtying memory should call in here once for each page
262 * which was newly dirtied. The function will periodically check the system's 264 * which was newly dirtied. The function will periodically check the system's
@@ -267,10 +269,12 @@ static void balance_dirty_pages(struct address_space *mapping)
267 * limit we decrease the ratelimiting by a lot, to prevent individual processes 269 * limit we decrease the ratelimiting by a lot, to prevent individual processes
268 * from overshooting the limit by (ratelimit_pages) each. 270 * from overshooting the limit by (ratelimit_pages) each.
269 */ 271 */
270void balance_dirty_pages_ratelimited(struct address_space *mapping) 272void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
273 unsigned long nr_pages_dirtied)
271{ 274{
272 static DEFINE_PER_CPU(int, ratelimits) = 0; 275 static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
273 long ratelimit; 276 unsigned long ratelimit;
277 unsigned long *p;
274 278
275 ratelimit = ratelimit_pages; 279 ratelimit = ratelimit_pages;
276 if (dirty_exceeded) 280 if (dirty_exceeded)
@@ -280,15 +284,18 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
280 * Check the rate limiting. Also, we do not want to throttle real-time 284 * Check the rate limiting. Also, we do not want to throttle real-time
281 * tasks in balance_dirty_pages(). Period. 285 * tasks in balance_dirty_pages(). Period.
282 */ 286 */
283 if (get_cpu_var(ratelimits)++ >= ratelimit) { 287 preempt_disable();
284 __get_cpu_var(ratelimits) = 0; 288 p = &__get_cpu_var(ratelimits);
285 put_cpu_var(ratelimits); 289 *p += nr_pages_dirtied;
290 if (unlikely(*p >= ratelimit)) {
291 *p = 0;
292 preempt_enable();
286 balance_dirty_pages(mapping); 293 balance_dirty_pages(mapping);
287 return; 294 return;
288 } 295 }
289 put_cpu_var(ratelimits); 296 preempt_enable();
290} 297}
291EXPORT_SYMBOL(balance_dirty_pages_ratelimited); 298EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
292 299
293void throttle_vm_writeout(void) 300void throttle_vm_writeout(void)
294{ 301{
@@ -380,8 +387,8 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
380 * just walks the superblock inode list, writing back any inodes which are 387 * just walks the superblock inode list, writing back any inodes which are
381 * older than a specific point in time. 388 * older than a specific point in time.
382 * 389 *
383 * Try to run once per dirty_writeback_centisecs. But if a writeback event 390 * Try to run once per dirty_writeback_interval. But if a writeback event
384 * takes longer than a dirty_writeback_centisecs interval, then leave a 391 * takes longer than a dirty_writeback_interval interval, then leave a
385 * one-second gap. 392 * one-second gap.
386 * 393 *
387 * older_than_this takes precedence over nr_to_write. So we'll only write back 394 * older_than_this takes precedence over nr_to_write. So we'll only write back
@@ -406,9 +413,9 @@ static void wb_kupdate(unsigned long arg)
406 sync_supers(); 413 sync_supers();
407 414
408 get_writeback_state(&wbs); 415 get_writeback_state(&wbs);
409 oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; 416 oldest_jif = jiffies - dirty_expire_interval;
410 start_jif = jiffies; 417 start_jif = jiffies;
411 next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; 418 next_jif = start_jif + dirty_writeback_interval;
412 nr_to_write = wbs.nr_dirty + wbs.nr_unstable + 419 nr_to_write = wbs.nr_dirty + wbs.nr_unstable +
413 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 420 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
414 while (nr_to_write > 0) { 421 while (nr_to_write > 0) {
@@ -425,7 +432,7 @@ static void wb_kupdate(unsigned long arg)
425 } 432 }
426 if (time_before(next_jif, jiffies + HZ)) 433 if (time_before(next_jif, jiffies + HZ))
427 next_jif = jiffies + HZ; 434 next_jif = jiffies + HZ;
428 if (dirty_writeback_centisecs) 435 if (dirty_writeback_interval)
429 mod_timer(&wb_timer, next_jif); 436 mod_timer(&wb_timer, next_jif);
430} 437}
431 438
@@ -435,11 +442,11 @@ static void wb_kupdate(unsigned long arg)
435int dirty_writeback_centisecs_handler(ctl_table *table, int write, 442int dirty_writeback_centisecs_handler(ctl_table *table, int write,
436 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 443 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
437{ 444{
438 proc_dointvec(table, write, file, buffer, length, ppos); 445 proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
439 if (dirty_writeback_centisecs) { 446 if (dirty_writeback_interval) {
440 mod_timer(&wb_timer, 447 mod_timer(&wb_timer,
441 jiffies + (dirty_writeback_centisecs * HZ) / 100); 448 jiffies + dirty_writeback_interval);
442 } else { 449 } else {
443 del_timer(&wb_timer); 450 del_timer(&wb_timer);
444 } 451 }
445 return 0; 452 return 0;
@@ -468,7 +475,7 @@ static void laptop_timer_fn(unsigned long unused)
468 */ 475 */
469void laptop_io_completion(void) 476void laptop_io_completion(void)
470{ 477{
471 mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ); 478 mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
472} 479}
473 480
474/* 481/*
@@ -544,7 +551,7 @@ void __init page_writeback_init(void)
544 if (vm_dirty_ratio <= 0) 551 if (vm_dirty_ratio <= 0)
545 vm_dirty_ratio = 1; 552 vm_dirty_ratio = 1;
546 } 553 }
547 mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100); 554 mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
548 set_ratelimit(); 555 set_ratelimit();
549 register_cpu_notifier(&ratelimit_nb); 556 register_cpu_notifier(&ratelimit_nb);
550} 557}
@@ -621,8 +628,6 @@ EXPORT_SYMBOL(write_one_page);
621 */ 628 */
622int __set_page_dirty_nobuffers(struct page *page) 629int __set_page_dirty_nobuffers(struct page *page)
623{ 630{
624 int ret = 0;
625
626 if (!TestSetPageDirty(page)) { 631 if (!TestSetPageDirty(page)) {
627 struct address_space *mapping = page_mapping(page); 632 struct address_space *mapping = page_mapping(page);
628 struct address_space *mapping2; 633 struct address_space *mapping2;
@@ -644,8 +649,9 @@ int __set_page_dirty_nobuffers(struct page *page)
644 I_DIRTY_PAGES); 649 I_DIRTY_PAGES);
645 } 650 }
646 } 651 }
652 return 1;
647 } 653 }
648 return ret; 654 return 0;
649} 655}
650EXPORT_SYMBOL(__set_page_dirty_nobuffers); 656EXPORT_SYMBOL(__set_page_dirty_nobuffers);
651 657
@@ -675,8 +681,10 @@ int fastcall set_page_dirty(struct page *page)
675 return (*spd)(page); 681 return (*spd)(page);
676 return __set_page_dirty_buffers(page); 682 return __set_page_dirty_buffers(page);
677 } 683 }
678 if (!PageDirty(page)) 684 if (!PageDirty(page)) {
679 SetPageDirty(page); 685 if (!TestSetPageDirty(page))
686 return 1;
687 }
680 return 0; 688 return 0;
681} 689}
682EXPORT_SYMBOL(set_page_dirty); 690EXPORT_SYMBOL(set_page_dirty);
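Existing callers that dirty one page at a time keep using the old name; presumably the header side of this change (include/linux/writeback.h, not part of this mm/ diff) turns balance_dirty_pages_ratelimited() into a thin wrapper, roughly:

    void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
    					unsigned long nr_pages_dirtied);

    static inline void
    balance_dirty_pages_ratelimited(struct address_space *mapping)
    {
    	balance_dirty_pages_ratelimited_nr(mapping, 1);
    }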
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b7f14a4799a5..338a02bb004d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -943,7 +943,8 @@ restart:
943 goto got_pg; 943 goto got_pg;
944 944
945 do { 945 do {
946 wakeup_kswapd(*z, order); 946 if (cpuset_zone_allowed(*z, gfp_mask))
947 wakeup_kswapd(*z, order);
947 } while (*(++z)); 948 } while (*(++z));
948 949
949 /* 950 /*
@@ -2028,8 +2029,9 @@ static __meminit void zone_pcp_init(struct zone *zone)
2028 setup_pageset(zone_pcp(zone,cpu), batch); 2029 setup_pageset(zone_pcp(zone,cpu), batch);
2029#endif 2030#endif
2030 } 2031 }
2031 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", 2032 if (zone->present_pages)
2032 zone->name, zone->present_pages, batch); 2033 printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
2034 zone->name, zone->present_pages, batch);
2033} 2035}
2034 2036
2035static __meminit void init_currently_empty_zone(struct zone *zone, 2037static __meminit void init_currently_empty_zone(struct zone *zone,
@@ -2700,8 +2702,7 @@ void *__init alloc_large_system_hash(const char *tablename,
2700 else 2702 else
2701 numentries <<= (PAGE_SHIFT - scale); 2703 numentries <<= (PAGE_SHIFT - scale);
2702 } 2704 }
2703 /* rounded up to nearest power of 2 in size */ 2705 numentries = roundup_pow_of_two(numentries);
2704 numentries = 1UL << (long_log2(numentries) + 1);
2705 2706
2706 /* limit allocation size to 1/16 total memory by default */ 2707 /* limit allocation size to 1/16 total memory by default */
2707 if (max == 0) { 2708 if (max == 0) {
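Note the subtle semantic change in the hash-sizing hunk above: the old expression always moved up to the next power of two, even when numentries already was one, whereas roundup_pow_of_two() is the identity on powers of two. For example:

    	/* numentries == 1024 (already a power of two):
    	 *   old: 1UL << (long_log2(1024) + 1)  == 2048
    	 *   new: roundup_pow_of_two(1024)      == 1024
    	 * numentries == 1000:
    	 *   old: 1UL << (long_log2(1000) + 1)  == 1024
    	 *   new: roundup_pow_of_two(1000)      == 1024
    	 */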
diff --git a/mm/slab.c b/mm/slab.c
index 1c8f5ee230d5..681837499d7d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -94,6 +94,7 @@
94#include <linux/interrupt.h> 94#include <linux/interrupt.h>
95#include <linux/init.h> 95#include <linux/init.h>
96#include <linux/compiler.h> 96#include <linux/compiler.h>
97#include <linux/cpuset.h>
97#include <linux/seq_file.h> 98#include <linux/seq_file.h>
98#include <linux/notifier.h> 99#include <linux/notifier.h>
99#include <linux/kallsyms.h> 100#include <linux/kallsyms.h>
@@ -173,12 +174,12 @@
173 SLAB_CACHE_DMA | \ 174 SLAB_CACHE_DMA | \
174 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \ 175 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
175 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 176 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
176 SLAB_DESTROY_BY_RCU) 177 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
177#else 178#else
178# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \ 179# define CREATE_MASK (SLAB_HWCACHE_ALIGN | \
179 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \ 180 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
180 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \ 181 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
181 SLAB_DESTROY_BY_RCU) 182 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
182#endif 183#endif
183 184
184/* 185/*
@@ -203,7 +204,8 @@
203typedef unsigned int kmem_bufctl_t; 204typedef unsigned int kmem_bufctl_t;
204#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) 205#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0)
205#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) 206#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1)
206#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-2) 207#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2)
208#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
207 209
208/* Max number of objs-per-slab for caches which use off-slab slabs. 210/* Max number of objs-per-slab for caches which use off-slab slabs.
209 * Needed to avoid a possible looping condition in cache_grow(). 211 * Needed to avoid a possible looping condition in cache_grow().
@@ -896,8 +898,33 @@ static struct array_cache *alloc_arraycache(int node, int entries,
896 return nc; 898 return nc;
897} 899}
898 900
901/*
902 * Transfer objects in one arraycache to another.
903 * Locking must be handled by the caller.
904 *
905 * Return the number of entries transferred.
906 */
907static int transfer_objects(struct array_cache *to,
908 struct array_cache *from, unsigned int max)
909{
910 /* Figure out how many entries to transfer */
911 int nr = min(min(from->avail, max), to->limit - to->avail);
912
913 if (!nr)
914 return 0;
915
916 memcpy(to->entry + to->avail, from->entry + from->avail -nr,
917 sizeof(void *) *nr);
918
919 from->avail -= nr;
920 to->avail += nr;
921 to->touched = 1;
922 return nr;
923}
924
899#ifdef CONFIG_NUMA 925#ifdef CONFIG_NUMA
900static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); 926static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
927static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
901 928
902static struct array_cache **alloc_alien_cache(int node, int limit) 929static struct array_cache **alloc_alien_cache(int node, int limit)
903{ 930{
@@ -944,6 +971,13 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
944 971
945 if (ac->avail) { 972 if (ac->avail) {
946 spin_lock(&rl3->list_lock); 973 spin_lock(&rl3->list_lock);
974 /*
975 * Stuff objects into the remote nodes shared array first.
976 * That way we could avoid the overhead of putting the objects
977 * into the free lists and getting them back later.
978 */
979 transfer_objects(rl3->shared, ac, ac->limit);
980
947 free_block(cachep, ac->entry, ac->avail, node); 981 free_block(cachep, ac->entry, ac->avail, node);
948 ac->avail = 0; 982 ac->avail = 0;
949 spin_unlock(&rl3->list_lock); 983 spin_unlock(&rl3->list_lock);
@@ -959,8 +993,8 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
959 993
960 if (l3->alien) { 994 if (l3->alien) {
961 struct array_cache *ac = l3->alien[node]; 995 struct array_cache *ac = l3->alien[node];
962 if (ac && ac->avail) { 996
963 spin_lock_irq(&ac->lock); 997 if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
964 __drain_alien_cache(cachep, ac, node); 998 __drain_alien_cache(cachep, ac, node);
965 spin_unlock_irq(&ac->lock); 999 spin_unlock_irq(&ac->lock);
966 } 1000 }
@@ -1987,10 +2021,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1987 align = ralign; 2021 align = ralign;
1988 2022
1989 /* Get cache's description obj. */ 2023 /* Get cache's description obj. */
1990 cachep = kmem_cache_alloc(&cache_cache, SLAB_KERNEL); 2024 cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL);
1991 if (!cachep) 2025 if (!cachep)
1992 goto oops; 2026 goto oops;
1993 memset(cachep, 0, sizeof(struct kmem_cache));
1994 2027
1995#if DEBUG 2028#if DEBUG
1996 cachep->obj_size = size; 2029 cachep->obj_size = size;
@@ -2397,7 +2430,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
2397 /* Verify that the slab belongs to the intended node */ 2430 /* Verify that the slab belongs to the intended node */
2398 WARN_ON(slabp->nodeid != nodeid); 2431 WARN_ON(slabp->nodeid != nodeid);
2399 2432
2400 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2433 if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
2401 printk(KERN_ERR "slab: double free detected in cache " 2434 printk(KERN_ERR "slab: double free detected in cache "
2402 "'%s', objp %p\n", cachep->name, objp); 2435 "'%s', objp %p\n", cachep->name, objp);
2403 BUG(); 2436 BUG();
@@ -2603,6 +2636,9 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
2603 */ 2636 */
2604 cachep->dtor(objp + obj_offset(cachep), cachep, 0); 2637 cachep->dtor(objp + obj_offset(cachep), cachep, 0);
2605 } 2638 }
2639#ifdef CONFIG_DEBUG_SLAB_LEAK
2640 slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
2641#endif
2606 if (cachep->flags & SLAB_POISON) { 2642 if (cachep->flags & SLAB_POISON) {
2607#ifdef CONFIG_DEBUG_PAGEALLOC 2643#ifdef CONFIG_DEBUG_PAGEALLOC
2608 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { 2644 if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
@@ -2675,20 +2711,10 @@ retry:
2675 BUG_ON(ac->avail > 0 || !l3); 2711 BUG_ON(ac->avail > 0 || !l3);
2676 spin_lock(&l3->list_lock); 2712 spin_lock(&l3->list_lock);
2677 2713
2678 if (l3->shared) { 2714 /* See if we can refill from the shared array */
2679 struct array_cache *shared_array = l3->shared; 2715 if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
2680 if (shared_array->avail) { 2716 goto alloc_done;
2681 if (batchcount > shared_array->avail) 2717
2682 batchcount = shared_array->avail;
2683 shared_array->avail -= batchcount;
2684 ac->avail = batchcount;
2685 memcpy(ac->entry,
2686 &(shared_array->entry[shared_array->avail]),
2687 sizeof(void *) * batchcount);
2688 shared_array->touched = 1;
2689 goto alloc_done;
2690 }
2691 }
2692 while (batchcount > 0) { 2718 while (batchcount > 0) {
2693 struct list_head *entry; 2719 struct list_head *entry;
2694 struct slab *slabp; 2720 struct slab *slabp;
@@ -2786,6 +2812,16 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
2786 *dbg_redzone1(cachep, objp) = RED_ACTIVE; 2812 *dbg_redzone1(cachep, objp) = RED_ACTIVE;
2787 *dbg_redzone2(cachep, objp) = RED_ACTIVE; 2813 *dbg_redzone2(cachep, objp) = RED_ACTIVE;
2788 } 2814 }
2815#ifdef CONFIG_DEBUG_SLAB_LEAK
2816 {
2817 struct slab *slabp;
2818 unsigned objnr;
2819
2820 slabp = page_get_slab(virt_to_page(objp));
2821 objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
2822 slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
2823 }
2824#endif
2789 objp += obj_offset(cachep); 2825 objp += obj_offset(cachep);
2790 if (cachep->ctor && cachep->flags & SLAB_POISON) { 2826 if (cachep->ctor && cachep->flags & SLAB_POISON) {
2791 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR; 2827 unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
@@ -2807,11 +2843,10 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
2807 struct array_cache *ac; 2843 struct array_cache *ac;
2808 2844
2809#ifdef CONFIG_NUMA 2845#ifdef CONFIG_NUMA
2810 if (unlikely(current->mempolicy && !in_interrupt())) { 2846 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
2811 int nid = slab_node(current->mempolicy); 2847 objp = alternate_node_alloc(cachep, flags);
2812 2848 if (objp != NULL)
2813 if (nid != numa_node_id()) 2849 return objp;
2814 return __cache_alloc_node(cachep, flags, nid);
2815 } 2850 }
2816#endif 2851#endif
2817 2852
@@ -2847,6 +2882,28 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
2847 2882
2848#ifdef CONFIG_NUMA 2883#ifdef CONFIG_NUMA
2849/* 2884/*
2885 * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
2886 *
2887 * If we are in_interrupt, then process context, including cpusets and
2888 * mempolicy, may not apply and should not be used for allocation policy.
2889 */
2890static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
2891{
2892 int nid_alloc, nid_here;
2893
2894 if (in_interrupt())
2895 return NULL;
2896 nid_alloc = nid_here = numa_node_id();
2897 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
2898 nid_alloc = cpuset_mem_spread_node();
2899 else if (current->mempolicy)
2900 nid_alloc = slab_node(current->mempolicy);
2901 if (nid_alloc != nid_here)
2902 return __cache_alloc_node(cachep, flags, nid_alloc);
2903 return NULL;
2904}
2905
2906/*
2850 * A interface to enable slab creation on nodeid 2907 * A interface to enable slab creation on nodeid
2851 */ 2908 */
2852static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 2909static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
@@ -3071,6 +3128,23 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3071EXPORT_SYMBOL(kmem_cache_alloc); 3128EXPORT_SYMBOL(kmem_cache_alloc);
3072 3129
3073/** 3130/**
3131 * kmem_cache_zalloc - Allocate an object. The memory is set to zero.
3132 * @cache: The cache to allocate from.
3133 * @flags: See kmalloc().
3134 *
3135 * Allocate an object from this cache and set the allocated memory to zero.
3136 * The flags are only relevant if the cache has no available objects.
3137 */
3138void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
3139{
3140 void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
3141 if (ret)
3142 memset(ret, 0, obj_size(cache));
3143 return ret;
3144}
3145EXPORT_SYMBOL(kmem_cache_zalloc);
3146
3147/**
3074 * kmem_ptr_validate - check if an untrusted pointer might 3148 * kmem_ptr_validate - check if an untrusted pointer might
3075 * be a slab entry. 3149 * be a slab entry.
3076 * @cachep: the cache we're checking against 3150 * @cachep: the cache we're checking against
@@ -3197,22 +3271,23 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3197 return __cache_alloc(cachep, flags, caller); 3271 return __cache_alloc(cachep, flags, caller);
3198} 3272}
3199 3273
3200#ifndef CONFIG_DEBUG_SLAB
3201 3274
3202void *__kmalloc(size_t size, gfp_t flags) 3275void *__kmalloc(size_t size, gfp_t flags)
3203{ 3276{
3277#ifndef CONFIG_DEBUG_SLAB
3204 return __do_kmalloc(size, flags, NULL); 3278 return __do_kmalloc(size, flags, NULL);
3279#else
3280 return __do_kmalloc(size, flags, __builtin_return_address(0));
3281#endif
3205} 3282}
3206EXPORT_SYMBOL(__kmalloc); 3283EXPORT_SYMBOL(__kmalloc);
3207 3284
3208#else 3285#ifdef CONFIG_DEBUG_SLAB
3209
3210void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller) 3286void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
3211{ 3287{
3212 return __do_kmalloc(size, flags, caller); 3288 return __do_kmalloc(size, flags, caller);
3213} 3289}
3214EXPORT_SYMBOL(__kmalloc_track_caller); 3290EXPORT_SYMBOL(__kmalloc_track_caller);
3215
3216#endif 3291#endif
3217 3292
3218#ifdef CONFIG_SMP 3293#ifdef CONFIG_SMP
@@ -3343,63 +3418,86 @@ const char *kmem_cache_name(struct kmem_cache *cachep)
3343EXPORT_SYMBOL_GPL(kmem_cache_name); 3418EXPORT_SYMBOL_GPL(kmem_cache_name);
3344 3419
3345/* 3420/*
3346 * This initializes kmem_list3 for all nodes. 3421 * This initializes kmem_list3 or resizes various caches for all nodes.
3347 */ 3422 */
3348static int alloc_kmemlist(struct kmem_cache *cachep) 3423static int alloc_kmemlist(struct kmem_cache *cachep)
3349{ 3424{
3350 int node; 3425 int node;
3351 struct kmem_list3 *l3; 3426 struct kmem_list3 *l3;
3352 int err = 0; 3427 struct array_cache *new_shared;
3428 struct array_cache **new_alien;
3353 3429
3354 for_each_online_node(node) { 3430 for_each_online_node(node) {
3355 struct array_cache *nc = NULL, *new; 3431
3356 struct array_cache **new_alien = NULL;
3357#ifdef CONFIG_NUMA
3358 new_alien = alloc_alien_cache(node, cachep->limit); 3432 new_alien = alloc_alien_cache(node, cachep->limit);
3359 if (!new_alien) 3433 if (!new_alien)
3360 goto fail; 3434 goto fail;
3361#endif 3435
3362 new = alloc_arraycache(node, cachep->shared*cachep->batchcount, 3436 new_shared = alloc_arraycache(node,
3437 cachep->shared*cachep->batchcount,
3363 0xbaadf00d); 3438 0xbaadf00d);
3364 if (!new) 3439 if (!new_shared) {
3440 free_alien_cache(new_alien);
3365 goto fail; 3441 goto fail;
3442 }
3443
3366 l3 = cachep->nodelists[node]; 3444 l3 = cachep->nodelists[node];
3367 if (l3) { 3445 if (l3) {
3446 struct array_cache *shared = l3->shared;
3447
3368 spin_lock_irq(&l3->list_lock); 3448 spin_lock_irq(&l3->list_lock);
3369 3449
3370 nc = cachep->nodelists[node]->shared; 3450 if (shared)
3371 if (nc) 3451 free_block(cachep, shared->entry,
3372 free_block(cachep, nc->entry, nc->avail, node); 3452 shared->avail, node);
3373 3453
3374 l3->shared = new; 3454 l3->shared = new_shared;
3375 if (!cachep->nodelists[node]->alien) { 3455 if (!l3->alien) {
3376 l3->alien = new_alien; 3456 l3->alien = new_alien;
3377 new_alien = NULL; 3457 new_alien = NULL;
3378 } 3458 }
3379 l3->free_limit = (1 + nr_cpus_node(node)) * 3459 l3->free_limit = (1 + nr_cpus_node(node)) *
3380 cachep->batchcount + cachep->num; 3460 cachep->batchcount + cachep->num;
3381 spin_unlock_irq(&l3->list_lock); 3461 spin_unlock_irq(&l3->list_lock);
3382 kfree(nc); 3462 kfree(shared);
3383 free_alien_cache(new_alien); 3463 free_alien_cache(new_alien);
3384 continue; 3464 continue;
3385 } 3465 }
3386 l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node); 3466 l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
3387 if (!l3) 3467 if (!l3) {
3468 free_alien_cache(new_alien);
3469 kfree(new_shared);
3388 goto fail; 3470 goto fail;
3471 }
3389 3472
3390 kmem_list3_init(l3); 3473 kmem_list3_init(l3);
3391 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + 3474 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3392 ((unsigned long)cachep) % REAPTIMEOUT_LIST3; 3475 ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
3393 l3->shared = new; 3476 l3->shared = new_shared;
3394 l3->alien = new_alien; 3477 l3->alien = new_alien;
3395 l3->free_limit = (1 + nr_cpus_node(node)) * 3478 l3->free_limit = (1 + nr_cpus_node(node)) *
3396 cachep->batchcount + cachep->num; 3479 cachep->batchcount + cachep->num;
3397 cachep->nodelists[node] = l3; 3480 cachep->nodelists[node] = l3;
3398 } 3481 }
3399 return err; 3482 return 0;
3483
3400fail: 3484fail:
3401 err = -ENOMEM; 3485 if (!cachep->next.next) {
3402 return err; 3486 /* Cache is not active yet. Roll back what we did */
3487 node--;
3488 while (node >= 0) {
3489 if (cachep->nodelists[node]) {
3490 l3 = cachep->nodelists[node];
3491
3492 kfree(l3->shared);
3493 free_alien_cache(l3->alien);
3494 kfree(l3);
3495 cachep->nodelists[node] = NULL;
3496 }
3497 node--;
3498 }
3499 }
3500 return -ENOMEM;
3403} 3501}
3404 3502
3405struct ccupdate_struct { 3503struct ccupdate_struct {
@@ -3876,6 +3974,159 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
3876 res = count; 3974 res = count;
3877 return res; 3975 return res;
3878} 3976}
3977
3978#ifdef CONFIG_DEBUG_SLAB_LEAK
3979
3980static void *leaks_start(struct seq_file *m, loff_t *pos)
3981{
3982 loff_t n = *pos;
3983 struct list_head *p;
3984
3985 mutex_lock(&cache_chain_mutex);
3986 p = cache_chain.next;
3987 while (n--) {
3988 p = p->next;
3989 if (p == &cache_chain)
3990 return NULL;
3991 }
3992 return list_entry(p, struct kmem_cache, next);
3993}
3994
3995static inline int add_caller(unsigned long *n, unsigned long v)
3996{
3997 unsigned long *p;
3998 int l;
3999 if (!v)
4000 return 1;
4001 l = n[1];
4002 p = n + 2;
4003 while (l) {
4004 int i = l/2;
4005 unsigned long *q = p + 2 * i;
4006 if (*q == v) {
4007 q[1]++;
4008 return 1;
4009 }
4010 if (*q > v) {
4011 l = i;
4012 } else {
4013 p = q + 2;
4014 l -= i + 1;
4015 }
4016 }
4017 if (++n[1] == n[0])
4018 return 0;
4019 memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
4020 p[0] = v;
4021 p[1] = 1;
4022 return 1;
4023}
4024
4025static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
4026{
4027 void *p;
4028 int i;
4029 if (n[0] == n[1])
4030 return;
4031 for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
4032 if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
4033 continue;
4034 if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
4035 return;
4036 }
4037}
4038
4039static void show_symbol(struct seq_file *m, unsigned long address)
4040{
4041#ifdef CONFIG_KALLSYMS
4042 char *modname;
4043 const char *name;
4044 unsigned long offset, size;
4045 char namebuf[KSYM_NAME_LEN+1];
4046
4047 name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
4048
4049 if (name) {
4050 seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
4051 if (modname)
4052 seq_printf(m, " [%s]", modname);
4053 return;
4054 }
4055#endif
4056 seq_printf(m, "%p", (void *)address);
4057}
4058
4059static int leaks_show(struct seq_file *m, void *p)
4060{
4061 struct kmem_cache *cachep = p;
4062 struct list_head *q;
4063 struct slab *slabp;
4064 struct kmem_list3 *l3;
4065 const char *name;
4066 unsigned long *n = m->private;
4067 int node;
4068 int i;
4069
4070 if (!(cachep->flags & SLAB_STORE_USER))
4071 return 0;
4072 if (!(cachep->flags & SLAB_RED_ZONE))
4073 return 0;
4074
4075 /* OK, we can do it */
4076
4077 n[1] = 0;
4078
4079 for_each_online_node(node) {
4080 l3 = cachep->nodelists[node];
4081 if (!l3)
4082 continue;
4083
4084 check_irq_on();
4085 spin_lock_irq(&l3->list_lock);
4086
4087 list_for_each(q, &l3->slabs_full) {
4088 slabp = list_entry(q, struct slab, list);
4089 handle_slab(n, cachep, slabp);
4090 }
4091 list_for_each(q, &l3->slabs_partial) {
4092 slabp = list_entry(q, struct slab, list);
4093 handle_slab(n, cachep, slabp);
4094 }
4095 spin_unlock_irq(&l3->list_lock);
4096 }
4097 name = cachep->name;
4098 if (n[0] == n[1]) {
4099 /* Increase the buffer size */
4100 mutex_unlock(&cache_chain_mutex);
4101 m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
4102 if (!m->private) {
4103 /* Too bad, we are really out */
4104 m->private = n;
4105 mutex_lock(&cache_chain_mutex);
4106 return -ENOMEM;
4107 }
4108 *(unsigned long *)m->private = n[0] * 2;
4109 kfree(n);
4110 mutex_lock(&cache_chain_mutex);
4111 /* Now make sure this entry will be retried */
4112 m->count = m->size;
4113 return 0;
4114 }
4115 for (i = 0; i < n[1]; i++) {
4116 seq_printf(m, "%s: %lu ", name, n[2*i+3]);
4117 show_symbol(m, n[2*i+2]);
4118 seq_putc(m, '\n');
4119 }
4120 return 0;
4121}
4122
4123struct seq_operations slabstats_op = {
4124 .start = leaks_start,
4125 .next = s_next,
4126 .stop = s_stop,
4127 .show = leaks_show,
4128};
4129#endif
3879#endif 4130#endif
3880 4131
3881/** 4132/**
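Given the seq_printf() format in leaks_show() and show_symbol() above, each record of the new CONFIG_DEBUG_SLAB_LEAK seq_file comes out as "<cache name>: <count> <symbol>+<offset>/<size> [module]", i.e. how many live objects of a cache were allocated from a given caller (tracked via the BUFCTL_ACTIVE marking and *dbg_userword()). A hypothetical sample of that output, with made-up counts and symbols:

    vm_area_struct: 1605 split_vma+0x51/0xa4
    size-64: 392 scsi_get_command+0x21/0x70 [scsi_mod]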
diff --git a/mm/slob.c b/mm/slob.c
index a1f42bdc0245..9bcc7e2cabfd 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -294,6 +294,16 @@ void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
294} 294}
295EXPORT_SYMBOL(kmem_cache_alloc); 295EXPORT_SYMBOL(kmem_cache_alloc);
296 296
297void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
298{
299 void *ret = kmem_cache_alloc(c, flags);
300 if (ret)
301 memset(ret, 0, c->size);
302
303 return ret;
304}
305EXPORT_SYMBOL(kmem_cache_zalloc);
306
297void kmem_cache_free(struct kmem_cache *c, void *b) 307void kmem_cache_free(struct kmem_cache *c, void *b)
298{ 308{
299 if (c->dtor) 309 if (c->dtor)
diff --git a/mm/util.c b/mm/util.c
index 5f4bb59da63c..7368479220b3 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,20 +1,22 @@
1#include <linux/slab.h> 1#include <linux/slab.h>
2#include <linux/string.h> 2#include <linux/string.h>
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/err.h>
5#include <asm/uaccess.h>
4 6
5/** 7/**
6 * kzalloc - allocate memory. The memory is set to zero. 8 * __kzalloc - allocate memory. The memory is set to zero.
7 * @size: how many bytes of memory are required. 9 * @size: how many bytes of memory are required.
8 * @flags: the type of memory to allocate. 10 * @flags: the type of memory to allocate.
9 */ 11 */
10void *kzalloc(size_t size, gfp_t flags) 12void *__kzalloc(size_t size, gfp_t flags)
11{ 13{
12 void *ret = kmalloc(size, flags); 14 void *ret = ____kmalloc(size, flags);
13 if (ret) 15 if (ret)
14 memset(ret, 0, size); 16 memset(ret, 0, size);
15 return ret; 17 return ret;
16} 18}
17EXPORT_SYMBOL(kzalloc); 19EXPORT_SYMBOL(__kzalloc);
18 20
19/* 21/*
20 * kstrdup - allocate space for and copy an existing string 22 * kstrdup - allocate space for and copy an existing string
@@ -31,9 +33,44 @@ char *kstrdup(const char *s, gfp_t gfp)
31 return NULL; 33 return NULL;
32 34
33 len = strlen(s) + 1; 35 len = strlen(s) + 1;
34 buf = kmalloc(len, gfp); 36 buf = ____kmalloc(len, gfp);
35 if (buf) 37 if (buf)
36 memcpy(buf, s, len); 38 memcpy(buf, s, len);
37 return buf; 39 return buf;
38} 40}
39EXPORT_SYMBOL(kstrdup); 41EXPORT_SYMBOL(kstrdup);
42
43/*
44 * strndup_user - duplicate an existing string from user space
45 *
46 * @s: The string to duplicate
47 * @n: Maximum number of bytes to copy, including the trailing NUL.
48 */
49char *strndup_user(const char __user *s, long n)
50{
51 char *p;
52 long length;
53
54 length = strnlen_user(s, n);
55
56 if (!length)
57 return ERR_PTR(-EFAULT);
58
59 if (length > n)
60 return ERR_PTR(-EINVAL);
61
62 p = kmalloc(length, GFP_KERNEL);
63
64 if (!p)
65 return ERR_PTR(-ENOMEM);
66
67 if (copy_from_user(p, s, length)) {
68 kfree(p);
69 return ERR_PTR(-EFAULT);
70 }
71
72 p[length - 1] = '\0';
73
74 return p;
75}
76EXPORT_SYMBOL(strndup_user);
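strndup_user() returns ERR_PTR() codes rather than NULL, so callers are expected to use IS_ERR()/PTR_ERR(). A minimal hypothetical caller (the function name and the PAGE_SIZE limit are chosen only for illustration):

    static long example_set_name(const char __user *uname)
    {
    	char *name = strndup_user(uname, PAGE_SIZE);

    	if (IS_ERR(name))
    		return PTR_ERR(name);

    	/* ... use the NUL-terminated kernel copy ... */

    	kfree(name);
    	return 0;
    }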
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fd572bbdc9f5..78865c849f8f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1356,7 +1356,9 @@ static int __init kswapd_init(void)
1356 1356
1357 pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); 1357 pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
1358 BUG_ON(pid < 0); 1358 BUG_ON(pid < 0);
1359 read_lock(&tasklist_lock);
1359 pgdat->kswapd = find_task_by_pid(pid); 1360 pgdat->kswapd = find_task_by_pid(pid);
1361 read_unlock(&tasklist_lock);
1360 } 1362 }
1361 total_memory = nr_free_pagecache_pages(); 1363 total_memory = nr_free_pagecache_pages();
1362 hotcpu_notifier(cpu_callback, 0); 1364 hotcpu_notifier(cpu_callback, 0);