Diffstat (limited to 'mm')
 -rw-r--r--  mm/Kconfig            |    2
 -rw-r--r--  mm/filemap.c          |  101
 -rw-r--r--  mm/filemap_xip.c      |    8
 -rw-r--r--  mm/hugetlb.c          |   33
 -rw-r--r--  mm/madvise.c          |    1
 -rw-r--r--  mm/memory.c           |    2
 -rw-r--r--  mm/memory_hotplug.c   |    2
 -rw-r--r--  mm/mlock.c            |   11
 -rw-r--r--  mm/mmap.c             |    3
 -rw-r--r--  mm/msync.c            |    1
 -rw-r--r--  mm/nommu.c            |    8
 -rw-r--r--  mm/page-writeback.c   |   64
 -rw-r--r--  mm/page_alloc.c       |  127
 -rw-r--r--  mm/rmap.c             |   67
 -rw-r--r--  mm/shmem.c            |    8
 -rw-r--r--  mm/slab.c             |  152
 -rw-r--r--  mm/slob.c             |   53
 -rw-r--r--  mm/slub.c             | 1361
 -rw-r--r--  mm/sparse.c           |   23
 -rw-r--r--  mm/swap.c             |    2
 -rw-r--r--  mm/thrash.c           |    5
 -rw-r--r--  mm/truncate.c         |    3
 -rw-r--r--  mm/vmalloc.c          |    9
 -rw-r--r--  mm/vmscan.c           |   10
 -rw-r--r--  mm/vmstat.c           |   96
 25 files changed, 1272 insertions(+), 880 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 1ac718f636ec..8ac412b45f18 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -166,5 +166,5 @@ config ZONE_DMA_FLAG | |||
166 | config NR_QUICK | 166 | config NR_QUICK |
167 | int | 167 | int |
168 | depends on QUICKLIST | 168 | depends on QUICKLIST |
169 | default "2" if (SUPERH && !SUPERH64) | ||
169 | default "1" | 170 | default "1" |
170 | |||
diff --git a/mm/filemap.c b/mm/filemap.c
index 5631d6b2a62d..edb1b0b5cc8d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -670,7 +670,8 @@ repeat: | |||
670 | page = find_lock_page(mapping, index); | 670 | page = find_lock_page(mapping, index); |
671 | if (!page) { | 671 | if (!page) { |
672 | if (!cached_page) { | 672 | if (!cached_page) { |
673 | cached_page = alloc_page(gfp_mask); | 673 | cached_page = |
674 | __page_cache_alloc(gfp_mask); | ||
674 | if (!cached_page) | 675 | if (!cached_page) |
675 | return NULL; | 676 | return NULL; |
676 | } | 677 | } |
@@ -750,6 +751,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, | |||
750 | read_unlock_irq(&mapping->tree_lock); | 751 | read_unlock_irq(&mapping->tree_lock); |
751 | return i; | 752 | return i; |
752 | } | 753 | } |
754 | EXPORT_SYMBOL(find_get_pages_contig); | ||
753 | 755 | ||
754 | /** | 756 | /** |
755 | * find_get_pages_tag - find and return pages that match @tag | 757 | * find_get_pages_tag - find and return pages that match @tag |
@@ -778,6 +780,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, | |||
778 | read_unlock_irq(&mapping->tree_lock); | 780 | read_unlock_irq(&mapping->tree_lock); |
779 | return ret; | 781 | return ret; |
780 | } | 782 | } |
783 | EXPORT_SYMBOL(find_get_pages_tag); | ||
781 | 784 | ||
782 | /** | 785 | /** |
783 | * grab_cache_page_nowait - returns locked page at given index in given cache | 786 | * grab_cache_page_nowait - returns locked page at given index in given cache |
@@ -1110,6 +1113,45 @@ success: | |||
1110 | return size; | 1113 | return size; |
1111 | } | 1114 | } |
1112 | 1115 | ||
1116 | /* | ||
1117 | * Performs necessary checks before doing a write | ||
1118 | * @iov: io vector request | ||
1119 | * @nr_segs: number of segments in the iovec | ||
1120 | * @count: number of bytes to write | ||
1121 | * @access_flags: type of access: %VERIFY_READ or %VERIFY_WRITE | ||
1122 | * | ||
1123 | * Adjust number of segments and amount of bytes to write (nr_segs should be | ||
1124 | * properly initialized first). Returns appropriate error code that caller | ||
1125 | * should return or zero in case that write should be allowed. | ||
1126 | */ | ||
1127 | int generic_segment_checks(const struct iovec *iov, | ||
1128 | unsigned long *nr_segs, size_t *count, int access_flags) | ||
1129 | { | ||
1130 | unsigned long seg; | ||
1131 | size_t cnt = 0; | ||
1132 | for (seg = 0; seg < *nr_segs; seg++) { | ||
1133 | const struct iovec *iv = &iov[seg]; | ||
1134 | |||
1135 | /* | ||
1136 | * If any segment has a negative length, or the cumulative | ||
1137 | * length ever wraps negative then return -EINVAL. | ||
1138 | */ | ||
1139 | cnt += iv->iov_len; | ||
1140 | if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) | ||
1141 | return -EINVAL; | ||
1142 | if (access_ok(access_flags, iv->iov_base, iv->iov_len)) | ||
1143 | continue; | ||
1144 | if (seg == 0) | ||
1145 | return -EFAULT; | ||
1146 | *nr_segs = seg; | ||
1147 | cnt -= iv->iov_len; /* This segment is no good */ | ||
1148 | break; | ||
1149 | } | ||
1150 | *count = cnt; | ||
1151 | return 0; | ||
1152 | } | ||
1153 | EXPORT_SYMBOL(generic_segment_checks); | ||
1154 | |||
1113 | /** | 1155 | /** |
1114 | * generic_file_aio_read - generic filesystem read routine | 1156 | * generic_file_aio_read - generic filesystem read routine |
1115 | * @iocb: kernel I/O control block | 1157 | * @iocb: kernel I/O control block |
@@ -1131,24 +1173,9 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1131 | loff_t *ppos = &iocb->ki_pos; | 1173 | loff_t *ppos = &iocb->ki_pos; |
1132 | 1174 | ||
1133 | count = 0; | 1175 | count = 0; |
1134 | for (seg = 0; seg < nr_segs; seg++) { | 1176 | retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); |
1135 | const struct iovec *iv = &iov[seg]; | 1177 | if (retval) |
1136 | 1178 | return retval; | |
1137 | /* | ||
1138 | * If any segment has a negative length, or the cumulative | ||
1139 | * length ever wraps negative then return -EINVAL. | ||
1140 | */ | ||
1141 | count += iv->iov_len; | ||
1142 | if (unlikely((ssize_t)(count|iv->iov_len) < 0)) | ||
1143 | return -EINVAL; | ||
1144 | if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len)) | ||
1145 | continue; | ||
1146 | if (seg == 0) | ||
1147 | return -EFAULT; | ||
1148 | nr_segs = seg; | ||
1149 | count -= iv->iov_len; /* This segment is no good */ | ||
1150 | break; | ||
1151 | } | ||
1152 | 1179 | ||
1153 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ | 1180 | /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ |
1154 | if (filp->f_flags & O_DIRECT) { | 1181 | if (filp->f_flags & O_DIRECT) { |
@@ -1758,7 +1785,7 @@ struct page *read_cache_page_async(struct address_space *mapping, | |||
1758 | retry: | 1785 | retry: |
1759 | page = __read_cache_page(mapping, index, filler, data); | 1786 | page = __read_cache_page(mapping, index, filler, data); |
1760 | if (IS_ERR(page)) | 1787 | if (IS_ERR(page)) |
1761 | goto out; | 1788 | return page; |
1762 | mark_page_accessed(page); | 1789 | mark_page_accessed(page); |
1763 | if (PageUptodate(page)) | 1790 | if (PageUptodate(page)) |
1764 | goto out; | 1791 | goto out; |
@@ -1776,9 +1803,9 @@ retry: | |||
1776 | err = filler(data, page); | 1803 | err = filler(data, page); |
1777 | if (err < 0) { | 1804 | if (err < 0) { |
1778 | page_cache_release(page); | 1805 | page_cache_release(page); |
1779 | page = ERR_PTR(err); | 1806 | return ERR_PTR(err); |
1780 | } | 1807 | } |
1781 | out: | 1808 | out: |
1782 | mark_page_accessed(page); | 1809 | mark_page_accessed(page); |
1783 | return page; | 1810 | return page; |
1784 | } | 1811 | } |
@@ -2218,30 +2245,14 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
2218 | size_t ocount; /* original count */ | 2245 | size_t ocount; /* original count */ |
2219 | size_t count; /* after file limit checks */ | 2246 | size_t count; /* after file limit checks */ |
2220 | struct inode *inode = mapping->host; | 2247 | struct inode *inode = mapping->host; |
2221 | unsigned long seg; | ||
2222 | loff_t pos; | 2248 | loff_t pos; |
2223 | ssize_t written; | 2249 | ssize_t written; |
2224 | ssize_t err; | 2250 | ssize_t err; |
2225 | 2251 | ||
2226 | ocount = 0; | 2252 | ocount = 0; |
2227 | for (seg = 0; seg < nr_segs; seg++) { | 2253 | err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); |
2228 | const struct iovec *iv = &iov[seg]; | 2254 | if (err) |
2229 | 2255 | return err; | |
2230 | /* | ||
2231 | * If any segment has a negative length, or the cumulative | ||
2232 | * length ever wraps negative then return -EINVAL. | ||
2233 | */ | ||
2234 | ocount += iv->iov_len; | ||
2235 | if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) | ||
2236 | return -EINVAL; | ||
2237 | if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) | ||
2238 | continue; | ||
2239 | if (seg == 0) | ||
2240 | return -EFAULT; | ||
2241 | nr_segs = seg; | ||
2242 | ocount -= iv->iov_len; /* This segment is no good */ | ||
2243 | break; | ||
2244 | } | ||
2245 | 2256 | ||
2246 | count = ocount; | 2257 | count = ocount; |
2247 | pos = *ppos; | 2258 | pos = *ppos; |
@@ -2301,10 +2312,10 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, | |||
2301 | * semantics. | 2312 | * semantics. |
2302 | */ | 2313 | */ |
2303 | endbyte = pos + written_buffered - written - 1; | 2314 | endbyte = pos + written_buffered - written - 1; |
2304 | err = do_sync_file_range(file, pos, endbyte, | 2315 | err = do_sync_mapping_range(file->f_mapping, pos, endbyte, |
2305 | SYNC_FILE_RANGE_WAIT_BEFORE| | 2316 | SYNC_FILE_RANGE_WAIT_BEFORE| |
2306 | SYNC_FILE_RANGE_WRITE| | 2317 | SYNC_FILE_RANGE_WRITE| |
2307 | SYNC_FILE_RANGE_WAIT_AFTER); | 2318 | SYNC_FILE_RANGE_WAIT_AFTER); |
2308 | if (err == 0) { | 2319 | if (err == 0) { |
2309 | written = written_buffered; | 2320 | written = written_buffered; |
2310 | invalidate_mapping_pages(mapping, | 2321 | invalidate_mapping_pages(mapping, |
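The new generic_segment_checks() helper added above replaces two open-coded copies of the same iovec validation (one in the read path, one in the write path). The core trick is that OR-ing the running byte total with the current segment length and casting to ssize_t goes negative if either value has its sign bit set, so one comparison catches both an absurdly large segment and cumulative wraparound. A minimal user-space sketch of that check, with a stubbed-out access_ok() and made-up sample iovecs (the stub and values are illustrative, not kernel code), looks roughly like this:

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/uio.h>		/* struct iovec */

	/* Stand-in for the kernel's access_ok(); always succeeds here. */
	static int access_ok_stub(const void *base, size_t len)
	{
		(void)base; (void)len;
		return 1;
	}

	/* Same shape as generic_segment_checks(): trims nr_segs, returns 0 or -errno. */
	static int segment_checks(const struct iovec *iov, unsigned long *nr_segs, size_t *count)
	{
		size_t cnt = 0;
		for (unsigned long seg = 0; seg < *nr_segs; seg++) {
			const struct iovec *iv = &iov[seg];
			cnt += iv->iov_len;
			/* A negative length or a cumulative wrap both set the sign bit. */
			if ((ssize_t)(cnt | iv->iov_len) < 0)
				return -22;		/* -EINVAL */
			if (access_ok_stub(iv->iov_base, iv->iov_len))
				continue;
			if (seg == 0)
				return -14;		/* -EFAULT */
			*nr_segs = seg;			/* stop before the bad segment */
			cnt -= iv->iov_len;
			break;
		}
		*count = cnt;
		return 0;
	}

	int main(void)
	{
		char a[8], b[8];
		struct iovec iov[2] = { { a, sizeof(a) }, { b, (size_t)-1 } };
		unsigned long nr = 2;
		size_t count;
		printf("ret=%d\n", segment_checks(iov, &nr, &count));	/* -22: second segment wraps */
		return 0;
	}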
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index cbb335813ec0..fa360e566d88 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/uio.h> | 14 | #include <linux/uio.h> |
15 | #include <linux/rmap.h> | 15 | #include <linux/rmap.h> |
16 | #include <linux/sched.h> | ||
16 | #include <asm/tlbflush.h> | 17 | #include <asm/tlbflush.h> |
17 | #include "filemap.h" | 18 | #include "filemap.h" |
18 | 19 | ||
@@ -434,7 +435,6 @@ xip_truncate_page(struct address_space *mapping, loff_t from) | |||
434 | unsigned blocksize; | 435 | unsigned blocksize; |
435 | unsigned length; | 436 | unsigned length; |
436 | struct page *page; | 437 | struct page *page; |
437 | void *kaddr; | ||
438 | 438 | ||
439 | BUG_ON(!mapping->a_ops->get_xip_page); | 439 | BUG_ON(!mapping->a_ops->get_xip_page); |
440 | 440 | ||
@@ -458,11 +458,7 @@ xip_truncate_page(struct address_space *mapping, loff_t from) | |||
458 | else | 458 | else |
459 | return PTR_ERR(page); | 459 | return PTR_ERR(page); |
460 | } | 460 | } |
461 | kaddr = kmap_atomic(page, KM_USER0); | 461 | zero_user_page(page, offset, length, KM_USER0); |
462 | memset(kaddr + offset, 0, length); | ||
463 | kunmap_atomic(kaddr, KM_USER0); | ||
464 | |||
465 | flush_dcache_page(page); | ||
466 | return 0; | 462 | return 0; |
467 | } | 463 | } |
468 | EXPORT_SYMBOL_GPL(xip_truncate_page); | 464 | EXPORT_SYMBOL_GPL(xip_truncate_page); |
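The xip_truncate_page() hunk above collapses an open-coded kmap/memset/kunmap/flush sequence into a single zero_user_page() call. A helper with that behaviour, written in terms of the same primitives the removed lines used, would look roughly like the kernel-context sketch below; this is an illustration of the pattern being factored out, based on the deleted code, not the actual helper in the tree:

	#include <linux/highmem.h>	/* kmap_atomic, kunmap_atomic, flush_dcache_page */
	#include <linux/string.h>	/* memset */

	/* Zero 'length' bytes of 'page' starting at 'offset', via an atomic kmap. */
	static void zero_user_page_sketch(struct page *page, unsigned int offset,
					  unsigned int length, enum km_type kmap_slot)
	{
		void *kaddr = kmap_atomic(page, kmap_slot);

		memset(kaddr + offset, 0, length);
		kunmap_atomic(kaddr, kmap_slot);
		flush_dcache_page(page);	/* keep the D-cache coherent for user mappings */
	}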
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 36db012b38dd..eb7180db3033 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -140,6 +140,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
140 | return page; | 140 | return page; |
141 | 141 | ||
142 | fail: | 142 | fail: |
143 | if (vma->vm_flags & VM_MAYSHARE) | ||
144 | resv_huge_pages++; | ||
143 | spin_unlock(&hugetlb_lock); | 145 | spin_unlock(&hugetlb_lock); |
144 | return NULL; | 146 | return NULL; |
145 | } | 147 | } |
@@ -172,6 +174,17 @@ static int __init hugetlb_setup(char *s) | |||
172 | } | 174 | } |
173 | __setup("hugepages=", hugetlb_setup); | 175 | __setup("hugepages=", hugetlb_setup); |
174 | 176 | ||
177 | static unsigned int cpuset_mems_nr(unsigned int *array) | ||
178 | { | ||
179 | int node; | ||
180 | unsigned int nr = 0; | ||
181 | |||
182 | for_each_node_mask(node, cpuset_current_mems_allowed) | ||
183 | nr += array[node]; | ||
184 | |||
185 | return nr; | ||
186 | } | ||
187 | |||
175 | #ifdef CONFIG_SYSCTL | 188 | #ifdef CONFIG_SYSCTL |
176 | static void update_and_free_page(struct page *page) | 189 | static void update_and_free_page(struct page *page) |
177 | { | 190 | { |
@@ -817,6 +830,26 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to) | |||
817 | chg = region_chg(&inode->i_mapping->private_list, from, to); | 830 | chg = region_chg(&inode->i_mapping->private_list, from, to); |
818 | if (chg < 0) | 831 | if (chg < 0) |
819 | return chg; | 832 | return chg; |
833 | /* | ||
834 | * When cpuset is configured, it breaks the strict hugetlb page | ||
835 | * reservation as the accounting is done on a global variable. Such | ||
836 | * reservation is completely rubbish in the presence of cpuset because | ||
837 | * the reservation is not checked against page availability for the | ||
838 | * current cpuset. Application can still potentially OOM'ed by kernel | ||
839 | * with lack of free htlb page in cpuset that the task is in. | ||
840 | * Attempt to enforce strict accounting with cpuset is almost | ||
841 | * impossible (or too ugly) because cpuset is too fluid that | ||
842 | * task or memory node can be dynamically moved between cpusets. | ||
843 | * | ||
844 | * The change of semantics for shared hugetlb mapping with cpuset is | ||
845 | * undesirable. However, in order to preserve some of the semantics, | ||
846 | * we fall back to check against current free page availability as | ||
847 | * a best attempt and hopefully to minimize the impact of changing | ||
848 | * semantics that cpuset has. | ||
849 | */ | ||
850 | if (chg > cpuset_mems_nr(free_huge_pages_node)) | ||
851 | return -ENOMEM; | ||
852 | |||
820 | ret = hugetlb_acct_memory(chg); | 853 | ret = hugetlb_acct_memory(chg); |
821 | if (ret < 0) | 854 | if (ret < 0) |
822 | return ret; | 855 | return ret; |
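The hugetlb.c hunks do two things: return a reserved page to the pool on the allocation failure path for VM_MAYSHARE mappings, and bound a new reservation by the huge pages actually free on the nodes the current cpuset allows. The second check reduces to summing a per-node counter over a node mask and comparing the result with the requested charge. A stripped-down, stand-alone illustration of that comparison (array size, mask representation, and sample values are made up) is:

	#include <stdio.h>

	#define MAX_NUMNODES 4

	/* Sum a per-node counter over the nodes the task is allowed to use. */
	static unsigned int nodes_total(const unsigned int per_node[], unsigned long allowed_mask)
	{
		unsigned int nr = 0;

		for (int node = 0; node < MAX_NUMNODES; node++)
			if (allowed_mask & (1UL << node))
				nr += per_node[node];
		return nr;
	}

	int main(void)
	{
		unsigned int free_huge_pages_node[MAX_NUMNODES] = { 10, 0, 4, 2 };
		long chg = 16;			/* pages the mapping wants to reserve */
		unsigned long allowed = 0x5;	/* cpuset allows nodes 0 and 2 only */

		if (chg > nodes_total(free_huge_pages_node, allowed))
			puts("reservation refused: not enough free huge pages in this cpuset");
		return 0;
	}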
diff --git a/mm/madvise.c b/mm/madvise.c
index e75096b5a6d3..60542d006ec1 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/syscalls.h> | 10 | #include <linux/syscalls.h> |
11 | #include <linux/mempolicy.h> | 11 | #include <linux/mempolicy.h> |
12 | #include <linux/hugetlb.h> | 12 | #include <linux/hugetlb.h> |
13 | #include <linux/sched.h> | ||
13 | 14 | ||
14 | /* | 15 | /* |
15 | * Any behaviour which results in changes to the vma->vm_flags needs to | 16 | * Any behaviour which results in changes to the vma->vm_flags needs to |
diff --git a/mm/memory.c b/mm/memory.c
index 1d647ab0ee72..cb94488ab96d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -481,7 +481,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
481 | page = vm_normal_page(vma, addr, pte); | 481 | page = vm_normal_page(vma, addr, pte); |
482 | if (page) { | 482 | if (page) { |
483 | get_page(page); | 483 | get_page(page); |
484 | page_dup_rmap(page); | 484 | page_dup_rmap(page, vma, addr); |
485 | rss[!!PageAnon(page)]++; | 485 | rss[!!PageAnon(page)]++; |
486 | } | 486 | } |
487 | 487 | ||
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 84279127fcd3..df9d554bea30 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -65,7 +65,7 @@ static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) | |||
65 | int zone_type; | 65 | int zone_type; |
66 | 66 | ||
67 | zone_type = zone - pgdat->node_zones; | 67 | zone_type = zone - pgdat->node_zones; |
68 | if (!populated_zone(zone)) { | 68 | if (!zone->wait_table) { |
69 | int ret = 0; | 69 | int ret = 0; |
70 | ret = init_currently_empty_zone(zone, phys_start_pfn, | 70 | ret = init_currently_empty_zone(zone, phys_start_pfn, |
71 | nr_pages, MEMMAP_HOTPLUG); | 71 | nr_pages, MEMMAP_HOTPLUG); |
diff --git a/mm/mlock.c b/mm/mlock.c
index 3446b7ef731e..4d3fea267e0d 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -10,7 +10,18 @@ | |||
10 | #include <linux/mm.h> | 10 | #include <linux/mm.h> |
11 | #include <linux/mempolicy.h> | 11 | #include <linux/mempolicy.h> |
12 | #include <linux/syscalls.h> | 12 | #include <linux/syscalls.h> |
13 | #include <linux/sched.h> | ||
14 | #include <linux/module.h> | ||
13 | 15 | ||
16 | int can_do_mlock(void) | ||
17 | { | ||
18 | if (capable(CAP_IPC_LOCK)) | ||
19 | return 1; | ||
20 | if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) | ||
21 | return 1; | ||
22 | return 0; | ||
23 | } | ||
24 | EXPORT_SYMBOL(can_do_mlock); | ||
14 | 25 | ||
15 | static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, | 26 | static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, |
16 | unsigned long start, unsigned long end, unsigned int newflags) | 27 | unsigned long start, unsigned long end, unsigned int newflags) |
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1366,7 +1366,6 @@ unsigned long | |||
1366 | get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, | 1366 | get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, |
1367 | unsigned long pgoff, unsigned long flags) | 1367 | unsigned long pgoff, unsigned long flags) |
1368 | { | 1368 | { |
1369 | unsigned long ret; | ||
1370 | unsigned long (*get_area)(struct file *, unsigned long, | 1369 | unsigned long (*get_area)(struct file *, unsigned long, |
1371 | unsigned long, unsigned long, unsigned long); | 1370 | unsigned long, unsigned long, unsigned long); |
1372 | 1371 | ||
@@ -1721,7 +1720,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, | |||
1721 | 1720 | ||
1722 | /* | 1721 | /* |
1723 | * Split a vma into two pieces at address 'addr', a new vma is allocated | 1722 | * Split a vma into two pieces at address 'addr', a new vma is allocated |
1724 | * either for the first part or the the tail. | 1723 | * either for the first part or the tail. |
1725 | */ | 1724 | */ |
1726 | int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | 1725 | int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, |
1727 | unsigned long addr, int new_below) | 1726 | unsigned long addr, int new_below) |
diff --git a/mm/msync.c b/mm/msync.c
index 358d73cf7b78..144a7570535d 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/mman.h> | 12 | #include <linux/mman.h> |
13 | #include <linux/file.h> | 13 | #include <linux/file.h> |
14 | #include <linux/syscalls.h> | 14 | #include <linux/syscalls.h> |
15 | #include <linux/sched.h> | ||
15 | 16 | ||
16 | /* | 17 | /* |
17 | * MS_SYNC syncs the entire file - including mappings. | 18 | * MS_SYNC syncs the entire file - including mappings. |
diff --git a/mm/nommu.c b/mm/nommu.c
index 1f60194d9b9b..2b16b00a5b11 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -262,6 +262,14 @@ void vunmap(void *addr) | |||
262 | } | 262 | } |
263 | 263 | ||
264 | /* | 264 | /* |
265 | * Implement a stub for vmalloc_sync_all() if the architecture chose not to | ||
266 | * have one. | ||
267 | */ | ||
268 | void __attribute__((weak)) vmalloc_sync_all(void) | ||
269 | { | ||
270 | } | ||
271 | |||
272 | /* | ||
265 | * sys_brk() for the most part doesn't need the global kernel | 273 | * sys_brk() for the most part doesn't need the global kernel |
266 | * lock, except when an application is doing something nasty | 274 | * lock, except when an application is doing something nasty |
267 | * like trying to un-brk an area that has already been mapped | 275 | * like trying to un-brk an area that has already been mapped |
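The nommu.c hunk adds a weak definition of vmalloc_sync_all() so that architectures which never provide their own still link. The mechanism behind it is the GCC/ELF weak-symbol rule: a weak definition is used only when no strong definition of the same name appears anywhere in the link. A small, self-contained illustration of that rule, with arbitrary names (the commented-out second file is the hypothetical override):

	/* default.c: weak fallback, used only when nothing stronger is linked in. */
	#include <stdio.h>

	void __attribute__((weak)) sync_mappings(void)
	{
		/* no-op fallback */
	}

	int main(void)
	{
		sync_mappings();
		puts("done");
		return 0;
	}

	/* arch.c (optional): linking this strong definition in replaces the weak stub.
	 *
	 * void sync_mappings(void)
	 * {
	 *	puts("arch-specific sync");
	 * }
	 */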
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 029dfad5a235..eec1481ba44f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -588,31 +588,27 @@ void __init page_writeback_init(void) | |||
588 | } | 588 | } |
589 | 589 | ||
590 | /** | 590 | /** |
591 | * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. | 591 | * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. |
592 | * @mapping: address space structure to write | 592 | * @mapping: address space structure to write |
593 | * @wbc: subtract the number of written pages from *@wbc->nr_to_write | 593 | * @wbc: subtract the number of written pages from *@wbc->nr_to_write |
594 | * @writepage: function called for each page | ||
595 | * @data: data passed to writepage function | ||
594 | * | 596 | * |
595 | * This is a library function, which implements the writepages() | 597 | * If a page is already under I/O, write_cache_pages() skips it, even |
596 | * address_space_operation. | ||
597 | * | ||
598 | * If a page is already under I/O, generic_writepages() skips it, even | ||
599 | * if it's dirty. This is desirable behaviour for memory-cleaning writeback, | 598 | * if it's dirty. This is desirable behaviour for memory-cleaning writeback, |
600 | * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() | 599 | * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() |
601 | * and msync() need to guarantee that all the data which was dirty at the time | 600 | * and msync() need to guarantee that all the data which was dirty at the time |
602 | * the call was made get new I/O started against them. If wbc->sync_mode is | 601 | * the call was made get new I/O started against them. If wbc->sync_mode is |
603 | * WB_SYNC_ALL then we were called for data integrity and we must wait for | 602 | * WB_SYNC_ALL then we were called for data integrity and we must wait for |
604 | * existing IO to complete. | 603 | * existing IO to complete. |
605 | * | ||
606 | * Derived from mpage_writepages() - if you fix this you should check that | ||
607 | * also! | ||
608 | */ | 604 | */ |
609 | int generic_writepages(struct address_space *mapping, | 605 | int write_cache_pages(struct address_space *mapping, |
610 | struct writeback_control *wbc) | 606 | struct writeback_control *wbc, writepage_t writepage, |
607 | void *data) | ||
611 | { | 608 | { |
612 | struct backing_dev_info *bdi = mapping->backing_dev_info; | 609 | struct backing_dev_info *bdi = mapping->backing_dev_info; |
613 | int ret = 0; | 610 | int ret = 0; |
614 | int done = 0; | 611 | int done = 0; |
615 | int (*writepage)(struct page *page, struct writeback_control *wbc); | ||
616 | struct pagevec pvec; | 612 | struct pagevec pvec; |
617 | int nr_pages; | 613 | int nr_pages; |
618 | pgoff_t index; | 614 | pgoff_t index; |
@@ -625,12 +621,6 @@ int generic_writepages(struct address_space *mapping, | |||
625 | return 0; | 621 | return 0; |
626 | } | 622 | } |
627 | 623 | ||
628 | writepage = mapping->a_ops->writepage; | ||
629 | |||
630 | /* deal with chardevs and other special file */ | ||
631 | if (!writepage) | ||
632 | return 0; | ||
633 | |||
634 | pagevec_init(&pvec, 0); | 624 | pagevec_init(&pvec, 0); |
635 | if (wbc->range_cyclic) { | 625 | if (wbc->range_cyclic) { |
636 | index = mapping->writeback_index; /* Start from prev offset */ | 626 | index = mapping->writeback_index; /* Start from prev offset */ |
@@ -682,13 +672,7 @@ retry: | |||
682 | continue; | 672 | continue; |
683 | } | 673 | } |
684 | 674 | ||
685 | ret = (*writepage)(page, wbc); | 675 | ret = (*writepage)(page, wbc, data); |
686 | if (ret) { | ||
687 | if (ret == -ENOSPC) | ||
688 | set_bit(AS_ENOSPC, &mapping->flags); | ||
689 | else | ||
690 | set_bit(AS_EIO, &mapping->flags); | ||
691 | } | ||
692 | 676 | ||
693 | if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) | 677 | if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) |
694 | unlock_page(page); | 678 | unlock_page(page); |
@@ -715,6 +699,38 @@ retry: | |||
715 | mapping->writeback_index = index; | 699 | mapping->writeback_index = index; |
716 | return ret; | 700 | return ret; |
717 | } | 701 | } |
702 | EXPORT_SYMBOL(write_cache_pages); | ||
703 | |||
704 | /* | ||
705 | * Function used by generic_writepages to call the real writepage | ||
706 | * function and set the mapping flags on error | ||
707 | */ | ||
708 | static int __writepage(struct page *page, struct writeback_control *wbc, | ||
709 | void *data) | ||
710 | { | ||
711 | struct address_space *mapping = data; | ||
712 | int ret = mapping->a_ops->writepage(page, wbc); | ||
713 | mapping_set_error(mapping, ret); | ||
714 | return ret; | ||
715 | } | ||
716 | |||
717 | /** | ||
718 | * generic_writepages - walk the list of dirty pages of the given address space and writepage() all of them. | ||
719 | * @mapping: address space structure to write | ||
720 | * @wbc: subtract the number of written pages from *@wbc->nr_to_write | ||
721 | * | ||
722 | * This is a library function, which implements the writepages() | ||
723 | * address_space_operation. | ||
724 | */ | ||
725 | int generic_writepages(struct address_space *mapping, | ||
726 | struct writeback_control *wbc) | ||
727 | { | ||
728 | /* deal with chardevs and other special file */ | ||
729 | if (!mapping->a_ops->writepage) | ||
730 | return 0; | ||
731 | |||
732 | return write_cache_pages(mapping, wbc, __writepage, mapping); | ||
733 | } | ||
718 | 734 | ||
719 | EXPORT_SYMBOL(generic_writepages); | 735 | EXPORT_SYMBOL(generic_writepages); |
720 | 736 | ||
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 59164313167f..bd8e33582d25 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -103,7 +103,7 @@ int min_free_kbytes = 1024; | |||
103 | 103 | ||
104 | unsigned long __meminitdata nr_kernel_pages; | 104 | unsigned long __meminitdata nr_kernel_pages; |
105 | unsigned long __meminitdata nr_all_pages; | 105 | unsigned long __meminitdata nr_all_pages; |
106 | static unsigned long __initdata dma_reserve; | 106 | static unsigned long __meminitdata dma_reserve; |
107 | 107 | ||
108 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | 108 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP |
109 | /* | 109 | /* |
@@ -126,16 +126,21 @@ static unsigned long __initdata dma_reserve; | |||
126 | #endif | 126 | #endif |
127 | #endif | 127 | #endif |
128 | 128 | ||
129 | struct node_active_region __initdata early_node_map[MAX_ACTIVE_REGIONS]; | 129 | struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS]; |
130 | int __initdata nr_nodemap_entries; | 130 | int __meminitdata nr_nodemap_entries; |
131 | unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; | 131 | unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; |
132 | unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; | 132 | unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; |
133 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE | 133 | #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE |
134 | unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; | 134 | unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES]; |
135 | unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; | 135 | unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES]; |
136 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ | 136 | #endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ |
137 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ | 137 | #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ |
138 | 138 | ||
139 | #if MAX_NUMNODES > 1 | ||
140 | int nr_node_ids __read_mostly = MAX_NUMNODES; | ||
141 | EXPORT_SYMBOL(nr_node_ids); | ||
142 | #endif | ||
143 | |||
139 | #ifdef CONFIG_DEBUG_VM | 144 | #ifdef CONFIG_DEBUG_VM |
140 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) | 145 | static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
141 | { | 146 | { |
@@ -669,65 +674,28 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, | |||
669 | return i; | 674 | return i; |
670 | } | 675 | } |
671 | 676 | ||
672 | #if MAX_NUMNODES > 1 | ||
673 | int nr_node_ids __read_mostly = MAX_NUMNODES; | ||
674 | EXPORT_SYMBOL(nr_node_ids); | ||
675 | |||
676 | /* | ||
677 | * Figure out the number of possible node ids. | ||
678 | */ | ||
679 | static void __init setup_nr_node_ids(void) | ||
680 | { | ||
681 | unsigned int node; | ||
682 | unsigned int highest = 0; | ||
683 | |||
684 | for_each_node_mask(node, node_possible_map) | ||
685 | highest = node; | ||
686 | nr_node_ids = highest + 1; | ||
687 | } | ||
688 | #else | ||
689 | static void __init setup_nr_node_ids(void) {} | ||
690 | #endif | ||
691 | |||
692 | #ifdef CONFIG_NUMA | 677 | #ifdef CONFIG_NUMA |
693 | /* | 678 | /* |
694 | * Called from the slab reaper to drain pagesets on a particular node that | 679 | * Called from the vmstat counter updater to drain pagesets of this |
695 | * belongs to the currently executing processor. | 680 | * currently executing processor on remote nodes after they have |
681 | * expired. | ||
682 | * | ||
696 | * Note that this function must be called with the thread pinned to | 683 | * Note that this function must be called with the thread pinned to |
697 | * a single processor. | 684 | * a single processor. |
698 | */ | 685 | */ |
699 | void drain_node_pages(int nodeid) | 686 | void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) |
700 | { | 687 | { |
701 | int i; | ||
702 | enum zone_type z; | ||
703 | unsigned long flags; | 688 | unsigned long flags; |
689 | int to_drain; | ||
704 | 690 | ||
705 | for (z = 0; z < MAX_NR_ZONES; z++) { | 691 | local_irq_save(flags); |
706 | struct zone *zone = NODE_DATA(nodeid)->node_zones + z; | 692 | if (pcp->count >= pcp->batch) |
707 | struct per_cpu_pageset *pset; | 693 | to_drain = pcp->batch; |
708 | 694 | else | |
709 | if (!populated_zone(zone)) | 695 | to_drain = pcp->count; |
710 | continue; | 696 | free_pages_bulk(zone, to_drain, &pcp->list, 0); |
711 | 697 | pcp->count -= to_drain; | |
712 | pset = zone_pcp(zone, smp_processor_id()); | 698 | local_irq_restore(flags); |
713 | for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { | ||
714 | struct per_cpu_pages *pcp; | ||
715 | |||
716 | pcp = &pset->pcp[i]; | ||
717 | if (pcp->count) { | ||
718 | int to_drain; | ||
719 | |||
720 | local_irq_save(flags); | ||
721 | if (pcp->count >= pcp->batch) | ||
722 | to_drain = pcp->batch; | ||
723 | else | ||
724 | to_drain = pcp->count; | ||
725 | free_pages_bulk(zone, to_drain, &pcp->list, 0); | ||
726 | pcp->count -= to_drain; | ||
727 | local_irq_restore(flags); | ||
728 | } | ||
729 | } | ||
730 | } | ||
731 | } | 699 | } |
732 | #endif | 700 | #endif |
733 | 701 | ||
@@ -2148,11 +2116,14 @@ static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, | |||
2148 | 2116 | ||
2149 | switch (action) { | 2117 | switch (action) { |
2150 | case CPU_UP_PREPARE: | 2118 | case CPU_UP_PREPARE: |
2119 | case CPU_UP_PREPARE_FROZEN: | ||
2151 | if (process_zones(cpu)) | 2120 | if (process_zones(cpu)) |
2152 | ret = NOTIFY_BAD; | 2121 | ret = NOTIFY_BAD; |
2153 | break; | 2122 | break; |
2154 | case CPU_UP_CANCELED: | 2123 | case CPU_UP_CANCELED: |
2124 | case CPU_UP_CANCELED_FROZEN: | ||
2155 | case CPU_DEAD: | 2125 | case CPU_DEAD: |
2126 | case CPU_DEAD_FROZEN: | ||
2156 | free_zone_pagesets(cpu); | 2127 | free_zone_pagesets(cpu); |
2157 | break; | 2128 | break; |
2158 | default: | 2129 | default: |
@@ -2179,7 +2150,7 @@ void __init setup_per_cpu_pageset(void) | |||
2179 | 2150 | ||
2180 | #endif | 2151 | #endif |
2181 | 2152 | ||
2182 | static __meminit | 2153 | static noinline __init_refok |
2183 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) | 2154 | int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) |
2184 | { | 2155 | { |
2185 | int i; | 2156 | int i; |
@@ -2267,7 +2238,7 @@ __meminit int init_currently_empty_zone(struct zone *zone, | |||
2267 | * Basic iterator support. Return the first range of PFNs for a node | 2238 | * Basic iterator support. Return the first range of PFNs for a node |
2268 | * Note: nid == MAX_NUMNODES returns first region regardless of node | 2239 | * Note: nid == MAX_NUMNODES returns first region regardless of node |
2269 | */ | 2240 | */ |
2270 | static int __init first_active_region_index_in_nid(int nid) | 2241 | static int __meminit first_active_region_index_in_nid(int nid) |
2271 | { | 2242 | { |
2272 | int i; | 2243 | int i; |
2273 | 2244 | ||
@@ -2282,7 +2253,7 @@ static int __init first_active_region_index_in_nid(int nid) | |||
2282 | * Basic iterator support. Return the next active range of PFNs for a node | 2253 | * Basic iterator support. Return the next active range of PFNs for a node |
2283 | * Note: nid == MAX_NUMNODES returns next region regardles of node | 2254 | * Note: nid == MAX_NUMNODES returns next region regardles of node |
2284 | */ | 2255 | */ |
2285 | static int __init next_active_region_index_in_nid(int index, int nid) | 2256 | static int __meminit next_active_region_index_in_nid(int index, int nid) |
2286 | { | 2257 | { |
2287 | for (index = index + 1; index < nr_nodemap_entries; index++) | 2258 | for (index = index + 1; index < nr_nodemap_entries; index++) |
2288 | if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) | 2259 | if (nid == MAX_NUMNODES || early_node_map[index].nid == nid) |
@@ -2298,7 +2269,7 @@ static int __init next_active_region_index_in_nid(int index, int nid) | |||
2298 | * was used and there are no special requirements, this is a convenient | 2269 | * was used and there are no special requirements, this is a convenient |
2299 | * alternative | 2270 | * alternative |
2300 | */ | 2271 | */ |
2301 | int __init early_pfn_to_nid(unsigned long pfn) | 2272 | int __meminit early_pfn_to_nid(unsigned long pfn) |
2302 | { | 2273 | { |
2303 | int i; | 2274 | int i; |
2304 | 2275 | ||
@@ -2435,7 +2406,7 @@ static void __init account_node_boundary(unsigned int nid, | |||
2435 | * with no available memory, a warning is printed and the start and end | 2406 | * with no available memory, a warning is printed and the start and end |
2436 | * PFNs will be 0. | 2407 | * PFNs will be 0. |
2437 | */ | 2408 | */ |
2438 | void __init get_pfn_range_for_nid(unsigned int nid, | 2409 | void __meminit get_pfn_range_for_nid(unsigned int nid, |
2439 | unsigned long *start_pfn, unsigned long *end_pfn) | 2410 | unsigned long *start_pfn, unsigned long *end_pfn) |
2440 | { | 2411 | { |
2441 | int i; | 2412 | int i; |
@@ -2460,7 +2431,7 @@ void __init get_pfn_range_for_nid(unsigned int nid, | |||
2460 | * Return the number of pages a zone spans in a node, including holes | 2431 | * Return the number of pages a zone spans in a node, including holes |
2461 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() | 2432 | * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node() |
2462 | */ | 2433 | */ |
2463 | unsigned long __init zone_spanned_pages_in_node(int nid, | 2434 | unsigned long __meminit zone_spanned_pages_in_node(int nid, |
2464 | unsigned long zone_type, | 2435 | unsigned long zone_type, |
2465 | unsigned long *ignored) | 2436 | unsigned long *ignored) |
2466 | { | 2437 | { |
@@ -2488,7 +2459,7 @@ unsigned long __init zone_spanned_pages_in_node(int nid, | |||
2488 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, | 2459 | * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, |
2489 | * then all holes in the requested range will be accounted for. | 2460 | * then all holes in the requested range will be accounted for. |
2490 | */ | 2461 | */ |
2491 | unsigned long __init __absent_pages_in_range(int nid, | 2462 | unsigned long __meminit __absent_pages_in_range(int nid, |
2492 | unsigned long range_start_pfn, | 2463 | unsigned long range_start_pfn, |
2493 | unsigned long range_end_pfn) | 2464 | unsigned long range_end_pfn) |
2494 | { | 2465 | { |
@@ -2548,7 +2519,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, | |||
2548 | } | 2519 | } |
2549 | 2520 | ||
2550 | /* Return the number of page frames in holes in a zone on a node */ | 2521 | /* Return the number of page frames in holes in a zone on a node */ |
2551 | unsigned long __init zone_absent_pages_in_node(int nid, | 2522 | unsigned long __meminit zone_absent_pages_in_node(int nid, |
2552 | unsigned long zone_type, | 2523 | unsigned long zone_type, |
2553 | unsigned long *ignored) | 2524 | unsigned long *ignored) |
2554 | { | 2525 | { |
@@ -2584,7 +2555,7 @@ static inline unsigned long zone_absent_pages_in_node(int nid, | |||
2584 | 2555 | ||
2585 | #endif | 2556 | #endif |
2586 | 2557 | ||
2587 | static void __init calculate_node_totalpages(struct pglist_data *pgdat, | 2558 | static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, |
2588 | unsigned long *zones_size, unsigned long *zholes_size) | 2559 | unsigned long *zones_size, unsigned long *zholes_size) |
2589 | { | 2560 | { |
2590 | unsigned long realtotalpages, totalpages = 0; | 2561 | unsigned long realtotalpages, totalpages = 0; |
@@ -2692,7 +2663,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat, | |||
2692 | } | 2663 | } |
2693 | } | 2664 | } |
2694 | 2665 | ||
2695 | static void __init alloc_node_mem_map(struct pglist_data *pgdat) | 2666 | static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) |
2696 | { | 2667 | { |
2697 | /* Skip empty nodes */ | 2668 | /* Skip empty nodes */ |
2698 | if (!pgdat->node_spanned_pages) | 2669 | if (!pgdat->node_spanned_pages) |
@@ -2718,7 +2689,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) | |||
2718 | map = alloc_bootmem_node(pgdat, size); | 2689 | map = alloc_bootmem_node(pgdat, size); |
2719 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); | 2690 | pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); |
2720 | } | 2691 | } |
2721 | #ifdef CONFIG_FLATMEM | 2692 | #ifndef CONFIG_NEED_MULTIPLE_NODES |
2722 | /* | 2693 | /* |
2723 | * With no DISCONTIG, the global mem_map is just set as node 0's | 2694 | * With no DISCONTIG, the global mem_map is just set as node 0's |
2724 | */ | 2695 | */ |
@@ -2747,6 +2718,26 @@ void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, | |||
2747 | } | 2718 | } |
2748 | 2719 | ||
2749 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP | 2720 | #ifdef CONFIG_ARCH_POPULATES_NODE_MAP |
2721 | |||
2722 | #if MAX_NUMNODES > 1 | ||
2723 | /* | ||
2724 | * Figure out the number of possible node ids. | ||
2725 | */ | ||
2726 | static void __init setup_nr_node_ids(void) | ||
2727 | { | ||
2728 | unsigned int node; | ||
2729 | unsigned int highest = 0; | ||
2730 | |||
2731 | for_each_node_mask(node, node_possible_map) | ||
2732 | highest = node; | ||
2733 | nr_node_ids = highest + 1; | ||
2734 | } | ||
2735 | #else | ||
2736 | static inline void setup_nr_node_ids(void) | ||
2737 | { | ||
2738 | } | ||
2739 | #endif | ||
2740 | |||
2750 | /** | 2741 | /** |
2751 | * add_active_range - Register a range of PFNs backed by physical memory | 2742 | * add_active_range - Register a range of PFNs backed by physical memory |
2752 | * @nid: The node ID the range resides on | 2743 | * @nid: The node ID the range resides on |
@@ -3012,7 +3003,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self, | |||
3012 | { | 3003 | { |
3013 | int cpu = (unsigned long)hcpu; | 3004 | int cpu = (unsigned long)hcpu; |
3014 | 3005 | ||
3015 | if (action == CPU_DEAD) { | 3006 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { |
3016 | local_irq_disable(); | 3007 | local_irq_disable(); |
3017 | __drain_pages(cpu); | 3008 | __drain_pages(cpu); |
3018 | vm_events_fold_cpu(cpu); | 3009 | vm_events_fold_cpu(cpu); |
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -162,12 +162,10 @@ void anon_vma_unlink(struct vm_area_struct *vma) | |||
162 | static void anon_vma_ctor(void *data, struct kmem_cache *cachep, | 162 | static void anon_vma_ctor(void *data, struct kmem_cache *cachep, |
163 | unsigned long flags) | 163 | unsigned long flags) |
164 | { | 164 | { |
165 | if (flags & SLAB_CTOR_CONSTRUCTOR) { | 165 | struct anon_vma *anon_vma = data; |
166 | struct anon_vma *anon_vma = data; | ||
167 | 166 | ||
168 | spin_lock_init(&anon_vma->lock); | 167 | spin_lock_init(&anon_vma->lock); |
169 | INIT_LIST_HEAD(&anon_vma->head); | 168 | INIT_LIST_HEAD(&anon_vma->head); |
170 | } | ||
171 | } | 169 | } |
172 | 170 | ||
173 | void __init anon_vma_init(void) | 171 | void __init anon_vma_init(void) |
@@ -505,6 +503,7 @@ int page_mkclean(struct page *page) | |||
505 | 503 | ||
506 | return ret; | 504 | return ret; |
507 | } | 505 | } |
506 | EXPORT_SYMBOL_GPL(page_mkclean); | ||
508 | 507 | ||
509 | /** | 508 | /** |
510 | * page_set_anon_rmap - setup new anonymous rmap | 509 | * page_set_anon_rmap - setup new anonymous rmap |
@@ -531,19 +530,51 @@ static void __page_set_anon_rmap(struct page *page, | |||
531 | } | 530 | } |
532 | 531 | ||
533 | /** | 532 | /** |
533 | * page_set_anon_rmap - sanity check anonymous rmap addition | ||
534 | * @page: the page to add the mapping to | ||
535 | * @vma: the vm area in which the mapping is added | ||
536 | * @address: the user virtual address mapped | ||
537 | */ | ||
538 | static void __page_check_anon_rmap(struct page *page, | ||
539 | struct vm_area_struct *vma, unsigned long address) | ||
540 | { | ||
541 | #ifdef CONFIG_DEBUG_VM | ||
542 | /* | ||
543 | * The page's anon-rmap details (mapping and index) are guaranteed to | ||
544 | * be set up correctly at this point. | ||
545 | * | ||
546 | * We have exclusion against page_add_anon_rmap because the caller | ||
547 | * always holds the page locked, except if called from page_dup_rmap, | ||
548 | * in which case the page is already known to be setup. | ||
549 | * | ||
550 | * We have exclusion against page_add_new_anon_rmap because those pages | ||
551 | * are initially only visible via the pagetables, and the pte is locked | ||
552 | * over the call to page_add_new_anon_rmap. | ||
553 | */ | ||
554 | struct anon_vma *anon_vma = vma->anon_vma; | ||
555 | anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; | ||
556 | BUG_ON(page->mapping != (struct address_space *)anon_vma); | ||
557 | BUG_ON(page->index != linear_page_index(vma, address)); | ||
558 | #endif | ||
559 | } | ||
560 | |||
561 | /** | ||
534 | * page_add_anon_rmap - add pte mapping to an anonymous page | 562 | * page_add_anon_rmap - add pte mapping to an anonymous page |
535 | * @page: the page to add the mapping to | 563 | * @page: the page to add the mapping to |
536 | * @vma: the vm area in which the mapping is added | 564 | * @vma: the vm area in which the mapping is added |
537 | * @address: the user virtual address mapped | 565 | * @address: the user virtual address mapped |
538 | * | 566 | * |
539 | * The caller needs to hold the pte lock. | 567 | * The caller needs to hold the pte lock and the page must be locked. |
540 | */ | 568 | */ |
541 | void page_add_anon_rmap(struct page *page, | 569 | void page_add_anon_rmap(struct page *page, |
542 | struct vm_area_struct *vma, unsigned long address) | 570 | struct vm_area_struct *vma, unsigned long address) |
543 | { | 571 | { |
572 | VM_BUG_ON(!PageLocked(page)); | ||
573 | VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); | ||
544 | if (atomic_inc_and_test(&page->_mapcount)) | 574 | if (atomic_inc_and_test(&page->_mapcount)) |
545 | __page_set_anon_rmap(page, vma, address); | 575 | __page_set_anon_rmap(page, vma, address); |
546 | /* else checking page index and mapping is racy */ | 576 | else |
577 | __page_check_anon_rmap(page, vma, address); | ||
547 | } | 578 | } |
548 | 579 | ||
549 | /* | 580 | /* |
@@ -554,10 +585,12 @@ void page_add_anon_rmap(struct page *page, | |||
554 | * | 585 | * |
555 | * Same as page_add_anon_rmap but must only be called on *new* pages. | 586 | * Same as page_add_anon_rmap but must only be called on *new* pages. |
556 | * This means the inc-and-test can be bypassed. | 587 | * This means the inc-and-test can be bypassed. |
588 | * Page does not have to be locked. | ||
557 | */ | 589 | */ |
558 | void page_add_new_anon_rmap(struct page *page, | 590 | void page_add_new_anon_rmap(struct page *page, |
559 | struct vm_area_struct *vma, unsigned long address) | 591 | struct vm_area_struct *vma, unsigned long address) |
560 | { | 592 | { |
593 | BUG_ON(address < vma->vm_start || address >= vma->vm_end); | ||
561 | atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ | 594 | atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ |
562 | __page_set_anon_rmap(page, vma, address); | 595 | __page_set_anon_rmap(page, vma, address); |
563 | } | 596 | } |
@@ -574,6 +607,26 @@ void page_add_file_rmap(struct page *page) | |||
574 | __inc_zone_page_state(page, NR_FILE_MAPPED); | 607 | __inc_zone_page_state(page, NR_FILE_MAPPED); |
575 | } | 608 | } |
576 | 609 | ||
610 | #ifdef CONFIG_DEBUG_VM | ||
611 | /** | ||
612 | * page_dup_rmap - duplicate pte mapping to a page | ||
613 | * @page: the page to add the mapping to | ||
614 | * | ||
615 | * For copy_page_range only: minimal extract from page_add_file_rmap / | ||
616 | * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's | ||
617 | * quicker. | ||
618 | * | ||
619 | * The caller needs to hold the pte lock. | ||
620 | */ | ||
621 | void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) | ||
622 | { | ||
623 | BUG_ON(page_mapcount(page) == 0); | ||
624 | if (PageAnon(page)) | ||
625 | __page_check_anon_rmap(page, vma, address); | ||
626 | atomic_inc(&page->_mapcount); | ||
627 | } | ||
628 | #endif | ||
629 | |||
577 | /** | 630 | /** |
578 | * page_remove_rmap - take down pte mapping from a page | 631 | * page_remove_rmap - take down pte mapping from a page |
579 | * @page: page to remove mapping from | 632 | * @page: page to remove mapping from |
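The rmap.c hunks turn the old "checking page index and mapping is racy" comment into an explicit __page_check_anon_rmap() whose body exists only under CONFIG_DEBUG_VM, and page_dup_rmap() grows vma/address arguments so it can run the same check. The pattern worth noting is "sanity checks that compile to nothing in production builds". A self-contained sketch of that pattern, with a hypothetical DEBUG_CHECKS switch standing in for CONFIG_DEBUG_VM:

	#include <assert.h>
	#include <stdio.h>

	/* Build with -DDEBUG_CHECKS to enable the extra verification. */
	#ifdef DEBUG_CHECKS
	static void check_mapping(const void *mapping, const void *expected)
	{
		assert(mapping == expected && "anon page points at the wrong anon_vma");
	}
	#else
	static inline void check_mapping(const void *mapping, const void *expected)
	{
		(void)mapping; (void)expected;	/* compiled out in production */
	}
	#endif

	int main(void)
	{
		int anon_vma;
		const void *mapping = &anon_vma;

		check_mapping(mapping, &anon_vma);
		puts("mapping consistent");
		return 0;
	}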
diff --git a/mm/shmem.c b/mm/shmem.c
index f01e8deed645..e537317bec4d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2358,13 +2358,11 @@ static void init_once(void *foo, struct kmem_cache *cachep, | |||
2358 | { | 2358 | { |
2359 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; | 2359 | struct shmem_inode_info *p = (struct shmem_inode_info *) foo; |
2360 | 2360 | ||
2361 | if (flags & SLAB_CTOR_CONSTRUCTOR) { | 2361 | inode_init_once(&p->vfs_inode); |
2362 | inode_init_once(&p->vfs_inode); | ||
2363 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2362 | #ifdef CONFIG_TMPFS_POSIX_ACL |
2364 | p->i_acl = NULL; | 2363 | p->i_acl = NULL; |
2365 | p->i_default_acl = NULL; | 2364 | p->i_default_acl = NULL; |
2366 | #endif | 2365 | #endif |
2367 | } | ||
2368 | } | 2366 | } |
2369 | 2367 | ||
2370 | static int init_inodecache(void) | 2368 | static int init_inodecache(void) |
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -148,10 +148,11 @@ | |||
148 | * Usually, the kmalloc caches are cache_line_size() aligned, except when | 148 | * Usually, the kmalloc caches are cache_line_size() aligned, except when |
149 | * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. | 149 | * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. |
150 | * Some archs want to perform DMA into kmalloc caches and need a guaranteed | 150 | * Some archs want to perform DMA into kmalloc caches and need a guaranteed |
151 | * alignment larger than BYTES_PER_WORD. ARCH_KMALLOC_MINALIGN allows that. | 151 | * alignment larger than the alignment of a 64-bit integer. |
152 | * Note that this flag disables some debug features. | 152 | * ARCH_KMALLOC_MINALIGN allows that. |
153 | * Note that increasing this value may disable some debug features. | ||
153 | */ | 154 | */ |
154 | #define ARCH_KMALLOC_MINALIGN 0 | 155 | #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) |
155 | #endif | 156 | #endif |
156 | 157 | ||
157 | #ifndef ARCH_SLAB_MINALIGN | 158 | #ifndef ARCH_SLAB_MINALIGN |
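The first slab.c hunk raises the default ARCH_KMALLOC_MINALIGN from 0 to __alignof__(unsigned long long), so kmalloc() objects are always aligned well enough for 64-bit accesses even on 32-bit targets. __alignof__ is evaluated at compile time and can differ from sizeof on some ABIs (classic i386, for example, aligns an 8-byte long long to 4 bytes), which a few lines of user-space C make visible:

	#include <stdio.h>

	int main(void)
	{
		printf("sizeof(unsigned long long)      = %zu\n",
		       sizeof(unsigned long long));
		printf("__alignof__(unsigned long long) = %zu\n",
		       (size_t)__alignof__(unsigned long long));
		/* An allocator honouring the second value as its minimum alignment can
		 * hand the returned memory straight to code doing 64-bit loads/stores. */
		return 0;
	}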
@@ -408,9 +409,6 @@ struct kmem_cache { | |||
408 | /* constructor func */ | 409 | /* constructor func */ |
409 | void (*ctor) (void *, struct kmem_cache *, unsigned long); | 410 | void (*ctor) (void *, struct kmem_cache *, unsigned long); |
410 | 411 | ||
411 | /* de-constructor func */ | ||
412 | void (*dtor) (void *, struct kmem_cache *, unsigned long); | ||
413 | |||
414 | /* 5) cache creation/removal */ | 412 | /* 5) cache creation/removal */ |
415 | const char *name; | 413 | const char *name; |
416 | struct list_head next; | 414 | struct list_head next; |
@@ -536,19 +534,22 @@ static int obj_size(struct kmem_cache *cachep) | |||
536 | return cachep->obj_size; | 534 | return cachep->obj_size; |
537 | } | 535 | } |
538 | 536 | ||
539 | static unsigned long *dbg_redzone1(struct kmem_cache *cachep, void *objp) | 537 | static unsigned long long *dbg_redzone1(struct kmem_cache *cachep, void *objp) |
540 | { | 538 | { |
541 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); | 539 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); |
542 | return (unsigned long*) (objp+obj_offset(cachep)-BYTES_PER_WORD); | 540 | return (unsigned long long*) (objp + obj_offset(cachep) - |
541 | sizeof(unsigned long long)); | ||
543 | } | 542 | } |
544 | 543 | ||
545 | static unsigned long *dbg_redzone2(struct kmem_cache *cachep, void *objp) | 544 | static unsigned long long *dbg_redzone2(struct kmem_cache *cachep, void *objp) |
546 | { | 545 | { |
547 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); | 546 | BUG_ON(!(cachep->flags & SLAB_RED_ZONE)); |
548 | if (cachep->flags & SLAB_STORE_USER) | 547 | if (cachep->flags & SLAB_STORE_USER) |
549 | return (unsigned long *)(objp + cachep->buffer_size - | 548 | return (unsigned long long *)(objp + cachep->buffer_size - |
550 | 2 * BYTES_PER_WORD); | 549 | sizeof(unsigned long long) - |
551 | return (unsigned long *)(objp + cachep->buffer_size - BYTES_PER_WORD); | 550 | BYTES_PER_WORD); |
551 | return (unsigned long long *) (objp + cachep->buffer_size - | ||
552 | sizeof(unsigned long long)); | ||
552 | } | 553 | } |
553 | 554 | ||
554 | static void **dbg_userword(struct kmem_cache *cachep, void *objp) | 555 | static void **dbg_userword(struct kmem_cache *cachep, void *objp) |
@@ -561,28 +562,13 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) | |||
561 | 562 | ||
562 | #define obj_offset(x) 0 | 563 | #define obj_offset(x) 0 |
563 | #define obj_size(cachep) (cachep->buffer_size) | 564 | #define obj_size(cachep) (cachep->buffer_size) |
564 | #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long *)NULL;}) | 565 | #define dbg_redzone1(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) |
565 | #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long *)NULL;}) | 566 | #define dbg_redzone2(cachep, objp) ({BUG(); (unsigned long long *)NULL;}) |
566 | #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) | 567 | #define dbg_userword(cachep, objp) ({BUG(); (void **)NULL;}) |
567 | 568 | ||
568 | #endif | 569 | #endif |
569 | 570 | ||
570 | /* | 571 | /* |
571 | * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp | ||
572 | * order. | ||
573 | */ | ||
574 | #if defined(CONFIG_LARGE_ALLOCS) | ||
575 | #define MAX_OBJ_ORDER 13 /* up to 32Mb */ | ||
576 | #define MAX_GFP_ORDER 13 /* up to 32Mb */ | ||
577 | #elif defined(CONFIG_MMU) | ||
578 | #define MAX_OBJ_ORDER 5 /* 32 pages */ | ||
579 | #define MAX_GFP_ORDER 5 /* 32 pages */ | ||
580 | #else | ||
581 | #define MAX_OBJ_ORDER 8 /* up to 1Mb */ | ||
582 | #define MAX_GFP_ORDER 8 /* up to 1Mb */ | ||
583 | #endif | ||
584 | |||
585 | /* | ||
586 | * Do not go above this order unless 0 objects fit into the slab. | 572 | * Do not go above this order unless 0 objects fit into the slab. |
587 | */ | 573 | */ |
588 | #define BREAK_GFP_ORDER_HI 1 | 574 | #define BREAK_GFP_ORDER_HI 1 |
@@ -788,6 +774,7 @@ static inline struct kmem_cache *__find_general_cachep(size_t size, | |||
788 | */ | 774 | */ |
789 | BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); | 775 | BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); |
790 | #endif | 776 | #endif |
777 | WARN_ON_ONCE(size == 0); | ||
791 | while (size > csizep->cs_size) | 778 | while (size > csizep->cs_size) |
792 | csizep++; | 779 | csizep++; |
793 | 780 | ||
@@ -924,12 +911,6 @@ static void next_reap_node(void) | |||
924 | { | 911 | { |
925 | int node = __get_cpu_var(reap_node); | 912 | int node = __get_cpu_var(reap_node); |
926 | 913 | ||
927 | /* | ||
928 | * Also drain per cpu pages on remote zones | ||
929 | */ | ||
930 | if (node != numa_node_id()) | ||
931 | drain_node_pages(node); | ||
932 | |||
933 | node = next_node(node, node_online_map); | 914 | node = next_node(node, node_online_map); |
934 | if (unlikely(node >= MAX_NUMNODES)) | 915 | if (unlikely(node >= MAX_NUMNODES)) |
935 | node = first_node(node_online_map); | 916 | node = first_node(node_online_map); |
@@ -1182,8 +1163,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1182 | int memsize = sizeof(struct kmem_list3); | 1163 | int memsize = sizeof(struct kmem_list3); |
1183 | 1164 | ||
1184 | switch (action) { | 1165 | switch (action) { |
1185 | case CPU_UP_PREPARE: | 1166 | case CPU_LOCK_ACQUIRE: |
1186 | mutex_lock(&cache_chain_mutex); | 1167 | mutex_lock(&cache_chain_mutex); |
1168 | break; | ||
1169 | case CPU_UP_PREPARE: | ||
1170 | case CPU_UP_PREPARE_FROZEN: | ||
1187 | /* | 1171 | /* |
1188 | * We need to do this right in the beginning since | 1172 | * We need to do this right in the beginning since |
1189 | * alloc_arraycache's are going to use this list. | 1173 | * alloc_arraycache's are going to use this list. |
@@ -1270,17 +1254,28 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1270 | } | 1254 | } |
1271 | break; | 1255 | break; |
1272 | case CPU_ONLINE: | 1256 | case CPU_ONLINE: |
1273 | mutex_unlock(&cache_chain_mutex); | 1257 | case CPU_ONLINE_FROZEN: |
1274 | start_cpu_timer(cpu); | 1258 | start_cpu_timer(cpu); |
1275 | break; | 1259 | break; |
1276 | #ifdef CONFIG_HOTPLUG_CPU | 1260 | #ifdef CONFIG_HOTPLUG_CPU |
1277 | case CPU_DOWN_PREPARE: | 1261 | case CPU_DOWN_PREPARE: |
1278 | mutex_lock(&cache_chain_mutex); | 1262 | case CPU_DOWN_PREPARE_FROZEN: |
1279 | break; | 1263 | /* |
1280 | case CPU_DOWN_FAILED: | 1264 | * Shutdown cache reaper. Note that the cache_chain_mutex is |
1281 | mutex_unlock(&cache_chain_mutex); | 1265 | * held so that if cache_reap() is invoked it cannot do |
1282 | break; | 1266 | * anything expensive but will only modify reap_work |
1267 | * and reschedule the timer. | ||
1268 | */ | ||
1269 | cancel_rearming_delayed_work(&per_cpu(reap_work, cpu)); | ||
1270 | /* Now the cache_reaper is guaranteed to be not running. */ | ||
1271 | per_cpu(reap_work, cpu).work.func = NULL; | ||
1272 | break; | ||
1273 | case CPU_DOWN_FAILED: | ||
1274 | case CPU_DOWN_FAILED_FROZEN: | ||
1275 | start_cpu_timer(cpu); | ||
1276 | break; | ||
1283 | case CPU_DEAD: | 1277 | case CPU_DEAD: |
1278 | case CPU_DEAD_FROZEN: | ||
1284 | /* | 1279 | /* |
1285 | * Even if all the cpus of a node are down, we don't free the | 1280 | * Even if all the cpus of a node are down, we don't free the |
1286 | * kmem_list3 of any cache. This to avoid a race between | 1281 | * kmem_list3 of any cache. This to avoid a race between |
@@ -1292,6 +1287,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1292 | /* fall thru */ | 1287 | /* fall thru */ |
1293 | #endif | 1288 | #endif |
1294 | case CPU_UP_CANCELED: | 1289 | case CPU_UP_CANCELED: |
1290 | case CPU_UP_CANCELED_FROZEN: | ||
1295 | list_for_each_entry(cachep, &cache_chain, next) { | 1291 | list_for_each_entry(cachep, &cache_chain, next) { |
1296 | struct array_cache *nc; | 1292 | struct array_cache *nc; |
1297 | struct array_cache *shared; | 1293 | struct array_cache *shared; |
@@ -1350,6 +1346,8 @@ free_array_cache: | |||
1350 | continue; | 1346 | continue; |
1351 | drain_freelist(cachep, l3, l3->free_objects); | 1347 | drain_freelist(cachep, l3, l3->free_objects); |
1352 | } | 1348 | } |
1349 | break; | ||
1350 | case CPU_LOCK_RELEASE: | ||
1353 | mutex_unlock(&cache_chain_mutex); | 1351 | mutex_unlock(&cache_chain_mutex); |
1354 | break; | 1352 | break; |
1355 | } | 1353 | } |
@@ -1776,7 +1774,7 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines) | |||
1776 | char *realobj; | 1774 | char *realobj; |
1777 | 1775 | ||
1778 | if (cachep->flags & SLAB_RED_ZONE) { | 1776 | if (cachep->flags & SLAB_RED_ZONE) { |
1779 | printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n", | 1777 | printk(KERN_ERR "Redzone: 0x%llx/0x%llx.\n", |
1780 | *dbg_redzone1(cachep, objp), | 1778 | *dbg_redzone1(cachep, objp), |
1781 | *dbg_redzone2(cachep, objp)); | 1779 | *dbg_redzone2(cachep, objp)); |
1782 | } | 1780 | } |
@@ -1896,20 +1894,11 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) | |||
1896 | slab_error(cachep, "end of a freed object " | 1894 | slab_error(cachep, "end of a freed object " |
1897 | "was overwritten"); | 1895 | "was overwritten"); |
1898 | } | 1896 | } |
1899 | if (cachep->dtor && !(cachep->flags & SLAB_POISON)) | ||
1900 | (cachep->dtor) (objp + obj_offset(cachep), cachep, 0); | ||
1901 | } | 1897 | } |
1902 | } | 1898 | } |
1903 | #else | 1899 | #else |
1904 | static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) | 1900 | static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp) |
1905 | { | 1901 | { |
1906 | if (cachep->dtor) { | ||
1907 | int i; | ||
1908 | for (i = 0; i < cachep->num; i++) { | ||
1909 | void *objp = index_to_obj(cachep, slabp, i); | ||
1910 | (cachep->dtor) (objp, cachep, 0); | ||
1911 | } | ||
1912 | } | ||
1913 | } | 1902 | } |
1914 | #endif | 1903 | #endif |
1915 | 1904 | ||
@@ -1998,7 +1987,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, | |||
1998 | size_t left_over = 0; | 1987 | size_t left_over = 0; |
1999 | int gfporder; | 1988 | int gfporder; |
2000 | 1989 | ||
2001 | for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) { | 1990 | for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) { |
2002 | unsigned int num; | 1991 | unsigned int num; |
2003 | size_t remainder; | 1992 | size_t remainder; |
2004 | 1993 | ||
@@ -2048,7 +2037,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, | |||
2048 | return left_over; | 2037 | return left_over; |
2049 | } | 2038 | } |
2050 | 2039 | ||
2051 | static int setup_cpu_cache(struct kmem_cache *cachep) | 2040 | static int __init_refok setup_cpu_cache(struct kmem_cache *cachep) |
2052 | { | 2041 | { |
2053 | if (g_cpucache_up == FULL) | 2042 | if (g_cpucache_up == FULL) |
2054 | return enable_cpucache(cachep); | 2043 | return enable_cpucache(cachep); |
@@ -2109,7 +2098,7 @@ static int setup_cpu_cache(struct kmem_cache *cachep) | |||
2109 | * @align: The required alignment for the objects. | 2098 | * @align: The required alignment for the objects. |
2110 | * @flags: SLAB flags | 2099 | * @flags: SLAB flags |
2111 | * @ctor: A constructor for the objects. | 2100 | * @ctor: A constructor for the objects. |
2112 | * @dtor: A destructor for the objects. | 2101 | * @dtor: A destructor for the objects (not implemented anymore). |
2113 | * | 2102 | * |
2114 | * Returns a ptr to the cache on success, NULL on failure. | 2103 | * Returns a ptr to the cache on success, NULL on failure. |
2115 | * Cannot be called within a int, but can be interrupted. | 2104 | * Cannot be called within a int, but can be interrupted. |
@@ -2144,7 +2133,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2144 | * Sanity checks... these are all serious usage bugs. | 2133 | * Sanity checks... these are all serious usage bugs. |
2145 | */ | 2134 | */ |
2146 | if (!name || in_interrupt() || (size < BYTES_PER_WORD) || | 2135 | if (!name || in_interrupt() || (size < BYTES_PER_WORD) || |
2147 | (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) { | 2136 | size > KMALLOC_MAX_SIZE || dtor) { |
2148 | printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, | 2137 | printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, |
2149 | name); | 2138 | name); |
2150 | BUG(); | 2139 | BUG(); |
@@ -2198,9 +2187,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2198 | if (flags & SLAB_DESTROY_BY_RCU) | 2187 | if (flags & SLAB_DESTROY_BY_RCU) |
2199 | BUG_ON(flags & SLAB_POISON); | 2188 | BUG_ON(flags & SLAB_POISON); |
2200 | #endif | 2189 | #endif |
2201 | if (flags & SLAB_DESTROY_BY_RCU) | ||
2202 | BUG_ON(dtor); | ||
2203 | |||
2204 | /* | 2190 | /* |
2205 | * Always checks flags, a caller might be expecting debug support which | 2191 | * Always checks flags, a caller might be expecting debug support which |
2206 | * isn't available. | 2192 | * isn't available. |
@@ -2239,7 +2225,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2239 | * is greater than BYTES_PER_WORD. | 2225 | * is greater than BYTES_PER_WORD. |
2240 | */ | 2226 | */ |
2241 | if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) | 2227 | if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER) |
2242 | ralign = BYTES_PER_WORD; | 2228 | ralign = __alignof__(unsigned long long); |
2243 | 2229 | ||
2244 | /* 2) arch mandated alignment */ | 2230 | /* 2) arch mandated alignment */ |
2245 | if (ralign < ARCH_SLAB_MINALIGN) { | 2231 | if (ralign < ARCH_SLAB_MINALIGN) { |
@@ -2250,7 +2236,7 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2250 | ralign = align; | 2236 | ralign = align; |
2251 | } | 2237 | } |
2252 | /* disable debug if necessary */ | 2238 | /* disable debug if necessary */ |
2253 | if (ralign > BYTES_PER_WORD) | 2239 | if (ralign > __alignof__(unsigned long long)) |
2254 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); | 2240 | flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); |
2255 | /* | 2241 | /* |
2256 | * 4) Store it. | 2242 | * 4) Store it. |
@@ -2271,8 +2257,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2271 | */ | 2257 | */ |
2272 | if (flags & SLAB_RED_ZONE) { | 2258 | if (flags & SLAB_RED_ZONE) { |
2273 | /* add space for red zone words */ | 2259 | /* add space for red zone words */ |
2274 | cachep->obj_offset += BYTES_PER_WORD; | 2260 | cachep->obj_offset += sizeof(unsigned long long); |
2275 | size += 2 * BYTES_PER_WORD; | 2261 | size += 2 * sizeof(unsigned long long); |
2276 | } | 2262 | } |
2277 | if (flags & SLAB_STORE_USER) { | 2263 | if (flags & SLAB_STORE_USER) { |
2278 | /* user store requires one word storage behind the end of | 2264 | /* user store requires one word storage behind the end of |
@@ -2355,7 +2341,6 @@ kmem_cache_create (const char *name, size_t size, size_t align, | |||
2355 | BUG_ON(!cachep->slabp_cache); | 2341 | BUG_ON(!cachep->slabp_cache); |
2356 | } | 2342 | } |
2357 | cachep->ctor = ctor; | 2343 | cachep->ctor = ctor; |
2358 | cachep->dtor = dtor; | ||
2359 | cachep->name = name; | 2344 | cachep->name = name; |
2360 | 2345 | ||
2361 | if (setup_cpu_cache(cachep)) { | 2346 | if (setup_cpu_cache(cachep)) { |
@@ -2610,7 +2595,7 @@ static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) | |||
2610 | } | 2595 | } |
2611 | 2596 | ||
2612 | static void cache_init_objs(struct kmem_cache *cachep, | 2597 | static void cache_init_objs(struct kmem_cache *cachep, |
2613 | struct slab *slabp, unsigned long ctor_flags) | 2598 | struct slab *slabp) |
2614 | { | 2599 | { |
2615 | int i; | 2600 | int i; |
2616 | 2601 | ||
@@ -2634,7 +2619,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2634 | */ | 2619 | */ |
2635 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) | 2620 | if (cachep->ctor && !(cachep->flags & SLAB_POISON)) |
2636 | cachep->ctor(objp + obj_offset(cachep), cachep, | 2621 | cachep->ctor(objp + obj_offset(cachep), cachep, |
2637 | ctor_flags); | 2622 | 0); |
2638 | 2623 | ||
2639 | if (cachep->flags & SLAB_RED_ZONE) { | 2624 | if (cachep->flags & SLAB_RED_ZONE) { |
2640 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) | 2625 | if (*dbg_redzone2(cachep, objp) != RED_INACTIVE) |
@@ -2650,7 +2635,7 @@ static void cache_init_objs(struct kmem_cache *cachep, | |||
2650 | cachep->buffer_size / PAGE_SIZE, 0); | 2635 | cachep->buffer_size / PAGE_SIZE, 0); |
2651 | #else | 2636 | #else |
2652 | if (cachep->ctor) | 2637 | if (cachep->ctor) |
2653 | cachep->ctor(objp, cachep, ctor_flags); | 2638 | cachep->ctor(objp, cachep, 0); |
2654 | #endif | 2639 | #endif |
2655 | slab_bufctl(slabp)[i] = i + 1; | 2640 | slab_bufctl(slabp)[i] = i + 1; |
2656 | } | 2641 | } |
@@ -2739,7 +2724,6 @@ static int cache_grow(struct kmem_cache *cachep, | |||
2739 | struct slab *slabp; | 2724 | struct slab *slabp; |
2740 | size_t offset; | 2725 | size_t offset; |
2741 | gfp_t local_flags; | 2726 | gfp_t local_flags; |
2742 | unsigned long ctor_flags; | ||
2743 | struct kmem_list3 *l3; | 2727 | struct kmem_list3 *l3; |
2744 | 2728 | ||
2745 | /* | 2729 | /* |
@@ -2748,7 +2732,6 @@ static int cache_grow(struct kmem_cache *cachep, | |||
2748 | */ | 2732 | */ |
2749 | BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); | 2733 | BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); |
2750 | 2734 | ||
2751 | ctor_flags = SLAB_CTOR_CONSTRUCTOR; | ||
2752 | local_flags = (flags & GFP_LEVEL_MASK); | 2735 | local_flags = (flags & GFP_LEVEL_MASK); |
2753 | /* Take the l3 list lock to change the colour_next on this node */ | 2736 | /* Take the l3 list lock to change the colour_next on this node */ |
2754 | check_irq_off(); | 2737 | check_irq_off(); |
@@ -2793,7 +2776,7 @@ static int cache_grow(struct kmem_cache *cachep, | |||
2793 | slabp->nodeid = nodeid; | 2776 | slabp->nodeid = nodeid; |
2794 | slab_map_pages(cachep, slabp, objp); | 2777 | slab_map_pages(cachep, slabp, objp); |
2795 | 2778 | ||
2796 | cache_init_objs(cachep, slabp, ctor_flags); | 2779 | cache_init_objs(cachep, slabp); |
2797 | 2780 | ||
2798 | if (local_flags & __GFP_WAIT) | 2781 | if (local_flags & __GFP_WAIT) |
2799 | local_irq_disable(); | 2782 | local_irq_disable(); |
@@ -2820,7 +2803,6 @@ failed: | |||
2820 | * Perform extra freeing checks: | 2803 | * Perform extra freeing checks: |
2821 | * - detect bad pointers. | 2804 | * - detect bad pointers. |
2822 | * - POISON/RED_ZONE checking | 2805 | * - POISON/RED_ZONE checking |
2823 | * - destructor calls, for caches with POISON+dtor | ||
2824 | */ | 2806 | */ |
2825 | static void kfree_debugcheck(const void *objp) | 2807 | static void kfree_debugcheck(const void *objp) |
2826 | { | 2808 | { |
@@ -2833,7 +2815,7 @@ static void kfree_debugcheck(const void *objp) | |||
2833 | 2815 | ||
2834 | static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) | 2816 | static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) |
2835 | { | 2817 | { |
2836 | unsigned long redzone1, redzone2; | 2818 | unsigned long long redzone1, redzone2; |
2837 | 2819 | ||
2838 | redzone1 = *dbg_redzone1(cache, obj); | 2820 | redzone1 = *dbg_redzone1(cache, obj); |
2839 | redzone2 = *dbg_redzone2(cache, obj); | 2821 | redzone2 = *dbg_redzone2(cache, obj); |
@@ -2849,7 +2831,7 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) | |||
2849 | else | 2831 | else |
2850 | slab_error(cache, "memory outside object was overwritten"); | 2832 | slab_error(cache, "memory outside object was overwritten"); |
2851 | 2833 | ||
2852 | printk(KERN_ERR "%p: redzone 1:0x%lx, redzone 2:0x%lx.\n", | 2834 | printk(KERN_ERR "%p: redzone 1:0x%llx, redzone 2:0x%llx.\n", |
2853 | obj, redzone1, redzone2); | 2835 | obj, redzone1, redzone2); |
2854 | } | 2836 | } |
2855 | 2837 | ||
@@ -2879,12 +2861,6 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, | |||
2879 | BUG_ON(objnr >= cachep->num); | 2861 | BUG_ON(objnr >= cachep->num); |
2880 | BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); | 2862 | BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); |
2881 | 2863 | ||
2882 | if (cachep->flags & SLAB_POISON && cachep->dtor) { | ||
2883 | /* we want to cache poison the object, | ||
2884 | * call the destruction callback | ||
2885 | */ | ||
2886 | cachep->dtor(objp + obj_offset(cachep), cachep, 0); | ||
2887 | } | ||
2888 | #ifdef CONFIG_DEBUG_SLAB_LEAK | 2864 | #ifdef CONFIG_DEBUG_SLAB_LEAK |
2889 | slab_bufctl(slabp)[objnr] = BUFCTL_FREE; | 2865 | slab_bufctl(slabp)[objnr] = BUFCTL_FREE; |
2890 | #endif | 2866 | #endif |
@@ -3065,7 +3041,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3065 | slab_error(cachep, "double free, or memory outside" | 3041 | slab_error(cachep, "double free, or memory outside" |
3066 | " object was overwritten"); | 3042 | " object was overwritten"); |
3067 | printk(KERN_ERR | 3043 | printk(KERN_ERR |
3068 | "%p: redzone 1:0x%lx, redzone 2:0x%lx\n", | 3044 | "%p: redzone 1:0x%llx, redzone 2:0x%llx\n", |
3069 | objp, *dbg_redzone1(cachep, objp), | 3045 | objp, *dbg_redzone1(cachep, objp), |
3070 | *dbg_redzone2(cachep, objp)); | 3046 | *dbg_redzone2(cachep, objp)); |
3071 | } | 3047 | } |
@@ -3084,7 +3060,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, | |||
3084 | #endif | 3060 | #endif |
3085 | objp += obj_offset(cachep); | 3061 | objp += obj_offset(cachep); |
3086 | if (cachep->ctor && cachep->flags & SLAB_POISON) | 3062 | if (cachep->ctor && cachep->flags & SLAB_POISON) |
3087 | cachep->ctor(objp, cachep, SLAB_CTOR_CONSTRUCTOR); | 3063 | cachep->ctor(objp, cachep, 0); |
3088 | #if ARCH_SLAB_MINALIGN | 3064 | #if ARCH_SLAB_MINALIGN |
3089 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { | 3065 | if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) { |
3090 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", | 3066 | printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n", |
@@ -3738,7 +3714,6 @@ EXPORT_SYMBOL(__kmalloc); | |||
3738 | 3714 | ||
3739 | /** | 3715 | /** |
3740 | * krealloc - reallocate memory. The contents will remain unchanged. | 3716 | * krealloc - reallocate memory. The contents will remain unchanged. |
3741 | * | ||
3742 | * @p: object to reallocate memory for. | 3717 | * @p: object to reallocate memory for. |
3743 | * @new_size: how many bytes of memory are required. | 3718 | * @new_size: how many bytes of memory are required. |
3744 | * @flags: the type of memory to allocate. | 3719 | * @flags: the type of memory to allocate. |
@@ -4136,7 +4111,6 @@ next: | |||
4136 | check_irq_on(); | 4111 | check_irq_on(); |
4137 | mutex_unlock(&cache_chain_mutex); | 4112 | mutex_unlock(&cache_chain_mutex); |
4138 | next_reap_node(); | 4113 | next_reap_node(); |
4139 | refresh_cpu_vm_stats(smp_processor_id()); | ||
4140 | out: | 4114 | out: |
4141 | /* Set up the next iteration */ | 4115 | /* Set up the next iteration */ |
4142 | schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); | 4116 | schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC)); |
@@ -4428,16 +4402,12 @@ static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) | |||
4428 | static void show_symbol(struct seq_file *m, unsigned long address) | 4402 | static void show_symbol(struct seq_file *m, unsigned long address) |
4429 | { | 4403 | { |
4430 | #ifdef CONFIG_KALLSYMS | 4404 | #ifdef CONFIG_KALLSYMS |
4431 | char *modname; | ||
4432 | const char *name; | ||
4433 | unsigned long offset, size; | 4405 | unsigned long offset, size; |
4434 | char namebuf[KSYM_NAME_LEN+1]; | 4406 | char modname[MODULE_NAME_LEN + 1], name[KSYM_NAME_LEN + 1]; |
4435 | |||
4436 | name = kallsyms_lookup(address, &size, &offset, &modname, namebuf); | ||
4437 | 4407 | ||
4438 | if (name) { | 4408 | if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) { |
4439 | seq_printf(m, "%s+%#lx/%#lx", name, offset, size); | 4409 | seq_printf(m, "%s+%#lx/%#lx", name, offset, size); |
4440 | if (modname) | 4410 | if (modname[0]) |
4441 | seq_printf(m, " [%s]", modname); | 4411 | seq_printf(m, " [%s]", modname); |
4442 | return; | 4412 | return; |
4443 | } | 4413 | } |
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
@@ -35,6 +35,7 @@ | |||
35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
36 | #include <linux/module.h> | 36 | #include <linux/module.h> |
37 | #include <linux/timer.h> | 37 | #include <linux/timer.h> |
38 | #include <linux/rcupdate.h> | ||
38 | 39 | ||
39 | struct slob_block { | 40 | struct slob_block { |
40 | int units; | 41 | int units; |
@@ -53,6 +54,16 @@ struct bigblock { | |||
53 | }; | 54 | }; |
54 | typedef struct bigblock bigblock_t; | 55 | typedef struct bigblock bigblock_t; |
55 | 56 | ||
57 | /* | ||
58 | * struct slob_rcu is inserted at the tail of allocated slob blocks, which | ||
59 | * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free | ||
60 | * the block using call_rcu. | ||
61 | */ | ||
62 | struct slob_rcu { | ||
63 | struct rcu_head head; | ||
64 | int size; | ||
65 | }; | ||
66 | |||
56 | static slob_t arena = { .next = &arena, .units = 1 }; | 67 | static slob_t arena = { .next = &arena, .units = 1 }; |
57 | static slob_t *slobfree = &arena; | 68 | static slob_t *slobfree = &arena; |
58 | static bigblock_t *bigblocks; | 69 | static bigblock_t *bigblocks; |
@@ -266,9 +277,9 @@ size_t ksize(const void *block) | |||
266 | 277 | ||
267 | struct kmem_cache { | 278 | struct kmem_cache { |
268 | unsigned int size, align; | 279 | unsigned int size, align; |
280 | unsigned long flags; | ||
269 | const char *name; | 281 | const char *name; |
270 | void (*ctor)(void *, struct kmem_cache *, unsigned long); | 282 | void (*ctor)(void *, struct kmem_cache *, unsigned long); |
271 | void (*dtor)(void *, struct kmem_cache *, unsigned long); | ||
272 | }; | 283 | }; |
273 | 284 | ||
274 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, | 285 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, |
@@ -283,8 +294,12 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
283 | if (c) { | 294 | if (c) { |
284 | c->name = name; | 295 | c->name = name; |
285 | c->size = size; | 296 | c->size = size; |
297 | if (flags & SLAB_DESTROY_BY_RCU) { | ||
298 | /* leave room for rcu footer at the end of object */ | ||
299 | c->size += sizeof(struct slob_rcu); | ||
300 | } | ||
301 | c->flags = flags; | ||
286 | c->ctor = ctor; | 302 | c->ctor = ctor; |
287 | c->dtor = dtor; | ||
288 | /* ignore alignment unless it's forced */ | 303 | /* ignore alignment unless it's forced */ |
289 | c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; | 304 | c->align = (flags & SLAB_HWCACHE_ALIGN) ? SLOB_ALIGN : 0; |
290 | if (c->align < align) | 305 | if (c->align < align) |
@@ -312,7 +327,7 @@ void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags) | |||
312 | b = (void *)__get_free_pages(flags, get_order(c->size)); | 327 | b = (void *)__get_free_pages(flags, get_order(c->size)); |
313 | 328 | ||
314 | if (c->ctor) | 329 | if (c->ctor) |
315 | c->ctor(b, c, SLAB_CTOR_CONSTRUCTOR); | 330 | c->ctor(b, c, 0); |
316 | 331 | ||
317 | return b; | 332 | return b; |
318 | } | 333 | } |
@@ -328,15 +343,33 @@ void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags) | |||
328 | } | 343 | } |
329 | EXPORT_SYMBOL(kmem_cache_zalloc); | 344 | EXPORT_SYMBOL(kmem_cache_zalloc); |
330 | 345 | ||
331 | void kmem_cache_free(struct kmem_cache *c, void *b) | 346 | static void __kmem_cache_free(void *b, int size) |
332 | { | 347 | { |
333 | if (c->dtor) | 348 | if (size < PAGE_SIZE) |
334 | c->dtor(b, c, 0); | 349 | slob_free(b, size); |
335 | |||
336 | if (c->size < PAGE_SIZE) | ||
337 | slob_free(b, c->size); | ||
338 | else | 350 | else |
339 | free_pages((unsigned long)b, get_order(c->size)); | 351 | free_pages((unsigned long)b, get_order(size)); |
352 | } | ||
353 | |||
354 | static void kmem_rcu_free(struct rcu_head *head) | ||
355 | { | ||
356 | struct slob_rcu *slob_rcu = (struct slob_rcu *)head; | ||
357 | void *b = (void *)slob_rcu - (slob_rcu->size - sizeof(struct slob_rcu)); | ||
358 | |||
359 | __kmem_cache_free(b, slob_rcu->size); | ||
360 | } | ||
361 | |||
362 | void kmem_cache_free(struct kmem_cache *c, void *b) | ||
363 | { | ||
364 | if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { | ||
365 | struct slob_rcu *slob_rcu; | ||
366 | slob_rcu = b + (c->size - sizeof(struct slob_rcu)); | ||
367 | INIT_RCU_HEAD(&slob_rcu->head); | ||
368 | slob_rcu->size = c->size; | ||
369 | call_rcu(&slob_rcu->head, kmem_rcu_free); | ||
370 | } else { | ||
371 | __kmem_cache_free(b, c->size); | ||
372 | } | ||
340 | } | 373 | } |
341 | EXPORT_SYMBOL(kmem_cache_free); | 374 | EXPORT_SYMBOL(kmem_cache_free); |
342 | 375 | ||
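
For reference, the slob side of SLAB_DESTROY_BY_RCU above works by reserving room for a struct slob_rcu footer at the tail of each object and, in kmem_rcu_free(), stepping back from that footer to the start of the block. A small userspace sketch of the same footer arithmetic; the names (rcu_footer, alloc_with_footer, deferred_free) are invented, and the deferred free is a direct call rather than call_rcu():

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Footer kept in the last bytes of every allocation, mirroring struct slob_rcu. */
    struct rcu_footer {
        void (*free_fn)(void *base, size_t size);   /* stand-in for the rcu_head callback */
        size_t size;                                /* total block size including footer  */
    };

    static void plain_free(void *base, size_t size)
    {
        (void)size;
        free(base);
    }

    static void *alloc_with_footer(size_t payload)
    {
        size_t total = payload + sizeof(struct rcu_footer);
        char *base = malloc(total);
        struct rcu_footer *f;

        if (!base)
            return NULL;
        f = (struct rcu_footer *)(base + payload);   /* footer sits right after the payload */
        f->free_fn = plain_free;
        f->size = total;
        return base;
    }

    static void deferred_free(struct rcu_footer *f)
    {
        /* Same arithmetic as kmem_rcu_free(): the block starts
         * (size - sizeof(footer)) bytes before the footer. */
        void *base = (char *)f - (f->size - sizeof(struct rcu_footer));

        f->free_fn(base, f->size);
    }

    int main(void)
    {
        size_t payload = 64;
        char *obj = alloc_with_footer(payload);

        if (!obj)
            return 1;
        memset(obj, 0, payload);                      /* only the payload belongs to the caller */
        deferred_free((struct rcu_footer *)(obj + payload));   /* call_rcu() in the real code */
        return 0;
    }
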
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -66,11 +66,11 @@ | |||
66 | * SLUB assigns one slab for allocation to each processor. | 66 | * SLUB assigns one slab for allocation to each processor. |
67 | * Allocations only occur from these slabs called cpu slabs. | 67 | * Allocations only occur from these slabs called cpu slabs. |
68 | * | 68 | * |
69 | * Slabs with free elements are kept on a partial list. | 69 | * Slabs with free elements are kept on a partial list and during regular |
70 | * There is no list for full slabs. If an object in a full slab is | 70 | * operations no list for full slabs is used. If an object in a full slab is |
71 | * freed then the slab will show up again on the partial lists. | 71 | * freed then the slab will show up again on the partial lists. |
72 | * Otherwise there is no need to track full slabs unless we have to | 72 | * We track full slabs for debugging purposes though because otherwise we |
73 | * track full slabs for debugging purposes. | 73 | * cannot scan all objects. |
74 | * | 74 | * |
75 | * Slabs are freed when they become empty. Teardown and setup is | 75 | * Slabs are freed when they become empty. Teardown and setup is |
76 | * minimal so we rely on the page allocators per cpu caches for | 76 | * minimal so we rely on the page allocators per cpu caches for |
@@ -78,22 +78,72 @@ | |||
78 | * | 78 | * |
79 | * Overloading of page flags that are otherwise used for LRU management. | 79 | * Overloading of page flags that are otherwise used for LRU management. |
80 | * | 80 | * |
81 | * PageActive The slab is used as a cpu cache. Allocations | 81 | * PageActive The slab is frozen and exempt from list processing. |
82 | * may be performed from the slab. The slab is not | 82 | * This means that the slab is dedicated to a purpose |
83 | * on any slab list and cannot be moved onto one. | 83 | * such as satisfying allocations for a specific |
84 | * processor. Objects may be freed in the slab while | ||
85 | * it is frozen but slab_free will then skip the usual | ||
86 | * list operations. It is up to the processor holding | ||
87 | * the slab to integrate the slab into the slab lists | ||
88 | * when the slab is no longer needed. | ||
89 | * | ||
90 | * One use of this flag is to mark slabs that are | ||
91 | * used for allocations. Then such a slab becomes a cpu | ||
92 | * slab. The cpu slab may be equipped with an additional | ||
93 | * lockless_freelist that allows lockless access to | ||
94 | * free objects in addition to the regular freelist | ||
95 | * that requires the slab lock. | ||
84 | * | 96 | * |
85 | * PageError Slab requires special handling due to debug | 97 | * PageError Slab requires special handling due to debug |
86 | * options set. This moves slab handling out of | 98 | * options set. This moves slab handling out of |
87 | * the fast path. | 99 | * the fast path and disables lockless freelists. |
88 | */ | 100 | */ |
89 | 101 | ||
102 | #define FROZEN (1 << PG_active) | ||
103 | |||
104 | #ifdef CONFIG_SLUB_DEBUG | ||
105 | #define SLABDEBUG (1 << PG_error) | ||
106 | #else | ||
107 | #define SLABDEBUG 0 | ||
108 | #endif | ||
109 | |||
110 | static inline int SlabFrozen(struct page *page) | ||
111 | { | ||
112 | return page->flags & FROZEN; | ||
113 | } | ||
114 | |||
115 | static inline void SetSlabFrozen(struct page *page) | ||
116 | { | ||
117 | page->flags |= FROZEN; | ||
118 | } | ||
119 | |||
120 | static inline void ClearSlabFrozen(struct page *page) | ||
121 | { | ||
122 | page->flags &= ~FROZEN; | ||
123 | } | ||
124 | |||
125 | static inline int SlabDebug(struct page *page) | ||
126 | { | ||
127 | return page->flags & SLABDEBUG; | ||
128 | } | ||
129 | |||
130 | static inline void SetSlabDebug(struct page *page) | ||
131 | { | ||
132 | page->flags |= SLABDEBUG; | ||
133 | } | ||
134 | |||
135 | static inline void ClearSlabDebug(struct page *page) | ||
136 | { | ||
137 | page->flags &= ~SLABDEBUG; | ||
138 | } | ||
139 | |||
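
The SlabFrozen()/SlabDebug() helpers above are thin wrappers that test, set and clear single bits (PG_active, PG_error) in page->flags. The same accessor pattern on a plain flags word, as a standalone sketch with an invented fake_page type:

    #include <stdio.h>

    #define FROZEN    (1UL << 0)    /* stands in for (1 << PG_active) */
    #define SLABDEBUG (1UL << 1)    /* stands in for (1 << PG_error)  */

    struct fake_page {
        unsigned long flags;
    };

    static int  SlabFrozen(struct fake_page *p)      { return (p->flags & FROZEN) != 0; }
    static void SetSlabFrozen(struct fake_page *p)   { p->flags |= FROZEN; }
    static void ClearSlabFrozen(struct fake_page *p) { p->flags &= ~FROZEN; }

    int main(void)
    {
        struct fake_page page = { .flags = SLABDEBUG };   /* debugging bit already set */

        SetSlabFrozen(&page);
        printf("frozen=%d flags=0x%lx\n", SlabFrozen(&page), page.flags);   /* 1, 0x3 */
        ClearSlabFrozen(&page);
        printf("frozen=%d flags=0x%lx\n", SlabFrozen(&page), page.flags);   /* 0, 0x2 */
        return 0;
    }
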
90 | /* | 140 | /* |
91 | * Issues still to be resolved: | 141 | * Issues still to be resolved: |
92 | * | 142 | * |
93 | * - The per cpu array is updated for each new slab and and is a remote | 143 | * - The per cpu array is updated for each new slab and and is a remote |
94 | * cacheline for most nodes. This could become a bouncing cacheline given | 144 | * cacheline for most nodes. This could become a bouncing cacheline given |
95 | * enough frequent updates. There are 16 pointers in a cacheline.so at | 145 | * enough frequent updates. There are 16 pointers in a cacheline, so at |
96 | * max 16 cpus could compete. Likely okay. | 146 | * max 16 cpus could compete for the cacheline which may be okay. |
97 | * | 147 | * |
98 | * - Support PAGE_ALLOC_DEBUG. Should be easy to do. | 148 | * - Support PAGE_ALLOC_DEBUG. Should be easy to do. |
99 | * | 149 | * |
@@ -137,6 +187,7 @@ | |||
137 | 187 | ||
138 | #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ | 188 | #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ |
139 | SLAB_POISON | SLAB_STORE_USER) | 189 | SLAB_POISON | SLAB_STORE_USER) |
190 | |||
140 | /* | 191 | /* |
141 | * Set of flags that will prevent slab merging | 192 | * Set of flags that will prevent slab merging |
142 | */ | 193 | */ |
@@ -157,6 +208,11 @@ | |||
157 | /* Internal SLUB flags */ | 208 | /* Internal SLUB flags */ |
158 | #define __OBJECT_POISON 0x80000000 /* Poison object */ | 209 | #define __OBJECT_POISON 0x80000000 /* Poison object */ |
159 | 210 | ||
211 | /* Not all arches define cache_line_size */ | ||
212 | #ifndef cache_line_size | ||
213 | #define cache_line_size() L1_CACHE_BYTES | ||
214 | #endif | ||
215 | |||
160 | static int kmem_size = sizeof(struct kmem_cache); | 216 | static int kmem_size = sizeof(struct kmem_cache); |
161 | 217 | ||
162 | #ifdef CONFIG_SMP | 218 | #ifdef CONFIG_SMP |
@@ -166,7 +222,7 @@ static struct notifier_block slab_notifier; | |||
166 | static enum { | 222 | static enum { |
167 | DOWN, /* No slab functionality available */ | 223 | DOWN, /* No slab functionality available */ |
168 | PARTIAL, /* kmem_cache_open() works but kmalloc does not */ | 224 | PARTIAL, /* kmem_cache_open() works but kmalloc does not */ |
169 | UP, /* Everything works */ | 225 | UP, /* Everything works but does not show up in sysfs */ |
170 | SYSFS /* Sysfs up */ | 226 | SYSFS /* Sysfs up */ |
171 | } slab_state = DOWN; | 227 | } slab_state = DOWN; |
172 | 228 | ||
@@ -174,7 +230,19 @@ static enum { | |||
174 | static DECLARE_RWSEM(slub_lock); | 230 | static DECLARE_RWSEM(slub_lock); |
175 | LIST_HEAD(slab_caches); | 231 | LIST_HEAD(slab_caches); |
176 | 232 | ||
177 | #ifdef CONFIG_SYSFS | 233 | /* |
234 | * Tracking user of a slab. | ||
235 | */ | ||
236 | struct track { | ||
237 | void *addr; /* Called from address */ | ||
238 | int cpu; /* Was running on cpu */ | ||
239 | int pid; /* Pid context */ | ||
240 | unsigned long when; /* When did the operation occur */ | ||
241 | }; | ||
242 | |||
243 | enum track_item { TRACK_ALLOC, TRACK_FREE }; | ||
244 | |||
245 | #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) | ||
178 | static int sysfs_slab_add(struct kmem_cache *); | 246 | static int sysfs_slab_add(struct kmem_cache *); |
179 | static int sysfs_slab_alias(struct kmem_cache *, const char *); | 247 | static int sysfs_slab_alias(struct kmem_cache *, const char *); |
180 | static void sysfs_slab_remove(struct kmem_cache *); | 248 | static void sysfs_slab_remove(struct kmem_cache *); |
@@ -202,6 +270,63 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | |||
202 | #endif | 270 | #endif |
203 | } | 271 | } |
204 | 272 | ||
273 | static inline int check_valid_pointer(struct kmem_cache *s, | ||
274 | struct page *page, const void *object) | ||
275 | { | ||
276 | void *base; | ||
277 | |||
278 | if (!object) | ||
279 | return 1; | ||
280 | |||
281 | base = page_address(page); | ||
282 | if (object < base || object >= base + s->objects * s->size || | ||
283 | (object - base) % s->size) { | ||
284 | return 0; | ||
285 | } | ||
286 | |||
287 | return 1; | ||
288 | } | ||
289 | |||
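
check_valid_pointer() above only accepts an object pointer that lies inside the slab and sits on an exact multiple of s->size from the slab base. The same bounds-and-stride test in standalone form (function and variable names invented):

    #include <stdio.h>
    #include <stddef.h>

    /* Return 1 if obj is NULL or points at the start of one of nr_objects
     * fixed-size slots laid out back to back from base. */
    static int valid_object_pointer(const void *base, size_t obj_size,
                                    unsigned int nr_objects, const void *obj)
    {
        const char *b = base, *o = obj;
        size_t off;

        if (!obj)
            return 1;
        if (o < b)
            return 0;
        off = (size_t)(o - b);
        if (off >= (size_t)nr_objects * obj_size)
            return 0;
        return off % obj_size == 0;
    }

    int main(void)
    {
        char slab[8 * 32];     /* 8 objects, 32 bytes each */

        printf("%d\n", valid_object_pointer(slab, 32, 8, slab + 3 * 32)); /* 1: object #3    */
        printf("%d\n", valid_object_pointer(slab, 32, 8, slab + 40));     /* 0: mid-object   */
        printf("%d\n", valid_object_pointer(slab, 32, 8, slab + 8 * 32)); /* 0: past the end */
        return 0;
    }
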
290 | /* | ||
291 | * Slow version of get and set free pointer. | ||
292 | * | ||
293 | * This version requires touching the cache lines of kmem_cache which | ||
294 | * we avoid doing in the fast alloc free paths. There we obtain the offset | ||
295 | * from the page struct. | ||
296 | */ | ||
297 | static inline void *get_freepointer(struct kmem_cache *s, void *object) | ||
298 | { | ||
299 | return *(void **)(object + s->offset); | ||
300 | } | ||
301 | |||
302 | static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) | ||
303 | { | ||
304 | *(void **)(object + s->offset) = fp; | ||
305 | } | ||
306 | |||
307 | /* Loop over all objects in a slab */ | ||
308 | #define for_each_object(__p, __s, __addr) \ | ||
309 | for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\ | ||
310 | __p += (__s)->size) | ||
311 | |||
312 | /* Scan freelist */ | ||
313 | #define for_each_free_object(__p, __s, __free) \ | ||
314 | for (__p = (__free); __p; __p = get_freepointer((__s), __p)) | ||
315 | |||
316 | /* Determine object index from a given position */ | ||
317 | static inline int slab_index(void *p, struct kmem_cache *s, void *addr) | ||
318 | { | ||
319 | return (p - addr) / s->size; | ||
320 | } | ||
321 | |||
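
get_freepointer()/set_freepointer() above chain free objects together by storing the next pointer inside the object itself, s->offset bytes in, and for_each_free_object() walks that chain. A userspace sketch of building and walking such a freelist over a flat buffer; the constants and helper names are illustrative and the offset is fixed at 0:

    #include <stdio.h>
    #include <string.h>

    #define OBJ_SIZE  64
    #define NR_OBJS   4
    #define FP_OFFSET 0    /* next pointer stored at the start of each free object */

    static void *get_fp(void *object)
    {
        void *fp;

        memcpy(&fp, (char *)object + FP_OFFSET, sizeof(fp));
        return fp;
    }

    static void set_fp(void *object, void *fp)
    {
        memcpy((char *)object + FP_OFFSET, &fp, sizeof(fp));
    }

    int main(void)
    {
        static char slab[NR_OBJS * OBJ_SIZE];
        char *p, *last = slab;
        void *freelist = slab;

        /* Thread every object onto one list, as new_slab() does. */
        for (p = slab + OBJ_SIZE; p < slab + NR_OBJS * OBJ_SIZE; p += OBJ_SIZE) {
            set_fp(last, p);
            last = p;
        }
        set_fp(last, NULL);

        /* Walk it, as for_each_free_object() does. */
        for (p = freelist; p; p = get_fp(p))
            printf("free object at index %ld\n", (long)((p - slab) / OBJ_SIZE));
        return 0;
    }

The kernel version simply dereferences *(void **)(object + s->offset); memcpy() is used here only to sidestep alignment and aliasing concerns in portable C.
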
322 | #ifdef CONFIG_SLUB_DEBUG | ||
323 | /* | ||
324 | * Debug settings: | ||
325 | */ | ||
326 | static int slub_debug; | ||
327 | |||
328 | static char *slub_debug_slabs; | ||
329 | |||
205 | /* | 330 | /* |
206 | * Object debugging | 331 | * Object debugging |
207 | */ | 332 | */ |
@@ -237,35 +362,6 @@ static void print_section(char *text, u8 *addr, unsigned int length) | |||
237 | } | 362 | } |
238 | } | 363 | } |
239 | 364 | ||
240 | /* | ||
241 | * Slow version of get and set free pointer. | ||
242 | * | ||
243 | * This requires touching the cache lines of kmem_cache. | ||
244 | * The offset can also be obtained from the page. In that | ||
245 | * case it is in the cacheline that we already need to touch. | ||
246 | */ | ||
247 | static void *get_freepointer(struct kmem_cache *s, void *object) | ||
248 | { | ||
249 | return *(void **)(object + s->offset); | ||
250 | } | ||
251 | |||
252 | static void set_freepointer(struct kmem_cache *s, void *object, void *fp) | ||
253 | { | ||
254 | *(void **)(object + s->offset) = fp; | ||
255 | } | ||
256 | |||
257 | /* | ||
258 | * Tracking user of a slab. | ||
259 | */ | ||
260 | struct track { | ||
261 | void *addr; /* Called from address */ | ||
262 | int cpu; /* Was running on cpu */ | ||
263 | int pid; /* Pid context */ | ||
264 | unsigned long when; /* When did the operation occur */ | ||
265 | }; | ||
266 | |||
267 | enum track_item { TRACK_ALLOC, TRACK_FREE }; | ||
268 | |||
269 | static struct track *get_track(struct kmem_cache *s, void *object, | 365 | static struct track *get_track(struct kmem_cache *s, void *object, |
270 | enum track_item alloc) | 366 | enum track_item alloc) |
271 | { | 367 | { |
@@ -400,24 +496,6 @@ static int check_bytes(u8 *start, unsigned int value, unsigned int bytes) | |||
400 | return 1; | 496 | return 1; |
401 | } | 497 | } |
402 | 498 | ||
403 | |||
404 | static int check_valid_pointer(struct kmem_cache *s, struct page *page, | ||
405 | void *object) | ||
406 | { | ||
407 | void *base; | ||
408 | |||
409 | if (!object) | ||
410 | return 1; | ||
411 | |||
412 | base = page_address(page); | ||
413 | if (object < base || object >= base + s->objects * s->size || | ||
414 | (object - base) % s->size) { | ||
415 | return 0; | ||
416 | } | ||
417 | |||
418 | return 1; | ||
419 | } | ||
420 | |||
421 | /* | 499 | /* |
422 | * Object layout: | 500 | * Object layout: |
423 | * | 501 | * |
@@ -425,26 +503,34 @@ static int check_valid_pointer(struct kmem_cache *s, struct page *page, | |||
425 | * Bytes of the object to be managed. | 503 | * Bytes of the object to be managed. |
426 | * If the freepointer may overlay the object then the free | 504 | * If the freepointer may overlay the object then the free |
427 | * pointer is the first word of the object. | 505 | * pointer is the first word of the object. |
506 | * | ||
428 | * Poisoning uses 0x6b (POISON_FREE) and the last byte is | 507 | * Poisoning uses 0x6b (POISON_FREE) and the last byte is |
429 | * 0xa5 (POISON_END) | 508 | * 0xa5 (POISON_END) |
430 | * | 509 | * |
431 | * object + s->objsize | 510 | * object + s->objsize |
432 | * Padding to reach word boundary. This is also used for Redzoning. | 511 | * Padding to reach word boundary. This is also used for Redzoning. |
433 | * Padding is extended to word size if Redzoning is enabled | 512 | * Padding is extended by another word if Redzoning is enabled and |
434 | * and objsize == inuse. | 513 | * objsize == inuse. |
514 | * | ||
435 | * We fill with 0xbb (RED_INACTIVE) for inactive objects and with | 515 | * We fill with 0xbb (RED_INACTIVE) for inactive objects and with |
436 | * 0xcc (RED_ACTIVE) for objects in use. | 516 | * 0xcc (RED_ACTIVE) for objects in use. |
437 | * | 517 | * |
438 | * object + s->inuse | 518 | * object + s->inuse |
519 | * Meta data starts here. | ||
520 | * | ||
439 | * A. Free pointer (if we cannot overwrite object on free) | 521 | * A. Free pointer (if we cannot overwrite object on free) |
440 | * B. Tracking data for SLAB_STORE_USER | 522 | * B. Tracking data for SLAB_STORE_USER |
441 | * C. Padding to reach required alignment boundary | 523 | * C. Padding to reach required alignment boundary or at minimum |
442 | * Padding is done using 0x5a (POISON_INUSE) | 524 | * one word if debugging is on to be able to detect writes |
525 | * before the word boundary. | ||
526 | * | ||
527 | * Padding is done using 0x5a (POISON_INUSE) | ||
443 | * | 528 | * |
444 | * object + s->size | 529 | * object + s->size |
530 | * Nothing is used beyond s->size. | ||
445 | * | 531 | * |
446 | * If slabcaches are merged then the objsize and inuse boundaries are to | 532 | * If slabcaches are merged then the objsize and inuse boundaries are mostly |
447 | * be ignored. And therefore no slab options that rely on these boundaries | 533 | * ignored. And therefore no slab options that rely on these boundaries |
448 | * may be used with merged slabcaches. | 534 | * may be used with merged slabcaches. |
449 | */ | 535 | */ |
450 | 536 | ||
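
As a rough worked example of the boundaries described in the layout comment above, here is how objsize, inuse, the free pointer offset and the final size could fall out for a 52-byte object with red zoning, user tracking and an off-object free pointer on a 64-bit machine. This is a simplified model for illustration only, not the kernel's exact size calculation; struct track mirrors the fields listed earlier in this diff:

    #include <stdio.h>

    /* Mirrors the per-object tracking record stored when SLAB_STORE_USER is set. */
    struct track {
        void *addr;
        int cpu;
        int pid;
        unsigned long when;
    };

    /* Round v up to the next multiple of a (a must be a power of two). */
    static unsigned long align_up(unsigned long v, unsigned long a)
    {
        return (v + a - 1) & ~(a - 1);
    }

    int main(void)
    {
        unsigned long word = sizeof(void *);            /* 8 on a 64-bit machine           */
        unsigned long objsize = 52;                     /* payload requested by the caller */

        unsigned long inuse  = align_up(objsize, word); /* payload padded to a word; poison
                                                           and red zone patterns end here  */
        unsigned long offset = inuse;                   /* free pointer kept past the
                                                           payload so poisoning survives   */
        unsigned long size   = offset + word            /* the free pointer itself         */
                             + 2 * sizeof(struct track) /* TRACK_ALLOC and TRACK_FREE      */
                             + word;                    /* at least one word of padding    */

        size = align_up(size, word);
        printf("objsize=%lu inuse=%lu offset=%lu size=%lu\n", objsize, inuse, offset, size);
        return 0;
    }
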
@@ -570,8 +656,7 @@ static int check_object(struct kmem_cache *s, struct page *page, | |||
570 | /* | 656 | /* |
571 | * No choice but to zap it and thus loose the remainder | 657 | * No choice but to zap it and thus loose the remainder |
572 | * of the free objects in this slab. May cause | 658 | * of the free objects in this slab. May cause |
573 | * another error because the object count maybe | 659 | * another error because the object count is now wrong. |
574 | * wrong now. | ||
575 | */ | 660 | */ |
576 | set_freepointer(s, p, NULL); | 661 | set_freepointer(s, p, NULL); |
577 | return 0; | 662 | return 0; |
@@ -611,9 +696,8 @@ static int check_slab(struct kmem_cache *s, struct page *page) | |||
611 | } | 696 | } |
612 | 697 | ||
613 | /* | 698 | /* |
614 | * Determine if a certain object on a page is on the freelist and | 699 | * Determine if a certain object on a page is on the freelist. Must hold the |
615 | * therefore free. Must hold the slab lock for cpu slabs to | 700 | * slab lock to guarantee that the chains are in a consistent state. |
616 | * guarantee that the chains are consistent. | ||
617 | */ | 701 | */ |
618 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) | 702 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) |
619 | { | 703 | { |
@@ -658,8 +742,24 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) | |||
658 | return search == NULL; | 742 | return search == NULL; |
659 | } | 743 | } |
660 | 744 | ||
745 | static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) | ||
746 | { | ||
747 | if (s->flags & SLAB_TRACE) { | ||
748 | printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", | ||
749 | s->name, | ||
750 | alloc ? "alloc" : "free", | ||
751 | object, page->inuse, | ||
752 | page->freelist); | ||
753 | |||
754 | if (!alloc) | ||
755 | print_section("Object", (void *)object, s->objsize); | ||
756 | |||
757 | dump_stack(); | ||
758 | } | ||
759 | } | ||
760 | |||
661 | /* | 761 | /* |
662 | * Tracking of fully allocated slabs for debugging | 762 | * Tracking of fully allocated slabs for debugging purposes. |
663 | */ | 763 | */ |
664 | static void add_full(struct kmem_cache_node *n, struct page *page) | 764 | static void add_full(struct kmem_cache_node *n, struct page *page) |
665 | { | 765 | { |
@@ -682,8 +782,18 @@ static void remove_full(struct kmem_cache *s, struct page *page) | |||
682 | spin_unlock(&n->list_lock); | 782 | spin_unlock(&n->list_lock); |
683 | } | 783 | } |
684 | 784 | ||
685 | static int alloc_object_checks(struct kmem_cache *s, struct page *page, | 785 | static void setup_object_debug(struct kmem_cache *s, struct page *page, |
686 | void *object) | 786 | void *object) |
787 | { | ||
788 | if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) | ||
789 | return; | ||
790 | |||
791 | init_object(s, object, 0); | ||
792 | init_tracking(s, object); | ||
793 | } | ||
794 | |||
795 | static int alloc_debug_processing(struct kmem_cache *s, struct page *page, | ||
796 | void *object, void *addr) | ||
687 | { | 797 | { |
688 | if (!check_slab(s, page)) | 798 | if (!check_slab(s, page)) |
689 | goto bad; | 799 | goto bad; |
@@ -698,19 +808,22 @@ static int alloc_object_checks(struct kmem_cache *s, struct page *page, | |||
698 | goto bad; | 808 | goto bad; |
699 | } | 809 | } |
700 | 810 | ||
701 | if (!object) | 811 | if (object && !check_object(s, page, object, 0)) |
702 | return 1; | ||
703 | |||
704 | if (!check_object(s, page, object, 0)) | ||
705 | goto bad; | 812 | goto bad; |
706 | 813 | ||
814 | /* Success perform special debug activities for allocs */ | ||
815 | if (s->flags & SLAB_STORE_USER) | ||
816 | set_track(s, object, TRACK_ALLOC, addr); | ||
817 | trace(s, page, object, 1); | ||
818 | init_object(s, object, 1); | ||
707 | return 1; | 819 | return 1; |
820 | |||
708 | bad: | 821 | bad: |
709 | if (PageSlab(page)) { | 822 | if (PageSlab(page)) { |
710 | /* | 823 | /* |
711 | * If this is a slab page then lets do the best we can | 824 | * If this is a slab page then lets do the best we can |
712 | * to avoid issues in the future. Marking all objects | 825 | * to avoid issues in the future. Marking all objects |
713 | * as used avoids touching the remainder. | 826 | * as used avoids touching the remaining objects. |
714 | */ | 827 | */ |
715 | printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", | 828 | printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", |
716 | s->name, page); | 829 | s->name, page); |
@@ -722,8 +835,8 @@ bad: | |||
722 | return 0; | 835 | return 0; |
723 | } | 836 | } |
724 | 837 | ||
725 | static int free_object_checks(struct kmem_cache *s, struct page *page, | 838 | static int free_debug_processing(struct kmem_cache *s, struct page *page, |
726 | void *object) | 839 | void *object, void *addr) |
727 | { | 840 | { |
728 | if (!check_slab(s, page)) | 841 | if (!check_slab(s, page)) |
729 | goto fail; | 842 | goto fail; |
@@ -757,13 +870,107 @@ static int free_object_checks(struct kmem_cache *s, struct page *page, | |||
757 | "to slab %s", object, page->slab->name); | 870 | "to slab %s", object, page->slab->name); |
758 | goto fail; | 871 | goto fail; |
759 | } | 872 | } |
873 | |||
874 | /* Special debug activities for freeing objects */ | ||
875 | if (!SlabFrozen(page) && !page->freelist) | ||
876 | remove_full(s, page); | ||
877 | if (s->flags & SLAB_STORE_USER) | ||
878 | set_track(s, object, TRACK_FREE, addr); | ||
879 | trace(s, page, object, 0); | ||
880 | init_object(s, object, 0); | ||
760 | return 1; | 881 | return 1; |
882 | |||
761 | fail: | 883 | fail: |
762 | printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n", | 884 | printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n", |
763 | s->name, page, object); | 885 | s->name, page, object); |
764 | return 0; | 886 | return 0; |
765 | } | 887 | } |
766 | 888 | ||
889 | static int __init setup_slub_debug(char *str) | ||
890 | { | ||
891 | if (!str || *str != '=') | ||
892 | slub_debug = DEBUG_DEFAULT_FLAGS; | ||
893 | else { | ||
894 | str++; | ||
895 | if (*str == 0 || *str == ',') | ||
896 | slub_debug = DEBUG_DEFAULT_FLAGS; | ||
897 | else | ||
898 | for( ;*str && *str != ','; str++) | ||
899 | switch (*str) { | ||
900 | case 'f' : case 'F' : | ||
901 | slub_debug |= SLAB_DEBUG_FREE; | ||
902 | break; | ||
903 | case 'z' : case 'Z' : | ||
904 | slub_debug |= SLAB_RED_ZONE; | ||
905 | break; | ||
906 | case 'p' : case 'P' : | ||
907 | slub_debug |= SLAB_POISON; | ||
908 | break; | ||
909 | case 'u' : case 'U' : | ||
910 | slub_debug |= SLAB_STORE_USER; | ||
911 | break; | ||
912 | case 't' : case 'T' : | ||
913 | slub_debug |= SLAB_TRACE; | ||
914 | break; | ||
915 | default: | ||
916 | printk(KERN_ERR "slub_debug option '%c' " | ||
917 | "unknown. skipped\n",*str); | ||
918 | } | ||
919 | } | ||
920 | |||
921 | if (*str == ',') | ||
922 | slub_debug_slabs = str + 1; | ||
923 | return 1; | ||
924 | } | ||
925 | |||
926 | __setup("slub_debug", setup_slub_debug); | ||
927 | |||
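
setup_slub_debug() above turns a boot option such as slub_debug=FZPU,dentry into a set of debug flags plus an optional slab name filter in slub_debug_slabs. A compact userspace sketch of the same parse; the flag values and names below are placeholders rather than the kernel's SLAB_* constants:

    #include <stdio.h>
    #include <string.h>

    #define DBG_FREE    0x01   /* F: sanity checks on free */
    #define DBG_REDZONE 0x02   /* Z: red zoning            */
    #define DBG_POISON  0x04   /* P: poisoning             */
    #define DBG_USER    0x08   /* U: user tracking         */
    #define DBG_TRACE   0x10   /* T: tracing               */
    #define DBG_ALL     (DBG_FREE | DBG_REDZONE | DBG_POISON | DBG_USER)

    static unsigned int debug_flags;
    static const char *debug_slabs;    /* optional slab name filter after the comma */

    static void parse_slub_debug(const char *str)
    {
        if (!str || *str != '=') {     /* bare "slub_debug" enables the default set */
            debug_flags = DBG_ALL;
            return;
        }
        str++;
        if (*str == '\0' || *str == ',')
            debug_flags = DBG_ALL;
        else
            for (; *str && *str != ','; str++)
                switch (*str) {
                case 'f': case 'F': debug_flags |= DBG_FREE; break;
                case 'z': case 'Z': debug_flags |= DBG_REDZONE; break;
                case 'p': case 'P': debug_flags |= DBG_POISON; break;
                case 'u': case 'U': debug_flags |= DBG_USER; break;
                case 't': case 'T': debug_flags |= DBG_TRACE; break;
                default: fprintf(stderr, "unknown option '%c', skipped\n", *str);
                }
        if (*str == ',')
            debug_slabs = str + 1;
    }

    int main(void)
    {
        parse_slub_debug("=ZPU,kmalloc-64");
        printf("flags=0x%x slabs=%s\n", debug_flags, debug_slabs ? debug_slabs : "(all)");
        return 0;
    }
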
928 | static void kmem_cache_open_debug_check(struct kmem_cache *s) | ||
929 | { | ||
930 | /* | ||
931 | * The page->offset field is only 16 bit wide. This is an offset | ||
932 | * in units of words from the beginning of an object. If the slab | ||
933 | * size is bigger then we cannot move the free pointer behind the | ||
934 | * object anymore. | ||
935 | * | ||
936 | * On 32 bit platforms the limit is 256k. On 64bit platforms | ||
937 | * the limit is 512k. | ||
938 | * | ||
939 | * Debugging or ctor may create a need to move the free | ||
940 | * pointer. Fail if this happens. | ||
941 | */ | ||
942 | if (s->objsize >= 65535 * sizeof(void *)) { | ||
943 | BUG_ON(s->flags & (SLAB_RED_ZONE | SLAB_POISON | | ||
944 | SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); | ||
945 | BUG_ON(s->ctor); | ||
946 | } | ||
947 | else | ||
948 | /* | ||
949 | * Enable debugging if selected on the kernel commandline. | ||
950 | */ | ||
951 | if (slub_debug && (!slub_debug_slabs || | ||
952 | strncmp(slub_debug_slabs, s->name, | ||
953 | strlen(slub_debug_slabs)) == 0)) | ||
954 | s->flags |= slub_debug; | ||
955 | } | ||
956 | #else | ||
957 | static inline void setup_object_debug(struct kmem_cache *s, | ||
958 | struct page *page, void *object) {} | ||
959 | |||
960 | static inline int alloc_debug_processing(struct kmem_cache *s, | ||
961 | struct page *page, void *object, void *addr) { return 0; } | ||
962 | |||
963 | static inline int free_debug_processing(struct kmem_cache *s, | ||
964 | struct page *page, void *object, void *addr) { return 0; } | ||
965 | |||
966 | static inline int slab_pad_check(struct kmem_cache *s, struct page *page) | ||
967 | { return 1; } | ||
968 | static inline int check_object(struct kmem_cache *s, struct page *page, | ||
969 | void *object, int active) { return 1; } | ||
970 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} | ||
971 | static inline void kmem_cache_open_debug_check(struct kmem_cache *s) {} | ||
972 | #define slub_debug 0 | ||
973 | #endif | ||
767 | /* | 974 | /* |
768 | * Slab allocation and freeing | 975 | * Slab allocation and freeing |
769 | */ | 976 | */ |
@@ -797,13 +1004,9 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
797 | static void setup_object(struct kmem_cache *s, struct page *page, | 1004 | static void setup_object(struct kmem_cache *s, struct page *page, |
798 | void *object) | 1005 | void *object) |
799 | { | 1006 | { |
800 | if (PageError(page)) { | 1007 | setup_object_debug(s, page, object); |
801 | init_object(s, object, 0); | ||
802 | init_tracking(s, object); | ||
803 | } | ||
804 | |||
805 | if (unlikely(s->ctor)) | 1008 | if (unlikely(s->ctor)) |
806 | s->ctor(object, s, SLAB_CTOR_CONSTRUCTOR); | 1009 | s->ctor(object, s, 0); |
807 | } | 1010 | } |
808 | 1011 | ||
809 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | 1012 | static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) |
@@ -832,7 +1035,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
832 | page->flags |= 1 << PG_slab; | 1035 | page->flags |= 1 << PG_slab; |
833 | if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | | 1036 | if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | |
834 | SLAB_STORE_USER | SLAB_TRACE)) | 1037 | SLAB_STORE_USER | SLAB_TRACE)) |
835 | page->flags |= 1 << PG_error; | 1038 | SetSlabDebug(page); |
836 | 1039 | ||
837 | start = page_address(page); | 1040 | start = page_address(page); |
838 | end = start + s->objects * s->size; | 1041 | end = start + s->objects * s->size; |
@@ -841,7 +1044,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
841 | memset(start, POISON_INUSE, PAGE_SIZE << s->order); | 1044 | memset(start, POISON_INUSE, PAGE_SIZE << s->order); |
842 | 1045 | ||
843 | last = start; | 1046 | last = start; |
844 | for (p = start + s->size; p < end; p += s->size) { | 1047 | for_each_object(p, s, start) { |
845 | setup_object(s, page, last); | 1048 | setup_object(s, page, last); |
846 | set_freepointer(s, last, p); | 1049 | set_freepointer(s, last, p); |
847 | last = p; | 1050 | last = p; |
@@ -850,6 +1053,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
850 | set_freepointer(s, last, NULL); | 1053 | set_freepointer(s, last, NULL); |
851 | 1054 | ||
852 | page->freelist = start; | 1055 | page->freelist = start; |
1056 | page->lockless_freelist = NULL; | ||
853 | page->inuse = 0; | 1057 | page->inuse = 0; |
854 | out: | 1058 | out: |
855 | if (flags & __GFP_WAIT) | 1059 | if (flags & __GFP_WAIT) |
@@ -861,17 +1065,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
861 | { | 1065 | { |
862 | int pages = 1 << s->order; | 1066 | int pages = 1 << s->order; |
863 | 1067 | ||
864 | if (unlikely(PageError(page) || s->dtor)) { | 1068 | if (unlikely(SlabDebug(page))) { |
865 | void *start = page_address(page); | ||
866 | void *end = start + (pages << PAGE_SHIFT); | ||
867 | void *p; | 1069 | void *p; |
868 | 1070 | ||
869 | slab_pad_check(s, page); | 1071 | slab_pad_check(s, page); |
870 | for (p = start; p <= end - s->size; p += s->size) { | 1072 | for_each_object(p, s, page_address(page)) |
871 | if (s->dtor) | ||
872 | s->dtor(p, s, 0); | ||
873 | check_object(s, page, p, 0); | 1073 | check_object(s, page, p, 0); |
874 | } | ||
875 | } | 1074 | } |
876 | 1075 | ||
877 | mod_zone_page_state(page_zone(page), | 1076 | mod_zone_page_state(page_zone(page), |
@@ -910,7 +1109,8 @@ static void discard_slab(struct kmem_cache *s, struct page *page) | |||
910 | 1109 | ||
911 | atomic_long_dec(&n->nr_slabs); | 1110 | atomic_long_dec(&n->nr_slabs); |
912 | reset_page_mapcount(page); | 1111 | reset_page_mapcount(page); |
913 | page->flags &= ~(1 << PG_slab | 1 << PG_error); | 1112 | ClearSlabDebug(page); |
1113 | __ClearPageSlab(page); | ||
914 | free_slab(s, page); | 1114 | free_slab(s, page); |
915 | } | 1115 | } |
916 | 1116 | ||
@@ -966,22 +1166,23 @@ static void remove_partial(struct kmem_cache *s, | |||
966 | } | 1166 | } |
967 | 1167 | ||
968 | /* | 1168 | /* |
969 | * Lock page and remove it from the partial list | 1169 | * Lock slab and remove from the partial list. |
970 | * | 1170 | * |
971 | * Must hold list_lock | 1171 | * Must hold list_lock. |
972 | */ | 1172 | */ |
973 | static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page) | 1173 | static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page) |
974 | { | 1174 | { |
975 | if (slab_trylock(page)) { | 1175 | if (slab_trylock(page)) { |
976 | list_del(&page->lru); | 1176 | list_del(&page->lru); |
977 | n->nr_partial--; | 1177 | n->nr_partial--; |
1178 | SetSlabFrozen(page); | ||
978 | return 1; | 1179 | return 1; |
979 | } | 1180 | } |
980 | return 0; | 1181 | return 0; |
981 | } | 1182 | } |
982 | 1183 | ||
983 | /* | 1184 | /* |
984 | * Try to get a partial slab from a specific node | 1185 | * Try to allocate a partial slab from a specific node. |
985 | */ | 1186 | */ |
986 | static struct page *get_partial_node(struct kmem_cache_node *n) | 1187 | static struct page *get_partial_node(struct kmem_cache_node *n) |
987 | { | 1188 | { |
@@ -990,14 +1191,15 @@ static struct page *get_partial_node(struct kmem_cache_node *n) | |||
990 | /* | 1191 | /* |
991 | * Racy check. If we mistakenly see no partial slabs then we | 1192 | * Racy check. If we mistakenly see no partial slabs then we |
992 | * just allocate an empty slab. If we mistakenly try to get a | 1193 | * just allocate an empty slab. If we mistakenly try to get a |
993 | * partial slab then get_partials() will return NULL. | 1194 | * partial slab and there is none available then get_partials() |
1195 | * will return NULL. | ||
994 | */ | 1196 | */ |
995 | if (!n || !n->nr_partial) | 1197 | if (!n || !n->nr_partial) |
996 | return NULL; | 1198 | return NULL; |
997 | 1199 | ||
998 | spin_lock(&n->list_lock); | 1200 | spin_lock(&n->list_lock); |
999 | list_for_each_entry(page, &n->partial, lru) | 1201 | list_for_each_entry(page, &n->partial, lru) |
1000 | if (lock_and_del_slab(n, page)) | 1202 | if (lock_and_freeze_slab(n, page)) |
1001 | goto out; | 1203 | goto out; |
1002 | page = NULL; | 1204 | page = NULL; |
1003 | out: | 1205 | out: |
@@ -1006,8 +1208,7 @@ out: | |||
1006 | } | 1208 | } |
1007 | 1209 | ||
1008 | /* | 1210 | /* |
1009 | * Get a page from somewhere. Search in increasing NUMA | 1211 | * Get a page from somewhere. Search in increasing NUMA distances. |
1010 | * distances. | ||
1011 | */ | 1212 | */ |
1012 | static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | 1213 | static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) |
1013 | { | 1214 | { |
@@ -1017,24 +1218,22 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1017 | struct page *page; | 1218 | struct page *page; |
1018 | 1219 | ||
1019 | /* | 1220 | /* |
1020 | * The defrag ratio allows to configure the tradeoffs between | 1221 | * The defrag ratio allows a configuration of the tradeoffs between |
1021 | * inter node defragmentation and node local allocations. | 1222 | * inter node defragmentation and node local allocations. A lower |
1022 | * A lower defrag_ratio increases the tendency to do local | 1223 | * defrag_ratio increases the tendency to do local allocations |
1023 | * allocations instead of scanning throught the partial | 1224 | * instead of attempting to obtain partial slabs from other nodes. |
1024 | * lists on other nodes. | ||
1025 | * | 1225 | * |
1026 | * If defrag_ratio is set to 0 then kmalloc() always | 1226 | * If the defrag_ratio is set to 0 then kmalloc() always |
1027 | * returns node local objects. If its higher then kmalloc() | 1227 | * returns node local objects. If the ratio is higher then kmalloc() |
1028 | * may return off node objects in order to avoid fragmentation. | 1228 | * may return off node objects because partial slabs are obtained |
1029 | * | 1229 | * from other nodes and filled up. |
1030 | * A higher ratio means slabs may be taken from other nodes | ||
1031 | * thus reducing the number of partial slabs on those nodes. | ||
1032 | * | 1230 | * |
1033 | * If /sys/slab/xx/defrag_ratio is set to 100 (which makes | 1231 | * If /sys/slab/xx/defrag_ratio is set to 100 (which makes |
1034 | * defrag_ratio = 1000) then every (well almost) allocation | 1232 | * defrag_ratio = 1000) then every (well almost) allocation will |
1035 | * will first attempt to defrag slab caches on other nodes. This | 1233 | * first attempt to defrag slab caches on other nodes. This means |
1036 | * means scanning over all nodes to look for partial slabs which | 1234 | * scanning over all nodes to look for partial slabs which may be |
1037 | * may be a bit expensive to do on every slab allocation. | 1235 | * expensive if we do it every time we are trying to find a slab |
1236 | * with available objects. | ||
1038 | */ | 1237 | */ |
1039 | if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) | 1238 | if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) |
1040 | return NULL; | 1239 | return NULL; |
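
The check above uses the low ten bits of the cycle counter as a cheap random draw, so roughly defrag_ratio out of every 1024 allocations go looking for remote partial slabs and the rest stay node local. A standalone sketch of that gate with rand() standing in for get_cycles(); the function and variable names are invented:

    #include <stdio.h>
    #include <stdlib.h>

    /* Return 1 if this allocation should search remote nodes for partial slabs. */
    static int try_remote_defrag(unsigned int defrag_ratio)
    {
        if (!defrag_ratio || (unsigned int)rand() % 1024 > defrag_ratio)
            return 0;      /* stay node local */
        return 1;          /* scan other nodes */
    }

    int main(void)
    {
        unsigned int ratio = 100;   /* a sysfs defrag_ratio of 10 is stored as 100 internally */
        int remote = 0, trials = 100000;

        srand(1);
        for (int i = 0; i < trials; i++)
            remote += try_remote_defrag(ratio);

        /* Expect roughly ratio/1024 of the allocations (about 10%) to go remote. */
        printf("remote searches: %d of %d\n", remote, trials);
        return 0;
    }
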
@@ -1079,26 +1278,28 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) | |||
1079 | * | 1278 | * |
1080 | * On exit the slab lock will have been dropped. | 1279 | * On exit the slab lock will have been dropped. |
1081 | */ | 1280 | */ |
1082 | static void putback_slab(struct kmem_cache *s, struct page *page) | 1281 | static void unfreeze_slab(struct kmem_cache *s, struct page *page) |
1083 | { | 1282 | { |
1084 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | 1283 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
1085 | 1284 | ||
1285 | ClearSlabFrozen(page); | ||
1086 | if (page->inuse) { | 1286 | if (page->inuse) { |
1087 | 1287 | ||
1088 | if (page->freelist) | 1288 | if (page->freelist) |
1089 | add_partial(n, page); | 1289 | add_partial(n, page); |
1090 | else if (PageError(page) && (s->flags & SLAB_STORE_USER)) | 1290 | else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) |
1091 | add_full(n, page); | 1291 | add_full(n, page); |
1092 | slab_unlock(page); | 1292 | slab_unlock(page); |
1093 | 1293 | ||
1094 | } else { | 1294 | } else { |
1095 | if (n->nr_partial < MIN_PARTIAL) { | 1295 | if (n->nr_partial < MIN_PARTIAL) { |
1096 | /* | 1296 | /* |
1097 | * Adding an empty page to the partial slabs in order | 1297 | * Adding an empty slab to the partial slabs in order |
1098 | * to avoid page allocator overhead. This page needs to | 1298 | * to avoid page allocator overhead. This slab needs |
1099 | * come after all the others that are not fully empty | 1299 | * to come after the other slabs with objects in |
1100 | * in order to make sure that we do maximum | 1300 | * order to fill them up. That way the size of the |
1101 | * defragmentation. | 1301 | * partial list stays small. kmem_cache_shrink can |
1302 | * reclaim empty slabs from the partial list. | ||
1102 | */ | 1303 | */ |
1103 | add_partial_tail(n, page); | 1304 | add_partial_tail(n, page); |
1104 | slab_unlock(page); | 1305 | slab_unlock(page); |
@@ -1114,10 +1315,25 @@ static void putback_slab(struct kmem_cache *s, struct page *page) | |||
1114 | */ | 1315 | */ |
1115 | static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) | 1316 | static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) |
1116 | { | 1317 | { |
1117 | s->cpu_slab[cpu] = NULL; | 1318 | /* |
1118 | ClearPageActive(page); | 1319 | * Merge cpu freelist into freelist. Typically we get here |
1320 | * because both freelists are empty. So this is unlikely | ||
1321 | * to occur. | ||
1322 | */ | ||
1323 | while (unlikely(page->lockless_freelist)) { | ||
1324 | void **object; | ||
1325 | |||
1326 | /* Retrieve object from cpu_freelist */ | ||
1327 | object = page->lockless_freelist; | ||
1328 | page->lockless_freelist = page->lockless_freelist[page->offset]; | ||
1119 | 1329 | ||
1120 | putback_slab(s, page); | 1330 | /* And put onto the regular freelist */ |
1331 | object[page->offset] = page->freelist; | ||
1332 | page->freelist = object; | ||
1333 | page->inuse--; | ||
1334 | } | ||
1335 | s->cpu_slab[cpu] = NULL; | ||
1336 | unfreeze_slab(s, page); | ||
1121 | } | 1337 | } |
1122 | 1338 | ||
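
deactivate_slab() above drains the per-cpu lockless freelist by popping each object and pushing it onto the regular freelist, dropping inuse as objects become globally visible as free. The same splice in standalone form, with next pointers stored at offset 0 of each free object (all names and the object layout are illustrative):

    #include <stdio.h>
    #include <string.h>

    #define OBJ_SIZE 32
    #define NR_OBJS  6

    static void *get_fp(void *obj) { void *fp; memcpy(&fp, obj, sizeof(fp)); return fp; }
    static void set_fp(void *obj, void *fp) { memcpy(obj, &fp, sizeof(fp)); }

    int main(void)
    {
        static char slab[NR_OBJS * OBJ_SIZE];
        void *freelist = NULL;        /* regular freelist: objects 0 and 1          */
        void *lockless = NULL;        /* per-cpu lockless freelist: objects 2 and 3 */
        int inuse;                    /* objects not on the regular freelist        */
        int i;

        for (i = 1; i >= 0; i--) {
            set_fp(&slab[i * OBJ_SIZE], freelist);
            freelist = &slab[i * OBJ_SIZE];
        }
        for (i = 3; i >= 2; i--) {
            set_fp(&slab[i * OBJ_SIZE], lockless);
            lockless = &slab[i * OBJ_SIZE];
        }
        inuse = 4;    /* objects 2-5: lockless-freelist objects still count as in use */

        /* Merge: pop from the lockless list, push onto the regular list. */
        while (lockless) {
            void *object = lockless;

            lockless = get_fp(object);
            set_fp(object, freelist);
            freelist = object;
            inuse--;
        }

        printf("inuse after merge: %d\n", inuse);   /* 2: only objects 4 and 5 remain allocated */
        return 0;
    }
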
1123 | static void flush_slab(struct kmem_cache *s, struct page *page, int cpu) | 1339 | static void flush_slab(struct kmem_cache *s, struct page *page, int cpu) |
@@ -1160,47 +1376,46 @@ static void flush_all(struct kmem_cache *s) | |||
1160 | } | 1376 | } |
1161 | 1377 | ||
1162 | /* | 1378 | /* |
1163 | * slab_alloc is optimized to only modify two cachelines on the fast path | 1379 | * Slow path. The lockless freelist is empty or we need to perform |
1164 | * (aside from the stack): | 1380 | * debugging duties. |
1381 | * | ||
1382 | * Interrupts are disabled. | ||
1165 | * | 1383 | * |
1166 | * 1. The page struct | 1384 | * Processing is still very fast if new objects have been freed to the |
1167 | * 2. The first cacheline of the object to be allocated. | 1385 | * regular freelist. In that case we simply take over the regular freelist |
1386 | * as the lockless freelist and zap the regular freelist. | ||
1168 | * | 1387 | * |
1169 | * The only cache lines that are read (apart from code) is the | 1388 | * If that is not working then we fall back to the partial lists. We take the |
1170 | * per cpu array in the kmem_cache struct. | 1389 | * first element of the freelist as the object to allocate now and move the |
1390 | * rest of the freelist to the lockless freelist. | ||
1171 | * | 1391 | * |
1172 | * Fastpath is not possible if we need to get a new slab or have | 1392 | * And if we were unable to get a new slab from the partial slab lists then |
1173 | * debugging enabled (which means all slabs are marked with PageError) | 1393 | * we need to allocate a new slab. This is slowest path since we may sleep. |
1174 | */ | 1394 | */ |
1175 | static void *slab_alloc(struct kmem_cache *s, | 1395 | static void *__slab_alloc(struct kmem_cache *s, |
1176 | gfp_t gfpflags, int node, void *addr) | 1396 | gfp_t gfpflags, int node, void *addr, struct page *page) |
1177 | { | 1397 | { |
1178 | struct page *page; | ||
1179 | void **object; | 1398 | void **object; |
1180 | unsigned long flags; | 1399 | int cpu = smp_processor_id(); |
1181 | int cpu; | ||
1182 | 1400 | ||
1183 | local_irq_save(flags); | ||
1184 | cpu = smp_processor_id(); | ||
1185 | page = s->cpu_slab[cpu]; | ||
1186 | if (!page) | 1401 | if (!page) |
1187 | goto new_slab; | 1402 | goto new_slab; |
1188 | 1403 | ||
1189 | slab_lock(page); | 1404 | slab_lock(page); |
1190 | if (unlikely(node != -1 && page_to_nid(page) != node)) | 1405 | if (unlikely(node != -1 && page_to_nid(page) != node)) |
1191 | goto another_slab; | 1406 | goto another_slab; |
1192 | redo: | 1407 | load_freelist: |
1193 | object = page->freelist; | 1408 | object = page->freelist; |
1194 | if (unlikely(!object)) | 1409 | if (unlikely(!object)) |
1195 | goto another_slab; | 1410 | goto another_slab; |
1196 | if (unlikely(PageError(page))) | 1411 | if (unlikely(SlabDebug(page))) |
1197 | goto debug; | 1412 | goto debug; |
1198 | 1413 | ||
1199 | have_object: | 1414 | object = page->freelist; |
1200 | page->inuse++; | 1415 | page->lockless_freelist = object[page->offset]; |
1201 | page->freelist = object[page->offset]; | 1416 | page->inuse = s->objects; |
1417 | page->freelist = NULL; | ||
1202 | slab_unlock(page); | 1418 | slab_unlock(page); |
1203 | local_irq_restore(flags); | ||
1204 | return object; | 1419 | return object; |
1205 | 1420 | ||
1206 | another_slab: | 1421 | another_slab: |
@@ -1208,11 +1423,9 @@ another_slab: | |||
1208 | 1423 | ||
1209 | new_slab: | 1424 | new_slab: |
1210 | page = get_partial(s, gfpflags, node); | 1425 | page = get_partial(s, gfpflags, node); |
1211 | if (likely(page)) { | 1426 | if (page) { |
1212 | have_slab: | ||
1213 | s->cpu_slab[cpu] = page; | 1427 | s->cpu_slab[cpu] = page; |
1214 | SetPageActive(page); | 1428 | goto load_freelist; |
1215 | goto redo; | ||
1216 | } | 1429 | } |
1217 | 1430 | ||
1218 | page = new_slab(s, gfpflags, node); | 1431 | page = new_slab(s, gfpflags, node); |
@@ -1220,9 +1433,11 @@ have_slab: | |||
1220 | cpu = smp_processor_id(); | 1433 | cpu = smp_processor_id(); |
1221 | if (s->cpu_slab[cpu]) { | 1434 | if (s->cpu_slab[cpu]) { |
1222 | /* | 1435 | /* |
1223 | * Someone else populated the cpu_slab while we enabled | 1436 | * Someone else populated the cpu_slab while we |
1224 | * interrupts, or we have got scheduled on another cpu. | 1437 | * enabled interrupts, or we have gotten scheduled |
1225 | * The page may not be on the requested node. | 1438 | * on another cpu. The page may not be on the |
1439 | * requested node even if __GFP_THISNODE was | ||
1440 | * specified. So we need to recheck. | ||
1226 | */ | 1441 | */ |
1227 | if (node == -1 || | 1442 | if (node == -1 || |
1228 | page_to_nid(s->cpu_slab[cpu]) == node) { | 1443 | page_to_nid(s->cpu_slab[cpu]) == node) { |
@@ -1233,29 +1448,58 @@ have_slab: | |||
1233 | discard_slab(s, page); | 1448 | discard_slab(s, page); |
1234 | page = s->cpu_slab[cpu]; | 1449 | page = s->cpu_slab[cpu]; |
1235 | slab_lock(page); | 1450 | slab_lock(page); |
1236 | goto redo; | 1451 | goto load_freelist; |
1237 | } | 1452 | } |
1238 | /* Dump the current slab */ | 1453 | /* New slab does not fit our expectations */ |
1239 | flush_slab(s, s->cpu_slab[cpu], cpu); | 1454 | flush_slab(s, s->cpu_slab[cpu], cpu); |
1240 | } | 1455 | } |
1241 | slab_lock(page); | 1456 | slab_lock(page); |
1242 | goto have_slab; | 1457 | SetSlabFrozen(page); |
1458 | s->cpu_slab[cpu] = page; | ||
1459 | goto load_freelist; | ||
1243 | } | 1460 | } |
1244 | local_irq_restore(flags); | ||
1245 | return NULL; | 1461 | return NULL; |
1246 | debug: | 1462 | debug: |
1247 | if (!alloc_object_checks(s, page, object)) | 1463 | object = page->freelist; |
1464 | if (!alloc_debug_processing(s, page, object, addr)) | ||
1248 | goto another_slab; | 1465 | goto another_slab; |
1249 | if (s->flags & SLAB_STORE_USER) | 1466 | |
1250 | set_track(s, object, TRACK_ALLOC, addr); | 1467 | page->inuse++; |
1251 | if (s->flags & SLAB_TRACE) { | 1468 | page->freelist = object[page->offset]; |
1252 | printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n", | 1469 | slab_unlock(page); |
1253 | s->name, object, page->inuse, | 1470 | return object; |
1254 | page->freelist); | 1471 | } |
1255 | dump_stack(); | 1472 | |
1473 | /* | ||
1474 | * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) | ||
1475 | * have the fastpath folded into their functions. So no function call | ||
1476 | * overhead for requests that can be satisfied on the fastpath. | ||
1477 | * | ||
1478 | * The fastpath works by first checking if the lockless freelist can be used. | ||
1479 | * If not then __slab_alloc is called for slow processing. | ||
1480 | * | ||
1481 | * Otherwise we can simply pick the next object from the lockless free list. | ||
1482 | */ | ||
1483 | static void __always_inline *slab_alloc(struct kmem_cache *s, | ||
1484 | gfp_t gfpflags, int node, void *addr) | ||
1485 | { | ||
1486 | struct page *page; | ||
1487 | void **object; | ||
1488 | unsigned long flags; | ||
1489 | |||
1490 | local_irq_save(flags); | ||
1491 | page = s->cpu_slab[smp_processor_id()]; | ||
1492 | if (unlikely(!page || !page->lockless_freelist || | ||
1493 | (node != -1 && page_to_nid(page) != node))) | ||
1494 | |||
1495 | object = __slab_alloc(s, gfpflags, node, addr, page); | ||
1496 | |||
1497 | else { | ||
1498 | object = page->lockless_freelist; | ||
1499 | page->lockless_freelist = object[page->offset]; | ||
1256 | } | 1500 | } |
1257 | init_object(s, object, 1); | 1501 | local_irq_restore(flags); |
1258 | goto have_object; | 1502 | return object; |
1259 | } | 1503 | } |
1260 | 1504 | ||
1261 | void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) | 1505 | void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) |
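
Taken together, the new __slab_alloc()/slab_alloc() pair implements a two-level allocator: the inlined fastpath only pops the per-cpu lockless freelist, and the slow path refills that list by taking over the slab's regular freelist (or a fresh slab) wholesale. A compact user-space sketch of that idea follows; the toy_ names are invented for illustration and all locking, interrupt and NUMA handling is omitted.

/* Sketch: fastpath pops the lockless list, slowpath takes over the rest. */
#include <stdio.h>

struct toy_slab {
	void *lockless_freelist;   /* consumed without the slab lock */
	void *regular_freelist;    /* refilled under the slab lock   */
	int inuse, objects;
};

static void *toy_slow_alloc(struct toy_slab *s)    /* ~__slab_alloc */
{
	void **object = s->regular_freelist;

	if (!object)
		return NULL;               /* would go get a new slab here */
	s->lockless_freelist = *object;    /* rest becomes the fast list   */
	s->regular_freelist = NULL;
	s->inuse = s->objects;             /* all objects now handed out   */
	return object;
}

static void *toy_alloc(struct toy_slab *s)         /* ~slab_alloc */
{
	void **object = s->lockless_freelist;

	if (!object)
		return toy_slow_alloc(s);
	s->lockless_freelist = *object;    /* fastpath: just advance */
	return object;
}

int main(void)
{
	void *slots[3] = { &slots[1], &slots[2], NULL };  /* chained objects */
	struct toy_slab slab = {
		.regular_freelist = &slots[0], .objects = 3,
	};

	for (int i = 0; i < 3; i++)
		printf("alloc %d -> %p\n", i, toy_alloc(&slab));
	return 0;
}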
@@ -1273,33 +1517,29 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); | |||
1273 | #endif | 1517 | #endif |
1274 | 1518 | ||
1275 | /* | 1519 | /* |
1276 | * The fastpath only writes the cacheline of the page struct and the first | 1520 | * Slow path handling. This may still be called frequently since objects |
1277 | * cacheline of the object. | 1521 | * have a longer lifetime than the cpu slabs in most processing loads. |
1278 | * | 1522 | * |
1279 | * No special cachelines need to be read | 1523 | * So we still attempt to reduce cache line usage. Just take the slab |
1524 | * lock and free the item. If there is no additional partial page | ||
1525 | * handling required then we can return immediately. | ||
1280 | */ | 1526 | */ |
1281 | static void slab_free(struct kmem_cache *s, struct page *page, | 1527 | static void __slab_free(struct kmem_cache *s, struct page *page, |
1282 | void *x, void *addr) | 1528 | void *x, void *addr) |
1283 | { | 1529 | { |
1284 | void *prior; | 1530 | void *prior; |
1285 | void **object = (void *)x; | 1531 | void **object = (void *)x; |
1286 | unsigned long flags; | ||
1287 | 1532 | ||
1288 | local_irq_save(flags); | ||
1289 | slab_lock(page); | 1533 | slab_lock(page); |
1290 | 1534 | ||
1291 | if (unlikely(PageError(page))) | 1535 | if (unlikely(SlabDebug(page))) |
1292 | goto debug; | 1536 | goto debug; |
1293 | checks_ok: | 1537 | checks_ok: |
1294 | prior = object[page->offset] = page->freelist; | 1538 | prior = object[page->offset] = page->freelist; |
1295 | page->freelist = object; | 1539 | page->freelist = object; |
1296 | page->inuse--; | 1540 | page->inuse--; |
1297 | 1541 | ||
1298 | if (unlikely(PageActive(page))) | 1542 | if (unlikely(SlabFrozen(page))) |
1299 | /* | ||
1300 | * Cpu slabs are never on partial lists and are | ||
1301 | * never freed. | ||
1302 | */ | ||
1303 | goto out_unlock; | 1543 | goto out_unlock; |
1304 | 1544 | ||
1305 | if (unlikely(!page->inuse)) | 1545 | if (unlikely(!page->inuse)) |
@@ -1315,39 +1555,53 @@ checks_ok: | |||
1315 | 1555 | ||
1316 | out_unlock: | 1556 | out_unlock: |
1317 | slab_unlock(page); | 1557 | slab_unlock(page); |
1318 | local_irq_restore(flags); | ||
1319 | return; | 1558 | return; |
1320 | 1559 | ||
1321 | slab_empty: | 1560 | slab_empty: |
1322 | if (prior) | 1561 | if (prior) |
1323 | /* | 1562 | /* |
1324 | * Slab on the partial list. | 1563 | * Slab still on the partial list. |
1325 | */ | 1564 | */ |
1326 | remove_partial(s, page); | 1565 | remove_partial(s, page); |
1327 | 1566 | ||
1328 | slab_unlock(page); | 1567 | slab_unlock(page); |
1329 | discard_slab(s, page); | 1568 | discard_slab(s, page); |
1330 | local_irq_restore(flags); | ||
1331 | return; | 1569 | return; |
1332 | 1570 | ||
1333 | debug: | 1571 | debug: |
1334 | if (!free_object_checks(s, page, x)) | 1572 | if (!free_debug_processing(s, page, x, addr)) |
1335 | goto out_unlock; | 1573 | goto out_unlock; |
1336 | if (!PageActive(page) && !page->freelist) | ||
1337 | remove_full(s, page); | ||
1338 | if (s->flags & SLAB_STORE_USER) | ||
1339 | set_track(s, x, TRACK_FREE, addr); | ||
1340 | if (s->flags & SLAB_TRACE) { | ||
1341 | printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n", | ||
1342 | s->name, object, page->inuse, | ||
1343 | page->freelist); | ||
1344 | print_section("Object", (void *)object, s->objsize); | ||
1345 | dump_stack(); | ||
1346 | } | ||
1347 | init_object(s, object, 0); | ||
1348 | goto checks_ok; | 1574 | goto checks_ok; |
1349 | } | 1575 | } |
1350 | 1576 | ||
1577 | /* | ||
1578 | * Fastpath with forced inlining to produce a kfree and kmem_cache_free that | ||
1579 | * can perform fastpath freeing without additional function calls. | ||
1580 | * | ||
1581 | * The fastpath is only possible if we are freeing to the current cpu slab | ||
1582 | * of this processor. This is typically the case if we have just allocated | ||
1583 | * the item before. | ||
1584 | * | ||
1585 | * If fastpath is not possible then fall back to __slab_free where we deal | ||
1586 | * with all sorts of special processing. | ||
1587 | */ | ||
1588 | static void __always_inline slab_free(struct kmem_cache *s, | ||
1589 | struct page *page, void *x, void *addr) | ||
1590 | { | ||
1591 | void **object = (void *)x; | ||
1592 | unsigned long flags; | ||
1593 | |||
1594 | local_irq_save(flags); | ||
1595 | if (likely(page == s->cpu_slab[smp_processor_id()] && | ||
1596 | !SlabDebug(page))) { | ||
1597 | object[page->offset] = page->lockless_freelist; | ||
1598 | page->lockless_freelist = object; | ||
1599 | } else | ||
1600 | __slab_free(s, page, x, addr); | ||
1601 | |||
1602 | local_irq_restore(flags); | ||
1603 | } | ||
1604 | |||
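
The freeing side mirrors the same split: slab_free() pushes onto the lockless freelist when the object belongs to the current cpu slab and debugging is off, while __slab_free() takes the slab lock and links the object into the regular freelist. A user-space sketch under the same simplifications as the allocation example above (invented toy_ names, no locking or partial-list handling):

/* Sketch: free to the lockless list if this is the cpu slab, else slow path. */
#include <stdio.h>

struct toy_slab {
	void *lockless_freelist;
	void *regular_freelist;
	int inuse;
};

static void toy_free(struct toy_slab *cpu_slab, struct toy_slab *slab,
		     void *x)
{
	void **object = x;

	if (slab == cpu_slab) {			/* fastpath: no slab lock */
		*object = slab->lockless_freelist;
		slab->lockless_freelist = object;
	} else {				/* ~__slab_free */
		*object = slab->regular_freelist;
		slab->regular_freelist = object;
		slab->inuse--;
	}
}

int main(void)
{
	void *obj_a, *obj_b;
	struct toy_slab cpu = { .inuse = 1 }, other = { .inuse = 1 };

	toy_free(&cpu, &cpu, &obj_a);	/* lands on the lockless freelist */
	toy_free(&cpu, &other, &obj_b);	/* lands on the regular freelist  */
	printf("other.inuse=%d\n", other.inuse);	/* prints 0 */
	return 0;
}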
1351 | void kmem_cache_free(struct kmem_cache *s, void *x) | 1605 | void kmem_cache_free(struct kmem_cache *s, void *x) |
1352 | { | 1606 | { |
1353 | struct page *page; | 1607 | struct page *page; |
@@ -1370,22 +1624,16 @@ static struct page *get_object_page(const void *x) | |||
1370 | } | 1624 | } |
1371 | 1625 | ||
1372 | /* | 1626 | /* |
1373 | * kmem_cache_open produces objects aligned at "size" and the first object | 1627 | * Object placement in a slab is made very easy because we always start at |
1374 | * is placed at offset 0 in the slab (We have no metainformation on the | 1628 | * offset 0. If we tune the size of the object to the alignment then we can |
1375 | * slab, all slabs are in essence "off slab"). | 1629 | * get the required alignment by putting one properly sized object after |
1376 | * | 1630 | * another. |
1377 | * In order to get the desired alignment one just needs to align the | ||
1378 | * size. | ||
1379 | * | 1631 | * |
1380 | * Notice that the allocation order determines the sizes of the per cpu | 1632 | * Notice that the allocation order determines the sizes of the per cpu |
1381 | * caches. Each processor has always one slab available for allocations. | 1633 | * caches. Each processor has always one slab available for allocations. |
1382 | * Increasing the allocation order reduces the number of times that slabs | 1634 | * Increasing the allocation order reduces the number of times that slabs |
1383 | * must be moved on and off the partial lists and therefore may influence | 1635 | * must be moved on and off the partial lists and is therefore a factor in |
1384 | * locking overhead. | 1636 | * locking overhead. |
1385 | * | ||
1386 | * The offset is used to relocate the free list link in each object. It is | ||
1387 | * therefore possible to move the free list link behind the object. This | ||
1388 | * is necessary for RCU to work properly and also useful for debugging. | ||
1389 | */ | 1637 | */ |
1390 | 1638 | ||
1391 | /* | 1639 | /* |
@@ -1396,76 +1644,110 @@ static struct page *get_object_page(const void *x) | |||
1396 | */ | 1644 | */ |
1397 | static int slub_min_order; | 1645 | static int slub_min_order; |
1398 | static int slub_max_order = DEFAULT_MAX_ORDER; | 1646 | static int slub_max_order = DEFAULT_MAX_ORDER; |
1399 | |||
1400 | /* | ||
1401 | * Minimum number of objects per slab. This is necessary in order to | ||
1402 | * reduce locking overhead. Similar to the queue size in SLAB. | ||
1403 | */ | ||
1404 | static int slub_min_objects = DEFAULT_MIN_OBJECTS; | 1647 | static int slub_min_objects = DEFAULT_MIN_OBJECTS; |
1405 | 1648 | ||
1406 | /* | 1649 | /* |
1407 | * Merge control. If this is set then no merging of slab caches will occur. | 1650 | * Merge control. If this is set then no merging of slab caches will occur. |
1651 | * (Could be removed. This was introduced to pacify the merge skeptics.) | ||
1408 | */ | 1652 | */ |
1409 | static int slub_nomerge; | 1653 | static int slub_nomerge; |
1410 | 1654 | ||
1411 | /* | 1655 | /* |
1412 | * Debug settings: | ||
1413 | */ | ||
1414 | static int slub_debug; | ||
1415 | |||
1416 | static char *slub_debug_slabs; | ||
1417 | |||
1418 | /* | ||
1419 | * Calculate the order of allocation given an slab object size. | 1656 | * Calculate the order of allocation given an slab object size. |
1420 | * | 1657 | * |
1421 | * The order of allocation has significant impact on other elements | 1658 | * The order of allocation has significant impact on performance and other |
1422 | * of the system. Generally order 0 allocations should be preferred | 1659 | * system components. Generally order 0 allocations should be preferred since |
1423 | * since they do not cause fragmentation in the page allocator. Larger | 1660 | * order 0 does not cause fragmentation in the page allocator. Larger objects |
1424 | * objects may have problems with order 0 because there may be too much | 1661 | * can be problematic to put into order 0 slabs because there may be too much |
1425 | * space left unused in a slab. We go to a higher order if more than 1/8th | 1662 | * unused space left. We go to a higher order if more than 1/8th of the slab |
1426 | * of the slab would be wasted. | 1663 | * would be wasted. |
1427 | * | 1664 | * |
1428 | * In order to reach satisfactory performance we must ensure that | 1665 | * In order to reach satisfactory performance we must ensure that a minimum |
1429 | * a minimum number of objects is in one slab. Otherwise we may | 1666 | * number of objects is in one slab. Otherwise we may generate too much |
1430 | * generate too much activity on the partial lists. This is less a | 1667 | * activity on the partial lists which requires taking the list_lock. This is |
1431 | * concern for large slabs though. slub_max_order specifies the order | 1668 | * less a concern for large slabs though which are rarely used. |
1432 | * where we begin to stop considering the number of objects in a slab. | ||
1433 | * | 1669 | * |
1434 | * Higher order allocations also allow the placement of more objects | 1670 | * slub_max_order specifies the order where we begin to stop considering the |
1435 | * in a slab and thereby reduce object handling overhead. If the user | 1671 | * number of objects in a slab as critical. If we reach slub_max_order then |
1436 | * has requested a higher mininum order then we start with that one | 1672 | * we try to keep the page order as low as possible. So we accept more waste |
1437 | * instead of zero. | 1673 | * of space in favor of a small page order. |
1674 | * | ||
1675 | * Higher order allocations also allow the placement of more objects in a | ||
1676 | * slab and thereby reduce object handling overhead. If the user has | ||
1677 | * requested a higher minimum order then we start with that one instead of | ||
1678 | * the smallest order which will fit the object. | ||
1438 | */ | 1679 | */ |
1439 | static int calculate_order(int size) | 1680 | static inline int slab_order(int size, int min_objects, |
1681 | int max_order, int fract_leftover) | ||
1440 | { | 1682 | { |
1441 | int order; | 1683 | int order; |
1442 | int rem; | 1684 | int rem; |
1443 | 1685 | ||
1444 | for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT); | 1686 | for (order = max(slub_min_order, |
1445 | order < MAX_ORDER; order++) { | 1687 | fls(min_objects * size - 1) - PAGE_SHIFT); |
1446 | unsigned long slab_size = PAGE_SIZE << order; | 1688 | order <= max_order; order++) { |
1447 | 1689 | ||
1448 | if (slub_max_order > order && | 1690 | unsigned long slab_size = PAGE_SIZE << order; |
1449 | slab_size < slub_min_objects * size) | ||
1450 | continue; | ||
1451 | 1691 | ||
1452 | if (slab_size < size) | 1692 | if (slab_size < min_objects * size) |
1453 | continue; | 1693 | continue; |
1454 | 1694 | ||
1455 | rem = slab_size % size; | 1695 | rem = slab_size % size; |
1456 | 1696 | ||
1457 | if (rem <= (PAGE_SIZE << order) / 8) | 1697 | if (rem <= slab_size / fract_leftover) |
1458 | break; | 1698 | break; |
1459 | 1699 | ||
1460 | } | 1700 | } |
1461 | if (order >= MAX_ORDER) | 1701 | |
1462 | return -E2BIG; | ||
1463 | return order; | 1702 | return order; |
1464 | } | 1703 | } |
1465 | 1704 | ||
1705 | static inline int calculate_order(int size) | ||
1706 | { | ||
1707 | int order; | ||
1708 | int min_objects; | ||
1709 | int fraction; | ||
1710 | |||
1711 | /* | ||
1712 | * Attempt to find best configuration for a slab. This | ||
1713 | * works by first attempting to generate a layout with | ||
1714 | * the best configuration and backing off gradually. | ||
1715 | * | ||
1716 | * First we reduce the acceptable waste in a slab. Then | ||
1717 | * we reduce the minimum objects required in a slab. | ||
1718 | */ | ||
1719 | min_objects = slub_min_objects; | ||
1720 | while (min_objects > 1) { | ||
1721 | fraction = 8; | ||
1722 | while (fraction >= 4) { | ||
1723 | order = slab_order(size, min_objects, | ||
1724 | slub_max_order, fraction); | ||
1725 | if (order <= slub_max_order) | ||
1726 | return order; | ||
1727 | fraction /= 2; | ||
1728 | } | ||
1729 | min_objects /= 2; | ||
1730 | } | ||
1731 | |||
1732 | /* | ||
1733 | * We were unable to place multiple objects in a slab. Now | ||
1734 | * lets see if we can place a single object there. | ||
1735 | */ | ||
1736 | order = slab_order(size, 1, slub_max_order, 1); | ||
1737 | if (order <= slub_max_order) | ||
1738 | return order; | ||
1739 | |||
1740 | /* | ||
1741 | * Doh this slab cannot be placed using slub_max_order. | ||
1742 | */ | ||
1743 | order = slab_order(size, 1, MAX_ORDER, 1); | ||
1744 | if (order <= MAX_ORDER) | ||
1745 | return order; | ||
1746 | return -ENOSYS; | ||
1747 | } | ||
1748 | |||
1466 | /* | 1749 | /* |
1467 | * Function to figure out which alignment to use from the | 1750 | * Figure out what the alignment of the objects will be. |
1468 | * various ways of specifying it. | ||
1469 | */ | 1751 | */ |
1470 | static unsigned long calculate_alignment(unsigned long flags, | 1752 | static unsigned long calculate_alignment(unsigned long flags, |
1471 | unsigned long align, unsigned long size) | 1753 | unsigned long align, unsigned long size) |
@@ -1480,8 +1762,8 @@ static unsigned long calculate_alignment(unsigned long flags, | |||
1480 | * then use it. | 1762 | * then use it. |
1481 | */ | 1763 | */ |
1482 | if ((flags & SLAB_HWCACHE_ALIGN) && | 1764 | if ((flags & SLAB_HWCACHE_ALIGN) && |
1483 | size > L1_CACHE_BYTES / 2) | 1765 | size > cache_line_size() / 2) |
1484 | return max_t(unsigned long, align, L1_CACHE_BYTES); | 1766 | return max_t(unsigned long, align, cache_line_size()); |
1485 | 1767 | ||
1486 | if (align < ARCH_SLAB_MINALIGN) | 1768 | if (align < ARCH_SLAB_MINALIGN) |
1487 | return ARCH_SLAB_MINALIGN; | 1769 | return ARCH_SLAB_MINALIGN; |
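
The hardware-alignment rule shown above only rounds an object up to a full cache line when the object is already bigger than half a line, so that several small objects can still share one line. A tiny illustration, assuming a 64-byte line where the kernel would query cache_line_size() at boot:

/* Illustrative; toy_alignment is not a kernel function. */
#include <stdio.h>

#define CACHE_LINE 64UL

static unsigned long toy_alignment(int hwcache_align, unsigned long align,
				   unsigned long size)
{
	if (hwcache_align && size > CACHE_LINE / 2)
		return align > CACHE_LINE ? align : CACHE_LINE;
	return align;
}

int main(void)
{
	printf("40-byte object: align %lu\n", toy_alignment(1, 8, 40)); /* 64 */
	printf("16-byte object: align %lu\n", toy_alignment(1, 8, 16)); /*  8 */
	return 0;
}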
@@ -1525,7 +1807,7 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag | |||
1525 | page->freelist = get_freepointer(kmalloc_caches, n); | 1807 | page->freelist = get_freepointer(kmalloc_caches, n); |
1526 | page->inuse++; | 1808 | page->inuse++; |
1527 | kmalloc_caches->node[node] = n; | 1809 | kmalloc_caches->node[node] = n; |
1528 | init_object(kmalloc_caches, n, 1); | 1810 | setup_object_debug(kmalloc_caches, page, n); |
1529 | init_kmem_cache_node(n); | 1811 | init_kmem_cache_node(n); |
1530 | atomic_long_inc(&n->nr_slabs); | 1812 | atomic_long_inc(&n->nr_slabs); |
1531 | add_partial(n, page); | 1813 | add_partial(n, page); |
@@ -1607,7 +1889,7 @@ static int calculate_sizes(struct kmem_cache *s) | |||
1607 | * then we should never poison the object itself. | 1889 | * then we should never poison the object itself. |
1608 | */ | 1890 | */ |
1609 | if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && | 1891 | if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && |
1610 | !s->ctor && !s->dtor) | 1892 | !s->ctor) |
1611 | s->flags |= __OBJECT_POISON; | 1893 | s->flags |= __OBJECT_POISON; |
1612 | else | 1894 | else |
1613 | s->flags &= ~__OBJECT_POISON; | 1895 | s->flags &= ~__OBJECT_POISON; |
@@ -1619,24 +1901,24 @@ static int calculate_sizes(struct kmem_cache *s) | |||
1619 | */ | 1901 | */ |
1620 | size = ALIGN(size, sizeof(void *)); | 1902 | size = ALIGN(size, sizeof(void *)); |
1621 | 1903 | ||
1904 | #ifdef CONFIG_SLUB_DEBUG | ||
1622 | /* | 1905 | /* |
1623 | * If we are redzoning then check if there is some space between the | 1906 | * If we are Redzoning then check if there is some space between the |
1624 | * end of the object and the free pointer. If not then add an | 1907 | * end of the object and the free pointer. If not then add an |
1625 | * additional word, so that we can establish a redzone between | 1908 | * additional word to have some bytes to store Redzone information. |
1626 | * the object and the freepointer to be able to check for overwrites. | ||
1627 | */ | 1909 | */ |
1628 | if ((flags & SLAB_RED_ZONE) && size == s->objsize) | 1910 | if ((flags & SLAB_RED_ZONE) && size == s->objsize) |
1629 | size += sizeof(void *); | 1911 | size += sizeof(void *); |
1912 | #endif | ||
1630 | 1913 | ||
1631 | /* | 1914 | /* |
1632 | * With that we have determined how much of the slab is in actual | 1915 | * With that we have determined the number of bytes in actual use |
1633 | * use by the object. This is the potential offset to the free | 1916 | * by the object. This is the potential offset to the free pointer. |
1634 | * pointer. | ||
1635 | */ | 1917 | */ |
1636 | s->inuse = size; | 1918 | s->inuse = size; |
1637 | 1919 | ||
1638 | if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || | 1920 | if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || |
1639 | s->ctor || s->dtor)) { | 1921 | s->ctor)) { |
1640 | /* | 1922 | /* |
1641 | * Relocate free pointer after the object if it is not | 1923 | * Relocate free pointer after the object if it is not |
1642 | * permitted to overwrite the first word of the object on | 1924 | * permitted to overwrite the first word of the object on |
@@ -1649,6 +1931,7 @@ static int calculate_sizes(struct kmem_cache *s) | |||
1649 | size += sizeof(void *); | 1931 | size += sizeof(void *); |
1650 | } | 1932 | } |
1651 | 1933 | ||
1934 | #ifdef CONFIG_SLUB_DEBUG | ||
1652 | if (flags & SLAB_STORE_USER) | 1935 | if (flags & SLAB_STORE_USER) |
1653 | /* | 1936 | /* |
1654 | * Need to store information about allocs and frees after | 1937 | * Need to store information about allocs and frees after |
@@ -1656,7 +1939,7 @@ static int calculate_sizes(struct kmem_cache *s) | |||
1656 | */ | 1939 | */ |
1657 | size += 2 * sizeof(struct track); | 1940 | size += 2 * sizeof(struct track); |
1658 | 1941 | ||
1659 | if (flags & DEBUG_DEFAULT_FLAGS) | 1942 | if (flags & SLAB_RED_ZONE) |
1660 | /* | 1943 | /* |
1661 | * Add some empty padding so that we can catch | 1944 | * Add some empty padding so that we can catch |
1662 | * overwrites from earlier objects rather than let | 1945 | * overwrites from earlier objects rather than let |
@@ -1665,10 +1948,12 @@ static int calculate_sizes(struct kmem_cache *s) | |||
1665 | * of the object. | 1948 | * of the object. |
1666 | */ | 1949 | */ |
1667 | size += sizeof(void *); | 1950 | size += sizeof(void *); |
1951 | #endif | ||
1952 | |||
1668 | /* | 1953 | /* |
1669 | * Determine the alignment based on various parameters that the | 1954 | * Determine the alignment based on various parameters that the |
1670 | * user specified (this is unecessarily complex due to the attempt | 1955 | * user specified and the dynamic determination of cache line size |
1671 | * to be compatible with SLAB. Should be cleaned up some day). | 1956 | * on bootup. |
1672 | */ | 1957 | */ |
1673 | align = calculate_alignment(flags, align, s->objsize); | 1958 | align = calculate_alignment(flags, align, s->objsize); |
1674 | 1959 | ||
@@ -1700,62 +1985,18 @@ static int calculate_sizes(struct kmem_cache *s) | |||
1700 | 1985 | ||
1701 | } | 1986 | } |
1702 | 1987 | ||
1703 | static int __init finish_bootstrap(void) | ||
1704 | { | ||
1705 | struct list_head *h; | ||
1706 | int err; | ||
1707 | |||
1708 | slab_state = SYSFS; | ||
1709 | |||
1710 | list_for_each(h, &slab_caches) { | ||
1711 | struct kmem_cache *s = | ||
1712 | container_of(h, struct kmem_cache, list); | ||
1713 | |||
1714 | err = sysfs_slab_add(s); | ||
1715 | BUG_ON(err); | ||
1716 | } | ||
1717 | return 0; | ||
1718 | } | ||
1719 | |||
1720 | static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | 1988 | static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, |
1721 | const char *name, size_t size, | 1989 | const char *name, size_t size, |
1722 | size_t align, unsigned long flags, | 1990 | size_t align, unsigned long flags, |
1723 | void (*ctor)(void *, struct kmem_cache *, unsigned long), | 1991 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) |
1724 | void (*dtor)(void *, struct kmem_cache *, unsigned long)) | ||
1725 | { | 1992 | { |
1726 | memset(s, 0, kmem_size); | 1993 | memset(s, 0, kmem_size); |
1727 | s->name = name; | 1994 | s->name = name; |
1728 | s->ctor = ctor; | 1995 | s->ctor = ctor; |
1729 | s->dtor = dtor; | ||
1730 | s->objsize = size; | 1996 | s->objsize = size; |
1731 | s->flags = flags; | 1997 | s->flags = flags; |
1732 | s->align = align; | 1998 | s->align = align; |
1733 | 1999 | kmem_cache_open_debug_check(s); | |
1734 | /* | ||
1735 | * The page->offset field is only 16 bit wide. This is an offset | ||
1736 | * in units of words from the beginning of an object. If the slab | ||
1737 | * size is bigger then we cannot move the free pointer behind the | ||
1738 | * object anymore. | ||
1739 | * | ||
1740 | * On 32 bit platforms the limit is 256k. On 64bit platforms | ||
1741 | * the limit is 512k. | ||
1742 | * | ||
1743 | * Debugging or ctor/dtors may create a need to move the free | ||
1744 | * pointer. Fail if this happens. | ||
1745 | */ | ||
1746 | if (s->size >= 65535 * sizeof(void *)) { | ||
1747 | BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON | | ||
1748 | SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); | ||
1749 | BUG_ON(ctor || dtor); | ||
1750 | } | ||
1751 | else | ||
1752 | /* | ||
1753 | * Enable debugging if selected on the kernel commandline. | ||
1754 | */ | ||
1755 | if (slub_debug && (!slub_debug_slabs || | ||
1756 | strncmp(slub_debug_slabs, name, | ||
1757 | strlen(slub_debug_slabs)) == 0)) | ||
1758 | s->flags |= slub_debug; | ||
1759 | 2000 | ||
1760 | if (!calculate_sizes(s)) | 2001 | if (!calculate_sizes(s)) |
1761 | goto error; | 2002 | goto error; |
@@ -1783,7 +2024,6 @@ EXPORT_SYMBOL(kmem_cache_open); | |||
1783 | int kmem_ptr_validate(struct kmem_cache *s, const void *object) | 2024 | int kmem_ptr_validate(struct kmem_cache *s, const void *object) |
1784 | { | 2025 | { |
1785 | struct page * page; | 2026 | struct page * page; |
1786 | void *addr; | ||
1787 | 2027 | ||
1788 | page = get_object_page(object); | 2028 | page = get_object_page(object); |
1789 | 2029 | ||
@@ -1791,13 +2031,7 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object) | |||
1791 | /* No slab or wrong slab */ | 2031 | /* No slab or wrong slab */ |
1792 | return 0; | 2032 | return 0; |
1793 | 2033 | ||
1794 | addr = page_address(page); | 2034 | if (!check_valid_pointer(s, page, object)) |
1795 | if (object < addr || object >= addr + s->objects * s->size) | ||
1796 | /* Out of bounds */ | ||
1797 | return 0; | ||
1798 | |||
1799 | if ((object - addr) % s->size) | ||
1800 | /* Improperly aligned */ | ||
1801 | return 0; | 2035 | return 0; |
1802 | 2036 | ||
1803 | /* | 2037 | /* |
@@ -1826,7 +2060,8 @@ const char *kmem_cache_name(struct kmem_cache *s) | |||
1826 | EXPORT_SYMBOL(kmem_cache_name); | 2060 | EXPORT_SYMBOL(kmem_cache_name); |
1827 | 2061 | ||
1828 | /* | 2062 | /* |
1829 | * Attempt to free all slabs on a node | 2063 | * Attempt to free all slabs on a node. Return the number of slabs we |
2064 | * were unable to free. | ||
1830 | */ | 2065 | */ |
1831 | static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, | 2066 | static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, |
1832 | struct list_head *list) | 2067 | struct list_head *list) |
@@ -1847,7 +2082,7 @@ static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, | |||
1847 | } | 2082 | } |
1848 | 2083 | ||
1849 | /* | 2084 | /* |
1850 | * Release all resources used by slab cache | 2085 | * Release all resources used by a slab cache. |
1851 | */ | 2086 | */ |
1852 | static int kmem_cache_close(struct kmem_cache *s) | 2087 | static int kmem_cache_close(struct kmem_cache *s) |
1853 | { | 2088 | { |
@@ -1932,45 +2167,6 @@ static int __init setup_slub_nomerge(char *str) | |||
1932 | 2167 | ||
1933 | __setup("slub_nomerge", setup_slub_nomerge); | 2168 | __setup("slub_nomerge", setup_slub_nomerge); |
1934 | 2169 | ||
1935 | static int __init setup_slub_debug(char *str) | ||
1936 | { | ||
1937 | if (!str || *str != '=') | ||
1938 | slub_debug = DEBUG_DEFAULT_FLAGS; | ||
1939 | else { | ||
1940 | str++; | ||
1941 | if (*str == 0 || *str == ',') | ||
1942 | slub_debug = DEBUG_DEFAULT_FLAGS; | ||
1943 | else | ||
1944 | for( ;*str && *str != ','; str++) | ||
1945 | switch (*str) { | ||
1946 | case 'f' : case 'F' : | ||
1947 | slub_debug |= SLAB_DEBUG_FREE; | ||
1948 | break; | ||
1949 | case 'z' : case 'Z' : | ||
1950 | slub_debug |= SLAB_RED_ZONE; | ||
1951 | break; | ||
1952 | case 'p' : case 'P' : | ||
1953 | slub_debug |= SLAB_POISON; | ||
1954 | break; | ||
1955 | case 'u' : case 'U' : | ||
1956 | slub_debug |= SLAB_STORE_USER; | ||
1957 | break; | ||
1958 | case 't' : case 'T' : | ||
1959 | slub_debug |= SLAB_TRACE; | ||
1960 | break; | ||
1961 | default: | ||
1962 | printk(KERN_ERR "slub_debug option '%c' " | ||
1963 | "unknown. skipped\n",*str); | ||
1964 | } | ||
1965 | } | ||
1966 | |||
1967 | if (*str == ',') | ||
1968 | slub_debug_slabs = str + 1; | ||
1969 | return 1; | ||
1970 | } | ||
1971 | |||
1972 | __setup("slub_debug", setup_slub_debug); | ||
1973 | |||
1974 | static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, | 2170 | static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, |
1975 | const char *name, int size, gfp_t gfp_flags) | 2171 | const char *name, int size, gfp_t gfp_flags) |
1976 | { | 2172 | { |
@@ -1981,7 +2177,7 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, | |||
1981 | 2177 | ||
1982 | down_write(&slub_lock); | 2178 | down_write(&slub_lock); |
1983 | if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, | 2179 | if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, |
1984 | flags, NULL, NULL)) | 2180 | flags, NULL)) |
1985 | goto panic; | 2181 | goto panic; |
1986 | 2182 | ||
1987 | list_add(&s->list, &slab_caches); | 2183 | list_add(&s->list, &slab_caches); |
@@ -2108,13 +2304,14 @@ void kfree(const void *x) | |||
2108 | EXPORT_SYMBOL(kfree); | 2304 | EXPORT_SYMBOL(kfree); |
2109 | 2305 | ||
2110 | /* | 2306 | /* |
2111 | * kmem_cache_shrink removes empty slabs from the partial lists | 2307 | * kmem_cache_shrink removes empty slabs from the partial lists and sorts |
2112 | * and then sorts the partially allocated slabs by the number | 2308 | * the remaining slabs by the number of items in use. The slabs with the |
2113 | * of items in use. The slabs with the most items in use | 2309 | * most items in use come first. New allocations will then fill those up |
2114 | * come first. New allocations will remove these from the | 2310 | * and thus they can be removed from the partial lists. |
2115 | * partial list because they are full. The slabs with the | 2311 | * |
2116 | * least items are placed last. If it happens that the objects | 2312 | * The slabs with the least items are placed last. This results in them |
2117 | * are freed then the page can be returned to the page allocator. | 2313 | * being allocated from last increasing the chance that the last objects |
2314 | * are freed in them. | ||
2118 | */ | 2315 | */ |
2119 | int kmem_cache_shrink(struct kmem_cache *s) | 2316 | int kmem_cache_shrink(struct kmem_cache *s) |
2120 | { | 2317 | { |
@@ -2143,12 +2340,10 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
2143 | spin_lock_irqsave(&n->list_lock, flags); | 2340 | spin_lock_irqsave(&n->list_lock, flags); |
2144 | 2341 | ||
2145 | /* | 2342 | /* |
2146 | * Build lists indexed by the items in use in | 2343 | * Build lists indexed by the items in use in each slab. |
2147 | * each slab or free slabs if empty. | ||
2148 | * | 2344 | * |
2149 | * Note that concurrent frees may occur while | 2345 | * Note that concurrent frees may occur while we hold the |
2150 | * we hold the list_lock. page->inuse here is | 2346 | * list_lock. page->inuse here is the upper limit. |
2151 | * the upper limit. | ||
2152 | */ | 2347 | */ |
2153 | list_for_each_entry_safe(page, t, &n->partial, lru) { | 2348 | list_for_each_entry_safe(page, t, &n->partial, lru) { |
2154 | if (!page->inuse && slab_trylock(page)) { | 2349 | if (!page->inuse && slab_trylock(page)) { |
@@ -2172,8 +2367,8 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
2172 | goto out; | 2367 | goto out; |
2173 | 2368 | ||
2174 | /* | 2369 | /* |
2175 | * Rebuild the partial list with the slabs filled up | 2370 | * Rebuild the partial list with the slabs filled up most |
2176 | * most first and the least used slabs at the end. | 2371 | * first and the least used slabs at the end. |
2177 | */ | 2372 | */ |
2178 | for (i = s->objects - 1; i >= 0; i--) | 2373 | for (i = s->objects - 1; i >= 0; i--) |
2179 | list_splice(slabs_by_inuse + i, n->partial.prev); | 2374 | list_splice(slabs_by_inuse + i, n->partial.prev); |
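
The loop above is effectively a counting sort: each partial slab is dropped into a bucket indexed by its inuse count, empty slabs are freed, and the buckets are spliced back so the fullest slabs end up first. A user-space sketch of that pass, using arrays instead of the kernel's list_head machinery and an invented toy_slab type:

/* Illustrative counting sort of partial slabs by inuse count. */
#include <stdio.h>

#define OBJECTS 4        /* objects per slab in this example */
#define NSLABS  5

struct toy_slab { int inuse; };

int main(void)
{
	struct toy_slab slabs[NSLABS] = { {1}, {3}, {0}, {2}, {3} };
	struct toy_slab *bucket[OBJECTS][NSLABS];
	int bucket_len[OBJECTS] = { 0 };

	for (int i = 0; i < NSLABS; i++) {
		int inuse = slabs[i].inuse;

		if (inuse == 0)
			continue;               /* empty: would be discarded */
		bucket[inuse][bucket_len[inuse]++] = &slabs[i];
	}

	/* Rebuild: most-used buckets first, least-used last. */
	printf("new partial list order:");
	for (int inuse = OBJECTS - 1; inuse >= 1; inuse--)
		for (int j = 0; j < bucket_len[inuse]; j++)
			printf(" inuse=%d", bucket[inuse][j]->inuse);
	printf("\n");
	return 0;
}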
@@ -2189,7 +2384,6 @@ EXPORT_SYMBOL(kmem_cache_shrink); | |||
2189 | 2384 | ||
2190 | /** | 2385 | /** |
2191 | * krealloc - reallocate memory. The contents will remain unchanged. | 2386 | * krealloc - reallocate memory. The contents will remain unchanged. |
2192 | * | ||
2193 | * @p: object to reallocate memory for. | 2387 | * @p: object to reallocate memory for. |
2194 | * @new_size: how many bytes of memory are required. | 2388 | * @new_size: how many bytes of memory are required. |
2195 | * @flags: the type of memory to allocate. | 2389 | * @flags: the type of memory to allocate. |
@@ -2201,9 +2395,8 @@ EXPORT_SYMBOL(kmem_cache_shrink); | |||
2201 | */ | 2395 | */ |
2202 | void *krealloc(const void *p, size_t new_size, gfp_t flags) | 2396 | void *krealloc(const void *p, size_t new_size, gfp_t flags) |
2203 | { | 2397 | { |
2204 | struct kmem_cache *new_cache; | ||
2205 | void *ret; | 2398 | void *ret; |
2206 | struct page *page; | 2399 | size_t ks; |
2207 | 2400 | ||
2208 | if (unlikely(!p)) | 2401 | if (unlikely(!p)) |
2209 | return kmalloc(new_size, flags); | 2402 | return kmalloc(new_size, flags); |
@@ -2213,19 +2406,13 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) | |||
2213 | return NULL; | 2406 | return NULL; |
2214 | } | 2407 | } |
2215 | 2408 | ||
2216 | page = virt_to_head_page(p); | 2409 | ks = ksize(p); |
2217 | 2410 | if (ks >= new_size) | |
2218 | new_cache = get_slab(new_size, flags); | ||
2219 | |||
2220 | /* | ||
2221 | * If new size fits in the current cache, bail out. | ||
2222 | */ | ||
2223 | if (likely(page->slab == new_cache)) | ||
2224 | return (void *)p; | 2411 | return (void *)p; |
2225 | 2412 | ||
2226 | ret = kmalloc(new_size, flags); | 2413 | ret = kmalloc(new_size, flags); |
2227 | if (ret) { | 2414 | if (ret) { |
2228 | memcpy(ret, p, min(new_size, ksize(p))); | 2415 | memcpy(ret, p, min(new_size, ks)); |
2229 | kfree(p); | 2416 | kfree(p); |
2230 | } | 2417 | } |
2231 | return ret; | 2418 | return ret; |
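
The reworked krealloc() above no longer compares kmalloc caches; it asks ksize() for the usable size of the existing allocation and only copies when the object genuinely has to grow. Here is a user-space sketch of the same behaviour, modelling ksize() with a size header and rounding sizes up to mimic kmalloc size classes; all toy_ names are invented.

/* Illustrative krealloc analogue built on a size-header allocator. */
#include <stdlib.h>
#include <string.h>
#include <stdio.h>

struct toy_hdr { size_t usable; };        /* stands in for slab metadata */

static void *toy_kmalloc(size_t size)
{
	size_t usable = (size + 31) & ~(size_t)31;  /* fake size classes */
	struct toy_hdr *h = malloc(sizeof(*h) + usable);

	if (!h)
		return NULL;
	h->usable = usable;
	return h + 1;
}

static size_t toy_ksize(const void *p)
{
	return ((const struct toy_hdr *)p - 1)->usable;
}

static void toy_kfree(void *p)
{
	if (p)
		free((struct toy_hdr *)p - 1);
}

static void *toy_krealloc(void *p, size_t new_size)
{
	void *ret;

	if (!p)
		return toy_kmalloc(new_size);
	if (toy_ksize(p) >= new_size)
		return p;                     /* still fits: no copy needed */
	ret = toy_kmalloc(new_size);
	if (ret) {
		memcpy(ret, p, toy_ksize(p));
		toy_kfree(p);
	}
	return ret;
}

int main(void)
{
	char *p = toy_kmalloc(20);            /* usable size rounds to 32 */

	printf("grow to 30: same block? %d\n", p == toy_krealloc(p, 30));
	return 0;
}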
@@ -2243,11 +2430,12 @@ void __init kmem_cache_init(void) | |||
2243 | #ifdef CONFIG_NUMA | 2430 | #ifdef CONFIG_NUMA |
2244 | /* | 2431 | /* |
2245 | * Must first have the slab cache available for the allocations of the | 2432 | * Must first have the slab cache available for the allocations of the |
2246 | * struct kmalloc_cache_node's. There is special bootstrap code in | 2433 | * struct kmem_cache_node's. There is special bootstrap code in |
2247 | * kmem_cache_open for slab_state == DOWN. | 2434 | * kmem_cache_open for slab_state == DOWN. |
2248 | */ | 2435 | */ |
2249 | create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", | 2436 | create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", |
2250 | sizeof(struct kmem_cache_node), GFP_KERNEL); | 2437 | sizeof(struct kmem_cache_node), GFP_KERNEL); |
2438 | kmalloc_caches[0].refcount = -1; | ||
2251 | #endif | 2439 | #endif |
2252 | 2440 | ||
2253 | /* Able to allocate the per node structures */ | 2441 | /* Able to allocate the per node structures */ |
@@ -2274,13 +2462,12 @@ void __init kmem_cache_init(void) | |||
2274 | register_cpu_notifier(&slab_notifier); | 2462 | register_cpu_notifier(&slab_notifier); |
2275 | #endif | 2463 | #endif |
2276 | 2464 | ||
2277 | if (nr_cpu_ids) /* Remove when nr_cpu_ids is fixed upstream ! */ | 2465 | kmem_size = offsetof(struct kmem_cache, cpu_slab) + |
2278 | kmem_size = offsetof(struct kmem_cache, cpu_slab) | 2466 | nr_cpu_ids * sizeof(struct page *); |
2279 | + nr_cpu_ids * sizeof(struct page *); | ||
2280 | 2467 | ||
2281 | printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," | 2468 | printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," |
2282 | " Processors=%d, Nodes=%d\n", | 2469 | " Processors=%d, Nodes=%d\n", |
2283 | KMALLOC_SHIFT_HIGH, L1_CACHE_BYTES, | 2470 | KMALLOC_SHIFT_HIGH, cache_line_size(), |
2284 | slub_min_order, slub_max_order, slub_min_objects, | 2471 | slub_min_order, slub_max_order, slub_min_objects, |
2285 | nr_cpu_ids, nr_node_ids); | 2472 | nr_cpu_ids, nr_node_ids); |
2286 | } | 2473 | } |
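
The kmem_size computation above trims struct kmem_cache so that only nr_cpu_ids entries of the trailing per-cpu slab array are actually allocated, instead of the full array sized for the maximum possible number of cpus. The same offsetof() trick in a small user-space example, with an invented toy_cache type standing in for struct kmem_cache:

/* Illustrative: size a struct up to the used portion of a trailing array. */
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct toy_cache {
	const char *name;
	int objsize;
	void *cpu_slab[64];     /* declared for the maximum number of cpus */
};

int main(void)
{
	int nr_cpu_ids = 4;     /* pretend only 4 cpus are possible */
	size_t kmem_size = offsetof(struct toy_cache, cpu_slab) +
			   nr_cpu_ids * sizeof(void *);
	struct toy_cache *s = calloc(1, kmem_size);

	printf("full struct: %zu bytes, trimmed: %zu bytes\n",
	       sizeof(struct toy_cache), kmem_size);
	free(s);
	return 0;
}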
@@ -2293,7 +2480,13 @@ static int slab_unmergeable(struct kmem_cache *s) | |||
2293 | if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) | 2480 | if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) |
2294 | return 1; | 2481 | return 1; |
2295 | 2482 | ||
2296 | if (s->ctor || s->dtor) | 2483 | if (s->ctor) |
2484 | return 1; | ||
2485 | |||
2486 | /* | ||
2487 | * We may have set a slab to be unmergeable during bootstrap. | ||
2488 | */ | ||
2489 | if (s->refcount < 0) | ||
2297 | return 1; | 2490 | return 1; |
2298 | 2491 | ||
2299 | return 0; | 2492 | return 0; |
@@ -2301,15 +2494,14 @@ static int slab_unmergeable(struct kmem_cache *s) | |||
2301 | 2494 | ||
2302 | static struct kmem_cache *find_mergeable(size_t size, | 2495 | static struct kmem_cache *find_mergeable(size_t size, |
2303 | size_t align, unsigned long flags, | 2496 | size_t align, unsigned long flags, |
2304 | void (*ctor)(void *, struct kmem_cache *, unsigned long), | 2497 | void (*ctor)(void *, struct kmem_cache *, unsigned long)) |
2305 | void (*dtor)(void *, struct kmem_cache *, unsigned long)) | ||
2306 | { | 2498 | { |
2307 | struct list_head *h; | 2499 | struct list_head *h; |
2308 | 2500 | ||
2309 | if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) | 2501 | if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) |
2310 | return NULL; | 2502 | return NULL; |
2311 | 2503 | ||
2312 | if (ctor || dtor) | 2504 | if (ctor) |
2313 | return NULL; | 2505 | return NULL; |
2314 | 2506 | ||
2315 | size = ALIGN(size, sizeof(void *)); | 2507 | size = ALIGN(size, sizeof(void *)); |
@@ -2351,8 +2543,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
2351 | { | 2543 | { |
2352 | struct kmem_cache *s; | 2544 | struct kmem_cache *s; |
2353 | 2545 | ||
2546 | BUG_ON(dtor); | ||
2354 | down_write(&slub_lock); | 2547 | down_write(&slub_lock); |
2355 | s = find_mergeable(size, align, flags, dtor, ctor); | 2548 | s = find_mergeable(size, align, flags, ctor); |
2356 | if (s) { | 2549 | if (s) { |
2357 | s->refcount++; | 2550 | s->refcount++; |
2358 | /* | 2551 | /* |
@@ -2366,7 +2559,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, | |||
2366 | } else { | 2559 | } else { |
2367 | s = kmalloc(kmem_size, GFP_KERNEL); | 2560 | s = kmalloc(kmem_size, GFP_KERNEL); |
2368 | if (s && kmem_cache_open(s, GFP_KERNEL, name, | 2561 | if (s && kmem_cache_open(s, GFP_KERNEL, name, |
2369 | size, align, flags, ctor, dtor)) { | 2562 | size, align, flags, ctor)) { |
2370 | if (sysfs_slab_add(s)) { | 2563 | if (sysfs_slab_add(s)) { |
2371 | kfree(s); | 2564 | kfree(s); |
2372 | goto err; | 2565 | goto err; |
@@ -2415,8 +2608,21 @@ static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu) | |||
2415 | } | 2608 | } |
2416 | 2609 | ||
2417 | /* | 2610 | /* |
2418 | * Use the cpu notifier to insure that the slab are flushed | 2611 | * Version of __flush_cpu_slab for the case that interrupts |
2419 | * when necessary. | 2612 | * are enabled. |
2613 | */ | ||
2614 | static void cpu_slab_flush(struct kmem_cache *s, int cpu) | ||
2615 | { | ||
2616 | unsigned long flags; | ||
2617 | |||
2618 | local_irq_save(flags); | ||
2619 | __flush_cpu_slab(s, cpu); | ||
2620 | local_irq_restore(flags); | ||
2621 | } | ||
2622 | |||
2623 | /* | ||
2624 | * Use the cpu notifier to ensure that the cpu slabs are flushed when | ||
2625 | * necessary. | ||
2420 | */ | 2626 | */ |
2421 | static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | 2627 | static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, |
2422 | unsigned long action, void *hcpu) | 2628 | unsigned long action, void *hcpu) |
@@ -2425,8 +2631,10 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | |||
2425 | 2631 | ||
2426 | switch (action) { | 2632 | switch (action) { |
2427 | case CPU_UP_CANCELED: | 2633 | case CPU_UP_CANCELED: |
2634 | case CPU_UP_CANCELED_FROZEN: | ||
2428 | case CPU_DEAD: | 2635 | case CPU_DEAD: |
2429 | for_all_slabs(__flush_cpu_slab, cpu); | 2636 | case CPU_DEAD_FROZEN: |
2637 | for_all_slabs(cpu_slab_flush, cpu); | ||
2430 | break; | 2638 | break; |
2431 | default: | 2639 | default: |
2432 | break; | 2640 | break; |
@@ -2439,153 +2647,6 @@ static struct notifier_block __cpuinitdata slab_notifier = | |||
2439 | 2647 | ||
2440 | #endif | 2648 | #endif |
2441 | 2649 | ||
2442 | #ifdef CONFIG_NUMA | ||
2443 | |||
2444 | /***************************************************************** | ||
2445 | * Generic reaper used to support the page allocator | ||
2446 | * (the cpu slabs are reaped by a per slab workqueue). | ||
2447 | * | ||
2448 | * Maybe move this to the page allocator? | ||
2449 | ****************************************************************/ | ||
2450 | |||
2451 | static DEFINE_PER_CPU(unsigned long, reap_node); | ||
2452 | |||
2453 | static void init_reap_node(int cpu) | ||
2454 | { | ||
2455 | int node; | ||
2456 | |||
2457 | node = next_node(cpu_to_node(cpu), node_online_map); | ||
2458 | if (node == MAX_NUMNODES) | ||
2459 | node = first_node(node_online_map); | ||
2460 | |||
2461 | __get_cpu_var(reap_node) = node; | ||
2462 | } | ||
2463 | |||
2464 | static void next_reap_node(void) | ||
2465 | { | ||
2466 | int node = __get_cpu_var(reap_node); | ||
2467 | |||
2468 | /* | ||
2469 | * Also drain per cpu pages on remote zones | ||
2470 | */ | ||
2471 | if (node != numa_node_id()) | ||
2472 | drain_node_pages(node); | ||
2473 | |||
2474 | node = next_node(node, node_online_map); | ||
2475 | if (unlikely(node >= MAX_NUMNODES)) | ||
2476 | node = first_node(node_online_map); | ||
2477 | __get_cpu_var(reap_node) = node; | ||
2478 | } | ||
2479 | #else | ||
2480 | #define init_reap_node(cpu) do { } while (0) | ||
2481 | #define next_reap_node(void) do { } while (0) | ||
2482 | #endif | ||
2483 | |||
2484 | #define REAPTIMEOUT_CPUC (2*HZ) | ||
2485 | |||
2486 | #ifdef CONFIG_SMP | ||
2487 | static DEFINE_PER_CPU(struct delayed_work, reap_work); | ||
2488 | |||
2489 | static void cache_reap(struct work_struct *unused) | ||
2490 | { | ||
2491 | next_reap_node(); | ||
2492 | refresh_cpu_vm_stats(smp_processor_id()); | ||
2493 | schedule_delayed_work(&__get_cpu_var(reap_work), | ||
2494 | REAPTIMEOUT_CPUC); | ||
2495 | } | ||
2496 | |||
2497 | static void __devinit start_cpu_timer(int cpu) | ||
2498 | { | ||
2499 | struct delayed_work *reap_work = &per_cpu(reap_work, cpu); | ||
2500 | |||
2501 | /* | ||
2502 | * When this gets called from do_initcalls via cpucache_init(), | ||
2503 | * init_workqueues() has already run, so keventd will be setup | ||
2504 | * at that time. | ||
2505 | */ | ||
2506 | if (keventd_up() && reap_work->work.func == NULL) { | ||
2507 | init_reap_node(cpu); | ||
2508 | INIT_DELAYED_WORK(reap_work, cache_reap); | ||
2509 | schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); | ||
2510 | } | ||
2511 | } | ||
2512 | |||
2513 | static int __init cpucache_init(void) | ||
2514 | { | ||
2515 | int cpu; | ||
2516 | |||
2517 | /* | ||
2518 | * Register the timers that drain pcp pages and update vm statistics | ||
2519 | */ | ||
2520 | for_each_online_cpu(cpu) | ||
2521 | start_cpu_timer(cpu); | ||
2522 | return 0; | ||
2523 | } | ||
2524 | __initcall(cpucache_init); | ||
2525 | #endif | ||
2526 | |||
2527 | #ifdef SLUB_RESILIENCY_TEST | ||
2528 | static unsigned long validate_slab_cache(struct kmem_cache *s); | ||
2529 | |||
2530 | static void resiliency_test(void) | ||
2531 | { | ||
2532 | u8 *p; | ||
2533 | |||
2534 | printk(KERN_ERR "SLUB resiliency testing\n"); | ||
2535 | printk(KERN_ERR "-----------------------\n"); | ||
2536 | printk(KERN_ERR "A. Corruption after allocation\n"); | ||
2537 | |||
2538 | p = kzalloc(16, GFP_KERNEL); | ||
2539 | p[16] = 0x12; | ||
2540 | printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" | ||
2541 | " 0x12->0x%p\n\n", p + 16); | ||
2542 | |||
2543 | validate_slab_cache(kmalloc_caches + 4); | ||
2544 | |||
2545 | /* Hmmm... The next two are dangerous */ | ||
2546 | p = kzalloc(32, GFP_KERNEL); | ||
2547 | p[32 + sizeof(void *)] = 0x34; | ||
2548 | printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" | ||
2549 | " 0x34 -> -0x%p\n", p); | ||
2550 | printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); | ||
2551 | |||
2552 | validate_slab_cache(kmalloc_caches + 5); | ||
2553 | p = kzalloc(64, GFP_KERNEL); | ||
2554 | p += 64 + (get_cycles() & 0xff) * sizeof(void *); | ||
2555 | *p = 0x56; | ||
2556 | printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", | ||
2557 | p); | ||
2558 | printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); | ||
2559 | validate_slab_cache(kmalloc_caches + 6); | ||
2560 | |||
2561 | printk(KERN_ERR "\nB. Corruption after free\n"); | ||
2562 | p = kzalloc(128, GFP_KERNEL); | ||
2563 | kfree(p); | ||
2564 | *p = 0x78; | ||
2565 | printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); | ||
2566 | validate_slab_cache(kmalloc_caches + 7); | ||
2567 | |||
2568 | p = kzalloc(256, GFP_KERNEL); | ||
2569 | kfree(p); | ||
2570 | p[50] = 0x9a; | ||
2571 | printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); | ||
2572 | validate_slab_cache(kmalloc_caches + 8); | ||
2573 | |||
2574 | p = kzalloc(512, GFP_KERNEL); | ||
2575 | kfree(p); | ||
2576 | p[512] = 0xab; | ||
2577 | printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); | ||
2578 | validate_slab_cache(kmalloc_caches + 9); | ||
2579 | } | ||
2580 | #else | ||
2581 | static void resiliency_test(void) {}; | ||
2582 | #endif | ||
2583 | |||
2584 | /* | ||
2585 | * These are not as efficient as kmalloc for the non debug case. | ||
2586 | * We do not have the page struct available so we have to touch one | ||
2587 | * cacheline in struct kmem_cache to check slab flags. | ||
2588 | */ | ||
2589 | void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) | 2650 | void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) |
2590 | { | 2651 | { |
2591 | struct kmem_cache *s = get_slab(size, gfpflags); | 2652 | struct kmem_cache *s = get_slab(size, gfpflags); |
@@ -2607,13 +2668,12 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, | |||
2607 | return slab_alloc(s, gfpflags, node, caller); | 2668 | return slab_alloc(s, gfpflags, node, caller); |
2608 | } | 2669 | } |
2609 | 2670 | ||
2610 | #ifdef CONFIG_SYSFS | 2671 | #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) |
2611 | |||
2612 | static int validate_slab(struct kmem_cache *s, struct page *page) | 2672 | static int validate_slab(struct kmem_cache *s, struct page *page) |
2613 | { | 2673 | { |
2614 | void *p; | 2674 | void *p; |
2615 | void *addr = page_address(page); | 2675 | void *addr = page_address(page); |
2616 | unsigned long map[BITS_TO_LONGS(s->objects)]; | 2676 | DECLARE_BITMAP(map, s->objects); |
2617 | 2677 | ||
2618 | if (!check_slab(s, page) || | 2678 | if (!check_slab(s, page) || |
2619 | !on_freelist(s, page, NULL)) | 2679 | !on_freelist(s, page, NULL)) |
@@ -2622,14 +2682,14 @@ static int validate_slab(struct kmem_cache *s, struct page *page) | |||
2622 | /* Now we know that a valid freelist exists */ | 2682 | /* Now we know that a valid freelist exists */ |
2623 | bitmap_zero(map, s->objects); | 2683 | bitmap_zero(map, s->objects); |
2624 | 2684 | ||
2625 | for(p = page->freelist; p; p = get_freepointer(s, p)) { | 2685 | for_each_free_object(p, s, page->freelist) { |
2626 | set_bit((p - addr) / s->size, map); | 2686 | set_bit(slab_index(p, s, addr), map); |
2627 | if (!check_object(s, page, p, 0)) | 2687 | if (!check_object(s, page, p, 0)) |
2628 | return 0; | 2688 | return 0; |
2629 | } | 2689 | } |
2630 | 2690 | ||
2631 | for(p = addr; p < addr + s->objects * s->size; p += s->size) | 2691 | for_each_object(p, s, addr) |
2632 | if (!test_bit((p - addr) / s->size, map)) | 2692 | if (!test_bit(slab_index(p, s, addr), map)) |
2633 | if (!check_object(s, page, p, 1)) | 2693 | if (!check_object(s, page, p, 1)) |
2634 | return 0; | 2694 | return 0; |
2635 | return 1; | 2695 | return 1; |
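
validate_slab() above builds a bitmap of every object reachable from the freelist and then checks the marked objects as free and the unmarked ones as allocated. A compact model of that pass, using indices and a plain bool array instead of DECLARE_BITMAP() and pointer arithmetic:

/* Illustrative: mark free objects, then classify the rest as allocated. */
#include <stdio.h>
#include <stdbool.h>

#define OBJECTS 8

int main(void)
{
	/* next_free[i] = index of the next free object, -1 terminates. */
	int next_free[OBJECTS] = { 3, -1, -1, 5, -1, -1, -1, -1 };
	int freelist = 0;                 /* objects 0 -> 3 -> 5 are free */
	bool is_free[OBJECTS] = { false };

	for (int i = freelist; i != -1; i = next_free[i])
		is_free[i] = true;        /* ~set_bit(slab_index(...), map) */

	for (int i = 0; i < OBJECTS; i++)
		printf("object %d: %s\n", i,
		       is_free[i] ? "free (checked as free)"
				  : "allocated (checked as allocated)");
	return 0;
}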
@@ -2645,12 +2705,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page) | |||
2645 | s->name, page); | 2705 | s->name, page); |
2646 | 2706 | ||
2647 | if (s->flags & DEBUG_DEFAULT_FLAGS) { | 2707 | if (s->flags & DEBUG_DEFAULT_FLAGS) { |
2648 | if (!PageError(page)) | 2708 | if (!SlabDebug(page)) |
2649 | printk(KERN_ERR "SLUB %s: PageError not set " | 2709 | printk(KERN_ERR "SLUB %s: SlabDebug not set " |
2650 | "on slab 0x%p\n", s->name, page); | 2710 | "on slab 0x%p\n", s->name, page); |
2651 | } else { | 2711 | } else { |
2652 | if (PageError(page)) | 2712 | if (SlabDebug(page)) |
2653 | printk(KERN_ERR "SLUB %s: PageError set on " | 2713 | printk(KERN_ERR "SLUB %s: SlabDebug set on " |
2654 | "slab 0x%p\n", s->name, page); | 2714 | "slab 0x%p\n", s->name, page); |
2655 | } | 2715 | } |
2656 | } | 2716 | } |
@@ -2702,14 +2762,76 @@ static unsigned long validate_slab_cache(struct kmem_cache *s) | |||
2702 | return count; | 2762 | return count; |
2703 | } | 2763 | } |
2704 | 2764 | ||
2765 | #ifdef SLUB_RESILIENCY_TEST | ||
2766 | static void resiliency_test(void) | ||
2767 | { | ||
2768 | u8 *p; | ||
2769 | |||
2770 | printk(KERN_ERR "SLUB resiliency testing\n"); | ||
2771 | printk(KERN_ERR "-----------------------\n"); | ||
2772 | printk(KERN_ERR "A. Corruption after allocation\n"); | ||
2773 | |||
2774 | p = kzalloc(16, GFP_KERNEL); | ||
2775 | p[16] = 0x12; | ||
2776 | printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" | ||
2777 | " 0x12->0x%p\n\n", p + 16); | ||
2778 | |||
2779 | validate_slab_cache(kmalloc_caches + 4); | ||
2780 | |||
2781 | /* Hmmm... The next two are dangerous */ | ||
2782 | p = kzalloc(32, GFP_KERNEL); | ||
2783 | p[32 + sizeof(void *)] = 0x34; | ||
2784 | printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" | ||
2785 | " 0x34 -> -0x%p\n", p); | ||
2786 | printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); | ||
2787 | |||
2788 | validate_slab_cache(kmalloc_caches + 5); | ||
2789 | p = kzalloc(64, GFP_KERNEL); | ||
2790 | p += 64 + (get_cycles() & 0xff) * sizeof(void *); | ||
2791 | *p = 0x56; | ||
2792 | printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", | ||
2793 | p); | ||
2794 | printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); | ||
2795 | validate_slab_cache(kmalloc_caches + 6); | ||
2796 | |||
2797 | printk(KERN_ERR "\nB. Corruption after free\n"); | ||
2798 | p = kzalloc(128, GFP_KERNEL); | ||
2799 | kfree(p); | ||
2800 | *p = 0x78; | ||
2801 | printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); | ||
2802 | validate_slab_cache(kmalloc_caches + 7); | ||
2803 | |||
2804 | p = kzalloc(256, GFP_KERNEL); | ||
2805 | kfree(p); | ||
2806 | p[50] = 0x9a; | ||
2807 | printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); | ||
2808 | validate_slab_cache(kmalloc_caches + 8); | ||
2809 | |||
2810 | p = kzalloc(512, GFP_KERNEL); | ||
2811 | kfree(p); | ||
2812 | p[512] = 0xab; | ||
2813 | printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); | ||
2814 | validate_slab_cache(kmalloc_caches + 9); | ||
2815 | } | ||
2816 | #else | ||
2817 | static void resiliency_test(void) {}; | ||
2818 | #endif | ||
2819 | |||
2705 | /* | 2820 | /* |
2706 | * Generate lists of locations where slabcache objects are allocated | 2821 | * Generate lists of code addresses where slabcache objects are allocated |
2707 | * and freed. | 2822 | * and freed. |
2708 | */ | 2823 | */ |
2709 | 2824 | ||
2710 | struct location { | 2825 | struct location { |
2711 | unsigned long count; | 2826 | unsigned long count; |
2712 | void *addr; | 2827 | void *addr; |
2828 | long long sum_time; | ||
2829 | long min_time; | ||
2830 | long max_time; | ||
2831 | long min_pid; | ||
2832 | long max_pid; | ||
2833 | cpumask_t cpus; | ||
2834 | nodemask_t nodes; | ||
2713 | }; | 2835 | }; |
2714 | 2836 | ||
2715 | struct loc_track { | 2837 | struct loc_track { |
@@ -2750,11 +2872,12 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max) | |||
2750 | } | 2872 | } |
2751 | 2873 | ||
2752 | static int add_location(struct loc_track *t, struct kmem_cache *s, | 2874 | static int add_location(struct loc_track *t, struct kmem_cache *s, |
2753 | void *addr) | 2875 | const struct track *track) |
2754 | { | 2876 | { |
2755 | long start, end, pos; | 2877 | long start, end, pos; |
2756 | struct location *l; | 2878 | struct location *l; |
2757 | void *caddr; | 2879 | void *caddr; |
2880 | unsigned long age = jiffies - track->when; | ||
2758 | 2881 | ||
2759 | start = -1; | 2882 | start = -1; |
2760 | end = t->count; | 2883 | end = t->count; |
@@ -2770,19 +2893,36 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, | |||
2770 | break; | 2893 | break; |
2771 | 2894 | ||
2772 | caddr = t->loc[pos].addr; | 2895 | caddr = t->loc[pos].addr; |
2773 | if (addr == caddr) { | 2896 | if (track->addr == caddr) { |
2774 | t->loc[pos].count++; | 2897 | |
2898 | l = &t->loc[pos]; | ||
2899 | l->count++; | ||
2900 | if (track->when) { | ||
2901 | l->sum_time += age; | ||
2902 | if (age < l->min_time) | ||
2903 | l->min_time = age; | ||
2904 | if (age > l->max_time) | ||
2905 | l->max_time = age; | ||
2906 | |||
2907 | if (track->pid < l->min_pid) | ||
2908 | l->min_pid = track->pid; | ||
2909 | if (track->pid > l->max_pid) | ||
2910 | l->max_pid = track->pid; | ||
2911 | |||
2912 | cpu_set(track->cpu, l->cpus); | ||
2913 | } | ||
2914 | node_set(page_to_nid(virt_to_page(track)), l->nodes); | ||
2775 | return 1; | 2915 | return 1; |
2776 | } | 2916 | } |
2777 | 2917 | ||
2778 | if (addr < caddr) | 2918 | if (track->addr < caddr) |
2779 | end = pos; | 2919 | end = pos; |
2780 | else | 2920 | else |
2781 | start = pos; | 2921 | start = pos; |
2782 | } | 2922 | } |
2783 | 2923 | ||
2784 | /* | 2924 | /* |
2785 | * Not found. Insert new tracking element | 2925 | * Not found. Insert new tracking element. |
2786 | */ | 2926 | */ |
2787 | if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max)) | 2927 | if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max)) |
2788 | return 0; | 2928 | return 0; |
@@ -2793,7 +2933,16 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, | |||
2793 | (t->count - pos) * sizeof(struct location)); | 2933 | (t->count - pos) * sizeof(struct location)); |
2794 | t->count++; | 2934 | t->count++; |
2795 | l->count = 1; | 2935 | l->count = 1; |
2796 | l->addr = addr; | 2936 | l->addr = track->addr; |
2937 | l->sum_time = age; | ||
2938 | l->min_time = age; | ||
2939 | l->max_time = age; | ||
2940 | l->min_pid = track->pid; | ||
2941 | l->max_pid = track->pid; | ||
2942 | cpus_clear(l->cpus); | ||
2943 | cpu_set(track->cpu, l->cpus); | ||
2944 | nodes_clear(l->nodes); | ||
2945 | node_set(page_to_nid(virt_to_page(track)), l->nodes); | ||
2797 | return 1; | 2946 | return 1; |
2798 | } | 2947 | } |
2799 | 2948 | ||
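add_location() keeps t->loc[] sorted by call-site address: a binary search locates the slot, a hit merely folds the new count and age/pid/cpu data into the existing entry, and a miss grows the array if necessary and splices a new entry in with memmove(). A stripped-down, user-space version of that search-or-insert pattern (statistics omitted, simplified types) looks roughly like this:

#include <stdio.h>
#include <string.h>

struct loc {
        unsigned long count;
        void *addr;
};

struct loc_track {
        long count;                     /* entries in use    */
        long max;                       /* capacity of loc[] */
        struct loc *loc;
};

/* Record one event for 'addr'; returns 0 when the table is full. */
static int add_location(struct loc_track *t, void *addr)
{
        long start = -1, end = t->count, pos;

        for (;;) {
                pos = start + (end - start + 1) / 2;
                if (pos == end)         /* nothing at 'end': insert before it */
                        break;
                if (addr == t->loc[pos].addr) {
                        t->loc[pos].count++;
                        return 1;       /* call site already tracked */
                }
                if (addr < t->loc[pos].addr)
                        end = pos;
                else
                        start = pos;
        }

        if (t->count >= t->max)
                return 0;               /* the kernel grows the array here */

        /* Open a hole at 'pos' and drop the new entry in. */
        memmove(t->loc + pos + 1, t->loc + pos,
                (t->count - pos) * sizeof(struct loc));
        t->count++;
        t->loc[pos].count = 1;
        t->loc[pos].addr = addr;
        return 1;
}

int main(void)
{
        struct loc slots[8] = { { 0, NULL } };
        struct loc_track t = { 0, 8, slots };
        int x, y;

        add_location(&t, &x);
        add_location(&t, &y);
        add_location(&t, &x);           /* second hit on the same address */
        printf("%ld distinct call sites\n", t.count);
        return 0;
}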
@@ -2801,19 +2950,16 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, | |||
2801 | struct page *page, enum track_item alloc) | 2950 | struct page *page, enum track_item alloc) |
2802 | { | 2951 | { |
2803 | void *addr = page_address(page); | 2952 | void *addr = page_address(page); |
2804 | unsigned long map[BITS_TO_LONGS(s->objects)]; | 2953 | DECLARE_BITMAP(map, s->objects); |
2805 | void *p; | 2954 | void *p; |
2806 | 2955 | ||
2807 | bitmap_zero(map, s->objects); | 2956 | bitmap_zero(map, s->objects); |
2808 | for (p = page->freelist; p; p = get_freepointer(s, p)) | 2957 | for_each_free_object(p, s, page->freelist) |
2809 | set_bit((p - addr) / s->size, map); | 2958 | set_bit(slab_index(p, s, addr), map); |
2810 | |||
2811 | for (p = addr; p < addr + s->objects * s->size; p += s->size) | ||
2812 | if (!test_bit((p - addr) / s->size, map)) { | ||
2813 | void *addr = get_track(s, p, alloc)->addr; | ||
2814 | 2959 | ||
2815 | add_location(t, s, addr); | 2960 | for_each_object(p, s, addr) |
2816 | } | 2961 | if (!test_bit(slab_index(p, s, addr), map)) |
2962 | add_location(t, s, get_track(s, p, alloc)); | ||
2817 | } | 2963 | } |
2818 | 2964 | ||
2819 | static int list_locations(struct kmem_cache *s, char *buf, | 2965 | static int list_locations(struct kmem_cache *s, char *buf, |
@@ -2847,15 +2993,47 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
2847 | } | 2993 | } |
2848 | 2994 | ||
2849 | for (i = 0; i < t.count; i++) { | 2995 | for (i = 0; i < t.count; i++) { |
2850 | void *addr = t.loc[i].addr; | 2996 | struct location *l = &t.loc[i]; |
2851 | 2997 | ||
2852 | if (n > PAGE_SIZE - 100) | 2998 | if (n > PAGE_SIZE - 100) |
2853 | break; | 2999 | break; |
2854 | n += sprintf(buf + n, "%7ld ", t.loc[i].count); | 3000 | n += sprintf(buf + n, "%7ld ", l->count); |
2855 | if (addr) | 3001 | |
2856 | n += sprint_symbol(buf + n, (unsigned long)t.loc[i].addr); | 3002 | if (l->addr) |
3003 | n += sprint_symbol(buf + n, (unsigned long)l->addr); | ||
2857 | else | 3004 | else |
2858 | n += sprintf(buf + n, "<not-available>"); | 3005 | n += sprintf(buf + n, "<not-available>"); |
3006 | |||
3007 | if (l->sum_time != l->min_time) { | ||
3008 | unsigned long remainder; | ||
3009 | |||
3010 | n += sprintf(buf + n, " age=%ld/%ld/%ld", | ||
3011 | l->min_time, | ||
3012 | div_long_long_rem(l->sum_time, l->count, &remainder), | ||
3013 | l->max_time); | ||
3014 | } else | ||
3015 | n += sprintf(buf + n, " age=%ld", | ||
3016 | l->min_time); | ||
3017 | |||
3018 | if (l->min_pid != l->max_pid) | ||
3019 | n += sprintf(buf + n, " pid=%ld-%ld", | ||
3020 | l->min_pid, l->max_pid); | ||
3021 | else | ||
3022 | n += sprintf(buf + n, " pid=%ld", | ||
3023 | l->min_pid); | ||
3024 | |||
3025 | if (num_online_cpus() > 1 && !cpus_empty(l->cpus)) { | ||
3026 | n += sprintf(buf + n, " cpus="); | ||
3027 | n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50, | ||
3028 | l->cpus); | ||
3029 | } | ||
3030 | |||
3031 | if (num_online_nodes() > 1 && !nodes_empty(l->nodes)) { | ||
3032 | n += sprintf(buf + n, " nodes="); | ||
3033 | n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50, | ||
3034 | l->nodes); | ||
3035 | } | ||
3036 | |||
2859 | n += sprintf(buf + n, "\n"); | 3037 | n += sprintf(buf + n, "\n"); |
2860 | } | 3038 | } |
2861 | 3039 | ||
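After this change every line produced by list_locations() carries the hit count, the resolved caller symbol, the object age as min/average/max jiffies, the pid range and, where it adds information, the CPUs and NUMA nodes seen. A purely hypothetical alloc_calls line, with every value invented for illustration, would read:

   4297 do_sys_open+0x5b/0x120 age=8/672/4921 pid=1-1523 cpus=0-3 nodes=0-1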
@@ -3035,17 +3213,6 @@ static ssize_t ctor_show(struct kmem_cache *s, char *buf) | |||
3035 | } | 3213 | } |
3036 | SLAB_ATTR_RO(ctor); | 3214 | SLAB_ATTR_RO(ctor); |
3037 | 3215 | ||
3038 | static ssize_t dtor_show(struct kmem_cache *s, char *buf) | ||
3039 | { | ||
3040 | if (s->dtor) { | ||
3041 | int n = sprint_symbol(buf, (unsigned long)s->dtor); | ||
3042 | |||
3043 | return n + sprintf(buf + n, "\n"); | ||
3044 | } | ||
3045 | return 0; | ||
3046 | } | ||
3047 | SLAB_ATTR_RO(dtor); | ||
3048 | |||
3049 | static ssize_t aliases_show(struct kmem_cache *s, char *buf) | 3216 | static ssize_t aliases_show(struct kmem_cache *s, char *buf) |
3050 | { | 3217 | { |
3051 | return sprintf(buf, "%d\n", s->refcount - 1); | 3218 | return sprintf(buf, "%d\n", s->refcount - 1); |
@@ -3277,7 +3444,6 @@ static struct attribute * slab_attrs[] = { | |||
3277 | &partial_attr.attr, | 3444 | &partial_attr.attr, |
3278 | &cpu_slabs_attr.attr, | 3445 | &cpu_slabs_attr.attr, |
3279 | &ctor_attr.attr, | 3446 | &ctor_attr.attr, |
3280 | &dtor_attr.attr, | ||
3281 | &aliases_attr.attr, | 3447 | &aliases_attr.attr, |
3282 | &align_attr.attr, | 3448 | &align_attr.attr, |
3283 | &sanity_checks_attr.attr, | 3449 | &sanity_checks_attr.attr, |
@@ -3491,6 +3657,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) | |||
3491 | 3657 | ||
3492 | static int __init slab_sysfs_init(void) | 3658 | static int __init slab_sysfs_init(void) |
3493 | { | 3659 | { |
3660 | struct list_head *h; | ||
3494 | int err; | 3661 | int err; |
3495 | 3662 | ||
3496 | err = subsystem_register(&slab_subsys); | 3663 | err = subsystem_register(&slab_subsys); |
@@ -3499,7 +3666,15 @@ static int __init slab_sysfs_init(void) | |||
3499 | return -ENOSYS; | 3666 | return -ENOSYS; |
3500 | } | 3667 | } |
3501 | 3668 | ||
3502 | finish_bootstrap(); | 3669 | slab_state = SYSFS; |
3670 | |||
3671 | list_for_each(h, &slab_caches) { | ||
3672 | struct kmem_cache *s = | ||
3673 | container_of(h, struct kmem_cache, list); | ||
3674 | |||
3675 | err = sysfs_slab_add(s); | ||
3676 | BUG_ON(err); | ||
3677 | } | ||
3503 | 3678 | ||
3504 | while (alias_list) { | 3679 | while (alias_list) { |
3505 | struct saved_alias *al = alias_list; | 3680 | struct saved_alias *al = alias_list; |
@@ -3515,6 +3690,4 @@ static int __init slab_sysfs_init(void) | |||
3515 | } | 3690 | } |
3516 | 3691 | ||
3517 | __initcall(slab_sysfs_init); | 3692 | __initcall(slab_sysfs_init); |
3518 | #else | ||
3519 | __initcall(finish_bootstrap); | ||
3520 | #endif | 3693 | #endif |
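slab_sysfs_init() now registers every cache that was created before sysfs became available by walking the global slab_caches list with list_for_each() and recovering each kmem_cache from its embedded list_head with container_of(). The same pattern as a self-contained user-space sketch, with minimal stand-ins for the kernel's list primitives:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct kmem_cache {                     /* cut-down stand-in */
        const char *name;
        struct list_head list;          /* linked into the global cache list */
};

static void list_add_tail(struct list_head *new, struct list_head *head)
{
        new->prev = head->prev;
        new->next = head;
        head->prev->next = new;
        head->prev = new;
}

static void register_all(struct list_head *caches)
{
        struct list_head *h;

        /* Open-coded list_for_each(): visit every cache on the list. */
        for (h = caches->next; h != caches; h = h->next) {
                struct kmem_cache *s = container_of(h, struct kmem_cache, list);

                printf("registering sysfs entry for %s\n", s->name);
        }
}

int main(void)
{
        struct list_head caches = { &caches, &caches };
        struct kmem_cache a = { "kmalloc-64" }, b = { "kmalloc-128" };

        list_add_tail(&a.list, &caches);
        list_add_tail(&b.list, &caches);
        register_all(&caches);
        return 0;
}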
diff --git a/mm/sparse.c b/mm/sparse.c index 893e5621c247..545e4d3afcdf 100644 --- a/mm/sparse.c +++ b/mm/sparse.c | |||
@@ -44,7 +44,7 @@ EXPORT_SYMBOL(page_to_nid); | |||
44 | #endif | 44 | #endif |
45 | 45 | ||
46 | #ifdef CONFIG_SPARSEMEM_EXTREME | 46 | #ifdef CONFIG_SPARSEMEM_EXTREME |
47 | static struct mem_section *sparse_index_alloc(int nid) | 47 | static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) |
48 | { | 48 | { |
49 | struct mem_section *section = NULL; | 49 | struct mem_section *section = NULL; |
50 | unsigned long array_size = SECTIONS_PER_ROOT * | 50 | unsigned long array_size = SECTIONS_PER_ROOT * |
@@ -61,7 +61,7 @@ static struct mem_section *sparse_index_alloc(int nid) | |||
61 | return section; | 61 | return section; |
62 | } | 62 | } |
63 | 63 | ||
64 | static int sparse_index_init(unsigned long section_nr, int nid) | 64 | static int __meminit sparse_index_init(unsigned long section_nr, int nid) |
65 | { | 65 | { |
66 | static DEFINE_SPINLOCK(index_init_lock); | 66 | static DEFINE_SPINLOCK(index_init_lock); |
67 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); | 67 | unsigned long root = SECTION_NR_TO_ROOT(section_nr); |
@@ -138,7 +138,7 @@ static inline int sparse_early_nid(struct mem_section *section) | |||
138 | } | 138 | } |
139 | 139 | ||
140 | /* Record a memory area against a node. */ | 140 | /* Record a memory area against a node. */ |
141 | void memory_present(int nid, unsigned long start, unsigned long end) | 141 | void __init memory_present(int nid, unsigned long start, unsigned long end) |
142 | { | 142 | { |
143 | unsigned long pfn; | 143 | unsigned long pfn; |
144 | 144 | ||
@@ -197,7 +197,7 @@ struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pn | |||
197 | return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); | 197 | return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); |
198 | } | 198 | } |
199 | 199 | ||
200 | static int sparse_init_one_section(struct mem_section *ms, | 200 | static int __meminit sparse_init_one_section(struct mem_section *ms, |
201 | unsigned long pnum, struct page *mem_map) | 201 | unsigned long pnum, struct page *mem_map) |
202 | { | 202 | { |
203 | if (!valid_section(ms)) | 203 | if (!valid_section(ms)) |
@@ -209,7 +209,13 @@ static int sparse_init_one_section(struct mem_section *ms, | |||
209 | return 1; | 209 | return 1; |
210 | } | 210 | } |
211 | 211 | ||
212 | static struct page *sparse_early_mem_map_alloc(unsigned long pnum) | 212 | __attribute__((weak)) |
213 | void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) | ||
214 | { | ||
215 | return NULL; | ||
216 | } | ||
217 | |||
218 | static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) | ||
213 | { | 219 | { |
214 | struct page *map; | 220 | struct page *map; |
215 | struct mem_section *ms = __nr_to_section(pnum); | 221 | struct mem_section *ms = __nr_to_section(pnum); |
@@ -219,6 +225,11 @@ static struct page *sparse_early_mem_map_alloc(unsigned long pnum) | |||
219 | if (map) | 225 | if (map) |
220 | return map; | 226 | return map; |
221 | 227 | ||
228 | map = alloc_bootmem_high_node(NODE_DATA(nid), | ||
229 | sizeof(struct page) * PAGES_PER_SECTION); | ||
230 | if (map) | ||
231 | return map; | ||
232 | |||
222 | map = alloc_bootmem_node(NODE_DATA(nid), | 233 | map = alloc_bootmem_node(NODE_DATA(nid), |
223 | sizeof(struct page) * PAGES_PER_SECTION); | 234 | sizeof(struct page) * PAGES_PER_SECTION); |
224 | if (map) | 235 | if (map) |
@@ -288,6 +299,7 @@ void __init sparse_init(void) | |||
288 | } | 299 | } |
289 | } | 300 | } |
290 | 301 | ||
302 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
291 | /* | 303 | /* |
292 | * returns the number of sections whose mem_maps were properly | 304 | * returns the number of sections whose mem_maps were properly |
293 | * set. If this is <=0, then that means that the passed-in | 305 | * set. If this is <=0, then that means that the passed-in |
@@ -327,3 +339,4 @@ out: | |||
327 | __kfree_section_memmap(memmap, nr_pages); | 339 | __kfree_section_memmap(memmap, nr_pages); |
328 | return ret; | 340 | return ret; |
329 | } | 341 | } |
342 | #endif | ||
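sparse_early_mem_map_alloc() now tries allocation sources in order of preference: the optional alloc_bootmem_high_node() (the weak default added above just returns NULL, so architectures that do not implement it fall straight through), then node-local bootmem, then the existing fallbacks. The try-each-source chain in schematic form, with placeholder allocators standing in for the bootmem variants:

#include <stdio.h>
#include <stdlib.h>

/* Placeholders for the bootmem variants; each may return NULL,
 * in which case the next, less preferred source is tried. */
static void *alloc_high_node(size_t size)  { (void)size; return NULL; }
static void *alloc_node_local(size_t size) { return malloc(size); }
static void *alloc_fallback(size_t size)   { return malloc(size); }

static void *alloc_section_map(size_t size)
{
        void *map;

        if ((map = alloc_high_node(size)))
                return map;
        if ((map = alloc_node_local(size)))
                return map;
        return alloc_fallback(size);    /* may still be NULL: caller checks */
}

int main(void)
{
        void *map = alloc_section_map(4096);

        printf("mem_map %sallocated\n", map ? "" : "not ");
        free(map);
        return 0;
}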
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -488,7 +488,7 @@ static int cpu_swap_callback(struct notifier_block *nfb, | |||
488 | long *committed; | 488 | long *committed; |
489 | 489 | ||
490 | committed = &per_cpu(committed_space, (long)hcpu); | 490 | committed = &per_cpu(committed_space, (long)hcpu); |
491 | if (action == CPU_DEAD) { | 491 | if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { |
492 | atomic_add(*committed, &vm_committed_space); | 492 | atomic_add(*committed, &vm_committed_space); |
493 | *committed = 0; | 493 | *committed = 0; |
494 | __lru_add_drain((long)hcpu); | 494 | __lru_add_drain((long)hcpu); |
diff --git a/mm/thrash.c b/mm/thrash.c index 9ef9071f99bc..c4c5205a9c35 100644 --- a/mm/thrash.c +++ b/mm/thrash.c | |||
@@ -48,9 +48,8 @@ void grab_swap_token(void) | |||
48 | if (current_interval < current->mm->last_interval) | 48 | if (current_interval < current->mm->last_interval) |
49 | current->mm->token_priority++; | 49 | current->mm->token_priority++; |
50 | else { | 50 | else { |
51 | current->mm->token_priority--; | 51 | if (likely(current->mm->token_priority > 0)) |
52 | if (unlikely(current->mm->token_priority < 0)) | 52 | current->mm->token_priority--; |
53 | current->mm->token_priority = 0; | ||
54 | } | 53 | } |
55 | /* Check if we deserve the token */ | 54 | /* Check if we deserve the token */ |
56 | if (current->mm->token_priority > | 55 | if (current->mm->token_priority > |
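The swap-token change replaces "decrement, then clamp with a < 0 test" by a guarded decrement: on an unsigned counter the old test can never be true, so the priority could wrap around to a huge value instead of stopping at zero. What the new code implements is simply a saturating decrement:

#include <assert.h>

/* Saturating decrement: never goes below zero, never wraps.
 * (The removed pattern "p--; if (p < 0) p = 0;" cannot work when
 * p is unsigned, because p-- on 0 wraps to the maximum value.) */
static void priority_dec(unsigned int *prio)
{
        if (*prio > 0)
                (*prio)--;
}

int main(void)
{
        unsigned int prio = 0;

        priority_dec(&prio);
        assert(prio == 0);              /* stays clamped at zero */
        return 0;
}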
diff --git a/mm/truncate.c b/mm/truncate.c index 0f4b6d18ab0e..4fbe1a2da5fb 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/swap.h> | 12 | #include <linux/swap.h> |
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/pagemap.h> | 14 | #include <linux/pagemap.h> |
15 | #include <linux/highmem.h> | ||
15 | #include <linux/pagevec.h> | 16 | #include <linux/pagevec.h> |
16 | #include <linux/task_io_accounting_ops.h> | 17 | #include <linux/task_io_accounting_ops.h> |
17 | #include <linux/buffer_head.h> /* grr. try_to_release_page, | 18 | #include <linux/buffer_head.h> /* grr. try_to_release_page, |
@@ -46,7 +47,7 @@ void do_invalidatepage(struct page *page, unsigned long offset) | |||
46 | 47 | ||
47 | static inline void truncate_partial_page(struct page *page, unsigned partial) | 48 | static inline void truncate_partial_page(struct page *page, unsigned partial) |
48 | { | 49 | { |
49 | memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); | 50 | zero_user_page(page, partial, PAGE_CACHE_SIZE - partial, KM_USER0); |
50 | if (PagePrivate(page)) | 51 | if (PagePrivate(page)) |
51 | do_invalidatepage(page, partial); | 52 | do_invalidatepage(page, partial); |
52 | } | 53 | } |
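zero_user_page() replaces memclear_highpage_flush(); both zero a byte range of a possibly highmem page through a temporary kernel mapping and keep the data cache coherent so user mappings see the zeroes. In outline the helper is expected to behave like the sketch below (kernel-style pseudocode, not the exact mainline implementation):

/* Zero 'size' bytes at 'offset' in a page that may live in high memory:
 * map it briefly, clear the range, flush, unmap. */
static void zero_user_page_sketch(struct page *page, unsigned int offset,
                                  unsigned int size, enum km_type km)
{
        char *kaddr = kmap_atomic(page, km);    /* short-lived per-CPU mapping */

        memset(kaddr + offset, 0, size);
        flush_dcache_page(page);                /* for virtually indexed caches */
        kunmap_atomic(kaddr, km);
}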
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index cb5aabda7046..d3a9c5368257 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -311,7 +311,7 @@ struct vm_struct *remove_vm_area(void *addr) | |||
311 | return v; | 311 | return v; |
312 | } | 312 | } |
313 | 313 | ||
314 | void __vunmap(void *addr, int deallocate_pages) | 314 | static void __vunmap(void *addr, int deallocate_pages) |
315 | { | 315 | { |
316 | struct vm_struct *area; | 316 | struct vm_struct *area; |
317 | 317 | ||
@@ -755,3 +755,10 @@ out_einval_locked: | |||
755 | } | 755 | } |
756 | EXPORT_SYMBOL(remap_vmalloc_range); | 756 | EXPORT_SYMBOL(remap_vmalloc_range); |
757 | 757 | ||
758 | /* | ||
759 | * Implement a stub for vmalloc_sync_all() if the architecture chose not to | ||
760 | * have one. | ||
761 | */ | ||
762 | void __attribute__((weak)) vmalloc_sync_all(void) | ||
763 | { | ||
764 | } | ||
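The weak vmalloc_sync_all() stub gives every architecture a no-op default while still letting an architecture that needs real work (x86's fault path, for instance) provide a strong definition that the linker prefers. Outside the kernel the mechanism looks like the sketch below; sync_mappings() is an invented name and the three parts are separate translation units:

/* --- sync_default.c: the generic weak no-op (what mm/vmalloc.c supplies) -- */
void __attribute__((weak)) sync_mappings(void)
{
        /* nothing to do on architectures that never need syncing */
}

/* --- caller.c: callers need no #ifdef, they simply call the function ------ */
void sync_mappings(void);

int main(void)
{
        sync_mappings();                /* weak no-op, or the override if linked */
        return 0;
}

/* --- arch.c (optional): a strong definition here replaces the weak one
 *     at link time, without any Kconfig or #ifdef plumbing in the callers:
 *
 *     void sync_mappings(void)
 *     {
 *             ...walk and synchronize the kernel page tables...
 *     }
 */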
diff --git a/mm/vmscan.c b/mm/vmscan.c index 56651a10c366..1be5a6376ef0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -284,12 +284,8 @@ static void handle_write_error(struct address_space *mapping, | |||
284 | struct page *page, int error) | 284 | struct page *page, int error) |
285 | { | 285 | { |
286 | lock_page(page); | 286 | lock_page(page); |
287 | if (page_mapping(page) == mapping) { | 287 | if (page_mapping(page) == mapping) |
288 | if (error == -ENOSPC) | 288 | mapping_set_error(mapping, error); |
289 | set_bit(AS_ENOSPC, &mapping->flags); | ||
290 | else | ||
291 | set_bit(AS_EIO, &mapping->flags); | ||
292 | } | ||
293 | unlock_page(page); | 289 | unlock_page(page); |
294 | } | 290 | } |
295 | 291 | ||
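mapping_set_error() replaces the open-coded flag setting above. Reconstructed from the removed lines, its effect is essentially the sketch below; the guard against a zero error code is an assumption about the real helper (handle_write_error() itself is only called with a failing status, so the removed code never needed it):

/* Remember a writeback failure on the mapping so a later fsync()/msync()
 * can report it; AS_ENOSPC and AS_EIO are the existing flag bits. */
static inline void mapping_set_error(struct address_space *mapping, int error)
{
        if (error) {                    /* assumed guard in the real helper */
                if (error == -ENOSPC)
                        set_bit(AS_ENOSPC, &mapping->flags);
                else
                        set_bit(AS_EIO, &mapping->flags);
        }
}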
@@ -1532,7 +1528,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, | |||
1532 | pg_data_t *pgdat; | 1528 | pg_data_t *pgdat; |
1533 | cpumask_t mask; | 1529 | cpumask_t mask; |
1534 | 1530 | ||
1535 | if (action == CPU_ONLINE) { | 1531 | if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { |
1536 | for_each_online_pgdat(pgdat) { | 1532 | for_each_online_pgdat(pgdat) { |
1537 | mask = node_to_cpumask(pgdat->node_id); | 1533 | mask = node_to_cpumask(pgdat->node_id); |
1538 | if (any_online_cpu(mask) != NR_CPUS) | 1534 | if (any_online_cpu(mask) != NR_CPUS) |
diff --git a/mm/vmstat.c b/mm/vmstat.c index 6c488d6ac425..38254297a494 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | #include <linux/sched.h> | ||
15 | 16 | ||
16 | #ifdef CONFIG_VM_EVENT_COUNTERS | 17 | #ifdef CONFIG_VM_EVENT_COUNTERS |
17 | DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; | 18 | DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; |
@@ -281,6 +282,17 @@ EXPORT_SYMBOL(dec_zone_page_state); | |||
281 | 282 | ||
282 | /* | 283 | /* |
283 | * Update the zone counters for one cpu. | 284 | * Update the zone counters for one cpu. |
285 | * | ||
286 | * Note that refresh_cpu_vm_stats strives to only access | ||
287 | * node local memory. The per cpu pagesets on remote zones are placed | ||
288 | * in the memory local to the processor using that pageset. So the | ||
289 | * loop over all zones will access a series of cachelines local to | ||
290 | * the processor. | ||
291 | * | ||
292 | * The call to zone_page_state_add updates the cachelines with the | ||
293 | * statistics in the remote zone struct as well as the global cachelines | ||
294 | * with the global counters. These could cause remote node cache line | ||
295 | * bouncing and will have to be only done when necessary. | ||
284 | */ | 296 | */ |
285 | void refresh_cpu_vm_stats(int cpu) | 297 | void refresh_cpu_vm_stats(int cpu) |
286 | { | 298 | { |
@@ -289,21 +301,54 @@ void refresh_cpu_vm_stats(int cpu) | |||
289 | unsigned long flags; | 301 | unsigned long flags; |
290 | 302 | ||
291 | for_each_zone(zone) { | 303 | for_each_zone(zone) { |
292 | struct per_cpu_pageset *pcp; | 304 | struct per_cpu_pageset *p; |
293 | 305 | ||
294 | if (!populated_zone(zone)) | 306 | if (!populated_zone(zone)) |
295 | continue; | 307 | continue; |
296 | 308 | ||
297 | pcp = zone_pcp(zone, cpu); | 309 | p = zone_pcp(zone, cpu); |
298 | 310 | ||
299 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) | 311 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) |
300 | if (pcp->vm_stat_diff[i]) { | 312 | if (p->vm_stat_diff[i]) { |
301 | local_irq_save(flags); | 313 | local_irq_save(flags); |
302 | zone_page_state_add(pcp->vm_stat_diff[i], | 314 | zone_page_state_add(p->vm_stat_diff[i], |
303 | zone, i); | 315 | zone, i); |
304 | pcp->vm_stat_diff[i] = 0; | 316 | p->vm_stat_diff[i] = 0; |
317 | #ifdef CONFIG_NUMA | ||
318 | /* 3 seconds idle till flush */ | ||
319 | p->expire = 3; | ||
320 | #endif | ||
305 | local_irq_restore(flags); | 321 | local_irq_restore(flags); |
306 | } | 322 | } |
323 | #ifdef CONFIG_NUMA | ||
324 | /* | ||
325 | * Deal with draining the remote pageset of this | ||
326 | * processor | ||
327 | * | ||
328 | * Check if there are pages remaining in this pageset | ||
329 | * if not then there is nothing to expire. | ||
330 | */ | ||
331 | if (!p->expire || (!p->pcp[0].count && !p->pcp[1].count)) | ||
332 | continue; | ||
333 | |||
334 | /* | ||
335 | * We never drain zones local to this processor. | ||
336 | */ | ||
337 | if (zone_to_nid(zone) == numa_node_id()) { | ||
338 | p->expire = 0; | ||
339 | continue; | ||
340 | } | ||
341 | |||
342 | p->expire--; | ||
343 | if (p->expire) | ||
344 | continue; | ||
345 | |||
346 | if (p->pcp[0].count) | ||
347 | drain_zone_pages(zone, p->pcp + 0); | ||
348 | |||
349 | if (p->pcp[1].count) | ||
350 | drain_zone_pages(zone, p->pcp + 1); | ||
351 | #endif | ||
307 | } | 352 | } |
308 | } | 353 | } |
309 | 354 | ||
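The NUMA branch adds a small idle-expiry scheme for pagesets that cache pages belonging to remote nodes: any local counter activity rearms expire to 3, each pass of refresh_cpu_vm_stats() (one per stat interval, roughly a second) counts it down, and only once it hits zero are the cached pages drained back to the remote zone. The countdown in isolation, with invented names:

#include <stdio.h>

#define IDLE_INTERVALS 3                /* matches "3 seconds idle till flush" */

struct pageset {
        int expire;                     /* intervals left before draining  */
        int count;                      /* pages cached from a remote node */
};

/* Called once per stats interval for a pageset that belongs to a remote node. */
static void maybe_drain(struct pageset *p, int had_activity)
{
        if (had_activity)
                p->expire = IDLE_INTERVALS;     /* rearm on any activity */

        if (!p->expire || !p->count)
                return;                         /* nothing pending */

        if (--p->expire)
                return;                         /* not idle for long enough yet */

        printf("draining %d cached pages\n", p->count);
        p->count = 0;
}

int main(void)
{
        struct pageset p = { .expire = 0, .count = 17 };
        int i;

        maybe_drain(&p, 1);                     /* activity: rearm the timer */
        for (i = 0; i < IDLE_INTERVALS; i++)
                maybe_drain(&p, 0);             /* idle passes until the drain */
        return 0;
}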
@@ -640,6 +685,24 @@ const struct seq_operations vmstat_op = { | |||
640 | #endif /* CONFIG_PROC_FS */ | 685 | #endif /* CONFIG_PROC_FS */ |
641 | 686 | ||
642 | #ifdef CONFIG_SMP | 687 | #ifdef CONFIG_SMP |
688 | static DEFINE_PER_CPU(struct delayed_work, vmstat_work); | ||
689 | int sysctl_stat_interval __read_mostly = HZ; | ||
690 | |||
691 | static void vmstat_update(struct work_struct *w) | ||
692 | { | ||
693 | refresh_cpu_vm_stats(smp_processor_id()); | ||
694 | schedule_delayed_work(&__get_cpu_var(vmstat_work), | ||
695 | sysctl_stat_interval); | ||
696 | } | ||
697 | |||
698 | static void __devinit start_cpu_timer(int cpu) | ||
699 | { | ||
700 | struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); | ||
701 | |||
702 | INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); | ||
703 | schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu); | ||
704 | } | ||
705 | |||
643 | /* | 706 | /* |
644 | * Use the cpu notifier to insure that the thresholds are recalculated | 707 | * Use the cpu notifier to insure that the thresholds are recalculated |
645 | * when necessary. | 708 | * when necessary. |
@@ -648,10 +711,24 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, | |||
648 | unsigned long action, | 711 | unsigned long action, |
649 | void *hcpu) | 712 | void *hcpu) |
650 | { | 713 | { |
714 | long cpu = (long)hcpu; | ||
715 | |||
651 | switch (action) { | 716 | switch (action) { |
652 | case CPU_UP_PREPARE: | 717 | case CPU_ONLINE: |
653 | case CPU_UP_CANCELED: | 718 | case CPU_ONLINE_FROZEN: |
719 | start_cpu_timer(cpu); | ||
720 | break; | ||
721 | case CPU_DOWN_PREPARE: | ||
722 | case CPU_DOWN_PREPARE_FROZEN: | ||
723 | cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu)); | ||
724 | per_cpu(vmstat_work, cpu).work.func = NULL; | ||
725 | break; | ||
726 | case CPU_DOWN_FAILED: | ||
727 | case CPU_DOWN_FAILED_FROZEN: | ||
728 | start_cpu_timer(cpu); | ||
729 | break; | ||
654 | case CPU_DEAD: | 730 | case CPU_DEAD: |
731 | case CPU_DEAD_FROZEN: | ||
655 | refresh_zone_stat_thresholds(); | 732 | refresh_zone_stat_thresholds(); |
656 | break; | 733 | break; |
657 | default: | 734 | default: |
@@ -665,8 +742,13 @@ static struct notifier_block __cpuinitdata vmstat_notifier = | |||
665 | 742 | ||
666 | int __init setup_vmstat(void) | 743 | int __init setup_vmstat(void) |
667 | { | 744 | { |
745 | int cpu; | ||
746 | |||
668 | refresh_zone_stat_thresholds(); | 747 | refresh_zone_stat_thresholds(); |
669 | register_cpu_notifier(&vmstat_notifier); | 748 | register_cpu_notifier(&vmstat_notifier); |
749 | |||
750 | for_each_online_cpu(cpu) | ||
751 | start_cpu_timer(cpu); | ||
670 | return 0; | 752 | return 0; |
671 | } | 753 | } |
672 | module_init(setup_vmstat) | 754 | module_init(setup_vmstat) |