author     Christoph Lameter <cl@linux.com>     2011-02-25 12:38:54 -0500
committer  Pekka Enberg <penberg@kernel.org>    2011-03-11 10:42:49 -0500
commit     8a5ec0ba42c4919e2d8f4c3138cc8b987fdb0b79
tree       d20d4eeb63351e3bd76b7957fa434f2b9f85ec14 /mm/slub.c
parent     d3f661d69a486db0e0e6343b452f45d91b4b3656
Lockless (and preemptless) fastpaths for slub
Use the this_cpu_cmpxchg_double functionality to implement a lockless
allocation algorithm on arches that support fast this_cpu_ops.

Each of the per cpu pointers is paired with a transaction id that ensures
that updates of the per cpu information can only occur in sequence on a
certain cpu. A transaction id is a "long" integer that is comprised of an
event number and the cpu number. The event number is incremented for every
change to the per cpu state. The cmpxchg instruction can therefore verify,
for an update, that nothing interfered, that we are updating the percpu
structure of the processor where we picked up the information, and that we
are still on that processor when we perform the update.

This results in a significant decrease of the overhead in the fastpaths. It
also makes it easy to adopt the fast path for realtime kernels, since the
path is lockless and does not require that the same per cpu area be used
over the whole critical section; it only matters that the per cpu area is
current at the beginning of the critical section and at the end. So there
is no need even to disable preemption.

Test results show that the fastpath cycle count is reduced by up to ~40%
(the alloc/free test goes from ~140 cycles down to ~80). The slowpath for
kfree adds a few cycles.

Sadly this does nothing for the slowpath, which is where the main
performance issues in slub are, but the best case performance rises
significantly. (For the slowpath see the more complex slub patches that
require cmpxchg_double.)

Kmalloc: alloc/free test

Before:
10000 times kmalloc(8)/kfree     -> 134 cycles
10000 times kmalloc(16)/kfree    -> 152 cycles
10000 times kmalloc(32)/kfree    -> 144 cycles
10000 times kmalloc(64)/kfree    -> 142 cycles
10000 times kmalloc(128)/kfree   -> 142 cycles
10000 times kmalloc(256)/kfree   -> 132 cycles
10000 times kmalloc(512)/kfree   -> 132 cycles
10000 times kmalloc(1024)/kfree  -> 135 cycles
10000 times kmalloc(2048)/kfree  -> 135 cycles
10000 times kmalloc(4096)/kfree  -> 135 cycles
10000 times kmalloc(8192)/kfree  -> 144 cycles
10000 times kmalloc(16384)/kfree -> 754 cycles

After:
10000 times kmalloc(8)/kfree     ->  78 cycles
10000 times kmalloc(16)/kfree    ->  78 cycles
10000 times kmalloc(32)/kfree    ->  82 cycles
10000 times kmalloc(64)/kfree    ->  88 cycles
10000 times kmalloc(128)/kfree   ->  79 cycles
10000 times kmalloc(256)/kfree   ->  79 cycles
10000 times kmalloc(512)/kfree   ->  85 cycles
10000 times kmalloc(1024)/kfree  ->  82 cycles
10000 times kmalloc(2048)/kfree  ->  82 cycles
10000 times kmalloc(4096)/kfree  ->  85 cycles
10000 times kmalloc(8192)/kfree  ->  82 cycles
10000 times kmalloc(16384)/kfree -> 706 cycles

Kmalloc: Repeatedly allocate then free test

Before:
10000 times kmalloc(8)     ->  211 cycles  kfree ->  113 cycles
10000 times kmalloc(16)    ->  174 cycles  kfree ->  115 cycles
10000 times kmalloc(32)    ->  235 cycles  kfree ->  129 cycles
10000 times kmalloc(64)    ->  222 cycles  kfree ->  120 cycles
10000 times kmalloc(128)   ->  343 cycles  kfree ->  139 cycles
10000 times kmalloc(256)   ->  827 cycles  kfree ->  147 cycles
10000 times kmalloc(512)   -> 1048 cycles  kfree ->  272 cycles
10000 times kmalloc(1024)  -> 2043 cycles  kfree ->  528 cycles
10000 times kmalloc(2048)  -> 4002 cycles  kfree ->  571 cycles
10000 times kmalloc(4096)  -> 7740 cycles  kfree ->  628 cycles
10000 times kmalloc(8192)  -> 8062 cycles  kfree ->  850 cycles
10000 times kmalloc(16384) -> 8895 cycles  kfree -> 1249 cycles

After:
10000 times kmalloc(8)     ->  190 cycles  kfree ->  129 cycles
10000 times kmalloc(16)    ->   76 cycles  kfree ->  123 cycles
10000 times kmalloc(32)    ->  126 cycles  kfree ->  124 cycles
10000 times kmalloc(64)    ->  181 cycles  kfree ->  128 cycles
10000 times kmalloc(128)   ->  310 cycles  kfree ->  140 cycles
10000 times kmalloc(256)   ->  809 cycles  kfree ->  165 cycles
10000 times kmalloc(512)   -> 1005 cycles  kfree ->  269 cycles
10000 times kmalloc(1024)  -> 1999 cycles  kfree ->  527 cycles
10000 times kmalloc(2048)  -> 3967 cycles  kfree ->  570 cycles
10000 times kmalloc(4096)  -> 7658 cycles  kfree ->  637 cycles
10000 times kmalloc(8192)  -> 8111 cycles  kfree ->  859 cycles
10000 times kmalloc(16384) -> 8791 cycles  kfree -> 1173 cycles

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
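The transaction id scheme described above is small enough to sketch outside the kernel. The following is a minimal user-space illustration, not the kernel code: TID_STEP is hard-coded to 8 as a stand-in for roundup_pow_of_two(CONFIG_NR_CPUS), and the helpers mirror the next_tid()/tid_to_cpu()/tid_to_event() functions added by this patch.

#include <stdio.h>

/* Stand-in for roundup_pow_of_two(CONFIG_NR_CPUS); assume an 8-cpu box. */
#define TID_STEP 8UL

/* A tid starts at the cpu number and advances by TID_STEP per event, so the
 * low bits always identify the cpu and the high bits count the events. */
static unsigned long init_tid(int cpu)               { return cpu; }
static unsigned long next_tid(unsigned long tid)     { return tid + TID_STEP; }
static unsigned long tid_to_cpu(unsigned long tid)   { return tid % TID_STEP; }
static unsigned long tid_to_event(unsigned long tid) { return tid / TID_STEP; }

int main(void)
{
        unsigned long tid = init_tid(3);        /* per cpu state of cpu 3 */

        tid = next_tid(next_tid(tid));          /* two operations on that cpu */
        printf("cpu=%lu event=%lu\n", tid_to_cpu(tid), tid_to_event(tid));
        /* Prints "cpu=3 event=2": a tid captured on another cpu, or before an
         * earlier operation completed, can never compare equal later on. */
        return 0;
}

Because the cpu number lives in the low bits and the event count in the high bits, a single compare on the tid simultaneously checks "same cpu" and "nothing happened in between", which is exactly what the fastpaths in the diff below rely on.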
Diffstat (limited to 'mm/slub.c')
-rw-r--r--  mm/slub.c  205
1 file changed, 203 insertions(+), 2 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index bae7a5c636f4..65030c7fd7e2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1494,6 +1494,77 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
         }
 }
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+#ifdef CONFIG_PREEMPT
+/*
+ * Calculate the next globally unique transaction for disambiguation
+ * during cmpxchg. The transactions start with the cpu number and are then
+ * incremented by CONFIG_NR_CPUS.
+ */
+#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS)
+#else
+/*
+ * No preemption supported therefore also no need to check for
+ * different cpus.
+ */
+#define TID_STEP 1
+#endif
+
+static inline unsigned long next_tid(unsigned long tid)
+{
+        return tid + TID_STEP;
+}
+
+static inline unsigned int tid_to_cpu(unsigned long tid)
+{
+        return tid % TID_STEP;
+}
+
+static inline unsigned long tid_to_event(unsigned long tid)
+{
+        return tid / TID_STEP;
+}
+
+static inline unsigned int init_tid(int cpu)
+{
+        return cpu;
+}
+
+static inline void note_cmpxchg_failure(const char *n,
+                const struct kmem_cache *s, unsigned long tid)
+{
+#ifdef SLUB_DEBUG_CMPXCHG
+        unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
+
+        printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
+
+#ifdef CONFIG_PREEMPT
+        if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
+                printk("due to cpu change %d -> %d\n",
+                        tid_to_cpu(tid), tid_to_cpu(actual_tid));
+        else
+#endif
+        if (tid_to_event(tid) != tid_to_event(actual_tid))
+                printk("due to cpu running other code. Event %ld->%ld\n",
+                        tid_to_event(tid), tid_to_event(actual_tid));
+        else
+                printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
+                        actual_tid, tid, next_tid(tid));
+#endif
+}
+
+#endif
+
+void init_kmem_cache_cpus(struct kmem_cache *s)
+{
+#if defined(CONFIG_CMPXCHG_LOCAL) && defined(CONFIG_PREEMPT)
+        int cpu;
+
+        for_each_possible_cpu(cpu)
+                per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+#endif
+
+}
 /*
  * Remove the cpu slab
  */
@@ -1525,6 +1596,9 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
                 page->inuse--;
         }
         c->page = NULL;
+#ifdef CONFIG_CMPXCHG_LOCAL
+        c->tid = next_tid(c->tid);
+#endif
         unfreeze_slab(s, page, tail);
 }
 
@@ -1659,6 +1733,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 {
         void **object;
         struct page *new;
+#ifdef CONFIG_CMPXCHG_LOCAL
+        unsigned long flags;
+
+        local_irq_save(flags);
+#ifdef CONFIG_PREEMPT
+        /*
+         * We may have been preempted and rescheduled on a different
+         * cpu before disabling interrupts. Need to reload cpu area
+         * pointer.
+         */
+        c = this_cpu_ptr(s->cpu_slab);
+#endif
+#endif
 
         /* We handle __GFP_ZERO in the caller */
         gfpflags &= ~__GFP_ZERO;
@@ -1685,6 +1772,10 @@ load_freelist:
         c->node = page_to_nid(c->page);
 unlock_out:
         slab_unlock(c->page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+        c->tid = next_tid(c->tid);
+        local_irq_restore(flags);
+#endif
         stat(s, ALLOC_SLOWPATH);
         return object;
 
@@ -1746,23 +1837,76 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
 {
         void **object;
         struct kmem_cache_cpu *c;
+#ifdef CONFIG_CMPXCHG_LOCAL
+        unsigned long tid;
+#else
         unsigned long flags;
+#endif
 
         if (slab_pre_alloc_hook(s, gfpflags))
                 return NULL;
 
+#ifndef CONFIG_CMPXCHG_LOCAL
         local_irq_save(flags);
+#else
+redo:
+#endif
+
+        /*
+         * Must read kmem_cache cpu data via this cpu ptr. Preemption is
+         * enabled. We may switch back and forth between cpus while
+         * reading from one cpu area. That does not matter as long
+         * as we end up on the original cpu again when doing the cmpxchg.
+         */
         c = __this_cpu_ptr(s->cpu_slab);
+
+#ifdef CONFIG_CMPXCHG_LOCAL
+        /*
+         * The transaction ids are globally unique per cpu and per operation on
+         * a per cpu queue. Thus they guarantee that the cmpxchg_double
+         * occurs on the right processor and that there was no operation on the
+         * linked list in between.
+         */
+        tid = c->tid;
+        barrier();
+#endif
+
         object = c->freelist;
         if (unlikely(!object || !node_match(c, node)))
 
                 object = __slab_alloc(s, gfpflags, node, addr, c);
 
         else {
+#ifdef CONFIG_CMPXCHG_LOCAL
+                /*
+                 * The cmpxchg will only match if there was no additional
+                 * operation and if we are on the right processor.
+                 *
+                 * The cmpxchg does the following atomically (without lock semantics!)
+                 * 1. Relocate first pointer to the current per cpu area.
+                 * 2. Verify that tid and freelist have not been changed
+                 * 3. If they were not changed replace tid and freelist
+                 *
+                 * Since this is without lock semantics the protection is only against
+                 * code executing on this cpu *not* from access by other cpus.
+                 */
+                if (unlikely(!this_cpu_cmpxchg_double(
+                                s->cpu_slab->freelist, s->cpu_slab->tid,
+                                object, tid,
+                                get_freepointer(s, object), next_tid(tid)))) {
+
+                        note_cmpxchg_failure("slab_alloc", s, tid);
+                        goto redo;
+                }
+#else
                 c->freelist = get_freepointer(s, object);
+#endif
                 stat(s, ALLOC_FASTPATH);
         }
+
+#ifndef CONFIG_CMPXCHG_LOCAL
         local_irq_restore(flags);
+#endif
 
         if (unlikely(gfpflags & __GFP_ZERO) && object)
                 memset(object, 0, s->objsize);
@@ -1840,9 +1984,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 {
         void *prior;
         void **object = (void *)x;
+#ifdef CONFIG_CMPXCHG_LOCAL
+        unsigned long flags;
 
-        stat(s, FREE_SLOWPATH);
+        local_irq_save(flags);
+#endif
         slab_lock(page);
+        stat(s, FREE_SLOWPATH);
 
         if (kmem_cache_debug(s))
                 goto debug;
@@ -1872,6 +2020,9 @@ checks_ok:
 
 out_unlock:
         slab_unlock(page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+        local_irq_restore(flags);
+#endif
         return;
 
 slab_empty:
@@ -1883,6 +2034,9 @@ slab_empty:
                 stat(s, FREE_REMOVE_PARTIAL);
         }
         slab_unlock(page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+        local_irq_restore(flags);
+#endif
         stat(s, FREE_SLAB);
         discard_slab(s, page);
         return;
@@ -1909,21 +2063,54 @@ static __always_inline void slab_free(struct kmem_cache *s,
 {
         void **object = (void *)x;
         struct kmem_cache_cpu *c;
+#ifdef CONFIG_CMPXCHG_LOCAL
+        unsigned long tid;
+#else
         unsigned long flags;
+#endif
 
         slab_free_hook(s, x);
 
+#ifndef CONFIG_CMPXCHG_LOCAL
         local_irq_save(flags);
+#endif
+
+redo:
+        /*
+         * Determine the current cpu's per cpu slab.
+         * The cpu may change afterward. However that does not matter since
+         * data is retrieved via this pointer. If we are on the same cpu
+         * during the cmpxchg then the free will succeed.
+         */
         c = __this_cpu_ptr(s->cpu_slab);
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+        tid = c->tid;
+        barrier();
+#endif
+
         if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
                 set_freepointer(s, object, c->freelist);
+
+#ifdef CONFIG_CMPXCHG_LOCAL
+                if (unlikely(!this_cpu_cmpxchg_double(
+                                s->cpu_slab->freelist, s->cpu_slab->tid,
+                                c->freelist, tid,
+                                object, next_tid(tid)))) {
+
+                        note_cmpxchg_failure("slab_free", s, tid);
+                        goto redo;
+                }
+#else
                 c->freelist = object;
+#endif
                 stat(s, FREE_FASTPATH);
         } else
                 __slab_free(s, page, x, addr);
 
+#ifndef CONFIG_CMPXCHG_LOCAL
         local_irq_restore(flags);
+#endif
 }
 
 void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -2115,9 +2302,23 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
         BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
                         SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+        /*
+         * Must align to double word boundary for the double cmpxchg instructions
+         * to work.
+         */
+        s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *));
+#else
+        /* Regular alignment is sufficient */
         s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
+#endif
+
+        if (!s->cpu_slab)
+                return 0;
+
+        init_kmem_cache_cpus(s);
 
-        return s->cpu_slab != NULL;
+        return 1;
 }
 
 static struct kmem_cache *kmem_cache_node;
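As a rough aid to reading the fastpath hunks above, here is a hedged user-space analogue of the redo loop. It only illustrates the (freelist, tid) pairing and the retry-on-failure logic, not the kernel implementation: a C11 atomic struct stands in for this_cpu_cmpxchg_double (the compare-and-swap on a two-word struct may be implemented with a lock by the compiler or libatomic, whereas the kernel relies on a true double-width cmpxchg plus per cpu addressing), and TID_STEP is again a hard-coded stand-in.

#include <stdatomic.h>
#include <stdio.h>

#define TID_STEP 8UL            /* stand-in for roundup_pow_of_two(CONFIG_NR_CPUS) */

struct pcpu_slot {              /* analogue of the freelist/tid pair in kmem_cache_cpu */
        void *freelist;
        unsigned long tid;
};

static _Atomic struct pcpu_slot slot;   /* one "per cpu" slot for this sketch */

/* Pop the first object off the freelist, following the slab_alloc fastpath:
 * snapshot (freelist, tid), then atomically install (next, tid + TID_STEP)
 * only if nothing touched the slot in between; otherwise redo. */
static void *fast_alloc(void)
{
        struct pcpu_slot old, new;

        do {
                old = atomic_load(&slot);
                if (!old.freelist)
                        return NULL;    /* the real code falls back to __slab_alloc */
                new.freelist = *(void **)old.freelist;  /* next free object */
                new.tid = old.tid + TID_STEP;
        } while (!atomic_compare_exchange_weak(&slot, &old, new));

        return old.freelist;
}

int main(void)
{
        /* A three-entry freelist: objs[0] -> objs[1] -> objs[2] -> NULL. */
        void *objs[3];
        struct pcpu_slot init = { .freelist = &objs[0], .tid = 3 };
        void *obj;

        objs[0] = &objs[1];
        objs[1] = &objs[2];
        objs[2] = NULL;

        atomic_store(&slot, init);
        obj = fast_alloc();
        printf("allocated %p, tid is now %lu\n", obj, atomic_load(&slot).tid);
        return 0;
}

The kernel version gains over this sketch by never needing to disable preemption or interrupts on the fastpath: the tid check makes it harmless to be migrated between reading the per cpu pointer and performing the cmpxchg.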