author     Christoph Lameter <cl@linux.com>     2011-02-25 12:38:54 -0500
committer  Pekka Enberg <penberg@kernel.org>    2011-03-11 10:42:49 -0500
commit     8a5ec0ba42c4919e2d8f4c3138cc8b987fdb0b79
tree       d20d4eeb63351e3bd76b7957fa434f2b9f85ec14 /mm/slub.c
parent     d3f661d69a486db0e0e6343b452f45d91b4b3656
Lockless (and preemptless) fastpaths for slub
Use the this_cpu_cmpxchg_double functionality to implement a lockless
allocation algorithm on arches that support fast this_cpu_ops.

Each of the per cpu pointers is paired with a transaction id that ensures
that updates of the per cpu information can only occur in sequence on a
certain cpu. A transaction id is a "long" integer that is comprised of an
event number and the cpu number. The event number is incremented for every
change to the per cpu state. The cmpxchg instruction can therefore verify,
for an update, that nothing interfered, that we are updating the percpu
structure of the processor where we picked up the information, and that we
are still on that processor when we perform the update.

This results in a significant decrease of the overhead in the fastpaths. It
also makes it easy to adopt the fast path for realtime kernels, since the
path is lockless and does not require that the same per cpu area be used
over the whole critical section; it only matters that the per cpu area is
current at the beginning of the critical section and at the end. So there
is no need even to disable preemption.

Test results show that the fastpath cycle count is reduced by up to ~40%
(the alloc/free test goes from ~140 cycles down to ~80). The slowpath for
kfree adds a few cycles.

Sadly this does nothing for the slowpath, which is where the main
performance issues in slub are, but the best case performance rises
significantly. (For the slowpath see the more complex slub patches that
require cmpxchg_double.)

Kmalloc: alloc/free test

Before:
10000 times kmalloc(8)/kfree     -> 134 cycles
10000 times kmalloc(16)/kfree    -> 152 cycles
10000 times kmalloc(32)/kfree    -> 144 cycles
10000 times kmalloc(64)/kfree    -> 142 cycles
10000 times kmalloc(128)/kfree   -> 142 cycles
10000 times kmalloc(256)/kfree   -> 132 cycles
10000 times kmalloc(512)/kfree   -> 132 cycles
10000 times kmalloc(1024)/kfree  -> 135 cycles
10000 times kmalloc(2048)/kfree  -> 135 cycles
10000 times kmalloc(4096)/kfree  -> 135 cycles
10000 times kmalloc(8192)/kfree  -> 144 cycles
10000 times kmalloc(16384)/kfree -> 754 cycles

After:
10000 times kmalloc(8)/kfree     ->  78 cycles
10000 times kmalloc(16)/kfree    ->  78 cycles
10000 times kmalloc(32)/kfree    ->  82 cycles
10000 times kmalloc(64)/kfree    ->  88 cycles
10000 times kmalloc(128)/kfree   ->  79 cycles
10000 times kmalloc(256)/kfree   ->  79 cycles
10000 times kmalloc(512)/kfree   ->  85 cycles
10000 times kmalloc(1024)/kfree  ->  82 cycles
10000 times kmalloc(2048)/kfree  ->  82 cycles
10000 times kmalloc(4096)/kfree  ->  85 cycles
10000 times kmalloc(8192)/kfree  ->  82 cycles
10000 times kmalloc(16384)/kfree -> 706 cycles

Kmalloc: Repeatedly allocate then free test

Before:
10000 times kmalloc(8)     ->  211 cycles  kfree ->  113 cycles
10000 times kmalloc(16)    ->  174 cycles  kfree ->  115 cycles
10000 times kmalloc(32)    ->  235 cycles  kfree ->  129 cycles
10000 times kmalloc(64)    ->  222 cycles  kfree ->  120 cycles
10000 times kmalloc(128)   ->  343 cycles  kfree ->  139 cycles
10000 times kmalloc(256)   ->  827 cycles  kfree ->  147 cycles
10000 times kmalloc(512)   -> 1048 cycles  kfree ->  272 cycles
10000 times kmalloc(1024)  -> 2043 cycles  kfree ->  528 cycles
10000 times kmalloc(2048)  -> 4002 cycles  kfree ->  571 cycles
10000 times kmalloc(4096)  -> 7740 cycles  kfree ->  628 cycles
10000 times kmalloc(8192)  -> 8062 cycles  kfree ->  850 cycles
10000 times kmalloc(16384) -> 8895 cycles  kfree -> 1249 cycles

After:
10000 times kmalloc(8)     ->  190 cycles  kfree ->  129 cycles
10000 times kmalloc(16)    ->   76 cycles  kfree ->  123 cycles
10000 times kmalloc(32)    ->  126 cycles  kfree ->  124 cycles
10000 times kmalloc(64)    ->  181 cycles  kfree ->  128 cycles
10000 times kmalloc(128)   ->  310 cycles  kfree ->  140 cycles
10000 times kmalloc(256)   ->  809 cycles  kfree ->  165 cycles
10000 times kmalloc(512)   -> 1005 cycles  kfree ->  269 cycles
10000 times kmalloc(1024)  -> 1999 cycles  kfree ->  527 cycles
10000 times kmalloc(2048)  -> 3967 cycles  kfree ->  570 cycles
10000 times kmalloc(4096)  -> 7658 cycles  kfree ->  637 cycles
10000 times kmalloc(8192)  -> 8111 cycles  kfree ->  859 cycles
10000 times kmalloc(16384) -> 8791 cycles  kfree -> 1173 cycles

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
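The transaction id scheme described above is small enough to sketch outside the kernel. The following is a minimal user-space illustration, not the kernel code: TID_STEP is hard-coded to 8 as a stand-in for roundup_pow_of_two(CONFIG_NR_CPUS), and the helpers mirror the next_tid()/tid_to_cpu()/tid_to_event() functions added by this patch.

#include <stdio.h>

/* Stand-in for roundup_pow_of_two(CONFIG_NR_CPUS); assume an 8-cpu box. */
#define TID_STEP 8UL

/* A tid starts at the cpu number and advances by TID_STEP per event, so the
 * low bits always identify the cpu and the high bits count the events. */
static unsigned long init_tid(int cpu)               { return cpu; }
static unsigned long next_tid(unsigned long tid)     { return tid + TID_STEP; }
static unsigned long tid_to_cpu(unsigned long tid)   { return tid % TID_STEP; }
static unsigned long tid_to_event(unsigned long tid) { return tid / TID_STEP; }

int main(void)
{
        unsigned long tid = init_tid(3);        /* per cpu state of cpu 3 */

        tid = next_tid(next_tid(tid));          /* two operations on that cpu */
        printf("cpu=%lu event=%lu\n", tid_to_cpu(tid), tid_to_event(tid));
        /* Prints "cpu=3 event=2": a tid captured on another cpu, or before an
         * earlier operation completed, can never compare equal later on. */
        return 0;
}

Because the cpu number lives in the low bits and the event count in the high bits, a single compare on the tid simultaneously checks "same cpu" and "nothing happened in between", which is exactly what the fastpaths in the diff below rely on.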
Diffstat (limited to 'mm/slub.c')
-rw-r--r--  mm/slub.c  205
1 file changed, 203 insertions(+), 2 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index bae7a5c636f4..65030c7fd7e2 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1494,6 +1494,77 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
         }
 }
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+#ifdef CONFIG_PREEMPT
+/*
+ * Calculate the next globally unique transaction for disambiguation
+ * during cmpxchg. The transactions start with the cpu number and are then
+ * incremented by CONFIG_NR_CPUS.
+ */
+#define TID_STEP roundup_pow_of_two(CONFIG_NR_CPUS)
+#else
+/*
+ * No preemption supported therefore also no need to check for
+ * different cpus.
+ */
+#define TID_STEP 1
+#endif
+
+static inline unsigned long next_tid(unsigned long tid)
+{
+        return tid + TID_STEP;
+}
+
+static inline unsigned int tid_to_cpu(unsigned long tid)
+{
+        return tid % TID_STEP;
+}
+
+static inline unsigned long tid_to_event(unsigned long tid)
+{
+        return tid / TID_STEP;
+}
+
+static inline unsigned int init_tid(int cpu)
+{
+        return cpu;
+}
+
+static inline void note_cmpxchg_failure(const char *n,
+                const struct kmem_cache *s, unsigned long tid)
+{
+#ifdef SLUB_DEBUG_CMPXCHG
+        unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid);
+
+        printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name);
+
+#ifdef CONFIG_PREEMPT
+        if (tid_to_cpu(tid) != tid_to_cpu(actual_tid))
+                printk("due to cpu change %d -> %d\n",
+                        tid_to_cpu(tid), tid_to_cpu(actual_tid));
+        else
+#endif
+        if (tid_to_event(tid) != tid_to_event(actual_tid))
+                printk("due to cpu running other code. Event %ld->%ld\n",
+                        tid_to_event(tid), tid_to_event(actual_tid));
+        else
+                printk("for unknown reason: actual=%lx was=%lx target=%lx\n",
+                        actual_tid, tid, next_tid(tid));
+#endif
+}
+
+#endif
+
+void init_kmem_cache_cpus(struct kmem_cache *s)
+{
+#if defined(CONFIG_CMPXCHG_LOCAL) && defined(CONFIG_PREEMPT)
+        int cpu;
+
+        for_each_possible_cpu(cpu)
+                per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
+#endif
+
+}
 /*
  * Remove the cpu slab
  */
@@ -1525,6 +1596,9 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
                 page->inuse--;
         }
         c->page = NULL;
+#ifdef CONFIG_CMPXCHG_LOCAL
+        c->tid = next_tid(c->tid);
+#endif
         unfreeze_slab(s, page, tail);
 }
 
@@ -1659,6 +1733,19 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
 {
         void **object;
         struct page *new;
+#ifdef CONFIG_CMPXCHG_LOCAL
+        unsigned long flags;
+
+        local_irq_save(flags);
+#ifdef CONFIG_PREEMPT
+        /*
+         * We may have been preempted and rescheduled on a different
+         * cpu before disabling interrupts. Need to reload cpu area
+         * pointer.
+         */
+        c = this_cpu_ptr(s->cpu_slab);
+#endif
+#endif
 
         /* We handle __GFP_ZERO in the caller */
         gfpflags &= ~__GFP_ZERO;
@@ -1685,6 +1772,10 @@ load_freelist:
         c->node = page_to_nid(c->page);
 unlock_out:
         slab_unlock(c->page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+        c->tid = next_tid(c->tid);
+        local_irq_restore(flags);
+#endif
         stat(s, ALLOC_SLOWPATH);
         return object;
 
@@ -1746,23 +1837,76 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
 {
         void **object;
         struct kmem_cache_cpu *c;
+#ifdef CONFIG_CMPXCHG_LOCAL
+        unsigned long tid;
+#else
         unsigned long flags;
+#endif
 
         if (slab_pre_alloc_hook(s, gfpflags))
                 return NULL;
 
+#ifndef CONFIG_CMPXCHG_LOCAL
         local_irq_save(flags);
+#else
+redo:
+#endif
+
+        /*
+         * Must read kmem_cache cpu data via this cpu ptr. Preemption is
+         * enabled. We may switch back and forth between cpus while
+         * reading from one cpu area. That does not matter as long
+         * as we end up on the original cpu again when doing the cmpxchg.
+         */
         c = __this_cpu_ptr(s->cpu_slab);
+
+#ifdef CONFIG_CMPXCHG_LOCAL
+        /*
+         * The transaction ids are globally unique per cpu and per operation on
+         * a per cpu queue. Thus they guarantee that the cmpxchg_double
+         * occurs on the right processor and that there was no operation on the
+         * linked list in between.
+         */
+        tid = c->tid;
+        barrier();
+#endif
+
         object = c->freelist;
         if (unlikely(!object || !node_match(c, node)))
 
                 object = __slab_alloc(s, gfpflags, node, addr, c);
 
         else {
+#ifdef CONFIG_CMPXCHG_LOCAL
+                /*
+                 * The cmpxchg will only match if there was no additional
+                 * operation and if we are on the right processor.
+                 *
+                 * The cmpxchg does the following atomically (without lock semantics!)
+                 * 1. Relocate first pointer to the current per cpu area.
+                 * 2. Verify that tid and freelist have not been changed
+                 * 3. If they were not changed replace tid and freelist
+                 *
+                 * Since this is without lock semantics the protection is only against
+                 * code executing on this cpu *not* from access by other cpus.
+                 */
+                if (unlikely(!this_cpu_cmpxchg_double(
+                                s->cpu_slab->freelist, s->cpu_slab->tid,
+                                object, tid,
+                                get_freepointer(s, object), next_tid(tid)))) {
+
+                        note_cmpxchg_failure("slab_alloc", s, tid);
+                        goto redo;
+                }
+#else
                 c->freelist = get_freepointer(s, object);
+#endif
                 stat(s, ALLOC_FASTPATH);
         }
+
+#ifndef CONFIG_CMPXCHG_LOCAL
         local_irq_restore(flags);
+#endif
 
         if (unlikely(gfpflags & __GFP_ZERO) && object)
                 memset(object, 0, s->objsize);
@@ -1840,9 +1984,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 {
         void *prior;
         void **object = (void *)x;
+#ifdef CONFIG_CMPXCHG_LOCAL
+        unsigned long flags;
 
-        stat(s, FREE_SLOWPATH);
+        local_irq_save(flags);
+#endif
         slab_lock(page);
+        stat(s, FREE_SLOWPATH);
 
         if (kmem_cache_debug(s))
                 goto debug;
@@ -1872,6 +2020,9 @@ checks_ok:
 
 out_unlock:
         slab_unlock(page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+        local_irq_restore(flags);
+#endif
         return;
 
 slab_empty:
@@ -1883,6 +2034,9 @@ slab_empty:
                 stat(s, FREE_REMOVE_PARTIAL);
         }
         slab_unlock(page);
+#ifdef CONFIG_CMPXCHG_LOCAL
+        local_irq_restore(flags);
+#endif
         stat(s, FREE_SLAB);
         discard_slab(s, page);
         return;
@@ -1909,21 +2063,54 @@ static __always_inline void slab_free(struct kmem_cache *s,
 {
         void **object = (void *)x;
         struct kmem_cache_cpu *c;
+#ifdef CONFIG_CMPXCHG_LOCAL
+        unsigned long tid;
+#else
         unsigned long flags;
+#endif
 
         slab_free_hook(s, x);
 
+#ifndef CONFIG_CMPXCHG_LOCAL
         local_irq_save(flags);
+#endif
+
+redo:
+        /*
+         * Determine the current cpu's per cpu slab.
+         * The cpu may change afterward. However that does not matter since
+         * data is retrieved via this pointer. If we are on the same cpu
+         * during the cmpxchg then the free will succeed.
+         */
         c = __this_cpu_ptr(s->cpu_slab);
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+        tid = c->tid;
+        barrier();
+#endif
+
         if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
                 set_freepointer(s, object, c->freelist);
+
+#ifdef CONFIG_CMPXCHG_LOCAL
+                if (unlikely(!this_cpu_cmpxchg_double(
+                                s->cpu_slab->freelist, s->cpu_slab->tid,
+                                c->freelist, tid,
+                                object, next_tid(tid)))) {
+
+                        note_cmpxchg_failure("slab_free", s, tid);
+                        goto redo;
+                }
+#else
                 c->freelist = object;
+#endif
                 stat(s, FREE_FASTPATH);
         } else
                 __slab_free(s, page, x, addr);
 
+#ifndef CONFIG_CMPXCHG_LOCAL
         local_irq_restore(flags);
+#endif
 }
 
 void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -2115,9 +2302,23 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
         BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
                         SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
 
+#ifdef CONFIG_CMPXCHG_LOCAL
+        /*
+         * Must align to double word boundary for the double cmpxchg instructions
+         * to work.
+         */
+        s->cpu_slab = __alloc_percpu(sizeof(struct kmem_cache_cpu), 2 * sizeof(void *));
+#else
+        /* Regular alignment is sufficient */
         s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
+#endif
+
+        if (!s->cpu_slab)
+                return 0;
+
+        init_kmem_cache_cpus(s);
 
-        return s->cpu_slab != NULL;
+        return 1;
 }
 
 static struct kmem_cache *kmem_cache_node;
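As a rough aid to reading the fastpath hunks above, here is a hedged user-space analogue of the redo loop. It only illustrates the (freelist, tid) pairing and the retry-on-failure logic, not the kernel implementation: a C11 atomic struct stands in for this_cpu_cmpxchg_double (the compare-and-swap on a two-word struct may be implemented with a lock by the compiler or libatomic, whereas the kernel relies on a true double-width cmpxchg plus per cpu addressing), and TID_STEP is again a hard-coded stand-in.

#include <stdatomic.h>
#include <stdio.h>

#define TID_STEP 8UL            /* stand-in for roundup_pow_of_two(CONFIG_NR_CPUS) */

struct pcpu_slot {              /* analogue of the freelist/tid pair in kmem_cache_cpu */
        void *freelist;
        unsigned long tid;
};

static _Atomic struct pcpu_slot slot;   /* one "per cpu" slot for this sketch */

/* Pop the first object off the freelist, following the slab_alloc fastpath:
 * snapshot (freelist, tid), then atomically install (next, tid + TID_STEP)
 * only if nothing touched the slot in between; otherwise redo. */
static void *fast_alloc(void)
{
        struct pcpu_slot old, new;

        do {
                old = atomic_load(&slot);
                if (!old.freelist)
                        return NULL;    /* the real code falls back to __slab_alloc */
                new.freelist = *(void **)old.freelist;  /* next free object */
                new.tid = old.tid + TID_STEP;
        } while (!atomic_compare_exchange_weak(&slot, &old, new));

        return old.freelist;
}

int main(void)
{
        /* A three-entry freelist: objs[0] -> objs[1] -> objs[2] -> NULL. */
        void *objs[3];
        struct pcpu_slot init = { .freelist = &objs[0], .tid = 3 };
        void *obj;

        objs[0] = &objs[1];
        objs[1] = &objs[2];
        objs[2] = NULL;

        atomic_store(&slot, init);
        obj = fast_alloc();
        printf("allocated %p, tid is now %lu\n", obj, atomic_load(&slot).tid);
        return 0;
}

The kernel version gains over this sketch by never needing to disable preemption or interrupts on the fastpath: the tid check makes it harmless to be migrated between reading the per cpu pointer and performing the cmpxchg.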