author     Jesper Dangaard Brouer <brouer@redhat.com>       2015-09-04 18:45:37 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2015-09-04 19:54:41 -0400
commit     994eb764ec5ad57c9b7c5e72b892205039a84b69 (patch)
tree       5db4d9612a61d1ae87e31655084bac090a96dca2 /mm/slub.c
parent     484748f0b65a1950b2b93f444a2287e8dd2cedd6 (diff)
slub bulk alloc: extract objects from the per cpu slab
First piece: acceleration of retrieval of per cpu objects

If we are allocating lots of objects then it is advantageous to disable
interrupts and avoid the this_cpu_cmpxchg() operation to get these objects
faster.

Note that we cannot do the fast operation if debugging is enabled, because
we would have to add extra code to do all the debugging checks.  And it
would not be fast anyway.

Note also that the requirement of having interrupts disabled avoids having
to do processor flag operations.

Allocate as many objects as possible in the fast way and then fall back to
the generic implementation for the rest of the objects.

Measurements on CPU i7-4790K @ 4.00GHz
Baseline normal fastpath (alloc+free cost): 42 cycles(tsc) 10.554 ns

Bulk - fallback                   - this-patch
   1 -  57 cycles(tsc) 14.432 ns  -  48 cycles(tsc) 12.155 ns  improved 15.8%
   2 -  50 cycles(tsc) 12.746 ns  -  37 cycles(tsc)  9.390 ns  improved 26.0%
   3 -  48 cycles(tsc) 12.180 ns  -  33 cycles(tsc)  8.417 ns  improved 31.2%
   4 -  48 cycles(tsc) 12.015 ns  -  32 cycles(tsc)  8.045 ns  improved 33.3%
   8 -  46 cycles(tsc) 11.526 ns  -  30 cycles(tsc)  7.699 ns  improved 34.8%
  16 -  45 cycles(tsc) 11.418 ns  -  32 cycles(tsc)  8.205 ns  improved 28.9%
  30 -  80 cycles(tsc) 20.246 ns  -  73 cycles(tsc) 18.328 ns  improved  8.8%
  32 -  79 cycles(tsc) 19.946 ns  -  72 cycles(tsc) 18.208 ns  improved  8.9%
  34 -  78 cycles(tsc) 19.659 ns  -  71 cycles(tsc) 17.987 ns  improved  9.0%
  48 -  86 cycles(tsc) 21.516 ns  -  82 cycles(tsc) 20.566 ns  improved  4.7%
  64 -  93 cycles(tsc) 23.423 ns  -  89 cycles(tsc) 22.480 ns  improved  4.3%
 128 - 100 cycles(tsc) 25.170 ns  -  99 cycles(tsc) 24.871 ns  improved  1.0%
 158 - 102 cycles(tsc) 25.549 ns  - 101 cycles(tsc) 25.375 ns  improved  1.0%
 250 - 101 cycles(tsc) 25.344 ns  - 100 cycles(tsc) 25.182 ns  improved  1.0%

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
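As a quick illustration of the API this series accelerates, here is a minimal
caller-side sketch (not part of the patch) showing how kernel code might use
kmem_cache_alloc_bulk()/kmem_cache_free_bulk() as they exist at this point in
the series: boolean return, and on failure the partial batch is freed before
false is returned. The cache name "bulk_demo", the 256-byte object size and
DEMO_BATCH are made-up values for the example only.

#include <linux/slab.h>
#include <linux/errno.h>

#define DEMO_BATCH 16    /* arbitrary batch size for the example */

static int demo_bulk_use(void)
{
        struct kmem_cache *cachep;
        void *objs[DEMO_BATCH];

        cachep = kmem_cache_create("bulk_demo", 256, 0, 0, NULL);
        if (!cachep)
                return -ENOMEM;

        /* Interrupts must be enabled here, per the comments added below. */
        if (kmem_cache_alloc_bulk(cachep, GFP_KERNEL, DEMO_BATCH, objs)) {
                /* ... use objs[0..DEMO_BATCH-1] ... */
                kmem_cache_free_bulk(cachep, DEMO_BATCH, objs);
        }

        kmem_cache_destroy(cachep);
        return 0;
}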
Diffstat (limited to 'mm/slub.c')
-rw-r--r--  mm/slub.c | 49
1 file changed, 47 insertions(+), 2 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index 3ca89ef9b7b0..30e7dedec664 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2750,16 +2750,61 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
+/* Note that interrupts must be enabled when calling this function. */
 void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p)
 {
 	__kmem_cache_free_bulk(s, size, p);
 }
 EXPORT_SYMBOL(kmem_cache_free_bulk);
 
+/* Note that interrupts must be enabled when calling this function. */
 bool kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
 							void **p)
 {
-	return __kmem_cache_alloc_bulk(s, flags, size, p);
+	struct kmem_cache_cpu *c;
+	int i;
+
+	/* Debugging fallback to generic bulk */
+	if (kmem_cache_debug(s))
+		return __kmem_cache_alloc_bulk(s, flags, size, p);
+
+	/*
+	 * Drain objects in the per cpu slab, while disabling local
+	 * IRQs, which protects against PREEMPT and interrupts
+	 * handlers invoking normal fastpath.
+	 */
+	local_irq_disable();
+	c = this_cpu_ptr(s->cpu_slab);
+
+	for (i = 0; i < size; i++) {
+		void *object = c->freelist;
+
+		if (!object)
+			break;
+
+		c->freelist = get_freepointer(s, object);
+		p[i] = object;
+	}
+	c->tid = next_tid(c->tid);
+	local_irq_enable();
+
+	/* Clear memory outside IRQ disabled fastpath loop */
+	if (unlikely(flags & __GFP_ZERO)) {
+		int j;
+
+		for (j = 0; j < i; j++)
+			memset(p[j], 0, s->object_size);
+	}
+
+	/* Fallback to single elem alloc */
+	for (; i < size; i++) {
+		void *x = p[i] = kmem_cache_alloc(s, flags);
+		if (unlikely(!x)) {
+			__kmem_cache_free_bulk(s, i, p);
+			return false;
+		}
+	}
+	return true;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_bulk);
 
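The fastpath added above amounts to popping objects off a singly linked
freelist whose "next" pointer is stored inside each free object. A minimal
user-space analogue, purely illustrative: real SLUB keeps the free pointer at
s->offset (read via get_freepointer()) and guards the walk with
local_irq_disable() and next_tid(), none of which is modelled here. The next
pointer is assumed to live at offset 0 of each free object for simplicity.

#include <stddef.h>

struct freelist {
	void *head;	/* first free object, or NULL when empty */
};

/* Pop one object off the list head; O(1), no atomics needed. */
static void *freelist_pop(struct freelist *fl)
{
	void *object = fl->head;

	if (object)
		fl->head = *(void **)object;	/* next pointer stored in the object */
	return object;
}

/*
 * Drain up to 'size' objects into p[]; returns how many were taken so
 * the caller can fall back to a slower allocation path for the rest,
 * mirroring the patch's "fallback to single elem alloc" loop.
 */
static size_t freelist_pop_bulk(struct freelist *fl, size_t size, void **p)
{
	size_t i;

	for (i = 0; i < size; i++) {
		void *object = freelist_pop(fl);

		if (!object)
			break;
		p[i] = object;
	}
	return i;
}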