author    Eric Dumazet <eric.dumazet@gmail.com>  2011-12-16 10:25:34 -0500
committer Pekka Enberg <penberg@kernel.org>      2012-01-24 14:53:57 -0500
commit    0ad9500e16fe24aa55809a2b00e0d2d0e658fc71 (patch)
tree      a448d51a669d475296c36893b0078a9d553c5b65 /mm
parent    42c8c99cd891184bf4bcf6f09d62c54e42599453 (diff)
slub: prefetch next freelist pointer in slab_alloc()
Recycling a page is a problem: the freelist link chain is hot on the cpu(s) that freed the objects, and possibly very cold on the cpu currently owning the slab.

Adding a prefetch of the cache line containing the pointer to the next object in slab_alloc() helps a lot in many workloads, in particular asymmetric ones (allocations done on one cpu, frees on other cpus).

Added cost is three machine instructions only.

Examples on my dual socket quad core ht machine (Intel CPU E5540 @2.53GHz) (16 logical cpus, 2 memory nodes), 64bit kernel.

Before patch :

# perf stat -r 32 hackbench 50 process 4000 >/dev/null

 Performance counter stats for 'hackbench 50 process 4000' (32 runs):

     327577,471718 task-clock                #   15,821 CPUs utilized            ( +-  0,64% )
        28 866 491 context-switches          #    0,088 M/sec                    ( +-  1,80% )
         1 506 929 CPU-migrations            #    0,005 M/sec                    ( +-  3,24% )
           127 151 page-faults               #    0,000 M/sec                    ( +-  0,16% )
   829 399 813 448 cycles                    #    2,532 GHz                      ( +-  0,64% )
   580 664 691 740 stalled-cycles-frontend   #   70,01% frontend cycles idle     ( +-  0,71% )
   197 431 700 448 stalled-cycles-backend    #   23,80% backend cycles idle      ( +-  1,03% )
   503 548 648 975 instructions              #    0,61  insns per cycle
                                             #    1,15  stalled cycles per insn  ( +-  0,46% )
    95 780 068 471 branches                  #  292,389 M/sec                    ( +-  0,48% )
     1 426 407 916 branch-misses             #    1,49% of all branches          ( +-  1,35% )

      20,705679994 seconds time elapsed                                          ( +-  0,64% )

After patch :

# perf stat -r 32 hackbench 50 process 4000 >/dev/null

 Performance counter stats for 'hackbench 50 process 4000' (32 runs):

     286236,542804 task-clock                #   15,786 CPUs utilized            ( +-  1,32% )
        19 703 372 context-switches          #    0,069 M/sec                    ( +-  4,99% )
         1 658 249 CPU-migrations            #    0,006 M/sec                    ( +-  6,62% )
           126 776 page-faults               #    0,000 M/sec                    ( +-  0,12% )
   724 636 593 213 cycles                    #    2,532 GHz                      ( +-  1,32% )
   499 320 714 837 stalled-cycles-frontend   #   68,91% frontend cycles idle     ( +-  1,47% )
   156 555 126 809 stalled-cycles-backend    #   21,60% backend cycles idle      ( +-  2,22% )
   463 897 792 661 instructions              #    0,64  insns per cycle
                                             #    1,08  stalled cycles per insn  ( +-  0,94% )
    87 717 352 563 branches                  #  306,451 M/sec                    ( +-  0,99% )
       941 738 280 branch-misses             #    1,07% of all branches          ( +-  3,35% )

      18,132070670 seconds time elapsed                                          ( +-  1,30% )

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Acked-by: Christoph Lameter <cl@linux.com>
CC: Matt Mackall <mpm@selenic.com>
CC: David Rientjes <rientjes@google.com>
CC: "Alex,Shi" <alex.shi@intel.com>
CC: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
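To make the idea concrete outside the kernel, here is a minimal user-space sketch of the same technique, under stated assumptions: free_list_pop() and struct free_obj are hypothetical names, the next-pointer is embedded at the start of each free object (SLUB keeps it at s->offset inside the object), and the compiler builtin __builtin_prefetch() stands in for the kernel's prefetch() helper.

#include <stddef.h>

/*
 * Hypothetical free object: the first word links to the next free object,
 * mirroring how SLUB keeps the freelist pointer inside the free object.
 */
struct free_obj {
	struct free_obj *next;
	/* payload is reused once the object is handed out */
};

/*
 * Pop one object off a singly linked free list.  The list head is hot on
 * this cpu, but the popped object's link may sit in a cache line last
 * written by whichever cpu freed it.  Prefetching the new head's link
 * field lets that likely miss overlap with the caller's work before the
 * next pop.
 */
static void *free_list_pop(struct free_obj **head)
{
	struct free_obj *obj = *head;

	if (!obj)
		return NULL;

	*head = obj->next;
	if (obj->next)
		__builtin_prefetch(&obj->next->next);	/* read prefetch of next link */

	return obj;
}

By the time the next allocation walks to the new head, its link is hopefully already in cache; that is the effect the prefetch_freepointer() call added after the successful cmpxchg in the hunks below aims for on the SLUB fast path.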
Diffstat (limited to 'mm')
-rw-r--r--  mm/slub.c  10
1 file changed, 9 insertions(+), 1 deletion(-)
diff --git a/mm/slub.c b/mm/slub.c
index 4907563ef7ff..5b915e86a9b0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -269,6 +269,11 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
 	return *(void **)(object + s->offset);
 }
 
+static void prefetch_freepointer(const struct kmem_cache *s, void *object)
+{
+	prefetch(object + s->offset);
+}
+
 static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
 {
 	void *p;
@@ -2309,6 +2314,8 @@ redo:
 		object = __slab_alloc(s, gfpflags, node, addr, c);
 
 	else {
+		void *next_object = get_freepointer_safe(s, object);
+
 		/*
 		 * The cmpxchg will only match if there was no additional
 		 * operation and if we are on the right processor.
@@ -2324,11 +2331,12 @@ redo:
 		if (unlikely(!this_cpu_cmpxchg_double(
 				s->cpu_slab->freelist, s->cpu_slab->tid,
 				object, tid,
-				get_freepointer_safe(s, object), next_tid(tid)))) {
+				next_object, next_tid(tid)))) {
 
 			note_cmpxchg_failure("slab_alloc", s, tid);
 			goto redo;
 		}
+		prefetch_freepointer(s, next_object);
 		stat(s, ALLOC_FASTPATH);
 	}
 