author	Christoph Lameter <clameter@sgi.com>	2007-05-10 06:15:16 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-05-10 12:26:52 -0400
commit	894b8788d7f265eb7c6f75a9a77cedeb48f51586 (patch)
tree	4b00fa4704090876895b8a7528c6fe5e2201fc28
parent	02b67325a6d34f2ae67484a8802b6ffc9ce9931d (diff)
slub: support concurrent local and remote frees and allocs on a slab
Avoid atomic overhead in slab_alloc and slab_free.

SLUB needs to use the slab_lock for the per-cpu slabs to synchronize with potential kfree operations. This patch avoids that need by moving all free objects onto a lockless_freelist. The regular freelist continues to exist and will be used to free objects. So while we consume the lockless_freelist the regular freelist may build up objects.

If we are out of objects on the lockless_freelist then we check the regular freelist. If it has objects then we move those over to the lockless_freelist and do this again. This yields significant savings in the number of atomic operations that have to be performed.

We can even free directly to the lockless_freelist if we know that we are running on the same processor. This speeds up short-lived objects: they may be allocated and freed without taking the slab_lock. This is particularly good for netperf.

In order to maximize the effect of the new, faster hotpath we extract the hottest performance pieces into inlined functions. These are then inlined into kmem_cache_alloc and kmem_cache_free, so hotpath allocation and freeing no longer requires a subroutine call within SLUB.

[I am not sure that it is worth doing this because it changes the easy-to-read structure of slub just to reduce atomic ops. However, there is someone out there with a benchmark on 4-way and 8-way processor systems that seems to show a 5% regression vs. SLAB. It seems that the regression is due to increased use of atomic operations in SLUB vs. SLAB. I wonder if this is applicable or discernible at all in a real workload?]

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
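The scheme can be pictured with a small standalone sketch (this is not the kernel code in this patch): the owning CPU allocates and frees through a lockless list, remote frees go onto the locked regular freelist, and an empty lockless list is refilled by adopting the regular freelist in a single locked step. All names here (toy_slab, toy_alloc, toy_free_local, toy_free_remote) are invented for illustration, and a pthread mutex stands in for the slab_lock bit and the interrupt disabling that the real fastpath relies on.

/*
 * Minimal userspace sketch of the two-freelist idea described above.
 * NOT the kernel code: types, names and locking are invented.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_slab {
	void *lockless_freelist;  /* consumed by the owning CPU, no lock */
	void *freelist;           /* filled by remote frees, needs the lock */
	pthread_mutex_t lock;     /* stand-in for the slab_lock bit */
};

/* Each free object stores the pointer to the next free object inside itself. */
static void *next_of(void *object) { return *(void **)object; }
static void set_next(void *object, void *next) { *(void **)object = next; }

/* Slow path: take the lock and adopt the regular freelist wholesale. */
static void *toy_alloc_slow(struct toy_slab *s)
{
	void *object;

	pthread_mutex_lock(&s->lock);
	object = s->freelist;
	if (object) {
		/* First object is returned, the rest becomes the lockless list. */
		s->lockless_freelist = next_of(object);
		s->freelist = NULL;
	}
	pthread_mutex_unlock(&s->lock);
	return object;	/* NULL would mean "get a partial or new slab" in SLUB */
}

/* Fast path: pop from the lockless freelist, no lock and no atomics. */
static void *toy_alloc(struct toy_slab *s)
{
	void *object = s->lockless_freelist;

	if (!object)
		return toy_alloc_slow(s);
	s->lockless_freelist = next_of(object);
	return object;
}

/* Local free (same "CPU"): push onto the lockless freelist. */
static void toy_free_local(struct toy_slab *s, void *object)
{
	set_next(object, s->lockless_freelist);
	s->lockless_freelist = object;
}

/* Remote free: must go through the locked regular freelist. */
static void toy_free_remote(struct toy_slab *s, void *object)
{
	pthread_mutex_lock(&s->lock);
	set_next(object, s->freelist);
	s->freelist = object;
	pthread_mutex_unlock(&s->lock);
}

int main(void)
{
	struct toy_slab s = { NULL, NULL };
	void *objs[4];

	pthread_mutex_init(&s.lock, NULL);

	/* Seed the locked freelist as if remote CPUs had freed four objects. */
	for (int i = 0; i < 4; i++)
		toy_free_remote(&s, malloc(sizeof(void *)));

	/* The first allocation takes the slow path, the rest stay lockless. */
	for (int i = 0; i < 4; i++)
		objs[i] = toy_alloc(&s);
	toy_free_local(&s, objs[0]);	/* short-lived object, no lock taken */
	printf("reallocated: %p\n", toy_alloc(&s));
	return 0;
}

Compared with a single locked freelist, the common case (allocate and free on the same CPU) touches no lock at all, which is where the atomic-operation savings described in the changelog come from.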
-rw-r--r--	include/linux/mm_types.h	  7
-rw-r--r--	mm/slub.c			154
2 files changed, 123 insertions(+), 38 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e30687bad075..d5bb1796e12b 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -50,13 +50,16 @@ struct page {
 		spinlock_t ptl;
 #endif
 		struct {			/* SLUB uses */
-			struct page *first_page;	/* Compound pages */
+			void **lockless_freelist;
 			struct kmem_cache *slab;	/* Pointer to slab */
 		};
+		struct {
+			struct page *first_page;	/* Compound pages */
+		};
 	};
 	union {
 		pgoff_t index;		/* Our offset within mapping. */
-		void *freelist;		/* SLUB: pointer to free object */
+		void *freelist;		/* SLUB: freelist req. slab lock */
 	};
 	struct list_head lru;		/* Pageout list, eg. active_list
 					 * protected by zone->lru_lock !
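A note on the struct page change above: lockless_freelist takes over the union slot that first_page previously occupied in the SLUB view, and first_page moves into its own anonymous struct so both interpretations keep overlaying the same storage. The following stripped-down illustration is an assumption-laden toy (toy_page is invented and is not the real struct page, which carries many more fields and unions):

#include <stdio.h>

/* Toy model of the union-of-structs overlay; not the real struct page. */
struct toy_page {
	union {
		struct {			/* SLUB view of the page */
			void **lockless_freelist;
			void *slab;		/* stands in for struct kmem_cache * */
		};
		struct {			/* compound-page view */
			struct toy_page *first_page;
		};
	};
};

int main(void)
{
	struct toy_page p = {0};

	/*
	 * Both views alias the same words of memory; a page is used either
	 * as a cpu slab or as a compound tail page, never both at once.
	 */
	p.lockless_freelist = (void **)0x1000;
	printf("first_page aliases the same slot: %p\n", (void *)p.first_page);
	return 0;
}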
diff --git a/mm/slub.c b/mm/slub.c
index bd2efae02bcd..b07a1cab4f28 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -81,10 +81,14 @@
  * PageActive		The slab is used as a cpu cache. Allocations
  *			may be performed from the slab. The slab is not
  *			on any slab list and cannot be moved onto one.
+ *			The cpu slab may be equipped with an additional
+ *			lockless_freelist that allows lockless access to
+ *			free objects in addition to the regular freelist
+ *			that requires the slab lock.
  *
  * PageError		Slab requires special handling due to debug
  *			options set. This moves slab handling out of
- *			the fast path.
+ *			the fast path and disables lockless freelists.
  */
 
 static inline int SlabDebug(struct page *page)
@@ -1014,6 +1018,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 	set_freepointer(s, last, NULL);
 
 	page->freelist = start;
+	page->lockless_freelist = NULL;
 	page->inuse = 0;
 out:
 	if (flags & __GFP_WAIT)
@@ -1276,6 +1281,23 @@ static void putback_slab(struct kmem_cache *s, struct page *page)
  */
 static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
 {
+	/*
+	 * Merge cpu freelist into freelist. Typically we get here
+	 * because both freelists are empty. So this is unlikely
+	 * to occur.
+	 */
+	while (unlikely(page->lockless_freelist)) {
+		void **object;
+
+		/* Retrieve object from cpu_freelist */
+		object = page->lockless_freelist;
+		page->lockless_freelist = page->lockless_freelist[page->offset];
+
+		/* And put onto the regular freelist */
+		object[page->offset] = page->freelist;
+		page->freelist = object;
+		page->inuse--;
+	}
 	s->cpu_slab[cpu] = NULL;
 	ClearPageActive(page);
 
@@ -1322,47 +1344,46 @@ static void flush_all(struct kmem_cache *s)
 }
 
 /*
- * slab_alloc is optimized to only modify two cachelines on the fast path
- * (aside from the stack):
+ * Slow path. The lockless freelist is empty or we need to perform
+ * debugging duties.
+ *
+ * Interrupts are disabled.
  *
- * 1. The page struct
- * 2. The first cacheline of the object to be allocated.
+ * Processing is still very fast if new objects have been freed to the
+ * regular freelist. In that case we simply take over the regular freelist
+ * as the lockless freelist and zap the regular freelist.
  *
- * The only other cache lines that are read (apart from code) is the
- * per cpu array in the kmem_cache struct.
+ * If that is not working then we fall back to the partial lists. We take the
+ * first element of the freelist as the object to allocate now and move the
+ * rest of the freelist to the lockless freelist.
  *
- * Fastpath is not possible if we need to get a new slab or have
- * debugging enabled (which means all slabs are marked with SlabDebug)
+ * And if we were unable to get a new slab from the partial slab lists then
+ * we need to allocate a new slab. This is the slowest path since we may sleep.
  */
-static void *slab_alloc(struct kmem_cache *s,
-				gfp_t gfpflags, int node, void *addr)
+static void *__slab_alloc(struct kmem_cache *s,
+		gfp_t gfpflags, int node, void *addr, struct page *page)
 {
-	struct page *page;
 	void **object;
-	unsigned long flags;
-	int cpu;
+	int cpu = smp_processor_id();
 
-	local_irq_save(flags);
-	cpu = smp_processor_id();
-	page = s->cpu_slab[cpu];
 	if (!page)
 		goto new_slab;
 
 	slab_lock(page);
 	if (unlikely(node != -1 && page_to_nid(page) != node))
 		goto another_slab;
-redo:
+load_freelist:
 	object = page->freelist;
 	if (unlikely(!object))
 		goto another_slab;
 	if (unlikely(SlabDebug(page)))
 		goto debug;
 
-have_object:
-	page->inuse++;
-	page->freelist = object[page->offset];
+	object = page->freelist;
+	page->lockless_freelist = object[page->offset];
+	page->inuse = s->objects;
+	page->freelist = NULL;
 	slab_unlock(page);
-	local_irq_restore(flags);
 	return object;
 
 another_slab:
@@ -1370,11 +1391,11 @@ another_slab:
 
 new_slab:
 	page = get_partial(s, gfpflags, node);
-	if (likely(page)) {
+	if (page) {
 have_slab:
 		s->cpu_slab[cpu] = page;
 		SetPageActive(page);
-		goto redo;
+		goto load_freelist;
 	}
 
 	page = new_slab(s, gfpflags, node);
@@ -1397,7 +1418,7 @@ have_slab:
 			discard_slab(s, page);
 			page = s->cpu_slab[cpu];
 			slab_lock(page);
-			goto redo;
+			goto load_freelist;
 		}
 		/* New slab does not fit our expectations */
 		flush_slab(s, s->cpu_slab[cpu], cpu);
@@ -1405,16 +1426,52 @@ have_slab:
 		slab_lock(page);
 		goto have_slab;
 	}
-	local_irq_restore(flags);
 	return NULL;
 debug:
+	object = page->freelist;
 	if (!alloc_object_checks(s, page, object))
 		goto another_slab;
 	if (s->flags & SLAB_STORE_USER)
 		set_track(s, object, TRACK_ALLOC, addr);
 	trace(s, page, object, 1);
 	init_object(s, object, 1);
-	goto have_object;
+
+	page->inuse++;
+	page->freelist = object[page->offset];
+	slab_unlock(page);
+	return object;
+}
+
+/*
+ * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
+ * have the fastpath folded into their functions. So no function call
+ * overhead for requests that can be satisfied on the fastpath.
+ *
+ * The fastpath works by first checking if the lockless freelist can be used.
+ * If not then __slab_alloc is called for slow processing.
+ *
+ * Otherwise we can simply pick the next object from the lockless free list.
+ */
+static void __always_inline *slab_alloc(struct kmem_cache *s,
+		gfp_t gfpflags, int node, void *addr)
+{
+	struct page *page;
+	void **object;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	page = s->cpu_slab[smp_processor_id()];
+	if (unlikely(!page || !page->lockless_freelist ||
+			(node != -1 && page_to_nid(page) != node)))
+
+		object = __slab_alloc(s, gfpflags, node, addr, page);
+
+	else {
+		object = page->lockless_freelist;
+		page->lockless_freelist = object[page->offset];
+	}
+	local_irq_restore(flags);
+	return object;
 }
 
 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
@@ -1432,20 +1489,19 @@ EXPORT_SYMBOL(kmem_cache_alloc_node);
 #endif
 
 /*
- * The fastpath only writes the cacheline of the page struct and the first
- * cacheline of the object.
+ * Slow path handling. This may still be called frequently since objects
+ * have a longer lifetime than the cpu slabs in most processing loads.
  *
- * We read the cpu_slab cacheline to check if the slab is the per cpu
- * slab for this processor.
+ * So we still attempt to reduce cache line usage. Just take the slab
+ * lock and free the item. If there is no additional partial page
+ * handling required then we can return immediately.
  */
-static void slab_free(struct kmem_cache *s, struct page *page,
+static void __slab_free(struct kmem_cache *s, struct page *page,
 					void *x, void *addr)
 {
 	void *prior;
 	void **object = (void *)x;
-	unsigned long flags;
 
-	local_irq_save(flags);
 	slab_lock(page);
 
 	if (unlikely(SlabDebug(page)))
@@ -1475,7 +1531,6 @@ checks_ok:
 
 out_unlock:
 	slab_unlock(page);
-	local_irq_restore(flags);
 	return;
 
 slab_empty:
@@ -1487,7 +1542,6 @@ slab_empty:
 
 	slab_unlock(page);
 	discard_slab(s, page);
-	local_irq_restore(flags);
 	return;
 
 debug:
@@ -1502,6 +1556,34 @@ debug:
 	goto checks_ok;
 }
 
+/*
+ * Fastpath with forced inlining to produce a kfree and kmem_cache_free that
+ * can perform fastpath freeing without additional function calls.
+ *
+ * The fastpath is only possible if we are freeing to the current cpu slab
+ * of this processor. This is typically the case if we have just allocated
+ * the item before.
+ *
+ * If fastpath is not possible then fall back to __slab_free where we deal
+ * with all sorts of special processing.
+ */
+static void __always_inline slab_free(struct kmem_cache *s,
+			struct page *page, void *x, void *addr)
+{
+	void **object = (void *)x;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	if (likely(page == s->cpu_slab[smp_processor_id()] &&
+						!SlabDebug(page))) {
+		object[page->offset] = page->lockless_freelist;
+		page->lockless_freelist = object;
+	} else
+		__slab_free(s, page, x, addr);
+
+	local_irq_restore(flags);
+}
+
 void kmem_cache_free(struct kmem_cache *s, void *x)
 {
 	struct page *page;