author    | Christoph Lameter <cl@linux.com>  | 2011-06-01 13:25:53 -0400
committer | Pekka Enberg <penberg@kernel.org> | 2011-07-02 06:26:55 -0400
commit    | 881db7fb03a77af0bcd460fd1de1f4062d5c18fe (patch)
tree      | 281c07cf45aabd44962dbceed4efb1a86492115d /mm
parent    | 2cfb7455d223ab24b23df44be430faf92e12390f (diff)
slub: Invert locking and avoid slab lock
Locking slabs is no longer necessary if the arch supports cmpxchg operations
and if no debugging features are used on a slab. If the arch does not support
cmpxchg then we fall back to using the slab lock to do a cmpxchg-like operation.

The patch also changes the lock order. Slab locks are now nested under the node
lock. With that approach slab_trylocking is no longer necessary.
Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Pekka Enberg <penberg@kernel.org>
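To make the description above concrete, here is a condensed sketch of the pattern the patch introduces. It is illustrative only, not the mm/slub.c code: the arch fast path is stubbed out behind a hypothetical arch_cmpxchg_double_freelist() helper and an assumed config guard, while the locked fallback mirrors the cmpxchg_double_slab() hunk further down.

```c
/*
 * Simplified sketch of the "cmpxchg if possible, slab lock otherwise"
 * pattern this patch introduces. Illustrative only, not the kernel code;
 * arch_cmpxchg_double_freelist() is a made-up stand-in for the arch's
 * double-word compare-and-exchange, and the config guard name is assumed.
 */
static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new,
		const char *n)
{
#ifdef CONFIG_CMPXCHG_DOUBLE	/* guard name assumed; not visible in this hunk */
	if (s->flags & __CMPXCHG_DOUBLE)
		/* Fast path: atomic double-word cmpxchg on freelist + counters. */
		return arch_cmpxchg_double_freelist(page, freelist_old,
				counters_old, freelist_new, counters_new);
#endif
	/* Fallback: a cmpxchg-like operation done under the per-slab bit spinlock. */
	slab_lock(page);
	if (page->freelist == freelist_old && page->counters == counters_old) {
		page->freelist = freelist_new;
		page->counters = counters_new;
		slab_unlock(page);
		return true;
	}
	slab_unlock(page);
	return false;
}
```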
Diffstat (limited to 'mm')
-rw-r--r-- | mm/slub.c | 129
1 file changed, 52 insertions, 77 deletions
@@ -2,10 +2,11 @@ | |||
2 | * SLUB: A slab allocator that limits cache line use instead of queuing | 2 | * SLUB: A slab allocator that limits cache line use instead of queuing |
3 | * objects in per cpu and per node lists. | 3 | * objects in per cpu and per node lists. |
4 | * | 4 | * |
5 | * The allocator synchronizes using per slab locks and only | 5 | * The allocator synchronizes using per slab locks or atomic operatios |
6 | * uses a centralized lock to manage a pool of partial slabs. | 6 | * and only uses a centralized lock to manage a pool of partial slabs. |
7 | * | 7 | * |
8 | * (C) 2007 SGI, Christoph Lameter | 8 | * (C) 2007 SGI, Christoph Lameter |
9 | * (C) 2011 Linux Foundation, Christoph Lameter | ||
9 | */ | 10 | */ |
10 | 11 | ||
11 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
@@ -32,15 +33,27 @@ | |||
32 | 33 | ||
33 | /* | 34 | /* |
34 | * Lock order: | 35 | * Lock order: |
35 | * 1. slab_lock(page) | 36 | * 1. slub_lock (Global Semaphore) |
36 | * 2. slab->list_lock | 37 | * 2. node->list_lock |
38 | * 3. slab_lock(page) (Only on some arches and for debugging) | ||
37 | * | 39 | * |
38 | * The slab_lock protects operations on the object of a particular | 40 | * slub_lock |
39 | * slab and its metadata in the page struct. If the slab lock | 41 | * |
40 | * has been taken then no allocations nor frees can be performed | 42 | * The role of the slub_lock is to protect the list of all the slabs |
41 | * on the objects in the slab nor can the slab be added or removed | 43 | * and to synchronize major metadata changes to slab cache structures. |
42 | * from the partial or full lists since this would mean modifying | 44 | * |
43 | * the page_struct of the slab. | 45 | * The slab_lock is only used for debugging and on arches that do not |
46 | * have the ability to do a cmpxchg_double. It only protects the second | ||
47 | * double word in the page struct. Meaning | ||
48 | * A. page->freelist -> List of object free in a page | ||
49 | * B. page->counters -> Counters of objects | ||
50 | * C. page->frozen -> frozen state | ||
51 | * | ||
52 | * If a slab is frozen then it is exempt from list management. It is not | ||
53 | * on any list. The processor that froze the slab is the one who can | ||
54 | * perform list operations on the page. Other processors may put objects | ||
55 | * onto the freelist but the processor that froze the slab is the only | ||
56 | * one that can retrieve the objects from the page's freelist. | ||
44 | * | 57 | * |
45 | * The list_lock protects the partial and full list on each node and | 58 | * The list_lock protects the partial and full list on each node and |
46 | * the partial slab counter. If taken then no new slabs may be added or | 59 | * the partial slab counter. If taken then no new slabs may be added or |
@@ -53,20 +66,6 @@ | |||
53 | * slabs, operations can continue without any centralized lock. F.e. | 66 | * slabs, operations can continue without any centralized lock. F.e. |
54 | * allocating a long series of objects that fill up slabs does not require | 67 | * allocating a long series of objects that fill up slabs does not require |
55 | * the list lock. | 68 | * the list lock. |
56 | * | ||
57 | * The lock order is sometimes inverted when we are trying to get a slab | ||
58 | * off a list. We take the list_lock and then look for a page on the list | ||
59 | * to use. While we do that objects in the slabs may be freed. We can | ||
60 | * only operate on the slab if we have also taken the slab_lock. So we use | ||
61 | * a slab_trylock() on the slab. If trylock was successful then no frees | ||
62 | * can occur anymore and we can use the slab for allocations etc. If the | ||
63 | * slab_trylock() does not succeed then frees are in progress in the slab and | ||
64 | * we must stay away from it for a while since we may cause a bouncing | ||
65 | * cacheline if we try to acquire the lock. So go onto the next slab. | ||
66 | * If all pages are busy then we may allocate a new slab instead of reusing | ||
67 | * a partial slab. A new slab has no one operating on it and thus there is | ||
68 | * no danger of cacheline contention. | ||
69 | * | ||
70 | * Interrupts are disabled during allocation and deallocation in order to | 69 | * Interrupts are disabled during allocation and deallocation in order to |
71 | * make the slab allocator safe to use in the context of an irq. In addition | 70 | * make the slab allocator safe to use in the context of an irq. In addition |
72 | * interrupts are disabled to ensure that the processor does not change | 71 | * interrupts are disabled to ensure that the processor does not change |
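The rewritten lock-order comment can be read as the nesting sketched below. This fragment is not part of the patch; it only illustrates that slab_lock(page) now nests inside the node's list_lock rather than the other way around, and is taken only on the debug / no-cmpxchg path. The global slub_lock (level 1) covers cache-wide operations and is omitted here.

```c
/* Illustration of the new lock nesting (not code from the patch). */
static void example_lock_nesting(struct kmem_cache_node *n, struct page *page)
{
	spin_lock(&n->list_lock);	/* 2. node->list_lock                   */
	slab_lock(page);		/* 3. slab_lock(page): debugging or     */
					/*    no-cmpxchg_double fallback only   */

	/* ... operate on page->freelist / page->counters ... */

	slab_unlock(page);
	spin_unlock(&n->list_lock);
}
```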
@@ -342,6 +341,19 @@ static inline int oo_objects(struct kmem_cache_order_objects x) | |||
342 | return x.x & OO_MASK; | 341 | return x.x & OO_MASK; |
343 | } | 342 | } |
344 | 343 | ||
344 | /* | ||
345 | * Per slab locking using the pagelock | ||
346 | */ | ||
347 | static __always_inline void slab_lock(struct page *page) | ||
348 | { | ||
349 | bit_spin_lock(PG_locked, &page->flags); | ||
350 | } | ||
351 | |||
352 | static __always_inline void slab_unlock(struct page *page) | ||
353 | { | ||
354 | __bit_spin_unlock(PG_locked, &page->flags); | ||
355 | } | ||
356 | |||
345 | static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | 357 | static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, |
346 | void *freelist_old, unsigned long counters_old, | 358 | void *freelist_old, unsigned long counters_old, |
347 | void *freelist_new, unsigned long counters_new, | 359 | void *freelist_new, unsigned long counters_new, |
@@ -356,11 +368,14 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | |||
356 | } else | 368 | } else |
357 | #endif | 369 | #endif |
358 | { | 370 | { |
371 | slab_lock(page); | ||
359 | if (page->freelist == freelist_old && page->counters == counters_old) { | 372 | if (page->freelist == freelist_old && page->counters == counters_old) { |
360 | page->freelist = freelist_new; | 373 | page->freelist = freelist_new; |
361 | page->counters = counters_new; | 374 | page->counters = counters_new; |
375 | slab_unlock(page); | ||
362 | return 1; | 376 | return 1; |
363 | } | 377 | } |
378 | slab_unlock(page); | ||
364 | } | 379 | } |
365 | 380 | ||
366 | cpu_relax(); | 381 | cpu_relax(); |
@@ -377,7 +392,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | |||
377 | /* | 392 | /* |
378 | * Determine a map of object in use on a page. | 393 | * Determine a map of object in use on a page. |
379 | * | 394 | * |
380 | * Slab lock or node listlock must be held to guarantee that the page does | 395 | * Node listlock must be held to guarantee that the page does |
381 | * not vanish from under us. | 396 | * not vanish from under us. |
382 | */ | 397 | */ |
383 | static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) | 398 | static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) |
@@ -808,10 +823,11 @@ static int check_slab(struct kmem_cache *s, struct page *page) | |||
808 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) | 823 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) |
809 | { | 824 | { |
810 | int nr = 0; | 825 | int nr = 0; |
811 | void *fp = page->freelist; | 826 | void *fp; |
812 | void *object = NULL; | 827 | void *object = NULL; |
813 | unsigned long max_objects; | 828 | unsigned long max_objects; |
814 | 829 | ||
830 | fp = page->freelist; | ||
815 | while (fp && nr <= page->objects) { | 831 | while (fp && nr <= page->objects) { |
816 | if (fp == search) | 832 | if (fp == search) |
817 | return 1; | 833 | return 1; |
@@ -1024,6 +1040,8 @@ bad: | |||
1024 | static noinline int free_debug_processing(struct kmem_cache *s, | 1040 | static noinline int free_debug_processing(struct kmem_cache *s, |
1025 | struct page *page, void *object, unsigned long addr) | 1041 | struct page *page, void *object, unsigned long addr) |
1026 | { | 1042 | { |
1043 | slab_lock(page); | ||
1044 | |||
1027 | if (!check_slab(s, page)) | 1045 | if (!check_slab(s, page)) |
1028 | goto fail; | 1046 | goto fail; |
1029 | 1047 | ||
@@ -1059,10 +1077,12 @@ static noinline int free_debug_processing(struct kmem_cache *s, | |||
1059 | set_track(s, object, TRACK_FREE, addr); | 1077 | set_track(s, object, TRACK_FREE, addr); |
1060 | trace(s, page, object, 0); | 1078 | trace(s, page, object, 0); |
1061 | init_object(s, object, SLUB_RED_INACTIVE); | 1079 | init_object(s, object, SLUB_RED_INACTIVE); |
1080 | slab_unlock(page); | ||
1062 | return 1; | 1081 | return 1; |
1063 | 1082 | ||
1064 | fail: | 1083 | fail: |
1065 | slab_fix(s, "Object at 0x%p not freed", object); | 1084 | slab_fix(s, "Object at 0x%p not freed", object); |
1085 | slab_unlock(page); | ||
1066 | return 0; | 1086 | return 0; |
1067 | } | 1087 | } |
1068 | 1088 | ||
@@ -1394,27 +1414,6 @@ static void discard_slab(struct kmem_cache *s, struct page *page) | |||
1394 | } | 1414 | } |
1395 | 1415 | ||
1396 | /* | 1416 | /* |
1397 | * Per slab locking using the pagelock | ||
1398 | */ | ||
1399 | static __always_inline void slab_lock(struct page *page) | ||
1400 | { | ||
1401 | bit_spin_lock(PG_locked, &page->flags); | ||
1402 | } | ||
1403 | |||
1404 | static __always_inline void slab_unlock(struct page *page) | ||
1405 | { | ||
1406 | __bit_spin_unlock(PG_locked, &page->flags); | ||
1407 | } | ||
1408 | |||
1409 | static __always_inline int slab_trylock(struct page *page) | ||
1410 | { | ||
1411 | int rc = 1; | ||
1412 | |||
1413 | rc = bit_spin_trylock(PG_locked, &page->flags); | ||
1414 | return rc; | ||
1415 | } | ||
1416 | |||
1417 | /* | ||
1418 | * Management of partially allocated slabs. | 1417 | * Management of partially allocated slabs. |
1419 | * | 1418 | * |
1420 | * list_lock must be held. | 1419 | * list_lock must be held. |
@@ -1445,17 +1444,13 @@ static inline void remove_partial(struct kmem_cache_node *n, | |||
1445 | * | 1444 | * |
1446 | * Must hold list_lock. | 1445 | * Must hold list_lock. |
1447 | */ | 1446 | */ |
1448 | static inline int lock_and_freeze_slab(struct kmem_cache *s, | 1447 | static inline int acquire_slab(struct kmem_cache *s, |
1449 | struct kmem_cache_node *n, struct page *page) | 1448 | struct kmem_cache_node *n, struct page *page) |
1450 | { | 1449 | { |
1451 | void *freelist; | 1450 | void *freelist; |
1452 | unsigned long counters; | 1451 | unsigned long counters; |
1453 | struct page new; | 1452 | struct page new; |
1454 | 1453 | ||
1455 | |||
1456 | if (!slab_trylock(page)) | ||
1457 | return 0; | ||
1458 | |||
1459 | /* | 1454 | /* |
1460 | * Zap the freelist and set the frozen bit. | 1455 | * Zap the freelist and set the frozen bit. |
1461 | * The old freelist is the list of objects for the | 1456 | * The old freelist is the list of objects for the |
@@ -1491,7 +1486,6 @@ static inline int lock_and_freeze_slab(struct kmem_cache *s, | |||
1491 | */ | 1486 | */ |
1492 | printk(KERN_ERR "SLUB: %s : Page without available objects on" | 1487 | printk(KERN_ERR "SLUB: %s : Page without available objects on" |
1493 | " partial list\n", s->name); | 1488 | " partial list\n", s->name); |
1494 | slab_unlock(page); | ||
1495 | return 0; | 1489 | return 0; |
1496 | } | 1490 | } |
1497 | } | 1491 | } |
@@ -1515,7 +1509,7 @@ static struct page *get_partial_node(struct kmem_cache *s, | |||
1515 | 1509 | ||
1516 | spin_lock(&n->list_lock); | 1510 | spin_lock(&n->list_lock); |
1517 | list_for_each_entry(page, &n->partial, lru) | 1511 | list_for_each_entry(page, &n->partial, lru) |
1518 | if (lock_and_freeze_slab(s, n, page)) | 1512 | if (acquire_slab(s, n, page)) |
1519 | goto out; | 1513 | goto out; |
1520 | page = NULL; | 1514 | page = NULL; |
1521 | out: | 1515 | out: |
@@ -1804,8 +1798,6 @@ redo: | |||
1804 | "unfreezing slab")) | 1798 | "unfreezing slab")) |
1805 | goto redo; | 1799 | goto redo; |
1806 | 1800 | ||
1807 | slab_unlock(page); | ||
1808 | |||
1809 | if (lock) | 1801 | if (lock) |
1810 | spin_unlock(&n->list_lock); | 1802 | spin_unlock(&n->list_lock); |
1811 | 1803 | ||
@@ -1819,7 +1811,6 @@ redo: | |||
1819 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1811 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1820 | { | 1812 | { |
1821 | stat(s, CPUSLAB_FLUSH); | 1813 | stat(s, CPUSLAB_FLUSH); |
1822 | slab_lock(c->page); | ||
1823 | deactivate_slab(s, c); | 1814 | deactivate_slab(s, c); |
1824 | } | 1815 | } |
1825 | 1816 | ||
@@ -1968,7 +1959,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
1968 | if (!page) | 1959 | if (!page) |
1969 | goto new_slab; | 1960 | goto new_slab; |
1970 | 1961 | ||
1971 | slab_lock(page); | ||
1972 | if (unlikely(!node_match(c, node))) | 1962 | if (unlikely(!node_match(c, node))) |
1973 | goto another_slab; | 1963 | goto another_slab; |
1974 | 1964 | ||
@@ -1994,8 +1984,6 @@ load_freelist: | |||
1994 | 1984 | ||
1995 | stat(s, ALLOC_REFILL); | 1985 | stat(s, ALLOC_REFILL); |
1996 | 1986 | ||
1997 | slab_unlock(page); | ||
1998 | |||
1999 | c->freelist = get_freepointer(s, object); | 1987 | c->freelist = get_freepointer(s, object); |
2000 | c->tid = next_tid(c->tid); | 1988 | c->tid = next_tid(c->tid); |
2001 | local_irq_restore(flags); | 1989 | local_irq_restore(flags); |
@@ -2031,7 +2019,6 @@ new_slab: | |||
2031 | page->inuse = page->objects; | 2019 | page->inuse = page->objects; |
2032 | 2020 | ||
2033 | stat(s, ALLOC_SLAB); | 2021 | stat(s, ALLOC_SLAB); |
2034 | slab_lock(page); | ||
2035 | c->node = page_to_nid(page); | 2022 | c->node = page_to_nid(page); |
2036 | c->page = page; | 2023 | c->page = page; |
2037 | goto load_freelist; | 2024 | goto load_freelist; |
@@ -2205,7 +2192,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2205 | unsigned long uninitialized_var(flags); | 2192 | unsigned long uninitialized_var(flags); |
2206 | 2193 | ||
2207 | local_irq_save(flags); | 2194 | local_irq_save(flags); |
2208 | slab_lock(page); | ||
2209 | stat(s, FREE_SLOWPATH); | 2195 | stat(s, FREE_SLOWPATH); |
2210 | 2196 | ||
2211 | if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) | 2197 | if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) |
@@ -2271,7 +2257,6 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2271 | spin_unlock(&n->list_lock); | 2257 | spin_unlock(&n->list_lock); |
2272 | 2258 | ||
2273 | out_unlock: | 2259 | out_unlock: |
2274 | slab_unlock(page); | ||
2275 | local_irq_restore(flags); | 2260 | local_irq_restore(flags); |
2276 | return; | 2261 | return; |
2277 | 2262 | ||
@@ -2285,7 +2270,6 @@ slab_empty: | |||
2285 | } | 2270 | } |
2286 | 2271 | ||
2287 | spin_unlock(&n->list_lock); | 2272 | spin_unlock(&n->list_lock); |
2288 | slab_unlock(page); | ||
2289 | local_irq_restore(flags); | 2273 | local_irq_restore(flags); |
2290 | stat(s, FREE_SLAB); | 2274 | stat(s, FREE_SLAB); |
2291 | discard_slab(s, page); | 2275 | discard_slab(s, page); |
@@ -3202,14 +3186,8 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
3202 | * list_lock. page->inuse here is the upper limit. | 3186 | * list_lock. page->inuse here is the upper limit. |
3203 | */ | 3187 | */ |
3204 | list_for_each_entry_safe(page, t, &n->partial, lru) { | 3188 | list_for_each_entry_safe(page, t, &n->partial, lru) { |
3205 | if (!page->inuse && slab_trylock(page)) { | 3189 | if (!page->inuse) { |
3206 | /* | ||
3207 | * Must hold slab lock here because slab_free | ||
3208 | * may have freed the last object and be | ||
3209 | * waiting to release the slab. | ||
3210 | */ | ||
3211 | remove_partial(n, page); | 3190 | remove_partial(n, page); |
3212 | slab_unlock(page); | ||
3213 | discard_slab(s, page); | 3191 | discard_slab(s, page); |
3214 | } else { | 3192 | } else { |
3215 | list_move(&page->lru, | 3193 | list_move(&page->lru, |
@@ -3797,12 +3775,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page, | |||
3797 | static void validate_slab_slab(struct kmem_cache *s, struct page *page, | 3775 | static void validate_slab_slab(struct kmem_cache *s, struct page *page, |
3798 | unsigned long *map) | 3776 | unsigned long *map) |
3799 | { | 3777 | { |
3800 | if (slab_trylock(page)) { | 3778 | slab_lock(page); |
3801 | validate_slab(s, page, map); | 3779 | validate_slab(s, page, map); |
3802 | slab_unlock(page); | 3780 | slab_unlock(page); |
3803 | } else | ||
3804 | printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", | ||
3805 | s->name, page); | ||
3806 | } | 3781 | } |
3807 | 3782 | ||
3808 | static int validate_slab_node(struct kmem_cache *s, | 3783 | static int validate_slab_node(struct kmem_cache *s, |