diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/slub.c | 764 |
1 files changed, 512 insertions, 252 deletions
@@ -2,10 +2,11 @@ | |||
2 | * SLUB: A slab allocator that limits cache line use instead of queuing | 2 | * SLUB: A slab allocator that limits cache line use instead of queuing |
3 | * objects in per cpu and per node lists. | 3 | * objects in per cpu and per node lists. |
4 | * | 4 | * |
5 | * The allocator synchronizes using per slab locks and only | 5 | * The allocator synchronizes using per slab locks or atomic operations |
6 | * uses a centralized lock to manage a pool of partial slabs. | 6 | * and only uses a centralized lock to manage a pool of partial slabs. |
7 | * | 7 | * |
8 | * (C) 2007 SGI, Christoph Lameter | 8 | * (C) 2007 SGI, Christoph Lameter |
9 | * (C) 2011 Linux Foundation, Christoph Lameter | ||
9 | */ | 10 | */ |
10 | 11 | ||
11 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
@@ -33,15 +34,27 @@ | |||
33 | 34 | ||
34 | /* | 35 | /* |
35 | * Lock order: | 36 | * Lock order: |
36 | * 1. slab_lock(page) | 37 | * 1. slub_lock (Global Semaphore) |
37 | * 2. slab->list_lock | 38 | * 2. node->list_lock |
39 | * 3. slab_lock(page) (Only on some arches and for debugging) | ||
38 | * | 40 | * |
39 | * The slab_lock protects operations on the object of a particular | 41 | * slub_lock |
40 | * slab and its metadata in the page struct. If the slab lock | 42 | * |
41 | * has been taken then no allocations nor frees can be performed | 43 | * The role of the slub_lock is to protect the list of all the slabs |
42 | * on the objects in the slab nor can the slab be added or removed | 44 | * and to synchronize major metadata changes to slab cache structures. |
43 | * from the partial or full lists since this would mean modifying | 45 | * |
44 | * the page_struct of the slab. | 46 | * The slab_lock is only used for debugging and on arches that do not |
47 | * have the ability to do a cmpxchg_double. It only protects the second | ||
48 | * double word in the page struct. Meaning | ||
49 | * A. page->freelist -> List of objects free in a page | |
50 | * B. page->counters -> Counters of objects | ||
51 | * C. page->frozen -> frozen state | ||
52 | * | ||
53 | * If a slab is frozen then it is exempt from list management. It is not | ||
54 | * on any list. The processor that froze the slab is the one who can | ||
55 | * perform list operations on the page. Other processors may put objects | ||
56 | * onto the freelist but the processor that froze the slab is the only | ||
57 | * one that can retrieve the objects from the page's freelist. | ||
45 | * | 58 | * |
46 | * The list_lock protects the partial and full list on each node and | 59 | * The list_lock protects the partial and full list on each node and |
47 | * the partial slab counter. If taken then no new slabs may be added or | 60 | * the partial slab counter. If taken then no new slabs may be added or |
@@ -54,20 +67,6 @@ | |||
54 | * slabs, operations can continue without any centralized lock. F.e. | 67 | * slabs, operations can continue without any centralized lock. F.e. |
55 | * allocating a long series of objects that fill up slabs does not require | 68 | * allocating a long series of objects that fill up slabs does not require |
56 | * the list lock. | 69 | * the list lock. |
57 | * | ||
58 | * The lock order is sometimes inverted when we are trying to get a slab | ||
59 | * off a list. We take the list_lock and then look for a page on the list | ||
60 | * to use. While we do that objects in the slabs may be freed. We can | ||
61 | * only operate on the slab if we have also taken the slab_lock. So we use | ||
62 | * a slab_trylock() on the slab. If trylock was successful then no frees | ||
63 | * can occur anymore and we can use the slab for allocations etc. If the | ||
64 | * slab_trylock() does not succeed then frees are in progress in the slab and | ||
65 | * we must stay away from it for a while since we may cause a bouncing | ||
66 | * cacheline if we try to acquire the lock. So go onto the next slab. | ||
67 | * If all pages are busy then we may allocate a new slab instead of reusing | ||
68 | * a partial slab. A new slab has no one operating on it and thus there is | ||
69 | * no danger of cacheline contention. | ||
70 | * | ||
71 | * Interrupts are disabled during allocation and deallocation in order to | 70 | * Interrupts are disabled during allocation and deallocation in order to |
72 | * make the slab allocator safe to use in the context of an irq. In addition | 71 | * make the slab allocator safe to use in the context of an irq. In addition |
73 | * interrupts are disabled to ensure that the processor does not change | 72 | * interrupts are disabled to ensure that the processor does not change |
@@ -132,6 +131,9 @@ static inline int kmem_cache_debug(struct kmem_cache *s) | |||
132 | /* Enable to test recovery from slab corruption on boot */ | 131 | /* Enable to test recovery from slab corruption on boot */ |
133 | #undef SLUB_RESILIENCY_TEST | 132 | #undef SLUB_RESILIENCY_TEST |
134 | 133 | ||
134 | /* Enable to log cmpxchg failures */ | ||
135 | #undef SLUB_DEBUG_CMPXCHG | ||
136 | |||
135 | /* | 137 | /* |
136 | * Minimum number of partial slabs. These will be left on the partial | 138 | * Minimum number of partial slabs. These will be left on the partial |
137 | * lists even if they are empty. kmem_cache_shrink may reclaim them. | 139 | * lists even if they are empty. kmem_cache_shrink may reclaim them. |
@@ -167,10 +169,11 @@ static inline int kmem_cache_debug(struct kmem_cache *s) | |||
167 | 169 | ||
168 | #define OO_SHIFT 16 | 170 | #define OO_SHIFT 16 |
169 | #define OO_MASK ((1 << OO_SHIFT) - 1) | 171 | #define OO_MASK ((1 << OO_SHIFT) - 1) |
170 | #define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ | 172 | #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ |
171 | 173 | ||
172 | /* Internal SLUB flags */ | 174 | /* Internal SLUB flags */ |
173 | #define __OBJECT_POISON 0x80000000UL /* Poison object */ | 175 | #define __OBJECT_POISON 0x80000000UL /* Poison object */ |
176 | #define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ | ||
174 | 177 | ||
175 | static int kmem_size = sizeof(struct kmem_cache); | 178 | static int kmem_size = sizeof(struct kmem_cache); |
176 | 179 | ||
@@ -343,11 +346,99 @@ static inline int oo_objects(struct kmem_cache_order_objects x) | |||
343 | return x.x & OO_MASK; | 346 | return x.x & OO_MASK; |
344 | } | 347 | } |
345 | 348 | ||
349 | /* | ||
350 | * Per slab locking using the pagelock | ||
351 | */ | ||
352 | static __always_inline void slab_lock(struct page *page) | ||
353 | { | ||
354 | bit_spin_lock(PG_locked, &page->flags); | ||
355 | } | ||
356 | |||
357 | static __always_inline void slab_unlock(struct page *page) | ||
358 | { | ||
359 | __bit_spin_unlock(PG_locked, &page->flags); | ||
360 | } | ||
361 | |||
362 | /* Interrupts must be disabled (for the fallback code to work right) */ | ||
363 | static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | ||
364 | void *freelist_old, unsigned long counters_old, | ||
365 | void *freelist_new, unsigned long counters_new, | ||
366 | const char *n) | ||
367 | { | ||
368 | VM_BUG_ON(!irqs_disabled()); | ||
369 | #ifdef CONFIG_CMPXCHG_DOUBLE | ||
370 | if (s->flags & __CMPXCHG_DOUBLE) { | ||
371 | if (cmpxchg_double(&page->freelist, | ||
372 | freelist_old, counters_old, | ||
373 | freelist_new, counters_new)) | ||
374 | return 1; | ||
375 | } else | ||
376 | #endif | ||
377 | { | ||
378 | slab_lock(page); | ||
379 | if (page->freelist == freelist_old && page->counters == counters_old) { | ||
380 | page->freelist = freelist_new; | ||
381 | page->counters = counters_new; | ||
382 | slab_unlock(page); | ||
383 | return 1; | ||
384 | } | ||
385 | slab_unlock(page); | ||
386 | } | ||
387 | |||
388 | cpu_relax(); | ||
389 | stat(s, CMPXCHG_DOUBLE_FAIL); | ||
390 | |||
391 | #ifdef SLUB_DEBUG_CMPXCHG | ||
392 | printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); | ||
393 | #endif | ||
394 | |||
395 | return 0; | ||
396 | } | ||
397 | |||
398 | static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, | ||
399 | void *freelist_old, unsigned long counters_old, | ||
400 | void *freelist_new, unsigned long counters_new, | ||
401 | const char *n) | ||
402 | { | ||
403 | #ifdef CONFIG_CMPXCHG_DOUBLE | ||
404 | if (s->flags & __CMPXCHG_DOUBLE) { | ||
405 | if (cmpxchg_double(&page->freelist, | ||
406 | freelist_old, counters_old, | ||
407 | freelist_new, counters_new)) | ||
408 | return 1; | ||
409 | } else | ||
410 | #endif | ||
411 | { | ||
412 | unsigned long flags; | ||
413 | |||
414 | local_irq_save(flags); | ||
415 | slab_lock(page); | ||
416 | if (page->freelist == freelist_old && page->counters == counters_old) { | ||
417 | page->freelist = freelist_new; | ||
418 | page->counters = counters_new; | ||
419 | slab_unlock(page); | ||
420 | local_irq_restore(flags); | ||
421 | return 1; | ||
422 | } | ||
423 | slab_unlock(page); | ||
424 | local_irq_restore(flags); | ||
425 | } | ||
426 | |||
427 | cpu_relax(); | ||
428 | stat(s, CMPXCHG_DOUBLE_FAIL); | ||
429 | |||
430 | #ifdef SLUB_DEBUG_CMPXCHG | ||
431 | printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); | ||
432 | #endif | ||
433 | |||
434 | return 0; | ||
435 | } | ||
436 | |||
346 | #ifdef CONFIG_SLUB_DEBUG | 437 | #ifdef CONFIG_SLUB_DEBUG |
347 | /* | 438 | /* |
348 | * Determine a map of objects in use on a page. | 439 | * Determine a map of objects in use on a page. |
349 | * | 440 | * |
350 | * Slab lock or node listlock must be held to guarantee that the page does | 441 | * Node listlock must be held to guarantee that the page does |
351 | * not vanish from under us. | 442 | * not vanish from under us. |
352 | */ | 443 | */ |
353 | static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) | 444 | static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map) |
@@ -838,10 +929,11 @@ static int check_slab(struct kmem_cache *s, struct page *page) | |||
838 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) | 929 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) |
839 | { | 930 | { |
840 | int nr = 0; | 931 | int nr = 0; |
841 | void *fp = page->freelist; | 932 | void *fp; |
842 | void *object = NULL; | 933 | void *object = NULL; |
843 | unsigned long max_objects; | 934 | unsigned long max_objects; |
844 | 935 | ||
936 | fp = page->freelist; | ||
845 | while (fp && nr <= page->objects) { | 937 | while (fp && nr <= page->objects) { |
846 | if (fp == search) | 938 | if (fp == search) |
847 | return 1; | 939 | return 1; |
@@ -946,26 +1038,27 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) | |||
946 | 1038 | ||
947 | /* | 1039 | /* |
948 | * Tracking of fully allocated slabs for debugging purposes. | 1040 | * Tracking of fully allocated slabs for debugging purposes. |
1041 | * | ||
1042 | * list_lock must be held. | ||
949 | */ | 1043 | */ |
950 | static void add_full(struct kmem_cache_node *n, struct page *page) | 1044 | static void add_full(struct kmem_cache *s, |
1045 | struct kmem_cache_node *n, struct page *page) | ||
951 | { | 1046 | { |
952 | spin_lock(&n->list_lock); | 1047 | if (!(s->flags & SLAB_STORE_USER)) |
1048 | return; | ||
1049 | |||
953 | list_add(&page->lru, &n->full); | 1050 | list_add(&page->lru, &n->full); |
954 | spin_unlock(&n->list_lock); | ||
955 | } | 1051 | } |
956 | 1052 | ||
1053 | /* | ||
1054 | * list_lock must be held. | ||
1055 | */ | ||
957 | static void remove_full(struct kmem_cache *s, struct page *page) | 1056 | static void remove_full(struct kmem_cache *s, struct page *page) |
958 | { | 1057 | { |
959 | struct kmem_cache_node *n; | ||
960 | |||
961 | if (!(s->flags & SLAB_STORE_USER)) | 1058 | if (!(s->flags & SLAB_STORE_USER)) |
962 | return; | 1059 | return; |
963 | 1060 | ||
964 | n = get_node(s, page_to_nid(page)); | ||
965 | |||
966 | spin_lock(&n->list_lock); | ||
967 | list_del(&page->lru); | 1061 | list_del(&page->lru); |
968 | spin_unlock(&n->list_lock); | ||
969 | } | 1062 | } |
970 | 1063 | ||
971 | /* Tracking of the number of slabs for debugging purposes */ | 1064 | /* Tracking of the number of slabs for debugging purposes */ |
@@ -1021,11 +1114,6 @@ static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *pa | |||
1021 | if (!check_slab(s, page)) | 1114 | if (!check_slab(s, page)) |
1022 | goto bad; | 1115 | goto bad; |
1023 | 1116 | ||
1024 | if (!on_freelist(s, page, object)) { | ||
1025 | object_err(s, page, object, "Object already allocated"); | ||
1026 | goto bad; | ||
1027 | } | ||
1028 | |||
1029 | if (!check_valid_pointer(s, page, object)) { | 1117 | if (!check_valid_pointer(s, page, object)) { |
1030 | object_err(s, page, object, "Freelist Pointer check fails"); | 1118 | object_err(s, page, object, "Freelist Pointer check fails"); |
1031 | goto bad; | 1119 | goto bad; |
@@ -1058,6 +1146,12 @@ bad: | |||
1058 | static noinline int free_debug_processing(struct kmem_cache *s, | 1146 | static noinline int free_debug_processing(struct kmem_cache *s, |
1059 | struct page *page, void *object, unsigned long addr) | 1147 | struct page *page, void *object, unsigned long addr) |
1060 | { | 1148 | { |
1149 | unsigned long flags; | ||
1150 | int rc = 0; | ||
1151 | |||
1152 | local_irq_save(flags); | ||
1153 | slab_lock(page); | ||
1154 | |||
1061 | if (!check_slab(s, page)) | 1155 | if (!check_slab(s, page)) |
1062 | goto fail; | 1156 | goto fail; |
1063 | 1157 | ||
@@ -1072,7 +1166,7 @@ static noinline int free_debug_processing(struct kmem_cache *s, | |||
1072 | } | 1166 | } |
1073 | 1167 | ||
1074 | if (!check_object(s, page, object, SLUB_RED_ACTIVE)) | 1168 | if (!check_object(s, page, object, SLUB_RED_ACTIVE)) |
1075 | return 0; | 1169 | goto out; |
1076 | 1170 | ||
1077 | if (unlikely(s != page->slab)) { | 1171 | if (unlikely(s != page->slab)) { |
1078 | if (!PageSlab(page)) { | 1172 | if (!PageSlab(page)) { |
@@ -1089,18 +1183,19 @@ static noinline int free_debug_processing(struct kmem_cache *s, | |||
1089 | goto fail; | 1183 | goto fail; |
1090 | } | 1184 | } |
1091 | 1185 | ||
1092 | /* Special debug activities for freeing objects */ | ||
1093 | if (!PageSlubFrozen(page) && !page->freelist) | ||
1094 | remove_full(s, page); | ||
1095 | if (s->flags & SLAB_STORE_USER) | 1186 | if (s->flags & SLAB_STORE_USER) |
1096 | set_track(s, object, TRACK_FREE, addr); | 1187 | set_track(s, object, TRACK_FREE, addr); |
1097 | trace(s, page, object, 0); | 1188 | trace(s, page, object, 0); |
1098 | init_object(s, object, SLUB_RED_INACTIVE); | 1189 | init_object(s, object, SLUB_RED_INACTIVE); |
1099 | return 1; | 1190 | rc = 1; |
1191 | out: | ||
1192 | slab_unlock(page); | ||
1193 | local_irq_restore(flags); | ||
1194 | return rc; | ||
1100 | 1195 | ||
1101 | fail: | 1196 | fail: |
1102 | slab_fix(s, "Object at 0x%p not freed", object); | 1197 | slab_fix(s, "Object at 0x%p not freed", object); |
1103 | return 0; | 1198 | goto out; |
1104 | } | 1199 | } |
1105 | 1200 | ||
1106 | static int __init setup_slub_debug(char *str) | 1201 | static int __init setup_slub_debug(char *str) |
@@ -1200,7 +1295,9 @@ static inline int slab_pad_check(struct kmem_cache *s, struct page *page) | |||
1200 | { return 1; } | 1295 | { return 1; } |
1201 | static inline int check_object(struct kmem_cache *s, struct page *page, | 1296 | static inline int check_object(struct kmem_cache *s, struct page *page, |
1202 | void *object, u8 val) { return 1; } | 1297 | void *object, u8 val) { return 1; } |
1203 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} | 1298 | static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, |
1299 | struct page *page) {} | ||
1300 | static inline void remove_full(struct kmem_cache *s, struct page *page) {} | ||
1204 | static inline unsigned long kmem_cache_flags(unsigned long objsize, | 1301 | static inline unsigned long kmem_cache_flags(unsigned long objsize, |
1205 | unsigned long flags, const char *name, | 1302 | unsigned long flags, const char *name, |
1206 | void (*ctor)(void *)) | 1303 | void (*ctor)(void *)) |
@@ -1252,6 +1349,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1252 | struct kmem_cache_order_objects oo = s->oo; | 1349 | struct kmem_cache_order_objects oo = s->oo; |
1253 | gfp_t alloc_gfp; | 1350 | gfp_t alloc_gfp; |
1254 | 1351 | ||
1352 | flags &= gfp_allowed_mask; | ||
1353 | |||
1354 | if (flags & __GFP_WAIT) | ||
1355 | local_irq_enable(); | ||
1356 | |||
1255 | flags |= s->allocflags; | 1357 | flags |= s->allocflags; |
1256 | 1358 | ||
1257 | /* | 1359 | /* |
@@ -1268,12 +1370,17 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1268 | * Try a lower order alloc if possible | 1370 | * Try a lower order alloc if possible |
1269 | */ | 1371 | */ |
1270 | page = alloc_slab_page(flags, node, oo); | 1372 | page = alloc_slab_page(flags, node, oo); |
1271 | if (!page) | ||
1272 | return NULL; | ||
1273 | 1373 | ||
1274 | stat(s, ORDER_FALLBACK); | 1374 | if (page) |
1375 | stat(s, ORDER_FALLBACK); | ||
1275 | } | 1376 | } |
1276 | 1377 | ||
1378 | if (flags & __GFP_WAIT) | ||
1379 | local_irq_disable(); | ||
1380 | |||
1381 | if (!page) | ||
1382 | return NULL; | ||
1383 | |||
1277 | if (kmemcheck_enabled | 1384 | if (kmemcheck_enabled |
1278 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { | 1385 | && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { |
1279 | int pages = 1 << oo_order(oo); | 1386 | int pages = 1 << oo_order(oo); |
@@ -1341,6 +1448,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1341 | 1448 | ||
1342 | page->freelist = start; | 1449 | page->freelist = start; |
1343 | page->inuse = 0; | 1450 | page->inuse = 0; |
1451 | page->frozen = 1; | ||
1344 | out: | 1452 | out: |
1345 | return page; | 1453 | return page; |
1346 | } | 1454 | } |
@@ -1418,77 +1526,87 @@ static void discard_slab(struct kmem_cache *s, struct page *page) | |||
1418 | } | 1526 | } |
1419 | 1527 | ||
1420 | /* | 1528 | /* |
1421 | * Per slab locking using the pagelock | 1529 | * Management of partially allocated slabs. |
1422 | */ | 1530 | * |
1423 | static __always_inline void slab_lock(struct page *page) | 1531 | * list_lock must be held. |
1424 | { | ||
1425 | bit_spin_lock(PG_locked, &page->flags); | ||
1426 | } | ||
1427 | |||
1428 | static __always_inline void slab_unlock(struct page *page) | ||
1429 | { | ||
1430 | __bit_spin_unlock(PG_locked, &page->flags); | ||
1431 | } | ||
1432 | |||
1433 | static __always_inline int slab_trylock(struct page *page) | ||
1434 | { | ||
1435 | int rc = 1; | ||
1436 | |||
1437 | rc = bit_spin_trylock(PG_locked, &page->flags); | ||
1438 | return rc; | ||
1439 | } | ||
1440 | |||
1441 | /* | ||
1442 | * Management of partially allocated slabs | ||
1443 | */ | 1532 | */ |
1444 | static void add_partial(struct kmem_cache_node *n, | 1533 | static inline void add_partial(struct kmem_cache_node *n, |
1445 | struct page *page, int tail) | 1534 | struct page *page, int tail) |
1446 | { | 1535 | { |
1447 | spin_lock(&n->list_lock); | ||
1448 | n->nr_partial++; | 1536 | n->nr_partial++; |
1449 | if (tail) | 1537 | if (tail) |
1450 | list_add_tail(&page->lru, &n->partial); | 1538 | list_add_tail(&page->lru, &n->partial); |
1451 | else | 1539 | else |
1452 | list_add(&page->lru, &n->partial); | 1540 | list_add(&page->lru, &n->partial); |
1453 | spin_unlock(&n->list_lock); | ||
1454 | } | 1541 | } |
1455 | 1542 | ||
1456 | static inline void __remove_partial(struct kmem_cache_node *n, | 1543 | /* |
1544 | * list_lock must be held. | ||
1545 | */ | ||
1546 | static inline void remove_partial(struct kmem_cache_node *n, | ||
1457 | struct page *page) | 1547 | struct page *page) |
1458 | { | 1548 | { |
1459 | list_del(&page->lru); | 1549 | list_del(&page->lru); |
1460 | n->nr_partial--; | 1550 | n->nr_partial--; |
1461 | } | 1551 | } |
1462 | 1552 | ||
1463 | static void remove_partial(struct kmem_cache *s, struct page *page) | ||
1464 | { | ||
1465 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | ||
1466 | |||
1467 | spin_lock(&n->list_lock); | ||
1468 | __remove_partial(n, page); | ||
1469 | spin_unlock(&n->list_lock); | ||
1470 | } | ||
1471 | |||
1472 | /* | 1553 | /* |
1473 | * Lock slab and remove from the partial list. | 1554 | * Lock slab, remove from the partial list and put the object into the |
1555 | * per cpu freelist. | ||
1474 | * | 1556 | * |
1475 | * Must hold list_lock. | 1557 | * Must hold list_lock. |
1476 | */ | 1558 | */ |
1477 | static inline int lock_and_freeze_slab(struct kmem_cache_node *n, | 1559 | static inline int acquire_slab(struct kmem_cache *s, |
1478 | struct page *page) | 1560 | struct kmem_cache_node *n, struct page *page) |
1479 | { | 1561 | { |
1480 | if (slab_trylock(page)) { | 1562 | void *freelist; |
1481 | __remove_partial(n, page); | 1563 | unsigned long counters; |
1482 | __SetPageSlubFrozen(page); | 1564 | struct page new; |
1565 | |||
1566 | /* | ||
1567 | * Zap the freelist and set the frozen bit. | ||
1568 | * The old freelist is the list of objects for the | ||
1569 | * per cpu allocation list. | ||
1570 | */ | ||
1571 | do { | ||
1572 | freelist = page->freelist; | ||
1573 | counters = page->counters; | ||
1574 | new.counters = counters; | ||
1575 | new.inuse = page->objects; | ||
1576 | |||
1577 | VM_BUG_ON(new.frozen); | ||
1578 | new.frozen = 1; | ||
1579 | |||
1580 | } while (!__cmpxchg_double_slab(s, page, | ||
1581 | freelist, counters, | ||
1582 | NULL, new.counters, | ||
1583 | "lock and freeze")); | ||
1584 | |||
1585 | remove_partial(n, page); | ||
1586 | |||
1587 | if (freelist) { | ||
1588 | /* Populate the per cpu freelist */ | ||
1589 | this_cpu_write(s->cpu_slab->freelist, freelist); | ||
1590 | this_cpu_write(s->cpu_slab->page, page); | ||
1591 | this_cpu_write(s->cpu_slab->node, page_to_nid(page)); | ||
1483 | return 1; | 1592 | return 1; |
1593 | } else { | ||
1594 | /* | ||
1595 | * Slab page came from the wrong list. No object to allocate | ||
1596 | * from. Put it onto the correct list and continue partial | ||
1597 | * scan. | ||
1598 | */ | ||
1599 | printk(KERN_ERR "SLUB: %s : Page without available objects on" | ||
1600 | " partial list\n", s->name); | ||
1601 | return 0; | ||
1484 | } | 1602 | } |
1485 | return 0; | ||
1486 | } | 1603 | } |
1487 | 1604 | ||
1488 | /* | 1605 | /* |
1489 | * Try to allocate a partial slab from a specific node. | 1606 | * Try to allocate a partial slab from a specific node. |
1490 | */ | 1607 | */ |
1491 | static struct page *get_partial_node(struct kmem_cache_node *n) | 1608 | static struct page *get_partial_node(struct kmem_cache *s, |
1609 | struct kmem_cache_node *n) | ||
1492 | { | 1610 | { |
1493 | struct page *page; | 1611 | struct page *page; |
1494 | 1612 | ||
@@ -1503,7 +1621,7 @@ static struct page *get_partial_node(struct kmem_cache_node *n) | |||
1503 | 1621 | ||
1504 | spin_lock(&n->list_lock); | 1622 | spin_lock(&n->list_lock); |
1505 | list_for_each_entry(page, &n->partial, lru) | 1623 | list_for_each_entry(page, &n->partial, lru) |
1506 | if (lock_and_freeze_slab(n, page)) | 1624 | if (acquire_slab(s, n, page)) |
1507 | goto out; | 1625 | goto out; |
1508 | page = NULL; | 1626 | page = NULL; |
1509 | out: | 1627 | out: |
@@ -1554,7 +1672,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1554 | 1672 | ||
1555 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && | 1673 | if (n && cpuset_zone_allowed_hardwall(zone, flags) && |
1556 | n->nr_partial > s->min_partial) { | 1674 | n->nr_partial > s->min_partial) { |
1557 | page = get_partial_node(n); | 1675 | page = get_partial_node(s, n); |
1558 | if (page) { | 1676 | if (page) { |
1559 | put_mems_allowed(); | 1677 | put_mems_allowed(); |
1560 | return page; | 1678 | return page; |
@@ -1574,60 +1692,13 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) | |||
1574 | struct page *page; | 1692 | struct page *page; |
1575 | int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; | 1693 | int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; |
1576 | 1694 | ||
1577 | page = get_partial_node(get_node(s, searchnode)); | 1695 | page = get_partial_node(s, get_node(s, searchnode)); |
1578 | if (page || node != NUMA_NO_NODE) | 1696 | if (page || node != NUMA_NO_NODE) |
1579 | return page; | 1697 | return page; |
1580 | 1698 | ||
1581 | return get_any_partial(s, flags); | 1699 | return get_any_partial(s, flags); |
1582 | } | 1700 | } |
1583 | 1701 | ||
1584 | /* | ||
1585 | * Move a page back to the lists. | ||
1586 | * | ||
1587 | * Must be called with the slab lock held. | ||
1588 | * | ||
1589 | * On exit the slab lock will have been dropped. | ||
1590 | */ | ||
1591 | static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) | ||
1592 | __releases(bitlock) | ||
1593 | { | ||
1594 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); | ||
1595 | |||
1596 | __ClearPageSlubFrozen(page); | ||
1597 | if (page->inuse) { | ||
1598 | |||
1599 | if (page->freelist) { | ||
1600 | add_partial(n, page, tail); | ||
1601 | stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); | ||
1602 | } else { | ||
1603 | stat(s, DEACTIVATE_FULL); | ||
1604 | if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) | ||
1605 | add_full(n, page); | ||
1606 | } | ||
1607 | slab_unlock(page); | ||
1608 | } else { | ||
1609 | stat(s, DEACTIVATE_EMPTY); | ||
1610 | if (n->nr_partial < s->min_partial) { | ||
1611 | /* | ||
1612 | * Adding an empty slab to the partial slabs in order | ||
1613 | * to avoid page allocator overhead. This slab needs | ||
1614 | * to come after the other slabs with objects in | ||
1615 | * so that the others get filled first. That way the | ||
1616 | * size of the partial list stays small. | ||
1617 | * | ||
1618 | * kmem_cache_shrink can reclaim any empty slabs from | ||
1619 | * the partial list. | ||
1620 | */ | ||
1621 | add_partial(n, page, 1); | ||
1622 | slab_unlock(page); | ||
1623 | } else { | ||
1624 | slab_unlock(page); | ||
1625 | stat(s, FREE_SLAB); | ||
1626 | discard_slab(s, page); | ||
1627 | } | ||
1628 | } | ||
1629 | } | ||
1630 | |||
1631 | #ifdef CONFIG_PREEMPT | 1702 | #ifdef CONFIG_PREEMPT |
1632 | /* | 1703 | /* |
1633 | * Calculate the next globally unique transaction for disambiguation | 1704 | * Calculate the next globally unique transaction for disambiguation |
@@ -1697,42 +1768,161 @@ void init_kmem_cache_cpus(struct kmem_cache *s) | |||
1697 | /* | 1768 | /* |
1698 | * Remove the cpu slab | 1769 | * Remove the cpu slab |
1699 | */ | 1770 | */ |
1771 | |||
1772 | /* | ||
1773 | * Remove the cpu slab | ||
1774 | */ | ||
1700 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1775 | static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1701 | __releases(bitlock) | ||
1702 | { | 1776 | { |
1777 | enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; | ||
1703 | struct page *page = c->page; | 1778 | struct page *page = c->page; |
1704 | int tail = 1; | 1779 | struct kmem_cache_node *n = get_node(s, page_to_nid(page)); |
1705 | 1780 | int lock = 0; | |
1706 | if (page->freelist) | 1781 | enum slab_modes l = M_NONE, m = M_NONE; |
1782 | void *freelist; | ||
1783 | void *nextfree; | ||
1784 | int tail = 0; | ||
1785 | struct page new; | ||
1786 | struct page old; | ||
1787 | |||
1788 | if (page->freelist) { | ||
1707 | stat(s, DEACTIVATE_REMOTE_FREES); | 1789 | stat(s, DEACTIVATE_REMOTE_FREES); |
1790 | tail = 1; | ||
1791 | } | ||
1792 | |||
1793 | c->tid = next_tid(c->tid); | ||
1794 | c->page = NULL; | ||
1795 | freelist = c->freelist; | ||
1796 | c->freelist = NULL; | ||
1797 | |||
1798 | /* | ||
1799 | * Stage one: Free all available per cpu objects back | ||
1800 | * to the page freelist while it is still frozen. Leave the | ||
1801 | * last one. | ||
1802 | * | ||
1803 | * There is no need to take the list->lock because the page | ||
1804 | * is still frozen. | ||
1805 | */ | ||
1806 | while (freelist && (nextfree = get_freepointer(s, freelist))) { | ||
1807 | void *prior; | ||
1808 | unsigned long counters; | ||
1809 | |||
1810 | do { | ||
1811 | prior = page->freelist; | ||
1812 | counters = page->counters; | ||
1813 | set_freepointer(s, freelist, prior); | ||
1814 | new.counters = counters; | ||
1815 | new.inuse--; | ||
1816 | VM_BUG_ON(!new.frozen); | ||
1817 | |||
1818 | } while (!__cmpxchg_double_slab(s, page, | ||
1819 | prior, counters, | ||
1820 | freelist, new.counters, | ||
1821 | "drain percpu freelist")); | ||
1822 | |||
1823 | freelist = nextfree; | ||
1824 | } | ||
1825 | |||
1708 | /* | 1826 | /* |
1709 | * Merge cpu freelist into slab freelist. Typically we get here | 1827 | * Stage two: Ensure that the page is unfrozen while the |
1710 | * because both freelists are empty. So this is unlikely | 1828 | * list presence reflects the actual number of objects |
1711 | * to occur. | 1829 | * during unfreeze. |
1830 | * | ||
1831 | * We setup the list membership and then perform a cmpxchg | ||
1832 | * with the count. If there is a mismatch then the page | ||
1833 | * is not unfrozen but the page is on the wrong list. | ||
1834 | * | ||
1835 | * Then we restart the process which may have to remove | ||
1836 | * the page from the list that we just put it on again | ||
1837 | * because the number of objects in the slab may have | ||
1838 | * changed. | ||
1712 | */ | 1839 | */ |
1713 | while (unlikely(c->freelist)) { | 1840 | redo: |
1714 | void **object; | ||
1715 | 1841 | ||
1716 | tail = 0; /* Hot objects. Put the slab first */ | 1842 | old.freelist = page->freelist; |
1843 | old.counters = page->counters; | ||
1844 | VM_BUG_ON(!old.frozen); | ||
1717 | 1845 | ||
1718 | /* Retrieve object from cpu_freelist */ | 1846 | /* Determine target state of the slab */ |
1719 | object = c->freelist; | 1847 | new.counters = old.counters; |
1720 | c->freelist = get_freepointer(s, c->freelist); | 1848 | if (freelist) { |
1849 | new.inuse--; | ||
1850 | set_freepointer(s, freelist, old.freelist); | ||
1851 | new.freelist = freelist; | ||
1852 | } else | ||
1853 | new.freelist = old.freelist; | ||
1854 | |||
1855 | new.frozen = 0; | ||
1856 | |||
1857 | if (!new.inuse && n->nr_partial < s->min_partial) | ||
1858 | m = M_FREE; | ||
1859 | else if (new.freelist) { | ||
1860 | m = M_PARTIAL; | ||
1861 | if (!lock) { | ||
1862 | lock = 1; | ||
1863 | /* | ||
1864 | * Taking the spinlock removes the possibility | |
1865 | * that acquire_slab() will see a slab page that | ||
1866 | * is frozen | ||
1867 | */ | ||
1868 | spin_lock(&n->list_lock); | ||
1869 | } | ||
1870 | } else { | ||
1871 | m = M_FULL; | ||
1872 | if (kmem_cache_debug(s) && !lock) { | ||
1873 | lock = 1; | ||
1874 | /* | ||
1875 | * This also ensures that the scanning of full | ||
1876 | * slabs from diagnostic functions will not see | ||
1877 | * any frozen slabs. | ||
1878 | */ | ||
1879 | spin_lock(&n->list_lock); | ||
1880 | } | ||
1881 | } | ||
1882 | |||
1883 | if (l != m) { | ||
1884 | |||
1885 | if (l == M_PARTIAL) | ||
1886 | |||
1887 | remove_partial(n, page); | ||
1888 | |||
1889 | else if (l == M_FULL) | ||
1890 | |||
1891 | remove_full(s, page); | ||
1892 | |||
1893 | if (m == M_PARTIAL) { | ||
1894 | |||
1895 | add_partial(n, page, tail); | ||
1896 | stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); | ||
1897 | |||
1898 | } else if (m == M_FULL) { | ||
1721 | 1899 | ||
1722 | /* And put onto the regular freelist */ | 1900 | stat(s, DEACTIVATE_FULL); |
1723 | set_freepointer(s, object, page->freelist); | 1901 | add_full(s, n, page); |
1724 | page->freelist = object; | 1902 | |
1725 | page->inuse--; | 1903 | } |
1904 | } | ||
1905 | |||
1906 | l = m; | ||
1907 | if (!__cmpxchg_double_slab(s, page, | ||
1908 | old.freelist, old.counters, | ||
1909 | new.freelist, new.counters, | ||
1910 | "unfreezing slab")) | ||
1911 | goto redo; | ||
1912 | |||
1913 | if (lock) | ||
1914 | spin_unlock(&n->list_lock); | ||
1915 | |||
1916 | if (m == M_FREE) { | ||
1917 | stat(s, DEACTIVATE_EMPTY); | ||
1918 | discard_slab(s, page); | ||
1919 | stat(s, FREE_SLAB); | ||
1726 | } | 1920 | } |
1727 | c->page = NULL; | ||
1728 | c->tid = next_tid(c->tid); | ||
1729 | unfreeze_slab(s, page, tail); | ||
1730 | } | 1921 | } |
1731 | 1922 | ||
1732 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) | 1923 | static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) |
1733 | { | 1924 | { |
1734 | stat(s, CPUSLAB_FLUSH); | 1925 | stat(s, CPUSLAB_FLUSH); |
1735 | slab_lock(c->page); | ||
1736 | deactivate_slab(s, c); | 1926 | deactivate_slab(s, c); |
1737 | } | 1927 | } |
1738 | 1928 | ||
@@ -1861,6 +2051,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
1861 | void **object; | 2051 | void **object; |
1862 | struct page *page; | 2052 | struct page *page; |
1863 | unsigned long flags; | 2053 | unsigned long flags; |
2054 | struct page new; | ||
2055 | unsigned long counters; | ||
1864 | 2056 | ||
1865 | local_irq_save(flags); | 2057 | local_irq_save(flags); |
1866 | #ifdef CONFIG_PREEMPT | 2058 | #ifdef CONFIG_PREEMPT |
@@ -1879,72 +2071,97 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, | |||
1879 | if (!page) | 2071 | if (!page) |
1880 | goto new_slab; | 2072 | goto new_slab; |
1881 | 2073 | ||
1882 | slab_lock(page); | 2074 | if (unlikely(!node_match(c, node))) { |
1883 | if (unlikely(!node_match(c, node))) | 2075 | stat(s, ALLOC_NODE_MISMATCH); |
1884 | goto another_slab; | 2076 | deactivate_slab(s, c); |
2077 | goto new_slab; | ||
2078 | } | ||
2079 | |||
2080 | stat(s, ALLOC_SLOWPATH); | ||
2081 | |||
2082 | do { | ||
2083 | object = page->freelist; | ||
2084 | counters = page->counters; | ||
2085 | new.counters = counters; | ||
2086 | VM_BUG_ON(!new.frozen); | ||
2087 | |||
2088 | /* | ||
2089 | * If there is no object left then we use this loop to | ||
2090 | * deactivate the slab which is simple since no objects | ||
2091 | * are left in the slab and therefore we do not need to | ||
2092 | * put the page back onto the partial list. | ||
2093 | * | ||
2094 | * If there are objects left then we retrieve them | ||
2095 | * and use them to refill the per cpu queue. | ||
2096 | */ | ||
2097 | |||
2098 | new.inuse = page->objects; | ||
2099 | new.frozen = object != NULL; | ||
2100 | |||
2101 | } while (!__cmpxchg_double_slab(s, page, | ||
2102 | object, counters, | ||
2103 | NULL, new.counters, | ||
2104 | "__slab_alloc")); | ||
2105 | |||
2106 | if (unlikely(!object)) { | ||
2107 | c->page = NULL; | ||
2108 | stat(s, DEACTIVATE_BYPASS); | ||
2109 | goto new_slab; | ||
2110 | } | ||
1885 | 2111 | ||
1886 | stat(s, ALLOC_REFILL); | 2112 | stat(s, ALLOC_REFILL); |
1887 | 2113 | ||
1888 | load_freelist: | 2114 | load_freelist: |
1889 | object = page->freelist; | 2115 | VM_BUG_ON(!page->frozen); |
1890 | if (unlikely(!object)) | ||
1891 | goto another_slab; | ||
1892 | if (kmem_cache_debug(s)) | ||
1893 | goto debug; | ||
1894 | |||
1895 | c->freelist = get_freepointer(s, object); | 2116 | c->freelist = get_freepointer(s, object); |
1896 | page->inuse = page->objects; | ||
1897 | page->freelist = NULL; | ||
1898 | |||
1899 | slab_unlock(page); | ||
1900 | c->tid = next_tid(c->tid); | 2117 | c->tid = next_tid(c->tid); |
1901 | local_irq_restore(flags); | 2118 | local_irq_restore(flags); |
1902 | stat(s, ALLOC_SLOWPATH); | ||
1903 | return object; | 2119 | return object; |
1904 | 2120 | ||
1905 | another_slab: | ||
1906 | deactivate_slab(s, c); | ||
1907 | |||
1908 | new_slab: | 2121 | new_slab: |
1909 | page = get_partial(s, gfpflags, node); | 2122 | page = get_partial(s, gfpflags, node); |
1910 | if (page) { | 2123 | if (page) { |
1911 | stat(s, ALLOC_FROM_PARTIAL); | 2124 | stat(s, ALLOC_FROM_PARTIAL); |
1912 | c->node = page_to_nid(page); | 2125 | object = c->freelist; |
1913 | c->page = page; | 2126 | |
2127 | if (kmem_cache_debug(s)) | ||
2128 | goto debug; | ||
1914 | goto load_freelist; | 2129 | goto load_freelist; |
1915 | } | 2130 | } |
1916 | 2131 | ||
1917 | gfpflags &= gfp_allowed_mask; | ||
1918 | if (gfpflags & __GFP_WAIT) | ||
1919 | local_irq_enable(); | ||
1920 | |||
1921 | page = new_slab(s, gfpflags, node); | 2132 | page = new_slab(s, gfpflags, node); |
1922 | 2133 | ||
1923 | if (gfpflags & __GFP_WAIT) | ||
1924 | local_irq_disable(); | ||
1925 | |||
1926 | if (page) { | 2134 | if (page) { |
1927 | c = __this_cpu_ptr(s->cpu_slab); | 2135 | c = __this_cpu_ptr(s->cpu_slab); |
1928 | stat(s, ALLOC_SLAB); | ||
1929 | if (c->page) | 2136 | if (c->page) |
1930 | flush_slab(s, c); | 2137 | flush_slab(s, c); |
1931 | 2138 | ||
1932 | slab_lock(page); | 2139 | /* |
1933 | __SetPageSlubFrozen(page); | 2140 | * No other reference to the page yet so we can |
2141 | * muck around with it freely without cmpxchg | ||
2142 | */ | ||
2143 | object = page->freelist; | ||
2144 | page->freelist = NULL; | ||
2145 | page->inuse = page->objects; | ||
2146 | |||
2147 | stat(s, ALLOC_SLAB); | ||
1934 | c->node = page_to_nid(page); | 2148 | c->node = page_to_nid(page); |
1935 | c->page = page; | 2149 | c->page = page; |
2150 | |||
2151 | if (kmem_cache_debug(s)) | ||
2152 | goto debug; | ||
1936 | goto load_freelist; | 2153 | goto load_freelist; |
1937 | } | 2154 | } |
1938 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) | 2155 | if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) |
1939 | slab_out_of_memory(s, gfpflags, node); | 2156 | slab_out_of_memory(s, gfpflags, node); |
1940 | local_irq_restore(flags); | 2157 | local_irq_restore(flags); |
1941 | return NULL; | 2158 | return NULL; |
2159 | |||
1942 | debug: | 2160 | debug: |
1943 | if (!alloc_debug_processing(s, page, object, addr)) | 2161 | if (!object || !alloc_debug_processing(s, page, object, addr)) |
1944 | goto another_slab; | 2162 | goto new_slab; |
1945 | 2163 | ||
1946 | page->inuse++; | 2164 | c->freelist = get_freepointer(s, object); |
1947 | page->freelist = get_freepointer(s, object); | ||
1948 | deactivate_slab(s, c); | 2165 | deactivate_slab(s, c); |
1949 | c->page = NULL; | 2166 | c->page = NULL; |
1950 | c->node = NUMA_NO_NODE; | 2167 | c->node = NUMA_NO_NODE; |
@@ -2096,40 +2313,75 @@ static void __slab_free(struct kmem_cache *s, struct page *page, | |||
2096 | { | 2313 | { |
2097 | void *prior; | 2314 | void *prior; |
2098 | void **object = (void *)x; | 2315 | void **object = (void *)x; |
2099 | unsigned long flags; | 2316 | int was_frozen; |
2317 | int inuse; | ||
2318 | struct page new; | ||
2319 | unsigned long counters; | ||
2320 | struct kmem_cache_node *n = NULL; | ||
2321 | unsigned long uninitialized_var(flags); | ||
2100 | 2322 | ||
2101 | local_irq_save(flags); | ||
2102 | slab_lock(page); | ||
2103 | stat(s, FREE_SLOWPATH); | 2323 | stat(s, FREE_SLOWPATH); |
2104 | 2324 | ||
2105 | if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) | 2325 | if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr)) |
2106 | goto out_unlock; | 2326 | return; |
2107 | 2327 | ||
2108 | prior = page->freelist; | 2328 | do { |
2109 | set_freepointer(s, object, prior); | 2329 | prior = page->freelist; |
2110 | page->freelist = object; | 2330 | counters = page->counters; |
2111 | page->inuse--; | 2331 | set_freepointer(s, object, prior); |
2332 | new.counters = counters; | ||
2333 | was_frozen = new.frozen; | ||
2334 | new.inuse--; | ||
2335 | if ((!new.inuse || !prior) && !was_frozen && !n) { | ||
2336 | n = get_node(s, page_to_nid(page)); | ||
2337 | /* | ||
2338 | * Speculatively acquire the list_lock. | ||
2339 | * If the cmpxchg does not succeed then we may | ||
2340 | * drop the list_lock without any processing. | ||
2341 | * | ||
2342 | * Otherwise the list_lock will synchronize with | ||
2343 | * other processors updating the list of slabs. | ||
2344 | */ | ||
2345 | spin_lock_irqsave(&n->list_lock, flags); | ||
2346 | } | ||
2347 | inuse = new.inuse; | ||
2112 | 2348 | ||
2113 | if (unlikely(PageSlubFrozen(page))) { | 2349 | } while (!cmpxchg_double_slab(s, page, |
2114 | stat(s, FREE_FROZEN); | 2350 | prior, counters, |
2115 | goto out_unlock; | 2351 | object, new.counters, |
2116 | } | 2352 | "__slab_free")); |
2117 | 2353 | ||
2118 | if (unlikely(!page->inuse)) | 2354 | if (likely(!n)) { |
2119 | goto slab_empty; | 2355 | /* |
2356 | * The list lock was not taken therefore no list | ||
2357 | * activity can be necessary. | ||
2358 | */ | ||
2359 | if (was_frozen) | ||
2360 | stat(s, FREE_FROZEN); | ||
2361 | return; | ||
2362 | } | ||
2120 | 2363 | ||
2121 | /* | 2364 | /* |
2122 | * Objects left in the slab. If it was not on the partial list before | 2365 | * was_frozen may have been set after we acquired the list_lock in |
2123 | * then add it. | 2366 | * an earlier loop. So we need to check it here again. |
2124 | */ | 2367 | */ |
2125 | if (unlikely(!prior)) { | 2368 | if (was_frozen) |
2126 | add_partial(get_node(s, page_to_nid(page)), page, 1); | 2369 | stat(s, FREE_FROZEN); |
2127 | stat(s, FREE_ADD_PARTIAL); | 2370 | else { |
2128 | } | 2371 | if (unlikely(!inuse && n->nr_partial > s->min_partial)) |
2372 | goto slab_empty; | ||
2129 | 2373 | ||
2130 | out_unlock: | 2374 | /* |
2131 | slab_unlock(page); | 2375 | * Objects left in the slab. If it was not on the partial list before |
2132 | local_irq_restore(flags); | 2376 | * then add it. |
2377 | */ | ||
2378 | if (unlikely(!prior)) { | ||
2379 | remove_full(s, page); | ||
2380 | add_partial(n, page, 0); | ||
2381 | stat(s, FREE_ADD_PARTIAL); | ||
2382 | } | ||
2383 | } | ||
2384 | spin_unlock_irqrestore(&n->list_lock, flags); | ||
2133 | return; | 2385 | return; |
2134 | 2386 | ||
2135 | slab_empty: | 2387 | slab_empty: |
@@ -2137,11 +2389,11 @@ slab_empty: | |||
2137 | /* | 2389 | /* |
2138 | * Slab still on the partial list. | 2390 | * Slab still on the partial list. |
2139 | */ | 2391 | */ |
2140 | remove_partial(s, page); | 2392 | remove_partial(n, page); |
2141 | stat(s, FREE_REMOVE_PARTIAL); | 2393 | stat(s, FREE_REMOVE_PARTIAL); |
2142 | } | 2394 | } |
2143 | slab_unlock(page); | 2395 | |
2144 | local_irq_restore(flags); | 2396 | spin_unlock_irqrestore(&n->list_lock, flags); |
2145 | stat(s, FREE_SLAB); | 2397 | stat(s, FREE_SLAB); |
2146 | discard_slab(s, page); | 2398 | discard_slab(s, page); |
2147 | } | 2399 | } |
@@ -2415,7 +2667,6 @@ static void early_kmem_cache_node_alloc(int node) | |||
2415 | { | 2667 | { |
2416 | struct page *page; | 2668 | struct page *page; |
2417 | struct kmem_cache_node *n; | 2669 | struct kmem_cache_node *n; |
2418 | unsigned long flags; | ||
2419 | 2670 | ||
2420 | BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); | 2671 | BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); |
2421 | 2672 | ||
@@ -2433,6 +2684,7 @@ static void early_kmem_cache_node_alloc(int node) | |||
2433 | BUG_ON(!n); | 2684 | BUG_ON(!n); |
2434 | page->freelist = get_freepointer(kmem_cache_node, n); | 2685 | page->freelist = get_freepointer(kmem_cache_node, n); |
2435 | page->inuse++; | 2686 | page->inuse++; |
2687 | page->frozen = 0; | ||
2436 | kmem_cache_node->node[node] = n; | 2688 | kmem_cache_node->node[node] = n; |
2437 | #ifdef CONFIG_SLUB_DEBUG | 2689 | #ifdef CONFIG_SLUB_DEBUG |
2438 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); | 2690 | init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); |
@@ -2441,14 +2693,7 @@ static void early_kmem_cache_node_alloc(int node) | |||
2441 | init_kmem_cache_node(n, kmem_cache_node); | 2693 | init_kmem_cache_node(n, kmem_cache_node); |
2442 | inc_slabs_node(kmem_cache_node, node, page->objects); | 2694 | inc_slabs_node(kmem_cache_node, node, page->objects); |
2443 | 2695 | ||
2444 | /* | ||
2445 | * lockdep requires consistent irq usage for each lock | ||
2446 | * so even though there cannot be a race this early in | ||
2447 | * the boot sequence, we still disable irqs. | ||
2448 | */ | ||
2449 | local_irq_save(flags); | ||
2450 | add_partial(n, page, 0); | 2696 | add_partial(n, page, 0); |
2451 | local_irq_restore(flags); | ||
2452 | } | 2697 | } |
2453 | 2698 | ||
2454 | static void free_kmem_cache_nodes(struct kmem_cache *s) | 2699 | static void free_kmem_cache_nodes(struct kmem_cache *s) |
@@ -2654,6 +2899,12 @@ static int kmem_cache_open(struct kmem_cache *s, | |||
2654 | } | 2899 | } |
2655 | } | 2900 | } |
2656 | 2901 | ||
2902 | #ifdef CONFIG_CMPXCHG_DOUBLE | ||
2903 | if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) | ||
2904 | /* Enable fast mode */ | ||
2905 | s->flags |= __CMPXCHG_DOUBLE; | ||
2906 | #endif | ||
2907 | |||
2657 | /* | 2908 | /* |
2658 | * The larger the object size is, the more pages we want on the partial | 2909 | * The larger the object size is, the more pages we want on the partial |
2659 | * list to avoid pounding the page allocator excessively. | 2910 | * list to avoid pounding the page allocator excessively. |
@@ -2726,7 +2977,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) | |||
2726 | spin_lock_irqsave(&n->list_lock, flags); | 2977 | spin_lock_irqsave(&n->list_lock, flags); |
2727 | list_for_each_entry_safe(page, h, &n->partial, lru) { | 2978 | list_for_each_entry_safe(page, h, &n->partial, lru) { |
2728 | if (!page->inuse) { | 2979 | if (!page->inuse) { |
2729 | __remove_partial(n, page); | 2980 | remove_partial(n, page); |
2730 | discard_slab(s, page); | 2981 | discard_slab(s, page); |
2731 | } else { | 2982 | } else { |
2732 | list_slab_objects(s, page, | 2983 | list_slab_objects(s, page, |
@@ -3094,14 +3345,8 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
3094 | * list_lock. page->inuse here is the upper limit. | 3345 | * list_lock. page->inuse here is the upper limit. |
3095 | */ | 3346 | */ |
3096 | list_for_each_entry_safe(page, t, &n->partial, lru) { | 3347 | list_for_each_entry_safe(page, t, &n->partial, lru) { |
3097 | if (!page->inuse && slab_trylock(page)) { | 3348 | if (!page->inuse) { |
3098 | /* | 3349 | remove_partial(n, page); |
3099 | * Must hold slab lock here because slab_free | ||
3100 | * may have freed the last object and be | ||
3101 | * waiting to release the slab. | ||
3102 | */ | ||
3103 | __remove_partial(n, page); | ||
3104 | slab_unlock(page); | ||
3105 | discard_slab(s, page); | 3350 | discard_slab(s, page); |
3106 | } else { | 3351 | } else { |
3107 | list_move(&page->lru, | 3352 | list_move(&page->lru, |
@@ -3689,12 +3934,9 @@ static int validate_slab(struct kmem_cache *s, struct page *page, | |||
3689 | static void validate_slab_slab(struct kmem_cache *s, struct page *page, | 3934 | static void validate_slab_slab(struct kmem_cache *s, struct page *page, |
3690 | unsigned long *map) | 3935 | unsigned long *map) |
3691 | { | 3936 | { |
3692 | if (slab_trylock(page)) { | 3937 | slab_lock(page); |
3693 | validate_slab(s, page, map); | 3938 | validate_slab(s, page, map); |
3694 | slab_unlock(page); | 3939 | slab_unlock(page); |
3695 | } else | ||
3696 | printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", | ||
3697 | s->name, page); | ||
3698 | } | 3940 | } |
3699 | 3941 | ||
3700 | static int validate_slab_node(struct kmem_cache *s, | 3942 | static int validate_slab_node(struct kmem_cache *s, |
@@ -4342,8 +4584,10 @@ static ssize_t sanity_checks_store(struct kmem_cache *s, | |||
4342 | const char *buf, size_t length) | 4584 | const char *buf, size_t length) |
4343 | { | 4585 | { |
4344 | s->flags &= ~SLAB_DEBUG_FREE; | 4586 | s->flags &= ~SLAB_DEBUG_FREE; |
4345 | if (buf[0] == '1') | 4587 | if (buf[0] == '1') { |
4588 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4346 | s->flags |= SLAB_DEBUG_FREE; | 4589 | s->flags |= SLAB_DEBUG_FREE; |
4590 | } | ||
4347 | return length; | 4591 | return length; |
4348 | } | 4592 | } |
4349 | SLAB_ATTR(sanity_checks); | 4593 | SLAB_ATTR(sanity_checks); |
@@ -4357,8 +4601,10 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, | |||
4357 | size_t length) | 4601 | size_t length) |
4358 | { | 4602 | { |
4359 | s->flags &= ~SLAB_TRACE; | 4603 | s->flags &= ~SLAB_TRACE; |
4360 | if (buf[0] == '1') | 4604 | if (buf[0] == '1') { |
4605 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4361 | s->flags |= SLAB_TRACE; | 4606 | s->flags |= SLAB_TRACE; |
4607 | } | ||
4362 | return length; | 4608 | return length; |
4363 | } | 4609 | } |
4364 | SLAB_ATTR(trace); | 4610 | SLAB_ATTR(trace); |
@@ -4375,8 +4621,10 @@ static ssize_t red_zone_store(struct kmem_cache *s, | |||
4375 | return -EBUSY; | 4621 | return -EBUSY; |
4376 | 4622 | ||
4377 | s->flags &= ~SLAB_RED_ZONE; | 4623 | s->flags &= ~SLAB_RED_ZONE; |
4378 | if (buf[0] == '1') | 4624 | if (buf[0] == '1') { |
4625 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4379 | s->flags |= SLAB_RED_ZONE; | 4626 | s->flags |= SLAB_RED_ZONE; |
4627 | } | ||
4380 | calculate_sizes(s, -1); | 4628 | calculate_sizes(s, -1); |
4381 | return length; | 4629 | return length; |
4382 | } | 4630 | } |
@@ -4394,8 +4642,10 @@ static ssize_t poison_store(struct kmem_cache *s, | |||
4394 | return -EBUSY; | 4642 | return -EBUSY; |
4395 | 4643 | ||
4396 | s->flags &= ~SLAB_POISON; | 4644 | s->flags &= ~SLAB_POISON; |
4397 | if (buf[0] == '1') | 4645 | if (buf[0] == '1') { |
4646 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4398 | s->flags |= SLAB_POISON; | 4647 | s->flags |= SLAB_POISON; |
4648 | } | ||
4399 | calculate_sizes(s, -1); | 4649 | calculate_sizes(s, -1); |
4400 | return length; | 4650 | return length; |
4401 | } | 4651 | } |
@@ -4413,8 +4663,10 @@ static ssize_t store_user_store(struct kmem_cache *s, | |||
4413 | return -EBUSY; | 4663 | return -EBUSY; |
4414 | 4664 | ||
4415 | s->flags &= ~SLAB_STORE_USER; | 4665 | s->flags &= ~SLAB_STORE_USER; |
4416 | if (buf[0] == '1') | 4666 | if (buf[0] == '1') { |
4667 | s->flags &= ~__CMPXCHG_DOUBLE; | ||
4417 | s->flags |= SLAB_STORE_USER; | 4668 | s->flags |= SLAB_STORE_USER; |
4669 | } | ||
4418 | calculate_sizes(s, -1); | 4670 | calculate_sizes(s, -1); |
4419 | return length; | 4671 | return length; |
4420 | } | 4672 | } |
@@ -4579,6 +4831,7 @@ STAT_ATTR(FREE_REMOVE_PARTIAL, free_remove_partial); | |||
4579 | STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); | 4831 | STAT_ATTR(ALLOC_FROM_PARTIAL, alloc_from_partial); |
4580 | STAT_ATTR(ALLOC_SLAB, alloc_slab); | 4832 | STAT_ATTR(ALLOC_SLAB, alloc_slab); |
4581 | STAT_ATTR(ALLOC_REFILL, alloc_refill); | 4833 | STAT_ATTR(ALLOC_REFILL, alloc_refill); |
4834 | STAT_ATTR(ALLOC_NODE_MISMATCH, alloc_node_mismatch); | ||
4582 | STAT_ATTR(FREE_SLAB, free_slab); | 4835 | STAT_ATTR(FREE_SLAB, free_slab); |
4583 | STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); | 4836 | STAT_ATTR(CPUSLAB_FLUSH, cpuslab_flush); |
4584 | STAT_ATTR(DEACTIVATE_FULL, deactivate_full); | 4837 | STAT_ATTR(DEACTIVATE_FULL, deactivate_full); |
@@ -4586,7 +4839,10 @@ STAT_ATTR(DEACTIVATE_EMPTY, deactivate_empty); | |||
4586 | STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); | 4839 | STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head); |
4587 | STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); | 4840 | STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail); |
4588 | STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); | 4841 | STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees); |
4842 | STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); | ||
4589 | STAT_ATTR(ORDER_FALLBACK, order_fallback); | 4843 | STAT_ATTR(ORDER_FALLBACK, order_fallback); |
4844 | STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); | ||
4845 | STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); | ||
4590 | #endif | 4846 | #endif |
4591 | 4847 | ||
4592 | static struct attribute *slab_attrs[] = { | 4848 | static struct attribute *slab_attrs[] = { |
@@ -4636,6 +4892,7 @@ static struct attribute *slab_attrs[] = { | |||
4636 | &alloc_from_partial_attr.attr, | 4892 | &alloc_from_partial_attr.attr, |
4637 | &alloc_slab_attr.attr, | 4893 | &alloc_slab_attr.attr, |
4638 | &alloc_refill_attr.attr, | 4894 | &alloc_refill_attr.attr, |
4895 | &alloc_node_mismatch_attr.attr, | ||
4639 | &free_slab_attr.attr, | 4896 | &free_slab_attr.attr, |
4640 | &cpuslab_flush_attr.attr, | 4897 | &cpuslab_flush_attr.attr, |
4641 | &deactivate_full_attr.attr, | 4898 | &deactivate_full_attr.attr, |
@@ -4643,7 +4900,10 @@ static struct attribute *slab_attrs[] = { | |||
4643 | &deactivate_to_head_attr.attr, | 4900 | &deactivate_to_head_attr.attr, |
4644 | &deactivate_to_tail_attr.attr, | 4901 | &deactivate_to_tail_attr.attr, |
4645 | &deactivate_remote_frees_attr.attr, | 4902 | &deactivate_remote_frees_attr.attr, |
4903 | &deactivate_bypass_attr.attr, | ||
4646 | &order_fallback_attr.attr, | 4904 | &order_fallback_attr.attr, |
4905 | &cmpxchg_double_fail_attr.attr, | ||
4906 | &cmpxchg_double_cpu_fail_attr.attr, | ||
4647 | #endif | 4907 | #endif |
4648 | #ifdef CONFIG_FAILSLAB | 4908 | #ifdef CONFIG_FAILSLAB |
4649 | &failslab_attr.attr, | 4909 | &failslab_attr.attr, |