-rw-r--r-- | mm/slub.c | 242 |
1 file changed, 119 insertions, 123 deletions
@@ -66,11 +66,11 @@ | |||
66 | * SLUB assigns one slab for allocation to each processor. | 66 | * SLUB assigns one slab for allocation to each processor. |
67 | * Allocations only occur from these slabs called cpu slabs. | 67 | * Allocations only occur from these slabs called cpu slabs. |
68 | * | 68 | * |
69 | * Slabs with free elements are kept on a partial list. | 69 | * Slabs with free elements are kept on a partial list and during regular |
70 | * There is no list for full slabs. If an object in a full slab is | 70 | * operations no list for full slabs is used. If an object in a full slab is |
71 | * freed then the slab will show up again on the partial lists. | 71 | * freed then the slab will show up again on the partial lists. |
72 | * Otherwise there is no need to track full slabs unless we have to | 72 | * We track full slabs for debugging purposes though because otherwise we |
73 | * track full slabs for debugging purposes. | 73 | * cannot scan all objects. |
74 | * | 74 | * |
75 | * Slabs are freed when they become empty. Teardown and setup is | 75 | * Slabs are freed when they become empty. Teardown and setup is |
76 | * minimal so we rely on the page allocators per cpu caches for | 76 | * minimal so we rely on the page allocators per cpu caches for |
@@ -92,8 +92,8 @@ | |||
92 | * | 92 | * |
93 | * - The per cpu array is updated for each new slab and is a remote | 93 | * - The per cpu array is updated for each new slab and is a remote |
94 | * cacheline for most nodes. This could become a bouncing cacheline given | 94 | * cacheline for most nodes. This could become a bouncing cacheline given |
95 | * enough frequent updates. There are 16 pointers in a cacheline.so at | 95 | * enough frequent updates. There are 16 pointers in a cacheline, so at |
96 | * max 16 cpus could compete. Likely okay. | 96 | * max 16 cpus could compete for the cacheline which may be okay. |
97 | * | 97 | * |
98 | * - Support PAGE_ALLOC_DEBUG. Should be easy to do. | 98 | * - Support PAGE_ALLOC_DEBUG. Should be easy to do. |
99 | * | 99 | * |
@@ -137,6 +137,7 @@ | |||
137 | 137 | ||
138 | #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ | 138 | #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ |
139 | SLAB_POISON | SLAB_STORE_USER) | 139 | SLAB_POISON | SLAB_STORE_USER) |
140 | |||
140 | /* | 141 | /* |
141 | * Set of flags that will prevent slab merging | 142 | * Set of flags that will prevent slab merging |
142 | */ | 143 | */ |
@@ -171,7 +172,7 @@ static struct notifier_block slab_notifier; | |||
171 | static enum { | 172 | static enum { |
172 | DOWN, /* No slab functionality available */ | 173 | DOWN, /* No slab functionality available */ |
173 | PARTIAL, /* kmem_cache_open() works but kmalloc does not */ | 174 | PARTIAL, /* kmem_cache_open() works but kmalloc does not */ |
174 | UP, /* Everything works */ | 175 | UP, /* Everything works but does not show up in sysfs */ |
175 | SYSFS /* Sysfs up */ | 176 | SYSFS /* Sysfs up */ |
176 | } slab_state = DOWN; | 177 | } slab_state = DOWN; |
177 | 178 | ||
@@ -245,9 +246,9 @@ static void print_section(char *text, u8 *addr, unsigned int length) | |||
245 | /* | 246 | /* |
246 | * Slow version of get and set free pointer. | 247 | * Slow version of get and set free pointer. |
247 | * | 248 | * |
248 | * This requires touching the cache lines of kmem_cache. | 249 | * This version requires touching the cache lines of kmem_cache which |
249 | * The offset can also be obtained from the page. In that | 250 | * we avoid doing in the fast alloc/free paths. There we obtain the offset |
250 | * case it is in the cacheline that we already need to touch. | 251 | * from the page struct. |
251 | */ | 252 | */ |
252 | static void *get_freepointer(struct kmem_cache *s, void *object) | 253 | static void *get_freepointer(struct kmem_cache *s, void *object) |
253 | { | 254 | { |
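For reference, the slow helpers above boil down to a single pointer load or store at s->offset, while the hot paths use the offset kept in the page struct so no extra kmem_cache cacheline is touched. A minimal sketch of the idea (illustrative, not necessarily the exact function bodies):

	static void *get_freepointer(struct kmem_cache *s, void *object)
	{
		/* The free pointer lives s->offset bytes into the object. */
		return *(void **)(object + s->offset);
	}

	static void set_freepointer(struct kmem_cache *s, void *object, void *fp)
	{
		*(void **)(object + s->offset) = fp;
	}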
@@ -429,26 +430,34 @@ static inline int check_valid_pointer(struct kmem_cache *s, | |||
429 | * Bytes of the object to be managed. | 430 | * Bytes of the object to be managed. |
430 | * If the freepointer may overlay the object then the free | 431 | * If the freepointer may overlay the object then the free |
431 | * pointer is the first word of the object. | 432 | * pointer is the first word of the object. |
433 | * | ||
432 | * Poisoning uses 0x6b (POISON_FREE) and the last byte is | 434 | * Poisoning uses 0x6b (POISON_FREE) and the last byte is |
433 | * 0xa5 (POISON_END) | 435 | * 0xa5 (POISON_END) |
434 | * | 436 | * |
435 | * object + s->objsize | 437 | * object + s->objsize |
436 | * Padding to reach word boundary. This is also used for Redzoning. | 438 | * Padding to reach word boundary. This is also used for Redzoning. |
437 | * Padding is extended to word size if Redzoning is enabled | 439 | * Padding is extended by another word if Redzoning is enabled and |
438 | * and objsize == inuse. | 440 | * objsize == inuse. |
441 | * | ||
439 | * We fill with 0xbb (RED_INACTIVE) for inactive objects and with | 442 | * We fill with 0xbb (RED_INACTIVE) for inactive objects and with |
440 | * 0xcc (RED_ACTIVE) for objects in use. | 443 | * 0xcc (RED_ACTIVE) for objects in use. |
441 | * | 444 | * |
442 | * object + s->inuse | 445 | * object + s->inuse |
446 | * Meta data starts here. | ||
447 | * | ||
443 | * A. Free pointer (if we cannot overwrite object on free) | 448 | * A. Free pointer (if we cannot overwrite object on free) |
444 | * B. Tracking data for SLAB_STORE_USER | 449 | * B. Tracking data for SLAB_STORE_USER |
445 | * C. Padding to reach required alignment boundary | 450 | * C. Padding to reach required alignment boundary or at minimum |
446 | * Padding is done using 0x5a (POISON_INUSE) | 451 | * one word if debugging is on to be able to detect writes |
452 | * before the word boundary. | ||
453 | * | ||
454 | * Padding is done using 0x5a (POISON_INUSE) | ||
447 | * | 455 | * |
448 | * object + s->size | 456 | * object + s->size |
457 | * Nothing is used beyond s->size. | ||
449 | * | 458 | * |
450 | * If slabcaches are merged then the objsize and inuse boundaries are to | 459 | * If slabcaches are merged then the objsize and inuse boundaries are mostly |
451 | * be ignored. And therefore no slab options that rely on these boundaries | 460 | * ignored. And therefore no slab options that rely on these boundaries |
452 | * may be used with merged slabcaches. | 461 | * may be used with merged slabcaches. |
453 | */ | 462 | */ |
454 | 463 | ||
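Condensed, the layout rules above give the following picture (schematic only; the actual offsets are the per-cache values computed by calculate_sizes()):

	/*
	 * object + 0          payload, POISON_FREE (0x6b) ... POISON_END (0xa5) when poisoned
	 * object + objsize    word padding / red zone, RED_INACTIVE (0xbb) or RED_ACTIVE (0xcc)
	 * object + inuse      free pointer (only if the object cannot be overwritten on free),
	 *                     then SLAB_STORE_USER tracking data,
	 *                     then POISON_INUSE (0x5a) padding up to the aligned size
	 * object + size       start of the next object
	 */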
@@ -574,8 +583,7 @@ static int check_object(struct kmem_cache *s, struct page *page, | |||
574 | /* | 583 | /* |
575 | * No choice but to zap it and thus lose the remainder | 584 | * No choice but to zap it and thus lose the remainder |
576 | * of the free objects in this slab. May cause | 585 | * of the free objects in this slab. May cause |
577 | * another error because the object count maybe | 586 | * another error because the object count is now wrong. |
578 | * wrong now. | ||
579 | */ | 587 | */ |
580 | set_freepointer(s, p, NULL); | 588 | set_freepointer(s, p, NULL); |
581 | return 0; | 589 | return 0; |
@@ -615,9 +623,8 @@ static int check_slab(struct kmem_cache *s, struct page *page) | |||
615 | } | 623 | } |
616 | 624 | ||
617 | /* | 625 | /* |
618 | * Determine if a certain object on a page is on the freelist and | 626 | * Determine if a certain object on a page is on the freelist. Must hold the |
619 | * therefore free. Must hold the slab lock for cpu slabs to | 627 | * slab lock to guarantee that the chains are in a consistent state. |
620 | * guarantee that the chains are consistent. | ||
621 | */ | 628 | */ |
622 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) | 629 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) |
623 | { | 630 | { |
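The body of on_freelist() is essentially a bounded walk of the freelist chain under the slab lock; a simplified sketch (the real function also validates every pointer and repairs corrupted chains):

	void *fp = page->freelist;
	int nr = 0;

	while (fp && nr <= s->objects) {
		if (fp == search)
			return 1;		/* object sits on the freelist, i.e. is free */
		fp = get_freepointer(s, fp);
		nr++;
	}
	return 0;				/* object is allocated */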
@@ -663,7 +670,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) | |||
663 | } | 670 | } |
664 | 671 | ||
665 | /* | 672 | /* |
666 | * Tracking of fully allocated slabs for debugging | 673 | * Tracking of fully allocated slabs for debugging purposes. |
667 | */ | 674 | */ |
668 | static void add_full(struct kmem_cache_node *n, struct page *page) | 675 | static void add_full(struct kmem_cache_node *n, struct page *page) |
669 | { | 676 | { |
@@ -714,7 +721,7 @@ bad: | |||
714 | /* | 721 | /* |
715 | * If this is a slab page then let's do the best we can | 722 | * If this is a slab page then let's do the best we can |
716 | * to avoid issues in the future. Marking all objects | 723 | * to avoid issues in the future. Marking all objects |
717 | * as used avoids touching the remainder. | 724 | * as used avoids touching the remaining objects. |
718 | */ | 725 | */ |
719 | printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", | 726 | printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", |
720 | s->name, page); | 727 | s->name, page); |
@@ -970,9 +977,9 @@ static void remove_partial(struct kmem_cache *s, | |||
970 | } | 977 | } |
971 | 978 | ||
972 | /* | 979 | /* |
973 | * Lock page and remove it from the partial list | 980 | * Lock slab and remove from the partial list. |
974 | * | 981 | * |
975 | * Must hold list_lock | 982 | * Must hold list_lock. |
976 | */ | 983 | */ |
977 | static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page) | 984 | static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page) |
978 | { | 985 | { |
@@ -985,7 +992,7 @@ static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page) | |||
985 | } | 992 | } |
986 | 993 | ||
987 | /* | 994 | /* |
988 | * Try to get a partial slab from a specific node | 995 | * Try to allocate a partial slab from a specific node. |
989 | */ | 996 | */ |
990 | static struct page *get_partial_node(struct kmem_cache_node *n) | 997 | static struct page *get_partial_node(struct kmem_cache_node *n) |
991 | { | 998 | { |
@@ -994,7 +1001,8 @@ static struct page *get_partial_node(struct kmem_cache_node *n) | |||
994 | /* | 1001 | /* |
995 | * Racy check. If we mistakenly see no partial slabs then we | 1002 | * Racy check. If we mistakenly see no partial slabs then we |
996 | * just allocate an empty slab. If we mistakenly try to get a | 1003 | * just allocate an empty slab. If we mistakenly try to get a |
997 | * partial slab then get_partials() will return NULL. | 1004 | * partial slab and there is none available then get_partials() |
1005 | * will return NULL. | ||
998 | */ | 1006 | */ |
999 | if (!n || !n->nr_partial) | 1007 | if (!n || !n->nr_partial) |
1000 | return NULL; | 1008 | return NULL; |
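The rest of get_partial_node() (not shown in this hunk) is a plain scan of the node's partial list; roughly, with error handling trimmed:

	spin_lock(&n->list_lock);
	list_for_each_entry(page, &n->partial, lru)
		if (lock_and_del_slab(n, page))
			goto out;		/* got one; the slab is now locked */
	page = NULL;				/* list emptied under us, see the racy check above */
out:
	spin_unlock(&n->list_lock);
	return page;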
@@ -1010,8 +1018,7 @@ out: | |||
1010 | } | 1018 | } |
1011 | 1019 | ||
1012 | /* | 1020 | /* |
1013 | * Get a page from somewhere. Search in increasing NUMA | 1021 | * Get a page from somewhere. Search in increasing NUMA distances. |
1014 | * distances. | ||
1015 | */ | 1022 | */ |
1016 | static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | 1023 | static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) |
1017 | { | 1024 | { |
@@ -1021,24 +1028,22 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1021 | struct page *page; | 1028 | struct page *page; |
1022 | 1029 | ||
1023 | /* | 1030 | /* |
1024 | * The defrag ratio allows to configure the tradeoffs between | 1031 | * The defrag ratio allows a configuration of the tradeoffs between |
1025 | * inter node defragmentation and node local allocations. | 1032 | * inter node defragmentation and node local allocations. A lower |
1026 | * A lower defrag_ratio increases the tendency to do local | 1033 | * defrag_ratio increases the tendency to do local allocations |
1027 | * allocations instead of scanning throught the partial | 1034 | * instead of attempting to obtain partial slabs from other nodes. |
1028 | * lists on other nodes. | ||
1029 | * | 1035 | * |
1030 | * If defrag_ratio is set to 0 then kmalloc() always | 1036 | * If the defrag_ratio is set to 0 then kmalloc() always |
1031 | * returns node local objects. If its higher then kmalloc() | 1037 | * returns node local objects. If the ratio is higher then kmalloc() |
1032 | * may return off node objects in order to avoid fragmentation. | 1038 | * may return off node objects because partial slabs are obtained |
1033 | * | 1039 | * from other nodes and filled up. |
1034 | * A higher ratio means slabs may be taken from other nodes | ||
1035 | * thus reducing the number of partial slabs on those nodes. | ||
1036 | * | 1040 | * |
1037 | * If /sys/slab/xx/defrag_ratio is set to 100 (which makes | 1041 | * If /sys/slab/xx/defrag_ratio is set to 100 (which makes |
1038 | * defrag_ratio = 1000) then every (well almost) allocation | 1042 | * defrag_ratio = 1000) then every (well almost) allocation will |
1039 | * will first attempt to defrag slab caches on other nodes. This | 1043 | * first attempt to defrag slab caches on other nodes. This means |
1040 | * means scanning over all nodes to look for partial slabs which | 1044 | * scanning over all nodes to look for partial slabs which may be |
1041 | * may be a bit expensive to do on every slab allocation. | 1045 | * expensive if we do it every time we are trying to find a slab |
1046 | * with available objects. | ||
1042 | */ | 1047 | */ |
1043 | if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) | 1048 | if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) |
1044 | return NULL; | 1049 | return NULL; |
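To put the check above into numbers (illustration only, treating get_cycles() as a cheap pseudo random source): with defrag_ratio = 100, roughly 100 out of every 1024 requests that reach this point go on to search remote nodes for partial slabs; the rest return NULL and fall back to a node local slab.

	/* Stay node local most of the time: only about defrag_ratio out of
	 * every 1024 requests are allowed to pull partial slabs from
	 * other nodes. */
	if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio)
		return NULL;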
@@ -1098,11 +1103,12 @@ static void putback_slab(struct kmem_cache *s, struct page *page) | |||
1098 | } else { | 1103 | } else { |
1099 | if (n->nr_partial < MIN_PARTIAL) { | 1104 | if (n->nr_partial < MIN_PARTIAL) { |
1100 | /* | 1105 | /* |
1101 | * Adding an empty page to the partial slabs in order | 1106 | * Adding an empty slab to the partial slabs in order |
1102 | * to avoid page allocator overhead. This page needs to | 1107 | * to avoid page allocator overhead. This slab needs |
1103 | * come after all the others that are not fully empty | 1108 | * to come after the other slabs with objects in |
1104 | * in order to make sure that we do maximum | 1109 | * order to fill them up. That way the size of the |
1105 | * defragmentation. | 1110 | * partial list stays small. kmem_cache_shrink can |
1111 | * reclaim empty slabs from the partial list. | ||
1106 | */ | 1112 | */ |
1107 | add_partial_tail(n, page); | 1113 | add_partial_tail(n, page); |
1108 | slab_unlock(page); | 1114 | slab_unlock(page); |
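The head versus tail distinction the comment relies on is just which end of n->partial the slab is queued on. A sketch (the _sketch names are mine and locking is elided, so this is not the exact pair of helpers):

	static void add_partial_sketch(struct kmem_cache_node *n, struct page *page)
	{
		n->nr_partial++;
		list_add(&page->lru, &n->partial);	/* slabs with objects: used first */
	}

	static void add_partial_tail_sketch(struct kmem_cache_node *n, struct page *page)
	{
		n->nr_partial++;
		list_add_tail(&page->lru, &n->partial);	/* empty slabs: filled up last */
	}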
@@ -1170,7 +1176,7 @@ static void flush_all(struct kmem_cache *s) | |||
1170 | * 1. The page struct | 1176 | * 1. The page struct |
1171 | * 2. The first cacheline of the object to be allocated. | 1177 | * 2. The first cacheline of the object to be allocated. |
1172 | * | 1178 | * |
1173 | * The only cache lines that are read (apart from code) is the | 1179 | * The only other cache lines that are read (apart from code) is the |
1174 | * per cpu array in the kmem_cache struct. | 1180 | * per cpu array in the kmem_cache struct. |
1175 | * | 1181 | * |
1176 | * Fastpath is not possible if we need to get a new slab or have | 1182 | * Fastpath is not possible if we need to get a new slab or have |
@@ -1224,9 +1230,11 @@ have_slab: | |||
1224 | cpu = smp_processor_id(); | 1230 | cpu = smp_processor_id(); |
1225 | if (s->cpu_slab[cpu]) { | 1231 | if (s->cpu_slab[cpu]) { |
1226 | /* | 1232 | /* |
1227 | * Someone else populated the cpu_slab while we enabled | 1233 | * Someone else populated the cpu_slab while we |
1228 | * interrupts, or we have got scheduled on another cpu. | 1234 | * enabled interrupts, or we have gotten scheduled |
1229 | * The page may not be on the requested node. | 1235 | * on another cpu. The page may not be on the |
1236 | * requested node even if __GFP_THISNODE was | ||
1237 | * specified. So we need to recheck. | ||
1230 | */ | 1238 | */ |
1231 | if (node == -1 || | 1239 | if (node == -1 || |
1232 | page_to_nid(s->cpu_slab[cpu]) == node) { | 1240 | page_to_nid(s->cpu_slab[cpu]) == node) { |
@@ -1239,7 +1247,7 @@ have_slab: | |||
1239 | slab_lock(page); | 1247 | slab_lock(page); |
1240 | goto redo; | 1248 | goto redo; |
1241 | } | 1249 | } |
1242 | /* Dump the current slab */ | 1250 | /* New slab does not fit our expectations */ |
1243 | flush_slab(s, s->cpu_slab[cpu], cpu); | 1251 | flush_slab(s, s->cpu_slab[cpu], cpu); |
1244 | } | 1252 | } |
1245 | slab_lock(page); | 1253 | slab_lock(page); |
@@ -1280,7 +1288,8 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); | |||
1280 | * The fastpath only writes the cacheline of the page struct and the first | 1288 | * The fastpath only writes the cacheline of the page struct and the first |
1281 | * cacheline of the object. | 1289 | * cacheline of the object. |
1282 | * | 1290 | * |
1283 | * No special cachelines need to be read | 1291 | * We read the cpu_slab cacheline to check if the slab is the per cpu |
1292 | * slab for this processor. | ||
1284 | */ | 1293 | */ |
1285 | static void slab_free(struct kmem_cache *s, struct page *page, | 1294 | static void slab_free(struct kmem_cache *s, struct page *page, |
1286 | void *x, void *addr) | 1295 | void *x, void *addr) |
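The common case the comment describes amounts to pushing the object back onto the slab's freelist; a rough sketch of that core (debug hooks and the slow paths for slabs that become empty or leave the full state are omitted):

	slab_lock(page);
	prior = page->freelist;
	set_freepointer(s, object, prior);	/* writes into the object itself */
	page->freelist = object;		/* writes the page struct cacheline */
	page->inuse--;
	slab_unlock(page);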
@@ -1325,7 +1334,7 @@ out_unlock: | |||
1325 | slab_empty: | 1334 | slab_empty: |
1326 | if (prior) | 1335 | if (prior) |
1327 | /* | 1336 | /* |
1328 | * Slab on the partial list. | 1337 | * Slab still on the partial list. |
1329 | */ | 1338 | */ |
1330 | remove_partial(s, page); | 1339 | remove_partial(s, page); |
1331 | 1340 | ||
@@ -1374,22 +1383,16 @@ static struct page *get_object_page(const void *x) | |||
1374 | } | 1383 | } |
1375 | 1384 | ||
1376 | /* | 1385 | /* |
1377 | * kmem_cache_open produces objects aligned at "size" and the first object | 1386 | * Object placement in a slab is made very easy because we always start at |
1378 | * is placed at offset 0 in the slab (We have no metainformation on the | 1387 | * offset 0. If we tune the size of the object to the alignment then we can |
1379 | * slab, all slabs are in essence "off slab"). | 1388 | * get the required alignment by putting one properly sized object after |
1380 | * | 1389 | * another. |
1381 | * In order to get the desired alignment one just needs to align the | ||
1382 | * size. | ||
1383 | * | 1390 | * |
1384 | * Notice that the allocation order determines the sizes of the per cpu | 1391 | * Notice that the allocation order determines the sizes of the per cpu |
1385 | * caches. Each processor has always one slab available for allocations. | 1392 | * caches. Each processor has always one slab available for allocations. |
1386 | * Increasing the allocation order reduces the number of times that slabs | 1393 | * Increasing the allocation order reduces the number of times that slabs |
1387 | * must be moved on and off the partial lists and therefore may influence | 1394 | * must be moved on and off the partial lists and is therefore a factor in |
1388 | * locking overhead. | 1395 | * locking overhead. |
1389 | * | ||
1390 | * The offset is used to relocate the free list link in each object. It is | ||
1391 | * therefore possible to move the free list link behind the object. This | ||
1392 | * is necessary for RCU to work properly and also useful for debugging. | ||
1393 | */ | 1396 | */ |
1394 | 1397 | ||
1395 | /* | 1398 | /* |
@@ -1400,15 +1403,11 @@ static struct page *get_object_page(const void *x) | |||
1400 | */ | 1403 | */ |
1401 | static int slub_min_order; | 1404 | static int slub_min_order; |
1402 | static int slub_max_order = DEFAULT_MAX_ORDER; | 1405 | static int slub_max_order = DEFAULT_MAX_ORDER; |
1403 | |||
1404 | /* | ||
1405 | * Minimum number of objects per slab. This is necessary in order to | ||
1406 | * reduce locking overhead. Similar to the queue size in SLAB. | ||
1407 | */ | ||
1408 | static int slub_min_objects = DEFAULT_MIN_OBJECTS; | 1406 | static int slub_min_objects = DEFAULT_MIN_OBJECTS; |
1409 | 1407 | ||
1410 | /* | 1408 | /* |
1411 | * Merge control. If this is set then no merging of slab caches will occur. | 1409 | * Merge control. If this is set then no merging of slab caches will occur. |
1410 | * (Could be removed. This was introduced to pacify the merge skeptics.) | ||
1412 | */ | 1411 | */ |
1413 | static int slub_nomerge; | 1412 | static int slub_nomerge; |
1414 | 1413 | ||
@@ -1422,23 +1421,27 @@ static char *slub_debug_slabs; | |||
1422 | /* | 1421 | /* |
1423 | * Calculate the order of allocation given a slab object size. | 1422 | * Calculate the order of allocation given a slab object size. |
1424 | * | 1423 | * |
1425 | * The order of allocation has significant impact on other elements | 1424 | * The order of allocation has significant impact on performance and other |
1426 | * of the system. Generally order 0 allocations should be preferred | 1425 | * system components. Generally order 0 allocations should be preferred since |
1427 | * since they do not cause fragmentation in the page allocator. Larger | 1426 | * order 0 does not cause fragmentation in the page allocator. Larger objects |
1428 | * objects may have problems with order 0 because there may be too much | 1427 | * can be problematic to put into order 0 slabs because there may be too much |
1429 | * space left unused in a slab. We go to a higher order if more than 1/8th | 1428 | * unused space left. We go to a higher order if more than 1/8th of the slab |
1430 | * of the slab would be wasted. | 1429 | * would be wasted. |
1430 | * | ||
1431 | * In order to reach satisfactory performance we must ensure that a minimum | ||
1432 | * number of objects is in one slab. Otherwise we may generate too much | ||
1433 | * activity on the partial lists which requires taking the list_lock. This is | ||
1434 | * less a concern for large slabs though which are rarely used. | ||
1431 | * | 1435 | * |
1432 | * In order to reach satisfactory performance we must ensure that | 1436 | * slub_max_order specifies the order where we begin to stop considering the |
1433 | * a minimum number of objects is in one slab. Otherwise we may | 1437 | * number of objects in a slab as critical. If we reach slub_max_order then |
1434 | * generate too much activity on the partial lists. This is less a | 1438 | * we try to keep the page order as low as possible. So we accept more waste |
1435 | * concern for large slabs though. slub_max_order specifies the order | 1439 | * of space in favor of a small page order. |
1436 | * where we begin to stop considering the number of objects in a slab. | ||
1437 | * | 1440 | * |
1438 | * Higher order allocations also allow the placement of more objects | 1441 | * Higher order allocations also allow the placement of more objects in a |
1439 | * in a slab and thereby reduce object handling overhead. If the user | 1442 | * slab and thereby reduce object handling overhead. If the user has |
1440 | * has requested a higher mininum order then we start with that one | 1443 | * requested a higher minimum order then we start with that one instead of |
1441 | * instead of zero. | 1444 | * the smallest order which will fit the object. |
1442 | */ | 1445 | */ |
1443 | static int calculate_order(int size) | 1446 | static int calculate_order(int size) |
1444 | { | 1447 | { |
@@ -1458,18 +1461,18 @@ static int calculate_order(int size) | |||
1458 | 1461 | ||
1459 | rem = slab_size % size; | 1462 | rem = slab_size % size; |
1460 | 1463 | ||
1461 | if (rem <= (PAGE_SIZE << order) / 8) | 1464 | if (rem <= slab_size / 8) |
1462 | break; | 1465 | break; |
1463 | 1466 | ||
1464 | } | 1467 | } |
1465 | if (order >= MAX_ORDER) | 1468 | if (order >= MAX_ORDER) |
1466 | return -E2BIG; | 1469 | return -E2BIG; |
1470 | |||
1467 | return order; | 1471 | return order; |
1468 | } | 1472 | } |
1469 | 1473 | ||
1470 | /* | 1474 | /* |
1471 | * Function to figure out which alignment to use from the | 1475 | * Figure out what the alignment of the objects will be. |
1472 | * various ways of specifying it. | ||
1473 | */ | 1476 | */ |
1474 | static unsigned long calculate_alignment(unsigned long flags, | 1477 | static unsigned long calculate_alignment(unsigned long flags, |
1475 | unsigned long align, unsigned long size) | 1478 | unsigned long align, unsigned long size) |
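The order heuristic of calculate_order() described above is easy to try in isolation. The sketch below is a user space approximation (PAGE_SIZE, MAX_ORDER and the min_objects handling are simplifying assumptions, not the kernel's exact logic):

	#include <stdio.h>

	#define PAGE_SIZE 4096UL
	#define MAX_ORDER 11

	static int approx_order(unsigned long size, unsigned long min_objects)
	{
		int order;

		for (order = 0; order < MAX_ORDER; order++) {
			unsigned long slab_size = PAGE_SIZE << order;

			if (slab_size < min_objects * size)
				continue;		/* too few objects per slab */

			if (slab_size % size <= slab_size / 8)
				return order;		/* waste at most 1/8th of the slab */
		}
		return -1;				/* the kernel returns -E2BIG here */
	}

	int main(void)
	{
		/* 700 byte objects: order 0 wastes 4096 % 700 = 596 > 512 bytes,
		 * order 1 wastes 8192 % 700 = 492 <= 1024 bytes, so order 1 is used. */
		printf("order(700) = %d\n", approx_order(700, 4));
		return 0;
	}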
@@ -1624,18 +1627,16 @@ static int calculate_sizes(struct kmem_cache *s) | |||
1624 | size = ALIGN(size, sizeof(void *)); | 1627 | size = ALIGN(size, sizeof(void *)); |
1625 | 1628 | ||
1626 | /* | 1629 | /* |
1627 | * If we are redzoning then check if there is some space between the | 1630 | * If we are Redzoning then check if there is some space between the |
1628 | * end of the object and the free pointer. If not then add an | 1631 | * end of the object and the free pointer. If not then add an |
1629 | * additional word, so that we can establish a redzone between | 1632 | * additional word to have some bytes to store Redzone information. |
1630 | * the object and the freepointer to be able to check for overwrites. | ||
1631 | */ | 1633 | */ |
1632 | if ((flags & SLAB_RED_ZONE) && size == s->objsize) | 1634 | if ((flags & SLAB_RED_ZONE) && size == s->objsize) |
1633 | size += sizeof(void *); | 1635 | size += sizeof(void *); |
1634 | 1636 | ||
1635 | /* | 1637 | /* |
1636 | * With that we have determined how much of the slab is in actual | 1638 | * With that we have determined the number of bytes in actual use |
1637 | * use by the object. This is the potential offset to the free | 1639 | * by the object. This is the potential offset to the free pointer. |
1638 | * pointer. | ||
1639 | */ | 1640 | */ |
1640 | s->inuse = size; | 1641 | s->inuse = size; |
1641 | 1642 | ||
@@ -1669,6 +1670,7 @@ static int calculate_sizes(struct kmem_cache *s) | |||
1669 | * of the object. | 1670 | * of the object. |
1670 | */ | 1671 | */ |
1671 | size += sizeof(void *); | 1672 | size += sizeof(void *); |
1673 | |||
1672 | /* | 1674 | /* |
1673 | * Determine the alignment based on various parameters that the | 1675 | * Determine the alignment based on various parameters that the |
1674 | * user specified and the dynamic determination of cache line size | 1676 | * user specified and the dynamic determination of cache line size |
@@ -1770,7 +1772,6 @@ EXPORT_SYMBOL(kmem_cache_open); | |||
1770 | int kmem_ptr_validate(struct kmem_cache *s, const void *object) | 1772 | int kmem_ptr_validate(struct kmem_cache *s, const void *object) |
1771 | { | 1773 | { |
1772 | struct page * page; | 1774 | struct page * page; |
1773 | void *addr; | ||
1774 | 1775 | ||
1775 | page = get_object_page(object); | 1776 | page = get_object_page(object); |
1776 | 1777 | ||
@@ -1807,7 +1808,8 @@ const char *kmem_cache_name(struct kmem_cache *s) | |||
1807 | EXPORT_SYMBOL(kmem_cache_name); | 1808 | EXPORT_SYMBOL(kmem_cache_name); |
1808 | 1809 | ||
1809 | /* | 1810 | /* |
1810 | * Attempt to free all slabs on a node | 1811 | * Attempt to free all slabs on a node. Return the number of slabs we |
1812 | * were unable to free. | ||
1811 | */ | 1813 | */ |
1812 | static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, | 1814 | static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, |
1813 | struct list_head *list) | 1815 | struct list_head *list) |
@@ -1828,7 +1830,7 @@ static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, | |||
1828 | } | 1830 | } |
1829 | 1831 | ||
1830 | /* | 1832 | /* |
1831 | * Release all resources used by slab cache | 1833 | * Release all resources used by a slab cache. |
1832 | */ | 1834 | */ |
1833 | static int kmem_cache_close(struct kmem_cache *s) | 1835 | static int kmem_cache_close(struct kmem_cache *s) |
1834 | { | 1836 | { |
@@ -2089,13 +2091,14 @@ void kfree(const void *x) | |||
2089 | EXPORT_SYMBOL(kfree); | 2091 | EXPORT_SYMBOL(kfree); |
2090 | 2092 | ||
2091 | /* | 2093 | /* |
2092 | * kmem_cache_shrink removes empty slabs from the partial lists | 2094 | * kmem_cache_shrink removes empty slabs from the partial lists and sorts |
2093 | * and then sorts the partially allocated slabs by the number | 2095 | * the remaining slabs by the number of items in use. The slabs with the |
2094 | * of items in use. The slabs with the most items in use | 2096 | * most items in use come first. New allocations will then fill those up |
2095 | * come first. New allocations will remove these from the | 2097 | * and thus they can be removed from the partial lists. |
2096 | * partial list because they are full. The slabs with the | 2098 | * |
2097 | * least items are placed last. If it happens that the objects | 2099 | * The slabs with the least items are placed last. This results in them |
2098 | * are freed then the page can be returned to the page allocator. | 2100 | * being allocated from last increasing the chance that the last objects |
2101 | * are freed in them. | ||
2099 | */ | 2102 | */ |
2100 | int kmem_cache_shrink(struct kmem_cache *s) | 2103 | int kmem_cache_shrink(struct kmem_cache *s) |
2101 | { | 2104 | { |
@@ -2124,12 +2127,10 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
2124 | spin_lock_irqsave(&n->list_lock, flags); | 2127 | spin_lock_irqsave(&n->list_lock, flags); |
2125 | 2128 | ||
2126 | /* | 2129 | /* |
2127 | * Build lists indexed by the items in use in | 2130 | * Build lists indexed by the items in use in each slab. |
2128 | * each slab or free slabs if empty. | ||
2129 | * | 2131 | * |
2130 | * Note that concurrent frees may occur while | 2132 | * Note that concurrent frees may occur while we hold the |
2131 | * we hold the list_lock. page->inuse here is | 2133 | * list_lock. page->inuse here is the upper limit. |
2132 | * the upper limit. | ||
2133 | */ | 2134 | */ |
2134 | list_for_each_entry_safe(page, t, &n->partial, lru) { | 2135 | list_for_each_entry_safe(page, t, &n->partial, lru) { |
2135 | if (!page->inuse && slab_trylock(page)) { | 2136 | if (!page->inuse && slab_trylock(page)) { |
@@ -2153,8 +2154,8 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
2153 | goto out; | 2154 | goto out; |
2154 | 2155 | ||
2155 | /* | 2156 | /* |
2156 | * Rebuild the partial list with the slabs filled up | 2157 | * Rebuild the partial list with the slabs filled up most |
2157 | * most first and the least used slabs at the end. | 2158 | * first and the least used slabs at the end. |
2158 | */ | 2159 | */ |
2159 | for (i = s->objects - 1; i >= 0; i--) | 2160 | for (i = s->objects - 1; i >= 0; i--) |
2160 | list_splice(slabs_by_inuse + i, n->partial.prev); | 2161 | list_splice(slabs_by_inuse + i, n->partial.prev); |
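The rebuild shown above is a single pass bucket sort keyed on page->inuse; condensed (locking, the slab_trylock and the discarding of empty slabs are left out, and the declarations come from the surrounding function):

	for (i = 0; i < s->objects; i++)
		INIT_LIST_HEAD(slabs_by_inuse + i);

	/* Bucket every partial slab by how many of its objects are in use. */
	list_for_each_entry_safe(page, t, &n->partial, lru)
		list_move(&page->lru, slabs_by_inuse + page->inuse);

	/* Splice the buckets back: fullest slabs first, least used last. */
	for (i = s->objects - 1; i >= 0; i--)
		list_splice(slabs_by_inuse + i, n->partial.prev);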
@@ -2217,7 +2218,7 @@ void __init kmem_cache_init(void) | |||
2217 | #ifdef CONFIG_NUMA | 2218 | #ifdef CONFIG_NUMA |
2218 | /* | 2219 | /* |
2219 | * Must first have the slab cache available for the allocations of the | 2220 | * Must first have the slab cache available for the allocations of the |
2220 | * struct kmalloc_cache_node's. There is special bootstrap code in | 2221 | * struct kmem_cache_node's. There is special bootstrap code in |
2221 | * kmem_cache_open for slab_state == DOWN. | 2222 | * kmem_cache_open for slab_state == DOWN. |
2222 | */ | 2223 | */ |
2223 | create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", | 2224 | create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", |
@@ -2389,8 +2390,8 @@ static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu) | |||
2389 | } | 2390 | } |
2390 | 2391 | ||
2391 | /* | 2392 | /* |
2392 | * Use the cpu notifier to insure that the slab are flushed | 2393 | * Use the cpu notifier to ensure that the cpu slabs are flushed when |
2393 | * when necessary. | 2394 | * necessary. |
2394 | */ | 2395 | */ |
2395 | static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | 2396 | static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, |
2396 | unsigned long action, void *hcpu) | 2397 | unsigned long action, void *hcpu) |
@@ -2555,11 +2556,6 @@ static void resiliency_test(void) | |||
2555 | static void resiliency_test(void) {}; | 2556 | static void resiliency_test(void) {}; |
2556 | #endif | 2557 | #endif |
2557 | 2558 | ||
2558 | /* | ||
2559 | * These are not as efficient as kmalloc for the non debug case. | ||
2560 | * We do not have the page struct available so we have to touch one | ||
2561 | * cacheline in struct kmem_cache to check slab flags. | ||
2562 | */ | ||
2563 | void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) | 2559 | void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) |
2564 | { | 2560 | { |
2565 | struct kmem_cache *s = get_slab(size, gfpflags); | 2561 | struct kmem_cache *s = get_slab(size, gfpflags); |
@@ -2677,7 +2673,7 @@ static unsigned long validate_slab_cache(struct kmem_cache *s) | |||
2677 | } | 2673 | } |
2678 | 2674 | ||
2679 | /* | 2675 | /* |
2680 | * Generate lists of locations where slabcache objects are allocated | 2676 | * Generate lists of code addresses where slabcache objects are allocated |
2681 | * and freed. | 2677 | * and freed. |
2682 | */ | 2678 | */ |
2683 | 2679 | ||
@@ -2756,7 +2752,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, | |||
2756 | } | 2752 | } |
2757 | 2753 | ||
2758 | /* | 2754 | /* |
2759 | * Not found. Insert new tracking element | 2755 | * Not found. Insert new tracking element. |
2760 | */ | 2756 | */ |
2761 | if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max)) | 2757 | if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max)) |
2762 | return 0; | 2758 | return 0; |