diff options
Diffstat (limited to 'mm/slub.c')
-rw-r--r-- | mm/slub.c | 1050 |
1 files changed, 559 insertions, 491 deletions
@@ -66,11 +66,11 @@ | |||
66 | * SLUB assigns one slab for allocation to each processor. | 66 | * SLUB assigns one slab for allocation to each processor. |
67 | * Allocations only occur from these slabs called cpu slabs. | 67 | * Allocations only occur from these slabs called cpu slabs. |
68 | * | 68 | * |
69 | * Slabs with free elements are kept on a partial list. | 69 | * Slabs with free elements are kept on a partial list and during regular |
70 | * There is no list for full slabs. If an object in a full slab is | 70 | * operations no list for full slabs is used. If an object in a full slab is |
71 | * freed then the slab will show up again on the partial lists. | 71 | * freed then the slab will show up again on the partial lists. |
72 | * Otherwise there is no need to track full slabs unless we have to | 72 | * We track full slabs for debugging purposes though because otherwise we |
73 | * track full slabs for debugging purposes. | 73 | * cannot scan all objects. |
74 | * | 74 | * |
75 | * Slabs are freed when they become empty. Teardown and setup is | 75 | * Slabs are freed when they become empty. Teardown and setup is |
76 | * minimal so we rely on the page allocators per cpu caches for | 76 | * minimal so we rely on the page allocators per cpu caches for |
@@ -87,13 +87,36 @@ | |||
87 | * the fast path. | 87 | * the fast path. |
88 | */ | 88 | */ |
89 | 89 | ||
90 | static inline int SlabDebug(struct page *page) | ||
91 | { | ||
92 | #ifdef CONFIG_SLUB_DEBUG | ||
93 | return PageError(page); | ||
94 | #else | ||
95 | return 0; | ||
96 | #endif | ||
97 | } | ||
98 | |||
99 | static inline void SetSlabDebug(struct page *page) | ||
100 | { | ||
101 | #ifdef CONFIG_SLUB_DEBUG | ||
102 | SetPageError(page); | ||
103 | #endif | ||
104 | } | ||
105 | |||
106 | static inline void ClearSlabDebug(struct page *page) | ||
107 | { | ||
108 | #ifdef CONFIG_SLUB_DEBUG | ||
109 | ClearPageError(page); | ||
110 | #endif | ||
111 | } | ||
112 | |||
90 | /* | 113 | /* |
91 | * Issues still to be resolved: | 114 | * Issues still to be resolved: |
92 | * | 115 | * |
93 | * - The per cpu array is updated for each new slab and is a remote | 116 | * - The per cpu array is updated for each new slab and is a remote |
94 | * cacheline for most nodes. This could become a bouncing cacheline given | 117 | * cacheline for most nodes. This could become a bouncing cacheline given |
95 | * enough frequent updates. There are 16 pointers in a cacheline.so at | 118 | * enough frequent updates. There are 16 pointers in a cacheline, so at |
96 | * max 16 cpus could compete. Likely okay. | 119 | * max 16 cpus could compete for the cacheline which may be okay. |
97 | * | 120 | * |
98 | * - Support PAGE_ALLOC_DEBUG. Should be easy to do. | 121 | * - Support PAGE_ALLOC_DEBUG. Should be easy to do. |
99 | * | 122 | * |
@@ -137,6 +160,7 @@ | |||
137 | 160 | ||
138 | #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ | 161 | #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ |
139 | SLAB_POISON | SLAB_STORE_USER) | 162 | SLAB_POISON | SLAB_STORE_USER) |
163 | |||
140 | /* | 164 | /* |
141 | * Set of flags that will prevent slab merging | 165 | * Set of flags that will prevent slab merging |
142 | */ | 166 | */ |
@@ -157,6 +181,11 @@ | |||
157 | /* Internal SLUB flags */ | 181 | /* Internal SLUB flags */ |
158 | #define __OBJECT_POISON 0x80000000 /* Poison object */ | 182 | #define __OBJECT_POISON 0x80000000 /* Poison object */ |
159 | 183 | ||
184 | /* Not all arches define cache_line_size */ | ||
185 | #ifndef cache_line_size | ||
186 | #define cache_line_size() L1_CACHE_BYTES | ||
187 | #endif | ||
188 | |||
160 | static int kmem_size = sizeof(struct kmem_cache); | 189 | static int kmem_size = sizeof(struct kmem_cache); |
161 | 190 | ||
162 | #ifdef CONFIG_SMP | 191 | #ifdef CONFIG_SMP |
@@ -166,7 +195,7 @@ static struct notifier_block slab_notifier; | |||
166 | static enum { | 195 | static enum { |
167 | DOWN, /* No slab functionality available */ | 196 | DOWN, /* No slab functionality available */ |
168 | PARTIAL, /* kmem_cache_open() works but kmalloc does not */ | 197 | PARTIAL, /* kmem_cache_open() works but kmalloc does not */ |
169 | UP, /* Everything works */ | 198 | UP, /* Everything works but does not show up in sysfs */ |
170 | SYSFS /* Sysfs up */ | 199 | SYSFS /* Sysfs up */ |
171 | } slab_state = DOWN; | 200 | } slab_state = DOWN; |
172 | 201 | ||
@@ -174,7 +203,19 @@ static enum { | |||
174 | static DECLARE_RWSEM(slub_lock); | 203 | static DECLARE_RWSEM(slub_lock); |
175 | LIST_HEAD(slab_caches); | 204 | LIST_HEAD(slab_caches); |
176 | 205 | ||
177 | #ifdef CONFIG_SYSFS | 206 | /* |
207 | * Tracking user of a slab. | ||
208 | */ | ||
209 | struct track { | ||
210 | void *addr; /* Called from address */ | ||
211 | int cpu; /* Was running on cpu */ | ||
212 | int pid; /* Pid context */ | ||
213 | unsigned long when; /* When did the operation occur */ | ||
214 | }; | ||
215 | |||
216 | enum track_item { TRACK_ALLOC, TRACK_FREE }; | ||
217 | |||
218 | #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) | ||
178 | static int sysfs_slab_add(struct kmem_cache *); | 219 | static int sysfs_slab_add(struct kmem_cache *); |
179 | static int sysfs_slab_alias(struct kmem_cache *, const char *); | 220 | static int sysfs_slab_alias(struct kmem_cache *, const char *); |
180 | static void sysfs_slab_remove(struct kmem_cache *); | 221 | static void sysfs_slab_remove(struct kmem_cache *); |
@@ -202,6 +243,63 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) | |||
202 | #endif | 243 | #endif |
203 | } | 244 | } |
204 | 245 | ||
246 | static inline int check_valid_pointer(struct kmem_cache *s, | ||
247 | struct page *page, const void *object) | ||
248 | { | ||
249 | void *base; | ||
250 | |||
251 | if (!object) | ||
252 | return 1; | ||
253 | |||
254 | base = page_address(page); | ||
255 | if (object < base || object >= base + s->objects * s->size || | ||
256 | (object - base) % s->size) { | ||
257 | return 0; | ||
258 | } | ||
259 | |||
260 | return 1; | ||
261 | } | ||
262 | |||
263 | /* | ||
264 | * Slow version of get and set free pointer. | ||
265 | * | ||
266 | * This version requires touching the cache lines of kmem_cache which | ||
267 | * we avoid doing in the fast alloc/free paths. There we obtain the offset | ||
268 | * from the page struct. | ||
269 | */ | ||
270 | static inline void *get_freepointer(struct kmem_cache *s, void *object) | ||
271 | { | ||
272 | return *(void **)(object + s->offset); | ||
273 | } | ||
274 | |||
275 | static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) | ||
276 | { | ||
277 | *(void **)(object + s->offset) = fp; | ||
278 | } | ||
279 | |||
280 | /* Loop over all objects in a slab */ | ||
281 | #define for_each_object(__p, __s, __addr) \ | ||
282 | for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\ | ||
283 | __p += (__s)->size) | ||
284 | |||
285 | /* Scan freelist */ | ||
286 | #define for_each_free_object(__p, __s, __free) \ | ||
287 | for (__p = (__free); __p; __p = get_freepointer((__s), __p)) | ||
288 | |||
289 | /* Determine object index from a given position */ | ||
290 | static inline int slab_index(void *p, struct kmem_cache *s, void *addr) | ||
291 | { | ||
292 | return (p - addr) / s->size; | ||
293 | } | ||
294 | |||
295 | #ifdef CONFIG_SLUB_DEBUG | ||
296 | /* | ||
297 | * Debug settings: | ||
298 | */ | ||
299 | static int slub_debug; | ||
300 | |||
301 | static char *slub_debug_slabs; | ||
302 | |||
205 | /* | 303 | /* |
206 | * Object debugging | 304 | * Object debugging |
207 | */ | 305 | */ |
@@ -237,35 +335,6 @@ static void print_section(char *text, u8 *addr, unsigned int length) | |||
237 | } | 335 | } |
238 | } | 336 | } |
239 | 337 | ||
240 | /* | ||
241 | * Slow version of get and set free pointer. | ||
242 | * | ||
243 | * This requires touching the cache lines of kmem_cache. | ||
244 | * The offset can also be obtained from the page. In that | ||
245 | * case it is in the cacheline that we already need to touch. | ||
246 | */ | ||
247 | static void *get_freepointer(struct kmem_cache *s, void *object) | ||
248 | { | ||
249 | return *(void **)(object + s->offset); | ||
250 | } | ||
251 | |||
252 | static void set_freepointer(struct kmem_cache *s, void *object, void *fp) | ||
253 | { | ||
254 | *(void **)(object + s->offset) = fp; | ||
255 | } | ||
256 | |||
257 | /* | ||
258 | * Tracking user of a slab. | ||
259 | */ | ||
260 | struct track { | ||
261 | void *addr; /* Called from address */ | ||
262 | int cpu; /* Was running on cpu */ | ||
263 | int pid; /* Pid context */ | ||
264 | unsigned long when; /* When did the operation occur */ | ||
265 | }; | ||
266 | |||
267 | enum track_item { TRACK_ALLOC, TRACK_FREE }; | ||
268 | |||
269 | static struct track *get_track(struct kmem_cache *s, void *object, | 338 | static struct track *get_track(struct kmem_cache *s, void *object, |
270 | enum track_item alloc) | 339 | enum track_item alloc) |
271 | { | 340 | { |
@@ -400,24 +469,6 @@ static int check_bytes(u8 *start, unsigned int value, unsigned int bytes) | |||
400 | return 1; | 469 | return 1; |
401 | } | 470 | } |
402 | 471 | ||
403 | |||
404 | static int check_valid_pointer(struct kmem_cache *s, struct page *page, | ||
405 | void *object) | ||
406 | { | ||
407 | void *base; | ||
408 | |||
409 | if (!object) | ||
410 | return 1; | ||
411 | |||
412 | base = page_address(page); | ||
413 | if (object < base || object >= base + s->objects * s->size || | ||
414 | (object - base) % s->size) { | ||
415 | return 0; | ||
416 | } | ||
417 | |||
418 | return 1; | ||
419 | } | ||
420 | |||
421 | /* | 472 | /* |
422 | * Object layout: | 473 | * Object layout: |
423 | * | 474 | * |
@@ -425,26 +476,34 @@ static int check_valid_pointer(struct kmem_cache *s, struct page *page, | |||
425 | * Bytes of the object to be managed. | 476 | * Bytes of the object to be managed. |
426 | * If the freepointer may overlay the object then the free | 477 | * If the freepointer may overlay the object then the free |
427 | * pointer is the first word of the object. | 478 | * pointer is the first word of the object. |
479 | * | ||
428 | * Poisoning uses 0x6b (POISON_FREE) and the last byte is | 480 | * Poisoning uses 0x6b (POISON_FREE) and the last byte is |
429 | * 0xa5 (POISON_END) | 481 | * 0xa5 (POISON_END) |
430 | * | 482 | * |
431 | * object + s->objsize | 483 | * object + s->objsize |
432 | * Padding to reach word boundary. This is also used for Redzoning. | 484 | * Padding to reach word boundary. This is also used for Redzoning. |
433 | * Padding is extended to word size if Redzoning is enabled | 485 | * Padding is extended by another word if Redzoning is enabled and |
434 | * and objsize == inuse. | 486 | * objsize == inuse. |
487 | * | ||
435 | * We fill with 0xbb (RED_INACTIVE) for inactive objects and with | 488 | * We fill with 0xbb (RED_INACTIVE) for inactive objects and with |
436 | * 0xcc (RED_ACTIVE) for objects in use. | 489 | * 0xcc (RED_ACTIVE) for objects in use. |
437 | * | 490 | * |
438 | * object + s->inuse | 491 | * object + s->inuse |
492 | * Meta data starts here. | ||
493 | * | ||
439 | * A. Free pointer (if we cannot overwrite object on free) | 494 | * A. Free pointer (if we cannot overwrite object on free) |
440 | * B. Tracking data for SLAB_STORE_USER | 495 | * B. Tracking data for SLAB_STORE_USER |
441 | * C. Padding to reach required alignment boundary | 496 | * C. Padding to reach required alignment boundary or at minimum |
442 | * Padding is done using 0x5a (POISON_INUSE) | 497 | * one word if debugging is on to be able to detect writes |
498 | * before the word boundary. | ||
499 | * | ||
500 | * Padding is done using 0x5a (POISON_INUSE) | ||
443 | * | 501 | * |
444 | * object + s->size | 502 | * object + s->size |
503 | * Nothing is used beyond s->size. | ||
445 | * | 504 | * |
446 | * If slabcaches are merged then the objsize and inuse boundaries are to | 505 | * If slabcaches are merged then the objsize and inuse boundaries are mostly |
447 | * be ignored. And therefore no slab options that rely on these boundaries | 506 | * ignored. And therefore no slab options that rely on these boundaries |
448 | * may be used with merged slabcaches. | 507 | * may be used with merged slabcaches. |
449 | */ | 508 | */ |
450 | 509 | ||
@@ -570,8 +629,7 @@ static int check_object(struct kmem_cache *s, struct page *page, | |||
570 | /* | 629 | /* |
571 | * No choice but to zap it and thus lose the remainder | 630 | * No choice but to zap it and thus lose the remainder |
572 | * of the free objects in this slab. May cause | 631 | * of the free objects in this slab. May cause |
573 | * another error because the object count maybe | 632 | * another error because the object count is now wrong. |
574 | * wrong now. | ||
575 | */ | 633 | */ |
576 | set_freepointer(s, p, NULL); | 634 | set_freepointer(s, p, NULL); |
577 | return 0; | 635 | return 0; |
@@ -611,9 +669,8 @@ static int check_slab(struct kmem_cache *s, struct page *page) | |||
611 | } | 669 | } |
612 | 670 | ||
613 | /* | 671 | /* |
614 | * Determine if a certain object on a page is on the freelist and | 672 | * Determine if a certain object on a page is on the freelist. Must hold the |
615 | * therefore free. Must hold the slab lock for cpu slabs to | 673 | * slab lock to guarantee that the chains are in a consistent state. |
616 | * guarantee that the chains are consistent. | ||
617 | */ | 674 | */ |
618 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) | 675 | static int on_freelist(struct kmem_cache *s, struct page *page, void *search) |
619 | { | 676 | { |
@@ -659,7 +716,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) | |||
659 | } | 716 | } |
660 | 717 | ||
661 | /* | 718 | /* |
662 | * Tracking of fully allocated slabs for debugging | 719 | * Tracking of fully allocated slabs for debugging purposes. |
663 | */ | 720 | */ |
664 | static void add_full(struct kmem_cache_node *n, struct page *page) | 721 | static void add_full(struct kmem_cache_node *n, struct page *page) |
665 | { | 722 | { |
@@ -710,7 +767,7 @@ bad: | |||
710 | /* | 767 | /* |
711 | * If this is a slab page then lets do the best we can | 768 | * If this is a slab page then lets do the best we can |
712 | * to avoid issues in the future. Marking all objects | 769 | * to avoid issues in the future. Marking all objects |
713 | * as used avoids touching the remainder. | 770 | * as used avoids touching the remaining objects. |
714 | */ | 771 | */ |
715 | printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", | 772 | printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", |
716 | s->name, page); | 773 | s->name, page); |
@@ -764,6 +821,113 @@ fail: | |||
764 | return 0; | 821 | return 0; |
765 | } | 822 | } |
766 | 823 | ||
824 | static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) | ||
825 | { | ||
826 | if (s->flags & SLAB_TRACE) { | ||
827 | printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", | ||
828 | s->name, | ||
829 | alloc ? "alloc" : "free", | ||
830 | object, page->inuse, | ||
831 | page->freelist); | ||
832 | |||
833 | if (!alloc) | ||
834 | print_section("Object", (void *)object, s->objsize); | ||
835 | |||
836 | dump_stack(); | ||
837 | } | ||
838 | } | ||
839 | |||
840 | static int __init setup_slub_debug(char *str) | ||
841 | { | ||
842 | if (!str || *str != '=') | ||
843 | slub_debug = DEBUG_DEFAULT_FLAGS; | ||
844 | else { | ||
845 | str++; | ||
846 | if (*str == 0 || *str == ',') | ||
847 | slub_debug = DEBUG_DEFAULT_FLAGS; | ||
848 | else | ||
849 | for( ;*str && *str != ','; str++) | ||
850 | switch (*str) { | ||
851 | case 'f' : case 'F' : | ||
852 | slub_debug |= SLAB_DEBUG_FREE; | ||
853 | break; | ||
854 | case 'z' : case 'Z' : | ||
855 | slub_debug |= SLAB_RED_ZONE; | ||
856 | break; | ||
857 | case 'p' : case 'P' : | ||
858 | slub_debug |= SLAB_POISON; | ||
859 | break; | ||
860 | case 'u' : case 'U' : | ||
861 | slub_debug |= SLAB_STORE_USER; | ||
862 | break; | ||
863 | case 't' : case 'T' : | ||
864 | slub_debug |= SLAB_TRACE; | ||
865 | break; | ||
866 | default: | ||
867 | printk(KERN_ERR "slub_debug option '%c' " | ||
868 | "unknown. skipped\n",*str); | ||
869 | } | ||
870 | } | ||
871 | |||
872 | if (*str == ',') | ||
873 | slub_debug_slabs = str + 1; | ||
874 | return 1; | ||
875 | } | ||
876 | |||
877 | __setup("slub_debug", setup_slub_debug); | ||
878 | |||
879 | static void kmem_cache_open_debug_check(struct kmem_cache *s) | ||
880 | { | ||
881 | /* | ||
882 | * The page->offset field is only 16 bit wide. This is an offset | ||
883 | * in units of words from the beginning of an object. If the slab | ||
884 | * size is bigger then we cannot move the free pointer behind the | ||
885 | * object anymore. | ||
886 | * | ||
887 | * On 32 bit platforms the limit is 256k. On 64bit platforms | ||
888 | * the limit is 512k. | ||
889 | * | ||
890 | * Debugging or ctor/dtors may create a need to move the free | ||
891 | * pointer. Fail if this happens. | ||
892 | */ | ||
893 | if (s->size >= 65535 * sizeof(void *)) { | ||
894 | BUG_ON(s->flags & (SLAB_RED_ZONE | SLAB_POISON | | ||
895 | SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); | ||
896 | BUG_ON(s->ctor || s->dtor); | ||
897 | } | ||
898 | else | ||
899 | /* | ||
900 | * Enable debugging if selected on the kernel commandline. | ||
901 | */ | ||
902 | if (slub_debug && (!slub_debug_slabs || | ||
903 | strncmp(slub_debug_slabs, s->name, | ||
904 | strlen(slub_debug_slabs)) == 0)) | ||
905 | s->flags |= slub_debug; | ||
906 | } | ||
907 | #else | ||
908 | |||
909 | static inline int alloc_object_checks(struct kmem_cache *s, | ||
910 | struct page *page, void *object) { return 0; } | ||
911 | |||
912 | static inline int free_object_checks(struct kmem_cache *s, | ||
913 | struct page *page, void *object) { return 0; } | ||
914 | |||
915 | static inline void add_full(struct kmem_cache_node *n, struct page *page) {} | ||
916 | static inline void remove_full(struct kmem_cache *s, struct page *page) {} | ||
917 | static inline void trace(struct kmem_cache *s, struct page *page, | ||
918 | void *object, int alloc) {} | ||
919 | static inline void init_object(struct kmem_cache *s, | ||
920 | void *object, int active) {} | ||
921 | static inline void init_tracking(struct kmem_cache *s, void *object) {} | ||
922 | static inline int slab_pad_check(struct kmem_cache *s, struct page *page) | ||
923 | { return 1; } | ||
924 | static inline int check_object(struct kmem_cache *s, struct page *page, | ||
925 | void *object, int active) { return 1; } | ||
926 | static inline void set_track(struct kmem_cache *s, void *object, | ||
927 | enum track_item alloc, void *addr) {} | ||
928 | static inline void kmem_cache_open_debug_check(struct kmem_cache *s) {} | ||
929 | #define slub_debug 0 | ||
930 | #endif | ||
767 | /* | 931 | /* |
768 | * Slab allocation and freeing | 932 | * Slab allocation and freeing |
769 | */ | 933 | */ |
@@ -797,7 +961,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
797 | static void setup_object(struct kmem_cache *s, struct page *page, | 961 | static void setup_object(struct kmem_cache *s, struct page *page, |
798 | void *object) | 962 | void *object) |
799 | { | 963 | { |
800 | if (PageError(page)) { | 964 | if (SlabDebug(page)) { |
801 | init_object(s, object, 0); | 965 | init_object(s, object, 0); |
802 | init_tracking(s, object); | 966 | init_tracking(s, object); |
803 | } | 967 | } |
@@ -832,7 +996,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
832 | page->flags |= 1 << PG_slab; | 996 | page->flags |= 1 << PG_slab; |
833 | if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | | 997 | if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | |
834 | SLAB_STORE_USER | SLAB_TRACE)) | 998 | SLAB_STORE_USER | SLAB_TRACE)) |
835 | page->flags |= 1 << PG_error; | 999 | SetSlabDebug(page); |
836 | 1000 | ||
837 | start = page_address(page); | 1001 | start = page_address(page); |
838 | end = start + s->objects * s->size; | 1002 | end = start + s->objects * s->size; |
@@ -841,7 +1005,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
841 | memset(start, POISON_INUSE, PAGE_SIZE << s->order); | 1005 | memset(start, POISON_INUSE, PAGE_SIZE << s->order); |
842 | 1006 | ||
843 | last = start; | 1007 | last = start; |
844 | for (p = start + s->size; p < end; p += s->size) { | 1008 | for_each_object(p, s, start) { |
845 | setup_object(s, page, last); | 1009 | setup_object(s, page, last); |
846 | set_freepointer(s, last, p); | 1010 | set_freepointer(s, last, p); |
847 | last = p; | 1011 | last = p; |
@@ -861,13 +1025,11 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
861 | { | 1025 | { |
862 | int pages = 1 << s->order; | 1026 | int pages = 1 << s->order; |
863 | 1027 | ||
864 | if (unlikely(PageError(page) || s->dtor)) { | 1028 | if (unlikely(SlabDebug(page) || s->dtor)) { |
865 | void *start = page_address(page); | ||
866 | void *end = start + (pages << PAGE_SHIFT); | ||
867 | void *p; | 1029 | void *p; |
868 | 1030 | ||
869 | slab_pad_check(s, page); | 1031 | slab_pad_check(s, page); |
870 | for (p = start; p <= end - s->size; p += s->size) { | 1032 | for_each_object(p, s, page_address(page)) { |
871 | if (s->dtor) | 1033 | if (s->dtor) |
872 | s->dtor(p, s, 0); | 1034 | s->dtor(p, s, 0); |
873 | check_object(s, page, p, 0); | 1035 | check_object(s, page, p, 0); |
@@ -910,7 +1072,8 @@ static void discard_slab(struct kmem_cache *s, struct page *page) | |||
910 | 1072 | ||
911 | atomic_long_dec(&n->nr_slabs); | 1073 | atomic_long_dec(&n->nr_slabs); |
912 | reset_page_mapcount(page); | 1074 | reset_page_mapcount(page); |
913 | page->flags &= ~(1 << PG_slab | 1 << PG_error); | 1075 | ClearSlabDebug(page); |
1076 | __ClearPageSlab(page); | ||
914 | free_slab(s, page); | 1077 | free_slab(s, page); |
915 | } | 1078 | } |
916 | 1079 | ||
@@ -966,9 +1129,9 @@ static void remove_partial(struct kmem_cache *s, | |||
966 | } | 1129 | } |
967 | 1130 | ||
968 | /* | 1131 | /* |
969 | * Lock page and remove it from the partial list | 1132 | * Lock slab and remove from the partial list. |
970 | * | 1133 | * |
971 | * Must hold list_lock | 1134 | * Must hold list_lock. |
972 | */ | 1135 | */ |
973 | static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page) | 1136 | static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page) |
974 | { | 1137 | { |
@@ -981,7 +1144,7 @@ static int lock_and_del_slab(struct kmem_cache_node *n, struct page *page) | |||
981 | } | 1144 | } |
982 | 1145 | ||
983 | /* | 1146 | /* |
984 | * Try to get a partial slab from a specific node | 1147 | * Try to allocate a partial slab from a specific node. |
985 | */ | 1148 | */ |
986 | static struct page *get_partial_node(struct kmem_cache_node *n) | 1149 | static struct page *get_partial_node(struct kmem_cache_node *n) |
987 | { | 1150 | { |
@@ -990,7 +1153,8 @@ static struct page *get_partial_node(struct kmem_cache_node *n) | |||
990 | /* | 1153 | /* |
991 | * Racy check. If we mistakenly see no partial slabs then we | 1154 | * Racy check. If we mistakenly see no partial slabs then we |
992 | * just allocate an empty slab. If we mistakenly try to get a | 1155 | * just allocate an empty slab. If we mistakenly try to get a |
993 | * partial slab then get_partials() will return NULL. | 1156 | * partial slab and there is none available then get_partials() |
1157 | * will return NULL. | ||
994 | */ | 1158 | */ |
995 | if (!n || !n->nr_partial) | 1159 | if (!n || !n->nr_partial) |
996 | return NULL; | 1160 | return NULL; |
@@ -1006,8 +1170,7 @@ out: | |||
1006 | } | 1170 | } |
1007 | 1171 | ||
1008 | /* | 1172 | /* |
1009 | * Get a page from somewhere. Search in increasing NUMA | 1173 | * Get a page from somewhere. Search in increasing NUMA distances. |
1010 | * distances. | ||
1011 | */ | 1174 | */ |
1012 | static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | 1175 | static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) |
1013 | { | 1176 | { |
@@ -1017,24 +1180,22 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) | |||
1017 | struct page *page; | 1180 | struct page *page; |
1018 | 1181 | ||
1019 | /* | 1182 | /* |
1020 | * The defrag ratio allows to configure the tradeoffs between | 1183 | * The defrag ratio allows a configuration of the tradeoffs between |
1021 | * inter node defragmentation and node local allocations. | 1184 | * inter node defragmentation and node local allocations. A lower |
1022 | * A lower defrag_ratio increases the tendency to do local | 1185 | * defrag_ratio increases the tendency to do local allocations |
1023 | * allocations instead of scanning throught the partial | 1186 | * instead of attempting to obtain partial slabs from other nodes. |
1024 | * lists on other nodes. | ||
1025 | * | ||
1026 | * If defrag_ratio is set to 0 then kmalloc() always | ||
1027 | * returns node local objects. If its higher then kmalloc() | ||
1028 | * may return off node objects in order to avoid fragmentation. | ||
1029 | * | 1187 | * |
1030 | * A higher ratio means slabs may be taken from other nodes | 1188 | * If the defrag_ratio is set to 0 then kmalloc() always |
1031 | * thus reducing the number of partial slabs on those nodes. | 1189 | * returns node local objects. If the ratio is higher then kmalloc() |
1190 | * may return off node objects because partial slabs are obtained | ||
1191 | * from other nodes and filled up. | ||
1032 | * | 1192 | * |
1033 | * If /sys/slab/xx/defrag_ratio is set to 100 (which makes | 1193 | * If /sys/slab/xx/defrag_ratio is set to 100 (which makes |
1034 | * defrag_ratio = 1000) then every (well almost) allocation | 1194 | * defrag_ratio = 1000) then every (well almost) allocation will |
1035 | * will first attempt to defrag slab caches on other nodes. This | 1195 | * first attempt to defrag slab caches on other nodes. This means |
1036 | * means scanning over all nodes to look for partial slabs which | 1196 | * scanning over all nodes to look for partial slabs which may be |
1037 | * may be a bit expensive to do on every slab allocation. | 1197 | * expensive if we do it every time we are trying to find a slab |
1198 | * with available objects. | ||
1038 | */ | 1199 | */ |
1039 | if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) | 1200 | if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) |
1040 | return NULL; | 1201 | return NULL; |
@@ -1087,18 +1248,19 @@ static void putback_slab(struct kmem_cache *s, struct page *page) | |||
1087 | 1248 | ||
1088 | if (page->freelist) | 1249 | if (page->freelist) |
1089 | add_partial(n, page); | 1250 | add_partial(n, page); |
1090 | else if (PageError(page) && (s->flags & SLAB_STORE_USER)) | 1251 | else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) |
1091 | add_full(n, page); | 1252 | add_full(n, page); |
1092 | slab_unlock(page); | 1253 | slab_unlock(page); |
1093 | 1254 | ||
1094 | } else { | 1255 | } else { |
1095 | if (n->nr_partial < MIN_PARTIAL) { | 1256 | if (n->nr_partial < MIN_PARTIAL) { |
1096 | /* | 1257 | /* |
1097 | * Adding an empty page to the partial slabs in order | 1258 | * Adding an empty slab to the partial slabs in order |
1098 | * to avoid page allocator overhead. This page needs to | 1259 | * to avoid page allocator overhead. This slab needs |
1099 | * come after all the others that are not fully empty | 1260 | * to come after the other slabs with objects in |
1100 | * in order to make sure that we do maximum | 1261 | * order to fill them up. That way the size of the |
1101 | * defragmentation. | 1262 | * partial list stays small. kmem_cache_shrink can |
1263 | * reclaim empty slabs from the partial list. | ||
1102 | */ | 1264 | */ |
1103 | add_partial_tail(n, page); | 1265 | add_partial_tail(n, page); |
1104 | slab_unlock(page); | 1266 | slab_unlock(page); |
@@ -1166,11 +1328,11 @@ static void flush_all(struct kmem_cache *s) | |||
1166 | * 1. The page struct | 1328 | * 1. The page struct |
1167 | * 2. The first cacheline of the object to be allocated. | 1329 | * 2. The first cacheline of the object to be allocated. |
1168 | * | 1330 | * |
1169 | * The only cache lines that are read (apart from code) is the | 1331 | * The only other cache lines that are read (apart from code) is the |
1170 | * per cpu array in the kmem_cache struct. | 1332 | * per cpu array in the kmem_cache struct. |
1171 | * | 1333 | * |
1172 | * Fastpath is not possible if we need to get a new slab or have | 1334 | * Fastpath is not possible if we need to get a new slab or have |
1173 | * debugging enabled (which means all slabs are marked with PageError) | 1335 | * debugging enabled (which means all slabs are marked with SlabDebug) |
1174 | */ | 1336 | */ |
1175 | static void *slab_alloc(struct kmem_cache *s, | 1337 | static void *slab_alloc(struct kmem_cache *s, |
1176 | gfp_t gfpflags, int node, void *addr) | 1338 | gfp_t gfpflags, int node, void *addr) |
@@ -1193,7 +1355,7 @@ redo: | |||
1193 | object = page->freelist; | 1355 | object = page->freelist; |
1194 | if (unlikely(!object)) | 1356 | if (unlikely(!object)) |
1195 | goto another_slab; | 1357 | goto another_slab; |
1196 | if (unlikely(PageError(page))) | 1358 | if (unlikely(SlabDebug(page))) |
1197 | goto debug; | 1359 | goto debug; |
1198 | 1360 | ||
1199 | have_object: | 1361 | have_object: |
@@ -1220,9 +1382,11 @@ have_slab: | |||
1220 | cpu = smp_processor_id(); | 1382 | cpu = smp_processor_id(); |
1221 | if (s->cpu_slab[cpu]) { | 1383 | if (s->cpu_slab[cpu]) { |
1222 | /* | 1384 | /* |
1223 | * Someone else populated the cpu_slab while we enabled | 1385 | * Someone else populated the cpu_slab while we |
1224 | * interrupts, or we have got scheduled on another cpu. | 1386 | * enabled interrupts, or we have gotten scheduled |
1225 | * The page may not be on the requested node. | 1387 | * on another cpu. The page may not be on the |
1388 | * requested node even if __GFP_THISNODE was | ||
1389 | * specified. So we need to recheck. | ||
1226 | */ | 1390 | */ |
1227 | if (node == -1 || | 1391 | if (node == -1 || |
1228 | page_to_nid(s->cpu_slab[cpu]) == node) { | 1392 | page_to_nid(s->cpu_slab[cpu]) == node) { |
@@ -1235,7 +1399,7 @@ have_slab: | |||
1235 | slab_lock(page); | 1399 | slab_lock(page); |
1236 | goto redo; | 1400 | goto redo; |
1237 | } | 1401 | } |
1238 | /* Dump the current slab */ | 1402 | /* New slab does not fit our expectations */ |
1239 | flush_slab(s, s->cpu_slab[cpu], cpu); | 1403 | flush_slab(s, s->cpu_slab[cpu], cpu); |
1240 | } | 1404 | } |
1241 | slab_lock(page); | 1405 | slab_lock(page); |
@@ -1248,12 +1412,7 @@ debug: | |||
1248 | goto another_slab; | 1412 | goto another_slab; |
1249 | if (s->flags & SLAB_STORE_USER) | 1413 | if (s->flags & SLAB_STORE_USER) |
1250 | set_track(s, object, TRACK_ALLOC, addr); | 1414 | set_track(s, object, TRACK_ALLOC, addr); |
1251 | if (s->flags & SLAB_TRACE) { | 1415 | trace(s, page, object, 1); |
1252 | printk(KERN_INFO "TRACE %s alloc 0x%p inuse=%d fp=0x%p\n", | ||
1253 | s->name, object, page->inuse, | ||
1254 | page->freelist); | ||
1255 | dump_stack(); | ||
1256 | } | ||
1257 | init_object(s, object, 1); | 1416 | init_object(s, object, 1); |
1258 | goto have_object; | 1417 | goto have_object; |
1259 | } | 1418 | } |
@@ -1276,7 +1435,8 @@ EXPORT_SYMBOL(kmem_cache_alloc_node); | |||
1276 | * The fastpath only writes the cacheline of the page struct and the first | 1435 | * The fastpath only writes the cacheline of the page struct and the first |
1277 | * cacheline of the object. | 1436 | * cacheline of the object. |
1278 | * | 1437 | * |
1279 | * No special cachelines need to be read | 1438 | * We read the cpu_slab cacheline to check if the slab is the per cpu |
1439 | * slab for this processor. | ||
1280 | */ | 1440 | */ |
1281 | static void slab_free(struct kmem_cache *s, struct page *page, | 1441 | static void slab_free(struct kmem_cache *s, struct page *page, |
1282 | void *x, void *addr) | 1442 | void *x, void *addr) |
@@ -1288,7 +1448,7 @@ static void slab_free(struct kmem_cache *s, struct page *page, | |||
1288 | local_irq_save(flags); | 1448 | local_irq_save(flags); |
1289 | slab_lock(page); | 1449 | slab_lock(page); |
1290 | 1450 | ||
1291 | if (unlikely(PageError(page))) | 1451 | if (unlikely(SlabDebug(page))) |
1292 | goto debug; | 1452 | goto debug; |
1293 | checks_ok: | 1453 | checks_ok: |
1294 | prior = object[page->offset] = page->freelist; | 1454 | prior = object[page->offset] = page->freelist; |
@@ -1321,7 +1481,7 @@ out_unlock: | |||
1321 | slab_empty: | 1481 | slab_empty: |
1322 | if (prior) | 1482 | if (prior) |
1323 | /* | 1483 | /* |
1324 | * Slab on the partial list. | 1484 | * Slab still on the partial list. |
1325 | */ | 1485 | */ |
1326 | remove_partial(s, page); | 1486 | remove_partial(s, page); |
1327 | 1487 | ||
@@ -1337,13 +1497,7 @@ debug: | |||
1337 | remove_full(s, page); | 1497 | remove_full(s, page); |
1338 | if (s->flags & SLAB_STORE_USER) | 1498 | if (s->flags & SLAB_STORE_USER) |
1339 | set_track(s, x, TRACK_FREE, addr); | 1499 | set_track(s, x, TRACK_FREE, addr); |
1340 | if (s->flags & SLAB_TRACE) { | 1500 | trace(s, page, object, 0); |
1341 | printk(KERN_INFO "TRACE %s free 0x%p inuse=%d fp=0x%p\n", | ||
1342 | s->name, object, page->inuse, | ||
1343 | page->freelist); | ||
1344 | print_section("Object", (void *)object, s->objsize); | ||
1345 | dump_stack(); | ||
1346 | } | ||
1347 | init_object(s, object, 0); | 1501 | init_object(s, object, 0); |
1348 | goto checks_ok; | 1502 | goto checks_ok; |
1349 | } | 1503 | } |
@@ -1370,22 +1524,16 @@ static struct page *get_object_page(const void *x) | |||
1370 | } | 1524 | } |
1371 | 1525 | ||
1372 | /* | 1526 | /* |
1373 | * kmem_cache_open produces objects aligned at "size" and the first object | 1527 | * Object placement in a slab is made very easy because we always start at |
1374 | * is placed at offset 0 in the slab (We have no metainformation on the | 1528 | * offset 0. If we tune the size of the object to the alignment then we can |
1375 | * slab, all slabs are in essence "off slab"). | 1529 | * get the required alignment by putting one properly sized object after |
1376 | * | 1530 | * another. |
1377 | * In order to get the desired alignment one just needs to align the | ||
1378 | * size. | ||
1379 | * | 1531 | * |
1380 | * Notice that the allocation order determines the sizes of the per cpu | 1532 | * Notice that the allocation order determines the sizes of the per cpu |
1381 | * caches. Each processor has always one slab available for allocations. | 1533 | * caches. Each processor has always one slab available for allocations. |
1382 | * Increasing the allocation order reduces the number of times that slabs | 1534 | * Increasing the allocation order reduces the number of times that slabs |
1383 | * must be moved on and off the partial lists and therefore may influence | 1535 | * must be moved on and off the partial lists and is therefore a factor in |
1384 | * locking overhead. | 1536 | * locking overhead. |
1385 | * | ||
1386 | * The offset is used to relocate the free list link in each object. It is | ||
1387 | * therefore possible to move the free list link behind the object. This | ||
1388 | * is necessary for RCU to work properly and also useful for debugging. | ||
1389 | */ | 1537 | */ |
1390 | 1538 | ||
1391 | /* | 1539 | /* |
@@ -1396,76 +1544,110 @@ static struct page *get_object_page(const void *x) | |||
1396 | */ | 1544 | */ |
1397 | static int slub_min_order; | 1545 | static int slub_min_order; |
1398 | static int slub_max_order = DEFAULT_MAX_ORDER; | 1546 | static int slub_max_order = DEFAULT_MAX_ORDER; |
1399 | |||
1400 | /* | ||
1401 | * Minimum number of objects per slab. This is necessary in order to | ||
1402 | * reduce locking overhead. Similar to the queue size in SLAB. | ||
1403 | */ | ||
1404 | static int slub_min_objects = DEFAULT_MIN_OBJECTS; | 1547 | static int slub_min_objects = DEFAULT_MIN_OBJECTS; |
1405 | 1548 | ||
1406 | /* | 1549 | /* |
1407 | * Merge control. If this is set then no merging of slab caches will occur. | 1550 | * Merge control. If this is set then no merging of slab caches will occur. |
1551 | * (Could be removed. This was introduced to pacify the merge skeptics.) | ||
1408 | */ | 1552 | */ |
1409 | static int slub_nomerge; | 1553 | static int slub_nomerge; |
1410 | 1554 | ||
1411 | /* | 1555 | /* |
1412 | * Debug settings: | ||
1413 | */ | ||
1414 | static int slub_debug; | ||
1415 | |||
1416 | static char *slub_debug_slabs; | ||
1417 | |||
1418 | /* | ||
1419 | * Calculate the order of allocation given a slab object size. | 1556 | * Calculate the order of allocation given a slab object size. |
1420 | * | 1557 | * |
1421 | * The order of allocation has significant impact on other elements | 1558 | * The order of allocation has significant impact on performance and other |
1422 | * of the system. Generally order 0 allocations should be preferred | 1559 | * system components. Generally order 0 allocations should be preferred since |
1423 | * since they do not cause fragmentation in the page allocator. Larger | 1560 | * order 0 does not cause fragmentation in the page allocator. Larger objects |
1424 | * objects may have problems with order 0 because there may be too much | 1561 | * may be problematic to put into order 0 slabs because there may be too much |
1425 | * space left unused in a slab. We go to a higher order if more than 1/8th | 1562 | * unused space left. We go to a higher order if more than 1/8th of the slab |
1426 | * of the slab would be wasted. | 1563 | * would be wasted. |
1427 | * | 1564 | * |
1428 | * In order to reach satisfactory performance we must ensure that | 1565 | * In order to reach satisfactory performance we must ensure that a minimum |
1429 | * a minimum number of objects is in one slab. Otherwise we may | 1566 | * number of objects is in one slab. Otherwise we may generate too much |
1430 | * generate too much activity on the partial lists. This is less a | 1567 | * activity on the partial lists which requires taking the list_lock. This is |
1431 | * concern for large slabs though. slub_max_order specifies the order | 1568 | * less a concern for large slabs though which are rarely used. |
1432 | * where we begin to stop considering the number of objects in a slab. | ||
1433 | * | 1569 | * |
1434 | * Higher order allocations also allow the placement of more objects | 1570 | * slub_max_order specifies the order where we begin to stop considering the |
1435 | * in a slab and thereby reduce object handling overhead. If the user | 1571 | * number of objects in a slab as critical. If we reach slub_max_order then |
1436 | * has requested a higher mininum order then we start with that one | 1572 | * we try to keep the page order as low as possible. So we accept more waste |
1437 | * instead of zero. | 1573 | * of space in favor of a small page order. |
1574 | * | ||
1575 | * Higher order allocations also allow the placement of more objects in a | ||
1576 | * slab and thereby reduce object handling overhead. If the user has | ||
1577 | * requested a higher mininum order then we start with that one instead of | ||
1578 | * the smallest order which will fit the object. | ||
1438 | */ | 1579 | */ |
1439 | static int calculate_order(int size) | 1580 | static inline int slab_order(int size, int min_objects, |
1581 | int max_order, int fract_leftover) | ||
1440 | { | 1582 | { |
1441 | int order; | 1583 | int order; |
1442 | int rem; | 1584 | int rem; |
1443 | 1585 | ||
1444 | for (order = max(slub_min_order, fls(size - 1) - PAGE_SHIFT); | 1586 | for (order = max(slub_min_order, |
1445 | order < MAX_ORDER; order++) { | 1587 | fls(min_objects * size - 1) - PAGE_SHIFT); |
1446 | unsigned long slab_size = PAGE_SIZE << order; | 1588 | order <= max_order; order++) { |
1447 | 1589 | ||
1448 | if (slub_max_order > order && | 1590 | unsigned long slab_size = PAGE_SIZE << order; |
1449 | slab_size < slub_min_objects * size) | ||
1450 | continue; | ||
1451 | 1591 | ||
1452 | if (slab_size < size) | 1592 | if (slab_size < min_objects * size) |
1453 | continue; | 1593 | continue; |
1454 | 1594 | ||
1455 | rem = slab_size % size; | 1595 | rem = slab_size % size; |
1456 | 1596 | ||
1457 | if (rem <= (PAGE_SIZE << order) / 8) | 1597 | if (rem <= slab_size / fract_leftover) |
1458 | break; | 1598 | break; |
1459 | 1599 | ||
1460 | } | 1600 | } |
1461 | if (order >= MAX_ORDER) | 1601 | |
1462 | return -E2BIG; | ||
1463 | return order; | 1602 | return order; |
1464 | } | 1603 | } |
1465 | 1604 | ||
1605 | static inline int calculate_order(int size) | ||
1606 | { | ||
1607 | int order; | ||
1608 | int min_objects; | ||
1609 | int fraction; | ||
1610 | |||
1611 | /* | ||
1612 | * Attempt to find best configuration for a slab. This | ||
1613 | * works by first attempting to generate a layout with | ||
1614 | * the best configuration and backing off gradually. | ||
1615 | * | ||
1616 | * First we reduce the acceptable waste in a slab. Then | ||
1617 | * we reduce the minimum objects required in a slab. | ||
1618 | */ | ||
1619 | min_objects = slub_min_objects; | ||
1620 | while (min_objects > 1) { | ||
1621 | fraction = 8; | ||
1622 | while (fraction >= 4) { | ||
1623 | order = slab_order(size, min_objects, | ||
1624 | slub_max_order, fraction); | ||
1625 | if (order <= slub_max_order) | ||
1626 | return order; | ||
1627 | fraction /= 2; | ||
1628 | } | ||
1629 | min_objects /= 2; | ||
1630 | } | ||
1631 | |||
1632 | /* | ||
1633 | * We were unable to place multiple objects in a slab. Now | ||
1634 | * lets see if we can place a single object there. | ||
1635 | */ | ||
1636 | order = slab_order(size, 1, slub_max_order, 1); | ||
1637 | if (order <= slub_max_order) | ||
1638 | return order; | ||
1639 | |||
1640 | /* | ||
1641 | * Doh this slab cannot be placed using slub_max_order. | ||
1642 | */ | ||
1643 | order = slab_order(size, 1, MAX_ORDER, 1); | ||
1644 | if (order <= MAX_ORDER) | ||
1645 | return order; | ||
1646 | return -ENOSYS; | ||
1647 | } | ||
1648 | |||
1466 | /* | 1649 | /* |
1467 | * Function to figure out which alignment to use from the | 1650 | * Figure out what the alignment of the objects will be. |
1468 | * various ways of specifying it. | ||
1469 | */ | 1651 | */ |
1470 | static unsigned long calculate_alignment(unsigned long flags, | 1652 | static unsigned long calculate_alignment(unsigned long flags, |
1471 | unsigned long align, unsigned long size) | 1653 | unsigned long align, unsigned long size) |
@@ -1480,8 +1662,8 @@ static unsigned long calculate_alignment(unsigned long flags, | |||
1480 | * then use it. | 1662 | * then use it. |
1481 | */ | 1663 | */ |
1482 | if ((flags & SLAB_HWCACHE_ALIGN) && | 1664 | if ((flags & SLAB_HWCACHE_ALIGN) && |
1483 | size > L1_CACHE_BYTES / 2) | 1665 | size > cache_line_size() / 2) |
1484 | return max_t(unsigned long, align, L1_CACHE_BYTES); | 1666 | return max_t(unsigned long, align, cache_line_size()); |
1485 | 1667 | ||
1486 | if (align < ARCH_SLAB_MINALIGN) | 1668 | if (align < ARCH_SLAB_MINALIGN) |
1487 | return ARCH_SLAB_MINALIGN; | 1669 | return ARCH_SLAB_MINALIGN; |
@@ -1619,22 +1801,23 @@ static int calculate_sizes(struct kmem_cache *s) | |||
1619 | */ | 1801 | */ |
1620 | size = ALIGN(size, sizeof(void *)); | 1802 | size = ALIGN(size, sizeof(void *)); |
1621 | 1803 | ||
1804 | #ifdef CONFIG_SLUB_DEBUG | ||
1622 | /* | 1805 | /* |
1623 | * If we are redzoning then check if there is some space between the | 1806 | * If we are Redzoning then check if there is some space between the |
1624 | * end of the object and the free pointer. If not then add an | 1807 | * end of the object and the free pointer. If not then add an |
1625 | * additional word, so that we can establish a redzone between | 1808 | * additional word to have some bytes to store Redzone information. |
1626 | * the object and the freepointer to be able to check for overwrites. | ||
1627 | */ | 1809 | */ |
1628 | if ((flags & SLAB_RED_ZONE) && size == s->objsize) | 1810 | if ((flags & SLAB_RED_ZONE) && size == s->objsize) |
1629 | size += sizeof(void *); | 1811 | size += sizeof(void *); |
1812 | #endif | ||
1630 | 1813 | ||
1631 | /* | 1814 | /* |
1632 | * With that we have determined how much of the slab is in actual | 1815 | * With that we have determined the number of bytes in actual use |
1633 | * use by the object. This is the potential offset to the free | 1816 | * by the object. This is the potential offset to the free pointer. |
1634 | * pointer. | ||
1635 | */ | 1817 | */ |
1636 | s->inuse = size; | 1818 | s->inuse = size; |
1637 | 1819 | ||
1820 | #ifdef CONFIG_SLUB_DEBUG | ||
1638 | if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || | 1821 | if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || |
1639 | s->ctor || s->dtor)) { | 1822 | s->ctor || s->dtor)) { |
1640 | /* | 1823 | /* |
@@ -1656,7 +1839,7 @@ static int calculate_sizes(struct kmem_cache *s) | |||
1656 | */ | 1839 | */ |
1657 | size += 2 * sizeof(struct track); | 1840 | size += 2 * sizeof(struct track); |
1658 | 1841 | ||
1659 | if (flags & DEBUG_DEFAULT_FLAGS) | 1842 | if (flags & SLAB_RED_ZONE) |
1660 | /* | 1843 | /* |
1661 | * Add some empty padding so that we can catch | 1844 | * Add some empty padding so that we can catch |
1662 | * overwrites from earlier objects rather than let | 1845 | * overwrites from earlier objects rather than let |
@@ -1665,10 +1848,12 @@ static int calculate_sizes(struct kmem_cache *s) | |||
1665 | * of the object. | 1848 | * of the object. |
1666 | */ | 1849 | */ |
1667 | size += sizeof(void *); | 1850 | size += sizeof(void *); |
1851 | #endif | ||
1852 | |||
1668 | /* | 1853 | /* |
1669 | * Determine the alignment based on various parameters that the | 1854 | * Determine the alignment based on various parameters that the |
1670 | * user specified (this is unecessarily complex due to the attempt | 1855 | * user specified and the dynamic determination of cache line size |
1671 | * to be compatible with SLAB. Should be cleaned up some day). | 1856 | * on bootup. |
1672 | */ | 1857 | */ |
1673 | align = calculate_alignment(flags, align, s->objsize); | 1858 | align = calculate_alignment(flags, align, s->objsize); |
1674 | 1859 | ||
@@ -1700,23 +1885,6 @@ static int calculate_sizes(struct kmem_cache *s) | |||
1700 | 1885 | ||
1701 | } | 1886 | } |
1702 | 1887 | ||
1703 | static int __init finish_bootstrap(void) | ||
1704 | { | ||
1705 | struct list_head *h; | ||
1706 | int err; | ||
1707 | |||
1708 | slab_state = SYSFS; | ||
1709 | |||
1710 | list_for_each(h, &slab_caches) { | ||
1711 | struct kmem_cache *s = | ||
1712 | container_of(h, struct kmem_cache, list); | ||
1713 | |||
1714 | err = sysfs_slab_add(s); | ||
1715 | BUG_ON(err); | ||
1716 | } | ||
1717 | return 0; | ||
1718 | } | ||
1719 | |||
1720 | static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | 1888 | static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, |
1721 | const char *name, size_t size, | 1889 | const char *name, size_t size, |
1722 | size_t align, unsigned long flags, | 1890 | size_t align, unsigned long flags, |
@@ -1730,32 +1898,7 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, | |||
1730 | s->objsize = size; | 1898 | s->objsize = size; |
1731 | s->flags = flags; | 1899 | s->flags = flags; |
1732 | s->align = align; | 1900 | s->align = align; |
1733 | 1901 | kmem_cache_open_debug_check(s); | |
1734 | /* | ||
1735 | * The page->offset field is only 16 bit wide. This is an offset | ||
1736 | * in units of words from the beginning of an object. If the slab | ||
1737 | * size is bigger then we cannot move the free pointer behind the | ||
1738 | * object anymore. | ||
1739 | * | ||
1740 | * On 32 bit platforms the limit is 256k. On 64bit platforms | ||
1741 | * the limit is 512k. | ||
1742 | * | ||
1743 | * Debugging or ctor/dtors may create a need to move the free | ||
1744 | * pointer. Fail if this happens. | ||
1745 | */ | ||
1746 | if (s->size >= 65535 * sizeof(void *)) { | ||
1747 | BUG_ON(flags & (SLAB_RED_ZONE | SLAB_POISON | | ||
1748 | SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); | ||
1749 | BUG_ON(ctor || dtor); | ||
1750 | } | ||
1751 | else | ||
1752 | /* | ||
1753 | * Enable debugging if selected on the kernel commandline. | ||
1754 | */ | ||
1755 | if (slub_debug && (!slub_debug_slabs || | ||
1756 | strncmp(slub_debug_slabs, name, | ||
1757 | strlen(slub_debug_slabs)) == 0)) | ||
1758 | s->flags |= slub_debug; | ||
1759 | 1902 | ||
1760 | if (!calculate_sizes(s)) | 1903 | if (!calculate_sizes(s)) |
1761 | goto error; | 1904 | goto error; |
@@ -1783,7 +1926,6 @@ EXPORT_SYMBOL(kmem_cache_open); | |||
1783 | int kmem_ptr_validate(struct kmem_cache *s, const void *object) | 1926 | int kmem_ptr_validate(struct kmem_cache *s, const void *object) |
1784 | { | 1927 | { |
1785 | struct page * page; | 1928 | struct page * page; |
1786 | void *addr; | ||
1787 | 1929 | ||
1788 | page = get_object_page(object); | 1930 | page = get_object_page(object); |
1789 | 1931 | ||
@@ -1791,13 +1933,7 @@ int kmem_ptr_validate(struct kmem_cache *s, const void *object) | |||
1791 | /* No slab or wrong slab */ | 1933 | /* No slab or wrong slab */ |
1792 | return 0; | 1934 | return 0; |
1793 | 1935 | ||
1794 | addr = page_address(page); | 1936 | if (!check_valid_pointer(s, page, object)) |
1795 | if (object < addr || object >= addr + s->objects * s->size) | ||
1796 | /* Out of bounds */ | ||
1797 | return 0; | ||
1798 | |||
1799 | if ((object - addr) % s->size) | ||
1800 | /* Improperly aligned */ | ||
1801 | return 0; | 1937 | return 0; |
1802 | 1938 | ||
1803 | /* | 1939 | /* |
@@ -1826,7 +1962,8 @@ const char *kmem_cache_name(struct kmem_cache *s) | |||
1826 | EXPORT_SYMBOL(kmem_cache_name); | 1962 | EXPORT_SYMBOL(kmem_cache_name); |
1827 | 1963 | ||
1828 | /* | 1964 | /* |
1829 | * Attempt to free all slabs on a node | 1965 | * Attempt to free all slabs on a node. Return the number of slabs we |
1966 | * were unable to free. | ||
1830 | */ | 1967 | */ |
1831 | static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, | 1968 | static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, |
1832 | struct list_head *list) | 1969 | struct list_head *list) |
@@ -1847,7 +1984,7 @@ static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, | |||
1847 | } | 1984 | } |
1848 | 1985 | ||
1849 | /* | 1986 | /* |
1850 | * Release all resources used by slab cache | 1987 | * Release all resources used by a slab cache. |
1851 | */ | 1988 | */ |
1852 | static int kmem_cache_close(struct kmem_cache *s) | 1989 | static int kmem_cache_close(struct kmem_cache *s) |
1853 | { | 1990 | { |
@@ -1932,45 +2069,6 @@ static int __init setup_slub_nomerge(char *str) | |||
1932 | 2069 | ||
1933 | __setup("slub_nomerge", setup_slub_nomerge); | 2070 | __setup("slub_nomerge", setup_slub_nomerge); |
1934 | 2071 | ||
1935 | static int __init setup_slub_debug(char *str) | ||
1936 | { | ||
1937 | if (!str || *str != '=') | ||
1938 | slub_debug = DEBUG_DEFAULT_FLAGS; | ||
1939 | else { | ||
1940 | str++; | ||
1941 | if (*str == 0 || *str == ',') | ||
1942 | slub_debug = DEBUG_DEFAULT_FLAGS; | ||
1943 | else | ||
1944 | for( ;*str && *str != ','; str++) | ||
1945 | switch (*str) { | ||
1946 | case 'f' : case 'F' : | ||
1947 | slub_debug |= SLAB_DEBUG_FREE; | ||
1948 | break; | ||
1949 | case 'z' : case 'Z' : | ||
1950 | slub_debug |= SLAB_RED_ZONE; | ||
1951 | break; | ||
1952 | case 'p' : case 'P' : | ||
1953 | slub_debug |= SLAB_POISON; | ||
1954 | break; | ||
1955 | case 'u' : case 'U' : | ||
1956 | slub_debug |= SLAB_STORE_USER; | ||
1957 | break; | ||
1958 | case 't' : case 'T' : | ||
1959 | slub_debug |= SLAB_TRACE; | ||
1960 | break; | ||
1961 | default: | ||
1962 | printk(KERN_ERR "slub_debug option '%c' " | ||
1963 | "unknown. skipped\n",*str); | ||
1964 | } | ||
1965 | } | ||
1966 | |||
1967 | if (*str == ',') | ||
1968 | slub_debug_slabs = str + 1; | ||
1969 | return 1; | ||
1970 | } | ||
1971 | |||
1972 | __setup("slub_debug", setup_slub_debug); | ||
1973 | |||
1974 | static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, | 2072 | static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, |
1975 | const char *name, int size, gfp_t gfp_flags) | 2073 | const char *name, int size, gfp_t gfp_flags) |
1976 | { | 2074 | { |
@@ -2108,13 +2206,14 @@ void kfree(const void *x) | |||
2108 | EXPORT_SYMBOL(kfree); | 2206 | EXPORT_SYMBOL(kfree); |
2109 | 2207 | ||
2110 | /* | 2208 | /* |
2111 | * kmem_cache_shrink removes empty slabs from the partial lists | 2209 | * kmem_cache_shrink removes empty slabs from the partial lists and sorts |
2112 | * and then sorts the partially allocated slabs by the number | 2210 | * the remaining slabs by the number of items in use. The slabs with the |
2113 | * of items in use. The slabs with the most items in use | 2211 | * most items in use come first. New allocations will then fill those up |
2114 | * come first. New allocations will remove these from the | 2212 | * and thus they can be removed from the partial lists. |
2115 | * partial list because they are full. The slabs with the | 2213 | * |
2116 | * least items are placed last. If it happens that the objects | 2214 | * The slabs with the least items are placed last. This results in them |
2117 | * are freed then the page can be returned to the page allocator. | 2215 | * being allocated from last increasing the chance that the last objects |
2216 | * are freed in them. | ||
2118 | */ | 2217 | */ |
2119 | int kmem_cache_shrink(struct kmem_cache *s) | 2218 | int kmem_cache_shrink(struct kmem_cache *s) |
2120 | { | 2219 | { |
@@ -2143,12 +2242,10 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
2143 | spin_lock_irqsave(&n->list_lock, flags); | 2242 | spin_lock_irqsave(&n->list_lock, flags); |
2144 | 2243 | ||
2145 | /* | 2244 | /* |
2146 | * Build lists indexed by the items in use in | 2245 | * Build lists indexed by the items in use in each slab. |
2147 | * each slab or free slabs if empty. | ||
2148 | * | 2246 | * |
2149 | * Note that concurrent frees may occur while | 2247 | * Note that concurrent frees may occur while we hold the |
2150 | * we hold the list_lock. page->inuse here is | 2248 | * list_lock. page->inuse here is the upper limit. |
2151 | * the upper limit. | ||
2152 | */ | 2249 | */ |
2153 | list_for_each_entry_safe(page, t, &n->partial, lru) { | 2250 | list_for_each_entry_safe(page, t, &n->partial, lru) { |
2154 | if (!page->inuse && slab_trylock(page)) { | 2251 | if (!page->inuse && slab_trylock(page)) { |
@@ -2172,8 +2269,8 @@ int kmem_cache_shrink(struct kmem_cache *s) | |||
2172 | goto out; | 2269 | goto out; |
2173 | 2270 | ||
2174 | /* | 2271 | /* |
2175 | * Rebuild the partial list with the slabs filled up | 2272 | * Rebuild the partial list with the slabs filled up most |
2176 | * most first and the least used slabs at the end. | 2273 | * first and the least used slabs at the end. |
2177 | */ | 2274 | */ |
2178 | for (i = s->objects - 1; i >= 0; i--) | 2275 | for (i = s->objects - 1; i >= 0; i--) |
2179 | list_splice(slabs_by_inuse + i, n->partial.prev); | 2276 | list_splice(slabs_by_inuse + i, n->partial.prev); |
@@ -2189,7 +2286,6 @@ EXPORT_SYMBOL(kmem_cache_shrink); | |||
2189 | 2286 | ||
2190 | /** | 2287 | /** |
2191 | * krealloc - reallocate memory. The contents will remain unchanged. | 2288 | * krealloc - reallocate memory. The contents will remain unchanged. |
2192 | * | ||
2193 | * @p: object to reallocate memory for. | 2289 | * @p: object to reallocate memory for. |
2194 | * @new_size: how many bytes of memory are required. | 2290 | * @new_size: how many bytes of memory are required. |
2195 | * @flags: the type of memory to allocate. | 2291 | * @flags: the type of memory to allocate. |
@@ -2201,9 +2297,8 @@ EXPORT_SYMBOL(kmem_cache_shrink); | |||
2201 | */ | 2297 | */ |
2202 | void *krealloc(const void *p, size_t new_size, gfp_t flags) | 2298 | void *krealloc(const void *p, size_t new_size, gfp_t flags) |
2203 | { | 2299 | { |
2204 | struct kmem_cache *new_cache; | ||
2205 | void *ret; | 2300 | void *ret; |
2206 | struct page *page; | 2301 | size_t ks; |
2207 | 2302 | ||
2208 | if (unlikely(!p)) | 2303 | if (unlikely(!p)) |
2209 | return kmalloc(new_size, flags); | 2304 | return kmalloc(new_size, flags); |
@@ -2213,19 +2308,13 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) | |||
2213 | return NULL; | 2308 | return NULL; |
2214 | } | 2309 | } |
2215 | 2310 | ||
2216 | page = virt_to_head_page(p); | 2311 | ks = ksize(p); |
2217 | 2312 | if (ks >= new_size) | |
2218 | new_cache = get_slab(new_size, flags); | ||
2219 | |||
2220 | /* | ||
2221 | * If new size fits in the current cache, bail out. | ||
2222 | */ | ||
2223 | if (likely(page->slab == new_cache)) | ||
2224 | return (void *)p; | 2313 | return (void *)p; |
2225 | 2314 | ||
2226 | ret = kmalloc(new_size, flags); | 2315 | ret = kmalloc(new_size, flags); |
2227 | if (ret) { | 2316 | if (ret) { |
2228 | memcpy(ret, p, min(new_size, ksize(p))); | 2317 | memcpy(ret, p, min(new_size, ks)); |
2229 | kfree(p); | 2318 | kfree(p); |
2230 | } | 2319 | } |
2231 | return ret; | 2320 | return ret; |
@@ -2243,7 +2332,7 @@ void __init kmem_cache_init(void) | |||
2243 | #ifdef CONFIG_NUMA | 2332 | #ifdef CONFIG_NUMA |
2244 | /* | 2333 | /* |
2245 | * Must first have the slab cache available for the allocations of the | 2334 | * Must first have the slab cache available for the allocations of the |
2246 | * struct kmalloc_cache_node's. There is special bootstrap code in | 2335 | * struct kmem_cache_node's. There is special bootstrap code in |
2247 | * kmem_cache_open for slab_state == DOWN. | 2336 | * kmem_cache_open for slab_state == DOWN. |
2248 | */ | 2337 | */ |
2249 | create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", | 2338 | create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", |
@@ -2280,7 +2369,7 @@ void __init kmem_cache_init(void) | |||
2280 | 2369 | ||
2281 | printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," | 2370 | printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," |
2282 | " Processors=%d, Nodes=%d\n", | 2371 | " Processors=%d, Nodes=%d\n", |
2283 | KMALLOC_SHIFT_HIGH, L1_CACHE_BYTES, | 2372 | KMALLOC_SHIFT_HIGH, cache_line_size(), |
2284 | slub_min_order, slub_max_order, slub_min_objects, | 2373 | slub_min_order, slub_max_order, slub_min_objects, |
2285 | nr_cpu_ids, nr_node_ids); | 2374 | nr_cpu_ids, nr_node_ids); |
2286 | } | 2375 | } |
@@ -2415,8 +2504,8 @@ static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu) | |||
2415 | } | 2504 | } |
2416 | 2505 | ||
2417 | /* | 2506 | /* |
2418 | * Use the cpu notifier to insure that the slab are flushed | 2507 | * Use the cpu notifier to ensure that the cpu slabs are flushed when |
2419 | * when necessary. | 2508 | * necessary. |
2420 | */ | 2509 | */ |
2421 | static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | 2510 | static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, |
2422 | unsigned long action, void *hcpu) | 2511 | unsigned long action, void *hcpu) |
@@ -2425,7 +2514,9 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, | |||
2425 | 2514 | ||
2426 | switch (action) { | 2515 | switch (action) { |
2427 | case CPU_UP_CANCELED: | 2516 | case CPU_UP_CANCELED: |
2517 | case CPU_UP_CANCELED_FROZEN: | ||
2428 | case CPU_DEAD: | 2518 | case CPU_DEAD: |
2519 | case CPU_DEAD_FROZEN: | ||
2429 | for_all_slabs(__flush_cpu_slab, cpu); | 2520 | for_all_slabs(__flush_cpu_slab, cpu); |
2430 | break; | 2521 | break; |
2431 | default: | 2522 | default: |
@@ -2439,153 +2530,6 @@ static struct notifier_block __cpuinitdata slab_notifier = | |||
2439 | 2530 | ||
2440 | #endif | 2531 | #endif |
2441 | 2532 | ||
2442 | #ifdef CONFIG_NUMA | ||
2443 | |||
2444 | /***************************************************************** | ||
2445 | * Generic reaper used to support the page allocator | ||
2446 | * (the cpu slabs are reaped by a per slab workqueue). | ||
2447 | * | ||
2448 | * Maybe move this to the page allocator? | ||
2449 | ****************************************************************/ | ||
2450 | |||
2451 | static DEFINE_PER_CPU(unsigned long, reap_node); | ||
2452 | |||
2453 | static void init_reap_node(int cpu) | ||
2454 | { | ||
2455 | int node; | ||
2456 | |||
2457 | node = next_node(cpu_to_node(cpu), node_online_map); | ||
2458 | if (node == MAX_NUMNODES) | ||
2459 | node = first_node(node_online_map); | ||
2460 | |||
2461 | __get_cpu_var(reap_node) = node; | ||
2462 | } | ||
2463 | |||
2464 | static void next_reap_node(void) | ||
2465 | { | ||
2466 | int node = __get_cpu_var(reap_node); | ||
2467 | |||
2468 | /* | ||
2469 | * Also drain per cpu pages on remote zones | ||
2470 | */ | ||
2471 | if (node != numa_node_id()) | ||
2472 | drain_node_pages(node); | ||
2473 | |||
2474 | node = next_node(node, node_online_map); | ||
2475 | if (unlikely(node >= MAX_NUMNODES)) | ||
2476 | node = first_node(node_online_map); | ||
2477 | __get_cpu_var(reap_node) = node; | ||
2478 | } | ||
2479 | #else | ||
2480 | #define init_reap_node(cpu) do { } while (0) | ||
2481 | #define next_reap_node(void) do { } while (0) | ||
2482 | #endif | ||
2483 | |||
2484 | #define REAPTIMEOUT_CPUC (2*HZ) | ||
2485 | |||
2486 | #ifdef CONFIG_SMP | ||
2487 | static DEFINE_PER_CPU(struct delayed_work, reap_work); | ||
2488 | |||
2489 | static void cache_reap(struct work_struct *unused) | ||
2490 | { | ||
2491 | next_reap_node(); | ||
2492 | refresh_cpu_vm_stats(smp_processor_id()); | ||
2493 | schedule_delayed_work(&__get_cpu_var(reap_work), | ||
2494 | REAPTIMEOUT_CPUC); | ||
2495 | } | ||
2496 | |||
2497 | static void __devinit start_cpu_timer(int cpu) | ||
2498 | { | ||
2499 | struct delayed_work *reap_work = &per_cpu(reap_work, cpu); | ||
2500 | |||
2501 | /* | ||
2502 | * When this gets called from do_initcalls via cpucache_init(), | ||
2503 | * init_workqueues() has already run, so keventd will be setup | ||
2504 | * at that time. | ||
2505 | */ | ||
2506 | if (keventd_up() && reap_work->work.func == NULL) { | ||
2507 | init_reap_node(cpu); | ||
2508 | INIT_DELAYED_WORK(reap_work, cache_reap); | ||
2509 | schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); | ||
2510 | } | ||
2511 | } | ||
2512 | |||
2513 | static int __init cpucache_init(void) | ||
2514 | { | ||
2515 | int cpu; | ||
2516 | |||
2517 | /* | ||
2518 | * Register the timers that drain pcp pages and update vm statistics | ||
2519 | */ | ||
2520 | for_each_online_cpu(cpu) | ||
2521 | start_cpu_timer(cpu); | ||
2522 | return 0; | ||
2523 | } | ||
2524 | __initcall(cpucache_init); | ||
2525 | #endif | ||
2526 | |||
2527 | #ifdef SLUB_RESILIENCY_TEST | ||
2528 | static unsigned long validate_slab_cache(struct kmem_cache *s); | ||
2529 | |||
2530 | static void resiliency_test(void) | ||
2531 | { | ||
2532 | u8 *p; | ||
2533 | |||
2534 | printk(KERN_ERR "SLUB resiliency testing\n"); | ||
2535 | printk(KERN_ERR "-----------------------\n"); | ||
2536 | printk(KERN_ERR "A. Corruption after allocation\n"); | ||
2537 | |||
2538 | p = kzalloc(16, GFP_KERNEL); | ||
2539 | p[16] = 0x12; | ||
2540 | printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" | ||
2541 | " 0x12->0x%p\n\n", p + 16); | ||
2542 | |||
2543 | validate_slab_cache(kmalloc_caches + 4); | ||
2544 | |||
2545 | /* Hmmm... The next two are dangerous */ | ||
2546 | p = kzalloc(32, GFP_KERNEL); | ||
2547 | p[32 + sizeof(void *)] = 0x34; | ||
2548 | printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" | ||
2549 | " 0x34 -> -0x%p\n", p); | ||
2550 | printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); | ||
2551 | |||
2552 | validate_slab_cache(kmalloc_caches + 5); | ||
2553 | p = kzalloc(64, GFP_KERNEL); | ||
2554 | p += 64 + (get_cycles() & 0xff) * sizeof(void *); | ||
2555 | *p = 0x56; | ||
2556 | printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", | ||
2557 | p); | ||
2558 | printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); | ||
2559 | validate_slab_cache(kmalloc_caches + 6); | ||
2560 | |||
2561 | printk(KERN_ERR "\nB. Corruption after free\n"); | ||
2562 | p = kzalloc(128, GFP_KERNEL); | ||
2563 | kfree(p); | ||
2564 | *p = 0x78; | ||
2565 | printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); | ||
2566 | validate_slab_cache(kmalloc_caches + 7); | ||
2567 | |||
2568 | p = kzalloc(256, GFP_KERNEL); | ||
2569 | kfree(p); | ||
2570 | p[50] = 0x9a; | ||
2571 | printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); | ||
2572 | validate_slab_cache(kmalloc_caches + 8); | ||
2573 | |||
2574 | p = kzalloc(512, GFP_KERNEL); | ||
2575 | kfree(p); | ||
2576 | p[512] = 0xab; | ||
2577 | printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); | ||
2578 | validate_slab_cache(kmalloc_caches + 9); | ||
2579 | } | ||
2580 | #else | ||
2581 | static void resiliency_test(void) {}; | ||
2582 | #endif | ||
2583 | |||
2584 | /* | ||
2585 | * These are not as efficient as kmalloc for the non debug case. | ||
2586 | * We do not have the page struct available so we have to touch one | ||
2587 | * cacheline in struct kmem_cache to check slab flags. | ||
2588 | */ | ||
2589 | void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) | 2533 | void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) |
2590 | { | 2534 | { |
2591 | struct kmem_cache *s = get_slab(size, gfpflags); | 2535 | struct kmem_cache *s = get_slab(size, gfpflags); |
@@ -2607,13 +2551,12 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, | |||
2607 | return slab_alloc(s, gfpflags, node, caller); | 2551 | return slab_alloc(s, gfpflags, node, caller); |
2608 | } | 2552 | } |
2609 | 2553 | ||
2610 | #ifdef CONFIG_SYSFS | 2554 | #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG) |
2611 | |||
2612 | static int validate_slab(struct kmem_cache *s, struct page *page) | 2555 | static int validate_slab(struct kmem_cache *s, struct page *page) |
2613 | { | 2556 | { |
2614 | void *p; | 2557 | void *p; |
2615 | void *addr = page_address(page); | 2558 | void *addr = page_address(page); |
2616 | unsigned long map[BITS_TO_LONGS(s->objects)]; | 2559 | DECLARE_BITMAP(map, s->objects); |
2617 | 2560 | ||
2618 | if (!check_slab(s, page) || | 2561 | if (!check_slab(s, page) || |
2619 | !on_freelist(s, page, NULL)) | 2562 | !on_freelist(s, page, NULL)) |
@@ -2622,14 +2565,14 @@ static int validate_slab(struct kmem_cache *s, struct page *page) | |||
2622 | /* Now we know that a valid freelist exists */ | 2565 | /* Now we know that a valid freelist exists */ |
2623 | bitmap_zero(map, s->objects); | 2566 | bitmap_zero(map, s->objects); |
2624 | 2567 | ||
2625 | for(p = page->freelist; p; p = get_freepointer(s, p)) { | 2568 | for_each_free_object(p, s, page->freelist) { |
2626 | set_bit((p - addr) / s->size, map); | 2569 | set_bit(slab_index(p, s, addr), map); |
2627 | if (!check_object(s, page, p, 0)) | 2570 | if (!check_object(s, page, p, 0)) |
2628 | return 0; | 2571 | return 0; |
2629 | } | 2572 | } |
2630 | 2573 | ||
2631 | for(p = addr; p < addr + s->objects * s->size; p += s->size) | 2574 | for_each_object(p, s, addr) |
2632 | if (!test_bit((p - addr) / s->size, map)) | 2575 | if (!test_bit(slab_index(p, s, addr), map)) |
2633 | if (!check_object(s, page, p, 1)) | 2576 | if (!check_object(s, page, p, 1)) |
2634 | return 0; | 2577 | return 0; |
2635 | return 1; | 2578 | return 1; |
@@ -2645,12 +2588,12 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page) | |||
2645 | s->name, page); | 2588 | s->name, page); |
2646 | 2589 | ||
2647 | if (s->flags & DEBUG_DEFAULT_FLAGS) { | 2590 | if (s->flags & DEBUG_DEFAULT_FLAGS) { |
2648 | if (!PageError(page)) | 2591 | if (!SlabDebug(page)) |
2649 | printk(KERN_ERR "SLUB %s: PageError not set " | 2592 | printk(KERN_ERR "SLUB %s: SlabDebug not set " |
2650 | "on slab 0x%p\n", s->name, page); | 2593 | "on slab 0x%p\n", s->name, page); |
2651 | } else { | 2594 | } else { |
2652 | if (PageError(page)) | 2595 | if (SlabDebug(page)) |
2653 | printk(KERN_ERR "SLUB %s: PageError set on " | 2596 | printk(KERN_ERR "SLUB %s: SlabDebug set on " |
2654 | "slab 0x%p\n", s->name, page); | 2597 | "slab 0x%p\n", s->name, page); |
2655 | } | 2598 | } |
2656 | } | 2599 | } |
@@ -2702,14 +2645,76 @@ static unsigned long validate_slab_cache(struct kmem_cache *s) | |||
2702 | return count; | 2645 | return count; |
2703 | } | 2646 | } |
2704 | 2647 | ||
2648 | #ifdef SLUB_RESILIENCY_TEST | ||
2649 | static void resiliency_test(void) | ||
2650 | { | ||
2651 | u8 *p; | ||
2652 | |||
2653 | printk(KERN_ERR "SLUB resiliency testing\n"); | ||
2654 | printk(KERN_ERR "-----------------------\n"); | ||
2655 | printk(KERN_ERR "A. Corruption after allocation\n"); | ||
2656 | |||
2657 | p = kzalloc(16, GFP_KERNEL); | ||
2658 | p[16] = 0x12; | ||
2659 | printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" | ||
2660 | " 0x12->0x%p\n\n", p + 16); | ||
2661 | |||
2662 | validate_slab_cache(kmalloc_caches + 4); | ||
2663 | |||
2664 | /* Hmmm... The next two are dangerous */ | ||
2665 | p = kzalloc(32, GFP_KERNEL); | ||
2666 | p[32 + sizeof(void *)] = 0x34; | ||
2667 | printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" | ||
2668 | " 0x34 -> -0x%p\n", p); | ||
2669 | printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); | ||
2670 | |||
2671 | validate_slab_cache(kmalloc_caches + 5); | ||
2672 | p = kzalloc(64, GFP_KERNEL); | ||
2673 | p += 64 + (get_cycles() & 0xff) * sizeof(void *); | ||
2674 | *p = 0x56; | ||
2675 | printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", | ||
2676 | p); | ||
2677 | printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); | ||
2678 | validate_slab_cache(kmalloc_caches + 6); | ||
2679 | |||
2680 | printk(KERN_ERR "\nB. Corruption after free\n"); | ||
2681 | p = kzalloc(128, GFP_KERNEL); | ||
2682 | kfree(p); | ||
2683 | *p = 0x78; | ||
2684 | printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); | ||
2685 | validate_slab_cache(kmalloc_caches + 7); | ||
2686 | |||
2687 | p = kzalloc(256, GFP_KERNEL); | ||
2688 | kfree(p); | ||
2689 | p[50] = 0x9a; | ||
2690 | printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); | ||
2691 | validate_slab_cache(kmalloc_caches + 8); | ||
2692 | |||
2693 | p = kzalloc(512, GFP_KERNEL); | ||
2694 | kfree(p); | ||
2695 | p[512] = 0xab; | ||
2696 | printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); | ||
2697 | validate_slab_cache(kmalloc_caches + 9); | ||
2698 | } | ||
2699 | #else | ||
2700 | static void resiliency_test(void) {}; | ||
2701 | #endif | ||
2702 | |||
2705 | /* | 2703 | /* |
2706 | * Generate lists of locations where slabcache objects are allocated | 2704 | * Generate lists of code addresses where slabcache objects are allocated |
2707 | * and freed. | 2705 | * and freed. |
2708 | */ | 2706 | */ |
2709 | 2707 | ||
2710 | struct location { | 2708 | struct location { |
2711 | unsigned long count; | 2709 | unsigned long count; |
2712 | void *addr; | 2710 | void *addr; |
2711 | long long sum_time; | ||
2712 | long min_time; | ||
2713 | long max_time; | ||
2714 | long min_pid; | ||
2715 | long max_pid; | ||
2716 | cpumask_t cpus; | ||
2717 | nodemask_t nodes; | ||
2713 | }; | 2718 | }; |
2714 | 2719 | ||
2715 | struct loc_track { | 2720 | struct loc_track { |
@@ -2750,11 +2755,12 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max) | |||
2750 | } | 2755 | } |
2751 | 2756 | ||
2752 | static int add_location(struct loc_track *t, struct kmem_cache *s, | 2757 | static int add_location(struct loc_track *t, struct kmem_cache *s, |
2753 | void *addr) | 2758 | const struct track *track) |
2754 | { | 2759 | { |
2755 | long start, end, pos; | 2760 | long start, end, pos; |
2756 | struct location *l; | 2761 | struct location *l; |
2757 | void *caddr; | 2762 | void *caddr; |
2763 | unsigned long age = jiffies - track->when; | ||
2758 | 2764 | ||
2759 | start = -1; | 2765 | start = -1; |
2760 | end = t->count; | 2766 | end = t->count; |
@@ -2770,19 +2776,36 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, | |||
2770 | break; | 2776 | break; |
2771 | 2777 | ||
2772 | caddr = t->loc[pos].addr; | 2778 | caddr = t->loc[pos].addr; |
2773 | if (addr == caddr) { | 2779 | if (track->addr == caddr) { |
2774 | t->loc[pos].count++; | 2780 | |
2781 | l = &t->loc[pos]; | ||
2782 | l->count++; | ||
2783 | if (track->when) { | ||
2784 | l->sum_time += age; | ||
2785 | if (age < l->min_time) | ||
2786 | l->min_time = age; | ||
2787 | if (age > l->max_time) | ||
2788 | l->max_time = age; | ||
2789 | |||
2790 | if (track->pid < l->min_pid) | ||
2791 | l->min_pid = track->pid; | ||
2792 | if (track->pid > l->max_pid) | ||
2793 | l->max_pid = track->pid; | ||
2794 | |||
2795 | cpu_set(track->cpu, l->cpus); | ||
2796 | } | ||
2797 | node_set(page_to_nid(virt_to_page(track)), l->nodes); | ||
2775 | return 1; | 2798 | return 1; |
2776 | } | 2799 | } |
2777 | 2800 | ||
2778 | if (addr < caddr) | 2801 | if (track->addr < caddr) |
2779 | end = pos; | 2802 | end = pos; |
2780 | else | 2803 | else |
2781 | start = pos; | 2804 | start = pos; |
2782 | } | 2805 | } |
2783 | 2806 | ||
2784 | /* | 2807 | /* |
2785 | * Not found. Insert new tracking element | 2808 | * Not found. Insert new tracking element. |
2786 | */ | 2809 | */ |
2787 | if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max)) | 2810 | if (t->count >= t->max && !alloc_loc_track(t, 2 * t->max)) |
2788 | return 0; | 2811 | return 0; |
@@ -2793,7 +2816,16 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, | |||
2793 | (t->count - pos) * sizeof(struct location)); | 2816 | (t->count - pos) * sizeof(struct location)); |
2794 | t->count++; | 2817 | t->count++; |
2795 | l->count = 1; | 2818 | l->count = 1; |
2796 | l->addr = addr; | 2819 | l->addr = track->addr; |
2820 | l->sum_time = age; | ||
2821 | l->min_time = age; | ||
2822 | l->max_time = age; | ||
2823 | l->min_pid = track->pid; | ||
2824 | l->max_pid = track->pid; | ||
2825 | cpus_clear(l->cpus); | ||
2826 | cpu_set(track->cpu, l->cpus); | ||
2827 | nodes_clear(l->nodes); | ||
2828 | node_set(page_to_nid(virt_to_page(track)), l->nodes); | ||
2797 | return 1; | 2829 | return 1; |
2798 | } | 2830 | } |
2799 | 2831 | ||
@@ -2801,19 +2833,16 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s, | |||
2801 | struct page *page, enum track_item alloc) | 2833 | struct page *page, enum track_item alloc) |
2802 | { | 2834 | { |
2803 | void *addr = page_address(page); | 2835 | void *addr = page_address(page); |
2804 | unsigned long map[BITS_TO_LONGS(s->objects)]; | 2836 | DECLARE_BITMAP(map, s->objects); |
2805 | void *p; | 2837 | void *p; |
2806 | 2838 | ||
2807 | bitmap_zero(map, s->objects); | 2839 | bitmap_zero(map, s->objects); |
2808 | for (p = page->freelist; p; p = get_freepointer(s, p)) | 2840 | for_each_free_object(p, s, page->freelist) |
2809 | set_bit((p - addr) / s->size, map); | 2841 | set_bit(slab_index(p, s, addr), map); |
2810 | |||
2811 | for (p = addr; p < addr + s->objects * s->size; p += s->size) | ||
2812 | if (!test_bit((p - addr) / s->size, map)) { | ||
2813 | void *addr = get_track(s, p, alloc)->addr; | ||
2814 | 2842 | ||
2815 | add_location(t, s, addr); | 2843 | for_each_object(p, s, addr) |
2816 | } | 2844 | if (!test_bit(slab_index(p, s, addr), map)) |
2845 | add_location(t, s, get_track(s, p, alloc)); | ||
2817 | } | 2846 | } |
2818 | 2847 | ||
2819 | static int list_locations(struct kmem_cache *s, char *buf, | 2848 | static int list_locations(struct kmem_cache *s, char *buf, |
@@ -2847,15 +2876,47 @@ static int list_locations(struct kmem_cache *s, char *buf, | |||
2847 | } | 2876 | } |
2848 | 2877 | ||
2849 | for (i = 0; i < t.count; i++) { | 2878 | for (i = 0; i < t.count; i++) { |
2850 | void *addr = t.loc[i].addr; | 2879 | struct location *l = &t.loc[i]; |
2851 | 2880 | ||
2852 | if (n > PAGE_SIZE - 100) | 2881 | if (n > PAGE_SIZE - 100) |
2853 | break; | 2882 | break; |
2854 | n += sprintf(buf + n, "%7ld ", t.loc[i].count); | 2883 | n += sprintf(buf + n, "%7ld ", l->count); |
2855 | if (addr) | 2884 | |
2856 | n += sprint_symbol(buf + n, (unsigned long)t.loc[i].addr); | 2885 | if (l->addr) |
2886 | n += sprint_symbol(buf + n, (unsigned long)l->addr); | ||
2857 | else | 2887 | else |
2858 | n += sprintf(buf + n, "<not-available>"); | 2888 | n += sprintf(buf + n, "<not-available>"); |
2889 | |||
2890 | if (l->sum_time != l->min_time) { | ||
2891 | unsigned long remainder; | ||
2892 | |||
2893 | n += sprintf(buf + n, " age=%ld/%ld/%ld", | ||
2894 | l->min_time, | ||
2895 | div_long_long_rem(l->sum_time, l->count, &remainder), | ||
2896 | l->max_time); | ||
2897 | } else | ||
2898 | n += sprintf(buf + n, " age=%ld", | ||
2899 | l->min_time); | ||
2900 | |||
2901 | if (l->min_pid != l->max_pid) | ||
2902 | n += sprintf(buf + n, " pid=%ld-%ld", | ||
2903 | l->min_pid, l->max_pid); | ||
2904 | else | ||
2905 | n += sprintf(buf + n, " pid=%ld", | ||
2906 | l->min_pid); | ||
2907 | |||
2908 | if (num_online_cpus() > 1 && !cpus_empty(l->cpus)) { | ||
2909 | n += sprintf(buf + n, " cpus="); | ||
2910 | n += cpulist_scnprintf(buf + n, PAGE_SIZE - n - 50, | ||
2911 | l->cpus); | ||
2912 | } | ||
2913 | |||
2914 | if (num_online_nodes() > 1 && !nodes_empty(l->nodes)) { | ||
2915 | n += sprintf(buf + n, " nodes="); | ||
2916 | n += nodelist_scnprintf(buf + n, PAGE_SIZE - n - 50, | ||
2917 | l->nodes); | ||
2918 | } | ||
2919 | |||
2859 | n += sprintf(buf + n, "\n"); | 2920 | n += sprintf(buf + n, "\n"); |
2860 | } | 2921 | } |
2861 | 2922 | ||
@@ -3491,6 +3552,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) | |||
3491 | 3552 | ||
3492 | static int __init slab_sysfs_init(void) | 3553 | static int __init slab_sysfs_init(void) |
3493 | { | 3554 | { |
3555 | struct list_head *h; | ||
3494 | int err; | 3556 | int err; |
3495 | 3557 | ||
3496 | err = subsystem_register(&slab_subsys); | 3558 | err = subsystem_register(&slab_subsys); |
@@ -3499,7 +3561,15 @@ static int __init slab_sysfs_init(void) | |||
3499 | return -ENOSYS; | 3561 | return -ENOSYS; |
3500 | } | 3562 | } |
3501 | 3563 | ||
3502 | finish_bootstrap(); | 3564 | slab_state = SYSFS; |
3565 | |||
3566 | list_for_each(h, &slab_caches) { | ||
3567 | struct kmem_cache *s = | ||
3568 | container_of(h, struct kmem_cache, list); | ||
3569 | |||
3570 | err = sysfs_slab_add(s); | ||
3571 | BUG_ON(err); | ||
3572 | } | ||
3503 | 3573 | ||
3504 | while (alias_list) { | 3574 | while (alias_list) { |
3505 | struct saved_alias *al = alias_list; | 3575 | struct saved_alias *al = alias_list; |
@@ -3515,6 +3585,4 @@ static int __init slab_sysfs_init(void) | |||
3515 | } | 3585 | } |
3516 | 3586 | ||
3517 | __initcall(slab_sysfs_init); | 3587 | __initcall(slab_sysfs_init); |
3518 | #else | ||
3519 | __initcall(finish_bootstrap); | ||
3520 | #endif | 3588 | #endif |