about summary refs log tree commit diff stats
path: root/mm/slub.c
diff options
context:
space:
mode:
author Jonathan Herman <hermanjl@cs.unc.edu> 2013-01-17 16:15:55 -0500
committer Jonathan Herman <hermanjl@cs.unc.edu> 2013-01-17 16:15:55 -0500
commit 8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /mm/slub.c
parent 406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'mm/slub.c')
-rw-r--r-- mm/slub.c 1598
1 files changed, 711 insertions, 887 deletions
diff --git a/mm/slub.c b/mm/slub.c
index ba2ca53f6c3..f73234db904 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -16,7 +16,6 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include "slab.h"
20#include <linux/proc_fs.h> 19#include <linux/proc_fs.h>
21#include <linux/seq_file.h> 20#include <linux/seq_file.h>
22#include <linux/kmemcheck.h> 21#include <linux/kmemcheck.h>
@@ -30,22 +29,18 @@
30#include <linux/math64.h> 29#include <linux/math64.h>
31#include <linux/fault-inject.h> 30#include <linux/fault-inject.h>
32#include <linux/stacktrace.h> 31#include <linux/stacktrace.h>
33#include <linux/prefetch.h>
34#include <linux/memcontrol.h>
35 32
36#include <trace/events/kmem.h> 33#include <trace/events/kmem.h>
37 34
38#include "internal.h"
39
40/* 35/*
41 * Lock order: 36 * Lock order:
42 * 1. slab_mutex (Global Mutex) 37 * 1. slub_lock (Global Semaphore)
43 * 2. node->list_lock 38 * 2. node->list_lock
44 * 3. slab_lock(page) (Only on some arches and for debugging) 39 * 3. slab_lock(page) (Only on some arches and for debugging)
45 * 40 *
46 * slab_mutex 41 * slub_lock
47 * 42 *
48 * The role of the slab_mutex is to protect the list of all the slabs 43 * The role of the slub_lock is to protect the list of all the slabs
49 * and to synchronize major metadata changes to slab cache structures. 44 * and to synchronize major metadata changes to slab cache structures.
50 * 45 *
51 * The slab_lock is only used for debugging and on arches that do not 46 * The slab_lock is only used for debugging and on arches that do not
@@ -113,6 +108,9 @@
113 * the fast path and disables lockless freelists. 108 * the fast path and disables lockless freelists.
114 */ 109 */
115 110
111#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
112 SLAB_TRACE | SLAB_DEBUG_FREE)
113
116static inline int kmem_cache_debug(struct kmem_cache *s) 114static inline int kmem_cache_debug(struct kmem_cache *s)
117{ 115{
118#ifdef CONFIG_SLUB_DEBUG 116#ifdef CONFIG_SLUB_DEBUG
@@ -177,10 +175,23 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
177#define __OBJECT_POISON 0x80000000UL /* Poison object */ 175#define __OBJECT_POISON 0x80000000UL /* Poison object */
178#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ 176#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
179 177
178static int kmem_size = sizeof(struct kmem_cache);
179
180#ifdef CONFIG_SMP 180#ifdef CONFIG_SMP
181static struct notifier_block slab_notifier; 181static struct notifier_block slab_notifier;
182#endif 182#endif
183 183
184static enum {
185 DOWN, /* No slab functionality available */
186 PARTIAL, /* Kmem_cache_node works */
187 UP, /* Everything works but does not show up in sysfs */
188 SYSFS /* Sysfs up */
189} slab_state = DOWN;
190
191/* A list of all slab caches on the system */
192static DECLARE_RWSEM(slub_lock);
193static LIST_HEAD(slab_caches);
194
184/* 195/*
185 * Tracking user of a slab. 196 * Tracking user of a slab.
186 */ 197 */
@@ -201,14 +212,17 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
201static int sysfs_slab_add(struct kmem_cache *); 212static int sysfs_slab_add(struct kmem_cache *);
202static int sysfs_slab_alias(struct kmem_cache *, const char *); 213static int sysfs_slab_alias(struct kmem_cache *, const char *);
203static void sysfs_slab_remove(struct kmem_cache *); 214static void sysfs_slab_remove(struct kmem_cache *);
204static void memcg_propagate_slab_attrs(struct kmem_cache *s); 215
205#else 216#else
206static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } 217static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
207static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) 218static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
208 { return 0; } 219 { return 0; }
209static inline void sysfs_slab_remove(struct kmem_cache *s) { } 220static inline void sysfs_slab_remove(struct kmem_cache *s)
221{
222 kfree(s->name);
223 kfree(s);
224}
210 225
211static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
212#endif 226#endif
213 227
214static inline void stat(const struct kmem_cache *s, enum stat_item si) 228static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -222,6 +236,11 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si)
222 * Core slab cache functions 236 * Core slab cache functions
223 *******************************************************************/ 237 *******************************************************************/
224 238
239int slab_is_available(void)
240{
241 return slab_state >= UP;
242}
243
225static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) 244static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
226{ 245{
227 return s->node[node]; 246 return s->node[node];
@@ -250,11 +269,6 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
250 return *(void **)(object + s->offset); 269 return *(void **)(object + s->offset);
251} 270}
252 271
253static void prefetch_freepointer(const struct kmem_cache *s, void *object)
254{
255 prefetch(object + s->offset);
256}
257
258static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) 272static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
259{ 273{
260 void *p; 274 void *p;
@@ -291,7 +305,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
291 * and whatever may come after it. 305 * and whatever may come after it.
292 */ 306 */
293 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) 307 if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
294 return s->object_size; 308 return s->objsize;
295 309
296#endif 310#endif
297 /* 311 /*
@@ -352,10 +366,9 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page
352 const char *n) 366 const char *n)
353{ 367{
354 VM_BUG_ON(!irqs_disabled()); 368 VM_BUG_ON(!irqs_disabled());
355#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 369#ifdef CONFIG_CMPXCHG_DOUBLE
356 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
357 if (s->flags & __CMPXCHG_DOUBLE) { 370 if (s->flags & __CMPXCHG_DOUBLE) {
358 if (cmpxchg_double(&page->freelist, &page->counters, 371 if (cmpxchg_double(&page->freelist,
359 freelist_old, counters_old, 372 freelist_old, counters_old,
360 freelist_new, counters_new)) 373 freelist_new, counters_new))
361 return 1; 374 return 1;
@@ -387,10 +400,9 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
387 void *freelist_new, unsigned long counters_new, 400 void *freelist_new, unsigned long counters_new,
388 const char *n) 401 const char *n)
389{ 402{
390#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 403#ifdef CONFIG_CMPXCHG_DOUBLE
391 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
392 if (s->flags & __CMPXCHG_DOUBLE) { 404 if (s->flags & __CMPXCHG_DOUBLE) {
393 if (cmpxchg_double(&page->freelist, &page->counters, 405 if (cmpxchg_double(&page->freelist,
394 freelist_old, counters_old, 406 freelist_old, counters_old,
395 freelist_new, counters_new)) 407 freelist_new, counters_new))
396 return 1; 408 return 1;
@@ -455,8 +467,34 @@ static int disable_higher_order_debug;
455 */ 467 */
456static void print_section(char *text, u8 *addr, unsigned int length) 468static void print_section(char *text, u8 *addr, unsigned int length)
457{ 469{
458 print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, 470 int i, offset;
459 length, 1); 471 int newline = 1;
472 char ascii[17];
473
474 ascii[16] = 0;
475
476 for (i = 0; i < length; i++) {
477 if (newline) {
478 printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
479 newline = 0;
480 }
481 printk(KERN_CONT " %02x", addr[i]);
482 offset = i % 16;
483 ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
484 if (offset == 15) {
485 printk(KERN_CONT " %s\n", ascii);
486 newline = 1;
487 }
488 }
489 if (!newline) {
490 i %= 16;
491 while (i < 16) {
492 printk(KERN_CONT " ");
493 ascii[i] = ' ';
494 i++;
495 }
496 printk(KERN_CONT " %s\n", ascii);
497 }
460} 498}
461 499
462static struct track *get_track(struct kmem_cache *s, void *object, 500static struct track *get_track(struct kmem_cache *s, void *object,
@@ -558,11 +596,9 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...)
558 va_end(args); 596 va_end(args);
559 printk(KERN_ERR "========================================" 597 printk(KERN_ERR "========================================"
560 "=====================================\n"); 598 "=====================================\n");
561 printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); 599 printk(KERN_ERR "BUG %s: %s\n", s->name, buf);
562 printk(KERN_ERR "----------------------------------------" 600 printk(KERN_ERR "----------------------------------------"
563 "-------------------------------------\n\n"); 601 "-------------------------------------\n\n");
564
565 add_taint(TAINT_BAD_PAGE);
566} 602}
567 603
568static void slab_fix(struct kmem_cache *s, char *fmt, ...) 604static void slab_fix(struct kmem_cache *s, char *fmt, ...)
@@ -589,13 +625,13 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
589 p, p - addr, get_freepointer(s, p)); 625 p, p - addr, get_freepointer(s, p));
590 626
591 if (p > addr + 16) 627 if (p > addr + 16)
592 print_section("Bytes b4 ", p - 16, 16); 628 print_section("Bytes b4", p - 16, 16);
629
630 print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE));
593 631
594 print_section("Object ", p, min_t(unsigned long, s->object_size,
595 PAGE_SIZE));
596 if (s->flags & SLAB_RED_ZONE) 632 if (s->flags & SLAB_RED_ZONE)
597 print_section("Redzone ", p + s->object_size, 633 print_section("Redzone", p + s->objsize,
598 s->inuse - s->object_size); 634 s->inuse - s->objsize);
599 635
600 if (s->offset) 636 if (s->offset)
601 off = s->offset + sizeof(void *); 637 off = s->offset + sizeof(void *);
@@ -607,7 +643,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
607 643
608 if (off != s->size) 644 if (off != s->size)
609 /* Beginning of the filler is the free pointer */ 645 /* Beginning of the filler is the free pointer */
610 print_section("Padding ", p + off, s->size - off); 646 print_section("Padding", p + off, s->size - off);
611 647
612 dump_stack(); 648 dump_stack();
613} 649}
@@ -619,7 +655,7 @@ static void object_err(struct kmem_cache *s, struct page *page,
619 print_trailer(s, page, object); 655 print_trailer(s, page, object);
620} 656}
621 657
622static void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...) 658static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
623{ 659{
624 va_list args; 660 va_list args;
625 char buf[100]; 661 char buf[100];
@@ -637,12 +673,55 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
637 u8 *p = object; 673 u8 *p = object;
638 674
639 if (s->flags & __OBJECT_POISON) { 675 if (s->flags & __OBJECT_POISON) {
640 memset(p, POISON_FREE, s->object_size - 1); 676 memset(p, POISON_FREE, s->objsize - 1);
641 p[s->object_size - 1] = POISON_END; 677 p[s->objsize - 1] = POISON_END;
642 } 678 }
643 679
644 if (s->flags & SLAB_RED_ZONE) 680 if (s->flags & SLAB_RED_ZONE)
645 memset(p + s->object_size, val, s->inuse - s->object_size); 681 memset(p + s->objsize, val, s->inuse - s->objsize);
682}
683
684static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes)
685{
686 while (bytes) {
687 if (*start != value)
688 return start;
689 start++;
690 bytes--;
691 }
692 return NULL;
693}
694
695static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes)
696{
697 u64 value64;
698 unsigned int words, prefix;
699
700 if (bytes <= 16)
701 return check_bytes8(start, value, bytes);
702
703 value64 = value | value << 8 | value << 16 | value << 24;
704 value64 = (value64 & 0xffffffff) | value64 << 32;
705 prefix = 8 - ((unsigned long)start) % 8;
706
707 if (prefix) {
708 u8 *r = check_bytes8(start, value, prefix);
709 if (r)
710 return r;
711 start += prefix;
712 bytes -= prefix;
713 }
714
715 words = bytes / 8;
716
717 while (words) {
718 if (*(u64 *)start != value64)
719 return check_bytes8(start, value, 8);
720 start += 8;
721 words--;
722 }
723
724 return check_bytes8(start, value, bytes % 8);
646} 725}
647 726
648static void restore_bytes(struct kmem_cache *s, char *message, u8 data, 727static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
@@ -659,7 +738,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
659 u8 *fault; 738 u8 *fault;
660 u8 *end; 739 u8 *end;
661 740
662 fault = memchr_inv(start, value, bytes); 741 fault = check_bytes(start, value, bytes);
663 if (!fault) 742 if (!fault)
664 return 1; 743 return 1;
665 744
@@ -687,10 +766,10 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
687 * Poisoning uses 0x6b (POISON_FREE) and the last byte is 766 * Poisoning uses 0x6b (POISON_FREE) and the last byte is
688 * 0xa5 (POISON_END) 767 * 0xa5 (POISON_END)
689 * 768 *
690 * object + s->object_size 769 * object + s->objsize
691 * Padding to reach word boundary. This is also used for Redzoning. 770 * Padding to reach word boundary. This is also used for Redzoning.
692 * Padding is extended by another word if Redzoning is enabled and 771 * Padding is extended by another word if Redzoning is enabled and
693 * object_size == inuse. 772 * objsize == inuse.
694 * 773 *
695 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with 774 * We fill with 0xbb (RED_INACTIVE) for inactive objects and with
696 * 0xcc (RED_ACTIVE) for objects in use. 775 * 0xcc (RED_ACTIVE) for objects in use.
@@ -709,7 +788,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
709 * object + s->size 788 * object + s->size
710 * Nothing is used beyond s->size. 789 * Nothing is used beyond s->size.
711 * 790 *
712 * If slabcaches are merged then the object_size and inuse boundaries are mostly 791 * If slabcaches are merged then the objsize and inuse boundaries are mostly
713 * ignored. And therefore no slab options that rely on these boundaries 792 * ignored. And therefore no slab options that rely on these boundaries
714 * may be used with merged slabcaches. 793 * may be used with merged slabcaches.
715 */ 794 */
@@ -752,14 +831,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
752 if (!remainder) 831 if (!remainder)
753 return 1; 832 return 1;
754 833
755 fault = memchr_inv(end - remainder, POISON_INUSE, remainder); 834 fault = check_bytes(end - remainder, POISON_INUSE, remainder);
756 if (!fault) 835 if (!fault)
757 return 1; 836 return 1;
758 while (end > fault && end[-1] == POISON_INUSE) 837 while (end > fault && end[-1] == POISON_INUSE)
759 end--; 838 end--;
760 839
761 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); 840 slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
762 print_section("Padding ", end - remainder, remainder); 841 print_section("Padding", end - remainder, remainder);
763 842
764 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); 843 restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
765 return 0; 844 return 0;
@@ -769,25 +848,25 @@ static int check_object(struct kmem_cache *s, struct page *page,
769 void *object, u8 val) 848 void *object, u8 val)
770{ 849{
771 u8 *p = object; 850 u8 *p = object;
772 u8 *endobject = object + s->object_size; 851 u8 *endobject = object + s->objsize;
773 852
774 if (s->flags & SLAB_RED_ZONE) { 853 if (s->flags & SLAB_RED_ZONE) {
775 if (!check_bytes_and_report(s, page, object, "Redzone", 854 if (!check_bytes_and_report(s, page, object, "Redzone",
776 endobject, val, s->inuse - s->object_size)) 855 endobject, val, s->inuse - s->objsize))
777 return 0; 856 return 0;
778 } else { 857 } else {
779 if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { 858 if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
780 check_bytes_and_report(s, page, p, "Alignment padding", 859 check_bytes_and_report(s, page, p, "Alignment padding",
781 endobject, POISON_INUSE, s->inuse - s->object_size); 860 endobject, POISON_INUSE, s->inuse - s->objsize);
782 } 861 }
783 } 862 }
784 863
785 if (s->flags & SLAB_POISON) { 864 if (s->flags & SLAB_POISON) {
786 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && 865 if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
787 (!check_bytes_and_report(s, page, p, "Poison", p, 866 (!check_bytes_and_report(s, page, p, "Poison", p,
788 POISON_FREE, s->object_size - 1) || 867 POISON_FREE, s->objsize - 1) ||
789 !check_bytes_and_report(s, page, p, "Poison", 868 !check_bytes_and_report(s, page, p, "Poison",
790 p + s->object_size - 1, POISON_END, 1))) 869 p + s->objsize - 1, POISON_END, 1)))
791 return 0; 870 return 0;
792 /* 871 /*
793 * check_pad_bytes cleans up on its own. 872 * check_pad_bytes cleans up on its own.
@@ -908,7 +987,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
908 page->freelist); 987 page->freelist);
909 988
910 if (!alloc) 989 if (!alloc)
911 print_section("Object ", (void *)object, s->object_size); 990 print_section("Object", (void *)object, s->objsize);
912 991
913 dump_stack(); 992 dump_stack();
914 } 993 }
@@ -924,14 +1003,14 @@ static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
924 lockdep_trace_alloc(flags); 1003 lockdep_trace_alloc(flags);
925 might_sleep_if(flags & __GFP_WAIT); 1004 might_sleep_if(flags & __GFP_WAIT);
926 1005
927 return should_failslab(s->object_size, flags, s->flags); 1006 return should_failslab(s->objsize, flags, s->flags);
928} 1007}
929 1008
930static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) 1009static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
931{ 1010{
932 flags &= gfp_allowed_mask; 1011 flags &= gfp_allowed_mask;
933 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); 1012 kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
934 kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); 1013 kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
935} 1014}
936 1015
937static inline void slab_free_hook(struct kmem_cache *s, void *x) 1016static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -948,13 +1027,13 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
948 unsigned long flags; 1027 unsigned long flags;
949 1028
950 local_irq_save(flags); 1029 local_irq_save(flags);
951 kmemcheck_slab_free(s, x, s->object_size); 1030 kmemcheck_slab_free(s, x, s->objsize);
952 debug_check_no_locks_freed(x, s->object_size); 1031 debug_check_no_locks_freed(x, s->objsize);
953 local_irq_restore(flags); 1032 local_irq_restore(flags);
954 } 1033 }
955#endif 1034#endif
956 if (!(s->flags & SLAB_DEBUG_OBJECTS)) 1035 if (!(s->flags & SLAB_DEBUG_OBJECTS))
957 debug_check_no_obj_freed(x, s->object_size); 1036 debug_check_no_obj_freed(x, s->objsize);
958} 1037}
959 1038
960/* 1039/*
@@ -1064,13 +1143,13 @@ bad:
1064 return 0; 1143 return 0;
1065} 1144}
1066 1145
1067static noinline struct kmem_cache_node *free_debug_processing( 1146static noinline int free_debug_processing(struct kmem_cache *s,
1068 struct kmem_cache *s, struct page *page, void *object, 1147 struct page *page, void *object, unsigned long addr)
1069 unsigned long addr, unsigned long *flags)
1070{ 1148{
1071 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1149 unsigned long flags;
1150 int rc = 0;
1072 1151
1073 spin_lock_irqsave(&n->list_lock, *flags); 1152 local_irq_save(flags);
1074 slab_lock(page); 1153 slab_lock(page);
1075 1154
1076 if (!check_slab(s, page)) 1155 if (!check_slab(s, page))
@@ -1089,11 +1168,11 @@ static noinline struct kmem_cache_node *free_debug_processing(
1089 if (!check_object(s, page, object, SLUB_RED_ACTIVE)) 1168 if (!check_object(s, page, object, SLUB_RED_ACTIVE))
1090 goto out; 1169 goto out;
1091 1170
1092 if (unlikely(s != page->slab_cache)) { 1171 if (unlikely(s != page->slab)) {
1093 if (!PageSlab(page)) { 1172 if (!PageSlab(page)) {
1094 slab_err(s, page, "Attempt to free object(0x%p) " 1173 slab_err(s, page, "Attempt to free object(0x%p) "
1095 "outside of slab", object); 1174 "outside of slab", object);
1096 } else if (!page->slab_cache) { 1175 } else if (!page->slab) {
1097 printk(KERN_ERR 1176 printk(KERN_ERR
1098 "SLUB <none>: no slab for object 0x%p.\n", 1177 "SLUB <none>: no slab for object 0x%p.\n",
1099 object); 1178 object);
@@ -1108,19 +1187,15 @@ static noinline struct kmem_cache_node *free_debug_processing(
1108 set_track(s, object, TRACK_FREE, addr); 1187 set_track(s, object, TRACK_FREE, addr);
1109 trace(s, page, object, 0); 1188 trace(s, page, object, 0);
1110 init_object(s, object, SLUB_RED_INACTIVE); 1189 init_object(s, object, SLUB_RED_INACTIVE);
1190 rc = 1;
1111out: 1191out:
1112 slab_unlock(page); 1192 slab_unlock(page);
1113 /* 1193 local_irq_restore(flags);
1114 * Keep node_lock to preserve integrity 1194 return rc;
1115 * until the object is actually freed
1116 */
1117 return n;
1118 1195
1119fail: 1196fail:
1120 slab_unlock(page);
1121 spin_unlock_irqrestore(&n->list_lock, *flags);
1122 slab_fix(s, "Object at 0x%p not freed", object); 1197 slab_fix(s, "Object at 0x%p not freed", object);
1123 return NULL; 1198 goto out;
1124} 1199}
1125 1200
1126static int __init setup_slub_debug(char *str) 1201static int __init setup_slub_debug(char *str)
@@ -1193,7 +1268,7 @@ out:
1193 1268
1194__setup("slub_debug", setup_slub_debug); 1269__setup("slub_debug", setup_slub_debug);
1195 1270
1196static unsigned long kmem_cache_flags(unsigned long object_size, 1271static unsigned long kmem_cache_flags(unsigned long objsize,
1197 unsigned long flags, const char *name, 1272 unsigned long flags, const char *name,
1198 void (*ctor)(void *)) 1273 void (*ctor)(void *))
1199{ 1274{
@@ -1213,9 +1288,8 @@ static inline void setup_object_debug(struct kmem_cache *s,
1213static inline int alloc_debug_processing(struct kmem_cache *s, 1288static inline int alloc_debug_processing(struct kmem_cache *s,
1214 struct page *page, void *object, unsigned long addr) { return 0; } 1289 struct page *page, void *object, unsigned long addr) { return 0; }
1215 1290
1216static inline struct kmem_cache_node *free_debug_processing( 1291static inline int free_debug_processing(struct kmem_cache *s,
1217 struct kmem_cache *s, struct page *page, void *object, 1292 struct page *page, void *object, unsigned long addr) { return 0; }
1218 unsigned long addr, unsigned long *flags) { return NULL; }
1219 1293
1220static inline int slab_pad_check(struct kmem_cache *s, struct page *page) 1294static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
1221 { return 1; } 1295 { return 1; }
@@ -1224,7 +1298,7 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
1224static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, 1298static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
1225 struct page *page) {} 1299 struct page *page) {}
1226static inline void remove_full(struct kmem_cache *s, struct page *page) {} 1300static inline void remove_full(struct kmem_cache *s, struct page *page) {}
1227static inline unsigned long kmem_cache_flags(unsigned long object_size, 1301static inline unsigned long kmem_cache_flags(unsigned long objsize,
1228 unsigned long flags, const char *name, 1302 unsigned long flags, const char *name,
1229 void (*ctor)(void *)) 1303 void (*ctor)(void *))
1230{ 1304{
@@ -1301,7 +1375,13 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1301 stat(s, ORDER_FALLBACK); 1375 stat(s, ORDER_FALLBACK);
1302 } 1376 }
1303 1377
1304 if (kmemcheck_enabled && page 1378 if (flags & __GFP_WAIT)
1379 local_irq_disable();
1380
1381 if (!page)
1382 return NULL;
1383
1384 if (kmemcheck_enabled
1305 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { 1385 && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
1306 int pages = 1 << oo_order(oo); 1386 int pages = 1 << oo_order(oo);
1307 1387
@@ -1317,11 +1397,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
1317 kmemcheck_mark_unallocated_pages(page, pages); 1397 kmemcheck_mark_unallocated_pages(page, pages);
1318 } 1398 }
1319 1399
1320 if (flags & __GFP_WAIT)
1321 local_irq_disable();
1322 if (!page)
1323 return NULL;
1324
1325 page->objects = oo_objects(oo); 1400 page->objects = oo_objects(oo);
1326 mod_zone_page_state(page_zone(page), 1401 mod_zone_page_state(page_zone(page),
1327 (s->flags & SLAB_RECLAIM_ACCOUNT) ? 1402 (s->flags & SLAB_RECLAIM_ACCOUNT) ?
@@ -1345,7 +1420,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1345 void *start; 1420 void *start;
1346 void *last; 1421 void *last;
1347 void *p; 1422 void *p;
1348 int order;
1349 1423
1350 BUG_ON(flags & GFP_SLAB_BUG_MASK); 1424 BUG_ON(flags & GFP_SLAB_BUG_MASK);
1351 1425
@@ -1354,18 +1428,14 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1354 if (!page) 1428 if (!page)
1355 goto out; 1429 goto out;
1356 1430
1357 order = compound_order(page);
1358 inc_slabs_node(s, page_to_nid(page), page->objects); 1431 inc_slabs_node(s, page_to_nid(page), page->objects);
1359 memcg_bind_pages(s, order); 1432 page->slab = s;
1360 page->slab_cache = s; 1433 page->flags |= 1 << PG_slab;
1361 __SetPageSlab(page);
1362 if (page->pfmemalloc)
1363 SetPageSlabPfmemalloc(page);
1364 1434
1365 start = page_address(page); 1435 start = page_address(page);
1366 1436
1367 if (unlikely(s->flags & SLAB_POISON)) 1437 if (unlikely(s->flags & SLAB_POISON))
1368 memset(start, POISON_INUSE, PAGE_SIZE << order); 1438 memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page));
1369 1439
1370 last = start; 1440 last = start;
1371 for_each_object(p, s, start, page->objects) { 1441 for_each_object(p, s, start, page->objects) {
@@ -1377,7 +1447,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
1377 set_freepointer(s, last, NULL); 1447 set_freepointer(s, last, NULL);
1378 1448
1379 page->freelist = start; 1449 page->freelist = start;
1380 page->inuse = page->objects; 1450 page->inuse = 0;
1381 page->frozen = 1; 1451 page->frozen = 1;
1382out: 1452out:
1383 return page; 1453 return page;
@@ -1404,14 +1474,11 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
1404 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, 1474 NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
1405 -pages); 1475 -pages);
1406 1476
1407 __ClearPageSlabPfmemalloc(page);
1408 __ClearPageSlab(page); 1477 __ClearPageSlab(page);
1409
1410 memcg_release_pages(s, order);
1411 reset_page_mapcount(page); 1478 reset_page_mapcount(page);
1412 if (current->reclaim_state) 1479 if (current->reclaim_state)
1413 current->reclaim_state->reclaimed_slab += pages; 1480 current->reclaim_state->reclaimed_slab += pages;
1414 __free_memcg_kmem_pages(page, order); 1481 __free_pages(page, order);
1415} 1482}
1416 1483
1417#define need_reserve_slab_rcu \ 1484#define need_reserve_slab_rcu \
@@ -1426,7 +1493,7 @@ static void rcu_free_slab(struct rcu_head *h)
1426 else 1493 else
1427 page = container_of((struct list_head *)h, struct page, lru); 1494 page = container_of((struct list_head *)h, struct page, lru);
1428 1495
1429 __free_slab(page->slab_cache, page); 1496 __free_slab(page->slab, page);
1430} 1497}
1431 1498
1432static void free_slab(struct kmem_cache *s, struct page *page) 1499static void free_slab(struct kmem_cache *s, struct page *page)
@@ -1467,7 +1534,7 @@ static inline void add_partial(struct kmem_cache_node *n,
1467 struct page *page, int tail) 1534 struct page *page, int tail)
1468{ 1535{
1469 n->nr_partial++; 1536 n->nr_partial++;
1470 if (tail == DEACTIVATE_TO_TAIL) 1537 if (tail)
1471 list_add_tail(&page->lru, &n->partial); 1538 list_add_tail(&page->lru, &n->partial);
1472 else 1539 else
1473 list_add(&page->lru, &n->partial); 1540 list_add(&page->lru, &n->partial);
@@ -1484,16 +1551,13 @@ static inline void remove_partial(struct kmem_cache_node *n,
1484} 1551}
1485 1552
1486/* 1553/*
1487 * Remove slab from the partial list, freeze it and 1554 * Lock slab, remove from the partial list and put the object into the
1488 * return the pointer to the freelist. 1555 * per cpu freelist.
1489 * 1556 *
1490 * Returns a list of objects or NULL if it fails. 1557 * Must hold list_lock.
1491 *
1492 * Must hold list_lock since we modify the partial list.
1493 */ 1558 */
1494static inline void *acquire_slab(struct kmem_cache *s, 1559static inline int acquire_slab(struct kmem_cache *s,
1495 struct kmem_cache_node *n, struct page *page, 1560 struct kmem_cache_node *n, struct page *page)
1496 int mode)
1497{ 1561{
1498 void *freelist; 1562 void *freelist;
1499 unsigned long counters; 1563 unsigned long counters;
@@ -1504,41 +1568,47 @@ static inline void *acquire_slab(struct kmem_cache *s,
1504 * The old freelist is the list of objects for the 1568 * The old freelist is the list of objects for the
1505 * per cpu allocation list. 1569 * per cpu allocation list.
1506 */ 1570 */
1507 freelist = page->freelist; 1571 do {
1508 counters = page->counters; 1572 freelist = page->freelist;
1509 new.counters = counters; 1573 counters = page->counters;
1510 if (mode) { 1574 new.counters = counters;
1511 new.inuse = page->objects; 1575 new.inuse = page->objects;
1512 new.freelist = NULL;
1513 } else {
1514 new.freelist = freelist;
1515 }
1516 1576
1517 VM_BUG_ON(new.frozen); 1577 VM_BUG_ON(new.frozen);
1518 new.frozen = 1; 1578 new.frozen = 1;
1519 1579
1520 if (!__cmpxchg_double_slab(s, page, 1580 } while (!__cmpxchg_double_slab(s, page,
1521 freelist, counters, 1581 freelist, counters,
1522 new.freelist, new.counters, 1582 NULL, new.counters,
1523 "acquire_slab")) 1583 "lock and freeze"));
1524 return NULL;
1525 1584
1526 remove_partial(n, page); 1585 remove_partial(n, page);
1527 WARN_ON(!freelist);
1528 return freelist;
1529}
1530 1586
1531static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); 1587 if (freelist) {
1532static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags); 1588 /* Populate the per cpu freelist */
1589 this_cpu_write(s->cpu_slab->freelist, freelist);
1590 this_cpu_write(s->cpu_slab->page, page);
1591 this_cpu_write(s->cpu_slab->node, page_to_nid(page));
1592 return 1;
1593 } else {
1594 /*
1595 * Slab page came from the wrong list. No object to allocate
1596 * from. Put it onto the correct list and continue partial
1597 * scan.
1598 */
1599 printk(KERN_ERR "SLUB: %s : Page without available objects on"
1600 " partial list\n", s->name);
1601 return 0;
1602 }
1603}
1533 1604
1534/* 1605/*
1535 * Try to allocate a partial slab from a specific node. 1606 * Try to allocate a partial slab from a specific node.
1536 */ 1607 */
1537static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, 1608static struct page *get_partial_node(struct kmem_cache *s,
1538 struct kmem_cache_cpu *c, gfp_t flags) 1609 struct kmem_cache_node *n)
1539{ 1610{
1540 struct page *page, *page2; 1611 struct page *page;
1541 void *object = NULL;
1542 1612
1543 /* 1613 /*
1544 * Racy check. If we mistakenly see no partial slabs then we 1614 * Racy check. If we mistakenly see no partial slabs then we
@@ -1550,47 +1620,26 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n,
1550 return NULL; 1620 return NULL;
1551 1621
1552 spin_lock(&n->list_lock); 1622 spin_lock(&n->list_lock);
1553 list_for_each_entry_safe(page, page2, &n->partial, lru) { 1623 list_for_each_entry(page, &n->partial, lru)
1554 void *t; 1624 if (acquire_slab(s, n, page))
1555 int available; 1625 goto out;
1556 1626 page = NULL;
1557 if (!pfmemalloc_match(page, flags)) 1627out:
1558 continue;
1559
1560 t = acquire_slab(s, n, page, object == NULL);
1561 if (!t)
1562 break;
1563
1564 if (!object) {
1565 c->page = page;
1566 stat(s, ALLOC_FROM_PARTIAL);
1567 object = t;
1568 available = page->objects - page->inuse;
1569 } else {
1570 available = put_cpu_partial(s, page, 0);
1571 stat(s, CPU_PARTIAL_NODE);
1572 }
1573 if (kmem_cache_debug(s) || available > s->cpu_partial / 2)
1574 break;
1575
1576 }
1577 spin_unlock(&n->list_lock); 1628 spin_unlock(&n->list_lock);
1578 return object; 1629 return page;
1579} 1630}
1580 1631
1581/* 1632/*
1582 * Get a page from somewhere. Search in increasing NUMA distances. 1633 * Get a page from somewhere. Search in increasing NUMA distances.
1583 */ 1634 */
1584static void *get_any_partial(struct kmem_cache *s, gfp_t flags, 1635static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1585 struct kmem_cache_cpu *c)
1586{ 1636{
1587#ifdef CONFIG_NUMA 1637#ifdef CONFIG_NUMA
1588 struct zonelist *zonelist; 1638 struct zonelist *zonelist;
1589 struct zoneref *z; 1639 struct zoneref *z;
1590 struct zone *zone; 1640 struct zone *zone;
1591 enum zone_type high_zoneidx = gfp_zone(flags); 1641 enum zone_type high_zoneidx = gfp_zone(flags);
1592 void *object; 1642 struct page *page;
1593 unsigned int cpuset_mems_cookie;
1594 1643
1595 /* 1644 /*
1596 * The defrag ratio allows a configuration of the tradeoffs between 1645 * The defrag ratio allows a configuration of the tradeoffs between
@@ -1614,32 +1663,23 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1614 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1663 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1615 return NULL; 1664 return NULL;
1616 1665
1617 do { 1666 get_mems_allowed();
1618 cpuset_mems_cookie = get_mems_allowed(); 1667 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1619 zonelist = node_zonelist(slab_node(), flags); 1668 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1620 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1669 struct kmem_cache_node *n;
1621 struct kmem_cache_node *n; 1670
1622 1671 n = get_node(s, zone_to_nid(zone));
1623 n = get_node(s, zone_to_nid(zone)); 1672
1624 1673 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1625 if (n && cpuset_zone_allowed_hardwall(zone, flags) && 1674 n->nr_partial > s->min_partial) {
1626 n->nr_partial > s->min_partial) { 1675 page = get_partial_node(s, n);
1627 object = get_partial_node(s, n, c, flags); 1676 if (page) {
1628 if (object) { 1677 put_mems_allowed();
1629 /* 1678 return page;
1630 * Return the object even if
1631 * put_mems_allowed indicated that
1632 * the cpuset mems_allowed was
1633 * updated in parallel. It's a
1634 * harmless race between the alloc
1635 * and the cpuset update.
1636 */
1637 put_mems_allowed(cpuset_mems_cookie);
1638 return object;
1639 }
1640 } 1679 }
1641 } 1680 }
1642 } while (!put_mems_allowed(cpuset_mems_cookie)); 1681 }
1682 put_mems_allowed();
1643#endif 1683#endif
1644 return NULL; 1684 return NULL;
1645} 1685}
@@ -1647,17 +1687,16 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
1647/* 1687/*
1648 * Get a partial page, lock it and return it. 1688 * Get a partial page, lock it and return it.
1649 */ 1689 */
1650static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, 1690static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
1651 struct kmem_cache_cpu *c)
1652{ 1691{
1653 void *object; 1692 struct page *page;
1654 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; 1693 int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
1655 1694
1656 object = get_partial_node(s, get_node(s, searchnode), c, flags); 1695 page = get_partial_node(s, get_node(s, searchnode));
1657 if (object || node != NUMA_NO_NODE) 1696 if (page || node != NUMA_NO_NODE)
1658 return object; 1697 return page;
1659 1698
1660 return get_any_partial(s, flags, c); 1699 return get_any_partial(s, flags);
1661} 1700}
1662 1701
1663#ifdef CONFIG_PREEMPT 1702#ifdef CONFIG_PREEMPT
@@ -1719,33 +1758,43 @@ static inline void note_cmpxchg_failure(const char *n,
1719 stat(s, CMPXCHG_DOUBLE_CPU_FAIL); 1758 stat(s, CMPXCHG_DOUBLE_CPU_FAIL);
1720} 1759}
1721 1760
1722static void init_kmem_cache_cpus(struct kmem_cache *s) 1761void init_kmem_cache_cpus(struct kmem_cache *s)
1723{ 1762{
1724 int cpu; 1763 int cpu;
1725 1764
1726 for_each_possible_cpu(cpu) 1765 for_each_possible_cpu(cpu)
1727 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); 1766 per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
1728} 1767}
1768/*
1769 * Remove the cpu slab
1770 */
1729 1771
1730/* 1772/*
1731 * Remove the cpu slab 1773 * Remove the cpu slab
1732 */ 1774 */
1733static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freelist) 1775static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1734{ 1776{
1735 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; 1777 enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE };
1778 struct page *page = c->page;
1736 struct kmem_cache_node *n = get_node(s, page_to_nid(page)); 1779 struct kmem_cache_node *n = get_node(s, page_to_nid(page));
1737 int lock = 0; 1780 int lock = 0;
1738 enum slab_modes l = M_NONE, m = M_NONE; 1781 enum slab_modes l = M_NONE, m = M_NONE;
1782 void *freelist;
1739 void *nextfree; 1783 void *nextfree;
1740 int tail = DEACTIVATE_TO_HEAD; 1784 int tail = 0;
1741 struct page new; 1785 struct page new;
1742 struct page old; 1786 struct page old;
1743 1787
1744 if (page->freelist) { 1788 if (page->freelist) {
1745 stat(s, DEACTIVATE_REMOTE_FREES); 1789 stat(s, DEACTIVATE_REMOTE_FREES);
1746 tail = DEACTIVATE_TO_TAIL; 1790 tail = 1;
1747 } 1791 }
1748 1792
1793 c->tid = next_tid(c->tid);
1794 c->page = NULL;
1795 freelist = c->freelist;
1796 c->freelist = NULL;
1797
1749 /* 1798 /*
1750 * Stage one: Free all available per cpu objects back 1799 * Stage one: Free all available per cpu objects back
1751 * to the page freelist while it is still frozen. Leave the 1800 * to the page freelist while it is still frozen. Leave the
@@ -1844,7 +1893,7 @@ redo:
1844 if (m == M_PARTIAL) { 1893 if (m == M_PARTIAL) {
1845 1894
1846 add_partial(n, page, tail); 1895 add_partial(n, page, tail);
1847 stat(s, tail); 1896 stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
1848 1897
1849 } else if (m == M_FULL) { 1898 } else if (m == M_FULL) {
1850 1899
@@ -1871,130 +1920,10 @@ redo:
1871 } 1920 }
1872} 1921}
1873 1922
1874/*
1875 * Unfreeze all the cpu partial slabs.
1876 *
1877 * This function must be called with interrupts disabled
1878 * for the cpu using c (or some other guarantee must be there
1879 * to guarantee no concurrent accesses).
1880 */
1881static void unfreeze_partials(struct kmem_cache *s,
1882 struct kmem_cache_cpu *c)
1883{
1884 struct kmem_cache_node *n = NULL, *n2 = NULL;
1885 struct page *page, *discard_page = NULL;
1886
1887 while ((page = c->partial)) {
1888 struct page new;
1889 struct page old;
1890
1891 c->partial = page->next;
1892
1893 n2 = get_node(s, page_to_nid(page));
1894 if (n != n2) {
1895 if (n)
1896 spin_unlock(&n->list_lock);
1897
1898 n = n2;
1899 spin_lock(&n->list_lock);
1900 }
1901
1902 do {
1903
1904 old.freelist = page->freelist;
1905 old.counters = page->counters;
1906 VM_BUG_ON(!old.frozen);
1907
1908 new.counters = old.counters;
1909 new.freelist = old.freelist;
1910
1911 new.frozen = 0;
1912
1913 } while (!__cmpxchg_double_slab(s, page,
1914 old.freelist, old.counters,
1915 new.freelist, new.counters,
1916 "unfreezing slab"));
1917
1918 if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
1919 page->next = discard_page;
1920 discard_page = page;
1921 } else {
1922 add_partial(n, page, DEACTIVATE_TO_TAIL);
1923 stat(s, FREE_ADD_PARTIAL);
1924 }
1925 }
1926
1927 if (n)
1928 spin_unlock(&n->list_lock);
1929
1930 while (discard_page) {
1931 page = discard_page;
1932 discard_page = discard_page->next;
1933
1934 stat(s, DEACTIVATE_EMPTY);
1935 discard_slab(s, page);
1936 stat(s, FREE_SLAB);
1937 }
1938}
1939
1940/*
1941 * Put a page that was just frozen (in __slab_free) into a partial page
1942 * slot if available. This is done without interrupts disabled and without
1943 * preemption disabled. The cmpxchg is racy and may put the partial page
1944 * onto a random cpus partial slot.
1945 *
1946 * If we did not find a slot then simply move all the partials to the
1947 * per node partial list.
1948 */
1949static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
1950{
1951 struct page *oldpage;
1952 int pages;
1953 int pobjects;
1954
1955 do {
1956 pages = 0;
1957 pobjects = 0;
1958 oldpage = this_cpu_read(s->cpu_slab->partial);
1959
1960 if (oldpage) {
1961 pobjects = oldpage->pobjects;
1962 pages = oldpage->pages;
1963 if (drain && pobjects > s->cpu_partial) {
1964 unsigned long flags;
1965 /*
1966 * partial array is full. Move the existing
1967 * set to the per node partial list.
1968 */
1969 local_irq_save(flags);
1970 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
1971 local_irq_restore(flags);
1972 oldpage = NULL;
1973 pobjects = 0;
1974 pages = 0;
1975 stat(s, CPU_PARTIAL_DRAIN);
1976 }
1977 }
1978
1979 pages++;
1980 pobjects += page->objects - page->inuse;
1981
1982 page->pages = pages;
1983 page->pobjects = pobjects;
1984 page->next = oldpage;
1985
1986 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
1987 return pobjects;
1988}
1989
1990static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) 1923static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
1991{ 1924{
1992 stat(s, CPUSLAB_FLUSH); 1925 stat(s, CPUSLAB_FLUSH);
1993 deactivate_slab(s, c->page, c->freelist); 1926 deactivate_slab(s, c);
1994
1995 c->tid = next_tid(c->tid);
1996 c->page = NULL;
1997 c->freelist = NULL;
1998} 1927}
1999 1928
2000/* 1929/*
@@ -2006,12 +1935,8 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
2006{ 1935{
2007 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 1936 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2008 1937
2009 if (likely(c)) { 1938 if (likely(c && c->page))
2010 if (c->page) 1939 flush_slab(s, c);
2011 flush_slab(s, c);
2012
2013 unfreeze_partials(s, c);
2014 }
2015} 1940}
2016 1941
2017static void flush_cpu_slab(void *d) 1942static void flush_cpu_slab(void *d)
@@ -2021,27 +1946,19 @@ static void flush_cpu_slab(void *d)
2021 __flush_cpu_slab(s, smp_processor_id()); 1946 __flush_cpu_slab(s, smp_processor_id());
2022} 1947}
2023 1948
2024static bool has_cpu_slab(int cpu, void *info)
2025{
2026 struct kmem_cache *s = info;
2027 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
2028
2029 return c->page || c->partial;
2030}
2031
2032static void flush_all(struct kmem_cache *s) 1949static void flush_all(struct kmem_cache *s)
2033{ 1950{
2034 on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); 1951 on_each_cpu(flush_cpu_slab, s, 1);
2035} 1952}
2036 1953
2037/* 1954/*
2038 * Check if the objects in a per cpu structure fit numa 1955 * Check if the objects in a per cpu structure fit numa
2039 * locality expectations. 1956 * locality expectations.
2040 */ 1957 */
2041static inline int node_match(struct page *page, int node) 1958static inline int node_match(struct kmem_cache_cpu *c, int node)
2042{ 1959{
2043#ifdef CONFIG_NUMA 1960#ifdef CONFIG_NUMA
2044 if (node != NUMA_NO_NODE && page_to_nid(page) != node) 1961 if (node != NUMA_NO_NODE && c->node != node)
2045 return 0; 1962 return 0;
2046#endif 1963#endif
2047 return 1; 1964 return 1;
@@ -2084,10 +2001,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2084 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", 2001 "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n",
2085 nid, gfpflags); 2002 nid, gfpflags);
2086 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " 2003 printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, "
2087 "default order: %d, min order: %d\n", s->name, s->object_size, 2004 "default order: %d, min order: %d\n", s->name, s->objsize,
2088 s->size, oo_order(s->oo), oo_order(s->min)); 2005 s->size, oo_order(s->oo), oo_order(s->min));
2089 2006
2090 if (oo_order(s->min) > get_order(s->object_size)) 2007 if (oo_order(s->min) > get_order(s->objsize))
2091 printk(KERN_WARNING " %s debugging increased min order, use " 2008 printk(KERN_WARNING " %s debugging increased min order, use "
2092 "slub_debug=O to disable.\n", s->name); 2009 "slub_debug=O to disable.\n", s->name);
2093 2010
@@ -2110,86 +2027,12 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
2110 } 2027 }
2111} 2028}
2112 2029
2113static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
2114 int node, struct kmem_cache_cpu **pc)
2115{
2116 void *freelist;
2117 struct kmem_cache_cpu *c = *pc;
2118 struct page *page;
2119
2120 freelist = get_partial(s, flags, node, c);
2121
2122 if (freelist)
2123 return freelist;
2124
2125 page = new_slab(s, flags, node);
2126 if (page) {
2127 c = __this_cpu_ptr(s->cpu_slab);
2128 if (c->page)
2129 flush_slab(s, c);
2130
2131 /*
2132 * No other reference to the page yet so we can
2133 * muck around with it freely without cmpxchg
2134 */
2135 freelist = page->freelist;
2136 page->freelist = NULL;
2137
2138 stat(s, ALLOC_SLAB);
2139 c->page = page;
2140 *pc = c;
2141 } else
2142 freelist = NULL;
2143
2144 return freelist;
2145}
2146
2147static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags)
2148{
2149 if (unlikely(PageSlabPfmemalloc(page)))
2150 return gfp_pfmemalloc_allowed(gfpflags);
2151
2152 return true;
2153}
2154
2155/*
2156 * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist
2157 * or deactivate the page.
2158 *
2159 * The page is still frozen if the return value is not NULL.
2160 *
2161 * If this function returns NULL then the page has been unfrozen.
2162 *
2163 * This function must be called with interrupt disabled.
2164 */
2165static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2166{
2167 struct page new;
2168 unsigned long counters;
2169 void *freelist;
2170
2171 do {
2172 freelist = page->freelist;
2173 counters = page->counters;
2174
2175 new.counters = counters;
2176 VM_BUG_ON(!new.frozen);
2177
2178 new.inuse = page->objects;
2179 new.frozen = freelist != NULL;
2180
2181 } while (!__cmpxchg_double_slab(s, page,
2182 freelist, counters,
2183 NULL, new.counters,
2184 "get_freelist"));
2185
2186 return freelist;
2187}
2188
2189/* 2030/*
2190 * Slow path. The lockless freelist is empty or we need to perform 2031 * Slow path. The lockless freelist is empty or we need to perform
2191 * debugging duties. 2032 * debugging duties.
2192 * 2033 *
2034 * Interrupts are disabled.
2035 *
2193 * Processing is still very fast if new objects have been freed to the 2036 * Processing is still very fast if new objects have been freed to the
2194 * regular freelist. In that case we simply take over the regular freelist 2037 * regular freelist. In that case we simply take over the regular freelist
2195 * as the lockless freelist and zap the regular freelist. 2038 * as the lockless freelist and zap the regular freelist.
@@ -2205,9 +2048,11 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page)
2205static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, 2048static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2206 unsigned long addr, struct kmem_cache_cpu *c) 2049 unsigned long addr, struct kmem_cache_cpu *c)
2207{ 2050{
2208 void *freelist; 2051 void **object;
2209 struct page *page; 2052 struct page *page;
2210 unsigned long flags; 2053 unsigned long flags;
2054 struct page new;
2055 unsigned long counters;
2211 2056
2212 local_irq_save(flags); 2057 local_irq_save(flags);
2213#ifdef CONFIG_PREEMPT 2058#ifdef CONFIG_PREEMPT
@@ -2219,41 +2064,51 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
2219 c = this_cpu_ptr(s->cpu_slab); 2064 c = this_cpu_ptr(s->cpu_slab);
2220#endif 2065#endif
2221 2066
2067 /* We handle __GFP_ZERO in the caller */
2068 gfpflags &= ~__GFP_ZERO;
2069
2222 page = c->page; 2070 page = c->page;
2223 if (!page) 2071 if (!page)
2224 goto new_slab; 2072 goto new_slab;
2225redo:
2226 2073
2227 if (unlikely(!node_match(page, node))) { 2074 if (unlikely(!node_match(c, node))) {
2228 stat(s, ALLOC_NODE_MISMATCH); 2075 stat(s, ALLOC_NODE_MISMATCH);
2229 deactivate_slab(s, page, c->freelist); 2076 deactivate_slab(s, c);
2230 c->page = NULL;
2231 c->freelist = NULL;
2232 goto new_slab;
2233 }
2234
2235 /*
2236 * By rights, we should be searching for a slab page that was
2237 * PFMEMALLOC but right now, we are losing the pfmemalloc
2238 * information when the page leaves the per-cpu allocator
2239 */
2240 if (unlikely(!pfmemalloc_match(page, gfpflags))) {
2241 deactivate_slab(s, page, c->freelist);
2242 c->page = NULL;
2243 c->freelist = NULL;
2244 goto new_slab; 2077 goto new_slab;
2245 } 2078 }
2246 2079
2247 /* must check again c->freelist in case of cpu migration or IRQ */ 2080 /* must check again c->freelist in case of cpu migration or IRQ */
2248 freelist = c->freelist; 2081 object = c->freelist;
2249 if (freelist) 2082 if (object)
2250 goto load_freelist; 2083 goto load_freelist;
2251 2084
2252 stat(s, ALLOC_SLOWPATH); 2085 stat(s, ALLOC_SLOWPATH);
2253 2086
2254 freelist = get_freelist(s, page); 2087 do {
2088 object = page->freelist;
2089 counters = page->counters;
2090 new.counters = counters;
2091 VM_BUG_ON(!new.frozen);
2092
2093 /*
2094 * If there is no object left then we use this loop to
2095 * deactivate the slab which is simple since no objects
2096 * are left in the slab and therefore we do not need to
2097 * put the page back onto the partial list.
2098 *
2099 * If there are objects left then we retrieve them
2100 * and use them to refill the per cpu queue.
2101 */
2102
2103 new.inuse = page->objects;
2104 new.frozen = object != NULL;
2105
2106 } while (!__cmpxchg_double_slab(s, page,
2107 object, counters,
2108 NULL, new.counters,
2109 "__slab_alloc"));
2255 2110
2256 if (!freelist) { 2111 if (unlikely(!object)) {
2257 c->page = NULL; 2112 c->page = NULL;
2258 stat(s, DEACTIVATE_BYPASS); 2113 stat(s, DEACTIVATE_BYPASS);
2259 goto new_slab; 2114 goto new_slab;
@@ -2262,50 +2117,61 @@ redo:
2262 stat(s, ALLOC_REFILL); 2117 stat(s, ALLOC_REFILL);
2263 2118
2264load_freelist: 2119load_freelist:
2265 /* 2120 VM_BUG_ON(!page->frozen);
2266 * freelist is pointing to the list of objects to be used. 2121 c->freelist = get_freepointer(s, object);
2267 * page is pointing to the page from which the objects are obtained.
2268 * That page must be frozen for per cpu allocations to work.
2269 */
2270 VM_BUG_ON(!c->page->frozen);
2271 c->freelist = get_freepointer(s, freelist);
2272 c->tid = next_tid(c->tid); 2122 c->tid = next_tid(c->tid);
2273 local_irq_restore(flags); 2123 local_irq_restore(flags);
2274 return freelist; 2124 return object;
2275 2125
2276new_slab: 2126new_slab:
2127 page = get_partial(s, gfpflags, node);
2128 if (page) {
2129 stat(s, ALLOC_FROM_PARTIAL);
2130 object = c->freelist;
2277 2131
2278 if (c->partial) { 2132 if (kmem_cache_debug(s))
2279 page = c->page = c->partial; 2133 goto debug;
2280 c->partial = page->next; 2134 goto load_freelist;
2281 stat(s, CPU_PARTIAL_ALLOC);
2282 c->freelist = NULL;
2283 goto redo;
2284 } 2135 }
2285 2136
2286 freelist = new_slab_objects(s, gfpflags, node, &c); 2137 page = new_slab(s, gfpflags, node);
2287 2138
2288 if (unlikely(!freelist)) { 2139 if (page) {
2289 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) 2140 c = __this_cpu_ptr(s->cpu_slab);
2290 slab_out_of_memory(s, gfpflags, node); 2141 if (c->page)
2142 flush_slab(s, c);
2291 2143
2292 local_irq_restore(flags); 2144 /*
2293 return NULL; 2145 * No other reference to the page yet so we can
2294 } 2146 * muck around with it freely without cmpxchg
2147 */
2148 object = page->freelist;
2149 page->freelist = NULL;
2150 page->inuse = page->objects;
2295 2151
2296 page = c->page; 2152 stat(s, ALLOC_SLAB);
2297 if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) 2153 c->node = page_to_nid(page);
2154 c->page = page;
2155
2156 if (kmem_cache_debug(s))
2157 goto debug;
2298 goto load_freelist; 2158 goto load_freelist;
2159 }
2160 if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
2161 slab_out_of_memory(s, gfpflags, node);
2162 local_irq_restore(flags);
2163 return NULL;
2299 2164
2300 /* Only entered in the debug case */ 2165debug:
2301 if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr)) 2166 if (!object || !alloc_debug_processing(s, page, object, addr))
2302 goto new_slab; /* Slab failed checks. Next slab needed */ 2167 goto new_slab;
2303 2168
2304 deactivate_slab(s, page, get_freepointer(s, freelist)); 2169 c->freelist = get_freepointer(s, object);
2170 deactivate_slab(s, c);
2305 c->page = NULL; 2171 c->page = NULL;
2306 c->freelist = NULL; 2172 c->node = NUMA_NO_NODE;
2307 local_irq_restore(flags); 2173 local_irq_restore(flags);
2308 return freelist; 2174 return object;
2309} 2175}
2310 2176
2311/* 2177/*
@@ -2318,18 +2184,16 @@ new_slab:
2318 * 2184 *
2319 * Otherwise we can simply pick the next object from the lockless free list. 2185 * Otherwise we can simply pick the next object from the lockless free list.
2320 */ 2186 */
2321static __always_inline void *slab_alloc_node(struct kmem_cache *s, 2187static __always_inline void *slab_alloc(struct kmem_cache *s,
2322 gfp_t gfpflags, int node, unsigned long addr) 2188 gfp_t gfpflags, int node, unsigned long addr)
2323{ 2189{
2324 void **object; 2190 void **object;
2325 struct kmem_cache_cpu *c; 2191 struct kmem_cache_cpu *c;
2326 struct page *page;
2327 unsigned long tid; 2192 unsigned long tid;
2328 2193
2329 if (slab_pre_alloc_hook(s, gfpflags)) 2194 if (slab_pre_alloc_hook(s, gfpflags))
2330 return NULL; 2195 return NULL;
2331 2196
2332 s = memcg_kmem_get_cache(s, gfpflags);
2333redo: 2197redo:
2334 2198
2335 /* 2199 /*
@@ -2350,13 +2214,11 @@ redo:
2350 barrier(); 2214 barrier();
2351 2215
2352 object = c->freelist; 2216 object = c->freelist;
2353 page = c->page; 2217 if (unlikely(!object || !node_match(c, node)))
2354 if (unlikely(!object || !node_match(page, node))) 2218
2355 object = __slab_alloc(s, gfpflags, node, addr, c); 2219 object = __slab_alloc(s, gfpflags, node, addr, c);
2356 2220
2357 else { 2221 else {
2358 void *next_object = get_freepointer_safe(s, object);
2359
2360 /* 2222 /*
2361 * The cmpxchg will only match if there was no additional 2223 * The cmpxchg will only match if there was no additional
2362 * operation and if we are on the right processor. 2224 * operation and if we are on the right processor.
@@ -2369,37 +2231,30 @@ redo:
2369 * Since this is without lock semantics the protection is only against 2231 * Since this is without lock semantics the protection is only against
2370 * code executing on this cpu *not* from access by other cpus. 2232 * code executing on this cpu *not* from access by other cpus.
2371 */ 2233 */
2372 if (unlikely(!this_cpu_cmpxchg_double( 2234 if (unlikely(!irqsafe_cpu_cmpxchg_double(
2373 s->cpu_slab->freelist, s->cpu_slab->tid, 2235 s->cpu_slab->freelist, s->cpu_slab->tid,
2374 object, tid, 2236 object, tid,
2375 next_object, next_tid(tid)))) { 2237 get_freepointer_safe(s, object), next_tid(tid)))) {
2376 2238
2377 note_cmpxchg_failure("slab_alloc", s, tid); 2239 note_cmpxchg_failure("slab_alloc", s, tid);
2378 goto redo; 2240 goto redo;
2379 } 2241 }
2380 prefetch_freepointer(s, next_object);
2381 stat(s, ALLOC_FASTPATH); 2242 stat(s, ALLOC_FASTPATH);
2382 } 2243 }
2383 2244
2384 if (unlikely(gfpflags & __GFP_ZERO) && object) 2245 if (unlikely(gfpflags & __GFP_ZERO) && object)
2385 memset(object, 0, s->object_size); 2246 memset(object, 0, s->objsize);
2386 2247
2387 slab_post_alloc_hook(s, gfpflags, object); 2248 slab_post_alloc_hook(s, gfpflags, object);
2388 2249
2389 return object; 2250 return object;
2390} 2251}
2391 2252
2392static __always_inline void *slab_alloc(struct kmem_cache *s,
2393 gfp_t gfpflags, unsigned long addr)
2394{
2395 return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr);
2396}
2397
2398void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 2253void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
2399{ 2254{
2400 void *ret = slab_alloc(s, gfpflags, _RET_IP_); 2255 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
2401 2256
2402 trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); 2257 trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags);
2403 2258
2404 return ret; 2259 return ret;
2405} 2260}
@@ -2408,7 +2263,7 @@ EXPORT_SYMBOL(kmem_cache_alloc);
2408#ifdef CONFIG_TRACING 2263#ifdef CONFIG_TRACING
2409void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) 2264void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
2410{ 2265{
2411 void *ret = slab_alloc(s, gfpflags, _RET_IP_); 2266 void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_);
2412 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); 2267 trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
2413 return ret; 2268 return ret;
2414} 2269}
@@ -2426,10 +2281,10 @@ EXPORT_SYMBOL(kmalloc_order_trace);
2426#ifdef CONFIG_NUMA 2281#ifdef CONFIG_NUMA
2427void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 2282void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
2428{ 2283{
2429 void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); 2284 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
2430 2285
2431 trace_kmem_cache_alloc_node(_RET_IP_, ret, 2286 trace_kmem_cache_alloc_node(_RET_IP_, ret,
2432 s->object_size, s->size, gfpflags, node); 2287 s->objsize, s->size, gfpflags, node);
2433 2288
2434 return ret; 2289 return ret;
2435} 2290}
@@ -2440,7 +2295,7 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
2440 gfp_t gfpflags, 2295 gfp_t gfpflags,
2441 int node, size_t size) 2296 int node, size_t size)
2442{ 2297{
2443 void *ret = slab_alloc_node(s, gfpflags, node, _RET_IP_); 2298 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
2444 2299
2445 trace_kmalloc_node(_RET_IP_, ret, 2300 trace_kmalloc_node(_RET_IP_, ret,
2446 size, s->size, gfpflags, node); 2301 size, s->size, gfpflags, node);
@@ -2464,6 +2319,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2464 void *prior; 2319 void *prior;
2465 void **object = (void *)x; 2320 void **object = (void *)x;
2466 int was_frozen; 2321 int was_frozen;
2322 int inuse;
2467 struct page new; 2323 struct page new;
2468 unsigned long counters; 2324 unsigned long counters;
2469 struct kmem_cache_node *n = NULL; 2325 struct kmem_cache_node *n = NULL;
@@ -2471,46 +2327,29 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2471 2327
2472 stat(s, FREE_SLOWPATH); 2328 stat(s, FREE_SLOWPATH);
2473 2329
2474 if (kmem_cache_debug(s) && 2330 if (kmem_cache_debug(s) && !free_debug_processing(s, page, x, addr))
2475 !(n = free_debug_processing(s, page, x, addr, &flags)))
2476 return; 2331 return;
2477 2332
2478 do { 2333 do {
2479 if (unlikely(n)) {
2480 spin_unlock_irqrestore(&n->list_lock, flags);
2481 n = NULL;
2482 }
2483 prior = page->freelist; 2334 prior = page->freelist;
2484 counters = page->counters; 2335 counters = page->counters;
2485 set_freepointer(s, object, prior); 2336 set_freepointer(s, object, prior);
2486 new.counters = counters; 2337 new.counters = counters;
2487 was_frozen = new.frozen; 2338 was_frozen = new.frozen;
2488 new.inuse--; 2339 new.inuse--;
2489 if ((!new.inuse || !prior) && !was_frozen) { 2340 if ((!new.inuse || !prior) && !was_frozen && !n) {
2490 2341 n = get_node(s, page_to_nid(page));
2491 if (!kmem_cache_debug(s) && !prior) 2342 /*
2492 2343 * Speculatively acquire the list_lock.
2493 /* 2344 * If the cmpxchg does not succeed then we may
2494 * Slab was on no list before and will be partially empty 2345 * drop the list_lock without any processing.
2495 * We can defer the list move and instead freeze it. 2346 *
2496 */ 2347 * Otherwise the list_lock will synchronize with
2497 new.frozen = 1; 2348 * other processors updating the list of slabs.
2498 2349 */
2499 else { /* Needs to be taken off a list */ 2350 spin_lock_irqsave(&n->list_lock, flags);
2500
2501 n = get_node(s, page_to_nid(page));
2502 /*
2503 * Speculatively acquire the list_lock.
2504 * If the cmpxchg does not succeed then we may
2505 * drop the list_lock without any processing.
2506 *
2507 * Otherwise the list_lock will synchronize with
2508 * other processors updating the list of slabs.
2509 */
2510 spin_lock_irqsave(&n->list_lock, flags);
2511
2512 }
2513 } 2351 }
2352 inuse = new.inuse;
2514 2353
2515 } while (!cmpxchg_double_slab(s, page, 2354 } while (!cmpxchg_double_slab(s, page,
2516 prior, counters, 2355 prior, counters,
@@ -2518,16 +2357,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2518 "__slab_free")); 2357 "__slab_free"));
2519 2358
2520 if (likely(!n)) { 2359 if (likely(!n)) {
2521 2360 /*
2522 /*
2523 * If we just froze the page then put it onto the
2524 * per cpu partial list.
2525 */
2526 if (new.frozen && !was_frozen) {
2527 put_cpu_partial(s, page, 1);
2528 stat(s, CPU_PARTIAL_FREE);
2529 }
2530 /*
2531 * The list lock was not taken therefore no list 2361 * The list lock was not taken therefore no list
2532 * activity can be necessary. 2362 * activity can be necessary.
2533 */ 2363 */
@@ -2536,17 +2366,25 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
2536 return; 2366 return;
2537 } 2367 }
2538 2368
2539 if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
2540 goto slab_empty;
2541
2542 /* 2369 /*
2543 * Objects left in the slab. If it was not on the partial list before 2370 * was_frozen may have been set after we acquired the list_lock in
2544 * then add it. 2371 * an earlier loop. So we need to check it here again.
2545 */ 2372 */
2546 if (kmem_cache_debug(s) && unlikely(!prior)) { 2373 if (was_frozen)
2547 remove_full(s, page); 2374 stat(s, FREE_FROZEN);
2548 add_partial(n, page, DEACTIVATE_TO_TAIL); 2375 else {
2549 stat(s, FREE_ADD_PARTIAL); 2376 if (unlikely(!inuse && n->nr_partial > s->min_partial))
2377 goto slab_empty;
2378
2379 /*
2380 * Objects left in the slab. If it was not on the partial list before
2381 * then add it.
2382 */
2383 if (unlikely(!prior)) {
2384 remove_full(s, page);
2385 add_partial(n, page, 1);
2386 stat(s, FREE_ADD_PARTIAL);
2387 }
2550 } 2388 }
2551 spin_unlock_irqrestore(&n->list_lock, flags); 2389 spin_unlock_irqrestore(&n->list_lock, flags);
2552 return; 2390 return;
@@ -2588,6 +2426,7 @@ static __always_inline void slab_free(struct kmem_cache *s,
2588 slab_free_hook(s, x); 2426 slab_free_hook(s, x);
2589 2427
2590redo: 2428redo:
2429
2591 /* 2430 /*
2592 * Determine the currently cpus per cpu slab. 2431 * Determine the currently cpus per cpu slab.
2593 * The cpu may change afterward. However that does not matter since 2432 * The cpu may change afterward. However that does not matter since
@@ -2602,7 +2441,7 @@ redo:
2602 if (likely(page == c->page)) { 2441 if (likely(page == c->page)) {
2603 set_freepointer(s, object, c->freelist); 2442 set_freepointer(s, object, c->freelist);
2604 2443
2605 if (unlikely(!this_cpu_cmpxchg_double( 2444 if (unlikely(!irqsafe_cpu_cmpxchg_double(
2606 s->cpu_slab->freelist, s->cpu_slab->tid, 2445 s->cpu_slab->freelist, s->cpu_slab->tid,
2607 c->freelist, tid, 2446 c->freelist, tid,
2608 object, next_tid(tid)))) { 2447 object, next_tid(tid)))) {
@@ -2618,10 +2457,12 @@ redo:
2618 2457
2619void kmem_cache_free(struct kmem_cache *s, void *x) 2458void kmem_cache_free(struct kmem_cache *s, void *x)
2620{ 2459{
2621 s = cache_from_obj(s, x); 2460 struct page *page;
2622 if (!s) 2461
2623 return; 2462 page = virt_to_head_page(x);
2624 slab_free(s, virt_to_head_page(x), x, _RET_IP_); 2463
2464 slab_free(s, page, x, _RET_IP_);
2465
2625 trace_kmem_cache_free(_RET_IP_, x); 2466 trace_kmem_cache_free(_RET_IP_, x);
2626} 2467}
2627EXPORT_SYMBOL(kmem_cache_free); 2468EXPORT_SYMBOL(kmem_cache_free);
@@ -2759,8 +2600,34 @@ static inline int calculate_order(int size, int reserved)
2759 return -ENOSYS; 2600 return -ENOSYS;
2760} 2601}
2761 2602
2603/*
2604 * Figure out what the alignment of the objects will be.
2605 */
2606static unsigned long calculate_alignment(unsigned long flags,
2607 unsigned long align, unsigned long size)
2608{
2609 /*
2610 * If the user wants hardware cache aligned objects then follow that
2611 * suggestion if the object is sufficiently large.
2612 *
2613 * The hardware cache alignment cannot override the specified
2614 * alignment though. If that is greater then use it.
2615 */
2616 if (flags & SLAB_HWCACHE_ALIGN) {
2617 unsigned long ralign = cache_line_size();
2618 while (size <= ralign / 2)
2619 ralign /= 2;
2620 align = max(align, ralign);
2621 }
2622
2623 if (align < ARCH_SLAB_MINALIGN)
2624 align = ARCH_SLAB_MINALIGN;
2625
2626 return ALIGN(align, sizeof(void *));
2627}
2628
2762static void 2629static void
2763init_kmem_cache_node(struct kmem_cache_node *n) 2630init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
2764{ 2631{
2765 n->nr_partial = 0; 2632 n->nr_partial = 0;
2766 spin_lock_init(&n->list_lock); 2633 spin_lock_init(&n->list_lock);
@@ -2823,17 +2690,17 @@ static void early_kmem_cache_node_alloc(int node)
2823 n = page->freelist; 2690 n = page->freelist;
2824 BUG_ON(!n); 2691 BUG_ON(!n);
2825 page->freelist = get_freepointer(kmem_cache_node, n); 2692 page->freelist = get_freepointer(kmem_cache_node, n);
2826 page->inuse = 1; 2693 page->inuse++;
2827 page->frozen = 0; 2694 page->frozen = 0;
2828 kmem_cache_node->node[node] = n; 2695 kmem_cache_node->node[node] = n;
2829#ifdef CONFIG_SLUB_DEBUG 2696#ifdef CONFIG_SLUB_DEBUG
2830 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); 2697 init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
2831 init_tracking(kmem_cache_node, n); 2698 init_tracking(kmem_cache_node, n);
2832#endif 2699#endif
2833 init_kmem_cache_node(n); 2700 init_kmem_cache_node(n, kmem_cache_node);
2834 inc_slabs_node(kmem_cache_node, node, page->objects); 2701 inc_slabs_node(kmem_cache_node, node, page->objects);
2835 2702
2836 add_partial(n, page, DEACTIVATE_TO_HEAD); 2703 add_partial(n, page, 0);
2837} 2704}
2838 2705
2839static void free_kmem_cache_nodes(struct kmem_cache *s) 2706static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -2870,7 +2737,7 @@ static int init_kmem_cache_nodes(struct kmem_cache *s)
2870 } 2737 }
2871 2738
2872 s->node[node] = n; 2739 s->node[node] = n;
2873 init_kmem_cache_node(n); 2740 init_kmem_cache_node(n, s);
2874 } 2741 }
2875 return 1; 2742 return 1;
2876} 2743}
@@ -2891,7 +2758,8 @@ static void set_min_partial(struct kmem_cache *s, unsigned long min)
2891static int calculate_sizes(struct kmem_cache *s, int forced_order) 2758static int calculate_sizes(struct kmem_cache *s, int forced_order)
2892{ 2759{
2893 unsigned long flags = s->flags; 2760 unsigned long flags = s->flags;
2894 unsigned long size = s->object_size; 2761 unsigned long size = s->objsize;
2762 unsigned long align = s->align;
2895 int order; 2763 int order;
2896 2764
2897 /* 2765 /*
@@ -2919,7 +2787,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2919 * end of the object and the free pointer. If not then add an 2787 * end of the object and the free pointer. If not then add an
2920 * additional word to have some bytes to store Redzone information. 2788 * additional word to have some bytes to store Redzone information.
2921 */ 2789 */
2922 if ((flags & SLAB_RED_ZONE) && size == s->object_size) 2790 if ((flags & SLAB_RED_ZONE) && size == s->objsize)
2923 size += sizeof(void *); 2791 size += sizeof(void *);
2924#endif 2792#endif
2925 2793
@@ -2963,11 +2831,19 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2963#endif 2831#endif
2964 2832
2965 /* 2833 /*
2834 * Determine the alignment based on various parameters that the
2835 * user specified and the dynamic determination of cache line size
2836 * on bootup.
2837 */
2838 align = calculate_alignment(flags, align, s->objsize);
2839 s->align = align;
2840
2841 /*
2966 * SLUB stores one object immediately after another beginning from 2842 * SLUB stores one object immediately after another beginning from
2967 * offset 0. In order to align the objects we have to simply size 2843 * offset 0. In order to align the objects we have to simply size
2968 * each object to conform to the alignment. 2844 * each object to conform to the alignment.
2969 */ 2845 */
2970 size = ALIGN(size, s->align); 2846 size = ALIGN(size, align);
2971 s->size = size; 2847 s->size = size;
2972 if (forced_order >= 0) 2848 if (forced_order >= 0)
2973 order = forced_order; 2849 order = forced_order;
@@ -2996,11 +2872,20 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2996 s->max = s->oo; 2872 s->max = s->oo;
2997 2873
2998 return !!oo_objects(s->oo); 2874 return !!oo_objects(s->oo);
2875
2999} 2876}
3000 2877
3001static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) 2878static int kmem_cache_open(struct kmem_cache *s,
2879 const char *name, size_t size,
2880 size_t align, unsigned long flags,
2881 void (*ctor)(void *))
3002{ 2882{
3003 s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); 2883 memset(s, 0, kmem_size);
2884 s->name = name;
2885 s->ctor = ctor;
2886 s->objsize = size;
2887 s->align = align;
2888 s->flags = kmem_cache_flags(size, flags, name, ctor);
3004 s->reserved = 0; 2889 s->reserved = 0;
3005 2890
3006 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) 2891 if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU))
@@ -3013,7 +2898,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3013 * Disable debugging flags that store metadata if the min slab 2898 * Disable debugging flags that store metadata if the min slab
3014 * order increased. 2899 * order increased.
3015 */ 2900 */
3016 if (get_order(s->size) > get_order(s->object_size)) { 2901 if (get_order(s->size) > get_order(s->objsize)) {
3017 s->flags &= ~DEBUG_METADATA_FLAGS; 2902 s->flags &= ~DEBUG_METADATA_FLAGS;
3018 s->offset = 0; 2903 s->offset = 0;
3019 if (!calculate_sizes(s, -1)) 2904 if (!calculate_sizes(s, -1))
@@ -3021,8 +2906,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3021 } 2906 }
3022 } 2907 }
3023 2908
3024#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ 2909#ifdef CONFIG_CMPXCHG_DOUBLE
3025 defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
3026 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0) 2910 if (system_has_cmpxchg_double() && (s->flags & SLAB_DEBUG_FLAGS) == 0)
3027 /* Enable fast mode */ 2911 /* Enable fast mode */
3028 s->flags |= __CMPXCHG_DOUBLE; 2912 s->flags |= __CMPXCHG_DOUBLE;
@@ -3032,36 +2916,8 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3032 * The larger the object size is, the more pages we want on the partial 2916 * The larger the object size is, the more pages we want on the partial
3033 * list to avoid pounding the page allocator excessively. 2917 * list to avoid pounding the page allocator excessively.
3034 */ 2918 */
3035 set_min_partial(s, ilog2(s->size) / 2); 2919 set_min_partial(s, ilog2(s->size));
3036 2920 s->refcount = 1;
3037 /*
3038 * cpu_partial determined the maximum number of objects kept in the
3039 * per cpu partial lists of a processor.
3040 *
3041 * Per cpu partial lists mainly contain slabs that just have one
3042 * object freed. If they are used for allocation then they can be
3043 * filled up again with minimal effort. The slab will never hit the
3044 * per node partial lists and therefore no locking will be required.
3045 *
3046 * This setting also determines
3047 *
3048 * A) The number of objects from per cpu partial slabs dumped to the
3049 * per node list when we reach the limit.
3050 * B) The number of objects in cpu partial slabs to extract from the
3051 * per node list when we run out of per cpu objects. We only fetch 50%
3052 * to keep some capacity around for frees.
3053 */
3054 if (kmem_cache_debug(s))
3055 s->cpu_partial = 0;
3056 else if (s->size >= PAGE_SIZE)
3057 s->cpu_partial = 2;
3058 else if (s->size >= 1024)
3059 s->cpu_partial = 6;
3060 else if (s->size >= 256)
3061 s->cpu_partial = 13;
3062 else
3063 s->cpu_partial = 30;
3064
3065#ifdef CONFIG_NUMA 2921#ifdef CONFIG_NUMA
3066 s->remote_node_defrag_ratio = 1000; 2922 s->remote_node_defrag_ratio = 1000;
3067#endif 2923#endif
@@ -3069,17 +2925,26 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags)
3069 goto error; 2925 goto error;
3070 2926
3071 if (alloc_kmem_cache_cpus(s)) 2927 if (alloc_kmem_cache_cpus(s))
3072 return 0; 2928 return 1;
3073 2929
3074 free_kmem_cache_nodes(s); 2930 free_kmem_cache_nodes(s);
3075error: 2931error:
3076 if (flags & SLAB_PANIC) 2932 if (flags & SLAB_PANIC)
3077 panic("Cannot create slab %s size=%lu realsize=%u " 2933 panic("Cannot create slab %s size=%lu realsize=%u "
3078 "order=%u offset=%u flags=%lx\n", 2934 "order=%u offset=%u flags=%lx\n",
3079 s->name, (unsigned long)s->size, s->size, oo_order(s->oo), 2935 s->name, (unsigned long)size, s->size, oo_order(s->oo),
3080 s->offset, flags); 2936 s->offset, flags);
3081 return -EINVAL; 2937 return 0;
2938}
2939
2940/*
2941 * Determine the size of a slab object
2942 */
2943unsigned int kmem_cache_size(struct kmem_cache *s)
2944{
2945 return s->objsize;
3082} 2946}
2947EXPORT_SYMBOL(kmem_cache_size);
3083 2948
3084static void list_slab_objects(struct kmem_cache *s, struct page *page, 2949static void list_slab_objects(struct kmem_cache *s, struct page *page,
3085 const char *text) 2950 const char *text)
@@ -3091,7 +2956,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
3091 sizeof(long), GFP_ATOMIC); 2956 sizeof(long), GFP_ATOMIC);
3092 if (!map) 2957 if (!map)
3093 return; 2958 return;
3094 slab_err(s, page, text, s->name); 2959 slab_err(s, page, "%s", text);
3095 slab_lock(page); 2960 slab_lock(page);
3096 2961
3097 get_map(s, page, map); 2962 get_map(s, page, map);
@@ -3110,22 +2975,23 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
3110 2975
3111/* 2976/*
3112 * Attempt to free all partial slabs on a node. 2977 * Attempt to free all partial slabs on a node.
3113 * This is called from kmem_cache_close(). We must be the last thread
3114 * using the cache and therefore we do not need to lock anymore.
3115 */ 2978 */
3116static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) 2979static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
3117{ 2980{
2981 unsigned long flags;
3118 struct page *page, *h; 2982 struct page *page, *h;
3119 2983
2984 spin_lock_irqsave(&n->list_lock, flags);
3120 list_for_each_entry_safe(page, h, &n->partial, lru) { 2985 list_for_each_entry_safe(page, h, &n->partial, lru) {
3121 if (!page->inuse) { 2986 if (!page->inuse) {
3122 remove_partial(n, page); 2987 remove_partial(n, page);
3123 discard_slab(s, page); 2988 discard_slab(s, page);
3124 } else { 2989 } else {
3125 list_slab_objects(s, page, 2990 list_slab_objects(s, page,
3126 "Objects remaining in %s on kmem_cache_close()"); 2991 "Objects remaining on kmem_cache_close()");
3127 } 2992 }
3128 } 2993 }
2994 spin_unlock_irqrestore(&n->list_lock, flags);
3129} 2995}
3130 2996
3131/* 2997/*
@@ -3136,6 +3002,7 @@ static inline int kmem_cache_close(struct kmem_cache *s)
3136 int node; 3002 int node;
3137 3003
3138 flush_all(s); 3004 flush_all(s);
3005 free_percpu(s->cpu_slab);
3139 /* Attempt to free all objects */ 3006 /* Attempt to free all objects */
3140 for_each_node_state(node, N_NORMAL_MEMORY) { 3007 for_each_node_state(node, N_NORMAL_MEMORY) {
3141 struct kmem_cache_node *n = get_node(s, node); 3008 struct kmem_cache_node *n = get_node(s, node);
@@ -3144,31 +3011,32 @@ static inline int kmem_cache_close(struct kmem_cache *s)
3144 if (n->nr_partial || slabs_node(s, node)) 3011 if (n->nr_partial || slabs_node(s, node))
3145 return 1; 3012 return 1;
3146 } 3013 }
3147 free_percpu(s->cpu_slab);
3148 free_kmem_cache_nodes(s); 3014 free_kmem_cache_nodes(s);
3149 return 0; 3015 return 0;
3150} 3016}
3151 3017
3152int __kmem_cache_shutdown(struct kmem_cache *s) 3018/*
3153{ 3019 * Close a cache and release the kmem_cache structure
3154 int rc = kmem_cache_close(s); 3020 * (must be used for caches created using kmem_cache_create)
3155 3021 */
3156 if (!rc) { 3022void kmem_cache_destroy(struct kmem_cache *s)
3157 /* 3023{
3158 * We do the same lock strategy around sysfs_slab_add, see 3024 down_write(&slub_lock);
3159 * __kmem_cache_create. Because this is pretty much the last 3025 s->refcount--;
3160 * operation we do and the lock will be released shortly after 3026 if (!s->refcount) {
3161 * that in slab_common.c, we could just move sysfs_slab_remove 3027 list_del(&s->list);
3162 * to a later point in common code. We should do that when we 3028 if (kmem_cache_close(s)) {
3163 * have a common sysfs framework for all allocators. 3029 printk(KERN_ERR "SLUB %s: %s called for cache that "
3164 */ 3030 "still has objects.\n", s->name, __func__);
3165 mutex_unlock(&slab_mutex); 3031 dump_stack();
3032 }
3033 if (s->flags & SLAB_DESTROY_BY_RCU)
3034 rcu_barrier();
3166 sysfs_slab_remove(s); 3035 sysfs_slab_remove(s);
3167 mutex_lock(&slab_mutex);
3168 } 3036 }
3169 3037 up_write(&slub_lock);
3170 return rc;
3171} 3038}
3039EXPORT_SYMBOL(kmem_cache_destroy);
3172 3040
3173/******************************************************************** 3041/********************************************************************
3174 * Kmalloc subsystem 3042 * Kmalloc subsystem
@@ -3177,6 +3045,8 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
3177struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; 3045struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
3178EXPORT_SYMBOL(kmalloc_caches); 3046EXPORT_SYMBOL(kmalloc_caches);
3179 3047
3048static struct kmem_cache *kmem_cache;
3049
3180#ifdef CONFIG_ZONE_DMA 3050#ifdef CONFIG_ZONE_DMA
3181static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; 3051static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
3182#endif 3052#endif
@@ -3217,6 +3087,29 @@ static int __init setup_slub_nomerge(char *str)
3217 3087
3218__setup("slub_nomerge", setup_slub_nomerge); 3088__setup("slub_nomerge", setup_slub_nomerge);
3219 3089
3090static struct kmem_cache *__init create_kmalloc_cache(const char *name,
3091 int size, unsigned int flags)
3092{
3093 struct kmem_cache *s;
3094
3095 s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3096
3097 /*
3098 * This function is called with IRQs disabled during early-boot on
3099 * single CPU so there's no need to take slub_lock here.
3100 */
3101 if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
3102 flags, NULL))
3103 goto panic;
3104
3105 list_add(&s->list, &slab_caches);
3106 return s;
3107
3108panic:
3109 panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
3110 return NULL;
3111}
3112
3220/* 3113/*
3221 * Conversion table for small slabs sizes / 8 to the index in the 3114 * Conversion table for small slabs sizes / 8 to the index in the
3222 * kmalloc array. This is necessary for slabs < 192 since we have non power 3115 * kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -3288,7 +3181,7 @@ void *__kmalloc(size_t size, gfp_t flags)
3288 if (unlikely(ZERO_OR_NULL_PTR(s))) 3181 if (unlikely(ZERO_OR_NULL_PTR(s)))
3289 return s; 3182 return s;
3290 3183
3291 ret = slab_alloc(s, flags, _RET_IP_); 3184 ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_);
3292 3185
3293 trace_kmalloc(_RET_IP_, ret, size, s->size, flags); 3186 trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
3294 3187
@@ -3302,7 +3195,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
3302 struct page *page; 3195 struct page *page;
3303 void *ptr = NULL; 3196 void *ptr = NULL;
3304 3197
3305 flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; 3198 flags |= __GFP_COMP | __GFP_NOTRACK;
3306 page = alloc_pages_node(node, flags, get_order(size)); 3199 page = alloc_pages_node(node, flags, get_order(size));
3307 if (page) 3200 if (page)
3308 ptr = page_address(page); 3201 ptr = page_address(page);
@@ -3331,7 +3224,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
3331 if (unlikely(ZERO_OR_NULL_PTR(s))) 3224 if (unlikely(ZERO_OR_NULL_PTR(s)))
3332 return s; 3225 return s;
3333 3226
3334 ret = slab_alloc_node(s, flags, node, _RET_IP_); 3227 ret = slab_alloc(s, flags, node, _RET_IP_);
3335 3228
3336 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); 3229 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
3337 3230
@@ -3354,7 +3247,7 @@ size_t ksize(const void *object)
3354 return PAGE_SIZE << compound_order(page); 3247 return PAGE_SIZE << compound_order(page);
3355 } 3248 }
3356 3249
3357 return slab_ksize(page->slab_cache); 3250 return slab_ksize(page->slab);
3358} 3251}
3359EXPORT_SYMBOL(ksize); 3252EXPORT_SYMBOL(ksize);
3360 3253
@@ -3379,8 +3272,8 @@ bool verify_mem_not_deleted(const void *x)
3379 } 3272 }
3380 3273
3381 slab_lock(page); 3274 slab_lock(page);
3382 if (on_freelist(page->slab_cache, page, object)) { 3275 if (on_freelist(page->slab, page, object)) {
3383 object_err(page->slab_cache, page, object, "Object is on free-list"); 3276 object_err(page->slab, page, object, "Object is on free-list");
3384 rv = false; 3277 rv = false;
3385 } else { 3278 } else {
3386 rv = true; 3279 rv = true;
@@ -3408,10 +3301,10 @@ void kfree(const void *x)
3408 if (unlikely(!PageSlab(page))) { 3301 if (unlikely(!PageSlab(page))) {
3409 BUG_ON(!PageCompound(page)); 3302 BUG_ON(!PageCompound(page));
3410 kmemleak_free(x); 3303 kmemleak_free(x);
3411 __free_memcg_kmem_pages(page, compound_order(page)); 3304 put_page(page);
3412 return; 3305 return;
3413 } 3306 }
3414 slab_free(page->slab_cache, page, object, _RET_IP_); 3307 slab_free(page->slab, page, object, _RET_IP_);
3415} 3308}
3416EXPORT_SYMBOL(kfree); 3309EXPORT_SYMBOL(kfree);
3417 3310
@@ -3459,23 +3352,23 @@ int kmem_cache_shrink(struct kmem_cache *s)
3459 * list_lock. page->inuse here is the upper limit. 3352 * list_lock. page->inuse here is the upper limit.
3460 */ 3353 */
3461 list_for_each_entry_safe(page, t, &n->partial, lru) { 3354 list_for_each_entry_safe(page, t, &n->partial, lru) {
3462 list_move(&page->lru, slabs_by_inuse + page->inuse); 3355 if (!page->inuse) {
3463 if (!page->inuse) 3356 remove_partial(n, page);
3464 n->nr_partial--; 3357 discard_slab(s, page);
3358 } else {
3359 list_move(&page->lru,
3360 slabs_by_inuse + page->inuse);
3361 }
3465 } 3362 }
3466 3363
3467 /* 3364 /*
3468 * Rebuild the partial list with the slabs filled up most 3365 * Rebuild the partial list with the slabs filled up most
3469 * first and the least used slabs at the end. 3366 * first and the least used slabs at the end.
3470 */ 3367 */
3471 for (i = objects - 1; i > 0; i--) 3368 for (i = objects - 1; i >= 0; i--)
3472 list_splice(slabs_by_inuse + i, n->partial.prev); 3369 list_splice(slabs_by_inuse + i, n->partial.prev);
3473 3370
3474 spin_unlock_irqrestore(&n->list_lock, flags); 3371 spin_unlock_irqrestore(&n->list_lock, flags);
3475
3476 /* Release empty slabs */
3477 list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
3478 discard_slab(s, page);
3479 } 3372 }
3480 3373
3481 kfree(slabs_by_inuse); 3374 kfree(slabs_by_inuse);
@@ -3488,10 +3381,10 @@ static int slab_mem_going_offline_callback(void *arg)
3488{ 3381{
3489 struct kmem_cache *s; 3382 struct kmem_cache *s;
3490 3383
3491 mutex_lock(&slab_mutex); 3384 down_read(&slub_lock);
3492 list_for_each_entry(s, &slab_caches, list) 3385 list_for_each_entry(s, &slab_caches, list)
3493 kmem_cache_shrink(s); 3386 kmem_cache_shrink(s);
3494 mutex_unlock(&slab_mutex); 3387 up_read(&slub_lock);
3495 3388
3496 return 0; 3389 return 0;
3497} 3390}
@@ -3503,7 +3396,7 @@ static void slab_mem_offline_callback(void *arg)
3503 struct memory_notify *marg = arg; 3396 struct memory_notify *marg = arg;
3504 int offline_node; 3397 int offline_node;
3505 3398
3506 offline_node = marg->status_change_nid_normal; 3399 offline_node = marg->status_change_nid;
3507 3400
3508 /* 3401 /*
3509 * If the node still has available memory. we need kmem_cache_node 3402 * If the node still has available memory. we need kmem_cache_node
@@ -3512,7 +3405,7 @@ static void slab_mem_offline_callback(void *arg)
3512 if (offline_node < 0) 3405 if (offline_node < 0)
3513 return; 3406 return;
3514 3407
3515 mutex_lock(&slab_mutex); 3408 down_read(&slub_lock);
3516 list_for_each_entry(s, &slab_caches, list) { 3409 list_for_each_entry(s, &slab_caches, list) {
3517 n = get_node(s, offline_node); 3410 n = get_node(s, offline_node);
3518 if (n) { 3411 if (n) {
@@ -3528,7 +3421,7 @@ static void slab_mem_offline_callback(void *arg)
3528 kmem_cache_free(kmem_cache_node, n); 3421 kmem_cache_free(kmem_cache_node, n);
3529 } 3422 }
3530 } 3423 }
3531 mutex_unlock(&slab_mutex); 3424 up_read(&slub_lock);
3532} 3425}
3533 3426
3534static int slab_mem_going_online_callback(void *arg) 3427static int slab_mem_going_online_callback(void *arg)
@@ -3536,7 +3429,7 @@ static int slab_mem_going_online_callback(void *arg)
3536 struct kmem_cache_node *n; 3429 struct kmem_cache_node *n;
3537 struct kmem_cache *s; 3430 struct kmem_cache *s;
3538 struct memory_notify *marg = arg; 3431 struct memory_notify *marg = arg;
3539 int nid = marg->status_change_nid_normal; 3432 int nid = marg->status_change_nid;
3540 int ret = 0; 3433 int ret = 0;
3541 3434
3542 /* 3435 /*
@@ -3551,7 +3444,7 @@ static int slab_mem_going_online_callback(void *arg)
3551 * allocate a kmem_cache_node structure in order to bring the node 3444 * allocate a kmem_cache_node structure in order to bring the node
3552 * online. 3445 * online.
3553 */ 3446 */
3554 mutex_lock(&slab_mutex); 3447 down_read(&slub_lock);
3555 list_for_each_entry(s, &slab_caches, list) { 3448 list_for_each_entry(s, &slab_caches, list) {
3556 /* 3449 /*
3557 * XXX: kmem_cache_alloc_node will fallback to other nodes 3450 * XXX: kmem_cache_alloc_node will fallback to other nodes
@@ -3563,11 +3456,11 @@ static int slab_mem_going_online_callback(void *arg)
3563 ret = -ENOMEM; 3456 ret = -ENOMEM;
3564 goto out; 3457 goto out;
3565 } 3458 }
3566 init_kmem_cache_node(n); 3459 init_kmem_cache_node(n, s);
3567 s->node[nid] = n; 3460 s->node[nid] = n;
3568 } 3461 }
3569out: 3462out:
3570 mutex_unlock(&slab_mutex); 3463 up_read(&slub_lock);
3571 return ret; 3464 return ret;
3572} 3465}
3573 3466
@@ -3606,16 +3499,15 @@ static int slab_memory_callback(struct notifier_block *self,
3606 3499
3607/* 3500/*
3608 * Used for early kmem_cache structures that were allocated using 3501 * Used for early kmem_cache structures that were allocated using
3609 * the page allocator. Allocate them properly then fix up the pointers 3502 * the page allocator
3610 * that may be pointing to the wrong kmem_cache structure.
3611 */ 3503 */
3612 3504
3613static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) 3505static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
3614{ 3506{
3615 int node; 3507 int node;
3616 struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT);
3617 3508
3618 memcpy(s, static_cache, kmem_cache->object_size); 3509 list_add(&s->list, &slab_caches);
3510 s->refcount = -1;
3619 3511
3620 for_each_node_state(node, N_NORMAL_MEMORY) { 3512 for_each_node_state(node, N_NORMAL_MEMORY) {
3621 struct kmem_cache_node *n = get_node(s, node); 3513 struct kmem_cache_node *n = get_node(s, node);
@@ -3623,52 +3515,72 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
3623 3515
3624 if (n) { 3516 if (n) {
3625 list_for_each_entry(p, &n->partial, lru) 3517 list_for_each_entry(p, &n->partial, lru)
3626 p->slab_cache = s; 3518 p->slab = s;
3627 3519
3628#ifdef CONFIG_SLUB_DEBUG 3520#ifdef CONFIG_SLUB_DEBUG
3629 list_for_each_entry(p, &n->full, lru) 3521 list_for_each_entry(p, &n->full, lru)
3630 p->slab_cache = s; 3522 p->slab = s;
3631#endif 3523#endif
3632 } 3524 }
3633 } 3525 }
3634 list_add(&s->list, &slab_caches);
3635 return s;
3636} 3526}
3637 3527
3638void __init kmem_cache_init(void) 3528void __init kmem_cache_init(void)
3639{ 3529{
3640 static __initdata struct kmem_cache boot_kmem_cache,
3641 boot_kmem_cache_node;
3642 int i; 3530 int i;
3643 int caches = 2; 3531 int caches = 0;
3532 struct kmem_cache *temp_kmem_cache;
3533 int order;
3534 struct kmem_cache *temp_kmem_cache_node;
3535 unsigned long kmalloc_size;
3644 3536
3645 if (debug_guardpage_minorder()) 3537 kmem_size = offsetof(struct kmem_cache, node) +
3646 slub_max_order = 0; 3538 nr_node_ids * sizeof(struct kmem_cache_node *);
3647 3539
3648 kmem_cache_node = &boot_kmem_cache_node; 3540 /* Allocate two kmem_caches from the page allocator */
3649 kmem_cache = &boot_kmem_cache; 3541 kmalloc_size = ALIGN(kmem_size, cache_line_size());
3542 order = get_order(2 * kmalloc_size);
3543 kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order);
3650 3544
3651 create_boot_cache(kmem_cache_node, "kmem_cache_node", 3545 /*
3652 sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN); 3546 * Must first have the slab cache available for the allocations of the
3547 * struct kmem_cache_node's. There is special bootstrap code in
3548 * kmem_cache_open for slab_state == DOWN.
3549 */
3550 kmem_cache_node = (void *)kmem_cache + kmalloc_size;
3551
3552 kmem_cache_open(kmem_cache_node, "kmem_cache_node",
3553 sizeof(struct kmem_cache_node),
3554 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3653 3555
3654 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); 3556 hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
3655 3557
3656 /* Able to allocate the per node structures */ 3558 /* Able to allocate the per node structures */
3657 slab_state = PARTIAL; 3559 slab_state = PARTIAL;
3658 3560
3659 create_boot_cache(kmem_cache, "kmem_cache", 3561 temp_kmem_cache = kmem_cache;
3660 offsetof(struct kmem_cache, node) + 3562 kmem_cache_open(kmem_cache, "kmem_cache", kmem_size,
3661 nr_node_ids * sizeof(struct kmem_cache_node *), 3563 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
3662 SLAB_HWCACHE_ALIGN); 3564 kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3663 3565 memcpy(kmem_cache, temp_kmem_cache, kmem_size);
3664 kmem_cache = bootstrap(&boot_kmem_cache);
3665 3566
3666 /* 3567 /*
3667 * Allocate kmem_cache_node properly from the kmem_cache slab. 3568 * Allocate kmem_cache_node properly from the kmem_cache slab.
3668 * kmem_cache_node is separately allocated so no need to 3569 * kmem_cache_node is separately allocated so no need to
3669 * update any list pointers. 3570 * update any list pointers.
3670 */ 3571 */
3671 kmem_cache_node = bootstrap(&boot_kmem_cache_node); 3572 temp_kmem_cache_node = kmem_cache_node;
3573
3574 kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
3575 memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
3576
3577 kmem_cache_bootstrap_fixup(kmem_cache_node);
3578
3579 caches++;
3580 kmem_cache_bootstrap_fixup(kmem_cache);
3581 caches++;
3582 /* Free temporary boot structure */
3583 free_pages((unsigned long)temp_kmem_cache, order);
3672 3584
3673 /* Now we can use the kmem_cache to allocate kmalloc slabs */ 3585 /* Now we can use the kmem_cache to allocate kmalloc slabs */
3674 3586
@@ -3756,11 +3668,11 @@ void __init kmem_cache_init(void)
3756 3668
3757 if (s && s->size) { 3669 if (s && s->size) {
3758 char *name = kasprintf(GFP_NOWAIT, 3670 char *name = kasprintf(GFP_NOWAIT,
3759 "dma-kmalloc-%d", s->object_size); 3671 "dma-kmalloc-%d", s->objsize);
3760 3672
3761 BUG_ON(!name); 3673 BUG_ON(!name);
3762 kmalloc_dma_caches[i] = create_kmalloc_cache(name, 3674 kmalloc_dma_caches[i] = create_kmalloc_cache(name,
3763 s->object_size, SLAB_CACHE_DMA); 3675 s->objsize, SLAB_CACHE_DMA);
3764 } 3676 }
3765 } 3677 }
3766#endif 3678#endif
@@ -3796,7 +3708,7 @@ static int slab_unmergeable(struct kmem_cache *s)
3796 return 0; 3708 return 0;
3797} 3709}
3798 3710
3799static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, 3711static struct kmem_cache *find_mergeable(size_t size,
3800 size_t align, unsigned long flags, const char *name, 3712 size_t align, unsigned long flags, const char *name,
3801 void (*ctor)(void *)) 3713 void (*ctor)(void *))
3802{ 3714{
@@ -3832,61 +3744,70 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
3832 if (s->size - size >= sizeof(void *)) 3744 if (s->size - size >= sizeof(void *))
3833 continue; 3745 continue;
3834 3746
3835 if (!cache_match_memcg(s, memcg))
3836 continue;
3837
3838 return s; 3747 return s;
3839 } 3748 }
3840 return NULL; 3749 return NULL;
3841} 3750}
3842 3751
3843struct kmem_cache * 3752struct kmem_cache *kmem_cache_create(const char *name, size_t size,
3844__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, 3753 size_t align, unsigned long flags, void (*ctor)(void *))
3845 size_t align, unsigned long flags, void (*ctor)(void *))
3846{ 3754{
3847 struct kmem_cache *s; 3755 struct kmem_cache *s;
3756 char *n;
3757
3758 if (WARN_ON(!name))
3759 return NULL;
3848 3760
3849 s = find_mergeable(memcg, size, align, flags, name, ctor); 3761 down_write(&slub_lock);
3762 s = find_mergeable(size, align, flags, name, ctor);
3850 if (s) { 3763 if (s) {
3851 s->refcount++; 3764 s->refcount++;
3852 /* 3765 /*
3853 * Adjust the object sizes so that we clear 3766 * Adjust the object sizes so that we clear
3854 * the complete object on kzalloc. 3767 * the complete object on kzalloc.
3855 */ 3768 */
3856 s->object_size = max(s->object_size, (int)size); 3769 s->objsize = max(s->objsize, (int)size);
3857 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3770 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3858 3771
3859 if (sysfs_slab_alias(s, name)) { 3772 if (sysfs_slab_alias(s, name)) {
3860 s->refcount--; 3773 s->refcount--;
3861 s = NULL; 3774 goto err;
3862 } 3775 }
3776 up_write(&slub_lock);
3777 return s;
3863 } 3778 }
3864 3779
3865 return s; 3780 n = kstrdup(name, GFP_KERNEL);
3866} 3781 if (!n)
3782 goto err;
3867 3783
3868int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) 3784 s = kmalloc(kmem_size, GFP_KERNEL);
3869{ 3785 if (s) {
3870 int err; 3786 if (kmem_cache_open(s, n,
3871 3787 size, align, flags, ctor)) {
3872 err = kmem_cache_open(s, flags); 3788 list_add(&s->list, &slab_caches);
3873 if (err) 3789 if (sysfs_slab_add(s)) {
3874 return err; 3790 list_del(&s->list);
3875 3791 kfree(n);
3876 /* Mutex is not taken during early boot */ 3792 kfree(s);
3877 if (slab_state <= UP) 3793 goto err;
3878 return 0; 3794 }
3879 3795 up_write(&slub_lock);
3880 memcg_propagate_slab_attrs(s); 3796 return s;
3881 mutex_unlock(&slab_mutex); 3797 }
3882 err = sysfs_slab_add(s); 3798 kfree(n);
3883 mutex_lock(&slab_mutex); 3799 kfree(s);
3884 3800 }
3885 if (err) 3801err:
3886 kmem_cache_close(s); 3802 up_write(&slub_lock);
3887 3803
3888 return err; 3804 if (flags & SLAB_PANIC)
3805 panic("Cannot create slabcache %s\n", name);
3806 else
3807 s = NULL;
3808 return s;
3889} 3809}
3810EXPORT_SYMBOL(kmem_cache_create);
3890 3811
3891#ifdef CONFIG_SMP 3812#ifdef CONFIG_SMP
3892/* 3813/*
@@ -3905,13 +3826,13 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
3905 case CPU_UP_CANCELED_FROZEN: 3826 case CPU_UP_CANCELED_FROZEN:
3906 case CPU_DEAD: 3827 case CPU_DEAD:
3907 case CPU_DEAD_FROZEN: 3828 case CPU_DEAD_FROZEN:
3908 mutex_lock(&slab_mutex); 3829 down_read(&slub_lock);
3909 list_for_each_entry(s, &slab_caches, list) { 3830 list_for_each_entry(s, &slab_caches, list) {
3910 local_irq_save(flags); 3831 local_irq_save(flags);
3911 __flush_cpu_slab(s, cpu); 3832 __flush_cpu_slab(s, cpu);
3912 local_irq_restore(flags); 3833 local_irq_restore(flags);
3913 } 3834 }
3914 mutex_unlock(&slab_mutex); 3835 up_read(&slub_lock);
3915 break; 3836 break;
3916 default: 3837 default:
3917 break; 3838 break;
@@ -3938,7 +3859,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3938 if (unlikely(ZERO_OR_NULL_PTR(s))) 3859 if (unlikely(ZERO_OR_NULL_PTR(s)))
3939 return s; 3860 return s;
3940 3861
3941 ret = slab_alloc(s, gfpflags, caller); 3862 ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller);
3942 3863
3943 /* Honor the call site pointer we received. */ 3864 /* Honor the call site pointer we received. */
3944 trace_kmalloc(caller, ret, size, s->size, gfpflags); 3865 trace_kmalloc(caller, ret, size, s->size, gfpflags);
@@ -3968,7 +3889,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3968 if (unlikely(ZERO_OR_NULL_PTR(s))) 3889 if (unlikely(ZERO_OR_NULL_PTR(s)))
3969 return s; 3890 return s;
3970 3891
3971 ret = slab_alloc_node(s, gfpflags, node, caller); 3892 ret = slab_alloc(s, gfpflags, node, caller);
3972 3893
3973 /* Honor the call site pointer we received. */ 3894 /* Honor the call site pointer we received. */
3974 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node); 3895 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
@@ -4403,32 +4324,22 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
4403 4324
4404 for_each_possible_cpu(cpu) { 4325 for_each_possible_cpu(cpu) {
4405 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); 4326 struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
4406 int node;
4407 struct page *page;
4408 4327
4409 page = ACCESS_ONCE(c->page); 4328 if (!c || c->node < 0)
4410 if (!page)
4411 continue; 4329 continue;
4412 4330
4413 node = page_to_nid(page); 4331 if (c->page) {
4414 if (flags & SO_TOTAL) 4332 if (flags & SO_TOTAL)
4415 x = page->objects; 4333 x = c->page->objects;
4416 else if (flags & SO_OBJECTS) 4334 else if (flags & SO_OBJECTS)
4417 x = page->inuse; 4335 x = c->page->inuse;
4418 else 4336 else
4419 x = 1; 4337 x = 1;
4420
4421 total += x;
4422 nodes[node] += x;
4423 4338
4424 page = ACCESS_ONCE(c->partial);
4425 if (page) {
4426 x = page->pobjects;
4427 total += x; 4339 total += x;
4428 nodes[node] += x; 4340 nodes[c->node] += x;
4429 } 4341 }
4430 4342 per_cpu[c->node]++;
4431 per_cpu[node]++;
4432 } 4343 }
4433 } 4344 }
4434 4345
@@ -4506,12 +4417,11 @@ struct slab_attribute {
4506}; 4417};
4507 4418
4508#define SLAB_ATTR_RO(_name) \ 4419#define SLAB_ATTR_RO(_name) \
4509 static struct slab_attribute _name##_attr = \ 4420 static struct slab_attribute _name##_attr = __ATTR_RO(_name)
4510 __ATTR(_name, 0400, _name##_show, NULL)
4511 4421
4512#define SLAB_ATTR(_name) \ 4422#define SLAB_ATTR(_name) \
4513 static struct slab_attribute _name##_attr = \ 4423 static struct slab_attribute _name##_attr = \
4514 __ATTR(_name, 0600, _name##_show, _name##_store) 4424 __ATTR(_name, 0644, _name##_show, _name##_store)
4515 4425
4516static ssize_t slab_size_show(struct kmem_cache *s, char *buf) 4426static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
4517{ 4427{
@@ -4527,7 +4437,7 @@ SLAB_ATTR_RO(align);
4527 4437
4528static ssize_t object_size_show(struct kmem_cache *s, char *buf) 4438static ssize_t object_size_show(struct kmem_cache *s, char *buf)
4529{ 4439{
4530 return sprintf(buf, "%d\n", s->object_size); 4440 return sprintf(buf, "%d\n", s->objsize);
4531} 4441}
4532SLAB_ATTR_RO(object_size); 4442SLAB_ATTR_RO(object_size);
4533 4443
@@ -4580,29 +4490,6 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
4580} 4490}
4581SLAB_ATTR(min_partial); 4491SLAB_ATTR(min_partial);
4582 4492
4583static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
4584{
4585 return sprintf(buf, "%u\n", s->cpu_partial);
4586}
4587
4588static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
4589 size_t length)
4590{
4591 unsigned long objects;
4592 int err;
4593
4594 err = strict_strtoul(buf, 10, &objects);
4595 if (err)
4596 return err;
4597 if (objects && kmem_cache_debug(s))
4598 return -EINVAL;
4599
4600 s->cpu_partial = objects;
4601 flush_all(s);
4602 return length;
4603}
4604SLAB_ATTR(cpu_partial);
4605
4606static ssize_t ctor_show(struct kmem_cache *s, char *buf) 4493static ssize_t ctor_show(struct kmem_cache *s, char *buf)
4607{ 4494{
4608 if (!s->ctor) 4495 if (!s->ctor)
@@ -4641,37 +4528,6 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
4641} 4528}
4642SLAB_ATTR_RO(objects_partial); 4529SLAB_ATTR_RO(objects_partial);
4643 4530
4644static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
4645{
4646 int objects = 0;
4647 int pages = 0;
4648 int cpu;
4649 int len;
4650
4651 for_each_online_cpu(cpu) {
4652 struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
4653
4654 if (page) {
4655 pages += page->pages;
4656 objects += page->pobjects;
4657 }
4658 }
4659
4660 len = sprintf(buf, "%d(%d)", objects, pages);
4661
4662#ifdef CONFIG_SMP
4663 for_each_online_cpu(cpu) {
4664 struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial;
4665
4666 if (page && len < PAGE_SIZE - 20)
4667 len += sprintf(buf + len, " C%d=%d(%d)", cpu,
4668 page->pobjects, page->pages);
4669 }
4670#endif
4671 return len + sprintf(buf + len, "\n");
4672}
4673SLAB_ATTR_RO(slabs_cpu_partial);
4674
4675static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) 4531static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
4676{ 4532{
4677 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); 4533 return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
@@ -4994,10 +4850,6 @@ STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
4994STAT_ATTR(ORDER_FALLBACK, order_fallback); 4850STAT_ATTR(ORDER_FALLBACK, order_fallback);
4995STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); 4851STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
4996STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); 4852STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
4997STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
4998STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
4999STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
5000STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
5001#endif 4853#endif
5002 4854
5003static struct attribute *slab_attrs[] = { 4855static struct attribute *slab_attrs[] = {
@@ -5006,7 +4858,6 @@ static struct attribute *slab_attrs[] = {
5006 &objs_per_slab_attr.attr, 4858 &objs_per_slab_attr.attr,
5007 &order_attr.attr, 4859 &order_attr.attr,
5008 &min_partial_attr.attr, 4860 &min_partial_attr.attr,
5009 &cpu_partial_attr.attr,
5010 &objects_attr.attr, 4861 &objects_attr.attr,
5011 &objects_partial_attr.attr, 4862 &objects_partial_attr.attr,
5012 &partial_attr.attr, 4863 &partial_attr.attr,
@@ -5019,7 +4870,6 @@ static struct attribute *slab_attrs[] = {
5019 &destroy_by_rcu_attr.attr, 4870 &destroy_by_rcu_attr.attr,
5020 &shrink_attr.attr, 4871 &shrink_attr.attr,
5021 &reserved_attr.attr, 4872 &reserved_attr.attr,
5022 &slabs_cpu_partial_attr.attr,
5023#ifdef CONFIG_SLUB_DEBUG 4873#ifdef CONFIG_SLUB_DEBUG
5024 &total_objects_attr.attr, 4874 &total_objects_attr.attr,
5025 &slabs_attr.attr, 4875 &slabs_attr.attr,
@@ -5061,10 +4911,6 @@ static struct attribute *slab_attrs[] = {
5061 &order_fallback_attr.attr, 4911 &order_fallback_attr.attr,
5062 &cmpxchg_double_fail_attr.attr, 4912 &cmpxchg_double_fail_attr.attr,
5063 &cmpxchg_double_cpu_fail_attr.attr, 4913 &cmpxchg_double_cpu_fail_attr.attr,
5064 &cpu_partial_alloc_attr.attr,
5065 &cpu_partial_free_attr.attr,
5066 &cpu_partial_node_attr.attr,
5067 &cpu_partial_drain_attr.attr,
5068#endif 4914#endif
5069#ifdef CONFIG_FAILSLAB 4915#ifdef CONFIG_FAILSLAB
5070 &failslab_attr.attr, 4916 &failslab_attr.attr,
@@ -5111,93 +4957,16 @@ static ssize_t slab_attr_store(struct kobject *kobj,
5111 return -EIO; 4957 return -EIO;
5112 4958
5113 err = attribute->store(s, buf, len); 4959 err = attribute->store(s, buf, len);
5114#ifdef CONFIG_MEMCG_KMEM
5115 if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
5116 int i;
5117
5118 mutex_lock(&slab_mutex);
5119 if (s->max_attr_size < len)
5120 s->max_attr_size = len;
5121 4960
5122 /*
5123 * This is a best effort propagation, so this function's return
5124 * value will be determined by the parent cache only. This is
5125 * basically because not all attributes will have a well
5126 * defined semantics for rollbacks - most of the actions will
5127 * have permanent effects.
5128 *
5129 * Returning the error value of any of the children that fail
5130 * is not 100 % defined, in the sense that users seeing the
5131 * error code won't be able to know anything about the state of
5132 * the cache.
5133 *
5134 * Only returning the error code for the parent cache at least
5135 * has well defined semantics. The cache being written to
5136 * directly either failed or succeeded, in which case we loop
5137 * through the descendants with best-effort propagation.
5138 */
5139 for_each_memcg_cache_index(i) {
5140 struct kmem_cache *c = cache_from_memcg(s, i);
5141 if (c)
5142 attribute->store(c, buf, len);
5143 }
5144 mutex_unlock(&slab_mutex);
5145 }
5146#endif
5147 return err; 4961 return err;
5148} 4962}
5149 4963
5150static void memcg_propagate_slab_attrs(struct kmem_cache *s) 4964static void kmem_cache_release(struct kobject *kobj)
5151{ 4965{
5152#ifdef CONFIG_MEMCG_KMEM 4966 struct kmem_cache *s = to_slab(kobj);
5153 int i;
5154 char *buffer = NULL;
5155
5156 if (!is_root_cache(s))
5157 return;
5158
5159 /*
5160 * This mean this cache had no attribute written. Therefore, no point
5161 * in copying default values around
5162 */
5163 if (!s->max_attr_size)
5164 return;
5165 4967
5166 for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { 4968 kfree(s->name);
5167 char mbuf[64]; 4969 kfree(s);
5168 char *buf;
5169 struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
5170
5171 if (!attr || !attr->store || !attr->show)
5172 continue;
5173
5174 /*
5175 * It is really bad that we have to allocate here, so we will
5176 * do it only as a fallback. If we actually allocate, though,
5177 * we can just use the allocated buffer until the end.
5178 *
5179 * Most of the slub attributes will tend to be very small in
5180 * size, but sysfs allows buffers up to a page, so they can
5181 * theoretically happen.
5182 */
5183 if (buffer)
5184 buf = buffer;
5185 else if (s->max_attr_size < ARRAY_SIZE(mbuf))
5186 buf = mbuf;
5187 else {
5188 buffer = (char *) get_zeroed_page(GFP_KERNEL);
5189 if (WARN_ON(!buffer))
5190 continue;
5191 buf = buffer;
5192 }
5193
5194 attr->show(s->memcg_params->root_cache, buf);
5195 attr->store(s, buf, strlen(buf));
5196 }
5197
5198 if (buffer)
5199 free_page((unsigned long)buffer);
5200#endif
5201} 4970}
5202 4971
5203static const struct sysfs_ops slab_sysfs_ops = { 4972static const struct sysfs_ops slab_sysfs_ops = {
@@ -5207,6 +4976,7 @@ static const struct sysfs_ops slab_sysfs_ops = {
5207 4976
5208static struct kobj_type slab_ktype = { 4977static struct kobj_type slab_ktype = {
5209 .sysfs_ops = &slab_sysfs_ops, 4978 .sysfs_ops = &slab_sysfs_ops,
4979 .release = kmem_cache_release
5210}; 4980};
5211 4981
5212static int uevent_filter(struct kset *kset, struct kobject *kobj) 4982static int uevent_filter(struct kset *kset, struct kobject *kobj)
@@ -5256,12 +5026,6 @@ static char *create_unique_id(struct kmem_cache *s)
5256 if (p != name + 1) 5026 if (p != name + 1)
5257 *p++ = '-'; 5027 *p++ = '-';
5258 p += sprintf(p, "%07d", s->size); 5028 p += sprintf(p, "%07d", s->size);
5259
5260#ifdef CONFIG_MEMCG_KMEM
5261 if (!is_root_cache(s))
5262 p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg));
5263#endif
5264
5265 BUG_ON(p > name + ID_STR_LENGTH - 1); 5029 BUG_ON(p > name + ID_STR_LENGTH - 1);
5266 return name; 5030 return name;
5267} 5031}
@@ -5270,8 +5034,13 @@ static int sysfs_slab_add(struct kmem_cache *s)
5270{ 5034{
5271 int err; 5035 int err;
5272 const char *name; 5036 const char *name;
5273 int unmergeable = slab_unmergeable(s); 5037 int unmergeable;
5038
5039 if (slab_state < SYSFS)
5040 /* Defer until later */
5041 return 0;
5274 5042
5043 unmergeable = slab_unmergeable(s);
5275 if (unmergeable) { 5044 if (unmergeable) {
5276 /* 5045 /*
5277 * Slabcache can never be merged so we can use the name proper. 5046 * Slabcache can never be merged so we can use the name proper.
@@ -5312,7 +5081,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
5312 5081
5313static void sysfs_slab_remove(struct kmem_cache *s) 5082static void sysfs_slab_remove(struct kmem_cache *s)
5314{ 5083{
5315 if (slab_state < FULL) 5084 if (slab_state < SYSFS)
5316 /* 5085 /*
5317 * Sysfs has not been setup yet so no need to remove the 5086 * Sysfs has not been setup yet so no need to remove the
5318 * cache from sysfs. 5087 * cache from sysfs.
@@ -5340,7 +5109,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name)
5340{ 5109{
5341 struct saved_alias *al; 5110 struct saved_alias *al;
5342 5111
5343 if (slab_state == FULL) { 5112 if (slab_state == SYSFS) {
5344 /* 5113 /*
5345 * If we have a leftover link then remove it. 5114 * If we have a leftover link then remove it.
5346 */ 5115 */
@@ -5364,16 +5133,16 @@ static int __init slab_sysfs_init(void)
5364 struct kmem_cache *s; 5133 struct kmem_cache *s;
5365 int err; 5134 int err;
5366 5135
5367 mutex_lock(&slab_mutex); 5136 down_write(&slub_lock);
5368 5137
5369 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); 5138 slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
5370 if (!slab_kset) { 5139 if (!slab_kset) {
5371 mutex_unlock(&slab_mutex); 5140 up_write(&slub_lock);
5372 printk(KERN_ERR "Cannot register slab subsystem.\n"); 5141 printk(KERN_ERR "Cannot register slab subsystem.\n");
5373 return -ENOSYS; 5142 return -ENOSYS;
5374 } 5143 }
5375 5144
5376 slab_state = FULL; 5145 slab_state = SYSFS;
5377 5146
5378 list_for_each_entry(s, &slab_caches, list) { 5147 list_for_each_entry(s, &slab_caches, list) {
5379 err = sysfs_slab_add(s); 5148 err = sysfs_slab_add(s);
@@ -5389,11 +5158,11 @@ static int __init slab_sysfs_init(void)
5389 err = sysfs_slab_alias(al->s, al->name); 5158 err = sysfs_slab_alias(al->s, al->name);
5390 if (err) 5159 if (err)
5391 printk(KERN_ERR "SLUB: Unable to add boot slab alias" 5160 printk(KERN_ERR "SLUB: Unable to add boot slab alias"
5392 " %s to sysfs\n", al->name); 5161 " %s to sysfs\n", s->name);
5393 kfree(al); 5162 kfree(al);
5394 } 5163 }
5395 5164
5396 mutex_unlock(&slab_mutex); 5165 up_write(&slub_lock);
5397 resiliency_test(); 5166 resiliency_test();
5398 return 0; 5167 return 0;
5399} 5168}
@@ -5405,14 +5174,49 @@ __initcall(slab_sysfs_init);
5405 * The /proc/slabinfo ABI 5174 * The /proc/slabinfo ABI
5406 */ 5175 */
5407#ifdef CONFIG_SLABINFO 5176#ifdef CONFIG_SLABINFO
5408void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) 5177static void print_slabinfo_header(struct seq_file *m)
5178{
5179 seq_puts(m, "slabinfo - version: 2.1\n");
5180 seq_puts(m, "# name <active_objs> <num_objs> <objsize> "
5181 "<objperslab> <pagesperslab>");
5182 seq_puts(m, " : tunables <limit> <batchcount> <sharedfactor>");
5183 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
5184 seq_putc(m, '\n');
5185}
5186
5187static void *s_start(struct seq_file *m, loff_t *pos)
5188{
5189 loff_t n = *pos;
5190
5191 down_read(&slub_lock);
5192 if (!n)
5193 print_slabinfo_header(m);
5194
5195 return seq_list_start(&slab_caches, *pos);
5196}
5197
5198static void *s_next(struct seq_file *m, void *p, loff_t *pos)
5199{
5200 return seq_list_next(p, &slab_caches, pos);
5201}
5202
5203static void s_stop(struct seq_file *m, void *p)
5204{
5205 up_read(&slub_lock);
5206}
5207
5208static int s_show(struct seq_file *m, void *p)
5409{ 5209{
5410 unsigned long nr_partials = 0; 5210 unsigned long nr_partials = 0;
5411 unsigned long nr_slabs = 0; 5211 unsigned long nr_slabs = 0;
5212 unsigned long nr_inuse = 0;
5412 unsigned long nr_objs = 0; 5213 unsigned long nr_objs = 0;
5413 unsigned long nr_free = 0; 5214 unsigned long nr_free = 0;
5215 struct kmem_cache *s;
5414 int node; 5216 int node;
5415 5217
5218 s = list_entry(p, struct kmem_cache, list);
5219
5416 for_each_online_node(node) { 5220 for_each_online_node(node) {
5417 struct kmem_cache_node *n = get_node(s, node); 5221 struct kmem_cache_node *n = get_node(s, node);
5418 5222
@@ -5425,21 +5229,41 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
5425 nr_free += count_partial(n, count_free); 5229 nr_free += count_partial(n, count_free);
5426 } 5230 }
5427 5231
5428 sinfo->active_objs = nr_objs - nr_free; 5232 nr_inuse = nr_objs - nr_free;
5429 sinfo->num_objs = nr_objs; 5233
5430 sinfo->active_slabs = nr_slabs; 5234 seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
5431 sinfo->num_slabs = nr_slabs; 5235 nr_objs, s->size, oo_objects(s->oo),
5432 sinfo->objects_per_slab = oo_objects(s->oo); 5236 (1 << oo_order(s->oo)));
5433 sinfo->cache_order = oo_order(s->oo); 5237 seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
5238 seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs,
5239 0UL);
5240 seq_putc(m, '\n');
5241 return 0;
5434} 5242}
5435 5243
5436void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s) 5244static const struct seq_operations slabinfo_op = {
5245 .start = s_start,
5246 .next = s_next,
5247 .stop = s_stop,
5248 .show = s_show,
5249};
5250
5251static int slabinfo_open(struct inode *inode, struct file *file)
5437{ 5252{
5253 return seq_open(file, &slabinfo_op);
5438} 5254}
5439 5255
5440ssize_t slabinfo_write(struct file *file, const char __user *buffer, 5256static const struct file_operations proc_slabinfo_operations = {
5441 size_t count, loff_t *ppos) 5257 .open = slabinfo_open,
5258 .read = seq_read,
5259 .llseek = seq_lseek,
5260 .release = seq_release,
5261};
5262
5263static int __init slab_proc_init(void)
5442{ 5264{
5443 return -EIO; 5265 proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations);
5266 return 0;
5444} 5267}
5268module_init(slab_proc_init);
5445#endif /* CONFIG_SLABINFO */ 5269#endif /* CONFIG_SLABINFO */