aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--mm/slab.c1132
1 files changed, 812 insertions, 320 deletions
diff --git a/mm/slab.c b/mm/slab.c
index d7c4443991fe..a041c5378dfa 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -75,6 +75,15 @@
75 * 75 *
76 * At present, each engine can be growing a cache. This should be blocked. 76 * At present, each engine can be growing a cache. This should be blocked.
77 * 77 *
78 * 15 March 2005. NUMA slab allocator.
79 * Shai Fultheim <shai@scalex86.org>.
80 * Shobhit Dayal <shobhit@calsoftinc.com>
81 * Alok N Kataria <alokk@calsoftinc.com>
82 * Christoph Lameter <christoph@lameter.com>
83 *
84 * Modified the slab allocator to be node aware on NUMA systems.
85 * Each node has its own list of partial, free and full slabs.
86 * All object allocations for a node occur from node specific slab lists.
78 */ 87 */
79 88
80#include <linux/config.h> 89#include <linux/config.h>
@@ -93,6 +102,7 @@
93#include <linux/module.h> 102#include <linux/module.h>
94#include <linux/rcupdate.h> 103#include <linux/rcupdate.h>
95#include <linux/string.h> 104#include <linux/string.h>
105#include <linux/nodemask.h>
96 106
97#include <asm/uaccess.h> 107#include <asm/uaccess.h>
98#include <asm/cacheflush.h> 108#include <asm/cacheflush.h>
@@ -212,6 +222,7 @@ struct slab {
212 void *s_mem; /* including colour offset */ 222 void *s_mem; /* including colour offset */
213 unsigned int inuse; /* num of objs active in slab */ 223 unsigned int inuse; /* num of objs active in slab */
214 kmem_bufctl_t free; 224 kmem_bufctl_t free;
225 unsigned short nodeid;
215}; 226};
216 227
217/* 228/*
@@ -239,7 +250,6 @@ struct slab_rcu {
239/* 250/*
240 * struct array_cache 251 * struct array_cache
241 * 252 *
242 * Per cpu structures
243 * Purpose: 253 * Purpose:
244 * - LIFO ordering, to hand out cache-warm objects from _alloc 254 * - LIFO ordering, to hand out cache-warm objects from _alloc
245 * - reduce the number of linked list operations 255 * - reduce the number of linked list operations
@@ -254,6 +264,13 @@ struct array_cache {
254 unsigned int limit; 264 unsigned int limit;
255 unsigned int batchcount; 265 unsigned int batchcount;
256 unsigned int touched; 266 unsigned int touched;
267 spinlock_t lock;
268 void *entry[0]; /*
269 * Must have this definition in here for the proper
270 * alignment of array_cache. Also simplifies accessing
271 * the entries.
272 * [0] is for gcc 2.95. It should really be [].
273 */
257}; 274};
258 275
259/* bootstrap: The caches do not work without cpuarrays anymore, 276/* bootstrap: The caches do not work without cpuarrays anymore,
@@ -266,34 +283,83 @@ struct arraycache_init {
266}; 283};
267 284
268/* 285/*
269 * The slab lists of all objects. 286 * The slab lists for all objects.
270 * Hopefully reduce the internal fragmentation
271 * NUMA: The spinlock could be moved from the kmem_cache_t
272 * into this structure, too. Figure out what causes
273 * fewer cross-node spinlock operations.
274 */ 287 */
275struct kmem_list3 { 288struct kmem_list3 {
276 struct list_head slabs_partial; /* partial list first, better asm code */ 289 struct list_head slabs_partial; /* partial list first, better asm code */
277 struct list_head slabs_full; 290 struct list_head slabs_full;
278 struct list_head slabs_free; 291 struct list_head slabs_free;
279 unsigned long free_objects; 292 unsigned long free_objects;
280 int free_touched;
281 unsigned long next_reap; 293 unsigned long next_reap;
282 struct array_cache *shared; 294 int free_touched;
295 unsigned int free_limit;
296 spinlock_t list_lock;
297 struct array_cache *shared; /* shared per node */
298 struct array_cache **alien; /* on other nodes */
283}; 299};
284 300
285#define LIST3_INIT(parent) \ 301/*
286 { \ 302 * Need this for bootstrapping a per node allocator.
287 .slabs_full = LIST_HEAD_INIT(parent.slabs_full), \ 303 */
288 .slabs_partial = LIST_HEAD_INIT(parent.slabs_partial), \ 304#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
289 .slabs_free = LIST_HEAD_INIT(parent.slabs_free) \ 305struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
306#define CACHE_CACHE 0
307#define SIZE_AC 1
308#define SIZE_L3 (1 + MAX_NUMNODES)
309
310/*
311 * This function may be completely optimized away if
312 * a constant is passed to it. Mostly the same as
313 * what is in linux/slab.h except it returns an
314 * index.
315 */
316static inline int index_of(const size_t size)
317{
318 if (__builtin_constant_p(size)) {
319 int i = 0;
320
321#define CACHE(x) \
322 if (size <=x) \
323 return i; \
324 else \
325 i++;
326#include "linux/kmalloc_sizes.h"
327#undef CACHE
328 {
329 extern void __bad_size(void);
330 __bad_size();
331 }
290 } 332 }
291#define list3_data(cachep) \ 333 return 0;
292 (&(cachep)->lists) 334}
335
336#define INDEX_AC index_of(sizeof(struct arraycache_init))
337#define INDEX_L3 index_of(sizeof(struct kmem_list3))
338
339static inline void kmem_list3_init(struct kmem_list3 *parent)
340{
341 INIT_LIST_HEAD(&parent->slabs_full);
342 INIT_LIST_HEAD(&parent->slabs_partial);
343 INIT_LIST_HEAD(&parent->slabs_free);
344 parent->shared = NULL;
345 parent->alien = NULL;
346 spin_lock_init(&parent->list_lock);
347 parent->free_objects = 0;
348 parent->free_touched = 0;
349}
293 350
294/* NUMA: per-node */ 351#define MAKE_LIST(cachep, listp, slab, nodeid) \
295#define list3_data_ptr(cachep, ptr) \ 352 do { \
296 list3_data(cachep) 353 INIT_LIST_HEAD(listp); \
354 list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
355 } while (0)
356
357#define MAKE_ALL_LISTS(cachep, ptr, nodeid) \
358 do { \
359 MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid); \
360 MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
361 MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \
362 } while (0)
297 363
298/* 364/*
299 * kmem_cache_t 365 * kmem_cache_t
@@ -306,13 +372,12 @@ struct kmem_cache_s {
306 struct array_cache *array[NR_CPUS]; 372 struct array_cache *array[NR_CPUS];
307 unsigned int batchcount; 373 unsigned int batchcount;
308 unsigned int limit; 374 unsigned int limit;
309/* 2) touched by every alloc & free from the backend */ 375 unsigned int shared;
310 struct kmem_list3 lists;
311 /* NUMA: kmem_3list_t *nodelists[MAX_NUMNODES] */
312 unsigned int objsize; 376 unsigned int objsize;
377/* 2) touched by every alloc & free from the backend */
378 struct kmem_list3 *nodelists[MAX_NUMNODES];
313 unsigned int flags; /* constant flags */ 379 unsigned int flags; /* constant flags */
314 unsigned int num; /* # of objs per slab */ 380 unsigned int num; /* # of objs per slab */
315 unsigned int free_limit; /* upper limit of objects in the lists */
316 spinlock_t spinlock; 381 spinlock_t spinlock;
317 382
318/* 3) cache_grow/shrink */ 383/* 3) cache_grow/shrink */
@@ -349,6 +414,7 @@ struct kmem_cache_s {
349 unsigned long errors; 414 unsigned long errors;
350 unsigned long max_freeable; 415 unsigned long max_freeable;
351 unsigned long node_allocs; 416 unsigned long node_allocs;
417 unsigned long node_frees;
352 atomic_t allochit; 418 atomic_t allochit;
353 atomic_t allocmiss; 419 atomic_t allocmiss;
354 atomic_t freehit; 420 atomic_t freehit;
@@ -384,6 +450,7 @@ struct kmem_cache_s {
384 } while (0) 450 } while (0)
385#define STATS_INC_ERR(x) ((x)->errors++) 451#define STATS_INC_ERR(x) ((x)->errors++)
386#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++) 452#define STATS_INC_NODEALLOCS(x) ((x)->node_allocs++)
453#define STATS_INC_NODEFREES(x) ((x)->node_frees++)
387#define STATS_SET_FREEABLE(x, i) \ 454#define STATS_SET_FREEABLE(x, i) \
388 do { if ((x)->max_freeable < i) \ 455 do { if ((x)->max_freeable < i) \
389 (x)->max_freeable = i; \ 456 (x)->max_freeable = i; \
@@ -402,6 +469,7 @@ struct kmem_cache_s {
402#define STATS_SET_HIGH(x) do { } while (0) 469#define STATS_SET_HIGH(x) do { } while (0)
403#define STATS_INC_ERR(x) do { } while (0) 470#define STATS_INC_ERR(x) do { } while (0)
404#define STATS_INC_NODEALLOCS(x) do { } while (0) 471#define STATS_INC_NODEALLOCS(x) do { } while (0)
472#define STATS_INC_NODEFREES(x) do { } while (0)
405#define STATS_SET_FREEABLE(x, i) \ 473#define STATS_SET_FREEABLE(x, i) \
406 do { } while (0) 474 do { } while (0)
407 475
@@ -534,9 +602,9 @@ static struct arraycache_init initarray_generic =
534 602
535/* internal cache of cache description objs */ 603/* internal cache of cache description objs */
536static kmem_cache_t cache_cache = { 604static kmem_cache_t cache_cache = {
537 .lists = LIST3_INIT(cache_cache.lists),
538 .batchcount = 1, 605 .batchcount = 1,
539 .limit = BOOT_CPUCACHE_ENTRIES, 606 .limit = BOOT_CPUCACHE_ENTRIES,
607 .shared = 1,
540 .objsize = sizeof(kmem_cache_t), 608 .objsize = sizeof(kmem_cache_t),
541 .flags = SLAB_NO_REAP, 609 .flags = SLAB_NO_REAP,
542 .spinlock = SPIN_LOCK_UNLOCKED, 610 .spinlock = SPIN_LOCK_UNLOCKED,
@@ -557,7 +625,6 @@ static struct list_head cache_chain;
557 * SLAB_RECLAIM_ACCOUNT turns this on per-slab 625 * SLAB_RECLAIM_ACCOUNT turns this on per-slab
558 */ 626 */
559atomic_t slab_reclaim_pages; 627atomic_t slab_reclaim_pages;
560EXPORT_SYMBOL(slab_reclaim_pages);
561 628
562/* 629/*
563 * chicken and egg problem: delay the per-cpu array allocation 630 * chicken and egg problem: delay the per-cpu array allocation
@@ -565,7 +632,8 @@ EXPORT_SYMBOL(slab_reclaim_pages);
565 */ 632 */
566static enum { 633static enum {
567 NONE, 634 NONE,
568 PARTIAL, 635 PARTIAL_AC,
636 PARTIAL_L3,
569 FULL 637 FULL
570} g_cpucache_up; 638} g_cpucache_up;
571 639
@@ -574,11 +642,7 @@ static DEFINE_PER_CPU(struct work_struct, reap_work);
574static void free_block(kmem_cache_t* cachep, void** objpp, int len); 642static void free_block(kmem_cache_t* cachep, void** objpp, int len);
575static void enable_cpucache (kmem_cache_t *cachep); 643static void enable_cpucache (kmem_cache_t *cachep);
576static void cache_reap (void *unused); 644static void cache_reap (void *unused);
577 645static int __node_shrink(kmem_cache_t *cachep, int node);
578static inline void **ac_entry(struct array_cache *ac)
579{
580 return (void**)(ac+1);
581}
582 646
583static inline struct array_cache *ac_data(kmem_cache_t *cachep) 647static inline struct array_cache *ac_data(kmem_cache_t *cachep)
584{ 648{
@@ -676,48 +740,160 @@ static void __devinit start_cpu_timer(int cpu)
676 } 740 }
677} 741}
678 742
679static struct array_cache *alloc_arraycache(int cpu, int entries, 743static struct array_cache *alloc_arraycache(int node, int entries,
680 int batchcount) 744 int batchcount)
681{ 745{
682 int memsize = sizeof(void*)*entries+sizeof(struct array_cache); 746 int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
683 struct array_cache *nc = NULL; 747 struct array_cache *nc = NULL;
684 748
685 if (cpu == -1) 749 nc = kmalloc_node(memsize, GFP_KERNEL, node);
686 nc = kmalloc(memsize, GFP_KERNEL);
687 else
688 nc = kmalloc_node(memsize, GFP_KERNEL, cpu_to_node(cpu));
689
690 if (nc) { 750 if (nc) {
691 nc->avail = 0; 751 nc->avail = 0;
692 nc->limit = entries; 752 nc->limit = entries;
693 nc->batchcount = batchcount; 753 nc->batchcount = batchcount;
694 nc->touched = 0; 754 nc->touched = 0;
755 spin_lock_init(&nc->lock);
695 } 756 }
696 return nc; 757 return nc;
697} 758}
698 759
760#ifdef CONFIG_NUMA
761static inline struct array_cache **alloc_alien_cache(int node, int limit)
762{
763 struct array_cache **ac_ptr;
764 int memsize = sizeof(void*)*MAX_NUMNODES;
765 int i;
766
767 if (limit > 1)
768 limit = 12;
769 ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
770 if (ac_ptr) {
771 for_each_node(i) {
772 if (i == node || !node_online(i)) {
773 ac_ptr[i] = NULL;
774 continue;
775 }
776 ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
777 if (!ac_ptr[i]) {
778 for (i--; i <=0; i--)
779 kfree(ac_ptr[i]);
780 kfree(ac_ptr);
781 return NULL;
782 }
783 }
784 }
785 return ac_ptr;
786}
787
788static inline void free_alien_cache(struct array_cache **ac_ptr)
789{
790 int i;
791
792 if (!ac_ptr)
793 return;
794
795 for_each_node(i)
796 kfree(ac_ptr[i]);
797
798 kfree(ac_ptr);
799}
800
801static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node)
802{
803 struct kmem_list3 *rl3 = cachep->nodelists[node];
804
805 if (ac->avail) {
806 spin_lock(&rl3->list_lock);
807 free_block(cachep, ac->entry, ac->avail);
808 ac->avail = 0;
809 spin_unlock(&rl3->list_lock);
810 }
811}
812
813static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
814{
815 int i=0;
816 struct array_cache *ac;
817 unsigned long flags;
818
819 for_each_online_node(i) {
820 ac = l3->alien[i];
821 if (ac) {
822 spin_lock_irqsave(&ac->lock, flags);
823 __drain_alien_cache(cachep, ac, i);
824 spin_unlock_irqrestore(&ac->lock, flags);
825 }
826 }
827}
828#else
829#define alloc_alien_cache(node, limit) do { } while (0)
830#define free_alien_cache(ac_ptr) do { } while (0)
831#define drain_alien_cache(cachep, l3) do { } while (0)
832#endif
833
699static int __devinit cpuup_callback(struct notifier_block *nfb, 834static int __devinit cpuup_callback(struct notifier_block *nfb,
700 unsigned long action, void *hcpu) 835 unsigned long action, void *hcpu)
701{ 836{
702 long cpu = (long)hcpu; 837 long cpu = (long)hcpu;
703 kmem_cache_t* cachep; 838 kmem_cache_t* cachep;
839 struct kmem_list3 *l3 = NULL;
840 int node = cpu_to_node(cpu);
841 int memsize = sizeof(struct kmem_list3);
842 struct array_cache *nc = NULL;
704 843
705 switch (action) { 844 switch (action) {
706 case CPU_UP_PREPARE: 845 case CPU_UP_PREPARE:
707 down(&cache_chain_sem); 846 down(&cache_chain_sem);
847 /* we need to do this right in the beginning since
848 * alloc_arraycache's are going to use this list.
849 * kmalloc_node allows us to add the slab to the right
850 * kmem_list3 and not this cpu's kmem_list3
851 */
852
708 list_for_each_entry(cachep, &cache_chain, next) { 853 list_for_each_entry(cachep, &cache_chain, next) {
709 struct array_cache *nc; 854 /* setup the size64 kmemlist for cpu before we can
855 * begin anything. Make sure some other cpu on this
856 * node has not already allocated this
857 */
858 if (!cachep->nodelists[node]) {
859 if (!(l3 = kmalloc_node(memsize,
860 GFP_KERNEL, node)))
861 goto bad;
862 kmem_list3_init(l3);
863 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
864 ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
865
866 cachep->nodelists[node] = l3;
867 }
868
869 spin_lock_irq(&cachep->nodelists[node]->list_lock);
870 cachep->nodelists[node]->free_limit =
871 (1 + nr_cpus_node(node)) *
872 cachep->batchcount + cachep->num;
873 spin_unlock_irq(&cachep->nodelists[node]->list_lock);
874 }
710 875
711 nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount); 876 /* Now we can go ahead with allocating the shared array's
877 & array cache's */
878 list_for_each_entry(cachep, &cache_chain, next) {
879 nc = alloc_arraycache(node, cachep->limit,
880 cachep->batchcount);
712 if (!nc) 881 if (!nc)
713 goto bad; 882 goto bad;
714
715 spin_lock_irq(&cachep->spinlock);
716 cachep->array[cpu] = nc; 883 cachep->array[cpu] = nc;
717 cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
718 + cachep->num;
719 spin_unlock_irq(&cachep->spinlock);
720 884
885 l3 = cachep->nodelists[node];
886 BUG_ON(!l3);
887 if (!l3->shared) {
888 if (!(nc = alloc_arraycache(node,
889 cachep->shared*cachep->batchcount,
890 0xbaadf00d)))
891 goto bad;
892
893 /* we are serialised from CPU_DEAD or
894 CPU_UP_CANCELLED by the cpucontrol lock */
895 l3->shared = nc;
896 }
721 } 897 }
722 up(&cache_chain_sem); 898 up(&cache_chain_sem);
723 break; 899 break;
@@ -732,13 +908,51 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
732 908
733 list_for_each_entry(cachep, &cache_chain, next) { 909 list_for_each_entry(cachep, &cache_chain, next) {
734 struct array_cache *nc; 910 struct array_cache *nc;
911 cpumask_t mask;
735 912
913 mask = node_to_cpumask(node);
736 spin_lock_irq(&cachep->spinlock); 914 spin_lock_irq(&cachep->spinlock);
737 /* cpu is dead; no one can alloc from it. */ 915 /* cpu is dead; no one can alloc from it. */
738 nc = cachep->array[cpu]; 916 nc = cachep->array[cpu];
739 cachep->array[cpu] = NULL; 917 cachep->array[cpu] = NULL;
740 cachep->free_limit -= cachep->batchcount; 918 l3 = cachep->nodelists[node];
741 free_block(cachep, ac_entry(nc), nc->avail); 919
920 if (!l3)
921 goto unlock_cache;
922
923 spin_lock(&l3->list_lock);
924
925 /* Free limit for this kmem_list3 */
926 l3->free_limit -= cachep->batchcount;
927 if (nc)
928 free_block(cachep, nc->entry, nc->avail);
929
930 if (!cpus_empty(mask)) {
931 spin_unlock(&l3->list_lock);
932 goto unlock_cache;
933 }
934
935 if (l3->shared) {
936 free_block(cachep, l3->shared->entry,
937 l3->shared->avail);
938 kfree(l3->shared);
939 l3->shared = NULL;
940 }
941 if (l3->alien) {
942 drain_alien_cache(cachep, l3);
943 free_alien_cache(l3->alien);
944 l3->alien = NULL;
945 }
946
947 /* free slabs belonging to this node */
948 if (__node_shrink(cachep, node)) {
949 cachep->nodelists[node] = NULL;
950 spin_unlock(&l3->list_lock);
951 kfree(l3);
952 } else {
953 spin_unlock(&l3->list_lock);
954 }
955unlock_cache:
742 spin_unlock_irq(&cachep->spinlock); 956 spin_unlock_irq(&cachep->spinlock);
743 kfree(nc); 957 kfree(nc);
744 } 958 }
@@ -754,6 +968,25 @@ bad:
754 968
755static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; 969static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
756 970
971/*
972 * swap the static kmem_list3 with kmalloced memory
973 */
974static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list,
975 int nodeid)
976{
977 struct kmem_list3 *ptr;
978
979 BUG_ON(cachep->nodelists[nodeid] != list);
980 ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
981 BUG_ON(!ptr);
982
983 local_irq_disable();
984 memcpy(ptr, list, sizeof(struct kmem_list3));
985 MAKE_ALL_LISTS(cachep, ptr, nodeid);
986 cachep->nodelists[nodeid] = ptr;
987 local_irq_enable();
988}
989
757/* Initialisation. 990/* Initialisation.
758 * Called after the gfp() functions have been enabled, and before smp_init(). 991 * Called after the gfp() functions have been enabled, and before smp_init().
759 */ 992 */
@@ -762,6 +995,13 @@ void __init kmem_cache_init(void)
762 size_t left_over; 995 size_t left_over;
763 struct cache_sizes *sizes; 996 struct cache_sizes *sizes;
764 struct cache_names *names; 997 struct cache_names *names;
998 int i;
999
1000 for (i = 0; i < NUM_INIT_LISTS; i++) {
1001 kmem_list3_init(&initkmem_list3[i]);
1002 if (i < MAX_NUMNODES)
1003 cache_cache.nodelists[i] = NULL;
1004 }
765 1005
766 /* 1006 /*
767 * Fragmentation resistance on low memory - only use bigger 1007 * Fragmentation resistance on low memory - only use bigger
@@ -770,21 +1010,24 @@ void __init kmem_cache_init(void)
770 if (num_physpages > (32 << 20) >> PAGE_SHIFT) 1010 if (num_physpages > (32 << 20) >> PAGE_SHIFT)
771 slab_break_gfp_order = BREAK_GFP_ORDER_HI; 1011 slab_break_gfp_order = BREAK_GFP_ORDER_HI;
772 1012
773
774 /* Bootstrap is tricky, because several objects are allocated 1013 /* Bootstrap is tricky, because several objects are allocated
775 * from caches that do not exist yet: 1014 * from caches that do not exist yet:
776 * 1) initialize the cache_cache cache: it contains the kmem_cache_t 1015 * 1) initialize the cache_cache cache: it contains the kmem_cache_t
777 * structures of all caches, except cache_cache itself: cache_cache 1016 * structures of all caches, except cache_cache itself: cache_cache
778 * is statically allocated. 1017 * is statically allocated.
779 * Initially an __init data area is used for the head array, it's 1018 * Initially an __init data area is used for the head array and the
780 * replaced with a kmalloc allocated array at the end of the bootstrap. 1019 * kmem_list3 structures, it's replaced with a kmalloc allocated
1020 * array at the end of the bootstrap.
781 * 2) Create the first kmalloc cache. 1021 * 2) Create the first kmalloc cache.
782 * The kmem_cache_t for the new cache is allocated normally. An __init 1022 * The kmem_cache_t for the new cache is allocated normally.
783 * data area is used for the head array. 1023 * An __init data area is used for the head array.
784 * 3) Create the remaining kmalloc caches, with minimally sized head arrays. 1024 * 3) Create the remaining kmalloc caches, with minimally sized
1025 * head arrays.
785 * 4) Replace the __init data head arrays for cache_cache and the first 1026 * 4) Replace the __init data head arrays for cache_cache and the first
786 * kmalloc cache with kmalloc allocated arrays. 1027 * kmalloc cache with kmalloc allocated arrays.
787 * 5) Resize the head arrays of the kmalloc caches to their final sizes. 1028 * 5) Replace the __init data for kmem_list3 for cache_cache and
1029 * the other cache's with kmalloc allocated memory.
1030 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
788 */ 1031 */
789 1032
790 /* 1) create the cache_cache */ 1033 /* 1) create the cache_cache */
@@ -793,6 +1036,7 @@ void __init kmem_cache_init(void)
793 list_add(&cache_cache.next, &cache_chain); 1036 list_add(&cache_cache.next, &cache_chain);
794 cache_cache.colour_off = cache_line_size(); 1037 cache_cache.colour_off = cache_line_size();
795 cache_cache.array[smp_processor_id()] = &initarray_cache.cache; 1038 cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
1039 cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
796 1040
797 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); 1041 cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
798 1042
@@ -810,15 +1054,33 @@ void __init kmem_cache_init(void)
810 sizes = malloc_sizes; 1054 sizes = malloc_sizes;
811 names = cache_names; 1055 names = cache_names;
812 1056
1057 /* Initialize the caches that provide memory for the array cache
1058 * and the kmem_list3 structures first.
1059 * Without this, further allocations will bug
1060 */
1061
1062 sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
1063 sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN,
1064 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
1065
1066 if (INDEX_AC != INDEX_L3)
1067 sizes[INDEX_L3].cs_cachep =
1068 kmem_cache_create(names[INDEX_L3].name,
1069 sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN,
1070 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
1071
813 while (sizes->cs_size != ULONG_MAX) { 1072 while (sizes->cs_size != ULONG_MAX) {
814 /* For performance, all the general caches are L1 aligned. 1073 /*
1074 * For performance, all the general caches are L1 aligned.
815 * This should be particularly beneficial on SMP boxes, as it 1075 * This should be particularly beneficial on SMP boxes, as it
816 * eliminates "false sharing". 1076 * eliminates "false sharing".
817 * Note for systems short on memory removing the alignment will 1077 * Note for systems short on memory removing the alignment will
818 * allow tighter packing of the smaller caches. */ 1078 * allow tighter packing of the smaller caches.
819 sizes->cs_cachep = kmem_cache_create(names->name, 1079 */
820 sizes->cs_size, ARCH_KMALLOC_MINALIGN, 1080 if(!sizes->cs_cachep)
821 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL); 1081 sizes->cs_cachep = kmem_cache_create(names->name,
1082 sizes->cs_size, ARCH_KMALLOC_MINALIGN,
1083 (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
822 1084
823 /* Inc off-slab bufctl limit until the ceiling is hit. */ 1085 /* Inc off-slab bufctl limit until the ceiling is hit. */
824 if (!(OFF_SLAB(sizes->cs_cachep))) { 1086 if (!(OFF_SLAB(sizes->cs_cachep))) {
@@ -837,24 +1099,47 @@ void __init kmem_cache_init(void)
837 /* 4) Replace the bootstrap head arrays */ 1099 /* 4) Replace the bootstrap head arrays */
838 { 1100 {
839 void * ptr; 1101 void * ptr;
840 1102
841 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1103 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1104
842 local_irq_disable(); 1105 local_irq_disable();
843 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache); 1106 BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
844 memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init)); 1107 memcpy(ptr, ac_data(&cache_cache),
1108 sizeof(struct arraycache_init));
845 cache_cache.array[smp_processor_id()] = ptr; 1109 cache_cache.array[smp_processor_id()] = ptr;
846 local_irq_enable(); 1110 local_irq_enable();
847 1111
848 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL); 1112 ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
1113
849 local_irq_disable(); 1114 local_irq_disable();
850 BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache); 1115 BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
851 memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep), 1116 != &initarray_generic.cache);
1117 memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
852 sizeof(struct arraycache_init)); 1118 sizeof(struct arraycache_init));
853 malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr; 1119 malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
1120 ptr;
854 local_irq_enable(); 1121 local_irq_enable();
855 } 1122 }
1123 /* 5) Replace the bootstrap kmem_list3's */
1124 {
1125 int node;
1126 /* Replace the static kmem_list3 structures for the boot cpu */
1127 init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
1128 numa_node_id());
1129
1130 for_each_online_node(node) {
1131 init_list(malloc_sizes[INDEX_AC].cs_cachep,
1132 &initkmem_list3[SIZE_AC+node], node);
1133
1134 if (INDEX_AC != INDEX_L3) {
1135 init_list(malloc_sizes[INDEX_L3].cs_cachep,
1136 &initkmem_list3[SIZE_L3+node],
1137 node);
1138 }
1139 }
1140 }
856 1141
857 /* 5) resize the head arrays to their final sizes */ 1142 /* 6) resize the head arrays to their final sizes */
858 { 1143 {
859 kmem_cache_t *cachep; 1144 kmem_cache_t *cachep;
860 down(&cache_chain_sem); 1145 down(&cache_chain_sem);
@@ -870,7 +1155,6 @@ void __init kmem_cache_init(void)
870 * that initializes ac_data for all new cpus 1155 * that initializes ac_data for all new cpus
871 */ 1156 */
872 register_cpu_notifier(&cpucache_notifier); 1157 register_cpu_notifier(&cpucache_notifier);
873
874 1158
875 /* The reap timers are started later, with a module init call: 1159 /* The reap timers are started later, with a module init call:
876 * That part of the kernel is not yet operational. 1160 * That part of the kernel is not yet operational.
@@ -885,10 +1169,8 @@ static int __init cpucache_init(void)
885 * Register the timers that return unneeded 1169 * Register the timers that return unneeded
886 * pages to gfp. 1170 * pages to gfp.
887 */ 1171 */
888 for (cpu = 0; cpu < NR_CPUS; cpu++) { 1172 for_each_online_cpu(cpu)
889 if (cpu_online(cpu)) 1173 start_cpu_timer(cpu);
890 start_cpu_timer(cpu);
891 }
892 1174
893 return 0; 1175 return 0;
894} 1176}
@@ -1167,6 +1449,20 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1167 } 1449 }
1168} 1450}
1169 1451
1452/* For setting up all the kmem_list3s for cache whose objsize is same
1453 as size of kmem_list3. */
1454static inline void set_up_list3s(kmem_cache_t *cachep, int index)
1455{
1456 int node;
1457
1458 for_each_online_node(node) {
1459 cachep->nodelists[node] = &initkmem_list3[index+node];
1460 cachep->nodelists[node]->next_reap = jiffies +
1461 REAPTIMEOUT_LIST3 +
1462 ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
1463 }
1464}
1465
1170/** 1466/**
1171 * kmem_cache_create - Create a cache. 1467 * kmem_cache_create - Create a cache.
1172 * @name: A string which is used in /proc/slabinfo to identify this cache. 1468 * @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -1320,7 +1616,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
1320 size += BYTES_PER_WORD; 1616 size += BYTES_PER_WORD;
1321 } 1617 }
1322#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) 1618#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
1323 if (size > 128 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { 1619 if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
1324 cachep->dbghead += PAGE_SIZE - size; 1620 cachep->dbghead += PAGE_SIZE - size;
1325 size = PAGE_SIZE; 1621 size = PAGE_SIZE;
1326 } 1622 }
@@ -1422,10 +1718,6 @@ next:
1422 cachep->gfpflags |= GFP_DMA; 1718 cachep->gfpflags |= GFP_DMA;
1423 spin_lock_init(&cachep->spinlock); 1719 spin_lock_init(&cachep->spinlock);
1424 cachep->objsize = size; 1720 cachep->objsize = size;
1425 /* NUMA */
1426 INIT_LIST_HEAD(&cachep->lists.slabs_full);
1427 INIT_LIST_HEAD(&cachep->lists.slabs_partial);
1428 INIT_LIST_HEAD(&cachep->lists.slabs_free);
1429 1721
1430 if (flags & CFLGS_OFF_SLAB) 1722 if (flags & CFLGS_OFF_SLAB)
1431 cachep->slabp_cache = kmem_find_general_cachep(slab_size,0); 1723 cachep->slabp_cache = kmem_find_general_cachep(slab_size,0);
@@ -1444,11 +1736,43 @@ next:
1444 * the cache that's used by kmalloc(24), otherwise 1736 * the cache that's used by kmalloc(24), otherwise
1445 * the creation of further caches will BUG(). 1737 * the creation of further caches will BUG().
1446 */ 1738 */
1447 cachep->array[smp_processor_id()] = &initarray_generic.cache; 1739 cachep->array[smp_processor_id()] =
1448 g_cpucache_up = PARTIAL; 1740 &initarray_generic.cache;
1741
1742 /* If the cache that's used by
1743 * kmalloc(sizeof(kmem_list3)) is the first cache,
1744 * then we need to set up all its list3s, otherwise
1745 * the creation of further caches will BUG().
1746 */
1747 set_up_list3s(cachep, SIZE_AC);
1748 if (INDEX_AC == INDEX_L3)
1749 g_cpucache_up = PARTIAL_L3;
1750 else
1751 g_cpucache_up = PARTIAL_AC;
1449 } else { 1752 } else {
1450 cachep->array[smp_processor_id()] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL); 1753 cachep->array[smp_processor_id()] =
1754 kmalloc(sizeof(struct arraycache_init),
1755 GFP_KERNEL);
1756
1757 if (g_cpucache_up == PARTIAL_AC) {
1758 set_up_list3s(cachep, SIZE_L3);
1759 g_cpucache_up = PARTIAL_L3;
1760 } else {
1761 int node;
1762 for_each_online_node(node) {
1763
1764 cachep->nodelists[node] =
1765 kmalloc_node(sizeof(struct kmem_list3),
1766 GFP_KERNEL, node);
1767 BUG_ON(!cachep->nodelists[node]);
1768 kmem_list3_init(cachep->nodelists[node]);
1769 }
1770 }
1451 } 1771 }
1772 cachep->nodelists[numa_node_id()]->next_reap =
1773 jiffies + REAPTIMEOUT_LIST3 +
1774 ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
1775
1452 BUG_ON(!ac_data(cachep)); 1776 BUG_ON(!ac_data(cachep));
1453 ac_data(cachep)->avail = 0; 1777 ac_data(cachep)->avail = 0;
1454 ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES; 1778 ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
@@ -1456,13 +1780,8 @@ next:
1456 ac_data(cachep)->touched = 0; 1780 ac_data(cachep)->touched = 0;
1457 cachep->batchcount = 1; 1781 cachep->batchcount = 1;
1458 cachep->limit = BOOT_CPUCACHE_ENTRIES; 1782 cachep->limit = BOOT_CPUCACHE_ENTRIES;
1459 cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
1460 + cachep->num;
1461 } 1783 }
1462 1784
1463 cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 +
1464 ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
1465
1466 /* Need the semaphore to access the chain. */ 1785 /* Need the semaphore to access the chain. */
1467 down(&cache_chain_sem); 1786 down(&cache_chain_sem);
1468 { 1787 {
@@ -1519,13 +1838,23 @@ static void check_spinlock_acquired(kmem_cache_t *cachep)
1519{ 1838{
1520#ifdef CONFIG_SMP 1839#ifdef CONFIG_SMP
1521 check_irq_off(); 1840 check_irq_off();
1522 BUG_ON(spin_trylock(&cachep->spinlock)); 1841 assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
1523#endif 1842#endif
1524} 1843}
1844
1845static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
1846{
1847#ifdef CONFIG_SMP
1848 check_irq_off();
1849 assert_spin_locked(&cachep->nodelists[node]->list_lock);
1850#endif
1851}
1852
1525#else 1853#else
1526#define check_irq_off() do { } while(0) 1854#define check_irq_off() do { } while(0)
1527#define check_irq_on() do { } while(0) 1855#define check_irq_on() do { } while(0)
1528#define check_spinlock_acquired(x) do { } while(0) 1856#define check_spinlock_acquired(x) do { } while(0)
1857#define check_spinlock_acquired_node(x, y) do { } while(0)
1529#endif 1858#endif
1530 1859
1531/* 1860/*
@@ -1547,7 +1876,7 @@ static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
1547} 1876}
1548 1877
1549static void drain_array_locked(kmem_cache_t* cachep, 1878static void drain_array_locked(kmem_cache_t* cachep,
1550 struct array_cache *ac, int force); 1879 struct array_cache *ac, int force, int node);
1551 1880
1552static void do_drain(void *arg) 1881static void do_drain(void *arg)
1553{ 1882{
@@ -1556,59 +1885,82 @@ static void do_drain(void *arg)
1556 1885
1557 check_irq_off(); 1886 check_irq_off();
1558 ac = ac_data(cachep); 1887 ac = ac_data(cachep);
1559 spin_lock(&cachep->spinlock); 1888 spin_lock(&cachep->nodelists[numa_node_id()]->list_lock);
1560 free_block(cachep, &ac_entry(ac)[0], ac->avail); 1889 free_block(cachep, ac->entry, ac->avail);
1561 spin_unlock(&cachep->spinlock); 1890 spin_unlock(&cachep->nodelists[numa_node_id()]->list_lock);
1562 ac->avail = 0; 1891 ac->avail = 0;
1563} 1892}
1564 1893
1565static void drain_cpu_caches(kmem_cache_t *cachep) 1894static void drain_cpu_caches(kmem_cache_t *cachep)
1566{ 1895{
1896 struct kmem_list3 *l3;
1897 int node;
1898
1567 smp_call_function_all_cpus(do_drain, cachep); 1899 smp_call_function_all_cpus(do_drain, cachep);
1568 check_irq_on(); 1900 check_irq_on();
1569 spin_lock_irq(&cachep->spinlock); 1901 spin_lock_irq(&cachep->spinlock);
1570 if (cachep->lists.shared) 1902 for_each_online_node(node) {
1571 drain_array_locked(cachep, cachep->lists.shared, 1); 1903 l3 = cachep->nodelists[node];
1904 if (l3) {
1905 spin_lock(&l3->list_lock);
1906 drain_array_locked(cachep, l3->shared, 1, node);
1907 spin_unlock(&l3->list_lock);
1908 if (l3->alien)
1909 drain_alien_cache(cachep, l3);
1910 }
1911 }
1572 spin_unlock_irq(&cachep->spinlock); 1912 spin_unlock_irq(&cachep->spinlock);
1573} 1913}
1574 1914
1575 1915static int __node_shrink(kmem_cache_t *cachep, int node)
1576/* NUMA shrink all list3s */
1577static int __cache_shrink(kmem_cache_t *cachep)
1578{ 1916{
1579 struct slab *slabp; 1917 struct slab *slabp;
1918 struct kmem_list3 *l3 = cachep->nodelists[node];
1580 int ret; 1919 int ret;
1581 1920
1582 drain_cpu_caches(cachep); 1921 for (;;) {
1583
1584 check_irq_on();
1585 spin_lock_irq(&cachep->spinlock);
1586
1587 for(;;) {
1588 struct list_head *p; 1922 struct list_head *p;
1589 1923
1590 p = cachep->lists.slabs_free.prev; 1924 p = l3->slabs_free.prev;
1591 if (p == &cachep->lists.slabs_free) 1925 if (p == &l3->slabs_free)
1592 break; 1926 break;
1593 1927
1594 slabp = list_entry(cachep->lists.slabs_free.prev, struct slab, list); 1928 slabp = list_entry(l3->slabs_free.prev, struct slab, list);
1595#if DEBUG 1929#if DEBUG
1596 if (slabp->inuse) 1930 if (slabp->inuse)
1597 BUG(); 1931 BUG();
1598#endif 1932#endif
1599 list_del(&slabp->list); 1933 list_del(&slabp->list);
1600 1934
1601 cachep->lists.free_objects -= cachep->num; 1935 l3->free_objects -= cachep->num;
1602 spin_unlock_irq(&cachep->spinlock); 1936 spin_unlock_irq(&l3->list_lock);
1603 slab_destroy(cachep, slabp); 1937 slab_destroy(cachep, slabp);
1604 spin_lock_irq(&cachep->spinlock); 1938 spin_lock_irq(&l3->list_lock);
1605 } 1939 }
1606 ret = !list_empty(&cachep->lists.slabs_full) || 1940 ret = !list_empty(&l3->slabs_full) ||
1607 !list_empty(&cachep->lists.slabs_partial); 1941 !list_empty(&l3->slabs_partial);
1608 spin_unlock_irq(&cachep->spinlock);
1609 return ret; 1942 return ret;
1610} 1943}
1611 1944
1945static int __cache_shrink(kmem_cache_t *cachep)
1946{
1947 int ret = 0, i = 0;
1948 struct kmem_list3 *l3;
1949
1950 drain_cpu_caches(cachep);
1951
1952 check_irq_on();
1953 for_each_online_node(i) {
1954 l3 = cachep->nodelists[i];
1955 if (l3) {
1956 spin_lock_irq(&l3->list_lock);
1957 ret += __node_shrink(cachep, i);
1958 spin_unlock_irq(&l3->list_lock);
1959 }
1960 }
1961 return (ret ? 1 : 0);
1962}
1963
1612/** 1964/**
1613 * kmem_cache_shrink - Shrink a cache. 1965 * kmem_cache_shrink - Shrink a cache.
1614 * @cachep: The cache to shrink. 1966 * @cachep: The cache to shrink.
@@ -1645,6 +1997,7 @@ EXPORT_SYMBOL(kmem_cache_shrink);
1645int kmem_cache_destroy(kmem_cache_t * cachep) 1997int kmem_cache_destroy(kmem_cache_t * cachep)
1646{ 1998{
1647 int i; 1999 int i;
2000 struct kmem_list3 *l3;
1648 2001
1649 if (!cachep || in_interrupt()) 2002 if (!cachep || in_interrupt())
1650 BUG(); 2003 BUG();
@@ -1672,15 +2025,17 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
1672 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) 2025 if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
1673 synchronize_rcu(); 2026 synchronize_rcu();
1674 2027
1675 /* no cpu_online check required here since we clear the percpu 2028 for_each_online_cpu(i)
1676 * array on cpu offline and set this to NULL.
1677 */
1678 for (i = 0; i < NR_CPUS; i++)
1679 kfree(cachep->array[i]); 2029 kfree(cachep->array[i]);
1680 2030
1681 /* NUMA: free the list3 structures */ 2031 /* NUMA: free the list3 structures */
1682 kfree(cachep->lists.shared); 2032 for_each_online_node(i) {
1683 cachep->lists.shared = NULL; 2033 if ((l3 = cachep->nodelists[i])) {
2034 kfree(l3->shared);
2035 free_alien_cache(l3->alien);
2036 kfree(l3);
2037 }
2038 }
1684 kmem_cache_free(&cache_cache, cachep); 2039 kmem_cache_free(&cache_cache, cachep);
1685 2040
1686 unlock_cpu_hotplug(); 2041 unlock_cpu_hotplug();
@@ -1690,8 +2045,8 @@ int kmem_cache_destroy(kmem_cache_t * cachep)
1690EXPORT_SYMBOL(kmem_cache_destroy); 2045EXPORT_SYMBOL(kmem_cache_destroy);
1691 2046
1692/* Get the memory for a slab management obj. */ 2047/* Get the memory for a slab management obj. */
1693static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, 2048static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
1694 void *objp, int colour_off, unsigned int __nocast local_flags) 2049 int colour_off, unsigned int __nocast local_flags)
1695{ 2050{
1696 struct slab *slabp; 2051 struct slab *slabp;
1697 2052
@@ -1722,7 +2077,7 @@ static void cache_init_objs(kmem_cache_t *cachep,
1722 int i; 2077 int i;
1723 2078
1724 for (i = 0; i < cachep->num; i++) { 2079 for (i = 0; i < cachep->num; i++) {
1725 void* objp = slabp->s_mem+cachep->objsize*i; 2080 void *objp = slabp->s_mem+cachep->objsize*i;
1726#if DEBUG 2081#if DEBUG
1727 /* need to poison the objs? */ 2082 /* need to poison the objs? */
1728 if (cachep->flags & SLAB_POISON) 2083 if (cachep->flags & SLAB_POISON)
@@ -1799,6 +2154,7 @@ static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nod
1799 size_t offset; 2154 size_t offset;
1800 unsigned int local_flags; 2155 unsigned int local_flags;
1801 unsigned long ctor_flags; 2156 unsigned long ctor_flags;
2157 struct kmem_list3 *l3;
1802 2158
1803 /* Be lazy and only check for valid flags here, 2159 /* Be lazy and only check for valid flags here,
1804 * keeping it out of the critical path in kmem_cache_alloc(). 2160 * keeping it out of the critical path in kmem_cache_alloc().
@@ -1830,6 +2186,7 @@ static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nod
1830 2186
1831 spin_unlock(&cachep->spinlock); 2187 spin_unlock(&cachep->spinlock);
1832 2188
2189 check_irq_off();
1833 if (local_flags & __GFP_WAIT) 2190 if (local_flags & __GFP_WAIT)
1834 local_irq_enable(); 2191 local_irq_enable();
1835 2192
@@ -1841,8 +2198,9 @@ static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nod
1841 */ 2198 */
1842 kmem_flagcheck(cachep, flags); 2199 kmem_flagcheck(cachep, flags);
1843 2200
1844 2201 /* Get mem for the objs.
1845 /* Get mem for the objs. */ 2202 * Attempt to allocate a physical page from 'nodeid',
2203 */
1846 if (!(objp = kmem_getpages(cachep, flags, nodeid))) 2204 if (!(objp = kmem_getpages(cachep, flags, nodeid)))
1847 goto failed; 2205 goto failed;
1848 2206
@@ -1850,6 +2208,7 @@ static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nod
1850 if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags))) 2208 if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
1851 goto opps1; 2209 goto opps1;
1852 2210
2211 slabp->nodeid = nodeid;
1853 set_slab_attr(cachep, slabp, objp); 2212 set_slab_attr(cachep, slabp, objp);
1854 2213
1855 cache_init_objs(cachep, slabp, ctor_flags); 2214 cache_init_objs(cachep, slabp, ctor_flags);
@@ -1857,13 +2216,14 @@ static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nod
1857 if (local_flags & __GFP_WAIT) 2216 if (local_flags & __GFP_WAIT)
1858 local_irq_disable(); 2217 local_irq_disable();
1859 check_irq_off(); 2218 check_irq_off();
1860 spin_lock(&cachep->spinlock); 2219 l3 = cachep->nodelists[nodeid];
2220 spin_lock(&l3->list_lock);
1861 2221
1862 /* Make slab active. */ 2222 /* Make slab active. */
1863 list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free)); 2223 list_add_tail(&slabp->list, &(l3->slabs_free));
1864 STATS_INC_GROWN(cachep); 2224 STATS_INC_GROWN(cachep);
1865 list3_data(cachep)->free_objects += cachep->num; 2225 l3->free_objects += cachep->num;
1866 spin_unlock(&cachep->spinlock); 2226 spin_unlock(&l3->list_lock);
1867 return 1; 2227 return 1;
1868opps1: 2228opps1:
1869 kmem_freepages(cachep, objp); 2229 kmem_freepages(cachep, objp);
@@ -1969,7 +2329,6 @@ static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
1969 kmem_bufctl_t i; 2329 kmem_bufctl_t i;
1970 int entries = 0; 2330 int entries = 0;
1971 2331
1972 check_spinlock_acquired(cachep);
1973 /* Check slab's freelist to see if this obj is there. */ 2332 /* Check slab's freelist to see if this obj is there. */
1974 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { 2333 for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
1975 entries++; 2334 entries++;
@@ -2012,10 +2371,11 @@ retry:
2012 */ 2371 */
2013 batchcount = BATCHREFILL_LIMIT; 2372 batchcount = BATCHREFILL_LIMIT;
2014 } 2373 }
2015 l3 = list3_data(cachep); 2374 l3 = cachep->nodelists[numa_node_id()];
2375
2376 BUG_ON(ac->avail > 0 || !l3);
2377 spin_lock(&l3->list_lock);
2016 2378
2017 BUG_ON(ac->avail > 0);
2018 spin_lock(&cachep->spinlock);
2019 if (l3->shared) { 2379 if (l3->shared) {
2020 struct array_cache *shared_array = l3->shared; 2380 struct array_cache *shared_array = l3->shared;
2021 if (shared_array->avail) { 2381 if (shared_array->avail) {
@@ -2023,8 +2383,9 @@ retry:
2023 batchcount = shared_array->avail; 2383 batchcount = shared_array->avail;
2024 shared_array->avail -= batchcount; 2384 shared_array->avail -= batchcount;
2025 ac->avail = batchcount; 2385 ac->avail = batchcount;
2026 memcpy(ac_entry(ac), &ac_entry(shared_array)[shared_array->avail], 2386 memcpy(ac->entry,
2027 sizeof(void*)*batchcount); 2387 &(shared_array->entry[shared_array->avail]),
2388 sizeof(void*)*batchcount);
2028 shared_array->touched = 1; 2389 shared_array->touched = 1;
2029 goto alloc_done; 2390 goto alloc_done;
2030 } 2391 }
@@ -2051,7 +2412,8 @@ retry:
2051 STATS_SET_HIGH(cachep); 2412 STATS_SET_HIGH(cachep);
2052 2413
2053 /* get obj pointer */ 2414 /* get obj pointer */
2054 ac_entry(ac)[ac->avail++] = slabp->s_mem + slabp->free*cachep->objsize; 2415 ac->entry[ac->avail++] = slabp->s_mem +
2416 slabp->free*cachep->objsize;
2055 2417
2056 slabp->inuse++; 2418 slabp->inuse++;
2057 next = slab_bufctl(slabp)[slabp->free]; 2419 next = slab_bufctl(slabp)[slabp->free];
@@ -2073,12 +2435,12 @@ retry:
2073must_grow: 2435must_grow:
2074 l3->free_objects -= ac->avail; 2436 l3->free_objects -= ac->avail;
2075alloc_done: 2437alloc_done:
2076 spin_unlock(&cachep->spinlock); 2438 spin_unlock(&l3->list_lock);
2077 2439
2078 if (unlikely(!ac->avail)) { 2440 if (unlikely(!ac->avail)) {
2079 int x; 2441 int x;
2080 x = cache_grow(cachep, flags, -1); 2442 x = cache_grow(cachep, flags, numa_node_id());
2081 2443
2082 // cache_grow can reenable interrupts, then ac could change. 2444 // cache_grow can reenable interrupts, then ac could change.
2083 ac = ac_data(cachep); 2445 ac = ac_data(cachep);
2084 if (!x && ac->avail == 0) // no objects in sight? abort 2446 if (!x && ac->avail == 0) // no objects in sight? abort
@@ -2088,7 +2450,7 @@ alloc_done:
2088 goto retry; 2450 goto retry;
2089 } 2451 }
2090 ac->touched = 1; 2452 ac->touched = 1;
2091 return ac_entry(ac)[--ac->avail]; 2453 return ac->entry[--ac->avail];
2092} 2454}
2093 2455
2094static inline void 2456static inline void
@@ -2160,7 +2522,7 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast fl
2160 if (likely(ac->avail)) { 2522 if (likely(ac->avail)) {
2161 STATS_INC_ALLOCHIT(cachep); 2523 STATS_INC_ALLOCHIT(cachep);
2162 ac->touched = 1; 2524 ac->touched = 1;
2163 objp = ac_entry(ac)[--ac->avail]; 2525 objp = ac->entry[--ac->avail];
2164 } else { 2526 } else {
2165 STATS_INC_ALLOCMISS(cachep); 2527 STATS_INC_ALLOCMISS(cachep);
2166 objp = cache_alloc_refill(cachep, flags); 2528 objp = cache_alloc_refill(cachep, flags);
@@ -2172,33 +2534,104 @@ static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast fl
2172 return objp; 2534 return objp;
2173} 2535}
2174 2536
2175/* 2537#ifdef CONFIG_NUMA
2176 * NUMA: different approach needed if the spinlock is moved into 2538/*
2177 * the l3 structure 2539 * A interface to enable slab creation on nodeid
2178 */ 2540 */
2541static void *__cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
2542{
2543 struct list_head *entry;
2544 struct slab *slabp;
2545 struct kmem_list3 *l3;
2546 void *obj;
2547 kmem_bufctl_t next;
2548 int x;
2549
2550 l3 = cachep->nodelists[nodeid];
2551 BUG_ON(!l3);
2552
2553retry:
2554 spin_lock(&l3->list_lock);
2555 entry = l3->slabs_partial.next;
2556 if (entry == &l3->slabs_partial) {
2557 l3->free_touched = 1;
2558 entry = l3->slabs_free.next;
2559 if (entry == &l3->slabs_free)
2560 goto must_grow;
2561 }
2562
2563 slabp = list_entry(entry, struct slab, list);
2564 check_spinlock_acquired_node(cachep, nodeid);
2565 check_slabp(cachep, slabp);
2566
2567 STATS_INC_NODEALLOCS(cachep);
2568 STATS_INC_ACTIVE(cachep);
2569 STATS_SET_HIGH(cachep);
2570
2571 BUG_ON(slabp->inuse == cachep->num);
2572
2573 /* get obj pointer */
2574 obj = slabp->s_mem + slabp->free*cachep->objsize;
2575 slabp->inuse++;
2576 next = slab_bufctl(slabp)[slabp->free];
2577#if DEBUG
2578 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2579#endif
2580 slabp->free = next;
2581 check_slabp(cachep, slabp);
2582 l3->free_objects--;
2583 /* move slabp to correct slabp list: */
2584 list_del(&slabp->list);
2585
2586 if (slabp->free == BUFCTL_END) {
2587 list_add(&slabp->list, &l3->slabs_full);
2588 } else {
2589 list_add(&slabp->list, &l3->slabs_partial);
2590 }
2591
2592 spin_unlock(&l3->list_lock);
2593 goto done;
2594
2595must_grow:
2596 spin_unlock(&l3->list_lock);
2597 x = cache_grow(cachep, flags, nodeid);
2179 2598
2599 if (!x)
2600 return NULL;
2601
2602 goto retry;
2603done:
2604 return obj;
2605}
2606#endif
2607
2608/*
2609 * Caller needs to acquire correct kmem_list's list_lock
2610 */
2180static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects) 2611static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
2181{ 2612{
2182 int i; 2613 int i;
2183 2614 struct kmem_list3 *l3;
2184 check_spinlock_acquired(cachep);
2185
2186 /* NUMA: move add into loop */
2187 cachep->lists.free_objects += nr_objects;
2188 2615
2189 for (i = 0; i < nr_objects; i++) { 2616 for (i = 0; i < nr_objects; i++) {
2190 void *objp = objpp[i]; 2617 void *objp = objpp[i];
2191 struct slab *slabp; 2618 struct slab *slabp;
2192 unsigned int objnr; 2619 unsigned int objnr;
2620 int nodeid = 0;
2193 2621
2194 slabp = GET_PAGE_SLAB(virt_to_page(objp)); 2622 slabp = GET_PAGE_SLAB(virt_to_page(objp));
2623 nodeid = slabp->nodeid;
2624 l3 = cachep->nodelists[nodeid];
2195 list_del(&slabp->list); 2625 list_del(&slabp->list);
2196 objnr = (objp - slabp->s_mem) / cachep->objsize; 2626 objnr = (objp - slabp->s_mem) / cachep->objsize;
2627 check_spinlock_acquired_node(cachep, nodeid);
2197 check_slabp(cachep, slabp); 2628 check_slabp(cachep, slabp);
2629
2630
2198#if DEBUG 2631#if DEBUG
2199 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) { 2632 if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2200 printk(KERN_ERR "slab: double free detected in cache '%s', objp %p.\n", 2633 printk(KERN_ERR "slab: double free detected in cache "
2201 cachep->name, objp); 2634 "'%s', objp %p\n", cachep->name, objp);
2202 BUG(); 2635 BUG();
2203 } 2636 }
2204#endif 2637#endif
@@ -2206,24 +2639,23 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
2206 slabp->free = objnr; 2639 slabp->free = objnr;
2207 STATS_DEC_ACTIVE(cachep); 2640 STATS_DEC_ACTIVE(cachep);
2208 slabp->inuse--; 2641 slabp->inuse--;
2642 l3->free_objects++;
2209 check_slabp(cachep, slabp); 2643 check_slabp(cachep, slabp);
2210 2644
2211 /* fixup slab chains */ 2645 /* fixup slab chains */
2212 if (slabp->inuse == 0) { 2646 if (slabp->inuse == 0) {
2213 if (cachep->lists.free_objects > cachep->free_limit) { 2647 if (l3->free_objects > l3->free_limit) {
2214 cachep->lists.free_objects -= cachep->num; 2648 l3->free_objects -= cachep->num;
2215 slab_destroy(cachep, slabp); 2649 slab_destroy(cachep, slabp);
2216 } else { 2650 } else {
2217 list_add(&slabp->list, 2651 list_add(&slabp->list, &l3->slabs_free);
2218 &list3_data_ptr(cachep, objp)->slabs_free);
2219 } 2652 }
2220 } else { 2653 } else {
2221 /* Unconditionally move a slab to the end of the 2654 /* Unconditionally move a slab to the end of the
2222 * partial list on free - maximum time for the 2655 * partial list on free - maximum time for the
2223 * other objects to be freed, too. 2656 * other objects to be freed, too.
2224 */ 2657 */
2225 list_add_tail(&slabp->list, 2658 list_add_tail(&slabp->list, &l3->slabs_partial);
2226 &list3_data_ptr(cachep, objp)->slabs_partial);
2227 } 2659 }
2228 } 2660 }
2229} 2661}
@@ -2231,36 +2663,38 @@ static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
2231static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac) 2663static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
2232{ 2664{
2233 int batchcount; 2665 int batchcount;
2666 struct kmem_list3 *l3;
2234 2667
2235 batchcount = ac->batchcount; 2668 batchcount = ac->batchcount;
2236#if DEBUG 2669#if DEBUG
2237 BUG_ON(!batchcount || batchcount > ac->avail); 2670 BUG_ON(!batchcount || batchcount > ac->avail);
2238#endif 2671#endif
2239 check_irq_off(); 2672 check_irq_off();
2240 spin_lock(&cachep->spinlock); 2673 l3 = cachep->nodelists[numa_node_id()];
2241 if (cachep->lists.shared) { 2674 spin_lock(&l3->list_lock);
2242 struct array_cache *shared_array = cachep->lists.shared; 2675 if (l3->shared) {
2676 struct array_cache *shared_array = l3->shared;
2243 int max = shared_array->limit-shared_array->avail; 2677 int max = shared_array->limit-shared_array->avail;
2244 if (max) { 2678 if (max) {
2245 if (batchcount > max) 2679 if (batchcount > max)
2246 batchcount = max; 2680 batchcount = max;
2247 memcpy(&ac_entry(shared_array)[shared_array->avail], 2681 memcpy(&(shared_array->entry[shared_array->avail]),
2248 &ac_entry(ac)[0], 2682 ac->entry,
2249 sizeof(void*)*batchcount); 2683 sizeof(void*)*batchcount);
2250 shared_array->avail += batchcount; 2684 shared_array->avail += batchcount;
2251 goto free_done; 2685 goto free_done;
2252 } 2686 }
2253 } 2687 }
2254 2688
2255 free_block(cachep, &ac_entry(ac)[0], batchcount); 2689 free_block(cachep, ac->entry, batchcount);
2256free_done: 2690free_done:
2257#if STATS 2691#if STATS
2258 { 2692 {
2259 int i = 0; 2693 int i = 0;
2260 struct list_head *p; 2694 struct list_head *p;
2261 2695
2262 p = list3_data(cachep)->slabs_free.next; 2696 p = l3->slabs_free.next;
2263 while (p != &(list3_data(cachep)->slabs_free)) { 2697 while (p != &(l3->slabs_free)) {
2264 struct slab *slabp; 2698 struct slab *slabp;
2265 2699
2266 slabp = list_entry(p, struct slab, list); 2700 slabp = list_entry(p, struct slab, list);
@@ -2272,12 +2706,13 @@ free_done:
2272 STATS_SET_FREEABLE(cachep, i); 2706 STATS_SET_FREEABLE(cachep, i);
2273 } 2707 }
2274#endif 2708#endif
2275 spin_unlock(&cachep->spinlock); 2709 spin_unlock(&l3->list_lock);
2276 ac->avail -= batchcount; 2710 ac->avail -= batchcount;
2277 memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount], 2711 memmove(ac->entry, &(ac->entry[batchcount]),
2278 sizeof(void*)*ac->avail); 2712 sizeof(void*)*ac->avail);
2279} 2713}
2280 2714
2715
2281/* 2716/*
2282 * __cache_free 2717 * __cache_free
2283 * Release an obj back to its cache. If the obj has a constructed 2718 * Release an obj back to its cache. If the obj has a constructed
@@ -2292,14 +2727,46 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
2292 check_irq_off(); 2727 check_irq_off();
2293 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0)); 2728 objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
2294 2729
2730 /* Make sure we are not freeing a object from another
2731 * node to the array cache on this cpu.
2732 */
2733#ifdef CONFIG_NUMA
2734 {
2735 struct slab *slabp;
2736 slabp = GET_PAGE_SLAB(virt_to_page(objp));
2737 if (unlikely(slabp->nodeid != numa_node_id())) {
2738 struct array_cache *alien = NULL;
2739 int nodeid = slabp->nodeid;
2740 struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()];
2741
2742 STATS_INC_NODEFREES(cachep);
2743 if (l3->alien && l3->alien[nodeid]) {
2744 alien = l3->alien[nodeid];
2745 spin_lock(&alien->lock);
2746 if (unlikely(alien->avail == alien->limit))
2747 __drain_alien_cache(cachep,
2748 alien, nodeid);
2749 alien->entry[alien->avail++] = objp;
2750 spin_unlock(&alien->lock);
2751 } else {
2752 spin_lock(&(cachep->nodelists[nodeid])->
2753 list_lock);
2754 free_block(cachep, &objp, 1);
2755 spin_unlock(&(cachep->nodelists[nodeid])->
2756 list_lock);
2757 }
2758 return;
2759 }
2760 }
2761#endif
2295 if (likely(ac->avail < ac->limit)) { 2762 if (likely(ac->avail < ac->limit)) {
2296 STATS_INC_FREEHIT(cachep); 2763 STATS_INC_FREEHIT(cachep);
2297 ac_entry(ac)[ac->avail++] = objp; 2764 ac->entry[ac->avail++] = objp;
2298 return; 2765 return;
2299 } else { 2766 } else {
2300 STATS_INC_FREEMISS(cachep); 2767 STATS_INC_FREEMISS(cachep);
2301 cache_flusharray(cachep, ac); 2768 cache_flusharray(cachep, ac);
2302 ac_entry(ac)[ac->avail++] = objp; 2769 ac->entry[ac->avail++] = objp;
2303 } 2770 }
2304} 2771}
2305 2772
@@ -2369,81 +2836,30 @@ out:
2369 * Identical to kmem_cache_alloc, except that this function is slow 2836 * Identical to kmem_cache_alloc, except that this function is slow
2370 * and can sleep. And it will allocate memory on the given node, which 2837 * and can sleep. And it will allocate memory on the given node, which
2371 * can improve the performance for cpu bound structures. 2838 * can improve the performance for cpu bound structures.
2839 * New and improved: it will now make sure that the object gets
2840 * put on the correct node list so that there is no false sharing.
2372 */ 2841 */
2373void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid) 2842void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
2374{ 2843{
2375 int loop; 2844 unsigned long save_flags;
2376 void *objp; 2845 void *ptr;
2377 struct slab *slabp;
2378 kmem_bufctl_t next;
2379
2380 if (nodeid == -1)
2381 return kmem_cache_alloc(cachep, flags);
2382
2383 for (loop = 0;;loop++) {
2384 struct list_head *q;
2385
2386 objp = NULL;
2387 check_irq_on();
2388 spin_lock_irq(&cachep->spinlock);
2389 /* walk through all partial and empty slab and find one
2390 * from the right node */
2391 list_for_each(q,&cachep->lists.slabs_partial) {
2392 slabp = list_entry(q, struct slab, list);
2393
2394 if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
2395 loop > 2)
2396 goto got_slabp;
2397 }
2398 list_for_each(q, &cachep->lists.slabs_free) {
2399 slabp = list_entry(q, struct slab, list);
2400 2846
2401 if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid || 2847 if (nodeid == numa_node_id() || nodeid == -1)
2402 loop > 2) 2848 return __cache_alloc(cachep, flags);
2403 goto got_slabp;
2404 }
2405 spin_unlock_irq(&cachep->spinlock);
2406 2849
2407 local_irq_disable(); 2850 if (unlikely(!cachep->nodelists[nodeid])) {
2408 if (!cache_grow(cachep, flags, nodeid)) { 2851 /* Fall back to __cache_alloc if we run into trouble */
2409 local_irq_enable(); 2852 printk(KERN_WARNING "slab: not allocating in inactive node %d for cache %s\n", nodeid, cachep->name);
2410 return NULL; 2853 return __cache_alloc(cachep,flags);
2411 }
2412 local_irq_enable();
2413 } 2854 }
2414got_slabp:
2415 /* found one: allocate object */
2416 check_slabp(cachep, slabp);
2417 check_spinlock_acquired(cachep);
2418 2855
2419 STATS_INC_ALLOCED(cachep); 2856 cache_alloc_debugcheck_before(cachep, flags);
2420 STATS_INC_ACTIVE(cachep); 2857 local_irq_save(save_flags);
2421 STATS_SET_HIGH(cachep); 2858 ptr = __cache_alloc_node(cachep, flags, nodeid);
2422 STATS_INC_NODEALLOCS(cachep); 2859 local_irq_restore(save_flags);
2423 2860 ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0));
2424 objp = slabp->s_mem + slabp->free*cachep->objsize;
2425
2426 slabp->inuse++;
2427 next = slab_bufctl(slabp)[slabp->free];
2428#if DEBUG
2429 slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2430#endif
2431 slabp->free = next;
2432 check_slabp(cachep, slabp);
2433
2434 /* move slabp to correct slabp list: */
2435 list_del(&slabp->list);
2436 if (slabp->free == BUFCTL_END)
2437 list_add(&slabp->list, &cachep->lists.slabs_full);
2438 else
2439 list_add(&slabp->list, &cachep->lists.slabs_partial);
2440
2441 list3_data(cachep)->free_objects--;
2442 spin_unlock_irq(&cachep->spinlock);
2443 2861
2444 objp = cache_alloc_debugcheck_after(cachep, GFP_KERNEL, objp, 2862 return ptr;
2445 __builtin_return_address(0));
2446 return objp;
2447} 2863}
2448EXPORT_SYMBOL(kmem_cache_alloc_node); 2864EXPORT_SYMBOL(kmem_cache_alloc_node);
2449 2865
@@ -2513,11 +2929,18 @@ void *__alloc_percpu(size_t size, size_t align)
2513 if (!pdata) 2929 if (!pdata)
2514 return NULL; 2930 return NULL;
2515 2931
2516 for (i = 0; i < NR_CPUS; i++) { 2932 /*
2517 if (!cpu_possible(i)) 2933 * Cannot use for_each_online_cpu since a cpu may come online
2518 continue; 2934 * and we have no way of figuring out how to fix the array
2519 pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, 2935 * that we have allocated then....
2520 cpu_to_node(i)); 2936 */
2937 for_each_cpu(i) {
2938 int node = cpu_to_node(i);
2939
2940 if (node_online(node))
2941 pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node);
2942 else
2943 pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
2521 2944
2522 if (!pdata->ptrs[i]) 2945 if (!pdata->ptrs[i])
2523 goto unwind_oom; 2946 goto unwind_oom;
@@ -2607,11 +3030,11 @@ free_percpu(const void *objp)
2607 int i; 3030 int i;
2608 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp); 3031 struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
2609 3032
2610 for (i = 0; i < NR_CPUS; i++) { 3033 /*
2611 if (!cpu_possible(i)) 3034 * We allocate for all cpus so we cannot use for online cpu here.
2612 continue; 3035 */
3036 for_each_cpu(i)
2613 kfree(p->ptrs[i]); 3037 kfree(p->ptrs[i]);
2614 }
2615 kfree(p); 3038 kfree(p);
2616} 3039}
2617EXPORT_SYMBOL(free_percpu); 3040EXPORT_SYMBOL(free_percpu);
@@ -2629,6 +3052,64 @@ const char *kmem_cache_name(kmem_cache_t *cachep)
2629} 3052}
2630EXPORT_SYMBOL_GPL(kmem_cache_name); 3053EXPORT_SYMBOL_GPL(kmem_cache_name);
2631 3054
3055/*
3056 * This initializes kmem_list3 for all nodes.
3057 */
3058static int alloc_kmemlist(kmem_cache_t *cachep)
3059{
3060 int node;
3061 struct kmem_list3 *l3;
3062 int err = 0;
3063
3064 for_each_online_node(node) {
3065 struct array_cache *nc = NULL, *new;
3066 struct array_cache **new_alien = NULL;
3067#ifdef CONFIG_NUMA
3068 if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
3069 goto fail;
3070#endif
3071 if (!(new = alloc_arraycache(node, (cachep->shared*
3072 cachep->batchcount), 0xbaadf00d)))
3073 goto fail;
3074 if ((l3 = cachep->nodelists[node])) {
3075
3076 spin_lock_irq(&l3->list_lock);
3077
3078 if ((nc = cachep->nodelists[node]->shared))
3079 free_block(cachep, nc->entry,
3080 nc->avail);
3081
3082 l3->shared = new;
3083 if (!cachep->nodelists[node]->alien) {
3084 l3->alien = new_alien;
3085 new_alien = NULL;
3086 }
3087 l3->free_limit = (1 + nr_cpus_node(node))*
3088 cachep->batchcount + cachep->num;
3089 spin_unlock_irq(&l3->list_lock);
3090 kfree(nc);
3091 free_alien_cache(new_alien);
3092 continue;
3093 }
3094 if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
3095 GFP_KERNEL, node)))
3096 goto fail;
3097
3098 kmem_list3_init(l3);
3099 l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
3100 ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
3101 l3->shared = new;
3102 l3->alien = new_alien;
3103 l3->free_limit = (1 + nr_cpus_node(node))*
3104 cachep->batchcount + cachep->num;
3105 cachep->nodelists[node] = l3;
3106 }
3107 return err;
3108fail:
3109 err = -ENOMEM;
3110 return err;
3111}
3112
2632struct ccupdate_struct { 3113struct ccupdate_struct {
2633 kmem_cache_t *cachep; 3114 kmem_cache_t *cachep;
2634 struct array_cache *new[NR_CPUS]; 3115 struct array_cache *new[NR_CPUS];
@@ -2641,7 +3122,7 @@ static void do_ccupdate_local(void *info)
2641 3122
2642 check_irq_off(); 3123 check_irq_off();
2643 old = ac_data(new->cachep); 3124 old = ac_data(new->cachep);
2644 3125
2645 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()]; 3126 new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
2646 new->new[smp_processor_id()] = old; 3127 new->new[smp_processor_id()] = old;
2647} 3128}
@@ -2651,54 +3132,43 @@ static int do_tune_cpucache(kmem_cache_t *cachep, int limit, int batchcount,
2651 int shared) 3132 int shared)
2652{ 3133{
2653 struct ccupdate_struct new; 3134 struct ccupdate_struct new;
2654 struct array_cache *new_shared; 3135 int i, err;
2655 int i;
2656 3136
2657 memset(&new.new,0,sizeof(new.new)); 3137 memset(&new.new,0,sizeof(new.new));
2658 for (i = 0; i < NR_CPUS; i++) { 3138 for_each_online_cpu(i) {
2659 if (cpu_online(i)) { 3139 new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount);
2660 new.new[i] = alloc_arraycache(i, limit, batchcount); 3140 if (!new.new[i]) {
2661 if (!new.new[i]) { 3141 for (i--; i >= 0; i--) kfree(new.new[i]);
2662 for (i--; i >= 0; i--) kfree(new.new[i]); 3142 return -ENOMEM;
2663 return -ENOMEM;
2664 }
2665 } else {
2666 new.new[i] = NULL;
2667 } 3143 }
2668 } 3144 }
2669 new.cachep = cachep; 3145 new.cachep = cachep;
2670 3146
2671 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new); 3147 smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
2672 3148
2673 check_irq_on(); 3149 check_irq_on();
2674 spin_lock_irq(&cachep->spinlock); 3150 spin_lock_irq(&cachep->spinlock);
2675 cachep->batchcount = batchcount; 3151 cachep->batchcount = batchcount;
2676 cachep->limit = limit; 3152 cachep->limit = limit;
2677 cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num; 3153 cachep->shared = shared;
2678 spin_unlock_irq(&cachep->spinlock); 3154 spin_unlock_irq(&cachep->spinlock);
2679 3155
2680 for (i = 0; i < NR_CPUS; i++) { 3156 for_each_online_cpu(i) {
2681 struct array_cache *ccold = new.new[i]; 3157 struct array_cache *ccold = new.new[i];
2682 if (!ccold) 3158 if (!ccold)
2683 continue; 3159 continue;
2684 spin_lock_irq(&cachep->spinlock); 3160 spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
2685 free_block(cachep, ac_entry(ccold), ccold->avail); 3161 free_block(cachep, ccold->entry, ccold->avail);
2686 spin_unlock_irq(&cachep->spinlock); 3162 spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
2687 kfree(ccold); 3163 kfree(ccold);
2688 } 3164 }
2689 new_shared = alloc_arraycache(-1, batchcount*shared, 0xbaadf00d);
2690 if (new_shared) {
2691 struct array_cache *old;
2692 3165
2693 spin_lock_irq(&cachep->spinlock); 3166 err = alloc_kmemlist(cachep);
2694 old = cachep->lists.shared; 3167 if (err) {
2695 cachep->lists.shared = new_shared; 3168 printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
2696 if (old) 3169 cachep->name, -err);
2697 free_block(cachep, ac_entry(old), old->avail); 3170 BUG();
2698 spin_unlock_irq(&cachep->spinlock);
2699 kfree(old);
2700 } 3171 }
2701
2702 return 0; 3172 return 0;
2703} 3173}
2704 3174
@@ -2756,11 +3226,11 @@ static void enable_cpucache(kmem_cache_t *cachep)
2756} 3226}
2757 3227
2758static void drain_array_locked(kmem_cache_t *cachep, 3228static void drain_array_locked(kmem_cache_t *cachep,
2759 struct array_cache *ac, int force) 3229 struct array_cache *ac, int force, int node)
2760{ 3230{
2761 int tofree; 3231 int tofree;
2762 3232
2763 check_spinlock_acquired(cachep); 3233 check_spinlock_acquired_node(cachep, node);
2764 if (ac->touched && !force) { 3234 if (ac->touched && !force) {
2765 ac->touched = 0; 3235 ac->touched = 0;
2766 } else if (ac->avail) { 3236 } else if (ac->avail) {
@@ -2768,9 +3238,9 @@ static void drain_array_locked(kmem_cache_t *cachep,
2768 if (tofree > ac->avail) { 3238 if (tofree > ac->avail) {
2769 tofree = (ac->avail+1)/2; 3239 tofree = (ac->avail+1)/2;
2770 } 3240 }
2771 free_block(cachep, ac_entry(ac), tofree); 3241 free_block(cachep, ac->entry, tofree);
2772 ac->avail -= tofree; 3242 ac->avail -= tofree;
2773 memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree], 3243 memmove(ac->entry, &(ac->entry[tofree]),
2774 sizeof(void*)*ac->avail); 3244 sizeof(void*)*ac->avail);
2775 } 3245 }
2776} 3246}
@@ -2789,6 +3259,7 @@ static void drain_array_locked(kmem_cache_t *cachep,
2789static void cache_reap(void *unused) 3259static void cache_reap(void *unused)
2790{ 3260{
2791 struct list_head *walk; 3261 struct list_head *walk;
3262 struct kmem_list3 *l3;
2792 3263
2793 if (down_trylock(&cache_chain_sem)) { 3264 if (down_trylock(&cache_chain_sem)) {
2794 /* Give up. Setup the next iteration. */ 3265 /* Give up. Setup the next iteration. */
@@ -2809,27 +3280,32 @@ static void cache_reap(void *unused)
2809 3280
2810 check_irq_on(); 3281 check_irq_on();
2811 3282
2812 spin_lock_irq(&searchp->spinlock); 3283 l3 = searchp->nodelists[numa_node_id()];
3284 if (l3->alien)
3285 drain_alien_cache(searchp, l3);
3286 spin_lock_irq(&l3->list_lock);
2813 3287
2814 drain_array_locked(searchp, ac_data(searchp), 0); 3288 drain_array_locked(searchp, ac_data(searchp), 0,
3289 numa_node_id());
2815 3290
2816 if(time_after(searchp->lists.next_reap, jiffies)) 3291 if (time_after(l3->next_reap, jiffies))
2817 goto next_unlock; 3292 goto next_unlock;
2818 3293
2819 searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3; 3294 l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
2820 3295
2821 if (searchp->lists.shared) 3296 if (l3->shared)
2822 drain_array_locked(searchp, searchp->lists.shared, 0); 3297 drain_array_locked(searchp, l3->shared, 0,
3298 numa_node_id());
2823 3299
2824 if (searchp->lists.free_touched) { 3300 if (l3->free_touched) {
2825 searchp->lists.free_touched = 0; 3301 l3->free_touched = 0;
2826 goto next_unlock; 3302 goto next_unlock;
2827 } 3303 }
2828 3304
2829 tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num); 3305 tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num);
2830 do { 3306 do {
2831 p = list3_data(searchp)->slabs_free.next; 3307 p = l3->slabs_free.next;
2832 if (p == &(list3_data(searchp)->slabs_free)) 3308 if (p == &(l3->slabs_free))
2833 break; 3309 break;
2834 3310
2835 slabp = list_entry(p, struct slab, list); 3311 slabp = list_entry(p, struct slab, list);
@@ -2842,13 +3318,13 @@ static void cache_reap(void *unused)
2842 * searchp cannot disappear, we hold 3318 * searchp cannot disappear, we hold
2843 * cache_chain_lock 3319 * cache_chain_lock
2844 */ 3320 */
2845 searchp->lists.free_objects -= searchp->num; 3321 l3->free_objects -= searchp->num;
2846 spin_unlock_irq(&searchp->spinlock); 3322 spin_unlock_irq(&l3->list_lock);
2847 slab_destroy(searchp, slabp); 3323 slab_destroy(searchp, slabp);
2848 spin_lock_irq(&searchp->spinlock); 3324 spin_lock_irq(&l3->list_lock);
2849 } while(--tofree > 0); 3325 } while(--tofree > 0);
2850next_unlock: 3326next_unlock:
2851 spin_unlock_irq(&searchp->spinlock); 3327 spin_unlock_irq(&l3->list_lock);
2852next: 3328next:
2853 cond_resched(); 3329 cond_resched();
2854 } 3330 }
@@ -2882,7 +3358,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
2882 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>"); 3358 seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
2883#if STATS 3359#if STATS
2884 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>" 3360 seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>"
2885 " <error> <maxfreeable> <freelimit> <nodeallocs>"); 3361 " <error> <maxfreeable> <nodeallocs> <remotefrees>");
2886 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>"); 3362 seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
2887#endif 3363#endif
2888 seq_putc(m, '\n'); 3364 seq_putc(m, '\n');
@@ -2917,39 +3393,53 @@ static int s_show(struct seq_file *m, void *p)
2917 unsigned long active_objs; 3393 unsigned long active_objs;
2918 unsigned long num_objs; 3394 unsigned long num_objs;
2919 unsigned long active_slabs = 0; 3395 unsigned long active_slabs = 0;
2920 unsigned long num_slabs; 3396 unsigned long num_slabs, free_objects = 0, shared_avail = 0;
2921 const char *name; 3397 const char *name;
2922 char *error = NULL; 3398 char *error = NULL;
3399 int node;
3400 struct kmem_list3 *l3;
2923 3401
2924 check_irq_on(); 3402 check_irq_on();
2925 spin_lock_irq(&cachep->spinlock); 3403 spin_lock_irq(&cachep->spinlock);
2926 active_objs = 0; 3404 active_objs = 0;
2927 num_slabs = 0; 3405 num_slabs = 0;
2928 list_for_each(q,&cachep->lists.slabs_full) { 3406 for_each_online_node(node) {
2929 slabp = list_entry(q, struct slab, list); 3407 l3 = cachep->nodelists[node];
2930 if (slabp->inuse != cachep->num && !error) 3408 if (!l3)
2931 error = "slabs_full accounting error"; 3409 continue;
2932 active_objs += cachep->num; 3410
2933 active_slabs++; 3411 spin_lock(&l3->list_lock);
2934 } 3412
2935 list_for_each(q,&cachep->lists.slabs_partial) { 3413 list_for_each(q,&l3->slabs_full) {
2936 slabp = list_entry(q, struct slab, list); 3414 slabp = list_entry(q, struct slab, list);
2937 if (slabp->inuse == cachep->num && !error) 3415 if (slabp->inuse != cachep->num && !error)
2938 error = "slabs_partial inuse accounting error"; 3416 error = "slabs_full accounting error";
2939 if (!slabp->inuse && !error) 3417 active_objs += cachep->num;
2940 error = "slabs_partial/inuse accounting error"; 3418 active_slabs++;
2941 active_objs += slabp->inuse; 3419 }
2942 active_slabs++; 3420 list_for_each(q,&l3->slabs_partial) {
2943 } 3421 slabp = list_entry(q, struct slab, list);
2944 list_for_each(q,&cachep->lists.slabs_free) { 3422 if (slabp->inuse == cachep->num && !error)
2945 slabp = list_entry(q, struct slab, list); 3423 error = "slabs_partial inuse accounting error";
2946 if (slabp->inuse && !error) 3424 if (!slabp->inuse && !error)
2947 error = "slabs_free/inuse accounting error"; 3425 error = "slabs_partial/inuse accounting error";
2948 num_slabs++; 3426 active_objs += slabp->inuse;
3427 active_slabs++;
3428 }
3429 list_for_each(q,&l3->slabs_free) {
3430 slabp = list_entry(q, struct slab, list);
3431 if (slabp->inuse && !error)
3432 error = "slabs_free/inuse accounting error";
3433 num_slabs++;
3434 }
3435 free_objects += l3->free_objects;
3436 shared_avail += l3->shared->avail;
3437
3438 spin_unlock(&l3->list_lock);
2949 } 3439 }
2950 num_slabs+=active_slabs; 3440 num_slabs+=active_slabs;
2951 num_objs = num_slabs*cachep->num; 3441 num_objs = num_slabs*cachep->num;
2952 if (num_objs - active_objs != cachep->lists.free_objects && !error) 3442 if (num_objs - active_objs != free_objects && !error)
2953 error = "free_objects accounting error"; 3443 error = "free_objects accounting error";
2954 3444
2955 name = cachep->name; 3445 name = cachep->name;
@@ -2961,9 +3451,9 @@ static int s_show(struct seq_file *m, void *p)
2961 cachep->num, (1<<cachep->gfporder)); 3451 cachep->num, (1<<cachep->gfporder));
2962 seq_printf(m, " : tunables %4u %4u %4u", 3452 seq_printf(m, " : tunables %4u %4u %4u",
2963 cachep->limit, cachep->batchcount, 3453 cachep->limit, cachep->batchcount,
2964 cachep->lists.shared->limit/cachep->batchcount); 3454 cachep->shared);
2965 seq_printf(m, " : slabdata %6lu %6lu %6u", 3455 seq_printf(m, " : slabdata %6lu %6lu %6lu",
2966 active_slabs, num_slabs, cachep->lists.shared->avail); 3456 active_slabs, num_slabs, shared_avail);
2967#if STATS 3457#if STATS
2968 { /* list3 stats */ 3458 { /* list3 stats */
2969 unsigned long high = cachep->high_mark; 3459 unsigned long high = cachep->high_mark;
@@ -2972,12 +3462,13 @@ static int s_show(struct seq_file *m, void *p)
2972 unsigned long reaped = cachep->reaped; 3462 unsigned long reaped = cachep->reaped;
2973 unsigned long errors = cachep->errors; 3463 unsigned long errors = cachep->errors;
2974 unsigned long max_freeable = cachep->max_freeable; 3464 unsigned long max_freeable = cachep->max_freeable;
2975 unsigned long free_limit = cachep->free_limit;
2976 unsigned long node_allocs = cachep->node_allocs; 3465 unsigned long node_allocs = cachep->node_allocs;
3466 unsigned long node_frees = cachep->node_frees;
2977 3467
2978 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu", 3468 seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
2979 allocs, high, grown, reaped, errors, 3469 %4lu %4lu %4lu %4lu",
2980 max_freeable, free_limit, node_allocs); 3470 allocs, high, grown, reaped, errors,
3471 max_freeable, node_allocs, node_frees);
2981 } 3472 }
2982 /* cpu stats */ 3473 /* cpu stats */
2983 { 3474 {
@@ -3056,9 +3547,10 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
3056 batchcount < 1 || 3547 batchcount < 1 ||
3057 batchcount > limit || 3548 batchcount > limit ||
3058 shared < 0) { 3549 shared < 0) {
3059 res = -EINVAL; 3550 res = 0;
3060 } else { 3551 } else {
3061 res = do_tune_cpucache(cachep, limit, batchcount, shared); 3552 res = do_tune_cpucache(cachep, limit,
3553 batchcount, shared);
3062 } 3554 }
3063 break; 3555 break;
3064 } 3556 }