path: root/mm/slab.c
Diffstat (limited to 'mm/slab.c')
-rw-r--r--  mm/slab.c  411
1 file changed, 307 insertions(+), 104 deletions(-)
diff --git a/mm/slab.c b/mm/slab.c
index 3c4a7e34eddc..c6100628a6ef 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -103,12 +103,14 @@
 #include	<linux/module.h>
 #include	<linux/rcupdate.h>
 #include	<linux/string.h>
+#include	<linux/uaccess.h>
 #include	<linux/nodemask.h>
 #include	<linux/mempolicy.h>
 #include	<linux/mutex.h>
+#include	<linux/fault-inject.h>
 #include	<linux/rtmutex.h>
+#include	<linux/reciprocal_div.h>
 
-#include	<asm/uaccess.h>
 #include	<asm/cacheflush.h>
 #include	<asm/tlbflush.h>
 #include	<asm/page.h>
@@ -313,7 +315,7 @@ static int drain_freelist(struct kmem_cache *cache,
 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
 			int node);
 static int enable_cpucache(struct kmem_cache *cachep);
-static void cache_reap(void *unused);
+static void cache_reap(struct work_struct *unused);
 
 /*
  * This function must be completely optimized away if a constant is passed to
@@ -385,6 +387,7 @@ struct kmem_cache {
 	unsigned int shared;
 
 	unsigned int buffer_size;
+	u32 reciprocal_buffer_size;
 /* 3) touched by every alloc & free from the backend */
 	struct kmem_list3 *nodelists[MAX_NUMNODES];
 
@@ -626,10 +629,17 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
 	return slab->s_mem + cache->buffer_size * idx;
 }
 
-static inline unsigned int obj_to_index(struct kmem_cache *cache,
-					struct slab *slab, void *obj)
+/*
+ * We want to avoid an expensive divide : (offset / cache->buffer_size)
+ * Using the fact that buffer_size is a constant for a particular cache,
+ * we can replace (offset / cache->buffer_size) by
+ * reciprocal_divide(offset, cache->reciprocal_buffer_size)
+ */
+static inline unsigned int obj_to_index(const struct kmem_cache *cache,
+					const struct slab *slab, void *obj)
 {
-	return (unsigned)(obj - slab->s_mem) / cache->buffer_size;
+	u32 offset = (obj - slab->s_mem);
+	return reciprocal_divide(offset, cache->reciprocal_buffer_size);
 }
 
 /*
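
[Editorial sketch] The comment added above trades the per-object integer division in obj_to_index() for a multiply-and-shift. Below is a minimal userspace sketch of the idea; the helper names and rounding are illustrative approximations of what lib/reciprocal_div provides, not copies of the kernel code.

#include <stdint.h>
#include <stdio.h>

/* Precompute roughly 2^32 / size, rounded up so that offsets which are
 * exact multiples of size divide back to the correct index. */
static uint32_t sketch_reciprocal_value(uint32_t size)
{
	return (uint32_t)(((1ULL << 32) + size - 1) / size);
}

/* offset / size becomes a 64-bit multiply followed by a shift. */
static uint32_t sketch_reciprocal_divide(uint32_t offset, uint32_t rcp)
{
	return (uint32_t)(((uint64_t)offset * rcp) >> 32);
}

int main(void)
{
	uint32_t size = 192;	/* stands in for a cache's buffer_size */
	uint32_t rcp = sketch_reciprocal_value(size);
	uint32_t idx;

	/* Object offsets in a slab are always idx * buffer_size, so the
	 * rounded reciprocal recovers idx exactly for every object. */
	for (idx = 0; idx < 64; idx++)
		if (sketch_reciprocal_divide(idx * size, rcp) != idx)
			printf("mismatch at idx=%u\n", idx);
	return 0;
}

Because every offset passed to obj_to_index() is an exact multiple of buffer_size and much smaller than 2^32, the rounding error of the reciprocal stays below one object, which is why the divide can be replaced safely.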
@@ -730,7 +740,10 @@ static inline void init_lock_keys(void)
 }
 #endif
 
-/* Guard access to the cache-chain. */
+/*
+ * 1. Guard access to the cache-chain.
+ * 2. Protect sanity of cpu_online_map against cpu hotplug events
+ */
 static DEFINE_MUTEX(cache_chain_mutex);
 static struct list_head cache_chain;
 
@@ -753,7 +766,7 @@ int slab_is_available(void)
 	return g_cpucache_up == FULL;
 }
 
-static DEFINE_PER_CPU(struct work_struct, reap_work);
+static DEFINE_PER_CPU(struct delayed_work, reap_work);
 
 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
 {
@@ -866,6 +879,22 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
 	dump_stack();
 }
 
+/*
+ * By default on NUMA we use alien caches to stage the freeing of
+ * objects allocated from other nodes. This causes massive memory
+ * inefficiencies when using fake NUMA setup to split memory into a
+ * large number of small nodes, so it can be disabled on the command
+ * line
+ */
+
+static int use_alien_caches __read_mostly = 1;
+static int __init noaliencache_setup(char *s)
+{
+	use_alien_caches = 0;
+	return 1;
+}
+__setup("noaliencache", noaliencache_setup);
+
 #ifdef CONFIG_NUMA
 /*
  * Special reaping functions for NUMA systems called from cache_reap().
@@ -916,17 +945,18 @@ static void next_reap_node(void)
  */
 static void __devinit start_cpu_timer(int cpu)
 {
-	struct work_struct *reap_work = &per_cpu(reap_work, cpu);
+	struct delayed_work *reap_work = &per_cpu(reap_work, cpu);
 
 	/*
 	 * When this gets called from do_initcalls via cpucache_init(),
 	 * init_workqueues() has already run, so keventd will be setup
 	 * at that time.
 	 */
-	if (keventd_up() && reap_work->func == NULL) {
+	if (keventd_up() && reap_work->work.func == NULL) {
 		init_reap_node(cpu);
-		INIT_WORK(reap_work, cache_reap, NULL);
-		schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
+		INIT_DELAYED_WORK(reap_work, cache_reap);
+		schedule_delayed_work_on(cpu, reap_work,
+					__round_jiffies_relative(HZ, cpu));
 	}
 }
 
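
[Editorial sketch] The timer arming above switches from HZ + 3 * cpu to __round_jiffies_relative(HZ, cpu), so each per-cpu reaper expires on a whole-second boundary (skewed by the cpu number) and can batch its wakeup with other rounded timers. A rough userspace model of that rounding follows; FAKE_HZ and the helper name are invented, and the real kernel helper applies the skew before rounding and rounds to the nearest second rather than up. This is only the shape of the idea.

#include <stdio.h>

#define FAKE_HZ 250UL	/* assumed ticks per second, illustration only */

/* Turn a relative timeout into one that expires on the next whole-second
 * boundary, then skew it per cpu so the per-cpu timers do not all fire
 * in the same tick. */
static unsigned long sketch_round_jiffies_relative(unsigned long delta,
						   unsigned long now,
						   unsigned int cpu)
{
	unsigned long expires = now + delta;
	unsigned long rounded = expires + FAKE_HZ - (expires % FAKE_HZ);

	return rounded - now + cpu;	/* hand back a relative timeout */
}

int main(void)
{
	unsigned long now = 100123;	/* pretend current jiffies */

	printf("cpu0: %lu ticks\n", sketch_round_jiffies_relative(FAKE_HZ, now, 0));
	printf("cpu1: %lu ticks\n", sketch_round_jiffies_relative(FAKE_HZ, now, 1));
	return 0;
}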
@@ -996,7 +1026,7 @@ static inline void *alternate_node_alloc(struct kmem_cache *cachep,
 	return NULL;
 }
 
-static inline void *__cache_alloc_node(struct kmem_cache *cachep,
+static inline void *____cache_alloc_node(struct kmem_cache *cachep,
 		 gfp_t flags, int nodeid)
 {
 	return NULL;
@@ -1004,7 +1034,7 @@ static inline void *__cache_alloc_node(struct kmem_cache *cachep,
 
 #else	/* CONFIG_NUMA */
 
-static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
+static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
 static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 
 static struct array_cache **alloc_alien_cache(int node, int limit)
@@ -1114,7 +1144,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 	 * Make sure we are not freeing a object from another node to the array
 	 * cache on this cpu.
 	 */
-	if (likely(slabp->nodeid == node))
+	if (likely(slabp->nodeid == node) || unlikely(!use_alien_caches))
 		return 0;
 
 	l3 = cachep->nodelists[node];
@@ -1192,7 +1222,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
 			struct array_cache *shared;
-			struct array_cache **alien;
+			struct array_cache **alien = NULL;
 
 			nc = alloc_arraycache(node, cachep->limit,
 						cachep->batchcount);
@@ -1204,9 +1234,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 			if (!shared)
 				goto bad;
 
-			alien = alloc_alien_cache(node, cachep->limit);
-			if (!alien)
-				goto bad;
+			if (use_alien_caches) {
+				alien = alloc_alien_cache(node, cachep->limit);
+				if (!alien)
+					goto bad;
+			}
 			cachep->array[cpu] = nc;
 			l3 = cachep->nodelists[node];
 			BUG_ON(!l3);
@@ -1230,12 +1262,18 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 			kfree(shared);
 			free_alien_cache(alien);
 		}
-		mutex_unlock(&cache_chain_mutex);
 		break;
 	case CPU_ONLINE:
+		mutex_unlock(&cache_chain_mutex);
 		start_cpu_timer(cpu);
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DOWN_PREPARE:
+		mutex_lock(&cache_chain_mutex);
+		break;
+	case CPU_DOWN_FAILED:
+		mutex_unlock(&cache_chain_mutex);
+		break;
 	case CPU_DEAD:
 		/*
 		 * Even if all the cpus of a node are down, we don't free the
@@ -1246,8 +1284,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 		 * gets destroyed at kmem_cache_destroy().
 		 */
 		/* fall thru */
+#endif
 	case CPU_UP_CANCELED:
-		mutex_lock(&cache_chain_mutex);
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
 			struct array_cache *shared;
@@ -1308,11 +1346,9 @@ free_array_cache:
 		}
 		mutex_unlock(&cache_chain_mutex);
 		break;
-#endif
 	}
 	return NOTIFY_OK;
 bad:
-	mutex_unlock(&cache_chain_mutex);
 	return NOTIFY_BAD;
 }
 
@@ -1400,6 +1436,8 @@ void __init kmem_cache_init(void)
 
 	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
 					cache_line_size());
+	cache_cache.reciprocal_buffer_size =
+		reciprocal_value(cache_cache.buffer_size);
 
 	for (order = 0; order < MAX_ORDER; order++) {
 		cache_estimate(order, cache_cache.buffer_size,
@@ -1580,12 +1618,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	flags |= __GFP_COMP;
 #endif
 
-	/*
-	 * Under NUMA we want memory on the indicated node. We will handle
-	 * the needed fallback ourselves since we want to serve from our
-	 * per node object lists first for other nodes.
-	 */
-	flags |= cachep->gfpflags | GFP_THISNODE;
+	flags |= cachep->gfpflags;
 
 	page = alloc_pages_node(nodeid, flags, cachep->gfporder);
 	if (!page)
@@ -2098,15 +2131,12 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	}
 
 	/*
-	 * Prevent CPUs from coming and going.
-	 * lock_cpu_hotplug() nests outside cache_chain_mutex
+	 * We use cache_chain_mutex to ensure a consistent view of
+	 * cpu_online_map as well. Please see cpuup_callback
 	 */
-	lock_cpu_hotplug();
-
 	mutex_lock(&cache_chain_mutex);
 
 	list_for_each_entry(pc, &cache_chain, next) {
-		mm_segment_t old_fs = get_fs();
 		char tmp;
 		int res;
 
@@ -2115,9 +2145,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		 * destroy its slab cache and no-one else reuses the vmalloc
 		 * area of the module. Print a warning.
 		 */
-		set_fs(KERNEL_DS);
-		res = __get_user(tmp, pc->name);
-		set_fs(old_fs);
+		res = probe_kernel_address(pc->name, tmp);
 		if (res) {
 			printk("SLAB: cache with size %d has lost its name\n",
 			       pc->buffer_size);
@@ -2197,25 +2225,24 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	if (flags & SLAB_RED_ZONE || flags & SLAB_STORE_USER)
 		ralign = BYTES_PER_WORD;
 
-	/* 2) arch mandated alignment: disables debug if necessary */
+	/* 2) arch mandated alignment */
 	if (ralign < ARCH_SLAB_MINALIGN) {
 		ralign = ARCH_SLAB_MINALIGN;
-		if (ralign > BYTES_PER_WORD)
-			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	}
-	/* 3) caller mandated alignment: disables debug if necessary */
+	/* 3) caller mandated alignment */
 	if (ralign < align) {
 		ralign = align;
-		if (ralign > BYTES_PER_WORD)
-			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	}
+	/* disable debug if necessary */
+	if (ralign > BYTES_PER_WORD)
+		flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	/*
 	 * 4) Store it.
 	 */
 	align = ralign;
 
 	/* Get cache's description obj. */
-	cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL);
+	cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
 	if (!cachep)
 		goto oops;
 
@@ -2297,6 +2324,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	if (flags & SLAB_CACHE_DMA)
 		cachep->gfpflags |= GFP_DMA;
 	cachep->buffer_size = size;
+	cachep->reciprocal_buffer_size = reciprocal_value(size);
 
 	if (flags & CFLGS_OFF_SLAB) {
 		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
@@ -2326,7 +2354,6 @@ oops:
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 		      name);
 	mutex_unlock(&cache_chain_mutex);
-	unlock_cpu_hotplug();
 	return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
@@ -2444,6 +2471,7 @@ out:
 	return nr_freed;
 }
 
+/* Called with cache_chain_mutex held to protect against cpu hotplug */
 static int __cache_shrink(struct kmem_cache *cachep)
 {
 	int ret = 0, i = 0;
@@ -2474,9 +2502,13 @@ static int __cache_shrink(struct kmem_cache *cachep)
  */
 int kmem_cache_shrink(struct kmem_cache *cachep)
 {
+	int ret;
 	BUG_ON(!cachep || in_interrupt());
 
-	return __cache_shrink(cachep);
+	mutex_lock(&cache_chain_mutex);
+	ret = __cache_shrink(cachep);
+	mutex_unlock(&cache_chain_mutex);
+	return ret;
 }
 EXPORT_SYMBOL(kmem_cache_shrink);
 
@@ -2500,23 +2532,16 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
 {
 	BUG_ON(!cachep || in_interrupt());
 
-	/* Don't let CPUs to come and go */
-	lock_cpu_hotplug();
-
 	/* Find the cache in the chain of caches. */
 	mutex_lock(&cache_chain_mutex);
 	/*
 	 * the chain is never empty, cache_cache is never destroyed
 	 */
 	list_del(&cachep->next);
-	mutex_unlock(&cache_chain_mutex);
-
 	if (__cache_shrink(cachep)) {
 		slab_error(cachep, "Can't free all objects");
-		mutex_lock(&cache_chain_mutex);
 		list_add(&cachep->next, &cache_chain);
 		mutex_unlock(&cache_chain_mutex);
-		unlock_cpu_hotplug();
 		return;
 	}
 
@@ -2524,7 +2549,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
 		synchronize_rcu();
 
 	__kmem_cache_destroy(cachep);
-	unlock_cpu_hotplug();
+	mutex_unlock(&cache_chain_mutex);
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
 
@@ -2548,7 +2573,7 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
 	if (OFF_SLAB(cachep)) {
 		/* Slab management obj is off-slab. */
 		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
-					      local_flags, nodeid);
+					      local_flags & ~GFP_THISNODE, nodeid);
 		if (!slabp)
 			return NULL;
 	} else {
@@ -2618,7 +2643,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
 
 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
 {
-	if (flags & SLAB_DMA)
+	if (flags & GFP_DMA)
 		BUG_ON(!(cachep->gfpflags & GFP_DMA));
 	else
 		BUG_ON(cachep->gfpflags & GFP_DMA);
@@ -2689,10 +2714,10 @@ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
  * Grow (by 1) the number of slabs within a cache. This is called by
  * kmem_cache_alloc() when there are no active objs left in a cache.
  */
-static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static int cache_grow(struct kmem_cache *cachep,
+		gfp_t flags, int nodeid, void *objp)
 {
 	struct slab *slabp;
-	void *objp;
 	size_t offset;
 	gfp_t local_flags;
 	unsigned long ctor_flags;
@@ -2702,12 +2727,12 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	 * Be lazy and only check for valid flags here, keeping it out of the
 	 * critical path in kmem_cache_alloc().
 	 */
-	BUG_ON(flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW));
-	if (flags & SLAB_NO_GROW)
+	BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW));
+	if (flags & __GFP_NO_GROW)
 		return 0;
 
 	ctor_flags = SLAB_CTOR_CONSTRUCTOR;
-	local_flags = (flags & SLAB_LEVEL_MASK);
+	local_flags = (flags & GFP_LEVEL_MASK);
 	if (!(local_flags & __GFP_WAIT))
 		/*
 		 * Not allowed to sleep. Need to tell a constructor about
@@ -2744,12 +2769,14 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	 * Get mem for the objs. Attempt to allocate a physical page from
 	 * 'nodeid'.
 	 */
-	objp = kmem_getpages(cachep, flags, nodeid);
+	if (!objp)
+		objp = kmem_getpages(cachep, flags, nodeid);
 	if (!objp)
 		goto failed;
 
 	/* Get slab management. */
-	slabp = alloc_slabmgmt(cachep, objp, offset, local_flags, nodeid);
+	slabp = alloc_slabmgmt(cachep, objp, offset,
+			local_flags & ~GFP_THISNODE, nodeid);
 	if (!slabp)
 		goto opps1;
 
@@ -2987,7 +3014,7 @@ alloc_done:
 
 	if (unlikely(!ac->avail)) {
 		int x;
-		x = cache_grow(cachep, flags, node);
+		x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);
 
 		/* cache_grow can reenable interrupts, then ac could change. */
 		ac = cpu_cache_get(cachep);
@@ -3063,18 +3090,101 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 
 		cachep->ctor(objp, cachep, ctor_flags);
 	}
+#if ARCH_SLAB_MINALIGN
+	if ((u32)objp & (ARCH_SLAB_MINALIGN-1)) {
+		printk(KERN_ERR "0x%p: not aligned to ARCH_SLAB_MINALIGN=%d\n",
+		       objp, ARCH_SLAB_MINALIGN);
+	}
+#endif
 	return objp;
 }
 #else
 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
 #endif
 
+#ifdef CONFIG_FAILSLAB
+
+static struct failslab_attr {
+
+	struct fault_attr attr;
+
+	u32 ignore_gfp_wait;
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+	struct dentry *ignore_gfp_wait_file;
+#endif
+
+} failslab = {
+	.attr = FAULT_ATTR_INITIALIZER,
+	.ignore_gfp_wait = 1,
+};
+
+static int __init setup_failslab(char *str)
+{
+	return setup_fault_attr(&failslab.attr, str);
+}
+__setup("failslab=", setup_failslab);
+
+static int should_failslab(struct kmem_cache *cachep, gfp_t flags)
+{
+	if (cachep == &cache_cache)
+		return 0;
+	if (flags & __GFP_NOFAIL)
+		return 0;
+	if (failslab.ignore_gfp_wait && (flags & __GFP_WAIT))
+		return 0;
+
+	return should_fail(&failslab.attr, obj_size(cachep));
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init failslab_debugfs(void)
+{
+	mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+	struct dentry *dir;
+	int err;
+
+	err = init_fault_attr_dentries(&failslab.attr, "failslab");
+	if (err)
+		return err;
+	dir = failslab.attr.dentries.dir;
+
+	failslab.ignore_gfp_wait_file =
+		debugfs_create_bool("ignore-gfp-wait", mode, dir,
+				      &failslab.ignore_gfp_wait);
+
+	if (!failslab.ignore_gfp_wait_file) {
+		err = -ENOMEM;
+		debugfs_remove(failslab.ignore_gfp_wait_file);
+		cleanup_fault_attr_dentries(&failslab.attr);
+	}
+
+	return err;
+}
+
+late_initcall(failslab_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else /* CONFIG_FAILSLAB */
+
+static inline int should_failslab(struct kmem_cache *cachep, gfp_t flags)
+{
+	return 0;
+}
+
+#endif /* CONFIG_FAILSLAB */
+
 static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 {
 	void *objp;
 	struct array_cache *ac;
 
 	check_irq_off();
+
+	if (should_failslab(cachep, flags))
+		return NULL;
+
 	ac = cpu_cache_get(cachep);
 	if (likely(ac->avail)) {
 		STATS_INC_ALLOCHIT(cachep);
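
[Editorial sketch] For readers who have not met the fault-injection framework the failslab block above hooks into: should_fail() consults a set of knobs (probability, interval, remaining failure budget, ...) and decides whether this particular call should be forced to fail. Below is a simplified, self-contained model of that decision; the struct and function names are invented and the real fault_attr has additional filters that are left out.

#include <stdbool.h>
#include <stdlib.h>

/* Toy fault-injection attribute: fail at most every 'interval'-th call,
 * with 'probability' percent chance, and no more than 'times' times. */
struct toy_fault_attr {
	unsigned long probability;	/* percent, 0..100 */
	unsigned long interval;		/* only every n-th call may fail */
	long times;			/* remaining failures, -1 = unlimited */
	unsigned long count;		/* calls seen so far */
};

static bool toy_should_fail(struct toy_fault_attr *attr)
{
	attr->count++;

	if (attr->times == 0)
		return false;
	if (attr->interval > 1 && attr->count % attr->interval)
		return false;
	if ((unsigned long)(rand() % 100) >= attr->probability)
		return false;

	if (attr->times > 0)
		attr->times--;
	return true;
}

Because ____cache_alloc() consults should_failslab() before touching the per-cpu array, a forced failure exercises every caller's out-of-memory path without the machine actually being short on memory.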
@@ -3105,10 +3215,10 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
 	objp = ____cache_alloc(cachep, flags);
 	/*
 	 * We may just have run out of memory on the local node.
-	 * __cache_alloc_node() knows how to locate memory on other nodes
+	 * ____cache_alloc_node() knows how to locate memory on other nodes
 	 */
 	if (NUMA_BUILD && !objp)
-		objp = __cache_alloc_node(cachep, flags, numa_node_id());
+		objp = ____cache_alloc_node(cachep, flags, numa_node_id());
 	local_irq_restore(save_flags);
 	objp = cache_alloc_debugcheck_after(cachep, flags, objp,
 					    caller);
@@ -3135,15 +3245,17 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
 	else if (current->mempolicy)
 		nid_alloc = slab_node(current->mempolicy);
 	if (nid_alloc != nid_here)
-		return __cache_alloc_node(cachep, flags, nid_alloc);
+		return ____cache_alloc_node(cachep, flags, nid_alloc);
 	return NULL;
 }
 
 /*
  * Fallback function if there was no memory available and no objects on a
- * certain node and we are allowed to fall back. We mimick the behavior of
- * the page allocator. We fall back according to a zonelist determined by
- * the policy layer while obeying cpuset constraints.
+ * certain node and fall back is permitted. First we scan all the
+ * available nodelists for available objects. If that fails then we
+ * perform an allocation without specifying a node. This allows the page
+ * allocator to do its reclaim / fallback magic. We then insert the
+ * slab into the proper nodelist and then allocate from it.
  */
 void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 {
@@ -3151,15 +3263,57 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 			->node_zonelists[gfp_zone(flags)];
 	struct zone **z;
 	void *obj = NULL;
+	int nid;
+	gfp_t local_flags = (flags & GFP_LEVEL_MASK);
 
+retry:
+	/*
+	 * Look through allowed nodes for objects available
+	 * from existing per node queues.
+	 */
 	for (z = zonelist->zones; *z && !obj; z++) {
-		int nid = zone_to_nid(*z);
+		nid = zone_to_nid(*z);
 
-		if (zone_idx(*z) <= ZONE_NORMAL &&
-				cpuset_zone_allowed(*z, flags) &&
-				cache->nodelists[nid])
-			obj = __cache_alloc_node(cache,
-					flags | __GFP_THISNODE, nid);
+		if (cpuset_zone_allowed_hardwall(*z, flags) &&
+			cache->nodelists[nid] &&
+			cache->nodelists[nid]->free_objects)
+				obj = ____cache_alloc_node(cache,
+					flags | GFP_THISNODE, nid);
+	}
+
+	if (!obj && !(flags & __GFP_NO_GROW)) {
+		/*
+		 * This allocation will be performed within the constraints
+		 * of the current cpuset / memory policy requirements.
+		 * We may trigger various forms of reclaim on the allowed
+		 * set and go into memory reserves if necessary.
+		 */
+		if (local_flags & __GFP_WAIT)
+			local_irq_enable();
+		kmem_flagcheck(cache, flags);
+		obj = kmem_getpages(cache, flags, -1);
+		if (local_flags & __GFP_WAIT)
+			local_irq_disable();
+		if (obj) {
+			/*
+			 * Insert into the appropriate per node queues
+			 */
+			nid = page_to_nid(virt_to_page(obj));
+			if (cache_grow(cache, flags, nid, obj)) {
+				obj = ____cache_alloc_node(cache,
+					flags | GFP_THISNODE, nid);
+				if (!obj)
+					/*
+					 * Another processor may allocate the
+					 * objects in the slab since we are
+					 * not holding any locks.
+					 */
+					goto retry;
+			} else {
+				/* cache_grow already freed obj */
+				obj = NULL;
+			}
+		}
 	}
 	return obj;
 }
@@ -3167,7 +3321,7 @@ void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 /*
  * A interface to enable slab creation on nodeid
  */
-static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
 				int nodeid)
 {
 	struct list_head *entry;
@@ -3216,7 +3370,7 @@ retry:
 
 must_grow:
 	spin_unlock(&l3->list_lock);
-	x = cache_grow(cachep, flags, nodeid);
+	x = cache_grow(cachep, flags | GFP_THISNODE, nodeid, NULL);
 	if (x)
 		goto retry;
 
@@ -3399,7 +3553,7 @@ EXPORT_SYMBOL(kmem_cache_zalloc);
  *
  * Currently only used for dentry validation.
  */
-int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
+int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr)
 {
 	unsigned long addr = (unsigned long)ptr;
 	unsigned long min_addr = PAGE_OFFSET;
@@ -3433,36 +3587,61 @@ out:
  * @cachep: The cache to allocate from.
  * @flags: See kmalloc().
  * @nodeid: node number of the target node.
+ * @caller: return address of caller, used for debug information
+ *
+ * Identical to kmem_cache_alloc but it will allocate memory on the given
+ * node, which can improve the performance for cpu bound structures.
  *
- * Identical to kmem_cache_alloc, except that this function is slow
- * and can sleep. And it will allocate memory on the given node, which
- * can improve the performance for cpu bound structures.
- * New and improved: it will now make sure that the object gets
- * put on the correct node list so that there is no false sharing.
+ * Fallback to other node is possible if __GFP_THISNODE is not set.
  */
-void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static __always_inline void *
+__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+		int nodeid, void *caller)
 {
 	unsigned long save_flags;
-	void *ptr;
+	void *ptr = NULL;
 
 	cache_alloc_debugcheck_before(cachep, flags);
 	local_irq_save(save_flags);
 
-	if (nodeid == -1 || nodeid == numa_node_id() ||
-			!cachep->nodelists[nodeid])
-		ptr = ____cache_alloc(cachep, flags);
-	else
-		ptr = __cache_alloc_node(cachep, flags, nodeid);
-	local_irq_restore(save_flags);
+	if (unlikely(nodeid == -1))
+		nodeid = numa_node_id();
 
-	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr,
-					   __builtin_return_address(0));
+	if (likely(cachep->nodelists[nodeid])) {
+		if (nodeid == numa_node_id()) {
+			/*
+			 * Use the locally cached objects if possible.
+			 * However ____cache_alloc does not allow fallback
+			 * to other nodes. It may fail while we still have
+			 * objects on other nodes available.
+			 */
+			ptr = ____cache_alloc(cachep, flags);
+		}
+		if (!ptr) {
+			/* ___cache_alloc_node can fall back to other nodes */
+			ptr = ____cache_alloc_node(cachep, flags, nodeid);
+		}
+	} else {
+		/* Node not bootstrapped yet */
+		if (!(flags & __GFP_THISNODE))
+			ptr = fallback_alloc(cachep, flags);
+	}
+
+	local_irq_restore(save_flags);
+	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
 
 	return ptr;
 }
+
+void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+{
+	return __cache_alloc_node(cachep, flags, nodeid,
+			__builtin_return_address(0));
+}
 EXPORT_SYMBOL(kmem_cache_alloc_node);
 
-void *__kmalloc_node(size_t size, gfp_t flags, int node)
+static __always_inline void *
+__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
 {
 	struct kmem_cache *cachep;
 
@@ -3471,8 +3650,29 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 		return NULL;
 	return kmem_cache_alloc_node(cachep, flags, node);
 }
+
+#ifdef CONFIG_DEBUG_SLAB
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	return __do_kmalloc_node(size, flags, node,
+			__builtin_return_address(0));
+}
 EXPORT_SYMBOL(__kmalloc_node);
-#endif
+
+void *__kmalloc_node_track_caller(size_t size, gfp_t flags,
+		int node, void *caller)
+{
+	return __do_kmalloc_node(size, flags, node, caller);
+}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
+#else
+void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	return __do_kmalloc_node(size, flags, node, NULL);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+#endif /* CONFIG_DEBUG_SLAB */
+#endif /* CONFIG_NUMA */
 
 /**
  * __do_kmalloc - allocate memory
@@ -3583,13 +3783,15 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 	int node;
 	struct kmem_list3 *l3;
 	struct array_cache *new_shared;
-	struct array_cache **new_alien;
+	struct array_cache **new_alien = NULL;
 
 	for_each_online_node(node) {
 
-		new_alien = alloc_alien_cache(node, cachep->limit);
-		if (!new_alien)
-			goto fail;
+		if (use_alien_caches) {
+			new_alien = alloc_alien_cache(node, cachep->limit);
+			if (!new_alien)
+				goto fail;
+		}
 
 		new_shared = alloc_arraycache(node,
 				cachep->shared*cachep->batchcount,
@@ -3815,7 +4017,7 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
  * If we cannot acquire the cache chain mutex then just give up - we'll try
  * again on the next iteration.
  */
-static void cache_reap(void *unused)
+static void cache_reap(struct work_struct *unused)
 {
 	struct kmem_cache *searchp;
 	struct kmem_list3 *l3;
@@ -3824,7 +4026,7 @@ static void cache_reap(void *unused)
 	if (!mutex_trylock(&cache_chain_mutex)) {
 		/* Give up. Setup the next iteration. */
 		schedule_delayed_work(&__get_cpu_var(reap_work),
-				      REAPTIMEOUT_CPUC);
+				      round_jiffies_relative(REAPTIMEOUT_CPUC));
 		return;
 	}
 
@@ -3870,7 +4072,8 @@ next:
 	next_reap_node();
 	refresh_cpu_vm_stats(smp_processor_id());
 	/* Set up the next iteration */
-	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
+	schedule_delayed_work(&__get_cpu_var(reap_work),
+		round_jiffies_relative(REAPTIMEOUT_CPUC));
 }
 
 #ifdef CONFIG_PROC_FS
@@ -4038,7 +4241,7 @@ static int s_show(struct seq_file *m, void *p)
  * + further values on SMP and with statistics enabled
  */
 
-struct seq_operations slabinfo_op = {
+const struct seq_operations slabinfo_op = {
 	.start = s_start,
 	.next = s_next,
 	.stop = s_stop,
@@ -4236,7 +4439,7 @@ static int leaks_show(struct seq_file *m, void *p)
 	return 0;
 }
 
-struct seq_operations slabstats_op = {
+const struct seq_operations slabstats_op = {
 	.start = leaks_start,
 	.next = s_next,
 	.stop = s_stop,