author		Lee Schermerhorn <lee.schermerhorn@hp.com>	2010-05-26 17:45:03 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-05-27 12:12:57 -0400
commit		7d6e6d09de82cf6cff7fecdba55198b9f47b381c (patch)
tree		57b26da3bea0af63dd0b65cdcdde33fe5670a35f
parent		fd1197f1131a1f1d8bc192f9cfbbe17e305f17f3 (diff)
numa: slab: use numa_mem_id() for slab local memory node
Example usage of generic "numa_mem_id()":

The mainline slab code, since ~2.6.19, does not handle memoryless nodes
well.  Specifically, the "fast path" -- ____cache_alloc() -- will never
succeed as slab doesn't cache offnode objects on the per cpu queues, and
for memoryless nodes, all memory will be "off node" relative to
numa_node_id().  This adds significant overhead to all kmem cache
allocations, incurring a significant regression relative to earlier
kernels [from before slab.c was reorganized].

This patch uses the generic topology function "numa_mem_id()" to return
the "effective local memory node" for the calling context.  This is the
first node in the local node's generic fallback zonelist -- the same node
that "local" mempolicy-based allocations would use.  This lets slab cache
these "local" allocations and avoid fallback/refill on every allocation.

N.B.:  Slab will need to handle node and memory hotplug events that could
change the value returned by numa_mem_id() for any given node, if recent
changes to address memory hotplug don't already address this.  E.g., flush
all per cpu slab queues before rebuilding the zonelists while the
"machine" is held in the stopped state.

Performance impact on "hackbench 400 process 200":

2.6.34-rc3-mmotm-100405-1609             no-patch    this-patch
ia64 no memoryless nodes [avg of 10]:      11.713        11.637   ~0.65 diff
ia64 cpus all on memless nodes [10]:      228.259        26.484   ~8.6x speedup
x86_64 [8x6 AMD] [avg of 40]:               2.883         2.845

The slowdown of the patched kernel from ~12 sec to ~28 seconds when
configured with memoryless nodes is the result of all cpus allocating from
a single node's mm pagepool.  The cache lines of the single node are
distributed/interleaved over the memory of the real physical nodes, but
the zone lock, list heads, ... of the single node with memory still each
live in a single cache line that is accessed from all processors.

Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Eric Whitney <eric.whitney@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
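[Editor's note: a minimal, hypothetical sketch -- not part of this patch --
contrasting numa_node_id() with numa_mem_id() when picking the node a
per-cpu object cache should treat as "local".  numa_node_id(),
numa_mem_id() and kmem_cache_alloc_node() are real kernel interfaces;
pick_cache_home_node() and alloc_obj_near_cpu() are made-up names used
only for illustration.]

#include <linux/topology.h>	/* numa_node_id(), numa_mem_id() */
#include <linux/slab.h>		/* kmem_cache_alloc_node() */

static int pick_cache_home_node(void)
{
	/*
	 * numa_node_id() is the node this cpu sits on.  If that node is
	 * memoryless, every page allocated "locally" actually comes from
	 * some other node and never matches this id on the fast path.
	 *
	 * numa_mem_id() is the first node with memory in this node's
	 * fallback zonelist -- the node "local" page allocations really
	 * come from -- so per-cpu caches keyed on it can be hit.
	 */
	return numa_mem_id();
}

static void *alloc_obj_near_cpu(struct kmem_cache *cachep, gfp_t flags)
{
	/* allocate from the effective local memory node */
	return kmem_cache_alloc_node(cachep, flags, pick_cache_home_node());
}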
 mm/slab.c | 43
 1 file changed, 22 insertions(+), 21 deletions(-)
diff --git a/mm/slab.c b/mm/slab.c
index 6437d89a8401..e49f8f46f46d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -821,7 +821,7 @@ static void init_reap_node(int cpu)
 {
 	int node;
 
-	node = next_node(cpu_to_node(cpu), node_online_map);
+	node = next_node(cpu_to_mem(cpu), node_online_map);
 	if (node == MAX_NUMNODES)
 		node = first_node(node_online_map);
 
@@ -1050,7 +1050,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 	struct array_cache *alien = NULL;
 	int node;
 
-	node = numa_node_id();
+	node = numa_mem_id();
 
 	/*
 	 * Make sure we are not freeing a object from another node to the array
@@ -1129,7 +1129,7 @@ static void __cpuinit cpuup_canceled(long cpu)
 {
 	struct kmem_cache *cachep;
 	struct kmem_list3 *l3 = NULL;
-	int node = cpu_to_node(cpu);
+	int node = cpu_to_mem(cpu);
 	const struct cpumask *mask = cpumask_of_node(node);
 
 	list_for_each_entry(cachep, &cache_chain, next) {
@@ -1194,7 +1194,7 @@ static int __cpuinit cpuup_prepare(long cpu)
 {
 	struct kmem_cache *cachep;
 	struct kmem_list3 *l3 = NULL;
-	int node = cpu_to_node(cpu);
+	int node = cpu_to_mem(cpu);
 	int err;
 
 	/*
@@ -1479,7 +1479,7 @@ void __init kmem_cache_init(void)
 	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
 	 */
 
-	node = numa_node_id();
+	node = numa_mem_id();
 
 	/* 1) create the cache_cache */
 	INIT_LIST_HEAD(&cache_chain);
@@ -2121,7 +2121,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
 			}
 		}
 	}
-	cachep->nodelists[numa_node_id()]->next_reap =
+	cachep->nodelists[numa_mem_id()]->next_reap =
 			jiffies + REAPTIMEOUT_LIST3 +
 			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
 
@@ -2452,7 +2452,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep)
 {
 #ifdef CONFIG_SMP
 	check_irq_off();
-	assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
+	assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock);
 #endif
 }
 
@@ -2479,7 +2479,7 @@ static void do_drain(void *arg)
 {
 	struct kmem_cache *cachep = arg;
 	struct array_cache *ac;
-	int node = numa_node_id();
+	int node = numa_mem_id();
 
 	check_irq_off();
 	ac = cpu_cache_get(cachep);
@@ -3012,7 +3012,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
 
 retry:
 	check_irq_off();
-	node = numa_node_id();
+	node = numa_mem_id();
 	ac = cpu_cache_get(cachep);
 	batchcount = ac->batchcount;
 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -3216,7 +3216,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
 
 	if (in_interrupt() || (flags & __GFP_THISNODE))
 		return NULL;
-	nid_alloc = nid_here = numa_node_id();
+	nid_alloc = nid_here = numa_mem_id();
 	get_mems_allowed();
 	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
 		nid_alloc = cpuset_slab_spread_node();
@@ -3281,7 +3281,7 @@ retry:
 		if (local_flags & __GFP_WAIT)
 			local_irq_enable();
 		kmem_flagcheck(cache, flags);
-		obj = kmem_getpages(cache, local_flags, numa_node_id());
+		obj = kmem_getpages(cache, local_flags, numa_mem_id());
 		if (local_flags & __GFP_WAIT)
 			local_irq_disable();
 		if (obj) {
@@ -3389,6 +3389,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 {
 	unsigned long save_flags;
 	void *ptr;
+	int slab_node = numa_mem_id();
 
 	flags &= gfp_allowed_mask;
 
@@ -3401,7 +3402,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	local_irq_save(save_flags);
 
 	if (nodeid == -1)
-		nodeid = numa_node_id();
+		nodeid = slab_node;
 
 	if (unlikely(!cachep->nodelists[nodeid])) {
 		/* Node not bootstrapped yet */
@@ -3409,7 +3410,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 		goto out;
 	}
 
-	if (nodeid == numa_node_id()) {
+	if (nodeid == slab_node) {
 		/*
 		 * Use the locally cached objects if possible.
 		 * However ____cache_alloc does not allow fallback
@@ -3453,8 +3454,8 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags)
 	 * We may just have run out of memory on the local node.
 	 * ____cache_alloc_node() knows how to locate memory on other nodes
 	 */
-	if (!objp)
-		objp = ____cache_alloc_node(cache, flags, numa_node_id());
+	if (!objp)
+		objp = ____cache_alloc_node(cache, flags, numa_mem_id());
 
   out:
 	return objp;
@@ -3551,7 +3552,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
 {
 	int batchcount;
 	struct kmem_list3 *l3;
-	int node = numa_node_id();
+	int node = numa_mem_id();
 
 	batchcount = ac->batchcount;
 #if DEBUG
@@ -3985,7 +3986,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 		return -ENOMEM;
 
 	for_each_online_cpu(i) {
-		new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
+		new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
 						batchcount, gfp);
 		if (!new->new[i]) {
 			for (i--; i >= 0; i--)
@@ -4007,9 +4008,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 		struct array_cache *ccold = new->new[i];
 		if (!ccold)
 			continue;
-		spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
-		free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
-		spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
+		spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
+		free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
+		spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
 		kfree(ccold);
 	}
 	kfree(new);
@@ -4115,7 +4116,7 @@ static void cache_reap(struct work_struct *w)
 {
 	struct kmem_cache *searchp;
 	struct kmem_list3 *l3;
-	int node = numa_node_id();
+	int node = numa_mem_id();
 	struct delayed_work *work = to_delayed_work(w);
 
 	if (!mutex_trylock(&cache_chain_mutex))