aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristoph Lameter <clameter@sgi.com>2006-09-27 04:50:08 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2006-09-27 11:26:12 -0400
commit765c4507af71c39aba21006bbd3ec809fe9714ff (patch)
tree8bf1f5f940af830e18321b4e8ceac55457e5b981
parent77f700dab4c05f8ee17584ec869672796d7bcb87 (diff)
[PATCH] GFP_THISNODE for the slab allocator
This patch insures that the slab node lists in the NUMA case only contain slabs that belong to that specific node. All slab allocations use GFP_THISNODE when calling into the page allocator. If an allocation fails then we fall back in the slab allocator according to the zonelists appropriate for a certain context. This allows a replication of the behavior of alloc_pages and alloc_pages node in the slab layer. Currently allocations requested from the page allocator may be redirected via cpusets to other nodes. This results in remote pages on nodelists and that in turn results in interrupt latency issues during cache draining. Plus the slab is handing out memory as local when it is really remote. Fallback for slab memory allocations will occur within the slab allocator and not in the page allocator. This is necessary in order to be able to use the existing pools of objects on the nodes that we fall back to before adding more pages to a slab. The fallback function insures that the nodes we fall back to obey cpuset restrictions of the current context. We do not allocate objects from outside of the current cpuset context like before. Note that the implementation of locality constraints within the slab allocator requires importing logic from the page allocator. This is a mischmash that is not that great. Other allocators (uncached allocator, vmalloc, huge pages) face similar problems and have similar minimal reimplementations of the basic fallback logic of the page allocator. There is another way of implementing a slab by avoiding per node lists (see modular slab) but this wont work within the existing slab. V1->V2: - Use NUMA_BUILD to avoid #ifdef CONFIG_NUMA - Exploit GFP_THISNODE being 0 in the NON_NUMA case to avoid another #ifdef [akpm@osdl.org: build fix] Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--mm/mempolicy.c4
-rw-r--r--mm/slab.c107
2 files changed, 81 insertions, 30 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 38f89650bc84..cf18f0942553 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1136,7 +1136,9 @@ static unsigned interleave_nodes(struct mempolicy *policy)
1136 */ 1136 */
1137unsigned slab_node(struct mempolicy *policy) 1137unsigned slab_node(struct mempolicy *policy)
1138{ 1138{
1139 switch (policy->policy) { 1139 int pol = policy ? policy->policy : MPOL_DEFAULT;
1140
1141 switch (pol) {
1140 case MPOL_INTERLEAVE: 1142 case MPOL_INTERLEAVE:
1141 return interleave_nodes(policy); 1143 return interleave_nodes(policy);
1142 1144
diff --git a/mm/slab.c b/mm/slab.c
index 69e11c45002f..792bfe320a8b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -972,7 +972,39 @@ static int transfer_objects(struct array_cache *to,
972 return nr; 972 return nr;
973} 973}
974 974
975#ifdef CONFIG_NUMA 975#ifndef CONFIG_NUMA
976
977#define drain_alien_cache(cachep, alien) do { } while (0)
978#define reap_alien(cachep, l3) do { } while (0)
979
980static inline struct array_cache **alloc_alien_cache(int node, int limit)
981{
982 return (struct array_cache **)BAD_ALIEN_MAGIC;
983}
984
985static inline void free_alien_cache(struct array_cache **ac_ptr)
986{
987}
988
989static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
990{
991 return 0;
992}
993
994static inline void *alternate_node_alloc(struct kmem_cache *cachep,
995 gfp_t flags)
996{
997 return NULL;
998}
999
1000static inline void *__cache_alloc_node(struct kmem_cache *cachep,
1001 gfp_t flags, int nodeid)
1002{
1003 return NULL;
1004}
1005
1006#else /* CONFIG_NUMA */
1007
976static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int); 1008static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
977static void *alternate_node_alloc(struct kmem_cache *, gfp_t); 1009static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
978 1010
@@ -1101,26 +1133,6 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1101 } 1133 }
1102 return 1; 1134 return 1;
1103} 1135}
1104
1105#else
1106
1107#define drain_alien_cache(cachep, alien) do { } while (0)
1108#define reap_alien(cachep, l3) do { } while (0)
1109
1110static inline struct array_cache **alloc_alien_cache(int node, int limit)
1111{
1112 return (struct array_cache **)BAD_ALIEN_MAGIC;
1113}
1114
1115static inline void free_alien_cache(struct array_cache **ac_ptr)
1116{
1117}
1118
1119static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
1120{
1121 return 0;
1122}
1123
1124#endif 1136#endif
1125 1137
1126static int __cpuinit cpuup_callback(struct notifier_block *nfb, 1138static int __cpuinit cpuup_callback(struct notifier_block *nfb,
@@ -1564,7 +1576,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
1564 */ 1576 */
1565 flags |= __GFP_COMP; 1577 flags |= __GFP_COMP;
1566#endif 1578#endif
1567 flags |= cachep->gfpflags; 1579
1580 /*
1581 * Under NUMA we want memory on the indicated node. We will handle
1582 * the needed fallback ourselves since we want to serve from our
1583 * per node object lists first for other nodes.
1584 */
1585 flags |= cachep->gfpflags | GFP_THISNODE;
1568 1586
1569 page = alloc_pages_node(nodeid, flags, cachep->gfporder); 1587 page = alloc_pages_node(nodeid, flags, cachep->gfporder);
1570 if (!page) 1588 if (!page)
@@ -3051,13 +3069,18 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
3051 3069
3052 local_irq_save(save_flags); 3070 local_irq_save(save_flags);
3053 3071
3054#ifdef CONFIG_NUMA 3072 if (unlikely(NUMA_BUILD &&
3055 if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) 3073 current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY)))
3056 objp = alternate_node_alloc(cachep, flags); 3074 objp = alternate_node_alloc(cachep, flags);
3057#endif
3058 3075
3059 if (!objp) 3076 if (!objp)
3060 objp = ____cache_alloc(cachep, flags); 3077 objp = ____cache_alloc(cachep, flags);
3078 /*
3079 * We may just have run out of memory on the local node.
3080 * __cache_alloc_node() knows how to locate memory on other nodes
3081 */
3082 if (NUMA_BUILD && !objp)
3083 objp = __cache_alloc_node(cachep, flags, numa_node_id());
3061 local_irq_restore(save_flags); 3084 local_irq_restore(save_flags);
3062 objp = cache_alloc_debugcheck_after(cachep, flags, objp, 3085 objp = cache_alloc_debugcheck_after(cachep, flags, objp,
3063 caller); 3086 caller);
@@ -3076,7 +3099,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3076{ 3099{
3077 int nid_alloc, nid_here; 3100 int nid_alloc, nid_here;
3078 3101
3079 if (in_interrupt()) 3102 if (in_interrupt() || (flags & __GFP_THISNODE))
3080 return NULL; 3103 return NULL;
3081 nid_alloc = nid_here = numa_node_id(); 3104 nid_alloc = nid_here = numa_node_id();
3082 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) 3105 if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
@@ -3089,6 +3112,28 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
3089} 3112}
3090 3113
3091/* 3114/*
3115 * Fallback function if there was no memory available and no objects on a
3116 * certain node and we are allowed to fall back. We mimick the behavior of
3117 * the page allocator. We fall back according to a zonelist determined by
3118 * the policy layer while obeying cpuset constraints.
3119 */
3120void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3121{
3122 struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy))
3123 ->node_zonelists[gfp_zone(flags)];
3124 struct zone **z;
3125 void *obj = NULL;
3126
3127 for (z = zonelist->zones; *z && !obj; z++)
3128 if (zone_idx(*z) <= ZONE_NORMAL &&
3129 cpuset_zone_allowed(*z, flags))
3130 obj = __cache_alloc_node(cache,
3131 flags | __GFP_THISNODE,
3132 zone_to_nid(*z));
3133 return obj;
3134}
3135
3136/*
3092 * A interface to enable slab creation on nodeid 3137 * A interface to enable slab creation on nodeid
3093 */ 3138 */
3094static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, 3139static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
@@ -3141,11 +3186,15 @@ retry:
3141must_grow: 3186must_grow:
3142 spin_unlock(&l3->list_lock); 3187 spin_unlock(&l3->list_lock);
3143 x = cache_grow(cachep, flags, nodeid); 3188 x = cache_grow(cachep, flags, nodeid);
3189 if (x)
3190 goto retry;
3144 3191
3145 if (!x) 3192 if (!(flags & __GFP_THISNODE))
3146 return NULL; 3193 /* Unable to grow the cache. Fall back to other nodes. */
3194 return fallback_alloc(cachep, flags);
3195
3196 return NULL;
3147 3197
3148 goto retry;
3149done: 3198done:
3150 return obj; 3199 return obj;
3151} 3200}