aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristoph Lameter <clameter@engr.sgi.com>2006-01-18 20:42:36 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-01-18 22:20:18 -0500
commitdc85da15d42b0efc792b0f5eab774dc5dbc1ceec (patch)
tree4b347b10dadf3cc7bdbff36709e8cee2bc673996
parentfc0abb1451c64c79ac80665d5ba74450ce274e4d (diff)
[PATCH] NUMA policies in the slab allocator V2
This patch fixes a regression in 2.6.14 against 2.6.13 that causes an imbalance in memory allocation during bootup. The slab allocator in 2.6.13 is not numa aware and simply calls alloc_pages(). This means that memory policies may control the behavior of alloc_pages(). During bootup the memory policy is set to MPOL_INTERLEAVE resulting in the spreading out of allocations during bootup over all available nodes. The slab allocator in 2.6.13 has only a single list of slab pages. As a result the per cpu slab cache and the spinlock controlled page lists may contain slab entries from off node memory. The slab allocator in 2.6.13 makes no effort to discern the locality of an entry on its lists. The NUMA aware slab allocator in 2.6.14 controls locality of the slab pages explicitly by calling alloc_pages_node(). The NUMA slab allocator manages slab entries by having lists of available slab pages for each node. The per cpu slab cache can only contain slab entries associated with the node local to the processor. This guarantees that the default allocation mode of the slab allocator always assigns local memory if available. Setting MPOL_INTERLEAVE as a default policy during bootup has no effect anymore. In 2.6.14 all node unspecific slab allocations are performed on the boot processor. This means that most of key data structures are allocated on one node. Most processors will have to refer to these structures making the boot node a potential bottleneck. This may reduce performance and cause unnecessary memory pressure on the boot node. This patch implements NUMA policies in the slab layer. There is the need of explicit application of NUMA memory policies by the slab allcator itself since the NUMA slab allocator does no longer let the page_allocator control locality. The check for policies is made directly at the beginning of __cache_alloc using current->mempolicy. The memory policy is already frequently checked by the page allocator (alloc_page_vma() and alloc_page_current()). So it is highly likely that the cacheline is present. For MPOL_INTERLEAVE kmalloc() will spread out each request to one node after another so that an equal distribution of allocations can be obtained during bootup. It is not possible to push the policy check to lower layers of the NUMA slab allocator since the per cpu caches are now only containing slab entries from the current node. If the policy says that the local node is not to be preferred or forbidden then there is no point in checking the slab cache or local list of slab pages. The allocation better be directed immediately to the lists containing slab entries for the allowed set of nodes. This way of applying policy also fixes another strange behavior in 2.6.13. alloc_pages() is controlled by the memory allocation policy of the current process. It could therefore be that one process is running with MPOL_INTERLEAVE and would f.e. obtain a new page following that policy since no slab entries are in the lists anymore. A page can typically be used for multiple slab entries but lets say that the current process is only using one. The other entries are then added to the slab lists. These are now non local entries in the slab lists despite of the possible availability of local pages that would provide faster access and increase the performance of the application. Another process without MPOL_INTERLEAVE may now run and expect a local slab entry from kmalloc(). However, there are still these free slab entries from the off node page obtained from the other process via MPOL_INTERLEAVE in the cache. The process will then get an off node slab entry although other slab entries may be available that are local to that process. This means that the policy if one process may contaminate the locality of the slab caches for other processes. This patch in effect insures that a per process policy is followed for the allocation of slab entries and that there cannot be a memory policy influence from one process to another. A process with default policy will always get a local slab entry if one is available. And the process using memory policies will get its memory arranged as requested. Off-node slab allocation will require the use of spinlocks and will make the use of per cpu caches not possible. A process using memory policies to redirect allocations offnode will have to cope with additional lock overhead in addition to the latency added by the need to access a remote slab entry. Changes V1->V2 - Remove #ifdef CONFIG_NUMA by moving forward declaration into prior #ifdef CONFIG_NUMA section. - Give the function determining the node number to use a saner name. Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--include/linux/mempolicy.h1
-rw-r--r--mm/mempolicy.c30
-rw-r--r--mm/slab.c12
3 files changed, 43 insertions, 0 deletions
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index d6a53ed6ab6c..bbd2221923c3 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -159,6 +159,7 @@ extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
159extern struct mempolicy default_policy; 159extern struct mempolicy default_policy;
160extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, 160extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
161 unsigned long addr); 161 unsigned long addr);
162extern unsigned slab_node(struct mempolicy *policy);
162 163
163extern int policy_zone; 164extern int policy_zone;
164 165
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a683a66599b1..71430d440822 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -976,6 +976,36 @@ static unsigned interleave_nodes(struct mempolicy *policy)
976 return nid; 976 return nid;
977} 977}
978 978
979/*
980 * Depending on the memory policy provide a node from which to allocate the
981 * next slab entry.
982 */
983unsigned slab_node(struct mempolicy *policy)
984{
985 if (in_interrupt())
986 return numa_node_id();
987
988 switch (policy->policy) {
989 case MPOL_INTERLEAVE:
990 return interleave_nodes(policy);
991
992 case MPOL_BIND:
993 /*
994 * Follow bind policy behavior and start allocation at the
995 * first node.
996 */
997 return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
998
999 case MPOL_PREFERRED:
1000 if (policy->v.preferred_node >= 0)
1001 return policy->v.preferred_node;
1002 /* Fall through */
1003
1004 default:
1005 return numa_node_id();
1006 }
1007}
1008
979/* Do static interleaving for a VMA with known offset. */ 1009/* Do static interleaving for a VMA with known offset. */
980static unsigned offset_il_node(struct mempolicy *pol, 1010static unsigned offset_il_node(struct mempolicy *pol,
981 struct vm_area_struct *vma, unsigned long off) 1011 struct vm_area_struct *vma, unsigned long off)
diff --git a/mm/slab.c b/mm/slab.c
index bd0317f1e06c..9025608696ec 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -103,6 +103,7 @@
103#include <linux/rcupdate.h> 103#include <linux/rcupdate.h>
104#include <linux/string.h> 104#include <linux/string.h>
105#include <linux/nodemask.h> 105#include <linux/nodemask.h>
106#include <linux/mempolicy.h>
106#include <linux/mutex.h> 107#include <linux/mutex.h>
107 108
108#include <asm/uaccess.h> 109#include <asm/uaccess.h>
@@ -773,6 +774,8 @@ static struct array_cache *alloc_arraycache(int node, int entries,
773} 774}
774 775
775#ifdef CONFIG_NUMA 776#ifdef CONFIG_NUMA
777static void *__cache_alloc_node(kmem_cache_t *, gfp_t, int);
778
776static inline struct array_cache **alloc_alien_cache(int node, int limit) 779static inline struct array_cache **alloc_alien_cache(int node, int limit)
777{ 780{
778 struct array_cache **ac_ptr; 781 struct array_cache **ac_ptr;
@@ -2570,6 +2573,15 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
2570 void *objp; 2573 void *objp;
2571 struct array_cache *ac; 2574 struct array_cache *ac;
2572 2575
2576#ifdef CONFIG_NUMA
2577 if (current->mempolicy) {
2578 int nid = slab_node(current->mempolicy);
2579
2580 if (nid != numa_node_id())
2581 return __cache_alloc_node(cachep, flags, nid);
2582 }
2583#endif
2584
2573 check_irq_off(); 2585 check_irq_off();
2574 ac = ac_data(cachep); 2586 ac = ac_data(cachep);
2575 if (likely(ac->avail)) { 2587 if (likely(ac->avail)) {