author     Paul Jackson <pj@sgi.com>              2006-03-24 06:16:08 -0500
committer  Linus Torvalds <torvalds@g5.osdl.org>  2006-03-24 10:33:23 -0500
commit     c61afb181c649754ea221f104e268cbacfc993e3 (patch)
tree       870917b3f9175cf1663a2620d989856913cfb5f8
parent     101a50019ae5e370d73984ee05d56dd3b08f330a (diff)
[PATCH] cpuset memory spread slab cache optimizations
The hooks in the slab cache allocator code path for support of NUMA mempolicies and cpuset memory spreading are in an important code path.  Many systems will use neither feature.

This patch optimizes those hooks down to a single check of some bits in the current task's task_struct flags.  For non-NUMA systems, this hook and related code is already ifdef'd out.

The optimization is done by using another task flag, set if the task is using a non-default NUMA mempolicy.  Taking this flag bit along with the PF_SPREAD_PAGE and PF_SPREAD_SLAB flag bits added earlier in this 'cpuset memory spreading' patch set, one can check for the combination of any of these special case memory placement mechanisms with a single test of the current task's task_struct flags.

This patch also tightens up the code, to save a few bytes of kernel text space, and moves some of it out of line.  Due to the nested inlines called from multiple places, we were ending up with three copies of this code, which once we get off the main code path (for local node allocation) seems a bit wasteful of instruction memory.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
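As a rough, standalone sketch of the "single test" idea described above (ordinary userspace C; the task_struct here is a stand-in for illustration, not the kernel's structure, and only the flag values mirror the patch below):

/*
 * Fold several special-case conditions into per-task flag bits so the
 * allocation fast path can rule them all out with one AND and one branch.
 */
#include <stdio.h>

#define PF_SPREAD_PAGE	0x04000000	/* Spread page cache over cpuset */
#define PF_SPREAD_SLAB	0x08000000	/* Spread some slab caches over cpuset */
#define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */

struct task_struct {			/* stand-in, not the kernel's */
	unsigned long flags;
};

/* Fast path: one test covers all three placement mechanisms. */
static int needs_alternate_node(const struct task_struct *tsk)
{
	return (tsk->flags & (PF_SPREAD_PAGE | PF_SPREAD_SLAB |
			      PF_MEMPOLICY)) != 0;
}

int main(void)
{
	struct task_struct plain  = { .flags = 0 };
	struct task_struct spread = { .flags = PF_SPREAD_SLAB };

	printf("plain task:  %d\n", needs_alternate_node(&plain));	/* 0 */
	printf("spread task: %d\n", needs_alternate_node(&spread));	/* 1 */
	return 0;
}

Only tasks that actually use one of the special placement mechanisms pay for the slower out-of-line handling; everyone else takes the single not-taken branch.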
-rw-r--r--  include/linux/mempolicy.h   5
-rw-r--r--  include/linux/sched.h       1
-rw-r--r--  kernel/fork.c               1
-rw-r--r--  mm/mempolicy.c             32
-rw-r--r--  mm/slab.c                  41
5 files changed, 67 insertions, 13 deletions
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index bbd2221923c3..6a7621b2b12b 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -147,6 +147,7 @@ extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new);
 extern void mpol_rebind_task(struct task_struct *tsk,
 					const nodemask_t *new);
 extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
+extern void mpol_fix_fork_child_flag(struct task_struct *p);
 #define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x))
 
 #ifdef CONFIG_CPUSET
@@ -248,6 +249,10 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
 }
 
+static inline void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+}
+
 #define set_cpuset_being_rebound(x) do {} while (0)
 
 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b0e37cfa09f5..2cda439ece43 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -932,6 +932,7 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_SWAPWRITE	0x01000000	/* Allowed to write to swap */
 #define PF_SPREAD_PAGE	0x04000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x08000000	/* Spread some slab caches over cpuset */
+#define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
diff --git a/kernel/fork.c b/kernel/fork.c
index c21bae8c93b9..a02063903aaa 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1021,6 +1021,7 @@ static task_t *copy_process(unsigned long clone_flags,
 		p->mempolicy = NULL;
 		goto bad_fork_cleanup_cpuset;
 	}
+	mpol_fix_fork_child_flag(p);
 #endif
 
 #ifdef CONFIG_DEBUG_MUTEXES
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e93cc740c22b..4f71cfd29c6f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -422,6 +422,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
 	return mpol_check_policy(mode, nodes);
 }
 
+
+/*
+ * Update task->flags PF_MEMPOLICY bit: set iff non-default
+ * mempolicy.  Allows more rapid checking of this (combined perhaps
+ * with other PF_* flag bits) on memory allocation hot code paths.
+ *
+ * If called from outside this file, the task 'p' should -only- be
+ * a newly forked child not yet visible on the task list, because
+ * manipulating the task flags of a visible task is not safe.
+ *
+ * The above limitation is why this routine has the funny name
+ * mpol_fix_fork_child_flag().
+ *
+ * It is also safe to call this with a task pointer of current,
+ * which the static wrapper mpol_set_task_struct_flag() does,
+ * for use within this file.
+ */
+
+void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+	if (p->mempolicy)
+		p->flags |= PF_MEMPOLICY;
+	else
+		p->flags &= ~PF_MEMPOLICY;
+}
+
+static void mpol_set_task_struct_flag(void)
+{
+	mpol_fix_fork_child_flag(current);
+}
+
 /* Set the process memory policy */
 long do_set_mempolicy(int mode, nodemask_t *nodes)
 {
@@ -434,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes)
 		return PTR_ERR(new);
 	mpol_free(current->mempolicy);
 	current->mempolicy = new;
+	mpol_set_task_struct_flag();
 	if (new && new->policy == MPOL_INTERLEAVE)
 		current->il_next = first_node(new->v.nodes);
 	return 0;
diff --git a/mm/slab.c b/mm/slab.c
index de516658d3d8..f80b52388a12 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -899,6 +899,7 @@ static struct array_cache *alloc_arraycache(int node, int entries,
 
 #ifdef CONFIG_NUMA
 static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
+static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 
 static struct array_cache **alloc_alien_cache(int node, int limit)
 {
@@ -2808,19 +2809,11 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 	struct array_cache *ac;
 
 #ifdef CONFIG_NUMA
-	if (unlikely(current->mempolicy && !in_interrupt())) {
-		int nid = slab_node(current->mempolicy);
-
-		if (nid != numa_node_id())
-			return __cache_alloc_node(cachep, flags, nid);
-	}
-	if (unlikely(cpuset_do_slab_mem_spread() &&
-			(cachep->flags & SLAB_MEM_SPREAD) &&
-			!in_interrupt())) {
-		int nid = cpuset_mem_spread_node();
-
-		if (nid != numa_node_id())
-			return __cache_alloc_node(cachep, flags, nid);
-	}
+	if (unlikely(current->flags & (PF_SPREAD_PAGE | PF_SPREAD_SLAB |
+					PF_MEMPOLICY))) {
+		objp = alternate_node_alloc(cachep, flags);
+		if (objp != NULL)
+			return objp;
+	}
 #endif
 
@@ -2856,6 +2849,28 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
 
 #ifdef CONFIG_NUMA
 /*
+ * Try allocating on another node if PF_SPREAD_PAGE|PF_SPREAD_SLAB|PF_MEMPOLICY.
+ *
+ * If we are in_interrupt, then process context, including cpusets and
+ * mempolicy, may not apply and should not be used for allocation policy.
+ */
+static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+	int nid_alloc, nid_here;
+
+	if (in_interrupt())
+		return NULL;
+	nid_alloc = nid_here = numa_node_id();
+	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
+		nid_alloc = cpuset_mem_spread_node();
+	else if (current->mempolicy)
+		nid_alloc = slab_node(current->mempolicy);
+	if (nid_alloc != nid_here)
+		return __cache_alloc_node(cachep, flags, nid_alloc);
+	return NULL;
+}
+
+/*
  * A interface to enable slab creation on nodeid
  */
 static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,