2 files changed, 86 insertions, 26 deletions
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 829b94b156f2..b359c4a9ec9e 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -82,6 +82,12 @@
 *    to generate slightly worse code.  So use a simple one-line #define
 *    for node_isset(), instead of wrapping an inline inside a macro, the
 *    way we do the other calls.
+ *
+ * NODEMASK_SCRATCH
+ * When doing the above logical AND, OR, XOR and Remap operations, callers tend
+ * to need temporary nodemask_t's on the stack. But if NODES_SHIFT is large,
+ * nodemask_t's consume too much stack space.  NODEMASK_SCRATCH is a helper
+ * for such situations. See below and CPUMASK_ALLOC also.
 */
 #include <linux/kernel.h>
@@ -473,4 +479,26 @@ static inline int num_node_state(enum node_states state)
 #define for_each_node(node)        for_each_node_state(node, N_POSSIBLE)
 #define for_each_online_node(node) for_each_node_state(node, N_ONLINE)
+/*
+ * For nodemask scratch area. (See CPUMASK_ALLOC() in cpumask.h)
+ */
+#if NODES_SHIFT > 8 /* nodemask_t > 32 bytes: allocate with kmalloc() */
+#define NODEMASK_ALLOC(x, m) struct x *m = kmalloc(sizeof(*m), GFP_KERNEL)
+#define NODEMASK_FREE(m) kfree(m)
+#else /* small nodemask_t: declare a local struct and point m at it */
+#define NODEMASK_ALLOC(x, m) struct x _m, *m = &_m
+#define NODEMASK_FREE(m) /* nothing to free for the local struct */
+#endif /* NODES_SHIFT > 8 */
+/* An example structure for using NODEMASK_ALLOC, used in mempolicy. */
+struct nodemask_scratch {
+        nodemask_t      mask1;  /* mempolicy: allowed mems & N_HIGH_MEMORY */
+        nodemask_t      mask2;  /* mempolicy: mask passed to ->create() */
+};
+#define NODEMASK_SCRATCH(x) NODEMASK_ALLOC(nodemask_scratch, x)
+#define NODEMASK_SCRATCH_FREE(x)  NODEMASK_FREE(x)
 #endif /* __LINUX_NODEMASK_H */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e08e2c4da63a..7dd9d9f80694 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -191,25 +191,27 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 */
-static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
+static int mpol_set_nodemask(struct mempolicy *pol,
+                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
 {
-        nodemask_t cpuset_context_nmask;
        int ret;
        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
        if (pol == NULL)
                return 0;
+        /* Check N_HIGH_MEMORY */
+        nodes_and(nsc->mask1,
+                  cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
        VM_BUG_ON(!nodes);
        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
                nodes = NULL;   /* explicit local allocation */
        else {
                if (pol->flags & MPOL_F_RELATIVE_NODES)
-                        mpol_relative_nodemask(&cpuset_context_nmask, nodes,
+                        mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
-                                               &cpuset_current_mems_allowed);
                else
-                        nodes_and(cpuset_context_nmask, *nodes,
+                        nodes_and(nsc->mask2, *nodes, nsc->mask1);
-                                  cpuset_current_mems_allowed);
                if (mpol_store_user_nodemask(pol))
                        pol->w.user_nodemask = *nodes;
                else
@@ -217,8 +219,10 @@ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
                                                cpuset_current_mems_allowed;
        }
-        ret = mpol_ops[pol->mode].create(pol,
+        if (nodes)
-                                nodes ? &cpuset_context_nmask : NULL);
+                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
+        else
+                ret = mpol_ops[pol->mode].create(pol, NULL);
        return ret;
 }
@@ -620,12 +624,17 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 {
        struct mempolicy *new, *old;
        struct mm_struct *mm = current->mm;
+        NODEMASK_SCRATCH(scratch);
        int ret;
-        new = mpol_new(mode, flags, nodes);
+        if (!scratch)
-        if (IS_ERR(new))
+                return -ENOMEM;
-                return PTR_ERR(new);
+        new = mpol_new(mode, flags, nodes);
+        if (IS_ERR(new)) {
+                ret = PTR_ERR(new);
+                goto out;
+        }
        /*
         * prevent changing our mempolicy while show_numa_maps()
         * is using it.
@@ -635,13 +644,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
        if (mm)
                down_write(&mm->mmap_sem);
        task_lock(current);
-        ret = mpol_set_nodemask(new, nodes);
+        ret = mpol_set_nodemask(new, nodes, scratch);
        if (ret) {
                task_unlock(current);
                if (mm)
                        up_write(&mm->mmap_sem);
                mpol_put(new);
-                return ret;
+                goto out;
        }
        old = current->mempolicy;
        current->mempolicy = new;
@@ -654,7 +663,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
                up_write(&mm->mmap_sem);
        mpol_put(old);
-        return 0;
+        ret = 0;
+out:
+        NODEMASK_SCRATCH_FREE(scratch);
+        return ret;
 }
 /*
@@ -1014,12 +1026,20 @@ static long do_mbind(unsigned long start, unsigned long len,
                if (err)
                        return err;
        }
-        down_write(&mm->mmap_sem);
+        {
-        task_lock(current);
+                NODEMASK_SCRATCH(scratch);
-        err = mpol_set_nodemask(new, nmask);
+                if (scratch) {
-        task_unlock(current);
+                        down_write(&mm->mmap_sem);
+                        task_lock(current);
+                        err = mpol_set_nodemask(new, nmask, scratch);
+                        task_unlock(current);
+                        if (err)
+                                up_write(&mm->mmap_sem);
+                } else
+                        err = -ENOMEM;
+                NODEMASK_SCRATCH_FREE(scratch);
+        }
        if (err) {
-                up_write(&mm->mmap_sem);
                mpol_put(new);
                return err;
        }
@@ -1891,6 +1911,7 @@ restart:
 * Install non-NULL @mpol in inode's shared policy rb-tree.
 * On entry, the current task has a reference on a non-NULL @mpol.
 * This must be released on exit.
+ * This is called during get_inode(), so we can use GFP_KERNEL.
 */
 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
 {
@@ -1902,19 +1923,24 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
        if (mpol) {
                struct vm_area_struct pvma;
                struct mempolicy *new;
+                NODEMASK_SCRATCH(scratch);
+                if (!scratch)
+                        return; /* NOTE(review): skips mpol_put(mpol) - leak? */
                /* contextualize the tmpfs mount point mempolicy */
                new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
                if (IS_ERR(new)) {
                        mpol_put(mpol); /* drop our ref on sb mpol */
+                        NODEMASK_SCRATCH_FREE(scratch);
                        return;         /* no valid nodemask intersection */
                }
                task_lock(current);
-                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask);
+                ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
                task_unlock(current);
                mpol_put(mpol); /* drop our ref on sb mpol */
                if (ret) {
+                        NODEMASK_SCRATCH_FREE(scratch);
                        mpol_put(new);
                        return;
                }
@@ -1924,6 +1950,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
                pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
                mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
                mpol_put(new);                  /* drop initial ref */
+                NODEMASK_SCRATCH_FREE(scratch);
        }
 }
@@ -2140,13 +2167,18 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
                err = 1;
        else {
                int ret;
+                NODEMASK_SCRATCH(scratch);
-                task_lock(current);
+                if (scratch) {
-                ret = mpol_set_nodemask(new, &nodes);
+                        task_lock(current);
-                task_unlock(current);
+                        ret = mpol_set_nodemask(new, &nodes, scratch);
-                if (ret)
+                        task_unlock(current);
+                } else
+                        ret = -ENOMEM;
+                NODEMASK_SCRATCH_FREE(scratch);
+                if (ret) {
                        err = 1;
-                else if (no_context) {
+                        mpol_put(new);
+                } else if (no_context) {
                        /* save for contextualization */
                        new->w.user_nodemask = nodes;
                }

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 829b94b156f2..b359c4a9ec9e 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h
@@ -82,6 +82,12 @@
82	* to generate slightly worse code. So use a simple one-line #define	82	* to generate slightly worse code. So use a simple one-line #define
83	* for node_isset(), instead of wrapping an inline inside a macro, the	83	* for node_isset(), instead of wrapping an inline inside a macro, the
84	* way we do the other calls.	84	* way we do the other calls.
		85	*
		86	* NODEMASK_SCRATCH
		87	* When doing above logical AND, OR, XOR, Remap operations the callers tend to
		88	* need temporary nodemask_t's on the stack. But if NODES_SHIFT is large,
		89	* nodemask_t's consume too much stack space. NODEMASK_SCRATCH is a helper
		90	* for such situations. See below and CPUMASK_ALLOC also.
85	*/	91	*/
86		92
87	#include <linux/kernel.h>	93	#include <linux/kernel.h>
@@ -473,4 +479,26 @@ static inline int num_node_state(enum node_states state)
473	#define for_each_node(node) for_each_node_state(node, N_POSSIBLE)	479	#define for_each_node(node) for_each_node_state(node, N_POSSIBLE)
474	#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)	480	#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)
475		481
		482	/*
		483	* For nodemask scratch area. (See CPUMASK_ALLOC() in cpumask.h)
		484	*/
		485
		486	#if NODES_SHIFT > 8 /* nodemask_t > 64 bytes */
		487	#define NODEMASK_ALLOC(x, m) struct x *m = kmalloc(sizeof(*m), GFP_KERNEL)
		488	#define NODEMASK_FREE(m) kfree(m)
		489	#else
		490	#define NODEMASK_ALLOC(x, m) struct x _m, *m = &_m
		491	#define NODEMASK_FREE(m)
		492	#endif
		493
		494	/* An example structure for using NODEMASK_ALLOC, used in mempolicy. */
		495	struct nodemask_scratch {
		496	nodemask_t mask1;
		497	nodemask_t mask2;
		498	};
		499
		500	#define NODEMASK_SCRATCH(x) NODEMASK_ALLOC(nodemask_scratch, x)
		501	#define NODEMASK_SCRATCH_FREE(x) NODEMASK_FREE(x)
		502
		503
476	#endif /* __LINUX_NODEMASK_H */	504	#endif /* __LINUX_NODEMASK_H */


diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e08e2c4da63a..7dd9d9f80694 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c
@@ -191,25 +191,27 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
191	* Must be called holding task's alloc_lock to protect task's mems_allowed	191	* Must be called holding task's alloc_lock to protect task's mems_allowed
192	* and mempolicy. May also be called holding the mmap_semaphore for write.	192	* and mempolicy. May also be called holding the mmap_semaphore for write.
193	*/	193	*/
194	static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)	194	static int mpol_set_nodemask(struct mempolicy *pol,
		195	const nodemask_t *nodes, struct nodemask_scratch *nsc)
195	{	196	{
196	nodemask_t cpuset_context_nmask;
197	int ret;	197	int ret;
198		198
199	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */	199	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
200	if (pol == NULL)	200	if (pol == NULL)
201	return 0;	201	return 0;
		202	/* Check N_HIGH_MEMORY */
		203	nodes_and(nsc->mask1,
		204	cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
202		205
203	VM_BUG_ON(!nodes);	206	VM_BUG_ON(!nodes);
204	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))	207	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
205	nodes = NULL; /* explicit local allocation */	208	nodes = NULL; /* explicit local allocation */
206	else {	209	else {
207	if (pol->flags & MPOL_F_RELATIVE_NODES)	210	if (pol->flags & MPOL_F_RELATIVE_NODES)
208	mpol_relative_nodemask(&cpuset_context_nmask, nodes,	211	mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
209	&cpuset_current_mems_allowed);
210	else	212	else
211	nodes_and(cpuset_context_nmask, *nodes,	213	nodes_and(nsc->mask2, *nodes, nsc->mask1);
212	cpuset_current_mems_allowed);	214
213	if (mpol_store_user_nodemask(pol))	215	if (mpol_store_user_nodemask(pol))
214	pol->w.user_nodemask = *nodes;	216	pol->w.user_nodemask = *nodes;
215	else	217	else
@@ -217,8 +219,10 @@ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes)
217	cpuset_current_mems_allowed;	219	cpuset_current_mems_allowed;
218	}	220	}
219		221
220	ret = mpol_ops[pol->mode].create(pol,	222	if (nodes)
221	nodes ? &cpuset_context_nmask : NULL);	223	ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
		224	else
		225	ret = mpol_ops[pol->mode].create(pol, NULL);
222	return ret;	226	return ret;
223	}	227	}
224		228
@@ -620,12 +624,17 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
620	{	624	{
621	struct mempolicy *new, *old;	625	struct mempolicy *new, *old;
622	struct mm_struct *mm = current->mm;	626	struct mm_struct *mm = current->mm;
		627	NODEMASK_SCRATCH(scratch);
623	int ret;	628	int ret;
624		629
625	new = mpol_new(mode, flags, nodes);	630	if (!scratch)
626	if (IS_ERR(new))	631	return -ENOMEM;
627	return PTR_ERR(new);
628		632
		633	new = mpol_new(mode, flags, nodes);
		634	if (IS_ERR(new)) {
		635	ret = PTR_ERR(new);
		636	goto out;
		637	}
629	/*	638	/*
630	* prevent changing our mempolicy while show_numa_maps()	639	* prevent changing our mempolicy while show_numa_maps()
631	* is using it.	640	* is using it.
@@ -635,13 +644,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
635	if (mm)	644	if (mm)
636	down_write(&mm->mmap_sem);	645	down_write(&mm->mmap_sem);
637	task_lock(current);	646	task_lock(current);
638	ret = mpol_set_nodemask(new, nodes);	647	ret = mpol_set_nodemask(new, nodes, scratch);
639	if (ret) {	648	if (ret) {
640	task_unlock(current);	649	task_unlock(current);
641	if (mm)	650	if (mm)
642	up_write(&mm->mmap_sem);	651	up_write(&mm->mmap_sem);
643	mpol_put(new);	652	mpol_put(new);
644	return ret;	653	goto out;
645	}	654	}
646	old = current->mempolicy;	655	old = current->mempolicy;
647	current->mempolicy = new;	656	current->mempolicy = new;
@@ -654,7 +663,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
654	up_write(&mm->mmap_sem);	663	up_write(&mm->mmap_sem);
655		664
656	mpol_put(old);	665	mpol_put(old);
657	return 0;	666	ret = 0;
		667	out:
		668	NODEMASK_SCRATCH_FREE(scratch);
		669	return ret;
658	}	670	}
659		671
660	/*	672	/*
@@ -1014,12 +1026,20 @@ static long do_mbind(unsigned long start, unsigned long len,
1014	if (err)	1026	if (err)
1015	return err;	1027	return err;
1016	}	1028	}
1017	down_write(&mm->mmap_sem);	1029	{
1018	task_lock(current);	1030	NODEMASK_SCRATCH(scratch);
1019	err = mpol_set_nodemask(new, nmask);	1031	if (scratch) {
1020	task_unlock(current);	1032	down_write(&mm->mmap_sem);
		1033	task_lock(current);
		1034	err = mpol_set_nodemask(new, nmask, scratch);
		1035	task_unlock(current);
		1036	if (err)
		1037	up_write(&mm->mmap_sem);
		1038	} else
		1039	err = -ENOMEM;
		1040	NODEMASK_SCRATCH_FREE(scratch);
		1041	}
1021	if (err) {	1042	if (err) {
1022	up_write(&mm->mmap_sem);
1023	mpol_put(new);	1043	mpol_put(new);
1024	return err;	1044	return err;
1025	}	1045	}
@@ -1891,6 +1911,7 @@ restart:
1891	* Install non-NULL @mpol in inode's shared policy rb-tree.	1911	* Install non-NULL @mpol in inode's shared policy rb-tree.
1892	* On entry, the current task has a reference on a non-NULL @mpol.	1912	* On entry, the current task has a reference on a non-NULL @mpol.
1893	* This must be released on exit.	1913	* This must be released on exit.
		1914	* This is called at get_inode() calls and we can use GFP_KERNEL.
1894	*/	1915	*/
1895	void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)	1916	void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1896	{	1917	{
@@ -1902,19 +1923,24 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1902	if (mpol) {	1923	if (mpol) {
1903	struct vm_area_struct pvma;	1924	struct vm_area_struct pvma;
1904	struct mempolicy *new;	1925	struct mempolicy *new;
		1926	NODEMASK_SCRATCH(scratch);
1905		1927
		1928	if (!scratch)
		1929	return;
1906	/* contextualize the tmpfs mount point mempolicy */	1930	/* contextualize the tmpfs mount point mempolicy */
1907	new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);	1931	new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1908	if (IS_ERR(new)) {	1932	if (IS_ERR(new)) {
1909	mpol_put(mpol); /* drop our ref on sb mpol */	1933	mpol_put(mpol); /* drop our ref on sb mpol */
		1934	NODEMASK_SCRATCH_FREE(scratch);
1910	return; /* no valid nodemask intersection */	1935	return; /* no valid nodemask intersection */
1911	}	1936	}
1912		1937
1913	task_lock(current);	1938	task_lock(current);
1914	ret = mpol_set_nodemask(new, &mpol->w.user_nodemask);	1939	ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
1915	task_unlock(current);	1940	task_unlock(current);
1916	mpol_put(mpol); /* drop our ref on sb mpol */	1941	mpol_put(mpol); /* drop our ref on sb mpol */
1917	if (ret) {	1942	if (ret) {
		1943	NODEMASK_SCRATCH_FREE(scratch);
1918	mpol_put(new);	1944	mpol_put(new);
1919	return;	1945	return;
1920	}	1946	}
@@ -1924,6 +1950,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1924	pvma.vm_end = TASK_SIZE; /* policy covers entire file */	1950	pvma.vm_end = TASK_SIZE; /* policy covers entire file */
1925	mpol_set_shared_policy(sp, &pvma, new); /* adds ref */	1951	mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1926	mpol_put(new); /* drop initial ref */	1952	mpol_put(new); /* drop initial ref */
		1953	NODEMASK_SCRATCH_FREE(scratch);
1927	}	1954	}
1928	}	1955	}
1929		1956
@@ -2140,13 +2167,18 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2140	err = 1;	2167	err = 1;
2141	else {	2168	else {
2142	int ret;	2169	int ret;
2143		2170	NODEMASK_SCRATCH(scratch);
2144	task_lock(current);	2171	if (scratch) {
2145	ret = mpol_set_nodemask(new, &nodes);	2172	task_lock(current);
2146	task_unlock(current);	2173	ret = mpol_set_nodemask(new, &nodes, scratch);
2147	if (ret)	2174	task_unlock(current);
		2175	} else
		2176	ret = -ENOMEM;
		2177	NODEMASK_SCRATCH_FREE(scratch);
		2178	if (ret) {
2148	err = 1;	2179	err = 1;
2149	else if (no_context) {	2180	mpol_put(new);
		2181	} else if (no_context) {
2150	/* save for contextualization */	2182	/* save for contextualization */
2151	new->w.user_nodemask = nodes;	2183	new->w.user_nodemask = nodes;
2152	}	2184	}