aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorKOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>2008-02-11 23:30:22 -0500
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2008-02-11 23:48:29 -0500
commit31f1de46b90ad360a16e7af3e277d104961df923 (patch)
treea54e8698d4e4d088c4008e0ae91b579b13d2c208 /mm
parent1a510089849ff9f70b654659bf976a6baf3a4833 (diff)
mempolicy: silently restrict nodemask to allowed nodes
Kosaki Motohito noted that "numactl --interleave=all ..." failed in the presence of memoryless nodes. This patch attempts to fix that problem. Some background: numactl --interleave=all calls set_mempolicy(2) with a fully populated [out to MAXNUMNODES] nodemask. set_mempolicy() [in do_set_mempolicy()] calls contextualize_policy() which requires that the nodemask be a subset of the current task's mems_allowed; else EINVAL will be returned. A task's mems_allowed will always be a subset of node_states[N_HIGH_MEMORY] i.e., nodes with memory. So, a fully populated nodemask will be declared invalid if it includes memoryless nodes. NOTE: the same thing will occur when running in a cpuset with restricted mem_allowed--for the same reason: node mask contains dis-allowed nodes. mbind(2), on the other hand, just masks off any nodes in the nodemask that are not included in the caller's mems_allowed. In each case [mbind() and set_mempolicy()], mpol_check_policy() will complain [again, resulting in EINVAL] if the nodemask contains any memoryless nodes. This is somewhat redundant as mpol_new() will remove memoryless nodes for interleave policy, as will bind_zonelist()--called by mpol_new() for BIND policy. Proposed fix: 1) modify contextualize_policy logic to: a) remember whether the incoming node mask is empty. b) if not, restrict the nodemask to allowed nodes, as is currently done in-line for mbind(). This guarantees that the resulting mask includes only nodes with memory. NOTE: this is a [benign, IMO] change in behavior for set_mempolicy(). Dis-allowed nodes will be silently ignored, rather than returning an error. c) fold this code into mpol_check_policy(), replace 2 calls to contextualize_policy() to call mpol_check_policy() directly and remove contextualize_policy(). 2) In existing mpol_check_policy() logic, after "contextualization": a) MPOL_DEFAULT: require that in coming mask "was_empty" b) MPOL_{BIND|INTERLEAVE}: require that contextualized nodemask contains at least one node. c) add a case for MPOL_PREFERRED: if in coming was not empty and resulting mask IS empty, user specified invalid nodes. Return EINVAL. c) remove the now redundant check for memoryless nodes 3) remove the now redundant masking of policy nodes for interleave policy from mpol_new(). 4) Now that mpol_check_policy() contextualizes the nodemask, remove the in-line nodes_and() from sys_mbind(). I believe that this restores mbind() to the behavior before the memoryless-nodes patch series. E.g., we'll no longer treat an invalid nodemask with MPOL_PREFERRED as local allocation. [ Patch history: v1 -> v2: - Communicate whether or not incoming node mask was empty to mpol_check_policy() for better error checking. - As suggested by David Rientjes, remove the now unused cpuset_nodes_subset_current_mems_allowed() from cpuset.h v2 -> v3: - As suggested by Kosaki Motohito, fold the "contextualization" of policy nodemask into mpol_check_policy(). Looks a little cleaner. ] Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Acked-by: David Rientjes <rientjes@google.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/mempolicy.c61
1 files changed, 36 insertions, 25 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 83c69f8a64c2..8d246c3b340f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -116,22 +116,51 @@ static void mpol_rebind_policy(struct mempolicy *pol,
116/* Do sanity checking on a policy */ 116/* Do sanity checking on a policy */
117static int mpol_check_policy(int mode, nodemask_t *nodes) 117static int mpol_check_policy(int mode, nodemask_t *nodes)
118{ 118{
119 int empty = nodes_empty(*nodes); 119 int was_empty, is_empty;
120
121 if (!nodes)
122 return 0;
123
124 /*
125 * "Contextualize" the in-coming nodemast for cpusets:
126 * Remember whether in-coming nodemask was empty, If not,
127 * restrict the nodes to the allowed nodes in the cpuset.
128 * This is guaranteed to be a subset of nodes with memory.
129 */
130 cpuset_update_task_memory_state();
131 is_empty = was_empty = nodes_empty(*nodes);
132 if (!was_empty) {
133 nodes_and(*nodes, *nodes, cpuset_current_mems_allowed);
134 is_empty = nodes_empty(*nodes); /* after "contextualization" */
135 }
120 136
121 switch (mode) { 137 switch (mode) {
122 case MPOL_DEFAULT: 138 case MPOL_DEFAULT:
123 if (!empty) 139 /*
140 * require caller to specify an empty nodemask
141 * before "contextualization"
142 */
143 if (!was_empty)
124 return -EINVAL; 144 return -EINVAL;
125 break; 145 break;
126 case MPOL_BIND: 146 case MPOL_BIND:
127 case MPOL_INTERLEAVE: 147 case MPOL_INTERLEAVE:
128 /* Preferred will only use the first bit, but allow 148 /*
129 more for now. */ 149 * require at least 1 valid node after "contextualization"
130 if (empty) 150 */
151 if (is_empty)
152 return -EINVAL;
153 break;
154 case MPOL_PREFERRED:
155 /*
156 * Did caller specify invalid nodes?
157 * Don't silently accept this as "local allocation".
158 */
159 if (!was_empty && is_empty)
131 return -EINVAL; 160 return -EINVAL;
132 break; 161 break;
133 } 162 }
134 return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL; 163 return 0;
135} 164}
136 165
137/* Generate a custom zonelist for the BIND policy. */ 166/* Generate a custom zonelist for the BIND policy. */
@@ -188,8 +217,6 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
188 switch (mode) { 217 switch (mode) {
189 case MPOL_INTERLEAVE: 218 case MPOL_INTERLEAVE:
190 policy->v.nodes = *nodes; 219 policy->v.nodes = *nodes;
191 nodes_and(policy->v.nodes, policy->v.nodes,
192 node_states[N_HIGH_MEMORY]);
193 if (nodes_weight(policy->v.nodes) == 0) { 220 if (nodes_weight(policy->v.nodes) == 0) {
194 kmem_cache_free(policy_cache, policy); 221 kmem_cache_free(policy_cache, policy);
195 return ERR_PTR(-EINVAL); 222 return ERR_PTR(-EINVAL);
@@ -421,18 +448,6 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start,
421 return err; 448 return err;
422} 449}
423 450
424static int contextualize_policy(int mode, nodemask_t *nodes)
425{
426 if (!nodes)
427 return 0;
428
429 cpuset_update_task_memory_state();
430 if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
431 return -EINVAL;
432 return mpol_check_policy(mode, nodes);
433}
434
435
436/* 451/*
437 * Update task->flags PF_MEMPOLICY bit: set iff non-default 452 * Update task->flags PF_MEMPOLICY bit: set iff non-default
438 * mempolicy. Allows more rapid checking of this (combined perhaps 453 * mempolicy. Allows more rapid checking of this (combined perhaps
@@ -468,7 +483,7 @@ static long do_set_mempolicy(int mode, nodemask_t *nodes)
468{ 483{
469 struct mempolicy *new; 484 struct mempolicy *new;
470 485
471 if (contextualize_policy(mode, nodes)) 486 if (mpol_check_policy(mode, nodes))
472 return -EINVAL; 487 return -EINVAL;
473 new = mpol_new(mode, nodes); 488 new = mpol_new(mode, nodes);
474 if (IS_ERR(new)) 489 if (IS_ERR(new))
@@ -915,10 +930,6 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
915 err = get_nodes(&nodes, nmask, maxnode); 930 err = get_nodes(&nodes, nmask, maxnode);
916 if (err) 931 if (err)
917 return err; 932 return err;
918#ifdef CONFIG_CPUSETS
919 /* Restrict the nodes to the allowed nodes in the cpuset */
920 nodes_and(nodes, nodes, current->mems_allowed);
921#endif
922 return do_mbind(start, len, mode, &nodes, flags); 933 return do_mbind(start, len, mode, &nodes, flags);
923} 934}
924 935