Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--	mm/mempolicy.c	393
1 file changed, 212 insertions, 181 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1d5c64df1653..2076b1542b8a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2,6 +2,7 @@
  * Simple NUMA memory policy for the Linux kernel.
  *
  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
+ * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
  * Subject to the GNU Public License, version 2.
  *
  * NUMA policy allows the user to give hints in which node(s) memory should
@@ -17,13 +18,19 @@
  * offset into the backing object or offset into the mapping
  * for anonymous memory. For process policy an process counter
  * is used.
+ *
  * bind		Only allocate memory on a specific set of nodes,
  *		no fallback.
+ *		FIXME: memory is allocated starting with the first node
+ *		to the last. It would be better if bind would truly restrict
+ *		the allocation to memory nodes instead
+ *
  * preferred	Try a specific node first before normal fallback.
  *		As a special case node -1 here means do the allocation
  *		on the local CPU. This is normally identical to default,
  *		but useful to set in a VMA when you have a non default
  *		process policy.
+ *
  * default	Allocate on the local node first, or when on a VMA
  *		use the process policy. This is what Linux always did
  *		in a NUMA aware kernel and still does by, ahem, default.
@@ -93,23 +100,10 @@ struct mempolicy default_policy = {
 	.policy = MPOL_DEFAULT,
 };
 
-/* Check if all specified nodes are online */
-static int nodes_online(unsigned long *nodes)
-{
-	DECLARE_BITMAP(online2, MAX_NUMNODES);
-
-	bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
-	if (bitmap_empty(online2, MAX_NUMNODES))
-		set_bit(0, online2);
-	if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
-		return -EINVAL;
-	return 0;
-}
-
 /* Do sanity checking on a policy */
-static int mpol_check_policy(int mode, unsigned long *nodes)
+static int mpol_check_policy(int mode, nodemask_t *nodes)
 {
-	int empty = bitmap_empty(nodes, MAX_NUMNODES);
+	int empty = nodes_empty(*nodes);
 
 	switch (mode) {
 	case MPOL_DEFAULT:
@@ -124,71 +118,20 @@ static int mpol_check_policy(int mode, unsigned long *nodes)
 			return -EINVAL;
 		break;
 	}
-	return nodes_online(nodes);
-}
-
-/* Copy a node mask from user space. */
-static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
-		     unsigned long maxnode, int mode)
-{
-	unsigned long k;
-	unsigned long nlongs;
-	unsigned long endmask;
-
-	--maxnode;
-	bitmap_zero(nodes, MAX_NUMNODES);
-	if (maxnode == 0 || !nmask)
-		return 0;
-
-	nlongs = BITS_TO_LONGS(maxnode);
-	if ((maxnode % BITS_PER_LONG) == 0)
-		endmask = ~0UL;
-	else
-		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
-
-	/* When the user specified more nodes than supported just check
-	   if the non supported part is all zero. */
-	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
-		if (nlongs > PAGE_SIZE/sizeof(long))
-			return -EINVAL;
-		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
-			unsigned long t;
-			if (get_user(t, nmask + k))
-				return -EFAULT;
-			if (k == nlongs - 1) {
-				if (t & endmask)
-					return -EINVAL;
-			} else if (t)
-				return -EINVAL;
-		}
-		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
-		endmask = ~0UL;
-	}
-
-	if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
-		return -EFAULT;
-	nodes[nlongs-1] &= endmask;
-	/* Update current mems_allowed */
-	cpuset_update_current_mems_allowed();
-	/* Ignore nodes not set in current->mems_allowed */
-	cpuset_restrict_to_mems_allowed(nodes);
-	return mpol_check_policy(mode, nodes);
+	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
-
 /* Generate a custom zonelist for the BIND policy. */
-static struct zonelist *bind_zonelist(unsigned long *nodes)
+static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
 	struct zonelist *zl;
 	int num, max, nd;
 
-	max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
+	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
 	num = 0;
-	for (nd = find_first_bit(nodes, MAX_NUMNODES);
-	     nd < MAX_NUMNODES;
-	     nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
+	for_each_node_mask(nd, *nodes) {
 		int k;
 		for (k = MAX_NR_ZONES-1; k >= 0; k--) {
 			struct zone *z = &NODE_DATA(nd)->node_zones[k];
@@ -199,17 +142,16 @@ static struct zonelist *bind_zonelist(unsigned long *nodes)
 				policy_zone = k;
 		}
 	}
-	BUG_ON(num >= max);
 	zl->zones[num] = NULL;
 	return zl;
 }
 
 /* Create a new policy */
-static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
+static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 {
 	struct mempolicy *policy;
 
-	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
+	PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 	if (mode == MPOL_DEFAULT)
 		return NULL;
 	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -218,10 +160,10 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
 	atomic_set(&policy->refcnt, 1);
 	switch (mode) {
 	case MPOL_INTERLEAVE:
-		bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
+		policy->v.nodes = *nodes;
 		break;
 	case MPOL_PREFERRED:
-		policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
+		policy->v.preferred_node = first_node(*nodes);
 		if (policy->v.preferred_node >= MAX_NUMNODES)
 			policy->v.preferred_node = -1;
 		break;
@@ -238,14 +180,14 @@ static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
 }
 
 /* Ensure all existing pages follow the policy. */
-static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
-		unsigned long addr, unsigned long end, unsigned long *nodes)
+static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pte_t *orig_pte;
 	pte_t *pte;
+	spinlock_t *ptl;
 
-	spin_lock(&mm->page_table_lock);
-	orig_pte = pte = pte_offset_map(pmd, addr);
+	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
 		unsigned long pfn;
 		unsigned int nid;
@@ -253,19 +195,20 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		if (!pte_present(*pte))
 			continue;
 		pfn = pte_pfn(*pte);
-		if (!pfn_valid(pfn))
+		if (!pfn_valid(pfn)) {
+			print_bad_pte(vma, *pte, addr);
 			continue;
+		}
 		nid = pfn_to_nid(pfn);
-		if (!test_bit(nid, nodes))
+		if (!node_isset(nid, *nodes))
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap(orig_pte);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(orig_pte, ptl);
 	return addr != end;
 }
 
-static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
-		unsigned long addr, unsigned long end, unsigned long *nodes)
+static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
+		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pmd_t *pmd;
 	unsigned long next;
@@ -275,14 +218,14 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (check_pte_range(mm, pmd, addr, next, nodes))
+		if (check_pte_range(vma, pmd, addr, next, nodes))
 			return -EIO;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
-static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
-		unsigned long addr, unsigned long end, unsigned long *nodes)
+static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
+		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pud_t *pud;
 	unsigned long next;
@@ -292,24 +235,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (check_pmd_range(mm, pud, addr, next, nodes))
+		if (check_pmd_range(vma, pud, addr, next, nodes))
 			return -EIO;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
-static inline int check_pgd_range(struct mm_struct *mm,
-		unsigned long addr, unsigned long end, unsigned long *nodes)
+static inline int check_pgd_range(struct vm_area_struct *vma,
+		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pgd_t *pgd;
 	unsigned long next;
 
-	pgd = pgd_offset(mm, addr);
+	pgd = pgd_offset(vma->vm_mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (check_pud_range(mm, pgd, addr, next, nodes))
+		if (check_pud_range(vma, pgd, addr, next, nodes))
 			return -EIO;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
@@ -318,7 +261,7 @@ static inline int check_pgd_range(struct mm_struct *mm,
 /* Step 1: check the range */
 static struct vm_area_struct *
 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
-	    unsigned long *nodes, unsigned long flags)
+	    nodemask_t *nodes, unsigned long flags)
 {
 	int err;
 	struct vm_area_struct *first, *vma, *prev;
@@ -326,6 +269,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
+	if (first->vm_flags & VM_RESERVED)
+		return ERR_PTR(-EACCES);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 		if (!vma->vm_next && vma->vm_end < end)
@@ -338,8 +283,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		endvma = end;
 		if (vma->vm_start > start)
 			start = vma->vm_start;
-		err = check_pgd_range(vma->vm_mm,
-					start, endvma, nodes);
+		err = check_pgd_range(vma, start, endvma, nodes);
 		if (err) {
 			first = ERR_PTR(err);
 			break;
@@ -393,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 	return err;
 }
 
-/* Change policy for a memory range */
-asmlinkage long sys_mbind(unsigned long start, unsigned long len,
-			unsigned long mode,
-			unsigned long __user *nmask, unsigned long maxnode,
-			unsigned flags)
+static int contextualize_policy(int mode, nodemask_t *nodes)
+{
+	if (!nodes)
+		return 0;
+
+	/* Update current mems_allowed */
+	cpuset_update_current_mems_allowed();
+	/* Ignore nodes not set in current->mems_allowed */
+	cpuset_restrict_to_mems_allowed(nodes->bits);
+	return mpol_check_policy(mode, nodes);
+}
+
+long do_mbind(unsigned long start, unsigned long len,
+		unsigned long mode, nodemask_t *nmask, unsigned long flags)
 {
 	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
 	struct mempolicy *new;
 	unsigned long end;
-	DECLARE_BITMAP(nodes, MAX_NUMNODES);
 	int err;
 
 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
@@ -418,20 +370,17 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 		return -EINVAL;
 	if (end == start)
 		return 0;
-
-	err = get_nodes(nodes, nmask, maxnode, mode);
-	if (err)
-		return err;
-
-	new = mpol_new(mode, nodes);
+	if (mpol_check_policy(mode, nmask))
+		return -EINVAL;
+	new = mpol_new(mode, nmask);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
 	PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
-		 mode,nodes[0]);
+		 mode,nodes_addr(nodes)[0]);
 
 	down_write(&mm->mmap_sem);
-	vma = check_range(mm, start, end, nodes, flags);
+	vma = check_range(mm, start, end, nmask, flags);
 	err = PTR_ERR(vma);
 	if (!IS_ERR(vma))
 		err = mbind_range(vma, start, end, new);
@@ -441,50 +390,45 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 }
 
 /* Set the process memory policy */
-asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
-			unsigned long maxnode)
+long do_set_mempolicy(int mode, nodemask_t *nodes)
 {
-	int err;
 	struct mempolicy *new;
-	DECLARE_BITMAP(nodes, MAX_NUMNODES);
 
-	if (mode < 0 || mode > MPOL_MAX)
+	if (contextualize_policy(mode, nodes))
 		return -EINVAL;
-	err = get_nodes(nodes, nmask, maxnode, mode);
-	if (err)
-		return err;
 	new = mpol_new(mode, nodes);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 	mpol_free(current->mempolicy);
 	current->mempolicy = new;
 	if (new && new->policy == MPOL_INTERLEAVE)
-		current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
+		current->il_next = first_node(new->v.nodes);
 	return 0;
 }
 
 /* Fill a zone bitmap for a policy */
-static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
+static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 {
 	int i;
 
-	bitmap_zero(nodes, MAX_NUMNODES);
+	nodes_clear(*nodes);
 	switch (p->policy) {
 	case MPOL_BIND:
 		for (i = 0; p->v.zonelist->zones[i]; i++)
-			__set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
+			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
+				*nodes);
 		break;
 	case MPOL_DEFAULT:
 		break;
 	case MPOL_INTERLEAVE:
-		bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
+		*nodes = p->v.nodes;
 		break;
 	case MPOL_PREFERRED:
 		/* or use current node instead of online map? */
 		if (p->v.preferred_node < 0)
-			bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
+			*nodes = node_online_map;
 		else
-			__set_bit(p->v.preferred_node, nodes);
+			node_set(p->v.preferred_node, *nodes);
 		break;
 	default:
 		BUG();
@@ -504,37 +448,17 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
 	return err;
 }
 
-/* Copy a kernel node mask to user space */
-static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
-		     void *nodes, unsigned nbytes)
-{
-	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
-
-	if (copy > nbytes) {
-		if (copy > PAGE_SIZE)
-			return -EINVAL;
-		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
-			return -EFAULT;
-		copy = nbytes;
-	}
-	return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
-}
-
 /* Retrieve NUMA policy */
-asmlinkage long sys_get_mempolicy(int __user *policy,
-				unsigned long __user *nmask,
-				unsigned long maxnode,
-				unsigned long addr, unsigned long flags)
+long do_get_mempolicy(int *policy, nodemask_t *nmask,
+			unsigned long addr, unsigned long flags)
 {
-	int err, pval;
+	int err;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = NULL;
 	struct mempolicy *pol = current->mempolicy;
 
 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 		return -EINVAL;
-	if (nmask != NULL && maxnode < MAX_NUMNODES)
-		return -EINVAL;
 	if (flags & MPOL_F_ADDR) {
 		down_read(&mm->mmap_sem);
 		vma = find_vma_intersection(mm, addr, addr+1);
@@ -557,31 +481,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
 			err = lookup_node(mm, addr);
 			if (err < 0)
 				goto out;
-			pval = err;
+			*policy = err;
 		} else if (pol == current->mempolicy &&
 				pol->policy == MPOL_INTERLEAVE) {
-			pval = current->il_next;
+			*policy = current->il_next;
 		} else {
 			err = -EINVAL;
 			goto out;
 		}
 	} else
-		pval = pol->policy;
+		*policy = pol->policy;
 
 	if (vma) {
 		up_read(&current->mm->mmap_sem);
 		vma = NULL;
 	}
 
-	if (policy && put_user(pval, policy))
-		return -EFAULT;
-
 	err = 0;
-	if (nmask) {
-		DECLARE_BITMAP(nodes, MAX_NUMNODES);
-		get_zonemask(pol, nodes);
-		err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
-	}
+	if (nmask)
+		get_zonemask(pol, nmask);
 
 out:
 	if (vma)
@@ -589,6 +507,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
 	return err;
 }
 
+/*
+ * User space interface with variable sized bitmaps for nodelists.
+ */
+
+/* Copy a node mask from user space. */
+static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
+		     unsigned long maxnode)
+{
+	unsigned long k;
+	unsigned long nlongs;
+	unsigned long endmask;
+
+	--maxnode;
+	nodes_clear(*nodes);
+	if (maxnode == 0 || !nmask)
+		return 0;
+
+	nlongs = BITS_TO_LONGS(maxnode);
+	if ((maxnode % BITS_PER_LONG) == 0)
+		endmask = ~0UL;
+	else
+		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
+
+	/* When the user specified more nodes than supported just check
+	   if the non supported part is all zero. */
+	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
+		if (nlongs > PAGE_SIZE/sizeof(long))
+			return -EINVAL;
+		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
+			unsigned long t;
+			if (get_user(t, nmask + k))
+				return -EFAULT;
+			if (k == nlongs - 1) {
+				if (t & endmask)
+					return -EINVAL;
+			} else if (t)
+				return -EINVAL;
+		}
+		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
+		endmask = ~0UL;
+	}
+
+	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
+		return -EFAULT;
+	nodes_addr(*nodes)[nlongs-1] &= endmask;
+	return 0;
+}
+
+/* Copy a kernel node mask to user space */
+static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
+			      nodemask_t *nodes)
+{
+	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
+	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+
+	if (copy > nbytes) {
+		if (copy > PAGE_SIZE)
+			return -EINVAL;
+		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
+			return -EFAULT;
+		copy = nbytes;
+	}
+	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
+}
+
+asmlinkage long sys_mbind(unsigned long start, unsigned long len,
+			unsigned long mode,
+			unsigned long __user *nmask, unsigned long maxnode,
+			unsigned flags)
+{
+	nodemask_t nodes;
+	int err;
+
+	err = get_nodes(&nodes, nmask, maxnode);
+	if (err)
+		return err;
+	return do_mbind(start, len, mode, &nodes, flags);
+}
+
+/* Set the process memory policy */
+asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
+		unsigned long maxnode)
+{
+	int err;
+	nodemask_t nodes;
+
+	if (mode < 0 || mode > MPOL_MAX)
+		return -EINVAL;
+	err = get_nodes(&nodes, nmask, maxnode);
+	if (err)
+		return err;
+	return do_set_mempolicy(mode, &nodes);
+}
+
+/* Retrieve NUMA policy */
+asmlinkage long sys_get_mempolicy(int __user *policy,
+			unsigned long __user *nmask,
+			unsigned long maxnode,
+			unsigned long addr, unsigned long flags)
+{
+	int err, pval;
+	nodemask_t nodes;
+
+	if (nmask != NULL && maxnode < MAX_NUMNODES)
+		return -EINVAL;
+
+	err = do_get_mempolicy(&pval, &nodes, addr, flags);
+
+	if (err)
+		return err;
+
+	if (policy && put_user(pval, policy))
+		return -EFAULT;
+
+	if (nmask)
+		err = copy_nodes_to_user(nmask, maxnode, &nodes);
+
+	return err;
+}
+
 #ifdef CONFIG_COMPAT
 
 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
@@ -649,15 +687,15 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 	long err = 0;
 	unsigned long __user *nm = NULL;
 	unsigned long nr_bits, alloc_size;
-	DECLARE_BITMAP(bm, MAX_NUMNODES);
+	nodemask_t bm;
 
 	nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 	alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 
 	if (nmask) {
-		err = compat_get_bitmap(bm, nmask, nr_bits);
+		err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
 		nm = compat_alloc_user_space(alloc_size);
-		err |= copy_to_user(nm, bm, alloc_size);
+		err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
 	}
 
 	if (err)
@@ -676,7 +714,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo
 
 	if (vma) {
 		if (vma->vm_ops && vma->vm_ops->get_policy)
-		        pol = vma->vm_ops->get_policy(vma, addr);
+			pol = vma->vm_ops->get_policy(vma, addr);
 		else if (vma->vm_policy &&
 				vma->vm_policy->policy != MPOL_DEFAULT)
 			pol = vma->vm_policy;
@@ -722,10 +760,9 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 	struct task_struct *me = current;
 
 	nid = me->il_next;
-	BUG_ON(nid >= MAX_NUMNODES);
-	next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
+	next = next_node(nid, policy->v.nodes);
 	if (next >= MAX_NUMNODES)
-		next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
+		next = first_node(policy->v.nodes);
 	me->il_next = next;
 	return nid;
 }
@@ -734,29 +771,27 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 static unsigned offset_il_node(struct mempolicy *pol,
 		struct vm_area_struct *vma, unsigned long off)
 {
-	unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
+	unsigned nnodes = nodes_weight(pol->v.nodes);
 	unsigned target = (unsigned)off % nnodes;
 	int c;
 	int nid = -1;
 
 	c = 0;
 	do {
-		nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
+		nid = next_node(nid, pol->v.nodes);
 		c++;
 	} while (c <= target);
-	BUG_ON(nid >= MAX_NUMNODES);
-	BUG_ON(!test_bit(nid, pol->v.nodes));
 	return nid;
 }
 
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */
-static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid)
+static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
+		unsigned nid)
 {
 	struct zonelist *zl;
 	struct page *page;
 
-	BUG_ON(!node_online(nid));
 	zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
 	page = __alloc_pages(gfp, order, zl);
 	if (page && page_zone(page) == zl->zones[0]) {
@@ -799,8 +834,6 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 		unsigned nid;
 		if (vma) {
 			unsigned long off;
-			BUG_ON(addr >= vma->vm_end);
-			BUG_ON(addr < vma->vm_start);
 			off = vma->vm_pgoff;
 			off += (addr - vma->vm_start) >> PAGE_SHIFT;
 			nid = offset_il_node(pol, vma, off);
@@ -878,7 +911,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	case MPOL_DEFAULT:
 		return 1;
 	case MPOL_INTERLEAVE:
-		return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
+		return nodes_equal(a->v.nodes, b->v.nodes);
 	case MPOL_PREFERRED:
 		return a->v.preferred_node == b->v.preferred_node;
 	case MPOL_BIND: {
@@ -1117,7 +1150,7 @@ int mpol_set_shared_policy(struct shared_policy *info,
 	PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
 		 vma->vm_pgoff,
 		 sz, npol? npol->policy : -1,
-		 npol ? npol->v.nodes[0] : -1);
+		 npol ? nodes_addr(npol->v.nodes)[0] : -1);
 
 	if (npol) {
 		new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -1164,14 +1197,12 @@ void __init numa_policy_init(void)
 	/* Set interleaving policy for system init. This way not all
 	   the data structures allocated at system boot end up in node zero. */
 
-	if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
-			MAX_NUMNODES) < 0)
+	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
 		printk("numa_policy_init: interleaving failed\n");
 }
 
-/* Reset policy of current process to default.
- * Assumes fs == KERNEL_DS */
+/* Reset policy of current process to default */
 void numa_default_policy(void)
 {
-	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
+	do_set_mempolicy(MPOL_DEFAULT, NULL);
 }
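
For readers unfamiliar with the nodemask_t interface this patch converts mm/mempolicy.c to, the following is an illustrative user-space sketch; it is not kernel code and not part of the patch above. It mimics the idea behind <linux/nodemask.h>: wrap a fixed-size bitmap in a struct and give it typed helpers, so callers write node_isset(nid, mask) instead of test_bit(nid, raw_bits). The MAX_NUMNODES value, the demo file name, and the pointer-based helper signatures are assumptions made for the demo; the real kernel helpers are macros and mostly take the mask by value.

/*
 * Illustrative user-space sketch of the nodemask_t idea -- NOT kernel code.
 * Build with any C99 compiler, e.g. "cc -std=c99 nodemask_demo.c".
 */
#include <stdio.h>
#include <string.h>

#define MAX_NUMNODES 64                          /* demo assumption */
#define BITS_PER_LONG (8 * sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

typedef struct {
	unsigned long bits[BITS_TO_LONGS(MAX_NUMNODES)];
} nodemask_t;

static void nodes_clear(nodemask_t *m)
{
	memset(m->bits, 0, sizeof(m->bits));
}

static void node_set(int nid, nodemask_t *m)
{
	m->bits[nid / BITS_PER_LONG] |= 1UL << (nid % BITS_PER_LONG);
}

static int node_isset(int nid, const nodemask_t *m)
{
	return (m->bits[nid / BITS_PER_LONG] >> (nid % BITS_PER_LONG)) & 1;
}

/* Returns MAX_NUMNODES when the mask is empty, like the kernel helper. */
static int first_node(const nodemask_t *m)
{
	for (int nid = 0; nid < MAX_NUMNODES; nid++)
		if (node_isset(nid, m))
			return nid;
	return MAX_NUMNODES;
}

int main(void)
{
	nodemask_t nodes;

	nodes_clear(&nodes);
	node_set(3, &nodes);
	node_set(7, &nodes);
	printf("node 3 set: %d, node 5 set: %d, first node: %d\n",
	       node_isset(3, &nodes), node_isset(5, &nodes), first_node(&nodes));
	return 0;
}

Run, this prints "node 3 set: 1, node 5 set: 0, first node: 3". The convention of first_node() returning MAX_NUMNODES for an empty mask mirrors what mpol_new() in the patch relies on when it maps preferred_node >= MAX_NUMNODES to -1.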