author     Christoph Lameter <clameter@engr.sgi.com>  2005-10-29 21:16:59 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>      2005-10-30 00:40:45 -0400
commit     8bccd85ffbaf8ff1448d1235fa6594e207695531 (patch)
tree       d5ed1f3b2ba1d301c74cc0a62ed416e634c5bebb
parent     bb7e7e032d2cb8e0e9a88a2be209de5e61033b39 (diff)
[PATCH] Implement sys_* do_* layering in the memory policy layer.
- Do a separation between do_xxx and sys_xxx functions. sys_xxx functions
  take variable sized bitmaps from user space as arguments. do_xxx functions
  take fixed sized nodemask_t as arguments and may be used from inside the
  kernel. Doing so simplifies the initialization code. There is no fs =
  kernel_ds assumption anymore.

- Split up get_nodes into get_nodes (which gets the node list) and
  contextualize_policy, which restricts the nodes to those accessible to
  the task and updates cpusets.

- Add comments explaining limitations of the bind policy.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
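
For illustration, a minimal user-space sketch of the variable sized bitmap
convention that the sys_* layer parses via get_nodes(). This is not part of
the patch; it assumes __NR_set_mempolicy is wired up for the target
architecture and takes MPOL_INTERLEAVE's value from the kernel ABI headers:

    /* Interleave this process's allocations across nodes 0 and 1.
     * maxnode is the size of the bitmap in bits; get_nodes() accepts
     * masks larger than the kernel's MAX_NUMNODES as long as the
     * excess bits are all zero, and truncates to what it supports. */
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #define MPOL_INTERLEAVE 3       /* matches the kernel's mempolicy modes */

    int main(void)
    {
            unsigned long mask[2];

            memset(mask, 0, sizeof(mask));
            mask[0] = (1UL << 0) | (1UL << 1);
            if (syscall(__NR_set_mempolicy, MPOL_INTERLEAVE, mask,
                        sizeof(mask) * 8) < 0)
                    return 1;
            return 0;
    }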
-rw-r--r--  mm/mempolicy.c  276
1 file changed, 162 insertions(+), 114 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 902d4c9eccdc..123925f50f86 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2,6 +2,7 @@
  * Simple NUMA memory policy for the Linux kernel.
  *
  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
+ * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
  * Subject to the GNU Public License, version 2.
  *
  * NUMA policy allows the user to give hints in which node(s) memory should
@@ -17,13 +18,19 @@
  * offset into the backing object or offset into the mapping
  * for anonymous memory. For process policy an process counter
  * is used.
+ *
  * bind           Only allocate memory on a specific set of nodes,
  *                no fallback.
+ *                FIXME: memory is allocated starting with the first node
+ *                to the last. It would be better if bind would truly restrict
+ *                the allocation to memory nodes instead
+ *
  * preferred      Try a specific node first before normal fallback.
  *                As a special case node -1 here means do the allocation
  *                on the local CPU. This is normally identical to default,
  *                but useful to set in a VMA when you have a non default
  *                process policy.
+ *
  * default        Allocate on the local node first, or when on a VMA
  *                use the process policy. This is what Linux always did
  *                in a NUMA aware kernel and still does by, ahem, default.
@@ -113,56 +120,6 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
         }
         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
-
-/* Copy a node mask from user space. */
-static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
-                     unsigned long maxnode, int mode)
-{
-        unsigned long k;
-        unsigned long nlongs;
-        unsigned long endmask;
-
-        --maxnode;
-        nodes_clear(*nodes);
-        if (maxnode == 0 || !nmask)
-                return 0;
-
-        nlongs = BITS_TO_LONGS(maxnode);
-        if ((maxnode % BITS_PER_LONG) == 0)
-                endmask = ~0UL;
-        else
-                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
-
-        /* When the user specified more nodes than supported just check
-           if the non supported part is all zero. */
-        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
-                if (nlongs > PAGE_SIZE/sizeof(long))
-                        return -EINVAL;
-                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
-                        unsigned long t;
-                        if (get_user(t, nmask + k))
-                                return -EFAULT;
-                        if (k == nlongs - 1) {
-                                if (t & endmask)
-                                        return -EINVAL;
-                        } else if (t)
-                                return -EINVAL;
-                }
-                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
-                endmask = ~0UL;
-        }
-
-        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
-                return -EFAULT;
-        nodes_addr(*nodes)[nlongs-1] &= endmask;
-        /* Update current mems_allowed */
-        cpuset_update_current_mems_allowed();
-        /* Ignore nodes not set in current->mems_allowed */
-        /* AK: shouldn't this error out instead? */
-        cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
-        return mpol_check_policy(mode, nodes);
-}
-
 /* Generate a custom zonelist for the BIND policy. */
 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
@@ -380,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start,
         return err;
 }
 
-/* Change policy for a memory range */
-asmlinkage long sys_mbind(unsigned long start, unsigned long len,
-                          unsigned long mode,
-                          unsigned long __user *nmask, unsigned long maxnode,
-                          unsigned flags)
+static int contextualize_policy(int mode, nodemask_t *nodes)
+{
+        if (!nodes)
+                return 0;
+
+        /* Update current mems_allowed */
+        cpuset_update_current_mems_allowed();
+        /* Ignore nodes not set in current->mems_allowed */
+        cpuset_restrict_to_mems_allowed(nodes->bits);
+        return mpol_check_policy(mode, nodes);
+}
+
+long do_mbind(unsigned long start, unsigned long len,
+                unsigned long mode, nodemask_t *nmask, unsigned long flags)
 {
         struct vm_area_struct *vma;
         struct mm_struct *mm = current->mm;
         struct mempolicy *new;
         unsigned long end;
-        nodemask_t nodes;
         int err;
 
         if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
@@ -405,12 +370,9 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
                 return -EINVAL;
         if (end == start)
                 return 0;
-
-        err = get_nodes(&nodes, nmask, maxnode, mode);
-        if (err)
-                return err;
-
-        new = mpol_new(mode, &nodes);
+        if (contextualize_policy(mode, nmask))
+                return -EINVAL;
+        new = mpol_new(mode, nmask);
         if (IS_ERR(new))
                 return PTR_ERR(new);
 
@@ -418,7 +380,7 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
                 mode,nodes_addr(nodes)[0]);
 
         down_write(&mm->mmap_sem);
-        vma = check_range(mm, start, end, &nodes, flags);
+        vma = check_range(mm, start, end, nmask, flags);
         err = PTR_ERR(vma);
         if (!IS_ERR(vma))
                 err = mbind_range(vma, start, end, new);
@@ -428,19 +390,13 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 }
 
 /* Set the process memory policy */
-asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
-                unsigned long maxnode)
+long do_set_mempolicy(int mode, nodemask_t *nodes)
 {
-        int err;
         struct mempolicy *new;
-        nodemask_t nodes;
 
-        if (mode < 0 || mode > MPOL_MAX)
+        if (contextualize_policy(mode, nodes))
                 return -EINVAL;
-        err = get_nodes(&nodes, nmask, maxnode, mode);
-        if (err)
-                return err;
-        new = mpol_new(mode, &nodes);
+        new = mpol_new(mode, nodes);
         if (IS_ERR(new))
                 return PTR_ERR(new);
         mpol_free(current->mempolicy);
@@ -459,7 +415,8 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
         switch (p->policy) {
         case MPOL_BIND:
                 for (i = 0; p->v.zonelist->zones[i]; i++)
-                        node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes);
+                        node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
+                                *nodes);
                 break;
         case MPOL_DEFAULT:
                 break;
@@ -491,38 +448,17 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
         return err;
 }
 
-/* Copy a kernel node mask to user space */
-static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
-                      nodemask_t *nodes)
-{
-        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
-        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
-
-        if (copy > nbytes) {
-                if (copy > PAGE_SIZE)
-                        return -EINVAL;
-                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
-                        return -EFAULT;
-                copy = nbytes;
-        }
-        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
-}
-
 /* Retrieve NUMA policy */
-asmlinkage long sys_get_mempolicy(int __user *policy,
-                                  unsigned long __user *nmask,
-                                  unsigned long maxnode,
-                                  unsigned long addr, unsigned long flags)
+long do_get_mempolicy(int *policy, nodemask_t *nmask,
+                        unsigned long addr, unsigned long flags)
 {
-        int err, pval;
+        int err;
         struct mm_struct *mm = current->mm;
         struct vm_area_struct *vma = NULL;
         struct mempolicy *pol = current->mempolicy;
 
         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
                 return -EINVAL;
-        if (nmask != NULL && maxnode < MAX_NUMNODES)
-                return -EINVAL;
         if (flags & MPOL_F_ADDR) {
                 down_read(&mm->mmap_sem);
                 vma = find_vma_intersection(mm, addr, addr+1);
@@ -545,31 +481,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
                         err = lookup_node(mm, addr);
                         if (err < 0)
                                 goto out;
-                        pval = err;
+                        *policy = err;
                 } else if (pol == current->mempolicy &&
                                 pol->policy == MPOL_INTERLEAVE) {
-                        pval = current->il_next;
+                        *policy = current->il_next;
                 } else {
                         err = -EINVAL;
                         goto out;
                 }
         } else
-                pval = pol->policy;
+                *policy = pol->policy;
 
         if (vma) {
                 up_read(&current->mm->mmap_sem);
                 vma = NULL;
         }
 
-        if (policy && put_user(pval, policy))
-                return -EFAULT;
-
         err = 0;
-        if (nmask) {
-                nodemask_t nodes;
-                get_zonemask(pol, &nodes);
-                err = copy_nodes_to_user(nmask, maxnode, &nodes);
-        }
+        if (nmask)
+                get_zonemask(pol, nmask);
 
  out:
         if (vma)
@@ -577,6 +507,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
         return err;
 }
 
+/*
+ * User space interface with variable sized bitmaps for nodelists.
+ */
+
+/* Copy a node mask from user space. */
+static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
+                     unsigned long maxnode)
+{
+        unsigned long k;
+        unsigned long nlongs;
+        unsigned long endmask;
+
+        --maxnode;
+        nodes_clear(*nodes);
+        if (maxnode == 0 || !nmask)
+                return 0;
+
+        nlongs = BITS_TO_LONGS(maxnode);
+        if ((maxnode % BITS_PER_LONG) == 0)
+                endmask = ~0UL;
+        else
+                endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
+
+        /* When the user specified more nodes than supported just check
+           if the non supported part is all zero. */
+        if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
+                if (nlongs > PAGE_SIZE/sizeof(long))
+                        return -EINVAL;
+                for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
+                        unsigned long t;
+                        if (get_user(t, nmask + k))
+                                return -EFAULT;
+                        if (k == nlongs - 1) {
+                                if (t & endmask)
+                                        return -EINVAL;
+                        } else if (t)
+                                return -EINVAL;
+                }
+                nlongs = BITS_TO_LONGS(MAX_NUMNODES);
+                endmask = ~0UL;
+        }
+
+        if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
+                return -EFAULT;
+        nodes_addr(*nodes)[nlongs-1] &= endmask;
+        return 0;
+}
+
+/* Copy a kernel node mask to user space */
+static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
+                              nodemask_t *nodes)
+{
+        unsigned long copy = ALIGN(maxnode-1, 64) / 8;
+        const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+
+        if (copy > nbytes) {
+                if (copy > PAGE_SIZE)
+                        return -EINVAL;
+                if (clear_user((char __user *)mask + nbytes, copy - nbytes))
+                        return -EFAULT;
+                copy = nbytes;
+        }
+        return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
+}
+
+asmlinkage long sys_mbind(unsigned long start, unsigned long len,
+                        unsigned long mode,
+                        unsigned long __user *nmask, unsigned long maxnode,
+                        unsigned flags)
+{
+        nodemask_t nodes;
+        int err;
+
+        err = get_nodes(&nodes, nmask, maxnode);
+        if (err)
+                return err;
+        return do_mbind(start, len, mode, &nodes, flags);
+}
+
+/* Set the process memory policy */
+asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
+                unsigned long maxnode)
+{
+        int err;
+        nodemask_t nodes;
+
+        if (mode < 0 || mode > MPOL_MAX)
+                return -EINVAL;
+        err = get_nodes(&nodes, nmask, maxnode);
+        if (err)
+                return err;
+        return do_set_mempolicy(mode, &nodes);
+}
+
+/* Retrieve NUMA policy */
+asmlinkage long sys_get_mempolicy(int __user *policy,
+                                unsigned long __user *nmask,
+                                unsigned long maxnode,
+                                unsigned long addr, unsigned long flags)
+{
+        int err, pval;
+        nodemask_t nodes;
+
+        if (nmask != NULL && maxnode < MAX_NUMNODES)
+                return -EINVAL;
+
+        err = do_get_mempolicy(&pval, &nodes, addr, flags);
+
+        if (err)
+                return err;
+
+        if (policy && put_user(pval, policy))
+                return -EFAULT;
+
+        if (nmask)
+                err = copy_nodes_to_user(nmask, maxnode, &nodes);
+
+        return err;
+}
+
 #ifdef CONFIG_COMPAT
 
 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
@@ -664,7 +714,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo
 
         if (vma) {
                 if (vma->vm_ops && vma->vm_ops->get_policy)
-                        pol = vma->vm_ops->get_policy(vma, addr);
+                        pol = vma->vm_ops->get_policy(vma, addr);
                 else if (vma->vm_policy &&
                                 vma->vm_policy->policy != MPOL_DEFAULT)
                         pol = vma->vm_policy;
@@ -1147,14 +1197,12 @@ void __init numa_policy_init(void)
         /* Set interleaving policy for system init. This way not all
            the data structures allocated at system boot end up in node zero. */
 
-        if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
-                        MAX_NUMNODES) < 0)
+        if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
                 printk("numa_policy_init: interleaving failed\n");
 }
 
-/* Reset policy of current process to default.
- * Assumes fs == KERNEL_DS */
+/* Reset policy of current process to default */
 void numa_default_policy(void)
 {
-        sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
+        do_set_mempolicy(MPOL_DEFAULT, NULL);
 }
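
The numa_policy_init() hunk above is where the removed fs == KERNEL_DS
assumption actually lived: the old code pushed a kernel pointer through the
user-space syscall and relied on the address limit being set to KERNEL_DS so
that copy_from_user() in get_nodes() would accept it. A schematic
before/after of that call site (restating the hunk, nothing beyond it):

    /* Before: kernel caller routed through the syscall entry point,
     * legal only while fs == KERNEL_DS let copy_from_user() take a
     * kernel pointer for the node bitmap. */
    sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
                    MAX_NUMNODES);

    /* After: kernel caller hands a nodemask_t directly to the do_*
     * layer; no user copy and no address-limit assumption. */
    do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map);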