author		Christoph Lameter <clameter@engr.sgi.com>	2005-10-29 21:16:59 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>		2005-10-30 00:40:45 -0400
commit		8bccd85ffbaf8ff1448d1235fa6594e207695531
tree		d5ed1f3b2ba1d301c74cc0a62ed416e634c5bebb
parent		bb7e7e032d2cb8e0e9a88a2be209de5e61033b39
[PATCH] Implement sys_* do_* layering in the memory policy layer.
- Separate the do_xxx and sys_xxx functions. The sys_xxx functions take
  variable-sized bitmaps from user space as arguments; the do_xxx functions
  take fixed-size nodemask_t arguments and may be called from inside the
  kernel. This simplifies the initialization code and removes the
  fs == KERNEL_DS assumption.
- Split up get_nodes into get_nodes (which gets the node list) and
  contextualize_policy (which restricts the nodes to those accessible
  to the task and updates cpusets).
- Add comments explaining the limitations of the bind policy.
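
For orientation, a minimal sketch of the resulting call layering (not part of
the patch; condensed from the functions in the diff below). User space enters
through a sys_* wrapper that copies the variable-sized bitmap, while in-kernel
callers hand a nodemask_t straight to the do_* worker:

	/* user-space entry: copy the variable-sized bitmap, then delegate */
	asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
			unsigned long maxnode)
	{
		nodemask_t nodes;
		int err;

		if (mode < 0 || mode > MPOL_MAX)
			return -EINVAL;
		err = get_nodes(&nodes, nmask, maxnode);	/* copy_from_user */
		if (err)
			return err;
		return do_set_mempolicy(mode, &nodes);		/* kernel-side worker */
	}

	/* in-kernel caller at boot: no user copy, no set_fs(KERNEL_DS) needed */
	do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map);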
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--	mm/mempolicy.c	276
1 files changed, 162 insertions(+), 114 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 902d4c9eccdc..123925f50f86 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2,6 +2,7 @@
  * Simple NUMA memory policy for the Linux kernel.
  *
  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
+ * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
  * Subject to the GNU Public License, version 2.
  *
  * NUMA policy allows the user to give hints in which node(s) memory should
@@ -17,13 +18,19 @@
  * offset into the backing object or offset into the mapping
  * for anonymous memory. For process policy an process counter
  * is used.
+ *
  * bind           Only allocate memory on a specific set of nodes,
  *                no fallback.
+ *                FIXME: memory is allocated starting with the first node
+ *                to the last. It would be better if bind would truly restrict
+ *                the allocation to memory nodes instead
+ *
  * preferred      Try a specific node first before normal fallback.
  *                As a special case node -1 here means do the allocation
  *                on the local CPU. This is normally identical to default,
  *                but useful to set in a VMA when you have a non default
  *                process policy.
+ *
  * default        Allocate on the local node first, or when on a VMA
  *                use the process policy. This is what Linux always did
  *                in a NUMA aware kernel and still does by, ahem, default.
@@ -113,56 +120,6 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 	}
 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
-
-/* Copy a node mask from user space. */
-static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
-		     unsigned long maxnode, int mode)
-{
-	unsigned long k;
-	unsigned long nlongs;
-	unsigned long endmask;
-
-	--maxnode;
-	nodes_clear(*nodes);
-	if (maxnode == 0 || !nmask)
-		return 0;
-
-	nlongs = BITS_TO_LONGS(maxnode);
-	if ((maxnode % BITS_PER_LONG) == 0)
-		endmask = ~0UL;
-	else
-		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
-
-	/* When the user specified more nodes than supported just check
-	   if the non supported part is all zero. */
-	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
-		if (nlongs > PAGE_SIZE/sizeof(long))
-			return -EINVAL;
-		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
-			unsigned long t;
-			if (get_user(t, nmask + k))
-				return -EFAULT;
-			if (k == nlongs - 1) {
-				if (t & endmask)
-					return -EINVAL;
-			} else if (t)
-				return -EINVAL;
-		}
-		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
-		endmask = ~0UL;
-	}
-
-	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
-		return -EFAULT;
-	nodes_addr(*nodes)[nlongs-1] &= endmask;
-	/* Update current mems_allowed */
-	cpuset_update_current_mems_allowed();
-	/* Ignore nodes not set in current->mems_allowed */
-	/* AK: shouldn't this error out instead? */
-	cpuset_restrict_to_mems_allowed(nodes_addr(*nodes));
-	return mpol_check_policy(mode, nodes);
-}
-
 /* Generate a custom zonelist for the BIND policy. */
 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
@@ -380,17 +337,25 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 	return err;
 }
 
-/* Change policy for a memory range */
-asmlinkage long sys_mbind(unsigned long start, unsigned long len,
-			unsigned long mode,
-			unsigned long __user *nmask, unsigned long maxnode,
-			unsigned flags)
+static int contextualize_policy(int mode, nodemask_t *nodes)
+{
+	if (!nodes)
+		return 0;
+
+	/* Update current mems_allowed */
+	cpuset_update_current_mems_allowed();
+	/* Ignore nodes not set in current->mems_allowed */
+	cpuset_restrict_to_mems_allowed(nodes->bits);
+	return mpol_check_policy(mode, nodes);
+}
+
+long do_mbind(unsigned long start, unsigned long len,
+		unsigned long mode, nodemask_t *nmask, unsigned long flags)
 {
 	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
 	struct mempolicy *new;
 	unsigned long end;
-	nodemask_t nodes;
 	int err;
 
 	if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
@@ -405,12 +370,9 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 		return -EINVAL;
 	if (end == start)
 		return 0;
-
-	err = get_nodes(&nodes, nmask, maxnode, mode);
-	if (err)
-		return err;
-
-	new = mpol_new(mode, &nodes);
+	if (contextualize_policy(mode, nmask))
+		return -EINVAL;
+	new = mpol_new(mode, nmask);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 
@@ -418,7 +380,7 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 		mode,nodes_addr(nodes)[0]);
 
 	down_write(&mm->mmap_sem);
-	vma = check_range(mm, start, end, &nodes, flags);
+	vma = check_range(mm, start, end, nmask, flags);
 	err = PTR_ERR(vma);
 	if (!IS_ERR(vma))
 		err = mbind_range(vma, start, end, new);
@@ -428,19 +390,13 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 }
 
 /* Set the process memory policy */
-asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
-		unsigned long maxnode)
+long do_set_mempolicy(int mode, nodemask_t *nodes)
 {
-	int err;
 	struct mempolicy *new;
-	nodemask_t nodes;
 
-	if (mode < 0 || mode > MPOL_MAX)
+	if (contextualize_policy(mode, nodes))
 		return -EINVAL;
-	err = get_nodes(&nodes, nmask, maxnode, mode);
-	if (err)
-		return err;
-	new = mpol_new(mode, &nodes);
+	new = mpol_new(mode, nodes);
 	if (IS_ERR(new))
 		return PTR_ERR(new);
 	mpol_free(current->mempolicy);
@@ -459,7 +415,8 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 	switch (p->policy) {
 	case MPOL_BIND:
 		for (i = 0; p->v.zonelist->zones[i]; i++)
-			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, *nodes);
+			node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
+				*nodes);
 		break;
 	case MPOL_DEFAULT:
 		break;
@@ -491,38 +448,17 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
 	return err;
 }
 
-/* Copy a kernel node mask to user space */
-static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
-			      nodemask_t *nodes)
-{
-	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
-	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
-
-	if (copy > nbytes) {
-		if (copy > PAGE_SIZE)
-			return -EINVAL;
-		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
-			return -EFAULT;
-		copy = nbytes;
-	}
-	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
-}
-
 /* Retrieve NUMA policy */
-asmlinkage long sys_get_mempolicy(int __user *policy,
-			unsigned long __user *nmask,
-			unsigned long maxnode,
-			unsigned long addr, unsigned long flags)
+long do_get_mempolicy(int *policy, nodemask_t *nmask,
+			unsigned long addr, unsigned long flags)
 {
-	int err, pval;
+	int err;
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma = NULL;
 	struct mempolicy *pol = current->mempolicy;
 
 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 		return -EINVAL;
-	if (nmask != NULL && maxnode < MAX_NUMNODES)
-		return -EINVAL;
 	if (flags & MPOL_F_ADDR) {
 		down_read(&mm->mmap_sem);
 		vma = find_vma_intersection(mm, addr, addr+1);
@@ -545,31 +481,25 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
 			err = lookup_node(mm, addr);
 			if (err < 0)
 				goto out;
-			pval = err;
+			*policy = err;
 		} else if (pol == current->mempolicy &&
 				pol->policy == MPOL_INTERLEAVE) {
-			pval = current->il_next;
+			*policy = current->il_next;
 		} else {
 			err = -EINVAL;
 			goto out;
 		}
 	} else
-		pval = pol->policy;
+		*policy = pol->policy;
 
 	if (vma) {
 		up_read(&current->mm->mmap_sem);
 		vma = NULL;
 	}
 
-	if (policy && put_user(pval, policy))
-		return -EFAULT;
-
 	err = 0;
-	if (nmask) {
-		nodemask_t nodes;
-		get_zonemask(pol, &nodes);
-		err = copy_nodes_to_user(nmask, maxnode, &nodes);
-	}
+	if (nmask)
+		get_zonemask(pol, nmask);
 
 out:
 	if (vma)
@@ -577,6 +507,126 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
 	return err;
 }
 
+/*
+ * User space interface with variable sized bitmaps for nodelists.
+ */
+
+/* Copy a node mask from user space. */
+static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
+		     unsigned long maxnode)
+{
+	unsigned long k;
+	unsigned long nlongs;
+	unsigned long endmask;
+
+	--maxnode;
+	nodes_clear(*nodes);
+	if (maxnode == 0 || !nmask)
+		return 0;
+
+	nlongs = BITS_TO_LONGS(maxnode);
+	if ((maxnode % BITS_PER_LONG) == 0)
+		endmask = ~0UL;
+	else
+		endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
+
+	/* When the user specified more nodes than supported just check
+	   if the non supported part is all zero. */
+	if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
+		if (nlongs > PAGE_SIZE/sizeof(long))
+			return -EINVAL;
+		for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
+			unsigned long t;
+			if (get_user(t, nmask + k))
+				return -EFAULT;
+			if (k == nlongs - 1) {
+				if (t & endmask)
+					return -EINVAL;
+			} else if (t)
+				return -EINVAL;
+		}
+		nlongs = BITS_TO_LONGS(MAX_NUMNODES);
+		endmask = ~0UL;
+	}
+
+	if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
+		return -EFAULT;
+	nodes_addr(*nodes)[nlongs-1] &= endmask;
+	return 0;
+}
+
+/* Copy a kernel node mask to user space */
+static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
+			      nodemask_t *nodes)
+{
+	unsigned long copy = ALIGN(maxnode-1, 64) / 8;
+	const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+
+	if (copy > nbytes) {
+		if (copy > PAGE_SIZE)
+			return -EINVAL;
+		if (clear_user((char __user *)mask + nbytes, copy - nbytes))
+			return -EFAULT;
+		copy = nbytes;
+	}
+	return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
+}
+
+asmlinkage long sys_mbind(unsigned long start, unsigned long len,
+			unsigned long mode,
+			unsigned long __user *nmask, unsigned long maxnode,
+			unsigned flags)
+{
+	nodemask_t nodes;
+	int err;
+
+	err = get_nodes(&nodes, nmask, maxnode);
+	if (err)
+		return err;
+	return do_mbind(start, len, mode, &nodes, flags);
+}
+
+/* Set the process memory policy */
+asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
+		unsigned long maxnode)
+{
+	int err;
+	nodemask_t nodes;
+
+	if (mode < 0 || mode > MPOL_MAX)
+		return -EINVAL;
+	err = get_nodes(&nodes, nmask, maxnode);
+	if (err)
+		return err;
+	return do_set_mempolicy(mode, &nodes);
+}
+
+/* Retrieve NUMA policy */
+asmlinkage long sys_get_mempolicy(int __user *policy,
+			unsigned long __user *nmask,
+			unsigned long maxnode,
+			unsigned long addr, unsigned long flags)
+{
+	int err, pval;
+	nodemask_t nodes;
+
+	if (nmask != NULL && maxnode < MAX_NUMNODES)
+		return -EINVAL;
+
+	err = do_get_mempolicy(&pval, &nodes, addr, flags);
+
+	if (err)
+		return err;
+
+	if (policy && put_user(pval, policy))
+		return -EFAULT;
+
+	if (nmask)
+		err = copy_nodes_to_user(nmask, maxnode, &nodes);
+
+	return err;
+}
+
 #ifdef CONFIG_COMPAT
 
 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
@@ -664,7 +714,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo
 
 	if (vma) {
 		if (vma->vm_ops && vma->vm_ops->get_policy)
-			pol = vma->vm_ops->get_policy(vma, addr);
+			pol = vma->vm_ops->get_policy(vma, addr);
 		else if (vma->vm_policy &&
 				vma->vm_policy->policy != MPOL_DEFAULT)
 			pol = vma->vm_policy;
@@ -1147,14 +1197,12 @@ void __init numa_policy_init(void)
 	/* Set interleaving policy for system init. This way not all
 	   the data structures allocated at system boot end up in node zero. */
 
-	if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
-							MAX_NUMNODES) < 0)
+	if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
 		printk("numa_policy_init: interleaving failed\n");
 }
 
-/* Reset policy of current process to default.
- * Assumes fs == KERNEL_DS */
+/* Reset policy of current process to default */
 void numa_default_policy(void)
 {
-	sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
+	do_set_mempolicy(MPOL_DEFAULT, NULL);
 }