diff options
Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r-- | mm/mempolicy.c | 270 |
1 files changed, 138 insertions, 132 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4b077ec6c005..7051fe450e96 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -183,55 +183,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) | |||
183 | return policy; | 183 | return policy; |
184 | } | 184 | } |
185 | 185 | ||
186 | /* Check if we are the only process mapping the page in question */ | ||
187 | static inline int single_mm_mapping(struct mm_struct *mm, | ||
188 | struct address_space *mapping) | ||
189 | { | ||
190 | struct vm_area_struct *vma; | ||
191 | struct prio_tree_iter iter; | ||
192 | int rc = 1; | ||
193 | |||
194 | spin_lock(&mapping->i_mmap_lock); | ||
195 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) | ||
196 | if (mm != vma->vm_mm) { | ||
197 | rc = 0; | ||
198 | goto out; | ||
199 | } | ||
200 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) | ||
201 | if (mm != vma->vm_mm) { | ||
202 | rc = 0; | ||
203 | goto out; | ||
204 | } | ||
205 | out: | ||
206 | spin_unlock(&mapping->i_mmap_lock); | ||
207 | return rc; | ||
208 | } | ||
209 | |||
210 | /* | ||
211 | * Add a page to be migrated to the pagelist | ||
212 | */ | ||
213 | static void migrate_page_add(struct vm_area_struct *vma, | ||
214 | struct page *page, struct list_head *pagelist, unsigned long flags) | ||
215 | { | ||
216 | /* | ||
217 | * Avoid migrating a page that is shared by others and not writable. | ||
218 | */ | ||
219 | if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || | ||
220 | mapping_writably_mapped(page->mapping) || | ||
221 | single_mm_mapping(vma->vm_mm, page->mapping)) { | ||
222 | int rc = isolate_lru_page(page); | ||
223 | |||
224 | if (rc == 1) | ||
225 | list_add(&page->lru, pagelist); | ||
226 | /* | ||
227 | * If the isolate attempt was not successful then we just | ||
228 | * encountered an unswappable page. Something must be wrong. | ||
229 | */ | ||
230 | WARN_ON(rc == 0); | ||
231 | } | ||
232 | } | ||
233 | |||
234 | static void gather_stats(struct page *, void *); | 186 | static void gather_stats(struct page *, void *); |
187 | static void migrate_page_add(struct vm_area_struct *vma, | ||
188 | struct page *page, struct list_head *pagelist, unsigned long flags); | ||
235 | 189 | ||
236 | /* Scan through pages checking if pages follow certain conditions. */ | 190 | /* Scan through pages checking if pages follow certain conditions. */ |
237 | static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | 191 | static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, |
@@ -440,90 +394,6 @@ static int contextualize_policy(int mode, nodemask_t *nodes) | |||
440 | return mpol_check_policy(mode, nodes); | 394 | return mpol_check_policy(mode, nodes); |
441 | } | 395 | } |
442 | 396 | ||
443 | static int swap_pages(struct list_head *pagelist) | ||
444 | { | ||
445 | LIST_HEAD(moved); | ||
446 | LIST_HEAD(failed); | ||
447 | int n; | ||
448 | |||
449 | n = migrate_pages(pagelist, NULL, &moved, &failed); | ||
450 | putback_lru_pages(&failed); | ||
451 | putback_lru_pages(&moved); | ||
452 | |||
453 | return n; | ||
454 | } | ||
455 | |||
456 | long do_mbind(unsigned long start, unsigned long len, | ||
457 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | ||
458 | { | ||
459 | struct vm_area_struct *vma; | ||
460 | struct mm_struct *mm = current->mm; | ||
461 | struct mempolicy *new; | ||
462 | unsigned long end; | ||
463 | int err; | ||
464 | LIST_HEAD(pagelist); | ||
465 | |||
466 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT | | ||
467 | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | ||
468 | || mode > MPOL_MAX) | ||
469 | return -EINVAL; | ||
470 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) | ||
471 | return -EPERM; | ||
472 | |||
473 | if (start & ~PAGE_MASK) | ||
474 | return -EINVAL; | ||
475 | |||
476 | if (mode == MPOL_DEFAULT) | ||
477 | flags &= ~MPOL_MF_STRICT; | ||
478 | |||
479 | len = (len + PAGE_SIZE - 1) & PAGE_MASK; | ||
480 | end = start + len; | ||
481 | |||
482 | if (end < start) | ||
483 | return -EINVAL; | ||
484 | if (end == start) | ||
485 | return 0; | ||
486 | |||
487 | if (mpol_check_policy(mode, nmask)) | ||
488 | return -EINVAL; | ||
489 | |||
490 | new = mpol_new(mode, nmask); | ||
491 | if (IS_ERR(new)) | ||
492 | return PTR_ERR(new); | ||
493 | |||
494 | /* | ||
495 | * If we are using the default policy then operation | ||
496 | * on discontinuous address spaces is okay after all | ||
497 | */ | ||
498 | if (!new) | ||
499 | flags |= MPOL_MF_DISCONTIG_OK; | ||
500 | |||
501 | PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, | ||
502 | mode,nodes_addr(nodes)[0]); | ||
503 | |||
504 | down_write(&mm->mmap_sem); | ||
505 | vma = check_range(mm, start, end, nmask, | ||
506 | flags | MPOL_MF_INVERT, &pagelist); | ||
507 | |||
508 | err = PTR_ERR(vma); | ||
509 | if (!IS_ERR(vma)) { | ||
510 | int nr_failed = 0; | ||
511 | |||
512 | err = mbind_range(vma, start, end, new); | ||
513 | if (!list_empty(&pagelist)) | ||
514 | nr_failed = swap_pages(&pagelist); | ||
515 | |||
516 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | ||
517 | err = -EIO; | ||
518 | } | ||
519 | if (!list_empty(&pagelist)) | ||
520 | putback_lru_pages(&pagelist); | ||
521 | |||
522 | up_write(&mm->mmap_sem); | ||
523 | mpol_free(new); | ||
524 | return err; | ||
525 | } | ||
526 | |||
527 | /* Set the process memory policy */ | 397 | /* Set the process memory policy */ |
528 | long do_set_mempolicy(int mode, nodemask_t *nodes) | 398 | long do_set_mempolicy(int mode, nodemask_t *nodes) |
529 | { | 399 | { |
@@ -644,6 +514,71 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, | |||
644 | } | 514 | } |
645 | 515 | ||
646 | /* | 516 | /* |
517 | * page migration | ||
518 | */ | ||
519 | |||
520 | /* Check if we are the only process mapping the page in question */ | ||
521 | static inline int single_mm_mapping(struct mm_struct *mm, | ||
522 | struct address_space *mapping) | ||
523 | { | ||
524 | struct vm_area_struct *vma; | ||
525 | struct prio_tree_iter iter; | ||
526 | int rc = 1; | ||
527 | |||
528 | spin_lock(&mapping->i_mmap_lock); | ||
529 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) | ||
530 | if (mm != vma->vm_mm) { | ||
531 | rc = 0; | ||
532 | goto out; | ||
533 | } | ||
534 | list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) | ||
535 | if (mm != vma->vm_mm) { | ||
536 | rc = 0; | ||
537 | goto out; | ||
538 | } | ||
539 | out: | ||
540 | spin_unlock(&mapping->i_mmap_lock); | ||
541 | return rc; | ||
542 | } | ||
543 | |||
544 | /* | ||
545 | * Add a page to be migrated to the pagelist | ||
546 | */ | ||
547 | static void migrate_page_add(struct vm_area_struct *vma, | ||
548 | struct page *page, struct list_head *pagelist, unsigned long flags) | ||
549 | { | ||
550 | /* | ||
551 | * Avoid migrating a page that is shared by others and not writable. | ||
552 | */ | ||
553 | if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || | ||
554 | mapping_writably_mapped(page->mapping) || | ||
555 | single_mm_mapping(vma->vm_mm, page->mapping)) { | ||
556 | int rc = isolate_lru_page(page); | ||
557 | |||
558 | if (rc == 1) | ||
559 | list_add(&page->lru, pagelist); | ||
560 | /* | ||
561 | * If the isolate attempt was not successful then we just | ||
562 | * encountered an unswappable page. Something must be wrong. | ||
563 | */ | ||
564 | WARN_ON(rc == 0); | ||
565 | } | ||
566 | } | ||
567 | |||
568 | static int swap_pages(struct list_head *pagelist) | ||
569 | { | ||
570 | LIST_HEAD(moved); | ||
571 | LIST_HEAD(failed); | ||
572 | int n; | ||
573 | |||
574 | n = migrate_pages(pagelist, NULL, &moved, &failed); | ||
575 | putback_lru_pages(&failed); | ||
576 | putback_lru_pages(&moved); | ||
577 | |||
578 | return n; | ||
579 | } | ||
580 | |||
581 | /* | ||
647 | * For now migrate_pages simply swaps out the pages from nodes that are in | 582 | * For now migrate_pages simply swaps out the pages from nodes that are in |
648 | * the source set but not in the target set. In the future, we would | 583 | * the source set but not in the target set. In the future, we would |
649 | * want a function that moves pages between the two nodesets in such | 584 | * want a function that moves pages between the two nodesets in such |
@@ -673,6 +608,77 @@ int do_migrate_pages(struct mm_struct *mm, | |||
673 | return count; | 608 | return count; |
674 | } | 609 | } |
675 | 610 | ||
611 | long do_mbind(unsigned long start, unsigned long len, | ||
612 | unsigned long mode, nodemask_t *nmask, unsigned long flags) | ||
613 | { | ||
614 | struct vm_area_struct *vma; | ||
615 | struct mm_struct *mm = current->mm; | ||
616 | struct mempolicy *new; | ||
617 | unsigned long end; | ||
618 | int err; | ||
619 | LIST_HEAD(pagelist); | ||
620 | |||
621 | if ((flags & ~(unsigned long)(MPOL_MF_STRICT | | ||
622 | MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) | ||
623 | || mode > MPOL_MAX) | ||
624 | return -EINVAL; | ||
625 | if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) | ||
626 | return -EPERM; | ||
627 | |||
628 | if (start & ~PAGE_MASK) | ||
629 | return -EINVAL; | ||
630 | |||
631 | if (mode == MPOL_DEFAULT) | ||
632 | flags &= ~MPOL_MF_STRICT; | ||
633 | |||
634 | len = (len + PAGE_SIZE - 1) & PAGE_MASK; | ||
635 | end = start + len; | ||
636 | |||
637 | if (end < start) | ||
638 | return -EINVAL; | ||
639 | if (end == start) | ||
640 | return 0; | ||
641 | |||
642 | if (mpol_check_policy(mode, nmask)) | ||
643 | return -EINVAL; | ||
644 | |||
645 | new = mpol_new(mode, nmask); | ||
646 | if (IS_ERR(new)) | ||
647 | return PTR_ERR(new); | ||
648 | |||
649 | /* | ||
650 | * If we are using the default policy then operation | ||
651 | * on discontinuous address spaces is okay after all | ||
652 | */ | ||
653 | if (!new) | ||
654 | flags |= MPOL_MF_DISCONTIG_OK; | ||
655 | |||
656 | PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, | ||
657 | mode,nodes_addr(nodes)[0]); | ||
658 | |||
659 | down_write(&mm->mmap_sem); | ||
660 | vma = check_range(mm, start, end, nmask, | ||
661 | flags | MPOL_MF_INVERT, &pagelist); | ||
662 | |||
663 | err = PTR_ERR(vma); | ||
664 | if (!IS_ERR(vma)) { | ||
665 | int nr_failed = 0; | ||
666 | |||
667 | err = mbind_range(vma, start, end, new); | ||
668 | if (!list_empty(&pagelist)) | ||
669 | nr_failed = swap_pages(&pagelist); | ||
670 | |||
671 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | ||
672 | err = -EIO; | ||
673 | } | ||
674 | if (!list_empty(&pagelist)) | ||
675 | putback_lru_pages(&pagelist); | ||
676 | |||
677 | up_write(&mm->mmap_sem); | ||
678 | mpol_free(new); | ||
679 | return err; | ||
680 | } | ||
681 | |||
676 | /* | 682 | /* |
677 | * User space interface with variable sized bitmaps for nodelists. | 683 | * User space interface with variable sized bitmaps for nodelists. |
678 | */ | 684 | */ |