diff options
author | Christoph Lameter <clameter@sgi.com> | 2006-02-01 06:05:40 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-02-01 11:53:16 -0500 |
commit | 7e2ab150d1b3b286a4c864c60a549b2601777b63 (patch) | |
tree | 9d8f4f3af382a043ada81f75c324e76dff9f0043 | |
parent | a3351e525e4768c29aa5d22ef59b5b38e0361e53 (diff) |
[PATCH] Direct Migration V9: upgrade MPOL_MF_MOVE and sys_migrate_pages()
Modify policy layer to support direct page migration
- Add migrate_pages_to() allowing the migration of a list of pages to a
specified node or to a vma with a specific allocation policy, in sets of
MIGRATE_CHUNK_SIZE pages
- Modify do_migrate_pages() to do a staged move of pages from the source
nodes to the target nodes.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r-- | mm/mempolicy.c | 167 |
1 files changed, 146 insertions, 21 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 73790188b0eb..27da6d5c77ba 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c | |||
@@ -95,6 +95,9 @@ | |||
95 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ | 95 | #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ |
96 | #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ | 96 | #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ |
97 | 97 | ||
98 | /* The number of pages to migrate per call to migrate_pages() */ | ||
99 | #define MIGRATE_CHUNK_SIZE 256 | ||
100 | |||
98 | static kmem_cache_t *policy_cache; | 101 | static kmem_cache_t *policy_cache; |
99 | static kmem_cache_t *sn_cache; | 102 | static kmem_cache_t *sn_cache; |
100 | 103 | ||
@@ -543,24 +546,91 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, | |||
543 | } | 546 | } |
544 | } | 547 | } |
545 | 548 | ||
546 | static int swap_pages(struct list_head *pagelist) | 549 | /* |
550 | * Migrate the list 'pagelist' of pages to a certain destination. | ||
551 | * | ||
552 | * Specify destination with either a non-NULL vma or dest >= 0 | ||
553 | * Return the number of pages not migrated or error code | ||
554 | */ | ||
555 | static int migrate_pages_to(struct list_head *pagelist, | ||
556 | struct vm_area_struct *vma, int dest) | ||
547 | { | 557 | { |
558 | LIST_HEAD(newlist); | ||
548 | LIST_HEAD(moved); | 559 | LIST_HEAD(moved); |
549 | LIST_HEAD(failed); | 560 | LIST_HEAD(failed); |
550 | int n; | 561 | int err = 0; |
562 | int nr_pages; | ||
563 | struct page *page; | ||
564 | struct list_head *p; | ||
551 | 565 | ||
552 | n = migrate_pages(pagelist, NULL, &moved, &failed); | 566 | redo: |
553 | putback_lru_pages(&failed); | 567 | nr_pages = 0; |
554 | putback_lru_pages(&moved); | 568 | list_for_each(p, pagelist) { |
569 | if (vma) | ||
570 | page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start); | ||
571 | else | ||
572 | page = alloc_pages_node(dest, GFP_HIGHUSER, 0); | ||
555 | 573 | ||
556 | return n; | 574 | if (!page) { |
575 | err = -ENOMEM; | ||
576 | goto out; | ||
577 | } | ||
578 | list_add(&page->lru, &newlist); | ||
579 | nr_pages++; | ||
580 | if (nr_pages > MIGRATE_CHUNK_SIZE); | ||
581 | break; | ||
582 | } | ||
583 | err = migrate_pages(pagelist, &newlist, &moved, &failed); | ||
584 | |||
585 | putback_lru_pages(&moved); /* Call release pages instead ?? */ | ||
586 | |||
587 | if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) | ||
588 | goto redo; | ||
589 | out: | ||
590 | /* Return leftover allocated pages */ | ||
591 | while (!list_empty(&newlist)) { | ||
592 | page = list_entry(newlist.next, struct page, lru); | ||
593 | list_del(&page->lru); | ||
594 | __free_page(page); | ||
595 | } | ||
596 | list_splice(&failed, pagelist); | ||
597 | if (err < 0) | ||
598 | return err; | ||
599 | |||
600 | /* Calculate number of leftover pages */ | ||
601 | nr_pages = 0; | ||
602 | list_for_each(p, pagelist) | ||
603 | nr_pages++; | ||
604 | return nr_pages; | ||
605 | } | ||
606 | |||
607 | /* | ||
608 | * Migrate pages from one node to a target node. | ||
609 | * Returns error or the number of pages not migrated. | ||
610 | */ | ||
611 | int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) | ||
612 | { | ||
613 | nodemask_t nmask; | ||
614 | LIST_HEAD(pagelist); | ||
615 | int err = 0; | ||
616 | |||
617 | nodes_clear(nmask); | ||
618 | node_set(source, nmask); | ||
619 | |||
620 | check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, | ||
621 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | ||
622 | |||
623 | if (!list_empty(&pagelist)) { | ||
624 | err = migrate_pages_to(&pagelist, NULL, dest); | ||
625 | if (!list_empty(&pagelist)) | ||
626 | putback_lru_pages(&pagelist); | ||
627 | } | ||
628 | return err; | ||
557 | } | 629 | } |
558 | 630 | ||
559 | /* | 631 | /* |
560 | * For now migrate_pages simply swaps out the pages from nodes that are in | 632 | * Move pages between the two nodesets so as to preserve the physical |
561 | * the source set but not in the target set. In the future, we would | 633 | * layout as much as possible. |
562 | * want a function that moves pages between the two nodesets in such | ||
563 | * a way as to preserve the physical layout as much as possible. | ||
564 | * | 634 | * |
565 | * Returns the number of pages that could not be moved. | 635 | * Returns the number of pages that could not be moved. |
566 | */ | 636 | */ |
@@ -568,22 +638,76 @@ int do_migrate_pages(struct mm_struct *mm, | |||
568 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) | 638 | const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) |
569 | { | 639 | { |
570 | LIST_HEAD(pagelist); | 640 | LIST_HEAD(pagelist); |
571 | int count = 0; | 641 | int busy = 0; |
572 | nodemask_t nodes; | 642 | int err = 0; |
643 | nodemask_t tmp; | ||
573 | 644 | ||
574 | nodes_andnot(nodes, *from_nodes, *to_nodes); | 645 | down_read(&mm->mmap_sem); |
575 | 646 | ||
576 | down_read(&mm->mmap_sem); | 647 | /* |
577 | check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, | 648 | * Find a 'source' bit set in 'tmp' whose corresponding 'dest' |
578 | flags | MPOL_MF_DISCONTIG_OK, &pagelist); | 649 | * bit in 'to' is not also set in 'tmp'. Clear the found 'source' |
650 | * bit in 'tmp', and return that <source, dest> pair for migration. | ||
651 | * The pair of nodemasks 'to' and 'from' define the map. | ||
652 | * | ||
653 | * If no pair of bits is found that way, fallback to picking some | ||
654 | * pair of 'source' and 'dest' bits that are not the same. If the | ||
655 | * 'source' and 'dest' bits are the same, this represents a node | ||
656 | * that will be migrating to itself, so no pages need move. | ||
657 | * | ||
658 | * If no bits are left in 'tmp', or if all remaining bits left | ||
659 | * in 'tmp' correspond to the same bit in 'to', return false | ||
660 | * (nothing left to migrate). | ||
661 | * | ||
662 | * This lets us pick a pair of nodes to migrate between, such that | ||
663 | * if possible the dest node is not already occupied by some other | ||
664 | * source node, minimizing the risk of overloading the memory on a | ||
665 | * node that would happen if we migrated incoming memory to a node | ||
666 | * before migrating outgoing memory from that same node. | ||
667 | * | ||
668 | * A single scan of tmp is sufficient. As we go, we remember the | ||
669 | * most recent <s, d> pair that moved (s != d). If we find a pair | ||
670 | * that not only moved, but what's better, moved to an empty slot | ||
671 | * (d is not set in tmp), then we break out then, with that pair. | ||
672 | * Otherwise when we finish scanning tmp, we at least have the | ||
673 | * most recent <s, d> pair that moved. If we get all the way through | ||
674 | * the scan of tmp without finding any node that moved, much less | ||
675 | * moved to an empty node, then there is nothing left worth migrating. | ||
676 | */ | ||
579 | 677 | ||
580 | if (!list_empty(&pagelist)) { | 678 | tmp = *from_nodes; |
581 | count = swap_pages(&pagelist); | 679 | while (!nodes_empty(tmp)) { |
582 | putback_lru_pages(&pagelist); | 680 | int s,d; |
681 | int source = -1; | ||
682 | int dest = 0; | ||
683 | |||
684 | for_each_node_mask(s, tmp) { | ||
685 | d = node_remap(s, *from_nodes, *to_nodes); | ||
686 | if (s == d) | ||
687 | continue; | ||
688 | |||
689 | source = s; /* Node moved. Memorize */ | ||
690 | dest = d; | ||
691 | |||
692 | /* dest not in remaining from nodes? */ | ||
693 | if (!node_isset(dest, tmp)) | ||
694 | break; | ||
695 | } | ||
696 | if (source == -1) | ||
697 | break; | ||
698 | |||
699 | node_clear(source, tmp); | ||
700 | err = migrate_to_node(mm, source, dest, flags); | ||
701 | if (err > 0) | ||
702 | busy += err; | ||
703 | if (err < 0) | ||
704 | break; | ||
583 | } | 705 | } |
584 | 706 | ||
585 | up_read(&mm->mmap_sem); | 707 | up_read(&mm->mmap_sem); |
586 | return count; | 708 | if (err < 0) |
709 | return err; | ||
710 | return busy; | ||
587 | } | 711 | } |
588 | 712 | ||
589 | long do_mbind(unsigned long start, unsigned long len, | 713 | long do_mbind(unsigned long start, unsigned long len, |
@@ -643,8 +767,9 @@ long do_mbind(unsigned long start, unsigned long len, | |||
643 | int nr_failed = 0; | 767 | int nr_failed = 0; |
644 | 768 | ||
645 | err = mbind_range(vma, start, end, new); | 769 | err = mbind_range(vma, start, end, new); |
770 | |||
646 | if (!list_empty(&pagelist)) | 771 | if (!list_empty(&pagelist)) |
647 | nr_failed = swap_pages(&pagelist); | 772 | nr_failed = migrate_pages_to(&pagelist, vma, -1); |
648 | 773 | ||
649 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) | 774 | if (!err && nr_failed && (flags & MPOL_MF_STRICT)) |
650 | err = -EIO; | 775 | err = -EIO; |