Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--   mm/mempolicy.c | 189
1 file changed, 164 insertions(+), 25 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 73790188b0eb..880831bd3003 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -95,6 +95,9 @@
 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)	/* Gather statistics */
 
+/* The number of pages to migrate per call to migrate_pages() */
+#define MIGRATE_CHUNK_SIZE 256
+
 static kmem_cache_t *policy_cache;
 static kmem_cache_t *sn_cache;
 
@@ -129,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 	}
 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
+
 /* Generate a custom zonelist for the BIND policy. */
 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
 	struct zonelist *zl;
-	int num, max, nd;
+	int num, max, nd, k;
 
 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
 	num = 0;
-	for_each_node_mask(nd, *nodes)
-		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
+	/* First put in the highest zones from all nodes, then all the next
+	   lower zones etc. Avoid empty zones because the memory allocator
+	   doesn't like them. If you implement node hot removal you
+	   have to fix that. */
+	for (k = policy_zone; k >= 0; k--) {
+		for_each_node_mask(nd, *nodes) {
+			struct zone *z = &NODE_DATA(nd)->node_zones[k];
+			if (z->present_pages > 0)
+				zl->zones[num++] = z;
+		}
+	}
 	zl->zones[num] = NULL;
 	return zl;
 }
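The ordering produced by the reworked bind_zonelist() is easier to see in a standalone model. The sketch below is plain userspace C, not kernel code: the node count, zone names and present_pages numbers are all invented, and the nested loops simply mirror the "walk zone types from policy_zone downwards, then every allowed node, skipping empty zones" structure of the hunk above.

/* Userspace model of the new bind_zonelist() ordering.  All data here is
 * made up for illustration; only the loop structure matches the patch. */
#include <stdio.h>

#define MAX_NR_ZONES 3          /* e.g. DMA, Normal, HighMem */
#define NR_NODES     2
static const char *zone_name[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };

/* present_pages[node][zone]: pretend node 1 has no DMA or HighMem memory */
static unsigned long present_pages[NR_NODES][MAX_NR_ZONES] = {
        { 4096, 262144, 131072 },
        {    0, 262144,      0 },
};

int main(void)
{
        int policy_zone = MAX_NR_ZONES - 1;     /* highest usable zone type */
        int k, nd, num = 0;

        /* Highest zones from all nodes first, then the next lower zones. */
        for (k = policy_zone; k >= 0; k--)
                for (nd = 0; nd < NR_NODES; nd++)
                        if (present_pages[nd][k] > 0)
                                printf("zones[%d] = node %d, %s\n",
                                       num++, nd, zone_name[k]);
        return 0;
}

With the data above the model prints node 0 HighMem, node 0 Normal, node 1 Normal, node 0 DMA; node 1's empty DMA and HighMem zones never appear, which is exactly the property the comment in the hunk calls out.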
@@ -543,24 +556,91 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 	}
 }
 
-static int swap_pages(struct list_head *pagelist)
+/*
+ * Migrate the list 'pagelist' of pages to a certain destination.
+ *
+ * Specify destination with either non-NULL vma or dest_node >= 0
+ * Return the number of pages not migrated or error code
+ */
+static int migrate_pages_to(struct list_head *pagelist,
+			struct vm_area_struct *vma, int dest)
 {
+	LIST_HEAD(newlist);
 	LIST_HEAD(moved);
 	LIST_HEAD(failed);
-	int n;
+	int err = 0;
+	int nr_pages;
+	struct page *page;
+	struct list_head *p;
 
-	n = migrate_pages(pagelist, NULL, &moved, &failed);
-	putback_lru_pages(&failed);
-	putback_lru_pages(&moved);
+redo:
+	nr_pages = 0;
+	list_for_each(p, pagelist) {
+		if (vma)
+			page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
+		else
+			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
 
-	return n;
+		if (!page) {
+			err = -ENOMEM;
+			goto out;
+		}
+		list_add(&page->lru, &newlist);
+		nr_pages++;
+		if (nr_pages > MIGRATE_CHUNK_SIZE)
+			break;
+	}
+	err = migrate_pages(pagelist, &newlist, &moved, &failed);
+
+	putback_lru_pages(&moved);	/* Call release pages instead ?? */
+
+	if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
+		goto redo;
+out:
+	/* Return leftover allocated pages */
+	while (!list_empty(&newlist)) {
+		page = list_entry(newlist.next, struct page, lru);
+		list_del(&page->lru);
+		__free_page(page);
+	}
+	list_splice(&failed, pagelist);
+	if (err < 0)
+		return err;
+
+	/* Calculate number of leftover pages */
+	nr_pages = 0;
+	list_for_each(p, pagelist)
+		nr_pages++;
+	return nr_pages;
 }
 
 /*
- * For now migrate_pages simply swaps out the pages from nodes that are in
- * the source set but not in the target set. In the future, we would
- * want a function that moves pages between the two nodesets in such
- * a way as to preserve the physical layout as much as possible.
+ * Migrate pages from one node to a target node.
+ * Returns error or the number of pages not migrated.
+ */
+int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
+{
+	nodemask_t nmask;
+	LIST_HEAD(pagelist);
+	int err = 0;
+
+	nodes_clear(nmask);
+	node_set(source, nmask);
+
+	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
+			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+
+	if (!list_empty(&pagelist)) {
+		err = migrate_pages_to(&pagelist, NULL, dest);
+		if (!list_empty(&pagelist))
+			putback_lru_pages(&pagelist);
+	}
+	return err;
+}
+
+/*
+ * Move pages between the two nodesets so as to preserve the physical
+ * layout as much as possible.
  *
  * Returns the number of page that could not be moved.
  */
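The redo loop in migrate_pages_to() caps each pass at a chunk of roughly MIGRATE_CHUNK_SIZE freshly allocated target pages and repeats until the source list is drained or progress stops, so a large migration never pins an unbounded number of new pages at once. A rough userspace model of that chunking; migrate_chunk() and its failure rate are made-up stand-ins for the real migrate_pages() call:

/* Userspace model of the chunked migration loop.  Not kernel code. */
#include <stdio.h>

#define MIGRATE_CHUNK_SIZE 256

/* Pretend migration: returns how many pages of the chunk were NOT moved. */
static int migrate_chunk(int chunk)
{
        return chunk / 64;      /* arbitrary failure rate for the model */
}

int main(void)
{
        int remaining = 1000;   /* pages still on the source list */
        int failed = 0;         /* pages that could not be moved */
        int pass = 0;

        while (remaining > 0) {
                int chunk = remaining < MIGRATE_CHUNK_SIZE ?
                            remaining : MIGRATE_CHUNK_SIZE;
                int left = migrate_chunk(chunk);

                failed += left;
                remaining -= chunk;
                printf("pass %d: tried %d, failed %d\n", ++pass, chunk, left);
        }
        printf("pages not migrated: %d\n", failed);
        return 0;
}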
@@ -568,22 +648,76 @@ int do_migrate_pages(struct mm_struct *mm,
 	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 {
 	LIST_HEAD(pagelist);
-	int count = 0;
-	nodemask_t nodes;
+	int busy = 0;
+	int err = 0;
+	nodemask_t tmp;
 
-	nodes_andnot(nodes, *from_nodes, *to_nodes);
+	down_read(&mm->mmap_sem);
 
-	down_read(&mm->mmap_sem);
-	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
-			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+	/*
+	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+	 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
+	 * bit in 'tmp', and return that <source, dest> pair for migration.
+	 * The pair of nodemasks 'to' and 'from' define the map.
+	 *
+	 * If no pair of bits is found that way, fallback to picking some
+	 * pair of 'source' and 'dest' bits that are not the same. If the
+	 * 'source' and 'dest' bits are the same, this represents a node
+	 * that will be migrating to itself, so no pages need move.
+	 *
+	 * If no bits are left in 'tmp', or if all remaining bits left
+	 * in 'tmp' correspond to the same bit in 'to', return false
+	 * (nothing left to migrate).
+	 *
+	 * This lets us pick a pair of nodes to migrate between, such that
+	 * if possible the dest node is not already occupied by some other
+	 * source node, minimizing the risk of overloading the memory on a
+	 * node that would happen if we migrated incoming memory to a node
+	 * before migrating outgoing memory source that same node.
+	 *
+	 * A single scan of tmp is sufficient. As we go, we remember the
+	 * most recent <s, d> pair that moved (s != d). If we find a pair
+	 * that not only moved, but what's better, moved to an empty slot
+	 * (d is not set in tmp), then we break out then, with that pair.
+	 * Otherwise when we finish scannng from_tmp, we at least have the
+	 * most recent <s, d> pair that moved. If we get all the way through
+	 * the scan of tmp without finding any node that moved, much less
+	 * moved to an empty node, then there is nothing left worth migrating.
+	 */
 
-	if (!list_empty(&pagelist)) {
-		count = swap_pages(&pagelist);
-		putback_lru_pages(&pagelist);
+	tmp = *from_nodes;
+	while (!nodes_empty(tmp)) {
+		int s,d;
+		int source = -1;
+		int dest = 0;
+
+		for_each_node_mask(s, tmp) {
+			d = node_remap(s, *from_nodes, *to_nodes);
+			if (s == d)
+				continue;
+
+			source = s;	/* Node moved. Memorize */
+			dest = d;
+
+			/* dest not in remaining from nodes? */
+			if (!node_isset(dest, tmp))
+				break;
+		}
+		if (source == -1)
+			break;
+
+		node_clear(source, tmp);
+		err = migrate_to_node(mm, source, dest, flags);
+		if (err > 0)
+			busy += err;
+		if (err < 0)
+			break;
 	}
 
 	up_read(&mm->mmap_sem);
-	return count;
+	if (err < 0)
+		return err;
+	return busy;
 }
 
 long do_mbind(unsigned long start, unsigned long len,
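The <source, dest> pair-selection loop is the interesting part of this hunk. Below is a self-contained userspace model of it: nodemask_t is replaced by a plain unsigned long bitmask, and node_remap_model() is a hand-rolled imitation of node_remap()'s positional remapping (Nth set bit of 'from' maps to the Nth set bit of 'to'). The from/to masks are arbitrary example values, not taken from the patch.

/* Userspace model of the pair selection in do_migrate_pages().  Not kernel
 * code; builds with GCC/Clang (__builtin_popcountl). */
#include <stdio.h>

static int nth_set_bit(unsigned long mask, int n)
{
        int bit;

        for (bit = 0; bit < (int)(8 * sizeof(mask)); bit++)
                if ((mask & (1UL << bit)) && n-- == 0)
                        return bit;
        return -1;
}

/* Positional remap, mimicking what node_remap() does for nodemasks. */
static int node_remap_model(int s, unsigned long from, unsigned long to)
{
        int n = 0, bit;

        for (bit = 0; bit < s; bit++)
                if (from & (1UL << bit))
                        n++;
        return nth_set_bit(to, n % __builtin_popcountl(to));
}

int main(void)
{
        unsigned long from = 0x0b;      /* nodes 0, 1, 3 */
        unsigned long to   = 0x34;      /* nodes 2, 4, 5 */
        unsigned long tmp  = from;

        while (tmp) {
                int s, d, source = -1, dest = 0;

                for (s = 0; s < 64; s++) {
                        if (!(tmp & (1UL << s)))
                                continue;
                        d = node_remap_model(s, from, to);
                        if (s == d)
                                continue;
                        source = s;
                        dest = d;
                        /* prefer a dest that is not itself a pending source */
                        if (!(tmp & (1UL << dest)))
                                break;
                }
                if (source == -1)
                        break;
                tmp &= ~(1UL << source);
                printf("migrate node %d -> node %d\n", source, dest);
        }
        return 0;
}

For from = {0,1,3} and to = {2,4,5} the model prints 0 -> 2, 1 -> 4, 3 -> 5, each destination chosen before it could be filled while still acting as a pending source, which is the overload-avoidance property the new comment describes.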
@@ -643,8 +777,9 @@ long do_mbind(unsigned long start, unsigned long len,
 		int nr_failed = 0;
 
 		err = mbind_range(vma, start, end, new);
+
 		if (!list_empty(&pagelist))
-			nr_failed = swap_pages(&pagelist);
+			nr_failed = migrate_pages_to(&pagelist, vma, -1);
 
 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
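This is the path reached when userspace calls mbind(2) with MPOL_MF_MOVE: pages collected earlier in do_mbind() are now handed to migrate_pages_to() instead of being swapped out, and with MPOL_MF_STRICT any leftovers turn into -EIO. A minimal caller sketch, assuming a libnuma numaif.h that exposes the MPOL_MF_* flags (link with -lnuma):

/* Userspace sketch only; error handling kept minimal. */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        size_t len = 64 * page;
        void *buf = aligned_alloc(page, len);
        unsigned long nodemask = 1UL << 0;      /* bind to node 0 */

        if (!buf)
                return 1;
        memset(buf, 0, len);                    /* fault the pages in first */

        if (mbind(buf, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
                  MPOL_MF_MOVE | MPOL_MF_STRICT) != 0)
                perror("mbind");        /* EIO if some pages could not move */

        free(buf);
        return 0;
}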
@@ -673,6 +808,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 	nodes_clear(*nodes);
 	if (maxnode == 0 || !nmask)
 		return 0;
+	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+		return -EINVAL;
 
 	nlongs = BITS_TO_LONGS(maxnode);
 	if ((maxnode % BITS_PER_LONG) == 0)
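The new bound caps the user-supplied nodemask at PAGE_SIZE * 8 bits before the kernel tries to copy it; maxnode is the argument userspace passes to mbind(2) and migrate_pages(2), and an oversized value now fails with EINVAL. A minimal sketch of a migrate_pages(2) call with a sane maxnode, assuming a libnuma that provides the wrapper (link with -lnuma):

/* Userspace sketch only.  Moves the calling process's pages off node 0
 * onto node 1; node numbers are illustrative. */
#include <numaif.h>
#include <stdio.h>

int main(void)
{
        unsigned long old_nodes = 1UL << 0;     /* move everything off node 0 */
        unsigned long new_nodes = 1UL << 1;     /* ...and onto node 1 */
        unsigned long maxnode = 8 * sizeof(unsigned long);
        long left;

        left = migrate_pages(0 /* self */, maxnode, &old_nodes, &new_nodes);
        if (left < 0)
                perror("migrate_pages");
        else
                printf("%ld pages could not be moved\n", left);
        return 0;
}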
@@ -1034,6 +1171,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 		return interleave_nodes(pol);
 }
 
+#ifdef CONFIG_HUGETLBFS
 /* Return a zonelist suitable for a huge page allocation. */
 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
 {
@@ -1047,6 +1185,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
 	}
 	return zonelist_policy(GFP_HIGHUSER, pol);
 }
+#endif
 
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */