Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--	mm/mempolicy.c	189
1 file changed, 164 insertions, 25 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 73790188b0eb..880831bd3003 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -95,6 +95,9 @@
 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)	/* Invert check for nodemask */
 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)	/* Gather statistics */
 
+/* The number of pages to migrate per call to migrate_pages() */
+#define MIGRATE_CHUNK_SIZE 256
+
 static kmem_cache_t *policy_cache;
 static kmem_cache_t *sn_cache;
 
@@ -129,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 	}
 	return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 }
+
 /* Generate a custom zonelist for the BIND policy. */
 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 {
 	struct zonelist *zl;
-	int num, max, nd;
+	int num, max, nd, k;
 
 	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-	zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 	if (!zl)
 		return NULL;
 	num = 0;
-	for_each_node_mask(nd, *nodes)
-		zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
+	/* First put in the highest zones from all nodes, then all the next
+	   lower zones etc. Avoid empty zones because the memory allocator
+	   doesn't like them. If you implement node hot removal you
+	   have to fix that. */
+	for (k = policy_zone; k >= 0; k--) {
+		for_each_node_mask(nd, *nodes) {
+			struct zone *z = &NODE_DATA(nd)->node_zones[k];
+			if (z->present_pages > 0)
+				zl->zones[num++] = z;
+		}
+	}
 	zl->zones[num] = NULL;
 	return zl;
 }
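
The reordered loop above builds the MPOL_BIND fallback list breadth-first across nodes: the highest zones from every allowed node first, then the next lower zone level, skipping zones with no memory. A minimal user-space sketch of that ordering (illustration only, not kernel code; the node/zone layout and present_pages figures below are invented for the example):

/* Prints the fallback order the new bind_zonelist() loop would produce
 * for two mock nodes; node 1 has only Normal memory here, so its empty
 * DMA and HighMem zones are skipped just like in the kernel loop. */
#include <stdio.h>

#define MAX_NR_ZONES 3			/* assume DMA=0, Normal=1, HighMem=2 */
#define NR_NODES     2

static const char *zone_name[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };

/* present_pages[node][zone] - invented values for the example */
static unsigned long present_pages[NR_NODES][MAX_NR_ZONES] = {
	{ 4096, 262144, 131072 },
	{ 0,    524288, 0      },
};

int main(void)
{
	int policy_zone = MAX_NR_ZONES - 1;	/* highest usable zone */
	int k, nd;

	/* Highest zones from all nodes first, then the next lower level. */
	for (k = policy_zone; k >= 0; k--)
		for (nd = 0; nd < NR_NODES; nd++)
			if (present_pages[nd][k] > 0)
				printf("node %d zone %s\n", nd, zone_name[k]);
	return 0;
}
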
@@ -543,24 +556,91 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 	}
 }
 
-static int swap_pages(struct list_head *pagelist)
+/*
+ * Migrate the list 'pagelist' of pages to a certain destination.
+ *
+ * Specify destination with either non-NULL vma or dest_node >= 0
+ * Return the number of pages not migrated or error code
+ */
+static int migrate_pages_to(struct list_head *pagelist,
+			struct vm_area_struct *vma, int dest)
 {
+	LIST_HEAD(newlist);
 	LIST_HEAD(moved);
 	LIST_HEAD(failed);
-	int n;
+	int err = 0;
+	int nr_pages;
+	struct page *page;
+	struct list_head *p;
 
-	n = migrate_pages(pagelist, NULL, &moved, &failed);
-	putback_lru_pages(&failed);
-	putback_lru_pages(&moved);
+redo:
+	nr_pages = 0;
+	list_for_each(p, pagelist) {
+		if (vma)
+			page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
+		else
+			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
 
-	return n;
+		if (!page) {
+			err = -ENOMEM;
+			goto out;
+		}
+		list_add(&page->lru, &newlist);
+		nr_pages++;
+		if (nr_pages > MIGRATE_CHUNK_SIZE)
+			break;
+	}
+	err = migrate_pages(pagelist, &newlist, &moved, &failed);
+
+	putback_lru_pages(&moved);	/* Call release pages instead ?? */
+
+	if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
+		goto redo;
+out:
+	/* Return leftover allocated pages */
+	while (!list_empty(&newlist)) {
+		page = list_entry(newlist.next, struct page, lru);
+		list_del(&page->lru);
+		__free_page(page);
+	}
+	list_splice(&failed, pagelist);
+	if (err < 0)
+		return err;
+
+	/* Calculate number of leftover pages */
+	nr_pages = 0;
+	list_for_each(p, pagelist)
+		nr_pages++;
+	return nr_pages;
 }
 
 /*
- * For now migrate_pages simply swaps out the pages from nodes that are in
- * the source set but not in the target set. In the future, we would
- * want a function that moves pages between the two nodesets in such
- * a way as to preserve the physical layout as much as possible.
+ * Migrate pages from one node to a target node.
+ * Returns error or the number of pages not migrated.
+ */
+int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
+{
+	nodemask_t nmask;
+	LIST_HEAD(pagelist);
+	int err = 0;
+
+	nodes_clear(nmask);
+	node_set(source, nmask);
+
+	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
+			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+
+	if (!list_empty(&pagelist)) {
+		err = migrate_pages_to(&pagelist, NULL, dest);
+		if (!list_empty(&pagelist))
+			putback_lru_pages(&pagelist);
+	}
+	return err;
+}
+
+/*
+ * Move pages between the two nodesets so as to preserve the physical
+ * layout as much as possible.
  *
  * Returns the number of page that could not be moved.
  */
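
migrate_to_node() above is the per-node worker that do_migrate_pages() calls for each <source, dest> pair, and do_migrate_pages() in turn backs the migrate_pages() system call. A minimal user-space sketch of how this path gets exercised (assumes __NR_migrate_pages is defined for the target architecture; glibc has no wrapper for this syscall, so the raw syscall - or libnuma - is needed, and error handling is kept to a perror):

/* Ask the kernel to move the calling process's pages from node 0 to
 * node 1.  The nodemasks are single unsigned longs, so maxnode is the
 * number of bits in one long. */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	unsigned long old_nodes = 1UL << 0;	/* source: node 0 */
	unsigned long new_nodes = 1UL << 1;	/* destination: node 1 */
	unsigned long maxnode = 8 * sizeof(unsigned long);
	long left;

	left = syscall(__NR_migrate_pages, getpid(), maxnode,
		       &old_nodes, &new_nodes);
	if (left < 0)
		perror("migrate_pages");
	else
		printf("pages not migrated: %ld\n", left);
	return 0;
}
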
@@ -568,22 +648,76 @@ int do_migrate_pages(struct mm_struct *mm,
 		const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 {
 	LIST_HEAD(pagelist);
-	int count = 0;
-	nodemask_t nodes;
+	int busy = 0;
+	int err = 0;
+	nodemask_t tmp;
 
-	nodes_andnot(nodes, *from_nodes, *to_nodes);
+	down_read(&mm->mmap_sem);
 
-	down_read(&mm->mmap_sem);
-	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
-			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+/*
+ * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+ * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
+ * bit in 'tmp', and return that <source, dest> pair for migration.
+ * The pair of nodemasks 'to' and 'from' define the map.
+ *
+ * If no pair of bits is found that way, fallback to picking some
+ * pair of 'source' and 'dest' bits that are not the same. If the
+ * 'source' and 'dest' bits are the same, this represents a node
+ * that will be migrating to itself, so no pages need move.
+ *
+ * If no bits are left in 'tmp', or if all remaining bits left
+ * in 'tmp' correspond to the same bit in 'to', return false
+ * (nothing left to migrate).
+ *
+ * This lets us pick a pair of nodes to migrate between, such that
+ * if possible the dest node is not already occupied by some other
+ * source node, minimizing the risk of overloading the memory on a
+ * node that would happen if we migrated incoming memory to a node
+ * before migrating outgoing memory from that same node.
+ *
+ * A single scan of tmp is sufficient. As we go, we remember the
+ * most recent <s, d> pair that moved (s != d). If we find a pair
+ * that not only moved, but what's better, moved to an empty slot
+ * (d is not set in tmp), then we break out then, with that pair.
+ * Otherwise when we finish scanning from_tmp, we at least have the
+ * most recent <s, d> pair that moved. If we get all the way through
+ * the scan of tmp without finding any node that moved, much less
+ * moved to an empty node, then there is nothing left worth migrating.
+ */
 
-	if (!list_empty(&pagelist)) {
-		count = swap_pages(&pagelist);
-		putback_lru_pages(&pagelist);
+	tmp = *from_nodes;
+	while (!nodes_empty(tmp)) {
+		int s, d;
+		int source = -1;
+		int dest = 0;
+
+		for_each_node_mask(s, tmp) {
+			d = node_remap(s, *from_nodes, *to_nodes);
+			if (s == d)
+				continue;
+
+			source = s;	/* Node moved. Memorize */
+			dest = d;
+
+			/* dest not in remaining from nodes? */
+			if (!node_isset(dest, tmp))
+				break;
+		}
+		if (source == -1)
+			break;
+
+		node_clear(source, tmp);
+		err = migrate_to_node(mm, source, dest, flags);
+		if (err > 0)
+			busy += err;
+		if (err < 0)
+			break;
 	}
 
 	up_read(&mm->mmap_sem);
-	return count;
+	if (err < 0)
+		return err;
+	return busy;
 }
 
 long do_mbind(unsigned long start, unsigned long len,
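
The long comment block in the hunk above describes the <source, dest> pair selection in prose. A small user-space walk-through of the same selection over plain 64-bit masks may help (illustration only; remap() below is a simplified stand-in for node_remap(), and the example node sets are made up):

/* Simulate do_migrate_pages()'s pair picking: repeatedly choose a source
 * node whose mapped destination is, if possible, not itself still pending
 * as a source, then retire that source. */
#include <stdio.h>

typedef unsigned long long mask_t;

static int weight(mask_t m)
{
	int w = 0;
	while (m) { w += m & 1; m >>= 1; }
	return w;
}

static int ordinal(mask_t m, int bit)	/* rank of 'bit' among set bits */
{
	int n = 0, i;
	for (i = 0; i < bit; i++)
		if (m & (1ULL << i))
			n++;
	return n;
}

static int nth_set_bit(mask_t m, int n)
{
	int i;
	for (i = 0; i < 64; i++)
		if ((m & (1ULL << i)) && n-- == 0)
			return i;
	return -1;
}

/* Like node_remap(): the n-th set bit of 'from' maps to the n-th set bit
 * of 'to', wrapping around if 'to' has fewer bits. */
static int remap(int s, mask_t from, mask_t to)
{
	if (!(from & (1ULL << s)))
		return s;
	return nth_set_bit(to, ordinal(from, s) % weight(to));
}

int main(void)
{
	mask_t from = 0x07, to = 0x38;	/* nodes {0,1,2} -> {3,4,5} */
	mask_t tmp = from;

	while (tmp) {
		int s, d, source = -1, dest = 0;

		for (s = 0; s < 64; s++) {
			if (!(tmp & (1ULL << s)))
				continue;
			d = remap(s, from, to);
			if (s == d)
				continue;
			source = s;	/* pair that moves; remember it */
			dest = d;
			if (!(tmp & (1ULL << dest)))
				break;	/* dest is not a pending source */
		}
		if (source == -1)
			break;
		tmp &= ~(1ULL << source);
		printf("migrate node %d -> node %d\n", source, dest);
	}
	return 0;
}
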
@@ -643,8 +777,9 @@ long do_mbind(unsigned long start, unsigned long len,
 		int nr_failed = 0;
 
 		err = mbind_range(vma, start, end, new);
+
 		if (!list_empty(&pagelist))
-			nr_failed = swap_pages(&pagelist);
+			nr_failed = migrate_pages_to(&pagelist, vma, -1);
 
 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
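
With the hunk above, do_mbind() hands misplaced pages to migrate_pages_to() instead of merely swapping them out when MPOL_MF_MOVE is requested. A minimal user-space sketch that drives this path (assumes libnuma's <numaif.h> mbind() wrapper, linked with -lnuma, and a kernel with MPOL_MF_MOVE support; the mapping size and node choice are arbitrary):

/* Fault in some anonymous pages, then bind the range to node 0 and ask
 * the kernel to move any pages that ended up elsewhere.  With
 * MPOL_MF_STRICT, pages that could not be moved make mbind() fail with
 * EIO, matching the err = -EIO path above. */
#include <numaif.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16 * 4096;			/* assuming 4 KiB pages */
	unsigned long nodemask = 1UL << 0;	/* node 0 only */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 0, len);			/* fault the pages in somewhere */

	if (mbind(p, len, MPOL_BIND, &nodemask, 8 * sizeof(nodemask),
		  MPOL_MF_MOVE | MPOL_MF_STRICT) != 0)
		perror("mbind");

	munmap(p, len);
	return 0;
}
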
@@ -673,6 +808,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 	nodes_clear(*nodes);
 	if (maxnode == 0 || !nmask)
 		return 0;
+	if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+		return -EINVAL;
 
 	nlongs = BITS_TO_LONGS(maxnode);
 	if ((maxnode % BITS_PER_LONG) == 0)
@@ -1034,6 +1171,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 		return interleave_nodes(pol);
 }
 
+#ifdef CONFIG_HUGETLBFS
 /* Return a zonelist suitable for a huge page allocation. */
 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
 {
@@ -1047,6 +1185,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
 	}
 	return zonelist_policy(GFP_HIGHUSER, pol);
 }
+#endif
 
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */