Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--  mm/mempolicy.c  234
1 file changed, 160 insertions(+), 74 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4545d5944243..08f40a2f3fe0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -73,7 +73,6 @@
 #include <linux/sched.h>
 #include <linux/nodemask.h>
 #include <linux/cpuset.h>
-#include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/module.h>
@@ -85,10 +84,12 @@
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 #include <linux/migrate.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/ctype.h>
+#include <linux/mm_inline.h>
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
@@ -412,17 +413,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (!page)
 			continue;
 		/*
-		 * The check for PageReserved here is important to avoid
-		 * handling zero pages and other pages that may have been
-		 * marked special by the system.
-		 *
-		 * If the PageReserved would not be checked here then f.e.
-		 * the location of the zero page could have an influence
-		 * on MPOL_MF_STRICT, zero pages would be counted for
-		 * the per node stats, and there would be useless attempts
-		 * to put zero pages on the migration list.
+		 * vm_normal_page() filters out zero pages, but there might
+		 * still be PageReserved pages to skip, perhaps in a VDSO.
+		 * And we cannot move PageKsm pages sensibly or safely yet.
 		 */
-		if (PageReserved(page))
+		if (PageReserved(page) || PageKsm(page))
 			continue;
 		nid = page_to_nid(page);
 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -567,24 +562,50 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 }
 
 /* Step 2: apply policy to a range and do splits. */
-static int mbind_range(struct vm_area_struct *vma, unsigned long start,
-		       unsigned long end, struct mempolicy *new)
+static int mbind_range(struct mm_struct *mm, unsigned long start,
+		       unsigned long end, struct mempolicy *new_pol)
 {
 	struct vm_area_struct *next;
-	int err;
+	struct vm_area_struct *prev;
+	struct vm_area_struct *vma;
+	int err = 0;
+	pgoff_t pgoff;
+	unsigned long vmstart;
+	unsigned long vmend;
 
-	err = 0;
-	for (; vma && vma->vm_start < end; vma = next) {
+	vma = find_vma_prev(mm, start, &prev);
+	if (!vma || vma->vm_start > start)
+		return -EFAULT;
+
+	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 		next = vma->vm_next;
-		if (vma->vm_start < start)
-			err = split_vma(vma->vm_mm, vma, start, 1);
-		if (!err && vma->vm_end > end)
-			err = split_vma(vma->vm_mm, vma, end, 0);
-		if (!err)
-			err = policy_vma(vma, new);
+		vmstart = max(start, vma->vm_start);
+		vmend   = min(end, vma->vm_end);
+
+		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
+				  vma->anon_vma, vma->vm_file, pgoff, new_pol);
+		if (prev) {
+			vma = prev;
+			next = vma->vm_next;
+			continue;
+		}
+		if (vma->vm_start != vmstart) {
+			err = split_vma(vma->vm_mm, vma, vmstart, 1);
+			if (err)
+				goto out;
+		}
+		if (vma->vm_end != vmend) {
+			err = split_vma(vma->vm_mm, vma, vmend, 0);
+			if (err)
+				goto out;
+		}
+		err = policy_vma(vma, new_pol);
 		if (err)
-			break;
+			goto out;
 	}
+
+ out:
 	return err;
 }
 
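The rewritten mbind_range() tries vma_merge() before resorting to split_vma(), so applying one policy to adjacent ranges no longer leaves the mapping fragmented into several VMAs with identical policies. A minimal user-space sketch of that scenario (illustrative only, not part of the patch; assumes libnuma's <numaif.h> for the mbind() wrapper):

#include <numaif.h>		/* mbind(), MPOL_BIND -- libnuma headers (assumed available) */
#include <sys/mman.h>

int main(void)
{
	size_t len = 2 * 1024 * 1024;
	unsigned long nodemask = 1UL;		/* bind to node 0 */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* The same policy applied to two adjacent halves of one mapping. */
	mbind(p, len / 2, MPOL_BIND, &nodemask, 8 * sizeof(nodemask), 0);
	mbind(p + len / 2, len / 2, MPOL_BIND, &nodemask, 8 * sizeof(nodemask), 0);
	/*
	 * Previously this left split VMAs with identical policies behind;
	 * with vma_merge() in mbind_range() they coalesce back into one
	 * VMA (visible in /proc/self/maps).
	 */
	return 0;
}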
@@ -784,9 +805,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 
 	err = 0;
 	if (nmask) {
-		task_lock(current);
-		get_policy_nodemask(pol, nmask);
-		task_unlock(current);
+		if (mpol_store_user_nodemask(pol)) {
+			*nmask = pol->w.user_nodemask;
+		} else {
+			task_lock(current);
+			get_policy_nodemask(pol, nmask);
+			task_unlock(current);
+		}
 	}
 
  out:
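do_get_mempolicy() now returns the nodemask exactly as the user supplied it whenever the policy stores a user nodemask (MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES set), instead of the cpuset-contextualized mask. A rough user-space sketch of the query side (illustrative; assumes the MPOL_F_* mode-flag definitions are visible via libnuma's <numaif.h>):

#include <numaif.h>	/* set_mempolicy(), get_mempolicy(), MPOL_* (assumed) */
#include <stdio.h>

int main(void)
{
	unsigned long want = 0x3;	/* user-specified nodes 0-1 */
	unsigned long got = 0;
	int mode = -1;

	/* A static-nodes policy stores the mask as the user wrote it. */
	set_mempolicy(MPOL_INTERLEAVE | MPOL_F_STATIC_NODES, &want, 8 * sizeof(want));
	get_mempolicy(&mode, &got, 8 * sizeof(got), NULL, 0);
	/* With this change, 'got' echoes the user-supplied 0x3. */
	printf("mode=%d nodemask=%#lx\n", mode, got);
	return 0;
}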
@@ -809,6 +834,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 		if (!isolate_lru_page(page)) {
 			list_add_tail(&page->lru, pagelist);
+			inc_zone_page_state(page, NR_ISOLATED_ANON +
+					    page_is_file_cache(page));
 		}
 	}
 }
@@ -836,7 +863,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist))
-		err = migrate_pages(&pagelist, new_node_page, dest);
+		err = migrate_pages(&pagelist, new_node_page, dest, 0);
 
 	return err;
 }
@@ -864,36 +891,36 @@ int do_migrate_pages(struct mm_struct *mm,
 	if (err)
 		goto out;
 
-/*
- * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
- * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
- * bit in 'tmp', and return that <source, dest> pair for migration.
- * The pair of nodemasks 'to' and 'from' define the map.
- *
- * If no pair of bits is found that way, fallback to picking some
- * pair of 'source' and 'dest' bits that are not the same. If the
- * 'source' and 'dest' bits are the same, this represents a node
- * that will be migrating to itself, so no pages need move.
- *
- * If no bits are left in 'tmp', or if all remaining bits left
- * in 'tmp' correspond to the same bit in 'to', return false
- * (nothing left to migrate).
- *
- * This lets us pick a pair of nodes to migrate between, such that
- * if possible the dest node is not already occupied by some other
- * source node, minimizing the risk of overloading the memory on a
- * node that would happen if we migrated incoming memory to a node
- * before migrating outgoing memory source that same node.
- *
- * A single scan of tmp is sufficient. As we go, we remember the
- * most recent <s, d> pair that moved (s != d). If we find a pair
- * that not only moved, but what's better, moved to an empty slot
- * (d is not set in tmp), then we break out then, with that pair.
- * Otherwise when we finish scanning from_tmp, we at least have the
- * most recent <s, d> pair that moved. If we get all the way through
- * the scan of tmp without finding any node that moved, much less
- * moved to an empty node, then there is nothing left worth migrating.
- */
+	/*
+	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+	 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
+	 * bit in 'tmp', and return that <source, dest> pair for migration.
+	 * The pair of nodemasks 'to' and 'from' define the map.
+	 *
+	 * If no pair of bits is found that way, fallback to picking some
+	 * pair of 'source' and 'dest' bits that are not the same. If the
+	 * 'source' and 'dest' bits are the same, this represents a node
+	 * that will be migrating to itself, so no pages need move.
+	 *
+	 * If no bits are left in 'tmp', or if all remaining bits left
+	 * in 'tmp' correspond to the same bit in 'to', return false
+	 * (nothing left to migrate).
+	 *
+	 * This lets us pick a pair of nodes to migrate between, such that
+	 * if possible the dest node is not already occupied by some other
+	 * source node, minimizing the risk of overloading the memory on a
+	 * node that would happen if we migrated incoming memory to a node
+	 * before migrating outgoing memory source that same node.
+	 *
+	 * A single scan of tmp is sufficient. As we go, we remember the
+	 * most recent <s, d> pair that moved (s != d). If we find a pair
+	 * that not only moved, but what's better, moved to an empty slot
+	 * (d is not set in tmp), then we break out then, with that pair.
+	 * Otherwise when we finish scanning from_tmp, we at least have the
+	 * most recent <s, d> pair that moved. If we get all the way through
+	 * the scan of tmp without finding any node that moved, much less
+	 * moved to an empty node, then there is nothing left worth migrating.
+	 */
 
 	tmp = *from_nodes;
 	while (!nodes_empty(tmp)) {
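The comment above only describes the <source, dest> pair-picking scan in prose; a toy user-space model of the same selection over plain bitmasks may make it concrete (pick_migration_pair() and dest_of[] are hypothetical illustration, not kernel code):

#include <stdbool.h>

#define MAX_NODES 64

/*
 * Given 'tmp' (remaining source nodes) and a per-source destination map,
 * pick an <s, d> pair, preferring a destination that is not itself still
 * a source (d not set in tmp), and clear the chosen source bit.
 */
static bool pick_migration_pair(unsigned long *tmp, const int dest_of[MAX_NODES],
				int *s_out, int *d_out)
{
	int s, best_s = -1, best_d = -1;

	for (s = 0; s < MAX_NODES; s++) {
		int d;

		if (!(*tmp & (1UL << s)))
			continue;
		d = dest_of[s];			/* the corresponding 'dest' bit */
		if (s == d)
			continue;		/* node migrating to itself: skip */
		best_s = s;
		best_d = d;
		if (!(*tmp & (1UL << d)))
			break;			/* moved to an empty slot: stop early */
	}
	if (best_s < 0)
		return false;			/* nothing left worth migrating */
	*tmp &= ~(1UL << best_s);		/* clear the found 'source' bit */
	*s_out = best_s;
	*d_out = best_d;
	return true;
}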
@@ -1049,11 +1076,11 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (!IS_ERR(vma)) {
 		int nr_failed = 0;
 
-		err = mbind_range(vma, start, end, new);
+		err = mbind_range(mm, start, end, new);
 
 		if (!list_empty(&pagelist))
 			nr_failed = migrate_pages(&pagelist, new_vma_page,
-						(unsigned long)vma);
+						(unsigned long)vma, 0);
 
 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
@@ -1565,6 +1592,53 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
 	}
 	return zl;
 }
+
+/*
+ * init_nodemask_of_mempolicy
+ *
+ * If the current task's mempolicy is "default" [NULL], return 'false'
+ * to indicate default policy. Otherwise, extract the policy nodemask
+ * for 'bind' or 'interleave' policy into the argument nodemask, or
+ * initialize the argument nodemask to contain the single node for
+ * 'preferred' or 'local' policy and return 'true' to indicate presence
+ * of non-default mempolicy.
+ *
+ * We don't bother with reference counting the mempolicy [mpol_get/put]
+ * because the current task is examining its own mempolicy and a task's
+ * mempolicy is only ever changed by the task itself.
+ *
+ * N.B., it is the caller's responsibility to free a returned nodemask.
+ */
+bool init_nodemask_of_mempolicy(nodemask_t *mask)
+{
+	struct mempolicy *mempolicy;
+	int nid;
+
+	if (!(mask && current->mempolicy))
+		return false;
+
+	mempolicy = current->mempolicy;
+	switch (mempolicy->mode) {
+	case MPOL_PREFERRED:
+		if (mempolicy->flags & MPOL_F_LOCAL)
+			nid = numa_node_id();
+		else
+			nid = mempolicy->v.preferred_node;
+		init_nodemask_of_node(mask, nid);
+		break;
+
+	case MPOL_BIND:
+		/* Fall through */
+	case MPOL_INTERLEAVE:
+		*mask = mempolicy->v.nodes;
+		break;
+
+	default:
+		BUG();
+	}
+
+	return true;
+}
 #endif
 
 /* Allocate a page in interleaved policy.
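init_nodemask_of_mempolicy() lets callers outside mempolicy.c restrict per-node work according to the current task's policy (the hugetlb nr_hugepages handling is the intended user). A hedged sketch of such a caller (the function below is illustrative, not the actual hugetlb code):

/* On-stack nodemask is fine for a sketch; real callers may prefer NODEMASK_ALLOC. */
static int first_allowed_node(void)
{
	nodemask_t nodes_allowed;

	if (init_nodemask_of_mempolicy(&nodes_allowed))
		/* Non-default mempolicy: restrict to its nodes. */
		return first_node(nodes_allowed);
	/* Default policy: any node with memory will do. */
	return first_node(node_states[N_HIGH_MEMORY]);
}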
@@ -1685,10 +1759,12 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 
 	if (!new)
 		return ERR_PTR(-ENOMEM);
+	rcu_read_lock();
 	if (current_cpuset_is_being_rebound()) {
 		nodemask_t mems = cpuset_mems_allowed(current);
 		mpol_rebind_policy(old, &mems);
 	}
+	rcu_read_unlock();
 	*new = *old;
 	atomic_set(&new->refcnt, 1);
 	return new;
@@ -2122,8 +2198,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 			char *rest = nodelist;
 			while (isdigit(*rest))
 				rest++;
-			if (!*rest)
-				err = 0;
+			if (*rest)
+				goto out;
 		}
 		break;
 	case MPOL_INTERLEAVE:
@@ -2132,7 +2208,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 		 */
 		if (!nodelist)
 			nodes = node_states[N_HIGH_MEMORY];
-		err = 0;
 		break;
 	case MPOL_LOCAL:
 		/*
@@ -2142,11 +2217,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 			goto out;
 		mode = MPOL_PREFERRED;
 		break;
-
-	/*
-	 * case MPOL_BIND:    mpol_new() enforces non-empty nodemask.
-	 * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
-	 */
+	case MPOL_DEFAULT:
+		/*
+		 * Insist on an empty nodelist
+		 */
+		if (!nodelist)
+			err = 0;
+		goto out;
+	case MPOL_BIND:
+		/*
+		 * Insist on a nodelist
+		 */
+		if (!nodelist)
+			goto out;
 	}
 
 	mode_flags = 0;
@@ -2160,13 +2243,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 		else if (!strcmp(flags, "relative"))
 			mode_flags |= MPOL_F_RELATIVE_NODES;
 		else
-			err = 1;
+			goto out;
 	}
 
 	new = mpol_new(mode, mode_flags, &nodes);
 	if (IS_ERR(new))
-		err = 1;
-	else {
+		goto out;
+
+	{
 		int ret;
 		NODEMASK_SCRATCH(scratch);
 		if (scratch) {
@@ -2177,13 +2261,15 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 			ret = -ENOMEM;
 		NODEMASK_SCRATCH_FREE(scratch);
 		if (ret) {
-			err = 1;
 			mpol_put(new);
-		} else if (no_context) {
-			/* save for contextualization */
-			new->w.user_nodemask = nodes;
+			goto out;
 		}
 	}
+	err = 0;
+	if (no_context) {
+		/* save for contextualization */
+		new->w.user_nodemask = nodes;
+	}
 
 out:
 	/* Restore string for error message */
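mpol_parse_str() backs the tmpfs "mpol=" mount option, so the reworked switch and error paths above make the acceptance rules explicit: MPOL_DEFAULT insists on an empty nodelist, MPOL_BIND on a non-empty one, and failures funnel through the single 'out' label with err still set. A small illustration via mount(2) (the mount points are arbitrary; illustrative only, not part of the patch):

#include <sys/mount.h>

int mount_tmpfs_mpol_examples(void)
{
	/* Accepted: bind requires a nodelist, and one is given. */
	mount("tmpfs", "/mnt/a", "tmpfs", 0, "mpol=bind:0");

	/* Accepted: default must not carry a nodelist, and none is given. */
	mount("tmpfs", "/mnt/b", "tmpfs", 0, "mpol=default");

	/* Rejected by the parser: bind without a nodelist. */
	return mount("tmpfs", "/mnt/c", "tmpfs", 0, "mpol=bind");
}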