Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--	mm/mempolicy.c	234
1 file changed, 160 insertions(+), 74 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4545d5944243..08f40a2f3fe0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -73,7 +73,6 @@
 #include <linux/sched.h>
 #include <linux/nodemask.h>
 #include <linux/cpuset.h>
-#include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/module.h>
@@ -85,10 +84,12 @@
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 #include <linux/migrate.h>
+#include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/ctype.h>
+#include <linux/mm_inline.h>
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
@@ -412,17 +413,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		if (!page)
 			continue;
 		/*
-		 * The check for PageReserved here is important to avoid
-		 * handling zero pages and other pages that may have been
-		 * marked special by the system.
-		 *
-		 * If the PageReserved would not be checked here then f.e.
-		 * the location of the zero page could have an influence
-		 * on MPOL_MF_STRICT, zero pages would be counted for
-		 * the per node stats, and there would be useless attempts
-		 * to put zero pages on the migration list.
+		 * vm_normal_page() filters out zero pages, but there might
+		 * still be PageReserved pages to skip, perhaps in a VDSO.
+		 * And we cannot move PageKsm pages sensibly or safely yet.
 		 */
-		if (PageReserved(page))
+		if (PageReserved(page) || PageKsm(page))
 			continue;
 		nid = page_to_nid(page);
 		if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
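
The new PageKsm() test goes hand in hand with the KSM work elsewhere in this window: merged pages are shared across address spaces, so the mbind() page scan skips them rather than queueing them for a migration it cannot yet carry out safely. Below is a minimal user-space sketch of provoking that situation, assuming libnuma's <numaif.h> wrappers, a kernel with CONFIG_KSM, KSM switched on via /sys/kernel/mm/ksm/run, and node 0 present; whether ksmd merges anything in time is best-effort, so treat it as an illustration, not a test.

/* Sketch: make an area KSM-mergeable, then ask mbind() to move it.
 * With this change the scan skips PageKsm pages instead of trying
 * (and failing) to migrate them. MADV_MERGEABLE needs KSM-era headers. */
#include <numaif.h>
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	size_t len = 16 * sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	unsigned long nodes = 1UL;		/* nodemask with node 0 set */

	if (p == MAP_FAILED)
		return 1;
	memset(p, 0x5a, len);			/* identical page contents */
	madvise(p, len, MADV_MERGEABLE);	/* hand the area to KSM */
	sleep(2);				/* give ksmd a chance to merge */
	/* Any KSM pages in the range are now skipped, not queued to move. */
	if (mbind(p, len, MPOL_BIND, &nodes, 8 * sizeof(nodes),
		  MPOL_MF_MOVE | MPOL_MF_STRICT))
		perror("mbind");
	return 0;
}
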
@@ -567,24 +562,50 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 }
 
 /* Step 2: apply policy to a range and do splits. */
-static int mbind_range(struct vm_area_struct *vma, unsigned long start,
-		       unsigned long end, struct mempolicy *new)
+static int mbind_range(struct mm_struct *mm, unsigned long start,
+		       unsigned long end, struct mempolicy *new_pol)
 {
 	struct vm_area_struct *next;
-	int err;
+	struct vm_area_struct *prev;
+	struct vm_area_struct *vma;
+	int err = 0;
+	pgoff_t pgoff;
+	unsigned long vmstart;
+	unsigned long vmend;
 
-	err = 0;
-	for (; vma && vma->vm_start < end; vma = next) {
+	vma = find_vma_prev(mm, start, &prev);
+	if (!vma || vma->vm_start > start)
+		return -EFAULT;
+
+	for (; vma && vma->vm_start < end; prev = vma, vma = next) {
 		next = vma->vm_next;
-		if (vma->vm_start < start)
-			err = split_vma(vma->vm_mm, vma, start, 1);
-		if (!err && vma->vm_end > end)
-			err = split_vma(vma->vm_mm, vma, end, 0);
-		if (!err)
-			err = policy_vma(vma, new);
+		vmstart = max(start, vma->vm_start);
+		vmend = min(end, vma->vm_end);
+
+		pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+		prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
+				 vma->anon_vma, vma->vm_file, pgoff, new_pol);
+		if (prev) {
+			vma = prev;
+			next = vma->vm_next;
+			continue;
+		}
+		if (vma->vm_start != vmstart) {
+			err = split_vma(vma->vm_mm, vma, vmstart, 1);
+			if (err)
+				goto out;
+		}
+		if (vma->vm_end != vmend) {
+			err = split_vma(vma->vm_mm, vma, vmend, 0);
+			if (err)
+				goto out;
+		}
+		err = policy_vma(vma, new_pol);
 		if (err)
-			break;
+			goto out;
 	}
+
+out:
 	return err;
 }
 
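
The rewritten mbind_range() walks the range itself: it first offers each piece to vma_merge(), so that giving adjacent ranges the same policy coalesces VMAs instead of accumulating the splits the old code left behind, and it only calls split_vma() when a policy boundary genuinely cuts a VMA. A minimal sketch of driving both paths from user space through mbind(2), assuming a CONFIG_NUMA kernel, libnuma installed, and node 0 present:

/* Sketch: apply MPOL_BIND to the middle third of a mapping, forcing
 * the split path, then to the whole mapping, exercising the merge path.
 * Build with: cc demo.c -lnuma */
#include <numaif.h>
#include <sys/mman.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	size_t len = 64 * sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	unsigned long nodes = 1UL;	/* nodemask with only node 0 set */

	if (p == MAP_FAILED)
		return 1;
	/* Middle third: the kernel must split the VMA at both boundaries. */
	if (mbind(p + len / 3, len / 3, MPOL_BIND, &nodes,
		  8 * sizeof(nodes), 0))
		perror("mbind (split path)");
	/* Whole range: with this fix, vma_merge() can coalesce the pieces. */
	if (mbind(p, len, MPOL_BIND, &nodes, 8 * sizeof(nodes), 0))
		perror("mbind (merge path)");
	return 0;
}

After the second call, the three pieces created by the first are eligible to merge again, which should be observable as fewer entries for the region in /proc/self/maps.
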
@@ -784,9 +805,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 
 	err = 0;
 	if (nmask) {
-		task_lock(current);
-		get_policy_nodemask(pol, nmask);
-		task_unlock(current);
+		if (mpol_store_user_nodemask(pol)) {
+			*nmask = pol->w.user_nodemask;
+		} else {
+			task_lock(current);
+			get_policy_nodemask(pol, nmask);
+			task_unlock(current);
+		}
 	}
 
 out:
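
do_get_mempolicy() now honours the static/relative mode flags when reporting a policy back: for such policies the mask the task originally supplied (pol->w.user_nodemask) is returned instead of the cpuset-remapped working mask. A sketch of observing that from user space, assuming libnuma's <numaif.h> (older copies may lack the MPOL_F_* mode flags, which otherwise come from <linux/mempolicy.h>):

/* Sketch: set an interleave policy with the "static nodes" flag, then
 * read it back; with this change the reported mask should echo the
 * user-supplied one. Error handling kept minimal. */
#include <numaif.h>
#include <stdio.h>

int main(void)
{
	unsigned long set = 1UL;	/* nodemask containing node 0 */
	unsigned long got = 0;
	int mode = 0;

	if (set_mempolicy(MPOL_INTERLEAVE | MPOL_F_STATIC_NODES,
			  &set, 8 * sizeof(set)))
		perror("set_mempolicy");
	if (get_mempolicy(&mode, &got, 8 * sizeof(got), NULL, 0))
		perror("get_mempolicy");
	/* mode comes back with any mode flags still OR'ed in */
	printf("mode=%#x nodemask=%#lx\n", mode, got);
	return 0;
}
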
@@ -809,6 +834,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
 	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 		if (!isolate_lru_page(page)) {
 			list_add_tail(&page->lru, pagelist);
+			inc_zone_page_state(page, NR_ISOLATED_ANON +
+					    page_is_file_cache(page));
 		}
 	}
 }
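
The inc_zone_page_state() call keeps the NR_ISOLATED_ANON / NR_ISOLATED_FILE counters in step while pages sit on mbind's private migration list; reclaim-side throttling consults those counters, and they are exported through /proc/vmstat. A small reader, assuming the field names nr_isolated_anon and nr_isolated_file (present on kernels with this change):

/* Sketch: print the isolation counters that migrate_page_add() now
 * maintains, as exported through /proc/vmstat. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "nr_isolated_", 12))
			fputs(line, stdout);
	fclose(f);
	return 0;
}
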
@@ -836,7 +863,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 
 	if (!list_empty(&pagelist))
-		err = migrate_pages(&pagelist, new_node_page, dest);
+		err = migrate_pages(&pagelist, new_node_page, dest, 0);
 
 	return err;
 }
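
The new fourth argument to migrate_pages() (the mm/migrate.c function, not the syscall) is the offlining flag introduced alongside the KSM changes in this series: ordinary policy-driven migration passes 0, while memory-hotremove callers pass a non-zero value and are then permitted to move pages, such as PageKsm pages, that this path must leave alone. The user-space counterpart of migrate_to_node() is the migrate_pages(2) syscall; a sketch, assuming nodes 0 and 1 both exist and libnuma is installed:

/* Sketch: user-space analogue of migrate_to_node() - ask the kernel to
 * move the calling process's pages from node 0 to node 1. */
#include <numaif.h>
#include <stdio.h>

int main(void)
{
	unsigned long from = 1UL << 0;	/* source: node 0 */
	unsigned long to   = 1UL << 1;	/* dest:   node 1 */

	/* pid 0 means "the calling process" */
	if (migrate_pages(0, 8 * sizeof(from), &from, &to) < 0)
		perror("migrate_pages");
	return 0;
}
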
@@ -864,36 +891,36 @@ int do_migrate_pages(struct mm_struct *mm,
 	if (err)
 		goto out;
 
-/*
- * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
- * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
- * bit in 'tmp', and return that <source, dest> pair for migration.
- * The pair of nodemasks 'to' and 'from' define the map.
- *
- * If no pair of bits is found that way, fallback to picking some
- * pair of 'source' and 'dest' bits that are not the same. If the
- * 'source' and 'dest' bits are the same, this represents a node
- * that will be migrating to itself, so no pages need move.
- *
- * If no bits are left in 'tmp', or if all remaining bits left
- * in 'tmp' correspond to the same bit in 'to', return false
- * (nothing left to migrate).
- *
- * This lets us pick a pair of nodes to migrate between, such that
- * if possible the dest node is not already occupied by some other
- * source node, minimizing the risk of overloading the memory on a
- * node that would happen if we migrated incoming memory to a node
- * before migrating outgoing memory source that same node.
- *
- * A single scan of tmp is sufficient. As we go, we remember the
- * most recent <s, d> pair that moved (s != d). If we find a pair
- * that not only moved, but what's better, moved to an empty slot
- * (d is not set in tmp), then we break out then, with that pair.
- * Otherwise when we finish scannng from_tmp, we at least have the
- * most recent <s, d> pair that moved. If we get all the way through
- * the scan of tmp without finding any node that moved, much less
- * moved to an empty node, then there is nothing left worth migrating.
- */
+	/*
+	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+	 * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
+	 * bit in 'tmp', and return that <source, dest> pair for migration.
+	 * The pair of nodemasks 'to' and 'from' define the map.
+	 *
+	 * If no pair of bits is found that way, fallback to picking some
+	 * pair of 'source' and 'dest' bits that are not the same. If the
+	 * 'source' and 'dest' bits are the same, this represents a node
+	 * that will be migrating to itself, so no pages need move.
+	 *
+	 * If no bits are left in 'tmp', or if all remaining bits left
+	 * in 'tmp' correspond to the same bit in 'to', return false
+	 * (nothing left to migrate).
+	 *
+	 * This lets us pick a pair of nodes to migrate between, such that
+	 * if possible the dest node is not already occupied by some other
+	 * source node, minimizing the risk of overloading the memory on a
+	 * node that would happen if we migrated incoming memory to a node
+	 * before migrating outgoing memory source that same node.
+	 *
+	 * A single scan of tmp is sufficient. As we go, we remember the
+	 * most recent <s, d> pair that moved (s != d). If we find a pair
+	 * that not only moved, but what's better, moved to an empty slot
+	 * (d is not set in tmp), then we break out then, with that pair.
+	 * Otherwise when we finish scanning from_tmp, we at least have the
+	 * most recent <s, d> pair that moved. If we get all the way through
+	 * the scan of tmp without finding any node that moved, much less
+	 * moved to an empty node, then there is nothing left worth migrating.
+	 */
 
 	tmp = *from_nodes;
 	while (!nodes_empty(tmp)) {
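
The comment describes a greedy single pass: remember the most recent <s, d> pair that actually moves, but stop early on a pair whose destination is not itself still a source. The standalone model below reproduces that selection with a uint64_t standing in for nodemask_t; dest_of() plays the role of node_remap(), including its ordinal wrap, and all names here are invented for illustration:

/* Sketch: model of the <source, dest> selection described above,
 * with bit i of a uint64_t meaning "node i". */
#include <stdint.h>
#include <stdio.h>

/* Map node s in 'from' ordering to its counterpart in 'to' ordering. */
static int dest_of(uint64_t from, uint64_t to, int s)
{
	int pos = 0, d, i, w = __builtin_popcountll(to);

	if (!w)
		return -1;
	for (i = 0; i < s; i++)		/* ordinal of s within 'from' */
		if (from & (1ULL << i))
			pos++;
	pos %= w;			/* node_remap() wraps ordinals */
	for (d = 0; d < 64; d++)	/* same ordinal within 'to' */
		if ((to & (1ULL << d)) && pos-- == 0)
			return d;
	return -1;
}

/* Return 1 and set *s/*d while something is still worth migrating. */
static int pick_pair(uint64_t *tmp, uint64_t from, uint64_t to,
		     int *s, int *d)
{
	int src, dst, best_s = -1, best_d = -1;

	for (src = 0; src < 64; src++) {
		if (!(*tmp & (1ULL << src)))
			continue;
		dst = dest_of(from, to, src);
		if (dst < 0 || dst == src)	/* migrates to itself */
			continue;
		best_s = src;			/* most recent pair that moved */
		best_d = dst;
		if (!(*tmp & (1ULL << dst)))	/* empty slot: best case */
			break;
	}
	if (best_s < 0)
		return 0;
	*tmp &= ~(1ULL << best_s);
	*s = best_s;
	*d = best_d;
	return 1;
}

int main(void)
{
	uint64_t from = 0x7, to = 0xe;	/* nodes 0-2 -> nodes 1-3 */
	uint64_t tmp = from;
	int s, d;

	while (pick_pair(&tmp, from, to, &s, &d))
		printf("migrate node %d -> node %d\n", s, d);
	return 0;
}

For from=0x7, to=0xe this prints 2->3, then 1->2, then 0->1: the node whose destination is free moves first, so later moves never pile onto a still-occupied destination.
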
@@ -1049,11 +1076,11 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (!IS_ERR(vma)) {
 		int nr_failed = 0;
 
-		err = mbind_range(vma, start, end, new);
+		err = mbind_range(mm, start, end, new);
 
 		if (!list_empty(&pagelist))
 			nr_failed = migrate_pages(&pagelist, new_vma_page,
-						(unsigned long)vma);
+						(unsigned long)vma, 0);
 
 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
@@ -1565,6 +1592,53 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
 	}
 	return zl;
 }
+
+/*
+ * init_nodemask_of_mempolicy
+ *
+ * If the current task's mempolicy is "default" [NULL], return 'false'
+ * to indicate default policy. Otherwise, extract the policy nodemask
+ * for 'bind' or 'interleave' policy into the argument nodemask, or
+ * initialize the argument nodemask to contain the single node for
+ * 'preferred' or 'local' policy and return 'true' to indicate presence
+ * of non-default mempolicy.
+ *
+ * We don't bother with reference counting the mempolicy [mpol_get/put]
+ * because the current task is examining its own mempolicy and a task's
+ * mempolicy is only ever changed by the task itself.
+ *
+ * N.B., it is the caller's responsibility to free a returned nodemask.
+ */
+bool init_nodemask_of_mempolicy(nodemask_t *mask)
+{
+	struct mempolicy *mempolicy;
+	int nid;
+
+	if (!(mask && current->mempolicy))
+		return false;
+
+	mempolicy = current->mempolicy;
+	switch (mempolicy->mode) {
+	case MPOL_PREFERRED:
+		if (mempolicy->flags & MPOL_F_LOCAL)
+			nid = numa_node_id();
+		else
+			nid = mempolicy->v.preferred_node;
+		init_nodemask_of_node(mask, nid);
+		break;
+
+	case MPOL_BIND:
+		/* Fall through */
+	case MPOL_INTERLEAVE:
+		*mask = mempolicy->v.nodes;
+		break;
+
+	default:
+		BUG();
+	}
+
+	return true;
+}
 #endif
 
 /* Allocate a page in interleaved policy.
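
init_nodemask_of_mempolicy() is added so callers outside mempolicy.c (the hugetlb nr_hugepages handlers in this same series being the intended users) can ask "which nodes does the current task's policy allow?" without reaching into struct mempolicy themselves. A compilable toy model of the switch, with a stand-in struct whose field names mirror the kernel's (the flag value is illustrative only):

/* Sketch: user-space model of init_nodemask_of_mempolicy()'s switch. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum mode { MPOL_DEFAULT, MPOL_PREFERRED, MPOL_BIND, MPOL_INTERLEAVE };
#define MPOL_F_LOCAL 0x2		/* value is illustrative only */

struct toy_mempolicy {
	enum mode mode;
	unsigned flags;
	int preferred_node;
	uint64_t nodes;			/* stands in for nodemask_t */
};

static bool init_mask_of(const struct toy_mempolicy *pol, uint64_t *mask,
			 int this_node)
{
	if (!pol)
		return false;		/* "default" policy: nothing to report */
	switch (pol->mode) {
	case MPOL_PREFERRED:		/* single node, possibly "local" */
		*mask = 1ULL << ((pol->flags & MPOL_F_LOCAL) ?
				 this_node : pol->preferred_node);
		return true;
	case MPOL_BIND:
	case MPOL_INTERLEAVE:		/* whole policy nodemask */
		*mask = pol->nodes;
		return true;
	default:
		return false;
	}
}

int main(void)
{
	struct toy_mempolicy pol = { MPOL_INTERLEAVE, 0, 0, 0x5 };
	uint64_t mask;

	if (init_mask_of(&pol, &mask, 0))
		printf("policy nodes = %#llx\n", (unsigned long long)mask);
	return 0;
}
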
@@ -1685,10 +1759,12 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 
 	if (!new)
 		return ERR_PTR(-ENOMEM);
+	rcu_read_lock();
 	if (current_cpuset_is_being_rebound()) {
 		nodemask_t mems = cpuset_mems_allowed(current);
 		mpol_rebind_policy(old, &mems);
 	}
+	rcu_read_unlock();
 	*new = *old;
 	atomic_set(&new->refcnt, 1);
 	return new;
@@ -2122,8 +2198,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 			char *rest = nodelist;
 			while (isdigit(*rest))
 				rest++;
-			if (!*rest)
-				err = 0;
+			if (*rest)
+				goto out;
 		}
 		break;
 	case MPOL_INTERLEAVE:
@@ -2132,7 +2208,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 		 */
 		if (!nodelist)
 			nodes = node_states[N_HIGH_MEMORY];
-		err = 0;
 		break;
 	case MPOL_LOCAL:
 		/*
@@ -2142,11 +2217,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 			goto out;
 		mode = MPOL_PREFERRED;
 		break;
-
-	/*
-	 * case MPOL_BIND: mpol_new() enforces non-empty nodemask.
-	 * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
-	 */
+	case MPOL_DEFAULT:
+		/*
+		 * Insist on an empty nodelist
+		 */
+		if (!nodelist)
+			err = 0;
+		goto out;
+	case MPOL_BIND:
+		/*
+		 * Insist on a nodelist
+		 */
+		if (!nodelist)
+			goto out;
 	}
 
 	mode_flags = 0;
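
Taken together, mpol_parse_str()'s per-mode rules are now: "prefer" accepts an optional nodelist that must be digits only; "interleave" without a nodelist defaults to every node with memory; "local" and "default" insist on an empty nodelist; "bind" insists on one; and err only becomes 0 on the single success path at the end. A standalone checker modelling just the mode/nodelist acceptance (the helper name is invented; the real parser also handles the static/relative flags and cpuset contextualization):

/* Sketch: the (mode, nodelist) acceptance rules that mpol_parse_str()
 * now enforces, modelled as a tiny standalone checker. */
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>

enum { DEFAULT, PREFERRED, BIND, INTERLEAVE, LOCAL };

static bool mpol_args_ok(int mode, const char *nodelist)
{
	switch (mode) {
	case PREFERRED:
		if (nodelist)		/* single node id, digits only */
			for (; *nodelist; nodelist++)
				if (!isdigit((unsigned char)*nodelist))
					return false;
		return true;
	case INTERLEAVE:
		return true;		/* no list -> all nodes with memory */
	case LOCAL:
	case DEFAULT:
		return !nodelist;	/* insist on an empty nodelist */
	case BIND:
		return nodelist != NULL;
	}
	return false;
}

int main(void)
{
	printf("bind w/o nodelist:   %d\n", mpol_args_ok(BIND, NULL));
	printf("default w/ nodelist: %d\n", mpol_args_ok(DEFAULT, "0-3"));
	printf("interleave w/o list: %d\n", mpol_args_ok(INTERLEAVE, NULL));
	return 0;
}
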
@@ -2160,13 +2243,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 		else if (!strcmp(flags, "relative"))
 			mode_flags |= MPOL_F_RELATIVE_NODES;
 		else
-			err = 1;
+			goto out;
 	}
 
 	new = mpol_new(mode, mode_flags, &nodes);
 	if (IS_ERR(new))
-		err = 1;
-	else {
+		goto out;
+
+	{
 		int ret;
 		NODEMASK_SCRATCH(scratch);
 		if (scratch) {
@@ -2177,13 +2261,15 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
 			ret = -ENOMEM;
 		NODEMASK_SCRATCH_FREE(scratch);
 		if (ret) {
-			err = 1;
 			mpol_put(new);
-		} else if (no_context) {
-			/* save for contextualization */
-			new->w.user_nodemask = nodes;
+			goto out;
 		}
 	}
+	err = 0;
+	if (no_context) {
+		/* save for contextualization */
+		new->w.user_nodemask = nodes;
+	}
 
 out:
 	/* Restore string for error message */