-rw-r--r--	Documentation/vm/numa_memory_policy.txt	| 54
-rw-r--r--	mm/mempolicy.c				| 68
2 files changed, 60 insertions(+), 62 deletions(-)
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
index 6719d642653f..13cca5a3cf17 100644
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -147,35 +147,18 @@ Components of Memory Policies
 
 Linux memory policy supports the following 4 behavioral modes:
 
-	Default Mode--MPOL_DEFAULT:  The behavior specified by this mode is
-	context or scope dependent.
-
-	    As mentioned in the Policy Scope section above, during normal
-	    system operation, the System Default Policy is hard coded to
-	    contain the Default mode.
-
-	    In this context, default mode means "local" allocation--that is
-	    attempt to allocate the page from the node associated with the cpu
-	    where the fault occurs.  If the "local" node has no memory, or the
-	    node's memory can be exhausted [no free pages available], local
-	    allocation will "fallback to"--attempt to allocate pages from--
-	    "nearby" nodes, in order of increasing "distance".
-
-		Implementation detail -- subject to change:  "Fallback" uses
-		a per node list of sibling nodes--called zonelists--built at
-		boot time, or when nodes or memory are added or removed from
-		the system [memory hotplug].  These per node zonelist are
-		constructed with nodes in order of increasing distance based
-		on information provided by the platform firmware.
-
-	    When a task/process policy or a shared policy contains the Default
-	    mode, this also means "local allocation", as described above.
-
-	    In the context of a VMA, Default mode means "fall back to task
-	    policy"--which may or may not specify Default mode.  Thus, Default
-	    mode can not be counted on to mean local allocation when used
-	    on a non-shared region of the address space.  However, see
-	    MPOL_PREFERRED below.
+	Default Mode--MPOL_DEFAULT:  This mode is only used in the memory
+	policy APIs.  Internally, MPOL_DEFAULT is converted to the NULL
+	memory policy in all policy scopes.  Any existing non-default policy
+	will simply be removed when MPOL_DEFAULT is specified.  As a result,
+	MPOL_DEFAULT means "fall back to the next most specific policy scope."
+
+	    For example, a NULL or default task policy will fall back to the
+	    system default policy.  A NULL or default vma policy will fall
+	    back to the task policy.
+
+	When specified in one of the memory policy APIs, the Default mode
+	does not use the optional set of nodes.
 
 	It is an error for the set of nodes specified for this policy to
 	be non-empty.
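To make the new MPOL_DEFAULT semantics concrete, here is a minimal userspace sketch. It is not part of this patch, and it assumes the set_mempolicy(2) wrapper declared in numactl's <numaif.h> (build with -lnuma):

#include <numaif.h>		/* set_mempolicy(), MPOL_* constants */
#include <stdio.h>

int main(void)
{
	/*
	 * MPOL_DEFAULT takes no nodemask.  Installing it simply removes
	 * any existing task policy, so subsequent allocations fall back
	 * to the system default policy.
	 */
	if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0) {
		perror("set_mempolicy");
		return 1;
	}
	printf("task policy removed; falling back to system default\n");
	return 0;
}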
@@ -187,19 +170,18 @@ Components of Memory Policies
 
 	MPOL_PREFERRED:  This mode specifies that the allocation should be
 	attempted from the single node specified in the policy.  If that
-	allocation fails, the kernel will search other nodes, exactly as
-	it would for a local allocation that started at the preferred node
-	in increasing distance from the preferred node.  "Local" allocation
-	policy can be viewed as a Preferred policy that starts at the node
+	allocation fails, the kernel will search other nodes, in order of
+	increasing distance from the preferred node based on information
+	provided by the platform firmware.
 	containing the cpu where the allocation takes place.
 
 	Internally, the Preferred policy uses a single node--the
 	preferred_node member of struct mempolicy.  A "distinguished
 	value of this preferred_node, currently '-1', is interpreted
 	as "the node containing the cpu where the allocation takes
-	place"--local allocation.  This is the way to specify
-	local allocation for a specific range of addresses--i.e. for
-	VMA policies.
+	place"--local allocation.  "Local" allocation policy can be
+	viewed as a Preferred policy that starts at the node containing
+	the cpu where the allocation takes place.
 
 	It is possible for the user to specify that local allocation is
 	always preferred by passing an empty nodemask with this mode.
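The "empty nodemask" form mentioned in that last paragraph can be exercised from userspace. Another hedged sketch, again assuming numactl's <numaif.h> wrappers and -lnuma:

#include <numaif.h>		/* set_mempolicy(), MPOL_PREFERRED */
#include <stdio.h>

int main(void)
{
	unsigned long empty = 0;	/* empty nodemask => "local" preferred */

	/* Prefer the node of whichever cpu performs each allocation. */
	if (set_mempolicy(MPOL_PREFERRED, &empty, sizeof(empty) * 8) != 0) {
		perror("set_mempolicy");
		return 1;
	}
	printf("local-allocation (preferred) policy installed\n");
	return 0;
}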
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a237295f8190..fea4a5da6e44 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -104,9 +104,13 @@ static struct kmem_cache *sn_cache;
    policied. */
 enum zone_type policy_zone = 0;
 
+/*
+ * run-time system-wide default policy => local allocation
+ */
 struct mempolicy default_policy = {
 	.refcnt = ATOMIC_INIT(1), /* never free it */
-	.mode = MPOL_DEFAULT,
+	.mode = MPOL_PREFERRED,
+	.v = { .preferred_node = -1 },
 };
 
 static const struct mempolicy_operations {
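The '-1' here is the "distinguished value" from the documentation: it turns the system default policy into an MPOL_PREFERRED policy that always allocates locally. A hypothetical helper, not code introduced by this patch, captures the convention the later hunks rely on:

/*
 * Sketch only: illustrates the preferred_node convention this patch
 * adopts.  A non-negative preferred_node names an explicit node; the
 * distinguished value -1 means "the node of the allocating cpu".
 */
static inline int preferred_node_or_local(struct mempolicy *pol)
{
	if (pol->v.preferred_node >= 0)
		return pol->v.preferred_node;
	return numa_node_id();		/* local allocation */
}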
@@ -189,7 +193,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 	if (mode == MPOL_DEFAULT) {
 		if (nodes && !nodes_empty(*nodes))
 			return ERR_PTR(-EINVAL);
-		return NULL;
+		return NULL;	/* simply delete any existing policy */
 	}
 	VM_BUG_ON(!nodes);
 
@@ -246,7 +250,6 @@ void __mpol_put(struct mempolicy *p)
 {
 	if (!atomic_dec_and_test(&p->refcnt))
 		return;
-	p->mode = MPOL_DEFAULT;
 	kmem_cache_free(policy_cache, p);
 }
 
@@ -626,13 +629,16 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 	return 0;
 }
 
-/* Fill a zone bitmap for a policy */
-static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
+/*
+ * Return nodemask for policy for get_mempolicy() query
+ */
+static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 {
 	nodes_clear(*nodes);
+	if (p == &default_policy)
+		return;
+
 	switch (p->mode) {
-	case MPOL_DEFAULT:
-		break;
 	case MPOL_BIND:
 		/* Fall through */
 	case MPOL_INTERLEAVE:
@@ -686,6 +692,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	}
 
 	if (flags & MPOL_F_ADDR) {
+		/*
+		 * Do NOT fall back to task policy if the
+		 * vma/shared policy at addr is NULL.  We
+		 * want to return MPOL_DEFAULT in this case.
+		 */
 		down_read(&mm->mmap_sem);
 		vma = find_vma_intersection(mm, addr, addr+1);
 		if (!vma) {
@@ -700,7 +711,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 		return -EINVAL;
 
 	if (!pol)
-		pol = &default_policy;
+		pol = &default_policy;	/* indicates default behavior */
 
 	if (flags & MPOL_F_NODE) {
 		if (flags & MPOL_F_ADDR) {
@@ -715,8 +726,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 			err = -EINVAL;
 			goto out;
 		}
-	} else
-		*policy = pol->mode | pol->flags;
+	} else {
+		*policy = pol == &default_policy ? MPOL_DEFAULT :
+						pol->mode;
+		*policy |= pol->flags;
+	}
 
 	if (vma) {
 		up_read(&current->mm->mmap_sem);
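With this hunk, userspace sees MPOL_DEFAULT whenever the effective policy is the internal default, even though that default is now MPOL_PREFERRED underneath. A minimal query sketch (assumes numactl's <numaif.h> wrapper for get_mempolicy(2); build with -lnuma):

#include <numaif.h>		/* get_mempolicy(), MPOL_DEFAULT */
#include <stdio.h>

int main(void)
{
	int mode;

	/* flags == 0: query the calling task's policy */
	if (get_mempolicy(&mode, NULL, 0, NULL, 0) != 0) {
		perror("get_mempolicy");
		return 1;
	}
	printf("task policy mode = %d%s\n", mode,
	       mode == MPOL_DEFAULT ? " (MPOL_DEFAULT)" : "");
	return 0;
}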
@@ -725,7 +739,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 
 	err = 0;
 	if (nmask)
-		get_zonemask(pol, nmask);
+		get_policy_nodemask(pol, nmask);
 
  out:
 	mpol_cond_put(pol);
@@ -1286,8 +1300,7 @@ static struct mempolicy *get_vma_policy(struct task_struct *task,
 									addr);
 			if (vpol)
 				pol = vpol;
-		} else if (vma->vm_policy &&
-				vma->vm_policy->mode != MPOL_DEFAULT)
+		} else if (vma->vm_policy)
 			pol = vma->vm_policy;
 	}
 	if (!pol)
@@ -1334,7 +1347,6 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
 		nd = first_node(policy->v.nodes);
 		break;
 	case MPOL_INTERLEAVE: /* should not happen */
-	case MPOL_DEFAULT:
 		nd = numa_node_id();
 		break;
 	default:
@@ -1369,9 +1381,15 @@ static unsigned interleave_nodes(struct mempolicy *policy)
  */
 unsigned slab_node(struct mempolicy *policy)
 {
-	unsigned short pol = policy ? policy->mode : MPOL_DEFAULT;
+	if (!policy)
+		return numa_node_id();
+
+	switch (policy->mode) {
+	case MPOL_PREFERRED:
+		if (unlikely(policy->v.preferred_node >= 0))
+			return policy->v.preferred_node;
+		return numa_node_id();
 
-	switch (pol) {
 	case MPOL_INTERLEAVE:
 		return interleave_nodes(policy);
 
@@ -1390,13 +1408,8 @@ unsigned slab_node(struct mempolicy *policy)
 		return zone->node;
 	}
 
-	case MPOL_PREFERRED:
-		if (policy->v.preferred_node >= 0)
-			return policy->v.preferred_node;
-		/* Fall through */
-
 	default:
-		return numa_node_id();
+		BUG();
 	}
 }
 
@@ -1650,8 +1663,6 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
 		return 0;
 	switch (a->mode) {
-	case MPOL_DEFAULT:
-		return 1;
 	case MPOL_BIND:
 		/* Fall through */
 	case MPOL_INTERLEAVE:
@@ -1828,7 +1839,7 @@ void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
 	if (policy != MPOL_DEFAULT) {
 		struct mempolicy *newpol;
 
-		/* Falls back to MPOL_DEFAULT on any error */
+		/* Falls back to NULL policy [MPOL_DEFAULT] on any error */
 		newpol = mpol_new(policy, flags, policy_nodes);
 		if (!IS_ERR(newpol)) {
 			/* Create pseudo-vma that contains just the policy */
@@ -1952,9 +1963,14 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 	char *p = buffer;
 	int l;
 	nodemask_t nodes;
-	unsigned short mode = pol ? pol->mode : MPOL_DEFAULT;
+	unsigned short mode;
 	unsigned short flags = pol ? pol->flags : 0;
 
+	if (!pol || pol == &default_policy)
+		mode = MPOL_DEFAULT;
+	else
+		mode = pol->mode;
+
 	switch (mode) {
 	case MPOL_DEFAULT:
 		nodes_clear(nodes);
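mpol_to_str() generates the policy strings shown in /proc/<pid>/numa_maps, so after this change an unpolicied mapping should still display as "default" even though the internal default policy is MPOL_PREFERRED. A small sketch to observe that, assuming the usual numa_maps layout with the policy as the second field of each line:

#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/self/numa_maps", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* Each line: <address> <policy> <details...>; unpolicied
	 * mappings should read "default" in the second field. */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}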