aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/vm/numa_memory_policy.txt54
-rw-r--r--mm/mempolicy.c68
2 files changed, 60 insertions, 62 deletions
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
index 6719d642653f..13cca5a3cf17 100644
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -147,35 +147,18 @@ Components of Memory Policies
147 147
148 Linux memory policy supports the following 4 behavioral modes: 148 Linux memory policy supports the following 4 behavioral modes:
149 149
150 Default Mode--MPOL_DEFAULT: The behavior specified by this mode is 150 Default Mode--MPOL_DEFAULT: This mode is only used in the memory
151 context or scope dependent. 151 policy APIs. Internally, MPOL_DEFAULT is converted to the NULL
152 152 memory policy in all policy scopes. Any existing non-default policy
153 As mentioned in the Policy Scope section above, during normal 153 will simply be removed when MPOL_DEFAULT is specified. As a result,
154 system operation, the System Default Policy is hard coded to 154 MPOL_DEFAULT means "fall back to the next most specific policy scope."
155 contain the Default mode. 155
156 156 For example, a NULL or default task policy will fall back to the
157 In this context, default mode means "local" allocation--that is 157 system default policy. A NULL or default vma policy will fall
158 attempt to allocate the page from the node associated with the cpu 158 back to the task policy.
159 where the fault occurs. If the "local" node has no memory, or the 159
160 node's memory can be exhausted [no free pages available], local 160 When specified in one of the memory policy APIs, the Default mode
161 allocation will "fallback to"--attempt to allocate pages from-- 161 does not use the optional set of nodes.
162 "nearby" nodes, in order of increasing "distance".
163
164 Implementation detail -- subject to change: "Fallback" uses
165 a per node list of sibling nodes--called zonelists--built at
166 boot time, or when nodes or memory are added or removed from
167 the system [memory hotplug]. These per node zonelists are
168 constructed with nodes in order of increasing distance based
169 on information provided by the platform firmware.
170
171 When a task/process policy or a shared policy contains the Default
172 mode, this also means "local allocation", as described above.
173
174 In the context of a VMA, Default mode means "fall back to task
175 policy"--which may or may not specify Default mode. Thus, Default
176 mode can not be counted on to mean local allocation when used
177 on a non-shared region of the address space. However, see
178 MPOL_PREFERRED below.
179 162
180 It is an error for the set of nodes specified for this policy to 163 It is an error for the set of nodes specified for this policy to
181 be non-empty. 164 be non-empty.
@@ -187,19 +170,18 @@ Components of Memory Policies
187 170
188 MPOL_PREFERRED: This mode specifies that the allocation should be 171 MPOL_PREFERRED: This mode specifies that the allocation should be
189 attempted from the single node specified in the policy. If that 172 attempted from the single node specified in the policy. If that
190 allocation fails, the kernel will search other nodes, exactly as 173 allocation fails, the kernel will search other nodes, in order of
191 it would for a local allocation that started at the preferred node 174 increasing distance from the preferred node based on information
192 in increasing distance from the preferred node. "Local" allocation 175 provided by the platform firmware.
193 policy can be viewed as a Preferred policy that starts at the node
194 containing the cpu where the allocation takes place. 176 containing the cpu where the allocation takes place.
195 177
196 Internally, the Preferred policy uses a single node--the 178 Internally, the Preferred policy uses a single node--the
197 preferred_node member of struct mempolicy. A "distinguished" 179 preferred_node member of struct mempolicy. A "distinguished"
198 value of this preferred_node, currently '-1', is interpreted 180 value of this preferred_node, currently '-1', is interpreted
199 as "the node containing the cpu where the allocation takes 181 as "the node containing the cpu where the allocation takes
200 place"--local allocation. This is the way to specify 182 place"--local allocation. "Local" allocation policy can be
201 local allocation for a specific range of addresses--i.e. for 183 viewed as a Preferred policy that starts at the node containing
202 VMA policies. 184 the cpu where the allocation takes place.
203 185
204 It is possible for the user to specify that local allocation is 186 It is possible for the user to specify that local allocation is
205 always preferred by passing an empty nodemask with this mode. 187 always preferred by passing an empty nodemask with this mode.
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index a237295f8190..fea4a5da6e44 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -104,9 +104,13 @@ static struct kmem_cache *sn_cache;
104 policied. */ 104 policied. */
105enum zone_type policy_zone = 0; 105enum zone_type policy_zone = 0;
106 106
107/*
108 * run-time system-wide default policy => local allocation
109 */
107struct mempolicy default_policy = { 110struct mempolicy default_policy = {
108 .refcnt = ATOMIC_INIT(1), /* never free it */ 111 .refcnt = ATOMIC_INIT(1), /* never free it */
109 .mode = MPOL_DEFAULT, 112 .mode = MPOL_PREFERRED,
113 .v = { .preferred_node = -1 },
110}; 114};
111 115
112static const struct mempolicy_operations { 116static const struct mempolicy_operations {
@@ -189,7 +193,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
189 if (mode == MPOL_DEFAULT) { 193 if (mode == MPOL_DEFAULT) {
190 if (nodes && !nodes_empty(*nodes)) 194 if (nodes && !nodes_empty(*nodes))
191 return ERR_PTR(-EINVAL); 195 return ERR_PTR(-EINVAL);
192 return NULL; 196 return NULL; /* simply delete any existing policy */
193 } 197 }
194 VM_BUG_ON(!nodes); 198 VM_BUG_ON(!nodes);
195 199
@@ -246,7 +250,6 @@ void __mpol_put(struct mempolicy *p)
246{ 250{
247 if (!atomic_dec_and_test(&p->refcnt)) 251 if (!atomic_dec_and_test(&p->refcnt))
248 return; 252 return;
249 p->mode = MPOL_DEFAULT;
250 kmem_cache_free(policy_cache, p); 253 kmem_cache_free(policy_cache, p);
251} 254}
252 255
@@ -626,13 +629,16 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
626 return 0; 629 return 0;
627} 630}
628 631
629/* Fill a zone bitmap for a policy */ 632/*
630static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) 633 * Return nodemask for policy for get_mempolicy() query
634 */
635static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
631{ 636{
632 nodes_clear(*nodes); 637 nodes_clear(*nodes);
638 if (p == &default_policy)
639 return;
640
633 switch (p->mode) { 641 switch (p->mode) {
634 case MPOL_DEFAULT:
635 break;
636 case MPOL_BIND: 642 case MPOL_BIND:
637 /* Fall through */ 643 /* Fall through */
638 case MPOL_INTERLEAVE: 644 case MPOL_INTERLEAVE:
@@ -686,6 +692,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
686 } 692 }
687 693
688 if (flags & MPOL_F_ADDR) { 694 if (flags & MPOL_F_ADDR) {
695 /*
696 * Do NOT fall back to task policy if the
697 * vma/shared policy at addr is NULL. We
698 * want to return MPOL_DEFAULT in this case.
699 */
689 down_read(&mm->mmap_sem); 700 down_read(&mm->mmap_sem);
690 vma = find_vma_intersection(mm, addr, addr+1); 701 vma = find_vma_intersection(mm, addr, addr+1);
691 if (!vma) { 702 if (!vma) {
@@ -700,7 +711,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
700 return -EINVAL; 711 return -EINVAL;
701 712
702 if (!pol) 713 if (!pol)
703 pol = &default_policy; 714 pol = &default_policy; /* indicates default behavior */
704 715
705 if (flags & MPOL_F_NODE) { 716 if (flags & MPOL_F_NODE) {
706 if (flags & MPOL_F_ADDR) { 717 if (flags & MPOL_F_ADDR) {
@@ -715,8 +726,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
715 err = -EINVAL; 726 err = -EINVAL;
716 goto out; 727 goto out;
717 } 728 }
718 } else 729 } else {
719 *policy = pol->mode | pol->flags; 730 *policy = pol == &default_policy ? MPOL_DEFAULT :
731 pol->mode;
732 *policy |= pol->flags;
733 }
720 734
721 if (vma) { 735 if (vma) {
722 up_read(&current->mm->mmap_sem); 736 up_read(&current->mm->mmap_sem);
@@ -725,7 +739,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
725 739
726 err = 0; 740 err = 0;
727 if (nmask) 741 if (nmask)
728 get_zonemask(pol, nmask); 742 get_policy_nodemask(pol, nmask);
729 743
730 out: 744 out:
731 mpol_cond_put(pol); 745 mpol_cond_put(pol);
@@ -1286,8 +1300,7 @@ static struct mempolicy *get_vma_policy(struct task_struct *task,
1286 addr); 1300 addr);
1287 if (vpol) 1301 if (vpol)
1288 pol = vpol; 1302 pol = vpol;
1289 } else if (vma->vm_policy && 1303 } else if (vma->vm_policy)
1290 vma->vm_policy->mode != MPOL_DEFAULT)
1291 pol = vma->vm_policy; 1304 pol = vma->vm_policy;
1292 } 1305 }
1293 if (!pol) 1306 if (!pol)
@@ -1334,7 +1347,6 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1334 nd = first_node(policy->v.nodes); 1347 nd = first_node(policy->v.nodes);
1335 break; 1348 break;
1336 case MPOL_INTERLEAVE: /* should not happen */ 1349 case MPOL_INTERLEAVE: /* should not happen */
1337 case MPOL_DEFAULT:
1338 nd = numa_node_id(); 1350 nd = numa_node_id();
1339 break; 1351 break;
1340 default: 1352 default:
@@ -1369,9 +1381,15 @@ static unsigned interleave_nodes(struct mempolicy *policy)
1369 */ 1381 */
1370unsigned slab_node(struct mempolicy *policy) 1382unsigned slab_node(struct mempolicy *policy)
1371{ 1383{
1372 unsigned short pol = policy ? policy->mode : MPOL_DEFAULT; 1384 if (!policy)
1385 return numa_node_id();
1386
1387 switch (policy->mode) {
1388 case MPOL_PREFERRED:
1389 if (unlikely(policy->v.preferred_node >= 0))
1390 return policy->v.preferred_node;
1391 return numa_node_id();
1373 1392
1374 switch (pol) {
1375 case MPOL_INTERLEAVE: 1393 case MPOL_INTERLEAVE:
1376 return interleave_nodes(policy); 1394 return interleave_nodes(policy);
1377 1395
@@ -1390,13 +1408,8 @@ unsigned slab_node(struct mempolicy *policy)
1390 return zone->node; 1408 return zone->node;
1391 } 1409 }
1392 1410
1393 case MPOL_PREFERRED:
1394 if (policy->v.preferred_node >= 0)
1395 return policy->v.preferred_node;
1396 /* Fall through */
1397
1398 default: 1411 default:
1399 return numa_node_id(); 1412 BUG();
1400 } 1413 }
1401} 1414}
1402 1415
@@ -1650,8 +1663,6 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1650 if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b)) 1663 if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1651 return 0; 1664 return 0;
1652 switch (a->mode) { 1665 switch (a->mode) {
1653 case MPOL_DEFAULT:
1654 return 1;
1655 case MPOL_BIND: 1666 case MPOL_BIND:
1656 /* Fall through */ 1667 /* Fall through */
1657 case MPOL_INTERLEAVE: 1668 case MPOL_INTERLEAVE:
@@ -1828,7 +1839,7 @@ void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
1828 if (policy != MPOL_DEFAULT) { 1839 if (policy != MPOL_DEFAULT) {
1829 struct mempolicy *newpol; 1840 struct mempolicy *newpol;
1830 1841
1831 /* Falls back to MPOL_DEFAULT on any error */ 1842 /* Falls back to NULL policy [MPOL_DEFAULT] on any error */
1832 newpol = mpol_new(policy, flags, policy_nodes); 1843 newpol = mpol_new(policy, flags, policy_nodes);
1833 if (!IS_ERR(newpol)) { 1844 if (!IS_ERR(newpol)) {
1834 /* Create pseudo-vma that contains just the policy */ 1845 /* Create pseudo-vma that contains just the policy */
@@ -1952,9 +1963,14 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1952 char *p = buffer; 1963 char *p = buffer;
1953 int l; 1964 int l;
1954 nodemask_t nodes; 1965 nodemask_t nodes;
1955 unsigned short mode = pol ? pol->mode : MPOL_DEFAULT; 1966 unsigned short mode;
1956 unsigned short flags = pol ? pol->flags : 0; 1967 unsigned short flags = pol ? pol->flags : 0;
1957 1968
1969 if (!pol || pol == &default_policy)
1970 mode = MPOL_DEFAULT;
1971 else
1972 mode = pol->mode;
1973
1958 switch (mode) { 1974 switch (mode) {
1959 case MPOL_DEFAULT: 1975 case MPOL_DEFAULT:
1960 nodes_clear(nodes); 1976 nodes_clear(nodes);