author     Mel Gorman <mel@csn.ul.ie>                       2008-04-28 05:12:18 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2008-04-28 11:58:19 -0400
commit     19770b32609b6bf97a3dece2529089494cbfc549
tree       3b5922d1b20aabdf929bde9309f323841717747a   /mm/mempolicy.c
parent     dd1a239f6f2d4d3eedd318583ec319aa145b324c
mm: filter based on a nodemask as well as a gfp_mask
The MPOL_BIND policy creates a zonelist that is used for allocations
controlled by that mempolicy. As the per-node zonelist is already being
filtered based on a zone id, this patch adds a version of __alloc_pages()
that takes a nodemask for further filtering. This eliminates the need for
MPOL_BIND to create a custom zonelist.

A positive benefit of this is that allocations using MPOL_BIND now use the
local node's distance-ordered zonelist instead of a custom node-id-ordered
zonelist. I.e., pages will be allocated from the closest allowed node with
available memory.

[Lee.Schermerhorn@hp.com: Mempolicy: update stale documentation and comments]
[Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask]
[Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask rework]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
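To make the mechanism concrete before the diff: the sketch below is a small
userspace toy model of the filtering this patch describes, not kernel code.
The type and helper names used here (struct zoneref, nodemask_t, node_isset(),
first_allowed_zone()) are simplified stand-ins rather than the kernel's real
definitions. The walk takes the local node's distance-ordered zonelist and
skips any zone that is too high for the request or that sits on a node outside
the policy's nodemask, which is the filtering __alloc_pages_nodemask() is
described as adding for MPOL_BIND.

/*
 * Toy userspace model of the filtering described above: illustrative only,
 * not kernel code.  struct zoneref, nodemask_t, node_isset() and
 * first_allowed_zone() are simplified stand-ins for the kernel's real
 * definitions.
 */
#include <stdbool.h>
#include <stdio.h>

enum zone_type { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM };

struct zoneref {
        int node;               /* node this zone belongs to */
        enum zone_type type;    /* zone type (index) */
        long free_pages;        /* toy stand-in for watermark checks */
};

typedef struct { unsigned long bits; } nodemask_t;

static bool node_isset(int node, const nodemask_t *mask)
{
        return mask->bits & (1UL << node);
}

/*
 * Walk a distance-ordered zonelist and return the first zone that is low
 * enough for the request, on an allowed node (when a nodemask is given),
 * and has memory available.
 */
static const struct zoneref *first_allowed_zone(const struct zoneref *zl,
                int nr, enum zone_type highest, const nodemask_t *nodemask)
{
        for (int i = 0; i < nr; i++) {
                if (zl[i].type > highest)
                        continue;       /* zone too high for this request */
                if (nodemask && !node_isset(zl[i].node, nodemask))
                        continue;       /* node not allowed by the policy */
                if (zl[i].free_pages > 0)
                        return &zl[i];  /* closest allowed zone with memory */
        }
        return NULL;
}

int main(void)
{
        /* Node 0's zonelist, ordered by distance: node 0, then 1, then 2. */
        const struct zoneref zonelist[] = {
                { 0, ZONE_HIGHMEM, 256 },
                { 0, ZONE_NORMAL,  128 },
                { 1, ZONE_NORMAL,  0 },         /* allowed but out of memory */
                { 2, ZONE_NORMAL,  512 },
        };
        /* Policy nodemask covering nodes 1 and 2, as MPOL_BIND would. */
        nodemask_t bind = { .bits = (1UL << 1) | (1UL << 2) };

        const struct zoneref *z = first_allowed_zone(zonelist, 4,
                                                     ZONE_NORMAL, &bind);
        if (z)
                printf("allocate from node %d, zone type %d\n",
                       z->node, (int)z->type);
        return 0;
}

With the mask covering nodes 1 and 2 and the request limited to ZONE_NORMAL,
the walk rejects node 0's zones (highmem too high, normal outside the mask),
skips node 1 because it has no free pages, and settles on node 2, the closest
allowed node with available memory, as the commit message describes.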
Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--   mm/mempolicy.c   184
1 file changed, 73 insertions(+), 111 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 90193a2a915b..acb5ee3587c3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -163,42 +163,25 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
         return 0;
 }
 
-/* Generate a custom zonelist for the BIND policy. */
-static struct zonelist *bind_zonelist(nodemask_t *nodes)
+/* Check that the nodemask contains at least one populated zone */
+static int is_valid_nodemask(nodemask_t *nodemask)
 {
-        struct zonelist *zl;
-        int num, max, nd;
-        enum zone_type k;
+        int nd, k;
 
-        max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-        max++;                  /* space for zlcache_ptr (see mmzone.h) */
-        zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
-        if (!zl)
-                return ERR_PTR(-ENOMEM);
-        zl->zlcache_ptr = NULL;
-        num = 0;
-        /* First put in the highest zones from all nodes, then all the next
-           lower zones etc. Avoid empty zones because the memory allocator
-           doesn't like them. If you implement node hot removal you
-           have to fix that. */
-        k = MAX_NR_ZONES - 1;
-        while (1) {
-                for_each_node_mask(nd, *nodes) {
-                        struct zone *z = &NODE_DATA(nd)->node_zones[k];
-                        if (z->present_pages > 0)
-                                zoneref_set_zone(z, &zl->_zonerefs[num++]);
+        /* Check that there is something useful in this mask */
+        k = policy_zone;
+
+        for_each_node_mask(nd, *nodemask) {
+                struct zone *z;
+
+                for (k = 0; k <= policy_zone; k++) {
+                        z = &NODE_DATA(nd)->node_zones[k];
+                        if (z->present_pages > 0)
+                                return 1;
                 }
-                if (k == 0)
-                        break;
-                k--;
-        }
-        if (num == 0) {
-                kfree(zl);
-                return ERR_PTR(-EINVAL);
         }
-        zl->_zonerefs[num].zone = NULL;
-        zl->_zonerefs[num].zone_idx = 0;
-        return zl;
+
+        return 0;
 }
 
 /* Create a new policy */
@@ -229,12 +212,11 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
                 policy->v.preferred_node = -1;
                 break;
         case MPOL_BIND:
-                policy->v.zonelist = bind_zonelist(nodes);
-                if (IS_ERR(policy->v.zonelist)) {
-                        void *error_code = policy->v.zonelist;
+                if (!is_valid_nodemask(nodes)) {
                         kmem_cache_free(policy_cache, policy);
-                        return error_code;
+                        return ERR_PTR(-EINVAL);
                 }
+                policy->v.nodes = *nodes;
                 break;
         }
         policy->policy = mode;
@@ -500,19 +482,12 @@ static long do_set_mempolicy(int mode, nodemask_t *nodes)
 /* Fill a zone bitmap for a policy */
 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 {
-        int i;
-
         nodes_clear(*nodes);
         switch (p->policy) {
-        case MPOL_BIND:
-                for (i = 0; p->v.zonelist->_zonerefs[i].zone; i++) {
-                        struct zoneref *zref;
-                        zref = &p->v.zonelist->_zonerefs[i];
-                        node_set(zonelist_node_idx(zref), *nodes);
-                }
-                break;
         case MPOL_DEFAULT:
                 break;
+        case MPOL_BIND:
+                /* Fall through */
         case MPOL_INTERLEAVE:
                 *nodes = p->v.nodes;
                 break;
@@ -1160,6 +1135,18 @@ static struct mempolicy * get_vma_policy(struct task_struct *task,
         return pol;
 }
 
+/* Return a nodemask representing a mempolicy */
+static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
+{
+        /* Lower zones don't get a nodemask applied for MPOL_BIND */
+        if (unlikely(policy->policy == MPOL_BIND) &&
+                        gfp_zone(gfp) >= policy_zone &&
+                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
+                return &policy->v.nodes;
+
+        return NULL;
+}
+
 /* Return a zonelist representing a mempolicy */
 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 {
@@ -1172,12 +1159,17 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
                 nd = numa_node_id();
                 break;
         case MPOL_BIND:
-                /* Lower zones don't get a policy applied */
-                /* Careful: current->mems_allowed might have moved */
-                if (gfp_zone(gfp) >= policy_zone)
-                        if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
-                                return policy->v.zonelist;
-                /*FALL THROUGH*/
+                /*
+                 * Normally, MPOL_BIND allocations are node-local within the
+                 * allowed nodemask.  However, if __GFP_THISNODE is set and the
+                 * current node is not part of the mask, we use the zonelist
+                 * for the first node in the mask instead.
+                 */
+                nd = numa_node_id();
+                if (unlikely(gfp & __GFP_THISNODE) &&
+                                unlikely(!node_isset(nd, policy->v.nodes)))
+                        nd = first_node(policy->v.nodes);
+                break;
         case MPOL_INTERLEAVE:   /* should not happen */
         case MPOL_DEFAULT:
                 nd = numa_node_id();
@@ -1220,7 +1212,14 @@ unsigned slab_node(struct mempolicy *policy)
                  * Follow bind policy behavior and start allocation at the
                  * first node.
                  */
-                return zonelist_node_idx(policy->v.zonelist->_zonerefs);
+                struct zonelist *zonelist;
+                struct zone *zone;
+                enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
+                zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
+                (void)first_zones_zonelist(zonelist, highest_zoneidx,
+                                                        &policy->v.nodes,
+                                                        &zone);
+                return zone->node;
         }
 
         case MPOL_PREFERRED:
@@ -1278,25 +1277,31 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
  * @vma = virtual memory area whose policy is sought
  * @addr = address in @vma for shared policy lookup and interleave policy
  * @gfp_flags = for requested zone
- * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
+ * @mpol = pointer to mempolicy pointer for reference counted mempolicy
+ * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
  *
  * Returns a zonelist suitable for a huge page allocation.
- * If the effective policy is 'BIND, returns pointer to policy's zonelist.
+ * If the effective policy is 'BIND, returns pointer to local node's zonelist,
+ * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
  * If it is also a policy for which get_vma_policy() returns an extra
- * reference, we must hold that reference until after allocation.
+ * reference, we must hold that reference until after the allocation.
  * In that case, return policy via @mpol so hugetlb allocation can drop
  * the reference. For non-'BIND referenced policies, we can/do drop the
  * reference here, so the caller doesn't need to know about the special case
  * for default and current task policy.
  */
 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
-                                gfp_t gfp_flags, struct mempolicy **mpol)
+                                gfp_t gfp_flags, struct mempolicy **mpol,
+                                nodemask_t **nodemask)
 {
         struct mempolicy *pol = get_vma_policy(current, vma, addr);
         struct zonelist *zl;
 
         *mpol = NULL;           /* probably no unref needed */
-        if (pol->policy == MPOL_INTERLEAVE) {
+        *nodemask = NULL;       /* assume !MPOL_BIND */
+        if (pol->policy == MPOL_BIND) {
+                *nodemask = &pol->v.nodes;
+        } else if (pol->policy == MPOL_INTERLEAVE) {
                 unsigned nid;
 
                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
@@ -1376,14 +1381,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
                 /*
                  * slow path: ref counted policy -- shared or vma
                  */
-                struct page *page = __alloc_pages(gfp, 0, zl);
+                struct page *page = __alloc_pages_nodemask(gfp, 0,
+                                                zl, nodemask_policy(gfp, pol));
                 __mpol_free(pol);
                 return page;
         }
         /*
          * fast path: default or task policy
          */
-        return __alloc_pages(gfp, 0, zl);
+        return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
 }
 
 /**
@@ -1415,7 +1421,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
                 pol = &default_policy;
         if (pol->policy == MPOL_INTERLEAVE)
                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
-        return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
+        return __alloc_pages_nodemask(gfp, order,
+                        zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
 }
 EXPORT_SYMBOL(alloc_pages_current);
 
@@ -1440,14 +1447,6 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
         }
         *new = *old;
         atomic_set(&new->refcnt, 1);
-        if (new->policy == MPOL_BIND) {
-                int sz = ksize(old->v.zonelist);
-                new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
-                if (!new->v.zonelist) {
-                        kmem_cache_free(policy_cache, new);
-                        return ERR_PTR(-ENOMEM);
-                }
-        }
         return new;
 }
 
@@ -1461,21 +1460,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
         switch (a->policy) {
         case MPOL_DEFAULT:
                 return 1;
+        case MPOL_BIND:
+                /* Fall through */
         case MPOL_INTERLEAVE:
                 return nodes_equal(a->v.nodes, b->v.nodes);
         case MPOL_PREFERRED:
                 return a->v.preferred_node == b->v.preferred_node;
-        case MPOL_BIND: {
-                int i;
-                for (i = 0; a->v.zonelist->_zonerefs[i].zone; i++) {
-                        struct zone *za, *zb;
-                        za = zonelist_zone(&a->v.zonelist->_zonerefs[i]);
-                        zb = zonelist_zone(&b->v.zonelist->_zonerefs[i]);
-                        if (za != zb)
-                                return 0;
-                }
-                return b->v.zonelist->_zonerefs[i].zone == NULL;
-        }
         default:
                 BUG();
                 return 0;
@@ -1487,8 +1477,6 @@ void __mpol_free(struct mempolicy *p)
 {
         if (!atomic_dec_and_test(&p->refcnt))
                 return;
-        if (p->policy == MPOL_BIND)
-                kfree(p->v.zonelist);
         p->policy = MPOL_DEFAULT;
         kmem_cache_free(policy_cache, p);
 }
@@ -1779,6 +1767,8 @@ static void mpol_rebind_policy(struct mempolicy *pol,
         switch (pol->policy) {
         case MPOL_DEFAULT:
                 break;
+        case MPOL_BIND:
+                /* Fall through */
         case MPOL_INTERLEAVE:
                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
                 pol->v.nodes = tmp;
@@ -1791,32 +1781,6 @@ static void mpol_rebind_policy(struct mempolicy *pol,
                                                 *mpolmask, *newmask);
                 *mpolmask = *newmask;
                 break;
-        case MPOL_BIND: {
-                nodemask_t nodes;
-                struct zoneref *z;
-                struct zonelist *zonelist;
-
-                nodes_clear(nodes);
-                for (z = pol->v.zonelist->_zonerefs; z->zone; z++)
-                        node_set(zonelist_node_idx(z), nodes);
-                nodes_remap(tmp, nodes, *mpolmask, *newmask);
-                nodes = tmp;
-
-                zonelist = bind_zonelist(&nodes);
-
-                /* If no mem, then zonelist is NULL and we keep old zonelist.
-                 * If that old zonelist has no remaining mems_allowed nodes,
-                 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
-                 */
-
-                if (!IS_ERR(zonelist)) {
-                        /* Good - got mem - substitute new zonelist */
-                        kfree(pol->v.zonelist);
-                        pol->v.zonelist = zonelist;
-                }
-                *mpolmask = *newmask;
-                break;
-        }
         default:
                 BUG();
                 break;
@@ -1879,9 +1843,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
                 break;
 
         case MPOL_BIND:
-                get_zonemask(pol, &nodes);
-                break;
-
+                /* Fall through */
         case MPOL_INTERLEAVE:
                 nodes = pol->v.nodes;
                 break;