author		Mel Gorman <mel@csn.ul.ie>			2008-04-28 05:12:18 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-04-28 11:58:19 -0400
commit		19770b32609b6bf97a3dece2529089494cbfc549 (patch)
tree		3b5922d1b20aabdf929bde9309f323841717747a /mm/mempolicy.c
parent		dd1a239f6f2d4d3eedd318583ec319aa145b324c (diff)
mm: filter based on a nodemask as well as a gfp_mask
The MPOL_BIND policy creates a zonelist that is used for allocations
controlled by that mempolicy. As the per-node zonelist is already being
filtered based on a zone id, this patch adds a version of __alloc_pages() that
takes a nodemask for further filtering. This eliminates the need for
MPOL_BIND to create a custom zonelist.
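
In outline, an allocation under a mempolicy now passes the policy's nodemask
alongside the zonelist.  The sketch below condenses that shape; the helper
names match the diff, but the wrapper function itself is hypothetical and not
part of this patch (see alloc_pages_current() in the diff for the real call):

	/* Sketch only: condensed form of the call made in alloc_pages_current(). */
	static struct page *alloc_pages_with_policy(gfp_t gfp, unsigned int order,
						    struct mempolicy *pol)
	{
		/* Local node's distance-ordered zonelist; no custom zonelist. */
		struct zonelist *zl = zonelist_policy(gfp, pol);
		/* Non-NULL only for MPOL_BIND at an eligible gfp_zone(). */
		nodemask_t *nodemask = nodemask_policy(gfp, pol);

		/* The page allocator skips zones on nodes outside *nodemask. */
		return __alloc_pages_nodemask(gfp, order, zl, nodemask);
	}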
A positive benefit of this is that allocations using MPOL_BIND now use the
local node's distance-ordered zonelist instead of a custom node-id-ordered
zonelist. I.e., pages will be allocated from the closest allowed node with
available memory.
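
The slab_node() hunk further down shows the mechanism in miniature: walk the
local node's distance-ordered zonelist and stop at the first zone the nodemask
permits; that zone's node is the closest allowed node.  A condensed sketch
(illustrative only, not the exact hunk):

	/* Sketch only: find the nearest node allowed by an MPOL_BIND nodemask. */
	static int first_allowed_node(nodemask_t *allowed)
	{
		struct zonelist *zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
		struct zone *zone;

		/* Returns the first zone at or below highest_zoneidx on an allowed node. */
		(void)first_zones_zonelist(zonelist, highest_zoneidx, allowed, &zone);
		return zone->node;
	}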
[Lee.Schermerhorn@hp.com: Mempolicy: update stale documentation and comments]
[Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask]
[Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask rework]
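
(The dequeue_huge_page_vma() fixups noted above land in mm/hugetlb.c, which is
outside this diffstat.  Roughly, the caller side of the reworked huge_zonelist()
looks like the sketch below; the function name, gfp argument and loop body are
illustrative only:)

	/* Illustrative only: shape of a hugetlb caller such as dequeue_huge_page_vma(). */
	static struct page *dequeue_huge_page_sketch(struct vm_area_struct *vma,
						     unsigned long address, gfp_t gfp_mask)
	{
		struct mempolicy *mpol;
		nodemask_t *nodemask;
		struct zoneref *z;
		struct zone *zone;
		struct page *page = NULL;
		struct zonelist *zonelist = huge_zonelist(vma, address, gfp_mask,
							  &mpol, &nodemask);

		/* For MPOL_BIND, only zones on nodes in *nodemask are visited. */
		for_each_zone_zonelist_nodemask(zone, z, zonelist,
						MAX_NR_ZONES - 1, nodemask) {
			/* ... try to dequeue a queued huge page from this zone's node ... */
		}

		if (mpol)
			__mpol_free(mpol);	/* drop the reference held for a 'BIND policy */
		return page;
	}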
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--	mm/mempolicy.c	184
1 file changed, 73 insertions(+), 111 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 90193a2a915b..acb5ee3587c3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -163,42 +163,25 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 	return 0;
 }
 
-/* Generate a custom zonelist for the BIND policy. */
-static struct zonelist *bind_zonelist(nodemask_t *nodes)
+/* Check that the nodemask contains at least one populated zone */
+static int is_valid_nodemask(nodemask_t *nodemask)
 {
-	struct zonelist *zl;
-	int num, max, nd;
-	enum zone_type k;
+	int nd, k;
 
-	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-	max++;			/* space for zlcache_ptr (see mmzone.h) */
-	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
-	if (!zl)
-		return ERR_PTR(-ENOMEM);
-	zl->zlcache_ptr = NULL;
-	num = 0;
-	/* First put in the highest zones from all nodes, then all the next
-	   lower zones etc. Avoid empty zones because the memory allocator
-	   doesn't like them. If you implement node hot removal you
-	   have to fix that. */
-	k = MAX_NR_ZONES - 1;
-	while (1) {
-		for_each_node_mask(nd, *nodes) {
-			struct zone *z = &NODE_DATA(nd)->node_zones[k];
-			if (z->present_pages > 0)
-				zoneref_set_zone(z, &zl->_zonerefs[num++]);
-		}
-		if (k == 0)
-			break;
-		k--;
-	}
-	if (num == 0) {
-		kfree(zl);
-		return ERR_PTR(-EINVAL);
-	}
-	zl->_zonerefs[num].zone = NULL;
-	zl->_zonerefs[num].zone_idx = 0;
-	return zl;
+	/* Check that there is something useful in this mask */
+	k = policy_zone;
+
+	for_each_node_mask(nd, *nodemask) {
+		struct zone *z;
+
+		for (k = 0; k <= policy_zone; k++) {
+			z = &NODE_DATA(nd)->node_zones[k];
+			if (z->present_pages > 0)
+				return 1;
+		}
+	}
+
+	return 0;
 }
 
 /* Create a new policy */
@@ -229,12 +212,11 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 		policy->v.preferred_node = -1;
 		break;
 	case MPOL_BIND:
-		policy->v.zonelist = bind_zonelist(nodes);
-		if (IS_ERR(policy->v.zonelist)) {
-			void *error_code = policy->v.zonelist;
+		if (!is_valid_nodemask(nodes)) {
 			kmem_cache_free(policy_cache, policy);
-			return error_code;
+			return ERR_PTR(-EINVAL);
 		}
+		policy->v.nodes = *nodes;
 		break;
 	}
 	policy->policy = mode;
@@ -500,19 +482,12 @@ static long do_set_mempolicy(int mode, nodemask_t *nodes)
 /* Fill a zone bitmap for a policy */
 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 {
-	int i;
-
 	nodes_clear(*nodes);
 	switch (p->policy) {
-	case MPOL_BIND:
-		for (i = 0; p->v.zonelist->_zonerefs[i].zone; i++) {
-			struct zoneref *zref;
-			zref = &p->v.zonelist->_zonerefs[i];
-			node_set(zonelist_node_idx(zref), *nodes);
-		}
-		break;
 	case MPOL_DEFAULT:
 		break;
+	case MPOL_BIND:
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		*nodes = p->v.nodes;
 		break;
@@ -1160,6 +1135,18 @@ static struct mempolicy * get_vma_policy(struct task_struct *task,
 	return pol;
 }
 
+/* Return a nodemask representing a mempolicy */
+static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
+{
+	/* Lower zones don't get a nodemask applied for MPOL_BIND */
+	if (unlikely(policy->policy == MPOL_BIND) &&
+			gfp_zone(gfp) >= policy_zone &&
+			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
+		return &policy->v.nodes;
+
+	return NULL;
+}
+
 /* Return a zonelist representing a mempolicy */
 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 {
@@ -1172,12 +1159,17 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 		nd = numa_node_id();
 		break;
 	case MPOL_BIND:
-		/* Lower zones don't get a policy applied */
-		/* Careful: current->mems_allowed might have moved */
-		if (gfp_zone(gfp) >= policy_zone)
-			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
-				return policy->v.zonelist;
-		/*FALL THROUGH*/
+		/*
+		 * Normally, MPOL_BIND allocations are node-local within the
+		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
+		 * current node isn't part of the mask, we use the zonelist for
+		 * the first node in the mask instead.
+		 */
+		nd = numa_node_id();
+		if (unlikely(gfp & __GFP_THISNODE) &&
+				unlikely(!node_isset(nd, policy->v.nodes)))
+			nd = first_node(policy->v.nodes);
+		break;
 	case MPOL_INTERLEAVE:	/* should not happen */
 	case MPOL_DEFAULT:
 		nd = numa_node_id();
@@ -1220,7 +1212,14 @@ unsigned slab_node(struct mempolicy *policy)
 		 * Follow bind policy behavior and start allocation at the
 		 * first node.
 		 */
-		return zonelist_node_idx(policy->v.zonelist->_zonerefs);
+		struct zonelist *zonelist;
+		struct zone *zone;
+		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
+		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
+		(void)first_zones_zonelist(zonelist, highest_zoneidx,
+							&policy->v.nodes,
+							&zone);
+		return zone->node;
 	}
 
 	case MPOL_PREFERRED:
@@ -1278,25 +1277,31 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
  * @vma = virtual memory area whose policy is sought
  * @addr = address in @vma for shared policy lookup and interleave policy
  * @gfp_flags = for requested zone
- * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
+ * @mpol = pointer to mempolicy pointer for reference counted mempolicy
+ * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
  *
  * Returns a zonelist suitable for a huge page allocation.
- * If the effective policy is 'BIND, returns pointer to policy's zonelist.
+ * If the effective policy is 'BIND, returns pointer to local node's zonelist,
+ * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
  * If it is also a policy for which get_vma_policy() returns an extra
- * reference, we must hold that reference until after allocation.
+ * reference, we must hold that reference until after the allocation.
  * In that case, return policy via @mpol so hugetlb allocation can drop
  * the reference.  For non-'BIND referenced policies, we can/do drop the
  * reference here, so the caller doesn't need to know about the special case
 * for default and current task policy.
 */
 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
-				gfp_t gfp_flags, struct mempolicy **mpol)
+				gfp_t gfp_flags, struct mempolicy **mpol,
+				nodemask_t **nodemask)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
 
 	*mpol = NULL;		/* probably no unref needed */
-	if (pol->policy == MPOL_INTERLEAVE) {
+	*nodemask = NULL;	/* assume !MPOL_BIND */
+	if (pol->policy == MPOL_BIND) {
+		*nodemask = &pol->v.nodes;
+	} else if (pol->policy == MPOL_INTERLEAVE) {
 		unsigned nid;
 
 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
@@ -1376,14 +1381,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 		/*
 		 * slow path: ref counted policy -- shared or vma
 		 */
-		struct page *page = __alloc_pages(gfp, 0, zl);
+		struct page *page = __alloc_pages_nodemask(gfp, 0,
+						zl, nodemask_policy(gfp, pol));
 		__mpol_free(pol);
 		return page;
 	}
 	/*
 	 * fast path: default or task policy
 	 */
-	return __alloc_pages(gfp, 0, zl);
+	return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
 }
 
 /**
@@ -1415,7 +1421,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 		pol = &default_policy;
 	if (pol->policy == MPOL_INTERLEAVE)
 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
-	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
+	return __alloc_pages_nodemask(gfp, order,
+			zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
 }
 EXPORT_SYMBOL(alloc_pages_current);
 
@@ -1440,14 +1447,6 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
 	}
 	*new = *old;
 	atomic_set(&new->refcnt, 1);
-	if (new->policy == MPOL_BIND) {
-		int sz = ksize(old->v.zonelist);
-		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
-		if (!new->v.zonelist) {
-			kmem_cache_free(policy_cache, new);
-			return ERR_PTR(-ENOMEM);
-		}
-	}
 	return new;
 }
 
@@ -1461,21 +1460,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	switch (a->policy) {
 	case MPOL_DEFAULT:
 		return 1;
+	case MPOL_BIND:
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		return nodes_equal(a->v.nodes, b->v.nodes);
 	case MPOL_PREFERRED:
 		return a->v.preferred_node == b->v.preferred_node;
-	case MPOL_BIND: {
-		int i;
-		for (i = 0; a->v.zonelist->_zonerefs[i].zone; i++) {
-			struct zone *za, *zb;
-			za = zonelist_zone(&a->v.zonelist->_zonerefs[i]);
-			zb = zonelist_zone(&b->v.zonelist->_zonerefs[i]);
-			if (za != zb)
-				return 0;
-		}
-		return b->v.zonelist->_zonerefs[i].zone == NULL;
-	}
 	default:
 		BUG();
 		return 0;
@@ -1487,8 +1477,6 @@ void __mpol_free(struct mempolicy *p)
 {
 	if (!atomic_dec_and_test(&p->refcnt))
 		return;
-	if (p->policy == MPOL_BIND)
-		kfree(p->v.zonelist);
 	p->policy = MPOL_DEFAULT;
 	kmem_cache_free(policy_cache, p);
 }
@@ -1779,6 +1767,8 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 	switch (pol->policy) {
 	case MPOL_DEFAULT:
 		break;
+	case MPOL_BIND:
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
 		pol->v.nodes = tmp;
@@ -1791,32 +1781,6 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 						*mpolmask, *newmask);
 		*mpolmask = *newmask;
 		break;
-	case MPOL_BIND: {
-		nodemask_t nodes;
-		struct zoneref *z;
-		struct zonelist *zonelist;
-
-		nodes_clear(nodes);
-		for (z = pol->v.zonelist->_zonerefs; z->zone; z++)
-			node_set(zonelist_node_idx(z), nodes);
-		nodes_remap(tmp, nodes, *mpolmask, *newmask);
-		nodes = tmp;
-
-		zonelist = bind_zonelist(&nodes);
-
-		/* If no mem, then zonelist is NULL and we keep old zonelist.
-		 * If that old zonelist has no remaining mems_allowed nodes,
-		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
-		 */
-
-		if (!IS_ERR(zonelist)) {
-			/* Good - got mem - substitute new zonelist */
-			kfree(pol->v.zonelist);
-			pol->v.zonelist = zonelist;
-		}
-		*mpolmask = *newmask;
-		break;
-	}
 	default:
 		BUG();
 		break;
@@ -1879,9 +1843,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 		break;
 
 	case MPOL_BIND:
-		get_zonemask(pol, &nodes);
-		break;
-
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		nodes = pol->v.nodes;
 		break;