author     Mel Gorman <mel@csn.ul.ie>                      2008-04-28 05:12:18 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2008-04-28 11:58:19 -0400
commit     19770b32609b6bf97a3dece2529089494cbfc549 (patch)
tree       3b5922d1b20aabdf929bde9309f323841717747a /mm
parent     dd1a239f6f2d4d3eedd318583ec319aa145b324c (diff)
mm: filter based on a nodemask as well as a gfp_mask
The MPOL_BIND policy creates a zonelist that is used for allocations
controlled by that mempolicy. As the per-node zonelist is already being
filtered based on a zone id, this patch adds a version of __alloc_pages() that
takes a nodemask for further filtering. This eliminates the need for
MPOL_BIND to create a custom zonelist.
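
For reference, the resulting call pattern looks like the sketch below. This is illustrative only and not part of the patch; gfp_mask, order, zonelist, policy and page are assumed to be in scope:

	/* NULL nodemask: behaves like the old __alloc_pages() */
	page = __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);

	/* MPOL_BIND: same zonelist, but zones whose node is not set in
	 * the nodemask are skipped while walking it */
	page = __alloc_pages_nodemask(gfp_mask, order, zonelist,
				      &policy->v.nodes);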
A positive benefit of this is that allocations using MPOL_BIND now use the
local node's distance-ordered zonelist instead of a custom node-id-ordered
zonelist. I.e., pages will be allocated from the closest allowed node with
available memory.
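
To see why a distance-ordered zonelist plus a nodemask yields the closest allowed node, here is a small standalone userspace sketch. It is not kernel code; the struct and helper exist only for this example:

	#include <stdio.h>

	struct zoneref_demo { int node; };	/* stand-in for struct zoneref */

	/* Node 0's zonelist entries, ordered by increasing distance from node 0 */
	static const struct zoneref_demo zonelist[] = { {0}, {1}, {2}, {3} };

	/* Return the first node in the distance-ordered list allowed by the mask */
	static int first_allowed_node(const struct zoneref_demo *zl, int n,
				      unsigned long nodemask)
	{
		for (int i = 0; i < n; i++)
			if (nodemask & (1UL << zl[i].node))
				return zl[i].node;
		return -1;
	}

	int main(void)
	{
		/* MPOL_BIND to nodes {2,3}: nearest allowed node seen from node 0 is 2 */
		unsigned long bind_mask = (1UL << 2) | (1UL << 3);

		printf("allocate from node %d\n",
		       first_allowed_node(zonelist, 4, bind_mask));
		return 0;
	}

Compiled and run, this prints "allocate from node 2": filtering the local node's distance-ordered list by the bind mask selects the nearest allowed node.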
[Lee.Schermerhorn@hp.com: Mempolicy: update stale documentation and comments]
[Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask]
[Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask rework]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c    |   6
-rw-r--r--  mm/mempolicy.c  | 184
-rw-r--r--  mm/mmzone.c     |  30
-rw-r--r--  mm/page_alloc.c |  50
4 files changed, 143 insertions, 127 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4bced0d705ca..3737d82f5225 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -95,12 +95,14 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 	int nid;
 	struct page *page = NULL;
 	struct mempolicy *mpol;
+	nodemask_t *nodemask;
 	struct zonelist *zonelist = huge_zonelist(vma, address,
-					htlb_alloc_mask, &mpol);
+					htlb_alloc_mask, &mpol, &nodemask);
 	struct zone *zone;
 	struct zoneref *z;
 
-	for_each_zone_zonelist(zone, z, zonelist, MAX_NR_ZONES - 1) {
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+						MAX_NR_ZONES - 1, nodemask) {
 		nid = zone_to_nid(zone);
 		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
 		    !list_empty(&hugepage_freelists[nid])) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 90193a2a915b..acb5ee3587c3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -163,42 +163,25 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 	return 0;
 }
 
-/* Generate a custom zonelist for the BIND policy. */
-static struct zonelist *bind_zonelist(nodemask_t *nodes)
+/* Check that the nodemask contains at least one populated zone */
+static int is_valid_nodemask(nodemask_t *nodemask)
 {
-	struct zonelist *zl;
-	int num, max, nd;
-	enum zone_type k;
+	int nd, k;
 
-	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-	max++;			/* space for zlcache_ptr (see mmzone.h) */
-	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
-	if (!zl)
-		return ERR_PTR(-ENOMEM);
-	zl->zlcache_ptr = NULL;
-	num = 0;
-	/* First put in the highest zones from all nodes, then all the next
-	   lower zones etc. Avoid empty zones because the memory allocator
-	   doesn't like them. If you implement node hot removal you
-	   have to fix that. */
-	k = MAX_NR_ZONES - 1;
-	while (1) {
-		for_each_node_mask(nd, *nodes) {
-			struct zone *z = &NODE_DATA(nd)->node_zones[k];
-			if (z->present_pages > 0)
-				zoneref_set_zone(z, &zl->_zonerefs[num++]);
-		}
-		if (k == 0)
-			break;
-		k--;
-	}
-	if (num == 0) {
-		kfree(zl);
-		return ERR_PTR(-EINVAL);
+	/* Check that there is something useful in this mask */
+	k = policy_zone;
+
+	for_each_node_mask(nd, *nodemask) {
+		struct zone *z;
+
+		for (k = 0; k <= policy_zone; k++) {
+			z = &NODE_DATA(nd)->node_zones[k];
+			if (z->present_pages > 0)
+				return 1;
+		}
 	}
-	zl->_zonerefs[num].zone = NULL;
-	zl->_zonerefs[num].zone_idx = 0;
-	return zl;
+
+	return 0;
 }
 
 /* Create a new policy */
@@ -229,12 +212,11 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 		policy->v.preferred_node = -1;
 		break;
 	case MPOL_BIND:
-		policy->v.zonelist = bind_zonelist(nodes);
-		if (IS_ERR(policy->v.zonelist)) {
-			void *error_code = policy->v.zonelist;
+		if (!is_valid_nodemask(nodes)) {
 			kmem_cache_free(policy_cache, policy);
-			return error_code;
+			return ERR_PTR(-EINVAL);
 		}
+		policy->v.nodes = *nodes;
 		break;
 	}
 	policy->policy = mode;
@@ -500,19 +482,12 @@ static long do_set_mempolicy(int mode, nodemask_t *nodes)
 /* Fill a zone bitmap for a policy */
 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 {
-	int i;
-
 	nodes_clear(*nodes);
 	switch (p->policy) {
-	case MPOL_BIND:
-		for (i = 0; p->v.zonelist->_zonerefs[i].zone; i++) {
-			struct zoneref *zref;
-			zref = &p->v.zonelist->_zonerefs[i];
-			node_set(zonelist_node_idx(zref), *nodes);
-		}
-		break;
 	case MPOL_DEFAULT:
 		break;
+	case MPOL_BIND:
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		*nodes = p->v.nodes;
 		break;
@@ -1160,6 +1135,18 @@ static struct mempolicy * get_vma_policy(struct task_struct *task,
 	return pol;
 }
 
+/* Return a nodemask representing a mempolicy */
+static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
+{
+	/* Lower zones don't get a nodemask applied for MPOL_BIND */
+	if (unlikely(policy->policy == MPOL_BIND) &&
+			gfp_zone(gfp) >= policy_zone &&
+			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
+		return &policy->v.nodes;
+
+	return NULL;
+}
+
 /* Return a zonelist representing a mempolicy */
 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 {
@@ -1172,12 +1159,17 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 			nd = numa_node_id();
 		break;
 	case MPOL_BIND:
-		/* Lower zones don't get a policy applied */
-		/* Careful: current->mems_allowed might have moved */
-		if (gfp_zone(gfp) >= policy_zone)
-			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
-				return policy->v.zonelist;
-		/*FALL THROUGH*/
+		/*
+		 * Normally, MPOL_BIND allocations are node-local within the
+		 * allowed nodemask.  However, if __GFP_THISNODE is set and
+		 * the current node is not part of the mask, we use the
+		 * zonelist for the first node in the mask instead.
+		 */
+		nd = numa_node_id();
+		if (unlikely(gfp & __GFP_THISNODE) &&
+				unlikely(!node_isset(nd, policy->v.nodes)))
+			nd = first_node(policy->v.nodes);
+		break;
 	case MPOL_INTERLEAVE: /* should not happen */
 	case MPOL_DEFAULT:
 		nd = numa_node_id();
@@ -1220,7 +1212,14 @@ unsigned slab_node(struct mempolicy *policy)
 		 * Follow bind policy behavior and start allocation at the
 		 * first node.
 		 */
-		return zonelist_node_idx(policy->v.zonelist->_zonerefs);
+		struct zonelist *zonelist;
+		struct zone *zone;
+		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
+		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
+		(void)first_zones_zonelist(zonelist, highest_zoneidx,
+							&policy->v.nodes,
+							&zone);
+		return zone->node;
 	}
 
 	case MPOL_PREFERRED:
@@ -1278,25 +1277,31 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
  * @vma = virtual memory area whose policy is sought
  * @addr = address in @vma for shared policy lookup and interleave policy
  * @gfp_flags = for requested zone
- * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
+ * @mpol = pointer to mempolicy pointer for reference counted mempolicy
+ * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
  *
  * Returns a zonelist suitable for a huge page allocation.
- * If the effective policy is 'BIND, returns pointer to policy's zonelist.
+ * If the effective policy is 'BIND, returns pointer to local node's zonelist,
+ * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
  * If it is also a policy for which get_vma_policy() returns an extra
- * reference, we must hold that reference until after allocation.
+ * reference, we must hold that reference until after the allocation.
  * In that case, return policy via @mpol so hugetlb allocation can drop
  * the reference.  For non-'BIND referenced policies, we can/do drop the
  * reference here, so the caller doesn't need to know about the special case
  * for default and current task policy.
  */
 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
-				gfp_t gfp_flags, struct mempolicy **mpol)
+				gfp_t gfp_flags, struct mempolicy **mpol,
+				nodemask_t **nodemask)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
 
 	*mpol = NULL;		/* probably no unref needed */
-	if (pol->policy == MPOL_INTERLEAVE) {
+	*nodemask = NULL;	/* assume !MPOL_BIND */
+	if (pol->policy == MPOL_BIND) {
+		*nodemask = &pol->v.nodes;
+	} else if (pol->policy == MPOL_INTERLEAVE) {
 		unsigned nid;
 
 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
@@ -1376,14 +1381,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 		/*
 		 * slow path: ref counted policy -- shared or vma
 		 */
-		struct page *page = __alloc_pages(gfp, 0, zl);
+		struct page *page = __alloc_pages_nodemask(gfp, 0,
+						zl, nodemask_policy(gfp, pol));
 		__mpol_free(pol);
 		return page;
 	}
 	/*
 	 * fast path: default or task policy
 	 */
-	return __alloc_pages(gfp, 0, zl);
+	return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
 }
 
 /**
@@ -1415,7 +1421,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 		pol = &default_policy;
 	if (pol->policy == MPOL_INTERLEAVE)
 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
-	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
+	return __alloc_pages_nodemask(gfp, order,
+			zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
 }
 EXPORT_SYMBOL(alloc_pages_current);
 
@@ -1440,14 +1447,6 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
 	}
 	*new = *old;
 	atomic_set(&new->refcnt, 1);
-	if (new->policy == MPOL_BIND) {
-		int sz = ksize(old->v.zonelist);
-		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
-		if (!new->v.zonelist) {
-			kmem_cache_free(policy_cache, new);
-			return ERR_PTR(-ENOMEM);
-		}
-	}
 	return new;
 }
 
@@ -1461,21 +1460,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	switch (a->policy) {
 	case MPOL_DEFAULT:
 		return 1;
+	case MPOL_BIND:
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		return nodes_equal(a->v.nodes, b->v.nodes);
 	case MPOL_PREFERRED:
 		return a->v.preferred_node == b->v.preferred_node;
-	case MPOL_BIND: {
-		int i;
-		for (i = 0; a->v.zonelist->_zonerefs[i].zone; i++) {
-			struct zone *za, *zb;
-			za = zonelist_zone(&a->v.zonelist->_zonerefs[i]);
-			zb = zonelist_zone(&b->v.zonelist->_zonerefs[i]);
-			if (za != zb)
-				return 0;
-		}
-		return b->v.zonelist->_zonerefs[i].zone == NULL;
-	}
 	default:
 		BUG();
 		return 0;
@@ -1487,8 +1477,6 @@ void __mpol_free(struct mempolicy *p)
 {
 	if (!atomic_dec_and_test(&p->refcnt))
 		return;
-	if (p->policy == MPOL_BIND)
-		kfree(p->v.zonelist);
 	p->policy = MPOL_DEFAULT;
 	kmem_cache_free(policy_cache, p);
 }
@@ -1779,6 +1767,8 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 	switch (pol->policy) {
 	case MPOL_DEFAULT:
 		break;
+	case MPOL_BIND:
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
 		pol->v.nodes = tmp;
@@ -1791,32 +1781,6 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 						*mpolmask, *newmask);
 		*mpolmask = *newmask;
 		break;
-	case MPOL_BIND: {
-		nodemask_t nodes;
-		struct zoneref *z;
-		struct zonelist *zonelist;
-
-		nodes_clear(nodes);
-		for (z = pol->v.zonelist->_zonerefs; z->zone; z++)
-			node_set(zonelist_node_idx(z), nodes);
-		nodes_remap(tmp, nodes, *mpolmask, *newmask);
-		nodes = tmp;
-
-		zonelist = bind_zonelist(&nodes);
-
-		/* If no mem, then zonelist is NULL and we keep old zonelist.
-		 * If that old zonelist has no remaining mems_allowed nodes,
-		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
-		 */
-
-		if (!IS_ERR(zonelist)) {
-			/* Good - got mem - substitute new zonelist */
-			kfree(pol->v.zonelist);
-			pol->v.zonelist = zonelist;
-		}
-		*mpolmask = *newmask;
-		break;
-	}
 	default:
 		BUG();
 		break;
@@ -1879,9 +1843,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 		break;
 
 	case MPOL_BIND:
-		get_zonemask(pol, &nodes);
-		break;
-
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		nodes = pol->v.nodes;
 		break;
diff --git a/mm/mmzone.c b/mm/mmzone.c
index eb5838634f18..486ed595ee6f 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -42,3 +42,33 @@ struct zone *next_zone(struct zone *zone)
 	return zone;
 }
 
+static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
+{
+#ifdef CONFIG_NUMA
+	return node_isset(zonelist_node_idx(zref), *nodes);
+#else
+	return 1;
+#endif /* CONFIG_NUMA */
+}
+
+/* Returns the next zone at or below highest_zoneidx in a zonelist */
+struct zoneref *next_zones_zonelist(struct zoneref *z,
+					enum zone_type highest_zoneidx,
+					nodemask_t *nodes,
+					struct zone **zone)
+{
+	/*
+	 * Find the next suitable zone to use for the allocation.
+	 * Only filter based on nodemask if it's set
+	 */
+	if (likely(nodes == NULL))
+		while (zonelist_zone_idx(z) > highest_zoneidx)
+			z++;
+	else
+		while (zonelist_zone_idx(z) > highest_zoneidx ||
+				(z->zone && !zref_in_nodemask(z, nodes)))
+			z++;
+
+	*zone = zonelist_zone(z++);
+	return z;
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d94d04ea784..b4beb3eea8b7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1377,7 +1377,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
  * a page.
  */
 static struct page *
-get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
+get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 		struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
 {
 	struct zoneref *z;
@@ -1388,16 +1388,17 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 	int zlc_active = 0;		/* set if using zonelist_cache */
 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
 
-	z = first_zones_zonelist(zonelist, high_zoneidx);
-	classzone_idx = zonelist_zone_idx(z);
-	preferred_zone = zonelist_zone(z);
+	(void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
+							&preferred_zone);
+	classzone_idx = zone_idx(preferred_zone);
 
 zonelist_scan:
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+						high_zoneidx, nodemask) {
 		if (NUMA_BUILD && zlc_active &&
 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
 			continue;
@@ -1447,9 +1448,9 @@ try_next_zone:
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
-struct page *
-__alloc_pages(gfp_t gfp_mask, unsigned int order,
-		struct zonelist *zonelist)
+static struct page *
+__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, nodemask_t *nodemask)
 {
 	const gfp_t wait = gfp_mask & __GFP_WAIT;
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
@@ -1478,7 +1479,7 @@ restart:
 		return NULL;
 	}
 
-	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
 	if (page)
 		goto got_pg;
@@ -1523,7 +1524,7 @@ restart:
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	page = get_page_from_freelist(gfp_mask, order, zonelist,
+	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
 			high_zoneidx, alloc_flags);
 	if (page)
 		goto got_pg;
@@ -1536,7 +1537,7 @@ rebalance:
 	if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 nofail_alloc:
 		/* go through the zonelist yet again, ignoring mins */
-		page = get_page_from_freelist(gfp_mask, order,
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
 			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
 		if (page)
 			goto got_pg;
@@ -1571,7 +1572,7 @@ nofail_alloc:
 	drain_all_pages();
 
 	if (likely(did_some_progress)) {
-		page = get_page_from_freelist(gfp_mask, order,
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
 					zonelist, high_zoneidx, alloc_flags);
 		if (page)
 			goto got_pg;
@@ -1587,8 +1588,9 @@ nofail_alloc:
 		 * a parallel oom killing, we must fail if we're still
 		 * under heavy pressure.
 		 */
-		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
-			zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
+		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+			order, zonelist, high_zoneidx,
+			ALLOC_WMARK_HIGH|ALLOC_CPUSET);
 		if (page) {
 			clear_zonelist_oom(zonelist, gfp_mask);
 			goto got_pg;
@@ -1637,6 +1639,20 @@ got_pg:
 	return page;
 }
 
+struct page *
+__alloc_pages(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist)
+{
+	return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
+}
+
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, nodemask_t *nodemask)
+{
+	return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
+}
+
 EXPORT_SYMBOL(__alloc_pages);
 
 /*
@@ -1880,6 +1896,12 @@ void show_free_areas(void)
 	show_swap_cache_info();
 }
 
+static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
+{
+	zoneref->zone = zone;
+	zoneref->zone_idx = zone_idx(zone);
+}
+
 /*
  * Builds allocation fallback zone lists.
  *