author		Mel Gorman <mel@csn.ul.ie>	2008-04-28 05:12:18 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-04-28 11:58:19 -0400
commit		19770b32609b6bf97a3dece2529089494cbfc549 (patch)
tree		3b5922d1b20aabdf929bde9309f323841717747a /mm
parent		dd1a239f6f2d4d3eedd318583ec319aa145b324c (diff)
mm: filter based on a nodemask as well as a gfp_mask
The MPOL_BIND policy creates a zonelist that is used for allocations
controlled by that mempolicy.  As the per-node zonelist is already being
filtered based on a zone id, this patch adds a version of __alloc_pages()
that takes a nodemask for further filtering.  This eliminates the need
for MPOL_BIND to create a custom zonelist.

A positive benefit of this is that allocations using MPOL_BIND now use the
local node's distance-ordered zonelist instead of a custom node-id-ordered
zonelist.  I.e., pages will be allocated from the closest allowed node with
available memory.

[Lee.Schermerhorn@hp.com: Mempolicy: update stale documentation and comments]
[Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask]
[Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask rework]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
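[Editorial illustration, not part of the patch: the minimal userspace sketch
below sets up the kind of MPOL_BIND policy whose allocations this change
affects.  It assumes libnuma's <numaif.h> wrapper for the set_mempolicy()
syscall and a NUMA machine with node 0 online; with the patch applied, the
pages faulted in below come from the local node's distance-ordered zonelist,
filtered by the bound nodemask.  Build with: gcc demo.c -lnuma]

#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	unsigned long nodemask = 1UL << 0;	/* allow allocations from node 0 only */
	size_t len = 4UL << 20;			/* 4MB of anonymous memory */
	char *buf;

	/* All future allocations by this task must come from the nodemask */
	if (set_mempolicy(MPOL_BIND, &nodemask, 8 * sizeof(nodemask)) != 0) {
		perror("set_mempolicy");
		return EXIT_FAILURE;
	}

	buf = malloc(len);
	if (!buf)
		return EXIT_FAILURE;
	memset(buf, 0, len);	/* page faults allocate under MPOL_BIND */

	printf("faulted in %zu bytes under MPOL_BIND\n", len);
	free(buf);
	return EXIT_SUCCESS;
}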
Diffstat (limited to 'mm')
-rw-r--r--	mm/hugetlb.c	6
-rw-r--r--	mm/mempolicy.c	184
-rw-r--r--	mm/mmzone.c	30
-rw-r--r--	mm/page_alloc.c	50
4 files changed, 143 insertions, 127 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4bced0d705ca..3737d82f5225 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -95,12 +95,14 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 	int nid;
 	struct page *page = NULL;
 	struct mempolicy *mpol;
+	nodemask_t *nodemask;
 	struct zonelist *zonelist = huge_zonelist(vma, address,
-					htlb_alloc_mask, &mpol);
+					htlb_alloc_mask, &mpol, &nodemask);
 	struct zone *zone;
 	struct zoneref *z;
 
-	for_each_zone_zonelist(zone, z, zonelist, MAX_NR_ZONES - 1) {
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+						MAX_NR_ZONES - 1, nodemask) {
 		nid = zone_to_nid(zone);
 		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
 		    !list_empty(&hugepage_freelists[nid])) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 90193a2a915b..acb5ee3587c3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -163,42 +163,25 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 	return 0;
 }
 
-/* Generate a custom zonelist for the BIND policy. */
-static struct zonelist *bind_zonelist(nodemask_t *nodes)
-{
-	struct zonelist *zl;
-	int num, max, nd;
-	enum zone_type k;
-
-	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-	max++;			/* space for zlcache_ptr (see mmzone.h) */
-	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
-	if (!zl)
-		return ERR_PTR(-ENOMEM);
-	zl->zlcache_ptr = NULL;
-	num = 0;
-	/* First put in the highest zones from all nodes, then all the next
-	   lower zones etc. Avoid empty zones because the memory allocator
-	   doesn't like them. If you implement node hot removal you
-	   have to fix that. */
-	k = MAX_NR_ZONES - 1;
-	while (1) {
-		for_each_node_mask(nd, *nodes) {
-			struct zone *z = &NODE_DATA(nd)->node_zones[k];
-			if (z->present_pages > 0)
-				zoneref_set_zone(z, &zl->_zonerefs[num++]);
-		}
-		if (k == 0)
-			break;
-		k--;
-	}
-	if (num == 0) {
-		kfree(zl);
-		return ERR_PTR(-EINVAL);
-	}
-	zl->_zonerefs[num].zone = NULL;
-	zl->_zonerefs[num].zone_idx = 0;
-	return zl;
-}
+/* Check that the nodemask contains at least one populated zone */
+static int is_valid_nodemask(nodemask_t *nodemask)
+{
+	int nd, k;
+
+	/* Check that there is something useful in this mask */
+	k = policy_zone;
+
+	for_each_node_mask(nd, *nodemask) {
+		struct zone *z;
+
+		for (k = 0; k <= policy_zone; k++) {
+			z = &NODE_DATA(nd)->node_zones[k];
+			if (z->present_pages > 0)
+				return 1;
+		}
+	}
+
+	return 0;
+}
 
 /* Create a new policy */
@@ -229,12 +212,11 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 		policy->v.preferred_node = -1;
 		break;
 	case MPOL_BIND:
-		policy->v.zonelist = bind_zonelist(nodes);
-		if (IS_ERR(policy->v.zonelist)) {
-			void *error_code = policy->v.zonelist;
+		if (!is_valid_nodemask(nodes)) {
 			kmem_cache_free(policy_cache, policy);
-			return error_code;
+			return ERR_PTR(-EINVAL);
 		}
+		policy->v.nodes = *nodes;
 		break;
 	}
 	policy->policy = mode;
@@ -500,19 +482,12 @@ static long do_set_mempolicy(int mode, nodemask_t *nodes)
 /* Fill a zone bitmap for a policy */
 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 {
-	int i;
-
 	nodes_clear(*nodes);
 	switch (p->policy) {
-	case MPOL_BIND:
-		for (i = 0; p->v.zonelist->_zonerefs[i].zone; i++) {
-			struct zoneref *zref;
-			zref = &p->v.zonelist->_zonerefs[i];
-			node_set(zonelist_node_idx(zref), *nodes);
-		}
-		break;
 	case MPOL_DEFAULT:
 		break;
+	case MPOL_BIND:
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		*nodes = p->v.nodes;
 		break;
@@ -1160,6 +1135,18 @@ static struct mempolicy * get_vma_policy(struct task_struct *task,
 	return pol;
 }
 
+/* Return a nodemask representing a mempolicy */
+static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
+{
+	/* Lower zones don't get a nodemask applied for MPOL_BIND */
+	if (unlikely(policy->policy == MPOL_BIND) &&
+			gfp_zone(gfp) >= policy_zone &&
+			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
+		return &policy->v.nodes;
+
+	return NULL;
+}
+
 /* Return a zonelist representing a mempolicy */
 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 {
@@ -1172,12 +1159,17 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 		nd = numa_node_id();
 		break;
 	case MPOL_BIND:
-		/* Lower zones don't get a policy applied */
-		/* Careful: current->mems_allowed might have moved */
-		if (gfp_zone(gfp) >= policy_zone)
-			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
-				return policy->v.zonelist;
-		/*FALL THROUGH*/
+		/*
+		 * Normally, MPOL_BIND allocations are node-local within the
+		 * allowed nodemask.  However, if __GFP_THISNODE is set and the
+		 * current node isn't part of the mask, we use the zonelist for
+		 * the first node in the mask instead.
+		 */
+		nd = numa_node_id();
+		if (unlikely(gfp & __GFP_THISNODE) &&
+				unlikely(!node_isset(nd, policy->v.nodes)))
+			nd = first_node(policy->v.nodes);
+		break;
 	case MPOL_INTERLEAVE: /* should not happen */
 	case MPOL_DEFAULT:
 		nd = numa_node_id();
@@ -1220,7 +1212,14 @@ unsigned slab_node(struct mempolicy *policy)
 		 * Follow bind policy behavior and start allocation at the
 		 * first node.
 		 */
-		return zonelist_node_idx(policy->v.zonelist->_zonerefs);
+		struct zonelist *zonelist;
+		struct zone *zone;
+		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
+		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
+		(void)first_zones_zonelist(zonelist, highest_zoneidx,
+							&policy->v.nodes,
+							&zone);
+		return zone->node;
 	}
 
 	case MPOL_PREFERRED:
@@ -1278,25 +1277,31 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
  * @vma = virtual memory area whose policy is sought
  * @addr = address in @vma for shared policy lookup and interleave policy
  * @gfp_flags = for requested zone
- * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
+ * @mpol = pointer to mempolicy pointer for reference counted mempolicy
+ * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
  *
  * Returns a zonelist suitable for a huge page allocation.
- * If the effective policy is 'BIND, returns pointer to policy's zonelist.
+ * If the effective policy is 'BIND, returns pointer to local node's zonelist,
+ * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
  * If it is also a policy for which get_vma_policy() returns an extra
- * reference, we must hold that reference until after allocation.
+ * reference, we must hold that reference until after the allocation.
  * In that case, return policy via @mpol so hugetlb allocation can drop
  * the reference.  For non-'BIND referenced policies, we can/do drop the
  * reference here, so the caller doesn't need to know about the special case
  * for default and current task policy.
  */
 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
-				gfp_t gfp_flags, struct mempolicy **mpol)
+				gfp_t gfp_flags, struct mempolicy **mpol,
+				nodemask_t **nodemask)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
 
 	*mpol = NULL;		/* probably no unref needed */
-	if (pol->policy == MPOL_INTERLEAVE) {
+	*nodemask = NULL;	/* assume !MPOL_BIND */
+	if (pol->policy == MPOL_BIND) {
+		*nodemask = &pol->v.nodes;
+	} else if (pol->policy == MPOL_INTERLEAVE) {
 		unsigned nid;
 
 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
@@ -1376,14 +1381,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 		/*
 		 * slow path: ref counted policy -- shared or vma
 		 */
-		struct page *page = __alloc_pages(gfp, 0, zl);
+		struct page *page = __alloc_pages_nodemask(gfp, 0,
+						zl, nodemask_policy(gfp, pol));
 		__mpol_free(pol);
 		return page;
 	}
 	/*
 	 * fast path:  default or task policy
 	 */
-	return __alloc_pages(gfp, 0, zl);
+	return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
 }
 
 /**
@@ -1415,7 +1421,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 		pol = &default_policy;
 	if (pol->policy == MPOL_INTERLEAVE)
 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
-	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
+	return __alloc_pages_nodemask(gfp, order,
+			zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
 }
 EXPORT_SYMBOL(alloc_pages_current);
 
@@ -1440,14 +1447,6 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
 	}
 	*new = *old;
 	atomic_set(&new->refcnt, 1);
-	if (new->policy == MPOL_BIND) {
-		int sz = ksize(old->v.zonelist);
-		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
-		if (!new->v.zonelist) {
-			kmem_cache_free(policy_cache, new);
-			return ERR_PTR(-ENOMEM);
-		}
-	}
 	return new;
 }
 
@@ -1461,21 +1460,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	switch (a->policy) {
 	case MPOL_DEFAULT:
 		return 1;
+	case MPOL_BIND:
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		return nodes_equal(a->v.nodes, b->v.nodes);
 	case MPOL_PREFERRED:
 		return a->v.preferred_node == b->v.preferred_node;
-	case MPOL_BIND: {
-		int i;
-		for (i = 0; a->v.zonelist->_zonerefs[i].zone; i++) {
-			struct zone *za, *zb;
-			za = zonelist_zone(&a->v.zonelist->_zonerefs[i]);
-			zb = zonelist_zone(&b->v.zonelist->_zonerefs[i]);
-			if (za != zb)
-				return 0;
-		}
-		return b->v.zonelist->_zonerefs[i].zone == NULL;
-	}
 	default:
 		BUG();
 		return 0;
@@ -1487,8 +1477,6 @@ void __mpol_free(struct mempolicy *p)
 {
 	if (!atomic_dec_and_test(&p->refcnt))
 		return;
-	if (p->policy == MPOL_BIND)
-		kfree(p->v.zonelist);
 	p->policy = MPOL_DEFAULT;
 	kmem_cache_free(policy_cache, p);
 }
@@ -1779,6 +1767,8 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 	switch (pol->policy) {
 	case MPOL_DEFAULT:
 		break;
+	case MPOL_BIND:
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
 		pol->v.nodes = tmp;
@@ -1791,32 +1781,6 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 				*mpolmask, *newmask);
 		*mpolmask = *newmask;
 		break;
-	case MPOL_BIND: {
-		nodemask_t nodes;
-		struct zoneref *z;
-		struct zonelist *zonelist;
-
-		nodes_clear(nodes);
-		for (z = pol->v.zonelist->_zonerefs; z->zone; z++)
-			node_set(zonelist_node_idx(z), nodes);
-		nodes_remap(tmp, nodes, *mpolmask, *newmask);
-		nodes = tmp;
-
-		zonelist = bind_zonelist(&nodes);
-
-		/* If no mem, then zonelist is NULL and we keep old zonelist.
-		 * If that old zonelist has no remaining mems_allowed nodes,
-		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
-		 */
-
-		if (!IS_ERR(zonelist)) {
-			/* Good - got mem - substitute new zonelist */
-			kfree(pol->v.zonelist);
-			pol->v.zonelist = zonelist;
-		}
-		*mpolmask = *newmask;
-		break;
-	}
 	default:
 		BUG();
 		break;
@@ -1879,9 +1843,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 		break;
 
 	case MPOL_BIND:
-		get_zonemask(pol, &nodes);
-		break;
-
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		nodes = pol->v.nodes;
 		break;
diff --git a/mm/mmzone.c b/mm/mmzone.c
index eb5838634f18..486ed595ee6f 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -42,3 +42,33 @@ struct zone *next_zone(struct zone *zone)
 	return zone;
 }
 
+static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
+{
+#ifdef CONFIG_NUMA
+	return node_isset(zonelist_node_idx(zref), *nodes);
+#else
+	return 1;
+#endif /* CONFIG_NUMA */
+}
+
+/* Returns the next zone at or below highest_zoneidx in a zonelist */
+struct zoneref *next_zones_zonelist(struct zoneref *z,
+					enum zone_type highest_zoneidx,
+					nodemask_t *nodes,
+					struct zone **zone)
+{
+	/*
+	 * Find the next suitable zone to use for the allocation.
+	 * Only filter based on nodemask if it's set
+	 */
+	if (likely(nodes == NULL))
+		while (zonelist_zone_idx(z) > highest_zoneidx)
+			z++;
+	else
+		while (zonelist_zone_idx(z) > highest_zoneidx ||
+				(z->zone && !zref_in_nodemask(z, nodes)))
+			z++;
+
+	*zone = zonelist_zone(z++);
+	return z;
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d94d04ea784..b4beb3eea8b7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1377,7 +1377,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
  * a page.
  */
 static struct page *
-get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
+get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 		struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
 {
 	struct zoneref *z;
@@ -1388,16 +1388,17 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 	int zlc_active = 0;		/* set if using zonelist_cache */
 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
 
-	z = first_zones_zonelist(zonelist, high_zoneidx);
-	classzone_idx = zonelist_zone_idx(z);
-	preferred_zone = zonelist_zone(z);
+	(void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
+							&preferred_zone);
+	classzone_idx = zone_idx(preferred_zone);
 
 zonelist_scan:
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+						high_zoneidx, nodemask) {
 		if (NUMA_BUILD && zlc_active &&
 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
 				continue;
@@ -1447,9 +1448,9 @@ try_next_zone:
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
-struct page *
-__alloc_pages(gfp_t gfp_mask, unsigned int order,
-			struct zonelist *zonelist)
+static struct page *
+__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
+			struct zonelist *zonelist, nodemask_t *nodemask)
 {
 	const gfp_t wait = gfp_mask & __GFP_WAIT;
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
@@ -1478,7 +1479,7 @@ restart:
 		return NULL;
 	}
 
-	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
 	if (page)
 		goto got_pg;
@@ -1523,7 +1524,7 @@ restart:
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	page = get_page_from_freelist(gfp_mask, order, zonelist,
+	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
 						high_zoneidx, alloc_flags);
 	if (page)
 		goto got_pg;
@@ -1536,7 +1537,7 @@ rebalance:
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 nofail_alloc:
 			/* go through the zonelist yet again, ignoring mins */
-			page = get_page_from_freelist(gfp_mask, order,
+			page = get_page_from_freelist(gfp_mask, nodemask, order,
 				zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
 			if (page)
 				goto got_pg;
@@ -1571,7 +1572,7 @@ nofail_alloc:
 	drain_all_pages();
 
 	if (likely(did_some_progress)) {
-		page = get_page_from_freelist(gfp_mask, order,
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
 					zonelist, high_zoneidx, alloc_flags);
 		if (page)
 			goto got_pg;
@@ -1587,8 +1588,9 @@ nofail_alloc:
 		 * a parallel oom killing, we must fail if we're still
 		 * under heavy pressure.
 		 */
-		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
-			zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
+		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+			order, zonelist, high_zoneidx,
+			ALLOC_WMARK_HIGH|ALLOC_CPUSET);
 		if (page) {
 			clear_zonelist_oom(zonelist, gfp_mask);
 			goto got_pg;
@@ -1637,6 +1639,20 @@ got_pg:
 	return page;
 }
 
+struct page *
+__alloc_pages(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist)
+{
+	return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
+}
+
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, nodemask_t *nodemask)
+{
+	return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
+}
+
 EXPORT_SYMBOL(__alloc_pages);
 
 /*
@@ -1880,6 +1896,12 @@ void show_free_areas(void)
 	show_swap_cache_info();
 }
 
+static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
+{
+	zoneref->zone = zone;
+	zoneref->zone_idx = zone_idx(zone);
+}
+
 /*
  * Builds allocation fallback zone lists.
  *