aboutsummaryrefslogtreecommitdiffstats
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
author: Mel Gorman <mel@csn.ul.ie> 2008-04-28 05:12:16 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2008-04-28 11:58:18 -0400
commit: 54a6eb5c4765aa573a030ceeba2c14e3d2ea5706 (patch)
tree: 547176a090beb787722a153cf2b8b942dc0e68db /mm/page_alloc.c
parent: 18ea7e710d2452fa726814a406779188028cf1bf (diff)
mm: use two zonelists that are filtered by GFP mask
Currently a node has two sets of zonelists, one for each zone type in the system and a second set for GFP_THISNODE allocations. Based on the zones allowed by a gfp mask, one of these zonelists is selected. All of these zonelists consume memory and occupy cache lines.

This patch replaces the multiple zonelists per-node with two zonelists. The first contains all populated zones in the system, ordered by distance, for fallback allocations when the target/preferred node has no free pages. The second contains all populated zones in the node suitable for GFP_THISNODE allocations.

An iterator macro called for_each_zone_zonelist() is introduced that iterates through each zone allowed by the GFP flags in the selected zonelist.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- mm/page_alloc.c 170
1 files changed, 73 insertions, 97 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 187efd47a446..4ccb8651cf22 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1378,42 +1378,29 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1378 */ 1378 */
1379static struct page * 1379static struct page *
1380get_page_from_freelist(gfp_t gfp_mask, unsigned int order, 1380get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
1381 struct zonelist *zonelist, int alloc_flags) 1381 struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
1382{ 1382{
1383 struct zone **z; 1383 struct zone **z;
1384 struct page *page = NULL; 1384 struct page *page = NULL;
1385 int classzone_idx = zone_idx(zonelist->zones[0]); 1385 int classzone_idx;
1386 struct zone *zone, *preferred_zone; 1386 struct zone *zone, *preferred_zone;
1387 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1387 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1388 int zlc_active = 0; /* set if using zonelist_cache */ 1388 int zlc_active = 0; /* set if using zonelist_cache */
1389 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1389 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1390 enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */ 1390
1391 z = first_zones_zonelist(zonelist, high_zoneidx);
1392 classzone_idx = zone_idx(*z);
1393 preferred_zone = *z;
1391 1394
1392zonelist_scan: 1395zonelist_scan:
1393 /* 1396 /*
1394 * Scan zonelist, looking for a zone with enough free. 1397 * Scan zonelist, looking for a zone with enough free.
1395 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1398 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1396 */ 1399 */
1397 z = zonelist->zones; 1400 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1398 preferred_zone = *z;
1399
1400 do {
1401 /*
1402 * In NUMA, this could be a policy zonelist which contains
1403 * zones that may not be allowed by the current gfp_mask.
1404 * Check the zone is allowed by the current flags
1405 */
1406 if (unlikely(alloc_should_filter_zonelist(zonelist))) {
1407 if (highest_zoneidx == -1)
1408 highest_zoneidx = gfp_zone(gfp_mask);
1409 if (zone_idx(*z) > highest_zoneidx)
1410 continue;
1411 }
1412
1413 if (NUMA_BUILD && zlc_active && 1401 if (NUMA_BUILD && zlc_active &&
1414 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1402 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1415 continue; 1403 continue;
1416 zone = *z;
1417 if ((alloc_flags & ALLOC_CPUSET) && 1404 if ((alloc_flags & ALLOC_CPUSET) &&
1418 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1405 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1419 goto try_next_zone; 1406 goto try_next_zone;
@@ -1447,7 +1434,7 @@ try_next_zone:
1447 zlc_active = 1; 1434 zlc_active = 1;
1448 did_zlc_setup = 1; 1435 did_zlc_setup = 1;
1449 } 1436 }
1450 } while (*(++z) != NULL); 1437 }
1451 1438
1452 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1439 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1453 /* Disable zlc cache for second zonelist scan */ 1440 /* Disable zlc cache for second zonelist scan */
@@ -1465,6 +1452,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
1465 struct zonelist *zonelist) 1452 struct zonelist *zonelist)
1466{ 1453{
1467 const gfp_t wait = gfp_mask & __GFP_WAIT; 1454 const gfp_t wait = gfp_mask & __GFP_WAIT;
1455 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1468 struct zone **z; 1456 struct zone **z;
1469 struct page *page; 1457 struct page *page;
1470 struct reclaim_state reclaim_state; 1458 struct reclaim_state reclaim_state;
@@ -1490,7 +1478,7 @@ restart:
1490 } 1478 }
1491 1479
1492 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1480 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
1493 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1481 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
1494 if (page) 1482 if (page)
1495 goto got_pg; 1483 goto got_pg;
1496 1484
@@ -1534,7 +1522,8 @@ restart:
1534 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 1522 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1535 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1523 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1536 */ 1524 */
1537 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); 1525 page = get_page_from_freelist(gfp_mask, order, zonelist,
1526 high_zoneidx, alloc_flags);
1538 if (page) 1527 if (page)
1539 goto got_pg; 1528 goto got_pg;
1540 1529
@@ -1547,7 +1536,7 @@ rebalance:
1547nofail_alloc: 1536nofail_alloc:
1548 /* go through the zonelist yet again, ignoring mins */ 1537 /* go through the zonelist yet again, ignoring mins */
1549 page = get_page_from_freelist(gfp_mask, order, 1538 page = get_page_from_freelist(gfp_mask, order,
1550 zonelist, ALLOC_NO_WATERMARKS); 1539 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
1551 if (page) 1540 if (page)
1552 goto got_pg; 1541 goto got_pg;
1553 if (gfp_mask & __GFP_NOFAIL) { 1542 if (gfp_mask & __GFP_NOFAIL) {
@@ -1582,7 +1571,7 @@ nofail_alloc:
1582 1571
1583 if (likely(did_some_progress)) { 1572 if (likely(did_some_progress)) {
1584 page = get_page_from_freelist(gfp_mask, order, 1573 page = get_page_from_freelist(gfp_mask, order,
1585 zonelist, alloc_flags); 1574 zonelist, high_zoneidx, alloc_flags);
1586 if (page) 1575 if (page)
1587 goto got_pg; 1576 goto got_pg;
1588 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 1577 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
@@ -1598,7 +1587,7 @@ nofail_alloc:
1598 * under heavy pressure. 1587 * under heavy pressure.
1599 */ 1588 */
1600 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1589 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
1601 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); 1590 zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1602 if (page) { 1591 if (page) {
1603 clear_zonelist_oom(zonelist); 1592 clear_zonelist_oom(zonelist);
1604 goto got_pg; 1593 goto got_pg;
@@ -1713,14 +1702,15 @@ EXPORT_SYMBOL(free_pages);
1713 1702
1714static unsigned int nr_free_zone_pages(int offset) 1703static unsigned int nr_free_zone_pages(int offset)
1715{ 1704{
1705 struct zone **z;
1706 struct zone *zone;
1707
1716 /* Just pick one node, since fallback list is circular */ 1708 /* Just pick one node, since fallback list is circular */
1717 unsigned int sum = 0; 1709 unsigned int sum = 0;
1718 1710
1719 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); 1711 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
1720 struct zone **zonep = zonelist->zones;
1721 struct zone *zone;
1722 1712
1723 for (zone = *zonep++; zone; zone = *zonep++) { 1713 for_each_zone_zonelist(zone, z, zonelist, offset) {
1724 unsigned long size = zone->present_pages; 1714 unsigned long size = zone->present_pages;
1725 unsigned long high = zone->pages_high; 1715 unsigned long high = zone->pages_high;
1726 if (size > high) 1716 if (size > high)
@@ -2078,17 +2068,15 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
2078 */ 2068 */
2079static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 2069static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
2080{ 2070{
2081 enum zone_type i;
2082 int j; 2071 int j;
2083 struct zonelist *zonelist; 2072 struct zonelist *zonelist;
2084 2073
2085 for (i = 0; i < MAX_NR_ZONES; i++) { 2074 zonelist = &pgdat->node_zonelists[0];
2086 zonelist = pgdat->node_zonelists + i; 2075 for (j = 0; zonelist->zones[j] != NULL; j++)
2087 for (j = 0; zonelist->zones[j] != NULL; j++) 2076 ;
2088 ; 2077 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2089 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 2078 MAX_NR_ZONES - 1);
2090 zonelist->zones[j] = NULL; 2079 zonelist->zones[j] = NULL;
2091 }
2092} 2080}
2093 2081
2094/* 2082/*
@@ -2096,15 +2084,12 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
2096 */ 2084 */
2097static void build_thisnode_zonelists(pg_data_t *pgdat) 2085static void build_thisnode_zonelists(pg_data_t *pgdat)
2098{ 2086{
2099 enum zone_type i;
2100 int j; 2087 int j;
2101 struct zonelist *zonelist; 2088 struct zonelist *zonelist;
2102 2089
2103 for (i = 0; i < MAX_NR_ZONES; i++) { 2090 zonelist = &pgdat->node_zonelists[1];
2104 zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i; 2091 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
2105 j = build_zonelists_node(pgdat, zonelist, 0, i); 2092 zonelist->zones[j] = NULL;
2106 zonelist->zones[j] = NULL;
2107 }
2108} 2093}
2109 2094
2110/* 2095/*
@@ -2117,27 +2102,24 @@ static int node_order[MAX_NUMNODES];
2117 2102
2118static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 2103static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
2119{ 2104{
2120 enum zone_type i;
2121 int pos, j, node; 2105 int pos, j, node;
2122 int zone_type; /* needs to be signed */ 2106 int zone_type; /* needs to be signed */
2123 struct zone *z; 2107 struct zone *z;
2124 struct zonelist *zonelist; 2108 struct zonelist *zonelist;
2125 2109
2126 for (i = 0; i < MAX_NR_ZONES; i++) { 2110 zonelist = &pgdat->node_zonelists[0];
2127 zonelist = pgdat->node_zonelists + i; 2111 pos = 0;
2128 pos = 0; 2112 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
2129 for (zone_type = i; zone_type >= 0; zone_type--) { 2113 for (j = 0; j < nr_nodes; j++) {
2130 for (j = 0; j < nr_nodes; j++) { 2114 node = node_order[j];
2131 node = node_order[j]; 2115 z = &NODE_DATA(node)->node_zones[zone_type];
2132 z = &NODE_DATA(node)->node_zones[zone_type]; 2116 if (populated_zone(z)) {
2133 if (populated_zone(z)) { 2117 zonelist->zones[pos++] = z;
2134 zonelist->zones[pos++] = z; 2118 check_highest_zone(zone_type);
2135 check_highest_zone(zone_type);
2136 }
2137 } 2119 }
2138 } 2120 }
2139 zonelist->zones[pos] = NULL;
2140 } 2121 }
2122 zonelist->zones[pos] = NULL;
2141} 2123}
2142 2124
2143static int default_zonelist_order(void) 2125static int default_zonelist_order(void)
@@ -2264,19 +2246,15 @@ static void build_zonelists(pg_data_t *pgdat)
2264/* Construct the zonelist performance cache - see further mmzone.h */ 2246/* Construct the zonelist performance cache - see further mmzone.h */
2265static void build_zonelist_cache(pg_data_t *pgdat) 2247static void build_zonelist_cache(pg_data_t *pgdat)
2266{ 2248{
2267 int i; 2249 struct zonelist *zonelist;
2268 2250 struct zonelist_cache *zlc;
2269 for (i = 0; i < MAX_NR_ZONES; i++) { 2251 struct zone **z;
2270 struct zonelist *zonelist;
2271 struct zonelist_cache *zlc;
2272 struct zone **z;
2273 2252
2274 zonelist = pgdat->node_zonelists + i; 2253 zonelist = &pgdat->node_zonelists[0];
2275 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 2254 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
2276 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 2255 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
2277 for (z = zonelist->zones; *z; z++) 2256 for (z = zonelist->zones; *z; z++)
2278 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); 2257 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
2279 }
2280} 2258}
2281 2259
2282 2260
@@ -2290,45 +2268,43 @@ static void set_zonelist_order(void)
2290static void build_zonelists(pg_data_t *pgdat) 2268static void build_zonelists(pg_data_t *pgdat)
2291{ 2269{
2292 int node, local_node; 2270 int node, local_node;
2293 enum zone_type i,j; 2271 enum zone_type j;
2272 struct zonelist *zonelist;
2294 2273
2295 local_node = pgdat->node_id; 2274 local_node = pgdat->node_id;
2296 for (i = 0; i < MAX_NR_ZONES; i++) {
2297 struct zonelist *zonelist;
2298 2275
2299 zonelist = pgdat->node_zonelists + i; 2276 zonelist = &pgdat->node_zonelists[0];
2277 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
2300 2278
2301 j = build_zonelists_node(pgdat, zonelist, 0, i); 2279 /*
2302 /* 2280 * Now we build the zonelist so that it contains the zones
2303 * Now we build the zonelist so that it contains the zones 2281 * of all the other nodes.
2304 * of all the other nodes. 2282 * We don't want to pressure a particular node, so when
2305 * We don't want to pressure a particular node, so when 2283 * building the zones for node N, we make sure that the
2306 * building the zones for node N, we make sure that the 2284 * zones coming right after the local ones are those from
2307 * zones coming right after the local ones are those from 2285 * node N+1 (modulo N)
2308 * node N+1 (modulo N) 2286 */
2309 */ 2287 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
2310 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 2288 if (!node_online(node))
2311 if (!node_online(node)) 2289 continue;
2312 continue; 2290 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2313 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 2291 MAX_NR_ZONES - 1);
2314 }
2315 for (node = 0; node < local_node; node++) {
2316 if (!node_online(node))
2317 continue;
2318 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
2319 }
2320
2321 zonelist->zones[j] = NULL;
2322 } 2292 }
2293 for (node = 0; node < local_node; node++) {
2294 if (!node_online(node))
2295 continue;
2296 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2297 MAX_NR_ZONES - 1);
2298 }
2299
2300 zonelist->zones[j] = NULL;
2323} 2301}
2324 2302
2325/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 2303/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
2326static void build_zonelist_cache(pg_data_t *pgdat) 2304static void build_zonelist_cache(pg_data_t *pgdat)
2327{ 2305{
2328 int i; 2306 pgdat->node_zonelists[0].zlcache_ptr = NULL;
2329 2307 pgdat->node_zonelists[1].zlcache_ptr = NULL;
2330 for (i = 0; i < MAX_NR_ZONES; i++)
2331 pgdat->node_zonelists[i].zlcache_ptr = NULL;
2332} 2308}
2333 2309
2334#endif /* CONFIG_NUMA */ 2310#endif /* CONFIG_NUMA */