diff options
author | Mel Gorman <mel@csn.ul.ie> | 2008-04-28 05:12:16 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-04-28 11:58:18 -0400 |
commit | 54a6eb5c4765aa573a030ceeba2c14e3d2ea5706 (patch) | |
tree | 547176a090beb787722a153cf2b8b942dc0e68db /mm/page_alloc.c | |
parent | 18ea7e710d2452fa726814a406779188028cf1bf (diff) |
mm: use two zonelists that are filtered by GFP mask
Currently a node has two sets of zonelists, one for each zone type in the
system and a second set for GFP_THISNODE allocations. Based on the zones
allowed by a gfp mask, one of these zonelists is selected. All of these
zonelists consume memory and occupy cache lines.
This patch replaces the multiple zonelists per-node with two zonelists. The
first contains all populated zones in the system, ordered by distance, for
fallback allocations when the target/preferred node has no free pages. The
second contains all populated zones in the node suitable for GFP_THISNODE
allocations.
An iterator macro called for_each_zone_zonelist() is introduced that iterates
through each zone allowed by the GFP flags in the selected zonelist.
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 170 |
1 files changed, 73 insertions, 97 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 187efd47a446..4ccb8651cf22 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -1378,42 +1378,29 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) | |||
1378 | */ | 1378 | */ |
1379 | static struct page * | 1379 | static struct page * |
1380 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, | 1380 | get_page_from_freelist(gfp_t gfp_mask, unsigned int order, |
1381 | struct zonelist *zonelist, int alloc_flags) | 1381 | struct zonelist *zonelist, int high_zoneidx, int alloc_flags) |
1382 | { | 1382 | { |
1383 | struct zone **z; | 1383 | struct zone **z; |
1384 | struct page *page = NULL; | 1384 | struct page *page = NULL; |
1385 | int classzone_idx = zone_idx(zonelist->zones[0]); | 1385 | int classzone_idx; |
1386 | struct zone *zone, *preferred_zone; | 1386 | struct zone *zone, *preferred_zone; |
1387 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ | 1387 | nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ |
1388 | int zlc_active = 0; /* set if using zonelist_cache */ | 1388 | int zlc_active = 0; /* set if using zonelist_cache */ |
1389 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ | 1389 | int did_zlc_setup = 0; /* just call zlc_setup() one time */ |
1390 | enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */ | 1390 | |
1391 | z = first_zones_zonelist(zonelist, high_zoneidx); | ||
1392 | classzone_idx = zone_idx(*z); | ||
1393 | preferred_zone = *z; | ||
1391 | 1394 | ||
1392 | zonelist_scan: | 1395 | zonelist_scan: |
1393 | /* | 1396 | /* |
1394 | * Scan zonelist, looking for a zone with enough free. | 1397 | * Scan zonelist, looking for a zone with enough free. |
1395 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1398 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
1396 | */ | 1399 | */ |
1397 | z = zonelist->zones; | 1400 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { |
1398 | preferred_zone = *z; | ||
1399 | |||
1400 | do { | ||
1401 | /* | ||
1402 | * In NUMA, this could be a policy zonelist which contains | ||
1403 | * zones that may not be allowed by the current gfp_mask. | ||
1404 | * Check the zone is allowed by the current flags | ||
1405 | */ | ||
1406 | if (unlikely(alloc_should_filter_zonelist(zonelist))) { | ||
1407 | if (highest_zoneidx == -1) | ||
1408 | highest_zoneidx = gfp_zone(gfp_mask); | ||
1409 | if (zone_idx(*z) > highest_zoneidx) | ||
1410 | continue; | ||
1411 | } | ||
1412 | |||
1413 | if (NUMA_BUILD && zlc_active && | 1401 | if (NUMA_BUILD && zlc_active && |
1414 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) | 1402 | !zlc_zone_worth_trying(zonelist, z, allowednodes)) |
1415 | continue; | 1403 | continue; |
1416 | zone = *z; | ||
1417 | if ((alloc_flags & ALLOC_CPUSET) && | 1404 | if ((alloc_flags & ALLOC_CPUSET) && |
1418 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) | 1405 | !cpuset_zone_allowed_softwall(zone, gfp_mask)) |
1419 | goto try_next_zone; | 1406 | goto try_next_zone; |
@@ -1447,7 +1434,7 @@ try_next_zone: | |||
1447 | zlc_active = 1; | 1434 | zlc_active = 1; |
1448 | did_zlc_setup = 1; | 1435 | did_zlc_setup = 1; |
1449 | } | 1436 | } |
1450 | } while (*(++z) != NULL); | 1437 | } |
1451 | 1438 | ||
1452 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { | 1439 | if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { |
1453 | /* Disable zlc cache for second zonelist scan */ | 1440 | /* Disable zlc cache for second zonelist scan */ |
@@ -1465,6 +1452,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order, | |||
1465 | struct zonelist *zonelist) | 1452 | struct zonelist *zonelist) |
1466 | { | 1453 | { |
1467 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 1454 | const gfp_t wait = gfp_mask & __GFP_WAIT; |
1455 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
1468 | struct zone **z; | 1456 | struct zone **z; |
1469 | struct page *page; | 1457 | struct page *page; |
1470 | struct reclaim_state reclaim_state; | 1458 | struct reclaim_state reclaim_state; |
@@ -1490,7 +1478,7 @@ restart: | |||
1490 | } | 1478 | } |
1491 | 1479 | ||
1492 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, | 1480 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, |
1493 | zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); | 1481 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); |
1494 | if (page) | 1482 | if (page) |
1495 | goto got_pg; | 1483 | goto got_pg; |
1496 | 1484 | ||
@@ -1534,7 +1522,8 @@ restart: | |||
1534 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. | 1522 | * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. |
1535 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. | 1523 | * See also cpuset_zone_allowed() comment in kernel/cpuset.c. |
1536 | */ | 1524 | */ |
1537 | page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); | 1525 | page = get_page_from_freelist(gfp_mask, order, zonelist, |
1526 | high_zoneidx, alloc_flags); | ||
1538 | if (page) | 1527 | if (page) |
1539 | goto got_pg; | 1528 | goto got_pg; |
1540 | 1529 | ||
@@ -1547,7 +1536,7 @@ rebalance: | |||
1547 | nofail_alloc: | 1536 | nofail_alloc: |
1548 | /* go through the zonelist yet again, ignoring mins */ | 1537 | /* go through the zonelist yet again, ignoring mins */ |
1549 | page = get_page_from_freelist(gfp_mask, order, | 1538 | page = get_page_from_freelist(gfp_mask, order, |
1550 | zonelist, ALLOC_NO_WATERMARKS); | 1539 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); |
1551 | if (page) | 1540 | if (page) |
1552 | goto got_pg; | 1541 | goto got_pg; |
1553 | if (gfp_mask & __GFP_NOFAIL) { | 1542 | if (gfp_mask & __GFP_NOFAIL) { |
@@ -1582,7 +1571,7 @@ nofail_alloc: | |||
1582 | 1571 | ||
1583 | if (likely(did_some_progress)) { | 1572 | if (likely(did_some_progress)) { |
1584 | page = get_page_from_freelist(gfp_mask, order, | 1573 | page = get_page_from_freelist(gfp_mask, order, |
1585 | zonelist, alloc_flags); | 1574 | zonelist, high_zoneidx, alloc_flags); |
1586 | if (page) | 1575 | if (page) |
1587 | goto got_pg; | 1576 | goto got_pg; |
1588 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | 1577 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { |
@@ -1598,7 +1587,7 @@ nofail_alloc: | |||
1598 | * under heavy pressure. | 1587 | * under heavy pressure. |
1599 | */ | 1588 | */ |
1600 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, | 1589 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, |
1601 | zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); | 1590 | zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET); |
1602 | if (page) { | 1591 | if (page) { |
1603 | clear_zonelist_oom(zonelist); | 1592 | clear_zonelist_oom(zonelist); |
1604 | goto got_pg; | 1593 | goto got_pg; |
@@ -1713,14 +1702,15 @@ EXPORT_SYMBOL(free_pages); | |||
1713 | 1702 | ||
1714 | static unsigned int nr_free_zone_pages(int offset) | 1703 | static unsigned int nr_free_zone_pages(int offset) |
1715 | { | 1704 | { |
1705 | struct zone **z; | ||
1706 | struct zone *zone; | ||
1707 | |||
1716 | /* Just pick one node, since fallback list is circular */ | 1708 | /* Just pick one node, since fallback list is circular */ |
1717 | unsigned int sum = 0; | 1709 | unsigned int sum = 0; |
1718 | 1710 | ||
1719 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); | 1711 | struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); |
1720 | struct zone **zonep = zonelist->zones; | ||
1721 | struct zone *zone; | ||
1722 | 1712 | ||
1723 | for (zone = *zonep++; zone; zone = *zonep++) { | 1713 | for_each_zone_zonelist(zone, z, zonelist, offset) { |
1724 | unsigned long size = zone->present_pages; | 1714 | unsigned long size = zone->present_pages; |
1725 | unsigned long high = zone->pages_high; | 1715 | unsigned long high = zone->pages_high; |
1726 | if (size > high) | 1716 | if (size > high) |
@@ -2078,17 +2068,15 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
2078 | */ | 2068 | */ |
2079 | static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) | 2069 | static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) |
2080 | { | 2070 | { |
2081 | enum zone_type i; | ||
2082 | int j; | 2071 | int j; |
2083 | struct zonelist *zonelist; | 2072 | struct zonelist *zonelist; |
2084 | 2073 | ||
2085 | for (i = 0; i < MAX_NR_ZONES; i++) { | 2074 | zonelist = &pgdat->node_zonelists[0]; |
2086 | zonelist = pgdat->node_zonelists + i; | 2075 | for (j = 0; zonelist->zones[j] != NULL; j++) |
2087 | for (j = 0; zonelist->zones[j] != NULL; j++) | 2076 | ; |
2088 | ; | 2077 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, |
2089 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); | 2078 | MAX_NR_ZONES - 1); |
2090 | zonelist->zones[j] = NULL; | 2079 | zonelist->zones[j] = NULL; |
2091 | } | ||
2092 | } | 2080 | } |
2093 | 2081 | ||
2094 | /* | 2082 | /* |
@@ -2096,15 +2084,12 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) | |||
2096 | */ | 2084 | */ |
2097 | static void build_thisnode_zonelists(pg_data_t *pgdat) | 2085 | static void build_thisnode_zonelists(pg_data_t *pgdat) |
2098 | { | 2086 | { |
2099 | enum zone_type i; | ||
2100 | int j; | 2087 | int j; |
2101 | struct zonelist *zonelist; | 2088 | struct zonelist *zonelist; |
2102 | 2089 | ||
2103 | for (i = 0; i < MAX_NR_ZONES; i++) { | 2090 | zonelist = &pgdat->node_zonelists[1]; |
2104 | zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i; | 2091 | j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); |
2105 | j = build_zonelists_node(pgdat, zonelist, 0, i); | 2092 | zonelist->zones[j] = NULL; |
2106 | zonelist->zones[j] = NULL; | ||
2107 | } | ||
2108 | } | 2093 | } |
2109 | 2094 | ||
2110 | /* | 2095 | /* |
@@ -2117,27 +2102,24 @@ static int node_order[MAX_NUMNODES]; | |||
2117 | 2102 | ||
2118 | static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) | 2103 | static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) |
2119 | { | 2104 | { |
2120 | enum zone_type i; | ||
2121 | int pos, j, node; | 2105 | int pos, j, node; |
2122 | int zone_type; /* needs to be signed */ | 2106 | int zone_type; /* needs to be signed */ |
2123 | struct zone *z; | 2107 | struct zone *z; |
2124 | struct zonelist *zonelist; | 2108 | struct zonelist *zonelist; |
2125 | 2109 | ||
2126 | for (i = 0; i < MAX_NR_ZONES; i++) { | 2110 | zonelist = &pgdat->node_zonelists[0]; |
2127 | zonelist = pgdat->node_zonelists + i; | 2111 | pos = 0; |
2128 | pos = 0; | 2112 | for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) { |
2129 | for (zone_type = i; zone_type >= 0; zone_type--) { | 2113 | for (j = 0; j < nr_nodes; j++) { |
2130 | for (j = 0; j < nr_nodes; j++) { | 2114 | node = node_order[j]; |
2131 | node = node_order[j]; | 2115 | z = &NODE_DATA(node)->node_zones[zone_type]; |
2132 | z = &NODE_DATA(node)->node_zones[zone_type]; | 2116 | if (populated_zone(z)) { |
2133 | if (populated_zone(z)) { | 2117 | zonelist->zones[pos++] = z; |
2134 | zonelist->zones[pos++] = z; | 2118 | check_highest_zone(zone_type); |
2135 | check_highest_zone(zone_type); | ||
2136 | } | ||
2137 | } | 2119 | } |
2138 | } | 2120 | } |
2139 | zonelist->zones[pos] = NULL; | ||
2140 | } | 2121 | } |
2122 | zonelist->zones[pos] = NULL; | ||
2141 | } | 2123 | } |
2142 | 2124 | ||
2143 | static int default_zonelist_order(void) | 2125 | static int default_zonelist_order(void) |
@@ -2264,19 +2246,15 @@ static void build_zonelists(pg_data_t *pgdat) | |||
2264 | /* Construct the zonelist performance cache - see further mmzone.h */ | 2246 | /* Construct the zonelist performance cache - see further mmzone.h */ |
2265 | static void build_zonelist_cache(pg_data_t *pgdat) | 2247 | static void build_zonelist_cache(pg_data_t *pgdat) |
2266 | { | 2248 | { |
2267 | int i; | 2249 | struct zonelist *zonelist; |
2268 | 2250 | struct zonelist_cache *zlc; | |
2269 | for (i = 0; i < MAX_NR_ZONES; i++) { | 2251 | struct zone **z; |
2270 | struct zonelist *zonelist; | ||
2271 | struct zonelist_cache *zlc; | ||
2272 | struct zone **z; | ||
2273 | 2252 | ||
2274 | zonelist = pgdat->node_zonelists + i; | 2253 | zonelist = &pgdat->node_zonelists[0]; |
2275 | zonelist->zlcache_ptr = zlc = &zonelist->zlcache; | 2254 | zonelist->zlcache_ptr = zlc = &zonelist->zlcache; |
2276 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); | 2255 | bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); |
2277 | for (z = zonelist->zones; *z; z++) | 2256 | for (z = zonelist->zones; *z; z++) |
2278 | zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); | 2257 | zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); |
2279 | } | ||
2280 | } | 2258 | } |
2281 | 2259 | ||
2282 | 2260 | ||
@@ -2290,45 +2268,43 @@ static void set_zonelist_order(void) | |||
2290 | static void build_zonelists(pg_data_t *pgdat) | 2268 | static void build_zonelists(pg_data_t *pgdat) |
2291 | { | 2269 | { |
2292 | int node, local_node; | 2270 | int node, local_node; |
2293 | enum zone_type i,j; | 2271 | enum zone_type j; |
2272 | struct zonelist *zonelist; | ||
2294 | 2273 | ||
2295 | local_node = pgdat->node_id; | 2274 | local_node = pgdat->node_id; |
2296 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
2297 | struct zonelist *zonelist; | ||
2298 | 2275 | ||
2299 | zonelist = pgdat->node_zonelists + i; | 2276 | zonelist = &pgdat->node_zonelists[0]; |
2277 | j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); | ||
2300 | 2278 | ||
2301 | j = build_zonelists_node(pgdat, zonelist, 0, i); | 2279 | /* |
2302 | /* | 2280 | * Now we build the zonelist so that it contains the zones |
2303 | * Now we build the zonelist so that it contains the zones | 2281 | * of all the other nodes. |
2304 | * of all the other nodes. | 2282 | * We don't want to pressure a particular node, so when |
2305 | * We don't want to pressure a particular node, so when | 2283 | * building the zones for node N, we make sure that the |
2306 | * building the zones for node N, we make sure that the | 2284 | * zones coming right after the local ones are those from |
2307 | * zones coming right after the local ones are those from | 2285 | * node N+1 (modulo N) |
2308 | * node N+1 (modulo N) | 2286 | */ |
2309 | */ | 2287 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { |
2310 | for (node = local_node + 1; node < MAX_NUMNODES; node++) { | 2288 | if (!node_online(node)) |
2311 | if (!node_online(node)) | 2289 | continue; |
2312 | continue; | 2290 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, |
2313 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); | 2291 | MAX_NR_ZONES - 1); |
2314 | } | ||
2315 | for (node = 0; node < local_node; node++) { | ||
2316 | if (!node_online(node)) | ||
2317 | continue; | ||
2318 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); | ||
2319 | } | ||
2320 | |||
2321 | zonelist->zones[j] = NULL; | ||
2322 | } | 2292 | } |
2293 | for (node = 0; node < local_node; node++) { | ||
2294 | if (!node_online(node)) | ||
2295 | continue; | ||
2296 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, | ||
2297 | MAX_NR_ZONES - 1); | ||
2298 | } | ||
2299 | |||
2300 | zonelist->zones[j] = NULL; | ||
2323 | } | 2301 | } |
2324 | 2302 | ||
2325 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ | 2303 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ |
2326 | static void build_zonelist_cache(pg_data_t *pgdat) | 2304 | static void build_zonelist_cache(pg_data_t *pgdat) |
2327 | { | 2305 | { |
2328 | int i; | 2306 | pgdat->node_zonelists[0].zlcache_ptr = NULL; |
2329 | 2307 | pgdat->node_zonelists[1].zlcache_ptr = NULL; | |
2330 | for (i = 0; i < MAX_NR_ZONES; i++) | ||
2331 | pgdat->node_zonelists[i].zlcache_ptr = NULL; | ||
2332 | } | 2308 | } |
2333 | 2309 | ||
2334 | #endif /* CONFIG_NUMA */ | 2310 | #endif /* CONFIG_NUMA */ |