Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c | 280
1 file changed, 142 insertions(+), 138 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 402a504f1228..d1cf4f05dcda 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -546,7 +546,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 /*
  * permit the bootmem allocator to evade page validation on high-order frees
  */
-void __init __free_pages_bootmem(struct page *page, unsigned int order)
+void __free_pages_bootmem(struct page *page, unsigned int order)
 {
 	if (order == 0) {
 		__ClearPageReserved(page);
@@ -632,7 +632,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 	if (PageReserved(page))
 		return 1;
 
-	page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead |
+	page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
 			1 << PG_referenced | 1 << PG_arch_1 |
 			1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
 	set_page_private(page, 0);
@@ -1050,7 +1050,7 @@ void split_page(struct page *page, unsigned int order)
  * we cheat by calling it from here, in the order > 0 path. Saves a branch
  * or two.
  */
-static struct page *buffered_rmqueue(struct zonelist *zonelist,
+static struct page *buffered_rmqueue(struct zone *preferred_zone,
 			struct zone *zone, int order, gfp_t gfp_flags)
 {
 	unsigned long flags;
@@ -1102,7 +1102,7 @@ again:
 	}
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
-	zone_statistics(zonelist, zone);
+	zone_statistics(preferred_zone, zone);
 	local_irq_restore(flags);
 	put_cpu();
 
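The new first argument matters for the NUMA counters: zone_statistics() compares the zone a page actually came from against the preferred (first allowed) zone, which the old code had to re-derive from the zonelist on every call. A minimal sketch of that accounting, assuming the simplified shape of the mm/vmstat.c helper of this era (NUMA_HIT, NUMA_MISS and NUMA_FOREIGN are the real vmstat counters; the body is illustrative, not the exact implementation):

	/* Illustrative sketch only -- see mm/vmstat.c for the real version. */
	static void zone_statistics(struct zone *preferred_zone, struct zone *z)
	{
		if (z->zone_pgdat == preferred_zone->zone_pgdat) {
			__inc_zone_state(z, NUMA_HIT);	/* satisfied from the preferred node */
		} else {
			__inc_zone_state(z, NUMA_MISS);	/* had to fall back to another node */
			__inc_zone_state(preferred_zone, NUMA_FOREIGN);
		}
	}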
@@ -1284,7 +1284,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
 	if (!zlc)
 		return NULL;
 
 	if (time_after(jiffies, zlc->last_full_zap + HZ)) {
 		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
 		zlc->last_full_zap = jiffies;
 	}
@@ -1317,7 +1317,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
  * We are low on memory in the second scan, and should leave no stone
  * unturned looking for a free page.
  */
-static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
 						nodemask_t *allowednodes)
 {
 	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
@@ -1328,7 +1328,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
 	if (!zlc)
 		return 1;
 
-	i = z - zonelist->zones;
+	i = z - zonelist->_zonerefs;
 	n = zlc->z_to_n[i];
 
 	/* This zone is worth trying if it is allowed but not full */
@@ -1340,7 +1340,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
  * zlc->fullzones, so that subsequent attempts to allocate a page
  * from that zone don't waste time re-examining it.
  */
-static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
 {
 	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
 	int i;			/* index of *z in zonelist zones */
@@ -1349,7 +1349,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
 	if (!zlc)
 		return;
 
-	i = z - zonelist->zones;
+	i = z - zonelist->_zonerefs;
 
 	set_bit(i, zlc->fullzones);
 }
@@ -1361,13 +1361,13 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
 	return NULL;
 }
 
-static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
 				nodemask_t *allowednodes)
 {
 	return 1;
 }
 
-static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
 {
 }
 #endif	/* CONFIG_NUMA */
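Each `struct zone **z` that becomes `struct zoneref *z` above refers to the new zonelist entry type: rather than a bare zone pointer, an entry carries the zone's index so zone_idx() need not be recomputed while walking the list. A sketch of the layout this patch assumes (the two fields follow from zoneref_set_zone() and the `z - zonelist->_zonerefs` arithmetic in the hunks above; the surrounding zonelist declaration is abridged):

	struct zoneref {
		struct zone *zone;	/* the zone itself */
		int zone_idx;		/* cached zone_idx(zone) */
	};

	struct zonelist {
		struct zonelist_cache *zlcache_ptr;	/* NULL, or &zlcache on NUMA */
		struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
		/* struct zonelist_cache zlcache; follows on NUMA builds */
	};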
@@ -1377,42 +1377,31 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
  * a page.
  */
 static struct page *
-get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
-		struct zonelist *zonelist, int alloc_flags)
+get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
+		struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
 {
-	struct zone **z;
+	struct zoneref *z;
 	struct page *page = NULL;
-	int classzone_idx = zone_idx(zonelist->zones[0]);
-	struct zone *zone;
+	int classzone_idx;
+	struct zone *zone, *preferred_zone;
 	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
 	int zlc_active = 0;		/* set if using zonelist_cache */
 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
-	enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */
+
+	(void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
+							&preferred_zone);
+	classzone_idx = zone_idx(preferred_zone);
 
 zonelist_scan:
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	z = zonelist->zones;
-
-	do {
-		/*
-		 * In NUMA, this could be a policy zonelist which contains
-		 * zones that may not be allowed by the current gfp_mask.
-		 * Check the zone is allowed by the current flags
-		 */
-		if (unlikely(alloc_should_filter_zonelist(zonelist))) {
-			if (highest_zoneidx == -1)
-				highest_zoneidx = gfp_zone(gfp_mask);
-			if (zone_idx(*z) > highest_zoneidx)
-				continue;
-		}
-
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+						high_zoneidx, nodemask) {
 		if (NUMA_BUILD && zlc_active &&
 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
 				continue;
-		zone = *z;
 		if ((alloc_flags & ALLOC_CPUSET) &&
 			!cpuset_zone_allowed_softwall(zone, gfp_mask))
 				goto try_next_zone;
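for_each_zone_zonelist_nodemask() replaces both the open-coded do/while walk and the alloc_should_filter_zonelist() special case: filtering by highest usable zone and by nodemask now happens inside the iterator, and `zone` is produced directly instead of via `zone = *z`. Roughly, in terms of the first_zones_zonelist() helper already used above (a sketch of the mmzone.h macro; the exact definition may differ):

	#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
		for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone);    \
		     zone;                                                         \
		     z = next_zones_zonelist(++z, highidx, nodemask, &zone))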
@@ -1433,7 +1422,7 @@ zonelist_scan:
 			}
 		}
 
-		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
+		page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask);
 		if (page)
 			break;
 this_zone_full:
@@ -1446,7 +1435,7 @@ try_next_zone:
 			zlc_active = 1;
 			did_zlc_setup = 1;
 		}
-	} while (*(++z) != NULL);
+	}
 
 	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
 		/* Disable zlc cache for second zonelist scan */
@@ -1459,12 +1448,14 @@ try_next_zone:
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
-struct page *
-__alloc_pages(gfp_t gfp_mask, unsigned int order,
-		struct zonelist *zonelist)
+static struct page *
+__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, nodemask_t *nodemask)
 {
 	const gfp_t wait = gfp_mask & __GFP_WAIT;
-	struct zone **z;
+	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+	struct zoneref *z;
+	struct zone *zone;
 	struct page *page;
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
@@ -1478,9 +1469,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
 		return NULL;
 
 restart:
-	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
+	z = zonelist->_zonerefs;  /* the list of zones suitable for gfp_mask */
 
-	if (unlikely(*z == NULL)) {
+	if (unlikely(!z->zone)) {
 		/*
 		 * Happens if we have an empty zonelist as a result of
 		 * GFP_THISNODE being used on a memoryless node
@@ -1488,8 +1479,8 @@ restart:
 		return NULL;
 	}
 
-	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
-			zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
 	if (page)
 		goto got_pg;
 
@@ -1504,8 +1495,8 @@ restart:
 	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
 		goto nopage;
 
-	for (z = zonelist->zones; *z; z++)
-		wakeup_kswapd(*z, order);
+	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+		wakeup_kswapd(zone, order);
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -1533,7 +1524,8 @@ restart:
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
+						high_zoneidx, alloc_flags);
 	if (page)
 		goto got_pg;
 
@@ -1545,8 +1537,8 @@ rebalance:
 	if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 nofail_alloc:
 		/* go through the zonelist yet again, ignoring mins */
-		page = get_page_from_freelist(gfp_mask, order,
-				zonelist, ALLOC_NO_WATERMARKS);
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
+			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
 		if (page)
 			goto got_pg;
 		if (gfp_mask & __GFP_NOFAIL) {
@@ -1569,7 +1561,7 @@ nofail_alloc:
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);
+	did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
@@ -1580,12 +1572,12 @@ nofail_alloc:
 	drain_all_pages();
 
 	if (likely(did_some_progress)) {
-		page = get_page_from_freelist(gfp_mask, order,
-						zonelist, alloc_flags);
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
+					zonelist, high_zoneidx, alloc_flags);
 		if (page)
 			goto got_pg;
 	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
-		if (!try_set_zone_oom(zonelist)) {
+		if (!try_set_zone_oom(zonelist, gfp_mask)) {
 			schedule_timeout_uninterruptible(1);
 			goto restart;
 		}
@@ -1596,21 +1588,22 @@ nofail_alloc:
 		 * a parallel oom killing, we must fail if we're still
 		 * under heavy pressure.
 		 */
-		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
-				zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
+		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+			order, zonelist, high_zoneidx,
+			ALLOC_WMARK_HIGH|ALLOC_CPUSET);
 		if (page) {
-			clear_zonelist_oom(zonelist);
+			clear_zonelist_oom(zonelist, gfp_mask);
 			goto got_pg;
 		}
 
 		/* The OOM killer will not help higher order allocs so fail */
 		if (order > PAGE_ALLOC_COSTLY_ORDER) {
-			clear_zonelist_oom(zonelist);
+			clear_zonelist_oom(zonelist, gfp_mask);
 			goto nopage;
 		}
 
 		out_of_memory(zonelist, gfp_mask, order);
-		clear_zonelist_oom(zonelist);
+		clear_zonelist_oom(zonelist, gfp_mask);
 		goto restart;
 	}
 
@@ -1646,6 +1639,20 @@ got_pg:
 	return page;
 }
 
+struct page *
+__alloc_pages(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist)
+{
+	return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
+}
+
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, nodemask_t *nodemask)
+{
+	return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
+}
+
 EXPORT_SYMBOL(__alloc_pages);
 
 /*
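Both exported entry points are now thin wrappers: __alloc_pages() preserves the old behaviour by passing a NULL nodemask (no extra node filtering), while __alloc_pages_nodemask() lets callers such as the mempolicy code restrict fallback to an explicit node set. A hypothetical call site, using the node_zonelist() helper that appears later in this diff:

	/* Hypothetical example: allocate one page, allowing node 0 only. */
	nodemask_t allowed = NODE_MASK_NONE;
	struct page *page;

	node_set(0, allowed);
	page = __alloc_pages_nodemask(GFP_KERNEL, 0,
			node_zonelist(numa_node_id(), GFP_KERNEL), &allowed);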
@@ -1712,15 +1719,15 @@ EXPORT_SYMBOL(free_pages);
 
 static unsigned int nr_free_zone_pages(int offset)
 {
+	struct zoneref *z;
+	struct zone *zone;
+
 	/* Just pick one node, since fallback list is circular */
-	pg_data_t *pgdat = NODE_DATA(numa_node_id());
 	unsigned int sum = 0;
 
-	struct zonelist *zonelist = pgdat->node_zonelists + offset;
-	struct zone **zonep = zonelist->zones;
-	struct zone *zone;
+	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
 
-	for (zone = *zonep++; zone; zone = *zonep++) {
+	for_each_zone_zonelist(zone, z, zonelist, offset) {
 		unsigned long size = zone->present_pages;
 		unsigned long high = zone->pages_high;
 		if (size > high)
@@ -1889,6 +1896,12 @@ void show_free_areas(void)
 	show_swap_cache_info();
 }
 
+static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
+{
+	zoneref->zone = zone;
+	zoneref->zone_idx = zone_idx(zone);
+}
+
 /*
  * Builds allocation fallback zone lists.
  *
@@ -1906,7 +1919,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
 		zone_type--;
 		zone = pgdat->node_zones + zone_type;
 		if (populated_zone(zone)) {
-			zonelist->zones[nr_zones++] = zone;
+			zoneref_set_zone(zone,
+				&zonelist->_zonerefs[nr_zones++]);
 			check_highest_zone(zone_type);
 		}
 
@@ -2029,6 +2043,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
 	int n, val;
 	int min_val = INT_MAX;
 	int best_node = -1;
+	node_to_cpumask_ptr(tmp, 0);
 
 	/* Use the local node if we haven't already */
 	if (!node_isset(node, *used_node_mask)) {
@@ -2037,7 +2052,6 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
 	}
 
 	for_each_node_state(n, N_HIGH_MEMORY) {
-		cpumask_t tmp;
 
 		/* Don't want a node to appear more than once */
 		if (node_isset(n, *used_node_mask))
@@ -2050,8 +2064,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
 		val += (n < node);
 
 		/* Give preference to headless and unused nodes */
-		tmp = node_to_cpumask(n);
-		if (!cpus_empty(tmp))
+		node_to_cpumask_ptr_next(tmp, n);
+		if (!cpus_empty(*tmp))
 			val += PENALTY_FOR_NODE_WITH_CPUS;
 
 		/* Slight preference for less loaded node */
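node_to_cpumask_ptr()/node_to_cpumask_ptr_next() hoist what was a per-iteration on-stack cpumask_t copy out of the loop, which is the point of the conversion once NR_CPUS gets large. A sketch of the generic fallback being assumed here (architectures with a node-to-cpumask map can hand back a pointer without copying at all; illustrative only):

	/* Roughly the generic fallback of this era. */
	#define node_to_cpumask_ptr(v, node)				\
		cpumask_t _##v = node_to_cpumask(node);			\
		const cpumask_t *v = &_##v
	#define node_to_cpumask_ptr_next(v, node)			\
		_##v = node_to_cpumask(node)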
@@ -2078,17 +2092,16 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
  */
 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
 {
-	enum zone_type i;
 	int j;
 	struct zonelist *zonelist;
 
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		zonelist = pgdat->node_zonelists + i;
-		for (j = 0; zonelist->zones[j] != NULL; j++)
-			;
-		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
-		zonelist->zones[j] = NULL;
-	}
+	zonelist = &pgdat->node_zonelists[0];
+	for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
+		;
+	j = build_zonelists_node(NODE_DATA(node), zonelist, j,
+							MAX_NR_ZONES - 1);
+	zonelist->_zonerefs[j].zone = NULL;
+	zonelist->_zonerefs[j].zone_idx = 0;
 }
 
 /*
@@ -2096,15 +2109,13 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
  */
 static void build_thisnode_zonelists(pg_data_t *pgdat)
 {
-	enum zone_type i;
 	int j;
 	struct zonelist *zonelist;
 
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i;
-		j = build_zonelists_node(pgdat, zonelist, 0, i);
-		zonelist->zones[j] = NULL;
-	}
+	zonelist = &pgdat->node_zonelists[1];
+	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
+	zonelist->_zonerefs[j].zone = NULL;
+	zonelist->_zonerefs[j].zone_idx = 0;
 }
 
 /*
@@ -2117,27 +2128,26 @@ static int node_order[MAX_NUMNODES];
 
 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
 {
-	enum zone_type i;
 	int pos, j, node;
 	int zone_type;		/* needs to be signed */
 	struct zone *z;
 	struct zonelist *zonelist;
 
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		zonelist = pgdat->node_zonelists + i;
-		pos = 0;
-		for (zone_type = i; zone_type >= 0; zone_type--) {
-			for (j = 0; j < nr_nodes; j++) {
-				node = node_order[j];
-				z = &NODE_DATA(node)->node_zones[zone_type];
-				if (populated_zone(z)) {
-					zonelist->zones[pos++] = z;
+	zonelist = &pgdat->node_zonelists[0];
+	pos = 0;
+	for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
+		for (j = 0; j < nr_nodes; j++) {
+			node = node_order[j];
+			z = &NODE_DATA(node)->node_zones[zone_type];
+			if (populated_zone(z)) {
+				zoneref_set_zone(z,
+					&zonelist->_zonerefs[pos++]);
 				check_highest_zone(zone_type);
-				}
 			}
 		}
-		zonelist->zones[pos] = NULL;
 	}
+	zonelist->_zonerefs[pos].zone = NULL;
+	zonelist->_zonerefs[pos].zone_idx = 0;
 }
 
 static int default_zonelist_order(void)
@@ -2214,7 +2224,8 @@ static void build_zonelists(pg_data_t *pgdat)
 	/* initialize zonelists */
 	for (i = 0; i < MAX_ZONELISTS; i++) {
 		zonelist = pgdat->node_zonelists + i;
-		zonelist->zones[0] = NULL;
+		zonelist->_zonerefs[0].zone = NULL;
+		zonelist->_zonerefs[0].zone_idx = 0;
 	}
 
 	/* NUMA-aware ordering of nodes */
@@ -2264,19 +2275,15 @@ static void build_zonelists(pg_data_t *pgdat)
 /* Construct the zonelist performance cache - see further mmzone.h */
 static void build_zonelist_cache(pg_data_t *pgdat)
 {
-	int i;
-
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		struct zonelist *zonelist;
-		struct zonelist_cache *zlc;
-		struct zone **z;
+	struct zonelist *zonelist;
+	struct zonelist_cache *zlc;
+	struct zoneref *z;
 
-		zonelist = pgdat->node_zonelists + i;
-		zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
-		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
-		for (z = zonelist->zones; *z; z++)
-			zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
-	}
+	zonelist = &pgdat->node_zonelists[0];
+	zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+	for (z = zonelist->_zonerefs; z->zone; z++)
+		zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
 }
 
 
@@ -2290,45 +2297,44 @@ static void set_zonelist_order(void)
 static void build_zonelists(pg_data_t *pgdat)
 {
 	int node, local_node;
-	enum zone_type i,j;
+	enum zone_type j;
+	struct zonelist *zonelist;
 
 	local_node = pgdat->node_id;
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		struct zonelist *zonelist;
-
-		zonelist = pgdat->node_zonelists + i;
-
-		j = build_zonelists_node(pgdat, zonelist, 0, i);
-		/*
-		 * Now we build the zonelist so that it contains the zones
-		 * of all the other nodes.
-		 * We don't want to pressure a particular node, so when
-		 * building the zones for node N, we make sure that the
-		 * zones coming right after the local ones are those from
-		 * node N+1 (modulo N)
-		 */
-		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
-			if (!node_online(node))
-				continue;
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
-		}
-		for (node = 0; node < local_node; node++) {
-			if (!node_online(node))
-				continue;
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
-		}
-
-		zonelist->zones[j] = NULL;
-	}
+
+	zonelist = &pgdat->node_zonelists[0];
+	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
+
+	/*
+	 * Now we build the zonelist so that it contains the zones
+	 * of all the other nodes.
+	 * We don't want to pressure a particular node, so when
+	 * building the zones for node N, we make sure that the
+	 * zones coming right after the local ones are those from
+	 * node N+1 (modulo N)
+	 */
+	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
+		if (!node_online(node))
+			continue;
+		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
+							MAX_NR_ZONES - 1);
+	}
+	for (node = 0; node < local_node; node++) {
+		if (!node_online(node))
+			continue;
+		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
+							MAX_NR_ZONES - 1);
+	}
+
+	zonelist->_zonerefs[j].zone = NULL;
+	zonelist->_zonerefs[j].zone_idx = 0;
 }
 
 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
 static void build_zonelist_cache(pg_data_t *pgdat)
 {
-	int i;
-
-	for (i = 0; i < MAX_NR_ZONES; i++)
-		pgdat->node_zonelists[i].zlcache_ptr = NULL;
+	pgdat->node_zonelists[0].zlcache_ptr = NULL;
+	pgdat->node_zonelists[1].zlcache_ptr = NULL;
 }
 
 #endif	/* CONFIG_NUMA */
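A node now carries exactly two zonelists instead of one per zone type: node_zonelists[0] is the general fallback list, filtered at scan time by gfp_zone(), and node_zonelists[1] is the node-local list used for GFP_THISNODE. The node_zonelist() helper used in nr_free_zone_pages() above then reduces to indexing by GFP flags; a sketch, assuming a gfp_zonelist() that yields 1 for __GFP_THISNODE on NUMA and 0 otherwise:

	/* Sketch of the selector this patch relies on (see include/linux/gfp.h). */
	static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
	{
		return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
	}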
@@ -4339,9 +4345,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 	else if (hashdist)
 		table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
 	else {
-		unsigned long order;
-		for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
-			;
+		unsigned long order = get_order(size);
 		table = (void*) __get_free_pages(GFP_ATOMIC, order);
 		/*
 		 * If bucketsize is not a power-of-two, we may free
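get_order(size) computes exactly what the removed loop did: the smallest order such that PAGE_SIZE << order covers size. With 4KiB pages, a 12KiB table therefore gets order 2 (16KiB), since order 1 (8KiB) is too small. The equivalence, as a sketch valid for size >= 1:

	unsigned long order;
	for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
		;
	BUG_ON(order != get_order(size));	/* both round size up to a power-of-two number of pages */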