Diffstat (limited to 'mm/page_alloc.c')

-rw-r--r--  mm/page_alloc.c | 280
1 file changed, 142 insertions(+), 138 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 402a504f1228..d1cf4f05dcda 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -546,7 +546,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 /*
  * permit the bootmem allocator to evade page validation on high-order frees
  */
-void __init __free_pages_bootmem(struct page *page, unsigned int order)
+void __free_pages_bootmem(struct page *page, unsigned int order)
 {
 	if (order == 0) {
 		__ClearPageReserved(page);
@@ -632,7 +632,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 	if (PageReserved(page))
 		return 1;
 
-	page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead |
+	page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
 			1 << PG_referenced | 1 << PG_arch_1 |
 			1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
 	set_page_private(page, 0);
@@ -1050,7 +1050,7 @@ void split_page(struct page *page, unsigned int order)
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
  * or two.
  */
-static struct page *buffered_rmqueue(struct zonelist *zonelist,
+static struct page *buffered_rmqueue(struct zone *preferred_zone,
 			struct zone *zone, int order, gfp_t gfp_flags)
 {
 	unsigned long flags;
@@ -1102,7 +1102,7 @@ again:
 	}
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
-	zone_statistics(zonelist, zone);
+	zone_statistics(preferred_zone, zone);
 	local_irq_restore(flags);
 	put_cpu();
 
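zone_statistics() lives in mm/vmstat.c, so its new signature only shows up here as a caller change: NUMA hit/miss accounting is now keyed off the preferred (first-choice) zone rather than off zonelist->zones[0]. A rough sketch of its post-change shape, for orientation only:

	/* Sketch of the NUMA allocation statistics, keyed off the preferred
	 * zone; the real code is in mm/vmstat.c under CONFIG_NUMA. */
	void zone_statistics(struct zone *preferred_zone, struct zone *z)
	{
		if (z->zone_pgdat == preferred_zone->zone_pgdat) {
			__inc_zone_state(z, NUMA_HIT);
		} else {
			__inc_zone_state(z, NUMA_MISS);
			__inc_zone_state(preferred_zone, NUMA_FOREIGN);
		}
		if (z->node == numa_node_id())
			__inc_zone_state(z, NUMA_LOCAL);
		else
			__inc_zone_state(z, NUMA_OTHER);
	}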
@@ -1284,7 +1284,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
 	if (!zlc)
 		return NULL;
 
 	if (time_after(jiffies, zlc->last_full_zap + HZ)) {
 		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
 		zlc->last_full_zap = jiffies;
 	}
@@ -1317,7 +1317,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
  * We are low on memory in the second scan, and should leave no stone
  * unturned looking for a free page.
  */
-static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
 						nodemask_t *allowednodes)
 {
 	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
@@ -1328,7 +1328,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
 	if (!zlc)
 		return 1;
 
-	i = z - zonelist->zones;
+	i = z - zonelist->_zonerefs;
 	n = zlc->z_to_n[i];
 
 	/* This zone is worth trying if it is allowed but not full */
@@ -1340,7 +1340,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
  * zlc->fullzones, so that subsequent attempts to allocate a page
  * from that zone don't waste time re-examining it.
  */
-static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
 {
 	struct zonelist_cache *zlc;	/* cached zonelist speedup info */
 	int i;				/* index of *z in zonelist zones */
@@ -1349,7 +1349,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
 	if (!zlc)
 		return;
 
-	i = z - zonelist->zones;
+	i = z - zonelist->_zonerefs;
 
 	set_bit(i, zlc->fullzones);
 }
@@ -1361,13 +1361,13 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
 	return NULL;
 }
 
-static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
 				nodemask_t *allowednodes)
 {
 	return 1;
 }
 
-static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
 {
 }
 #endif	/* CONFIG_NUMA */
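The zlc helpers above now index the zonelist through struct zoneref rather than a bare struct zone * array. The type and its accessors come from include/linux/mmzone.h, which is not part of this file's diff; as introduced by this series they look roughly like:

	/* One zonelist entry: the zone plus its cached zone_idx() */
	struct zoneref {
		struct zone *zone;	/* Pointer to actual zone */
		int zone_idx;		/* zone_idx(zoneref->zone) */
	};

	static inline struct zone *zonelist_zone(struct zoneref *zoneref)
	{
		return zoneref->zone;
	}

	static inline int zonelist_zone_idx(struct zoneref *zoneref)
	{
		return zoneref->zone_idx;
	}

	static inline int zonelist_node_idx(struct zoneref *zoneref)
	{
	#ifdef CONFIG_NUMA
		return zoneref->zone->node;
	#else
		return 0;
	#endif
	}

Caching zone_idx() in each entry is what lets the allocator filter zones against high_zoneidx without touching every struct zone.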
@@ -1377,42 +1377,31 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
  * a page.
  */
 static struct page *
-get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
-		struct zonelist *zonelist, int alloc_flags)
+get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
+		struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
 {
-	struct zone **z;
+	struct zoneref *z;
 	struct page *page = NULL;
-	int classzone_idx = zone_idx(zonelist->zones[0]);
-	struct zone *zone;
+	int classzone_idx;
+	struct zone *zone, *preferred_zone;
 	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
 	int zlc_active = 0;		/* set if using zonelist_cache */
 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
-	enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */
+
+	(void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
+							&preferred_zone);
+	classzone_idx = zone_idx(preferred_zone);
 
 zonelist_scan:
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	z = zonelist->zones;
-
-	do {
-		/*
-		 * In NUMA, this could be a policy zonelist which contains
-		 * zones that may not be allowed by the current gfp_mask.
-		 * Check the zone is allowed by the current flags
-		 */
-		if (unlikely(alloc_should_filter_zonelist(zonelist))) {
-			if (highest_zoneidx == -1)
-				highest_zoneidx = gfp_zone(gfp_mask);
-			if (zone_idx(*z) > highest_zoneidx)
-				continue;
-		}
-
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+						high_zoneidx, nodemask) {
 		if (NUMA_BUILD && zlc_active &&
 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
 				continue;
-		zone = *z;
 		if ((alloc_flags & ALLOC_CPUSET) &&
 			!cpuset_zone_allowed_softwall(zone, gfp_mask))
 				goto try_next_zone;
@@ -1433,7 +1422,7 @@ zonelist_scan:
 			}
 		}
 
-		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
+		page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask);
 		if (page)
 			break;
 this_zone_full:
@@ -1446,7 +1435,7 @@ try_next_zone:
 			zlc_active = 1;
 			did_zlc_setup = 1;
 		}
-	} while (*(++z) != NULL);
+	}
 
 	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
 		/* Disable zlc cache for second zonelist scan */
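for_each_zone_zonelist_nodemask() replaces the open-coded do/while walk. It is defined in include/linux/mmzone.h; a sketch of its shape as introduced here, assuming first_zones_zonelist()/next_zones_zonelist() helpers that advance a zoneref cursor while filtering on zone index and nodemask:

	/* Iterate over every zone in zlist at or below highidx that belongs
	 * to a node in nodemask (a NULL nodemask means no node filtering). */
	#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
		for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone);    \
			zone;							   \
			z = next_zones_zonelist(++z, highidx, nodemask, &zone))

for_each_zone_zonelist(), used below for the kswapd wakeup loop, is the same iterator with a NULL nodemask.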
@@ -1459,12 +1448,14 @@
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
-struct page *
-__alloc_pages(gfp_t gfp_mask, unsigned int order,
-		struct zonelist *zonelist)
+static struct page *
+__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, nodemask_t *nodemask)
 {
 	const gfp_t wait = gfp_mask & __GFP_WAIT;
-	struct zone **z;
+	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+	struct zoneref *z;
+	struct zone *zone;
 	struct page *page;
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
@@ -1478,9 +1469,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
 		return NULL;
 
 restart:
-	z = zonelist->zones;  /* the list of zones suitable for gfp_mask */
+	z = zonelist->_zonerefs;  /* the list of zones suitable for gfp_mask */
 
-	if (unlikely(*z == NULL)) {
+	if (unlikely(!z->zone)) {
 		/*
 		 * Happens if we have an empty zonelist as a result of
 		 * GFP_THISNODE being used on a memoryless node
@@ -1488,8 +1479,8 @@ restart:
 		return NULL;
 	}
 
-	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
-			zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
 	if (page)
 		goto got_pg;
 
@@ -1504,8 +1495,8 @@
 	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
 		goto nopage;
 
-	for (z = zonelist->zones; *z; z++)
-		wakeup_kswapd(*z, order);
+	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+		wakeup_kswapd(zone, order);
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -1533,7 +1524,8 @@
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
+						high_zoneidx, alloc_flags);
 	if (page)
 		goto got_pg;
 
@@ -1545,8 +1537,8 @@ rebalance:
 	if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 nofail_alloc:
 			/* go through the zonelist yet again, ignoring mins */
-			page = get_page_from_freelist(gfp_mask, order,
-				zonelist, ALLOC_NO_WATERMARKS);
+			page = get_page_from_freelist(gfp_mask, nodemask, order,
+				zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
 			if (page)
 				goto got_pg;
 			if (gfp_mask & __GFP_NOFAIL) {
@@ -1569,7 +1561,7 @@ nofail_alloc:
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);
+	did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
 
 	p->reclaim_state = NULL;
 	p->flags &= ~PF_MEMALLOC;
@@ -1580,12 +1572,12 @@
 		drain_all_pages();
 
 	if (likely(did_some_progress)) {
-		page = get_page_from_freelist(gfp_mask, order,
-						zonelist, alloc_flags);
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
+					zonelist, high_zoneidx, alloc_flags);
 		if (page)
 			goto got_pg;
 	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
-		if (!try_set_zone_oom(zonelist)) {
+		if (!try_set_zone_oom(zonelist, gfp_mask)) {
 			schedule_timeout_uninterruptible(1);
 			goto restart;
 		}
@@ -1596,21 +1588,22 @@
 		 * a parallel oom killing, we must fail if we're still
 		 * under heavy pressure.
 		 */
-		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
-				zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
+		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+			order, zonelist, high_zoneidx,
+			ALLOC_WMARK_HIGH|ALLOC_CPUSET);
 		if (page) {
-			clear_zonelist_oom(zonelist);
+			clear_zonelist_oom(zonelist, gfp_mask);
 			goto got_pg;
 		}
 
 		/* The OOM killer will not help higher order allocs so fail */
 		if (order > PAGE_ALLOC_COSTLY_ORDER) {
-			clear_zonelist_oom(zonelist);
+			clear_zonelist_oom(zonelist, gfp_mask);
 			goto nopage;
 		}
 
 		out_of_memory(zonelist, gfp_mask, order);
-		clear_zonelist_oom(zonelist);
+		clear_zonelist_oom(zonelist, gfp_mask);
 		goto restart;
 	}
 
@@ -1646,6 +1639,20 @@ got_pg:
 	return page;
 }
 
+struct page *
+__alloc_pages(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist)
+{
+	return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
+}
+
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, nodemask_t *nodemask)
+{
+	return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
+}
+
 EXPORT_SYMBOL(__alloc_pages);
 
 /*
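__alloc_pages() keeps the old interface for existing callers, while __alloc_pages_nodemask() exposes the new node filtering. A hypothetical caller, purely for illustration (the real conversions happen in callers such as mm/mempolicy.c):

	/* Illustrative only: allocate from node 'nid', restricted to the
	 * nodes in 'allowed'; the function name is made up for the example. */
	static struct page *alloc_pages_restricted(gfp_t gfp, unsigned int order,
						int nid, nodemask_t *allowed)
	{
		struct zonelist *zl = node_zonelist(nid, gfp);

		return __alloc_pages_nodemask(gfp, order, zl, allowed);
	}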
@@ -1712,15 +1719,15 @@ EXPORT_SYMBOL(free_pages);
 
 static unsigned int nr_free_zone_pages(int offset)
 {
+	struct zoneref *z;
+	struct zone *zone;
+
 	/* Just pick one node, since fallback list is circular */
-	pg_data_t *pgdat = NODE_DATA(numa_node_id());
 	unsigned int sum = 0;
 
-	struct zonelist *zonelist = pgdat->node_zonelists + offset;
-	struct zone **zonep = zonelist->zones;
-	struct zone *zone;
+	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
 
-	for (zone = *zonep++; zone; zone = *zonep++) {
+	for_each_zone_zonelist(zone, z, zonelist, offset) {
 		unsigned long size = zone->present_pages;
 		unsigned long high = zone->pages_high;
 		if (size > high)
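node_zonelist() is the new helper that maps a node and a GFP mask to the right per-node zonelist, replacing open-coded pgdat->node_zonelists + offset arithmetic. From include/linux/gfp.h, roughly:

	/* Sketch: zonelist 1 is the no-fallback (__GFP_THISNODE) list,
	 * zonelist 0 the normal fallback list. */
	static inline int gfp_zonelist(gfp_t flags)
	{
		if (NUMA_BUILD && unlikely(flags & __GFP_THISNODE))
			return 1;
		return 0;
	}

	static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
	{
		return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
	}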
@@ -1889,6 +1896,12 @@ void show_free_areas(void)
 	show_swap_cache_info();
 }
 
+static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
+{
+	zoneref->zone = zone;
+	zoneref->zone_idx = zone_idx(zone);
+}
+
 /*
  * Builds allocation fallback zone lists.
  *
@@ -1906,7 +1919,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
 		zone_type--;
 		zone = pgdat->node_zones + zone_type;
 		if (populated_zone(zone)) {
-			zonelist->zones[nr_zones++] = zone;
+			zoneref_set_zone(zone,
+				&zonelist->_zonerefs[nr_zones++]);
 			check_highest_zone(zone_type);
 		}
 
@@ -2029,6 +2043,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
 	int n, val;
 	int min_val = INT_MAX;
 	int best_node = -1;
+	node_to_cpumask_ptr(tmp, 0);
 
 	/* Use the local node if we haven't already */
 	if (!node_isset(node, *used_node_mask)) {
@@ -2037,7 +2052,6 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
 	}
 
 	for_each_node_state(n, N_HIGH_MEMORY) {
-		cpumask_t tmp;
 
 		/* Don't want a node to appear more than once */
 		if (node_isset(n, *used_node_mask))
@@ -2050,8 +2064,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
 		val += (n < node);
 
 		/* Give preference to headless and unused nodes */
-		tmp = node_to_cpumask(n);
-		if (!cpus_empty(tmp))
+		node_to_cpumask_ptr_next(tmp, n);
+		if (!cpus_empty(*tmp))
 			val += PENALTY_FOR_NODE_WITH_CPUS;
 
 		/* Slight preference for less loaded node */
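node_to_cpumask_ptr() exists to keep a possibly large cpumask_t off the kernel stack: on architectures with a per-node cpumask map it degenerates to taking a pointer, while the generic fallback keeps a single local copy that node_to_cpumask_ptr_next() refreshes each iteration. The generic fallback looks roughly like this (a sketch; the real definitions live in the topology headers):

	#define node_to_cpumask_ptr(v, node)				\
			cpumask_t _##v = node_to_cpumask(node);		\
			cpumask_t *v = &_##v

	#define node_to_cpumask_ptr_next(v, node)			\
				_##v = node_to_cpumask(node)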
@@ -2078,17 +2092,16 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
  */
 static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
 {
-	enum zone_type i;
 	int j;
 	struct zonelist *zonelist;
 
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		zonelist = pgdat->node_zonelists + i;
-		for (j = 0; zonelist->zones[j] != NULL; j++)
-			;
-		j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
-		zonelist->zones[j] = NULL;
-	}
+	zonelist = &pgdat->node_zonelists[0];
+	for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
+		;
+	j = build_zonelists_node(NODE_DATA(node), zonelist, j,
+							MAX_NR_ZONES - 1);
+	zonelist->_zonerefs[j].zone = NULL;
+	zonelist->_zonerefs[j].zone_idx = 0;
 }
 
 /*
@@ -2096,15 +2109,13 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
  */
 static void build_thisnode_zonelists(pg_data_t *pgdat)
 {
-	enum zone_type i;
 	int j;
 	struct zonelist *zonelist;
 
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i;
-		j = build_zonelists_node(pgdat, zonelist, 0, i);
-		zonelist->zones[j] = NULL;
-	}
+	zonelist = &pgdat->node_zonelists[1];
+	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
+	zonelist->_zonerefs[j].zone = NULL;
+	zonelist->_zonerefs[j].zone_idx = 0;
 }
 
 /*
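A node now carries just two zonelists, node_zonelists[0] with full fallback and node_zonelists[1] for GFP_THISNODE allocations, instead of one zonelist per zone type. The corresponding mmzone.h layout, sketched from this series:

	/* Sketch: a zonelist is a NULL-terminated array of zonerefs,
	 * optionally followed by the NUMA zonelist cache. */
	struct zonelist {
		struct zonelist_cache *zlcache_ptr;	/* NULL or &zlcache */
		struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
	#ifdef CONFIG_NUMA
		struct zonelist_cache zlcache;		/* optional */
	#endif
	};

Filtering each scan by gfp_zone(gfp_mask) at allocation time is what lets the single fallback list serve every zone type.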
@@ -2117,27 +2128,26 @@ static int node_order[MAX_NUMNODES];
 
 static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
 {
-	enum zone_type i;
 	int pos, j, node;
 	int zone_type;		/* needs to be signed */
 	struct zone *z;
 	struct zonelist *zonelist;
 
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		zonelist = pgdat->node_zonelists + i;
-		pos = 0;
-		for (zone_type = i; zone_type >= 0; zone_type--) {
-			for (j = 0; j < nr_nodes; j++) {
-				node = node_order[j];
-				z = &NODE_DATA(node)->node_zones[zone_type];
-				if (populated_zone(z)) {
-					zonelist->zones[pos++] = z;
-					check_highest_zone(zone_type);
-				}
+	zonelist = &pgdat->node_zonelists[0];
+	pos = 0;
+	for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
+		for (j = 0; j < nr_nodes; j++) {
+			node = node_order[j];
+			z = &NODE_DATA(node)->node_zones[zone_type];
+			if (populated_zone(z)) {
+				zoneref_set_zone(z,
+					&zonelist->_zonerefs[pos++]);
+				check_highest_zone(zone_type);
 			}
 		}
-		zonelist->zones[pos] = NULL;
 	}
+	zonelist->_zonerefs[pos].zone = NULL;
+	zonelist->_zonerefs[pos].zone_idx = 0;
 }
 
 static int default_zonelist_order(void)
@@ -2214,7 +2224,8 @@ static void build_zonelists(pg_data_t *pgdat)
 	/* initialize zonelists */
 	for (i = 0; i < MAX_ZONELISTS; i++) {
 		zonelist = pgdat->node_zonelists + i;
-		zonelist->zones[0] = NULL;
+		zonelist->_zonerefs[0].zone = NULL;
+		zonelist->_zonerefs[0].zone_idx = 0;
 	}
 
 	/* NUMA-aware ordering of nodes */
@@ -2264,19 +2275,15 @@ static void build_zonelists(pg_data_t *pgdat)
 /* Construct the zonelist performance cache - see further mmzone.h */
 static void build_zonelist_cache(pg_data_t *pgdat)
 {
-	int i;
-
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		struct zonelist *zonelist;
-		struct zonelist_cache *zlc;
-		struct zone **z;
+	struct zonelist *zonelist;
+	struct zonelist_cache *zlc;
+	struct zoneref *z;
 
-		zonelist = pgdat->node_zonelists + i;
-		zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
-		bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
-		for (z = zonelist->zones; *z; z++)
-			zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
-	}
+	zonelist = &pgdat->node_zonelists[0];
+	zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+	bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+	for (z = zonelist->_zonerefs; z->zone; z++)
+		zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
 }
 
 
@@ -2290,45 +2297,44 @@ static void set_zonelist_order(void)
 static void build_zonelists(pg_data_t *pgdat)
 {
 	int node, local_node;
-	enum zone_type i,j;
+	enum zone_type j;
+	struct zonelist *zonelist;
 
 	local_node = pgdat->node_id;
-	for (i = 0; i < MAX_NR_ZONES; i++) {
-		struct zonelist *zonelist;
 
-		zonelist = pgdat->node_zonelists + i;
+	zonelist = &pgdat->node_zonelists[0];
+	j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
 
-		j = build_zonelists_node(pgdat, zonelist, 0, i);
-		/*
-		 * Now we build the zonelist so that it contains the zones
-		 * of all the other nodes.
-		 * We don't want to pressure a particular node, so when
-		 * building the zones for node N, we make sure that the
-		 * zones coming right after the local ones are those from
-		 * node N+1 (modulo N)
-		 */
-		for (node = local_node + 1; node < MAX_NUMNODES; node++) {
-			if (!node_online(node))
-				continue;
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
-		}
-		for (node = 0; node < local_node; node++) {
-			if (!node_online(node))
-				continue;
-			j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
-		}
-
-		zonelist->zones[j] = NULL;
+	/*
+	 * Now we build the zonelist so that it contains the zones
+	 * of all the other nodes.
+	 * We don't want to pressure a particular node, so when
+	 * building the zones for node N, we make sure that the
+	 * zones coming right after the local ones are those from
+	 * node N+1 (modulo N)
+	 */
+	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
+		if (!node_online(node))
+			continue;
+		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
+							MAX_NR_ZONES - 1);
+	}
+	for (node = 0; node < local_node; node++) {
+		if (!node_online(node))
+			continue;
+		j = build_zonelists_node(NODE_DATA(node), zonelist, j,
+							MAX_NR_ZONES - 1);
 	}
+
+	zonelist->_zonerefs[j].zone = NULL;
+	zonelist->_zonerefs[j].zone_idx = 0;
 }
 
 /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
 static void build_zonelist_cache(pg_data_t *pgdat)
 {
-	int i;
-
-	for (i = 0; i < MAX_NR_ZONES; i++)
-		pgdat->node_zonelists[i].zlcache_ptr = NULL;
+	pgdat->node_zonelists[0].zlcache_ptr = NULL;
+	pgdat->node_zonelists[1].zlcache_ptr = NULL;
 }
 
 #endif	/* CONFIG_NUMA */
@@ -4339,9 +4345,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 		else if (hashdist)
 			table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
 		else {
-			unsigned long order;
-			for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
-				;
+			unsigned long order = get_order(size);
 			table = (void*) __get_free_pages(GFP_ATOMIC, order);
 			/*
 			 * If bucketsize is not a power-of-two, we may free
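get_order(size) computes exactly what the removed loop did: the smallest order such that PAGE_SIZE << order covers size. An equivalent helper, shown as a sketch (the real get_order() lives in the per-arch page headers):

	/* Smallest 'order' with (PAGE_SIZE << order) >= size; not meaningful
	 * for size == 0. Sketch of the classic generic implementation. */
	static inline int get_order_sketch(unsigned long size)
	{
		int order = -1;

		size = (size - 1) >> (PAGE_SHIFT - 1);
		do {
			size >>= 1;
			order++;
		} while (size);
		return order;
	}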