author		Mel Gorman <mel@csn.ul.ie>	2009-06-16 18:31:57 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-06-16 22:47:32 -0400
commit		11e33f6a55ed7847d9c8ffe185ef87faf7806abe (patch)
tree		ca70fe29e836c508cc279c619f7b856380a6f10f
parent		7f82af9742a9346794ecc1515139daed480e7025 (diff)
page allocator: break up the allocator entry point into fast and slow paths
The core of the page allocator is one giant function which allocates memory
on the stack and makes calculations that may not be needed for every
allocation. This patch breaks up the allocator path into fast and slow paths
for clarity. Note the slow paths are still inlined but the entry is marked
unlikely. If they were not inlined, it actually increases text size as there
is only one call site.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	mm/page_alloc.c	353
1 file changed, 228 insertions(+), 125 deletions(-)
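
For orientation before reading the diff: the patch keeps a short __alloc_pages_nodemask() fast path and pushes the rest behind a single unlikely() branch into an inlined __alloc_pages_slowpath() helper. The small userspace sketch below illustrates only that control-flow pattern; the names alloc_fastpath/alloc_slowpath and the use of malloc() as a stand-in for the freelist are invented for the example and are not part of the patch.

#include <stdlib.h>

#define unlikely(x)	__builtin_expect(!!(x), 0)

/* Stand-in slow path: still inlined, but only reached from one call site. */
static inline void *alloc_slowpath(size_t size)
{
	/* In the real patch this is where reclaim, retries and OOM handling live. */
	return malloc(size);
}

/* Stand-in entry point: a compact fast path with one unlikely branch. */
void *alloc_fastpath(size_t size)
{
	void *obj = malloc(size);	/* plays the role of get_page_from_freelist() */

	if (unlikely(!obj))
		obj = alloc_slowpath(size);

	return obj;
}
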
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6be8fcb6f74f..512bf9a618c7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1457,47 +1457,171 @@ try_next_zone:
 	return page;
 }
 
-/*
- * This is the 'heart' of the zoned buddy allocator.
- */
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
-			struct zonelist *zonelist, nodemask_t *nodemask)
+static inline int
+should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+				unsigned long pages_reclaimed)
 {
-	const gfp_t wait = gfp_mask & __GFP_WAIT;
-	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
-	struct zoneref *z;
-	struct zone *zone;
-	struct page *page;
-	struct reclaim_state reclaim_state;
-	struct task_struct *p = current;
-	int do_retry;
-	int alloc_flags;
-	unsigned long did_some_progress;
-	unsigned long pages_reclaimed = 0;
+	/* Do not loop if specifically requested */
+	if (gfp_mask & __GFP_NORETRY)
+		return 0;
 
-	lockdep_trace_alloc(gfp_mask);
+	/*
+	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
+	 * means __GFP_NOFAIL, but that may not be true in other
+	 * implementations.
+	 */
+	if (order <= PAGE_ALLOC_COSTLY_ORDER)
+		return 1;
 
-	might_sleep_if(wait);
+	/*
+	 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
+	 * specified, then we retry until we no longer reclaim any pages
+	 * (above), or we've reclaimed an order of pages at least as
+	 * large as the allocation's order. In both cases, if the
+	 * allocation still fails, we stop retrying.
+	 */
+	if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
+		return 1;
 
-	if (should_fail_alloc_page(gfp_mask, order))
-		return NULL;
+	/*
+	 * Don't let big-order allocations loop unless the caller
+	 * explicitly requests that.
+	 */
+	if (gfp_mask & __GFP_NOFAIL)
+		return 1;
 
-	/* the list of zones suitable for gfp_mask */
-	z = zonelist->_zonerefs;
-	if (unlikely(!z->zone)) {
-		/*
-		 * Happens if we have an empty zonelist as a result of
-		 * GFP_THISNODE being used on a memoryless node
-		 */
+	return 0;
+}
+
+static inline struct page *
+__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask)
+{
+	struct page *page;
+
+	/* Acquire the OOM killer lock for the zones in zonelist */
+	if (!try_set_zone_oom(zonelist, gfp_mask)) {
+		schedule_timeout_uninterruptible(1);
 		return NULL;
 	}
 
-restart:
-	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
-			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+	/*
+	 * Go through the zonelist yet one more time, keep very high watermark
+	 * here, this is only to catch a parallel oom killing, we must fail if
+	 * we're still under heavy pressure.
+	 */
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+		order, zonelist, high_zoneidx,
+		ALLOC_WMARK_HIGH|ALLOC_CPUSET);
 	if (page)
-		goto got_pg;
+		goto out;
+
+	/* The OOM killer will not help higher order allocs */
+	if (order > PAGE_ALLOC_COSTLY_ORDER)
+		goto out;
+
+	/* Exhausted what can be done so it's blamo time */
+	out_of_memory(zonelist, gfp_mask, order);
+
+out:
+	clear_zonelist_oom(zonelist, gfp_mask);
+	return page;
+}
+
+/* The really slow allocator path where we enter direct reclaim */
+static inline struct page *
+__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask, int alloc_flags, unsigned long *did_some_progress)
+{
+	struct page *page = NULL;
+	struct reclaim_state reclaim_state;
+	struct task_struct *p = current;
+
+	cond_resched();
+
+	/* We now go into synchronous reclaim */
+	cpuset_memory_pressure_bump();
+
+	/*
+	 * The task's cpuset might have expanded its set of allowable nodes
+	 */
+	p->flags |= PF_MEMALLOC;
+	lockdep_set_current_reclaim_state(gfp_mask);
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+
+	*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
+
+	p->reclaim_state = NULL;
+	lockdep_clear_current_reclaim_state();
+	p->flags &= ~PF_MEMALLOC;
+
+	cond_resched();
+
+	if (order != 0)
+		drain_all_pages();
+
+	if (likely(*did_some_progress))
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
+					zonelist, high_zoneidx, alloc_flags);
+	return page;
+}
+
+static inline int
+is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask)
+{
+	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
+			&& !in_interrupt())
+		return 1;
+	return 0;
+}
+
+/*
+ * This is called in the allocator slow-path if the allocation request is of
+ * sufficient urgency to ignore watermarks and take other desperate measures
+ */
+static inline struct page *
+__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask)
+{
+	struct page *page;
+
+	do {
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
+			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
+
+		if (!page && gfp_mask & __GFP_NOFAIL)
+			congestion_wait(WRITE, HZ/50);
+	} while (!page && (gfp_mask & __GFP_NOFAIL));
+
+	return page;
+}
+
+static inline
+void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
+						enum zone_type high_zoneidx)
+{
+	struct zoneref *z;
+	struct zone *zone;
+
+	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+		wakeup_kswapd(zone, order);
+}
+
+static inline struct page *
+__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask)
+{
+	const gfp_t wait = gfp_mask & __GFP_WAIT;
+	struct page *page = NULL;
+	int alloc_flags;
+	unsigned long pages_reclaimed = 0;
+	unsigned long did_some_progress;
+	struct task_struct *p = current;
 
 	/*
 	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1510,8 +1634,7 @@ restart:
 	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
 		goto nopage;
 
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-		wakeup_kswapd(zone, order);
+	wake_all_kswapd(order, zonelist, high_zoneidx);
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -1531,6 +1654,7 @@ restart:
 	if (wait)
 		alloc_flags |= ALLOC_CPUSET;
 
+restart:
 	/*
 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
 	 * coming from realtime tasks go deeper into reserves.
@@ -1544,23 +1668,18 @@ restart:
 	if (page)
 		goto got_pg;
 
-	/* This allocation should allow future memory freeing. */
-
 rebalance:
-	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
-			&& !in_interrupt()) {
+	/* Allocate without watermarks if the context allows */
+	if (is_allocation_high_priority(p, gfp_mask)) {
+		/* Do not dip into emergency reserves if specified */
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
-nofail_alloc:
-			/* go through the zonelist yet again, ignoring mins */
-			page = get_page_from_freelist(gfp_mask, nodemask, order,
-				zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
+			page = __alloc_pages_high_priority(gfp_mask, order,
+					zonelist, high_zoneidx, nodemask);
 			if (page)
 				goto got_pg;
-			if (gfp_mask & __GFP_NOFAIL) {
-				congestion_wait(WRITE, HZ/50);
-				goto nofail_alloc;
-			}
 		}
+
+		/* Ensure no recursion into the allocator */
 		goto nopage;
 	}
 
@@ -1568,93 +1687,42 @@ nofail_alloc:
 	if (!wait)
 		goto nopage;
 
-	cond_resched();
-
-	/* We now go into synchronous reclaim */
-	cpuset_memory_pressure_bump();
-
-	p->flags |= PF_MEMALLOC;
-
-	lockdep_set_current_reclaim_state(gfp_mask);
-	reclaim_state.reclaimed_slab = 0;
-	p->reclaim_state = &reclaim_state;
-
-	did_some_progress = try_to_free_pages(zonelist, order,
-						gfp_mask, nodemask);
-
-	p->reclaim_state = NULL;
-	lockdep_clear_current_reclaim_state();
-	p->flags &= ~PF_MEMALLOC;
+	/* Try direct reclaim and then allocating */
+	page = __alloc_pages_direct_reclaim(gfp_mask, order,
+					zonelist, high_zoneidx,
+					nodemask,
+					alloc_flags, &did_some_progress);
+	if (page)
+		goto got_pg;
 
-	cond_resched();
+	/*
+	 * If we failed to make any progress reclaiming, then we are
+	 * running out of options and have to consider going OOM
+	 */
+	if (!did_some_progress) {
+		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+			page = __alloc_pages_may_oom(gfp_mask, order,
+					zonelist, high_zoneidx,
+					nodemask);
+			if (page)
+				goto got_pg;
 
-	if (order != 0)
-		drain_all_pages();
+			/*
+			 * The OOM killer does not trigger for high-order allocations
+			 * but if no progress is being made, there are no other
+			 * options and retrying is unlikely to help
+			 */
+			if (order > PAGE_ALLOC_COSTLY_ORDER)
+				goto nopage;
 
-	if (likely(did_some_progress)) {
-		page = get_page_from_freelist(gfp_mask, nodemask, order,
-					zonelist, high_zoneidx, alloc_flags);
-		if (page)
-			goto got_pg;
-	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
-		if (!try_set_zone_oom(zonelist, gfp_mask)) {
-			schedule_timeout_uninterruptible(1);
 			goto restart;
 		}
-
-		/*
-		 * Go through the zonelist yet one more time, keep
-		 * very high watermark here, this is only to catch
-		 * a parallel oom killing, we must fail if we're still
-		 * under heavy pressure.
-		 */
-		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
-			order, zonelist, high_zoneidx,
-			ALLOC_WMARK_HIGH|ALLOC_CPUSET);
-		if (page) {
-			clear_zonelist_oom(zonelist, gfp_mask);
-			goto got_pg;
-		}
-
-		/* The OOM killer will not help higher order allocs so fail */
-		if (order > PAGE_ALLOC_COSTLY_ORDER) {
-			clear_zonelist_oom(zonelist, gfp_mask);
-			goto nopage;
-		}
-
-		out_of_memory(zonelist, gfp_mask, order);
-		clear_zonelist_oom(zonelist, gfp_mask);
-		goto restart;
 	}
 
-	/*
-	 * Don't let big-order allocations loop unless the caller explicitly
-	 * requests that. Wait for some write requests to complete then retry.
-	 *
-	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
-	 * means __GFP_NOFAIL, but that may not be true in other
-	 * implementations.
-	 *
-	 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
-	 * specified, then we retry until we no longer reclaim any pages
-	 * (above), or we've reclaimed an order of pages at least as
-	 * large as the allocation's order. In both cases, if the
-	 * allocation still fails, we stop retrying.
-	 */
+	/* Check if we should retry the allocation */
 	pages_reclaimed += did_some_progress;
-	do_retry = 0;
-	if (!(gfp_mask & __GFP_NORETRY)) {
-		if (order <= PAGE_ALLOC_COSTLY_ORDER) {
-			do_retry = 1;
-		} else {
-			if (gfp_mask & __GFP_REPEAT &&
-					pages_reclaimed < (1 << order))
-				do_retry = 1;
-		}
-		if (gfp_mask & __GFP_NOFAIL)
-			do_retry = 1;
-	}
-	if (do_retry) {
+	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+		/* Wait for some write requests to complete then retry */
 		congestion_wait(WRITE, HZ/50);
 		goto rebalance;
 	}
@@ -1669,6 +1737,41 @@ nopage:
 	}
 got_pg:
 	return page;
+
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ */
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+			struct zonelist *zonelist, nodemask_t *nodemask)
+{
+	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+	struct page *page;
+
+	lockdep_trace_alloc(gfp_mask);
+
+	might_sleep_if(gfp_mask & __GFP_WAIT);
+
+	if (should_fail_alloc_page(gfp_mask, order))
+		return NULL;
+
+	/*
+	 * Check the zones suitable for the gfp_mask contain at least one
+	 * valid zone. It's possible to have an empty zonelist as a result
+	 * of GFP_THISNODE and a memoryless node
+	 */
+	if (unlikely(!zonelist->_zonerefs->zone))
+		return NULL;
+
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+	if (unlikely(!page))
+		page = __alloc_pages_slowpath(gfp_mask, order,
+				zonelist, high_zoneidx, nodemask);
+
+	return page;
 }
 EXPORT_SYMBOL(__alloc_pages_nodemask);
 