diff options
-rw-r--r-- | mm/page_alloc.c | 353 |
1 files changed, 228 insertions, 125 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6be8fcb6f74f..512bf9a618c7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1457,47 +1457,171 @@ try_next_zone: | |||
1457 | return page; | 1457 | return page; |
1458 | } | 1458 | } |
1459 | 1459 | ||
1460 | /* | 1460 | static inline int |
1461 | * This is the 'heart' of the zoned buddy allocator. | 1461 | should_alloc_retry(gfp_t gfp_mask, unsigned int order, |
1462 | */ | 1462 | unsigned long pages_reclaimed) |
1463 | struct page * | ||
1464 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | ||
1465 | struct zonelist *zonelist, nodemask_t *nodemask) | ||
1466 | { | 1463 | { |
1467 | const gfp_t wait = gfp_mask & __GFP_WAIT; | 1464 | /* Do not loop if specifically requested */ |
1468 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 1465 | if (gfp_mask & __GFP_NORETRY) |
1469 | struct zoneref *z; | 1466 | return 0; |
1470 | struct zone *zone; | ||
1471 | struct page *page; | ||
1472 | struct reclaim_state reclaim_state; | ||
1473 | struct task_struct *p = current; | ||
1474 | int do_retry; | ||
1475 | int alloc_flags; | ||
1476 | unsigned long did_some_progress; | ||
1477 | unsigned long pages_reclaimed = 0; | ||
1478 | 1467 | ||
1479 | lockdep_trace_alloc(gfp_mask); | 1468 | /* |
1469 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER | ||
1470 | * means __GFP_NOFAIL, but that may not be true in other | ||
1471 | * implementations. | ||
1472 | */ | ||
1473 | if (order <= PAGE_ALLOC_COSTLY_ORDER) | ||
1474 | return 1; | ||
1480 | 1475 | ||
1481 | might_sleep_if(wait); | 1476 | /* |
1477 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is | ||
1478 | * specified, then we retry until we no longer reclaim any pages | ||
1479 | * (above), or we've reclaimed an order of pages at least as | ||
1480 | * large as the allocation's order. In both cases, if the | ||
1481 | * allocation still fails, we stop retrying. | ||
1482 | */ | ||
1483 | if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) | ||
1484 | return 1; | ||
1482 | 1485 | ||
1483 | if (should_fail_alloc_page(gfp_mask, order)) | 1486 | /* |
1484 | return NULL; | 1487 | * Don't let big-order allocations loop unless the caller |
1488 | * explicitly requests that. | ||
1489 | */ | ||
1490 | if (gfp_mask & __GFP_NOFAIL) | ||
1491 | return 1; | ||
1485 | 1492 | ||
1486 | /* the list of zones suitable for gfp_mask */ | 1493 | return 0; |
1487 | z = zonelist->_zonerefs; | 1494 | } |
1488 | if (unlikely(!z->zone)) { | 1495 | |
1489 | /* | 1496 | static inline struct page * |
1490 | * Happens if we have an empty zonelist as a result of | 1497 | __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, |
1491 | * GFP_THISNODE being used on a memoryless node | 1498 | struct zonelist *zonelist, enum zone_type high_zoneidx, |
1492 | */ | 1499 | nodemask_t *nodemask) |
1500 | { | ||
1501 | struct page *page; | ||
1502 | |||
1503 | /* Acquire the OOM killer lock for the zones in zonelist */ | ||
1504 | if (!try_set_zone_oom(zonelist, gfp_mask)) { | ||
1505 | schedule_timeout_uninterruptible(1); | ||
1493 | return NULL; | 1506 | return NULL; |
1494 | } | 1507 | } |
1495 | 1508 | ||
1496 | restart: | 1509 | /* |
1497 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | 1510 | * Go through the zonelist yet one more time, keep very high watermark |
1498 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); | 1511 | * here, this is only to catch a parallel oom killing, we must fail if |
1512 | * we're still under heavy pressure. | ||
1513 | */ | ||
1514 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | ||
1515 | order, zonelist, high_zoneidx, | ||
1516 | ALLOC_WMARK_HIGH|ALLOC_CPUSET); | ||
1499 | if (page) | 1517 | if (page) |
1500 | goto got_pg; | 1518 | goto out; |
1519 | |||
1520 | /* The OOM killer will not help higher order allocs */ | ||
1521 | if (order > PAGE_ALLOC_COSTLY_ORDER) | ||
1522 | goto out; | ||
1523 | |||
1524 | /* Exhausted what can be done so it's blamo time */ | ||
1525 | out_of_memory(zonelist, gfp_mask, order); | ||
1526 | |||
1527 | out: | ||
1528 | clear_zonelist_oom(zonelist, gfp_mask); | ||
1529 | return page; | ||
1530 | } | ||
1531 | |||
1532 | /* The really slow allocator path where we enter direct reclaim */ | ||
1533 | static inline struct page * | ||
1534 | __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, | ||
1535 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
1536 | nodemask_t *nodemask, int alloc_flags, unsigned long *did_some_progress) | ||
1537 | { | ||
1538 | struct page *page = NULL; | ||
1539 | struct reclaim_state reclaim_state; | ||
1540 | struct task_struct *p = current; | ||
1541 | |||
1542 | cond_resched(); | ||
1543 | |||
1544 | /* We now go into synchronous reclaim */ | ||
1545 | cpuset_memory_pressure_bump(); | ||
1546 | |||
1547 | /* | ||
1548 | * The task's cpuset might have expanded its set of allowable nodes | ||
1549 | */ | ||
1550 | p->flags |= PF_MEMALLOC; | ||
1551 | lockdep_set_current_reclaim_state(gfp_mask); | ||
1552 | reclaim_state.reclaimed_slab = 0; | ||
1553 | p->reclaim_state = &reclaim_state; | ||
1554 | |||
1555 | *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); | ||
1556 | |||
1557 | p->reclaim_state = NULL; | ||
1558 | lockdep_clear_current_reclaim_state(); | ||
1559 | p->flags &= ~PF_MEMALLOC; | ||
1560 | |||
1561 | cond_resched(); | ||
1562 | |||
1563 | if (order != 0) | ||
1564 | drain_all_pages(); | ||
1565 | |||
1566 | if (likely(*did_some_progress)) | ||
1567 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
1568 | zonelist, high_zoneidx, alloc_flags); | ||
1569 | return page; | ||
1570 | } | ||
1571 | |||
1572 | static inline int | ||
1573 | is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask) | ||
1574 | { | ||
1575 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) | ||
1576 | && !in_interrupt()) | ||
1577 | return 1; | ||
1578 | return 0; | ||
1579 | } | ||
1580 | |||
1581 | /* | ||
1582 | * This is called in the allocator slow-path if the allocation request is of | ||
1583 | * sufficient urgency to ignore watermarks and take other desperate measures | ||
1584 | */ | ||
1585 | static inline struct page * | ||
1586 | __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, | ||
1587 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
1588 | nodemask_t *nodemask) | ||
1589 | { | ||
1590 | struct page *page; | ||
1591 | |||
1592 | do { | ||
1593 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
1594 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); | ||
1595 | |||
1596 | if (!page && gfp_mask & __GFP_NOFAIL) | ||
1597 | congestion_wait(WRITE, HZ/50); | ||
1598 | } while (!page && (gfp_mask & __GFP_NOFAIL)); | ||
1599 | |||
1600 | return page; | ||
1601 | } | ||
1602 | |||
1603 | static inline | ||
1604 | void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, | ||
1605 | enum zone_type high_zoneidx) | ||
1606 | { | ||
1607 | struct zoneref *z; | ||
1608 | struct zone *zone; | ||
1609 | |||
1610 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | ||
1611 | wakeup_kswapd(zone, order); | ||
1612 | } | ||
1613 | |||
1614 | static inline struct page * | ||
1615 | __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, | ||
1616 | struct zonelist *zonelist, enum zone_type high_zoneidx, | ||
1617 | nodemask_t *nodemask) | ||
1618 | { | ||
1619 | const gfp_t wait = gfp_mask & __GFP_WAIT; | ||
1620 | struct page *page = NULL; | ||
1621 | int alloc_flags; | ||
1622 | unsigned long pages_reclaimed = 0; | ||
1623 | unsigned long did_some_progress; | ||
1624 | struct task_struct *p = current; | ||
1501 | 1625 | ||
1502 | /* | 1626 | /* |
1503 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and | 1627 | * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and |
@@ -1510,8 +1634,7 @@ restart: | |||
1510 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) | 1634 | if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) |
1511 | goto nopage; | 1635 | goto nopage; |
1512 | 1636 | ||
1513 | for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) | 1637 | wake_all_kswapd(order, zonelist, high_zoneidx); |
1514 | wakeup_kswapd(zone, order); | ||
1515 | 1638 | ||
1516 | /* | 1639 | /* |
1517 | * OK, we're below the kswapd watermark and have kicked background | 1640 | * OK, we're below the kswapd watermark and have kicked background |
@@ -1531,6 +1654,7 @@ restart: | |||
1531 | if (wait) | 1654 | if (wait) |
1532 | alloc_flags |= ALLOC_CPUSET; | 1655 | alloc_flags |= ALLOC_CPUSET; |
1533 | 1656 | ||
1657 | restart: | ||
1534 | /* | 1658 | /* |
1535 | * Go through the zonelist again. Let __GFP_HIGH and allocations | 1659 | * Go through the zonelist again. Let __GFP_HIGH and allocations |
1536 | * coming from realtime tasks go deeper into reserves. | 1660 | * coming from realtime tasks go deeper into reserves. |
@@ -1544,23 +1668,18 @@ restart: | |||
1544 | if (page) | 1668 | if (page) |
1545 | goto got_pg; | 1669 | goto got_pg; |
1546 | 1670 | ||
1547 | /* This allocation should allow future memory freeing. */ | ||
1548 | |||
1549 | rebalance: | 1671 | rebalance: |
1550 | if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) | 1672 | /* Allocate without watermarks if the context allows */ |
1551 | && !in_interrupt()) { | 1673 | if (is_allocation_high_priority(p, gfp_mask)) { |
1674 | /* Do not dip into emergency reserves if specified */ | ||
1552 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { | 1675 | if (!(gfp_mask & __GFP_NOMEMALLOC)) { |
1553 | nofail_alloc: | 1676 | page = __alloc_pages_high_priority(gfp_mask, order, |
1554 | /* go through the zonelist yet again, ignoring mins */ | 1677 | zonelist, high_zoneidx, nodemask); |
1555 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
1556 | zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); | ||
1557 | if (page) | 1678 | if (page) |
1558 | goto got_pg; | 1679 | goto got_pg; |
1559 | if (gfp_mask & __GFP_NOFAIL) { | ||
1560 | congestion_wait(WRITE, HZ/50); | ||
1561 | goto nofail_alloc; | ||
1562 | } | ||
1563 | } | 1680 | } |
1681 | |||
1682 | /* Ensure no recursion into the allocator */ | ||
1564 | goto nopage; | 1683 | goto nopage; |
1565 | } | 1684 | } |
1566 | 1685 | ||
@@ -1568,93 +1687,42 @@ nofail_alloc: | |||
1568 | if (!wait) | 1687 | if (!wait) |
1569 | goto nopage; | 1688 | goto nopage; |
1570 | 1689 | ||
1571 | cond_resched(); | 1690 | /* Try direct reclaim and then allocating */ |
1572 | 1691 | page = __alloc_pages_direct_reclaim(gfp_mask, order, | |
1573 | /* We now go into synchronous reclaim */ | 1692 | zonelist, high_zoneidx, |
1574 | cpuset_memory_pressure_bump(); | 1693 | nodemask, |
1575 | 1694 | alloc_flags, &did_some_progress); | |
1576 | p->flags |= PF_MEMALLOC; | 1695 | if (page) |
1577 | 1696 | goto got_pg; | |
1578 | lockdep_set_current_reclaim_state(gfp_mask); | ||
1579 | reclaim_state.reclaimed_slab = 0; | ||
1580 | p->reclaim_state = &reclaim_state; | ||
1581 | |||
1582 | did_some_progress = try_to_free_pages(zonelist, order, | ||
1583 | gfp_mask, nodemask); | ||
1584 | |||
1585 | p->reclaim_state = NULL; | ||
1586 | lockdep_clear_current_reclaim_state(); | ||
1587 | p->flags &= ~PF_MEMALLOC; | ||
1588 | 1697 | ||
1589 | cond_resched(); | 1698 | /* |
1699 | * If we failed to make any progress reclaiming, then we are | ||
1700 | * running out of options and have to consider going OOM | ||
1701 | */ | ||
1702 | if (!did_some_progress) { | ||
1703 | if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | ||
1704 | page = __alloc_pages_may_oom(gfp_mask, order, | ||
1705 | zonelist, high_zoneidx, | ||
1706 | nodemask); | ||
1707 | if (page) | ||
1708 | goto got_pg; | ||
1590 | 1709 | ||
1591 | if (order != 0) | 1710 | /* |
1592 | drain_all_pages(); | 1711 | * The OOM killer does not trigger for high-order allocations |
1712 | * but if no progress is being made, there are no other | ||
1713 | * options and retrying is unlikely to help | ||
1714 | */ | ||
1715 | if (order > PAGE_ALLOC_COSTLY_ORDER) | ||
1716 | goto nopage; | ||
1593 | 1717 | ||
1594 | if (likely(did_some_progress)) { | ||
1595 | page = get_page_from_freelist(gfp_mask, nodemask, order, | ||
1596 | zonelist, high_zoneidx, alloc_flags); | ||
1597 | if (page) | ||
1598 | goto got_pg; | ||
1599 | } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { | ||
1600 | if (!try_set_zone_oom(zonelist, gfp_mask)) { | ||
1601 | schedule_timeout_uninterruptible(1); | ||
1602 | goto restart; | 1718 | goto restart; |
1603 | } | 1719 | } |
1604 | |||
1605 | /* | ||
1606 | * Go through the zonelist yet one more time, keep | ||
1607 | * very high watermark here, this is only to catch | ||
1608 | * a parallel oom killing, we must fail if we're still | ||
1609 | * under heavy pressure. | ||
1610 | */ | ||
1611 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, | ||
1612 | order, zonelist, high_zoneidx, | ||
1613 | ALLOC_WMARK_HIGH|ALLOC_CPUSET); | ||
1614 | if (page) { | ||
1615 | clear_zonelist_oom(zonelist, gfp_mask); | ||
1616 | goto got_pg; | ||
1617 | } | ||
1618 | |||
1619 | /* The OOM killer will not help higher order allocs so fail */ | ||
1620 | if (order > PAGE_ALLOC_COSTLY_ORDER) { | ||
1621 | clear_zonelist_oom(zonelist, gfp_mask); | ||
1622 | goto nopage; | ||
1623 | } | ||
1624 | |||
1625 | out_of_memory(zonelist, gfp_mask, order); | ||
1626 | clear_zonelist_oom(zonelist, gfp_mask); | ||
1627 | goto restart; | ||
1628 | } | 1720 | } |
1629 | 1721 | ||
1630 | /* | 1722 | /* Check if we should retry the allocation */ |
1631 | * Don't let big-order allocations loop unless the caller explicitly | ||
1632 | * requests that. Wait for some write requests to complete then retry. | ||
1633 | * | ||
1634 | * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER | ||
1635 | * means __GFP_NOFAIL, but that may not be true in other | ||
1636 | * implementations. | ||
1637 | * | ||
1638 | * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is | ||
1639 | * specified, then we retry until we no longer reclaim any pages | ||
1640 | * (above), or we've reclaimed an order of pages at least as | ||
1641 | * large as the allocation's order. In both cases, if the | ||
1642 | * allocation still fails, we stop retrying. | ||
1643 | */ | ||
1644 | pages_reclaimed += did_some_progress; | 1723 | pages_reclaimed += did_some_progress; |
1645 | do_retry = 0; | 1724 | if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { |
1646 | if (!(gfp_mask & __GFP_NORETRY)) { | 1725 | /* Wait for some write requests to complete then retry */ |
1647 | if (order <= PAGE_ALLOC_COSTLY_ORDER) { | ||
1648 | do_retry = 1; | ||
1649 | } else { | ||
1650 | if (gfp_mask & __GFP_REPEAT && | ||
1651 | pages_reclaimed < (1 << order)) | ||
1652 | do_retry = 1; | ||
1653 | } | ||
1654 | if (gfp_mask & __GFP_NOFAIL) | ||
1655 | do_retry = 1; | ||
1656 | } | ||
1657 | if (do_retry) { | ||
1658 | congestion_wait(WRITE, HZ/50); | 1726 | congestion_wait(WRITE, HZ/50); |
1659 | goto rebalance; | 1727 | goto rebalance; |
1660 | } | 1728 | } |
@@ -1669,6 +1737,41 @@ nopage: | |||
1669 | } | 1737 | } |
1670 | got_pg: | 1738 | got_pg: |
1671 | return page; | 1739 | return page; |
1740 | |||
1741 | } | ||
1742 | |||
1743 | /* | ||
1744 | * This is the 'heart' of the zoned buddy allocator. | ||
1745 | */ | ||
1746 | struct page * | ||
1747 | __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | ||
1748 | struct zonelist *zonelist, nodemask_t *nodemask) | ||
1749 | { | ||
1750 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | ||
1751 | struct page *page; | ||
1752 | |||
1753 | lockdep_trace_alloc(gfp_mask); | ||
1754 | |||
1755 | might_sleep_if(gfp_mask & __GFP_WAIT); | ||
1756 | |||
1757 | if (should_fail_alloc_page(gfp_mask, order)) | ||
1758 | return NULL; | ||
1759 | |||
1760 | /* | ||
1761 | * Check the zones suitable for the gfp_mask contain at least one | ||
1762 | * valid zone. It's possible to have an empty zonelist as a result | ||
1763 | * of GFP_THISNODE and a memoryless node | ||
1764 | */ | ||
1765 | if (unlikely(!zonelist->_zonerefs->zone)) | ||
1766 | return NULL; | ||
1767 | |||
1768 | page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, | ||
1769 | zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); | ||
1770 | if (unlikely(!page)) | ||
1771 | page = __alloc_pages_slowpath(gfp_mask, order, | ||
1772 | zonelist, high_zoneidx, nodemask); | ||
1773 | |||
1774 | return page; | ||
1672 | } | 1775 | } |
1673 | EXPORT_SYMBOL(__alloc_pages_nodemask); | 1776 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
1674 | 1777 | ||