aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--mm/page_alloc.c353
1 files changed, 228 insertions, 125 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6be8fcb6f74f..512bf9a618c7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1457,47 +1457,171 @@ try_next_zone:
1457 return page; 1457 return page;
1458} 1458}
1459 1459
1460/* 1460static inline int
1461 * This is the 'heart' of the zoned buddy allocator. 1461should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1462 */ 1462 unsigned long pages_reclaimed)
1463struct page *
1464__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1465 struct zonelist *zonelist, nodemask_t *nodemask)
1466{ 1463{
1467 const gfp_t wait = gfp_mask & __GFP_WAIT; 1464 /* Do not loop if specifically requested */
1468 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1465 if (gfp_mask & __GFP_NORETRY)
1469 struct zoneref *z; 1466 return 0;
1470 struct zone *zone;
1471 struct page *page;
1472 struct reclaim_state reclaim_state;
1473 struct task_struct *p = current;
1474 int do_retry;
1475 int alloc_flags;
1476 unsigned long did_some_progress;
1477 unsigned long pages_reclaimed = 0;
1478 1467
1479 lockdep_trace_alloc(gfp_mask); 1468 /*
1469 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1470 * means __GFP_NOFAIL, but that may not be true in other
1471 * implementations.
1472 */
1473 if (order <= PAGE_ALLOC_COSTLY_ORDER)
1474 return 1;
1480 1475
1481 might_sleep_if(wait); 1476 /*
1477 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1478 * specified, then we retry until we no longer reclaim any pages
1479 * (above), or we've reclaimed an order of pages at least as
1480 * large as the allocation's order. In both cases, if the
1481 * allocation still fails, we stop retrying.
1482 */
1483 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
1484 return 1;
1482 1485
1483 if (should_fail_alloc_page(gfp_mask, order)) 1486 /*
1484 return NULL; 1487 * Don't let big-order allocations loop unless the caller
1488 * explicitly requests that.
1489 */
1490 if (gfp_mask & __GFP_NOFAIL)
1491 return 1;
1485 1492
1486 /* the list of zones suitable for gfp_mask */ 1493 return 0;
1487 z = zonelist->_zonerefs; 1494}
1488 if (unlikely(!z->zone)) { 1495
1489 /* 1496static inline struct page *
1490 * Happens if we have an empty zonelist as a result of 1497__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1491 * GFP_THISNODE being used on a memoryless node 1498 struct zonelist *zonelist, enum zone_type high_zoneidx,
1492 */ 1499 nodemask_t *nodemask)
1500{
1501 struct page *page;
1502
1503 /* Acquire the OOM killer lock for the zones in zonelist */
1504 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1505 schedule_timeout_uninterruptible(1);
1493 return NULL; 1506 return NULL;
1494 } 1507 }
1495 1508
1496restart: 1509 /*
1497 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 1510 * Go through the zonelist yet one more time, keep very high watermark
1498 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1511 * here, this is only to catch a parallel oom killing, we must fail if
1512 * we're still under heavy pressure.
1513 */
1514 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1515 order, zonelist, high_zoneidx,
1516 ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1499 if (page) 1517 if (page)
1500 goto got_pg; 1518 goto out;
1519
1520 /* The OOM killer will not help higher order allocs */
1521 if (order > PAGE_ALLOC_COSTLY_ORDER)
1522 goto out;
1523
1524 /* Exhausted what can be done so it's blamo time */
1525 out_of_memory(zonelist, gfp_mask, order);
1526
1527out:
1528 clear_zonelist_oom(zonelist, gfp_mask);
1529 return page;
1530}
1531
1532/* The really slow allocator path where we enter direct reclaim */
1533static inline struct page *
1534__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1535 struct zonelist *zonelist, enum zone_type high_zoneidx,
1536 nodemask_t *nodemask, int alloc_flags, unsigned long *did_some_progress)
1537{
1538 struct page *page = NULL;
1539 struct reclaim_state reclaim_state;
1540 struct task_struct *p = current;
1541
1542 cond_resched();
1543
1544 /* We now go into synchronous reclaim */
1545 cpuset_memory_pressure_bump();
1546
1547 /*
1548 * The task's cpuset might have expanded its set of allowable nodes
1549 */
1550 p->flags |= PF_MEMALLOC;
1551 lockdep_set_current_reclaim_state(gfp_mask);
1552 reclaim_state.reclaimed_slab = 0;
1553 p->reclaim_state = &reclaim_state;
1554
1555 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1556
1557 p->reclaim_state = NULL;
1558 lockdep_clear_current_reclaim_state();
1559 p->flags &= ~PF_MEMALLOC;
1560
1561 cond_resched();
1562
1563 if (order != 0)
1564 drain_all_pages();
1565
1566 if (likely(*did_some_progress))
1567 page = get_page_from_freelist(gfp_mask, nodemask, order,
1568 zonelist, high_zoneidx, alloc_flags);
1569 return page;
1570}
1571
1572static inline int
1573is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask)
1574{
1575 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
1576 && !in_interrupt())
1577 return 1;
1578 return 0;
1579}
1580
1581/*
1582 * This is called in the allocator slow-path if the allocation request is of
1583 * sufficient urgency to ignore watermarks and take other desperate measures
1584 */
1585static inline struct page *
1586__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1587 struct zonelist *zonelist, enum zone_type high_zoneidx,
1588 nodemask_t *nodemask)
1589{
1590 struct page *page;
1591
1592 do {
1593 page = get_page_from_freelist(gfp_mask, nodemask, order,
1594 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
1595
1596 if (!page && gfp_mask & __GFP_NOFAIL)
1597 congestion_wait(WRITE, HZ/50);
1598 } while (!page && (gfp_mask & __GFP_NOFAIL));
1599
1600 return page;
1601}
1602
1603static inline
1604void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1605 enum zone_type high_zoneidx)
1606{
1607 struct zoneref *z;
1608 struct zone *zone;
1609
1610 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1611 wakeup_kswapd(zone, order);
1612}
1613
1614static inline struct page *
1615__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1616 struct zonelist *zonelist, enum zone_type high_zoneidx,
1617 nodemask_t *nodemask)
1618{
1619 const gfp_t wait = gfp_mask & __GFP_WAIT;
1620 struct page *page = NULL;
1621 int alloc_flags;
1622 unsigned long pages_reclaimed = 0;
1623 unsigned long did_some_progress;
1624 struct task_struct *p = current;
1501 1625
1502 /* 1626 /*
1503 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1627 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1510,8 +1634,7 @@ restart:
1510 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1634 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1511 goto nopage; 1635 goto nopage;
1512 1636
1513 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 1637 wake_all_kswapd(order, zonelist, high_zoneidx);
1514 wakeup_kswapd(zone, order);
1515 1638
1516 /* 1639 /*
1517 * OK, we're below the kswapd watermark and have kicked background 1640 * OK, we're below the kswapd watermark and have kicked background
@@ -1531,6 +1654,7 @@ restart:
1531 if (wait) 1654 if (wait)
1532 alloc_flags |= ALLOC_CPUSET; 1655 alloc_flags |= ALLOC_CPUSET;
1533 1656
1657restart:
1534 /* 1658 /*
1535 * Go through the zonelist again. Let __GFP_HIGH and allocations 1659 * Go through the zonelist again. Let __GFP_HIGH and allocations
1536 * coming from realtime tasks go deeper into reserves. 1660 * coming from realtime tasks go deeper into reserves.
@@ -1544,23 +1668,18 @@ restart:
1544 if (page) 1668 if (page)
1545 goto got_pg; 1669 goto got_pg;
1546 1670
1547 /* This allocation should allow future memory freeing. */
1548
1549rebalance: 1671rebalance:
1550 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1672 /* Allocate without watermarks if the context allows */
1551 && !in_interrupt()) { 1673 if (is_allocation_high_priority(p, gfp_mask)) {
1674 /* Do not dip into emergency reserves if specified */
1552 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1675 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
1553nofail_alloc: 1676 page = __alloc_pages_high_priority(gfp_mask, order,
1554 /* go through the zonelist yet again, ignoring mins */ 1677 zonelist, high_zoneidx, nodemask);
1555 page = get_page_from_freelist(gfp_mask, nodemask, order,
1556 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
1557 if (page) 1678 if (page)
1558 goto got_pg; 1679 goto got_pg;
1559 if (gfp_mask & __GFP_NOFAIL) {
1560 congestion_wait(WRITE, HZ/50);
1561 goto nofail_alloc;
1562 }
1563 } 1680 }
1681
1682 /* Ensure no recursion into the allocator */
1564 goto nopage; 1683 goto nopage;
1565 } 1684 }
1566 1685
@@ -1568,93 +1687,42 @@ nofail_alloc:
1568 if (!wait) 1687 if (!wait)
1569 goto nopage; 1688 goto nopage;
1570 1689
1571 cond_resched(); 1690 /* Try direct reclaim and then allocating */
1572 1691 page = __alloc_pages_direct_reclaim(gfp_mask, order,
1573 /* We now go into synchronous reclaim */ 1692 zonelist, high_zoneidx,
1574 cpuset_memory_pressure_bump(); 1693 nodemask,
1575 1694 alloc_flags, &did_some_progress);
1576 p->flags |= PF_MEMALLOC; 1695 if (page)
1577 1696 goto got_pg;
1578 lockdep_set_current_reclaim_state(gfp_mask);
1579 reclaim_state.reclaimed_slab = 0;
1580 p->reclaim_state = &reclaim_state;
1581
1582 did_some_progress = try_to_free_pages(zonelist, order,
1583 gfp_mask, nodemask);
1584
1585 p->reclaim_state = NULL;
1586 lockdep_clear_current_reclaim_state();
1587 p->flags &= ~PF_MEMALLOC;
1588 1697
1589 cond_resched(); 1698 /*
1699 * If we failed to make any progress reclaiming, then we are
1700 * running out of options and have to consider going OOM
1701 */
1702 if (!did_some_progress) {
1703 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1704 page = __alloc_pages_may_oom(gfp_mask, order,
1705 zonelist, high_zoneidx,
1706 nodemask);
1707 if (page)
1708 goto got_pg;
1590 1709
1591 if (order != 0) 1710 /*
1592 drain_all_pages(); 1711 * The OOM killer does not trigger for high-order allocations
1712 * but if no progress is being made, there are no other
1713 * options and retrying is unlikely to help
1714 */
1715 if (order > PAGE_ALLOC_COSTLY_ORDER)
1716 goto nopage;
1593 1717
1594 if (likely(did_some_progress)) {
1595 page = get_page_from_freelist(gfp_mask, nodemask, order,
1596 zonelist, high_zoneidx, alloc_flags);
1597 if (page)
1598 goto got_pg;
1599 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1600 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1601 schedule_timeout_uninterruptible(1);
1602 goto restart; 1718 goto restart;
1603 } 1719 }
1604
1605 /*
1606 * Go through the zonelist yet one more time, keep
1607 * very high watermark here, this is only to catch
1608 * a parallel oom killing, we must fail if we're still
1609 * under heavy pressure.
1610 */
1611 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1612 order, zonelist, high_zoneidx,
1613 ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1614 if (page) {
1615 clear_zonelist_oom(zonelist, gfp_mask);
1616 goto got_pg;
1617 }
1618
1619 /* The OOM killer will not help higher order allocs so fail */
1620 if (order > PAGE_ALLOC_COSTLY_ORDER) {
1621 clear_zonelist_oom(zonelist, gfp_mask);
1622 goto nopage;
1623 }
1624
1625 out_of_memory(zonelist, gfp_mask, order);
1626 clear_zonelist_oom(zonelist, gfp_mask);
1627 goto restart;
1628 } 1720 }
1629 1721
1630 /* 1722 /* Check if we should retry the allocation */
1631 * Don't let big-order allocations loop unless the caller explicitly
1632 * requests that. Wait for some write requests to complete then retry.
1633 *
1634 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1635 * means __GFP_NOFAIL, but that may not be true in other
1636 * implementations.
1637 *
1638 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1639 * specified, then we retry until we no longer reclaim any pages
1640 * (above), or we've reclaimed an order of pages at least as
1641 * large as the allocation's order. In both cases, if the
1642 * allocation still fails, we stop retrying.
1643 */
1644 pages_reclaimed += did_some_progress; 1723 pages_reclaimed += did_some_progress;
1645 do_retry = 0; 1724 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
1646 if (!(gfp_mask & __GFP_NORETRY)) { 1725 /* Wait for some write requests to complete then retry */
1647 if (order <= PAGE_ALLOC_COSTLY_ORDER) {
1648 do_retry = 1;
1649 } else {
1650 if (gfp_mask & __GFP_REPEAT &&
1651 pages_reclaimed < (1 << order))
1652 do_retry = 1;
1653 }
1654 if (gfp_mask & __GFP_NOFAIL)
1655 do_retry = 1;
1656 }
1657 if (do_retry) {
1658 congestion_wait(WRITE, HZ/50); 1726 congestion_wait(WRITE, HZ/50);
1659 goto rebalance; 1727 goto rebalance;
1660 } 1728 }
@@ -1669,6 +1737,41 @@ nopage:
1669 } 1737 }
1670got_pg: 1738got_pg:
1671 return page; 1739 return page;
1740
1741}
1742
1743/*
1744 * This is the 'heart' of the zoned buddy allocator.
1745 */
1746struct page *
1747__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1748 struct zonelist *zonelist, nodemask_t *nodemask)
1749{
1750 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1751 struct page *page;
1752
1753 lockdep_trace_alloc(gfp_mask);
1754
1755 might_sleep_if(gfp_mask & __GFP_WAIT);
1756
1757 if (should_fail_alloc_page(gfp_mask, order))
1758 return NULL;
1759
1760 /*
1761 * Check the zones suitable for the gfp_mask contain at least one
1762 * valid zone. It's possible to have an empty zonelist as a result
1763 * of GFP_THISNODE and a memoryless node
1764 */
1765 if (unlikely(!zonelist->_zonerefs->zone))
1766 return NULL;
1767
1768 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1769 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
1770 if (unlikely(!page))
1771 page = __alloc_pages_slowpath(gfp_mask, order,
1772 zonelist, high_zoneidx, nodemask);
1773
1774 return page;
1672} 1775}
1673EXPORT_SYMBOL(__alloc_pages_nodemask); 1776EXPORT_SYMBOL(__alloc_pages_nodemask);
1674 1777