Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	353
1 files changed, 228 insertions, 125 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6be8fcb6f74f..512bf9a618c7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1457,47 +1457,171 @@ try_next_zone:
 	return page;
 }
 
-/*
- * This is the 'heart' of the zoned buddy allocator.
- */
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
-			struct zonelist *zonelist, nodemask_t *nodemask)
+static inline int
+should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+				unsigned long pages_reclaimed)
 {
-	const gfp_t wait = gfp_mask & __GFP_WAIT;
-	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
-	struct zoneref *z;
-	struct zone *zone;
-	struct page *page;
-	struct reclaim_state reclaim_state;
-	struct task_struct *p = current;
-	int do_retry;
-	int alloc_flags;
-	unsigned long did_some_progress;
-	unsigned long pages_reclaimed = 0;
+	/* Do not loop if specifically requested */
+	if (gfp_mask & __GFP_NORETRY)
+		return 0;
 
-	lockdep_trace_alloc(gfp_mask);
+	/*
+	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
+	 * means __GFP_NOFAIL, but that may not be true in other
+	 * implementations.
+	 */
+	if (order <= PAGE_ALLOC_COSTLY_ORDER)
+		return 1;
 
-	might_sleep_if(wait);
+	/*
+	 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
+	 * specified, then we retry until we no longer reclaim any pages
+	 * (above), or we've reclaimed an order of pages at least as
+	 * large as the allocation's order. In both cases, if the
+	 * allocation still fails, we stop retrying.
+	 */
+	if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
+		return 1;
 
-	if (should_fail_alloc_page(gfp_mask, order))
-		return NULL;
+	/*
+	 * Don't let big-order allocations loop unless the caller
+	 * explicitly requests that.
+	 */
+	if (gfp_mask & __GFP_NOFAIL)
+		return 1;
 
-	/* the list of zones suitable for gfp_mask */
-	z = zonelist->_zonerefs;
-	if (unlikely(!z->zone)) {
-		/*
-		 * Happens if we have an empty zonelist as a result of
-		 * GFP_THISNODE being used on a memoryless node
-		 */
+	return 0;
+}
+
+static inline struct page *
+__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask)
+{
+	struct page *page;
+
+	/* Acquire the OOM killer lock for the zones in zonelist */
+	if (!try_set_zone_oom(zonelist, gfp_mask)) {
+		schedule_timeout_uninterruptible(1);
 		return NULL;
 	}
 
-restart:
-	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
-			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+	/*
+	 * Go through the zonelist yet one more time, keep very high watermark
+	 * here, this is only to catch a parallel oom killing, we must fail if
+	 * we're still under heavy pressure.
+	 */
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+		order, zonelist, high_zoneidx,
+		ALLOC_WMARK_HIGH|ALLOC_CPUSET);
 	if (page)
-		goto got_pg;
+		goto out;
+
+	/* The OOM killer will not help higher order allocs */
+	if (order > PAGE_ALLOC_COSTLY_ORDER)
+		goto out;
+
+	/* Exhausted what can be done so it's blamo time */
+	out_of_memory(zonelist, gfp_mask, order);
+
+out:
+	clear_zonelist_oom(zonelist, gfp_mask);
+	return page;
+}
+
+/* The really slow allocator path where we enter direct reclaim */
+static inline struct page *
+__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask, int alloc_flags, unsigned long *did_some_progress)
+{
+	struct page *page = NULL;
+	struct reclaim_state reclaim_state;
+	struct task_struct *p = current;
+
+	cond_resched();
+
+	/* We now go into synchronous reclaim */
+	cpuset_memory_pressure_bump();
+
+	/*
+	 * The task's cpuset might have expanded its set of allowable nodes
+	 */
+	p->flags |= PF_MEMALLOC;
+	lockdep_set_current_reclaim_state(gfp_mask);
+	reclaim_state.reclaimed_slab = 0;
+	p->reclaim_state = &reclaim_state;
+
+	*did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
+
+	p->reclaim_state = NULL;
+	lockdep_clear_current_reclaim_state();
+	p->flags &= ~PF_MEMALLOC;
+
+	cond_resched();
+
+	if (order != 0)
+		drain_all_pages();
+
+	if (likely(*did_some_progress))
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
+					zonelist, high_zoneidx, alloc_flags);
+	return page;
+}
+
+static inline int
+is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask)
+{
+	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
+			&& !in_interrupt())
+		return 1;
+	return 0;
+}
+
+/*
+ * This is called in the allocator slow-path if the allocation request is of
+ * sufficient urgency to ignore watermarks and take other desperate measures
+ */
+static inline struct page *
+__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask)
+{
+	struct page *page;
+
+	do {
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
+			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
+
+		if (!page && gfp_mask & __GFP_NOFAIL)
+			congestion_wait(WRITE, HZ/50);
+	} while (!page && (gfp_mask & __GFP_NOFAIL));
+
+	return page;
+}
+
+static inline
+void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
+						enum zone_type high_zoneidx)
+{
+	struct zoneref *z;
+	struct zone *zone;
+
+	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+		wakeup_kswapd(zone, order);
+}
+
+static inline struct page *
+__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
+	struct zonelist *zonelist, enum zone_type high_zoneidx,
+	nodemask_t *nodemask)
+{
+	const gfp_t wait = gfp_mask & __GFP_WAIT;
+	struct page *page = NULL;
+	int alloc_flags;
+	unsigned long pages_reclaimed = 0;
+	unsigned long did_some_progress;
+	struct task_struct *p = current;
 
 	/*
 	 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1510,8 +1634,7 @@ restart:
 	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
 		goto nopage;
 
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-		wakeup_kswapd(zone, order);
+	wake_all_kswapd(order, zonelist, high_zoneidx);
 
 	/*
 	 * OK, we're below the kswapd watermark and have kicked background
@@ -1531,6 +1654,7 @@ restart:
 	if (wait)
 		alloc_flags |= ALLOC_CPUSET;
 
+restart:
 	/*
 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
 	 * coming from realtime tasks go deeper into reserves.
@@ -1544,23 +1668,18 @@ restart:
 	if (page)
 		goto got_pg;
 
-	/* This allocation should allow future memory freeing. */
-
 rebalance:
-	if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
-			&& !in_interrupt()) {
+	/* Allocate without watermarks if the context allows */
+	if (is_allocation_high_priority(p, gfp_mask)) {
+		/* Do not dip into emergency reserves if specified */
 		if (!(gfp_mask & __GFP_NOMEMALLOC)) {
-nofail_alloc:
-			/* go through the zonelist yet again, ignoring mins */
-			page = get_page_from_freelist(gfp_mask, nodemask, order,
-				zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
+			page = __alloc_pages_high_priority(gfp_mask, order,
+					zonelist, high_zoneidx, nodemask);
 			if (page)
 				goto got_pg;
-			if (gfp_mask & __GFP_NOFAIL) {
-				congestion_wait(WRITE, HZ/50);
-				goto nofail_alloc;
-			}
 		}
+
+		/* Ensure no recursion into the allocator */
 		goto nopage;
 	}
 
@@ -1568,93 +1687,42 @@ nofail_alloc:
 	if (!wait)
 		goto nopage;
 
-	cond_resched();
-
-	/* We now go into synchronous reclaim */
-	cpuset_memory_pressure_bump();
-
-	p->flags |= PF_MEMALLOC;
-
-	lockdep_set_current_reclaim_state(gfp_mask);
-	reclaim_state.reclaimed_slab = 0;
-	p->reclaim_state = &reclaim_state;
-
-	did_some_progress = try_to_free_pages(zonelist, order,
-						gfp_mask, nodemask);
-
-	p->reclaim_state = NULL;
-	lockdep_clear_current_reclaim_state();
-	p->flags &= ~PF_MEMALLOC;
+	/* Try direct reclaim and then allocating */
+	page = __alloc_pages_direct_reclaim(gfp_mask, order,
+					zonelist, high_zoneidx,
+					nodemask,
+					alloc_flags, &did_some_progress);
+	if (page)
+		goto got_pg;
 
-	cond_resched();
+	/*
+	 * If we failed to make any progress reclaiming, then we are
+	 * running out of options and have to consider going OOM
+	 */
+	if (!did_some_progress) {
+		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+			page = __alloc_pages_may_oom(gfp_mask, order,
+					zonelist, high_zoneidx,
+					nodemask);
+			if (page)
+				goto got_pg;
 
-	if (order != 0)
-		drain_all_pages();
+			/*
+			 * The OOM killer does not trigger for high-order allocations
+			 * but if no progress is being made, there are no other
+			 * options and retrying is unlikely to help
+			 */
+			if (order > PAGE_ALLOC_COSTLY_ORDER)
+				goto nopage;
 
-	if (likely(did_some_progress)) {
-		page = get_page_from_freelist(gfp_mask, nodemask, order,
-					zonelist, high_zoneidx, alloc_flags);
-		if (page)
-			goto got_pg;
-	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
-		if (!try_set_zone_oom(zonelist, gfp_mask)) {
-			schedule_timeout_uninterruptible(1);
 			goto restart;
 		}
-
-		/*
-		 * Go through the zonelist yet one more time, keep
-		 * very high watermark here, this is only to catch
-		 * a parallel oom killing, we must fail if we're still
-		 * under heavy pressure.
-		 */
-		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
-			order, zonelist, high_zoneidx,
-			ALLOC_WMARK_HIGH|ALLOC_CPUSET);
-		if (page) {
-			clear_zonelist_oom(zonelist, gfp_mask);
-			goto got_pg;
-		}
-
-		/* The OOM killer will not help higher order allocs so fail */
-		if (order > PAGE_ALLOC_COSTLY_ORDER) {
-			clear_zonelist_oom(zonelist, gfp_mask);
-			goto nopage;
-		}
-
-		out_of_memory(zonelist, gfp_mask, order);
-		clear_zonelist_oom(zonelist, gfp_mask);
-		goto restart;
 	}
 
-	/*
-	 * Don't let big-order allocations loop unless the caller explicitly
-	 * requests that. Wait for some write requests to complete then retry.
-	 *
-	 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
-	 * means __GFP_NOFAIL, but that may not be true in other
-	 * implementations.
-	 *
-	 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
-	 * specified, then we retry until we no longer reclaim any pages
-	 * (above), or we've reclaimed an order of pages at least as
-	 * large as the allocation's order. In both cases, if the
-	 * allocation still fails, we stop retrying.
-	 */
+	/* Check if we should retry the allocation */
 	pages_reclaimed += did_some_progress;
-	do_retry = 0;
-	if (!(gfp_mask & __GFP_NORETRY)) {
-		if (order <= PAGE_ALLOC_COSTLY_ORDER) {
-			do_retry = 1;
-		} else {
-			if (gfp_mask & __GFP_REPEAT &&
-					pages_reclaimed < (1 << order))
-				do_retry = 1;
-		}
-		if (gfp_mask & __GFP_NOFAIL)
-			do_retry = 1;
-	}
-	if (do_retry) {
+	if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+		/* Wait for some write requests to complete then retry */
 		congestion_wait(WRITE, HZ/50);
 		goto rebalance;
 	}
@@ -1669,6 +1737,41 @@ nopage:
 	}
 got_pg:
 	return page;
+
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ */
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+			struct zonelist *zonelist, nodemask_t *nodemask)
+{
+	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+	struct page *page;
+
+	lockdep_trace_alloc(gfp_mask);
+
+	might_sleep_if(gfp_mask & __GFP_WAIT);
+
+	if (should_fail_alloc_page(gfp_mask, order))
+		return NULL;
+
+	/*
+	 * Check the zones suitable for the gfp_mask contain at least one
+	 * valid zone. It's possible to have an empty zonelist as a result
+	 * of GFP_THISNODE and a memoryless node
+	 */
+	if (unlikely(!zonelist->_zonerefs->zone))
+		return NULL;
+
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+	if (unlikely(!page))
+		page = __alloc_pages_slowpath(gfp_mask, order,
+				zonelist, high_zoneidx, nodemask);
+
+	return page;
 }
 EXPORT_SYMBOL(__alloc_pages_nodemask);
 
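The diff above funnels every retry decision in the slow path through the new should_alloc_retry() helper. As a rough, standalone illustration of the policy that helper encodes (and only of that policy), the following userspace C sketch models it with mock flag values; the MOCK_* constants are stand-ins invented for the example, not the kernel's gfp.h definitions, and the code is not part of the patch.

/*
 * Simplified userspace model of the retry policy in should_alloc_retry().
 * MOCK_* values are illustrative stand-ins, not real kernel flags.
 */
#include <stdio.h>

#define MOCK_GFP_NORETRY		0x1u
#define MOCK_GFP_REPEAT			0x2u
#define MOCK_GFP_NOFAIL			0x4u
#define MOCK_PAGE_ALLOC_COSTLY_ORDER	3u

static int mock_should_alloc_retry(unsigned int gfp_mask, unsigned int order,
				   unsigned long pages_reclaimed)
{
	if (gfp_mask & MOCK_GFP_NORETRY)
		return 0;	/* caller asked for a single attempt */
	if (order <= MOCK_PAGE_ALLOC_COSTLY_ORDER)
		return 1;	/* low orders keep retrying */
	if ((gfp_mask & MOCK_GFP_REPEAT) && pages_reclaimed < (1UL << order))
		return 1;	/* retry until 1 << order pages have been reclaimed */
	if (gfp_mask & MOCK_GFP_NOFAIL)
		return 1;	/* caller cannot tolerate failure */
	return 0;
}

int main(void)
{
	/* order-5 REPEAT request: retries at 10 pages reclaimed, stops at 40 */
	printf("%d\n", mock_should_alloc_retry(MOCK_GFP_REPEAT, 5, 10));
	printf("%d\n", mock_should_alloc_retry(MOCK_GFP_REPEAT, 5, 40));
	return 0;
}

In the patched __alloc_pages_slowpath(), pages_reclaimed is the running total accumulated from did_some_progress after each direct-reclaim pass, which is what drives the __GFP_REPEAT cutoff shown here.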