path: root/mm/page_alloc.c
author     Mel Gorman <mel@csn.ul.ie>  2009-06-16 18:31:57 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2009-06-16 22:47:32 -0400
commit     11e33f6a55ed7847d9c8ffe185ef87faf7806abe (patch)
tree       ca70fe29e836c508cc279c619f7b856380a6f10f /mm/page_alloc.c
parent     7f82af9742a9346794ecc1515139daed480e7025 (diff)
page allocator: break up the allocator entry point into fast and slow paths
The core of the page allocator is one giant function which allocates memory on the stack and makes calculations that may not be needed for every allocation. This patch breaks up the allocator path into fast and slow paths for clarity. Note the slow paths are still inlined but the entry is marked unlikely. If they were not inlined, it actually increases text size to generate them, as there is only one call site.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Dave Hansen <dave@linux.vnet.ibm.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  353
1 file changed, 228 insertions(+), 125 deletions(-)
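To make the shape of the refactoring easier to see before reading the diff below, here is a minimal, stand-alone sketch of the fast-path/slow-path split this patch applies to __alloc_pages_nodemask(). It is user-space C for illustration only; fast_alloc(), slow_alloc() and the local likely()/unlikely() macros are stand-ins, not kernel APIs.

/*
 * Illustration only: a user-space sketch of the fast/slow split.
 * The entry point stays tiny; the rarely taken work sits behind a
 * single unlikely() call site, mirroring __alloc_pages_slowpath().
 */
#include <stdio.h>
#include <stdlib.h>

#define likely(x)   __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)

/* Fast path: the cheap, common-case attempt. */
static void *fast_alloc(size_t size)
{
        return malloc(size);    /* stands in for get_page_from_freelist() */
}

/*
 * Slow path: still inlined (there is only one call site), but reached
 * through an unlikely() branch so the fast path stays compact.
 */
static inline void *slow_alloc(size_t size)
{
        /* retry/reclaim logic would live here in a real allocator */
        return malloc(size);
}

/* Entry point: try the fast path first, fall back only when it fails. */
static void *alloc(size_t size)
{
        void *p = fast_alloc(size);

        if (unlikely(!p))
                p = slow_alloc(size);
        return p;
}

int main(void)
{
        char *buf = alloc(64);

        if (!buf)
                return 1;
        snprintf(buf, 64, "allocated via the fast path\n");
        fputs(buf, stdout);
        free(buf);
        return 0;
}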
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6be8fcb6f74f..512bf9a618c7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1457,47 +1457,171 @@ try_next_zone:
         return page;
 }
 
-/*
- * This is the 'heart' of the zoned buddy allocator.
- */
-struct page *
-__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
-                        struct zonelist *zonelist, nodemask_t *nodemask)
+static inline int
+should_alloc_retry(gfp_t gfp_mask, unsigned int order,
+                                unsigned long pages_reclaimed)
 {
-        const gfp_t wait = gfp_mask & __GFP_WAIT;
-        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
-        struct zoneref *z;
-        struct zone *zone;
-        struct page *page;
-        struct reclaim_state reclaim_state;
-        struct task_struct *p = current;
-        int do_retry;
-        int alloc_flags;
-        unsigned long did_some_progress;
-        unsigned long pages_reclaimed = 0;
+        /* Do not loop if specifically requested */
+        if (gfp_mask & __GFP_NORETRY)
+                return 0;
 
-        lockdep_trace_alloc(gfp_mask);
+        /*
+         * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
+         * means __GFP_NOFAIL, but that may not be true in other
+         * implementations.
+         */
+        if (order <= PAGE_ALLOC_COSTLY_ORDER)
+                return 1;
 
-        might_sleep_if(wait);
+        /*
+         * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
+         * specified, then we retry until we no longer reclaim any pages
+         * (above), or we've reclaimed an order of pages at least as
+         * large as the allocation's order. In both cases, if the
+         * allocation still fails, we stop retrying.
+         */
+        if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
+                return 1;
 
-        if (should_fail_alloc_page(gfp_mask, order))
-                return NULL;
+        /*
+         * Don't let big-order allocations loop unless the caller
+         * explicitly requests that.
+         */
+        if (gfp_mask & __GFP_NOFAIL)
+                return 1;
 
-        /* the list of zones suitable for gfp_mask */
-        z = zonelist->_zonerefs;
-        if (unlikely(!z->zone)) {
-                /*
-                 * Happens if we have an empty zonelist as a result of
-                 * GFP_THISNODE being used on a memoryless node
-                 */
+        return 0;
+}
+
+static inline struct page *
+__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
+        struct zonelist *zonelist, enum zone_type high_zoneidx,
+        nodemask_t *nodemask)
+{
+        struct page *page;
+
+        /* Acquire the OOM killer lock for the zones in zonelist */
+        if (!try_set_zone_oom(zonelist, gfp_mask)) {
+                schedule_timeout_uninterruptible(1);
                 return NULL;
         }
 
-restart:
-        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
-                        zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+        /*
+         * Go through the zonelist yet one more time, keep very high watermark
+         * here, this is only to catch a parallel oom killing, we must fail if
+         * we're still under heavy pressure.
+         */
+        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+                order, zonelist, high_zoneidx,
+                ALLOC_WMARK_HIGH|ALLOC_CPUSET);
         if (page)
-                goto got_pg;
+                goto out;
+
+        /* The OOM killer will not help higher order allocs */
+        if (order > PAGE_ALLOC_COSTLY_ORDER)
+                goto out;
+
+        /* Exhausted what can be done so it's blamo time */
+        out_of_memory(zonelist, gfp_mask, order);
+
+out:
+        clear_zonelist_oom(zonelist, gfp_mask);
+        return page;
+}
+
+/* The really slow allocator path where we enter direct reclaim */
+static inline struct page *
+__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
+        struct zonelist *zonelist, enum zone_type high_zoneidx,
+        nodemask_t *nodemask, int alloc_flags, unsigned long *did_some_progress)
+{
+        struct page *page = NULL;
+        struct reclaim_state reclaim_state;
+        struct task_struct *p = current;
+
+        cond_resched();
+
+        /* We now go into synchronous reclaim */
+        cpuset_memory_pressure_bump();
+
+        /*
+         * The task's cpuset might have expanded its set of allowable nodes
+         */
+        p->flags |= PF_MEMALLOC;
+        lockdep_set_current_reclaim_state(gfp_mask);
+        reclaim_state.reclaimed_slab = 0;
+        p->reclaim_state = &reclaim_state;
+
+        *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
+
+        p->reclaim_state = NULL;
+        lockdep_clear_current_reclaim_state();
+        p->flags &= ~PF_MEMALLOC;
+
+        cond_resched();
+
+        if (order != 0)
+                drain_all_pages();
+
+        if (likely(*did_some_progress))
+                page = get_page_from_freelist(gfp_mask, nodemask, order,
+                                        zonelist, high_zoneidx, alloc_flags);
+        return page;
+}
+
+static inline int
+is_allocation_high_priority(struct task_struct *p, gfp_t gfp_mask)
+{
+        if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
+                        && !in_interrupt())
+                return 1;
+        return 0;
+}
+
+/*
+ * This is called in the allocator slow-path if the allocation request is of
+ * sufficient urgency to ignore watermarks and take other desperate measures
+ */
+static inline struct page *
+__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
+        struct zonelist *zonelist, enum zone_type high_zoneidx,
+        nodemask_t *nodemask)
+{
+        struct page *page;
+
+        do {
+                page = get_page_from_freelist(gfp_mask, nodemask, order,
+                        zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
+
+                if (!page && gfp_mask & __GFP_NOFAIL)
+                        congestion_wait(WRITE, HZ/50);
+        } while (!page && (gfp_mask & __GFP_NOFAIL));
+
+        return page;
+}
+
+static inline
+void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
+                                                enum zone_type high_zoneidx)
+{
+        struct zoneref *z;
+        struct zone *zone;
+
+        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
+                wakeup_kswapd(zone, order);
+}
+
+static inline struct page *
+__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
+        struct zonelist *zonelist, enum zone_type high_zoneidx,
+        nodemask_t *nodemask)
+{
+        const gfp_t wait = gfp_mask & __GFP_WAIT;
+        struct page *page = NULL;
+        int alloc_flags;
+        unsigned long pages_reclaimed = 0;
+        unsigned long did_some_progress;
+        struct task_struct *p = current;
 
         /*
          * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1510,8 +1634,7 @@ restart:
         if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
                 goto nopage;
 
-        for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-                wakeup_kswapd(zone, order);
+        wake_all_kswapd(order, zonelist, high_zoneidx);
 
         /*
          * OK, we're below the kswapd watermark and have kicked background
@@ -1531,6 +1654,7 @@ restart:
         if (wait)
                 alloc_flags |= ALLOC_CPUSET;
 
+restart:
         /*
          * Go through the zonelist again. Let __GFP_HIGH and allocations
          * coming from realtime tasks go deeper into reserves.
@@ -1544,23 +1668,18 @@ restart:
         if (page)
                 goto got_pg;
 
-        /* This allocation should allow future memory freeing. */
-
 rebalance:
-        if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
-                        && !in_interrupt()) {
+        /* Allocate without watermarks if the context allows */
+        if (is_allocation_high_priority(p, gfp_mask)) {
+                /* Do not dip into emergency reserves if specified */
                 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
-nofail_alloc:
-                        /* go through the zonelist yet again, ignoring mins */
-                        page = get_page_from_freelist(gfp_mask, nodemask, order,
-                                zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
+                        page = __alloc_pages_high_priority(gfp_mask, order,
+                                        zonelist, high_zoneidx, nodemask);
                         if (page)
                                 goto got_pg;
-                        if (gfp_mask & __GFP_NOFAIL) {
-                                congestion_wait(WRITE, HZ/50);
-                                goto nofail_alloc;
-                        }
                 }
+
+                /* Ensure no recursion into the allocator */
                 goto nopage;
         }
 
@@ -1568,93 +1687,42 @@ nofail_alloc:
         if (!wait)
                 goto nopage;
 
-        cond_resched();
-
-        /* We now go into synchronous reclaim */
-        cpuset_memory_pressure_bump();
-
-        p->flags |= PF_MEMALLOC;
-
-        lockdep_set_current_reclaim_state(gfp_mask);
-        reclaim_state.reclaimed_slab = 0;
-        p->reclaim_state = &reclaim_state;
-
-        did_some_progress = try_to_free_pages(zonelist, order,
-                                                gfp_mask, nodemask);
-
-        p->reclaim_state = NULL;
-        lockdep_clear_current_reclaim_state();
-        p->flags &= ~PF_MEMALLOC;
+        /* Try direct reclaim and then allocating */
+        page = __alloc_pages_direct_reclaim(gfp_mask, order,
+                                        zonelist, high_zoneidx,
+                                        nodemask,
+                                        alloc_flags, &did_some_progress);
+        if (page)
+                goto got_pg;
 
-        cond_resched();
+        /*
+         * If we failed to make any progress reclaiming, then we are
+         * running out of options and have to consider going OOM
+         */
+        if (!did_some_progress) {
+                if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+                        page = __alloc_pages_may_oom(gfp_mask, order,
+                                        zonelist, high_zoneidx,
+                                        nodemask);
+                        if (page)
+                                goto got_pg;
 
-        if (order != 0)
-                drain_all_pages();
+                        /*
+                         * The OOM killer does not trigger for high-order allocations
+                         * but if no progress is being made, there are no other
+                         * options and retrying is unlikely to help
+                         */
+                        if (order > PAGE_ALLOC_COSTLY_ORDER)
+                                goto nopage;
 
-        if (likely(did_some_progress)) {
-                page = get_page_from_freelist(gfp_mask, nodemask, order,
-                                        zonelist, high_zoneidx, alloc_flags);
-                if (page)
-                        goto got_pg;
-        } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
-                if (!try_set_zone_oom(zonelist, gfp_mask)) {
-                        schedule_timeout_uninterruptible(1);
                         goto restart;
                 }
-
-                /*
-                 * Go through the zonelist yet one more time, keep
-                 * very high watermark here, this is only to catch
-                 * a parallel oom killing, we must fail if we're still
-                 * under heavy pressure.
-                 */
-                page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
-                        order, zonelist, high_zoneidx,
-                        ALLOC_WMARK_HIGH|ALLOC_CPUSET);
-                if (page) {
-                        clear_zonelist_oom(zonelist, gfp_mask);
-                        goto got_pg;
-                }
-
-                /* The OOM killer will not help higher order allocs so fail */
-                if (order > PAGE_ALLOC_COSTLY_ORDER) {
-                        clear_zonelist_oom(zonelist, gfp_mask);
-                        goto nopage;
-                }
-
-                out_of_memory(zonelist, gfp_mask, order);
-                clear_zonelist_oom(zonelist, gfp_mask);
-                goto restart;
         }
 
-        /*
-         * Don't let big-order allocations loop unless the caller explicitly
-         * requests that. Wait for some write requests to complete then retry.
-         *
-         * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
-         * means __GFP_NOFAIL, but that may not be true in other
-         * implementations.
-         *
-         * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
-         * specified, then we retry until we no longer reclaim any pages
-         * (above), or we've reclaimed an order of pages at least as
-         * large as the allocation's order. In both cases, if the
-         * allocation still fails, we stop retrying.
-         */
+        /* Check if we should retry the allocation */
         pages_reclaimed += did_some_progress;
-        do_retry = 0;
-        if (!(gfp_mask & __GFP_NORETRY)) {
-                if (order <= PAGE_ALLOC_COSTLY_ORDER) {
-                        do_retry = 1;
-                } else {
-                        if (gfp_mask & __GFP_REPEAT &&
-                                pages_reclaimed < (1 << order))
-                                        do_retry = 1;
-                }
-                if (gfp_mask & __GFP_NOFAIL)
-                        do_retry = 1;
-        }
-        if (do_retry) {
+        if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
+                /* Wait for some write requests to complete then retry */
                 congestion_wait(WRITE, HZ/50);
                 goto rebalance;
         }
@@ -1669,6 +1737,41 @@ nopage:
         }
 got_pg:
         return page;
+
+}
+
+/*
+ * This is the 'heart' of the zoned buddy allocator.
+ */
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+                        struct zonelist *zonelist, nodemask_t *nodemask)
+{
+        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+        struct page *page;
+
+        lockdep_trace_alloc(gfp_mask);
+
+        might_sleep_if(gfp_mask & __GFP_WAIT);
+
+        if (should_fail_alloc_page(gfp_mask, order))
+                return NULL;
+
+        /*
+         * Check the zones suitable for the gfp_mask contain at least one
+         * valid zone. It's possible to have an empty zonelist as a result
+         * of GFP_THISNODE and a memoryless node
+         */
+        if (unlikely(!zonelist->_zonerefs->zone))
+                return NULL;
+
+        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
+                        zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+        if (unlikely(!page))
+                page = __alloc_pages_slowpath(gfp_mask, order,
+                                zonelist, high_zoneidx, nodemask);
+
+        return page;
 }
 EXPORT_SYMBOL(__alloc_pages_nodemask);
 
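For context, kernel code does not usually call __alloc_pages_nodemask() directly; it reaches it through wrappers such as alloc_pages(), which funnel into the fast path added above. The sketch below is illustrative only; the function name example_use_page_allocator() is made up for this example, and error handling is kept minimal.

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>

static int example_use_page_allocator(void)
{
        struct page *page;

        /* An order-0 request should normally be satisfied by the fast path. */
        page = alloc_pages(GFP_KERNEL, 0);
        if (!page)
                return -ENOMEM;

        /* ... use the page ... */

        __free_pages(page, 0);
        return 0;
}

Only when get_page_from_freelist() fails under the low watermark does such a request fall through into __alloc_pages_slowpath(), where kswapd wakeup, watermark-free allocation, direct reclaim, and the OOM killer are considered in turn.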