-rw-r--r--  include/linux/compaction.h |  14
-rw-r--r--  include/linux/kernel.h     |   7
-rw-r--r--  mm/compaction.c            |  89
-rw-r--r--  mm/migrate.c               |  17
-rw-r--r--  mm/page_alloc.c            |  16
-rw-r--r--  mm/vmscan.c                | 102
6 files changed, 196 insertions, 49 deletions
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 5ac51552d908..2592883d862d 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,6 +22,9 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *mask);
+extern unsigned long compaction_suitable(struct zone *zone, int order);
+extern unsigned long compact_zone_order(struct zone *zone, int order,
+			gfp_t gfp_mask);
 
 /* Do not skip compaction more than 64 times */
 #define COMPACT_MAX_DEFER_SHIFT 6
@@ -59,6 +62,17 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
 	return COMPACT_CONTINUE;
 }
 
+static inline unsigned long compaction_suitable(struct zone *zone, int order)
+{
+	return COMPACT_SKIPPED;
+}
+
+static inline unsigned long compact_zone_order(struct zone *zone, int order,
+			gfp_t gfp_mask)
+{
+	return 0;
+}
+
 static inline void defer_compaction(struct zone *zone)
 {
 }
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 57dac7022b63..5a9d9059520b 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -600,6 +600,13 @@ struct sysinfo {
 #define NUMA_BUILD 0
 #endif
 
+/* This helps us avoid #ifdef CONFIG_COMPACTION */
+#ifdef CONFIG_COMPACTION
+#define COMPACTION_BUILD 1
+#else
+#define COMPACTION_BUILD 0
+#endif
+
 /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
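The new COMPACTION_BUILD constant follows the NUMA_BUILD convention directly above it: feature-dependent code can sit inside an ordinary C if statement that is still type-checked in every configuration, while the compiler discards the dead branch. A minimal standalone sketch of the same pattern follows; CONFIG_FEATURE, FEATURE_BUILD and pick_mode() are illustrative names, not kernel code.

/*
 * Standalone illustration of the 0/1 build-constant pattern: the feature
 * branch stays inside a normal C "if", so it is compiled and type-checked
 * in every configuration, and constant folding drops the dead branch when
 * the feature is disabled. All names here are illustrative.
 */
#include <stdio.h>

#ifdef CONFIG_FEATURE
#define FEATURE_BUILD 1
#else
#define FEATURE_BUILD 0
#endif

static const char *pick_mode(void)
{
	/* Both branches compile; only one survives optimization. */
	if (FEATURE_BUILD)
		return "feature path";
	return "fallback path";
}

int main(void)
{
	/* prints "fallback path" unless built with -DCONFIG_FEATURE */
	printf("%s\n", pick_mode());
	return 0;
}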
diff --git a/mm/compaction.c b/mm/compaction.c
index 20011a850fef..8fe917ec7c11 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -384,10 +384,62 @@ static int compact_finished(struct zone *zone,
 	return COMPACT_CONTINUE;
 }
 
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ * Returns
+ *   COMPACT_SKIPPED  - If there are too few free pages for compaction
+ *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
+ *   COMPACT_CONTINUE - If compaction should run now
+ */
+unsigned long compaction_suitable(struct zone *zone, int order)
+{
+	int fragindex;
+	unsigned long watermark;
+
+	/*
+	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
+	 * This is because during migration, copies of pages need to be
+	 * allocated and for a short time, the footprint is higher
+	 */
+	watermark = low_wmark_pages(zone) + (2UL << order);
+	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+		return COMPACT_SKIPPED;
+
+	/*
+	 * fragmentation index determines if allocation failures are due to
+	 * low memory or external fragmentation
+	 *
+	 * index of -1 implies allocations might succeed depending on watermarks
+	 * index towards 0 implies failure is due to lack of memory
+	 * index towards 1000 implies failure is due to fragmentation
+	 *
+	 * Only compact if a failure would be due to fragmentation.
+	 */
+	fragindex = fragmentation_index(zone, order);
+	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+		return COMPACT_SKIPPED;
+
+	if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+		return COMPACT_PARTIAL;
+
+	return COMPACT_CONTINUE;
+}
+
 static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
 	int ret;
 
+	ret = compaction_suitable(zone, cc->order);
+	switch (ret) {
+	case COMPACT_PARTIAL:
+	case COMPACT_SKIPPED:
+		/* Compaction is likely to fail */
+		return ret;
+	case COMPACT_CONTINUE:
+		/* Fall through to compaction */
+		;
+	}
+
 	/* Setup to move all movable pages to the end of the zone */
 	cc->migrate_pfn = zone->zone_start_pfn;
 	cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
@@ -429,7 +481,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	return ret;
 }
 
-static unsigned long compact_zone_order(struct zone *zone,
+unsigned long compact_zone_order(struct zone *zone,
 						int order, gfp_t gfp_mask)
 {
 	struct compact_control cc = {
@@ -462,7 +514,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
 	int may_perform_io = gfp_mask & __GFP_IO;
-	unsigned long watermark;
 	struct zoneref *z;
 	struct zone *zone;
 	int rc = COMPACT_SKIPPED;
@@ -480,43 +531,13 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 	/* Compact each zone in the list */
 	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
 								nodemask) {
-		int fragindex;
 		int status;
 
-		/*
-		 * Watermarks for order-0 must be met for compaction. Note
-		 * the 2UL. This is because during migration, copies of
-		 * pages need to be allocated and for a short time, the
-		 * footprint is higher
-		 */
-		watermark = low_wmark_pages(zone) + (2UL << order);
-		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
-			continue;
-
-		/*
-		 * fragmentation index determines if allocation failures are
-		 * due to low memory or external fragmentation
-		 *
-		 * index of -1 implies allocations might succeed depending
-		 * on watermarks
-		 * index towards 0 implies failure is due to lack of memory
-		 * index towards 1000 implies failure is due to fragmentation
-		 *
-		 * Only compact if a failure would be due to fragmentation.
-		 */
-		fragindex = fragmentation_index(zone, order);
-		if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
-			continue;
-
-		if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
-			rc = COMPACT_PARTIAL;
-			break;
-		}
-
 		status = compact_zone_order(zone, order, gfp_mask);
 		rc = max(status, rc);
 
-		if (zone_watermark_ok(zone, order, watermark, 0, 0))
+		/* If a normal allocation would succeed, stop compacting */
+		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
 			break;
 	}
 
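compaction_suitable() centralises a three-way decision that try_to_compact_pages() previously open-coded per zone. A rough userspace sketch of that decision is below, assuming the default extfrag threshold of 500; suitability() and its boolean parameters are illustrative stand-ins for the kernel's watermark checks, not the kernel API.

/*
 * Userspace sketch of the three-way result compaction_suitable() encodes.
 * The watermark checks are reduced to booleans and the extfrag threshold
 * is assumed to be the default of 500; none of this is kernel code.
 */
#include <stdbool.h>
#include <stdio.h>

enum { SKIPPED, PARTIAL, CONTINUE };

static const int extfrag_threshold = 500;

static int suitability(bool order0_watermark_ok, bool order_watermark_ok,
		       int fragindex)
{
	/* No spare order-0 pages to hold migration copies: cannot compact */
	if (!order0_watermark_ok)
		return SKIPPED;
	/* Failure would come from low memory, not fragmentation: reclaim instead */
	if (fragindex >= 0 && fragindex <= extfrag_threshold)
		return SKIPPED;
	/* Enough contiguous memory already: the allocation should just succeed */
	if (fragindex == -1 && order_watermark_ok)
		return PARTIAL;
	return CONTINUE;
}

int main(void)
{
	/* plenty of free pages but heavily fragmented: prints 2 (CONTINUE) */
	printf("%d\n", suitability(true, false, 800));
	return 0;
}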
diff --git a/mm/migrate.c b/mm/migrate.c
index 6ae8a66a7045..94875b265928 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -639,6 +639,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	if (!trylock_page(page)) {
 		if (!force)
 			goto move_newpage;
+
+		/*
+		 * It's not safe for direct compaction to call lock_page.
+		 * For example, during page readahead pages are added locked
+		 * to the LRU. Later, when the IO completes the pages are
+		 * marked uptodate and unlocked. However, the queueing
+		 * could be merging multiple pages for one bio (e.g.
+		 * mpage_readpages). If an allocation happens for the
+		 * second or third page, the process can end up locking
+		 * the same page twice and deadlocking. Rather than
+		 * trying to be clever about what pages can be locked,
+		 * avoid the use of lock_page for direct compaction
+		 * altogether.
+		 */
+		if (current->flags & PF_MEMALLOC)
+			goto move_newpage;
+
 		lock_page(page);
 	}
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 22a1bb7723e4..03a66a31bfcd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1815,12 +1815,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	int migratetype, unsigned long *did_some_progress)
 {
 	struct page *page;
+	struct task_struct *tsk = current;
 
 	if (!order || compaction_deferred(preferred_zone))
 		return NULL;
 
+	tsk->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
 								nodemask);
+	tsk->flags &= ~PF_MEMALLOC;
 	if (*did_some_progress != COMPACT_SKIPPED) {
 
 		/* Page migration frees to the PCP lists but we want merging */
@@ -2121,6 +2124,19 @@ rebalance:
 		/* Wait for some write requests to complete then retry */
 		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 		goto rebalance;
+	} else {
+		/*
+		 * High-order allocations do not necessarily loop after
+		 * direct reclaim and reclaim/compaction depends on compaction
+		 * being called after reclaim so call directly if necessary
+		 */
+		page = __alloc_pages_direct_compact(gfp_mask, order,
+					zonelist, high_zoneidx,
+					nodemask,
+					alloc_flags, preferred_zone,
+					migratetype, &did_some_progress);
+		if (page)
+			goto got_pg;
 	}
 
 nopage:
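The page_alloc.c hunks pair with the migrate.c change above: the task is marked with PF_MEMALLOC around try_to_compact_pages() so that, deeper in the call chain, migration can tell it is running from direct compaction and must not sleep in lock_page(). A standalone sketch of that mark-the-task idiom follows; TF_IN_COMPACTION, current_task and the helper functions are illustrative, not kernel identifiers.

/*
 * Userspace sketch of the "mark the task, check the mark deeper in the
 * call chain" idiom. The caller sets a flag on the (fake) current task
 * before entering the compaction path, and the callee uses it to decide
 * it must not block. All names are illustrative.
 */
#include <stdbool.h>
#include <stdio.h>

#define TF_IN_COMPACTION 0x1u

struct task { unsigned int flags; };
static struct task current_task;	/* stand-in for the kernel's "current" */

static bool try_lock_page(void) { return false; }	/* pretend the lock is contended */

static void migrate_one_page(void)
{
	if (!try_lock_page()) {
		/* In direct compaction we must not sleep waiting for the lock. */
		if (current_task.flags & TF_IN_COMPACTION) {
			puts("skipping locked page during compaction");
			return;
		}
		puts("would sleep waiting for the page lock");
	}
}

static void direct_compact(void)
{
	current_task.flags |= TF_IN_COMPACTION;
	migrate_one_page();
	current_task.flags &= ~TF_IN_COMPACTION;
}

int main(void)
{
	direct_compact();
	return 0;
}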
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3464312bde07..10ebd74a423c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@
 #include <linux/topology.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/compaction.h>
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
 #include <linux/delay.h>
@@ -59,12 +60,15 @@
  * LUMPY_MODE_CONTIGRECLAIM: For high-order allocations, take a reference
  *			page from the LRU and reclaim all pages within a
  *			naturally aligned range
+ * LUMPY_MODE_COMPACTION: For high-order allocations, reclaim a number of
+ *			order-0 pages and then compact the zone
  */
 typedef unsigned __bitwise__ lumpy_mode;
 #define LUMPY_MODE_SINGLE		((__force lumpy_mode)0x01u)
 #define LUMPY_MODE_ASYNC		((__force lumpy_mode)0x02u)
 #define LUMPY_MODE_SYNC			((__force lumpy_mode)0x04u)
 #define LUMPY_MODE_CONTIGRECLAIM	((__force lumpy_mode)0x08u)
+#define LUMPY_MODE_COMPACTION		((__force lumpy_mode)0x10u)
 
 struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
@@ -286,18 +290,20 @@ static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc,
 	lumpy_mode syncmode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;
 
 	/*
-	 * Some reclaim have alredy been failed. No worth to try synchronous
-	 * lumpy reclaim.
+	 * Initially assume we are entering either lumpy reclaim or
+	 * reclaim/compaction. Depending on the order, we will either set the
+	 * sync mode or just reclaim order-0 pages later.
 	 */
-	if (sync && sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE)
-		return;
+	if (COMPACTION_BUILD)
+		sc->lumpy_reclaim_mode = LUMPY_MODE_COMPACTION;
+	else
+		sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM;
 
 	/*
-	 * If we need a large contiguous chunk of memory, or have
-	 * trouble getting a small set of contiguous pages, we
-	 * will reclaim both active and inactive pages.
+	 * Avoid using lumpy reclaim or reclaim/compaction if possible by
+	 * restricting when its set to either costly allocations or when
+	 * under memory pressure
 	 */
-	sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM;
 	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
 		sc->lumpy_reclaim_mode |= syncmode;
 	else if (sc->order && priority < DEF_PRIORITY - 2)
@@ -1385,8 +1391,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 	if (scanning_global_lru(sc)) {
 		nr_taken = isolate_pages_global(nr_to_scan,
 			&page_list, &nr_scanned, sc->order,
-			sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ?
-					ISOLATE_INACTIVE : ISOLATE_BOTH,
+			sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ?
+					ISOLATE_BOTH : ISOLATE_INACTIVE,
 			zone, 0, file);
 		zone->pages_scanned += nr_scanned;
 		if (current_is_kswapd())
@@ -1398,8 +1404,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 	} else {
 		nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
 			&page_list, &nr_scanned, sc->order,
-			sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ?
-					ISOLATE_INACTIVE : ISOLATE_BOTH,
+			sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ?
+					ISOLATE_BOTH : ISOLATE_INACTIVE,
 			zone, sc->mem_cgroup,
 			0, file);
 		/*
@@ -1815,6 +1821,57 @@ out:
 }
 
 /*
+ * Reclaim/compaction depends on a number of pages being freed. To avoid
+ * disruption to the system, a small number of order-0 pages continue to be
+ * rotated and reclaimed in the normal fashion. However, by the time we get
+ * back to the allocator and call try_to_compact_zone(), we ensure that
+ * there are enough free pages for it to be likely successful
+ */
+static inline bool should_continue_reclaim(struct zone *zone,
+					unsigned long nr_reclaimed,
+					unsigned long nr_scanned,
+					struct scan_control *sc)
+{
+	unsigned long pages_for_compaction;
+	unsigned long inactive_lru_pages;
+
+	/* If not in reclaim/compaction mode, stop */
+	if (!(sc->lumpy_reclaim_mode & LUMPY_MODE_COMPACTION))
+		return false;
+
+	/*
+	 * If we failed to reclaim and have scanned the full list, stop.
+	 * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
+	 *       faster but obviously would be less likely to succeed
+	 *       allocation. If this is desirable, use GFP_REPEAT to decide
+	 *       if both reclaimed and scanned should be checked or just
+	 *       reclaimed
+	 */
+	if (!nr_reclaimed && !nr_scanned)
+		return false;
+
+	/*
+	 * If we have not reclaimed enough pages for compaction and the
+	 * inactive lists are large enough, continue reclaiming
+	 */
+	pages_for_compaction = (2UL << sc->order);
+	inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
+				zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+	if (sc->nr_reclaimed < pages_for_compaction &&
+			inactive_lru_pages > pages_for_compaction)
+		return true;
+
+	/* If compaction would go ahead or the allocation would succeed, stop */
+	switch (compaction_suitable(zone, sc->order)) {
+	case COMPACT_PARTIAL:
+	case COMPACT_CONTINUE:
+		return false;
+	default:
+		return true;
+	}
+}
+
+/*
  * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
  */
 static void shrink_zone(int priority, struct zone *zone,
@@ -1823,9 +1880,12 @@ static void shrink_zone(int priority, struct zone *zone,
 	unsigned long nr[NR_LRU_LISTS];
 	unsigned long nr_to_scan;
 	enum lru_list l;
-	unsigned long nr_reclaimed = sc->nr_reclaimed;
+	unsigned long nr_reclaimed;
 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+	unsigned long nr_scanned = sc->nr_scanned;
 
+restart:
+	nr_reclaimed = 0;
 	get_scan_count(zone, sc, nr, priority);
 
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1851,8 +1911,7 @@ static void shrink_zone(int priority, struct zone *zone,
 		if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
 			break;
 	}
-
-	sc->nr_reclaimed = nr_reclaimed;
+	sc->nr_reclaimed += nr_reclaimed;
 
 	/*
 	 * Even if we did not try to evict anon pages at all, we want to
@@ -1861,6 +1920,11 @@ static void shrink_zone(int priority, struct zone *zone,
 	if (inactive_anon_is_low(zone, sc))
 		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
 
+	/* reclaim/compaction might need reclaim to continue */
+	if (should_continue_reclaim(zone, nr_reclaimed,
+					sc->nr_scanned - nr_scanned, sc))
+		goto restart;
+
 	throttle_vm_writeout(sc->gfp_mask);
 }
 
@@ -2307,6 +2371,14 @@ loop_again:
 			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
 				sc.may_writepage = 1;
 
+			/*
+			 * Compact the zone for higher orders to reduce
+			 * latencies for higher-order allocations that
+			 * would ordinarily call try_to_compact_pages()
+			 */
+			if (sc.order > PAGE_ALLOC_COSTLY_ORDER)
+				compact_zone_order(zone, sc.order, sc.gfp_mask);
+
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), end_zone, 0)) {
 				all_zones_ok = 0;
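Taken together, the vmscan.c changes turn shrink_zone() into a loop: reclaim modest batches of order-0 pages, then ask should_continue_reclaim() whether compaction now has enough free pages to work with. A rough userspace sketch of that control flow follows, under the simplifying assumption of a fixed reclaim batch; reclaim_batch(), compaction_ready() and shrink_zone_sketch() are illustrative, not kernel functions.

/*
 * Userspace sketch of the restart loop added to shrink_zone(): keep
 * reclaiming small batches until nothing is freed or roughly 2 << order
 * pages have accumulated for compaction. Purely illustrative.
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned long reclaim_batch(void) { return 8; }	/* pretend 8 pages freed per pass */

static bool compaction_ready(unsigned long freed, int order)
{
	return freed >= (2UL << order);
}

static unsigned long shrink_zone_sketch(int order)
{
	unsigned long total = 0;

	for (;;) {
		unsigned long freed = reclaim_batch();

		total += freed;
		/* Stop if nothing was freed or compaction now has headroom. */
		if (!freed || compaction_ready(total, order))
			break;
	}
	return total;
}

int main(void)
{
	/* for order 3 this keeps going until 16 pages are freed */
	printf("reclaimed %lu pages\n", shrink_zone_sketch(3));
	return 0;
}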