author     Mel Gorman <mel@csn.ul.ie>                          2011-01-13 18:45:56 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>      2011-01-13 20:32:33 -0500
commit     3e7d344970673c5334cf7b5bb27c8c0942b06126 (patch)
tree       832ecb4da5fd27efa5a503df5b96bfdee2a52ffd
parent     ee64fc9354e515a79c7232cfde65c88ec627308b (diff)
mm: vmscan: reclaim order-0 and use compaction instead of lumpy reclaim
Lumpy reclaim is disruptive. It reclaims a large number of pages and ignores the age of the pages it reclaims. This can incur significant stalls and potentially increase the number of major faults.

Compaction has reached the point where it is considered reasonably stable (meaning it has passed a lot of testing) and is a potential candidate for displacing lumpy reclaim. This patch introduces an alternative to lumpy reclaim, called reclaim/compaction, used when compaction is available. The basic operation is very simple: instead of selecting a contiguous range of pages to reclaim, a number of order-0 pages are reclaimed and compaction is performed later, either by kswapd (compact_zone_order()) or by direct compaction (__alloc_pages_direct_compact()).

[akpm@linux-foundation.org: fix build]
[akpm@linux-foundation.org: use conventional task_struct naming]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Andy Whitcroft <apw@shadowen.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
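The control flow described above can be summarised outside the kernel as a small, self-contained C sketch: free order-0 pages a batch at a time (respecting LRU age) until there is enough headroom for compaction, instead of lumpy-reclaiming one contiguous range. This is illustrative only; struct toy_zone, reclaim_order0() and the other helpers below are hypothetical stand-ins, not kernel APIs — the real implementation is in the diff that follows.

/*
 * Toy model of reclaim/compaction. All types and helpers are stand-ins;
 * only the "reclaim order-0, then compact" shape mirrors the patch.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_zone {
	unsigned long free_pages;      /* order-0 pages currently free */
	unsigned long inactive_pages;  /* pages still reclaimable in LRU order */
};

/* Reclaim a batch of order-0 pages in LRU (age-respecting) order. */
static unsigned long reclaim_order0(struct toy_zone *z, unsigned long batch)
{
	unsigned long taken = batch < z->inactive_pages ? batch : z->inactive_pages;

	z->inactive_pages -= taken;
	z->free_pages += taken;
	return taken;
}

/* Compaction is worth attempting once enough order-0 pages are free. */
static bool compaction_has_headroom(const struct toy_zone *z, int order)
{
	/* mirrors the "2UL << order" headroom used by the patch's watermark check */
	return z->free_pages >= (2UL << order);
}

/* Reclaim/compaction: small order-0 batches, then hand over to compaction. */
static bool alloc_high_order(struct toy_zone *z, int order)
{
	while (!compaction_has_headroom(z, order)) {
		if (!reclaim_order0(z, 32))
			return false;   /* nothing left to reclaim */
	}
	return true;                    /* compaction would now be attempted */
}

int main(void)
{
	struct toy_zone zone = { .free_pages = 4, .inactive_pages = 1024 };

	printf("order-5 request %s\n",
	       alloc_high_order(&zone, 5) ? "can proceed to compaction"
					  : "fails: not enough reclaimable memory");
	return 0;
}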
-rw-r--r--  include/linux/compaction.h |  14
-rw-r--r--  include/linux/kernel.h     |   7
-rw-r--r--  mm/compaction.c            |  89
-rw-r--r--  mm/migrate.c               |  17
-rw-r--r--  mm/page_alloc.c            |  16
-rw-r--r--  mm/vmscan.c                | 102
6 files changed, 196 insertions(+), 49 deletions(-)
diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 5ac51552d908..2592883d862d 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -22,6 +22,9 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
 			int order, gfp_t gfp_mask, nodemask_t *mask);
+extern unsigned long compaction_suitable(struct zone *zone, int order);
+extern unsigned long compact_zone_order(struct zone *zone, int order,
+						gfp_t gfp_mask);
 
 /* Do not skip compaction more than 64 times */
 #define COMPACT_MAX_DEFER_SHIFT 6
@@ -59,6 +62,17 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
 	return COMPACT_CONTINUE;
 }
 
+static inline unsigned long compaction_suitable(struct zone *zone, int order)
+{
+	return COMPACT_SKIPPED;
+}
+
+static inline unsigned long compact_zone_order(struct zone *zone, int order,
+			gfp_t gfp_mask)
+{
+	return 0;
+}
+
 static inline void defer_compaction(struct zone *zone)
 {
 }
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 57dac7022b63..5a9d9059520b 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -600,6 +600,13 @@ struct sysinfo {
 #define NUMA_BUILD 0
 #endif
 
+/* This helps us avoid #ifdef CONFIG_COMPACTION */
+#ifdef CONFIG_COMPACTION
+#define COMPACTION_BUILD 1
+#else
+#define COMPACTION_BUILD 0
+#endif
+
 /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD
diff --git a/mm/compaction.c b/mm/compaction.c
index 20011a850fef..8fe917ec7c11 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -384,10 +384,62 @@ static int compact_finished(struct zone *zone,
 	return COMPACT_CONTINUE;
 }
 
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ * Returns
+ *   COMPACT_SKIPPED  - If there are too few free pages for compaction
+ *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
+ *   COMPACT_CONTINUE - If compaction should run now
+ */
+unsigned long compaction_suitable(struct zone *zone, int order)
+{
+	int fragindex;
+	unsigned long watermark;
+
+	/*
+	 * Watermarks for order-0 must be met for compaction. Note the 2UL.
+	 * This is because during migration, copies of pages need to be
+	 * allocated and for a short time, the footprint is higher
+	 */
+	watermark = low_wmark_pages(zone) + (2UL << order);
+	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+		return COMPACT_SKIPPED;
+
+	/*
+	 * fragmentation index determines if allocation failures are due to
+	 * low memory or external fragmentation
+	 *
+	 * index of -1 implies allocations might succeed depending on watermarks
+	 * index towards 0 implies failure is due to lack of memory
+	 * index towards 1000 implies failure is due to fragmentation
+	 *
+	 * Only compact if a failure would be due to fragmentation.
+	 */
+	fragindex = fragmentation_index(zone, order);
+	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
+		return COMPACT_SKIPPED;
+
+	if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0))
+		return COMPACT_PARTIAL;
+
+	return COMPACT_CONTINUE;
+}
+
 static int compact_zone(struct zone *zone, struct compact_control *cc)
 {
 	int ret;
 
+	ret = compaction_suitable(zone, cc->order);
+	switch (ret) {
+	case COMPACT_PARTIAL:
+	case COMPACT_SKIPPED:
+		/* Compaction is likely to fail */
+		return ret;
+	case COMPACT_CONTINUE:
+		/* Fall through to compaction */
+		;
+	}
+
 	/* Setup to move all movable pages to the end of the zone */
 	cc->migrate_pfn = zone->zone_start_pfn;
 	cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
@@ -429,7 +481,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	return ret;
 }
 
-static unsigned long compact_zone_order(struct zone *zone,
+unsigned long compact_zone_order(struct zone *zone,
 						int order, gfp_t gfp_mask)
 {
 	struct compact_control cc = {
@@ -462,7 +514,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	int may_enter_fs = gfp_mask & __GFP_FS;
 	int may_perform_io = gfp_mask & __GFP_IO;
-	unsigned long watermark;
 	struct zoneref *z;
 	struct zone *zone;
 	int rc = COMPACT_SKIPPED;
@@ -480,43 +531,13 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
 	/* Compact each zone in the list */
 	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
 								nodemask) {
-		int fragindex;
 		int status;
 
-		/*
-		 * Watermarks for order-0 must be met for compaction. Note
-		 * the 2UL. This is because during migration, copies of
-		 * pages need to be allocated and for a short time, the
-		 * footprint is higher
-		 */
-		watermark = low_wmark_pages(zone) + (2UL << order);
-		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
-			continue;
-
-		/*
-		 * fragmentation index determines if allocation failures are
-		 * due to low memory or external fragmentation
-		 *
-		 * index of -1 implies allocations might succeed depending
-		 * on watermarks
-		 * index towards 0 implies failure is due to lack of memory
-		 * index towards 1000 implies failure is due to fragmentation
-		 *
-		 * Only compact if a failure would be due to fragmentation.
-		 */
-		fragindex = fragmentation_index(zone, order);
-		if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
-			continue;
-
-		if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
-			rc = COMPACT_PARTIAL;
-			break;
-		}
-
 		status = compact_zone_order(zone, order, gfp_mask);
 		rc = max(status, rc);
 
-		if (zone_watermark_ok(zone, order, watermark, 0, 0))
+		/* If a normal allocation would succeed, stop compacting */
+		if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
 			break;
 	}
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 6ae8a66a7045..94875b265928 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -639,6 +639,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 	if (!trylock_page(page)) {
 		if (!force)
 			goto move_newpage;
+
+		/*
+		 * It's not safe for direct compaction to call lock_page.
+		 * For example, during page readahead pages are added locked
+		 * to the LRU. Later, when the IO completes the pages are
+		 * marked uptodate and unlocked. However, the queueing
+		 * could be merging multiple pages for one bio (e.g.
+		 * mpage_readpages). If an allocation happens for the
+		 * second or third page, the process can end up locking
+		 * the same page twice and deadlocking. Rather than
+		 * trying to be clever about what pages can be locked,
+		 * avoid the use of lock_page for direct compaction
+		 * altogether.
+		 */
+		if (current->flags & PF_MEMALLOC)
+			goto move_newpage;
+
 		lock_page(page);
 	}
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 22a1bb7723e4..03a66a31bfcd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1815,12 +1815,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	int migratetype, unsigned long *did_some_progress)
 {
 	struct page *page;
+	struct task_struct *tsk = current;
 
 	if (!order || compaction_deferred(preferred_zone))
 		return NULL;
 
+	tsk->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
 								nodemask);
+	tsk->flags &= ~PF_MEMALLOC;
 	if (*did_some_progress != COMPACT_SKIPPED) {
 
 		/* Page migration frees to the PCP lists but we want merging */
@@ -2121,6 +2124,19 @@ rebalance:
 		/* Wait for some write requests to complete then retry */
 		wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
 		goto rebalance;
+	} else {
+		/*
+		 * High-order allocations do not necessarily loop after
+		 * direct reclaim and reclaim/compaction depends on compaction
+		 * being called after reclaim so call directly if necessary
+		 */
+		page = __alloc_pages_direct_compact(gfp_mask, order,
+					zonelist, high_zoneidx,
+					nodemask,
+					alloc_flags, preferred_zone,
+					migratetype, &did_some_progress);
+		if (page)
+			goto got_pg;
 	}
 
 nopage:
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3464312bde07..10ebd74a423c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -32,6 +32,7 @@
 #include <linux/topology.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/compaction.h>
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
 #include <linux/delay.h>
@@ -59,12 +60,15 @@
  * LUMPY_MODE_CONTIGRECLAIM: For high-order allocations, take a reference
  *			page from the LRU and reclaim all pages within a
  *			naturally aligned range
+ * LUMPY_MODE_COMPACTION: For high-order allocations, reclaim a number of
+ *			order-0 pages and then compact the zone
  */
 typedef unsigned __bitwise__ lumpy_mode;
 #define LUMPY_MODE_SINGLE		((__force lumpy_mode)0x01u)
 #define LUMPY_MODE_ASYNC		((__force lumpy_mode)0x02u)
 #define LUMPY_MODE_SYNC			((__force lumpy_mode)0x04u)
 #define LUMPY_MODE_CONTIGRECLAIM	((__force lumpy_mode)0x08u)
+#define LUMPY_MODE_COMPACTION		((__force lumpy_mode)0x10u)
 
 struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
@@ -286,18 +290,20 @@ static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc,
 	lumpy_mode syncmode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC;
 
 	/*
-	 * Some reclaim have alredy been failed. No worth to try synchronous
-	 * lumpy reclaim.
+	 * Initially assume we are entering either lumpy reclaim or
+	 * reclaim/compaction. Depending on the order, we will either set the
+	 * sync mode or just reclaim order-0 pages later.
 	 */
-	if (sync && sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE)
-		return;
+	if (COMPACTION_BUILD)
+		sc->lumpy_reclaim_mode = LUMPY_MODE_COMPACTION;
+	else
+		sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM;
 
 	/*
-	 * If we need a large contiguous chunk of memory, or have
-	 * trouble getting a small set of contiguous pages, we
-	 * will reclaim both active and inactive pages.
+	 * Avoid using lumpy reclaim or reclaim/compaction if possible by
+	 * restricting when its set to either costly allocations or when
+	 * under memory pressure
 	 */
-	sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM;
 	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
 		sc->lumpy_reclaim_mode |= syncmode;
 	else if (sc->order && priority < DEF_PRIORITY - 2)
@@ -1385,8 +1391,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 	if (scanning_global_lru(sc)) {
 		nr_taken = isolate_pages_global(nr_to_scan,
 			&page_list, &nr_scanned, sc->order,
-			sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ?
-					ISOLATE_INACTIVE : ISOLATE_BOTH,
+			sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ?
+					ISOLATE_BOTH : ISOLATE_INACTIVE,
 			zone, 0, file);
 		zone->pages_scanned += nr_scanned;
 		if (current_is_kswapd())
@@ -1398,8 +1404,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 	} else {
 		nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
 			&page_list, &nr_scanned, sc->order,
-			sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ?
-					ISOLATE_INACTIVE : ISOLATE_BOTH,
+			sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ?
+					ISOLATE_BOTH : ISOLATE_INACTIVE,
 			zone, sc->mem_cgroup,
 			0, file);
 		/*
@@ -1815,6 +1821,57 @@ out:
 }
 
 /*
+ * Reclaim/compaction depends on a number of pages being freed. To avoid
+ * disruption to the system, a small number of order-0 pages continue to be
+ * rotated and reclaimed in the normal fashion. However, by the time we get
+ * back to the allocator and call try_to_compact_zone(), we ensure that
+ * there are enough free pages for it to be likely successful
+ */
+static inline bool should_continue_reclaim(struct zone *zone,
+					unsigned long nr_reclaimed,
+					unsigned long nr_scanned,
+					struct scan_control *sc)
+{
+	unsigned long pages_for_compaction;
+	unsigned long inactive_lru_pages;
+
+	/* If not in reclaim/compaction mode, stop */
+	if (!(sc->lumpy_reclaim_mode & LUMPY_MODE_COMPACTION))
+		return false;
+
+	/*
+	 * If we failed to reclaim and have scanned the full list, stop.
+	 * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far
+	 *       faster but obviously would be less likely to succeed
+	 *       allocation. If this is desirable, use GFP_REPEAT to decide
+	 *       if both reclaimed and scanned should be checked or just
+	 *       reclaimed
+	 */
+	if (!nr_reclaimed && !nr_scanned)
+		return false;
+
+	/*
+	 * If we have not reclaimed enough pages for compaction and the
+	 * inactive lists are large enough, continue reclaiming
+	 */
+	pages_for_compaction = (2UL << sc->order);
+	inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) +
+				zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+	if (sc->nr_reclaimed < pages_for_compaction &&
+			inactive_lru_pages > pages_for_compaction)
+		return true;
+
+	/* If compaction would go ahead or the allocation would succeed, stop */
+	switch (compaction_suitable(zone, sc->order)) {
+	case COMPACT_PARTIAL:
+	case COMPACT_CONTINUE:
+		return false;
+	default:
+		return true;
+	}
+}
+
+/*
  * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
  */
 static void shrink_zone(int priority, struct zone *zone,
@@ -1823,9 +1880,12 @@ static void shrink_zone(int priority, struct zone *zone,
 	unsigned long nr[NR_LRU_LISTS];
 	unsigned long nr_to_scan;
 	enum lru_list l;
-	unsigned long nr_reclaimed = sc->nr_reclaimed;
+	unsigned long nr_reclaimed;
 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+	unsigned long nr_scanned = sc->nr_scanned;
 
+restart:
+	nr_reclaimed = 0;
 	get_scan_count(zone, sc, nr, priority);
 
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -1851,8 +1911,7 @@ static void shrink_zone(int priority, struct zone *zone,
 		if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
 			break;
 	}
-
-	sc->nr_reclaimed = nr_reclaimed;
+	sc->nr_reclaimed += nr_reclaimed;
 
 	/*
 	 * Even if we did not try to evict anon pages at all, we want to
@@ -1861,6 +1920,11 @@ static void shrink_zone(int priority, struct zone *zone,
 	if (inactive_anon_is_low(zone, sc))
 		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
 
+	/* reclaim/compaction might need reclaim to continue */
+	if (should_continue_reclaim(zone, nr_reclaimed,
+					sc->nr_scanned - nr_scanned, sc))
+		goto restart;
+
 	throttle_vm_writeout(sc->gfp_mask);
 }
 
@@ -2307,6 +2371,14 @@ loop_again:
 			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
 				sc.may_writepage = 1;
 
+			/*
+			 * Compact the zone for higher orders to reduce
+			 * latencies for higher-order allocations that
+			 * would ordinarily call try_to_compact_pages()
+			 */
+			if (sc.order > PAGE_ALLOC_COSTLY_ORDER)
+				compact_zone_order(zone, sc.order, sc.gfp_mask);
+
 			if (!zone_watermark_ok_safe(zone, order,
 					high_wmark_pages(zone), end_zone, 0)) {
 				all_zones_ok = 0;