author		Vlastimil Babka <vbabka@suse.cz>	2014-10-09 18:27:23 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-10-09 22:25:54 -0400
commit		99c0fd5e51c447917264154cb01a967804ace745 (patch)
tree		b733abc6c90b4689a68e189095bb6217d0ff8933
parent		e14c720efdd73c6d69cd8d07fa894bcd11fe1973 (diff)
mm, compaction: skip buddy pages by their order in the migrate scanner
The migration scanner skips PageBuddy pages, but does not consider their
order, as checking page_order() is generally unsafe without holding the
zone->lock, and acquiring the lock just for the check wouldn't be a good
tradeoff.

Still, this could avoid some iterations over the rest of the buddy page, and
if we are careful, the race window between the PageBuddy() check and the
page_order() read is small, and the worst that can happen is that we skip
too much and miss some isolation candidates.  This is not that bad, as
compaction can already fail for many other reasons like parallel
allocations, and those have a much larger race window.

This patch therefore makes the migration scanner obtain the buddy page
order and use it to skip the whole buddy page, if the order appears to be
in the valid range.

It's important that page_order() is read only once, so that the value used
in the checks and in the pfn calculation is the same.  But in theory the
compiler can replace the local variable with multiple inlined calls to
page_order().  Therefore, the patch introduces page_order_unsafe(), which
uses ACCESS_ONCE to prevent this.

Testing with stress-highalloc from mmtests shows a 15% reduction in the
number of pages scanned by the migration scanner.  The reduction is >60%
with __GFP_NO_KSWAPD allocations, along with success rates better by a few
percent.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Rik van Riel <riel@redhat.com>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	mm/compaction.c	36
-rw-r--r--	mm/internal.h	16
2 files changed, 46 insertions(+), 6 deletions(-)
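
The hunks below boil down to one pattern: read the buddy page's order exactly once without zone->lock, sanity-check the value, and use it to skip the rest of the free buddy block. The following is a minimal standalone sketch of that pattern in plain C; it is not kernel code, and struct fake_page, fake_page_order_unsafe(), scan_range() and SKETCH_MAX_ORDER are illustrative stand-ins for struct page, page_order_unsafe(), isolate_migratepages_block() and MAX_ORDER.

/* Hypothetical build: cc -Wall -o skip_sketch skip_sketch.c */
#include <stdbool.h>
#include <stdio.h>

#define SKETCH_MAX_ORDER 11	/* stand-in for the kernel's MAX_ORDER */

/* Stand-in for struct page with only the fields the sketch needs. */
struct fake_page {
	bool buddy;		/* plays the role of PageBuddy() */
	unsigned long order;	/* plays the role of page_private() */
};

/*
 * Single volatile read, in the spirit of page_order_unsafe(): the value that
 * passes the range check below is guaranteed to be the value used in the
 * pfn arithmetic.
 */
#define fake_page_order_unsafe(p)	(*(volatile unsigned long *)&(p)->order)

/* Hypothetical scan loop mirroring the shape of isolate_migratepages_block(). */
static unsigned long scan_range(struct fake_page *pages,
				unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long low_pfn;

	for (low_pfn = start_pfn; low_pfn < end_pfn; low_pfn++) {
		struct fake_page *page = &pages[low_pfn];

		if (page->buddy) {
			unsigned long freepage_order = fake_page_order_unsafe(page);

			/* Only trust orders in the valid range. */
			if (freepage_order > 0 && freepage_order < SKETCH_MAX_ORDER)
				low_pfn += (1UL << freepage_order) - 1; /* -1: loop adds 1 */
			continue;
		}

		printf("pfn %lu is a migration candidate\n", low_pfn);
	}

	/* The skip may overshoot the range; clamp, as the patch does. */
	if (low_pfn > end_pfn)
		low_pfn = end_pfn;

	return low_pfn;
}

int main(void)
{
	struct fake_page pages[16] = { { 0 } };

	/* Pretend pfn 4 starts an order-2 (4-page) free buddy block. */
	pages[4].buddy = true;
	pages[4].order = 2;

	printf("scan finished at pfn %lu\n", scan_range(pages, 0, 16));
	return 0;
}

With an order-2 buddy block at pfn 4, the += advances past pfns 5-7 in one step and the loop increment lands on pfn 8, which is how the scanner avoids iterating over the rest of a free buddy page.
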
diff --git a/mm/compaction.c b/mm/compaction.c
index b69b7dac0361..b9cf751cc00e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -313,8 +313,15 @@ static inline bool compact_should_abort(struct compact_control *cc)
 static bool suitable_migration_target(struct page *page)
 {
 	/* If the page is a large free page, then disallow migration */
-	if (PageBuddy(page) && page_order(page) >= pageblock_order)
-		return false;
+	if (PageBuddy(page)) {
+		/*
+		 * We are checking page_order without zone->lock taken. But
+		 * the only small danger is that we skip a potentially suitable
+		 * pageblock, so it's not worth to check order for valid range.
+		 */
+		if (page_order_unsafe(page) >= pageblock_order)
+			return false;
+	}
 
 	/* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
 	if (migrate_async_suitable(get_pageblock_migratetype(page)))
@@ -608,11 +615,23 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 			valid_page = page;
 
 		/*
-		 * Skip if free. page_order cannot be used without zone->lock
-		 * as nothing prevents parallel allocations or buddy merging.
+		 * Skip if free. We read page order here without zone lock
+		 * which is generally unsafe, but the race window is small and
+		 * the worst thing that can happen is that we skip some
+		 * potential isolation targets.
 		 */
-		if (PageBuddy(page))
+		if (PageBuddy(page)) {
+			unsigned long freepage_order = page_order_unsafe(page);
+
+			/*
+			 * Without lock, we cannot be sure that what we got is
+			 * a valid page order. Consider only values in the
+			 * valid order range to prevent low_pfn overflow.
+			 */
+			if (freepage_order > 0 && freepage_order < MAX_ORDER)
+				low_pfn += (1UL << freepage_order) - 1;
 			continue;
+		}
 
 		/*
 		 * Check may be lockless but that's ok as we recheck later.
@@ -698,6 +717,13 @@ isolate_success:
 		}
 	}
 
+	/*
+	 * The PageBuddy() check could have potentially brought us outside
+	 * the range to be scanned.
+	 */
+	if (unlikely(low_pfn > end_pfn))
+		low_pfn = end_pfn;
+
 	if (locked)
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 
diff --git a/mm/internal.h b/mm/internal.h
index 4c1d604c396c..86ae964a25b0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -164,7 +164,8 @@ isolate_migratepages_range(struct compact_control *cc,
  * general, page_zone(page)->lock must be held by the caller to prevent the
  * page from being allocated in parallel and returning garbage as the order.
  * If a caller does not hold page_zone(page)->lock, it must guarantee that the
- * page cannot be allocated or merged in parallel.
+ * page cannot be allocated or merged in parallel. Alternatively, it must
+ * handle invalid values gracefully, and use page_order_unsafe() below.
  */
 static inline unsigned long page_order(struct page *page)
 {
@@ -172,6 +173,19 @@ static inline unsigned long page_order(struct page *page)
 	return page_private(page);
 }
 
+/*
+ * Like page_order(), but for callers who cannot afford to hold the zone lock.
+ * PageBuddy() should be checked first by the caller to minimize race window,
+ * and invalid values must be handled gracefully.
+ *
+ * ACCESS_ONCE is used so that if the caller assigns the result into a local
+ * variable and e.g. tests it for valid range before using, the compiler cannot
+ * decide to remove the variable and inline the page_private(page) multiple
+ * times, potentially observing different values in the tests and the actual
+ * use of the result.
+ */
+#define page_order_unsafe(page)		ACCESS_ONCE(page_private(page))
+
 static inline bool is_cow_mapping(vm_flags_t flags)
 {
 	return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
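
The ACCESS_ONCE comment added above is the subtle part of the change: a plain read of page_private(page) may legally be re-loaded by the compiler at every use, so the value that passes the range check might not be the value used in the shift. A small illustrative comparison, in plain C with a hypothetical order_field rather than struct page; the volatile cast is what the kernel's ACCESS_ONCE macro of that era expanded to.

#define SKETCH_MAX_ORDER 11	/* stand-in for MAX_ORDER */

/*
 * Racy: 'order' is a plain load, so the compiler may drop the local and
 * re-read *order_field both in the comparison and in the shift, potentially
 * observing two different values if the field changes concurrently.
 */
unsigned long skip_racy(unsigned long *order_field, unsigned long low_pfn)
{
	unsigned long order = *order_field;

	if (order > 0 && order < SKETCH_MAX_ORDER)
		low_pfn += (1UL << order) - 1;
	return low_pfn;
}

/*
 * Single read: the volatile-qualified access forces exactly one load, so the
 * checked value and the used value are the same, which is what
 * page_order_unsafe() guarantees via ACCESS_ONCE.
 */
unsigned long skip_once(unsigned long *order_field, unsigned long low_pfn)
{
	unsigned long order = *(volatile unsigned long *)order_field;

	if (order > 0 && order < SKETCH_MAX_ORDER)
		low_pfn += (1UL << order) - 1;
	return low_pfn;
}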