author    Mel Gorman <mgorman@suse.de>    2013-07-03 18:01:51 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2013-07-03 19:07:28 -0400
commit    283aba9f9e0e4882bf09bd37a2983379a6fae805 (patch)
tree      8c856efae71bb2daaadae48ff565132dd6e0b06b /mm
parent    d43006d503ac921c7df4f94d13c17db6f13c9d26 (diff)
mm: vmscan: block kswapd if it is encountering pages under writeback
Historically, kswapd used to congestion_wait() at higher priorities if it was not making forward progress. This made no sense as the failure to make progress could be completely independent of IO. It was later replaced by wait_iff_congested() and removed entirely by commit 258401a6 (mm: don't wait on congested zones in balance_pgdat()) as it was duplicating logic in shrink_inactive_list().

This is problematic. If kswapd encounters many pages under writeback and it continues to scan until it reaches the high watermark then it will quickly skip over the pages under writeback and reclaim clean young pages or push applications out to swap.

The use of wait_iff_congested() is not suited to kswapd as it will only stall if the underlying BDI is really congested or a direct reclaimer was unable to write to the underlying BDI. kswapd bypasses the BDI congestion as it sets PF_SWAPWRITE but, even if this was taken into account, it would cause direct reclaimers to stall on writeback, which is not desirable.

This patch sets a ZONE_WRITEBACK flag if direct reclaim or kswapd is encountering too many pages under writeback. If this flag is set and kswapd encounters a PageReclaim page under writeback then it'll assume that the LRU lists are being recycled too quickly before IO can complete and block waiting for some IO to complete.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Tested-by: Zlatko Calusic <zcalusic@bitsync.net>
Cc: dormando <dormando@rydia.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
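To make the new logic easier to follow before reading the diff, here is a minimal, standalone C sketch of the three-way decision this patch adds to shrink_page_list(). It is an illustration only: the struct and helper names are invented stand-ins, not kernel symbols, and only the branch structure mirrors the actual change below.

/*
 * Simplified, standalone model of the decision added to shrink_page_list().
 * Types and helpers here are stand-ins; only the branch structure follows
 * the diff in this commit.
 */
#include <stdbool.h>
#include <stdio.h>

struct page_state {
	bool under_writeback;	/* models PageWriteback() */
	bool tagged_reclaim;	/* models PageReclaim()   */
};

struct zone_state {
	bool writeback_flag;	/* models ZONE_WRITEBACK  */
};

enum action { THROTTLE_KSWAPD, TAG_AND_SKIP, WAIT_FOR_WRITEBACK, RECLAIM };

static enum action classify(const struct page_state *page,
			     struct zone_state *zone,
			     bool is_kswapd, bool global_reclaim, bool may_do_io)
{
	if (!page->under_writeback)
		return RECLAIM;

	/* Case 1: kswapd sees the LRU recycling faster than IO completes. */
	if (is_kswapd && page->tagged_reclaim && zone->writeback_flag) {
		zone->writeback_flag = false;	/* recheck the condition later */
		return THROTTLE_KSWAPD;		/* stall ~HZ/10, keep the page */
	}

	/* Case 2: tag the page for immediate reclaim and keep scanning. */
	if (global_reclaim || !page->tagged_reclaim || !may_do_io)
		return TAG_AND_SKIP;

	/* Case 3: memcg reclaim with nothing else to do - wait for the IO. */
	return WAIT_FOR_WRITEBACK;
}

int main(void)
{
	struct page_state page = { .under_writeback = true, .tagged_reclaim = true };
	struct zone_state zone = { .writeback_flag = true };

	/* kswapd hitting a PageReclaim page in a flagged zone throttles. */
	printf("%d\n", classify(&page, &zone, true, true, true));
	return 0;
}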
Diffstat (limited to 'mm')
-rw-r--r--    mm/vmscan.c    82
1 file changed, 60 insertions(+), 22 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d6c916d808ba..1109de0c35bf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -724,25 +724,55 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
+		/*
+		 * If a page at the tail of the LRU is under writeback, there
+		 * are three cases to consider.
+		 *
+		 * 1) If reclaim is encountering an excessive number of pages
+		 *    under writeback and this page is both under writeback and
+		 *    PageReclaim then it indicates that pages are being queued
+		 *    for IO but are being recycled through the LRU before the
+		 *    IO can complete.  Waiting on the page itself risks an
+		 *    indefinite stall if it is impossible to writeback the
+		 *    page due to IO error or disconnected storage so instead
+		 *    block for HZ/10 or until some IO completes then clear the
+		 *    ZONE_WRITEBACK flag to recheck if the condition exists.
+		 *
+		 * 2) Global reclaim encounters a page, memcg encounters a
+		 *    page that is not marked for immediate reclaim or
+		 *    the caller does not have __GFP_IO. In this case mark
+		 *    the page for immediate reclaim and continue scanning.
+		 *
+		 *    __GFP_IO is checked because a loop driver thread might
+		 *    enter reclaim, and deadlock if it waits on a page for
+		 *    which it is needed to do the write (loop masks off
+		 *    __GFP_IO|__GFP_FS for this reason); but more thought
+		 *    would probably show more reasons.
+		 *
+		 *    Don't require __GFP_FS, since we're not going into the
+		 *    FS, just waiting on its writeback completion. Worryingly,
+		 *    ext4 gfs2 and xfs allocate pages with
+		 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
+		 *    may_enter_fs here is liable to OOM on them.
+		 *
+		 * 3) memcg encounters a page that is not already marked
+		 *    PageReclaim. memcg does not have any dirty pages
+		 *    throttling so we could easily OOM just because too many
+		 *    pages are in writeback and there is nothing else to
+		 *    reclaim. Wait for the writeback to complete.
+		 */
 		if (PageWriteback(page)) {
-			/*
-			 * memcg doesn't have any dirty pages throttling so we
-			 * could easily OOM just because too many pages are in
-			 * writeback and there is nothing else to reclaim.
-			 *
-			 * Check __GFP_IO, certainly because a loop driver
-			 * thread might enter reclaim, and deadlock if it waits
-			 * on a page for which it is needed to do the write
-			 * (loop masks off __GFP_IO|__GFP_FS for this reason);
-			 * but more thought would probably show more reasons.
-			 *
-			 * Don't require __GFP_FS, since we're not going into
-			 * the FS, just waiting on its writeback completion.
-			 * Worryingly, ext4 gfs2 and xfs allocate pages with
-			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
-			 * testing may_enter_fs here is liable to OOM on them.
-			 */
-			if (global_reclaim(sc) ||
+			/* Case 1 above */
+			if (current_is_kswapd() &&
+			    PageReclaim(page) &&
+			    zone_is_reclaim_writeback(zone)) {
+				unlock_page(page);
+				congestion_wait(BLK_RW_ASYNC, HZ/10);
+				zone_clear_flag(zone, ZONE_WRITEBACK);
+				goto keep;
+
+			/* Case 2 above */
+			} else if (global_reclaim(sc) ||
 			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
 				/*
 				 * This is slightly racy - end_page_writeback()
@@ -757,9 +787,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				 */
 				SetPageReclaim(page);
 				nr_writeback++;
+
 				goto keep_locked;
+
+			/* Case 3 above */
+			} else {
+				wait_on_page_writeback(page);
 			}
-			wait_on_page_writeback(page);
 		}
 
 		if (!force_reclaim)
@@ -1374,8 +1408,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	 * isolated page is PageWriteback
 	 */
 	if (nr_writeback && nr_writeback >=
-			(nr_taken >> (DEF_PRIORITY - sc->priority)))
+			(nr_taken >> (DEF_PRIORITY - sc->priority))) {
 		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+		zone_set_flag(zone, ZONE_WRITEBACK);
+	}
 
 	/*
 	 * Similarly, if many dirty pages are encountered that are not
@@ -2669,8 +2705,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
  * the high watermark.
  *
  * Returns true if kswapd scanned at least the requested number of pages to
- * reclaim. This is used to determine if the scanning priority needs to be
- * raised.
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
  */
 static bool kswapd_shrink_zone(struct zone *zone,
 			       struct scan_control *sc,
@@ -2697,6 +2733,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	if (nr_slab == 0 && !zone_reclaimable(zone))
 		zone->all_unreclaimable = 1;
 
+	zone_clear_flag(zone, ZONE_WRITEBACK);
+
 	return sc->nr_scanned >= sc->nr_to_reclaim;
 }
 
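Taken together, the hunks above implement a set/test/clear protocol on ZONE_WRITEBACK: shrink_inactive_list() sets the flag when at least nr_taken >> (DEF_PRIORITY - sc->priority) of the isolated pages are still under writeback (at priority 10 with DEF_PRIORITY 12, that is a quarter of the batch), kswapd tests and clears it when it throttles in shrink_page_list(), and kswapd_shrink_zone() clears it again after a full scan of the zone. The sketch below is a hypothetical standalone model of that lifecycle with invented names, not kernel code.

/*
 * Hypothetical standalone model of the ZONE_WRITEBACK lifecycle in this
 * patch.  Names are stand-ins; only the set/test/clear protocol mirrors
 * the three call sites touched in the diff.
 */
#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY 12

struct zone_model {
	bool writeback_flag;		/* models ZONE_WRITEBACK */
};

/* shrink_inactive_list(): flag the zone when too many of the isolated
 * pages are still under writeback at the current scan priority. */
static void after_lru_scan(struct zone_model *zone, unsigned long nr_taken,
			   unsigned long nr_writeback, int priority)
{
	if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY - priority)))
		zone->writeback_flag = true;	/* plus wait_iff_congested() */
}

/* shrink_page_list(): kswapd throttles on a flagged zone and rechecks. */
static bool kswapd_should_throttle(struct zone_model *zone, bool page_reclaim)
{
	if (page_reclaim && zone->writeback_flag) {
		zone->writeback_flag = false;	/* recheck after the stall */
		return true;			/* congestion_wait(HZ/10)  */
	}
	return false;
}

/* kswapd_shrink_zone(): a full scan of the zone clears the flag. */
static void after_zone_scan(struct zone_model *zone)
{
	zone->writeback_flag = false;
}

int main(void)
{
	struct zone_model zone = { false };

	/* At priority 10, 8 of 32 isolated pages under writeback (a quarter)
	 * is enough to flag the zone... */
	after_lru_scan(&zone, 32, 8, 10);
	/* ...so kswapd throttles the next time it meets a PageReclaim page. */
	printf("throttle: %d\n", kswapd_should_throttle(&zone, true));
	after_zone_scan(&zone);
	return 0;
}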