diff options (stat)
 -rw-r--r--  include/linux/mmzone.h |  8 ++++
 -rw-r--r--  mm/vmscan.c            | 82 +++++++++++++++++++++++++-----------
 2 files changed, 68 insertions(+), 22 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2aaf72f7e345..fce64afba042 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -499,6 +499,9 @@ typedef enum {
	 * many dirty file pages at the tail
	 * of the LRU.
	 */
+	ZONE_WRITEBACK,			/* reclaim scanning has recently found
+					 * many pages under writeback
+					 */
 } zone_flags_t;
 
 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -526,6 +529,11 @@ static inline int zone_is_reclaim_dirty(const struct zone *zone)
 	return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags);
 }
 
+static inline int zone_is_reclaim_writeback(const struct zone *zone)
+{
+	return test_bit(ZONE_WRITEBACK, &zone->flags);
+}
+
 static inline int zone_is_reclaim_locked(const struct zone *zone)
 {
 	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d6c916d808ba..1109de0c35bf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -724,25 +724,55 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
+		/*
+		 * If a page at the tail of the LRU is under writeback, there
+		 * are three cases to consider.
+		 *
+		 * 1) If reclaim is encountering an excessive number of pages
+		 *    under writeback and this page is both under writeback and
+		 *    PageReclaim then it indicates that pages are being queued
+		 *    for IO but are being recycled through the LRU before the
+		 *    IO can complete. Waiting on the page itself risks an
+		 *    indefinite stall if it is impossible to writeback the
+		 *    page due to IO error or disconnected storage so instead
+		 *    block for HZ/10 or until some IO completes then clear the
+		 *    ZONE_WRITEBACK flag to recheck if the condition exists.
+		 *
+		 * 2) Global reclaim encounters a page, memcg encounters a
+		 *    page that is not marked for immediate reclaim or
+		 *    the caller does not have __GFP_IO. In this case mark
+		 *    the page for immediate reclaim and continue scanning.
+		 *
+		 *    __GFP_IO is checked because a loop driver thread might
+		 *    enter reclaim, and deadlock if it waits on a page for
+		 *    which it is needed to do the write (loop masks off
+		 *    __GFP_IO|__GFP_FS for this reason); but more thought
+		 *    would probably show more reasons.
+		 *
+		 *    Don't require __GFP_FS, since we're not going into the
+		 *    FS, just waiting on its writeback completion. Worryingly,
+		 *    ext4 gfs2 and xfs allocate pages with
+		 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
+		 *    may_enter_fs here is liable to OOM on them.
+		 *
+		 * 3) memcg encounters a page that is not already marked
+		 *    PageReclaim. memcg does not have any dirty pages
+		 *    throttling so we could easily OOM just because too many
+		 *    pages are in writeback and there is nothing else to
+		 *    reclaim. Wait for the writeback to complete.
+		 */
 		if (PageWriteback(page)) {
-			/*
-			 * memcg doesn't have any dirty pages throttling so we
-			 * could easily OOM just because too many pages are in
-			 * writeback and there is nothing else to reclaim.
-			 *
-			 * Check __GFP_IO, certainly because a loop driver
-			 * thread might enter reclaim, and deadlock if it waits
-			 * on a page for which it is needed to do the write
-			 * (loop masks off __GFP_IO|__GFP_FS for this reason);
-			 * but more thought would probably show more reasons.
-			 *
-			 * Don't require __GFP_FS, since we're not going into
-			 * the FS, just waiting on its writeback completion.
-			 * Worryingly, ext4 gfs2 and xfs allocate pages with
-			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
-			 * testing may_enter_fs here is liable to OOM on them.
-			 */
-			if (global_reclaim(sc) ||
+			/* Case 1 above */
+			if (current_is_kswapd() &&
+			    PageReclaim(page) &&
+			    zone_is_reclaim_writeback(zone)) {
+				unlock_page(page);
+				congestion_wait(BLK_RW_ASYNC, HZ/10);
+				zone_clear_flag(zone, ZONE_WRITEBACK);
+				goto keep;
+
+			/* Case 2 above */
+			} else if (global_reclaim(sc) ||
 			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
 				/*
 				 * This is slightly racy - end_page_writeback()
@@ -757,9 +787,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				 */
 				SetPageReclaim(page);
 				nr_writeback++;
+
 				goto keep_locked;
+
+			/* Case 3 above */
+			} else {
+				wait_on_page_writeback(page);
 			}
-			wait_on_page_writeback(page);
 		}
 
 		if (!force_reclaim)
@@ -1374,8 +1408,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	 * isolated page is PageWriteback
 	 */
 	if (nr_writeback && nr_writeback >=
-			(nr_taken >> (DEF_PRIORITY - sc->priority)))
+			(nr_taken >> (DEF_PRIORITY - sc->priority))) {
 		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+		zone_set_flag(zone, ZONE_WRITEBACK);
+	}
 
 	/*
 	 * Similarly, if many dirty pages are encountered that are not
@@ -2669,8 +2705,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
  * the high watermark.
  *
  * Returns true if kswapd scanned at least the requested number of pages to
- * reclaim. This is used to determine if the scanning priority needs to be
- * raised.
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
  */
 static bool kswapd_shrink_zone(struct zone *zone,
 			       struct scan_control *sc,
@@ -2697,6 +2733,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	if (nr_slab == 0 && !zone_reclaimable(zone))
 		zone->all_unreclaimable = 1;
 
+	zone_clear_flag(zone, ZONE_WRITEBACK);
+
 	return sc->nr_scanned >= sc->nr_to_reclaim;
 }
 