author    Mel Gorman <mgorman@suse.de>    2013-07-03 18:01:51 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2013-07-03 19:07:28 -0400
commit    283aba9f9e0e4882bf09bd37a2983379a6fae805 (patch)
tree      8c856efae71bb2daaadae48ff565132dd6e0b06b /mm
parent    d43006d503ac921c7df4f94d13c17db6f13c9d26 (diff)
mm: vmscan: block kswapd if it is encountering pages under writeback
Historically, kswapd used to congestion_wait() at higher priorities if it was not making forward progress. This made no sense as the failure to make progress could be completely independent of IO. It was later replaced by wait_iff_congested() and removed entirely by commit 258401a6 (mm: don't wait on congested zones in balance_pgdat()) as it was duplicating logic in shrink_inactive_list().

This is problematic. If kswapd encounters many pages under writeback and it continues to scan until it reaches the high watermark then it will quickly skip over the pages under writeback and reclaim clean young pages or push applications out to swap.

The use of wait_iff_congested() is not suited to kswapd as it will only stall if the underlying BDI is really congested or a direct reclaimer was unable to write to the underlying BDI. kswapd bypasses the BDI congestion as it sets PF_SWAPWRITE but, even if this was taken into account, it would cause direct reclaimers to stall on writeback, which is not desirable.

This patch sets a ZONE_WRITEBACK flag if direct reclaim or kswapd is encountering too many pages under writeback. If this flag is set and kswapd encounters a PageReclaim page under writeback then it'll assume that the LRU lists are being recycled too quickly before IO can complete and block waiting for some IO to complete.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Tested-by: Zlatko Calusic <zcalusic@bitsync.net>
Cc: dormando <dormando@rydia.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
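To make the new logic easier to follow before reading the diff, here is a minimal, standalone C sketch of the three-way decision this patch adds to shrink_page_list(). It is an illustration only: the struct and helper names are invented stand-ins, not kernel symbols, and only the branch structure mirrors the actual change below.

/*
 * Simplified, standalone model of the decision added to shrink_page_list().
 * Types and helpers here are stand-ins; only the branch structure follows
 * the diff in this commit.
 */
#include <stdbool.h>
#include <stdio.h>

struct page_state {
	bool under_writeback;	/* models PageWriteback() */
	bool tagged_reclaim;	/* models PageReclaim()   */
};

struct zone_state {
	bool writeback_flag;	/* models ZONE_WRITEBACK  */
};

enum action { THROTTLE_KSWAPD, TAG_AND_SKIP, WAIT_FOR_WRITEBACK, RECLAIM };

static enum action classify(const struct page_state *page,
			     struct zone_state *zone,
			     bool is_kswapd, bool global_reclaim, bool may_do_io)
{
	if (!page->under_writeback)
		return RECLAIM;

	/* Case 1: kswapd sees the LRU recycling faster than IO completes. */
	if (is_kswapd && page->tagged_reclaim && zone->writeback_flag) {
		zone->writeback_flag = false;	/* recheck the condition later */
		return THROTTLE_KSWAPD;		/* stall ~HZ/10, keep the page */
	}

	/* Case 2: tag the page for immediate reclaim and keep scanning. */
	if (global_reclaim || !page->tagged_reclaim || !may_do_io)
		return TAG_AND_SKIP;

	/* Case 3: memcg reclaim with nothing else to do - wait for the IO. */
	return WAIT_FOR_WRITEBACK;
}

int main(void)
{
	struct page_state page = { .under_writeback = true, .tagged_reclaim = true };
	struct zone_state zone = { .writeback_flag = true };

	/* kswapd hitting a PageReclaim page in a flagged zone throttles. */
	printf("%d\n", classify(&page, &zone, true, true, true));
	return 0;
}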
Diffstat (limited to 'mm')
-rw-r--r--    mm/vmscan.c    82
1 file changed, 60 insertions(+), 22 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d6c916d808ba..1109de0c35bf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -724,25 +724,55 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
+		/*
+		 * If a page at the tail of the LRU is under writeback, there
+		 * are three cases to consider.
+		 *
+		 * 1) If reclaim is encountering an excessive number of pages
+		 *    under writeback and this page is both under writeback and
+		 *    PageReclaim then it indicates that pages are being queued
+		 *    for IO but are being recycled through the LRU before the
+		 *    IO can complete.  Waiting on the page itself risks an
+		 *    indefinite stall if it is impossible to writeback the
+		 *    page due to IO error or disconnected storage so instead
+		 *    block for HZ/10 or until some IO completes then clear the
+		 *    ZONE_WRITEBACK flag to recheck if the condition exists.
+		 *
+		 * 2) Global reclaim encounters a page, memcg encounters a
+		 *    page that is not marked for immediate reclaim or
+		 *    the caller does not have __GFP_IO. In this case mark
+		 *    the page for immediate reclaim and continue scanning.
+		 *
+		 *    __GFP_IO is checked because a loop driver thread might
+		 *    enter reclaim, and deadlock if it waits on a page for
+		 *    which it is needed to do the write (loop masks off
+		 *    __GFP_IO|__GFP_FS for this reason); but more thought
+		 *    would probably show more reasons.
+		 *
+		 *    Don't require __GFP_FS, since we're not going into the
+		 *    FS, just waiting on its writeback completion. Worryingly,
+		 *    ext4 gfs2 and xfs allocate pages with
+		 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
+		 *    may_enter_fs here is liable to OOM on them.
+		 *
+		 * 3) memcg encounters a page that is not already marked
+		 *    PageReclaim. memcg does not have any dirty pages
+		 *    throttling so we could easily OOM just because too many
+		 *    pages are in writeback and there is nothing else to
+		 *    reclaim. Wait for the writeback to complete.
+		 */
 		if (PageWriteback(page)) {
-			/*
-			 * memcg doesn't have any dirty pages throttling so we
-			 * could easily OOM just because too many pages are in
-			 * writeback and there is nothing else to reclaim.
-			 *
-			 * Check __GFP_IO, certainly because a loop driver
-			 * thread might enter reclaim, and deadlock if it waits
-			 * on a page for which it is needed to do the write
-			 * (loop masks off __GFP_IO|__GFP_FS for this reason);
-			 * but more thought would probably show more reasons.
-			 *
-			 * Don't require __GFP_FS, since we're not going into
-			 * the FS, just waiting on its writeback completion.
-			 * Worryingly, ext4 gfs2 and xfs allocate pages with
-			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
-			 * testing may_enter_fs here is liable to OOM on them.
-			 */
-			if (global_reclaim(sc) ||
+			/* Case 1 above */
+			if (current_is_kswapd() &&
+			    PageReclaim(page) &&
+			    zone_is_reclaim_writeback(zone)) {
+				unlock_page(page);
+				congestion_wait(BLK_RW_ASYNC, HZ/10);
+				zone_clear_flag(zone, ZONE_WRITEBACK);
+				goto keep;
+
+			/* Case 2 above */
+			} else if (global_reclaim(sc) ||
 			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
 				/*
 				 * This is slightly racy - end_page_writeback()
@@ -757,9 +787,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				 */
 				SetPageReclaim(page);
 				nr_writeback++;
+
 				goto keep_locked;
+
+			/* Case 3 above */
+			} else {
+				wait_on_page_writeback(page);
 			}
-			wait_on_page_writeback(page);
 		}
 
 		if (!force_reclaim)
@@ -1374,8 +1408,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	 * isolated page is PageWriteback
 	 */
 	if (nr_writeback && nr_writeback >=
-			(nr_taken >> (DEF_PRIORITY - sc->priority)))
+			(nr_taken >> (DEF_PRIORITY - sc->priority))) {
 		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+		zone_set_flag(zone, ZONE_WRITEBACK);
+	}
 
 	/*
 	 * Similarly, if many dirty pages are encountered that are not
@@ -2669,8 +2705,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
  * the high watermark.
  *
  * Returns true if kswapd scanned at least the requested number of pages to
- * reclaim. This is used to determine if the scanning priority needs to be
- * raised.
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
  */
 static bool kswapd_shrink_zone(struct zone *zone,
 			       struct scan_control *sc,
@@ -2697,6 +2733,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	if (nr_slab == 0 && !zone_reclaimable(zone))
 		zone->all_unreclaimable = 1;
 
+	zone_clear_flag(zone, ZONE_WRITEBACK);
+
 	return sc->nr_scanned >= sc->nr_to_reclaim;
 }
 
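Taken together, the hunks above implement a set/test/clear protocol on ZONE_WRITEBACK: shrink_inactive_list() sets the flag when at least nr_taken >> (DEF_PRIORITY - sc->priority) of the isolated pages are still under writeback (at priority 10 with DEF_PRIORITY 12, that is a quarter of the batch), kswapd tests and clears it when it throttles in shrink_page_list(), and kswapd_shrink_zone() clears it again after a full scan of the zone. The sketch below is a hypothetical standalone model of that lifecycle with invented names, not kernel code.

/*
 * Hypothetical standalone model of the ZONE_WRITEBACK lifecycle in this
 * patch.  Names are stand-ins; only the set/test/clear protocol mirrors
 * the three call sites touched in the diff.
 */
#include <stdbool.h>
#include <stdio.h>

#define DEF_PRIORITY 12

struct zone_model {
	bool writeback_flag;		/* models ZONE_WRITEBACK */
};

/* shrink_inactive_list(): flag the zone when too many of the isolated
 * pages are still under writeback at the current scan priority. */
static void after_lru_scan(struct zone_model *zone, unsigned long nr_taken,
			   unsigned long nr_writeback, int priority)
{
	if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY - priority)))
		zone->writeback_flag = true;	/* plus wait_iff_congested() */
}

/* shrink_page_list(): kswapd throttles on a flagged zone and rechecks. */
static bool kswapd_should_throttle(struct zone_model *zone, bool page_reclaim)
{
	if (page_reclaim && zone->writeback_flag) {
		zone->writeback_flag = false;	/* recheck after the stall */
		return true;			/* congestion_wait(HZ/10)  */
	}
	return false;
}

/* kswapd_shrink_zone(): a full scan of the zone clears the flag. */
static void after_zone_scan(struct zone_model *zone)
{
	zone->writeback_flag = false;
}

int main(void)
{
	struct zone_model zone = { false };

	/* At priority 10, 8 of 32 isolated pages under writeback (a quarter)
	 * is enough to flag the zone... */
	after_lru_scan(&zone, 32, 8, 10);
	/* ...so kswapd throttles the next time it meets a PageReclaim page. */
	printf("throttle: %d\n", kswapd_should_throttle(&zone, true));
	after_zone_scan(&zone);
	return 0;
}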