-rw-r--r--	include/linux/mmzone.h	 8
-rw-r--r--	mm/vmscan.c	82
2 files changed, 68 insertions(+), 22 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2aaf72f7e345..fce64afba042 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -499,6 +499,9 @@ typedef enum {
 	 * many dirty file pages at the tail
 	 * of the LRU.
 	 */
+	ZONE_WRITEBACK,			/* reclaim scanning has recently found
+					 * many pages under writeback
+					 */
 } zone_flags_t;
 
 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -526,6 +529,11 @@ static inline int zone_is_reclaim_dirty(const struct zone *zone)
 	return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags);
 }
 
+static inline int zone_is_reclaim_writeback(const struct zone *zone)
+{
+	return test_bit(ZONE_WRITEBACK, &zone->flags);
+}
+
 static inline int zone_is_reclaim_locked(const struct zone *zone)
 {
 	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
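
The mmzone.h side of the patch is the standard zone-flag pattern: a new bit in zone_flags_t plus a test_bit() accessor, mirroring the existing ZONE_TAIL_LRU_DIRTY helpers. For readers outside the kernel tree, here is a minimal userspace sketch of that pattern; the demo_* names are invented for illustration, and the real helpers use atomic bitops (set_bit/clear_bit/test_bit) on zone->flags rather than plain masks.

#include <stdio.h>

/* Invented stand-ins for zone_flags_t and struct zone. */
typedef enum {
	DEMO_ZONE_TAIL_LRU_DIRTY,
	DEMO_ZONE_WRITEBACK,		/* the flag this patch introduces */
} demo_zone_flags_t;

struct demo_zone {
	unsigned long flags;
};

static void demo_zone_set_flag(struct demo_zone *zone, demo_zone_flags_t flag)
{
	zone->flags |= 1UL << flag;	/* kernel: set_bit(flag, &zone->flags) */
}

static void demo_zone_clear_flag(struct demo_zone *zone, demo_zone_flags_t flag)
{
	zone->flags &= ~(1UL << flag);	/* kernel: clear_bit() */
}

static int demo_zone_is_reclaim_writeback(const struct demo_zone *zone)
{
	return !!(zone->flags & (1UL << DEMO_ZONE_WRITEBACK));
}

int main(void)
{
	struct demo_zone zone = { 0 };

	demo_zone_set_flag(&zone, DEMO_ZONE_WRITEBACK);
	printf("reclaim_writeback=%d\n", demo_zone_is_reclaim_writeback(&zone));
	demo_zone_clear_flag(&zone, DEMO_ZONE_WRITEBACK);
	printf("reclaim_writeback=%d\n", demo_zone_is_reclaim_writeback(&zone));
	return 0;
}
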
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d6c916d808ba..1109de0c35bf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -724,25 +724,55 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
+		/*
+		 * If a page at the tail of the LRU is under writeback, there
+		 * are three cases to consider.
+		 *
+		 * 1) If reclaim is encountering an excessive number of pages
+		 *    under writeback and this page is both under writeback and
+		 *    PageReclaim then it indicates that pages are being queued
+		 *    for IO but are being recycled through the LRU before the
+		 *    IO can complete. Waiting on the page itself risks an
+		 *    indefinite stall if it is impossible to writeback the
+		 *    page due to IO error or disconnected storage so instead
+		 *    block for HZ/10 or until some IO completes then clear the
+		 *    ZONE_WRITEBACK flag to recheck if the condition exists.
+		 *
+		 * 2) Global reclaim encounters a page, memcg encounters a
+		 *    page that is not marked for immediate reclaim or
+		 *    the caller does not have __GFP_IO. In this case mark
+		 *    the page for immediate reclaim and continue scanning.
+		 *
+		 *    __GFP_IO is checked because a loop driver thread might
+		 *    enter reclaim, and deadlock if it waits on a page for
+		 *    which it is needed to do the write (loop masks off
+		 *    __GFP_IO|__GFP_FS for this reason); but more thought
+		 *    would probably show more reasons.
+		 *
+		 *    Don't require __GFP_FS, since we're not going into the
+		 *    FS, just waiting on its writeback completion. Worryingly,
+		 *    ext4 gfs2 and xfs allocate pages with
+		 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
+		 *    may_enter_fs here is liable to OOM on them.
+		 *
+		 * 3) memcg encounters a page that is not already marked
+		 *    PageReclaim. memcg does not have any dirty pages
+		 *    throttling so we could easily OOM just because too many
+		 *    pages are in writeback and there is nothing else to
+		 *    reclaim. Wait for the writeback to complete.
+		 */
 		if (PageWriteback(page)) {
-			/*
-			 * memcg doesn't have any dirty pages throttling so we
-			 * could easily OOM just because too many pages are in
-			 * writeback and there is nothing else to reclaim.
-			 *
-			 * Check __GFP_IO, certainly because a loop driver
-			 * thread might enter reclaim, and deadlock if it waits
-			 * on a page for which it is needed to do the write
-			 * (loop masks off __GFP_IO|__GFP_FS for this reason);
-			 * but more thought would probably show more reasons.
-			 *
-			 * Don't require __GFP_FS, since we're not going into
-			 * the FS, just waiting on its writeback completion.
-			 * Worryingly, ext4 gfs2 and xfs allocate pages with
-			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
-			 * testing may_enter_fs here is liable to OOM on them.
-			 */
-			if (global_reclaim(sc) ||
+			/* Case 1 above */
+			if (current_is_kswapd() &&
+			    PageReclaim(page) &&
+			    zone_is_reclaim_writeback(zone)) {
+				unlock_page(page);
+				congestion_wait(BLK_RW_ASYNC, HZ/10);
+				zone_clear_flag(zone, ZONE_WRITEBACK);
+				goto keep;
+
+			/* Case 2 above */
+			} else if (global_reclaim(sc) ||
 			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
 				/*
 				 * This is slightly racy - end_page_writeback()
@@ -757,9 +787,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				 */
 				SetPageReclaim(page);
 				nr_writeback++;
+
 				goto keep_locked;
+
+			/* Case 3 above */
+			} else {
+				wait_on_page_writeback(page);
 			}
-			wait_on_page_writeback(page);
 		}
 
 		if (!force_reclaim)
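
The rewritten PageWriteback() branch above is easier to follow as a pure decision function. The sketch below is a hypothetical distillation: plain ints stand in for current_is_kswapd(), PageReclaim(), zone_is_reclaim_writeback(), global_reclaim() and the __GFP_IO test, and the enum names are invented; only the ordering of the three checks comes from the patch.

enum wb_action {
	WB_THROTTLE_AND_KEEP,		/* case 1: congestion_wait(), goto keep */
	WB_MARK_RECLAIM_AND_SKIP,	/* case 2: SetPageReclaim(), keep_locked */
	WB_WAIT_FOR_WRITEBACK,		/* case 3: wait_on_page_writeback() */
};

static enum wb_action classify_writeback_page(int is_kswapd, int page_reclaim,
					      int zone_writeback,
					      int global_reclaim, int gfp_io)
{
	/* Case 1: kswapd keeps meeting pages still under IO; back off. */
	if (is_kswapd && page_reclaim && zone_writeback)
		return WB_THROTTLE_AND_KEEP;

	/* Case 2: it may not be safe to wait; tag the page and move on. */
	if (global_reclaim || !page_reclaim || !gfp_io)
		return WB_MARK_RECLAIM_AND_SKIP;

	/* Case 3: memcg reclaim with nothing else to do; wait it out. */
	return WB_WAIT_FOR_WRITEBACK;
}
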
@@ -1374,8 +1408,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	 * isolated page is PageWriteback
 	 */
 	if (nr_writeback && nr_writeback >=
-			(nr_taken >> (DEF_PRIORITY - sc->priority)))
+			(nr_taken >> (DEF_PRIORITY - sc->priority))) {
 		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+		zone_set_flag(zone, ZONE_WRITEBACK);
+	}
 
 	/*
 	 * Similarly, if many dirty pages are encountered that are not
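
The trigger above fires when at least nr_taken >> (DEF_PRIORITY - sc->priority) of the isolated pages are under writeback. DEF_PRIORITY is 12, so at the initial priority the whole batch must be under writeback, and the threshold halves with each priority drop, i.e. a more desperate scan sets ZONE_WRITEBACK sooner. A standalone table of the thresholds, assuming an illustrative batch of 32 isolated pages:

#include <stdio.h>

#define DEF_PRIORITY	12		/* as in this kernel */

int main(void)
{
	unsigned long nr_taken = 32;	/* hypothetical isolated batch */
	int priority;

	/* Threshold on nr_writeback that flags ZONE_WRITEBACK. */
	for (priority = DEF_PRIORITY; priority >= DEF_PRIORITY - 5; priority--)
		printf("priority %2d: nr_writeback >= %2lu\n",
		       priority, nr_taken >> (DEF_PRIORITY - priority));
	return 0;
}
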
@@ -2669,8 +2705,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
  * the high watermark.
  *
  * Returns true if kswapd scanned at least the requested number of pages to
- * reclaim. This is used to determine if the scanning priority needs to be
- * raised.
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
  */
 static bool kswapd_shrink_zone(struct zone *zone,
 			       struct scan_control *sc,
@@ -2697,6 +2733,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	if (nr_slab == 0 && !zone_reclaimable(zone))
 		zone->all_unreclaimable = 1;
 
+	zone_clear_flag(zone, ZONE_WRITEBACK);
+
 	return sc->nr_scanned >= sc->nr_to_reclaim;
 }
 
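
Taken together, the patch gives ZONE_WRITEBACK a short lifecycle: shrink_inactive_list() sets it when most of an isolated batch is under writeback, shrink_page_list() lets kswapd sleep in congestion_wait() and clears it so the condition is re-tested, and kswapd_shrink_zone() clears it after each zone pass so a stale flag cannot keep throttling kswapd. A toy single-threaded trace of that lifecycle, with invented sim_* stand-ins for the three touch points:

#include <stdio.h>

static int zone_writeback;		/* stand-in for the ZONE_WRITEBACK bit */

static void sim_shrink_inactive_list(int batch_mostly_writeback)
{
	if (batch_mostly_writeback) {
		zone_writeback = 1;	/* wait_iff_congested() + set flag */
		printf("shrink_inactive_list: ZONE_WRITEBACK set\n");
	}
}

static void sim_shrink_page_list(void)
{
	if (zone_writeback) {		/* case 1: kswapd backs off */
		printf("shrink_page_list: congestion_wait(HZ/10)\n");
		zone_writeback = 0;	/* cleared to recheck the condition */
	}
}

static void sim_kswapd_shrink_zone(void)
{
	zone_writeback = 0;		/* zone pass done; start clean */
	printf("kswapd_shrink_zone: ZONE_WRITEBACK cleared\n");
}

int main(void)
{
	sim_shrink_inactive_list(1);
	sim_shrink_page_list();
	sim_kswapd_shrink_zone();
	return 0;
}
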