diff options (stat)
 -rw-r--r--  include/linux/mmzone.h |  8 ++++
 -rw-r--r--  mm/vmscan.c            | 82 +++++++++++++++++++++++++-----------
 2 files changed, 68 insertions(+), 22 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 2aaf72f7e345..fce64afba042 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -499,6 +499,9 @@ typedef enum {
	 * many dirty file pages at the tail
	 * of the LRU.
	 */
+	ZONE_WRITEBACK,			/* reclaim scanning has recently found
+					 * many pages under writeback
+					 */
 } zone_flags_t;
 
 static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -526,6 +529,11 @@ static inline int zone_is_reclaim_dirty(const struct zone *zone)
 	return test_bit(ZONE_TAIL_LRU_DIRTY, &zone->flags);
 }
 
+static inline int zone_is_reclaim_writeback(const struct zone *zone)
+{
+	return test_bit(ZONE_WRITEBACK, &zone->flags);
+}
+
 static inline int zone_is_reclaim_locked(const struct zone *zone)
 {
 	return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d6c916d808ba..1109de0c35bf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -724,25 +724,55 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
+		/*
+		 * If a page at the tail of the LRU is under writeback, there
+		 * are three cases to consider.
+		 *
+		 * 1) If reclaim is encountering an excessive number of pages
+		 *    under writeback and this page is both under writeback and
+		 *    PageReclaim then it indicates that pages are being queued
+		 *    for IO but are being recycled through the LRU before the
+		 *    IO can complete. Waiting on the page itself risks an
+		 *    indefinite stall if it is impossible to writeback the
+		 *    page due to IO error or disconnected storage so instead
+		 *    block for HZ/10 or until some IO completes then clear the
+		 *    ZONE_WRITEBACK flag to recheck if the condition exists.
+		 *
+		 * 2) Global reclaim encounters a page, memcg encounters a
+		 *    page that is not marked for immediate reclaim or
+		 *    the caller does not have __GFP_IO. In this case mark
+		 *    the page for immediate reclaim and continue scanning.
+		 *
+		 *    __GFP_IO is checked because a loop driver thread might
+		 *    enter reclaim, and deadlock if it waits on a page for
+		 *    which it is needed to do the write (loop masks off
+		 *    __GFP_IO|__GFP_FS for this reason); but more thought
+		 *    would probably show more reasons.
+		 *
+		 *    Don't require __GFP_FS, since we're not going into the
+		 *    FS, just waiting on its writeback completion. Worryingly,
+		 *    ext4 gfs2 and xfs allocate pages with
+		 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
+		 *    may_enter_fs here is liable to OOM on them.
+		 *
+		 * 3) memcg encounters a page that is not already marked
+		 *    PageReclaim. memcg does not have any dirty pages
+		 *    throttling so we could easily OOM just because too many
+		 *    pages are in writeback and there is nothing else to
+		 *    reclaim. Wait for the writeback to complete.
+		 */
 		if (PageWriteback(page)) {
-			/*
-			 * memcg doesn't have any dirty pages throttling so we
-			 * could easily OOM just because too many pages are in
-			 * writeback and there is nothing else to reclaim.
-			 *
-			 * Check __GFP_IO, certainly because a loop driver
-			 * thread might enter reclaim, and deadlock if it waits
-			 * on a page for which it is needed to do the write
-			 * (loop masks off __GFP_IO|__GFP_FS for this reason);
-			 * but more thought would probably show more reasons.
-			 *
-			 * Don't require __GFP_FS, since we're not going into
-			 * the FS, just waiting on its writeback completion.
-			 * Worryingly, ext4 gfs2 and xfs allocate pages with
-			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
-			 * testing may_enter_fs here is liable to OOM on them.
-			 */
-			if (global_reclaim(sc) ||
+			/* Case 1 above */
+			if (current_is_kswapd() &&
+			    PageReclaim(page) &&
+			    zone_is_reclaim_writeback(zone)) {
+				unlock_page(page);
+				congestion_wait(BLK_RW_ASYNC, HZ/10);
+				zone_clear_flag(zone, ZONE_WRITEBACK);
+				goto keep;
+
+			/* Case 2 above */
+			} else if (global_reclaim(sc) ||
 			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
 				/*
 				 * This is slightly racy - end_page_writeback()
@@ -757,9 +787,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				 */
 				SetPageReclaim(page);
 				nr_writeback++;
+
 				goto keep_locked;
+
+			/* Case 3 above */
+			} else {
+				wait_on_page_writeback(page);
 			}
-			wait_on_page_writeback(page);
 		}
 
 		if (!force_reclaim)
@@ -1374,8 +1408,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	 * isolated page is PageWriteback
 	 */
 	if (nr_writeback && nr_writeback >=
-			(nr_taken >> (DEF_PRIORITY - sc->priority)))
+			(nr_taken >> (DEF_PRIORITY - sc->priority))) {
 		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+		zone_set_flag(zone, ZONE_WRITEBACK);
+	}
 
 	/*
 	 * Similarly, if many dirty pages are encountered that are not
@@ -2669,8 +2705,8 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
  * the high watermark.
  *
  * Returns true if kswapd scanned at least the requested number of pages to
- * reclaim. This is used to determine if the scanning priority needs to be
- * raised.
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
  */
 static bool kswapd_shrink_zone(struct zone *zone,
 			       struct scan_control *sc,
@@ -2697,6 +2733,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	if (nr_slab == 0 && !zone_reclaimable(zone))
 		zone->all_unreclaimable = 1;
 
+	zone_clear_flag(zone, ZONE_WRITEBACK);
+
 	return sc->nr_scanned >= sc->nr_to_reclaim;
 }
 