Diffstat (limited to 'mm/vmscan.c'):
 mm/vmscan.c | 585 ++++++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 384 insertions(+), 201 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fa6a85378ee4..99b3ac7771ad 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -546,7 +546,6 @@ int remove_mapping(struct address_space *mapping, struct page *page)
 void putback_lru_page(struct page *page)
 {
 	int lru;
-	int active = !!TestClearPageActive(page);
 	int was_unevictable = PageUnevictable(page);
 
 	VM_BUG_ON(PageLRU(page));
@@ -561,8 +560,8 @@ redo:
 		 * unevictable page on [in]active list.
 		 * We know how to handle that.
 		 */
-		lru = active + page_lru_base_type(page);
-		lru_cache_add_lru(page, lru);
+		lru = page_lru_base_type(page);
+		lru_cache_add(page);
 	} else {
 		/*
 		 * Put unevictable pages directly on zone's unevictable
@@ -669,6 +668,35 @@ static enum page_references page_check_references(struct page *page,
 	return PAGEREF_RECLAIM;
 }
 
+/* Check if a page is dirty or under writeback */
+static void page_check_dirty_writeback(struct page *page,
+				       bool *dirty, bool *writeback)
+{
+	struct address_space *mapping;
+
+	/*
+	 * Anonymous pages are not handled by flushers and must be written
+	 * from reclaim context. Do not stall reclaim based on them
+	 */
+	if (!page_is_file_cache(page)) {
+		*dirty = false;
+		*writeback = false;
+		return;
+	}
+
+	/* By default assume that the page flags are accurate */
+	*dirty = PageDirty(page);
+	*writeback = PageWriteback(page);
+
+	/* Verify dirty/writeback state if the filesystem supports it */
+	if (!page_has_private(page))
+		return;
+
+	mapping = page_mapping(page);
+	if (mapping && mapping->a_ops->is_dirty_writeback)
+		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
+}
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
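
The new page_check_dirty_writeback() helper consults an optional is_dirty_writeback address_space operation so that filesystems which track state in page-private data (typically buffer heads) can report dirty/writeback more precisely than the page flags alone. As an illustration only, and not part of this patch, a sketch of what such a callback might look like for a buffer-head backed mapping is shown below; the function name is hypothetical.

/*
 * Illustrative sketch only (not from this patch): report a page as dirty or
 * under writeback if any of its buffer heads are. The name
 * example_is_dirty_writeback() is hypothetical.
 */
static void example_is_dirty_writeback(struct page *page,
				       bool *dirty, bool *writeback)
{
	struct buffer_head *head, *bh;

	*dirty = false;
	*writeback = false;

	/* The caller already checked page_has_private(), so buffers exist */
	bh = head = page_buffers(page);
	do {
		if (buffer_locked(bh))
			*writeback = true;	/* buffer currently under I/O */
		if (buffer_dirty(bh))
			*dirty = true;		/* buffer still to be written */
		bh = bh->b_this_page;
	} while (bh != head);
}

Such a callback would be wired up through the .is_dirty_writeback field of the filesystem's address_space_operations.
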
@@ -677,16 +705,21 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				      struct scan_control *sc,
 				      enum ttu_flags ttu_flags,
 				      unsigned long *ret_nr_dirty,
+				      unsigned long *ret_nr_unqueued_dirty,
+				      unsigned long *ret_nr_congested,
 				      unsigned long *ret_nr_writeback,
+				      unsigned long *ret_nr_immediate,
 				      bool force_reclaim)
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
 	int pgactivate = 0;
+	unsigned long nr_unqueued_dirty = 0;
 	unsigned long nr_dirty = 0;
 	unsigned long nr_congested = 0;
 	unsigned long nr_reclaimed = 0;
 	unsigned long nr_writeback = 0;
+	unsigned long nr_immediate = 0;
 
 	cond_resched();
 
@@ -696,6 +729,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		struct page *page;
 		int may_enter_fs;
 		enum page_references references = PAGEREF_RECLAIM_CLEAN;
+		bool dirty, writeback;
 
 		cond_resched();
 
@@ -723,25 +757,77 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
+		/*
+		 * The number of dirty pages determines if a zone is marked
+		 * reclaim_congested which affects wait_iff_congested. kswapd
+		 * will stall and start writing pages if the tail of the LRU
+		 * is all dirty unqueued pages.
+		 */
+		page_check_dirty_writeback(page, &dirty, &writeback);
+		if (dirty || writeback)
+			nr_dirty++;
+
+		if (dirty && !writeback)
+			nr_unqueued_dirty++;
+
+		/*
+		 * Treat this page as congested if the underlying BDI is or if
+		 * pages are cycling through the LRU so quickly that the
+		 * pages marked for immediate reclaim are making it to the
+		 * end of the LRU a second time.
+		 */
+		mapping = page_mapping(page);
+		if ((mapping && bdi_write_congested(mapping->backing_dev_info)) ||
+		    (writeback && PageReclaim(page)))
+			nr_congested++;
+
+		/*
+		 * If a page at the tail of the LRU is under writeback, there
+		 * are three cases to consider.
+		 *
+		 * 1) If reclaim is encountering an excessive number of pages
+		 *    under writeback and this page is both under writeback and
+		 *    PageReclaim then it indicates that pages are being queued
+		 *    for IO but are being recycled through the LRU before the
+		 *    IO can complete. Waiting on the page itself risks an
+		 *    indefinite stall if it is impossible to writeback the
+		 *    page due to IO error or disconnected storage so instead
+		 *    note that the LRU is being scanned too quickly and the
+		 *    caller can stall after page list has been processed.
+		 *
+		 * 2) Global reclaim encounters a page, memcg encounters a
+		 *    page that is not marked for immediate reclaim or
+		 *    the caller does not have __GFP_IO. In this case mark
+		 *    the page for immediate reclaim and continue scanning.
+		 *
+		 *    __GFP_IO is checked because a loop driver thread might
+		 *    enter reclaim, and deadlock if it waits on a page for
+		 *    which it is needed to do the write (loop masks off
+		 *    __GFP_IO|__GFP_FS for this reason); but more thought
+		 *    would probably show more reasons.
+		 *
+		 *    Don't require __GFP_FS, since we're not going into the
+		 *    FS, just waiting on its writeback completion. Worryingly,
+		 *    ext4 gfs2 and xfs allocate pages with
+		 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
+		 *    may_enter_fs here is liable to OOM on them.
+		 *
+		 * 3) memcg encounters a page that is not already marked
+		 *    PageReclaim. memcg does not have any dirty pages
+		 *    throttling so we could easily OOM just because too many
+		 *    pages are in writeback and there is nothing else to
+		 *    reclaim. Wait for the writeback to complete.
+		 */
 		if (PageWriteback(page)) {
-			/*
-			 * memcg doesn't have any dirty pages throttling so we
-			 * could easily OOM just because too many pages are in
-			 * writeback and there is nothing else to reclaim.
-			 *
-			 * Check __GFP_IO, certainly because a loop driver
-			 * thread might enter reclaim, and deadlock if it waits
-			 * on a page for which it is needed to do the write
-			 * (loop masks off __GFP_IO|__GFP_FS for this reason);
-			 * but more thought would probably show more reasons.
-			 *
-			 * Don't require __GFP_FS, since we're not going into
-			 * the FS, just waiting on its writeback completion.
-			 * Worryingly, ext4 gfs2 and xfs allocate pages with
-			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
-			 * testing may_enter_fs here is liable to OOM on them.
-			 */
-			if (global_reclaim(sc) ||
+			/* Case 1 above */
+			if (current_is_kswapd() &&
+			    PageReclaim(page) &&
+			    zone_is_reclaim_writeback(zone)) {
+				nr_immediate++;
+				goto keep_locked;
+
+			/* Case 2 above */
+			} else if (global_reclaim(sc) ||
 			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
 				/*
 				 * This is slightly racy - end_page_writeback()
@@ -756,9 +842,13 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				 */
 				SetPageReclaim(page);
 				nr_writeback++;
+
 				goto keep_locked;
+
+			/* Case 3 above */
+			} else {
+				wait_on_page_writeback(page);
 			}
-			wait_on_page_writeback(page);
 		}
 
 		if (!force_reclaim)
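
Flattened out, the three PageWriteback() cases added above amount to the decision below. This sketch is purely a restatement for readability; the enum and helper are hypothetical, not kernel code.

/* Hypothetical restatement of the PageWriteback() handling above. */
enum wb_action { WB_STALL_KSWAPD, WB_MARK_RECLAIM, WB_WAIT };

static enum wb_action writeback_action(struct page *page, struct zone *zone,
				       struct scan_control *sc)
{
	/* Case 1: kswapd sees marked pages recycling through the LRU under I/O */
	if (current_is_kswapd() && PageReclaim(page) &&
	    zone_is_reclaim_writeback(zone))
		return WB_STALL_KSWAPD;		/* count in nr_immediate, keep the page */

	/* Case 2: tag the page so it is reclaimed as soon as writeback ends */
	if (global_reclaim(sc) || !PageReclaim(page) ||
	    !(sc->gfp_mask & __GFP_IO))
		return WB_MARK_RECLAIM;		/* SetPageReclaim(), count in nr_writeback */

	/* Case 3: memcg reclaim with no dirty throttling, so wait here */
	return WB_WAIT;				/* wait_on_page_writeback() */
}
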
@@ -784,9 +874,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			if (!add_to_swap(page, page_list))
 				goto activate_locked;
 			may_enter_fs = 1;
-		}
 
-		mapping = page_mapping(page);
+			/* Adding to swap updated mapping */
+			mapping = page_mapping(page);
+		}
 
 		/*
 		 * The page is mapped into the page tables of one or more
@@ -806,16 +897,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		}
 
 		if (PageDirty(page)) {
-			nr_dirty++;
-
 			/*
 			 * Only kswapd can writeback filesystem pages to
-			 * avoid risk of stack overflow but do not writeback
-			 * unless under significant pressure.
+			 * avoid risk of stack overflow but only writeback
+			 * if many dirty pages have been encountered.
 			 */
 			if (page_is_file_cache(page) &&
 					(!current_is_kswapd() ||
-					 sc->priority >= DEF_PRIORITY - 2)) {
+					 !zone_is_reclaim_dirty(zone))) {
 				/*
 				 * Immediately reclaim when written back.
 				 * Similar in principal to deactivate_page()
@@ -838,7 +927,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			/* Page is dirty, try to write it out here */
 			switch (pageout(page, mapping, sc)) {
 			case PAGE_KEEP:
-				nr_congested++;
 				goto keep_locked;
 			case PAGE_ACTIVATE:
 				goto activate_locked;
@@ -946,22 +1034,16 @@ keep:
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
 
-	/*
-	 * Tag a zone as congested if all the dirty pages encountered were
-	 * backed by a congested BDI. In this case, reclaimers should just
-	 * back off and wait for congestion to clear because further reclaim
-	 * will encounter the same problem
-	 */
-	if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc))
-		zone_set_flag(zone, ZONE_CONGESTED);
-
 	free_hot_cold_page_list(&free_pages, 1);
 
 	list_splice(&ret_pages, page_list);
 	count_vm_events(PGACTIVATE, pgactivate);
 	mem_cgroup_uncharge_end();
 	*ret_nr_dirty += nr_dirty;
+	*ret_nr_congested += nr_congested;
+	*ret_nr_unqueued_dirty += nr_unqueued_dirty;
 	*ret_nr_writeback += nr_writeback;
+	*ret_nr_immediate += nr_immediate;
 	return nr_reclaimed;
 }
 
@@ -973,7 +1055,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 		.priority = DEF_PRIORITY,
 		.may_unmap = 1,
 	};
-	unsigned long ret, dummy1, dummy2;
+	unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5;
 	struct page *page, *next;
 	LIST_HEAD(clean_pages);
 
@@ -985,8 +1067,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 	}
 
 	ret = shrink_page_list(&clean_pages, zone, &sc,
 				TTU_UNMAP|TTU_IGNORE_ACCESS,
-				&dummy1, &dummy2, true);
+				&dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true);
 	list_splice(&clean_pages, page_list);
 	__mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret);
 	return ret;
@@ -1281,7 +1363,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	unsigned long nr_reclaimed = 0;
 	unsigned long nr_taken;
 	unsigned long nr_dirty = 0;
+	unsigned long nr_congested = 0;
+	unsigned long nr_unqueued_dirty = 0;
 	unsigned long nr_writeback = 0;
+	unsigned long nr_immediate = 0;
 	isolate_mode_t isolate_mode = 0;
 	int file = is_file_lru(lru);
 	struct zone *zone = lruvec_zone(lruvec);
@@ -1323,7 +1408,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 		return 0;
 
 	nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
-					&nr_dirty, &nr_writeback, false);
+				&nr_dirty, &nr_unqueued_dirty, &nr_congested,
+				&nr_writeback, &nr_immediate,
+				false);
 
 	spin_lock_irq(&zone->lru_lock);
 
@@ -1357,7 +1444,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	 * same way balance_dirty_pages() manages.
 	 *
 	 * This scales the number of dirty pages that must be under writeback
-	 * before throttling depending on priority. It is a simple backoff
+	 * before a zone gets flagged ZONE_WRITEBACK. It is a simple backoff
 	 * function that has the most effect in the range DEF_PRIORITY to
 	 * DEF_PRIORITY-2 which is the priority reclaim is considered to be
 	 * in trouble and reclaim is considered to be in trouble.
@@ -1368,9 +1455,53 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	 * ...
 	 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
 	 *                isolated page is PageWriteback
+	 *
+	 * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
+	 * of pages under pages flagged for immediate reclaim and stall if any
+	 * are encountered in the nr_immediate check below.
 	 */
 	if (nr_writeback && nr_writeback >=
 			(nr_taken >> (DEF_PRIORITY - sc->priority)))
+		zone_set_flag(zone, ZONE_WRITEBACK);
+
+	/*
+	 * memcg will stall in page writeback so only consider forcibly
+	 * stalling for global reclaim
+	 */
+	if (global_reclaim(sc)) {
+		/*
+		 * Tag a zone as congested if all the dirty pages scanned were
+		 * backed by a congested BDI and wait_iff_congested will stall.
+		 */
+		if (nr_dirty && nr_dirty == nr_congested)
+			zone_set_flag(zone, ZONE_CONGESTED);
+
+		/*
+		 * If dirty pages are scanned that are not queued for IO, it
+		 * implies that flushers are not keeping up. In this case, flag
+		 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
+		 * pages from reclaim context. It will forcibly stall in the
+		 * next check.
+		 */
+		if (nr_unqueued_dirty == nr_taken)
+			zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
+
+		/*
+		 * In addition, if kswapd scans pages marked marked for
+		 * immediate reclaim and under writeback (nr_immediate), it
+		 * implies that pages are cycling through the LRU faster than
+		 * they are written so also forcibly stall.
+		 */
+		if (nr_unqueued_dirty == nr_taken || nr_immediate)
+			congestion_wait(BLK_RW_ASYNC, HZ/10);
+	}
+
+	/*
+	 * Stall direct reclaim for IO completions if underlying BDIs or zone
+	 * is congested. Allow kswapd to continue until it starts encountering
+	 * unqueued dirty pages or cycling through the LRU too quickly.
+	 */
+	if (!sc->hibernation_mode && !current_is_kswapd())
 		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
 
 	trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
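
The ZONE_WRITEBACK trigger above halves with every drop in priority. As a worked example (DEF_PRIORITY is 12; the batch of 128 isolated pages is hypothetical), the number of writeback pages needed to flag the zone is:

	priority 12: 128 >> 0 = 128   (every isolated page under writeback)
	priority 11: 128 >> 1 =  64
	priority 10: 128 >> 2 =  32
	priority  9: 128 >> 3 =  16
	priority  6: 128 >> 6 =   2

So the flag is set more and more readily as reclaim gets into trouble, after which kswapd starts counting nr_immediate pages and stalling on them.
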
@@ -1822,17 +1953,25 @@ out:
 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
 	unsigned long nr[NR_LRU_LISTS];
+	unsigned long targets[NR_LRU_LISTS];
 	unsigned long nr_to_scan;
 	enum lru_list lru;
 	unsigned long nr_reclaimed = 0;
 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
 	struct blk_plug plug;
+	bool scan_adjusted = false;
 
 	get_scan_count(lruvec, sc, nr);
 
+	/* Record the original scan target for proportional adjustments later */
+	memcpy(targets, nr, sizeof(nr));
+
 	blk_start_plug(&plug);
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
 					nr[LRU_INACTIVE_FILE]) {
+		unsigned long nr_anon, nr_file, percentage;
+		unsigned long nr_scanned;
+
 		for_each_evictable_lru(lru) {
 			if (nr[lru]) {
 				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
@@ -1842,17 +1981,60 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 							    lruvec, sc);
 			}
 		}
+
+		if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
+			continue;
+
 		/*
-		 * On large memory systems, scan >> priority can become
-		 * really large. This is fine for the starting priority;
-		 * we want to put equal scanning pressure on each zone.
-		 * However, if the VM has a harder time of freeing pages,
-		 * with multiple processes reclaiming pages, the total
-		 * freeing target can get unreasonably large.
+		 * For global direct reclaim, reclaim only the number of pages
+		 * requested. Less care is taken to scan proportionally as it
+		 * is more important to minimise direct reclaim stall latency
+		 * than it is to properly age the LRU lists.
 		 */
-		if (nr_reclaimed >= nr_to_reclaim &&
-		    sc->priority < DEF_PRIORITY)
+		if (global_reclaim(sc) && !current_is_kswapd())
 			break;
+
+		/*
+		 * For kswapd and memcg, reclaim at least the number of pages
+		 * requested. Ensure that the anon and file LRUs shrink
+		 * proportionally what was requested by get_scan_count(). We
+		 * stop reclaiming one LRU and reduce the amount scanning
+		 * proportional to the original scan target.
+		 */
+		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
+		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
+
+		if (nr_file > nr_anon) {
+			unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
+						targets[LRU_ACTIVE_ANON] + 1;
+			lru = LRU_BASE;
+			percentage = nr_anon * 100 / scan_target;
+		} else {
+			unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
+						targets[LRU_ACTIVE_FILE] + 1;
+			lru = LRU_FILE;
+			percentage = nr_file * 100 / scan_target;
+		}
+
+		/* Stop scanning the smaller of the LRU */
+		nr[lru] = 0;
+		nr[lru + LRU_ACTIVE] = 0;
+
+		/*
+		 * Recalculate the other LRU scan count based on its original
+		 * scan target and the percentage scanning already complete
+		 */
+		lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
+		nr_scanned = targets[lru] - nr[lru];
+		nr[lru] = targets[lru] * (100 - percentage) / 100;
+		nr[lru] -= min(nr[lru], nr_scanned);
+
+		lru += LRU_ACTIVE;
+		nr_scanned = targets[lru] - nr[lru];
+		nr[lru] = targets[lru] * (100 - percentage) / 100;
+		nr[lru] -= min(nr[lru], nr_scanned);
+
+		scan_adjusted = true;
 	}
 	blk_finish_plug(&plug);
 	sc->nr_reclaimed += nr_reclaimed;
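
The proportional adjustment above is easiest to see with numbers (hypothetical ones, not from the patch). Suppose get_scan_count() set targets of 100 anon and 400 file pages on the inactive lists only, and nr_to_reclaim is met while nr still holds 60 anon and 300 file:

	nr_file (300) > nr_anon (60), so anon is the smaller side:
	    scan_target = 100 + 1 = 101
	    percentage  = 60 * 100 / 101 = 59   (about 59% of the anon target is still unscanned)
	anon scanning stops: nr[anon lists] = 0
	file is rescaled to leave the same 59% unscanned:
	    total file pages to scan = 400 * (100 - 59) / 100 = 164
	    already scanned          = 400 - 300 = 100
	    nr[inactive file]        = 164 - 100 = 64

So both LRUs end up scanned in roughly the ratio get_scan_count() asked for, even though reclaim stopped early once enough pages were freed.
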
@@ -2222,17 +2404,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 						WB_REASON_TRY_TO_FREE_PAGES);
 			sc->may_writepage = 1;
 		}
-
-		/* Take a nap, wait for some writeback to complete */
-		if (!sc->hibernation_mode && sc->nr_scanned &&
-		    sc->priority < DEF_PRIORITY - 2) {
-			struct zone *preferred_zone;
-
-			first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
-						&cpuset_current_mems_allowed,
-						&preferred_zone);
-			wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
-		}
 	} while (--sc->priority >= 0);
 
 out:
@@ -2601,6 +2772,91 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 }
 
 /*
+ * kswapd shrinks the zone by the number of pages required to reach
+ * the high watermark.
+ *
+ * Returns true if kswapd scanned at least the requested number of pages to
+ * reclaim or if the lack of progress was due to pages under writeback.
+ * This is used to determine if the scanning priority needs to be raised.
+ */
+static bool kswapd_shrink_zone(struct zone *zone,
+			       int classzone_idx,
+			       struct scan_control *sc,
+			       unsigned long lru_pages,
+			       unsigned long *nr_attempted)
+{
+	unsigned long nr_slab;
+	int testorder = sc->order;
+	unsigned long balance_gap;
+	struct reclaim_state *reclaim_state = current->reclaim_state;
+	struct shrink_control shrink = {
+		.gfp_mask = sc->gfp_mask,
+	};
+	bool lowmem_pressure;
+
+	/* Reclaim above the high watermark. */
+	sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone));
+
+	/*
+	 * Kswapd reclaims only single pages with compaction enabled. Trying
+	 * too hard to reclaim until contiguous free pages have become
+	 * available can hurt performance by evicting too much useful data
+	 * from memory. Do not reclaim more than needed for compaction.
+	 */
+	if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
+			compaction_suitable(zone, sc->order) !=
+				COMPACT_SKIPPED)
+		testorder = 0;
+
+	/*
+	 * We put equal pressure on every zone, unless one zone has way too
+	 * many pages free already. The "too many pages" is defined as the
+	 * high wmark plus a "gap" where the gap is either the low
+	 * watermark or 1% of the zone, whichever is smaller.
+	 */
+	balance_gap = min(low_wmark_pages(zone),
+		(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+		KSWAPD_ZONE_BALANCE_GAP_RATIO);
+
+	/*
+	 * If there is no low memory pressure or the zone is balanced then no
+	 * reclaim is necessary
+	 */
+	lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone));
+	if (!lowmem_pressure && zone_balanced(zone, testorder,
+						balance_gap, classzone_idx))
+		return true;
+
+	shrink_zone(zone, sc);
+
+	reclaim_state->reclaimed_slab = 0;
+	nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
+	sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+
+	/* Account for the number of pages attempted to reclaim */
+	*nr_attempted += sc->nr_to_reclaim;
+
+	if (nr_slab == 0 && !zone_reclaimable(zone))
+		zone->all_unreclaimable = 1;
+
+	zone_clear_flag(zone, ZONE_WRITEBACK);
+
+	/*
+	 * If a zone reaches its high watermark, consider it to be no longer
+	 * congested. It's possible there are dirty pages backed by congested
+	 * BDIs but as pressure is relieved, speculatively avoid congestion
+	 * waits.
+	 */
+	if (!zone->all_unreclaimable &&
+	    zone_balanced(zone, testorder, 0, classzone_idx)) {
+		zone_clear_flag(zone, ZONE_CONGESTED);
+		zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
+	}
+
+	return sc->nr_scanned >= sc->nr_to_reclaim;
+}
+
+/*
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at high_wmark_pages(zone).
  *
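
In kswapd_shrink_zone() above, the balance gap is the smaller of the low watermark and roughly 1% of the zone (KSWAPD_ZONE_BALANCE_GAP_RATIO is 100; the +RATIO-1 term rounds up). A hypothetical example for a zone of 1,048,576 managed 4 KB pages (4 GB) with a low watermark of 16,384 pages:

	gap from the ratio = (1048576 + 99) / 100 = 10486 pages  (~41 MB)
	low watermark      = 16384 pages                         (~64 MB)
	balance_gap        = min(16384, 10486) = 10486 pages

The zone only counts as balanced once it clears the high watermark by this gap; the later zone_balanced() check that clears ZONE_CONGESTED and ZONE_TAIL_LRU_DIRTY uses a gap of 0.
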
@@ -2624,35 +2880,28 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 							int *classzone_idx)
 {
-	bool pgdat_is_balanced = false;
 	int i;
 	int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
-	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
+		.priority = DEF_PRIORITY,
 		.may_unmap = 1,
 		.may_swap = 1,
-		/*
-		 * kswapd doesn't want to be bailed out while reclaim. because
-		 * we want to put equal scanning pressure on each zone.
-		 */
-		.nr_to_reclaim = ULONG_MAX,
+		.may_writepage = !laptop_mode,
 		.order = order,
 		.target_mem_cgroup = NULL,
 	};
-	struct shrink_control shrink = {
-		.gfp_mask = sc.gfp_mask,
-	};
-loop_again:
-	sc.priority = DEF_PRIORITY;
-	sc.nr_reclaimed = 0;
-	sc.may_writepage = !laptop_mode;
 	count_vm_event(PAGEOUTRUN);
 
 	do {
 		unsigned long lru_pages = 0;
+		unsigned long nr_attempted = 0;
+		bool raise_priority = true;
+		bool pgdat_needs_compaction = (order > 0);
+
+		sc.nr_reclaimed = 0;
 
 		/*
 		 * Scan in the highmem->dma direction for the highest
@@ -2689,23 +2938,46 @@ loop_again:
 				end_zone = i;
 				break;
 			} else {
-				/* If balanced, clear the congested flag */
+				/*
+				 * If balanced, clear the dirty and congested
+				 * flags
+				 */
 				zone_clear_flag(zone, ZONE_CONGESTED);
+				zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
 			}
 		}
 
-		if (i < 0) {
-			pgdat_is_balanced = true;
+		if (i < 0)
 			goto out;
-		}
 
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
 
+			if (!populated_zone(zone))
+				continue;
+
 			lru_pages += zone_reclaimable_pages(zone);
+
+			/*
+			 * If any zone is currently balanced then kswapd will
+			 * not call compaction as it is expected that the
+			 * necessary pages are already available.
+			 */
+			if (pgdat_needs_compaction &&
+					zone_watermark_ok(zone, order,
+						low_wmark_pages(zone),
+						*classzone_idx, 0))
+				pgdat_needs_compaction = false;
 		}
 
 		/*
+		 * If we're getting trouble reclaiming, start doing writepage
+		 * even in laptop mode.
+		 */
+		if (sc.priority < DEF_PRIORITY - 2)
+			sc.may_writepage = 1;
+
+		/*
 		 * Now scan the zone in the dma->highmem direction, stopping
 		 * at the last zone which needs scanning.
 		 *
@@ -2716,8 +2988,6 @@ loop_again:
 		 */
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
-			int nr_slab, testorder;
-			unsigned long balance_gap;
 
 			if (!populated_zone(zone))
 				continue;
@@ -2738,65 +3008,14 @@ loop_again:
 			sc.nr_reclaimed += nr_soft_reclaimed;
 
 			/*
-			 * We put equal pressure on every zone, unless
-			 * one zone has way too many pages free
-			 * already. The "too many pages" is defined
-			 * as the high wmark plus a "gap" where the
-			 * gap is either the low watermark or 1%
-			 * of the zone, whichever is smaller.
+			 * There should be no need to raise the scanning
+			 * priority if enough pages are already being scanned
+			 * that that high watermark would be met at 100%
+			 * efficiency.
 			 */
-			balance_gap = min(low_wmark_pages(zone),
-				(zone->managed_pages +
-					KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
-				KSWAPD_ZONE_BALANCE_GAP_RATIO);
-			/*
-			 * Kswapd reclaims only single pages with compaction
-			 * enabled. Trying too hard to reclaim until contiguous
-			 * free pages have become available can hurt performance
-			 * by evicting too much useful data from memory.
-			 * Do not reclaim more than needed for compaction.
-			 */
-			testorder = order;
-			if (IS_ENABLED(CONFIG_COMPACTION) && order &&
-					compaction_suitable(zone, order) !=
-						COMPACT_SKIPPED)
-				testorder = 0;
-
-			if ((buffer_heads_over_limit && is_highmem_idx(i)) ||
-			    !zone_balanced(zone, testorder,
-					   balance_gap, end_zone)) {
-				shrink_zone(zone, &sc);
-
-				reclaim_state->reclaimed_slab = 0;
-				nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
-				sc.nr_reclaimed += reclaim_state->reclaimed_slab;
-
-				if (nr_slab == 0 && !zone_reclaimable(zone))
-					zone->all_unreclaimable = 1;
-			}
-
-			/*
-			 * If we're getting trouble reclaiming, start doing
-			 * writepage even in laptop mode.
-			 */
-			if (sc.priority < DEF_PRIORITY - 2)
-				sc.may_writepage = 1;
-
-			if (zone->all_unreclaimable) {
-				if (end_zone && end_zone == i)
-					end_zone--;
-				continue;
-			}
-
-			if (zone_balanced(zone, testorder, 0, end_zone))
-				/*
-				 * If a zone reaches its high watermark,
-				 * consider it to be no longer congested. It's
-				 * possible there are dirty pages backed by
-				 * congested BDIs but as pressure is relieved,
-				 * speculatively avoid congestion waits
-				 */
-				zone_clear_flag(zone, ZONE_CONGESTED);
+			if (kswapd_shrink_zone(zone, end_zone, &sc,
+					lru_pages, &nr_attempted))
+				raise_priority = false;
 		}
 
 		/*
@@ -2808,74 +3027,38 @@ loop_again:
 						pfmemalloc_watermark_ok(pgdat))
 			wake_up(&pgdat->pfmemalloc_wait);
 
-		if (pgdat_balanced(pgdat, order, *classzone_idx)) {
-			pgdat_is_balanced = true;
-			break;		/* kswapd: all done */
-		}
-
 		/*
-		 * We do this so kswapd doesn't build up large priorities for
-		 * example when it is freeing in parallel with allocators. It
-		 * matches the direct reclaim path behaviour in terms of impact
-		 * on zone->*_priority.
+		 * Fragmentation may mean that the system cannot be rebalanced
+		 * for high-order allocations in all zones. If twice the
+		 * allocation size has been reclaimed and the zones are still
+		 * not balanced then recheck the watermarks at order-0 to
+		 * prevent kswapd reclaiming excessively. Assume that a
+		 * process requested a high-order can direct reclaim/compact.
 		 */
-		if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
-			break;
-	} while (--sc.priority >= 0);
-
-out:
-	if (!pgdat_is_balanced) {
-		cond_resched();
+		if (order && sc.nr_reclaimed >= 2UL << order)
+			order = sc.order = 0;
 
-		try_to_freeze();
+		/* Check if kswapd should be suspending */
+		if (try_to_freeze() || kthread_should_stop())
+			break;
 
 		/*
-		 * Fragmentation may mean that the system cannot be
-		 * rebalanced for high-order allocations in all zones.
-		 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
-		 * it means the zones have been fully scanned and are still
-		 * not balanced. For high-order allocations, there is
-		 * little point trying all over again as kswapd may
-		 * infinite loop.
-		 *
-		 * Instead, recheck all watermarks at order-0 as they
-		 * are the most important. If watermarks are ok, kswapd will go
-		 * back to sleep. High-order users can still perform direct
-		 * reclaim if they wish.
+		 * Compact if necessary and kswapd is reclaiming at least the
+		 * high watermark number of pages as requsted
 		 */
-		if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
-			order = sc.order = 0;
-
-		goto loop_again;
-	}
-
-	/*
-	 * If kswapd was reclaiming at a higher order, it has the option of
-	 * sleeping without all zones being balanced. Before it does, it must
-	 * ensure that the watermarks for order-0 on *all* zones are met and
-	 * that the congestion flags are cleared. The congestion flag must
-	 * be cleared as kswapd is the only mechanism that clears the flag
-	 * and it is potentially going to sleep here.
-	 */
-	if (order) {
-		int zones_need_compaction = 1;
-
-		for (i = 0; i <= end_zone; i++) {
-			struct zone *zone = pgdat->node_zones + i;
-
-			if (!populated_zone(zone))
-				continue;
-
-			/* Check if the memory needs to be defragmented. */
-			if (zone_watermark_ok(zone, order,
-					low_wmark_pages(zone), *classzone_idx, 0))
-				zones_need_compaction = 0;
-		}
-
-		if (zones_need_compaction)
+		if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted)
 			compact_pgdat(pgdat, order);
-	}
 
+		/*
+		 * Raise priority if scanning rate is too low or there was no
+		 * progress in reclaiming pages
+		 */
+		if (raise_priority || !sc.nr_reclaimed)
+			sc.priority--;
+	} while (sc.priority >= 1 &&
+		 !pgdat_balanced(pgdat, order, *classzone_idx));
+
+out:
 	/*
 	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
 	 * makes a decision on the order we were last reclaiming at. However,