Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c | 525
1 file changed, 284 insertions, 241 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bc8031ef994d..2f45c0520f43 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -97,8 +97,13 @@ struct scan_control { | |||
97 | /* Can pages be swapped as part of reclaim? */ | 97 | /* Can pages be swapped as part of reclaim? */ |
98 | unsigned int may_swap:1; | 98 | unsigned int may_swap:1; |
99 | 99 | ||
100 | /* Can cgroups be reclaimed below their normal consumption range? */ | 100 | /* |
101 | unsigned int may_thrash:1; | 101 | * Cgroups are not reclaimed below their configured memory.low, |
102 | * unless we threaten to OOM. If any cgroups are skipped due to | ||
103 | * memory.low and nothing was reclaimed, go back for memory.low. | ||
104 | */ | ||
105 | unsigned int memcg_low_reclaim:1; | ||
106 | unsigned int memcg_low_skipped:1; | ||
102 | 107 | ||
103 | unsigned int hibernation_mode:1; | 108 | unsigned int hibernation_mode:1; |
104 | 109 | ||
@@ -230,12 +235,6 @@ unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) | |||
230 | return nr; | 235 | return nr; |
231 | } | 236 | } |
232 | 237 | ||
233 | bool pgdat_reclaimable(struct pglist_data *pgdat) | ||
234 | { | ||
235 | return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) < | ||
236 | pgdat_reclaimable_pages(pgdat) * 6; | ||
237 | } | ||
238 | |||
239 | /** | 238 | /** |
240 | * lruvec_lru_size - Returns the number of pages on the given LRU list. | 239 | * lruvec_lru_size - Returns the number of pages on the given LRU list. |
241 | * @lruvec: lru vector | 240 | * @lruvec: lru vector |
@@ -912,7 +911,8 @@ static void page_check_dirty_writeback(struct page *page, | |||
912 | * Anonymous pages are not handled by flushers and must be written | 911 | * Anonymous pages are not handled by flushers and must be written |
913 | * from reclaim context. Do not stall reclaim based on them | 912 | * from reclaim context. Do not stall reclaim based on them |
914 | */ | 913 | */ |
915 | if (!page_is_file_cache(page)) { | 914 | if (!page_is_file_cache(page) || |
915 | (PageAnon(page) && !PageSwapBacked(page))) { | ||
916 | *dirty = false; | 916 | *dirty = false; |
917 | *writeback = false; | 917 | *writeback = false; |
918 | return; | 918 | return; |
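The test added here, PageAnon(page) && !PageSwapBacked(page), recurs throughout the rest of the patch: it picks out clean MADV_FREE (lazyfree) anonymous pages, which have no backing store to write to and can be dropped outright. A minimal userspace sketch of that predicate follows; the struct fields are illustrative stand-ins for the page flag tests, not kernel API.

#include <stdbool.h>

struct page_model {
	bool anon;		/* PageAnon() */
	bool swap_backed;	/* PageSwapBacked() */
	bool file_cache;	/* page_is_file_cache() */
};

/* Clean MADV_FREE (lazyfree) pages: anonymous but no longer swap backed. */
static bool is_lazyfree(const struct page_model *page)
{
	return page->anon && !page->swap_backed;
}

/* page_check_dirty_writeback(): such pages never stall reclaim on flushers. */
static bool skip_dirty_writeback_check(const struct page_model *page)
{
	return !page->file_cache || is_lazyfree(page);
}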
@@ -972,8 +972,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
972 | int may_enter_fs; | 972 | int may_enter_fs; |
973 | enum page_references references = PAGEREF_RECLAIM_CLEAN; | 973 | enum page_references references = PAGEREF_RECLAIM_CLEAN; |
974 | bool dirty, writeback; | 974 | bool dirty, writeback; |
975 | bool lazyfree = false; | ||
976 | int ret = SWAP_SUCCESS; | ||
977 | 975 | ||
978 | cond_resched(); | 976 | cond_resched(); |
979 | 977 | ||
@@ -988,13 +986,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
988 | sc->nr_scanned++; | 986 | sc->nr_scanned++; |
989 | 987 | ||
990 | if (unlikely(!page_evictable(page))) | 988 | if (unlikely(!page_evictable(page))) |
991 | goto cull_mlocked; | 989 | goto activate_locked; |
992 | 990 | ||
993 | if (!sc->may_unmap && page_mapped(page)) | 991 | if (!sc->may_unmap && page_mapped(page)) |
994 | goto keep_locked; | 992 | goto keep_locked; |
995 | 993 | ||
996 | /* Double the slab pressure for mapped and swapcache pages */ | 994 | /* Double the slab pressure for mapped and swapcache pages */ |
997 | if (page_mapped(page) || PageSwapCache(page)) | 995 | if ((page_mapped(page) || PageSwapCache(page)) && |
996 | !(PageAnon(page) && !PageSwapBacked(page))) | ||
998 | sc->nr_scanned++; | 997 | sc->nr_scanned++; |
999 | 998 | ||
1000 | may_enter_fs = (sc->gfp_mask & __GFP_FS) || | 999 | may_enter_fs = (sc->gfp_mask & __GFP_FS) || |
@@ -1120,13 +1119,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1120 | /* | 1119 | /* |
1121 | * Anonymous process memory has backing store? | 1120 | * Anonymous process memory has backing store? |
1122 | * Try to allocate it some swap space here. | 1121 | * Try to allocate it some swap space here. |
1122 | * Lazyfree page could be freed directly | ||
1123 | */ | 1123 | */ |
1124 | if (PageAnon(page) && !PageSwapCache(page)) { | 1124 | if (PageAnon(page) && PageSwapBacked(page) && |
1125 | !PageSwapCache(page)) { | ||
1125 | if (!(sc->gfp_mask & __GFP_IO)) | 1126 | if (!(sc->gfp_mask & __GFP_IO)) |
1126 | goto keep_locked; | 1127 | goto keep_locked; |
1127 | if (!add_to_swap(page, page_list)) | 1128 | if (!add_to_swap(page, page_list)) |
1128 | goto activate_locked; | 1129 | goto activate_locked; |
1129 | lazyfree = true; | ||
1130 | may_enter_fs = 1; | 1130 | may_enter_fs = 1; |
1131 | 1131 | ||
1132 | /* Adding to swap updated mapping */ | 1132 | /* Adding to swap updated mapping */ |
@@ -1143,21 +1143,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1143 | * The page is mapped into the page tables of one or more | 1143 | * The page is mapped into the page tables of one or more |
1144 | * processes. Try to unmap it here. | 1144 | * processes. Try to unmap it here. |
1145 | */ | 1145 | */ |
1146 | if (page_mapped(page) && mapping) { | 1146 | if (page_mapped(page)) { |
1147 | switch (ret = try_to_unmap(page, lazyfree ? | 1147 | if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { |
1148 | (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) : | ||
1149 | (ttu_flags | TTU_BATCH_FLUSH))) { | ||
1150 | case SWAP_FAIL: | ||
1151 | nr_unmap_fail++; | 1148 | nr_unmap_fail++; |
1152 | goto activate_locked; | 1149 | goto activate_locked; |
1153 | case SWAP_AGAIN: | ||
1154 | goto keep_locked; | ||
1155 | case SWAP_MLOCK: | ||
1156 | goto cull_mlocked; | ||
1157 | case SWAP_LZFREE: | ||
1158 | goto lazyfree; | ||
1159 | case SWAP_SUCCESS: | ||
1160 | ; /* try to free the page below */ | ||
1161 | } | 1150 | } |
1162 | } | 1151 | } |
1163 | 1152 | ||
@@ -1267,10 +1256,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, | |||
1267 | } | 1256 | } |
1268 | } | 1257 | } |
1269 | 1258 | ||
1270 | lazyfree: | 1259 | if (PageAnon(page) && !PageSwapBacked(page)) { |
1271 | if (!mapping || !__remove_mapping(mapping, page, true)) | 1260 | /* follow __remove_mapping for reference */ |
1272 | goto keep_locked; | 1261 | if (!page_ref_freeze(page, 1)) |
1262 | goto keep_locked; | ||
1263 | if (PageDirty(page)) { | ||
1264 | page_ref_unfreeze(page, 1); | ||
1265 | goto keep_locked; | ||
1266 | } | ||
1273 | 1267 | ||
1268 | count_vm_event(PGLAZYFREED); | ||
1269 | } else if (!mapping || !__remove_mapping(mapping, page, true)) | ||
1270 | goto keep_locked; | ||
1274 | /* | 1271 | /* |
1275 | * At this point, we have no other references and there is | 1272 | * At this point, we have no other references and there is |
1276 | * no way to pick any more up (removed from LRU, removed | 1273 | * no way to pick any more up (removed from LRU, removed |
@@ -1280,9 +1277,6 @@ lazyfree: | |||
1280 | */ | 1277 | */ |
1281 | __ClearPageLocked(page); | 1278 | __ClearPageLocked(page); |
1282 | free_it: | 1279 | free_it: |
1283 | if (ret == SWAP_LZFREE) | ||
1284 | count_vm_event(PGLAZYFREED); | ||
1285 | |||
1286 | nr_reclaimed++; | 1280 | nr_reclaimed++; |
1287 | 1281 | ||
1288 | /* | 1282 | /* |
@@ -1292,20 +1286,16 @@ free_it: | |||
1292 | list_add(&page->lru, &free_pages); | 1286 | list_add(&page->lru, &free_pages); |
1293 | continue; | 1287 | continue; |
1294 | 1288 | ||
1295 | cull_mlocked: | ||
1296 | if (PageSwapCache(page)) | ||
1297 | try_to_free_swap(page); | ||
1298 | unlock_page(page); | ||
1299 | list_add(&page->lru, &ret_pages); | ||
1300 | continue; | ||
1301 | |||
1302 | activate_locked: | 1289 | activate_locked: |
1303 | /* Not a candidate for swapping, so reclaim swap space. */ | 1290 | /* Not a candidate for swapping, so reclaim swap space. */ |
1304 | if (PageSwapCache(page) && mem_cgroup_swap_full(page)) | 1291 | if (PageSwapCache(page) && (mem_cgroup_swap_full(page) || |
1292 | PageMlocked(page))) | ||
1305 | try_to_free_swap(page); | 1293 | try_to_free_swap(page); |
1306 | VM_BUG_ON_PAGE(PageActive(page), page); | 1294 | VM_BUG_ON_PAGE(PageActive(page), page); |
1307 | SetPageActive(page); | 1295 | if (!PageMlocked(page)) { |
1308 | pgactivate++; | 1296 | SetPageActive(page); |
1297 | pgactivate++; | ||
1298 | } | ||
1309 | keep_locked: | 1299 | keep_locked: |
1310 | unlock_page(page); | 1300 | unlock_page(page); |
1311 | keep: | 1301 | keep: |
@@ -1354,7 +1344,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, | |||
1354 | } | 1344 | } |
1355 | 1345 | ||
1356 | ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, | 1346 | ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, |
1357 | TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true); | 1347 | TTU_IGNORE_ACCESS, NULL, true); |
1358 | list_splice(&clean_pages, page_list); | 1348 | list_splice(&clean_pages, page_list); |
1359 | mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); | 1349 | mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); |
1360 | return ret; | 1350 | return ret; |
@@ -1478,12 +1468,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1478 | unsigned long nr_taken = 0; | 1468 | unsigned long nr_taken = 0; |
1479 | unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; | 1469 | unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; |
1480 | unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; | 1470 | unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; |
1481 | unsigned long skipped = 0, total_skipped = 0; | 1471 | unsigned long skipped = 0; |
1482 | unsigned long scan, nr_pages; | 1472 | unsigned long scan, nr_pages; |
1483 | LIST_HEAD(pages_skipped); | 1473 | LIST_HEAD(pages_skipped); |
1484 | 1474 | ||
1485 | for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && | 1475 | for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && |
1486 | !list_empty(src);) { | 1476 | !list_empty(src); scan++) { |
1487 | struct page *page; | 1477 | struct page *page; |
1488 | 1478 | ||
1489 | page = lru_to_page(src); | 1479 | page = lru_to_page(src); |
@@ -1497,12 +1487,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1497 | continue; | 1487 | continue; |
1498 | } | 1488 | } |
1499 | 1489 | ||
1500 | /* | ||
1501 | * Account for scanned and skipped separetly to avoid the pgdat | ||
1502 | * being prematurely marked unreclaimable by pgdat_reclaimable. | ||
1503 | */ | ||
1504 | scan++; | ||
1505 | |||
1506 | switch (__isolate_lru_page(page, mode)) { | 1490 | switch (__isolate_lru_page(page, mode)) { |
1507 | case 0: | 1491 | case 0: |
1508 | nr_pages = hpage_nr_pages(page); | 1492 | nr_pages = hpage_nr_pages(page); |
@@ -1531,6 +1515,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1531 | if (!list_empty(&pages_skipped)) { | 1515 | if (!list_empty(&pages_skipped)) { |
1532 | int zid; | 1516 | int zid; |
1533 | 1517 | ||
1518 | list_splice(&pages_skipped, src); | ||
1534 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 1519 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
1535 | if (!nr_skipped[zid]) | 1520 | if (!nr_skipped[zid]) |
1536 | continue; | 1521 | continue; |
@@ -1538,17 +1523,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, | |||
1538 | __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); | 1523 | __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); |
1539 | skipped += nr_skipped[zid]; | 1524 | skipped += nr_skipped[zid]; |
1540 | } | 1525 | } |
1541 | |||
1542 | /* | ||
1543 | * Account skipped pages as a partial scan as the pgdat may be | ||
1544 | * close to unreclaimable. If the LRU list is empty, account | ||
1545 | * skipped pages as a full scan. | ||
1546 | */ | ||
1547 | total_skipped = list_empty(src) ? skipped : skipped >> 2; | ||
1548 | |||
1549 | list_splice(&pages_skipped, src); | ||
1550 | } | 1526 | } |
1551 | *nr_scanned = scan + total_skipped; | 1527 | *nr_scanned = scan; |
1552 | trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, | 1528 | trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, |
1553 | scan, skipped, nr_taken, mode, lru); | 1529 | scan, skipped, nr_taken, mode, lru); |
1554 | update_lru_sizes(lruvec, lru, nr_zone_taken); | 1530 | update_lru_sizes(lruvec, lru, nr_zone_taken); |
@@ -1750,7 +1726,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1750 | reclaim_stat->recent_scanned[file] += nr_taken; | 1726 | reclaim_stat->recent_scanned[file] += nr_taken; |
1751 | 1727 | ||
1752 | if (global_reclaim(sc)) { | 1728 | if (global_reclaim(sc)) { |
1753 | __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); | ||
1754 | if (current_is_kswapd()) | 1729 | if (current_is_kswapd()) |
1755 | __count_vm_events(PGSCAN_KSWAPD, nr_scanned); | 1730 | __count_vm_events(PGSCAN_KSWAPD, nr_scanned); |
1756 | else | 1731 | else |
@@ -1761,7 +1736,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, | |||
1761 | if (nr_taken == 0) | 1736 | if (nr_taken == 0) |
1762 | return 0; | 1737 | return 0; |
1763 | 1738 | ||
1764 | nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP, | 1739 | nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0, |
1765 | &stat, false); | 1740 | &stat, false); |
1766 | 1741 | ||
1767 | spin_lock_irq(&pgdat->lru_lock); | 1742 | spin_lock_irq(&pgdat->lru_lock); |
@@ -1953,8 +1928,6 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
1953 | __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); | 1928 | __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); |
1954 | reclaim_stat->recent_scanned[file] += nr_taken; | 1929 | reclaim_stat->recent_scanned[file] += nr_taken; |
1955 | 1930 | ||
1956 | if (global_reclaim(sc)) | ||
1957 | __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); | ||
1958 | __count_vm_events(PGREFILL, nr_scanned); | 1931 | __count_vm_events(PGREFILL, nr_scanned); |
1959 | 1932 | ||
1960 | spin_unlock_irq(&pgdat->lru_lock); | 1933 | spin_unlock_irq(&pgdat->lru_lock); |
@@ -2033,6 +2006,8 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
2033 | * Both inactive lists should also be large enough that each inactive | 2006 | * Both inactive lists should also be large enough that each inactive |
2034 | * page has a chance to be referenced again before it is reclaimed. | 2007 | * page has a chance to be referenced again before it is reclaimed. |
2035 | * | 2008 | * |
2009 | * If that fails and refaulting is observed, the inactive list grows. | ||
2010 | * | ||
2036 | * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages | 2011 | * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages |
2037 | * on this LRU, maintained by the pageout code. A zone->inactive_ratio | 2012 | * on this LRU, maintained by the pageout code. A zone->inactive_ratio |
2038 | * of 3 means 3:1 or 25% of the pages are kept on the inactive list. | 2013 | * of 3 means 3:1 or 25% of the pages are kept on the inactive list. |
@@ -2049,12 +2024,15 @@ static void shrink_active_list(unsigned long nr_to_scan, | |||
2049 | * 10TB 320 32GB | 2024 | * 10TB 320 32GB |
2050 | */ | 2025 | */ |
2051 | static bool inactive_list_is_low(struct lruvec *lruvec, bool file, | 2026 | static bool inactive_list_is_low(struct lruvec *lruvec, bool file, |
2052 | struct scan_control *sc, bool trace) | 2027 | struct mem_cgroup *memcg, |
2028 | struct scan_control *sc, bool actual_reclaim) | ||
2053 | { | 2029 | { |
2054 | unsigned long inactive_ratio; | ||
2055 | unsigned long inactive, active; | ||
2056 | enum lru_list inactive_lru = file * LRU_FILE; | ||
2057 | enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; | 2030 | enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; |
2031 | struct pglist_data *pgdat = lruvec_pgdat(lruvec); | ||
2032 | enum lru_list inactive_lru = file * LRU_FILE; | ||
2033 | unsigned long inactive, active; | ||
2034 | unsigned long inactive_ratio; | ||
2035 | unsigned long refaults; | ||
2058 | unsigned long gb; | 2036 | unsigned long gb; |
2059 | 2037 | ||
2060 | /* | 2038 | /* |
@@ -2067,27 +2045,42 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, | |||
2067 | inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); | 2045 | inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); |
2068 | active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); | 2046 | active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); |
2069 | 2047 | ||
2070 | gb = (inactive + active) >> (30 - PAGE_SHIFT); | 2048 | if (memcg) |
2071 | if (gb) | 2049 | refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); |
2072 | inactive_ratio = int_sqrt(10 * gb); | ||
2073 | else | 2050 | else |
2074 | inactive_ratio = 1; | 2051 | refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); |
2052 | |||
2053 | /* | ||
2054 | * When refaults are being observed, it means a new workingset | ||
2055 | * is being established. Disable active list protection to get | ||
2056 | * rid of the stale workingset quickly. | ||
2057 | */ | ||
2058 | if (file && actual_reclaim && lruvec->refaults != refaults) { | ||
2059 | inactive_ratio = 0; | ||
2060 | } else { | ||
2061 | gb = (inactive + active) >> (30 - PAGE_SHIFT); | ||
2062 | if (gb) | ||
2063 | inactive_ratio = int_sqrt(10 * gb); | ||
2064 | else | ||
2065 | inactive_ratio = 1; | ||
2066 | } | ||
2075 | 2067 | ||
2076 | if (trace) | 2068 | if (actual_reclaim) |
2077 | trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id, | 2069 | trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx, |
2078 | sc->reclaim_idx, | 2070 | lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, |
2079 | lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, | 2071 | lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, |
2080 | lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, | 2072 | inactive_ratio, file); |
2081 | inactive_ratio, file); | ||
2082 | 2073 | ||
2083 | return inactive * inactive_ratio < active; | 2074 | return inactive * inactive_ratio < active; |
2084 | } | 2075 | } |
2085 | 2076 | ||
2086 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, | 2077 | static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, |
2087 | struct lruvec *lruvec, struct scan_control *sc) | 2078 | struct lruvec *lruvec, struct mem_cgroup *memcg, |
2079 | struct scan_control *sc) | ||
2088 | { | 2080 | { |
2089 | if (is_active_lru(lru)) { | 2081 | if (is_active_lru(lru)) { |
2090 | if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true)) | 2082 | if (inactive_list_is_low(lruvec, is_file_lru(lru), |
2083 | memcg, sc, true)) | ||
2091 | shrink_active_list(nr_to_scan, lruvec, sc, lru); | 2084 | shrink_active_list(nr_to_scan, lruvec, sc, lru); |
2092 | return 0; | 2085 | return 0; |
2093 | } | 2086 | } |
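A standalone model of the sizing rule these hunks put in place: when refaults are observed on the file LRU during actual reclaim, active-list protection is dropped entirely; otherwise the target inactive ratio grows as the square root of the LRU size in gigabytes. This is a sketch only; the struct, the page-shift constant, and the use of floating-point sqrt in place of the kernel's int_sqrt() are assumptions.

#include <math.h>
#include <stdbool.h>

#define MODEL_PAGE_SHIFT 12		/* assume 4K pages */

struct lru_model {
	unsigned long inactive;		/* lruvec_lru_size(inactive_lru) */
	unsigned long active;		/* lruvec_lru_size(active_lru) */
	unsigned long refaults;		/* current WORKINGSET_ACTIVATE count */
	unsigned long refaults_snap;	/* lruvec->refaults snapshot */
};

static bool inactive_is_low(const struct lru_model *m, bool file,
			    bool actual_reclaim)
{
	unsigned long inactive_ratio;
	unsigned long gb;

	if (file && actual_reclaim && m->refaults != m->refaults_snap) {
		/* New workingset forming: get rid of the stale one quickly. */
		inactive_ratio = 0;
	} else {
		gb = (m->inactive + m->active) >> (30 - MODEL_PAGE_SHIFT);
		inactive_ratio = gb ? (unsigned long)sqrt(10.0 * gb) : 1;
	}

	return m->inactive * inactive_ratio < m->active;
}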
@@ -2123,30 +2116,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, | |||
2123 | unsigned long anon_prio, file_prio; | 2116 | unsigned long anon_prio, file_prio; |
2124 | enum scan_balance scan_balance; | 2117 | enum scan_balance scan_balance; |
2125 | unsigned long anon, file; | 2118 | unsigned long anon, file; |
2126 | bool force_scan = false; | ||
2127 | unsigned long ap, fp; | 2119 | unsigned long ap, fp; |
2128 | enum lru_list lru; | 2120 | enum lru_list lru; |
2129 | bool some_scanned; | ||
2130 | int pass; | ||
2131 | |||
2132 | /* | ||
2133 | * If the zone or memcg is small, nr[l] can be 0. This | ||
2134 | * results in no scanning on this priority and a potential | ||
2135 | * priority drop. Global direct reclaim can go to the next | ||
2136 | * zone and tends to have no problems. Global kswapd is for | ||
2137 | * zone balancing and it needs to scan a minimum amount. When | ||
2138 | * reclaiming for a memcg, a priority drop can cause high | ||
2139 | * latencies, so it's better to scan a minimum amount there as | ||
2140 | * well. | ||
2141 | */ | ||
2142 | if (current_is_kswapd()) { | ||
2143 | if (!pgdat_reclaimable(pgdat)) | ||
2144 | force_scan = true; | ||
2145 | if (!mem_cgroup_online(memcg)) | ||
2146 | force_scan = true; | ||
2147 | } | ||
2148 | if (!global_reclaim(sc)) | ||
2149 | force_scan = true; | ||
2150 | 2121 | ||
2151 | /* If we have no swap space, do not bother scanning anon pages. */ | 2122 | /* If we have no swap space, do not bother scanning anon pages. */ |
2152 | if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { | 2123 | if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { |
@@ -2218,7 +2189,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, | |||
2218 | * lruvec even if it has plenty of old anonymous pages unless the | 2189 | * lruvec even if it has plenty of old anonymous pages unless the |
2219 | * system is under heavy pressure. | 2190 | * system is under heavy pressure. |
2220 | */ | 2191 | */ |
2221 | if (!inactive_list_is_low(lruvec, true, sc, false) && | 2192 | if (!inactive_list_is_low(lruvec, true, memcg, sc, false) && |
2222 | lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { | 2193 | lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { |
2223 | scan_balance = SCAN_FILE; | 2194 | scan_balance = SCAN_FILE; |
2224 | goto out; | 2195 | goto out; |
@@ -2277,55 +2248,48 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, | |||
2277 | fraction[1] = fp; | 2248 | fraction[1] = fp; |
2278 | denominator = ap + fp + 1; | 2249 | denominator = ap + fp + 1; |
2279 | out: | 2250 | out: |
2280 | some_scanned = false; | 2251 | *lru_pages = 0; |
2281 | /* Only use force_scan on second pass. */ | 2252 | for_each_evictable_lru(lru) { |
2282 | for (pass = 0; !some_scanned && pass < 2; pass++) { | 2253 | int file = is_file_lru(lru); |
2283 | *lru_pages = 0; | 2254 | unsigned long size; |
2284 | for_each_evictable_lru(lru) { | 2255 | unsigned long scan; |
2285 | int file = is_file_lru(lru); | ||
2286 | unsigned long size; | ||
2287 | unsigned long scan; | ||
2288 | |||
2289 | size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); | ||
2290 | scan = size >> sc->priority; | ||
2291 | |||
2292 | if (!scan && pass && force_scan) | ||
2293 | scan = min(size, SWAP_CLUSTER_MAX); | ||
2294 | |||
2295 | switch (scan_balance) { | ||
2296 | case SCAN_EQUAL: | ||
2297 | /* Scan lists relative to size */ | ||
2298 | break; | ||
2299 | case SCAN_FRACT: | ||
2300 | /* | ||
2301 | * Scan types proportional to swappiness and | ||
2302 | * their relative recent reclaim efficiency. | ||
2303 | */ | ||
2304 | scan = div64_u64(scan * fraction[file], | ||
2305 | denominator); | ||
2306 | break; | ||
2307 | case SCAN_FILE: | ||
2308 | case SCAN_ANON: | ||
2309 | /* Scan one type exclusively */ | ||
2310 | if ((scan_balance == SCAN_FILE) != file) { | ||
2311 | size = 0; | ||
2312 | scan = 0; | ||
2313 | } | ||
2314 | break; | ||
2315 | default: | ||
2316 | /* Look ma, no brain */ | ||
2317 | BUG(); | ||
2318 | } | ||
2319 | 2256 | ||
2320 | *lru_pages += size; | 2257 | size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); |
2321 | nr[lru] = scan; | 2258 | scan = size >> sc->priority; |
2259 | /* | ||
2260 | * If the cgroup's already been deleted, make sure to | ||
2261 | * scrape out the remaining cache. | ||
2262 | */ | ||
2263 | if (!scan && !mem_cgroup_online(memcg)) | ||
2264 | scan = min(size, SWAP_CLUSTER_MAX); | ||
2322 | 2265 | ||
2266 | switch (scan_balance) { | ||
2267 | case SCAN_EQUAL: | ||
2268 | /* Scan lists relative to size */ | ||
2269 | break; | ||
2270 | case SCAN_FRACT: | ||
2323 | /* | 2271 | /* |
2324 | * Skip the second pass and don't force_scan, | 2272 | * Scan types proportional to swappiness and |
2325 | * if we found something to scan. | 2273 | * their relative recent reclaim efficiency. |
2326 | */ | 2274 | */ |
2327 | some_scanned |= !!scan; | 2275 | scan = div64_u64(scan * fraction[file], |
2276 | denominator); | ||
2277 | break; | ||
2278 | case SCAN_FILE: | ||
2279 | case SCAN_ANON: | ||
2280 | /* Scan one type exclusively */ | ||
2281 | if ((scan_balance == SCAN_FILE) != file) { | ||
2282 | size = 0; | ||
2283 | scan = 0; | ||
2284 | } | ||
2285 | break; | ||
2286 | default: | ||
2287 | /* Look ma, no brain */ | ||
2288 | BUG(); | ||
2328 | } | 2289 | } |
2290 | |||
2291 | *lru_pages += size; | ||
2292 | nr[lru] = scan; | ||
2329 | } | 2293 | } |
2330 | } | 2294 | } |
2331 | 2295 | ||
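A condensed model of how the rewritten loop derives a per-LRU scan target: the LRU size shifted by priority, a SWAP_CLUSTER_MAX floor so deleted (offline) cgroups still get their leftover cache scraped out, and, for the SCAN_FRACT case, a swappiness-weighted fraction. Only that branch is modelled here and the parameter names are illustrative.

#include <stdint.h>

#define SWAP_CLUSTER_MAX 32UL		/* kernel's reclaim batching unit */

static unsigned long scan_target(unsigned long size, int priority,
				 int memcg_online, int file,
				 const uint64_t fraction[2],
				 uint64_t denominator)
{
	unsigned long scan = size >> priority;

	/* Deleted cgroup: make sure to scrape out the remaining cache. */
	if (!scan && !memcg_online)
		scan = size < SWAP_CLUSTER_MAX ? size : SWAP_CLUSTER_MAX;

	/* SCAN_FRACT: proportional to swappiness and reclaim efficiency. */
	return (unsigned long)(scan * fraction[file] / denominator);
}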
@@ -2376,7 +2340,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc | |||
2376 | nr[lru] -= nr_to_scan; | 2340 | nr[lru] -= nr_to_scan; |
2377 | 2341 | ||
2378 | nr_reclaimed += shrink_list(lru, nr_to_scan, | 2342 | nr_reclaimed += shrink_list(lru, nr_to_scan, |
2379 | lruvec, sc); | 2343 | lruvec, memcg, sc); |
2380 | } | 2344 | } |
2381 | } | 2345 | } |
2382 | 2346 | ||
@@ -2443,7 +2407,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc | |||
2443 | * Even if we did not try to evict anon pages at all, we want to | 2407 | * Even if we did not try to evict anon pages at all, we want to |
2444 | * rebalance the anon lru active/inactive ratio. | 2408 | * rebalance the anon lru active/inactive ratio. |
2445 | */ | 2409 | */ |
2446 | if (inactive_list_is_low(lruvec, false, sc, true)) | 2410 | if (inactive_list_is_low(lruvec, false, memcg, sc, true)) |
2447 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, | 2411 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, |
2448 | sc, LRU_ACTIVE_ANON); | 2412 | sc, LRU_ACTIVE_ANON); |
2449 | } | 2413 | } |
@@ -2557,9 +2521,11 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) | |||
2557 | unsigned long scanned; | 2521 | unsigned long scanned; |
2558 | 2522 | ||
2559 | if (mem_cgroup_low(root, memcg)) { | 2523 | if (mem_cgroup_low(root, memcg)) { |
2560 | if (!sc->may_thrash) | 2524 | if (!sc->memcg_low_reclaim) { |
2525 | sc->memcg_low_skipped = 1; | ||
2561 | continue; | 2526 | continue; |
2562 | mem_cgroup_events(memcg, MEMCG_LOW, 1); | 2527 | } |
2528 | mem_cgroup_event(memcg, MEMCG_LOW); | ||
2563 | } | 2529 | } |
2564 | 2530 | ||
2565 | reclaimed = sc->nr_reclaimed; | 2531 | reclaimed = sc->nr_reclaimed; |
@@ -2620,6 +2586,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) | |||
2620 | } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, | 2586 | } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, |
2621 | sc->nr_scanned - nr_scanned, sc)); | 2587 | sc->nr_scanned - nr_scanned, sc)); |
2622 | 2588 | ||
2589 | /* | ||
2590 | * Kswapd gives up on balancing particular nodes after too | ||
2591 | * many failures to reclaim anything from them and goes to | ||
2592 | * sleep. On reclaim progress, reset the failure counter. A | ||
2593 | * successful direct reclaim run will revive a dormant kswapd. | ||
2594 | */ | ||
2595 | if (reclaimable) | ||
2596 | pgdat->kswapd_failures = 0; | ||
2597 | |||
2623 | return reclaimable; | 2598 | return reclaimable; |
2624 | } | 2599 | } |
2625 | 2600 | ||
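The comment above describes the kswapd_failures counter this series threads through the file. A minimal model of its lifecycle is sketched below; the MAX_RECLAIM_RETRIES value and the struct are assumptions, only the transitions mirror the hunks in this diff.

#define MAX_RECLAIM_RETRIES 16		/* assumed value, defined elsewhere in mm/ */

struct node_model {
	int kswapd_failures;
};

/* balance_pgdat(): an entirely unproductive run bumps the counter. */
static void kswapd_run_done(struct node_model *node, unsigned long nr_reclaimed)
{
	if (!nr_reclaimed)
		node->kswapd_failures++;
}

/* shrink_node(): any reclaim progress revives a dormant kswapd. */
static void reclaim_progress(struct node_model *node)
{
	node->kswapd_failures = 0;
}

/* Hopeless node: skip waking kswapd and stop throttling direct reclaim. */
static int node_is_hopeless(const struct node_model *node)
{
	return node->kswapd_failures >= MAX_RECLAIM_RETRIES;
}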
@@ -2694,10 +2669,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2694 | GFP_KERNEL | __GFP_HARDWALL)) | 2669 | GFP_KERNEL | __GFP_HARDWALL)) |
2695 | continue; | 2670 | continue; |
2696 | 2671 | ||
2697 | if (sc->priority != DEF_PRIORITY && | ||
2698 | !pgdat_reclaimable(zone->zone_pgdat)) | ||
2699 | continue; /* Let kswapd poll it */ | ||
2700 | |||
2701 | /* | 2672 | /* |
2702 | * If we already have plenty of memory free for | 2673 | * If we already have plenty of memory free for |
2703 | * compaction in this zone, don't free any more. | 2674 | * compaction in this zone, don't free any more. |
@@ -2752,6 +2723,25 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) | |||
2752 | sc->gfp_mask = orig_mask; | 2723 | sc->gfp_mask = orig_mask; |
2753 | } | 2724 | } |
2754 | 2725 | ||
2726 | static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) | ||
2727 | { | ||
2728 | struct mem_cgroup *memcg; | ||
2729 | |||
2730 | memcg = mem_cgroup_iter(root_memcg, NULL, NULL); | ||
2731 | do { | ||
2732 | unsigned long refaults; | ||
2733 | struct lruvec *lruvec; | ||
2734 | |||
2735 | if (memcg) | ||
2736 | refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); | ||
2737 | else | ||
2738 | refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); | ||
2739 | |||
2740 | lruvec = mem_cgroup_lruvec(pgdat, memcg); | ||
2741 | lruvec->refaults = refaults; | ||
2742 | } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); | ||
2743 | } | ||
2744 | |||
2755 | /* | 2745 | /* |
2756 | * This is the main entry point to direct page reclaim. | 2746 | * This is the main entry point to direct page reclaim. |
2757 | * | 2747 | * |
@@ -2772,6 +2762,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
2772 | struct scan_control *sc) | 2762 | struct scan_control *sc) |
2773 | { | 2763 | { |
2774 | int initial_priority = sc->priority; | 2764 | int initial_priority = sc->priority; |
2765 | pg_data_t *last_pgdat; | ||
2766 | struct zoneref *z; | ||
2767 | struct zone *zone; | ||
2775 | retry: | 2768 | retry: |
2776 | delayacct_freepages_start(); | 2769 | delayacct_freepages_start(); |
2777 | 2770 | ||
@@ -2798,6 +2791,15 @@ retry: | |||
2798 | sc->may_writepage = 1; | 2791 | sc->may_writepage = 1; |
2799 | } while (--sc->priority >= 0); | 2792 | } while (--sc->priority >= 0); |
2800 | 2793 | ||
2794 | last_pgdat = NULL; | ||
2795 | for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, | ||
2796 | sc->nodemask) { | ||
2797 | if (zone->zone_pgdat == last_pgdat) | ||
2798 | continue; | ||
2799 | last_pgdat = zone->zone_pgdat; | ||
2800 | snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); | ||
2801 | } | ||
2802 | |||
2801 | delayacct_freepages_end(); | 2803 | delayacct_freepages_end(); |
2802 | 2804 | ||
2803 | if (sc->nr_reclaimed) | 2805 | if (sc->nr_reclaimed) |
@@ -2808,16 +2810,17 @@ retry: | |||
2808 | return 1; | 2810 | return 1; |
2809 | 2811 | ||
2810 | /* Untapped cgroup reserves? Don't OOM, retry. */ | 2812 | /* Untapped cgroup reserves? Don't OOM, retry. */ |
2811 | if (!sc->may_thrash) { | 2813 | if (sc->memcg_low_skipped) { |
2812 | sc->priority = initial_priority; | 2814 | sc->priority = initial_priority; |
2813 | sc->may_thrash = 1; | 2815 | sc->memcg_low_reclaim = 1; |
2816 | sc->memcg_low_skipped = 0; | ||
2814 | goto retry; | 2817 | goto retry; |
2815 | } | 2818 | } |
2816 | 2819 | ||
2817 | return 0; | 2820 | return 0; |
2818 | } | 2821 | } |
2819 | 2822 | ||
2820 | static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | 2823 | static bool allow_direct_reclaim(pg_data_t *pgdat) |
2821 | { | 2824 | { |
2822 | struct zone *zone; | 2825 | struct zone *zone; |
2823 | unsigned long pfmemalloc_reserve = 0; | 2826 | unsigned long pfmemalloc_reserve = 0; |
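Taken together with the scan_control hunk at the top of the patch, the retry above amounts to a two-pass policy: cgroups under their memory.low are skipped first, and only if that pass reclaims nothing is the protection lifted. A condensed sketch with illustrative names, not the kernel functions themselves:

#include <stdbool.h>

struct sc_model {
	bool memcg_low_reclaim;		/* second pass: may go below memory.low */
	bool memcg_low_skipped;		/* a protected cgroup was passed over */
	unsigned long nr_reclaimed;
};

/* shrink_node(): honour memory.low unless the OOM retry lifted it. */
static bool skip_protected_cgroup(struct sc_model *sc, bool below_low)
{
	if (below_low && !sc->memcg_low_reclaim) {
		sc->memcg_low_skipped = true;
		return true;
	}
	return false;
}

/* do_try_to_free_pages(): untapped cgroup reserves? Don't OOM, retry. */
static bool retry_for_memcg_low(struct sc_model *sc)
{
	if (!sc->nr_reclaimed && sc->memcg_low_skipped) {
		sc->memcg_low_reclaim = true;
		sc->memcg_low_skipped = false;
		return true;
	}
	return false;
}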
@@ -2825,10 +2828,15 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | |||
2825 | int i; | 2828 | int i; |
2826 | bool wmark_ok; | 2829 | bool wmark_ok; |
2827 | 2830 | ||
2831 | if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) | ||
2832 | return true; | ||
2833 | |||
2828 | for (i = 0; i <= ZONE_NORMAL; i++) { | 2834 | for (i = 0; i <= ZONE_NORMAL; i++) { |
2829 | zone = &pgdat->node_zones[i]; | 2835 | zone = &pgdat->node_zones[i]; |
2830 | if (!managed_zone(zone) || | 2836 | if (!managed_zone(zone)) |
2831 | pgdat_reclaimable_pages(pgdat) == 0) | 2837 | continue; |
2838 | |||
2839 | if (!zone_reclaimable_pages(zone)) | ||
2832 | continue; | 2840 | continue; |
2833 | 2841 | ||
2834 | pfmemalloc_reserve += min_wmark_pages(zone); | 2842 | pfmemalloc_reserve += min_wmark_pages(zone); |
@@ -2905,7 +2913,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | |||
2905 | 2913 | ||
2906 | /* Throttle based on the first usable node */ | 2914 | /* Throttle based on the first usable node */ |
2907 | pgdat = zone->zone_pgdat; | 2915 | pgdat = zone->zone_pgdat; |
2908 | if (pfmemalloc_watermark_ok(pgdat)) | 2916 | if (allow_direct_reclaim(pgdat)) |
2909 | goto out; | 2917 | goto out; |
2910 | break; | 2918 | break; |
2911 | } | 2919 | } |
@@ -2927,14 +2935,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | |||
2927 | */ | 2935 | */ |
2928 | if (!(gfp_mask & __GFP_FS)) { | 2936 | if (!(gfp_mask & __GFP_FS)) { |
2929 | wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, | 2937 | wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, |
2930 | pfmemalloc_watermark_ok(pgdat), HZ); | 2938 | allow_direct_reclaim(pgdat), HZ); |
2931 | 2939 | ||
2932 | goto check_pending; | 2940 | goto check_pending; |
2933 | } | 2941 | } |
2934 | 2942 | ||
2935 | /* Throttle until kswapd wakes the process */ | 2943 | /* Throttle until kswapd wakes the process */ |
2936 | wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, | 2944 | wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, |
2937 | pfmemalloc_watermark_ok(pgdat)); | 2945 | allow_direct_reclaim(pgdat)); |
2938 | 2946 | ||
2939 | check_pending: | 2947 | check_pending: |
2940 | if (fatal_signal_pending(current)) | 2948 | if (fatal_signal_pending(current)) |
@@ -2950,7 +2958,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2950 | unsigned long nr_reclaimed; | 2958 | unsigned long nr_reclaimed; |
2951 | struct scan_control sc = { | 2959 | struct scan_control sc = { |
2952 | .nr_to_reclaim = SWAP_CLUSTER_MAX, | 2960 | .nr_to_reclaim = SWAP_CLUSTER_MAX, |
2953 | .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), | 2961 | .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), |
2954 | .reclaim_idx = gfp_zone(gfp_mask), | 2962 | .reclaim_idx = gfp_zone(gfp_mask), |
2955 | .order = order, | 2963 | .order = order, |
2956 | .nodemask = nodemask, | 2964 | .nodemask = nodemask, |
@@ -3028,9 +3036,10 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |||
3028 | struct zonelist *zonelist; | 3036 | struct zonelist *zonelist; |
3029 | unsigned long nr_reclaimed; | 3037 | unsigned long nr_reclaimed; |
3030 | int nid; | 3038 | int nid; |
3039 | unsigned int noreclaim_flag; | ||
3031 | struct scan_control sc = { | 3040 | struct scan_control sc = { |
3032 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), | 3041 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), |
3033 | .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | | 3042 | .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | |
3034 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), | 3043 | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), |
3035 | .reclaim_idx = MAX_NR_ZONES - 1, | 3044 | .reclaim_idx = MAX_NR_ZONES - 1, |
3036 | .target_mem_cgroup = memcg, | 3045 | .target_mem_cgroup = memcg, |
@@ -3054,9 +3063,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, | |||
3054 | sc.gfp_mask, | 3063 | sc.gfp_mask, |
3055 | sc.reclaim_idx); | 3064 | sc.reclaim_idx); |
3056 | 3065 | ||
3057 | current->flags |= PF_MEMALLOC; | 3066 | noreclaim_flag = memalloc_noreclaim_save(); |
3058 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); | 3067 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); |
3059 | current->flags &= ~PF_MEMALLOC; | 3068 | memalloc_noreclaim_restore(noreclaim_flag); |
3060 | 3069 | ||
3061 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); | 3070 | trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); |
3062 | 3071 | ||
@@ -3076,7 +3085,7 @@ static void age_active_anon(struct pglist_data *pgdat, | |||
3076 | do { | 3085 | do { |
3077 | struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); | 3086 | struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); |
3078 | 3087 | ||
3079 | if (inactive_list_is_low(lruvec, false, sc, true)) | 3088 | if (inactive_list_is_low(lruvec, false, memcg, sc, true)) |
3080 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, | 3089 | shrink_active_list(SWAP_CLUSTER_MAX, lruvec, |
3081 | sc, LRU_ACTIVE_ANON); | 3090 | sc, LRU_ACTIVE_ANON); |
3082 | 3091 | ||
@@ -3084,22 +3093,44 @@ static void age_active_anon(struct pglist_data *pgdat, | |||
3084 | } while (memcg); | 3093 | } while (memcg); |
3085 | } | 3094 | } |
3086 | 3095 | ||
3087 | static bool zone_balanced(struct zone *zone, int order, int classzone_idx) | 3096 | /* |
3097 | * Returns true if there is an eligible zone balanced for the request order | ||
3098 | * and classzone_idx | ||
3099 | */ | ||
3100 | static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) | ||
3088 | { | 3101 | { |
3089 | unsigned long mark = high_wmark_pages(zone); | 3102 | int i; |
3103 | unsigned long mark = -1; | ||
3104 | struct zone *zone; | ||
3090 | 3105 | ||
3091 | if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx)) | 3106 | for (i = 0; i <= classzone_idx; i++) { |
3092 | return false; | 3107 | zone = pgdat->node_zones + i; |
3108 | |||
3109 | if (!managed_zone(zone)) | ||
3110 | continue; | ||
3111 | |||
3112 | mark = high_wmark_pages(zone); | ||
3113 | if (zone_watermark_ok_safe(zone, order, mark, classzone_idx)) | ||
3114 | return true; | ||
3115 | } | ||
3093 | 3116 | ||
3094 | /* | 3117 | /* |
3095 | * If any eligible zone is balanced then the node is not considered | 3118 | * If a node has no populated zone within classzone_idx, it does not |
3096 | * to be congested or dirty | 3119 | * need balancing by definition. This can happen if a zone-restricted |
3120 | * allocation tries to wake a remote kswapd. | ||
3097 | */ | 3121 | */ |
3098 | clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags); | 3122 | if (mark == -1) |
3099 | clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags); | 3123 | return true; |
3100 | clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags); | ||
3101 | 3124 | ||
3102 | return true; | 3125 | return false; |
3126 | } | ||
3127 | |||
3128 | /* Clear pgdat state for congested, dirty or under writeback. */ | ||
3129 | static void clear_pgdat_congested(pg_data_t *pgdat) | ||
3130 | { | ||
3131 | clear_bit(PGDAT_CONGESTED, &pgdat->flags); | ||
3132 | clear_bit(PGDAT_DIRTY, &pgdat->flags); | ||
3133 | clear_bit(PGDAT_WRITEBACK, &pgdat->flags); | ||
3103 | } | 3134 | } |
3104 | 3135 | ||
3105 | /* | 3136 | /* |
@@ -3110,11 +3141,9 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx) | |||
3110 | */ | 3141 | */ |
3111 | static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) | 3142 | static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) |
3112 | { | 3143 | { |
3113 | int i; | ||
3114 | |||
3115 | /* | 3144 | /* |
3116 | * The throttled processes are normally woken up in balance_pgdat() as | 3145 | * The throttled processes are normally woken up in balance_pgdat() as |
3117 | * soon as pfmemalloc_watermark_ok() is true. But there is a potential | 3146 | * soon as allow_direct_reclaim() is true. But there is a potential |
3118 | * race between when kswapd checks the watermarks and a process gets | 3147 | * race between when kswapd checks the watermarks and a process gets |
3119 | * throttled. There is also a potential race if processes get | 3148 | * throttled. There is also a potential race if processes get |
3120 | * throttled, kswapd wakes, a large process exits thereby balancing the | 3149 | * throttled, kswapd wakes, a large process exits thereby balancing the |
@@ -3128,17 +3157,16 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
3128 | if (waitqueue_active(&pgdat->pfmemalloc_wait)) | 3157 | if (waitqueue_active(&pgdat->pfmemalloc_wait)) |
3129 | wake_up_all(&pgdat->pfmemalloc_wait); | 3158 | wake_up_all(&pgdat->pfmemalloc_wait); |
3130 | 3159 | ||
3131 | for (i = 0; i <= classzone_idx; i++) { | 3160 | /* Hopeless node, leave it to direct reclaim */ |
3132 | struct zone *zone = pgdat->node_zones + i; | 3161 | if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) |
3133 | 3162 | return true; | |
3134 | if (!managed_zone(zone)) | ||
3135 | continue; | ||
3136 | 3163 | ||
3137 | if (!zone_balanced(zone, order, classzone_idx)) | 3164 | if (pgdat_balanced(pgdat, order, classzone_idx)) { |
3138 | return false; | 3165 | clear_pgdat_congested(pgdat); |
3166 | return true; | ||
3139 | } | 3167 | } |
3140 | 3168 | ||
3141 | return true; | 3169 | return false; |
3142 | } | 3170 | } |
3143 | 3171 | ||
3144 | /* | 3172 | /* |
@@ -3214,9 +3242,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3214 | count_vm_event(PAGEOUTRUN); | 3242 | count_vm_event(PAGEOUTRUN); |
3215 | 3243 | ||
3216 | do { | 3244 | do { |
3245 | unsigned long nr_reclaimed = sc.nr_reclaimed; | ||
3217 | bool raise_priority = true; | 3246 | bool raise_priority = true; |
3218 | 3247 | ||
3219 | sc.nr_reclaimed = 0; | ||
3220 | sc.reclaim_idx = classzone_idx; | 3248 | sc.reclaim_idx = classzone_idx; |
3221 | 3249 | ||
3222 | /* | 3250 | /* |
@@ -3241,23 +3269,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3241 | } | 3269 | } |
3242 | 3270 | ||
3243 | /* | 3271 | /* |
3244 | * Only reclaim if there are no eligible zones. Check from | 3272 | * Only reclaim if there are no eligible zones. Note that |
3245 | * high to low zone as allocations prefer higher zones. | 3273 | * sc.reclaim_idx is not used as buffer_heads_over_limit may |
3246 | * Scanning from low to high zone would allow congestion to be | 3274 | * have adjusted it. |
3247 | * cleared during a very small window when a small low | ||
3248 | * zone was balanced even under extreme pressure when the | ||
3249 | * overall node may be congested. Note that sc.reclaim_idx | ||
3250 | * is not used as buffer_heads_over_limit may have adjusted | ||
3251 | * it. | ||
3252 | */ | 3275 | */ |
3253 | for (i = classzone_idx; i >= 0; i--) { | 3276 | if (pgdat_balanced(pgdat, sc.order, classzone_idx)) |
3254 | zone = pgdat->node_zones + i; | 3277 | goto out; |
3255 | if (!managed_zone(zone)) | ||
3256 | continue; | ||
3257 | |||
3258 | if (zone_balanced(zone, sc.order, classzone_idx)) | ||
3259 | goto out; | ||
3260 | } | ||
3261 | 3278 | ||
3262 | /* | 3279 | /* |
3263 | * Do some background aging of the anon list, to give | 3280 | * Do some background aging of the anon list, to give |
@@ -3271,7 +3288,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3271 | * If we're getting trouble reclaiming, start doing writepage | 3288 | * If we're getting trouble reclaiming, start doing writepage |
3272 | * even in laptop mode. | 3289 | * even in laptop mode. |
3273 | */ | 3290 | */ |
3274 | if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat)) | 3291 | if (sc.priority < DEF_PRIORITY - 2) |
3275 | sc.may_writepage = 1; | 3292 | sc.may_writepage = 1; |
3276 | 3293 | ||
3277 | /* Call soft limit reclaim before calling shrink_node. */ | 3294 | /* Call soft limit reclaim before calling shrink_node. */ |
@@ -3295,7 +3312,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3295 | * able to safely make forward progress. Wake them | 3312 | * able to safely make forward progress. Wake them |
3296 | */ | 3313 | */ |
3297 | if (waitqueue_active(&pgdat->pfmemalloc_wait) && | 3314 | if (waitqueue_active(&pgdat->pfmemalloc_wait) && |
3298 | pfmemalloc_watermark_ok(pgdat)) | 3315 | allow_direct_reclaim(pgdat)) |
3299 | wake_up_all(&pgdat->pfmemalloc_wait); | 3316 | wake_up_all(&pgdat->pfmemalloc_wait); |
3300 | 3317 | ||
3301 | /* Check if kswapd should be suspending */ | 3318 | /* Check if kswapd should be suspending */ |
@@ -3306,11 +3323,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) | |||
3306 | * Raise priority if scanning rate is too low or there was no | 3323 | * Raise priority if scanning rate is too low or there was no |
3307 | * progress in reclaiming pages | 3324 | * progress in reclaiming pages |
3308 | */ | 3325 | */ |
3309 | if (raise_priority || !sc.nr_reclaimed) | 3326 | nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; |
3327 | if (raise_priority || !nr_reclaimed) | ||
3310 | sc.priority--; | 3328 | sc.priority--; |
3311 | } while (sc.priority >= 1); | 3329 | } while (sc.priority >= 1); |
3312 | 3330 | ||
3331 | if (!sc.nr_reclaimed) | ||
3332 | pgdat->kswapd_failures++; | ||
3333 | |||
3313 | out: | 3334 | out: |
3335 | snapshot_refaults(NULL, pgdat); | ||
3314 | /* | 3336 | /* |
3315 | * Return the order kswapd stopped reclaiming at as | 3337 | * Return the order kswapd stopped reclaiming at as |
3316 | * prepare_kswapd_sleep() takes it into account. If another caller | 3338 | * prepare_kswapd_sleep() takes it into account. If another caller |
@@ -3320,6 +3342,22 @@ out: | |||
3320 | return sc.order; | 3342 | return sc.order; |
3321 | } | 3343 | } |
3322 | 3344 | ||
3345 | /* | ||
3346 | * pgdat->kswapd_classzone_idx is the highest zone index that a recent | ||
3347 | * allocation request woke kswapd for. When kswapd has not woken recently, | ||
3348 | * the value is MAX_NR_ZONES which is not a valid index. This compares a | ||
3349 | * given classzone and returns it or the highest classzone index kswapd | ||
3350 | * was recently woke for. | ||
3351 | */ | ||
3352 | static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat, | ||
3353 | enum zone_type classzone_idx) | ||
3354 | { | ||
3355 | if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES) | ||
3356 | return classzone_idx; | ||
3357 | |||
3358 | return max(pgdat->kswapd_classzone_idx, classzone_idx); | ||
3359 | } | ||
3360 | |||
3323 | static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, | 3361 | static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, |
3324 | unsigned int classzone_idx) | 3362 | unsigned int classzone_idx) |
3325 | { | 3363 | { |
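The helper added above distinguishes "no wakeup pending" (the MAX_NR_ZONES sentinel) from a real request. A trivial standalone model, with the sentinel value as an assumption:

#define MODEL_NR_ZONES 4		/* stands in for MAX_NR_ZONES */

/* Return the pending classzone request, or the caller's if none is pending. */
static int kswapd_classzone_idx_model(int stored_idx, int requested_idx)
{
	if (stored_idx == MODEL_NR_ZONES)
		return requested_idx;
	return stored_idx > requested_idx ? stored_idx : requested_idx;
}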
@@ -3331,7 +3369,13 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o | |||
3331 | 3369 | ||
3332 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 3370 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
3333 | 3371 | ||
3334 | /* Try to sleep for a short interval */ | 3372 | /* |
3373 | * Try to sleep for a short interval. Note that kcompactd will only be | ||
3374 | * woken if it is possible to sleep for a short interval. This is | ||
3375 | * deliberate on the assumption that if reclaim cannot keep an | ||
3376 | * eligible zone balanced that it's also unlikely that compaction will | ||
3377 | * succeed. | ||
3378 | */ | ||
3335 | if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { | 3379 | if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { |
3336 | /* | 3380 | /* |
3337 | * Compaction records what page blocks it recently failed to | 3381 | * Compaction records what page blocks it recently failed to |
@@ -3355,7 +3399,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o | |||
3355 | * the previous request that slept prematurely. | 3399 | * the previous request that slept prematurely. |
3356 | */ | 3400 | */ |
3357 | if (remaining) { | 3401 | if (remaining) { |
3358 | pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); | 3402 | pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); |
3359 | pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order); | 3403 | pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order); |
3360 | } | 3404 | } |
3361 | 3405 | ||
@@ -3409,7 +3453,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o | |||
3409 | */ | 3453 | */ |
3410 | static int kswapd(void *p) | 3454 | static int kswapd(void *p) |
3411 | { | 3455 | { |
3412 | unsigned int alloc_order, reclaim_order, classzone_idx; | 3456 | unsigned int alloc_order, reclaim_order; |
3457 | unsigned int classzone_idx = MAX_NR_ZONES - 1; | ||
3413 | pg_data_t *pgdat = (pg_data_t*)p; | 3458 | pg_data_t *pgdat = (pg_data_t*)p; |
3414 | struct task_struct *tsk = current; | 3459 | struct task_struct *tsk = current; |
3415 | 3460 | ||
@@ -3439,20 +3484,23 @@ static int kswapd(void *p) | |||
3439 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; | 3484 | tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; |
3440 | set_freezable(); | 3485 | set_freezable(); |
3441 | 3486 | ||
3442 | pgdat->kswapd_order = alloc_order = reclaim_order = 0; | 3487 | pgdat->kswapd_order = 0; |
3443 | pgdat->kswapd_classzone_idx = classzone_idx = 0; | 3488 | pgdat->kswapd_classzone_idx = MAX_NR_ZONES; |
3444 | for ( ; ; ) { | 3489 | for ( ; ; ) { |
3445 | bool ret; | 3490 | bool ret; |
3446 | 3491 | ||
3492 | alloc_order = reclaim_order = pgdat->kswapd_order; | ||
3493 | classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); | ||
3494 | |||
3447 | kswapd_try_sleep: | 3495 | kswapd_try_sleep: |
3448 | kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, | 3496 | kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, |
3449 | classzone_idx); | 3497 | classzone_idx); |
3450 | 3498 | ||
3451 | /* Read the new order and classzone_idx */ | 3499 | /* Read the new order and classzone_idx */ |
3452 | alloc_order = reclaim_order = pgdat->kswapd_order; | 3500 | alloc_order = reclaim_order = pgdat->kswapd_order; |
3453 | classzone_idx = pgdat->kswapd_classzone_idx; | 3501 | classzone_idx = kswapd_classzone_idx(pgdat, 0); |
3454 | pgdat->kswapd_order = 0; | 3502 | pgdat->kswapd_order = 0; |
3455 | pgdat->kswapd_classzone_idx = 0; | 3503 | pgdat->kswapd_classzone_idx = MAX_NR_ZONES; |
3456 | 3504 | ||
3457 | ret = try_to_freeze(); | 3505 | ret = try_to_freeze(); |
3458 | if (kthread_should_stop()) | 3506 | if (kthread_should_stop()) |
@@ -3478,9 +3526,6 @@ kswapd_try_sleep: | |||
3478 | reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); | 3526 | reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); |
3479 | if (reclaim_order < alloc_order) | 3527 | if (reclaim_order < alloc_order) |
3480 | goto kswapd_try_sleep; | 3528 | goto kswapd_try_sleep; |
3481 | |||
3482 | alloc_order = reclaim_order = pgdat->kswapd_order; | ||
3483 | classzone_idx = pgdat->kswapd_classzone_idx; | ||
3484 | } | 3529 | } |
3485 | 3530 | ||
3486 | tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); | 3531 | tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); |
@@ -3496,7 +3541,6 @@ kswapd_try_sleep: | |||
3496 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) | 3541 | void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) |
3497 | { | 3542 | { |
3498 | pg_data_t *pgdat; | 3543 | pg_data_t *pgdat; |
3499 | int z; | ||
3500 | 3544 | ||
3501 | if (!managed_zone(zone)) | 3545 | if (!managed_zone(zone)) |
3502 | return; | 3546 | return; |
@@ -3504,22 +3548,20 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) | |||
3504 | if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) | 3548 | if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) |
3505 | return; | 3549 | return; |
3506 | pgdat = zone->zone_pgdat; | 3550 | pgdat = zone->zone_pgdat; |
3507 | pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); | 3551 | pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, |
3552 | classzone_idx); | ||
3508 | pgdat->kswapd_order = max(pgdat->kswapd_order, order); | 3553 | pgdat->kswapd_order = max(pgdat->kswapd_order, order); |
3509 | if (!waitqueue_active(&pgdat->kswapd_wait)) | 3554 | if (!waitqueue_active(&pgdat->kswapd_wait)) |
3510 | return; | 3555 | return; |
3511 | 3556 | ||
3512 | /* Only wake kswapd if all zones are unbalanced */ | 3557 | /* Hopeless node, leave it to direct reclaim */ |
3513 | for (z = 0; z <= classzone_idx; z++) { | 3558 | if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) |
3514 | zone = pgdat->node_zones + z; | 3559 | return; |
3515 | if (!managed_zone(zone)) | ||
3516 | continue; | ||
3517 | 3560 | ||
3518 | if (zone_balanced(zone, order, classzone_idx)) | 3561 | if (pgdat_balanced(pgdat, order, classzone_idx)) |
3519 | return; | 3562 | return; |
3520 | } | ||
3521 | 3563 | ||
3522 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | 3564 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order); |
3523 | wake_up_interruptible(&pgdat->kswapd_wait); | 3565 | wake_up_interruptible(&pgdat->kswapd_wait); |
3524 | } | 3566 | } |
3525 | 3567 | ||
@@ -3548,8 +3590,9 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | |||
3548 | struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); | 3590 | struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); |
3549 | struct task_struct *p = current; | 3591 | struct task_struct *p = current; |
3550 | unsigned long nr_reclaimed; | 3592 | unsigned long nr_reclaimed; |
3593 | unsigned int noreclaim_flag; | ||
3551 | 3594 | ||
3552 | p->flags |= PF_MEMALLOC; | 3595 | noreclaim_flag = memalloc_noreclaim_save(); |
3553 | lockdep_set_current_reclaim_state(sc.gfp_mask); | 3596 | lockdep_set_current_reclaim_state(sc.gfp_mask); |
3554 | reclaim_state.reclaimed_slab = 0; | 3597 | reclaim_state.reclaimed_slab = 0; |
3555 | p->reclaim_state = &reclaim_state; | 3598 | p->reclaim_state = &reclaim_state; |
@@ -3558,7 +3601,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) | |||
3558 | 3601 | ||
3559 | p->reclaim_state = NULL; | 3602 | p->reclaim_state = NULL; |
3560 | lockdep_clear_current_reclaim_state(); | 3603 | lockdep_clear_current_reclaim_state(); |
3561 | p->flags &= ~PF_MEMALLOC; | 3604 | memalloc_noreclaim_restore(noreclaim_flag); |
3562 | 3605 | ||
3563 | return nr_reclaimed; | 3606 | return nr_reclaimed; |
3564 | } | 3607 | } |
@@ -3723,9 +3766,10 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in | |||
3723 | struct task_struct *p = current; | 3766 | struct task_struct *p = current; |
3724 | struct reclaim_state reclaim_state; | 3767 | struct reclaim_state reclaim_state; |
3725 | int classzone_idx = gfp_zone(gfp_mask); | 3768 | int classzone_idx = gfp_zone(gfp_mask); |
3769 | unsigned int noreclaim_flag; | ||
3726 | struct scan_control sc = { | 3770 | struct scan_control sc = { |
3727 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), | 3771 | .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), |
3728 | .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), | 3772 | .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), |
3729 | .order = order, | 3773 | .order = order, |
3730 | .priority = NODE_RECLAIM_PRIORITY, | 3774 | .priority = NODE_RECLAIM_PRIORITY, |
3731 | .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), | 3775 | .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), |
@@ -3740,7 +3784,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in | |||
3740 | * and we also need to be able to write out pages for RECLAIM_WRITE | 3784 | * and we also need to be able to write out pages for RECLAIM_WRITE |
3741 | * and RECLAIM_UNMAP. | 3785 | * and RECLAIM_UNMAP. |
3742 | */ | 3786 | */ |
3743 | p->flags |= PF_MEMALLOC | PF_SWAPWRITE; | 3787 | noreclaim_flag = memalloc_noreclaim_save(); |
3788 | p->flags |= PF_SWAPWRITE; | ||
3744 | lockdep_set_current_reclaim_state(gfp_mask); | 3789 | lockdep_set_current_reclaim_state(gfp_mask); |
3745 | reclaim_state.reclaimed_slab = 0; | 3790 | reclaim_state.reclaimed_slab = 0; |
3746 | p->reclaim_state = &reclaim_state; | 3791 | p->reclaim_state = &reclaim_state; |
@@ -3756,7 +3801,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in | |||
3756 | } | 3801 | } |
3757 | 3802 | ||
3758 | p->reclaim_state = NULL; | 3803 | p->reclaim_state = NULL; |
3759 | current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); | 3804 | current->flags &= ~PF_SWAPWRITE; |
3805 | memalloc_noreclaim_restore(noreclaim_flag); | ||
3760 | lockdep_clear_current_reclaim_state(); | 3806 | lockdep_clear_current_reclaim_state(); |
3761 | return sc.nr_reclaimed >= nr_pages; | 3807 | return sc.nr_reclaimed >= nr_pages; |
3762 | } | 3808 | } |
@@ -3779,9 +3825,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) | |||
3779 | sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) | 3825 | sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) |
3780 | return NODE_RECLAIM_FULL; | 3826 | return NODE_RECLAIM_FULL; |
3781 | 3827 | ||
3782 | if (!pgdat_reclaimable(pgdat)) | ||
3783 | return NODE_RECLAIM_FULL; | ||
3784 | |||
3785 | /* | 3828 | /* |
3786 | * Do not scan if the allocation should not be delayed. | 3829 | * Do not scan if the allocation should not be delayed. |
3787 | */ | 3830 | */ |