author     Johannes Weiner <hannes@cmpxchg.org>          2019-04-18 20:50:34 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org> 2019-04-19 12:46:05 -0400
commit     3b991208b897f52507168374033771a984b947b1 (patch)
tree       b2a8c30bb8ee998e0d98437fb6d8bdc8e3436121
parent     1a9f219157b22d0ffb340a9c5f431afd02cd2cf3 (diff)
mm: fix inactive list balancing between NUMA nodes and cgroups
During !CONFIG_MEMCG reclaim, we expand the inactive list size if it's
thrashing on the node that is about to be reclaimed.  But when cgroups
are enabled, we suddenly ignore the node scope and use the cgroup scope
only.  The result is that pressure bleeds between NUMA nodes depending
on whether cgroups are merely compiled into Linux.  This behavioral
difference is unexpected and undesirable.

When the refault adaptivity of the inactive list was first introduced,
there were no statistics at the lruvec level - the intersection of node
and memcg - so it was better than nothing.  But now that we have that
infrastructure, use lruvec_page_state() to make the list balancing
decision always NUMA aware.

[hannes@cmpxchg.org: fix bisection hole]
Link: http://lkml.kernel.org/r/20190417155241.GB23013@cmpxchg.org
Link: http://lkml.kernel.org/r/20190412144438.2645-1-hannes@cmpxchg.org
Fixes: 2a2e48854d70 ("mm: vmscan: fix IO/refault regression in cache workingset transition")
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
 mm/vmscan.c | 29 +++++++++--------------------
 1 file changed, 9 insertions(+), 20 deletions(-)
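For readers skimming the patch: the substance of the change is that inactive_list_is_low() and snapshot_refaults() now read the refault counter through lruvec_page_state() (the node x memcg intersection) instead of choosing between memcg_page_state() and node_page_state(). The toy userspace program below is not kernel code; it is a minimal sketch, with made-up numbers and a hypothetical struct toy_lruvec, of why a cgroup-wide refault counter makes an idle node react to thrashing on a different node, while a node-scoped (lruvec) counter keeps the decision local.

/*
 * Toy userspace model (not kernel code): contrasts a cgroup-wide
 * refault signal with a node-scoped one for the "disable active list
 * protection" decision that inactive_list_is_low() makes.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 2

struct toy_lruvec {
	unsigned long workingset_activate;	/* refaults seen on this node */
	unsigned long refaults_snapshot;	/* value at the last reclaim cycle */
};

/* Node-scoped check, modelling the new lruvec_page_state() behaviour. */
static bool node_sees_refaults(const struct toy_lruvec *lruvec)
{
	return lruvec->workingset_activate != lruvec->refaults_snapshot;
}

int main(void)
{
	struct toy_lruvec nodes[NR_NODES] = {
		{ .workingset_activate = 100, .refaults_snapshot = 100 }, /* idle node */
		{ .workingset_activate = 250, .refaults_snapshot = 100 }, /* thrashing node */
	};
	unsigned long cgroup_total = 0, cgroup_snapshot = 0;

	/* A cgroup-wide counter sums the refaults of all nodes. */
	for (int n = 0; n < NR_NODES; n++) {
		cgroup_total += nodes[n].workingset_activate;
		cgroup_snapshot += nodes[n].refaults_snapshot;
	}

	for (int n = 0; n < NR_NODES; n++) {
		/* cgroup-scoped signal: both nodes appear to be refaulting */
		bool cgroup_scoped = cgroup_total != cgroup_snapshot;
		/* node-scoped signal: only the thrashing node reacts */
		bool node_scoped = node_sees_refaults(&nodes[n]);

		printf("node %d: cgroup-scoped=%d node-scoped=%d\n",
		       n, cgroup_scoped, node_scoped);
	}
	return 0;
}

With the cgroup-scoped signal, node 0 would also drop its active-list protection even though it saw no refaults at all; the node-scoped signal confines that reaction to node 1, which is the behaviour the patch restores for the cgroup-enabled case.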
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a5ad0b35ab8e..a815f73ee4d5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2176,7 +2176,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
  *   10TB     320        32GB
  */
 static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
-				 struct mem_cgroup *memcg,
 				 struct scan_control *sc, bool actual_reclaim)
 {
 	enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
@@ -2197,16 +2196,12 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 	inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
 	active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
 
-	if (memcg)
-		refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
-	else
-		refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
-
 	/*
 	 * When refaults are being observed, it means a new workingset
 	 * is being established. Disable active list protection to get
 	 * rid of the stale workingset quickly.
 	 */
+	refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
 	if (file && actual_reclaim && lruvec->refaults != refaults) {
 		inactive_ratio = 0;
 	} else {
@@ -2227,12 +2222,10 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 }
 
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
-				 struct lruvec *lruvec, struct mem_cgroup *memcg,
-				 struct scan_control *sc)
+				 struct lruvec *lruvec, struct scan_control *sc)
 {
 	if (is_active_lru(lru)) {
-		if (inactive_list_is_low(lruvec, is_file_lru(lru),
-					 memcg, sc, true))
+		if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
 			shrink_active_list(nr_to_scan, lruvec, sc, lru);
 		return 0;
 	}
@@ -2332,7 +2325,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 			 * anonymous pages on the LRU in eligible zones.
 			 * Otherwise, the small LRU gets thrashed.
 			 */
-			if (!inactive_list_is_low(lruvec, false, memcg, sc, false) &&
+			if (!inactive_list_is_low(lruvec, false, sc, false) &&
 			    lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
 					>> sc->priority) {
 				scan_balance = SCAN_ANON;
@@ -2350,7 +2343,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * lruvec even if it has plenty of old anonymous pages unless the
 	 * system is under heavy pressure.
 	 */
-	if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
+	if (!inactive_list_is_low(lruvec, true, sc, false) &&
 	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
 		scan_balance = SCAN_FILE;
 		goto out;
@@ -2503,7 +2496,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
 				nr[lru] -= nr_to_scan;
 
 				nr_reclaimed += shrink_list(lru, nr_to_scan,
-							    lruvec, memcg, sc);
+							    lruvec, sc);
 			}
 		}
 
@@ -2570,7 +2563,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
 	 * Even if we did not try to evict anon pages at all, we want to
 	 * rebalance the anon lru active/inactive ratio.
 	 */
-	if (inactive_list_is_low(lruvec, false, memcg, sc, true))
+	if (inactive_list_is_low(lruvec, false, sc, true))
 		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 				   sc, LRU_ACTIVE_ANON);
 }
@@ -2969,12 +2962,8 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
 		unsigned long refaults;
 		struct lruvec *lruvec;
 
-		if (memcg)
-			refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
-		else
-			refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
-
 		lruvec = mem_cgroup_lruvec(pgdat, memcg);
+		refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
 		lruvec->refaults = refaults;
 	} while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
 }
@@ -3339,7 +3328,7 @@ static void age_active_anon(struct pglist_data *pgdat,
 	do {
 		struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
 
-		if (inactive_list_is_low(lruvec, false, memcg, sc, true))
+		if (inactive_list_is_low(lruvec, false, sc, true))
 			shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 					   sc, LRU_ACTIVE_ANON);
 