author		Tejun Heo <tj@kernel.org>	2015-05-22 18:23:36 -0400
committer	Jens Axboe <axboe@fb.com>	2015-06-02 10:38:13 -0400
commit		97c9341f727105c29478da19f1687b0e0a917256 (patch)
tree		ea533ba46133970f166b1dd280019a31516a10cd /mm/vmscan.c
parent		c2aa723a6093633ae4ec15b08a4db276643cab3e (diff)
mm: vmscan: disable memcg direct reclaim stalling if cgroup writeback support is in use
Because writeback wasn't cgroup aware before, the usual dirty
throttling mechanism in balance_dirty_pages() didn't work for
processes under a memcg limit. The writeback path didn't know how
much memory was available or how fast dirty pages were being written
out for a given memcg, and balance_dirty_pages() had no measure of IO
back pressure for the memcg.
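
As a rough illustration only (not kernel code; every name below is
invented for the example), the problem is that the throttling decision
never saw the memcg's own dirty state:

	#include <stdbool.h>

	struct dirty_counters {
		unsigned long global_dirty;	/* dirty pages, whole system */
		unsigned long global_thresh;	/* global dirty threshold */
		unsigned long memcg_dirty;	/* dirty pages charged to the memcg */
		unsigned long memcg_thresh;	/* hypothetical per-memcg threshold */
	};

	/*
	 * Pre-cgroup-writeback view: only the global numbers are consulted,
	 * so a writer confined to a small memcg is never slowed down even
	 * when most of its own memory is dirty.
	 */
	static bool throttle_global_only(const struct dirty_counters *c)
	{
		return c->global_dirty > c->global_thresh;
	}

	/* A memcg-aware check also has to look at the memcg's own pressure. */
	static bool throttle_memcg_aware(const struct dirty_counters *c)
	{
		return c->global_dirty > c->global_thresh ||
		       c->memcg_dirty > c->memcg_thresh;
	}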
To work around the issue, memcg implemented an ad-hoc dirty throttling
mechanism in the direct reclaim path: stall on pages under writeback
that are encountered during the direct reclaim scan. This is rather
ugly and crude - it has none of the configurability, fairness, or
bandwidth-proportional distribution of the normal path.
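
Reduced to its decision logic, that ad-hoc throttle looks roughly like
the sketch below (a simplified standalone rendering, not the kernel
code; the real logic lives in shrink_page_list() and follows the case
numbering visible in the diff further down). Before this patch the
gate was global_reclaim(), so memcg reclaim that fell past case 2
ended up stalling on the page:

	#include <stdbool.h>

	enum wb_action {
		WB_STALL_CALLER,	/* case 1: kswapd notes the backlog, caller may stall later */
		WB_MARK_AND_SKIP,	/* case 2: mark for immediate reclaim, keep scanning */
		WB_WAIT_ON_PAGE,	/* case 3: legacy memcg, stall on this page's writeback */
	};

	static enum wb_action writeback_page_action(bool is_kswapd,
						    bool page_reclaim,	 /* PageReclaim(page) */
						    bool zone_writeback, /* ZONE_WRITEBACK set */
						    bool sane_reclaim,	 /* sane_reclaim(sc) */
						    bool may_do_io)	 /* sc->gfp_mask & __GFP_IO */
	{
		if (is_kswapd && page_reclaim && zone_writeback)
			return WB_STALL_CALLER;
		if (sane_reclaim || !page_reclaim || !may_do_io)
			return WB_MARK_AND_SKIP;
		return WB_WAIT_ON_PAGE;		/* the ad-hoc direct stall */
	}

With this patch, sane_reclaim() returns true once cgroup writeback is
handling the memcg, so case 3 is no longer reachable there and
balance_dirty_pages() does the throttling instead; legacy hierarchies
keep the old behaviour.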
The previous patches implemented proper memcg-aware dirty throttling
for the case where cgroup writeback is in use, making the ad-hoc
mechanism unnecessary. This patch disables direct reclaim stalling in
that case.
Note: I disabled the parts which seemed obvious and it behaves fine
while testing but my understanding of this code path is
rudimentary and it's quite possible that I got something wrong.
      Please let me know if I got something wrong or if more
      global_reclaim() sites should be updated.
v2: The original patch removed the direct stalling mechanism which
breaks legacy hierarchies. Conditionalize instead of removing.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--	mm/vmscan.c	51
1 files changed, 41 insertions, 10 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f46339870147..8cb16ebaf3ed 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -154,11 +154,42 @@ static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
 }
+
+/**
+ * sane_reclaim - is the usual dirty throttling mechanism operational?
+ * @sc: scan_control in question
+ *
+ * The normal page dirty throttling mechanism in balance_dirty_pages() is
+ * completely broken with the legacy memcg and direct stalling in
+ * shrink_page_list() is used for throttling instead, which lacks all the
+ * niceties such as fairness, adaptive pausing, bandwidth proportional
+ * allocation and configurability.
+ *
+ * This function tests whether the vmscan currently in progress can assume
+ * that the normal dirty throttling mechanism is operational.
+ */
+static bool sane_reclaim(struct scan_control *sc)
+{
+	struct mem_cgroup *memcg = sc->target_mem_cgroup;
+
+	if (!memcg)
+		return true;
+#ifdef CONFIG_CGROUP_WRITEBACK
+	if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+		return true;
+#endif
+	return false;
+}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
 	return true;
 }
+
+static bool sane_reclaim(struct scan_control *sc)
+{
+	return true;
+}
 #endif
 
 static unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -941,10 +972,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			 * note that the LRU is being scanned too quickly and the
 			 * caller can stall after page list has been processed.
 			 *
-			 * 2) Global reclaim encounters a page, memcg encounters a
-			 *    page that is not marked for immediate reclaim or
-			 *    the caller does not have __GFP_IO. In this case mark
-			 *    the page for immediate reclaim and continue scanning.
+			 * 2) Global or new memcg reclaim encounters a page that is
+			 *    not marked for immediate reclaim or the caller does not
+			 *    have __GFP_IO. In this case mark the page for immediate
+			 *    reclaim and continue scanning.
 			 *
 			 * __GFP_IO is checked because a loop driver thread might
 			 * enter reclaim, and deadlock if it waits on a page for
@@ -958,7 +989,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
 			 * may_enter_fs here is liable to OOM on them.
 			 *
-			 * 3) memcg encounters a page that is not already marked
+			 * 3) Legacy memcg encounters a page that is not already marked
 			 *    PageReclaim. memcg does not have any dirty pages
 			 *    throttling so we could easily OOM just because too many
 			 *    pages are in writeback and there is nothing else to
@@ -973,7 +1004,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				goto keep_locked;
 
 			/* Case 2 above */
-			} else if (global_reclaim(sc) ||
+			} else if (sane_reclaim(sc) ||
 			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
 				/*
 				 * This is slightly racy - end_page_writeback()
@@ -1422,7 +1453,7 @@ static int too_many_isolated(struct zone *zone, int file,
 	if (current_is_kswapd())
 		return 0;
 
-	if (!global_reclaim(sc))
+	if (!sane_reclaim(sc))
 		return 0;
 
 	if (file) {
@@ -1614,10 +1645,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 		set_bit(ZONE_WRITEBACK, &zone->flags);
 
 	/*
-	 * memcg will stall in page writeback so only consider forcibly
-	 * stalling for global reclaim
+	 * Legacy memcg will stall in page writeback so avoid forcibly
+	 * stalling here.
 	 */
-	if (global_reclaim(sc)) {
+	if (sane_reclaim(sc)) {
 		/*
 		 * Tag a zone as congested if all the dirty pages scanned were
 		 * backed by a congested BDI and wait_iff_congested will stall.