author	Tejun Heo <tj@kernel.org>	2015-05-22 18:23:36 -0400
committer	Jens Axboe <axboe@fb.com>	2015-06-02 10:38:13 -0400
commit	97c9341f727105c29478da19f1687b0e0a917256 (patch)
tree	ea533ba46133970f166b1dd280019a31516a10cd	/mm/vmscan.c
parent	c2aa723a6093633ae4ec15b08a4db276643cab3e (diff)
mm: vmscan: disable memcg direct reclaim stalling if cgroup writeback support is in use
Because writeback wasn't cgroup aware before, the usual dirty throttling mechanism in balance_dirty_pages() didn't work for processes under memcg limit. The writeback path didn't know how much memory is available or how fast the dirty pages are being written out for a given memcg, and balance_dirty_pages() didn't have any measure of IO back pressure for the memcg.

To work around the issue, memcg implemented an ad-hoc dirty throttling mechanism in the direct reclaim path by stalling on pages under writeback which are encountered during direct reclaim scan. This is rather ugly and crude: it has none of the configurability, fairness, or bandwidth-proportional distribution of the normal path.

The previous patches implemented proper memcg aware dirty throttling when cgroup writeback is in use, making the ad-hoc mechanism unnecessary. This patch disables direct reclaim stalling in that case.

Note: I disabled the parts which seemed obvious and it behaves fine while testing, but my understanding of this code path is rudimentary and it's quite possible that I got something wrong. Please let me know if I got something wrong or if more global_reclaim() sites should be updated.

v2: The original patch removed the direct stalling mechanism, which breaks legacy hierarchies. Conditionalize instead of removing.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Jan Kara <jack@suse.cz>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
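As a reading aid before the diff: below is a minimal, standalone sketch of the gating decision this patch introduces, not the kernel code itself. struct scan_control is reduced to two illustrative fields, and on_default_hierarchy is a hypothetical stand-in for the cgroup_on_dfl()/CONFIG_CGROUP_WRITEBACK test done by the real sane_reclaim() shown in the diff.

/*
 * Illustrative sketch only. Models the patch's rule: the ad-hoc
 * direct-reclaim stall is kept only for legacy (cgroup v1) memcg
 * reclaim; global reclaim and cgroup-writeback aware (v2) memcg
 * reclaim rely on balance_dirty_pages() throttling instead.
 */
#include <stdbool.h>
#include <stdio.h>

struct scan_control {
	void *target_mem_cgroup;	/* NULL => global reclaim */
	bool on_default_hierarchy;	/* stand-in for cgroup_on_dfl() */
};

static bool global_reclaim(struct scan_control *sc)
{
	return !sc->target_mem_cgroup;
}

static bool sane_reclaim(struct scan_control *sc)
{
	if (global_reclaim(sc))
		return true;
	/* With cgroup writeback, v2 memcgs get proper dirty throttling. */
	return sc->on_default_hierarchy;
}

int main(void)
{
	int dummy_memcg;
	struct scan_control global_rc = { NULL, false };
	struct scan_control memcg_v1  = { &dummy_memcg, false };
	struct scan_control memcg_v2  = { &dummy_memcg, true };

	/* Only the legacy memcg case still needs the direct reclaim stall. */
	printf("global  : stall=%d\n", !sane_reclaim(&global_rc));	/* 0 */
	printf("memcg v1: stall=%d\n", !sane_reclaim(&memcg_v1));	/* 1 */
	printf("memcg v2: stall=%d\n", !sane_reclaim(&memcg_v2));	/* 0 */
	return 0;
}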
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--	mm/vmscan.c	51
1 file changed, 41 insertions(+), 10 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f46339870147..8cb16ebaf3ed 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -154,11 +154,42 @@ static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
 }
+
+/**
+ * sane_reclaim - is the usual dirty throttling mechanism operational?
+ * @sc: scan_control in question
+ *
+ * The normal page dirty throttling mechanism in balance_dirty_pages() is
+ * completely broken with the legacy memcg and direct stalling in
+ * shrink_page_list() is used for throttling instead, which lacks all the
+ * niceties such as fairness, adaptive pausing, bandwidth proportional
+ * allocation and configurability.
+ *
+ * This function tests whether the vmscan currently in progress can assume
+ * that the normal dirty throttling mechanism is operational.
+ */
+static bool sane_reclaim(struct scan_control *sc)
+{
+	struct mem_cgroup *memcg = sc->target_mem_cgroup;
+
+	if (!memcg)
+		return true;
+#ifdef CONFIG_CGROUP_WRITEBACK
+	if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+		return true;
+#endif
+	return false;
+}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
 	return true;
 }
+
+static bool sane_reclaim(struct scan_control *sc)
+{
+	return true;
+}
 #endif
 
 static unsigned long zone_reclaimable_pages(struct zone *zone)
@@ -941,10 +972,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			 * note that the LRU is being scanned too quickly and the
 			 * caller can stall after page list has been processed.
 			 *
-			 * 2) Global reclaim encounters a page, memcg encounters a
-			 *    page that is not marked for immediate reclaim or
-			 *    the caller does not have __GFP_IO. In this case mark
-			 *    the page for immediate reclaim and continue scanning.
+			 * 2) Global or new memcg reclaim encounters a page that is
+			 *    not marked for immediate reclaim or the caller does not
+			 *    have __GFP_IO. In this case mark the page for immediate
+			 *    reclaim and continue scanning.
 			 *
 			 * __GFP_IO is checked because a loop driver thread might
 			 * enter reclaim, and deadlock if it waits on a page for
@@ -958,7 +989,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
 			 * may_enter_fs here is liable to OOM on them.
 			 *
-			 * 3) memcg encounters a page that is not already marked
+			 * 3) Legacy memcg encounters a page that is not already marked
 			 *    PageReclaim. memcg does not have any dirty pages
 			 *    throttling so we could easily OOM just because too many
 			 *    pages are in writeback and there is nothing else to
@@ -973,7 +1004,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				goto keep_locked;
 
 			/* Case 2 above */
-			} else if (global_reclaim(sc) ||
+			} else if (sane_reclaim(sc) ||
 			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
 				/*
 				 * This is slightly racy - end_page_writeback()
@@ -1422,7 +1453,7 @@ static int too_many_isolated(struct zone *zone, int file,
 	if (current_is_kswapd())
 		return 0;
 
-	if (!global_reclaim(sc))
+	if (!sane_reclaim(sc))
 		return 0;
 
 	if (file) {
@@ -1614,10 +1645,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 		set_bit(ZONE_WRITEBACK, &zone->flags);
 
 	/*
-	 * memcg will stall in page writeback so only consider forcibly
-	 * stalling for global reclaim
+	 * Legacy memcg will stall in page writeback so avoid forcibly
+	 * stalling here.
 	 */
-	if (global_reclaim(sc)) {
+	if (sane_reclaim(sc)) {
 		/*
 		 * Tag a zone as congested if all the dirty pages scanned were
 		 * backed by a congested BDI and wait_iff_congested will stall.