author    Michal Hocko <mhocko@suse.com>    2016-05-20 19:57:03 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2016-05-20 20:58:30 -0400
commit    ede37713737834d98ec72ed299a305d53e909f73 (patch)
tree      d56b784a80fe86a9207e58ecca7a604f9dfd6454    /mm/page_alloc.c
parent    0a0337e0d1d134465778a16f5cbea95086e8e9e0 (diff)
mm: throttle on IO only when there are too many dirty and writeback pages
wait_iff_congested has been used to throttle the allocator before it retried another round of direct reclaim, to allow writeback to make some progress and to prevent reclaim from looping over dirty/writeback pages without making any progress. We used to do congestion_wait before commit 0e093d99763e ("writeback: do not sleep on the congestion queue if there are no congested BDIs or if significant congestion is not being encountered in the current zone"), but that led to undesirable stalls and sleeping for the full timeout even when the BDI wasn't congested. Hence wait_iff_congested was used instead.

But it seems that even wait_iff_congested doesn't work as expected. We might have a small file LRU list with all pages dirty/writeback while the BDI is not congested, so the call amounts to a mere cond_resched in the end and can trigger a premature OOM.

This patch replaces the unconditional wait_iff_congested with congestion_wait, which is executed only if we _know_ that the last round of direct reclaim didn't make any progress and dirty+writeback pages make up more than half of the reclaimable pages on the zone which might be usable for our target allocation. This shouldn't reintroduce the stalls fixed by 0e093d99763e, because congestion_wait is called only when we are getting hopeless and sleeping is a better choice than OOMing with many pages under IO.

We have to preserve the logic introduced by commit 373ccbe59270 ("mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't make any progress") in __alloc_pages_slowpath now that wait_iff_congested is no longer used there. As the only remaining user of wait_iff_congested is shrink_inactive_list, we can remove the WQ-specific short sleep from wait_iff_congested, because that sleep only needs to happen once per allocation retry cycle.

[mhocko@suse.com: high_zoneidx->ac_classzone_idx to evaluate memory reserves properly]
Link: http://lkml.kernel.org/r/1463051677-29418-2-git-send-email-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
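As a rough illustration of the heuristic this patch adds (a minimal standalone C sketch, not the kernel code: the zone counters read via zone_page_state_snapshot, the watermark check, and the no_progress_loops back-off are replaced here by plain function parameters), the throttling decision boils down to:

#include <stdbool.h>
#include <stdio.h>

/*
 * Simplified model of the new throttling decision in
 * should_reclaim_retry(): sleep-wait on IO only when the last
 * reclaim round made no progress AND more than half of the
 * reclaimable pages on the zone are dirty or under writeback.
 */
static bool throttle_on_io(unsigned long reclaimable,
                           unsigned long dirty,
                           unsigned long writeback,
                           bool did_some_progress)
{
        return !did_some_progress &&
               2 * (writeback + dirty) > reclaimable;
}

int main(void)
{
        /* 600 of 1000 reclaimable pages are dirty/writeback: throttle. */
        printf("%d\n", throttle_on_io(1000, 400, 200, false)); /* 1 */
        /* Reclaim made progress last round: keep retrying instead. */
        printf("%d\n", throttle_on_io(1000, 400, 200, true));  /* 0 */
        return 0;
}

The separate PF_WQ_WORKER short sleep visible in the diff below preserves the behaviour introduced by commit 373ccbe59270: a workqueue worker that loops without ever sleeping would otherwise prevent the WQ concurrency machinery from noticing that the worker is busy and starting others.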
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--    mm/page_alloc.c    41
1 file changed, 37 insertions(+), 4 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fa39efc3a692..f51c302126a1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3436,8 +3436,9 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
 					ac->nodemask) {
 		unsigned long available;
+		unsigned long reclaimable;
 
-		available = zone_reclaimable_pages(zone);
+		available = reclaimable = zone_reclaimable_pages(zone);
 		available -= DIV_ROUND_UP(no_progress_loops * available,
 					  MAX_RECLAIM_RETRIES);
 		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
@@ -3447,9 +3448,41 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 		 * available?
 		 */
 		if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
-				ac->high_zoneidx, alloc_flags, available)) {
-			/* Wait for some write requests to complete then retry */
-			wait_iff_congested(zone, BLK_RW_ASYNC, HZ/50);
+				ac_classzone_idx(ac), alloc_flags, available)) {
+			/*
+			 * If we didn't make any progress and have a lot of
+			 * dirty + writeback pages then we should wait for
+			 * an IO to complete to slow down the reclaim and
+			 * prevent from pre mature OOM
+			 */
+			if (!did_some_progress) {
+				unsigned long writeback;
+				unsigned long dirty;
+
+				writeback = zone_page_state_snapshot(zone,
+								     NR_WRITEBACK);
+				dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY);
+
+				if (2*(writeback + dirty) > reclaimable) {
+					congestion_wait(BLK_RW_ASYNC, HZ/10);
+					return true;
+				}
+			}
+
+			/*
+			 * Memory allocation/reclaim might be called from a WQ
+			 * context and the current implementation of the WQ
+			 * concurrency control doesn't recognize that
+			 * a particular WQ is congested if the worker thread is
+			 * looping without ever sleeping. Therefore we have to
+			 * do a short sleep here rather than calling
+			 * cond_resched().
+			 */
+			if (current->flags & PF_WQ_WORKER)
+				schedule_timeout_uninterruptible(1);
+			else
+				cond_resched();
+
 			return true;
 		}
 	}