author    Thomas Gleixner <tglx@linutronix.de>  2012-10-09 15:20:05 -0400
committer Thomas Gleixner <tglx@linutronix.de>  2012-10-09 15:20:05 -0400
commit    db8c246937713e60b7628661ccc187eeb81f2bae (patch)
tree      6351e8bca23eef40fce85396d1c6f6cfffbd4b66 /mm/vmscan.c
parent    c5f66e99b7cb091e3d51ae8e8156892e8feb7fa3 (diff)
parent    28f2b02bc581ffc835bc1691b18d03f62fcf0395 (diff)
Merge branch 'fortglx/3.7/time' of git://git.linaro.org/people/jstultz/linux into timers/core
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c | 185
1 file changed, 169 insertions(+), 16 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 661576324c7..8d01243d956 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -133,7 +133,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
 static bool global_reclaim(struct scan_control *sc)
 {
 	return !sc->target_mem_cgroup;
@@ -687,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 	cond_resched();
 
+	mem_cgroup_uncharge_start();
 	while (!list_empty(page_list)) {
 		enum page_references references;
 		struct address_space *mapping;
@@ -720,9 +721,41 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
 		if (PageWriteback(page)) {
-			nr_writeback++;
-			unlock_page(page);
-			goto keep;
+			/*
+			 * memcg doesn't have any dirty pages throttling so we
+			 * could easily OOM just because too many pages are in
+			 * writeback and there is nothing else to reclaim.
+			 *
+			 * Check __GFP_IO, certainly because a loop driver
+			 * thread might enter reclaim, and deadlock if it waits
+			 * on a page for which it is needed to do the write
+			 * (loop masks off __GFP_IO|__GFP_FS for this reason);
+			 * but more thought would probably show more reasons.
+			 *
+			 * Don't require __GFP_FS, since we're not going into
+			 * the FS, just waiting on its writeback completion.
+			 * Worryingly, ext4 gfs2 and xfs allocate pages with
+			 * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so
+			 * testing may_enter_fs here is liable to OOM on them.
+			 */
+			if (global_reclaim(sc) ||
+			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
+				/*
+				 * This is slightly racy - end_page_writeback()
+				 * might have just cleared PageReclaim, then
+				 * setting PageReclaim here end up interpreted
+				 * as PageReadahead - but that does not matter
+				 * enough to care.  What we do want is for this
+				 * page to have PageReclaim set next time memcg
+				 * reclaim reaches the tests above, so it will
+				 * then wait_on_page_writeback() to avoid OOM;
+				 * and it's also appropriate in global reclaim.
+				 */
+				SetPageReclaim(page);
+				nr_writeback++;
+				goto keep_locked;
+			}
+			wait_on_page_writeback(page);
 		}
 
 		references = page_check_references(page, sc);
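Note: the rewritten PageWriteback branch above replaces the old unconditional skip with a two-way decision. Global reclaim, as well as memcg reclaim that lacks __GFP_IO or has not yet tagged the page with PageReclaim, marks the page and keeps it locked; only memcg reclaim that re-encounters an already-tagged page stalls on its writeback. A minimal userspace sketch of that decision, using hypothetical boolean stand-ins for the kernel state, might look like:

/*
 * Illustrative userspace sketch (not kernel code) of the writeback branch
 * added above. All names are hypothetical stand-ins for the page flags and
 * scan-control state tested in the hunk.
 */
#include <stdbool.h>
#include <stdio.h>

enum wb_action {
	WB_MARK_RECLAIM_AND_KEEP,	/* SetPageReclaim + nr_writeback++ + goto keep_locked */
	WB_WAIT_FOR_WRITEBACK		/* wait_on_page_writeback() */
};

/* Mirrors the new test: only memcg reclaim with __GFP_IO on a page that
 * already has PageReclaim set is allowed to wait; everything else is
 * marked and skipped. */
static enum wb_action writeback_action(bool global_reclaim, bool page_reclaim,
				       bool gfp_io)
{
	if (global_reclaim || !page_reclaim || !gfp_io)
		return WB_MARK_RECLAIM_AND_KEEP;
	return WB_WAIT_FOR_WRITEBACK;
}

int main(void)
{
	/* memcg reclaim, page already tagged PageReclaim, __GFP_IO allowed: waits */
	printf("%d\n", writeback_action(false, true, true));
	/* global reclaim never waits here; it marks the page and moves on */
	printf("%d\n", writeback_action(true, true, true));
	return 0;
}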
@@ -921,6 +954,7 @@ keep:
 
 	list_splice(&ret_pages, page_list);
 	count_vm_events(PGACTIVATE, pgactivate);
+	mem_cgroup_uncharge_end();
 	*ret_nr_dirty += nr_dirty;
 	*ret_nr_writeback += nr_writeback;
 	return nr_reclaimed;
@@ -1567,7 +1601,8 @@ static int vmscan_swappiness(struct scan_control *sc)
  * by looking at the fraction of the pages scanned we did rotate back
  * onto the active list instead of evict.
  *
- * nr[0] = anon pages to scan; nr[1] = file pages to scan
+ * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
+ * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
  */
 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 			   unsigned long *nr)
@@ -2111,6 +2146,83 @@ out:
 	return 0;
 }
 
+static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
+{
+	struct zone *zone;
+	unsigned long pfmemalloc_reserve = 0;
+	unsigned long free_pages = 0;
+	int i;
+	bool wmark_ok;
+
+	for (i = 0; i <= ZONE_NORMAL; i++) {
+		zone = &pgdat->node_zones[i];
+		pfmemalloc_reserve += min_wmark_pages(zone);
+		free_pages += zone_page_state(zone, NR_FREE_PAGES);
+	}
+
+	wmark_ok = free_pages > pfmemalloc_reserve / 2;
+
+	/* kswapd must be awake if processes are being throttled */
+	if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
+		pgdat->classzone_idx = min(pgdat->classzone_idx,
+						(enum zone_type)ZONE_NORMAL);
+		wake_up_interruptible(&pgdat->kswapd_wait);
+	}
+
+	return wmark_ok;
+}
+
+/*
+ * Throttle direct reclaimers if backing storage is backed by the network
+ * and the PFMEMALLOC reserve for the preferred node is getting dangerously
+ * depleted. kswapd will continue to make progress and wake the processes
+ * when the low watermark is reached
+ */
+static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
+					nodemask_t *nodemask)
+{
+	struct zone *zone;
+	int high_zoneidx = gfp_zone(gfp_mask);
+	pg_data_t *pgdat;
+
+	/*
+	 * Kernel threads should not be throttled as they may be indirectly
+	 * responsible for cleaning pages necessary for reclaim to make forward
+	 * progress. kjournald for example may enter direct reclaim while
+	 * committing a transaction where throttling it could forcing other
+	 * processes to block on log_wait_commit().
+	 */
+	if (current->flags & PF_KTHREAD)
+		return;
+
+	/* Check if the pfmemalloc reserves are ok */
+	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
+	pgdat = zone->zone_pgdat;
+	if (pfmemalloc_watermark_ok(pgdat))
+		return;
+
+	/* Account for the throttling */
+	count_vm_event(PGSCAN_DIRECT_THROTTLE);
+
+	/*
+	 * If the caller cannot enter the filesystem, it's possible that it
+	 * is due to the caller holding an FS lock or performing a journal
+	 * transaction in the case of a filesystem like ext[3|4]. In this case,
+	 * it is not safe to block on pfmemalloc_wait as kswapd could be
+	 * blocked waiting on the same lock. Instead, throttle for up to a
+	 * second before continuing.
+	 */
+	if (!(gfp_mask & __GFP_FS)) {
+		wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
+			pfmemalloc_watermark_ok(pgdat), HZ);
+		return;
+	}
+
+	/* Throttle until kswapd wakes the process */
+	wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+		pfmemalloc_watermark_ok(pgdat));
+}
+
 unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 				gfp_t gfp_mask, nodemask_t *nodemask)
 {
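Note: the hunk above adds pfmemalloc_watermark_ok(), which compares the free pages of the node's lower zones against half the sum of their min watermarks, and throttle_direct_reclaim(), which uses it to decide whether a direct reclaimer should sleep on pfmemalloc_wait. A rough userspace sketch of just the watermark arithmetic, with simplified hypothetical zone structs in place of pg_data_t/struct zone, is:

/*
 * Illustrative userspace sketch of the watermark test in
 * pfmemalloc_watermark_ok(); the structs below are hypothetical
 * simplifications, not the kernel's data structures.
 */
#include <stdbool.h>
#include <stdio.h>

struct fake_zone {
	unsigned long min_wmark_pages;	/* stands in for min_wmark_pages(zone) */
	unsigned long nr_free_pages;	/* stands in for NR_FREE_PAGES */
};

/* Sum the min watermarks and free pages of the lower zones (up to and
 * including "normal") and allow progress while free > reserve / 2. */
static bool pfmemalloc_wmark_ok(const struct fake_zone *zones, int nr_lower_zones)
{
	unsigned long pfmemalloc_reserve = 0;
	unsigned long free_pages = 0;
	int i;

	for (i = 0; i < nr_lower_zones; i++) {
		pfmemalloc_reserve += zones[i].min_wmark_pages;
		free_pages += zones[i].nr_free_pages;
	}
	return free_pages > pfmemalloc_reserve / 2;
}

int main(void)
{
	struct fake_zone zones[] = {
		{ .min_wmark_pages = 128,  .nr_free_pages = 40  },	/* e.g. DMA */
		{ .min_wmark_pages = 1024, .nr_free_pages = 500 },	/* e.g. NORMAL */
	};

	/* free = 540, reserve/2 = 576: not ok, so direct reclaimers would throttle */
	printf("wmark_ok = %d\n", pfmemalloc_wmark_ok(zones, 2));
	return 0;
}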
@@ -2130,6 +2242,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.gfp_mask = sc.gfp_mask,
 	};
 
+	throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
+
+	/*
+	 * Do not enter reclaim if fatal signal is pending. 1 is returned so
+	 * that the page allocator does not consider triggering OOM
+	 */
+	if (fatal_signal_pending(current))
+		return 1;
+
 	trace_mm_vmscan_direct_reclaim_begin(order,
 				sc.may_writepage,
 				gfp_mask);
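Note: the try_to_free_pages() hunk above makes every direct reclaimer pass through throttle_direct_reclaim() first, and then bail out with 1 if a fatal signal arrived while it was throttled, so the page allocator treats the attempt as progress instead of escalating to OOM. A hypothetical userspace sketch of that entry-point pattern:

/*
 * Illustrative userspace sketch of the new try_to_free_pages() entry
 * sequence (throttle first, then bail out on a fatal signal). The helper
 * names are hypothetical stand-ins for the kernel functions in the hunk.
 */
#include <stdbool.h>
#include <stdio.h>

static void maybe_throttle_direct_reclaim(void)
{
	/* Stand-in for throttle_direct_reclaim(): may sleep on pfmemalloc_wait. */
}

static bool fatal_signal_pending_sketch(void)
{
	/* Stand-in for fatal_signal_pending(current). */
	return false;
}

static unsigned long try_to_free_pages_sketch(void)
{
	maybe_throttle_direct_reclaim();

	/*
	 * Returning 1 (not 0) tells the caller that some progress was made,
	 * so the allocator does not consider triggering the OOM killer for a
	 * task that is already dying.
	 */
	if (fatal_signal_pending_sketch())
		return 1;

	/* ... the normal direct reclaim pass would run here ... */
	return 0;
}

int main(void)
{
	printf("reclaimed = %lu\n", try_to_free_pages_sketch());
	return 0;
}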
@@ -2141,7 +2262,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 	return nr_reclaimed;
 }
 
-#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#ifdef CONFIG_MEMCG
 
 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 						gfp_t gfp_mask, bool noswap,
@@ -2274,8 +2395,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
 	return balanced_pages >= (present_pages >> 2);
 }
 
-/* is kswapd sleeping prematurely? */
-static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
+/*
+ * Prepare kswapd for sleeping. This verifies that there are no processes
+ * waiting in throttle_direct_reclaim() and that watermarks have been met.
+ *
+ * Returns true if kswapd is ready to sleep
+ */
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 					int classzone_idx)
 {
 	int i;
@@ -2284,7 +2410,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 
 	/* If a direct reclaimer woke kswapd within HZ/10, it's premature */
 	if (remaining)
-		return true;
+		return false;
+
+	/*
+	 * There is a potential race between when kswapd checks its watermarks
+	 * and a process gets throttled. There is also a potential race if
+	 * processes get throttled, kswapd wakes, a large process exits therby
+	 * balancing the zones that causes kswapd to miss a wakeup. If kswapd
+	 * is going to sleep, no process should be sleeping on pfmemalloc_wait
+	 * so wake them now if necessary. If necessary, processes will wake
+	 * kswapd and get throttled again
+	 */
+	if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
+		wake_up(&pgdat->pfmemalloc_wait);
+		return false;
+	}
 
 	/* Check the watermark levels */
 	for (i = 0; i <= classzone_idx; i++) {
@@ -2317,9 +2457,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
 	 * must be balanced
 	 */
 	if (order)
-		return !pgdat_balanced(pgdat, balanced, classzone_idx);
+		return pgdat_balanced(pgdat, balanced, classzone_idx);
 	else
-		return !all_zones_ok;
+		return all_zones_ok;
 }
 
 /*
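Note: taken together, the hunks above rename sleeping_prematurely() to prepare_kswapd_sleep() and flip its return convention: it now answers "is kswapd ready to sleep?" rather than "is this sleep premature?", and it refuses to sleep while direct reclaimers are still parked on pfmemalloc_wait. A compressed userspace sketch of that decision, with hypothetical flags standing in for the real pgdat state, could read:

/*
 * Illustrative userspace sketch of the prepare_kswapd_sleep() logic, with
 * its return sense inverted relative to the old sleeping_prematurely()
 * (true now means "safe to sleep"). Inputs are hypothetical stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

static bool prepare_kswapd_sleep_sketch(long remaining,
					bool throttled_waiters,
					bool node_balanced)
{
	/* A direct reclaimer woke kswapd within HZ/10: not safe to sleep. */
	if (remaining)
		return false;

	/* Processes throttled on pfmemalloc_wait must be woken first; the
	 * real code calls wake_up(&pgdat->pfmemalloc_wait) at this point. */
	if (throttled_waiters)
		return false;

	/* Otherwise sleep only if the watermark/balance checks pass. */
	return node_balanced;
}

int main(void)
{
	printf("%d\n", prepare_kswapd_sleep_sketch(0, false, true));	/* sleep */
	printf("%d\n", prepare_kswapd_sleep_sketch(5, false, true));	/* keep working */
	printf("%d\n", prepare_kswapd_sleep_sketch(0, true, true));	/* wake waiters first */
	return 0;
}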
@@ -2537,7 +2677,7 @@ loop_again:
 			 * consider it to be no longer congested. It's
 			 * possible there are dirty pages backed by
 			 * congested BDIs but as pressure is relieved,
-			 * spectulatively avoid congestion waits
+			 * speculatively avoid congestion waits
 			 */
 			zone_clear_flag(zone, ZONE_CONGESTED);
 			if (i <= *classzone_idx)
@@ -2545,6 +2685,16 @@ loop_again:
 			}
 
 		}
+
+		/*
+		 * If the low watermark is met there is no need for processes
+		 * to be throttled on pfmemalloc_wait as they should not be
+		 * able to safely make forward progress. Wake them
+		 */
+		if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
+				pfmemalloc_watermark_ok(pgdat))
+			wake_up(&pgdat->pfmemalloc_wait);
+
 		if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
 			break;		/* kswapd: all done */
 		/*
@@ -2646,7 +2796,7 @@ out:
 	}
 
 	/*
-	 * Return the order we were reclaiming at so sleeping_prematurely()
+	 * Return the order we were reclaiming at so prepare_kswapd_sleep()
 	 * makes a decision on the order we were last reclaiming at. However,
 	 * if another caller entered the allocator slow path while kswapd
 	 * was awake, order will remain at the higher level
@@ -2666,7 +2816,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
 
 	/* Try to sleep for a short interval */
-	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
 		remaining = schedule_timeout(HZ/10);
 		finish_wait(&pgdat->kswapd_wait, &wait);
 		prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2676,7 +2826,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 	 * After a short sleep, check if it was a premature sleep. If not, then
 	 * go fully to sleep until explicitly woken up.
 	 */
-	if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) {
+	if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
 		trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
 
 		/*
@@ -2688,7 +2838,10 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 		 * them before going back to sleep.
 		 */
 		set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
-		schedule();
+
+		if (!kthread_should_stop())
+			schedule();
+
 		set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
 	} else {
 		if (remaining)