diff options
author | Mel Gorman <mgorman@suse.de> | 2012-07-31 19:44:35 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-07-31 21:42:46 -0400 |
commit | 5515061d22f0f9976ae7815864bfd22042d36848 (patch) | |
tree | 13b53a29166f19eb864e96b3b58539a207e5fa2f /mm | |
parent | 7f338fe4540b1d0600b02314c7d885fd358e9eca (diff) |
mm: throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage
If swap is backed by network storage such as NBD, there is a risk that a
large number of reclaimers can hang the system by consuming all
PF_MEMALLOC reserves. To avoid these hangs, the administrator must tune
min_free_kbytes in advance which is a bit fragile.
This patch throttles direct reclaimers if half the PF_MEMALLOC reserves
are in use. If the system is routinely getting throttled the system
administrator can increase min_free_kbytes so degradation is smoother but
the system will keep running.
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: David Miller <davem@davemloft.net>
Cc: Neil Brown <neilb@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/page_alloc.c | 1 | ||||
-rw-r--r-- | mm/vmscan.c | 128 |
2 files changed, 121 insertions, 8 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef2d1e72fc07..48aee0f5902f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -4389,6 +4389,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4389 | pgdat_resize_init(pgdat); | 4389 | pgdat_resize_init(pgdat); |
4390 | pgdat->nr_zones = 0; | 4390 | pgdat->nr_zones = 0; |
4391 | init_waitqueue_head(&pgdat->kswapd_wait); | 4391 | init_waitqueue_head(&pgdat->kswapd_wait); |
4392 | init_waitqueue_head(&pgdat->pfmemalloc_wait); | ||
4392 | pgdat->kswapd_max_order = 0; | 4393 | pgdat->kswapd_max_order = 0; |
4393 | pgdat_page_cgroup_init(pgdat); | 4394 | pgdat_page_cgroup_init(pgdat); |
4394 | 4395 | ||
diff --git a/mm/vmscan.c b/mm/vmscan.c index 6b1f89a91212..021a44a7bd20 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -2112,6 +2112,80 @@ out: | |||
2112 | return 0; | 2112 | return 0; |
2113 | } | 2113 | } |
2114 | 2114 | ||
2115 | static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) | ||
2116 | { | ||
2117 | struct zone *zone; | ||
2118 | unsigned long pfmemalloc_reserve = 0; | ||
2119 | unsigned long free_pages = 0; | ||
2120 | int i; | ||
2121 | bool wmark_ok; | ||
2122 | |||
2123 | for (i = 0; i <= ZONE_NORMAL; i++) { | ||
2124 | zone = &pgdat->node_zones[i]; | ||
2125 | pfmemalloc_reserve += min_wmark_pages(zone); | ||
2126 | free_pages += zone_page_state(zone, NR_FREE_PAGES); | ||
2127 | } | ||
2128 | |||
2129 | wmark_ok = free_pages > pfmemalloc_reserve / 2; | ||
2130 | |||
2131 | /* kswapd must be awake if processes are being throttled */ | ||
2132 | if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { | ||
2133 | pgdat->classzone_idx = min(pgdat->classzone_idx, | ||
2134 | (enum zone_type)ZONE_NORMAL); | ||
2135 | wake_up_interruptible(&pgdat->kswapd_wait); | ||
2136 | } | ||
2137 | |||
2138 | return wmark_ok; | ||
2139 | } | ||
2140 | |||
2141 | /* | ||
2142 | * Throttle direct reclaimers if backing storage is backed by the network | ||
2143 | * and the PFMEMALLOC reserve for the preferred node is getting dangerously | ||
2144 | * depleted. kswapd will continue to make progress and wake the processes | ||
2145 | * when the low watermark is reached | ||
2146 | */ | ||
2147 | static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, | ||
2148 | nodemask_t *nodemask) | ||
2149 | { | ||
2150 | struct zone *zone; | ||
2151 | int high_zoneidx = gfp_zone(gfp_mask); | ||
2152 | pg_data_t *pgdat; | ||
2153 | |||
2154 | /* | ||
2155 | * Kernel threads should not be throttled as they may be indirectly | ||
2156 | * responsible for cleaning pages necessary for reclaim to make forward | ||
2157 | * progress. kjournald for example may enter direct reclaim while | ||
2158 | * committing a transaction where throttling it could force other | ||
2159 | * processes to block on log_wait_commit(). | ||
2160 | */ | ||
2161 | if (current->flags & PF_KTHREAD) | ||
2162 | return; | ||
2163 | |||
2164 | /* Check if the pfmemalloc reserves are ok */ | ||
2165 | first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); | ||
2166 | pgdat = zone->zone_pgdat; | ||
2167 | if (pfmemalloc_watermark_ok(pgdat)) | ||
2168 | return; | ||
2169 | |||
2170 | /* | ||
2171 | * If the caller cannot enter the filesystem, it's possible that it | ||
2172 | * is due to the caller holding an FS lock or performing a journal | ||
2173 | * transaction in the case of a filesystem like ext[3|4]. In this case, | ||
2174 | * it is not safe to block on pfmemalloc_wait as kswapd could be | ||
2175 | * blocked waiting on the same lock. Instead, throttle for up to a | ||
2176 | * second before continuing. | ||
2177 | */ | ||
2178 | if (!(gfp_mask & __GFP_FS)) { | ||
2179 | wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, | ||
2180 | pfmemalloc_watermark_ok(pgdat), HZ); | ||
2181 | return; | ||
2182 | } | ||
2183 | |||
2184 | /* Throttle until kswapd wakes the process */ | ||
2185 | wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, | ||
2186 | pfmemalloc_watermark_ok(pgdat)); | ||
2187 | } | ||
2188 | |||
2115 | unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | 2189 | unsigned long try_to_free_pages(struct zonelist *zonelist, int order, |
2116 | gfp_t gfp_mask, nodemask_t *nodemask) | 2190 | gfp_t gfp_mask, nodemask_t *nodemask) |
2117 | { | 2191 | { |
@@ -2131,6 +2205,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, | |||
2131 | .gfp_mask = sc.gfp_mask, | 2205 | .gfp_mask = sc.gfp_mask, |
2132 | }; | 2206 | }; |
2133 | 2207 | ||
2208 | throttle_direct_reclaim(gfp_mask, zonelist, nodemask); | ||
2209 | |||
2210 | /* | ||
2211 | * Do not enter reclaim if fatal signal is pending. 1 is returned so | ||
2212 | * that the page allocator does not consider triggering OOM | ||
2213 | */ | ||
2214 | if (fatal_signal_pending(current)) | ||
2215 | return 1; | ||
2216 | |||
2134 | trace_mm_vmscan_direct_reclaim_begin(order, | 2217 | trace_mm_vmscan_direct_reclaim_begin(order, |
2135 | sc.may_writepage, | 2218 | sc.may_writepage, |
2136 | gfp_mask); | 2219 | gfp_mask); |
@@ -2275,8 +2358,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, | |||
2275 | return balanced_pages >= (present_pages >> 2); | 2358 | return balanced_pages >= (present_pages >> 2); |
2276 | } | 2359 | } |
2277 | 2360 | ||
2278 | /* is kswapd sleeping prematurely? */ | 2361 | /* |
2279 | static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | 2362 | * Prepare kswapd for sleeping. This verifies that there are no processes |
2363 | * waiting in throttle_direct_reclaim() and that watermarks have been met. | ||
2364 | * | ||
2365 | * Returns true if kswapd is ready to sleep | ||
2366 | */ | ||
2367 | static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, | ||
2280 | int classzone_idx) | 2368 | int classzone_idx) |
2281 | { | 2369 | { |
2282 | int i; | 2370 | int i; |
@@ -2285,7 +2373,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2285 | 2373 | ||
2286 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ | 2374 | /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ |
2287 | if (remaining) | 2375 | if (remaining) |
2288 | return true; | 2376 | return false; |
2377 | |||
2378 | /* | ||
2379 | * There is a potential race between when kswapd checks its watermarks | ||
2380 | * and a process gets throttled. There is also a potential race if | ||
2381 | * processes get throttled, kswapd wakes, a large process exits thereby | ||
2382 | * balancing the zones that causes kswapd to miss a wakeup. If kswapd | ||
2383 | * is going to sleep, no process should be sleeping on pfmemalloc_wait | ||
2384 | * so wake them now if necessary. If necessary, processes will wake | ||
2385 | * kswapd and get throttled again | ||
2386 | */ | ||
2387 | if (waitqueue_active(&pgdat->pfmemalloc_wait)) { | ||
2388 | wake_up(&pgdat->pfmemalloc_wait); | ||
2389 | return false; | ||
2390 | } | ||
2289 | 2391 | ||
2290 | /* Check the watermark levels */ | 2392 | /* Check the watermark levels */ |
2291 | for (i = 0; i <= classzone_idx; i++) { | 2393 | for (i = 0; i <= classzone_idx; i++) { |
@@ -2318,9 +2420,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, | |||
2318 | * must be balanced | 2420 | * must be balanced |
2319 | */ | 2421 | */ |
2320 | if (order) | 2422 | if (order) |
2321 | return !pgdat_balanced(pgdat, balanced, classzone_idx); | 2423 | return pgdat_balanced(pgdat, balanced, classzone_idx); |
2322 | else | 2424 | else |
2323 | return !all_zones_ok; | 2425 | return all_zones_ok; |
2324 | } | 2426 | } |
2325 | 2427 | ||
2326 | /* | 2428 | /* |
@@ -2546,6 +2648,16 @@ loop_again: | |||
2546 | } | 2648 | } |
2547 | 2649 | ||
2548 | } | 2650 | } |
2651 | |||
2652 | /* | ||
2653 | * If the low watermark is met there is no need for processes | ||
2654 | * to be throttled on pfmemalloc_wait as they should now be | ||
2655 | * able to safely make forward progress. Wake them | ||
2656 | */ | ||
2657 | if (waitqueue_active(&pgdat->pfmemalloc_wait) && | ||
2658 | pfmemalloc_watermark_ok(pgdat)) | ||
2659 | wake_up(&pgdat->pfmemalloc_wait); | ||
2660 | |||
2549 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) | 2661 | if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) |
2550 | break; /* kswapd: all done */ | 2662 | break; /* kswapd: all done */ |
2551 | /* | 2663 | /* |
@@ -2647,7 +2759,7 @@ out: | |||
2647 | } | 2759 | } |
2648 | 2760 | ||
2649 | /* | 2761 | /* |
2650 | * Return the order we were reclaiming at so sleeping_prematurely() | 2762 | * Return the order we were reclaiming at so prepare_kswapd_sleep() |
2651 | * makes a decision on the order we were last reclaiming at. However, | 2763 | * makes a decision on the order we were last reclaiming at. However, |
2652 | * if another caller entered the allocator slow path while kswapd | 2764 | * if another caller entered the allocator slow path while kswapd |
2653 | * was awake, order will remain at the higher level | 2765 | * was awake, order will remain at the higher level |
@@ -2667,7 +2779,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2667 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2779 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
2668 | 2780 | ||
2669 | /* Try to sleep for a short interval */ | 2781 | /* Try to sleep for a short interval */ |
2670 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | 2782 | if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { |
2671 | remaining = schedule_timeout(HZ/10); | 2783 | remaining = schedule_timeout(HZ/10); |
2672 | finish_wait(&pgdat->kswapd_wait, &wait); | 2784 | finish_wait(&pgdat->kswapd_wait, &wait); |
2673 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); | 2785 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
@@ -2677,7 +2789,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) | |||
2677 | * After a short sleep, check if it was a premature sleep. If not, then | 2789 | * After a short sleep, check if it was a premature sleep. If not, then |
2678 | * go fully to sleep until explicitly woken up. | 2790 | * go fully to sleep until explicitly woken up. |
2679 | */ | 2791 | */ |
2680 | if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { | 2792 | if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { |
2681 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | 2793 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); |
2682 | 2794 | ||
2683 | /* | 2795 | /* |