aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Mel Gorman <mgorman@suse.de> 2012-07-31 19:44:35 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2012-07-31 21:42:46 -0400
commit 5515061d22f0f9976ae7815864bfd22042d36848 (patch)
tree 13b53a29166f19eb864e96b3b58539a207e5fa2f
parent 7f338fe4540b1d0600b02314c7d885fd358e9eca (diff)
mm: throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage
If swap is backed by network storage such as NBD, there is a risk that a large number of reclaimers can hang the system by consuming all PF_MEMALLOC reserves. To avoid these hangs, the administrator must tune min_free_kbytes in advance which is a bit fragile.

This patch throttles direct reclaimers if half the PF_MEMALLOC reserves are in use. If the system is routinely getting throttled the system administrator can increase min_free_kbytes so degradation is smoother but the system will keep running.

Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: David Miller <davem@davemloft.net>
Cc: Neil Brown <neilb@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Christie <michaelc@cs.wisc.edu>
Cc: Eric B Munson <emunson@mgebm.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- include/linux/mmzone.h 1
-rw-r--r-- mm/page_alloc.c 1
-rw-r--r-- mm/vmscan.c 128
3 files changed, 122 insertions, 8 deletions
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 64b2c3a48286..2daa54f55db7 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -705,6 +705,7 @@ typedef struct pglist_data {
705 range, including holes */ 705 range, including holes */
706 int node_id; 706 int node_id;
707 wait_queue_head_t kswapd_wait; 707 wait_queue_head_t kswapd_wait;
708 wait_queue_head_t pfmemalloc_wait;
708 struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ 709 struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */
709 int kswapd_max_order; 710 int kswapd_max_order;
710 enum zone_type classzone_idx; 711 enum zone_type classzone_idx;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ef2d1e72fc07..48aee0f5902f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4389,6 +4389,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4389 pgdat_resize_init(pgdat); 4389 pgdat_resize_init(pgdat);
4390 pgdat->nr_zones = 0; 4390 pgdat->nr_zones = 0;
4391 init_waitqueue_head(&pgdat->kswapd_wait); 4391 init_waitqueue_head(&pgdat->kswapd_wait);
4392 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4392 pgdat->kswapd_max_order = 0; 4393 pgdat->kswapd_max_order = 0;
4393 pgdat_page_cgroup_init(pgdat); 4394 pgdat_page_cgroup_init(pgdat);
4394 4395
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6b1f89a91212..021a44a7bd20 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2112,6 +2112,80 @@ out:
2112 return 0; 2112 return 0;
2113} 2113}
2114 2114
2115static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
2116{
2117 struct zone *zone;
2118 unsigned long pfmemalloc_reserve = 0;
2119 unsigned long free_pages = 0;
2120 int i;
2121 bool wmark_ok;
2122
2123 for (i = 0; i <= ZONE_NORMAL; i++) {
2124 zone = &pgdat->node_zones[i];
2125 pfmemalloc_reserve += min_wmark_pages(zone);
2126 free_pages += zone_page_state(zone, NR_FREE_PAGES);
2127 }
2128
2129 wmark_ok = free_pages > pfmemalloc_reserve / 2;
2130
2131 /* kswapd must be awake if processes are being throttled */
2132 if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
2133 pgdat->classzone_idx = min(pgdat->classzone_idx,
2134 (enum zone_type)ZONE_NORMAL);
2135 wake_up_interruptible(&pgdat->kswapd_wait);
2136 }
2137
2138 return wmark_ok;
2139}
2140
2141/*
2142 * Throttle direct reclaimers if backing storage is backed by the network
2143 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
2144 * depleted. kswapd will continue to make progress and wake the processes
2145 * when the low watermark is reached
2146 */
2147static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
2148 nodemask_t *nodemask)
2149{
2150 struct zone *zone;
2151 int high_zoneidx = gfp_zone(gfp_mask);
2152 pg_data_t *pgdat;
2153
2154 /*
2155 * Kernel threads should not be throttled as they may be indirectly
2156 * responsible for cleaning pages necessary for reclaim to make forward
2157 * progress. kjournald for example may enter direct reclaim while
2158 * committing a transaction where throttling it could force other
2159 * processes to block on log_wait_commit().
2160 */
2161 if (current->flags & PF_KTHREAD)
2162 return;
2163
2164 /* Check if the pfmemalloc reserves are ok */
2165 first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
2166 pgdat = zone->zone_pgdat;
2167 if (pfmemalloc_watermark_ok(pgdat))
2168 return;
2169
2170 /*
2171 * If the caller cannot enter the filesystem, it's possible that it
2172 * is due to the caller holding an FS lock or performing a journal
2173 * transaction in the case of a filesystem like ext[3|4]. In this case,
2174 * it is not safe to block on pfmemalloc_wait as kswapd could be
2175 * blocked waiting on the same lock. Instead, throttle for up to a
2176 * second before continuing.
2177 */
2178 if (!(gfp_mask & __GFP_FS)) {
2179 wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
2180 pfmemalloc_watermark_ok(pgdat), HZ);
2181 return;
2182 }
2183
2184 /* Throttle until kswapd wakes the process */
2185 wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
2186 pfmemalloc_watermark_ok(pgdat));
2187}
2188
2115unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 2189unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2116 gfp_t gfp_mask, nodemask_t *nodemask) 2190 gfp_t gfp_mask, nodemask_t *nodemask)
2117{ 2191{
@@ -2131,6 +2205,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2131 .gfp_mask = sc.gfp_mask, 2205 .gfp_mask = sc.gfp_mask,
2132 }; 2206 };
2133 2207
2208 throttle_direct_reclaim(gfp_mask, zonelist, nodemask);
2209
2210 /*
2211 * Do not enter reclaim if fatal signal is pending. 1 is returned so
2212 * that the page allocator does not consider triggering OOM
2213 */
2214 if (fatal_signal_pending(current))
2215 return 1;
2216
2134 trace_mm_vmscan_direct_reclaim_begin(order, 2217 trace_mm_vmscan_direct_reclaim_begin(order,
2135 sc.may_writepage, 2218 sc.may_writepage,
2136 gfp_mask); 2219 gfp_mask);
@@ -2275,8 +2358,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
2275 return balanced_pages >= (present_pages >> 2); 2358 return balanced_pages >= (present_pages >> 2);
2276} 2359}
2277 2360
2278/* is kswapd sleeping prematurely? */ 2361/*
2279static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, 2362 * Prepare kswapd for sleeping. This verifies that there are no processes
2363 * waiting in throttle_direct_reclaim() and that watermarks have been met.
2364 *
2365 * Returns true if kswapd is ready to sleep
2366 */
2367static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
2280 int classzone_idx) 2368 int classzone_idx)
2281{ 2369{
2282 int i; 2370 int i;
@@ -2285,7 +2373,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2285 2373
2286 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ 2374 /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
2287 if (remaining) 2375 if (remaining)
2288 return true; 2376 return false;
2377
2378 /*
2379 * There is a potential race between when kswapd checks its watermarks
2380 * and a process gets throttled. There is also a potential race if
2381 * processes get throttled, kswapd wakes, a large process exits thereby
2382 * balancing the zones, which causes kswapd to miss a wakeup. If kswapd
2383 * is going to sleep, no process should be sleeping on pfmemalloc_wait
2384 * so wake them now if necessary. If necessary, processes will wake
2385 * kswapd and get throttled again
2386 */
2387 if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
2388 wake_up(&pgdat->pfmemalloc_wait);
2389 return false;
2390 }
2289 2391
2290 /* Check the watermark levels */ 2392 /* Check the watermark levels */
2291 for (i = 0; i <= classzone_idx; i++) { 2393 for (i = 0; i <= classzone_idx; i++) {
@@ -2318,9 +2420,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining,
2318 * must be balanced 2420 * must be balanced
2319 */ 2421 */
2320 if (order) 2422 if (order)
2321 return !pgdat_balanced(pgdat, balanced, classzone_idx); 2423 return pgdat_balanced(pgdat, balanced, classzone_idx);
2322 else 2424 else
2323 return !all_zones_ok; 2425 return all_zones_ok;
2324} 2426}
2325 2427
2326/* 2428/*
@@ -2546,6 +2648,16 @@ loop_again:
2546 } 2648 }
2547 2649
2548 } 2650 }
2651
2652 /*
2653 * If the low watermark is met there is no need for processes
2654 * to be throttled on pfmemalloc_wait as they should now be
2655 * able to safely make forward progress. Wake them
2656 */
2657 if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
2658 pfmemalloc_watermark_ok(pgdat))
2659 wake_up(&pgdat->pfmemalloc_wait);
2660
2549 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) 2661 if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))
2550 break; /* kswapd: all done */ 2662 break; /* kswapd: all done */
2551 /* 2663 /*
@@ -2647,7 +2759,7 @@ out:
2647 } 2759 }
2648 2760
2649 /* 2761 /*
2650 * Return the order we were reclaiming at so sleeping_prematurely() 2762 * Return the order we were reclaiming at so prepare_kswapd_sleep()
2651 * makes a decision on the order we were last reclaiming at. However, 2763 * makes a decision on the order we were last reclaiming at. However,
2652 * if another caller entered the allocator slow path while kswapd 2764 * if another caller entered the allocator slow path while kswapd
2653 * was awake, order will remain at the higher level 2765 * was awake, order will remain at the higher level
@@ -2667,7 +2779,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2667 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2779 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
2668 2780
2669 /* Try to sleep for a short interval */ 2781 /* Try to sleep for a short interval */
2670 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2782 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
2671 remaining = schedule_timeout(HZ/10); 2783 remaining = schedule_timeout(HZ/10);
2672 finish_wait(&pgdat->kswapd_wait, &wait); 2784 finish_wait(&pgdat->kswapd_wait, &wait);
2673 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); 2785 prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
@@ -2677,7 +2789,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2677 * After a short sleep, check if it was a premature sleep. If not, then 2789 * After a short sleep, check if it was a premature sleep. If not, then
2678 * go fully to sleep until explicitly woken up. 2790 * go fully to sleep until explicitly woken up.
2679 */ 2791 */
2680 if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { 2792 if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) {
2681 trace_mm_vmscan_kswapd_sleep(pgdat->node_id); 2793 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2682 2794
2683 /* 2795 /*