diff options
author | Mel Gorman <mel@csn.ul.ie> | 2011-01-13 18:45:41 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-01-13 20:32:31 -0500 |
commit | 88f5acf88ae6a9778f6d25d0d5d7ec2d57764a97 (patch) | |
tree | 6f39beef8cf918eb2ca9f64ae1bcd1ea79ca487a /mm/vmscan.c | |
parent | 43bb40c9e3aa51a3b038c9df2c9afb4d4685614d (diff) |
mm: page allocator: adjust the per-cpu counter threshold when memory is low
Commit aa45484 ("calculate a better estimate of NR_FREE_PAGES when memory
is low") noted that watermarks were based on the vmstat NR_FREE_PAGES. To
avoid synchronization overhead, these counters are maintained on a per-cpu
basis and drained both periodically and when a per-cpu delta exceeds a
threshold. On large CPU systems, the difference between the estimate and
real value of NR_FREE_PAGES can be very high. The system can get into a
case where pages are allocated far below the min watermark potentially
causing livelock issues. The commit solved the problem by taking a better
reading of NR_FREE_PAGES when memory was low.
Unfortunately, as reported by Shaohua Li, this accurate reading can consume a
large amount of CPU time on systems with many sockets due to cache line
bouncing. This patch takes a different approach. For large machines
where counter drift might be unsafe and while kswapd is awake, the per-cpu
thresholds for the target pgdat are reduced to limit the level of drift to
what should be a safe level. This incurs a performance penalty in heavy
memory pressure by a factor that depends on the workload and the machine
but the machine should function correctly without accidentally exhausting
all memory on a node. There is an additional cost when kswapd wakes and
sleeps but the event is not expected to be frequent - in Shaohua's test
case, there was at least one recorded sleep and wake event.
To ensure that kswapd wakes up, a safe version of zone_watermark_ok() is
introduced that takes a more accurate reading of NR_FREE_PAGES when called
from wakeup_kswapd, when deciding whether it is really safe to go back to
sleep in sleeping_prematurely() and when deciding if a zone is really
balanced or not in balance_pgdat(). We are still using an expensive
function but limiting how often it is called.
When the test case is reproduced, the time spent in the watermark
functions is reduced. The following report is on the percentage of time
cumulatively spent in the functions zone_nr_free_pages(),
zone_watermark_ok(), __zone_watermark_ok(), zone_watermark_ok_safe(),
zone_page_state_snapshot(), zone_page_state().
vanilla 11.6615%
disable-threshold 0.2584%
David said:
: We had to pull aa454840 "mm: page allocator: calculate a better estimate
: of NR_FREE_PAGES when memory is low and kswapd is awake" from 2.6.36
: internally because tests showed that it would cause the machine to stall
: as the result of heavy kswapd activity. I merged it back with this fix as
: it is pending in the -mm tree and it solves the issue we were seeing, so I
: definitely think this should be pushed to -stable (and I would seriously
: consider it for 2.6.37 inclusion even at this late date).
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Reported-by: Shaohua Li <shaohua.li@intel.com>
Reviewed-by: Christoph Lameter <cl@linux.com>
Tested-by: Nicolas Bareil <nico@chdir.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: <stable@kernel.org> [2.6.37.1, 2.6.36.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 23 |
1 files changed, 13 insertions, 10 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index 9ca587c69274..5da4295e7d67 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -2143,7 +2143,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) | |||
2143 | if (zone->all_unreclaimable) | 2143 | if (zone->all_unreclaimable) |
2144 | continue; | 2144 | continue; |
2145 | 2145 | ||
2146 | if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), | 2146 | if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), |
2147 | 0, 0)) | 2147 | 0, 0)) |
2148 | return 1; | 2148 | return 1; |
2149 | } | 2149 | } |
@@ -2230,7 +2230,7 @@ loop_again: | |||
2230 | shrink_active_list(SWAP_CLUSTER_MAX, zone, | 2230 | shrink_active_list(SWAP_CLUSTER_MAX, zone, |
2231 | &sc, priority, 0); | 2231 | &sc, priority, 0); |
2232 | 2232 | ||
2233 | if (!zone_watermark_ok(zone, order, | 2233 | if (!zone_watermark_ok_safe(zone, order, |
2234 | high_wmark_pages(zone), 0, 0)) { | 2234 | high_wmark_pages(zone), 0, 0)) { |
2235 | end_zone = i; | 2235 | end_zone = i; |
2236 | break; | 2236 | break; |
@@ -2276,7 +2276,7 @@ loop_again: | |||
2276 | * We put equal pressure on every zone, unless one | 2276 | * We put equal pressure on every zone, unless one |
2277 | * zone has way too many pages free already. | 2277 | * zone has way too many pages free already. |
2278 | */ | 2278 | */ |
2279 | if (!zone_watermark_ok(zone, order, | 2279 | if (!zone_watermark_ok_safe(zone, order, |
2280 | 8*high_wmark_pages(zone), end_zone, 0)) | 2280 | 8*high_wmark_pages(zone), end_zone, 0)) |
2281 | shrink_zone(priority, zone, &sc); | 2281 | shrink_zone(priority, zone, &sc); |
2282 | reclaim_state->reclaimed_slab = 0; | 2282 | reclaim_state->reclaimed_slab = 0; |
@@ -2297,7 +2297,7 @@ loop_again: | |||
2297 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) | 2297 | total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) |
2298 | sc.may_writepage = 1; | 2298 | sc.may_writepage = 1; |
2299 | 2299 | ||
2300 | if (!zone_watermark_ok(zone, order, | 2300 | if (!zone_watermark_ok_safe(zone, order, |
2301 | high_wmark_pages(zone), end_zone, 0)) { | 2301 | high_wmark_pages(zone), end_zone, 0)) { |
2302 | all_zones_ok = 0; | 2302 | all_zones_ok = 0; |
2303 | /* | 2303 | /* |
@@ -2305,7 +2305,7 @@ loop_again: | |||
2305 | * means that we have a GFP_ATOMIC allocation | 2305 | * means that we have a GFP_ATOMIC allocation |
2306 | * failure risk. Hurry up! | 2306 | * failure risk. Hurry up! |
2307 | */ | 2307 | */ |
2308 | if (!zone_watermark_ok(zone, order, | 2308 | if (!zone_watermark_ok_safe(zone, order, |
2309 | min_wmark_pages(zone), end_zone, 0)) | 2309 | min_wmark_pages(zone), end_zone, 0)) |
2310 | has_under_min_watermark_zone = 1; | 2310 | has_under_min_watermark_zone = 1; |
2311 | } else { | 2311 | } else { |
@@ -2448,7 +2448,9 @@ static int kswapd(void *p) | |||
2448 | */ | 2448 | */ |
2449 | if (!sleeping_prematurely(pgdat, order, remaining)) { | 2449 | if (!sleeping_prematurely(pgdat, order, remaining)) { |
2450 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); | 2450 | trace_mm_vmscan_kswapd_sleep(pgdat->node_id); |
2451 | restore_pgdat_percpu_threshold(pgdat); | ||
2451 | schedule(); | 2452 | schedule(); |
2453 | reduce_pgdat_percpu_threshold(pgdat); | ||
2452 | } else { | 2454 | } else { |
2453 | if (remaining) | 2455 | if (remaining) |
2454 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); | 2456 | count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); |
@@ -2487,16 +2489,17 @@ void wakeup_kswapd(struct zone *zone, int order) | |||
2487 | if (!populated_zone(zone)) | 2489 | if (!populated_zone(zone)) |
2488 | return; | 2490 | return; |
2489 | 2491 | ||
2490 | pgdat = zone->zone_pgdat; | 2492 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) |
2491 | if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) | ||
2492 | return; | 2493 | return; |
2494 | pgdat = zone->zone_pgdat; | ||
2493 | if (pgdat->kswapd_max_order < order) | 2495 | if (pgdat->kswapd_max_order < order) |
2494 | pgdat->kswapd_max_order = order; | 2496 | pgdat->kswapd_max_order = order; |
2495 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | ||
2496 | if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) | ||
2497 | return; | ||
2498 | if (!waitqueue_active(&pgdat->kswapd_wait)) | 2497 | if (!waitqueue_active(&pgdat->kswapd_wait)) |
2499 | return; | 2498 | return; |
2499 | if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) | ||
2500 | return; | ||
2501 | |||
2502 | trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); | ||
2500 | wake_up_interruptible(&pgdat->kswapd_wait); | 2503 | wake_up_interruptible(&pgdat->kswapd_wait); |
2501 | } | 2504 | } |
2502 | 2505 | ||