diff options
author | KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> | 2009-12-14 20:59:12 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-12-15 11:53:18 -0500 |
commit | 7b51755c3b38483b574d363d5ee587283c3f7999 (patch) | |
tree | 29ac6d3b73f0a28488dabff03a7df948c0333906 | |
parent | 22fba33545b731408deab6e96b6e231ee05fd10b (diff) |
vmscan: kill hibernation specific reclaim logic and unify it
shrink_all_zones() was introduced by commit d6277db4ab (swsusp: rework
memory shrinker) to improve hibernation performance, and
sc.swap_cluster_max was introduced by commit a06fe4d307 (Speed freeing
memory for suspend).
Commit a06fe4d307 said:
Without the patch:
Freed 14600 pages in 1749 jiffies = 32.61 MB/s (Anomolous!)
Freed 88563 pages in 14719 jiffies = 23.50 MB/s
Freed 205734 pages in 32389 jiffies = 24.81 MB/s
With the patch:
Freed 68252 pages in 496 jiffies = 537.52 MB/s
Freed 116464 pages in 569 jiffies = 798.54 MB/s
Freed 209699 pages in 705 jiffies = 1161.89 MB/s
At that time, their patch was well worth having. However, modern hardware
trends and recent VM improvements have eroded its value. For several
reasons, I think we should remove shrink_all_zones() altogether.
Details:
1) In the old days, shrink_zone()'s slowness was mainly caused by needless
I/O throttling even when there was no I/O congestion.
The current shrink_zone() is sane and not slow.
2) shrink_all_zones() tries to shrink all pages at once, but that does not
work well on NUMA systems.
example)
The system has 4GB of memory, each node has 2GB, and hibernation needs 1GB.
optimal)
steal 500MB from each node.
shrink_all_zones)
steal 1GB from node-0.
Oh, the cache balancing logic is broken. ;)
Unfortunately, desktop systems have moved to NUMA nowadays.
(Side note: if hibernation requires 2GB, shrink_all_zones() never succeeds
on the above machine.)
3) If a node has several pages with I/O in flight, shrink_all_zones()
produces a pretty bad result.
scenario) hibernation needs 1GB
1) shrink_all_zones() tries to reclaim 1GB from Node-0
2) but it only reclaims 990MB
3) stupidly, shrink_all_zones() then tries to reclaim 1GB from Node-1
4) it reclaims 990MB
Oh, well. It reclaimed twice as much as required.
On the other hand, the current shrink_zone() has sane bail-out logic, so
it does not over-reclaim, and the risk formerly associated with shrink_zones() is gone.
4) The split-LRU VM always maintains the active/inactive ratio very carefully. Shrinking
only the inactive list breaks that assumption and creates an unnecessary OOM risk. It is obviously suboptimal.
Now shrink_all_memory() is only a wrapper around do_try_to_free_pages().
This brings good reviewability and debuggability, and solves the above problems.
Side note: unifying the reclaim logic has two good side effects.
- It fixes a recursive reclaim bug in shrink_all_memory().
It forgot to set PF_MEMALLOC, which means the system could get stuck in a deadlock.
- shrink_all_memory() is now lockdep-aware, which brings good debuggability.
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | mm/vmscan.c | 153 |
1 files changed, 26 insertions, 127 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index 7b0d5c784c7e..63bd521bb229 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -58,6 +58,8 @@ struct scan_control { | |||
58 | /* How many pages shrink_list() should reclaim */ | 58 | /* How many pages shrink_list() should reclaim */ |
59 | unsigned long nr_to_reclaim; | 59 | unsigned long nr_to_reclaim; |
60 | 60 | ||
61 | unsigned long hibernation_mode; | ||
62 | |||
61 | /* This context's GFP mask */ | 63 | /* This context's GFP mask */ |
62 | gfp_t gfp_mask; | 64 | gfp_t gfp_mask; |
63 | 65 | ||
@@ -1796,7 +1798,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, | |||
1796 | } | 1798 | } |
1797 | 1799 | ||
1798 | /* Take a nap, wait for some writeback to complete */ | 1800 | /* Take a nap, wait for some writeback to complete */ |
1799 | if (sc->nr_scanned && priority < DEF_PRIORITY - 2) | 1801 | if (!sc->hibernation_mode && sc->nr_scanned && |
1802 | priority < DEF_PRIORITY - 2) | ||
1800 | congestion_wait(BLK_RW_ASYNC, HZ/10); | 1803 | congestion_wait(BLK_RW_ASYNC, HZ/10); |
1801 | } | 1804 | } |
1802 | /* top priority shrink_zones still had more to do? don't OOM, then */ | 1805 | /* top priority shrink_zones still had more to do? don't OOM, then */ |
@@ -2336,148 +2339,44 @@ unsigned long zone_reclaimable_pages(struct zone *zone) | |||
2336 | 2339 | ||
2337 | #ifdef CONFIG_HIBERNATION | 2340 | #ifdef CONFIG_HIBERNATION |
2338 | /* | 2341 | /* |
2339 | * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages | 2342 | * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of |
2340 | * from LRU lists system-wide, for given pass and priority. | ||
2341 | * | ||
2342 | * For pass > 3 we also try to shrink the LRU lists that contain a few pages | ||
2343 | */ | ||
2344 | static void shrink_all_zones(unsigned long nr_pages, int prio, | ||
2345 | int pass, struct scan_control *sc) | ||
2346 | { | ||
2347 | struct zone *zone; | ||
2348 | unsigned long nr_reclaimed = 0; | ||
2349 | struct zone_reclaim_stat *reclaim_stat; | ||
2350 | |||
2351 | for_each_populated_zone(zone) { | ||
2352 | enum lru_list l; | ||
2353 | |||
2354 | if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) | ||
2355 | continue; | ||
2356 | |||
2357 | for_each_evictable_lru(l) { | ||
2358 | enum zone_stat_item ls = NR_LRU_BASE + l; | ||
2359 | unsigned long lru_pages = zone_page_state(zone, ls); | ||
2360 | |||
2361 | /* For pass = 0, we don't shrink the active list */ | ||
2362 | if (pass == 0 && (l == LRU_ACTIVE_ANON || | ||
2363 | l == LRU_ACTIVE_FILE)) | ||
2364 | continue; | ||
2365 | |||
2366 | reclaim_stat = get_reclaim_stat(zone, sc); | ||
2367 | reclaim_stat->nr_saved_scan[l] += | ||
2368 | (lru_pages >> prio) + 1; | ||
2369 | if (reclaim_stat->nr_saved_scan[l] | ||
2370 | >= nr_pages || pass > 3) { | ||
2371 | unsigned long nr_to_scan; | ||
2372 | |||
2373 | reclaim_stat->nr_saved_scan[l] = 0; | ||
2374 | nr_to_scan = min(nr_pages, lru_pages); | ||
2375 | nr_reclaimed += shrink_list(l, nr_to_scan, zone, | ||
2376 | sc, prio); | ||
2377 | if (nr_reclaimed >= nr_pages) { | ||
2378 | sc->nr_reclaimed += nr_reclaimed; | ||
2379 | return; | ||
2380 | } | ||
2381 | } | ||
2382 | } | ||
2383 | } | ||
2384 | sc->nr_reclaimed += nr_reclaimed; | ||
2385 | } | ||
2386 | |||
2387 | /* | ||
2388 | * Try to free `nr_pages' of memory, system-wide, and return the number of | ||
2389 | * freed pages. | 2343 | * freed pages. |
2390 | * | 2344 | * |
2391 | * Rather than trying to age LRUs the aim is to preserve the overall | 2345 | * Rather than trying to age LRUs the aim is to preserve the overall |
2392 | * LRU order by reclaiming preferentially | 2346 | * LRU order by reclaiming preferentially |
2393 | * inactive > active > active referenced > active mapped | 2347 | * inactive > active > active referenced > active mapped |
2394 | */ | 2348 | */ |
2395 | unsigned long shrink_all_memory(unsigned long nr_pages) | 2349 | unsigned long shrink_all_memory(unsigned long nr_to_reclaim) |
2396 | { | 2350 | { |
2397 | unsigned long lru_pages, nr_slab; | ||
2398 | int pass; | ||
2399 | struct reclaim_state reclaim_state; | 2351 | struct reclaim_state reclaim_state; |
2400 | struct scan_control sc = { | 2352 | struct scan_control sc = { |
2401 | .gfp_mask = GFP_KERNEL, | 2353 | .gfp_mask = GFP_HIGHUSER_MOVABLE, |
2402 | .may_unmap = 0, | 2354 | .may_swap = 1, |
2355 | .may_unmap = 1, | ||
2403 | .may_writepage = 1, | 2356 | .may_writepage = 1, |
2357 | .swap_cluster_max = SWAP_CLUSTER_MAX, | ||
2358 | .nr_to_reclaim = nr_to_reclaim, | ||
2359 | .hibernation_mode = 1, | ||
2360 | .swappiness = vm_swappiness, | ||
2361 | .order = 0, | ||
2404 | .isolate_pages = isolate_pages_global, | 2362 | .isolate_pages = isolate_pages_global, |
2405 | .nr_reclaimed = 0, | ||
2406 | }; | 2363 | }; |
2364 | struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); | ||
2365 | struct task_struct *p = current; | ||
2366 | unsigned long nr_reclaimed; | ||
2407 | 2367 | ||
2408 | current->reclaim_state = &reclaim_state; | 2368 | p->flags |= PF_MEMALLOC; |
2409 | 2369 | lockdep_set_current_reclaim_state(sc.gfp_mask); | |
2410 | lru_pages = global_reclaimable_pages(); | 2370 | reclaim_state.reclaimed_slab = 0; |
2411 | nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); | 2371 | p->reclaim_state = &reclaim_state; |
2412 | /* If slab caches are huge, it's better to hit them first */ | ||
2413 | while (nr_slab >= lru_pages) { | ||
2414 | reclaim_state.reclaimed_slab = 0; | ||
2415 | shrink_slab(nr_pages, sc.gfp_mask, lru_pages); | ||
2416 | if (!reclaim_state.reclaimed_slab) | ||
2417 | break; | ||
2418 | |||
2419 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | ||
2420 | if (sc.nr_reclaimed >= nr_pages) | ||
2421 | goto out; | ||
2422 | |||
2423 | nr_slab -= reclaim_state.reclaimed_slab; | ||
2424 | } | ||
2425 | |||
2426 | /* | ||
2427 | * We try to shrink LRUs in 5 passes: | ||
2428 | * 0 = Reclaim from inactive_list only | ||
2429 | * 1 = Reclaim from active list but don't reclaim mapped | ||
2430 | * 2 = 2nd pass of type 1 | ||
2431 | * 3 = Reclaim mapped (normal reclaim) | ||
2432 | * 4 = 2nd pass of type 3 | ||
2433 | */ | ||
2434 | for (pass = 0; pass < 5; pass++) { | ||
2435 | int prio; | ||
2436 | |||
2437 | /* Force reclaiming mapped pages in the passes #3 and #4 */ | ||
2438 | if (pass > 2) | ||
2439 | sc.may_unmap = 1; | ||
2440 | |||
2441 | for (prio = DEF_PRIORITY; prio >= 0; prio--) { | ||
2442 | unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; | ||
2443 | |||
2444 | sc.nr_scanned = 0; | ||
2445 | sc.swap_cluster_max = nr_to_scan; | ||
2446 | shrink_all_zones(nr_to_scan, prio, pass, &sc); | ||
2447 | if (sc.nr_reclaimed >= nr_pages) | ||
2448 | goto out; | ||
2449 | |||
2450 | reclaim_state.reclaimed_slab = 0; | ||
2451 | shrink_slab(sc.nr_scanned, sc.gfp_mask, | ||
2452 | global_reclaimable_pages()); | ||
2453 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | ||
2454 | if (sc.nr_reclaimed >= nr_pages) | ||
2455 | goto out; | ||
2456 | |||
2457 | if (sc.nr_scanned && prio < DEF_PRIORITY - 2) | ||
2458 | congestion_wait(BLK_RW_ASYNC, HZ / 10); | ||
2459 | } | ||
2460 | } | ||
2461 | |||
2462 | /* | ||
2463 | * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be | ||
2464 | * something in slab caches | ||
2465 | */ | ||
2466 | if (!sc.nr_reclaimed) { | ||
2467 | do { | ||
2468 | reclaim_state.reclaimed_slab = 0; | ||
2469 | shrink_slab(nr_pages, sc.gfp_mask, | ||
2470 | global_reclaimable_pages()); | ||
2471 | sc.nr_reclaimed += reclaim_state.reclaimed_slab; | ||
2472 | } while (sc.nr_reclaimed < nr_pages && | ||
2473 | reclaim_state.reclaimed_slab > 0); | ||
2474 | } | ||
2475 | 2372 | ||
2373 | nr_reclaimed = do_try_to_free_pages(zonelist, &sc); | ||
2476 | 2374 | ||
2477 | out: | 2375 | p->reclaim_state = NULL; |
2478 | current->reclaim_state = NULL; | 2376 | lockdep_clear_current_reclaim_state(); |
2377 | p->flags &= ~PF_MEMALLOC; | ||
2479 | 2378 | ||
2480 | return sc.nr_reclaimed; | 2379 | return nr_reclaimed; |
2481 | } | 2380 | } |
2482 | #endif /* CONFIG_HIBERNATION */ | 2381 | #endif /* CONFIG_HIBERNATION */ |
2483 | 2382 | ||