aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> 2009-12-14 20:59:12 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2009-12-15 11:53:18 -0500
commit: 7b51755c3b38483b574d363d5ee587283c3f7999 (patch)
tree: 29ac6d3b73f0a28488dabff03a7df948c0333906
parent: 22fba33545b731408deab6e96b6e231ee05fd10b (diff)
vmscan: kill hibernation specific reclaim logic and unify it
shrink_all_zones() was introduced by commit d6277db4ab (swsusp: rework memory shrinker) to improve hibernation performance, and sc.swap_cluster_max was introduced by commit a06fe4d307 (Speed freeing memory for suspend).

Commit a06fe4d307 said:

  Without the patch:
  Freed 14600 pages in 1749 jiffies = 32.61 MB/s (Anomolous!)
  Freed 88563 pages in 14719 jiffies = 23.50 MB/s
  Freed 205734 pages in 32389 jiffies = 24.81 MB/s

  With the patch:
  Freed 68252 pages in 496 jiffies = 537.52 MB/s
  Freed 116464 pages in 569 jiffies = 798.54 MB/s
  Freed 209699 pages in 705 jiffies = 1161.89 MB/s

At that time, the patch was well worth it. However, modern hardware trends and recent VM improvements have eroded its value. For several reasons, I think we should remove shrink_all_zones() entirely.

Details:

1) In the old days, shrink_zone()'s slowness was mainly caused by a stupid I/O throttle that kicked in even when there was no I/O congestion. The current shrink_zone() is sane and not slow.

2) shrink_all_zones() tries to shrink all pages at once, but that does not work well on NUMA systems.

   Example: the system has 4GB of memory, each node has 2GB, and hibernation needs 1GB.
   Optimal: steal 500MB from each node.
   shrink_all_zones(): steals 1GB from node-0.

   Oh, the cache balancing logic is broken. ;) Unfortunately, desktop systems have moved to NUMA nowadays. (Side note: if hibernation requires 2GB, shrink_all_zones() never succeeds on the machine above.)

3) If a node has several pages with I/O in flight, shrink_all_zones() produces a pretty bad result.

   Scenario: hibernation needs 1GB.
   1) shrink_all_zones() tries to reclaim 1GB from Node-0,
   2) but it only reclaims 990MB.
   3) Stupidly, shrink_all_zones() then tries to reclaim 1GB from Node-1,
   4) and it reclaims 990MB.

   Oh, well. It reclaimed twice as much as required. On the other hand, the current shrink_zone() has sane bail-out logic, so it does not over-reclaim, and the over-reclaim risk of shrink_all_zones() goes away.

4) The split-LRU VM always maintains the active/inactive ratio very carefully. Shrinking only the inactive list breaks that assumption.
That creates an unnecessary OOM risk and is obviously suboptimal.

Now, shrink_all_memory() is only a wrapper around do_try_to_free_pages(). This brings good reviewability and debuggability, and solves the problems above.

Side note: unifying the reclaim logic has two good side effects.
- It fixes a recursive reclaim bug in shrink_all_memory(): it forgot to set PF_MEMALLOC, which means the system could get stuck in a deadlock.
- shrink_all_memory() is now lockdep-aware, which brings good debuggability.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--mm/vmscan.c153
1 files changed, 26 insertions, 127 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7b0d5c784c7e..63bd521bb229 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -58,6 +58,8 @@ struct scan_control {
58 /* How many pages shrink_list() should reclaim */ 58 /* How many pages shrink_list() should reclaim */
59 unsigned long nr_to_reclaim; 59 unsigned long nr_to_reclaim;
60 60
61 unsigned long hibernation_mode;
62
61 /* This context's GFP mask */ 63 /* This context's GFP mask */
62 gfp_t gfp_mask; 64 gfp_t gfp_mask;
63 65
@@ -1796,7 +1798,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1796 } 1798 }
1797 1799
1798 /* Take a nap, wait for some writeback to complete */ 1800 /* Take a nap, wait for some writeback to complete */
1799 if (sc->nr_scanned && priority < DEF_PRIORITY - 2) 1801 if (!sc->hibernation_mode && sc->nr_scanned &&
1802 priority < DEF_PRIORITY - 2)
1800 congestion_wait(BLK_RW_ASYNC, HZ/10); 1803 congestion_wait(BLK_RW_ASYNC, HZ/10);
1801 } 1804 }
1802 /* top priority shrink_zones still had more to do? don't OOM, then */ 1805 /* top priority shrink_zones still had more to do? don't OOM, then */
@@ -2336,148 +2339,44 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
2336 2339
2337#ifdef CONFIG_HIBERNATION 2340#ifdef CONFIG_HIBERNATION
2338/* 2341/*
2339 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages 2342 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
2340 * from LRU lists system-wide, for given pass and priority.
2341 *
2342 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
2343 */
2344static void shrink_all_zones(unsigned long nr_pages, int prio,
2345 int pass, struct scan_control *sc)
2346{
2347 struct zone *zone;
2348 unsigned long nr_reclaimed = 0;
2349 struct zone_reclaim_stat *reclaim_stat;
2350
2351 for_each_populated_zone(zone) {
2352 enum lru_list l;
2353
2354 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
2355 continue;
2356
2357 for_each_evictable_lru(l) {
2358 enum zone_stat_item ls = NR_LRU_BASE + l;
2359 unsigned long lru_pages = zone_page_state(zone, ls);
2360
2361 /* For pass = 0, we don't shrink the active list */
2362 if (pass == 0 && (l == LRU_ACTIVE_ANON ||
2363 l == LRU_ACTIVE_FILE))
2364 continue;
2365
2366 reclaim_stat = get_reclaim_stat(zone, sc);
2367 reclaim_stat->nr_saved_scan[l] +=
2368 (lru_pages >> prio) + 1;
2369 if (reclaim_stat->nr_saved_scan[l]
2370 >= nr_pages || pass > 3) {
2371 unsigned long nr_to_scan;
2372
2373 reclaim_stat->nr_saved_scan[l] = 0;
2374 nr_to_scan = min(nr_pages, lru_pages);
2375 nr_reclaimed += shrink_list(l, nr_to_scan, zone,
2376 sc, prio);
2377 if (nr_reclaimed >= nr_pages) {
2378 sc->nr_reclaimed += nr_reclaimed;
2379 return;
2380 }
2381 }
2382 }
2383 }
2384 sc->nr_reclaimed += nr_reclaimed;
2385}
2386
2387/*
2388 * Try to free `nr_pages' of memory, system-wide, and return the number of
2389 * freed pages. 2343 * freed pages.
2390 * 2344 *
2391 * Rather than trying to age LRUs the aim is to preserve the overall 2345 * Rather than trying to age LRUs the aim is to preserve the overall
2392 * LRU order by reclaiming preferentially 2346 * LRU order by reclaiming preferentially
2393 * inactive > active > active referenced > active mapped 2347 * inactive > active > active referenced > active mapped
2394 */ 2348 */
2395unsigned long shrink_all_memory(unsigned long nr_pages) 2349unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
2396{ 2350{
2397 unsigned long lru_pages, nr_slab;
2398 int pass;
2399 struct reclaim_state reclaim_state; 2351 struct reclaim_state reclaim_state;
2400 struct scan_control sc = { 2352 struct scan_control sc = {
2401 .gfp_mask = GFP_KERNEL, 2353 .gfp_mask = GFP_HIGHUSER_MOVABLE,
2402 .may_unmap = 0, 2354 .may_swap = 1,
2355 .may_unmap = 1,
2403 .may_writepage = 1, 2356 .may_writepage = 1,
2357 .swap_cluster_max = SWAP_CLUSTER_MAX,
2358 .nr_to_reclaim = nr_to_reclaim,
2359 .hibernation_mode = 1,
2360 .swappiness = vm_swappiness,
2361 .order = 0,
2404 .isolate_pages = isolate_pages_global, 2362 .isolate_pages = isolate_pages_global,
2405 .nr_reclaimed = 0,
2406 }; 2363 };
2364 struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
2365 struct task_struct *p = current;
2366 unsigned long nr_reclaimed;
2407 2367
2408 current->reclaim_state = &reclaim_state; 2368 p->flags |= PF_MEMALLOC;
2409 2369 lockdep_set_current_reclaim_state(sc.gfp_mask);
2410 lru_pages = global_reclaimable_pages(); 2370 reclaim_state.reclaimed_slab = 0;
2411 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 2371 p->reclaim_state = &reclaim_state;
2412 /* If slab caches are huge, it's better to hit them first */
2413 while (nr_slab >= lru_pages) {
2414 reclaim_state.reclaimed_slab = 0;
2415 shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
2416 if (!reclaim_state.reclaimed_slab)
2417 break;
2418
2419 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2420 if (sc.nr_reclaimed >= nr_pages)
2421 goto out;
2422
2423 nr_slab -= reclaim_state.reclaimed_slab;
2424 }
2425
2426 /*
2427 * We try to shrink LRUs in 5 passes:
2428 * 0 = Reclaim from inactive_list only
2429 * 1 = Reclaim from active list but don't reclaim mapped
2430 * 2 = 2nd pass of type 1
2431 * 3 = Reclaim mapped (normal reclaim)
2432 * 4 = 2nd pass of type 3
2433 */
2434 for (pass = 0; pass < 5; pass++) {
2435 int prio;
2436
2437 /* Force reclaiming mapped pages in the passes #3 and #4 */
2438 if (pass > 2)
2439 sc.may_unmap = 1;
2440
2441 for (prio = DEF_PRIORITY; prio >= 0; prio--) {
2442 unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed;
2443
2444 sc.nr_scanned = 0;
2445 sc.swap_cluster_max = nr_to_scan;
2446 shrink_all_zones(nr_to_scan, prio, pass, &sc);
2447 if (sc.nr_reclaimed >= nr_pages)
2448 goto out;
2449
2450 reclaim_state.reclaimed_slab = 0;
2451 shrink_slab(sc.nr_scanned, sc.gfp_mask,
2452 global_reclaimable_pages());
2453 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2454 if (sc.nr_reclaimed >= nr_pages)
2455 goto out;
2456
2457 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
2458 congestion_wait(BLK_RW_ASYNC, HZ / 10);
2459 }
2460 }
2461
2462 /*
2463 * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be
2464 * something in slab caches
2465 */
2466 if (!sc.nr_reclaimed) {
2467 do {
2468 reclaim_state.reclaimed_slab = 0;
2469 shrink_slab(nr_pages, sc.gfp_mask,
2470 global_reclaimable_pages());
2471 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2472 } while (sc.nr_reclaimed < nr_pages &&
2473 reclaim_state.reclaimed_slab > 0);
2474 }
2475 2372
2373 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2476 2374
2477out: 2375 p->reclaim_state = NULL;
2478 current->reclaim_state = NULL; 2376 lockdep_clear_current_reclaim_state();
2377 p->flags &= ~PF_MEMALLOC;
2479 2378
2480 return sc.nr_reclaimed; 2379 return nr_reclaimed;
2481} 2380}
2482#endif /* CONFIG_HIBERNATION */ 2381#endif /* CONFIG_HIBERNATION */
2483 2382