author     Rafael J. Wysocki <rjw@sisk.pl>         2006-06-23 05:03:18 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>   2006-06-23 10:42:48 -0400
commit     d6277db4ab271862ed599da08d78961c70f00002
tree       f11b2f82200c95d17e10779b44a6da37bc03965f
parent     7a7c381d25067b9a2bfe025dfcb16459daec0373
[PATCH] swsusp: rework memory shrinker
Rework swsusp's memory shrinker in the following way:
- Simplify balance_pgdat() by removing all of the swsusp-related code
from it.
- Make shrink_all_memory() use shrink_slab() and a new function
shrink_all_zones() which calls shrink_active_list() and
shrink_inactive_list() directly for each zone in a way that's optimized
for suspend.
In shrink_all_memory() we try to free exactly as many pages as the caller
asks for, preferably in one shot, starting from the easiest targets.  If the
slab caches are huge, they are the most likely to have enough pages to
reclaim, so they are shrunk first.  The inactive lists come next (zones with
more inactive pages go first), and so on.
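
As a rough illustration, the first stage of that ordering, condensed from the
new shrink_all_memory() in the mm/vmscan.c hunk below (error handling and the
per-zone bookkeeping are omitted; this is a sketch, not the literal patch
code):

	/* Hit the slab caches first while they dwarf the LRU lists. */
	lru_pages = 0;
	for_each_zone(zone)
		lru_pages += zone->nr_active + zone->nr_inactive;

	nr_slab = read_page_state(nr_slab);
	while (nr_slab >= lru_pages) {
		reclaim_state.reclaimed_slab = 0;
		shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
		if (!reclaim_state.reclaimed_slab)
			break;
		ret += reclaim_state.reclaimed_slab;
		if (ret >= nr_pages)
			goto out;
		nr_slab -= reclaim_state.reclaimed_slab;
	}
	/* Then fall through to the LRU passes described below. */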
Each time it is called, shrink_all_memory() attempts to shrink the active and
inactive lists of each zone in five passes.  In the first pass, only the
inactive lists are considered.  In the next two passes the active lists are
also shrunk, but mapped pages are not reclaimed.  In the last two passes the
active and inactive lists are shrunk and mapped pages are reclaimed as well.
The aim is to alter the reclaim logic so that the best pages are kept in
memory for resume, which improves the responsiveness of the resumed system.
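
The pass loop itself, again condensed from the new shrink_all_memory() in the
mm/vmscan.c hunk below (slab shrinking and the congestion waits between
priority levels are left out of this sketch):

	/*
	 * We try to shrink LRUs in 5 passes:
	 * 0 = Reclaim from inactive_list only
	 * 1 = Reclaim from active list but don't reclaim mapped
	 * 2 = 2nd pass of type 1
	 * 3 = Reclaim mapped (normal reclaim)
	 * 4 = 2nd pass of type 3
	 */
	for (pass = 0; pass < 5; pass++) {
		int prio;

		/* Force reclaiming mapped pages in the passes #3 and #4 */
		if (pass > 2) {
			sc.may_swap = 1;
			sc.swappiness = 100;
		}

		for (prio = DEF_PRIORITY; prio >= 0; prio--) {
			ret += shrink_all_zones(nr_pages - ret, prio, pass, &sc);
			if (ret >= nr_pages)
				goto out;
		}
	}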
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--   kernel/power/swsusp.c |  10
-rw-r--r--   mm/vmscan.c           | 219
2 files changed, 172 insertions, 57 deletions
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index c4016cbbd3e0..f9238faf76e4 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -175,6 +175,12 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
  */

 #define SHRINK_BITE 10000
+static inline unsigned long __shrink_memory(long tmp)
+{
+	if (tmp > SHRINK_BITE)
+		tmp = SHRINK_BITE;
+	return shrink_all_memory(tmp);
+}

 int swsusp_shrink_memory(void)
 {
@@ -195,12 +201,12 @@ int swsusp_shrink_memory(void)
 			if (!is_highmem(zone))
 				tmp -= zone->free_pages;
 		if (tmp > 0) {
-			tmp = shrink_all_memory(SHRINK_BITE);
+			tmp = __shrink_memory(tmp);
 			if (!tmp)
 				return -ENOMEM;
 			pages += tmp;
 		} else if (size > image_size / PAGE_SIZE) {
-			tmp = shrink_all_memory(SHRINK_BITE);
+			tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
 			pages += tmp;
 		}
 		printk("\b%c", p[i++%4]);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 440a733fe2e9..46be8a02280e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -61,6 +61,8 @@ struct scan_control {
 	 * In this context, it doesn't matter that we scan the
 	 * whole list at once. */
 	int swap_cluster_max;
+
+	int swappiness;
 };

 /*
@@ -741,7 +743,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		 * A 100% value of vm_swappiness overrides this algorithm
 		 * altogether.
 		 */
-		swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+		swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;

 		/*
 		 * Now use this metric to decide whether to start moving mapped
@@ -957,6 +959,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		.may_writepage = !laptop_mode,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.may_swap = 1,
+		.swappiness = vm_swappiness,
 	};

 	inc_page_state(allocstall);
@@ -1021,10 +1024,6 @@ out:
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
  *
- * If `nr_pages' is non-zero then it is the number of pages which are to be
- * reclaimed, regardless of the zone occupancies.  This is a software suspend
- * special.
- *
  * Returns the number of pages which were actually freed.
  *
  * There is special handling here for zones which are full of pinned pages.
@@ -1042,10 +1041,8 @@ out:
  * the page allocator fallback scheme to ensure that aging of pages is balanced
  * across the zones.
  */
-static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
-				int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 {
-	unsigned long to_free = nr_pages;
 	int all_zones_ok;
 	int priority;
 	int i;
@@ -1055,7 +1052,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.may_swap = 1,
-		.swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.swappiness = vm_swappiness,
 	};

 loop_again:
@@ -1082,31 +1080,26 @@ loop_again:

 		all_zones_ok = 1;

-		if (nr_pages == 0) {
-			/*
-			 * Scan in the highmem->dma direction for the highest
-			 * zone which needs scanning
-			 */
-			for (i = pgdat->nr_zones - 1; i >= 0; i--) {
-				struct zone *zone = pgdat->node_zones + i;
+		/*
+		 * Scan in the highmem->dma direction for the highest
+		 * zone which needs scanning
+		 */
+		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+			struct zone *zone = pgdat->node_zones + i;

-				if (!populated_zone(zone))
-					continue;
+			if (!populated_zone(zone))
+				continue;

-				if (zone->all_unreclaimable &&
-				    priority != DEF_PRIORITY)
-					continue;
+			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+				continue;

-				if (!zone_watermark_ok(zone, order,
-						zone->pages_high, 0, 0)) {
-					end_zone = i;
-					goto scan;
-				}
+			if (!zone_watermark_ok(zone, order, zone->pages_high,
+					       0, 0)) {
+				end_zone = i;
+				goto scan;
 			}
-			goto out;
-		} else {
-			end_zone = pgdat->nr_zones - 1;
 		}
+		goto out;
 scan:
 	for (i = 0; i <= end_zone; i++) {
 		struct zone *zone = pgdat->node_zones + i;
@@ -1133,11 +1126,9 @@ scan:
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;

-			if (nr_pages == 0) {	/* Not software suspend */
-				if (!zone_watermark_ok(zone, order,
-						zone->pages_high, end_zone, 0))
-					all_zones_ok = 0;
-			}
+			if (!zone_watermark_ok(zone, order, zone->pages_high,
+					       end_zone, 0))
+				all_zones_ok = 0;
 			zone->temp_priority = priority;
 			if (zone->prev_priority > priority)
 				zone->prev_priority = priority;
@@ -1162,8 +1153,6 @@ scan:
 			    total_scanned > nr_reclaimed + nr_reclaimed / 2)
 				sc.may_writepage = 1;
 		}
-		if (nr_pages && to_free > nr_reclaimed)
-			continue;	/* swsusp: need to do more work */
 		if (all_zones_ok)
 			break;		/* kswapd: all done */
 		/*
@@ -1179,7 +1168,7 @@ scan:
 		 * matches the direct reclaim path behaviour in terms of impact
 		 * on zone->*_priority.
 		 */
-		if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
+		if (nr_reclaimed >= SWAP_CLUSTER_MAX)
 			break;
 	}
 out:
@@ -1261,7 +1250,7 @@ static int kswapd(void *p)
 		}
 		finish_wait(&pgdat->kswapd_wait, &wait);

-		balance_pgdat(pgdat, 0, order);
+		balance_pgdat(pgdat, order);
 	}
 	return 0;
 }
@@ -1290,35 +1279,154 @@ void wakeup_kswapd(struct zone *zone, int order)

 #ifdef CONFIG_PM
 /*
- * Try to free `nr_pages' of memory, system-wide.  Returns the number of freed
- * pages.
+ * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
+ * from LRU lists system-wide, for given pass and priority, and returns the
+ * number of reclaimed pages
+ *
+ * For pass > 3 we also try to shrink the LRU lists that contain a few pages
+ */
+static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
+				      int prio, struct scan_control *sc)
+{
+	struct zone *zone;
+	unsigned long nr_to_scan, ret = 0;
+
+	for_each_zone(zone) {
+
+		if (!populated_zone(zone))
+			continue;
+
+		if (zone->all_unreclaimable && prio != DEF_PRIORITY)
+			continue;
+
+		/* For pass = 0 we don't shrink the active list */
+		if (pass > 0) {
+			zone->nr_scan_active += (zone->nr_active >> prio) + 1;
+			if (zone->nr_scan_active >= nr_pages || pass > 3) {
+				zone->nr_scan_active = 0;
+				nr_to_scan = min(nr_pages, zone->nr_active);
+				shrink_active_list(nr_to_scan, zone, sc);
+			}
+		}
+
+		zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
+		if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
+			zone->nr_scan_inactive = 0;
+			nr_to_scan = min(nr_pages, zone->nr_inactive);
+			ret += shrink_inactive_list(nr_to_scan, zone, sc);
+			if (ret >= nr_pages)
+				return ret;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Try to free `nr_pages' of memory, system-wide, and return the number of
+ * freed pages.
+ *
+ * Rather than trying to age LRUs the aim is to preserve the overall
+ * LRU order by reclaiming preferentially
+ * inactive > active > active referenced > active mapped
  */
 unsigned long shrink_all_memory(unsigned long nr_pages)
 {
-	pg_data_t *pgdat;
-	unsigned long nr_to_free = nr_pages;
+	unsigned long lru_pages, nr_slab;
 	unsigned long ret = 0;
-	unsigned retry = 2;
-	struct reclaim_state reclaim_state = {
-		.reclaimed_slab = 0,
+	int pass;
+	struct reclaim_state reclaim_state;
+	struct zone *zone;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.may_swap = 0,
+		.swap_cluster_max = nr_pages,
+		.may_writepage = 1,
+		.swappiness = vm_swappiness,
 	};

 	current->reclaim_state = &reclaim_state;
-repeat:
-	for_each_online_pgdat(pgdat) {
-		unsigned long freed;

-		freed = balance_pgdat(pgdat, nr_to_free, 0);
-		ret += freed;
-		nr_to_free -= freed;
-		if ((long)nr_to_free <= 0)
+	lru_pages = 0;
+	for_each_zone(zone)
+		lru_pages += zone->nr_active + zone->nr_inactive;
+
+	nr_slab = read_page_state(nr_slab);
+	/* If slab caches are huge, it's better to hit them first */
+	while (nr_slab >= lru_pages) {
+		reclaim_state.reclaimed_slab = 0;
+		shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+		if (!reclaim_state.reclaimed_slab)
 			break;
+
+		ret += reclaim_state.reclaimed_slab;
+		if (ret >= nr_pages)
+			goto out;
+
+		nr_slab -= reclaim_state.reclaimed_slab;
 	}
-	if (retry-- && ret < nr_pages) {
-		blk_congestion_wait(WRITE, HZ/5);
-		goto repeat;
+
+	/*
+	 * We try to shrink LRUs in 5 passes:
+	 * 0 = Reclaim from inactive_list only
+	 * 1 = Reclaim from active list but don't reclaim mapped
+	 * 2 = 2nd pass of type 1
+	 * 3 = Reclaim mapped (normal reclaim)
+	 * 4 = 2nd pass of type 3
+	 */
+	for (pass = 0; pass < 5; pass++) {
+		int prio;
+
+		/* Needed for shrinking slab caches later on */
+		if (!lru_pages)
+			for_each_zone(zone) {
+				lru_pages += zone->nr_active;
+				lru_pages += zone->nr_inactive;
+			}
+
+		/* Force reclaiming mapped pages in the passes #3 and #4 */
+		if (pass > 2) {
+			sc.may_swap = 1;
+			sc.swappiness = 100;
+		}
+
+		for (prio = DEF_PRIORITY; prio >= 0; prio--) {
+			unsigned long nr_to_scan = nr_pages - ret;
+
+			sc.nr_mapped = read_page_state(nr_mapped);
+			sc.nr_scanned = 0;
+
+			ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
+			if (ret >= nr_pages)
+				goto out;
+
+			reclaim_state.reclaimed_slab = 0;
+			shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
+			ret += reclaim_state.reclaimed_slab;
+			if (ret >= nr_pages)
+				goto out;
+
+			if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
+				blk_congestion_wait(WRITE, HZ / 10);
+		}
+
+		lru_pages = 0;
 	}
+
+	/*
+	 * If ret = 0, we could not shrink LRUs, but there may be something
+	 * in slab caches
+	 */
+	if (!ret)
+		do {
+			reclaim_state.reclaimed_slab = 0;
+			shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+			ret += reclaim_state.reclaimed_slab;
+		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+
+out:
 	current->reclaim_state = NULL;
+
 	return ret;
 }
 #endif
@@ -1416,6 +1524,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.swap_cluster_max = max_t(unsigned long, nr_pages,
 					SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
+		.swappiness = vm_swappiness,
 	};

 	disable_swap_token();