author	Rafael J. Wysocki <rjw@sisk.pl>	2006-06-23 05:03:18 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-06-23 10:42:48 -0400
commit	d6277db4ab271862ed599da08d78961c70f00002 (patch)
tree	f11b2f82200c95d17e10779b44a6da37bc03965f
parent	7a7c381d25067b9a2bfe025dfcb16459daec0373 (diff)
[PATCH] swsusp: rework memory shrinker
Rework swsusp's memory shrinker in the following way:

- Simplify balance_pgdat() by removing all of the swsusp-related code from it.

- Make shrink_all_memory() use shrink_slab() and a new function shrink_all_zones() which calls shrink_active_list() and shrink_inactive_list() directly for each zone in a way that's optimized for suspend.

In shrink_all_memory() we try to free exactly as many pages as the caller asks for, preferably in one shot, starting from easier targets. If slab caches are huge, they are most likely to have enough pages to reclaim. The inactive lists are next (the zones with more inactive pages go first), etc.

Each time, shrink_all_memory() attempts to shrink the active and inactive lists for each zone in 5 passes. In the first pass, only the inactive lists are taken into consideration. In the next two passes the active lists are also shrunk, but mapped pages are not reclaimed. In the last two passes the active and inactive lists are shrunk and mapped pages are reclaimed as well.

The aim of this is to alter the reclaim logic to choose the best pages to keep on resume and to improve the responsiveness of the resumed system.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--	kernel/power/swsusp.c	10
-rw-r--r--	mm/vmscan.c	219
2 files changed, 172 insertions(+), 57 deletions(-)
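As a rough illustration of the reclaim ordering described in the commit message, the standalone userspace sketch below simulates the order in which memory pools are drained. It is not part of the patch: the pool names, sizes, flat counters and the reclaim_from() helper are invented for the example, whereas the real code works on per-zone LRU lists via shrink_all_zones() and on slab caches via shrink_slab().

/* Illustrative sketch only -- pools and sizes are made up for the example. */
#include <stdio.h>

struct pools {
	unsigned long slab;
	unsigned long inactive;
	unsigned long active_unmapped;
	unsigned long active_mapped;
};

/* Take up to 'want' pages from *pool and return how many were taken. */
static unsigned long reclaim_from(unsigned long *pool, unsigned long want)
{
	unsigned long got = (*pool < want) ? *pool : want;

	*pool -= got;
	return got;
}

static unsigned long shrink_all_memory_sim(struct pools *p, unsigned long nr_pages)
{
	unsigned long lru = p->inactive + p->active_unmapped + p->active_mapped;
	unsigned long ret = 0;
	int pass;

	/* If slab caches are huge, they are the easiest target: hit them first. */
	if (p->slab >= lru)
		ret += reclaim_from(&p->slab, nr_pages);

	/*
	 * Five passes over the LRU pools, easiest targets first:
	 * pass 0 touches only the inactive pool, passes 1-2 add the
	 * active (unmapped) pool, passes 3-4 reclaim mapped pages too.
	 */
	for (pass = 0; pass < 5 && ret < nr_pages; pass++) {
		ret += reclaim_from(&p->inactive, nr_pages - ret);
		if (pass >= 1 && ret < nr_pages)
			ret += reclaim_from(&p->active_unmapped, nr_pages - ret);
		if (pass >= 3 && ret < nr_pages)
			ret += reclaim_from(&p->active_mapped, nr_pages - ret);
	}
	return ret;
}

int main(void)
{
	struct pools p = { .slab = 1500, .inactive = 200,
			   .active_unmapped = 300, .active_mapped = 400 };
	unsigned long freed = shrink_all_memory_sim(&p, 2000);

	printf("freed %lu pages; left: slab=%lu inactive=%lu active=%lu mapped=%lu\n",
	       freed, p.slab, p.inactive, p.active_unmapped, p.active_mapped);
	return 0;
}

With these numbers the slab pool is larger than the combined LRU pools, so it is drained first, and the remaining deficit is covered by the inactive and then the unmapped active pool; mapped pages are left untouched because the target is reached before pass 3.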
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index c4016cbbd3e0..f9238faf76e4 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -175,6 +175,12 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
  */
 
 #define SHRINK_BITE 10000
+static inline unsigned long __shrink_memory(long tmp)
+{
+	if (tmp > SHRINK_BITE)
+		tmp = SHRINK_BITE;
+	return shrink_all_memory(tmp);
+}
 
 int swsusp_shrink_memory(void)
 {
@@ -195,12 +201,12 @@ int swsusp_shrink_memory(void)
 			if (!is_highmem(zone))
 				tmp -= zone->free_pages;
 		if (tmp > 0) {
-			tmp = shrink_all_memory(SHRINK_BITE);
+			tmp = __shrink_memory(tmp);
 			if (!tmp)
 				return -ENOMEM;
 			pages += tmp;
 		} else if (size > image_size / PAGE_SIZE) {
-			tmp = shrink_all_memory(SHRINK_BITE);
+			tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
 			pages += tmp;
 		}
 		printk("\b%c", p[i++%4]);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 440a733fe2e9..46be8a02280e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -61,6 +61,8 @@ struct scan_control {
 	 * In this context, it doesn't matter that we scan the
 	 * whole list at once. */
 	int swap_cluster_max;
+
+	int swappiness;
 };
 
 /*
@@ -741,7 +743,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	 * A 100% value of vm_swappiness overrides this algorithm
 	 * altogether.
 	 */
-	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+	swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
 
 	/*
 	 * Now use this metric to decide whether to start moving mapped
@@ -957,6 +959,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		.may_writepage = !laptop_mode,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.may_swap = 1,
+		.swappiness = vm_swappiness,
 	};
 
 	inc_page_state(allocstall);
@@ -1021,10 +1024,6 @@ out:
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
  *
- * If `nr_pages' is non-zero then it is the number of pages which are to be
- * reclaimed, regardless of the zone occupancies. This is a software suspend
- * special.
- *
  * Returns the number of pages which were actually freed.
  *
  * There is special handling here for zones which are full of pinned pages.
@@ -1042,10 +1041,8 @@ out:
  * the page allocator fallback scheme to ensure that aging of pages is balanced
  * across the zones.
  */
-static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
-				int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 {
-	unsigned long to_free = nr_pages;
 	int all_zones_ok;
 	int priority;
 	int i;
@@ -1055,7 +1052,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.may_swap = 1,
-		.swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.swappiness = vm_swappiness,
 	};
 
 loop_again:
@@ -1082,31 +1080,26 @@ loop_again:
 
 		all_zones_ok = 1;
 
-		if (nr_pages == 0) {
-			/*
-			 * Scan in the highmem->dma direction for the highest
-			 * zone which needs scanning
-			 */
-			for (i = pgdat->nr_zones - 1; i >= 0; i--) {
-				struct zone *zone = pgdat->node_zones + i;
+		/*
+		 * Scan in the highmem->dma direction for the highest
+		 * zone which needs scanning
+		 */
+		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+			struct zone *zone = pgdat->node_zones + i;
 
-				if (!populated_zone(zone))
-					continue;
+			if (!populated_zone(zone))
+				continue;
 
-				if (zone->all_unreclaimable &&
-						priority != DEF_PRIORITY)
-					continue;
+			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+				continue;
 
-				if (!zone_watermark_ok(zone, order,
-						zone->pages_high, 0, 0)) {
-					end_zone = i;
-					goto scan;
-				}
+			if (!zone_watermark_ok(zone, order, zone->pages_high,
+					       0, 0)) {
+				end_zone = i;
+				goto scan;
 			}
-			goto out;
-		} else {
-			end_zone = pgdat->nr_zones - 1;
 		}
+		goto out;
 scan:
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
@@ -1133,11 +1126,9 @@ scan:
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;
 
-			if (nr_pages == 0) {	/* Not software suspend */
-				if (!zone_watermark_ok(zone, order,
-						zone->pages_high, end_zone, 0))
-					all_zones_ok = 0;
-			}
+			if (!zone_watermark_ok(zone, order, zone->pages_high,
+					       end_zone, 0))
+				all_zones_ok = 0;
 			zone->temp_priority = priority;
 			if (zone->prev_priority > priority)
 				zone->prev_priority = priority;
@@ -1162,8 +1153,6 @@ scan:
 			    total_scanned > nr_reclaimed + nr_reclaimed / 2)
 				sc.may_writepage = 1;
 		}
-		if (nr_pages && to_free > nr_reclaimed)
-			continue;	/* swsusp: need to do more work */
 		if (all_zones_ok)
 			break;		/* kswapd: all done */
 		/*
@@ -1179,7 +1168,7 @@ scan:
 		 * matches the direct reclaim path behaviour in terms of impact
 		 * on zone->*_priority.
 		 */
-		if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
+		if (nr_reclaimed >= SWAP_CLUSTER_MAX)
 			break;
 	}
 out:
@@ -1261,7 +1250,7 @@ static int kswapd(void *p)
 		}
 		finish_wait(&pgdat->kswapd_wait, &wait);
 
-		balance_pgdat(pgdat, 0, order);
+		balance_pgdat(pgdat, order);
 	}
 	return 0;
 }
@@ -1290,35 +1279,154 @@ void wakeup_kswapd(struct zone *zone, int order)
 
 #ifdef CONFIG_PM
 /*
- * Try to free `nr_pages' of memory, system-wide. Returns the number of freed
- * pages.
+ * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
+ * from LRU lists system-wide, for given pass and priority, and returns the
+ * number of reclaimed pages
+ *
+ * For pass > 3 we also try to shrink the LRU lists that contain a few pages
+ */
+static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
+				      int prio, struct scan_control *sc)
+{
+	struct zone *zone;
+	unsigned long nr_to_scan, ret = 0;
+
+	for_each_zone(zone) {
+
+		if (!populated_zone(zone))
+			continue;
+
+		if (zone->all_unreclaimable && prio != DEF_PRIORITY)
+			continue;
+
+		/* For pass = 0 we don't shrink the active list */
+		if (pass > 0) {
+			zone->nr_scan_active += (zone->nr_active >> prio) + 1;
+			if (zone->nr_scan_active >= nr_pages || pass > 3) {
+				zone->nr_scan_active = 0;
+				nr_to_scan = min(nr_pages, zone->nr_active);
+				shrink_active_list(nr_to_scan, zone, sc);
+			}
+		}
+
+		zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
+		if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
+			zone->nr_scan_inactive = 0;
+			nr_to_scan = min(nr_pages, zone->nr_inactive);
+			ret += shrink_inactive_list(nr_to_scan, zone, sc);
+			if (ret >= nr_pages)
+				return ret;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Try to free `nr_pages' of memory, system-wide, and return the number of
+ * freed pages.
+ *
+ * Rather than trying to age LRUs the aim is to preserve the overall
+ * LRU order by reclaiming preferentially
+ * inactive > active > active referenced > active mapped
  */
 unsigned long shrink_all_memory(unsigned long nr_pages)
 {
-	pg_data_t *pgdat;
-	unsigned long nr_to_free = nr_pages;
+	unsigned long lru_pages, nr_slab;
 	unsigned long ret = 0;
-	unsigned retry = 2;
-	struct reclaim_state reclaim_state = {
-		.reclaimed_slab = 0,
+	int pass;
+	struct reclaim_state reclaim_state;
+	struct zone *zone;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.may_swap = 0,
+		.swap_cluster_max = nr_pages,
+		.may_writepage = 1,
+		.swappiness = vm_swappiness,
 	};
 
 	current->reclaim_state = &reclaim_state;
-repeat:
-	for_each_online_pgdat(pgdat) {
-		unsigned long freed;
 
-		freed = balance_pgdat(pgdat, nr_to_free, 0);
-		ret += freed;
-		nr_to_free -= freed;
-		if ((long)nr_to_free <= 0)
+	lru_pages = 0;
+	for_each_zone(zone)
+		lru_pages += zone->nr_active + zone->nr_inactive;
+
+	nr_slab = read_page_state(nr_slab);
+	/* If slab caches are huge, it's better to hit them first */
+	while (nr_slab >= lru_pages) {
+		reclaim_state.reclaimed_slab = 0;
+		shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+		if (!reclaim_state.reclaimed_slab)
 			break;
+
+		ret += reclaim_state.reclaimed_slab;
+		if (ret >= nr_pages)
+			goto out;
+
+		nr_slab -= reclaim_state.reclaimed_slab;
 	}
-	if (retry-- && ret < nr_pages) {
-		blk_congestion_wait(WRITE, HZ/5);
-		goto repeat;
+
+	/*
+	 * We try to shrink LRUs in 5 passes:
+	 * 0 = Reclaim from inactive_list only
+	 * 1 = Reclaim from active list but don't reclaim mapped
+	 * 2 = 2nd pass of type 1
+	 * 3 = Reclaim mapped (normal reclaim)
+	 * 4 = 2nd pass of type 3
+	 */
+	for (pass = 0; pass < 5; pass++) {
+		int prio;
+
+		/* Needed for shrinking slab caches later on */
+		if (!lru_pages)
+			for_each_zone(zone) {
+				lru_pages += zone->nr_active;
+				lru_pages += zone->nr_inactive;
+			}
+
+		/* Force reclaiming mapped pages in the passes #3 and #4 */
+		if (pass > 2) {
+			sc.may_swap = 1;
+			sc.swappiness = 100;
+		}
+
+		for (prio = DEF_PRIORITY; prio >= 0; prio--) {
+			unsigned long nr_to_scan = nr_pages - ret;
+
+			sc.nr_mapped = read_page_state(nr_mapped);
+			sc.nr_scanned = 0;
+
+			ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
+			if (ret >= nr_pages)
+				goto out;
+
+			reclaim_state.reclaimed_slab = 0;
+			shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
+			ret += reclaim_state.reclaimed_slab;
+			if (ret >= nr_pages)
+				goto out;
+
+			if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
+				blk_congestion_wait(WRITE, HZ / 10);
+		}
+
+		lru_pages = 0;
 	}
+
+	/*
+	 * If ret = 0, we could not shrink LRUs, but there may be something
+	 * in slab caches
+	 */
+	if (!ret)
+		do {
+			reclaim_state.reclaimed_slab = 0;
+			shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+			ret += reclaim_state.reclaimed_slab;
+		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+
+out:
 	current->reclaim_state = NULL;
+
 	return ret;
 }
 #endif
@@ -1416,6 +1524,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.swap_cluster_max = max_t(unsigned long, nr_pages,
 				       SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
+		.swappiness = vm_swappiness,
 	};
 
 	disable_swap_token();