Diffstat (limited to 'mm/vmscan.c')
 -rw-r--r--   mm/vmscan.c   240
 1 files changed, 181 insertions, 59 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 440a733fe2e9..72babac71dea 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -61,6 +61,8 @@ struct scan_control {
	 * In this context, it doesn't matter that we scan the
	 * whole list at once. */
	int swap_cluster_max;
+
+	int swappiness;
 };
 
 /*
@@ -108,7 +110,7 @@ struct shrinker {
  * From 0 .. 100. Higher means more swappy.
  */
 int vm_swappiness = 60;
-static long total_memory;
+long vm_total_pages;	/* The total number of pages which the VM controls */
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
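The rename also moves the initialisation out of this file: kswapd_init() no longer samples nr_free_pagecache_pages() (see the last hunk of this diff), so vm_total_pages has to be filled in elsewhere, presumably by a companion change to mm/page_alloc.c that is not shown here. A rough sketch of what that companion assignment could look like, reusing the same nr_free_pagecache_pages() value the old code sampled; placing it in build_all_zonelists() is an assumption:

/* Assumed companion change (not in this diff): keep vm_total_pages in step
 * with the zonelists, using the value kswapd_init() used to sample once. */
void build_all_zonelists(void)
{
	/* ... existing zonelist construction ... */
	vm_total_pages = nr_free_pagecache_pages();
}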
@@ -288,11 +290,23 @@ static void handle_write_error(struct address_space *mapping,
	unlock_page(page);
 }
 
+/* possible outcome of pageout() */
+typedef enum {
+	/* failed to write page out, page is locked */
+	PAGE_KEEP,
+	/* move page to the active list, page is locked */
+	PAGE_ACTIVATE,
+	/* page has been sent to the disk successfully, page is unlocked */
+	PAGE_SUCCESS,
+	/* page is clean and locked */
+	PAGE_CLEAN,
+} pageout_t;
+
 /*
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping)
 {
	/*
	 * If the page is dirty, only perform writeback if that write
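The enum spells out the contract between pageout() and its caller: each value says whether the page is still locked and what shrink_page_list() should do with it next. A standalone toy that walks the four outcomes; only the enum itself comes from the patch, the dispatch function and the action strings are illustrative:

#include <stdio.h>

/* Copied from the patch: the possible outcomes of pageout(). */
typedef enum {
	PAGE_KEEP,	/* failed to write page out, page is locked */
	PAGE_ACTIVATE,	/* move page to the active list, page is locked */
	PAGE_SUCCESS,	/* page has been sent to the disk, page is unlocked */
	PAGE_CLEAN,	/* page is clean and locked */
} pageout_t;

/* Illustrative dispatch in the spirit of shrink_page_list(); the real code
 * jumps to its keep/activate/free paths rather than returning strings. */
static const char *next_step(pageout_t res)
{
	switch (res) {
	case PAGE_KEEP:
		return "keep the page on the inactive list, still locked";
	case PAGE_ACTIVATE:
		return "move the page back to the active list";
	case PAGE_SUCCESS:
		return "writeback started; the page is already unlocked";
	case PAGE_CLEAN:
		return "page is clean; try to free it";
	}
	return "unreachable";
}

int main(void)
{
	for (pageout_t r = PAGE_KEEP; r <= PAGE_CLEAN; r++)
		printf("%d -> %s\n", (int)r, next_step(r));
	return 0;
}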
@@ -337,6 +351,8 @@ pageout_t pageout(struct page *page, struct address_space *mapping)
		struct writeback_control wbc = {
			.sync_mode = WB_SYNC_NONE,
			.nr_to_write = SWAP_CLUSTER_MAX,
+			.range_start = 0,
+			.range_end = LLONG_MAX,
			.nonblocking = 1,
			.for_reclaim = 1,
		};
@@ -727,7 +743,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
	 * how much memory
	 * is mapped.
	 */
-	mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+	mapped_ratio = (sc->nr_mapped * 100) / vm_total_pages;
 
	/*
	 * Now decide how much we really want to unmap some pages. The
@@ -741,7 +757,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
	 * A 100% value of vm_swappiness overrides this algorithm
	 * altogether.
	 */
-	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+	swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
 
	/*
	 * Now use this metric to decide whether to start moving mapped
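Routing swappiness through the scan_control lets each caller bias the heuristic instead of relying only on the global sysctl: swap_tendency = mapped_ratio / 2 + distress + sc->swappiness, and in the surrounding code (not shown in this hunk) mapped pages become eligible for reclaim once the value reaches 100. A small standalone calculation showing the effect, with that threshold written out explicitly as an assumption:

#include <stdio.h>

/* Mirrors the heuristic in shrink_active_list() after this patch:
 * swap_tendency = mapped_ratio / 2 + distress + swappiness, with mapped
 * pages reclaimed once it reaches 100 (threshold taken from surrounding
 * kernel code, not from this hunk). */
static int reclaim_mapped(long nr_mapped, long vm_total_pages,
			  int prev_priority, int swappiness)
{
	long mapped_ratio = nr_mapped * 100 / vm_total_pages;
	/* distress: 0 when reclaim is easy, 100 once priority has hit 0 */
	int distress = 100 >> prev_priority;
	long swap_tendency = mapped_ratio / 2 + distress + swappiness;

	return swap_tendency >= 100;
}

int main(void)
{
	/* 20% of a 1M-page machine mapped, relaxed reclaim (priority 12): */
	printf("%d\n", reclaim_mapped(200000, 1000000, 12, 60));  /* 0: leave mapped pages alone */
	/* Same load, but a caller forces swappiness = 100: */
	printf("%d\n", reclaim_mapped(200000, 1000000, 12, 100)); /* 1: reclaim mapped pages too */
	return 0;
}

This is what allows the reworked shrink_all_memory() later in this diff to set sc.swappiness = 100 in its last two passes without touching /proc/sys/vm/swappiness.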
@@ -957,6 +973,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
		.may_writepage = !laptop_mode,
		.swap_cluster_max = SWAP_CLUSTER_MAX,
		.may_swap = 1,
+		.swappiness = vm_swappiness,
	};
 
	inc_page_state(allocstall);
@@ -1021,10 +1038,6 @@ out:
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
  *
- * If `nr_pages' is non-zero then it is the number of pages which are to be
- * reclaimed, regardless of the zone occupancies. This is a software suspend
- * special.
- *
  * Returns the number of pages which were actually freed.
  *
  * There is special handling here for zones which are full of pinned pages.
@@ -1042,10 +1055,8 @@ out:
  * the page allocator fallback scheme to ensure that aging of pages is balanced
  * across the zones.
  */
-static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
-				int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 {
-	unsigned long to_free = nr_pages;
	int all_zones_ok;
	int priority;
	int i;
@@ -1055,7 +1066,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
	struct scan_control sc = {
		.gfp_mask = GFP_KERNEL,
		.may_swap = 1,
-		.swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.swappiness = vm_swappiness,
	};
 
 loop_again:
@@ -1082,31 +1094,26 @@ loop_again:
 
		all_zones_ok = 1;
 
-		if (nr_pages == 0) {
-			/*
-			 * Scan in the highmem->dma direction for the highest
-			 * zone which needs scanning
-			 */
-			for (i = pgdat->nr_zones - 1; i >= 0; i--) {
-				struct zone *zone = pgdat->node_zones + i;
+		/*
+		 * Scan in the highmem->dma direction for the highest
+		 * zone which needs scanning
+		 */
+		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+			struct zone *zone = pgdat->node_zones + i;
 
-				if (!populated_zone(zone))
-					continue;
+			if (!populated_zone(zone))
+				continue;
 
-				if (zone->all_unreclaimable &&
-						priority != DEF_PRIORITY)
-					continue;
+			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+				continue;
 
-				if (!zone_watermark_ok(zone, order,
-						zone->pages_high, 0, 0)) {
-					end_zone = i;
-					goto scan;
-				}
+			if (!zone_watermark_ok(zone, order, zone->pages_high,
+					       0, 0)) {
+				end_zone = i;
+				goto scan;
			}
-			goto out;
-		} else {
-			end_zone = pgdat->nr_zones - 1;
		}
+		goto out;
 scan:
	for (i = 0; i <= end_zone; i++) {
		struct zone *zone = pgdat->node_zones + i;
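With the swsusp branch gone, the highmem->dma scan is the only way end_zone is chosen: the first zone from the top that fails its pages_high watermark sets the upper bound for scanning, and if none fails kswapd has nothing to do. A standalone toy of that selection over an illustrative zone array (the real zone_watermark_ok() also accounts for allocation order and lowmem reserves):

#include <stdio.h>

/* Toy model of the end_zone selection balance_pgdat() now always performs:
 * walk zones in the highmem->dma direction (highest index first) and stop
 * at the first one below its pages_high watermark.  Names and numbers are
 * illustrative. */
struct toy_zone {
	const char *name;
	long free_pages;
	long pages_high;
};

int main(void)
{
	struct toy_zone zones[] = {	/* index 0 = DMA ... highest = HighMem */
		{ "DMA",      900,  128 },
		{ "Normal",  5000, 1024 },
		{ "HighMem",  300, 2048 },	/* below its watermark */
	};
	int nr_zones = 3, end_zone = -1;

	for (int i = nr_zones - 1; i >= 0; i--) {
		if (zones[i].free_pages < zones[i].pages_high) {
			end_zone = i;
			break;		/* the real code does "goto scan" here */
		}
	}

	if (end_zone < 0)
		printf("all zones ok, nothing to do\n");
	else
		printf("scan zones 0..%d (%s and below)\n",
		       end_zone, zones[end_zone].name);
	return 0;
}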
@@ -1133,11 +1140,9 @@ scan:
			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
				continue;
 
-			if (nr_pages == 0) {	/* Not software suspend */
-				if (!zone_watermark_ok(zone, order,
-						zone->pages_high, end_zone, 0))
-					all_zones_ok = 0;
-			}
+			if (!zone_watermark_ok(zone, order, zone->pages_high,
+					       end_zone, 0))
+				all_zones_ok = 0;
			zone->temp_priority = priority;
			if (zone->prev_priority > priority)
				zone->prev_priority = priority;
@@ -1162,8 +1167,6 @@ scan:
			    total_scanned > nr_reclaimed + nr_reclaimed / 2)
				sc.may_writepage = 1;
		}
-		if (nr_pages && to_free > nr_reclaimed)
-			continue;	/* swsusp: need to do more work */
		if (all_zones_ok)
			break;		/* kswapd: all done */
		/*
@@ -1179,7 +1182,7 @@ scan:
		 * matches the direct reclaim path behaviour in terms of impact
		 * on zone->*_priority.
		 */
-		if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
+		if (nr_reclaimed >= SWAP_CLUSTER_MAX)
			break;
	}
 out:
@@ -1261,7 +1264,7 @@ static int kswapd(void *p)
		}
		finish_wait(&pgdat->kswapd_wait, &wait);
 
-		balance_pgdat(pgdat, 0, order);
+		balance_pgdat(pgdat, order);
	}
	return 0;
 }
@@ -1290,35 +1293,154 @@ void wakeup_kswapd(struct zone *zone, int order)
 
 #ifdef CONFIG_PM
 /*
- * Try to free `nr_pages' of memory, system-wide. Returns the number of freed
- * pages.
+ * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
+ * from LRU lists system-wide, for given pass and priority, and returns the
+ * number of reclaimed pages
+ *
+ * For pass > 3 we also try to shrink the LRU lists that contain a few pages
+ */
+static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
+				      int pass, struct scan_control *sc)
+{
+	struct zone *zone;
+	unsigned long nr_to_scan, ret = 0;
+
+	for_each_zone(zone) {
+
+		if (!populated_zone(zone))
+			continue;
+
+		if (zone->all_unreclaimable && prio != DEF_PRIORITY)
+			continue;
+
+		/* For pass = 0 we don't shrink the active list */
+		if (pass > 0) {
+			zone->nr_scan_active += (zone->nr_active >> prio) + 1;
+			if (zone->nr_scan_active >= nr_pages || pass > 3) {
+				zone->nr_scan_active = 0;
+				nr_to_scan = min(nr_pages, zone->nr_active);
+				shrink_active_list(nr_to_scan, zone, sc);
+			}
+		}
+
+		zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
+		if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
+			zone->nr_scan_inactive = 0;
+			nr_to_scan = min(nr_pages, zone->nr_inactive);
+			ret += shrink_inactive_list(nr_to_scan, zone, sc);
+			if (ret >= nr_pages)
+				return ret;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Try to free `nr_pages' of memory, system-wide, and return the number of
+ * freed pages.
+ *
+ * Rather than trying to age LRUs the aim is to preserve the overall
+ * LRU order by reclaiming preferentially
+ * inactive > active > active referenced > active mapped
  */
 unsigned long shrink_all_memory(unsigned long nr_pages)
 {
-	pg_data_t *pgdat;
-	unsigned long nr_to_free = nr_pages;
+	unsigned long lru_pages, nr_slab;
	unsigned long ret = 0;
-	unsigned retry = 2;
-	struct reclaim_state reclaim_state = {
-		.reclaimed_slab = 0,
+	int pass;
+	struct reclaim_state reclaim_state;
+	struct zone *zone;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.may_swap = 0,
+		.swap_cluster_max = nr_pages,
+		.may_writepage = 1,
+		.swappiness = vm_swappiness,
	};
 
	current->reclaim_state = &reclaim_state;
-repeat:
-	for_each_online_pgdat(pgdat) {
-		unsigned long freed;
 
-		freed = balance_pgdat(pgdat, nr_to_free, 0);
-		ret += freed;
-		nr_to_free -= freed;
-		if ((long)nr_to_free <= 0)
+	lru_pages = 0;
+	for_each_zone(zone)
+		lru_pages += zone->nr_active + zone->nr_inactive;
+
+	nr_slab = read_page_state(nr_slab);
+	/* If slab caches are huge, it's better to hit them first */
+	while (nr_slab >= lru_pages) {
+		reclaim_state.reclaimed_slab = 0;
+		shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+		if (!reclaim_state.reclaimed_slab)
			break;
+
+		ret += reclaim_state.reclaimed_slab;
+		if (ret >= nr_pages)
+			goto out;
+
+		nr_slab -= reclaim_state.reclaimed_slab;
	}
-	if (retry-- && ret < nr_pages) {
-		blk_congestion_wait(WRITE, HZ/5);
-		goto repeat;
+
+	/*
+	 * We try to shrink LRUs in 5 passes:
+	 * 0 = Reclaim from inactive_list only
+	 * 1 = Reclaim from active list but don't reclaim mapped
+	 * 2 = 2nd pass of type 1
+	 * 3 = Reclaim mapped (normal reclaim)
+	 * 4 = 2nd pass of type 3
+	 */
+	for (pass = 0; pass < 5; pass++) {
+		int prio;
+
+		/* Needed for shrinking slab caches later on */
+		if (!lru_pages)
+			for_each_zone(zone) {
+				lru_pages += zone->nr_active;
+				lru_pages += zone->nr_inactive;
+			}
+
+		/* Force reclaiming mapped pages in the passes #3 and #4 */
+		if (pass > 2) {
+			sc.may_swap = 1;
+			sc.swappiness = 100;
+		}
+
+		for (prio = DEF_PRIORITY; prio >= 0; prio--) {
+			unsigned long nr_to_scan = nr_pages - ret;
+
+			sc.nr_mapped = read_page_state(nr_mapped);
+			sc.nr_scanned = 0;
+
+			ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
+			if (ret >= nr_pages)
+				goto out;
+
+			reclaim_state.reclaimed_slab = 0;
+			shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
+			ret += reclaim_state.reclaimed_slab;
+			if (ret >= nr_pages)
+				goto out;
+
+			if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
+				blk_congestion_wait(WRITE, HZ / 10);
+		}
+
+		lru_pages = 0;
	}
+
+	/*
+	 * If ret = 0, we could not shrink LRUs, but there may be something
+	 * in slab caches
+	 */
+	if (!ret)
+		do {
+			reclaim_state.reclaimed_slab = 0;
+			shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+			ret += reclaim_state.reclaimed_slab;
+		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+
+out:
	current->reclaim_state = NULL;
+
	return ret;
 }
 #endif
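The rewritten shrink_all_memory() replaces the old "run balance_pgdat() over every node and retry twice" loop with an explicit pass/priority nest: five passes, each walking priority from DEF_PRIORITY down to 0, reclaiming LRU pages and shrinking slab in proportion to what was scanned, with only passes 3 and 4 allowed to touch mapped pages. A standalone skeleton of that control flow with the actual reclaim work stubbed out; only the loop structure and the pass semantics are taken from the patch:

#include <stdio.h>

#define DEF_PRIORITY 12	/* same value the kernel uses for reclaim priority */

/*
 * Toy skeleton of the reworked shrink_all_memory() control flow.  The real
 * reclaim work (shrink_all_zones(), shrink_slab()) is replaced by a fixed
 * increment so the program terminates.
 */
static unsigned long toy_shrink_all_memory(unsigned long nr_pages)
{
	unsigned long ret = 0;
	int may_swap = 0, swappiness = 60;

	for (int pass = 0; pass < 5; pass++) {
		/* Passes 3 and 4 are the only ones allowed to touch mapped pages. */
		if (pass > 2) {
			may_swap = 1;
			swappiness = 100;
		}
		printf("pass %d: may_swap=%d swappiness=%d\n",
		       pass, may_swap, swappiness);

		for (int prio = DEF_PRIORITY; prio >= 0; prio--) {
			ret += 1;	/* stand-in for shrink_all_zones() + shrink_slab() */
			if (ret >= nr_pages)
				return ret;	/* target met: stop early, like "goto out" */
		}
	}
	return ret;
}

int main(void)
{
	printf("reclaimed %lu page(s)\n", toy_shrink_all_memory(60));
	return 0;
}

In the real function the per-step increment is whatever shrink_all_zones() and shrink_slab() report back, and lru_pages is recomputed between passes so the slab pressure stays proportional to the LRU size.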
@@ -1360,7 +1482,6 @@ static int __init kswapd_init(void)
		pgdat->kswapd = find_task_by_pid(pid);
		read_unlock(&tasklist_lock);
	}
-	total_memory = nr_free_pagecache_pages();
	hotcpu_notifier(cpu_callback, 0);
	return 0;
 }
@@ -1416,6 +1537,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
		.swap_cluster_max = max_t(unsigned long, nr_pages,
					SWAP_CLUSTER_MAX),
		.gfp_mask = gfp_mask,
+		.swappiness = vm_swappiness,
	};
 
	disable_swap_token();