Diffstat (limited to 'mm/vmscan.c'):

 mm/vmscan.c | 341 ++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 230 insertions(+), 111 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 440a733fe2e..ff2ebe9458a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -34,6 +34,7 @@
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
 #include <linux/delay.h>
+#include <linux/kthread.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -46,8 +47,6 @@ struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
 
-	unsigned long nr_mapped;	/* From page_state */
-
 	/* This context's GFP mask */
 	gfp_t gfp_mask;
 
@@ -61,6 +60,8 @@ struct scan_control {
 	 * In this context, it doesn't matter that we scan the
 	 * whole list at once. */
 	int swap_cluster_max;
+
+	int swappiness;
 };
 
 /*
@@ -108,7 +109,7 @@ struct shrinker {
  * From 0 .. 100.  Higher means more swappy.
  */
 int vm_swappiness = 60;
-static long total_memory;
+long vm_total_pages;	/* The total number of pages which the VM controls */
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
@@ -214,7 +215,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 			break;
 		if (shrink_ret < nr_before)
 			ret += nr_before - shrink_ret;
-		mod_page_state(slabs_scanned, this_scan);
+		count_vm_events(SLABS_SCANNED, this_scan);
 		total_scan -= this_scan;
 
 		cond_resched();
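Aside, not part of the patch: mod_page_state() and friends are replaced throughout this diff by the VM event counters. The idea, sketched below in simplified form (the struct layout and helper body are paraphrased from the vmstat interface of this era, not the verbatim code), is a per-CPU array of event counts that readers sum up for /proc/vmstat:

	struct vm_event_state {
		unsigned long event[NR_VM_EVENT_ITEMS];
	};
	DECLARE_PER_CPU(struct vm_event_state, vm_event_states);

	static inline void count_vm_events(enum vm_event_item item, long delta)
	{
		/* bump this CPU's counter; readers sum over all CPUs */
		get_cpu_var(vm_event_states).event[item] += delta;
		put_cpu_var(vm_event_states);
	}

The __count_* variants used later in the diff are the same thing minus the preemption protection, for callers that already run with interrupts off.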
@@ -288,11 +289,23 @@ static void handle_write_error(struct address_space *mapping,
 	unlock_page(page);
 }
 
+/* possible outcome of pageout() */
+typedef enum {
+	/* failed to write page out, page is locked */
+	PAGE_KEEP,
+	/* move page to the active list, page is locked */
+	PAGE_ACTIVATE,
+	/* page has been sent to the disk successfully, page is unlocked */
+	PAGE_SUCCESS,
+	/* page is clean and locked */
+	PAGE_CLEAN,
+} pageout_t;
+
 /*
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping)
 {
 	/*
 	 * If the page is dirty, only perform writeback if that write
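For orientation (not in this hunk): shrink_page_list() dispatches on these return values roughly as follows. This is a condensed sketch of the caller, not the literal kernel code:

	switch (pageout(page, mapping)) {
	case PAGE_KEEP:		/* write failed; retry on a later scan */
		goto keep_locked;
	case PAGE_ACTIVATE:	/* don't keep rewriting it; age it instead */
		goto activate_locked;
	case PAGE_SUCCESS:	/* I/O started, page is now unlocked */
		if (PageWriteback(page) || PageDirty(page))
			goto keep;	/* free it once the write completes */
		break;
	case PAGE_CLEAN:	/* nothing to write; fall through and free */
		break;
	}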
@@ -337,6 +350,8 @@ pageout_t pageout(struct page *page, struct address_space *mapping)
 		struct writeback_control wbc = {
 			.sync_mode = WB_SYNC_NONE,
 			.nr_to_write = SWAP_CLUSTER_MAX,
+			.range_start = 0,
+			.range_end = LLONG_MAX,
 			.nonblocking = 1,
 			.for_reclaim = 1,
 		};
@@ -554,7 +569,7 @@ keep:
 	list_splice(&ret_pages, page_list);
 	if (pagevec_count(&freed_pvec))
 		__pagevec_release_nonlru(&freed_pvec);
-	mod_page_state(pgactivate, pgactivate);
+	count_vm_events(PGACTIVATE, pgactivate);
 	return nr_reclaimed;
 }
 
@@ -644,11 +659,11 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		nr_reclaimed += nr_freed;
 		local_irq_disable();
 		if (current_is_kswapd()) {
-			__mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
-			__mod_page_state(kswapd_steal, nr_freed);
+			__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
+			__count_vm_events(KSWAPD_STEAL, nr_freed);
 		} else
-			__mod_page_state_zone(zone, pgscan_direct, nr_scan);
-		__mod_page_state_zone(zone, pgsteal, nr_freed);
+			__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
+		__count_zone_vm_events(PGSTEAL, zone, nr_freed);
 
 		if (nr_taken == 0)
 			goto done;
@@ -727,7 +742,9 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		 * how much memory
 		 * is mapped.
 		 */
-		mapped_ratio = (sc->nr_mapped * 100) / total_memory;
+		mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
+				global_page_state(NR_ANON_PAGES)) * 100) /
+					vm_total_pages;
 
 		/*
 		 * Now decide how much we really want to unmap some pages.  The
@@ -741,7 +758,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		 * A 100% value of vm_swappiness overrides this algorithm
 		 * altogether.
 		 */
-		swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
+		swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
 
 		/*
 		 * Now use this metric to decide whether to start moving mapped
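A worked example of the heuristic (illustrative numbers, not from the diff): distress is derived from the zone's recent reclaim priority as 100 >> prev_priority, and a few lines below this hunk reclaim of mapped pages is enabled once swap_tendency reaches 100:

	int mapped_ratio = 50;		/* half of vm_total_pages is mapped */
	int distress = 100 >> 2;	/* prev_priority == 2  =>  25 */
	int swappiness = 60;		/* sc->swappiness left at the default */

	int swap_tendency = mapped_ratio / 2 + distress + swappiness;	/* 110 */
	int reclaim_mapped = swap_tendency >= 100;	/* 1: scan mapped pages too */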
@@ -824,11 +841,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		}
 	}
 	zone->nr_active += pgmoved;
-	spin_unlock(&zone->lru_lock);
 
-	__mod_page_state_zone(zone, pgrefill, pgscanned);
-	__mod_page_state(pgdeactivate, pgdeactivate);
-	local_irq_enable();
+	__count_zone_vm_events(PGREFILL, zone, pgscanned);
+	__count_vm_events(PGDEACTIVATE, pgdeactivate);
+	spin_unlock_irq(&zone->lru_lock);
 
 	pagevec_release(&pvec);
 }
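The locking change above is easiest to read as before/after (a schematic, not compilable on its own). The __count_* helpers, like the old __mod_page_state* ones, assume interrupts are off, so the statistics stay inside the IRQ-disabled region and the separate unlock plus local_irq_enable() pair collapses into one spin_unlock_irq():

	/* before: drop the lock first, update stats while IRQs are still off */
	spin_unlock(&zone->lru_lock);
	__mod_page_state_zone(zone, pgrefill, pgscanned);
	local_irq_enable();

	/* after: stats under the lock, one call drops it and re-enables IRQs */
	__count_zone_vm_events(PGREFILL, zone, pgscanned);
	spin_unlock_irq(&zone->lru_lock);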
@@ -957,9 +973,10 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		.may_writepage = !laptop_mode,
 		.swap_cluster_max = SWAP_CLUSTER_MAX,
 		.may_swap = 1,
+		.swappiness = vm_swappiness,
 	};
 
-	inc_page_state(allocstall);
+	count_vm_event(ALLOCSTALL);
 
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
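Moving swappiness into scan_control turns the global vm_swappiness knob into per-context policy; this patch itself relies on that below, where shrink_all_memory() forces swappiness to 100 for its mapped-reclaim passes. A hypothetical caller (invented for illustration, not in this patch) could just as well bias a scan entirely toward pagecache:

	/* hypothetical reclaim context that avoids swapping anon pages */
	struct scan_control sc = {
		.gfp_mask	  = GFP_KERNEL,
		.may_writepage	  = 1,
		.may_swap	  = 1,
		.swap_cluster_max = SWAP_CLUSTER_MAX,
		.swappiness	  = 0,	/* drop the +60 default from swap_tendency */
	};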
@@ -972,7 +989,6 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 	}
 
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
-		sc.nr_mapped = read_page_state(nr_mapped);
 		sc.nr_scanned = 0;
 		if (!priority)
 			disable_swap_token();
@@ -1021,10 +1037,6 @@ out:
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
  *
- * If `nr_pages' is non-zero then it is the number of pages which are to be
- * reclaimed, regardless of the zone occupancies.  This is a software suspend
- * special.
- *
  * Returns the number of pages which were actually freed.
  *
  * There is special handling here for zones which are full of pinned pages.
@@ -1042,10 +1054,8 @@ out:
  * the page allocator fallback scheme to ensure that aging of pages is balanced
  * across the zones.
  */
-static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
-				int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 {
-	unsigned long to_free = nr_pages;
 	int all_zones_ok;
 	int priority;
 	int i;
@@ -1055,16 +1065,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
 		.may_swap = 1,
-		.swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.swappiness = vm_swappiness,
 	};
 
 loop_again:
 	total_scanned = 0;
 	nr_reclaimed = 0;
 	sc.may_writepage = !laptop_mode;
-	sc.nr_mapped = read_page_state(nr_mapped);
-
-	inc_page_state(pageoutrun);
+	count_vm_event(PAGEOUTRUN);
 
 	for (i = 0; i < pgdat->nr_zones; i++) {
 		struct zone *zone = pgdat->node_zones + i;
@@ -1082,31 +1091,26 @@ loop_again:
 
 		all_zones_ok = 1;
 
-		if (nr_pages == 0) {
-			/*
-			 * Scan in the highmem->dma direction for the highest
-			 * zone which needs scanning
-			 */
-			for (i = pgdat->nr_zones - 1; i >= 0; i--) {
-				struct zone *zone = pgdat->node_zones + i;
-
-				if (!populated_zone(zone))
-					continue;
-
-				if (zone->all_unreclaimable &&
-						priority != DEF_PRIORITY)
-					continue;
-
-				if (!zone_watermark_ok(zone, order,
-						zone->pages_high, 0, 0)) {
-					end_zone = i;
-					goto scan;
-				}
+		/*
+		 * Scan in the highmem->dma direction for the highest
+		 * zone which needs scanning
+		 */
+		for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+			struct zone *zone = pgdat->node_zones + i;
+
+			if (!populated_zone(zone))
+				continue;
+
+			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+				continue;
+
+			if (!zone_watermark_ok(zone, order, zone->pages_high,
+					       0, 0)) {
+				end_zone = i;
+				goto scan;
 			}
-			goto out;
-		} else {
-			end_zone = pgdat->nr_zones - 1;
 		}
+		goto out;
 scan:
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
@@ -1133,11 +1137,9 @@ scan:
 			if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 				continue;
 
-			if (nr_pages == 0) {	/* Not software suspend */
-				if (!zone_watermark_ok(zone, order,
-						zone->pages_high, end_zone, 0))
-					all_zones_ok = 0;
-			}
+			if (!zone_watermark_ok(zone, order, zone->pages_high,
+					       end_zone, 0))
+				all_zones_ok = 0;
 			zone->temp_priority = priority;
 			if (zone->prev_priority > priority)
 				zone->prev_priority = priority;
@@ -1162,8 +1164,6 @@ scan:
 		    total_scanned > nr_reclaimed + nr_reclaimed / 2)
 			sc.may_writepage = 1;
 		}
-		if (nr_pages && to_free > nr_reclaimed)
-			continue;	/* swsusp: need to do more work */
 		if (all_zones_ok)
 			break;		/* kswapd: all done */
 		/*
@@ -1179,7 +1179,7 @@ scan:
 		 * matches the direct reclaim path behaviour in terms of impact
 		 * on zone->*_priority.
 		 */
-		if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
+		if (nr_reclaimed >= SWAP_CLUSTER_MAX)
 			break;
 	}
 out:
@@ -1220,7 +1220,6 @@ static int kswapd(void *p)
 	};
 	cpumask_t cpumask;
 
-	daemonize("kswapd%d", pgdat->node_id);
 	cpumask = node_to_cpumask(pgdat->node_id);
 	if (!cpus_empty(cpumask))
 		set_cpus_allowed(tsk, cpumask);
@@ -1261,7 +1260,7 @@ static int kswapd(void *p)
 		}
 		finish_wait(&pgdat->kswapd_wait, &wait);
 
-		balance_pgdat(pgdat, 0, order);
+		balance_pgdat(pgdat, order);
 	}
 	return 0;
 }
@@ -1290,35 +1289,152 @@ void wakeup_kswapd(struct zone *zone, int order)
 
 #ifdef CONFIG_PM
 /*
- * Try to free `nr_pages' of memory, system-wide.  Returns the number of freed
- * pages.
+ * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
+ * from LRU lists system-wide, for given pass and priority, and returns the
+ * number of reclaimed pages
+ *
+ * For pass > 3 we also try to shrink the LRU lists that contain a few pages
+ */
+static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
+				      int pass, struct scan_control *sc)
+{
+	struct zone *zone;
+	unsigned long nr_to_scan, ret = 0;
+
+	for_each_zone(zone) {
+
+		if (!populated_zone(zone))
+			continue;
+
+		if (zone->all_unreclaimable && prio != DEF_PRIORITY)
+			continue;
+
+		/* For pass = 0 we don't shrink the active list */
+		if (pass > 0) {
+			zone->nr_scan_active += (zone->nr_active >> prio) + 1;
+			if (zone->nr_scan_active >= nr_pages || pass > 3) {
+				zone->nr_scan_active = 0;
+				nr_to_scan = min(nr_pages, zone->nr_active);
+				shrink_active_list(nr_to_scan, zone, sc);
+			}
+		}
+
+		zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
+		if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
+			zone->nr_scan_inactive = 0;
+			nr_to_scan = min(nr_pages, zone->nr_inactive);
+			ret += shrink_inactive_list(nr_to_scan, zone, sc);
+			if (ret >= nr_pages)
+				return ret;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Try to free `nr_pages' of memory, system-wide, and return the number of
+ * freed pages.
+ *
+ * Rather than trying to age LRUs the aim is to preserve the overall
+ * LRU order by reclaiming preferentially
+ * inactive > active > active referenced > active mapped
  */
 unsigned long shrink_all_memory(unsigned long nr_pages)
 {
-	pg_data_t *pgdat;
-	unsigned long nr_to_free = nr_pages;
+	unsigned long lru_pages, nr_slab;
 	unsigned long ret = 0;
-	unsigned retry = 2;
-	struct reclaim_state reclaim_state = {
-		.reclaimed_slab = 0,
+	int pass;
+	struct reclaim_state reclaim_state;
+	struct zone *zone;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.may_swap = 0,
+		.swap_cluster_max = nr_pages,
+		.may_writepage = 1,
+		.swappiness = vm_swappiness,
 	};
 
 	current->reclaim_state = &reclaim_state;
-repeat:
-	for_each_online_pgdat(pgdat) {
-		unsigned long freed;
 
-		freed = balance_pgdat(pgdat, nr_to_free, 0);
-		ret += freed;
-		nr_to_free -= freed;
-		if ((long)nr_to_free <= 0)
+	lru_pages = 0;
+	for_each_zone(zone)
+		lru_pages += zone->nr_active + zone->nr_inactive;
+
+	nr_slab = global_page_state(NR_SLAB);
+	/* If slab caches are huge, it's better to hit them first */
+	while (nr_slab >= lru_pages) {
+		reclaim_state.reclaimed_slab = 0;
+		shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+		if (!reclaim_state.reclaimed_slab)
 			break;
+
+		ret += reclaim_state.reclaimed_slab;
+		if (ret >= nr_pages)
+			goto out;
+
+		nr_slab -= reclaim_state.reclaimed_slab;
 	}
-	if (retry-- && ret < nr_pages) {
-		blk_congestion_wait(WRITE, HZ/5);
-		goto repeat;
+
+	/*
+	 * We try to shrink LRUs in 5 passes:
+	 * 0 = Reclaim from inactive_list only
+	 * 1 = Reclaim from active list but don't reclaim mapped
+	 * 2 = 2nd pass of type 1
+	 * 3 = Reclaim mapped (normal reclaim)
+	 * 4 = 2nd pass of type 3
+	 */
+	for (pass = 0; pass < 5; pass++) {
+		int prio;
+
+		/* Needed for shrinking slab caches later on */
+		if (!lru_pages)
+			for_each_zone(zone) {
+				lru_pages += zone->nr_active;
+				lru_pages += zone->nr_inactive;
+			}
+
+		/* Force reclaiming mapped pages in the passes #3 and #4 */
+		if (pass > 2) {
+			sc.may_swap = 1;
+			sc.swappiness = 100;
+		}
+
+		for (prio = DEF_PRIORITY; prio >= 0; prio--) {
+			unsigned long nr_to_scan = nr_pages - ret;
+
+			sc.nr_scanned = 0;
+			ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
+			if (ret >= nr_pages)
+				goto out;
+
+			reclaim_state.reclaimed_slab = 0;
+			shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
+			ret += reclaim_state.reclaimed_slab;
+			if (ret >= nr_pages)
+				goto out;
+
+			if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
+				blk_congestion_wait(WRITE, HZ / 10);
+		}
+
+		lru_pages = 0;
 	}
+
+	/*
+	 * If ret = 0, we could not shrink LRUs, but there may be something
+	 * in slab caches
+	 */
+	if (!ret)
+		do {
+			reclaim_state.reclaimed_slab = 0;
+			shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+			ret += reclaim_state.reclaimed_slab;
+		} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+
+out:
 	current->reclaim_state = NULL;
+
 	return ret;
 }
 #endif
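Context, not part of the diff: the only user of shrink_all_memory() is the software-suspend code, which calls it repeatedly while shrinking memory down to fit the hibernation image. A simplified sketch of that calling pattern (the function name and loop shape are illustrative, not the swsusp code verbatim):

	/* illustrative caller: free `needed' pages before writing a suspend image */
	static int shrink_for_suspend(unsigned long needed)
	{
		unsigned long freed = 0;

		while (freed < needed) {
			/* ask for the remainder in one bite; 0 means no progress */
			unsigned long got = shrink_all_memory(needed - freed);

			if (!got)
				return -ENOMEM;	/* nothing left to reclaim */
			freed += got;
		}
		return 0;
	}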
@@ -1328,7 +1444,7 @@ repeat:
    not required for correctness.  So if the last cpu in a node goes
    away, we get changed to run anywhere: as the first one comes back,
    restore their cpu bindings. */
-static int cpu_callback(struct notifier_block *nfb,
+static int __devinit cpu_callback(struct notifier_block *nfb,
 				  unsigned long action, void *hcpu)
 {
 	pg_data_t *pgdat;
@@ -1346,21 +1462,35 @@ static int cpu_callback(struct notifier_block *nfb,
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
+/*
+ * This kswapd start function will be called by init and node-hot-add.
+ * On node-hot-add, kswapd will be moved to the proper cpus if cpus are
+ * hot-added.
+ */
+int kswapd_run(int nid)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+	int ret = 0;
+
+	if (pgdat->kswapd)
+		return 0;
+
+	pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+	if (IS_ERR(pgdat->kswapd)) {
+		/* failure at boot is fatal */
+		BUG_ON(system_state == SYSTEM_BOOTING);
+		printk("Failed to start kswapd on node %d\n", nid);
+		ret = -1;
+	}
+	return ret;
+}
+
 static int __init kswapd_init(void)
 {
-	pg_data_t *pgdat;
+	int nid;
 
 	swap_setup();
-	for_each_online_pgdat(pgdat) {
-		pid_t pid;
-
-		pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
-		BUG_ON(pid < 0);
-		read_lock(&tasklist_lock);
-		pgdat->kswapd = find_task_by_pid(pid);
-		read_unlock(&tasklist_lock);
-	}
-	total_memory = nr_free_pagecache_pages();
+	for_each_online_node(nid)
+		kswapd_run(nid);
 	hotcpu_notifier(cpu_callback, 0);
 	return 0;
 }
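The kernel_thread() + find_task_by_pid() dance disappears because kthread_run() hands back the task_struct directly (or an ERR_PTR on failure, never NULL). A minimal sketch of the pattern with invented names, separate from the kswapd code above:

	#include <linux/kthread.h>

	/* hypothetical worker, only to show the kthread_run() idiom */
	static int my_worker(void *data)
	{
		while (!kthread_should_stop())
			schedule_timeout_interruptible(HZ);	/* periodic work */
		return 0;
	}

	static struct task_struct *start_worker(int nid)
	{
		struct task_struct *tsk = kthread_run(my_worker, NULL,
						      "worker/%d", nid);

		/* kthread_run() returns ERR_PTR() on failure, never NULL */
		return IS_ERR(tsk) ? NULL : tsk;
	}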
@@ -1387,11 +1517,6 @@ int zone_reclaim_mode __read_mostly;
 #define RECLAIM_SLAB (1<<3)	/* Do a global slab shrink if the zone is out of memory */
 
 /*
- * Mininum time between zone reclaim scans
- */
-int zone_reclaim_interval __read_mostly = 30*HZ;
-
-/*
  * Priority for ZONE_RECLAIM. This determines the fraction of pages
  * of a node considered for each zone_reclaim. 4 scans 1/16th of
  * a zone.
@@ -1412,10 +1537,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
-		.nr_mapped = read_page_state(nr_mapped),
 		.swap_cluster_max = max_t(unsigned long, nr_pages,
 					SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
+		.swappiness = vm_swappiness,
 	};
 
 	disable_swap_token();
@@ -1456,16 +1581,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
 	p->reclaim_state = NULL;
 	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
-
-	if (nr_reclaimed == 0) {
-		/*
-		 * We were unable to reclaim enough pages to stay on node. We
-		 * now allow off node accesses for a certain time period before
-		 * trying again to reclaim pages from the local zone.
-		 */
-		zone->last_unsuccessful_zone_reclaim = jiffies;
-	}
-
 	return nr_reclaimed >= nr_pages;
 }
 
@@ -1475,13 +1590,17 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	int node_id;
 
 	/*
-	 * Do not reclaim if there was a recent unsuccessful attempt at zone
-	 * reclaim.  In that case we let allocations go off node for the
-	 * zone_reclaim_interval. Otherwise we would scan for each off-node
-	 * page allocation.
+	 * Do not reclaim if there are not enough reclaimable pages in this
+	 * zone that would satisfy this allocation.
+	 *
+	 * All unmapped pagecache pages are reclaimable.
+	 *
+	 * Both counters may be temporarily off a bit so we use
+	 * SWAP_CLUSTER_MAX as the boundary. It may also be good to
+	 * leave a few frequently used unmapped pagecache pages around.
 	 */
-	if (time_before(jiffies,
-		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
+	if (zone_page_state(zone, NR_FILE_PAGES) -
+	    zone_page_state(zone, NR_FILE_MAPPED) < SWAP_CLUSTER_MAX)
 		return 0;
 
 	/*
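Worked numbers for the new gate (illustrative, not from the diff): unmapped pagecache in the zone is estimated as file pages minus mapped file pages, and SWAP_CLUSTER_MAX is 32, so:

	unsigned long file_pages  = 5000; /* zone_page_state(zone, NR_FILE_PAGES) */
	unsigned long file_mapped = 4990; /* zone_page_state(zone, NR_FILE_MAPPED) */

	/*
	 * Only ~10 unmapped pagecache pages remain, below SWAP_CLUSTER_MAX
	 * (32), so zone_reclaim() returns 0 and the allocation falls back
	 * off-node instead of scanning a zone that cannot satisfy it.
	 */
	if (file_pages - file_mapped < SWAP_CLUSTER_MAX)
		return 0;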