Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c | 324
1 file changed, 194 insertions, 130 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d196f46c8808..9a27c44aa327 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,6 +52,9 @@ struct scan_control {
52 /* Incremented by the number of inactive pages that were scanned */ 52 /* Incremented by the number of inactive pages that were scanned */
53 unsigned long nr_scanned; 53 unsigned long nr_scanned;
54 54
55 /* Number of pages freed so far during a call to shrink_zones() */
56 unsigned long nr_reclaimed;
57
55 /* This context's GFP mask */ 58 /* This context's GFP mask */
56 gfp_t gfp_mask; 59 gfp_t gfp_mask;
57 60
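The running reclaim count moves from per-function locals into struct scan_control, so every entry point shares one accumulator and shrink_zone()/shrink_zones() can return void. A minimal sketch of the resulting calling convention; do_reclaim_pass() is a made-up name, only scan_control, nr_reclaimed and shrink_zone() come from this patch:

/*
 * Sketch only: callers zero sc->nr_reclaimed once, the shrink_* paths
 * add to it, and the entry point reads the total back instead of
 * summing per-call return values.
 */
static unsigned long do_reclaim_pass(struct zone *zone, struct scan_control *sc)
{
        sc->nr_reclaimed = 0;
        shrink_zone(DEF_PRIORITY, zone, sc);    /* returns void after this patch */
        return sc->nr_reclaimed;
}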
@@ -122,11 +125,30 @@ static LIST_HEAD(shrinker_list);
122static DECLARE_RWSEM(shrinker_rwsem); 125static DECLARE_RWSEM(shrinker_rwsem);
123 126
124#ifdef CONFIG_CGROUP_MEM_RES_CTLR 127#ifdef CONFIG_CGROUP_MEM_RES_CTLR
125#define scan_global_lru(sc) (!(sc)->mem_cgroup) 128#define scanning_global_lru(sc) (!(sc)->mem_cgroup)
126#else 129#else
127#define scan_global_lru(sc) (1) 130#define scanning_global_lru(sc) (1)
128#endif 131#endif
129 132
133static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
134 struct scan_control *sc)
135{
136 if (!scanning_global_lru(sc))
137 return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);
138
139 return &zone->reclaim_stat;
140}
141
142static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc,
143 enum lru_list lru)
144{
145 if (!scanning_global_lru(sc))
146 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
147
148 return zone_page_state(zone, NR_LRU_BASE + lru);
149}
150
151
130/* 152/*
131 * Add a shrinker callback to be called from the vm 153 * Add a shrinker callback to be called from the vm
132 */ 154 */
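get_reclaim_stat() and zone_nr_pages() let the same reclaim code read either the per-zone statistics (global reclaim) or the per-memcg ones (limit reclaim). A hedged sketch of the intended usage pattern; note_lru_scan() is a hypothetical helper, the rest follows the patch:

/*
 * Hypothetical caller: fetch the right stat block once, then update it
 * without caring whether this is global or memcg-limit reclaim.
 */
static void note_lru_scan(struct zone *zone, struct scan_control *sc,
                          enum lru_list lru, unsigned long nr)
{
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);

        reclaim_stat->recent_scanned[is_file_lru(lru)] += nr;
}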
@@ -509,7 +531,6 @@ redo:
509 lru = LRU_UNEVICTABLE; 531 lru = LRU_UNEVICTABLE;
510 add_page_to_unevictable_list(page); 532 add_page_to_unevictable_list(page);
511 } 533 }
512 mem_cgroup_move_lists(page, lru);
513 534
514 /* 535 /*
515 * page's status can change while we move it among lru. If an evictable 536 * page's status can change while we move it among lru. If an evictable
@@ -544,7 +565,6 @@ void putback_lru_page(struct page *page)
544 565
545 lru = !!TestClearPageActive(page) + page_is_file_cache(page); 566 lru = !!TestClearPageActive(page) + page_is_file_cache(page);
546 lru_cache_add_lru(page, lru); 567 lru_cache_add_lru(page, lru);
547 mem_cgroup_move_lists(page, lru);
548 put_page(page); 568 put_page(page);
549} 569}
550#endif /* CONFIG_UNEVICTABLE_LRU */ 570#endif /* CONFIG_UNEVICTABLE_LRU */
@@ -617,7 +637,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
617 referenced && page_mapping_inuse(page)) 637 referenced && page_mapping_inuse(page))
618 goto activate_locked; 638 goto activate_locked;
619 639
620#ifdef CONFIG_SWAP
621 /* 640 /*
622 * Anonymous process memory has backing store? 641 * Anonymous process memory has backing store?
623 * Try to allocate it some swap space here. 642 * Try to allocate it some swap space here.
@@ -625,20 +644,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
625 if (PageAnon(page) && !PageSwapCache(page)) { 644 if (PageAnon(page) && !PageSwapCache(page)) {
626 if (!(sc->gfp_mask & __GFP_IO)) 645 if (!(sc->gfp_mask & __GFP_IO))
627 goto keep_locked; 646 goto keep_locked;
628 switch (try_to_munlock(page)) { 647 if (!add_to_swap(page))
629 case SWAP_FAIL: /* shouldn't happen */
630 case SWAP_AGAIN:
631 goto keep_locked;
632 case SWAP_MLOCK:
633 goto cull_mlocked;
634 case SWAP_SUCCESS:
635 ; /* fall thru'; add to swap cache */
636 }
637 if (!add_to_swap(page, GFP_ATOMIC))
638 goto activate_locked; 648 goto activate_locked;
639 may_enter_fs = 1; 649 may_enter_fs = 1;
640 } 650 }
641#endif /* CONFIG_SWAP */
642 651
643 mapping = page_mapping(page); 652 mapping = page_mapping(page);
644 653
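With the CONFIG_SWAP guard and the try_to_munlock() pre-check gone, allocating swap for an anonymous page is a single call: add_to_swap() now chooses its own gfp flags and returns nonzero on success. The new path, restated with editorial comments:

if (PageAnon(page) && !PageSwapCache(page)) {
        if (!(sc->gfp_mask & __GFP_IO))
                goto keep_locked;       /* cannot start swap I/O here */
        if (!add_to_swap(page))
                goto activate_locked;   /* no swap slot; keep the page active */
        may_enter_fs = 1;               /* swap writeout may enter the fs */
}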
@@ -752,6 +761,8 @@ free_it:
752 continue; 761 continue;
753 762
754cull_mlocked: 763cull_mlocked:
764 if (PageSwapCache(page))
765 try_to_free_swap(page);
755 unlock_page(page); 766 unlock_page(page);
756 putback_lru_page(page); 767 putback_lru_page(page);
757 continue; 768 continue;
@@ -759,7 +770,7 @@ cull_mlocked:
759activate_locked: 770activate_locked:
760 /* Not a candidate for swapping, so reclaim swap space. */ 771 /* Not a candidate for swapping, so reclaim swap space. */
761 if (PageSwapCache(page) && vm_swap_full()) 772 if (PageSwapCache(page) && vm_swap_full())
762 remove_exclusive_swap_page_ref(page); 773 try_to_free_swap(page);
763 VM_BUG_ON(PageActive(page)); 774 VM_BUG_ON(PageActive(page));
764 SetPageActive(page); 775 SetPageActive(page);
765 pgactivate++; 776 pgactivate++;
@@ -819,6 +830,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
819 return ret; 830 return ret;
820 831
821 ret = -EBUSY; 832 ret = -EBUSY;
833
822 if (likely(get_page_unless_zero(page))) { 834 if (likely(get_page_unless_zero(page))) {
823 /* 835 /*
824 * Be careful not to clear PageLRU until after we're 836 * Be careful not to clear PageLRU until after we're
@@ -827,6 +839,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
827 */ 839 */
828 ClearPageLRU(page); 840 ClearPageLRU(page);
829 ret = 0; 841 ret = 0;
842 mem_cgroup_del_lru(page);
830 } 843 }
831 844
832 return ret; 845 return ret;
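With mem_cgroup_move_lists() gone from the isolation/putback paths, the memcg LRU is kept in sync explicitly at both ends: the page leaves the memcg list here and rejoins it at putback via mem_cgroup_add_lru_list() (see shrink_active_list below). The pairing, condensed from this diff:

/* isolate: the page leaves both the zone LRU and the memcg LRU */
ClearPageLRU(page);
mem_cgroup_del_lru(page);

/* putback: the page rejoins both lists */
list_move(&page->lru, &zone->lru[lru].list);
mem_cgroup_add_lru_list(page, lru);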
@@ -1035,6 +1048,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1035 struct pagevec pvec; 1048 struct pagevec pvec;
1036 unsigned long nr_scanned = 0; 1049 unsigned long nr_scanned = 0;
1037 unsigned long nr_reclaimed = 0; 1050 unsigned long nr_reclaimed = 0;
1051 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1038 1052
1039 pagevec_init(&pvec, 1); 1053 pagevec_init(&pvec, 1);
1040 1054
@@ -1076,13 +1090,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1076 __mod_zone_page_state(zone, NR_INACTIVE_ANON, 1090 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1077 -count[LRU_INACTIVE_ANON]); 1091 -count[LRU_INACTIVE_ANON]);
1078 1092
1079 if (scan_global_lru(sc)) { 1093 if (scanning_global_lru(sc))
1080 zone->pages_scanned += nr_scan; 1094 zone->pages_scanned += nr_scan;
1081 zone->recent_scanned[0] += count[LRU_INACTIVE_ANON]; 1095
1082 zone->recent_scanned[0] += count[LRU_ACTIVE_ANON]; 1096 reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
1083 zone->recent_scanned[1] += count[LRU_INACTIVE_FILE]; 1097 reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
1084 zone->recent_scanned[1] += count[LRU_ACTIVE_FILE]; 1098 reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE];
1085 } 1099 reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE];
1100
1086 spin_unlock_irq(&zone->lru_lock); 1101 spin_unlock_irq(&zone->lru_lock);
1087 1102
1088 nr_scanned += nr_scan; 1103 nr_scanned += nr_scan;
@@ -1114,7 +1129,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1114 if (current_is_kswapd()) { 1129 if (current_is_kswapd()) {
1115 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); 1130 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
1116 __count_vm_events(KSWAPD_STEAL, nr_freed); 1131 __count_vm_events(KSWAPD_STEAL, nr_freed);
1117 } else if (scan_global_lru(sc)) 1132 } else if (scanning_global_lru(sc))
1118 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); 1133 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
1119 1134
1120 __count_zone_vm_events(PGSTEAL, zone, nr_freed); 1135 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
@@ -1140,10 +1155,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1140 SetPageLRU(page); 1155 SetPageLRU(page);
1141 lru = page_lru(page); 1156 lru = page_lru(page);
1142 add_page_to_lru_list(zone, page, lru); 1157 add_page_to_lru_list(zone, page, lru);
1143 mem_cgroup_move_lists(page, lru); 1158 if (PageActive(page)) {
1144 if (PageActive(page) && scan_global_lru(sc)) {
1145 int file = !!page_is_file_cache(page); 1159 int file = !!page_is_file_cache(page);
1146 zone->recent_rotated[file]++; 1160 reclaim_stat->recent_rotated[file]++;
1147 } 1161 }
1148 if (!pagevec_add(&pvec, page)) { 1162 if (!pagevec_add(&pvec, page)) {
1149 spin_unlock_irq(&zone->lru_lock); 1163 spin_unlock_irq(&zone->lru_lock);
@@ -1173,11 +1187,6 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1173 zone->prev_priority = priority; 1187 zone->prev_priority = priority;
1174} 1188}
1175 1189
1176static inline int zone_is_near_oom(struct zone *zone)
1177{
1178 return zone->pages_scanned >= (zone_lru_pages(zone) * 3);
1179}
1180
1181/* 1190/*
1182 * This moves pages from the active list to the inactive list. 1191 * This moves pages from the active list to the inactive list.
1183 * 1192 *
@@ -1208,6 +1217,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1208 struct page *page; 1217 struct page *page;
1209 struct pagevec pvec; 1218 struct pagevec pvec;
1210 enum lru_list lru; 1219 enum lru_list lru;
1220 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1211 1221
1212 lru_add_drain(); 1222 lru_add_drain();
1213 spin_lock_irq(&zone->lru_lock); 1223 spin_lock_irq(&zone->lru_lock);
@@ -1218,10 +1228,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1218 * zone->pages_scanned is used for detect zone's oom 1228 * zone->pages_scanned is used for detect zone's oom
1219 * mem_cgroup remembers nr_scan by itself. 1229 * mem_cgroup remembers nr_scan by itself.
1220 */ 1230 */
1221 if (scan_global_lru(sc)) { 1231 if (scanning_global_lru(sc)) {
1222 zone->pages_scanned += pgscanned; 1232 zone->pages_scanned += pgscanned;
1223 zone->recent_scanned[!!file] += pgmoved;
1224 } 1233 }
1234 reclaim_stat->recent_scanned[!!file] += pgmoved;
1225 1235
1226 if (file) 1236 if (file)
1227 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); 1237 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
@@ -1248,6 +1258,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1248 list_add(&page->lru, &l_inactive); 1258 list_add(&page->lru, &l_inactive);
1249 } 1259 }
1250 1260
1261 /*
1262 * Move the pages to the [file or anon] inactive list.
1263 */
1264 pagevec_init(&pvec, 1);
1265 pgmoved = 0;
1266 lru = LRU_BASE + file * LRU_FILE;
1267
1251 spin_lock_irq(&zone->lru_lock); 1268 spin_lock_irq(&zone->lru_lock);
1252 /* 1269 /*
1253 * Count referenced pages from currently used mappings as 1270 * Count referenced pages from currently used mappings as
@@ -1255,15 +1272,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1255 * This helps balance scan pressure between file and anonymous 1272 * This helps balance scan pressure between file and anonymous
1256 * pages in get_scan_ratio. 1273 * pages in get_scan_ratio.
1257 */ 1274 */
1258 zone->recent_rotated[!!file] += pgmoved; 1275 reclaim_stat->recent_rotated[!!file] += pgmoved;
1259 1276
1260 /*
1261 * Move the pages to the [file or anon] inactive list.
1262 */
1263 pagevec_init(&pvec, 1);
1264
1265 pgmoved = 0;
1266 lru = LRU_BASE + file * LRU_FILE;
1267 while (!list_empty(&l_inactive)) { 1277 while (!list_empty(&l_inactive)) {
1268 page = lru_to_page(&l_inactive); 1278 page = lru_to_page(&l_inactive);
1269 prefetchw_prev_lru_page(page, &l_inactive, flags); 1279 prefetchw_prev_lru_page(page, &l_inactive, flags);
@@ -1273,7 +1283,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1273 ClearPageActive(page); 1283 ClearPageActive(page);
1274 1284
1275 list_move(&page->lru, &zone->lru[lru].list); 1285 list_move(&page->lru, &zone->lru[lru].list);
1276 mem_cgroup_move_lists(page, lru); 1286 mem_cgroup_add_lru_list(page, lru);
1277 pgmoved++; 1287 pgmoved++;
1278 if (!pagevec_add(&pvec, page)) { 1288 if (!pagevec_add(&pvec, page)) {
1279 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); 1289 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
@@ -1302,6 +1312,38 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1302 pagevec_release(&pvec); 1312 pagevec_release(&pvec);
1303} 1313}
1304 1314
1315static int inactive_anon_is_low_global(struct zone *zone)
1316{
1317 unsigned long active, inactive;
1318
1319 active = zone_page_state(zone, NR_ACTIVE_ANON);
1320 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1321
1322 if (inactive * zone->inactive_ratio < active)
1323 return 1;
1324
1325 return 0;
1326}
1327
1328/**
1329 * inactive_anon_is_low - check if anonymous pages need to be deactivated
1330 * @zone: zone to check
1331 * @sc: scan control of this context
1332 *
1333 * Returns true if the zone does not have enough inactive anon pages,
1334 * meaning some active anon pages need to be deactivated.
1335 */
1336static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1337{
1338 int low;
1339
1340 if (scanning_global_lru(sc))
1341 low = inactive_anon_is_low_global(zone);
1342 else
1343 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1344 return low;
1345}
1346
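For global reclaim the check compares the anon list sizes against zone->inactive_ratio; memcg reclaim asks the controller instead. A worked example of the global test (the ratio value is an assumption picked for illustration):

/*
 * Example, assuming zone->inactive_ratio == 3:
 *
 *   active anon   = 600 pages
 *   inactive anon = 150 pages
 *   150 * 3 = 450 < 600  =>  inactive_anon_is_low() returns 1,
 *
 * so shrink_list()/shrink_zone() will call shrink_active_list() to
 * deactivate some active anon pages and rebalance the two lists.
 */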
1305static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1347static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1306 struct zone *zone, struct scan_control *sc, int priority) 1348 struct zone *zone, struct scan_control *sc, int priority)
1307{ 1349{
@@ -1312,8 +1354,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1312 return 0; 1354 return 0;
1313 } 1355 }
1314 1356
1315 if (lru == LRU_ACTIVE_ANON && 1357 if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) {
1316 (!scan_global_lru(sc) || inactive_anon_is_low(zone))) {
1317 shrink_active_list(nr_to_scan, zone, sc, priority, file); 1358 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1318 return 0; 1359 return 0;
1319 } 1360 }
@@ -1335,12 +1376,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1335 unsigned long anon, file, free; 1376 unsigned long anon, file, free;
1336 unsigned long anon_prio, file_prio; 1377 unsigned long anon_prio, file_prio;
1337 unsigned long ap, fp; 1378 unsigned long ap, fp;
1338 1379 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1339 anon = zone_page_state(zone, NR_ACTIVE_ANON) +
1340 zone_page_state(zone, NR_INACTIVE_ANON);
1341 file = zone_page_state(zone, NR_ACTIVE_FILE) +
1342 zone_page_state(zone, NR_INACTIVE_FILE);
1343 free = zone_page_state(zone, NR_FREE_PAGES);
1344 1380
1345 /* If we have no swap space, do not bother scanning anon pages. */ 1381 /* If we have no swap space, do not bother scanning anon pages. */
1346 if (nr_swap_pages <= 0) { 1382 if (nr_swap_pages <= 0) {
@@ -1349,11 +1385,20 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1349 return; 1385 return;
1350 } 1386 }
1351 1387
1352 /* If we have very few page cache pages, force-scan anon pages. */ 1388 anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
1353 if (unlikely(file + free <= zone->pages_high)) { 1389 zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
1354 percent[0] = 100; 1390 file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
1355 percent[1] = 0; 1391 zone_nr_pages(zone, sc, LRU_INACTIVE_FILE);
1356 return; 1392
1393 if (scanning_global_lru(sc)) {
1394 free = zone_page_state(zone, NR_FREE_PAGES);
1395 /* If we have very few page cache pages,
1396 force-scan anon pages. */
1397 if (unlikely(file + free <= zone->pages_high)) {
1398 percent[0] = 100;
1399 percent[1] = 0;
1400 return;
1401 }
1357 } 1402 }
1358 1403
1359 /* 1404 /*
@@ -1367,17 +1412,17 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1367 * 1412 *
1368 * anon in [0], file in [1] 1413 * anon in [0], file in [1]
1369 */ 1414 */
1370 if (unlikely(zone->recent_scanned[0] > anon / 4)) { 1415 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1371 spin_lock_irq(&zone->lru_lock); 1416 spin_lock_irq(&zone->lru_lock);
1372 zone->recent_scanned[0] /= 2; 1417 reclaim_stat->recent_scanned[0] /= 2;
1373 zone->recent_rotated[0] /= 2; 1418 reclaim_stat->recent_rotated[0] /= 2;
1374 spin_unlock_irq(&zone->lru_lock); 1419 spin_unlock_irq(&zone->lru_lock);
1375 } 1420 }
1376 1421
1377 if (unlikely(zone->recent_scanned[1] > file / 4)) { 1422 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1378 spin_lock_irq(&zone->lru_lock); 1423 spin_lock_irq(&zone->lru_lock);
1379 zone->recent_scanned[1] /= 2; 1424 reclaim_stat->recent_scanned[1] /= 2;
1380 zone->recent_rotated[1] /= 2; 1425 reclaim_stat->recent_rotated[1] /= 2;
1381 spin_unlock_irq(&zone->lru_lock); 1426 spin_unlock_irq(&zone->lru_lock);
1382 } 1427 }
1383 1428
@@ -1393,11 +1438,11 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1393 * proportional to the fraction of recently scanned pages on 1438 * proportional to the fraction of recently scanned pages on
1394 * each list that were recently referenced and in active use. 1439 * each list that were recently referenced and in active use.
1395 */ 1440 */
1396 ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1); 1441 ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
1397 ap /= zone->recent_rotated[0] + 1; 1442 ap /= reclaim_stat->recent_rotated[0] + 1;
1398 1443
1399 fp = (file_prio + 1) * (zone->recent_scanned[1] + 1); 1444 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1400 fp /= zone->recent_rotated[1] + 1; 1445 fp /= reclaim_stat->recent_rotated[1] + 1;
1401 1446
1402 /* Normalize to percentages */ 1447 /* Normalize to percentages */
1403 percent[0] = 100 * ap / (ap + fp + 1); 1448 percent[0] = 100 * ap / (ap + fp + 1);
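The recent_scanned/recent_rotated ratios act as feedback: a list whose pages keep getting re-activated earns a smaller share of the next scan. A worked example with made-up numbers (anon_prio = 60 and file_prio = 140 are assumptions for illustration):

/*
 * Made-up numbers:
 *   anon: recent_scanned = 1000, recent_rotated = 900  (mostly re-activated)
 *   file: recent_scanned = 1000, recent_rotated = 100
 *
 *   ap = (60 + 1)  * (1000 + 1) / (900 + 1)  =   67
 *   fp = (140 + 1) * (1000 + 1) / (100 + 1)  = 1397
 *
 *   percent[0] = 100 * 67 / (67 + 1397 + 1)  =    4   (anon share)
 *
 * so roughly 4% of the scan pressure goes to anon and the rest to file,
 * until the periodic halving above decays the history.
 */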
@@ -1408,69 +1453,72 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1408/* 1453/*
1409 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1454 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1410 */ 1455 */
1411static unsigned long shrink_zone(int priority, struct zone *zone, 1456static void shrink_zone(int priority, struct zone *zone,
1412 struct scan_control *sc) 1457 struct scan_control *sc)
1413{ 1458{
1414 unsigned long nr[NR_LRU_LISTS]; 1459 unsigned long nr[NR_LRU_LISTS];
1415 unsigned long nr_to_scan; 1460 unsigned long nr_to_scan;
1416 unsigned long nr_reclaimed = 0;
1417 unsigned long percent[2]; /* anon @ 0; file @ 1 */ 1461 unsigned long percent[2]; /* anon @ 0; file @ 1 */
1418 enum lru_list l; 1462 enum lru_list l;
1463 unsigned long nr_reclaimed = sc->nr_reclaimed;
1464 unsigned long swap_cluster_max = sc->swap_cluster_max;
1419 1465
1420 get_scan_ratio(zone, sc, percent); 1466 get_scan_ratio(zone, sc, percent);
1421 1467
1422 for_each_evictable_lru(l) { 1468 for_each_evictable_lru(l) {
1423 if (scan_global_lru(sc)) { 1469 int file = is_file_lru(l);
1424 int file = is_file_lru(l); 1470 int scan;
1425 int scan; 1471
1426 1472 scan = zone_page_state(zone, NR_LRU_BASE + l);
1427 scan = zone_page_state(zone, NR_LRU_BASE + l); 1473 if (priority) {
1428 if (priority) { 1474 scan >>= priority;
1429 scan >>= priority; 1475 scan = (scan * percent[file]) / 100;
1430 scan = (scan * percent[file]) / 100; 1476 }
1431 } 1477 if (scanning_global_lru(sc)) {
1432 zone->lru[l].nr_scan += scan; 1478 zone->lru[l].nr_scan += scan;
1433 nr[l] = zone->lru[l].nr_scan; 1479 nr[l] = zone->lru[l].nr_scan;
1434 if (nr[l] >= sc->swap_cluster_max) 1480 if (nr[l] >= swap_cluster_max)
1435 zone->lru[l].nr_scan = 0; 1481 zone->lru[l].nr_scan = 0;
1436 else 1482 else
1437 nr[l] = 0; 1483 nr[l] = 0;
1438 } else { 1484 } else
1439 /* 1485 nr[l] = scan;
1440 * This reclaim occurs not because zone memory shortage
1441 * but because memory controller hits its limit.
1442 * Don't modify zone reclaim related data.
1443 */
1444 nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
1445 priority, l);
1446 }
1447 } 1486 }
1448 1487
1449 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1488 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1450 nr[LRU_INACTIVE_FILE]) { 1489 nr[LRU_INACTIVE_FILE]) {
1451 for_each_evictable_lru(l) { 1490 for_each_evictable_lru(l) {
1452 if (nr[l]) { 1491 if (nr[l]) {
1453 nr_to_scan = min(nr[l], 1492 nr_to_scan = min(nr[l], swap_cluster_max);
1454 (unsigned long)sc->swap_cluster_max);
1455 nr[l] -= nr_to_scan; 1493 nr[l] -= nr_to_scan;
1456 1494
1457 nr_reclaimed += shrink_list(l, nr_to_scan, 1495 nr_reclaimed += shrink_list(l, nr_to_scan,
1458 zone, sc, priority); 1496 zone, sc, priority);
1459 } 1497 }
1460 } 1498 }
1499 /*
1500 * On large memory systems, scan >> priority can become
1501 * really large. This is fine for the starting priority;
1502 * we want to put equal scanning pressure on each zone.
1503 * However, if the VM has a harder time of freeing pages,
1504 * with multiple processes reclaiming pages, the total
1505 * freeing target can get unreasonably large.
1506 */
1507 if (nr_reclaimed > swap_cluster_max &&
1508 priority < DEF_PRIORITY && !current_is_kswapd())
1509 break;
1461 } 1510 }
1462 1511
1512 sc->nr_reclaimed = nr_reclaimed;
1513
1463 /* 1514 /*
1464 * Even if we did not try to evict anon pages at all, we want to 1515 * Even if we did not try to evict anon pages at all, we want to
1465 * rebalance the anon lru active/inactive ratio. 1516 * rebalance the anon lru active/inactive ratio.
1466 */ 1517 */
1467 if (!scan_global_lru(sc) || inactive_anon_is_low(zone)) 1518 if (inactive_anon_is_low(zone, sc))
1468 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1469 else if (!scan_global_lru(sc))
1470 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1519 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1471 1520
1472 throttle_vm_writeout(sc->gfp_mask); 1521 throttle_vm_writeout(sc->gfp_mask);
1473 return nr_reclaimed;
1474} 1522}
1475 1523
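The new early exit keeps direct reclaimers from massively over-reclaiming when scan >> priority becomes huge on large systems, while kswapd and the first pass at DEF_PRIORITY still apply even pressure to every zone. The exit condition, restated with editorial comments:

/*
 * Break out of the while (nr[...]) loop only when all three hold:
 *   - more than one cluster has already been reclaimed,
 *   - this is not the gentle first pass (priority < DEF_PRIORITY),
 *   - the caller is a direct reclaimer, not kswapd.
 */
if (nr_reclaimed > swap_cluster_max &&
    priority < DEF_PRIORITY && !current_is_kswapd())
        break;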
1476/* 1524/*
@@ -1484,16 +1532,13 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1484 * b) The zones may be over pages_high but they must go *over* pages_high to 1532 * b) The zones may be over pages_high but they must go *over* pages_high to
1485 * satisfy the `incremental min' zone defense algorithm. 1533 * satisfy the `incremental min' zone defense algorithm.
1486 * 1534 *
1487 * Returns the number of reclaimed pages.
1488 *
1489 * If a zone is deemed to be full of pinned pages then just give it a light 1535 * If a zone is deemed to be full of pinned pages then just give it a light
1490 * scan then give up on it. 1536 * scan then give up on it.
1491 */ 1537 */
1492static unsigned long shrink_zones(int priority, struct zonelist *zonelist, 1538static void shrink_zones(int priority, struct zonelist *zonelist,
1493 struct scan_control *sc) 1539 struct scan_control *sc)
1494{ 1540{
1495 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1541 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1496 unsigned long nr_reclaimed = 0;
1497 struct zoneref *z; 1542 struct zoneref *z;
1498 struct zone *zone; 1543 struct zone *zone;
1499 1544
@@ -1505,7 +1550,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1505 * Take care memory controller reclaiming has small influence 1550 * Take care memory controller reclaiming has small influence
1506 * to global LRU. 1551 * to global LRU.
1507 */ 1552 */
1508 if (scan_global_lru(sc)) { 1553 if (scanning_global_lru(sc)) {
1509 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1554 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1510 continue; 1555 continue;
1511 note_zone_scanning_priority(zone, priority); 1556 note_zone_scanning_priority(zone, priority);
@@ -1524,10 +1569,8 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1524 priority); 1569 priority);
1525 } 1570 }
1526 1571
1527 nr_reclaimed += shrink_zone(priority, zone, sc); 1572 shrink_zone(priority, zone, sc);
1528 } 1573 }
1529
1530 return nr_reclaimed;
1531} 1574}
1532 1575
1533/* 1576/*
@@ -1552,7 +1595,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1552 int priority; 1595 int priority;
1553 unsigned long ret = 0; 1596 unsigned long ret = 0;
1554 unsigned long total_scanned = 0; 1597 unsigned long total_scanned = 0;
1555 unsigned long nr_reclaimed = 0;
1556 struct reclaim_state *reclaim_state = current->reclaim_state; 1598 struct reclaim_state *reclaim_state = current->reclaim_state;
1557 unsigned long lru_pages = 0; 1599 unsigned long lru_pages = 0;
1558 struct zoneref *z; 1600 struct zoneref *z;
@@ -1561,12 +1603,12 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1561 1603
1562 delayacct_freepages_start(); 1604 delayacct_freepages_start();
1563 1605
1564 if (scan_global_lru(sc)) 1606 if (scanning_global_lru(sc))
1565 count_vm_event(ALLOCSTALL); 1607 count_vm_event(ALLOCSTALL);
1566 /* 1608 /*
1567 * mem_cgroup will not do shrink_slab. 1609 * mem_cgroup will not do shrink_slab.
1568 */ 1610 */
1569 if (scan_global_lru(sc)) { 1611 if (scanning_global_lru(sc)) {
1570 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1612 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1571 1613
1572 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1614 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
@@ -1580,21 +1622,21 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1580 sc->nr_scanned = 0; 1622 sc->nr_scanned = 0;
1581 if (!priority) 1623 if (!priority)
1582 disable_swap_token(); 1624 disable_swap_token();
1583 nr_reclaimed += shrink_zones(priority, zonelist, sc); 1625 shrink_zones(priority, zonelist, sc);
1584 /* 1626 /*
1585 * Don't shrink slabs when reclaiming memory from 1627 * Don't shrink slabs when reclaiming memory from
1586 * over limit cgroups 1628 * over limit cgroups
1587 */ 1629 */
1588 if (scan_global_lru(sc)) { 1630 if (scanning_global_lru(sc)) {
1589 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); 1631 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
1590 if (reclaim_state) { 1632 if (reclaim_state) {
1591 nr_reclaimed += reclaim_state->reclaimed_slab; 1633 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
1592 reclaim_state->reclaimed_slab = 0; 1634 reclaim_state->reclaimed_slab = 0;
1593 } 1635 }
1594 } 1636 }
1595 total_scanned += sc->nr_scanned; 1637 total_scanned += sc->nr_scanned;
1596 if (nr_reclaimed >= sc->swap_cluster_max) { 1638 if (sc->nr_reclaimed >= sc->swap_cluster_max) {
1597 ret = nr_reclaimed; 1639 ret = sc->nr_reclaimed;
1598 goto out; 1640 goto out;
1599 } 1641 }
1600 1642
@@ -1616,8 +1658,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1616 congestion_wait(WRITE, HZ/10); 1658 congestion_wait(WRITE, HZ/10);
1617 } 1659 }
1618 /* top priority shrink_zones still had more to do? don't OOM, then */ 1660 /* top priority shrink_zones still had more to do? don't OOM, then */
1619 if (!sc->all_unreclaimable && scan_global_lru(sc)) 1661 if (!sc->all_unreclaimable && scanning_global_lru(sc))
1620 ret = nr_reclaimed; 1662 ret = sc->nr_reclaimed;
1621out: 1663out:
1622 /* 1664 /*
1623 * Now that we've scanned all the zones at this priority level, note 1665 * Now that we've scanned all the zones at this priority level, note
@@ -1629,7 +1671,7 @@ out:
1629 if (priority < 0) 1671 if (priority < 0)
1630 priority = 0; 1672 priority = 0;
1631 1673
1632 if (scan_global_lru(sc)) { 1674 if (scanning_global_lru(sc)) {
1633 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1675 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1634 1676
1635 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1677 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
@@ -1665,19 +1707,24 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1665#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1707#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1666 1708
1667unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 1709unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1668 gfp_t gfp_mask) 1710 gfp_t gfp_mask,
1711 bool noswap,
1712 unsigned int swappiness)
1669{ 1713{
1670 struct scan_control sc = { 1714 struct scan_control sc = {
1671 .may_writepage = !laptop_mode, 1715 .may_writepage = !laptop_mode,
1672 .may_swap = 1, 1716 .may_swap = 1,
1673 .swap_cluster_max = SWAP_CLUSTER_MAX, 1717 .swap_cluster_max = SWAP_CLUSTER_MAX,
1674 .swappiness = vm_swappiness, 1718 .swappiness = swappiness,
1675 .order = 0, 1719 .order = 0,
1676 .mem_cgroup = mem_cont, 1720 .mem_cgroup = mem_cont,
1677 .isolate_pages = mem_cgroup_isolate_pages, 1721 .isolate_pages = mem_cgroup_isolate_pages,
1678 }; 1722 };
1679 struct zonelist *zonelist; 1723 struct zonelist *zonelist;
1680 1724
1725 if (noswap)
1726 sc.may_swap = 0;
1727
1681 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 1728 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1682 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 1729 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1683 zonelist = NODE_DATA(numa_node_id())->node_zonelists; 1730 zonelist = NODE_DATA(numa_node_id())->node_zonelists;
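try_to_free_mem_cgroup_pages() now takes the cgroup's own swappiness and a noswap flag instead of hard-coding vm_swappiness. A hedged usage sketch; memcg_reclaim_example() and its arguments are hypothetical, only the callee and its parameters come from the patch:

static unsigned long memcg_reclaim_example(struct mem_cgroup *memcg,
                                           gfp_t gfp_mask, bool hit_memsw_limit)
{
        unsigned int swappiness = 60;   /* a real caller reads the cgroup's setting */

        /* avoid swap entirely when the mem+swap limit is what was hit */
        return try_to_free_mem_cgroup_pages(memcg, gfp_mask,
                                            hit_memsw_limit, swappiness);
}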
@@ -1712,7 +1759,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1712 int priority; 1759 int priority;
1713 int i; 1760 int i;
1714 unsigned long total_scanned; 1761 unsigned long total_scanned;
1715 unsigned long nr_reclaimed;
1716 struct reclaim_state *reclaim_state = current->reclaim_state; 1762 struct reclaim_state *reclaim_state = current->reclaim_state;
1717 struct scan_control sc = { 1763 struct scan_control sc = {
1718 .gfp_mask = GFP_KERNEL, 1764 .gfp_mask = GFP_KERNEL,
@@ -1731,7 +1777,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1731 1777
1732loop_again: 1778loop_again:
1733 total_scanned = 0; 1779 total_scanned = 0;
1734 nr_reclaimed = 0; 1780 sc.nr_reclaimed = 0;
1735 sc.may_writepage = !laptop_mode; 1781 sc.may_writepage = !laptop_mode;
1736 count_vm_event(PAGEOUTRUN); 1782 count_vm_event(PAGEOUTRUN);
1737 1783
@@ -1766,7 +1812,7 @@ loop_again:
1766 * Do some background aging of the anon list, to give 1812 * Do some background aging of the anon list, to give
1767 * pages a chance to be referenced before reclaiming. 1813 * pages a chance to be referenced before reclaiming.
1768 */ 1814 */
1769 if (inactive_anon_is_low(zone)) 1815 if (inactive_anon_is_low(zone, &sc))
1770 shrink_active_list(SWAP_CLUSTER_MAX, zone, 1816 shrink_active_list(SWAP_CLUSTER_MAX, zone,
1771 &sc, priority, 0); 1817 &sc, priority, 0);
1772 1818
@@ -1817,11 +1863,11 @@ loop_again:
1817 */ 1863 */
1818 if (!zone_watermark_ok(zone, order, 8*zone->pages_high, 1864 if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
1819 end_zone, 0)) 1865 end_zone, 0))
1820 nr_reclaimed += shrink_zone(priority, zone, &sc); 1866 shrink_zone(priority, zone, &sc);
1821 reclaim_state->reclaimed_slab = 0; 1867 reclaim_state->reclaimed_slab = 0;
1822 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1868 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1823 lru_pages); 1869 lru_pages);
1824 nr_reclaimed += reclaim_state->reclaimed_slab; 1870 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
1825 total_scanned += sc.nr_scanned; 1871 total_scanned += sc.nr_scanned;
1826 if (zone_is_all_unreclaimable(zone)) 1872 if (zone_is_all_unreclaimable(zone))
1827 continue; 1873 continue;
@@ -1835,7 +1881,7 @@ loop_again:
1835 * even in laptop mode 1881 * even in laptop mode
1836 */ 1882 */
1837 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 1883 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
1838 total_scanned > nr_reclaimed + nr_reclaimed / 2) 1884 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
1839 sc.may_writepage = 1; 1885 sc.may_writepage = 1;
1840 } 1886 }
1841 if (all_zones_ok) 1887 if (all_zones_ok)
@@ -1853,7 +1899,7 @@ loop_again:
1853 * matches the direct reclaim path behaviour in terms of impact 1899 * matches the direct reclaim path behaviour in terms of impact
1854 * on zone->*_priority. 1900 * on zone->*_priority.
1855 */ 1901 */
1856 if (nr_reclaimed >= SWAP_CLUSTER_MAX) 1902 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
1857 break; 1903 break;
1858 } 1904 }
1859out: 1905out:
@@ -1872,10 +1918,27 @@ out:
1872 1918
1873 try_to_freeze(); 1919 try_to_freeze();
1874 1920
1921 /*
1922 * Fragmentation may mean that the system cannot be
1923 * rebalanced for high-order allocations in all zones.
1924 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
1925 * it means the zones have been fully scanned and are still
1926 * not balanced. For high-order allocations, there is
1927 * little point trying all over again as kswapd may
1928 * infinite loop.
1929 *
1930 * Instead, recheck all watermarks at order-0 as they
1931 * are the most important. If watermarks are ok, kswapd will go
1932 * back to sleep. High-order users can still perform direct
1933 * reclaim if they wish.
1934 */
1935 if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
1936 order = sc.order = 0;
1937
1875 goto loop_again; 1938 goto loop_again;
1876 } 1939 }
1877 1940
1878 return nr_reclaimed; 1941 return sc.nr_reclaimed;
1879} 1942}
1880 1943
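The order-0 recheck stops kswapd from spinning on high-order requests it cannot satisfy because of fragmentation: if a whole pass freed fewer than SWAP_CLUSTER_MAX pages, it drops back to order-0 balancing and leaves high-order callers to direct reclaim. Condensed, with editorial comments:

if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) {
        /*
         * Every zone was fully scanned and is still not balanced at
         * this order; looping again would make no progress.  Check
         * only the order-0 watermarks from here on.
         */
        order = sc.order = 0;
}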
1881/* 1944/*
@@ -2227,7 +2290,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2227 struct task_struct *p = current; 2290 struct task_struct *p = current;
2228 struct reclaim_state reclaim_state; 2291 struct reclaim_state reclaim_state;
2229 int priority; 2292 int priority;
2230 unsigned long nr_reclaimed = 0;
2231 struct scan_control sc = { 2293 struct scan_control sc = {
2232 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 2294 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
2233 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), 2295 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -2260,9 +2322,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2260 priority = ZONE_RECLAIM_PRIORITY; 2322 priority = ZONE_RECLAIM_PRIORITY;
2261 do { 2323 do {
2262 note_zone_scanning_priority(zone, priority); 2324 note_zone_scanning_priority(zone, priority);
2263 nr_reclaimed += shrink_zone(priority, zone, &sc); 2325 shrink_zone(priority, zone, &sc);
2264 priority--; 2326 priority--;
2265 } while (priority >= 0 && nr_reclaimed < nr_pages); 2327 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
2266 } 2328 }
2267 2329
2268 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2330 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -2286,13 +2348,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2286 * Update nr_reclaimed by the number of slab pages we 2348 * Update nr_reclaimed by the number of slab pages we
2287 * reclaimed from this zone. 2349 * reclaimed from this zone.
2288 */ 2350 */
2289 nr_reclaimed += slab_reclaimable - 2351 sc.nr_reclaimed += slab_reclaimable -
2290 zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2352 zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2291 } 2353 }
2292 2354
2293 p->reclaim_state = NULL; 2355 p->reclaim_state = NULL;
2294 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 2356 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
2295 return nr_reclaimed >= nr_pages; 2357 return sc.nr_reclaimed >= nr_pages;
2296} 2358}
2297 2359
2298int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 2360int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
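__zone_reclaim() credits freed slab pages by delta, sampling NR_SLAB_RECLAIMABLE before and after its shrink_slab() loop, and now adds that delta to sc.nr_reclaimed as well. An editorial paraphrase of that accounting:

unsigned long slab_before = zone_page_state(zone, NR_SLAB_RECLAIMABLE);

/* ... shrink_slab() loop runs here ... */

sc.nr_reclaimed += slab_before - zone_page_state(zone, NR_SLAB_RECLAIMABLE);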
@@ -2393,6 +2455,7 @@ retry:
2393 2455
2394 __dec_zone_state(zone, NR_UNEVICTABLE); 2456 __dec_zone_state(zone, NR_UNEVICTABLE);
2395 list_move(&page->lru, &zone->lru[l].list); 2457 list_move(&page->lru, &zone->lru[l].list);
2458 mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
2396 __inc_zone_state(zone, NR_INACTIVE_ANON + l); 2459 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
2397 __count_vm_event(UNEVICTABLE_PGRESCUED); 2460 __count_vm_event(UNEVICTABLE_PGRESCUED);
2398 } else { 2461 } else {
@@ -2401,6 +2464,7 @@ retry:
2401 */ 2464 */
2402 SetPageUnevictable(page); 2465 SetPageUnevictable(page);
2403 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); 2466 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
2467 mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
2404 if (page_evictable(page, NULL)) 2468 if (page_evictable(page, NULL))
2405 goto retry; 2469 goto retry;
2406 } 2470 }
@@ -2472,7 +2536,7 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
2472 * back onto @zone's unevictable list. 2536 * back onto @zone's unevictable list.
2473 */ 2537 */
2474#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ 2538#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
2475void scan_zone_unevictable_pages(struct zone *zone) 2539static void scan_zone_unevictable_pages(struct zone *zone)
2476{ 2540{
2477 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; 2541 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
2478 unsigned long scan; 2542 unsigned long scan;
@@ -2514,7 +2578,7 @@ void scan_zone_unevictable_pages(struct zone *zone)
2514 * that has possibly/probably made some previously unevictable pages 2578 * that has possibly/probably made some previously unevictable pages
2515 * evictable. 2579 * evictable.
2516 */ 2580 */
2517void scan_all_zones_unevictable_pages(void) 2581static void scan_all_zones_unevictable_pages(void)
2518{ 2582{
2519 struct zone *zone; 2583 struct zone *zone;
2520 2584