author     Dan Williams <dan.j.williams@intel.com>    2009-09-08 20:55:21 -0400
committer  Dan Williams <dan.j.williams@intel.com>    2009-09-08 20:55:21 -0400
commit     bbb20089a3275a19e475dbc21320c3742e3ca423
tree       216fdc1cbef450ca688135c5b8969169482d9a48 /mm/vmscan.c
parent     3e48e656903e9fd8bc805c6a2c4264d7808d315b
parent     657a77fa7284d8ae28dfa48f1dc5d919bf5b2843
Merge branch 'dmaengine' into async-tx-next
Conflicts:
	crypto/async_tx/async_xor.c
	drivers/dma/ioat/dma_v2.h
	drivers/dma/ioat/pci.c
	drivers/md/raid5.c
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--	mm/vmscan.c	380
1 file changed, 240 insertions, 140 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d254306562cd..54155268dfca 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -470,8 +470,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
 		swp_entry_t swap = { .val = page_private(page) };
 		__delete_from_swap_cache(page);
 		spin_unlock_irq(&mapping->tree_lock);
-		mem_cgroup_uncharge_swapcache(page, swap);
-		swap_free(swap);
+		swapcache_free(swap, page);
 	} else {
 		__remove_from_page_cache(page);
 		spin_unlock_irq(&mapping->tree_lock);
@@ -514,7 +513,6 @@ int remove_mapping(struct address_space *mapping, struct page *page)
  *
  * lru_lock must not be held, interrupts must be enabled.
  */
-#ifdef CONFIG_UNEVICTABLE_LRU
 void putback_lru_page(struct page *page)
 {
 	int lru;
@@ -568,20 +566,6 @@ redo:
 	put_page(page);		/* drop ref from isolate */
 }
 
-#else /* CONFIG_UNEVICTABLE_LRU */
-
-void putback_lru_page(struct page *page)
-{
-	int lru;
-	VM_BUG_ON(PageLRU(page));
-
-	lru = !!TestClearPageActive(page) + page_is_file_cache(page);
-	lru_cache_add_lru(page, lru);
-	put_page(page);
-}
-#endif /* CONFIG_UNEVICTABLE_LRU */
-
-
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -593,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 	struct pagevec freed_pvec;
 	int pgactivate = 0;
 	unsigned long nr_reclaimed = 0;
+	unsigned long vm_flags;
 
 	cond_resched();
 
@@ -643,7 +628,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				goto keep_locked;
 		}
 
-		referenced = page_referenced(page, 1, sc->mem_cgroup);
+		referenced = page_referenced(page, 1,
+						sc->mem_cgroup, &vm_flags);
 		/* In active use or really unfreeable? Activate it. */
 		if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
 					referenced && page_mapping_inuse(page))
@@ -851,7 +837,6 @@ int __isolate_lru_page(struct page *page, int mode, int file)
 		 */
 		ClearPageLRU(page);
 		ret = 0;
-		mem_cgroup_del_lru(page);
 	}
 
 	return ret;
@@ -899,12 +884,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		switch (__isolate_lru_page(page, mode, file)) {
 		case 0:
 			list_move(&page->lru, dst);
+			mem_cgroup_del_lru(page);
 			nr_taken++;
 			break;
 
 		case -EBUSY:
 			/* else it is being freed elsewhere */
 			list_move(&page->lru, src);
+			mem_cgroup_rotate_lru_list(page, page_lru(page));
 			continue;
 
 		default:
@@ -943,18 +930,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 			/* Check that we have not crossed a zone boundary. */
 			if (unlikely(page_zone_id(cursor_page) != zone_id))
 				continue;
-			switch (__isolate_lru_page(cursor_page, mode, file)) {
-			case 0:
+			if (__isolate_lru_page(cursor_page, mode, file) == 0) {
 				list_move(&cursor_page->lru, dst);
+				mem_cgroup_del_lru(cursor_page);
 				nr_taken++;
 				scan++;
-				break;
-
-			case -EBUSY:
-				/* else it is being freed elsewhere */
-				list_move(&cursor_page->lru, src);
-			default:
-				break;	/* ! on LRU or wrong list */
 			}
 		}
 	}
@@ -1061,6 +1041,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 	unsigned long nr_scanned = 0;
 	unsigned long nr_reclaimed = 0;
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+	int lumpy_reclaim = 0;
+
+	/*
+	 * If we need a large contiguous chunk of memory, or have
+	 * trouble getting a small set of contiguous pages, we
+	 * will reclaim both active and inactive pages.
+	 *
+	 * We use the same threshold as pageout congestion_wait below.
+	 */
+	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+		lumpy_reclaim = 1;
+	else if (sc->order && priority < DEF_PRIORITY - 2)
+		lumpy_reclaim = 1;
 
 	pagevec_init(&pvec, 1);
 
@@ -1073,19 +1066,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		unsigned long nr_freed;
 		unsigned long nr_active;
 		unsigned int count[NR_LRU_LISTS] = { 0, };
-		int mode = ISOLATE_INACTIVE;
-
-		/*
-		 * If we need a large contiguous chunk of memory, or have
-		 * trouble getting a small set of contiguous pages, we
-		 * will reclaim both active and inactive pages.
-		 *
-		 * We use the same threshold as pageout congestion_wait below.
-		 */
-		if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
-			mode = ISOLATE_BOTH;
-		else if (sc->order && priority < DEF_PRIORITY - 2)
-			mode = ISOLATE_BOTH;
+		int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
 
 		nr_taken = sc->isolate_pages(sc->swap_cluster_max,
 			     &page_list, &nr_scan, sc->order, mode,
@@ -1122,7 +1103,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		 * but that should be acceptable to the caller
 		 */
 		if (nr_freed < nr_taken && !current_is_kswapd() &&
-					sc->order > PAGE_ALLOC_COSTLY_ORDER) {
+		    lumpy_reclaim) {
 			congestion_wait(WRITE, HZ/10);
 
 			/*
@@ -1217,18 +1198,54 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
  * But we had to alter page->flags anyway.
  */
 
+static void move_active_pages_to_lru(struct zone *zone,
+				     struct list_head *list,
+				     enum lru_list lru)
+{
+	unsigned long pgmoved = 0;
+	struct pagevec pvec;
+	struct page *page;
+
+	pagevec_init(&pvec, 1);
+
+	while (!list_empty(list)) {
+		page = lru_to_page(list);
+		prefetchw_prev_lru_page(page, list, flags);
+
+		VM_BUG_ON(PageLRU(page));
+		SetPageLRU(page);
+
+		VM_BUG_ON(!PageActive(page));
+		if (!is_active_lru(lru))
+			ClearPageActive(page);	/* we are de-activating */
+
+		list_move(&page->lru, &zone->lru[lru].list);
+		mem_cgroup_add_lru_list(page, lru);
+		pgmoved++;
+
+		if (!pagevec_add(&pvec, page) || list_empty(list)) {
+			spin_unlock_irq(&zone->lru_lock);
+			if (buffer_heads_over_limit)
+				pagevec_strip(&pvec);
+			__pagevec_release(&pvec);
+			spin_lock_irq(&zone->lru_lock);
+		}
+	}
+	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
+	if (!is_active_lru(lru))
+		__count_vm_events(PGDEACTIVATE, pgmoved);
+}
 
 static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 			struct scan_control *sc, int priority, int file)
 {
 	unsigned long pgmoved;
-	int pgdeactivate = 0;
 	unsigned long pgscanned;
+	unsigned long vm_flags;
 	LIST_HEAD(l_hold);	/* The pages which were snipped off */
+	LIST_HEAD(l_active);
 	LIST_HEAD(l_inactive);
 	struct page *page;
-	struct pagevec pvec;
-	enum lru_list lru;
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 
 	lru_add_drain();
@@ -1245,13 +1262,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	}
 	reclaim_stat->recent_scanned[!!file] += pgmoved;
 
+	__count_zone_vm_events(PGREFILL, zone, pgscanned);
 	if (file)
 		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
 	else
 		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
 
-	pgmoved = 0;
+	pgmoved = 0;  /* count referenced (mapping) mapped pages */
 	while (!list_empty(&l_hold)) {
 		cond_resched();
 		page = lru_to_page(&l_hold);
@@ -1264,58 +1282,44 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 
 		/* page_referenced clears PageReferenced */
 		if (page_mapping_inuse(page) &&
-		    page_referenced(page, 0, sc->mem_cgroup))
+		    page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
 			pgmoved++;
+			/*
+			 * Identify referenced, file-backed active pages and
+			 * give them one more trip around the active list. So
+			 * that executable code get better chances to stay in
+			 * memory under moderate memory pressure. Anon pages
+			 * are not likely to be evicted by use-once streaming
+			 * IO, plus JVM can create lots of anon VM_EXEC pages,
+			 * so we ignore them here.
+			 */
+			if ((vm_flags & VM_EXEC) && !PageAnon(page)) {
+				list_add(&page->lru, &l_active);
+				continue;
+			}
+		}
 
 		list_add(&page->lru, &l_inactive);
 	}
 
 	/*
-	 * Move the pages to the [file or anon] inactive list.
+	 * Move pages back to the lru list.
 	 */
-	pagevec_init(&pvec, 1);
-	lru = LRU_BASE + file * LRU_FILE;
-
 	spin_lock_irq(&zone->lru_lock);
 	/*
-	 * Count referenced pages from currently used mappings as
-	 * rotated, even though they are moved to the inactive list.
-	 * This helps balance scan pressure between file and anonymous
-	 * pages in get_scan_ratio.
+	 * Count referenced pages from currently used mappings as rotated,
+	 * even though only some of them are actually re-activated. This
+	 * helps balance scan pressure between file and anonymous pages in
+	 * get_scan_ratio.
 	 */
 	reclaim_stat->recent_rotated[!!file] += pgmoved;
 
-	pgmoved = 0;
-	while (!list_empty(&l_inactive)) {
-		page = lru_to_page(&l_inactive);
-		prefetchw_prev_lru_page(page, &l_inactive, flags);
-		VM_BUG_ON(PageLRU(page));
-		SetPageLRU(page);
-		VM_BUG_ON(!PageActive(page));
-		ClearPageActive(page);
+	move_active_pages_to_lru(zone, &l_active,
+						LRU_ACTIVE + file * LRU_FILE);
+	move_active_pages_to_lru(zone, &l_inactive,
+						LRU_BASE + file * LRU_FILE);
 
-		list_move(&page->lru, &zone->lru[lru].list);
-		mem_cgroup_add_lru_list(page, lru);
-		pgmoved++;
-		if (!pagevec_add(&pvec, page)) {
-			__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
-			spin_unlock_irq(&zone->lru_lock);
-			pgdeactivate += pgmoved;
-			pgmoved = 0;
-			if (buffer_heads_over_limit)
-				pagevec_strip(&pvec);
-			__pagevec_release(&pvec);
-			spin_lock_irq(&zone->lru_lock);
-		}
-	}
-	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
-	pgdeactivate += pgmoved;
-	__count_zone_vm_events(PGREFILL, zone, pgscanned);
-	__count_vm_events(PGDEACTIVATE, pgdeactivate);
 	spin_unlock_irq(&zone->lru_lock);
-	if (buffer_heads_over_limit)
-		pagevec_strip(&pvec);
-	pagevec_release(&pvec);
 }
 
 static int inactive_anon_is_low_global(struct zone *zone)
@@ -1350,12 +1354,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
 	return low;
 }
 
+static int inactive_file_is_low_global(struct zone *zone)
+{
+	unsigned long active, inactive;
+
+	active = zone_page_state(zone, NR_ACTIVE_FILE);
+	inactive = zone_page_state(zone, NR_INACTIVE_FILE);
+
+	return (active > inactive);
+}
+
+/**
+ * inactive_file_is_low - check if file pages need to be deactivated
+ * @zone: zone to check
+ * @sc:   scan control of this context
+ *
+ * When the system is doing streaming IO, memory pressure here
+ * ensures that active file pages get deactivated, until more
+ * than half of the file pages are on the inactive list.
+ *
+ * Once we get to that situation, protect the system's working
+ * set from being evicted by disabling active file page aging.
+ *
+ * This uses a different ratio than the anonymous pages, because
+ * the page cache uses a use-once replacement algorithm.
+ */
+static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
+{
+	int low;
+
+	if (scanning_global_lru(sc))
+		low = inactive_file_is_low_global(zone);
+	else
+		low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
+	return low;
+}
+
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 	struct zone *zone, struct scan_control *sc, int priority)
 {
 	int file = is_file_lru(lru);
 
-	if (lru == LRU_ACTIVE_FILE) {
+	if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) {
 		shrink_active_list(nr_to_scan, zone, sc, priority, file);
 		return 0;
 	}
@@ -1384,13 +1424,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 	unsigned long ap, fp;
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 
-	/* If we have no swap space, do not bother scanning anon pages. */
-	if (!sc->may_swap || (nr_swap_pages <= 0)) {
-		percent[0] = 0;
-		percent[1] = 100;
-		return;
-	}
-
 	anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
 		zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
 	file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
@@ -1400,7 +1433,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 	free = zone_page_state(zone, NR_FREE_PAGES);
 	/* If we have very few page cache pages,
 	   force-scan anon pages. */
-	if (unlikely(file + free <= zone->pages_high)) {
+	if (unlikely(file + free <= high_wmark_pages(zone))) {
 		percent[0] = 100;
 		percent[1] = 0;
 		return;
@@ -1455,6 +1488,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 	percent[1] = 100 - percent[0];
 }
 
+/*
+ * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
+ * until we collected @swap_cluster_max pages to scan.
+ */
+static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
+				       unsigned long *nr_saved_scan,
+				       unsigned long swap_cluster_max)
+{
+	unsigned long nr;
+
+	*nr_saved_scan += nr_to_scan;
+	nr = *nr_saved_scan;
+
+	if (nr >= swap_cluster_max)
+		*nr_saved_scan = 0;
+	else
+		nr = 0;
+
+	return nr;
+}
 
 /*
  * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
@@ -1468,26 +1521,30 @@ static void shrink_zone(int priority, struct zone *zone,
 	enum lru_list l;
 	unsigned long nr_reclaimed = sc->nr_reclaimed;
 	unsigned long swap_cluster_max = sc->swap_cluster_max;
+	int noswap = 0;
 
-	get_scan_ratio(zone, sc, percent);
+	/* If we have no swap space, do not bother scanning anon pages. */
+	if (!sc->may_swap || (nr_swap_pages <= 0)) {
+		noswap = 1;
+		percent[0] = 0;
+		percent[1] = 100;
+	} else
+		get_scan_ratio(zone, sc, percent);
 
 	for_each_evictable_lru(l) {
 		int file = is_file_lru(l);
 		unsigned long scan;
 
 		scan = zone_nr_pages(zone, sc, l);
-		if (priority) {
+		if (priority || noswap) {
 			scan >>= priority;
 			scan = (scan * percent[file]) / 100;
 		}
-		if (scanning_global_lru(sc)) {
-			zone->lru[l].nr_scan += scan;
-			nr[l] = zone->lru[l].nr_scan;
-			if (nr[l] >= swap_cluster_max)
-				zone->lru[l].nr_scan = 0;
-			else
-				nr[l] = 0;
-		} else
+		if (scanning_global_lru(sc))
+			nr[l] = nr_scan_try_batch(scan,
+						  &zone->lru[l].nr_saved_scan,
+						  swap_cluster_max);
+		else
 			nr[l] = scan;
 	}
 
@@ -1521,7 +1578,7 @@ static void shrink_zone(int priority, struct zone *zone,
 	 * Even if we did not try to evict anon pages at all, we want to
 	 * rebalance the anon lru active/inactive ratio.
 	 */
-	if (inactive_anon_is_low(zone, sc))
+	if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0)
 		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
 
 	throttle_vm_writeout(sc->gfp_mask);
@@ -1532,11 +1589,13 @@ static void shrink_zone(int priority, struct zone *zone,
  * try to reclaim pages from zones which will satisfy the caller's allocation
  * request.
  *
- * We reclaim from a zone even if that zone is over pages_high. Because:
+ * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
+ * Because:
  * a) The caller may be trying to free *extra* pages to satisfy a higher-order
  *    allocation or
- * b) The zones may be over pages_high but they must go *over* pages_high to
- *    satisfy the `incremental min' zone defense algorithm.
+ * b) The target zone may be at high_wmark_pages(zone) but the lower zones
+ *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
+ *    zone defense algorithm.
  *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
@@ -1742,7 +1801,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 
 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
- * they are all at pages_high.
+ * they are all at high_wmark_pages(zone).
  *
  * Returns the number of pages which were actually freed.
  *
@@ -1755,11 +1814,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
  * the zone for when the problem goes away.
  *
  * kswapd scans the zones in the highmem->normal->dma direction. It skips
- * zones which have free_pages > pages_high, but once a zone is found to have
- * free_pages <= pages_high, we scan that zone and the lower zones regardless
- * of the number of free pages in the lower zones. This interoperates with
- * the page allocator fallback scheme to ensure that aging of pages is balanced
- * across the zones.
+ * zones which have free_pages > high_wmark_pages(zone), but once a zone is
+ * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
+ * lower zones regardless of the number of free pages in the lower zones. This
+ * interoperates with the page allocator fallback scheme to ensure that aging
+ * of pages is balanced across the zones.
  */
 static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 {
@@ -1780,7 +1839,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 	};
 	/*
 	 * temp_priority is used to remember the scanning priority at which
-	 * this zone was successfully refilled to free_pages == pages_high.
+	 * this zone was successfully refilled to
+	 * free_pages == high_wmark_pages(zone).
 	 */
 	int temp_priority[MAX_NR_ZONES];
 
@@ -1825,8 +1885,8 @@ loop_again:
 				shrink_active_list(SWAP_CLUSTER_MAX, zone,
 							&sc, priority, 0);
 
-			if (!zone_watermark_ok(zone, order, zone->pages_high,
-					       0, 0)) {
+			if (!zone_watermark_ok(zone, order,
+					high_wmark_pages(zone), 0, 0)) {
 				end_zone = i;
 				break;
 			}
@@ -1860,8 +1920,8 @@ loop_again:
 					priority != DEF_PRIORITY)
 				continue;
 
-			if (!zone_watermark_ok(zone, order, zone->pages_high,
-					       end_zone, 0))
+			if (!zone_watermark_ok(zone, order,
+					high_wmark_pages(zone), end_zone, 0))
 				all_zones_ok = 0;
 			temp_priority[i] = priority;
 			sc.nr_scanned = 0;
@@ -1870,8 +1930,8 @@ loop_again:
 			 * We put equal pressure on every zone, unless one
 			 * zone has way too many pages free already.
 			 */
-			if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
-						end_zone, 0))
+			if (!zone_watermark_ok(zone, order,
+					8*high_wmark_pages(zone), end_zone, 0))
 				shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
@@ -2037,7 +2097,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 		return;
 
 	pgdat = zone->zone_pgdat;
-	if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
+	if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
@@ -2056,7 +2116,7 @@ unsigned long global_lru_pages(void)
 		+ global_page_state(NR_INACTIVE_FILE);
 }
 
-#ifdef CONFIG_PM
+#ifdef CONFIG_HIBERNATION
 /*
  * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
  * from LRU lists system-wide, for given pass and priority.
@@ -2084,11 +2144,11 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
 						l == LRU_ACTIVE_FILE))
 				continue;
 
-			zone->lru[l].nr_scan += (lru_pages >> prio) + 1;
-			if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
+			zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1;
+			if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) {
 				unsigned long nr_to_scan;
 
-				zone->lru[l].nr_scan = 0;
+				zone->lru[l].nr_saved_scan = 0;
 				nr_to_scan = min(nr_pages, lru_pages);
 				nr_reclaimed += shrink_list(l, nr_to_scan, zone,
 								sc, prio);
@@ -2196,7 +2256,7 @@ out:
 
 	return sc.nr_reclaimed;
 }
-#endif
+#endif /* CONFIG_HIBERNATION */
 
 /* It's optimal to keep kswapds on the same CPUs as their memory, but
    not required for correctness. So if the last cpu in a node goes
@@ -2290,6 +2350,48 @@ int sysctl_min_unmapped_ratio = 1;
  */
 int sysctl_min_slab_ratio = 5;
 
+static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
+{
+	unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
+	unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
+		zone_page_state(zone, NR_ACTIVE_FILE);
+
+	/*
+	 * It's possible for there to be more file mapped pages than
+	 * accounted for by the pages on the file LRU lists because
+	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
+	 */
+	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
+}
+
+/* Work out how many page cache pages we can reclaim in this reclaim_mode */
+static long zone_pagecache_reclaimable(struct zone *zone)
+{
+	long nr_pagecache_reclaimable;
+	long delta = 0;
+
+	/*
+	 * If RECLAIM_SWAP is set, then all file pages are considered
+	 * potentially reclaimable. Otherwise, we have to worry about
+	 * pages like swapcache and zone_unmapped_file_pages() provides
+	 * a better estimate
+	 */
+	if (zone_reclaim_mode & RECLAIM_SWAP)
+		nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
+	else
+		nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
+
+	/* If we can't clean pages, remove dirty pages from consideration */
+	if (!(zone_reclaim_mode & RECLAIM_WRITE))
+		delta += zone_page_state(zone, NR_FILE_DIRTY);
+
+	/* Watch for any possible underflows due to delta */
+	if (unlikely(delta > nr_pagecache_reclaimable))
+		delta = nr_pagecache_reclaimable;
+
+	return nr_pagecache_reclaimable - delta;
+}
+
 /*
  * Try to free up some pages from this zone through reclaim.
  */
@@ -2324,9 +2426,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	if (zone_page_state(zone, NR_FILE_PAGES) -
-		zone_page_state(zone, NR_FILE_MAPPED) >
-		zone->min_unmapped_pages) {
+	if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
 		/*
 		 * Free memory by calling shrink zone with increasing
 		 * priorities until we have enough memory freed.
@@ -2384,20 +2484,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * if less than a specified percentage of the zone is used by
 	 * unmapped file backed pages.
 	 */
-	if (zone_page_state(zone, NR_FILE_PAGES) -
-	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
-	    && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
-			<= zone->min_slab_pages)
-		return 0;
+	if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
+	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
+		return ZONE_RECLAIM_FULL;
 
 	if (zone_is_all_unreclaimable(zone))
-		return 0;
+		return ZONE_RECLAIM_FULL;
 
 	/*
 	 * Do not scan if the allocation should not be delayed.
 	 */
 	if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
-		return 0;
+		return ZONE_RECLAIM_NOSCAN;
 
 	/*
 	 * Only run zone reclaim on the local zone or on zones that do not
@@ -2407,18 +2505,21 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 */
 	node_id = zone_to_nid(zone);
 	if (node_state(node_id, N_CPU) && node_id != numa_node_id())
-		return 0;
+		return ZONE_RECLAIM_NOSCAN;
 
 	if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
-		return 0;
+		return ZONE_RECLAIM_NOSCAN;
+
 	ret = __zone_reclaim(zone, gfp_mask, order);
 	zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
 
+	if (!ret)
+		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
+
 	return ret;
 }
 #endif
 
-#ifdef CONFIG_UNEVICTABLE_LRU
 /*
  * page_evictable - test whether a page is evictable
  * @page: the page to test
@@ -2665,4 +2766,3 @@ void scan_unevictable_unregister_node(struct node *node)
 	sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
 }
 
-#endif