Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c  376
1 files changed, 238 insertions, 138 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5fa3eda1f03f..4139aa52b941 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -470,10 +470,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
 		swp_entry_t swap = { .val = page_private(page) };
 		__delete_from_swap_cache(page);
 		spin_unlock_irq(&mapping->tree_lock);
-		swap_free(swap);
+		swapcache_free(swap, page);
 	} else {
 		__remove_from_page_cache(page);
 		spin_unlock_irq(&mapping->tree_lock);
+		mem_cgroup_uncharge_cache_page(page);
 	}
 
 	return 1;
@@ -512,7 +513,6 @@ int remove_mapping(struct address_space *mapping, struct page *page)
  *
  * lru_lock must not be held, interrupts must be enabled.
  */
-#ifdef CONFIG_UNEVICTABLE_LRU
 void putback_lru_page(struct page *page)
 {
 	int lru;
@@ -566,20 +566,6 @@ redo:
 	put_page(page);		/* drop ref from isolate */
 }
 
-#else /* CONFIG_UNEVICTABLE_LRU */
-
-void putback_lru_page(struct page *page)
-{
-	int lru;
-	VM_BUG_ON(PageLRU(page));
-
-	lru = !!TestClearPageActive(page) + page_is_file_cache(page);
-	lru_cache_add_lru(page, lru);
-	put_page(page);
-}
-#endif /* CONFIG_UNEVICTABLE_LRU */
-
-
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -591,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 	struct pagevec freed_pvec;
 	int pgactivate = 0;
 	unsigned long nr_reclaimed = 0;
+	unsigned long vm_flags;
 
 	cond_resched();
 
@@ -641,7 +628,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			goto keep_locked;
 		}
 
-		referenced = page_referenced(page, 1, sc->mem_cgroup);
+		referenced = page_referenced(page, 1,
+						sc->mem_cgroup, &vm_flags);
 		/* In active use or really unfreeable? Activate it. */
 		if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
 					referenced && page_mapping_inuse(page))
@@ -941,18 +929,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 			/* Check that we have not crossed a zone boundary. */
 			if (unlikely(page_zone_id(cursor_page) != zone_id))
 				continue;
-			switch (__isolate_lru_page(cursor_page, mode, file)) {
-			case 0:
+			if (__isolate_lru_page(cursor_page, mode, file) == 0) {
 				list_move(&cursor_page->lru, dst);
 				nr_taken++;
 				scan++;
-				break;
-
-			case -EBUSY:
-				/* else it is being freed elsewhere */
-				list_move(&cursor_page->lru, src);
-			default:
-				break;	/* ! on LRU or wrong list */
 			}
 		}
 	}
@@ -1059,6 +1039,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 	unsigned long nr_scanned = 0;
 	unsigned long nr_reclaimed = 0;
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+	int lumpy_reclaim = 0;
+
+	/*
+	 * If we need a large contiguous chunk of memory, or have
+	 * trouble getting a small set of contiguous pages, we
+	 * will reclaim both active and inactive pages.
+	 *
+	 * We use the same threshold as pageout congestion_wait below.
+	 */
+	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+		lumpy_reclaim = 1;
+	else if (sc->order && priority < DEF_PRIORITY - 2)
+		lumpy_reclaim = 1;
 
 	pagevec_init(&pvec, 1);
 
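Annotation (not part of the patch): the lumpy_reclaim flag computed once above replaces the per-iteration mode selection that the next hunk removes. A minimal userspace sketch of the same decision, assuming the contemporary constant values PAGE_ALLOC_COSTLY_ORDER = 3 and DEF_PRIORITY = 12:

#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER	3
#define DEF_PRIORITY		12

static int want_lumpy_reclaim(int order, int priority)
{
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return 1;	/* large contiguous request: always go lumpy */
	if (order && priority < DEF_PRIORITY - 2)
		return 1;	/* small request, but reclaim is struggling */
	return 0;
}

int main(void)
{
	printf("order=4 prio=12 -> %d\n", want_lumpy_reclaim(4, 12)); /* 1 */
	printf("order=2 prio=12 -> %d\n", want_lumpy_reclaim(2, 12)); /* 0 */
	printf("order=2 prio=8  -> %d\n", want_lumpy_reclaim(2, 8));  /* 1 */
	return 0;
}

The patch resumes below.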
@@ -1071,19 +1064,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		unsigned long nr_freed;
 		unsigned long nr_active;
 		unsigned int count[NR_LRU_LISTS] = { 0, };
-		int mode = ISOLATE_INACTIVE;
-
-		/*
-		 * If we need a large contiguous chunk of memory, or have
-		 * trouble getting a small set of contiguous pages, we
-		 * will reclaim both active and inactive pages.
-		 *
-		 * We use the same threshold as pageout congestion_wait below.
-		 */
-		if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
-			mode = ISOLATE_BOTH;
-		else if (sc->order && priority < DEF_PRIORITY - 2)
-			mode = ISOLATE_BOTH;
+		int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
 
 		nr_taken = sc->isolate_pages(sc->swap_cluster_max,
 			&page_list, &nr_scan, sc->order, mode,
@@ -1120,7 +1101,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		 * but that should be acceptable to the caller
 		 */
 		if (nr_freed < nr_taken && !current_is_kswapd() &&
-					sc->order > PAGE_ALLOC_COSTLY_ORDER) {
+		    lumpy_reclaim) {
 			congestion_wait(WRITE, HZ/10);
 
 			/*
@@ -1215,18 +1196,54 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
 	 * But we had to alter page->flags anyway.
 	 */
 
+static void move_active_pages_to_lru(struct zone *zone,
+				     struct list_head *list,
+				     enum lru_list lru)
+{
+	unsigned long pgmoved = 0;
+	struct pagevec pvec;
+	struct page *page;
+
+	pagevec_init(&pvec, 1);
+
+	while (!list_empty(list)) {
+		page = lru_to_page(list);
+		prefetchw_prev_lru_page(page, list, flags);
+
+		VM_BUG_ON(PageLRU(page));
+		SetPageLRU(page);
+
+		VM_BUG_ON(!PageActive(page));
+		if (!is_active_lru(lru))
+			ClearPageActive(page);	/* we are de-activating */
+
+		list_move(&page->lru, &zone->lru[lru].list);
+		mem_cgroup_add_lru_list(page, lru);
+		pgmoved++;
+
+		if (!pagevec_add(&pvec, page) || list_empty(list)) {
+			spin_unlock_irq(&zone->lru_lock);
+			if (buffer_heads_over_limit)
+				pagevec_strip(&pvec);
+			__pagevec_release(&pvec);
+			spin_lock_irq(&zone->lru_lock);
+		}
+	}
+	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
+	if (!is_active_lru(lru))
+		__count_vm_events(PGDEACTIVATE, pgmoved);
+}
 
 static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 			struct scan_control *sc, int priority, int file)
 {
 	unsigned long pgmoved;
-	int pgdeactivate = 0;
 	unsigned long pgscanned;
+	unsigned long vm_flags;
 	LIST_HEAD(l_hold);	/* The pages which were snipped off */
+	LIST_HEAD(l_active);
 	LIST_HEAD(l_inactive);
 	struct page *page;
-	struct pagevec pvec;
-	enum lru_list lru;
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 
 	lru_add_drain();
@@ -1243,13 +1260,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	}
 	reclaim_stat->recent_scanned[!!file] += pgmoved;
 
+	__count_zone_vm_events(PGREFILL, zone, pgscanned);
 	if (file)
 		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
 	else
 		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
 
-	pgmoved = 0;
+	pgmoved = 0;  /* count referenced (mapping) mapped pages */
 	while (!list_empty(&l_hold)) {
 		cond_resched();
 		page = lru_to_page(&l_hold);
@@ -1262,58 +1280,44 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 
 		/* page_referenced clears PageReferenced */
 		if (page_mapping_inuse(page) &&
-		    page_referenced(page, 0, sc->mem_cgroup))
+		    page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
 			pgmoved++;
+			/*
+			 * Identify referenced, file-backed active pages and
+			 * give them one more trip around the active list. So
+			 * that executable code get better chances to stay in
+			 * memory under moderate memory pressure. Anon pages
+			 * are not likely to be evicted by use-once streaming
+			 * IO, plus JVM can create lots of anon VM_EXEC pages,
+			 * so we ignore them here.
+			 */
+			if ((vm_flags & VM_EXEC) && !PageAnon(page)) {
+				list_add(&page->lru, &l_active);
+				continue;
+			}
+		}
 
 		list_add(&page->lru, &l_inactive);
 	}
 
 	/*
-	 * Move the pages to the [file or anon] inactive list.
+	 * Move pages back to the lru list.
 	 */
-	pagevec_init(&pvec, 1);
-	lru = LRU_BASE + file * LRU_FILE;
-
 	spin_lock_irq(&zone->lru_lock);
 	/*
-	 * Count referenced pages from currently used mappings as
-	 * rotated, even though they are moved to the inactive list.
-	 * This helps balance scan pressure between file and anonymous
-	 * pages in get_scan_ratio.
+	 * Count referenced pages from currently used mappings as rotated,
+	 * even though only some of them are actually re-activated. This
+	 * helps balance scan pressure between file and anonymous pages in
+	 * get_scan_ratio.
 	 */
 	reclaim_stat->recent_rotated[!!file] += pgmoved;
 
-	pgmoved = 0;
-	while (!list_empty(&l_inactive)) {
-		page = lru_to_page(&l_inactive);
-		prefetchw_prev_lru_page(page, &l_inactive, flags);
-		VM_BUG_ON(PageLRU(page));
-		SetPageLRU(page);
-		VM_BUG_ON(!PageActive(page));
-		ClearPageActive(page);
+	move_active_pages_to_lru(zone, &l_active,
+						LRU_ACTIVE + file * LRU_FILE);
+	move_active_pages_to_lru(zone, &l_inactive,
+						LRU_BASE   + file * LRU_FILE);
 
-		list_move(&page->lru, &zone->lru[lru].list);
-		mem_cgroup_add_lru_list(page, lru);
-		pgmoved++;
-		if (!pagevec_add(&pvec, page)) {
-			__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
-			spin_unlock_irq(&zone->lru_lock);
-			pgdeactivate += pgmoved;
-			pgmoved = 0;
-			if (buffer_heads_over_limit)
-				pagevec_strip(&pvec);
-			__pagevec_release(&pvec);
-			spin_lock_irq(&zone->lru_lock);
-		}
-	}
-	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
-	pgdeactivate += pgmoved;
-	__count_zone_vm_events(PGREFILL, zone, pgscanned);
-	__count_vm_events(PGDEACTIVATE, pgdeactivate);
 	spin_unlock_irq(&zone->lru_lock);
-	if (buffer_heads_over_limit)
-		pagevec_strip(&pvec);
-	pagevec_release(&pvec);
 }
 
 static int inactive_anon_is_low_global(struct zone *zone)
@@ -1348,12 +1352,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
 	return low;
 }
 
+static int inactive_file_is_low_global(struct zone *zone)
+{
+	unsigned long active, inactive;
+
+	active = zone_page_state(zone, NR_ACTIVE_FILE);
+	inactive = zone_page_state(zone, NR_INACTIVE_FILE);
+
+	return (active > inactive);
+}
+
+/**
+ * inactive_file_is_low - check if file pages need to be deactivated
+ * @zone: zone to check
+ * @sc:   scan control of this context
+ *
+ * When the system is doing streaming IO, memory pressure here
+ * ensures that active file pages get deactivated, until more
+ * than half of the file pages are on the inactive list.
+ *
+ * Once we get to that situation, protect the system's working
+ * set from being evicted by disabling active file page aging.
+ *
+ * This uses a different ratio than the anonymous pages, because
+ * the page cache uses a use-once replacement algorithm.
+ */
+static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
+{
+	int low;
+
+	if (scanning_global_lru(sc))
+		low = inactive_file_is_low_global(zone);
+	else
+		low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
+	return low;
+}
+
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 	struct zone *zone, struct scan_control *sc, int priority)
 {
 	int file = is_file_lru(lru);
 
-	if (lru == LRU_ACTIVE_FILE) {
+	if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) {
 		shrink_active_list(nr_to_scan, zone, sc, priority, file);
 		return 0;
 	}
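Annotation (not part of the patch): the effect of the new check is that shrink_list() only deactivates active file pages while they still outnumber inactive ones. A standalone sketch of the global form of the test, fed with made-up counters:

#include <stdio.h>

static int inactive_file_is_low_global(unsigned long active_file,
				       unsigned long inactive_file)
{
	return active_file > inactive_file;
}

int main(void)
{
	/* Streaming IO piles pages onto the inactive file list ... */
	printf("active=8000 inactive=2000 -> low=%d (keep deactivating)\n",
	       inactive_file_is_low_global(8000, 2000));
	/* ... until less than half of the file pages are active. */
	printf("active=4000 inactive=6000 -> low=%d (stop aging actives)\n",
	       inactive_file_is_low_global(4000, 6000));
	return 0;
}

The patch resumes below.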
@@ -1382,13 +1422,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 	unsigned long ap, fp;
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 
-	/* If we have no swap space, do not bother scanning anon pages. */
-	if (!sc->may_swap || (nr_swap_pages <= 0)) {
-		percent[0] = 0;
-		percent[1] = 100;
-		return;
-	}
-
 	anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
 		zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
 	file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
@@ -1398,7 +1431,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 		free = zone_page_state(zone, NR_FREE_PAGES);
 		/* If we have very few page cache pages,
 		   force-scan anon pages. */
-		if (unlikely(file + free <= zone->pages_high)) {
+		if (unlikely(file + free <= high_wmark_pages(zone))) {
 			percent[0] = 100;
 			percent[1] = 0;
 			return;
@@ -1453,6 +1486,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 	percent[1] = 100 - percent[0];
 }
 
+/*
+ * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
+ * until we collected @swap_cluster_max pages to scan.
+ */
+static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
+				       unsigned long *nr_saved_scan,
+				       unsigned long swap_cluster_max)
+{
+	unsigned long nr;
+
+	*nr_saved_scan += nr_to_scan;
+	nr = *nr_saved_scan;
+
+	if (nr >= swap_cluster_max)
+		*nr_saved_scan = 0;
+	else
+		nr = 0;
+
+	return nr;
+}
 
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
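Annotation (not part of the patch): nr_scan_try_batch() above is a pure function, so its batching behaviour can be exercised directly in userspace; the threshold of 32 below is just an example standing in for swap_cluster_max:

#include <stdio.h>

static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
				       unsigned long *nr_saved_scan,
				       unsigned long swap_cluster_max)
{
	unsigned long nr;

	*nr_saved_scan += nr_to_scan;
	nr = *nr_saved_scan;

	if (nr >= swap_cluster_max)
		*nr_saved_scan = 0;	/* release the whole saved batch */
	else
		nr = 0;			/* keep saving, scan nothing yet */

	return nr;
}

int main(void)
{
	unsigned long saved = 0;
	unsigned long i;

	/* Ask for 13 pages at a time against a 32-page batch threshold. */
	for (i = 0; i < 5; i++) {
		unsigned long scan = nr_scan_try_batch(13, &saved, 32);
		printf("request 13 -> scan %lu, saved %lu\n", scan, saved);
	}
	return 0;
}

Requests of 13 accumulate (0, 0) until the saved total reaches 39, which is then scanned in one batch. The patch resumes below.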
@@ -1466,26 +1519,30 @@ static void shrink_zone(int priority, struct zone *zone,
 	enum lru_list l;
 	unsigned long nr_reclaimed = sc->nr_reclaimed;
 	unsigned long swap_cluster_max = sc->swap_cluster_max;
+	int noswap = 0;
 
-	get_scan_ratio(zone, sc, percent);
+	/* If we have no swap space, do not bother scanning anon pages. */
+	if (!sc->may_swap || (nr_swap_pages <= 0)) {
+		noswap = 1;
+		percent[0] = 0;
+		percent[1] = 100;
+	} else
+		get_scan_ratio(zone, sc, percent);
 
 	for_each_evictable_lru(l) {
 		int file = is_file_lru(l);
 		unsigned long scan;
 
 		scan = zone_nr_pages(zone, sc, l);
-		if (priority) {
+		if (priority || noswap) {
 			scan >>= priority;
 			scan = (scan * percent[file]) / 100;
 		}
-		if (scanning_global_lru(sc)) {
-			zone->lru[l].nr_scan += scan;
-			nr[l] = zone->lru[l].nr_scan;
-			if (nr[l] >= swap_cluster_max)
-				zone->lru[l].nr_scan = 0;
-			else
-				nr[l] = 0;
-		} else
+		if (scanning_global_lru(sc))
+			nr[l] = nr_scan_try_batch(scan,
+						  &zone->lru[l].nr_saved_scan,
+						  swap_cluster_max);
+		else
 			nr[l] = scan;
 	}
 
@@ -1519,7 +1576,7 @@ static void shrink_zone(int priority, struct zone *zone,
 	 * Even if we did not try to evict anon pages at all, we want to
 	 * rebalance the anon lru active/inactive ratio.
 	 */
-	if (inactive_anon_is_low(zone, sc))
+	if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0)
 		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
 
 	throttle_vm_writeout(sc->gfp_mask);
@@ -1530,11 +1587,13 @@ static void shrink_zone(int priority, struct zone *zone,
  * try to reclaim pages from zones which will satisfy the caller's allocation
  * request.
  *
- * We reclaim from a zone even if that zone is over pages_high.  Because:
+ * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
+ * Because:
  * a) The caller may be trying to free *extra* pages to satisfy a higher-order
  *    allocation or
- * b) The zones may be over pages_high but they must go *over* pages_high to
- *    satisfy the `incremental min' zone defense algorithm.
+ * b) The target zone may be at high_wmark_pages(zone) but the lower zones
+ *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
+ *    zone defense algorithm.
  *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
@@ -1740,7 +1799,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 
 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
- * they are all at pages_high.
+ * they are all at high_wmark_pages(zone).
  *
  * Returns the number of pages which were actually freed.
  *
@@ -1753,11 +1812,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
  * the zone for when the problem goes away.
  *
  * kswapd scans the zones in the highmem->normal->dma direction.  It skips
- * zones which have free_pages > pages_high, but once a zone is found to have
- * free_pages <= pages_high, we scan that zone and the lower zones regardless
- * of the number of free pages in the lower zones. This interoperates with
- * the page allocator fallback scheme to ensure that aging of pages is balanced
- * across the zones.
+ * zones which have free_pages > high_wmark_pages(zone), but once a zone is
+ * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
+ * lower zones regardless of the number of free pages in the lower zones. This
+ * interoperates with the page allocator fallback scheme to ensure that aging
+ * of pages is balanced across the zones.
  */
 static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 {
@@ -1778,7 +1837,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 	};
 	/*
 	 * temp_priority is used to remember the scanning priority at which
-	 * this zone was successfully refilled to free_pages == pages_high.
+	 * this zone was successfully refilled to
+	 * free_pages == high_wmark_pages(zone).
 	 */
 	int temp_priority[MAX_NR_ZONES];
 
@@ -1823,8 +1883,8 @@ loop_again:
 				shrink_active_list(SWAP_CLUSTER_MAX, zone,
 							&sc, priority, 0);
 
-			if (!zone_watermark_ok(zone, order, zone->pages_high,
-					       0, 0)) {
+			if (!zone_watermark_ok(zone, order,
+					high_wmark_pages(zone), 0, 0)) {
 				end_zone = i;
 				break;
 			}
@@ -1858,8 +1918,8 @@ loop_again:
 			    priority != DEF_PRIORITY)
 				continue;
 
-			if (!zone_watermark_ok(zone, order, zone->pages_high,
-					       end_zone, 0))
+			if (!zone_watermark_ok(zone, order,
+					high_wmark_pages(zone), end_zone, 0))
 				all_zones_ok = 0;
 			temp_priority[i] = priority;
 			sc.nr_scanned = 0;
@@ -1868,8 +1928,8 @@ loop_again:
 			 * We put equal pressure on every zone, unless one
 			 * zone has way too many pages free already.
 			 */
-			if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
-						end_zone, 0))
+			if (!zone_watermark_ok(zone, order,
+					8*high_wmark_pages(zone), end_zone, 0))
 				shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
@@ -2035,7 +2095,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 		return;
 
 	pgdat = zone->zone_pgdat;
-	if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
+	if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
@@ -2054,7 +2114,7 @@ unsigned long global_lru_pages(void)
 		+ global_page_state(NR_INACTIVE_FILE);
 }
 
-#ifdef CONFIG_PM
+#ifdef CONFIG_HIBERNATION
 /*
  * Helper function for shrink_all_memory().  Tries to reclaim 'nr_pages' pages
  * from LRU lists system-wide, for given pass and priority.
@@ -2082,11 +2142,11 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
 						l == LRU_ACTIVE_FILE))
 				continue;
 
-			zone->lru[l].nr_scan += (lru_pages >> prio) + 1;
-			if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
+			zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1;
+			if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) {
 				unsigned long nr_to_scan;
 
-				zone->lru[l].nr_scan = 0;
+				zone->lru[l].nr_saved_scan = 0;
 				nr_to_scan = min(nr_pages, lru_pages);
 				nr_reclaimed += shrink_list(l, nr_to_scan, zone,
 								sc, prio);
@@ -2194,7 +2254,7 @@ out:
 
 	return sc.nr_reclaimed;
 }
-#endif
+#endif /* CONFIG_HIBERNATION */
 
 /* It's optimal to keep kswapds on the same CPUs as their memory, but
    not required for correctness.  So if the last cpu in a node goes
@@ -2288,6 +2348,48 @@ int sysctl_min_unmapped_ratio = 1;
  */
 int sysctl_min_slab_ratio = 5;
 
+static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
+{
+	unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
+	unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
+		zone_page_state(zone, NR_ACTIVE_FILE);
+
+	/*
+	 * It's possible for there to be more file mapped pages than
+	 * accounted for by the pages on the file LRU lists because
+	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
+	 */
+	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
+}
+
+/* Work out how many page cache pages we can reclaim in this reclaim_mode */
+static long zone_pagecache_reclaimable(struct zone *zone)
+{
+	long nr_pagecache_reclaimable;
+	long delta = 0;
+
+	/*
+	 * If RECLAIM_SWAP is set, then all file pages are considered
+	 * potentially reclaimable. Otherwise, we have to worry about
+	 * pages like swapcache and zone_unmapped_file_pages() provides
+	 * a better estimate
+	 */
+	if (zone_reclaim_mode & RECLAIM_SWAP)
+		nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
+	else
+		nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
+
+	/* If we can't clean pages, remove dirty pages from consideration */
+	if (!(zone_reclaim_mode & RECLAIM_WRITE))
+		delta += zone_page_state(zone, NR_FILE_DIRTY);
+
+	/* Watch for any possible underflows due to delta */
+	if (unlikely(delta > nr_pagecache_reclaimable))
+		delta = nr_pagecache_reclaimable;
+
+	return nr_pagecache_reclaimable - delta;
+}
+
 /*
  * Try to free up some pages from this zone through reclaim.
  */
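Annotation (not part of the patch): a worked example of the estimate zone_pagecache_reclaimable() produces when neither RECLAIM_SWAP nor RECLAIM_WRITE is set, using made-up zone counters:

#include <stdio.h>

int main(void)
{
	/* Hypothetical per-zone counters, in pages. */
	long file_lru = 10000;		/* NR_INACTIVE_FILE + NR_ACTIVE_FILE */
	long file_mapped = 3000;	/* NR_FILE_MAPPED */
	long file_dirty = 500;		/* NR_FILE_DIRTY */

	/* Without RECLAIM_SWAP: only unmapped file pages count. */
	long reclaimable = (file_lru > file_mapped) ?
					file_lru - file_mapped : 0;

	/* Without RECLAIM_WRITE: dirty pages cannot be cleaned here. */
	long delta = file_dirty;
	if (delta > reclaimable)
		delta = reclaimable;

	printf("estimated reclaimable page cache: %ld pages\n",
	       reclaimable - delta);	/* 10000 - 3000 - 500 = 6500 */
	return 0;
}

The patch resumes below.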
@@ -2322,9 +2424,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	if (zone_page_state(zone, NR_FILE_PAGES) -
-	    zone_page_state(zone, NR_FILE_MAPPED) >
-	    zone->min_unmapped_pages) {
+	if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
 		/*
 		 * Free memory by calling shrink zone with increasing
 		 * priorities until we have enough memory freed.
@@ -2382,20 +2482,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * if less than a specified percentage of the zone is used by
 	 * unmapped file backed pages.
 	 */
-	if (zone_page_state(zone, NR_FILE_PAGES) -
-	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
-	    && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
-			<= zone->min_slab_pages)
-		return 0;
+	if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
+	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
+		return ZONE_RECLAIM_FULL;
 
 	if (zone_is_all_unreclaimable(zone))
-		return 0;
+		return ZONE_RECLAIM_FULL;
 
 	/*
 	 * Do not scan if the allocation should not be delayed.
 	 */
 	if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
-		return 0;
+		return ZONE_RECLAIM_NOSCAN;
 
 	/*
 	 * Only run zone reclaim on the local zone or on zones that do not
@@ -2405,18 +2503,21 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 */
 	node_id = zone_to_nid(zone);
 	if (node_state(node_id, N_CPU) && node_id != numa_node_id())
-		return 0;
+		return ZONE_RECLAIM_NOSCAN;
 
 	if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
-		return 0;
+		return ZONE_RECLAIM_NOSCAN;
+
 	ret = __zone_reclaim(zone, gfp_mask, order);
 	zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
 
+	if (!ret)
+		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
+
 	return ret;
 }
 #endif
 
-#ifdef CONFIG_UNEVICTABLE_LRU
 /*
  * page_evictable - test whether a page is evictable
  * @page: the page to test
@@ -2663,4 +2764,3 @@ void scan_unevictable_unregister_node(struct node *node)
 	sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
 }
 
-#endif