Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--	mm/vmscan.c	372
1 file changed, 235 insertions(+), 137 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 95c08a8cc2ba..4139aa52b941 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -470,8 +470,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page)
 		swp_entry_t swap = { .val = page_private(page) };
 		__delete_from_swap_cache(page);
 		spin_unlock_irq(&mapping->tree_lock);
-		mem_cgroup_uncharge_swapcache(page, swap);
-		swap_free(swap);
+		swapcache_free(swap, page);
 	} else {
 		__remove_from_page_cache(page);
 		spin_unlock_irq(&mapping->tree_lock);
@@ -514,7 +513,6 @@ int remove_mapping(struct address_space *mapping, struct page *page)
  *
  * lru_lock must not be held, interrupts must be enabled.
  */
-#ifdef CONFIG_UNEVICTABLE_LRU
 void putback_lru_page(struct page *page)
 {
 	int lru;
@@ -568,20 +566,6 @@ redo:
 	put_page(page);		/* drop ref from isolate */
 }
 
-#else /* CONFIG_UNEVICTABLE_LRU */
-
-void putback_lru_page(struct page *page)
-{
-	int lru;
-	VM_BUG_ON(PageLRU(page));
-
-	lru = !!TestClearPageActive(page) + page_is_file_cache(page);
-	lru_cache_add_lru(page, lru);
-	put_page(page);
-}
-#endif /* CONFIG_UNEVICTABLE_LRU */
-
-
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -593,6 +577,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 	struct pagevec freed_pvec;
 	int pgactivate = 0;
 	unsigned long nr_reclaimed = 0;
+	unsigned long vm_flags;
 
 	cond_resched();
 
@@ -643,7 +628,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			goto keep_locked;
 		}
 
-		referenced = page_referenced(page, 1, sc->mem_cgroup);
+		referenced = page_referenced(page, 1,
+						sc->mem_cgroup, &vm_flags);
 		/* In active use or really unfreeable?  Activate it. */
 		if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
 					referenced && page_mapping_inuse(page))
@@ -943,18 +929,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 			/* Check that we have not crossed a zone boundary. */
 			if (unlikely(page_zone_id(cursor_page) != zone_id))
 				continue;
-			switch (__isolate_lru_page(cursor_page, mode, file)) {
-			case 0:
+			if (__isolate_lru_page(cursor_page, mode, file) == 0) {
 				list_move(&cursor_page->lru, dst);
 				nr_taken++;
 				scan++;
-				break;
-
-			case -EBUSY:
-				/* else it is being freed elsewhere */
-				list_move(&cursor_page->lru, src);
-			default:
-				break;	/* ! on LRU or wrong list */
 			}
 		}
 	}
@@ -1061,6 +1039,19 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 	unsigned long nr_scanned = 0;
 	unsigned long nr_reclaimed = 0;
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
+	int lumpy_reclaim = 0;
+
+	/*
+	 * If we need a large contiguous chunk of memory, or have
+	 * trouble getting a small set of contiguous pages, we
+	 * will reclaim both active and inactive pages.
+	 *
+	 * We use the same threshold as pageout congestion_wait below.
+	 */
+	if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+		lumpy_reclaim = 1;
+	else if (sc->order && priority < DEF_PRIORITY - 2)
+		lumpy_reclaim = 1;
 
 	pagevec_init(&pvec, 1);
 
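The hunk above hoists the lumpy-reclaim decision out of the per-batch loop (see the next hunk, which consumes it). As a quick illustration of when it triggers, here is a stand-alone user-space sketch, not part of the patch; PAGE_ALLOC_COSTLY_ORDER = 3 and DEF_PRIORITY = 12 are assumptions matching the usual kernel values.

/* Stand-alone sketch of the lumpy_reclaim trigger above; not kernel code. */
#include <stdio.h>

#define PAGE_ALLOC_COSTLY_ORDER	3	/* assumed kernel value */
#define DEF_PRIORITY		12	/* assumed kernel value */

static int wants_lumpy(int order, int priority)
{
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		return 1;	/* large contiguous request: always lumpy */
	if (order && priority < DEF_PRIORITY - 2)
		return 1;	/* small request, but reclaim is struggling */
	return 0;
}

int main(void)
{
	/* order-0 never, order-9 always, order-2 only once priority drops: prints 0 1 0 1 */
	printf("%d %d %d %d\n", wants_lumpy(0, 5), wants_lumpy(9, DEF_PRIORITY),
	       wants_lumpy(2, DEF_PRIORITY), wants_lumpy(2, 9));
	return 0;
}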
@@ -1073,19 +1064,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		unsigned long nr_freed;
 		unsigned long nr_active;
 		unsigned int count[NR_LRU_LISTS] = { 0, };
-		int mode = ISOLATE_INACTIVE;
-
-		/*
-		 * If we need a large contiguous chunk of memory, or have
-		 * trouble getting a small set of contiguous pages, we
-		 * will reclaim both active and inactive pages.
-		 *
-		 * We use the same threshold as pageout congestion_wait below.
-		 */
-		if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
-			mode = ISOLATE_BOTH;
-		else if (sc->order && priority < DEF_PRIORITY - 2)
-			mode = ISOLATE_BOTH;
+		int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE;
 
 		nr_taken = sc->isolate_pages(sc->swap_cluster_max,
 			     &page_list, &nr_scan, sc->order, mode,
@@ -1122,7 +1101,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		 * but that should be acceptable to the caller
 		 */
 		if (nr_freed < nr_taken && !current_is_kswapd() &&
-			sc->order > PAGE_ALLOC_COSTLY_ORDER) {
+		    lumpy_reclaim) {
 			congestion_wait(WRITE, HZ/10);
 
 			/*
@@ -1217,18 +1196,54 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
  * But we had to alter page->flags anyway.
  */
 
+static void move_active_pages_to_lru(struct zone *zone,
+				     struct list_head *list,
+				     enum lru_list lru)
+{
+	unsigned long pgmoved = 0;
+	struct pagevec pvec;
+	struct page *page;
+
+	pagevec_init(&pvec, 1);
+
+	while (!list_empty(list)) {
+		page = lru_to_page(list);
+		prefetchw_prev_lru_page(page, list, flags);
+
+		VM_BUG_ON(PageLRU(page));
+		SetPageLRU(page);
+
+		VM_BUG_ON(!PageActive(page));
+		if (!is_active_lru(lru))
+			ClearPageActive(page);	/* we are de-activating */
+
+		list_move(&page->lru, &zone->lru[lru].list);
+		mem_cgroup_add_lru_list(page, lru);
+		pgmoved++;
+
+		if (!pagevec_add(&pvec, page) || list_empty(list)) {
+			spin_unlock_irq(&zone->lru_lock);
+			if (buffer_heads_over_limit)
+				pagevec_strip(&pvec);
+			__pagevec_release(&pvec);
+			spin_lock_irq(&zone->lru_lock);
+		}
+	}
+	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
+	if (!is_active_lru(lru))
+		__count_vm_events(PGDEACTIVATE, pgmoved);
+}
 
 static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 			struct scan_control *sc, int priority, int file)
 {
 	unsigned long pgmoved;
-	int pgdeactivate = 0;
 	unsigned long pgscanned;
+	unsigned long vm_flags;
 	LIST_HEAD(l_hold);	/* The pages which were snipped off */
+	LIST_HEAD(l_active);
 	LIST_HEAD(l_inactive);
 	struct page *page;
-	struct pagevec pvec;
-	enum lru_list lru;
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 
 	lru_add_drain();
@@ -1245,13 +1260,14 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	}
 	reclaim_stat->recent_scanned[!!file] += pgmoved;
 
+	__count_zone_vm_events(PGREFILL, zone, pgscanned);
 	if (file)
 		__mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
 	else
 		__mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
 	spin_unlock_irq(&zone->lru_lock);
 
-	pgmoved = 0;
+	pgmoved = 0;  /* count referenced (mapping) mapped pages */
 	while (!list_empty(&l_hold)) {
 		cond_resched();
 		page = lru_to_page(&l_hold);
@@ -1264,58 +1280,44 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 
 		/* page_referenced clears PageReferenced */
 		if (page_mapping_inuse(page) &&
-		    page_referenced(page, 0, sc->mem_cgroup))
+		    page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
 			pgmoved++;
+			/*
+			 * Identify referenced, file-backed active pages and
+			 * give them one more trip around the active list. So
+			 * that executable code get better chances to stay in
+			 * memory under moderate memory pressure.  Anon pages
+			 * are not likely to be evicted by use-once streaming
+			 * IO, plus JVM can create lots of anon VM_EXEC pages,
+			 * so we ignore them here.
+			 */
+			if ((vm_flags & VM_EXEC) && !PageAnon(page)) {
+				list_add(&page->lru, &l_active);
+				continue;
+			}
+		}
 
 		list_add(&page->lru, &l_inactive);
 	}
 
 	/*
-	 * Move the pages to the [file or anon] inactive list.
+	 * Move pages back to the lru list.
 	 */
-	pagevec_init(&pvec, 1);
-	lru = LRU_BASE + file * LRU_FILE;
-
 	spin_lock_irq(&zone->lru_lock);
 	/*
-	 * Count referenced pages from currently used mappings as
-	 * rotated, even though they are moved to the inactive list.
-	 * This helps balance scan pressure between file and anonymous
-	 * pages in get_scan_ratio.
+	 * Count referenced pages from currently used mappings as rotated,
+	 * even though only some of them are actually re-activated.  This
+	 * helps balance scan pressure between file and anonymous pages in
+	 * get_scan_ratio.
 	 */
 	reclaim_stat->recent_rotated[!!file] += pgmoved;
 
-	pgmoved = 0;
-	while (!list_empty(&l_inactive)) {
-		page = lru_to_page(&l_inactive);
-		prefetchw_prev_lru_page(page, &l_inactive, flags);
-		VM_BUG_ON(PageLRU(page));
-		SetPageLRU(page);
-		VM_BUG_ON(!PageActive(page));
-		ClearPageActive(page);
+	move_active_pages_to_lru(zone, &l_active,
+						LRU_ACTIVE + file * LRU_FILE);
+	move_active_pages_to_lru(zone, &l_inactive,
+						LRU_BASE + file * LRU_FILE);
 
-		list_move(&page->lru, &zone->lru[lru].list);
-		mem_cgroup_add_lru_list(page, lru);
-		pgmoved++;
-		if (!pagevec_add(&pvec, page)) {
-			__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
-			spin_unlock_irq(&zone->lru_lock);
-			pgdeactivate += pgmoved;
-			pgmoved = 0;
-			if (buffer_heads_over_limit)
-				pagevec_strip(&pvec);
-			__pagevec_release(&pvec);
-			spin_lock_irq(&zone->lru_lock);
-		}
-	}
-	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
-	pgdeactivate += pgmoved;
-	__count_zone_vm_events(PGREFILL, zone, pgscanned);
-	__count_vm_events(PGDEACTIVATE, pgdeactivate);
 	spin_unlock_irq(&zone->lru_lock);
-	if (buffer_heads_over_limit)
-		pagevec_strip(&pvec);
-	pagevec_release(&pvec);
 }
 
 static int inactive_anon_is_low_global(struct zone *zone)
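In concrete terms (an illustrative scenario, not from the changelog): a referenced text page of a mapped shared library keeps VM_EXEC set in the vm_flags reported back by page_referenced() and is parked on l_active for another round on the active list, while a referenced anonymous heap page, or a referenced file page mapped without VM_EXEC, still falls through to l_inactive as before.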
@@ -1350,12 +1352,48 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
 	return low;
 }
 
+static int inactive_file_is_low_global(struct zone *zone)
+{
+	unsigned long active, inactive;
+
+	active = zone_page_state(zone, NR_ACTIVE_FILE);
+	inactive = zone_page_state(zone, NR_INACTIVE_FILE);
+
+	return (active > inactive);
+}
+
+/**
+ * inactive_file_is_low - check if file pages need to be deactivated
+ * @zone: zone to check
+ * @sc:   scan control of this context
+ *
+ * When the system is doing streaming IO, memory pressure here
+ * ensures that active file pages get deactivated, until more
+ * than half of the file pages are on the inactive list.
+ *
+ * Once we get to that situation, protect the system's working
+ * set from being evicted by disabling active file page aging.
+ *
+ * This uses a different ratio than the anonymous pages, because
+ * the page cache uses a use-once replacement algorithm.
+ */
+static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
+{
+	int low;
+
+	if (scanning_global_lru(sc))
+		low = inactive_file_is_low_global(zone);
+	else
+		low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
+	return low;
+}
+
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 	struct zone *zone, struct scan_control *sc, int priority)
 {
 	int file = is_file_lru(lru);
 
-	if (lru == LRU_ACTIVE_FILE) {
+	if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) {
 		shrink_active_list(nr_to_scan, zone, sc, priority, file);
 		return 0;
 	}
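For a feel of the threshold (hypothetical numbers): with 300 MB of active file pages and 100 MB of inactive ones, inactive_file_is_low() is true and shrink_list() keeps deactivating active file pages; once the split reaches roughly 200 MB / 200 MB it turns false and LRU_ACTIVE_FILE deactivation stops, leaving the established working set alone, as the comment above describes.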
@@ -1384,13 +1422,6 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 	unsigned long ap, fp;
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 
-	/* If we have no swap space, do not bother scanning anon pages. */
-	if (!sc->may_swap || (nr_swap_pages <= 0)) {
-		percent[0] = 0;
-		percent[1] = 100;
-		return;
-	}
-
 	anon  = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
 		zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
 	file  = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
@@ -1400,7 +1431,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 	free  = zone_page_state(zone, NR_FREE_PAGES);
 	/* If we have very few page cache pages,
 	   force-scan anon pages. */
-	if (unlikely(file + free <= zone->pages_high)) {
+	if (unlikely(file + free <= high_wmark_pages(zone))) {
 		percent[0] = 100;
 		percent[1] = 0;
 		return;
@@ -1455,6 +1486,26 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 	percent[1] = 100 - percent[0];
 }
 
+/*
+ * Smallish @nr_to_scan's are deposited in @nr_saved_scan,
+ * until we collected @swap_cluster_max pages to scan.
+ */
+static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
+				       unsigned long *nr_saved_scan,
+				       unsigned long swap_cluster_max)
+{
+	unsigned long nr;
+
+	*nr_saved_scan += nr_to_scan;
+	nr = *nr_saved_scan;
+
+	if (nr >= swap_cluster_max)
+		*nr_saved_scan = 0;
+	else
+		nr = 0;
+
+	return nr;
+}
 
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
 
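To see the batching behaviour in isolation, here is a stand-alone user-space sketch of the function just added; the local copy of nr_scan_try_batch() and the SWAP_CLUSTER_MAX value of 32 are assumptions of this sketch, not part of the patch.

/* Stand-alone model of nr_scan_try_batch() above; not kernel code. */
#include <stdio.h>

static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
				       unsigned long *nr_saved_scan,
				       unsigned long swap_cluster_max)
{
	unsigned long nr;

	*nr_saved_scan += nr_to_scan;	/* bank the small request */
	nr = *nr_saved_scan;

	if (nr >= swap_cluster_max)
		*nr_saved_scan = 0;	/* big enough: hand it all out */
	else
		nr = 0;			/* keep saving, scan nothing yet */

	return nr;
}

int main(void)
{
	unsigned long saved = 0;
	unsigned long requests[] = { 10, 10, 10, 5 };	/* swap_cluster_max of 32 assumed */
	unsigned int i;

	for (i = 0; i < 4; i++)
		printf("request %lu -> scan %lu (saved %lu)\n",
		       requests[i], nr_scan_try_batch(requests[i], &saved, 32), saved);
	/* prints scan targets 0, 0, 0, then 35: small requests are batched up */
	return 0;
}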
@@ -1468,26 +1519,30 @@ static void shrink_zone(int priority, struct zone *zone,
 	enum lru_list l;
 	unsigned long nr_reclaimed = sc->nr_reclaimed;
 	unsigned long swap_cluster_max = sc->swap_cluster_max;
+	int noswap = 0;
 
-	get_scan_ratio(zone, sc, percent);
+	/* If we have no swap space, do not bother scanning anon pages. */
+	if (!sc->may_swap || (nr_swap_pages <= 0)) {
+		noswap = 1;
+		percent[0] = 0;
+		percent[1] = 100;
+	} else
+		get_scan_ratio(zone, sc, percent);
 
 	for_each_evictable_lru(l) {
 		int file = is_file_lru(l);
 		unsigned long scan;
 
 		scan = zone_nr_pages(zone, sc, l);
-		if (priority) {
+		if (priority || noswap) {
 			scan >>= priority;
 			scan = (scan * percent[file]) / 100;
 		}
-		if (scanning_global_lru(sc)) {
-			zone->lru[l].nr_scan += scan;
-			nr[l] = zone->lru[l].nr_scan;
-			if (nr[l] >= swap_cluster_max)
-				zone->lru[l].nr_scan = 0;
-			else
-				nr[l] = 0;
-		} else
+		if (scanning_global_lru(sc))
+			nr[l] = nr_scan_try_batch(scan,
+						  &zone->lru[l].nr_saved_scan,
+						  swap_cluster_max);
+		else
 			nr[l] = scan;
 	}
 
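For scale (illustrative numbers only): with 1,048,576 pages on a file LRU, priority at DEF_PRIORITY (12) and percent[file] = 60, each pass asks for (1048576 >> 12) * 60 / 100 = 153 pages; with the batching above, such sub-swap_cluster_max requests accumulate in zone->lru[l].nr_saved_scan and are only handed to shrink_list() once they reach swap_cluster_max. The new "priority || noswap" test keeps the percentages applied even at priority 0 when there is no swap, so the anon lists still end up with a scan target of 0.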
@@ -1521,7 +1576,7 @@ static void shrink_zone(int priority, struct zone *zone,
 	 * Even if we did not try to evict anon pages at all, we want to
 	 * rebalance the anon lru active/inactive ratio.
 	 */
-	if (inactive_anon_is_low(zone, sc))
+	if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0)
 		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
 
 	throttle_vm_writeout(sc->gfp_mask);
@@ -1532,11 +1587,13 @@ static void shrink_zone(int priority, struct zone *zone,
  * try to reclaim pages from zones which will satisfy the caller's allocation
  * request.
  *
- * We reclaim from a zone even if that zone is over pages_high. Because:
+ * We reclaim from a zone even if that zone is over high_wmark_pages(zone).
+ * Because:
  * a) The caller may be trying to free *extra* pages to satisfy a higher-order
  *    allocation or
- * b) The zones may be over pages_high but they must go *over* pages_high to
- *    satisfy the `incremental min' zone defense algorithm.
+ * b) The target zone may be at high_wmark_pages(zone) but the lower zones
+ *    must go *over* high_wmark_pages(zone) to satisfy the `incremental min'
+ *    zone defense algorithm.
  *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
@@ -1742,7 +1799,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 
 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
- * they are all at pages_high.
+ * they are all at high_wmark_pages(zone).
  *
  * Returns the number of pages which were actually freed.
  *
@@ -1755,11 +1812,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
  * the zone for when the problem goes away.
  *
  * kswapd scans the zones in the highmem->normal->dma direction.  It skips
- * zones which have free_pages > pages_high, but once a zone is found to have
- * free_pages <= pages_high, we scan that zone and the lower zones regardless
- * of the number of free pages in the lower zones. This interoperates with
- * the page allocator fallback scheme to ensure that aging of pages is balanced
- * across the zones.
+ * zones which have free_pages > high_wmark_pages(zone), but once a zone is
+ * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the
+ * lower zones regardless of the number of free pages in the lower zones. This
+ * interoperates with the page allocator fallback scheme to ensure that aging
+ * of pages is balanced across the zones.
  */
 static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 {
@@ -1780,7 +1837,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
 	};
 	/*
 	 * temp_priority is used to remember the scanning priority at which
-	 * this zone was successfully refilled to free_pages == pages_high.
+	 * this zone was successfully refilled to
+	 * free_pages == high_wmark_pages(zone).
 	 */
 	int temp_priority[MAX_NR_ZONES];
 
@@ -1825,8 +1883,8 @@ loop_again:
 				shrink_active_list(SWAP_CLUSTER_MAX, zone,
 							&sc, priority, 0);
 
-			if (!zone_watermark_ok(zone, order, zone->pages_high,
-					       0, 0)) {
+			if (!zone_watermark_ok(zone, order,
+					high_wmark_pages(zone), 0, 0)) {
 				end_zone = i;
 				break;
 			}
@@ -1860,8 +1918,8 @@ loop_again:
 			    priority != DEF_PRIORITY)
 				continue;
 
-			if (!zone_watermark_ok(zone, order, zone->pages_high,
-					       end_zone, 0))
+			if (!zone_watermark_ok(zone, order,
+					high_wmark_pages(zone), end_zone, 0))
 				all_zones_ok = 0;
 			temp_priority[i] = priority;
 			sc.nr_scanned = 0;
@@ -1870,8 +1928,8 @@ loop_again:
 			 * We put equal pressure on every zone, unless one
 			 * zone has way too many pages free already.
 			 */
-			if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
-						end_zone, 0))
+			if (!zone_watermark_ok(zone, order,
+					8*high_wmark_pages(zone), end_zone, 0))
 				shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
@@ -2037,7 +2095,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 		return;
 
 	pgdat = zone->zone_pgdat;
-	if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
+	if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
@@ -2084,11 +2142,11 @@ static void shrink_all_zones(unsigned long nr_pages, int prio,
 						l == LRU_ACTIVE_FILE))
 				continue;
 
-			zone->lru[l].nr_scan += (lru_pages >> prio) + 1;
-			if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
+			zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1;
+			if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) {
 				unsigned long nr_to_scan;
 
-				zone->lru[l].nr_scan = 0;
+				zone->lru[l].nr_saved_scan = 0;
 				nr_to_scan = min(nr_pages, lru_pages);
 				nr_reclaimed += shrink_list(l, nr_to_scan, zone,
 								sc, prio);
@@ -2290,6 +2348,48 @@ int sysctl_min_unmapped_ratio = 1;
  */
 int sysctl_min_slab_ratio = 5;
 
+static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
+{
+	unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
+	unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
+		zone_page_state(zone, NR_ACTIVE_FILE);
+
+	/*
+	 * It's possible for there to be more file mapped pages than
+	 * accounted for by the pages on the file LRU lists because
+	 * tmpfs pages accounted for as ANON can also be FILE_MAPPED
+	 */
+	return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
+}
+
+/* Work out how many page cache pages we can reclaim in this reclaim_mode */
+static long zone_pagecache_reclaimable(struct zone *zone)
+{
+	long nr_pagecache_reclaimable;
+	long delta = 0;
+
+	/*
+	 * If RECLAIM_SWAP is set, then all file pages are considered
+	 * potentially reclaimable. Otherwise, we have to worry about
+	 * pages like swapcache and zone_unmapped_file_pages() provides
+	 * a better estimate
+	 */
+	if (zone_reclaim_mode & RECLAIM_SWAP)
+		nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
+	else
+		nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
+
+	/* If we can't clean pages, remove dirty pages from consideration */
+	if (!(zone_reclaim_mode & RECLAIM_WRITE))
+		delta += zone_page_state(zone, NR_FILE_DIRTY);
+
+	/* Watch for any possible underflows due to delta */
+	if (unlikely(delta > nr_pagecache_reclaimable))
+		delta = nr_pagecache_reclaimable;
+
+	return nr_pagecache_reclaimable - delta;
+}
+
 /*
  * Try to free up some pages from this zone through reclaim.
  */
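Worked example with hypothetical counters: with RECLAIM_SWAP and RECLAIM_WRITE both clear, 10,000 pages on the file LRUs of which 6,000 are mapped and 3,500 are dirty, the estimate is (10,000 - 6,000) - 3,500 = 500 reclaimable pages; were 4,500 pages dirty instead, the underflow check clamps delta to 4,000 and the estimate bottoms out at 0 rather than going negative.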
@@ -2324,9 +2424,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	if (zone_page_state(zone, NR_FILE_PAGES) -
-		zone_page_state(zone, NR_FILE_MAPPED) >
-		zone->min_unmapped_pages) {
+	if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
 		/*
 		 * Free memory by calling shrink zone with increasing
 		 * priorities until we have enough memory freed.
@@ -2384,20 +2482,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * if less than a specified percentage of the zone is used by
 	 * unmapped file backed pages.
 	 */
-	if (zone_page_state(zone, NR_FILE_PAGES) -
-	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
-	    && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
-			<= zone->min_slab_pages)
-		return 0;
+	if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
+	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
+		return ZONE_RECLAIM_FULL;
 
 	if (zone_is_all_unreclaimable(zone))
-		return 0;
+		return ZONE_RECLAIM_FULL;
 
 	/*
 	 * Do not scan if the allocation should not be delayed.
 	 */
 	if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
-		return 0;
+		return ZONE_RECLAIM_NOSCAN;
 
 	/*
 	 * Only run zone reclaim on the local zone or on zones that do not
@@ -2407,18 +2503,21 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 */
 	node_id = zone_to_nid(zone);
 	if (node_state(node_id, N_CPU) && node_id != numa_node_id())
-		return 0;
+		return ZONE_RECLAIM_NOSCAN;
 
 	if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
-		return 0;
+		return ZONE_RECLAIM_NOSCAN;
+
 	ret = __zone_reclaim(zone, gfp_mask, order);
 	zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
 
+	if (!ret)
+		count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
+
 	return ret;
 }
 #endif
 
-#ifdef CONFIG_UNEVICTABLE_LRU
 /*
  * page_evictable - test whether a page is evictable
  * @page: the page to test
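The symbolic returns let the caller distinguish "zone_reclaim() decided not to scan" (ZONE_RECLAIM_NOSCAN) from "it scanned and found nothing" (ZONE_RECLAIM_FULL), and the new PGSCAN_ZONE_RECLAIM_FAILED vmstat event counts only genuine failures; the constants and the event are presumably introduced elsewhere in this series, as they are not defined in this file.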
@@ -2665,4 +2764,3 @@ void scan_unevictable_unregister_node(struct node *node)
 	sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
 }
 
-#endif