Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c | 396
1 file changed, 197 insertions, 199 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b7719ec10dc5..a1893c050795 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -105,7 +105,6 @@ struct scan_control {
105 105
106 /* Which cgroup do we reclaim from */ 106 /* Which cgroup do we reclaim from */
107 struct mem_cgroup *mem_cgroup; 107 struct mem_cgroup *mem_cgroup;
108 struct memcg_scanrecord *memcg_record;
109 108
110 /* 109 /*
111 * Nodemask of nodes allowed by the caller. If NULL, all nodes 110 * Nodemask of nodes allowed by the caller. If NULL, all nodes
@@ -496,15 +495,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
496 return PAGE_ACTIVATE; 495 return PAGE_ACTIVATE;
497 } 496 }
498 497
499 /*
500 * Wait on writeback if requested to. This happens when
501 * direct reclaiming a large contiguous area and the
502 * first attempt to free a range of pages fails.
503 */
504 if (PageWriteback(page) &&
505 (sc->reclaim_mode & RECLAIM_MODE_SYNC))
506 wait_on_page_writeback(page);
507
508 if (!PageWriteback(page)) { 498 if (!PageWriteback(page)) {
509 /* synchronous write or broken a_ops? */ 499 /* synchronous write or broken a_ops? */
510 ClearPageReclaim(page); 500 ClearPageReclaim(page);
@@ -643,13 +633,14 @@ redo:
643 lru = LRU_UNEVICTABLE; 633 lru = LRU_UNEVICTABLE;
644 add_page_to_unevictable_list(page); 634 add_page_to_unevictable_list(page);
645 /* 635 /*
646 * When racing with an mlock clearing (page is 636 * When racing with an mlock or AS_UNEVICTABLE clearing
647 * unlocked), make sure that if the other thread does 637 * (page is unlocked) make sure that if the other thread
648 * not observe our setting of PG_lru and fails 638 * does not observe our setting of PG_lru and fails
649 * isolation, we see PG_mlocked cleared below and move 639 * isolation/check_move_unevictable_page,
640 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
650 * the page back to the evictable list. 641 * the page back to the evictable list.
651 * 642 *
652 * The other side is TestClearPageMlocked(). 643 * The other side is TestClearPageMlocked() or shmem_lock().
653 */ 644 */
654 smp_mb(); 645 smp_mb();
655 } 646 }
@@ -760,7 +751,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
760 */ 751 */
761static unsigned long shrink_page_list(struct list_head *page_list, 752static unsigned long shrink_page_list(struct list_head *page_list,
762 struct zone *zone, 753 struct zone *zone,
763 struct scan_control *sc) 754 struct scan_control *sc,
755 int priority,
756 unsigned long *ret_nr_dirty,
757 unsigned long *ret_nr_writeback)
764{ 758{
765 LIST_HEAD(ret_pages); 759 LIST_HEAD(ret_pages);
766 LIST_HEAD(free_pages); 760 LIST_HEAD(free_pages);
@@ -768,6 +762,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
768 unsigned long nr_dirty = 0; 762 unsigned long nr_dirty = 0;
769 unsigned long nr_congested = 0; 763 unsigned long nr_congested = 0;
770 unsigned long nr_reclaimed = 0; 764 unsigned long nr_reclaimed = 0;
765 unsigned long nr_writeback = 0;
771 766
772 cond_resched(); 767 cond_resched();
773 768
@@ -804,13 +799,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
804 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); 799 (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
805 800
806 if (PageWriteback(page)) { 801 if (PageWriteback(page)) {
802 nr_writeback++;
807 /* 803 /*
808 * Synchronous reclaim is performed in two passes, 804 * Synchronous reclaim cannot queue pages for
809 * first an asynchronous pass over the list to 805 * writeback due to the possibility of stack overflow
810 * start parallel writeback, and a second synchronous 806 * but if it encounters a page under writeback, wait
811 * pass to wait for the IO to complete. Wait here 807 * for the IO to complete.
812 * for any page for which writeback has already
813 * started.
814 */ 808 */
815 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && 809 if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
816 may_enter_fs) 810 may_enter_fs)
@@ -866,6 +860,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
866 if (PageDirty(page)) { 860 if (PageDirty(page)) {
867 nr_dirty++; 861 nr_dirty++;
868 862
863 /*
864 * Only kswapd can write back filesystem pages to
865 * avoid risk of stack overflow but do not write back
866 * unless under significant pressure.
867 */
868 if (page_is_file_cache(page) &&
869 (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
870 /*
871 * Immediately reclaim when written back.
872 * Similar in principle to deactivate_page()
873 * except we already have the page isolated
874 * and know it's dirty
875 */
876 inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
877 SetPageReclaim(page);
878
879 goto keep_locked;
880 }
881
869 if (references == PAGEREF_RECLAIM_CLEAN) 882 if (references == PAGEREF_RECLAIM_CLEAN)
870 goto keep_locked; 883 goto keep_locked;
871 if (!may_enter_fs) 884 if (!may_enter_fs)
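
The hunk above means dirty file pages are no longer written back from direct reclaim at all, and kswapd only starts doing so once the scan priority has dropped below DEF_PRIORITY - 2. A minimal restating sketch of that gate (the helper name is invented for illustration; the identifiers it uses are the ones visible in the hunk):

    /*
     * Restatement of the new check as a hypothetical helper: true when
     * shrink_page_list() may start writeback on this page.
     */
    static bool may_writeback_page(struct page *page, int priority)
    {
    	if (!page_is_file_cache(page))
    		return true;		/* anon/swapcache writeback is unaffected */
    	if (!current_is_kswapd())
    		return false;		/* direct reclaim never writes file pages */
    	return priority < DEF_PRIORITY - 2;
    }
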
@@ -1000,6 +1013,8 @@ keep_lumpy:
1000 1013
1001 list_splice(&ret_pages, page_list); 1014 list_splice(&ret_pages, page_list);
1002 count_vm_events(PGACTIVATE, pgactivate); 1015 count_vm_events(PGACTIVATE, pgactivate);
1016 *ret_nr_dirty += nr_dirty;
1017 *ret_nr_writeback += nr_writeback;
1003 return nr_reclaimed; 1018 return nr_reclaimed;
1004} 1019}
1005 1020
@@ -1013,23 +1028,27 @@ keep_lumpy:
1013 * 1028 *
1014 * returns 0 on success, -ve errno on failure. 1029 * returns 0 on success, -ve errno on failure.
1015 */ 1030 */
1016int __isolate_lru_page(struct page *page, int mode, int file) 1031int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
1017{ 1032{
1033 bool all_lru_mode;
1018 int ret = -EINVAL; 1034 int ret = -EINVAL;
1019 1035
1020 /* Only take pages on the LRU. */ 1036 /* Only take pages on the LRU. */
1021 if (!PageLRU(page)) 1037 if (!PageLRU(page))
1022 return ret; 1038 return ret;
1023 1039
1040 all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
1041 (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
1042
1024 /* 1043 /*
1025 * When checking the active state, we need to be sure we are 1044 * When checking the active state, we need to be sure we are
1026 * dealing with comparable boolean values. Take the logical not 1045
1027 * of each. 1046 * of each.
1028 */ 1047 */
1029 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 1048 if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
1030 return ret; 1049 return ret;
1031 1050
1032 if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) 1051 if (!all_lru_mode && !!page_is_file_cache(page) != file)
1033 return ret; 1052 return ret;
1034 1053
1035 /* 1054 /*
@@ -1042,6 +1061,12 @@ int __isolate_lru_page(struct page *page, int mode, int file)
1042 1061
1043 ret = -EBUSY; 1062 ret = -EBUSY;
1044 1063
1064 if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page)))
1065 return ret;
1066
1067 if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1068 return ret;
1069
1045 if (likely(get_page_unless_zero(page))) { 1070 if (likely(get_page_unless_zero(page))) {
1046 /* 1071 /*
1047 * Be careful not to clear PageLRU until after we're 1072 * Be careful not to clear PageLRU until after we're
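
The isolate_mode_t conversion above turns the old ISOLATE_INACTIVE/ISOLATE_ACTIVE/ISOLATE_BOTH enum into a bitmask, with ISOLATE_CLEAN and ISOLATE_UNMAPPED pushing the sc->may_writepage and sc->may_unmap policies down into __isolate_lru_page(). A sketch of how a caller composes the mode, mirroring the shrink_inactive_list() changes later in this patch:

    	isolate_mode_t mode = ISOLATE_INACTIVE;

    	if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
    		mode |= ISOLATE_ACTIVE;		/* old ISOLATE_BOTH: take either LRU */
    	if (!sc->may_unmap)
    		mode |= ISOLATE_UNMAPPED;	/* skip mapped pages */
    	if (!sc->may_writepage)
    		mode |= ISOLATE_CLEAN;		/* skip dirty and writeback pages */
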
@@ -1077,7 +1102,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
1077 */ 1102 */
1078static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1103static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1079 struct list_head *src, struct list_head *dst, 1104 struct list_head *src, struct list_head *dst,
1080 unsigned long *scanned, int order, int mode, int file) 1105 unsigned long *scanned, int order, isolate_mode_t mode,
1106 int file)
1081{ 1107{
1082 unsigned long nr_taken = 0; 1108 unsigned long nr_taken = 0;
1083 unsigned long nr_lumpy_taken = 0; 1109 unsigned long nr_lumpy_taken = 0;
@@ -1202,8 +1228,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1202static unsigned long isolate_pages_global(unsigned long nr, 1228static unsigned long isolate_pages_global(unsigned long nr,
1203 struct list_head *dst, 1229 struct list_head *dst,
1204 unsigned long *scanned, int order, 1230 unsigned long *scanned, int order,
1205 int mode, struct zone *z, 1231 isolate_mode_t mode,
1206 int active, int file) 1232 struct zone *z, int active, int file)
1207{ 1233{
1208 int lru = LRU_BASE; 1234 int lru = LRU_BASE;
1209 if (active) 1235 if (active)
@@ -1349,8 +1375,6 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc,
1349 int file = is_file_lru(lru); 1375 int file = is_file_lru(lru);
1350 int numpages = hpage_nr_pages(page); 1376 int numpages = hpage_nr_pages(page);
1351 reclaim_stat->recent_rotated[file] += numpages; 1377 reclaim_stat->recent_rotated[file] += numpages;
1352 if (!scanning_global_lru(sc))
1353 sc->memcg_record->nr_rotated[file] += numpages;
1354 } 1378 }
1355 if (!pagevec_add(&pvec, page)) { 1379 if (!pagevec_add(&pvec, page)) {
1356 spin_unlock_irq(&zone->lru_lock); 1380 spin_unlock_irq(&zone->lru_lock);
@@ -1394,14 +1418,10 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
1394 1418
1395 reclaim_stat->recent_scanned[0] += *nr_anon; 1419 reclaim_stat->recent_scanned[0] += *nr_anon;
1396 reclaim_stat->recent_scanned[1] += *nr_file; 1420 reclaim_stat->recent_scanned[1] += *nr_file;
1397 if (!scanning_global_lru(sc)) {
1398 sc->memcg_record->nr_scanned[0] += *nr_anon;
1399 sc->memcg_record->nr_scanned[1] += *nr_file;
1400 }
1401} 1421}
1402 1422
1403/* 1423/*
1404 * Returns true if the caller should wait to clean dirty/writeback pages. 1424 * Returns true if a direct reclaim should wait on pages under writeback.
1405 * 1425 *
1406 * If we are direct reclaiming for contiguous pages and we do not reclaim 1426 * If we are direct reclaiming for contiguous pages and we do not reclaim
1407 * everything in the list, try again and wait for writeback IO to complete. 1427 * everything in the list, try again and wait for writeback IO to complete.
@@ -1423,7 +1443,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
1423 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) 1443 if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
1424 return false; 1444 return false;
1425 1445
1426 /* If we have relaimed everything on the isolated list, no stall */ 1446 /* If we have reclaimed everything on the isolated list, no stall */
1427 if (nr_freed == nr_taken) 1447 if (nr_freed == nr_taken)
1428 return false; 1448 return false;
1429 1449
@@ -1455,6 +1475,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1455 unsigned long nr_taken; 1475 unsigned long nr_taken;
1456 unsigned long nr_anon; 1476 unsigned long nr_anon;
1457 unsigned long nr_file; 1477 unsigned long nr_file;
1478 unsigned long nr_dirty = 0;
1479 unsigned long nr_writeback = 0;
1480 isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
1458 1481
1459 while (unlikely(too_many_isolated(zone, file, sc))) { 1482 while (unlikely(too_many_isolated(zone, file, sc))) {
1460 congestion_wait(BLK_RW_ASYNC, HZ/10); 1483 congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1465,15 +1488,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1465 } 1488 }
1466 1489
1467 set_reclaim_mode(priority, sc, false); 1490 set_reclaim_mode(priority, sc, false);
1491 if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
1492 reclaim_mode |= ISOLATE_ACTIVE;
1493
1468 lru_add_drain(); 1494 lru_add_drain();
1495
1496 if (!sc->may_unmap)
1497 reclaim_mode |= ISOLATE_UNMAPPED;
1498 if (!sc->may_writepage)
1499 reclaim_mode |= ISOLATE_CLEAN;
1500
1469 spin_lock_irq(&zone->lru_lock); 1501 spin_lock_irq(&zone->lru_lock);
1470 1502
1471 if (scanning_global_lru(sc)) { 1503 if (scanning_global_lru(sc)) {
1472 nr_taken = isolate_pages_global(nr_to_scan, 1504 nr_taken = isolate_pages_global(nr_to_scan, &page_list,
1473 &page_list, &nr_scanned, sc->order, 1505 &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
1474 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
1475 ISOLATE_BOTH : ISOLATE_INACTIVE,
1476 zone, 0, file);
1477 zone->pages_scanned += nr_scanned; 1506 zone->pages_scanned += nr_scanned;
1478 if (current_is_kswapd()) 1507 if (current_is_kswapd())
1479 __count_zone_vm_events(PGSCAN_KSWAPD, zone, 1508 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1482,12 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1482 __count_zone_vm_events(PGSCAN_DIRECT, zone, 1511 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1483 nr_scanned); 1512 nr_scanned);
1484 } else { 1513 } else {
1485 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, 1514 nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
1486 &page_list, &nr_scanned, sc->order, 1515 &nr_scanned, sc->order, reclaim_mode, zone,
1487 sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? 1516 sc->mem_cgroup, 0, file);
1488 ISOLATE_BOTH : ISOLATE_INACTIVE,
1489 zone, sc->mem_cgroup,
1490 0, file);
1491 /* 1517 /*
1492 * mem_cgroup_isolate_pages() keeps track of 1518 * mem_cgroup_isolate_pages() keeps track of
1493 * scanned pages on its own. 1519 * scanned pages on its own.
@@ -1503,17 +1529,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1503 1529
1504 spin_unlock_irq(&zone->lru_lock); 1530 spin_unlock_irq(&zone->lru_lock);
1505 1531
1506 nr_reclaimed = shrink_page_list(&page_list, zone, sc); 1532 nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
1533 &nr_dirty, &nr_writeback);
1507 1534
1508 /* Check if we should synchronously wait for writeback */ 1535
1509 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { 1536 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1510 set_reclaim_mode(priority, sc, true); 1537 set_reclaim_mode(priority, sc, true);
1511 nr_reclaimed += shrink_page_list(&page_list, zone, sc); 1538 nr_reclaimed += shrink_page_list(&page_list, zone, sc,
1539 priority, &nr_dirty, &nr_writeback);
1512 } 1540 }
1513 1541
1514 if (!scanning_global_lru(sc))
1515 sc->memcg_record->nr_freed[file] += nr_reclaimed;
1516
1517 local_irq_disable(); 1542 local_irq_disable();
1518 if (current_is_kswapd()) 1543 if (current_is_kswapd())
1519 __count_vm_events(KSWAPD_STEAL, nr_reclaimed); 1544 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
@@ -1521,6 +1546,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1521 1546
1522 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); 1547 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1523 1548
1549 /*
1550 * If reclaim is isolating dirty pages under writeback, it implies
1551 * that the long-lived page allocation rate is exceeding the page
1552 * laundering rate. Either the global limits are not being effective
1553 * at throttling processes due to the page distribution throughout
1554 * zones or there is heavy usage of a slow backing device. The
1555 * only option is to throttle from reclaim context which is not ideal
1556 * as there is no guarantee the dirtying process is throttled in the
1557 * same way that balance_dirty_pages() does.
1558 *
1559 * This scales the number of dirty pages that must be under writeback
1560 * before throttling depending on priority. It is a simple backoff
1561 * function that has the most effect in the range DEF_PRIORITY to
1562 * DEF_PRIORITY-2, which is the range in which reclaim is
1563 * considered to be in trouble.
1564 *
1565 * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle
1566 * DEF_PRIORITY-1 50% must be PageWriteback
1567 * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble
1568 * ...
1569 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
1570 * isolated page is PageWriteback
1571 */
1572 if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
1573 wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
1574
1524 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, 1575 trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
1525 zone_idx(zone), 1576 zone_idx(zone),
1526 nr_scanned, nr_reclaimed, 1577 nr_scanned, nr_reclaimed,
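
A worked example of the backoff described in the comment above, assuming DEF_PRIORITY is 12 and SWAP_CLUSTER_MAX (32) pages were isolated; this is plain userspace C just to show the arithmetic:

    #include <stdio.h>

    int main(void)
    {
    	const unsigned long nr_taken = 32;	/* SWAP_CLUSTER_MAX isolated pages */
    	const int def_priority = 12;		/* DEF_PRIORITY */

    	/* throttle once nr_writeback >= nr_taken >> (DEF_PRIORITY - priority) */
    	for (int priority = def_priority; priority >= def_priority - 6; priority--)
    		printf("priority %2d: threshold %2lu pages under writeback\n",
    		       priority, nr_taken >> (def_priority - priority));
    	return 0;
    }

This prints 32, 16, 8, ... 0, matching the 100%/50%/25% table in the comment.
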
@@ -1592,19 +1643,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1592 struct page *page; 1643 struct page *page;
1593 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1644 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1594 unsigned long nr_rotated = 0; 1645 unsigned long nr_rotated = 0;
1646 isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
1595 1647
1596 lru_add_drain(); 1648 lru_add_drain();
1649
1650 if (!sc->may_unmap)
1651 reclaim_mode |= ISOLATE_UNMAPPED;
1652 if (!sc->may_writepage)
1653 reclaim_mode |= ISOLATE_CLEAN;
1654
1597 spin_lock_irq(&zone->lru_lock); 1655 spin_lock_irq(&zone->lru_lock);
1598 if (scanning_global_lru(sc)) { 1656 if (scanning_global_lru(sc)) {
1599 nr_taken = isolate_pages_global(nr_pages, &l_hold, 1657 nr_taken = isolate_pages_global(nr_pages, &l_hold,
1600 &pgscanned, sc->order, 1658 &pgscanned, sc->order,
1601 ISOLATE_ACTIVE, zone, 1659 reclaim_mode, zone,
1602 1, file); 1660 1, file);
1603 zone->pages_scanned += pgscanned; 1661 zone->pages_scanned += pgscanned;
1604 } else { 1662 } else {
1605 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, 1663 nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
1606 &pgscanned, sc->order, 1664 &pgscanned, sc->order,
1607 ISOLATE_ACTIVE, zone, 1665 reclaim_mode, zone,
1608 sc->mem_cgroup, 1, file); 1666 sc->mem_cgroup, 1, file);
1609 /* 1667 /*
1610 * mem_cgroup_isolate_pages() keeps track of 1668 * mem_cgroup_isolate_pages() keeps track of
@@ -1613,8 +1671,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1613 } 1671 }
1614 1672
1615 reclaim_stat->recent_scanned[file] += nr_taken; 1673 reclaim_stat->recent_scanned[file] += nr_taken;
1616 if (!scanning_global_lru(sc))
1617 sc->memcg_record->nr_scanned[file] += nr_taken;
1618 1674
1619 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1675 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1620 if (file) 1676 if (file)
@@ -1666,8 +1722,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1666 * get_scan_ratio. 1722 * get_scan_ratio.
1667 */ 1723 */
1668 reclaim_stat->recent_rotated[file] += nr_rotated; 1724 reclaim_stat->recent_rotated[file] += nr_rotated;
1669 if (!scanning_global_lru(sc))
1670 sc->memcg_record->nr_rotated[file] += nr_rotated;
1671 1725
1672 move_active_pages_to_lru(zone, &l_active, 1726 move_active_pages_to_lru(zone, &l_active,
1673 LRU_ACTIVE + file * LRU_FILE); 1727 LRU_ACTIVE + file * LRU_FILE);
@@ -1713,7 +1767,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1713 if (scanning_global_lru(sc)) 1767 if (scanning_global_lru(sc))
1714 low = inactive_anon_is_low_global(zone); 1768 low = inactive_anon_is_low_global(zone);
1715 else 1769 else
1716 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); 1770 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
1717 return low; 1771 return low;
1718} 1772}
1719#else 1773#else
@@ -1756,7 +1810,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
1756 if (scanning_global_lru(sc)) 1810 if (scanning_global_lru(sc))
1757 low = inactive_file_is_low_global(zone); 1811 low = inactive_file_is_low_global(zone);
1758 else 1812 else
1759 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); 1813 low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
1760 return low; 1814 return low;
1761} 1815}
1762 1816
@@ -1808,23 +1862,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1808 u64 fraction[2], denominator; 1862 u64 fraction[2], denominator;
1809 enum lru_list l; 1863 enum lru_list l;
1810 int noswap = 0; 1864 int noswap = 0;
1811 int force_scan = 0; 1865 bool force_scan = false;
1812 unsigned long nr_force_scan[2];
1813
1814
1815 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1816 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1817 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1818 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1819 1866
1820 if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { 1867 /*
1821 /* kswapd does zone balancing and need to scan this zone */ 1868 * If the zone or memcg is small, nr[l] can be 0. This
1822 if (scanning_global_lru(sc) && current_is_kswapd()) 1869 * results in no scanning on this priority and a potential
1823 force_scan = 1; 1870 * priority drop. Global direct reclaim can go to the next
1824 /* memcg may have small limit and need to avoid priority drop */ 1871 * zone and tends to have no problems. Global kswapd is for
1825 if (!scanning_global_lru(sc)) 1872 * zone balancing and it needs to scan a minimum amount. When
1826 force_scan = 1; 1873 * reclaiming for a memcg, a priority drop can cause high
1827 } 1874 * latencies, so it's better to scan a minimum amount there as
1875 * well.
1876 */
1877 if (scanning_global_lru(sc) && current_is_kswapd())
1878 force_scan = true;
1879 if (!scanning_global_lru(sc))
1880 force_scan = true;
1828 1881
1829 /* If we have no swap space, do not bother scanning anon pages. */ 1882 /* If we have no swap space, do not bother scanning anon pages. */
1830 if (!sc->may_swap || (nr_swap_pages <= 0)) { 1883 if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1832,11 +1885,14 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1832 fraction[0] = 0; 1885 fraction[0] = 0;
1833 fraction[1] = 1; 1886 fraction[1] = 1;
1834 denominator = 1; 1887 denominator = 1;
1835 nr_force_scan[0] = 0;
1836 nr_force_scan[1] = SWAP_CLUSTER_MAX;
1837 goto out; 1888 goto out;
1838 } 1889 }
1839 1890
1891 anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
1892 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
1893 file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
1894 zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
1895
1840 if (scanning_global_lru(sc)) { 1896 if (scanning_global_lru(sc)) {
1841 free = zone_page_state(zone, NR_FREE_PAGES); 1897 free = zone_page_state(zone, NR_FREE_PAGES);
1842 /* If we have very few page cache pages, 1898 /* If we have very few page cache pages,
@@ -1845,8 +1901,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1845 fraction[0] = 1; 1901 fraction[0] = 1;
1846 fraction[1] = 0; 1902 fraction[1] = 0;
1847 denominator = 1; 1903 denominator = 1;
1848 nr_force_scan[0] = SWAP_CLUSTER_MAX;
1849 nr_force_scan[1] = 0;
1850 goto out; 1904 goto out;
1851 } 1905 }
1852 } 1906 }
@@ -1895,11 +1949,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1895 fraction[0] = ap; 1949 fraction[0] = ap;
1896 fraction[1] = fp; 1950 fraction[1] = fp;
1897 denominator = ap + fp + 1; 1951 denominator = ap + fp + 1;
1898 if (force_scan) {
1899 unsigned long scan = SWAP_CLUSTER_MAX;
1900 nr_force_scan[0] = div64_u64(scan * ap, denominator);
1901 nr_force_scan[1] = div64_u64(scan * fp, denominator);
1902 }
1903out: 1952out:
1904 for_each_evictable_lru(l) { 1953 for_each_evictable_lru(l) {
1905 int file = is_file_lru(l); 1954 int file = is_file_lru(l);
@@ -1908,20 +1957,10 @@ out:
1908 scan = zone_nr_lru_pages(zone, sc, l); 1957 scan = zone_nr_lru_pages(zone, sc, l);
1909 if (priority || noswap) { 1958 if (priority || noswap) {
1910 scan >>= priority; 1959 scan >>= priority;
1960 if (!scan && force_scan)
1961 scan = SWAP_CLUSTER_MAX;
1911 scan = div64_u64(scan * fraction[file], denominator); 1962 scan = div64_u64(scan * fraction[file], denominator);
1912 } 1963 }
1913
1914 /*
1915 * If zone is small or memcg is small, nr[l] can be 0.
1916 * This results no-scan on this priority and priority drop down.
1917 * For global direct reclaim, it can visit next zone and tend
1918 * not to have problems. For global kswapd, it's for zone
1919 * balancing and it need to scan a small amounts. When using
1920 * memcg, priority drop can cause big latency. So, it's better
1921 * to scan small amount. See may_noscan above.
1922 */
1923 if (!scan && force_scan)
1924 scan = nr_force_scan[file];
1925 nr[l] = scan; 1964 nr[l] = scan;
1926 } 1965 }
1927} 1966}
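
The simplification above drops the precomputed nr_force_scan[] array and instead bumps the per-LRU scan target to SWAP_CLUSTER_MAX before the anon/file fraction is applied. A small worked example (userspace C; the LRU size and fraction are made-up illustration values, while 32 and 12 are the kernel's SWAP_CLUSTER_MAX and DEF_PRIORITY):

    #include <stdio.h>

    int main(void)
    {
    	unsigned long scan = 100;	/* a small memcg: zone_nr_lru_pages() for one lru */
    	int priority = 12;		/* DEF_PRIORITY */
    	unsigned long fraction = 3, denominator = 4;	/* as computed in get_scan_count() */

    	scan >>= priority;		/* 100 >> 12 == 0: lru would not be scanned */
    	if (!scan)			/* the force_scan case */
    		scan = 32;		/* SWAP_CLUSTER_MAX */
    	scan = scan * fraction / denominator;
    	printf("nr[l] = %lu\n", scan);	/* 24 pages, instead of 0 and a priority drop */
    	return 0;
    }
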
@@ -2000,12 +2039,14 @@ static void shrink_zone(int priority, struct zone *zone,
2000 enum lru_list l; 2039 enum lru_list l;
2001 unsigned long nr_reclaimed, nr_scanned; 2040 unsigned long nr_reclaimed, nr_scanned;
2002 unsigned long nr_to_reclaim = sc->nr_to_reclaim; 2041 unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2042 struct blk_plug plug;
2003 2043
2004restart: 2044restart:
2005 nr_reclaimed = 0; 2045 nr_reclaimed = 0;
2006 nr_scanned = sc->nr_scanned; 2046 nr_scanned = sc->nr_scanned;
2007 get_scan_count(zone, sc, nr, priority); 2047 get_scan_count(zone, sc, nr, priority);
2008 2048
2049 blk_start_plug(&plug);
2009 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 2050 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2010 nr[LRU_INACTIVE_FILE]) { 2051 nr[LRU_INACTIVE_FILE]) {
2011 for_each_evictable_lru(l) { 2052 for_each_evictable_lru(l) {
@@ -2029,6 +2070,7 @@ restart:
2029 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) 2070 if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
2030 break; 2071 break;
2031 } 2072 }
2073 blk_finish_plug(&plug);
2032 sc->nr_reclaimed += nr_reclaimed; 2074 sc->nr_reclaimed += nr_reclaimed;
2033 2075
2034 /* 2076 /*
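
The plug added to shrink_zone() batches whatever I/O shrink_list() ends up submitting (pageout, swap writeback) so it reaches the block layer in one go. The general pattern, independent of this function, is:

    	struct blk_plug plug;

    	blk_start_plug(&plug);		/* queue bios privately on this task */
    	/* ... code that may submit I/O, here the while loop over the LRUs ... */
    	blk_finish_plug(&plug);		/* hand the batched requests to the device */
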
@@ -2061,14 +2103,19 @@ restart:
2061 * 2103 *
2062 * If a zone is deemed to be full of pinned pages then just give it a light 2104 * If a zone is deemed to be full of pinned pages then just give it a light
2063 * scan then give up on it. 2105 * scan then give up on it.
2106 *
2107 * This function returns true if a zone is being reclaimed for a costly
2108 * high-order allocation and compaction is either ready to begin or deferred.
2109 * This indicates to the caller that it should retry the allocation or fail.
2064 */ 2110 */
2065static void shrink_zones(int priority, struct zonelist *zonelist, 2111static bool shrink_zones(int priority, struct zonelist *zonelist,
2066 struct scan_control *sc) 2112 struct scan_control *sc)
2067{ 2113{
2068 struct zoneref *z; 2114 struct zoneref *z;
2069 struct zone *zone; 2115 struct zone *zone;
2070 unsigned long nr_soft_reclaimed; 2116 unsigned long nr_soft_reclaimed;
2071 unsigned long nr_soft_scanned; 2117 unsigned long nr_soft_scanned;
2118 bool should_abort_reclaim = false;
2072 2119
2073 for_each_zone_zonelist_nodemask(zone, z, zonelist, 2120 for_each_zone_zonelist_nodemask(zone, z, zonelist,
2074 gfp_zone(sc->gfp_mask), sc->nodemask) { 2121 gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2083,6 +2130,23 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
2083 continue; 2130 continue;
2084 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2131 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2085 continue; /* Let kswapd poll it */ 2132 continue; /* Let kswapd poll it */
2133 if (COMPACTION_BUILD) {
2134 /*
2135 * If we already have plenty of memory free for
2136 * compaction in this zone, don't free any more.
2137 * Even though compaction is invoked for any
2138 * non-zero order, only frequent costly order
2139 * reclamation is disruptive enough to become a
2140 * noticeable problem, like transparent huge page
2141 * allocations.
2142 */
2143 if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
2144 (compaction_suitable(zone, sc->order) ||
2145 compaction_deferred(zone))) {
2146 should_abort_reclaim = true;
2147 continue;
2148 }
2149 }
2086 /* 2150 /*
2087 * This steals pages from memory cgroups over softlimit 2151 * This steals pages from memory cgroups over softlimit
2088 * and returns the number of reclaimed pages and 2152 * and returns the number of reclaimed pages and
@@ -2100,6 +2164,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
2100 2164
2101 shrink_zone(priority, zone, sc); 2165 shrink_zone(priority, zone, sc);
2102 } 2166 }
2167
2168 return should_abort_reclaim;
2103} 2169}
2104 2170
2105static bool zone_reclaimable(struct zone *zone) 2171static bool zone_reclaimable(struct zone *zone)
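
The new bail-out only fires for costly orders; the condition it adds can be read as the hypothetical helper below (the name is invented here, the calls are the ones used in the hunk):

    static bool reclaim_should_abort_for_compaction(struct zone *zone,
    						struct scan_control *sc)
    {
    	if (!COMPACTION_BUILD)
    		return false;
    	if (sc->order <= PAGE_ALLOC_COSTLY_ORDER)
    		return false;
    	/* enough free memory for compaction, or compaction recently deferred */
    	return compaction_suitable(zone, sc->order) ||
    	       compaction_deferred(zone);
    }
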
@@ -2164,7 +2230,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2164 sc->nr_scanned = 0; 2230 sc->nr_scanned = 0;
2165 if (!priority) 2231 if (!priority)
2166 disable_swap_token(sc->mem_cgroup); 2232 disable_swap_token(sc->mem_cgroup);
2167 shrink_zones(priority, zonelist, sc); 2233 if (shrink_zones(priority, zonelist, sc))
2234 break;
2235
2168 /* 2236 /*
2169 * Don't shrink slabs when reclaiming memory from 2237 * Don't shrink slabs when reclaiming memory from
2170 * over limit cgroups 2238 * over limit cgroups
@@ -2198,7 +2266,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
2198 */ 2266 */
2199 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; 2267 writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
2200 if (total_scanned > writeback_threshold) { 2268 if (total_scanned > writeback_threshold) {
2201 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); 2269 wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
2270 WB_REASON_TRY_TO_FREE_PAGES);
2202 sc->may_writepage = 1; 2271 sc->may_writepage = 1;
2203 } 2272 }
2204 2273
@@ -2268,10 +2337,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
2268#ifdef CONFIG_CGROUP_MEM_RES_CTLR 2337#ifdef CONFIG_CGROUP_MEM_RES_CTLR
2269 2338
2270unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 2339unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2271 gfp_t gfp_mask, bool noswap, 2340 gfp_t gfp_mask, bool noswap,
2272 struct zone *zone, 2341 struct zone *zone,
2273 struct memcg_scanrecord *rec, 2342 unsigned long *nr_scanned)
2274 unsigned long *scanned)
2275{ 2343{
2276 struct scan_control sc = { 2344 struct scan_control sc = {
2277 .nr_scanned = 0, 2345 .nr_scanned = 0,
@@ -2281,9 +2349,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2281 .may_swap = !noswap, 2349 .may_swap = !noswap,
2282 .order = 0, 2350 .order = 0,
2283 .mem_cgroup = mem, 2351 .mem_cgroup = mem,
2284 .memcg_record = rec,
2285 }; 2352 };
2286 ktime_t start, end;
2287 2353
2288 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2354 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2289 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2355 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2292,7 +2358,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2292 sc.may_writepage, 2358 sc.may_writepage,
2293 sc.gfp_mask); 2359 sc.gfp_mask);
2294 2360
2295 start = ktime_get();
2296 /* 2361 /*
2297 * NOTE: Although we can get the priority field, using it 2362 * NOTE: Although we can get the priority field, using it
2298 * here is not a good idea, since it limits the pages we can scan. 2363 * here is not a good idea, since it limits the pages we can scan.
@@ -2301,25 +2366,19 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
2301 * the priority and make it zero. 2366 * the priority and make it zero.
2302 */ 2367 */
2303 shrink_zone(0, zone, &sc); 2368 shrink_zone(0, zone, &sc);
2304 end = ktime_get();
2305
2306 if (rec)
2307 rec->elapsed += ktime_to_ns(ktime_sub(end, start));
2308 *scanned = sc.nr_scanned;
2309 2369
2310 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); 2370 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2311 2371
2372 *nr_scanned = sc.nr_scanned;
2312 return sc.nr_reclaimed; 2373 return sc.nr_reclaimed;
2313} 2374}
2314 2375
2315unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 2376unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2316 gfp_t gfp_mask, 2377 gfp_t gfp_mask,
2317 bool noswap, 2378 bool noswap)
2318 struct memcg_scanrecord *rec)
2319{ 2379{
2320 struct zonelist *zonelist; 2380 struct zonelist *zonelist;
2321 unsigned long nr_reclaimed; 2381 unsigned long nr_reclaimed;
2322 ktime_t start, end;
2323 int nid; 2382 int nid;
2324 struct scan_control sc = { 2383 struct scan_control sc = {
2325 .may_writepage = !laptop_mode, 2384 .may_writepage = !laptop_mode,
@@ -2328,7 +2387,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2328 .nr_to_reclaim = SWAP_CLUSTER_MAX, 2387 .nr_to_reclaim = SWAP_CLUSTER_MAX,
2329 .order = 0, 2388 .order = 0,
2330 .mem_cgroup = mem_cont, 2389 .mem_cgroup = mem_cont,
2331 .memcg_record = rec,
2332 .nodemask = NULL, /* we don't care the placement */ 2390 .nodemask = NULL, /* we don't care the placement */
2333 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2391 .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
2334 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), 2392 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
@@ -2337,7 +2395,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2337 .gfp_mask = sc.gfp_mask, 2395 .gfp_mask = sc.gfp_mask,
2338 }; 2396 };
2339 2397
2340 start = ktime_get();
2341 /* 2398 /*
2342 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't 2399 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
2343 * take care of from where we get pages. So the node where we start the 2400 * take care of from where we get pages. So the node where we start the
@@ -2352,9 +2409,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
2352 sc.gfp_mask); 2409 sc.gfp_mask);
2353 2410
2354 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); 2411 nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
2355 end = ktime_get();
2356 if (rec)
2357 rec->elapsed += ktime_to_ns(ktime_sub(end, start));
2358 2412
2359 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); 2413 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2360 2414
@@ -2722,6 +2776,8 @@ out:
2722 2776
2723 /* If balanced, clear the congested flag */ 2777 /* If balanced, clear the congested flag */
2724 zone_clear_flag(zone, ZONE_CONGESTED); 2778 zone_clear_flag(zone, ZONE_CONGESTED);
2779 if (i <= *classzone_idx)
2780 balanced += zone->present_pages;
2725 } 2781 }
2726 } 2782 }
2727 2783
@@ -2795,7 +2851,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
2795static int kswapd(void *p) 2851static int kswapd(void *p)
2796{ 2852{
2797 unsigned long order, new_order; 2853 unsigned long order, new_order;
2854 unsigned balanced_order;
2798 int classzone_idx, new_classzone_idx; 2855 int classzone_idx, new_classzone_idx;
2856 int balanced_classzone_idx;
2799 pg_data_t *pgdat = (pg_data_t*)p; 2857 pg_data_t *pgdat = (pg_data_t*)p;
2800 struct task_struct *tsk = current; 2858 struct task_struct *tsk = current;
2801 2859
@@ -2826,7 +2884,9 @@ static int kswapd(void *p)
2826 set_freezable(); 2884 set_freezable();
2827 2885
2828 order = new_order = 0; 2886 order = new_order = 0;
2887 balanced_order = 0;
2829 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; 2888 classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
2889 balanced_classzone_idx = classzone_idx;
2830 for ( ; ; ) { 2890 for ( ; ; ) {
2831 int ret; 2891 int ret;
2832 2892
@@ -2835,7 +2895,8 @@ static int kswapd(void *p)
2835 * new request of a similar or harder type will succeed soon 2895 * new request of a similar or harder type will succeed soon
2836 * so consider going to sleep on the basis we reclaimed at 2896 * so consider going to sleep on the basis we reclaimed at
2837 */ 2897 */
2838 if (classzone_idx >= new_classzone_idx && order == new_order) { 2898 if (balanced_classzone_idx >= new_classzone_idx &&
2899 balanced_order == new_order) {
2839 new_order = pgdat->kswapd_max_order; 2900 new_order = pgdat->kswapd_max_order;
2840 new_classzone_idx = pgdat->classzone_idx; 2901 new_classzone_idx = pgdat->classzone_idx;
2841 pgdat->kswapd_max_order = 0; 2902 pgdat->kswapd_max_order = 0;
@@ -2850,9 +2911,12 @@ static int kswapd(void *p)
2850 order = new_order; 2911 order = new_order;
2851 classzone_idx = new_classzone_idx; 2912 classzone_idx = new_classzone_idx;
2852 } else { 2913 } else {
2853 kswapd_try_to_sleep(pgdat, order, classzone_idx); 2914 kswapd_try_to_sleep(pgdat, balanced_order,
2915 balanced_classzone_idx);
2854 order = pgdat->kswapd_max_order; 2916 order = pgdat->kswapd_max_order;
2855 classzone_idx = pgdat->classzone_idx; 2917 classzone_idx = pgdat->classzone_idx;
2918 new_order = order;
2919 new_classzone_idx = classzone_idx;
2856 pgdat->kswapd_max_order = 0; 2920 pgdat->kswapd_max_order = 0;
2857 pgdat->classzone_idx = pgdat->nr_zones - 1; 2921 pgdat->classzone_idx = pgdat->nr_zones - 1;
2858 } 2922 }
@@ -2867,7 +2931,9 @@ static int kswapd(void *p)
2867 */ 2931 */
2868 if (!ret) { 2932 if (!ret) {
2869 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); 2933 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2870 order = balance_pgdat(pgdat, order, &classzone_idx); 2934 balanced_classzone_idx = classzone_idx;
2935 balanced_order = balance_pgdat(pgdat, order,
2936 &balanced_classzone_idx);
2871 } 2937 }
2872 } 2938 }
2873 return 0; 2939 return 0;
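
The balanced_order/balanced_classzone_idx bookkeeping above makes kswapd decide whether to sleep based on what balance_pgdat() actually achieved rather than on what was requested. A heavily simplified sketch of the loop after this change (only the fields this patch touches are shown; the "harder pending request" fast path is omitted):

    	for ( ; ; ) {
    		/* sleep on the order/classzone that was actually balanced */
    		kswapd_try_to_sleep(pgdat, balanced_order, balanced_classzone_idx);

    		order = pgdat->kswapd_max_order;	/* pick up the latest wakeup */
    		classzone_idx = pgdat->classzone_idx;
    		new_order = order;			/* a fresh request is not */
    		new_classzone_idx = classzone_idx;	/* treated as a pending one */

    		balanced_classzone_idx = classzone_idx;
    		balanced_order = balance_pgdat(pgdat, order,
    					       &balanced_classzone_idx);
    	}
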
@@ -3379,66 +3445,12 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
3379 3445
3380} 3446}
3381 3447
3382/** 3448static void warn_scan_unevictable_pages(void)
3383 * scan_zone_unevictable_pages - check unevictable list for evictable pages
3384 * @zone - zone of which to scan the unevictable list
3385 *
3386 * Scan @zone's unevictable LRU lists to check for pages that have become
3387 * evictable. Move those that have to @zone's inactive list where they
3388 * become candidates for reclaim, unless shrink_inactive_zone() decides
3389 * to reactivate them. Pages that are still unevictable are rotated
3390 * back onto @zone's unevictable list.
3391 */
3392#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
3393static void scan_zone_unevictable_pages(struct zone *zone)
3394{
3395 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
3396 unsigned long scan;
3397 unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
3398
3399 while (nr_to_scan > 0) {
3400 unsigned long batch_size = min(nr_to_scan,
3401 SCAN_UNEVICTABLE_BATCH_SIZE);
3402
3403 spin_lock_irq(&zone->lru_lock);
3404 for (scan = 0; scan < batch_size; scan++) {
3405 struct page *page = lru_to_page(l_unevictable);
3406
3407 if (!trylock_page(page))
3408 continue;
3409
3410 prefetchw_prev_lru_page(page, l_unevictable, flags);
3411
3412 if (likely(PageLRU(page) && PageUnevictable(page)))
3413 check_move_unevictable_page(page, zone);
3414
3415 unlock_page(page);
3416 }
3417 spin_unlock_irq(&zone->lru_lock);
3418
3419 nr_to_scan -= batch_size;
3420 }
3421}
3422
3423
3424/**
3425 * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
3426 *
3427 * A really big hammer: scan all zones' unevictable LRU lists to check for
3428 * pages that have become evictable. Move those back to the zones'
3429 * inactive list where they become candidates for reclaim.
3430 * This occurs when, e.g., we have unswappable pages on the unevictable lists,
3431 * and we add swap to the system. As such, it runs in the context of a task
3432 * that has possibly/probably made some previously unevictable pages
3433 * evictable.
3434 */
3435static void scan_all_zones_unevictable_pages(void)
3436{ 3449{
3437 struct zone *zone; 3450 printk_once(KERN_WARNING
3438 3451 "The scan_unevictable_pages sysctl/node-interface has been "
3439 for_each_zone(zone) { 3452 "disabled for lack of a legitimate use case. If you have "
3440 scan_zone_unevictable_pages(zone); 3453 "one, please send an email to linux-mm@kvack.org.\n");
3441 }
3442} 3454}
3443 3455
3444/* 3456/*
@@ -3451,11 +3463,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
3451 void __user *buffer, 3463 void __user *buffer,
3452 size_t *length, loff_t *ppos) 3464 size_t *length, loff_t *ppos)
3453{ 3465{
3466 warn_scan_unevictable_pages();
3454 proc_doulongvec_minmax(table, write, buffer, length, ppos); 3467 proc_doulongvec_minmax(table, write, buffer, length, ppos);
3455
3456 if (write && *(unsigned long *)table->data)
3457 scan_all_zones_unevictable_pages();
3458
3459 scan_unevictable_pages = 0; 3468 scan_unevictable_pages = 0;
3460 return 0; 3469 return 0;
3461} 3470}
@@ -3470,6 +3479,7 @@ static ssize_t read_scan_unevictable_node(struct sys_device *dev,
3470 struct sysdev_attribute *attr, 3479 struct sysdev_attribute *attr,
3471 char *buf) 3480 char *buf)
3472{ 3481{
3482 warn_scan_unevictable_pages();
3473 return sprintf(buf, "0\n"); /* always zero; should fit... */ 3483 return sprintf(buf, "0\n"); /* always zero; should fit... */
3474} 3484}
3475 3485
@@ -3477,19 +3487,7 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev,
3477 struct sysdev_attribute *attr, 3487 struct sysdev_attribute *attr,
3478 const char *buf, size_t count) 3488 const char *buf, size_t count)
3479{ 3489{
3480 struct zone *node_zones = NODE_DATA(dev->id)->node_zones; 3490 warn_scan_unevictable_pages();
3481 struct zone *zone;
3482 unsigned long res;
3483 unsigned long req = strict_strtoul(buf, 10, &res);
3484
3485 if (!req)
3486 return 1; /* zero is no-op */
3487
3488 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
3489 if (!populated_zone(zone))
3490 continue;
3491 scan_zone_unevictable_pages(zone);
3492 }
3493 return 1; 3491 return 1;
3494} 3492}
3495 3493