Diffstat (limited to 'mm/vmscan.c')
 -rw-r--r--  mm/vmscan.c  380
 1 file changed, 207 insertions, 173 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index febbc044e792..a1893c050795 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -95,8 +95,6 @@ struct scan_control {
 	/* Can pages be swapped as part of reclaim? */
 	int may_swap;

-	int swappiness;
-
 	int order;

 	/*
@@ -173,7 +171,8 @@ static unsigned long zone_nr_lru_pages(struct zone *zone,
 				struct scan_control *sc, enum lru_list lru)
 {
 	if (!scanning_global_lru(sc))
-		return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup, zone, lru);
+		return mem_cgroup_zone_nr_lru_pages(sc->mem_cgroup,
+				zone_to_nid(zone), zone_idx(zone), BIT(lru));

 	return zone_page_state(zone, NR_LRU_BASE + lru);
 }
@@ -496,15 +495,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 		return PAGE_ACTIVATE;
 	}

-	/*
-	 * Wait on writeback if requested to. This happens when
-	 * direct reclaiming a large contiguous area and the
-	 * first attempt to free a range of pages fails.
-	 */
-	if (PageWriteback(page) &&
-	    (sc->reclaim_mode & RECLAIM_MODE_SYNC))
-		wait_on_page_writeback(page);
-
 	if (!PageWriteback(page)) {
 		/* synchronous write or broken a_ops? */
 		ClearPageReclaim(page);
@@ -643,13 +633,14 @@ redo:
 		lru = LRU_UNEVICTABLE;
 		add_page_to_unevictable_list(page);
 		/*
-		 * When racing with an mlock clearing (page is
-		 * unlocked), make sure that if the other thread does
-		 * not observe our setting of PG_lru and fails
-		 * isolation, we see PG_mlocked cleared below and move
+		 * When racing with an mlock or AS_UNEVICTABLE clearing
+		 * (page is unlocked) make sure that if the other thread
+		 * does not observe our setting of PG_lru and fails
+		 * isolation/check_move_unevictable_page,
+		 * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
 		 * the page back to the evictable list.
 		 *
-		 * The other side is TestClearPageMlocked().
+		 * The other side is TestClearPageMlocked() or shmem_lock().
 		 */
 		smp_mb();
 	}
@@ -760,7 +751,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
 				      struct zone *zone,
-				      struct scan_control *sc)
+				      struct scan_control *sc,
+				      int priority,
+				      unsigned long *ret_nr_dirty,
+				      unsigned long *ret_nr_writeback)
 {
 	LIST_HEAD(ret_pages);
 	LIST_HEAD(free_pages);
@@ -768,6 +762,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 	unsigned long nr_dirty = 0;
 	unsigned long nr_congested = 0;
 	unsigned long nr_reclaimed = 0;
+	unsigned long nr_writeback = 0;

 	cond_resched();

@@ -804,13 +799,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 			(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));

 		if (PageWriteback(page)) {
+			nr_writeback++;
 			/*
-			 * Synchronous reclaim is performed in two passes,
-			 * first an asynchronous pass over the list to
-			 * start parallel writeback, and a second synchronous
-			 * pass to wait for the IO to complete. Wait here
-			 * for any page for which writeback has already
-			 * started.
+			 * Synchronous reclaim cannot queue pages for
+			 * writeback due to the possibility of stack overflow
+			 * but if it encounters a page under writeback, wait
+			 * for the IO to complete.
 			 */
 			if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
 			    may_enter_fs)
@@ -866,6 +860,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (PageDirty(page)) {
 			nr_dirty++;

+			/*
+			 * Only kswapd can writeback filesystem pages to
+			 * avoid risk of stack overflow but do not writeback
+			 * unless under significant pressure.
+			 */
+			if (page_is_file_cache(page) &&
+					(!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
+				/*
+				 * Immediately reclaim when written back.
+				 * Similar in principal to deactivate_page()
+				 * except we already have the page isolated
+				 * and know it's dirty
+				 */
+				inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
+				SetPageReclaim(page);
+
+				goto keep_locked;
+			}
+
 			if (references == PAGEREF_RECLAIM_CLEAN)
 				goto keep_locked;
 			if (!may_enter_fs)
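
To restate the rule the hunk above adds (a minimal sketch, not part of the patch; the helper name and boolean parameters are made up for illustration): dirty file pages are only written back from reclaim by kswapd, and only after the scan priority has dropped far enough to signal real pressure; everyone else tags the page with SetPageReclaim() and leaves it to the flusher threads.

#include <stdbool.h>

/* Sketch of the decision in shrink_page_list() after this change. */
static bool may_writeback_file_page(bool is_kswapd, int priority, int def_priority)
{
	/* negation of: !current_is_kswapd() || priority >= DEF_PRIORITY - 2 */
	return is_kswapd && priority < def_priority - 2;
}

With DEF_PRIORITY at 12, this means a direct reclaimer never issues writeback on file pages here, and kswapd only starts doing so once the priority has fallen below 10.
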
@@ -1000,6 +1013,8 @@ keep_lumpy:

 	list_splice(&ret_pages, page_list);
 	count_vm_events(PGACTIVATE, pgactivate);
+	*ret_nr_dirty += nr_dirty;
+	*ret_nr_writeback += nr_writeback;
 	return nr_reclaimed;
 }

@@ -1013,23 +1028,27 @@ keep_lumpy:
  *
  * returns 0 on success, -ve errno on failure.
  */
-int __isolate_lru_page(struct page *page, int mode, int file)
+int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
 {
+	bool all_lru_mode;
 	int ret = -EINVAL;

 	/* Only take pages on the LRU. */
 	if (!PageLRU(page))
 		return ret;

+	all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
+		(ISOLATE_ACTIVE|ISOLATE_INACTIVE);
+
 	/*
 	 * When checking the active state, we need to be sure we are
 	 * dealing with comparible boolean values. Take the logical not
 	 * of each.
 	 */
-	if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+	if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
 		return ret;

-	if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
+	if (!all_lru_mode && !!page_is_file_cache(page) != file)
 		return ret;

 	/*
@@ -1042,6 +1061,12 @@ int __isolate_lru_page(struct page *page, int mode, int file)

 	ret = -EBUSY;

+	if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page)))
+		return ret;
+
+	if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
+		return ret;
+
 	if (likely(get_page_unless_zero(page))) {
 		/*
 		 * Be careful not to clear PageLRU until after we're
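
The two hunks above turn the isolation mode into a bitmask (isolate_mode_t) and make __isolate_lru_page() honour two new flags. A minimal sketch of the idea, with hypothetical flag names and values standing in for the real ISOLATE_* definitions in the mm headers:

/* Hypothetical stand-ins for the real ISOLATE_* flags; values are illustrative. */
typedef unsigned int isolate_mode_sketch_t;
#define SK_ISOLATE_INACTIVE	0x1	/* pull pages off the inactive list */
#define SK_ISOLATE_ACTIVE	0x2	/* pull pages off the active list */
#define SK_ISOLATE_CLEAN	0x4	/* skip dirty or writeback pages */
#define SK_ISOLATE_UNMAPPED	0x8	/* skip pages mapped into page tables */

static int mode_allows_page(isolate_mode_sketch_t mode, int page_active,
			    int page_dirty_or_writeback, int page_mapped)
{
	int all_lru = (mode & (SK_ISOLATE_ACTIVE | SK_ISOLATE_INACTIVE)) ==
		      (SK_ISOLATE_ACTIVE | SK_ISOLATE_INACTIVE);

	/* unless both lists are wanted, the active state must match the request */
	if (!all_lru && !page_active != !(mode & SK_ISOLATE_ACTIVE))
		return 0;
	if ((mode & SK_ISOLATE_CLEAN) && page_dirty_or_writeback)
		return 0;
	if ((mode & SK_ISOLATE_UNMAPPED) && page_mapped)
		return 0;
	return 1;
}
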
@@ -1077,7 +1102,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
  */
 static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 		struct list_head *src, struct list_head *dst,
-		unsigned long *scanned, int order, int mode, int file)
+		unsigned long *scanned, int order, isolate_mode_t mode,
+		int file)
 {
 	unsigned long nr_taken = 0;
 	unsigned long nr_lumpy_taken = 0;
@@ -1202,8 +1228,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 static unsigned long isolate_pages_global(unsigned long nr,
 					struct list_head *dst,
 					unsigned long *scanned, int order,
-					int mode, struct zone *z,
-					int active, int file)
+					isolate_mode_t mode,
+					struct zone *z, int active, int file)
 {
 	int lru = LRU_BASE;
 	if (active)
@@ -1395,7 +1421,7 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
 }

 /*
- * Returns true if the caller should wait to clean dirty/writeback pages.
+ * Returns true if a direct reclaim should wait on pages under writeback.
  *
  * If we are direct reclaiming for contiguous pages and we do not reclaim
  * everything in the list, try again and wait for writeback IO to complete.
@@ -1417,7 +1443,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
 	if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
 		return false;

-	/* If we have relaimed everything on the isolated list, no stall */
+	/* If we have reclaimed everything on the isolated list, no stall */
 	if (nr_freed == nr_taken)
 		return false;

@@ -1449,6 +1475,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 	unsigned long nr_taken;
 	unsigned long nr_anon;
 	unsigned long nr_file;
+	unsigned long nr_dirty = 0;
+	unsigned long nr_writeback = 0;
+	isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;

 	while (unlikely(too_many_isolated(zone, file, sc))) {
 		congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1459,15 +1488,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 	}

 	set_reclaim_mode(priority, sc, false);
+	if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
+		reclaim_mode |= ISOLATE_ACTIVE;
+
 	lru_add_drain();
+
+	if (!sc->may_unmap)
+		reclaim_mode |= ISOLATE_UNMAPPED;
+	if (!sc->may_writepage)
+		reclaim_mode |= ISOLATE_CLEAN;
+
 	spin_lock_irq(&zone->lru_lock);

 	if (scanning_global_lru(sc)) {
-		nr_taken = isolate_pages_global(nr_to_scan,
-			&page_list, &nr_scanned, sc->order,
-			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
-					ISOLATE_BOTH : ISOLATE_INACTIVE,
-			zone, 0, file);
+		nr_taken = isolate_pages_global(nr_to_scan, &page_list,
+			&nr_scanned, sc->order, reclaim_mode, zone, 0, file);
 		zone->pages_scanned += nr_scanned;
 		if (current_is_kswapd())
 			__count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1476,12 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
 			__count_zone_vm_events(PGSCAN_DIRECT, zone,
 					       nr_scanned);
 	} else {
-		nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
-			&page_list, &nr_scanned, sc->order,
-			sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
-					ISOLATE_BOTH : ISOLATE_INACTIVE,
-			zone, sc->mem_cgroup,
-			0, file);
+		nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
+			&nr_scanned, sc->order, reclaim_mode, zone,
+			sc->mem_cgroup, 0, file);
 		/*
 		 * mem_cgroup_isolate_pages() keeps track of
 		 * scanned pages on its own.
@@ -1497,12 +1529,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,

 	spin_unlock_irq(&zone->lru_lock);

-	nr_reclaimed = shrink_page_list(&page_list, zone, sc);
+	nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
+						&nr_dirty, &nr_writeback);

 	/* Check if we should syncronously wait for writeback */
 	if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
 		set_reclaim_mode(priority, sc, true);
-		nr_reclaimed += shrink_page_list(&page_list, zone, sc);
+		nr_reclaimed += shrink_page_list(&page_list, zone, sc,
+					priority, &nr_dirty, &nr_writeback);
 	}

 	local_irq_disable();
@@ -1512,6 +1546,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,

 	putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);

+	/*
+	 * If reclaim is isolating dirty pages under writeback, it implies
+	 * that the long-lived page allocation rate is exceeding the page
+	 * laundering rate. Either the global limits are not being effective
+	 * at throttling processes due to the page distribution throughout
+	 * zones or there is heavy usage of a slow backing device. The
+	 * only option is to throttle from reclaim context which is not ideal
+	 * as there is no guarantee the dirtying process is throttled in the
+	 * same way balance_dirty_pages() manages.
+	 *
+	 * This scales the number of dirty pages that must be under writeback
+	 * before throttling depending on priority. It is a simple backoff
+	 * function that has the most effect in the range DEF_PRIORITY to
+	 * DEF_PRIORITY-2 which is the priority reclaim is considered to be
+	 * in trouble and reclaim is considered to be in trouble.
+	 *
+	 * DEF_PRIORITY   100% isolated pages must be PageWriteback to throttle
+	 * DEF_PRIORITY-1  50% must be PageWriteback
+	 * DEF_PRIORITY-2  25% must be PageWriteback, kswapd in trouble
+	 * ...
+	 * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
+	 *                isolated page is PageWriteback
+	 */
+	if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
+		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+
 	trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
 		zone_idx(zone),
 		nr_scanned, nr_reclaimed,
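
To make the backoff table in the new comment concrete: with DEF_PRIORITY equal to 12 and SWAP_CLUSTER_MAX (32) pages isolated, the threshold works out as in the following standalone sketch (not the kernel code; the helper and macro names are made up):

#define DEF_PRIORITY_SKETCH	12

/*
 * Sketch of the throttling test added at the end of shrink_inactive_list().
 * For nr_taken == 32: priority 12 needs all 32 isolated pages under
 * writeback, 11 needs 16, 10 needs 8, and by priority 6 the shift yields 0,
 * so a single PageWriteback page is enough to throttle.
 */
static int should_throttle(unsigned long nr_writeback, unsigned long nr_taken,
			   int priority)
{
	return nr_writeback &&
	       nr_writeback >= (nr_taken >> (DEF_PRIORITY_SKETCH - priority));
}
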
@@ -1583,19 +1643,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	struct page *page;
 	struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
 	unsigned long nr_rotated = 0;
+	isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;

 	lru_add_drain();
+
+	if (!sc->may_unmap)
+		reclaim_mode |= ISOLATE_UNMAPPED;
+	if (!sc->may_writepage)
+		reclaim_mode |= ISOLATE_CLEAN;
+
 	spin_lock_irq(&zone->lru_lock);
 	if (scanning_global_lru(sc)) {
 		nr_taken = isolate_pages_global(nr_pages, &l_hold,
 						&pgscanned, sc->order,
-						ISOLATE_ACTIVE, zone,
+						reclaim_mode, zone,
 						1, file);
 		zone->pages_scanned += pgscanned;
 	} else {
 		nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
 						&pgscanned, sc->order,
-						ISOLATE_ACTIVE, zone,
+						reclaim_mode, zone,
 						sc->mem_cgroup, 1, file);
 		/*
 		 * mem_cgroup_isolate_pages() keeps track of
@@ -1700,7 +1767,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
 	if (scanning_global_lru(sc))
 		low = inactive_anon_is_low_global(zone);
 	else
-		low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
+		low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone);
 	return low;
 }
 #else
@@ -1743,7 +1810,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
 	if (scanning_global_lru(sc))
 		low = inactive_file_is_low_global(zone);
 	else
-		low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup);
+		low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone);
 	return low;
 }

@@ -1770,6 +1837,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 	return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
 }

+static int vmscan_swappiness(struct scan_control *sc)
+{
+	if (scanning_global_lru(sc))
+		return vm_swappiness;
+	return mem_cgroup_swappiness(sc->mem_cgroup);
+}
+
 /*
  * Determine how aggressively the anon and file LRU lists should be
  * scanned. The relative value of each set of LRU lists is determined
@@ -1788,22 +1862,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 	u64 fraction[2], denominator;
 	enum lru_list l;
 	int noswap = 0;
-	int force_scan = 0;
-
-
-	anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
-		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
-	file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
-		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+	bool force_scan = false;

-	if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) {
-		/* kswapd does zone balancing and need to scan this zone */
-		if (scanning_global_lru(sc) && current_is_kswapd())
-			force_scan = 1;
-		/* memcg may have small limit and need to avoid priority drop */
-		if (!scanning_global_lru(sc))
-			force_scan = 1;
-	}
+	/*
+	 * If the zone or memcg is small, nr[l] can be 0. This
+	 * results in no scanning on this priority and a potential
+	 * priority drop. Global direct reclaim can go to the next
+	 * zone and tends to have no problems. Global kswapd is for
+	 * zone balancing and it needs to scan a minimum amount. When
+	 * reclaiming for a memcg, a priority drop can cause high
+	 * latencies, so it's better to scan a minimum amount there as
+	 * well.
+	 */
+	if (scanning_global_lru(sc) && current_is_kswapd())
+		force_scan = true;
+	if (!scanning_global_lru(sc))
+		force_scan = true;

 	/* If we have no swap space, do not bother scanning anon pages. */
 	if (!sc->may_swap || (nr_swap_pages <= 0)) {
@@ -1814,6 +1888,11 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 		goto out;
 	}

+	anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) +
+		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON);
+	file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) +
+		zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE);
+
 	if (scanning_global_lru(sc)) {
 		free = zone_page_state(zone, NR_FREE_PAGES);
 		/* If we have very few page cache pages,
@@ -1830,8 +1909,8 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
 	 * With swappiness at 100, anonymous and file have the same priority.
 	 * This scanning priority is essentially the inverse of IO cost.
 	 */
-	anon_prio = sc->swappiness;
-	file_prio = 200 - sc->swappiness;
+	anon_prio = vmscan_swappiness(sc);
+	file_prio = 200 - vmscan_swappiness(sc);

 	/*
 	 * OK, so we have swap space and a fair amount of page cache
@@ -1878,23 +1957,9 @@ out:
 		scan = zone_nr_lru_pages(zone, sc, l);
 		if (priority || noswap) {
 			scan >>= priority;
-			scan = div64_u64(scan * fraction[file], denominator);
-		}
-
-		/*
-		 * If zone is small or memcg is small, nr[l] can be 0.
-		 * This results no-scan on this priority and priority drop down.
-		 * For global direct reclaim, it can visit next zone and tend
-		 * not to have problems. For global kswapd, it's for zone
-		 * balancing and it need to scan a small amounts. When using
-		 * memcg, priority drop can cause big latency. So, it's better
-		 * to scan small amount. See may_noscan above.
-		 */
-		if (!scan && force_scan) {
-			if (file)
-				scan = SWAP_CLUSTER_MAX;
-			else if (!noswap)
+			if (!scan && force_scan)
 				scan = SWAP_CLUSTER_MAX;
+			scan = div64_u64(scan * fraction[file], denominator);
 		}
 		nr[l] = scan;
 	}
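
The rewritten loop above applies the force_scan floor before the proportional anon/file split. A condensed sketch of the per-list arithmetic (names are made up; fraction and denominator stand for the recent_scanned/recent_rotated ratio computed earlier in get_scan_count(), and denominator is assumed non-zero as in the real code):

/* Sketch only: how one nr[l] entry is derived after this change. */
static unsigned long scan_target(unsigned long lru_size, int priority,
				 int noswap, int force_scan,
				 unsigned long long fraction,
				 unsigned long long denominator)
{
	unsigned long scan = lru_size;

	if (priority || noswap) {
		scan >>= priority;
		if (!scan && force_scan)
			scan = 32;	/* SWAP_CLUSTER_MAX */
		scan = scan * fraction / denominator;
	}
	return scan;
}
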
@@ -1974,12 +2039,14 @@ static void shrink_zone(int priority, struct zone *zone,
 	enum lru_list l;
 	unsigned long nr_reclaimed, nr_scanned;
 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+	struct blk_plug plug;

 restart:
 	nr_reclaimed = 0;
 	nr_scanned = sc->nr_scanned;
 	get_scan_count(zone, sc, nr, priority);

+	blk_start_plug(&plug);
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
 					nr[LRU_INACTIVE_FILE]) {
 		for_each_evictable_lru(l) {
@@ -2003,6 +2070,7 @@ restart:
 		if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
 			break;
 	}
+	blk_finish_plug(&plug);
 	sc->nr_reclaimed += nr_reclaimed;

 	/*
@@ -2035,14 +2103,19 @@ restart:
  *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
+ *
+ * This function returns true if a zone is being reclaimed for a costly
+ * high-order allocation and compaction is either ready to begin or deferred.
+ * This indicates to the caller that it should retry the allocation or fail.
  */
-static void shrink_zones(int priority, struct zonelist *zonelist,
+static bool shrink_zones(int priority, struct zonelist *zonelist,
 					struct scan_control *sc)
 {
 	struct zoneref *z;
 	struct zone *zone;
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
+	bool should_abort_reclaim = false;

 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 					gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2057,6 +2130,23 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
 			continue;
 		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
+		if (COMPACTION_BUILD) {
+			/*
+			 * If we already have plenty of memory free for
+			 * compaction in this zone, don't free any more.
+			 * Even though compaction is invoked for any
+			 * non-zero order, only frequent costly order
+			 * reclamation is disruptive enough to become a
+			 * noticable problem, like transparent huge page
+			 * allocations.
+			 */
+			if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
+				(compaction_suitable(zone, sc->order) ||
+				 compaction_deferred(zone))) {
+				should_abort_reclaim = true;
+				continue;
+			}
+		}
 		/*
 		 * This steals pages from memory cgroups over softlimit
 		 * and returns the number of reclaimed pages and
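
The new COMPACTION_BUILD block lets reclaim bail out of a zone once compaction could already service a costly high-order allocation, or has been deferred there. The decision reduces to the following test (a sketch with made-up names; compaction_ready stands in for compaction_suitable() || compaction_deferred() on that zone):

#define PAGE_ALLOC_COSTLY_ORDER_SKETCH	3	/* same value the kernel uses */

/* Sketch of the per-zone early-exit test added to shrink_zones(). */
static int skip_zone_reclaim(int compaction_built, int order, int compaction_ready)
{
	return compaction_built &&
	       order > PAGE_ALLOC_COSTLY_ORDER_SKETCH &&
	       compaction_ready;
}
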
@@ -2074,6 +2164,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,

 		shrink_zone(priority, zone, sc);
 	}
+
+	return should_abort_reclaim;
 }

 static bool zone_reclaimable(struct zone *zone)
@@ -2138,7 +2230,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		sc->nr_scanned = 0;
 		if (!priority)
 			disable_swap_token(sc->mem_cgroup);
-		shrink_zones(priority, zonelist, sc);
+		if (shrink_zones(priority, zonelist, sc))
+			break;
+
 		/*
 		 * Don't shrink slabs when reclaiming memory from
 		 * over limit cgroups
@@ -2172,7 +2266,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		 */
 		writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2;
 		if (total_scanned > writeback_threshold) {
-			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned);
+			wakeup_flusher_threads(laptop_mode ? 0 : total_scanned,
+						WB_REASON_TRY_TO_FREE_PAGES);
 			sc->may_writepage = 1;
 		}

@@ -2220,7 +2315,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
 		.may_unmap = 1,
 		.may_swap = 1,
-		.swappiness = vm_swappiness,
 		.order = order,
 		.mem_cgroup = NULL,
 		.nodemask = nodemask,
@@ -2244,7 +2338,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,

 unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
-						unsigned int swappiness,
 						struct zone *zone,
 						unsigned long *nr_scanned)
 {
@@ -2254,7 +2347,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = !noswap,
-		.swappiness = swappiness,
 		.order = 0,
 		.mem_cgroup = mem,
 	};
@@ -2283,8 +2375,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,

 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 					   gfp_t gfp_mask,
-					   bool noswap,
-					   unsigned int swappiness)
+					   bool noswap)
 {
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
@@ -2294,7 +2385,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 		.may_unmap = 1,
 		.may_swap = !noswap,
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
-		.swappiness = swappiness,
 		.order = 0,
 		.mem_cgroup = mem_cont,
 		.nodemask = NULL, /* we don't care the placement */
@@ -2445,7 +2535,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 		 * we want to put equal scanning pressure on each zone.
 		 */
 		.nr_to_reclaim = ULONG_MAX,
-		.swappiness = vm_swappiness,
 		.order = order,
 		.mem_cgroup = NULL,
 	};
@@ -2494,6 +2583,9 @@ loop_again:
 					high_wmark_pages(zone), 0, 0)) {
 				end_zone = i;
 				break;
+			} else {
+				/* If balanced, clear the congested flag */
+				zone_clear_flag(zone, ZONE_CONGESTED);
 			}
 		}
 		if (i < 0)
@@ -2684,6 +2776,8 @@ out:

 			/* If balanced, clear the congested flag */
 			zone_clear_flag(zone, ZONE_CONGESTED);
+			if (i <= *classzone_idx)
+				balanced += zone->present_pages;
 		}
 	}

@@ -2757,7 +2851,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
 static int kswapd(void *p)
 {
 	unsigned long order, new_order;
+	unsigned balanced_order;
 	int classzone_idx, new_classzone_idx;
+	int balanced_classzone_idx;
 	pg_data_t *pgdat = (pg_data_t*)p;
 	struct task_struct *tsk = current;

@@ -2788,7 +2884,9 @@ static int kswapd(void *p)
 	set_freezable();

 	order = new_order = 0;
+	balanced_order = 0;
 	classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
+	balanced_classzone_idx = classzone_idx;
 	for ( ; ; ) {
 		int ret;

@@ -2797,7 +2895,8 @@ static int kswapd(void *p)
 		 * new request of a similar or harder type will succeed soon
 		 * so consider going to sleep on the basis we reclaimed at
 		 */
-		if (classzone_idx >= new_classzone_idx && order == new_order) {
+		if (balanced_classzone_idx >= new_classzone_idx &&
+					balanced_order == new_order) {
 			new_order = pgdat->kswapd_max_order;
 			new_classzone_idx = pgdat->classzone_idx;
 			pgdat->kswapd_max_order = 0;
@@ -2812,9 +2911,12 @@ static int kswapd(void *p)
 			order = new_order;
 			classzone_idx = new_classzone_idx;
 		} else {
-			kswapd_try_to_sleep(pgdat, order, classzone_idx);
+			kswapd_try_to_sleep(pgdat, balanced_order,
+						balanced_classzone_idx);
 			order = pgdat->kswapd_max_order;
 			classzone_idx = pgdat->classzone_idx;
+			new_order = order;
+			new_classzone_idx = classzone_idx;
 			pgdat->kswapd_max_order = 0;
 			pgdat->classzone_idx = pgdat->nr_zones - 1;
 		}
@@ -2829,7 +2931,9 @@ static int kswapd(void *p)
 		 */
 		if (!ret) {
 			trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
-			order = balance_pgdat(pgdat, order, &classzone_idx);
+			balanced_classzone_idx = classzone_idx;
+			balanced_order = balance_pgdat(pgdat, order,
+						&balanced_classzone_idx);
 		}
 	}
 	return 0;
@@ -2915,7 +3019,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 		.may_writepage = 1,
 		.nr_to_reclaim = nr_to_reclaim,
 		.hibernation_mode = 1,
-		.swappiness = vm_swappiness,
 		.order = 0,
 	};
 	struct shrink_control shrink = {
@@ -3102,7 +3205,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.nr_to_reclaim = max_t(unsigned long, nr_pages,
 				       SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
-		.swappiness = vm_swappiness,
 		.order = order,
 	};
 	struct shrink_control shrink = {
@@ -3343,66 +3445,12 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)

 }

-/**
- * scan_zone_unevictable_pages - check unevictable list for evictable pages
- * @zone - zone of which to scan the unevictable list
- *
- * Scan @zone's unevictable LRU lists to check for pages that have become
- * evictable. Move those that have to @zone's inactive list where they
- * become candidates for reclaim, unless shrink_inactive_zone() decides
- * to reactivate them. Pages that are still unevictable are rotated
- * back onto @zone's unevictable list.
- */
-#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
-static void scan_zone_unevictable_pages(struct zone *zone)
-{
-	struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
-	unsigned long scan;
-	unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
-
-	while (nr_to_scan > 0) {
-		unsigned long batch_size = min(nr_to_scan,
-						SCAN_UNEVICTABLE_BATCH_SIZE);
-
-		spin_lock_irq(&zone->lru_lock);
-		for (scan = 0; scan < batch_size; scan++) {
-			struct page *page = lru_to_page(l_unevictable);
-
-			if (!trylock_page(page))
-				continue;
-
-			prefetchw_prev_lru_page(page, l_unevictable, flags);
-
-			if (likely(PageLRU(page) && PageUnevictable(page)))
-				check_move_unevictable_page(page, zone);
-
-			unlock_page(page);
-		}
-		spin_unlock_irq(&zone->lru_lock);
-
-		nr_to_scan -= batch_size;
-	}
-}
-
-
-/**
- * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
- *
- * A really big hammer: scan all zones' unevictable LRU lists to check for
- * pages that have become evictable. Move those back to the zones'
- * inactive list where they become candidates for reclaim.
- * This occurs when, e.g., we have unswappable pages on the unevictable lists,
- * and we add swap to the system. As such, it runs in the context of a task
- * that has possibly/probably made some previously unevictable pages
- * evictable.
- */
-static void scan_all_zones_unevictable_pages(void)
+static void warn_scan_unevictable_pages(void)
 {
-	struct zone *zone;
-
-	for_each_zone(zone) {
-		scan_zone_unevictable_pages(zone);
-	}
+	printk_once(KERN_WARNING
+		"The scan_unevictable_pages sysctl/node-interface has been "
+		"disabled for lack of a legitimate use case. If you have "
+		"one, please send an email to linux-mm@kvack.org.\n");
 }

 /*
@@ -3415,11 +3463,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
 			     void __user *buffer,
 			     size_t *length, loff_t *ppos)
 {
+	warn_scan_unevictable_pages();
 	proc_doulongvec_minmax(table, write, buffer, length, ppos);
-
-	if (write && *(unsigned long *)table->data)
-		scan_all_zones_unevictable_pages();
-
 	scan_unevictable_pages = 0;
 	return 0;
 }
@@ -3434,6 +3479,7 @@ static ssize_t read_scan_unevictable_node(struct sys_device *dev,
 					  struct sysdev_attribute *attr,
 					  char *buf)
 {
+	warn_scan_unevictable_pages();
 	return sprintf(buf, "0\n");	/* always zero; should fit... */
 }

@@ -3441,19 +3487,7 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev,
 					   struct sysdev_attribute *attr,
 					   const char *buf, size_t count)
 {
-	struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
-	struct zone *zone;
-	unsigned long res;
-	unsigned long req = strict_strtoul(buf, 10, &res);
-
-	if (!req)
-		return 1;	/* zero is no-op */
-
-	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
-		if (!populated_zone(zone))
-			continue;
-		scan_zone_unevictable_pages(zone);
-	}
+	warn_scan_unevictable_pages();
 	return 1;
 }
