Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--	mm/vmscan.c	397
1 files changed, 213 insertions, 184 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 196709f5ee58..88c5fed8b9a4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -128,7 +128,7 @@ struct scan_control {
  * From 0 .. 100. Higher means more swappy.
  */
 int vm_swappiness = 60;
-long vm_total_pages;	/* The total number of pages which the VM controls */
+unsigned long vm_total_pages;	/* The total number of pages which the VM controls */
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
@@ -1579,16 +1579,6 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec)
 }
 #endif
 
-static int inactive_file_is_low_global(struct zone *zone)
-{
-	unsigned long active, inactive;
-
-	active = zone_page_state(zone, NR_ACTIVE_FILE);
-	inactive = zone_page_state(zone, NR_INACTIVE_FILE);
-
-	return (active > inactive);
-}
-
 /**
  * inactive_file_is_low - check if file pages need to be deactivated
  * @lruvec: LRU vector to check
@@ -1605,10 +1595,13 @@ static int inactive_file_is_low_global(struct zone *zone)
  */
 static int inactive_file_is_low(struct lruvec *lruvec)
 {
-	if (!mem_cgroup_disabled())
-		return mem_cgroup_inactive_file_is_low(lruvec);
+	unsigned long inactive;
+	unsigned long active;
+
+	inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
+	active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
 
-	return inactive_file_is_low_global(lruvec_zone(lruvec));
+	return active > inactive;
 }
 
 static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
@@ -1638,6 +1631,13 @@ static int vmscan_swappiness(struct scan_control *sc)
 	return mem_cgroup_swappiness(sc->target_mem_cgroup);
 }
 
+enum scan_balance {
+	SCAN_EQUAL,
+	SCAN_FRACT,
+	SCAN_ANON,
+	SCAN_FILE,
+};
+
 /*
  * Determine how aggressively the anon and file LRU lists should be
  * scanned. The relative value of each set of LRU lists is determined
@@ -1650,15 +1650,16 @@ static int vmscan_swappiness(struct scan_control *sc)
 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 			   unsigned long *nr)
 {
-	unsigned long anon, file, free;
+	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+	u64 fraction[2];
+	u64 denominator = 0;	/* gcc */
+	struct zone *zone = lruvec_zone(lruvec);
 	unsigned long anon_prio, file_prio;
+	enum scan_balance scan_balance;
+	unsigned long anon, file, free;
+	bool force_scan = false;
 	unsigned long ap, fp;
-	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
-	u64 fraction[2], denominator;
 	enum lru_list lru;
-	int noswap = 0;
-	bool force_scan = false;
-	struct zone *zone = lruvec_zone(lruvec);
 
 	/*
 	 * If the zone or memcg is small, nr[l] can be 0. This
@@ -1676,11 +1677,30 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 		force_scan = true;
 
 	/* If we have no swap space, do not bother scanning anon pages. */
-	if (!sc->may_swap || (nr_swap_pages <= 0)) {
-		noswap = 1;
-		fraction[0] = 0;
-		fraction[1] = 1;
-		denominator = 1;
+	if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
+		scan_balance = SCAN_FILE;
+		goto out;
+	}
+
+	/*
+	 * Global reclaim will swap to prevent OOM even with no
+	 * swappiness, but memcg users want to use this knob to
+	 * disable swapping for individual groups completely when
+	 * using the memory controller's swap limit feature would be
+	 * too expensive.
+	 */
+	if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
+		scan_balance = SCAN_FILE;
+		goto out;
+	}
+
+	/*
+	 * Do not apply any pressure balancing cleverness when the
+	 * system is close to OOM, scan both anon and file equally
+	 * (unless the swappiness setting disagrees with swapping).
+	 */
+	if (!sc->priority && vmscan_swappiness(sc)) {
+		scan_balance = SCAN_EQUAL;
 		goto out;
 	}
 
@@ -1689,30 +1709,32 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
 		get_lru_size(lruvec, LRU_INACTIVE_FILE);
 
+	/*
+	 * If it's foreseeable that reclaiming the file cache won't be
+	 * enough to get the zone back into a desirable shape, we have
+	 * to swap. Better start now and leave the - probably heavily
+	 * thrashing - remaining file pages alone.
+	 */
 	if (global_reclaim(sc)) {
 		free = zone_page_state(zone, NR_FREE_PAGES);
 		if (unlikely(file + free <= high_wmark_pages(zone))) {
-			/*
-			 * If we have very few page cache pages, force-scan
-			 * anon pages.
-			 */
-			fraction[0] = 1;
-			fraction[1] = 0;
-			denominator = 1;
-			goto out;
-		} else if (!inactive_file_is_low_global(zone)) {
-			/*
-			 * There is enough inactive page cache, do not
-			 * reclaim anything from the working set right now.
-			 */
-			fraction[0] = 0;
-			fraction[1] = 1;
-			denominator = 1;
+			scan_balance = SCAN_ANON;
 			goto out;
 		}
 	}
 
 	/*
+	 * There is enough inactive page cache, do not reclaim
+	 * anything from the anonymous working set right now.
+	 */
+	if (!inactive_file_is_low(lruvec)) {
+		scan_balance = SCAN_FILE;
+		goto out;
+	}
+
+	scan_balance = SCAN_FRACT;
+
+	/*
 	 * With swappiness at 100, anonymous and file have the same priority.
 	 * This scanning priority is essentially the inverse of IO cost.
 	 */
@@ -1759,19 +1781,92 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 out:
 	for_each_evictable_lru(lru) {
 		int file = is_file_lru(lru);
+		unsigned long size;
 		unsigned long scan;
 
-		scan = get_lru_size(lruvec, lru);
-		if (sc->priority || noswap || !vmscan_swappiness(sc)) {
-			scan >>= sc->priority;
+		size = get_lru_size(lruvec, lru);
+		scan = size >> sc->priority;
+
 		if (!scan && force_scan)
-				scan = SWAP_CLUSTER_MAX;
+			scan = min(size, SWAP_CLUSTER_MAX);
+
+		switch (scan_balance) {
+		case SCAN_EQUAL:
+			/* Scan lists relative to size */
+			break;
+		case SCAN_FRACT:
+			/*
+			 * Scan types proportional to swappiness and
+			 * their relative recent reclaim efficiency.
+			 */
 			scan = div64_u64(scan * fraction[file], denominator);
+			break;
+		case SCAN_FILE:
+		case SCAN_ANON:
+			/* Scan one type exclusively */
+			if ((scan_balance == SCAN_FILE) != file)
+				scan = 0;
+			break;
+		default:
+			/* Look ma, no brain */
+			BUG();
 		}
 		nr[lru] = scan;
 	}
 }
 
+/*
+ * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
+ */
+static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+	unsigned long nr[NR_LRU_LISTS];
+	unsigned long nr_to_scan;
+	enum lru_list lru;
+	unsigned long nr_reclaimed = 0;
+	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+	struct blk_plug plug;
+
+	get_scan_count(lruvec, sc, nr);
+
+	blk_start_plug(&plug);
+	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+					nr[LRU_INACTIVE_FILE]) {
+		for_each_evictable_lru(lru) {
+			if (nr[lru]) {
+				nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
+				nr[lru] -= nr_to_scan;
+
+				nr_reclaimed += shrink_list(lru, nr_to_scan,
+							    lruvec, sc);
+			}
+		}
+		/*
+		 * On large memory systems, scan >> priority can become
+		 * really large. This is fine for the starting priority;
+		 * we want to put equal scanning pressure on each zone.
+		 * However, if the VM has a harder time of freeing pages,
+		 * with multiple processes reclaiming pages, the total
+		 * freeing target can get unreasonably large.
+		 */
+		if (nr_reclaimed >= nr_to_reclaim &&
+		    sc->priority < DEF_PRIORITY)
+			break;
+	}
+	blk_finish_plug(&plug);
+	sc->nr_reclaimed += nr_reclaimed;
+
+	/*
+	 * Even if we did not try to evict anon pages at all, we want to
+	 * rebalance the anon lru active/inactive ratio.
+	 */
+	if (inactive_anon_is_low(lruvec))
+		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+				   sc, LRU_ACTIVE_ANON);
+
+	throttle_vm_writeout(sc->gfp_mask);
+}
+
 /* Use reclaim/compaction for costly allocs or under memory pressure */
 static bool in_reclaim_compaction(struct scan_control *sc)
 {
@@ -1790,7 +1885,7 @@ static bool in_reclaim_compaction(struct scan_control *sc)
  * calls try_to_compact_zone() that it will have enough free pages to succeed.
  * It will give up earlier than that if there is difficulty reclaiming pages.
  */
-static inline bool should_continue_reclaim(struct lruvec *lruvec,
+static inline bool should_continue_reclaim(struct zone *zone,
 					    unsigned long nr_reclaimed,
 					    unsigned long nr_scanned,
 					    struct scan_control *sc)
@@ -1830,15 +1925,15 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
 	 * inactive lists are large enough, continue reclaiming
 	 */
 	pages_for_compaction = (2UL << sc->order);
-	inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
-	if (nr_swap_pages > 0)
-		inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
+	inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
+	if (get_nr_swap_pages() > 0)
+		inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
 	if (sc->nr_reclaimed < pages_for_compaction &&
 			inactive_lru_pages > pages_for_compaction)
 		return true;
 
 	/* If compaction would go ahead or the allocation would succeed, stop */
-	switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) {
+	switch (compaction_suitable(zone, sc->order)) {
 	case COMPACT_PARTIAL:
 	case COMPACT_CONTINUE:
 		return false;
@@ -1847,98 +1942,48 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
 	}
 }
 
-/*
- * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
- */
-static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
 {
-	unsigned long nr[NR_LRU_LISTS];
-	unsigned long nr_to_scan;
-	enum lru_list lru;
 	unsigned long nr_reclaimed, nr_scanned;
-	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
-	struct blk_plug plug;
-
-restart:
-	nr_reclaimed = 0;
-	nr_scanned = sc->nr_scanned;
-	get_scan_count(lruvec, sc, nr);
-
-	blk_start_plug(&plug);
-	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
-					nr[LRU_INACTIVE_FILE]) {
-		for_each_evictable_lru(lru) {
-			if (nr[lru]) {
-				nr_to_scan = min_t(unsigned long,
-						   nr[lru], SWAP_CLUSTER_MAX);
-				nr[lru] -= nr_to_scan;
-
-				nr_reclaimed += shrink_list(lru, nr_to_scan,
-							    lruvec, sc);
-			}
-		}
-		/*
-		 * On large memory systems, scan >> priority can become
-		 * really large. This is fine for the starting priority;
-		 * we want to put equal scanning pressure on each zone.
-		 * However, if the VM has a harder time of freeing pages,
-		 * with multiple processes reclaiming pages, the total
-		 * freeing target can get unreasonably large.
-		 */
-		if (nr_reclaimed >= nr_to_reclaim &&
-		    sc->priority < DEF_PRIORITY)
-			break;
-	}
-	blk_finish_plug(&plug);
-	sc->nr_reclaimed += nr_reclaimed;
 
-	/*
-	 * Even if we did not try to evict anon pages at all, we want to
-	 * rebalance the anon lru active/inactive ratio.
-	 */
-	if (inactive_anon_is_low(lruvec))
-		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
-				   sc, LRU_ACTIVE_ANON);
-
-	/* reclaim/compaction might need reclaim to continue */
-	if (should_continue_reclaim(lruvec, nr_reclaimed,
-				    sc->nr_scanned - nr_scanned, sc))
-		goto restart;
+	do {
+		struct mem_cgroup *root = sc->target_mem_cgroup;
+		struct mem_cgroup_reclaim_cookie reclaim = {
+			.zone = zone,
+			.priority = sc->priority,
+		};
+		struct mem_cgroup *memcg;
 
-	throttle_vm_writeout(sc->gfp_mask);
-}
+		nr_reclaimed = sc->nr_reclaimed;
+		nr_scanned = sc->nr_scanned;
 
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
-{
-	struct mem_cgroup *root = sc->target_mem_cgroup;
-	struct mem_cgroup_reclaim_cookie reclaim = {
-		.zone = zone,
-		.priority = sc->priority,
-	};
-	struct mem_cgroup *memcg;
+		memcg = mem_cgroup_iter(root, NULL, &reclaim);
+		do {
+			struct lruvec *lruvec;
 
-	memcg = mem_cgroup_iter(root, NULL, &reclaim);
-	do {
-		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
 			shrink_lruvec(lruvec, sc);
 
 			/*
-		 * Limit reclaim has historically picked one memcg and
-		 * scanned it with decreasing priority levels until
-		 * nr_to_reclaim had been reclaimed. This priority
-		 * cycle is thus over after a single memcg.
-		 *
-		 * Direct reclaim and kswapd, on the other hand, have
-		 * to scan all memory cgroups to fulfill the overall
-		 * scan target for the zone.
+			 * Direct reclaim and kswapd have to scan all memory
+			 * cgroups to fulfill the overall scan target for the
+			 * zone.
+			 *
+			 * Limit reclaim, on the other hand, only cares about
+			 * nr_to_reclaim pages to be reclaimed and it will
+			 * retry with decreasing priority if one round over the
+			 * whole hierarchy is not sufficient.
 			 */
-		if (!global_reclaim(sc)) {
-			mem_cgroup_iter_break(root, memcg);
-			break;
-		}
-		memcg = mem_cgroup_iter(root, memcg, &reclaim);
-	} while (memcg);
+			if (!global_reclaim(sc) &&
+					sc->nr_reclaimed >= sc->nr_to_reclaim) {
+				mem_cgroup_iter_break(root, memcg);
+				break;
+			}
+			memcg = mem_cgroup_iter(root, memcg, &reclaim);
+		} while (memcg);
+	} while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
+					 sc->nr_scanned - nr_scanned, sc));
 }
 
 /* Returns true if compaction should go ahead for a high-order request */
@@ -1958,7 +2003,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
 	 * a reasonable chance of completing and allocating the page
 	 */
 	balance_gap = min(low_wmark_pages(zone),
-		(zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+		(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
 			KSWAPD_ZONE_BALANCE_GAP_RATIO);
 	watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
 	watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
@@ -2150,6 +2195,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 			goto out;
 
 		/*
+		 * If we're getting trouble reclaiming, start doing
+		 * writepage even in laptop mode.
+		 */
+		if (sc->priority < DEF_PRIORITY - 2)
+			sc->may_writepage = 1;
+
+		/*
 		 * Try to write back as many pages as we just scanned. This
 		 * tends to cause slow streaming writers to write data to the
 		 * disk smoothly, at the dirtying rate, which is nice. But
@@ -2300,7 +2352,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 {
 	unsigned long nr_reclaimed;
 	struct scan_control sc = {
-		.gfp_mask = gfp_mask,
+		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
 		.may_writepage = !laptop_mode,
 		.nr_to_reclaim = SWAP_CLUSTER_MAX,
 		.may_unmap = 1,
@@ -2473,7 +2525,7 @@ static bool zone_balanced(struct zone *zone, int order,
  */
 static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 {
-	unsigned long present_pages = 0;
+	unsigned long managed_pages = 0;
 	unsigned long balanced_pages = 0;
 	int i;
 
@@ -2484,7 +2536,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 		if (!populated_zone(zone))
 			continue;
 
-		present_pages += zone->present_pages;
+		managed_pages += zone->managed_pages;
 
 		/*
 		 * A special case here:
@@ -2494,18 +2546,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 		 * they must be considered balanced here as well!
 		 */
 		if (zone->all_unreclaimable) {
-			balanced_pages += zone->present_pages;
+			balanced_pages += zone->managed_pages;
 			continue;
 		}
 
 		if (zone_balanced(zone, order, 0, i))
-			balanced_pages += zone->present_pages;
+			balanced_pages += zone->managed_pages;
 		else if (!order)
 			return false;
 	}
 
 	if (order)
-		return balanced_pages >= (present_pages >> 2);
+		return balanced_pages >= (managed_pages >> 2);
 	else
 		return true;
 }
@@ -2564,7 +2616,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 							int *classzone_idx)
 {
-	struct zone *unbalanced_zone;
+	bool pgdat_is_balanced = false;
 	int i;
 	int end_zone = 0;	/* Inclusive. 0 = ZONE_DMA */
 	unsigned long total_scanned;
@@ -2595,9 +2647,6 @@ loop_again:
 
 	do {
 		unsigned long lru_pages = 0;
-		int has_under_min_watermark_zone = 0;
-
-		unbalanced_zone = NULL;
 
 		/*
 		 * Scan in the highmem->dma direction for the highest
@@ -2638,8 +2687,11 @@ loop_again:
 				zone_clear_flag(zone, ZONE_CONGESTED);
 			}
 		}
-		if (i < 0)
+
+		if (i < 0) {
+			pgdat_is_balanced = true;
 			goto out;
+		}
 
 		for (i = 0; i <= end_zone; i++) {
 			struct zone *zone = pgdat->node_zones + i;
@@ -2689,7 +2741,7 @@ loop_again:
 			 * of the zone, whichever is smaller.
 			 */
 			balance_gap = min(low_wmark_pages(zone),
-				(zone->present_pages +
+				(zone->managed_pages +
 					KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
 				KSWAPD_ZONE_BALANCE_GAP_RATIO);
 			/*
@@ -2720,12 +2772,10 @@ loop_again:
 			}
 
 			/*
-			 * If we've done a decent amount of scanning and
-			 * the reclaim ratio is low, start doing writepage
-			 * even in laptop mode
+			 * If we're getting trouble reclaiming, start doing
+			 * writepage even in laptop mode.
 			 */
-			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
-			    total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
+			if (sc.priority < DEF_PRIORITY - 2)
 				sc.may_writepage = 1;
 
 			if (zone->all_unreclaimable) {
@@ -2734,17 +2784,7 @@ loop_again:
 				continue;
 			}
 
-			if (!zone_balanced(zone, testorder, 0, end_zone)) {
-				unbalanced_zone = zone;
-				/*
-				 * We are still under min water mark. This
-				 * means that we have a GFP_ATOMIC allocation
-				 * failure risk. Hurry up!
-				 */
-				if (!zone_watermark_ok_safe(zone, order,
-					    min_wmark_pages(zone), end_zone, 0))
-					has_under_min_watermark_zone = 1;
-			} else {
+			if (zone_balanced(zone, testorder, 0, end_zone))
 				/*
 				 * If a zone reaches its high watermark,
 				 * consider it to be no longer congested. It's
@@ -2753,8 +2793,6 @@ loop_again:
 				 * speculatively avoid congestion waits
 				 */
 				zone_clear_flag(zone, ZONE_CONGESTED);
-			}
-
 		}
 
 		/*
@@ -2766,17 +2804,9 @@ loop_again:
 		    pfmemalloc_watermark_ok(pgdat))
 			wake_up(&pgdat->pfmemalloc_wait);
 
-		if (pgdat_balanced(pgdat, order, *classzone_idx))
+		if (pgdat_balanced(pgdat, order, *classzone_idx)) {
+			pgdat_is_balanced = true;
 			break;		/* kswapd: all done */
-		/*
-		 * OK, kswapd is getting into trouble. Take a nap, then take
-		 * another pass across the zones.
-		 */
-		if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
-			if (has_under_min_watermark_zone)
-				count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
-			else if (unbalanced_zone)
-				wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
 		}
 
 		/*
@@ -2788,9 +2818,9 @@ loop_again:
 		if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
 			break;
 	} while (--sc.priority >= 0);
-out:
 
-	if (!pgdat_balanced(pgdat, order, *classzone_idx)) {
+out:
+	if (!pgdat_is_balanced) {
 		cond_resched();
 
 		try_to_freeze();
@@ -3053,7 +3083,7 @@ unsigned long global_reclaimable_pages(void)
 	nr = global_page_state(NR_ACTIVE_FILE) +
 	     global_page_state(NR_INACTIVE_FILE);
 
-	if (nr_swap_pages > 0)
+	if (get_nr_swap_pages() > 0)
 		nr += global_page_state(NR_ACTIVE_ANON) +
 		      global_page_state(NR_INACTIVE_ANON);
 
@@ -3067,7 +3097,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
 	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
 	     zone_page_state(zone, NR_INACTIVE_FILE);
 
-	if (nr_swap_pages > 0)
+	if (get_nr_swap_pages() > 0)
 		nr += zone_page_state(zone, NR_ACTIVE_ANON) +
 		      zone_page_state(zone, NR_INACTIVE_ANON);
 
@@ -3280,9 +3310,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
 		.may_swap = 1,
-		.nr_to_reclaim = max_t(unsigned long, nr_pages,
-				       SWAP_CLUSTER_MAX),
-		.gfp_mask = gfp_mask,
+		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
+		.gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
 		.order = order,
 		.priority = ZONE_RECLAIM_PRIORITY,
 	};