Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--   mm/vmscan.c   397
1 file changed, 213 insertions(+), 184 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 196709f5ee58..88c5fed8b9a4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -128,7 +128,7 @@ struct scan_control {
  * From 0 .. 100. Higher means more swappy.
  */
 int vm_swappiness = 60;
-long vm_total_pages;    /* The total number of pages which the VM controls */
+unsigned long vm_total_pages;   /* The total number of pages which the VM controls */
 
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
@@ -1579,16 +1579,6 @@ static inline int inactive_anon_is_low(struct lruvec *lruvec)
 }
 #endif
 
-static int inactive_file_is_low_global(struct zone *zone)
-{
-        unsigned long active, inactive;
-
-        active = zone_page_state(zone, NR_ACTIVE_FILE);
-        inactive = zone_page_state(zone, NR_INACTIVE_FILE);
-
-        return (active > inactive);
-}
-
 /**
  * inactive_file_is_low - check if file pages need to be deactivated
  * @lruvec: LRU vector to check
@@ -1605,10 +1595,13 @@ static int inactive_file_is_low_global(struct zone *zone)
  */
 static int inactive_file_is_low(struct lruvec *lruvec)
 {
-        if (!mem_cgroup_disabled())
-                return mem_cgroup_inactive_file_is_low(lruvec);
+        unsigned long inactive;
+        unsigned long active;
+
+        inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE);
+        active = get_lru_size(lruvec, LRU_ACTIVE_FILE);
 
-        return inactive_file_is_low_global(lruvec_zone(lruvec));
+        return active > inactive;
 }
 
 static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
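The rewritten inactive_file_is_low() above compares the lruvec's own file LRU sizes directly instead of deferring to a memcg helper or to zone-wide counters. A minimal userspace sketch of the heuristic, with an invented lruvec_model struct standing in for the kernel's types:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the kernel's per-lruvec LRU size bookkeeping. */
struct lruvec_model {
        unsigned long nr_inactive_file;
        unsigned long nr_active_file;
};

/* Deactivate active file pages only while active > inactive. */
static bool inactive_file_is_low_model(const struct lruvec_model *l)
{
        return l->nr_active_file > l->nr_inactive_file;
}

int main(void)
{
        struct lruvec_model l = { .nr_inactive_file = 100, .nr_active_file = 300 };

        printf("deactivate active file pages? %s\n",
               inactive_file_is_low_model(&l) ? "yes" : "no");
        return 0;
}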
@@ -1638,6 +1631,13 @@ static int vmscan_swappiness(struct scan_control *sc)
         return mem_cgroup_swappiness(sc->target_mem_cgroup);
 }
 
+enum scan_balance {
+        SCAN_EQUAL,
+        SCAN_FRACT,
+        SCAN_ANON,
+        SCAN_FILE,
+};
+
 /*
  * Determine how aggressively the anon and file LRU lists should be
  * scanned. The relative value of each set of LRU lists is determined
@@ -1650,15 +1650,16 @@ static int vmscan_swappiness(struct scan_control *sc)
 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
                            unsigned long *nr)
 {
-        unsigned long anon, file, free;
+        struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+        u64 fraction[2];
+        u64 denominator = 0;    /* gcc */
+        struct zone *zone = lruvec_zone(lruvec);
         unsigned long anon_prio, file_prio;
+        enum scan_balance scan_balance;
+        unsigned long anon, file, free;
+        bool force_scan = false;
         unsigned long ap, fp;
-        struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
-        u64 fraction[2], denominator;
         enum lru_list lru;
-        int noswap = 0;
-        bool force_scan = false;
-        struct zone *zone = lruvec_zone(lruvec);
 
         /*
          * If the zone or memcg is small, nr[l] can be 0. This
@@ -1676,11 +1677,30 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
                 force_scan = true;
 
         /* If we have no swap space, do not bother scanning anon pages. */
-        if (!sc->may_swap || (nr_swap_pages <= 0)) {
-                noswap = 1;
-                fraction[0] = 0;
-                fraction[1] = 1;
-                denominator = 1;
+        if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
+                scan_balance = SCAN_FILE;
+                goto out;
+        }
+
+        /*
+         * Global reclaim will swap to prevent OOM even with no
+         * swappiness, but memcg users want to use this knob to
+         * disable swapping for individual groups completely when
+         * using the memory controller's swap limit feature would be
+         * too expensive.
+         */
+        if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
+                scan_balance = SCAN_FILE;
+                goto out;
+        }
+
+        /*
+         * Do not apply any pressure balancing cleverness when the
+         * system is close to OOM, scan both anon and file equally
+         * (unless the swappiness setting disagrees with swapping).
+         */
+        if (!sc->priority && vmscan_swappiness(sc)) {
+                scan_balance = SCAN_EQUAL;
                 goto out;
         }
 
@@ -1689,30 +1709,32 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
         file = get_lru_size(lruvec, LRU_ACTIVE_FILE) +
                 get_lru_size(lruvec, LRU_INACTIVE_FILE);
 
+        /*
+         * If it's foreseeable that reclaiming the file cache won't be
+         * enough to get the zone back into a desirable shape, we have
+         * to swap. Better start now and leave the - probably heavily
+         * thrashing - remaining file pages alone.
+         */
         if (global_reclaim(sc)) {
                 free = zone_page_state(zone, NR_FREE_PAGES);
                 if (unlikely(file + free <= high_wmark_pages(zone))) {
-                        /*
-                         * If we have very few page cache pages, force-scan
-                         * anon pages.
-                         */
-                        fraction[0] = 1;
-                        fraction[1] = 0;
-                        denominator = 1;
-                        goto out;
-                } else if (!inactive_file_is_low_global(zone)) {
-                        /*
-                         * There is enough inactive page cache, do not
-                         * reclaim anything from the working set right now.
-                         */
-                        fraction[0] = 0;
-                        fraction[1] = 1;
-                        denominator = 1;
+                        scan_balance = SCAN_ANON;
                         goto out;
                 }
         }
 
         /*
+         * There is enough inactive page cache, do not reclaim
+         * anything from the anonymous working set right now.
+         */
+        if (!inactive_file_is_low(lruvec)) {
+                scan_balance = SCAN_FILE;
+                goto out;
+        }
+
+        scan_balance = SCAN_FRACT;
+
+        /*
          * With swappiness at 100, anonymous and file have the same priority.
          * This scanning priority is essentially the inverse of IO cost.
          */
@@ -1759,19 +1781,92 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 out:
         for_each_evictable_lru(lru) {
                 int file = is_file_lru(lru);
+                unsigned long size;
                 unsigned long scan;
 
-                scan = get_lru_size(lruvec, lru);
-                if (sc->priority || noswap || !vmscan_swappiness(sc)) {
-                        scan >>= sc->priority;
-                        if (!scan && force_scan)
-                                scan = SWAP_CLUSTER_MAX;
+                size = get_lru_size(lruvec, lru);
+                scan = size >> sc->priority;
+
+                if (!scan && force_scan)
+                        scan = min(size, SWAP_CLUSTER_MAX);
+
+                switch (scan_balance) {
+                case SCAN_EQUAL:
+                        /* Scan lists relative to size */
+                        break;
+                case SCAN_FRACT:
+                        /*
+                         * Scan types proportional to swappiness and
+                         * their relative recent reclaim efficiency.
+                         */
                         scan = div64_u64(scan * fraction[file], denominator);
+                        break;
+                case SCAN_FILE:
+                case SCAN_ANON:
+                        /* Scan one type exclusively */
+                        if ((scan_balance == SCAN_FILE) != file)
+                                scan = 0;
+                        break;
+                default:
+                        /* Look ma, no brain */
+                        BUG();
                 }
                 nr[lru] = scan;
         }
 }
 
+/*
+ * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
+ */
+static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+{
+        unsigned long nr[NR_LRU_LISTS];
+        unsigned long nr_to_scan;
+        enum lru_list lru;
+        unsigned long nr_reclaimed = 0;
+        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+        struct blk_plug plug;
+
+        get_scan_count(lruvec, sc, nr);
+
+        blk_start_plug(&plug);
+        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
+                                        nr[LRU_INACTIVE_FILE]) {
+                for_each_evictable_lru(lru) {
+                        if (nr[lru]) {
+                                nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
+                                nr[lru] -= nr_to_scan;
+
+                                nr_reclaimed += shrink_list(lru, nr_to_scan,
+                                                            lruvec, sc);
+                        }
+                }
+                /*
+                 * On large memory systems, scan >> priority can become
+                 * really large. This is fine for the starting priority;
+                 * we want to put equal scanning pressure on each zone.
+                 * However, if the VM has a harder time of freeing pages,
+                 * with multiple processes reclaiming pages, the total
+                 * freeing target can get unreasonably large.
+                 */
+                if (nr_reclaimed >= nr_to_reclaim &&
+                    sc->priority < DEF_PRIORITY)
+                        break;
+        }
+        blk_finish_plug(&plug);
+        sc->nr_reclaimed += nr_reclaimed;
+
+        /*
+         * Even if we did not try to evict anon pages at all, we want to
+         * rebalance the anon lru active/inactive ratio.
+         */
+        if (inactive_anon_is_low(lruvec))
+                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+                                   sc, LRU_ACTIVE_ANON);
+
+        throttle_vm_writeout(sc->gfp_mask);
+}
+
 /* Use reclaim/compaction for costly allocs or under memory pressure */
 static bool in_reclaim_compaction(struct scan_control *sc)
 {
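The new out: loop above turns the old ad-hoc fraction handling into an explicit scan_balance policy. The following standalone sketch models that switch outside the kernel; the scan_target() helper, the sample sizes and the fraction values are made up for illustration and are not kernel APIs:

#include <stdint.h>
#include <stdio.h>

enum scan_balance { SCAN_EQUAL, SCAN_FRACT, SCAN_ANON, SCAN_FILE };

#define SWAP_CLUSTER_MAX 32UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

/*
 * Model of the out: loop: derive a per-list scan target from the list
 * size, the priority shift and the chosen scan_balance policy.
 */
static unsigned long scan_target(unsigned long size, int priority,
                                 int is_file, enum scan_balance balance,
                                 uint64_t fraction[2], uint64_t denominator,
                                 int force_scan)
{
        unsigned long scan = size >> priority;

        if (!scan && force_scan)
                scan = min_ul(size, SWAP_CLUSTER_MAX);

        switch (balance) {
        case SCAN_EQUAL:
                /* scan both types relative to their size only */
                break;
        case SCAN_FRACT:
                /* proportional to swappiness and recent reclaim efficiency */
                scan = scan * fraction[is_file] / denominator;
                break;
        case SCAN_FILE:
        case SCAN_ANON:
                /* scan one type exclusively */
                if ((balance == SCAN_FILE) != is_file)
                        scan = 0;
                break;
        }
        return scan;
}

int main(void)
{
        /* e.g. recent rotations made file pages twice as cheap to reclaim */
        uint64_t fraction[2] = { 100, 200 }, denominator = 300;
        unsigned long anon = 1UL << 20, file = 1UL << 18;  /* list sizes in pages */
        int priority = 10;

        printf("anon scan: %lu\n",
               scan_target(anon, priority, 0, SCAN_FRACT, fraction, denominator, 0));
        printf("file scan: %lu\n",
               scan_target(file, priority, 1, SCAN_FRACT, fraction, denominator, 0));
        return 0;
}

With the fractions chosen above, a file list of the same size as an anon list would receive twice the scan pressure, which is the kind of bias the SCAN_FRACT case expresses via fraction[] and denominator.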
@@ -1790,7 +1885,7 @@ static bool in_reclaim_compaction(struct scan_control *sc)
  * calls try_to_compact_zone() that it will have enough free pages to succeed.
  * It will give up earlier than that if there is difficulty reclaiming pages.
  */
-static inline bool should_continue_reclaim(struct lruvec *lruvec,
+static inline bool should_continue_reclaim(struct zone *zone,
                                         unsigned long nr_reclaimed,
                                         unsigned long nr_scanned,
                                         struct scan_control *sc)
@@ -1830,15 +1925,15 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
          * inactive lists are large enough, continue reclaiming
          */
         pages_for_compaction = (2UL << sc->order);
-        inactive_lru_pages = get_lru_size(lruvec, LRU_INACTIVE_FILE);
-        if (nr_swap_pages > 0)
-                inactive_lru_pages += get_lru_size(lruvec, LRU_INACTIVE_ANON);
+        inactive_lru_pages = zone_page_state(zone, NR_INACTIVE_FILE);
+        if (get_nr_swap_pages() > 0)
+                inactive_lru_pages += zone_page_state(zone, NR_INACTIVE_ANON);
         if (sc->nr_reclaimed < pages_for_compaction &&
             inactive_lru_pages > pages_for_compaction)
                 return true;
 
         /* If compaction would go ahead or the allocation would succeed, stop */
-        switch (compaction_suitable(lruvec_zone(lruvec), sc->order)) {
+        switch (compaction_suitable(zone, sc->order)) {
         case COMPACT_PARTIAL:
         case COMPACT_CONTINUE:
                 return false;
@@ -1847,98 +1942,48 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec,
         }
 }
 
-/*
- * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
- */
-static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static void shrink_zone(struct zone *zone, struct scan_control *sc)
 {
-        unsigned long nr[NR_LRU_LISTS];
-        unsigned long nr_to_scan;
-        enum lru_list lru;
         unsigned long nr_reclaimed, nr_scanned;
-        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
-        struct blk_plug plug;
-
-restart:
-        nr_reclaimed = 0;
-        nr_scanned = sc->nr_scanned;
-        get_scan_count(lruvec, sc, nr);
-
-        blk_start_plug(&plug);
-        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
-                                        nr[LRU_INACTIVE_FILE]) {
-                for_each_evictable_lru(lru) {
-                        if (nr[lru]) {
-                                nr_to_scan = min_t(unsigned long,
-                                                   nr[lru], SWAP_CLUSTER_MAX);
-                                nr[lru] -= nr_to_scan;
-
-                                nr_reclaimed += shrink_list(lru, nr_to_scan,
-                                                            lruvec, sc);
-                        }
-                }
-                /*
-                 * On large memory systems, scan >> priority can become
-                 * really large. This is fine for the starting priority;
-                 * we want to put equal scanning pressure on each zone.
-                 * However, if the VM has a harder time of freeing pages,
-                 * with multiple processes reclaiming pages, the total
-                 * freeing target can get unreasonably large.
-                 */
-                if (nr_reclaimed >= nr_to_reclaim &&
-                    sc->priority < DEF_PRIORITY)
-                        break;
-        }
-        blk_finish_plug(&plug);
-        sc->nr_reclaimed += nr_reclaimed;
 
-        /*
-         * Even if we did not try to evict anon pages at all, we want to
-         * rebalance the anon lru active/inactive ratio.
-         */
-        if (inactive_anon_is_low(lruvec))
-                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
-                                   sc, LRU_ACTIVE_ANON);
-
-        /* reclaim/compaction might need reclaim to continue */
-        if (should_continue_reclaim(lruvec, nr_reclaimed,
-                                    sc->nr_scanned - nr_scanned, sc))
-                goto restart;
+        do {
+                struct mem_cgroup *root = sc->target_mem_cgroup;
+                struct mem_cgroup_reclaim_cookie reclaim = {
+                        .zone = zone,
+                        .priority = sc->priority,
+                };
+                struct mem_cgroup *memcg;
 
-        throttle_vm_writeout(sc->gfp_mask);
-}
+                nr_reclaimed = sc->nr_reclaimed;
+                nr_scanned = sc->nr_scanned;
 
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
-{
-        struct mem_cgroup *root = sc->target_mem_cgroup;
-        struct mem_cgroup_reclaim_cookie reclaim = {
-                .zone = zone,
-                .priority = sc->priority,
-        };
-        struct mem_cgroup *memcg;
+                memcg = mem_cgroup_iter(root, NULL, &reclaim);
+                do {
+                        struct lruvec *lruvec;
 
-        memcg = mem_cgroup_iter(root, NULL, &reclaim);
-        do {
-                struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+                        lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
-                shrink_lruvec(lruvec, sc);
+                        shrink_lruvec(lruvec, sc);
 
-                /*
-                 * Limit reclaim has historically picked one memcg and
-                 * scanned it with decreasing priority levels until
-                 * nr_to_reclaim had been reclaimed. This priority
-                 * cycle is thus over after a single memcg.
-                 *
-                 * Direct reclaim and kswapd, on the other hand, have
-                 * to scan all memory cgroups to fulfill the overall
-                 * scan target for the zone.
-                 */
-                if (!global_reclaim(sc)) {
-                        mem_cgroup_iter_break(root, memcg);
-                        break;
-                }
-                memcg = mem_cgroup_iter(root, memcg, &reclaim);
-        } while (memcg);
+                        /*
+                         * Direct reclaim and kswapd have to scan all memory
+                         * cgroups to fulfill the overall scan target for the
+                         * zone.
+                         *
+                         * Limit reclaim, on the other hand, only cares about
+                         * nr_to_reclaim pages to be reclaimed and it will
+                         * retry with decreasing priority if one round over the
+                         * whole hierarchy is not sufficient.
+                         */
+                        if (!global_reclaim(sc) &&
+                                        sc->nr_reclaimed >= sc->nr_to_reclaim) {
+                                mem_cgroup_iter_break(root, memcg);
+                                break;
+                        }
+                        memcg = mem_cgroup_iter(root, memcg, &reclaim);
+                } while (memcg);
+        } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed,
+                                         sc->nr_scanned - nr_scanned, sc));
 }
 
 /* Returns true if compaction should go ahead for a high-order request */
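After this hunk, shrink_zone() nests the memcg walk inside a should_continue_reclaim() loop, so reclaim/compaction retries operate on the whole zone rather than restarting a single lruvec. A toy userspace model of that control flow; the group list, batch size and helpers below are invented for illustration and are not kernel code:

#include <stdbool.h>
#include <stdio.h>

struct group_model { unsigned long reclaimable; };

static unsigned long shrink_group(struct group_model *g, unsigned long want)
{
        unsigned long got = g->reclaimable < want ? g->reclaimable : want;

        g->reclaimable -= got;  /* "reclaim" up to the per-round batch */
        return got;
}

/* Keep going while the last round was productive and the target is unmet. */
static bool should_continue(unsigned long round, unsigned long total,
                            unsigned long target)
{
        return round > 0 && total < target;
}

int main(void)
{
        struct group_model groups[] = { { 40 }, { 10 }, { 90 } };
        const unsigned long target = 128;
        unsigned long total = 0, round;

        do {
                round = 0;
                /* inner loop: visit every group, like the memcg iterator */
                for (unsigned long i = 0; i < sizeof(groups) / sizeof(groups[0]); i++)
                        round += shrink_group(&groups[i], 32);
                total += round;
                printf("round reclaimed %lu pages, total %lu\n", round, total);
        } while (should_continue(round, total, target));

        return 0;
}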
@@ -1958,7 +2003,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
          * a reasonable chance of completing and allocating the page
          */
         balance_gap = min(low_wmark_pages(zone),
-                (zone->present_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
+                (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
                         KSWAPD_ZONE_BALANCE_GAP_RATIO);
         watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
         watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
@@ -2150,6 +2195,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                         goto out;
 
                 /*
+                 * If we're getting trouble reclaiming, start doing
+                 * writepage even in laptop mode.
+                 */
+                if (sc->priority < DEF_PRIORITY - 2)
+                        sc->may_writepage = 1;
+
+                /*
                  * Try to write back as many pages as we just scanned. This
                  * tends to cause slow streaming writers to write data to the
                  * disk smoothly, at the dirtying rate, which is nice. But
@@ -2300,7 +2352,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 {
         unsigned long nr_reclaimed;
         struct scan_control sc = {
-                .gfp_mask = gfp_mask,
+                .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
                 .may_writepage = !laptop_mode,
                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
                 .may_unmap = 1,
@@ -2473,7 +2525,7 @@ static bool zone_balanced(struct zone *zone, int order,
  */
 static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 {
-        unsigned long present_pages = 0;
+        unsigned long managed_pages = 0;
         unsigned long balanced_pages = 0;
         int i;
 
@@ -2484,7 +2536,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
                 if (!populated_zone(zone))
                         continue;
 
-                present_pages += zone->present_pages;
+                managed_pages += zone->managed_pages;
 
                 /*
                  * A special case here:
@@ -2494,18 +2546,18 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
                  * they must be considered balanced here as well!
                  */
                 if (zone->all_unreclaimable) {
-                        balanced_pages += zone->present_pages;
+                        balanced_pages += zone->managed_pages;
                         continue;
                 }
 
                 if (zone_balanced(zone, order, 0, i))
-                        balanced_pages += zone->present_pages;
+                        balanced_pages += zone->managed_pages;
                 else if (!order)
                         return false;
         }
 
         if (order)
-                return balanced_pages >= (present_pages >> 2);
+                return balanced_pages >= (managed_pages >> 2);
         else
                 return true;
 }
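pgdat_balanced() now accounts in managed_pages. The rule it implements: for order-0 every populated zone must be balanced, while for higher orders it is enough that at least a quarter of the node's managed pages sit in balanced zones (the all_unreclaimable special case, which counts as balanced, is omitted below). A standalone sketch with fabricated zone data:

#include <stdbool.h>
#include <stdio.h>

struct zone_model { unsigned long managed_pages; bool balanced; };

static bool pgdat_balanced_model(const struct zone_model *zones, int nr,
                                 int order)
{
        unsigned long managed = 0, balanced = 0;

        for (int i = 0; i < nr; i++) {
                managed += zones[i].managed_pages;
                if (zones[i].balanced)
                        balanced += zones[i].managed_pages;
                else if (!order)
                        return false;   /* order-0: one bad zone is enough */
        }
        /* higher orders: a quarter of the managed pages must be balanced */
        return order ? balanced >= (managed >> 2) : true;
}

int main(void)
{
        struct zone_model zones[] = {
                { .managed_pages = 4096,   .balanced = true  },  /* DMA     */
                { .managed_pages = 262144, .balanced = false },  /* NORMAL  */
                { .managed_pages = 131072, .balanced = true  },  /* MOVABLE */
        };

        printf("order-0 balanced: %d\n", pgdat_balanced_model(zones, 3, 0));
        printf("order-3 balanced: %d\n", pgdat_balanced_model(zones, 3, 3));
        return 0;
}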
@@ -2564,7 +2616,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
                                                         int *classzone_idx)
 {
-        struct zone *unbalanced_zone;
+        bool pgdat_is_balanced = false;
         int i;
         int end_zone = 0;       /* Inclusive. 0 = ZONE_DMA */
         unsigned long total_scanned;
@@ -2595,9 +2647,6 @@ loop_again:
 
         do {
                 unsigned long lru_pages = 0;
-                int has_under_min_watermark_zone = 0;
-
-                unbalanced_zone = NULL;
 
                 /*
                  * Scan in the highmem->dma direction for the highest
@@ -2638,8 +2687,11 @@ loop_again:
                                 zone_clear_flag(zone, ZONE_CONGESTED);
                         }
                 }
-                if (i < 0)
+
+                if (i < 0) {
+                        pgdat_is_balanced = true;
                         goto out;
+                }
 
                 for (i = 0; i <= end_zone; i++) {
                         struct zone *zone = pgdat->node_zones + i;
@@ -2689,7 +2741,7 @@ loop_again:
                          * of the zone, whichever is smaller.
                          */
                         balance_gap = min(low_wmark_pages(zone),
-                                (zone->present_pages +
+                                (zone->managed_pages +
                                         KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
                                 KSWAPD_ZONE_BALANCE_GAP_RATIO);
                         /*
@@ -2720,12 +2772,10 @@ loop_again:
                         }
 
                         /*
-                         * If we've done a decent amount of scanning and
-                         * the reclaim ratio is low, start doing writepage
-                         * even in laptop mode
+                         * If we're getting trouble reclaiming, start doing
+                         * writepage even in laptop mode.
                          */
-                        if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
-                            total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
+                        if (sc.priority < DEF_PRIORITY - 2)
                                 sc.may_writepage = 1;
 
                         if (zone->all_unreclaimable) {
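Both the direct-reclaim path (earlier hunk in do_try_to_free_pages) and kswapd above now switch may_writepage on purely from the priority level instead of the old scanned/reclaimed ratio. A quick way to see where that threshold falls, assuming DEF_PRIORITY is 12 as in this kernel:

#include <stdio.h>

#define DEF_PRIORITY 12

int main(void)
{
        /* priority counts down from DEF_PRIORITY as reclaim gets harder */
        for (int priority = DEF_PRIORITY; priority >= 0; priority--)
                printf("priority %2d: may_writepage %s\n", priority,
                       priority < DEF_PRIORITY - 2 ? "forced on" : "left as configured");
        return 0;
}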
@@ -2734,17 +2784,7 @@ loop_again:
                                 continue;
                         }
 
-                        if (!zone_balanced(zone, testorder, 0, end_zone)) {
-                                unbalanced_zone = zone;
-                                /*
-                                 * We are still under min water mark. This
-                                 * means that we have a GFP_ATOMIC allocation
-                                 * failure risk. Hurry up!
-                                 */
-                                if (!zone_watermark_ok_safe(zone, order,
-                                            min_wmark_pages(zone), end_zone, 0))
-                                        has_under_min_watermark_zone = 1;
-                        } else {
+                        if (zone_balanced(zone, testorder, 0, end_zone))
                                 /*
                                  * If a zone reaches its high watermark,
                                  * consider it to be no longer congested. It's
@@ -2753,8 +2793,6 @@ loop_again:
                                  * speculatively avoid congestion waits
                                  */
                                 zone_clear_flag(zone, ZONE_CONGESTED);
-                        }
-
                 }
 
                 /*
@@ -2766,17 +2804,9 @@ loop_again:
                     pfmemalloc_watermark_ok(pgdat))
                         wake_up(&pgdat->pfmemalloc_wait);
 
-                if (pgdat_balanced(pgdat, order, *classzone_idx))
+                if (pgdat_balanced(pgdat, order, *classzone_idx)) {
+                        pgdat_is_balanced = true;
                         break;          /* kswapd: all done */
-                /*
-                 * OK, kswapd is getting into trouble. Take a nap, then take
-                 * another pass across the zones.
-                 */
-                if (total_scanned && (sc.priority < DEF_PRIORITY - 2)) {
-                        if (has_under_min_watermark_zone)
-                                count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT);
-                        else if (unbalanced_zone)
-                                wait_iff_congested(unbalanced_zone, BLK_RW_ASYNC, HZ/10);
                 }
 
                 /*
@@ -2788,9 +2818,9 @@ loop_again:
                 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
                         break;
         } while (--sc.priority >= 0);
-out:
 
-        if (!pgdat_balanced(pgdat, order, *classzone_idx)) {
+out:
+        if (!pgdat_is_balanced) {
                 cond_resched();
 
                 try_to_freeze();
@@ -3053,7 +3083,7 @@ unsigned long global_reclaimable_pages(void)
         nr = global_page_state(NR_ACTIVE_FILE) +
              global_page_state(NR_INACTIVE_FILE);
 
-        if (nr_swap_pages > 0)
+        if (get_nr_swap_pages() > 0)
                 nr += global_page_state(NR_ACTIVE_ANON) +
                       global_page_state(NR_INACTIVE_ANON);
 
@@ -3067,7 +3097,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
         nr = zone_page_state(zone, NR_ACTIVE_FILE) +
              zone_page_state(zone, NR_INACTIVE_FILE);
 
-        if (nr_swap_pages > 0)
+        if (get_nr_swap_pages() > 0)
                 nr += zone_page_state(zone, NR_ACTIVE_ANON) +
                       zone_page_state(zone, NR_INACTIVE_ANON);
 
@@ -3280,9 +3310,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
                 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
                 .may_swap = 1,
-                .nr_to_reclaim = max_t(unsigned long, nr_pages,
-                                       SWAP_CLUSTER_MAX),
-                .gfp_mask = gfp_mask,
+                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
+                .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
                 .order = order,
                 .priority = ZONE_RECLAIM_PRIORITY,
         };
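The hunks in try_to_free_pages() and __zone_reclaim() route gfp_mask through memalloc_noio_flags(), which is meant to drop the I/O-related allocation flags while the current task has marked itself as not allowed to do I/O during allocation. A rough userspace model of that masking; the flag values and the task flag below are invented and only illustrate the idea:

#include <stdio.h>

/* Invented flag values; the kernel's gfp flags and PF_ bits differ. */
#define GFP_IO          0x1u
#define GFP_FS          0x2u
#define GFP_KERNEL      (GFP_IO | GFP_FS | 0x4u)
#define TASK_NOIO       0x1u

static unsigned int current_task_flags = TASK_NOIO;

/* Model of the masking: drop I/O capabilities if the task asked for no I/O. */
static unsigned int noio_flags(unsigned int gfp)
{
        if (current_task_flags & TASK_NOIO)
                gfp &= ~(GFP_IO | GFP_FS);
        return gfp;
}

int main(void)
{
        unsigned int gfp = GFP_KERNEL;

        printf("before: %#x, after: %#x\n", gfp, noio_flags(gfp));
        return 0;
}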