author     Vineeth Remanan Pillai <vpillai@digitalocean.com>    2019-03-05 18:47:03 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>       2019-03-06 00:07:18 -0500
commit     b56a2d8af9147a4efe4011b60d93779c0461ca97
tree       1595323c4696df56df8018357c6c97a0aef14f7a    /mm/swapfile.c
parent     c5bf121e4350a933bd431385e6fcb72a898ecc68
mm: rid swapoff of quadratic complexity
This patch was initially posted by Kelley Nielsen. Reposting the patch
with all review comments addressed and with minor modifications and
optimizations. Also, folding in the fixes offered by Hugh Dickins and
Huang Ying. Tests were rerun and the commit message updated with the new
results.

try_to_unuse() is of quadratic complexity, with a lot of wasted effort.
It unuses swap entries one by one, potentially iterating over all the
page tables for all the processes in the system for each one.

This new proposed implementation of try_to_unuse simplifies its
complexity to linear. It iterates over the system's mms once, unusing
all the affected entries as it walks each set of page tables. It also
makes similar changes to shmem_unuse.

Improvement

swapoff was called on a swap partition containing about 6G of data, in a
VM (8 CPUs, 16G RAM), and calls to unuse_pte_range() were counted.

Present implementation....about 1200M calls (8 min, avg 80% cpu util).
Prototype.................about 9.0K calls (3 min, avg 5% cpu util).

Details

In shmem_unuse(), iterate over the shmem_swaplist and, for each
shmem_inode_info that contains a swap entry, pass it to
shmem_unuse_inode(), along with the swap type. In shmem_unuse_inode(),
iterate over its associated xarray, and store the index and value of
each swap entry in an array for passing to shmem_swapin_page() outside
of the RCU critical section.

In try_to_unuse(), instead of iterating over the entries in the type and
unusing them one by one, perhaps walking all the page tables for all the
processes for each one, iterate over the mmlist, making one pass. Pass
each mm to unuse_mm() to begin its page table walk, and during the walk,
unuse all the ptes that have backing store in the swap type received by
try_to_unuse(). After the walk, check the type for orphaned swap entries
with find_next_to_unuse(), and remove them from the swap cache. If
find_next_to_unuse() starts over at the beginning of the type, repeat
the check of the shmem_swaplist and the walk a maximum of three times.

Change unuse_mm() and the intervening walk functions down to
unuse_pte_range() to take the type as a parameter, and to iterate over
their entire range, calling the next function down on every iteration.
In unuse_pte_range(), make a swap entry from each pte in the range using
the passed-in type. If it has backing store in the type, call
swapin_readahead() to retrieve the page and pass it to unuse_pte().

Pass the count of pages_to_unuse down the page table walks in
try_to_unuse(), and return from the walk when the desired number of
pages has been swapped back in.

Link: http://lkml.kernel.org/r/20190114153129.4852-2-vpillai@digitalocean.com
Signed-off-by: Vineeth Remanan Pillai <vpillai@digitalocean.com>
Signed-off-by: Kelley Nielsen <kelleynnn@gmail.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
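To make the complexity change concrete, below is a minimal, self-contained C
sketch (illustration only, not kernel code and not part of this patch): a
single linear pass over a list of address spaces brings back every entry of
one swap type as the page tables are walked, instead of searching every
process once per swap entry. All names in the sketch (toy_mm, toy_unuse_mm,
toy_try_to_unuse, NPTES) are hypothetical.

    /*
     * Toy model of the linear-pass idea only; none of this is kernel code.
     */
    #include <stdio.h>

    #define NPTES 8

    struct toy_mm {
            int swap_type[NPTES];   /* -1 = resident page, >= 0 = swapped out */
            struct toy_mm *next;    /* stands in for the kernel's mmlist */
    };

    /* Walk one address space once, "swapping back in" every entry of @type. */
    static unsigned long toy_unuse_mm(struct toy_mm *mm, int type)
    {
            unsigned long unused = 0;
            int i;

            for (i = 0; i < NPTES; i++) {
                    if (mm->swap_type[i] == type) {
                            mm->swap_type[i] = -1;
                            unused++;
                    }
            }
            return unused;
    }

    /* Linear swapoff model: one pass over the mm list unuses the whole type. */
    static unsigned long toy_try_to_unuse(struct toy_mm *mmlist, int type)
    {
            struct toy_mm *mm;
            unsigned long total = 0;

            for (mm = mmlist; mm; mm = mm->next)
                    total += toy_unuse_mm(mm, type);
            return total;
    }

    int main(void)
    {
            struct toy_mm b = { { 1, -1, 0, 0, -1, 1, 0, -1 }, NULL };
            struct toy_mm a = { { 0, 0, -1, 1, 0, -1, 1, 0 }, &b };

            printf("unused %lu entries of type 0\n", toy_try_to_unuse(&a, 0));
            return 0;
    }

In the patch itself the per-mm walk is unuse_mm() -> unuse_vma() ->
unuse_p4d_range() -> unuse_pud_range() -> unuse_pmd_range() ->
unuse_pte_range(), and leftover entries are swept out of the swap cache
afterwards with find_next_to_unuse(); the sketch only models the single-pass
structure that replaces the quadratic search.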
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--    mm/swapfile.c    433
1 file changed, 163 insertions(+), 270 deletions(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index dbac1d49469d..6de46984d59d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1799,44 +1799,77 @@ out_nolock:
 }
 
 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                        unsigned long addr, unsigned long end,
-                       swp_entry_t entry, struct page *page)
+                       unsigned int type, bool frontswap,
+                       unsigned long *fs_pages_to_unuse)
 {
-        pte_t swp_pte = swp_entry_to_pte(entry);
+        struct page *page;
+        swp_entry_t entry;
         pte_t *pte;
+        struct swap_info_struct *si;
+        unsigned long offset;
         int ret = 0;
+        volatile unsigned char *swap_map;
 
-        /*
-         * We don't actually need pte lock while scanning for swp_pte: since
-         * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
-         * page table while we're scanning; though it could get zapped, and on
-         * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
-         * of unmatched parts which look like swp_pte, so unuse_pte must
-         * recheck under pte lock.  Scanning without pte lock lets it be
-         * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
-         */
+        si = swap_info[type];
         pte = pte_offset_map(pmd, addr);
         do {
-                /*
-                 * swapoff spends a _lot_ of time in this loop!
-                 * Test inline before going to call unuse_pte.
-                 */
-                if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
-                        pte_unmap(pte);
-                        ret = unuse_pte(vma, pmd, addr, entry, page);
-                        if (ret)
-                                goto out;
-                        pte = pte_offset_map(pmd, addr);
+                struct vm_fault vmf;
+
+                if (!is_swap_pte(*pte))
+                        continue;
+
+                entry = pte_to_swp_entry(*pte);
+                if (swp_type(entry) != type)
+                        continue;
+
+                offset = swp_offset(entry);
+                if (frontswap && !frontswap_test(si, offset))
+                        continue;
+
+                pte_unmap(pte);
+                swap_map = &si->swap_map[offset];
+                vmf.vma = vma;
+                vmf.address = addr;
+                vmf.pmd = pmd;
+                page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf);
+                if (!page) {
+                        if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
+                                goto try_next;
+                        return -ENOMEM;
+                }
+
+                lock_page(page);
+                wait_on_page_writeback(page);
+                ret = unuse_pte(vma, pmd, addr, entry, page);
+                if (ret < 0) {
+                        unlock_page(page);
+                        put_page(page);
+                        goto out;
+                }
+
+                try_to_free_swap(page);
+                unlock_page(page);
+                put_page(page);
+
+                if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
+                        ret = FRONTSWAP_PAGES_UNUSED;
+                        goto out;
                 }
+try_next:
+                pte = pte_offset_map(pmd, addr);
         } while (pte++, addr += PAGE_SIZE, addr != end);
         pte_unmap(pte - 1);
+
+        ret = 0;
 out:
         return ret;
 }
 
 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                                 unsigned long addr, unsigned long end,
-                                swp_entry_t entry, struct page *page)
+                                unsigned int type, bool frontswap,
+                                unsigned long *fs_pages_to_unuse)
 {
         pmd_t *pmd;
         unsigned long next;
@@ -1848,7 +1881,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                 next = pmd_addr_end(addr, end);
                 if (pmd_none_or_trans_huge_or_clear_bad(pmd))
                         continue;
-                ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+                ret = unuse_pte_range(vma, pmd, addr, next, type,
+                                      frontswap, fs_pages_to_unuse);
                 if (ret)
                         return ret;
         } while (pmd++, addr = next, addr != end);
@@ -1857,7 +1891,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 
 static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
                                 unsigned long addr, unsigned long end,
-                                swp_entry_t entry, struct page *page)
+                                unsigned int type, bool frontswap,
+                                unsigned long *fs_pages_to_unuse)
 {
         pud_t *pud;
         unsigned long next;
@@ -1868,7 +1903,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
                 next = pud_addr_end(addr, end);
                 if (pud_none_or_clear_bad(pud))
                         continue;
-                ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+                ret = unuse_pmd_range(vma, pud, addr, next, type,
+                                      frontswap, fs_pages_to_unuse);
                 if (ret)
                         return ret;
         } while (pud++, addr = next, addr != end);
@@ -1877,7 +1913,8 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
 
 static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
                                 unsigned long addr, unsigned long end,
-                                swp_entry_t entry, struct page *page)
+                                unsigned int type, bool frontswap,
+                                unsigned long *fs_pages_to_unuse)
 {
         p4d_t *p4d;
         unsigned long next;
@@ -1888,78 +1925,66 @@ static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
                 next = p4d_addr_end(addr, end);
                 if (p4d_none_or_clear_bad(p4d))
                         continue;
-                ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
+                ret = unuse_pud_range(vma, p4d, addr, next, type,
+                                      frontswap, fs_pages_to_unuse);
                 if (ret)
                         return ret;
         } while (p4d++, addr = next, addr != end);
         return 0;
 }
 
-static int unuse_vma(struct vm_area_struct *vma,
-                                swp_entry_t entry, struct page *page)
+static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
+                     bool frontswap, unsigned long *fs_pages_to_unuse)
 {
         pgd_t *pgd;
         unsigned long addr, end, next;
         int ret;
 
-        if (page_anon_vma(page)) {
-                addr = page_address_in_vma(page, vma);
-                if (addr == -EFAULT)
-                        return 0;
-                else
-                        end = addr + PAGE_SIZE;
-        } else {
-                addr = vma->vm_start;
-                end = vma->vm_end;
-        }
+        addr = vma->vm_start;
+        end = vma->vm_end;
 
         pgd = pgd_offset(vma->vm_mm, addr);
         do {
                 next = pgd_addr_end(addr, end);
                 if (pgd_none_or_clear_bad(pgd))
                         continue;
-                ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
+                ret = unuse_p4d_range(vma, pgd, addr, next, type,
+                                      frontswap, fs_pages_to_unuse);
                 if (ret)
                         return ret;
         } while (pgd++, addr = next, addr != end);
         return 0;
 }
 
-static int unuse_mm(struct mm_struct *mm,
-                                swp_entry_t entry, struct page *page)
+static int unuse_mm(struct mm_struct *mm, unsigned int type,
+                    bool frontswap, unsigned long *fs_pages_to_unuse)
 {
         struct vm_area_struct *vma;
         int ret = 0;
 
-        if (!down_read_trylock(&mm->mmap_sem)) {
-                /*
-                 * Activate page so shrink_inactive_list is unlikely to unmap
-                 * its ptes while lock is dropped, so swapoff can make progress.
-                 */
-                activate_page(page);
-                unlock_page(page);
-                down_read(&mm->mmap_sem);
-                lock_page(page);
-        }
+        down_read(&mm->mmap_sem);
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
-                if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
-                        break;
+                if (vma->anon_vma) {
+                        ret = unuse_vma(vma, type, frontswap,
+                                        fs_pages_to_unuse);
+                        if (ret)
+                                break;
+                }
                 cond_resched();
         }
         up_read(&mm->mmap_sem);
-        return (ret < 0)? ret: 0;
+        return ret;
 }
 
 /*
  * Scan swap_map (or frontswap_map if frontswap parameter is true)
- * from current position to next entry still in use.
- * Recycle to start on reaching the end, returning 0 when empty.
+ * from current position to next entry still in use. Return 0
+ * if there are no inuse entries after prev till end of the map.
  */
 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
                                        unsigned int prev, bool frontswap)
 {
-        unsigned int max = si->max;
-        unsigned int i = prev;
+        unsigned int i;
         unsigned char count;
 
         /*
@@ -1968,20 +1993,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
          * hits are okay, and sys_swapoff() has already prevented new
          * allocations from this area (while holding swap_lock).
          */
-        for (;;) {
-                if (++i >= max) {
-                        if (!prev) {
-                                i = 0;
-                                break;
-                        }
-                        /*
-                         * No entries in use at top of swap_map,
-                         * loop back to start and recheck there.
-                         */
-                        max = prev + 1;
-                        prev = 0;
-                        i = 1;
-                }
+        for (i = prev + 1; i < si->max; i++) {
                 count = READ_ONCE(si->swap_map[i]);
                 if (count && swap_count(count) != SWAP_MAP_BAD)
                         if (!frontswap || frontswap_test(si, i))
@@ -1989,240 +2001,121 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
                 if ((i % LATENCY_LIMIT) == 0)
                         cond_resched();
         }
+
+        if (i == si->max)
+                i = 0;
+
         return i;
 }
 
 /*
- * We completely avoid races by reading each swap page in advance,
- * and then search for the process using it.  All the necessary
- * page table adjustments can then be made atomically.
- *
- * if the boolean frontswap is true, only unuse pages_to_unuse pages;
+ * If the boolean frontswap is true, only unuse pages_to_unuse pages;
  * pages_to_unuse==0 means all pages; ignored if frontswap is false
  */
+#define SWAP_UNUSE_MAX_TRIES 3
 int try_to_unuse(unsigned int type, bool frontswap,
                  unsigned long pages_to_unuse)
 {
+        struct mm_struct *prev_mm;
+        struct mm_struct *mm;
+        struct list_head *p;
+        int retval = 0;
         struct swap_info_struct *si = swap_info[type];
-        struct mm_struct *start_mm;
-        volatile unsigned char *swap_map; /* swap_map is accessed without
-                                           * locking. Mark it as volatile
-                                           * to prevent compiler doing
-                                           * something odd.
-                                           */
-        unsigned char swcount;
         struct page *page;
         swp_entry_t entry;
-        unsigned int i = 0;
-        int retval = 0;
+        unsigned int i;
+        int retries = 0;
 
-        /*
-         * When searching mms for an entry, a good strategy is to
-         * start at the first mm we freed the previous entry from
-         * (though actually we don't notice whether we or coincidence
-         * freed the entry).  Initialize this start_mm with a hold.
-         *
-         * A simpler strategy would be to start at the last mm we
-         * freed the previous entry from; but that would take less
-         * advantage of mmlist ordering, which clusters forked mms
-         * together, child after parent.  If we race with dup_mmap(), we
-         * prefer to resolve parent before child, lest we miss entries
-         * duplicated after we scanned child: using last mm would invert
-         * that.
-         */
-        start_mm = &init_mm;
-        mmget(&init_mm);
+        if (!si->inuse_pages)
+                return 0;
 
-        /*
-         * Keep on scanning until all entries have gone.  Usually,
-         * one pass through swap_map is enough, but not necessarily:
-         * there are races when an instance of an entry might be missed.
-         */
-        while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
+        if (!frontswap)
+                pages_to_unuse = 0;
+
+retry:
+        retval = shmem_unuse(type, frontswap, &pages_to_unuse);
+        if (retval)
+                goto out;
+
+        prev_mm = &init_mm;
+        mmget(prev_mm);
+
+        spin_lock(&mmlist_lock);
+        p = &init_mm.mmlist;
+        while ((p = p->next) != &init_mm.mmlist) {
                 if (signal_pending(current)) {
                         retval = -EINTR;
                         break;
                 }
 
-                /*
-                 * Get a page for the entry, using the existing swap
-                 * cache page if there is one.  Otherwise, get a clean
-                 * page and read the swap into it.
-                 */
-                swap_map = &si->swap_map[i];
-                entry = swp_entry(type, i);
-                page = read_swap_cache_async(entry,
-                                        GFP_HIGHUSER_MOVABLE, NULL, 0, false);
-                if (!page) {
-                        /*
-                         * Either swap_duplicate() failed because entry
-                         * has been freed independently, and will not be
-                         * reused since sys_swapoff() already disabled
-                         * allocation from here, or alloc_page() failed.
-                         */
-                        swcount = *swap_map;
-                        /*
-                         * We don't hold lock here, so the swap entry could be
-                         * SWAP_MAP_BAD (when the cluster is discarding).
-                         * Instead of fail out, We can just skip the swap
-                         * entry because swapoff will wait for discarding
-                         * finish anyway.
-                         */
-                        if (!swcount || swcount == SWAP_MAP_BAD)
-                                continue;
-                        retval = -ENOMEM;
-                        break;
-                }
+                mm = list_entry(p, struct mm_struct, mmlist);
+                if (!mmget_not_zero(mm))
+                        continue;
+                spin_unlock(&mmlist_lock);
+                mmput(prev_mm);
+                prev_mm = mm;
+                retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
 
-                /*
-                 * Don't hold on to start_mm if it looks like exiting.
-                 */
-                if (atomic_read(&start_mm->mm_users) == 1) {
-                        mmput(start_mm);
-                        start_mm = &init_mm;
-                        mmget(&init_mm);
+                if (retval) {
+                        mmput(prev_mm);
+                        goto out;
                 }
 
                 /*
-                 * Wait for and lock page.  When do_swap_page races with
-                 * try_to_unuse, do_swap_page can handle the fault much
-                 * faster than try_to_unuse can locate the entry.  This
-                 * apparently redundant "wait_on_page_locked" lets try_to_unuse
-                 * defer to do_swap_page in such a case - in some tests,
-                 * do_swap_page and try_to_unuse repeatedly compete.
-                 */
-                wait_on_page_locked(page);
-                wait_on_page_writeback(page);
-                lock_page(page);
-                wait_on_page_writeback(page);
-
-                /*
-                 * Remove all references to entry.
+                 * Make sure that we aren't completely killing
+                 * interactive performance.
                  */
-                swcount = *swap_map;
-                if (swap_count(swcount) == SWAP_MAP_SHMEM) {
-                        retval = shmem_unuse(entry, page);
-                        /* page has already been unlocked and released */
-                        if (retval < 0)
-                                break;
-                        continue;
-                }
-                if (swap_count(swcount) && start_mm != &init_mm)
-                        retval = unuse_mm(start_mm, entry, page);
-
-                if (swap_count(*swap_map)) {
-                        int set_start_mm = (*swap_map >= swcount);
-                        struct list_head *p = &start_mm->mmlist;
-                        struct mm_struct *new_start_mm = start_mm;
-                        struct mm_struct *prev_mm = start_mm;
-                        struct mm_struct *mm;
-
-                        mmget(new_start_mm);
-                        mmget(prev_mm);
-                        spin_lock(&mmlist_lock);
-                        while (swap_count(*swap_map) && !retval &&
-                                        (p = p->next) != &start_mm->mmlist) {
-                                mm = list_entry(p, struct mm_struct, mmlist);
-                                if (!mmget_not_zero(mm))
-                                        continue;
-                                spin_unlock(&mmlist_lock);
-                                mmput(prev_mm);
-                                prev_mm = mm;
+                cond_resched();
+                spin_lock(&mmlist_lock);
+        }
+        spin_unlock(&mmlist_lock);
 
-                                cond_resched();
+        mmput(prev_mm);
 
-                                swcount = *swap_map;
-                                if (!swap_count(swcount)) /* any usage ? */
-                                        ;
-                                else if (mm == &init_mm)
-                                        set_start_mm = 1;
-                                else
-                                        retval = unuse_mm(mm, entry, page);
-
-                                if (set_start_mm && *swap_map < swcount) {
-                                        mmput(new_start_mm);
-                                        mmget(mm);
-                                        new_start_mm = mm;
-                                        set_start_mm = 0;
-                                }
-                                spin_lock(&mmlist_lock);
-                        }
-                        spin_unlock(&mmlist_lock);
-                        mmput(prev_mm);
-                        mmput(start_mm);
-                        start_mm = new_start_mm;
-                }
-                if (retval) {
-                        unlock_page(page);
-                        put_page(page);
-                        break;
-                }
+        i = 0;
+        while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
 
-                /*
-                 * If a reference remains (rare), we would like to leave
-                 * the page in the swap cache; but try_to_unmap could
-                 * then re-duplicate the entry once we drop page lock,
-                 * so we might loop indefinitely; also, that page could
-                 * not be swapped out to other storage meanwhile.  So:
-                 * delete from cache even if there's another reference,
-                 * after ensuring that the data has been saved to disk -
-                 * since if the reference remains (rarer), it will be
-                 * read from disk into another page.  Splitting into two
-                 * pages would be incorrect if swap supported "shared
-                 * private" pages, but they are handled by tmpfs files.
-                 *
-                 * Given how unuse_vma() targets one particular offset
-                 * in an anon_vma, once the anon_vma has been determined,
-                 * this splitting happens to be just what is needed to
-                 * handle where KSM pages have been swapped out: re-reading
-                 * is unnecessarily slow, but we can fix that later on.
-                 */
-                if (swap_count(*swap_map) &&
-                     PageDirty(page) && PageSwapCache(page)) {
-                        struct writeback_control wbc = {
-                                .sync_mode = WB_SYNC_NONE,
-                        };
-
-                        swap_writepage(compound_head(page), &wbc);
-                        lock_page(page);
-                        wait_on_page_writeback(page);
-                }
+                entry = swp_entry(type, i);
+                page = find_get_page(swap_address_space(entry), i);
+                if (!page)
+                        continue;
 
                 /*
                  * It is conceivable that a racing task removed this page from
-                 * swap cache just before we acquired the page lock at the top,
-                 * or while we dropped it in unuse_mm().  The page might even
-                 * be back in swap cache on another swap area: that we must not
-                 * delete, since it may not have been written out to swap yet.
-                 */
-                if (PageSwapCache(page) &&
-                    likely(page_private(page) == entry.val) &&
-                    (!PageTransCompound(page) ||
-                     !swap_page_trans_huge_swapped(si, entry)))
-                        delete_from_swap_cache(compound_head(page));
-
-                /*
-                 * So we could skip searching mms once swap count went
-                 * to 1, we did not mark any present ptes as dirty: must
-                 * mark page dirty so shrink_page_list will preserve it.
+                 * swap cache just before we acquired the page lock. The page
+                 * might even be back in swap cache on another swap area. But
+                 * that is okay, try_to_free_swap() only removes stale pages.
                  */
-                SetPageDirty(page);
+                lock_page(page);
+                wait_on_page_writeback(page);
+                try_to_free_swap(page);
                 unlock_page(page);
                 put_page(page);
 
                 /*
-                 * Make sure that we aren't completely killing
-                 * interactive performance.
+                 * For frontswap, we just need to unuse pages_to_unuse, if
+                 * it was specified. Need not check frontswap again here as
+                 * we already zeroed out pages_to_unuse if not frontswap.
                  */
-                cond_resched();
-                if (frontswap && pages_to_unuse > 0) {
-                        if (!--pages_to_unuse)
-                                break;
-                }
+                if (pages_to_unuse && --pages_to_unuse == 0)
+                        goto out;
         }
 
-        mmput(start_mm);
-        return retval;
+        /*
+         * Lets check again to see if there are still swap entries in the map.
+         * If yes, we would need to do retry the unuse logic again.
+         * Under global memory pressure, swap entries can be reinserted back
+         * into process space after the mmlist loop above passes over them.
+         * Its not worth continuosuly retrying to unuse the swap in this case.
+         * So we try SWAP_UNUSE_MAX_TRIES times.
+         */
+        if (++retries >= SWAP_UNUSE_MAX_TRIES)
+                retval = -EBUSY;
+        else if (si->inuse_pages)
+                goto retry;
+
+out:
+        return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
 }
 
 /*