author		Nick Piggin <npiggin@suse.de>	2007-10-16 04:24:59 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-16 12:42:54 -0400
commit		08291429cfa6258c4cd95d8833beb40f828b194e (patch)
tree		50a206f0f0e7a5400b44073107ff00517e6f50ac
parent		4a9e5ef1f4f15205e477817a5cefc34bd3f65f55 (diff)
mm: fix pagecache write deadlocks
Modify the core write() code so that it won't take a pagefault while holding a
lock on the pagecache page.  There are a number of different deadlocks possible
if we try to do such a thing:

1.  generic_buffered_write
2.   lock_page
3.   prepare_write
4.   unlock_page+vmtruncate
5.   copy_from_user
6.    mmap_sem(r)
7.     handle_mm_fault
8.      lock_page (filemap_nopage)
9.   commit_write
10.  unlock_page

a. sys_munmap / sys_mlock / others
b.  mmap_sem(w)
c.   make_pages_present
d.    get_user_pages
e.     handle_mm_fault
f.      lock_page (filemap_nopage)

2,8     - recursive deadlock if page is same
2,8;2,8 - ABBA deadlock if page is different
2,6;b,f - ABBA deadlock if page is same

The solution is as follows:

1.  If we find the destination page is uptodate, continue as normal, but use
    atomic usercopies which do not take pagefaults and do not zero the uncopied
    tail of the destination.  The destination is already uptodate, so we can
    commit_write the full length even if there was a partial copy: it does not
    matter that the tail was not modified, because if it is dirtied and written
    back to disk it will not cause any problems (uptodate *means* that the
    destination page is as new or newer than the copy on disk).

1a. The above requires that fault_in_pages_readable correctly returns access
    information, because atomic usercopies cannot distinguish non-present pages
    in a readable mapping from the lack of a readable mapping.

2.  If we find the destination page is non uptodate, unlock it (this could be
    made slightly more optimal), then allocate a temporary page to copy the
    source data into.  Relock the destination page and continue with the copy.
    However, instead of a usercopy (which might take a fault), copy the data
    from the pinned temporary page via the kernel address space.

(Also, rename maxlen to seglen, because the old name was confusing.)

This increases the CPU/memory copy cost by almost 50% on the affected
workloads.  That will be solved by introducing a new set of pagecache write
aops in a subsequent patch.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
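For readers unfamiliar with the pattern, here is a minimal sketch of solution
step 1 above (the uptodate-page fast path): fault the source in before taking
the page lock, then copy with pagefaults disabled so that a short copy is
reported instead of a recursive or ABBA deadlock.  The helper name
write_one_chunk(), the single flat source buffer, and the BUG_ON are
illustrative simplifications and are not part of this patch; the authoritative
code is the generic_file_buffered_write() hunk in mm/filemap.c below.

/*
 * Illustrative sketch only (not from this patch): the uptodate-page
 * fast path.  write_one_chunk() is a hypothetical helper; the real
 * logic lives in generic_file_buffered_write() and also handles
 * iovecs and the non-uptodate temporary-page case.
 */
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/uaccess.h>

static size_t write_one_chunk(struct page *page, unsigned long offset,
                              const char __user *buf, unsigned long bytes)
{
        char *kaddr;
        size_t left;

        /*
         * Fault the source in while we hold no page lock, so any fault
         * taken here cannot recurse into lock_page() on our destination.
         */
        if (fault_in_pages_readable(buf, bytes))
                return 0;               /* caller turns this into -EFAULT */

        lock_page(page);
        BUG_ON(!PageUptodate(page));    /* sketch covers the fast path only */

        /*
         * With the page lock held, a usercopy must not fault: disable
         * pagefaults and use the atomic (non-faulting) copy variant,
         * which reports how many bytes it could not copy.
         */
        pagefault_disable();
        kaddr = kmap_atomic(page, KM_USER0);
        left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
        kunmap_atomic(kaddr, KM_USER0);
        pagefault_enable();

        unlock_page(page);

        /*
         * A short copy is harmless: the page was already uptodate, so the
         * unmodified tail still holds valid data.  The caller advances by
         * the returned amount and retries the remainder.
         */
        return bytes - left;
}

The non-uptodate path (solution step 2) cannot use this shortcut: a short
atomic copy would leave an undefined tail on a page that is about to become
uptodate, which is why the patch below bounces the data through a temporary
page and copies it via the kernel address space instead.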
-rw-r--r--	include/linux/pagemap.h	11
-rw-r--r--	mm/filemap.c		122
2 files changed, 112 insertions, 21 deletions
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 862fc07dc6c0..8f1e390fd71b 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -219,6 +219,9 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size)
 {
         int ret;
 
+        if (unlikely(size == 0))
+                return 0;
+
         /*
          * Writing zeroes into userspace here is OK, because we know that if
          * the zero gets there, we'll be overwriting it.
@@ -238,19 +241,23 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size)
         return ret;
 }
 
-static inline void fault_in_pages_readable(const char __user *uaddr, int size)
+static inline int fault_in_pages_readable(const char __user *uaddr, int size)
 {
         volatile char c;
         int ret;
 
+        if (unlikely(size == 0))
+                return 0;
+
         ret = __get_user(c, uaddr);
         if (ret == 0) {
                 const char __user *end = uaddr + size - 1;
 
                 if (((unsigned long)uaddr & PAGE_MASK) !=
                                 ((unsigned long)end & PAGE_MASK))
-                        __get_user(c, end);
+                        ret = __get_user(c, end);
         }
+        return ret;
 }
 
 #endif /* _LINUX_PAGEMAP_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index c59d5b3cd99a..557fd887254f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1826,11 +1826,12 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                 filemap_set_next_iovec(&cur_iov, nr_segs, &iov_offset, written);
 
         do {
+                struct page *src_page;
                 struct page *page;
                 pgoff_t index;          /* Pagecache index for current page */
                 unsigned long offset;   /* Offset into pagecache page */
-                unsigned long maxlen;   /* Bytes remaining in current iovec */
-                size_t bytes;           /* Bytes to write to page */
+                unsigned long seglen;   /* Bytes remaining in current iovec */
+                unsigned long bytes;    /* Bytes to write to page */
                 size_t copied;          /* Bytes copied from user */
 
                 buf = cur_iov->iov_base + iov_offset;
@@ -1840,20 +1841,30 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                 if (bytes > count)
                         bytes = count;
 
-                maxlen = cur_iov->iov_len - iov_offset;
-                if (maxlen > bytes)
-                        maxlen = bytes;
+                /*
+                 * a non-NULL src_page indicates that we're doing the
+                 * copy via get_user_pages and kmap.
+                 */
+                src_page = NULL;
+
+                seglen = cur_iov->iov_len - iov_offset;
+                if (seglen > bytes)
+                        seglen = bytes;
 
-#ifndef CONFIG_DEBUG_VM
                 /*
                  * Bring in the user page that we will copy from _first_.
                  * Otherwise there's a nasty deadlock on copying from the
                  * same page as we're writing to, without it being marked
                  * up-to-date.
+                 *
+                 * Not only is this an optimisation, but it is also required
+                 * to check that the address is actually valid, when atomic
+                 * usercopies are used, below.
                  */
-                fault_in_pages_readable(buf, maxlen);
-#endif
-
+                if (unlikely(fault_in_pages_readable(buf, seglen))) {
+                        status = -EFAULT;
+                        break;
+                }
 
                 page = __grab_cache_page(mapping, index);
                 if (!page) {
@@ -1861,32 +1872,104 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                         break;
                 }
 
+                /*
+                 * non-uptodate pages cannot cope with short copies, and we
+                 * cannot take a pagefault with the destination page locked.
+                 * So pin the source page to copy it.
+                 */
+                if (!PageUptodate(page)) {
+                        unlock_page(page);
+
+                        src_page = alloc_page(GFP_KERNEL);
+                        if (!src_page) {
+                                page_cache_release(page);
+                                status = -ENOMEM;
+                                break;
+                        }
+
+                        /*
+                         * Cannot get_user_pages with a page locked for the
+                         * same reason as we can't take a page fault with a
+                         * page locked (as explained below).
+                         */
+                        copied = filemap_copy_from_user(src_page, offset,
+                                        cur_iov, nr_segs, iov_offset, bytes);
+                        if (unlikely(copied == 0)) {
+                                status = -EFAULT;
+                                page_cache_release(page);
+                                page_cache_release(src_page);
+                                break;
+                        }
+                        bytes = copied;
+
+                        lock_page(page);
+                        /*
+                         * Can't handle the page going uptodate here, because
+                         * that means we would use non-atomic usercopies, which
+                         * zero out the tail of the page, which can cause
+                         * zeroes to become transiently visible. We could just
+                         * use a non-zeroing copy, but the APIs aren't too
+                         * consistent.
+                         */
+                        if (unlikely(!page->mapping || PageUptodate(page))) {
+                                unlock_page(page);
+                                page_cache_release(page);
+                                page_cache_release(src_page);
+                                continue;
+                        }
+
+                }
+
                 status = a_ops->prepare_write(file, page, offset, offset+bytes);
                 if (unlikely(status))
                         goto fs_write_aop_error;
 
-                copied = filemap_copy_from_user(page, offset,
+                if (!src_page) {
+                        /*
+                         * Must not enter the pagefault handler here, because
+                         * we hold the page lock, so we might recursively
+                         * deadlock on the same lock, or get an ABBA deadlock
+                         * against a different lock, or against the mmap_sem
+                         * (which nests outside the page lock). So increment
+                         * preempt count, and use _atomic usercopies.
+                         *
+                         * The page is uptodate so we are OK to encounter a
+                         * short copy: if unmodified parts of the page are
+                         * marked dirty and written out to disk, it doesn't
+                         * really matter.
+                         */
+                        pagefault_disable();
+                        copied = filemap_copy_from_user_atomic(page, offset,
                                         cur_iov, nr_segs, iov_offset, bytes);
+                        pagefault_enable();
+                } else {
+                        void *src, *dst;
+                        src = kmap_atomic(src_page, KM_USER0);
+                        dst = kmap_atomic(page, KM_USER1);
+                        memcpy(dst + offset, src + offset, bytes);
+                        kunmap_atomic(dst, KM_USER1);
+                        kunmap_atomic(src, KM_USER0);
+                        copied = bytes;
+                }
                 flush_dcache_page(page);
 
                 status = a_ops->commit_write(file, page, offset, offset+bytes);
                 if (unlikely(status < 0 || status == AOP_TRUNCATED_PAGE))
                         goto fs_write_aop_error;
-                if (unlikely(copied != bytes)) {
-                        status = -EFAULT;
-                        goto fs_write_aop_error;
-                }
                 if (unlikely(status > 0)) /* filesystem did partial write */
-                        copied = status;
+                        copied = min_t(size_t, copied, status);
+
+                unlock_page(page);
+                mark_page_accessed(page);
+                page_cache_release(page);
+                if (src_page)
+                        page_cache_release(src_page);
 
                 written += copied;
                 count -= copied;
                 pos += copied;
                 filemap_set_next_iovec(&cur_iov, nr_segs, &iov_offset, copied);
 
-                unlock_page(page);
-                mark_page_accessed(page);
-                page_cache_release(page);
                 balance_dirty_pages_ratelimited(mapping);
                 cond_resched();
                 continue;
@@ -1895,6 +1978,8 @@ fs_write_aop_error:
                 if (status != AOP_TRUNCATED_PAGE)
                         unlock_page(page);
                 page_cache_release(page);
+                if (src_page)
+                        page_cache_release(src_page);
 
                 /*
                  * prepare_write() may have instantiated a few blocks
@@ -1907,7 +1992,6 @@ fs_write_aop_error:
                         continue;
                 else
                         break;
-
         } while (count);
         *ppos = pos;
 