author	Nick Piggin <npiggin@suse.de>	2007-10-16 04:25:01 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-16 12:42:55 -0400
commit	afddba49d18f346e5cc2938b6ed7c512db18ca68
tree	4726e3d3b0e9e8e5b5d3b2b0cccb36446bbdf3ca /mm
parent	637aff46f94a754207c80c8c64bf1b74f24b967d
fs: introduce write_begin, write_end, and perform_write aops
These are intended to replace prepare_write and commit_write with more
flexible alternatives that are also able to avoid the buffered write
deadlock problems efficiently (which prepare_write is unable to do).

[mark.fasheh@oracle.com: API design contributions, code review and fixes]
[akpm@linux-foundation.org: various fixes]
[dmonakhov@sw.ru: new aop block_write_begin fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Dmitriy Monakhov <dmonakhov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
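For review context, here is a minimal sketch of what a filesystem-side implementation of the new hooks might look like. This is illustrative only and not part of the patch: the myfs_* names are invented, and a real implementation must also cope with short copies into !PageUptodate pages (the deadlock case this API exists to handle) and with block allocation errors.

	static int myfs_write_begin(struct file *file, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned flags,
				struct page **pagep, void **fsdata)
	{
		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
		struct page *page;

		/* Find or create the locked pagecache page the copy will target. */
		page = __grab_cache_page(mapping, index);
		if (!page)
			return -ENOMEM;
		*pagep = page;
		/* A real fs would map/allocate blocks here, and on failure
		 * unlock and release the page before returning. */
		return 0;
	}

	static int myfs_write_end(struct file *file, struct address_space *mapping,
				loff_t pos, unsigned len, unsigned copied,
				struct page *page, void *fsdata)
	{
		struct inode *inode = mapping->host;

		/* Only 'copied' bytes are valid; it may be less than 'len'. */
		if (pos + copied > inode->i_size)
			i_size_write(inode, pos + copied);
		set_page_dirty(page);
		unlock_page(page);
		page_cache_release(page);
		return copied;		/* bytes actually committed */
	}

	static const struct address_space_operations myfs_aops = {
		.write_begin	= myfs_write_begin,
		.write_end	= myfs_write_end,
		/* .readpage, .writepage, ... as before */
	};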
Diffstat (limited to 'mm')
-rw-r--r--	mm/filemap.c	250
1 file changed, 223 insertions(+), 27 deletions(-)
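The diff also exports caller-facing wrappers (pagecache_write_begin/pagecache_write_end) so in-kernel users need not care which pair of aops a filesystem implements. A hedged sketch of a consumer follows; the helper is hypothetical, not in this commit, and assumes [pos, pos+len) stays within one page.

	static int copy_to_pagecache(struct file *file, loff_t pos,
					const char *src, unsigned len)
	{
		struct address_space *mapping = file->f_mapping;
		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
		struct page *page;
		void *fsdata;
		int ret;

		/* The source is a kernel buffer and cannot fault, so a short
		 * write is impossible: AOP_FLAG_UNINTERRUPTIBLE is safe here. */
		ret = pagecache_write_begin(file, mapping, pos, len,
				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
		if (ret)
			return ret;

		memcpy(kmap(page) + offset, src, len);
		kunmap(page);
		flush_dcache_page(page);

		ret = pagecache_write_end(file, mapping, pos, len, len,
							page, fsdata);
		return ret < 0 ? ret : 0;
	}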
diff --git a/mm/filemap.c b/mm/filemap.c
index 67a03a0a9aee..ec25ba1aef5f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1742,11 +1742,20 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
 	i->count -= bytes;
 }
 
-int iov_iter_fault_in_readable(struct iov_iter *i)
+/*
+ * Fault in the first iovec of the given iov_iter, to a maximum length
+ * of bytes. Returns 0 on success, or non-zero if the memory could not be
+ * accessed (ie. because it is an invalid address).
+ *
+ * writev-intensive code may want this to prefault several iovecs -- that
+ * would be possible (callers must not rely on the fact that _only_ the
+ * first iovec will be faulted with the current implementation).
+ */
+int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
 {
-	size_t seglen = min(i->iov->iov_len - i->iov_offset, i->count);
 	char __user *buf = i->iov->iov_base + i->iov_offset;
-	return fault_in_pages_readable(buf, seglen);
+	bytes = min(bytes, i->iov->iov_len - i->iov_offset);
+	return fault_in_pages_readable(buf, bytes);
 }
 
 /*
@@ -1843,6 +1852,95 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
 }
 EXPORT_SYMBOL(generic_write_checks);
 
+int pagecache_write_begin(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata)
+{
+	const struct address_space_operations *aops = mapping->a_ops;
+
+	if (aops->write_begin) {
+		return aops->write_begin(file, mapping, pos, len, flags,
+							pagep, fsdata);
+	} else {
+		int ret;
+		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+		struct inode *inode = mapping->host;
+		struct page *page;
+again:
+		page = __grab_cache_page(mapping, index);
+		*pagep = page;
+		if (!page)
+			return -ENOMEM;
+
+		if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
+			/*
+			 * There is no way to resolve a short write situation
+			 * for a !Uptodate page (except by double copying in
+			 * the caller done by generic_perform_write_2copy).
+			 *
+			 * Instead, we have to bring it uptodate here.
+			 */
+			ret = aops->readpage(file, page);
+			page_cache_release(page);
+			if (ret) {
+				if (ret == AOP_TRUNCATED_PAGE)
+					goto again;
+				return ret;
+			}
+			goto again;
+		}
+
+		ret = aops->prepare_write(file, page, offset, offset+len);
+		if (ret) {
+			if (ret != AOP_TRUNCATED_PAGE)
+				unlock_page(page);
+			page_cache_release(page);
+			if (pos + len > inode->i_size)
+				vmtruncate(inode, inode->i_size);
+			if (ret == AOP_TRUNCATED_PAGE)
+				goto again;
+		}
+		return ret;
+	}
+}
+EXPORT_SYMBOL(pagecache_write_begin);
+
+int pagecache_write_end(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
+{
+	const struct address_space_operations *aops = mapping->a_ops;
+	int ret;
+
+	if (aops->write_end) {
+		mark_page_accessed(page);
+		ret = aops->write_end(file, mapping, pos, len, copied,
+							page, fsdata);
+	} else {
+		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+		struct inode *inode = mapping->host;
+
+		flush_dcache_page(page);
+		ret = aops->commit_write(file, page, offset, offset+len);
+		unlock_page(page);
+		mark_page_accessed(page);
+		page_cache_release(page);
+		BUG_ON(ret == AOP_TRUNCATED_PAGE); /* can't deal with */
+
+		if (ret < 0) {
+			if (pos + len > inode->i_size)
+				vmtruncate(inode, inode->i_size);
+		} else if (ret > 0)
+			ret = min_t(size_t, copied, ret);
+		else
+			ret = copied;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(pagecache_write_end);
+
 ssize_t
 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
@@ -1886,8 +1984,7 @@ EXPORT_SYMBOL(generic_file_direct_write);
  * Find or create a page at the given pagecache position. Return the locked
  * page. This function is specifically for buffered writes.
  */
-static struct page *__grab_cache_page(struct address_space *mapping,
-							pgoff_t index)
+struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
 {
 	int status;
 	struct page *page;
@@ -1908,20 +2005,16 @@ repeat:
 	}
 	return page;
 }
+EXPORT_SYMBOL(__grab_cache_page);
 
-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos, loff_t *ppos,
-		size_t count, ssize_t written)
+static ssize_t generic_perform_write_2copy(struct file *file,
+				struct iov_iter *i, loff_t pos)
 {
-	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	const struct address_space_operations *a_ops = mapping->a_ops;
 	struct inode *inode = mapping->host;
 	long status = 0;
-	struct iov_iter i;
-
-	iov_iter_init(&i, iov, nr_segs, count, written);
+	ssize_t written = 0;
 
 	do {
 		struct page *src_page;
@@ -1934,7 +2027,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		offset = (pos & (PAGE_CACHE_SIZE - 1));
 		index = pos >> PAGE_CACHE_SHIFT;
 		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
-						iov_iter_count(&i));
+						iov_iter_count(i));
 
 		/*
 		 * a non-NULL src_page indicates that we're doing the
@@ -1952,7 +2045,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		 * to check that the address is actually valid, when atomic
 		 * usercopies are used, below.
 		 */
-		if (unlikely(iov_iter_fault_in_readable(&i))) {
+		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
 			status = -EFAULT;
 			break;
 		}
@@ -1983,7 +2076,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 			 * same reason as we can't take a page fault with a
 			 * page locked (as explained below).
 			 */
-			copied = iov_iter_copy_from_user(src_page, &i,
+			copied = iov_iter_copy_from_user(src_page, i,
 								offset, bytes);
 			if (unlikely(copied == 0)) {
 				status = -EFAULT;
@@ -2008,7 +2101,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 				page_cache_release(src_page);
 				continue;
 			}
-
 		}
 
 		status = a_ops->prepare_write(file, page, offset, offset+bytes);
@@ -2030,7 +2122,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 			 * really matter.
 			 */
 			pagefault_disable();
-			copied = iov_iter_copy_from_user_atomic(page, &i,
+			copied = iov_iter_copy_from_user_atomic(page, i,
 								offset, bytes);
 			pagefault_enable();
 		} else {
@@ -2056,9 +2148,9 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		if (src_page)
 			page_cache_release(src_page);
 
-		iov_iter_advance(&i, copied);
-		written += copied;
+		iov_iter_advance(i, copied);
 		pos += copied;
+		written += copied;
 
 		balance_dirty_pages_ratelimited(mapping);
 		cond_resched();
@@ -2082,13 +2174,117 @@ fs_write_aop_error:
 			continue;
 		else
 			break;
-	} while (iov_iter_count(&i));
-	*ppos = pos;
+	} while (iov_iter_count(i));
+
+	return written ? written : status;
+}
+
+static ssize_t generic_perform_write(struct file *file,
+				struct iov_iter *i, loff_t pos)
+{
+	struct address_space *mapping = file->f_mapping;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	long status = 0;
+	ssize_t written = 0;
+
+	do {
+		struct page *page;
+		pgoff_t index;		/* Pagecache index for current page */
+		unsigned long offset;	/* Offset into pagecache page */
+		unsigned long bytes;	/* Bytes to write to page */
+		size_t copied;		/* Bytes copied from user */
+		void *fsdata;
+
+		offset = (pos & (PAGE_CACHE_SIZE - 1));
+		index = pos >> PAGE_CACHE_SHIFT;
+		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+						iov_iter_count(i));
+
+again:
+
+		/*
+		 * Bring in the user page that we will copy from _first_.
+		 * Otherwise there's a nasty deadlock on copying from the
+		 * same page as we're writing to, without it being marked
+		 * up-to-date.
+		 *
+		 * Not only is this an optimisation, but it is also required
+		 * to check that the address is actually valid, when atomic
+		 * usercopies are used, below.
+		 */
+		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+			status = -EFAULT;
+			break;
+		}
+
+		status = a_ops->write_begin(file, mapping, pos, bytes, 0,
+						&page, &fsdata);
+		if (unlikely(status))
+			break;
+
+		pagefault_disable();
+		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+		pagefault_enable();
+		flush_dcache_page(page);
+
+		status = a_ops->write_end(file, mapping, pos, bytes, copied,
+						page, fsdata);
+		if (unlikely(status < 0))
+			break;
+		copied = status;
+
+		cond_resched();
+
+		if (unlikely(copied == 0)) {
+			/*
+			 * If we were unable to copy any data at all, we must
+			 * fall back to a single segment length write.
+			 *
+			 * If we didn't fallback here, we could livelock
+			 * because not all segments in the iov can be copied at
+			 * once without a pagefault.
+			 */
+			bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+						iov_iter_single_seg_count(i));
+			goto again;
+		}
+		iov_iter_advance(i, copied);
+		pos += copied;
+		written += copied;
+
+		balance_dirty_pages_ratelimited(mapping);
+
+	} while (iov_iter_count(i));
+
+	return written ? written : status;
+}
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos, loff_t *ppos,
+		size_t count, ssize_t written)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	struct inode *inode = mapping->host;
+	ssize_t status;
+	struct iov_iter i;
+
+	iov_iter_init(&i, iov, nr_segs, count, written);
+	if (a_ops->write_begin)
+		status = generic_perform_write(file, &i, pos);
+	else
+		status = generic_perform_write_2copy(file, &i, pos);
 
-	/*
-	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
-	 */
 	if (likely(status >= 0)) {
+		written += status;
+		*ppos = pos + status;
+
+		/*
+		 * For now, when the user asks for O_SYNC, we'll actually give
+		 * O_DSYNC
+		 */
 		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 			if (!a_ops->writepage || !is_sync_kiocb(iocb))
 				status = generic_osync_inode(inode, mapping,