diff options
author | Nick Piggin <npiggin@suse.de> | 2007-10-16 04:25:01 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-10-16 12:42:55 -0400 |
commit | afddba49d18f346e5cc2938b6ed7c512db18ca68 (patch) | |
tree | 4726e3d3b0e9e8e5b5d3b2b0cccb36446bbdf3ca /mm | |
parent | 637aff46f94a754207c80c8c64bf1b74f24b967d (diff) |
fs: introduce write_begin, write_end, and perform_write aops
These are intended to replace prepare_write and commit_write with more
flexible alternatives that are also able to avoid the buffered write
deadlock problems efficiently (which prepare_write is unable to do).
[mark.fasheh@oracle.com: API design contributions, code review and fixes]
[akpm@linux-foundation.org: various fixes]
[dmonakhov@sw.ru: new aop block_write_begin fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Dmitriy Monakhov <dmonakhov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c | 250 |
1 files changed, 223 insertions, 27 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 67a03a0a9aee..ec25ba1aef5f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -1742,11 +1742,20 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes) | |||
1742 | i->count -= bytes; | 1742 | i->count -= bytes; |
1743 | } | 1743 | } |
1744 | 1744 | ||
1745 | int iov_iter_fault_in_readable(struct iov_iter *i) | 1745 | /* |
1746 | * Fault in the first iovec of the given iov_iter, to a maximum length | ||
1747 | * of bytes. Returns 0 on success, or non-zero if the memory could not be | ||
1748 | * accessed (i.e. because it is an invalid address). | ||
1749 | * | ||
1750 | * writev-intensive code may want this to prefault several iovecs -- that | ||
1751 | * would be possible (callers must not rely on the fact that _only_ the | ||
1752 | * first iovec will be faulted with the current implementation). | ||
1753 | */ | ||
1754 | int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) | ||
1746 | { | 1755 | { |
1747 | size_t seglen = min(i->iov->iov_len - i->iov_offset, i->count); | ||
1748 | char __user *buf = i->iov->iov_base + i->iov_offset; | 1756 | char __user *buf = i->iov->iov_base + i->iov_offset; |
1749 | return fault_in_pages_readable(buf, seglen); | 1757 | bytes = min(bytes, i->iov->iov_len - i->iov_offset); |
1758 | return fault_in_pages_readable(buf, bytes); | ||
1750 | } | 1759 | } |
1751 | 1760 | ||
1752 | /* | 1761 | /* |
@@ -1843,6 +1852,95 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i | |||
1843 | } | 1852 | } |
1844 | EXPORT_SYMBOL(generic_write_checks); | 1853 | EXPORT_SYMBOL(generic_write_checks); |
1845 | 1854 | ||
1855 | int pagecache_write_begin(struct file *file, struct address_space *mapping, | ||
1856 | loff_t pos, unsigned len, unsigned flags, | ||
1857 | struct page **pagep, void **fsdata) | ||
1858 | { | ||
1859 | const struct address_space_operations *aops = mapping->a_ops; | ||
1860 | |||
1861 | if (aops->write_begin) { | ||
1862 | return aops->write_begin(file, mapping, pos, len, flags, | ||
1863 | pagep, fsdata); | ||
1864 | } else { | ||
1865 | int ret; | ||
1866 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | ||
1867 | unsigned offset = pos & (PAGE_CACHE_SIZE - 1); | ||
1868 | struct inode *inode = mapping->host; | ||
1869 | struct page *page; | ||
1870 | again: | ||
1871 | page = __grab_cache_page(mapping, index); | ||
1872 | *pagep = page; | ||
1873 | if (!page) | ||
1874 | return -ENOMEM; | ||
1875 | |||
1876 | if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) { | ||
1877 | /* | ||
1878 | * There is no way to resolve a short write situation | ||
1879 | * for a !Uptodate page (except by double copying in | ||
1880 | * the caller done by generic_perform_write_2copy). | ||
1881 | * | ||
1882 | * Instead, we have to bring it uptodate here. | ||
1883 | */ | ||
1884 | ret = aops->readpage(file, page); | ||
1885 | page_cache_release(page); | ||
1886 | if (ret) { | ||
1887 | if (ret == AOP_TRUNCATED_PAGE) | ||
1888 | goto again; | ||
1889 | return ret; | ||
1890 | } | ||
1891 | goto again; | ||
1892 | } | ||
1893 | |||
1894 | ret = aops->prepare_write(file, page, offset, offset+len); | ||
1895 | if (ret) { | ||
1896 | if (ret != AOP_TRUNCATED_PAGE) | ||
1897 | unlock_page(page); | ||
1898 | page_cache_release(page); | ||
1899 | if (pos + len > inode->i_size) | ||
1900 | vmtruncate(inode, inode->i_size); | ||
1901 | if (ret == AOP_TRUNCATED_PAGE) | ||
1902 | goto again; | ||
1903 | } | ||
1904 | return ret; | ||
1905 | } | ||
1906 | } | ||
1907 | EXPORT_SYMBOL(pagecache_write_begin); | ||
1908 | |||
1909 | int pagecache_write_end(struct file *file, struct address_space *mapping, | ||
1910 | loff_t pos, unsigned len, unsigned copied, | ||
1911 | struct page *page, void *fsdata) | ||
1912 | { | ||
1913 | const struct address_space_operations *aops = mapping->a_ops; | ||
1914 | int ret; | ||
1915 | |||
1916 | if (aops->write_end) { | ||
1917 | mark_page_accessed(page); | ||
1918 | ret = aops->write_end(file, mapping, pos, len, copied, | ||
1919 | page, fsdata); | ||
1920 | } else { | ||
1921 | unsigned offset = pos & (PAGE_CACHE_SIZE - 1); | ||
1922 | struct inode *inode = mapping->host; | ||
1923 | |||
1924 | flush_dcache_page(page); | ||
1925 | ret = aops->commit_write(file, page, offset, offset+len); | ||
1926 | unlock_page(page); | ||
1927 | mark_page_accessed(page); | ||
1928 | page_cache_release(page); | ||
1929 | BUG_ON(ret == AOP_TRUNCATED_PAGE); /* can't deal with */ | ||
1930 | |||
1931 | if (ret < 0) { | ||
1932 | if (pos + len > inode->i_size) | ||
1933 | vmtruncate(inode, inode->i_size); | ||
1934 | } else if (ret > 0) | ||
1935 | ret = min_t(size_t, copied, ret); | ||
1936 | else | ||
1937 | ret = copied; | ||
1938 | } | ||
1939 | |||
1940 | return ret; | ||
1941 | } | ||
1942 | EXPORT_SYMBOL(pagecache_write_end); | ||
1943 | |||
1846 | ssize_t | 1944 | ssize_t |
1847 | generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | 1945 | generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, |
1848 | unsigned long *nr_segs, loff_t pos, loff_t *ppos, | 1946 | unsigned long *nr_segs, loff_t pos, loff_t *ppos, |
@@ -1886,8 +1984,7 @@ EXPORT_SYMBOL(generic_file_direct_write); | |||
1886 | * Find or create a page at the given pagecache position. Return the locked | 1984 | * Find or create a page at the given pagecache position. Return the locked |
1887 | * page. This function is specifically for buffered writes. | 1985 | * page. This function is specifically for buffered writes. |
1888 | */ | 1986 | */ |
1889 | static struct page *__grab_cache_page(struct address_space *mapping, | 1987 | struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) |
1890 | pgoff_t index) | ||
1891 | { | 1988 | { |
1892 | int status; | 1989 | int status; |
1893 | struct page *page; | 1990 | struct page *page; |
@@ -1908,20 +2005,16 @@ repeat: | |||
1908 | } | 2005 | } |
1909 | return page; | 2006 | return page; |
1910 | } | 2007 | } |
2008 | EXPORT_SYMBOL(__grab_cache_page); | ||
1911 | 2009 | ||
1912 | ssize_t | 2010 | static ssize_t generic_perform_write_2copy(struct file *file, |
1913 | generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | 2011 | struct iov_iter *i, loff_t pos) |
1914 | unsigned long nr_segs, loff_t pos, loff_t *ppos, | ||
1915 | size_t count, ssize_t written) | ||
1916 | { | 2012 | { |
1917 | struct file *file = iocb->ki_filp; | ||
1918 | struct address_space *mapping = file->f_mapping; | 2013 | struct address_space *mapping = file->f_mapping; |
1919 | const struct address_space_operations *a_ops = mapping->a_ops; | 2014 | const struct address_space_operations *a_ops = mapping->a_ops; |
1920 | struct inode *inode = mapping->host; | 2015 | struct inode *inode = mapping->host; |
1921 | long status = 0; | 2016 | long status = 0; |
1922 | struct iov_iter i; | 2017 | ssize_t written = 0; |
1923 | |||
1924 | iov_iter_init(&i, iov, nr_segs, count, written); | ||
1925 | 2018 | ||
1926 | do { | 2019 | do { |
1927 | struct page *src_page; | 2020 | struct page *src_page; |
@@ -1934,7 +2027,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
1934 | offset = (pos & (PAGE_CACHE_SIZE - 1)); | 2027 | offset = (pos & (PAGE_CACHE_SIZE - 1)); |
1935 | index = pos >> PAGE_CACHE_SHIFT; | 2028 | index = pos >> PAGE_CACHE_SHIFT; |
1936 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, | 2029 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, |
1937 | iov_iter_count(&i)); | 2030 | iov_iter_count(i)); |
1938 | 2031 | ||
1939 | /* | 2032 | /* |
1940 | * a non-NULL src_page indicates that we're doing the | 2033 | * a non-NULL src_page indicates that we're doing the |
@@ -1952,7 +2045,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
1952 | * to check that the address is actually valid, when atomic | 2045 | * to check that the address is actually valid, when atomic |
1953 | * usercopies are used, below. | 2046 | * usercopies are used, below. |
1954 | */ | 2047 | */ |
1955 | if (unlikely(iov_iter_fault_in_readable(&i))) { | 2048 | if (unlikely(iov_iter_fault_in_readable(i, bytes))) { |
1956 | status = -EFAULT; | 2049 | status = -EFAULT; |
1957 | break; | 2050 | break; |
1958 | } | 2051 | } |
@@ -1983,7 +2076,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
1983 | * same reason as we can't take a page fault with a | 2076 | * same reason as we can't take a page fault with a |
1984 | * page locked (as explained below). | 2077 | * page locked (as explained below). |
1985 | */ | 2078 | */ |
1986 | copied = iov_iter_copy_from_user(src_page, &i, | 2079 | copied = iov_iter_copy_from_user(src_page, i, |
1987 | offset, bytes); | 2080 | offset, bytes); |
1988 | if (unlikely(copied == 0)) { | 2081 | if (unlikely(copied == 0)) { |
1989 | status = -EFAULT; | 2082 | status = -EFAULT; |
@@ -2008,7 +2101,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2008 | page_cache_release(src_page); | 2101 | page_cache_release(src_page); |
2009 | continue; | 2102 | continue; |
2010 | } | 2103 | } |
2011 | |||
2012 | } | 2104 | } |
2013 | 2105 | ||
2014 | status = a_ops->prepare_write(file, page, offset, offset+bytes); | 2106 | status = a_ops->prepare_write(file, page, offset, offset+bytes); |
@@ -2030,7 +2122,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2030 | * really matter. | 2122 | * really matter. |
2031 | */ | 2123 | */ |
2032 | pagefault_disable(); | 2124 | pagefault_disable(); |
2033 | copied = iov_iter_copy_from_user_atomic(page, &i, | 2125 | copied = iov_iter_copy_from_user_atomic(page, i, |
2034 | offset, bytes); | 2126 | offset, bytes); |
2035 | pagefault_enable(); | 2127 | pagefault_enable(); |
2036 | } else { | 2128 | } else { |
@@ -2056,9 +2148,9 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
2056 | if (src_page) | 2148 | if (src_page) |
2057 | page_cache_release(src_page); | 2149 | page_cache_release(src_page); |
2058 | 2150 | ||
2059 | iov_iter_advance(&i, copied); | 2151 | iov_iter_advance(i, copied); |
2060 | written += copied; | ||
2061 | pos += copied; | 2152 | pos += copied; |
2153 | written += copied; | ||
2062 | 2154 | ||
2063 | balance_dirty_pages_ratelimited(mapping); | 2155 | balance_dirty_pages_ratelimited(mapping); |
2064 | cond_resched(); | 2156 | cond_resched(); |
@@ -2082,13 +2174,117 @@ fs_write_aop_error: | |||
2082 | continue; | 2174 | continue; |
2083 | else | 2175 | else |
2084 | break; | 2176 | break; |
2085 | } while (iov_iter_count(&i)); | 2177 | } while (iov_iter_count(i)); |
2086 | *ppos = pos; | 2178 | |
2179 | return written ? written : status; | ||
2180 | } | ||
2181 | |||
2182 | static ssize_t generic_perform_write(struct file *file, | ||
2183 | struct iov_iter *i, loff_t pos) | ||
2184 | { | ||
2185 | struct address_space *mapping = file->f_mapping; | ||
2186 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
2187 | long status = 0; | ||
2188 | ssize_t written = 0; | ||
2189 | |||
2190 | do { | ||
2191 | struct page *page; | ||
2192 | pgoff_t index; /* Pagecache index for current page */ | ||
2193 | unsigned long offset; /* Offset into pagecache page */ | ||
2194 | unsigned long bytes; /* Bytes to write to page */ | ||
2195 | size_t copied; /* Bytes copied from user */ | ||
2196 | void *fsdata; | ||
2197 | |||
2198 | offset = (pos & (PAGE_CACHE_SIZE - 1)); | ||
2199 | index = pos >> PAGE_CACHE_SHIFT; | ||
2200 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, | ||
2201 | iov_iter_count(i)); | ||
2202 | |||
2203 | again: | ||
2204 | |||
2205 | /* | ||
2206 | * Bring in the user page that we will copy from _first_. | ||
2207 | * Otherwise there's a nasty deadlock on copying from the | ||
2208 | * same page as we're writing to, without it being marked | ||
2209 | * up-to-date. | ||
2210 | * | ||
2211 | * Not only is this an optimisation, but it is also required | ||
2212 | * to check that the address is actually valid, when atomic | ||
2213 | * usercopies are used, below. | ||
2214 | */ | ||
2215 | if (unlikely(iov_iter_fault_in_readable(i, bytes))) { | ||
2216 | status = -EFAULT; | ||
2217 | break; | ||
2218 | } | ||
2219 | |||
2220 | status = a_ops->write_begin(file, mapping, pos, bytes, 0, | ||
2221 | &page, &fsdata); | ||
2222 | if (unlikely(status)) | ||
2223 | break; | ||
2224 | |||
2225 | pagefault_disable(); | ||
2226 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); | ||
2227 | pagefault_enable(); | ||
2228 | flush_dcache_page(page); | ||
2229 | |||
2230 | status = a_ops->write_end(file, mapping, pos, bytes, copied, | ||
2231 | page, fsdata); | ||
2232 | if (unlikely(status < 0)) | ||
2233 | break; | ||
2234 | copied = status; | ||
2235 | |||
2236 | cond_resched(); | ||
2237 | |||
2238 | if (unlikely(copied == 0)) { | ||
2239 | /* | ||
2240 | * If we were unable to copy any data at all, we must | ||
2241 | * fall back to a single segment length write. | ||
2242 | * | ||
2243 | * If we didn't fall back here, we could livelock | ||
2244 | * because not all segments in the iov can be copied at | ||
2245 | * once without a pagefault. | ||
2246 | */ | ||
2247 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, | ||
2248 | iov_iter_single_seg_count(i)); | ||
2249 | goto again; | ||
2250 | } | ||
2251 | iov_iter_advance(i, copied); | ||
2252 | pos += copied; | ||
2253 | written += copied; | ||
2254 | |||
2255 | balance_dirty_pages_ratelimited(mapping); | ||
2256 | |||
2257 | } while (iov_iter_count(i)); | ||
2258 | |||
2259 | return written ? written : status; | ||
2260 | } | ||
2261 | |||
2262 | ssize_t | ||
2263 | generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | ||
2264 | unsigned long nr_segs, loff_t pos, loff_t *ppos, | ||
2265 | size_t count, ssize_t written) | ||
2266 | { | ||
2267 | struct file *file = iocb->ki_filp; | ||
2268 | struct address_space *mapping = file->f_mapping; | ||
2269 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
2270 | struct inode *inode = mapping->host; | ||
2271 | ssize_t status; | ||
2272 | struct iov_iter i; | ||
2273 | |||
2274 | iov_iter_init(&i, iov, nr_segs, count, written); | ||
2275 | if (a_ops->write_begin) | ||
2276 | status = generic_perform_write(file, &i, pos); | ||
2277 | else | ||
2278 | status = generic_perform_write_2copy(file, &i, pos); | ||
2087 | 2279 | ||
2088 | /* | ||
2089 | * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC | ||
2090 | */ | ||
2091 | if (likely(status >= 0)) { | 2280 | if (likely(status >= 0)) { |
2281 | written += status; | ||
2282 | *ppos = pos + status; | ||
2283 | |||
2284 | /* | ||
2285 | * For now, when the user asks for O_SYNC, we'll actually give | ||
2286 | * O_DSYNC | ||
2287 | */ | ||
2092 | if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2288 | if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { |
2093 | if (!a_ops->writepage || !is_sync_kiocb(iocb)) | 2289 | if (!a_ops->writepage || !is_sync_kiocb(iocb)) |
2094 | status = generic_osync_inode(inode, mapping, | 2290 | status = generic_osync_inode(inode, mapping, |