diff options
| author | Nick Piggin <npiggin@suse.de> | 2007-10-16 04:25:01 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-10-16 12:42:55 -0400 |
| commit | afddba49d18f346e5cc2938b6ed7c512db18ca68 (patch) | |
| tree | 4726e3d3b0e9e8e5b5d3b2b0cccb36446bbdf3ca /mm/filemap.c | |
| parent | 637aff46f94a754207c80c8c64bf1b74f24b967d (diff) | |
fs: introduce write_begin, write_end, and perform_write aops
These are intended to replace prepare_write and commit_write with more
flexible alternatives that can also efficiently avoid the buffered-write
deadlock problems (which prepare_write is unable to do).
[mark.fasheh@oracle.com: API design contributions, code review and fixes]
[akpm@linux-foundation.org: various fixes]
[dmonakhov@sw.ru: new aop block_write_begin fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Dmitriy Monakhov <dmonakhov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/filemap.c')
| -rw-r--r-- | mm/filemap.c | 250 |
1 file changed, 223 insertions, 27 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 67a03a0a9aee..ec25ba1aef5f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
| @@ -1742,11 +1742,20 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes) | |||
| 1742 | i->count -= bytes; | 1742 | i->count -= bytes; |
| 1743 | } | 1743 | } |
| 1744 | 1744 | ||
| 1745 | int iov_iter_fault_in_readable(struct iov_iter *i) | 1745 | /* |
| 1746 | * Fault in the first iovec of the given iov_iter, to a maximum length | ||
| 1747 | * of bytes. Returns 0 on success, or non-zero if the memory could not be | ||
| 1748 | * accessed (ie. because it is an invalid address). | ||
| 1749 | * | ||
| 1750 | * writev-intensive code may want this to prefault several iovecs -- that | ||
| 1751 | * would be possible (callers must not rely on the fact that _only_ the | ||
| 1752 | * first iovec will be faulted with the current implementation). | ||
| 1753 | */ | ||
| 1754 | int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes) | ||
| 1746 | { | 1755 | { |
| 1747 | size_t seglen = min(i->iov->iov_len - i->iov_offset, i->count); | ||
| 1748 | char __user *buf = i->iov->iov_base + i->iov_offset; | 1756 | char __user *buf = i->iov->iov_base + i->iov_offset; |
| 1749 | return fault_in_pages_readable(buf, seglen); | 1757 | bytes = min(bytes, i->iov->iov_len - i->iov_offset); |
| 1758 | return fault_in_pages_readable(buf, bytes); | ||
| 1750 | } | 1759 | } |
| 1751 | 1760 | ||
| 1752 | /* | 1761 | /* |
| @@ -1843,6 +1852,95 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i | |||
| 1843 | } | 1852 | } |
| 1844 | EXPORT_SYMBOL(generic_write_checks); | 1853 | EXPORT_SYMBOL(generic_write_checks); |
| 1845 | 1854 | ||
| 1855 | int pagecache_write_begin(struct file *file, struct address_space *mapping, | ||
| 1856 | loff_t pos, unsigned len, unsigned flags, | ||
| 1857 | struct page **pagep, void **fsdata) | ||
| 1858 | { | ||
| 1859 | const struct address_space_operations *aops = mapping->a_ops; | ||
| 1860 | |||
| 1861 | if (aops->write_begin) { | ||
| 1862 | return aops->write_begin(file, mapping, pos, len, flags, | ||
| 1863 | pagep, fsdata); | ||
| 1864 | } else { | ||
| 1865 | int ret; | ||
| 1866 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | ||
| 1867 | unsigned offset = pos & (PAGE_CACHE_SIZE - 1); | ||
| 1868 | struct inode *inode = mapping->host; | ||
| 1869 | struct page *page; | ||
| 1870 | again: | ||
| 1871 | page = __grab_cache_page(mapping, index); | ||
| 1872 | *pagep = page; | ||
| 1873 | if (!page) | ||
| 1874 | return -ENOMEM; | ||
| 1875 | |||
| 1876 | if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) { | ||
| 1877 | /* | ||
| 1878 | * There is no way to resolve a short write situation | ||
| 1879 | * for a !Uptodate page (except by double copying in | ||
| 1880 | * the caller done by generic_perform_write_2copy). | ||
| 1881 | * | ||
| 1882 | * Instead, we have to bring it uptodate here. | ||
| 1883 | */ | ||
| 1884 | ret = aops->readpage(file, page); | ||
| 1885 | page_cache_release(page); | ||
| 1886 | if (ret) { | ||
| 1887 | if (ret == AOP_TRUNCATED_PAGE) | ||
| 1888 | goto again; | ||
| 1889 | return ret; | ||
| 1890 | } | ||
| 1891 | goto again; | ||
| 1892 | } | ||
| 1893 | |||
| 1894 | ret = aops->prepare_write(file, page, offset, offset+len); | ||
| 1895 | if (ret) { | ||
| 1896 | if (ret != AOP_TRUNCATED_PAGE) | ||
| 1897 | unlock_page(page); | ||
| 1898 | page_cache_release(page); | ||
| 1899 | if (pos + len > inode->i_size) | ||
| 1900 | vmtruncate(inode, inode->i_size); | ||
| 1901 | if (ret == AOP_TRUNCATED_PAGE) | ||
| 1902 | goto again; | ||
| 1903 | } | ||
| 1904 | return ret; | ||
| 1905 | } | ||
| 1906 | } | ||
| 1907 | EXPORT_SYMBOL(pagecache_write_begin); | ||
| 1908 | |||
| 1909 | int pagecache_write_end(struct file *file, struct address_space *mapping, | ||
| 1910 | loff_t pos, unsigned len, unsigned copied, | ||
| 1911 | struct page *page, void *fsdata) | ||
| 1912 | { | ||
| 1913 | const struct address_space_operations *aops = mapping->a_ops; | ||
| 1914 | int ret; | ||
| 1915 | |||
| 1916 | if (aops->write_end) { | ||
| 1917 | mark_page_accessed(page); | ||
| 1918 | ret = aops->write_end(file, mapping, pos, len, copied, | ||
| 1919 | page, fsdata); | ||
| 1920 | } else { | ||
| 1921 | unsigned offset = pos & (PAGE_CACHE_SIZE - 1); | ||
| 1922 | struct inode *inode = mapping->host; | ||
| 1923 | |||
| 1924 | flush_dcache_page(page); | ||
| 1925 | ret = aops->commit_write(file, page, offset, offset+len); | ||
| 1926 | unlock_page(page); | ||
| 1927 | mark_page_accessed(page); | ||
| 1928 | page_cache_release(page); | ||
| 1929 | BUG_ON(ret == AOP_TRUNCATED_PAGE); /* can't deal with */ | ||
| 1930 | |||
| 1931 | if (ret < 0) { | ||
| 1932 | if (pos + len > inode->i_size) | ||
| 1933 | vmtruncate(inode, inode->i_size); | ||
| 1934 | } else if (ret > 0) | ||
| 1935 | ret = min_t(size_t, copied, ret); | ||
| 1936 | else | ||
| 1937 | ret = copied; | ||
| 1938 | } | ||
| 1939 | |||
| 1940 | return ret; | ||
| 1941 | } | ||
| 1942 | EXPORT_SYMBOL(pagecache_write_end); | ||
| 1943 | |||
| 1846 | ssize_t | 1944 | ssize_t |
| 1847 | generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, | 1945 | generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, |
| 1848 | unsigned long *nr_segs, loff_t pos, loff_t *ppos, | 1946 | unsigned long *nr_segs, loff_t pos, loff_t *ppos, |
| @@ -1886,8 +1984,7 @@ EXPORT_SYMBOL(generic_file_direct_write); | |||
| 1886 | * Find or create a page at the given pagecache position. Return the locked | 1984 | * Find or create a page at the given pagecache position. Return the locked |
| 1887 | * page. This function is specifically for buffered writes. | 1985 | * page. This function is specifically for buffered writes. |
| 1888 | */ | 1986 | */ |
| 1889 | static struct page *__grab_cache_page(struct address_space *mapping, | 1987 | struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) |
| 1890 | pgoff_t index) | ||
| 1891 | { | 1988 | { |
| 1892 | int status; | 1989 | int status; |
| 1893 | struct page *page; | 1990 | struct page *page; |
| @@ -1908,20 +2005,16 @@ repeat: | |||
| 1908 | } | 2005 | } |
| 1909 | return page; | 2006 | return page; |
| 1910 | } | 2007 | } |
| 2008 | EXPORT_SYMBOL(__grab_cache_page); | ||
| 1911 | 2009 | ||
| 1912 | ssize_t | 2010 | static ssize_t generic_perform_write_2copy(struct file *file, |
| 1913 | generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | 2011 | struct iov_iter *i, loff_t pos) |
| 1914 | unsigned long nr_segs, loff_t pos, loff_t *ppos, | ||
| 1915 | size_t count, ssize_t written) | ||
| 1916 | { | 2012 | { |
| 1917 | struct file *file = iocb->ki_filp; | ||
| 1918 | struct address_space *mapping = file->f_mapping; | 2013 | struct address_space *mapping = file->f_mapping; |
| 1919 | const struct address_space_operations *a_ops = mapping->a_ops; | 2014 | const struct address_space_operations *a_ops = mapping->a_ops; |
| 1920 | struct inode *inode = mapping->host; | 2015 | struct inode *inode = mapping->host; |
| 1921 | long status = 0; | 2016 | long status = 0; |
| 1922 | struct iov_iter i; | 2017 | ssize_t written = 0; |
| 1923 | |||
| 1924 | iov_iter_init(&i, iov, nr_segs, count, written); | ||
| 1925 | 2018 | ||
| 1926 | do { | 2019 | do { |
| 1927 | struct page *src_page; | 2020 | struct page *src_page; |
| @@ -1934,7 +2027,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 1934 | offset = (pos & (PAGE_CACHE_SIZE - 1)); | 2027 | offset = (pos & (PAGE_CACHE_SIZE - 1)); |
| 1935 | index = pos >> PAGE_CACHE_SHIFT; | 2028 | index = pos >> PAGE_CACHE_SHIFT; |
| 1936 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, | 2029 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, |
| 1937 | iov_iter_count(&i)); | 2030 | iov_iter_count(i)); |
| 1938 | 2031 | ||
| 1939 | /* | 2032 | /* |
| 1940 | * a non-NULL src_page indicates that we're doing the | 2033 | * a non-NULL src_page indicates that we're doing the |
| @@ -1952,7 +2045,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 1952 | * to check that the address is actually valid, when atomic | 2045 | * to check that the address is actually valid, when atomic |
| 1953 | * usercopies are used, below. | 2046 | * usercopies are used, below. |
| 1954 | */ | 2047 | */ |
| 1955 | if (unlikely(iov_iter_fault_in_readable(&i))) { | 2048 | if (unlikely(iov_iter_fault_in_readable(i, bytes))) { |
| 1956 | status = -EFAULT; | 2049 | status = -EFAULT; |
| 1957 | break; | 2050 | break; |
| 1958 | } | 2051 | } |
| @@ -1983,7 +2076,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 1983 | * same reason as we can't take a page fault with a | 2076 | * same reason as we can't take a page fault with a |
| 1984 | * page locked (as explained below). | 2077 | * page locked (as explained below). |
| 1985 | */ | 2078 | */ |
| 1986 | copied = iov_iter_copy_from_user(src_page, &i, | 2079 | copied = iov_iter_copy_from_user(src_page, i, |
| 1987 | offset, bytes); | 2080 | offset, bytes); |
| 1988 | if (unlikely(copied == 0)) { | 2081 | if (unlikely(copied == 0)) { |
| 1989 | status = -EFAULT; | 2082 | status = -EFAULT; |
| @@ -2008,7 +2101,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2008 | page_cache_release(src_page); | 2101 | page_cache_release(src_page); |
| 2009 | continue; | 2102 | continue; |
| 2010 | } | 2103 | } |
| 2011 | |||
| 2012 | } | 2104 | } |
| 2013 | 2105 | ||
| 2014 | status = a_ops->prepare_write(file, page, offset, offset+bytes); | 2106 | status = a_ops->prepare_write(file, page, offset, offset+bytes); |
| @@ -2030,7 +2122,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2030 | * really matter. | 2122 | * really matter. |
| 2031 | */ | 2123 | */ |
| 2032 | pagefault_disable(); | 2124 | pagefault_disable(); |
| 2033 | copied = iov_iter_copy_from_user_atomic(page, &i, | 2125 | copied = iov_iter_copy_from_user_atomic(page, i, |
| 2034 | offset, bytes); | 2126 | offset, bytes); |
| 2035 | pagefault_enable(); | 2127 | pagefault_enable(); |
| 2036 | } else { | 2128 | } else { |
| @@ -2056,9 +2148,9 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | |||
| 2056 | if (src_page) | 2148 | if (src_page) |
| 2057 | page_cache_release(src_page); | 2149 | page_cache_release(src_page); |
| 2058 | 2150 | ||
| 2059 | iov_iter_advance(&i, copied); | 2151 | iov_iter_advance(i, copied); |
| 2060 | written += copied; | ||
| 2061 | pos += copied; | 2152 | pos += copied; |
| 2153 | written += copied; | ||
| 2062 | 2154 | ||
| 2063 | balance_dirty_pages_ratelimited(mapping); | 2155 | balance_dirty_pages_ratelimited(mapping); |
| 2064 | cond_resched(); | 2156 | cond_resched(); |
| @@ -2082,13 +2174,117 @@ fs_write_aop_error: | |||
| 2082 | continue; | 2174 | continue; |
| 2083 | else | 2175 | else |
| 2084 | break; | 2176 | break; |
| 2085 | } while (iov_iter_count(&i)); | 2177 | } while (iov_iter_count(i)); |
| 2086 | *ppos = pos; | 2178 | |
| 2179 | return written ? written : status; | ||
| 2180 | } | ||
| 2181 | |||
| 2182 | static ssize_t generic_perform_write(struct file *file, | ||
| 2183 | struct iov_iter *i, loff_t pos) | ||
| 2184 | { | ||
| 2185 | struct address_space *mapping = file->f_mapping; | ||
| 2186 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
| 2187 | long status = 0; | ||
| 2188 | ssize_t written = 0; | ||
| 2189 | |||
| 2190 | do { | ||
| 2191 | struct page *page; | ||
| 2192 | pgoff_t index; /* Pagecache index for current page */ | ||
| 2193 | unsigned long offset; /* Offset into pagecache page */ | ||
| 2194 | unsigned long bytes; /* Bytes to write to page */ | ||
| 2195 | size_t copied; /* Bytes copied from user */ | ||
| 2196 | void *fsdata; | ||
| 2197 | |||
| 2198 | offset = (pos & (PAGE_CACHE_SIZE - 1)); | ||
| 2199 | index = pos >> PAGE_CACHE_SHIFT; | ||
| 2200 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, | ||
| 2201 | iov_iter_count(i)); | ||
| 2202 | |||
| 2203 | again: | ||
| 2204 | |||
| 2205 | /* | ||
| 2206 | * Bring in the user page that we will copy from _first_. | ||
| 2207 | * Otherwise there's a nasty deadlock on copying from the | ||
| 2208 | * same page as we're writing to, without it being marked | ||
| 2209 | * up-to-date. | ||
| 2210 | * | ||
| 2211 | * Not only is this an optimisation, but it is also required | ||
| 2212 | * to check that the address is actually valid, when atomic | ||
| 2213 | * usercopies are used, below. | ||
| 2214 | */ | ||
| 2215 | if (unlikely(iov_iter_fault_in_readable(i, bytes))) { | ||
| 2216 | status = -EFAULT; | ||
| 2217 | break; | ||
| 2218 | } | ||
| 2219 | |||
| 2220 | status = a_ops->write_begin(file, mapping, pos, bytes, 0, | ||
| 2221 | &page, &fsdata); | ||
| 2222 | if (unlikely(status)) | ||
| 2223 | break; | ||
| 2224 | |||
| 2225 | pagefault_disable(); | ||
| 2226 | copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); | ||
| 2227 | pagefault_enable(); | ||
| 2228 | flush_dcache_page(page); | ||
| 2229 | |||
| 2230 | status = a_ops->write_end(file, mapping, pos, bytes, copied, | ||
| 2231 | page, fsdata); | ||
| 2232 | if (unlikely(status < 0)) | ||
| 2233 | break; | ||
| 2234 | copied = status; | ||
| 2235 | |||
| 2236 | cond_resched(); | ||
| 2237 | |||
| 2238 | if (unlikely(copied == 0)) { | ||
| 2239 | /* | ||
| 2240 | * If we were unable to copy any data at all, we must | ||
| 2241 | * fall back to a single segment length write. | ||
| 2242 | * | ||
| 2243 | * If we didn't fallback here, we could livelock | ||
| 2244 | * because not all segments in the iov can be copied at | ||
| 2245 | * once without a pagefault. | ||
| 2246 | */ | ||
| 2247 | bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, | ||
| 2248 | iov_iter_single_seg_count(i)); | ||
| 2249 | goto again; | ||
| 2250 | } | ||
| 2251 | iov_iter_advance(i, copied); | ||
| 2252 | pos += copied; | ||
| 2253 | written += copied; | ||
| 2254 | |||
| 2255 | balance_dirty_pages_ratelimited(mapping); | ||
| 2256 | |||
| 2257 | } while (iov_iter_count(i)); | ||
| 2258 | |||
| 2259 | return written ? written : status; | ||
| 2260 | } | ||
| 2261 | |||
| 2262 | ssize_t | ||
| 2263 | generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, | ||
| 2264 | unsigned long nr_segs, loff_t pos, loff_t *ppos, | ||
| 2265 | size_t count, ssize_t written) | ||
| 2266 | { | ||
| 2267 | struct file *file = iocb->ki_filp; | ||
| 2268 | struct address_space *mapping = file->f_mapping; | ||
| 2269 | const struct address_space_operations *a_ops = mapping->a_ops; | ||
| 2270 | struct inode *inode = mapping->host; | ||
| 2271 | ssize_t status; | ||
| 2272 | struct iov_iter i; | ||
| 2273 | |||
| 2274 | iov_iter_init(&i, iov, nr_segs, count, written); | ||
| 2275 | if (a_ops->write_begin) | ||
| 2276 | status = generic_perform_write(file, &i, pos); | ||
| 2277 | else | ||
| 2278 | status = generic_perform_write_2copy(file, &i, pos); | ||
| 2087 | 2279 | ||
| 2088 | /* | ||
| 2089 | * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC | ||
| 2090 | */ | ||
| 2091 | if (likely(status >= 0)) { | 2280 | if (likely(status >= 0)) { |
| 2281 | written += status; | ||
| 2282 | *ppos = pos + status; | ||
| 2283 | |||
| 2284 | /* | ||
| 2285 | * For now, when the user asks for O_SYNC, we'll actually give | ||
| 2286 | * O_DSYNC | ||
| 2287 | */ | ||
| 2092 | if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2288 | if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { |
| 2093 | if (!a_ops->writepage || !is_sync_kiocb(iocb)) | 2289 | if (!a_ops->writepage || !is_sync_kiocb(iocb)) |
| 2094 | status = generic_osync_inode(inode, mapping, | 2290 | status = generic_osync_inode(inode, mapping, |
