author:    Nick Piggin <npiggin@suse.de>                         2007-10-16 04:25:01 -0400
committer: Linus Torvalds <torvalds@woody.linux-foundation.org>  2007-10-16 12:42:55 -0400
commit:    afddba49d18f346e5cc2938b6ed7c512db18ca68
tree:      4726e3d3b0e9e8e5b5d3b2b0cccb36446bbdf3ca
parent:    637aff46f94a754207c80c8c64bf1b74f24b967d
fs: introduce write_begin, write_end, and perform_write aops
These are intended to replace prepare_write and commit_write with more
flexible alternatives that are also able to avoid the buffered write
deadlock problems efficiently (which prepare_write is unable to do).
[mark.fasheh@oracle.com: API design contributions, code review and fixes]
[akpm@linux-foundation.org: various fixes]
[dmonakhov@sw.ru: new aop block_write_begin fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Dmitriy Monakhov <dmonakhov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
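
For orientation before the diff: the calling convention this patch establishes for generic code is easiest to see in miniature. The sketch below is illustrative only, distilled from the drivers/block/loop.c and fs/splice.c conversions further down; it is not code from the commit, and example_write() is a hypothetical caller.

        /* Illustrative sketch of the new calling convention; not part of the patch. */
        static ssize_t example_write(struct file *file, struct address_space *mapping,
                                     loff_t pos, unsigned len)
        {
                struct page *page;
                void *fsdata;
                unsigned copied;
                int ret;

                ret = pagecache_write_begin(file, mapping, pos, len, 0, &page, &fsdata);
                if (ret)
                        return ret;

                /* ... copy up to 'len' bytes into the locked page here ... */
                copied = len;   /* assume the whole copy succeeded */

                ret = pagecache_write_end(file, mapping, pos, len, copied, page, fsdata);
                /* ret < 0 is an error; otherwise it is the number of bytes committed,
                 * which may be less than 'copied' (a short write the caller retries). */
                return ret;
        }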
---
 Documentation/filesystems/Locking |   9
 Documentation/filesystems/vfs.txt |  45
 drivers/block/loop.c              |  75
 fs/buffer.c                       | 201
 fs/libfs.c                        |  44
 fs/namei.c                        |  46
 fs/splice.c                       |  69
 include/linux/buffer_head.h       |  10
 include/linux/fs.h                |  30
 include/linux/pagemap.h           |   2
 mm/filemap.c                      | 250
 11 files changed, 575 insertions(+), 206 deletions(-)
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index f0f825808ca4..fe26cc978523 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -178,15 +178,18 @@ prototypes:
 locking rules:
         All except set_page_dirty may block
 
-                        BKL     PageLocked(page)
+                        BKL     PageLocked(page)        i_sem
 writepage:              no      yes, unlocks (see below)
 readpage:               no      yes, unlocks
 sync_page:              no      maybe
 writepages:             no
 set_page_dirty          no      no
 readpages:              no
-prepare_write:          no      yes
-commit_write:           no      yes
+prepare_write:          no      yes                     yes
+commit_write:           no      yes                     yes
+write_begin:            no      locks the page          yes
+write_end:              no      yes, unlocks            yes
+perform_write:          no      n/a                     yes
 bmap:                   yes
 invalidatepage:         no      yes
 releasepage:            no      yes
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 045f3e055a28..281c19ff7f45 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -537,6 +537,12 @@ struct address_space_operations {
                         struct list_head *pages, unsigned nr_pages);
         int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
         int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+        int (*write_begin)(struct file *, struct address_space *mapping,
+                                loff_t pos, unsigned len, unsigned flags,
+                                struct page **pagep, void **fsdata);
+        int (*write_end)(struct file *, struct address_space *mapping,
+                                loff_t pos, unsigned len, unsigned copied,
+                                struct page *page, void *fsdata);
         sector_t (*bmap)(struct address_space *, sector_t);
         int (*invalidatepage) (struct page *, unsigned long);
         int (*releasepage) (struct page *, int);
@@ -633,6 +639,45 @@ struct address_space_operations {
         operations. It should avoid returning an error if possible -
         errors should have been handled by prepare_write.
 
+  write_begin: This is intended as a replacement for prepare_write. The
+        key differences being that:
+                - it returns a locked page (in *pagep) rather than being
+                  given a pre locked page;
+                - it must be able to cope with short writes (where the
+                  length passed to write_begin is greater than the number
+                  of bytes copied into the page).
+
+        Called by the generic buffered write code to ask the filesystem to
+        prepare to write len bytes at the given offset in the file. The
+        address_space should check that the write will be able to complete,
+        by allocating space if necessary and doing any other internal
+        housekeeping. If the write will update parts of any basic-blocks on
+        storage, then those blocks should be pre-read (if they haven't been
+        read already) so that the updated blocks can be written out properly.
+
+        The filesystem must return the locked pagecache page for the specified
+        offset, in *pagep, for the caller to write into.
+
+        flags is a field for AOP_FLAG_xxx flags, described in
+        include/linux/fs.h.
+
+        A void * may be returned in fsdata, which then gets passed into
+        write_end.
+
+        Returns 0 on success; < 0 on failure (which is the error code), in
+        which case write_end is not called.
+
+  write_end: After a successful write_begin, and data copy, write_end must
+        be called. len is the original len passed to write_begin, and copied
+        is the amount that was able to be copied (copied == len is always true
+        if write_begin was called with the AOP_FLAG_UNINTERRUPTIBLE flag).
+
+        The filesystem must take care of unlocking the page and releasing it
+        refcount, and updating i_size.
+
+        Returns < 0 on failure, otherwise the number of bytes (<= 'copied')
+        that were able to be copied into pagecache.
+
   bmap: called by the VFS to map a logical block offset within object to
         physical block number. This method is used by the FIBMAP
         ioctl and for working with swap-files. To be able to swap to
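
To make the contract documented above concrete, a block-based filesystem could wire the new operations to the generic helpers this patch adds to fs/buffer.c. This is a hedged sketch, not part of the commit: myfs_get_block and the other myfs_* methods are hypothetical, and real filesystem conversions landed in follow-up patches.

        /* Hypothetical filesystem glue; myfs_get_block is this filesystem's
         * get_block_t for mapping file blocks to disk blocks. */
        static int myfs_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
        {
                *pagep = NULL;  /* ask block_write_begin to grab the page itself */
                return block_write_begin(file, mapping, pos, len, flags,
                                        pagep, fsdata, myfs_get_block);
        }

        static const struct address_space_operations myfs_aops = {
                .readpage       = myfs_readpage,
                .writepage      = myfs_writepage,
                .write_begin    = myfs_write_begin,
                /* generic_write_end unlocks and releases the page and updates
                 * i_size, satisfying the write_end rules above. */
                .write_end      = generic_write_end,
                .bmap           = myfs_bmap,
        };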
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index b9233a06934c..a5f993ac28dd 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -204,14 +204,13 @@ lo_do_transfer(struct loop_device *lo, int cmd,
  * do_lo_send_aops - helper for writing data to a loop device
  *
  * This is the fast version for backing filesystems which implement the address
- * space operations prepare_write and commit_write.
+ * space operations write_begin and write_end.
  */
 static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
-                int bsize, loff_t pos, struct page *page)
+                int bsize, loff_t pos, struct page *unused)
 {
         struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
         struct address_space *mapping = file->f_mapping;
-        const struct address_space_operations *aops = mapping->a_ops;
         pgoff_t index;
         unsigned offset, bv_offs;
         int len, ret;
@@ -223,63 +222,47 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
         len = bvec->bv_len;
         while (len > 0) {
                 sector_t IV;
-                unsigned size;
+                unsigned size, copied;
                 int transfer_result;
+                struct page *page;
+                void *fsdata;
 
                 IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
                 size = PAGE_CACHE_SIZE - offset;
                 if (size > len)
                         size = len;
-                page = grab_cache_page(mapping, index);
-                if (unlikely(!page))
+
+                ret = pagecache_write_begin(file, mapping, pos, size, 0,
+                                                        &page, &fsdata);
+                if (ret)
                         goto fail;
-                ret = aops->prepare_write(file, page, offset,
-                                          offset + size);
-                if (unlikely(ret)) {
-                        if (ret == AOP_TRUNCATED_PAGE) {
-                                page_cache_release(page);
-                                continue;
-                        }
-                        goto unlock;
-                }
+
                 transfer_result = lo_do_transfer(lo, WRITE, page, offset,
                                 bvec->bv_page, bv_offs, size, IV);
-                if (unlikely(transfer_result)) {
-                        /*
-                         * The transfer failed, but we still write the data to
-                         * keep prepare/commit calls balanced.
-                         */
-                        printk(KERN_ERR "loop: transfer error block %llu\n",
-                               (unsigned long long)index);
-                        zero_user_page(page, offset, size, KM_USER0);
-                }
-                flush_dcache_page(page);
-                ret = aops->commit_write(file, page, offset,
-                                         offset + size);
-                if (unlikely(ret)) {
-                        if (ret == AOP_TRUNCATED_PAGE) {
-                                page_cache_release(page);
-                                continue;
-                        }
-                        goto unlock;
-                }
+                copied = size;
                 if (unlikely(transfer_result))
-                        goto unlock;
-                bv_offs += size;
-                len -= size;
+                        copied = 0;
+
+                ret = pagecache_write_end(file, mapping, pos, size, copied,
+                                                        page, fsdata);
+                if (ret < 0)
+                        goto fail;
+                if (ret < copied)
+                        copied = ret;
+
+                if (unlikely(transfer_result))
+                        goto fail;
+
+                bv_offs += copied;
+                len -= copied;
                 offset = 0;
                 index++;
-                pos += size;
-                unlock_page(page);
-                page_cache_release(page);
+                pos += copied;
         }
         ret = 0;
 out:
         mutex_unlock(&mapping->host->i_mutex);
         return ret;
-unlock:
-        unlock_page(page);
-        page_cache_release(page);
 fail:
         ret = -1;
         goto out;
@@ -313,7 +296,7 @@ static int __do_lo_send_write(struct file *file,
  * do_lo_send_direct_write - helper for writing data to a loop device
  *
  * This is the fast, non-transforming version for backing filesystems which do
- * not implement the address space operations prepare_write and commit_write.
+ * not implement the address space operations write_begin and write_end.
  * It uses the write file operation which should be present on all writeable
  * filesystems.
  */
@@ -332,7 +315,7 @@ static int do_lo_send_direct_write(struct loop_device *lo,
  * do_lo_send_write - helper for writing data to a loop device
  *
  * This is the slow, transforming version for filesystems which do not
- * implement the address space operations prepare_write and commit_write. It
+ * implement the address space operations write_begin and write_end. It
  * uses the write file operation which should be present on all writeable
  * filesystems.
  *
@@ -780,7 +763,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
          */
         if (!file->f_op->splice_read)
                 goto out_putf;
-        if (aops->prepare_write && aops->commit_write)
+        if (aops->prepare_write || aops->write_begin)
                 lo_flags |= LO_FLAGS_USE_AOPS;
         if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
                 lo_flags |= LO_FLAGS_READ_ONLY;
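
The loop.c conversion above also shows the error convention the new API enables: when the transform fails after write_begin has succeeded, the driver still calls pagecache_write_end, but with copied == 0, so the filesystem can unlock the page and zero any freshly allocated blocks instead of being left with an unbalanced prepare/commit pair. Schematically (a sketch, not patch code; do_transform() is a stand-in for lo_do_transfer()):

        /* Schematic restatement of the loop.c error convention. */
        static int example_transform_write(struct file *file,
                        struct address_space *mapping, loff_t pos, unsigned size)
        {
                struct page *page;
                void *fsdata;
                unsigned copied;
                int ret;

                ret = pagecache_write_begin(file, mapping, pos, size, 0,
                                                &page, &fsdata);
                if (ret)
                        return ret;

                copied = size;
                if (do_transform(page) < 0)
                        copied = 0;     /* commit nothing, but still call write_end */

                ret = pagecache_write_end(file, mapping, pos, size, copied,
                                                page, fsdata);
                return ret < 0 ? ret : 0;
        }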
diff --git a/fs/buffer.c b/fs/buffer.c
index 9ece6c2086d0..68b8fbdc1b28 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1770,6 +1770,48 @@ recover:
         goto done;
 }
 
+/*
+ * If a page has any new buffers, zero them out here, and mark them uptodate
+ * and dirty so they'll be written out (in order to prevent uninitialised
+ * block data from leaking). And clear the new bit.
+ */
+void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
+{
+        unsigned int block_start, block_end;
+        struct buffer_head *head, *bh;
+
+        BUG_ON(!PageLocked(page));
+        if (!page_has_buffers(page))
+                return;
+
+        bh = head = page_buffers(page);
+        block_start = 0;
+        do {
+                block_end = block_start + bh->b_size;
+
+                if (buffer_new(bh)) {
+                        if (block_end > from && block_start < to) {
+                                if (!PageUptodate(page)) {
+                                        unsigned start, size;
+
+                                        start = max(from, block_start);
+                                        size = min(to, block_end) - start;
+
+                                        zero_user_page(page, start, size, KM_USER0);
+                                        set_buffer_uptodate(bh);
+                                }
+
+                                clear_buffer_new(bh);
+                                mark_buffer_dirty(bh);
+                        }
+                }
+
+                block_start = block_end;
+                bh = bh->b_this_page;
+        } while (bh != head);
+}
+EXPORT_SYMBOL(page_zero_new_buffers);
+
 static int __block_prepare_write(struct inode *inode, struct page *page,
                 unsigned from, unsigned to, get_block_t *get_block)
 {
@@ -1854,38 +1896,8 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
                 if (!buffer_uptodate(*wait_bh))
                         err = -EIO;
         }
-        if (!err) {
-                bh = head;
-                do {
-                        if (buffer_new(bh))
-                                clear_buffer_new(bh);
-                } while ((bh = bh->b_this_page) != head);
-                return 0;
-        }
-        /* Error case: */
-        /*
-         * Zero out any newly allocated blocks to avoid exposing stale
-         * data. If BH_New is set, we know that the block was newly
-         * allocated in the above loop.
-         */
-        bh = head;
-        block_start = 0;
-        do {
-                block_end = block_start+blocksize;
-                if (block_end <= from)
-                        goto next_bh;
-                if (block_start >= to)
-                        break;
-                if (buffer_new(bh)) {
-                        clear_buffer_new(bh);
-                        zero_user_page(page, block_start, bh->b_size, KM_USER0);
-                        set_buffer_uptodate(bh);
-                        mark_buffer_dirty(bh);
-                }
-next_bh:
-                block_start = block_end;
-                bh = bh->b_this_page;
-        } while (bh != head);
+        if (unlikely(err))
+                page_zero_new_buffers(page, from, to);
         return err;
 }
 
@@ -1910,6 +1922,7 @@ static int __block_commit_write(struct inode *inode, struct page *page,
                         set_buffer_uptodate(bh);
                         mark_buffer_dirty(bh);
                 }
+                clear_buffer_new(bh);
         }
 
         /*
@@ -1924,6 +1937,130 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 }
 
 /*
+ * block_write_begin takes care of the basic task of block allocation and
+ * bringing partial write blocks uptodate first.
+ *
+ * If *pagep is not NULL, then block_write_begin uses the locked page
+ * at *pagep rather than allocating its own. In this case, the page will
+ * not be unlocked or deallocated on failure.
+ */
+int block_write_begin(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned flags,
+                        struct page **pagep, void **fsdata,
+                        get_block_t *get_block)
+{
+        struct inode *inode = mapping->host;
+        int status = 0;
+        struct page *page;
+        pgoff_t index;
+        unsigned start, end;
+        int ownpage = 0;
+
+        index = pos >> PAGE_CACHE_SHIFT;
+        start = pos & (PAGE_CACHE_SIZE - 1);
+        end = start + len;
+
+        page = *pagep;
+        if (page == NULL) {
+                ownpage = 1;
+                page = __grab_cache_page(mapping, index);
+                if (!page) {
+                        status = -ENOMEM;
+                        goto out;
+                }
+                *pagep = page;
+        } else
+                BUG_ON(!PageLocked(page));
+
+        status = __block_prepare_write(inode, page, start, end, get_block);
+        if (unlikely(status)) {
+                ClearPageUptodate(page);
+
+                if (ownpage) {
+                        unlock_page(page);
+                        page_cache_release(page);
+                        *pagep = NULL;
+
+                        /*
+                         * prepare_write() may have instantiated a few blocks
+                         * outside i_size. Trim these off again. Don't need
+                         * i_size_read because we hold i_mutex.
+                         */
+                        if (pos + len > inode->i_size)
+                                vmtruncate(inode, inode->i_size);
+                }
+                goto out;
+        }
+
+out:
+        return status;
+}
+EXPORT_SYMBOL(block_write_begin);
+
+int block_write_end(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned copied,
+                        struct page *page, void *fsdata)
+{
+        struct inode *inode = mapping->host;
+        unsigned start;
+
+        start = pos & (PAGE_CACHE_SIZE - 1);
+
+        if (unlikely(copied < len)) {
+                /*
+                 * The buffers that were written will now be uptodate, so we
+                 * don't have to worry about a readpage reading them and
+                 * overwriting a partial write. However if we have encountered
+                 * a short write and only partially written into a buffer, it
+                 * will not be marked uptodate, so a readpage might come in and
+                 * destroy our partial write.
+                 *
+                 * Do the simplest thing, and just treat any short write to a
+                 * non uptodate page as a zero-length write, and force the
+                 * caller to redo the whole thing.
+                 */
+                if (!PageUptodate(page))
+                        copied = 0;
+
+                page_zero_new_buffers(page, start+copied, start+len);
+        }
+        flush_dcache_page(page);
+
+        /* This could be a short (even 0-length) commit */
+        __block_commit_write(inode, page, start, start+copied);
+
+        return copied;
+}
+EXPORT_SYMBOL(block_write_end);
+
+int generic_write_end(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned copied,
+                        struct page *page, void *fsdata)
+{
+        struct inode *inode = mapping->host;
+
+        copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+        /*
+         * No need to use i_size_read() here, the i_size
+         * cannot change under us because we hold i_mutex.
+         *
+         * But it's important to update i_size while still holding page lock:
+         * page writeout could otherwise come in and zero beyond i_size.
+         */
+        if (pos+copied > inode->i_size) {
+                i_size_write(inode, pos+copied);
+                mark_inode_dirty(inode);
+        }
+
+        unlock_page(page);
+        page_cache_release(page);
+
+        return copied;
+}
+EXPORT_SYMBOL(generic_write_end);
+
+/*
  * Generic "read page" function for block devices that have the normal
  * get_block functionality. This is most of the block device filesystems.
  * Reads the page asynchronously --- the unlock_buffer() and
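
One subtlety in block_write_end above deserves a worked example. Suppose write_begin prepared len == 4096 bytes but the atomic usercopy faulted after 1000, so copied == 1000. Buffers covering bytes 1000..4095 were never filled and are not uptodate; if the page as a whole is not uptodate either, a concurrent readpage could clobber the 1000 bytes that did land. Treating the short copy as a zero-length commit sidesteps this. An annotated restatement of that path, with the example numbers:

        /* Annotated restatement of the short-write path in block_write_end(). */
        if (unlikely(copied < len)) {           /* e.g. copied = 1000, len = 4096 */
                if (!PageUptodate(page))
                        copied = 0;             /* discard; caller redoes the segment */
                /* zero any just-allocated buffers in the unwritten tail so stale
                 * block contents cannot leak out to a later read */
                page_zero_new_buffers(page, start + copied, start + len);
        }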
diff --git a/fs/libfs.c b/fs/libfs.c
index 5294de1f40c4..f2b32d3a9093 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -351,6 +351,26 @@ int simple_prepare_write(struct file *file, struct page *page,
         return 0;
 }
 
+int simple_write_begin(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned flags,
+                        struct page **pagep, void **fsdata)
+{
+        struct page *page;
+        pgoff_t index;
+        unsigned from;
+
+        index = pos >> PAGE_CACHE_SHIFT;
+        from = pos & (PAGE_CACHE_SIZE - 1);
+
+        page = __grab_cache_page(mapping, index);
+        if (!page)
+                return -ENOMEM;
+
+        *pagep = page;
+
+        return simple_prepare_write(file, page, from, from+len);
+}
+
 int simple_commit_write(struct file *file, struct page *page,
                 unsigned from, unsigned to)
 {
@@ -369,6 +389,28 @@ int simple_commit_write(struct file *file, struct page *page,
         return 0;
 }
 
+int simple_write_end(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned copied,
+                        struct page *page, void *fsdata)
+{
+        unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+
+        /* zero the stale part of the page if we did a short copy */
+        if (copied < len) {
+                void *kaddr = kmap_atomic(page, KM_USER0);
+                memset(kaddr + from + copied, 0, len - copied);
+                flush_dcache_page(page);
+                kunmap_atomic(kaddr, KM_USER0);
+        }
+
+        simple_commit_write(file, page, from, from+copied);
+
+        unlock_page(page);
+        page_cache_release(page);
+
+        return copied;
+}
+
 /*
  * the inodes created here are not hashed. If you use iunique to generate
  * unique inode values later for this filesystem, then you must take care
@@ -642,6 +684,8 @@ EXPORT_SYMBOL(dcache_dir_open);
 EXPORT_SYMBOL(dcache_readdir);
 EXPORT_SYMBOL(generic_read_dir);
 EXPORT_SYMBOL(get_sb_pseudo);
+EXPORT_SYMBOL(simple_write_begin);
+EXPORT_SYMBOL(simple_write_end);
 EXPORT_SYMBOL(simple_commit_write);
 EXPORT_SYMBOL(simple_dir_inode_operations);
 EXPORT_SYMBOL(simple_dir_operations);
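
For in-memory filesystems, the two helpers above form a drop-in pair. A hedged sketch of how such a filesystem might adopt them (myramfs_aops is hypothetical; simple_readpage is the existing libfs helper, and actual conversions of ramfs and friends came in later patches):

        /* Hypothetical in-RAM filesystem using the new libfs helpers.
         * simple_write_end unlocks and releases the page itself. */
        static const struct address_space_operations myramfs_aops = {
                .readpage       = simple_readpage,
                .write_begin    = simple_write_begin,
                .write_end      = simple_write_end,
        };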
diff --git a/fs/namei.c b/fs/namei.c
index a83160acd748..b40b8084eefc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2729,53 +2729,29 @@ int __page_symlink(struct inode *inode, const char *symname, int len,
 {
         struct address_space *mapping = inode->i_mapping;
         struct page *page;
+        void *fsdata;
         int err;
         char *kaddr;
 
 retry:
-        err = -ENOMEM;
-        page = find_or_create_page(mapping, 0, gfp_mask);
-        if (!page)
-                goto fail;
-        err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
-        if (err == AOP_TRUNCATED_PAGE) {
-                page_cache_release(page);
-                goto retry;
-        }
+        err = pagecache_write_begin(NULL, mapping, 0, len-1,
+                                AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
         if (err)
-                goto fail_map;
+                goto fail;
+
         kaddr = kmap_atomic(page, KM_USER0);
         memcpy(kaddr, symname, len-1);
         kunmap_atomic(kaddr, KM_USER0);
-        err = mapping->a_ops->commit_write(NULL, page, 0, len-1);
-        if (err == AOP_TRUNCATED_PAGE) {
-                page_cache_release(page);
-                goto retry;
-        }
-        if (err)
-                goto fail_map;
-        /*
-         * Notice that we are _not_ going to block here - end of page is
-         * unmapped, so this will only try to map the rest of page, see
-         * that it is unmapped (typically even will not look into inode -
-         * ->i_size will be enough for everything) and zero it out.
-         * OTOH it's obviously correct and should make the page up-to-date.
-         */
-        if (!PageUptodate(page)) {
-                err = mapping->a_ops->readpage(NULL, page);
-                if (err != AOP_TRUNCATED_PAGE)
-                        wait_on_page_locked(page);
-        } else {
-                unlock_page(page);
-        }
-        page_cache_release(page);
+
+        err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
+                                                        page, fsdata);
         if (err < 0)
                 goto fail;
+        if (err < len-1)
+                goto retry;
+
         mark_inode_dirty(inode);
         return 0;
-fail_map:
-        unlock_page(page);
-        page_cache_release(page);
 fail:
         return err;
 }
diff --git a/fs/splice.c b/fs/splice.c
index 2df6be43c667..a7568bcc0f99 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -563,7 +563,7 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
         struct address_space *mapping = file->f_mapping;
         unsigned int offset, this_len;
         struct page *page;
-        pgoff_t index;
+        void *fsdata;
         int ret;
 
         /*
@@ -573,49 +573,16 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
         if (unlikely(ret))
                 return ret;
 
-        index = sd->pos >> PAGE_CACHE_SHIFT;
         offset = sd->pos & ~PAGE_CACHE_MASK;
 
         this_len = sd->len;
         if (this_len + offset > PAGE_CACHE_SIZE)
                 this_len = PAGE_CACHE_SIZE - offset;
 
-find_page:
-        page = find_lock_page(mapping, index);
-        if (!page) {
-                ret = -ENOMEM;
-                page = page_cache_alloc_cold(mapping);
-                if (unlikely(!page))
-                        goto out_ret;
-
-                /*
-                 * This will also lock the page
-                 */
-                ret = add_to_page_cache_lru(page, mapping, index,
-                                        GFP_KERNEL);
-                if (unlikely(ret))
-                        goto out_release;
-        }
-
-        ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
-        if (unlikely(ret)) {
-                loff_t isize = i_size_read(mapping->host);
-
-                if (ret != AOP_TRUNCATED_PAGE)
-                        unlock_page(page);
-                page_cache_release(page);
-                if (ret == AOP_TRUNCATED_PAGE)
-                        goto find_page;
-
-                /*
-                 * prepare_write() may have instantiated a few blocks
-                 * outside i_size. Trim these off again.
-                 */
-                if (sd->pos + this_len > isize)
-                        vmtruncate(mapping->host, isize);
-
-                goto out_ret;
-        }
+        ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
+                                AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
+        if (unlikely(ret))
+                goto out;
 
         if (buf->page != page) {
                 /*
@@ -629,31 +596,9 @@ find_page:
                 kunmap_atomic(dst, KM_USER1);
                 buf->ops->unmap(pipe, buf, src);
         }
-
-        ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
-        if (ret) {
-                if (ret == AOP_TRUNCATED_PAGE) {
-                        page_cache_release(page);
-                        goto find_page;
-                }
-                if (ret < 0)
-                        goto out;
-                /*
-                 * Partial write has happened, so 'ret' already initialized by
-                 * number of bytes written, Where is nothing we have to do here.
-                 */
-        } else
-                ret = this_len;
-        /*
-         * Return the number of bytes written and mark page as
-         * accessed, we are now done!
-         */
-        mark_page_accessed(page);
+        ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
+                                page, fsdata);
 out:
-        unlock_page(page);
-out_release:
-        page_cache_release(page);
-out_ret:
         return ret;
 }
 
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 35cadad84b14..a562ecfb1a14 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -203,6 +203,16 @@ void block_invalidatepage(struct page *page, unsigned long offset);
 int block_write_full_page(struct page *page, get_block_t *get_block,
                                 struct writeback_control *wbc);
 int block_read_full_page(struct page*, get_block_t*);
+int block_write_begin(struct file *, struct address_space *,
+                                loff_t, unsigned, unsigned,
+                                struct page **, void **, get_block_t*);
+int block_write_end(struct file *, struct address_space *,
+                                loff_t, unsigned, unsigned,
+                                struct page *, void *);
+int generic_write_end(struct file *, struct address_space *,
+                                loff_t, unsigned, unsigned,
+                                struct page *, void *);
+void page_zero_new_buffers(struct page *page, unsigned from, unsigned to);
 int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
 int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
                                 loff_t *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 86ce27c72554..e9344e6f877d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -394,6 +394,8 @@ enum positive_aop_returns {
         AOP_TRUNCATED_PAGE      = 0x80001,
 };
 
+#define AOP_FLAG_UNINTERRUPTIBLE        0x0001 /* will not do a short write */
+
 /*
  * oh the beauties of C type declarations.
  */
@@ -413,7 +415,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
 size_t iov_iter_copy_from_user(struct page *page,
                 struct iov_iter *i, unsigned long offset, size_t bytes);
 void iov_iter_advance(struct iov_iter *i, size_t bytes);
-int iov_iter_fault_in_readable(struct iov_iter *i);
+int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
 size_t iov_iter_single_seg_count(struct iov_iter *i);
 
 static inline void iov_iter_init(struct iov_iter *i,
@@ -454,6 +456,14 @@ struct address_space_operations {
          */
         int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
         int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+
+        int (*write_begin)(struct file *, struct address_space *mapping,
+                                loff_t pos, unsigned len, unsigned flags,
+                                struct page **pagep, void **fsdata);
+        int (*write_end)(struct file *, struct address_space *mapping,
+                                loff_t pos, unsigned len, unsigned copied,
+                                struct page *page, void *fsdata);
+
         /* Unfortunately this kludge is needed for FIBMAP. Don't use it */
         sector_t (*bmap)(struct address_space *, sector_t);
         void (*invalidatepage) (struct page *, unsigned long);
@@ -468,6 +478,18 @@ struct address_space_operations {
         int (*launder_page) (struct page *);
 };
 
+/*
+ * pagecache_write_begin/pagecache_write_end must be used by general code
+ * to write into the pagecache.
+ */
+int pagecache_write_begin(struct file *, struct address_space *mapping,
+                                loff_t pos, unsigned len, unsigned flags,
+                                struct page **pagep, void **fsdata);
+
+int pagecache_write_end(struct file *, struct address_space *mapping,
+                                loff_t pos, unsigned len, unsigned copied,
+                                struct page *page, void *fsdata);
+
 struct backing_dev_info;
 struct address_space {
         struct inode            *host;  /* owner: inode, block_device */
@@ -1866,6 +1888,12 @@ extern int simple_prepare_write(struct file *file, struct page *page,
                         unsigned offset, unsigned to);
 extern int simple_commit_write(struct file *file, struct page *page,
                         unsigned offset, unsigned to);
+extern int simple_write_begin(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned flags,
+                        struct page **pagep, void **fsdata);
+extern int simple_write_end(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned copied,
+                        struct page *page, void *fsdata);
 
 extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *);
 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 8f1e390fd71b..db8a410ae9e1 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -96,6 +96,8 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
                         int tag, unsigned int nr_pages, struct page **pages);
 
+struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index);
+
 /*
  * Returns locked page at given index in given cache, creating it if needed.
  */
diff --git a/mm/filemap.c b/mm/filemap.c
index 67a03a0a9aee..ec25ba1aef5f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1742,11 +1742,20 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
         i->count -= bytes;
 }
 
-int iov_iter_fault_in_readable(struct iov_iter *i)
+/*
+ * Fault in the first iovec of the given iov_iter, to a maximum length
+ * of bytes. Returns 0 on success, or non-zero if the memory could not be
+ * accessed (ie. because it is an invalid address).
+ *
+ * writev-intensive code may want this to prefault several iovecs -- that
+ * would be possible (callers must not rely on the fact that _only_ the
+ * first iovec will be faulted with the current implementation).
+ */
+int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
 {
-        size_t seglen = min(i->iov->iov_len - i->iov_offset, i->count);
         char __user *buf = i->iov->iov_base + i->iov_offset;
-        return fault_in_pages_readable(buf, seglen);
+        bytes = min(bytes, i->iov->iov_len - i->iov_offset);
+        return fault_in_pages_readable(buf, bytes);
 }
 
 /*
@@ -1843,6 +1852,95 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
 }
 EXPORT_SYMBOL(generic_write_checks);
 
+int pagecache_write_begin(struct file *file, struct address_space *mapping,
+                                loff_t pos, unsigned len, unsigned flags,
+                                struct page **pagep, void **fsdata)
+{
+        const struct address_space_operations *aops = mapping->a_ops;
+
+        if (aops->write_begin) {
+                return aops->write_begin(file, mapping, pos, len, flags,
+                                                        pagep, fsdata);
+        } else {
+                int ret;
+                pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+                unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+                struct inode *inode = mapping->host;
+                struct page *page;
+again:
+                page = __grab_cache_page(mapping, index);
+                *pagep = page;
+                if (!page)
+                        return -ENOMEM;
+
+                if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
+                        /*
+                         * There is no way to resolve a short write situation
+                         * for a !Uptodate page (except by double copying in
+                         * the caller done by generic_perform_write_2copy).
+                         *
+                         * Instead, we have to bring it uptodate here.
+                         */
+                        ret = aops->readpage(file, page);
+                        page_cache_release(page);
+                        if (ret) {
+                                if (ret == AOP_TRUNCATED_PAGE)
+                                        goto again;
+                                return ret;
+                        }
+                        goto again;
+                }
+
+                ret = aops->prepare_write(file, page, offset, offset+len);
+                if (ret) {
+                        if (ret != AOP_TRUNCATED_PAGE)
+                                unlock_page(page);
+                        page_cache_release(page);
+                        if (pos + len > inode->i_size)
+                                vmtruncate(inode, inode->i_size);
+                        if (ret == AOP_TRUNCATED_PAGE)
+                                goto again;
+                }
+                return ret;
+        }
+}
+EXPORT_SYMBOL(pagecache_write_begin);
+
+int pagecache_write_end(struct file *file, struct address_space *mapping,
+                                loff_t pos, unsigned len, unsigned copied,
+                                struct page *page, void *fsdata)
+{
+        const struct address_space_operations *aops = mapping->a_ops;
+        int ret;
+
+        if (aops->write_end) {
+                mark_page_accessed(page);
+                ret = aops->write_end(file, mapping, pos, len, copied,
+                                                        page, fsdata);
+        } else {
+                unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+                struct inode *inode = mapping->host;
+
+                flush_dcache_page(page);
+                ret = aops->commit_write(file, page, offset, offset+len);
+                unlock_page(page);
+                mark_page_accessed(page);
+                page_cache_release(page);
+                BUG_ON(ret == AOP_TRUNCATED_PAGE); /* can't deal with */
+
+                if (ret < 0) {
+                        if (pos + len > inode->i_size)
+                                vmtruncate(inode, inode->i_size);
+                } else if (ret > 0)
+                        ret = min_t(size_t, copied, ret);
+                else
+                        ret = copied;
+        }
+
+        return ret;
+}
+EXPORT_SYMBOL(pagecache_write_end);
+
 ssize_t
 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
                 unsigned long *nr_segs, loff_t pos, loff_t *ppos,
@@ -1886,8 +1984,7 @@ EXPORT_SYMBOL(generic_file_direct_write);
  * Find or create a page at the given pagecache position. Return the locked
  * page. This function is specifically for buffered writes.
  */
-static struct page *__grab_cache_page(struct address_space *mapping,
-                                                        pgoff_t index)
+struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
 {
         int status;
         struct page *page;
@@ -1908,20 +2005,16 @@ repeat:
         }
         return page;
 }
+EXPORT_SYMBOL(__grab_cache_page);
 
-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
-                unsigned long nr_segs, loff_t pos, loff_t *ppos,
-                size_t count, ssize_t written)
+static ssize_t generic_perform_write_2copy(struct file *file,
+                struct iov_iter *i, loff_t pos)
 {
-        struct file *file = iocb->ki_filp;
         struct address_space *mapping = file->f_mapping;
         const struct address_space_operations *a_ops = mapping->a_ops;
         struct inode *inode = mapping->host;
         long status = 0;
-        struct iov_iter i;
-
-        iov_iter_init(&i, iov, nr_segs, count, written);
+        ssize_t written = 0;
 
         do {
                 struct page *src_page;
@@ -1934,7 +2027,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                 offset = (pos & (PAGE_CACHE_SIZE - 1));
                 index = pos >> PAGE_CACHE_SHIFT;
                 bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
-                                                iov_iter_count(&i));
+                                                iov_iter_count(i));
 
                 /*
                  * a non-NULL src_page indicates that we're doing the
@@ -1952,7 +2045,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                  * to check that the address is actually valid, when atomic
                  * usercopies are used, below.
                  */
-                if (unlikely(iov_iter_fault_in_readable(&i))) {
+                if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
                         status = -EFAULT;
                         break;
                 }
@@ -1983,7 +2076,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                          * same reason as we can't take a page fault with a
                          * page locked (as explained below).
                          */
-                        copied = iov_iter_copy_from_user(src_page, &i,
+                        copied = iov_iter_copy_from_user(src_page, i,
                                                         offset, bytes);
                         if (unlikely(copied == 0)) {
                                 status = -EFAULT;
@@ -2008,7 +2101,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                                 page_cache_release(src_page);
                                 continue;
                         }
-
                 }
 
                 status = a_ops->prepare_write(file, page, offset, offset+bytes);
@@ -2030,7 +2122,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                          * really matter.
                          */
                         pagefault_disable();
-                        copied = iov_iter_copy_from_user_atomic(page, &i,
+                        copied = iov_iter_copy_from_user_atomic(page, i,
                                                         offset, bytes);
                         pagefault_enable();
                 } else {
@@ -2056,9 +2148,9 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                 if (src_page)
                         page_cache_release(src_page);
 
-                iov_iter_advance(&i, copied);
-                written += copied;
+                iov_iter_advance(i, copied);
                 pos += copied;
+                written += copied;
 
                 balance_dirty_pages_ratelimited(mapping);
                 cond_resched();
@@ -2082,13 +2174,117 @@ fs_write_aop_error:
                         continue;
                 else
                         break;
-        } while (iov_iter_count(&i));
-        *ppos = pos;
+        } while (iov_iter_count(i));
+
+        return written ? written : status;
+}
+
+static ssize_t generic_perform_write(struct file *file,
+                                struct iov_iter *i, loff_t pos)
+{
+        struct address_space *mapping = file->f_mapping;
+        const struct address_space_operations *a_ops = mapping->a_ops;
+        long status = 0;
+        ssize_t written = 0;
+
+        do {
+                struct page *page;
+                pgoff_t index;          /* Pagecache index for current page */
+                unsigned long offset;   /* Offset into pagecache page */
+                unsigned long bytes;    /* Bytes to write to page */
+                size_t copied;          /* Bytes copied from user */
+                void *fsdata;
+
+                offset = (pos & (PAGE_CACHE_SIZE - 1));
+                index = pos >> PAGE_CACHE_SHIFT;
+                bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+                                                iov_iter_count(i));
+
+again:
+
+                /*
+                 * Bring in the user page that we will copy from _first_.
+                 * Otherwise there's a nasty deadlock on copying from the
+                 * same page as we're writing to, without it being marked
+                 * up-to-date.
+                 *
+                 * Not only is this an optimisation, but it is also required
+                 * to check that the address is actually valid, when atomic
+                 * usercopies are used, below.
+                 */
+                if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+                        status = -EFAULT;
+                        break;
+                }
+
+                status = a_ops->write_begin(file, mapping, pos, bytes, 0,
+                                                &page, &fsdata);
+                if (unlikely(status))
+                        break;
+
+                pagefault_disable();
+                copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+                pagefault_enable();
+                flush_dcache_page(page);
+
+                status = a_ops->write_end(file, mapping, pos, bytes, copied,
+                                                page, fsdata);
+                if (unlikely(status < 0))
+                        break;
+                copied = status;
+
+                cond_resched();
+
+                if (unlikely(copied == 0)) {
+                        /*
+                         * If we were unable to copy any data at all, we must
+                         * fall back to a single segment length write.
+                         *
+                         * If we didn't fallback here, we could livelock
+                         * because not all segments in the iov can be copied at
+                         * once without a pagefault.
+                         */
+                        bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+                                                iov_iter_single_seg_count(i));
+                        goto again;
+                }
+                iov_iter_advance(i, copied);
+                pos += copied;
+                written += copied;
+
+                balance_dirty_pages_ratelimited(mapping);
+
+        } while (iov_iter_count(i));
+
+        return written ? written : status;
+}
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+                unsigned long nr_segs, loff_t pos, loff_t *ppos,
+                size_t count, ssize_t written)
+{
+        struct file *file = iocb->ki_filp;
+        struct address_space *mapping = file->f_mapping;
+        const struct address_space_operations *a_ops = mapping->a_ops;
+        struct inode *inode = mapping->host;
+        ssize_t status;
+        struct iov_iter i;
+
+        iov_iter_init(&i, iov, nr_segs, count, written);
+        if (a_ops->write_begin)
+                status = generic_perform_write(file, &i, pos);
+        else
+                status = generic_perform_write_2copy(file, &i, pos);
 
-        /*
-         * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
-         */
         if (likely(status >= 0)) {
+                written += status;
+                *ppos = pos + status;
+
+                /*
+                 * For now, when the user asks for O_SYNC, we'll actually give
+                 * O_DSYNC
+                 */
                 if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
                         if (!a_ops->writepage || !is_sync_kiocb(iocb))
                                 status = generic_osync_inode(inode, mapping,