author     Nick Piggin <npiggin@suse.de>    2007-10-16 04:25:01 -0400
committer  Linus Torvalds <torvalds@woody.linux-foundation.org>    2007-10-16 12:42:55 -0400
commit     afddba49d18f346e5cc2938b6ed7c512db18ca68 (patch)
tree       4726e3d3b0e9e8e5b5d3b2b0cccb36446bbdf3ca
parent     637aff46f94a754207c80c8c64bf1b74f24b967d (diff)
fs: introduce write_begin, write_end, and perform_write aops
These are intended to replace prepare_write and commit_write with more
flexible alternatives that are also able to avoid the buffered write
deadlock problems efficiently (which prepare_write is unable to do).

[mark.fasheh@oracle.com: API design contributions, code review and fixes]
[akpm@linux-foundation.org: various fixes]
[dmonakhov@sw.ru: new aop block_write_begin fix]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Dmitriy Monakhov <dmonakhov@openvz.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  Documentation/filesystems/Locking     9
-rw-r--r--  Documentation/filesystems/vfs.txt    45
-rw-r--r--  drivers/block/loop.c                 75
-rw-r--r--  fs/buffer.c                         201
-rw-r--r--  fs/libfs.c                           44
-rw-r--r--  fs/namei.c                           46
-rw-r--r--  fs/splice.c                          69
-rw-r--r--  include/linux/buffer_head.h          10
-rw-r--r--  include/linux/fs.h                   30
-rw-r--r--  include/linux/pagemap.h               2
-rw-r--r--  mm/filemap.c                        250
11 files changed, 575 insertions(+), 206 deletions(-)
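
The calling convention at the heart of the patch, condensed from the
generic_perform_write() added to mm/filemap.c below: the usercopy between the
two hooks runs with pagefaults disabled, so the locked pagecache page is never
held while waiting on a fault against the source buffer (the deadlock that
prepare_write could not avoid), and a short copy is legal; it is simply
reported to write_end via 'copied' and retried by the caller.

	status = a_ops->write_begin(file, mapping, pos, bytes, 0,
					&page, &fsdata);
	if (status == 0) {
		pagefault_disable();
		/* may legally copy fewer than 'bytes' bytes */
		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
		pagefault_enable();
		flush_dcache_page(page);
		/* returns the byte count committed (<= copied), or < 0 */
		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata);
	}
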
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index f0f825808ca4..fe26cc978523 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -178,15 +178,18 @@ prototypes:
 locking rules:
 	All except set_page_dirty may block
 
-			BKL	PageLocked(page)
+			BKL	PageLocked(page)	i_sem
 writepage:		no	yes, unlocks (see below)
 readpage:		no	yes, unlocks
 sync_page:		no	maybe
 writepages:		no
 set_page_dirty		no	no
 readpages:		no
-prepare_write:		no	yes
-commit_write:		no	yes
+prepare_write:		no	yes			yes
+commit_write:		no	yes			yes
+write_begin:		no	locks the page		yes
+write_end:		no	yes, unlocks		yes
+perform_write:		no	n/a			yes
 bmap:			yes
 invalidatepage:	no	yes
 releasepage:		no	yes
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 045f3e055a28..281c19ff7f45 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -537,6 +537,12 @@ struct address_space_operations {
 			struct list_head *pages, unsigned nr_pages);
 	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
 	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+	int (*write_begin)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata);
+	int (*write_end)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata);
 	sector_t (*bmap)(struct address_space *, sector_t);
 	int (*invalidatepage) (struct page *, unsigned long);
 	int (*releasepage) (struct page *, int);
@@ -633,6 +639,45 @@ struct address_space_operations {
 	operations.  It should avoid returning an error if possible -
 	errors should have been handled by prepare_write.
 
+  write_begin: This is intended as a replacement for prepare_write. The
+	key differences are that:
+		- it returns a locked page (in *pagep) rather than being
+		  given a pre-locked page;
+		- it must be able to cope with short writes (where the
+		  length passed to write_begin is greater than the number
+		  of bytes copied into the page).
+
+	Called by the generic buffered write code to ask the filesystem to
+	prepare to write len bytes at the given offset in the file. The
+	address_space should check that the write will be able to complete,
+	by allocating space if necessary and doing any other internal
+	housekeeping.  If the write will update parts of any basic-blocks on
+	storage, then those blocks should be pre-read (if they haven't been
+	read already) so that the updated blocks can be written out properly.
+
+	The filesystem must return the locked pagecache page for the specified
+	offset, in *pagep, for the caller to write into.
+
+	flags is a field for AOP_FLAG_xxx flags, described in
+	include/linux/fs.h.
+
+	A void * may be returned in fsdata, which then gets passed into
+	write_end.
+
+	Returns 0 on success; < 0 on failure (which is the error code), in
+	which case write_end is not called.
+
+  write_end: After a successful write_begin, and data copy, write_end must
+	be called. len is the original len passed to write_begin, and copied
+	is the amount that was able to be copied (copied == len is always true
+	if write_begin was called with the AOP_FLAG_UNINTERRUPTIBLE flag).
+
+	The filesystem must take care of unlocking the page, releasing its
+	refcount, and updating i_size.
+
+	Returns < 0 on failure, otherwise the number of bytes (<= 'copied')
+	that were able to be copied into pagecache.
+
   bmap: called by the VFS to map a logical block offset within object to
 	physical block number. This method is used by the FIBMAP
 	ioctl and for working with swap-files.  To be able to swap to
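
To make the fsdata cookie concrete: a filesystem can use it to carry private
per-write state from write_begin to write_end without re-deriving it. A
minimal sketch follows; struct myfs_reservation, myfs_reserve() and
myfs_commit() are hypothetical names used for illustration, not part of this
patch.

	static int myfs_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
	{
		/* hypothetical: reserve blocks/quota for the write up front */
		struct myfs_reservation *res = myfs_reserve(mapping->host, pos, len);

		if (IS_ERR(res))
			return PTR_ERR(res);

		*pagep = __grab_cache_page(mapping, pos >> PAGE_CACHE_SHIFT);
		if (!*pagep) {
			myfs_commit(res, 0);	/* hypothetical: undo the reservation */
			return -ENOMEM;
		}
		*fsdata = res;			/* handed back to us in write_end */
		return 0;
	}

	static int myfs_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
	{
		struct myfs_reservation *res = fsdata;

		/* ... mark the copied range uptodate/dirty as appropriate ... */
		if (pos + copied > mapping->host->i_size)
			i_size_write(mapping->host, pos + copied);
		set_page_dirty(page);

		unlock_page(page);
		page_cache_release(page);	/* drop write_begin's reference */

		myfs_commit(res, copied);	/* hypothetical */
		return copied;
	}
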
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index b9233a06934c..a5f993ac28dd 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -204,14 +204,13 @@ lo_do_transfer(struct loop_device *lo, int cmd,
  * do_lo_send_aops - helper for writing data to a loop device
  *
  * This is the fast version for backing filesystems which implement the address
- * space operations prepare_write and commit_write.
+ * space operations write_begin and write_end.
  */
 static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
-		int bsize, loff_t pos, struct page *page)
+		int bsize, loff_t pos, struct page *unused)
 {
 	struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
 	struct address_space *mapping = file->f_mapping;
-	const struct address_space_operations *aops = mapping->a_ops;
 	pgoff_t index;
 	unsigned offset, bv_offs;
 	int len, ret;
@@ -223,63 +222,47 @@ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
 	len = bvec->bv_len;
 	while (len > 0) {
 		sector_t IV;
-		unsigned size;
+		unsigned size, copied;
 		int transfer_result;
+		struct page *page;
+		void *fsdata;
 
 		IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
 		size = PAGE_CACHE_SIZE - offset;
 		if (size > len)
 			size = len;
-		page = grab_cache_page(mapping, index);
-		if (unlikely(!page))
+
+		ret = pagecache_write_begin(file, mapping, pos, size, 0,
+							&page, &fsdata);
+		if (ret)
 			goto fail;
-		ret = aops->prepare_write(file, page, offset,
-					  offset + size);
-		if (unlikely(ret)) {
-			if (ret == AOP_TRUNCATED_PAGE) {
-				page_cache_release(page);
-				continue;
-			}
-			goto unlock;
-		}
+
 		transfer_result = lo_do_transfer(lo, WRITE, page, offset,
 				bvec->bv_page, bv_offs, size, IV);
-		if (unlikely(transfer_result)) {
-			/*
-			 * The transfer failed, but we still write the data to
-			 * keep prepare/commit calls balanced.
-			 */
-			printk(KERN_ERR "loop: transfer error block %llu\n",
-			       (unsigned long long)index);
-			zero_user_page(page, offset, size, KM_USER0);
-		}
-		flush_dcache_page(page);
-		ret = aops->commit_write(file, page, offset,
-					 offset + size);
-		if (unlikely(ret)) {
-			if (ret == AOP_TRUNCATED_PAGE) {
-				page_cache_release(page);
-				continue;
-			}
-			goto unlock;
-		}
+		copied = size;
 		if (unlikely(transfer_result))
-			goto unlock;
-		bv_offs += size;
-		len -= size;
+			copied = 0;
+
+		ret = pagecache_write_end(file, mapping, pos, size, copied,
+							page, fsdata);
+		if (ret < 0)
+			goto fail;
+		if (ret < copied)
+			copied = ret;
+
+		if (unlikely(transfer_result))
+			goto fail;
+
+		bv_offs += copied;
+		len -= copied;
 		offset = 0;
 		index++;
-		pos += size;
-		unlock_page(page);
-		page_cache_release(page);
+		pos += copied;
 	}
 	ret = 0;
 out:
 	mutex_unlock(&mapping->host->i_mutex);
 	return ret;
-unlock:
-	unlock_page(page);
-	page_cache_release(page);
 fail:
 	ret = -1;
 	goto out;
@@ -313,7 +296,7 @@ static int __do_lo_send_write(struct file *file,
  * do_lo_send_direct_write - helper for writing data to a loop device
  *
  * This is the fast, non-transforming version for backing filesystems which do
- * not implement the address space operations prepare_write and commit_write.
+ * not implement the address space operations write_begin and write_end.
  * It uses the write file operation which should be present on all writeable
  * filesystems.
  */
@@ -332,7 +315,7 @@ static int do_lo_send_direct_write(struct loop_device *lo,
  * do_lo_send_write - helper for writing data to a loop device
  *
  * This is the slow, transforming version for filesystems which do not
- * implement the address space operations prepare_write and commit_write.  It
+ * implement the address space operations write_begin and write_end.  It
  * uses the write file operation which should be present on all writeable
  * filesystems.
  *
@@ -780,7 +763,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 	 */
 	if (!file->f_op->splice_read)
 		goto out_putf;
-	if (aops->prepare_write && aops->commit_write)
+	if (aops->prepare_write || aops->write_begin)
 		lo_flags |= LO_FLAGS_USE_AOPS;
 	if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
 		lo_flags |= LO_FLAGS_READ_ONLY;
diff --git a/fs/buffer.c b/fs/buffer.c
index 9ece6c2086d0..68b8fbdc1b28 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1770,6 +1770,48 @@ recover:
 	goto done;
 }
 
+/*
+ * If a page has any new buffers, zero them out here, and mark them uptodate
+ * and dirty so they'll be written out (in order to prevent uninitialised
+ * block data from leaking). And clear the new bit.
+ */
+void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
+{
+	unsigned int block_start, block_end;
+	struct buffer_head *head, *bh;
+
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		return;
+
+	bh = head = page_buffers(page);
+	block_start = 0;
+	do {
+		block_end = block_start + bh->b_size;
+
+		if (buffer_new(bh)) {
+			if (block_end > from && block_start < to) {
+				if (!PageUptodate(page)) {
+					unsigned start, size;
+
+					start = max(from, block_start);
+					size = min(to, block_end) - start;
+
+					zero_user_page(page, start, size, KM_USER0);
+					set_buffer_uptodate(bh);
+				}
+
+				clear_buffer_new(bh);
+				mark_buffer_dirty(bh);
+			}
+		}
+
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
+EXPORT_SYMBOL(page_zero_new_buffers);
+
 static int __block_prepare_write(struct inode *inode, struct page *page,
 		unsigned from, unsigned to, get_block_t *get_block)
 {
@@ -1854,38 +1896,8 @@ static int __block_prepare_write(struct inode *inode, struct page *page,
 		if (!buffer_uptodate(*wait_bh))
 			err = -EIO;
 	}
-	if (!err) {
-		bh = head;
-		do {
-			if (buffer_new(bh))
-				clear_buffer_new(bh);
-		} while ((bh = bh->b_this_page) != head);
-		return 0;
-	}
-	/* Error case: */
-	/*
-	 * Zero out any newly allocated blocks to avoid exposing stale
-	 * data. If BH_New is set, we know that the block was newly
-	 * allocated in the above loop.
-	 */
-	bh = head;
-	block_start = 0;
-	do {
-		block_end = block_start+blocksize;
-		if (block_end <= from)
-			goto next_bh;
-		if (block_start >= to)
-			break;
-		if (buffer_new(bh)) {
-			clear_buffer_new(bh);
-			zero_user_page(page, block_start, bh->b_size, KM_USER0);
-			set_buffer_uptodate(bh);
-			mark_buffer_dirty(bh);
-		}
-next_bh:
-		block_start = block_end;
-		bh = bh->b_this_page;
-	} while (bh != head);
+	if (unlikely(err))
+		page_zero_new_buffers(page, from, to);
 	return err;
 }
 
@@ -1910,6 +1922,7 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 			set_buffer_uptodate(bh);
 			mark_buffer_dirty(bh);
 		}
+		clear_buffer_new(bh);
 	}
 
 	/*
@@ -1924,6 +1937,130 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 }
 
 /*
+ * block_write_begin takes care of the basic task of block allocation and
+ * bringing partial write blocks uptodate first.
+ *
+ * If *pagep is not NULL, then block_write_begin uses the locked page
+ * at *pagep rather than allocating its own. In this case, the page will
+ * not be unlocked or deallocated on failure.
+ */
+int block_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata,
+			get_block_t *get_block)
+{
+	struct inode *inode = mapping->host;
+	int status = 0;
+	struct page *page;
+	pgoff_t index;
+	unsigned start, end;
+	int ownpage = 0;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	start = pos & (PAGE_CACHE_SIZE - 1);
+	end = start + len;
+
+	page = *pagep;
+	if (page == NULL) {
+		ownpage = 1;
+		page = __grab_cache_page(mapping, index);
+		if (!page) {
+			status = -ENOMEM;
+			goto out;
+		}
+		*pagep = page;
+	} else
+		BUG_ON(!PageLocked(page));
+
+	status = __block_prepare_write(inode, page, start, end, get_block);
+	if (unlikely(status)) {
+		ClearPageUptodate(page);
+
+		if (ownpage) {
+			unlock_page(page);
+			page_cache_release(page);
+			*pagep = NULL;
+
+			/*
+			 * prepare_write() may have instantiated a few blocks
+			 * outside i_size. Trim these off again. Don't need
+			 * i_size_read because we hold i_mutex.
+			 */
+			if (pos + len > inode->i_size)
+				vmtruncate(inode, inode->i_size);
+		}
+		goto out;
+	}
+
+out:
+	return status;
+}
+EXPORT_SYMBOL(block_write_begin);
+
+int block_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	unsigned start;
+
+	start = pos & (PAGE_CACHE_SIZE - 1);
+
+	if (unlikely(copied < len)) {
+		/*
+		 * The buffers that were written will now be uptodate, so we
+		 * don't have to worry about a readpage reading them and
+		 * overwriting a partial write. However if we have encountered
+		 * a short write and only partially written into a buffer, it
+		 * will not be marked uptodate, so a readpage might come in and
+		 * destroy our partial write.
+		 *
+		 * Do the simplest thing, and just treat any short write to a
+		 * non uptodate page as a zero-length write, and force the
+		 * caller to redo the whole thing.
+		 */
+		if (!PageUptodate(page))
+			copied = 0;
+
+		page_zero_new_buffers(page, start+copied, start+len);
+	}
+	flush_dcache_page(page);
+
+	/* This could be a short (even 0-length) commit */
+	__block_commit_write(inode, page, start, start+copied);
+
+	return copied;
+}
+EXPORT_SYMBOL(block_write_end);
+
+int generic_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold i_mutex.
+	 *
+	 * But it's important to update i_size while still holding page lock:
+	 * page writeout could otherwise come in and zero beyond i_size.
+	 */
+	if (pos+copied > inode->i_size) {
+		i_size_write(inode, pos+copied);
+		mark_inode_dirty(inode);
+	}
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	return copied;
+}
+EXPORT_SYMBOL(generic_write_end);
+
+/*
  * Generic "read page" function for block devices that have the normal
  * get_block functionality. This is most of the block device filesystems.
  * Reads the page asynchronously --- the unlock_buffer() and
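
With these helpers, converting a conventional block-based filesystem becomes
nearly mechanical: its write_begin reduces to block_write_begin() with the
filesystem's get_block callback, and generic_write_end() handles the commit,
the i_size update and the page release. A sketch, assuming a filesystem whose
block-mapping routine is called myfs_get_block (a hypothetical name):

	static int myfs_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
	{
		*pagep = NULL;	/* let block_write_begin allocate the page */
		return block_write_begin(file, mapping, pos, len, flags,
						pagep, fsdata, myfs_get_block);
	}

	static const struct address_space_operations myfs_aops = {
		/* ... readpage, writepage, bmap etc. unchanged ... */
		.write_begin	= myfs_write_begin,
		.write_end	= generic_write_end,
	};
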
diff --git a/fs/libfs.c b/fs/libfs.c
index 5294de1f40c4..f2b32d3a9093 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -351,6 +351,26 @@ int simple_prepare_write(struct file *file, struct page *page,
 	return 0;
 }
 
+int simple_write_begin(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned flags,
+			struct page **pagep, void **fsdata)
+{
+	struct page *page;
+	pgoff_t index;
+	unsigned from;
+
+	index = pos >> PAGE_CACHE_SHIFT;
+	from = pos & (PAGE_CACHE_SIZE - 1);
+
+	page = __grab_cache_page(mapping, index);
+	if (!page)
+		return -ENOMEM;
+
+	*pagep = page;
+
+	return simple_prepare_write(file, page, from, from+len);
+}
+
 int simple_commit_write(struct file *file, struct page *page,
 			unsigned from, unsigned to)
 {
@@ -369,6 +389,28 @@ int simple_commit_write(struct file *file, struct page *page,
 	return 0;
 }
 
+int simple_write_end(struct file *file, struct address_space *mapping,
+			loff_t pos, unsigned len, unsigned copied,
+			struct page *page, void *fsdata)
+{
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+
+	/* zero the stale part of the page if we did a short copy */
+	if (copied < len) {
+		void *kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr + from + copied, 0, len - copied);
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr, KM_USER0);
+	}
+
+	simple_commit_write(file, page, from, from+copied);
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	return copied;
+}
+
 /*
  * the inodes created here are not hashed. If you use iunique to generate
  * unique inode values later for this filesystem, then you must take care
@@ -642,6 +684,8 @@ EXPORT_SYMBOL(dcache_dir_open);
 EXPORT_SYMBOL(dcache_readdir);
 EXPORT_SYMBOL(generic_read_dir);
 EXPORT_SYMBOL(get_sb_pseudo);
+EXPORT_SYMBOL(simple_write_begin);
+EXPORT_SYMBOL(simple_write_end);
 EXPORT_SYMBOL(simple_commit_write);
 EXPORT_SYMBOL(simple_dir_inode_operations);
 EXPORT_SYMBOL(simple_dir_operations);
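
For a simple in-memory filesystem, the conversion is just a matter of pointing
its address_space_operations at the new library routines; a sketch (myfs_aops
is a hypothetical name; simple_readpage is the existing libfs helper):

	static const struct address_space_operations myfs_aops = {
		.readpage	= simple_readpage,
		.write_begin	= simple_write_begin,
		.write_end	= simple_write_end,
	};
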
diff --git a/fs/namei.c b/fs/namei.c
index a83160acd748..b40b8084eefc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2729,53 +2729,29 @@ int __page_symlink(struct inode *inode, const char *symname, int len,
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
+	void *fsdata;
 	int err;
 	char *kaddr;
 
 retry:
-	err = -ENOMEM;
-	page = find_or_create_page(mapping, 0, gfp_mask);
-	if (!page)
-		goto fail;
-	err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
-	if (err == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
-		goto retry;
-	}
+	err = pagecache_write_begin(NULL, mapping, 0, len-1,
+				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
 	if (err)
-		goto fail_map;
+		goto fail;
+
 	kaddr = kmap_atomic(page, KM_USER0);
 	memcpy(kaddr, symname, len-1);
 	kunmap_atomic(kaddr, KM_USER0);
-	err = mapping->a_ops->commit_write(NULL, page, 0, len-1);
-	if (err == AOP_TRUNCATED_PAGE) {
-		page_cache_release(page);
-		goto retry;
-	}
-	if (err)
-		goto fail_map;
-	/*
-	 * Notice that we are _not_ going to block here - end of page is
-	 * unmapped, so this will only try to map the rest of page, see
-	 * that it is unmapped (typically even will not look into inode -
-	 * ->i_size will be enough for everything) and zero it out.
-	 * OTOH it's obviously correct and should make the page up-to-date.
-	 */
-	if (!PageUptodate(page)) {
-		err = mapping->a_ops->readpage(NULL, page);
-		if (err != AOP_TRUNCATED_PAGE)
-			wait_on_page_locked(page);
-	} else {
-		unlock_page(page);
-	}
-	page_cache_release(page);
+
+	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
+							page, fsdata);
 	if (err < 0)
 		goto fail;
+	if (err < len-1)
+		goto retry;
+
 	mark_inode_dirty(inode);
 	return 0;
-fail_map:
-	unlock_page(page);
-	page_cache_release(page);
 fail:
 	return err;
 }
diff --git a/fs/splice.c b/fs/splice.c
index 2df6be43c667..a7568bcc0f99 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -563,7 +563,7 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	struct address_space *mapping = file->f_mapping;
 	unsigned int offset, this_len;
 	struct page *page;
-	pgoff_t index;
+	void *fsdata;
 	int ret;
 
 	/*
@@ -573,49 +573,16 @@ static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 	if (unlikely(ret))
 		return ret;
 
-	index = sd->pos >> PAGE_CACHE_SHIFT;
 	offset = sd->pos & ~PAGE_CACHE_MASK;
 
 	this_len = sd->len;
 	if (this_len + offset > PAGE_CACHE_SIZE)
 		this_len = PAGE_CACHE_SIZE - offset;
 
-find_page:
-	page = find_lock_page(mapping, index);
-	if (!page) {
-		ret = -ENOMEM;
-		page = page_cache_alloc_cold(mapping);
-		if (unlikely(!page))
-			goto out_ret;
-
-		/*
-		 * This will also lock the page
-		 */
-		ret = add_to_page_cache_lru(page, mapping, index,
-					GFP_KERNEL);
-		if (unlikely(ret))
-			goto out_release;
-	}
-
-	ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len);
-	if (unlikely(ret)) {
-		loff_t isize = i_size_read(mapping->host);
-
-		if (ret != AOP_TRUNCATED_PAGE)
-			unlock_page(page);
-		page_cache_release(page);
-		if (ret == AOP_TRUNCATED_PAGE)
-			goto find_page;
-
-		/*
-		 * prepare_write() may have instantiated a few blocks
-		 * outside i_size.  Trim these off again.
-		 */
-		if (sd->pos + this_len > isize)
-			vmtruncate(mapping->host, isize);
-
-		goto out_ret;
-	}
+	ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
+				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
+	if (unlikely(ret))
+		goto out;
 
 	if (buf->page != page) {
 		/*
@@ -629,31 +596,9 @@ find_page:
 		kunmap_atomic(dst, KM_USER1);
 		buf->ops->unmap(pipe, buf, src);
 	}
-
-	ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len);
-	if (ret) {
-		if (ret == AOP_TRUNCATED_PAGE) {
-			page_cache_release(page);
-			goto find_page;
-		}
-		if (ret < 0)
-			goto out;
-		/*
-		 * Partial write has happened, so 'ret' already initialized by
-		 * number of bytes written, Where is nothing we have to do here.
-		 */
-	} else
-		ret = this_len;
-	/*
-	 * Return the number of bytes written and mark page as
-	 * accessed, we are now done!
-	 */
-	mark_page_accessed(page);
+	ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
+				page, fsdata);
 out:
-	unlock_page(page);
-out_release:
-	page_cache_release(page);
-out_ret:
 	return ret;
 }
 
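
splice can pass AOP_FLAG_UNINTERRUPTIBLE here because its copy source is a
pipe page already in kernel memory, not user memory: the memcpy between the
two calls cannot fault, so the write can never be short and copied == this_len
always holds. The copy itself is the unchanged context in this hunk, roughly:

	if (buf->page != page) {
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page, KM_USER1);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER1);
		buf->ops->unmap(pipe, buf, src);
	}
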
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 35cadad84b14..a562ecfb1a14 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -203,6 +203,16 @@ void block_invalidatepage(struct page *page, unsigned long offset);
 int block_write_full_page(struct page *page, get_block_t *get_block,
 				struct writeback_control *wbc);
 int block_read_full_page(struct page*, get_block_t*);
+int block_write_begin(struct file *, struct address_space *,
+				loff_t, unsigned, unsigned,
+				struct page **, void **, get_block_t*);
+int block_write_end(struct file *, struct address_space *,
+				loff_t, unsigned, unsigned,
+				struct page *, void *);
+int generic_write_end(struct file *, struct address_space *,
+				loff_t, unsigned, unsigned,
+				struct page *, void *);
+void page_zero_new_buffers(struct page *page, unsigned from, unsigned to);
 int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*);
 int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*,
 				loff_t *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 86ce27c72554..e9344e6f877d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -394,6 +394,8 @@ enum positive_aop_returns {
 	AOP_TRUNCATED_PAGE	= 0x80001,
 };
 
+#define AOP_FLAG_UNINTERRUPTIBLE	0x0001 /* will not do a short write */
+
 /*
  * oh the beauties of C type declarations.
  */
@@ -413,7 +415,7 @@ size_t iov_iter_copy_from_user_atomic(struct page *page,
 size_t iov_iter_copy_from_user(struct page *page,
 		struct iov_iter *i, unsigned long offset, size_t bytes);
 void iov_iter_advance(struct iov_iter *i, size_t bytes);
-int iov_iter_fault_in_readable(struct iov_iter *i);
+int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
 size_t iov_iter_single_seg_count(struct iov_iter *i);
 
 static inline void iov_iter_init(struct iov_iter *i,
@@ -454,6 +456,14 @@ struct address_space_operations {
 	 */
 	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
 	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+
+	int (*write_begin)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata);
+	int (*write_end)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata);
+
 	/* Unfortunately this kludge is needed for FIBMAP. Don't use it */
 	sector_t (*bmap)(struct address_space *, sector_t);
 	void (*invalidatepage) (struct page *, unsigned long);
@@ -468,6 +478,18 @@ struct address_space_operations {
 	int (*launder_page) (struct page *);
 };
 
+/*
+ * pagecache_write_begin/pagecache_write_end must be used by general code
+ * to write into the pagecache.
+ */
+int pagecache_write_begin(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata);
+
+int pagecache_write_end(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata);
+
 struct backing_dev_info;
 struct address_space {
 	struct inode		*host;	/* owner: inode, block_device */
@@ -1866,6 +1888,12 @@ extern int simple_prepare_write(struct file *file, struct page *page,
 		unsigned offset, unsigned to);
 extern int simple_commit_write(struct file *file, struct page *page,
 		unsigned offset, unsigned to);
+extern int simple_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata);
+extern int simple_write_end(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned copied,
+		struct page *page, void *fsdata);
 
 extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *);
 extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 8f1e390fd71b..db8a410ae9e1 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -96,6 +96,8 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start,
 unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
 			int tag, unsigned int nr_pages, struct page **pages);
 
+struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index);
+
 /*
  * Returns locked page at given index in given cache, creating it if needed.
  */
diff --git a/mm/filemap.c b/mm/filemap.c
index 67a03a0a9aee..ec25ba1aef5f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1742,11 +1742,20 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
 	i->count -= bytes;
 }
 
-int iov_iter_fault_in_readable(struct iov_iter *i)
+/*
+ * Fault in the first iovec of the given iov_iter, to a maximum length
+ * of bytes. Returns 0 on success, or non-zero if the memory could not be
+ * accessed (ie. because it is an invalid address).
+ *
+ * writev-intensive code may want this to prefault several iovecs -- that
+ * would be possible (callers must not rely on the fact that _only_ the
+ * first iovec will be faulted with the current implementation).
+ */
+int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
 {
-	size_t seglen = min(i->iov->iov_len - i->iov_offset, i->count);
 	char __user *buf = i->iov->iov_base + i->iov_offset;
-	return fault_in_pages_readable(buf, seglen);
+	bytes = min(bytes, i->iov->iov_len - i->iov_offset);
+	return fault_in_pages_readable(buf, bytes);
 }
 
 /*
@@ -1843,6 +1852,95 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
 }
 EXPORT_SYMBOL(generic_write_checks);
 
+int pagecache_write_begin(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata)
+{
+	const struct address_space_operations *aops = mapping->a_ops;
+
+	if (aops->write_begin) {
+		return aops->write_begin(file, mapping, pos, len, flags,
+							pagep, fsdata);
+	} else {
+		int ret;
+		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+		struct inode *inode = mapping->host;
+		struct page *page;
+again:
+		page = __grab_cache_page(mapping, index);
+		*pagep = page;
+		if (!page)
+			return -ENOMEM;
+
+		if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
+			/*
+			 * There is no way to resolve a short write situation
+			 * for a !Uptodate page (except by double copying in
+			 * the caller done by generic_perform_write_2copy).
+			 *
+			 * Instead, we have to bring it uptodate here.
+			 */
+			ret = aops->readpage(file, page);
+			page_cache_release(page);
+			if (ret) {
+				if (ret == AOP_TRUNCATED_PAGE)
+					goto again;
+				return ret;
+			}
+			goto again;
+		}
+
+		ret = aops->prepare_write(file, page, offset, offset+len);
+		if (ret) {
+			if (ret != AOP_TRUNCATED_PAGE)
+				unlock_page(page);
+			page_cache_release(page);
+			if (pos + len > inode->i_size)
+				vmtruncate(inode, inode->i_size);
+			if (ret == AOP_TRUNCATED_PAGE)
+				goto again;
+		}
+		return ret;
+	}
+}
+EXPORT_SYMBOL(pagecache_write_begin);
+
+int pagecache_write_end(struct file *file, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
+{
+	const struct address_space_operations *aops = mapping->a_ops;
+	int ret;
+
+	if (aops->write_end) {
+		mark_page_accessed(page);
+		ret = aops->write_end(file, mapping, pos, len, copied,
+							page, fsdata);
+	} else {
+		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+		struct inode *inode = mapping->host;
+
+		flush_dcache_page(page);
+		ret = aops->commit_write(file, page, offset, offset+len);
+		unlock_page(page);
+		mark_page_accessed(page);
+		page_cache_release(page);
+		BUG_ON(ret == AOP_TRUNCATED_PAGE); /* can't deal with */
+
+		if (ret < 0) {
+			if (pos + len > inode->i_size)
+				vmtruncate(inode, inode->i_size);
+		} else if (ret > 0)
+			ret = min_t(size_t, copied, ret);
+		else
+			ret = copied;
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL(pagecache_write_end);
+
 ssize_t
 generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
@@ -1886,8 +1984,7 @@ EXPORT_SYMBOL(generic_file_direct_write);
  * Find or create a page at the given pagecache position. Return the locked
  * page. This function is specifically for buffered writes.
  */
-static struct page *__grab_cache_page(struct address_space *mapping,
-							pgoff_t index)
+struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index)
 {
 	int status;
 	struct page *page;
@@ -1908,20 +2005,16 @@ repeat:
 	}
 	return page;
 }
+EXPORT_SYMBOL(__grab_cache_page);
 
-ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos, loff_t *ppos,
-		size_t count, ssize_t written)
+static ssize_t generic_perform_write_2copy(struct file *file,
+				struct iov_iter *i, loff_t pos)
 {
-	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	const struct address_space_operations *a_ops = mapping->a_ops;
 	struct inode *inode = mapping->host;
 	long status = 0;
-	struct iov_iter i;
-
-	iov_iter_init(&i, iov, nr_segs, count, written);
+	ssize_t written = 0;
 
 	do {
 		struct page *src_page;
@@ -1934,7 +2027,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		offset = (pos & (PAGE_CACHE_SIZE - 1));
 		index = pos >> PAGE_CACHE_SHIFT;
 		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
-						iov_iter_count(&i));
+						iov_iter_count(i));
 
 		/*
 		 * a non-NULL src_page indicates that we're doing the
@@ -1952,7 +2045,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		 * to check that the address is actually valid, when atomic
 		 * usercopies are used, below.
 		 */
-		if (unlikely(iov_iter_fault_in_readable(&i))) {
+		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
 			status = -EFAULT;
 			break;
 		}
@@ -1983,7 +2076,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 			 * same reason as we can't take a page fault with a
 			 * page locked (as explained below).
 			 */
-			copied = iov_iter_copy_from_user(src_page, &i,
+			copied = iov_iter_copy_from_user(src_page, i,
 							offset, bytes);
 			if (unlikely(copied == 0)) {
 				status = -EFAULT;
@@ -2008,7 +2101,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 				page_cache_release(src_page);
 				continue;
 			}
-
 		}
 
 		status = a_ops->prepare_write(file, page, offset, offset+bytes);
@@ -2030,7 +2122,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 			 * really matter.
 			 */
 			pagefault_disable();
-			copied = iov_iter_copy_from_user_atomic(page, &i,
+			copied = iov_iter_copy_from_user_atomic(page, i,
 							offset, bytes);
 			pagefault_enable();
 		} else {
@@ -2056,9 +2148,9 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		if (src_page)
 			page_cache_release(src_page);
 
-		iov_iter_advance(&i, copied);
-		written += copied;
+		iov_iter_advance(i, copied);
 		pos += copied;
+		written += copied;
 
 		balance_dirty_pages_ratelimited(mapping);
 		cond_resched();
@@ -2082,13 +2174,117 @@ fs_write_aop_error:
 			continue;
 		else
 			break;
-	} while (iov_iter_count(&i));
-	*ppos = pos;
+	} while (iov_iter_count(i));
+
+	return written ? written : status;
+}
+
+static ssize_t generic_perform_write(struct file *file,
+				struct iov_iter *i, loff_t pos)
+{
+	struct address_space *mapping = file->f_mapping;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	long status = 0;
+	ssize_t written = 0;
+
+	do {
+		struct page *page;
+		pgoff_t index;		/* Pagecache index for current page */
+		unsigned long offset;	/* Offset into pagecache page */
+		unsigned long bytes;	/* Bytes to write to page */
+		size_t copied;		/* Bytes copied from user */
+		void *fsdata;
+
+		offset = (pos & (PAGE_CACHE_SIZE - 1));
+		index = pos >> PAGE_CACHE_SHIFT;
+		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+						iov_iter_count(i));
+
+again:
+
+		/*
+		 * Bring in the user page that we will copy from _first_.
+		 * Otherwise there's a nasty deadlock on copying from the
+		 * same page as we're writing to, without it being marked
+		 * up-to-date.
+		 *
+		 * Not only is this an optimisation, but it is also required
+		 * to check that the address is actually valid, when atomic
+		 * usercopies are used, below.
+		 */
+		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
+			status = -EFAULT;
+			break;
+		}
+
+		status = a_ops->write_begin(file, mapping, pos, bytes, 0,
+						&page, &fsdata);
+		if (unlikely(status))
+			break;
+
+		pagefault_disable();
+		copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+		pagefault_enable();
+		flush_dcache_page(page);
+
+		status = a_ops->write_end(file, mapping, pos, bytes, copied,
+						page, fsdata);
+		if (unlikely(status < 0))
+			break;
+		copied = status;
+
+		cond_resched();
+
+		if (unlikely(copied == 0)) {
+			/*
+			 * If we were unable to copy any data at all, we must
+			 * fall back to a single segment length write.
+			 *
+			 * If we didn't fallback here, we could livelock
+			 * because not all segments in the iov can be copied at
+			 * once without a pagefault.
+			 */
+			bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
+						iov_iter_single_seg_count(i));
+			goto again;
+		}
+		iov_iter_advance(i, copied);
+		pos += copied;
+		written += copied;
+
+		balance_dirty_pages_ratelimited(mapping);
+
+	} while (iov_iter_count(i));
+
+	return written ? written : status;
+}
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos, loff_t *ppos,
+		size_t count, ssize_t written)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	struct inode *inode = mapping->host;
+	ssize_t status;
+	struct iov_iter i;
+
+	iov_iter_init(&i, iov, nr_segs, count, written);
+	if (a_ops->write_begin)
+		status = generic_perform_write(file, &i, pos);
+	else
+		status = generic_perform_write_2copy(file, &i, pos);
 
-	/*
-	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
-	 */
 	if (likely(status >= 0)) {
+		written += status;
+		*ppos = pos + status;
+
+		/*
+		 * For now, when the user asks for O_SYNC, we'll actually give
+		 * O_DSYNC
+		 */
 		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 			if (!a_ops->writepage || !is_sync_kiocb(iocb))
 				status = generic_osync_inode(inode, mapping,