 Documentation/filesystems/Locking |  12
 Documentation/filesystems/vfs.txt |  39
 drivers/block/loop.c              |   5
 fs/fat/inode.c                    |   2
 fs/libfs.c                        |   2
 fs/ocfs2/file.c                   |   3
 fs/splice.c                       |   4
 include/linux/fs.h                |   7
 mm/filemap.c                      | 242
 9 files changed, 23 insertions(+), 293 deletions(-)
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 8362860e21a7..23d2f4460deb 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -161,8 +161,12 @@ prototypes:
 	int (*set_page_dirty)(struct page *page);
 	int (*readpages)(struct file *filp, struct address_space *mapping,
 			struct list_head *pages, unsigned nr_pages);
-	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
-	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+	int (*write_begin)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned flags,
+				struct page **pagep, void **fsdata);
+	int (*write_end)(struct file *, struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata);
 	sector_t (*bmap)(struct address_space *, sector_t);
 	int (*invalidatepage) (struct page *, unsigned long);
 	int (*releasepage) (struct page *, int);
@@ -180,8 +184,6 @@ sync_page:		no	maybe
 writepages:		no
 set_page_dirty		no	no
 readpages:		no
-prepare_write:		no	yes			yes
-commit_write:		no	yes			yes
 write_begin:		no	locks the page		yes
 write_end:		no	yes, unlocks		yes
 perform_write:		no	n/a			yes
@@ -191,7 +193,7 @@ releasepage:		no	yes
 direct_IO:		no
 launder_page:		no	yes
 
-	->prepare_write(), ->commit_write(), ->sync_page() and ->readpage()
+	->write_begin(), ->write_end(), ->sync_page() and ->readpage()
 may be called from the request handler (/dev/loop).
 
 	->readpage() unlocks the page, either synchronously or via I/O
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index c4d348dabe94..5579bda58a6d 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -492,7 +492,7 @@ written-back to storage typically in whole pages, however the
 address_space has finer control of write sizes.
 
 The read process essentially only requires 'readpage'.  The write
-process is more complicated and uses prepare_write/commit_write or
+process is more complicated and uses write_begin/write_end or
 set_page_dirty to write data into the address_space, and writepage,
 sync_page, and writepages to writeback data to storage.
 
@@ -521,8 +521,6 @@ struct address_space_operations {
 	int (*set_page_dirty)(struct page *page);
 	int (*readpages)(struct file *filp, struct address_space *mapping,
 			struct list_head *pages, unsigned nr_pages);
-	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
-	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
 	int (*write_begin)(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata);
@@ -598,37 +596,7 @@ struct address_space_operations {
 	readpages is only used for read-ahead, so read errors are
 	ignored.  If anything goes wrong, feel free to give up.
 
-  prepare_write: called by the generic write path in VM to set up a write
-	request for a page. This indicates to the address space that
-	the given range of bytes is about to be written. The
-	address_space should check that the write will be able to
-	complete, by allocating space if necessary and doing any other
-	internal housekeeping.  If the write will update parts of
-	any basic-blocks on storage, then those blocks should be
-	pre-read (if they haven't been read already) so that the
-	updated blocks can be written out properly.
-	The page will be locked.
-
-	Note: the page _must not_ be marked uptodate in this function
-	(or anywhere else) unless it actually is uptodate right now. As
-	soon as a page is marked uptodate, it is possible for a concurrent
-	read(2) to copy it to userspace.
-
-  commit_write: If prepare_write succeeds, new data will be copied
-	into the page and then commit_write will be called.  It will
-	typically update the size of the file (if appropriate) and
-	mark the inode as dirty, and do any other related housekeeping
-	operations.  It should avoid returning an error if possible -
-	errors should have been handled by prepare_write.
-
-  write_begin: This is intended as a replacement for prepare_write. The
-	key differences being that:
-		- it returns a locked page (in *pagep) rather than being
-		  given a pre locked page;
-		- it must be able to cope with short writes (where the
-		  length passed to write_begin is greater than the number
-		  of bytes copied into the page).
-
+  write_begin:
 	Called by the generic buffered write code to ask the filesystem to
 	prepare to write len bytes at the given offset in the file.  The
 	address_space should check that the write will be able to complete,
@@ -640,6 +608,9 @@ struct address_space_operations {
 	The filesystem must return the locked pagecache page for the specified
 	offset, in *pagep, for the caller to write into.
 
+	It must be able to cope with short writes (where the length passed to
+	write_begin is greater than the number of bytes copied into the page).
+
 	flags is a field for AOP_FLAG_xxx flags, described in
 	include/linux/fs.h.
 
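For illustration, a minimal sketch of the pair described above, for a
hypothetical in-core filesystem (page-sized blocks, nothing to pre-read from
disk). The sketch_* names are invented and not part of this patch; this is
roughly the shape a libfs-style implementation takes, not the kernel's code:

/* Hedged sketch: assumes nothing on disk needs pre-reading. */
static int sketch_write_begin(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
	struct page *page;

	/* Unlike prepare_write, we find and lock the page ourselves. */
	page = __grab_cache_page(mapping, index);
	if (!page)
		return -ENOMEM;
	*pagep = page;

	/*
	 * Zero around the region about to be written, so that a later
	 * SetPageUptodate never exposes stale data to a concurrent read.
	 */
	if (!PageUptodate(page) && len != PAGE_CACHE_SIZE)
		zero_user_segments(page, 0, from,
					from + len, PAGE_CACHE_SIZE);
	return 0;
}

static int sketch_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	unsigned from = pos & (PAGE_CACHE_SIZE - 1);

	/* Cope with a short write: zero the tail that was never copied. */
	if (copied < len && !PageUptodate(page))
		zero_user(page, from + copied, len - copied);

	SetPageUptodate(page);
	if (pos + copied > inode->i_size)
		i_size_write(inode, pos + copied);
	set_page_dirty(page);

	/* write_end unlocks and releases the page itself. */
	unlock_page(page);
	page_cache_release(page);
	return copied;
}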
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 3f09cd8bcc38..5c4ee70d5cf3 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -40,8 +40,7 @@
  *   Heinz Mauelshagen <mge@sistina.com>, Feb 2002
  *
  * Support for falling back on the write file operation when the address space
- * operations prepare_write and/or commit_write are not available on the
- * backing filesystem.
+ * operations write_begin is not available on the backing filesystem.
  *   Anton Altaparmakov, 16 Feb 2005
  *
  * Still To Fix:
@@ -765,7 +764,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	 */
 	if (!file->f_op->splice_read)
 		goto out_putf;
-	if (aops->prepare_write || aops->write_begin)
+	if (aops->write_begin)
 		lo_flags |= LO_FLAGS_USE_AOPS;
 	if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
 		lo_flags |= LO_FLAGS_READ_ONLY;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 19eafbe3c379..2b2eec1283bf 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -175,7 +175,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
 
 	if (rw == WRITE) {
 		/*
-		 * FIXME: blockdev_direct_IO() doesn't use ->prepare_write(),
+		 * FIXME: blockdev_direct_IO() doesn't use ->write_begin(),
 		 * so we need to update the ->mmu_private to block boundary.
 		 *
 		 * But we must fill the remaining area or hole by nul for
diff --git a/fs/libfs.c b/fs/libfs.c
index 74688598bcf7..e960a8321902 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -814,7 +814,7 @@ EXPORT_SYMBOL(simple_getattr);
 EXPORT_SYMBOL(simple_link);
 EXPORT_SYMBOL(simple_lookup);
 EXPORT_SYMBOL(simple_pin_fs);
-EXPORT_SYMBOL(simple_prepare_write);
+EXPORT_UNUSED_SYMBOL(simple_prepare_write);
 EXPORT_SYMBOL(simple_readpage);
 EXPORT_SYMBOL(simple_release_fs);
 EXPORT_SYMBOL(simple_rename);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8d3225a78073..7efe937a415f 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -679,8 +679,7 @@ leave:
 
 /* Some parts of this taken from generic_cont_expand, which turned out
  * to be too fragile to do exactly what we need without us having to
- * worry about recursive locking in ->prepare_write() and
- * ->commit_write(). */
+ * worry about recursive locking in ->write_begin() and ->write_end(). */
 static int ocfs2_write_zero_page(struct inode *inode,
 				 u64 size)
 {
diff --git a/fs/splice.c b/fs/splice.c
index a1e701c27156..1abab5cee4ba 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -731,8 +731,8 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
 	};
 
 	/*
-	 * The actor worker might be calling ->prepare_write and
-	 * ->commit_write. Most of the time, these expect i_mutex to
+	 * The actor worker might be calling ->write_begin and
+	 * ->write_end. Most of the time, these expect i_mutex to
 	 * be held. Since this may result in an ABBA deadlock with
 	 * pipe->inode, we have to order lock acquiry here.
 	 */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5b248d61430c..0dcdd9458f4b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -489,13 +489,6 @@ struct address_space_operations {
 	int (*readpages)(struct file *filp, struct address_space *mapping,
 			struct list_head *pages, unsigned nr_pages);
 
-	/*
-	 * ext3 requires that a successful prepare_write() call be followed
-	 * by a commit_write() call - they must be balanced
-	 */
-	int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
-	int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
-
 	int (*write_begin)(struct file *, struct address_space *mapping,
 				loff_t pos, unsigned len, unsigned flags,
 				struct page **pagep, void **fsdata);
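With the hunk above applied, a filesystem advertises only the new pair. A
hypothetical libfs-style wiring, reusing the sketch_* functions from the
vfs.txt notes earlier (again, illustrative only, not from this patch):

/* Hedged sketch of an ops table after this patch. */
static const struct address_space_operations sketch_aops = {
	.readpage	= simple_readpage,
	.write_begin	= sketch_write_begin,
	.write_end	= sketch_write_end,
};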
diff --git a/mm/filemap.c b/mm/filemap.c
index ab8553658af3..f3e5f8944d17 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2029,48 +2029,8 @@ int pagecache_write_begin(struct file *file, struct address_space *mapping,
 {
 	const struct address_space_operations *aops = mapping->a_ops;
 
-	if (aops->write_begin) {
-		return aops->write_begin(file, mapping, pos, len, flags,
-							pagep, fsdata);
-	} else {
-		int ret;
-		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-		struct inode *inode = mapping->host;
-		struct page *page;
-again:
-		page = __grab_cache_page(mapping, index);
-		*pagep = page;
-		if (!page)
-			return -ENOMEM;
-
-		if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
-			/*
-			 * There is no way to resolve a short write situation
-			 * for a !Uptodate page (except by double copying in
-			 * the caller done by generic_perform_write_2copy).
-			 *
-			 * Instead, we have to bring it uptodate here.
-			 */
-			ret = aops->readpage(file, page);
-			page_cache_release(page);
-			if (ret) {
-				if (ret == AOP_TRUNCATED_PAGE)
-					goto again;
-				return ret;
-			}
-			goto again;
-		}
-
-		ret = aops->prepare_write(file, page, offset, offset+len);
-		if (ret) {
-			unlock_page(page);
-			page_cache_release(page);
-			if (pos + len > inode->i_size)
-				vmtruncate(inode, inode->i_size);
-		}
-		return ret;
-	}
+	return aops->write_begin(file, mapping, pos, len, flags,
+							pagep, fsdata);
 }
 EXPORT_SYMBOL(pagecache_write_begin);
 
@@ -2079,32 +2039,9 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
 			struct page *page, void *fsdata)
 {
 	const struct address_space_operations *aops = mapping->a_ops;
-	int ret;
-
-	if (aops->write_end) {
-		mark_page_accessed(page);
-		ret = aops->write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-	} else {
-		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-		struct inode *inode = mapping->host;
-
-		flush_dcache_page(page);
-		ret = aops->commit_write(file, page, offset, offset+len);
-		unlock_page(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
-
-		if (ret < 0) {
-			if (pos + len > inode->i_size)
-				vmtruncate(inode, inode->i_size);
-		} else if (ret > 0)
-			ret = min_t(size_t, copied, ret);
-		else
-			ret = copied;
-	}
 
-	return ret;
+	mark_page_accessed(page);
+	return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
 }
 EXPORT_SYMBOL(pagecache_write_end);
 
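Seen from the caller's side, the thinned-out wrappers still pair up the same
way. A hypothetical in-kernel helper in the style of the loop driver's aops
path (the sketch_ name is invented; assumes i_mutex is held and the write
stays within one page):

/* Hedged caller-side sketch of the simplified wrappers. */
static int sketch_write_page(struct file *file, loff_t pos,
				const char *buf, unsigned len)
{
	struct address_space *mapping = file->f_mapping;
	unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
	struct page *page;
	void *fsdata;
	char *kaddr;
	int ret;

	/* AOP_FLAG_UNINTERRUPTIBLE asks write_begin not to do a short write. */
	ret = pagecache_write_begin(file, mapping, pos, len,
					AOP_FLAG_UNINTERRUPTIBLE,
					&page, &fsdata);
	if (ret)
		return ret;

	/* The page comes back locked; copy while it cannot go away. */
	kaddr = kmap(page);
	memcpy(kaddr + offset, buf, len);
	kunmap(page);
	flush_dcache_page(page);

	/* pagecache_write_end unlocks and releases the page. */
	ret = pagecache_write_end(file, mapping, pos, len, len,
					page, fsdata);
	return ret < 0 ? ret : 0;
}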
@@ -2226,174 +2163,6 @@ repeat:
 }
 EXPORT_SYMBOL(__grab_cache_page);
 
-static ssize_t generic_perform_write_2copy(struct file *file,
-				struct iov_iter *i, loff_t pos)
-{
-	struct address_space *mapping = file->f_mapping;
-	const struct address_space_operations *a_ops = mapping->a_ops;
-	struct inode *inode = mapping->host;
-	long status = 0;
-	ssize_t written = 0;
-
-	do {
-		struct page *src_page;
-		struct page *page;
-		pgoff_t index;		/* Pagecache index for current page */
-		unsigned long offset;	/* Offset into pagecache page */
-		unsigned long bytes;	/* Bytes to write to page */
-		size_t copied;		/* Bytes copied from user */
-
-		offset = (pos & (PAGE_CACHE_SIZE - 1));
-		index = pos >> PAGE_CACHE_SHIFT;
-		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
-						iov_iter_count(i));
-
-		/*
-		 * a non-NULL src_page indicates that we're doing the
-		 * copy via get_user_pages and kmap.
-		 */
-		src_page = NULL;
-
-		/*
-		 * Bring in the user page that we will copy from _first_.
-		 * Otherwise there's a nasty deadlock on copying from the
-		 * same page as we're writing to, without it being marked
-		 * up-to-date.
-		 *
-		 * Not only is this an optimisation, but it is also required
-		 * to check that the address is actually valid, when atomic
-		 * usercopies are used, below.
-		 */
-		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
-			status = -EFAULT;
-			break;
-		}
-
-		page = __grab_cache_page(mapping, index);
-		if (!page) {
-			status = -ENOMEM;
-			break;
-		}
-
-		/*
-		 * non-uptodate pages cannot cope with short copies, and we
-		 * cannot take a pagefault with the destination page locked.
-		 * So pin the source page to copy it.
-		 */
-		if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) {
-			unlock_page(page);
-
-			src_page = alloc_page(GFP_KERNEL);
-			if (!src_page) {
-				page_cache_release(page);
-				status = -ENOMEM;
-				break;
-			}
-
-			/*
-			 * Cannot get_user_pages with a page locked for the
-			 * same reason as we can't take a page fault with a
-			 * page locked (as explained below).
-			 */
-			copied = iov_iter_copy_from_user(src_page, i,
-								offset, bytes);
-			if (unlikely(copied == 0)) {
-				status = -EFAULT;
-				page_cache_release(page);
-				page_cache_release(src_page);
-				break;
-			}
-			bytes = copied;
-
-			lock_page(page);
-			/*
-			 * Can't handle the page going uptodate here, because
-			 * that means we would use non-atomic usercopies, which
-			 * zero out the tail of the page, which can cause
-			 * zeroes to become transiently visible. We could just
-			 * use a non-zeroing copy, but the APIs aren't too
-			 * consistent.
-			 */
-			if (unlikely(!page->mapping || PageUptodate(page))) {
-				unlock_page(page);
-				page_cache_release(page);
-				page_cache_release(src_page);
-				continue;
-			}
-		}
-
-		status = a_ops->prepare_write(file, page, offset, offset+bytes);
-		if (unlikely(status))
-			goto fs_write_aop_error;
-
-		if (!src_page) {
-			/*
-			 * Must not enter the pagefault handler here, because
-			 * we hold the page lock, so we might recursively
-			 * deadlock on the same lock, or get an ABBA deadlock
-			 * against a different lock, or against the mmap_sem
-			 * (which nests outside the page lock).  So increment
-			 * preempt count, and use _atomic usercopies.
-			 *
-			 * The page is uptodate so we are OK to encounter a
-			 * short copy: if unmodified parts of the page are
-			 * marked dirty and written out to disk, it doesn't
-			 * really matter.
-			 */
-			pagefault_disable();
-			copied = iov_iter_copy_from_user_atomic(page, i,
-								offset, bytes);
-			pagefault_enable();
-		} else {
-			void *src, *dst;
-			src = kmap_atomic(src_page, KM_USER0);
-			dst = kmap_atomic(page, KM_USER1);
-			memcpy(dst + offset, src + offset, bytes);
-			kunmap_atomic(dst, KM_USER1);
-			kunmap_atomic(src, KM_USER0);
-			copied = bytes;
-		}
-		flush_dcache_page(page);
-
-		status = a_ops->commit_write(file, page, offset, offset+bytes);
-		if (unlikely(status < 0))
-			goto fs_write_aop_error;
-		if (unlikely(status > 0)) /* filesystem did partial write */
-			copied = min_t(size_t, copied, status);
-
-		unlock_page(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
-		if (src_page)
-			page_cache_release(src_page);
-
-		iov_iter_advance(i, copied);
-		pos += copied;
-		written += copied;
-
-		balance_dirty_pages_ratelimited(mapping);
-		cond_resched();
-		continue;
-
-fs_write_aop_error:
-		unlock_page(page);
-		page_cache_release(page);
-		if (src_page)
-			page_cache_release(src_page);
-
-		/*
-		 * prepare_write() may have instantiated a few blocks
-		 * outside i_size.  Trim these off again. Don't need
-		 * i_size_read because we hold i_mutex.
-		 */
-		if (pos + bytes > inode->i_size)
-			vmtruncate(inode, inode->i_size);
-		break;
-	} while (iov_iter_count(i));
-
-	return written ? written : status;
-}
-
 static ssize_t generic_perform_write(struct file *file,
 				struct iov_iter *i, loff_t pos)
 {
@@ -2494,10 +2263,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	struct iov_iter i;
 
 	iov_iter_init(&i, iov, nr_segs, count, written);
-	if (a_ops->write_begin)
-		status = generic_perform_write(file, &i, pos);
-	else
-		status = generic_perform_write_2copy(file, &i, pos);
+	status = generic_perform_write(file, &i, pos);
 
 	if (likely(status >= 0)) {
 		written += status;