aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2014-11-13 12:01:45 -0500
committerChris Mason <clm@fb.com>2014-11-21 14:59:57 -0500
commitb38ef71cb102208dffcf4e8524e9d5ec4ec0eaa9 (patch)
treed2417a15186a9d048558564a1412d11f02fa0da0
parent0870295b2371673b3563735825ad559409d8cedc (diff)
Btrfs: ensure ordered extent errors aren't missed on fsync
When doing a fsync with a fast path we have a time window where we can miss the fact that writeback of some file data failed, and therefore we endup returning success (0) from fsync when we should return an error. The steps that lead to this are the following: 1) We start all ordered extents by calling filemap_fdatawrite_range(); 2) We do some other work like locking the inode's i_mutex, start a transaction, start a log transaction, etc; 3) We enter btrfs_log_inode(), acquire the inode's log_mutex and collect all the ordered extents from inode's ordered tree into a list; 4) But by the time we do ordered extent collection, some ordered extents we started at step 1) might have already completed with an error, and therefore we didn't found them in the ordered tree and had no idea they finished with an error. This makes our fsync return success (0) to userspace, but has no bad effects on the log like for example insertion of file extent items into the log that point to unwritten extents, because the invalid extent maps were removed before the ordered extent completed (in inode.c:btrfs_finish_ordered_io). So after collecting the ordered extents just check if the inode's i_mapping has any error flags set (AS_EIO or AS_ENOSPC) and leave with an error if it does. Whenever writeback fails for a page of an ordered extent, we call mapping_set_error (done in extent_io.c:end_extent_writepage, called by extent_io.c:end_bio_extent_writepage) that sets one of those error flags in the inode's i_mapping flags. This change also has the side effect of fixing the issue where for fast fsyncs we never checked/cleared the error flags from the inode's i_mapping flags, which means that a full fsync performed after a fast fsync could get such errors that belonged to the fast fsync - because the full fsync calls btrfs_wait_ordered_range() which calls filemap_fdatawait_range(), and the later checks for and clears those flags, while for fast fsyncs we never call filemap_fdatawait_range() or anything else that checks for and clears the error flags from the inode's i_mapping. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
-rw-r--r--fs/btrfs/ctree.h1
-rw-r--r--fs/btrfs/inode.c15
-rw-r--r--fs/btrfs/tree-log.c21
3 files changed, 37 insertions, 0 deletions
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a9466e346358..49d956b2cf30 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3866,6 +3866,7 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
3866 struct btrfs_trans_handle *trans, int mode, 3866 struct btrfs_trans_handle *trans, int mode,
3867 u64 start, u64 num_bytes, u64 min_size, 3867 u64 start, u64 num_bytes, u64 min_size,
3868 loff_t actual_len, u64 *alloc_hint); 3868 loff_t actual_len, u64 *alloc_hint);
3869int btrfs_inode_check_errors(struct inode *inode);
3869extern const struct dentry_operations btrfs_dentry_operations; 3870extern const struct dentry_operations btrfs_dentry_operations;
3870 3871
3871/* ioctl.c */ 3872/* ioctl.c */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index cb5978a4a277..a5374c2bb943 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9464,6 +9464,21 @@ out_inode:
9464 9464
9465} 9465}
9466 9466
9467/* Inspired by filemap_check_errors() */
9468int btrfs_inode_check_errors(struct inode *inode)
9469{
9470 int ret = 0;
9471
9472 if (test_bit(AS_ENOSPC, &inode->i_mapping->flags) &&
9473 test_and_clear_bit(AS_ENOSPC, &inode->i_mapping->flags))
9474 ret = -ENOSPC;
9475 if (test_bit(AS_EIO, &inode->i_mapping->flags) &&
9476 test_and_clear_bit(AS_EIO, &inode->i_mapping->flags))
9477 ret = -EIO;
9478
9479 return ret;
9480}
9481
9467static const struct inode_operations btrfs_dir_inode_operations = { 9482static const struct inode_operations btrfs_dir_inode_operations = {
9468 .getattr = btrfs_getattr, 9483 .getattr = btrfs_getattr,
9469 .lookup = btrfs_lookup, 9484 .lookup = btrfs_lookup,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 3883d0febd82..9a02da16f2be 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3635,6 +3635,12 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans,
3635 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); 3635 test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)));
3636 3636
3637 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { 3637 if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) {
3638 /*
3639 * Clear the AS_EIO/AS_ENOSPC flags from the inode's
3640 * i_mapping flags, so that the next fsync won't get
3641 * an outdated io error too.
3642 */
3643 btrfs_inode_check_errors(inode);
3638 *ordered_io_error = true; 3644 *ordered_io_error = true;
3639 break; 3645 break;
3640 } 3646 }
@@ -4098,6 +4104,21 @@ log_extents:
4098 btrfs_release_path(path); 4104 btrfs_release_path(path);
4099 btrfs_release_path(dst_path); 4105 btrfs_release_path(dst_path);
4100 if (fast_search) { 4106 if (fast_search) {
4107 /*
4108 * Some ordered extents started by fsync might have completed
4109 * before we collected the ordered extents in logged_list, which
4110 * means they're gone, not in our logged_list nor in the inode's
4111 * ordered tree. We want the application/user space to know an
4112 * error happened while attempting to persist file data so that
4113 * it can take proper action. If such error happened, we leave
4114 * without writing to the log tree and the fsync must report the
4115 * file data write error and not commit the current transaction.
4116 */
4117 err = btrfs_inode_check_errors(inode);
4118 if (err) {
4119 ctx->io_err = err;
4120 goto out_unlock;
4121 }
4101 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4122 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4102 &logged_list, ctx); 4123 &logged_list, ctx);
4103 if (ret) { 4124 if (ret) {