diff options
author | Filipe Manana <fdmanana@suse.com> | 2014-09-06 17:34:39 -0400 |
---|---|---|
committer | Chris Mason <clm@fb.com> | 2014-09-08 16:56:43 -0400 |
commit | 49dae1bc1c665817e434d01eefaa11967f618243 (patch) | |
tree | 4c306620e23a5c0446ce1826919ed5e62f8f0a5c | |
parent | c47ca32d3aadb234f73389a34c97574085bc9eda (diff) |
Btrfs: fix fsync data loss after a ranged fsync
While we're doing a full fsync (when the inode has the flag
BTRFS_INODE_NEEDS_FULL_SYNC set) that is ranged too (covers only a
portion of the file), we might have ordered operations that are started
before or while we're logging the inode and that fall outside the fsync
range.
Therefore when a full ranged fsync finishes don't remove every extent
map from the list of modified extent maps - as for some of them, that
fall outside our fsync range, their respective ordered operation hasn't
finished yet, meaning the corresponding file extent item wasn't inserted
into the fs/subvol tree yet and therefore we didn't log it, and we must
let the next fast fsync (one that checks only the modified list) see this
extent map and log a matching file extent item to the log btree and wait
for its ordered operation to finish (if it's still ongoing).
A test case for xfstests follows.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
-rw-r--r-- | fs/btrfs/file.c | 2 | ||||
-rw-r--r-- | fs/btrfs/tree-log.c | 77 | ||||
-rw-r--r-- | fs/btrfs/tree-log.h | 2 |
3 files changed, 64 insertions, 17 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 36861b7a6757..ff1cc0399b9a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
@@ -1966,7 +1966,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
1966 | 1966 | ||
1967 | btrfs_init_log_ctx(&ctx); | 1967 | btrfs_init_log_ctx(&ctx); |
1968 | 1968 | ||
1969 | ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx); | 1969 | ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); |
1970 | if (ret < 0) { | 1970 | if (ret < 0) { |
1971 | /* Fallthrough and commit/free transaction. */ | 1971 | /* Fallthrough and commit/free transaction. */ |
1972 | ret = 1; | 1972 | ret = 1; |
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7e0e6e3029dd..d296efe2d3e7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
@@ -94,8 +94,10 @@ | |||
94 | #define LOG_WALK_REPLAY_ALL 3 | 94 | #define LOG_WALK_REPLAY_ALL 3 |
95 | 95 | ||
96 | static int btrfs_log_inode(struct btrfs_trans_handle *trans, | 96 | static int btrfs_log_inode(struct btrfs_trans_handle *trans, |
97 | struct btrfs_root *root, struct inode *inode, | 97 | struct btrfs_root *root, struct inode *inode, |
98 | int inode_only); | 98 | int inode_only, |
99 | const loff_t start, | ||
100 | const loff_t end); | ||
99 | static int link_to_fixup_dir(struct btrfs_trans_handle *trans, | 101 | static int link_to_fixup_dir(struct btrfs_trans_handle *trans, |
100 | struct btrfs_root *root, | 102 | struct btrfs_root *root, |
101 | struct btrfs_path *path, u64 objectid); | 103 | struct btrfs_path *path, u64 objectid); |
@@ -3858,8 +3860,10 @@ process: | |||
3858 | * This handles both files and directories. | 3860 | * This handles both files and directories. |
3859 | */ | 3861 | */ |
3860 | static int btrfs_log_inode(struct btrfs_trans_handle *trans, | 3862 | static int btrfs_log_inode(struct btrfs_trans_handle *trans, |
3861 | struct btrfs_root *root, struct inode *inode, | 3863 | struct btrfs_root *root, struct inode *inode, |
3862 | int inode_only) | 3864 | int inode_only, |
3865 | const loff_t start, | ||
3866 | const loff_t end) | ||
3863 | { | 3867 | { |
3864 | struct btrfs_path *path; | 3868 | struct btrfs_path *path; |
3865 | struct btrfs_path *dst_path; | 3869 | struct btrfs_path *dst_path; |
@@ -3876,6 +3880,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, | |||
3876 | int ins_nr; | 3880 | int ins_nr; |
3877 | bool fast_search = false; | 3881 | bool fast_search = false; |
3878 | u64 ino = btrfs_ino(inode); | 3882 | u64 ino = btrfs_ino(inode); |
3883 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | ||
3879 | 3884 | ||
3880 | path = btrfs_alloc_path(); | 3885 | path = btrfs_alloc_path(); |
3881 | if (!path) | 3886 | if (!path) |
@@ -4049,13 +4054,35 @@ log_extents: | |||
4049 | goto out_unlock; | 4054 | goto out_unlock; |
4050 | } | 4055 | } |
4051 | } else if (inode_only == LOG_INODE_ALL) { | 4056 | } else if (inode_only == LOG_INODE_ALL) { |
4052 | struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; | ||
4053 | struct extent_map *em, *n; | 4057 | struct extent_map *em, *n; |
4054 | 4058 | ||
4055 | write_lock(&tree->lock); | 4059 | write_lock(&em_tree->lock); |
4056 | list_for_each_entry_safe(em, n, &tree->modified_extents, list) | 4060 | /* |
4057 | list_del_init(&em->list); | 4061 | * We can't just remove every em if we're called for a ranged |
4058 | write_unlock(&tree->lock); | 4062 | * fsync - that is, one that doesn't cover the whole possible |
4063 | * file range (0 to LLONG_MAX). This is because we can have | ||
4064 | * em's that fall outside the range we're logging and therefore | ||
4065 | * their ordered operations haven't completed yet | ||
4066 | * (btrfs_finish_ordered_io() not invoked yet). This means we | ||
4067 | * didn't get their respective file extent item in the fs/subvol | ||
4068 | * tree yet, and need to let the next fast fsync (one which | ||
4069 | * consults the list of modified extent maps) find the em so | ||
4070 | * that it logs a matching file extent item and waits for the | ||
4071 | * respective ordered operation to complete (if it's still | ||
4072 | * running). | ||
4073 | * | ||
4074 | * Removing every em outside the range we're logging would make | ||
4075 | * the next fast fsync not log their matching file extent items, | ||
4076 | * therefore making us lose data after a log replay. | ||
4077 | */ | ||
4078 | list_for_each_entry_safe(em, n, &em_tree->modified_extents, | ||
4079 | list) { | ||
4080 | const u64 mod_end = em->mod_start + em->mod_len - 1; | ||
4081 | |||
4082 | if (em->mod_start >= start && mod_end <= end) | ||
4083 | list_del_init(&em->list); | ||
4084 | } | ||
4085 | write_unlock(&em_tree->lock); | ||
4059 | } | 4086 | } |
4060 | 4087 | ||
4061 | if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { | 4088 | if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { |
@@ -4065,8 +4092,19 @@ log_extents: | |||
4065 | goto out_unlock; | 4092 | goto out_unlock; |
4066 | } | 4093 | } |
4067 | } | 4094 | } |
4068 | BTRFS_I(inode)->logged_trans = trans->transid; | 4095 | |
4069 | BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; | 4096 | write_lock(&em_tree->lock); |
4097 | /* | ||
4098 | * If we're doing a ranged fsync and there are still modified extents | ||
4099 | * in the list, we must run on the next fsync call as it might cover | ||
4100 | * those extents (a full fsync or an fsync for other range). | ||
4101 | */ | ||
4102 | if (list_empty(&em_tree->modified_extents)) { | ||
4103 | BTRFS_I(inode)->logged_trans = trans->transid; | ||
4104 | BTRFS_I(inode)->last_log_commit = | ||
4105 | BTRFS_I(inode)->last_sub_trans; | ||
4106 | } | ||
4107 | write_unlock(&em_tree->lock); | ||
4070 | out_unlock: | 4108 | out_unlock: |
4071 | if (unlikely(err)) | 4109 | if (unlikely(err)) |
4072 | btrfs_put_logged_extents(&logged_list); | 4110 | btrfs_put_logged_extents(&logged_list); |
@@ -4161,7 +4199,10 @@ out: | |||
4161 | */ | 4199 | */ |
4162 | static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, | 4200 | static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, |
4163 | struct btrfs_root *root, struct inode *inode, | 4201 | struct btrfs_root *root, struct inode *inode, |
4164 | struct dentry *parent, int exists_only, | 4202 | struct dentry *parent, |
4203 | const loff_t start, | ||
4204 | const loff_t end, | ||
4205 | int exists_only, | ||
4165 | struct btrfs_log_ctx *ctx) | 4206 | struct btrfs_log_ctx *ctx) |
4166 | { | 4207 | { |
4167 | int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; | 4208 | int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; |
@@ -4207,7 +4248,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, | |||
4207 | if (ret) | 4248 | if (ret) |
4208 | goto end_no_trans; | 4249 | goto end_no_trans; |
4209 | 4250 | ||
4210 | ret = btrfs_log_inode(trans, root, inode, inode_only); | 4251 | ret = btrfs_log_inode(trans, root, inode, inode_only, start, end); |
4211 | if (ret) | 4252 | if (ret) |
4212 | goto end_trans; | 4253 | goto end_trans; |
4213 | 4254 | ||
@@ -4235,7 +4276,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, | |||
4235 | 4276 | ||
4236 | if (BTRFS_I(inode)->generation > | 4277 | if (BTRFS_I(inode)->generation > |
4237 | root->fs_info->last_trans_committed) { | 4278 | root->fs_info->last_trans_committed) { |
4238 | ret = btrfs_log_inode(trans, root, inode, inode_only); | 4279 | ret = btrfs_log_inode(trans, root, inode, inode_only, |
4280 | 0, LLONG_MAX); | ||
4239 | if (ret) | 4281 | if (ret) |
4240 | goto end_trans; | 4282 | goto end_trans; |
4241 | } | 4283 | } |
@@ -4269,13 +4311,15 @@ end_no_trans: | |||
4269 | */ | 4311 | */ |
4270 | int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, | 4312 | int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, |
4271 | struct btrfs_root *root, struct dentry *dentry, | 4313 | struct btrfs_root *root, struct dentry *dentry, |
4314 | const loff_t start, | ||
4315 | const loff_t end, | ||
4272 | struct btrfs_log_ctx *ctx) | 4316 | struct btrfs_log_ctx *ctx) |
4273 | { | 4317 | { |
4274 | struct dentry *parent = dget_parent(dentry); | 4318 | struct dentry *parent = dget_parent(dentry); |
4275 | int ret; | 4319 | int ret; |
4276 | 4320 | ||
4277 | ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, | 4321 | ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, |
4278 | 0, ctx); | 4322 | start, end, 0, ctx); |
4279 | dput(parent); | 4323 | dput(parent); |
4280 | 4324 | ||
4281 | return ret; | 4325 | return ret; |
@@ -4512,6 +4556,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, | |||
4512 | root->fs_info->last_trans_committed)) | 4556 | root->fs_info->last_trans_committed)) |
4513 | return 0; | 4557 | return 0; |
4514 | 4558 | ||
4515 | return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL); | 4559 | return btrfs_log_inode_parent(trans, root, inode, parent, 0, |
4560 | LLONG_MAX, 1, NULL); | ||
4516 | } | 4561 | } |
4517 | 4562 | ||
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 7f5b41bd5373..e2e798ae7cd7 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h | |||
@@ -59,6 +59,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, | |||
59 | int btrfs_recover_log_trees(struct btrfs_root *tree_root); | 59 | int btrfs_recover_log_trees(struct btrfs_root *tree_root); |
60 | int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, | 60 | int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, |
61 | struct btrfs_root *root, struct dentry *dentry, | 61 | struct btrfs_root *root, struct dentry *dentry, |
62 | const loff_t start, | ||
63 | const loff_t end, | ||
62 | struct btrfs_log_ctx *ctx); | 64 | struct btrfs_log_ctx *ctx); |
63 | int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, | 65 | int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, |
64 | struct btrfs_root *root, | 66 | struct btrfs_root *root, |