aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2015-02-13 07:30:56 -0500
committerChris Mason <clm@fb.com>2015-02-14 11:22:49 -0500
commit1a4bcf470c886b955adf36486f4c86f2441d85cb (patch)
treeb3c95c50eb689718020926319de252c33bd82fd5 /fs/btrfs
parent3d84be799194147e04c0e3129ed44a948773b80a (diff)
Btrfs: fix fsync data loss after adding hard link to inode
We have a scenario where after the fsync log replay we can lose file data that had been previously fsync'ed if we added a hard link for our inode and after that we sync'ed the fsync log (for example by fsync'ing some other file or directory). This is because when adding a hard link we updated the inode item in the log tree with an i_size value of 0. At that point the new inode item was in memory only and a subsequent fsync log replay would not make us lose the file data. However if after adding the hard link we sync the log tree to disk, by fsync'ing some other file or directory for example, we ended up losing the file data after log replay, because the inode item in the persisted log tree had an i_size of zero. This is easy to reproduce, and the following excerpt from my test for xfstests shows this: _scratch_mkfs >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create one file with data and fsync it. # This made the btrfs fsync log persist the data and the inode metadata with # a correct inode->i_size (4096 bytes). $XFS_IO_PROG -f -c "pwrite -S 0xaa -b 4K 0 4K" -c "fsync" \ $SCRATCH_MNT/foo | _filter_xfs_io # Now add one hard link to our file. This made the btrfs code update the fsync # log, in memory only, with an inode metadata having a size of 0. ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link # Now force persistence of the fsync log to disk, for example, by fsyncing some # other file. touch $SCRATCH_MNT/bar $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/bar # Before a power loss or crash, we could read the 4Kb of data from our file as # expected. echo "File content before:" od -t x1 $SCRATCH_MNT/foo # Simulate a crash/power loss. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # After the fsync log replay, because the fsync log had a value of 0 for our # inode's i_size, we couldn't read anymore the 4Kb of data that we previously # wrote and fsync'ed. The size of the file became 0 after the fsync log replay. 
echo "File content after:" od -t x1 $SCRATCH_MNT/foo Another alternative test, that doesn't need to fsync an inode in the same transaction it was created, is: _scratch_mkfs >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create our test file with some data. $XFS_IO_PROG -f -c "pwrite -S 0xaa -b 8K 0 8K" \ $SCRATCH_MNT/foo | _filter_xfs_io # Make sure the file is durably persisted. sync # Append some data to our file, to increase its size. $XFS_IO_PROG -f -c "pwrite -S 0xcc -b 4K 8K 4K" \ $SCRATCH_MNT/foo | _filter_xfs_io # Fsync the file, so from this point on if a crash/power failure happens, our # new data is guaranteed to be there next time the fs is mounted. $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo # Add one hard link to our file. This made btrfs write into the in memory fsync # log a special inode with generation 0 and an i_size of 0 too. Note that this # didn't update the inode in the fsync log on disk. ln $SCRATCH_MNT/foo $SCRATCH_MNT/foo_link # Now make sure the in memory fsync log is durably persisted. # Creating and fsync'ing another file will do it. touch $SCRATCH_MNT/bar $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/bar # As expected, before the crash/power failure, we should be able to read the # 12Kb of file data. echo "File content before:" od -t x1 $SCRATCH_MNT/foo # Simulate a crash/power loss. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # After mounting the fs again, the fsync log was replayed. # The btrfs fsync log replay code didn't update the i_size of the persisted # inode because the inode item in the log had a special generation with a # value of 0 (and it couldn't know the correct i_size, since that inode item # had a 0 i_size too). This made the last 4Kb of file data inaccessible and # effectively lost. echo "File content after:" od -t x1 $SCRATCH_MNT/foo This isn't a new issue/regression. 
This problem has been around since the log tree code was added in 2008: Btrfs: Add a write ahead tree log to optimize synchronous operations (commit e02119d5a7b4396c5a872582fddc8bd6d305a70a) Test cases for xfstests follow soon. CC: <stable@vger.kernel.org> Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/tree-log.c82
1 files changed, 73 insertions, 9 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7870bdba26b7..5f649bb32bec 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -490,8 +490,20 @@ insert:
490 src_item = (struct btrfs_inode_item *)src_ptr; 490 src_item = (struct btrfs_inode_item *)src_ptr;
491 dst_item = (struct btrfs_inode_item *)dst_ptr; 491 dst_item = (struct btrfs_inode_item *)dst_ptr;
492 492
493 if (btrfs_inode_generation(eb, src_item) == 0) 493 if (btrfs_inode_generation(eb, src_item) == 0) {
494 struct extent_buffer *dst_eb = path->nodes[0];
495
496 if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
497 S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
498 struct btrfs_map_token token;
499 u64 ino_size = btrfs_inode_size(eb, src_item);
500
501 btrfs_init_map_token(&token);
502 btrfs_set_token_inode_size(dst_eb, dst_item,
503 ino_size, &token);
504 }
494 goto no_copy; 505 goto no_copy;
506 }
495 507
496 if (overwrite_root && 508 if (overwrite_root &&
497 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 509 S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
@@ -3250,7 +3262,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
3250static void fill_inode_item(struct btrfs_trans_handle *trans, 3262static void fill_inode_item(struct btrfs_trans_handle *trans,
3251 struct extent_buffer *leaf, 3263 struct extent_buffer *leaf,
3252 struct btrfs_inode_item *item, 3264 struct btrfs_inode_item *item,
3253 struct inode *inode, int log_inode_only) 3265 struct inode *inode, int log_inode_only,
3266 u64 logged_isize)
3254{ 3267{
3255 struct btrfs_map_token token; 3268 struct btrfs_map_token token;
3256 3269
@@ -3263,7 +3276,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
3263 * to say 'update this inode with these values' 3276 * to say 'update this inode with these values'
3264 */ 3277 */
3265 btrfs_set_token_inode_generation(leaf, item, 0, &token); 3278 btrfs_set_token_inode_generation(leaf, item, 0, &token);
3266 btrfs_set_token_inode_size(leaf, item, 0, &token); 3279 btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
3267 } else { 3280 } else {
3268 btrfs_set_token_inode_generation(leaf, item, 3281 btrfs_set_token_inode_generation(leaf, item,
3269 BTRFS_I(inode)->generation, 3282 BTRFS_I(inode)->generation,
@@ -3315,7 +3328,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
3315 return ret; 3328 return ret;
3316 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3329 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3317 struct btrfs_inode_item); 3330 struct btrfs_inode_item);
3318 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); 3331 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
3319 btrfs_release_path(path); 3332 btrfs_release_path(path);
3320 return 0; 3333 return 0;
3321} 3334}
@@ -3324,7 +3337,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3324 struct inode *inode, 3337 struct inode *inode,
3325 struct btrfs_path *dst_path, 3338 struct btrfs_path *dst_path,
3326 struct btrfs_path *src_path, u64 *last_extent, 3339 struct btrfs_path *src_path, u64 *last_extent,
3327 int start_slot, int nr, int inode_only) 3340 int start_slot, int nr, int inode_only,
3341 u64 logged_isize)
3328{ 3342{
3329 unsigned long src_offset; 3343 unsigned long src_offset;
3330 unsigned long dst_offset; 3344 unsigned long dst_offset;
@@ -3381,7 +3395,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3381 dst_path->slots[0], 3395 dst_path->slots[0],
3382 struct btrfs_inode_item); 3396 struct btrfs_inode_item);
3383 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3397 fill_inode_item(trans, dst_path->nodes[0], inode_item,
3384 inode, inode_only == LOG_INODE_EXISTS); 3398 inode, inode_only == LOG_INODE_EXISTS,
3399 logged_isize);
3385 } else { 3400 } else {
3386 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3401 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
3387 src_offset, ins_sizes[i]); 3402 src_offset, ins_sizes[i]);
@@ -3933,6 +3948,33 @@ process:
3933 return ret; 3948 return ret;
3934} 3949}
3935 3950
3951static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
3952 struct btrfs_path *path, u64 *size_ret)
3953{
3954 struct btrfs_key key;
3955 int ret;
3956
3957 key.objectid = btrfs_ino(inode);
3958 key.type = BTRFS_INODE_ITEM_KEY;
3959 key.offset = 0;
3960
3961 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
3962 if (ret < 0) {
3963 return ret;
3964 } else if (ret > 0) {
3965 *size_ret = i_size_read(inode);
3966 } else {
3967 struct btrfs_inode_item *item;
3968
3969 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3970 struct btrfs_inode_item);
3971 *size_ret = btrfs_inode_size(path->nodes[0], item);
3972 }
3973
3974 btrfs_release_path(path);
3975 return 0;
3976}
3977
3936/* log a single inode in the tree log. 3978/* log a single inode in the tree log.
3937 * At least one parent directory for this inode must exist in the tree 3979 * At least one parent directory for this inode must exist in the tree
3938 * or be logged already. 3980 * or be logged already.
@@ -3970,6 +4012,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3970 bool fast_search = false; 4012 bool fast_search = false;
3971 u64 ino = btrfs_ino(inode); 4013 u64 ino = btrfs_ino(inode);
3972 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4014 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4015 u64 logged_isize = 0;
3973 4016
3974 path = btrfs_alloc_path(); 4017 path = btrfs_alloc_path();
3975 if (!path) 4018 if (!path)
@@ -4030,6 +4073,25 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4030 max_key_type = BTRFS_XATTR_ITEM_KEY; 4073 max_key_type = BTRFS_XATTR_ITEM_KEY;
4031 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4074 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
4032 } else { 4075 } else {
4076 if (inode_only == LOG_INODE_EXISTS) {
4077 /*
4078 * Make sure the new inode item we write to the log has
4079 * the same isize as the current one (if it exists).
4080 * This is necessary to prevent data loss after log
4081 * replay, and also to prevent doing a wrong expanding
4082 * truncate - for e.g. create file, write 4K into offset
4083 * 0, fsync, write 4K into offset 4096, add hard link,
4084 * fsync some other file (to sync log), power fail - if
4085 * we use the inode's current i_size, after log replay
4086 * we get a 8Kb file, with the last 4Kb extent as a hole
4087 * (zeroes), as if an expanding truncate happened,
4088 * instead of getting a file of 4Kb only.
4089 */
4090 err = logged_inode_size(log, inode, path,
4091 &logged_isize);
4092 if (err)
4093 goto out_unlock;
4094 }
4033 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4095 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4034 &BTRFS_I(inode)->runtime_flags)) { 4096 &BTRFS_I(inode)->runtime_flags)) {
4035 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4097 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
@@ -4085,7 +4147,8 @@ again:
4085 } 4147 }
4086 4148
4087 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4149 ret = copy_items(trans, inode, dst_path, path, &last_extent,
4088 ins_start_slot, ins_nr, inode_only); 4150 ins_start_slot, ins_nr, inode_only,
4151 logged_isize);
4089 if (ret < 0) { 4152 if (ret < 0) {
4090 err = ret; 4153 err = ret;
4091 goto out_unlock; 4154 goto out_unlock;
@@ -4109,7 +4172,7 @@ next_slot:
4109 if (ins_nr) { 4172 if (ins_nr) {
4110 ret = copy_items(trans, inode, dst_path, path, 4173 ret = copy_items(trans, inode, dst_path, path,
4111 &last_extent, ins_start_slot, 4174 &last_extent, ins_start_slot,
4112 ins_nr, inode_only); 4175 ins_nr, inode_only, logged_isize);
4113 if (ret < 0) { 4176 if (ret < 0) {
4114 err = ret; 4177 err = ret;
4115 goto out_unlock; 4178 goto out_unlock;
@@ -4130,7 +4193,8 @@ next_slot:
4130 } 4193 }
4131 if (ins_nr) { 4194 if (ins_nr) {
4132 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4195 ret = copy_items(trans, inode, dst_path, path, &last_extent,
4133 ins_start_slot, ins_nr, inode_only); 4196 ins_start_slot, ins_nr, inode_only,
4197 logged_isize);
4134 if (ret < 0) { 4198 if (ret < 0) {
4135 err = ret; 4199 err = ret;
4136 goto out_unlock; 4200 goto out_unlock;