aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2015-02-15 17:38:54 -0500
committerChris Mason <clm@fb.com>2015-03-02 17:04:44 -0500
commite8c1c76e804b18120e6977fc092769c043876212 (patch)
tree2bb67d4ae79589af33480772150e83ac5ca78c56
parent0c0ef4bc842ba6b593bb94f9fb8b653fe18c5ed8 (diff)
Btrfs: add missing inode update when punching hole
When punching a file hole if we endup only zeroing parts of a page, because the start offset isn't a multiple of the sector size or the start offset and length fall within the same page, we were not updating the inode item. This prevented an fsync from doing anything, if no other file changes happened in the current transaction, because the fields in btrfs_inode used to check if the inode needs to be fsync'ed weren't updated. This issue is easy to reproduce and the following excerpt from the xfstest case I made shows how to trigger it: _scratch_mkfs >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create our test file. $XFS_IO_PROG -f -c "pwrite -S 0x22 -b 16K 0 16K" \ $SCRATCH_MNT/foo | _filter_xfs_io # Fsync the file, this makes btrfs update some btrfs inode specific fields # that are used to track if the inode needs to be written/updated to the fsync # log or not. After this fsync, the new values for those fields indicate that # a subsequent fsync does not need to touch the fsync log. $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo # Force a commit of the current transaction. After this point, any operation # that modifies the data or metadata of our file, should update those fields in # the btrfs inode with values that make the next fsync operation write to the # fsync log. sync # Punch a hole in our file. This small range affects only 1 page. # This made the btrfs hole punching implementation write only some zeroes in # one page, but it did not update the btrfs inode fields used to determine if # the next fsync needs to write to the fsync log. $XFS_IO_PROG -c "fpunch 8000 4K" $SCRATCH_MNT/foo # Another variation of the previously mentioned case. $XFS_IO_PROG -c "fpunch 15000 100" $SCRATCH_MNT/foo # Now fsync the file. This was a no-operation because the previous hole punch # operation didn't update the inode's fields mentioned before, so they remained # with the values they had after the first fsync - that is, they indicate that # it is not needed to write to fsync log. $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo echo "File content before:" od -t x1 $SCRATCH_MNT/foo # Simulate a crash/power loss. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey # Enable writes and mount the fs. This makes the fsync log replay code run. _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # Because the last fsync didn't do anything, here the file content matched what # it was after the first fsync, before the holes were punched, and not what it # was after the holes were punched. echo "File content after:" od -t x1 $SCRATCH_MNT/foo This issue has been around since 2012, when the punch hole implementation was added, commit 2aaa66558172 ("Btrfs: add hole punching"). A test case for xfstests follows soon. Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Liu Bo <bo.li.liu@oracle.com> Signed-off-by: Chris Mason <clm@fb.com>
-rw-r--r--fs/btrfs/file.c31
1 files changed, 28 insertions, 3 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e4090259569b..b476e5645034 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2276,6 +2276,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2276 bool same_page; 2276 bool same_page;
2277 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); 2277 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2278 u64 ino_size; 2278 u64 ino_size;
2279 bool truncated_page = false;
2280 bool updated_inode = false;
2279 2281
2280 ret = btrfs_wait_ordered_range(inode, offset, len); 2282 ret = btrfs_wait_ordered_range(inode, offset, len);
2281 if (ret) 2283 if (ret)
@@ -2307,13 +2309,18 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2307 * entire page. 2309 * entire page.
2308 */ 2310 */
2309 if (same_page && len < PAGE_CACHE_SIZE) { 2311 if (same_page && len < PAGE_CACHE_SIZE) {
2310 if (offset < ino_size) 2312 if (offset < ino_size) {
2313 truncated_page = true;
2311 ret = btrfs_truncate_page(inode, offset, len, 0); 2314 ret = btrfs_truncate_page(inode, offset, len, 0);
2315 } else {
2316 ret = 0;
2317 }
2312 goto out_only_mutex; 2318 goto out_only_mutex;
2313 } 2319 }
2314 2320
2315 /* zero back part of the first page */ 2321 /* zero back part of the first page */
2316 if (offset < ino_size) { 2322 if (offset < ino_size) {
2323 truncated_page = true;
2317 ret = btrfs_truncate_page(inode, offset, 0, 0); 2324 ret = btrfs_truncate_page(inode, offset, 0, 0);
2318 if (ret) { 2325 if (ret) {
2319 mutex_unlock(&inode->i_mutex); 2326 mutex_unlock(&inode->i_mutex);
@@ -2349,6 +2356,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2349 if (!ret) { 2356 if (!ret) {
2350 /* zero the front end of the last page */ 2357 /* zero the front end of the last page */
2351 if (tail_start + tail_len < ino_size) { 2358 if (tail_start + tail_len < ino_size) {
2359 truncated_page = true;
2352 ret = btrfs_truncate_page(inode, 2360 ret = btrfs_truncate_page(inode,
2353 tail_start + tail_len, 0, 1); 2361 tail_start + tail_len, 0, 1);
2354 if (ret) 2362 if (ret)
@@ -2358,8 +2366,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2358 } 2366 }
2359 2367
2360 if (lockend < lockstart) { 2368 if (lockend < lockstart) {
2361 mutex_unlock(&inode->i_mutex); 2369 ret = 0;
2362 return 0; 2370 goto out_only_mutex;
2363 } 2371 }
2364 2372
2365 while (1) { 2373 while (1) {
@@ -2507,6 +2515,7 @@ out_trans:
2507 2515
2508 trans->block_rsv = &root->fs_info->trans_block_rsv; 2516 trans->block_rsv = &root->fs_info->trans_block_rsv;
2509 ret = btrfs_update_inode(trans, root, inode); 2517 ret = btrfs_update_inode(trans, root, inode);
2518 updated_inode = true;
2510 btrfs_end_transaction(trans, root); 2519 btrfs_end_transaction(trans, root);
2511 btrfs_btree_balance_dirty(root); 2520 btrfs_btree_balance_dirty(root);
2512out_free: 2521out_free:
@@ -2516,6 +2525,22 @@ out:
2516 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, 2525 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2517 &cached_state, GFP_NOFS); 2526 &cached_state, GFP_NOFS);
2518out_only_mutex: 2527out_only_mutex:
2528 if (!updated_inode && truncated_page && !ret && !err) {
2529 /*
2530 * If we only end up zeroing part of a page, we still need to
2531 * update the inode item, so that all the time fields are
2532 * updated as well as the necessary btrfs inode in memory fields
2533 * for detecting, at fsync time, if the inode isn't yet in the
2534 * log tree or it's there but not up to date.
2535 */
2536 trans = btrfs_start_transaction(root, 1);
2537 if (IS_ERR(trans)) {
2538 err = PTR_ERR(trans);
2539 } else {
2540 err = btrfs_update_inode(trans, root, inode);
2541 ret = btrfs_end_transaction(trans, root);
2542 }
2543 }
2519 mutex_unlock(&inode->i_mutex); 2544 mutex_unlock(&inode->i_mutex);
2520 if (ret && !err) 2545 if (ret && !err)
2521 err = ret; 2546 err = ret;