Btrfs: add missing inode update when punching hole

When punching a file hole if we endup only zeroing parts of a page, because the start offset isn't a multiple of the sector size or the start offset and length fall within the same page, we were not updating the inode item. This prevented an fsync from doing anything, if no other file changes happened in the current transaction, because the fields in btrfs_inode used to check if the inode needs to be fsync'ed weren't updated. This issue is easy to reproduce and the following excerpt from the xfstest case I made shows how to trigger it: _scratch_mkfs >> $seqres.full 2>&1 _init_flakey _mount_flakey # Create our test file. $XFS_IO_PROG -f -c "pwrite -S 0x22 -b 16K 0 16K" \ $SCRATCH_MNT/foo | _filter_xfs_io # Fsync the file, this makes btrfs update some btrfs inode specific fields # that are used to track if the inode needs to be written/updated to the fsync # log or not. After this fsync, the new values for those fields indicate that # a subsequent fsync does not need to touch the fsync log. $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo # Force a commit of the current transaction. After this point, any operation # that modifies the data or metadata of our file, should update those fields in # the btrfs inode with values that make the next fsync operation write to the # fsync log. sync # Punch a hole in our file. This small range affects only 1 page. # This made the btrfs hole punching implementation write only some zeroes in # one page, but it did not update the btrfs inode fields used to determine if # the next fsync needs to write to the fsync log. $XFS_IO_PROG -c "fpunch 8000 4K" $SCRATCH_MNT/foo # Another variation of the previously mentioned case. $XFS_IO_PROG -c "fpunch 15000 100" $SCRATCH_MNT/foo # Now fsync the file. This was a no-operation because the previous hole punch # operation didn't update the inode's fields mentioned before, so they remained # with the values they had after the first fsync - that is, they indicate that # it is not needed to write to fsync log. $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/foo echo "File content before:" od -t x1 $SCRATCH_MNT/foo # Simulate a crash/power loss. _load_flakey_table $FLAKEY_DROP_WRITES _unmount_flakey # Enable writes and mount the fs. This makes the fsync log replay code run. _load_flakey_table $FLAKEY_ALLOW_WRITES _mount_flakey # Because the last fsync didn't do anything, here the file content matched what # it was after the first fsync, before the holes were punched, and not what it # was after the holes were punched. echo "File content after:" od -t x1 $SCRATCH_MNT/foo This issue has been around since 2012, when the punch hole implementation was added, commit 2aaa66558172 ("Btrfs: add hole punching"). A test case for xfstests follows soon. Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Liu Bo <bo.li.liu@oracle.com> Signed-off-by: Chris Mason <clm@fb.com>
author: Filipe Manana <fdmanana@suse.com> 2015-02-15 17:38:54 -0500
committer: Chris Mason <clm@fb.com> 2015-03-02 17:04:44 -0500
commit: e8c1c76e804b18120e6977fc092769c043876212 (patch)
tree: 2bb67d4ae79589af33480772150e83ac5ca78c56
parent: 0c0ef4bc842ba6b593bb94f9fb8b653fe18c5ed8 (diff)
1 files changed, 28 insertions, 3 deletions
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e4090259569b..b476e5645034 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2276,6 +2276,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        bool same_page;
        bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
        u64 ino_size;
+        bool truncated_page = false;
+        bool updated_inode = false;
        ret = btrfs_wait_ordered_range(inode, offset, len);
        if (ret)
@@ -2307,13 +2309,18 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
         * entire page.
         */
        if (same_page && len < PAGE_CACHE_SIZE) {
-                if (offset < ino_size)
+                if (offset < ino_size) {
+                        truncated_page = true;
                        ret = btrfs_truncate_page(inode, offset, len, 0);
+                } else {
+                        ret = 0;
+                }
                goto out_only_mutex;
        }
        /* zero back part of the first page */
        if (offset < ino_size) {
+                truncated_page = true;
                ret = btrfs_truncate_page(inode, offset, 0, 0);
                if (ret) {
                        mutex_unlock(&inode->i_mutex);
@@ -2349,6 +2356,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
                if (!ret) {
                        /* zero the front end of the last page */
                        if (tail_start + tail_len < ino_size) {
+                                truncated_page = true;
                                ret = btrfs_truncate_page(inode,
                                                tail_start + tail_len, 0, 1);
                                if (ret)
@@ -2358,8 +2366,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
        }
        if (lockend < lockstart) {
-                mutex_unlock(&inode->i_mutex);
+                ret = 0;
-                return 0;
+                goto out_only_mutex;
        }
        while (1) {
@@ -2507,6 +2515,7 @@ out_trans:
        trans->block_rsv = &root->fs_info->trans_block_rsv;
        ret = btrfs_update_inode(trans, root, inode);
+        updated_inode = true;
        btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root);
 out_free:
@@ -2516,6 +2525,22 @@ out:
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
                             &cached_state, GFP_NOFS);
 out_only_mutex:
+        if (!updated_inode && truncated_page && !ret && !err) {
+                /*
+                 * If we only end up zeroing part of a page, we still need to
+                 * update the inode item, so that all the time fields are
+                 * updated as well as the necessary btrfs inode in memory fields
+                 * for detecting, at fsync time, if the inode isn't yet in the
+                 * log tree or it's there but not up to date.
+                 */
+                trans = btrfs_start_transaction(root, 1);
+                if (IS_ERR(trans)) {
+                        err = PTR_ERR(trans);
+                } else {
+                        err = btrfs_update_inode(trans, root, inode);
+                        ret = btrfs_end_transaction(trans, root);
+                }
+        }
        mutex_unlock(&inode->i_mutex);
        if (ret && !err)
                err = ret;
author	Filipe Manana <fdmanana@suse.com>	2015-02-15 17:38:54 -0500
committer	Chris Mason <clm@fb.com>	2015-03-02 17:04:44 -0500
commit	e8c1c76e804b18120e6977fc092769c043876212 (patch)
tree	2bb67d4ae79589af33480772150e83ac5ca78c56
parent	0c0ef4bc842ba6b593bb94f9fb8b653fe18c5ed8 (diff)