Btrfs: fix loss of prealloc extents past i_size after fsync log replay

Currently, if we allocate extents beyond an inode's i_size (through the fallocate system call) and then fsync the file, we log the extents, but after a power failure we replay them and then immediately drop them. This behaviour has existed since about 2009, commit c71bf099abdd ("Btrfs: Avoid orphan inodes cleanup while replaying log"), because it marks the inode as an orphan instead of dropping any extents beyond i_size before replaying logged extents, so after the log replay, and while the mount operation is still ongoing, we find the inode marked as an orphan and then perform a truncation (drop extents beyond the inode's i_size). Because the processing of orphan inodes is still done right after replaying the log and before the mount operation finishes, the intention of that commit does not make any sense (at least as of today).

However, reverting that behaviour is not enough: we cannot simply discard all extents beyond i_size and then replay logged extents, because we risk dropping extents beyond i_size created in past transactions, for example:

  add prealloc extent beyond i_size
  fsync - clears the flag BTRFS_INODE_NEEDS_FULL_SYNC from the inode
  transaction commit
  add another prealloc extent beyond i_size
  fsync - triggers the fast fsync path
  power failure

In that scenario, we would drop the first extent and then replay the second one. To fix this, just make sure that all prealloc extents beyond i_size are logged, and if we find too many (which is far from a common case), fall back to a full transaction commit (like we do when logging regular extents in the fast fsync path).
Trivial reproducer:

  $ mkfs.btrfs -f /dev/sdb
  $ mount /dev/sdb /mnt
  $ xfs_io -f -c "pwrite -S 0xab 0 256K" /mnt/foo
  $ sync
  $ xfs_io -c "falloc -k 256K 1M" /mnt/foo
  $ xfs_io -c "fsync" /mnt/foo
  <power failure>

  # mount to replay log
  $ mount /dev/sdb /mnt
  # at this point the file only has one extent, at offset 0, size 256K

A test case for fstests follows soon, covering multiple scenarios that involve adding prealloc extents with previous shrinking truncates and without such truncates.

Fixes: c71bf099abdd ("Btrfs: Avoid orphan inodes cleanup while replaying log")
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
author: Filipe Manana <fdmanana@suse.com> 2018-04-05 17:55:12 -0400
committer: David Sterba <dsterba@suse.com> 2018-04-12 08:50:36 -0400
commit: 471d557afed155b85da237ec46c549f443eeb5de (patch)
tree: 421a79b1040c727f826d53e5407ae5f88d0e342d
parent: af7227338135d2f1b1552bf9a6d43e02dcba10b9 (diff)
1 files changed, 58 insertions, 5 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 678154d4b78f..d20dbbe214e5 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2458,13 +2458,41 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
                        if (ret)
                                break;
-                        /* for regular files, make sure corresponding
+                        /*
-                         * orphan item exist. extents past the new EOF
+                         * Before replaying extents, truncate the inode to its
-                         * will be truncated later by orphan cleanup.
+                         * size. We need to do it now and not after log replay
+                         * because before an fsync we can have prealloc extents
+                         * added beyond the inode's i_size. If we did it after,
+                         * through orphan cleanup for example, we would drop
+                         * those prealloc extents just after replaying them.
                         */
                        if (S_ISREG(mode)) {
-                                ret = insert_orphan_item(wc->trans, root,
+                                struct inode *inode;
-                                                         key.objectid);
+                                u64 from;
+                                inode = read_one_inode(root, key.objectid);
+                                if (!inode) {
+                                        ret = -EIO;
+                                        break;
+                                }
+                                from = ALIGN(i_size_read(inode),
+                                             root->fs_info->sectorsize);
+                                ret = btrfs_drop_extents(wc->trans, root, inode,
+                                                         from, (u64)-1, 1);
+                                /*
+                                 * If the nlink count is zero here, the iput
+                                 * will free the inode.  We bump it to make
+                                 * sure it doesn't get freed until the link
+                                 * count fixup is done.
+                                 */
+                                if (!ret) {
+                                        if (inode->i_nlink == 0)
+                                                inc_nlink(inode);
+                                        /* Update link count and nbytes. */
+                                        ret = btrfs_update_inode(wc->trans,
+                                                                 root, inode);
+                                }
+                                iput(inode);
                                if (ret)
                                        break;
                        }
@@ -4359,6 +4387,31 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
                num++;
        }
+        /*
+         * Add all prealloc extents beyond the inode's i_size to make sure we
+         * don't lose them after doing a fast fsync and replaying the log.
+         */
+        if (inode->flags & BTRFS_INODE_PREALLOC) {
+                struct rb_node *node;
+                for (node = rb_last(&tree->map); node; node = rb_prev(node)) {
+                        em = rb_entry(node, struct extent_map, rb_node);
+                        if (em->start < i_size_read(&inode->vfs_inode))
+                                break;
+                        if (!list_empty(&em->list))
+                                continue;
+                        /* Same as above loop. */
+                        if (++num > 32768) {
+                                list_del_init(&tree->modified_extents);
+                                ret = -EFBIG;
+                                goto process;
+                        }
+                        refcount_inc(&em->refs);
+                        set_bit(EXTENT_FLAG_LOGGING, &em->flags);
+                        list_add_tail(&em->list, &extents);
+                }
+        }
        list_sort(NULL, &extents, extent_cmp);
        btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end);
        /*
author	Filipe Manana <fdmanana@suse.com>	2018-04-05 17:55:12 -0400
committer	David Sterba <dsterba@suse.com>	2018-04-12 08:50:36 -0400
commit	471d557afed155b85da237ec46c549f443eeb5de (patch)
tree	421a79b1040c727f826d53e5407ae5f88d0e342d
parent	af7227338135d2f1b1552bf9a6d43e02dcba10b9 (diff)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 678154d4b78f..d20dbbe214e5 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2458,13 +2458,41 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2458	if (ret)	2458	if (ret)
2459	break;	2459	break;
2460		2460
2461	/* for regular files, make sure corresponding	2461	/*
2462	* orphan item exist. extents past the new EOF	2462	* Before replaying extents, truncate the inode to its
2463	* will be truncated later by orphan cleanup.	2463	* size. We need to do it now and not after log replay
		2464	* because before an fsync we can have prealloc extents
		2465	* added beyond the inode's i_size. If we did it after,
		2466	* through orphan cleanup for example, we would drop
		2467	* those prealloc extents just after replaying them.
2464	*/	2468	*/
2465	if (S_ISREG(mode)) {	2469	if (S_ISREG(mode)) {
2466	ret = insert_orphan_item(wc->trans, root,	2470	struct inode *inode;
2467	key.objectid);	2471	u64 from;
		2472
		2473	inode = read_one_inode(root, key.objectid);
		2474	if (!inode) {
		2475	ret = -EIO;
		2476	break;
		2477	}
		2478	from = ALIGN(i_size_read(inode),
		2479	root->fs_info->sectorsize);
		2480	ret = btrfs_drop_extents(wc->trans, root, inode,
		2481	from, (u64)-1, 1);
		2482	/*
		2483	* If the nlink count is zero here, the iput
		2484	* will free the inode. We bump it to make
		2485	* sure it doesn't get freed until the link
		2486	* count fixup is done.
		2487	*/
		2488	if (!ret) {
		2489	if (inode->i_nlink == 0)
		2490	inc_nlink(inode);
		2491	/* Update link count and nbytes. */
		2492	ret = btrfs_update_inode(wc->trans,
		2493	root, inode);
		2494	}
		2495	iput(inode);
2468	if (ret)	2496	if (ret)
2469	break;	2497	break;
2470	}	2498	}
@@ -4359,6 +4387,31 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4359	num++;	4387	num++;
4360	}	4388	}
4361		4389
		4390	/*
		4391	* Add all prealloc extents beyond the inode's i_size to make sure we
		4392	* don't lose them after doing a fast fsync and replaying the log.
		4393	*/
		4394	if (inode->flags & BTRFS_INODE_PREALLOC) {
		4395	struct rb_node *node;
		4396
		4397	for (node = rb_last(&tree->map); node; node = rb_prev(node)) {
		4398	em = rb_entry(node, struct extent_map, rb_node);
		4399	if (em->start < i_size_read(&inode->vfs_inode))
		4400	break;
		4401	if (!list_empty(&em->list))
		4402	continue;
		4403	/* Same as above loop. */
		4404	if (++num > 32768) {
		4405	list_del_init(&tree->modified_extents);
		4406	ret = -EFBIG;
		4407	goto process;
		4408	}
		4409	refcount_inc(&em->refs);
		4410	set_bit(EXTENT_FLAG_LOGGING, &em->flags);
		4411	list_add_tail(&em->list, &extents);
		4412	}
		4413	}
		4414
4362	list_sort(NULL, &extents, extent_cmp);	4415	list_sort(NULL, &extents, extent_cmp);
4363	btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end);	4416	btrfs_get_logged_extents(inode, logged_list, logged_start, logged_end);
4364	/*	4417	/*