about summary refs log tree commit diff stats
path: root/fs
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2016-01-21 05:17:54 -0500
committerChris Mason <clm@fb.com>2016-01-25 19:50:26 -0500
commitde0ee0edb21fbab4c7afa3e94573ecfebfb0244e (patch)
tree52b0a9db694a7810bed41f04fa5be2b843352484 /fs
parent6b5aa88c861cf0e4156e490009e2018d4fc81109 (diff)
Btrfs: fix race between fsync and lockless direct IO writes
An fsync, using the fast path, can race with a concurrent lockless direct IO write and end up logging a file extent item that points to an extent that wasn't written to yet. This is because the fast fsync path collects ordered extents into a local list and then collects all the new extent maps to log file extent items based on them, while the direct IO write path creates the new extent map before it creates the corresponding ordered extent (and submits the respective bio(s)).

So fix this by making the direct IO write path create ordered extents before the extent maps, and by making the fast fsync path collect any new ordered extents after it collects the extent maps.

Note that making the fsync handler call inode_dio_wait() (after acquiring the inode's i_mutex) would not work and would lead to a deadlock when doing AIO, as through AIO we end up in a path where the fsync handler is called (through dio_aio_complete_work() -> dio_complete() -> vfs_fsync_range()) before the inode's dio counter is decremented (inode_dio_wait() waits for this counter to have a value of zero).

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/btrfs/inode.c36
-rw-r--r--fs/btrfs/tree-log.c14
2 files changed, 39 insertions, 11 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b8bb7591ff9f..e4565456eb01 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7115,21 +7115,41 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 	if (ret)
 		return ERR_PTR(ret);
 
-	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
-			      ins.offset, ins.offset, ins.offset, 0);
-	if (IS_ERR(em)) {
-		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-		return em;
-	}
-
+	/*
+	 * Create the ordered extent before the extent map. This is to avoid
+	 * races with the fast fsync path that would lead to it logging file
+	 * extent items that point to disk extents that were not yet written to.
+	 * The fast fsync path collects ordered extents into a local list and
+	 * then collects all the new extent maps, so we must create the ordered
+	 * extent first and make sure the fast fsync path collects any new
+	 * ordered extents after collecting new extent maps as well.
+	 * The fsync path simply can not rely on inode_dio_wait() because it
+	 * causes deadlock with AIO.
+	 */
 	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
 					   ins.offset, ins.offset, 0);
 	if (ret) {
 		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-		free_extent_map(em);
 		return ERR_PTR(ret);
 	}
 
+	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+			      ins.offset, ins.offset, ins.offset, 0);
+	if (IS_ERR(em)) {
+		struct btrfs_ordered_extent *oe;
+
+		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+		oe = btrfs_lookup_ordered_extent(inode, start);
+		ASSERT(oe);
+		if (WARN_ON(!oe))
+			return em;
+		set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
+		set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
+		btrfs_remove_ordered_extent(inode, oe);
+		/* Once for our lookup and once for the ordered extents tree. */
+		btrfs_put_ordered_extent(oe);
+		btrfs_put_ordered_extent(oe);
+	}
 	return em;
 }
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 323e12cc9d2f..978c3a810893 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4127,7 +4127,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 				     struct inode *inode,
 				     struct btrfs_path *path,
 				     struct list_head *logged_list,
-				     struct btrfs_log_ctx *ctx)
+				     struct btrfs_log_ctx *ctx,
+				     const u64 start,
+				     const u64 end)
 {
 	struct extent_map *em, *n;
 	struct list_head extents;
@@ -4166,7 +4168,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 	}
 
 	list_sort(NULL, &extents, extent_cmp);
-
+	/*
+	 * Collect any new ordered extents within the range. This is to
+	 * prevent logging file extent items without waiting for the disk
+	 * location they point to being written. We do this only to deal
+	 * with races against concurrent lockless direct IO writes.
+	 */
+	btrfs_get_logged_extents(inode, logged_list, start, end);
 process:
 	while (!list_empty(&extents)) {
 		em = list_entry(extents.next, struct extent_map, list);
@@ -4701,7 +4709,7 @@ log_extents:
 		goto out_unlock;
 	}
 	ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
-					&logged_list, ctx);
+					&logged_list, ctx, start, end);
 	if (ret) {
 		err = ret;
 		goto out_unlock;