Btrfs: fix race between fsync and lockless direct IO writes

An fsync, using the fast path, can race with a concurrent lockless direct IO write and end up logging a file extent item that points to an extent that wasn't written to yet. This is because the fast fsync path collects ordered extents into a local list and then collects all the new extent maps to log file extent items based on them, while the direct IO write path creates the new extent map before it creates the corresponding ordered extent (and submits the respective bio(s)). So fix this by making the direct IO write path create ordered extents before the extent maps and make the fast fsync path collect any new ordered extents after it collects the extent maps. Note that making the fsync handler call inode_dio_wait() (after acquiring the inode's i_mutex) would not work and would lead to a deadlock when doing AIO, as through AIO we end up in a path where the fsync handler is called (through dio_aio_complete_work() -> dio_complete() -> vfs_fsync_range()) before the inode's dio counter is decremented (inode_dio_wait() waits for this counter to have a value of zero).

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
author: Filipe Manana <fdmanana@suse.com> 2016-01-21 05:17:54 -0500
committer: Chris Mason <clm@fb.com> 2016-01-25 19:50:26 -0500
commit: de0ee0edb21fbab4c7afa3e94573ecfebfb0244e (patch)
tree: 52b0a9db694a7810bed41f04fa5be2b843352484 /fs/btrfs/tree-log.c
parent: 6b5aa88c861cf0e4156e490009e2018d4fc81109 (diff)
1 files changed, 11 insertions, 3 deletions
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 323e12cc9d2f..978c3a810893 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4127,7 +4127,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
                                     struct inode *inode,
                                     struct btrfs_path *path,
                                     struct list_head *logged_list,
-                                     struct btrfs_log_ctx *ctx)
+                                     struct btrfs_log_ctx *ctx,
+                                     const u64 start,
+                                     const u64 end)
 {
        struct extent_map *em, *n;
        struct list_head extents;
@@ -4166,7 +4168,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
        }
        list_sort(NULL, &extents, extent_cmp);
+        /*
+         * Collect any new ordered extents within the range. This is to
+         * prevent logging file extent items without waiting for the disk
+         * location they point to being written. We do this only to deal
+         * with races against concurrent lockless direct IO writes.
+         */
+        btrfs_get_logged_extents(inode, logged_list, start, end);
 process:
        while (!list_empty(&extents)) {
                em = list_entry(extents.next, struct extent_map, list);
@@ -4701,7 +4709,7 @@ log_extents:
                        goto out_unlock;
                }
                ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
-                                                &logged_list, ctx);
+                                                &logged_list, ctx, start, end);
                if (ret) {
                        err = ret;
                        goto out_unlock;
author	Filipe Manana <fdmanana@suse.com>	2016-01-21 05:17:54 -0500
committer	Chris Mason <clm@fb.com>	2016-01-25 19:50:26 -0500
commit	de0ee0edb21fbab4c7afa3e94573ecfebfb0244e (patch)
tree	52b0a9db694a7810bed41f04fa5be2b843352484 /fs/btrfs/tree-log.c
parent	6b5aa88c861cf0e4156e490009e2018d4fc81109 (diff)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 323e12cc9d2f..978c3a810893 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4127,7 +4127,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4127	struct inode *inode,	4127	struct inode *inode,
4128	struct btrfs_path *path,	4128	struct btrfs_path *path,
4129	struct list_head *logged_list,	4129	struct list_head *logged_list,
4130	struct btrfs_log_ctx *ctx)	4130	struct btrfs_log_ctx *ctx,
		4131	const u64 start,
		4132	const u64 end)
4131	{	4133	{
4132	struct extent_map *em, *n;	4134	struct extent_map *em, *n;
4133	struct list_head extents;	4135	struct list_head extents;
@@ -4166,7 +4168,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4166	}	4168	}
4167		4169
4168	list_sort(NULL, &extents, extent_cmp);	4170	list_sort(NULL, &extents, extent_cmp);
4169		4171	/*
		4172	* Collect any new ordered extents within the range. This is to
		4173	* prevent logging file extent items without waiting for the disk
		4174	* location they point to being written. We do this only to deal
		4175	* with races against concurrent lockless direct IO writes.
		4176	*/
		4177	btrfs_get_logged_extents(inode, logged_list, start, end);
4170	process:	4178	process:
4171	while (!list_empty(&extents)) {	4179	while (!list_empty(&extents)) {
4172	em = list_entry(extents.next, struct extent_map, list);	4180	em = list_entry(extents.next, struct extent_map, list);
@@ -4701,7 +4709,7 @@ log_extents:
4701	goto out_unlock;	4709	goto out_unlock;
4702	}	4710	}
4703	ret = btrfs_log_changed_extents(trans, root, inode, dst_path,	4711	ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4704	&logged_list, ctx);	4712	&logged_list, ctx, start, end);
4705	if (ret) {	4713	if (ret) {
4706	err = ret;	4714	err = ret;
4707	goto out_unlock;	4715	goto out_unlock;